diff --git a/0006-vk-introduce-vkernel.patch b/0006-vk-introduce-vkernel.patch
new file mode 100644
index 0000000000000000000000000000000000000000..46006b20a7c485f569be5923c21aa1d2b41de932
--- /dev/null
+++ b/0006-vk-introduce-vkernel.patch
@@ -0,0 +1,7033 @@
+From ac335c25bcd536ea42fa917dc0ba9bd6a949ec54 Mon Sep 17 00:00:00 2001
+From: Hang Huang
+Date: Wed, 12 Feb 2025 14:44:52 +0800
+Subject: [PATCH] vk: introduce vkernel
+
+virt inclusion
+category: feature
+bugzilla: https://gitee.com/openeuler/kernel/issues/IBD46F
+
+-------------------------------------------------
+
+The initial version provides the following features:
+* syscall isolation: hook do_syscall_x64 and do_futex to isolate syscalls.
+* cap enhancement: hook cap_capable to enhance capability protection.
+* file protection: hook generic_permission to customize inode protection.
+* log isolation: distinguish the owner of kernel logs to isolate logs.
+* param isolation: use independent sysctl params for each instance.
+
+This patch squashes the following commits:
+vk: update config
+vk: implement mem policy customization
+vk: implement cpu policy customization
+vk: introduce raw sysctl interface
+vk: implement vm sysctl customization
+vk: implement net sysctl customization
+vk: implement kernel sysctl customization
+vk: implement fs sysctl customization
+vk: implement vkernel data isolation in driver
+vk: implement vkernel framework as a module
+vk: introduce mem policy customization
+vk: introduce cpu policy customization
+vk: introduce vm sysctl customization
+vk: introduce net sysctl customization
+vk: introduce kernel sysctl customization
+vk: introduce fs sysctl customization
+vk: introduce vkernel data isolation
+vk: introduce vkernel framework
+
+Signed-off-by: Hang Huang
+---
+ arch/arm64/configs/openeuler_defconfig | 1 +
+ arch/arm64/include/asm/vkernel.h | 25 +
+ arch/arm64/kernel/syscall.c | 21 +
+ arch/x86/configs/openeuler_defconfig | 1 +
+ arch/x86/entry/common.c | 19 +
+ arch/x86/include/asm/vkernel.h | 26 +
+ drivers/Makefile | 2 +
+ drivers/vkernel/Makefile | 11 +
+ drivers/vkernel/fs/acl.c | 369 ++++++
+ drivers/vkernel/include/fs.h | 18 +
+ drivers/vkernel/include/mm.h | 31 +
+ drivers/vkernel/include/sched.h | 11 +
+ drivers/vkernel/include/security.h | 15 +
+ drivers/vkernel/include/syscall.h | 24 +
+ drivers/vkernel/include/sysctl.h | 53 +
+ drivers/vkernel/include/utils.h | 11 +
+ drivers/vkernel/mm/mm.c | 72 ++
+ drivers/vkernel/sched/cpu.c | 36 +
+ drivers/vkernel/security/capability.c | 111 ++
+ drivers/vkernel/syscall.c | 636 +++++++++++
+ drivers/vkernel/sysctl/fs.c | 73 ++
+ drivers/vkernel/sysctl/kernel.c | 112 ++
+ drivers/vkernel/sysctl/net.c | 416 +++++++
+ drivers/vkernel/sysctl/raw.c | 689 +++++++++++
+ drivers/vkernel/sysctl/vm.c | 102 ++
+ drivers/vkernel/utils/kallsyms.c | 63 ++
+ drivers/vkernel/vkernel_main.c | 1444 ++++++++++++++++++++++++
+ fs/devpts/inode.c | 26 +-
+ fs/exec.c | 12 +
+ fs/file.c | 19 +
+ fs/file_table.c | 46 +-
+ fs/inode.c | 55 +-
+ fs/namei.c | 13 +
+ fs/namespace.c | 10 +
+ fs/proc/meminfo.c | 18 +-
+ fs/userfaultfd.c | 11 +
+ include/linux/miscdevice.h | 1 +
+ include/linux/mman.h | 4 +
+ include/linux/vkernel.h | 578 ++++++++++
+ init/Kconfig | 20 +
+ ipc/shm.c | 16 +
+ kernel/Makefile | 1 +
+ kernel/exit.c | 10 +
+ kernel/fork.c | 30 +
+ kernel/futex/syscalls.c | 10 +
+ kernel/printk/printk.c | 14 +
+ kernel/printk/printk_ringbuffer.c | 11 +
+ kernel/printk/printk_ringbuffer.h | 3 +
+ kernel/sys.c | 28 +
+ kernel/vkernel_hook.c | 92 ++
+ mm/huge_memory.c | 51 +-
+
mm/memory.c | 15 + + mm/mmap.c | 96 ++ + mm/mremap.c | 18 + + mm/nommu.c | 11 + + mm/shmem.c | 16 +- + mm/util.c | 73 +- + security/commoncap.c | 11 + + 58 files changed, 5698 insertions(+), 13 deletions(-) + create mode 100644 arch/arm64/include/asm/vkernel.h + create mode 100644 arch/x86/include/asm/vkernel.h + create mode 100644 drivers/vkernel/Makefile + create mode 100644 drivers/vkernel/fs/acl.c + create mode 100644 drivers/vkernel/include/fs.h + create mode 100644 drivers/vkernel/include/mm.h + create mode 100644 drivers/vkernel/include/sched.h + create mode 100644 drivers/vkernel/include/security.h + create mode 100644 drivers/vkernel/include/syscall.h + create mode 100644 drivers/vkernel/include/sysctl.h + create mode 100644 drivers/vkernel/include/utils.h + create mode 100644 drivers/vkernel/mm/mm.c + create mode 100644 drivers/vkernel/sched/cpu.c + create mode 100644 drivers/vkernel/security/capability.c + create mode 100644 drivers/vkernel/syscall.c + create mode 100644 drivers/vkernel/sysctl/fs.c + create mode 100644 drivers/vkernel/sysctl/kernel.c + create mode 100644 drivers/vkernel/sysctl/net.c + create mode 100644 drivers/vkernel/sysctl/raw.c + create mode 100644 drivers/vkernel/sysctl/vm.c + create mode 100644 drivers/vkernel/utils/kallsyms.c + create mode 100644 drivers/vkernel/vkernel_main.c + create mode 100644 include/linux/vkernel.h + create mode 100644 kernel/vkernel_hook.c + +diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig +index 3cfff0701479..36c862ac6ec7 100644 +--- a/arch/arm64/configs/openeuler_defconfig ++++ b/arch/arm64/configs/openeuler_defconfig +@@ -208,6 +208,7 @@ CONFIG_IPC_NS=y + CONFIG_USER_NS=y + CONFIG_PID_NS=y + CONFIG_NET_NS=y ++# CONFIG_VKERNEL is not set + CONFIG_SCHED_STEAL=y + CONFIG_CHECKPOINT_RESTORE=y + CONFIG_SCHED_AUTOGROUP=y +diff --git a/arch/arm64/include/asm/vkernel.h b/arch/arm64/include/asm/vkernel.h +new file mode 100644 +index 000000000000..31feb6967075 +--- /dev/null ++++ b/arch/arm64/include/asm/vkernel.h +@@ -0,0 +1,25 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (C) 2024 ARM Ltd. 
++ * Author: Hang Huang ++ */ ++ ++#ifndef __ASM__VKERNEL_H ++#define __ASM__VKERNEL_H ++ ++#define sys_call_vk_t syscall_fn_t ++ ++DECLARE_PER_CPU(struct task_struct *, current_syscall_task); ++DECLARE_PER_CPU(struct vkernel *, current_syscall_vk); ++ ++static __always_inline struct task_struct *get_current_syscall_task(void) ++{ ++ return this_cpu_read_8(current_syscall_task); ++} ++ ++static __always_inline struct vkernel *get_current_syscall_vk(void) ++{ ++ return this_cpu_read_8(current_syscall_vk); ++} ++ ++#endif +diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c +index 558e9c9da8a4..bf2da0b4a494 100644 +--- a/arch/arm64/kernel/syscall.c ++++ b/arch/arm64/kernel/syscall.c +@@ -7,6 +7,9 @@ + #include + #include + #include ++#ifdef CONFIG_VKERNEL ++#include ++#endif + + #include + #include +@@ -42,13 +45,31 @@ static void invoke_syscall(struct pt_regs *regs, unsigned int scno, + const syscall_fn_t syscall_table[]) + { + long ret; ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++#endif + + add_random_kstack_offset(); + + if (scno < sc_nr) { + syscall_fn_t syscall_fn; ++#ifdef CONFIG_VKERNEL ++ vk = vkernel_find_vk_by_task(current); ++ if (!vk) { ++ syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)]; ++ ret = __invoke_syscall(regs, syscall_fn); ++ } else { ++ syscall_fn = (vk->syscall.table)[array_index_nospec(scno, sc_nr)]; ++ this_cpu_write(current_syscall_task, current); ++ this_cpu_write(current_syscall_vk, vk); ++ ret = __invoke_syscall(regs, syscall_fn); ++ this_cpu_write(current_syscall_vk, NULL); ++ this_cpu_write(current_syscall_task, NULL); ++ } ++#else + syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)]; + ret = __invoke_syscall(regs, syscall_fn); ++#endif + } else { + ret = do_ni_syscall(regs, scno); + } +diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig +index 3cbae4c5f390..1b0d0d8c2025 100644 +--- a/arch/x86/configs/openeuler_defconfig ++++ b/arch/x86/configs/openeuler_defconfig +@@ -226,6 +226,7 @@ CONFIG_IPC_NS=y + CONFIG_USER_NS=y + CONFIG_PID_NS=y + CONFIG_NET_NS=y ++# CONFIG_VKERNEL is not set + CONFIG_SCHED_STEAL=y + CONFIG_CHECKPOINT_RESTORE=y + CONFIG_SCHED_AUTOGROUP=y +diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c +index e72dac092245..77c2a25447e4 100644 +--- a/arch/x86/entry/common.c ++++ b/arch/x86/entry/common.c +@@ -19,6 +19,9 @@ + #include + #include + #include ++#ifdef CONFIG_VKERNEL ++#include ++#endif + + #ifdef CONFIG_XEN_PV + #include +@@ -45,10 +48,26 @@ static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr) + * numbers for comparisons. + */ + unsigned int unr = nr; ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++#endif + + if (likely(unr < NR_syscalls)) { + unr = array_index_nospec(unr, NR_syscalls); ++#ifdef CONFIG_VKERNEL ++ vk = vkernel_find_vk_by_task(current); ++ if (!vk) ++ regs->ax = sys_call_table[unr](regs); ++ else { ++ this_cpu_write(current_syscall_task, current); ++ this_cpu_write(current_syscall_vk, vk); ++ regs->ax = (vk->syscall.table)[unr](regs); ++ this_cpu_write(current_syscall_vk, NULL); ++ this_cpu_write(current_syscall_task, NULL); ++ } ++#else + regs->ax = x64_sys_call(regs, unr); ++#endif + return true; + } + return false; +diff --git a/arch/x86/include/asm/vkernel.h b/arch/x86/include/asm/vkernel.h +new file mode 100644 +index 000000000000..f46c3262e3c3 +--- /dev/null ++++ b/arch/x86/include/asm/vkernel.h +@@ -0,0 +1,26 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (C) 2024 ARM Ltd. 
++ * Author: Hang Huang ++ */ ++ ++#ifndef __ASM_X86_VKERNEL_H ++#define __ASM_X86_VKERNEL_H ++ ++#define sys_call_vk_t sys_call_ptr_t ++ ++DECLARE_PER_CPU(struct task_struct *, current_syscall_task); ++DECLARE_PER_CPU(struct vkernel *, current_syscall_vk); ++ ++static __always_inline struct task_struct *get_current_syscall_task(void) ++{ ++ return this_cpu_read_stable(current_syscall_task); ++} ++ ++static __always_inline struct vkernel *get_current_syscall_vk(void) ++{ ++ return this_cpu_read_stable(current_syscall_vk); ++} ++ ++ ++#endif +diff --git a/drivers/Makefile b/drivers/Makefile +index 79d803250002..2008d7dd6dd1 100644 +--- a/drivers/Makefile ++++ b/drivers/Makefile +@@ -206,4 +206,6 @@ obj-$(CONFIG_ROH) += roh/ + + obj-$(CONFIG_HISI_VIRTCCA_CODA) += coda/ + ++obj-$(CONFIG_VKERNEL_DRIVER) += vkernel/ ++ + obj-$(CONFIG_ARM_SPE_MEM_SAMPLING) += arm/mm_monitor/ +diff --git a/drivers/vkernel/Makefile b/drivers/vkernel/Makefile +new file mode 100644 +index 000000000000..1fd838c21967 +--- /dev/null ++++ b/drivers/vkernel/Makefile +@@ -0,0 +1,11 @@ ++obj-$(CONFIG_VKERNEL_DRIVER) += vkernel.o ++ ++ccflags-y := -I$(srctree)/drivers/vkernel/include ++ ++vkernel-y := vkernel_main.o syscall.o ++vkernel-y += fs/acl.o ++vkernel-y += mm/mm.o ++vkernel-y += security/capability.o ++vkernel-y += sched/cpu.o ++vkernel-y += sysctl/fs.o sysctl/kernel.o sysctl/net.o sysctl/vm.o sysctl/raw.o ++vkernel-y += utils/kallsyms.o +diff --git a/drivers/vkernel/fs/acl.c b/drivers/vkernel/fs/acl.c +new file mode 100644 +index 000000000000..ce77f1881d24 +--- /dev/null ++++ b/drivers/vkernel/fs/acl.c +@@ -0,0 +1,369 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Copyright (C) 2024 ARM Ltd. ++ * Author: Hang Huang ++ */ ++ ++#include ++ ++#include "fs.h" ++ ++static char *def_path[] = { ++ /* open, access, append, read, exec */ ++ "/proc/sys/abi", ++ "/proc/sys/debug", ++ "/proc/sys/dev", ++ "/proc/sys/fs", ++ "/proc/sys/net", ++ "/proc/sys/user", ++ "/proc/sys/vm", ++ /* open, read, exec */ ++ "/sys/kernel", ++ "/sys/power", ++ "/sys/class", ++ "/sys/devices", ++ "/sys/dev", ++ "/sys/hypervisor", ++ "/sys/bus", ++ "/sys/block", ++ "/sys/module", ++ "/sys/firmware", ++ "/sys/fs/ecryptfs", ++ "/sys/fs/pstore", ++ "/sys/fs/bpf", ++ "/sys/fs/fuse", ++ "/sys/fs/ext4", ++ /* open */ ++ "/proc/sysrq-trigger", ++ "/sys/kernel/security", ++ /* nop */ ++ "/sys/fs/cgroup", ++ "/dev/vkernel", ++}; ++ ++static unsigned short def_mode[] = { ++ 0x803d, 0x803d, 0x803d, 0x803d, 0x803d, 0x803d, 0x803d, 0x8025, ++ 0x8025, 0x8025, 0x8025, 0x8025, 0x8025, 0x8025, 0x8025, 0x8025, ++ 0x8025, 0x8024, 0x8024, 0x8024, 0x8024, 0x8024, 0x8020, 0x8020, ++ 0x0000, 0x0000, ++}; ++ ++static struct kmem_cache *acl_node_cache; ++ ++int vk_acl_init(void) ++{ ++ acl_node_cache = kmem_cache_create("vkernel_acl_node", ++ sizeof(struct vkernel_acl_node), 0, SLAB_ACCOUNT, NULL); ++ if (!acl_node_cache) { ++ pr_err("failed to create slab for acl node\n"); ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ ++void vk_acl_uninit(void) ++{ ++ kmem_cache_destroy(acl_node_cache); ++} ++ ++int vk_init_acl(struct vkernel_acl *acl, unsigned int bits) ++{ ++ ++ acl->ht = kcalloc( ++ 1UL << bits, sizeof(struct hlist_head), GFP_KERNEL); ++ if (!acl->ht) ++ return -ENOMEM; ++ ++ acl->bits = bits; ++ INIT_LIST_HEAD(&acl->nodes); ++ acl->active = false; ++ ++ return 0; ++} ++ ++void vk_uninit_acl(struct vkernel_acl *acl) ++{ ++ struct hlist_head *ht = acl->ht; ++ struct vkernel_acl_node *node; ++ struct vkernel_acl_node *tmp; ++ ++ if (!acl->ht || !acl->bits) ++ 
return; ++ ++ acl->active = false; ++ list_for_each_entry_safe(node, tmp, &acl->nodes, link) { ++ if (!hlist_unhashed(&node->hash)) ++ hlist_del(&node->hash); ++ list_del(&node->link); ++ kmem_cache_free(acl_node_cache, node); ++ } ++ INIT_LIST_HEAD(&acl->nodes); ++ ++ acl->bits = 0; ++ kfree(ht); ++} ++ ++/* inode hash, copy from inode.c */ ++static unsigned long inode_hash(struct inode *inode, unsigned long shift) ++{ ++ struct super_block *sb = inode->i_sb; ++ unsigned long hashval = inode->i_ino; ++ unsigned long tmp; ++ ++ tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / ++ L1_CACHE_BYTES; ++ tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> shift); ++ return tmp; ++} ++ ++static struct vkernel_acl_node *vk_acl_node_get(struct vkernel_acl *acl, struct inode *inode) ++{ ++ struct hlist_head *ht = acl->ht; ++ struct vkernel_acl_node *node; ++ unsigned long key = inode_hash(inode, acl->bits); ++ ++ hlist_for_each_entry(node, &ht[hash_min(key, acl->bits)], hash) { ++ if (inode->i_ino == node->ino && inode->i_sb == node->sb) ++ return node; ++ } ++ ++ return NULL; ++} ++ ++static int vk_acl_node_del(struct vkernel_acl *acl, struct inode *inode) ++{ ++ struct vkernel_acl_node *node; ++ ++ node = vk_acl_node_get(acl, inode); ++ if (!node) ++ return -1; ++ ++ hlist_del(&node->hash); ++ node->ino = 0; ++ node->sb = NULL; ++ ++ return 0; ++} ++ ++static int vk_acl_node_add(struct vkernel_acl *acl, struct inode *inode, ++ struct vkernel_acl_node *node) ++{ ++ struct hlist_head *ht = acl->ht; ++ unsigned long key = inode_hash(inode, acl->bits); ++ ++ /* Remove old rule if exists */ ++ vk_acl_node_del(acl, inode); ++ ++ node->ino = inode->i_ino; ++ node->sb = inode->i_sb; ++ hlist_add_head(&node->hash, &ht[hash_min(key, acl->bits)]); ++ ++ return 0; ++} ++ ++/* ++ * Inode from file->f_inode may be destroyed at following access ++ * Using kern_path is also unstable, is there a better way? 
++ */
++static struct inode *kern_path_to_inode(const char *filename)
++{
++	struct path path;
++	struct inode *inode;
++	int ret;
++
++	ret = kern_path(filename, LOOKUP_FOLLOW | LOOKUP_OPEN, &path);
++	if (ret)
++		return NULL;
++
++	inode = path.dentry->d_inode;
++	path_put(&path);
++
++	return inode;
++}
++
++static int vk_activate_acl(struct vkernel_acl *acl, struct vkernel_acl_node *node)
++{
++	struct inode *inode;
++
++	inode = kern_path_to_inode(node->path);
++	if (!inode) {
++		pr_warn("vkernel: cannot set acl, no such file or directory %s\n", node->path);
++		return 0;
++	}
++
++	if (!vk_acl_node_add(acl, inode, node)) {
++		if (S_ISDIR(inode->i_mode))
++			inode->i_opflags |= IOP_VKERNEL_DIR;
++		else
++			inode->i_opflags |= IOP_VKERNEL_REG;
++	}
++
++	pr_debug("activate acl, path %s mode 0x%x ino %lu\n", node->path, node->mode, inode->i_ino);
++
++	return 0;
++}
++
++int vk_deactivate_acl(struct vkernel_acl *acl, struct vkernel_acl_node *node)
++{
++	struct inode *inode;
++
++	inode = kern_path_to_inode(node->path);
++	if (!inode)
++		return -EINVAL;
++
++	if (!vk_acl_node_del(acl, inode)) {
++		if (S_ISDIR(inode->i_mode))
++			inode->i_opflags &= ~IOP_VKERNEL_DIR;
++		else
++			inode->i_opflags &= ~IOP_VKERNEL_REG;
++	}
++
++	return 0;
++}
++
++static void vk_activate_acl_all(struct vkernel_acl *acl)
++{
++	static DEFINE_MUTEX(vk_activate_lock);
++	struct vkernel_acl_node *node;
++
++	/* Failure on trylock means someone is already doing this job */
++	if (!mutex_trylock(&vk_activate_lock))
++		return;
++
++	acl->active = true;
++	list_for_each_entry(node, &acl->nodes, link) {
++		if (hlist_unhashed(&node->hash))
++			vk_activate_acl(acl, node);
++	}
++
++	mutex_unlock(&vk_activate_lock);
++}
++
++static int vk_permission(struct vkernel *vk, struct inode *inode, int mask)
++{
++	struct vkernel_acl_node *node;
++
++	node = vk_acl_node_get(&vk->acl, inode);
++	if (node) {
++		if ((mask & ~(node->mode) & (MAY_READ | MAY_WRITE | MAY_EXEC)) != 0) {
++			pr_err("vkernel: permission denied, pid %d mask 0x%x vmode 0x%x path %s\n",
++				current->pid, mask, node->mode, node->path);
++			return -EACCES;
++		}
++	}
++
++	return 0;
++}
++
++/*
++ * Note: some filesystems or inodes may define their own permission hook.
++ * In such cases, the vkernel permission check will be skipped.
++ */ ++int vk_generic_permission(struct vkernel *vk, struct mnt_idmap *idmap, ++ struct inode *inode, int mask) ++{ ++ int ret = 0; ++ ++ /* Activate acl at first check */ ++ if (unlikely(!vk->acl.active)) ++ vk_activate_acl_all(&vk->acl); ++ ++ if (inode->i_opflags & (IOP_VKERNEL_REG|IOP_VKERNEL_DIR)) ++ ret = vk_permission(vk, inode, mask); ++ ++ return ret; ++} ++ ++int vkernel_set_acl(struct vkernel_acl *acl, char *path, unsigned short mode) ++{ ++ struct vkernel_acl_node *node; ++ ++ pr_debug("set acl, path %s mode 0x%x\n", path, mode); ++ node = kmem_cache_alloc(acl_node_cache, GFP_KERNEL_ACCOUNT); ++ if (!node) { ++ pr_err("failed to alloc acl node\n"); ++ return -ENOMEM; ++ } ++ INIT_HLIST_NODE(&node->hash); ++ node->ino = 0; ++ node->sb = NULL; ++ memcpy(node->path, path, VKERNEL_PATH_MAX); ++ node->mode = mode; ++ list_add_tail(&node->link, &acl->nodes); ++ ++ if (acl->active) ++ return vk_activate_acl(acl, node); ++ ++ return 0; ++} ++EXPORT_SYMBOL(vkernel_set_acl); ++ ++int vkernel_clear_acl(struct vkernel_acl *acl, char *path) ++{ ++ struct vkernel_acl_node *node; ++ bool found = false; ++ ++ list_for_each_entry(node, &acl->nodes, link) { ++ if (!strncmp(node->path, path, VKERNEL_PATH_MAX)) { ++ found = true; ++ break; ++ } ++ } ++ if (!found) ++ return -EINVAL; ++ ++ if (!hlist_unhashed(&node->hash)) ++ vk_deactivate_acl(acl, node); ++ ++ list_del(&node->link); ++ kmem_cache_free(acl_node_cache, node); ++ ++ return 0; ++} ++EXPORT_SYMBOL(vkernel_clear_acl); ++ ++int vkernel_set_acl_set(struct vkernel_acl *acl, struct vkernel_file_desc_set *set) ++{ ++ u64 i; ++ int r; ++ ++ for (i = 0; i < set->nr_descs; i++) { ++ r = vkernel_set_acl(acl, set->descs[i].path, set->descs[i].mode); ++ if (r) ++ return r; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(vkernel_set_acl_set); ++ ++int vkernel_clear_acl_set(struct vkernel_acl *acl, struct vkernel_file_desc_set *set) ++{ ++ u64 i; ++ int r; ++ ++ for (i = 0; i < set->nr_descs; i++) { ++ r = vkernel_clear_acl(acl, set->descs[i].path); ++ if (r) ++ return r; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(vkernel_clear_acl_set); ++ ++int vkernel_set_default_acl_set(struct vkernel_acl *acl) ++{ ++ u64 i; ++ int r; ++ ++ for (i = 0; i < ARRAY_SIZE(def_path); i++) { ++ r = vkernel_set_acl(acl, def_path[i], def_mode[i]); ++ if (r) ++ return r; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(vkernel_set_default_acl_set); +diff --git a/drivers/vkernel/include/fs.h b/drivers/vkernel/include/fs.h +new file mode 100644 +index 000000000000..ce94c3274827 +--- /dev/null ++++ b/drivers/vkernel/include/fs.h +@@ -0,0 +1,18 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#ifndef _VKERNEL_FS_H ++#define _VKERNEL_FS_H ++ ++#include ++ ++int vk_acl_init(void); ++void vk_acl_uninit(void); ++ ++int vk_init_acl(struct vkernel_acl *acl, unsigned int bits); ++void vk_uninit_acl(struct vkernel_acl *acl); ++int vkernel_set_default_acl_set(struct vkernel_acl *acl); ++ ++int vk_generic_permission(struct vkernel *vk, struct mnt_idmap *idmap, ++ struct inode *inode, int mask); ++ ++#endif +diff --git a/drivers/vkernel/include/mm.h b/drivers/vkernel/include/mm.h +new file mode 100644 +index 000000000000..c2fdf89ba8eb +--- /dev/null ++++ b/drivers/vkernel/include/mm.h +@@ -0,0 +1,31 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#ifndef _VKERNEL_MM_H ++#define _VKERNEL_MM_H ++ ++#include ++ ++/* Copy from mm/shmem.c */ ++ ++#define SHMEM_HUGE_NEVER 0 ++#define SHMEM_HUGE_ALWAYS 1 ++#define SHMEM_HUGE_WITHIN_SIZE 2 ++#define SHMEM_HUGE_ADVISE 3 ++ ++/* ++ * Special values. 
++ * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled: ++ * ++ * SHMEM_HUGE_DENY: ++ * disables huge on shm_mnt and all mounts, for emergency use; ++ * SHMEM_HUGE_FORCE: ++ * enables huge on shm_mnt and all mounts, w/o needing option, for testing; ++ * ++ */ ++#define SHMEM_HUGE_DENY (-1) ++#define SHMEM_HUGE_FORCE (-2) ++ ++int vk_init_memory_pref(struct vkernel_mem_pref *mem); ++void vk_uninit_memory_pref(struct vkernel_mem_pref *mem); ++ ++#endif +diff --git a/drivers/vkernel/include/sched.h b/drivers/vkernel/include/sched.h +new file mode 100644 +index 000000000000..c273e3dea619 +--- /dev/null ++++ b/drivers/vkernel/include/sched.h +@@ -0,0 +1,11 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#ifndef _VKERNEL_SCHED_H ++#define _VKERNEL_SCHED_H ++ ++#include ++ ++int vk_init_cpu_pref(struct vkernel_cpu_pref *cpu); ++void vk_uninit_cpu_pref(struct vkernel_cpu_pref *cpu); ++ ++#endif +diff --git a/drivers/vkernel/include/security.h b/drivers/vkernel/include/security.h +new file mode 100644 +index 000000000000..0eac382ffd1b +--- /dev/null ++++ b/drivers/vkernel/include/security.h +@@ -0,0 +1,15 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#ifndef _VKERNEL_SECURITY_H ++#define _VKERNEL_SECURITY_H ++ ++#include ++ ++int vk_cap_init(void); ++void vk_cap_uninit(void); ++ ++int vk_cap_capable(struct vkernel *vk, const struct cred *cred, ++ struct user_namespace *targ_ns, ++ int cap, unsigned int opts); ++ ++#endif +diff --git a/drivers/vkernel/include/syscall.h b/drivers/vkernel/include/syscall.h +new file mode 100644 +index 000000000000..4bb75703f430 +--- /dev/null ++++ b/drivers/vkernel/include/syscall.h +@@ -0,0 +1,24 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#ifndef _VKERNEL_SYSCALL_H ++#define _VKERNEL_SYSCALL_H ++ ++#include ++ ++extern sys_call_vk_t *sys_call_table_ptr; ++ ++int vk_syscall_init(void); ++void vk_syscall_uninit(void); ++ ++long vk_sys_ni_syscall(const struct pt_regs *regs); ++long vk_sys_forbid_syscall(const struct pt_regs *regs); ++long vk_sys_ni_cond_syscall(const struct pt_regs *regs); ++long vk_sys_forbid_cond_syscall(const struct pt_regs *regs); ++ ++int vk_init_syscall(struct vkernel_syscall *syscall); ++void vk_uninit_syscall(struct vkernel_syscall *syscall); ++void vk_install_default_syscalls(struct vkernel_syscall *syscall); ++ ++extern struct vkernel_custom_type analysis_custom; ++ ++#endif +diff --git a/drivers/vkernel/include/sysctl.h b/drivers/vkernel/include/sysctl.h +new file mode 100644 +index 000000000000..7520ee7cbf74 +--- /dev/null ++++ b/drivers/vkernel/include/sysctl.h +@@ -0,0 +1,53 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#ifndef _VKERNEL_SYSCTL_H ++#define _VKERNEL_SYSCTL_H ++ ++#include ++#include ++ ++#define IPC_SEM_IDS 0 ++#define IPC_MSG_IDS 1 ++#define IPC_SHM_IDS 2 ++ ++/* defined at kernel/fork.c */ ++#define MIN_THREADS 20 ++#define MAX_THREADS FUTEX_TID_MASK ++ ++int vk_init_sysctl_fs(struct vkernel_sysctl_fs *fs); ++void vk_uninit_sysctl_fs(struct vkernel_sysctl_fs *fs); ++ ++int vk_init_sysctl_kernel(struct vkernel_sysctl_kernel *k); ++void vk_uninit_sysctl_kernel(struct vkernel_sysctl_kernel *k); ++ ++int vk_init_sysctl_net(struct vkernel_sysctl_net *net, struct task_struct *tsk); ++void vk_uninit_sysctl_net(struct vkernel_sysctl_net *net); ++ ++extern int (*tcp_set_default_congestion_control_ptr)(struct net *net, const char *name); ++ ++int devconf_proc(struct net *net, struct ipv4_devconf *conf, ++ int val, int i, int type); ++int devconf_forward(struct net *net, struct ipv4_devconf *conf, ++ 
int val, int i, int type); ++int devconf_flush(struct net *net, struct ipv4_devconf *conf, ++ int val, int i, int type); ++ ++int vk_init_sysctl_vm(struct vkernel_sysctl_vm *vm); ++void vk_uninit_sysctl_vm(struct vkernel_sysctl_vm *vm); ++ ++void vk_sync_overcommit_as(struct vkernel *vk); ++ ++int vkernel_set_sysctl_raw(struct vkernel *vk, char *buf); ++ ++/* Defined at ipc/util.h, MODIFIED */ ++static inline int sem_check_semmni(struct ipc_namespace *ns) ++{ ++ /* ++ * Check semmni range [0, ipc_mni] ++ * semmni is the last element of sem_ctls[4] array ++ */ ++ return ((ns->sem_ctls[3] < 0) || (ns->sem_ctls[3] > (1<<15))) ++ ? -ERANGE : 0; ++} ++ ++#endif +diff --git a/drivers/vkernel/include/utils.h b/drivers/vkernel/include/utils.h +new file mode 100644 +index 000000000000..9bcb29e144ca +--- /dev/null ++++ b/drivers/vkernel/include/utils.h +@@ -0,0 +1,11 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#ifndef _VKERNEL_UTILS_H ++#define _VKERNEL_UTILS_H ++ ++int vk_kallsyms_init(void); ++void vk_kallsyms_uninit(void); ++ ++unsigned long lookup_name(const char *name); ++ ++#endif +diff --git a/drivers/vkernel/mm/mm.c b/drivers/vkernel/mm/mm.c +new file mode 100644 +index 000000000000..6eee69ccf25a +--- /dev/null ++++ b/drivers/vkernel/mm/mm.c +@@ -0,0 +1,72 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Copyright (C) 2024 ARM Ltd. ++ * Author: Hang Huang ++ */ ++ ++#include "mm.h" ++ ++int vk_init_memory_pref(struct vkernel_mem_pref *mem) ++{ ++ mem->default_policy.refcnt = (atomic_t)ATOMIC_INIT(1); ++ mem->default_policy.mode = MPOL_LOCAL; ++ ++ mem->shmem_huge = SHMEM_HUGE_NEVER; ++ mem->thp_flags = ++#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS ++ (1<thp_flags; ++ ++ if (desc->numa_mode >= 0 && desc->numa_mode < MPOL_MAX) { ++ /* TODO: Setup all fields */ ++ // mem->default_policy.mode = desc->numa_mode; ++ pr_info("set default numa policy is not supported yet\n"); ++ } ++ ++ if (desc->shmem_enabled >= SHMEM_HUGE_FORCE && ++ desc->shmem_enabled <= SHMEM_HUGE_ADVISE) ++ mem->shmem_huge = desc->shmem_enabled; ++ ++ if (desc->thp_enabled > -1 && ++ desc->thp_enabled < TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG) { ++ clear_bit(TRANSPARENT_HUGEPAGE_FLAG, flags); ++ clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, flags); ++ if (desc->thp_enabled > TRANSPARENT_HUGEPAGE_UNSUPPORTED) ++ set_bit(desc->thp_enabled, flags); ++ } ++ ++ if (desc->thp_defrag > -1 && ++ desc->thp_defrag < TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG) { ++ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, flags); ++ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, flags); ++ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, flags); ++ clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, flags); ++ if (desc->thp_defrag > TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG) ++ set_bit(desc->thp_defrag, flags); ++ } ++ ++ if (desc->thp_use_zero_page == 0) ++ clear_bit(TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, flags); ++ else if (desc->thp_use_zero_page == 1) ++ set_bit(TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, flags); ++ ++ return 0; ++} +diff --git a/drivers/vkernel/sched/cpu.c b/drivers/vkernel/sched/cpu.c +new file mode 100644 +index 000000000000..16af7da43480 +--- /dev/null ++++ b/drivers/vkernel/sched/cpu.c +@@ -0,0 +1,36 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Copyright (C) 2024 ARM Ltd. 
++ * Author: Hang Huang ++ */ ++ ++#include "sched.h" ++ ++int vk_init_cpu_pref(struct vkernel_cpu_pref *cpu) ++{ ++ cpu->policy = SCHED_NORMAL; ++ cpu->rr_timeslice_us = 0; ++ cpu->wakeup_gran_us = 0; ++ ++ return 0; ++} ++ ++void vk_uninit_cpu_pref(struct vkernel_cpu_pref *cpu) ++{ ++ ++} ++ ++int vkernel_set_cpu_pref(struct vkernel *vk, struct vkernel_cpu_desc *desc) ++{ ++ if (desc->policy >= 0) ++ vk->cpu_pref.policy = desc->policy; ++ ++ if (desc->rr_timeslice_us > 0) ++ vk->cpu_pref.rr_timeslice_us = desc->rr_timeslice_us; ++ ++ if (desc->wakeup_gran_us > 0) ++ vk->cpu_pref.wakeup_gran_us = desc->wakeup_gran_us; ++ ++ return 0; ++} ++EXPORT_SYMBOL(vkernel_set_cpu_pref); +diff --git a/drivers/vkernel/security/capability.c b/drivers/vkernel/security/capability.c +new file mode 100644 +index 000000000000..2b07101f260b +--- /dev/null ++++ b/drivers/vkernel/security/capability.c +@@ -0,0 +1,111 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Copyright (C) 2024 ARM Ltd. ++ * Author: Hang Huang ++ */ ++ ++#include ++#include ++ ++#include "security.h" ++#include "utils.h" ++ ++int (*cap_capget_ptr)(struct task_struct *target, kernel_cap_t *effective, ++ kernel_cap_t *inheritable, kernel_cap_t *permitted); ++int (*cap_capset_ptr)(struct cred *new, const struct cred *old, ++ const kernel_cap_t *effective, ++ const kernel_cap_t *inheritable, ++ const kernel_cap_t *permitted); ++int (*cap_task_prctl_ptr)(int option, unsigned long arg2, unsigned long arg3, ++ unsigned long arg4, unsigned long arg5); ++ ++int vk_cap_init(void) ++{ ++ cap_capget_ptr = (void *)lookup_name("cap_capget"); ++ cap_capset_ptr = (void *)lookup_name("cap_capset"); ++ cap_task_prctl_ptr = (void *)lookup_name("cap_task_prctl"); ++ if (!cap_capget_ptr || !cap_capset_ptr || !cap_task_prctl_ptr) { ++ pr_err("failed to find cap symbols, get: %p, set: %p, prctl: %p\n", ++ cap_capget_ptr, cap_capset_ptr, cap_task_prctl_ptr); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++void vk_cap_uninit(void) {} ++ ++int vk_cap_capable(struct vkernel *vk, const struct cred *cred, struct user_namespace *ns, ++ int cap, unsigned int opts) ++{ ++ /* Check cred and real_cred to allow fs overried_creds */ ++ if (current_cred() == current_real_cred() && ++ !cap_issubset(cred->cap_effective, vk->linux_cap.effective)) { ++ pr_debug("vkernel: cap eff %llx escalated? use vk eff %llx instead\n", ++ cred->cap_effective.val, vk->linux_cap.effective.val); ++ for (;;) { ++ if (ns == cred->user_ns) ++ return cap_raised(vk->linux_cap.effective, cap) ? 0 : -EPERM; ++ if (ns->level <= cred->user_ns->level) ++ return -EPERM; ++ if ((ns->parent == cred->user_ns) && uid_eq(ns->owner, cred->euid)) ++ return 0; ++ ns = ns->parent; ++ } ++ } ++ return 0; ++} ++ ++/* ++ * Set cap for `current`, and `current` should be vk->init_process ++ * ++ * Note: this operation will take effect immediately. 
++ */
++int vkernel_set_linux_cap(struct vkernel *vk, struct vkernel_linux_cap *cap)
++{
++	kernel_cap_t effective, inheritable, permitted;
++	struct cred *cred;
++	int action;
++	int ret;
++	int i;
++
++	vk->linux_cap = *cap;
++
++	/* Get current [effective,inheritable,permitted] */
++	cap_capget_ptr(vk->init_process, &effective, &inheritable, &permitted);
++
++	/* Drop bset according to linux_cap, which affects the following capset */
++	if (cap_raised(effective, CAP_SETPCAP)) {
++		for (i = 0; i <= CAP_LAST_CAP; i++) {
++			if (!cap_raised(cap->bset, i)) {
++				ret = cap_task_prctl_ptr(PR_CAPBSET_DROP, i, 0, 0, 0);
++				if (ret)
++					return ret;
++			}
++		}
++	}
++
++	/* Set current [effective,inheritable,permitted], ambient is automatically updated */
++	cred = prepare_creds();
++	if (!cred)
++		return -ENOMEM;
++	ret = cap_capset_ptr(cred, current_cred(), &cap->effective, &cap->inheritable,
++		&cap->permitted);
++	if (ret)
++		return ret;
++	commit_creds(cred);
++
++	/* Raise or lower ambient according to linux_cap */
++	for (i = 0; i <= CAP_LAST_CAP; i++) {
++		if (cap_raised(cap->ambient, i))
++			action = PR_CAP_AMBIENT_RAISE;
++		else
++			action = PR_CAP_AMBIENT_LOWER;
++		ret = cap_task_prctl_ptr(PR_CAP_AMBIENT, action, i, 0, 0);
++		if (ret)
++			return ret;
++	}
++
++	return 0;
++}
++EXPORT_SYMBOL(vkernel_set_linux_cap);
+diff --git a/drivers/vkernel/syscall.c b/drivers/vkernel/syscall.c
+new file mode 100644
+index 000000000000..0aa03db71dcc
+--- /dev/null
++++ b/drivers/vkernel/syscall.c
+@@ -0,0 +1,636 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Copyright (C) 2024 ARM Ltd.
++ * Author: Hang Huang
++ */
++
++#include
++#include
++#include
++#include
++
++#include "syscall.h"
++#include "utils.h"
++
++sys_call_vk_t *sys_call_table_ptr;
++
++int (*force_sig_seccomp_ptr)(int syscall, int reason, bool force_coredump);
++void (*do_exit_ptr)(long code);
++
++#define NOTIF_SYSCALL_RULE(name) \
++{ \
++	.nr = __NR_##name, \
++	.act = (VKERNEL_SYSCALL_ACT_ERRNO << VKERNEL_SYSCALL_ERRNO_BITS) | ENOSYS, \
++} \
++
++static struct vkernel_syscall_rule_desc def_rules[] = {
++	NOTIF_SYSCALL_RULE(move_pages),
++	NOTIF_SYSCALL_RULE(fsconfig),
++	NOTIF_SYSCALL_RULE(kexec_load),
++	// NOTIF_SYSCALL_RULE(sysfs),
++	NOTIF_SYSCALL_RULE(fsopen),
++	NOTIF_SYSCALL_RULE(pkey_mprotect),
++	// NOTIF_SYSCALL_RULE(ustat),
++	NOTIF_SYSCALL_RULE(pkey_free),
++	NOTIF_SYSCALL_RULE(pkey_alloc),
++	NOTIF_SYSCALL_RULE(userfaultfd),
++	NOTIF_SYSCALL_RULE(migrate_pages),
++	NOTIF_SYSCALL_RULE(add_key),
++	NOTIF_SYSCALL_RULE(keyctl),
++	NOTIF_SYSCALL_RULE(clone3),
++	NOTIF_SYSCALL_RULE(kexec_file_load),
++	NOTIF_SYSCALL_RULE(swapoff),
++	NOTIF_SYSCALL_RULE(fsmount),
++	NOTIF_SYSCALL_RULE(open_tree),
++	// NOTIF_SYSCALL_RULE(_sysctl),
++	NOTIF_SYSCALL_RULE(move_mount),
++	NOTIF_SYSCALL_RULE(swapon),
++	NOTIF_SYSCALL_RULE(pivot_root),
++	NOTIF_SYSCALL_RULE(fspick),
++};
++
++static struct kmem_cache *syscall_rule_cache;
++
++int vk_syscall_init(void)
++{
++	sys_call_table_ptr = (void *)lookup_name("sys_call_table");
++	if (!sys_call_table_ptr) {
++		pr_err("failed to find sys_call_table\n");
++		return -1;
++	}
++
++	force_sig_seccomp_ptr = (void *)lookup_name("force_sig_seccomp");
++	if (!force_sig_seccomp_ptr) {
++		pr_err("failed to find force_sig_seccomp\n");
++		return -1;
++	}
++
++	do_exit_ptr = (void *)lookup_name("do_exit");
++	if (!do_exit_ptr) {
++		pr_err("failed to find do_exit\n");
++		return -1;
++	}
++
++	syscall_rule_cache = kmem_cache_create("vkernel_syscall_rule",
++		sizeof(struct vkernel_syscall_rule), 0,
SLAB_ACCOUNT, NULL); ++ if (!syscall_rule_cache) { ++ pr_err("failed to create slab for syscall rule\n"); ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ ++void vk_syscall_uninit(void) ++{ ++ kmem_cache_destroy(syscall_rule_cache); ++} ++ ++static inline bool check_cond(int op, unsigned long arg, ++ unsigned long oprand1, unsigned long oprand2) ++{ ++ switch (op) { ++ case VKERNEL_SYSCALL_CMP_EQ: ++ return arg == oprand1; ++ case VKERNEL_SYSCALL_CMP_NE: ++ return arg != oprand1; ++ case VKERNEL_SYSCALL_CMP_LT: ++ return arg < oprand1; ++ case VKERNEL_SYSCALL_CMP_LE: ++ return arg <= oprand1; ++ case VKERNEL_SYSCALL_CMP_GT: ++ return arg > oprand1; ++ case VKERNEL_SYSCALL_CMP_GE: ++ return arg >= oprand1; ++ case VKRENEL_SYSCALL_CMP_ME: ++ return (arg & oprand1) == oprand2; ++ } ++ ++ return false; ++} ++ ++ ++static bool check_rule(struct vkernel_syscall_rule *rule, struct pt_regs *regs) ++{ ++ struct vkernel_syscall_cond *cond; ++ unsigned long args[6]; ++ int i; ++ ++ /* Corner case */ ++ if (!rule) ++ return true; ++ ++ syscall_get_arguments(current, regs, args); ++ for (i = 0; i < 6; i++) { ++ cond = &rule->conds[i]; ++ if (cond->op == VKERNEL_SYSCALL_CMP_ED) ++ break; ++ if (!check_cond(cond->op, args[cond->index], cond->oprand1, cond->oprand2)) ++ return false; ++ } ++ ++ return true; ++} ++ ++asmlinkage long vk_sys_act_cond(const struct pt_regs *regs) ++{ ++ struct vkernel *vk; ++ struct vkernel_syscall_rule *rule; ++ struct pt_regs *curr_regs; ++ int nr; ++ unsigned int act; ++ ++ curr_regs = current_pt_regs(); ++ nr = syscall_get_nr(current, curr_regs); ++ if (likely(current_vk_task == current)) ++ vk = current_vk; ++ else ++ vk = vkernel_find_vk_by_task(current); ++ ++ act = vk->syscall.def_act; ++ list_for_each_entry(rule, &vk->syscall.rule_chains[nr], link) { ++ if (check_rule(rule, curr_regs)) { ++ act = rule->act; ++ break; ++ } ++ } ++ ++ switch (act >> VKERNEL_SYSCALL_ERRNO_BITS) { ++ case VKERNEL_SYSCALL_ACT_TRAP: ++ pr_info("vkernel: cond trap for syscall %d\n", nr); ++ syscall_rollback(current, curr_regs); ++ force_sig_seccomp_ptr(nr, -EPERM, false); ++ fallthrough; ++ case VKERNEL_SYSCALL_ACT_ERRNO: ++ pr_info("vkernel: cond err for syscall %d\n", nr); ++ return -(act & VKERNEL_SYSCALL_ERRNO_MASK); ++ ++ case VKERNEL_SYSCALL_ACT_USER_NOTIF: ++ pr_info("vkernel: cond user notif (nosys) for syscall %d\n", nr); ++ return -ENOSYS; ++ ++ case VKERNEL_SYSCALL_ACT_TRACE: ++ pr_info("vkernel: cond trace (nosys) for syscall %d\n", nr); ++ return -ENOSYS; ++ ++ case VKERNEL_SYSCALL_ACT_LOG: ++ pr_info("vkernel: cond log for syscall %d\n", nr); ++ fallthrough; ++ case VKERNEL_SYSCALL_ACT_ALLOW: ++ return sys_call_table_ptr[nr](regs); ++ ++ case VKERNEL_SYSCALL_ACT_KILL_PROCESS: ++ case VKERNEL_SYSCALL_ACT_KILL_THREAD: ++ default: ++ pr_info("vkernel: cond kill process/thread for syscall %d\n", nr); ++ if ((act >> VKERNEL_SYSCALL_ERRNO_BITS) != SECCOMP_RET_KILL_THREAD || ++ (atomic_read(¤t->signal->live) == 1)) { ++ /* Show the original registers in the dump. 
*/ ++ syscall_rollback(current, curr_regs); ++ /* Trigger a coredump with SIGSYS */ ++ force_sig_seccomp_ptr(nr, -EPERM, true); ++ } else { ++ /* Call do_exit since there is missing unified pt_reg api */ ++ do_exit_ptr(SIGSYS); ++ } ++ return -1; ++ } ++ ++ /* We never get here */ ++ unreachable(); ++ ++ return -1; ++} ++ ++asmlinkage long vk_sys_act_invalid(const struct pt_regs *regs) ++{ ++ pr_info("invalid syscall, never get here\n"); ++ return -ENOSYS; ++} ++ ++asmlinkage long vk_sys_act_kill_process(const struct pt_regs *regs) ++{ ++ struct pt_regs *curr_regs; ++ int nr; ++ ++ curr_regs = current_pt_regs(); ++ nr = syscall_get_nr(current, curr_regs); ++ pr_info("vkernel: kill process for syscall %d\n", nr); ++ syscall_rollback(current, curr_regs); ++ force_sig_seccomp_ptr(nr, -EPERM, true); ++ ++ return -1; ++} ++ ++asmlinkage long vk_sys_act_kill_thread(const struct pt_regs *regs) ++{ ++ struct pt_regs *curr_regs; ++ int nr; ++ ++ curr_regs = current_pt_regs(); ++ nr = syscall_get_nr(current, curr_regs); ++ pr_info("vkernel: kill thread for syscall %d\n", nr); ++ if ((atomic_read(¤t->signal->live) == 1)) { ++ syscall_rollback(current, current_pt_regs()); ++ force_sig_seccomp_ptr(nr, -EPERM, true); ++ } else { ++ /* Call do_exit since there is missing unified pt_reg api */ ++ do_exit_ptr(SIGSYS); ++ } ++ ++ return -1; ++} ++ ++asmlinkage long vk_sys_act_trap(const struct pt_regs *regs) ++{ ++ struct pt_regs *curr_regs; ++ int nr; ++ ++ curr_regs = current_pt_regs(); ++ nr = syscall_get_nr(current, curr_regs); ++ pr_info("vkernel: trap for syscall %d\n", nr); ++ syscall_rollback(current, curr_regs); ++ force_sig_seccomp_ptr(nr, -EPERM, false); ++ ++ return -1; ++} ++ ++asmlinkage long vk_sys_act_user_notif(const struct pt_regs *regs) ++{ ++ pr_err("vkernel: user notif for syscall nr %d\n", ++ syscall_get_nr(current, current_pt_regs())); ++ return -ENOSYS; ++} ++ ++asmlinkage long vk_sys_act_trace(const struct pt_regs *regs) ++{ ++ pr_err("vkernel: trace for syscall nr %d\n", ++ syscall_get_nr(current, current_pt_regs())); ++ return -ENOSYS; ++} ++ ++asmlinkage long vk_sys_act_errno(const struct pt_regs *regs) ++{ ++ struct vkernel *vk; ++ struct vkernel_syscall_rule *rule; ++ struct pt_regs *curr_regs; ++ int nr; ++ int errno; ++ ++ if (likely(current_vk_task == current)) ++ vk = current_vk; ++ else ++ vk = vkernel_find_vk_by_task(current); ++ curr_regs = current_pt_regs(); ++ nr = syscall_get_nr(current, curr_regs); ++ if (list_empty(&vk->syscall.rule_chains[nr])) ++ errno = vk->syscall.def_act & 0xffff; ++ else { ++ rule = list_first_entry(&vk->syscall.rule_chains[nr], ++ struct vkernel_syscall_rule, link); ++ errno = rule->act & VKERNEL_SYSCALL_ERRNO_MASK; ++ } ++ ++ pr_err("vkernel: err for syscall nr %d errno -%d\n", nr, errno); ++ return -errno; ++} ++ ++asmlinkage long vk_sys_act_log(const struct pt_regs *regs) ++{ ++ int nr; ++ ++ nr = syscall_get_nr(current, current_pt_regs()); ++ pr_info("vkernel: log for syscall %d\n", nr); ++ ++ return sys_call_table_ptr[nr](regs); ++} ++ ++static void clear_syscall_rule_chain(struct list_head *chain) ++{ ++ struct vkernel_syscall_rule *rule; ++ struct vkernel_syscall_rule *tmp; ++ ++ list_for_each_entry_safe(rule, tmp, chain, link) { ++ list_del(&rule->link); ++ kmem_cache_free(syscall_rule_cache, rule); ++ } ++ INIT_LIST_HEAD(chain); ++} ++ ++int vk_init_syscall(struct vkernel_syscall *syscall) ++{ ++ int i; ++ ++ for (i = 0; i < NR_syscalls; i++) { ++ syscall->table[i] = sys_call_table_ptr[i]; ++ INIT_LIST_HEAD(&syscall->rule_chains[i]); 
++ } ++ syscall->def_act = VKERNEL_SYSCALL_ACT_ALLOW << VKERNEL_SYSCALL_ERRNO_BITS; ++ ++ return 0; ++} ++ ++void vk_uninit_syscall(struct vkernel_syscall *syscall) ++{ ++ int i; ++ ++ for (i = 0; i < NR_syscalls; i++) ++ clear_syscall_rule_chain(&syscall->rule_chains[i]); ++} ++ ++int vkernel_set_syscall(struct vkernel_syscall *syscall, unsigned int nr, ++ sys_call_vk_t call) ++{ ++ if (unlikely(nr >= NR_syscalls)) ++ return -EINVAL; ++ ++ clear_syscall_rule_chain(&syscall->rule_chains[nr]); ++ syscall->table[nr] = call; ++ ++ return 0; ++} ++EXPORT_SYMBOL(vkernel_set_syscall); ++ ++static sys_call_vk_t uncond_table[] = { ++ [VKERNEL_SYSCALL_ACT_INVALID] = vk_sys_act_invalid, ++ [VKERNEL_SYSCALL_ACT_KILL_PROCESS] = vk_sys_act_kill_process, ++ [VKERNEL_SYSCALL_ACT_KILL_THREAD] = vk_sys_act_kill_thread, ++ [VKERNEL_SYSCALL_ACT_TRAP] = vk_sys_act_trap, ++ [VKERNEL_SYSCALL_ACT_ERRNO] = vk_sys_act_errno, ++ [VKERNEL_SYSCALL_ACT_USER_NOTIF] = vk_sys_act_user_notif, ++ [VKERNEL_SYSCALL_ACT_TRACE] = vk_sys_act_trace, ++ [VKERNEL_SYSCALL_ACT_LOG] = vk_sys_act_log, ++}; ++ ++/* ++ * Call before adding rules ++ */ ++int vkernel_set_default_syscall_rule(struct vkernel_syscall *syscall, u32 act) ++{ ++ unsigned int action; ++ int i; ++ ++ action = act >> VKERNEL_SYSCALL_ERRNO_BITS; ++ if (action == VKERNEL_SYSCALL_ACT_INVALID || ++ action > VKERNEL_SYSCALL_ACT_ALLOW || ++ act == syscall->def_act) { ++ pr_err("invalid default rule, act 0x%x, old 0x%x\n", act, syscall->def_act); ++ return -EINVAL; ++ } ++ ++ for (i = 0; i < NR_syscalls; i++) { ++ clear_syscall_rule_chain(&syscall->rule_chains[i]); ++ if (action < VKERNEL_SYSCALL_ACT_ALLOW) ++ syscall->table[i] = uncond_table[action]; ++ else ++ syscall->table[i] = sys_call_table_ptr[i]; ++ } ++ syscall->def_act = act; ++ ++ return 0; ++} ++EXPORT_SYMBOL(vkernel_set_default_syscall_rule); ++ ++int vkernel_add_syscall_rule(struct vkernel_syscall *syscall, ++ struct vkernel_syscall_rule_desc *desc) ++{ ++ struct vkernel_syscall_rule *rule; ++ unsigned int nr; ++ unsigned int action; ++ int index; ++ ++ pr_debug("set syscall rule, nr %u act 0x%x has_cond %d\n", ++ desc->nr, desc->act, desc->conds[0].op != VKERNEL_SYSCALL_CMP_ED); ++ ++ nr = desc->nr; ++ action = (desc->act >> VKERNEL_SYSCALL_ERRNO_BITS); ++ if (nr >= NR_syscalls || ++ action == VKERNEL_SYSCALL_ACT_INVALID || ++ action > VKERNEL_SYSCALL_ACT_ALLOW || ++ (desc->act == syscall->def_act && list_empty(&syscall->rule_chains[nr]))) { ++ pr_err("invalid rule, nr %u act 0x%x def_act 0x%x\n", ++ desc->nr, desc->act, syscall->def_act); ++ return -EINVAL; ++ } ++ ++ /* Update syscall rule chain */ ++ rule = kmem_cache_alloc(syscall_rule_cache, GFP_KERNEL_ACCOUNT); ++ if (!rule) { ++ pr_err("failed to alloc syscall rule\n"); ++ return -ENOMEM; ++ } ++ ++ rule->act = desc->act; ++ for (index = 0; index < 6; index++) { ++ rule->conds[index] = desc->conds[index]; ++ if (desc->conds[index].op == VKERNEL_SYSCALL_CMP_ED) ++ break; ++ } ++ list_add(&rule->link, &syscall->rule_chains[nr]); ++ ++ /* Update syscall table */ ++ if (index > 0) ++ syscall->table[nr] = vk_sys_act_cond; ++ else if (action < VKERNEL_SYSCALL_ACT_ALLOW) ++ syscall->table[nr] = uncond_table[action]; ++ else ++ syscall->table[nr] = sys_call_table_ptr[nr]; ++ ++ return 0; ++} ++EXPORT_SYMBOL(vkernel_add_syscall_rule); ++ ++void vk_install_default_syscalls(struct vkernel_syscall *syscall) ++{ ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(def_rules); i++) ++ vkernel_add_syscall_rule(syscall, &def_rules[i]); ++} 
++EXPORT_SYMBOL(vk_install_default_syscalls); ++ ++ ++struct vkernel_analysis { ++ unsigned int syscalls[NR_syscalls + 1]; ++ unsigned int exec_count; ++ unsigned int exec_capacity; ++ char *execs[]; ++}; ++ ++asmlinkage long vk_sys_act_analysis(const struct pt_regs *regs) ++{ ++ struct vkernel *vk; ++ struct vkernel_analysis *data; ++ struct vkernel_analysis *newdata; ++ char __user *uname; ++ char *kname; ++ struct pt_regs *curr_regs; ++ int nr; ++ ++ if (likely(current_vk_task == current)) ++ vk = current_vk; ++ else ++ vk = vkernel_find_vk_by_task(current); ++ data = (struct vkernel_analysis *)vk->private; ++ curr_regs = current_pt_regs(); ++ nr = syscall_get_nr(current, curr_regs); ++ if (data->syscalls[nr] < UINT_MAX) ++ data->syscalls[nr]++; ++ if (nr == __NR_execve || nr == __NR_execveat) { ++ kname = __getname(); ++ if (unlikely(!kname)) { ++ pr_err("failed to alloc name\n"); ++ return -ENOMEM; ++ } ++ if (nr == __NR_execve) ++ uname = (char __user *)regs_get_kernel_argument(curr_regs, 0); ++ else ++ uname = (char __user *)regs_get_kernel_argument(curr_regs, 1); ++ if (strncpy_from_user(kname, uname, PATH_MAX) < 0) { ++ pr_err("failed to copy user filename\n"); ++ __putname(kname); ++ return -EFAULT; ++ } ++ if (data->exec_count >= data->exec_capacity) { ++ newdata = kzalloc(sizeof(*data) + ++ sizeof(char *) * (data->exec_capacity << 1), GFP_KERNEL); ++ if (!newdata) ++ return -ENOMEM; ++ memcpy(newdata, data, sizeof(*data) + sizeof(char *) * data->exec_capacity); ++ newdata->exec_capacity <<= 1; ++ ++ vk->private = newdata; ++ /* TODO: fix race window */ ++ while (refcount_read(&vk->users_count) > 1) ++ ; ++ kfree(data); ++ data = newdata; ++ } ++ data->execs[data->exec_count++] = kname; ++ } ++ ++ return sys_call_table_ptr[nr](regs); ++} ++ ++static int analysis_show(struct seq_file *m, void *v) ++{ ++ struct vkernel *vk = m->private; ++ struct vkernel_analysis *data = vk->private; ++ unsigned int i; ++ bool first; ++ ++ seq_puts(m, "{\n"); ++ seq_puts(m, " \"syscalls\": ["); ++ first = true; ++ for (i = 0; i < NR_syscalls; i++) { ++ if (!data->syscalls[i]) ++ continue; ++ if (first) { ++ seq_printf(m, "%u", i); ++ first = false; ++ } else ++ seq_printf(m, ", %u", i); ++ } ++ seq_puts(m, "],\n"); ++ seq_puts(m, " \"execs\": [\n"); ++ first = true; ++ for (i = 0; i < data->exec_count; i++) { ++ if (unlikely(!data->execs[i])) { ++ pr_warn("encounter nil exec path in vkernel_analysis\n"); ++ continue; ++ } ++ if (first) { ++ seq_printf(m, " \"%s\"", data->execs[i]); ++ first = false; ++ } else ++ seq_printf(m, ",\n \"%s\"", data->execs[i]); ++ } ++ seq_puts(m, "\n ],\n"); ++ seq_puts(m, " \"syscall_details\": [\n"); ++ first = true; ++ for (i = 0; i < NR_syscalls; i++) { ++ if (!data->syscalls[i]) ++ continue; ++ if (first) { ++ seq_printf(m, " {\"nr\": %u, \"count\": %u}", i, data->syscalls[i]); ++ first = false; ++ } else ++ seq_printf(m, ",\n {\"nr\": %u, \"count\": %u}", i, data->syscalls[i]); ++ } ++ seq_puts(m, "\n ]\n"); ++ seq_puts(m, "}\n"); ++ ++ return 0; ++} ++ ++static int analysis_open(struct inode *inode, struct file *file) ++{ ++ struct vkernel *vk = inode->i_private; ++ int r; ++ ++ if (!vkernel_get_vk_safe(vk)) ++ return -ENOENT; ++ ++ r = single_open(file, analysis_show, inode->i_private); ++ if (r < 0) ++ vkernel_put_vk(vk); ++ ++ return r; ++} ++ ++static int analysis_release(struct inode *inode, struct file *file) ++{ ++ struct vkernel *vk = inode->i_private; ++ ++ vkernel_put_vk(vk); ++ ++ return single_release(inode, file); ++} ++ ++static const struct 
file_operations analysis_fops = {
++	.open = analysis_open,
++	.release = analysis_release,
++	.read = seq_read,
++	.llseek = seq_lseek,
++};
++
++static int analysis_post_create(struct vkernel *vk)
++{
++	struct vkernel_analysis *data;
++	struct vkernel_syscall *syscall;
++	int i;
++
++	data = kzalloc(sizeof(*data) + sizeof(char *) * 64, GFP_KERNEL);
++	if (!data)
++		return -ENOMEM;
++	data->exec_capacity = 64;
++	vk->private = data;
++
++	syscall = &vk->syscall;
++	for (i = 0; i < NR_syscalls; i++)
++		syscall->table[i] = vk_sys_act_analysis;
++
++	debugfs_create_file("analysis", 0444, vk->debugfs_dentry, vk, &analysis_fops);
++
++	return 0;
++}
++
++static void analysis_pre_destroy(struct vkernel *vk)
++{
++	struct vkernel_analysis *data = (struct vkernel_analysis *)vk->private;
++
++	if (unlikely(!data)) {
++		pr_warn("destroy an analysis vk without vkernel_analysis data\n");
++		return;
++	}
++
++	kfree(data);
++	vk->private = NULL;
++}
++
++struct vkernel_custom_type analysis_custom = {
++	.owner = THIS_MODULE,
++	.name = "analysis",
++	.post_create = analysis_post_create,
++	.pre_destroy = analysis_pre_destroy,
++};
+diff --git a/drivers/vkernel/sysctl/fs.c b/drivers/vkernel/sysctl/fs.c
+new file mode 100644
+index 000000000000..d57ebae8cfb3
+--- /dev/null
++++ b/drivers/vkernel/sysctl/fs.c
+@@ -0,0 +1,73 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Copyright (C) 2024 ARM Ltd.
++ * Author: Hang Huang
++ */
++
++#include
++#include
++
++#include "sysctl.h"
++
++int vk_init_sysctl_fs(struct vkernel_sysctl_fs *fs)
++{
++	unsigned long n;
++	unsigned long nr_pages = totalram_pages();
++	unsigned long memreserve = (nr_pages - nr_free_pages()) * 3/2;
++
++	memreserve = min(memreserve, nr_pages - 1);
++	n = ((nr_pages - memreserve) * (PAGE_SIZE / 1024)) / 10;
++	fs->files_stat.max_files = max_t(unsigned long, n, NR_FILE);
++	fs->nr_open = 1024 * 1024;
++	if (percpu_counter_init(&fs->nr_files, 0, GFP_KERNEL)) {
++		pr_err("vkernel: failed to init sysctl_fs nr_files\n");
++		return -ENOMEM;
++	}
++
++	fs->nr_inodes = alloc_percpu_gfp(unsigned long, GFP_KERNEL);
++	if (!fs->nr_inodes) {
++		pr_err("vkernel: failed to alloc sysctl_fs nr_inodes\n");
++		return -ENOMEM;
++	}
++	fs->nr_unused = alloc_percpu_gfp(unsigned long, GFP_KERNEL);
++	if (!fs->nr_unused) {
++		pr_err("vkernel: failed to alloc sysctl_fs nr_unused\n");
++		return -ENOMEM;
++	}
++
++	fs->leases_enable = 1;
++	fs->lease_break_time = 45;
++
++	fs->mount_max = 100000;
++
++	return 0;
++}
++
++void vk_uninit_sysctl_fs(struct vkernel_sysctl_fs *fs)
++{
++	if (fs->nr_inodes)
++		free_percpu(fs->nr_inodes);
++	if (fs->nr_unused)
++		free_percpu(fs->nr_unused);
++
++	percpu_counter_destroy(&fs->nr_files);
++}
++
++int vkernel_set_sysctl_fs(struct vkernel_sysctl_fs *fs, struct vkernel_sysctl_fs_desc *desc)
++{
++	if (desc->file_max)
++		fs->files_stat.max_files = desc->file_max;
++	if (desc->nr_open)
++		fs->nr_open = desc->nr_open;
++
++	if (desc->leases_enable == 0 || desc->leases_enable == 1)
++		fs->leases_enable = desc->leases_enable;
++	if (desc->lease_break_time > 0)
++		fs->lease_break_time = desc->lease_break_time;
++
++	if (desc->mount_max)
++		fs->mount_max = desc->mount_max;
++
++	return 0;
++}
++EXPORT_SYMBOL(vkernel_set_sysctl_fs);
+diff --git a/drivers/vkernel/sysctl/kernel.c b/drivers/vkernel/sysctl/kernel.c
+new file mode 100644
+index 000000000000..5690e565a5b7
+--- /dev/null
++++ b/drivers/vkernel/sysctl/kernel.c
+@@ -0,0 +1,112 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Copyright (C) 2024 ARM Ltd.
++ * Author: Hang Huang ++ */ ++ ++#include ++#include ++#include ++ ++#include "sysctl.h" ++ ++int vk_init_sysctl_kernel(struct vkernel_sysctl_kernel *k) ++{ ++ u64 threads; ++ unsigned long nr_pages = totalram_pages(); ++ ++ k->nb_mode = NUMA_BALANCING_DISABLED; ++ k->nb_promote_rate_limit = 65536; ++ ++ k->sched_cfs_bandwidth_slice = 5000UL; ++ k->sched_child_runs_first = 0; ++ ++ k->sched_dl_period_max = 1 << 22; /* ~4 seconds */ ++ k->sched_dl_period_min = 100; /* 100 us */ ++ ++ k->sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; ++ k->sched_rt_period = 1000000; ++ k->sched_rt_runtime = 950000; ++ ++ /* ++ * The number of threads shall be limited such that the thread ++ * structures may only consume a small part of the available memory. ++ */ ++ if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64) ++ threads = MAX_THREADS; ++ else ++ threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE, ++ (u64) THREAD_SIZE * 8UL); ++ if (threads > MAX_THREADS) ++ threads = MAX_THREADS; ++ k->nr_threads = 0; ++ k->max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS); ++ ++ k->key_gc_delay = 5 * 60; ++ k->persistent_keyring_expiry = 3 * 24 * 3600; /* Expire after 3 days of non-use */ ++ k->key_quota_root_maxbytes = 25000000; ++ k->key_quota_root_maxkeys = 1000000; ++ k->key_quota_maxbytes = 20000; ++ k->key_quota_maxkeys = 200; ++ ++ k->pty_limit = NR_UNIX98_PTY_DEFAULT; ++ k->pty_reserve = NR_UNIX98_PTY_RESERVE; ++ k->pty_count = (atomic_t)ATOMIC_INIT(0); ++ ++ return 0; ++} ++ ++void vk_uninit_sysctl_kernel(struct vkernel_sysctl_kernel *k) ++{ ++ ++} ++ ++int vkernel_set_sysctl_kernel(struct vkernel_sysctl_kernel *k, ++ struct vkernel_sysctl_kernel_desc *desc) ++{ ++ if (desc->numa_balancing >= 0) ++ k->nb_mode = desc->numa_balancing; ++ if (desc->numa_balancing_promote_rate_limit > 0) ++ k->nb_promote_rate_limit = desc->numa_balancing_promote_rate_limit; ++ ++ if (desc->sched_cfs_bandwidth_slice) ++ k->sched_cfs_bandwidth_slice = desc->sched_cfs_bandwidth_slice; ++ if (desc->sched_child_runs_first == 0 || desc->sched_child_runs_first == 1) ++ k->sched_child_runs_first = desc->sched_child_runs_first; ++ ++ if (desc->sched_dl_period_max) ++ k->sched_dl_period_max = desc->sched_dl_period_max; ++ if (desc->sched_dl_period_min) ++ k->sched_dl_period_min = desc->sched_dl_period_min; ++ ++ if (desc->sched_rr_timeslice > 0) ++ k->sched_rr_timeslice = desc->sched_rr_timeslice; ++ if (desc->sched_rt_period > 0) ++ k->sched_rt_period = desc->sched_rt_period; ++ if (desc->sched_rt_runtime > 0) ++ k->sched_rt_runtime = desc->sched_rt_runtime; ++ ++ if (desc->max_threads > 0) ++ k->max_threads = clamp_t(u64, desc->max_threads, MIN_THREADS, MAX_THREADS); ++ ++ if (desc->key_gc_delay) ++ k->key_gc_delay = desc->key_gc_delay; ++ if (desc->key_persistent_keyring_expiry) ++ k->persistent_keyring_expiry = desc->key_persistent_keyring_expiry; ++ if (desc->key_quota_root_maxbytes) ++ k->key_quota_root_maxbytes = desc->key_quota_root_maxbytes; ++ if (desc->key_quota_root_maxkeys) ++ k->key_quota_root_maxkeys = desc->key_quota_root_maxkeys; ++ if (desc->key_quota_maxbytes) ++ k->key_quota_maxbytes = desc->key_quota_maxbytes; ++ if (desc->key_quota_maxkeys) ++ k->key_quota_maxkeys = desc->key_quota_maxkeys; ++ ++ if (desc->pty_limit > 0) ++ k->pty_limit = desc->pty_limit; ++ if (desc->pty_reserve > 0) ++ k->pty_reserve = desc->pty_reserve; ++ ++ return 0; ++} ++EXPORT_SYMBOL(vkernel_set_sysctl_kernel); +diff --git a/drivers/vkernel/sysctl/net.c b/drivers/vkernel/sysctl/net.c +new file mode 100644 +index 
000000000000..deaff3d13d16 +--- /dev/null ++++ b/drivers/vkernel/sysctl/net.c +@@ -0,0 +1,416 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Copyright (C) 2024 ARM Ltd. ++ * Author: Hang Huang ++ */ ++ ++#include ++#include ++ ++#include "sysctl.h" ++#include "utils.h" ++ ++int (*tcp_set_default_congestion_control_ptr)(struct net *net, const char *name); ++void (*rt_cache_flush_ptr)(struct net *net); ++void (*inet_netconf_notify_devconf_ptr)(struct net *net, int event, int type, ++ int ifindex, struct ipv4_devconf *devconf); ++ ++// extern unsigned int nf_conntrack_max; ++ ++int vk_init_sysctl_net(struct vkernel_sysctl_net *net, struct task_struct *tsk) ++{ ++ tcp_set_default_congestion_control_ptr = ++ (void *)lookup_name("tcp_set_default_congestion_control"); ++ rt_cache_flush_ptr = (void *)lookup_name("rt_cache_flush"); ++ inet_netconf_notify_devconf_ptr = ++ (void *)lookup_name("inet_netconf_notify_devconf"); ++ ++ /* congestion_control can be null */ ++ if (!rt_cache_flush_ptr || !inet_netconf_notify_devconf_ptr) { ++ pr_err("failed to find net symbols, flush: %p, notify: %p\n", ++ rt_cache_flush_ptr, inet_netconf_notify_devconf_ptr); ++ return -1; ++ } ++ ++ if (!tsk) { ++ pr_err("failed to init sysctl net with invalid task\n"); ++ return -1; ++ } ++ ++ // net->nf_conntrack_max = nf_conntrack_max; ++ net->nf_conntrack_max = 1572864; ++ ++ net->net_busy_poll = 0; ++ net->net_busy_read = 0; ++ ++ net->weight_p = 64; ++ net->dev_weight_rx_bias = 1; ++ net->dev_weight_tx_bias = 1; ++ net->dev_rx_weight = 64; ++ net->dev_tx_weight = 64; ++ ++ net->netdev_budget = 300; ++ net->netdev_budget_usecs = 2 * USEC_PER_SEC / HZ; ++ net->netdev_max_backlog = 1000; ++ ++ net->optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV+512); ++ net->wmem_max = SK_WMEM_MAX; ++ net->rmem_max = SK_RMEM_MAX; ++ net->wmem_default = SK_WMEM_MAX; ++ net->rmem_default = SK_RMEM_MAX; ++ ++ net->net = ERR_PTR(-ESRCH); ++ rcu_read_lock(); ++ task_lock(tsk); ++ if (tsk->nsproxy) ++ net->net = get_net(tsk->nsproxy->net_ns); ++ task_unlock(tsk); ++ rcu_read_unlock(); ++ if (IS_ERR(net->net)) { ++ pr_err("failed to get net ns, error %ld\n", PTR_ERR(net->net)); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++void vk_uninit_sysctl_net(struct vkernel_sysctl_net *net) ++{ ++ if (!IS_ERR(net->net)) ++ put_net(net->net); ++} ++ ++enum { ++ DEVCONF_ALL, ++ DEVCONF_DFLT, ++ DEVCONF_OTHER ++}; ++ ++#define IPV4_DEVCONF_DFLT(net, attr) \ ++ IPV4_DEVCONF((*net->ipv4.devconf_dflt), attr) ++ ++static void devinet_copy_dflt_conf(struct net *net, int i) ++{ ++ struct net_device *dev; ++ ++ rcu_read_lock(); ++ for_each_netdev_rcu(net, dev) { ++ struct in_device *in_dev; ++ ++ in_dev = __in_dev_get_rcu(dev); ++ if (in_dev && !test_bit(i, in_dev->cnf.state)) ++ in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i]; ++ } ++ rcu_read_unlock(); ++} ++ ++static void inet_forward_change(struct net *net) ++{ ++ struct net_device *dev; ++ int on = IPV4_DEVCONF_ALL(net, FORWARDING); ++ ++ IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on; ++ IPV4_DEVCONF_DFLT(net, FORWARDING) = on; ++ inet_netconf_notify_devconf_ptr(net, RTM_NEWNETCONF, ++ NETCONFA_FORWARDING, ++ NETCONFA_IFINDEX_ALL, ++ net->ipv4.devconf_all); ++ inet_netconf_notify_devconf_ptr(net, RTM_NEWNETCONF, ++ NETCONFA_FORWARDING, ++ NETCONFA_IFINDEX_DEFAULT, ++ net->ipv4.devconf_dflt); ++ ++ for_each_netdev(net, dev) { ++ struct in_device *in_dev; ++ ++ if (on) ++ dev_disable_lro(dev); ++ ++ in_dev = __in_dev_get_rtnl(dev); ++ if (in_dev) { ++ IN_DEV_CONF_SET(in_dev, FORWARDING, on); ++ 
inet_netconf_notify_devconf_ptr(net, RTM_NEWNETCONF, ++ NETCONFA_FORWARDING, ++ dev->ifindex, &in_dev->cnf); ++ } ++ } ++} ++ ++static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf) ++{ ++ struct in_device *idev; ++ ++ if (cnf == net->ipv4.devconf_dflt) ++ return NETCONFA_IFINDEX_DEFAULT; ++ else if (cnf == net->ipv4.devconf_all) ++ return NETCONFA_IFINDEX_ALL; ++ ++ idev = container_of(cnf, struct in_device, cnf); ++ return idev->dev->ifindex; ++} ++ ++int devconf_proc(struct net *net, struct ipv4_devconf *conf, ++ int val, int i, int type) ++{ ++ int old_val; ++ int ifindex; ++ ++ old_val = conf->data[i - 1]; ++ conf->data[i - 1] = val; ++ ++ set_bit(i - 1, conf->state); ++ ++ if (type == DEVCONF_DFLT) ++ devinet_copy_dflt_conf(net, i - 1); // inline ++ if (i == IPV4_DEVCONF_ACCEPT_LOCAL || i == IPV4_DEVCONF_ROUTE_LOCALNET) ++ if (conf->data[i - 1] == 0 && old_val != 0) ++ rt_cache_flush_ptr(net); ++ ++ if (i == IPV4_DEVCONF_BC_FORWARDING && conf->data[i - 1] != old_val) ++ rt_cache_flush_ptr(net); ++ ++ if (i == IPV4_DEVCONF_RP_FILTER && conf->data[i - 1] != old_val) { ++ ifindex = devinet_conf_ifindex(net, conf); // inline ++ inet_netconf_notify_devconf_ptr(net, RTM_NEWNETCONF, ++ NETCONFA_RP_FILTER, ++ ifindex, conf); ++ } ++ if (i == IPV4_DEVCONF_PROXY_ARP && conf->data[i - 1] != old_val) { ++ ifindex = devinet_conf_ifindex(net, conf); ++ inet_netconf_notify_devconf_ptr(net, RTM_NEWNETCONF, ++ NETCONFA_PROXY_NEIGH, ++ ifindex, conf); ++ } ++ if (i == IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN && conf->data[i - 1] != old_val) { ++ ifindex = devinet_conf_ifindex(net, conf); ++ inet_netconf_notify_devconf_ptr(net, RTM_NEWNETCONF, ++ NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, ++ ifindex, conf); ++ } ++ ++ return 0; ++} ++ ++int devconf_forward(struct net *net, struct ipv4_devconf *conf, ++ int val, int i, int type) ++{ ++ int old_val; ++ ++ old_val = conf->data[i - 1]; ++ conf->data[i - 1] = val; ++ if (conf->data[i - 1] != old_val) { ++ if (type != DEVCONF_DFLT) { ++ if (!rtnl_trylock()) { ++ conf->data[i - 1] = old_val; ++ return -EBUSY; ++ } ++ if (type == DEVCONF_ALL) ++ inet_forward_change(net); // inline ++ else { ++ struct in_device *idev = ++ container_of(conf, struct in_device, cnf); ++ dev_disable_lro(idev->dev); ++ inet_netconf_notify_devconf_ptr(net, RTM_NEWNETCONF, ++ NETCONFA_FORWARDING, ++ idev->dev->ifindex, ++ conf); ++ } ++ } else ++ inet_netconf_notify_devconf_ptr(net, RTM_NEWNETCONF, ++ NETCONFA_FORWARDING, ++ NETCONFA_IFINDEX_DEFAULT, ++ conf); ++ } ++ ++ return 0; ++} ++ ++int devconf_flush(struct net *net, struct ipv4_devconf *conf, ++ int val, int i, int type) ++{ ++ int old_val; ++ ++ old_val = conf->data[i - 1]; ++ conf->data[i - 1] = val; ++ if (conf->data[i - 1] != old_val) ++ rt_cache_flush_ptr(net); ++ ++ return 0; ++} ++ ++int vkernel_set_sysctl_net(struct vkernel_sysctl_net *net, struct vkernel_sysctl_net_desc *desc) ++{ ++ struct net *n = net->net; ++ int weight; ++ int val; ++ int i; ++ ++ /* netns specific */ ++ if (desc->nf_conntrack_max) ++ net->nf_conntrack_max = desc->nf_conntrack_max; ++ ++ /* core, poll/select specific */ ++ net->net_busy_poll = desc->core_busy_poll; ++ net->net_busy_read = desc->core_busy_read; ++ ++ /* napi_struct specific */ ++ if (desc->core_dev_weight > 0) { ++ net->weight_p = desc->core_dev_weight; ++ weight = READ_ONCE(net->weight_p); ++ WRITE_ONCE(net->dev_rx_weight, weight * net->dev_weight_rx_bias); ++ WRITE_ONCE(net->dev_tx_weight, weight * net->dev_weight_tx_bias); ++ } ++ ++ /* softnet_data specific */ 
++ if (desc->core_netdev_budget > 0) ++ net->netdev_budget = desc->core_netdev_budget; ++ if (desc->core_netdev_budget_us > 0) ++ net->netdev_budget_usecs = desc->core_netdev_budget_us; ++ if (desc->core_netdev_max_backlog > 0) ++ net->netdev_max_backlog = desc->core_netdev_max_backlog; ++ ++ /* sock specific (netns specific) */ ++ if (desc->core_optmem_max > 0) ++ net->optmem_max = desc->core_optmem_max; ++ if (desc->core_wmem_max) ++ net->wmem_max = desc->core_wmem_max; ++ if (desc->core_rmem_max) ++ net->rmem_max = desc->core_rmem_max; ++ if (desc->core_wmem_default) ++ net->wmem_default = desc->core_wmem_default; ++ if (desc->core_rmem_default) ++ net->rmem_default = desc->core_rmem_default; ++ ++ /* net ns specific */ ++ ++ /* core */ ++ if (desc->core_somaxconn) ++ n->core.sysctl_somaxconn = desc->core_somaxconn; ++ ++ /* ipv4 */ ++ if (desc->ipv4_icmp_echo_ignore_broadcasts == 0 || ++ desc->ipv4_icmp_echo_ignore_broadcasts == 1) ++ n->ipv4.sysctl_icmp_echo_ignore_broadcasts = desc->ipv4_icmp_echo_ignore_broadcasts; ++ if (desc->ipv4_ip_local_port_range[0] > 0 && desc->ipv4_ip_local_port_range[1] > 0) { ++ n->ipv4.ip_local_ports.range[0] = desc->ipv4_ip_local_port_range[0]; ++ n->ipv4.ip_local_ports.range[1] = desc->ipv4_ip_local_port_range[1]; ++ } ++ if (desc->ipv4_max_tw_buckets > 0) ++ n->ipv4.tcp_death_row.sysctl_max_tw_buckets = desc->ipv4_max_tw_buckets; ++ if (desc->ipv4_tcp_ecn <= 2) ++ n->ipv4.sysctl_tcp_ecn = desc->ipv4_tcp_ecn; ++ if (desc->ipv4_ip_default_ttl >= 1 && desc->ipv4_ip_default_ttl <= 255) ++ n->ipv4.sysctl_ip_default_ttl = desc->ipv4_ip_default_ttl; ++ if (desc->ipv4_ip_no_pmtu_disc == 0 || desc->ipv4_ip_no_pmtu_disc == 1) ++ n->ipv4.sysctl_ip_no_pmtu_disc = desc->ipv4_ip_no_pmtu_disc; ++ if (desc->ipv4_tcp_keepalive_time > 0) ++ WRITE_ONCE(n->ipv4.sysctl_tcp_keepalive_time, desc->ipv4_tcp_keepalive_time * HZ); ++ if (desc->ipv4_tcp_keepalive_intvl > 0) ++ WRITE_ONCE(n->ipv4.sysctl_tcp_keepalive_intvl, desc->ipv4_tcp_keepalive_intvl * HZ); ++ if (desc->ipv4_tcp_keepalive_probes) ++ n->ipv4.sysctl_tcp_keepalive_probes = desc->ipv4_tcp_keepalive_probes; ++ if (desc->ipv4_tcp_syn_retries >= 1 && desc->ipv4_tcp_syn_retries <= MAX_TCP_SYNCNT) ++ n->ipv4.sysctl_tcp_syn_retries = desc->ipv4_tcp_syn_retries; ++ if (desc->ipv4_tcp_synack_retries) ++ n->ipv4.sysctl_tcp_synack_retries = desc->ipv4_tcp_synack_retries; ++ if (desc->ipv4_tcp_syncookies >= 0 && desc->ipv4_tcp_syncookies <= 2) ++ n->ipv4.sysctl_tcp_syncookies = desc->ipv4_tcp_syncookies; ++ if (desc->ipv4_tcp_reordering > 0) ++ n->ipv4.sysctl_tcp_reordering = desc->ipv4_tcp_reordering; ++ if (desc->ipv4_tcp_retries1 && desc->ipv4_tcp_retries1 <= 255) ++ n->ipv4.sysctl_tcp_retries1 = desc->ipv4_tcp_retries1; ++ if (desc->ipv4_tcp_retries2) ++ n->ipv4.sysctl_tcp_retries2 = desc->ipv4_tcp_retries2; ++ if (desc->ipv4_tcp_orphan_retries) ++ n->ipv4.sysctl_tcp_orphan_retries = desc->ipv4_tcp_orphan_retries; ++ if (desc->ipv4_tcp_tw_reuse >= 0 && desc->ipv4_tcp_tw_reuse <= 2) ++ n->ipv4.sysctl_tcp_tw_reuse = desc->ipv4_tcp_tw_reuse; ++ if (desc->ipv4_tcp_fin_timeout > 0) ++ WRITE_ONCE(n->ipv4.sysctl_tcp_fin_timeout, desc->ipv4_tcp_fin_timeout * HZ); ++ if (desc->ipv4_tcp_sack == 0 || desc->ipv4_tcp_sack == 1) ++ n->ipv4.sysctl_tcp_sack = desc->ipv4_tcp_sack; ++ if (desc->ipv4_tcp_window_scaling == 0 || ++ desc->ipv4_tcp_window_scaling == 1) ++ n->ipv4.sysctl_tcp_window_scaling = desc->ipv4_tcp_window_scaling; ++ if (desc->ipv4_tcp_timestamps == 0 || desc->ipv4_tcp_timestamps == 1) ++ n->ipv4.sysctl_tcp_timestamps = 
desc->ipv4_tcp_timestamps; ++ if (desc->ipv4_tcp_thin_linear_timeouts == 0 || ++ desc->ipv4_tcp_thin_linear_timeouts == 1) ++ n->ipv4.sysctl_tcp_thin_linear_timeouts = desc->ipv4_tcp_thin_linear_timeouts; ++ if (desc->ipv4_tcp_retrans_collapse == 0 || ++ desc->ipv4_tcp_retrans_collapse == 1) ++ n->ipv4.sysctl_tcp_retrans_collapse = desc->ipv4_tcp_retrans_collapse; ++ if (desc->ipv4_tcp_fack == 0 || desc->ipv4_tcp_fack == 1) ++ n->ipv4.sysctl_tcp_fack = desc->ipv4_tcp_fack; ++ if (desc->ipv4_tcp_adv_win_scale >= 0 && desc->ipv4_tcp_adv_win_scale <= 4) ++ n->ipv4.sysctl_tcp_adv_win_scale = desc->ipv4_tcp_adv_win_scale; ++ if (desc->ipv4_tcp_dsack == 0 || desc->ipv4_tcp_dsack == 1) ++ n->ipv4.sysctl_tcp_dsack = desc->ipv4_tcp_dsack; ++ if (desc->ipv4_tcp_nometrics_save == 0 || desc->ipv4_tcp_nometrics_save == 1) ++ n->ipv4.sysctl_tcp_nometrics_save = desc->ipv4_tcp_nometrics_save; ++ if (desc->ipv4_tcp_moderate_rcvbuf == 0 || desc->ipv4_tcp_moderate_rcvbuf == 1) ++ n->ipv4.sysctl_tcp_moderate_rcvbuf = desc->ipv4_tcp_moderate_rcvbuf; ++ if (desc->ipv4_tcp_min_tso_segs) ++ n->ipv4.sysctl_tcp_min_tso_segs = desc->ipv4_tcp_min_tso_segs; ++ if (desc->ipv4_tcp_wmem[0] > 0 && desc->ipv4_tcp_wmem[1] > 0 && ++ desc->ipv4_tcp_wmem[2] > 0) { ++ n->ipv4.sysctl_tcp_wmem[0] = desc->ipv4_tcp_wmem[0]; ++ n->ipv4.sysctl_tcp_wmem[1] = desc->ipv4_tcp_wmem[1]; ++ n->ipv4.sysctl_tcp_wmem[2] = desc->ipv4_tcp_wmem[2]; ++ } ++ if (desc->ipv4_tcp_rmem[0] > 0 && desc->ipv4_tcp_rmem[1] > 0 && ++ desc->ipv4_tcp_rmem[2] > 0) { ++ n->ipv4.sysctl_tcp_rmem[0] = desc->ipv4_tcp_rmem[0]; ++ n->ipv4.sysctl_tcp_rmem[1] = desc->ipv4_tcp_rmem[1]; ++ n->ipv4.sysctl_tcp_rmem[2] = desc->ipv4_tcp_rmem[2]; ++ } ++ if (desc->ipv4_max_syn_backlog > 0) ++ n->ipv4.sysctl_max_syn_backlog = desc->ipv4_max_syn_backlog; ++ if (desc->ipv4_tcp_fastopen == 1 || desc->ipv4_tcp_fastopen == 2 || ++ desc->ipv4_tcp_fastopen == 4) ++ n->ipv4.sysctl_tcp_fastopen = desc->ipv4_tcp_fastopen; ++ if (tcp_set_default_congestion_control_ptr && strlen(desc->ipv4_tcp_congestion_control) > 1) ++ tcp_set_default_congestion_control_ptr(n, desc->ipv4_tcp_congestion_control); ++ ++ /* ipv4 conf */ ++ for (i = IPV4_DEVCONF_FORWARDING; i <= IPV4_DEVCONF_MAX; i++) { ++ val = desc->ipv4_conf_all[i - 1]; ++ if (val < 0) ++ continue; ++ ++ if (i == IPV4_DEVCONF_FORWARDING) ++ devconf_forward(n, n->ipv4.devconf_all, val, i, DEVCONF_ALL); ++ else if (i == IPV4_DEVCONF_NOXFRM || ++ i == IPV4_DEVCONF_NOPOLICY || ++ i == IPV4_DEVCONF_PROMOTE_SECONDARIES || ++ i == IPV4_DEVCONF_ROUTE_LOCALNET || ++ i == IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST) ++ devconf_flush(n, n->ipv4.devconf_all, val, i, DEVCONF_ALL); ++ else ++ devconf_proc(n, n->ipv4.devconf_all, val, i, DEVCONF_ALL); ++ } ++ /* ipv4 conf default */ ++ for (i = IPV4_DEVCONF_FORWARDING; i <= IPV4_DEVCONF_MAX; i++) { ++ val = desc->ipv4_conf_default[i - 1]; ++ if (val != 0 && val != 1) ++ continue; ++ ++ if (i == IPV4_DEVCONF_FORWARDING) ++ devconf_forward(n, n->ipv4.devconf_dflt, val, i, DEVCONF_DFLT); ++ else if (i == IPV4_DEVCONF_NOXFRM || ++ i == IPV4_DEVCONF_NOPOLICY || ++ i == IPV4_DEVCONF_PROMOTE_SECONDARIES || ++ i == IPV4_DEVCONF_ROUTE_LOCALNET || ++ i == IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST) ++ devconf_flush(n, n->ipv4.devconf_dflt, val, i, DEVCONF_DFLT); ++ else ++ devconf_proc(n, n->ipv4.devconf_dflt, val, i, DEVCONF_DFLT); ++ } ++ ++ /* unix */ ++ if (desc->unix_max_dgram_qlen > 0) ++ n->unx.sysctl_max_dgram_qlen = desc->unix_max_dgram_qlen; ++ ++ return 0; ++} ++EXPORT_SYMBOL(vkernel_set_sysctl_net); 
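For illustration, a minimal sketch (not part of the patch) of how the desc-based setter above can be driven; the ioctl path in vkernel_main.c copies a struct vkernel_sysctl_net_desc from userspace and forwards it to vkernel_set_sysctl_net() the same way. The helper name example_tune_net is hypothetical; the field names and guard semantics are taken from the checks above, and include/linux/vkernel.h is the header this patch introduces. Note the per-device conf tables: the two loops above treat negative entries (ipv4_conf_all) and non-0/1 entries (ipv4_conf_default) as "leave unchanged", so a fully zeroed desc would actively write zeros into every devconf slot.

#include <linux/string.h>
#include <linux/vkernel.h>

/* Hypothetical sketch: tune a few per-instance net sysctls via the desc API. */
static int example_tune_net(struct vkernel *vk)
{
	struct vkernel_sysctl_net_desc desc = { 0 };

	/* -1 entries are skipped by the conf loops in vkernel_set_sysctl_net() */
	memset(desc.ipv4_conf_all, 0xff, sizeof(desc.ipv4_conf_all));
	memset(desc.ipv4_conf_default, 0xff, sizeof(desc.ipv4_conf_default));

	desc.core_somaxconn = 4096;	/* net.core.somaxconn */
	desc.ipv4_tcp_tw_reuse = 1;	/* guarded to the range 0..2 above */
	desc.ipv4_tcp_wmem[0] = 4096;	/* the triple only applies if all three are > 0 */
	desc.ipv4_tcp_wmem[1] = 16384;
	desc.ipv4_tcp_wmem[2] = 4194304;

	return vkernel_set_sysctl_net(&vk->sysctl_net, &desc);
}

Most other zero-valued fields are left at their current values by the setter's guards, which is what makes a mostly-zeroed desc usable as a sparse update.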
+diff --git a/drivers/vkernel/sysctl/raw.c b/drivers/vkernel/sysctl/raw.c
+new file mode 100644
+index 000000000000..2b67fa89370e
+--- /dev/null
++++ b/drivers/vkernel/sysctl/raw.c
+@@ -0,0 +1,689 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Copyright (C) 2024 ARM Ltd.
++ * Author: Hang Huang
++ */
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++
++#include "sysctl.h"
++#include "utils.h"
++
++enum {
++ DEVCONF_ALL,
++ DEVCONF_DFLT,
++ DEVCONF_OTHER
++};
++
++int vkernel_set_sysctl_raw(struct vkernel *vk, char *buf)
++{
++ struct ipc_namespace *ipc_ns = NULL;
++ struct net *n;
++ char *name;
++ char *val;
++ char *p;
++ u64 uval;
++ s64 sval, old_sval, third_sval;
++ bool has_uval = false, has_sval = false;
++
++ val = strchr(buf, '=');
++ if (!val)
++ return -EINVAL;
++ *val++ = 0;
++ name = strstrip(buf);
++ val = strstrip(val);
++
++ if (!kstrtou64(val, 10, &uval))
++ has_uval = true;
++ else
++ pr_warn("failed to parse raw sysctl val %s to u64\n", val);
++ if (!kstrtos64(val, 10, &sval))
++ has_sval = true;
++ else
++ pr_warn("failed to parse raw sysctl val %s to s64\n", val);
++
++ if (vk->init_process->nsproxy)
++ ipc_ns = vk->init_process->nsproxy->ipc_ns;
++
++ n = vk->sysctl_net.net;
++
++ if (!strcmp(name, "fs.file-max")) {
++ if (has_uval && uval)
++ vk->sysctl_fs.files_stat.max_files = uval;
++ } else if (!strcmp(name, "fs.nr_open")) {
++ if (has_uval && uval)
++ vk->sysctl_fs.nr_open = uval;
++ } else if (!strcmp(name, "fs.lease-break-time")) {
++ if (has_sval && sval > 0)
++ vk->sysctl_fs.lease_break_time = sval;
++ } else if (!strcmp(name, "fs.leases-enable")) {
++ if (has_sval && (sval == 0 || sval == 1))
++ vk->sysctl_fs.leases_enable = sval;
++ } else if (!strcmp(name, "fs.mount-max")) {
++ if (has_uval && uval)
++ vk->sysctl_fs.mount_max = uval;
++ } else if (!strcmp(name, "kernel.msgmax")) {
++ if (has_uval && ipc_ns && uval)
++ ipc_ns->msg_ctlmax = uval;
++ } else if (!strcmp(name, "kernel.msgmnb")) {
++ if (has_uval && ipc_ns && uval)
++ ipc_ns->msg_ctlmnb = uval;
++ } else if (!strcmp(name, "kernel.msgmni")) {
++ if (has_uval && ipc_ns && uval)
++ ipc_ns->msg_ctlmni = uval;
++ }
++#ifdef CONFIG_CHECKPOINT_RESTORE
++ else if (!strcmp(name, "kernel.msg_next_id")) {
++ if (has_sval && ipc_ns && sval >= -1)
++ ipc_ns->ids[IPC_MSG_IDS].next_id = sval;
++ }
++#endif
++ else if (!strcmp(name, "kernel.sem")) {
++ if (ipc_ns) {
++ old_sval = ipc_ns->sem_ctls[3];
++ uval = 0;
++ while ((p = strsep(&val, " \t")) != NULL && uval < 4) {
++ if (!*p)
++ continue;
++ if (!kstrtos64(p, 10, &sval) && sval > 0)
++ ipc_ns->sem_ctls[uval] = sval;
++ uval++;
++ }
++ if (sem_check_semmni(ipc_ns))
++ ipc_ns->sem_ctls[3] = old_sval;
++ }
++ }
++#ifdef CONFIG_CHECKPOINT_RESTORE
++ else if (!strcmp(name, "kernel.sem_next_id")) {
++ if (has_sval && ipc_ns && sval >= -1)
++ ipc_ns->ids[IPC_SEM_IDS].next_id = sval;
++ }
++#endif
++ else if (!strcmp(name, "kernel.shmall")) {
++ if (has_uval && ipc_ns && uval)
++ ipc_ns->shm_ctlall = uval;
++ } else if (!strcmp(name, "kernel.shmmax")) {
++ if (has_uval && ipc_ns && uval)
++ ipc_ns->shm_ctlmax = uval;
++ } else if (!strcmp(name, "kernel.shmmni")) {
++ if (has_uval && ipc_ns && uval)
++ ipc_ns->shm_ctlmni = uval;
++ }
++#ifdef CONFIG_CHECKPOINT_RESTORE
++ else if (!strcmp(name, "kernel.shm_next_id")) {
++ if (has_sval && ipc_ns && sval >= -1)
++ ipc_ns->ids[IPC_SHM_IDS].next_id = sval;
++ }
++#endif
++ else if (!strcmp(name, "kernel.shm_rmid_forced")) {
++ if (has_sval && ipc_ns && (sval == 0 
|| sval == 1)) ++ ipc_ns->shm_rmid_forced = sval; ++ } else if (!strcmp(name, "kernel.numa_balancing")) { ++ /* inactive */ ++ if (has_sval && sval >= 0) ++ vk->sysctl_kernel.nb_mode = sval; ++ } else if (!strcmp(name, "kernel.numa_balancing_promote_rate_limit_MBps")) { ++ /* inactive */ ++ if (has_sval && sval > 0) ++ vk->sysctl_kernel.nb_promote_rate_limit = sval; ++ } else if (!strcmp(name, "kernel.sched_cfs_bandwidth_slice_us")) { ++ if (has_uval && uval) ++ vk->sysctl_kernel.sched_cfs_bandwidth_slice = uval; ++ } else if (!strcmp(name, "kernel.sched_child_runs_first")) { ++ if (has_uval && (uval == 0 || uval == 1)) ++ vk->sysctl_kernel.sched_child_runs_first = uval; ++ } else if (!strcmp(name, "kernel.sched_deadline_period_max_us")) { ++ if (has_uval && uval) ++ vk->sysctl_kernel.sched_dl_period_max = uval; ++ } else if (!strcmp(name, "kernel.sched_deadline_period_min_us")) { ++ if (has_uval && uval) ++ vk->sysctl_kernel.sched_dl_period_min = uval; ++ } else if (!strcmp(name, "kernel.sched_rr_timeslice_ms")) { ++ /* inactive */ ++ if (has_sval && sval > 0) ++ vk->sysctl_kernel.sched_rr_timeslice = sval; ++ } else if (!strcmp(name, "kernel.sched_rt_period_us")) { ++ /* inactive */ ++ if (has_sval && sval > 0) ++ vk->sysctl_kernel.sched_rt_period = sval; ++ } else if (!strcmp(name, "kernel.sched_rt_runtime_us")) { ++ /* inactive */ ++ if (has_sval && sval > 0) ++ vk->sysctl_kernel.sched_rt_runtime = sval; ++ } else if (!strcmp(name, "kernel.threads-max")) { ++ if (has_sval && sval > 0) ++ vk->sysctl_kernel.max_threads = clamp_t(u64, sval, ++ MIN_THREADS, MAX_THREADS); ++ } else if (!strcmp(name, "kernel.keys.gc_delay")) { ++ if (has_uval && uval > 0) ++ vk->sysctl_kernel.key_gc_delay = uval; ++ } else if (!strcmp(name, "kernel.keys.maxbytes")) { ++ if (has_uval && uval > 0) ++ vk->sysctl_kernel.key_quota_maxbytes = uval; ++ } else if (!strcmp(name, "kernel.keys.maxkeys")) { ++ if (has_uval && uval > 0) ++ vk->sysctl_kernel.key_quota_maxkeys = uval; ++ } else if (!strcmp(name, "kernel.keys.persistent_keyring_expiry")) { ++ if (has_uval && uval > 0) ++ vk->sysctl_kernel.persistent_keyring_expiry = uval; ++ } else if (!strcmp(name, "kernel.keys.root_maxbytes")) { ++ if (has_uval && uval > 0) ++ vk->sysctl_kernel.key_quota_root_maxbytes = uval; ++ } else if (!strcmp(name, "kernel.keys.root_maxkeys")) { ++ if (has_uval && uval > 0) ++ vk->sysctl_kernel.key_quota_root_maxkeys = uval; ++ } else if (!strcmp(name, "kernel.pty.max")) { ++ if (has_sval && sval > 0) ++ vk->sysctl_kernel.pty_limit = sval; ++ } else if (!strcmp(name, "kernel.pty.reserve")) { ++ if (has_sval && sval > 0) ++ vk->sysctl_kernel.pty_reserve = sval; ++ } else if (!strcmp(name, "net.nf_conntrack_max")) { ++ if (has_uval && uval > 0) ++ vk->sysctl_net.nf_conntrack_max = uval; ++ } else if (!strcmp(name, "net.core.busy_poll")) { ++ if (has_uval) ++ vk->sysctl_net.net_busy_poll = uval; ++ } else if (!strcmp(name, "net.core.busy_read")) { ++ if (has_uval) ++ vk->sysctl_net.net_busy_read = uval; ++ } else if (!strcmp(name, "net.core.optmem_max")) { ++ if (has_sval && sval > 0) ++ vk->sysctl_net.optmem_max = sval; ++ } else if (!strcmp(name, "net.core.wmem_max")) { ++ if (has_uval && uval) ++ vk->sysctl_net.wmem_max = uval; ++ } else if (!strcmp(name, "net.core.rmem_max")) { ++ if (has_uval && uval) ++ vk->sysctl_net.rmem_max = uval; ++ } else if (!strcmp(name, "net.core.wmem_default")) { ++ if (has_uval && uval) ++ vk->sysctl_net.wmem_default = uval; ++ } else if (!strcmp(name, "net.core.rmem_default")) { ++ if (has_uval && 
uval) ++ vk->sysctl_net.rmem_default = uval; ++ } else if (!strcmp(name, "net.core.somaxconn")) { ++ if (has_uval && uval) ++ n->core.sysctl_somaxconn = uval; ++ } else if (!strcmp(name, "net.ipv4.icmp_echo_ignore_broadcasts")) { ++ if (has_uval && (uval == 0 || uval == 1)) ++ n->ipv4.sysctl_icmp_echo_ignore_broadcasts = uval; ++ } else if (!strcmp(name, "net.ipv4.ip_local_port_range")) { ++ uval = 0; ++ while ((p = strsep(&val, " \t")) != NULL && uval < 2) { ++ if (!*p) ++ continue; ++ if (uval == 0) { ++ if (kstrtos64(p, 10, &sval)) ++ sval = 0; ++ } else { ++ if (kstrtos64(p, 10, &old_sval)) ++ old_sval = 0; ++ } ++ uval++; ++ } ++ if (sval > 0 && old_sval > 0) { ++ n->ipv4.ip_local_ports.range[0] = sval; ++ n->ipv4.ip_local_ports.range[1] = old_sval; ++ } ++ } else if (!strcmp(name, "net.ipv4.tcp_max_tw_buckets")) { ++ if (has_sval && sval > 0) ++ n->ipv4.tcp_death_row.sysctl_max_tw_buckets = sval; ++ } else if (!strcmp(name, "net.ipv4.tcp_ecn")) { ++ if (has_uval && uval <= 2) ++ n->ipv4.sysctl_tcp_ecn = uval; ++ } else if (!strcmp(name, "net.ipv4.ip_default_ttl")) { ++ if (has_uval && (uval >= 1 && uval <= 255)) ++ n->ipv4.sysctl_ip_default_ttl = uval; ++ } else if (!strcmp(name, "net.ipv4.ip_no_pmtu_disc")) { ++ if (has_uval && (uval == 0 || uval == 1)) ++ n->ipv4.sysctl_ip_no_pmtu_disc = uval; ++ } else if (!strcmp(name, "net.ipv4.tcp_keepalive_time")) { ++ if (has_sval && sval > 0) ++ WRITE_ONCE(n->ipv4.sysctl_tcp_keepalive_time, sval * HZ); ++ } else if (!strcmp(name, "net.ipv4.tcp_keepalive_intvl")) { ++ if (has_sval && sval > 0) ++ WRITE_ONCE(n->ipv4.sysctl_tcp_keepalive_intvl, sval * HZ); ++ } else if (!strcmp(name, "net.ipv4.tcp_keepalive_probes")) { ++ if (has_uval && uval) ++ n->ipv4.sysctl_tcp_keepalive_probes = uval; ++ } else if (!strcmp(name, "net.ipv4.tcp_syn_retries")) { ++ if (has_uval && uval >= 1 && uval <= MAX_TCP_SYNCNT) ++ n->ipv4.sysctl_tcp_syn_retries = uval; ++ } else if (!strcmp(name, "net.ipv4.tcp_synack_retries")) { ++ if (has_uval && uval) ++ n->ipv4.sysctl_tcp_synack_retries = uval; ++ } else if (!strcmp(name, "net.ipv4.tcp_syncookies")) { ++ if (has_uval && uval >= 0 && uval <= 2) ++ n->ipv4.sysctl_tcp_syncookies = uval; ++ } else if (!strcmp(name, "net.ipv4.tcp_reordering")) { ++ if (has_sval && sval > 0) ++ n->ipv4.sysctl_tcp_reordering = sval; ++ } else if (!strcmp(name, "net.ipv4.tcp_retries1")) { ++ if (has_uval && uval && uval <= 255) ++ n->ipv4.sysctl_tcp_retries1 = uval; ++ } else if (!strcmp(name, "net.ipv4.tcp_retries2")) { ++ if (has_uval && uval) ++ n->ipv4.sysctl_tcp_retries2 = uval; ++ } else if (!strcmp(name, "net.ipv4.tcp_orphan_retries")) { ++ if (has_uval && uval) ++ n->ipv4.sysctl_tcp_orphan_retries = uval; ++ } else if (!strcmp(name, "net.ipv4.tcp_tw_reuse")) { ++ if (has_uval && uval >= 0 && uval <= 2) ++ n->ipv4.sysctl_tcp_tw_reuse = uval; ++ } else if (!strcmp(name, "net.ipv4.tcp_fin_timeout")) { ++ if (has_sval && sval > 0) ++ WRITE_ONCE(n->ipv4.sysctl_tcp_fin_timeout, sval * HZ); ++ } else if (!strcmp(name, "net.ipv4.tcp_sack")) { ++ if (has_uval && (uval == 0 || uval == 1)) ++ n->ipv4.sysctl_tcp_sack = uval; ++ } else if (!strcmp(name, "net.ipv4.tcp_window_scaling")) { ++ if (has_uval && (uval == 0 || uval == 1)) ++ n->ipv4.sysctl_tcp_window_scaling = uval; ++ } else if (!strcmp(name, "net.ipv4.tcp_timestamps")) { ++ if (has_uval && (uval == 0 || uval == 1)) ++ n->ipv4.sysctl_tcp_timestamps = uval; ++ } else if (!strcmp(name, "net.ipv4.tcp_thin_linear_timeouts")) { ++ if (has_uval && (uval == 0 || uval == 1)) ++ 
n->ipv4.sysctl_tcp_thin_linear_timeouts = uval; ++ } else if (!strcmp(name, "net.ipv4.tcp_retrans_collapse")) { ++ if (has_uval && (uval == 0 || uval == 1)) ++ n->ipv4.sysctl_tcp_retrans_collapse = uval; ++ } else if (!strcmp(name, "net.ipv4.tcp_fack")) { ++ if (has_uval && (uval == 0 || uval == 1)) ++ n->ipv4.sysctl_tcp_fack = uval; ++ } else if (!strcmp(name, "net.ipv4.tcp_adv_win_scale")) { ++ if (has_sval && sval >= 0 && sval <= 4) ++ n->ipv4.sysctl_tcp_adv_win_scale = sval; ++ } else if (!strcmp(name, "net.ipv4.tcp_dsack")) { ++ if (has_uval && (uval == 0 || uval == 1)) ++ n->ipv4.sysctl_tcp_dsack = uval; // ? ++ } else if (!strcmp(name, "net.ipv4.tcp_nometrics_save")) { ++ if (has_uval && (uval == 0 || uval == 1)) ++ n->ipv4.sysctl_tcp_nometrics_save = uval; ++ } else if (!strcmp(name, "net.ipv4.tcp_moderate_rcvbuf")) { ++ if (has_uval && (uval == 0 || uval == 1)) ++ n->ipv4.sysctl_tcp_moderate_rcvbuf = uval; ++ } else if (!strcmp(name, "net.ipv4.tcp_min_tso_segs")) { ++ if (has_uval && uval) ++ n->ipv4.sysctl_tcp_min_tso_segs = uval; ++ } else if (!strcmp(name, "net.ipv4.tcp_wmem")) { ++ uval = 0; ++ while ((p = strsep(&val, " \t")) != NULL && uval < 3) { ++ if (!*p) ++ continue; ++ if (uval == 0) { ++ if (kstrtos64(p, 10, &sval)) ++ sval = 0; ++ } else if (uval == 1) { ++ if (kstrtos64(p, 10, &old_sval)) ++ old_sval = 0; ++ } else { ++ if (kstrtos64(p, 10, &third_sval)) ++ third_sval = 0; ++ } ++ uval++; ++ } ++ if (sval > 0 && old_sval > 0 && third_sval > 0) { ++ n->ipv4.sysctl_tcp_wmem[0] = sval; ++ n->ipv4.sysctl_tcp_wmem[1] = old_sval; ++ n->ipv4.sysctl_tcp_wmem[2] = third_sval; ++ } ++ } else if (!strcmp(name, "net.ipv4.tcp_rmem")) { ++ uval = 0; ++ while ((p = strsep(&val, " \t")) != NULL && uval < 3) { ++ if (!*p) ++ continue; ++ if (uval == 0) { ++ if (kstrtos64(p, 10, &sval)) ++ sval = 0; ++ } else if (uval == 1) { ++ if (kstrtos64(p, 10, &old_sval)) ++ old_sval = 0; ++ } else { ++ if (kstrtos64(p, 10, &third_sval)) ++ third_sval = 0; ++ } ++ uval++; ++ } ++ if (sval > 0 && old_sval > 0 && third_sval > 0) { ++ n->ipv4.sysctl_tcp_rmem[0] = sval; ++ n->ipv4.sysctl_tcp_rmem[1] = old_sval; ++ n->ipv4.sysctl_tcp_rmem[2] = third_sval; ++ } ++ } else if (!strcmp(name, "net.ipv4.max_syn_backlog")) { ++ if (has_sval && sval > 0) ++ n->ipv4.sysctl_max_syn_backlog = sval; ++ } else if (!strcmp(name, "net.ipv4.tcp_fastopen")) { ++ if (has_sval && (sval == 1 || sval == 2 || sval == 4)) ++ n->ipv4.sysctl_tcp_fastopen = sval; ++ } else if (!strcmp(name, "net.ipv4.tcp_congestion_control")) { ++ if (strlen(val) > 1) ++ tcp_set_default_congestion_control_ptr(n, val); ++ } else if (!strcmp(name, "net.ipv4.conf.all.forwarding")) { ++ if (has_sval && sval >= 0) ++ devconf_forward(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_FORWARDING, DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.mc_forwarding")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_MC_FORWARDING, DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.proxy_arp")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_PROXY_ARP, DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.accept_redirects")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_ACCEPT_REDIRECTS, DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.secure_redirects")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_SECURE_REDIRECTS, DEVCONF_ALL); ++ } 
else if (!strcmp(name, "net.ipv4.conf.all.send_redirects")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_SEND_REDIRECTS, DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.shared_media")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_SHARED_MEDIA, DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.rp_filter")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_RP_FILTER, DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.accept_source_route")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE, DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.bootp_relay")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_BOOTP_RELAY, DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.log_martians")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_LOG_MARTIANS, DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.tag")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_TAG, DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.arp_filter")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_ARPFILTER, DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.medium_id")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_MEDIUM_ID, DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.disable_xfrm")) { ++ if (has_sval && sval >= 0) ++ devconf_flush(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_NOXFRM, DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.disable_policy")) { ++ if (has_sval && sval >= 0) ++ devconf_flush(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_NOPOLICY, DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.force_igmp_version")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_FORCE_IGMP_VERSION, DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.arp_announce")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_ARP_ANNOUNCE, DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.arp_ignore")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_ARP_IGNORE, DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.promote_secondaries")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_PROMOTE_SECONDARIES, DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.arp_accept")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_ARP_ACCEPT, DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.arp_notify")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_ARP_NOTIFY, DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.accept_local")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_ACCEPT_LOCAL, DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.src_valid_mark")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_SRC_VMARK, DEVCONF_ALL); ++ } else if (!strcmp(name, 
"net.ipv4.conf.all.proxy_arp_pvlan")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_PROXY_ARP_PVLAN, DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.route_localnet")) { ++ if (has_sval && sval >= 0) ++ devconf_flush(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_ROUTE_LOCALNET, DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.igmpv2_unsolicited_report_interval")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL, ++ DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.igmpv3_unsolicited_report_interval")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL, ++ DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.ignore_routes_with_linkdown")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN, ++ DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.drop_unicast_in_l2_multicast")) { ++ if (has_sval && sval >= 0) ++ devconf_flush(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST, ++ DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.drop_gratuitous_arp")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_DROP_GRATUITOUS_ARP, DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.bc_forwarding")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_BC_FORWARDING, DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.all.arp_evict_nocarrier")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_all, sval, ++ IPV4_DEVCONF_ARP_EVICT_NOCARRIER, DEVCONF_ALL); ++ } else if (!strcmp(name, "net.ipv4.conf.default.forwarding")) { ++ if (has_sval && sval >= 0) ++ devconf_forward(n, n->ipv4.devconf_dflt, sval, ++ IPV4_DEVCONF_FORWARDING, DEVCONF_DFLT); ++ } else if (!strcmp(name, "net.ipv4.conf.default.mc_forwarding")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_dflt, sval, ++ IPV4_DEVCONF_MC_FORWARDING, DEVCONF_DFLT); ++ } else if (!strcmp(name, "net.ipv4.conf.default.proxy_arp")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_dflt, sval, ++ IPV4_DEVCONF_PROXY_ARP, DEVCONF_DFLT); ++ } else if (!strcmp(name, "net.ipv4.conf.default.accept_redirects")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_dflt, sval, ++ IPV4_DEVCONF_ACCEPT_REDIRECTS, DEVCONF_DFLT); ++ } else if (!strcmp(name, "net.ipv4.conf.default.secure_redirects")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_dflt, sval, ++ IPV4_DEVCONF_SECURE_REDIRECTS, DEVCONF_DFLT); ++ } else if (!strcmp(name, "net.ipv4.conf.default.send_redirects")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_dflt, sval, ++ IPV4_DEVCONF_SEND_REDIRECTS, DEVCONF_DFLT); ++ } else if (!strcmp(name, "net.ipv4.conf.default.shared_media")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_dflt, sval, ++ IPV4_DEVCONF_SHARED_MEDIA, DEVCONF_DFLT); ++ } else if (!strcmp(name, "net.ipv4.conf.default.rp_filter")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_dflt, sval, ++ IPV4_DEVCONF_RP_FILTER, DEVCONF_DFLT); ++ } else if (!strcmp(name, "net.ipv4.conf.default.accept_source_route")) { ++ if (has_sval && sval >= 0) ++ devconf_proc(n, n->ipv4.devconf_dflt, sval, ++ 
IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE, DEVCONF_DFLT);
++ } else if (!strcmp(name, "net.ipv4.conf.default.bootp_relay")) {
++ if (has_sval && sval >= 0)
++ devconf_proc(n, n->ipv4.devconf_dflt, sval,
++ IPV4_DEVCONF_BOOTP_RELAY, DEVCONF_DFLT);
++ } else if (!strcmp(name, "net.ipv4.conf.default.log_martians")) {
++ if (has_sval && sval >= 0)
++ devconf_proc(n, n->ipv4.devconf_dflt, sval,
++ IPV4_DEVCONF_LOG_MARTIANS, DEVCONF_DFLT);
++ } else if (!strcmp(name, "net.ipv4.conf.default.tag")) {
++ if (has_sval && sval >= 0)
++ devconf_proc(n, n->ipv4.devconf_dflt, sval,
++ IPV4_DEVCONF_TAG, DEVCONF_DFLT);
++ } else if (!strcmp(name, "net.ipv4.conf.default.arp_filter")) {
++ if (has_sval && sval >= 0)
++ devconf_proc(n, n->ipv4.devconf_dflt, sval,
++ IPV4_DEVCONF_ARPFILTER, DEVCONF_DFLT);
++ } else if (!strcmp(name, "net.ipv4.conf.default.medium_id")) {
++ if (has_sval && sval >= 0)
++ devconf_proc(n, n->ipv4.devconf_dflt, sval,
++ IPV4_DEVCONF_MEDIUM_ID, DEVCONF_DFLT);
++ } else if (!strcmp(name, "net.ipv4.conf.default.disable_xfrm")) {
++ if (has_sval && sval >= 0)
++ devconf_flush(n, n->ipv4.devconf_dflt, sval,
++ IPV4_DEVCONF_NOXFRM, DEVCONF_DFLT);
++ } else if (!strcmp(name, "net.ipv4.conf.default.disable_policy")) {
++ if (has_sval && sval >= 0)
++ devconf_flush(n, n->ipv4.devconf_dflt, sval,
++ IPV4_DEVCONF_NOPOLICY, DEVCONF_DFLT);
++ } else if (!strcmp(name, "net.ipv4.conf.default.force_igmp_version")) {
++ if (has_sval && sval >= 0)
++ devconf_proc(n, n->ipv4.devconf_dflt, sval,
++ IPV4_DEVCONF_FORCE_IGMP_VERSION, DEVCONF_DFLT);
++ } else if (!strcmp(name, "net.ipv4.conf.default.arp_announce")) {
++ if (has_sval && sval >= 0)
++ devconf_proc(n, n->ipv4.devconf_dflt, sval,
++ IPV4_DEVCONF_ARP_ANNOUNCE, DEVCONF_DFLT);
++ } else if (!strcmp(name, "net.ipv4.conf.default.arp_ignore")) {
++ if (has_sval && sval >= 0)
++ devconf_proc(n, n->ipv4.devconf_dflt, sval,
++ IPV4_DEVCONF_ARP_IGNORE, DEVCONF_DFLT);
++ } else if (!strcmp(name, "net.ipv4.conf.default.promote_secondaries")) {
++ if (has_sval && sval >= 0)
++ devconf_proc(n, n->ipv4.devconf_dflt, sval,
++ IPV4_DEVCONF_PROMOTE_SECONDARIES, DEVCONF_DFLT);
++ } else if (!strcmp(name, "net.ipv4.conf.default.arp_accept")) {
++ if (has_sval && sval >= 0)
++ devconf_proc(n, n->ipv4.devconf_dflt, sval,
++ IPV4_DEVCONF_ARP_ACCEPT, DEVCONF_DFLT);
++ } else if (!strcmp(name, "net.ipv4.conf.default.arp_notify")) {
++ if (has_sval && sval >= 0)
++ devconf_proc(n, n->ipv4.devconf_dflt, sval,
++ IPV4_DEVCONF_ARP_NOTIFY, DEVCONF_DFLT);
++ } else if (!strcmp(name, "net.ipv4.conf.default.accept_local")) {
++ if (has_sval && sval >= 0)
++ devconf_proc(n, n->ipv4.devconf_dflt, sval,
++ IPV4_DEVCONF_ACCEPT_LOCAL, DEVCONF_DFLT);
++ } else if (!strcmp(name, "net.ipv4.conf.default.src_valid_mark")) {
++ if (has_sval && sval >= 0)
++ devconf_proc(n, n->ipv4.devconf_dflt, sval,
++ IPV4_DEVCONF_SRC_VMARK, DEVCONF_DFLT);
++ } else if (!strcmp(name, "net.ipv4.conf.default.proxy_arp_pvlan")) {
++ if (has_sval && sval >= 0)
++ devconf_proc(n, n->ipv4.devconf_dflt, sval,
++ IPV4_DEVCONF_PROXY_ARP_PVLAN, DEVCONF_DFLT);
++ } else if (!strcmp(name, "net.ipv4.conf.default.route_localnet")) {
++ if (has_sval && sval >= 0)
++ devconf_flush(n, n->ipv4.devconf_dflt, sval,
++ IPV4_DEVCONF_ROUTE_LOCALNET, DEVCONF_DFLT);
++ } else if (!strcmp(name, "net.ipv4.conf.default.igmpv2_unsolicited_report_interval")) {
++ if (has_sval && sval >= 0)
++ devconf_proc(n, n->ipv4.devconf_dflt, sval,
++ IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL,
++ DEVCONF_DFLT);
++ } else if (!strcmp(name, "net.ipv4.conf.default.igmpv3_unsolicited_report_interval")) {
++ if (has_sval && sval >= 0)
++ devconf_proc(n, n->ipv4.devconf_dflt, sval,
++ IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL,
++ DEVCONF_DFLT);
++ } else if (!strcmp(name, "net.ipv4.conf.default.ignore_routes_with_linkdown")) {
++ if (has_sval && sval >= 0)
++ devconf_proc(n, n->ipv4.devconf_dflt, sval,
++ IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
++ DEVCONF_DFLT);
++ } else if (!strcmp(name, "net.ipv4.conf.default.drop_unicast_in_l2_multicast")) {
++ if (has_sval && sval >= 0)
++ devconf_flush(n, n->ipv4.devconf_dflt, sval,
++ IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
++ DEVCONF_DFLT);
++ } else if (!strcmp(name, "net.ipv4.conf.default.drop_gratuitous_arp")) {
++ if (has_sval && sval >= 0)
++ devconf_proc(n, n->ipv4.devconf_dflt, sval,
++ IPV4_DEVCONF_DROP_GRATUITOUS_ARP, DEVCONF_DFLT);
++ } else if (!strcmp(name, "net.ipv4.conf.default.bc_forwarding")) {
++ if (has_sval && sval >= 0)
++ devconf_proc(n, n->ipv4.devconf_dflt, sval,
++ IPV4_DEVCONF_BC_FORWARDING, DEVCONF_DFLT);
++ } else if (!strcmp(name, "net.ipv4.conf.default.arp_evict_nocarrier")) {
++ if (has_sval && sval >= 0)
++ devconf_proc(n, n->ipv4.devconf_dflt, sval,
++ IPV4_DEVCONF_ARP_EVICT_NOCARRIER, DEVCONF_DFLT);
++ } else if (!strcmp(name, "net.unix.max_dgram_qlen")) {
++ if (has_sval && sval > 0)
++ n->unx.sysctl_max_dgram_qlen = sval;
++ } else if (!strcmp(name, "vm.max_map_count")) {
++ if (has_sval && sval > 0)
++ vk->sysctl_vm.max_map_count = sval;
++ } else if (!strcmp(name, "vm.mmap_min_addr")) {
++ if (!has_uval && kstrtou64(val, 16, &uval)) {
++ pr_warn("failed to parse raw sysctl val %s to u64\n", val);
++ return -EINVAL;
++ }
++ if (uval) {
++ vk->sysctl_vm.dac_mmap_min_addr = uval;
++#ifdef CONFIG_LSM_MMAP_MIN_ADDR
++ if (vk->sysctl_vm.dac_mmap_min_addr > CONFIG_LSM_MMAP_MIN_ADDR)
++ vk->sysctl_vm.mmap_min_addr = vk->sysctl_vm.dac_mmap_min_addr;
++ else
++ vk->sysctl_vm.mmap_min_addr = CONFIG_LSM_MMAP_MIN_ADDR;
++#else
++ vk->sysctl_vm.mmap_min_addr = vk->sysctl_vm.dac_mmap_min_addr;
++#endif
++ }
++ } else if (!strcmp(name, "vm.overcommit_kbytes")) {
++ if (has_uval && uval) {
++ vk->sysctl_vm.overcommit_kbytes = uval;
++ vk->sysctl_vm.overcommit_ratio = 0;
++ }
++ } else if (!strcmp(name, "vm.overcommit_memory")) {
++ if (has_sval && sval > 0) {
++ if (sval == OVERCOMMIT_NEVER)
++ vk_sync_overcommit_as(vk);
++ vk->sysctl_vm.overcommit_memory = sval;
++ }
++ } else if (!strcmp(name, "vm.overcommit_ratio")) {
++ if (has_sval && sval) {
++ vk->sysctl_vm.overcommit_ratio = sval;
++ vk->sysctl_vm.overcommit_kbytes = 0;
++ }
++ } else {
++ pr_err("vkernel: unsupported sysctl %s\n", name);
++ return -EINVAL;
++ }
++
++ pr_debug("handled sysctl %s\n", name);
++ return 0;
++}
+diff --git a/drivers/vkernel/sysctl/vm.c b/drivers/vkernel/sysctl/vm.c
+new file mode 100644
+index 000000000000..4b322b455da2
+--- /dev/null
++++ b/drivers/vkernel/sysctl/vm.c
+@@ -0,0 +1,102 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Copyright (C) 2024 ARM Ltd.
++ * Author: Hang Huang
++ */
++
++#include
++#include
++
++#include "sysctl.h"
++
++static s32 vk_mm_compute_batch(void)
++{
++ u64 memsized_batch;
++ s32 nr = num_present_cpus();
++ s32 batch = max_t(s32, nr * 2, 32);
++
++ /* batch size set to 0.4% of (total memory/#cpus), or max int32 */
++ memsized_batch = min_t(u64, (totalram_pages() / nr) / 256, INT_MAX);
++
++ return max_t(s32, memsized_batch, batch);
++}
++
++void vk_sync_overcommit_as(struct vkernel *vk)
++{
++ struct percpu_counter *fbc = &vk->sysctl_vm.vm_committed_as;
++ unsigned long flags;
++ int cpu;
++ s32 *pcount;
++ s32 count;
++
++ raw_spin_lock_irqsave(&fbc->lock, flags);
++ for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) {
++ pcount = per_cpu_ptr(fbc->counters, cpu);
++ count = *pcount;
++ fbc->count += count;
++ *pcount -= count;
++ }
++ raw_spin_unlock_irqrestore(&fbc->lock, flags);
++}
++
++int vk_init_sysctl_vm(struct vkernel_sysctl_vm *vm)
++{
++ vm->max_map_count = DEFAULT_MAX_MAP_COUNT;
++ vm->dac_mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
++#ifdef CONFIG_LSM_MMAP_MIN_ADDR
++ if (vm->dac_mmap_min_addr > CONFIG_LSM_MMAP_MIN_ADDR)
++ vm->mmap_min_addr = vm->dac_mmap_min_addr;
++ else
++ vm->mmap_min_addr = CONFIG_LSM_MMAP_MIN_ADDR;
++#else
++ vm->mmap_min_addr = vm->dac_mmap_min_addr;
++#endif
++
++ vm->overcommit_memory = 0;
++ vm->overcommit_ratio = 50;
++ vm->overcommit_kbytes = 0;
++ vm->as_batch = vk_mm_compute_batch();
++ if (percpu_counter_init(&vm->vm_committed_as, 0, GFP_KERNEL)) {
++ pr_err("vkernel: failed to init sysctl_vm vm_committed_as\n");
++ return -ENOMEM;
++ }
++
++ return 0;
++}
++
++void vk_uninit_sysctl_vm(struct vkernel_sysctl_vm *vm)
++{
++ percpu_counter_destroy(&vm->vm_committed_as);
++}
++
++int vkernel_set_sysctl_vm(struct vkernel_sysctl_vm *vm, struct vkernel_sysctl_vm_desc *desc)
++{
++ if (desc->max_map_count > 0)
++ vm->max_map_count = desc->max_map_count;
++
++ if (desc->mmap_min_addr) {
++ vm->dac_mmap_min_addr = desc->mmap_min_addr;
++#ifdef CONFIG_LSM_MMAP_MIN_ADDR
++ if (vm->dac_mmap_min_addr > CONFIG_LSM_MMAP_MIN_ADDR)
++ vm->mmap_min_addr = vm->dac_mmap_min_addr;
++ else
++ vm->mmap_min_addr = CONFIG_LSM_MMAP_MIN_ADDR;
++#else
++ vm->mmap_min_addr = vm->dac_mmap_min_addr;
++#endif
++ }
++
++ if (desc->overcommit_memory > 0) {
++ vm->overcommit_memory = desc->overcommit_memory;
++ if (desc->overcommit_ratio > 0) {
++ vm->overcommit_ratio = desc->overcommit_ratio;
++ vm->overcommit_kbytes = 0;
++ } else if (desc->overcommit_kbytes) {
++ vm->overcommit_ratio = 0;
++ vm->overcommit_kbytes = desc->overcommit_kbytes;
++ }
++ }
++
++ return 0;
++}
++EXPORT_SYMBOL(vkernel_set_sysctl_vm);
+diff --git a/drivers/vkernel/utils/kallsyms.c b/drivers/vkernel/utils/kallsyms.c
+new file mode 100644
+index 000000000000..613d1f0b28dc
+--- /dev/null
++++ b/drivers/vkernel/utils/kallsyms.c
+@@ -0,0 +1,63 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Wrapper of lookup_name
++ * Define the wrapper so that other components can call a function rather than a raw symbol
++ *
++ * Copyright (C) 2024 ARM Ltd.
++ * Author: Hang Huang
++ */
++
++#include
++#include
++
++#include "utils.h"
++
++/*
++ * There are two ways of preventing vicious recursive loops when hooking:
++ * - detect recursion using function return address (USE_FENTRY_OFFSET = 0)
++ * - avoid recursion by jumping over the ftrace call (USE_FENTRY_OFFSET = 1)
++ */
++#define USE_FENTRY_OFFSET 0
++
++/*
++ * Tail call optimization can interfere with recursion detection based on
++ * return address on the stack.
Disable it to avoid machine hangups. ++ */ ++#if !USE_FENTRY_OFFSET ++#pragma GCC optimize("-fno-optimize-sibling-calls") ++#endif ++ ++unsigned long vk_lookup_name(const char *name) ++{ ++ struct kprobe kp = { .symbol_name = name }; ++ unsigned long retval; ++ ++ if (register_kprobe(&kp) < 0) ++ return 0; ++ ++ retval = (unsigned long)kp.addr; ++ unregister_kprobe(&kp); ++ ++ return retval; ++} ++ ++static unsigned long (*kallsyms_lookup_name_ptr)(const char *name); ++ ++int vk_kallsyms_init(void) ++{ ++ kallsyms_lookup_name_ptr = (void *)vk_lookup_name("kallsyms_lookup_name"); ++ if (!kallsyms_lookup_name_ptr) { ++ pr_err("cannot resolve symbol: kallsyms_lookup_name\n"); ++ return -ENOENT; ++ } ++ ++ return 0; ++} ++ ++void vk_kallsyms_uninit(void) {} ++ ++unsigned long lookup_name(const char *name) ++{ ++ return kallsyms_lookup_name_ptr(name); ++} ++EXPORT_SYMBOL(lookup_name); +diff --git a/drivers/vkernel/vkernel_main.c b/drivers/vkernel/vkernel_main.c +new file mode 100644 +index 000000000000..32e6c0a428bb +--- /dev/null ++++ b/drivers/vkernel/vkernel_main.c +@@ -0,0 +1,1444 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/** ++ * vkernel core ++ * ++ * Copyright (C) 2024 ARM Ltd. ++ * Author: Hang Huang ++ **/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "fs.h" ++#include "mm.h" ++#include "sched.h" ++#include "security.h" ++#include "syscall.h" ++#include "sysctl.h" ++#include "utils.h" ++ ++MODULE_AUTHOR("JYH Lab"); ++MODULE_LICENSE("GPL"); ++MODULE_DESCRIPTION("vkernel core module"); ++ ++/* Worst case buffer size needed for holding an integer. */ ++#define ITOA_MAX_LEN 12 ++ ++static DEFINE_MUTEX(vk_lock); ++static LIST_HEAD(vk_list); ++ ++static DEFINE_MUTEX(custom_lock); ++static DEFINE_HASHTABLE(custom_ht, 6); ++ ++struct dentry *vkernel_debugfs_dir; ++EXPORT_SYMBOL_GPL(vkernel_debugfs_dir); ++ ++static const struct file_operations vkernel_chardev_ops; ++ ++#define CONFIG_VKERNEL_COMPAT ++ ++#ifdef CONFIG_VKERNEL_COMPAT ++#define VKERNEL_COMPAT(c) .compat_ioctl = (c) ++#else ++/* ++ * For architectures that don't implement a compat infrastructure, ++ * adopt a double line of defense: ++ * - Prevent a compat task from opening /dev/vkernel ++ * - If the open has been done by a 64bit task, and the vkernel fd ++ * passed to a compat task, let the ioctls fail. ++ */ ++static long vkernel_no_compat_ioctl(struct file *file, unsigned int ioctl, ++ unsigned long arg) ++{ ++ return -EINVAL; ++} ++ ++static int vkernel_no_compat_open(struct inode *inode, struct file *file) ++{ ++ return is_compat_task() ? 
-ENODEV : 0;
++}
++#define VKERNEL_COMPAT(c) .compat_ioctl = vkernel_no_compat_ioctl, \
++ .open = vkernel_no_compat_open
++#endif
++
++#define VKERNEL_EVENT_CREATE_VK 0
++#define VKERNEL_EVENT_DESTROY_VK 1
++
++#define VKERNEL_CAP_MASK ((1 << VKERNEL_CAP_ISOLATE_ANON) |\
++ (1 << VKERNEL_CAP_ISOLATE_ANON_PIPE) | \
++ (1 << VKERNEL_CAP_ISOLATE_RAMFS))
++
++static void vkernel_uevent_notify_change(unsigned int type, struct vkernel *vk);
++static DEFINE_MUTEX(event_lock);
++static unsigned long long vkernel_createvk_count;
++static unsigned long long vkernel_active_vks;
++
++
++static int default_post_create(struct vkernel *vk)
++{
++ /* Set default syscall and acl rules */
++ vk_install_default_syscalls(&vk->syscall);
++ return vkernel_set_default_acl_set(&vk->acl);
++}
++
++static struct vkernel_custom_type default_custom = {
++ .owner = THIS_MODULE,
++ .name = "default",
++ .post_create = default_post_create,
++ .pre_destroy = NULL,
++};
++
++struct vkernel_custom_type *vkernel_find_custom(const char *name)
++{
++ struct vkernel_custom_type *custom;
++ unsigned int key;
++
++ key = full_name_hash(NULL, name, strlen(name));
++
++ hash_for_each_possible(custom_ht, custom, hash, key) {
++ if (!strcmp(name, custom->name))
++ return custom;
++ }
++
++ return NULL;
++}
++EXPORT_SYMBOL(vkernel_find_custom);
++
++int vkernel_register_custom(struct vkernel_custom_type *custom)
++{
++ unsigned int key;
++
++ if (!custom->owner) {
++ pr_err("custom type %s has no owner\n", custom->name);
++ return -EINVAL;
++ }
++
++ if (vkernel_find_custom(custom->name)) {
++ pr_err("custom type %s already exists\n", custom->name);
++ return -EEXIST;
++ }
++
++ key = full_name_hash(NULL, custom->name, strlen(custom->name));
++ mutex_lock(&custom_lock);
++ hash_add(custom_ht, &custom->hash, key);
++ mutex_unlock(&custom_lock);
++
++ pr_info("register custom type %s\n", custom->name);
++
++ return 0;
++}
++EXPORT_SYMBOL(vkernel_register_custom);
++
++int vkernel_unregister_custom(struct vkernel_custom_type *custom)
++{
++ pr_info("unregister custom type %s\n", custom->name);
++
++ mutex_lock(&custom_lock);
++ /* It is also ok to remove an unhashed custom */
++ hash_del(&custom->hash);
++ mutex_unlock(&custom_lock);
++
++ return 0;
++}
++EXPORT_SYMBOL(vkernel_unregister_custom);
++
++
++__weak int vkernel_arch_vk_ioctl(struct file *filp,
++ unsigned int ioctl, unsigned long arg)
++{
++ return 0;
++}
++
++__weak int vkernel_arch_dev_ioctl(struct file *filp,
++ unsigned int ioctl, unsigned long arg)
++{
++ return 0;
++}
++
++static int vkernel_vk_ioctl_set_def_syscall(struct vkernel *vk, unsigned long arg)
++{
++ return vkernel_set_default_syscall_rule(&vk->syscall, arg);
++}
++
++static int vkernel_vk_ioctl_restrict_syscall(struct vkernel *vk, unsigned long arg)
++{
++ void __user *argp = (void __user *)arg;
++ struct vkernel_syscall_rule_desc desc;
++
++ if (copy_from_user(&desc, argp, sizeof(desc)))
++ return -EFAULT;
++
++ return vkernel_add_syscall_rule(&vk->syscall, &desc);
++}
++
++static int vkernel_vk_ioctl_restrict_file(struct vkernel *vk, unsigned long arg)
++{
++ void __user *argp = (void __user *)arg;
++ struct vkernel_file_desc_set header;
++ struct vkernel_file_desc_set *set = NULL;
++ unsigned long full_size;
++ int r = 0;
++
++ if (copy_from_user(&header, argp, sizeof(header))) {
++ r = -EFAULT;
++ goto out;
++ }
++ if (!header.nr_descs) {
++ r = -EINVAL;
++ goto out;
++ }
++
++ full_size = sizeof(header) + sizeof(struct vkernel_file_desc) * header.nr_descs;
++ set = kmalloc(full_size, GFP_KERNEL);
++ if (!set) {
++ r = -ENOMEM;
++ goto out;
++ }
++ if (copy_from_user(set, argp, full_size)) {
++ r = -EFAULT;
++ goto out_set;
++ }
++
++ r = vkernel_set_acl_set(&vk->acl, set);
++
++out_set:
++ kfree(set);
++out:
++ return r;
++}
++
++static int vkernel_vk_ioctl_restrict_linux_cap(struct vkernel *vk, unsigned long arg)
++{
++ void __user *argp = (void __user *)arg;
++ struct vkernel_linux_cap cap;
++
++ if (copy_from_user(&cap, argp, sizeof(cap)))
++ return -EFAULT;
++
++ return vkernel_set_linux_cap(vk, &cap);
++}
++
++static int vkernel_vk_ioctl_set_cpu(struct vkernel *vk, unsigned long arg)
++{
++ void __user *argp = (void __user *)arg;
++ struct vkernel_cpu_desc desc;
++
++ if (copy_from_user(&desc, argp, sizeof(desc)))
++ return -EFAULT;
++
++ return vkernel_set_cpu_pref(vk, &desc);
++}
++
++static int vkernel_vk_ioctl_set_memory(struct vkernel *vk, unsigned long arg)
++{
++ void __user *argp = (void __user *)arg;
++ struct vkernel_mem_desc desc;
++
++ if (copy_from_user(&desc, argp, sizeof(desc)))
++ return -EFAULT;
++
++ return vkernel_set_memory_pref(&vk->mem_pref, &desc);
++}
++
++static int vkernel_vk_ioctl_set_sysctl_fs(struct vkernel *vk, unsigned long arg)
++{
++ void __user *argp = (void __user *)arg;
++ struct vkernel_sysctl_fs_desc desc;
++
++ if (copy_from_user(&desc, argp, sizeof(desc)))
++ return -EFAULT;
++
++ return vkernel_set_sysctl_fs(&vk->sysctl_fs, &desc);
++}
++
++static int vkernel_vk_ioctl_set_sysctl_kernel(struct vkernel *vk, unsigned long arg)
++{
++ void __user *argp = (void __user *)arg;
++ struct vkernel_sysctl_kernel_desc desc;
++ struct ipc_namespace *ipc_ns = NULL;
++
++ if (copy_from_user(&desc, argp, sizeof(desc)))
++ return -EFAULT;
++
++ /* Handle namespace fields */
++ if (vk->init_process->nsproxy)
++ ipc_ns = vk->init_process->nsproxy->ipc_ns;
++ if (likely(ipc_ns)) {
++ if (desc.msgmax)
++ ipc_ns->msg_ctlmax = desc.msgmax;
++ if (desc.msgmnb)
++ ipc_ns->msg_ctlmnb = desc.msgmnb;
++ if (desc.msgmni)
++ ipc_ns->msg_ctlmni = desc.msgmni;
++#ifdef CONFIG_CHECKPOINT_RESTORE
++ if (desc.msg_next_id >= -1)
++ ipc_ns->ids[IPC_MSG_IDS].next_id = desc.msg_next_id;
++#endif
++ if (desc.semmsl > 0)
++ ipc_ns->sem_ctls[0] = desc.semmsl;
++ if (desc.semmns > 0)
++ ipc_ns->sem_ctls[1] = desc.semmns;
++ if (desc.semopm > 0)
++ ipc_ns->sem_ctls[2] = desc.semopm;
++ if (desc.semmni > 0)
++ ipc_ns->sem_ctls[3] = desc.semmni;
++#ifdef CONFIG_CHECKPOINT_RESTORE
++ if (desc.sem_next_id >= -1)
++ ipc_ns->ids[IPC_SEM_IDS].next_id = desc.sem_next_id;
++#endif
++ if (desc.shmall)
++ ipc_ns->shm_ctlall = desc.shmall;
++ if (desc.shmmax)
++ ipc_ns->shm_ctlmax = desc.shmmax;
++ if (desc.shmmni)
++ ipc_ns->shm_ctlmni = desc.shmmni;
++#ifdef CONFIG_CHECKPOINT_RESTORE
++ if (desc.shm_next_id >= -1)
++ ipc_ns->ids[IPC_SHM_IDS].next_id = desc.shm_next_id;
++#endif
++ if (desc.shm_rmid_forced == 0 || desc.shm_rmid_forced == 1)
++ ipc_ns->shm_rmid_forced = desc.shm_rmid_forced;
++ }
++
++ return vkernel_set_sysctl_kernel(&vk->sysctl_kernel, &desc);
++}
++
++static int vkernel_vk_ioctl_set_sysctl_net(struct vkernel *vk, unsigned long arg)
++{
++ void __user *argp = (void __user *)arg;
++ struct vkernel_sysctl_net_desc desc;
++
++ if (copy_from_user(&desc, argp, sizeof(desc)))
++ return -EFAULT;
++
++ return vkernel_set_sysctl_net(&vk->sysctl_net, &desc);
++}
++
++static int vkernel_vk_ioctl_set_sysctl_vm(struct vkernel *vk, unsigned long arg)
++{
++ void __user *argp = (void __user *)arg;
++ struct vkernel_sysctl_vm_desc desc;
++
++ if (copy_from_user(&desc, argp, sizeof(desc)))
++ return -EFAULT;
++
++ if (desc.overcommit_memory == OVERCOMMIT_NEVER)
++ vk_sync_overcommit_as(vk);
++
++ return vkernel_set_sysctl_vm(&vk->sysctl_vm, &desc);
++}
++
++static int vkernel_vk_ioctl_check_extension(struct vkernel *vk, unsigned long arg)
++{
++ int r = 0;
++
++ switch (arg) {
++ case VKERNEL_CAP_ISOLATE_LOG:
++ r = 0;
++ break;
++ default:
++ r = -EOPNOTSUPP;
++ break;
++ }
++
++ return r;
++}
++
++static int vkernel_vk_ioctl_enable_cap(struct vkernel *vk, unsigned long arg)
++{
++ int r = 0;
++
++ if (arg >= VKERNEL_CAP_NUM)
++ return -EINVAL;
++
++ if (vk->caps & (1 << arg))
++ return 0;
++
++ switch (arg) {
++ case VKERNEL_CAP_ISOLATE_LOG:
++ vk->log_ns = vk->pid_ns->ns.inum;
++ break;
++ default:
++ r = -EOPNOTSUPP;
++ }
++
++ if (!r)
++ vk->caps |= (1 << arg);
++
++ return r;
++}
++
++static int stat_show(struct seq_file *m, void *v)
++{
++ struct vkernel *vk = m->private;
++
++ seq_puts(m, "=== BASIC ===\n");
++ seq_printf(m, "Name: %s\n", vk->name);
++ seq_printf(m, "Pid ns: %u\n", vk->pid_ns->ns.inum);
++ seq_printf(m, "Uts ns: %u\n", vk->uts_ns->ns.inum);
++ seq_printf(m, "Init pid: %d\n", vk->init_pid);
++ seq_printf(m, "Users count: %d\n", refcount_read(&vk->users_count));
++ seq_printf(m, "Active: %d\n", vk->active);
++
++ seq_puts(m, "=== SECURITY ===\n");
++ seq_printf(m, "Syscall def act: %d\n", vk->syscall.def_act);
++ seq_printf(m, "Syscall do_futex: %p\n", vk->syscall.do_futex);
++ seq_printf(m, "ACL bits: %d\n", vk->acl.bits);
++ seq_printf(m, "ACL active: %d\n", vk->acl.active);
++ seq_printf(m, "Cap inheritable: 0x%llx\n", vk->linux_cap.inheritable.val);
++ seq_printf(m, "Cap permitted: 0x%llx\n", vk->linux_cap.permitted.val);
++ seq_printf(m, "Cap effective: 0x%llx\n", vk->linux_cap.effective.val);
++ seq_printf(m, "Cap bset: 0x%llx\n", vk->linux_cap.bset.val);
++ seq_printf(m, "Cap ambient: 0x%llx\n", vk->linux_cap.ambient.val);
++
++ seq_puts(m, "=== RESOURCE ===\n");
++ seq_printf(m, "Cpu policy: %d\n", vk->cpu_pref.policy);
++ seq_printf(m, "Cpu rr timeslice: %lu\n", vk->cpu_pref.rr_timeslice_us);
++ seq_printf(m, "Cpu wakeup gran: %lu\n", vk->cpu_pref.wakeup_gran_us);
++ seq_printf(m, "Mem def policy: %u\n", vk->mem_pref.default_policy.mode);
++ seq_printf(m, "Mem shmem huge: %d\n", vk->mem_pref.shmem_huge);
++ seq_printf(m, "Mem thp flags: 0x%lx\n", vk->mem_pref.thp_flags);
++
++ seq_puts(m, "=== EXTENSION CAP ===\n");
++ seq_printf(m, "Isolation caps: 0x%lx\n", vk->caps);
++ seq_printf(m, "Log ns: %u\n", vk->log_ns);
++
++ seq_puts(m, "=== SYSCTL ===\n");
++ seq_printf(m, "fs.file-max=%lu\n", vk->sysctl_fs.files_stat.max_files);
++ seq_printf(m, "fs.nr_open=%u\n", vk->sysctl_fs.nr_open);
++ seq_printf(m, "fs.lease-break-time=%d\n", vk->sysctl_fs.lease_break_time);
++ seq_printf(m, "fs.leases-enable=%d\n", vk->sysctl_fs.leases_enable);
++ seq_printf(m, "fs.mount-max=%u\n", vk->sysctl_fs.mount_max);
++ seq_printf(m, "kernel.numa_balancing=%d\n", vk->sysctl_kernel.nb_mode);
++ seq_printf(m, "kernel.numa_balancing_promote_rate_limit_MBps=%d\n",
++ vk->sysctl_kernel.nb_promote_rate_limit);
++ seq_printf(m, "kernel.sched_cfs_bandwidth_slice_us=%u\n",
++ vk->sysctl_kernel.sched_cfs_bandwidth_slice);
++ seq_printf(m, "kernel.sched_child_runs_first=%u\n",
++ vk->sysctl_kernel.sched_child_runs_first);
++ seq_printf(m, "kernel.sched_deadline_period_max_us=%u\n",
++ vk->sysctl_kernel.sched_dl_period_max);
++ seq_printf(m, "kernel.sched_deadline_period_min_us=%u\n",
++ vk->sysctl_kernel.sched_dl_period_min);
++ seq_printf(m, "kernel.sched_rr_timeslice_ms=%d\n",
++ 
vk->sysctl_kernel.sched_rr_timeslice); ++ seq_printf(m, "kernel.sched_rt_period_us=%d\n", ++ vk->sysctl_kernel.sched_rt_period); ++ seq_printf(m, "kernel.sched_rt_runtime_us=%d\n", ++ vk->sysctl_kernel.sched_rt_runtime); ++ seq_printf(m, "kernel.threads-max=%d\n", vk->sysctl_kernel.max_threads); ++ seq_printf(m, "kernel.keys.gc_delay=%u\n", vk->sysctl_kernel.key_gc_delay); ++ seq_printf(m, "kernel.keys.maxbytes=%u\n", vk->sysctl_kernel.key_quota_maxbytes); ++ seq_printf(m, "kernel.keys.maxkeys=%u\n", vk->sysctl_kernel.key_quota_maxkeys); ++ seq_printf(m, "kernel.keys.persistent_keyring_expiry=%u\n", ++ vk->sysctl_kernel.persistent_keyring_expiry); ++ seq_printf(m, "kernel.keys.root_maxbytes=%u\n", ++ vk->sysctl_kernel.key_quota_root_maxbytes); ++ seq_printf(m, "kernel.keys.root_maxkeys=%u\n", ++ vk->sysctl_kernel.key_quota_root_maxkeys); ++ seq_printf(m, "kernel.pty.max=%d\n", vk->sysctl_kernel.pty_limit); ++ seq_printf(m, "kernel.pty.reserve=%d\n", vk->sysctl_kernel.pty_reserve); ++ seq_printf(m, "net.nf_conntrack_max=%u\n", vk->sysctl_net.nf_conntrack_max); ++ seq_printf(m, "net.core.busy_poll=%u\n", vk->sysctl_net.net_busy_poll); ++ seq_printf(m, "net.core.busy_read=%u\n", vk->sysctl_net.net_busy_read); ++ seq_printf(m, "net.core.optmem_max=%d\n", vk->sysctl_net.optmem_max); ++ seq_printf(m, "net.core.wmem_max=%u\n", vk->sysctl_net.wmem_max); ++ seq_printf(m, "net.core.rmem_max=%u\n", vk->sysctl_net.rmem_max); ++ seq_printf(m, "net.core.wmem_default=%u\n", vk->sysctl_net.wmem_default); ++ seq_printf(m, "net.core.rmem_default=%u\n", vk->sysctl_net.rmem_default); ++ seq_printf(m, "vm.max_map_count=%d\n", vk->sysctl_vm.max_map_count); ++ seq_printf(m, "vm.mmap_min_addr=0x%lx\n", vk->sysctl_vm.mmap_min_addr); ++ seq_printf(m, "vm.dac_mmap_min_addr=0x%lx\n", vk->sysctl_vm.dac_mmap_min_addr); ++ seq_printf(m, "vm.overcommit_kbytes=%lu\n", vk->sysctl_vm.overcommit_kbytes); ++ seq_printf(m, "vm.overcommit_memory=%d\n", vk->sysctl_vm.overcommit_memory); ++ seq_printf(m, "vm.overcommit_ratio=%d\n", vk->sysctl_vm.overcommit_ratio); ++ ++ seq_puts(m, "=== OPERATION ===\n"); ++ seq_printf(m, "Op cap_capable: %p\n", vk->ops.cap_capable); ++ seq_printf(m, "Op generic_permission: %p\n", vk->ops.generic_permission); ++ ++ seq_puts(m, "=== CUSTOM ===\n"); ++ seq_printf(m, "Custom type: %s\n", vk->custom->name); ++ seq_printf(m, "Custom post_create: %p\n", vk->custom->post_create); ++ seq_printf(m, "Custom pre_destroy: %p\n", vk->custom->pre_destroy); ++ ++ return 0; ++} ++ ++static int stat_open(struct inode *inode, struct file *file) ++{ ++ struct vkernel *vk = inode->i_private; ++ int r; ++ ++ if (!vkernel_get_vk_safe(vk)) ++ return -ENOENT; ++ ++ r = single_open(file, stat_show, inode->i_private); ++ if (r < 0) ++ vkernel_put_vk(vk); ++ ++ return r; ++} ++ ++static int stat_release(struct inode *inode, struct file *file) ++{ ++ struct vkernel *vk = inode->i_private; ++ ++ vkernel_put_vk(vk); ++ ++ return single_release(inode, file); ++} ++ ++static const struct file_operations vk_stat_fops = { ++ .open = stat_open, ++ .release = stat_release, ++ .read = seq_read, ++ .llseek = seq_lseek, ++}; ++ ++static int sysctl_show(struct seq_file *m, void *v) ++{ ++ struct vkernel *vk = m->private; ++ struct ipc_namespace *ipc_ns = NULL; ++ struct net *n; ++ ++ if (vk->init_process->nsproxy) ++ ipc_ns = vk->init_process->nsproxy->ipc_ns; ++ ++ n = vk->sysctl_net.net; ++ ++ seq_puts(m, "=== fs ===\n"); ++ seq_printf(m, "fs.file-max=%lu\n", vk->sysctl_fs.files_stat.max_files); ++ seq_printf(m, "fs.nr_open=%u\n", 
vk->sysctl_fs.nr_open); ++ seq_printf(m, "fs.lease-break-time=%d\n", vk->sysctl_fs.lease_break_time); ++ seq_printf(m, "fs.leases-enable=%d\n", vk->sysctl_fs.leases_enable); ++ seq_printf(m, "fs.mount-max=%u\n", vk->sysctl_fs.mount_max); ++ ++ seq_puts(m, "=== kernel ===\n"); ++ if (ipc_ns) { ++ seq_printf(m, "kernel.msgmax=%u\n", ipc_ns->msg_ctlmax); ++ seq_printf(m, "kernel.msgmnb=%u\n", ipc_ns->msg_ctlmnb); ++ seq_printf(m, "kernel.msgmni=%u\n", ipc_ns->msg_ctlmni); ++#ifdef CONFIG_CHECKPOINT_RESTORE ++ seq_printf(m, "kernel.msg_next_id=%d\n", ipc_ns->ids[IPC_MSG_IDS].next_id); ++#endif ++ seq_printf(m, "kernel.sem=%d %d %d\n", ++ ipc_ns->sem_ctls[0], ipc_ns->sem_ctls[1], ipc_ns->sem_ctls[2]); ++#ifdef CONFIG_CHECKPOINT_RESTORE ++ seq_printf(m, "kernel.sem_next_id=%d\n", ipc_ns->ids[IPC_SEM_IDS].next_id); ++#endif ++ seq_printf(m, "kernel.shmall=%lu\n", ipc_ns->shm_ctlall); ++ seq_printf(m, "kernel.shmmax=%lu\n", ipc_ns->shm_ctlmax); ++ seq_printf(m, "kernel.shmmni=%d\n", ipc_ns->shm_ctlmni); ++#ifdef CONFIG_CHECKPOINT_RESTORE ++ seq_printf(m, "kernel.shm_next_id=%d\n", ipc_ns->ids[IPC_SHM_IDS].next_id); ++#endif ++ seq_printf(m, "kernel.shm_rmid_forced=%d\n", ipc_ns->shm_rmid_forced); ++ } ++ seq_printf(m, "kernel.numa_balancing=%d\n", vk->sysctl_kernel.nb_mode); ++ seq_printf(m, "kernel.numa_balancing_promote_rate_limit_MBps=%d\n", ++ vk->sysctl_kernel.nb_promote_rate_limit); ++ seq_printf(m, "kernel.sched_cfs_bandwidth_slice_us=%u\n", ++ vk->sysctl_kernel.sched_cfs_bandwidth_slice); ++ seq_printf(m, "kernel.sched_child_runs_first=%u\n", ++ vk->sysctl_kernel.sched_child_runs_first); ++ seq_printf(m, "kernel.sched_deadline_period_max_us=%u\n", ++ vk->sysctl_kernel.sched_dl_period_max); ++ seq_printf(m, "kernel.sched_deadline_period_min_us=%u\n", ++ vk->sysctl_kernel.sched_dl_period_min); ++ seq_printf(m, "kernel.sched_rr_timeslice_ms=%d\n", ++ vk->sysctl_kernel.sched_rr_timeslice); ++ seq_printf(m, "kernel.sched_rt_period_us=%d\n", ++ vk->sysctl_kernel.sched_rt_period); ++ seq_printf(m, "kernel.sched_rt_runtime_us=%d\n", ++ vk->sysctl_kernel.sched_rt_runtime); ++ seq_printf(m, "kernel.threads-max=%d\n", vk->sysctl_kernel.max_threads); ++ seq_printf(m, "kernel.keys.gc_delay=%u\n", vk->sysctl_kernel.key_gc_delay); ++ seq_printf(m, "kernel.keys.maxbytes=%u\n", vk->sysctl_kernel.key_quota_maxbytes); ++ seq_printf(m, "kernel.keys.maxkeys=%u\n", vk->sysctl_kernel.key_quota_maxkeys); ++ seq_printf(m, "kernel.keys.persistent_keyring_expiry=%u\n", ++ vk->sysctl_kernel.persistent_keyring_expiry); ++ seq_printf(m, "kernel.keys.root_maxbytes=%u\n", ++ vk->sysctl_kernel.key_quota_root_maxbytes); ++ seq_printf(m, "kernel.keys.root_maxkeys=%u\n", ++ vk->sysctl_kernel.key_quota_root_maxkeys); ++ seq_printf(m, "kernel.pty.max=%d\n", vk->sysctl_kernel.pty_limit); ++ seq_printf(m, "kernel.pty.reserve=%d\n", vk->sysctl_kernel.pty_reserve); ++ ++ seq_puts(m, "=== net ===\n"); ++ seq_printf(m, "net.nf_conntrack_max=%u\n", vk->sysctl_net.nf_conntrack_max); ++ seq_printf(m, "net.core.busy_poll=%u\n", vk->sysctl_net.net_busy_poll); ++ seq_printf(m, "net.core.busy_read=%u\n", vk->sysctl_net.net_busy_read); ++ seq_printf(m, "net.core.optmem_max=%d\n", vk->sysctl_net.optmem_max); ++ seq_printf(m, "net.core.wmem_max=%u\n", vk->sysctl_net.wmem_max); ++ seq_printf(m, "net.core.rmem_max=%u\n", vk->sysctl_net.rmem_max); ++ seq_printf(m, "net.core.wmem_default=%u\n", vk->sysctl_net.wmem_default); ++ seq_printf(m, "net.core.rmem_default=%u\n", vk->sysctl_net.rmem_default); ++ ++ seq_printf(m, "net.core.somaxconn=%d\n", 
n->core.sysctl_somaxconn); ++ seq_printf(m, "net.ipv4.icmp_echo_ignore_broadcasts=%u\n", ++ n->ipv4.sysctl_icmp_echo_ignore_broadcasts); ++ seq_printf(m, "net.ipv4.ip_local_port_range=%d %d\n", ++ n->ipv4.ip_local_ports.range[0], n->ipv4.ip_local_ports.range[1]); ++ seq_printf(m, "net.ipv4.tcp_max_tw_buckets=%d\n", ++ n->ipv4.tcp_death_row.sysctl_max_tw_buckets); ++ seq_printf(m, "net.ipv4.tcp_ecn=%u\n", n->ipv4.sysctl_tcp_ecn); ++ seq_printf(m, "net.ipv4.ip_default_ttl=%u\n", n->ipv4.sysctl_ip_default_ttl); ++ seq_printf(m, "net.ipv4.ip_no_pmtu_disc=%u\n", n->ipv4.sysctl_ip_no_pmtu_disc); ++ seq_printf(m, "net.ipv4.tcp_keepalive_time=%d\n", ++ READ_ONCE(n->ipv4.sysctl_tcp_keepalive_time) / HZ); ++ seq_printf(m, "net.ipv4.tcp_keepalive_intvl=%d\n", ++ READ_ONCE(n->ipv4.sysctl_tcp_keepalive_intvl) / HZ); ++ seq_printf(m, "net.ipv4.tcp_keepalive_probes=%u\n", ++ n->ipv4.sysctl_tcp_keepalive_probes); ++ seq_printf(m, "net.ipv4.tcp_syn_retries=%u\n", n->ipv4.sysctl_tcp_syn_retries); ++ seq_printf(m, "net.ipv4.tcp_synack_retries=%u\n", n->ipv4.sysctl_tcp_synack_retries); ++ seq_printf(m, "net.ipv4.tcp_syncookies=%u\n", n->ipv4.sysctl_tcp_syncookies); ++ seq_printf(m, "net.ipv4.tcp_reordering=%d\n", n->ipv4.sysctl_tcp_reordering); ++ seq_printf(m, "net.ipv4.tcp_retries1=%u\n", n->ipv4.sysctl_tcp_retries1); ++ seq_printf(m, "net.ipv4.tcp_retries2=%u\n", n->ipv4.sysctl_tcp_retries2); ++ seq_printf(m, "net.ipv4.tcp_orphan_retries=%u\n", n->ipv4.sysctl_tcp_orphan_retries); ++ seq_printf(m, "net.ipv4.tcp_tw_reuse=%u\n", n->ipv4.sysctl_tcp_tw_reuse); ++ seq_printf(m, "net.ipv4.tcp_fin_timeout=%d\n", ++ READ_ONCE(n->ipv4.sysctl_tcp_fin_timeout) / HZ); ++ seq_printf(m, "net.ipv4.tcp_sack=%u\n", n->ipv4.sysctl_tcp_sack); ++ seq_printf(m, "net.ipv4.tcp_window_scaling=%u\n", n->ipv4.sysctl_tcp_window_scaling); ++ seq_printf(m, "net.ipv4.tcp_timestamps=%u\n", n->ipv4.sysctl_tcp_timestamps); ++ seq_printf(m, "net.ipv4.tcp_thin_linear_timeouts=%u\n", ++ n->ipv4.sysctl_tcp_thin_linear_timeouts); ++ seq_printf(m, "net.ipv4.tcp_retrans_collapse=%u\n", n->ipv4.sysctl_tcp_retrans_collapse); ++ seq_printf(m, "net.ipv4.tcp_fack=%u\n", n->ipv4.sysctl_tcp_fack); ++ seq_printf(m, "net.ipv4.tcp_adv_win_scale=%d\n", n->ipv4.sysctl_tcp_adv_win_scale); ++ seq_printf(m, "net.ipv4.tcp_dsack=%u\n", n->ipv4.sysctl_tcp_dsack); ++ seq_printf(m, "net.ipv4.tcp_nometrics_save=%u\n", n->ipv4.sysctl_tcp_nometrics_save); ++ seq_printf(m, "net.ipv4.tcp_moderate_rcvbuf=%u\n", n->ipv4.sysctl_tcp_moderate_rcvbuf); ++ seq_printf(m, "net.ipv4.tcp_min_tso_segs=%u\n", n->ipv4.sysctl_tcp_min_tso_segs); ++ seq_printf(m, "net.ipv4.tcp_wmem=%d %d %d\n", ++ n->ipv4.sysctl_tcp_wmem[0], n->ipv4.sysctl_tcp_wmem[1], ++ n->ipv4.sysctl_tcp_wmem[2]); ++ seq_printf(m, "net.ipv4.tcp_rmem=%d %d %d\n", ++ n->ipv4.sysctl_tcp_rmem[0], n->ipv4.sysctl_tcp_rmem[1], ++ n->ipv4.sysctl_tcp_rmem[2]); ++ seq_printf(m, "net.ipv4.max_syn_backlog=%d\n", n->ipv4.sysctl_max_syn_backlog); ++ seq_printf(m, "net.ipv4.tcp_fastopen=%u\n", n->ipv4.sysctl_tcp_fastopen); ++ seq_printf(m, "net.ipv4.tcp_congestion_control=%s\n", ++ n->ipv4.tcp_congestion_control->name); ++ ++ seq_printf(m, "net.ipv4.conf.all.forwarding=%d\n", ++ n->ipv4.devconf_all->data[IPV4_DEVCONF_FORWARDING - 1]); ++ seq_printf(m, "net.ipv4.conf.all.mc_forwarding=%d\n", ++ n->ipv4.devconf_all->data[IPV4_DEVCONF_MC_FORWARDING - 1]); ++ seq_printf(m, "net.ipv4.conf.all.proxy_arp=%d\n", ++ n->ipv4.devconf_all->data[IPV4_DEVCONF_PROXY_ARP - 1]); ++ seq_printf(m, "net.ipv4.conf.all.accept_redirects=%d\n", ++ 
n->ipv4.devconf_all->data[IPV4_DEVCONF_ACCEPT_REDIRECTS - 1]); ++ seq_printf(m, "net.ipv4.conf.all.secure_redirects=%d\n", ++ n->ipv4.devconf_all->data[IPV4_DEVCONF_SECURE_REDIRECTS - 1]); ++ seq_printf(m, "net.ipv4.conf.all.send_redirects=%d\n", ++ n->ipv4.devconf_all->data[IPV4_DEVCONF_SEND_REDIRECTS - 1]); ++ seq_printf(m, "net.ipv4.conf.all.shared_media=%d\n", ++ n->ipv4.devconf_all->data[IPV4_DEVCONF_SHARED_MEDIA - 1]); ++ seq_printf(m, "net.ipv4.conf.all.rp_filter=%d\n", ++ n->ipv4.devconf_all->data[IPV4_DEVCONF_RP_FILTER - 1]); ++ seq_printf(m, "net.ipv4.conf.all.accept_source_route=%d\n", ++ n->ipv4.devconf_all->data[IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1]); ++ seq_printf(m, "net.ipv4.conf.all.bootp_relay=%d\n", ++ n->ipv4.devconf_all->data[IPV4_DEVCONF_BOOTP_RELAY - 1]); ++ seq_printf(m, "net.ipv4.conf.all.log_martians=%d\n", ++ n->ipv4.devconf_all->data[IPV4_DEVCONF_LOG_MARTIANS - 1]); ++ seq_printf(m, "net.ipv4.conf.all.tag=%d\n", ++ n->ipv4.devconf_all->data[IPV4_DEVCONF_TAG - 1]); ++ seq_printf(m, "net.ipv4.conf.all.arp_filter=%d\n", ++ n->ipv4.devconf_all->data[IPV4_DEVCONF_ARPFILTER - 1]); ++ seq_printf(m, "net.ipv4.conf.all.medium_id=%d\n", ++ n->ipv4.devconf_all->data[IPV4_DEVCONF_MEDIUM_ID - 1]); ++ seq_printf(m, "net.ipv4.conf.all.disable_xfrm=%d\n", ++ n->ipv4.devconf_all->data[IPV4_DEVCONF_NOXFRM - 1]); ++ seq_printf(m, "net.ipv4.conf.all.disable_policy=%d\n", ++ n->ipv4.devconf_all->data[IPV4_DEVCONF_NOPOLICY - 1]); ++ seq_printf(m, "net.ipv4.conf.all.force_igmp_version=%d\n", ++ n->ipv4.devconf_all->data[IPV4_DEVCONF_FORCE_IGMP_VERSION - 1]); ++ seq_printf(m, "net.ipv4.conf.all.arp_announce=%d\n", ++ n->ipv4.devconf_all->data[IPV4_DEVCONF_ARP_ANNOUNCE - 1]); ++ seq_printf(m, "net.ipv4.conf.all.arp_ignore=%d\n", ++ n->ipv4.devconf_all->data[IPV4_DEVCONF_ARP_IGNORE - 1]); ++ seq_printf(m, "net.ipv4.conf.all.promote_secondaries=%d\n", ++ n->ipv4.devconf_all->data[IPV4_DEVCONF_PROMOTE_SECONDARIES - 1]); ++ seq_printf(m, "net.ipv4.conf.all.arp_accept=%d\n", ++ n->ipv4.devconf_all->data[IPV4_DEVCONF_ARP_ACCEPT - 1]); ++ seq_printf(m, "net.ipv4.conf.all.arp_notify=%d\n", ++ n->ipv4.devconf_all->data[IPV4_DEVCONF_ARP_NOTIFY - 1]); ++ seq_printf(m, "net.ipv4.conf.all.accept_local=%d\n", ++ n->ipv4.devconf_all->data[IPV4_DEVCONF_ACCEPT_LOCAL - 1]); ++ seq_printf(m, "net.ipv4.conf.all.src_valid_mark=%d\n", ++ n->ipv4.devconf_all->data[IPV4_DEVCONF_SRC_VMARK - 1]); ++ seq_printf(m, "net.ipv4.conf.all.proxy_arp_pvlan=%d\n", ++ n->ipv4.devconf_all->data[IPV4_DEVCONF_PROXY_ARP_PVLAN - 1]); ++ seq_printf(m, "net.ipv4.conf.all.route_localnet=%d\n", ++ n->ipv4.devconf_all->data[IPV4_DEVCONF_ROUTE_LOCALNET - 1]); ++ seq_printf(m, "net.ipv4.conf.all.igmpv2_unsolicited_report_interval=%d\n", ++ n->ipv4.devconf_all->data[ ++ IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1]); ++ seq_printf(m, "net.ipv4.conf.all.igmpv3_unsolicited_report_interval=%d\n", ++ n->ipv4.devconf_all->data[ ++ IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1]); ++ seq_printf(m, "net.ipv4.conf.all.ignore_routes_with_linkdown=%d\n", ++ n->ipv4.devconf_all->data[ ++ IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN - 1]); ++ seq_printf(m, "net.ipv4.conf.all.drop_unicast_in_l2_multicast=%d\n", ++ n->ipv4.devconf_all->data[ ++ IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST - 1]); ++ seq_printf(m, "net.ipv4.conf.all.drop_gratuitous_arp=%d\n", ++ n->ipv4.devconf_all->data[IPV4_DEVCONF_DROP_GRATUITOUS_ARP - 1]); ++ seq_printf(m, "net.ipv4.conf.all.bc_forwarding=%d\n", ++ n->ipv4.devconf_all->data[IPV4_DEVCONF_BC_FORWARDING - 1]); 
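The recurring "- 1" in these reads reflects that the IPV4_DEVCONF_* enumerators are 1-based while the backing data[] array is 0-based. A tiny standalone illustration of the convention (a hypothetical two-entry subset, not the real enum):

    #include <stdio.h>

    /* Enumerators start at 1; storage is 0-based, as with IPV4_DEVCONF_*. */
    enum { DEVCONF_FORWARDING = 1, DEVCONF_RP_FILTER, DEVCONF_MAX };

    int main(void)
    {
            int data[DEVCONF_MAX - 1] = { 1, 0 };

            printf("forwarding=%d\n", data[DEVCONF_FORWARDING - 1]);
            printf("rp_filter=%d\n", data[DEVCONF_RP_FILTER - 1]);
            return 0;
    }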
++ seq_printf(m, "net.ipv4.conf.all.arp_evict_nocarrier=%d\n", ++ n->ipv4.devconf_all->data[IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1]); ++ ++ seq_printf(m, "net.ipv4.conf.default.forwarding=%d\n", ++ n->ipv4.devconf_dflt->data[IPV4_DEVCONF_FORWARDING - 1]); ++ seq_printf(m, "net.ipv4.conf.default.mc_forwarding=%d\n", ++ n->ipv4.devconf_dflt->data[IPV4_DEVCONF_MC_FORWARDING - 1]); ++ seq_printf(m, "net.ipv4.conf.default.proxy_arp=%d\n", ++ n->ipv4.devconf_dflt->data[IPV4_DEVCONF_PROXY_ARP - 1]); ++ seq_printf(m, "net.ipv4.conf.default.accept_redirects=%d\n", ++ n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ACCEPT_REDIRECTS - 1]); ++ seq_printf(m, "net.ipv4.conf.default.secure_redirects=%d\n", ++ n->ipv4.devconf_dflt->data[IPV4_DEVCONF_SECURE_REDIRECTS - 1]); ++ seq_printf(m, "net.ipv4.conf.default.send_redirects=%d\n", ++ n->ipv4.devconf_dflt->data[IPV4_DEVCONF_SEND_REDIRECTS - 1]); ++ seq_printf(m, "net.ipv4.conf.default.shared_media=%d\n", ++ n->ipv4.devconf_dflt->data[IPV4_DEVCONF_SHARED_MEDIA - 1]); ++ seq_printf(m, "net.ipv4.conf.default.rp_filter=%d\n", ++ n->ipv4.devconf_dflt->data[IPV4_DEVCONF_RP_FILTER - 1]); ++ seq_printf(m, "net.ipv4.conf.default.accept_source_route=%d\n", ++ n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1]); ++ seq_printf(m, "net.ipv4.conf.default.bootp_relay=%d\n", ++ n->ipv4.devconf_dflt->data[IPV4_DEVCONF_BOOTP_RELAY - 1]); ++ seq_printf(m, "net.ipv4.conf.default.log_martians=%d\n", ++ n->ipv4.devconf_dflt->data[IPV4_DEVCONF_LOG_MARTIANS - 1]); ++ seq_printf(m, "net.ipv4.conf.default.tag=%d\n", ++ n->ipv4.devconf_dflt->data[IPV4_DEVCONF_TAG - 1]); ++ seq_printf(m, "net.ipv4.conf.default.arp_filter=%d\n", ++ n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ARPFILTER - 1]); ++ seq_printf(m, "net.ipv4.conf.default.medium_id=%d\n", ++ n->ipv4.devconf_dflt->data[IPV4_DEVCONF_MEDIUM_ID - 1]); ++ seq_printf(m, "net.ipv4.conf.default.disable_xfrm=%d\n", ++ n->ipv4.devconf_dflt->data[IPV4_DEVCONF_NOXFRM - 1]); ++ seq_printf(m, "net.ipv4.conf.default.disable_policy=%d\n", ++ n->ipv4.devconf_dflt->data[IPV4_DEVCONF_NOPOLICY - 1]); ++ seq_printf(m, "net.ipv4.conf.default.force_igmp_version=%d\n", ++ n->ipv4.devconf_dflt->data[IPV4_DEVCONF_FORCE_IGMP_VERSION - 1]); ++ seq_printf(m, "net.ipv4.conf.default.arp_announce=%d\n", ++ n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ARP_ANNOUNCE - 1]); ++ seq_printf(m, "net.ipv4.conf.default.arp_ignore=%d\n", ++ n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ARP_IGNORE - 1]); ++ seq_printf(m, "net.ipv4.conf.default.promote_secondaries=%d\n", ++ n->ipv4.devconf_dflt->data[IPV4_DEVCONF_PROMOTE_SECONDARIES - 1]); ++ seq_printf(m, "net.ipv4.conf.default.arp_accept=%d\n", ++ n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ARP_ACCEPT - 1]); ++ seq_printf(m, "net.ipv4.conf.default.arp_notify=%d\n", ++ n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ARP_NOTIFY - 1]); ++ seq_printf(m, "net.ipv4.conf.default.accept_local=%d\n", ++ n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ACCEPT_LOCAL - 1]); ++ seq_printf(m, "net.ipv4.conf.default.src_valid_mark=%d\n", ++ n->ipv4.devconf_dflt->data[IPV4_DEVCONF_SRC_VMARK - 1]); ++ seq_printf(m, "net.ipv4.conf.default.proxy_arp_pvlan=%d\n", ++ n->ipv4.devconf_dflt->data[IPV4_DEVCONF_PROXY_ARP_PVLAN - 1]); ++ seq_printf(m, "net.ipv4.conf.default.route_localnet=%d\n", ++ n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ROUTE_LOCALNET - 1]); ++ seq_printf(m, "net.ipv4.conf.default.igmpv2_unsolicited_report_interval=%d\n", ++ n->ipv4.devconf_dflt->data[ ++ IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1]); ++ seq_printf(m, 
"net.ipv4.conf.default.igmpv3_unsolicited_report_interval=%d\n", ++ n->ipv4.devconf_dflt->data[ ++ IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1]); ++ seq_printf(m, "net.ipv4.conf.default.ignore_routes_with_linkdown=%d\n", ++ n->ipv4.devconf_dflt->data[ ++ IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN - 1]); ++ seq_printf(m, "net.ipv4.conf.default.drop_unicast_in_l2_multicast=%d\n", ++ n->ipv4.devconf_dflt->data[ ++ IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST - 1]); ++ seq_printf(m, "net.ipv4.conf.default.drop_gratuitous_arp=%d\n", ++ n->ipv4.devconf_dflt->data[IPV4_DEVCONF_DROP_GRATUITOUS_ARP - 1]); ++ seq_printf(m, "net.ipv4.conf.default.bc_forwarding=%d\n", ++ n->ipv4.devconf_dflt->data[IPV4_DEVCONF_BC_FORWARDING - 1]); ++ seq_printf(m, "net.ipv4.conf.default.arp_evict_nocarrier=%d\n", ++ n->ipv4.devconf_dflt->data[IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1]); ++ ++ seq_puts(m, "=== vm ===\n"); ++ seq_printf(m, "vm.max_map_count=%d\n", vk->sysctl_vm.max_map_count); ++ seq_printf(m, "vm.mmap_min_addr=0x%lx\n", vk->sysctl_vm.mmap_min_addr); ++ seq_printf(m, "vm.dac_mmap_min_addr=0x%lx\n", vk->sysctl_vm.dac_mmap_min_addr); ++ seq_printf(m, "vm.overcommit_kbytes=%lu\n", vk->sysctl_vm.overcommit_kbytes); ++ seq_printf(m, "vm.overcommit_memory=%d\n", vk->sysctl_vm.overcommit_memory); ++ seq_printf(m, "vm.overcommit_ratio=%d\n", vk->sysctl_vm.overcommit_ratio); ++ ++ return 0; ++} ++ ++static int sysctl_open(struct inode *inode, struct file *file) ++{ ++ struct vkernel *vk = inode->i_private; ++ int r; ++ ++ if (!vkernel_get_vk_safe(vk)) ++ return -ENOENT; ++ ++ r = single_open(file, sysctl_show, inode->i_private); ++ if (r < 0) ++ vkernel_put_vk(vk); ++ ++ return r; ++} ++ ++static int sysctl_release(struct inode *inode, struct file *file) ++{ ++ struct vkernel *vk = inode->i_private; ++ ++ vkernel_put_vk(vk); ++ ++ return single_release(inode, file); ++} ++ ++static ssize_t ++sysctl_write(struct file *filp, const char __user *ubuf, ++ size_t cnt, loff_t *ppos) ++{ ++ struct inode *inode; ++ struct vkernel *vk; ++ char buf[256]; ++ size_t ret; ++ ++ inode = file_inode(filp); ++ vk = inode->i_private; ++ ++ if (cnt > 255) ++ cnt = 255; ++ ++ if (copy_from_user(&buf, ubuf, cnt)) ++ return -EFAULT; ++ ++ buf[cnt] = 0; ++ ++ pr_debug("sysctl write, vk %s, buf %s\n", vk->name, buf); ++ ++ ret = vkernel_set_sysctl_raw(vk, buf); ++ if (ret) ++ return ret; ++ ++ return cnt; ++} ++ ++static const struct file_operations vk_sysctl_fops = { ++ .open = sysctl_open, ++ .release = sysctl_release, ++ .read = seq_read, ++ .write = sysctl_write, ++ .llseek = seq_lseek, ++}; ++ ++static void vkernel_destroy_vk_debugfs(struct vkernel *vk) ++{ ++ if (IS_ERR(vk->debugfs_dentry)) ++ return; ++ ++ debugfs_remove_recursive(vk->debugfs_dentry); ++} ++ ++static int vkernel_create_vk_debugfs(struct vkernel *vk, const char *name) ++{ ++ static DEFINE_MUTEX(vkernel_debugfs_lock); ++ struct dentry *dent; ++ ++ if (!debugfs_initialized()) ++ return 0; ++ ++ mutex_lock(&vkernel_debugfs_lock); ++ dent = debugfs_lookup(name, vkernel_debugfs_dir); ++ if (dent) { ++ pr_warn_ratelimited("vkernel: debugfs: duplicate directory %s\n", name); ++ dput(dent); ++ mutex_unlock(&vkernel_debugfs_lock); ++ return 0; ++ } ++ ++ dent = debugfs_create_dir(name, vkernel_debugfs_dir); ++ mutex_unlock(&vkernel_debugfs_lock); ++ if (IS_ERR(dent)) ++ return 0; ++ ++ vk->debugfs_dentry = dent; ++ ++ debugfs_create_file("stat", 0444, dent, vk, &vk_stat_fops); ++ debugfs_create_file("sysctl", 0644, dent, vk, &vk_sysctl_fops); ++ ++ return 0; ++} ++ ++void 
vkernel_destroy_vk(struct vkernel *vk) ++{ ++ pr_info("vkernel: destroy vk %s\n", vk->name); ++ ++ vk->active = false; ++ vkernel_unregister_vk(vk); ++ ++ mutex_lock(&vk_lock); ++#ifdef CONFIG_DEBUG_LIST ++ list_del(&vk->link); ++#else ++ if (vk->link.prev) ++ list_del(&vk->link); ++#endif ++ mutex_unlock(&vk_lock); ++ ++ if (vk->custom->pre_destroy) ++ vk->custom->pre_destroy(vk); ++ if (vk->custom->owner != vkernel_chardev_ops.owner) ++ module_put(vk->custom->owner); ++ ++ vkernel_destroy_vk_debugfs(vk); ++ ++ vk_uninit_sysctl_vm(&vk->sysctl_vm); ++ vk_uninit_sysctl_net(&vk->sysctl_net); ++ vk_uninit_sysctl_kernel(&vk->sysctl_kernel); ++ vk_uninit_sysctl_fs(&vk->sysctl_fs); ++ vk_uninit_memory_pref(&vk->mem_pref); ++ vk_uninit_cpu_pref(&vk->cpu_pref); ++ vk_uninit_acl(&vk->acl); ++ vk_uninit_syscall(&vk->syscall); ++ kfree(vk); ++ module_put(vkernel_chardev_ops.owner); ++} ++EXPORT_SYMBOL(vkernel_destroy_vk); ++ ++struct vkernel *vkernel_create_vk(struct task_struct *tsk, const char *name, ++ const char *custom) ++{ ++ struct vkernel *vk; ++ int r = -ENOMEM; ++ ++ vk = kzalloc(sizeof(struct vkernel), GFP_KERNEL); ++ if (!vk) ++ return ERR_PTR(-ENOMEM); ++ ++ __module_get(vkernel_chardev_ops.owner); ++ ++ /* Init basic info */ ++ strscpy(vk->name, name, VKERNEL_NAME_LEN); ++ INIT_HLIST_NODE(&vk->hash); ++ vk->pid_ns = task_active_pid_ns(tsk); ++ vk->uts_ns = tsk->nsproxy->uts_ns; ++ vk->init_process = tsk; ++ vk->init_pid = tsk->pid; ++ refcount_set(&vk->users_count, 1); ++ ++ /* ++ * Force subsequent debugfs file creations to fail if the vk directory ++ * is not created (by vkernel_create_vk_debugfs()). ++ */ ++ vk->debugfs_dentry = ERR_PTR(-ENOENT); ++ ++ /* Init syscall */ ++ r = vk_init_syscall(&vk->syscall); ++ if (r) ++ goto err_vk; ++ /* Init acl */ ++ r = vk_init_acl(&vk->acl, VKERNEL_ACL_HASH_BITS); ++ if (r) ++ goto err_syscall; ++ /* Init linux cap */ ++ vk->linux_cap.inheritable = tsk->cred->cap_inheritable; ++ vk->linux_cap.permitted = tsk->cred->cap_permitted; ++ vk->linux_cap.effective = tsk->cred->cap_effective; ++ vk->linux_cap.bset = tsk->cred->cap_bset; ++ vk->linux_cap.ambient = tsk->cred->cap_ambient; ++ ++ /* Init cpu preference */ ++ r = vk_init_cpu_pref(&vk->cpu_pref); ++ if (r) ++ goto err_acl; ++ /* Init memory preference */ ++ r = vk_init_memory_pref(&vk->mem_pref); ++ if (r) ++ goto err_cpu; ++ ++ /* Init extension cap */ ++ vk->caps = (1 << VKERNEL_CAP_ISOLATE_LOG); ++ vk->log_ns = vk->pid_ns->ns.inum; ++ ++ /* Init sysctl */ ++ r = vk_init_sysctl_fs(&vk->sysctl_fs); ++ if (r) ++ goto err_mem; ++ r = vk_init_sysctl_kernel(&vk->sysctl_kernel); ++ if (r) ++ goto err_fs; ++ r = vk_init_sysctl_net(&vk->sysctl_net, tsk); ++ if (r) ++ goto err_kernel; ++ r = vk_init_sysctl_vm(&vk->sysctl_vm); ++ if (r) ++ goto err_net; ++ ++ /* Init default operations */ ++ vk->ops.cap_capable = vk_cap_capable; ++ vk->ops.generic_permission = vk_generic_permission; ++ ++ r = vkernel_create_vk_debugfs(vk, name); ++ if (r) ++ goto err_vm; ++ ++ /* Custom initializations */ ++ vk->custom = vkernel_find_custom(custom); ++ if (!vk->custom) ++ vk->custom = &default_custom; ++ if (vk->custom->owner != vkernel_chardev_ops.owner) ++ __module_get(vk->custom->owner); ++ if (vk->custom->post_create) { ++ r = vk->custom->post_create(vk); ++ if (r) ++ goto err_custom_debugfs; ++ } ++ ++ mutex_lock(&vk_lock); ++ list_add(&vk->link, &vk_list); ++ mutex_unlock(&vk_lock); ++ ++ /* Register vk into kernel. It is inactive state. 
*/ ++ vkernel_register_vk(vk); ++ ++ pr_info("vkernel: create vk %s, init %d, custom %s (expect %s)", ++ vk->name, vk->init_pid, vk->custom->name, custom); ++ ++ return vk; ++ ++err_custom_debugfs: ++ if (vk->custom->owner != vkernel_chardev_ops.owner) ++ module_put(vk->custom->owner); ++ ++ vkernel_destroy_vk_debugfs(vk); ++err_vm: ++ vk_uninit_sysctl_vm(&vk->sysctl_vm); ++err_net: ++ vk_uninit_sysctl_net(&vk->sysctl_net); ++err_kernel: ++ vk_uninit_sysctl_kernel(&vk->sysctl_kernel); ++err_fs: ++ vk_uninit_sysctl_fs(&vk->sysctl_fs); ++err_mem: ++ vk_uninit_memory_pref(&vk->mem_pref); ++err_cpu: ++ vk_uninit_cpu_pref(&vk->cpu_pref); ++err_acl: ++ vk_uninit_acl(&vk->acl); ++err_syscall: ++ vk_uninit_syscall(&vk->syscall); ++err_vk: ++ kfree(vk); ++ module_put(vkernel_chardev_ops.owner); ++ ++ return ERR_PTR(r); ++} ++EXPORT_SYMBOL(vkernel_create_vk); ++ ++void vkernel_get_vk(struct vkernel *vk) ++{ ++ refcount_inc(&vk->users_count); ++} ++EXPORT_SYMBOL(vkernel_get_vk); ++ ++/* ++ * Make sure the vk is not during destruction, which is a safe version of ++ * vkernel_get_vk(). Return true if vk referenced successfully, false otherwise. ++ */ ++bool vkernel_get_vk_safe(struct vkernel *vk) ++{ ++ return refcount_inc_not_zero(&vk->users_count); ++} ++EXPORT_SYMBOL(vkernel_get_vk_safe); ++ ++void vkernel_put_vk(struct vkernel *vk) ++{ ++ if (refcount_dec_and_test(&vk->users_count)) ++ vkernel_destroy_vk(vk); ++} ++EXPORT_SYMBOL(vkernel_put_vk); ++ ++/* ++ * Used to put a reference that was taken on behalf of an object associated ++ * with a user-visible file descriptor, e.g. a vcpu or device, if installation ++ * of the new file descriptor fails and the reference cannot be transferred to ++ * its final owner. In such cases, the caller is still actively using @vk and ++ * will fail miserably if the refcount unexpectedly hits zero. ++ */ ++void vkernel_put_vk_no_destroy(struct vkernel *vk) ++{ ++ WARN_ON(refcount_dec_and_test(&vk->users_count)); ++} ++EXPORT_SYMBOL(vkernel_put_vk_no_destroy); ++ ++static int vkernel_vk_release(struct inode *inode, struct file *filp) ++{ ++ struct vkernel *vk = filp->private_data; ++ ++ pr_info("vkernel: release vk fd of %s. 
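vkernel_get_vk_safe() builds on refcount_inc_not_zero(): the reference is taken only if the count has not already dropped to zero, which is what makes it safe against a concurrent destroy. A userspace analogue of that compare-and-swap loop in C11 atomics:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Userspace sketch of the refcount_inc_not_zero() pattern. */
    static bool get_safe(atomic_int *refs)
    {
            int old = atomic_load(refs);

            while (old != 0)
                    if (atomic_compare_exchange_weak(refs, &old, old + 1))
                            return true;
            return false;   /* already zero: destruction in progress */
    }

    int main(void)
    {
            atomic_int live = 1, dying = 0;

            printf("live: %d\n", get_safe(&live));     /* live: 1 */
            printf("dying: %d\n", get_safe(&dying));   /* dying: 0 */
            return 0;
    }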
Currently, vk is still alive\n", vk->name); ++ ++ // vkernel_put_vk(vk); ++ return 0; ++} ++ ++static long vkernel_vk_ioctl(struct file *filp, ++ unsigned int ioctl, unsigned long arg) ++{ ++ struct vkernel *vk = filp->private_data; ++ int r = 0; ++ ++ switch (ioctl) { ++ case VKERNEL_SET_DEF_SYSCALL: ++ r = vkernel_vk_ioctl_set_def_syscall(vk, arg); ++ break; ++ case VKERNEL_RESTRICT_SYSCALL: ++ r = vkernel_vk_ioctl_restrict_syscall(vk, arg); ++ break; ++ case VKERNEL_RESTRICT_FILE: ++ r = vkernel_vk_ioctl_restrict_file(vk, arg); ++ break; ++ case VKERNEL_RESTRICT_LINUX_CAP: ++ r = vkernel_vk_ioctl_restrict_linux_cap(vk, arg); ++ break; ++ case VKERNEL_SET_CPU_PREF: ++ r = vkernel_vk_ioctl_set_cpu(vk, arg); ++ break; ++ case VKERNEL_SET_MEMORY_PREF: ++ r = vkernel_vk_ioctl_set_memory(vk, arg); ++ break; ++ case VKERNEL_SET_SYSCTL_FS: ++ r = vkernel_vk_ioctl_set_sysctl_fs(vk, arg); ++ break; ++ case VKERNEL_SET_SYSCTL_KERNEL: ++ r = vkernel_vk_ioctl_set_sysctl_kernel(vk, arg); ++ break; ++ case VKERNEL_SET_SYSCTL_NET: ++ r = vkernel_vk_ioctl_set_sysctl_net(vk, arg); ++ break; ++ case VKERNEL_SET_SYSCTL_VM: ++ r = vkernel_vk_ioctl_set_sysctl_vm(vk, arg); ++ break; ++ case VKERNEL_CHECK_EXTENSION: ++ r = vkernel_vk_ioctl_check_extension(vk, arg); ++ break; ++ case VKERNEL_ENABLE_CAP: ++ r = vkernel_vk_ioctl_enable_cap(vk, arg); ++ break; ++ case VKERNEL_REGISTER: ++ pr_warn("vkernel: [deprecated] register vk, init %d id %u ret %d\n", ++ vk->init_process->pid, vk->pid_ns->ns.inum, r); ++ break; ++ case VKERNEL_UNREGISTER: ++ pr_warn("vkernel: [deprecated] unregister vk, init %d id %u ret %d\n", ++ vk->init_process->pid, vk->pid_ns->ns.inum, r); ++ break; ++ case VKERNEL_ACTIVATE: ++ vk->active = true; ++ break; ++ case VKERNEL_DEACTIVATE: ++ vk->active = false; ++ break; ++ default: ++ r = vkernel_arch_vk_ioctl(filp, ioctl, arg); ++ } ++ ++ return r; ++} ++ ++#ifdef CONFIG_VKERNEL_COMPAT ++long __weak vkernel_arch_vk_compat_ioctl(struct file *filp, unsigned int ioctl, ++ unsigned long arg) ++{ ++ return -ENOTTY; ++} ++ ++static long vkernel_vk_compat_ioctl(struct file *filp, ++ unsigned int ioctl, unsigned long arg) ++{ ++ int r; ++ ++ r = vkernel_arch_vk_compat_ioctl(filp, ioctl, arg); ++ if (r != -ENOTTY) ++ return r; ++ ++ return vkernel_vk_ioctl(filp, ioctl, arg); ++} ++#endif ++ ++static const struct file_operations vkernel_vk_fops = { ++ .release = vkernel_vk_release, ++ .unlocked_ioctl = vkernel_vk_ioctl, ++ .llseek = noop_llseek, ++ VKERNEL_COMPAT(vkernel_vk_compat_ioctl), ++}; ++ ++static int vkernel_dev_ioctl_create_vk(unsigned long arg) ++{ ++ void __user *argp = (void __user *)arg; ++ struct vkernel_desc desc; ++ struct task_struct *tsk; ++ struct vkernel *vk; ++ struct file *file; ++ char fdname[ITOA_MAX_LEN * 2 + 2]; ++ int r, fd; ++ ++ if (copy_from_user(&desc, argp, sizeof(desc))) ++ return -EFAULT; ++ ++ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), ++ "find_task_by_pid_ns() needs rcu_read_lock() protection"); ++ tsk = pid_task(find_pid_ns(desc.pid, &init_pid_ns), PIDTYPE_PID); ++ if (!tsk) { ++ pr_err("cannot find pid %d\n", desc.pid); ++ return -EINVAL; ++ } ++ ++ fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); ++ if (fd < 0) { ++ pr_err("cannot get unused fd\n"); ++ return fd; ++ } ++ ++ snprintf(fdname, sizeof(fdname), "%d-%d", desc.pid, fd); ++ ++ vk = vkernel_create_vk(tsk, fdname, desc.custom); ++ if (IS_ERR(vk)) { ++ r = PTR_ERR(vk); ++ goto put_fd; ++ } ++ ++ file = anon_inode_getfile("vkernel-vk", &vkernel_vk_fops, vk, O_RDWR); ++ if (IS_ERR(file)) { ++ r = PTR_ERR(file); 
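Together with the misc device below, this forms a KVM-style two-level ioctl API: system-wide ioctls against /dev/vkernel, per-instance ioctls against the fd returned by VKERNEL_CREATE_VK. A sketch of the userspace side, using only ioctl numbers defined in this patch (struct vkernel_desc is only partly visible in this excerpt, so the CREATE step is left as a comment):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    #define VKERNELIO               0xAF
    #define VKERNEL_GET_API_VERSION _IO(VKERNELIO, 0x00)
    #define VKERNEL_ENABLE_CAP      _IO(VKERNELIO, 0x11)
    #define VKERNEL_ACTIVATE        _IO(VKERNELIO, 0x14)
    #define VKERNEL_CAP_ISOLATE_LOG 0

    int main(void)
    {
            int dev = open("/dev/vkernel", O_RDWR);

            if (dev < 0) {
                    perror("open /dev/vkernel");
                    return 1;
            }
            printf("api=%d\n", ioctl(dev, VKERNEL_GET_API_VERSION, 0));
            /*
             * A vk fd from VKERNEL_CREATE_VK would then be driven with:
             *   ioctl(vkfd, VKERNEL_ENABLE_CAP, VKERNEL_CAP_ISOLATE_LOG);
             *   ioctl(vkfd, VKERNEL_ACTIVATE, 0);
             */
            close(dev);
            return 0;
    }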
++ goto put_kernel; ++ } ++ ++ vkernel_uevent_notify_change(VKERNEL_EVENT_CREATE_VK, vk); ++ ++ fd_install(fd, file); ++ return fd; ++ ++put_kernel: ++ vkernel_put_vk(vk); ++put_fd: ++ put_unused_fd(fd); ++ return r; ++} ++ ++static int vkernel_dev_ioctl_destroy_vk(unsigned long arg) ++{ ++ struct vkernel *vk; ++ unsigned int id = (unsigned int)arg; ++ ++ pr_info("vkernel: try to destroy vk with id %u\n", id); ++ ++ vk = vkernel_find_vk_by_id(id); ++ if (!vk) ++ return -EINVAL; ++ ++ vkernel_put_vk(vk); ++ return 0; ++} ++ ++static long vkernel_dev_ioctl(struct file *filp, ++ unsigned int ioctl, unsigned long arg) ++{ ++ int r = -EINVAL; ++ ++ switch (ioctl) { ++ case VKERNEL_GET_API_VERSION: ++ if (arg) ++ goto out; ++ r = VKERNEL_API_VERSION; ++ break; ++ case VKERNEL_CREATE_VK: ++ r = vkernel_dev_ioctl_create_vk(arg); ++ break; ++ case VKERNEL_DESTROY_VK: ++ r = vkernel_dev_ioctl_destroy_vk(arg); ++ break; ++ case VKERNEL_CHECK_EXTENSION: ++ r = vkernel_vk_ioctl_check_extension(NULL, arg); ++ break; ++ case VKERNEL_TRACE_ENABLE: ++ case VKERNEL_TRACE_PAUSE: ++ case VKERNEL_TRACE_DISABLE: ++ r = -EOPNOTSUPP; ++ break; ++ default: ++ r = vkernel_arch_dev_ioctl(filp, ioctl, arg); ++ } ++out: ++ return r; ++} ++ ++static const struct file_operations vkernel_chardev_ops = { ++ .owner = THIS_MODULE, ++ .unlocked_ioctl = vkernel_dev_ioctl, ++ .llseek = noop_llseek, ++ VKERNEL_COMPAT(vkernel_dev_ioctl), ++}; ++ ++static struct miscdevice vkernel_dev = { ++ VKERNEL_MINOR, ++ "vkernel", ++ &vkernel_chardev_ops, ++}; ++ ++static void vkernel_uevent_notify_change(unsigned int type, struct vkernel *vk) ++{ ++ struct kobj_uevent_env *env; ++ unsigned long long created, active; ++ ++ if (!vkernel_dev.this_device || !vk) ++ return; ++ ++ mutex_lock(&event_lock); ++ if (type == VKERNEL_EVENT_CREATE_VK) { ++ vkernel_createvk_count++; ++ vkernel_active_vks++; ++ } else if (type == VKERNEL_EVENT_DESTROY_VK) { ++ vkernel_active_vks--; ++ } ++ created = vkernel_createvk_count; ++ active = vkernel_active_vks; ++ mutex_unlock(&event_lock); ++ ++ env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT); ++ if (!env) ++ return; ++ ++ add_uevent_var(env, "CREATED=%llu", created); ++ add_uevent_var(env, "COUNT=%llu", active); ++ ++ if (type == VKERNEL_EVENT_CREATE_VK) ++ add_uevent_var(env, "EVENT=create"); ++ else if (type == VKERNEL_EVENT_DESTROY_VK) ++ add_uevent_var(env, "EVENT=destroy"); ++ add_uevent_var(env, "VKID=%d", vk->pid_ns->ns.inum); ++ ++ if (!IS_ERR(vk->debugfs_dentry)) { ++ char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT); ++ ++ if (p) { ++ tmp = dentry_path_raw(vk->debugfs_dentry, p, PATH_MAX); ++ if (!IS_ERR(tmp)) ++ add_uevent_var(env, "STATS_PATH=%s", tmp); ++ kfree(p); ++ } ++ } ++ /* no need for checks, since we are adding at most only 5 keys */ ++ env->envp[env->envp_idx++] = NULL; ++ kobject_uevent_env(&vkernel_dev.this_device->kobj, KOBJ_CHANGE, env->envp); ++ kfree(env); ++} ++ ++static int clear_zombie_vks(void) ++{ ++ struct vkernel *vk; ++ struct vkernel *tmp; ++ struct task_struct *tsk; ++ int count = 0; ++ ++ list_for_each_entry_safe(vk, tmp, &vk_list, link) { ++ tsk = pid_task(find_pid_ns(vk->init_pid, &init_pid_ns), PIDTYPE_PID); ++ if (tsk != vk->init_process) { ++ if (refcount_read(&vk->users_count) > 1) ++ pr_err("vkernel: BUG! 
zombie vk %s has other refs, init %d custom %s\n", ++ vk->name, vk->init_pid, vk->custom->name); ++ vkernel_put_vk(vk); ++ count++; ++ } ++ } ++ ++ return count; ++} ++ ++static int clear_zombie_set(void *data, u64 val) ++{ ++ int count; ++ ++ count = clear_zombie_vks(); ++ pr_info("cleared %d zombie vks\n", count); ++ return 0; ++} ++ ++DEFINE_DEBUGFS_ATTRIBUTE(clear_zombie_fops, NULL, clear_zombie_set, ++ "%lld\n"); ++ ++static void vkernel_init_debug(void) ++{ ++ vkernel_debugfs_dir = debugfs_create_dir("vkernel", NULL); ++ ++ debugfs_create_file("clear_zombie", 0200, vkernel_debugfs_dir, ++ NULL, &clear_zombie_fops); ++} ++ ++int vkernel_init(void) ++{ ++ int ret; ++ ++ if (vk_kallsyms_init()) ++ return -1; ++ if (vk_cap_init()) ++ return -1; ++ if (vk_syscall_init()) ++ return -1; ++ if (vk_acl_init()) ++ return -1; ++ ++ vkernel_init_debug(); ++ ++ ret = misc_register(&vkernel_dev); ++ if (ret) { ++ pr_err("vkernel: misc device register failed\n"); ++ return ret; ++ } ++ ++ vkernel_register_custom(&default_custom); ++ vkernel_register_custom(&analysis_custom); ++ pr_info("vkernel: load vkernel\n"); ++ ++ return 0; ++} ++EXPORT_SYMBOL(vkernel_init); ++ ++void vkernel_exit(void) ++{ ++ clear_zombie_vks(); ++ ++ pr_info("vkernel: unload vkernel\n"); ++ vkernel_unregister_custom(&analysis_custom); ++ vkernel_unregister_custom(&default_custom); ++ ++ misc_deregister(&vkernel_dev); ++ ++ debugfs_remove_recursive(vkernel_debugfs_dir); ++ ++ vk_acl_uninit(); ++ vk_syscall_uninit(); ++ vk_cap_uninit(); ++ vk_kallsyms_uninit(); ++} ++EXPORT_SYMBOL(vkernel_exit); ++ ++module_init(vkernel_init); ++module_exit(vkernel_exit); +diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c +index 299c295a27a0..9db216c62c64 100644 +--- a/fs/devpts/inode.c ++++ b/fs/devpts/inode.c +@@ -24,6 +24,9 @@ + #include + #include + #include ++#ifdef CONFIG_VKERNEL ++#include ++#endif + + #define DEVPTS_DEFAULT_MODE 0600 + /* +@@ -512,6 +515,15 @@ static struct file_system_type devpts_fs_type = { + int devpts_new_index(struct pts_fs_info *fsi) + { + int index = -ENOSPC; ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk = vkernel_find_vk_by_task(current); ++ if (vk && atomic_inc_return(&vk->sysctl_kernel.pty_count) >= ++ (vk->sysctl_kernel.pty_limit - ++ (fsi->mount_opts.reserve ? 0 : vk->sysctl_kernel.pty_reserve))) { ++ atomic_dec(&vk->sysctl_kernel.pty_count); ++ return index; ++ } ++#endif + + if (atomic_inc_return(&pty_count) >= (pty_limit - + (fsi->mount_opts.reserve ?
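vkernel_init() bails out with a bare -1 and leaves earlier stages initialized when a later stage fails; the conventional shape for staged initialization is a goto-based unwind. A generic standalone sketch of that shape (step names invented, this is not the patch's code):

    #include <stdio.h>

    static int  step_a(void) { return 0; }
    static void undo_a(void) { }
    static int  step_b(void) { return -1; }   /* simulate a failure */

    static int staged_init(void)
    {
            int ret;

            ret = step_a();
            if (ret)
                    return ret;
            ret = step_b();
            if (ret)
                    goto err_a;   /* unwind everything done so far */
            return 0;
    err_a:
            undo_a();
            return ret;
    }

    int main(void)
    {
            printf("init: %d\n", staged_init());   /* init: -1 */
            return 0;
    }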
0 : pty_reserve))) +@@ -521,13 +533,25 @@ int devpts_new_index(struct pts_fs_info *fsi) + GFP_KERNEL); + + out: +- if (index < 0) ++ if (index < 0) { ++#ifdef CONFIG_VKERNEL ++ if (vk) ++ atomic_dec(&vk->sysctl_kernel.pty_count); ++#endif + atomic_dec(&pty_count); ++ } + return index; + } + + void devpts_kill_index(struct pts_fs_info *fsi, int idx) + { ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++ ++ vk = vkernel_find_vk_by_task(current); ++ if (vk) ++ atomic_dec(&vk->sysctl_kernel.pty_count); ++#endif + ida_free(&fsi->allocated_ptys, idx); + atomic_dec(&pty_count); + } +diff --git a/fs/exec.c b/fs/exec.c +index 2ac3ef80628f..0642bdcac342 100644 +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -66,6 +66,9 @@ + #include + #include + #include ++#ifdef CONFIG_VKERNEL ++#include ++#endif + + #include + #include +@@ -759,6 +762,9 @@ int setup_arg_pages(struct linux_binprm *bprm, + unsigned long rlim_stack; + struct mmu_gather tlb; + struct vma_iterator vmi; ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++#endif + + #ifdef CONFIG_STACK_GROWSUP + /* Limit stack size */ +@@ -783,6 +789,12 @@ int setup_arg_pages(struct linux_binprm *bprm, + stack_top = arch_align_stack(stack_top); + stack_top = PAGE_ALIGN(stack_top); + ++#ifdef CONFIG_VKERNEL ++ vk = vkernel_find_vk_by_task(current); ++ if (vk && (unlikely(stack_top < vk->sysctl_vm.mmap_min_addr) || ++ unlikely(vma->vm_end - vma->vm_start >= stack_top - vk->sysctl_vm.mmap_min_addr))) ++ return -ENOMEM; ++#endif + if (unlikely(stack_top < mmap_min_addr) || + unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr)) + return -ENOMEM; +diff --git a/fs/file.c b/fs/file.c +index c37c958514e7..0b7636fb673a 100644 +--- a/fs/file.c ++++ b/fs/file.c +@@ -22,6 +22,9 @@ + #include + #include + #include ++#ifdef CONFIG_VKERNEL ++#include ++#endif + + #include "internal.h" + +@@ -103,6 +106,9 @@ static struct fdtable * alloc_fdtable(unsigned int nr) + { + struct fdtable *fdt; + void *data; ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++#endif + + /* + * Figure out how many fds we actually want to support in this fdtable. +@@ -123,6 +129,11 @@ static struct fdtable * alloc_fdtable(unsigned int nr) + * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise + * bitmaps handling below becomes unpleasant, to put it mildly... + */ ++#ifdef CONFIG_VKERNEL ++ vk = vkernel_find_vk_by_task(current); ++ if (vk && unlikely(nr > vk->sysctl_fs.nr_open)) ++ nr = ((vk->sysctl_fs.nr_open - 1) | (BITS_PER_LONG - 1)) + 1; ++#endif + if (unlikely(nr > sysctl_nr_open)) + nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; + +@@ -214,6 +225,9 @@ static int expand_files(struct files_struct *files, unsigned int nr) + { + struct fdtable *fdt; + int expanded = 0; ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++#endif + + repeat: + fdt = files_fdtable(files); +@@ -223,6 +237,11 @@ static int expand_files(struct files_struct *files, unsigned int nr) + return expanded; + + /* Can we expand? 
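The alloc_fdtable() clamp, ((nr_open - 1) | (BITS_PER_LONG - 1)) + 1, rounds the limit up to the next multiple of BITS_PER_LONG so the fd bitmaps stay whole words. The idiom in isolation:

    #include <stdio.h>

    /* Round x up to a multiple of b, where b is a power of two. */
    static unsigned int round_up_pow2(unsigned int x, unsigned int b)
    {
            return ((x - 1) | (b - 1)) + 1;
    }

    int main(void)
    {
            printf("%u\n", round_up_pow2(100, 64));      /* 128 */
            printf("%u\n", round_up_pow2(1048576, 64));  /* 1048576, already aligned */
            return 0;
    }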
*/ ++#ifdef CONFIG_VKERNEL ++ vk = vkernel_find_vk_by_task(current); ++ if (vk && nr >= vk->sysctl_fs.nr_open) ++ return -EMFILE; ++#endif + if (nr >= sysctl_nr_open) + return -EMFILE; + +diff --git a/fs/file_table.c b/fs/file_table.c +index a5a3a385f24c..a0175354a1ef 100644 +--- a/fs/file_table.c ++++ b/fs/file_table.c +@@ -29,6 +29,9 @@ + #include + #include + #include ++#ifdef CONFIG_VKERNEL ++#include ++#endif + + #include + +@@ -77,8 +80,16 @@ static inline void file_free(struct file *f) + security_file_free(f); + if (unlikely(f->f_mode & FMODE_BACKING)) + path_put(backing_file_user_path(f)); +- if (likely(!(f->f_mode & FMODE_NOACCOUNT))) ++ if (likely(!(f->f_mode & FMODE_NOACCOUNT))) { ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++ ++ vk = vkernel_find_vk_by_task(current); ++ if (vk) ++ percpu_counter_dec(&vk->sysctl_fs.nr_files); ++#endif + percpu_counter_dec(&nr_files); ++ } + call_rcu(&f->f_rcuhead, file_free_rcu); + } + +@@ -90,6 +101,13 @@ static long get_nr_files(void) + return percpu_counter_read_positive(&nr_files); + } + ++#ifdef CONFIG_VKERNEL ++static long vk_get_nr_files(struct vkernel_sysctl_fs *fs) ++{ ++ return percpu_counter_read_positive(&fs->nr_files); ++} ++#endif ++ + /* + * Return the maximum number of open files in the system + */ +@@ -190,7 +208,19 @@ struct file *alloc_empty_file(int flags, const struct cred *cred) + static long old_max; + struct file *f; + int error; +- ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++ struct vkernel_sysctl_fs *fs = NULL; ++ ++ vk = vkernel_find_vk_by_task(current); ++ if (vk) { ++ fs = &vk->sysctl_fs; ++ if (vk_get_nr_files(fs) >= fs->files_stat.max_files && !capable(CAP_SYS_ADMIN)) { ++ if (percpu_counter_sum_positive(&fs->nr_files) >= fs->files_stat.max_files) ++ goto over_vk; ++ } ++ } ++#endif + /* + * Privileged users can go above max_files + */ +@@ -213,10 +243,22 @@ struct file *alloc_empty_file(int flags, const struct cred *cred) + return ERR_PTR(error); + } + ++#ifdef CONFIG_VKERNEL ++ if (fs) ++ percpu_counter_inc(&fs->nr_files); ++#endif + percpu_counter_inc(&nr_files); + + return f; + ++#ifdef CONFIG_VKERNEL ++over_vk: ++ /* Ran out of vk filps, fs cannot be NULL here */ ++ if (vk_get_nr_files(fs) > fs->old_max) { ++ pr_info("VFS: vkernel file-max limit %lu reached\n", fs->files_stat.max_files); ++ fs->old_max = vk_get_nr_files(fs); ++ } ++#endif + over: + /* Ran out of filps - report that */ + if (get_nr_files() > old_max) { +diff --git a/fs/inode.c b/fs/inode.c +index ad7445342ee9..10dbaaa43ac6 100644 +--- a/fs/inode.c ++++ b/fs/inode.c +@@ -20,6 +20,9 @@ + #include + #include + #include ++#ifdef CONFIG_VKERNEL ++#include ++#endif + #include + #include "internal.h" + +@@ -158,6 +161,9 @@ int inode_init_always(struct super_block *sb, struct inode *inode) + static const struct inode_operations empty_iops; + static const struct file_operations no_open_fops = {.open = no_open}; + struct address_space *const mapping = &inode->i_data; ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++#endif + + inode->i_sb = sb; + inode->i_blkbits = sb->s_blocksize_bits; +@@ -231,6 +237,11 @@ int inode_init_always(struct super_block *sb, struct inode *inode) + + if (unlikely(security_inode_alloc(inode))) + return -ENOMEM; ++#ifdef CONFIG_VKERNEL ++ vk = vkernel_find_vk_by_task(current); ++ if (vk) ++ this_cpu_inc(*vk->sysctl_fs.nr_inodes); ++#endif + this_cpu_inc(nr_inodes); + + return 0; +@@ -281,6 +292,10 @@ static struct inode *alloc_inode(struct super_block *sb) + + void __destroy_inode(struct inode *inode) + { ++#ifdef 
CONFIG_VKERNEL ++ struct vkernel *vk; ++#endif ++ + BUG_ON(inode_has_buffers(inode)); + inode_detach_wb(inode); + security_inode_free(inode); +@@ -296,6 +311,11 @@ void __destroy_inode(struct inode *inode) + posix_acl_release(inode->i_acl); + if (inode->i_default_acl && !is_uncached_acl(inode->i_default_acl)) + posix_acl_release(inode->i_default_acl); ++#endif ++#ifdef CONFIG_VKERNEL ++ vk = vkernel_find_vk_by_task(current); ++ if (vk) ++ this_cpu_dec(*vk->sysctl_fs.nr_inodes); + #endif + this_cpu_dec(nr_inodes); + } +@@ -455,6 +475,10 @@ EXPORT_SYMBOL(ihold); + + static void __inode_add_lru(struct inode *inode, bool rotate) + { ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++#endif ++ + if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE)) + return; + if (atomic_read(&inode->i_count)) +@@ -464,8 +488,14 @@ static void __inode_add_lru(struct inode *inode, bool rotate) + if (!mapping_shrinkable(&inode->i_data)) + return; + +- if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru)) ++ if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru)) { ++#ifdef CONFIG_VKERNEL ++ vk = vkernel_find_vk_by_task(current); ++ if (vk) ++ this_cpu_inc(*vk->sysctl_fs.nr_unused); ++#endif + this_cpu_inc(nr_unused); ++ } + else if (rotate) + inode->i_state |= I_REFERENCED; + } +@@ -482,8 +512,16 @@ void inode_add_lru(struct inode *inode) + + static void inode_lru_list_del(struct inode *inode) + { +- if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru)) ++ if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru)) { ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++ ++ vk = vkernel_find_vk_by_task(current); ++ if (vk) ++ this_cpu_dec(*vk->sysctl_fs.nr_unused); ++#endif + this_cpu_dec(nr_unused); ++ } + } + + static void inode_pin_lru_isolating(struct inode *inode) +@@ -850,6 +888,11 @@ static enum lru_status inode_lru_isolate(struct list_head *item, + { + struct list_head *freeable = arg; + struct inode *inode = container_of(item, struct inode, i_lru); ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++ ++ vk = vkernel_find_vk_by_task(current); ++#endif + + /* + * We are inverting the lru lock/inode->i_lock here, so use a +@@ -869,6 +912,10 @@ static enum lru_status inode_lru_isolate(struct list_head *item, + !mapping_shrinkable(&inode->i_data)) { + list_lru_isolate(lru, &inode->i_lru); + spin_unlock(&inode->i_lock); ++#ifdef CONFIG_VKERNEL ++ if (vk) ++ this_cpu_dec(*vk->sysctl_fs.nr_unused); ++#endif + this_cpu_dec(nr_unused); + return LRU_REMOVED; + } +@@ -908,6 +955,10 @@ static enum lru_status inode_lru_isolate(struct list_head *item, + list_lru_isolate_move(lru, &inode->i_lru, freeable); + spin_unlock(&inode->i_lock); + ++#ifdef CONFIG_VKERNEL ++ if (vk) ++ this_cpu_dec(*vk->sysctl_fs.nr_unused); ++#endif + this_cpu_dec(nr_unused); + return LRU_REMOVED; + } +diff --git a/fs/namei.c b/fs/namei.c +index 5e58afc0d0d9..d6e568cd5955 100644 +--- a/fs/namei.c ++++ b/fs/namei.c +@@ -41,6 +41,9 @@ + #include + #include + #include ++#ifdef CONFIG_VKERNEL ++#include ++#endif + + #include "internal.h" + #include "mount.h" +@@ -401,6 +404,16 @@ int generic_permission(struct mnt_idmap *idmap, struct inode *inode, + int mask) + { + int ret; ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++ ++ vk = vkernel_find_vk_by_task(current); ++ if (vk) { ++ ret = vk->ops.generic_permission(vk, idmap, inode, mask); ++ if (ret) ++ return ret; ++ } ++#endif + + /* + * Do the basic permission checks. 
+diff --git a/fs/namespace.c b/fs/namespace.c +index 5a74d92dec6b..4ee4d368e26b 100644 +--- a/fs/namespace.c ++++ b/fs/namespace.c +@@ -32,6 +32,9 @@ + #include + #include + #include ++#ifdef CONFIG_VKERNEL ++#include ++#endif + + #include "pnode.h" + #include "internal.h" +@@ -2203,6 +2206,13 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt) + unsigned int max = READ_ONCE(sysctl_mount_max); + unsigned int mounts = 0; + struct mount *p; ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++ ++ vk = vkernel_find_vk_by_task(current); ++ if (vk && ns->mounts >= READ_ONCE(vk->sysctl_fs.mount_max)) ++ return -ENOSPC; ++#endif + + if (ns->mounts >= max) + return -ENOSPC; +diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c +index 57a431c1130b..b4be42d62c0a 100644 +--- a/fs/proc/meminfo.c ++++ b/fs/proc/meminfo.c +@@ -19,6 +19,10 @@ + #endif + #include + #include ++#include ++#ifdef CONFIG_VKERNEL ++#include ++#endif + #include + #include "internal.h" + +@@ -35,12 +39,24 @@ static void show_val_kb(struct seq_file *m, const char *s, unsigned long num) + static int meminfo_proc_show(struct seq_file *m, void *v) + { + struct sysinfo i; ++ unsigned long commit_limit; + unsigned long committed; + long cached; + long available; + unsigned long pages[NR_LRU_LISTS]; + unsigned long sreclaimable, sunreclaim; + int lru; ++#if defined(CONFIG_VKERNEL) && defined(CONFIG_MEMCG) ++ struct vkernel *vk; ++ struct mem_cgroup *memcg; ++ ++ vk = vkernel_find_vk_by_task(current); ++ memcg = mem_cgroup_from_task(current); ++ commit_limit = (vk && memcg) ? ++ vk_vm_commit_limit(&vk->sysctl_vm, memcg) : vm_commit_limit(); ++#else ++ commit_limit = vm_commit_limit(); ++#endif + + si_meminfo(&i); + si_swapinfo(&i); +@@ -126,7 +142,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) + global_zone_page_state(NR_BOUNCE)); + show_val_kb(m, "WritebackTmp: ", + global_node_page_state(NR_WRITEBACK_TEMP)); +- show_val_kb(m, "CommitLimit: ", vm_commit_limit()); ++ show_val_kb(m, "CommitLimit: ", commit_limit); + show_val_kb(m, "Committed_AS: ", committed); + seq_printf(m, "VmallocTotal: %8lu kB\n", + (unsigned long)VMALLOC_TOTAL >> 10); +diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c +index c64ad1fca4e4..661fb0f8b979 100644 +--- a/fs/userfaultfd.c ++++ b/fs/userfaultfd.c +@@ -32,6 +32,9 @@ + #include + #include + #include ++#ifdef CONFIG_VKERNEL ++#include ++#endif + + static int sysctl_unprivileged_userfaultfd __read_mostly; + +@@ -1337,11 +1340,19 @@ static __always_inline int validate_unaligned_range( + struct mm_struct *mm, __u64 start, __u64 len) + { + __u64 task_size = mm->task_size; ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++#endif + + if (len & ~PAGE_MASK) + return -EINVAL; + if (!len) + return -EINVAL; ++#ifdef CONFIG_VKERNEL ++ vk = vkernel_find_vk_by_task(current); ++ if (vk && start < vk->sysctl_vm.mmap_min_addr) ++ return -EINVAL; ++#endif + if (start < mmap_min_addr) + return -EINVAL; + if (start >= task_size) +diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h +index c0fea6ca5076..dc8fb910c84d 100644 +--- a/include/linux/miscdevice.h ++++ b/include/linux/miscdevice.h +@@ -62,6 +62,7 @@ + #define FUSE_MINOR 229 + #define SNAPSHOT_MINOR 231 + #define KVM_MINOR 232 ++#define VKERNEL_MINOR 233 + #define BTRFS_MINOR 234 + #define AUTOFS_MINOR 235 + #define MAPPER_CTRL_MINOR 236 +diff --git a/include/linux/mman.h b/include/linux/mman.h +index 8ddca62d6460..beb4dc58b4b9 100644 +--- a/include/linux/mman.h ++++ b/include/linux/mman.h +@@ -74,10 +74,14 @@ static inline void mm_compute_batch(int
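For context, the stock vm_commit_limit() that vk_vm_commit_limit() mirrors per memcg computes either an absolute overcommit_kbytes cap or a ratio of RAM, plus swap. A standalone restatement of the arithmetic (parameter names invented; the real function also subtracts hugetlb pages and converts kbytes via PAGE_SHIFT):

    #include <stdio.h>

    #define PAGE_SIZE_KB 4UL   /* assumes 4 KiB pages */

    static unsigned long commit_limit_pages(unsigned long total_pages,
                                            unsigned long swap_pages,
                                            unsigned long overcommit_kbytes,
                                            int overcommit_ratio)
    {
            unsigned long allowed;

            if (overcommit_kbytes)
                    allowed = overcommit_kbytes / PAGE_SIZE_KB;
            else
                    allowed = total_pages * overcommit_ratio / 100;
            return allowed + swap_pages;
    }

    int main(void)
    {
            /* 4 GiB of RAM, 1 GiB of swap, ratio 50% -> 786432 pages (3 GiB) */
            printf("%lu\n", commit_limit_pages(1048576, 262144, 0, 50));
            return 0;
    }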
overcommit_policy) + + unsigned long vm_memory_committed(void); + ++#ifdef CONFIG_VKERNEL ++void vm_acct_memory(long pages); ++#else + static inline void vm_acct_memory(long pages) + { + percpu_counter_add_batch(&vm_committed_as, pages, vm_committed_as_batch); + } ++#endif + + static inline void vm_unacct_memory(long pages) + { +diff --git a/include/linux/vkernel.h b/include/linux/vkernel.h +new file mode 100644 +index 000000000000..e98d9d14ac73 +--- /dev/null ++++ b/include/linux/vkernel.h +@@ -0,0 +1,578 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Copyright (C) 2024 ARM Ltd. ++ * Author: Hang Huang ++ */ ++ ++#ifndef _LINUX_VKERNEL_H ++#define _LINUX_VKERNEL_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include ++ ++#define VKERNEL_API_VERSION 1 ++ ++#define VKERNEL_NAME_LEN 64 ++#define VKERNEL_PATH_MAX 128 ++#define VKERNEL_ACL_HASH_BITS 8 ++ ++#define NOT_FOUND 0x8000 ++#define IOP_VKERNEL_REG 0x8000 ++#define IOP_VKERNEL_DIR 0x4000 ++ ++/* Refer KVM */ ++#define VKERNELIO 0xAF ++ ++/* System/VK IOCTL list */ ++#define VKERNEL_GET_API_VERSION _IO(VKERNELIO, 0x00) ++#define VKERNEL_CREATE_VK _IO(VKERNELIO, 0x01) ++#define VKERNEL_DESTROY_VK _IO(VKERNELIO, 0x02) ++#define VKERNEL_CHECK_EXTENSION _IO(VKERNELIO, 0x03) ++#define VKERNEL_TRACE_ENABLE _IO(VKERNELIO, 0x04) ++#define VKERNEL_TRACE_PAUSE _IO(VKERNELIO, 0x05) ++#define VKERNEL_TRACE_DISABLE _IO(VKERNELIO, 0x06) ++#define VKERNEL_SET_DEF_SYSCALL _IO(VKERNELIO, 0x07) ++#define VKERNEL_RESTRICT_SYSCALL _IO(VKERNELIO, 0x08) ++#define VKERNEL_RESTRICT_FILE _IO(VKERNELIO, 0x09) ++#define VKERNEL_RESTRICT_LINUX_CAP _IO(VKERNELIO, 0x0a) ++#define VKERNEL_SET_CPU_PREF _IO(VKERNELIO, 0X0b) ++#define VKERNEL_SET_MEMORY_PREF _IO(VKERNELIO, 0X0c) ++#define VKERNEL_SET_SYSCTL_FS _IO(VKERNELIO, 0X0d) ++#define VKERNEL_SET_SYSCTL_KERNEL _IO(VKERNELIO, 0x0e) ++#define VKERNEL_SET_SYSCTL_NET _IO(VKERNELIO, 0x0f) ++#define VKERNEL_SET_SYSCTL_VM _IO(VKERNELIO, 0x10) ++#define VKERNEL_ENABLE_CAP _IO(VKERNELIO, 0x11) ++#define VKERNEL_REGISTER _IO(VKERNELIO, 0x12) ++#define VKERNEL_UNREGISTER _IO(VKERNELIO, 0x13) ++#define VKERNEL_ACTIVATE _IO(VKERNELIO, 0x14) ++#define VKERNEL_DEACTIVATE _IO(VKERNELIO, 0x15) ++ ++/* syscall condition compare operations */ ++#define VKERNEL_SYSCALL_CMP_ED 0 /* invalid op, means the end of conditions */ ++#define VKERNEL_SYSCALL_CMP_EQ 1 /* equal, arg == val */ ++#define VKERNEL_SYSCALL_CMP_NE 2 /* not equal, arg != val */ ++#define VKERNEL_SYSCALL_CMP_LT 3 /* less than, arg < val */ ++#define VKERNEL_SYSCALL_CMP_LE 4 /* less than or equal, arg <= val */ ++#define VKERNEL_SYSCALL_CMP_GT 5 /* greater than, arg > val */ ++#define VKERNEL_SYSCALL_CMP_GE 6 /* greater than or equal, arg >= val */ ++#define VKRENEL_SYSCALL_CMP_ME 7 /* masked equal, arg & mask == val, mask is val1 */ ++ ++/* syscall rule actions */ ++#define VKERNEL_SYSCALL_ACT_INVALID 0 ++#define VKERNEL_SYSCALL_ACT_KILL_PROCESS 1 ++#define VKERNEL_SYSCALL_ACT_KILL_THREAD 2 ++#define VKERNEL_SYSCALL_ACT_TRAP 3 ++#define VKERNEL_SYSCALL_ACT_ERRNO 4 ++#define VKERNEL_SYSCALL_ACT_USER_NOTIF 5 ++#define VKERNEL_SYSCALL_ACT_TRACE 6 ++#define VKERNEL_SYSCALL_ACT_LOG 7 ++#define VKERNEL_SYSCALL_ACT_ALLOW 8 ++ ++#define VKERNEL_SYSCALL_ACT_BITS 16 ++#define VKERNEL_SYSCALL_ERRNO_BITS 16 ++#define VKERNEL_SYSCALL_ERRNO_MASK ((1U << VKERNEL_SYSCALL_ERRNO_BITS) - 1) ++ ++/* Extension capability list */ ++#define VKERNEL_CAP_ISOLATE_LOG 0 ++#define 
VKERNEL_CAP_ISOLATE_ANON 1 ++#define VKERNEL_CAP_ISOLATE_ANON_PIPE 2 ++#define VKERNEL_CAP_ISOLATE_RAMFS 3 ++#define VKERNEL_CAP_NUM 4 ++ ++#define current_vk_task get_current_syscall_task() ++#define current_vk get_current_syscall_vk() ++ ++#define vk_hugepage_flags_enabled(flags) \ ++ (flags & \ ++ ((1< + #include + #include ++#ifdef CONFIG_VKERNEL ++#include ++#endif + + #include + +@@ -760,6 +763,18 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) + file = hugetlb_file_setup(name, hugesize, acctflag, + HUGETLB_SHMFS_INODE, (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); + } else { ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++ ++ vk = vkernel_find_vk_by_task(current); ++ if (vk) { ++ if ((shmflg & SHM_NORESERVE) && ++ vk->sysctl_vm.overcommit_memory != OVERCOMMIT_NEVER) ++ acctflag = VM_NORESERVE; ++ } else if ((shmflg & SHM_NORESERVE) && ++ sysctl_overcommit_memory != OVERCOMMIT_NEVER) ++ acctflag = VM_NORESERVE; ++#else + /* + * Do not allow no accounting for OVERCOMMIT_NEVER, even + * if it's asked for. +@@ -767,6 +782,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) + if ((shmflg & SHM_NORESERVE) && + sysctl_overcommit_memory != OVERCOMMIT_NEVER) + acctflag = VM_NORESERVE; ++#endif + file = shmem_kernel_file_setup(name, size, acctflag); + } + error = PTR_ERR(file); +diff --git a/kernel/Makefile b/kernel/Makefile +index da4c2d1838dc..340c8a9c62bb 100644 +--- a/kernel/Makefile ++++ b/kernel/Makefile +@@ -15,6 +15,7 @@ obj-y = fork.o exec_domain.o panic.o \ + obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o + obj-$(CONFIG_MULTIUSER) += groups.o + obj-$(CONFIG_VHOST_TASK) += vhost_task.o ++obj-$(CONFIG_VKERNEL) += vkernel_hook.o + + ifdef CONFIG_FUNCTION_TRACER + # Do not trace internal ftrace files +diff --git a/kernel/exit.c b/kernel/exit.c +index f2b87b2a7009..f74eff48e956 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -69,6 +69,9 @@ + #include + #include + #include ++#ifdef CONFIG_VKERNEL ++#include ++#endif + + #include + #include +@@ -122,6 +125,13 @@ late_initcall(kernel_exit_sysfs_init); + + static void __unhash_process(struct task_struct *p, bool group_dead) + { ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++ ++ vk = vkernel_find_vk_by_task(current); ++ if (vk) ++ vk->sysctl_kernel.nr_threads--; ++#endif + nr_threads--; + detach_pid(p, PIDTYPE_PID); + if (group_dead) { +diff --git a/kernel/fork.c b/kernel/fork.c +index 96c6a9e446ac..4cce74c0792b 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -103,6 +103,9 @@ + #include + #endif + #include ++#ifdef CONFIG_VKERNEL ++#include ++#endif + #include + + #include +@@ -2336,6 +2339,9 @@ __latent_entropy struct task_struct *copy_process( + struct file *pidfile = NULL; + const u64 clone_flags = args->flags; + struct nsproxy *nsp = current->nsproxy; ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++#endif + + /* + * Don't allow sharing the root directory with processes in a different +@@ -2488,6 +2494,12 @@ __latent_entropy struct task_struct *copy_process( + * to stop root fork bombs. 
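The VKERNEL_SYSCALL_CMP_* values defined in the header above give seccomp-like compare operations for syscall-argument conditions: CMP_ED terminates a condition list, and the masked-equal op takes its mask from val1. A standalone sketch of evaluating one condition:

    #include <stdio.h>

    enum { CMP_ED, CMP_EQ, CMP_NE, CMP_LT, CMP_LE, CMP_GT, CMP_GE, CMP_ME };

    static int cond_match(int op, unsigned long arg,
                          unsigned long val, unsigned long mask)
    {
            switch (op) {
            case CMP_EQ: return arg == val;
            case CMP_NE: return arg != val;
            case CMP_LT: return arg < val;
            case CMP_LE: return arg <= val;
            case CMP_GT: return arg > val;
            case CMP_GE: return arg >= val;
            case CMP_ME: return (arg & mask) == val;
            default:     return 0;   /* CMP_ED: end of the condition list */
            }
    }

    int main(void)
    {
            printf("%d\n", cond_match(CMP_ME, 0x42, 0x40, 0xf0));   /* 1 */
            printf("%d\n", cond_match(CMP_LT, 3, 8, 0));            /* 1 */
            return 0;
    }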
+ */ + retval = -EAGAIN; ++#ifdef CONFIG_VKERNEL ++ vk = vkernel_find_vk_by_task(current); ++ /* Vkernel: Check vkernel data race */ ++ if (vk && data_race(vk->sysctl_kernel.nr_threads >= vk->sysctl_kernel.max_threads)) ++ goto bad_fork_cleanup_count; ++#endif + if (data_race(nr_threads >= max_threads)) + goto bad_fork_cleanup_count; + +@@ -2823,6 +2835,10 @@ __latent_entropy struct task_struct *copy_process( + &p->signal->thread_head); + } + attach_pid(p, PIDTYPE_PID); ++#ifdef CONFIG_VKERNEL ++ if (vk) ++ vk->sysctl_kernel.nr_threads++; ++#endif + nr_threads++; + } + total_forks++; +@@ -3650,6 +3666,13 @@ int sysctl_max_threads(struct ctl_table *table, int write, + int threads = max_threads; + int min = 1; + int max = MAX_THREADS; ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++ ++ vk = vkernel_find_vk_by_task(current); ++ if (vk) ++ threads = vk->sysctl_kernel.max_threads; ++#endif + + t = *table; + t.data = &threads; +@@ -3660,7 +3683,14 @@ int sysctl_max_threads(struct ctl_table *table, int write, + if (ret || !write) + return ret; + ++#ifdef CONFIG_VKERNEL ++ if (vk) ++ vk->sysctl_kernel.max_threads = threads; ++ else ++ max_threads = threads; ++#else + max_threads = threads; ++#endif + + return 0; + } +diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c +index a8074079b09e..9a38095d5d00 100644 +--- a/kernel/futex/syscalls.c ++++ b/kernel/futex/syscalls.c +@@ -3,6 +3,9 @@ + #include + #include + #include ++#ifdef CONFIG_VKERNEL ++#include ++#endif + + #include "futex.h" + +@@ -87,6 +90,13 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, + { + int cmd = op & FUTEX_CMD_MASK; + unsigned int flags = 0; ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++ ++ vk = vkernel_find_vk_by_task(current); ++ if (vk && vk->syscall.do_futex) ++ return vk->syscall.do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); ++#endif + + if (!(op & FUTEX_PRIVATE_FLAG)) + flags |= FLAGS_SHARED; +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index de9f3fc6ec2b..bed4d6579ef1 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -47,6 +47,9 @@ + #include + #include + #include ++#ifdef CONFIG_VKERNEL ++#include ++#endif + + #include + #include +@@ -1112,6 +1115,9 @@ static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, + dest_r.info->flags = r->info->flags; + dest_r.info->ts_nsec = r->info->ts_nsec; + dest_r.info->caller_id = r->info->caller_id; ++#ifdef CONFIG_VKERNEL ++ dest_r.info->ns = r->info->ns; ++#endif + memcpy(&dest_r.info->dev_info, &r->info->dev_info, sizeof(dest_r.info->dev_info)); + + prb_final_commit(&e); +@@ -2206,6 +2212,9 @@ int vprintk_store(int facility, int level, + u16 text_len; + int ret = 0; + u64 ts_nsec; ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++#endif + + if (!printk_enter_irqsave(recursion_ptr, irqflags)) + return 0; +@@ -2293,6 +2302,11 @@ int vprintk_store(int facility, int level, + r.info->flags = flags & 0x1f; + r.info->ts_nsec = ts_nsec; + r.info->caller_id = caller_id; ++#ifdef CONFIG_VKERNEL ++ /* Set log namespace (host can set any invalid value) */ ++ vk = vkernel_find_vk_by_task(current); ++ r.info->ns = vk ? 
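The do_futex() hunk shows the hook pattern this patch uses throughout: if the calling task belongs to an active instance and an override is installed, delegate to it, otherwise fall through to the stock path. The shape in isolation (all names invented):

    #include <stdio.h>

    typedef long (*futex_hook_t)(unsigned int *uaddr, int op);

    static long stock_do_futex(unsigned int *uaddr, int op)
    {
            return 0;   /* stand-in for the normal kernel path */
    }

    static long do_futex_dispatch(futex_hook_t hook, unsigned int *uaddr, int op)
    {
            if (hook)   /* per-instance override installed */
                    return hook(uaddr, op);
            return stock_do_futex(uaddr, op);
    }

    static long deny_all(unsigned int *uaddr, int op) { return -1; }

    int main(void)
    {
            unsigned int word = 0;

            printf("%ld\n", do_futex_dispatch(NULL, &word, 1));       /* 0 */
            printf("%ld\n", do_futex_dispatch(deny_all, &word, 1));   /* -1 */
            return 0;
    }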
vk->log_ns : 0; ++#endif + if (dev_info) + memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); + +diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c +index fde338606ce8..c230231abb69 100644 +--- a/kernel/printk/printk_ringbuffer.c ++++ b/kernel/printk/printk_ringbuffer.c +@@ -5,6 +5,9 @@ + #include + #include + #include ++#ifdef CONFIG_VKERNEL ++#include ++#endif + #include "printk_ringbuffer.h" + + /** +@@ -1803,6 +1806,14 @@ static int prb_read(struct printk_ringbuffer *rb, u64 seq, + struct prb_desc desc; + unsigned long id; + int err; ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++ ++ /* Skip record when reading log owned by other ns */ ++ vk = vkernel_find_vk_by_task(current); ++ if (vk && vk->log_ns != info->ns) ++ return -ENOENT; ++#endif + + /* Extract the ID, used to specify the descriptor to read. */ + id = DESC_ID(atomic_long_read(state_var)); +diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h +index 18cd25e489b8..72d591b7842e 100644 +--- a/kernel/printk/printk_ringbuffer.h ++++ b/kernel/printk/printk_ringbuffer.h +@@ -20,6 +20,9 @@ struct printk_info { + u8 flags:5; /* internal record flags */ + u8 level:3; /* syslog level */ + u32 caller_id; /* thread id or processor id */ ++#ifdef CONFIG_VKERNEL ++ u64 ns; /* log namespace */ ++#endif + + struct dev_printk_info dev_info; + }; +diff --git a/kernel/sys.c b/kernel/sys.c +index 355de0b65c23..e85d7cb1e490 100644 +--- a/kernel/sys.c ++++ b/kernel/sys.c +@@ -64,6 +64,9 @@ + #include + #include + #include ++#ifdef CONFIG_VKERNEL ++#include ++#endif + + #include + +@@ -1457,6 +1460,9 @@ static int do_prlimit(struct task_struct *tsk, unsigned int resource, + { + struct rlimit *rlim; + int retval = 0; ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++#endif + + if (resource >= RLIM_NLIMITS) + return -EINVAL; +@@ -1465,6 +1471,12 @@ static int do_prlimit(struct task_struct *tsk, unsigned int resource, + if (new_rlim) { + if (new_rlim->rlim_cur > new_rlim->rlim_max) + return -EINVAL; ++#ifdef CONFIG_VKERNEL ++ vk = vkernel_find_vk_by_task(current); ++ if (vk && resource == RLIMIT_NOFILE && ++ new_rlim->rlim_max > vk->sysctl_fs.nr_open) ++ return -EPERM; ++#endif + if (resource == RLIMIT_NOFILE && + new_rlim->rlim_max > sysctl_nr_open) + return -EPERM; +@@ -1928,6 +1940,9 @@ static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map) + { + unsigned long mmap_max_addr = TASK_SIZE; + int error = -EINVAL, i; ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++#endif + + static const unsigned char offsets[] = { + offsetof(struct prctl_mm_map, start_code), +@@ -1950,6 +1965,11 @@ static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map) + for (i = 0; i < ARRAY_SIZE(offsets); i++) { + u64 val = *(u64 *)((char *)prctl_map + offsets[i]); + ++#ifdef CONFIG_VKERNEL ++ vk = vkernel_find_vk_by_task(current); ++ if (vk && (unsigned long)val < vk->sysctl_vm.mmap_min_addr) ++ goto out; ++#endif + if ((unsigned long)val >= mmap_max_addr || + (unsigned long)val < mmap_min_addr) + goto out; +@@ -2135,6 +2155,9 @@ static int prctl_set_mm(int opt, unsigned long addr, + }; + struct vm_area_struct *vma; + int error; ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++#endif + + if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV && + opt != PR_SET_MM_MAP && +@@ -2155,6 +2178,11 @@ static int prctl_set_mm(int opt, unsigned long addr, + if (opt == PR_SET_MM_AUXV) + return prctl_set_auxv(mm, addr, arg4); + ++#ifdef CONFIG_VKERNEL ++ vk = vkernel_find_vk_by_task(current); ++ if (vk && 
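Log isolation is writer-tagged and reader-filtered: vprintk_store() stamps each record with the writer's log namespace, and prb_read() returns -ENOENT for records whose tag differs from the reading instance's log_ns, while host readers (no vk) see every record. A toy model of that filter:

    #include <stdio.h>

    struct record {
            unsigned long ns;   /* namespace inode number of the writer */
            const char *text;
    };

    /* Mirror of the prb_read() check; reader_ns == 0 models a host reader. */
    static int read_record(const struct record *r, unsigned long reader_ns)
    {
            if (reader_ns && r->ns != reader_ns)
                    return -1;   /* skipped, like -ENOENT above */
            printf("%s\n", r->text);
            return 0;
    }

    int main(void)
    {
            struct record host_rec = { 0, "from the host" };
            struct record vk_rec = { 4026532201UL, "from the instance" };

            read_record(&host_rec, 4026532201UL);   /* skipped */
            read_record(&vk_rec, 4026532201UL);     /* printed */
            return 0;
    }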
addr < vk->sysctl_vm.mmap_min_addr)
++		return -EINVAL;
++#endif
+	if (addr >= TASK_SIZE || addr < mmap_min_addr)
+		return -EINVAL;
+
+diff --git a/kernel/vkernel_hook.c b/kernel/vkernel_hook.c
+new file mode 100644
+index 000000000000..7a438f251941
+--- /dev/null
++++ b/kernel/vkernel_hook.c
+@@ -0,0 +1,92 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Vkernel hook
++ *
++ * Vkernel policies are implemented as loadable module(s) and
++ * applied via hooks.
++ *
++ * Copyright (C) 2024 ARM Ltd.
++ * Author: Hang Huang
++ */
++
++#include <linux/vkernel.h>
++
++static DEFINE_MUTEX(vkernel_lock);
++static DEFINE_HASHTABLE(vkernel_ht, 6);
++
++/* id -> vk cache */
++static unsigned int id_cache;
++static struct vkernel *vk_cache;
++
++DEFINE_PER_CPU(struct task_struct *, current_syscall_task);
++EXPORT_PER_CPU_SYMBOL(current_syscall_task);
++
++DEFINE_PER_CPU(struct vkernel *, current_syscall_vk);
++EXPORT_PER_CPU_SYMBOL(current_syscall_vk);
++
++struct vkernel *vkernel_find_vk_by_id(unsigned int id)
++{
++	struct vkernel *vk;
++
++	if (id == id_cache)
++		return vk_cache;
++
++	/* TODO: protect with rwlock? */
++	hash_for_each_possible(vkernel_ht, vk, hash, id) {
++		if (id == vk->pid_ns->ns.inum) {
++			id_cache = vk->pid_ns->ns.inum;
++			vk_cache = vk;
++			return vk;
++		}
++	}
++
++	return NULL;
++}
++EXPORT_SYMBOL(vkernel_find_vk_by_id);
++
++struct vkernel *vkernel_find_vk_by_task(struct task_struct *tsk)
++{
++	struct vkernel *vk;
++	struct pid_namespace *ns;
++
++	ns = task_active_pid_ns(tsk);
++	if (!ns || ns == &init_pid_ns)
++		return NULL;
++
++	vk = vkernel_find_vk_by_id(ns->ns.inum);
++	if (vk && vk->active)
++		return vk;
++
++	return NULL;
++}
++EXPORT_SYMBOL(vkernel_find_vk_by_task);
++
++int vkernel_register_vk(struct vkernel *vk)
++{
++	if (!hlist_unhashed(&vk->hash))
++		return -EEXIST;
++
++	mutex_lock(&vkernel_lock);
++	hash_add(vkernel_ht, &vk->hash, vk->pid_ns->ns.inum);
++	mutex_unlock(&vkernel_lock);
++	id_cache = vk->pid_ns->ns.inum;
++	vk_cache = vk;
++
++	return 0;
++}
++EXPORT_SYMBOL(vkernel_register_vk);
++
++int vkernel_unregister_vk(struct vkernel *vk)
++{
++	if (vk->pid_ns->ns.inum == id_cache) {
++		id_cache = 0;
++		vk_cache = NULL;
++	}
++	/* It is also ok to remove an unhashed vk */
++	mutex_lock(&vkernel_lock);
++	hash_del(&vk->hash);
++	mutex_unlock(&vkernel_lock);
++
++	return 0;
++}
++EXPORT_SYMBOL(vkernel_unregister_vk);
+diff --git a/mm/huge_memory.c b/mm/huge_memory.c
+index 260d8f3ec934..76d214aebb17 100644
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -39,6 +39,9 @@
+ #include
+ #include
+ #include
++#ifdef CONFIG_VKERNEL
++#include <linux/vkernel.h>
++#endif
+
+ #include
+ #include
+@@ -89,6 +92,14 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
+ 	bool in_pf = tva_flags & TVA_IN_PF;
+ 	bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
+ 	unsigned long supported_orders;
++#ifdef CONFIG_VKERNEL
++	unsigned long flags = transparent_hugepage_flags;
++	struct vkernel *vk;
++
++	vk = vkernel_find_vk_by_task(current);
++	if (vk)
++		flags = vk->mem_pref.thp_flags;
++#endif
+
+ 	/* Check the intersection of requested and supported orders. */
+ 	if (vma_is_anonymous(vma))
+@@ -105,7 +116,11 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
+ 	if (!vma->vm_mm) /* vdso */
+ 		return 0;
+
++#ifdef CONFIG_VKERNEL
++	if (vk_thp_disabled_by_hw(flags) || vma_thp_disabled(vma, vm_flags))
++#else
+ 	if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags))
++#endif
+ 		return 0;
+
+ 	/* khugepaged doesn't collapse DAX vma, but page fault is fine. 
*/ +@@ -157,9 +172,15 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, + * Enforce sysfs THP requirements as necessary. Anonymous vmas + * were already handled in thp_vma_allowable_orders(). + */ ++#ifdef CONFIG_VKERNEL ++ if (enforce_sysfs && ++ (!vk_hugepage_flags_enabled(flags) || (!(vm_flags & VM_HUGEPAGE) && ++ !vk_hugepage_flags_always(flags)))) ++#else + if (enforce_sysfs && + (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) && + !hugepage_global_always()))) ++#endif + return 0; + + /* +@@ -1404,23 +1425,33 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, + gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma) + { + const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE); ++ unsigned long *flags = &transparent_hugepage_flags; ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++ ++ vk = vkernel_find_vk_by_task(current); ++ if (vk) ++ flags = &vk->mem_pref.thp_flags; ++ ++ /* FIXME: should we both check global and local flags? */ ++#endif + + /* Always do synchronous compaction */ +- if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) ++ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, flags)) + return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY); + + /* Kick kcompactd and fail quickly */ +- if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) ++ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, flags)) + return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; + + /* Synchronous compaction if madvised, otherwise kick kcompactd */ +- if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) ++ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, flags)) + return GFP_TRANSHUGE_LIGHT | + (vma_madvised ? __GFP_DIRECT_RECLAIM : + __GFP_KSWAPD_RECLAIM); + + /* Only do synchronous compaction if madvised */ +- if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) ++ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, flags)) + return GFP_TRANSHUGE_LIGHT | + (vma_madvised ? 
__GFP_DIRECT_RECLAIM : 0); + +@@ -1449,6 +1480,14 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) + struct folio *folio; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + vm_fault_t ret; ++#ifdef CONFIG_VKERNEL ++ unsigned long flags = transparent_hugepage_flags; ++ struct vkernel *vk; ++ ++ vk = vkernel_find_vk_by_task(current); ++ if (vk) ++ flags = vk->mem_pref.thp_flags; ++#endif + + if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) + return VM_FAULT_FALLBACK; +@@ -1459,7 +1498,11 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) + + if (!(vmf->flags & FAULT_FLAG_WRITE) && + !mm_forbids_zeropage(vma->vm_mm) && ++#ifdef CONFIG_VKERNEL ++ vk_transparent_hugepage_use_zero_page(flags)) { ++#else + transparent_hugepage_use_zero_page()) { ++#endif + pgtable_t pgtable; + struct page *zero_page; + vm_fault_t ret; +diff --git a/mm/memory.c b/mm/memory.c +index c81a2c3be013..e19a0c5dfb01 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -79,6 +79,9 @@ + #include + #include + #include ++#ifdef CONFIG_VKERNEL ++#include ++#endif + + #include + +@@ -4929,6 +4932,14 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + pmd_t entry; + vm_fault_t ret = VM_FAULT_FALLBACK; ++#ifdef CONFIG_VKERNEL ++ unsigned long flags = transparent_hugepage_flags; ++ struct vkernel *vk; ++ ++ vk = vkernel_find_vk_by_task(current); ++ if (vk) ++ flags = vk->mem_pref.thp_flags; ++#endif + + /* + * It is too late to allocate a small folio, we already have a large +@@ -4936,7 +4947,11 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) + * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any + * PMD mappings if THPs are disabled. + */ ++#ifdef CONFIG_VKERNEL ++ if (vk_thp_disabled_by_hw(flags) || vma_thp_disabled(vma, vma->vm_flags)) ++#else + if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags)) ++#endif + return ret; + + if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) +diff --git a/mm/mmap.c b/mm/mmap.c +index 32799ed58022..96782a776633 100644 +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -48,6 +48,9 @@ + #include + #include + #include ++#ifdef CONFIG_VKERNEL ++#include ++#endif + + #include + #include +@@ -1198,7 +1201,16 @@ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) + */ + static inline unsigned long round_hint_to_min(unsigned long hint) + { ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++#endif ++ + hint &= PAGE_MASK; ++#ifdef CONFIG_VKERNEL ++ vk = vkernel_find_vk_by_task(current); ++ if (vk && ((void *)hint != NULL) && (hint < vk->sysctl_vm.mmap_min_addr)) ++ hint = PAGE_ALIGN(vk->sysctl_vm.mmap_min_addr); ++#endif + if (((void *)hint != NULL) && + (hint < mmap_min_addr)) + return PAGE_ALIGN(mmap_min_addr); +@@ -1268,6 +1280,9 @@ unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, unsigned lon + struct list_head *uf) + { + int pkey = 0; ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++#endif + + *populate = 0; + +@@ -1301,6 +1316,11 @@ unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, unsigned lon + return -EOVERFLOW; + + /* Too many mappings? */ ++#ifdef CONFIG_VKERNEL ++ vk = vkernel_find_vk_by_task(current); ++ if (vk && mm->map_count > vk->sysctl_vm.max_map_count) ++ return -ENOMEM; ++#endif + if (mm->map_count > sysctl_max_map_count) + return -ENOMEM; + +@@ -1422,9 +1442,17 @@ unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, unsigned lon + * memory use of this mapping. 
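++	 * Under CONFIG_VKERNEL the vkernel instance's own overcommit_memory
++	 * policy is honoured here instead of the global sysctl.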
+ */ + if (flags & MAP_NORESERVE) { ++#ifdef CONFIG_VKERNEL ++ if (vk) { ++ if (vk->sysctl_vm.overcommit_memory != OVERCOMMIT_NEVER) ++ vm_flags |= VM_NORESERVE; ++ } else if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) ++ vm_flags |= VM_NORESERVE; ++#else + /* We honor MAP_NORESERVE if allowed to overcommit */ + if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) + vm_flags |= VM_NORESERVE; ++#endif + + /* hugetlb applies strict overcommit unless MAP_NORESERVE */ + if (file && is_file_hugepages(file)) +@@ -1635,6 +1663,9 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) + unsigned long length, gap; + unsigned long low_limit, high_limit; + struct vm_area_struct *tmp; ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++#endif + + MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); + +@@ -1644,6 +1675,11 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) + return -ENOMEM; + + low_limit = info->low_limit; ++#ifdef CONFIG_VKERNEL ++ vk = vkernel_find_vk_by_task(current); ++ if (vk && low_limit < vk->sysctl_vm.mmap_min_addr) ++ low_limit = vk->sysctl_vm.mmap_min_addr; ++#endif + if (low_limit < mmap_min_addr) + low_limit = mmap_min_addr; + high_limit = info->high_limit; +@@ -1687,6 +1723,9 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) + unsigned long length, gap, gap_end; + unsigned long low_limit, high_limit; + struct vm_area_struct *tmp; ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++#endif + + MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); + /* Adjust search length to account for worst case alignment overhead */ +@@ -1695,6 +1734,11 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) + return -ENOMEM; + + low_limit = info->low_limit; ++#ifdef CONFIG_VKERNEL ++ vk = vkernel_find_vk_by_task(current); ++ if (vk && low_limit < vk->sysctl_vm.mmap_min_addr) ++ low_limit = vk->sysctl_vm.mmap_min_addr; ++#endif + if (low_limit < mmap_min_addr) + low_limit = mmap_min_addr; + high_limit = info->high_limit; +@@ -1767,6 +1811,13 @@ generic_get_unmapped_area(struct file *filp, unsigned long addr, + struct vm_area_struct *vma, *prev; + struct vm_unmapped_area_info info; + const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++ ++ vk = vkernel_find_vk_by_task(current); ++ if (vk && len > mmap_end - vk->sysctl_vm.mmap_min_addr) ++ return -ENOMEM; ++#endif + + if (len > mmap_end - mmap_min_addr) + return -ENOMEM; +@@ -1781,6 +1832,9 @@ generic_get_unmapped_area(struct file *filp, unsigned long addr, + addr = PAGE_ALIGN(addr); + vma = find_vma_prev(mm, addr, &prev); + if (mmap_end - len >= addr && addr >= mmap_min_addr && ++#ifdef CONFIG_VKERNEL ++ (!vk || addr >= vk->sysctl_vm.mmap_min_addr) && ++#endif + (!vma || addr + len <= vm_start_gap(vma)) && + (!prev || addr >= vm_end_gap(prev))) + return addr; +@@ -1818,6 +1872,13 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, + struct mm_struct *mm = current->mm; + struct vm_unmapped_area_info info; + const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++ ++ vk = vkernel_find_vk_by_task(current); ++ if (vk && len > mmap_end - vk->sysctl_vm.mmap_min_addr) ++ return -ENOMEM; ++#endif + + /* requested length too big for entire address space */ + if (len > mmap_end - mmap_min_addr) +@@ -1834,6 +1895,9 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, + addr = PAGE_ALIGN(addr); + vma = find_vma_prev(mm, addr, 
&prev); + if (mmap_end - len >= addr && addr >= mmap_min_addr && ++#ifdef CONFIG_VKERNEL ++ (!vk || addr >= vk->sysctl_vm.mmap_min_addr) && ++#endif + (!vma || addr + len <= vm_start_gap(vma)) && + (!prev || addr >= vm_end_gap(prev))) + return addr; +@@ -2137,11 +2201,19 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) + MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start); + struct vm_area_struct *prev; + int error = 0; ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++#endif + + if (!(vma->vm_flags & VM_GROWSDOWN)) + return -EFAULT; + + address &= PAGE_MASK; ++#ifdef CONFIG_VKERNEL ++ vk = vkernel_find_vk_by_task(current); ++ if (vk && address < vk->sysctl_vm.mmap_min_addr) ++ return -EPERM; ++#endif + if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) + return -EPERM; + +@@ -2508,6 +2580,13 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long addr, int new_below) + { ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++ ++ vk = vkernel_find_vk_by_task(current); ++ if (vk && vma->vm_mm->map_count >= vk->sysctl_vm.max_map_count) ++ return -ENOMEM; ++#endif + if (vma->vm_mm->map_count >= sysctl_max_map_count) + return -ENOMEM; + +@@ -2539,6 +2618,10 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, + int error = -ENOMEM; + unsigned long locked_vm = 0; + MA_STATE(mas_detach, &mt_detach, 0, 0); ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++#endif ++ + mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); + mt_on_stack(mt_detach); + +@@ -2558,6 +2641,11 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, + * not exceed its limit; but let map_count go just above + * its limit temporarily, to help free resources as expected. 
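++	 * For tasks inside a vkernel instance the per-instance
++	 * sysctl_vm.max_map_count is enforced below as well.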
+ */ ++#ifdef CONFIG_VKERNEL ++ vk = vkernel_find_vk_by_task(current); ++ if (vk && mm->map_count >= vk->sysctl_vm.max_map_count) ++ goto map_count_exceeded; ++#endif + if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) + goto map_count_exceeded; + +@@ -3208,6 +3296,9 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, + { + struct mm_struct *mm = current->mm; + struct vma_prepare vp; ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++#endif + + /* + * Check against address space limits by the changed size +@@ -3217,6 +3308,11 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, + if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) + return -ENOMEM; + ++#ifdef CONFIG_VKERNEL ++ vk = vkernel_find_vk_by_task(current); ++ if (vk && mm->map_count > vk->sysctl_vm.max_map_count) ++ return -ENOMEM; ++#endif + if (mm->map_count > sysctl_max_map_count) + return -ENOMEM; + +diff --git a/mm/mremap.c b/mm/mremap.c +index e990bb8c8918..ccc99ed556be 100644 +--- a/mm/mremap.c ++++ b/mm/mremap.c +@@ -27,6 +27,9 @@ + #include + #include + #include ++#ifdef CONFIG_VKERNEL ++#include ++#endif + + #include + #include +@@ -607,6 +610,13 @@ static unsigned long move_vma(struct vm_area_struct *vma, + int err = 0; + bool need_rmap_locks; + struct vma_iterator vmi; ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++ ++ vk = vkernel_find_vk_by_task(current); ++ if (vk && mm->map_count >= vk->sysctl_vm.max_map_count - 3) ++ return -ENOMEM; ++#endif + + /* + * We'd prefer to avoid failure later on in do_munmap: +@@ -816,6 +826,9 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, + struct vm_area_struct *vma; + unsigned long ret = -EINVAL; + unsigned long map_flags = 0; ++#ifdef CONFIG_VKERNEL ++ struct vkernel *vk; ++#endif + + if (offset_in_page(new_addr)) + goto out; +@@ -841,6 +854,11 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, + * Check whether current map count plus 2 still leads us to 4 maps below + * the threshold, otherwise return -ENOMEM here to be more safe. 
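++	 * Vkernel tasks are additionally checked against their instance's
++	 * max_map_count with the same safety margin.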
+ */
++#ifdef CONFIG_VKERNEL
++	vk = vkernel_find_vk_by_task(current);
++	if (vk && (mm->map_count + 2) >= vk->sysctl_vm.max_map_count - 3)
++		return -ENOMEM;
++#endif
+	if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
+		return -ENOMEM;
+
+diff --git a/mm/nommu.c b/mm/nommu.c
+index a8f350568067..6fd1869e4e89 100644
+--- a/mm/nommu.c
++++ b/mm/nommu.c
+@@ -34,6 +34,9 @@
+ #include
+ #include
+ #include
++#ifdef CONFIG_VKERNEL
++#include <linux/vkernel.h>
++#endif
+
+ #include
+ #include
+@@ -1316,6 +1319,9 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ 	struct vm_region *region;
+ 	unsigned long npages;
+ 	struct mm_struct *mm;
++#ifdef CONFIG_VKERNEL
++	struct vkernel *vk;
++#endif
+
+ 	/* we're only permitted to split anonymous regions (these should have
+ 	 * only a single usage on the region) */
+@@ -1323,6 +1329,11 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ 		return -ENOMEM;
+
+ 	mm = vma->vm_mm;
++#ifdef CONFIG_VKERNEL
++	vk = vkernel_find_vk_by_task(current);
++	if (vk && mm->map_count >= vk->sysctl_vm.max_map_count)
++		return -ENOMEM;
++#endif
+ 	if (mm->map_count >= sysctl_max_map_count)
+ 		return -ENOMEM;
+
+diff --git a/mm/shmem.c b/mm/shmem.c
+index f4c248f74838..a243dbcf7d38 100644
+--- a/mm/shmem.c
++++ b/mm/shmem.c
+@@ -40,6 +40,9 @@
+ #include
+ #include
+ #include
++#ifdef CONFIG_VKERNEL
++#include <linux/vkernel.h>
++#endif
+ #include "swap.h"
+
+ static struct vfsmount *shm_mnt;
+@@ -1766,11 +1769,22 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
+ 	unsigned long vm_flags = vma ? vma->vm_flags : 0;
+ 	unsigned int global_orders;
+
++#ifdef CONFIG_VKERNEL
++	unsigned long flags = transparent_hugepage_flags;
++	struct vkernel *vk;
++
++	vk = vkernel_find_vk_by_task(current);
++	if (vk)
++		flags = vk->mem_pref.thp_flags;
++
++	if (vk_thp_disabled_by_hw(flags) || (vma && vma_thp_disabled(vma, vm_flags)))
++#else
+ 	if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags)))
++#endif
+ 		return 0;
+
+ 	global_orders = shmem_huge_global_enabled(inode, index, write_end,
+- shmem_huge_force, vma, vm_flags);
++ shmem_huge_force, vma, vm_flags);
+ 	/* Tmpfs huge pages allocation */
+ 	if (!vma || !vma_is_anon_shmem(vma))
+ 		return global_orders;
+diff --git a/mm/util.c b/mm/util.c
+index f3d6751b2f2a..96057f33fcd5 100644
+--- a/mm/util.c
++++ b/mm/util.c
+@@ -24,6 +24,9 @@
+ #include
+ #include
+ #include
++#ifdef CONFIG_VKERNEL
++#include <linux/vkernel.h>
++#endif
+
+ #include
+ #include
+@@ -919,6 +922,28 @@ unsigned long vm_commit_limit(void)
+ 	return allowed;
+ }
+
++#if defined(CONFIG_VKERNEL) && defined(CONFIG_MEMCG)
++unsigned long vk_vm_commit_limit(struct vkernel_sysctl_vm *vm,
++		struct mem_cgroup *memcg)
++{
++	unsigned long allowed;
++	struct mem_cgroup *iter;
++	unsigned long limit;
++
++	if (vm->overcommit_kbytes)
++		allowed = vm->overcommit_kbytes >> (PAGE_SHIFT - 10);
++	else {
++		limit = totalram_pages() - hugetlb_total_pages();
++		for (iter = memcg; iter; iter = parent_mem_cgroup(iter))
++			limit = min(limit, iter->memory.max);
++		allowed = (limit * vm->overcommit_ratio / 100);
++	}
++	allowed += min_t(unsigned long, total_swap_pages, memcg->swap.max);
++
++	return allowed;
++}
++#endif
++
+ /*
+ * Make sure vm_committed_as in one cacheline and not cacheline shared with
+ * other variables. It can be updated by several CPUs frequently.
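++ * With CONFIG_VKERNEL each instance also maintains its own committed-VM
++ * counter (vk->sysctl_vm.vm_committed_as); see vm_acct_memory() below.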
+@@ -940,10 +965,30 @@ struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
+ */
+ unsigned long vm_memory_committed(void)
+ {
++#ifdef CONFIG_VKERNEL
++	struct vkernel *vk;
++
++	vk = vkernel_find_vk_by_task(current);
++	if (vk)
++		return percpu_counter_sum_positive(&vk->sysctl_vm.vm_committed_as);
++#endif
+ 	return percpu_counter_sum_positive(&vm_committed_as);
+ }
+ EXPORT_SYMBOL_GPL(vm_memory_committed);
+
++#ifdef CONFIG_VKERNEL
++void vm_acct_memory(long pages)
++{
++	struct vkernel *vk;
++
++	vk = vkernel_find_vk_by_task(current);
++	if (vk)
++		percpu_counter_add_batch(&vk->sysctl_vm.vm_committed_as, pages,
++				vk->sysctl_vm.as_batch);
++	percpu_counter_add_batch(&vm_committed_as, pages, vm_committed_as_batch);
++}
++#endif
++
+ /*
+ * Check that a process has enough memory to allocate a new virtual
+ * mapping. 0 means there is enough memory for the allocation to
+@@ -963,16 +1008,34 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
+ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
+ {
+ 	long allowed;
++	int overcommit = sysctl_overcommit_memory;
++#ifdef CONFIG_VKERNEL
++	struct vkernel *vk;
++#ifdef CONFIG_MEMCG
++	struct mem_cgroup *memcg;
++	long memcg_allowed = 0;
++#endif
++
++	vk = vkernel_find_vk_by_task(current);
++	if (vk) {
++		overcommit = vk->sysctl_vm.overcommit_memory;
++#ifdef CONFIG_MEMCG
++		memcg = mem_cgroup_from_task(current);
++		if (memcg)
++			memcg_allowed = vk_vm_commit_limit(&vk->sysctl_vm, memcg);
++#endif
++	}
++#endif
+
+ 	vm_acct_memory(pages);
+
+ 	/*
+ 	 * Sometimes we want to use more memory than we have
+ 	 */
+-	if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
++	if (overcommit == OVERCOMMIT_ALWAYS)
+ 		return 0;
+
+-	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
++	if (overcommit == OVERCOMMIT_GUESS) {
+ 		if (pages > totalram_pages() + total_swap_pages)
+ 			goto error;
+ 		return 0;
+@@ -994,6 +1057,12 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
+ 		allowed -= min_t(long, mm->total_vm / 32, reserve);
+ 	}
+
++#if defined(CONFIG_VKERNEL) && defined(CONFIG_MEMCG)
++	if (vk &&
++	    percpu_counter_read_positive(&vk->sysctl_vm.vm_committed_as) < memcg_allowed)
++		return 0;
++#endif
++
+ 	if (percpu_counter_read_positive(&vm_committed_as) < allowed)
+ 		return 0;
+ error:
+diff --git a/security/commoncap.c b/security/commoncap.c
+index bc0521104197..f7d30e13fb0d 100644
+--- a/security/commoncap.c
++++ b/security/commoncap.c
+@@ -25,6 +25,9 @@
+ #include
+ #include
+ #include
++#ifdef CONFIG_VKERNEL
++#include <linux/vkernel.h>
++#endif
+
+ /*
+ * If a non-root user executes a setuid-root binary in
+@@ -67,6 +70,14 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
+ 		int cap, unsigned int opts)
+ {
+ 	struct user_namespace *ns = targ_ns;
++#ifdef CONFIG_VKERNEL
++	struct vkernel *vk;
++
++	/* vkernel: check initial capability first */
++	vk = vkernel_find_vk_by_task(current);
++	if (vk && vk->ops.cap_capable(vk, cred, targ_ns, cap, opts))
++		return -EPERM;
++#endif
+
+ 	/* See if cred has the capability in the target user namespace
+ 	 * by examining the target user namespace and all of the target
+--
+2.34.1
+
diff --git a/vk-kernel.spec b/vk-kernel.spec
new file mode 100644
index 0000000000000000000000000000000000000000..1c8ce641755d7ba61b62231757b7eb6c87efb182
--- /dev/null
+++ b/vk-kernel.spec
@@ -0,0 +1,1145 @@
+%define with_signmodules 1
+%define with_kabichk 0
+
+# Default without toolchain_clang
+%bcond_with toolchain_clang
+
+%if %{with toolchain_clang}
+%global toolchain clang
+%endif
+
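+# Optional Clang LTO build; only valid together with --with toolchain_clang
+# (the combination is checked right below).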
+%bcond_with clang_lto
+
+%if %{with clang_lto} && "%{toolchain}" != "clang"
+%{error:clang_lto requires --with toolchain_clang}
+%endif
+
+%define modsign_cmd %{SOURCE10}
+
+%if 0%{?openEuler_sign_rsa}
+# Use the open-source signature when the EBS permission is insufficient.
+# Now only the admin user in EBS can send the signature request, but the
+# user triggering the access-control build task and the personal build
+# task is non-admin. In order to avoid build failures caused by failed
+# signing, use the open-source signature.
+# flag_openEuler_has_sign_perm is the rpm macro consumed by the modsign
+# macro; openEuler_has_sign_perm is the shell variable used in the rpm
+# execution phase.
+
+%define openEuler_check_EBS_perm openEuler_has_sign_perm=0 \
+echo "" >> test_openEuler_sign.ko \
+sh /usr/lib/rpm/brp-ebs-sign --module test_openEuler_sign.ko || \
+[ $? -ne 2 ] && openEuler_has_sign_perm=1 \
+%global flag_openEuler_has_sign_perm $openEuler_has_sign_perm \
+rm -f test_openEuler_sign.ko test_openEuler_sign.ko.sig
+%endif
+
+%global Arch $(echo %{_host_cpu} | sed -e s/i.86/x86/ -e s/x86_64/x86/ -e s/aarch64.*/arm64/ -e s/riscv.*/riscv/ -e s/powerpc64le/powerpc/ -e s/loongarch64/loongarch/)
+
+%global KernelVer %{version}-%{release}.%{_target_cpu}
+%global debuginfodir /usr/lib/debug
+
+%global upstream_version 6.6
+%global upstream_sublevel 0
+%global devel_release 102
+%global maintenance_release .4.0
+%global pkg_release .54
+
+%global openeuler_lts 0
+%global openeuler_major 2509
+%global openeuler_minor 0
+
+#
+# Support input parameters to overwrite the preceding version numbers.
+#
+
+%bcond_with openeuler_version
+
+%if %{with openeuler_version}
+%global openeuler_lts %{?_openeuler_lts} %{?!_openeuler_lts: 0}
+%global openeuler_major %{?_openeuler_major} %{?!_openeuler_major: 0}
+%global openeuler_minor %{?_openeuler_minor} %{?!_openeuler_minor: 0}
+%endif
+
+%define with_debuginfo 1
+# Do not recompute the build-id of vmlinux in find-debuginfo.sh
+%global _missing_build_ids_terminate_build 1
+%global _no_recompute_build_ids 1
+%undefine _include_minidebuginfo
+%undefine _include_gdb_index
+%undefine _unique_build_ids
+
+%define with_source 1
+%define with_python2 0
+
+# fail the build if there are new config options
+%define listnewconfig_fail 0
+
+%ifarch aarch64
+%define with_64kb %{?_with_64kb: 1} %{?!_with_64kb: 0}
+%if %{with_64kb}
+%global package64kb -64kb
+%endif
+%else
+%define with_64kb 0
+%endif
+
+#default is enabled. 
You can disable it with --without option +%define with_perf %{?_without_perf: 0} %{?!_without_perf: 1} + +Name: vk-kernel%{?package64kb} +Version: %{upstream_version}.%{upstream_sublevel} +Release: %{devel_release}%{?maintenance_release}%{?pkg_release} +Summary: Linux Kernel +License: GPLv2 +URL: http://www.kernel.org/ +Source0: kernel.tar.gz +Source10: sign-modules +Source11: x509.genkey +Source12: extra_certificates + +%if 0%{?openEuler_sign_rsa} +Source15: openeuler_kernel_cert.cer +Source16: sign-modules-openeuler +%endif + +%if 0%{?with_kabichk} +Source18: check-kabi +Source20: Module.kabi_aarch64 +Source21: Module.kabi_x86_64 +%endif + +Source200: mkgrub-menu-aarch64.sh +Source300: extra-modules_x86_64.list +Source301: extra-modules_aarch64.list + +Source2000: cpupower.service +Source2001: cpupower.config + +%if 0%{?with_patch} +Source9000: apply-patches +Source9001: guards +Source9002: series.conf +Source9998: patches.tar.bz2 +%endif + +Patch0006: 0006-vk-introduce-vkernel.patch + +#BuildRequires: +BuildRequires: module-init-tools, patch >= 2.5.4, bash >= 2.03, tar +BuildRequires: bzip2, xz, findutils, gzip, m4, perl, make >= 3.78, diffutils, gawk +BuildRequires: libcap-devel, libcap-ng-devel, rsync +BuildRequires: gcc >= 3.4.2, binutils >= 2.12 +BuildRequires: hostname, net-tools, bc +BuildRequires: xmlto, asciidoc +BuildRequires: openssl-devel openssl +BuildRequires: hmaccalc +BuildRequires: ncurses-devel +#BuildRequires: pesign >= 0.109-4 +BuildRequires: elfutils-libelf-devel +BuildRequires: rpm >= 4.14.2 +#BuildRequires: sparse >= 0.4.1 +%if 0%{?with_python2} +BuildRequires: python-devel +%endif + +BuildRequires: elfutils-devel zlib-devel binutils-devel newt-devel perl(ExtUtils::Embed) bison +BuildRequires: audit-libs-devel libpfm-devel libtraceevent-devel +BuildRequires: pciutils-devel gettext +BuildRequires: rpm-build, elfutils +BuildRequires: numactl-devel python3-devel glibc-static python3-docutils +BuildRequires: perl-generators perl(Carp) libunwind-devel gtk2-devel libbabeltrace-devel java-1.8.0-openjdk java-1.8.0-openjdk-devel perl-devel + +AutoReq: no +AutoProv: yes + +Conflicts: device-mapper-libs < 1.02.63-2 e2fsprogs < 1.37-4 initscripts < 7.23 iptables < 1.3.2-1 +Conflicts: ipw2200-firmware < 2.4 isdn4k-utils < 3.2-32 iwl4965-firmware < 228.57.2 jfsutils < 1.1.7-2 +Conflicts: mdadm < 3.2.1-5 nfs-utils < 1.0.7-12 oprofile < 0.9.1-2 ppp < 2.4.3-3 procps < 3.2.5-6.3 +Conflicts: reiserfs-utils < 3.6.19-2 selinux-policy-targeted < 1.25.3-14 squashfs-tools < 4.0 +Conflicts: udev < 063-6 util-linux < 2.12 wireless-tools < 29-3 xfsprogs < 2.6.13-4 + +Provides: kernel-%{_target_cpu} = %{version}-%{release} kernel-drm = 4.3.0 kernel-drm-nouveau = 16 kernel-modeset = 1 +Provides: kernel-uname-r = %{KernelVer} kernel=%{KernelVer} + +Requires: dracut >= 001-7 grubby >= 8.28-2 initscripts >= 8.11.1-1 linux-firmware >= 20100806-2 module-init-tools >= 3.16-2 + +ExclusiveArch: noarch aarch64 i686 x86_64 riscv64 ppc64le loongarch64 +ExclusiveOS: Linux + +%if %{with_perf} +BuildRequires: flex xz-devel libzstd-devel +BuildRequires: java-devel +%ifarch aarch64 +BuildRequires: OpenCSD +%endif +%endif + +BuildRequires: dwarves +BuildRequires: clang >= 10.0.0 +BuildRequires: llvm +BuildRequires: llvm-devel +%if %{with clang_lto} +BuildRequires: lld +%endif + +%description +The Linux Kernel, the operating system core itself. 
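+This vk-kernel build applies the vkernel patch (Patch0006) and enables
+CONFIG_VKERNEL in the shipped configuration.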
+
+%package headers
+Summary: Header files for the Linux kernel for use by glibc
+Obsoletes: glibc-kernheaders < 3.0-46
+Provides: glibc-kernheaders = 3.0-46
+%description headers
+Kernel-headers includes the C header files that specify the interface
+between the Linux kernel and userspace libraries and programs. The
+header files define structures and constants that are needed for
+building most standard programs and are also needed for rebuilding the
+glibc package.
+
+
+%package devel
+Summary: Development package for building kernel modules to match the %{KernelVer} kernel
+AutoReqProv: no
+Provides: kernel-devel-uname-r = %{KernelVer}
+Provides: kernel-devel-%{_target_cpu} = %{version}-%{release}
+Requires: perl findutils
+
+%description devel
+This package provides kernel headers and makefiles sufficient to build modules
+against the %{KernelVer} kernel package.
+
+%package tools
+Summary: Assortment of tools for the Linux kernel
+Provides: %{name}-tools-libs
+Obsoletes: %{name}-tools-libs
+Provides: cpufreq-utils = 1:009-0.6.p1
+Provides: cpufrequtils = 1:009-0.6.p1
+Obsoletes: cpufreq-utils < 1:009-0.6.p1
+Obsoletes: cpufrequtils < 1:009-0.6.p1
+Obsoletes: cpuspeed < 1:1.5-16
+%description tools
+This package contains the tools/ directory from the kernel source
+and the supporting documentation.
+
+%package tools-devel
+Summary: Assortment of tools for the Linux kernel
+Requires: %{name}-tools = %{version}-%{release}
+Requires: %{name}-tools-libs = %{version}-%{release}
+Provides: %{name}-tools-libs-devel = %{version}-%{release}
+Obsoletes: %{name}-tools-libs-devel
+%description tools-devel
+This package contains the development files for the tools/ directory from
+the kernel source.
+
+%ifarch x86_64 aarch64
+%package extra-modules
+Summary: Extra kernel modules to match the kernel
+AutoReqProv: no
+Provides: kernel-extra-modules = %{version}-%{release}
+%description extra-modules
+This package contains optional modules that may be dynamically loaded but not needed for base system operation.
+%endif
+
+%if %{with_perf}
+%package -n perf
+Summary: Performance monitoring for the Linux kernel
+%description -n perf
+This package contains the perf tool, which enables performance monitoring
+of the Linux kernel.
+
+%if 0%{?with_python2}
+%package -n python2-perf
+Provides: python-perf = %{version}-%{release}
+Obsoletes: python-perf
+Summary: Python bindings for apps which will manipulate perf events
+
+%description -n python2-perf
+A Python module that permits applications written in the Python programming
+language to use the interface to manipulate perf events.
+%endif
+
+%package -n python3-perf
+Summary: Python bindings for apps which will manipulate perf events
+%description -n python3-perf
+A Python module that permits applications written in the Python programming
+language to use the interface to manipulate perf events.
+# with_perf
+%endif
+
+%package -n bpftool
+Summary: Inspection and simple manipulation of eBPF programs and maps
+%description -n bpftool
+This package contains the bpftool, which allows inspection and simple
+manipulation of eBPF programs and maps.
+
+%package source
+Summary: The kernel source
+%description source
+This package contains various source files from the kernel. 
+ +%if 0%{?with_debuginfo} +%define _debuginfo_template %{nil} +%define _debuginfo_subpackages 0 + +%define debuginfo_template(n:) \ +%package -n %{-n*}-debuginfo\ +Summary: Debug information for package %{-n*}\ +Group: Development/Debug\ +AutoReq: 0\ +AutoProv: 1\ +%description -n %{-n*}-debuginfo\ +This package provides debug information for package %{-n*}.\ +Debug information is useful when developing applications that use this\ +package or when debugging this package.\ +%{nil} + +%debuginfo_template -n kernel +%files -n kernel-debuginfo -f kernel-debugfiles.list -f debugfiles.list +%{expand:%%global _find_debuginfo_opts %{?_find_debuginfo_opts} --keep-section '.BTF' -p '.*/%{KernelVer}/.*|.*/vmlinux|XXX' -o kernel-debugfiles.list} + +%debuginfo_template -n bpftool +%files -n bpftool-debuginfo -f bpftool-debugfiles.list +%{expand:%%global _find_debuginfo_opts %{?_find_debuginfo_opts} -p '.*%{_sbindir}/bpftool.*(\.debug)?|XXX' -o bpftool-debugfiles.list} + +%debuginfo_template -n kernel-tools +%files -n kernel-tools-debuginfo -f kernel-tools-debugfiles.list +%{expand:%%global _find_debuginfo_opts %{?_find_debuginfo_opts} -p '.*%{_bindir}/centrino-decode.*(\.debug)?|.*%{_bindir}/powernow-k8-decode.*(\.debug)?|.*%{_bindir}/cpupower.*(\.debug)?|.*%{_libdir}/libcpupower.*|.*%{_libdir}/libcpupower.*|.*%{_bindir}/turbostat.(\.debug)?|.*%{_bindir}/.*gpio.*(\.debug)?|.*%{_bindir}/.*iio.*(\.debug)?|.*%{_bindir}/tmon.*(.debug)?|XXX' -o kernel-tools-debugfiles.list} + +%if %{with_perf} +%debuginfo_template -n perf +%files -n perf-debuginfo -f perf-debugfiles.list +%{expand:%%global _find_debuginfo_opts %{?_find_debuginfo_opts} -p '.*%{_bindir}/perf.*(\.debug)?|.*%{_libexecdir}/perf-core/.*|.*%{_libdir}/traceevent/.*|XXX' -o perf-debugfiles.list} + +%if 0%{?with_python2} +%debuginfo_template -n python2-perf +%files -n python2-perf-debuginfo -f python2-perf-debugfiles.list +%{expand:%%global _find_debuginfo_opts %{?_find_debuginfo_opts} -p '.*%{python2_sitearch}/perf.*(.debug)?|XXX' -o python2-perf-debugfiles.list} +%endif + +%debuginfo_template -n python3-perf +%files -n python3-perf-debuginfo -f python3-perf-debugfiles.list +%{expand:%%global _find_debuginfo_opts %{?_find_debuginfo_opts} -p '.*%{python3_sitearch}/perf.*(.debug)?|XXX' -o python3-perf-debugfiles.list} +#with_perf +%endif + +%endif + +%prep +%setup -q -n kernel-%{version} -c + +%if 0%{?with_patch} +tar -xjf %{SOURCE9998} +%endif + +mv kernel linux-%{KernelVer} +cd linux-%{KernelVer} + +%if 0%{?with_patch} +cp %{SOURCE9000} . +cp %{SOURCE9001} . +cp %{SOURCE9002} . + +if [ ! -d patches ];then + mv ../patches . +fi + +Applypatches() +{ + set -e + set -o pipefail + local SERIESCONF=$1 + local PATCH_DIR=$2 + sed -i '/^#/d' $SERIESCONF + sed -i '/^[\s]*$/d' $SERIESCONF + ( + echo "trap 'echo \"*** patch \$_ failed ***\"' ERR" + echo "set -ex" + cat $SERIESCONF | \ + sed "s!^!patch -s -F0 -E -p1 --no-backup-if-mismatch -i $PATCH_DIR/!" \ + ) | sh +} + +Applypatches series.conf %{_builddir}/kernel-%{version}/linux-%{KernelVer} +%endif + +# riscv-kernel patch +%ifarch riscv64 +%endif + +%if "%toolchain" == "clang" +%endif + +# vkernel patch +%patch0006 -p1 + +find . \( -name "*.orig" -o -name "*~" \) -exec rm -f {} \; >/dev/null +find . -name .gitignore -exec rm -f {} \; >/dev/null + +%if 0%{?with_signmodules} + cp %{SOURCE11} certs/. 
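+ # (%{SOURCE11} is x509.genkey, the key-generation config for module signing)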
+%endif + +%if 0%{?with_source} +# Copy directory backup for kernel-source +cp -a ../linux-%{KernelVer} ../linux-%{KernelVer}-source +find ../linux-%{KernelVer}-source -type f -name "\.*" -exec rm -rf {} \; >/dev/null +%endif + +cp -a tools/perf tools/python3-perf + +%build +cd linux-%{KernelVer} + +perl -p -i -e "s/^EXTRAVERSION.*/EXTRAVERSION = -%{release}.%{_target_cpu}/" Makefile + +%if %{with openeuler_version} +perl -p -i -e "s/^OPENEULER_LTS.*/OPENEULER_LTS = %{openeuler_lts}/" Makefile.oever +perl -p -i -e "s/^OPENEULER_MAJOR.*/OPENEULER_MAJOR = %{openeuler_major}/" Makefile.oever +perl -p -i -e "s/^OPENEULER_MINOR.*/OPENEULER_MINOR = %{openeuler_minor}/" Makefile.oever +perl -p -i -e "s/^OPENEULER_RELEASE.*/OPENEULER_RELEASE = \"%{release}\"/" Makefile.oever +%endif + +## make linux +make mrproper %{_smp_mflags} + +%if %{with_64kb} +sed -i arch/arm64/configs/openeuler_defconfig -e 's/^CONFIG_ARM64_4K_PAGES.*/CONFIG_ARM64_64K_PAGES=y/' +sed -i arch/arm64/configs/openeuler_defconfig -e 's/^CONFIG_ARM64_PA_BITS=.*/CONFIG_ARM64_PA_BITS=52/' +sed -i arch/arm64/configs/openeuler_defconfig -e 's/^CONFIG_ARM64_PA_BITS_.*/CONFIG_ARM64_PA_BITS_52=y/' +sed -i arch/arm64/configs/openeuler_defconfig -e 's/^CONFIG_ARM64_VA_BITS=.*/CONFIG_ARM64_VA_BITS=52/' +sed -i arch/arm64/configs/openeuler_defconfig -e 's/^CONFIG_ARM64_VA_BITS_.*/CONFIG_ARM64_VA_BITS_52=y/' +%endif + +%if "%toolchain" == "clang" + +%ifarch s390x ppc64le +%global llvm_ias 0 +%else +%global llvm_ias 1 +%endif + +%global clang_make_opts HOSTCC=clang CC=clang LLVM_IAS=%{llvm_ias} + +%if %{with clang_lto} +%global clang_make_opts %{clang_make_opts} HOSTLD=ld.lld LD=ld.lld AR=llvm-ar NM=llvm-nm HOSTAR=llvm-ar HOSTNM=llvm-nm +%endif + +%endif + +%global make %{__make} %{?clang_make_opts} HOSTCFLAGS="%{?build_cflags}" HOSTLDFLAGS="%{?build_ldflags}" + +%ifarch loongarch64 + +%if 0%{with_signmodules} +echo "CONFIG_MODULE_SIG=y" >>arch/loongarch/configs/loongson3_defconfig +%endif + +%if 0%{with_debuginfo} +echo "CONFIG_DEBUG_INFO=y" >>arch/loongarch/configs/loongson3_defconfig +%endif + +make ARCH=%{Arch} loongson3_defconfig + +%else +%{make} ARCH=%{Arch} openeuler_defconfig +%endif + +%if %{with clang_lto} +scripts/config -e LTO_CLANG_FULL +sed -i 's/# CONFIG_LTO_CLANG_FULL is not set/CONFIG_LTO_CLANG_FULL=y/' .config +sed -i 's/CONFIG_LTO_NONE=y/# CONFIG_LTO_NONE is not set/' .config +%endif + +%if 0%{?openEuler_sign_rsa} + %{openEuler_check_EBS_perm} + if [ $openEuler_has_sign_perm -eq 1 ]; then + cp %{SOURCE15} ./certs/openeuler-cert.pem + # close kernel native signature + sed -i 's/CONFIG_MODULE_SIG_KEY=.*$/CONFIG_MODULE_SIG_KEY=""/g' .config + sed -i 's/CONFIG_SYSTEM_TRUSTED_KEYS=.*$/CONFIG_SYSTEM_TRUSTED_KEYS="certs\/openeuler-cert.pem"/g' .config + sed -i 's/CONFIG_MODULE_SIG_ALL=y$/CONFIG_MODULE_SIG_ALL=n/g' .config + fi +%endif + +# vkernel config +sed -i 's/# CONFIG_VKERNEL is not set/CONFIG_VKERNEL=y/' .config +echo "CONFIG_VKERNEL_DRIVER=m" >> .config + +TargetImage=$(basename $(make -s image_name)) + +%{make} ARCH=%{Arch} $TargetImage %{?_smp_mflags} +%{make} ARCH=%{Arch} modules %{?_smp_mflags} + +%if 0%{?with_kabichk} + chmod 0755 %{SOURCE18} + if [ -e $RPM_SOURCE_DIR/Module.kabi_%{_target_cpu} ]; then + %{SOURCE18} -k $RPM_SOURCE_DIR/Module.kabi_%{_target_cpu} -s Module.symvers || exit 1 + else + echo "**** NOTE: Cannot find reference Module.kabi file. 
****" + fi +%endif + +# aarch64 make dtbs +%ifarch aarch64 riscv64 + %{make} ARCH=%{Arch} dtbs +%endif + +## make tools +%if %{with_perf} +# perf +%ifarch aarch64 +# aarch64 make perf with CORESIGHT=1 +%global perf_make \ + make %{?clang_make_opts} EXTRA_LDFLAGS="%[ "%{toolchain}" == "clang" ? "-z now" : "" ]" EXTRA_CFLAGS="%[ "%{toolchain}" == "clang" ? "" : "-Wl,-z,now" ] -g -Wall -fstack-protector-strong -fPIC" EXTRA_PERFLIBS="-fpie" %{?_smp_mflags} -s V=1 WERROR=0 NO_LIBUNWIND=1 HAVE_CPLUS_DEMANGLE=1 NO_GTK2=1 NO_LIBNUMA=1 NO_STRLCPY=1 CORESIGHT=1 prefix=%{_prefix} +%else +%global perf_make \ + make %{?clang_make_opts} EXTRA_LDFLAGS="%[ "%{toolchain}" == "clang" ? "-z now" : "" ]" EXTRA_CFLAGS="%[ "%{toolchain}" == "clang" ? "" : "-Wl,-z,now" ] -g -Wall -fstack-protector-strong -fPIC" EXTRA_PERFLIBS="-fpie" %{?_smp_mflags} -s V=1 WERROR=0 NO_LIBUNWIND=1 HAVE_CPLUS_DEMANGLE=1 NO_GTK2=1 NO_LIBNUMA=1 NO_STRLCPY=1 prefix=%{_prefix} +%endif +%if 0%{?with_python2} +%global perf_python2 -C tools/perf PYTHON=%{__python2} +%global perf_python3 -C tools/python3-perf PYTHON=%{__python3} +%else +%global perf_python3 -C tools/perf PYTHON=%{__python3} +%endif + +chmod +x tools/perf/check-headers.sh +# perf +%if 0%{?with_python2} +%{perf_make} %{perf_python2} all +%endif + +# make sure check-headers.sh is executable +chmod +x tools/python3-perf/check-headers.sh +%{perf_make} %{perf_python3} all + +pushd tools/perf/Documentation/ +%{make} %{?_smp_mflags} man +popd +%endif + +# bpftool +pushd tools/bpf/bpftool +%{make} +popd + +# cpupower +chmod +x tools/power/cpupower/utils/version-gen.sh +%{make} %{?_smp_mflags} -C tools/power/cpupower CPUFREQ_BENCH=false +%ifarch %{ix86} + pushd tools/power/cpupower/debug/i386 + %{make} %{?_smp_mflags} centrino-decode powernow-k8-decode + popd +%endif +%ifarch x86_64 + pushd tools/power/cpupower/debug/x86_64 + %{make} %{?_smp_mflags} centrino-decode powernow-k8-decode + popd +%endif +%ifarch %{ix86} x86_64 + pushd tools/power/x86/x86_energy_perf_policy/ + %{make} + popd + pushd tools/power/x86/turbostat + %{make} + popd +%endif +# thermal +pushd tools/thermal/tmon/ +%{make} +popd +# iio +pushd tools/iio/ +%{make} +popd +# gpio +pushd tools/gpio/ +%{make} +popd +# kvm +pushd tools/kvm/kvm_stat/ +%{make} %{?_smp_mflags} man +popd +# libbpf.a and bpf_helper_defs.h +pushd tools/lib/bpf +%{make} +popd + +%install +%if 0%{?with_source} + %define _python_bytecompile_errors_terminate_build 0 + mkdir -p $RPM_BUILD_ROOT/usr/src/ + mv linux-%{KernelVer}-source $RPM_BUILD_ROOT/usr/src/linux-%{KernelVer} + cp linux-%{KernelVer}/.config $RPM_BUILD_ROOT/usr/src/linux-%{KernelVer}/ +%endif + +cd linux-%{KernelVer} + +## install linux + +# deal with kernel-source, now we don't need kernel-source +#mkdir $RPM_BUILD_ROOT/usr/src/linux-%{KernelVer} +#tar cf - --exclude SCCS --exclude BitKeeper --exclude .svn --exclude CVS --exclude .pc --exclude .hg --exclude .git --exclude=.tmp_versions --exclude=*vmlinux* --exclude=*.o --exclude=*.ko --exclude=*.cmd --exclude=Documentation --exclude=.config.old --exclude=.missing-syscalls.d --exclude=patches . 
| tar xf - -C %{buildroot}/usr/src/linux-%{KernelVer}
+
+mkdir -p $RPM_BUILD_ROOT/boot
+dd if=/dev/zero of=$RPM_BUILD_ROOT/boot/initramfs-%{KernelVer}.img bs=1M count=20
+
+%ifarch loongarch64
+strip -s vmlinux -o vmlinux.elf
+install -m 755 vmlinux.elf $RPM_BUILD_ROOT/boot/vmlinuz-%{KernelVer}
+%else
+install -m 755 $(make -s image_name) $RPM_BUILD_ROOT/boot/vmlinuz-%{KernelVer}
+%endif
+
+%if 0%{?openEuler_sign_rsa}
+ %{openEuler_check_EBS_perm}
+ if [ $openEuler_has_sign_perm -eq 1 ]; then
+ echo "start sign"
+ %ifarch %arm aarch64
+ gunzip -c $RPM_BUILD_ROOT/boot/vmlinuz-%{KernelVer}>$RPM_BUILD_ROOT/boot/vmlinuz-%{KernelVer}.unzip.efi
+ sh /usr/lib/rpm/brp-ebs-sign --efi $RPM_BUILD_ROOT/boot/vmlinuz-%{KernelVer}.unzip.efi
+ mv $RPM_BUILD_ROOT/boot/vmlinuz-%{KernelVer}.unzip.efi.sig $RPM_BUILD_ROOT/boot/vmlinuz-%{KernelVer}.unzip.efi
+ mv $RPM_BUILD_ROOT/boot/vmlinuz-%{KernelVer}.unzip.efi $RPM_BUILD_ROOT/boot/vmlinuz-%{KernelVer}.unzip
+ gzip -c $RPM_BUILD_ROOT/boot/vmlinuz-%{KernelVer}.unzip>$RPM_BUILD_ROOT/boot/vmlinuz-%{KernelVer}
+ rm -f $RPM_BUILD_ROOT/boot/vmlinuz-%{KernelVer}.unzip
+ %endif
+ %ifarch x86_64
+ mv $RPM_BUILD_ROOT/boot/vmlinuz-%{KernelVer} $RPM_BUILD_ROOT/boot/vmlinuz-%{KernelVer}.efi
+ sh /usr/lib/rpm/brp-ebs-sign --efi $RPM_BUILD_ROOT/boot/vmlinuz-%{KernelVer}.efi
+ mv $RPM_BUILD_ROOT/boot/vmlinuz-%{KernelVer}.efi.sig $RPM_BUILD_ROOT/boot/vmlinuz-%{KernelVer}.efi
+ mv $RPM_BUILD_ROOT/boot/vmlinuz-%{KernelVer}.efi $RPM_BUILD_ROOT/boot/vmlinuz-%{KernelVer}
+ %endif
+ fi
+%endif
+
+pushd $RPM_BUILD_ROOT/boot
+sha512hmac ./vmlinuz-%{KernelVer} >./.vmlinuz-%{KernelVer}.hmac
+popd
+
+install -m 644 .config $RPM_BUILD_ROOT/boot/config-%{KernelVer}
+install -m 644 System.map $RPM_BUILD_ROOT/boot/System.map-%{KernelVer}
+
+gzip -c9 < Module.symvers > $RPM_BUILD_ROOT/boot/symvers-%{KernelVer}.gz
+
+mkdir -p $RPM_BUILD_ROOT%{_sbindir}
+install -m 755 %{SOURCE200} $RPM_BUILD_ROOT%{_sbindir}/mkgrub-menu-%{version}-%{devel_release}%{?maintenance_release}%{?pkg_release}.sh
+
+
+%if 0%{?with_debuginfo}
+ mkdir -p $RPM_BUILD_ROOT%{debuginfodir}/lib/modules/%{KernelVer}
+ cp vmlinux $RPM_BUILD_ROOT%{debuginfodir}/lib/modules/%{KernelVer}
+%endif
+
+# deal with modules, if not kdump
+%{make} ARCH=%{Arch} INSTALL_MOD_PATH=$RPM_BUILD_ROOT modules_install KERNELRELEASE=%{KernelVer} mod-fw=
+######## collect installed .ko files into module filelists (networking, block, drm, modesetting) ###############
+# 1. Generates modules-extra.list by:
+#    - processing the predefined extra-modules_%{_target_cpu}.list
+# 2. Creates modules-core.list by comparing installed modules with the extra list
+# Final output: generates kernel-modules-filelist (core) and kernel-extra-modules-filelist
+# for the RPM package specification, while ensuring proper path mapping to /lib/modules/%{KernelVer}/
+
+pushd $RPM_BUILD_ROOT/lib/modules/%{KernelVer}
+%ifnarch x86_64 aarch64
+find -type f -name "*.ko" >modnames
+%else
+sed 's!^!kernel/!; s!\.ko$!!' %{_sourcedir}/extra-modules_%{_target_cpu}.list > modules-extra.list
+
+find -type f -name "*.ko" | sort >modnames
+
+grep -vFf modules-extra.list modnames > modules-core.list || true
+
+sed -e 's!^\.\/kernel/!/lib/modules/%{KernelVer}/kernel/!; s!\.ko$!.ko.xz!' \
+ modules-core.list > %{_builddir}/kernel-%{version}/kernel-modules-filelist
+
+if [ -s modules-extra.list ]; then
+ sed -e 's!^\.\/kernel/!/lib/modules/%{KernelVer}/kernel/!; s!\.ko$!.ko.xz!' \
modules-extra.list > %{_builddir}/kernel-%{version}/kernel-extra-modules-filelist +else + echo "%ghost /nonexistent/dummy/file" > %{_builddir}/kernel-%{version}/kernel-extra-modules-filelist +fi +%endif + +# mark modules executable so that strip-to-file can strip them +xargs --no-run-if-empty chmod u+x < modnames + +# Generate a list of modules for block and networking. + +grep -F /drivers/ modnames | xargs --no-run-if-empty nm -upA | +sed -n 's,^.*/\([^/]*\.ko\): *U \(.*\)$,\1 \2,p' > drivers.undef + +collect_modules_list() +{ + sed -r -n -e "s/^([^ ]+) \\.?($2)\$/\\1/p" drivers.undef | + LC_ALL=C sort -u > modules.$1 + if [ ! -z "$3" ]; then + sed -r -e "/^($3)\$/d" -i modules.$1 + fi +} + +collect_modules_list networking \ + 'register_netdev|ieee80211_register_hw|usbnet_probe|phy_driver_register|rt2x00(pci|usb)_probe|register_netdevice' +collect_modules_list block \ + 'ata_scsi_ioctl|scsi_add_host|scsi_add_host_with_dma|blk_alloc_queue|blk_init_queue|register_mtd_blktrans|scsi_esp_register|scsi_register_device_handler|blk_queue_physical_block_size|ahci_platform_get_resources' 'pktcdvd.ko|dm-mod.ko' +collect_modules_list drm \ + 'drm_open|drm_init' +collect_modules_list modesetting \ + 'drm_crtc_init' + +# detect missing or incorrect license tags +rm -f modinfo +while read i +do + echo -n "$i " >> modinfo + /sbin/modinfo -l $i >> modinfo +done < modnames + +grep -E -v \ + 'GPL( v2)?$|Dual BSD/GPL$|Dual MPL/GPL$|GPL and additional rights$' \ + modinfo && exit 1 + +rm -f modinfo modnames drivers.undef +%ifarch x86_64 aarch64 +rm -f modules-extra.list modules-core.list modules.list +%endif + +for i in alias alias.bin builtin.bin ccwmap dep dep.bin ieee1394map inputmap isapnpmap ofmap pcimap seriomap symbols symbols.bin usbmap +do + rm -f $RPM_BUILD_ROOT/lib/modules/$KernelVer/modules.$i +done +popd +# modsign module ko;need after find-debuginfo,strip +%define __modsign_install_post \ + if [ "%{with_signmodules}" -eq "1" ];then \ + cp certs/signing_key.pem . \ + cp certs/signing_key.x509 . \ + chmod 0755 %{modsign_cmd} \ + %{modsign_cmd} $RPM_BUILD_ROOT/lib/modules/%{KernelVer} || exit 1 \ + fi \ + find $RPM_BUILD_ROOT/lib/modules/ -type f -name '*.ko' | xargs -n1 -P`nproc --all` xz; \ +%{nil} + +%if 0%{?openEuler_sign_rsa} +%define __modsign_install_post \ + if [ "%{with_signmodules}" -eq "1" ];then \ + if [ %flag_openEuler_has_sign_perm -eq 1 ]; then \ + sh %{SOURCE16} $RPM_BUILD_ROOT/lib/modules/%{KernelVer} || exit 1 \ + else \ + cp certs/signing_key.pem . \ + cp certs/signing_key.x509 . 
\
+ chmod 0755 %{modsign_cmd} \
+ %{modsign_cmd} $RPM_BUILD_ROOT/lib/modules/%{KernelVer} || exit 1 \
+ fi \
+ fi \
+ find $RPM_BUILD_ROOT/lib/modules/ -type f -name '*.ko' | xargs -n1 -P`nproc --all` xz; \
+%{nil}
+%endif
+
+# deal with headers
+%{make} ARCH=%{Arch} INSTALL_HDR_PATH=$RPM_BUILD_ROOT/usr KBUILD_SRC= headers_install
+find $RPM_BUILD_ROOT/usr/include -name "\.*" -exec rm -rf {} \;
+
+# dtbs install
+%ifarch aarch64 riscv64
+ mkdir -p $RPM_BUILD_ROOT/boot/dtb-%{KernelVer}
+ install -m 644 $(find arch/%{Arch}/boot -name "*.dtb") $RPM_BUILD_ROOT/boot/dtb-%{KernelVer}/
+ rm -f $(find arch/$Arch/boot -name "*.dtb")
+%endif
+
+# deal with riscv SoC dtb search path
+%ifarch riscv64
+ mkdir -p $RPM_BUILD_ROOT/boot/dtb-%{KernelVer}/thead
+ mv $(find $RPM_BUILD_ROOT/boot/dtb-%{KernelVer}/ -name "th1520*.dtb") $RPM_BUILD_ROOT/boot/dtb-%{KernelVer}/thead
+%endif
+
+# deal with vdso
+%ifnarch ppc64le
+%{make} -s ARCH=%{Arch} INSTALL_MOD_PATH=$RPM_BUILD_ROOT vdso_install KERNELRELEASE=%{KernelVer}
+%endif
+if [ ! -s ldconfig-kernel.conf ]; then
+ echo "# Placeholder file, no vDSO hwcap entries used in this kernel." >ldconfig-kernel.conf
+fi
+install -D -m 444 ldconfig-kernel.conf $RPM_BUILD_ROOT/etc/ld.so.conf.d/kernel-%{KernelVer}.conf
+
+# deal with the /lib/modules/%{KernelVer} sub-paths: build and source
+rm -f $RPM_BUILD_ROOT/lib/modules/%{KernelVer}/build
+rm -f $RPM_BUILD_ROOT/lib/modules/%{KernelVer}/source
+mkdir -p $RPM_BUILD_ROOT/lib/modules/%{KernelVer}/build
+mkdir -p $RPM_BUILD_ROOT/lib/modules/%{KernelVer}/extra
+mkdir -p $RPM_BUILD_ROOT/lib/modules/%{KernelVer}/updates
+mkdir -p $RPM_BUILD_ROOT/lib/modules/%{KernelVer}/weak-updates
+############ collect devel files #########
+# 1. Makefile and Kconfig files, plus .config and symbol files
+# 2. scripts dir
+# 3. .h files
+find -type f \( -name "Makefile*" -o -name "Kconfig*" \) -exec cp --parents {} $RPM_BUILD_ROOT/lib/modules/%{KernelVer}/build \;
+for f in Module.symvers System.map Module.markers .config; do
+ test -f $f || continue
+ cp $f $RPM_BUILD_ROOT/lib/modules/%{KernelVer}/build
+done
+
+cp -a scripts $RPM_BUILD_ROOT/lib/modules/%{KernelVer}/build
+if [ -d arch/%{Arch}/scripts ]; then
+ cp -a arch/%{Arch}/scripts $RPM_BUILD_ROOT/lib/modules/%{KernelVer}/build/arch/%{_arch} || :
+fi
+if [ -f arch/%{Arch}/*lds ]; then
+ cp -a arch/%{Arch}/*lds $RPM_BUILD_ROOT/lib/modules/%{KernelVer}/build/arch/%{_arch}/ || :
+fi
+find $RPM_BUILD_ROOT/lib/modules/%{KernelVer}/build/scripts/ -name "*.o" -exec rm -rf {} \;
+
+if [ -d arch/%{Arch}/include ]; then
+ cp -a --parents arch/%{Arch}/include $RPM_BUILD_ROOT/lib/modules/%{KernelVer}/build/
+fi
+cp -a include $RPM_BUILD_ROOT/lib/modules/%{KernelVer}/build/include
+
+if [ -f arch/%{Arch}/kernel/module.lds ]; then
+ cp -a --parents arch/%{Arch}/kernel/module.lds $RPM_BUILD_ROOT/lib/modules/%{KernelVer}/build/
+fi
+
+# module.lds is moved to scripts by commit 596b0474d3d9 in linux 5.10. 
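+# Copy it into the devel tree as well so external module builds keep linking.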
+if [ -f scripts/module.lds ]; then + cp -a --parents scripts/module.lds $RPM_BUILD_ROOT/lib/modules/%{KernelVer}/build/ +fi + +%ifarch aarch64 + cp -a --parents arch/arm/include/asm $RPM_BUILD_ROOT/lib/modules/%{KernelVer}/build/ +%endif + +# copy objtool for kernel-devel (needed for building external modules) +if grep -q CONFIG_OBJTOOL=y .config; then + mkdir -p $RPM_BUILD_ROOT/lib/modules/%{KernelVer}/build/tools/objtool + cp -a tools/objtool/objtool $RPM_BUILD_ROOT/lib/modules/%{KernelVer}/build/tools/objtool +fi + +# Make sure the Makefile and version.h have a matching timestamp so that +# external modules can be built +touch -r $RPM_BUILD_ROOT/lib/modules/%{KernelVer}/build/Makefile $RPM_BUILD_ROOT/lib/modules/%{KernelVer}/build/include/generated/uapi/linux/version.h +touch -r $RPM_BUILD_ROOT/lib/modules/%{KernelVer}/build/.config $RPM_BUILD_ROOT/lib/modules/%{KernelVer}/build/include/generated/autoconf.h +# for make prepare +if [ ! -f $RPM_BUILD_ROOT/lib/modules/%{KernelVer}/build/include/config/auto.conf ];then + cp .config $RPM_BUILD_ROOT/lib/modules/%{KernelVer}/build/include/config/auto.conf +fi + +mkdir -p %{buildroot}/usr/src/kernels +mv $RPM_BUILD_ROOT/lib/modules/%{KernelVer}/build $RPM_BUILD_ROOT/usr/src/kernels/%{KernelVer} + +find $RPM_BUILD_ROOT/usr/src/kernels/%{KernelVer} -name ".*.cmd" -exec rm -f {} \; + +pushd $RPM_BUILD_ROOT/lib/modules/%{KernelVer} +ln -sf /usr/src/kernels/%{KernelVer} build +ln -sf build source +popd + + +# deal with doc , now we don't need + + +# deal with kernel abi whitelists. now we don't need + + +## install tools +%if %{with_perf} +# perf +# perf tool binary and supporting scripts/binaries +%if 0%{?with_python2} +%{perf_make} %{perf_python2} DESTDIR=%{buildroot} lib=%{_lib} install-bin +%else +%{perf_make} %{perf_python3} DESTDIR=%{buildroot} lib=%{_lib} install-bin +%endif +# remove the 'trace' symlink. 
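+# (install-bin creates 'trace' as an alias for 'perf trace'; it is not shipped)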
+rm -f %{buildroot}%{_bindir}/trace + +# remove examples +rm -rf %{buildroot}/usr/lib/perf/examples +# remove the stray header file that somehow got packaged in examples +rm -rf %{buildroot}/usr/lib/perf/include/bpf/ + +# python-perf extension +%{perf_make} %{perf_python3} DESTDIR=%{buildroot} install-python_ext +%if 0%{?with_python2} +%{perf_make} %{perf_python2} DESTDIR=%{buildroot} install-python_ext +%endif + +# perf man pages (note: implicit rpm magic compresses them later) +install -d %{buildroot}/%{_mandir}/man1 +install -pm0644 tools/perf/Documentation/*.1 %{buildroot}/%{_mandir}/man1/ +%endif + +# bpftool +pushd tools/bpf/bpftool +%{make} DESTDIR=%{buildroot} prefix=%{_prefix} bash_compdir=%{_sysconfdir}/bash_completion.d/ mandir=%{_mandir} install doc-install +popd + +# resolve_btfids +mkdir -p %{buildroot}/usr/src/kernels/%{KernelVer}/tools/bpf/resolve_btfids +cp tools/bpf/resolve_btfids/resolve_btfids %{buildroot}/usr/src/kernels/%{KernelVer}/tools/bpf/resolve_btfids + +# cpupower +%{make} -C tools/power/cpupower DESTDIR=%{buildroot} libdir=%{_libdir} mandir=%{_mandir} CPUFREQ_BENCH=false install +rm -f %{buildroot}%{_libdir}/*.{a,la} +%find_lang cpupower +mv cpupower.lang ../ +%ifarch %{ix86} + pushd tools/power/cpupower/debug/i386 + install -m755 centrino-decode %{buildroot}%{_bindir}/centrino-decode + install -m755 powernow-k8-decode %{buildroot}%{_bindir}/powernow-k8-decode + popd +%endif +%ifarch x86_64 + pushd tools/power/cpupower/debug/x86_64 + install -m755 centrino-decode %{buildroot}%{_bindir}/centrino-decode + install -m755 powernow-k8-decode %{buildroot}%{_bindir}/powernow-k8-decode + popd +%endif +chmod 0755 %{buildroot}%{_libdir}/libcpupower.so* +mkdir -p %{buildroot}%{_unitdir} %{buildroot}%{_sysconfdir}/sysconfig +install -m644 %{SOURCE2000} %{buildroot}%{_unitdir}/cpupower.service +install -m644 %{SOURCE2001} %{buildroot}%{_sysconfdir}/sysconfig/cpupower +%ifarch %{ix86} x86_64 + mkdir -p %{buildroot}%{_mandir}/man8 + pushd tools/power/x86/x86_energy_perf_policy + %{make} DESTDIR=%{buildroot} install + popd + pushd tools/power/x86/turbostat + %{make} DESTDIR=%{buildroot} install + popd +%endif +# thermal +pushd tools/thermal/tmon +%{make} INSTALL_ROOT=%{buildroot} install +popd +# iio +pushd tools/iio +%{make} DESTDIR=%{buildroot} install +popd +# gpio +pushd tools/gpio +%{make} DESTDIR=%{buildroot} install +popd +# kvm +pushd tools/kvm/kvm_stat +%{make} INSTALL_ROOT=%{buildroot} install-tools +popd + +%define __spec_install_post\ +%{?__debug_package:%{__debug_install_post}}\ +%{__arch_install_post}\ +%{__os_install_post}\ +%{__modsign_install_post}\ +%{nil} + +%post +%{_sbindir}/new-kernel-pkg --package kernel --install %{KernelVer} || exit $? + +%preun +if [ `uname -i` == "aarch64" ] && + [ -f /boot/EFI/grub2/grub.cfg ]; then + /usr/bin/sh %{_sbindir}/mkgrub-menu-%{version}-%{devel_release}%{?maintenance_release}%{?pkg_release}.sh %{version}-%{release}.aarch64 /boot/EFI/grub2/grub.cfg remove +fi + +%postun +%{_sbindir}/new-kernel-pkg --rminitrd --rmmoddep --remove %{KernelVer} || exit $? +if [ -x %{_sbindir}/weak-modules ] +then + %{_sbindir}/weak-modules --remove-kernel %{KernelVer} || exit $? 
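+ # weak-modules maintains the weak-updates/ compat symlinks for external modules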
+fi + +# remove empty directory +if [ -d /lib/modules/%{KernelVer} ] && [ "`ls -A /lib/modules/%{KernelVer}`" = "" ]; then + rm -rf /lib/modules/%{KernelVer} +fi +if [ `uname -i` == "loongarch64" ];then + [ -f /etc/grub2.cfg ] && GRUB_CFG=`readlink -f /etc/grub2.cfg` + [ "x${GRUB_CFG}" == "x" ] && [ -f /etc/grub2-efi.cfg ] && GRUB_CFG=`readlink -f /etc/grub2-efi.cfg` + [ "x${GRUB_CFG}" == "x" ] && [ -f /boot/efi/EFI/openEuler/grub.cfg ] && GRUB_CFG=/boot/efi/EFI/openEuler/grub.cfg + [ "x${GRUB_CFG}" != "x" ] && grub2-mkconfig -o ${GRUB_CFG} +fi + +%posttrans +%{_sbindir}/new-kernel-pkg --package kernel --mkinitrd --dracut --depmod --update %{KernelVer} || exit $? +%{_sbindir}/new-kernel-pkg --package kernel --rpmposttrans %{KernelVer} || exit $? +if [ `uname -i` == "aarch64" ] && + [ -f /boot/EFI/grub2/grub.cfg ]; then + /usr/bin/sh %{_sbindir}/mkgrub-menu-%{version}-%{devel_release}%{?maintenance_release}%{?pkg_release}.sh %{version}-%{release}.aarch64 /boot/EFI/grub2/grub.cfg update +fi +if [ `uname -i` == "loongarch64" ];then + [ -f /etc/grub2.cfg ] && GRUB_CFG=`readlink -f /etc/grub2.cfg` + [ "x${GRUB_CFG}" == "x" ] && [ -f /etc/grub2-efi.cfg ] && GRUB_CFG=`readlink -f /etc/grub2-efi.cfg` + [ "x${GRUB_CFG}" == "x" ] && [ -f /boot/efi/EFI/openEuler/grub.cfg ] && GRUB_CFG=/boot/efi/EFI/openEuler/grub.cfg + [ "x${GRUB_CFG}" != "x" ] && grub2-mkconfig -o ${GRUB_CFG} + grubby --set-default=/boot/vmlinuz-%{KernelVer} +fi +if [ -x %{_sbindir}/weak-modules ] +then + %{_sbindir}/weak-modules --add-kernel %{KernelVer} || exit $? +fi +%{_sbindir}/new-kernel-pkg --package kernel --mkinitrd --dracut --depmod --update %{KernelVer} || exit $? +%{_sbindir}/new-kernel-pkg --package kernel --rpmposttrans %{KernelVer} || exit $? + +%post devel +if [ -f /etc/sysconfig/kernel ] +then + . /etc/sysconfig/kernel || exit $? +fi +if [ "$HARDLINK" != "no" -a -x /usr/sbin/hardlink ] +then + (cd /usr/src/kernels/%{KernelVer} && + /usr/bin/find . 
-type f | while read f; do + hardlink -c /usr/src/kernels/*.oe*.*/$f $f + done) +fi + +%post -n %{name}-tools +/sbin/ldconfig +%systemd_post cpupower.service + +%preun -n %{name}-tools +%systemd_preun cpupower.service + +%postun -n %{name}-tools +/sbin/ldconfig +%systemd_postun cpupower.service + +%ifnarch x86_64 aarch64 +%files +%else +%files -f kernel-modules-filelist +%endif +%defattr (-, root, root) +%doc +/boot/config-* +%ifarch aarch64 riscv64 +/boot/dtb-* +%endif +/boot/symvers-* +/boot/System.map-* +/boot/vmlinuz-* +%ghost /boot/initramfs-%{KernelVer}.img +/boot/.vmlinuz-*.hmac +/etc/ld.so.conf.d/* +%ifnarch x86_64 aarch64 +/lib/modules/%{KernelVer}/ +%else +/lib/modules/%{KernelVer}/vdso/ +/lib/modules/%{KernelVer}/modules.* +%endif +%exclude /lib/modules/%{KernelVer}/source +%exclude /lib/modules/%{KernelVer}/build +%{_sbindir}/mkgrub-menu*.sh + +%ifarch x86_64 aarch64 +%files extra-modules -f kernel-extra-modules-filelist +%defattr(-,root,root) +%endif + +%files devel +%defattr (-, root, root) +%doc +/lib/modules/%{KernelVer}/source +/lib/modules/%{KernelVer}/build +/usr/src/kernels/%{KernelVer} + +%files headers +%defattr (-, root, root) +/usr/include/* +%exclude %{_includedir}/cpufreq.h +%exclude %{_includedir}/cpuidle.h + +%if %{with_perf} +%files -n perf +%{_bindir}/perf +%{_libdir}/libperf-jvmti.so +%{_libexecdir}/perf-core +%{_datadir}/perf-core/ +%{_mandir}/man[1-8]/perf* +%{_sysconfdir}/bash_completion.d/perf +%doc linux-%{KernelVer}/tools/perf/Documentation/examples.txt +%dir %{_datadir}/doc/perf-tip +%{_datadir}/doc/perf-tip/* +%license linux-%{KernelVer}/COPYING + +%if 0%{?with_python2} +%files -n python2-perf +%license linux-%{KernelVer}/COPYING +%{python2_sitearch}/* +%endif + +%files -n python3-perf +%license linux-%{KernelVer}/COPYING +%{python3_sitearch}/* +%endif + +%files -n %{name}-tools -f cpupower.lang +%{_bindir}/cpupower +%ifarch %{ix86} x86_64 +%{_bindir}/centrino-decode +%{_bindir}/powernow-k8-decode +%endif +%{_unitdir}/cpupower.service +%{_datadir}/bash-completion/completions/cpupower +%{_mandir}/man[1-8]/cpupower* +%config(noreplace) %{_sysconfdir}/sysconfig/cpupower +%ifarch %{ix86} x86_64 +%{_bindir}/x86_energy_perf_policy +%{_mandir}/man8/x86_energy_perf_policy* +%{_bindir}/turbostat +%{_mandir}/man8/turbostat* +%endif +%{_bindir}/tmon +%{_bindir}/iio_event_monitor +%{_bindir}/iio_generic_buffer +%{_bindir}/lsiio +%{_bindir}/lsgpio +%{_bindir}/gpio-hammer +%{_bindir}/gpio-event-mon +%{_bindir}/gpio-watch +%{_bindir}/kvm_stat +%{_libdir}/libcpupower.so.1 +%{_libdir}/libcpupower.so.0.0.1 +%license linux-%{KernelVer}/COPYING + +%files -n %{name}-tools-devel +%{_libdir}/libcpupower.so +%{_includedir}/cpufreq.h +%{_includedir}/cpuidle.h + +%files -n bpftool +%{_sbindir}/bpftool +%{_sysconfdir}/bash_completion.d/bpftool +%{_mandir}/man8/bpftool-cgroup.8.gz +%{_mandir}/man8/bpftool-map.8.gz +%{_mandir}/man8/bpftool-prog.8.gz +%{_mandir}/man8/bpftool-perf.8.gz +%{_mandir}/man8/bpftool.8.gz +%{_mandir}/man8/bpftool-btf.8.gz +%{_mandir}/man8/bpftool-feature.8.gz +%{_mandir}/man8/bpftool-gen.8.gz +%{_mandir}/man8/bpftool-iter.8.gz +%{_mandir}/man8/bpftool-link.8.gz +%{_mandir}/man8/bpftool-net.8.gz +%{_mandir}/man8/bpftool-struct_ops.8.gz +%license linux-%{KernelVer}/COPYING + +%if 0%{?with_source} +%files source +%defattr(-,root,root) +/usr/src/linux-%{KernelVer}/* +/usr/src/linux-%{KernelVer}/.config +%endif + +%changelog +* Tue Aug 12 2025 Liu Wang <1823363429@qq.com> - 6.6.0-102.0.0.3 +- Split kernel modules into kernel-extra-modules subpackage +- 
Prioritize core kmods (networking/drm/block/modesetting) in the main kernel package
+- Isolate all extra modules and their dependencies in kernel-extra-modules
+- Add module-filelist generation scripts for both packages
+- Update post-install scripts to handle the extra-modules dependencies
+
+* Thu Jul 31 2025 Li Nan - 6.6.0-102.0.0.2
+- Update kabicheck to fix build POSTTRANS scriptlet error
+
+* Tue Jul 29 2025 Tengda Wu - 6.6.0-102.0.0.1
+- package change based on openEuler kernel 6.6.0-102.0.0
+
+* Thu Mar 13 2025 Hang Huang - 6.6.0-102.4.0.54
+- vk: introduce vkernel
+
+* Sat Jan 14 2023 Xie XiuQi - 6.1.0-1.0.0.1
+- package init based on upstream v6.1