diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index d056a548358ea024fc64f3a107849951a26838d7..48e4ee18c596ca16d135d257289ceb04e4782266 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -454,3 +454,4 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common process_mrelease sys_process_mrelease \ No newline at end of file diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index b3b2019f8d16bbeef51948c249dcfb6dbd97a0fb..949788f5ba4007049d0e92b55895ef738c1949fa 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -38,7 +38,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 441 +#define __NR_compat_syscalls 443 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 107f08e03b9fdc4d54b41c7fca707727e1524c92..e1786b2e8551b31e7242627ca75e6e844cde7a20 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -889,7 +889,8 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) __SYSCALL(__NR_faccessat2, sys_faccessat2) #define __NR_process_madvise 440 __SYSCALL(__NR_process_madvise, sys_process_madvise) - +#define __NR_process_mrelease 441 +__SYSCALL(__NR_process_mrelease, sys_process_mrelease) /* * Please add new compat syscalls above this comment and update * __NR_compat_syscalls in asm/unistd.h. diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index aea0ce9f3b745ad8f8bfeed1b74ad9c1307f1c28..112ff24ea927b3b8b488d5a58bb70f634496d183 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -905,6 +905,7 @@ asmlinkage long sys_mincore(unsigned long start, size_t len, asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, size_t vlen, int behavior, unsigned int flags); +asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags); asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long flags); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 419a814f467e0f4f96907db9621273aaa10dea6a..120635b521b51ccb6d141d6c00ae5e0e39f3269d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -43,6 +43,8 @@ #include #include #include +#include +#include #include #include "internal.h" @@ -1137,3 +1139,85 @@ void pagefault_out_of_memory(void) if (__ratelimit(&pfoom_rs)) pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF\n"); } + +static struct pid *pidfd_to_pid_local(const struct file *file) +{ + struct pid *pid; + + pid = pidfd_pid(file); + if (!IS_ERR(pid)) + return pid; + + return tgid_pidfd_to_pid(file); +} + +static struct pid *pidfd_get_pid_local(unsigned int fd, unsigned int *flags) +{ + struct fd f; + struct pid *pid; + + f = fdget(fd); + + if (!f.file) + return ERR_PTR(-EBADF); + + pid = pidfd_to_pid_local(f.file); + + if (!IS_ERR(pid)) { + get_pid(pid); + *flags = f.file->f_flags; + } + + fdput(f); + return pid; +} + +SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) +{ +#ifdef CONFIG_MMU + struct mm_struct *mm = NULL; + struct task_struct *task; + unsigned int f_flags; + struct pid *pid; + long ret = 0; + if (flags != 0) + return -EINVAL; + pid = pidfd_get_pid_local(pidfd, &f_flags); + if (IS_ERR(pid)) + return PTR_ERR(pid); + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) { + ret = -ESRCH; + goto put_pid; + } + pr_warn("get pid: %d task struct",pid); + /* + * If the task is dying and in the process of releasing its memory + * then get its mm. + */ + task_lock(task); + if (task_will_free_mem(task) && (task->flags & PF_KTHREAD) == 0) { + pr_warn("get mm"); + mm = task->mm; + mmget(mm); + } + task_unlock(task); + if (!mm) { + pr_warn("do not get mm"); + ret = -EINVAL; + goto put_task; + } + mmap_read_lock(mm); + if (!__oom_reap_task_mm(mm)) + ret = -EAGAIN; + mmap_read_unlock(mm); + mmput(mm); +put_task: + put_task_struct(task); +put_pid: + put_pid(pid); + return ret; +#else + return -ENOSYS; +#endif /* CONFIG_MMU */ +} diff --git a/tools/testing/selftests/vm/mrelease_test.c b/tools/testing/selftests/vm/mrelease_test.c new file mode 100644 index 0000000000000000000000000000000000000000..ea707820830b955d90048e6c4a6276fbcec6cb06 --- /dev/null +++ b/tools/testing/selftests/vm/mrelease_test.c @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2022 Google LLC + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include "util.h" +#include "../kselftest.h" +#ifndef __NR_pidfd_open +#define __NR_pidfd_open 434 +#endif +#ifndef __NR_process_mrelease +#define __NR_process_mrelease 441 +#endif +#define MB(x) (x << 20) +#define MAX_SIZE_MB 1024 +static int alloc_noexit(unsigned long nr_pages, int pipefd) +{ + int ppid = getppid(); + int timeout = 10; /* 10sec timeout to get killed */ + unsigned long i; + char *buf; + buf = (char *)mmap(NULL, nr_pages * PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON, 0, 0); + if (buf == MAP_FAILED) { + perror("mmap failed, halting the test"); + return KSFT_FAIL; + } + for (i = 0; i < nr_pages; i++) + *((unsigned long *)(buf + (i * PAGE_SIZE))) = i; + /* Signal the parent that the child is ready */ + printf("child process waking up parent !!!!\n"); + if (write(pipefd, "", 1) < 0) { + perror("write"); + return KSFT_FAIL; + } + /* Wait to be killed (when reparenting happens) */ + while (getppid() == ppid && timeout > 0) { + sleep(1); + timeout--; + } + munmap(buf, nr_pages * PAGE_SIZE); + return (timeout > 0) ? KSFT_PASS : KSFT_FAIL; +} +/* The process_mrelease calls in this test are expected to fail */ +static void run_negative_tests(int pidfd) +{ + /* Test invalid flags. Expect to fail with EINVAL error code. */ + if (!syscall(__NR_process_mrelease, pidfd, (unsigned int)-1) || + errno != EINVAL) { + perror("process_mrelease with wrong flags"); + exit(errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + } + /* + * Test reaping while process is alive with no pending SIGKILL. + * Expect to fail with EINVAL error code. + */ + if (!syscall(__NR_process_mrelease, pidfd, 0) || errno != EINVAL) { + perror("process_mrelease on a live process"); + exit(errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + } +} +static int child_main(int pipefd[], size_t size) +{ + int res; + printf("child process start !!!!\n"); + /* Allocate and fault-in memory and wait to be killed */ + close(pipefd[0]); + res = alloc_noexit(MB(size) / PAGE_SIZE, pipefd[1]); + close(pipefd[1]); + return res; +} +int main(void) +{ + int pipefd[2], pidfd; + bool success, retry; + size_t size; + pid_t pid; + char byte; + int res; + /* Test a wrong pidfd */ + if (!syscall(__NR_process_mrelease, -1, 0) || errno != EBADF) { + perror("process_mrelease with wrong pidfd"); + exit(errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + } + /* Start the test with 1MB child memory allocation */ + size = 1; +retry: + /* + * Pipe for the child to signal when it's done allocating + * memory + */ + if (pipe(pipefd)) { + perror("pipe"); + exit(KSFT_FAIL); + } + pid = fork(); + if (pid < 0) { + perror("fork"); + close(pipefd[0]); + close(pipefd[1]); + exit(KSFT_FAIL); + } + if (pid == 0) { + /* Child main routine */ + res = child_main(pipefd, size); + exit(res); + } + /* + * Parent main routine: + * Wait for the child to finish allocations, then kill and reap + */ + close(pipefd[1]); + /* Block until the child is ready */ + printf("parent process waiting child !!!!!\n"); + res = read(pipefd[0], &byte, 1); + close(pipefd[0]); + if (res < 0) { + perror("read"); + if (!kill(pid, SIGKILL)) + waitpid(pid, NULL, 0); + exit(KSFT_FAIL); + } + pidfd = syscall(__NR_pidfd_open, pid, 0); + if (pidfd < 0) { + perror("pidfd_open"); + if (!kill(pid, SIGKILL)) + waitpid(pid, NULL, 0); + exit(KSFT_FAIL); + } + /* Run negative tests which require a live child */ + // run_negative_tests(pidfd); + printf("parent process start kill !!!\n"); + if (kill(pid, SIGKILL)) { + perror("kill"); + exit(errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + } + printf("parent process start process_mrelease !!!\n"); + + success = (syscall(__NR_process_mrelease, pidfd, 0) == 0); + printf("success: %d,errono:%d\n",success,errno); + if (!success) { + /* + * If we failed to reap because the child exited too soon, + * before we could call process_mrelease. Double child's memory + * which causes it to spend more time on cleanup and increases + * our chances of reaping its memory before it exits. + * Retry until we succeed or reach MAX_SIZE_MB. + */ + if (errno == ESRCH) { + retry = (size <= MAX_SIZE_MB); + } + // } else { + + // perror("process_mrelease"); + // waitpid(pid, NULL, 0); + // exit(errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + // } + } + /* Cleanup to prevent zombies */ + if (waitpid(pid, NULL, 0) < 0) { + perror("waitpid"); + // exit(KSFT_FAIL); + } + close(pidfd); + if (!success) { + if (retry) { + size *= 2; + goto retry; + } + printf("All process_mrelease attempts failed!\n"); + exit(KSFT_FAIL); + } + printf("Success reaping a child with %zuMB of memory allocations\n", + size); + return KSFT_PASS; +} \ No newline at end of file diff --git a/tools/testing/selftests/vm/util.h b/tools/testing/selftests/vm/util.h new file mode 100644 index 0000000000000000000000000000000000000000..6b2f641ccf9583415998641daf2f2c580c4a43b7 --- /dev/null +++ b/tools/testing/selftests/vm/util.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __KSELFTEST_VM_UTIL_H +#define __KSELFTEST_VM_UTIL_H + +#include +#include +#include +#include /* ffsl() */ +#include /* _SC_PAGESIZE */ + +static unsigned int __page_size; +static unsigned int __page_shift; + +static inline unsigned int page_size(void) +{ + if (!__page_size) + __page_size = sysconf(_SC_PAGESIZE); + return __page_size; +} + +static inline unsigned int page_shift(void) +{ + if (!__page_shift) + __page_shift = (ffsl(page_size()) - 1); + return __page_shift; +} + +#define PAGE_SHIFT (page_shift()) +#define PAGE_SIZE (page_size()) +/* + * On ppc64 this will only work with radix 2M hugepage size + */ +#define HPAGE_SHIFT 21 +#define HPAGE_SIZE (1 << HPAGE_SHIFT) + +#define PAGEMAP_PRESENT(ent) (((ent) & (1ull << 63)) != 0) +#define PAGEMAP_PFN(ent) ((ent) & ((1ull << 55) - 1)) + + +static inline int64_t allocate_transhuge(void *ptr, int pagemap_fd) +{ + uint64_t ent[2]; + + /* drop pmd */ + if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_ANONYMOUS | + MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr) + errx(2, "mmap transhuge"); + + if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE)) + err(2, "MADV_HUGEPAGE"); + + /* allocate transparent huge page */ + *(volatile void **)ptr = ptr; + + if (pread(pagemap_fd, ent, sizeof(ent), + (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent)) + err(2, "read pagemap"); + + if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) && + PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) && + !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - PAGE_SHIFT)) - 1))) + return PAGEMAP_PFN(ent[0]); + + return -1; +} + +#endif \ No newline at end of file