diff --git a/0002-compel-add-rseq-syscall-into-compel-std-plugin-sysca.patch b/0002-compel-add-rseq-syscall-into-compel-std-plugin-sysca.patch new file mode 100644 index 0000000000000000000000000000000000000000..ac103f47572010732534a6125d8f0e6c31d6df8e --- /dev/null +++ b/0002-compel-add-rseq-syscall-into-compel-std-plugin-sysca.patch @@ -0,0 +1,74 @@ +From ee46b1b5755eacf3be02a67934f0dc690293745b Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: Wed, 2 Mar 2022 13:28:51 +0800 +Subject: [PATCH 02/16] compel: add rseq syscall into compel std plugin syscall + tables Add rseq syscall numbers for: arm/aarch64, mips64, ppc64le, s390, + x86_64/x86 + +Signed-off-by: Alexander Mikhalitsyn +--- + compel/arch/arm/plugins/std/syscalls/syscall.def | 1 + + compel/arch/mips/plugins/std/syscalls/syscall_64.tbl | 1 + + .../compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl | 1 + + .../compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl | 1 + + compel/arch/x86/plugins/std/syscalls/syscall_32.tbl | 1 + + compel/arch/x86/plugins/std/syscalls/syscall_64.tbl | 1 + + 6 files changed, 6 insertions(+) + +diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def +index 1b877d1..bb78cbb 100644 +--- a/compel/arch/arm/plugins/std/syscalls/syscall.def ++++ b/compel/arch/arm/plugins/std/syscalls/syscall.def +@@ -119,3 +119,4 @@ clone3 435 435 (struct clone_args *uargs, size_t size) + sched_setaffinity 122 241 (int fd, size_t cpusetsize, const cpu_set_t *mask) + pidfd_open 434 434 (pid_t pid, unsigned int flags) + pidfd_getfd 438 438 (int pidfd, int targetfd, unsigned int flags) ++rseq 293 398 (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +diff --git a/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl b/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl +index 7a6db19..95dc7d3 100644 +--- a/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl ++++ b/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl +@@ -115,3 +115,4 
@@ __NR_fsmount 5432 sys_fsmount (int fd, unsigned int flags, unsigned int attr + __NR_clone3 5435 sys_clone3 (struct clone_args *uargs, size_t size) + __NR_pidfd_open 5434 sys_pidfd_open (pid_t pid, unsigned int flags) + __NR_pidfd_getfd 5438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) ++__NR_rseq 5327 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +diff --git a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl +index dd79187..ad0d94f 100644 +--- a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl ++++ b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl +@@ -115,3 +115,4 @@ __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) + __NR_sched_setaffinity 222 sys_sched_setaffinity (int fd, size_t cpusetsize, const cpu_set_t *mask) + __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) + __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) ++__NR_rseq 387 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +diff --git a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl +index 282adaf..916b697 100644 +--- a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl ++++ b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl +@@ -115,3 +115,4 @@ __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) + __NR_sched_setaffinity 239 sys_sched_setaffinity (int fd, size_t cpusetsize, const cpu_set_t *mask) + __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) + __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) ++__NR_rseq 383 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl +index 3fe3194..90f23d5 100644 +--- 
a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl ++++ b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl +@@ -103,3 +103,4 @@ __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_f + __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) + __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) + __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) ++__NR_rseq 386 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl +index c1d119d..323fab1 100644 +--- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl ++++ b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl +@@ -114,3 +114,4 @@ __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_ + __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) + __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) + __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) ++__NR_rseq 334 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +-- +2.30.0 + diff --git a/0003-kerndat-check-for-rseq-syscall-support.patch b/0003-kerndat-check-for-rseq-syscall-support.patch new file mode 100644 index 0000000000000000000000000000000000000000..1729b14d09452869045758ae768c8503ef075e9f --- /dev/null +++ b/0003-kerndat-check-for-rseq-syscall-support.patch @@ -0,0 +1,62 @@ +From ebd917f395b8bb3c4d6bbe51f9210d1aeca2e1fd Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: Wed, 2 Mar 2022 13:34:10 +0800 +Subject: [PATCH 03/16] kerndat: check for rseq syscall support Signed-off-by: + Alexander Mikhalitsyn + +--- + criu/include/kerndat.h | 1 + + criu/kerndat.c | 18 ++++++++++++++++++ + 2 files changed, 19 insertions(+) + +diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h +index 80bad7f..44a6976 100644 +--- a/criu/include/kerndat.h ++++ 
b/criu/include/kerndat.h +@@ -74,6 +74,7 @@ struct kerndat_s { + bool has_pidfd_getfd; + bool has_nspid; + bool has_nftables_concat; ++ bool has_rseq; + }; + + extern struct kerndat_s kdat; +diff --git a/criu/kerndat.c b/criu/kerndat.c +index 0e88ba4..f5a4490 100644 +--- a/criu/kerndat.c ++++ b/criu/kerndat.c +@@ -816,6 +816,20 @@ static int kerndat_x86_has_ptrace_fpu_xsave_bug(void) + return 0; + } + ++static int kerndat_has_rseq(void) ++{ ++ if (syscall(__NR_rseq, NULL, 0, 0, 0) != -1) { ++ pr_err("rseq should fail\n"); ++ return -1; ++ } ++ if (errno == ENOSYS) ++ pr_info("rseq syscall isn't supported\n"); ++ else ++ kdat.has_rseq = true; ++ ++ return 0; ++} ++ + #define KERNDAT_CACHE_FILE KDAT_RUNDIR "/criu.kdat" + #define KERNDAT_CACHE_FILE_TMP KDAT_RUNDIR "/.criu.kdat" + +@@ -1360,6 +1374,10 @@ int kerndat_init(void) + ret = -1; + } + ++ if (!ret && kerndat_has_rseq()) { ++ pr_err("kerndat_has_rseq failed when initializing kerndat.\n"); ++ ret = -1; ++ } + kerndat_lsm(); + kerndat_mmap_min_addr(); + kerndat_files_stat(); +-- +2.30.0 + diff --git a/0004-util-move-fork_and_ptrace_attach-helper-from-cr-chec.patch b/0004-util-move-fork_and_ptrace_attach-helper-from-cr-chec.patch new file mode 100644 index 0000000000000000000000000000000000000000..51457c6e87f7694594b96fc58597bfcf58eb0d14 --- /dev/null +++ b/0004-util-move-fork_and_ptrace_attach-helper-from-cr-chec.patch @@ -0,0 +1,161 @@ +From fe1f84eb98092b1aff60ae2be11e351b165f3f43 Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: Wed, 2 Mar 2022 13:35:53 +0800 +Subject: [PATCH 04/16] util: move fork_and_ptrace_attach helper from cr-check + Signed-off-by: Alexander Mikhalitsyn + +--- + criu/cr-check.c | 55 ------------------------------- + criu/include/util.h | 1 + + criu/util.c | 57 +++++++++++++++++++++++++++++++++ + 3 files changed, 58 insertions(+), 55 deletions(-) + +diff --git a/criu/cr-check.c b/criu/cr-check.c +index 3575fb3..d41ef8f 100644 +--- a/criu/cr-check.c ++++ b/criu/cr-check.c +@@ -537,61 +537,6 @@ 
static int check_sigqueuinfo(void) + return 0; + } + +-static pid_t fork_and_ptrace_attach(int (*child_setup)(void)) +-{ +- pid_t pid; +- int sk_pair[2], sk; +- char c = 0; +- +- if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) { +- pr_perror("socketpair"); +- return -1; +- } +- +- pid = fork(); +- if (pid < 0) { +- pr_perror("fork"); +- return -1; +- } else if (pid == 0) { +- sk = sk_pair[1]; +- close(sk_pair[0]); +- +- if (child_setup && child_setup() != 0) +- exit(1); +- +- if (write(sk, &c, 1) != 1) { +- pr_perror("write"); +- exit(1); +- } +- +- while (1) +- sleep(1000); +- exit(1); +- } +- +- sk = sk_pair[0]; +- close(sk_pair[1]); +- +- if (read(sk, &c, 1) != 1) { +- close(sk); +- kill(pid, SIGKILL); +- pr_perror("read"); +- return -1; +- } +- +- close(sk); +- +- if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) == -1) { +- pr_perror("Unable to ptrace the child"); +- kill(pid, SIGKILL); +- return -1; +- } +- +- waitpid(pid, NULL, 0); +- +- return pid; +-} +- + static int check_ptrace_peeksiginfo(void) + { + struct ptrace_peeksiginfo_args arg; +diff --git a/criu/include/util.h b/criu/include/util.h +index a2dac22..1c0b3c7 100644 +--- a/criu/include/util.h ++++ b/criu/include/util.h +@@ -166,6 +166,7 @@ extern int is_anon_link_type(char *link, char *type); + + extern int cr_system(int in, int out, int err, char *cmd, char *const argv[], unsigned flags); + extern int cr_system_userns(int in, int out, int err, char *cmd, char *const argv[], unsigned flags, int userns_pid); ++extern pid_t fork_and_ptrace_attach(int (*child_setup)(void)); + extern int cr_daemon(int nochdir, int noclose, int close_fd); + extern int status_ready(void); + extern int is_root_user(void); +diff --git a/criu/util.c b/criu/util.c +index 06124c2..e682161 100644 +--- a/criu/util.c ++++ b/criu/util.c +@@ -654,6 +654,63 @@ out: + return ret; + } + ++pid_t fork_and_ptrace_attach(int (*child_setup)(void)) ++{ ++ pid_t pid; ++ int sk_pair[2], sk; ++ char c = 0; ++ ++ if (socketpair(PF_LOCAL, 
SOCK_SEQPACKET, 0, sk_pair)) { ++ pr_perror("socketpair"); ++ return -1; ++ } ++ ++ pid = fork(); ++ if (pid < 0) { ++ pr_perror("fork"); ++ return -1; ++ } else if (pid == 0) { ++ sk = sk_pair[1]; ++ close(sk_pair[0]); ++ ++ if (child_setup && child_setup() != 0) ++ exit(1); ++ ++ if (write(sk, &c, 1) != 1) { ++ pr_perror("write"); ++ exit(1); ++ } ++ ++ while (1) ++ sleep(1000); ++ exit(1); ++ } ++ ++ sk = sk_pair[0]; ++ close(sk_pair[1]); ++ ++ if (read(sk, &c, 1) != 1) { ++ close(sk); ++ kill(pid, SIGKILL); ++ waitpid(pid, NULL, 0); ++ pr_perror("read"); ++ return -1; ++ } ++ ++ close(sk); ++ ++ if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) == -1) { ++ pr_perror("Unable to ptrace the child"); ++ kill(pid, SIGKILL); ++ waitpid(pid, NULL, 0); ++ return -1; ++ } ++ ++ waitpid(pid, NULL, 0); ++ ++ return pid; ++} ++ + int status_ready(void) + { + char c = 0; +-- +2.30.0 + diff --git a/0005-cr-check-Add-ptrace-rseq-conf-dump-feature.patch b/0005-cr-check-Add-ptrace-rseq-conf-dump-feature.patch new file mode 100644 index 0000000000000000000000000000000000000000..5a82e084015fc162a56423e55359a83ac23991af --- /dev/null +++ b/0005-cr-check-Add-ptrace-rseq-conf-dump-feature.patch @@ -0,0 +1,162 @@ +From 3c567693f2e6579109dbabcca0e90c059ce5af25 Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: Wed, 2 Mar 2022 15:30:18 +0800 +Subject: [PATCH 05/16] cr-check: Add ptrace rseq conf dump feature Add + "get_rseq_conf" feature corresponding to the + ptrace(PTRACE_GET_RSEQ_CONFIGURATION) support. 
+ +Signed-off-by: Alexander Mikhalitsyn +--- + compel/include/uapi/ptrace.h | 12 +++++++ + criu/cr-check.c | 11 +++++++ + criu/include/kerndat.h | 1 + + criu/kerndat.c | 41 ++++++++++++++++++++++++ + 4 files changed, 65 insertions(+) + +diff --git a/compel/include/uapi/ptrace.h b/compel/include/uapi/ptrace.h +index c5291d2..bfe28c7 100644 +--- a/compel/include/uapi/ptrace.h ++++ b/compel/include/uapi/ptrace.h +@@ -65,6 +65,18 @@ typedef struct { + uint64_t flags; /* Output: filter's flags */ + } seccomp_metadata_t; + ++#ifndef PTRACE_GET_RSEQ_CONFIGURATION ++#define PTRACE_GET_RSEQ_CONFIGURATION 0x420f ++ ++struct ptrace_rseq_configuration { ++ __u64 rseq_abi_pointer; ++ __u32 rseq_abi_size; ++ __u32 signature; ++ __u32 flags; ++ __u32 pad; ++}; ++#endif ++ + #ifdef PTRACE_EVENT_STOP + #if PTRACE_EVENT_STOP == 7 /* Bad value from Linux 3.1-3.3, fixed in 3.4 */ + #undef PTRACE_EVENT_STOP +diff --git a/criu/cr-check.c b/criu/cr-check.c +index d41ef8f..ba87511 100644 +--- a/criu/cr-check.c ++++ b/criu/cr-check.c +@@ -794,6 +794,15 @@ static int check_ptrace_dump_seccomp_filters(void) + return ret; + } + ++static int check_ptrace_get_rseq_conf(void) ++{ ++ if (!kdat.has_ptrace_get_rseq_conf) { ++ pr_warn("ptrace(PTRACE_GET_RSEQ_CONFIGURATION) isn't supported. 
C/R of processes which are using rseq() won't work.\n"); ++ return -1; ++ } ++ return 0; ++} ++ + static int check_mem_dirty_track(void) + { + if (!kdat.has_dirty_track) { +@@ -1435,6 +1444,7 @@ int cr_check(void) + ret |= check_ns_pid(); + ret |= check_apparmor_stacking(); + ret |= check_network_lock_nftables(); ++ ret |= check_ptrace_get_rseq_conf(); + } + + /* +@@ -1547,6 +1557,7 @@ static struct feature_list feature_list[] = { + { "ns_pid", check_ns_pid }, + { "apparmor_stacking", check_apparmor_stacking }, + { "network_lock_nftables", check_network_lock_nftables }, ++ { "get_rseq_conf", check_ptrace_get_rseq_conf }, + { NULL, NULL }, + }; + +diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h +index 44a6976..05abeda 100644 +--- a/criu/include/kerndat.h ++++ b/criu/include/kerndat.h +@@ -75,6 +75,7 @@ struct kerndat_s { + bool has_nspid; + bool has_nftables_concat; + bool has_rseq; ++ bool has_ptrace_get_rseq_conf; + }; + + extern struct kerndat_s kdat; +diff --git a/criu/kerndat.c b/criu/kerndat.c +index f5a4490..4841387 100644 +--- a/criu/kerndat.c ++++ b/criu/kerndat.c +@@ -4,6 +4,8 @@ + #include + #include + #include ++#include ++#include + #include + #include + #include +@@ -36,6 +38,7 @@ + #include "sockets.h" + #include "net.h" + #include "tun.h" ++#include + #include + #include "netfilter.h" + #include "fsnotify.h" +@@ -830,6 +833,40 @@ static int kerndat_has_rseq(void) + return 0; + } + ++static int kerndat_has_ptrace_get_rseq_conf(void) ++{ ++ pid_t pid; ++ int len; ++ struct ptrace_rseq_configuration rseq; ++ ++ pid = fork_and_ptrace_attach(NULL); ++ if (pid < 0) ++ return -1; ++ ++ len = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, pid, sizeof(rseq), &rseq); ++ if (len != sizeof(rseq)) { ++ kdat.has_ptrace_get_rseq_conf = false; ++ pr_info("ptrace(PTRACE_GET_RSEQ_CONFIGURATION) is not supported\n"); ++ goto out; ++ } ++ ++ /* ++ * flags is always zero from the kernel side, if it will be changed ++ * we need to pay attention to that and, possibly, 
make changes on the CRIU side. ++ */ ++ if (rseq.flags != 0) { ++ kdat.has_ptrace_get_rseq_conf = false; ++ pr_err("ptrace(PTRACE_GET_RSEQ_CONFIGURATION): rseq.flags != 0\n"); ++ } else { ++ kdat.has_ptrace_get_rseq_conf = true; ++ } ++ ++out: ++ kill(pid, SIGKILL); ++ waitpid(pid, NULL, 0); ++ return 0; ++} ++ + #define KERNDAT_CACHE_FILE KDAT_RUNDIR "/criu.kdat" + #define KERNDAT_CACHE_FILE_TMP KDAT_RUNDIR "/.criu.kdat" + +@@ -1378,6 +1415,10 @@ int kerndat_init(void) + pr_err("kerndat_has_rseq failed when initializing kerndat.\n"); + ret = -1; + } ++ if (!ret && kerndat_has_ptrace_get_rseq_conf()) { ++ pr_err("kerndat_has_ptrace_get_rseq_conf failed when initializing kerndat.\n"); ++ ret = -1; ++ } + kerndat_lsm(); + kerndat_mmap_min_addr(); + kerndat_files_stat(); +-- +2.30.0 + diff --git a/0006-rseq-initial-support.patch b/0006-rseq-initial-support.patch new file mode 100644 index 0000000000000000000000000000000000000000..4c6898552bdd29bfe4bdc0259fe8eefcb9f531ad --- /dev/null +++ b/0006-rseq-initial-support.patch @@ -0,0 +1,702 @@ +From e444c089ebfb03fb2b6d69a40322d31ab33c0597 Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: Wed, 2 Mar 2022 14:52:35 +0800 +Subject: [PATCH 06/16] rseq: initial support TODO: 1. properly handle case + when the kernel has rseq() support but has no + ptrace(PTRACE_GET_RSEQ_CONFIGURATION) support and user processes haven't used + rseq(). 2. properly handle "transient" states, when CRIU comes during rseq + was executed. We need test for this case with some "heavy" rseq + we need to + properly handle RSEQ_CS_* flags. 
+ +Fixes: #1696 + +Reported-by: Radostin Stoyanov +Suggested-by: Florian Weimer +Signed-off-by: Alexander Mikhalitsyn +--- + compel/include/uapi/ptrace.h | 16 +-- + criu/cr-dump.c | 99 ++++++++++++++++ + criu/cr-restore.c | 17 +++ + criu/include/linux/rseq.h | 144 +++++++++++++++++++++++ + criu/include/parasite.h | 7 ++ + criu/include/restorer.h | 7 ++ + criu/kerndat.c | 2 +- + criu/parasite-syscall.c | 11 ++ + criu/pie/parasite.c | 99 ++++++++++++++++ + criu/pie/restorer.c | 24 ++++ + images/Makefile | 1 + + images/core.proto | 2 + + images/rseq.proto | 9 ++ + 13 files changed, 429 insertions(+), 9 deletions(-) + create mode 100644 criu/include/linux/rseq.h + create mode 100644 images/rseq.proto + +diff --git a/compel/include/uapi/ptrace.h b/compel/include/uapi/ptrace.h +index bfe28c7..d807a92 100644 +--- a/compel/include/uapi/ptrace.h ++++ b/compel/include/uapi/ptrace.h +@@ -66,14 +66,14 @@ typedef struct { + } seccomp_metadata_t; + + #ifndef PTRACE_GET_RSEQ_CONFIGURATION +-#define PTRACE_GET_RSEQ_CONFIGURATION 0x420f +- +-struct ptrace_rseq_configuration { +- __u64 rseq_abi_pointer; +- __u32 rseq_abi_size; +- __u32 signature; +- __u32 flags; +- __u32 pad; ++#define PTRACE_GET_RSEQ_CONFIGURATION 0x420f ++ ++struct __ptrace_rseq_configuration { ++ uint64_t rseq_abi_pointer; ++ uint32_t rseq_abi_size; ++ uint32_t signature; ++ uint32_t flags; ++ uint32_t pad; + }; + #endif + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index f07fe6e..91dd08a 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -45,6 +45,7 @@ + #include "proc_parse.h" + #include "parasite.h" + #include "parasite-syscall.h" ++#include + #include "files.h" + #include "files-reg.h" + #include "shmem.h" +@@ -200,6 +201,25 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc) + return 0; + } + ++static int check_thread_rseq(pid_t tid, const struct parasite_check_rseq *ti_rseq, bool has_tc_rseq_entry) ++{ ++ if (!kdat.has_rseq || kdat.has_ptrace_get_rseq_conf) ++ return 0; ++ ++ pr_debug("%d has 
rseq_inited = %d\n", tid, ti_rseq->rseq_inited); ++ ++ /* ++ * We have no kdat.has_ptrace_get_rseq_conf and user ++ * process has rseq() used, let's fail dump. ++ */ ++ if (ti_rseq->rseq_inited) { ++ pr_err("%d has rseq but kernel lacks get_rseq_conf feature\n", tid); ++ return -1; ++ } ++ ++ return 0; ++} ++ + struct cr_imgset *glob_imgset; + + static int collect_fds(pid_t pid, struct parasite_drain_fd **dfds) +@@ -730,6 +750,17 @@ int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread + if (!ret) + ret = seccomp_dump_thread(pid, tc); + ++ /* ++ * We are dumping rseq() in the dump_thread_rseq() function, ++ * *before* processes gets infected (because of ptrace requests ++ * API restriction). At this point, if the kernel lacks ++ * kdat.has_ptrace_get_rseq_conf support we have to ensure ++ * that dumpable processes haven't initialized rseq() or ++ * fail dump if rseq() was used. ++ */ ++ if (!ret) ++ ret = check_thread_rseq(pid, &ti->rseq, !!tc->rseq_entry); ++ + return ret; + } + +@@ -1016,6 +1047,68 @@ static int dump_task_signals(pid_t pid, struct pstree_item *item) + return 0; + } + ++static int dump_thread_rseq(pid_t tid, RseqEntry **rseqep) ++{ ++ struct __ptrace_rseq_configuration rseq; ++ RseqEntry *rseqe = NULL; ++ int ret; ++ ++ /* ++ * If we are here it means that rseq() syscall is supported, ++ * but ptrace(PTRACE_GET_RSEQ_CONFIGURATION) isn't supported, ++ * we can just fail dump here. But this is bad idea, IMHO. ++ * ++ * So, we will try to detect if victim process was used rseq(). ++ * See check_rseq() and check_thread_rseq() functions. 
++ */ ++ if (!kdat.has_ptrace_get_rseq_conf) ++ return 0; ++ ++ ret = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, tid, sizeof(rseq), &rseq); ++ if (ret != sizeof(rseq)) { ++ pr_perror("ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) = %d", tid, ret); ++ return -1; ++ } ++ ++ if (rseq.flags != 0) { ++ pr_err("something wrong with ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) flags = 0x%x\n", tid, ++ rseq.flags); ++ return -1; ++ } ++ ++ pr_err("Dump rseq of %d: ptr = 0x%lx sign = 0x%x\n", tid, (unsigned long)rseq.rseq_abi_pointer, rseq.signature); ++ ++ rseqe = xmalloc(sizeof(*rseqe)); ++ if (!rseqe) ++ return -1; ++ ++ rseq_entry__init(rseqe); ++ ++ rseqe->rseq_abi_pointer = rseq.rseq_abi_pointer; ++ rseqe->rseq_abi_size = rseq.rseq_abi_size; ++ rseqe->signature = rseq.signature; ++ ++ *rseqep = rseqe; ++ ++ return 0; ++} ++ ++static int dump_task_rseq(pid_t pid, struct pstree_item *item) ++{ ++ int i; ++ ++ /* if rseq() syscall isn't supported then nothing to dump */ ++ if (!kdat.has_rseq) ++ return 0; ++ ++ for (i = 0; i < item->nr_threads; i++) { ++ if (dump_thread_rseq(item->threads[i].real, &item->core[i]->thread_core->rseq_entry)) ++ return -1; ++ } ++ ++ return 0; ++} ++ + static struct proc_pid_stat pps_buf; + + static int dump_task_threads(struct parasite_ctl *parasite_ctl, const struct pstree_item *item) +@@ -1304,6 +1397,12 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + goto err; + } + ++ ret = dump_task_rseq(pid, item); ++ if (ret) { ++ pr_err("Dump %d rseq failed %d\n", pid, ret); ++ goto err; ++ } ++ + parasite_ctl = parasite_infect_seized(pid, item, &vmas); + if (!parasite_ctl) { + pr_err("Can't infect (pid: %d) with parasite\n", pid); +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 5b645c1..b2bd044 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -2975,6 +2975,19 @@ static int prep_sched_info(struct rst_sched_param *sp, ThreadCoreEntry *tc) + return 0; + } + ++static int prep_rseq(struct rst_rseq_param 
*rseq, ThreadCoreEntry *tc) ++{ ++ /* compatibility with older CRIU versions */ ++ if (!tc->rseq_entry) ++ return 0; ++ ++ rseq->rseq_abi_pointer = tc->rseq_entry->rseq_abi_pointer; ++ rseq->rseq_abi_size = tc->rseq_entry->rseq_abi_size; ++ rseq->signature = tc->rseq_entry->signature; ++ ++ return 0; ++} ++ + static rlim_t decode_rlim(rlim_t ival) + { + return ival == -1 ? RLIM_INFINITY : ival; +@@ -3704,6 +3717,10 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns + thread_args[i].clear_tid_addr = CORE_THREAD_ARCH_INFO(tcore)->clear_tid_addr; + core_get_tls(tcore, &thread_args[i].tls); + ++ ret = prep_rseq(&thread_args[i].rseq, tcore->thread_core); ++ if (ret) ++ goto err; ++ + rst_reloc_creds(&thread_args[i], &creds_pos_next); + + thread_args[i].futex_rla = tcore->thread_core->futex_rla; +diff --git a/criu/include/linux/rseq.h b/criu/include/linux/rseq.h +new file mode 100644 +index 0000000..5c1706a +--- /dev/null ++++ b/criu/include/linux/rseq.h +@@ -0,0 +1,144 @@ ++/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ ++#ifndef _UAPI_LINUX_RSEQ_H ++#define _UAPI_LINUX_RSEQ_H ++ ++/* ++ * linux/rseq.h ++ * ++ * Restartable sequences system call API ++ * ++ * Copyright (c) 2015-2018 Mathieu Desnoyers ++ */ ++ ++#include ++#include ++ ++enum rseq_cpu_id_state { ++ RSEQ_CPU_ID_UNINITIALIZED = -1, ++ RSEQ_CPU_ID_REGISTRATION_FAILED = -2, ++}; ++ ++enum rseq_flags { ++ RSEQ_FLAG_UNREGISTER = (1 << 0), ++}; ++ ++enum rseq_cs_flags_bit { ++ RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0, ++ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1, ++ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2, ++}; ++ ++enum rseq_cs_flags { ++ RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT), ++ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT), ++ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT), ++}; ++ ++/* ++ * struct rseq_cs is aligned on 4 * 8 bytes to 
ensure it is always ++ * contained within a single cache-line. It is usually declared as ++ * link-time constant data. ++ */ ++struct rseq_cs { ++ /* Version of this structure. */ ++ __u32 version; ++ /* enum rseq_cs_flags */ ++ __u32 flags; ++ __u64 start_ip; ++ /* Offset from start_ip. */ ++ __u64 post_commit_offset; ++ __u64 abort_ip; ++} __attribute__((aligned(4 * sizeof(__u64)))); ++ ++/* ++ * struct rseq is aligned on 4 * 8 bytes to ensure it is always ++ * contained within a single cache-line. ++ * ++ * A single struct rseq per thread is allowed. ++ */ ++struct rseq { ++ /* ++ * Restartable sequences cpu_id_start field. Updated by the ++ * kernel. Read by user-space with single-copy atomicity ++ * semantics. This field should only be read by the thread which ++ * registered this data structure. Aligned on 32-bit. Always ++ * contains a value in the range of possible CPUs, although the ++ * value may not be the actual current CPU (e.g. if rseq is not ++ * initialized). This CPU number value should always be compared ++ * against the value of the cpu_id field before performing a rseq ++ * commit or returning a value read from a data structure indexed ++ * using the cpu_id_start value. ++ */ ++ __u32 cpu_id_start; ++ /* ++ * Restartable sequences cpu_id field. Updated by the kernel. ++ * Read by user-space with single-copy atomicity semantics. This ++ * field should only be read by the thread which registered this ++ * data structure. Aligned on 32-bit. Values ++ * RSEQ_CPU_ID_UNINITIALIZED and RSEQ_CPU_ID_REGISTRATION_FAILED ++ * have a special semantic: the former means "rseq uninitialized", ++ * and latter means "rseq initialization failed". 
This value is ++ * meant to be read within rseq critical sections and compared ++ * with the cpu_id_start value previously read, before performing ++ * the commit instruction, or read and compared with the ++ * cpu_id_start value before returning a value loaded from a data ++ * structure indexed using the cpu_id_start value. ++ */ ++ __u32 cpu_id; ++ /* ++ * Restartable sequences rseq_cs field. ++ * ++ * Contains NULL when no critical section is active for the current ++ * thread, or holds a pointer to the currently active struct rseq_cs. ++ * ++ * Updated by user-space, which sets the address of the currently ++ * active rseq_cs at the beginning of assembly instruction sequence ++ * block, and set to NULL by the kernel when it restarts an assembly ++ * instruction sequence block, as well as when the kernel detects that ++ * it is preempting or delivering a signal outside of the range ++ * targeted by the rseq_cs. Also needs to be set to NULL by user-space ++ * before reclaiming memory that contains the targeted struct rseq_cs. ++ * ++ * Read and set by the kernel. Set by user-space with single-copy ++ * atomicity semantics. This field should only be updated by the ++ * thread which registered this data structure. Aligned on 64-bit. ++ */ ++ union { ++ __u64 ptr64; ++#ifdef __LP64__ ++ __u64 ptr; ++#else ++ struct { ++#if (defined(__BYTE_ORDER) && (__BYTE_ORDER == __BIG_ENDIAN)) || defined(__BIG_ENDIAN) ++ __u32 padding; /* Initialized to zero. */ ++ __u32 ptr32; ++#else /* LITTLE */ ++ __u32 ptr32; ++ __u32 padding; /* Initialized to zero. */ ++#endif /* ENDIAN */ ++ } ptr; ++#endif ++ } rseq_cs; ++ ++ /* ++ * Restartable sequences flags field. ++ * ++ * This field should only be updated by the thread which ++ * registered this data structure. Read by the kernel. ++ * Mainly used for single-stepping through rseq critical sections ++ * with debuggers. 
++ * ++ * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT ++ * Inhibit instruction sequence block restart on preemption ++ * for this thread. ++ * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL ++ * Inhibit instruction sequence block restart on signal ++ * delivery for this thread. ++ * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE ++ * Inhibit instruction sequence block restart on migration for ++ * this thread. ++ */ ++ __u32 flags; ++} __attribute__((aligned(4 * sizeof(__u64)))); ++ ++#endif /* _UAPI_LINUX_RSEQ_H */ +diff --git a/criu/include/parasite.h b/criu/include/parasite.h +index 8107aa4..5fde809 100644 +--- a/criu/include/parasite.h ++++ b/criu/include/parasite.h +@@ -164,10 +164,17 @@ struct parasite_dump_creds { + unsigned int groups[0]; + }; + ++struct parasite_check_rseq { ++ bool has_rseq; ++ bool has_ptrace_get_rseq_conf; /* no need to check if supported */ ++ bool rseq_inited; ++}; ++ + struct parasite_dump_thread { + unsigned int *tid_addr; + pid_t tid; + tls_t tls; ++ struct parasite_check_rseq rseq; + stack_t sas; + int pdeath_sig; + char comm[TASK_COMM_LEN]; +diff --git a/criu/include/restorer.h b/criu/include/restorer.h +index c2ef8f0..c29d869 100644 +--- a/criu/include/restorer.h ++++ b/criu/include/restorer.h +@@ -45,6 +45,12 @@ struct rst_sched_param { + int prio; + }; + ++struct rst_rseq_param { ++ u64 rseq_abi_pointer; ++ u32 rseq_abi_size; ++ u32 signature; ++}; ++ + struct restore_posix_timer { + struct str_posix_timer spt; + struct itimerspec val; +@@ -99,6 +105,7 @@ struct thread_restore_args { + struct task_restore_args *ta; + + tls_t tls; ++ struct rst_rseq_param rseq; + + siginfo_t *siginfo; + unsigned int siginfo_n; +diff --git a/criu/kerndat.c b/criu/kerndat.c +index 4841387..af7113a 100644 +--- a/criu/kerndat.c ++++ b/criu/kerndat.c +@@ -837,7 +837,7 @@ static int kerndat_has_ptrace_get_rseq_conf(void) + { + pid_t pid; + int len; +- struct ptrace_rseq_configuration rseq; ++ struct __ptrace_rseq_configuration rseq; + + pid = fork_and_ptrace_attach(NULL); + if 
(pid < 0) +diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c +index 7175ade..ee4fa86 100644 +--- a/criu/parasite-syscall.c ++++ b/criu/parasite-syscall.c +@@ -132,6 +132,13 @@ static int alloc_groups_copy_creds(CredsEntry *ce, struct parasite_dump_creds *c + return ce->groups ? 0 : -ENOMEM; + } + ++static void init_parasite_rseq_arg(struct parasite_check_rseq *rseq) ++{ ++ rseq->has_rseq = kdat.has_rseq; ++ rseq->has_ptrace_get_rseq_conf = kdat.has_ptrace_get_rseq_conf; ++ rseq->rseq_inited = false; ++} ++ + int parasite_dump_thread_leader_seized(struct parasite_ctl *ctl, int pid, CoreEntry *core) + { + ThreadCoreEntry *tc = core->thread_core; +@@ -144,6 +151,8 @@ int parasite_dump_thread_leader_seized(struct parasite_ctl *ctl, int pid, CoreEn + pc = args->creds; + pc->cap_last_cap = kdat.last_cap; + ++ init_parasite_rseq_arg(&args->rseq); ++ + ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_THREAD, ctl); + if (ret < 0) + return ret; +@@ -197,6 +206,8 @@ int parasite_dump_thread_seized(struct parasite_thread_ctl *tctl, struct parasit + + compel_arch_get_tls_thread(tctl, &args->tls); + ++ init_parasite_rseq_arg(&args->rseq); ++ + ret = compel_run_in_thread(tctl, PARASITE_CMD_DUMP_THREAD); + if (ret) { + pr_err("Can't init thread in parasite %d\n", pid); +diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c +index bc0a33c..e49958b 100644 +--- a/criu/pie/parasite.c ++++ b/criu/pie/parasite.c +@@ -8,6 +8,8 @@ + #include + #include + ++#include "linux/rseq.h" ++ + #include "common/config.h" + #include "int.h" + #include "types.h" +@@ -167,6 +169,7 @@ static int dump_posix_timers(struct parasite_dump_posix_timers_args *args) + } + + static int dump_creds(struct parasite_dump_creds *args); ++static int check_rseq(struct parasite_check_rseq *rseq); + + static int dump_thread_common(struct parasite_dump_thread *ti) + { +@@ -197,6 +200,12 @@ static int dump_thread_common(struct parasite_dump_thread *ti) + goto out; + } + ++ ret = check_rseq(&ti->rseq); ++ if 
(ret) { ++ pr_err("Unable to check if rseq() is initialized: %d\n", ret); ++ goto out; ++ } ++ + ret = dump_creds(ti->creds); + out: + return ret; +@@ -313,6 +322,96 @@ grps_err: + return -1; + } + ++static int check_rseq(struct parasite_check_rseq *rseq) ++{ ++ int ret; ++ unsigned long rseq_abi_pointer; ++ unsigned long rseq_abi_size; ++ uint32_t rseq_signature; ++ void *addr; ++ ++ /* no need to do hacky check if we can get all info from ptrace() */ ++ if (!rseq->has_rseq || rseq->has_ptrace_get_rseq_conf) ++ return 0; ++ ++ /* ++ * We need to determine if victim process has rseq() ++ * initialized, but we have no *any* proper kernel interface ++ * supported at this point. ++ * Our plan: ++ * 1. We know that if we call rseq() syscall and process already ++ * has current->rseq filled, then we get: ++ * -EINVAL if current->rseq != rseq || rseq_len != sizeof(*rseq), ++ * -EPERM if current->rseq_sig != sig), ++ * -EBUSY if current->rseq == rseq && rseq_len == sizeof(*rseq) && ++ * current->rseq_sig != sig ++ * if current->rseq == NULL (rseq() wasn't used) then we go to: ++ * IS_ALIGNED(rseq ...) check, if we fail it we get -EINVAL and it ++ * will be hard to distinguish case when rseq() was initialized or not. ++ * Let's construct arguments payload ++ * with: ++ * 1. correct rseq_abi_size ++ * 2. aligned and correct rseq_abi_pointer ++ * And see what rseq() return to us. ++ * If ret value is: ++ * 0: it means that rseq *wasn't* used and we successfuly registered it, ++ * -EINVAL or : it means that rseq is already initialized, ++ * so we *have* to dump it. But as we have has_ptrace_get_rseq_conf = false, ++ * we should just fail dump as it's unsafe to skip rseq() dump for processes ++ * with rseq() initialized. 
++ * -EPERM or -EBUSY: should not happen as we take a fresh memory area for rseq ++ */ ++ addr = (void *)sys_mmap(NULL, sizeof(struct rseq), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); ++ if (addr == MAP_FAILED) { ++ pr_err("mmap() failed for struct rseq ret = %lx\n", (unsigned long)addr); ++ return -1; ++ } ++ ++ memset(addr, 0, sizeof(struct rseq)); ++ ++ /* sys_mmap returns page aligned addresses */ ++ rseq_abi_pointer = (unsigned long)addr; ++ rseq_abi_size = (unsigned long)sizeof(struct rseq); ++ /* it's not so important to have unique signature for us, ++ * because rseq_abi_pointer is guaranteed to be unique ++ */ ++ rseq_signature = 0x12345612; ++ ++ pr_info("\ttrying sys_rseq(%lx, %lx, %x, %x)\n", rseq_abi_pointer, rseq_abi_size, 0, rseq_signature); ++ ret = sys_rseq((void *)rseq_abi_pointer, rseq_abi_size, 0, rseq_signature); ++ if (ret) { ++ if (ret == -EINVAL) { ++ pr_info("\trseq is initialized in the victim\n"); ++ rseq->rseq_inited = true; ++ ++ ret = 0; ++ } else { ++ pr_err("\tunexpected failure of sys_rseq(%lx, %lx, %x, %x) = %d\n", rseq_abi_pointer, ++ rseq_abi_size, 0, rseq_signature, ret); ++ ++ ret = -1; ++ } ++ } else { ++ ret = sys_rseq((void *)rseq_abi_pointer, sizeof(struct rseq), RSEQ_FLAG_UNREGISTER, rseq_signature); ++ if (ret) { ++ pr_err("\tfailed to unregister sys_rseq(%lx, %lx, %x, %x) = %d\n", rseq_abi_pointer, ++ rseq_abi_size, RSEQ_FLAG_UNREGISTER, rseq_signature, ret); ++ ++ ret = -1; ++ goto out; ++ } ++ ++ pr_info("\tsys_rseq succeed, let's unregister it back... 
ok Error\n"); ++ pr_info("\trseq is non-initialized in the victim Error\n"); ++ rseq->rseq_inited = false; ++ ret = 0; ++ } ++ ++out: ++ sys_munmap(addr, sizeof(struct rseq)); ++ return ret; ++} ++ + static int fill_fds_fown(int fd, struct fd_opts *p) + { + int flags, ret; +diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c +index fbc89fe..368b5a0 100644 +--- a/criu/pie/restorer.c ++++ b/criu/pie/restorer.c +@@ -459,6 +459,27 @@ static int restore_cpu_affinity(struct task_restore_args *args) + return 0; + } + ++static int restore_rseq(struct rst_rseq_param *rseq) ++{ ++ int ret; ++ ++ if (!rseq->rseq_abi_pointer) { ++ pr_debug("rseq: nothing to restore\n"); ++ return 0; ++ } ++ ++ pr_debug("rseq: rseq_abi_pointer = %lx signature = %x\n", (unsigned long)rseq->rseq_abi_pointer, rseq->signature); ++ ++ ret = sys_rseq(decode_pointer(rseq->rseq_abi_pointer), rseq->rseq_abi_size, 0, rseq->signature); ++ if (ret) { ++ pr_err("failed sys_rseq(%lx, %lx, %x, %x) = %d\n", (unsigned long)rseq->rseq_abi_pointer, ++ (unsigned long)rseq->rseq_abi_size, 0, rseq->signature, ret); ++ return -1; ++ } ++ ++ return 0; ++} ++ + static int restore_seccomp_filter(pid_t tid, struct thread_restore_args *args) + { + unsigned int flags = args->seccomp_force_tsync ? 
SECCOMP_FILTER_FLAG_TSYNC : 0; +@@ -583,6 +604,9 @@ static int restore_thread_common(struct thread_restore_args *args) + + restore_tls(&args->tls); + ++ if (restore_rseq(&args->rseq)) ++ return -1; ++ + return 0; + } + +diff --git a/images/Makefile b/images/Makefile +index 2eaeb7c..004e22e 100644 +--- a/images/Makefile ++++ b/images/Makefile +@@ -71,6 +71,7 @@ proto-obj-y += img-streamer.o + proto-obj-y += bpfmap-file.o + proto-obj-y += bpfmap-data.o + proto-obj-y += apparmor.o ++proto-obj-y += rseq.o + + CFLAGS += -iquote $(obj)/ + +diff --git a/images/core.proto b/images/core.proto +index 39e7f32..b66230e 100644 +--- a/images/core.proto ++++ b/images/core.proto +@@ -14,6 +14,7 @@ import "timer.proto"; + import "creds.proto"; + import "sa.proto"; + import "siginfo.proto"; ++import "rseq.proto"; + + import "opts.proto"; + +@@ -106,6 +107,7 @@ message thread_core_entry { + optional string comm = 13; + optional uint64 blk_sigset_extended = 14; + required thread_allowedcpus_entry allowed_cpus = 15; ++ optional rseq_entry rseq_entry = 16; + } + + message task_rlimits_entry { +diff --git a/images/rseq.proto b/images/rseq.proto +new file mode 100644 +index 0000000..be28004 +--- /dev/null ++++ b/images/rseq.proto +@@ -0,0 +1,9 @@ ++// SPDX-License-Identifier: MIT ++ ++syntax = "proto2"; ++ ++message rseq_entry { ++ required uint64 rseq_abi_pointer = 1; ++ required uint32 rseq_abi_size = 2; ++ required uint32 signature = 3; ++} +-- +2.30.0 + diff --git a/0007-zdtm-add-simple-test-for-rseq-C-R.patch b/0007-zdtm-add-simple-test-for-rseq-C-R.patch new file mode 100644 index 0000000000000000000000000000000000000000..bb317ed499f4a7d9dbc17a12b4e3200c3eb574c5 --- /dev/null +++ b/0007-zdtm-add-simple-test-for-rseq-C-R.patch @@ -0,0 +1,217 @@ +From 5005c08e32dc29dbf0b3a2a582e75d249c190d96 Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: Wed, 2 Mar 2022 14:54:28 +0800 +Subject: [PATCH 07/16] zdtm: add simple test for rseq C/R Signed-off-by: + Alexander Mikhalitsyn + +--- + 
test/zdtm/static/Makefile | 1 + + test/zdtm/static/rseq00.c | 174 +++++++++++++++++++++++ + test/zdtm/static/rseq00.desc | 1 + + 3 files changed, 176 insertions(+) + create mode 100644 test/zdtm/static/rseq00.c + create mode 100644 test/zdtm/static/rseq00.desc + +diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile +index 70123cf..563d947 100644 +--- a/test/zdtm/static/Makefile ++++ b/test/zdtm/static/Makefile +@@ -61,6 +61,7 @@ TST_NOFILE := \ + pthread02 \ + pthread_timers \ + pthread_timers_h \ ++ rseq00 \ + vdso00 \ + vdso01 \ + vdso02 \ +diff --git a/test/zdtm/static/rseq00.c b/test/zdtm/static/rseq00.c +new file mode 100644 +index 0000000..26f41a2 +--- /dev/null ++++ b/test/zdtm/static/rseq00.c +@@ -0,0 +1,174 @@ ++/* ++ * test for rseq() syscall ++ * See also https://www.efficios.com/blog/2019/02/08/linux-restartable-sequences/ ++ * https://github.com/torvalds/linux/commit/d7822b1e24f2df5df98c76f0e94a5416349ff759 ++ */ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "zdtmtst.h" ++ ++#if defined(__x86_64__) ++ ++const char *test_doc = "Check that rseq() basic C/R works"; ++const char *test_author = "Alexander Mikhalitsyn "; ++/* parts of code borrowed from https://www.efficios.com/blog/2019/02/08/linux-restartable-sequences/ */ ++ ++/* some useful definitions from kernel uapi */ ++enum rseq_flags { ++ RSEQ_FLAG_UNREGISTER = (1 << 0), ++}; ++ ++struct rseq { ++ uint32_t cpu_id_start; ++ uint32_t cpu_id; ++ uint64_t rseq_cs; ++ uint32_t flags; ++} __attribute__((aligned(4 * sizeof(uint64_t)))); ++ ++#ifndef __NR_rseq ++#define __NR_rseq 334 ++#endif ++/* EOF */ ++ ++static __thread volatile struct rseq __rseq_abi; ++ ++#define RSEQ_SIG 0x53053053 ++ ++static int sys_rseq(volatile struct rseq *rseq_abi, uint32_t rseq_len, int flags, uint32_t sig) ++{ ++ return syscall(__NR_rseq, rseq_abi, rseq_len, flags, sig); ++} ++ ++static void 
register_thread(void) ++{ ++ int rc; ++ rc = sys_rseq(&__rseq_abi, sizeof(struct rseq), 0, RSEQ_SIG); ++ if (rc) { ++ fail("Failed to register rseq"); ++ exit(1); ++ } ++} ++ ++static void unregister_thread(void) ++{ ++ int rc; ++ rc = sys_rseq(&__rseq_abi, sizeof(struct rseq), RSEQ_FLAG_UNREGISTER, RSEQ_SIG); ++ if (rc) { ++ fail("Failed to unregister rseq"); ++ exit(1); ++ } ++} ++ ++static void check_thread(void) ++{ ++ int rc; ++ rc = sys_rseq(&__rseq_abi, sizeof(struct rseq), 0, RSEQ_SIG); ++ if (!(rc && errno == EBUSY)) { ++ fail("Failed to check rseq %d", rc); ++ exit(1); ++ } ++} ++ ++#define RSEQ_ACCESS_ONCE(x) (*(__volatile__ __typeof__(x) *)&(x)) ++ ++static int rseq_addv(intptr_t *v, intptr_t count, int cpu) ++{ ++ /* clang-format off */ ++ __asm__ __volatile__ goto( ++ ".pushsection __rseq_table, \"aw\"\n\t" ++ ".balign 32\n\t" ++ "cs_obj:\n\t" ++ /* version, flags */ ++ ".long 0, 0\n\t" ++ /* start_ip, post_commit_ip, abort_ip */ ++ ".quad 1f, (2f-1f), 4f\n\t" ++ ".popsection\n\t" ++ "1:\n\t" ++ "leaq cs_obj(%%rip), %%rax\n\t" ++ "movq %%rax, %[rseq_cs]\n\t" ++ "cmpl %[cpu_id], %[current_cpu_id]\n\t" ++ "jnz 4f\n\t" ++ "addq %[count], %[v]\n\t" /* final store */ ++ "2:\n\t" ++ ".pushsection __rseq_failure, \"ax\"\n\t" ++ /* Disassembler-friendly signature: nopl (%rip). 
*/ ++ ".byte 0x0f, 0x1f, 0x05\n\t" ++ ".long 0x53053053\n\t" /* RSEQ_FLAGS */ ++ "4:\n\t" ++ "jmp abort\n\t" ++ ".popsection\n\t" ++ : /* gcc asm goto does not allow outputs */ ++ : [cpu_id] "r" (cpu), ++ [current_cpu_id] "m" (__rseq_abi.cpu_id), ++ [rseq_cs] "m" (__rseq_abi.rseq_cs), ++ /* final store input */ ++ [v] "m" (*v), ++ [count] "er" (count) ++ : "memory", "cc", "rax" ++ : abort ++ ); ++ /* clang-format on */ ++ ++ return 0; ++abort: ++ return -1; ++} ++ ++int main(int argc, char *argv[]) ++{ ++ int cpu, ret; ++ intptr_t *cpu_data; ++ long nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); ++ ++ test_init(argc, argv); ++ ++ cpu_data = calloc(nr_cpus, sizeof(*cpu_data)); ++ if (!cpu_data) { ++ fail("calloc"); ++ exit(EXIT_FAILURE); ++ } ++ ++ register_thread(); ++ ++ test_daemon(); ++ test_waitsig(); ++ ++ check_thread(); ++ ++ cpu = RSEQ_ACCESS_ONCE(__rseq_abi.cpu_id_start); ++ ret = rseq_addv(&cpu_data[cpu], 2, cpu); ++ if (ret) ++ fail("Failed to increment per-cpu counter"); ++ else ++ test_msg("cpu_data[%d] == %ld\n", cpu, (long int)cpu_data[cpu]); ++ ++ if (cpu_data[cpu] == 2) ++ pass(); ++ else ++ fail(); ++ ++ return 0; ++} ++ ++#else ++ ++int main(int argc, char *argv[]) ++{ ++ test_init(argc, argv); ++ skip("Unsupported arch"); ++ return 0; ++} ++ ++#endif +\ No newline at end of file +diff --git a/test/zdtm/static/rseq00.desc b/test/zdtm/static/rseq00.desc +new file mode 100644 +index 0000000..0324fa3 +--- /dev/null ++++ b/test/zdtm/static/rseq00.desc +@@ -0,0 +1 @@ ++{'flavor': 'h', 'arch': 'x86_64', 'feature': 'get_rseq_conf'} +-- +2.30.0 + diff --git a/0008-ci-add-Fedora-Rawhide-based-test-on-Cirrus.patch b/0008-ci-add-Fedora-Rawhide-based-test-on-Cirrus.patch new file mode 100644 index 0000000000000000000000000000000000000000..2f6b6420e6d98d2ba9bbbe3842fa420e2d6be905 --- /dev/null +++ b/0008-ci-add-Fedora-Rawhide-based-test-on-Cirrus.patch @@ -0,0 +1,123 @@ +From 56fad25776a652e143175a22676a1f909476c880 Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: 
Wed, 2 Mar 2022 14:57:16 +0800 +Subject: [PATCH 08/16] ci: add Fedora Rawhide based test on Cirrus We have + ability to use nested virtualization on Cirrus, and already have "Vagrant + Fedora based test (no VDSO)" test, let's do analogical for Fedora Rawhide to + get fresh kernel. + +Suggested-by: Adrian Reber +Signed-off-by: Alexander Mikhalitsyn +--- + .cirrus.yml | 21 +++++++++++++++++++++ + scripts/ci/Makefile | 7 +++++-- + scripts/ci/run-ci-tests.sh | 5 +++++ + scripts/ci/vagrant.sh | 21 +++++++++++++++++++++ + 4 files changed, 52 insertions(+), 2 deletions(-) + +diff --git a/.cirrus.yml b/.cirrus.yml +index 671178d..9716e58 100644 +--- a/.cirrus.yml ++++ b/.cirrus.yml +@@ -19,6 +19,27 @@ task: + build_script: | + make -C scripts/ci vagrant-fedora-no-vdso + ++task: ++ name: Vagrant Fedora Rawhide based test ++ environment: ++ HOME: "/root" ++ CIRRUS_WORKING_DIR: "/tmp/criu" ++ ++ compute_engine_instance: ++ image_project: cirrus-images ++ image: family/docker-kvm ++ platform: linux ++ cpu: 4 ++ memory: 16G ++ nested_virtualization: true ++ ++ setup_script: | ++ scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker ++ sudo kvm-ok ++ ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto ++ build_script: | ++ make -C scripts/ci vagrant-fedora-rawhide ++ + task: + name: CentOS 8 based test + environment: +diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile +index 02b4d87..9c9264d 100644 +--- a/scripts/ci/Makefile ++++ b/scripts/ci/Makefile +@@ -41,7 +41,7 @@ export CONTAINER_TERMINAL + ifeq ($(UNAME),x86_64) + # On anything besides x86_64 Travis is running unprivileged LXD + # containers which do not support running docker with '--privileged'. 
+- CONTAINER_OPTS := --rm $(CONTAINER_TERMINAL) --privileged -v /lib/modules:/lib/modules --tmpfs /run ++ CONTAINER_OPTS := --rm $(CONTAINER_TERMINAL) --privileged --userns=host --cgroupns=host -v /lib/modules:/lib/modules --tmpfs /run + else + CONTAINER_OPTS := --rm -v /lib/modules:/lib/modules --tmpfs /run + endif +@@ -92,7 +92,10 @@ setup-vagrant: + vagrant-fedora-no-vdso: setup-vagrant + ./vagrant.sh fedora-no-vdso + +-.PHONY: setup-vagrant vagrant-fedora-no-vdso ++vagrant-fedora-rawhide: setup-vagrant ++ ./vagrant.sh fedora-rawhide ++ ++.PHONY: setup-vagrant vagrant-fedora-no-vdso vagrant-fedora-rawhide + + %: + $(MAKE) -C ../build $@$(target-suffix) +diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh +index 7c66e68..95b4ec7 100755 +--- a/scripts/ci/run-ci-tests.sh ++++ b/scripts/ci/run-ci-tests.sh +@@ -194,6 +194,11 @@ if [ "${STREAM_TEST}" = "1" ]; then + exit 0 + fi + ++# print some useful debug info ++cat /proc/self/status ++ls -la /proc/self/ns ++cat /proc/self/cgroup ++ + # shellcheck disable=SC2086 + ./test/zdtm.py run -a -p 2 --keep-going $ZDTM_OPTS + +diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh +index 839b100..f961b8d 100755 +--- a/scripts/ci/vagrant.sh ++++ b/scripts/ci/vagrant.sh +@@ -58,4 +58,25 @@ fedora-no-vdso() { + ssh default 'cd /vagrant/criu/test; sudo ./zdtm.py run -t zdtm/transition/pidfd_store_sk --rpc --pre 2' + } + ++fedora-rawhide() { ++ #ssh default sudo grubby --update-kernel ALL --args="selinux=0 systemd.unified_cgroup_hierarchy=0" ++ ssh default sudo grubby --update-kernel ALL ++ # ++ # Workaround the problem: ++ # error running container: error from /usr/bin/crun creating container for [...]: sd-bus call: Transport endpoint is not connected ++ # Let's just use runc instead of crun ++ # see also https://github.com/kata-containers/tests/issues/4283 ++ # ++ ssh default 'sudo dnf remove -y crun || true' ++ ssh default sudo dnf install -y podman runc ++ vagrant reload ++ #ssh default sudo setenforce 0 
++ ssh default cat /proc/cmdline ++ ssh default ls -la /proc/self/ns ++ ssh default sudo cat /proc/self/status ++ ssh default sudo cat /proc/self/cgroup ++ #ssh default sudo capsh --print ++ ssh default 'cd /vagrant; tar xf criu.tar; cd criu; sudo -E make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined"' ++} ++ + $1 +-- +2.30.0 + diff --git a/0009-include-add-thread_pointer.h-from-Glibc.patch b/0009-include-add-thread_pointer.h-from-Glibc.patch new file mode 100644 index 0000000000000000000000000000000000000000..51513496c42bd3b37918e71b05d328814955ee21 --- /dev/null +++ b/0009-include-add-thread_pointer.h-from-Glibc.patch @@ -0,0 +1,244 @@ +From 99da2f789ca92aa52eeca07b97aee2cbd3d60fca Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: Wed, 2 Mar 2022 15:00:07 +0800 +Subject: [PATCH 09/16] include: add thread_pointer.h from Glibc Implementation + was taken from the Glibc. + +https://sourceware.org/git/?p=glibc.git;a=commitdiff;h=8dbeb0561eeb876f557ac9eef5721912ec074ea5 +https://sourceware.org/git/?p=glibc.git;a=commitdiff;h=cb976fba4c51ede7bf8cee5035888527c308dfbc + +Signed-off-by: Alexander Mikhalitsyn +--- + .../arch/aarch64/include/asm/thread_pointer.h | 27 ++++++++++++++ + .../arch/arm/include/asm/thread_pointer.h | 27 ++++++++++++++ + .../arch/mips/include/asm/thread_pointer.h | 27 ++++++++++++++ + .../arch/ppc64/include/asm/thread_pointer.h | 33 +++++++++++++++++ + .../arch/s390/include/asm/thread_pointer.h | 27 ++++++++++++++ + .../arch/x86/include/asm/thread_pointer.h | 37 +++++++++++++++++++ + 6 files changed, 178 insertions(+) + create mode 100644 criu/arch/aarch64/include/asm/thread_pointer.h + create mode 100644 criu/arch/arm/include/asm/thread_pointer.h + create mode 100644 criu/arch/mips/include/asm/thread_pointer.h + create mode 100644 criu/arch/ppc64/include/asm/thread_pointer.h + create mode 100644 criu/arch/s390/include/asm/thread_pointer.h + create mode 100644 
criu/arch/x86/include/asm/thread_pointer.h + +diff --git a/criu/arch/aarch64/include/asm/thread_pointer.h b/criu/arch/aarch64/include/asm/thread_pointer.h +new file mode 100644 +index 0000000..f7e0706 +--- /dev/null ++++ b/criu/arch/aarch64/include/asm/thread_pointer.h +@@ -0,0 +1,27 @@ ++/* __thread_pointer definition. Generic version. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library. If not, see ++ . */ ++ ++#ifndef _SYS_THREAD_POINTER_H ++#define _SYS_THREAD_POINTER_H ++ ++static inline void *__criu_thread_pointer(void) ++{ ++ return __builtin_thread_pointer(); ++} ++ ++#endif /* _SYS_THREAD_POINTER_H */ +diff --git a/criu/arch/arm/include/asm/thread_pointer.h b/criu/arch/arm/include/asm/thread_pointer.h +new file mode 100644 +index 0000000..f7e0706 +--- /dev/null ++++ b/criu/arch/arm/include/asm/thread_pointer.h +@@ -0,0 +1,27 @@ ++/* __thread_pointer definition. Generic version. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. 
++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library. If not, see ++ . */ ++ ++#ifndef _SYS_THREAD_POINTER_H ++#define _SYS_THREAD_POINTER_H ++ ++static inline void *__criu_thread_pointer(void) ++{ ++ return __builtin_thread_pointer(); ++} ++ ++#endif /* _SYS_THREAD_POINTER_H */ +diff --git a/criu/arch/mips/include/asm/thread_pointer.h b/criu/arch/mips/include/asm/thread_pointer.h +new file mode 100644 +index 0000000..f7e0706 +--- /dev/null ++++ b/criu/arch/mips/include/asm/thread_pointer.h +@@ -0,0 +1,27 @@ ++/* __thread_pointer definition. Generic version. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library. If not, see ++ . 
*/ ++ ++#ifndef _SYS_THREAD_POINTER_H ++#define _SYS_THREAD_POINTER_H ++ ++static inline void *__criu_thread_pointer(void) ++{ ++ return __builtin_thread_pointer(); ++} ++ ++#endif /* _SYS_THREAD_POINTER_H */ +diff --git a/criu/arch/ppc64/include/asm/thread_pointer.h b/criu/arch/ppc64/include/asm/thread_pointer.h +new file mode 100644 +index 0000000..304516f +--- /dev/null ++++ b/criu/arch/ppc64/include/asm/thread_pointer.h +@@ -0,0 +1,33 @@ ++/* __thread_pointer definition. powerpc version. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library. If not, see ++ . */ ++ ++#ifndef _SYS_THREAD_POINTER_H ++#define _SYS_THREAD_POINTER_H ++ ++#ifdef __powerpc64__ ++register void *__thread_register asm("r13"); ++#else ++register void *__thread_register asm("r2"); ++#endif ++ ++static inline void *__criu_thread_pointer(void) ++{ ++ return __thread_register; ++} ++ ++#endif /* _SYS_THREAD_POINTER_H */ +\ No newline at end of file +diff --git a/criu/arch/s390/include/asm/thread_pointer.h b/criu/arch/s390/include/asm/thread_pointer.h +new file mode 100644 +index 0000000..f7e0706 +--- /dev/null ++++ b/criu/arch/s390/include/asm/thread_pointer.h +@@ -0,0 +1,27 @@ ++/* __thread_pointer definition. Generic version. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library. If not, see ++ . */ ++ ++#ifndef _SYS_THREAD_POINTER_H ++#define _SYS_THREAD_POINTER_H ++ ++static inline void *__criu_thread_pointer(void) ++{ ++ return __builtin_thread_pointer(); ++} ++ ++#endif /* _SYS_THREAD_POINTER_H */ +diff --git a/criu/arch/x86/include/asm/thread_pointer.h b/criu/arch/x86/include/asm/thread_pointer.h +new file mode 100644 +index 0000000..08603ae +--- /dev/null ++++ b/criu/arch/x86/include/asm/thread_pointer.h +@@ -0,0 +1,37 @@ ++/* __thread_pointer definition. x86 version. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library. If not, see ++ . 
*/ ++ ++#ifndef _SYS_THREAD_POINTER_H ++#define _SYS_THREAD_POINTER_H ++ ++static inline void *__criu_thread_pointer(void) ++{ ++#if __GNUC_PREREQ(11, 1) ++ return __builtin_thread_pointer(); ++#else ++ void *__result; ++#ifdef __x86_64__ ++ __asm__("mov %%fs:0, %0" : "=r"(__result)); ++#else ++ __asm__("mov %%gs:0, %0" : "=r"(__result)); ++#endif ++ return __result; ++#endif /* !GCC 11 */ ++} ++ ++#endif /* _SYS_THREAD_POINTER_H */ +\ No newline at end of file +-- +2.30.0 + diff --git a/0010-clone-noasan-unregister-rseq-at-the-thread-start-for.patch b/0010-clone-noasan-unregister-rseq-at-the-thread-start-for.patch new file mode 100644 index 0000000000000000000000000000000000000000..a8e8e995795c5e1940e84a1424df1cb3c707f7a7 --- /dev/null +++ b/0010-clone-noasan-unregister-rseq-at-the-thread-start-for.patch @@ -0,0 +1,102 @@ +From d43ad9913c19afa6d80cb8124015d47361152db8 Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: Wed, 2 Mar 2022 15:00:43 +0800 +Subject: [PATCH 10/16] clone-noasan: unregister rseq at the thread start for + new glibc Fresh glibc does rseq registration by default during + start_thread(). [ see + https://sourceware.org/git/?p=glibc.git;a=commitdiff;h=95e114a0919d844d8fe07839cb6538b7f5ee920e + ] + +This cause process crashes during memory restore procedure, because +memory which corresponds to the struct rseq will be overwritten. 
+ +See also +("nptl: Add public rseq symbols and ") +https://sourceware.org/git?p=glibc.git;a=commit;h=c901c3e764d7c7079f006b4e21e877d5036eb4f5 +("nptl: Add for defining __thread_pointer") +https://sourceware.org/git?p=glibc.git;a=commit;h=8dbeb0561eeb876f557ac9eef5721912ec074ea5 + +Signed-off-by: Alexander Mikhalitsyn +--- + criu/clone-noasan.c | 42 +++++++++++++++++++++++++++++++-- + 1 file changed, 40 insertions(+), 2 deletions(-) + +diff --git a/criu/clone-noasan.c b/criu/clone-noasan.c +index d657ea2..5f8dd1b 100644 +--- a/criu/clone-noasan.c ++++ b/criu/clone-noasan.c +@@ -2,6 +2,13 @@ + #include + #include + ++#ifdef __has_include ++#if __has_include ("sys/rseq.h") ++#include ++#include "asm/thread_pointer.h" ++#endif ++#endif ++ + #include + + #include "sched.h" +@@ -34,16 +41,45 @@ + * ... wait for process to finish ... + * unlock_last_pid + */ ++ ++#if defined(RSEQ_SIG) ++static inline void unregister_glibc_rseq(void) ++{ ++ /* unregister rseq */ ++ syscall(__NR_rseq, (void *)((char *)__criu_thread_pointer() + __rseq_offset), __rseq_size, 1, RSEQ_SIG); ++} ++#else ++static inline void unregister_glibc_rseq(void) ++{ ++} ++#endif ++ ++struct call_fn_args { ++ int (*fn)(void *); ++ void *arg; ++}; ++ ++int call_fn(void *arg) ++{ ++ struct call_fn_args *cargs = arg; ++ unregister_glibc_rseq(); ++ return cargs->fn(cargs->arg); ++} ++ + int clone_noasan(int (*fn)(void *), int flags, void *arg) + { + void *stack_ptr = (void *)round_down((unsigned long)&stack_ptr - 1024, 16); ++ struct call_fn_args a = { ++ .fn = fn, ++ .arg = arg, ++ }; + + BUG_ON((flags & CLONE_VM) && !(flags & CLONE_VFORK)); + /* + * Reserve some bytes for clone() internal needs + * and use as stack the address above this area. 
+ */ +- return clone(fn, stack_ptr, flags, arg); ++ return clone(call_fn, stack_ptr, flags, (void *)&a); + } + + int clone3_with_pid_noasan(int (*fn)(void *), void *arg, int flags, int exit_signal, pid_t pid) +@@ -78,7 +114,9 @@ int clone3_with_pid_noasan(int (*fn)(void *), void *arg, int flags, int exit_sig + c_args.set_tid = ptr_to_u64(&pid); + c_args.set_tid_size = 1; + pid = syscall(__NR_clone3, &c_args, sizeof(c_args)); +- if (pid == 0) ++ if (pid == 0) { ++ unregister_glibc_rseq(); + exit(fn(arg)); ++ } + return pid; + } +-- +2.30.0 + diff --git a/0011-zdtm-static-rseq00-fix-rseq-test-when-linking-with-a.patch b/0011-zdtm-static-rseq00-fix-rseq-test-when-linking-with-a.patch new file mode 100644 index 0000000000000000000000000000000000000000..e5745acef268a8f0b2677e54b27cd03bbacfe0b3 --- /dev/null +++ b/0011-zdtm-static-rseq00-fix-rseq-test-when-linking-with-a.patch @@ -0,0 +1,158 @@ +From 4f4d5acc34046954aea9e8ea10b5f71ff5f0fbd5 Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: Wed, 2 Mar 2022 15:01:34 +0800 +Subject: [PATCH 11/16] zdtm/static/rseq00: fix rseq test when linking with a + fresh Glibc Fresh Glibc does rseq() register by default. We need to + unregister rseq before registering our own. 
+ +Signed-off-by: Alexander Mikhalitsyn +--- + test/zdtm/static/rseq00.c | 76 ++++++++++++++++++++------- + 1 file changed, 58 insertions(+), 18 deletions(-) + +diff --git a/test/zdtm/static/rseq00.c b/test/zdtm/static/rseq00.c +index 26f41a2..87053b8 100644 +--- a/test/zdtm/static/rseq00.c ++++ b/test/zdtm/static/rseq00.c +@@ -19,13 +19,48 @@ + + #include "zdtmtst.h" + +-#if defined(__x86_64__) ++#ifdef __has_include ++#if __has_include("sys/rseq.h") ++#include ++#endif ++#endif ++ ++#if defined(__i386__) || defined(__x86_64__) ++ ++#if defined(RSEQ_SIG) ++static inline void *__criu_thread_pointer(void) ++{ ++#if __GNUC_PREREQ(11, 1) ++ return __builtin_thread_pointer(); ++#else ++ void *__result; ++#ifdef __x86_64__ ++ __asm__("mov %%fs:0, %0" : "=r"(__result)); ++#else ++ __asm__("mov %%gs:0, %0" : "=r"(__result)); ++#endif ++ return __result; ++#endif /* !GCC 11 */ ++} ++ ++static inline void unregister_glibc_rseq(void) ++{ ++ /* unregister rseq */ ++ syscall(__NR_rseq, (void *)((char *)__criu_thread_pointer() + __rseq_offset), __rseq_size, 1, RSEQ_SIG); ++} ++#else ++static inline void unregister_glibc_rseq(void) ++{ ++} ++#endif + + const char *test_doc = "Check that rseq() basic C/R works"; + const char *test_author = "Alexander Mikhalitsyn "; + /* parts of code borrowed from https://www.efficios.com/blog/2019/02/08/linux-restartable-sequences/ */ + + /* some useful definitions from kernel uapi */ ++#ifndef RSEQ_SIG ++ + enum rseq_flags { + RSEQ_FLAG_UNREGISTER = (1 << 0), + }; +@@ -37,14 +72,21 @@ struct rseq { + uint32_t flags; + } __attribute__((aligned(4 * sizeof(uint64_t)))); + ++#define RSEQ_SIG 0x53053053 ++ ++#endif ++ + #ifndef __NR_rseq + #define __NR_rseq 334 + #endif + /* EOF */ + +-static __thread volatile struct rseq __rseq_abi; ++#define RSEQ_TLS_ALLOC 0 + +-#define RSEQ_SIG 0x53053053 ++static volatile struct rseq *rseq_ptr; ++#if RSEQ_TLS_ALLOC ++static __thread volatile struct rseq __rseq_abi; ++#endif + + static int sys_rseq(volatile 
struct rseq *rseq_abi, uint32_t rseq_len, int flags, uint32_t sig) + { +@@ -54,27 +96,18 @@ static int sys_rseq(volatile struct rseq *rseq_abi, uint32_t rseq_len, int flags + static void register_thread(void) + { + int rc; +- rc = sys_rseq(&__rseq_abi, sizeof(struct rseq), 0, RSEQ_SIG); ++ unregister_glibc_rseq(); ++ rc = sys_rseq(rseq_ptr, sizeof(struct rseq), 0, RSEQ_SIG); + if (rc) { + fail("Failed to register rseq"); + exit(1); + } + } + +-static void unregister_thread(void) +-{ +- int rc; +- rc = sys_rseq(&__rseq_abi, sizeof(struct rseq), RSEQ_FLAG_UNREGISTER, RSEQ_SIG); +- if (rc) { +- fail("Failed to unregister rseq"); +- exit(1); +- } +-} +- + static void check_thread(void) + { + int rc; +- rc = sys_rseq(&__rseq_abi, sizeof(struct rseq), 0, RSEQ_SIG); ++ rc = sys_rseq(rseq_ptr, sizeof(struct rseq), 0, RSEQ_SIG); + if (!(rc && errno == EBUSY)) { + fail("Failed to check rseq %d", rc); + exit(1); +@@ -111,8 +144,8 @@ static int rseq_addv(intptr_t *v, intptr_t count, int cpu) + ".popsection\n\t" + : /* gcc asm goto does not allow outputs */ + : [cpu_id] "r" (cpu), +- [current_cpu_id] "m" (__rseq_abi.cpu_id), +- [rseq_cs] "m" (__rseq_abi.rseq_cs), ++ [current_cpu_id] "m" (rseq_ptr->cpu_id), ++ [rseq_cs] "m" (rseq_ptr->rseq_cs), + /* final store input */ + [v] "m" (*v), + [count] "er" (count) +@@ -132,6 +165,13 @@ int main(int argc, char *argv[]) + intptr_t *cpu_data; + long nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + ++#if RSEQ_TLS_ALLOC ++ rseq_ptr = &__rseq_abi; ++#else ++ //rseq_ptr = malloc(sizeof(struct rseq)); ++ rseq_ptr = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, 0, 0); ++#endif ++ + test_init(argc, argv); + + cpu_data = calloc(nr_cpus, sizeof(*cpu_data)); +@@ -147,7 +187,7 @@ int main(int argc, char *argv[]) + + check_thread(); + +- cpu = RSEQ_ACCESS_ONCE(__rseq_abi.cpu_id_start); ++ cpu = RSEQ_ACCESS_ONCE(rseq_ptr->cpu_id_start); + ret = rseq_addv(&cpu_data[cpu], 2, cpu); + if (ret) + fail("Failed to increment per-cpu 
counter"); +-- +2.30.0 + diff --git a/0012-compel-add-helpers-to-get-set-instruction-pointer.patch b/0012-compel-add-helpers-to-get-set-instruction-pointer.patch new file mode 100644 index 0000000000000000000000000000000000000000..33acd47dde4265510ced29559200ada1b26505b1 --- /dev/null +++ b/0012-compel-add-helpers-to-get-set-instruction-pointer.patch @@ -0,0 +1,265 @@ +From 06cb51057ce1cc31b79c6321273dfa0b4cb7f980 Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: Wed, 2 Mar 2022 15:02:08 +0800 +Subject: [PATCH 12/16] compel: add helpers to get/set instruction pointer + Signed-off-by: Alexander Mikhalitsyn + +--- + .../src/lib/include/uapi/asm/infect-types.h | 9 +++++---- + .../src/lib/include/uapi/asm/infect-types.h | 9 +++++---- + .../src/lib/include/uapi/asm/infect-types.h | 9 +++++---- + .../src/lib/include/uapi/asm/infect-types.h | 9 +++++---- + .../src/lib/include/uapi/asm/infect-types.h | 7 ++++--- + .../src/lib/include/uapi/asm/infect-types.h | 9 +++++---- + compel/include/uapi/infect.h | 6 ++++++ + compel/src/lib/infect.c | 20 +++++++++++++++++++ + .../criu/arch/aarch64/include/asm/types.h | 2 ++ + criu/arch/arm/include/asm/types.h | 2 ++ + .../criu/arch/mips/include/asm/types.h | 2 ++ + .../criu/arch/ppc64/include/asm/types.h | 2 ++ + .../criu/arch/s390/include/asm/types.h | 2 ++ + criu/arch/x86/include/asm/types.h | 2 ++ + 14 files changed, 67 insertions(+), 23 deletions(-) + +diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h +index f91e73d..9d4ce7e 100644 +--- a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h ++++ b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h +@@ -23,10 +23,11 @@ typedef struct user_fpsimd_state user_fpregs_struct_t; + #define compel_arch_get_tls_task(ctl, tls) + #define compel_arch_get_tls_thread(tctl, tls) + +-#define REG_RES(r) ((uint64_t)(r).regs[0]) +-#define REG_IP(r) ((uint64_t)(r).pc) +-#define REG_SP(r) 
((uint64_t)((r).sp)) +-#define REG_SYSCALL_NR(r) ((uint64_t)(r).regs[8]) ++#define REG_RES(r) ((uint64_t)(r).regs[0]) ++#define REG_IP(r) ((uint64_t)(r).pc) ++#define SET_REG_IP(r, val) ((r).pc = (val)) ++#define REG_SP(r) ((uint64_t)((r).sp)) ++#define REG_SYSCALL_NR(r) ((uint64_t)(r).regs[8]) + + #define user_regs_native(pregs) true + +diff --git a/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h b/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h +index 159b6a9..8d32825 100644 +--- a/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h ++++ b/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h +@@ -56,10 +56,11 @@ struct user_vfp_exc { + unsigned long fpinst2; + }; + +-#define REG_RES(regs) ((regs).ARM_r0) +-#define REG_IP(regs) ((regs).ARM_pc) +-#define REG_SP(regs) ((regs).ARM_sp) +-#define REG_SYSCALL_NR(regs) ((regs).ARM_r7) ++#define REG_RES(regs) ((regs).ARM_r0) ++#define REG_IP(regs) ((regs).ARM_pc) ++#define SET_REG_IP(regs, val) ((regs).ARM_pc = (val)) ++#define REG_SP(regs) ((regs).ARM_sp) ++#define REG_SYSCALL_NR(regs) ((regs).ARM_r7) + + #define user_regs_native(pregs) true + +diff --git a/compel/arch/mips/src/lib/include/uapi/asm/infect-types.h b/compel/arch/mips/src/lib/include/uapi/asm/infect-types.h +index 70b3f85..481566a 100644 +--- a/compel/arch/mips/src/lib/include/uapi/asm/infect-types.h ++++ b/compel/arch/mips/src/lib/include/uapi/asm/infect-types.h +@@ -56,10 +56,11 @@ static inline bool user_regs_native(user_regs_struct_t *pregs) + #define compel_arch_get_tls_task(ctl, tls) + #define compel_arch_get_tls_thread(tctl, tls) + +-#define REG_RES(regs) ((regs).MIPS_v0) +-#define REG_IP(regs) ((regs).cp0_epc) +-#define REG_SP(regs) ((regs).MIPS_sp) +-#define REG_SYSCALL_NR(regs) ((regs).MIPS_v0) ++#define REG_RES(regs) ((regs).MIPS_v0) ++#define REG_IP(regs) ((regs).cp0_epc) ++#define SET_REG_IP(regs, val) ((regs).cp0_epc = (val)) ++#define REG_SP(regs) ((regs).MIPS_sp) ++#define REG_SYSCALL_NR(regs) ((regs).MIPS_v0) + + 
//#define __NR(syscall, compat) ((compat) ? __NR32_##syscall : __NR_##syscall) + #define __NR(syscall, compat) __NR_##syscall +diff --git a/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h +index fe6192e..bf2cc95 100644 +--- a/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h ++++ b/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h +@@ -72,10 +72,11 @@ typedef struct { + } tm; + } user_fpregs_struct_t; + +-#define REG_RES(regs) ((uint64_t)(regs).gpr[3]) +-#define REG_IP(regs) ((uint64_t)(regs).nip) +-#define REG_SP(regs) ((uint64_t)(regs).gpr[1]) +-#define REG_SYSCALL_NR(regs) ((uint64_t)(regs).gpr[0]) ++#define REG_RES(regs) ((uint64_t)(regs).gpr[3]) ++#define REG_IP(regs) ((uint64_t)(regs).nip) ++#define SET_REG_IP(regs, val) ((regs).nip = (val)) ++#define REG_SP(regs) ((uint64_t)(regs).gpr[1]) ++#define REG_SYSCALL_NR(regs) ((uint64_t)(regs).gpr[0]) + + #define user_regs_native(pregs) true + +diff --git a/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h b/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h +index 896d70e..87283bc 100644 +--- a/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h ++++ b/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h +@@ -62,9 +62,10 @@ typedef struct { + uint32_t system_call; + } user_regs_struct_t; + +-#define REG_RES(r) ((uint64_t)(r).prstatus.gprs[2]) +-#define REG_IP(r) ((uint64_t)(r).prstatus.psw.addr) +-#define REG_SP(r) ((uint64_t)(r).prstatus.gprs[15]) ++#define REG_RES(r) ((uint64_t)(r).prstatus.gprs[2]) ++#define REG_IP(r) ((uint64_t)(r).prstatus.psw.addr) ++#define SET_REG_IP(r, val) ((r).prstatus.psw.addr = (val)) ++#define REG_SP(r) ((uint64_t)(r).prstatus.gprs[15]) + /* + * We assume that REG_SYSCALL_NR() is only used for pie code where we + * always use svc 0 with opcode in %r1. 
+diff --git a/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h b/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h +index 34b3ad0..b35504f 100644 +--- a/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h ++++ b/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h +@@ -127,10 +127,11 @@ typedef struct { + + typedef struct xsave_struct user_fpregs_struct_t; + +-#define REG_RES(regs) get_user_reg(&regs, ax) +-#define REG_IP(regs) get_user_reg(&regs, ip) +-#define REG_SP(regs) get_user_reg(&regs, sp) +-#define REG_SYSCALL_NR(regs) get_user_reg(&regs, orig_ax) ++#define REG_RES(regs) get_user_reg(&regs, ax) ++#define REG_IP(regs) get_user_reg(&regs, ip) ++#define SET_REG_IP(regs, val) set_user_reg(&regs, ip, val) ++#define REG_SP(regs) get_user_reg(&regs, sp) ++#define REG_SYSCALL_NR(regs) get_user_reg(&regs, orig_ax) + + #define __NR(syscall, compat) ((compat) ? __NR32_##syscall : __NR_##syscall) + +diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h +index c3d2ee6..389878e 100644 +--- a/compel/include/uapi/infect.h ++++ b/compel/include/uapi/infect.h +@@ -168,4 +168,10 @@ extern unsigned long compel_task_size(void); + extern uint64_t compel_get_leader_sp(struct parasite_ctl *ctl); + extern uint64_t compel_get_thread_sp(struct parasite_thread_ctl *tctl); + ++extern uint64_t compel_get_leader_ip(struct parasite_ctl *ctl); ++extern uint64_t compel_get_thread_ip(struct parasite_thread_ctl *tctl); ++ ++void compel_set_leader_ip(struct parasite_ctl *ctl, uint64_t v); ++void compel_set_thread_ip(struct parasite_thread_ctl *tctl, uint64_t v); ++ + #endif +diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c +index 0fb9e71..6a13cc1 100644 +--- a/compel/src/lib/infect.c ++++ b/compel/src/lib/infect.c +@@ -1686,3 +1686,23 @@ uint64_t compel_get_thread_sp(struct parasite_thread_ctl *tctl) + { + return REG_SP(tctl->th.regs); + } ++ ++uint64_t compel_get_leader_ip(struct parasite_ctl *ctl) ++{ ++ return REG_IP(ctl->orig.regs); ++} ++ ++uint64_t 
compel_get_thread_ip(struct parasite_thread_ctl *tctl) ++{ ++ return REG_IP(tctl->th.regs); ++} ++ ++void compel_set_leader_ip(struct parasite_ctl *ctl, uint64_t v) ++{ ++ SET_REG_IP(ctl->orig.regs, v); ++} ++ ++void compel_set_thread_ip(struct parasite_thread_ctl *tctl, uint64_t v) ++{ ++ SET_REG_IP(tctl->th.regs, v); ++} +diff --git a/criu/arch/aarch64/include/asm/types.h b/criu/arch/aarch64/include/asm/types.h +index c860af1..363c1ca 100644 +--- a/criu/arch/aarch64/include/asm/types.h ++++ b/criu/arch/aarch64/include/asm/types.h +@@ -22,6 +22,8 @@ typedef UserAarch64RegsEntry UserRegsEntry; + + #define TI_SP(core) ((core)->ti_aarch64->gpregs->sp) + ++#define TI_IP(core) ((core)->ti_aarch64->gpregs->pc) ++ + static inline void *decode_pointer(uint64_t v) + { + return (void *)v; +diff --git a/criu/arch/arm/include/asm/types.h b/criu/arch/arm/include/asm/types.h +index cfcb8a1..93d2dc2 100644 +--- a/criu/arch/arm/include/asm/types.h ++++ b/criu/arch/arm/include/asm/types.h +@@ -21,6 +21,8 @@ typedef UserArmRegsEntry UserRegsEntry; + + #define TI_SP(core) ((core)->ti_arm->gpregs->sp) + ++#define TI_IP(core) ((core)->ti_arm->gpregs->ip) ++ + static inline void *decode_pointer(u64 v) + { + return (void *)(u32)v; +diff --git a/criu/arch/mips/include/asm/types.h b/criu/arch/mips/include/asm/types.h +index 237471f..2c75b6a 100644 +--- a/criu/arch/mips/include/asm/types.h ++++ b/criu/arch/mips/include/asm/types.h +@@ -18,6 +18,8 @@ + + #define CORE_THREAD_ARCH_INFO(core) core->ti_mips + ++#define TI_IP(core) ((core)->ti_mips->gpregs->cp0_epc) ++ + typedef UserMipsRegsEntry UserRegsEntry; + + static inline u64 encode_pointer(void *p) +diff --git a/criu/arch/ppc64/include/asm/types.h b/criu/arch/ppc64/include/asm/types.h +index fedeff2..d60aadd 100644 +--- a/criu/arch/ppc64/include/asm/types.h ++++ b/criu/arch/ppc64/include/asm/types.h +@@ -19,6 +19,8 @@ typedef UserPpc64RegsEntry UserRegsEntry; + + #define CORE_THREAD_ARCH_INFO(core) core->ti_ppc64 + ++#define TI_IP(core) 
((core)->ti_ppc64->gpregs->nip) ++ + static inline void *decode_pointer(uint64_t v) + { + return (void *)v; +diff --git a/criu/arch/s390/include/asm/types.h b/criu/arch/s390/include/asm/types.h +index 7522cf2..abf12de 100644 +--- a/criu/arch/s390/include/asm/types.h ++++ b/criu/arch/s390/include/asm/types.h +@@ -19,6 +19,8 @@ typedef UserS390RegsEntry UserRegsEntry; + + #define CORE_THREAD_ARCH_INFO(core) core->ti_s390 + ++#define TI_IP(core) ((core)->ti_s390->gpregs->psw_addr) ++ + static inline u64 encode_pointer(void *p) + { + return (u64)p; +diff --git a/criu/arch/x86/include/asm/types.h b/criu/arch/x86/include/asm/types.h +index a0a8ed9..8919d0a 100644 +--- a/criu/arch/x86/include/asm/types.h ++++ b/criu/arch/x86/include/asm/types.h +@@ -28,6 +28,8 @@ static inline int core_is_compat(CoreEntry *c) + + #define CORE_THREAD_ARCH_INFO(core) core->thread_info + ++#define TI_IP(core) ((core)->thread_info->gpregs->ip) ++ + typedef UserX86RegsEntry UserRegsEntry; + + static inline u64 encode_pointer(void *p) +-- +2.30.0 + diff --git a/0013-cr-dump-fixup-thread-IP-when-inside-rseq-cs.patch b/0013-cr-dump-fixup-thread-IP-when-inside-rseq-cs.patch new file mode 100644 index 0000000000000000000000000000000000000000..cd8ef176ddd77e4a13fee010fd690983e0973c71 --- /dev/null +++ b/0013-cr-dump-fixup-thread-IP-when-inside-rseq-cs.patch @@ -0,0 +1,248 @@ +From 33abfc12b973560b3d98afdbac7554b8c0542c3d Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: Wed, 2 Mar 2022 15:04:54 +0800 +Subject: [PATCH 13/16] cr-dump: fixup thread IP when inside rseq cs + Signed-off-by: Alexander Mikhalitsyn + +--- + criu/cr-dump.c | 155 +++++++++++++++++++++++++++- + criu/include/parasite.h | 2 + + criu/include/pstree.h | 1 + + 3 files changed, 154 insertions(+), 4 deletions(-) + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index 91dd08a..a3f8973 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -1047,11 +1047,58 @@ static int dump_task_signals(pid_t pid, struct pstree_item *item) + return 0; + } 
+ +-static int dump_thread_rseq(pid_t tid, RseqEntry **rseqep) ++static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseq, struct rseq_cs *rseq_cs) ++{ ++ int ret; ++ uint64_t addr; ++ ++ /* rseq is not registered */ ++ if (!rseq->rseq_abi_pointer) ++ return 0; ++ ++ /* ++ * We need to cover the case when victim process was inside rseq critical section ++ * at the moment when CRIU comes and seized it. We need to determine the borders ++ * of rseq critical section at first. To achieve that we need to access thread ++ * memory and read pointer to struct rseq_cs. ++ * ++ * We have two ways to access thread memory: from the parasite and using ptrace(). ++ * But in this case we can't use parasite, because if victim process returns to the ++ * execution, on the kernel side __rseq_handle_notify_resume hook will be called, ++ * then rseq_ip_fixup() -> clear_rseq_cs() and user space memory with struct rseq ++ * will be cleared. So, let's use ptrace(PTRACE_PEEKDATA). ++ */ ++ ret = ptrace_peek_area(tid, &addr, decode_pointer(rseq->rseq_abi_pointer + offsetof(struct rseq, rseq_cs)), ++ sizeof(uint64_t)); ++ if (ret) { ++ pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq_cs addr\n", tid, (unsigned long)&addr, ++ (unsigned long)(rseq->rseq_abi_pointer + offsetof(struct rseq, rseq_cs)), sizeof(uint64_t)); ++ return -1; ++ } ++ ++ /* (struct rseq)->rseq_cs is NULL */ ++ if (!addr) ++ return 0; ++ ++ ret = ptrace_peek_area(tid, rseq_cs, decode_pointer(addr), sizeof(struct rseq_cs)); ++ if (ret) { ++ pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq_cs struct\n", tid, ++ (unsigned long)rseq_cs, (unsigned long)addr, sizeof(struct rseq_cs)); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static int dump_thread_rseq(struct pstree_item *item, int i) + { + struct __ptrace_rseq_configuration rseq; + RseqEntry *rseqe = NULL; + int ret; ++ CoreEntry *core = item->core[i]; ++ RseqEntry **rseqep = &core->thread_core->rseq_entry; ++ struct rseq_cs 
*rseq_cs = &dmpi(item)->thread_rseq_cs[i]; ++ pid_t tid = item->threads[i].real; + + /* + * If we are here it means that rseq() syscall is supported, +@@ -1076,7 +1123,8 @@ static int dump_thread_rseq(pid_t tid, RseqEntry **rseqep) + return -1; + } + +- pr_err("Dump rseq of %d: ptr = 0x%lx sign = 0x%x\n", tid, (unsigned long)rseq.rseq_abi_pointer, rseq.signature); ++ pr_info("Dump rseq of %d: ptr = 0x%lx sign = 0x%x\n", tid, (unsigned long)rseq.rseq_abi_pointer, ++ rseq.signature); + + rseqe = xmalloc(sizeof(*rseqe)); + if (!rseqe) +@@ -1088,25 +1136,118 @@ static int dump_thread_rseq(pid_t tid, RseqEntry **rseqep) + rseqe->rseq_abi_size = rseq.rseq_abi_size; + rseqe->signature = rseq.signature; + ++ if (read_rseq_cs(tid, &rseq, rseq_cs)) ++ goto err; ++ ++ /* save rseq entry to the image */ + *rseqep = rseqe; + + return 0; ++ ++err: ++ xfree(rseqe); ++ return -1; + } + + static int dump_task_rseq(pid_t pid, struct pstree_item *item) + { + int i; ++ struct rseq_cs *thread_rseq_cs; + + /* if rseq() syscall isn't supported then nothing to dump */ + if (!kdat.has_rseq) + return 0; + ++ thread_rseq_cs = xzalloc(sizeof(*thread_rseq_cs) * item->nr_threads); ++ if (!thread_rseq_cs) ++ return -1; ++ ++ dmpi(item)->thread_rseq_cs = thread_rseq_cs; ++ + for (i = 0; i < item->nr_threads; i++) { +- if (dump_thread_rseq(item->threads[i].real, &item->core[i]->thread_core->rseq_entry)) +- return -1; ++ if (dump_thread_rseq(item, i)) ++ goto free_rseq; + } + + return 0; ++ ++free_rseq: ++ xfree(thread_rseq_cs); ++ dmpi(item)->thread_rseq_cs = NULL; ++ return -1; ++} ++ ++static bool task_in_rseq(struct rseq_cs *rseq_cs, uint64_t addr) ++{ ++ return addr >= rseq_cs->start_ip && addr < rseq_cs->start_ip + rseq_cs->post_commit_offset; ++} ++ ++static int fixup_thread_rseq(struct pstree_item *item, int i) ++{ ++ CoreEntry *core = item->core[i]; ++ struct rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; ++ pid_t tid = item->threads[i].real; ++ ++ /* (struct rseq)->rseq_cs is NULL */ 
++ if (!rseq_cs->start_ip) ++ return 0; ++ ++ pr_info("fixup_thread_rseq for %d: rseq_cs start_ip = %llx abort_ip = %llx post_commit_offset = %llx flags = %x version = %x; IP = %lx\n", ++ tid, rseq_cs->start_ip, rseq_cs->abort_ip, rseq_cs->post_commit_offset, rseq_cs->flags, ++ rseq_cs->version, (unsigned long)TI_IP(core)); ++ ++ if (rseq_cs->version != 0) { ++ pr_err("unsupported RSEQ ABI version = %d\n", rseq_cs->version); ++ return -1; ++ } ++ ++ if (task_in_rseq(rseq_cs, TI_IP(core))) { ++ struct pid *tid = &item->threads[i]; ++ ++ pr_info("The %d task is in rseq critical section. IP will be set to rseq abort handler addr\n", ++ tid->real); ++ ++ /* ++ * We need to fixup task instruction pointer from ++ * the original one (which lays inside rseq critical section) ++ * to rseq abort handler address. ++ * ++ * It's worth to mention that we need to fixup IP in CoreEntry ++ * (used when full dump/restore is performed) and also in ++ * the parasite regs storage (used if --leave-running option is used, ++ * or if dump error occured and process execution is resumed). 
++ */ ++ TI_IP(core) = rseq_cs->abort_ip; ++ ++ if (item->pid->real == tid->real) { ++ compel_set_leader_ip(dmpi(item)->parasite_ctl, rseq_cs->abort_ip); ++ } else { ++ compel_set_thread_ip(dmpi(item)->thread_ctls[i], rseq_cs->abort_ip); ++ } ++ } ++ ++ return 0; ++} ++ ++static int fixup_task_rseq(pid_t pid, struct pstree_item *item) ++{ ++ int ret = 0; ++ int i; ++ ++ if (!kdat.has_ptrace_get_rseq_conf) ++ return 0; ++ ++ for (i = 0; i < item->nr_threads; i++) { ++ if (fixup_thread_rseq(item, i)) { ++ ret = -1; ++ goto exit; ++ } ++ } ++ ++exit: ++ xfree(dmpi(item)->thread_rseq_cs); ++ dmpi(item)->thread_rseq_cs = NULL; ++ return ret; + } + + static struct proc_pid_stat pps_buf; +@@ -1409,6 +1550,12 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + goto err; + } + ++ ret = fixup_task_rseq(pid, item); ++ if (ret) { ++ pr_err("Fixup rseq for %d failed %d\n", pid, ret); ++ goto err; ++ } ++ + if (fault_injected(FI_DUMP_EARLY)) { + pr_info("fault: CRIU sudden detach\n"); + kill(getpid(), SIGKILL); +diff --git a/criu/include/parasite.h b/criu/include/parasite.h +index 5fde809..d2a0688 100644 +--- a/criu/include/parasite.h ++++ b/criu/include/parasite.h +@@ -10,6 +10,8 @@ + #include + #include + ++#include "linux/rseq.h" ++ + #include "image.h" + #include "util-pie.h" + #include "common/lock.h" +diff --git a/criu/include/pstree.h b/criu/include/pstree.h +index c5b0fa7..458e5f9 100644 +--- a/criu/include/pstree.h ++++ b/criu/include/pstree.h +@@ -63,6 +63,7 @@ struct dmp_info { + struct parasite_ctl *parasite_ctl; + struct parasite_thread_ctl **thread_ctls; + uint64_t *thread_sp; ++ struct rseq_cs *thread_rseq_cs; + + /* + * Although we don't support dumping different struct creds in general, +-- +2.30.0 + diff --git a/0014-zdtm-add-rseq-transition-test-for-amd64.patch b/0014-zdtm-add-rseq-transition-test-for-amd64.patch new file mode 100644 index 0000000000000000000000000000000000000000..d1379bd2dcbf7eff26bc8079a43e4a135e948c2a --- 
/dev/null +++ b/0014-zdtm-add-rseq-transition-test-for-amd64.patch @@ -0,0 +1,250 @@ +From f76aa4ade354649e3291b5e7274c368740b05417 Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: Wed, 2 Mar 2022 15:05:34 +0800 +Subject: [PATCH 14/16] zdtm: add rseq transition test for amd64 Signed-off-by: + Alexander Mikhalitsyn + +--- + test/zdtm/transition/Makefile | 1 + + test/zdtm/transition/rseq01.c | 208 +++++++++++++++++++ + test/zdtm/transition/rseq01.desc | 1 + + 3 files changed, 210 insertions(+) + create mode 100644 test/zdtm/transition/rseq01.c + create mode 100644 test/zdtm/transition/rseq01.desc + +diff --git a/test/zdtm/transition/Makefile b/test/zdtm/transition/Makefile +index 9388157..fae4e27 100644 +--- a/test/zdtm/transition/Makefile ++++ b/test/zdtm/transition/Makefile +@@ -23,6 +23,7 @@ TST_NOFILE = \ + lazy-thp \ + pid_reuse \ + pidfd_store_sk \ ++ rseq01 \ + + + TST_FILE = \ +diff --git a/test/zdtm/transition/rseq01.c b/test/zdtm/transition/rseq01.c +new file mode 100644 +index 0000000..5fac5a6 +--- /dev/null ++++ b/test/zdtm/transition/rseq01.c +@@ -0,0 +1,208 @@ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "zdtmtst.h" ++ ++#ifdef __has_include ++# if __has_include ("sys/rseq.h") ++# include ++# endif ++#endif ++ ++#if defined(__x86_64__) ++ ++#if defined(__x86_64__) && defined(RSEQ_SIG) ++static inline void *thread_pointer(void) ++{ ++ void *result; ++ asm("mov %%fs:0, %0" : "=r"(result)); ++ return result; ++} ++ ++static inline void unregister_old_rseq(void) ++{ ++ /* unregister rseq */ ++ syscall(__NR_rseq, (void *)((char *)thread_pointer() + __rseq_offset), __rseq_size, 1, RSEQ_SIG); ++} ++#else ++static inline void unregister_old_rseq(void) ++{ ++} ++#endif ++ ++const char *test_doc = "rseq() transition test"; ++const char *test_author = "Alexander Mikhalitsyn "; ++ ++/* parts of code borrowed from 
https://www.efficios.com/blog/2019/02/08/linux-restartable-sequences/ */ ++ ++/* some useful definitions from kernel uapi */ ++#ifndef RSEQ_SIG ++ ++enum rseq_flags { ++ RSEQ_FLAG_UNREGISTER = (1 << 0), ++}; ++ ++struct rseq { ++ uint32_t cpu_id_start; ++ uint32_t cpu_id; ++ uint64_t rseq_cs; ++ uint32_t flags; ++} __attribute__((aligned(4 * sizeof(uint64_t)))); ++ ++#define RSEQ_SIG 0x53053053 ++ ++#endif ++ ++#ifndef __NR_rseq ++#define __NR_rseq 334 ++#endif ++/* EOF */ ++ ++static volatile struct rseq *rseq_ptr; ++static __thread volatile struct rseq __rseq_abi; ++ ++static int sys_rseq(volatile struct rseq *rseq_abi, uint32_t rseq_len, int flags, uint32_t sig) ++{ ++ return syscall(__NR_rseq, rseq_abi, rseq_len, flags, sig); ++} ++ ++static void register_thread(void) ++{ ++ int rc; ++ unregister_old_rseq(); ++ rc = sys_rseq(rseq_ptr, sizeof(struct rseq), 0, RSEQ_SIG); ++ if (rc) { ++ fail("Failed to register rseq"); ++ exit(1); ++ } ++} ++ ++static void check_thread(void) ++{ ++ int rc; ++ rc = sys_rseq(rseq_ptr, sizeof(struct rseq), 0, RSEQ_SIG); ++ if (!(rc && errno == EBUSY)) { ++ fail("Failed to check rseq %d", rc); ++ exit(1); ++ } ++} ++ ++#define RSEQ_ACCESS_ONCE(x) (*(__volatile__ __typeof__(x) *)&(x)) ++ ++static int rseq_addv(intptr_t *v, intptr_t count, int cpu) ++{ ++ double a = 10000000000000000.0; ++ double b = -1; ++ /*test_msg("enter %f %f\n", a, b);*/ ++ ++ /* clang-format off */ ++ __asm__ __volatile__ goto( ++ ".pushsection __rseq_table, \"aw\"\n\t" ++ ".balign 32\n\t" ++ "cs_obj:\n\t" ++ /* version, flags */ ++ ".long 0, 0\n\t" ++ /* start_ip, post_commit_offset, abort_ip */ ++ ".quad 1f, (2f-1f), 4f\n\t" ++ ".popsection\n\t" ++ "1:\n\t" ++ "leaq cs_obj(%%rip), %%rax\n\t" ++ "movq %%rax, %[rseq_cs]\n\t" ++ "cmpl %[cpu_id], %[current_cpu_id]\n\t" ++ "jnz 4f\n\t" ++ "addq %[count], %[v]\n\t" /* final store */ ++ "mov $10000000, %%rcx\n\t" ++ "fldl %[x]\n\t" /* we have st clobbered */ ++ "5:\n\t" ++ "fsqrt\n\t" /* heavy instruction */ ++ "dec 
%%rcx\n\t" ++ "jnz 5b\n\t" ++ "fstpl %[y]\n\t" ++ "2:\n\t" ++ ".pushsection __rseq_failure, \"ax\"\n\t" ++ /* Disassembler-friendly signature: nopl (%rip). */ ++ ".byte 0x0f, 0xb9, 0x3d\n\t" ++ ".long 0x53053053\n\t" /* RSEQ_FLAGS */ ++ "4:\n\t" ++ /*"fstpl %[y]\n\t"*/ ++ "jmp %l[abort]\n\t" ++ /*"jmp 1b\n\t"*/ ++ ".popsection\n\t" ++ : /* gcc asm goto does not allow outputs */ ++ : [cpu_id] "r" (cpu), ++ [current_cpu_id] "m" (rseq_ptr->cpu_id), ++ [rseq_cs] "m" (rseq_ptr->rseq_cs), ++ /* final store input */ ++ [v] "m" (*v), ++ [count] "er" (count), ++ [x] "m" (a), ++ [y] "m" (b) ++ : "memory", "cc", "rax", "rcx", "st" ++ : abort ++ ); ++ /* clang-format on */ ++ /*test_msg("exit %f %f\n", a, b);*/ ++ return 0; ++abort: ++ /*test_msg("abort %f %f\n", a, b);*/ ++ return -1; ++} ++ ++int main(int argc, char *argv[]) ++{ ++ int cpu = 0; ++ int ret; ++ intptr_t *cpu_data; ++ long nr_cpus; ++ ++ rseq_ptr = &__rseq_abi; ++ memset((void *)rseq_ptr, 0, sizeof(struct rseq)); ++ ++ test_init(argc, argv); ++ nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); ++ ++ cpu_data = calloc(nr_cpus, sizeof(*cpu_data)); ++ if (!cpu_data) { ++ fail("calloc"); ++ exit(EXIT_FAILURE); ++ } ++ register_thread(); ++ ++ test_daemon(); ++ ++ while (test_go()) { ++ cpu = RSEQ_ACCESS_ONCE(rseq_ptr->cpu_id_start); ++ ret = rseq_addv(&cpu_data[cpu], 2, cpu); ++ if (ret) ++ fail("Failed to increment per-cpu counter"); ++ } ++ ++ test_waitsig(); ++ ++ check_thread(); ++ pass(); ++ ++ return 0; ++} ++ ++#else ++ ++int main(int argc, char *argv[]) ++{ ++ test_init(argc, argv); ++ skip("Unsupported arch"); ++ return 0; ++} ++ ++#endif +diff --git a/test/zdtm/transition/rseq01.desc b/test/zdtm/transition/rseq01.desc +new file mode 100644 +index 0000000..0324fa3 +--- /dev/null ++++ b/test/zdtm/transition/rseq01.desc +@@ -0,0 +1 @@ ++{'flavor': 'h', 'arch': 'x86_64', 'feature': 'get_rseq_conf'} +-- +2.30.0 + diff --git a/0015-cr-dump-handle-rseq-flags-field.patch b/0015-cr-dump-handle-rseq-flags-field.patch new 
file mode 100644 index 0000000000000000000000000000000000000000..d54477411aa7b36382bcc340eece9441cc69abed --- /dev/null +++ b/0015-cr-dump-handle-rseq-flags-field.patch @@ -0,0 +1,330 @@ +From deac94521c373c13add63eaf88118187ea3c2cb2 Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: Wed, 2 Mar 2022 15:09:44 +0800 +Subject: [PATCH 15/16] cr-dump: handle rseq flags field Userspace may + configure rseq critical section by def + +Signed-off-by: Alexander Mikhalitsyn +--- + criu/cr-dump.c | 86 +++++++++++++++++++------------ + criu/cr-restore.c | 63 ++++++++++++++++++++++ + criu/include/pstree.h | 1 + + images/rseq.proto | 1 + + 4 files changed, 119 insertions(+), 32 deletions(-) + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index a3f8973..79387fb 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -1047,13 +1047,13 @@ static int dump_task_signals(pid_t pid, struct pstree_item *item) + return 0; + } + +-static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseq, struct rseq_cs *rseq_cs) ++static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseqc, ++ struct rseq_cs *rseq_cs, struct rseq *rseq) + { + int ret; +- uint64_t addr; + + /* rseq is not registered */ +- if (!rseq->rseq_abi_pointer) ++ if (!rseqc->rseq_abi_pointer) + return 0; + + /* +@@ -1068,22 +1068,21 @@ static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseq, str + * then rseq_ip_fixup() -> clear_rseq_cs() and user space memory with struct rseq + * will be cleared. So, let's use ptrace(PTRACE_PEEKDATA). 
+ */ +- ret = ptrace_peek_area(tid, &addr, decode_pointer(rseq->rseq_abi_pointer + offsetof(struct rseq, rseq_cs)), +- sizeof(uint64_t)); ++ ret = ptrace_peek_area(tid, rseq, decode_pointer(rseqc->rseq_abi_pointer), ++ sizeof(struct rseq)); + if (ret) { +- pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq_cs addr\n", tid, (unsigned long)&addr, +- (unsigned long)(rseq->rseq_abi_pointer + offsetof(struct rseq, rseq_cs)), sizeof(uint64_t)); ++ pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq struct\n", tid, (unsigned long)rseq, ++ (unsigned long)(rseqc->rseq_abi_pointer), sizeof(uint64_t)); + return -1; + } + +- /* (struct rseq)->rseq_cs is NULL */ +- if (!addr) ++ if (!rseq->rseq_cs.ptr64) + return 0; + +- ret = ptrace_peek_area(tid, rseq_cs, decode_pointer(addr), sizeof(struct rseq_cs)); ++ ret = ptrace_peek_area(tid, rseq_cs, decode_pointer(rseq->rseq_cs.ptr64), sizeof(struct rseq_cs)); + if (ret) { + pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq_cs struct\n", tid, +- (unsigned long)rseq_cs, (unsigned long)addr, sizeof(struct rseq_cs)); ++ (unsigned long)rseq_cs, (unsigned long)rseq->rseq_cs.ptr64, sizeof(struct rseq_cs)); + return -1; + } + +@@ -1092,11 +1091,12 @@ static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseq, str + + static int dump_thread_rseq(struct pstree_item *item, int i) + { +- struct __ptrace_rseq_configuration rseq; ++ struct __ptrace_rseq_configuration rseqc; + RseqEntry *rseqe = NULL; + int ret; + CoreEntry *core = item->core[i]; + RseqEntry **rseqep = &core->thread_core->rseq_entry; ++ struct rseq rseq; + struct rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; + pid_t tid = item->threads[i].real; + +@@ -1111,20 +1111,20 @@ static int dump_thread_rseq(struct pstree_item *item, int i) + if (!kdat.has_ptrace_get_rseq_conf) + return 0; + +- ret = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, tid, sizeof(rseq), &rseq); +- if (ret != sizeof(rseq)) { ++ ret = 
ptrace(PTRACE_GET_RSEQ_CONFIGURATION, tid, sizeof(rseqc), &rseqc); ++ if (ret != sizeof(rseqc)) { + pr_perror("ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) = %d", tid, ret); + return -1; + } + +- if (rseq.flags != 0) { ++ if (rseqc.flags != 0) { + pr_err("something wrong with ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) flags = 0x%x\n", tid, +- rseq.flags); ++ rseqc.flags); + return -1; + } + +- pr_info("Dump rseq of %d: ptr = 0x%lx sign = 0x%x\n", tid, (unsigned long)rseq.rseq_abi_pointer, +- rseq.signature); ++ pr_info("Dump rseq of %d: ptr = 0x%lx sign = 0x%x\n", tid, (unsigned long)rseqc.rseq_abi_pointer, ++ rseqc.signature); + + rseqe = xmalloc(sizeof(*rseqe)); + if (!rseqe) +@@ -1132,13 +1132,22 @@ static int dump_thread_rseq(struct pstree_item *item, int i) + + rseq_entry__init(rseqe); + +- rseqe->rseq_abi_pointer = rseq.rseq_abi_pointer; +- rseqe->rseq_abi_size = rseq.rseq_abi_size; +- rseqe->signature = rseq.signature; ++ rseqe->rseq_abi_pointer = rseqc.rseq_abi_pointer; ++ rseqe->rseq_abi_size = rseqc.rseq_abi_size; ++ rseqe->signature = rseqc.signature; + +- if (read_rseq_cs(tid, &rseq, rseq_cs)) ++ if (read_rseq_cs(tid, &rseqc, rseq_cs, &rseq)) + goto err; + ++ rseqe->has_rseq_cs_pointer = true; ++ rseqe->rseq_cs_pointer = rseq.rseq_cs.ptr64; ++ pr_err("cs pointer %lx\n", rseqe->rseq_cs_pointer); ++ /* we won't save rseq_cs to the image (only pointer), ++ * so let's combine flags from both struct rseq and struct rseq_cs ++ * (kernel does the same when interpreting RSEQ_CS_FLAG_*) ++ */ ++ rseq_cs->flags |= rseq.flags; ++ + /* save rseq entry to the image */ + *rseqep = rseqe; + +@@ -1188,11 +1197,11 @@ static int fixup_thread_rseq(struct pstree_item *item, int i) + struct rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; + pid_t tid = item->threads[i].real; + +- /* (struct rseq)->rseq_cs is NULL */ ++ /* equivalent to (struct rseq)->rseq_cs is NULL */ + if (!rseq_cs->start_ip) + return 0; + +- pr_info("fixup_thread_rseq for %d: rseq_cs start_ip = %llx 
abort_ip = %llx post_commit_offset = %llx flags = %x version = %x; IP = %lx\n", ++ pr_debug("fixup_thread_rseq for %d: rseq_cs start_ip = %llx abort_ip = %llx post_commit_offset = %llx flags = %x version = %x; IP = %lx\n", + tid, rseq_cs->start_ip, rseq_cs->abort_ip, rseq_cs->post_commit_offset, rseq_cs->flags, + rseq_cs->version, (unsigned long)TI_IP(core)); + +@@ -1204,25 +1213,38 @@ static int fixup_thread_rseq(struct pstree_item *item, int i) + if (task_in_rseq(rseq_cs, TI_IP(core))) { + struct pid *tid = &item->threads[i]; + +- pr_info("The %d task is in rseq critical section. IP will be set to rseq abort handler addr\n", +- tid->real); +- + /* + * We need to fixup task instruction pointer from + * the original one (which lays inside rseq critical section) +- * to rseq abort handler address. ++ * to rseq abort handler address. But we need to look on rseq_cs->flags ++ * (please refer to struct rseq -> flags field description). ++ * Naive idea of flags support may be like... let's change instruction pointer (IP) ++ * to rseq_cs->abort_ip if !(rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL). ++ * But unfortunately, it doesn't work properly, because the kernel does ++ * clean up of rseq_cs field in the struct rseq (modifies userspace memory). ++ * So, we need to preserve original value of (struct rseq)->rseq_cs field in the ++ * image and restore it's value before releasing threads. + * + * It's worth to mention that we need to fixup IP in CoreEntry + * (used when full dump/restore is performed) and also in + * the parasite regs storage (used if --leave-running option is used, + * or if dump error occured and process execution is resumed). + */ +- TI_IP(core) = rseq_cs->abort_ip; + +- if (item->pid->real == tid->real) { +- compel_set_leader_ip(dmpi(item)->parasite_ctl, rseq_cs->abort_ip); ++ if (rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) { ++ pr_err("The %d task is in rseq critical section.!!! 
IP will be set to rseq abort handler addr\n", ++ tid->real); + } else { +- compel_set_thread_ip(dmpi(item)->thread_ctls[i], rseq_cs->abort_ip); ++ pr_warn("The %d task is in rseq critical section. IP will be set to rseq abort handler addr\n", ++ tid->real); ++ ++ TI_IP(core) = rseq_cs->abort_ip; ++ ++ if (item->pid->real == tid->real) { ++ compel_set_leader_ip(dmpi(item)->parasite_ctl, rseq_cs->abort_ip); ++ } else { ++ compel_set_thread_ip(dmpi(item)->thread_ctls[i], rseq_cs->abort_ip); ++ } + } + } + +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index b2bd044..864140f 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -23,6 +23,7 @@ + #include "common/compiler.h" + + #include "linux/mount.h" ++#include "linux/rseq.h" + + #include "clone-noasan.h" + #include "cr_options.h" +@@ -779,6 +780,7 @@ static int open_cores(int pid, CoreEntry *leader_core) + { + int i, tpid; + CoreEntry **cores = NULL; ++ //RseqEntry *rseqs; + + cores = xmalloc(sizeof(*cores) * current->nr_threads); + if (!cores) +@@ -812,6 +814,19 @@ static int open_cores(int pid, CoreEntry *leader_core) + } + } + ++ ++ pr_err("item %lx\n", (uint64_t)current); ++ ++ for (i = 0; i < current->nr_threads; i++) { ++ ThreadCoreEntry *tc = cores[i]->thread_core; ++ ++ /* compatibility with older CRIU versions */ ++ if (!tc->rseq_entry) ++ continue; ++ ++ current->rseqe[i] = *tc->rseq_entry; ++ } ++ + return 0; + err: + xfree(cores); +@@ -868,8 +883,15 @@ static int restore_one_alive_task(int pid, CoreEntry *core) + { + unsigned args_len; + struct task_restore_args *ta; ++ RseqEntry *rseqs; + pr_info("Restoring resources\n"); + ++ rseqs = shmalloc(sizeof(*rseqs) * current->nr_threads); ++ if (!rseqs) ++ return -1; ++ ++ current->rseqe = rseqs; ++ + rst_mem_switch_to_private(); + + args_len = round_up(sizeof(*ta) + sizeof(struct thread_restore_args) * current->nr_threads, page_size()); +@@ -1966,6 +1988,44 @@ static int attach_to_tasks(bool root_seized) + return 0; + } + ++static int 
restore_rseq_cs(void) ++{ ++ struct pstree_item *item; ++ ++ for_each_pstree_item(item) { ++ int i; ++ ++ if (!task_alive(item)) ++ continue; ++ ++ if (item->nr_threads == 1) { ++ item->threads[0].real = item->pid->real; ++ } else { ++ if (parse_threads(item->pid->real, &item->threads, &item->nr_threads)) ++ return -1; ++ } ++ ++ for (i = 0; i < item->nr_threads; i++) { ++ pid_t pid = item->threads[i].real; ++ ++ if (!item->rseqe[i].rseq_cs_pointer || !item->rseqe[i].rseq_abi_pointer) { ++ pr_err("item %lx rseqe %lx\n", (uint64_t)item, (uint64_t)item->rseqe); ++ pr_err("nothing to do with cs_pointer\n"); ++ continue; ++ } ++ ++ pr_err("restoring cs ... %lx \n", item->rseqe[i].rseq_cs_pointer); ++ ++ if (ptrace_poke_area(pid, &item->rseqe[i].rseq_cs_pointer, (void *)(item->rseqe[i].rseq_abi_pointer + offsetof(struct rseq, rseq_cs)), sizeof(uint64_t))) { ++ pr_err("Can't restore memfd args (pid: %d)\n", pid); ++ return -1; ++ } ++ } ++ } ++ ++ return 0; ++} ++ + static int catch_tasks(bool root_seized, enum trace_flags *flag) + { + struct pstree_item *item; +@@ -2400,6 +2460,9 @@ skip_ns_bouncing: + if (restore_freezer_state()) + pr_err("Unable to restore freezer state\n"); + ++ /* just before releasing threads we have to restore rseq_cs */ ++ restore_rseq_cs(); ++ + /* Detaches from processes and they continue run through sigreturn. 
*/ + if (finalize_restore_detach()) + goto out_kill_network_unlocked; +diff --git a/criu/include/pstree.h b/criu/include/pstree.h +index 458e5f9..97bef11 100644 +--- a/criu/include/pstree.h ++++ b/criu/include/pstree.h +@@ -25,6 +25,7 @@ struct pstree_item { + int nr_threads; /* number of threads */ + struct pid *threads; /* array of threads */ + CoreEntry **core; ++ RseqEntry *rseqe; + TaskKobjIdsEntry *ids; + union { + futex_t task_st; +diff --git a/images/rseq.proto b/images/rseq.proto +index be28004..45cb847 100644 +--- a/images/rseq.proto ++++ b/images/rseq.proto +@@ -6,4 +6,5 @@ message rseq_entry { + required uint64 rseq_abi_pointer = 1; + required uint32 rseq_abi_size = 2; + required uint32 signature = 3; ++ optional uint64 rseq_cs_pointer = 4; + } +-- +2.30.0 + diff --git a/0016-zdtm-add-rseq02-transition-test-with-NO_RESTART-CS-f.patch b/0016-zdtm-add-rseq02-transition-test-with-NO_RESTART-CS-f.patch new file mode 100644 index 0000000000000000000000000000000000000000..73038a7dc5a9970c43b6a11d933a5ae52b11a152 --- /dev/null +++ b/0016-zdtm-add-rseq02-transition-test-with-NO_RESTART-CS-f.patch @@ -0,0 +1,177 @@ +From bb8295ae4f1224db2236fdd3134912e093ed20d9 Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: Wed, 2 Mar 2022 15:10:24 +0800 +Subject: [PATCH 16/16] zdtm: add rseq02 transition test with NO_RESTART CS + flag Signed-off-by: Alexander Mikhalitsyn + + +--- + test/zdtm/transition/Makefile | 2 + + test/zdtm/transition/rseq01.c | 61 +++++++++++++++++++- + test/zdtm/transition/rseq02.c | 1 + + test/zdtm/transition/rseq02.desc | 1 + + 4 files changed, 63 insertions(+), 2 deletions(-) + create mode 120000 test/zdtm/transition/rseq02.c + create mode 120000 test/zdtm/transition/rseq02.desc + +diff --git a/test/zdtm/transition/Makefile b/test/zdtm/transition/Makefile +index fae4e27..378a4fc 100644 +--- a/test/zdtm/transition/Makefile ++++ b/test/zdtm/transition/Makefile +@@ -24,6 +24,7 @@ TST_NOFILE = \ + pid_reuse \ + pidfd_store_sk \ + rseq01 \ ++ rseq02 \ + + + 
TST_FILE = \ +@@ -82,6 +83,7 @@ ptrace: LDFLAGS += -pthread + fork2: CFLAGS += -D FORK2 + thread-bomb.o: CFLAGS += -pthread + thread-bomb: LDFLAGS += -pthread ++rseq02: CFLAGS += -D NOABORT + + %: %.sh + cp $< $@ +diff --git a/test/zdtm/transition/rseq01.c b/test/zdtm/transition/rseq01.c +index 5fac5a6..25e1d61 100644 +--- a/test/zdtm/transition/rseq01.c ++++ b/test/zdtm/transition/rseq01.c +@@ -53,6 +53,18 @@ enum rseq_flags { + RSEQ_FLAG_UNREGISTER = (1 << 0), + }; + ++enum rseq_cs_flags_bit { ++ RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0, ++ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1, ++ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2, ++}; ++ ++enum rseq_cs_flags { ++ RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT), ++ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT), ++ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT), ++}; ++ + struct rseq { + uint32_t cpu_id_start; + uint32_t cpu_id; +@@ -104,6 +116,7 @@ static int rseq_addv(intptr_t *v, intptr_t count, int cpu) + { + double a = 10000000000000000.0; + double b = -1; ++ uint64_t rseq_cs1, rseq_cs2; + /*test_msg("enter %f %f\n", a, b);*/ + + /* clang-format off */ +@@ -129,6 +142,9 @@ static int rseq_addv(intptr_t *v, intptr_t count, int cpu) + "dec %%rcx\n\t" + "jnz 5b\n\t" + "fstpl %[y]\n\t" ++ "movq %%rax, %[rseq_cs_check2]\n\t" ++ "movq %[rseq_cs], %%rax\n\t" ++ "movq %%rax, %[rseq_cs_check1]\n\t" + "2:\n\t" + ".pushsection __rseq_failure, \"ax\"\n\t" + /* Disassembler-friendly signature: nopl (%rip). 
*/ +@@ -143,6 +159,8 @@ static int rseq_addv(intptr_t *v, intptr_t count, int cpu) + : [cpu_id] "r" (cpu), + [current_cpu_id] "m" (rseq_ptr->cpu_id), + [rseq_cs] "m" (rseq_ptr->rseq_cs), ++ [rseq_cs_check1] "m" (rseq_cs1), ++ [rseq_cs_check2] "m" (rseq_cs2), + /* final store input */ + [v] "m" (*v), + [count] "er" (count), +@@ -153,8 +171,20 @@ static int rseq_addv(intptr_t *v, intptr_t count, int cpu) + ); + /* clang-format on */ + /*test_msg("exit %f %f\n", a, b);*/ ++ test_msg("%lx %lx\n", rseq_cs1, rseq_cs2); ++ if (rseq_cs1 != rseq_cs2) { ++ /* ++ * It means that we finished the critical section ++ * *normally* (haven't jumped to abort) but the kernel had cleaned up ++ * rseq_ptr->rseq_cs before we left the critical section ++ * and CRIU hasn't restored it correctly. ++ * That is what the bug looks like. ++ */ ++ return -1; ++ } + return 0; + abort: ++ test_msg("%lx %lx\n", rseq_cs1, rseq_cs2); + /*test_msg("abort %f %f\n", a, b);*/ + return -1; + } +@@ -177,21 +207,48 @@ int main(int argc, char *argv[]) + fail("calloc"); + exit(EXIT_FAILURE); + } ++ + register_thread(); + ++ /* ++ * We want to test that RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL ++ * is handled properly by CRIU, but that flag can be used ++ * only with all the other flags set.
++ * Please, refer to ++ * https://github.com/torvalds/linux/blob/master/kernel/rseq.c#L192 ++ */ ++#ifdef NOABORT ++ rseq_ptr->flags = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | ++ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | ++ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE; ++#endif ++ + test_daemon(); + + while (test_go()) { + cpu = RSEQ_ACCESS_ONCE(rseq_ptr->cpu_id_start); + ret = rseq_addv(&cpu_data[cpu], 2, cpu); +- if (ret) ++#ifndef NOABORT ++ /* just ignore abort */ ++ ret = 0; ++#else ++ if (ret) { + fail("Failed to increment per-cpu counter"); ++ break; ++ } else { ++ //test_msg("cpu_data[%d] == %ld\n", cpu, (long int)cpu_data[cpu]); ++ } ++#endif + } + + test_waitsig(); + + check_thread(); +- pass(); ++ ++ if (ret) ++ fail(); ++ else ++ pass(); + + return 0; + } +diff --git a/test/zdtm/transition/rseq02.c b/test/zdtm/transition/rseq02.c +new file mode 120000 +index 0000000..d564917 +--- /dev/null ++++ b/test/zdtm/transition/rseq02.c +@@ -0,0 +1 @@ ++rseq01.c +\ No newline at end of file +diff --git a/test/zdtm/transition/rseq02.desc b/test/zdtm/transition/rseq02.desc +new file mode 120000 +index 0000000..b888f0d +--- /dev/null ++++ b/test/zdtm/transition/rseq02.desc +@@ -0,0 +1 @@ ++rseq01.desc +\ No newline at end of file +-- +2.30.0 + diff --git a/criu.spec b/criu.spec index 1987d2ddcf0002cf1c97282866d1b9ff3154ef2f..621c765574f72302cc5bbd4404eab99dbe5c0d12 100644 --- a/criu.spec +++ b/criu.spec @@ -1,6 +1,6 @@ Name: criu Version: 3.16.1 -Release: 2 +Release: 3 Provides: crtools = %{version}-%{release} Obsoletes: crtools <= 1.0-2 Summary: A tool of Checkpoint/Restore in User-space @@ -17,6 +17,21 @@ Obsoletes: %{name}-libs < %{version}-%{release} Patch1: 0001-criu-dump-and-restore-cpu-affinity-of-each-thread.patch Patch2: 0002-mm-add-pin-memory-method-for-criu.patch +Patch3: 0002-compel-add-rseq-syscall-into-compel-std-plugin-sysca.patch +Patch4: 0003-kerndat-check-for-rseq-syscall-support.patch +Patch5: 0004-util-move-fork_and_ptrace_attach-helper-from-cr-chec.patch +Patch6: 
0005-cr-check-Add-ptrace-rseq-conf-dump-feature.patch +Patch7: 0006-rseq-initial-support.patch +Patch8: 0007-zdtm-add-simple-test-for-rseq-C-R.patch +Patch9: 0008-ci-add-Fedora-Rawhide-based-test-on-Cirrus.patch +Patch10: 0009-include-add-thread_pointer.h-from-Glibc.patch +Patch11: 0010-clone-noasan-unregister-rseq-at-the-thread-start-for.patch +Patch12: 0011-zdtm-static-rseq00-fix-rseq-test-when-linking-with-a.patch +Patch13: 0012-compel-add-helpers-to-get-set-instruction-pointer.patch +Patch14: 0013-cr-dump-fixup-thread-IP-when-inside-rseq-cs.patch +Patch15: 0014-zdtm-add-rseq-transition-test-for-amd64.patch +Patch16: 0015-cr-dump-handle-rseq-flags-field.patch +Patch17: 0016-zdtm-add-rseq02-transition-test-with-NO_RESTART-CS-f.patch %description Checkpoint/Restore in Userspace(CRIU),is a software tool for the linux operating system. @@ -99,6 +114,9 @@ chmod 0755 %{buildroot}/run/%{name}/ %doc %{_mandir}/man1/{compel.1*,crit.1*,criu-ns.1*} %changelog +* Fri Mar 4 2022 ningyu - 3.16.1-3 +- rseq c/r support + * Sat Feb 26 2022 luolongjun - 3.16.1-2 - add support for pin memory