diff --git a/0001-Fix-crit-encode-TypeError.patch b/0001-Fix-crit-encode-TypeError.patch deleted file mode 100644 index ef187d393f5331868e095ac06be3f8b4ce133d8d..0000000000000000000000000000000000000000 --- a/0001-Fix-crit-encode-TypeError.patch +++ /dev/null @@ -1,28 +0,0 @@ -From 22bd1e20cbd3c26d2e5dba76e3b0a95ff0a2e154 Mon Sep 17 00:00:00 2001 -From: lingsheng -Date: Tue, 22 Sep 2020 14:36:55 +0800 -Subject: [PATCH 1/6] Fix crit encode TypeError - ---- - lib/py/cli.py | 5 ++++- - 1 file changed, 4 insertions(+), 1 deletion(-) - -diff --git a/lib/py/cli.py b/lib/py/cli.py -index da34302..966dd4e 100755 ---- a/lib/py/cli.py -+++ b/lib/py/cli.py -@@ -16,7 +16,10 @@ def inf(opts): - - def outf(opts): - if opts['out']: -- return open(opts['out'], 'w+') -+ if getattr(opts['func'], '__name__') == 'encode': -+ return open(opts['out'], 'wb+') -+ else: -+ return open(opts['out'], 'w+') - else: - return sys.stdout - --- -1.8.3.1 - diff --git a/0004-criu-dump-and-restore-cpu-affinity-of-each-thread.patch b/0001-criu-dump-and-restore-cpu-affinity-of-each-thread.patch similarity index 70% rename from 0004-criu-dump-and-restore-cpu-affinity-of-each-thread.patch rename to 0001-criu-dump-and-restore-cpu-affinity-of-each-thread.patch index 0384f1ed322249f28708a7b755cb351185592962..474e4c9c7b6962f4d4e11ad6d3ffdc8b06436ae8 100644 --- a/0004-criu-dump-and-restore-cpu-affinity-of-each-thread.patch +++ b/0001-criu-dump-and-restore-cpu-affinity-of-each-thread.patch @@ -16,56 +16,62 @@ at restore. 
Signed-off-by: Sang Yan --- - compel/arch/arm/plugins/std/syscalls/syscall.def | 1 + - .../ppc64/plugins/std/syscalls/syscall-ppc64.tbl | 1 + - .../s390/plugins/std/syscalls/syscall-s390.tbl | 1 + - .../arch/x86/plugins/std/syscalls/syscall_32.tbl | 1 + - .../arch/x86/plugins/std/syscalls/syscall_64.tbl | 1 + - criu/config.c | 1 + - criu/cr-dump.c | 14 ++++++++ - criu/cr-restore.c | 26 ++++++++++++++ - criu/crtools.c | 2 ++ - criu/include/cr_options.h | 2 ++ - criu/include/restorer.h | 3 ++ - criu/pie/restorer.c | 38 ++++++++++++++++++++ - criu/pstree.c | 7 ++++ - images/core.proto | 5 +++ - test/zdtm/static/Makefile | 1 + - test/zdtm/static/cpu-affinity0.c | 42 ++++++++++++++++++++++ - test/zdtm/static/cpu-affinity0.desc | 1 + - 17 files changed, 147 insertions(+) + .../arch/arm/plugins/std/syscalls/syscall.def | 1 + + .../plugins/std/syscalls/syscall-ppc64.tbl | 1 + + .../plugins/std/syscalls/syscall-s390.tbl | 1 + + .../x86/plugins/std/syscalls/syscall_32.tbl | 1 + + .../x86/plugins/std/syscalls/syscall_64.tbl | 1 + + criu/config.c | 1 + + criu/cr-dump.c | 13 ++++++ + criu/cr-restore.c | 23 ++++++++++ + criu/crtools.c | 2 + + criu/include/cr_options.h | 2 + + criu/include/restorer.h | 3 ++ + criu/pie/restorer.c | 38 +++++++++++++++++ + criu/pstree.c | 7 ++++ + images/core.proto | 5 +++ + test/zdtm/static/Makefile | 1 + + test/zdtm/static/cpu-affinity0.c | 42 +++++++++++++++++++ + test/zdtm/static/cpu-affinity0.desc | 1 + + 17 files changed, 143 insertions(+) create mode 100644 test/zdtm/static/cpu-affinity0.c create mode 100644 test/zdtm/static/cpu-affinity0.desc diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def -index f7ebc85..d577373 100644 +index e6508ed..1b877d1 100644 --- a/compel/arch/arm/plugins/std/syscalls/syscall.def +++ b/compel/arch/arm/plugins/std/syscalls/syscall.def -@@ -116,3 +116,4 @@ fsopen 430 430 (char *fsname, unsigned int flags) +@@ -116,5 +116,6 @@ fsopen 430 430 (char *fsname, 
unsigned int flags) fsconfig 431 431 (int fd, unsigned int cmd, const char *key, const char *value, int aux) fsmount 432 432 (int fd, unsigned int flags, unsigned int attr_flags) clone3 435 435 (struct clone_args *uargs, size_t size) -+sched_setaffinity 122 241 (int fd, size_t cpusetsize, const cpu_set_t *mask) ++sched_setaffinity 122 241 (int fd, size_t cpusetsize, const cpu_set_t *mask) + pidfd_open 434 434 (pid_t pid, unsigned int flags) + pidfd_getfd 438 438 (int pidfd, int targetfd, unsigned int flags) diff --git a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl -index 1afaf1e..fa64545 100644 +index 1bb626b..dd79187 100644 --- a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl +++ b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl -@@ -112,3 +112,4 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) +@@ -112,5 +112,6 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) -+__NR_sched_setaffinity 222 sys_sched_setaffinity (int fd, size_t cpusetsize, const cpu_set_t *mask) ++__NR_sched_setaffinity 222 sys_sched_setaffinity (int fd, size_t cpusetsize, const cpu_set_t *mask) + __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) + __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) diff --git a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl -index ae6fdb5..16f1994 100644 +index 7178bf4..282adaf 100644 --- a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl +++ b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl -@@ -112,3 +112,4 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) 
+@@ -112,5 +112,6 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) -+__NR_sched_setaffinity 239 sys_sched_setaffinity (int fd, size_t cpusetsize, const cpu_set_t *mask) ++__NR_sched_setaffinity 239 sys_sched_setaffinity (int fd, size_t cpusetsize, const cpu_set_t *mask) + __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) + __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl -index 7a48711..29c13e3 100644 +index 7e456cd..3fe3194 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl @@ -63,6 +63,7 @@ __NR_mincore 218 sys_mincore (void *addr, unsigned long size, unsigned char * @@ -77,7 +83,7 @@ index 7a48711..29c13e3 100644 __NR_get_thread_area 244 sys_get_thread_area (user_desc_t *info) __NR_io_setup 245 sys_io_setup (unsigned nr_reqs, aio_context_t *ctx32p) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl -index 6667c07..74f5482 100644 +index 2dfcc6e..c1d119d 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl @@ -73,6 +73,7 @@ __NR_mount 165 sys_mount (char *dev_nmae, char *dir_name, char *type, unsign @@ -89,22 +95,22 @@ index 6667c07..74f5482 100644 __NR_io_setup 206 sys_io_setup (unsigned nr_events, aio_context_t *ctx) __NR_io_getevents 208 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) diff --git a/criu/config.c b/criu/config.c -index 08606fb..5a53256 100644 +index 
91fb0b6..71f99c9 100644 --- a/criu/config.c +++ b/criu/config.c -@@ -541,6 +541,7 @@ int parse_options(int argc, char **argv, bool *usage_error, - { "cgroup-yard", required_argument, 0, 1096 }, - { "pre-dump-mode", required_argument, 0, 1097}, - { "file-validation", required_argument, 0, 1098 }, +@@ -695,6 +695,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, + { "cgroup-yard", required_argument, 0, 1096 }, + { "pre-dump-mode", required_argument, 0, 1097 }, + { "file-validation", required_argument, 0, 1098 }, + BOOL_OPT("with-cpu-affinity", &opts.with_cpu_affinity), - { }, - }; - + { "lsm-mount-context", required_argument, 0, 1099 }, + { "network-lock", required_argument, 0, 1100 }, + {}, diff --git a/criu/cr-dump.c b/criu/cr-dump.c -index b9d2914..f078c27 100644 +index 940f622..f07fe6e 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c -@@ -140,6 +140,7 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc) +@@ -139,6 +139,7 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc) { int ret; struct sched_param sp; @@ -112,10 +118,10 @@ index b9d2914..f078c27 100644 BUILD_BUG_ON(SCHED_OTHER != 0); /* default in proto message */ -@@ -185,6 +186,19 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc) +@@ -183,6 +184,18 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc) + pr_info("\tdumping %d nice for %d\n", ret, pid); tc->has_sched_nice = true; tc->sched_nice = ret; - + pr_info("\tdumping allowed cpus for %d\n", pid); + ret = syscall(__NR_sched_getaffinity, pid, sizeof(cpumask), &cpumask); + if (ret < 0) { @@ -124,16 +130,15 @@ index b9d2914..f078c27 100644 + } + memcpy(tc->allowed_cpus->cpumask, &cpumask, sizeof(cpu_set_t)); + pr_info("\t 0x%llx, 0x%llx, 0x%llx, 0x%llx\n", -+ (unsigned long long)tc->allowed_cpus->cpumask[3], -+ (unsigned long long)tc->allowed_cpus->cpumask[2], -+ (unsigned long long)tc->allowed_cpus->cpumask[1], -+ (unsigned long long)tc->allowed_cpus->cpumask[0]); -+ ++ (unsigned long 
long)tc->allowed_cpus->cpumask[3], ++ (unsigned long long)tc->allowed_cpus->cpumask[2], ++ (unsigned long long)tc->allowed_cpus->cpumask[1], ++ (unsigned long long)tc->allowed_cpus->cpumask[0]); + return 0; } - diff --git a/criu/cr-restore.c b/criu/cr-restore.c -index 589087f..da2e53d 100644 +index 9d2d957..5b645c1 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -118,6 +118,7 @@ static int prepare_restorer_blob(void); @@ -144,7 +149,7 @@ index 589087f..da2e53d 100644 /* * Architectures can overwrite this function to restore registers that are not -@@ -922,6 +923,9 @@ static int restore_one_alive_task(int pid, CoreEntry *core) +@@ -899,6 +900,9 @@ static int restore_one_alive_task(int pid, CoreEntry *core) if (prepare_signals(pid, ta, core)) return -1; @@ -154,7 +159,7 @@ index 589087f..da2e53d 100644 if (prepare_posix_timers(pid, ta, core)) return -1; -@@ -3196,6 +3200,27 @@ out: +@@ -3153,6 +3157,24 @@ out: return ret; } @@ -168,21 +173,18 @@ index 589087f..da2e53d 100644 + + need_cpu_affinity = rst_mem_alloc(sizeof(int), RM_PRIVATE); + *need_cpu_affinity = opts.with_cpu_affinity; -+ + for (i = 0; i < current->nr_threads; i++) { + cpumaks = rst_mem_alloc(sizeof(cpu_set_t), RM_PRIVATE); + if (!cpumaks) + return -1; -+ + memcpy(cpumaks, current->core[i]->thread_core->allowed_cpus->cpumask, sizeof(cpu_set_t)); + } + return 0; +} -+ extern void __gcov_flush(void) __attribute__((weak)); - void __gcov_flush(void) {} - -@@ -3655,6 +3680,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns + void __gcov_flush(void) + { +@@ -3603,6 +3625,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns RST_MEM_FIXUP_PPTR(task_args->timerfd); RST_MEM_FIXUP_PPTR(task_args->posix_timers); RST_MEM_FIXUP_PPTR(task_args->siginfo); @@ -191,33 +193,33 @@ index 589087f..da2e53d 100644 RST_MEM_FIXUP_PPTR(task_args->helpers); RST_MEM_FIXUP_PPTR(task_args->zombies); diff --git a/criu/crtools.c b/criu/crtools.c -index 
2eb5dba..0f04a85 100644 +index 6a75cd1..b5a36b9 100644 --- a/criu/crtools.c +++ b/criu/crtools.c -@@ -441,6 +441,8 @@ usage: - " --file-validation METHOD\n" - " pass the validation method to be used; argument\n" - " can be 'filesize' or 'buildid' (default).\n" -+" --with-cpu-affinity Allow to restore cpu affinity. Only for hosts with\n" -+" same cpu quantity.\n" - "\n" - "Check options:\n" - " Without options, \"criu check\" checks availability of absolutely required\n" +@@ -445,6 +445,8 @@ usage: + " --file-validation METHOD\n" + " pass the validation method to be used; argument\n" + " can be 'filesize' or 'buildid' (default).\n" ++ " --with-cpu-affinity Allow to restore cpu affinity. Only for hosts with\n" ++ " same cpu quantity.\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h -index ac1c9e9..fda54a4 100644 +index a34f8db..3b50e59 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h -@@ -174,6 +174,8 @@ struct cr_options { +@@ -188,6 +188,8 @@ struct cr_options { /* This stores which method to use for file validation. 
*/ - int file_validation_method; + int file_validation_method; + /* restore cpu affinity */ -+ int with_cpu_affinity; ++ int with_cpu_affinity; }; extern struct cr_options opts; diff --git a/criu/include/restorer.h b/criu/include/restorer.h -index dfb4e6b..bd6ef6a 100644 +index 934d60c..c2ef8f0 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -1,6 +1,7 @@ @@ -229,19 +231,19 @@ index dfb4e6b..bd6ef6a 100644 #include #include @@ -162,6 +163,8 @@ struct task_restore_args { - siginfo_t *siginfo; - unsigned int siginfo_n; + siginfo_t *siginfo; + unsigned int siginfo_n; -+ char *allowed_cpus; ++ char *allowed_cpus; + - struct rst_tcp_sock *tcp_socks; - unsigned int tcp_socks_n; + struct rst_tcp_sock *tcp_socks; + unsigned int tcp_socks_n; diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c -index b3d7e2b..c63f96b 100644 +index 4304691..fbc89fe 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c -@@ -432,6 +432,40 @@ static int restore_signals(siginfo_t *ptr, int nr, bool group) +@@ -425,6 +425,40 @@ static int restore_signals(siginfo_t *ptr, int nr, bool group) return 0; } @@ -265,10 +267,10 @@ index b3d7e2b..c63f96b 100644 + pid = args->thread_args[i].pid; + cpumask = &allowed_cpus[i]; + pr_info("Restoring %d allowed_cpus %llx, %llx, %llx, %llx\n", pid, -+ (unsigned long long)cpumask->__bits[3], -+ (unsigned long long)cpumask->__bits[2], -+ (unsigned long long)cpumask->__bits[1], -+ (unsigned long long)cpumask->__bits[0]); ++ (unsigned long long)cpumask->__bits[3], ++ (unsigned long long)cpumask->__bits[2], ++ (unsigned long long)cpumask->__bits[1], ++ (unsigned long long)cpumask->__bits[0]); + ret = sys_sched_setaffinity(pid, sizeof(cpu_set_t), cpumask); + if (ret) { + pr_err("\t Restore %d cpumask failed.\n", pid); @@ -282,7 +284,7 @@ index b3d7e2b..c63f96b 100644 static int restore_seccomp_filter(pid_t tid, struct thread_restore_args *args) { unsigned int flags = args->seccomp_force_tsync ? 
SECCOMP_FILTER_FLAG_TSYNC : 0; -@@ -1900,6 +1934,10 @@ long __export_restore_task(struct task_restore_args *args) +@@ -1856,6 +1890,10 @@ long __export_restore_task(struct task_restore_args *args) if (ret) goto core_restore_end; @@ -294,7 +296,7 @@ index b3d7e2b..c63f96b 100644 rst_tcp_socks_all(args); diff --git a/criu/pstree.c b/criu/pstree.c -index a876615..f0d7622 100644 +index d5080e5..778c884 100644 --- a/criu/pstree.c +++ b/criu/pstree.c @@ -58,11 +58,13 @@ CoreEntry *core_entry_alloc(int th, int tsk) @@ -312,8 +314,8 @@ index a876615..f0d7622 100644 * @groups are dynamic and allocated * on demand. @@ -127,6 +129,11 @@ CoreEntry *core_entry_alloc(int th, int tsk) - ce->cap_eff = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_eff[0])); - ce->cap_bnd = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_bnd[0])); + ce->cap_eff = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_eff[0])); + ce->cap_bnd = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_bnd[0])); + core->thread_core->allowed_cpus = xptr_pull(&m, ThreadAllowedcpusEntry); + thread_allowedcpus_entry__init(core->thread_core->allowed_cpus); @@ -324,43 +326,43 @@ index a876615..f0d7622 100644 xfree(core); core = NULL; diff --git a/images/core.proto b/images/core.proto -index 9e9e393..2981120 100644 +index b713119..39e7f32 100644 --- a/images/core.proto +++ b/images/core.proto -@@ -81,6 +81,10 @@ message thread_sas_entry { +@@ -83,6 +83,10 @@ message thread_sas_entry { required uint32 ss_flags = 3; } +message thread_allowedcpus_entry { -+ repeated uint64 cpumask = 1; ++ repeated uint64 cpumask = 1; +} + message thread_core_entry { required uint64 futex_rla = 1; required uint32 futex_rla_len = 2; -@@ -99,6 +103,7 @@ message thread_core_entry { +@@ -101,6 +105,7 @@ message thread_core_entry { optional string comm = 13; optional uint64 blk_sigset_extended = 14; -+ required thread_allowedcpus_entry allowed_cpus = 15; ++ required thread_allowedcpus_entry allowed_cpus = 15; } message task_rlimits_entry { diff --git 
a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile -index aae4983..ad8fc6a 100644 +index c9e6589..70123cf 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile -@@ -235,6 +235,7 @@ TST_NOFILE := \ +@@ -246,6 +246,7 @@ TST_NOFILE := \ timens_nested \ timens_for_kids \ zombie_leader \ -+ cpu-affinity0 \ - # jobctl00 \ - - pkg-config-check = $(shell sh -c 'pkg-config $(1) && echo y') ++ cpu-affinity0 \ + sigtrap \ + sigtrap01 \ + change_mnt_context \ diff --git a/test/zdtm/static/cpu-affinity0.c b/test/zdtm/static/cpu-affinity0.c new file mode 100644 -index 0000000..83dee19 +index 0000000..27afe73 --- /dev/null +++ b/test/zdtm/static/cpu-affinity0.c @@ -0,0 +1,42 @@ @@ -382,7 +384,7 @@ index 0000000..83dee19 + + CPU_ZERO(&old); + CPU_ZERO(&new); -+ ++ + /* test only 0 core because of CI test env limited */ + CPU_SET(0, &old); + @@ -414,5 +416,5 @@ index 0000000..0d0b8ae @@ -0,0 +1 @@ +{'dopts': '', 'ropts': '--with-cpu-affinity', 'flags': 'reqrst '} -- -1.8.3.1 +2.27.0 diff --git a/0002-Fix-crit-info-struct-unpack-error.patch b/0002-Fix-crit-info-struct-unpack-error.patch deleted file mode 100644 index 3f6354abc19ba3d5da493d8a12b6ccc5a25bf3c3..0000000000000000000000000000000000000000 --- a/0002-Fix-crit-info-struct-unpack-error.patch +++ /dev/null @@ -1,25 +0,0 @@ -From be4a5e65791d18d1e26d6299e80a65324c5fc07e Mon Sep 17 00:00:00 2001 -From: lingsheng -Date: Tue, 22 Sep 2020 14:39:22 +0800 -Subject: [PATCH 2/6] Fix crit info struct unpack error - ---- - lib/py/images/images.py | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/lib/py/images/images.py b/lib/py/images/images.py -index 9c8e144..c330b97 100644 ---- a/lib/py/images/images.py -+++ b/lib/py/images/images.py -@@ -171,7 +171,7 @@ class entry_handler: - - while True: - buf = f.read(4) -- if buf == '': -+ if len(buf) == 0: - break - size, = struct.unpack('i', buf) - f.seek(size, 1) --- -1.8.3.1 - diff --git 
a/0002-compel-add-rseq-syscall-into-compel-std-plugin-sysca.patch b/0002-compel-add-rseq-syscall-into-compel-std-plugin-sysca.patch new file mode 100644 index 0000000000000000000000000000000000000000..ac103f47572010732534a6125d8f0e6c31d6df8e --- /dev/null +++ b/0002-compel-add-rseq-syscall-into-compel-std-plugin-sysca.patch @@ -0,0 +1,74 @@ +From ee46b1b5755eacf3be02a67934f0dc690293745b Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: Wed, 2 Mar 2022 13:28:51 +0800 +Subject: [PATCH 02/16] compel: add rseq syscall into compel std plugin syscall + tables Add rseq syscall numbers for: arm/aarch64, mips64, ppc64le, s390, + x86_64/x86 + +Signed-off-by: Alexander Mikhalitsyn +--- + compel/arch/arm/plugins/std/syscalls/syscall.def | 1 + + compel/arch/mips/plugins/std/syscalls/syscall_64.tbl | 1 + + .../compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl | 1 + + .../compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl | 1 + + compel/arch/x86/plugins/std/syscalls/syscall_32.tbl | 1 + + compel/arch/x86/plugins/std/syscalls/syscall_64.tbl | 1 + + 6 files changed, 6 insertions(+) + +diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def +index 1b877d1..bb78cbb 100644 +--- a/compel/arch/arm/plugins/std/syscalls/syscall.def ++++ b/compel/arch/arm/plugins/std/syscalls/syscall.def +@@ -119,3 +119,4 @@ clone3 435 435 (struct clone_args *uargs, size_t size) + sched_setaffinity 122 241 (int fd, size_t cpusetsize, const cpu_set_t *mask) + pidfd_open 434 434 (pid_t pid, unsigned int flags) + pidfd_getfd 438 438 (int pidfd, int targetfd, unsigned int flags) ++rseq 293 398 (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +diff --git a/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl b/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl +index 7a6db19..95dc7d3 100644 +--- a/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl ++++ b/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl +@@ -115,3 +115,4 @@ 
__NR_fsmount 5432 sys_fsmount (int fd, unsigned int flags, unsigned int attr + __NR_clone3 5435 sys_clone3 (struct clone_args *uargs, size_t size) + __NR_pidfd_open 5434 sys_pidfd_open (pid_t pid, unsigned int flags) + __NR_pidfd_getfd 5438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) ++__NR_rseq 5327 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +diff --git a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl +index dd79187..ad0d94f 100644 +--- a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl ++++ b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl +@@ -115,3 +115,4 @@ __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) + __NR_sched_setaffinity 222 sys_sched_setaffinity (int fd, size_t cpusetsize, const cpu_set_t *mask) + __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) + __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) ++__NR_rseq 387 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +diff --git a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl +index 282adaf..916b697 100644 +--- a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl ++++ b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl +@@ -115,3 +115,4 @@ __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) + __NR_sched_setaffinity 239 sys_sched_setaffinity (int fd, size_t cpusetsize, const cpu_set_t *mask) + __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) + __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) ++__NR_rseq 383 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl +index 3fe3194..90f23d5 100644 +--- 
a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl ++++ b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl +@@ -103,3 +103,4 @@ __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_f + __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) + __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) + __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) ++__NR_rseq 386 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl +index c1d119d..323fab1 100644 +--- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl ++++ b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl +@@ -114,3 +114,4 @@ __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_ + __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) + __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) + __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) ++__NR_rseq 334 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +-- +2.30.0 + diff --git a/0003-Fix-crit-x-UnicodeDecodeError.patch b/0003-Fix-crit-x-UnicodeDecodeError.patch deleted file mode 100644 index 5c7c50602420d6f541f0f84ec06a959671696821..0000000000000000000000000000000000000000 --- a/0003-Fix-crit-x-UnicodeDecodeError.patch +++ /dev/null @@ -1,25 +0,0 @@ -From 4f139d2803773c86e5cf557c879392e7b79238b3 Mon Sep 17 00:00:00 2001 -From: lingsheng -Date: Tue, 22 Sep 2020 14:40:35 +0800 -Subject: [PATCH 3/6] Fix crit x UnicodeDecodeError - ---- - lib/py/cli.py | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/lib/py/cli.py b/lib/py/cli.py -index 966dd4e..f7bda23 100755 ---- a/lib/py/cli.py -+++ b/lib/py/cli.py -@@ -25,7 +25,7 @@ def outf(opts): - - - def dinf(opts, name): -- return open(os.path.join(opts['dir'], name)) -+ return 
open(os.path.join(opts['dir'], name), 'rb') - - - def decode(opts): --- -1.8.3.1 - diff --git a/0003-kerndat-check-for-rseq-syscall-support.patch b/0003-kerndat-check-for-rseq-syscall-support.patch new file mode 100644 index 0000000000000000000000000000000000000000..1729b14d09452869045758ae768c8503ef075e9f --- /dev/null +++ b/0003-kerndat-check-for-rseq-syscall-support.patch @@ -0,0 +1,62 @@ +From ebd917f395b8bb3c4d6bbe51f9210d1aeca2e1fd Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: Wed, 2 Mar 2022 13:34:10 +0800 +Subject: [PATCH 03/16] kerndat: check for rseq syscall support Signed-off-by: + Alexander Mikhalitsyn + +--- + criu/include/kerndat.h | 1 + + criu/kerndat.c | 18 ++++++++++++++++++ + 2 files changed, 19 insertions(+) + +diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h +index 80bad7f..44a6976 100644 +--- a/criu/include/kerndat.h ++++ b/criu/include/kerndat.h +@@ -74,6 +74,7 @@ struct kerndat_s { + bool has_pidfd_getfd; + bool has_nspid; + bool has_nftables_concat; ++ bool has_rseq; + }; + + extern struct kerndat_s kdat; +diff --git a/criu/kerndat.c b/criu/kerndat.c +index 0e88ba4..f5a4490 100644 +--- a/criu/kerndat.c ++++ b/criu/kerndat.c +@@ -816,6 +816,20 @@ static int kerndat_x86_has_ptrace_fpu_xsave_bug(void) + return 0; + } + ++static int kerndat_has_rseq(void) ++{ ++ if (syscall(__NR_rseq, NULL, 0, 0, 0) != -1) { ++ pr_err("rseq should fail\n"); ++ return -1; ++ } ++ if (errno == ENOSYS) ++ pr_info("rseq syscall isn't supported\n"); ++ else ++ kdat.has_rseq = true; ++ ++ return 0; ++} ++ + #define KERNDAT_CACHE_FILE KDAT_RUNDIR "/criu.kdat" + #define KERNDAT_CACHE_FILE_TMP KDAT_RUNDIR "/.criu.kdat" + +@@ -1360,6 +1374,10 @@ int kerndat_init(void) + ret = -1; + } + ++ if (!ret && kerndat_has_rseq()) { ++ pr_err("kerndat_has_rseq failed when initializing kerndat.\n"); ++ ret = -1; ++ } + kerndat_lsm(); + kerndat_mmap_min_addr(); + kerndat_files_stat(); +-- +2.30.0 + diff --git 
a/0004-util-move-fork_and_ptrace_attach-helper-from-cr-chec.patch b/0004-util-move-fork_and_ptrace_attach-helper-from-cr-chec.patch new file mode 100644 index 0000000000000000000000000000000000000000..51457c6e87f7694594b96fc58597bfcf58eb0d14 --- /dev/null +++ b/0004-util-move-fork_and_ptrace_attach-helper-from-cr-chec.patch @@ -0,0 +1,161 @@ +From fe1f84eb98092b1aff60ae2be11e351b165f3f43 Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: Wed, 2 Mar 2022 13:35:53 +0800 +Subject: [PATCH 04/16] util: move fork_and_ptrace_attach helper from cr-check + Signed-off-by: Alexander Mikhalitsyn + +--- + criu/cr-check.c | 55 ------------------------------- + criu/include/util.h | 1 + + criu/util.c | 57 +++++++++++++++++++++++++++++++++ + 3 files changed, 58 insertions(+), 55 deletions(-) + +diff --git a/criu/cr-check.c b/criu/cr-check.c +index 3575fb3..d41ef8f 100644 +--- a/criu/cr-check.c ++++ b/criu/cr-check.c +@@ -537,61 +537,6 @@ static int check_sigqueuinfo(void) + return 0; + } + +-static pid_t fork_and_ptrace_attach(int (*child_setup)(void)) +-{ +- pid_t pid; +- int sk_pair[2], sk; +- char c = 0; +- +- if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) { +- pr_perror("socketpair"); +- return -1; +- } +- +- pid = fork(); +- if (pid < 0) { +- pr_perror("fork"); +- return -1; +- } else if (pid == 0) { +- sk = sk_pair[1]; +- close(sk_pair[0]); +- +- if (child_setup && child_setup() != 0) +- exit(1); +- +- if (write(sk, &c, 1) != 1) { +- pr_perror("write"); +- exit(1); +- } +- +- while (1) +- sleep(1000); +- exit(1); +- } +- +- sk = sk_pair[0]; +- close(sk_pair[1]); +- +- if (read(sk, &c, 1) != 1) { +- close(sk); +- kill(pid, SIGKILL); +- pr_perror("read"); +- return -1; +- } +- +- close(sk); +- +- if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) == -1) { +- pr_perror("Unable to ptrace the child"); +- kill(pid, SIGKILL); +- return -1; +- } +- +- waitpid(pid, NULL, 0); +- +- return pid; +-} +- + static int check_ptrace_peeksiginfo(void) + { + struct ptrace_peeksiginfo_args arg; +diff 
--git a/criu/include/util.h b/criu/include/util.h +index a2dac22..1c0b3c7 100644 +--- a/criu/include/util.h ++++ b/criu/include/util.h +@@ -166,6 +166,7 @@ extern int is_anon_link_type(char *link, char *type); + + extern int cr_system(int in, int out, int err, char *cmd, char *const argv[], unsigned flags); + extern int cr_system_userns(int in, int out, int err, char *cmd, char *const argv[], unsigned flags, int userns_pid); ++extern pid_t fork_and_ptrace_attach(int (*child_setup)(void)); + extern int cr_daemon(int nochdir, int noclose, int close_fd); + extern int status_ready(void); + extern int is_root_user(void); +diff --git a/criu/util.c b/criu/util.c +index 06124c2..e682161 100644 +--- a/criu/util.c ++++ b/criu/util.c +@@ -654,6 +654,63 @@ out: + return ret; + } + ++pid_t fork_and_ptrace_attach(int (*child_setup)(void)) ++{ ++ pid_t pid; ++ int sk_pair[2], sk; ++ char c = 0; ++ ++ if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) { ++ pr_perror("socketpair"); ++ return -1; ++ } ++ ++ pid = fork(); ++ if (pid < 0) { ++ pr_perror("fork"); ++ return -1; ++ } else if (pid == 0) { ++ sk = sk_pair[1]; ++ close(sk_pair[0]); ++ ++ if (child_setup && child_setup() != 0) ++ exit(1); ++ ++ if (write(sk, &c, 1) != 1) { ++ pr_perror("write"); ++ exit(1); ++ } ++ ++ while (1) ++ sleep(1000); ++ exit(1); ++ } ++ ++ sk = sk_pair[0]; ++ close(sk_pair[1]); ++ ++ if (read(sk, &c, 1) != 1) { ++ close(sk); ++ kill(pid, SIGKILL); ++ waitpid(pid, NULL, 0); ++ pr_perror("read"); ++ return -1; ++ } ++ ++ close(sk); ++ ++ if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) == -1) { ++ pr_perror("Unable to ptrace the child"); ++ kill(pid, SIGKILL); ++ waitpid(pid, NULL, 0); ++ return -1; ++ } ++ ++ waitpid(pid, NULL, 0); ++ ++ return pid; ++} ++ + int status_ready(void) + { + char c = 0; +-- +2.30.0 + diff --git a/0005-cr-check-Add-ptrace-rseq-conf-dump-feature.patch b/0005-cr-check-Add-ptrace-rseq-conf-dump-feature.patch new file mode 100644 index 
0000000000000000000000000000000000000000..5a82e084015fc162a56423e55359a83ac23991af --- /dev/null +++ b/0005-cr-check-Add-ptrace-rseq-conf-dump-feature.patch @@ -0,0 +1,162 @@ +From 3c567693f2e6579109dbabcca0e90c059ce5af25 Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: Wed, 2 Mar 2022 15:30:18 +0800 +Subject: [PATCH 05/16] cr-check: Add ptrace rseq conf dump feature Add + "get_rseq_conf" feature corresponding to the + ptrace(PTRACE_GET_RSEQ_CONFIGURATION) support. + +Signed-off-by: Alexander Mikhalitsyn +--- + compel/include/uapi/ptrace.h | 12 +++++++ + criu/cr-check.c | 11 +++++++ + criu/include/kerndat.h | 1 + + criu/kerndat.c | 41 ++++++++++++++++++++++++ + 4 files changed, 65 insertions(+) + +diff --git a/compel/include/uapi/ptrace.h b/compel/include/uapi/ptrace.h +index c5291d2..bfe28c7 100644 +--- a/compel/include/uapi/ptrace.h ++++ b/compel/include/uapi/ptrace.h +@@ -65,6 +65,18 @@ typedef struct { + uint64_t flags; /* Output: filter's flags */ + } seccomp_metadata_t; + ++#ifndef PTRACE_GET_RSEQ_CONFIGURATION ++#define PTRACE_GET_RSEQ_CONFIGURATION 0x420f ++ ++struct ptrace_rseq_configuration { ++ __u64 rseq_abi_pointer; ++ __u32 rseq_abi_size; ++ __u32 signature; ++ __u32 flags; ++ __u32 pad; ++}; ++#endif ++ + #ifdef PTRACE_EVENT_STOP + #if PTRACE_EVENT_STOP == 7 /* Bad value from Linux 3.1-3.3, fixed in 3.4 */ + #undef PTRACE_EVENT_STOP +diff --git a/criu/cr-check.c b/criu/cr-check.c +index d41ef8f..ba87511 100644 +--- a/criu/cr-check.c ++++ b/criu/cr-check.c +@@ -794,6 +794,15 @@ static int check_ptrace_dump_seccomp_filters(void) + return ret; + } + ++static int check_ptrace_get_rseq_conf(void) ++{ ++ if (!kdat.has_ptrace_get_rseq_conf) { ++ pr_warn("ptrace(PTRACE_GET_RSEQ_CONFIGURATION) isn't supported. 
C/R of processes which are using rseq() won't work.\n"); ++ return -1; ++ } ++ return 0; ++} ++ + static int check_mem_dirty_track(void) + { + if (!kdat.has_dirty_track) { +@@ -1435,6 +1444,7 @@ int cr_check(void) + ret |= check_ns_pid(); + ret |= check_apparmor_stacking(); + ret |= check_network_lock_nftables(); ++ ret |= check_ptrace_get_rseq_conf(); + } + + /* +@@ -1547,6 +1557,7 @@ static struct feature_list feature_list[] = { + { "ns_pid", check_ns_pid }, + { "apparmor_stacking", check_apparmor_stacking }, + { "network_lock_nftables", check_network_lock_nftables }, ++ { "get_rseq_conf", check_ptrace_get_rseq_conf }, + { NULL, NULL }, + }; + +diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h +index 44a6976..05abeda 100644 +--- a/criu/include/kerndat.h ++++ b/criu/include/kerndat.h +@@ -75,6 +75,7 @@ struct kerndat_s { + bool has_nspid; + bool has_nftables_concat; + bool has_rseq; ++ bool has_ptrace_get_rseq_conf; + }; + + extern struct kerndat_s kdat; +diff --git a/criu/kerndat.c b/criu/kerndat.c +index f5a4490..4841387 100644 +--- a/criu/kerndat.c ++++ b/criu/kerndat.c +@@ -4,6 +4,8 @@ + #include + #include + #include ++#include ++#include + #include + #include + #include +@@ -36,6 +38,7 @@ + #include "sockets.h" + #include "net.h" + #include "tun.h" ++#include + #include + #include "netfilter.h" + #include "fsnotify.h" +@@ -830,6 +833,40 @@ static int kerndat_has_rseq(void) + return 0; + } + ++static int kerndat_has_ptrace_get_rseq_conf(void) ++{ ++ pid_t pid; ++ int len; ++ struct ptrace_rseq_configuration rseq; ++ ++ pid = fork_and_ptrace_attach(NULL); ++ if (pid < 0) ++ return -1; ++ ++ len = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, pid, sizeof(rseq), &rseq); ++ if (len != sizeof(rseq)) { ++ kdat.has_ptrace_get_rseq_conf = false; ++ pr_info("ptrace(PTRACE_GET_RSEQ_CONFIGURATION) is not supported\n"); ++ goto out; ++ } ++ ++ /* ++ * flags is always zero from the kernel side, if it will be changed ++ * we need to pay attention to that and, possibly, 
make changes on the CRIU side. ++ */ ++ if (rseq.flags != 0) { ++ kdat.has_ptrace_get_rseq_conf = false; ++ pr_err("ptrace(PTRACE_GET_RSEQ_CONFIGURATION): rseq.flags != 0\n"); ++ } else { ++ kdat.has_ptrace_get_rseq_conf = true; ++ } ++ ++out: ++ kill(pid, SIGKILL); ++ waitpid(pid, NULL, 0); ++ return 0; ++} ++ + #define KERNDAT_CACHE_FILE KDAT_RUNDIR "/criu.kdat" + #define KERNDAT_CACHE_FILE_TMP KDAT_RUNDIR "/.criu.kdat" + +@@ -1378,6 +1415,10 @@ int kerndat_init(void) + pr_err("kerndat_has_rseq failed when initializing kerndat.\n"); + ret = -1; + } ++ if (!ret && kerndat_has_ptrace_get_rseq_conf()) { ++ pr_err("kerndat_has_ptrace_get_rseq_conf failed when initializing kerndat.\n"); ++ ret = -1; ++ } + kerndat_lsm(); + kerndat_mmap_min_addr(); + kerndat_files_stat(); +-- +2.30.0 + diff --git a/0005-vdso-fix-segmentation-fault-caused-by-char-pointer-a.patch b/0005-vdso-fix-segmentation-fault-caused-by-char-pointer-a.patch deleted file mode 100644 index 6935cc6d2340ef269cf6ee9fe54f2e5197bcff94..0000000000000000000000000000000000000000 --- a/0005-vdso-fix-segmentation-fault-caused-by-char-pointer-a.patch +++ /dev/null @@ -1,193 +0,0 @@ -From 1c34f736caefd92ed6e131c5a4eb1699e2a24e35 Mon Sep 17 00:00:00 2001 -From: anatasluo -Date: Fri, 29 Jan 2021 13:48:57 +0000 -Subject: [PATCH 5/6] vdso: fix segmentation fault caused by char pointer array - -When I compile criu with "make DEBUG=1" and run it to restore my -program, it produces a segmentation fault. - -In aarch64, with compile flag "-O0", when criu executes the code in pie, -it is unable to visit the content of ARCH_VDSO_SYMBOLS. So I put these -variables into the stack. 
- -Signed-off-by: anatasluo ---- - criu/arch/aarch64/include/asm/vdso.h | 17 +++++++++-------- - criu/arch/arm/include/asm/vdso.h | 9 ++++++--- - criu/arch/ppc64/include/asm/vdso.h | 34 +++++++++++++++++++++++----------- - criu/arch/s390/include/asm/vdso.h | 17 +++++++++++------ - criu/arch/x86/include/asm/vdso.h | 23 ++++++++++++++++------- - criu/pie/util-vdso.c | 2 ++ - 6 files changed, 67 insertions(+), 35 deletions(-) - -diff --git a/criu/arch/aarch64/include/asm/vdso.h b/criu/arch/aarch64/include/asm/vdso.h -index 8a65e09..97a2440 100644 ---- a/criu/arch/aarch64/include/asm/vdso.h -+++ b/criu/arch/aarch64/include/asm/vdso.h -@@ -16,15 +16,16 @@ - * Workaround for VDSO array symbol table's relocation. - * XXX: remove when compel/piegen will support aarch64. - */ --static const char* __maybe_unused aarch_vdso_symbol1 = "__kernel_clock_getres"; --static const char* __maybe_unused aarch_vdso_symbol2 = "__kernel_clock_gettime"; --static const char* __maybe_unused aarch_vdso_symbol3 = "__kernel_gettimeofday"; --static const char* __maybe_unused aarch_vdso_symbol4 = "__kernel_rt_sigreturn"; -+#define ARCH_VDSO_SYMBOLS_LIST \ -+ const char* aarch_vdso_symbol1 = "__kernel_clock_getres"; \ -+ const char* aarch_vdso_symbol2 = "__kernel_clock_gettime"; \ -+ const char* aarch_vdso_symbol3 = "__kernel_gettimeofday"; \ -+ const char* aarch_vdso_symbol4 = "__kernel_rt_sigreturn"; - --#define ARCH_VDSO_SYMBOLS \ -- aarch_vdso_symbol1, \ -- aarch_vdso_symbol2, \ -- aarch_vdso_symbol3, \ -+#define ARCH_VDSO_SYMBOLS \ -+ aarch_vdso_symbol1, \ -+ aarch_vdso_symbol2, \ -+ aarch_vdso_symbol3, \ - aarch_vdso_symbol4 - - extern void write_intraprocedure_branch(unsigned long to, unsigned long from); -diff --git a/criu/arch/arm/include/asm/vdso.h b/criu/arch/arm/include/asm/vdso.h -index f57790a..e96514e 100644 ---- a/criu/arch/arm/include/asm/vdso.h -+++ b/criu/arch/arm/include/asm/vdso.h -@@ -11,8 +11,11 @@ - */ - #define VDSO_SYMBOL_MAX 2 - #define VDSO_SYMBOL_GTOD 1 --#define 
ARCH_VDSO_SYMBOLS \ -- "__vdso_clock_gettime", \ -- "__vdso_gettimeofday" -+#define ARCH_VDSO_SYMBOLS_LIST \ -+ const char* aarch_vdso_symbol1 = "__vdso_clock_gettime"; \ -+ const char* aarch_vdso_symbol2 = "__vdso_gettimeofday"; -+#define ARCH_VDSO_SYMBOLS \ -+ aarch_vdso_symbol1, \ -+ aarch_vdso_symbol2, - - #endif /* __CR_ASM_VDSO_H__ */ -diff --git a/criu/arch/ppc64/include/asm/vdso.h b/criu/arch/ppc64/include/asm/vdso.h -index 6c92348..fe04336 100644 ---- a/criu/arch/ppc64/include/asm/vdso.h -+++ b/criu/arch/ppc64/include/asm/vdso.h -@@ -14,16 +14,28 @@ - */ - #define VDSO_SYMBOL_MAX 10 - #define VDSO_SYMBOL_GTOD 5 --#define ARCH_VDSO_SYMBOLS \ -- "__kernel_clock_getres", \ -- "__kernel_clock_gettime", \ -- "__kernel_get_syscall_map", \ -- "__kernel_get_tbfreq", \ -- "__kernel_getcpu", \ -- "__kernel_gettimeofday", \ -- "__kernel_sigtramp_rt64", \ -- "__kernel_sync_dicache", \ -- "__kernel_sync_dicache_p5", \ -- "__kernel_time" -+#define ARCH_VDSO_SYMBOLS_LIST \ -+ const char* aarch_vdso_symbol1 = "__kernel_clock_getres"; \ -+ const char* aarch_vdso_symbol2 = "__kernel_clock_gettime"; \ -+ const char* aarch_vdso_symbol3 = "__kernel_get_syscall_map"; \ -+ const char* aarch_vdso_symbol4 = "__kernel_get_tbfreq"; \ -+ const char* aarch_vdso_symbol5 = "__kernel_getcpu"; \ -+ const char* aarch_vdso_symbol6 = "__kernel_gettimeofday"; \ -+ const char* aarch_vdso_symbol7 = "__kernel_sigtramp_rt64"; \ -+ const char* aarch_vdso_symbol8 = "__kernel_sync_dicache"; \ -+ const char* aarch_vdso_symbol9 = "__kernel_sync_dicache_p5"; \ -+ const char* aarch_vdso_symbol10 = "__kernel_time"; -+ -+#define ARCH_VDSO_SYMBOLS \ -+ aarch_vdso_symbol1, \ -+ aarch_vdso_symbol2, \ -+ aarch_vdso_symbol3, \ -+ aarch_vdso_symbol4, \ -+ aarch_vdso_symbol5, \ -+ aarch_vdso_symbol6, \ -+ aarch_vdso_symbol7, \ -+ aarch_vdso_symbol8, \ -+ aarch_vdso_symbol9, \ -+ aarch_vdso_symbol10 - - #endif /* __CR_ASM_VDSO_H__ */ -diff --git a/criu/arch/s390/include/asm/vdso.h 
b/criu/arch/s390/include/asm/vdso.h -index c54d848..ac71f59 100644 ---- a/criu/arch/s390/include/asm/vdso.h -+++ b/criu/arch/s390/include/asm/vdso.h -@@ -12,13 +12,18 @@ - #define VDSO_SYMBOL_GTOD 0 - - /* -- * This definition is used in pie/util-vdso.c to initialize the vdso symbol -+ * These definitions are used in pie/util-vdso.c to initialize the vdso symbol - * name string table 'vdso_symbols' - */ --#define ARCH_VDSO_SYMBOLS \ -- "__kernel_gettimeofday", \ -- "__kernel_clock_gettime", \ -- "__kernel_clock_getres", \ -- "__kernel_getcpu" -+#define ARCH_VDSO_SYMBOLS_LIST \ -+ const char* aarch_vdso_symbol1 = "__kernel_gettimeofday"; \ -+ const char* aarch_vdso_symbol2 = "__kernel_clock_gettime"; \ -+ const char* aarch_vdso_symbol3 = "__kernel_clock_getres"; \ -+ const char* aarch_vdso_symbol4 = "__kernel_getcpu"; -+#define ARCH_VDSO_SYMBOLS \ -+ aarch_vdso_symbol1, \ -+ aarch_vdso_symbol2, \ -+ aarch_vdso_symbol3, \ -+ aarch_vdso_symbol4 - - #endif /* __CR_ASM_VDSO_H__ */ -diff --git a/criu/arch/x86/include/asm/vdso.h b/criu/arch/x86/include/asm/vdso.h -index 28ae2d1..54d1fba 100644 ---- a/criu/arch/x86/include/asm/vdso.h -+++ b/criu/arch/x86/include/asm/vdso.h -@@ -35,13 +35,22 @@ - * vsyscall will be patched again when addressing: - * https://github.com/checkpoint-restore/criu/issues/512 - */ --#define ARCH_VDSO_SYMBOLS \ -- "__vdso_clock_gettime", \ -- "__vdso_getcpu", \ -- "__vdso_gettimeofday", \ -- "__vdso_time", \ -- "__kernel_sigreturn", \ -- "__kernel_rt_sigreturn" -+ -+#define ARCH_VDSO_SYMBOLS_LIST \ -+ const char* aarch_vdso_symbol1 = "__vdso_clock_gettime"; \ -+ const char* aarch_vdso_symbol2 = "__vdso_getcpu"; \ -+ const char* aarch_vdso_symbol3 = "__vdso_gettimeofday"; \ -+ const char* aarch_vdso_symbol4 = "__vdso_time"; \ -+ const char* aarch_vdso_symbol5 = "__kernel_sigreturn"; \ -+ const char* aarch_vdso_symbol6 = "__kernel_rt_sigreturn"; -+ -+#define ARCH_VDSO_SYMBOLS \ -+ aarch_vdso_symbol1, \ -+ aarch_vdso_symbol2, \ -+ aarch_vdso_symbol3, 
\ -+ aarch_vdso_symbol4, \ -+ aarch_vdso_symbol5, \ -+ aarch_vdso_symbol6 - - /* "__kernel_vsyscall", */ - -diff --git a/criu/pie/util-vdso.c b/criu/pie/util-vdso.c -index 58b2768..c717f2d 100644 ---- a/criu/pie/util-vdso.c -+++ b/criu/pie/util-vdso.c -@@ -219,6 +219,8 @@ static void parse_elf_symbols(uintptr_t mem, size_t size, Phdr_t *load, - struct vdso_symtable *t, uintptr_t dynsymbol_names, - Hash_t *hash, Dyn_t *dyn_symtab) - { -+ ARCH_VDSO_SYMBOLS_LIST -+ - const char *vdso_symbols[VDSO_SYMBOL_MAX] = { - ARCH_VDSO_SYMBOLS - }; --- -1.8.3.1 - diff --git a/0006-criu-add-pin-memory-method.patch b/0006-criu-add-pin-memory-method.patch deleted file mode 100644 index e29a0e6a7584a68fa829ca6abc5b045af8c0c4e8..0000000000000000000000000000000000000000 --- a/0006-criu-add-pin-memory-method.patch +++ /dev/null @@ -1,268 +0,0 @@ -From 4c11832330e6c7b924b96c7ea70c14025fe0d970 Mon Sep 17 00:00:00 2001 -From: "fu.lin" -Date: Tue, 13 Apr 2021 14:10:23 +0800 -Subject: [PATCH 6/6] criu: add pin memory method - -We can use the checkpoint and restore in userspace method to dump -and restore tasks when updating the kernel. Currently, criu needs -dump all memory data of tasks to files. When the memory size is -very large (large than 1GiB), the cost time of the dumping data -will be very long (more than 1 min). - -We can pin the memory data of tasks and collect the corresponding -physical pages mapping info in checkpoint process, and remap the -physical pages to restore tasks in restore process. 
- -Signed-off-by: Jingxian He ---- - criu/config.c | 1 + - criu/cr-restore.c | 5 +++ - criu/include/cr_options.h | 1 + - criu/include/restorer.h | 24 ++++++++++++ - criu/mem.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++- - criu/pie/restorer.c | 21 ++++++++++- - 6 files changed, 146 insertions(+), 2 deletions(-) - -diff --git a/criu/config.c b/criu/config.c -index 5a53256..61b81fa 100644 ---- a/criu/config.c -+++ b/criu/config.c -@@ -542,6 +542,7 @@ int parse_options(int argc, char **argv, bool *usage_error, - { "pre-dump-mode", required_argument, 0, 1097}, - { "file-validation", required_argument, 0, 1098 }, - BOOL_OPT("with-cpu-affinity", &opts.with_cpu_affinity), -+ BOOL_OPT("pin-memory", &opts.pin_memory), - { }, - }; - -diff --git a/criu/cr-restore.c b/criu/cr-restore.c -index da2e53d..ff41976 100644 ---- a/criu/cr-restore.c -+++ b/criu/cr-restore.c -@@ -3866,6 +3866,11 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns - task_args->clone_restore_fn, - task_args->thread_args); - -+ if (opts.pin_memory) -+ task_args->pin_memory = true; -+ else -+ task_args->pin_memory = false; -+ - /* - * An indirect call to task_restore, note it never returns - * and restoring core is extremely destructive. 
-diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h -index fda54a4..a4dc5b8 100644 ---- a/criu/include/cr_options.h -+++ b/criu/include/cr_options.h -@@ -176,6 +176,7 @@ struct cr_options { - int file_validation_method; - /* restore cpu affinity */ - int with_cpu_affinity; -+ int pin_memory; - }; - - extern struct cr_options opts; -diff --git a/criu/include/restorer.h b/criu/include/restorer.h -index bd6ef6a..fc37e6d 100644 ---- a/criu/include/restorer.h -+++ b/criu/include/restorer.h -@@ -225,6 +225,7 @@ struct task_restore_args { - int lsm_type; - int child_subreaper; - bool has_clone3_set_tid; -+ bool pin_memory; - } __aligned(64); - - /* -@@ -317,4 +318,27 @@ enum { - #define __r_sym(name) restorer_sym ## name - #define restorer_sym(rblob, name) (void*)(rblob + __r_sym(name)) - -+#define PIN_MEM_FILE "/dev/pinmem" -+#define PIN_MEM_MAGIC 0x59 -+#define _SET_PIN_MEM_AREA 1 -+#define _CLEAR_PIN_MEM_AREA 2 -+#define _REMAP_PIN_MEM_AREA 3 -+#define _PIN_MEM_IOC_MAX_NR 4 -+#define SET_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _SET_PIN_MEM_AREA, struct pin_mem_area_set) -+#define CLEAR_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _CLEAR_PIN_MEM_AREA, int) -+#define REMAP_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _REMAP_PIN_MEM_AREA, int) -+ -+#define ONCE_PIN_MEM_SIZE_LIMIT 32 * 1024 * 1024 -+#define MAX_PIN_MEM_AREA_NUM 16 -+struct pin_mem_area { -+ unsigned long virt_start; -+ unsigned long virt_end; -+}; -+ -+struct pin_mem_area_set { -+ unsigned int pid; -+ unsigned int area_num; -+ struct pin_mem_area mem_area[MAX_PIN_MEM_AREA_NUM]; -+}; -+ - #endif /* __CR_RESTORER_H__ */ -diff --git a/criu/mem.c b/criu/mem.c -index 167838b..709de4e 100644 ---- a/criu/mem.c -+++ b/criu/mem.c -@@ -438,6 +438,88 @@ again: - return ret; - } - -+bool should_pin_vmae(VmaEntry *vmae) -+{ -+ /* -+ * vDSO area must be always dumped because on restore -+ * we might need to generate a proxy. 
-+ */ -+ if (vma_entry_is(vmae, VMA_AREA_VDSO)) -+ return false; -+ /* -+ * In turn VVAR area is special and referenced from -+ * vDSO area by IP addressing (at least on x86) thus -+ * never ever dump its content but always use one provided -+ * by the kernel on restore, ie runtime VVAR area must -+ * be remapped into proper place.. -+ */ -+ if (vma_entry_is(vmae, VMA_AREA_VVAR)) -+ return false; -+ -+ if (vma_entry_is(vmae, VMA_AREA_AIORING)) -+ return false; -+ if (vma_entry_is(vmae, VMA_ANON_PRIVATE)) { -+ pr_debug("find private anon vma: %lx-%lx\n", vmae->start, vmae->end); -+ return true; -+ } -+ -+ return false; -+} -+ -+static int pin_one_pmas(int fd, unsigned long start, -+ unsigned long *pend, struct pstree_item *item) -+{ -+ int ret; -+ unsigned int index = 0; -+ unsigned long end; -+ unsigned long next = start; -+ struct pin_mem_area_set pmas; -+ struct pin_mem_area *pma; -+ -+ end = *pend; -+ while (start < end) { -+ next = (start + ONCE_PIN_MEM_SIZE_LIMIT > end) ? end : (start + ONCE_PIN_MEM_SIZE_LIMIT); -+ pma = &(pmas.mem_area[index]); -+ pma->virt_start = start; -+ pma->virt_end = next; -+ pr_info("start pin %lx-%lx\n", start, next); -+ index++; -+ start += ONCE_PIN_MEM_SIZE_LIMIT; -+ if (index >= MAX_PIN_MEM_AREA_NUM) -+ break; -+ } -+ *pend = next; -+ pmas.area_num = index; -+ pmas.pid = vpid(item); -+ pr_info("begin pin memory for pid:%d\n", pmas.pid); -+ ret = ioctl(fd, SET_PIN_MEM_AREA, &pmas); -+ if (ret < 0) -+ pr_err("pin mem fail, errno: %s\n", strerror(errno)); -+ return ret; -+} -+static int pin_vmae(VmaEntry *vmae, struct pstree_item *item) -+{ -+ int fd; -+ int ret = 0; -+ unsigned long start, end; -+ -+ fd = open(PIN_MEM_FILE, O_RDWR); -+ if (fd < 0) { -+ pr_err("open file: %s fail.\n", PIN_MEM_FILE); -+ return -1; -+ } -+ start = vmae->start; -+ while (start < vmae->end) { -+ end = vmae->end; -+ ret = pin_one_pmas(fd, start, &end, item); -+ if (ret < 0) -+ break; -+ start = end; -+ } -+ close(fd); -+ return ret; -+} -+ - static int 
__parasite_dump_pages_seized(struct pstree_item *item, - struct parasite_dump_pages_args *args, - struct vm_area_list *vma_area_list, -@@ -513,7 +595,16 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, - if (possible_pid_reuse == -1) - goto out_xfer; - } -- -+ if (opts.pin_memory) { -+ /* pin memory before dump pages */ -+ list_for_each_entry(vma_area, &vma_area_list->h, list) { -+ if (should_pin_vmae(vma_area->e)) { -+ ret = pin_vmae(vma_area->e, item); -+ if (ret) -+ goto out_xfer; -+ } -+ } -+ } - - /* - * Step 1 -- generate the pagemap -@@ -524,6 +615,9 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, - parent_predump_mode = mdc->parent_ie->pre_dump_mode; - - list_for_each_entry(vma_area, &vma_area_list->h, list) { -+ if (opts.pin_memory && should_pin_vmae(vma_area->e)) -+ continue; -+ - ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl, - &pmc, has_parent, mdc->pre_dump, - parent_predump_mode); -diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c -index c63f96b..f3bd541 100644 ---- a/criu/pie/restorer.c -+++ b/criu/pie/restorer.c -@@ -1414,6 +1414,24 @@ int cleanup_current_inotify_events(struct task_restore_args *task_args) - return 0; - } - -+int remap_vmas(int pid) -+{ -+ int fd, ret = 0; -+ -+ fd = sys_open(PIN_MEM_FILE, O_RDWR, 0); -+ if (fd == -1) { -+ pr_err("open file: %s fail.\n", PIN_MEM_FILE); -+ return -1;; -+ } -+ -+ ret = sys_ioctl(fd, REMAP_PIN_MEM_AREA, (unsigned long) &pid); -+ if (ret < 0) -+ pr_err("remap pin mem fail for pid: %d\n", pid); -+ sys_close(fd); -+ return ret; -+} -+ -+ - /* - * The main routine to restore task via sigreturn. 
- * This one is very special, we never return there -@@ -1585,7 +1603,8 @@ long __export_restore_task(struct task_restore_args *args) - goto core_restore_end; - } - } -- -+ if (args->pin_memory) -+ remap_vmas(my_pid); - /* - * Now read the contents (if any) - */ --- -1.8.3.1 - diff --git a/0006-rseq-initial-support.patch b/0006-rseq-initial-support.patch new file mode 100644 index 0000000000000000000000000000000000000000..4c6898552bdd29bfe4bdc0259fe8eefcb9f531ad --- /dev/null +++ b/0006-rseq-initial-support.patch @@ -0,0 +1,702 @@ +From e444c089ebfb03fb2b6d69a40322d31ab33c0597 Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: Wed, 2 Mar 2022 14:52:35 +0800 +Subject: [PATCH 06/16] rseq: initial support TODO: 1. properly handle case + when the kernel has rseq() support but has no + ptrace(PTRACE_GET_RSEQ_CONFIGURATION) support and user processes haven't used + rseq(). 2. properly handle "transient" states, when CRIU comes during rseq + was executed. We need test for this case with some "heavy" rseq + we need to + properly handle RSEQ_CS_* flags. 
+ +Fixes: #1696 + +Reported-by: Radostin Stoyanov +Suggested-by: Florian Weimer +Signed-off-by: Alexander Mikhalitsyn +--- + compel/include/uapi/ptrace.h | 16 +-- + criu/cr-dump.c | 99 ++++++++++++++++ + criu/cr-restore.c | 17 +++ + criu/include/linux/rseq.h | 144 +++++++++++++++++++++++ + criu/include/parasite.h | 7 ++ + criu/include/restorer.h | 7 ++ + criu/kerndat.c | 2 +- + criu/parasite-syscall.c | 11 ++ + criu/pie/parasite.c | 99 ++++++++++++++++ + criu/pie/restorer.c | 24 ++++ + images/Makefile | 1 + + images/core.proto | 2 + + images/rseq.proto | 9 ++ + 13 files changed, 429 insertions(+), 9 deletions(-) + create mode 100644 criu/include/linux/rseq.h + create mode 100644 images/rseq.proto + +diff --git a/compel/include/uapi/ptrace.h b/compel/include/uapi/ptrace.h +index bfe28c7..d807a92 100644 +--- a/compel/include/uapi/ptrace.h ++++ b/compel/include/uapi/ptrace.h +@@ -66,14 +66,14 @@ typedef struct { + } seccomp_metadata_t; + + #ifndef PTRACE_GET_RSEQ_CONFIGURATION +-#define PTRACE_GET_RSEQ_CONFIGURATION 0x420f +- +-struct ptrace_rseq_configuration { +- __u64 rseq_abi_pointer; +- __u32 rseq_abi_size; +- __u32 signature; +- __u32 flags; +- __u32 pad; ++#define PTRACE_GET_RSEQ_CONFIGURATION 0x420f ++ ++struct __ptrace_rseq_configuration { ++ uint64_t rseq_abi_pointer; ++ uint32_t rseq_abi_size; ++ uint32_t signature; ++ uint32_t flags; ++ uint32_t pad; + }; + #endif + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index f07fe6e..91dd08a 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -45,6 +45,7 @@ + #include "proc_parse.h" + #include "parasite.h" + #include "parasite-syscall.h" ++#include + #include "files.h" + #include "files-reg.h" + #include "shmem.h" +@@ -200,6 +201,25 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc) + return 0; + } + ++static int check_thread_rseq(pid_t tid, const struct parasite_check_rseq *ti_rseq, bool has_tc_rseq_entry) ++{ ++ if (!kdat.has_rseq || kdat.has_ptrace_get_rseq_conf) ++ return 0; ++ ++ pr_debug("%d has 
rseq_inited = %d\n", tid, ti_rseq->rseq_inited); ++ ++ /* ++ * We have no kdat.has_ptrace_get_rseq_conf and user ++ * process has rseq() used, let's fail dump. ++ */ ++ if (ti_rseq->rseq_inited) { ++ pr_err("%d has rseq but kernel lacks get_rseq_conf feature\n", tid); ++ return -1; ++ } ++ ++ return 0; ++} ++ + struct cr_imgset *glob_imgset; + + static int collect_fds(pid_t pid, struct parasite_drain_fd **dfds) +@@ -730,6 +750,17 @@ int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread + if (!ret) + ret = seccomp_dump_thread(pid, tc); + ++ /* ++ * We are dumping rseq() in the dump_thread_rseq() function, ++ * *before* processes gets infected (because of ptrace requests ++ * API restriction). At this point, if the kernel lacks ++ * kdat.has_ptrace_get_rseq_conf support we have to ensure ++ * that dumpable processes haven't initialized rseq() or ++ * fail dump if rseq() was used. ++ */ ++ if (!ret) ++ ret = check_thread_rseq(pid, &ti->rseq, !!tc->rseq_entry); ++ + return ret; + } + +@@ -1016,6 +1047,68 @@ static int dump_task_signals(pid_t pid, struct pstree_item *item) + return 0; + } + ++static int dump_thread_rseq(pid_t tid, RseqEntry **rseqep) ++{ ++ struct __ptrace_rseq_configuration rseq; ++ RseqEntry *rseqe = NULL; ++ int ret; ++ ++ /* ++ * If we are here it means that rseq() syscall is supported, ++ * but ptrace(PTRACE_GET_RSEQ_CONFIGURATION) isn't supported, ++ * we can just fail dump here. But this is bad idea, IMHO. ++ * ++ * So, we will try to detect if victim process was used rseq(). ++ * See check_rseq() and check_thread_rseq() functions. 
++ */ ++ if (!kdat.has_ptrace_get_rseq_conf) ++ return 0; ++ ++ ret = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, tid, sizeof(rseq), &rseq); ++ if (ret != sizeof(rseq)) { ++ pr_perror("ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) = %d", tid, ret); ++ return -1; ++ } ++ ++ if (rseq.flags != 0) { ++ pr_err("something wrong with ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) flags = 0x%x\n", tid, ++ rseq.flags); ++ return -1; ++ } ++ ++ pr_err("Dump rseq of %d: ptr = 0x%lx sign = 0x%x\n", tid, (unsigned long)rseq.rseq_abi_pointer, rseq.signature); ++ ++ rseqe = xmalloc(sizeof(*rseqe)); ++ if (!rseqe) ++ return -1; ++ ++ rseq_entry__init(rseqe); ++ ++ rseqe->rseq_abi_pointer = rseq.rseq_abi_pointer; ++ rseqe->rseq_abi_size = rseq.rseq_abi_size; ++ rseqe->signature = rseq.signature; ++ ++ *rseqep = rseqe; ++ ++ return 0; ++} ++ ++static int dump_task_rseq(pid_t pid, struct pstree_item *item) ++{ ++ int i; ++ ++ /* if rseq() syscall isn't supported then nothing to dump */ ++ if (!kdat.has_rseq) ++ return 0; ++ ++ for (i = 0; i < item->nr_threads; i++) { ++ if (dump_thread_rseq(item->threads[i].real, &item->core[i]->thread_core->rseq_entry)) ++ return -1; ++ } ++ ++ return 0; ++} ++ + static struct proc_pid_stat pps_buf; + + static int dump_task_threads(struct parasite_ctl *parasite_ctl, const struct pstree_item *item) +@@ -1304,6 +1397,12 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + goto err; + } + ++ ret = dump_task_rseq(pid, item); ++ if (ret) { ++ pr_err("Dump %d rseq failed %d\n", pid, ret); ++ goto err; ++ } ++ + parasite_ctl = parasite_infect_seized(pid, item, &vmas); + if (!parasite_ctl) { + pr_err("Can't infect (pid: %d) with parasite\n", pid); +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 5b645c1..b2bd044 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -2975,6 +2975,19 @@ static int prep_sched_info(struct rst_sched_param *sp, ThreadCoreEntry *tc) + return 0; + } + ++static int prep_rseq(struct rst_rseq_param 
*rseq, ThreadCoreEntry *tc) ++{ ++ /* compatibility with older CRIU versions */ ++ if (!tc->rseq_entry) ++ return 0; ++ ++ rseq->rseq_abi_pointer = tc->rseq_entry->rseq_abi_pointer; ++ rseq->rseq_abi_size = tc->rseq_entry->rseq_abi_size; ++ rseq->signature = tc->rseq_entry->signature; ++ ++ return 0; ++} ++ + static rlim_t decode_rlim(rlim_t ival) + { + return ival == -1 ? RLIM_INFINITY : ival; +@@ -3704,6 +3717,10 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns + thread_args[i].clear_tid_addr = CORE_THREAD_ARCH_INFO(tcore)->clear_tid_addr; + core_get_tls(tcore, &thread_args[i].tls); + ++ ret = prep_rseq(&thread_args[i].rseq, tcore->thread_core); ++ if (ret) ++ goto err; ++ + rst_reloc_creds(&thread_args[i], &creds_pos_next); + + thread_args[i].futex_rla = tcore->thread_core->futex_rla; +diff --git a/criu/include/linux/rseq.h b/criu/include/linux/rseq.h +new file mode 100644 +index 0000000..5c1706a +--- /dev/null ++++ b/criu/include/linux/rseq.h +@@ -0,0 +1,144 @@ ++/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ ++#ifndef _UAPI_LINUX_RSEQ_H ++#define _UAPI_LINUX_RSEQ_H ++ ++/* ++ * linux/rseq.h ++ * ++ * Restartable sequences system call API ++ * ++ * Copyright (c) 2015-2018 Mathieu Desnoyers ++ */ ++ ++#include ++#include ++ ++enum rseq_cpu_id_state { ++ RSEQ_CPU_ID_UNINITIALIZED = -1, ++ RSEQ_CPU_ID_REGISTRATION_FAILED = -2, ++}; ++ ++enum rseq_flags { ++ RSEQ_FLAG_UNREGISTER = (1 << 0), ++}; ++ ++enum rseq_cs_flags_bit { ++ RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0, ++ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1, ++ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2, ++}; ++ ++enum rseq_cs_flags { ++ RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT), ++ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT), ++ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT), ++}; ++ ++/* ++ * struct rseq_cs is aligned on 4 * 8 bytes to 
ensure it is always ++ * contained within a single cache-line. It is usually declared as ++ * link-time constant data. ++ */ ++struct rseq_cs { ++ /* Version of this structure. */ ++ __u32 version; ++ /* enum rseq_cs_flags */ ++ __u32 flags; ++ __u64 start_ip; ++ /* Offset from start_ip. */ ++ __u64 post_commit_offset; ++ __u64 abort_ip; ++} __attribute__((aligned(4 * sizeof(__u64)))); ++ ++/* ++ * struct rseq is aligned on 4 * 8 bytes to ensure it is always ++ * contained within a single cache-line. ++ * ++ * A single struct rseq per thread is allowed. ++ */ ++struct rseq { ++ /* ++ * Restartable sequences cpu_id_start field. Updated by the ++ * kernel. Read by user-space with single-copy atomicity ++ * semantics. This field should only be read by the thread which ++ * registered this data structure. Aligned on 32-bit. Always ++ * contains a value in the range of possible CPUs, although the ++ * value may not be the actual current CPU (e.g. if rseq is not ++ * initialized). This CPU number value should always be compared ++ * against the value of the cpu_id field before performing a rseq ++ * commit or returning a value read from a data structure indexed ++ * using the cpu_id_start value. ++ */ ++ __u32 cpu_id_start; ++ /* ++ * Restartable sequences cpu_id field. Updated by the kernel. ++ * Read by user-space with single-copy atomicity semantics. This ++ * field should only be read by the thread which registered this ++ * data structure. Aligned on 32-bit. Values ++ * RSEQ_CPU_ID_UNINITIALIZED and RSEQ_CPU_ID_REGISTRATION_FAILED ++ * have a special semantic: the former means "rseq uninitialized", ++ * and latter means "rseq initialization failed". 
This value is ++ * meant to be read within rseq critical sections and compared ++ * with the cpu_id_start value previously read, before performing ++ * the commit instruction, or read and compared with the ++ * cpu_id_start value before returning a value loaded from a data ++ * structure indexed using the cpu_id_start value. ++ */ ++ __u32 cpu_id; ++ /* ++ * Restartable sequences rseq_cs field. ++ * ++ * Contains NULL when no critical section is active for the current ++ * thread, or holds a pointer to the currently active struct rseq_cs. ++ * ++ * Updated by user-space, which sets the address of the currently ++ * active rseq_cs at the beginning of assembly instruction sequence ++ * block, and set to NULL by the kernel when it restarts an assembly ++ * instruction sequence block, as well as when the kernel detects that ++ * it is preempting or delivering a signal outside of the range ++ * targeted by the rseq_cs. Also needs to be set to NULL by user-space ++ * before reclaiming memory that contains the targeted struct rseq_cs. ++ * ++ * Read and set by the kernel. Set by user-space with single-copy ++ * atomicity semantics. This field should only be updated by the ++ * thread which registered this data structure. Aligned on 64-bit. ++ */ ++ union { ++ __u64 ptr64; ++#ifdef __LP64__ ++ __u64 ptr; ++#else ++ struct { ++#if (defined(__BYTE_ORDER) && (__BYTE_ORDER == __BIG_ENDIAN)) || defined(__BIG_ENDIAN) ++ __u32 padding; /* Initialized to zero. */ ++ __u32 ptr32; ++#else /* LITTLE */ ++ __u32 ptr32; ++ __u32 padding; /* Initialized to zero. */ ++#endif /* ENDIAN */ ++ } ptr; ++#endif ++ } rseq_cs; ++ ++ /* ++ * Restartable sequences flags field. ++ * ++ * This field should only be updated by the thread which ++ * registered this data structure. Read by the kernel. ++ * Mainly used for single-stepping through rseq critical sections ++ * with debuggers. 
++ * ++ * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT ++ * Inhibit instruction sequence block restart on preemption ++ * for this thread. ++ * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL ++ * Inhibit instruction sequence block restart on signal ++ * delivery for this thread. ++ * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE ++ * Inhibit instruction sequence block restart on migration for ++ * this thread. ++ */ ++ __u32 flags; ++} __attribute__((aligned(4 * sizeof(__u64)))); ++ ++#endif /* _UAPI_LINUX_RSEQ_H */ +diff --git a/criu/include/parasite.h b/criu/include/parasite.h +index 8107aa4..5fde809 100644 +--- a/criu/include/parasite.h ++++ b/criu/include/parasite.h +@@ -164,10 +164,17 @@ struct parasite_dump_creds { + unsigned int groups[0]; + }; + ++struct parasite_check_rseq { ++ bool has_rseq; ++ bool has_ptrace_get_rseq_conf; /* no need to check if supported */ ++ bool rseq_inited; ++}; ++ + struct parasite_dump_thread { + unsigned int *tid_addr; + pid_t tid; + tls_t tls; ++ struct parasite_check_rseq rseq; + stack_t sas; + int pdeath_sig; + char comm[TASK_COMM_LEN]; +diff --git a/criu/include/restorer.h b/criu/include/restorer.h +index c2ef8f0..c29d869 100644 +--- a/criu/include/restorer.h ++++ b/criu/include/restorer.h +@@ -45,6 +45,12 @@ struct rst_sched_param { + int prio; + }; + ++struct rst_rseq_param { ++ u64 rseq_abi_pointer; ++ u32 rseq_abi_size; ++ u32 signature; ++}; ++ + struct restore_posix_timer { + struct str_posix_timer spt; + struct itimerspec val; +@@ -99,6 +105,7 @@ struct thread_restore_args { + struct task_restore_args *ta; + + tls_t tls; ++ struct rst_rseq_param rseq; + + siginfo_t *siginfo; + unsigned int siginfo_n; +diff --git a/criu/kerndat.c b/criu/kerndat.c +index 4841387..af7113a 100644 +--- a/criu/kerndat.c ++++ b/criu/kerndat.c +@@ -837,7 +837,7 @@ static int kerndat_has_ptrace_get_rseq_conf(void) + { + pid_t pid; + int len; +- struct ptrace_rseq_configuration rseq; ++ struct __ptrace_rseq_configuration rseq; + + pid = fork_and_ptrace_attach(NULL); + if 
(pid < 0) +diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c +index 7175ade..ee4fa86 100644 +--- a/criu/parasite-syscall.c ++++ b/criu/parasite-syscall.c +@@ -132,6 +132,13 @@ static int alloc_groups_copy_creds(CredsEntry *ce, struct parasite_dump_creds *c + return ce->groups ? 0 : -ENOMEM; + } + ++static void init_parasite_rseq_arg(struct parasite_check_rseq *rseq) ++{ ++ rseq->has_rseq = kdat.has_rseq; ++ rseq->has_ptrace_get_rseq_conf = kdat.has_ptrace_get_rseq_conf; ++ rseq->rseq_inited = false; ++} ++ + int parasite_dump_thread_leader_seized(struct parasite_ctl *ctl, int pid, CoreEntry *core) + { + ThreadCoreEntry *tc = core->thread_core; +@@ -144,6 +151,8 @@ int parasite_dump_thread_leader_seized(struct parasite_ctl *ctl, int pid, CoreEn + pc = args->creds; + pc->cap_last_cap = kdat.last_cap; + ++ init_parasite_rseq_arg(&args->rseq); ++ + ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_THREAD, ctl); + if (ret < 0) + return ret; +@@ -197,6 +206,8 @@ int parasite_dump_thread_seized(struct parasite_thread_ctl *tctl, struct parasit + + compel_arch_get_tls_thread(tctl, &args->tls); + ++ init_parasite_rseq_arg(&args->rseq); ++ + ret = compel_run_in_thread(tctl, PARASITE_CMD_DUMP_THREAD); + if (ret) { + pr_err("Can't init thread in parasite %d\n", pid); +diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c +index bc0a33c..e49958b 100644 +--- a/criu/pie/parasite.c ++++ b/criu/pie/parasite.c +@@ -8,6 +8,8 @@ + #include + #include + ++#include "linux/rseq.h" ++ + #include "common/config.h" + #include "int.h" + #include "types.h" +@@ -167,6 +169,7 @@ static int dump_posix_timers(struct parasite_dump_posix_timers_args *args) + } + + static int dump_creds(struct parasite_dump_creds *args); ++static int check_rseq(struct parasite_check_rseq *rseq); + + static int dump_thread_common(struct parasite_dump_thread *ti) + { +@@ -197,6 +200,12 @@ static int dump_thread_common(struct parasite_dump_thread *ti) + goto out; + } + ++ ret = check_rseq(&ti->rseq); ++ if 
(ret) { ++ pr_err("Unable to check if rseq() is initialized: %d\n", ret); ++ goto out; ++ } ++ + ret = dump_creds(ti->creds); + out: + return ret; +@@ -313,6 +322,96 @@ grps_err: + return -1; + } + ++static int check_rseq(struct parasite_check_rseq *rseq) ++{ ++ int ret; ++ unsigned long rseq_abi_pointer; ++ unsigned long rseq_abi_size; ++ uint32_t rseq_signature; ++ void *addr; ++ ++ /* no need to do hacky check if we can get all info from ptrace() */ ++ if (!rseq->has_rseq || rseq->has_ptrace_get_rseq_conf) ++ return 0; ++ ++ /* ++ * We need to determine if victim process has rseq() ++ * initialized, but we have no *any* proper kernel interface ++ * supported at this point. ++ * Our plan: ++ * 1. We know that if we call rseq() syscall and process already ++ * has current->rseq filled, then we get: ++ * -EINVAL if current->rseq != rseq || rseq_len != sizeof(*rseq), ++ * -EPERM if current->rseq_sig != sig), ++ * -EBUSY if current->rseq == rseq && rseq_len == sizeof(*rseq) && ++ * current->rseq_sig != sig ++ * if current->rseq == NULL (rseq() wasn't used) then we go to: ++ * IS_ALIGNED(rseq ...) check, if we fail it we get -EINVAL and it ++ * will be hard to distinguish case when rseq() was initialized or not. ++ * Let's construct arguments payload ++ * with: ++ * 1. correct rseq_abi_size ++ * 2. aligned and correct rseq_abi_pointer ++ * And see what rseq() return to us. ++ * If ret value is: ++ * 0: it means that rseq *wasn't* used and we successfuly registered it, ++ * -EINVAL or : it means that rseq is already initialized, ++ * so we *have* to dump it. But as we have has_ptrace_get_rseq_conf = false, ++ * we should just fail dump as it's unsafe to skip rseq() dump for processes ++ * with rseq() initialized. 
++ * -EPERM or -EBUSY: should not happen as we take a fresh memory area for rseq ++ */ ++ addr = (void *)sys_mmap(NULL, sizeof(struct rseq), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); ++ if (addr == MAP_FAILED) { ++ pr_err("mmap() failed for struct rseq ret = %lx\n", (unsigned long)addr); ++ return -1; ++ } ++ ++ memset(addr, 0, sizeof(struct rseq)); ++ ++ /* sys_mmap returns page aligned addresses */ ++ rseq_abi_pointer = (unsigned long)addr; ++ rseq_abi_size = (unsigned long)sizeof(struct rseq); ++ /* it's not so important to have unique signature for us, ++ * because rseq_abi_pointer is guaranteed to be unique ++ */ ++ rseq_signature = 0x12345612; ++ ++ pr_info("\ttrying sys_rseq(%lx, %lx, %x, %x)\n", rseq_abi_pointer, rseq_abi_size, 0, rseq_signature); ++ ret = sys_rseq((void *)rseq_abi_pointer, rseq_abi_size, 0, rseq_signature); ++ if (ret) { ++ if (ret == -EINVAL) { ++ pr_info("\trseq is initialized in the victim\n"); ++ rseq->rseq_inited = true; ++ ++ ret = 0; ++ } else { ++ pr_err("\tunexpected failure of sys_rseq(%lx, %lx, %x, %x) = %d\n", rseq_abi_pointer, ++ rseq_abi_size, 0, rseq_signature, ret); ++ ++ ret = -1; ++ } ++ } else { ++ ret = sys_rseq((void *)rseq_abi_pointer, sizeof(struct rseq), RSEQ_FLAG_UNREGISTER, rseq_signature); ++ if (ret) { ++ pr_err("\tfailed to unregister sys_rseq(%lx, %lx, %x, %x) = %d\n", rseq_abi_pointer, ++ rseq_abi_size, RSEQ_FLAG_UNREGISTER, rseq_signature, ret); ++ ++ ret = -1; ++ goto out; ++ } ++ ++ pr_info("\tsys_rseq succeed, let's unregister it back... 
ok Error\n"); ++ pr_info("\trseq is non-initialized in the victim Error\n"); ++ rseq->rseq_inited = false; ++ ret = 0; ++ } ++ ++out: ++ sys_munmap(addr, sizeof(struct rseq)); ++ return ret; ++} ++ + static int fill_fds_fown(int fd, struct fd_opts *p) + { + int flags, ret; +diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c +index fbc89fe..368b5a0 100644 +--- a/criu/pie/restorer.c ++++ b/criu/pie/restorer.c +@@ -459,6 +459,27 @@ static int restore_cpu_affinity(struct task_restore_args *args) + return 0; + } + ++static int restore_rseq(struct rst_rseq_param *rseq) ++{ ++ int ret; ++ ++ if (!rseq->rseq_abi_pointer) { ++ pr_debug("rseq: nothing to restore\n"); ++ return 0; ++ } ++ ++ pr_debug("rseq: rseq_abi_pointer = %lx signature = %x\n", (unsigned long)rseq->rseq_abi_pointer, rseq->signature); ++ ++ ret = sys_rseq(decode_pointer(rseq->rseq_abi_pointer), rseq->rseq_abi_size, 0, rseq->signature); ++ if (ret) { ++ pr_err("failed sys_rseq(%lx, %lx, %x, %x) = %d\n", (unsigned long)rseq->rseq_abi_pointer, ++ (unsigned long)rseq->rseq_abi_size, 0, rseq->signature, ret); ++ return -1; ++ } ++ ++ return 0; ++} ++ + static int restore_seccomp_filter(pid_t tid, struct thread_restore_args *args) + { + unsigned int flags = args->seccomp_force_tsync ? 
SECCOMP_FILTER_FLAG_TSYNC : 0; +@@ -583,6 +604,9 @@ static int restore_thread_common(struct thread_restore_args *args) + + restore_tls(&args->tls); + ++ if (restore_rseq(&args->rseq)) ++ return -1; ++ + return 0; + } + +diff --git a/images/Makefile b/images/Makefile +index 2eaeb7c..004e22e 100644 +--- a/images/Makefile ++++ b/images/Makefile +@@ -71,6 +71,7 @@ proto-obj-y += img-streamer.o + proto-obj-y += bpfmap-file.o + proto-obj-y += bpfmap-data.o + proto-obj-y += apparmor.o ++proto-obj-y += rseq.o + + CFLAGS += -iquote $(obj)/ + +diff --git a/images/core.proto b/images/core.proto +index 39e7f32..b66230e 100644 +--- a/images/core.proto ++++ b/images/core.proto +@@ -14,6 +14,7 @@ import "timer.proto"; + import "creds.proto"; + import "sa.proto"; + import "siginfo.proto"; ++import "rseq.proto"; + + import "opts.proto"; + +@@ -106,6 +107,7 @@ message thread_core_entry { + optional string comm = 13; + optional uint64 blk_sigset_extended = 14; + required thread_allowedcpus_entry allowed_cpus = 15; ++ optional rseq_entry rseq_entry = 16; + } + + message task_rlimits_entry { +diff --git a/images/rseq.proto b/images/rseq.proto +new file mode 100644 +index 0000000..be28004 +--- /dev/null ++++ b/images/rseq.proto +@@ -0,0 +1,9 @@ ++// SPDX-License-Identifier: MIT ++ ++syntax = "proto2"; ++ ++message rseq_entry { ++ required uint64 rseq_abi_pointer = 1; ++ required uint32 rseq_abi_size = 2; ++ required uint32 signature = 3; ++} +-- +2.30.0 + diff --git a/0007-zdtm-add-simple-test-for-rseq-C-R.patch b/0007-zdtm-add-simple-test-for-rseq-C-R.patch new file mode 100644 index 0000000000000000000000000000000000000000..bc6de57773252c273da6f8482b1e2e5268bd6715 --- /dev/null +++ b/0007-zdtm-add-simple-test-for-rseq-C-R.patch @@ -0,0 +1,214 @@ +From 5005c08e32dc29dbf0b3a2a582e75d249c190d96 Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: Wed, 2 Mar 2022 14:54:28 +0800 +Subject: [PATCH 07/16] zdtm: add simple test for rseq C/R Signed-off-by: + Alexander Mikhalitsyn + +--- + 
test/zdtm/static/Makefile | 1 + + test/zdtm/static/rseq00.c | 174 +++++++++++++++++++++++ + test/zdtm/static/rseq00.desc | 1 + + 3 files changed, 176 insertions(+) + create mode 100644 test/zdtm/static/rseq00.c + create mode 100644 test/zdtm/static/rseq00.desc + +diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile +index 70123cf..563d947 100644 +--- a/test/zdtm/static/Makefile ++++ b/test/zdtm/static/Makefile +@@ -61,6 +61,7 @@ TST_NOFILE := \ + pthread02 \ + pthread_timers \ + pthread_timers_h \ ++ rseq00 \ + vdso00 \ + vdso01 \ + vdso02 \ +diff --git a/test/zdtm/static/rseq00.c b/test/zdtm/static/rseq00.c +new file mode 100644 +index 0000000..26f41a2 +--- /dev/null ++++ b/test/zdtm/static/rseq00.c +@@ -0,0 +1,174 @@ ++/* ++ * test for rseq() syscall ++ */ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "zdtmtst.h" ++ ++#if defined(__x86_64__) ++ ++const char *test_doc = "Check that rseq() basic C/R works"; ++const char *test_author = "Alexander Mikhalitsyn "; ++ ++/* some useful definitions from kernel uapi */ ++enum rseq_flags { ++ RSEQ_FLAG_UNREGISTER = (1 << 0), ++}; ++ ++struct rseq { ++ uint32_t cpu_id_start; ++ uint32_t cpu_id; ++ uint64_t rseq_cs; ++ uint32_t flags; ++} __attribute__((aligned(4 * sizeof(uint64_t)))); ++ ++#ifndef __NR_rseq ++#define __NR_rseq 334 ++#endif ++/* EOF */ ++ ++static __thread volatile struct rseq __rseq_abi; ++ ++#define RSEQ_SIG 0x53053053 ++ ++static int sys_rseq(volatile struct rseq *rseq_abi, uint32_t rseq_len, int flags, uint32_t sig) ++{ ++ return syscall(__NR_rseq, rseq_abi, rseq_len, flags, sig); ++} ++ ++static void register_thread(void) ++{ ++ int rc; ++ rc = sys_rseq(&__rseq_abi, sizeof(struct rseq), 0, RSEQ_SIG); ++ if (rc) { ++ fail("Failed to register rseq"); ++ exit(1); ++ } ++} ++ ++static void unregister_thread(void) ++{ ++ int rc; ++ rc = sys_rseq(&__rseq_abi, sizeof(struct rseq), 
RSEQ_FLAG_UNREGISTER, RSEQ_SIG); ++ if (rc) { ++ fail("Failed to unregister rseq"); ++ exit(1); ++ } ++} ++ ++static void check_thread(void) ++{ ++ int rc; ++ rc = sys_rseq(&__rseq_abi, sizeof(struct rseq), 0, RSEQ_SIG); ++ if (!(rc && errno == EBUSY)) { ++ fail("Failed to check rseq %d", rc); ++ exit(1); ++ } ++} ++ ++#define RSEQ_ACCESS_ONCE(x) (*(__volatile__ __typeof__(x) *)&(x)) ++ ++static int rseq_addv(intptr_t *v, intptr_t count, int cpu) ++{ ++ /* clang-format off */ ++ __asm__ __volatile__ goto( ++ ".pushsection __rseq_table, \"aw\"\n\t" ++ ".balign 32\n\t" ++ "cs_obj:\n\t" ++ /* version, flags */ ++ ".long 0, 0\n\t" ++ /* start_ip, post_commit_ip, abort_ip */ ++ ".quad 1f, (2f-1f), 4f\n\t" ++ ".popsection\n\t" ++ "1:\n\t" ++ "leaq cs_obj(%%rip), %%rax\n\t" ++ "movq %%rax, %[rseq_cs]\n\t" ++ "cmpl %[cpu_id], %[current_cpu_id]\n\t" ++ "jnz 4f\n\t" ++ "addq %[count], %[v]\n\t" /* final store */ ++ "2:\n\t" ++ ".pushsection __rseq_failure, \"ax\"\n\t" ++ /* Disassembler-friendly signature: nopl (%rip). 
*/ ++ ".byte 0x0f, 0x1f, 0x05\n\t" ++ ".long 0x53053053\n\t" /* RSEQ_FLAGS */ ++ "4:\n\t" ++ "jmp abort\n\t" ++ ".popsection\n\t" ++ : /* gcc asm goto does not allow outputs */ ++ : [cpu_id] "r" (cpu), ++ [current_cpu_id] "m" (__rseq_abi.cpu_id), ++ [rseq_cs] "m" (__rseq_abi.rseq_cs), ++ /* final store input */ ++ [v] "m" (*v), ++ [count] "er" (count) ++ : "memory", "cc", "rax" ++ : abort ++ ); ++ /* clang-format on */ ++ ++ return 0; ++abort: ++ return -1; ++} ++ ++int main(int argc, char *argv[]) ++{ ++ int cpu, ret; ++ intptr_t *cpu_data; ++ long nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); ++ ++ test_init(argc, argv); ++ ++ cpu_data = calloc(nr_cpus, sizeof(*cpu_data)); ++ if (!cpu_data) { ++ fail("calloc"); ++ exit(EXIT_FAILURE); ++ } ++ ++ register_thread(); ++ ++ test_daemon(); ++ test_waitsig(); ++ ++ check_thread(); ++ ++ cpu = RSEQ_ACCESS_ONCE(__rseq_abi.cpu_id_start); ++ ret = rseq_addv(&cpu_data[cpu], 2, cpu); ++ if (ret) ++ fail("Failed to increment per-cpu counter"); ++ else ++ test_msg("cpu_data[%d] == %ld\n", cpu, (long int)cpu_data[cpu]); ++ ++ if (cpu_data[cpu] == 2) ++ pass(); ++ else ++ fail(); ++ ++ return 0; ++} ++ ++#else ++ ++int main(int argc, char *argv[]) ++{ ++ test_init(argc, argv); ++ skip("Unsupported arch"); ++ return 0; ++} ++ ++#endif +\ No newline at end of file +diff --git a/test/zdtm/static/rseq00.desc b/test/zdtm/static/rseq00.desc +new file mode 100644 +index 0000000..0324fa3 +--- /dev/null ++++ b/test/zdtm/static/rseq00.desc +@@ -0,0 +1 @@ ++{'flavor': 'h', 'arch': 'x86_64', 'feature': 'get_rseq_conf'} +-- +2.30.0 + diff --git a/0008-ci-add-Fedora-Rawhide-based-test-on-Cirrus.patch b/0008-ci-add-Fedora-Rawhide-based-test-on-Cirrus.patch new file mode 100644 index 0000000000000000000000000000000000000000..dfe58541c3d4eaad4c25cbd6e9aeb80d3845ba74 --- /dev/null +++ b/0008-ci-add-Fedora-Rawhide-based-test-on-Cirrus.patch @@ -0,0 +1,122 @@ +From 56fad25776a652e143175a22676a1f909476c880 Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: 
Wed, 2 Mar 2022 14:57:16 +0800 +Subject: [PATCH 08/16] ci: add Fedora Rawhide based test on Cirrus We have + ability to use nested virtualization on Cirrus, and already have "Vagrant + Fedora based test (no VDSO)" test, let's do analogical for Fedora Rawhide to + get fresh kernel. + +Suggested-by: Adrian Reber +Signed-off-by: Alexander Mikhalitsyn +--- + .cirrus.yml | 21 +++++++++++++++++++++ + scripts/ci/Makefile | 7 +++++-- + scripts/ci/run-ci-tests.sh | 5 +++++ + scripts/ci/vagrant.sh | 21 +++++++++++++++++++++ + 4 files changed, 52 insertions(+), 2 deletions(-) + +diff --git a/.cirrus.yml b/.cirrus.yml +index 671178d..9716e58 100644 +--- a/.cirrus.yml ++++ b/.cirrus.yml +@@ -19,6 +19,27 @@ task: + build_script: | + make -C scripts/ci vagrant-fedora-no-vdso + ++task: ++ name: Vagrant Fedora Rawhide based test ++ environment: ++ HOME: "/root" ++ CIRRUS_WORKING_DIR: "/tmp/criu" ++ ++ compute_engine_instance: ++ image_project: cirrus-images ++ image: family/docker-kvm ++ platform: linux ++ cpu: 4 ++ memory: 16G ++ nested_virtualization: true ++ ++ setup_script: | ++ scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker ++ sudo kvm-ok ++ ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto ++ build_script: | ++ make -C scripts/ci vagrant-fedora-rawhide ++ + task: + name: CentOS 8 based test + environment: +diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile +index 02b4d87..9c9264d 100644 +--- a/scripts/ci/Makefile ++++ b/scripts/ci/Makefile +@@ -41,7 +41,7 @@ export CONTAINER_TERMINAL + ifeq ($(UNAME),x86_64) + # On anything besides x86_64 Travis is running unprivileged LXD + # containers which do not support running docker with '--privileged'. 
+- CONTAINER_OPTS := --rm $(CONTAINER_TERMINAL) --privileged -v /lib/modules:/lib/modules --tmpfs /run ++ CONTAINER_OPTS := --rm $(CONTAINER_TERMINAL) --privileged --userns=host --cgroupns=host -v /lib/modules:/lib/modules --tmpfs /run + else + CONTAINER_OPTS := --rm -v /lib/modules:/lib/modules --tmpfs /run + endif +@@ -92,7 +92,10 @@ setup-vagrant: + vagrant-fedora-no-vdso: setup-vagrant + ./vagrant.sh fedora-no-vdso + +-.PHONY: setup-vagrant vagrant-fedora-no-vdso ++vagrant-fedora-rawhide: setup-vagrant ++ ./vagrant.sh fedora-rawhide ++ ++.PHONY: setup-vagrant vagrant-fedora-no-vdso vagrant-fedora-rawhide + + %: + $(MAKE) -C ../build $@$(target-suffix) +diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh +index 7c66e68..95b4ec7 100755 +--- a/scripts/ci/run-ci-tests.sh ++++ b/scripts/ci/run-ci-tests.sh +@@ -194,6 +194,11 @@ if [ "${STREAM_TEST}" = "1" ]; then + exit 0 + fi + ++# print some useful debug info ++cat /proc/self/status ++ls -la /proc/self/ns ++cat /proc/self/cgroup ++ + # shellcheck disable=SC2086 + ./test/zdtm.py run -a -p 2 --keep-going $ZDTM_OPTS + +diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh +index 839b100..f961b8d 100755 +--- a/scripts/ci/vagrant.sh ++++ b/scripts/ci/vagrant.sh +@@ -58,4 +58,25 @@ fedora-no-vdso() { + ssh default 'cd /vagrant/criu/test; sudo ./zdtm.py run -t zdtm/transition/pidfd_store_sk --rpc --pre 2' + } + ++fedora-rawhide() { ++ #ssh default sudo grubby --update-kernel ALL --args="selinux=0 systemd.unified_cgroup_hierarchy=0" ++ ssh default sudo grubby --update-kernel ALL ++ # ++ # Workaround the problem: ++ # error running container: error from /usr/bin/crun creating container for [...]: sd-bus call: Transport endpoint is not connected ++ # Let's just use runc instead of crun ++ # ++ ssh default 'sudo dnf remove -y crun || true' ++ ssh default sudo dnf install -y podman runc ++ vagrant reload ++ #ssh default sudo setenforce 0 ++ ssh default cat /proc/cmdline ++ ssh default ls -la /proc/self/ns 
++ ssh default sudo cat /proc/self/status ++ ssh default sudo cat /proc/self/cgroup ++ #ssh default sudo capsh --print ++ ssh default 'cd /vagrant; tar xf criu.tar; cd criu; sudo -E make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined"' ++} ++ + $1 +-- +2.30.0 + diff --git a/0009-include-add-thread_pointer.h-from-Glibc.patch b/0009-include-add-thread_pointer.h-from-Glibc.patch new file mode 100644 index 0000000000000000000000000000000000000000..bf462d48f38322829473482f3aace93db11bc7dc --- /dev/null +++ b/0009-include-add-thread_pointer.h-from-Glibc.patch @@ -0,0 +1,235 @@ +From 99da2f789ca92aa52eeca07b97aee2cbd3d60fca Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: Wed, 2 Mar 2022 15:00:07 +0800 +Subject: [PATCH 09/16] include: add thread_pointer.h from Glibc Implementation + was taken from the Glibc. + +Signed-off-by: Alexander Mikhalitsyn +--- + .../arch/aarch64/include/asm/thread_pointer.h | 27 ++++++++++++++ + .../arch/arm/include/asm/thread_pointer.h | 27 ++++++++++++++ + .../arch/mips/include/asm/thread_pointer.h | 27 ++++++++++++++ + .../arch/ppc64/include/asm/thread_pointer.h | 33 +++++++++++++++++ + .../arch/s390/include/asm/thread_pointer.h | 27 ++++++++++++++ + .../arch/x86/include/asm/thread_pointer.h | 37 +++++++++++++++++++ + 6 files changed, 178 insertions(+) + create mode 100644 criu/arch/aarch64/include/asm/thread_pointer.h + create mode 100644 criu/arch/arm/include/asm/thread_pointer.h + create mode 100644 criu/arch/mips/include/asm/thread_pointer.h + create mode 100644 criu/arch/ppc64/include/asm/thread_pointer.h + create mode 100644 criu/arch/s390/include/asm/thread_pointer.h + create mode 100644 criu/arch/x86/include/asm/thread_pointer.h + +diff --git a/criu/arch/aarch64/include/asm/thread_pointer.h b/criu/arch/aarch64/include/asm/thread_pointer.h +new file mode 100644 +index 0000000..f7e0706 +--- /dev/null ++++ b/criu/arch/aarch64/include/asm/thread_pointer.h +@@ -0,0 +1,27 @@ ++/* 
__thread_pointer definition. Generic version. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library.*/ ++ ++#ifndef _SYS_THREAD_POINTER_H ++#define _SYS_THREAD_POINTER_H ++ ++static inline void *__criu_thread_pointer(void) ++{ ++ return __builtin_thread_pointer(); ++} ++ ++#endif /* _SYS_THREAD_POINTER_H */ +diff --git a/criu/arch/arm/include/asm/thread_pointer.h b/criu/arch/arm/include/asm/thread_pointer.h +new file mode 100644 +index 0000000..f7e0706 +--- /dev/null ++++ b/criu/arch/arm/include/asm/thread_pointer.h +@@ -0,0 +1,27 @@ ++/* __thread_pointer definition. Generic version. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. 
++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library.*/ ++ ++#ifndef _SYS_THREAD_POINTER_H ++#define _SYS_THREAD_POINTER_H ++ ++static inline void *__criu_thread_pointer(void) ++{ ++ return __builtin_thread_pointer(); ++} ++ ++#endif /* _SYS_THREAD_POINTER_H */ +diff --git a/criu/arch/mips/include/asm/thread_pointer.h b/criu/arch/mips/include/asm/thread_pointer.h +new file mode 100644 +index 0000000..f7e0706 +--- /dev/null ++++ b/criu/arch/mips/include/asm/thread_pointer.h +@@ -0,0 +1,27 @@ ++/* __thread_pointer definition. Generic version. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library.*/ ++ ++#ifndef _SYS_THREAD_POINTER_H ++#define _SYS_THREAD_POINTER_H ++ ++static inline void *__criu_thread_pointer(void) ++{ ++ return __builtin_thread_pointer(); ++} ++ ++#endif /* _SYS_THREAD_POINTER_H */ +diff --git a/criu/arch/ppc64/include/asm/thread_pointer.h b/criu/arch/ppc64/include/asm/thread_pointer.h +new file mode 100644 +index 0000000..304516f +--- /dev/null ++++ b/criu/arch/ppc64/include/asm/thread_pointer.h +@@ -0,0 +1,33 @@ ++/* __thread_pointer definition. powerpc version. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library.*/ ++ ++#ifndef _SYS_THREAD_POINTER_H ++#define _SYS_THREAD_POINTER_H ++ ++#ifdef __powerpc64__ ++register void *__thread_register asm("r13"); ++#else ++register void *__thread_register asm("r2"); ++#endif ++ ++static inline void *__criu_thread_pointer(void) ++{ ++ return __thread_register; ++} ++ ++#endif /* _SYS_THREAD_POINTER_H */ +\ No newline at end of file +diff --git a/criu/arch/s390/include/asm/thread_pointer.h b/criu/arch/s390/include/asm/thread_pointer.h +new file mode 100644 +index 0000000..f7e0706 +--- /dev/null ++++ b/criu/arch/s390/include/asm/thread_pointer.h +@@ -0,0 +1,27 @@ ++/* __thread_pointer definition. Generic version. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. 
++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library.*/ ++ ++#ifndef _SYS_THREAD_POINTER_H ++#define _SYS_THREAD_POINTER_H ++ ++static inline void *__criu_thread_pointer(void) ++{ ++ return __builtin_thread_pointer(); ++} ++ ++#endif /* _SYS_THREAD_POINTER_H */ +diff --git a/criu/arch/x86/include/asm/thread_pointer.h b/criu/arch/x86/include/asm/thread_pointer.h +new file mode 100644 +index 0000000..08603ae +--- /dev/null ++++ b/criu/arch/x86/include/asm/thread_pointer.h +@@ -0,0 +1,37 @@ ++/* __thread_pointer definition. x86 version. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. 
++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library.*/ ++ ++#ifndef _SYS_THREAD_POINTER_H ++#define _SYS_THREAD_POINTER_H ++ ++static inline void *__criu_thread_pointer(void) ++{ ++#if __GNUC_PREREQ(11, 1) ++ return __builtin_thread_pointer(); ++#else ++ void *__result; ++#ifdef __x86_64__ ++ __asm__("mov %%fs:0, %0" : "=r"(__result)); ++#else ++ __asm__("mov %%gs:0, %0" : "=r"(__result)); ++#endif ++ return __result; ++#endif /* !GCC 11 */ ++} ++ ++#endif /* _SYS_THREAD_POINTER_H */ +\ No newline at end of file +-- +2.30.0 + diff --git a/0010-clone-noasan-unregister-rseq-at-the-thread-start-for.patch b/0010-clone-noasan-unregister-rseq-at-the-thread-start-for.patch new file mode 100644 index 0000000000000000000000000000000000000000..ca084df6d4f3cfec4f2cb47e360e33e52894fcee --- /dev/null +++ b/0010-clone-noasan-unregister-rseq-at-the-thread-start-for.patch @@ -0,0 +1,94 @@ +From d43ad9913c19afa6d80cb8124015d47361152db8 Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: Wed, 2 Mar 2022 15:00:43 +0800 +Subject: [PATCH 10/16] clone-noasan: unregister rseq at the thread start for + new glibc Fresh glibc does rseq registration by default during + start_thread(). + +This cause process crashes during memory restore procedure, because +memory which corresponds to the struct rseq will be overwritten. + +Signed-off-by: Alexander Mikhalitsyn +--- + criu/clone-noasan.c | 42 +++++++++++++++++++++++++++++++-- + 1 file changed, 40 insertions(+), 2 deletions(-) + +diff --git a/criu/clone-noasan.c b/criu/clone-noasan.c +index d657ea2..5f8dd1b 100644 +--- a/criu/clone-noasan.c ++++ b/criu/clone-noasan.c +@@ -2,6 +2,13 @@ + #include + #include + ++#ifdef __has_include ++#if __has_include ("sys/rseq.h") ++#include ++#include "asm/thread_pointer.h" ++#endif ++#endif ++ + #include + + #include "sched.h" +@@ -34,16 +41,45 @@ + * ... wait for process to finish ... 
+ * unlock_last_pid + */ ++ ++#if defined(RSEQ_SIG) ++static inline void unregister_glibc_rseq(void) ++{ ++ /* unregister rseq */ ++ syscall(__NR_rseq, (void *)((char *)__criu_thread_pointer() + __rseq_offset), __rseq_size, 1, RSEQ_SIG); ++} ++#else ++static inline void unregister_glibc_rseq(void) ++{ ++} ++#endif ++ ++struct call_fn_args { ++ int (*fn)(void *); ++ void *arg; ++}; ++ ++int call_fn(void *arg) ++{ ++ struct call_fn_args *cargs = arg; ++ unregister_glibc_rseq(); ++ return cargs->fn(cargs->arg); ++} ++ + int clone_noasan(int (*fn)(void *), int flags, void *arg) + { + void *stack_ptr = (void *)round_down((unsigned long)&stack_ptr - 1024, 16); ++ struct call_fn_args a = { ++ .fn = fn, ++ .arg = arg, ++ }; + + BUG_ON((flags & CLONE_VM) && !(flags & CLONE_VFORK)); + /* + * Reserve some bytes for clone() internal needs + * and use as stack the address above this area. + */ +- return clone(fn, stack_ptr, flags, arg); ++ return clone(call_fn, stack_ptr, flags, (void *)&a); + } + + int clone3_with_pid_noasan(int (*fn)(void *), void *arg, int flags, int exit_signal, pid_t pid) +@@ -78,7 +114,9 @@ int clone3_with_pid_noasan(int (*fn)(void *), void *arg, int flags, int exit_sig + c_args.set_tid = ptr_to_u64(&pid); + c_args.set_tid_size = 1; + pid = syscall(__NR_clone3, &c_args, sizeof(c_args)); +- if (pid == 0) ++ if (pid == 0) { ++ unregister_glibc_rseq(); + exit(fn(arg)); ++ } + return pid; + } +-- +2.30.0 + diff --git a/0011-zdtm-static-rseq00-fix-rseq-test-when-linking-with-a.patch b/0011-zdtm-static-rseq00-fix-rseq-test-when-linking-with-a.patch new file mode 100644 index 0000000000000000000000000000000000000000..f4502398bdca586b316840019b2a7308727889cc --- /dev/null +++ b/0011-zdtm-static-rseq00-fix-rseq-test-when-linking-with-a.patch @@ -0,0 +1,157 @@ +From 4f4d5acc34046954aea9e8ea10b5f71ff5f0fbd5 Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: Wed, 2 Mar 2022 15:01:34 +0800 +Subject: [PATCH 11/16] zdtm/static/rseq00: fix rseq test when linking with a + 
fresh Glibc Fresh Glibc does rseq() register by default. We need to + unregister rseq before registering our own. + +Signed-off-by: Alexander Mikhalitsyn +--- + test/zdtm/static/rseq00.c | 76 ++++++++++++++++++++------- + 1 file changed, 58 insertions(+), 18 deletions(-) + +diff --git a/test/zdtm/static/rseq00.c b/test/zdtm/static/rseq00.c +index 26f41a2..87053b8 100644 +--- a/test/zdtm/static/rseq00.c ++++ b/test/zdtm/static/rseq00.c +@@ -19,13 +19,48 @@ + + #include "zdtmtst.h" + +-#if defined(__x86_64__) ++#ifdef __has_include ++#if __has_include("sys/rseq.h") ++#include ++#endif ++#endif ++ ++#if defined(__i386__) || defined(__x86_64__) ++ ++#if defined(RSEQ_SIG) ++static inline void *__criu_thread_pointer(void) ++{ ++#if __GNUC_PREREQ(11, 1) ++ return __builtin_thread_pointer(); ++#else ++ void *__result; ++#ifdef __x86_64__ ++ __asm__("mov %%fs:0, %0" : "=r"(__result)); ++#else ++ __asm__("mov %%gs:0, %0" : "=r"(__result)); ++#endif ++ return __result; ++#endif /* !GCC 11 */ ++} ++ ++static inline void unregister_glibc_rseq(void) ++{ ++ /* unregister rseq */ ++ syscall(__NR_rseq, (void *)((char *)__criu_thread_pointer() + __rseq_offset), __rseq_size, 1, RSEQ_SIG); ++} ++#else ++static inline void unregister_glibc_rseq(void) ++{ ++} ++#endif + + const char *test_doc = "Check that rseq() basic C/R works"; + const char *test_author = "Alexander Mikhalitsyn "; + + /* some useful definitions from kernel uapi */ ++#ifndef RSEQ_SIG ++ + enum rseq_flags { + RSEQ_FLAG_UNREGISTER = (1 << 0), + }; +@@ -37,14 +72,21 @@ struct rseq { + uint32_t flags; + } __attribute__((aligned(4 * sizeof(uint64_t)))); + ++#define RSEQ_SIG 0x53053053 ++ ++#endif ++ + #ifndef __NR_rseq + #define __NR_rseq 334 + #endif + /* EOF */ + +-static __thread volatile struct rseq __rseq_abi; ++#define RSEQ_TLS_ALLOC 0 + +-#define RSEQ_SIG 0x53053053 ++static volatile struct rseq *rseq_ptr; ++#if RSEQ_TLS_ALLOC ++static __thread volatile struct rseq __rseq_abi; ++#endif + + static int 
sys_rseq(volatile struct rseq *rseq_abi, uint32_t rseq_len, int flags, uint32_t sig) + { +@@ -54,27 +96,18 @@ static int sys_rseq(volatile struct rseq *rseq_abi, uint32_t rseq_len, int flags + static void register_thread(void) + { + int rc; +- rc = sys_rseq(&__rseq_abi, sizeof(struct rseq), 0, RSEQ_SIG); ++ unregister_glibc_rseq(); ++ rc = sys_rseq(rseq_ptr, sizeof(struct rseq), 0, RSEQ_SIG); + if (rc) { + fail("Failed to register rseq"); + exit(1); + } + } + +-static void unregister_thread(void) +-{ +- int rc; +- rc = sys_rseq(&__rseq_abi, sizeof(struct rseq), RSEQ_FLAG_UNREGISTER, RSEQ_SIG); +- if (rc) { +- fail("Failed to unregister rseq"); +- exit(1); +- } +-} +- + static void check_thread(void) + { + int rc; +- rc = sys_rseq(&__rseq_abi, sizeof(struct rseq), 0, RSEQ_SIG); ++ rc = sys_rseq(rseq_ptr, sizeof(struct rseq), 0, RSEQ_SIG); + if (!(rc && errno == EBUSY)) { + fail("Failed to check rseq %d", rc); + exit(1); +@@ -111,8 +144,8 @@ static int rseq_addv(intptr_t *v, intptr_t count, int cpu) + ".popsection\n\t" + : /* gcc asm goto does not allow outputs */ + : [cpu_id] "r" (cpu), +- [current_cpu_id] "m" (__rseq_abi.cpu_id), +- [rseq_cs] "m" (__rseq_abi.rseq_cs), ++ [current_cpu_id] "m" (rseq_ptr->cpu_id), ++ [rseq_cs] "m" (rseq_ptr->rseq_cs), + /* final store input */ + [v] "m" (*v), + [count] "er" (count) +@@ -132,6 +165,13 @@ int main(int argc, char *argv[]) + intptr_t *cpu_data; + long nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + ++#if RSEQ_TLS_ALLOC ++ rseq_ptr = &__rseq_abi; ++#else ++ //rseq_ptr = malloc(sizeof(struct rseq)); ++ rseq_ptr = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, 0, 0); ++#endif ++ + test_init(argc, argv); + + cpu_data = calloc(nr_cpus, sizeof(*cpu_data)); +@@ -147,7 +187,7 @@ int main(int argc, char *argv[]) + + check_thread(); + +- cpu = RSEQ_ACCESS_ONCE(__rseq_abi.cpu_id_start); ++ cpu = RSEQ_ACCESS_ONCE(rseq_ptr->cpu_id_start); + ret = rseq_addv(&cpu_data[cpu], 2, cpu); + if (ret) + fail("Failed to 
increment per-cpu counter"); +-- +2.30.0 + diff --git a/0012-compel-add-helpers-to-get-set-instruction-pointer.patch b/0012-compel-add-helpers-to-get-set-instruction-pointer.patch new file mode 100644 index 0000000000000000000000000000000000000000..33acd47dde4265510ced29559200ada1b26505b1 --- /dev/null +++ b/0012-compel-add-helpers-to-get-set-instruction-pointer.patch @@ -0,0 +1,265 @@ +From 06cb51057ce1cc31b79c6321273dfa0b4cb7f980 Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: Wed, 2 Mar 2022 15:02:08 +0800 +Subject: [PATCH 12/16] compel: add helpers to get/set instruction pointer + Signed-off-by: Alexander Mikhalitsyn + +--- + .../src/lib/include/uapi/asm/infect-types.h | 9 +++++---- + .../src/lib/include/uapi/asm/infect-types.h | 9 +++++---- + .../src/lib/include/uapi/asm/infect-types.h | 9 +++++---- + .../src/lib/include/uapi/asm/infect-types.h | 9 +++++---- + .../src/lib/include/uapi/asm/infect-types.h | 7 ++++--- + .../src/lib/include/uapi/asm/infect-types.h | 9 +++++---- + compel/include/uapi/infect.h | 6 ++++++ + compel/src/lib/infect.c | 20 +++++++++++++++++++ + .../criu/arch/aarch64/include/asm/types.h | 2 ++ + criu/arch/arm/include/asm/types.h | 2 ++ + .../criu/arch/mips/include/asm/types.h | 2 ++ + .../criu/arch/ppc64/include/asm/types.h | 2 ++ + .../criu/arch/s390/include/asm/types.h | 2 ++ + criu/arch/x86/include/asm/types.h | 2 ++ + 14 files changed, 67 insertions(+), 23 deletions(-) + +diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h +index f91e73d..9d4ce7e 100644 +--- a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h ++++ b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h +@@ -23,10 +23,11 @@ typedef struct user_fpsimd_state user_fpregs_struct_t; + #define compel_arch_get_tls_task(ctl, tls) + #define compel_arch_get_tls_thread(tctl, tls) + +-#define REG_RES(r) ((uint64_t)(r).regs[0]) +-#define REG_IP(r) ((uint64_t)(r).pc) +-#define REG_SP(r) 
((uint64_t)((r).sp)) +-#define REG_SYSCALL_NR(r) ((uint64_t)(r).regs[8]) ++#define REG_RES(r) ((uint64_t)(r).regs[0]) ++#define REG_IP(r) ((uint64_t)(r).pc) ++#define SET_REG_IP(r, val) ((r).pc = (val)) ++#define REG_SP(r) ((uint64_t)((r).sp)) ++#define REG_SYSCALL_NR(r) ((uint64_t)(r).regs[8]) + + #define user_regs_native(pregs) true + +diff --git a/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h b/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h +index 159b6a9..8d32825 100644 +--- a/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h ++++ b/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h +@@ -56,10 +56,11 @@ struct user_vfp_exc { + unsigned long fpinst2; + }; + +-#define REG_RES(regs) ((regs).ARM_r0) +-#define REG_IP(regs) ((regs).ARM_pc) +-#define REG_SP(regs) ((regs).ARM_sp) +-#define REG_SYSCALL_NR(regs) ((regs).ARM_r7) ++#define REG_RES(regs) ((regs).ARM_r0) ++#define REG_IP(regs) ((regs).ARM_pc) ++#define SET_REG_IP(regs, val) ((regs).ARM_pc = (val)) ++#define REG_SP(regs) ((regs).ARM_sp) ++#define REG_SYSCALL_NR(regs) ((regs).ARM_r7) + + #define user_regs_native(pregs) true + +diff --git a/compel/arch/mips/src/lib/include/uapi/asm/infect-types.h b/compel/arch/mips/src/lib/include/uapi/asm/infect-types.h +index 70b3f85..481566a 100644 +--- a/compel/arch/mips/src/lib/include/uapi/asm/infect-types.h ++++ b/compel/arch/mips/src/lib/include/uapi/asm/infect-types.h +@@ -56,10 +56,11 @@ static inline bool user_regs_native(user_regs_struct_t *pregs) + #define compel_arch_get_tls_task(ctl, tls) + #define compel_arch_get_tls_thread(tctl, tls) + +-#define REG_RES(regs) ((regs).MIPS_v0) +-#define REG_IP(regs) ((regs).cp0_epc) +-#define REG_SP(regs) ((regs).MIPS_sp) +-#define REG_SYSCALL_NR(regs) ((regs).MIPS_v0) ++#define REG_RES(regs) ((regs).MIPS_v0) ++#define REG_IP(regs) ((regs).cp0_epc) ++#define SET_REG_IP(regs, val) ((regs).cp0_epc = (val)) ++#define REG_SP(regs) ((regs).MIPS_sp) ++#define REG_SYSCALL_NR(regs) ((regs).MIPS_v0) + + 
//#define __NR(syscall, compat) ((compat) ? __NR32_##syscall : __NR_##syscall) + #define __NR(syscall, compat) __NR_##syscall +diff --git a/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h +index fe6192e..bf2cc95 100644 +--- a/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h ++++ b/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h +@@ -72,10 +72,11 @@ typedef struct { + } tm; + } user_fpregs_struct_t; + +-#define REG_RES(regs) ((uint64_t)(regs).gpr[3]) +-#define REG_IP(regs) ((uint64_t)(regs).nip) +-#define REG_SP(regs) ((uint64_t)(regs).gpr[1]) +-#define REG_SYSCALL_NR(regs) ((uint64_t)(regs).gpr[0]) ++#define REG_RES(regs) ((uint64_t)(regs).gpr[3]) ++#define REG_IP(regs) ((uint64_t)(regs).nip) ++#define SET_REG_IP(regs, val) ((regs).nip = (val)) ++#define REG_SP(regs) ((uint64_t)(regs).gpr[1]) ++#define REG_SYSCALL_NR(regs) ((uint64_t)(regs).gpr[0]) + + #define user_regs_native(pregs) true + +diff --git a/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h b/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h +index 896d70e..87283bc 100644 +--- a/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h ++++ b/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h +@@ -62,9 +62,10 @@ typedef struct { + uint32_t system_call; + } user_regs_struct_t; + +-#define REG_RES(r) ((uint64_t)(r).prstatus.gprs[2]) +-#define REG_IP(r) ((uint64_t)(r).prstatus.psw.addr) +-#define REG_SP(r) ((uint64_t)(r).prstatus.gprs[15]) ++#define REG_RES(r) ((uint64_t)(r).prstatus.gprs[2]) ++#define REG_IP(r) ((uint64_t)(r).prstatus.psw.addr) ++#define SET_REG_IP(r, val) ((r).prstatus.psw.addr = (val)) ++#define REG_SP(r) ((uint64_t)(r).prstatus.gprs[15]) + /* + * We assume that REG_SYSCALL_NR() is only used for pie code where we + * always use svc 0 with opcode in %r1. 
+diff --git a/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h b/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h +index 34b3ad0..b35504f 100644 +--- a/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h ++++ b/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h +@@ -127,10 +127,11 @@ typedef struct { + + typedef struct xsave_struct user_fpregs_struct_t; + +-#define REG_RES(regs) get_user_reg(®s, ax) +-#define REG_IP(regs) get_user_reg(®s, ip) +-#define REG_SP(regs) get_user_reg(®s, sp) +-#define REG_SYSCALL_NR(regs) get_user_reg(®s, orig_ax) ++#define REG_RES(regs) get_user_reg(®s, ax) ++#define REG_IP(regs) get_user_reg(®s, ip) ++#define SET_REG_IP(regs, val) set_user_reg(®s, ip, val) ++#define REG_SP(regs) get_user_reg(®s, sp) ++#define REG_SYSCALL_NR(regs) get_user_reg(®s, orig_ax) + + #define __NR(syscall, compat) ((compat) ? __NR32_##syscall : __NR_##syscall) + +diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h +index c3d2ee6..389878e 100644 +--- a/compel/include/uapi/infect.h ++++ b/compel/include/uapi/infect.h +@@ -168,4 +168,10 @@ extern unsigned long compel_task_size(void); + extern uint64_t compel_get_leader_sp(struct parasite_ctl *ctl); + extern uint64_t compel_get_thread_sp(struct parasite_thread_ctl *tctl); + ++extern uint64_t compel_get_leader_ip(struct parasite_ctl *ctl); ++extern uint64_t compel_get_thread_ip(struct parasite_thread_ctl *tctl); ++ ++void compel_set_leader_ip(struct parasite_ctl *ctl, uint64_t v); ++void compel_set_thread_ip(struct parasite_thread_ctl *tctl, uint64_t v); ++ + #endif +diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c +index 0fb9e71..6a13cc1 100644 +--- a/compel/src/lib/infect.c ++++ b/compel/src/lib/infect.c +@@ -1686,3 +1686,23 @@ uint64_t compel_get_thread_sp(struct parasite_thread_ctl *tctl) + { + return REG_SP(tctl->th.regs); + } ++ ++uint64_t compel_get_leader_ip(struct parasite_ctl *ctl) ++{ ++ return REG_IP(ctl->orig.regs); ++} ++ ++uint64_t 
compel_get_thread_ip(struct parasite_thread_ctl *tctl) ++{ ++ return REG_IP(tctl->th.regs); ++} ++ ++void compel_set_leader_ip(struct parasite_ctl *ctl, uint64_t v) ++{ ++ SET_REG_IP(ctl->orig.regs, v); ++} ++ ++void compel_set_thread_ip(struct parasite_thread_ctl *tctl, uint64_t v) ++{ ++ SET_REG_IP(tctl->th.regs, v); ++} +diff --git a/criu/arch/aarch64/include/asm/types.h b/criu/arch/aarch64/include/asm/types.h +index c860af1..363c1ca 100644 +--- a/criu/arch/aarch64/include/asm/types.h ++++ b/criu/arch/aarch64/include/asm/types.h +@@ -22,6 +22,8 @@ typedef UserAarch64RegsEntry UserRegsEntry; + + #define TI_SP(core) ((core)->ti_aarch64->gpregs->sp) + ++#define TI_IP(core) ((core)->ti_aarch64->gpregs->pc) ++ + static inline void *decode_pointer(uint64_t v) + { + return (void *)v; +diff --git a/criu/arch/arm/include/asm/types.h b/criu/arch/arm/include/asm/types.h +index cfcb8a1..93d2dc2 100644 +--- a/criu/arch/arm/include/asm/types.h ++++ b/criu/arch/arm/include/asm/types.h +@@ -21,6 +21,8 @@ typedef UserArmRegsEntry UserRegsEntry; + + #define TI_SP(core) ((core)->ti_arm->gpregs->sp) + ++#define TI_IP(core) ((core)->ti_arm->gpregs->ip) ++ + static inline void *decode_pointer(u64 v) + { + return (void *)(u32)v; +diff --git a/criu/arch/mips/include/asm/types.h b/criu/arch/mips/include/asm/types.h +index 237471f..2c75b6a 100644 +--- a/criu/arch/mips/include/asm/types.h ++++ b/criu/arch/mips/include/asm/types.h +@@ -18,6 +18,8 @@ + + #define CORE_THREAD_ARCH_INFO(core) core->ti_mips + ++#define TI_IP(core) ((core)->ti_mips->gpregs->cp0_epc) ++ + typedef UserMipsRegsEntry UserRegsEntry; + + static inline u64 encode_pointer(void *p) +diff --git a/criu/arch/ppc64/include/asm/types.h b/criu/arch/ppc64/include/asm/types.h +index fedeff2..d60aadd 100644 +--- a/criu/arch/ppc64/include/asm/types.h ++++ b/criu/arch/ppc64/include/asm/types.h +@@ -19,6 +19,8 @@ typedef UserPpc64RegsEntry UserRegsEntry; + + #define CORE_THREAD_ARCH_INFO(core) core->ti_ppc64 + ++#define TI_IP(core) 
((core)->ti_ppc64->gpregs->nip) ++ + static inline void *decode_pointer(uint64_t v) + { + return (void *)v; +diff --git a/criu/arch/s390/include/asm/types.h b/criu/arch/s390/include/asm/types.h +index 7522cf2..abf12de 100644 +--- a/criu/arch/s390/include/asm/types.h ++++ b/criu/arch/s390/include/asm/types.h +@@ -19,6 +19,8 @@ typedef UserS390RegsEntry UserRegsEntry; + + #define CORE_THREAD_ARCH_INFO(core) core->ti_s390 + ++#define TI_IP(core) ((core)->ti_s390->gpregs->psw_addr) ++ + static inline u64 encode_pointer(void *p) + { + return (u64)p; +diff --git a/criu/arch/x86/include/asm/types.h b/criu/arch/x86/include/asm/types.h +index a0a8ed9..8919d0a 100644 +--- a/criu/arch/x86/include/asm/types.h ++++ b/criu/arch/x86/include/asm/types.h +@@ -28,6 +28,8 @@ static inline int core_is_compat(CoreEntry *c) + + #define CORE_THREAD_ARCH_INFO(core) core->thread_info + ++#define TI_IP(core) ((core)->thread_info->gpregs->ip) ++ + typedef UserX86RegsEntry UserRegsEntry; + + static inline u64 encode_pointer(void *p) +-- +2.30.0 + diff --git a/0013-cr-dump-fixup-thread-IP-when-inside-rseq-cs.patch b/0013-cr-dump-fixup-thread-IP-when-inside-rseq-cs.patch new file mode 100644 index 0000000000000000000000000000000000000000..cd8ef176ddd77e4a13fee010fd690983e0973c71 --- /dev/null +++ b/0013-cr-dump-fixup-thread-IP-when-inside-rseq-cs.patch @@ -0,0 +1,248 @@ +From 33abfc12b973560b3d98afdbac7554b8c0542c3d Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: Wed, 2 Mar 2022 15:04:54 +0800 +Subject: [PATCH 13/16] cr-dump: fixup thread IP when inside rseq cs + Signed-off-by: Alexander Mikhalitsyn + +--- + criu/cr-dump.c | 155 +++++++++++++++++++++++++++- + criu/include/parasite.h | 2 + + criu/include/pstree.h | 1 + + 3 files changed, 154 insertions(+), 4 deletions(-) + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index 91dd08a..a3f8973 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -1047,11 +1047,58 @@ static int dump_task_signals(pid_t pid, struct pstree_item *item) + return 0; + } 
+ +-static int dump_thread_rseq(pid_t tid, RseqEntry **rseqep) ++static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseq, struct rseq_cs *rseq_cs) ++{ ++ int ret; ++ uint64_t addr; ++ ++ /* rseq is not registered */ ++ if (!rseq->rseq_abi_pointer) ++ return 0; ++ ++ /* ++ * We need to cover the case when victim process was inside rseq critical section ++ * at the moment when CRIU comes and seized it. We need to determine the borders ++ * of rseq critical section at first. To achieve that we need to access thread ++ * memory and read pointer to struct rseq_cs. ++ * ++ * We have two ways to access thread memory: from the parasite and using ptrace(). ++ * But it this case we can't use parasite, because if victim process returns to the ++ * execution, on the kernel side __rseq_handle_notify_resume hook will be called, ++ * then rseq_ip_fixup() -> clear_rseq_cs() and user space memory with struct rseq ++ * will be cleared. So, let's use ptrace(PTRACE_PEEKDATA). ++ */ ++ ret = ptrace_peek_area(tid, &addr, decode_pointer(rseq->rseq_abi_pointer + offsetof(struct rseq, rseq_cs)), ++ sizeof(uint64_t)); ++ if (ret) { ++ pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq_cs addr\n", tid, (unsigned long)&addr, ++ (unsigned long)(rseq->rseq_abi_pointer + offsetof(struct rseq, rseq_cs)), sizeof(uint64_t)); ++ return -1; ++ } ++ ++ /* (struct rseq)->rseq_cs is NULL */ ++ if (!addr) ++ return 0; ++ ++ ret = ptrace_peek_area(tid, rseq_cs, decode_pointer(addr), sizeof(struct rseq_cs)); ++ if (ret) { ++ pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq_cs struct\n", tid, ++ (unsigned long)rseq_cs, (unsigned long)addr, sizeof(struct rseq_cs)); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static int dump_thread_rseq(struct pstree_item *item, int i) + { + struct __ptrace_rseq_configuration rseq; + RseqEntry *rseqe = NULL; + int ret; ++ CoreEntry *core = item->core[i]; ++ RseqEntry **rseqep = &core->thread_core->rseq_entry; ++ struct rseq_cs 
*rseq_cs = &dmpi(item)->thread_rseq_cs[i]; ++ pid_t tid = item->threads[i].real; + + /* + * If we are here it means that rseq() syscall is supported, +@@ -1076,7 +1123,8 @@ static int dump_thread_rseq(pid_t tid, RseqEntry **rseqep) + return -1; + } + +- pr_err("Dump rseq of %d: ptr = 0x%lx sign = 0x%x\n", tid, (unsigned long)rseq.rseq_abi_pointer, rseq.signature); ++ pr_info("Dump rseq of %d: ptr = 0x%lx sign = 0x%x\n", tid, (unsigned long)rseq.rseq_abi_pointer, ++ rseq.signature); + + rseqe = xmalloc(sizeof(*rseqe)); + if (!rseqe) +@@ -1088,25 +1136,118 @@ static int dump_thread_rseq(pid_t tid, RseqEntry **rseqep) + rseqe->rseq_abi_size = rseq.rseq_abi_size; + rseqe->signature = rseq.signature; + ++ if (read_rseq_cs(tid, &rseq, rseq_cs)) ++ goto err; ++ ++ /* save rseq entry to the image */ + *rseqep = rseqe; + + return 0; ++ ++err: ++ xfree(rseqe); ++ return -1; + } + + static int dump_task_rseq(pid_t pid, struct pstree_item *item) + { + int i; ++ struct rseq_cs *thread_rseq_cs; + + /* if rseq() syscall isn't supported then nothing to dump */ + if (!kdat.has_rseq) + return 0; + ++ thread_rseq_cs = xzalloc(sizeof(*thread_rseq_cs) * item->nr_threads); ++ if (!thread_rseq_cs) ++ return -1; ++ ++ dmpi(item)->thread_rseq_cs = thread_rseq_cs; ++ + for (i = 0; i < item->nr_threads; i++) { +- if (dump_thread_rseq(item->threads[i].real, &item->core[i]->thread_core->rseq_entry)) +- return -1; ++ if (dump_thread_rseq(item, i)) ++ goto free_rseq; + } + + return 0; ++ ++free_rseq: ++ xfree(thread_rseq_cs); ++ dmpi(item)->thread_rseq_cs = NULL; ++ return -1; ++} ++ ++static bool task_in_rseq(struct rseq_cs *rseq_cs, uint64_t addr) ++{ ++ return addr >= rseq_cs->start_ip && addr < rseq_cs->start_ip + rseq_cs->post_commit_offset; ++} ++ ++static int fixup_thread_rseq(struct pstree_item *item, int i) ++{ ++ CoreEntry *core = item->core[i]; ++ struct rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; ++ pid_t tid = item->threads[i].real; ++ ++ /* (struct rseq)->rseq_cs is NULL */ 
++ if (!rseq_cs->start_ip) ++ return 0; ++ ++ pr_info("fixup_thread_rseq for %d: rseq_cs start_ip = %llx abort_ip = %llx post_commit_offset = %llx flags = %x version = %x; IP = %lx\n", ++ tid, rseq_cs->start_ip, rseq_cs->abort_ip, rseq_cs->post_commit_offset, rseq_cs->flags, ++ rseq_cs->version, (unsigned long)TI_IP(core)); ++ ++ if (rseq_cs->version != 0) { ++ pr_err("unsupported RSEQ ABI version = %d\n", rseq_cs->version); ++ return -1; ++ } ++ ++ if (task_in_rseq(rseq_cs, TI_IP(core))) { ++ struct pid *tid = &item->threads[i]; ++ ++ pr_info("The %d task is in rseq critical section. IP will be set to rseq abort handler addr\n", ++ tid->real); ++ ++ /* ++ * We need to fixup task instruction pointer from ++ * the original one (which lays inside rseq critical section) ++ * to rseq abort handler address. ++ * ++ * It's worth to mention that we need to fixup IP in CoreEntry ++ * (used when full dump/restore is performed) and also in ++ * the parasite regs storage (used if --leave-running option is used, ++ * or if dump error occured and process execution is resumed). 
++ */ ++ TI_IP(core) = rseq_cs->abort_ip; ++ ++ if (item->pid->real == tid->real) { ++ compel_set_leader_ip(dmpi(item)->parasite_ctl, rseq_cs->abort_ip); ++ } else { ++ compel_set_thread_ip(dmpi(item)->thread_ctls[i], rseq_cs->abort_ip); ++ } ++ } ++ ++ return 0; ++} ++ ++static int fixup_task_rseq(pid_t pid, struct pstree_item *item) ++{ ++ int ret = 0; ++ int i; ++ ++ if (!kdat.has_ptrace_get_rseq_conf) ++ return 0; ++ ++ for (i = 0; i < item->nr_threads; i++) { ++ if (fixup_thread_rseq(item, i)) { ++ ret = -1; ++ goto exit; ++ } ++ } ++ ++exit: ++ xfree(dmpi(item)->thread_rseq_cs); ++ dmpi(item)->thread_rseq_cs = NULL; ++ return ret; + } + + static struct proc_pid_stat pps_buf; +@@ -1409,6 +1550,12 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + goto err; + } + ++ ret = fixup_task_rseq(pid, item); ++ if (ret) { ++ pr_err("Fixup rseq for %d failed %d\n", pid, ret); ++ goto err; ++ } ++ + if (fault_injected(FI_DUMP_EARLY)) { + pr_info("fault: CRIU sudden detach\n"); + kill(getpid(), SIGKILL); +diff --git a/criu/include/parasite.h b/criu/include/parasite.h +index 5fde809..d2a0688 100644 +--- a/criu/include/parasite.h ++++ b/criu/include/parasite.h +@@ -10,6 +10,8 @@ + #include + #include + ++#include "linux/rseq.h" ++ + #include "image.h" + #include "util-pie.h" + #include "common/lock.h" +diff --git a/criu/include/pstree.h b/criu/include/pstree.h +index c5b0fa7..458e5f9 100644 +--- a/criu/include/pstree.h ++++ b/criu/include/pstree.h +@@ -63,6 +63,7 @@ struct dmp_info { + struct parasite_ctl *parasite_ctl; + struct parasite_thread_ctl **thread_ctls; + uint64_t *thread_sp; ++ struct rseq_cs *thread_rseq_cs; + + /* + * Although we don't support dumping different struct creds in general, +-- +2.30.0 + diff --git a/0014-zdtm-add-rseq-transition-test-for-amd64.patch b/0014-zdtm-add-rseq-transition-test-for-amd64.patch new file mode 100644 index 0000000000000000000000000000000000000000..ae9cb9b36d7d03e4b08fe0ae1beff82fb8b974e3 --- 
/dev/null +++ b/0014-zdtm-add-rseq-transition-test-for-amd64.patch @@ -0,0 +1,248 @@ +From f76aa4ade354649e3291b5e7274c368740b05417 Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: Wed, 2 Mar 2022 15:05:34 +0800 +Subject: [PATCH 14/16] zdtm: add rseq transition test for amd64 Signed-off-by: + Alexander Mikhalitsyn + +--- + test/zdtm/transition/Makefile | 1 + + test/zdtm/transition/rseq01.c | 208 +++++++++++++++++++ + test/zdtm/transition/rseq01.desc | 1 + + 3 files changed, 210 insertions(+) + create mode 100644 test/zdtm/transition/rseq01.c + create mode 100644 test/zdtm/transition/rseq01.desc + +diff --git a/test/zdtm/transition/Makefile b/test/zdtm/transition/Makefile +index 9388157..fae4e27 100644 +--- a/test/zdtm/transition/Makefile ++++ b/test/zdtm/transition/Makefile +@@ -23,6 +23,7 @@ TST_NOFILE = \ + lazy-thp \ + pid_reuse \ + pidfd_store_sk \ ++ rseq01 \ + + + TST_FILE = \ +diff --git a/test/zdtm/transition/rseq01.c b/test/zdtm/transition/rseq01.c +new file mode 100644 +index 0000000..5fac5a6 +--- /dev/null ++++ b/test/zdtm/transition/rseq01.c +@@ -0,0 +1,208 @@ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "zdtmtst.h" ++ ++#ifdef __has_include ++# if __has_include ("sys/rseq.h") ++# include ++# endif ++#endif ++ ++#if defined(__x86_64__) ++ ++#if defined(__x86_64__) && defined(RSEQ_SIG) ++static inline void *thread_pointer(void) ++{ ++ void *result; ++ asm("mov %%fs:0, %0" : "=r"(result)); ++ return result; ++} ++ ++static inline void unregister_old_rseq(void) ++{ ++ /* unregister rseq */ ++ syscall(__NR_rseq, (void *)((char *)thread_pointer() + __rseq_offset), __rseq_size, 1, RSEQ_SIG); ++} ++#else ++static inline void unregister_old_rseq(void) ++{ ++} ++#endif ++ ++const char *test_doc = "rseq() transition test"; ++const char *test_author = "Alexander Mikhalitsyn "; ++ ++/* some useful definitions from kernel uapi */ ++#ifndef RSEQ_SIG ++ ++enum 
rseq_flags { ++ RSEQ_FLAG_UNREGISTER = (1 << 0), ++}; ++ ++struct rseq { ++ uint32_t cpu_id_start; ++ uint32_t cpu_id; ++ uint64_t rseq_cs; ++ uint32_t flags; ++} __attribute__((aligned(4 * sizeof(uint64_t)))); ++ ++#define RSEQ_SIG 0x53053053 ++ ++#endif ++ ++#ifndef __NR_rseq ++#define __NR_rseq 334 ++#endif ++/* EOF */ ++ ++static volatile struct rseq *rseq_ptr; ++static __thread volatile struct rseq __rseq_abi; ++ ++static int sys_rseq(volatile struct rseq *rseq_abi, uint32_t rseq_len, int flags, uint32_t sig) ++{ ++ return syscall(__NR_rseq, rseq_abi, rseq_len, flags, sig); ++} ++ ++static void register_thread(void) ++{ ++ int rc; ++ unregister_old_rseq(); ++ rc = sys_rseq(rseq_ptr, sizeof(struct rseq), 0, RSEQ_SIG); ++ if (rc) { ++ fail("Failed to register rseq"); ++ exit(1); ++ } ++} ++ ++static void check_thread(void) ++{ ++ int rc; ++ rc = sys_rseq(rseq_ptr, sizeof(struct rseq), 0, RSEQ_SIG); ++ if (!(rc && errno == EBUSY)) { ++ fail("Failed to check rseq %d", rc); ++ exit(1); ++ } ++} ++ ++#define RSEQ_ACCESS_ONCE(x) (*(__volatile__ __typeof__(x) *)&(x)) ++ ++static int rseq_addv(intptr_t *v, intptr_t count, int cpu) ++{ ++ double a = 10000000000000000.0; ++ double b = -1; ++ /*test_msg("enter %f %f\n", a, b);*/ ++ ++ /* clang-format off */ ++ __asm__ __volatile__ goto( ++ ".pushsection __rseq_table, \"aw\"\n\t" ++ ".balign 32\n\t" ++ "cs_obj:\n\t" ++ /* version, flags */ ++ ".long 0, 0\n\t" ++ /* start_ip, post_commit_offset, abort_ip */ ++ ".quad 1f, (2f-1f), 4f\n\t" ++ ".popsection\n\t" ++ "1:\n\t" ++ "leaq cs_obj(%%rip), %%rax\n\t" ++ "movq %%rax, %[rseq_cs]\n\t" ++ "cmpl %[cpu_id], %[current_cpu_id]\n\t" ++ "jnz 4f\n\t" ++ "addq %[count], %[v]\n\t" /* final store */ ++ "mov $10000000, %%rcx\n\t" ++ "fldl %[x]\n\t" /* we have st clobbered */ ++ "5:\n\t" ++ "fsqrt\n\t" /* heavy instruction */ ++ "dec %%rcx\n\t" ++ "jnz 5b\n\t" ++ "fstpl %[y]\n\t" ++ "2:\n\t" ++ ".pushsection __rseq_failure, \"ax\"\n\t" ++ /* Disassembler-friendly signature: nopl 
(%rip). */ ++ ".byte 0x0f, 0xb9, 0x3d\n\t" ++ ".long 0x53053053\n\t" /* RSEQ_FLAGS */ ++ "4:\n\t" ++ /*"fstpl %[y]\n\t"*/ ++ "jmp %l[abort]\n\t" ++ /*"jmp 1b\n\t"*/ ++ ".popsection\n\t" ++ : /* gcc asm goto does not allow outputs */ ++ : [cpu_id] "r" (cpu), ++ [current_cpu_id] "m" (rseq_ptr->cpu_id), ++ [rseq_cs] "m" (rseq_ptr->rseq_cs), ++ /* final store input */ ++ [v] "m" (*v), ++ [count] "er" (count), ++ [x] "m" (a), ++ [y] "m" (b) ++ : "memory", "cc", "rax", "rcx", "st" ++ : abort ++ ); ++ /* clang-format on */ ++ /*test_msg("exit %f %f\n", a, b);*/ ++ return 0; ++abort: ++ /*test_msg("abort %f %f\n", a, b);*/ ++ return -1; ++} ++ ++int main(int argc, char *argv[]) ++{ ++ int cpu = 0; ++ int ret; ++ intptr_t *cpu_data; ++ long nr_cpus; ++ ++ rseq_ptr = &__rseq_abi; ++ memset((void *)rseq_ptr, 0, sizeof(struct rseq)); ++ ++ test_init(argc, argv); ++ nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); ++ ++ cpu_data = calloc(nr_cpus, sizeof(*cpu_data)); ++ if (!cpu_data) { ++ fail("calloc"); ++ exit(EXIT_FAILURE); ++ } ++ register_thread(); ++ ++ test_daemon(); ++ ++ while (test_go()) { ++ cpu = RSEQ_ACCESS_ONCE(rseq_ptr->cpu_id_start); ++ ret = rseq_addv(&cpu_data[cpu], 2, cpu); ++ if (ret) ++ fail("Failed to increment per-cpu counter"); ++ } ++ ++ test_waitsig(); ++ ++ check_thread(); ++ pass(); ++ ++ return 0; ++} ++ ++#else ++ ++int main(int argc, char *argv[]) ++{ ++ test_init(argc, argv); ++ skip("Unsupported arch"); ++ return 0; ++} ++ ++#endif +diff --git a/test/zdtm/transition/rseq01.desc b/test/zdtm/transition/rseq01.desc +new file mode 100644 +index 0000000..0324fa3 +--- /dev/null ++++ b/test/zdtm/transition/rseq01.desc +@@ -0,0 +1 @@ ++{'flavor': 'h', 'arch': 'x86_64', 'feature': 'get_rseq_conf'} +-- +2.30.0 + diff --git a/0015-cr-dump-handle-rseq-flags-field.patch b/0015-cr-dump-handle-rseq-flags-field.patch new file mode 100644 index 0000000000000000000000000000000000000000..d54477411aa7b36382bcc340eece9441cc69abed --- /dev/null +++ 
b/0015-cr-dump-handle-rseq-flags-field.patch @@ -0,0 +1,330 @@ +From deac94521c373c13add63eaf88118187ea3c2cb2 Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: Wed, 2 Mar 2022 15:09:44 +0800 +Subject: [PATCH 15/16] cr-dump: handle rseq flags field Userspace may + configure rseq critical section by def + +Signed-off-by: Alexander Mikhalitsyn +--- + criu/cr-dump.c | 86 +++++++++++++++++++------------ + criu/cr-restore.c | 63 ++++++++++++++++++++++ + criu/include/pstree.h | 1 + + images/rseq.proto | 1 + + 4 files changed, 119 insertions(+), 32 deletions(-) + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index a3f8973..79387fb 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -1047,13 +1047,13 @@ static int dump_task_signals(pid_t pid, struct pstree_item *item) + return 0; + } + +-static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseq, struct rseq_cs *rseq_cs) ++static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseqc, ++ struct rseq_cs *rseq_cs, struct rseq *rseq) + { + int ret; +- uint64_t addr; + + /* rseq is not registered */ +- if (!rseq->rseq_abi_pointer) ++ if (!rseqc->rseq_abi_pointer) + return 0; + + /* +@@ -1068,22 +1068,21 @@ static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseq, str + * then rseq_ip_fixup() -> clear_rseq_cs() and user space memory with struct rseq + * will be cleared. So, let's use ptrace(PTRACE_PEEKDATA). 
+ */ +- ret = ptrace_peek_area(tid, &addr, decode_pointer(rseq->rseq_abi_pointer + offsetof(struct rseq, rseq_cs)), +- sizeof(uint64_t)); ++ ret = ptrace_peek_area(tid, rseq, decode_pointer(rseqc->rseq_abi_pointer), ++ sizeof(struct rseq)); + if (ret) { +- pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq_cs addr\n", tid, (unsigned long)&addr, +- (unsigned long)(rseq->rseq_abi_pointer + offsetof(struct rseq, rseq_cs)), sizeof(uint64_t)); ++ pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq struct\n", tid, (unsigned long)rseq, ++ (unsigned long)(rseqc->rseq_abi_pointer), sizeof(uint64_t)); + return -1; + } + +- /* (struct rseq)->rseq_cs is NULL */ +- if (!addr) ++ if (!rseq->rseq_cs.ptr64) + return 0; + +- ret = ptrace_peek_area(tid, rseq_cs, decode_pointer(addr), sizeof(struct rseq_cs)); ++ ret = ptrace_peek_area(tid, rseq_cs, decode_pointer(rseq->rseq_cs.ptr64), sizeof(struct rseq_cs)); + if (ret) { + pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq_cs struct\n", tid, +- (unsigned long)rseq_cs, (unsigned long)addr, sizeof(struct rseq_cs)); ++ (unsigned long)rseq_cs, (unsigned long)rseq->rseq_cs.ptr64, sizeof(struct rseq_cs)); + return -1; + } + +@@ -1092,11 +1091,12 @@ static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseq, str + + static int dump_thread_rseq(struct pstree_item *item, int i) + { +- struct __ptrace_rseq_configuration rseq; ++ struct __ptrace_rseq_configuration rseqc; + RseqEntry *rseqe = NULL; + int ret; + CoreEntry *core = item->core[i]; + RseqEntry **rseqep = &core->thread_core->rseq_entry; ++ struct rseq rseq; + struct rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; + pid_t tid = item->threads[i].real; + +@@ -1111,20 +1111,20 @@ static int dump_thread_rseq(struct pstree_item *item, int i) + if (!kdat.has_ptrace_get_rseq_conf) + return 0; + +- ret = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, tid, sizeof(rseq), &rseq); +- if (ret != sizeof(rseq)) { ++ ret = 
ptrace(PTRACE_GET_RSEQ_CONFIGURATION, tid, sizeof(rseqc), &rseqc); ++ if (ret != sizeof(rseqc)) { + pr_perror("ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) = %d", tid, ret); + return -1; + } + +- if (rseq.flags != 0) { ++ if (rseqc.flags != 0) { + pr_err("something wrong with ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) flags = 0x%x\n", tid, +- rseq.flags); ++ rseqc.flags); + return -1; + } + +- pr_info("Dump rseq of %d: ptr = 0x%lx sign = 0x%x\n", tid, (unsigned long)rseq.rseq_abi_pointer, +- rseq.signature); ++ pr_info("Dump rseq of %d: ptr = 0x%lx sign = 0x%x\n", tid, (unsigned long)rseqc.rseq_abi_pointer, ++ rseqc.signature); + + rseqe = xmalloc(sizeof(*rseqe)); + if (!rseqe) +@@ -1132,13 +1132,22 @@ static int dump_thread_rseq(struct pstree_item *item, int i) + + rseq_entry__init(rseqe); + +- rseqe->rseq_abi_pointer = rseq.rseq_abi_pointer; +- rseqe->rseq_abi_size = rseq.rseq_abi_size; +- rseqe->signature = rseq.signature; ++ rseqe->rseq_abi_pointer = rseqc.rseq_abi_pointer; ++ rseqe->rseq_abi_size = rseqc.rseq_abi_size; ++ rseqe->signature = rseqc.signature; + +- if (read_rseq_cs(tid, &rseq, rseq_cs)) ++ if (read_rseq_cs(tid, &rseqc, rseq_cs, &rseq)) + goto err; + ++ rseqe->has_rseq_cs_pointer = true; ++ rseqe->rseq_cs_pointer = rseq.rseq_cs.ptr64; ++ pr_err("cs pointer %lx\n", rseqe->rseq_cs_pointer); ++ /* we won't save rseq_cs to the image (only pointer), ++ * so let's combine flags from both struct rseq and struct rseq_cs ++ * (kernel does the same when interpreting RSEQ_CS_FLAG_*) ++ */ ++ rseq_cs->flags |= rseq.flags; ++ + /* save rseq entry to the image */ + *rseqep = rseqe; + +@@ -1188,11 +1197,11 @@ static int fixup_thread_rseq(struct pstree_item *item, int i) + struct rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; + pid_t tid = item->threads[i].real; + +- /* (struct rseq)->rseq_cs is NULL */ ++ /* equivalent to (struct rseq)->rseq_cs is NULL */ + if (!rseq_cs->start_ip) + return 0; + +- pr_info("fixup_thread_rseq for %d: rseq_cs start_ip = %llx 
abort_ip = %llx post_commit_offset = %llx flags = %x version = %x; IP = %lx\n", ++ pr_debug("fixup_thread_rseq for %d: rseq_cs start_ip = %llx abort_ip = %llx post_commit_offset = %llx flags = %x version = %x; IP = %lx\n", + tid, rseq_cs->start_ip, rseq_cs->abort_ip, rseq_cs->post_commit_offset, rseq_cs->flags, + rseq_cs->version, (unsigned long)TI_IP(core)); + +@@ -1204,25 +1213,38 @@ static int fixup_thread_rseq(struct pstree_item *item, int i) + if (task_in_rseq(rseq_cs, TI_IP(core))) { + struct pid *tid = &item->threads[i]; + +- pr_info("The %d task is in rseq critical section. IP will be set to rseq abort handler addr\n", +- tid->real); +- + /* + * We need to fixup task instruction pointer from + * the original one (which lays inside rseq critical section) +- * to rseq abort handler address. ++ * to rseq abort handler address. But we need to look on rseq_cs->flags ++ * (please refer to struct rseq -> flags field description). ++ * Naive idea of flags support may be like... let's change instruction pointer (IP) ++ * to rseq_cs->abort_ip if !(rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL). ++ * But unfortunately, it doesn't work properly, because the kernel does ++ * clean up of rseq_cs field in the struct rseq (modifies userspace memory). ++ * So, we need to preserve original value of (struct rseq)->rseq_cs field in the ++ * image and restore it's value before releasing threads. + * + * It's worth to mention that we need to fixup IP in CoreEntry + * (used when full dump/restore is performed) and also in + * the parasite regs storage (used if --leave-running option is used, + * or if dump error occured and process execution is resumed). + */ +- TI_IP(core) = rseq_cs->abort_ip; + +- if (item->pid->real == tid->real) { +- compel_set_leader_ip(dmpi(item)->parasite_ctl, rseq_cs->abort_ip); ++ if (rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) { ++ pr_err("The %d task is in rseq critical section.!!! 
IP will be set to rseq abort handler addr\n", ++ tid->real); + } else { +- compel_set_thread_ip(dmpi(item)->thread_ctls[i], rseq_cs->abort_ip); ++ pr_warn("The %d task is in rseq critical section. IP will be set to rseq abort handler addr\n", ++ tid->real); ++ ++ TI_IP(core) = rseq_cs->abort_ip; ++ ++ if (item->pid->real == tid->real) { ++ compel_set_leader_ip(dmpi(item)->parasite_ctl, rseq_cs->abort_ip); ++ } else { ++ compel_set_thread_ip(dmpi(item)->thread_ctls[i], rseq_cs->abort_ip); ++ } + } + } + +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index b2bd044..864140f 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -23,6 +23,7 @@ + #include "common/compiler.h" + + #include "linux/mount.h" ++#include "linux/rseq.h" + + #include "clone-noasan.h" + #include "cr_options.h" +@@ -779,6 +780,7 @@ static int open_cores(int pid, CoreEntry *leader_core) + { + int i, tpid; + CoreEntry **cores = NULL; ++ //RseqEntry *rseqs; + + cores = xmalloc(sizeof(*cores) * current->nr_threads); + if (!cores) +@@ -812,6 +814,19 @@ static int open_cores(int pid, CoreEntry *leader_core) + } + } + ++ ++ pr_err("item %lx\n", (uint64_t)current); ++ ++ for (i = 0; i < current->nr_threads; i++) { ++ ThreadCoreEntry *tc = cores[i]->thread_core; ++ ++ /* compatibility with older CRIU versions */ ++ if (!tc->rseq_entry) ++ continue; ++ ++ current->rseqe[i] = *tc->rseq_entry; ++ } ++ + return 0; + err: + xfree(cores); +@@ -868,8 +883,15 @@ static int restore_one_alive_task(int pid, CoreEntry *core) + { + unsigned args_len; + struct task_restore_args *ta; ++ RseqEntry *rseqs; + pr_info("Restoring resources\n"); + ++ rseqs = shmalloc(sizeof(*rseqs) * current->nr_threads); ++ if (!rseqs) ++ return -1; ++ ++ current->rseqe = rseqs; ++ + rst_mem_switch_to_private(); + + args_len = round_up(sizeof(*ta) + sizeof(struct thread_restore_args) * current->nr_threads, page_size()); +@@ -1966,6 +1988,44 @@ static int attach_to_tasks(bool root_seized) + return 0; + } + ++static int 
restore_rseq_cs(void) ++{ ++ struct pstree_item *item; ++ ++ for_each_pstree_item(item) { ++ int i; ++ ++ if (!task_alive(item)) ++ continue; ++ ++ if (item->nr_threads == 1) { ++ item->threads[0].real = item->pid->real; ++ } else { ++ if (parse_threads(item->pid->real, &item->threads, &item->nr_threads)) ++ return -1; ++ } ++ ++ for (i = 0; i < item->nr_threads; i++) { ++ pid_t pid = item->threads[i].real; ++ ++ if (!item->rseqe[i].rseq_cs_pointer || !item->rseqe[i].rseq_abi_pointer) { ++ pr_err("item %lx rseqe %lx\n", (uint64_t)item, (uint64_t)item->rseqe); ++ pr_err("nothing to do with cs_pointer\n"); ++ continue; ++ } ++ ++ pr_err("restoring cs ... %lx \n", item->rseqe[i].rseq_cs_pointer); ++ ++ if (ptrace_poke_area(pid, &item->rseqe[i].rseq_cs_pointer, (void *)(item->rseqe[i].rseq_abi_pointer + offsetof(struct rseq, rseq_cs)), sizeof(uint64_t))) { ++ pr_err("Can't restore memfd args (pid: %d)\n", pid); ++ return -1; ++ } ++ } ++ } ++ ++ return 0; ++} ++ + static int catch_tasks(bool root_seized, enum trace_flags *flag) + { + struct pstree_item *item; +@@ -2400,6 +2460,9 @@ skip_ns_bouncing: + if (restore_freezer_state()) + pr_err("Unable to restore freezer state\n"); + ++ /* just before releasing threads we have to restore rseq_cs */ ++ restore_rseq_cs(); ++ + /* Detaches from processes and they continue run through sigreturn. 
*/ + if (finalize_restore_detach()) + goto out_kill_network_unlocked; +diff --git a/criu/include/pstree.h b/criu/include/pstree.h +index 458e5f9..97bef11 100644 +--- a/criu/include/pstree.h ++++ b/criu/include/pstree.h +@@ -25,6 +25,7 @@ struct pstree_item { + int nr_threads; /* number of threads */ + struct pid *threads; /* array of threads */ + CoreEntry **core; ++ RseqEntry *rseqe; + TaskKobjIdsEntry *ids; + union { + futex_t task_st; +diff --git a/images/rseq.proto b/images/rseq.proto +index be28004..45cb847 100644 +--- a/images/rseq.proto ++++ b/images/rseq.proto +@@ -6,4 +6,5 @@ message rseq_entry { + required uint64 rseq_abi_pointer = 1; + required uint32 rseq_abi_size = 2; + required uint32 signature = 3; ++ optional uint64 rseq_cs_pointer = 4; + } +-- +2.30.0 + diff --git a/0016-zdtm-add-rseq02-transition-test-with-NO_RESTART-CS-f.patch b/0016-zdtm-add-rseq02-transition-test-with-NO_RESTART-CS-f.patch new file mode 100644 index 0000000000000000000000000000000000000000..3fd0f02e66abade374b6802738b07b8b6125f07c --- /dev/null +++ b/0016-zdtm-add-rseq02-transition-test-with-NO_RESTART-CS-f.patch @@ -0,0 +1,175 @@ +From bb8295ae4f1224db2236fdd3134912e093ed20d9 Mon Sep 17 00:00:00 2001 +From: bb-cat +Date: Wed, 2 Mar 2022 15:10:24 +0800 +Subject: [PATCH 16/16] zdtm: add rseq02 transition test with NO_RESTART CS + flag Signed-off-by: Alexander Mikhalitsyn + + +--- + test/zdtm/transition/Makefile | 2 + + test/zdtm/transition/rseq01.c | 61 +++++++++++++++++++- + test/zdtm/transition/rseq02.c | 1 + + test/zdtm/transition/rseq02.desc | 1 + + 4 files changed, 63 insertions(+), 2 deletions(-) + create mode 120000 test/zdtm/transition/rseq02.c + create mode 120000 test/zdtm/transition/rseq02.desc + +diff --git a/test/zdtm/transition/Makefile b/test/zdtm/transition/Makefile +index fae4e27..378a4fc 100644 +--- a/test/zdtm/transition/Makefile ++++ b/test/zdtm/transition/Makefile +@@ -24,6 +24,7 @@ TST_NOFILE = \ + pid_reuse \ + pidfd_store_sk \ + rseq01 \ ++ rseq02 \ + + + 
TST_FILE = \ +@@ -82,6 +83,7 @@ ptrace: LDFLAGS += -pthread + fork2: CFLAGS += -D FORK2 + thread-bomb.o: CFLAGS += -pthread + thread-bomb: LDFLAGS += -pthread ++rseq02: CFLAGS += -D NOABORT + + %: %.sh + cp $< $@ +diff --git a/test/zdtm/transition/rseq01.c b/test/zdtm/transition/rseq01.c +index 5fac5a6..25e1d61 100644 +--- a/test/zdtm/transition/rseq01.c ++++ b/test/zdtm/transition/rseq01.c +@@ -53,6 +53,18 @@ enum rseq_flags { + RSEQ_FLAG_UNREGISTER = (1 << 0), + }; + ++enum rseq_cs_flags_bit { ++ RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0, ++ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1, ++ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2, ++}; ++ ++enum rseq_cs_flags { ++ RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT), ++ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT), ++ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT), ++}; ++ + struct rseq { + uint32_t cpu_id_start; + uint32_t cpu_id; +@@ -104,6 +116,7 @@ static int rseq_addv(intptr_t *v, intptr_t count, int cpu) + { + double a = 10000000000000000.0; + double b = -1; ++ uint64_t rseq_cs1, rseq_cs2; + /*test_msg("enter %f %f\n", a, b);*/ + + /* clang-format off */ +@@ -129,6 +142,9 @@ static int rseq_addv(intptr_t *v, intptr_t count, int cpu) + "dec %%rcx\n\t" + "jnz 5b\n\t" + "fstpl %[y]\n\t" ++ "movq %%rax, %[rseq_cs_check2]\n\t" ++ "movq %[rseq_cs], %%rax\n\t" ++ "movq %%rax, %[rseq_cs_check1]\n\t" + "2:\n\t" + ".pushsection __rseq_failure, \"ax\"\n\t" + /* Disassembler-friendly signature: nopl (%rip). 
*/ +@@ -143,6 +159,8 @@ static int rseq_addv(intptr_t *v, intptr_t count, int cpu) + : [cpu_id] "r" (cpu), + [current_cpu_id] "m" (rseq_ptr->cpu_id), + [rseq_cs] "m" (rseq_ptr->rseq_cs), ++ [rseq_cs_check1] "m" (rseq_cs1), ++ [rseq_cs_check2] "m" (rseq_cs2), + /* final store input */ + [v] "m" (*v), + [count] "er" (count), +@@ -153,8 +171,20 @@ static int rseq_addv(intptr_t *v, intptr_t count, int cpu) + ); + /* clang-format on */ + /*test_msg("exit %f %f\n", a, b);*/ ++ test_msg("%lx %lx\n", rseq_cs1, rseq_cs2); ++ if (rseq_cs1 != rseq_cs2) { ++ /* ++ * It means that we finished critical section ++ * *normally* (haven't jumped to abort) but the kernel had cleaned up ++ * rseq_ptr->rseq_cs before we left critical section ++ * and CRIU hasn't restored it correctly. ++ * That indicates a bug. ++ */ ++ return -1; ++ } + return 0; + abort: ++ test_msg("%lx %lx\n", rseq_cs1, rseq_cs2); + /*test_msg("abort %f %f\n", a, b);*/ + return -1; + } +@@ -177,21 +207,48 @@ int main(int argc, char *argv[]) + fail("calloc"); + exit(EXIT_FAILURE); + } ++ + register_thread(); + ++ /* ++ * We want to test that RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL ++ * is handled properly by CRIU, but that flag can be used ++ * only with all the other flags set. 
++ */ ++#ifdef NOABORT ++ rseq_ptr->flags = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | ++ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | ++ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE; ++#endif ++ + test_daemon(); + + while (test_go()) { + cpu = RSEQ_ACCESS_ONCE(rseq_ptr->cpu_id_start); + ret = rseq_addv(&cpu_data[cpu], 2, cpu); +- if (ret) ++#ifndef NOABORT ++ /* just ignore abort */ ++ ret = 0; ++#else ++ if (ret) { + fail("Failed to increment per-cpu counter"); ++ break; ++ } else { ++ //test_msg("cpu_data[%d] == %ld\n", cpu, (long int)cpu_data[cpu]); ++ } ++#endif + } + + test_waitsig(); + + check_thread(); +- pass(); ++ ++ if (ret) ++ fail(); ++ else ++ pass(); + + return 0; + } +diff --git a/test/zdtm/transition/rseq02.c b/test/zdtm/transition/rseq02.c +new file mode 120000 +index 0000000..d564917 +--- /dev/null ++++ b/test/zdtm/transition/rseq02.c +@@ -0,0 +1 @@ ++rseq01.c +\ No newline at end of file +diff --git a/test/zdtm/transition/rseq02.desc b/test/zdtm/transition/rseq02.desc +new file mode 120000 +index 0000000..b888f0d +--- /dev/null ++++ b/test/zdtm/transition/rseq02.desc +@@ -0,0 +1 @@ ++rseq01.desc +\ No newline at end of file +-- +2.30.0 + diff --git a/criu-3.15.tar.bz2 b/criu-3.15.tar.bz2 deleted file mode 100644 index 2c34f9c392b2a987c78318f158263016b0458b61..0000000000000000000000000000000000000000 Binary files a/criu-3.15.tar.bz2 and /dev/null differ diff --git a/criu-3.16.1.tar.gz b/criu-3.16.1.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..b4767a39de787397647c9e7cbabfc396a26208f1 Binary files /dev/null and b/criu-3.16.1.tar.gz differ diff --git a/criu.spec b/criu.spec index 5f2ad070204510cc02904865c78e8b57ada62029..9dace211bee82adb1c533ed440fbc51b91428d95 100644 --- a/criu.spec +++ b/criu.spec @@ -1,12 +1,12 @@ Name: criu -Version: 3.15 -Release: 3 +Version: 3.16.1 +Release: 2 Provides: crtools = %{version}-%{release} Obsoletes: crtools <= 1.0-2 Summary: A tool of Checkpoint/Restore in User-space License: GPL-2.0-or-later or 
LGPL-2.1-only URL: http://criu.org/ -Source0: http://download.openvz.org/criu/criu-%{version}.tar.bz2 +Source0: http://github.com/checkpoint-restore/criu/archive/v%{version}/%{name}-%{version}.tar.gz BuildRequires: systemd libnet-devel asciidoc xmlto perl-interpreter libselinux-devel gcc BuildRequires: protobuf-devel protobuf-c-devel python3-devel libnl3-devel libcap-devel Recommends: tar @@ -15,12 +15,22 @@ Requires: %{name} = %{version}-%{release} Provides: %{name}-libs = %{version}-%{release} Obsoletes: %{name}-libs < %{version}-%{release} -Patch0001: 0001-Fix-crit-encode-TypeError.patch -Patch0002: 0002-Fix-crit-info-struct-unpack-error.patch -Patch0003: 0003-Fix-crit-x-UnicodeDecodeError.patch -Patch0004: 0004-criu-dump-and-restore-cpu-affinity-of-each-thread.patch -Patch0005: 0005-vdso-fix-segmentation-fault-caused-by-char-pointer-a.patch -Patch0006: 0006-criu-add-pin-memory-method.patch +Patch1: 0001-criu-dump-and-restore-cpu-affinity-of-each-thread.patch +Patch2: 0002-compel-add-rseq-syscall-into-compel-std-plugin-sysca.patch +Patch3: 0003-kerndat-check-for-rseq-syscall-support.patch +Patch4: 0004-util-move-fork_and_ptrace_attach-helper-from-cr-chec.patch +Patch5: 0005-cr-check-Add-ptrace-rseq-conf-dump-feature.patch +Patch6: 0006-rseq-initial-support.patch +Patch7: 0007-zdtm-add-simple-test-for-rseq-C-R.patch +Patch8: 0008-ci-add-Fedora-Rawhide-based-test-on-Cirrus.patch +Patch9: 0009-include-add-thread_pointer.h-from-Glibc.patch +Patch10: 0010-clone-noasan-unregister-rseq-at-the-thread-start-for.patch +Patch11: 0011-zdtm-static-rseq00-fix-rseq-test-when-linking-with-a.patch +Patch12: 0012-compel-add-helpers-to-get-set-instruction-pointer.patch +Patch13: 0013-cr-dump-fixup-thread-IP-when-inside-rseq-cs.patch +Patch14: 0014-zdtm-add-rseq-transition-test-for-amd64.patch +Patch15: 0015-cr-dump-handle-rseq-flags-field.patch +Patch16: 0016-zdtm-add-rseq02-transition-test-with-NO_RESTART-CS-f.patch %description Checkpoint/Restore in Userspace(CRIU),is a software 
tool for the linux operating system. @@ -50,6 +60,12 @@ Requires: python3-criu = %{version}-%{release} %description -n crit A tool for CRIU image. +%package -n criu-ns +Summary: Tool to run CRIU in different namespaces +Requires: %{name} = %{version}-%{release} + +%description -n criu-ns + %package help Summary: Help documents for criu @@ -60,7 +76,7 @@ Help documents for criu. %autosetup -n %{name}-%{version} -p1 %build -CFLAGS+=`echo %{optflags} | sed -e 's,-fstack-protector\S*,,g'` make V=1 WERROR=0 PREFIX=%{_prefix} RUNDIR=/run/criu PYTHON=python3 +CFLAGS+=`echo %{optflags}` make V=1 WERROR=0 PREFIX=%{_prefix} RUNDIR=/run/criu PYTHON=python3 %install make install-criu DESTDIR=%{buildroot} PREFIX=%{_prefix} LIBDIR=%{_libdir} @@ -88,12 +104,24 @@ chmod 0755 %{buildroot}/run/%{name}/ %files -n crit %{_bindir}/crit +%files -n criu-ns +%{_sbindir}/criu-ns + %files help %doc README.md COPYING %doc %{_mandir}/man8/criu.8* -%doc %{_mandir}/man1/{compel.1*,crit.1*} +%doc %{_mandir}/man1/{compel.1*,crit.1*,criu-ns.1*} %changelog +* Fri Mar 4 2022 ningyu - 3.16.1-2 +- rseq c/r support + +* Thu Dec 2 2021 zhouwenpei - 3.16.1-1 +- upgrade criu version to 3.16.1 + +* Tue Sep 07 2021 chenchen - 3.15-4 +- add "-fstack-protector-strong" for libcriu.so.2.0 + * Mon May 31 2021 baizhonggui - 3.15-3 - Add gcc in BuildRequires