From a40e7d9578c59dbd7299b0c89bf664a000b19b26 Mon Sep 17 00:00:00 2001 From: "fu.lin" Date: Tue, 13 Apr 2021 14:20:56 +0800 Subject: [PATCH] criu: add backport patch set upstream backport: - 0005-vdso-fix-segmentation-fault-caused-by-char-pointer-a.patch * commit id: 2cb1156 feature: - 0004-criu-dump-and-restore-cpu-affinity-of-each-thread.patch * support checkpoint/restore cpu affinity - 0006-criu-add-pin-memory-method.patch * support kernel feature pin memory option Signed-off-by: fu.lin --- 0001-Fix-crit-encode-TypeError.patch | 6 +- 0002-Fix-crit-info-struct-unpack-error.patch | 8 +- 0003-Fix-crit-x-UnicodeDecodeError.patch | 6 +- ...-restore-cpu-affinity-of-each-thread.patch | 418 ++++++++++++++++++ ...ation-fault-caused-by-char-pointer-a.patch | 193 ++++++++ 0006-criu-add-pin-memory-method.patch | 268 +++++++++++ criu.spec | 3 + 7 files changed, 892 insertions(+), 10 deletions(-) create mode 100644 0004-criu-dump-and-restore-cpu-affinity-of-each-thread.patch create mode 100644 0005-vdso-fix-segmentation-fault-caused-by-char-pointer-a.patch create mode 100644 0006-criu-add-pin-memory-method.patch diff --git a/0001-Fix-crit-encode-TypeError.patch b/0001-Fix-crit-encode-TypeError.patch index 145c559..ef187d3 100644 --- a/0001-Fix-crit-encode-TypeError.patch +++ b/0001-Fix-crit-encode-TypeError.patch @@ -1,7 +1,7 @@ -From d1c7216c4265c45bcb8b9380b8ad4e5ed69d014e Mon Sep 17 00:00:00 2001 +From 22bd1e20cbd3c26d2e5dba76e3b0a95ff0a2e154 Mon Sep 17 00:00:00 2001 From: lingsheng Date: Tue, 22 Sep 2020 14:36:55 +0800 -Subject: [PATCH 1/3] Fix crit encode TypeError +Subject: [PATCH 1/6] Fix crit encode TypeError --- lib/py/cli.py | 5 ++++- @@ -24,5 +24,5 @@ index da34302..966dd4e 100755 return sys.stdout -- -2.23.0 +1.8.3.1 diff --git a/0002-Fix-crit-info-struct-unpack-error.patch b/0002-Fix-crit-info-struct-unpack-error.patch index 4bfe29d..3f6354a 100644 --- a/0002-Fix-crit-info-struct-unpack-error.patch +++ b/0002-Fix-crit-info-struct-unpack-error.patch @@ -1,14 +1,14 
@@ -From a1d4d678de01b0569e8d36894a8d60a8b75bb016 Mon Sep 17 00:00:00 2001 +From be4a5e65791d18d1e26d6299e80a65324c5fc07e Mon Sep 17 00:00:00 2001 From: lingsheng Date: Tue, 22 Sep 2020 14:39:22 +0800 -Subject: [PATCH 2/3] Fix crit info struct unpack error +Subject: [PATCH 2/6] Fix crit info struct unpack error --- lib/py/images/images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/py/images/images.py b/lib/py/images/images.py -index f4517d8..72205fe 100644 +index 9c8e144..c330b97 100644 --- a/lib/py/images/images.py +++ b/lib/py/images/images.py @@ -171,7 +171,7 @@ class entry_handler: @@ -21,5 +21,5 @@ index f4517d8..72205fe 100644 size, = struct.unpack('i', buf) f.seek(size, 1) -- -2.23.0 +1.8.3.1 diff --git a/0003-Fix-crit-x-UnicodeDecodeError.patch b/0003-Fix-crit-x-UnicodeDecodeError.patch index c59f2b7..5c7c506 100644 --- a/0003-Fix-crit-x-UnicodeDecodeError.patch +++ b/0003-Fix-crit-x-UnicodeDecodeError.patch @@ -1,7 +1,7 @@ -From b2eea766a1f41553b76fef8d669e288ff552d0ed Mon Sep 17 00:00:00 2001 +From 4f139d2803773c86e5cf557c879392e7b79238b3 Mon Sep 17 00:00:00 2001 From: lingsheng Date: Tue, 22 Sep 2020 14:40:35 +0800 -Subject: [PATCH 3/3] Fix crit x UnicodeDecodeError +Subject: [PATCH 3/6] Fix crit x UnicodeDecodeError --- lib/py/cli.py | 2 +- @@ -21,5 +21,5 @@ index 966dd4e..f7bda23 100755 def decode(opts): -- -2.23.0 +1.8.3.1 diff --git a/0004-criu-dump-and-restore-cpu-affinity-of-each-thread.patch b/0004-criu-dump-and-restore-cpu-affinity-of-each-thread.patch new file mode 100644 index 0000000..0384f1e --- /dev/null +++ b/0004-criu-dump-and-restore-cpu-affinity-of-each-thread.patch @@ -0,0 +1,418 @@ +From 4a49af49be378835b65016d5465eae44107a52e1 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Tue, 13 Apr 2021 10:39:45 +0800 +Subject: [PATCH 4/6] criu: dump and restore cpu affinity of each thread + +Criu should dump and restore threads' or processes' +cpu affinity. 
+ +Add one entry of thread_cpuallow_entry into +thread_core_entry to save cpu affinity info. + +Restore it after threads restored but before running. + +Add option --with-cpu-affinity to enable this function +at restore. + +Signed-off-by: Sang Yan +--- + compel/arch/arm/plugins/std/syscalls/syscall.def | 1 + + .../ppc64/plugins/std/syscalls/syscall-ppc64.tbl | 1 + + .../s390/plugins/std/syscalls/syscall-s390.tbl | 1 + + .../arch/x86/plugins/std/syscalls/syscall_32.tbl | 1 + + .../arch/x86/plugins/std/syscalls/syscall_64.tbl | 1 + + criu/config.c | 1 + + criu/cr-dump.c | 14 ++++++++ + criu/cr-restore.c | 26 ++++++++++++++ + criu/crtools.c | 2 ++ + criu/include/cr_options.h | 2 ++ + criu/include/restorer.h | 3 ++ + criu/pie/restorer.c | 38 ++++++++++++++++++++ + criu/pstree.c | 7 ++++ + images/core.proto | 5 +++ + test/zdtm/static/Makefile | 1 + + test/zdtm/static/cpu-affinity0.c | 42 ++++++++++++++++++++++ + test/zdtm/static/cpu-affinity0.desc | 1 + + 17 files changed, 147 insertions(+) + create mode 100644 test/zdtm/static/cpu-affinity0.c + create mode 100644 test/zdtm/static/cpu-affinity0.desc + +diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def +index f7ebc85..d577373 100644 +--- a/compel/arch/arm/plugins/std/syscalls/syscall.def ++++ b/compel/arch/arm/plugins/std/syscalls/syscall.def +@@ -116,3 +116,4 @@ fsopen 430 430 (char *fsname, unsigned int flags) + fsconfig 431 431 (int fd, unsigned int cmd, const char *key, const char *value, int aux) + fsmount 432 432 (int fd, unsigned int flags, unsigned int attr_flags) + clone3 435 435 (struct clone_args *uargs, size_t size) ++sched_setaffinity 122 241 (int fd, size_t cpusetsize, const cpu_set_t *mask) +diff --git a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl +index 1afaf1e..fa64545 100644 +--- a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl ++++ 
b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl +@@ -112,3 +112,4 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) + __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) + __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) + __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) ++__NR_sched_setaffinity 222 sys_sched_setaffinity (int fd, size_t cpusetsize, const cpu_set_t *mask) +diff --git a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl +index ae6fdb5..16f1994 100644 +--- a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl ++++ b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl +@@ -112,3 +112,4 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) + __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) + __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) + __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) ++__NR_sched_setaffinity 239 sys_sched_setaffinity (int fd, size_t cpusetsize, const cpu_set_t *mask) +diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl +index 7a48711..29c13e3 100644 +--- a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl ++++ b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl +@@ -63,6 +63,7 @@ __NR_mincore 218 sys_mincore (void *addr, unsigned long size, unsigned char * + __NR_madvise 219 sys_madvise (unsigned long start, size_t len, int behavior) + __NR_gettid 224 sys_gettid (void) + __NR_futex 240 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) ++__NR_sched_setaffinity 241 sys_sched_setaffinity (int fd, size_t cpusetsize, const cpu_set_t *mask) + __NR_set_thread_area 243 sys_set_thread_area (user_desc_t 
*info) + __NR_get_thread_area 244 sys_get_thread_area (user_desc_t *info) + __NR_io_setup 245 sys_io_setup (unsigned nr_reqs, aio_context_t *ctx32p) +diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl +index 6667c07..74f5482 100644 +--- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl ++++ b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl +@@ -73,6 +73,7 @@ __NR_mount 165 sys_mount (char *dev_nmae, char *dir_name, char *type, unsign + __NR_umount2 166 sys_umount2 (char *name, int flags) + __NR_gettid 186 sys_gettid (void) + __NR_futex 202 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) ++__NR_sched_setaffinity 203 sys_sched_setaffinity (int fd, size_t cpusetsize, const cpu_set_t *mask) + __NR_set_thread_area 205 sys_set_thread_area (user_desc_t *info) + __NR_io_setup 206 sys_io_setup (unsigned nr_events, aio_context_t *ctx) + __NR_io_getevents 208 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) +diff --git a/criu/config.c b/criu/config.c +index 08606fb..5a53256 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -541,6 +541,7 @@ int parse_options(int argc, char **argv, bool *usage_error, + { "cgroup-yard", required_argument, 0, 1096 }, + { "pre-dump-mode", required_argument, 0, 1097}, + { "file-validation", required_argument, 0, 1098 }, ++ BOOL_OPT("with-cpu-affinity", &opts.with_cpu_affinity), + { }, + }; + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index b9d2914..f078c27 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -140,6 +140,7 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc) + { + int ret; + struct sched_param sp; ++ cpu_set_t cpumask; + + BUILD_BUG_ON(SCHED_OTHER != 0); /* default in proto message */ + +@@ -185,6 +186,19 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc) + tc->has_sched_nice = true; + tc->sched_nice = ret; + ++ 
pr_info("\tdumping allowed cpus for %d\n", pid); ++ ret = syscall(__NR_sched_getaffinity, pid, sizeof(cpumask), &cpumask); ++ if (ret < 0) { ++ pr_perror("Can't get sched affinity for %d", pid); ++ return -1; ++ } ++ memcpy(tc->allowed_cpus->cpumask, &cpumask, sizeof(cpu_set_t)); ++ pr_info("\t 0x%llx, 0x%llx, 0x%llx, 0x%llx\n", ++ (unsigned long long)tc->allowed_cpus->cpumask[3], ++ (unsigned long long)tc->allowed_cpus->cpumask[2], ++ (unsigned long long)tc->allowed_cpus->cpumask[1], ++ (unsigned long long)tc->allowed_cpus->cpumask[0]); ++ + return 0; + } + +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 589087f..da2e53d 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -118,6 +118,7 @@ static int prepare_restorer_blob(void); + static int prepare_rlimits(int pid, struct task_restore_args *, CoreEntry *core); + static int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core); + static int prepare_signals(int pid, struct task_restore_args *, CoreEntry *core); ++static int prepare_allowed_cpus(int pid, struct task_restore_args *ta, CoreEntry *leader_core); + + /* + * Architectures can overwrite this function to restore registers that are not +@@ -922,6 +923,9 @@ static int restore_one_alive_task(int pid, CoreEntry *core) + if (prepare_signals(pid, ta, core)) + return -1; + ++ if (prepare_allowed_cpus(pid, ta, core)) ++ return -1; ++ + if (prepare_posix_timers(pid, ta, core)) + return -1; + +@@ -3196,6 +3200,27 @@ out: + return ret; + } + ++static int prepare_allowed_cpus(int pid, struct task_restore_args *ta, CoreEntry *leader_core) ++{ ++ int i; ++ int *need_cpu_affinity; ++ cpu_set_t *cpumaks; ++ ++ ta->allowed_cpus = (char *)rst_mem_align_cpos(RM_PRIVATE); ++ ++ need_cpu_affinity = rst_mem_alloc(sizeof(int), RM_PRIVATE); ++ *need_cpu_affinity = opts.with_cpu_affinity; ++ ++ for (i = 0; i < current->nr_threads; i++) { ++ cpumaks = rst_mem_alloc(sizeof(cpu_set_t), RM_PRIVATE); ++ if (!cpumaks) ++ return -1; ++ ++ 
memcpy(cpumaks, current->core[i]->thread_core->allowed_cpus->cpumask, sizeof(cpu_set_t)); ++ } ++ return 0; ++} ++ + extern void __gcov_flush(void) __attribute__((weak)); + void __gcov_flush(void) {} + +@@ -3655,6 +3680,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns + RST_MEM_FIXUP_PPTR(task_args->timerfd); + RST_MEM_FIXUP_PPTR(task_args->posix_timers); + RST_MEM_FIXUP_PPTR(task_args->siginfo); ++ RST_MEM_FIXUP_PPTR(task_args->allowed_cpus); + RST_MEM_FIXUP_PPTR(task_args->rlims); + RST_MEM_FIXUP_PPTR(task_args->helpers); + RST_MEM_FIXUP_PPTR(task_args->zombies); +diff --git a/criu/crtools.c b/criu/crtools.c +index 2eb5dba..0f04a85 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -441,6 +441,8 @@ usage: + " --file-validation METHOD\n" + " pass the validation method to be used; argument\n" + " can be 'filesize' or 'buildid' (default).\n" ++" --with-cpu-affinity Allow to restore cpu affinity. Only for hosts with\n" ++" same cpu quantity.\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index ac1c9e9..fda54a4 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -174,6 +174,8 @@ struct cr_options { + + /* This stores which method to use for file validation. 
*/ + int file_validation_method; ++ /* restore cpu affinity */ ++ int with_cpu_affinity; + }; + + extern struct cr_options opts; +diff --git a/criu/include/restorer.h b/criu/include/restorer.h +index dfb4e6b..bd6ef6a 100644 +--- a/criu/include/restorer.h ++++ b/criu/include/restorer.h +@@ -1,6 +1,7 @@ + #ifndef __CR_RESTORER_H__ + #define __CR_RESTORER_H__ + ++#include + #include + #include + #include +@@ -162,6 +163,8 @@ struct task_restore_args { + siginfo_t *siginfo; + unsigned int siginfo_n; + ++ char *allowed_cpus; ++ + struct rst_tcp_sock *tcp_socks; + unsigned int tcp_socks_n; + +diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c +index b3d7e2b..c63f96b 100644 +--- a/criu/pie/restorer.c ++++ b/criu/pie/restorer.c +@@ -432,6 +432,40 @@ static int restore_signals(siginfo_t *ptr, int nr, bool group) + return 0; + } + ++static int restore_cpu_affinity(struct task_restore_args *args) ++{ ++ int i; ++ int pid; ++ int ret; ++ int *need_cpu_affinity; ++ cpu_set_t *cpumask; ++ cpu_set_t *allowed_cpus; ++ ++ need_cpu_affinity = (int *)args->allowed_cpus; ++ if (!*need_cpu_affinity) { ++ pr_debug("No need to restore cpu affinity.\n"); ++ return 0; ++ } ++ ++ allowed_cpus = (cpu_set_t *)(args->allowed_cpus + sizeof(int)); ++ for (i = 0; i < args->nr_threads; i++) { ++ pid = args->thread_args[i].pid; ++ cpumask = &allowed_cpus[i]; ++ pr_info("Restoring %d allowed_cpus %llx, %llx, %llx, %llx\n", pid, ++ (unsigned long long)cpumask->__bits[3], ++ (unsigned long long)cpumask->__bits[2], ++ (unsigned long long)cpumask->__bits[1], ++ (unsigned long long)cpumask->__bits[0]); ++ ret = sys_sched_setaffinity(pid, sizeof(cpu_set_t), cpumask); ++ if (ret) { ++ pr_err("\t Restore %d cpumask failed.\n", pid); ++ return ret; ++ } ++ } ++ ++ return 0; ++} ++ + static int restore_seccomp_filter(pid_t tid, struct thread_restore_args *args) + { + unsigned int flags = args->seccomp_force_tsync ? 
SECCOMP_FILTER_FLAG_TSYNC : 0; +@@ -1900,6 +1934,10 @@ long __export_restore_task(struct task_restore_args *args) + if (ret) + goto core_restore_end; + ++ ret = restore_cpu_affinity(args); ++ if (ret) ++ goto core_restore_end; ++ + restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD); + + rst_tcp_socks_all(args); +diff --git a/criu/pstree.c b/criu/pstree.c +index a876615..f0d7622 100644 +--- a/criu/pstree.c ++++ b/criu/pstree.c +@@ -58,11 +58,13 @@ CoreEntry *core_entry_alloc(int th, int tsk) + CredsEntry *ce = NULL; + + sz += sizeof(ThreadCoreEntry) + sizeof(ThreadSasEntry) + sizeof(CredsEntry); ++ sz += sizeof(ThreadAllowedcpusEntry); + + sz += CR_CAP_SIZE * sizeof(ce->cap_inh[0]); + sz += CR_CAP_SIZE * sizeof(ce->cap_prm[0]); + sz += CR_CAP_SIZE * sizeof(ce->cap_eff[0]); + sz += CR_CAP_SIZE * sizeof(ce->cap_bnd[0]); ++ sz += sizeof(cpu_set_t); + /* + * @groups are dynamic and allocated + * on demand. +@@ -127,6 +129,11 @@ CoreEntry *core_entry_alloc(int th, int tsk) + ce->cap_eff = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_eff[0])); + ce->cap_bnd = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_bnd[0])); + ++ core->thread_core->allowed_cpus = xptr_pull(&m, ThreadAllowedcpusEntry); ++ thread_allowedcpus_entry__init(core->thread_core->allowed_cpus); ++ core->thread_core->allowed_cpus->n_cpumask = sizeof(cpu_set_t) / sizeof(uint64_t); ++ core->thread_core->allowed_cpus->cpumask = xptr_pull_s(&m, sizeof(cpu_set_t)); ++ + if (arch_alloc_thread_info(core)) { + xfree(core); + core = NULL; +diff --git a/images/core.proto b/images/core.proto +index 9e9e393..2981120 100644 +--- a/images/core.proto ++++ b/images/core.proto +@@ -81,6 +81,10 @@ message thread_sas_entry { + required uint32 ss_flags = 3; + } + ++message thread_allowedcpus_entry { ++ repeated uint64 cpumask = 1; ++} ++ + message thread_core_entry { + required uint64 futex_rla = 1; + required uint32 futex_rla_len = 2; +@@ -99,6 +103,7 @@ message thread_core_entry { + + optional string comm = 13; + 
optional uint64 blk_sigset_extended = 14; ++ required thread_allowedcpus_entry allowed_cpus = 15; + } + + message task_rlimits_entry { +diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile +index aae4983..ad8fc6a 100644 +--- a/test/zdtm/static/Makefile ++++ b/test/zdtm/static/Makefile +@@ -235,6 +235,7 @@ TST_NOFILE := \ + timens_nested \ + timens_for_kids \ + zombie_leader \ ++ cpu-affinity0 \ + # jobctl00 \ + + pkg-config-check = $(shell sh -c 'pkg-config $(1) && echo y') +diff --git a/test/zdtm/static/cpu-affinity0.c b/test/zdtm/static/cpu-affinity0.c +new file mode 100644 +index 0000000..83dee19 +--- /dev/null ++++ b/test/zdtm/static/cpu-affinity0.c +@@ -0,0 +1,42 @@ ++#include ++#include ++#include ++ ++#include "zdtmtst.h" ++ ++const char *test_doc = "Check that with-cpu-affinity option can restore cpu affinity"; ++const char *test_author = "Sang Yan "; ++ ++int main(int argc, char **argv) ++{ ++ cpu_set_t old; ++ cpu_set_t new; ++ ++ test_init(argc, argv); ++ ++ CPU_ZERO(&old); ++ CPU_ZERO(&new); ++ ++ /* test only 0 core because of CI test env limited */ ++ CPU_SET(0, &old); ++ ++ if (sched_setaffinity(getpid(), sizeof(old), &old) < 0) { ++ pr_perror("Can't set old cpu affinity! errno: %d", errno); ++ exit(1); ++ } ++ ++ test_daemon(); ++ test_waitsig(); ++ ++ if (sched_getaffinity(getpid(), sizeof(new), &new) < 0) { ++ pr_perror("Can't get new cpu affinity! 
errno: %d", errno); ++ exit(1); ++ } ++ ++ if (memcmp(&old, &new, sizeof(cpu_set_t))) ++ fail("Cpu affinity restore failed."); ++ else ++ pass(); ++ ++ return 0; ++} +diff --git a/test/zdtm/static/cpu-affinity0.desc b/test/zdtm/static/cpu-affinity0.desc +new file mode 100644 +index 0000000..0d0b8ae +--- /dev/null ++++ b/test/zdtm/static/cpu-affinity0.desc +@@ -0,0 +1 @@ ++{'dopts': '', 'ropts': '--with-cpu-affinity', 'flags': 'reqrst '} +-- +1.8.3.1 + diff --git a/0005-vdso-fix-segmentation-fault-caused-by-char-pointer-a.patch b/0005-vdso-fix-segmentation-fault-caused-by-char-pointer-a.patch new file mode 100644 index 0000000..6935cc6 --- /dev/null +++ b/0005-vdso-fix-segmentation-fault-caused-by-char-pointer-a.patch @@ -0,0 +1,193 @@ +From 1c34f736caefd92ed6e131c5a4eb1699e2a24e35 Mon Sep 17 00:00:00 2001 +From: anatasluo +Date: Fri, 29 Jan 2021 13:48:57 +0000 +Subject: [PATCH 5/6] vdso: fix segmentation fault caused by char pointer array + +When I compile criu with "make DEBUG=1" and run it to restore my +program, it produces a segmentation fault. + +In aarch64, with compile flag "-O0", when criu executes the code in pie, +it is unable to visit the content of ARCH_VDSO_SYMBOLS. So I put these +variables into the stack. + +Signed-off-by: anatasluo +--- + criu/arch/aarch64/include/asm/vdso.h | 17 +++++++++-------- + criu/arch/arm/include/asm/vdso.h | 9 ++++++--- + criu/arch/ppc64/include/asm/vdso.h | 34 +++++++++++++++++++++++----------- + criu/arch/s390/include/asm/vdso.h | 17 +++++++++++------ + criu/arch/x86/include/asm/vdso.h | 23 ++++++++++++++++------- + criu/pie/util-vdso.c | 2 ++ + 6 files changed, 67 insertions(+), 35 deletions(-) + +diff --git a/criu/arch/aarch64/include/asm/vdso.h b/criu/arch/aarch64/include/asm/vdso.h +index 8a65e09..97a2440 100644 +--- a/criu/arch/aarch64/include/asm/vdso.h ++++ b/criu/arch/aarch64/include/asm/vdso.h +@@ -16,15 +16,16 @@ + * Workaround for VDSO array symbol table's relocation. 
+ * XXX: remove when compel/piegen will support aarch64. + */ +-static const char* __maybe_unused aarch_vdso_symbol1 = "__kernel_clock_getres"; +-static const char* __maybe_unused aarch_vdso_symbol2 = "__kernel_clock_gettime"; +-static const char* __maybe_unused aarch_vdso_symbol3 = "__kernel_gettimeofday"; +-static const char* __maybe_unused aarch_vdso_symbol4 = "__kernel_rt_sigreturn"; ++#define ARCH_VDSO_SYMBOLS_LIST \ ++ const char* aarch_vdso_symbol1 = "__kernel_clock_getres"; \ ++ const char* aarch_vdso_symbol2 = "__kernel_clock_gettime"; \ ++ const char* aarch_vdso_symbol3 = "__kernel_gettimeofday"; \ ++ const char* aarch_vdso_symbol4 = "__kernel_rt_sigreturn"; + +-#define ARCH_VDSO_SYMBOLS \ +- aarch_vdso_symbol1, \ +- aarch_vdso_symbol2, \ +- aarch_vdso_symbol3, \ ++#define ARCH_VDSO_SYMBOLS \ ++ aarch_vdso_symbol1, \ ++ aarch_vdso_symbol2, \ ++ aarch_vdso_symbol3, \ + aarch_vdso_symbol4 + + extern void write_intraprocedure_branch(unsigned long to, unsigned long from); +diff --git a/criu/arch/arm/include/asm/vdso.h b/criu/arch/arm/include/asm/vdso.h +index f57790a..e96514e 100644 +--- a/criu/arch/arm/include/asm/vdso.h ++++ b/criu/arch/arm/include/asm/vdso.h +@@ -11,8 +11,11 @@ + */ + #define VDSO_SYMBOL_MAX 2 + #define VDSO_SYMBOL_GTOD 1 +-#define ARCH_VDSO_SYMBOLS \ +- "__vdso_clock_gettime", \ +- "__vdso_gettimeofday" ++#define ARCH_VDSO_SYMBOLS_LIST \ ++ const char* aarch_vdso_symbol1 = "__vdso_clock_gettime"; \ ++ const char* aarch_vdso_symbol2 = "__vdso_gettimeofday"; ++#define ARCH_VDSO_SYMBOLS \ ++ aarch_vdso_symbol1, \ ++ aarch_vdso_symbol2, + + #endif /* __CR_ASM_VDSO_H__ */ +diff --git a/criu/arch/ppc64/include/asm/vdso.h b/criu/arch/ppc64/include/asm/vdso.h +index 6c92348..fe04336 100644 +--- a/criu/arch/ppc64/include/asm/vdso.h ++++ b/criu/arch/ppc64/include/asm/vdso.h +@@ -14,16 +14,28 @@ + */ + #define VDSO_SYMBOL_MAX 10 + #define VDSO_SYMBOL_GTOD 5 +-#define ARCH_VDSO_SYMBOLS \ +- "__kernel_clock_getres", \ +- "__kernel_clock_gettime", \ +- 
"__kernel_get_syscall_map", \ +- "__kernel_get_tbfreq", \ +- "__kernel_getcpu", \ +- "__kernel_gettimeofday", \ +- "__kernel_sigtramp_rt64", \ +- "__kernel_sync_dicache", \ +- "__kernel_sync_dicache_p5", \ +- "__kernel_time" ++#define ARCH_VDSO_SYMBOLS_LIST \ ++ const char* aarch_vdso_symbol1 = "__kernel_clock_getres"; \ ++ const char* aarch_vdso_symbol2 = "__kernel_clock_gettime"; \ ++ const char* aarch_vdso_symbol3 = "__kernel_get_syscall_map"; \ ++ const char* aarch_vdso_symbol4 = "__kernel_get_tbfreq"; \ ++ const char* aarch_vdso_symbol5 = "__kernel_getcpu"; \ ++ const char* aarch_vdso_symbol6 = "__kernel_gettimeofday"; \ ++ const char* aarch_vdso_symbol7 = "__kernel_sigtramp_rt64"; \ ++ const char* aarch_vdso_symbol8 = "__kernel_sync_dicache"; \ ++ const char* aarch_vdso_symbol9 = "__kernel_sync_dicache_p5"; \ ++ const char* aarch_vdso_symbol10 = "__kernel_time"; ++ ++#define ARCH_VDSO_SYMBOLS \ ++ aarch_vdso_symbol1, \ ++ aarch_vdso_symbol2, \ ++ aarch_vdso_symbol3, \ ++ aarch_vdso_symbol4, \ ++ aarch_vdso_symbol5, \ ++ aarch_vdso_symbol6, \ ++ aarch_vdso_symbol7, \ ++ aarch_vdso_symbol8, \ ++ aarch_vdso_symbol9, \ ++ aarch_vdso_symbol10 + + #endif /* __CR_ASM_VDSO_H__ */ +diff --git a/criu/arch/s390/include/asm/vdso.h b/criu/arch/s390/include/asm/vdso.h +index c54d848..ac71f59 100644 +--- a/criu/arch/s390/include/asm/vdso.h ++++ b/criu/arch/s390/include/asm/vdso.h +@@ -12,13 +12,18 @@ + #define VDSO_SYMBOL_GTOD 0 + + /* +- * This definition is used in pie/util-vdso.c to initialize the vdso symbol ++ * These definitions are used in pie/util-vdso.c to initialize the vdso symbol + * name string table 'vdso_symbols' + */ +-#define ARCH_VDSO_SYMBOLS \ +- "__kernel_gettimeofday", \ +- "__kernel_clock_gettime", \ +- "__kernel_clock_getres", \ +- "__kernel_getcpu" ++#define ARCH_VDSO_SYMBOLS_LIST \ ++ const char* aarch_vdso_symbol1 = "__kernel_gettimeofday"; \ ++ const char* aarch_vdso_symbol2 = "__kernel_clock_gettime"; \ ++ const char* aarch_vdso_symbol3 = 
"__kernel_clock_getres"; \ ++ const char* aarch_vdso_symbol4 = "__kernel_getcpu"; ++#define ARCH_VDSO_SYMBOLS \ ++ aarch_vdso_symbol1, \ ++ aarch_vdso_symbol2, \ ++ aarch_vdso_symbol3, \ ++ aarch_vdso_symbol4 + + #endif /* __CR_ASM_VDSO_H__ */ +diff --git a/criu/arch/x86/include/asm/vdso.h b/criu/arch/x86/include/asm/vdso.h +index 28ae2d1..54d1fba 100644 +--- a/criu/arch/x86/include/asm/vdso.h ++++ b/criu/arch/x86/include/asm/vdso.h +@@ -35,13 +35,22 @@ + * vsyscall will be patched again when addressing: + * https://github.com/checkpoint-restore/criu/issues/512 + */ +-#define ARCH_VDSO_SYMBOLS \ +- "__vdso_clock_gettime", \ +- "__vdso_getcpu", \ +- "__vdso_gettimeofday", \ +- "__vdso_time", \ +- "__kernel_sigreturn", \ +- "__kernel_rt_sigreturn" ++ ++#define ARCH_VDSO_SYMBOLS_LIST \ ++ const char* aarch_vdso_symbol1 = "__vdso_clock_gettime"; \ ++ const char* aarch_vdso_symbol2 = "__vdso_getcpu"; \ ++ const char* aarch_vdso_symbol3 = "__vdso_gettimeofday"; \ ++ const char* aarch_vdso_symbol4 = "__vdso_time"; \ ++ const char* aarch_vdso_symbol5 = "__kernel_sigreturn"; \ ++ const char* aarch_vdso_symbol6 = "__kernel_rt_sigreturn"; ++ ++#define ARCH_VDSO_SYMBOLS \ ++ aarch_vdso_symbol1, \ ++ aarch_vdso_symbol2, \ ++ aarch_vdso_symbol3, \ ++ aarch_vdso_symbol4, \ ++ aarch_vdso_symbol5, \ ++ aarch_vdso_symbol6 + + /* "__kernel_vsyscall", */ + +diff --git a/criu/pie/util-vdso.c b/criu/pie/util-vdso.c +index 58b2768..c717f2d 100644 +--- a/criu/pie/util-vdso.c ++++ b/criu/pie/util-vdso.c +@@ -219,6 +219,8 @@ static void parse_elf_symbols(uintptr_t mem, size_t size, Phdr_t *load, + struct vdso_symtable *t, uintptr_t dynsymbol_names, + Hash_t *hash, Dyn_t *dyn_symtab) + { ++ ARCH_VDSO_SYMBOLS_LIST ++ + const char *vdso_symbols[VDSO_SYMBOL_MAX] = { + ARCH_VDSO_SYMBOLS + }; +-- +1.8.3.1 + diff --git a/0006-criu-add-pin-memory-method.patch b/0006-criu-add-pin-memory-method.patch new file mode 100644 index 0000000..e29a0e6 --- /dev/null +++ b/0006-criu-add-pin-memory-method.patch 
@@ -0,0 +1,268 @@ +From 4c11832330e6c7b924b96c7ea70c14025fe0d970 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Tue, 13 Apr 2021 14:10:23 +0800 +Subject: [PATCH 6/6] criu: add pin memory method + +We can use the checkpoint and restore in userspace method to dump +and restore tasks when updating the kernel. Currently, criu needs +to dump all memory data of tasks to files. When the memory size is +very large (larger than 1GiB), the time cost of dumping the data +will be very long (more than 1 min). + +We can pin the memory data of tasks and collect the corresponding +physical page mapping info in the checkpoint process, and remap the +physical pages to restore tasks in the restore process. + +Signed-off-by: Jingxian He +--- + criu/config.c | 1 + + criu/cr-restore.c | 5 +++ + criu/include/cr_options.h | 1 + + criu/include/restorer.h | 24 ++++++++++++ + criu/mem.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++- + criu/pie/restorer.c | 21 ++++++++++- + 6 files changed, 146 insertions(+), 2 deletions(-) + +diff --git a/criu/config.c b/criu/config.c +index 5a53256..61b81fa 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -542,6 +542,7 @@ int parse_options(int argc, char **argv, bool *usage_error, + { "pre-dump-mode", required_argument, 0, 1097}, + { "file-validation", required_argument, 0, 1098 }, + BOOL_OPT("with-cpu-affinity", &opts.with_cpu_affinity), ++ BOOL_OPT("pin-memory", &opts.pin_memory), + { }, + }; + +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index da2e53d..ff41976 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -3866,6 +3866,11 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns + task_args->clone_restore_fn, + task_args->thread_args); + ++ if (opts.pin_memory) ++ task_args->pin_memory = true; ++ else ++ task_args->pin_memory = false; ++ + /* + * An indirect call to task_restore, note it never returns + * and restoring core is extremely destructive. 
+diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index fda54a4..a4dc5b8 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -176,6 +176,7 @@ struct cr_options { + int file_validation_method; + /* restore cpu affinity */ + int with_cpu_affinity; ++ int pin_memory; + }; + + extern struct cr_options opts; +diff --git a/criu/include/restorer.h b/criu/include/restorer.h +index bd6ef6a..fc37e6d 100644 +--- a/criu/include/restorer.h ++++ b/criu/include/restorer.h +@@ -225,6 +225,7 @@ struct task_restore_args { + int lsm_type; + int child_subreaper; + bool has_clone3_set_tid; ++ bool pin_memory; + } __aligned(64); + + /* +@@ -317,4 +318,27 @@ enum { + #define __r_sym(name) restorer_sym ## name + #define restorer_sym(rblob, name) (void*)(rblob + __r_sym(name)) + ++#define PIN_MEM_FILE "/dev/pinmem" ++#define PIN_MEM_MAGIC 0x59 ++#define _SET_PIN_MEM_AREA 1 ++#define _CLEAR_PIN_MEM_AREA 2 ++#define _REMAP_PIN_MEM_AREA 3 ++#define _PIN_MEM_IOC_MAX_NR 4 ++#define SET_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _SET_PIN_MEM_AREA, struct pin_mem_area_set) ++#define CLEAR_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _CLEAR_PIN_MEM_AREA, int) ++#define REMAP_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _REMAP_PIN_MEM_AREA, int) ++ ++#define ONCE_PIN_MEM_SIZE_LIMIT 32 * 1024 * 1024 ++#define MAX_PIN_MEM_AREA_NUM 16 ++struct pin_mem_area { ++ unsigned long virt_start; ++ unsigned long virt_end; ++}; ++ ++struct pin_mem_area_set { ++ unsigned int pid; ++ unsigned int area_num; ++ struct pin_mem_area mem_area[MAX_PIN_MEM_AREA_NUM]; ++}; ++ + #endif /* __CR_RESTORER_H__ */ +diff --git a/criu/mem.c b/criu/mem.c +index 167838b..709de4e 100644 +--- a/criu/mem.c ++++ b/criu/mem.c +@@ -438,6 +438,88 @@ again: + return ret; + } + ++bool should_pin_vmae(VmaEntry *vmae) ++{ ++ /* ++ * vDSO area must be always dumped because on restore ++ * we might need to generate a proxy. 
++ */ ++ if (vma_entry_is(vmae, VMA_AREA_VDSO)) ++ return false; ++ /* ++ * In turn VVAR area is special and referenced from ++ * vDSO area by IP addressing (at least on x86) thus ++ * never ever dump its content but always use one provided ++ * by the kernel on restore, ie runtime VVAR area must ++ * be remapped into proper place.. ++ */ ++ if (vma_entry_is(vmae, VMA_AREA_VVAR)) ++ return false; ++ ++ if (vma_entry_is(vmae, VMA_AREA_AIORING)) ++ return false; ++ if (vma_entry_is(vmae, VMA_ANON_PRIVATE)) { ++ pr_debug("find private anon vma: %lx-%lx\n", vmae->start, vmae->end); ++ return true; ++ } ++ ++ return false; ++} ++ ++static int pin_one_pmas(int fd, unsigned long start, ++ unsigned long *pend, struct pstree_item *item) ++{ ++ int ret; ++ unsigned int index = 0; ++ unsigned long end; ++ unsigned long next = start; ++ struct pin_mem_area_set pmas; ++ struct pin_mem_area *pma; ++ ++ end = *pend; ++ while (start < end) { ++ next = (start + ONCE_PIN_MEM_SIZE_LIMIT > end) ? end : (start + ONCE_PIN_MEM_SIZE_LIMIT); ++ pma = &(pmas.mem_area[index]); ++ pma->virt_start = start; ++ pma->virt_end = next; ++ pr_info("start pin %lx-%lx\n", start, next); ++ index++; ++ start += ONCE_PIN_MEM_SIZE_LIMIT; ++ if (index >= MAX_PIN_MEM_AREA_NUM) ++ break; ++ } ++ *pend = next; ++ pmas.area_num = index; ++ pmas.pid = vpid(item); ++ pr_info("begin pin memory for pid:%d\n", pmas.pid); ++ ret = ioctl(fd, SET_PIN_MEM_AREA, &pmas); ++ if (ret < 0) ++ pr_err("pin mem fail, errno: %s\n", strerror(errno)); ++ return ret; ++} ++static int pin_vmae(VmaEntry *vmae, struct pstree_item *item) ++{ ++ int fd; ++ int ret = 0; ++ unsigned long start, end; ++ ++ fd = open(PIN_MEM_FILE, O_RDWR); ++ if (fd < 0) { ++ pr_err("open file: %s fail.\n", PIN_MEM_FILE); ++ return -1; ++ } ++ start = vmae->start; ++ while (start < vmae->end) { ++ end = vmae->end; ++ ret = pin_one_pmas(fd, start, &end, item); ++ if (ret < 0) ++ break; ++ start = end; ++ } ++ close(fd); ++ return ret; ++} ++ + static int 
__parasite_dump_pages_seized(struct pstree_item *item, + struct parasite_dump_pages_args *args, + struct vm_area_list *vma_area_list, +@@ -513,7 +595,16 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, + if (possible_pid_reuse == -1) + goto out_xfer; + } +- ++ if (opts.pin_memory) { ++ /* pin memory before dump pages */ ++ list_for_each_entry(vma_area, &vma_area_list->h, list) { ++ if (should_pin_vmae(vma_area->e)) { ++ ret = pin_vmae(vma_area->e, item); ++ if (ret) ++ goto out_xfer; ++ } ++ } ++ } + + /* + * Step 1 -- generate the pagemap +@@ -524,6 +615,9 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, + parent_predump_mode = mdc->parent_ie->pre_dump_mode; + + list_for_each_entry(vma_area, &vma_area_list->h, list) { ++ if (opts.pin_memory && should_pin_vmae(vma_area->e)) ++ continue; ++ + ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl, + &pmc, has_parent, mdc->pre_dump, + parent_predump_mode); +diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c +index c63f96b..f3bd541 100644 +--- a/criu/pie/restorer.c ++++ b/criu/pie/restorer.c +@@ -1414,6 +1414,24 @@ int cleanup_current_inotify_events(struct task_restore_args *task_args) + return 0; + } + ++int remap_vmas(int pid) ++{ ++ int fd, ret = 0; ++ ++ fd = sys_open(PIN_MEM_FILE, O_RDWR, 0); ++ if (fd == -1) { ++ pr_err("open file: %s fail.\n", PIN_MEM_FILE); ++ return -1;; ++ } ++ ++ ret = sys_ioctl(fd, REMAP_PIN_MEM_AREA, (unsigned long) &pid); ++ if (ret < 0) ++ pr_err("remap pin mem fail for pid: %d\n", pid); ++ sys_close(fd); ++ return ret; ++} ++ ++ + /* + * The main routine to restore task via sigreturn. 
+ * This one is very special, we never return there +@@ -1585,7 +1603,8 @@ long __export_restore_task(struct task_restore_args *args) + goto core_restore_end; + } + } +- ++ if (args->pin_memory) ++ remap_vmas(my_pid); + /* + * Now read the contents (if any) + */ +-- +1.8.3.1 + diff --git a/criu.spec b/criu.spec index a35472c..7469b21 100644 --- a/criu.spec +++ b/criu.spec @@ -18,6 +18,9 @@ Obsoletes: %{name}-libs < %{version}-%{release} Patch0001: 0001-Fix-crit-encode-TypeError.patch Patch0002: 0002-Fix-crit-info-struct-unpack-error.patch Patch0003: 0003-Fix-crit-x-UnicodeDecodeError.patch +Patch0004: 0004-criu-dump-and-restore-cpu-affinity-of-each-thread.patch +Patch0005: 0005-vdso-fix-segmentation-fault-caused-by-char-pointer-a.patch +Patch0006: 0006-criu-add-pin-memory-method.patch %description Checkpoint/Restore in Userspace(CRIU),is a software tool for the linux operating system. -- Gitee