From d04ec7f3543f0fb05aa2058845a50ab77cbbbd71 Mon Sep 17 00:00:00 2001 From: "fu.lin" Date: Wed, 1 Dec 2021 16:44:49 +0800 Subject: [PATCH] criu: backport upstream and customizing patches More details refer changelog Signed-off-by: fu.lin --- 0001-Fix-crit-encode-TypeError.patch | 28 - ...01-Fix-crit-info-struct-unpack-error.patch | 6 +- ...> 0002-Fix-crit-x-UnicodeDecodeError.patch | 6 +- ...f-system-support-clone3-with-set_tid.patch | 26 +- ...004-Add-assembler-wrapper-for-clone3.patch | 16 +- ...ne3-with-set_tid-to-create-processes.patch | 12 +- ...one3-handle-clone3-with-CLONE_PARENT.patch | 6 +- ... 0007-aarch64-use-clone3-if-possible.patch | 8 +- ...-restore-cpu-affinity-of-each-thread.patch | 40 +- ...ation-fault-caused-by-char-pointer-a.patch | 17 +- ...-offsets-to-remap-vdso-and-vvar-mapp.patch | 7 +- ...riu-fix-build-failure-against-gcc-10.patch | 85 ++ ...leading-underscores-from-protobuf-st.patch | 169 +++ ...roto-adds-additional-fields-to-RegFi.patch | 74 ++ ...-add-build-id-validation-functionali.patch | 478 ++++++++ ...ks-even-when-the-network-is-unlocked.patch | 171 +++ ...e-Warn-if-restorer-can-t-be-unmapped.patch | 40 + ...infect-Warn-if-close-failed-on-memfd.patch | 88 ++ ...-if-compel-succeed-in-executing-munm.patch | 59 + ...ump-Try-to-cure-remote-on-err-pathes.patch | 56 + ...Warn-if-unmapping-local-memfd-failed.patch | 41 + ...-Log-if-can-t-cure-on-failed-infecti.patch | 30 + 0022-compel-criu-Add-__must_check.patch | 326 ++++++ ...build-add-secure-compilation-options.patch | 112 ++ ...--tty-fix-NULL-pointer-access-in-tty.patch | 29 + ...func-address-print-to-make-someone-h.patch | 31 + ...4--mm-add-pin-memory-method-for-criu.patch | 203 +++- ...-pid-add-pid-recover-method-for-criu.patch | 195 ++++ ...ifier-calling-method-for-checkpoint-.patch | 650 +++++++++++ ...ovide-cred-checkpoint-restore-method.patch | 254 +++++ ...ce-dump-block-device-as-reguler-file.patch | 60 + ...-inode-add-support-for-anon-inode-fd.patch | 354 ++++++ 
...port-for-char-device-dump-and-restor.patch | 773 +++++++++++++ ...t-fix-connect-error-of-invalid-param.patch | 95 ++ ...fd-fix-for-improper-usage-in-appdata.patch | 97 ++ ...-add-task-exit-notify-mask-method-fo.patch | 143 +++ ...inux-fix-selinux-context-lable-check.patch | 52 + ...t-add-support-for-unix-stream-socket.patch | 269 +++++ ...e-and-restore-sigev_notify_thread_id.patch | 98 ++ ...dump-restore-sysv-shm-in-host-ipc-ns.patch | 116 ++ backport-0018--add-netlink-repair-modes.patch | 45 + backport-0019--ignore-special-page-dump.patch | 84 ++ ...rt-0020--add-O_REPAIR-flag-to-vma-fd.patch | 45 + ...k-add-repair-mode-to-dump-file-locks.patch | 307 +++++ ...2--unlock-network-when-restore-fails.patch | 60 + ...hared-socket-recover-method-for-criu.patch | 330 ++++++ ...024--clean-repair-res-when-dump-fail.patch | 130 +++ ...o-ip_local_reserved_ports-when-dump-.patch | 248 ++++ ...-dump-fail-problem-with-null-seek-op.patch | 35 + ...oblem-with-no-access-to-get-socket-f.patch | 31 + ...ma-offset-value-for-the-sysfs-file-o.patch | 131 +++ ...029--looser-file-mode-and-size-check.patch | 68 ++ ...ethod-for-recover-deleted-file-state.patch | 205 ++++ ...31--fix-share-sockets-repair-problem.patch | 132 +++ backport-0032--nftables-add-mnl-api.patch | 271 +++++ ...--nftables-implement-nft-api-for-tcp.patch | 1011 +++++++++++++++++ ...es-implement-nft-api-for-lock-net-ns.patch | 146 +++ ...rt-0035--criu-switch-to-nftables-api.patch | 391 +++++++ ...gaction-handler-register-in-restorer.patch | 51 + ...7--remove-ignore_special_dump-option.patch | 72 ++ ...ear-pin-mem-and-init-page-map-option.patch | 97 ++ ...map-restore-dev-hisi_sec2-deivce-vma.patch | 492 ++++++++ ...ds-list-restore-and-rollback-problem.patch | 55 + ...041--log-print-error-log-to-dev-kmsg.patch | 83 ++ ...-char-dev-fd-check-and-repair-method.patch | 68 ++ ...43--unix-sk-improve-dgram-robustness.patch | 159 +++ ...gnore-the-bind-error-for-icmp-socket.patch | 44 + 
...iband-fix-the-infiniband-fd-conflict.patch | 286 +++++ ...ptimization-parallel-collecting-vmas.patch | 520 +++++++++ ...re-children-exit-to-accelerate-speed.patch | 38 + ...48--parallel-parallel-nft-delete-set.patch | 177 +++ ...-0049--ptrace-trace-specific-syscall.patch | 707 ++++++++++++ criu.spec | 99 +- 73 files changed, 11774 insertions(+), 164 deletions(-) delete mode 100644 0001-Fix-crit-encode-TypeError.patch rename 0002-Fix-crit-info-struct-unpack-error.patch => 0001-Fix-crit-info-struct-unpack-error.patch (81%) rename 0003-Fix-crit-x-UnicodeDecodeError.patch => 0002-Fix-crit-x-UnicodeDecodeError.patch (79%) rename 0004-kerndat-detect-if-system-support-clone3-with-set_tid.patch => 0003-kerndat-detect-if-system-support-clone3-with-set_tid.patch (91%) rename 0005-Add-assembler-wrapper-for-clone3.patch => 0004-Add-assembler-wrapper-for-clone3.patch (95%) rename 0006-Use-clone3-with-set_tid-to-create-processes.patch => 0005-Use-clone3-with-set_tid-to-create-processes.patch (96%) rename 0007-clone3-handle-clone3-with-CLONE_PARENT.patch => 0006-clone3-handle-clone3-with-CLONE_PARENT.patch (90%) rename 0008-aarch64-use-clone3-if-possible.patch => 0007-aarch64-use-clone3-if-possible.patch (96%) rename 0009-criu-dump-and-restore-cpu-affinity-of-each-thread.patch => 0008-criu-dump-and-restore-cpu-affinity-of-each-thread.patch (92%) rename 0010-vdso-fix-segmentation-fault-caused-by-char-pointer-a.patch => 0009-vdso-fix-segmentation-fault-caused-by-char-pointer-a.patch (93%) rename 0011-vdso-use-correct-offsets-to-remap-vdso-and-vvar-mapp.patch => 0010-vdso-use-correct-offsets-to-remap-vdso-and-vvar-mapp.patch (86%) create mode 100644 0011-criu-fix-build-failure-against-gcc-10.patch create mode 100644 0012-protobuf-remove-leading-underscores-from-protobuf-st.patch create mode 100644 0013-images-regfile.proto-adds-additional-fields-to-RegFi.patch create mode 100644 0014-criu-files-reg.c-add-build-id-validation-functionali.patch create mode 100644 
0015-criu-Kill-tasks-even-when-the-network-is-unlocked.patch create mode 100644 0016-cr-restore-Warn-if-restorer-can-t-be-unmapped.patch create mode 100644 0017-compel-infect-Warn-if-close-failed-on-memfd.patch create mode 100644 0018-lib-infect-Check-if-compel-succeed-in-executing-munm.patch create mode 100644 0019-cr-dump-Try-to-cure-remote-on-err-pathes.patch create mode 100644 0020-cr-dump-Warn-if-unmapping-local-memfd-failed.patch create mode 100644 0021-parasite-syscall-Log-if-can-t-cure-on-failed-infecti.patch create mode 100644 0022-compel-criu-Add-__must_check.patch create mode 100644 backport-0001--build-add-secure-compilation-options.patch create mode 100644 backport-0002--tty-fix-NULL-pointer-access-in-tty.patch create mode 100644 backport-0003--namespaces-drop-func-address-print-to-make-someone-h.patch rename 0012-add-pin-memory-method-for-criu.patch => backport-0004--mm-add-pin-memory-method-for-criu.patch (53%) create mode 100644 backport-0005--pid-add-pid-recover-method-for-criu.patch create mode 100644 backport-0006--notifier-add-notifier-calling-method-for-checkpoint-.patch create mode 100644 backport-0007--cred-provide-cred-checkpoint-restore-method.patch create mode 100644 backport-0008--block-device-dump-block-device-as-reguler-file.patch create mode 100644 backport-0009--anon-inode-add-support-for-anon-inode-fd.patch create mode 100644 backport-0010--char_dev-add-support-for-char-device-dump-and-restor.patch create mode 100644 backport-0011--socket-fix-connect-error-of-invalid-param.patch create mode 100644 backport-0012--criu-eventpollfd-fix-for-improper-usage-in-appdata.patch create mode 100644 backport-0013--task_exit_notify-add-task-exit-notify-mask-method-fo.patch create mode 100644 backport-0014--selinux-fix-selinux-context-lable-check.patch create mode 100644 backport-0015--unix-socket-add-support-for-unix-stream-socket.patch create mode 100644 backport-0016--save-and-restore-sigev_notify_thread_id.patch create mode 100644 
backport-0017--sysvshm-add-dump-restore-sysv-shm-in-host-ipc-ns.patch create mode 100644 backport-0018--add-netlink-repair-modes.patch create mode 100644 backport-0019--ignore-special-page-dump.patch create mode 100644 backport-0020--add-O_REPAIR-flag-to-vma-fd.patch create mode 100644 backport-0021--file-lock-add-repair-mode-to-dump-file-locks.patch create mode 100644 backport-0022--unlock-network-when-restore-fails.patch create mode 100644 backport-0023--net-add-shared-socket-recover-method-for-criu.patch create mode 100644 backport-0024--clean-repair-res-when-dump-fail.patch create mode 100644 backport-0025--save-src-ports-to-ip_local_reserved_ports-when-dump-.patch create mode 100644 backport-0026--fix-dump-fail-problem-with-null-seek-op.patch create mode 100644 backport-0027--fix-dump-fail-problem-with-no-access-to-get-socket-f.patch create mode 100644 backport-0028--proc-parse-fix-vma-offset-value-for-the-sysfs-file-o.patch create mode 100644 backport-0029--looser-file-mode-and-size-check.patch create mode 100644 backport-0030--add-reuse-file-method-for-recover-deleted-file-state.patch create mode 100644 backport-0031--fix-share-sockets-repair-problem.patch create mode 100644 backport-0032--nftables-add-mnl-api.patch create mode 100644 backport-0033--nftables-implement-nft-api-for-tcp.patch create mode 100644 backport-0034--nftables-implement-nft-api-for-lock-net-ns.patch create mode 100644 backport-0035--criu-switch-to-nftables-api.patch create mode 100644 backport-0036--remove-sigaction-handler-register-in-restorer.patch create mode 100644 backport-0037--remove-ignore_special_dump-option.patch create mode 100644 backport-0038--add-clear-pin-mem-and-init-page-map-option.patch create mode 100644 backport-0039--mmap-restore-dev-hisi_sec2-deivce-vma.patch create mode 100644 backport-0040--fix-fds-list-restore-and-rollback-problem.patch create mode 100644 backport-0041--log-print-error-log-to-dev-kmsg.patch create mode 100644 
backport-0042--improve-char-dev-fd-check-and-repair-method.patch create mode 100644 backport-0043--unix-sk-improve-dgram-robustness.patch create mode 100644 backport-0044--sk-ignore-the-bind-error-for-icmp-socket.patch create mode 100644 backport-0045--infiniband-fix-the-infiniband-fd-conflict.patch create mode 100644 backport-0046--optimization-parallel-collecting-vmas.patch create mode 100644 backport-0047--dump-ignore-children-exit-to-accelerate-speed.patch create mode 100644 backport-0048--parallel-parallel-nft-delete-set.patch create mode 100644 backport-0049--ptrace-trace-specific-syscall.patch diff --git a/0001-Fix-crit-encode-TypeError.patch b/0001-Fix-crit-encode-TypeError.patch deleted file mode 100644 index 145c559..0000000 --- a/0001-Fix-crit-encode-TypeError.patch +++ /dev/null @@ -1,28 +0,0 @@ -From d1c7216c4265c45bcb8b9380b8ad4e5ed69d014e Mon Sep 17 00:00:00 2001 -From: lingsheng -Date: Tue, 22 Sep 2020 14:36:55 +0800 -Subject: [PATCH 1/3] Fix crit encode TypeError - ---- - lib/py/cli.py | 5 ++++- - 1 file changed, 4 insertions(+), 1 deletion(-) - -diff --git a/lib/py/cli.py b/lib/py/cli.py -index da34302..966dd4e 100755 ---- a/lib/py/cli.py -+++ b/lib/py/cli.py -@@ -16,7 +16,10 @@ def inf(opts): - - def outf(opts): - if opts['out']: -- return open(opts['out'], 'w+') -+ if getattr(opts['func'], '__name__') == 'encode': -+ return open(opts['out'], 'wb+') -+ else: -+ return open(opts['out'], 'w+') - else: - return sys.stdout - --- -2.23.0 - diff --git a/0002-Fix-crit-info-struct-unpack-error.patch b/0001-Fix-crit-info-struct-unpack-error.patch similarity index 81% rename from 0002-Fix-crit-info-struct-unpack-error.patch rename to 0001-Fix-crit-info-struct-unpack-error.patch index 4bfe29d..f131744 100644 --- a/0002-Fix-crit-info-struct-unpack-error.patch +++ b/0001-Fix-crit-info-struct-unpack-error.patch @@ -1,7 +1,7 @@ -From a1d4d678de01b0569e8d36894a8d60a8b75bb016 Mon Sep 17 00:00:00 2001 +From 745ec690af90a00680b68761fd5fb1aacb939c9a Mon Sep 17 
00:00:00 2001 From: lingsheng Date: Tue, 22 Sep 2020 14:39:22 +0800 -Subject: [PATCH 2/3] Fix crit info struct unpack error +Subject: [PATCH 01/22] Fix crit info struct unpack error --- lib/py/images/images.py | 2 +- @@ -21,5 +21,5 @@ index f4517d8..72205fe 100644 size, = struct.unpack('i', buf) f.seek(size, 1) -- -2.23.0 +2.34.0 diff --git a/0003-Fix-crit-x-UnicodeDecodeError.patch b/0002-Fix-crit-x-UnicodeDecodeError.patch similarity index 79% rename from 0003-Fix-crit-x-UnicodeDecodeError.patch rename to 0002-Fix-crit-x-UnicodeDecodeError.patch index c59f2b7..00d2577 100644 --- a/0003-Fix-crit-x-UnicodeDecodeError.patch +++ b/0002-Fix-crit-x-UnicodeDecodeError.patch @@ -1,7 +1,7 @@ -From b2eea766a1f41553b76fef8d669e288ff552d0ed Mon Sep 17 00:00:00 2001 +From ee6b6ab7d8dab33ff51d0c10ff1733ac75a15de9 Mon Sep 17 00:00:00 2001 From: lingsheng Date: Tue, 22 Sep 2020 14:40:35 +0800 -Subject: [PATCH 3/3] Fix crit x UnicodeDecodeError +Subject: [PATCH 02/22] Fix crit x UnicodeDecodeError --- lib/py/cli.py | 2 +- @@ -21,5 +21,5 @@ index 966dd4e..f7bda23 100755 def decode(opts): -- -2.23.0 +2.34.0 diff --git a/0004-kerndat-detect-if-system-support-clone3-with-set_tid.patch b/0003-kerndat-detect-if-system-support-clone3-with-set_tid.patch similarity index 91% rename from 0004-kerndat-detect-if-system-support-clone3-with-set_tid.patch rename to 0003-kerndat-detect-if-system-support-clone3-with-set_tid.patch index 285c29d..3e0ff2c 100644 --- a/0004-kerndat-detect-if-system-support-clone3-with-set_tid.patch +++ b/0003-kerndat-detect-if-system-support-clone3-with-set_tid.patch @@ -1,7 +1,7 @@ -From 4f5b57b143d2f92682a0ab14c00df3b2f6f87c05 Mon Sep 17 00:00:00 2001 +From 97ec800fe349827e85c907c320a5d5d074d0758c Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Sun, 15 Dec 2019 20:38:46 +0000 -Subject: [PATCH] kerndat: detect if system support clone3() with set_tid +Subject: [PATCH 03/22] kerndat: detect if system support clone3() with set_tid Linux kernel 5.4 extends clone3() 
with set_tid to allow processes to specify the PID of a newly created process. This introduces detection @@ -12,16 +12,16 @@ This first implementation is X86_64 only. Signed-off-by: Adrian Reber Signed-off-by: Sang Yan --- - compel/arch/arm/plugins/std/syscalls/syscall.def | 1 + - .../ppc64/plugins/std/syscalls/syscall-ppc64.tbl | 1 + - .../s390/plugins/std/syscalls/syscall-s390.tbl | 1 + - .../arch/x86/plugins/std/syscalls/syscall_32.tbl | 1 + - .../arch/x86/plugins/std/syscalls/syscall_64.tbl | 1 + - compel/plugins/include/uapi/std/syscall-types.h | 1 + - criu/cr-check.c | 12 +++++++ - criu/include/kerndat.h | 1 + - criu/include/sched.h | 33 +++++++++++++++++ - criu/kerndat.c | 41 ++++++++++++++++++++++ + .../arch/arm/plugins/std/syscalls/syscall.def | 1 + + .../plugins/std/syscalls/syscall-ppc64.tbl | 1 + + .../plugins/std/syscalls/syscall-s390.tbl | 1 + + .../x86/plugins/std/syscalls/syscall_32.tbl | 1 + + .../x86/plugins/std/syscalls/syscall_64.tbl | 1 + + .../plugins/include/uapi/std/syscall-types.h | 1 + + criu/cr-check.c | 12 ++++++ + criu/include/kerndat.h | 1 + + criu/include/sched.h | 33 +++++++++++++++ + criu/kerndat.c | 41 +++++++++++++++++++ 10 files changed, 93 insertions(+) create mode 100644 criu/include/sched.h @@ -237,5 +237,5 @@ index 39cacb8..a13adbc 100644 kerndat_lsm(); kerndat_mmap_min_addr(); -- -2.9.5 +2.34.0 diff --git a/0005-Add-assembler-wrapper-for-clone3.patch b/0004-Add-assembler-wrapper-for-clone3.patch similarity index 95% rename from 0005-Add-assembler-wrapper-for-clone3.patch rename to 0004-Add-assembler-wrapper-for-clone3.patch index c688505..c279994 100644 --- a/0005-Add-assembler-wrapper-for-clone3.patch +++ b/0004-Add-assembler-wrapper-for-clone3.patch @@ -1,7 +1,7 @@ -From bd283ef8b9ed6c5efaf1d6bba96c105b0410ab65 Mon Sep 17 00:00:00 2001 +From ef81827a3822e9ac651b3af17ef82281cea31e77 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Mon, 16 Dec 2019 07:57:03 +0000 -Subject: [PATCH] Add assembler wrapper for clone3() 
+Subject: [PATCH 04/22] Add assembler wrapper for clone3() To create a new process/thread with a certain PID based on clone3() a new assembler wrapper is necessary as there is not glibc wrapper (yet). @@ -9,11 +9,11 @@ new assembler wrapper is necessary as there is not glibc wrapper (yet). Signed-off-by: Adrian Reber Signed-off-by: Sang Yan --- - criu/arch/aarch64/include/asm/restorer.h | 7 +++ - criu/arch/arm/include/asm/restorer.h | 7 +++ - criu/arch/ppc64/include/asm/restorer.h | 7 +++ - criu/arch/s390/include/asm/restorer.h | 7 +++ - criu/arch/x86/include/asm/restorer.h | 92 ++++++++++++++++++++++++++++++++ + criu/arch/aarch64/include/asm/restorer.h | 7 ++ + criu/arch/arm/include/asm/restorer.h | 7 ++ + criu/arch/ppc64/include/asm/restorer.h | 7 ++ + criu/arch/s390/include/asm/restorer.h | 7 ++ + criu/arch/x86/include/asm/restorer.h | 92 ++++++++++++++++++++++++ 5 files changed, 120 insertions(+) diff --git a/criu/arch/aarch64/include/asm/restorer.h b/criu/arch/aarch64/include/asm/restorer.h @@ -199,5 +199,5 @@ index 25559b5..731477e 100644 asm volatile( \ "movq %0, %%rsp \n" \ -- -2.9.5 +2.34.0 diff --git a/0006-Use-clone3-with-set_tid-to-create-processes.patch b/0005-Use-clone3-with-set_tid-to-create-processes.patch similarity index 96% rename from 0006-Use-clone3-with-set_tid-to-create-processes.patch rename to 0005-Use-clone3-with-set_tid-to-create-processes.patch index c37caac..2974a42 100644 --- a/0006-Use-clone3-with-set_tid-to-create-processes.patch +++ b/0005-Use-clone3-with-set_tid-to-create-processes.patch @@ -1,7 +1,7 @@ -From eb742711bb08d11f670204492a0d0fc165f89d0b Mon Sep 17 00:00:00 2001 +From 52689259f49e73ac71e1433d29734f4a52509e66 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Mon, 16 Dec 2019 10:42:13 +0000 -Subject: [PATCH] Use clone3() with set_tid to create processes +Subject: [PATCH 05/22] Use clone3() with set_tid to create processes With the in Linux Kernel 5.4 introduced clone3() with set_tid it is no longer necessary to write to 
to /proc/../ns_last_pid to influence the @@ -20,12 +20,12 @@ should just keep on working as before. Signed-off-by: Adrian Reber Signed-off-by: Sang Yan --- - criu/clone-noasan.c | 32 +++++++++++++++++++++++ - criu/cr-restore.c | 64 ++++++++++++++++++++++++++++----------------- + criu/clone-noasan.c | 32 +++++++++++++++++++ + criu/cr-restore.c | 64 +++++++++++++++++++++++-------------- criu/include/clone-noasan.h | 2 ++ criu/include/restorer.h | 1 + criu/include/rst_info.h | 1 + - criu/pie/restorer.c | 64 +++++++++++++++++++++++++++++---------------- + criu/pie/restorer.c | 64 ++++++++++++++++++++++++------------- 6 files changed, 117 insertions(+), 47 deletions(-) diff --git a/criu/clone-noasan.c b/criu/clone-noasan.c @@ -303,5 +303,5 @@ index 390c0e1..8bdc88a 100644 pr_err("Unable to create a thread: %ld\n", ret); mutex_unlock(&task_entries_local->last_pid_mutex); -- -2.9.5 +2.34.0 diff --git a/0007-clone3-handle-clone3-with-CLONE_PARENT.patch b/0006-clone3-handle-clone3-with-CLONE_PARENT.patch similarity index 90% rename from 0007-clone3-handle-clone3-with-CLONE_PARENT.patch rename to 0006-clone3-handle-clone3-with-CLONE_PARENT.patch index a6d61a0..644dce7 100644 --- a/0007-clone3-handle-clone3-with-CLONE_PARENT.patch +++ b/0006-clone3-handle-clone3-with-CLONE_PARENT.patch @@ -1,7 +1,7 @@ -From 4b547f723a3fdc60c2b68ed0141b150b94d54c8c Mon Sep 17 00:00:00 2001 +From e1f1923d8d562d764627f5cad854748305299024 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Sat, 25 Jan 2020 13:25:21 +0100 -Subject: [PATCH] clone3: handle clone3() with CLONE_PARENT +Subject: [PATCH 06/22] clone3: handle clone3() with CLONE_PARENT clone3() explicitly blocks setting an exit_signal if CLONE_PARENT is specified. 
With clone() it also did not work, but there was no error @@ -39,5 +39,5 @@ index 2784d12..7485a52 100644 c_args.set_tid = ptr_to_u64(&pid); c_args.set_tid_size = 1; -- -2.9.5 +2.34.0 diff --git a/0008-aarch64-use-clone3-if-possible.patch b/0007-aarch64-use-clone3-if-possible.patch similarity index 96% rename from 0008-aarch64-use-clone3-if-possible.patch rename to 0007-aarch64-use-clone3-if-possible.patch index 112d62e..3555d48 100644 --- a/0008-aarch64-use-clone3-if-possible.patch +++ b/0007-aarch64-use-clone3-if-possible.patch @@ -1,7 +1,7 @@ -From b7563d356de8f0765d8832d7b5f3911869ad5a0d Mon Sep 17 00:00:00 2001 +From 213106e7348bd26374f68bf88ca2cf5b7e696888 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Sun, 19 Jan 2020 21:42:58 +0100 -Subject: [PATCH] aarch64: use clone3() if possible +Subject: [PATCH 07/22] aarch64: use clone3() if possible This adds the parasite clone3() with set_tid wrapper for aarch64. @@ -10,7 +10,7 @@ Tested on Fedora 31 with 5.5.0-rc6. Signed-off-by: Adrian Reber Signed-off-by: Sang Yan --- - criu/arch/aarch64/include/asm/restorer.h | 67 +++++++++++++++++++++++++++++--- + criu/arch/aarch64/include/asm/restorer.h | 67 +++++++++++++++++++++--- criu/kerndat.c | 4 +- 2 files changed, 63 insertions(+), 8 deletions(-) @@ -110,5 +110,5 @@ index a13adbc..52aac55 100644 kdat.has_clone3_set_tid = false; return 0; -- -2.9.5 +2.34.0 diff --git a/0009-criu-dump-and-restore-cpu-affinity-of-each-thread.patch b/0008-criu-dump-and-restore-cpu-affinity-of-each-thread.patch similarity index 92% rename from 0009-criu-dump-and-restore-cpu-affinity-of-each-thread.patch rename to 0008-criu-dump-and-restore-cpu-affinity-of-each-thread.patch index 962a334..d2ac774 100644 --- a/0009-criu-dump-and-restore-cpu-affinity-of-each-thread.patch +++ b/0008-criu-dump-and-restore-cpu-affinity-of-each-thread.patch @@ -1,7 +1,7 @@ -From baa12b00eeb88bee4de11e28df623662a2b32078 Mon Sep 17 00:00:00 2001 +From d8c2acdc8aad65b9d42ded8e6ed7ad58af190482 Mon Sep 17 00:00:00 
2001 From: Sang Yan Date: Thu, 26 Nov 2020 21:18:54 +0800 -Subject: [PATCH] criu: dump and restore cpu affinity of each thread +Subject: [PATCH 08/22] criu: dump and restore cpu affinity of each thread Criu should dump and restore threads' or processes' cpu affinity. @@ -16,23 +16,23 @@ at restore. Signed-off-by: Sang Yan --- - compel/arch/arm/plugins/std/syscalls/syscall.def | 1 + - .../ppc64/plugins/std/syscalls/syscall-ppc64.tbl | 1 + - .../s390/plugins/std/syscalls/syscall-s390.tbl | 1 + - .../arch/x86/plugins/std/syscalls/syscall_32.tbl | 1 + - .../arch/x86/plugins/std/syscalls/syscall_64.tbl | 1 + - criu/config.c | 1 + - criu/cr-dump.c | 14 ++++++++ - criu/cr-restore.c | 26 ++++++++++++++ - criu/crtools.c | 2 ++ - criu/include/cr_options.h | 1 + - criu/include/restorer.h | 3 ++ - criu/pie/restorer.c | 38 ++++++++++++++++++++ - criu/pstree.c | 7 ++++ - images/core.proto | 5 +++ - test/zdtm/static/Makefile | 1 + - test/zdtm/static/cpu-affinity0.c | 42 ++++++++++++++++++++++ - test/zdtm/static/cpu-affinity0.desc | 1 + + .../arch/arm/plugins/std/syscalls/syscall.def | 1 + + .../plugins/std/syscalls/syscall-ppc64.tbl | 1 + + .../plugins/std/syscalls/syscall-s390.tbl | 1 + + .../x86/plugins/std/syscalls/syscall_32.tbl | 1 + + .../x86/plugins/std/syscalls/syscall_64.tbl | 1 + + criu/config.c | 1 + + criu/cr-dump.c | 14 +++++++ + criu/cr-restore.c | 26 ++++++++++++ + criu/crtools.c | 2 + + criu/include/cr_options.h | 1 + + criu/include/restorer.h | 3 ++ + criu/pie/restorer.c | 38 +++++++++++++++++ + criu/pstree.c | 7 ++++ + images/core.proto | 5 +++ + test/zdtm/static/Makefile | 1 + + test/zdtm/static/cpu-affinity0.c | 42 +++++++++++++++++++ + test/zdtm/static/cpu-affinity0.desc | 1 + 17 files changed, 146 insertions(+) create mode 100644 test/zdtm/static/cpu-affinity0.c create mode 100644 test/zdtm/static/cpu-affinity0.desc @@ -413,5 +413,5 @@ index 0000000..0d0b8ae @@ -0,0 +1 @@ +{'dopts': '', 'ropts': '--with-cpu-affinity', 'flags': 'reqrst '} -- -2.9.5 +2.34.0 
diff --git a/0010-vdso-fix-segmentation-fault-caused-by-char-pointer-a.patch b/0009-vdso-fix-segmentation-fault-caused-by-char-pointer-a.patch similarity index 93% rename from 0010-vdso-fix-segmentation-fault-caused-by-char-pointer-a.patch rename to 0009-vdso-fix-segmentation-fault-caused-by-char-pointer-a.patch index a3ee15c..20872ab 100644 --- a/0010-vdso-fix-segmentation-fault-caused-by-char-pointer-a.patch +++ b/0009-vdso-fix-segmentation-fault-caused-by-char-pointer-a.patch @@ -1,7 +1,8 @@ -From bcd44583d237684226442aa92cf2ffc41e4ec7e0 Mon Sep 17 00:00:00 2001 +From 2066a54e374eed87ea78c77ed054340b7bc55bbb Mon Sep 17 00:00:00 2001 From: anatasluo Date: Fri, 29 Jan 2021 13:48:57 +0000 -Subject: [PATCH] vdso: fix segmentation fault caused by char pointer array +Subject: [PATCH 09/22] vdso: fix segmentation fault caused by char pointer + array When I compile criu with "make DEBUG=1" and run it to restore my program, it produces a segmentation fault. @@ -12,11 +13,11 @@ variables into the stack. 
Signed-off-by: anatasluo --- - criu/arch/aarch64/include/asm/vdso.h | 17 +++++++++-------- - criu/arch/arm/include/asm/vdso.h | 9 ++++++--- - criu/arch/ppc64/include/asm/vdso.h | 34 +++++++++++++++++++++++----------- - criu/arch/s390/include/asm/vdso.h | 17 +++++++++++------ - criu/arch/x86/include/asm/vdso.h | 23 ++++++++++++++++------- + criu/arch/aarch64/include/asm/vdso.h | 17 +++++++------- + criu/arch/arm/include/asm/vdso.h | 9 +++++--- + criu/arch/ppc64/include/asm/vdso.h | 34 +++++++++++++++++++--------- + criu/arch/s390/include/asm/vdso.h | 17 +++++++++----- + criu/arch/x86/include/asm/vdso.h | 23 +++++++++++++------ criu/pie/util-vdso.c | 2 ++ 6 files changed, 67 insertions(+), 35 deletions(-) @@ -189,5 +190,5 @@ index 104da06..a383f4a 100644 ARCH_VDSO_SYMBOLS }; -- -2.9.5 +2.34.0 diff --git a/0011-vdso-use-correct-offsets-to-remap-vdso-and-vvar-mapp.patch b/0010-vdso-use-correct-offsets-to-remap-vdso-and-vvar-mapp.patch similarity index 86% rename from 0011-vdso-use-correct-offsets-to-remap-vdso-and-vvar-mapp.patch rename to 0010-vdso-use-correct-offsets-to-remap-vdso-and-vvar-mapp.patch index 6e80740..977eecf 100644 --- a/0011-vdso-use-correct-offsets-to-remap-vdso-and-vvar-mapp.patch +++ b/0010-vdso-use-correct-offsets-to-remap-vdso-and-vvar-mapp.patch @@ -1,7 +1,8 @@ -From 3482094d4d62fcca1e90f1762b7862bd9ae95fea Mon Sep 17 00:00:00 2001 +From d744516b42c6c7aa31927ca2b77aa57f673741a9 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 4 Feb 2020 23:13:43 -0800 -Subject: [PATCH] vdso: use correct offsets to remap vdso and vvar mappings +Subject: [PATCH 10/22] vdso: use correct offsets to remap vdso and vvar + mappings In the current version, the offsets of remapping vvar and vdso regions are mixed up. 
@@ -31,5 +32,5 @@ index 38da766..3a1684d 100644 #ifndef CONFIG_COMPAT -- -2.9.5 +2.34.0 diff --git a/0011-criu-fix-build-failure-against-gcc-10.patch b/0011-criu-fix-build-failure-against-gcc-10.patch new file mode 100644 index 0000000..7a530b9 --- /dev/null +++ b/0011-criu-fix-build-failure-against-gcc-10.patch @@ -0,0 +1,85 @@ +From 3769e38a12ad254131c64eee66a299a88dda74a9 Mon Sep 17 00:00:00 2001 +From: Sergei Trofimovich +Date: Wed, 1 Dec 2021 10:15:47 +0800 +Subject: [PATCH 11/22] criu: fix build failure against gcc-10 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +On gcc-10 (and gcc-9 -fno-common) build fails as: + +``` +ld: criu/arch/x86/crtools.o:criu/include/cr_options.h:159: + multiple definition of `rpc_cfg_file'; criu/arch/x86/cpu.o:criu/include/cr_options.h:159: first defined here +make[2]: *** [scripts/nmk/scripts/build.mk:164: criu/arch/x86/crtools.built-in.o] Error 1 +``` + +gcc-10 will change the default from -fcommon to fno-common: +https://gcc.gnu.org/PR85678. + +The error also happens if CFLAGS=-fno-common passed explicitly. 
+ +Reported-by: Toralf Förster +Bug: https://bugs.gentoo.org/707942 +Signed-off-by: Sergei Trofimovich +--- + criu/config.c | 1 + + criu/include/cr_options.h | 2 +- + criu/include/pstree.h | 2 +- + criu/include/tun.h | 2 +- + 4 files changed, 4 insertions(+), 3 deletions(-) + +diff --git a/criu/config.c b/criu/config.c +index 76d6e5f..8cb6bb2 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -30,6 +30,7 @@ + #include "common/xmalloc.h" + + struct cr_options opts; ++char *rpc_cfg_file; + + static int count_elements(char **to_count) + { +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 98c5a44..52211a7 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -148,7 +148,7 @@ struct cr_options { + }; + + extern struct cr_options opts; +-char *rpc_cfg_file; ++extern char *rpc_cfg_file; + + extern int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, int state); + extern int check_options(); +diff --git a/criu/include/pstree.h b/criu/include/pstree.h +index 7303c1f..61ab0ce 100644 +--- a/criu/include/pstree.h ++++ b/criu/include/pstree.h +@@ -42,7 +42,7 @@ enum { + }; + #define FDS_EVENT (1 << FDS_EVENT_BIT) + +-struct pstree_item *current; ++extern struct pstree_item *current; + + struct rst_info; + /* See alloc_pstree_item() for details */ +diff --git a/criu/include/tun.h b/criu/include/tun.h +index ce0b266..b82c445 100644 +--- a/criu/include/tun.h ++++ b/criu/include/tun.h +@@ -5,7 +5,7 @@ + #define TUN_MINOR 200 + #endif + +-struct ns_id *ns; ++extern struct ns_id *ns; + + #include + +-- +2.34.0 + diff --git a/0012-protobuf-remove-leading-underscores-from-protobuf-st.patch b/0012-protobuf-remove-leading-underscores-from-protobuf-st.patch new file mode 100644 index 0000000..599c640 --- /dev/null +++ b/0012-protobuf-remove-leading-underscores-from-protobuf-st.patch @@ -0,0 +1,169 @@ +From 940ea5aa5af5fce3e898c9f3d63f01b29d1a772f Mon Sep 17 00:00:00 2001 +From: Zeyad Yasser +Date: Thu, 22 Jul 
2021 11:39:34 +0200 +Subject: [PATCH 12/22] protobuf: remove leading underscores from protobuf + structs + +Fixes: #1560 + +The latest protobuf-c compiler breaks CRIU because they removed +leading underscores from structs in 1.4.0. + +This replaces those definitions with the standard generated structs. + +v2: remove struct _VmaEntry, struct _CredsEntry and struct _CoreEntry + +Signed-off-by: Zeyad Yasser +--- + criu/include/irmap.h | 8 +++++--- + criu/include/parasite-syscall.h | 8 +++----- + criu/include/pstree.h | 2 +- + criu/include/rst_info.h | 5 ++--- + criu/include/shmem.h | 1 - + criu/mem.c | 2 +- + lib/c/criu.h | 3 ++- + 7 files changed, 14 insertions(+), 15 deletions(-) + +diff --git a/criu/include/irmap.h b/criu/include/irmap.h +index 033f71e..188d753 100644 +--- a/criu/include/irmap.h ++++ b/criu/include/irmap.h +@@ -1,13 +1,15 @@ + #ifndef __CR_IRMAP__H__ + #define __CR_IRMAP__H__ ++ ++#include "images/fh.pb-c.h" ++ + char *irmap_lookup(unsigned int s_dev, unsigned long i_ino); +-struct _FhEntry; + int irmap_queue_cache(unsigned int dev, unsigned long ino, +- struct _FhEntry *fh); ++ FhEntry *fh); + int irmap_predump_prep(void); + int irmap_predump_run(void); + int check_open_handle(unsigned int s_dev, unsigned long i_ino, +- struct _FhEntry *f_handle); ++ FhEntry *f_handle); + int irmap_load_cache(void); + int irmap_scan_path_add(char *path); + #endif +diff --git a/criu/include/parasite-syscall.h b/criu/include/parasite-syscall.h +index c86a724..afba95a 100644 +--- a/criu/include/parasite-syscall.h ++++ b/criu/include/parasite-syscall.h +@@ -11,8 +11,6 @@ struct parasite_dump_misc; + struct parasite_drain_fd; + struct vm_area_list; + struct pstree_item; +-struct _CredsEntry; +-struct _CoreEntry; + struct list_head; + struct cr_imgset; + struct fd_opts; +@@ -31,11 +29,11 @@ extern int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc + struct parasite_ctl *ctl, struct pstree_item *); + + extern int parasite_dump_misc_seized(struct 
parasite_ctl *ctl, struct parasite_dump_misc *misc); +-extern int parasite_dump_creds(struct parasite_ctl *ctl, struct _CredsEntry *ce); +-extern int parasite_dump_thread_leader_seized(struct parasite_ctl *ctl, int pid, struct _CoreEntry *core); ++extern int parasite_dump_creds(struct parasite_ctl *ctl, CredsEntry *ce); ++extern int parasite_dump_thread_leader_seized(struct parasite_ctl *ctl, int pid, CoreEntry *core); + extern int parasite_dump_thread_seized(struct parasite_thread_ctl *tctl, + struct parasite_ctl *ctl, int id, +- struct pid *tid, struct _CoreEntry *core); ++ struct pid *tid, CoreEntry *core); + extern int dump_thread_core(int pid, CoreEntry *core, + const struct parasite_dump_thread *dt); + +diff --git a/criu/include/pstree.h b/criu/include/pstree.h +index 61ab0ce..17d22e7 100644 +--- a/criu/include/pstree.h ++++ b/criu/include/pstree.h +@@ -113,7 +113,7 @@ extern int prepare_task_entries(void); + extern int prepare_dummy_task_state(struct pstree_item *pi); + + extern int get_task_ids(struct pstree_item *); +-extern struct _TaskKobjIdsEntry *root_ids; ++extern TaskKobjIdsEntry *root_ids; + + extern void core_entry_free(CoreEntry *core); + extern CoreEntry *core_entry_alloc(int alloc_thread_info, int alloc_tc); +diff --git a/criu/include/rst_info.h b/criu/include/rst_info.h +index 3283849..3dc119a 100644 +--- a/criu/include/rst_info.h ++++ b/criu/include/rst_info.h +@@ -5,6 +5,7 @@ + #include "common/list.h" + #include "vma.h" + #include "kerndat.h" ++#include "images/mm.pb-c.h" + + struct task_entries { + int nr_threads, nr_tasks, nr_helpers; +@@ -25,8 +26,6 @@ struct fdt { + futex_t fdt_lock; + }; + +-struct _MmEntry; +- + struct rst_info { + struct list_head fds; + +@@ -40,7 +39,7 @@ struct rst_info { + struct fdt *fdt; + + struct vm_area_list vmas; +- struct _MmEntry *mm; ++ MmEntry *mm; + struct list_head vma_io; + unsigned int pages_img_id; + +diff --git a/criu/include/shmem.h b/criu/include/shmem.h +index 04ab8d0..3fa6512 100644 +--- 
a/criu/include/shmem.h ++++ b/criu/include/shmem.h +@@ -5,7 +5,6 @@ + #include "common/lock.h" + #include "images/vma.pb-c.h" + +-struct _VmaEntry; + struct vma_area; + + extern int collect_shmem(int pid, struct vma_area *vma); +diff --git a/criu/mem.c b/criu/mem.c +index de66a62..e096853 100644 +--- a/criu/mem.c ++++ b/criu/mem.c +@@ -1126,7 +1126,7 @@ err_addr: + + static int maybe_disable_thp(struct pstree_item *t, struct page_read *pr) + { +- struct _MmEntry *mm = rsti(t)->mm; ++ MmEntry *mm = rsti(t)->mm; + + /* + * There is no need to disable it if the page read doesn't +diff --git a/lib/c/criu.h b/lib/c/criu.h +index 76f3547..688fe56 100644 +--- a/lib/c/criu.h ++++ b/lib/c/criu.h +@@ -22,6 +22,7 @@ + #include + + #include "version.h" ++#include "rpc.pb-c.h" + + #ifdef __GNUG__ + extern "C" { +@@ -104,7 +105,7 @@ int criu_set_page_server_address_port(const char *address, int port); + * some non-existing one is reported. + */ + +-typedef struct _CriuNotify *criu_notify_arg_t; ++typedef CriuNotify *criu_notify_arg_t; + void criu_set_notify_cb(int (*cb)(char *action, criu_notify_arg_t na)); + + /* Get pid of root task. 0 if not available */ +-- +2.34.0 + diff --git a/0013-images-regfile.proto-adds-additional-fields-to-RegFi.patch b/0013-images-regfile.proto-adds-additional-fields-to-RegFi.patch new file mode 100644 index 0000000..dff1903 --- /dev/null +++ b/0013-images-regfile.proto-adds-additional-fields-to-RegFi.patch @@ -0,0 +1,74 @@ +From c89d4856864bb58aeefce2755745d76490551d0d Mon Sep 17 00:00:00 2001 +From: Ajay Bharadwaj +Date: Fri, 29 May 2020 11:56:22 +0530 +Subject: [PATCH 13/22] images/regfile.proto: adds additional fields to + RegFileEntry + +This adds build-id, checksum, checksum-config and checksum-parameter fields +to RegFileEntry to store metadata used for file verification. 
+ +build_id: Holds the build-id if it could be obtained + +checksum: Holds the checksum if it could be obtained + +checksum_config: Holds the configuration of bytes for which checksum has +been calculated (The entire file, first N bytes or every Nth byte) + +checksum_parameter: Specifies the value of 'N', if required, for the +configuration of bytes + +Signed-off-by: Ajay Bharadwaj +--- + images/regfile.proto | 38 ++++++++++++++++++++++++++++---------- + 1 file changed, 28 insertions(+), 10 deletions(-) + +diff --git a/images/regfile.proto b/images/regfile.proto +index bc4c14d..49884dd 100644 +--- a/images/regfile.proto ++++ b/images/regfile.proto +@@ -4,13 +4,31 @@ import "opts.proto"; + import "fown.proto"; + + message reg_file_entry { +- required uint32 id = 1; +- required uint32 flags = 2 [(criu).flags = "rfile.flags"]; +- required uint64 pos = 3; +- required fown_entry fown = 5; +- required string name = 6; +- optional sint32 mnt_id = 7 [default = -1]; +- optional uint64 size = 8; +- optional bool ext = 9; +- optional uint32 mode = 10; +-} ++ required uint32 id = 1; ++ required uint32 flags = 2 [(criu).flags = "rfile.flags"]; ++ required uint64 pos = 3; ++ required fown_entry fown = 5; ++ required string name = 6; ++ optional sint32 mnt_id = 7 [default = -1]; ++ optional uint64 size = 8; ++ optional bool ext = 9; ++ optional uint32 mode = 10; ++ ++ /* This field stores the build-ID of the file if it could be obtained. */ ++ repeated uint32 build_id = 11; ++ ++ /* This field stores the CRC32C checksum of the file if it could be obtained. */ ++ optional uint32 checksum = 12; ++ ++ /* ++ * This field stores the configuration of bytes which were used in the ++ * calculation of the checksum, if it could be obtained. ++ */ ++ optional uint32 checksum_config = 13; ++ ++ /* ++ * This field stores the checksum parameter if it was used in the calculation ++ * of the checksum, if it could be obtained. 
++ */ ++ optional uint32 checksum_parameter = 14; ++} +\ No newline at end of file +-- +2.34.0 + diff --git a/0014-criu-files-reg.c-add-build-id-validation-functionali.patch b/0014-criu-files-reg.c-add-build-id-validation-functionali.patch new file mode 100644 index 0000000..47d6a07 --- /dev/null +++ b/0014-criu-files-reg.c-add-build-id-validation-functionali.patch @@ -0,0 +1,478 @@ +From 547f4bd4aeb633f8b5d22190b6d0ce0b165269d9 Mon Sep 17 00:00:00 2001 +From: Ajay Bharadwaj +Date: Fri, 26 Jun 2020 14:43:40 +0530 +Subject: [PATCH 14/22] criu/files-reg.c: add build-id validation functionality + +efi.h: Required for accessing the build-id of .efi files + +This adds functions to find, store and compare with the stored build-id. +get_build_id() calls 32-bit or 64-bit helper functions depending on +the bitness of the ELF file after first ensuring that it is actually +an ELF file by checking for the magic number. + +The number of iterations while searching the ELF file for the +build-id before giving up (500 while searching the note section) +is limited. + +Signed-off-by: Ajay Bharadwaj +--- + criu/files-reg.c | 395 +++++++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 385 insertions(+), 10 deletions(-) + +diff --git a/criu/files-reg.c b/criu/files-reg.c +index 2f68bc0..28c3360 100644 +--- a/criu/files-reg.c ++++ b/criu/files-reg.c +@@ -12,6 +12,7 @@ + #include + #include + #include ++#include + + #ifndef SEEK_DATA + #define SEEK_DATA 3 +@@ -22,6 +23,13 @@ + #define SILLYNAME_PREF ".nfs" + #define SILLYNAME_SUFF_LEN (((unsigned)sizeof(u64) << 1) + ((unsigned)sizeof(unsigned int) << 1)) + ++/* ++ * If the build-id exists, then it will most likely be present in the ++ * beginning of the file. Therefore only the first 1MB will be mapped ++ * and checked. 
++ */ ++#define BUILD_ID_MAP_SIZE 1048576 ++ + #include "cr_options.h" + #include "imgset.h" + #include "file-ids.h" +@@ -1264,12 +1272,309 @@ static bool should_check_size(int flags) + return true; + } + ++/* ++ * Gets the build-id (If it exists) from 32-bit ELF files. ++ * Returns the number of bytes of the build-id if it could ++ * be obtained, else -1. ++ */ ++static int get_build_id_32(Elf32_Ehdr *file_header, unsigned char **build_id, ++ const int fd, size_t mapped_size) ++{ ++ int size, num_iterations; ++ size_t file_header_end; ++ Elf32_Phdr *program_header, *program_header_end; ++ Elf32_Nhdr *note_header, *note_header_end; ++ ++ file_header_end = (size_t) file_header + mapped_size; ++ if (sizeof(Elf32_Ehdr) > mapped_size) ++ return -1; ++ ++ /* ++ * If the file doesn't have atleast 1 program header entry, it definitely can't ++ * have a build-id. ++ */ ++ if (!file_header->e_phnum) { ++ pr_warn("Couldn't find any program headers for file with fd %d\n", fd); ++ return -1; ++ } ++ ++ program_header = (Elf32_Phdr *) (file_header->e_phoff + (char *) file_header); ++ if (program_header <= (Elf32_Phdr *) file_header) ++ return -1; ++ ++ program_header_end = (Elf32_Phdr *) (file_header_end - sizeof(Elf32_Phdr)); ++ num_iterations = file_header->e_phnum + 1; ++ ++ /* ++ * If the file has a build-id, it will be in the PT_NOTE program header ++ * entry AKA the note sections. 
++ */ ++ while (num_iterations-- && program_header <= program_header_end && ++ program_header->p_type != PT_NOTE) ++ program_header++; ++ ++ if (!num_iterations || program_header >= program_header_end) { ++ pr_warn("Couldn't find the note program header for file with fd %d\n", fd); ++ return -1; ++ } ++ ++ note_header = (Elf32_Nhdr *) (program_header->p_offset + (char *) file_header); ++ if (note_header <= (Elf32_Nhdr *) file_header) ++ return -1; ++ ++ note_header_end = (Elf32_Nhdr *) min_t(char*, ++ (char *) note_header + program_header->p_filesz, ++ (char *) (file_header_end - sizeof(Elf32_Nhdr))); ++ ++ /* The note type for the build-id is NT_GNU_BUILD_ID. */ ++ while (note_header <= note_header_end && note_header->n_type != NT_GNU_BUILD_ID) ++ note_header = (Elf32_Nhdr *) ((char *) note_header + sizeof(Elf32_Nhdr) + ++ ALIGN(note_header->n_namesz, 4) + ++ ALIGN(note_header->n_descsz, 4)); ++ ++ if (note_header >= note_header_end) { ++ pr_warn("Couldn't find the build-id note for file with fd %d\n", fd); ++ return -1; ++ } ++ ++ /* ++ * If the size of the notes description is too large or is invalid ++ * then the build-id could not be obtained. ++ */ ++ if (note_header->n_descsz <= 0 || note_header->n_descsz > 512) { ++ pr_warn("Invalid description size for build-id note for file with fd %d\n", fd); ++ return -1; ++ } ++ ++ size = note_header->n_descsz; ++ note_header = (Elf32_Nhdr *) ((char *) note_header + sizeof(Elf32_Nhdr) + ++ ALIGN(note_header->n_namesz, 4)); ++ note_header_end = (Elf32_Nhdr *) (file_header_end - size); ++ if (note_header <= (Elf32_Nhdr *) file_header || note_header > note_header_end) ++ return -1; ++ ++ *build_id = (unsigned char *) xmalloc(size); ++ if (!*build_id) ++ return -1; ++ ++ memcpy(*build_id, (void *) note_header, size); ++ return size; ++} ++ ++/* ++ * Gets the build-id (If it exists) from 64-bit ELF files. ++ * Returns the number of bytes of the build-id if it could ++ * be obtained, else -1. 
++ */ ++static int get_build_id_64(Elf64_Ehdr *file_header, unsigned char **build_id, ++ const int fd, size_t mapped_size) ++{ ++ int size, num_iterations; ++ size_t file_header_end; ++ Elf64_Phdr *program_header, *program_header_end; ++ Elf64_Nhdr *note_header, *note_header_end; ++ ++ file_header_end = (size_t) file_header + mapped_size; ++ if (sizeof(Elf64_Ehdr) > mapped_size) ++ return -1; ++ ++ /* ++ * If the file doesn't have atleast 1 program header entry, it definitely can't ++ * have a build-id. ++ */ ++ if (!file_header->e_phnum) { ++ pr_warn("Couldn't find any program headers for file with fd %d\n", fd); ++ return -1; ++ } ++ ++ program_header = (Elf64_Phdr *) (file_header->e_phoff + (char *) file_header); ++ if (program_header <= (Elf64_Phdr *) file_header) ++ return -1; ++ ++ program_header_end = (Elf64_Phdr *) (file_header_end - sizeof(Elf64_Phdr)); ++ num_iterations = file_header->e_phnum + 1; ++ ++ /* ++ * If the file has a build-id, it will be in the PT_NOTE program header ++ * entry AKA the note sections. ++ */ ++ while (num_iterations-- && program_header <= program_header_end && ++ program_header->p_type != PT_NOTE) ++ program_header++; ++ ++ if (!num_iterations || program_header >= program_header_end) { ++ pr_warn("Couldn't find the note program header for file with fd %d\n", fd); ++ return -1; ++ } ++ ++ note_header = (Elf64_Nhdr *) (program_header->p_offset + (char *) file_header); ++ if (note_header <= (Elf64_Nhdr *) file_header) ++ return -1; ++ ++ note_header_end = (Elf64_Nhdr *) min_t(char*, ++ (char *) note_header + program_header->p_filesz, ++ (char *) (file_header_end - sizeof(Elf64_Nhdr))); ++ ++ /* The note type for the build-id is NT_GNU_BUILD_ID. 
*/ ++ while (note_header <= note_header_end && note_header->n_type != NT_GNU_BUILD_ID) ++ note_header = (Elf64_Nhdr *) ((char *) note_header + sizeof(Elf64_Nhdr) + ++ ALIGN(note_header->n_namesz, 4) + ++ ALIGN(note_header->n_descsz, 4)); ++ ++ if (note_header >= note_header_end) { ++ pr_warn("Couldn't find the build-id note for file with fd %d\n", fd); ++ return -1; ++ } ++ ++ /* ++ * If the size of the notes description is too large or is invalid ++ * then the build-id could not be obtained. ++ */ ++ if (note_header->n_descsz <= 0 || note_header->n_descsz > 512) { ++ pr_warn("Invalid description size for build-id note for file with fd %d\n", fd); ++ return -1; ++ } ++ ++ size = note_header->n_descsz; ++ note_header = (Elf64_Nhdr *) ((char *) note_header + sizeof(Elf64_Nhdr) + ++ ALIGN(note_header->n_namesz, 4)); ++ note_header_end = (Elf64_Nhdr *) (file_header_end - size); ++ if (note_header <= (Elf64_Nhdr *) file_header || note_header > note_header_end) ++ return -1; ++ ++ *build_id = (unsigned char *) xmalloc(size); ++ if (!*build_id) ++ return -1; ++ ++ memcpy(*build_id, (void *) note_header, size); ++ return size; ++} ++ ++/* ++ * Finds the build-id of the file by checking if the file is an ELF file ++ * and then calling either the 32-bit or the 64-bit function as necessary. ++ * Returns the number of bytes of the build-id if it could be ++ * obtained, else -1. ++ */ ++static int get_build_id(const int fd, const struct stat *fd_status, ++ unsigned char **build_id) ++{ ++ char buf[SELFMAG+1]; ++ void *start_addr; ++ size_t mapped_size; ++ int ret = -1; ++ ++ if (read(fd, buf, SELFMAG+1) != SELFMAG+1) ++ return -1; ++ ++ /* ++ * The first 4 bytes contain a magic number identifying the file as an ++ * ELF file. They should contain the characters ‘\x7f’, ‘E’, ‘L’, and ++ * ‘F’, respectively. These characters are together defined as ELFMAG. 
++ */ ++ if (strncmp(buf, ELFMAG, SELFMAG)) ++ return -1; ++ ++ /* ++ * If the build-id exists, then it will most likely be present in the ++ * beginning of the file. Therefore at most only the first 1 MB of the ++ * file is mapped. ++ */ ++ mapped_size = min_t(size_t, fd_status->st_size, BUILD_ID_MAP_SIZE); ++ start_addr = mmap(0, mapped_size, PROT_READ, MAP_PRIVATE | MAP_FILE, fd, 0); ++ if (start_addr == MAP_FAILED) { ++ pr_warn("Couldn't mmap file with fd %d", fd); ++ return -1; ++ } ++ ++ if (buf[EI_CLASS] == ELFCLASS32) ++ ret = get_build_id_32(start_addr, build_id, fd, mapped_size); ++ if (buf[EI_CLASS] == ELFCLASS64) ++ ret = get_build_id_64(start_addr, build_id, fd, mapped_size); ++ ++ munmap(start_addr, mapped_size); ++ return ret; ++} ++ ++/* ++ * Finds and stores the build-id of a file, if it exists, so that it can be validated ++ * while restoring. ++ * Returns 1 if the build-id of the file could be stored, -1 if there was an error ++ * or 0 if the build-id could not be obtained. ++ */ ++static int store_validation_data_build_id(RegFileEntry *rfe, int lfd, ++ const struct fd_parms *p) ++{ ++ unsigned char *build_id = NULL; ++ int build_id_size; ++ int fd; ++ ++ /* ++ * Checks whether the file is atleast big enough to try and read the first ++ * four (SELFMAG) bytes which should correspond to the ELF magic number ++ * and the next byte which indicates whether the file is 32-bit or 64-bit. 
++ */ ++ if (p->stat.st_size < SELFMAG+1) ++ return 0; ++ ++ fd = open_proc(PROC_SELF, "fd/%d", lfd); ++ if (fd < 0) { ++ pr_err("Build-ID (For validation) could not be obtained for file %s because can't open the file\n", ++ rfe->name); ++ return -1; ++ } ++ ++ build_id_size = get_build_id(fd, &(p->stat), &build_id); ++ close(fd); ++ if (!build_id || build_id_size == -1) ++ return 0; ++ ++ rfe->build_id = xmalloc(round_up(build_id_size, sizeof(uint32_t))); ++ if (!rfe->build_id) { ++ pr_warn("Build-ID (For validation) could not be set for file %s\n", ++ rfe->name); ++ return -1; ++ } ++ ++ rfe->n_build_id = build_id_size; ++ memcpy(rfe->build_id, (void *) build_id, rfe->n_build_id); ++ ++ xfree(build_id); ++ return 1; ++} ++ ++/* ++ * This routine stores metadata about the open file (File size, build-id, CRC32C checksum) ++ * so that validation can be done while restoring to make sure that the right file is ++ * being restored. ++ * Returns true if atleast some metadata was stored, if there was an error it returns false. 
++ */ ++static bool store_validation_data(RegFileEntry *rfe, ++ const struct fd_parms *p, int lfd) ++{ ++ int result = 1; ++ ++ rfe->has_size = true; ++ rfe->size = p->stat.st_size; ++ ++ result = store_validation_data_build_id(rfe, lfd, p); ++ ++ if (result == -1) ++ return false; ++ ++ if (!result) ++ pr_info("Only file size could be stored for validation for file %s\n", ++ rfe->name); ++ return true; ++} ++ + int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) + { + struct fd_link _link, *link; + struct mount_info *mi; + struct cr_img *rimg; + char ext_id[64]; ++ int ret; + FileEntry fe = FILE_ENTRY__INIT; + RegFileEntry rfe = REG_FILE_ENTRY__INIT; + +@@ -1330,17 +1635,21 @@ ext: + rfe.has_mode = true; + rfe.mode = p->stat.st_mode; + +- if (S_ISREG(p->stat.st_mode) && should_check_size(rfe.flags)) { +- rfe.has_size = true; +- rfe.size = p->stat.st_size; +- } ++ if (S_ISREG(p->stat.st_mode) && should_check_size(rfe.flags) && ++ !store_validation_data(&rfe, p, lfd)) ++ return -1; + + fe.type = FD_TYPES__REG; + fe.id = rfe.id; + fe.reg = &rfe; + + rimg = img_from_set(glob_imgset, CR_FD_FILES); +- return pb_write_one(rimg, &fe, PB_FILE); ++ ret = pb_write_one(rimg, &fe, PB_FILE); ++ ++ if (rfe.build_id) ++ xfree(rfe.build_id); ++ ++ return ret; + } + + const struct fdtype_ops regfile_dump_ops = { +@@ -1610,6 +1919,76 @@ out_root: + return 0; + } + ++/* ++ * Compares the file's build-id with the stored value. ++ * Returns 1 if the build-id of the file matches the build-id that was stored ++ * while dumping, -1 if there is a mismatch or 0 if the build-id has not been ++ * stored or could not be obtained. 
++ */ ++static int validate_with_build_id(const int fd, const struct stat *fd_status, ++ const struct reg_file_info *rfi) ++{ ++ unsigned char *build_id; ++ int build_id_size; ++ ++ if (!rfi->rfe->has_size) ++ return 1; ++ ++ if (!rfi->rfe->n_build_id) ++ return 0; ++ ++ build_id = NULL; ++ build_id_size = get_build_id(fd, fd_status, &build_id); ++ if (!build_id || build_id_size == -1) ++ return 0; ++ ++ if (build_id_size != rfi->rfe->n_build_id) { ++ pr_err("File %s has bad build-ID length %d (expect %d)\n", rfi->path, ++ build_id_size, (int) rfi->rfe->n_build_id); ++ xfree(build_id); ++ return -1; ++ } ++ ++ if (memcmp(build_id, rfi->rfe->build_id, build_id_size)) { ++ pr_err("File %s has bad build-ID\n", rfi->path); ++ xfree(build_id); ++ return -1; ++ } ++ ++ xfree(build_id); ++ return 1; ++} ++ ++/* ++ * This function determines whether it was the same file that was open during dump ++ * by checking the file's size, build-id and/or checksum with the same metadata ++ * that was stored before dumping. ++ * Checksum is calculated with CRC32C. ++ * Returns true if the metadata of the file matches the metadata stored while ++ * dumping else returns false. 
++ */ ++static bool validate_file(const int fd, const struct stat *fd_status, ++ const struct reg_file_info *rfi) ++{ ++ int result = 1; ++ ++ if (rfi->rfe->has_size && (fd_status->st_size != rfi->rfe->size)) { ++ pr_err("File %s has bad size %"PRIu64" (expect %"PRIu64")\n", ++ rfi->path, fd_status->st_size, rfi->rfe->size); ++ return false; ++ } ++ ++ result = validate_with_build_id(fd, fd_status, rfi); ++ ++ if (result == -1) ++ return false; ++ ++ if (!result) ++ pr_info("File %s could only be validated with file size\n", ++ rfi->path); ++ return true; ++} ++ + int open_path(struct file_desc *d, + int(*open_cb)(int mntns_root, struct reg_file_info *, void *), void *arg) + { +@@ -1704,12 +2083,8 @@ ext: + return -1; + } + +- if (rfi->rfe->has_size && (st.st_size != rfi->rfe->size)) { +- pr_err("File %s has bad size %"PRIu64" (expect %"PRIu64")\n", +- rfi->path, st.st_size, +- rfi->rfe->size); ++ if (!validate_file(tmp, &st, rfi)) + return -1; +- } + + if (rfi->rfe->has_mode && (st.st_mode != rfi->rfe->mode)) { + pr_err("File %s has bad mode 0%o (expect 0%o)\n", +-- +2.34.0 + diff --git a/0015-criu-Kill-tasks-even-when-the-network-is-unlocked.patch b/0015-criu-Kill-tasks-even-when-the-network-is-unlocked.patch new file mode 100644 index 0000000..8f624c1 --- /dev/null +++ b/0015-criu-Kill-tasks-even-when-the-network-is-unlocked.patch @@ -0,0 +1,171 @@ +From 0cfe638468cc3eb2834f0709d3a107bce0e2e679 Mon Sep 17 00:00:00 2001 +From: Dmitry Safonov +Date: Sat, 9 Nov 2019 22:20:44 +0000 +Subject: [PATCH 15/22] criu: Kill tasks even when the network is unlocked + +Currently if anything fails after network has been unlocked tasks aren't +killed. Which doesn't work anyway: any stage sets `ret` and nothing +later gets called. Which means the tasks aren't resumed properly. +Furthermore, functions like catch_tasks() and compel_stop_on_syscall() +return failure on the first error. + +Let's do the cleanup even when the network is unlocked. 
+If we want to keep the mess and ignore failures - a cli option should be +introduced for that (and existing code should be reworked with decisions +what is critical and what can be ignored). + +Move "Restore finished successfully" message accordingly where +everything is evidently good. + +While at it, any late failure will result not only in cleanup but in +criu returning error code. + +As a result, tests fail in such a case: +> ======================= Run zdtm/static/inotify04 in ns ======================== +> Start test +> ./inotify04 --pidfile=inotify04.pid --outfile=inotify04.out --dirname=inotify04.test +> Run criu dump +> =[log]=> dump/zdtm/static/inotify04/84/1/dump.log +> ------------------------ grep Error ------------------------ +> (00.119763) fsnotify: openable (inode match) as zdtm/static/inotify04.test/inotify-testfile +> (00.119766) fsnotify: Dumping /zdtm/static/inotify04.test/inotify-testfile as path for handle +> (00.119769) fsnotify: id 0x00000b flags 0x000800 +> (00.119787) 88 fdinfo 5: pos: 0 flags: 4000/0 +> (00.119796) Warn (criu/fsnotify.c:336): fsnotify: The 0x00000c inotify events will be dropped +> ------------------------ ERROR OVER ------------------------ +> Run criu restore +> =[log]=> dump/zdtm/static/inotify04/84/1/restore.log +> ------------------------ grep Error ------------------------ +> (00.391582) 123 was stopped +> (00.391667) 106 was trapped +> (00.391674) 106 (native) is going to execute the syscall 11, required is 11 +> (00.391697) 106 was stopped +> (00.391720) Error (compel/src/lib/infect.c:1439): Task 123 is in unexpected state: b7f +> (00.391736) Error (compel/src/lib/infect.c:1447): Task stopped with 11: Segmentation fault +> ------------------------ ERROR OVER ------------------------ +> 5: Old maps lost: set([]) +> 5: New maps appeared: set([u'10000-1a000 rwxp', u'1a000-24000 rw-p']) +> ############### Test zdtm/static/inotify04 FAIL at maps compare ################ +> Send the 9 signal to 106 +> Wait 
for zdtm/static/inotify04(106) to die for 0.100000 +> ======================= Test zdtm/static/inotify04 PASS ======================== + +Signed-off-by: Dmitry Safonov +Signed-off-by: Andrei Vagin +--- + criu/cr-restore.c | 50 ++++++++++++++++++++++++++++++----------------- + 1 file changed, 32 insertions(+), 18 deletions(-) + +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index c2be323..2ec1058 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -1990,7 +1990,7 @@ static void finalize_restore(void) + } + } + +-static void finalize_restore_detach(int status) ++static int finalize_restore_detach(void) + { + struct pstree_item *item; + +@@ -2004,16 +2004,21 @@ static void finalize_restore_detach(int status) + for (i = 0; i < item->nr_threads; i++) { + pid = item->threads[i].real; + if (pid < 0) { +- BUG_ON(status >= 0); +- break; ++ pr_err("pstree item has unvalid pid %d\n", pid); ++ continue; + } + +- if (arch_set_thread_regs_nosigrt(&item->threads[i])) ++ if (arch_set_thread_regs_nosigrt(&item->threads[i])) { + pr_perror("Restoring regs for %d failed", pid); +- if (ptrace(PTRACE_DETACH, pid, NULL, 0)) +- pr_perror("Unable to execute %d", pid); ++ return -1; ++ } ++ if (ptrace(PTRACE_DETACH, pid, NULL, 0)) { ++ pr_perror("Unable to detach %d", pid); ++ return -1; ++ } + } + } ++ return 0; + } + + static void ignore_kids(void) +@@ -2271,32 +2276,37 @@ skip_ns_bouncing: + + /* + * ------------------------------------------------------------- +- * Below this line nothing should fail, because network is unlocked ++ * Network is unlocked. If something fails below - we lose data ++ * or a connection. 
+ */ + attach_to_tasks(root_seized); + +- ret = restore_switch_stage(CR_STATE_RESTORE_CREDS); +- BUG_ON(ret); ++ if (restore_switch_stage(CR_STATE_RESTORE_CREDS)) ++ goto out_kill_network_unlocked; + + timing_stop(TIME_RESTORE); + +- ret = catch_tasks(root_seized, &flag); ++ if (catch_tasks(root_seized, &flag)) { ++ pr_err("Can't catch all tasks\n"); ++ goto out_kill_network_unlocked; ++ } + + if (lazy_pages_finish_restore()) +- goto out_kill; ++ goto out_kill_network_unlocked; + +- pr_info("Restore finished successfully. Resuming tasks.\n"); + __restore_switch_stage(CR_STATE_COMPLETE); + +- if (ret == 0) +- ret = compel_stop_on_syscall(task_entries->nr_threads, +- __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag); ++ ret = compel_stop_on_syscall(task_entries->nr_threads, ++ __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag); ++ if (ret) { ++ pr_err("Can't stop all tasks on rt_sigreturn\n"); ++ goto out_kill_network_unlocked; ++ } + + if (clear_breakpoints()) + pr_err("Unable to flush breakpoints\n"); + +- if (ret == 0) +- finalize_restore(); ++ finalize_restore(); + + ret = run_scripts(ACT_PRE_RESUME); + if (ret) +@@ -2308,8 +2318,10 @@ skip_ns_bouncing: + fini_cgroup(); + + /* Detaches from processes and they continue run through sigreturn. */ +- finalize_restore_detach(ret); ++ if (finalize_restore_detach()) ++ goto out_kill_network_unlocked; + ++ pr_info("Restore finished successfully. 
Tasks resumed.\n"); + write_stats(RESTORE_STATS); + + ret = run_scripts(ACT_POST_RESUME); +@@ -2321,6 +2333,8 @@ skip_ns_bouncing: + + return 0; + ++out_kill_network_unlocked: ++ pr_err("Killing processes because of failure on restore.\nThe Network was unlocked so some data or a connection may have been lost.\n"); + out_kill: + /* + * The processes can be killed only when all of them have been created, +-- +2.34.0 + diff --git a/0016-cr-restore-Warn-if-restorer-can-t-be-unmapped.patch b/0016-cr-restore-Warn-if-restorer-can-t-be-unmapped.patch new file mode 100644 index 0000000..d54213e --- /dev/null +++ b/0016-cr-restore-Warn-if-restorer-can-t-be-unmapped.patch @@ -0,0 +1,40 @@ +From 683940473ed773845ab11c01b0dd868e95092b14 Mon Sep 17 00:00:00 2001 +From: Dmitry Safonov +Date: Sat, 9 Nov 2019 22:20:41 +0000 +Subject: [PATCH 16/22] cr-restore: Warn if restorer can't be unmapped + +Too late to stop restore: it's already printed that restore was +successful. Oh, well warn aloud about infection. 
+ +Signed-off-by: Dmitry Safonov +Signed-off-by: Andrei Vagin +--- + criu/cr-restore.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 2ec1058..a3caad7 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -1971,6 +1971,7 @@ static void finalize_restore(void) + for_each_pstree_item(item) { + pid_t pid = item->pid->real; + struct parasite_ctl *ctl; ++ unsigned long restorer_addr; + + if (!task_alive(item)) + continue; +@@ -1980,7 +1981,9 @@ static void finalize_restore(void) + if (ctl == NULL) + continue; + +- compel_unmap(ctl, (unsigned long)rsti(item)->munmap_restorer); ++ restorer_addr = (unsigned long)rsti(item)->munmap_restorer; ++ if (compel_unmap(ctl, restorer_addr)) ++ pr_err("Failed to unmap restorer from %d\n", pid); + + xfree(ctl); + +-- +2.34.0 + diff --git a/0017-compel-infect-Warn-if-close-failed-on-memfd.patch b/0017-compel-infect-Warn-if-close-failed-on-memfd.patch new file mode 100644 index 0000000..dd9b7b5 --- /dev/null +++ b/0017-compel-infect-Warn-if-close-failed-on-memfd.patch @@ -0,0 +1,88 @@ +From 7b84380fa49478a2e22e6a563e8e20abff390d79 Mon Sep 17 00:00:00 2001 +From: Dmitry Safonov +Date: Sat, 9 Nov 2019 22:20:36 +0000 +Subject: [PATCH 17/22] compel/infect: Warn if close() failed on memfd + +As a preparation for __must_check on compel_syscall(), check it on +close() too - maybe not as useful as with other syscalls, but why not. 
+ +Signed-off-by: Dmitry Safonov +Signed-off-by: Andrei Vagin +--- + compel/src/lib/infect.c | 24 +++++++++++++++++------- + 1 file changed, 17 insertions(+), 7 deletions(-) + +diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c +index f0bcaf3..f726a98 100644 +--- a/compel/src/lib/infect.c ++++ b/compel/src/lib/infect.c +@@ -718,14 +718,25 @@ static int parasite_mmap_exchange(struct parasite_ctl *ctl, unsigned long size) + return 0; + } + ++static void parasite_memfd_close(struct parasite_ctl *ctl, int fd) ++{ ++ bool __maybe_unused compat = !compel_mode_native(ctl); ++ long ret; ++ int err; ++ ++ err = compel_syscall(ctl, __NR(close, compat), &ret, fd, 0, 0, 0, 0, 0); ++ if (err || ret) ++ pr_err("Can't close memfd\n"); ++} ++ + static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size) + { + void *where = (void *)ctl->ictx.syscall_ip + BUILTIN_SYSCALL_SIZE; ++ bool __maybe_unused compat_task = !compel_mode_native(ctl); + uint8_t orig_code[MEMFD_FNAME_SZ] = MEMFD_FNAME; + pid_t pid = ctl->rpid; + long sret = -ENOSYS; + int ret, fd, lfd; +- bool __maybe_unused compat_task = !compel_mode_native(ctl); + + if (ctl->ictx.flags & INFECT_NO_MEMFD) + return 1; +@@ -741,10 +752,9 @@ static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size) + (unsigned long)where, 0, 0, 0, 0, 0); + + if (ptrace_poke_area(pid, orig_code, where, sizeof(orig_code))) { +- fd = (int)(long)sret; ++ fd = (int)sret; + if (fd >= 0) +- compel_syscall(ctl, __NR(close, compat_task), &sret, +- fd, 0, 0, 0, 0, 0); ++ parasite_memfd_close(ctl, fd); + pr_err("Can't restore memfd args (pid: %d)\n", pid); + return -1; + } +@@ -752,7 +762,7 @@ static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size) + if (ret < 0) + return ret; + +- fd = (int)(long)sret; ++ fd = (int)sret; + if (fd == -ENOSYS) + return 1; + if (fd < 0) { +@@ -787,7 +797,7 @@ static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size) + goto 
err_curef; + } + +- compel_syscall(ctl, __NR(close, compat_task), &sret, fd, 0, 0, 0, 0, 0); ++ parasite_memfd_close(ctl, fd); + close(lfd); + + pr_info("Set up parasite blob using memfd\n"); +@@ -796,7 +806,7 @@ static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size) + err_curef: + close(lfd); + err_cure: +- compel_syscall(ctl, __NR(close, compat_task), &sret, fd, 0, 0, 0, 0, 0); ++ parasite_memfd_close(ctl, fd); + return -1; + } + +-- +2.34.0 + diff --git a/0018-lib-infect-Check-if-compel-succeed-in-executing-munm.patch b/0018-lib-infect-Check-if-compel-succeed-in-executing-munm.patch new file mode 100644 index 0000000..15af6a6 --- /dev/null +++ b/0018-lib-infect-Check-if-compel-succeed-in-executing-munm.patch @@ -0,0 +1,59 @@ +From 7f459cf23916958ef4636a6fa7f167d6980fbfef Mon Sep 17 00:00:00 2001 +From: Dmitry Safonov +Date: Sat, 9 Nov 2019 22:20:38 +0000 +Subject: [PATCH 18/22] lib/infect: Check if compel succeed in executing munmap + +Signed-off-by: Dmitry Safonov +Signed-off-by: Andrei Vagin +--- + compel/arch/s390/src/lib/infect.c | 4 +++- + compel/src/lib/infect.c | 10 +++++++--- + 2 files changed, 10 insertions(+), 4 deletions(-) + +diff --git a/compel/arch/s390/src/lib/infect.c b/compel/arch/s390/src/lib/infect.c +index 00e9c36..7e7d24c 100644 +--- a/compel/arch/s390/src/lib/infect.c ++++ b/compel/arch/s390/src/lib/infect.c +@@ -453,8 +453,10 @@ void *remote_mmap(struct parasite_ctl *ctl, + if (ptrace_poke_area(pid, &arg_struct, where, sizeof(arg_struct))) { + pr_err("Can't restore mmap args (pid: %d)\n", pid); + if (map != 0) { +- compel_syscall(ctl, __NR_munmap, NULL, map, ++ err = compel_syscall(ctl, __NR_munmap, NULL, map, + length, 0, 0, 0, 0); ++ if (err) ++ pr_err("Can't munmap %d\n", err); + map = 0; + } + } +diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c +index f726a98..19f0d10 100644 +--- a/compel/src/lib/infect.c ++++ b/compel/src/lib/infect.c +@@ -1303,6 +1303,7 @@ int compel_stop_daemon(struct 
parasite_ctl *ctl) + int compel_cure_remote(struct parasite_ctl *ctl) + { + long ret; ++ int err; + + if (compel_stop_daemon(ctl)) + return -1; +@@ -1310,9 +1311,12 @@ int compel_cure_remote(struct parasite_ctl *ctl) + if (!ctl->remote_map) + return 0; + +- compel_syscall(ctl, __NR(munmap, !compel_mode_native(ctl)), &ret, +- (unsigned long)ctl->remote_map, ctl->map_length, +- 0, 0, 0, 0); ++ err = compel_syscall(ctl, __NR(munmap, !compel_mode_native(ctl)), &ret, ++ (unsigned long)ctl->remote_map, ctl->map_length, ++ 0, 0, 0, 0); ++ if (err) ++ return err; ++ + if (ret) { + pr_err("munmap for remote map %p, %lu returned %lu\n", + ctl->remote_map, ctl->map_length, ret); +-- +2.34.0 + diff --git a/0019-cr-dump-Try-to-cure-remote-on-err-pathes.patch b/0019-cr-dump-Try-to-cure-remote-on-err-pathes.patch new file mode 100644 index 0000000..a1b2e88 --- /dev/null +++ b/0019-cr-dump-Try-to-cure-remote-on-err-pathes.patch @@ -0,0 +1,56 @@ +From cdbb583201ee2a34978d23de700eaa25ebab44db Mon Sep 17 00:00:00 2001 +From: Dmitry Safonov +Date: Sat, 9 Nov 2019 22:20:39 +0000 +Subject: [PATCH 19/22] cr-dump: Try to cure remote on err-pathes + +On daemon stop or threads dump failures it's still desired to remove +parasite from the remote (if possible). Try our best and keep hoping. 
+ +Signed-off-by: Dmitry Safonov +Signed-off-by: Andrei Vagin +--- + criu/cr-dump.c | 14 ++++++++++---- + 1 file changed, 10 insertions(+), 4 deletions(-) + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index 0d67073..45b2771 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -1401,16 +1401,20 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + + ret = compel_stop_daemon(parasite_ctl); + if (ret) { +- pr_err("Can't cure (pid: %d) from parasite\n", pid); +- goto err; ++ pr_err("Can't stop daemon in parasite (pid: %d)\n", pid); ++ goto err_cure; + } + + ret = dump_task_threads(parasite_ctl, item); + if (ret) { + pr_err("Can't dump threads\n"); +- goto err; ++ goto err_cure; + } + ++ /* ++ * On failure local map will be cured in cr_dump_finish() ++ * for lazy pages. ++ */ + if (opts.lazy_pages) + ret = compel_cure_remote(parasite_ctl); + else +@@ -1443,7 +1447,9 @@ err: + err_cure: + close_cr_imgset(&cr_imgset); + err_cure_imgset: +- compel_cure(parasite_ctl); ++ ret = compel_cure(parasite_ctl); ++ if (ret) ++ pr_err("Can't cure (pid: %d) from parasite\n", pid); + goto err; + } + +-- +2.34.0 + diff --git a/0020-cr-dump-Warn-if-unmapping-local-memfd-failed.patch b/0020-cr-dump-Warn-if-unmapping-local-memfd-failed.patch new file mode 100644 index 0000000..0d6e09a --- /dev/null +++ b/0020-cr-dump-Warn-if-unmapping-local-memfd-failed.patch @@ -0,0 +1,41 @@ +From 617c6c02a61d0fac2921b4a924ac18305a92c2d5 Mon Sep 17 00:00:00 2001 +From: Dmitry Safonov +Date: Sat, 9 Nov 2019 22:20:40 +0000 +Subject: [PATCH 20/22] cr-dump: Warn if unmapping local memfd failed + +Probably, not the worst that could happen, but still unexpected. +Preparing the ground to make compel_cure*() functions __must_check. 
+ +Signed-off-by: Dmitry Safonov +Signed-off-by: Andrei Vagin +--- + criu/cr-dump.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index 45b2771..1e5c7f2 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -1542,7 +1542,8 @@ static int cr_pre_dump_finish(int status) + timing_stop(TIME_MEMWRITE); + + destroy_page_pipe(mem_pp); +- compel_cure_local(ctl); ++ if (compel_cure_local(ctl)) ++ pr_err("Can't cure local: something happened with mapping?\n"); + } + + free_pstree(root_item); +@@ -1669,7 +1670,8 @@ static int cr_lazy_mem_dump(void) + for_each_pstree_item(item) { + if (item->pid->state != TASK_DEAD) { + destroy_page_pipe(dmpi(item)->mem_pp); +- compel_cure_local(dmpi(item)->parasite_ctl); ++ if (compel_cure_local(dmpi(item)->parasite_ctl)) ++ pr_err("Can't cure local: something happened with mapping?\n"); + } + } + +-- +2.34.0 + diff --git a/0021-parasite-syscall-Log-if-can-t-cure-on-failed-infecti.patch b/0021-parasite-syscall-Log-if-can-t-cure-on-failed-infecti.patch new file mode 100644 index 0000000..86752b0 --- /dev/null +++ b/0021-parasite-syscall-Log-if-can-t-cure-on-failed-infecti.patch @@ -0,0 +1,30 @@ +From 5195b6b41edd31f525cf2f65e4465fc246b57785 Mon Sep 17 00:00:00 2001 +From: Dmitry Safonov +Date: Sat, 9 Nov 2019 22:20:42 +0000 +Subject: [PATCH 21/22] parasite-syscall: Log if can't cure on failed infection + +Maybe expected, hopefully never happens - let's warn in any case. 
+ +Signed-off-by: Dmitry Safonov +Signed-off-by: Andrei Vagin +--- + criu/parasite-syscall.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c +index b9788a4..e5a8194 100644 +--- a/criu/parasite-syscall.c ++++ b/criu/parasite-syscall.c +@@ -565,7 +565,8 @@ struct parasite_ctl *parasite_infect_seized(pid_t pid, struct pstree_item *item, + parasite_ensure_args_size(aio_rings_args_size(vma_area_list)); + + if (compel_infect(ctl, item->nr_threads, parasite_args_size) < 0) { +- compel_cure(ctl); ++ if (compel_cure(ctl)) ++ pr_warn("Can't cure failed infection\n"); + return NULL; + } + +-- +2.34.0 + diff --git a/0022-compel-criu-Add-__must_check.patch b/0022-compel-criu-Add-__must_check.patch new file mode 100644 index 0000000..c93bd6e --- /dev/null +++ b/0022-compel-criu-Add-__must_check.patch @@ -0,0 +1,326 @@ +From 23571a5812e899e2b5ae3ce3dc2cfb159e63e6b5 Mon Sep 17 00:00:00 2001 +From: Dmitry Safonov +Date: Sat, 9 Nov 2019 22:20:45 +0000 +Subject: [PATCH 22/22] compel/criu: Add __must_check + +All those compel functions can fail by various reasons. +It may be status of the system, interruption by user or anything else. +It's really desired to handle as many PIE related errors as possible +otherwise it's hard to analyze statuses of parasite/restorer +and the C/R process. + +At least warning for logs should be produced or even C/R stopped. 
+ +Signed-off-by: Dmitry Safonov +Signed-off-by: Andrei Vagin +--- + compel/include/uapi/cpu.h | 2 +- + compel/include/uapi/infect-rpc.h | 6 ++-- + compel/include/uapi/infect-util.h | 5 ++- + compel/include/uapi/infect.h | 39 +++++++++++++----------- + compel/include/uapi/ptrace.h | 7 +++-- + compel/include/uapi/sigframe-common.h | 5 +-- + compel/plugins/include/uapi/plugin-fds.h | 2 +- + compel/plugins/include/uapi/std/infect.h | 8 +++-- + compel/plugins/include/uapi/std/log.h | 1 + + criu/seize.c | 2 +- + include/common/compiler.h | 27 ++++++++++++++++ + 11 files changed, 71 insertions(+), 33 deletions(-) + +diff --git a/compel/include/uapi/cpu.h b/compel/include/uapi/cpu.h +index 6f827d4..72c8a51 100644 +--- a/compel/include/uapi/cpu.h ++++ b/compel/include/uapi/cpu.h +@@ -6,7 +6,7 @@ + + #include + +-extern int compel_cpuid(compel_cpuinfo_t *info); ++extern int /* TODO: __must_check */ compel_cpuid(compel_cpuinfo_t *info); + extern bool compel_cpu_has_feature(unsigned int feature); + extern bool compel_fpu_has_feature(unsigned int feature); + extern uint32_t compel_fpu_feature_size(unsigned int feature); +diff --git a/compel/include/uapi/infect-rpc.h b/compel/include/uapi/infect-rpc.h +index 0176c11..180dedf 100644 +--- a/compel/include/uapi/infect-rpc.h ++++ b/compel/include/uapi/infect-rpc.h +@@ -6,9 +6,9 @@ + #include + + struct parasite_ctl; +-extern int compel_rpc_sync(unsigned int cmd, struct parasite_ctl *ctl); +-extern int compel_rpc_call(unsigned int cmd, struct parasite_ctl *ctl); +-extern int compel_rpc_call_sync(unsigned int cmd, struct parasite_ctl *ctl); ++extern int __must_check compel_rpc_sync(unsigned int cmd, struct parasite_ctl *ctl); ++extern int __must_check compel_rpc_call(unsigned int cmd, struct parasite_ctl *ctl); ++extern int __must_check compel_rpc_call_sync(unsigned int cmd, struct parasite_ctl *ctl); + extern int compel_rpc_sock(struct parasite_ctl *ctl); + + #define PARASITE_USER_CMDS 64 +diff --git 
a/compel/include/uapi/infect-util.h b/compel/include/uapi/infect-util.h +index 7307ba5..4e32d13 100644 +--- a/compel/include/uapi/infect-util.h ++++ b/compel/include/uapi/infect-util.h +@@ -1,6 +1,9 @@ + #ifndef __COMPEL_INFECT_UTIL_H__ + #define __COMPEL_INFECT_UTIL_H__ ++ ++#include "common/compiler.h" ++ + struct parasite_ctl; +-extern int compel_util_send_fd(struct parasite_ctl *ctl, int fd); ++extern int __must_check compel_util_send_fd(struct parasite_ctl *ctl, int fd); + extern int compel_util_recv_fd(struct parasite_ctl *ctl, int *pfd); + #endif +diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h +index 08beaff..dd672bc 100644 +--- a/compel/include/uapi/infect.h ++++ b/compel/include/uapi/infect.h +@@ -13,7 +13,7 @@ + + #define PARASITE_START_AREA_MIN (4096) + +-extern int compel_interrupt_task(int pid); ++extern int __must_check compel_interrupt_task(int pid); + + struct seize_task_status { + unsigned long long sigpnd; +@@ -23,27 +23,28 @@ struct seize_task_status { + int seccomp_mode; + }; + +-extern int compel_wait_task(int pid, int ppid, ++extern int __must_check compel_wait_task(int pid, int ppid, + int (*get_status)(int pid, struct seize_task_status *, void *data), + void (*free_status)(int pid, struct seize_task_status *, void *data), + struct seize_task_status *st, void *data); + +-extern int compel_stop_task(int pid); ++extern int __must_check compel_stop_task(int pid); + extern int compel_resume_task(pid_t pid, int orig_state, int state); + + struct parasite_ctl; + struct parasite_thread_ctl; + +-extern struct parasite_ctl *compel_prepare(int pid); +-extern struct parasite_ctl *compel_prepare_noctx(int pid); +-extern int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size); +-extern struct parasite_thread_ctl *compel_prepare_thread(struct parasite_ctl *ctl, int pid); ++extern struct parasite_ctl __must_check *compel_prepare(int pid); ++extern struct parasite_ctl __must_check 
*compel_prepare_noctx(int pid); ++extern int __must_check compel_infect(struct parasite_ctl *ctl, ++ unsigned long nr_threads, unsigned long args_size); ++extern struct parasite_thread_ctl __must_check *compel_prepare_thread(struct parasite_ctl *ctl, int pid); + extern void compel_release_thread(struct parasite_thread_ctl *); + +-extern int compel_stop_daemon(struct parasite_ctl *ctl); +-extern int compel_cure_remote(struct parasite_ctl *ctl); +-extern int compel_cure_local(struct parasite_ctl *ctl); +-extern int compel_cure(struct parasite_ctl *ctl); ++extern int __must_check compel_stop_daemon(struct parasite_ctl *ctl); ++extern int __must_check compel_cure_remote(struct parasite_ctl *ctl); ++extern int __must_check compel_cure_local(struct parasite_ctl *ctl); ++extern int __must_check compel_cure(struct parasite_ctl *ctl); + + #define PARASITE_ARG_SIZE_MIN ( 1 << 12) + +@@ -58,15 +59,16 @@ extern int compel_cure(struct parasite_ctl *ctl); + extern void *compel_parasite_args_p(struct parasite_ctl *ctl); + extern void *compel_parasite_args_s(struct parasite_ctl *ctl, unsigned long args_size); + +-extern int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, ++extern int __must_check compel_syscall(struct parasite_ctl *ctl, ++ int nr, long *ret, + unsigned long arg1, + unsigned long arg2, + unsigned long arg3, + unsigned long arg4, + unsigned long arg5, + unsigned long arg6); +-extern int compel_run_in_thread(struct parasite_thread_ctl *tctl, unsigned int cmd); +-extern int compel_run_at(struct parasite_ctl *ctl, unsigned long ip, user_regs_struct_t *ret_regs); ++extern int __must_check compel_run_in_thread(struct parasite_thread_ctl *tctl, unsigned int cmd); ++extern int __must_check compel_run_at(struct parasite_ctl *ctl, unsigned long ip, user_regs_struct_t *ret_regs); + + /* + * The PTRACE_SYSCALL will trap task twice -- on +@@ -80,12 +82,13 @@ enum trace_flags { + TRACE_EXIT, + }; + +-extern int compel_stop_on_syscall(int tasks, int sys_nr, ++extern 
int __must_check compel_stop_on_syscall(int tasks, int sys_nr, + int sys_nr_compat, enum trace_flags trace); + +-extern int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp); ++extern int __must_check compel_stop_pie(pid_t pid, void *addr, ++ enum trace_flags *tf, bool no_bp); + +-extern int compel_unmap(struct parasite_ctl *ctl, unsigned long addr); ++extern int __must_check compel_unmap(struct parasite_ctl *ctl, unsigned long addr); + + extern int compel_mode_native(struct parasite_ctl *ctl); + +@@ -159,7 +162,7 @@ struct parasite_blob_desc { + + extern struct parasite_blob_desc *compel_parasite_blob_desc(struct parasite_ctl *); + +-extern int compel_get_thread_regs(struct parasite_thread_ctl *, save_regs_t, void *); ++extern int __must_check compel_get_thread_regs(struct parasite_thread_ctl *, save_regs_t, void *); + + extern void compel_relocs_apply(void *mem, void *vbase, size_t size, compel_reloc_t *elf_relocs, size_t nr_relocs); + +diff --git a/compel/include/uapi/ptrace.h b/compel/include/uapi/ptrace.h +index 4df00b6..13eed72 100644 +--- a/compel/include/uapi/ptrace.h ++++ b/compel/include/uapi/ptrace.h +@@ -1,6 +1,7 @@ + #ifndef UAPI_COMPEL_PTRACE_H__ + #define UAPI_COMPEL_PTRACE_H__ + ++#include "common/compiler.h" + /* + * We'd want to include both sys/ptrace.h and linux/ptrace.h, + * hoping that most definitions come from either one or another. 
+@@ -75,8 +76,8 @@ typedef struct { + + extern int ptrace_suspend_seccomp(pid_t pid); + +-extern int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes); +-extern int ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes); +-extern int ptrace_swap_area(pid_t pid, void *dst, void *src, long bytes); ++extern int __must_check ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes); ++extern int __must_check ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes); ++extern int __must_check ptrace_swap_area(pid_t pid, void *dst, void *src, long bytes); + + #endif /* UAPI_COMPEL_PTRACE_H__ */ +diff --git a/compel/include/uapi/sigframe-common.h b/compel/include/uapi/sigframe-common.h +index fc93c54..177bf4c 100644 +--- a/compel/include/uapi/sigframe-common.h ++++ b/compel/include/uapi/sigframe-common.h +@@ -8,6 +8,7 @@ + # error "Direct inclusion is forbidden, use instead" + #endif + ++#include "common/compiler.h" + #include + #include + +@@ -56,7 +57,7 @@ struct rt_ucontext { + unsigned long uc_regspace[128] __attribute__((aligned(8))); + }; + +-extern int sigreturn_prep_fpu_frame(struct rt_sigframe *frame, +- struct rt_sigframe *rframe); ++extern int __must_check sigreturn_prep_fpu_frame(struct rt_sigframe *frame, ++ struct rt_sigframe *rframe); + + #endif /* UAPI_COMPEL_SIGFRAME_COMMON_H__ */ +diff --git a/compel/plugins/include/uapi/plugin-fds.h b/compel/plugins/include/uapi/plugin-fds.h +index cececb2..e995b4b 100644 +--- a/compel/plugins/include/uapi/plugin-fds.h ++++ b/compel/plugins/include/uapi/plugin-fds.h +@@ -1,7 +1,7 @@ + #ifndef COMPEL_PLUGIN_STD_STD_H__ + #define COMPEL_PLUGIN_STD_STD_H__ + +-extern int fds_send_fd(int fd); ++extern int __must_check fds_send_fd(int fd); + extern int fds_recv_fd(void); + + #endif /* COMPEL_PLUGIN_STD_STD_H__ */ +diff --git a/compel/plugins/include/uapi/std/infect.h b/compel/plugins/include/uapi/std/infect.h +index 800df25..1e784f8 100644 +--- a/compel/plugins/include/uapi/std/infect.h ++++ 
b/compel/plugins/include/uapi/std/infect.h +@@ -1,14 +1,16 @@ + #ifndef COMPEL_PLUGIN_STD_INFECT_H__ + #define COMPEL_PLUGIN_STD_INFECT_H__ + ++#include "common/compiler.h" ++ + extern int parasite_get_rpc_sock(void); +-extern int parasite_service(unsigned int cmd, void *args); ++extern int __must_check parasite_service(unsigned int cmd, void *args); + + /* + * Must be supplied by user plugins. + */ +-extern int parasite_daemon_cmd(int cmd, void *args); +-extern int parasite_trap_cmd(int cmd, void *args); ++extern int __must_check parasite_daemon_cmd(int cmd, void *args); ++extern int __must_check parasite_trap_cmd(int cmd, void *args); + extern void parasite_cleanup(void); + + /* +diff --git a/compel/plugins/include/uapi/std/log.h b/compel/plugins/include/uapi/std/log.h +index f21b6df..91462c8 100644 +--- a/compel/plugins/include/uapi/std/log.h ++++ b/compel/plugins/include/uapi/std/log.h +@@ -2,6 +2,7 @@ + #define COMPEL_PLUGIN_STD_LOG_H__ + + #include "compel/loglevels.h" ++#include "common/compiler.h" + + #define STD_LOG_SIMPLE_CHUNK 256 + +diff --git a/criu/seize.c b/criu/seize.c +index cce8911..e1e6b81 100644 +--- a/criu/seize.c ++++ b/criu/seize.c +@@ -483,7 +483,7 @@ static int collect_children(struct pstree_item *item) + + if (!opts.freeze_cgroup) + /* fails when meets a zombie */ +- compel_interrupt_task(pid); ++ __ignore_value(compel_interrupt_task(pid)); + + ret = compel_wait_task(pid, item->pid->real, parse_pid_status, NULL, &creds.s, NULL); + if (ret < 0) { +diff --git a/include/common/compiler.h b/include/common/compiler.h +index fc8abcf..1d431a5 100644 +--- a/include/common/compiler.h ++++ b/include/common/compiler.h +@@ -22,6 +22,7 @@ + #define __used __attribute__((__used__)) + #define __maybe_unused __attribute__((unused)) + #define __always_unused __attribute__((unused)) ++#define __must_check __attribute__((__warn_unused_result__)) + + #define __section(S) __attribute__ ((__section__(#S))) + +@@ -99,4 +100,30 @@ + + #define is_log2(v) (((v) & 
((v) - 1)) == 0) + ++/* ++ * Use "__ignore_value" to avoid a warning when using a function declared with ++ * gcc's warn_unused_result attribute, but for which you really do want to ++ * ignore the result. Traditionally, people have used a "(void)" cast to ++ * indicate that a function's return value is deliberately unused. However, ++ * if the function is declared with __attribute__((warn_unused_result)), ++ * gcc issues a warning even with the cast. ++ * ++ * Caution: most of the time, you really should heed gcc's warning, and ++ * check the return value. However, in those exceptional cases in which ++ * you're sure you know what you're doing, use this function. ++ * ++ * Normally casting an expression to void discards its value, but GCC ++ * versions 3.4 and newer have __attribute__ ((__warn_unused_result__)) ++ * which may cause unwanted diagnostics in that case. Use __typeof__ ++ * and __extension__ to work around the problem, if the workaround is ++ * known to be needed. ++ * Written by Jim Meyering, Eric Blake and Pádraig Brady. 
++ * (See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66425 for the details) ++ */ ++#if 3 < __GNUC__ + (4 <= __GNUC_MINOR__) ++# define __ignore_value(x) ({ __typeof__ (x) __x = (x); (void) __x; }) ++#else ++# define __ignore_value(x) ((void) (x)) ++#endif ++ + #endif /* __CR_COMPILER_H__ */ +-- +2.34.0 + diff --git a/backport-0001--build-add-secure-compilation-options.patch b/backport-0001--build-add-secure-compilation-options.patch new file mode 100644 index 0000000..d1c0d78 --- /dev/null +++ b/backport-0001--build-add-secure-compilation-options.patch @@ -0,0 +1,112 @@ +From 69786cc26699497a1b6a02f8bd35bb04eb74830e Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 19 May 2021 22:49:57 +0800 +Subject: [PATCH 01/49] build: add secure compilation options + +Add secure compilation options: +-fstack-protector -fstack-protector-all +-Wl,-z,relro,-z,now,-z,noexecstack + +Signed-off-by: fu.lin +--- + Makefile | 4 ++++ + criu/Makefile | 2 +- + criu/pie/Makefile | 1 + + criu/pie/Makefile.library | 2 ++ + lib/Makefile | 1 + + lib/c/Makefile | 2 +- + scripts/nmk/scripts/build.mk | 5 +++-- + 7 files changed, 13 insertions(+), 4 deletions(-) + +diff --git a/Makefile b/Makefile +index 0140330..a95a642 100644 +--- a/Makefile ++++ b/Makefile +@@ -82,6 +82,10 @@ ifeq ($(ARCH),s390) + CFLAGS_PIE := -fno-optimize-sibling-calls + endif + ++# secure compilation options ++CFLAGS += -fstack-protector-all -fPIE ++LDFLAGS += -pie ++ + CFLAGS_PIE += -DCR_NOGLIBC + export CFLAGS_PIE + +diff --git a/criu/Makefile b/criu/Makefile +index 4134e50..419fbf1 100644 +--- a/criu/Makefile ++++ b/criu/Makefile +@@ -85,7 +85,7 @@ $(obj)/%: pie + + $(obj)/criu: $(PROGRAM-BUILTINS) + $(call msg-link, $@) +- $(Q) $(CC) $(CFLAGS) $^ $(LIBS) $(WRAPFLAGS) $(LDFLAGS) $(GMONLDOPT) -rdynamic -o $@ ++ $(Q) $(CC) $(CFLAGS) $^ $(LIBS) $(WRAPFLAGS) $(LDFLAGS) $(GMONLDOPT) -rdynamic -o $@ -Wl,-z,relro,-z,now,-z,noexecstack -fPIE -pie + + + # +diff --git a/criu/pie/Makefile b/criu/pie/Makefile +index 
1ad456f..39a3998 100644 +--- a/criu/pie/Makefile ++++ b/criu/pie/Makefile +@@ -6,6 +6,7 @@ target := parasite restorer + + CFLAGS := $(filter-out -pg $(CFLAGS-GCOV) $(CFLAGS-ASAN),$(CFLAGS)) + CFLAGS += $(CFLAGS_PIE) ++CFLAGS := $(filter-out -fstack-protector -fstack-protector-all,$(CFLAGS)) + ccflags-y += -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0 + ccflags-y += -Wp,-U_FORTIFY_SOURCE -Wp,-D_FORTIFY_SOURCE=0 + +diff --git a/criu/pie/Makefile.library b/criu/pie/Makefile.library +index 658c8a4..cb0d929 100644 +--- a/criu/pie/Makefile.library ++++ b/criu/pie/Makefile.library +@@ -23,3 +23,5 @@ endif + + CFLAGS := $(filter-out -pg $(CFLAGS-GCOV) $(CFLAGS-ASAN),$(CFLAGS)) + CFLAGS += $(CFLAGS_PIE) ++ ++CFLAGS := $(filter-out -fstack-protector -fstack-protector-all,$(CFLAGS)) +diff --git a/lib/Makefile b/lib/Makefile +index f9b6670..bc1b513 100644 +--- a/lib/Makefile ++++ b/lib/Makefile +@@ -14,6 +14,7 @@ lib/c/Makefile: ; + lib/c/%: .FORCE + $(Q) $(MAKE) $(build)=lib/c $@ + ++CFLAGS := $(filter-out -fPIE,$(CFLAGS)) + cflags-so += $(CFLAGS) -rdynamic -Wl,-soname,$(CRIU_SO).$(CRIU_SO_VERSION_MAJOR) + ldflags-so += -lprotobuf-c + +diff --git a/lib/c/Makefile b/lib/c/Makefile +index af01467..d7f6491 100644 +--- a/lib/c/Makefile ++++ b/lib/c/Makefile +@@ -4,5 +4,5 @@ obj-y += ./images/rpc.pb-c.o + ccflags-y += -iquote criu/$(ARCH_DIR)/include + ccflags-y += -iquote criu/include + ccflags-y += -iquote images +-ccflags-y += -fPIC -fno-stack-protector ++ccflags-y += -fPIC + ldflags-y += -r -z noexecstack +diff --git a/scripts/nmk/scripts/build.mk b/scripts/nmk/scripts/build.mk +index d01d2b7..6f366d7 100644 +--- a/scripts/nmk/scripts/build.mk ++++ b/scripts/nmk/scripts/build.mk +@@ -15,8 +15,9 @@ lib-name := + lib-target := + hostprogs-y := + libso-y := +-ld_flags := +-ldflags-so := ++ld_flags := -Wl,-z,relro,-z,now,-z,noexecstack ++ldflags-so := -Wl,-z,relro,-z,now,-z,noexecstack ++ldflags-y := -z relro -z now -z noexecstack + arflags-y := + target := + deps-y := +-- +2.34.0 + diff 
--git a/backport-0002--tty-fix-NULL-pointer-access-in-tty.patch b/backport-0002--tty-fix-NULL-pointer-access-in-tty.patch new file mode 100644 index 0000000..2271d65 --- /dev/null +++ b/backport-0002--tty-fix-NULL-pointer-access-in-tty.patch @@ -0,0 +1,29 @@ +From a4975cd1b9384199734b76155695828cd3085f20 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Thu, 15 Jul 2021 11:00:25 +0800 +Subject: [PATCH 02/49] tty: fix NULL pointer access in tty + +Signed-off-by: fu.lin +--- + criu/tty.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/criu/tty.c b/criu/tty.c +index dee8d46..b34cfc2 100644 +--- a/criu/tty.c ++++ b/criu/tty.c +@@ -2023,6 +2023,11 @@ static int dump_one_tty(int lfd, u32 id, const struct fd_parms *p) + pr_info("Dumping tty %d with id %#x\n", lfd, id); + + driver = get_tty_driver(p->stat.st_rdev, p->stat.st_dev); ++ if (driver == NULL) { ++ pr_err("Can't get tty driver\n"); ++ return -1; ++ } ++ + if (driver->fd_get_index) + index = driver->fd_get_index(lfd, p); + else +-- +2.34.0 + diff --git a/backport-0003--namespaces-drop-func-address-print-to-make-someone-h.patch b/backport-0003--namespaces-drop-func-address-print-to-make-someone-h.patch new file mode 100644 index 0000000..7f64707 --- /dev/null +++ b/backport-0003--namespaces-drop-func-address-print-to-make-someone-h.patch @@ -0,0 +1,31 @@ +From ee707cbd517472125fe6082a0ecbe5797d286a5d Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Thu, 15 Jul 2021 11:10:46 +0800 +Subject: [PATCH 03/49] namespaces: drop func address print to make someone + happy + +Signed-off-by: fu.lin +--- + criu/namespaces.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/criu/namespaces.c b/criu/namespaces.c +index a228737..7fef175 100644 +--- a/criu/namespaces.c ++++ b/criu/namespaces.c +@@ -1239,10 +1239,10 @@ static int usernsd(int sk) + } + + unsc_msg_pid_fd(&um, &pid, &fd); +- pr_debug("uns: daemon calls %p (%d, %d, %x)\n", call, pid, fd, flags); ++ pr_debug("uns: daemon calls (%d, %d, 
%x)\n", pid, fd, flags); + + if (fd < 0 && flags & UNS_FDOUT) { +- pr_err("uns: bad flags/fd %p %d %x\n", call, fd, flags); ++ pr_err("uns: bad flags/fd %d %x\n", fd, flags); + BUG(); + } + +-- +2.34.0 + diff --git a/0012-add-pin-memory-method-for-criu.patch b/backport-0004--mm-add-pin-memory-method-for-criu.patch similarity index 53% rename from 0012-add-pin-memory-method-for-criu.patch rename to backport-0004--mm-add-pin-memory-method-for-criu.patch index 3214161..a86cecc 100644 --- a/0012-add-pin-memory-method-for-criu.patch +++ b/backport-0004--mm-add-pin-memory-method-for-criu.patch @@ -1,32 +1,30 @@ -From ebde25b4819ebae068f9547744f0deebd390dc0c Mon Sep 17 00:00:00 2001 +From 3f2f975db95d13bfe42e14af6fa5bd355c3ec0b0 Mon Sep 17 00:00:00 2001 From: Jingxian He -Date: Mon, 1 Mar 2021 21:23:46 +0800 -Subject: [PATCH] add pin memory method for criu +Date: Fri, 23 Apr 2021 21:22:08 +0800 +Subject: [PATCH 04/49] mm: add pin memory method for criu -We can use the checkpoint and restore in userspace method to dump and restore tasks -when updating the kernel. Currently, criu needs dump all memory data of tasks to files. -When the memory size is very large(larger than 1G), the cost time of the dumping data -will be very long(more than 1 min). - -We can pin the memory data of tasks and collect the corresponding physical pages -mapping info in checkpoint process, -and remap the physical pages to restore tasks in restore process. +Add pin memory method for criu to improve memory recover +speed and avoid user private data saving to files. 
Signed-off-by: Jingxian He --- - criu/config.c | 1 + - criu/cr-restore.c | 5 +++ - criu/include/cr_options.h | 1 + - criu/include/restorer.h | 24 ++++++++++++ - criu/mem.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++- - criu/pie/restorer.c | 21 ++++++++++- - 6 files changed, 146 insertions(+), 2 deletions(-) + criu/config.c | 1 + + criu/cr-dump.c | 5 ++ + criu/cr-restore.c | 2 + + criu/crtools.c | 1 + + criu/include/cr_options.h | 1 + + criu/include/mem.h | 4 ++ + criu/include/restorer.h | 28 +++++++++ + criu/mem.c | 129 +++++++++++++++++++++++++++++++++++++- + criu/pie/restorer.c | 25 +++++++- + criu/seize.c | 6 ++ + 10 files changed, 200 insertions(+), 2 deletions(-) diff --git a/criu/config.c b/criu/config.c -index 76d6e5f..e517548 100644 +index 8cb6bb2..d30636b 100644 --- a/criu/config.c +++ b/criu/config.c -@@ -517,6 +517,7 @@ int parse_options(int argc, char **argv, bool *usage_error, +@@ -518,6 +518,7 @@ int parse_options(int argc, char **argv, bool *usage_error, BOOL_OPT("tls", &opts.tls), {"tls-no-cn-verify", no_argument, &opts.tls_no_cn_verify, true}, BOOL_OPT("with-cpu-affinity", &opts.with_cpu_affinity), @@ -34,24 +32,49 @@ index 76d6e5f..e517548 100644 { }, }; +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index 1e5c7f2..b125985 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -1761,6 +1761,11 @@ static int cr_dump_finish(int ret) + + close_service_fd(CR_PROC_FD_OFF); + ++ if (ret == 0 && opts.pin_memory) { ++ pr_info("start restore_task_special_pages\n"); ++ restore_task_special_pages(0); ++ } ++ + if (ret) { + pr_err("Dumping FAILED.\n"); + } else { diff --git a/criu/cr-restore.c b/criu/cr-restore.c -index c2be323..bd49f1f 100644 +index a3caad7..86c78a8 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c -@@ -3651,6 +3651,11 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns +@@ -3668,6 +3668,8 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns 
task_args->clone_restore_fn, task_args->thread_args); -+ if (opts.pin_memory) -+ task_args->pin_memory = true; -+ else -+ task_args->pin_memory = false; ++ task_args->pin_memory = opts.pin_memory ? true : false; + /* * An indirect call to task_restore, note it never returns * and restoring core is extremely destructive. +diff --git a/criu/crtools.c b/criu/crtools.c +index a22664d..fce29f3 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -390,6 +390,7 @@ usage: + " user:PID,UID,GID\n" + " --with-cpu-affinity Allow to restore cpu affinity. Only for hosts with\n" + " same cpu quantity.\n" ++" --pin-memory Use pin memory method for checkpoint and restore.\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h -index 98c5a44..f19d588 100644 +index 52211a7..15f8cb3 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -145,6 +145,7 @@ struct cr_options { @@ -62,34 +85,51 @@ index 98c5a44..f19d588 100644 }; extern struct cr_options opts; +diff --git a/criu/include/mem.h b/criu/include/mem.h +index 251cb1a..4241b0c 100644 +--- a/criu/include/mem.h ++++ b/criu/include/mem.h +@@ -50,4 +50,8 @@ int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta); + int unmap_guard_pages(struct pstree_item *t); + int prepare_mappings(struct pstree_item *t); + bool should_dump_page(VmaEntry *vmae, u64 pme); ++ ++int dump_task_special_pages(int pid); ++int restore_task_special_pages(int pid); ++ + #endif /* __CR_MEM_H__ */ diff --git a/criu/include/restorer.h b/criu/include/restorer.h -index bd6ef6a..fc37e6d 100644 +index bd6ef6a..fbd5262 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -225,6 +225,7 @@ struct task_restore_args { int lsm_type; int child_subreaper; bool has_clone3_set_tid; -+ bool pin_memory; ++ bool pin_memory; } __aligned(64); /* -@@ -317,4 +318,27 @@ enum { +@@ -317,4 +318,31 @@ enum { 
#define __r_sym(name) restorer_sym ## name #define restorer_sym(rblob, name) (void*)(rblob + __r_sym(name)) +#define PIN_MEM_FILE "/dev/pinmem" +#define PIN_MEM_MAGIC 0x59 -+#define _SET_PIN_MEM_AREA 1 -+#define _CLEAR_PIN_MEM_AREA 2 -+#define _REMAP_PIN_MEM_AREA 3 -+#define _PIN_MEM_IOC_MAX_NR 4 ++#define _SET_PIN_MEM_AREA 1 ++#define _CLEAR_PIN_MEM_AREA 2 ++#define _REMAP_PIN_MEM_AREA 3 ++#define _DUMP_SEPCIAL_PAGES 6 ++#define _RETORE_SEPCIAL_PAGES 7 +#define SET_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _SET_PIN_MEM_AREA, struct pin_mem_area_set) +#define CLEAR_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _CLEAR_PIN_MEM_AREA, int) +#define REMAP_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _REMAP_PIN_MEM_AREA, int) ++#define DUMP_SEPCIAL_PAGES _IOW(PIN_MEM_MAGIC, _DUMP_SEPCIAL_PAGES, int) ++#define RETORE_SEPCIAL_PAGES _IOW(PIN_MEM_MAGIC, _RETORE_SEPCIAL_PAGES, int) + +#define ONCE_PIN_MEM_SIZE_LIMIT 32 * 1024 * 1024 +#define MAX_PIN_MEM_AREA_NUM 16 ++ +struct pin_mem_area { + unsigned long virt_start; + unsigned long virt_end; @@ -103,10 +143,10 @@ index bd6ef6a..fc37e6d 100644 + #endif /* __CR_RESTORER_H__ */ diff --git a/criu/mem.c b/criu/mem.c -index de66a62..4c34456 100644 +index e096853..d2ff6e0 100644 --- a/criu/mem.c +++ b/criu/mem.c -@@ -391,6 +391,88 @@ again: +@@ -391,6 +391,119 @@ again: return ret; } @@ -130,10 +170,8 @@ index de66a62..4c34456 100644 + + if (vma_entry_is(vmae, VMA_AREA_AIORING)) + return false; -+ if (vma_entry_is(vmae, VMA_ANON_PRIVATE)) { -+ pr_debug("find private anon vma: %lx-%lx\n", vmae->start, vmae->end); ++ if (vma_entry_is(vmae, VMA_ANON_PRIVATE)) + return true; -+ } + + return false; +} @@ -154,7 +192,6 @@ index de66a62..4c34456 100644 + pma = &(pmas.mem_area[index]); + pma->virt_start = start; + pma->virt_end = next; -+ pr_info("start pin %lx-%lx\n", start, next); + index++; + start += ONCE_PIN_MEM_SIZE_LIMIT; + if (index >= MAX_PIN_MEM_AREA_NUM) @@ -163,7 +200,6 @@ index de66a62..4c34456 100644 + *pend = next; + pmas.area_num = index; + pmas.pid = 
vpid(item); -+ pr_info("begin pin memory for pid:%d\n", pmas.pid); + ret = ioctl(fd, SET_PIN_MEM_AREA, &pmas); + if (ret < 0) + pr_err("pin mem fail, errno: %s\n", strerror(errno)); @@ -191,11 +227,46 @@ index de66a62..4c34456 100644 + close(fd); + return ret; +} ++ ++int dump_task_special_pages(int pid) ++{ ++ int fd, ret; ++ ++ fd = open(PIN_MEM_FILE, O_RDWR, 0); ++ if (fd < 0) { ++ pr_warn("error open file: %s\n", PIN_MEM_FILE); ++ return -1; ++ } ++ ret = ioctl(fd, DUMP_SEPCIAL_PAGES, (unsigned long) &pid); ++ if (ret < 0) { ++ pr_warn("No need DUMP_SEPCIAL_PAGES for %d\n", pid); ++ } ++ close(fd); ++ return ret; ++} ++ ++int restore_task_special_pages(int pid) ++{ ++ int fd, ret; ++ ++ fd = open(PIN_MEM_FILE, O_RDWR, 0); ++ if (fd < 0) { ++ pr_warn("error open file: %s\n", PIN_MEM_FILE); ++ return -1; ++ } ++ ret = ioctl(fd, RETORE_SEPCIAL_PAGES, (unsigned long) &pid); ++ if (ret < 0) { ++ pr_warn("No need RETORE_SEPCIAL_PAGES for %d\n", pid); ++ } ++ close(fd); ++ return ret; ++} ++ + static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasite_dump_pages_args *args, struct vm_area_list *vma_area_list, -@@ -465,7 +547,16 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, +@@ -465,7 +578,18 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, if (possible_pid_reuse == -1) goto out_xfer; } @@ -205,26 +276,28 @@ index de66a62..4c34456 100644 + list_for_each_entry(vma_area, &vma_area_list->h, list) { + if (should_pin_vmae(vma_area->e)) { + ret = pin_vmae(vma_area->e, item); -+ if (ret) ++ if (ret) { ++ exit_code = -1; + goto out_xfer; ++ } + } + } + } /* * Step 1 -- generate the pagemap -@@ -473,6 +564,9 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, +@@ -473,6 +597,9 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, args->off = 0; has_parent = !!xfer.parent && !possible_pid_reuse; list_for_each_entry(vma_area, &vma_area_list->h, list) { -+ if (opts.pin_memory && 
should_pin_vmae(vma_area->e)) { ++ if (opts.pin_memory && should_pin_vmae(vma_area->e)) + continue; -+ } ++ ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl, &pmc, has_parent, mdc->pre_dump); if (ret < 0) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c -index 571341d..7eb51b7 100644 +index 571341d..9a24feb 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1412,6 +1412,24 @@ int cleanup_current_inotify_events(struct task_restore_args *task_args) @@ -252,16 +325,44 @@ index 571341d..7eb51b7 100644 /* * The main routine to restore task via sigreturn. * This one is very special, we never return there -@@ -1583,7 +1601,8 @@ long __export_restore_task(struct task_restore_args *args) +@@ -1583,7 +1601,12 @@ long __export_restore_task(struct task_restore_args *args) goto core_restore_end; } } - -+ if (args->pin_memory) -+ remap_vmas(my_pid); ++ if (args->pin_memory) { ++ if (remap_vmas(my_pid) < 0) { ++ pr_err("Remap vmas fail\n"); ++ goto core_restore_end; ++ } ++ } /* * Now read the contents (if any) */ +diff --git a/criu/seize.c b/criu/seize.c +index e1e6b81..0e79aba 100644 +--- a/criu/seize.c ++++ b/criu/seize.c +@@ -23,6 +23,7 @@ + #include "xmalloc.h" + #include "util.h" + #include ++#include "mem.h" + + #define NR_ATTEMPTS 5 + +@@ -530,6 +531,11 @@ static void unseize_task_and_threads(const struct pstree_item *item, int st) + if (item->pid->state == TASK_DEAD) + return; + ++ if (opts.pin_memory) { ++ for (i = 0; i < item->nr_threads; i++) ++ dump_task_special_pages(item->threads[i].real); ++ } ++ + /* + * The st is the state we want to switch tasks into, + * the item->state is the state task was in when we seized one. 
-- -2.9.5 +2.34.0 diff --git a/backport-0005--pid-add-pid-recover-method-for-criu.patch b/backport-0005--pid-add-pid-recover-method-for-criu.patch new file mode 100644 index 0000000..e9b479d --- /dev/null +++ b/backport-0005--pid-add-pid-recover-method-for-criu.patch @@ -0,0 +1,195 @@ +From d01d0cb104bbd62e51dc25137a2113bb7ea4636d Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 1 Dec 2021 10:43:39 +0800 +Subject: [PATCH 05/49] pid: add pid recover method for criu + +The default pid recover method cannot recover the task +pid at every time. +We add a new pid recover method by setting the fork_pid of +the parent task struct, add the kernel will alloc pid by +the fork_pid. +The new pid recover method can also avoid other tasks using +the dumping task pids. + +Signed-off-by: Jingxian He +--- + criu/config.c | 1 + + criu/cr-restore.c | 25 ++++++++++++++++++++++++- + criu/crtools.c | 1 + + criu/include/cr_options.h | 1 + + criu/include/restorer.h | 3 +++ + criu/pie/restorer.c | 25 ++++++++++++++++++++++++- + 6 files changed, 54 insertions(+), 2 deletions(-) + +diff --git a/criu/config.c b/criu/config.c +index d30636b..fdd8e07 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -519,6 +519,7 @@ int parse_options(int argc, char **argv, bool *usage_error, + {"tls-no-cn-verify", no_argument, &opts.tls_no_cn_verify, true}, + BOOL_OPT("with-cpu-affinity", &opts.with_cpu_affinity), + BOOL_OPT("pin-memory", &opts.pin_memory), ++ BOOL_OPT("use-fork-pid", &opts.use_fork_pid), + { }, + }; + +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 86c78a8..9bc5262 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -1323,6 +1323,23 @@ static bool needs_prep_creds(struct pstree_item *item) + return (!item->parent && ((root_ns_mask & CLONE_NEWUSER) || getuid())); + } + ++static int write_fork_pid(int pid) ++{ ++ int fd, ret; ++ ++ fd = open(PIN_MEM_FILE, O_RDWR); ++ if (fd < 0) { ++ pr_warn("error open file: %s\n", PIN_MEM_FILE); ++ return -1; ++ } ++ ret = 
ioctl(fd, SET_FORK_PID, &pid); ++ if (ret < 0) { ++ pr_warn("write fork pid fail, errno: %s\n", strerror(errno)); ++ } ++ close(fd); ++ return ret; ++} ++ + static inline int fork_with_pid(struct pstree_item *item) + { + struct cr_clone_arg ca; +@@ -1378,7 +1395,7 @@ static inline int fork_with_pid(struct pstree_item *item) + int len; + int fd = -1; + +- if (!kdat.has_clone3_set_tid) { ++ if (!kdat.has_clone3_set_tid && opts.use_fork_pid) { + fd = open_proc_rw(PROC_GEN, LAST_PID_PATH); + if (fd < 0) + goto err; +@@ -1406,6 +1423,11 @@ static inline int fork_with_pid(struct pstree_item *item) + ~(CLONE_NEWNET | CLONE_NEWCGROUP)), + SIGCHLD, pid); + } else { ++ if (opts.use_fork_pid) { ++ ret = write_fork_pid(pid); ++ if (ret < 0) ++ goto err_unlock; ++ } + /* + * Some kernel modules, such as network packet generator + * run kernel thread upon net-namespace creation taking +@@ -3669,6 +3691,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns + task_args->thread_args); + + task_args->pin_memory = opts.pin_memory ? true : false; ++ task_args->use_fork_pid = opts.use_fork_pid ? true : false; + + /* + * An indirect call to task_restore, note it never returns +diff --git a/criu/crtools.c b/criu/crtools.c +index fce29f3..15f2fc9 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -391,6 +391,7 @@ usage: + " --with-cpu-affinity Allow to restore cpu affinity. 
Only for hosts with\n" + " same cpu quantity.\n" + " --pin-memory Use pin memory method for checkpoint and restore.\n" ++" --use-fork-pid Allow to restore task pid by setting fork pid of task struct.\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 15f8cb3..a91bfff 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -146,6 +146,7 @@ struct cr_options { + int tls_no_cn_verify; + int with_cpu_affinity; /* restore cpu affinity */ + int pin_memory; ++ int use_fork_pid; + }; + + extern struct cr_options opts; +diff --git a/criu/include/restorer.h b/criu/include/restorer.h +index fbd5262..481eeaa 100644 +--- a/criu/include/restorer.h ++++ b/criu/include/restorer.h +@@ -226,6 +226,7 @@ struct task_restore_args { + int child_subreaper; + bool has_clone3_set_tid; + bool pin_memory; ++ bool use_fork_pid; + } __aligned(64); + + /* +@@ -325,11 +326,13 @@ enum { + #define _REMAP_PIN_MEM_AREA 3 + #define _DUMP_SEPCIAL_PAGES 6 + #define _RETORE_SEPCIAL_PAGES 7 ++#define _SET_FORK_PID 8 + #define SET_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _SET_PIN_MEM_AREA, struct pin_mem_area_set) + #define CLEAR_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _CLEAR_PIN_MEM_AREA, int) + #define REMAP_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _REMAP_PIN_MEM_AREA, int) + #define DUMP_SEPCIAL_PAGES _IOW(PIN_MEM_MAGIC, _DUMP_SEPCIAL_PAGES, int) + #define RETORE_SEPCIAL_PAGES _IOW(PIN_MEM_MAGIC, _RETORE_SEPCIAL_PAGES, int) ++#define SET_FORK_PID _IOW(PIN_MEM_MAGIC, _SET_FORK_PID, int) + + #define ONCE_PIN_MEM_SIZE_LIMIT 32 * 1024 * 1024 + #define MAX_PIN_MEM_AREA_NUM 16 +diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c +index 9a24feb..f4f4e6a 100644 +--- a/criu/pie/restorer.c ++++ b/criu/pie/restorer.c +@@ -1429,6 +1429,22 @@ int remap_vmas(int pid) + return ret; + } + ++int write_fork_pid(int pid) ++{ ++ int fd, ret; ++ ++ fd = sys_open(PIN_MEM_FILE, O_RDWR, 0); 
++ if (fd < 0) { ++ pr_warn("error open file: %s\n", PIN_MEM_FILE); ++ return -1; ++ } ++ ret = sys_ioctl(fd, SET_FORK_PID, (unsigned long) &pid); ++ if (ret < 0) { ++ pr_warn("write fork pid fail fail: %d\n", pid); ++ } ++ sys_close(fd); ++ return ret; ++} + + /* + * The main routine to restore task via sigreturn. +@@ -1831,7 +1847,7 @@ long __export_restore_task(struct task_restore_args *args) + long parent_tid; + int i, fd = -1; + +- if (!args->has_clone3_set_tid) { ++ if (!args->has_clone3_set_tid && !args->use_fork_pid) { + /* One level pid ns hierarhy */ + fd = sys_openat(args->proc_fd, LAST_PID_PATH, O_RDWR, 0); + if (fd < 0) { +@@ -1863,6 +1879,13 @@ long __export_restore_task(struct task_restore_args *args) + c_args.parent_tid = ptr_to_u64(&parent_tid); + pr_debug("Using clone3 to restore the process\n"); + RUN_CLONE3_RESTORE_FN(ret, c_args, sizeof(c_args), &thread_args[i], args->clone_restore_fn); ++ } else if (args->use_fork_pid) { ++ if (write_fork_pid(thread_args[i].pid) < 0) { ++ pr_err("Clone fail with fork pid\n"); ++ mutex_unlock(&task_entries_local->last_pid_mutex); ++ goto core_restore_end; ++ } ++ RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, thread_args, args->clone_restore_fn); + } else { + last_pid_len = std_vprint_num(last_pid_buf, sizeof(last_pid_buf), thread_args[i].pid - 1, &s); + sys_lseek(fd, 0, SEEK_SET); +-- +2.34.0 + diff --git a/backport-0006--notifier-add-notifier-calling-method-for-checkpoint-.patch b/backport-0006--notifier-add-notifier-calling-method-for-checkpoint-.patch new file mode 100644 index 0000000..f22a080 --- /dev/null +++ b/backport-0006--notifier-add-notifier-calling-method-for-checkpoint-.patch @@ -0,0 +1,650 @@ +From 4dd270f407dc9c4ce002fd706604a68f131eb4a9 Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 1 Dec 2021 10:54:43 +0800 +Subject: [PATCH 06/49] notifier: add notifier calling method for checkpoint + and restore + +Add notifier calling method for checkpoint and restore during kernel 
module upgrading. + +Signed-off-by: Xiaoguang Li +Signed-off-by: He Jingxian +Signed-off-by: fu.lin +--- + criu/config.c | 1 + + criu/cr-dump.c | 33 ++++++++++ + criu/cr-restore.c | 22 ++++++- + criu/crtools.c | 3 + + criu/include/cr_options.h | 1 + + criu/include/restorer.h | 1 + + criu/include/util.h | 42 ++++++++++++ + criu/pie/restorer.c | 135 ++++++++++++++++++++++++++++++++++---- + criu/pie/util.c | 91 +++++++++++++++++++++++++ + include/common/lock.h | 4 ++ + 10 files changed, 319 insertions(+), 14 deletions(-) + +diff --git a/criu/config.c b/criu/config.c +index fdd8e07..2bee578 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -520,6 +520,7 @@ int parse_options(int argc, char **argv, bool *usage_error, + BOOL_OPT("with-cpu-affinity", &opts.with_cpu_affinity), + BOOL_OPT("pin-memory", &opts.pin_memory), + BOOL_OPT("use-fork-pid", &opts.use_fork_pid), ++ BOOL_OPT("with-notifier", &opts.with_notifier_kup), + { }, + }; + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index b125985..6a8ed18 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -1683,6 +1683,8 @@ static int cr_lazy_mem_dump(void) + return ret; + } + ++static enum notifier_state notifier_state = NOTHING_COMPLETE; ++ + static int cr_dump_finish(int ret) + { + int post_dump_ret = 0; +@@ -1766,6 +1768,20 @@ static int cr_dump_finish(int ret) + restore_task_special_pages(0); + } + ++ if (ret != 0 && opts.with_notifier_kup) { ++ pr_info("call notifier rollback\n"); ++ switch (notifier_state) { ++ case PRE_FREEZE_COMPLETE: ++ notifier_kup(PRE_FREEZE, ROLLBACK, true); ++ break; ++ case FREEZE_TO_KILL_COMPLETE: ++ notifier_kup(FREEZE_TO_KILL, ROLLBACK, true); ++ break; ++ default: ++ break; ++ } ++ } ++ + if (ret) { + pr_err("Dumping FAILED.\n"); + } else { +@@ -1799,6 +1815,14 @@ int cr_dump_tasks(pid_t pid) + goto err; + root_item->pid->real = pid; + ++ if (notifier_kup(PRE_FREEZE, PREPARE, opts.with_notifier_kup)) { ++ /* disable rollback function because we has already rollbacked. 
*/ ++ opts.with_notifier_kup = false; ++ pr_err("call notifier: %d err\n", PRE_FREEZE); ++ goto err; ++ } else ++ notifier_state = PRE_FREEZE_COMPLETE; ++ + pre_dump_ret = run_scripts(ACT_PRE_DUMP); + if (pre_dump_ret != 0) { + pr_err("Pre dump script failed with %d!\n", pre_dump_ret); +@@ -1946,6 +1970,15 @@ int cr_dump_tasks(pid_t pid) + ret = write_img_inventory(&he); + if (ret) + goto err; ++ ++ ret = notifier_kup(FREEZE_TO_KILL, PREPARE, opts.with_notifier_kup); ++ if (ret) { ++ opts.with_notifier_kup = false; ++ pr_err("call notifier:%d err\n", FREEZE_TO_KILL); ++ goto err; ++ } else ++ notifier_state = FREEZE_TO_KILL_COMPLETE; ++ + err: + if (parent_ie) + inventory_entry__free_unpacked(parent_ie, NULL); +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 9bc5262..6817c88 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -1866,8 +1866,10 @@ static int restore_task_with_children(void *_arg) + return 0; + + err: +- if (current->parent == NULL) ++ if (current->parent == NULL) { ++ do_notifier_rollback(opts.with_notifier_kup, POST_UPDATE_KERNEL_COMPLETE); + futex_abort_and_wake(&task_entries->nr_in_progress); ++ } + exit(1); + } + +@@ -2306,8 +2308,10 @@ skip_ns_bouncing: + */ + attach_to_tasks(root_seized); + +- if (restore_switch_stage(CR_STATE_RESTORE_CREDS)) ++ if (restore_switch_stage(CR_STATE_RESTORE_CREDS)) { ++ pr_err("Can't switch to CR_STATE_RESTORE_CREDS stage\n"); + goto out_kill_network_unlocked; ++ } + + timing_stop(TIME_RESTORE); + +@@ -2481,6 +2485,15 @@ int cr_restore_tasks(void) + goto err; + + ret = restore_root_task(root_item); ++ if (ret) ++ goto err; ++ ++ ret = notifier_kup(POST_RUN, PREPARE, opts.with_notifier_kup); ++ if (ret < 0) { ++ opts.with_notifier_kup = false; ++ pr_err("calling POST_RUN notifier list return err"); ++ } ++ + err: + cr_plugin_fini(CR_PLUGIN_STAGE__RESTORE, ret); + return ret; +@@ -3660,6 +3673,8 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns + */ + 
task_args->lsm_type = kdat.lsm; + ++ task_args->with_notifier_kup = opts.with_notifier_kup; ++ + /* + * Make root and cwd restore _that_ late not to break any + * attempts to open files by paths above (e.g. /proc). +@@ -3703,6 +3718,9 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns + err: + free_mappings(&self_vmas); + err_nv: ++ if (current->parent == NULL && opts.with_notifier_kup) ++ do_notifier_rollback(true, POST_UPDATE_KERNEL_COMPLETE); ++ + /* Just to be sure */ + exit(1); + return -1; +diff --git a/criu/crtools.c b/criu/crtools.c +index 15f2fc9..dbdea53 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -392,6 +392,9 @@ usage: + " same cpu quantity.\n" + " --pin-memory Use pin memory method for checkpoint and restore.\n" + " --use-fork-pid Allow to restore task pid by setting fork pid of task struct.\n" ++" --with-notifier Allow to checkout/restore kup notifier chain. This\n" ++" feature needs the kernel's assistance.\n" ++" Only for the host with these feature.\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index a91bfff..b1caddc 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -147,6 +147,7 @@ struct cr_options { + int with_cpu_affinity; /* restore cpu affinity */ + int pin_memory; + int use_fork_pid; ++ int with_notifier_kup; + }; + + extern struct cr_options opts; +diff --git a/criu/include/restorer.h b/criu/include/restorer.h +index 481eeaa..10f3684 100644 +--- a/criu/include/restorer.h ++++ b/criu/include/restorer.h +@@ -227,6 +227,7 @@ struct task_restore_args { + bool has_clone3_set_tid; + bool pin_memory; + bool use_fork_pid; ++ bool with_notifier_kup; + } __aligned(64); + + /* +diff --git a/criu/include/util.h b/criu/include/util.h +index 313aacd..351d72e 100644 +--- a/criu/include/util.h ++++ b/criu/include/util.h +@@ -13,6 +13,8 @@ + #include + 
#include + #include ++#include ++#include + + #include "int.h" + #include "common/compiler.h" +@@ -380,4 +382,44 @@ static inline void print_stack_trace(pid_t pid) {} + ___ret; \ + }) + ++#define NOTIFY_PROC_PATH "/sys/kernel/modrestore/nvwa_notifier" ++ ++#if __has_include("linux/modrestore.h") ++#define CONFIG_EULEROS_MODRESTORE_NOTIFY ++# include ++#else ++enum KUP_HOOK_POINT { ++ PRE_FREEZE, ++ FREEZE_TO_KILL, ++ PRE_UPDATE_KERNEL, ++ POST_UPDATE_KERNEL, ++ UNFREEZE_TO_RUN, ++ POST_RUN, ++ ++ KUP_HOOK_MAX, ++}; ++ ++enum nvwa_cmd { ++ PREPARE = 0, ++ ROLLBACK, ++ ++ NVWA_CMD_MAX, ++}; ++#endif ++ ++enum notifier_state { ++ NOTHING_COMPLETE, ++ PRE_FREEZE_COMPLETE, ++ FREEZE_TO_KILL_COMPLETE, ++ PRE_UPDATE_KERNEL_COMPLETE, ++ POST_UPDATE_KERNEL_COMPLETE, ++ UNFREEZE_TO_RUN_COMPLETE, ++ POST_RUN_COMPLETE, ++ ++ NOTIFIER_ROLLBACK_DONE = 0xfc17173b, /* has done rollback */ ++}; ++ ++int notifier_kup(enum KUP_HOOK_POINT, enum nvwa_cmd, bool); ++void do_notifier_rollback(bool, enum notifier_state); ++ + #endif /* __CR_UTIL_H__ */ +diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c +index f4f4e6a..7fbf788 100644 +--- a/criu/pie/restorer.c ++++ b/criu/pie/restorer.c +@@ -77,6 +77,7 @@ + + static struct task_entries *task_entries_local; + static futex_t thread_inprogress; ++static futex_t thread_start; + static pid_t *helpers; + static int n_helpers; + static pid_t *zombies; +@@ -119,10 +120,28 @@ void parasite_cleanup(void) + extern void cr_restore_rt (void) asm ("__cr_restore_rt") + __attribute__ ((visibility ("hidden"))); + ++static int args_with_notifier_kup; ++static enum notifier_state notifier_state = POST_UPDATE_KERNEL_COMPLETE; ++static futex_t notifier_done; ++ + static void sigchld_handler(int signal, siginfo_t *siginfo, void *data) + { + char *r; + int i; ++ rt_sigaction_t act; ++ ++ if (signal == SIGSEGV || signal == SIGBUS || signal == SIGILL) { ++ /* Make sure we exit with the right signal at the end. 
So for instance ++ * the core will be dumped if enabled. */ ++ pr_info("recv signal: %d\n", signal); ++ do_notifier_rollback(args_with_notifier_kup, notifier_state); ++ ksigemptyset (&act.rt_sa_mask); ++ act.rt_sa_flags = SA_SIGINFO | SA_RESTART; ++ act.rt_sa_handler = (rt_sighandler_t)SIG_DFL; ++ sys_sigaction(signal, &act, NULL, sizeof(k_rtsigset_t)); ++ sys_kill(sys_getpid(),signal); ++ return; ++ } + + /* We can ignore helpers that die, we expect them to after + * CR_STATE_RESTORE is finished. */ +@@ -149,10 +168,14 @@ static void sigchld_handler(int signal, siginfo_t *siginfo, void *data) + + pr_info("Task %d %s %d\n", siginfo->si_pid, r, siginfo->si_status); + ++ pr_info("%s: trace do_notifier_rollback\n", __func__); ++ do_notifier_rollback(args_with_notifier_kup, notifier_state); + futex_abort_and_wake(&task_entries_local->nr_in_progress); + /* sa_restorer may be unmaped, so we can't go back to userspace*/ + sys_kill(sys_getpid(), SIGSTOP); + sys_exit_group(1); ++ ++ /* for notifier, do nothing when receiving SIGCHLD signal */ + } + + static int lsm_set_label(char *label, char *type, int procfd) +@@ -604,6 +627,27 @@ static void noinline rst_sigreturn(unsigned long new_sp, + ARCH_RT_SIGRETURN(new_sp, sigframe); + } + ++/* Notice: only one task, so it isn't necessary to consider concurrent. */ ++static int do_notifier(bool *notify) ++{ ++ int retval = 0; ++ ++ if (!*notify) ++ return 0; ++ ++ pr_info("unfreeze_to_run restore notifier\n"); ++ retval = notifier_kup(UNFREEZE_TO_RUN, PREPARE, true); ++ if (retval) { ++ *notify = false; ++ notifier_state = NOTIFIER_ROLLBACK_DONE; ++ pr_err("call notifier: %d err\n", UNFREEZE_TO_RUN); ++ } ++ ++ notifier_state = UNFREEZE_TO_RUN_COMPLETE; ++ ++ return retval; ++} ++ + /* + * Threads restoration via sigreturn. Note it's locked + * routine and calls for unlock at the end. 
+@@ -642,12 +686,18 @@ long __export_restore_thread(struct thread_restore_args *args) + + pr_info("%ld: Restored\n", sys_gettid()); + +- restore_finish_stage(task_entries_local, CR_STATE_RESTORE); ++ if (!!(restore_finish_stage(task_entries_local, CR_STATE_RESTORE) & FUTEX_ABORT_FLAG)) { ++ pr_err("%s: abort by CR_STATE_RESTORE\n", __func__); ++ goto core_restore_end; ++ } + + if (restore_signals(args->siginfo, args->siginfo_n, false)) + goto core_restore_end; + +- restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD); ++ if (!!(restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD) & FUTEX_ABORT_FLAG)) { ++ pr_err("%s: abort by CR_STATE_RESTORE_SIGCHLD\n", __func__); ++ goto core_restore_end; ++ } + + /* + * Make sure it's before creds, since it's privileged +@@ -663,16 +713,29 @@ long __export_restore_thread(struct thread_restore_args *args) + if (ret) + BUG(); + +- restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS); ++ if (!!(restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS) & FUTEX_ABORT_FLAG)) { ++ pr_err("%s: abort by CR_STATE_RESTORE_CREDS\n", __func__); ++ goto core_restore_end; ++ } + + futex_dec_and_wake(&thread_inprogress); ++ futex_wait_while(&thread_start, 0); ++ if (!!(futex_get(&thread_start) & FUTEX_ABORT_FLAG)) { ++ pr_err("%s: abort by thread_start\n", __func__); ++ goto wait_notifier; ++ } + + new_sp = (long)rt_sigframe + RT_SIGFRAME_OFFSET(rt_sigframe); + rst_sigreturn(new_sp, rt_sigframe); + + core_restore_end: +- pr_err("Restorer abnormal termination for %ld\n", sys_getpid()); +- futex_abort_and_wake(&task_entries_local->nr_in_progress); ++ futex_abort_and_wake(&thread_start); ++ futex_abort_and_wake(&task_entries_local->start); ++ ++wait_notifier: ++ pr_err("%s: Restorer abnormal termination for %ld\n", __func__, sys_getpid()); ++ futex_wait_while(&notifier_done, 0); ++ + sys_exit_group(1); + return -1; + } +@@ -1468,6 +1531,10 @@ long __export_restore_task(struct task_restore_args *args) +
rt_sigaction_t act; + bool has_vdso_proxy; + ++ futex_set(&thread_inprogress, 1); ++ futex_set(&thread_start, 0); ++ futex_set(&notifier_done, 0); ++ + bootstrap_start = args->bootstrap_start; + bootstrap_len = args->bootstrap_len; + +@@ -1484,6 +1551,7 @@ long __export_restore_task(struct task_restore_args *args) + #ifdef ARCH_HAS_LONG_PAGES + __page_size = args->page_size; + #endif ++ args_with_notifier_kup = args->with_notifier_kup; + + ksigfillset(&act.rt_sa_mask); + act.rt_sa_handler = sigchld_handler; +@@ -1494,9 +1562,29 @@ long __export_restore_task(struct task_restore_args *args) + pr_err("Failed to set SIGCHLD %ld\n", ret); + goto core_restore_end; + } ++ ret = sys_sigaction(SIGSEGV, &act, NULL, sizeof(k_rtsigset_t)); ++ if (ret) { ++ pr_err("Failed to set SIGCHLD %ld\n", ret); ++ goto core_restore_end; ++ } ++ ++ ret = sys_sigaction(SIGBUS, &act, NULL, sizeof(k_rtsigset_t)); ++ if (ret) { ++ pr_err("Failed to set SIGCHLD %ld\n", ret); ++ goto core_restore_end; ++ } ++ ++ ret = sys_sigaction(SIGILL, &act, NULL, sizeof(k_rtsigset_t)); ++ if (ret) { ++ pr_err("Failed to set SIGCHLD %ld\n", ret); ++ goto core_restore_end; ++ } + + ksigemptyset(&to_block); + ksigaddset(&to_block, SIGCHLD); ++ ksigaddset(&to_block, SIGSEGV); ++ ksigaddset(&to_block, SIGBUS); ++ ksigaddset(&to_block, SIGILL); + ret = sys_sigprocmask(SIG_UNBLOCK, &to_block, NULL, sizeof(k_rtsigset_t)); + if (ret) { + pr_err("Failed to unblock SIGCHLD %ld\n", ret); +@@ -1909,7 +1997,8 @@ long __export_restore_task(struct task_restore_args *args) + pr_err("Unable to create a thread: %ld\n", ret); + mutex_unlock(&task_entries_local->last_pid_mutex); + goto core_restore_end; +- } ++ } else ++ futex_inc(&thread_inprogress); + } + + mutex_unlock(&task_entries_local->last_pid_mutex); +@@ -1933,7 +2022,14 @@ long __export_restore_task(struct task_restore_args *args) + + pr_info("%ld: Restored\n", sys_getpid()); + +- restore_finish_stage(task_entries_local, CR_STATE_RESTORE); ++ if
(!!(restore_finish_stage(task_entries_local, CR_STATE_RESTORE) & FUTEX_ABORT_FLAG)) { ++ pr_err("%s: abort by CR_STATE_RESTORE\n", __func__); ++ goto core_restore_end; ++ } ++ ++ ret = do_notifier(&args->with_notifier_kup); ++ if (ret) ++ goto core_restore_end; + + if (cleanup_current_inotify_events(args)) + goto core_restore_end; +@@ -1981,7 +2077,8 @@ long __export_restore_task(struct task_restore_args *args) + if (ret) + goto core_restore_end; + +- restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD); ++ if (!!(restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD) & FUTEX_ABORT_FLAG)) ++ goto core_restore_end; + + rst_tcp_socks_all(args); + +@@ -2003,15 +2100,20 @@ long __export_restore_task(struct task_restore_args *args) + ret = ret || restore_pdeath_sig(args->t); + ret = ret || restore_child_subreaper(args->child_subreaper); + +- futex_set_and_wake(&thread_inprogress, args->nr_threads); +- +- restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS); ++ if (!!(restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS) & FUTEX_ABORT_FLAG)) ++ goto core_restore_end; + + if (ret) + BUG(); + + /* Wait until children stop to use args->task_entries */ + futex_wait_while_gt(&thread_inprogress, 1); ++ if (!!(futex_get(&thread_start) & FUTEX_ABORT_FLAG)) { ++ pr_err("%s: terminate by main thread futex_start\n", __func__); ++ goto handle_notifier; ++ } ++ ++ futex_set_and_wake(&thread_start, 1); + + sys_close(args->proc_fd); + std_log_set_fd(-1); +@@ -2049,8 +2151,17 @@ long __export_restore_task(struct task_restore_args *args) + rst_sigreturn(new_sp, rt_sigframe); + + core_restore_end: +- futex_abort_and_wake(&task_entries_local->nr_in_progress); ++ futex_abort_and_wake(&thread_start); ++ futex_abort_and_wake(&task_entries_local->start); ++ ++handle_notifier: ++ do_notifier_rollback(args->with_notifier_kup, notifier_state); ++ ++ futex_abort_and_wake(&task_entries_local->nr_in_progress); /* notifier the criu main process */ + 
pr_err("Restorer fail %ld\n", sys_getpid()); ++ ++ futex_set_and_wake(&notifier_done, 1); /* wake all other threads to exit */ ++ + sys_exit_group(1); + return -1; + } +diff --git a/criu/pie/util.c b/criu/pie/util.c +index 4945483..752e5d0 100644 +--- a/criu/pie/util.c ++++ b/criu/pie/util.c +@@ -11,6 +11,7 @@ + #include "fcntl.h" + #include "log.h" + #include "util-pie.h" ++#include "util.h" + + #ifdef CR_NOGLIBC + # include +@@ -52,3 +53,93 @@ err_close: + __sys(close)(fd); + return -1; + } ++ ++#define KUP_BUF_SIZE 256 ++ ++static int int_to_string(unsigned number, char *buf, size_t total) { ++ unsigned remainder, quotient, i, len; ++ ++ quotient = number; ++ len = 0; ++ do { ++ quotient /= 10; ++ len += 1; ++ } while (quotient > 0); ++ ++ if (len > total - 1) ++ return -1; ++ ++ quotient = number; ++ i = 1; ++ do { ++ remainder = quotient % 10; ++ quotient = quotient / 10; ++ buf[len-i] = '0' + remainder; ++ i++; ++ } while (quotient > 0); ++ buf[len] = '\0'; ++ ++ return len == 0 ? -1 : len; ++} ++ ++int notifier_kup(enum KUP_HOOK_POINT action, enum nvwa_cmd cmd, bool enable) ++{ ++ int fd, count = 0, retval = 0; ++ char buf[KUP_BUF_SIZE] = {0}; ++ ++ if (!enable) ++ return 0; ++ ++ fd = __sys(open)(NOTIFY_PROC_PATH, O_WRONLY, 0); ++ if (fd == -EACCES) { ++ /* there is no priviledge to open file, ignore this condition.
*/ ++ pr_info("%s: open %s failed, retval: %d (-EACCES)\n", ++ __func__, NOTIFY_PROC_PATH, -EACCES); ++ return 0; ++ } else if (fd < 0) { ++ __pr_perror("%s: Can't open %s: %d\n", __func__, NOTIFY_PROC_PATH, fd); ++ return fd; ++ } ++ ++ retval = int_to_string(action, buf, sizeof(buf)-count); ++ if (retval <= 0) { ++ __pr_perror("%s: int_to_string error\n", __func__); ++ goto err_close; ++ } ++ ++ buf[retval] = ':'; ++ count = retval + 1; ++ ++ retval = int_to_string(cmd, buf+count, sizeof(buf)-count); ++ if (retval <= 0) { ++ __pr_perror("%s: int_to_string error\n", __func__); ++ goto err_close; ++ } ++ ++ count += retval; ++ retval = __sys(write)(fd, buf, count); ++ if (retval < 0) ++ __pr_perror("%s: Can't write to %s\n", __func__, NOTIFY_PROC_PATH); ++ ++err_close: ++ __sys(close)(fd); ++ ++ return retval < 0 ? -1 : 0; ++} ++ ++void do_notifier_rollback(bool rollback, enum notifier_state status) ++{ ++ if (!rollback) ++ return; ++ ++ switch (status) { ++ case POST_UPDATE_KERNEL_COMPLETE: ++ notifier_kup(POST_UPDATE_KERNEL, ROLLBACK, true); ++ break; ++ case UNFREEZE_TO_RUN_COMPLETE: ++ notifier_kup(UNFREEZE_TO_RUN, ROLLBACK, true); ++ break; ++ default: ++ break; ++ } ++} +diff --git a/include/common/lock.h b/include/common/lock.h +index 4782b63..3db17ae 100644 +--- a/include/common/lock.h ++++ b/include/common/lock.h +@@ -106,6 +106,10 @@ static inline void futex_inc_and_wake(futex_t *f) + LOCK_BUG_ON(sys_futex((uint32_t *)&f->raw.counter, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) < 0); + } + ++static inline uint32_t futex_inc_return(futex_t *f) { ++ return atomic_inc_return(&f->raw); ++} ++ + /* Plain increment futex @f value */ + static inline void futex_inc(futex_t *f) { atomic_inc(&f->raw); } + +-- +2.34.0 + diff --git a/backport-0007--cred-provide-cred-checkpoint-restore-method.patch b/backport-0007--cred-provide-cred-checkpoint-restore-method.patch new file mode 100644 index 0000000..82ce6b7 --- /dev/null +++ 
b/backport-0007--cred-provide-cred-checkpoint-restore-method.patch @@ -0,0 +1,254 @@ +From ca6c1bd2e064414b972d6d9370e3a8a7bc855c01 Mon Sep 17 00:00:00 2001 +From: luolongjun +Date: Wed, 1 Dec 2021 11:00:34 +0800 +Subject: [PATCH 07/49] cred: provide cred checkpoint restore method + +criu checkpoint/restore the task, it only restore the context instead of +the memory address storing the context. + +To handle the problem resulted by CVE bugfix, details: +- https://nvd.nist.gov/vuln/detail/CVE-2016-4565 +- https://openfabrics.org/images/2018workshop/presentations/113_MRuhl_JourneytoVerbsIOCTL.pdf + +Brief: + Refresh the security context address of file. The infiniband code use +`write()` as bi-directional `ioctl()`, there is `struct cred` address +during `write()` process. However, criu uses some syscall, such as +`capset()` and `setgroups()`, to regenerate the new cred, the file +cred is fixed by `fcntl(F_SETOWN)`, then the address of new cred is +different from the file. + This patch fix the `struct cred` address checking problem resulted by +CVE fixed in infiniband drivers.
+ +Signed-off-by: luolongjun +Signed-off-by: fu.lin +--- + criu/config.c | 1 + + criu/cr-restore.c | 35 +++++++++++++++++++++++++++++++++++ + criu/crtools.c | 2 ++ + criu/include/cr_options.h | 1 + + criu/include/fcntl.h | 4 ++++ + criu/include/prctl.h | 4 ++++ + criu/include/restorer.h | 3 +++ + criu/pie/restorer.c | 38 ++++++++++++++++++++++++++++++++++++++ + 8 files changed, 88 insertions(+) + +diff --git a/criu/config.c b/criu/config.c +index 2bee578..98789a7 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -521,6 +521,7 @@ int parse_options(int argc, char **argv, bool *usage_error, + BOOL_OPT("pin-memory", &opts.pin_memory), + BOOL_OPT("use-fork-pid", &opts.use_fork_pid), + BOOL_OPT("with-notifier", &opts.with_notifier_kup), ++ BOOL_OPT("with-fd-cred", &opts.with_fd_cred), + { }, + }; + +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 6817c88..52386b8 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -691,6 +691,28 @@ static int __collect_child_pids(struct pstree_item *p, int state, unsigned int * + return 0; + } + ++static int collect_child_fds(int state, unsigned int *n, struct pstree_item *me) ++{ ++ struct list_head *list = &rsti(me)->fds; ++ struct fdinfo_list_entry *fle, *tmp; ++ ++ *n = 0; ++ list_for_each_entry_safe(fle, tmp, list, ps_list) { ++ if (fle->fe->type == state) { ++ int *child; ++ ++ child = rst_mem_alloc(sizeof(*child), RM_PRIVATE); ++ if (!child) ++ return -1; ++ ++ (*n)++; ++ *child = fle->fe->fd; ++ } ++ } ++ ++ return 0; ++} ++ + static int collect_child_pids(int state, unsigned int *n) + { + struct pstree_item *pi; +@@ -715,6 +737,12 @@ static int collect_child_pids(int state, unsigned int *n) + return __collect_child_pids(current, state, n); + } + ++static int collect_chr_fds(struct pstree_item *me, struct task_restore_args *ta) ++{ ++ ta->setcred_pids = (int *)rst_mem_align_cpos(RM_PRIVATE); ++ return collect_child_fds(FD_TYPES__CHR, &ta->setcred_pids_n, me); ++} ++ + static int collect_helper_pids(struct 
task_restore_args *ta) + { + ta->helpers = (pid_t *)rst_mem_align_cpos(RM_PRIVATE); +@@ -918,6 +946,9 @@ static int restore_one_alive_task(int pid, CoreEntry *core) + if (collect_zombie_pids(ta) < 0) + return -1; + ++ if (opts.with_fd_cred && collect_chr_fds(current, ta) < 0) ++ return -1; ++ + if (collect_inotify_fds(ta) < 0) + return -1; + +@@ -3541,6 +3572,10 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns + RST_MEM_FIXUP_PPTR(task_args->helpers); + RST_MEM_FIXUP_PPTR(task_args->zombies); + RST_MEM_FIXUP_PPTR(task_args->vma_ios); ++ if (opts.with_fd_cred) ++ RST_MEM_FIXUP_PPTR(task_args->setcred_pids); ++ else ++ task_args->setcred_pids_n = UINT_MAX; + RST_MEM_FIXUP_PPTR(task_args->inotify_fds); + + task_args->compatible_mode = core_is_compat(core); +diff --git a/criu/crtools.c b/criu/crtools.c +index dbdea53..da1cdd1 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -395,6 +395,8 @@ usage: + " --with-notifier Allow to checkout/restore kup notifier chain. 
This\n" + " feature needs the kernel's assistance.\n" + " Only for the host with these feature.\n" ++" --with-fd-cred Allow to make the restored process has the same cred\n" ++" as checkout assisted by kernel.\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index b1caddc..191153b 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -148,6 +148,7 @@ struct cr_options { + int pin_memory; + int use_fork_pid; + int with_notifier_kup; ++ int with_fd_cred; + }; + + extern struct cr_options opts; +diff --git a/criu/include/fcntl.h b/criu/include/fcntl.h +index d9c5c5e..3abcd51 100644 +--- a/criu/include/fcntl.h ++++ b/criu/include/fcntl.h +@@ -19,6 +19,10 @@ struct f_owner_ex { + #define F_GETOWNER_UIDS 17 + #endif + ++#ifndef F_SETCRED ++#define F_SETCRED 18 ++#endif ++ + /* + * These things are required to compile on CentOS-6 + */ +diff --git a/criu/include/prctl.h b/criu/include/prctl.h +index 8e7fef3..ecbc69a 100644 +--- a/criu/include/prctl.h ++++ b/criu/include/prctl.h +@@ -82,4 +82,8 @@ struct prctl_mm_map { + # define PR_GET_THP_DISABLE 42 + #endif + ++#ifndef PR_DEFAULT_CRED ++# define PR_DEFAULT_CRED 54 ++#endif ++ + #endif /* __CR_PRCTL_H__ */ +diff --git a/criu/include/restorer.h b/criu/include/restorer.h +index 10f3684..4954915 100644 +--- a/criu/include/restorer.h ++++ b/criu/include/restorer.h +@@ -180,6 +180,9 @@ struct task_restore_args { + pid_t *zombies; + unsigned int zombies_n; + ++ int *setcred_pids; ++ unsigned int setcred_pids_n; ++ + int *inotify_fds; /* fds to cleanup inotify events at CR_STATE_RESTORE_SIGCHLD stage */ + unsigned int inotify_fds_n; + +diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c +index 7fbf788..c3b3da0 100644 +--- a/criu/pie/restorer.c ++++ b/criu/pie/restorer.c +@@ -78,6 +78,7 @@ + static struct task_entries *task_entries_local; + static futex_t thread_inprogress; 
+ static futex_t thread_start; ++static futex_t cred_set; + static pid_t *helpers; + static int n_helpers; + static pid_t *zombies; +@@ -345,6 +346,41 @@ static int restore_creds(struct thread_creds_args *args, int procfd, + return 0; + } + ++static int update_cred_ref(struct task_restore_args *ta) ++{ ++ int i; ++ int ret; ++ int pid = sys_getpid(); ++ long int tid = sys_gettid(); ++ ++ if (ta->setcred_pids_n == UINT_MAX) { ++ pr_info("no need to keep the same cred \n"); ++ return 0; ++ } ++ ++ if (pid == tid) { ++ /* let main thread finish cred update first */ ++ ret = sys_prctl(PR_DEFAULT_CRED, 0, 0, 0, 0); ++ pr_info("main cred restore \n"); ++ futex_set_and_wake(&cred_set, 1); ++ } else { ++ futex_wait_until(&cred_set, 1); ++ pr_info("other cred restore \n"); ++ ret = sys_prctl(PR_DEFAULT_CRED, 0, 0, 0, 0); ++ } ++ ++ if (ret) ++ return ret; ++ ++ pr_info("%ld (%d) is going to update current cred \n", tid, pid); ++ ++ for (i = 0; i < ta->setcred_pids_n; i++) { ++ sys_fcntl(ta->setcred_pids[i], F_SETCRED, 0); ++ } ++ ++ return 0; ++} ++ + /* + * This should be done after creds restore, as + * some creds changes might drop the value back +@@ -708,6 +744,7 @@ long __export_restore_thread(struct thread_restore_args *args) + + ret = restore_creds(args->creds_args, args->ta->proc_fd, + args->ta->lsm_type); ++ ret = ret || update_cred_ref(args->ta); + ret = ret || restore_dumpable_flag(&args->ta->mm); + ret = ret || restore_pdeath_sig(args); + if (ret) +@@ -2096,6 +2133,7 @@ long __export_restore_task(struct task_restore_args *args) + */ + ret = restore_creds(args->t->creds_args, args->proc_fd, + args->lsm_type); ++ ret = ret || update_cred_ref(args); + ret = ret || restore_dumpable_flag(&args->mm); + ret = ret || restore_pdeath_sig(args->t); + ret = ret || restore_child_subreaper(args->child_subreaper); +-- +2.34.0 + diff --git a/backport-0008--block-device-dump-block-device-as-reguler-file.patch b/backport-0008--block-device-dump-block-device-as-reguler-file.patch 
new file mode 100644 index 0000000..206c44f --- /dev/null +++ b/backport-0008--block-device-dump-block-device-as-reguler-file.patch @@ -0,0 +1,60 @@ +From bb0edc8da9032a502181c5390084642da5c2cb3c Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 19 May 2021 21:49:15 +0800 +Subject: [PATCH 08/49] block-device: dump block device as reguler file + +Add block device dump and restore method for kernel module upgrading. + +Signed-off-by: Xiaoguang Li +--- + criu/files.c | 27 +++++++++++++++++++++++++++ + 1 file changed, 27 insertions(+) + +diff --git a/criu/files.c b/criu/files.c +index ffdaa45..82ac3a3 100644 +--- a/criu/files.c ++++ b/criu/files.c +@@ -438,6 +438,30 @@ static const struct fdtype_ops *get_mem_dev_ops(struct fd_parms *p, int minor) + return ops; + } + ++static int dump_blkdev(struct fd_parms *p, int lfd, FdinfoEntry *e) ++{ ++ struct fd_link *link_old = p->link; ++ int maj = major(p->stat.st_rdev); ++ const struct fdtype_ops *ops; ++ int err; ++ ++ switch (maj) { ++ case SCSI_DISK0_MAJOR: ++ ops = ®file_dump_ops; ++ break; ++ default: { ++ char more[32] = "block_dev"; ++ ++ err = dump_unsupp_fd(p, lfd, "blk", more, e); ++ p->link = link_old; ++ return err; ++ } ++ } ++ err = do_dump_gen_file(p, lfd, ops, e); ++ p->link = link_old; ++ return err; ++} ++ + static int dump_chrdev(struct fd_parms *p, int lfd, FdinfoEntry *e) + { + struct fd_link *link_old = p->link; +@@ -505,6 +529,9 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, + p.fd_ctl = ctl; /* Some dump_opts require this to talk to parasite */ + p.dfds = dfds; /* epoll needs to verify if target fd exist */ + ++ if (S_ISBLK(p.stat.st_mode)) ++ return dump_blkdev(&p, lfd, e); ++ + if (S_ISSOCK(p.stat.st_mode)) + return dump_socket(&p, lfd, e); + +-- +2.34.0 + diff --git a/backport-0009--anon-inode-add-support-for-anon-inode-fd.patch b/backport-0009--anon-inode-add-support-for-anon-inode-fd.patch new file mode 100644 index 0000000..4788365 --- /dev/null +++ 
b/backport-0009--anon-inode-add-support-for-anon-inode-fd.patch @@ -0,0 +1,354 @@ +From 3fcb691774825400a3fd1030586ba8a55ff38be1 Mon Sep 17 00:00:00 2001 +From: Xiaoguang Li +Date: Wed, 1 Dec 2021 11:06:06 +0800 +Subject: [PATCH 09/49] anon-inode: add support for anon inode fd + +Add support for anon inode fd dump and restore during module upgrade. + +Signed-off-by: Xiaoguang Li +Signed-off-by: Jingxian He + +Signed-off-by: fu.lin +--- + criu/cr-restore.c | 3 +++ + criu/files-reg.c | 3 ++- + criu/include/image.h | 1 + + criu/include/mem.h | 1 + + criu/include/restorer.h | 6 ++++++ + criu/mem.c | 24 +++++++++++++++++++++++- + criu/pie/restorer.c | 32 ++++++++++++++++++++++++++++++++ + criu/proc_parse.c | 36 ++++++++++++++++++++++++++++++------ + images/vma.proto | 1 + + 9 files changed, 99 insertions(+), 8 deletions(-) + +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 52386b8..ec59b69 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -981,6 +981,8 @@ static int restore_one_alive_task(int pid, CoreEntry *core) + if (prepare_vmas(current, ta)) + return -1; + ++ if (prepare_vma_names(current, ta)) ++ return -1; + /* + * Sockets have to be restored in their network namespaces, + * so a task namespace has to be restored after sockets. 
+@@ -3562,6 +3564,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns + #endif + + RST_MEM_FIXUP_PPTR(task_args->vmas); ++ RST_MEM_FIXUP_PPTR(task_args->vma_names); + RST_MEM_FIXUP_PPTR(task_args->rings); + RST_MEM_FIXUP_PPTR(task_args->tcp_socks); + RST_MEM_FIXUP_PPTR(task_args->timerfd); +diff --git a/criu/files-reg.c b/criu/files-reg.c +index 28c3360..17b3234 100644 +--- a/criu/files-reg.c ++++ b/criu/files-reg.c +@@ -2125,7 +2125,7 @@ int do_open_reg_noseek_flags(int ns_root_fd, struct reg_file_info *rfi, void *ar + + /* unnamed temporary files are restored as ghost files */ + flags &= ~O_TMPFILE; +- ++ pr_info("openat path is: %s\n", rfi->path); + fd = openat(ns_root_fd, rfi->path, flags); + if (fd < 0) { + pr_perror("Can't open file %s on restore", rfi->path); +@@ -2285,6 +2285,7 @@ int collect_filemap(struct vma_area *vma) + if (!fd) + return -1; + ++ pr_info("find fd for %lx, shmid: %lx\n", vma->e->start, vma->e->shmid); + vma->vmfd = fd; + vma->vm_open = open_filemap; + return 0; +diff --git a/criu/include/image.h b/criu/include/image.h +index 2baa394..b2aad86 100644 +--- a/criu/include/image.h ++++ b/criu/include/image.h +@@ -83,6 +83,7 @@ + #define VMA_AREA_SOCKET (1 << 11) + #define VMA_AREA_VVAR (1 << 12) + #define VMA_AREA_AIORING (1 << 13) ++#define VMA_AREA_ANON_INODE (1 << 15) + + #define VMA_CLOSE (1 << 28) + #define VMA_NO_PROT_WRITE (1 << 29) +diff --git a/criu/include/mem.h b/criu/include/mem.h +index 4241b0c..b138e26 100644 +--- a/criu/include/mem.h ++++ b/criu/include/mem.h +@@ -47,6 +47,7 @@ extern int parasite_dump_pages_seized(struct pstree_item *item, + struct task_restore_args; + int open_vmas(struct pstree_item *t); + int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta); ++int prepare_vma_names(struct pstree_item *t, struct task_restore_args *ta); + int unmap_guard_pages(struct pstree_item *t); + int prepare_mappings(struct pstree_item *t); + bool should_dump_page(VmaEntry *vmae, u64 
pme); +diff --git a/criu/include/restorer.h b/criu/include/restorer.h +index 4954915..b8f74e9 100644 +--- a/criu/include/restorer.h ++++ b/criu/include/restorer.h +@@ -127,6 +127,10 @@ struct restore_vma_io { + + #define RIO_SIZE(niovs) (sizeof(struct restore_vma_io) + (niovs) * sizeof(struct iovec)) + ++struct vma_names { ++ char name[PATH_MAX]; ++}; ++ + struct task_restore_args { + struct thread_restore_args *t; /* thread group leader */ + +@@ -150,6 +154,8 @@ struct task_restore_args { + VmaEntry *vmas; + unsigned int vmas_n; + ++ struct vma_names *vma_names; ++ + int vma_ios_fd; + struct restore_vma_io *vma_ios; + unsigned int vma_ios_n; +diff --git a/criu/mem.c b/criu/mem.c +index d2ff6e0..dbd479a 100644 +--- a/criu/mem.c ++++ b/criu/mem.c +@@ -600,6 +600,9 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, + if (opts.pin_memory && should_pin_vmae(vma_area->e)) + continue; + ++ if (vma_entry_is(vma_area->e, VMA_AREA_ANON_INODE)) ++ continue; ++ + ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl, + &pmc, has_parent, mdc->pre_dump); + if (ret < 0) +@@ -745,7 +748,6 @@ int prepare_mm_pid(struct pstree_item *i) + } + + pr_info("vma 0x%"PRIx64" 0x%"PRIx64"\n", vma->e->start, vma->e->end); +- + if (vma_area_is(vma, VMA_ANON_SHARED)) + ret = collect_shmem(pid, vma); + else if (vma_area_is(vma, VMA_FILE_PRIVATE) || +@@ -1400,6 +1402,9 @@ int open_vmas(struct pstree_item *t) + filemap_ctx_init(false); + + list_for_each_entry(vma, &vmas->h, list) { ++ if (vma_area_is(vma, VMA_AREA_ANON_INODE)) ++ continue; ++ + if (!vma_area_is(vma, VMA_AREA_REGULAR) || !vma->vm_open) + continue; + +@@ -1471,3 +1476,20 @@ int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta) + + return prepare_vma_ios(t, ta); + } ++ ++int prepare_vma_names(struct pstree_item *t, struct task_restore_args *ta) ++{ ++ struct vma_area *vma; ++ struct vm_area_list *vmas = &rsti(t)->vmas; ++ ta->vma_names = (struct vma_names *)rst_mem_align_cpos(RM_PRIVATE); ++ 
++ list_for_each_entry(vma, &vmas->h, list) { ++ struct vma_names *vma_names; ++ vma_names = rst_mem_alloc(sizeof(*vma_names), RM_PRIVATE); ++ if (!vma_names) ++ return -1; ++ ++ memcpy(vma_names->name, vma->e->name, strlen(vma->e->name) + 1); ++ } ++ return 0; ++} +diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c +index c3b3da0..808e862 100644 +--- a/criu/pie/restorer.c ++++ b/criu/pie/restorer.c +@@ -66,6 +66,7 @@ + #define FALLOC_FL_PUNCH_HOLE 0x02 + #endif + ++#define ANON_PROC_PATH "/sys/kernel/modrestore/anon_state_restore" + + #define sys_prctl_safe(opcode, val1, val2, val3) \ + ({ \ +@@ -798,6 +799,25 @@ unsigned long arch_shmat(int shmid, void *shmaddr, + } + #endif + ++static int restore_anon_mapping(VmaEntry *vma_entry, struct vma_names *vma_name) ++{ ++ int fd; ++ ++ fd = sys_open(ANON_PROC_PATH, O_WRONLY, 0); ++ if (fd < 0) { ++ pr_info("anon sys fs open fail:%s\n", ANON_PROC_PATH); ++ return fd; ++ } ++ pr_info("restore anon mapping: %s\n", vma_name->name); ++ ++ if (sys_write(fd, vma_name->name, 4096) < 0) { ++ sys_close(fd); ++ return -1; ++ } ++ sys_close(fd); ++ return 0; ++} ++ + static unsigned long restore_mapping(VmaEntry *vma_entry) + { + int prot = vma_entry->prot; +@@ -1567,6 +1587,7 @@ long __export_restore_task(struct task_restore_args *args) + pid_t my_pid = sys_getpid(); + rt_sigaction_t act; + bool has_vdso_proxy; ++ struct vma_names *vma_name; + + futex_set(&thread_inprogress, 1); + futex_set(&thread_start, 0); +@@ -1727,6 +1748,14 @@ long __export_restore_task(struct task_restore_args *args) + */ + for (i = 0; i < args->vmas_n; i++) { + vma_entry = args->vmas + i; ++ vma_name = args->vma_names + i; ++ ++ if (vma_entry_is(vma_entry, VMA_AREA_ANON_INODE)) { ++ pr_info("anon vma name:%s\n", vma_name->name); ++ if (restore_anon_mapping(vma_entry, vma_name) < 0) ++ goto core_restore_end; ++ continue; ++ } + + if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR) && + !vma_entry_is(vma_entry, VMA_AREA_AIORING)) +@@ -1850,6 +1879,9 @@ long 
__export_restore_task(struct task_restore_args *args) + if (!vma_entry->has_madv || !vma_entry->madv) + continue; + ++ if (vma_entry_is(vma_entry, VMA_AREA_ANON_INODE)) ++ continue; ++ + for (m = 0; m < sizeof(vma_entry->madv) * 8; m++) { + if (vma_entry->madv & (1ul << m)) { + ret = sys_madvise(vma_entry->start, +diff --git a/criu/proc_parse.c b/criu/proc_parse.c +index 0e8b6f2..b0c8e89 100644 +--- a/criu/proc_parse.c ++++ b/criu/proc_parse.c +@@ -76,6 +76,7 @@ static char *buf = __buf.buf; + */ + + #define AIO_FNAME "/[aio]" ++#define ANON_FNAME "anon_inode" + + /* check the @line starts with "%lx-%lx" format */ + static bool __is_vma_range_fmt(char *line) +@@ -173,8 +174,19 @@ static void parse_vma_vmflags(char *buf, struct vma_area *vma_area) + * only exception is VVAR area that mapped by the kernel as + * VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP + */ +- if (io_pf && !vma_area_is(vma_area, VMA_AREA_VVAR)) ++ /* There are many types of io/pf vm_map, not only vvar, but also ++ * anon_inode, and char device. ++ * For anon_inode and char device, we use anon_notifier to restore ++ * status. Therefore, we disable the broken code here. 
++ */ ++ /* ++ if (io_pf && !vma_area_is(vma_area, VMA_AREA_VVAR) && ++ !vma_area_is(vma_area, VMA_AREA_ANON_INODE)) ++ { ++ pr_info("set current status tp VMA_UNSUPP\n"); + vma_area->e->status |= VMA_UNSUPP; ++ } ++ */ + + if (vma_area->e->madv) + vma_area->e->has_madv = true; +@@ -414,7 +426,6 @@ static int vma_get_mapfile(const char *fname, struct vma_area *vma, DIR *mfd, + + if (fstatat(dirfd(mfd), path, &buf, 0)) + return -1; +- + if (S_ISSOCK(buf.st_mode)) { + pr_info("Found socket mapping @%"PRIx64"\n", vma->e->start); + vma->vm_socket_id = buf.st_ino; +@@ -429,6 +440,21 @@ static int vma_get_mapfile(const char *fname, struct vma_area *vma, DIR *mfd, + return 0; + } + ++ if (!strncmp(fname, ANON_FNAME, sizeof(ANON_FNAME) - 1)) { ++ /*anon_inode*/ ++ close_safe(vm_file_fd); ++ vma->e->status = VMA_AREA_ANON_INODE; ++ vma->e->name = xmalloc(PATH_MAX); ++ if (!vma->e->name) { ++ pr_err("alloc vma name of anon-inode fail.\n"); ++ return -1; ++ } ++ snprintf(vma->e->name, PATH_MAX - 1, "%"PRIx64"-%"PRIx64 " %s", vma->e->start, vma->e->end, fname); ++ vma->e->name[PATH_MAX - 1] = 0; ++ pr_info("set vma_area status to: %d, name:%s\n", vma->e->status, vma->e->name); ++ return 0; ++ } ++ + pr_err("Unknown shit %o (%s)\n", buf.st_mode, fname); + return -1; + } +@@ -527,7 +553,7 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, + if (vma_get_mapfile(file_path, vma_area, map_files_dir, + vfi, prev_vfi, vm_file_fd)) + goto err_bogus_mapfile; +- ++ pr_info("handle_vam, vma status is: %d\n", vma_area->e->status); + if (vma_area->e->status != 0) + return 0; + +@@ -563,6 +589,7 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, + vma_area->e->shmid = prev->e->shmid; + vma_area->vmst = prev->vmst; + vma_area->mnt_id = prev->mnt_id; ++ vma_area->e->name = prev->e->name; + } else if (*vm_file_fd >= 0) { + struct stat *st_buf = vma_area->vmst; + +@@ -728,7 +755,6 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, + if (IS_ERR(str)) + goto 
err; + eof = (str == NULL); +- + if (!eof && !__is_vma_range_fmt(str)) { + if (!strncmp(str, "Nonlinear", 9)) { + BUG_ON(!vma_area); +@@ -747,7 +773,6 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, + } else + continue; + } +- + if (vma_area && vma_list_add(vma_area, vma_area_list, + &prev_end, &vfi, &prev_vfi)) + goto err; +@@ -794,7 +819,6 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, + if (handle_vma(pid, vma_area, str + path_off, map_files_dir, + &vfi, &prev_vfi, &vm_file_fd)) + goto err; +- + if (vma_entry_is(vma_area->e, VMA_FILE_PRIVATE) || + vma_entry_is(vma_area->e, VMA_FILE_SHARED)) { + if (dump_filemap && dump_filemap(vma_area, vm_file_fd)) +diff --git a/images/vma.proto b/images/vma.proto +index 7085f42..f1ae4fb 100644 +--- a/images/vma.proto ++++ b/images/vma.proto +@@ -22,4 +22,5 @@ message vma_entry { + + /* file status flags */ + optional uint32 fdflags = 10 [(criu).hex = true]; ++ required string name = 11; + } +-- +2.34.0 + diff --git a/backport-0010--char_dev-add-support-for-char-device-dump-and-restor.patch b/backport-0010--char_dev-add-support-for-char-device-dump-and-restor.patch new file mode 100644 index 0000000..3d46c43 --- /dev/null +++ b/backport-0010--char_dev-add-support-for-char-device-dump-and-restor.patch @@ -0,0 +1,773 @@ +From e7e6443682d9b8e053f5c1d5f281b9034028fdbb Mon Sep 17 00:00:00 2001 +From: Xiaoguang Li +Date: Wed, 1 Dec 2021 11:19:52 +0800 +Subject: [PATCH 10/49] char_dev: add support for char device dump and restore + +Add support for char device dump and restore during module upgrade. + +`/sys/kernel/repairing_device` provides the char device whiltelist +with `IOCTL_CMD_{NEEDREPAIR, REPAIR}` command besides the internal +device list. +The device modules could use `mures_{add, del}_devname()` to add, or +delete the char device whitelist dynamically. 
+ +Signed-off-by: Xiaoguang Li +Signed-off-by: Jingxian He +Signed-off-by: fu.lin +--- + criu/Makefile.crtools | 1 + + criu/config.c | 1 + + criu/cr-dump.c | 3 + + criu/cr-restore.c | 4 +- + criu/crtools.c | 1 + + criu/devname.c | 130 ++++++++++++++++++++++++++++ + criu/files-reg.c | 34 +++++++- + criu/files.c | 159 ++++++++++++++++++++++++++++++++++- + criu/include/cr_options.h | 1 + + criu/include/files-reg.h | 9 ++ + criu/include/files.h | 6 ++ + criu/include/image-desc.h | 1 + + criu/include/image.h | 1 + + criu/include/protobuf-desc.h | 1 + + criu/include/util.h | 3 + + criu/mem.c | 6 +- + criu/proc_parse.c | 16 +++- + images/Makefile | 1 + + images/chr.proto | 12 +++ + images/fdinfo.proto | 3 + + 20 files changed, 382 insertions(+), 11 deletions(-) + create mode 100644 criu/devname.c + create mode 100644 images/chr.proto + +diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools +index 4588ea5..c093a59 100644 +--- a/criu/Makefile.crtools ++++ b/criu/Makefile.crtools +@@ -86,6 +86,7 @@ obj-y += config.o + obj-y += servicefd.o + obj-y += pie-util-vdso.o + obj-y += vdso.o ++obj-y += devname.o + obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o + CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 + obj-$(CONFIG_COMPAT) += vdso-compat.o +diff --git a/criu/config.c b/criu/config.c +index 98789a7..af4d4e4 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -522,6 +522,7 @@ int parse_options(int argc, char **argv, bool *usage_error, + BOOL_OPT("use-fork-pid", &opts.use_fork_pid), + BOOL_OPT("with-notifier", &opts.with_notifier_kup), + BOOL_OPT("with-fd-cred", &opts.with_fd_cred), ++ BOOL_OPT("dump-char-dev", &opts.dump_char_dev), + { }, + }; + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index 6a8ed18..c09d8bd 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -1810,6 +1810,9 @@ int cr_dump_tasks(pid_t pid) + */ + rlimit_unlimit_nofile(); + ++ if (opts.dump_char_dev && parse_devname() < 0) ++ goto err; ++ + root_item = alloc_pstree_item(); + if (!root_item) + 
goto err; +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index ec59b69..00100fa 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -338,11 +338,11 @@ static int root_prepare_shared(void) + if (pi->pid->state == TASK_HELPER) + continue; + +- ret = prepare_mm_pid(pi); ++ ret = prepare_fd_pid(pi); + if (ret < 0) + break; + +- ret = prepare_fd_pid(pi); ++ ret = prepare_mm_pid(pi); + if (ret < 0) + break; + +diff --git a/criu/crtools.c b/criu/crtools.c +index da1cdd1..638fb40 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -396,6 +396,7 @@ usage: + " feature needs the kernel's assistance.\n" + " Only for the host with these feature.\n" + " --with-fd-cred Allow to make the restored process has the same cred\n" ++" --dump-char-dev Dump char dev files as normal file with repair cmd\n" + " as checkout assisted by kernel.\n" + "\n" + "Check options:\n" +diff --git a/criu/devname.c b/criu/devname.c +new file mode 100644 +index 0000000..5f6fbed +--- /dev/null ++++ b/criu/devname.c +@@ -0,0 +1,130 @@ ++#include ++#include ++#include ++#include ++ ++#include "log.h" ++#include "common/xmalloc.h" ++ ++#define REPAIRING_DEVICE_FILE "/sys/kernel/repairing_device" ++#define ASCII_SIZE 128 ++ ++static void *root_bucket[ASCII_SIZE]; ++ ++static int insert_devname_internal(void *bucket[], const char *name) ++{ ++ void *new = NULL; ++ int idx = *name; ++ ++ if (bucket[idx] != NULL) ++ return insert_devname_internal(bucket[idx], name+1); ++ else if (idx == '\0') { ++ new = xmalloc(sizeof(void *)); ++ if (!new) { ++ pr_perror("alloc devname failed\n"); ++ return -1; ++ } ++ bucket[idx] = new; ++ return 0; ++ } else { ++ new = xmalloc(sizeof(void *) * ASCII_SIZE); ++ if (!new) { ++ pr_perror("alloc devname failed\n"); ++ return -1; ++ } ++ memset(new, 0, sizeof(void *) * ASCII_SIZE); ++ bucket[idx] = new; ++ return insert_devname_internal(bucket[idx], name+1); ++ } ++} ++ ++int insert_devname(const char *devname) ++{ ++ if (devname == NULL || *devname == '\0') // 
ignore ++ return 0; ++ ++ pr_debug("insert device '%s'\n", devname); ++ return insert_devname_internal(root_bucket, devname); ++} ++ ++int parse_devname(void) ++{ ++ int retval = -1; ++ char *line = NULL; ++ size_t len = 0; ++ ssize_t nread = 0; ++ FILE *fp = NULL; ++ ++ fp = fopen(REPAIRING_DEVICE_FILE, "r"); ++ if (fp == NULL) { ++ pr_info("Unable to open %s, downgrade to use internal whitelist\n", ++ REPAIRING_DEVICE_FILE); ++ return 0; ++ } ++ ++ while ((nread = getline(&line, &len, fp)) != -1) { ++ if (nread <= 1) // ignore empty string ++ continue; ++ ++ line[nread-1] = '\0'; // drop '\n' ++ retval = insert_devname(line); ++ if (retval != 0) ++ goto out; ++ } ++ retval = 0; ++ ++out: ++ free(line); ++ fclose(fp); ++ return retval; ++} ++ ++static const char *steal_devname(const char *name, ssize_t len) ++{ ++ ssize_t off = len; ++ ++ for (off -= 1; off > 0; off--) { ++ if (name[off] == '/') ++ break; ++ } ++ ++ return name + off + 1; ++} ++ ++static bool find_devname_internal(void *bucket[], const char *name) ++{ ++ int idx = *name; ++ ++ if (*name == '\0' && bucket[idx] != NULL) ++ return true; ++ else if (bucket[idx] == NULL) ++ return false; ++ else { ++ return find_devname_internal(bucket[idx], name+1); ++ } ++} ++ ++bool find_devname(const char *name) ++{ ++ const char *devname; ++ size_t len = 0; ++ bool found = false; ++ ++ if (name == NULL) ++ return false; ++ else if ((len = strlen(name)) == 0) ++ return false; ++ ++ devname = steal_devname(name, len); ++ found = find_devname_internal(root_bucket, devname); ++ ++ pr_debug("device '%s' (original name '%s') %s found in %s\n", ++ devname, name, found ? 
"is" : "isn't", REPAIRING_DEVICE_FILE); ++ ++ /* Compatible with the old version, there are still `strstr` branch in the following */ ++ found |= (strstr(name, "uverbs") != NULL ++ || strstr(name, "rdma_cm") != NULL ++ || strstr(name, "umad") != NULL); ++ ++ return found; ++} +diff --git a/criu/files-reg.c b/criu/files-reg.c +index 17b3234..c7a4d1e 100644 +--- a/criu/files-reg.c ++++ b/criu/files-reg.c +@@ -1613,8 +1613,8 @@ int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) + rfe.has_mnt_id = true; + } + +- pr_info("Dumping path for %d fd via self %d [%s]\n", +- p->fd, lfd, &link->name[1]); ++ pr_info("Dumping path for %d fd via self %d [%s], id: %d\n", ++ p->fd, lfd, &link->name[1], id); + + /* + * The regular path we can handle should start with slash. +@@ -2267,6 +2267,34 @@ static int open_filemap(int pid, struct vma_area *vma) + return 0; + } + ++int collect_chr_map(struct pstree_item *me, struct vma_area *vma) ++{ ++ struct list_head *list = &rsti(me)->fds; ++ struct fdinfo_list_entry *fle, *tmp; ++ struct chrfile_info *ci; ++ bool exist_fd; ++ ++ list_for_each_entry_safe(fle, tmp, list, ps_list) { ++ struct file_desc *d = fle->desc; ++ ++ if (d->ops->type != FD_TYPES__CHR) ++ continue; ++ ++ ci = container_of(d, struct chrfile_info, d); ++ if (!strcmp(vma->e->name, ci->path)) { ++ vma->vmfd = d; ++ vma->e->fd = fle->fe->fd; ++ exist_fd = true; ++ break; ++ } ++ } ++ ++ if (!exist_fd) ++ return -EEXIST; ++ ++ return 0; ++} ++ + int collect_filemap(struct vma_area *vma) + { + struct file_desc *fd; +@@ -2351,7 +2379,7 @@ static int collect_one_regfile(void *o, ProtobufCMessage *base, struct cr_img *i + rfi->remap = NULL; + rfi->size_mode_checked = false; + +- pr_info("Collected [%s] ID %#x\n", rfi->path, rfi->rfe->id); ++ pr_info("Collected regfile [%s] ID %#x\n", rfi->path, rfi->rfe->id); + return file_desc_add(&rfi->d, rfi->rfe->id, ®_desc_ops); + } + +diff --git a/criu/files.c b/criu/files.c +index 82ac3a3..732abd6 100644 +--- a/criu/files.c 
++++ b/criu/files.c +@@ -329,10 +329,32 @@ int do_dump_gen_file(struct fd_parms *p, int lfd, + e->fd = p->fd; + e->flags = p->fd_flags; + ++ pr_info("fdinfoEntry fd: %d\n", e->fd); + ret = fd_id_generate(p->pid, e, p); + if (ret == 1) /* new ID generated */ + ret = ops->dump(lfd, e->id, p); +- else ++ else if (ops->type == FD_TYPES__CHR) { ++ /* ++ * Sometimes the app_data subprocess may inherit the fd from ++ * app_data. Those fds may result the unconditional oops during ++ * the restoration of app_data. Therefore, prevent the dump in ++ * those condition. ++ */ ++ struct fd_link _link, *link; ++ ++ if (!p->link) { ++ if (fill_fdlink(lfd, p, &_link)) ++ return -1; ++ link = &_link; ++ } else ++ link = p->link; ++ ++ if (find_devname(link->name)) { ++ pr_err("char dev '%s' fd %d is owned by multi-processes\n", ++ link->name, e->fd); ++ ret = -1; ++ } ++ } else + /* Remove locks generated by the fd before going to the next */ + discard_dup_locks_tail(p->pid, e->fd); + +@@ -462,6 +484,58 @@ static int dump_blkdev(struct fd_parms *p, int lfd, FdinfoEntry *e) + return err; + } + ++static int dump_chr_file(int lfd, u32 id, const struct fd_parms *p) ++{ ++ int ret; ++ struct fd_link _link, *link; ++ struct cr_img *img; ++ FileEntry fe = FILE_ENTRY__INIT; ++ ChrfileEntry cfe = CHRFILE_ENTRY__INIT; ++ ++ if (!p->link) { ++ if (fill_fdlink(lfd, p, &_link)) ++ return -1; ++ link = &_link; ++ } else ++ link = p->link; ++ ++ pr_info("Dumping chr-file fd %d with lfd %d with id %d, name: %s\n", p->fd, lfd, id, link->name); ++ ++ if (strstr(link->name, "(deleted)") != NULL) { ++ pr_err("char device '%s' is deleted\n", link->name); ++ return -ENXIO; ++ } ++ ++ cfe.repair = false; ++ if (find_devname(link->name)) { ++ ret = ioctl(lfd, IOCTL_CMD_NEEDREPAIR, 0); ++ if (ret <= 0) { ++ pr_err("ioctl cmd needrepair failed, errno: %d, %s\n", ret, strerror(errno)); ++ return -1; ++ } else { ++ pr_info("char device needrepair cmd return: %d\n", ret); ++ cfe.index = ret; ++ cfe.repair = 
true; ++ } ++ } ++ ++ cfe.id = id; ++ cfe.name = &link->name[1]; ++ cfe.flags = p->flags; ++ fe.type = FD_TYPES__CHR; ++ fe.id = cfe.id; ++ fe.chr = &cfe; ++ ++ img = img_from_set(glob_imgset, CR_FD_FILES); ++ ret = pb_write_one(img, &fe, PB_FILE); ++ return ret; ++} ++ ++const struct fdtype_ops chr_dump_ops = { ++ .type = FD_TYPES__CHR, ++ .dump = dump_chr_file, ++}; ++ + static int dump_chrdev(struct fd_parms *p, int lfd, FdinfoEntry *e) + { + struct fd_link *link_old = p->link; +@@ -489,6 +563,10 @@ static int dump_chrdev(struct fd_parms *p, int lfd, FdinfoEntry *e) + ops = &tty_dump_ops; + break; + } ++ if (opts.dump_char_dev) { ++ ops = &chr_dump_ops; ++ break; ++ } + + sprintf(more, "%d:%d", maj, minor(p->stat.st_rdev)); + err = dump_unsupp_fd(p, lfd, "chr", more, e); +@@ -502,6 +580,12 @@ static int dump_chrdev(struct fd_parms *p, int lfd, FdinfoEntry *e) + return err; + } + ++/* Checks if file descriptor @lfd is infinibandevent */ ++int is_infiniband_link(char *link) ++{ ++ return is_anon_link_type(link, "[infinibandevent]"); ++} ++ + static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, + struct parasite_ctl *ctl, FdinfoEntry *e, + struct parasite_drain_fd *dfds) +@@ -556,6 +640,8 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, + ops = &signalfd_dump_ops; + else if (is_timerfd_link(link)) + ops = &timerfd_dump_ops; ++ else if (is_infiniband_link(link)) ++ return 1; + else + return dump_unsupp_fd(&p, lfd, "anon", link, e); + +@@ -653,9 +739,15 @@ int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, + + ret = dump_one_file(item->pid, dfds->fds[i + off], + lfds[i], opts + i, ctl, &e, dfds); +- if (ret) ++ if (ret < 0) + break; ++ /* infiniband link file */ ++ if (ret > 0) { ++ ret = 0; ++ continue; ++ } + ++ pr_info("write fdinfoEntry fd=%d id=%d\n", (&e)->fd, (&e)->id); + ret = pb_write_one(img, &e, PB_FDINFO); + if (ret) + break; +@@ -913,6 +1005,7 @@ int 
prepare_fd_pid(struct pstree_item *item) + if (!img) + return -1; + ++ pr_info("prepare_fd_pid\n"); + while (1) { + FdinfoEntry *e; + +@@ -1120,6 +1213,7 @@ int setup_and_serve_out(struct fdinfo_list_entry *fle, int new_fd) + if (reopen_fd_as(fle->fe->fd, new_fd)) + return -1; + ++ pr_info("*******flags: %d\n",fle->fe->flags); + if (fcntl(fle->fe->fd, F_SETFD, fle->fe->flags) == -1) { + pr_perror("Unable to set file descriptor flags"); + return -1; +@@ -1670,6 +1764,64 @@ out: + return ret; + } + ++static int chrfile_open(struct file_desc *d, int *new_fd) ++{ ++ int fd, mntns_root; ++ int ret = 0; ++ struct chrfile_info *ci; ++ ++ ci = container_of(d, struct chrfile_info, d); ++ ++ mntns_root = open_pid_proc(getpid()); ++ fd = openat(mntns_root, ci->path, ci->cfe->flags | O_REPAIR); ++ if (fd < 0){ ++ pr_err("open chr file failed\n"); ++ return -1; ++ } ++ ++ if (ci->cfe->repair) { ++ ret = ioctl(fd, IOCTL_CMD_REPAIR , ci->cfe->index); ++ pr_info("repair ioctl return: %d, index: %d\n", ret, ci->cfe->index); ++ if (ret) ++ goto err; ++ } ++ ++ *new_fd = fd; ++ return ret; ++err: ++ close(fd); ++ return ret; ++} ++ ++static struct file_desc_ops chrfile_desc_ops = { ++ .type = FD_TYPES__CHR, ++ .open = chrfile_open, ++}; ++ ++static int collect_one_chrfile(void *o, ProtobufCMessage *base, struct cr_img *i) ++{ ++ struct chrfile_info *ci = o; ++ static char dot[] = "."; ++ ++ ci->cfe = pb_msg(base, ChrfileEntry); ++ if (ci->cfe->name[1] == '\0') ++ ci->path = dot; ++ else ++ ci->path = ci->cfe->name; ++ ++ pr_info("Collected chr file: %#x, name: %s\n", ci->cfe->id, ci->path); ++ file_desc_add(&ci->d, ci->cfe->id, &chrfile_desc_ops); ++ ++ return 0; ++} ++ ++struct collect_image_info chrfile_cinfo = { ++ .fd_type = CR_FD_CHRFILE, ++ .pb_type = PB_CHRFILE, ++ .priv_size = sizeof(struct chrfile_info), ++ .collect = collect_one_chrfile, ++}; ++ + static int collect_one_file_entry(FileEntry *fe, u_int32_t id, ProtobufCMessage *base, + struct collect_image_info *cinfo) + { +@@
-1742,6 +1894,9 @@ static int collect_one_file(void *o, ProtobufCMessage *base, struct cr_img *i) + case FD_TYPES__TTY: + ret = collect_one_file_entry(fe, fe->tty->id, &fe->tty->base, &tty_cinfo); + break; ++ case FD_TYPES__CHR: ++ ret = collect_one_file_entry(fe, fe->chr->id, &fe->chr->base, &chrfile_cinfo); ++ break; + } + + return ret; +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 191153b..a53468e 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -149,6 +149,7 @@ struct cr_options { + int use_fork_pid; + int with_notifier_kup; + int with_fd_cred; ++ int dump_char_dev; + }; + + extern struct cr_options opts; +diff --git a/criu/include/files-reg.h b/criu/include/files-reg.h +index 7a22d4d..d9f0638 100644 +--- a/criu/include/files-reg.h ++++ b/criu/include/files-reg.h +@@ -4,6 +4,7 @@ + #include "files.h" + + #include "images/regfile.pb-c.h" ++#include "images/chr.pb-c.h" + #include "images/ghost-file.pb-c.h" + + struct cr_imgset; +@@ -26,6 +27,12 @@ struct reg_file_info { + char *path; + }; + ++struct chrfile_info { ++ struct file_desc d; ++ ChrfileEntry *cfe; ++ char *path; ++}; ++ + extern int open_reg_by_id(u32 id); + extern int open_reg_fd(struct file_desc *); + extern int open_path(struct file_desc *, int (*open_cb)(int ns_root_fd, +@@ -33,6 +40,7 @@ extern int open_path(struct file_desc *, int (*open_cb)(int ns_root_fd, + extern void clear_ghost_files(void); + + extern const struct fdtype_ops regfile_dump_ops; ++extern const struct fdtype_ops chr_dump_ops; + extern int do_open_reg_noseek_flags(int ns_root_fd, struct reg_file_info *rfi, void *arg); + extern int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p); + +@@ -41,6 +49,7 @@ extern struct file_remap *lookup_ghost_remap(u32 dev, u32 ino); + extern struct file_desc *try_collect_special_file(u32 id, int optional); + #define collect_special_file(id) try_collect_special_file(id, 0) + extern int collect_filemap(struct vma_area *); ++extern 
int collect_chr_map(struct pstree_item *me, struct vma_area *); + extern void filemap_ctx_init(bool auto_close); + extern void filemap_ctx_fini(void); + +diff --git a/criu/include/files.h b/criu/include/files.h +index 2c1e1e7..b12d079 100644 +--- a/criu/include/files.h ++++ b/criu/include/files.h +@@ -15,6 +15,12 @@ + #include "images/fown.pb-c.h" + #include "images/vma.pb-c.h" + ++#ifndef IOCTL_CMD_NEEDREPAIR ++#define IOCTL_CMD_NEEDREPAIR 0x00100000UL ++#define IOCTL_CMD_REPAIR 0x00200000UL ++#define O_REPAIR 040000000 ++#endif ++ + struct parasite_drain_fd; + struct pstree_item; + struct file_desc; +diff --git a/criu/include/image-desc.h b/criu/include/image-desc.h +index 3135f56..d5e2ac4 100644 +--- a/criu/include/image-desc.h ++++ b/criu/include/image-desc.h +@@ -108,6 +108,7 @@ enum { + CR_FD_TTY_FILES, + + CR_FD_AUTOFS, ++ CR_FD_CHRFILE, + + CR_FD_MAX + }; +diff --git a/criu/include/image.h b/criu/include/image.h +index b2aad86..e9257e4 100644 +--- a/criu/include/image.h ++++ b/criu/include/image.h +@@ -84,6 +84,7 @@ + #define VMA_AREA_VVAR (1 << 12) + #define VMA_AREA_AIORING (1 << 13) + #define VMA_AREA_ANON_INODE (1 << 15) ++#define VMA_AREA_CHR (1 << 16) + + #define VMA_CLOSE (1 << 28) + #define VMA_NO_PROT_WRITE (1 << 29) +diff --git a/criu/include/protobuf-desc.h b/criu/include/protobuf-desc.h +index 31f5b9a..4112be5 100644 +--- a/criu/include/protobuf-desc.h ++++ b/criu/include/protobuf-desc.h +@@ -61,6 +61,7 @@ enum { + PB_AUTOFS, + PB_GHOST_CHUNK, + PB_FILE, ++ PB_CHRFILE, + + /* PB_AUTOGEN_STOP */ + +diff --git a/criu/include/util.h b/criu/include/util.h +index 351d72e..d56c905 100644 +--- a/criu/include/util.h ++++ b/criu/include/util.h +@@ -422,4 +422,7 @@ enum notifier_state { + int notifier_kup(enum KUP_HOOK_POINT, enum nvwa_cmd, bool); + void do_notifier_rollback(bool, enum notifier_state); + ++int parse_devname(void); ++bool find_devname(const char *name); ++ + #endif /* __CR_UTIL_H__ */ +diff --git a/criu/mem.c b/criu/mem.c +index 
dbd479a..e3a2ee4 100644 +--- a/criu/mem.c ++++ b/criu/mem.c +@@ -748,7 +748,9 @@ int prepare_mm_pid(struct pstree_item *i) + } + + pr_info("vma 0x%"PRIx64" 0x%"PRIx64"\n", vma->e->start, vma->e->end); +- if (vma_area_is(vma, VMA_ANON_SHARED)) ++ if (vma_area_is(vma, VMA_AREA_CHR)) ++ ret = collect_chr_map(i, vma); ++ else if (vma_area_is(vma, VMA_ANON_SHARED)) + ret = collect_shmem(pid, vma); + else if (vma_area_is(vma, VMA_FILE_PRIVATE) || + vma_area_is(vma, VMA_FILE_SHARED)) +@@ -1402,7 +1404,7 @@ int open_vmas(struct pstree_item *t) + filemap_ctx_init(false); + + list_for_each_entry(vma, &vmas->h, list) { +- if (vma_area_is(vma, VMA_AREA_ANON_INODE)) ++ if (vma_area_is(vma, VMA_AREA_ANON_INODE) || vma_area_is(vma, VMA_AREA_CHR)) + continue; + + if (!vma_area_is(vma, VMA_AREA_REGULAR) || !vma->vm_open) +diff --git a/criu/proc_parse.c b/criu/proc_parse.c +index b0c8e89..974a802 100644 +--- a/criu/proc_parse.c ++++ b/criu/proc_parse.c +@@ -593,11 +593,23 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, + } else if (*vm_file_fd >= 0) { + struct stat *st_buf = vma_area->vmst; + ++ pr_info("file mode is: %x, st_ino: %ld\n", st_buf->st_mode, st_buf->st_ino); + if (S_ISREG(st_buf->st_mode)) + /* regular file mapping -- supported */; +- else if (S_ISCHR(st_buf->st_mode) && (st_buf->st_rdev == DEVZERO)) ++ else if (S_ISCHR(st_buf->st_mode)) { + /* devzero mapping -- also makes sense */; +- else { ++ if (opts.dump_char_dev && (strstr(file_path, "uverbs") != NULL)) { ++ int len = strlen(file_path) + 1; ++ vma_area->e->status |= VMA_AREA_CHR; ++ vma_area->e->name = xmalloc(len); ++ if (!vma_area->e->name) { ++ pr_err("alloc vma area name fail\n"); ++ goto err; ++ } ++ strncpy(vma_area->e->name, file_path, len); ++ pr_info("uverbs name content is: %s\n", vma_area->e->name); ++ } ++ } else { + pr_err("Can't handle non-regular mapping on %d's map %"PRIx64"\n", pid, vma_area->e->start); + goto err; + } +diff --git a/images/Makefile b/images/Makefile +index 
edaab06..76e89ce 100644 +--- a/images/Makefile ++++ b/images/Makefile +@@ -63,6 +63,7 @@ proto-obj-y += sysctl.o + proto-obj-y += autofs.o + proto-obj-y += macvlan.o + proto-obj-y += sit.o ++proto-obj-y += chr.o + + CFLAGS += -iquote $(obj)/ + +diff --git a/images/chr.proto b/images/chr.proto +new file mode 100644 +index 0000000..67929db +--- /dev/null ++++ b/images/chr.proto +@@ -0,0 +1,12 @@ ++syntax = "proto2"; ++ ++import "opts.proto"; ++ ++message chrfile_entry { ++ required uint32 id = 1; ++ required uint32 flags = 2 [(criu).flags = "rfile.flags"]; ++ required uint32 index = 3; ++ required string name = 4; ++ required bool repair = 5; ++}; ++ +diff --git a/images/fdinfo.proto b/images/fdinfo.proto +index 77e375a..c483bd8 100644 +--- a/images/fdinfo.proto ++++ b/images/fdinfo.proto +@@ -16,6 +16,7 @@ import "sk-unix.proto"; + import "fifo.proto"; + import "pipe.proto"; + import "tty.proto"; ++import "chr.proto"; + + enum fd_types { + UND = 0; +@@ -36,6 +37,7 @@ enum fd_types { + TUNF = 15; + EXT = 16; + TIMERFD = 17; ++ CHR = 21; + + /* Any number above the real used. Not stored to image */ + CTL_TTY = 65534; +@@ -70,4 +72,5 @@ message file_entry { + optional fifo_entry fifo = 17; + optional pipe_entry pipe = 18; + optional tty_file_entry tty = 19; ++ optional chrfile_entry chr = 23; + } +-- +2.34.0 + diff --git a/backport-0011--socket-fix-connect-error-of-invalid-param.patch b/backport-0011--socket-fix-connect-error-of-invalid-param.patch new file mode 100644 index 0000000..8d38468 --- /dev/null +++ b/backport-0011--socket-fix-connect-error-of-invalid-param.patch @@ -0,0 +1,95 @@ +From 02717e0b2e42617e5e25ab9eea65a409f57abb87 Mon Sep 17 00:00:00 2001 +From: Xiaoguang Li +Date: Wed, 1 Dec 2021 11:24:50 +0800 +Subject: [PATCH 11/49] socket: fix connect error of invalid param + +Fix connect error of invalid param during module upgrade. 
+ +Signed-off-by: Xiaoguang Li +Signed-off-by: fu.lin +--- + criu/include/sockets.h | 1 + + criu/sk-inet.c | 13 +++++++++++-- + criu/sockets.c | 5 ++++- + 3 files changed, 16 insertions(+), 3 deletions(-) + +diff --git a/criu/include/sockets.h b/criu/include/sockets.h +index cd98d18..e647f3a 100644 +--- a/criu/include/sockets.h ++++ b/criu/include/sockets.h +@@ -27,6 +27,7 @@ struct socket_desc { + extern int dump_socket(struct fd_parms *p, int lfd, FdinfoEntry *); + extern int dump_socket_opts(int sk, SkOptsEntry *soe); + extern int restore_socket_opts(int sk, SkOptsEntry *soe); ++extern int restore_bound_opts(int sk, SkOptsEntry *soe); + extern void release_skopts(SkOptsEntry *); + extern int restore_prepare_socket(int sk); + extern void preload_socket_modules(void); +diff --git a/criu/sk-inet.c b/criu/sk-inet.c +index f9c64c7..a17fec6 100644 +--- a/criu/sk-inet.c ++++ b/criu/sk-inet.c +@@ -101,19 +101,24 @@ static void show_one_inet(const char *act, const struct inet_sk_desc *sk) + static void show_one_inet_img(const char *act, const InetSkEntry *e) + { + char src_addr[INET_ADDR_LEN] = ""; ++ char dst_addr[INET_ADDR_LEN] = ""; + + if (inet_ntop(e->family, (void *)e->src_addr, src_addr, + INET_ADDR_LEN) == NULL) { + pr_perror("Failed to translate address"); + } ++ if (inet_ntop(e->family, (void *)e->dst_addr, dst_addr, ++ INET_ADDR_LEN) == NULL) { ++ pr_perror("Failed to translate address"); ++ } + + pr_debug("\t%s: family %-10s type %-14s proto %-16s port %d " +- "state %-16s src_addr %s\n", act, ++ "state %-16s src_addr %s dst_addr %s\n", act, + ___socket_family_name(e->family), + ___socket_type_name(e->type), + ___socket_proto_name(e->proto), + e->src_port, ___tcp_state_name(e->state), +- src_addr); ++ src_addr, dst_addr); + } + + static int can_dump_ipproto(int ino, int proto, int type) +@@ -840,6 +845,10 @@ static int open_inet_sk(struct file_desc *d, int *new_fd) + if (restore_opt(sk, SOL_SOCKET, SO_REUSEPORT, &yes)) + goto err; + ++ 
if(restore_bound_opts(sk, ie->opts) < 0){ ++ goto err; ++ } ++ + if (tcp_connection(ie)) { + if (!opts.tcp_established_ok && !opts.tcp_close) { + pr_err("Connected TCP socket in image\n"); +diff --git a/criu/sockets.c b/criu/sockets.c +index 312b55c..4a2df60 100644 +--- a/criu/sockets.c ++++ b/criu/sockets.c +@@ -580,7 +580,6 @@ int restore_socket_opts(int sk, SkOptsEntry *soe) + tv.tv_usec = soe->so_rcv_tmo_usec; + ret |= restore_opt(sk, SOL_SOCKET, SO_RCVTIMEO, &tv); + +- ret |= restore_bound_dev(sk, soe); + ret |= restore_socket_filter(sk, soe); + + /* The restore of SO_REUSEADDR depends on type of socket */ +@@ -588,6 +587,10 @@ int restore_socket_opts(int sk, SkOptsEntry *soe) + return ret; + } + ++int restore_bound_opts(int sk, SkOptsEntry *soe){ ++ return restore_bound_dev(sk, soe); ++} ++ + int do_dump_opt(int sk, int level, int name, void *val, int len) + { + socklen_t aux = len; +-- +2.34.0 + diff --git a/backport-0012--criu-eventpollfd-fix-for-improper-usage-in-appdata.patch b/backport-0012--criu-eventpollfd-fix-for-improper-usage-in-appdata.patch new file mode 100644 index 0000000..72ea3b6 --- /dev/null +++ b/backport-0012--criu-eventpollfd-fix-for-improper-usage-in-appdata.patch @@ -0,0 +1,97 @@ +From 082e5505a994ee435b1f7bd6f967aae60eeeec36 Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 19 May 2021 21:56:38 +0800 +Subject: [PATCH 12/49] criu: eventpollfd fix for improper usage in appdata + +Fix eventpollfd problem of improper usage in appdata. 
+ +Signed-off-by: Jingxian He +Signed-off-by: fu.lin +--- + criu/eventpoll.c | 16 +++++++++++----- + criu/proc_parse.c | 2 ++ + images/eventpoll.proto | 3 +++ + 3 files changed, 16 insertions(+), 5 deletions(-) + +diff --git a/criu/eventpoll.c b/criu/eventpoll.c +index e1384fa..d069a07 100644 +--- a/criu/eventpoll.c ++++ b/criu/eventpoll.c +@@ -67,8 +67,8 @@ int is_eventpoll_link(char *link) + + static void pr_info_eventpoll_tfd(char *action, uint32_t id, EventpollTfdEntry *e) + { +- pr_info("%seventpoll-tfd: id %#08x tfd %8d events %#08x data %#016"PRIx64"\n", +- action, id, e->tfd, e->events, e->data); ++ pr_info("%seventpoll-tfd: id %#08x tfd %8d events %#08x data %#016"PRIx64" ignore %d\n", ++ action, id, e->tfd, e->events, e->data, e->ignore); + } + + static void pr_info_eventpoll(char *action, EventpollFileEntry *e) +@@ -146,9 +146,9 @@ int flush_eventpoll_dinfo_queue(void) + }; + struct kid_elem *t = kid_lookup_epoll_tfd(&fd_tree, &ke, &slot); + if (!t) { +- pr_debug("kid_lookup_epoll: no match pid %d efd %d tfd %d toff %u\n", +- dinfo->pid, dinfo->efd, tfde->tfd, dinfo->toff[i].off); +- goto err; ++ tfde->ignore = 1; ++ pr_info("Drop tfd entry, efd=%d, tfd=%d\n", slot.efd, slot.tfd); ++ continue; + } + + pr_debug("kid_lookup_epoll: rbsearch match pid %d efd %d tfd %d toff %u -> %d\n", +@@ -161,6 +161,7 @@ int flush_eventpoll_dinfo_queue(void) + goto err; + } + ++ pr_info("Change tfd: %d -> %d @ efd=%d\n", tfde->tfd, t->idx, slot.efd); + tfde->tfd = t->idx; + } + +@@ -411,6 +412,11 @@ static int eventpoll_retore_tfd(int fd, int id, EventpollTfdEntry *tdefe) + { + struct epoll_event event; + ++ if (tdefe->ignore) { ++ pr_info_eventpoll_tfd("Ignore ", id, tdefe); ++ return 0; ++ } ++ + pr_info_eventpoll_tfd("Restore ", id, tdefe); + + event.events = tdefe->events; +diff --git a/criu/proc_parse.c b/criu/proc_parse.c +index 974a802..748d02e 100644 +--- a/criu/proc_parse.c ++++ b/criu/proc_parse.c +@@ -1792,10 +1792,12 @@ static int parse_fdinfo_pid_s(int pid, 
int fd, int type, void *arg) + e->has_dev = false; + e->has_inode = false; + e->has_pos = false; ++ e->has_ignore = false; + } else if (ret == 6) { + e->has_dev = true; + e->has_inode = true; + e->has_pos = true; ++ e->has_ignore = true; + } else if (ret < 6) { + eventpoll_tfd_entry__free_unpacked(e, NULL); + goto parse_err; +diff --git a/images/eventpoll.proto b/images/eventpoll.proto +index 4a8d1b8..20c9a15 100644 +--- a/images/eventpoll.proto ++++ b/images/eventpoll.proto +@@ -12,6 +12,9 @@ message eventpoll_tfd_entry { + optional uint32 dev = 5; + optional uint64 inode = 6; + optional uint64 pos = 7; ++ ++ /* entry validation */ ++ optional uint32 ignore = 8; + } + + message eventpoll_file_entry { +-- +2.34.0 + diff --git a/backport-0013--task_exit_notify-add-task-exit-notify-mask-method-fo.patch b/backport-0013--task_exit_notify-add-task-exit-notify-mask-method-fo.patch new file mode 100644 index 0000000..ac86945 --- /dev/null +++ b/backport-0013--task_exit_notify-add-task-exit-notify-mask-method-fo.patch @@ -0,0 +1,143 @@ +From da3127728b594716b2492859133e3dcf9102dd8e Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 1 Dec 2021 11:32:06 +0800 +Subject: [PATCH 13/49] task_exit_notify: add task exit notify mask method for + criu + +Add task exit notify mask method for criu during kernel module upgrade. 
+ +Signed-off-by: Jingxian He +--- + criu/config.c | 1 + + criu/cr-restore.c | 8 ++++++++ + criu/crtools.c | 1 + + criu/include/cr_options.h | 1 + + criu/include/util.h | 5 +++++ + criu/seize.c | 33 ++++++++++++++++++++++++++++++++- + 6 files changed, 48 insertions(+), 1 deletion(-) + +diff --git a/criu/config.c b/criu/config.c +index af4d4e4..61a60af 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -523,6 +523,7 @@ int parse_options(int argc, char **argv, bool *usage_error, + BOOL_OPT("with-notifier", &opts.with_notifier_kup), + BOOL_OPT("with-fd-cred", &opts.with_fd_cred), + BOOL_OPT("dump-char-dev", &opts.dump_char_dev), ++ BOOL_OPT("mask-exit-notify", &opts.mask_exit_notify), + { }, + }; + +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 00100fa..b99deec 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -1492,6 +1492,14 @@ static inline int fork_with_pid(struct pstree_item *item) + item->pid->real, vpid(item)); + } + ++ if (opts.mask_exit_notify) { ++ int mask_pid = ret; ++ pr_info("start unmask for %d\n", mask_pid); ++ ret = mask_task_exit_notify(mask_pid, false); ++ if (ret) ++ pr_err("unmask exit notify fail for: %d\n", mask_pid); ++ } ++ + err_unlock: + if (!(ca.clone_flags & CLONE_NEWPID)) + unlock_last_pid(); +diff --git a/criu/crtools.c b/criu/crtools.c +index 638fb40..faa3bfc 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -398,6 +398,7 @@ usage: + " --with-fd-cred Allow to make the restored process has the same cred\n" + " --dump-char-dev Dump char dev files as normal file with repair cmd\n" + " as checkout assisted by kernel.\n" ++" --mask-exit-notify Mask task exit notify during dump and restore\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index a53468e..7a73ea0 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -150,6 +150,7 @@ struct cr_options { + int 
with_notifier_kup; + int with_fd_cred; + int dump_char_dev; ++ int mask_exit_notify; + }; + + extern struct cr_options opts; +diff --git a/criu/include/util.h b/criu/include/util.h +index d56c905..38aa214 100644 +--- a/criu/include/util.h ++++ b/criu/include/util.h +@@ -425,4 +425,9 @@ void do_notifier_rollback(bool, enum notifier_state); + int parse_devname(void); + bool find_devname(const char *name); + ++#define PID_BUF_SIZE 32 ++#define MASK_EXIT_NOTIFY_DIR "/sys/kernel/mask_exit_notify" ++#define UNMASK_EXIT_NOTIFY_DIR "/sys/kernel/unmask_exit_notify" ++int mask_task_exit_notify(int pid, bool mask); ++ + #endif /* __CR_UTIL_H__ */ +diff --git a/criu/seize.c b/criu/seize.c +index 0e79aba..140a9d4 100644 +--- a/criu/seize.c ++++ b/criu/seize.c +@@ -524,9 +524,35 @@ free: + return ret < 0 ? ret : nr_inprogress; + } + ++int mask_task_exit_notify(int pid, bool mask) ++{ ++ int fd, retval; ++ char buf[PID_BUF_SIZE] = {0}; ++ ++ if (pid <= 0) ++ return -1; ++ ++ snprintf(buf, PID_BUF_SIZE - 1, "%d", pid); ++ if (mask) ++ fd = open(MASK_EXIT_NOTIFY_DIR, O_WRONLY, 0); ++ else ++ fd = open(UNMASK_EXIT_NOTIFY_DIR, O_WRONLY, 0); ++ if (fd < 0) { ++ pr_err("open mask exit notify file fail\n"); ++ return fd; ++ } ++ ++ retval = write(fd, buf, PID_BUF_SIZE); ++ if (retval < 0) ++ pr_err("Write mask exit pid: %s fail\n", buf); ++ close(fd); ++ ++ return retval < 0 ? 
-1 : 0; ++} ++ + static void unseize_task_and_threads(const struct pstree_item *item, int st) + { +- int i; ++ int i, ret; + + if (item->pid->state == TASK_DEAD) + return; +@@ -535,6 +561,11 @@ static void unseize_task_and_threads(const struct pstree_item *item, int st) + for (i = 0; i < item->nr_threads; i++) + dump_task_special_pages(item->threads[i].real); + } ++ if (opts.mask_exit_notify) { ++ ret = mask_task_exit_notify(item->threads[0].real, true); ++ if (ret) ++ pr_err("mask exit notify for %d fail.\n", item->threads[0].real); ++ } + + /* + * The st is the state we want to switch tasks into, +-- +2.34.0 + diff --git a/backport-0014--selinux-fix-selinux-context-lable-check.patch b/backport-0014--selinux-fix-selinux-context-lable-check.patch new file mode 100644 index 0000000..1f3ae69 --- /dev/null +++ b/backport-0014--selinux-fix-selinux-context-lable-check.patch @@ -0,0 +1,52 @@ +From 8f91849ca116cb77b1e36eded7471ce356bf61c7 Mon Sep 17 00:00:00 2001 +From: Xiaoguang Li +Date: Tue, 25 May 2021 02:40:30 +0000 +Subject: [PATCH 14/49] selinux: fix selinux context lable check + +Background: + SELinux has three status: disabled, permissive, and enforcing. + If the status of the SELinux wasn't disabled, it would configure + the rules using `/etc/selinux/targeted`. However, because of the + non-existed rules in `/etc/selinux/targeted`, the security lable + of processes is `kernel` instead of + `unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023` readed + from `/proc//attr/current`. It will result the failure of + criu dumping. 
+ +Signed-off-by: lixiaoguang2 +Signed-off-by: fu.lin +--- + criu/lsm.c | 12 +++++++++++- + 1 file changed, 11 insertions(+), 1 deletion(-) + +diff --git a/criu/lsm.c b/criu/lsm.c +index 9d7e55c..c979d23 100644 +--- a/criu/lsm.c ++++ b/criu/lsm.c +@@ -78,12 +78,22 @@ static int selinux_get_label(pid_t pid, char **output) + if (!*output) + goto err; + ++ pos = (char*)ctx; ++ /* ++ * If the SElinux context is not configured, the label maybe look like ++ * this: ++ * "kernel" ++ */ ++ if (!strstr(pos, ":")) { ++ ret = 0; ++ goto err; ++ } ++ + /* + * Make sure it is a valid SELinux label. It should look like this: + * + * unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 + */ +- pos = (char*)ctx; + for (i = 0; i < 3; i++) { + pos = strstr(pos, ":"); + if (!pos) { +-- +2.34.0 + diff --git a/backport-0015--unix-socket-add-support-for-unix-stream-socket.patch b/backport-0015--unix-socket-add-support-for-unix-stream-socket.patch new file mode 100644 index 0000000..a65af1f --- /dev/null +++ b/backport-0015--unix-socket-add-support-for-unix-stream-socket.patch @@ -0,0 +1,269 @@ +From ee65205c0000540b676e6ce42d3ea3b64875228a Mon Sep 17 00:00:00 2001 +From: luolongjun +Date: Wed, 1 Dec 2021 11:35:52 +0800 +Subject: [PATCH 15/49] unix socket: add support for unix stream socket + +When dump unix stream socket with external connections, +we will tell kernel to turn repair mode on for this sock. +And then kernel will keep this sock before restoring it. +In this process, the other socket which communicates with +this sock in repair mode will get EAGAIN or blocked. + +Signed-off-by: Luo Longjun + +fix unix socket dump and restore err + +Fix name-less unix socket dump and restore problem. + +Signed-off-by: Jingxian He + +unix socket:ignore repair error from kernel + +leave error for applications to deal with. 
+ +Signed-off-by: Luo Longjun + +update +--- + criu/cr-dump.c | 1 + + criu/include/sockets.h | 1 + + criu/sk-unix.c | 105 +++++++++++++++++++++++++++++++++++++---- + images/sk-unix.proto | 2 + + 4 files changed, 100 insertions(+), 9 deletions(-) + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index c09d8bd..882b563 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -1699,6 +1699,7 @@ static int cr_dump_finish(int ret) + + cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret); + cgp_fini(); ++ unix_stream_unlock(ret); + + if (!ret) { + /* +diff --git a/criu/include/sockets.h b/criu/include/sockets.h +index e647f3a..f60e7fc 100644 +--- a/criu/include/sockets.h ++++ b/criu/include/sockets.h +@@ -43,6 +43,7 @@ extern int add_fake_unix_queuers(void); + extern int fix_external_unix_sockets(void); + extern int prepare_scms(void); + extern int unix_note_scm_rights(int id_for, uint32_t *file_ids, int *fds, int n_ids); ++extern void unix_stream_unlock(int ret); + + extern struct collect_image_info netlink_sk_cinfo; + +diff --git a/criu/sk-unix.c b/criu/sk-unix.c +index f0620e6..049dc84 100644 +--- a/criu/sk-unix.c ++++ b/criu/sk-unix.c +@@ -72,6 +72,7 @@ struct unix_sk_desc { + char *name; + unsigned int nr_icons; + unsigned int *icons; ++ int repair_ino; + + unsigned int vfs_dev; + unsigned int vfs_ino; +@@ -92,6 +93,11 @@ struct unix_sk_desc { + UnixSkEntry *ue; + }; + ++struct unix_stream_extern_socket_desc { ++ struct list_head list; ++ int fd; ++}; ++ + /* + * The mutex_ghost is accessed from different tasks, + * so make sure it is in shared memory. 
+@@ -99,6 +105,7 @@ struct unix_sk_desc { + static mutex_t *mutex_ghost; + + static LIST_HEAD(unix_sockets); ++static LIST_HEAD(unix_stream_external_sockets); + static LIST_HEAD(unix_ghost_addr); + + static int unix_resolve_name(int lfd, uint32_t id, struct unix_sk_desc *d, +@@ -117,6 +124,26 @@ struct unix_sk_listen_icon { + + static struct unix_sk_listen_icon *unix_listen_icons[SK_HASH_SIZE]; + ++static int unix_stream_repair_on(int fd) ++{ ++ int ret, aux = 1; ++ ret = setsockopt(fd, SOL_TCP, TCP_REPAIR_OPTIONS, &aux, sizeof(aux)); ++ if (ret < 0) ++ pr_err("Can't turn repair mod for unix stream on. \n"); ++ ++ return ret; ++} ++ ++static int unix_stream_repair_off(int fd) ++{ ++ int ret, aux = 0; ++ ret = setsockopt(fd, SOL_TCP, TCP_REPAIR_OPTIONS, &aux, sizeof(aux)); ++ if (ret < 0) ++ pr_err("Can't turn repair mod for unix stream off. \n"); ++ ++ return ret; ++} ++ + static struct unix_sk_listen_icon *lookup_unix_listen_icons(unsigned int peer_ino) + { + struct unix_sk_listen_icon *ic; +@@ -338,6 +365,8 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) + FilePermsEntry *perms; + FownEntry *fown; + void *m; ++ unsigned int len; ++ int ret; + + m = xmalloc(sizeof(UnixSkEntry) + + sizeof(SkOptsEntry) + +@@ -431,6 +460,31 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) + goto err; + } + ++ if (peer->name && ue->type == SOCK_STREAM) { ++ struct unix_stream_extern_socket_desc *d; ++ ++ /* Attention: used for upgrade in the same machine ++ * May in conflict with original usage ++ */ ++ pr_info("set %d unix stream repair on \n", lfd); ++ ret = unix_stream_repair_on(lfd); ++ if (ret < 0) ++ goto err; ++ ++ d = xzalloc(sizeof(*d)); ++ if (!d) ++ goto err; ++ ++ d->fd = dup(lfd); ++ pr_info("add %d into unix_stream_external_sockets \n", d->fd); ++ list_add_tail(&d->list, &unix_stream_external_sockets); ++ ++ len = sizeof(ue->repair_ino); ++ ret = getsockopt(lfd, SOL_TCP, TCP_REPAIR_OPTIONS, &ue->repair_ino, 
&len); ++ if (ret < 0) ++ goto err; ++ } ++ + /* + * Peer should have us as peer or have a name by which + * we can access one. +@@ -810,16 +864,18 @@ static int __dump_external_socket(struct unix_sk_desc *sk, + return -1; + } + +- if (peer->type != SOCK_DGRAM) { +- show_one_unix("Ext stream not supported", peer); +- pr_err("Can't dump half of stream unix connection.\n"); ++ if (peer->type != SOCK_DGRAM && ++ peer->type != SOCK_STREAM) { ++ show_one_unix("Ext unix type not supported", peer); ++ pr_err("Can't dump this kind of unix connection.\n"); + return -1; + } + +- if (!peer->name) { ++ /* part 1: prevent NULL pointer oops */ ++ if (!peer->name && !sk->name) { + show_one_unix("Ext dgram w/o name", peer); ++ show_one_unix("Ext dgram w/o name", sk); + pr_err("Can't dump name-less external socket.\n"); +- pr_err("%d\n", sk->fd); + return -1; + } + +@@ -866,7 +922,7 @@ int fix_external_unix_sockets(void) + + fd_id_generate_special(NULL, &e.id); + e.ino = sk->sd.ino; +- e.type = SOCK_DGRAM; ++ e.type = sk->type; + e.state = TCP_LISTEN; + e.name.data = (void *)sk->name; + e.name.len = (size_t)sk->namelen; +@@ -893,6 +949,19 @@ err: + return -1; + } + ++void unix_stream_unlock(int ret) ++{ ++ struct unix_stream_extern_socket_desc *d; ++ pr_debug("Unlocking unix stream sockets\n"); ++ list_for_each_entry(d, &unix_stream_external_sockets, list) { ++ if (ret) { ++ pr_debug("unlock fd %d \n", d->fd); ++ unix_stream_repair_off(d->fd); ++ } ++ close_safe(&d->fd); ++ } ++} ++ + struct unix_sk_info { + UnixSkEntry *ue; + struct list_head list; +@@ -1263,6 +1332,7 @@ static int post_open_standalone(struct file_desc *d, int fd) + struct unix_sk_info *peer; + struct sockaddr_un addr; + int cwd_fd = -1, root_fd = -1, ns_fd = -1; ++ int ret, value; + + ui = container_of(d, struct unix_sk_info, d); + BUG_ON((ui->flags & (USK_PAIR_MASTER | USK_PAIR_SLAVE)) || +@@ -1320,7 +1390,22 @@ static int post_open_standalone(struct file_desc *d, int fd) + * while we're connecting in sake of 
ghost sockets. + */ + mutex_lock(mutex_ghost); +- if (connect(fd, (struct sockaddr *)&addr, sizeof(addr.sun_family) + len) < 0) { ++ ++ /* we handle unix stream with external connections here */ ++ if (peer->name && ui->ue->type == SOCK_STREAM) { ++ value = ui->ue->repair_ino; ++ ret = setsockopt(fd, SOL_TCP, TCP_REPAIR, &value, sizeof(value)); ++ if (ret < 0) { ++ /* permit the unix sk resume successfully when the peer has been ++ * closed, just warn here */ ++ pr_warn("Can't repair %d socket\n", value); ++ } ++ ++ ret = unix_stream_repair_off(fd); ++ if (ret < 0) { ++ goto err_revert_and_exit; ++ } ++ } else if (connect(fd, (struct sockaddr *)&addr, sizeof(addr.sun_family) + len) < 0) { + pr_perror("Can't connect %d socket", ui->ue->ino); + goto err_revert_and_exit; + } +@@ -2022,8 +2107,10 @@ static int init_unix_sk_info(struct unix_sk_info *ui, UnixSkEntry *ue) + } + + ui->name = (void *)ue->name.data; +- } else +- ui->name = NULL; ++ } else { ++ /* part 2: prevent NULL pointer oops */ ++ ui->name = ""; ++ } + ui->name_dir = (void *)ue->name_dir; + + ui->flags = 0; +diff --git a/images/sk-unix.proto b/images/sk-unix.proto +index c59644f..0226d9a 100644 +--- a/images/sk-unix.proto ++++ b/images/sk-unix.proto +@@ -51,4 +51,6 @@ message unix_sk_entry { + + optional uint32 ns_id = 16; + optional sint32 mnt_id = 17 [default = -1]; ++ /* Please, don't use field with number 18. 
*/ ++ required sint32 repair_ino = 19; + } +-- +2.34.0 + diff --git a/backport-0016--save-and-restore-sigev_notify_thread_id.patch b/backport-0016--save-and-restore-sigev_notify_thread_id.patch new file mode 100644 index 0000000..d5e935f --- /dev/null +++ b/backport-0016--save-and-restore-sigev_notify_thread_id.patch @@ -0,0 +1,98 @@ +From 503b2e5bf5ad2aa6cf21567a5ddddbfdbfcb6ca7 Mon Sep 17 00:00:00 2001 +From: Liu Chao +Date: Mon, 28 Jun 2021 08:17:26 +0000 +Subject: [PATCH 16/49] save and restore sigev_notify_thread_id + +When sigev_notify_thread_id is not set, get_pid will return a NULL +pointer and do_timer_create will return -EINVAL in kernel. So criu +will fail to create posix timer: + +(09.806760) pie: 41301: Error (criu/pie/restorer.c:1998): Can't restore posix timers -22 +(09.806824) pie: 41301: Error (criu/pie/restorer.c:2133): Restorer fail 41301 +(09.891880) Error (criu/cr-restore.c:2596): Restoring FAILED. + +Signed-off-by: Liu Chao +--- + criu/cr-restore.c | 1 + + criu/include/posix-timer.h | 1 + + criu/parasite-syscall.c | 1 + + criu/pie/restorer.c | 1 + + criu/proc_parse.c | 1 + + images/timer.proto | 1 + + 6 files changed, 6 insertions(+) + +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index b99deec..db98723 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -2730,6 +2730,7 @@ static inline int decode_posix_timer(PosixTimerEntry *pte, + pt->spt.si_signo = pte->si_signo; + pt->spt.it_sigev_notify = pte->it_sigev_notify; + pt->spt.sival_ptr = decode_pointer(pte->sival_ptr); ++ pt->spt.sigev_notify_thread_id = pte->sigev_notify_thread_id; + pt->overrun = pte->overrun; + + return 0; +diff --git a/criu/include/posix-timer.h b/criu/include/posix-timer.h +index fa99d86..11b7618 100644 +--- a/criu/include/posix-timer.h ++++ b/criu/include/posix-timer.h +@@ -8,6 +8,7 @@ struct str_posix_timer { + int clock_id; + int si_signo; + int it_sigev_notify; ++ int sigev_notify_thread_id; + void * sival_ptr; + }; + +diff --git 
a/criu/parasite-syscall.c b/criu/parasite-syscall.c +index e5a8194..cc7df77 100644 +--- a/criu/parasite-syscall.c ++++ b/criu/parasite-syscall.c +@@ -307,6 +307,7 @@ static void encode_posix_timer(struct posix_timer *v, + pte->si_signo = vp->spt.si_signo; + pte->it_sigev_notify = vp->spt.it_sigev_notify; + pte->sival_ptr = encode_pointer(vp->spt.sival_ptr); ++ pte->sigev_notify_thread_id = vp->spt.sigev_notify_thread_id; + + pte->overrun = v->overrun; + +diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c +index 808e862..f7eb4a3 100644 +--- a/criu/pie/restorer.c ++++ b/criu/pie/restorer.c +@@ -1224,6 +1224,7 @@ static int create_posix_timers(struct task_restore_args *args) + sev.sigev_notify = args->posix_timers[i].spt.it_sigev_notify; + sev.sigev_signo = args->posix_timers[i].spt.si_signo; + sev.sigev_value.sival_ptr = args->posix_timers[i].spt.sival_ptr; ++ sev._sigev_un._tid = args->posix_timers[i].spt.sigev_notify_thread_id; + + while (1) { + ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &next_id); +diff --git a/criu/proc_parse.c b/criu/proc_parse.c +index 748d02e..117b696 100644 +--- a/criu/proc_parse.c ++++ b/criu/proc_parse.c +@@ -2239,6 +2239,7 @@ int parse_posix_timers(pid_t pid, struct proc_posix_timers_stat *args) + + if ( tidpid[0] == 't') { + timer->spt.it_sigev_notify = SIGEV_THREAD_ID; ++ timer->spt.sigev_notify_thread_id = pid_t; + } else { + switch (sigpid[0]) { + case 's' : +diff --git a/images/timer.proto b/images/timer.proto +index a254a6f..41db460 100644 +--- a/images/timer.proto ++++ b/images/timer.proto +@@ -19,6 +19,7 @@ message posix_timer_entry { + required uint64 insec = 8; + required uint64 vsec = 9; + required uint64 vnsec = 10; ++ required int32 sigev_notify_thread_id = 11; + } + + message task_timers_entry { +-- +2.34.0 + diff --git a/backport-0017--sysvshm-add-dump-restore-sysv-shm-in-host-ipc-ns.patch b/backport-0017--sysvshm-add-dump-restore-sysv-shm-in-host-ipc-ns.patch new file mode 100644 index 
0000000..74592aa --- /dev/null +++ b/backport-0017--sysvshm-add-dump-restore-sysv-shm-in-host-ipc-ns.patch @@ -0,0 +1,116 @@ +From c32b64eec010ba1e2d23ea566fa1f388030f0629 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Sat, 26 Jun 2021 15:18:15 +0800 +Subject: [PATCH 17/49] sysvshm: add dump/restore sysv-shm in host ipc ns + +In original criu design, SysVIPC memory segment, which belongs +to host ipcns, shouldn't be dumped because criu requires the +whole ipcns to be dumped. During the restoring ipcns, the new +shared memory will be created, and fill the original page data +in it. + +This patch makes the shared-memory in host ipcns restore possible. +Idea: + The SysVIPC memory won't disappear after the task exit. The basic +information can be got from `/proc/sysvipc/shm` as long as the +system doesn't reboot. Compared with restoring the whole ipcns, +the processes of the shared memory creating and page data filling +are ignored. + +Reference: +- https://www.criu.org/What_cannot_be_checkpointed + +Signed-off-by: fu.lin +--- + criu/cr-dump.c | 9 ++++----- + criu/cr-restore.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 50 insertions(+), 5 deletions(-) + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index 882b563..b1ed2a2 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -436,12 +436,11 @@ static int dump_filemap(struct vma_area *vma_area, int fd) + + static int check_sysvipc_map_dump(pid_t pid, VmaEntry *vma) + { +- if (root_ns_mask & CLONE_NEWIPC) +- return 0; ++ if (!(root_ns_mask & CLONE_NEWIPC)) ++ pr_info("Task %d with SysVIPC shmem map @%"PRIx64" lives in host IPC ns\n", ++ pid, vma->start); + +- pr_err("Task %d with SysVIPC shmem map @%"PRIx64" doesn't live in IPC ns\n", +- pid, vma->start); +- return -1; ++ return 0; + } + + static int get_task_auxv(pid_t pid, MmEntry *mm) +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index db98723..1cb5ce2 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -1737,6 +1737,49 @@ 
static int create_children_and_session(void) + return 0; + } + ++static int prepare_rootns_sysv_shm(unsigned long clone_flags) ++{ ++ int retval = 0; ++ char *line = NULL; ++ size_t len = 0; ++ FILE *fp; ++ key_t key; ++ int shmid; ++ mode_t mode; ++ size_t size; ++ ++ /* This is completed by `prepare_namespace()` */ ++ if (!!(clone_flags & CLONE_NEWIPC)) ++ return 0; ++ ++ pr_info("Restoring SYSV shm in host namespace\n"); ++ ++ fp = fopen("/proc/sysvipc/shm", "r"); ++ if (fp == NULL) { ++ pr_err("Can't open '/proc/sysvipc/shm', errno(%d): %s\n", errno, strerror(errno)); ++ return -1; ++ } ++ ++#if BITS_PER_LONG <= 32 ++# define SIZE_SPEC "%10lu" ++#else ++# define SIZE_SPEC "%21lu" ++#endif ++ ++ while (getline(&line, &len, fp) != -1) { ++ if (sscanf(line, "%10d %10d %4o" SIZE_SPEC, &key, &shmid, &mode, &size) != 4) ++ continue; ++ ++ retval = collect_sysv_shmem(shmid, size); ++ if (retval != 0) ++ goto out; ++ } ++ ++out: ++ fclose(fp); ++ return retval; ++} ++ + static int restore_task_with_children(void *_arg) + { + struct cr_clone_arg *ca = _arg; +@@ -1836,6 +1879,9 @@ static int restore_task_with_children(void *_arg) + if (prepare_namespace(current, ca->clone_flags)) + goto err; + ++ if (prepare_rootns_sysv_shm(ca->clone_flags)) ++ goto err; ++ + if (restore_finish_ns_stage(CR_STATE_PREPARE_NAMESPACES, CR_STATE_FORKING) < 0) + goto err; + +-- +2.34.0 + diff --git a/backport-0018--add-netlink-repair-modes.patch b/backport-0018--add-netlink-repair-modes.patch new file mode 100644 index 0000000..f32b825 --- /dev/null +++ b/backport-0018--add-netlink-repair-modes.patch @@ -0,0 +1,45 @@ +From b752c7cae0302143ef3c93a23ad613bf0f9bcd4d Mon Sep 17 00:00:00 2001 +From: Xiaoguang Li +Date: Mon, 29 Mar 2021 20:58:28 -0400 +Subject: [PATCH 18/49] add netlink repair modes + +--- + criu/sk-netlink.c | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + +diff --git a/criu/sk-netlink.c b/criu/sk-netlink.c +index e4163f5..28ee892 100644 +--- a/criu/sk-netlink.c ++++ 
b/criu/sk-netlink.c +@@ -68,6 +68,17 @@ int netlink_receive_one(struct nlmsghdr *hdr, struct ns_id *ns, void *arg) + return sk_collect_one(m->ndiag_ino, PF_NETLINK, &sd->sd, ns); + } + ++static int netlink_repair_on(int fd) ++{ ++ int ret, aux = 1; ++ ++ ret = setsockopt(fd, SOL_NETLINK, TCP_REPAIR, &aux, sizeof(aux)); ++ if (ret < 0) ++ pr_err("Can't turn netlink repair mode ON, error: %d\n", ret); ++ ++ return ret; ++} ++ + static bool can_dump_netlink_sk(int lfd) + { + int ret; +@@ -90,6 +101,10 @@ static int dump_one_netlink_fd(int lfd, u32 id, const struct fd_parms *p) + if (IS_ERR(sk)) + goto err; + ++ if (netlink_repair_on(lfd) < 0) { ++ goto err; ++ } ++ + ne.id = id; + ne.ino = p->stat.st_ino; + +-- +2.34.0 + diff --git a/backport-0019--ignore-special-page-dump.patch b/backport-0019--ignore-special-page-dump.patch new file mode 100644 index 0000000..b6fa62a --- /dev/null +++ b/backport-0019--ignore-special-page-dump.patch @@ -0,0 +1,84 @@ +From a7d2042e02293931499c56b43902bb6d1c42de27 Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 1 Dec 2021 11:43:33 +0800 +Subject: [PATCH 19/49] ignore special page dump + +The special page dump takes too long when the +thread count is very large, and the special page dump +is not always useful. +Provide an option to ignore the special page dump. 
+ +Signed-off-by: Jingxian He +--- + criu/config.c | 1 + + criu/cr-dump.c | 2 +- + criu/crtools.c | 1 + + criu/include/cr_options.h | 1 + + criu/seize.c | 2 +- + 5 files changed, 5 insertions(+), 2 deletions(-) + +diff --git a/criu/config.c b/criu/config.c +index 61a60af..51c2cdc 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -524,6 +524,7 @@ int parse_options(int argc, char **argv, bool *usage_error, + BOOL_OPT("with-fd-cred", &opts.with_fd_cred), + BOOL_OPT("dump-char-dev", &opts.dump_char_dev), + BOOL_OPT("mask-exit-notify", &opts.mask_exit_notify), ++ BOOL_OPT("ignore-special-dump", &opts.ignore_special_dump), + { }, + }; + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index b1ed2a2..8cdc0a7 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -1763,7 +1763,7 @@ static int cr_dump_finish(int ret) + + close_service_fd(CR_PROC_FD_OFF); + +- if (ret == 0 && opts.pin_memory) { ++ if (ret == 0 && opts.pin_memory && !opts.ignore_special_dump) { + pr_info("start restore_task_special_pages\n"); + restore_task_special_pages(0); + } +diff --git a/criu/crtools.c b/criu/crtools.c +index faa3bfc..e88232a 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -399,6 +399,7 @@ usage: + " --dump-char-dev Dump char dev files as normal file with repair cmd\n" + " as checkout assisted by kernel.\n" + " --mask-exit-notify Mask task exit notify during dump and restore\n" ++" --ignore-special-dump Ignore special task tid page dump\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 7a73ea0..b394591 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -151,6 +151,7 @@ struct cr_options { + int with_fd_cred; + int dump_char_dev; + int mask_exit_notify; ++ int ignore_special_dump; + }; + + extern struct cr_options opts; +diff --git a/criu/seize.c b/criu/seize.c +index 140a9d4..73baf40 100644 +--- a/criu/seize.c ++++ 
b/criu/seize.c +@@ -557,7 +557,7 @@ static void unseize_task_and_threads(const struct pstree_item *item, int st) + if (item->pid->state == TASK_DEAD) + return; + +- if (opts.pin_memory) { ++ if (opts.pin_memory && !opts.ignore_special_dump) { + for (i = 0; i < item->nr_threads; i++) + dump_task_special_pages(item->threads[i].real); + } +-- +2.34.0 + diff --git a/backport-0020--add-O_REPAIR-flag-to-vma-fd.patch b/backport-0020--add-O_REPAIR-flag-to-vma-fd.patch new file mode 100644 index 0000000..b5fc3a4 --- /dev/null +++ b/backport-0020--add-O_REPAIR-flag-to-vma-fd.patch @@ -0,0 +1,45 @@ +From 462af9f365045febd97914c83f4437ce629e1958 Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 1 Dec 2021 11:45:50 +0800 +Subject: [PATCH 20/49] add O_REPAIR flag to vma fd + +Add O_REPAIR flag when opening vma fd. + +Signed-off-by: Jingxian He +--- + criu/files-reg.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/criu/files-reg.c b/criu/files-reg.c +index c7a4d1e..36fd9f9 100644 +--- a/criu/files-reg.c ++++ b/criu/files-reg.c +@@ -2236,6 +2236,7 @@ void filemap_ctx_fini(void) + } + } + ++#define O_REPAIR 040000000 + static int open_filemap(int pid, struct vma_area *vma) + { + u32 flags; +@@ -2248,7 +2249,7 @@ static int open_filemap(int pid, struct vma_area *vma) + */ + + BUG_ON((vma->vmfd == NULL) || !vma->e->has_fdflags); +- flags = vma->e->fdflags; ++ flags = vma->e->fdflags | O_REPAIR; + + if (ctx.flags != flags || ctx.desc != vma->vmfd) { + ret = open_path(vma->vmfd, do_open_reg_noseek_flags, &flags); +@@ -2260,6 +2261,8 @@ static int open_filemap(int pid, struct vma_area *vma) + ctx.flags = flags; + ctx.desc = vma->vmfd; + ctx.fd = ret; ++ } else { ++ ret = open_path(vma->vmfd, do_open_reg_noseek_flags, &flags); + } + + ctx.vma = vma; +-- +2.34.0 + diff --git a/backport-0021--file-lock-add-repair-mode-to-dump-file-locks.patch b/backport-0021--file-lock-add-repair-mode-to-dump-file-locks.patch new file mode 100644 index 0000000..fac4987 
--- /dev/null +++ b/backport-0021--file-lock-add-repair-mode-to-dump-file-locks.patch @@ -0,0 +1,307 @@ +From c9375dba53f57e75119752e5f98eeb7f709befb8 Mon Sep 17 00:00:00 2001 +From: Sang Yan +Date: Wed, 1 Dec 2021 11:48:08 +0800 +Subject: [PATCH 21/49] file-lock: add repair mode to dump file locks + +Add a new option "--file-locks-repair" to enable repair mode +while dumping file locks. +Repair mode keeps locks held when the process is killed during the +dumping operation, then resumes the locks from repair mode when the +process resumes. + +Signed-off-by: Sang Yan +--- + criu/config.c | 1 + + criu/cr-dump.c | 8 ++++++ + criu/crtools.c | 1 + + criu/file-lock.c | 10 +++++++ + criu/include/cr_options.h | 1 + + criu/include/fcntl.h | 7 +++++ + criu/include/parasite-syscall.h | 2 ++ + criu/include/parasite.h | 10 +++++++ + criu/parasite-syscall.c | 33 ++++++++++++++++++++++ + criu/pie/parasite.c | 50 +++++++++++++++++++++++++++++++++ + 10 files changed, 123 insertions(+) + +diff --git a/criu/config.c b/criu/config.c +index 51c2cdc..00a791c 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -525,6 +525,7 @@ int parse_options(int argc, char **argv, bool *usage_error, + BOOL_OPT("dump-char-dev", &opts.dump_char_dev), + BOOL_OPT("mask-exit-notify", &opts.mask_exit_notify), + BOOL_OPT("ignore-special-dump", &opts.ignore_special_dump), ++ BOOL_OPT("file-locks-repair", &opts.file_locks_repair), + { }, + }; + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index 8cdc0a7..e6608dc 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -1392,6 +1392,14 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + goto err_cure; + } + ++ if (opts.file_locks_repair) { ++ ret = parasite_dump_file_locks(parasite_ctl, pid); ++ if (ret) { ++ pr_err("Can't parasite dump file locks (pid: %d)\n", pid); ++ goto err_cure; ++ } ++ } ++ + ret = dump_task_core_all(parasite_ctl, item, &pps_buf, cr_imgset, &misc); + if (ret) { + pr_err("Dump core (pid: %d) failed with %d\n", pid, ret); 
+diff --git a/criu/crtools.c b/criu/crtools.c +index e88232a..4b604a5 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -400,6 +400,7 @@ usage: + " as checkout assisted by kernel.\n" + " --mask-exit-notify Mask task exit notify during dump and restore\n" + " --ignore-special-dump Ignore special task tid page dump\n" ++" --file-locks-repair Use repair mode to dump and restore file locks\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/file-lock.c b/criu/file-lock.c +index 8be7589..44ecc92 100644 +--- a/criu/file-lock.c ++++ b/criu/file-lock.c +@@ -428,6 +428,8 @@ void discard_dup_locks_tail(pid_t pid, int fd) + list_for_each_entry_safe_reverse(fl, p, &file_lock_list, list) { + if (fl->owners_fd != fd || pid != fl->fl_holder) + break; ++ if (fl->fl_kind == FL_POSIX) ++ continue; + + list_del(&fl->list); + xfree(fl); +@@ -618,8 +620,12 @@ static int restore_file_lock(FileLockEntry *fle) + cmd = fle->type; + } else if (fle->type == F_RDLCK) { + cmd = LOCK_SH; ++ if (opts.file_locks_repair) ++ cmd = LOCK_REPAIR; + } else if (fle->type == F_WRLCK) { + cmd = LOCK_EX; ++ if (opts.file_locks_repair) ++ cmd = LOCK_REPAIR; + } else if (fle->type == F_UNLCK) { + cmd = LOCK_UN; + } else { +@@ -645,6 +651,10 @@ static int restore_file_lock(FileLockEntry *fle) + flk.l_pid = fle->pid; + flk.l_type = fle->type; + ++ if (opts.file_locks_repair) ++ if (fle->type == F_RDLCK || fle->type == F_WRLCK) ++ flk.l_type = F_REPAIR; ++ + pr_info("(posix)flag: %d, type: %d, pid: %d, fd: %d, " + "start: %8"PRIx64", len: %8"PRIx64"\n", + fle->flag, fle->type, fle->pid, fle->fd, +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index b394591..6824b27 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -152,6 +152,7 @@ struct cr_options { + int dump_char_dev; + int mask_exit_notify; + int ignore_special_dump; ++ int file_locks_repair; + }; + + extern struct cr_options 
opts; +diff --git a/criu/include/fcntl.h b/criu/include/fcntl.h +index 3abcd51..2408b9b 100644 +--- a/criu/include/fcntl.h ++++ b/criu/include/fcntl.h +@@ -23,6 +23,13 @@ struct f_owner_ex { + #define F_SETCRED 18 + #endif + ++#ifndef F_NEED_REPAIR ++#define F_NEED_REPAIR 16 ++#define F_REPAIR 32 ++#define LOCK_NEED_REPAIR 256 /* REPAIRING lock */ ++#define LOCK_REPAIR 512 /* REPAIR lock */ ++#endif ++ + /* + * These things are required to compile on CentOS-6 + */ +diff --git a/criu/include/parasite-syscall.h b/criu/include/parasite-syscall.h +index afba95a..e8db90b 100644 +--- a/criu/include/parasite-syscall.h ++++ b/criu/include/parasite-syscall.h +@@ -52,4 +52,6 @@ extern int parasite_dump_cgroup(struct parasite_ctl *ctl, struct parasite_dump_c + + extern struct parasite_tty_args *parasite_dump_tty(struct parasite_ctl *ctl, int fd, int type); + ++extern int parasite_dump_file_locks(struct parasite_ctl *ctl, int pid); ++ + #endif /* __CR_PARASITE_SYSCALL_H__ */ +diff --git a/criu/include/parasite.h b/criu/include/parasite.h +index d957094..1c702f0 100644 +--- a/criu/include/parasite.h ++++ b/criu/include/parasite.h +@@ -35,6 +35,7 @@ enum { + PARASITE_CMD_CHECK_VDSO_MARK, + PARASITE_CMD_CHECK_AIOS, + PARASITE_CMD_DUMP_CGROUP, ++ PARASITE_CMD_DUMP_FILELOCKS, + + PARASITE_CMD_MAX, + }; +@@ -236,6 +237,15 @@ struct parasite_dump_cgroup_args { + char contents[1 << 12]; + }; + ++struct parasite_dump_filelocks_args { ++ short kind; ++ short type; ++ long start; ++ long len; ++ int pid; ++ int fd; ++}; ++ + #endif /* !__ASSEMBLY__ */ + + #endif /* __CR_PARASITE_H__ */ +diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c +index cc7df77..34c7c51 100644 +--- a/criu/parasite-syscall.c ++++ b/criu/parasite-syscall.c +@@ -32,6 +32,7 @@ + #include + #include "signal.h" + #include "sigframe.h" ++#include "file-lock.h" + + #include + #include +@@ -577,3 +578,35 @@ struct parasite_ctl *parasite_infect_seized(pid_t pid, struct pstree_item *item, + + return ctl; + } ++ 
++int parasite_dump_file_locks(struct parasite_ctl *ctl, int pid) ++{ ++ struct parasite_dump_filelocks_args *args; ++ struct file_lock *fl; ++ int ret; ++ ++ args = compel_parasite_args(ctl, struct parasite_dump_filelocks_args); ++ ++ list_for_each_entry(fl, &file_lock_list, list) { ++ if (fl->real_owner != pid) ++ continue; ++ ++ args->pid = fl->real_owner; ++ args->fd = fl->owners_fd; ++ args->kind = fl->fl_kind; ++ args->type = fl->fl_ltype; ++ args->start = fl->start; ++ if (!strncmp(fl->end, "EOF", 3)) ++ args->len = 0; ++ else ++ args->len = (atoll(fl->end) + 1) - fl->start; ++ ++ ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_FILELOCKS, ctl); ++ if (ret < 0) { ++ pr_err("Parasite dump file lock failed! (pid: %d)\n", pid); ++ return ret; ++ } ++ } ++ ++ return 0; ++} +diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c +index 387a976..b31341b 100644 +--- a/criu/pie/parasite.c ++++ b/criu/pie/parasite.c +@@ -7,6 +7,8 @@ + #include + #include + #include ++#include ++#include + + #include "common/config.h" + #include "int.h" +@@ -20,6 +22,7 @@ + #include "criu-log.h" + #include "tty.h" + #include "aio.h" ++#include "file-lock.h" + + #include "asm/parasite.h" + #include "restorer.h" +@@ -654,6 +657,50 @@ static int parasite_dump_cgroup(struct parasite_dump_cgroup_args *args) + return 0; + } + ++static int set_filelocks_needrepair(struct parasite_dump_filelocks_args *args) ++{ ++ int ret; ++ ++ if (args->kind == FL_FLOCK) { ++ if (args->type == F_RDLCK || args->type == F_WRLCK) { ++ int cmd = LOCK_NEED_REPAIR; ++ ++ pr_info("Need Repair flock kind: %d, type: %d, cmd: %d, pid: %d, fd: %d\n", ++ args->kind, args->type, cmd, args->pid, args->fd); ++ ++ ret = sys_flock(args->fd, cmd); ++ if (ret < 0) { ++ pr_err("Can not set NEED_REPAIR flock!\n"); ++ return ret; ++ } ++ } ++ } else if (args->kind == FL_POSIX) { ++ if (args->type == F_RDLCK || args->type == F_WRLCK) { ++ struct flock flk; ++ memset(&flk, 0, sizeof(flk)); ++ ++ flk.l_whence = SEEK_SET; ++ flk.l_start 
= args->start; ++ flk.l_len = args->len; ++ flk.l_pid = args->pid; ++ flk.l_type = F_NEED_REPAIR; ++ ++ pr_info("Need Repair posix lock kind: %d, type: %d, cmd: %d, pid: %d, fd: %d, " ++ "start: %8"PRIx64", len: %8"PRIx64"\n", ++ args->kind, args->type, flk.l_type, args->pid, args->fd, ++ args->start, args->len); ++ ++ ret = sys_fcntl(args->fd, F_SETLKW, (long)&flk); ++ if (ret < 0) { ++ pr_err("Can not set NEED_REPAIR posix lock!\n"); ++ return ret; ++ } ++ } ++ } ++ ++ return 0; ++} ++ + void parasite_cleanup(void) + { + if (mprotect_args) { +@@ -706,6 +753,9 @@ int parasite_daemon_cmd(int cmd, void *args) + case PARASITE_CMD_DUMP_CGROUP: + ret = parasite_dump_cgroup(args); + break; ++ case PARASITE_CMD_DUMP_FILELOCKS: ++ ret = set_filelocks_needrepair(args); ++ break; + default: + pr_err("Unknown command in parasite daemon thread leader: %d\n", cmd); + ret = -1; +-- +2.34.0 + diff --git a/backport-0022--unlock-network-when-restore-fails.patch b/backport-0022--unlock-network-when-restore-fails.patch new file mode 100644 index 0000000..ffab5f0 --- /dev/null +++ b/backport-0022--unlock-network-when-restore-fails.patch @@ -0,0 +1,60 @@ +From f933b002f39856de26e2a4c3f5d7dbb7e336369a Mon Sep 17 00:00:00 2001 +From: Liu Chao +Date: Fri, 9 Jul 2021 07:32:20 +0000 +Subject: [PATCH 22/49] unlock network when restore fails + +Signed-off-by: Liu Chao +Signed-off-by: fu.lin +--- + criu/cr-restore.c | 14 ++++++++++++++ + 1 file changed, 14 insertions(+) + +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 1cb5ce2..00f16dd 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -107,6 +107,9 @@ + #endif + + struct pstree_item *current; ++#define NETWORK_COLLECTED 0x1 ++#define NETWORK_UNLOCK 0x2 ++static int network_status = 0; + + static int restore_task_with_children(void *); + static int sigreturn_restore(pid_t pid, struct task_restore_args *ta, unsigned long alen, CoreEntry *core); +@@ -240,6 +243,7 @@ static int crtools_prepare_shared(void) + /* Connections 
are unlocked from criu */ + if (!files_collected() && collect_image(&inet_sk_cinfo)) + return -1; ++ network_status |= NETWORK_COLLECTED; + + if (collect_binfmt_misc()) + return -1; +@@ -2381,6 +2385,7 @@ skip_ns_bouncing: + + /* Unlock network before disabling repair mode on sockets */ + network_unlock(); ++ network_status |= NETWORK_UNLOCK; + + /* + * Stop getting sigchld, after we resume the tasks they +@@ -2583,6 +2588,15 @@ int cr_restore_tasks(void) + + err: + cr_plugin_fini(CR_PLUGIN_STAGE__RESTORE, ret); ++ if (ret < 0) { ++ if ((network_status & NETWORK_COLLECTED) == 0) { ++ if (!files_collected() && collect_image(&inet_sk_cinfo)) ++ pr_err("collect inet sk cinfo fail"); ++ } ++ if ((network_status & NETWORK_UNLOCK) == 0) ++ network_unlock(); ++ } ++ + return ret; + } + +-- +2.34.0 + diff --git a/backport-0023--net-add-shared-socket-recover-method-for-criu.patch b/backport-0023--net-add-shared-socket-recover-method-for-criu.patch new file mode 100644 index 0000000..98a8742 --- /dev/null +++ b/backport-0023--net-add-shared-socket-recover-method-for-criu.patch @@ -0,0 +1,330 @@ +From 14538ce083f64852ba71c32c63822f775cf8b4be Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 1 Dec 2021 11:52:16 +0800 +Subject: [PATCH 23/49] net: add shared socket recover method for criu + +When the socket file is shared with another process, +it will not be freed during dumping process. +We can repair the socket file by installing it to +the old fd number. + +Add new options: "--share-dst-ports" and "--share-src-ports" +for user to tell criu which socket ports are shared. 
+ +Signed-off-by: Jingxian He +--- + criu/config.c | 8 ++ + criu/crtools.c | 3 + + criu/files.c | 18 ++++- + criu/include/cr_options.h | 2 + + criu/include/files.h | 4 + + criu/include/net.h | 1 + + criu/include/sk-inet.h | 3 + + criu/sk-inet.c | 151 ++++++++++++++++++++++++++++++++++++++ + 8 files changed, 189 insertions(+), 1 deletion(-) + +diff --git a/criu/config.c b/criu/config.c +index 00a791c..db2ae30 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -517,6 +517,8 @@ int parse_options(int argc, char **argv, bool *usage_error, + { "tls-key", required_argument, 0, 1095}, + BOOL_OPT("tls", &opts.tls), + {"tls-no-cn-verify", no_argument, &opts.tls_no_cn_verify, true}, ++ { "share-dst-ports", required_argument, 0, 1099 }, ++ { "share-src-ports", required_argument, 0, 1100 }, + BOOL_OPT("with-cpu-affinity", &opts.with_cpu_affinity), + BOOL_OPT("pin-memory", &opts.pin_memory), + BOOL_OPT("use-fork-pid", &opts.use_fork_pid), +@@ -824,6 +826,12 @@ int parse_options(int argc, char **argv, bool *usage_error, + case 1095: + SET_CHAR_OPTS(tls_key, optarg); + break; ++ case 1099: ++ SET_CHAR_OPTS(share_dst_ports, optarg); ++ break; ++ case 1100: ++ SET_CHAR_OPTS(share_src_ports, optarg); ++ break; + case 'V': + pr_msg("Version: %s\n", CRIU_VERSION); + if (strcmp(CRIU_GITID, "0")) +diff --git a/criu/crtools.c b/criu/crtools.c +index 4b604a5..6abdc7d 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -80,6 +80,9 @@ int main(int argc, char *argv[], char *envp[]) + if (ret == 2) + goto usage; + ++ if (parse_share_ports()) ++ goto usage; ++ + log_set_loglevel(opts.log_level); + + if (!strcmp(argv[1], "swrk")) { +diff --git a/criu/files.c b/criu/files.c +index 732abd6..b0bfbc4 100644 +--- a/criu/files.c ++++ b/criu/files.c +@@ -699,6 +699,8 @@ int dump_my_file(int lfd, u32 *id, int *type) + return 0; + } + ++int dst_pid; ++ + int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, + struct parasite_drain_fd *dfds) + { +@@ -723,7 +725,7 @@ int 
dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, + img = open_image(CR_FD_FDINFO, O_DUMP, item->ids->files_id); + if (!img) + goto err; +- ++ dst_pid = item->pid->real; + ret = 0; /* Don't fail if nr_fds == 0 */ + for (off = 0; ret == 0 && off < dfds->nr_fds; off += nr_fds) { + if (nr_fds + off > dfds->nr_fds) +@@ -1242,6 +1244,20 @@ static int open_fd(struct fdinfo_list_entry *fle) + goto out; + } + ++ if (d->ops->type == FD_TYPES__INETSK) { ++ if (check_need_repair(d)) { ++ ret = repair_share_socket(d->id); ++ if (!ret) { ++ new_fd = get_share_socket(); ++ pr_info("get share socket:%d\n", new_fd); ++ if (new_fd <= 0 || setup_and_serve_out(fle, new_fd) < 0) ++ return -1; ++ fle->stage = FLE_RESTORED; ++ return 0; ++ } ++ } ++ } ++ + /* + * Open method returns the following values: + * 0 -- restore is successfully finished; +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 6824b27..8bd66bc 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -153,6 +153,8 @@ struct cr_options { + int mask_exit_notify; + int ignore_special_dump; + int file_locks_repair; ++ char *share_dst_ports; ++ char *share_src_ports; + }; + + extern struct cr_options opts; +diff --git a/criu/include/files.h b/criu/include/files.h +index b12d079..85ca617 100644 +--- a/criu/include/files.h ++++ b/criu/include/files.h +@@ -210,4 +210,8 @@ extern int open_transport_socket(void); + extern int set_fds_event(pid_t virt); + extern void wait_fds_event(void); + ++extern int repair_share_socket(int id); ++extern int check_need_repair(struct file_desc *d); ++extern int get_share_socket(void); ++ + #endif /* __CR_FILES_H__ */ +diff --git a/criu/include/net.h b/criu/include/net.h +index 9976f6e..bcb94fb 100644 +--- a/criu/include/net.h ++++ b/criu/include/net.h +@@ -16,6 +16,7 @@ extern int dump_net_ns(struct ns_id *ns); + extern int prepare_net_namespaces(void); + extern void fini_net_namespaces(void); + extern int 
netns_keep_nsfd(void); ++extern int parse_share_ports(void); + + struct pstree_item; + extern int restore_task_net_ns(struct pstree_item *current); +diff --git a/criu/include/sk-inet.h b/criu/include/sk-inet.h +index 7996651..38dc9b2 100644 +--- a/criu/include/sk-inet.h ++++ b/criu/include/sk-inet.h +@@ -102,4 +102,7 @@ union libsoccr_addr; + int restore_sockaddr(union libsoccr_addr *sa, + int family, u32 pb_port, u32 *pb_addr, u32 ifindex); + ++#define MAX_SHARE_PORT_NUM 64 ++extern int dst_pid; ++ + #endif /* __CR_SK_INET_H__ */ +diff --git a/criu/sk-inet.c b/criu/sk-inet.c +index a17fec6..10bcf6e 100644 +--- a/criu/sk-inet.c ++++ b/criu/sk-inet.c +@@ -420,6 +420,152 @@ static bool needs_scope_id(uint32_t *src_addr) + return false; + } + ++#define ADD_SHARE_SOCKET_PATH "/sys/kernel/add_share_socket" ++#define REPAIR_SHARE_SOCKET_PATH "/sys/kernel/repair_share_socket" ++#define SHARE_SOCKET_PATH "/sys/kernel/share_socket" ++ ++int add_share_socket(u32 id, int fd, int pid, int port) ++{ ++ int retval; ++ char buf[256] = {0}; ++ ++ retval = snprintf(buf, 256, "%u,%d,%d,%d", id, fd, pid, port); ++ if (retval <= 0) ++ return -EFAULT; ++ ++ fd = open(ADD_SHARE_SOCKET_PATH, O_WRONLY, 0); ++ if (fd < 0) { ++ pr_err("open file:%s fail\n", ADD_SHARE_SOCKET_PATH); ++ return fd; ++ } ++ ++ retval = write(fd, buf, strlen(buf)); ++ close(fd); ++ return retval < 0 ? -1 : 0; ++} ++ ++ ++int repair_share_socket(int id) ++{ ++ int retval, fd; ++ char buf[256] = {0}; ++ ++ retval = snprintf(buf, 256, "%u", id); ++ if (retval <= 0) ++ return -EFAULT; ++ ++ fd = open(REPAIR_SHARE_SOCKET_PATH, O_WRONLY, 0); ++ if (fd < 0) { ++ pr_err("open file:%s fail\n", REPAIR_SHARE_SOCKET_PATH); ++ return fd; ++ } ++ retval = write(fd, buf, strlen(buf)); ++ ++ close(fd); ++ return retval < 0 ? 
-1 : 0; ++} ++ ++int get_share_socket(void) ++{ ++ int fd; ++ ssize_t count; ++ int retval = -1; ++ char buf[32] = {0}; ++ ++ fd = open(SHARE_SOCKET_PATH, O_RDONLY, 0); ++ if (fd < 0) { ++ pr_err("open file:%s fail\n", SHARE_SOCKET_PATH); ++ return fd; ++ } ++ ++ count = read(fd, buf, sizeof(buf)); ++ if (count > 0) ++ retval = atoi(buf); ++ ++ close(fd); ++ return retval; ++} ++ ++int g_share_dst_ports[MAX_SHARE_PORT_NUM]; ++int g_share_dst_port_num; ++int g_share_src_ports[MAX_SHARE_PORT_NUM]; ++int g_share_src_port_num; ++ ++int parse_share_ports(void) ++{ ++ char *save, *p; ++ ++ if (opts.share_dst_ports) { ++ p = strtok_r(opts.share_dst_ports, ",", &save); ++ while (p != NULL) { ++ if (g_share_dst_port_num >= MAX_SHARE_PORT_NUM) ++ return -1; ++ g_share_dst_ports[g_share_dst_port_num] = atoi(p); ++ if (!g_share_dst_ports[g_share_dst_port_num]) ++ return -1; ++ g_share_dst_port_num++; ++ p = strtok_r(NULL, ",", &save); ++ } ++ } ++ ++ if (opts.share_src_ports) { ++ p = strtok_r(opts.share_src_ports, ",", &save); ++ while (p != NULL) { ++ if (g_share_src_port_num >= MAX_SHARE_PORT_NUM) ++ return -1; ++ g_share_src_ports[g_share_src_port_num] = atoi(p); ++ if (!g_share_src_ports[g_share_src_port_num]) ++ return -1; ++ g_share_src_port_num++; ++ p = strtok_r(NULL, ",", &save); ++ } ++ } ++ return 0; ++} ++ ++int check_share_dst_port(int dst_port) ++{ ++ int i; ++ int ret = 0; ++ ++ for (i = 0; i < g_share_dst_port_num; i++) { ++ if (dst_port == g_share_dst_ports[i]) { ++ ret = 1; ++ break; ++ } ++ } ++ return ret; ++} ++ ++int check_share_src_port(int src_port) ++{ ++ int i; ++ int ret = 0; ++ ++ for (i = 0; i < g_share_src_port_num; i++) { ++ if (src_port == g_share_src_ports[i]) { ++ ret = 1; ++ break; ++ } ++ } ++ ++ return ret; ++} ++ ++int check_need_repair(struct file_desc *d) ++{ ++ struct inet_sk_info *ii; ++ InetSkEntry *ie; ++ ++ ii = container_of(d, struct inet_sk_info, d); ++ ie = ii->ie; ++ if (check_share_dst_port(ie->dst_port) || ++ 
check_share_src_port(ie->src_port)) ++ return 1; ++ else ++ return 0; ++} ++ + static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int family) + { + struct inet_sk_desc *sk; +@@ -478,6 +624,11 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa + + BUG_ON(sk->sd.already_dumped); + ++ if (check_share_dst_port(sk->dst_port) || check_share_src_port(sk->src_port)) { ++ pr_info("Start add share prot:%d src %d\n", sk->dst_port, sk->src_port); ++ add_share_socket(id, lfd, dst_pid, sk->src_port); ++ } ++ + ie.id = id; + ie.ino = sk->sd.ino; + if (sk->sd.sk_ns) { +-- +2.34.0 + diff --git a/backport-0024--clean-repair-res-when-dump-fail.patch b/backport-0024--clean-repair-res-when-dump-fail.patch new file mode 100644 index 0000000..9640754 --- /dev/null +++ b/backport-0024--clean-repair-res-when-dump-fail.patch @@ -0,0 +1,130 @@ +From c56b0ed548a2b4baa632e698e06de5f6d5fb968c Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Mon, 19 Jul 2021 14:43:10 +0800 +Subject: [PATCH 24/49] clean repair res when dump fail + +Clean pin mem and netlink repair res when dump fail. 
+ +Signed-off-by: Jingxian He +--- + criu/cr-dump.c | 22 ++++++++++++++++++++++ + criu/include/net.h | 1 + + criu/sk-netlink.c | 40 +++++++++++++++++++++++++++++++++++----- + 3 files changed, 58 insertions(+), 5 deletions(-) + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index e6608dc..58597ab 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -80,6 +80,7 @@ + #include "fault-injection.h" + #include "dump.h" + #include "eventpoll.h" ++#include "restorer.h" + + /* + * Architectures can overwrite this function to restore register sets that +@@ -1690,6 +1691,23 @@ static int cr_lazy_mem_dump(void) + return ret; + } + ++int clear_pin_mem(int pid) ++{ ++ int fd, ret; ++ ++ fd = open(PIN_MEM_FILE, O_RDWR, 0); ++ if (fd < 0) { ++ pr_warn("error open file: %s\n", PIN_MEM_FILE); ++ return -1; ++ } ++ ret = ioctl(fd, CLEAR_PIN_MEM_AREA, (unsigned long) &pid); ++ if (ret < 0) { ++ pr_warn("clear pin mem fail: %d\n", pid); ++ } ++ close(fd); ++ return ret; ++} ++ + static enum notifier_state notifier_state = NOTHING_COMPLETE; + + static int cr_dump_finish(int ret) +@@ -1777,6 +1795,10 @@ static int cr_dump_finish(int ret) + } + + if (ret != 0 && opts.with_notifier_kup) { ++ pr_info("repair off netlink fd\n"); ++ netlink_repair_off(); ++ pr_info("clear pin mem info\n"); ++ clear_pin_mem(0); + pr_info("call notifier rollback\n"); + switch (notifier_state) { + case PRE_FREEZE_COMPLETE: +diff --git a/criu/include/net.h b/criu/include/net.h +index bcb94fb..a318299 100644 +--- a/criu/include/net.h ++++ b/criu/include/net.h +@@ -54,5 +54,6 @@ extern void check_has_netns_ioc(int fd, bool *kdat_val, const char *name); + extern int net_set_ext(struct ns_id *ns); + extern struct ns_id *get_root_netns(); + extern int read_net_ns_img(); ++extern int netlink_repair_off(void); + + #endif /* __CR_NET_H__ */ +diff --git a/criu/sk-netlink.c b/criu/sk-netlink.c +index 28ee892..a6b6bec 100644 +--- a/criu/sk-netlink.c ++++ b/criu/sk-netlink.c +@@ -68,15 +68,45 @@ int 
netlink_receive_one(struct nlmsghdr *hdr, struct ns_id *ns, void *arg) + return sk_collect_one(m->ndiag_ino, PF_NETLINK, &sd->sd, ns); + } + ++struct netlink_repair_fd { ++ int netlink_fd; ++ struct list_head nlist; ++}; ++ ++static LIST_HEAD(netlink_repair_fds); ++ + static int netlink_repair_on(int fd) + { +- int ret, aux = 1; ++ int ret, aux = 1; ++ struct netlink_repair_fd *nrf; + +- ret = setsockopt(fd, SOL_NETLINK, TCP_REPAIR, &aux, sizeof(aux)); +- if (ret < 0) +- pr_err("Can't turn netlink repair mode ON, error: %d\n", ret); ++ ret = setsockopt(fd, SOL_NETLINK, TCP_REPAIR, &aux, sizeof(aux)); ++ if (ret < 0) { ++ pr_err("Can't turn netlink repair mode ON, error: %d\n", ret); ++ return ret; ++ } ++ nrf = malloc(sizeof(*nrf)); ++ if (!nrf) ++ return -ENOMEM; ++ nrf->netlink_fd = dup(fd); ++ list_add_tail(&nrf->nlist, &netlink_repair_fds); ++ return ret; ++} + +- return ret; ++int netlink_repair_off(void) ++{ ++ int aux = 0, ret; ++ struct netlink_repair_fd *nrf, *n; ++ ++ list_for_each_entry_safe(nrf, n, &netlink_repair_fds, nlist) { ++ ret = setsockopt(nrf->netlink_fd, SOL_NETLINK, TCP_REPAIR, &aux, sizeof(aux)); ++ if (ret < 0) ++ pr_err("Failed to turn off repair mode on netlink\n"); ++ close(nrf->netlink_fd); ++ list_del(&nrf->nlist); ++ free(nrf); ++ } ++ return 0; + } + + static bool can_dump_netlink_sk(int lfd) +-- +2.34.0 + diff --git a/backport-0025--save-src-ports-to-ip_local_reserved_ports-when-dump-.patch b/backport-0025--save-src-ports-to-ip_local_reserved_ports-when-dump-.patch new file mode 100644 index 0000000..97eff46 --- /dev/null +++ b/backport-0025--save-src-ports-to-ip_local_reserved_ports-when-dump-.patch @@ -0,0 +1,248 @@ +From 3fa6a1e0ead93167e384835e2eac2d75e4397181 Mon Sep 17 00:00:00 2001 +From: Liu Chao +Date: Mon, 19 Jul 2021 03:19:30 +0000 +Subject: [PATCH 25/49] save src ports to ip_local_reserved_ports when dump + tasks and retore it when restore tasks + +--- + criu/config.c | 8 +++- + criu/cr-dump.c | 3 ++ + criu/crtools.c | 1 
+ + criu/include/cr_options.h | 1 + + criu/include/sk-inet.h | 4 ++ + criu/include/util.h | 2 + + criu/net.c | 6 ++- + criu/sk-tcp.c | 85 +++++++++++++++++++++++++++++++++++++++ + 8 files changed, 108 insertions(+), 2 deletions(-) + +diff --git a/criu/config.c b/criu/config.c +index db2ae30..f02991d 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -441,7 +441,7 @@ int parse_options(int argc, char **argv, bool *usage_error, + {OPT_NAME, no_argument, SAVE_TO, true},\ + {"no-" OPT_NAME, no_argument, SAVE_TO, false} + +- static const char short_opts[] = "dSsRt:hD:o:v::x::Vr:jJ:lW:L:M:"; ++ static const char short_opts[] = "dSsRt:hD:o:v::x::Vr:jJ:lW:L:M:P:"; + static struct option long_opts[] = { + { "tree", required_argument, 0, 't' }, + { "leave-stopped", no_argument, 0, 's' }, +@@ -528,6 +528,7 @@ int parse_options(int argc, char **argv, bool *usage_error, + BOOL_OPT("mask-exit-notify", &opts.mask_exit_notify), + BOOL_OPT("ignore-special-dump", &opts.ignore_special_dump), + BOOL_OPT("file-locks-repair", &opts.file_locks_repair), ++ {"reserve-ports", required_argument, 0, 'P' }, + { }, + }; + +@@ -840,6 +841,11 @@ int parse_options(int argc, char **argv, bool *usage_error, + case 'h': + *usage_error = false; + return 2; ++ case 'P': ++ opts.reserve_ports = atoi(optarg); ++ if (opts.reserve_ports < 0) ++ goto bad_arg; ++ break; + default: + return 2; + } +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index 58597ab..26fcf7c 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -1938,6 +1938,9 @@ int cr_dump_tasks(pid_t pid) + goto err; + } + ++ if (opts.reserve_ports > 0) ++ set_reserved_ports(); ++ + if (parent_ie) { + inventory_entry__free_unpacked(parent_ie, NULL); + parent_ie = NULL; +diff --git a/criu/crtools.c b/criu/crtools.c +index 6abdc7d..d437f35 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -404,6 +404,7 @@ usage: + " --mask-exit-notify Mask task exit notify during dump and restore\n" + " --ignore-special-dump Ignore special task tid page 
dump\n" + " --file-locks-repair Use repair mode to dump and restore file locks\n" ++" --reserve-ports Reserve src ports in kernel\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 8bd66bc..055c062 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -155,6 +155,7 @@ struct cr_options { + int file_locks_repair; + char *share_dst_ports; + char *share_src_ports; ++ int reserve_ports; + }; + + extern struct cr_options opts; +diff --git a/criu/include/sk-inet.h b/criu/include/sk-inet.h +index 38dc9b2..2ee46ea 100644 +--- a/criu/include/sk-inet.h ++++ b/criu/include/sk-inet.h +@@ -83,6 +83,10 @@ extern void tcp_locked_conn_add(struct inet_sk_info *); + extern void rst_unlock_tcp_connections(void); + extern void cpt_unlock_tcp_connections(void); + ++extern void read_reserved_ports(char *path); ++extern void write_reserved_ports(char *path); ++extern void set_reserved_ports(void); ++ + extern int dump_one_tcp(int sk, struct inet_sk_desc *sd); + extern int restore_one_tcp(int sk, struct inet_sk_info *si); + +diff --git a/criu/include/util.h b/criu/include/util.h +index 38aa214..6b652b0 100644 +--- a/criu/include/util.h ++++ b/criu/include/util.h +@@ -430,4 +430,6 @@ bool find_devname(const char *name); + #define UNMASK_EXIT_NOTIFY_DIR "/sys/kernel/unmask_exit_notify" + int mask_task_exit_notify(int pid, bool mask); + ++#define RESERVED_PORTS_PATH "/proc/sys/net/ipv4/ip_local_reserved_ports" ++ + #endif /* __CR_UTIL_H__ */ +diff --git a/criu/net.c b/criu/net.c +index 44b0ce2..6ca5ef5 100644 +--- a/criu/net.c ++++ b/criu/net.c +@@ -2634,7 +2634,6 @@ static int network_unlock_internal() + if (switch_ns(root_item->pid->real, &net_ns_desc, &nsret)) + return -1; + +- + ret |= iptables_restore(false, conf, sizeof(conf) - 1); + if (kdat.ipv6) + ret |= iptables_restore(true, conf, sizeof(conf) - 1); +@@ -2663,6 +2662,11 @@ 
void network_unlock(void) + { + pr_info("Unlock network\n"); + ++ if (opts.reserve_ports) { ++ read_reserved_ports("ip_local_reserved_ports"); ++ write_reserved_ports(RESERVED_PORTS_PATH); ++ } ++ + cpt_unlock_tcp_connections(); + rst_unlock_tcp_connections(); + +diff --git a/criu/sk-tcp.c b/criu/sk-tcp.c +index 4fd2eb8..c58e3f6 100644 +--- a/criu/sk-tcp.c ++++ b/criu/sk-tcp.c +@@ -23,6 +23,7 @@ + #include "kerndat.h" + #include "restorer.h" + #include "rst-malloc.h" ++#include "xmalloc.h" + + #include "protobuf.h" + #include "images/tcp-stream.pb-c.h" +@@ -33,6 +34,9 @@ + static LIST_HEAD(cpt_tcp_repair_sockets); + static LIST_HEAD(rst_tcp_repair_sockets); + ++static char* reserved_ports; ++static int reserved_ports_num; ++ + static int tcp_repair_established(int fd, struct inet_sk_desc *sk) + { + int ret; +@@ -446,3 +450,84 @@ void rst_unlock_tcp_connections(void) + list_for_each_entry(ii, &rst_tcp_repair_sockets, rlist) + nf_unlock_connection_info(ii); + } ++ ++void read_reserved_ports(char *path) ++{ ++ FILE *file = NULL; ++ char *ch = NULL; ++ size_t size = 0; ++ ++ if (reserved_ports) { ++ free(reserved_ports); ++ reserved_ports = NULL; ++ } ++ ++ file = fopen(path, "r"); ++ if (!file) { ++ pr_err("Cannot fopen %s\n", path); ++ return; ++ } ++ ++ if (getline(&reserved_ports, &size, file) <= 0) ++ pr_err("Cannot getline from %s\n", path); ++ fclose(file); ++ ++ if (!reserved_ports) ++ return; ++ ++ ch = strstr(reserved_ports, "\n"); ++ if (ch) ++ *ch = '\0'; ++} ++ ++void write_reserved_ports(char *path) ++{ ++ int fd = -1; ++ char buf[PATH_MAX]; ++ ++ fd = open(path, O_RDWR | O_CREAT, 0640); ++ if (fd < 0) { ++ pr_err("Cannot open %s ret %d cwd: %s\n", path, fd, buf); ++ return; ++ } ++ ++ cr_system(-1, fd, -1, "/usr/bin/echo", ++ (char *[]) { "echo", reserved_ports, NULL}, 0); ++ close(fd); ++} ++ ++static int add_reserved_ports(struct inet_sk_desc *sk) ++{ ++ if (reserved_ports_num >= opts.reserve_ports) ++ return -1; ++ ++ if (strlen(reserved_ports) == 0) 
++ snprintf(reserved_ports, 6, "%u", sk->src_port); ++ else ++ snprintf(reserved_ports + strlen(reserved_ports), 7, ",%u", sk->src_port); ++ reserved_ports_num++; ++ ++ return 0; ++} ++ ++void set_reserved_ports(void) ++{ ++ struct inet_sk_desc *sk = NULL; ++ size_t size = 0; ++ ++ read_reserved_ports(RESERVED_PORTS_PATH); ++ ++ write_reserved_ports("ip_local_reserved_ports"); ++ ++ size = strlen(reserved_ports) + 6 * opts.reserve_ports + 1; ++ if (xrealloc_safe(&reserved_ports, size)) ++ exit(1); ++ ++ list_for_each_entry(sk, &cpt_tcp_repair_sockets, rlist) ++ add_reserved_ports(sk); ++ ++ write_reserved_ports(RESERVED_PORTS_PATH); ++ ++ free(reserved_ports); ++ reserved_ports = NULL; ++} +-- +2.34.0 + diff --git a/backport-0026--fix-dump-fail-problem-with-null-seek-op.patch b/backport-0026--fix-dump-fail-problem-with-null-seek-op.patch new file mode 100644 index 0000000..e1b8d0a --- /dev/null +++ b/backport-0026--fix-dump-fail-problem-with-null-seek-op.patch @@ -0,0 +1,35 @@ +From 5961c45207b20e81e610b6424193f3f1f0137be4 Mon Sep 17 00:00:00 2001 +From: Zhuling +Date: Wed, 1 Dec 2021 14:15:11 +0800 +Subject: [PATCH 26/49] fix dump fail problem with null seek op + +Fix file dumping fail problem when the file seek op is null. 
+ +Signed-off-by: Jingxian He +--- + criu/files-reg.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +diff --git a/criu/files-reg.c b/criu/files-reg.c +index 36fd9f9..65912fe 100644 +--- a/criu/files-reg.c ++++ b/criu/files-reg.c +@@ -2150,9 +2150,12 @@ static int do_open_reg(int ns_root_fd, struct reg_file_info *rfi, void *arg) + + if ((rfi->rfe->pos != -1ULL) && + lseek(fd, rfi->rfe->pos, SEEK_SET) < 0) { +- pr_perror("Can't restore file pos"); +- close(fd); +- return -1; ++ pr_info("No ability to restore file pos"); ++ if (errno != ESPIPE) { ++ pr_perror("Can't restore file pos"); ++ close(fd); ++ return -1; ++ } + } + + return fd; +-- +2.34.0 + diff --git a/backport-0027--fix-dump-fail-problem-with-no-access-to-get-socket-f.patch b/backport-0027--fix-dump-fail-problem-with-no-access-to-get-socket-f.patch new file mode 100644 index 0000000..46a974a --- /dev/null +++ b/backport-0027--fix-dump-fail-problem-with-no-access-to-get-socket-f.patch @@ -0,0 +1,31 @@ +From 6eecf7ce1e6ac4a70a93eb3fc3a5f930a5c23876 Mon Sep 17 00:00:00 2001 +From: Zhuling +Date: Sat, 24 Jul 2021 16:37:17 +0800 +Subject: [PATCH 27/49] fix dump fail problem with no access to get socket + filter + +Fix the socket dump failure that occurs when user space has no permission to get the socket filter.
+ +Signed-off-by: Jingxian He +--- + criu/sockets.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/criu/sockets.c b/criu/sockets.c +index 4a2df60..8591474 100644 +--- a/criu/sockets.c ++++ b/criu/sockets.c +@@ -372,7 +372,9 @@ static int dump_socket_filter(int sk, SkOptsEntry *soe) + + ret = getsockopt(sk, SOL_SOCKET, SO_GET_FILTER, NULL, &len); + if (ret) { +- pr_perror("Can't get socket filter len"); ++ pr_warn("Can't get socket filter len"); ++ if (errno == EACCES) ++ return 0; + return ret; + } + +-- +2.34.0 + diff --git a/backport-0028--proc-parse-fix-vma-offset-value-for-the-sysfs-file-o.patch b/backport-0028--proc-parse-fix-vma-offset-value-for-the-sysfs-file-o.patch new file mode 100644 index 0000000..7a4dfbb --- /dev/null +++ b/backport-0028--proc-parse-fix-vma-offset-value-for-the-sysfs-file-o.patch @@ -0,0 +1,131 @@ +From d3647a91436fdcab732fdb57971eb496d8eebcb3 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Tue, 27 Jul 2021 11:40:34 +0800 +Subject: [PATCH 28/49] proc parse: fix vma offset value for the sysfs file of + pci devices + +Some pci devices create binary sysfs files which permit the `mmap()` +syscall; the 6th parameter `offset` is always 0 when those kinds of +files create a file mapping. The value of `offset` is assigned to +`vma->vm_pgoff` in the kernel. However, it is changed to the pci address +automatically by the mmap callback function `pci_mmap_resource_range()`, +and the offset in `/proc/<pid>/maps` shows a non-zero value. This makes +criu restore fail. + +There are many such files, so just retry the mmap action with offset 0.
+ +Signed-off-by: He Jingxian +Signed-off-by: fu.lin +--- + criu/include/image.h | 1 + + criu/pie/restorer.c | 16 +++++++++++++--- + criu/proc_parse.c | 32 ++++++++++++++++++++++++++++++++ + 3 files changed, 46 insertions(+), 3 deletions(-) + +diff --git a/criu/include/image.h b/criu/include/image.h +index e9257e4..ebff4d7 100644 +--- a/criu/include/image.h ++++ b/criu/include/image.h +@@ -85,6 +85,7 @@ + #define VMA_AREA_AIORING (1 << 13) + #define VMA_AREA_ANON_INODE (1 << 15) + #define VMA_AREA_CHR (1 << 16) ++#define VMA_AREA_DEV_SHARE (1 << 17) + + #define VMA_CLOSE (1 << 28) + #define VMA_NO_PROT_WRITE (1 << 29) +diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c +index f7eb4a3..5548076 100644 +--- a/criu/pie/restorer.c ++++ b/criu/pie/restorer.c +@@ -871,9 +871,9 @@ static unsigned long restore_mapping(VmaEntry *vma_entry) + * that mechanism as it causes the process to be charged for memory + * immediately upon mmap, not later upon preadv(). + */ +- pr_debug("\tmmap(%"PRIx64" -> %"PRIx64", %x %x %d)\n", ++ pr_debug("\tmmap(%"PRIx64" -> %"PRIx64", %x %x %d %lx)\n", + vma_entry->start, vma_entry->end, +- prot, flags, (int)vma_entry->fd); ++ prot, flags, (int)vma_entry->fd, vma_entry->pgoff); + /* + * Should map memory here. 
Note we map them as + * writable since we're going to restore page +@@ -885,6 +885,15 @@ static unsigned long restore_mapping(VmaEntry *vma_entry) + vma_entry->fd, + vma_entry->pgoff); + ++ if (addr == -EINVAL) { ++ pr_info("need try mmap with offset 0\n"); ++ addr = sys_mmap(decode_pointer(vma_entry->start), ++ vma_entry_len(vma_entry), ++ prot, flags, ++ vma_entry->fd, ++ 0); ++ } ++ + if ((vma_entry->fd != -1) && + (vma_entry->status & VMA_CLOSE)) + sys_close(vma_entry->fd); +@@ -1880,7 +1889,8 @@ long __export_restore_task(struct task_restore_args *args) + if (!vma_entry->has_madv || !vma_entry->madv) + continue; + +- if (vma_entry_is(vma_entry, VMA_AREA_ANON_INODE)) ++ if (vma_entry_is(vma_entry, VMA_AREA_ANON_INODE) || ++ vma_entry_is(vma_entry, VMA_AREA_DEV_SHARE)) + continue; + + for (m = 0; m < sizeof(vma_entry->madv) * 8; m++) { +diff --git a/criu/proc_parse.c b/criu/proc_parse.c +index 117b696..d04a8ff 100644 +--- a/criu/proc_parse.c ++++ b/criu/proc_parse.c +@@ -544,6 +544,35 @@ static inline int handle_vvar_vma(struct vma_area *vma) + return 0; + } + ++static bool is_sysfs_resource(const char *path) ++{ ++ char *sub = NULL; ++ const char *prefix = "resource"; ++ const char *suffix = "_wc"; ++ ++ if (strstr(path, "devices/") == NULL) ++ return false; ++ ++ sub = rindex(path, '/'); ++ if (sub == NULL) ++ return false; ++ ++ sub += 1; ++ if (strncmp(sub, prefix, strlen(prefix)) != 0) ++ return false; ++ ++ sub += strlen(prefix); ++ while (*sub != '\0' && (*sub >= '0' && *sub <= '9')) ++ sub += 1; ++ ++ if (*sub == '\0') ++ return true; ++ if (!strcmp(sub, suffix)) ++ return true; ++ else ++ return false; ++} ++ + static int handle_vma(pid_t pid, struct vma_area *vma_area, + const char *file_path, DIR *map_files_dir, + struct vma_file_info *vfi, +@@ -568,6 +597,9 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, + goto err; + } else if (!strcmp(file_path, "[heap]")) { + vma_area->e->status |= VMA_AREA_REGULAR | VMA_AREA_HEAP; ++ } else if 
(is_sysfs_resource(file_path)) { ++ pr_info("find sys device module share memory\n"); ++ vma_area->e->status |= VMA_AREA_REGULAR | VMA_AREA_DEV_SHARE; + } else { + vma_area->e->status = VMA_AREA_REGULAR; + } +-- +2.34.0 + diff --git a/backport-0029--looser-file-mode-and-size-check.patch b/backport-0029--looser-file-mode-and-size-check.patch new file mode 100644 index 0000000..1d9c4ef --- /dev/null +++ b/backport-0029--looser-file-mode-and-size-check.patch @@ -0,0 +1,68 @@ +From 58d4a009fa4e86686427207df6f38a67bc2dab08 Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Sat, 26 Jun 2021 11:41:18 +0800 +Subject: [PATCH 29/49] looser file mode and size check + +When the file mode and size larger than dump data, +make the restoring process run success. + +Signed-off-by: Jingxian He +--- + criu/config.c | 1 + + criu/files-reg.c | 8 +++++--- + criu/include/cr_options.h | 1 + + 3 files changed, 7 insertions(+), 3 deletions(-) + +diff --git a/criu/config.c b/criu/config.c +index f02991d..7397a3c 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -526,6 +526,7 @@ int parse_options(int argc, char **argv, bool *usage_error, + BOOL_OPT("with-fd-cred", &opts.with_fd_cred), + BOOL_OPT("dump-char-dev", &opts.dump_char_dev), + BOOL_OPT("mask-exit-notify", &opts.mask_exit_notify), ++ BOOL_OPT("weak-file-check", &opts.weak_file_check), + BOOL_OPT("ignore-special-dump", &opts.ignore_special_dump), + BOOL_OPT("file-locks-repair", &opts.file_locks_repair), + {"reserve-ports", required_argument, 0, 'P' }, +diff --git a/criu/files-reg.c b/criu/files-reg.c +index 65912fe..f2f4f3e 100644 +--- a/criu/files-reg.c ++++ b/criu/files-reg.c +@@ -1972,7 +1972,8 @@ static bool validate_file(const int fd, const struct stat *fd_status, + { + int result = 1; + +- if (rfi->rfe->has_size && (fd_status->st_size != rfi->rfe->size)) { ++ if (rfi->rfe->has_size && ((!opts.weak_file_check && fd_status->st_size != rfi->rfe->size) || ++ (fd_status->st_size < rfi->rfe->size))) { + pr_err("File %s has bad size 
%"PRIu64" (expect %"PRIu64")\n", + rfi->path, fd_status->st_size, rfi->rfe->size); + return false; +@@ -2086,8 +2087,9 @@ ext: + if (!validate_file(tmp, &st, rfi)) + return -1; + +- if (rfi->rfe->has_mode && (st.st_mode != rfi->rfe->mode)) { +- pr_err("File %s has bad mode 0%o (expect 0%o)\n", ++ if (rfi->rfe->has_mode && ((!opts.weak_file_check && st.st_mode != rfi->rfe->mode) || ++ (st.st_mode < rfi->rfe->mode))) { ++ pr_err("%d File %s has bad mode 0%o (expect 0%o)\n", opts.weak_file_check, + rfi->path, (int)st.st_mode, + rfi->rfe->mode); + return -1; +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 055c062..dce5832 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -151,6 +151,7 @@ struct cr_options { + int with_fd_cred; + int dump_char_dev; + int mask_exit_notify; ++ int weak_file_check; + int ignore_special_dump; + int file_locks_repair; + char *share_dst_ports; +-- +2.34.0 + diff --git a/backport-0030--add-reuse-file-method-for-recover-deleted-file-state.patch b/backport-0030--add-reuse-file-method-for-recover-deleted-file-state.patch new file mode 100644 index 0000000..a3c463f --- /dev/null +++ b/backport-0030--add-reuse-file-method-for-recover-deleted-file-state.patch @@ -0,0 +1,205 @@ +From c77dc7c79d7d08f69ecb9506c478b812f2ac5fc9 Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Sat, 14 Aug 2021 16:45:40 +0800 +Subject: [PATCH 30/49] add reuse file method for recover deleted file state + +Add reuse file method for recover file state of deleted files. 
+ +Signed-off-by: Jingxian He +--- + criu/files-reg.c | 75 +++++++++++++++++++++++++++++++++++++--- + criu/files.c | 24 ++++++++++--- + criu/include/files-reg.h | 9 +++++ + 3 files changed, 99 insertions(+), 9 deletions(-) + +diff --git a/criu/files-reg.c b/criu/files-reg.c +index f2f4f3e..54fa388 100644 +--- a/criu/files-reg.c ++++ b/criu/files-reg.c +@@ -1113,6 +1113,70 @@ int strip_deleted(struct fd_link *link) + return 0; + } + ++int add_reuse_file(u32 id, int fd, int pid) ++{ ++ int retval; ++ char buf[256] = {0}; ++ ++ retval = snprintf(buf, 256, "%u,%d,%d", id, fd, pid); ++ if (retval <= 0) ++ return -EFAULT; ++ ++ fd = open(ADD_REUSE_FILE_PATH, O_WRONLY, 0); ++ if (fd < 0) { ++ pr_err("open file:%s fail\n", ADD_REUSE_FILE_PATH); ++ return fd; ++ } ++ ++ retval = write(fd, buf, strlen(buf)); ++ close(fd); ++ return retval < 0 ? -1 : 0; ++} ++ ++ ++int repair_reuse_file(int id) ++{ ++ int retval, fd; ++ char buf[256] = {0}; ++ ++ retval = snprintf(buf, 256, "%u", id); ++ if (retval <= 0) ++ return -EFAULT; ++ ++ fd = open(REPAIR_REUSE_FILE_PATH, O_WRONLY, 0); ++ if (fd < 0) { ++ pr_err("open file:%s fail\n", REPAIR_REUSE_FILE_PATH); ++ return fd; ++ } ++ retval = write(fd, buf, strlen(buf)); ++ ++ close(fd); ++ return retval < 0 ? -1 : 0; ++} ++ ++int get_reuse_file(void) ++{ ++ int fd; ++ ssize_t count; ++ int retval = -1; ++ char buf[32] = {0}; ++ ++ fd = open(REUSE_FILE_PATH, O_RDONLY , 0); ++ if (fd < 0) { ++ pr_err("open file:%s fail\n", REUSE_FILE_PATH); ++ return fd; ++ } ++ ++ count = read(fd, buf, sizeof(buf)); ++ if (count > 0) ++ retval = atoi(buf); ++ ++ close(fd); ++ return retval; ++} ++ ++extern int dst_pid; ++extern int need_reuse_flag; + static int check_path_remap(struct fd_link *link, const struct fd_parms *parms, + int lfd, u32 id, struct ns_id *nsid) + { +@@ -1226,9 +1290,12 @@ static int check_path_remap(struct fd_link *link, const struct fd_parms *parms, + * name. 
+ */ + +- if (errno == ENOENT) +- return dump_linked_remap(rpath + 1, plen - 1, +- ost, lfd, id, nsid); ++ if (errno == ENOENT) { ++ pr_info("start add no exist file:%s\n", rpath + 1); ++ add_reuse_file(id, lfd, dst_pid); ++ need_reuse_flag = O_REUSE; ++ return 0; ++ } + + pr_perror("Can't stat path"); + return -1; +@@ -1636,7 +1703,7 @@ ext: + rfe.mode = p->stat.st_mode; + + if (S_ISREG(p->stat.st_mode) && should_check_size(rfe.flags) && +- !store_validation_data(&rfe, p, lfd)) ++ (need_reuse_flag != O_REUSE) && !store_validation_data(&rfe, p, lfd)) + return -1; + + fe.type = FD_TYPES__REG; +diff --git a/criu/files.c b/criu/files.c +index b0bfbc4..4d3db0d 100644 +--- a/criu/files.c ++++ b/criu/files.c +@@ -700,7 +700,7 @@ int dump_my_file(int lfd, u32 *id, int *type) + } + + int dst_pid; +- ++int need_reuse_flag; + int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, + struct parasite_drain_fd *dfds) + { +@@ -738,7 +738,7 @@ int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, + + for (i = 0; i < nr_fds; i++) { + FdinfoEntry e = FDINFO_ENTRY__INIT; +- ++ need_reuse_flag = 0; + ret = dump_one_file(item->pid, dfds->fds[i + off], + lfds[i], opts + i, ctl, &e, dfds); + if (ret < 0) +@@ -748,7 +748,7 @@ int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, + ret = 0; + continue; + } +- ++ e.flags |= need_reuse_flag; + pr_info("write fdinfoEntry fd=%d id=%d\n", (&e)->fd, (&e)->id); + ret = pb_write_one(img, &e, PB_FDINFO); + if (ret) +@@ -944,8 +944,8 @@ int collect_fd(int pid, FdinfoEntry *e, struct rst_info *rst_info, bool fake) + { + struct file_desc *fdesc; + +- pr_info("Collect fdinfo pid=%d fd=%d id=%#x\n", +- pid, e->fd, e->id); ++ pr_info("Collect fdinfo pid=%d fd=%d id=%#x flags:%x\n", ++ pid, e->fd, e->id, e->flags); + + fdesc = find_file_desc(e); + if (fdesc == NULL) { +@@ -1235,6 +1235,7 @@ static int open_fd(struct fdinfo_list_entry *fle) + struct fdinfo_list_entry *flem; + int 
new_fd = -1, ret; + ++ pr_info("open file flags:%x\n", fle->fe->flags); + flem = file_master(d); + if (fle != flem) { + BUG_ON (fle->stage != FLE_INITIALIZED); +@@ -1256,6 +1257,19 @@ static int open_fd(struct fdinfo_list_entry *fle) + return 0; + } + } ++ } else if (fle->fe->flags & O_REUSE) { ++ pr_info("find reuse file:%d\n", d->id); ++ ret = repair_reuse_file(d->id); ++ if (!ret) { ++ new_fd = get_reuse_file(); ++ pr_info("get reuse file:%d\n", new_fd); ++ if (new_fd <= 0 || setup_and_serve_out(fle, new_fd) < 0) { ++ pr_err("setup reuse file fail\n"); ++ return -1; ++ } ++ fle->stage = FLE_RESTORED; ++ return 0; ++ } + } + + /* +diff --git a/criu/include/files-reg.h b/criu/include/files-reg.h +index d9f0638..3e8b93b 100644 +--- a/criu/include/files-reg.h ++++ b/criu/include/files-reg.h +@@ -65,4 +65,13 @@ extern int strip_deleted(struct fd_link *link); + + extern int dead_pid_conflict(void); + ++#define ADD_REUSE_FILE_PATH "/sys/kernel/add_reuse_file" ++#define REPAIR_REUSE_FILE_PATH "/sys/kernel/repair_reuse_file" ++#define REUSE_FILE_PATH "/sys/kernel/reuse_file" ++#define O_REUSE 0100000000 ++ ++extern int add_reuse_file(u32 id, int fd, int pid); ++extern int repair_reuse_file(int id); ++extern int get_reuse_file(void); ++ + #endif /* __CR_FILES_REG_H__ */ +-- +2.34.0 + diff --git a/backport-0031--fix-share-sockets-repair-problem.patch b/backport-0031--fix-share-sockets-repair-problem.patch new file mode 100644 index 0000000..ac271cc --- /dev/null +++ b/backport-0031--fix-share-sockets-repair-problem.patch @@ -0,0 +1,132 @@ +From 8da37b13f7437bfaf97c135e610270f04083572b Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 11 Aug 2021 15:01:27 +0800 +Subject: [PATCH 31/49] fix share sockets repair problem + +Repair off the share sockets after reusing them +to recover the share socket state. 
+ +Signed-off-by: Jingxian He +--- + criu/files.c | 34 ++++++++++++++++++++++++++++++++-- + criu/sk-inet.c | 6 ++++-- + criu/sk-netlink.c | 5 +++-- + 3 files changed, 39 insertions(+), 6 deletions(-) + +diff --git a/criu/files.c b/criu/files.c +index 4d3db0d..c9e5d8b 100644 +--- a/criu/files.c ++++ b/criu/files.c +@@ -51,7 +51,7 @@ + #include "util.h" + #include "images/fs.pb-c.h" + #include "images/ext-file.pb-c.h" +- ++#include "sk-inet.h" + #include "plugin.h" + + #define FDESC_HASH_SIZE 64 +@@ -1215,7 +1215,7 @@ int setup_and_serve_out(struct fdinfo_list_entry *fle, int new_fd) + if (reopen_fd_as(fle->fe->fd, new_fd)) + return -1; + +- pr_info("*******flags: %d",fle->fe->flags); ++ pr_info("*******flags: %d\n",fle->fe->flags); + if (fcntl(fle->fe->fd, F_SETFD, fle->fe->flags) == -1) { + pr_perror("Unable to set file descriptor flags"); + return -1; +@@ -1229,6 +1229,30 @@ int setup_and_serve_out(struct fdinfo_list_entry *fle, int new_fd) + return 0; + } + ++#define MAX_SHARE_SOCKETS_NUM 1024 ++int repair_share_sockets[MAX_SHARE_SOCKETS_NUM]; ++int repair_share_num; ++ ++int add_repair_share_socket(int fd) ++{ ++ if (repair_share_num >= MAX_SHARE_SOCKETS_NUM) ++ return -1; ++ repair_share_sockets[repair_share_num] = fd; ++ repair_share_num++; ++ return 0; ++} ++ ++void repair_off_share_sockets(void) ++{ ++ int i; ++ ++ for (i = 0; i < repair_share_num; i++) { ++ tcp_repair_off(repair_share_sockets[i]); ++ pr_info("repair off socket:%d\n", repair_share_sockets[i]); ++ } ++ repair_share_num = 0; ++} ++ + static int open_fd(struct fdinfo_list_entry *fle) + { + struct file_desc *d = fle->desc; +@@ -1247,6 +1271,7 @@ static int open_fd(struct fdinfo_list_entry *fle) + + if (d->ops->type == FD_TYPES__INETSK) { + if (check_need_repair(d)) { ++ pr_info("start repair for:%d\n", d->id); + ret = repair_share_socket(d->id); + if (!ret) { + new_fd = get_share_socket(); +@@ -1254,6 +1279,10 @@ static int open_fd(struct fdinfo_list_entry *fle) + if (new_fd <= 0 || 
setup_and_serve_out(fle, new_fd) < 0) + return -1; + fle->stage = FLE_RESTORED; ++ if (add_repair_share_socket(fle->fe->fd)) { ++ pr_perror("add repair share socket fail\n"); ++ return -1; ++ } + return 0; + } + } +@@ -1368,6 +1397,7 @@ static int open_fdinfos(struct pstree_item *me) + wait_fds_event(); + } while (again || progress); + ++ repair_off_share_sockets(); + BUG_ON(!list_empty(list)); + /* + * Fake fles may be used for restore other +diff --git a/criu/sk-inet.c b/criu/sk-inet.c +index 10bcf6e..cdd8969 100644 +--- a/criu/sk-inet.c ++++ b/criu/sk-inet.c +@@ -625,8 +625,10 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa + BUG_ON(sk->sd.already_dumped); + + if (check_share_dst_port(sk->dst_port) || check_share_src_port(sk->src_port)) { +- pr_info("Start add share prot:%d src %d\n", sk->dst_port, sk->src_port); +- add_share_socket(id, lfd, dst_pid, sk->src_port); ++ pr_info("Start add share port:%d-%d, dst_pid:%d id:%d\n", sk->dst_port, sk->src_port, dst_pid, id); ++ ret = add_share_socket(id, lfd, dst_pid, sk->src_port); ++ if (ret) ++ pr_warn("add share socket ret:%d\n", ret); + } + + ie.id = id; +diff --git a/criu/sk-netlink.c b/criu/sk-netlink.c +index a6b6bec..2d58574 100644 +--- a/criu/sk-netlink.c ++++ b/criu/sk-netlink.c +@@ -115,9 +115,10 @@ static bool can_dump_netlink_sk(int lfd) + + ret = fd_has_data(lfd); + if (ret == 1) +- pr_err("The socket has data to read\n"); ++ pr_warn("The socket has data to read\n"); + +- return ret == 0; ++ /* ignore netlink socket data */ ++ return true; + } + + static int dump_one_netlink_fd(int lfd, u32 id, const struct fd_parms *p) +-- +2.34.0 + diff --git a/backport-0032--nftables-add-mnl-api.patch b/backport-0032--nftables-add-mnl-api.patch new file mode 100644 index 0000000..a21e350 --- /dev/null +++ b/backport-0032--nftables-add-mnl-api.patch @@ -0,0 +1,271 @@ +From 1e69468d31eb13ef1a4db78e1b188bea63e7b8bc Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Wed, 11 Aug 2021 16:50:49 
+0800 +Subject: [PATCH 32/49] nftables: add mnl api + +libmnl provides the communication between userspace and kernelspace for +netfilter netlink. I abstract here for the next usage. + +Signed-off-by: fu.lin +--- + criu/Makefile.crtools | 1 + + criu/Makefile.packages | 8 ++ + criu/include/nftables.h | 28 +++++++ + criu/mnl.c | 165 ++++++++++++++++++++++++++++++++++++++++ + 4 files changed, 202 insertions(+) + create mode 100644 criu/include/nftables.h + create mode 100644 criu/mnl.c + +diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools +index c093a59..4548768 100644 +--- a/criu/Makefile.crtools ++++ b/criu/Makefile.crtools +@@ -87,6 +87,7 @@ obj-y += servicefd.o + obj-y += pie-util-vdso.o + obj-y += vdso.o + obj-y += devname.o ++obj-y += mnl.o + obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o + CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 + obj-$(CONFIG_COMPAT) += vdso-compat.o +diff --git a/criu/Makefile.packages b/criu/Makefile.packages +index f380fa2..ce04529 100644 +--- a/criu/Makefile.packages ++++ b/criu/Makefile.packages +@@ -7,6 +7,8 @@ REQ-RPM-PKG-NAMES += protobuf-python + REQ-RPM-PKG-NAMES += libnl3-devel + REQ-RPM-PKG-NAMES += libcap-devel + REQ-RPM-PKG-NAMES += $(PYTHON)-future ++REQ-RPM-PKG-NAMES += libmnl-devel ++REQ-RPM-PKG-NAMES += libnftnl-devel + + REQ-RPM-PKG-TEST-NAMES += libaio-devel + +@@ -17,6 +19,8 @@ REQ-DEB-PKG-NAMES += protobuf-compiler + REQ-DEB-PKG-NAMES += python-protobuf + REQ-DEB-PKG-NAMES += libnl-3-dev + REQ-DEB-PKG-NAMES += libcap-dev ++REQ-DEB-PKG-NAMES += libmnl-dev ++REQ-DEB-PKG-NAMES += libnftnl-dev + + REQ-DEB-PKG-TEST-NAMES += python-yaml + REQ-DEB-PKG-TEST-NAMES += libaio-dev +@@ -33,6 +37,10 @@ REQ-RPM-PKG-TEST-NAMES += $(PYTHON)-pyyaml + endif + + export LIBS += -lprotobuf-c -ldl -lnl-3 -lsoccr -Lsoccr/ -lnet ++export LIBS += $(shell pkg-config --libs libmnl) ++export LIBS += $(shell pkg-config --libs libnftnl) ++export CFLAGS += $(shell pkg-config --cflags libmnl) ++export CFLAGS += $(shell pkg-config --cflags 
libnftnl) + + check-packages-failed: + $(warning Can not find some of the required libraries) +diff --git a/criu/include/nftables.h b/criu/include/nftables.h +new file mode 100644 +index 0000000..0bdab31 +--- /dev/null ++++ b/criu/include/nftables.h +@@ -0,0 +1,28 @@ ++#ifndef __CR_NFTABLES_H__ ++#define __CR_NFTABLES_H__ ++ ++#include ++ ++struct mnl_params { ++ struct mnl_socket *nl; ++ char *buf; ++ struct mnl_nlmsg_batch *batch; ++ uint32_t seq; ++}; ++ ++typedef struct nlmsghdr * (*buf_func_t)(struct mnl_params *mnl_params, void *args); ++typedef int (*batch_func_t)(struct mnl_params *mnl_params, void *args); ++typedef int (*mnl_func_t)(struct mnl_params *mnl, batch_func_t cb, void *args); ++ ++struct mnl_cb_params { ++ pid_t tree_id; ++ bool create; ++ bool ipv6; ++}; ++ ++int mnl_sendmsg(batch_func_t batch_cb, void *args); ++int mnl_common(mnl_func_t mnl_cb, void *arg1, void *arg2); ++int mnl_batch_send_and_recv(struct mnl_params *mnl_params, batch_func_t cb, void *args, int *result); ++int mnl_buf_send_and_recv(struct mnl_params *mnl_params, buf_func_t cb, void *args, int *result); ++ ++#endif /* __CR_NFTABLES_H__ */ +diff --git a/criu/mnl.c b/criu/mnl.c +new file mode 100644 +index 0000000..3a03202 +--- /dev/null ++++ b/criu/mnl.c +@@ -0,0 +1,165 @@ ++#include ++#include ++#include ++ ++#include ++ ++#include "nftables.h" ++#include "log.h" ++ ++int mnl_common(mnl_func_t mnl_cb, void *arg1, void *arg2) ++{ ++ char buf[MNL_SOCKET_BUFFER_SIZE]; ++ struct mnl_params mnl = { ++ .seq = time(NULL), ++ }; ++ int retval = -1; ++ ++ mnl.nl = mnl_socket_open(NETLINK_NETFILTER); ++ if (mnl.nl == NULL) { ++ pr_err("mnl_socket_open failed with %d: %s\n", errno, strerror(errno)); ++ return -1; ++ } ++ ++ if (mnl_socket_bind(mnl.nl, 0, MNL_SOCKET_AUTOPID) < 0) { ++ pr_err("mnl_socket_bind wailed with %d: %s\n", errno, strerror(errno)); ++ goto err_mnl; ++ } ++ ++ mnl.buf = buf; ++ mnl.batch = mnl_nlmsg_batch_start(buf, sizeof(buf)); ++ if (mnl.batch == NULL) ++ goto 
err_mnl; ++ ++ if (mnl_cb(&mnl, arg1, arg2) < 0) ++ goto err_batch; ++ ++ retval = 0; ++ ++err_batch: ++ mnl_nlmsg_batch_stop(mnl.batch); ++err_mnl: ++ mnl_socket_close(mnl.nl); ++ ++ return retval; ++} ++ ++static int mnl_sendmsg_internal(struct mnl_params *mnl, batch_func_t cb, void *args) ++{ ++ int retval = -1; ++ ++ nftnl_batch_begin(mnl_nlmsg_batch_current(mnl->batch), mnl->seq++); ++ mnl_nlmsg_batch_next(mnl->batch); ++ ++ if (cb(mnl, args) < 0) ++ goto err_batch; ++ ++ nftnl_batch_end(mnl_nlmsg_batch_current(mnl->batch), mnl->seq++); ++ mnl_nlmsg_batch_next(mnl->batch); ++ ++ if (mnl_socket_sendto(mnl->nl, mnl_nlmsg_batch_head(mnl->batch), ++ mnl_nlmsg_batch_size(mnl->batch)) < 0) { ++ pr_err("%s: mnl_socket_sendto failed with %d: %s\n", ++ __func__, errno, strerror(errno)); ++ goto err_batch; ++ } ++ ++ retval = 0; ++ ++err_batch: ++ return retval; ++} ++ ++int mnl_sendmsg(batch_func_t batch_cb, void *args) ++{ ++ return mnl_common(mnl_sendmsg_internal, batch_cb, args); ++} ++ ++int mnl_batch_send_and_recv(struct mnl_params *mnl_params, batch_func_t cb, ++ void *args, int *result) ++{ ++ struct mnl_socket *nl = mnl_params->nl; ++ struct mnl_nlmsg_batch *batch = mnl_params->batch; ++ uint32_t *seq = &mnl_params->seq; ++ char buf[MNL_SOCKET_BUFFER_SIZE]; ++ int retval; ++ ++ mnl_nlmsg_batch_reset(batch); ++ nftnl_batch_begin(mnl_nlmsg_batch_current(batch), (*seq)++); ++ mnl_nlmsg_batch_next(batch); ++ ++ if (cb(mnl_params, args) < 0) ++ return -1; ++ ++ nftnl_batch_end(mnl_nlmsg_batch_current(batch), (*seq)++); ++ mnl_nlmsg_batch_next(batch); ++ ++ if (mnl_socket_sendto(nl, mnl_nlmsg_batch_head(batch), ++ mnl_nlmsg_batch_size(batch)) < 0) { ++ pr_err("%s: mnl_socket_sendto failed with %d: %s\n", ++ __func__, errno, strerror(errno)); ++ return -1; ++ } ++ ++ /* don't care the netlink retval, and nlmsg hdr flags has no `NLM_F_ACK` */ ++ if (result == NULL) ++ return 0; ++ ++ retval = mnl_socket_recvfrom(nl, buf, sizeof(buf)); ++ while (retval > 0) { ++ retval 
= mnl_cb_run(buf, retval, 0, mnl_socket_get_portid(nl), NULL, NULL); ++ if (retval <= 0) ++ break; ++ retval = mnl_socket_recvfrom(nl, buf, sizeof(buf)); ++ } ++ ++ if (retval < 0) { ++ pr_err("%s: mnl batch socket recv errno with %d: %s\n", ++ __func__, errno, strerror(errno)); ++ *result = errno; ++ return -1; ++ } ++ ++ *result = 0; ++ return 0; ++} ++ ++int mnl_buf_send_and_recv(struct mnl_params *mnl_params, buf_func_t cb, ++ void *args, int *result) ++{ ++ struct mnl_socket *nl = mnl_params->nl; ++ char buf[MNL_SOCKET_BUFFER_SIZE]; ++ struct nlmsghdr *nlh; ++ int retval = 0; ++ ++ if ((nlh = cb(mnl_params, args)) == NULL) ++ return -1; ++ ++ if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { ++ pr_err("%s: mnl_socket_sendto failed with %d: %s\n", ++ __func__, errno, strerror(errno)); ++ return -1; ++ } ++ ++ /* don't care the netlink retval, and nlmsg hdr flags has no `NLM_F_ACK` */ ++ if (result == NULL) ++ return 0; ++ ++ retval = mnl_socket_recvfrom(nl, buf, sizeof(buf)); ++ while (retval > 0) { ++ retval = mnl_cb_run(buf, retval, 0, mnl_socket_get_portid(nl), NULL, NULL); ++ if (retval <= 0) ++ break; ++ retval = mnl_socket_recvfrom(nl, buf, sizeof(buf)); ++ } ++ ++ if (retval < 0) { ++ pr_info("%s: mnl buf socket recv errno with %d: %s\n", ++ __func__, errno, strerror(errno)); ++ *result = errno; ++ return -1; ++ } ++ ++ *result = 0; ++ return 0; ++} +-- +2.34.0 + diff --git a/backport-0033--nftables-implement-nft-api-for-tcp.patch b/backport-0033--nftables-implement-nft-api-for-tcp.patch new file mode 100644 index 0000000..5e65ac7 --- /dev/null +++ b/backport-0033--nftables-implement-nft-api-for-tcp.patch @@ -0,0 +1,1011 @@ +From a066cc491f9ff063a0788392bec33284fc606596 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Wed, 11 Aug 2021 16:50:49 +0800 +Subject: [PATCH 33/49] nftables: implement nft api for tcp + +Signed-off-by: fu.lin +--- + criu/Makefile.crtools | 1 + + criu/include/nftables.h | 138 +++++++ + criu/nftables.c | 823 
++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 962 insertions(+) + create mode 100644 criu/nftables.c + +diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools +index 4548768..2a8ec61 100644 +--- a/criu/Makefile.crtools ++++ b/criu/Makefile.crtools +@@ -88,6 +88,7 @@ obj-y += pie-util-vdso.o + obj-y += vdso.o + obj-y += devname.o + obj-y += mnl.o ++obj-y += nftables.o + obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o + CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 + obj-$(CONFIG_COMPAT) += vdso-compat.o +diff --git a/criu/include/nftables.h b/criu/include/nftables.h +index 0bdab31..3b51a3d 100644 +--- a/criu/include/nftables.h ++++ b/criu/include/nftables.h +@@ -3,6 +3,99 @@ + + #include + ++#include ++#include ++#include ++#include ++#include ++ ++#define construct_buf(buf, type, family, flags, seq, payload, cb_prefix) \ ++ ({ \ ++ struct nlmsghdr *_nlh; \ ++ \ ++ _nlh = nftnl_##cb_prefix##_nlmsg_build_hdr((buf), \ ++ (type), (family), (flags), (seq)); \ ++ nftnl_##cb_prefix##_nlmsg_build_payload(_nlh, (payload)); \ ++ nftnl_##cb_prefix##_free((payload)); \ ++ _nlh; \ ++ }) ++ ++#define construct_table_buf(buf, type, family, flags, seq, payload) \ ++ construct_buf((buf), (type), (family), (flags), (seq), \ ++ (payload), table) ++ ++#define construct_chain_buf(buf, type, family, flags, seq, payload) \ ++ construct_buf((buf), (type), (family), (flags), (seq), \ ++ (payload), chain) ++ ++#define construct_batch(batch, type, family, flags, seq, payload, cb_prefix) \ ++ { \ ++ struct nlmsghdr *_nlh; \ ++ \ ++ _nlh = nftnl_##cb_prefix##_nlmsg_build_hdr( \ ++ mnl_nlmsg_batch_current(batch), \ ++ (type), (family), (flags), (seq)); \ ++ nftnl_##cb_prefix##_nlmsg_build_payload(_nlh, (payload)); \ ++ nftnl_##cb_prefix##_free((payload)); \ ++ mnl_nlmsg_batch_next((batch)); \ ++ } ++ ++#define construct_table_batch(batch, type, family, flags, seq, payload) \ ++ construct_batch((batch), (type), (family), (flags), (seq), \ ++ (payload), table) ++ ++#define 
construct_chain_batch(batch, type, family, flags, seq, payload) \ ++ construct_batch((batch), (type), (family), (flags), (seq), \ ++ (payload), chain) ++ ++#define construct_set_batch(batch, type, family, flags, seq, payload) \ ++ construct_batch((batch), (type), (family), (flags), (seq), \ ++ (payload), set) ++ ++#define construct_rule_batch(batch, type, family, flags, seq, payload) \ ++ construct_batch((batch), (type), (family), (flags), (seq), \ ++ (payload), rule) ++ ++#define construct_set_elems_batch(batch, type, family, flags, seq, payload) \ ++ { \ ++ struct nlmsghdr *_nlh; \ ++ \ ++ _nlh = nftnl_nlmsg_build_hdr( \ ++ mnl_nlmsg_batch_current(batch), \ ++ (type), (family), (flags), (seq)); \ ++ nftnl_set_elems_nlmsg_build_payload(_nlh, (payload)); \ ++ nftnl_set_free((payload)); \ ++ mnl_nlmsg_batch_next((batch)); \ ++ } ++ ++#define TABLE_NAME "filter" ++#define INPUT_CHAIN_NAME "criu-input" ++#define OUTPUT_CHAIN_NAME "criu-output" ++#define INPUT_IPV4_SET_NAME "criu-input-ipv4-blacklist-%d" ++#define INPUT_IPV6_SET_NAME "criu-input-ipv6-blacklist-%d" ++#define OUTPUT_IPV4_SET_NAME "criu-output-ipv4-blacklist-%d" ++#define OUTPUT_IPV6_SET_NAME "criu-output-ipv6-blacklist-%d" ++ ++/* set key type, see nftables/include/datatypes.h ++ * The rule of the datatype calculation: ++ * Each type occupies 6 bits, type: ++ * - ipaddr: 7, 4 bytes ++ * - ip6addr: 8, 16 bytes ++ * - inet_service: 13, 2 bytes (padding to 4 bytes) ++ * ++ * 0x1cd1cd: 0b 000111 001101 000111 001101 ++ * 0x20d20d: 0b 001000 001101 001000 001101 ++ */ ++#define INET_SERVICE_LEN 2 ++#define IPADDR_LEN 4 ++#define IP6ADDR_LEN 16 ++#define div_round_up(n, d) (((n) + (d) - 1) / (d)) ++ ++#define IPv4_KEY_TYPE 0x1cd1cd ++#define IPv4_KEY_LEN div_round_up(IPADDR_LEN + INET_SERVICE_LEN, 4) * 4 * 2 ++#define IPv6_KEY_TYPE 0x20d20d ++#define IPv6_KEY_LEN div_round_up(IP6ADDR_LEN + INET_SERVICE_LEN, 4) * 4 * 2 ++ + struct mnl_params { + struct mnl_socket *nl; + char *buf; +@@ -25,4 +118,49 @@ int
mnl_common(mnl_func_t mnl_cb, void *arg1, void *arg2); + int mnl_batch_send_and_recv(struct mnl_params *mnl_params, batch_func_t cb, void *args, int *result); + int mnl_buf_send_and_recv(struct mnl_params *mnl_params, buf_func_t cb, void *args, int *result); + ++struct nft_chain_params { ++ char *name; ++ uint32_t hooknum; ++ char *type; ++ uint32_t prio; ++ uint32_t policy; ++}; ++ ++struct nft_set_params { ++ char name[128]; ++ uint32_t id; ++ uint32_t datatype; ++ uint32_t key_len; ++}; ++ ++struct nft_rule_params { ++ char *chain_name; ++ char set_name[128]; ++ uint32_t mark; ++ uint16_t mark_op; ++ uint32_t nfproto; ++ uint8_t l4proto; ++ unsigned int stmt; ++ bool ipv6; ++}; ++ ++struct nft_set_elem_params { ++ char set_name[128]; ++ char data[40]; ++ size_t data_len; ++}; ++ ++struct nf_conn_params { ++ uint8_t family; ++ uint32_t *src_addr; ++ uint16_t src_port; ++ uint32_t *dst_addr; ++ uint16_t dst_port; ++ bool lock; ++ pid_t tree_id; ++}; ++ ++struct inet_sk_desc; ++int nft_connection_switch(struct inet_sk_desc *sk, bool lock, pid_t tree_id); ++ + #endif /* __CR_NFTABLES_H__ */ +diff --git a/criu/nftables.c b/criu/nftables.c +new file mode 100644 +index 0000000..57774e6 +--- /dev/null ++++ b/criu/nftables.c +@@ -0,0 +1,823 @@ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "sk-inet.h" ++#include "nftables.h" ++ ++#include "../soccr/soccr.h" ++ ++#include "log.h" ++ ++static struct nftnl_table *setup_table(uint8_t family, const char *table) ++{ ++ struct nftnl_table *t; ++ ++ t = nftnl_table_alloc(); ++ if (t == NULL) ++ return NULL; ++ ++ nftnl_table_set_u32(t, NFTNL_TABLE_FAMILY, family); ++ if (nftnl_table_set_str(t, NFTNL_TABLE_NAME, table) < 0) ++ goto err; ++ ++ return t; ++err: ++ nftnl_table_free(t); ++ return NULL; ++} ++ ++static struct nftnl_chain *setup_chain(const char *table, ++ struct nft_chain_params *params, ++ bool create) 
++{ ++ struct nftnl_chain *c; ++ ++ c = nftnl_chain_alloc(); ++ if (c == NULL) ++ return NULL; ++ ++ if (nftnl_chain_set_str(c, NFTNL_CHAIN_TABLE, table) < 0) ++ goto err; ++ if (nftnl_chain_set_str(c, NFTNL_CHAIN_NAME, params->name) < 0) ++ goto err; ++ if (create) { ++ nftnl_chain_set_u32(c, NFTNL_CHAIN_HOOKNUM, params->hooknum); ++ if (nftnl_chain_set_str(c, NFTNL_CHAIN_TYPE, params->type) < 0) ++ goto err; ++ nftnl_chain_set_u32(c, NFTNL_CHAIN_PRIO, params->prio); ++ nftnl_chain_set_u32(c, NFTNL_CHAIN_POLICY, params->policy); ++ } ++ ++ return c; ++err: ++ nftnl_chain_free(c); ++ return NULL; ++} ++ ++static struct nftnl_set *setup_set(uint8_t family, const char *table, ++ struct nft_set_params *params, ++ bool create) ++{ ++ struct nftnl_set *s; ++ ++ s = nftnl_set_alloc(); ++ if (s == NULL) ++ return NULL; ++ ++ if (nftnl_set_set_str(s, NFTNL_SET_TABLE, table) < 0) ++ goto err; ++ if (nftnl_set_set_str(s, NFTNL_SET_NAME, params->name) < 0) ++ goto err; ++ if (create) { ++ nftnl_set_set_u32(s, NFTNL_SET_FAMILY, family); ++ nftnl_set_set_u32(s, NFTNL_SET_ID, params->id); ++ ++ nftnl_set_set_u32(s, NFTNL_SET_KEY_TYPE, params->datatype); ++ nftnl_set_set_u32(s, NFTNL_SET_KEY_LEN, params->key_len); ++ } ++ ++ return s; ++err: ++ nftnl_set_free(s); ++ return NULL; ++} ++ ++static int add_mark(struct nftnl_rule *r, uint32_t meta_key, enum nft_registers dreg) ++{ ++ struct nftnl_expr *e; ++ ++ e = nftnl_expr_alloc("meta"); ++ if (e == NULL) ++ return -1; ++ ++ nftnl_expr_set_u32(e, NFTNL_EXPR_META_KEY, meta_key); ++ nftnl_expr_set_u32(e, NFTNL_EXPR_META_DREG, dreg); ++ ++ nftnl_rule_add_expr(r, e); ++ ++ return 0; ++} ++ ++static int add_proto(struct nftnl_rule *r, enum nft_registers dreg) ++{ ++ struct nftnl_expr *e; ++ ++ e = nftnl_expr_alloc("meta"); ++ if (e == NULL) ++ return -1; ++ ++ nftnl_expr_set_u32(e, NFTNL_EXPR_META_KEY, NFT_META_L4PROTO); ++ nftnl_expr_set_u32(e, NFTNL_EXPR_META_DREG, dreg); ++ ++ nftnl_rule_add_expr(r, e); ++ ++ return 0; ++} ++ 
++static int add_payload(struct nftnl_rule *r, uint32_t base, uint32_t dreg, ++ uint32_t offset, uint32_t len) ++{ ++ struct nftnl_expr *e; ++ ++ e = nftnl_expr_alloc("payload"); ++ if (e == NULL) ++ return -1; ++ ++ nftnl_expr_set_u32(e, NFTNL_EXPR_PAYLOAD_BASE, base); ++ nftnl_expr_set_u32(e, NFTNL_EXPR_PAYLOAD_DREG, dreg); ++ nftnl_expr_set_u32(e, NFTNL_EXPR_PAYLOAD_OFFSET, offset); ++ nftnl_expr_set_u32(e, NFTNL_EXPR_PAYLOAD_LEN, len); ++ ++ nftnl_rule_add_expr(r, e); ++ ++ return 0; ++} ++ ++static int add_cmp(struct nftnl_rule *r, enum nft_registers sreg, uint32_t op, ++ const void *data, uint32_t data_len) ++{ ++ struct nftnl_expr *e; ++ ++ e = nftnl_expr_alloc("cmp"); ++ if (e == NULL) ++ return -1; ++ ++ nftnl_expr_set_u32(e, NFTNL_EXPR_CMP_SREG, sreg); ++ nftnl_expr_set_u32(e, NFTNL_EXPR_CMP_OP, op); ++ nftnl_expr_set(e, NFTNL_EXPR_CMP_DATA, data, data_len); ++ ++ nftnl_rule_add_expr(r, e); ++ ++ return 0; ++} ++ ++static int add_lookup(struct nftnl_rule *r, enum nft_registers sreg, ++ const char *set) ++{ ++ struct nftnl_expr *e; ++ ++ e = nftnl_expr_alloc("lookup"); ++ if (e == NULL) ++ return -1; ++ ++ if (nftnl_expr_set_str(e, NFTNL_EXPR_LOOKUP_SET, set) < 0) ++ goto err; ++ nftnl_expr_set_u32(e, NFTNL_EXPR_LOOKUP_SREG, sreg); ++ ++ nftnl_rule_add_expr(r, e); ++ ++ return 0; ++err: ++ nftnl_expr_free(e); ++ return -1; ++} ++ ++static int add_counter(struct nftnl_rule *r) ++{ ++ struct nftnl_expr *e; ++ ++ e = nftnl_expr_alloc("counter"); ++ if (e == NULL) ++ return -1; ++ ++ nftnl_rule_add_expr(r, e); ++ ++ return 0; ++} ++ ++static int add_verdict(struct nftnl_rule *r, const char *chain, int verdict) ++{ ++ struct nftnl_expr *e; ++ ++ e = nftnl_expr_alloc("immediate"); ++ if (e == NULL) ++ return -1; ++ ++ nftnl_expr_set_u32(e, NFTNL_EXPR_IMM_DREG, NFT_REG_VERDICT); ++ nftnl_expr_set_u32(e, NFTNL_EXPR_IMM_VERDICT, verdict); ++ ++ nftnl_rule_add_expr(r, e); ++ ++ return 0; ++} ++ ++static int __setup_rule(struct nftnl_rule *r, struct nft_rule_params 
*params) ++{ ++ /* meta nfproto == */ ++ if (add_mark(r, NFT_META_PROTOCOL, NFT_REG32_00) < 0) ++ return -1; ++ if (add_cmp(r, NFT_REG32_00, NFT_CMP_EQ, &params->nfproto, sizeof(uint32_t))< 0) ++ return -1; ++ ++ /* meta l4proto == */ ++ if (add_proto(r, NFT_REG32_00) < 0) ++ return -1; ++ if (add_cmp(r, NFT_REG32_00, NFT_CMP_EQ, &params->l4proto, sizeof(uint8_t)) < 0) ++ return -1; ++ ++ /* ip saddr . sport . daddr . dport @ */ ++ if (params->ipv6 == false) { ++ if (add_payload(r, NFT_PAYLOAD_NETWORK_HEADER, NFT_REG32_00, ++ offsetof(struct iphdr, saddr), IPADDR_LEN) < 0) ++ return -1; ++ if (add_payload(r, NFT_PAYLOAD_TRANSPORT_HEADER, NFT_REG32_01, ++ offsetof(struct tcphdr, source), INET_SERVICE_LEN) < 0) ++ return -1; ++ if (add_payload(r, NFT_PAYLOAD_NETWORK_HEADER, NFT_REG32_02, ++ offsetof(struct iphdr, daddr), IPADDR_LEN) < 0) ++ return -1; ++ if (add_payload(r, NFT_PAYLOAD_TRANSPORT_HEADER, NFT_REG32_03, ++ offsetof(struct tcphdr, dest), INET_SERVICE_LEN) < 0) ++ return -1; ++ ++ if (add_lookup(r, NFT_REG32_00, params->set_name) < 0) ++ return -1; ++ } else { ++ if (add_payload(r, NFT_PAYLOAD_NETWORK_HEADER, NFT_REG32_00, ++ offsetof(struct ipv6hdr, saddr), IP6ADDR_LEN) < 0) ++ return -1; ++ if (add_payload(r, NFT_PAYLOAD_TRANSPORT_HEADER, NFT_REG32_04, ++ offsetof(struct tcphdr, source), INET_SERVICE_LEN) < 0) ++ return -1; ++ if (add_payload(r, NFT_PAYLOAD_NETWORK_HEADER, NFT_REG32_05, ++ offsetof(struct ipv6hdr, daddr), IP6ADDR_LEN) < 0) ++ return -1; ++ if (add_payload(r, NFT_PAYLOAD_TRANSPORT_HEADER, NFT_REG32_09, ++ offsetof(struct tcphdr, dest), INET_SERVICE_LEN) < 0) ++ return -1; ++ ++ if (add_lookup(r, NFT_REG32_00, params->set_name) < 0) ++ return -1; ++ } ++ ++ /* counter */ ++ if (add_counter(r) < 0) ++ return -1; ++ ++ return 0; ++} ++ ++static struct nftnl_rule *setup_rule(uint8_t family, const char *table, ++ struct nft_rule_params *params, ++ bool create, bool ns) ++{ ++ struct nftnl_rule *r = NULL; ++ ++ r = nftnl_rule_alloc(); ++ if (r == NULL)
++ return NULL; ++ ++ if (nftnl_rule_set_str(r, NFTNL_RULE_TABLE, table) < 0) ++ goto err; ++ nftnl_rule_set_u32(r, NFTNL_RULE_FAMILY, family); ++ if (nftnl_rule_set_str(r, NFTNL_RULE_CHAIN, params->chain_name) < 0) ++ goto err; ++ ++ if (params->mark != 0) { ++ /* meta mark != */ ++ if (add_mark(r, NFT_META_MARK, NFT_REG32_00) < 0) ++ goto err; ++ if (add_cmp(r, NFT_REG32_00, params->mark_op, &params->mark, sizeof(uint32_t)) < 0) ++ goto err; ++ } ++ ++ if (!ns && __setup_rule(r, params) < 0) ++ goto err; ++ ++ /* drop */ ++ if (add_verdict(r, params->chain_name, params->stmt) < 0) ++ goto err; ++ ++ return r; ++ ++err: ++ nftnl_rule_free(r); ++ return NULL; ++} ++ ++static struct nlmsghdr *nft_table_detect(struct mnl_params *mnl_params, void *args) ++{ ++ struct nftnl_table *table; ++ ++ table = setup_table(NFPROTO_INET, TABLE_NAME); ++ if (table == NULL) ++ return NULL; ++ ++ return construct_table_buf(mnl_params->buf, NFT_MSG_GETTABLE, NFPROTO_INET, ++ NLM_F_ACK, mnl_params->seq++, table); ++} ++ ++static int nft_table_create(struct mnl_params *mnl_params, void *args) ++{ ++ struct nftnl_table *table; ++ ++ table = setup_table(NFPROTO_INET, TABLE_NAME); ++ if (table == NULL) ++ return -1; ++ ++ construct_table_batch(mnl_params->batch, NFT_MSG_NEWTABLE, NFPROTO_INET, ++ NLM_F_CREATE|NLM_F_EXCL|NLM_F_ACK, ++ mnl_params->seq++, table); ++ ++ return 0; ++} ++ ++static int nft_table_prepare(struct mnl_params *mnl_params) ++{ ++ int result = 0; ++ ++ if (mnl_buf_send_and_recv(mnl_params, nft_table_detect, NULL, &result) == 0) ++ return 0; ++ ++ pr_debug("%s: detect table result %d\n", __func__, result); ++ ++ if (result == ENOENT && ++ (mnl_batch_send_and_recv(mnl_params, nft_table_create, NULL, &result) < 0 ++ && (result != 0 && result != EEXIST))) { ++ pr_err("%s: create nftables table failed!\n", __func__); ++ return -1; ++ } else if (result != 0) { ++ pr_err("%s: detect table result %d\n", __func__, -result); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static struct
nlmsghdr *nft_chain_detect(struct mnl_params *mnl_params, void *args) ++{ ++ struct nftnl_chain *chain; ++ ++ chain = setup_chain(TABLE_NAME, args, false); ++ if (chain == NULL) ++ return NULL; ++ ++ return construct_chain_buf(mnl_params->buf, NFT_MSG_GETCHAIN, NFPROTO_INET, ++ NLM_F_ACK, mnl_params->seq++, chain); ++} ++ ++static int nft_chain_create(struct mnl_params *mnl_params, void *args) ++{ ++ struct nftnl_chain *chain; ++ ++ chain = setup_chain(TABLE_NAME, args, true); ++ if (chain == NULL) ++ return -1; ++ ++ construct_chain_batch(mnl_params->batch, NFT_MSG_NEWCHAIN, NFPROTO_INET, ++ NLM_F_CREATE|NLM_F_EXCL|NLM_F_ACK, mnl_params->seq++, chain); ++ ++ return 0; ++} ++ ++static int nft_chain_prepare_internal(struct mnl_params *mnl_params, ++ struct nft_chain_params *params) ++{ ++ int result = 0; ++ ++ if (mnl_buf_send_and_recv(mnl_params, nft_chain_detect, params, &result) == 0) ++ return 0; ++ ++ pr_debug("%s: detect chain result %d\n", __func__, result); ++ ++ if (result == ENOENT && ++ (mnl_batch_send_and_recv(mnl_params, nft_chain_create, params, &result) < 0 ++ && (result != 0 && result != EEXIST))) { ++ pr_err("%s: nftables create chain %s failed!\n", ++ __func__, params->name); ++ return -1; ++ } else if (result != 0) { ++ pr_err("%s: detect chain result %d\n", __func__, -result); ++ return -1; ++ } ++ ++ return result; ++} ++ ++static int nft_chain_prepare(struct mnl_params *mnl_params) ++{ ++ struct nft_chain_params params = { ++ .type = "filter", ++ .prio = NF_IP_PRI_FILTER, ++ .policy = NF_ACCEPT, ++ }; ++ ++ /* prepare ipv4 input chain in filter table */ ++ params.name = INPUT_CHAIN_NAME; ++ params.hooknum = NF_INET_LOCAL_IN; ++ ++ if (nft_chain_prepare_internal(mnl_params, &params) < 0) ++ return -1; ++ ++ /* prepare ipv4 output chain in filter table */ ++ params.name = OUTPUT_CHAIN_NAME; ++ params.hooknum = NF_INET_LOCAL_OUT; ++ ++ if (nft_chain_prepare_internal(mnl_params, &params) < 0) ++ return -1; ++ ++ return 0; ++} ++ ++static int
nft_set_internal(uint8_t family, struct mnl_params *mnl_params, ++ struct nft_set_params *params, bool create) ++{ ++ struct nftnl_set *set; ++ ++ set = setup_set(family, TABLE_NAME, params, create); ++ if (set == NULL) ++ return -1; ++ ++ if (create) { ++ construct_set_batch(mnl_params->batch, NFT_MSG_NEWSET, family, ++ NLM_F_CREATE|NLM_F_EXCL|NLM_F_ACK, mnl_params->seq++, set); ++ } else { ++ construct_set_batch(mnl_params->batch, NFT_MSG_DELSET, family, ++ 0, mnl_params->seq++, set); ++ } ++ ++ return 0; ++} ++ ++static int nft_set_raw(struct mnl_params *mnl_params, ++ struct mnl_cb_params *args, bool input) ++{ ++ const uint32_t set_id_base = input ? 0x12315 : 0x17173; ++ const uint8_t family = NFPROTO_INET; ++ struct nft_set_params params = { 0 }; ++ char *set_name; ++ int idx = 0; ++ ++ if (!args->ipv6) { ++ params.datatype = IPv4_KEY_TYPE; ++ params.key_len = IPv4_KEY_LEN; ++ idx = 4; ++ } else { ++ params.datatype = IPv6_KEY_TYPE; ++ params.key_len = IPv6_KEY_LEN; ++ idx = 6; ++ } ++ ++ if (args->ipv6 && input) ++ set_name = INPUT_IPV6_SET_NAME; ++ else if (args->ipv6 && !input) ++ set_name = OUTPUT_IPV6_SET_NAME; ++ else if (!args->ipv6 && input) ++ set_name = INPUT_IPV4_SET_NAME; ++ else ++ set_name = OUTPUT_IPV4_SET_NAME; ++ ++ snprintf(params.name, sizeof(params.name)-1, set_name, args->tree_id); ++ params.id = set_id_base + args->tree_id + idx; ++ ++ if (nft_set_internal(family, mnl_params, &params, args->create) < 0) { ++ pr_err("%s: create nftables %s %s set failed!\n", __func__, ++ args->ipv6 ? "ipv6" : "ipv4", ++ input ?
"input" : "output"); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static int nft_set(struct mnl_params *mnl_params, void *args) ++{ ++ struct mnl_cb_params *params = args; ++ ++ params->ipv6 = false; ++ if (nft_set_raw(mnl_params, params, true) < 0) ++ return -1; ++ ++ if (nft_set_raw(mnl_params, params, false) < 0) ++ return -1; ++ ++ params->ipv6 = true; ++ if (nft_set_raw(mnl_params, params, true) < 0) ++ return -1; ++ ++ if (nft_set_raw(mnl_params, params, false) < 0) ++ return -1; ++ ++ return 0; ++} ++ ++static int nft_set_common(struct mnl_params *mnl_params, pid_t tree_id, bool create) ++{ ++ struct mnl_cb_params params = { ++ .tree_id = tree_id, ++ .create = create, ++ }; ++ int result = 0; ++ ++ if (create && ++ (mnl_batch_send_and_recv(mnl_params, nft_set, &params, &result) < 0 ++ && (result != 0 && result != EEXIST))) { ++ pr_err("%s: create set failed!\n", __func__); ++ return -1; ++ } else if (!create && ++ mnl_batch_send_and_recv(mnl_params, nft_set, &params, NULL) < 0) { ++ pr_err("%s: delete set failed!\n", __func__); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static int nft_rule_internal(uint8_t family, struct mnl_params *mnl_params, ++ struct nft_rule_params *params, bool create) ++{ ++ struct nftnl_rule *rule; ++ ++ rule = setup_rule(family, TABLE_NAME, params, create, false); ++ if (rule == NULL) ++ return -1; ++ ++ if (create) { ++ construct_rule_batch(mnl_params->batch, NFT_MSG_NEWRULE, family, ++ NLM_F_CREATE|NLM_F_EXCL|NLM_F_ACK, ++ mnl_params->seq++, rule); ++ } else { ++ construct_rule_batch(mnl_params->batch, NFT_MSG_DELRULE, family, ++ 0, mnl_params->seq++, rule); ++ } ++ ++ return 0; ++} ++ ++static int nft_rule_raw(struct mnl_params *mnl_params, struct mnl_cb_params *args, ++ struct nft_rule_params *params) ++{ ++ char *set_name; ++ ++ params->nfproto = params->ipv6 ? htons(ETH_P_IPV6) : htons(ETH_P_IP); ++ ++ set_name = params->ipv6 ?
INPUT_IPV6_SET_NAME : INPUT_IPV4_SET_NAME; ++ params->chain_name = INPUT_CHAIN_NAME; ++ snprintf(params->set_name, sizeof(params->set_name)-1, set_name, args->tree_id); ++ if (nft_rule_internal(NFPROTO_INET, mnl_params, params, args->create) < 0) { ++ pr_err("%s: create nft %s input rule failed!\n", ++ __func__, params->ipv6 ? "ipv6" : "ipv4"); ++ return -1; ++ } ++ ++ set_name = params->ipv6 ? OUTPUT_IPV6_SET_NAME : OUTPUT_IPV4_SET_NAME; ++ params->chain_name = OUTPUT_CHAIN_NAME; ++ snprintf(params->set_name, sizeof(params->set_name)-1, set_name, args->tree_id); ++ if (nft_rule_internal(NFPROTO_INET, mnl_params, params, args->create) < 0) { ++ pr_err("%s: create nftables %s output rule failed!\n", ++ __func__, params->ipv6 ? "ipv6" : "ipv4"); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static int nft_rule(struct mnl_params *mnl_params, void *args) ++{ ++ struct nft_rule_params params = { ++ .l4proto = IPPROTO_TCP, ++ .mark = SOCCR_MARK, ++ .mark_op = NFT_CMP_NEQ, ++ .stmt = NF_DROP, ++ }; ++ ++ params.ipv6 = false; ++ if (nft_rule_raw(mnl_params, args, &params) < 0) ++ return -1; ++ ++ params.ipv6 = true; ++ if (nft_rule_raw(mnl_params, args, &params) < 0) ++ return -1; ++ ++ return 0; ++} ++ ++static int nft_rule_common(struct mnl_params *mnl_params, pid_t tree_id, bool create) ++{ ++ struct mnl_cb_params params = { ++ .tree_id = tree_id, ++ .create = create, ++ }; ++ int result = 0; ++ ++ if (create && ++ (mnl_batch_send_and_recv(mnl_params, nft_rule, &params, &result) < 0 ++ && (result != 0 && result != EEXIST))) { ++ pr_err("%s: create rule failed!\n", __func__); ++ return -1; ++ } else if (!create && ++ mnl_batch_send_and_recv(mnl_params, nft_rule, &params, NULL) < 0) { ++ pr_err("%s: delete rule failed!\n", __func__); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static int network_prepare_internal(struct mnl_params *params, batch_func_t _, void *args) ++{ ++ pid_t tree_id = *(pid_t *)args; ++ ++ if (nft_table_prepare(params) < 0) ++ return -1; ++ ++ if
(nft_chain_prepare(params) < 0) ++ return -1; ++ ++ if (nft_set_common(params, tree_id, true) < 0) ++ return -1; ++ ++ if (nft_rule_common(params, tree_id, true) < 0) ++ return -1; ++ ++ return 0; ++} ++ ++int network_prepare(pid_t tree_id) ++{ ++ pr_info("Prepare network\n"); ++ ++ return mnl_common(network_prepare_internal, NULL, &tree_id); ++} ++ ++static int network_unprepare_internal(struct mnl_params *params, ++ batch_func_t _, void *args) ++{ ++ pid_t tree_id = *(pid_t *)args; ++ ++ if (nft_rule_common(params, tree_id, false) < 0) ++ return -1; ++ ++ if (nft_set_common(params, tree_id, false) < 0) ++ return -1; ++ ++ return 0; ++} ++ ++void network_unprepare(pid_t tree_id) ++{ ++ pr_info("Unprepare network\n"); ++ ++ mnl_common(network_unprepare_internal, NULL, &tree_id); ++} ++ ++static int add_set_elem_internal(struct nftnl_set *s, void *data, size_t len) ++{ ++ struct nftnl_set_elem *e; ++ ++ e = nftnl_set_elem_alloc(); ++ if (e == NULL) ++ return -1; ++ ++ nftnl_set_elem_set(e, NFTNL_SET_ELEM_KEY, data, len); ++ ++ nftnl_set_elem_add(s, e); ++ ++ return 0; ++} ++ ++static struct nftnl_set *add_set_elem(const char *table, const char *set, ++ void *data, size_t len) ++{ ++ struct nftnl_set *s; ++ ++ s = nftnl_set_alloc(); ++ if (s == NULL) ++ return NULL; ++ ++ if (nftnl_set_set_str(s, NFTNL_SET_TABLE, table) < 0) ++ goto err; ++ if (nftnl_set_set_str(s, NFTNL_SET_NAME, set) < 0) ++ goto err; ++ ++ if (add_set_elem_internal(s, data, len) < 0) ++ goto err; ++ ++ return s; ++ ++err: ++ nftnl_set_free(s); ++ return NULL; ++} ++ ++static int nft_set_elem(uint8_t family, struct mnl_params *mnl_param, ++ struct nft_set_elem_params *elem_param, ++ bool lock) ++{ ++ struct nftnl_set *set; ++ ++ set = add_set_elem(TABLE_NAME, elem_param->set_name, ++ elem_param->data, elem_param->data_len); ++ if (set == NULL) ++ return -1; ++ ++ if (lock) { ++ construct_set_elems_batch(mnl_param->batch, NFT_MSG_NEWSETELEM, ++ family, NLM_F_CREATE|NLM_F_EXCL, ++ mnl_param->seq++, 
set); ++ } else { ++ construct_set_elems_batch(mnl_param->batch, NFT_MSG_DELSETELEM, ++ family, 0, mnl_param->seq++, set); ++ } ++ ++ return 0; ++} ++ ++static void construct_set_elem_key(void *data, struct nf_conn_params *param, bool output) ++{ ++ size_t offset = 0; ++ size_t addr_len = param->family == AF_INET ? IPADDR_LEN : IP6ADDR_LEN; ++ ++ memcpy(data+offset, output ? param->src_addr : param->dst_addr, addr_len); ++ offset = addr_len; ++ *(uint32_t *)(data + offset) = htons(output ? param->src_port : param->dst_port); ++ offset += sizeof(uint32_t); ++ memcpy(data+offset, output ? param->dst_addr : param->src_addr, addr_len); ++ offset += addr_len; ++ *(uint32_t *)(data + offset) = htons(output ? param->dst_port : param->src_port); ++} ++ ++static int nf_connection_switch_raw(struct mnl_params *mnl_params, void *args) ++{ ++ struct nf_conn_params *param = args; ++ char *input_set_name, *output_set_name; ++ struct nft_set_elem_params elem; ++ ++ switch (param->family) { ++ case AF_INET: ++ input_set_name = INPUT_IPV4_SET_NAME; ++ output_set_name = OUTPUT_IPV4_SET_NAME; ++ elem.data_len = IPv4_KEY_LEN; ++ break; ++ case AF_INET6: ++ input_set_name = INPUT_IPV6_SET_NAME; ++ output_set_name = OUTPUT_IPV6_SET_NAME; ++ elem.data_len = IPv6_KEY_LEN; ++ break; ++ default: ++ pr_err("Unknown socket family %d\n", param->family); ++ return -1; ++ } ++ ++ construct_set_elem_key(elem.data, param, false); ++ snprintf(elem.set_name, sizeof(elem.set_name)-1, input_set_name, param->tree_id); ++ if (nft_set_elem(NFPROTO_INET, mnl_params, &elem, param->lock) < 0) ++ return -1; ++ ++ construct_set_elem_key(elem.data, param, true); ++ snprintf(elem.set_name, sizeof(elem.set_name)-1, output_set_name, param->tree_id); ++ if (nft_set_elem(NFPROTO_INET, mnl_params, &elem, param->lock) < 0) ++ return -1; ++ ++ return 0; ++} ++ ++/* IPv4-Mapped IPv6 Addresses */ ++static int ipv6_addr_mapped(uint32_t *addr) ++{ ++ return (addr[2] == htonl(0x0000ffff)); ++} ++ ++int 
nft_connection_switch(struct inet_sk_desc *sk, bool lock, pid_t tree_id) ++{ ++ char sip[INET_ADDR_LEN], dip[INET_ADDR_LEN]; ++ struct nf_conn_params param = { ++ .family = sk->sd.family, ++ .src_addr = sk->src_addr, ++ .src_port = sk->src_port, ++ .dst_addr = sk->dst_addr, ++ .dst_port = sk->dst_port, ++ .lock = lock, ++ .tree_id = tree_id, ++ }; ++ ++ if (param.family == AF_INET6 && ipv6_addr_mapped(param.dst_addr)) { ++ param.family = AF_INET; ++ param.src_addr = &param.src_addr[3]; ++ param.dst_addr = &param.dst_addr[3]; ++ } ++ ++ if (!inet_ntop(param.family, (void *)param.src_addr, sip, INET_ADDR_LEN) || ++ !inet_ntop(param.family, (void *)param.dst_addr, dip, INET_ADDR_LEN)) { ++ pr_perror("nf: Can't translate ip addr"); ++ return -1; ++ } ++ ++ pr_info("%s %s:%d - %s:%d connection\n", lock ? "Locked" : "Unlocked", ++ sip, (int)param.src_port, dip, (int)param.dst_port); ++ ++ return mnl_sendmsg(nf_connection_switch_raw, &param); ++} +-- +2.34.0 + diff --git a/backport-0034--nftables-implement-nft-api-for-lock-net-ns.patch b/backport-0034--nftables-implement-nft-api-for-lock-net-ns.patch new file mode 100644 index 0000000..7516276 --- /dev/null +++ b/backport-0034--nftables-implement-nft-api-for-lock-net-ns.patch @@ -0,0 +1,146 @@ +From 2b690ba323949ae37f186ea4f422154e56ab37d5 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Wed, 18 Aug 2021 10:59:41 +0800 +Subject: [PATCH 34/49] nftables: implement nft api for lock net ns + +Signed-off-by: fu.lin +--- + criu/include/nftables.h | 2 + + criu/nftables.c | 112 ++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 114 insertions(+) + +diff --git a/criu/include/nftables.h b/criu/include/nftables.h +index 3b51a3d..e462919 100644 +--- a/criu/include/nftables.h ++++ b/criu/include/nftables.h +@@ -162,5 +162,7 @@ struct nf_conn_params { + + struct inet_sk_desc; + int nft_connection_switch(struct inet_sk_desc *sk, bool lock, pid_t tree_id); ++int nft_lock(void); ++int nft_unlock(void); + + #endif /* __CR_NFTABLES_H__ */ +diff
--git a/criu/nftables.c b/criu/nftables.c +index 57774e6..817f157 100644 +--- a/criu/nftables.c ++++ b/criu/nftables.c +@@ -821,3 +821,115 @@ int nft_connection_switch(struct inet_sk_desc *sk, bool lock, pid_t tree_id) + + return mnl_sendmsg(nf_connection_switch_raw, &param); + } ++ ++static int nft_ns_rule_internal(uint8_t family, struct mnl_params *mnl_params, ++ struct nft_rule_params *params, bool create) ++{ ++ struct nftnl_rule *rule; ++ ++ rule = setup_rule(family, TABLE_NAME, params, create, true); ++ if (rule == NULL) ++ return -1; ++ ++ if (create) { ++ construct_rule_batch(mnl_params->batch, NFT_MSG_NEWRULE, family, ++ NLM_F_CREATE|NLM_F_EXCL|NLM_F_ACK, ++ mnl_params->seq++, rule); ++ } else { ++ construct_rule_batch(mnl_params->batch, NFT_MSG_DELRULE, family, ++ 0, mnl_params->seq++, rule); ++ } ++ ++ return 0; ++} ++ ++static int nft_ns_rule_raw(struct mnl_params *mnl_params, struct mnl_cb_params *args, ++ struct nft_rule_params *params) ++{ ++ params->chain_name = INPUT_CHAIN_NAME; ++ if (nft_ns_rule_internal(NFPROTO_INET, mnl_params, params, args->create) < 0) { ++ pr_err("%s: create nft input rule failed!\n", __func__); ++ return -1; ++ } ++ ++ params->chain_name = OUTPUT_CHAIN_NAME; ++ if (nft_ns_rule_internal(NFPROTO_INET, mnl_params, params, args->create) < 0) { ++ pr_err("%s: create nft output rule failed!\n", __func__); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static int nft_ns_rule(struct mnl_params *mnl_params, void *args) ++{ ++ struct nft_rule_params params = { 0 }; ++ ++ params.mark = 0; ++ params.mark_op = NFT_CMP_EQ; ++ params.stmt = NF_DROP; ++ if (nft_ns_rule_raw(mnl_params, args, &params) < 0) ++ return -1; ++ ++ params.mark = SOCCR_MARK; ++ ++ params.stmt = NF_ACCEPT; ++ if (nft_ns_rule_raw(mnl_params, args, &params) < 0) ++ return -1; ++ ++ return 0; ++} ++ ++static int nft_ns_rule_common(struct mnl_params *mnl_params, bool create) ++{ ++ struct mnl_cb_params params = { ++ .create = create, ++ }; ++ int result = 0; ++ ++ if (create && ++ 
(mnl_batch_send_and_recv(mnl_params, nft_ns_rule, &params, &result) < 0 ++ && (result != 0 && result != EEXIST))) { ++ pr_err("%s: crete ns rule failed!\n", __func__); ++ return -1; ++ } else if (!create && ++ (mnl_batch_send_and_recv(mnl_params, nft_ns_rule, &params, NULL) < 0)) { ++ pr_err("%s: delete ns rule failed!\n", __func__); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static int network_lock_internal(struct mnl_params *params, ++ batch_func_t _, void *args) ++{ ++ if (nft_table_prepare(params) < 0) ++ return -1; ++ ++ if (nft_chain_prepare(params) < 0) ++ return -1; ++ ++ if (nft_ns_rule_common(params, true) < 0) ++ return -1; ++ ++ return 0; ++} ++ ++int nft_lock(void) ++{ ++ return mnl_common(network_lock_internal, NULL, NULL); ++} ++ ++static int network_unlock_internal(struct mnl_params *params, batch_func_t _, ++ void *args) ++{ ++ if (nft_ns_rule_common(params, false) < 0) ++ return -1; ++ return 0; ++} ++ ++int nft_unlock(void) ++{ ++ return mnl_common(network_unlock_internal, NULL, NULL); ++} +-- +2.34.0 + diff --git a/backport-0035--criu-switch-to-nftables-api.patch b/backport-0035--criu-switch-to-nftables-api.patch new file mode 100644 index 0000000..b7f090d --- /dev/null +++ b/backport-0035--criu-switch-to-nftables-api.patch @@ -0,0 +1,391 @@ +From 1c52f9fdf0e04218df92d763b7b721dea031ead1 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Thu, 19 Aug 2021 16:29:49 +0800 +Subject: [PATCH 35/49] criu: switch to nftables api + +usage: criu --use-nft + +Signed-off-by: fu.lin +--- + criu/config.c | 1 + + criu/cr-dump.c | 4 ++-- + criu/cr-restore.c | 6 ++++-- + criu/crtools.c | 1 + + criu/include/cr_options.h | 1 + + criu/include/net.h | 8 ++++++-- + criu/include/netfilter.h | 7 +++++-- + criu/include/sk-inet.h | 2 +- + criu/net.c | 37 +++++++++++++++++++++++++++---------- + criu/netfilter.c | 14 +++++++++++--- + criu/nftables.c | 29 +++++++++++++++++++---------- + criu/sk-tcp.c | 14 +++++++------- + 12 files changed, 85 insertions(+), 39 deletions(-) + +diff 
--git a/criu/config.c b/criu/config.c +index 7397a3c..62359ac 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -530,6 +530,7 @@ int parse_options(int argc, char **argv, bool *usage_error, + BOOL_OPT("ignore-special-dump", &opts.ignore_special_dump), + BOOL_OPT("file-locks-repair", &opts.file_locks_repair), + {"reserve-ports", required_argument, 0, 'P' }, ++ BOOL_OPT("use-nft", &opts.use_nft), + { }, + }; + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index 26fcf7c..7e7c2b9 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -1766,7 +1766,7 @@ static int cr_dump_finish(int ret) + * start rollback procedure and cleanup everything. + */ + if (ret || post_dump_ret || opts.final_state == TASK_ALIVE) { +- network_unlock(); ++ network_unlock(opts.tree_id); + delete_link_remaps(); + clean_cr_time_mounts(); + } +@@ -1914,7 +1914,7 @@ int cr_dump_tasks(pid_t pid) + if (collect_pstree_ids()) + goto err; + +- if (network_lock()) ++ if (network_lock(opts.tree_id)) + goto err; + + if (collect_file_locks()) +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 00f16dd..99d82a8 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -2384,7 +2384,7 @@ skip_ns_bouncing: + goto out_kill; + + /* Unlock network before disabling repair mode on sockets */ +- network_unlock(); ++ network_unlock(vpid(init)); + network_status |= NETWORK_UNLOCK; + + /* +@@ -2449,6 +2449,8 @@ skip_ns_bouncing: + if (ret != 0) + pr_err("Post-resume script ret code %d\n", ret); + ++ network_delete_set(vpid(init)); ++ + if (!opts.restore_detach && !opts.exec_cmd) + wait(NULL); + +@@ -2594,7 +2596,7 @@ err: + pr_err("collect inet sk cinfo fail"); + } + if ((network_status & NETWORK_UNLOCK) == 0) +- network_unlock(); ++ network_unlock(vpid(root_item)); + } + + return ret; +diff --git a/criu/crtools.c b/criu/crtools.c +index d437f35..ec8e75d 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -405,6 +405,7 @@ usage: + " --ignore-special-dump Ignore special task tid page dump\n" + " 
--file-locks-repair Use repair mode to dump and restore file locks\n" + " --reserve-ports Reserve src ports in kernel\n" ++" --use-nft Use nft API instead of iptables cmd in network locking\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index dce5832..32108a1 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -157,6 +157,7 @@ struct cr_options { + char *share_dst_ports; + char *share_src_ports; + int reserve_ports; ++ int use_nft; + }; + + extern struct cr_options opts; +diff --git a/criu/include/net.h b/criu/include/net.h +index a318299..4f220d8 100644 +--- a/criu/include/net.h ++++ b/criu/include/net.h +@@ -30,8 +30,12 @@ struct veth_pair { + + extern int collect_net_namespaces(bool for_dump); + +-extern int network_lock(void); +-extern void network_unlock(void); ++extern int network_prepare(pid_t tree_id); ++extern void network_delete_rule(pid_t tree_id); ++extern void network_delete_set(pid_t tree_id); ++extern int network_lock(pid_t tree_id); ++extern void network_unlock(pid_t tree_id); ++ + extern int network_lock_internal(); + + extern struct ns_desc net_ns_desc; +diff --git a/criu/include/netfilter.h b/criu/include/netfilter.h +index 35ef262..c92762c 100644 +--- a/criu/include/netfilter.h ++++ b/criu/include/netfilter.h +@@ -1,9 +1,12 @@ + #ifndef __CR_NETFILTER_H__ + #define __CR_NETFILTER_H__ + ++#include ++#include ++ + struct inet_sk_desc; +-extern int nf_lock_connection(struct inet_sk_desc *); +-extern int nf_unlock_connection(struct inet_sk_desc *); ++extern int nf_lock_connection(struct inet_sk_desc *, pid_t, bool); ++extern int nf_unlock_connection(struct inet_sk_desc *, bool); + + struct inet_sk_info; + extern int nf_unlock_connection_info(struct inet_sk_info *); +diff --git a/criu/include/sk-inet.h b/criu/include/sk-inet.h +index 2ee46ea..68d6fcf 100644 +--- a/criu/include/sk-inet.h ++++ 
b/criu/include/sk-inet.h +@@ -81,7 +81,7 @@ static inline void tcp_repair_off(int fd) + + extern void tcp_locked_conn_add(struct inet_sk_info *); + extern void rst_unlock_tcp_connections(void); +-extern void cpt_unlock_tcp_connections(void); ++extern void cpt_unlock_tcp_connections(bool); + + extern void read_reserved_ports(char *path); + extern void write_reserved_ports(char *path); +diff --git a/criu/net.c b/criu/net.c +index 6ca5ef5..a8e8c24 100644 +--- a/criu/net.c ++++ b/criu/net.c +@@ -41,6 +41,7 @@ + #include "util.h" + #include "external.h" + #include "fdstore.h" ++#include "nftables.h" + + #include "protobuf.h" + #include "images/netdev.pb-c.h" +@@ -2605,9 +2606,13 @@ int network_lock_internal() + return -1; + + +- ret |= iptables_restore(false, conf, sizeof(conf) - 1); +- if (kdat.ipv6) +- ret |= iptables_restore(true, conf, sizeof(conf) - 1); ++ if (opts.use_nft) ++ ret = nft_lock(); ++ else { ++ ret |= iptables_restore(false, conf, sizeof(conf) - 1); ++ if (kdat.ipv6) ++ ret |= iptables_restore(true, conf, sizeof(conf) - 1); ++ } + + if (ret) + pr_err("Locking network failed: iptables-restore returned %d. 
" +@@ -2634,9 +2639,13 @@ static int network_unlock_internal() + if (switch_ns(root_item->pid->real, &net_ns_desc, &nsret)) + return -1; + +- ret |= iptables_restore(false, conf, sizeof(conf) - 1); +- if (kdat.ipv6) +- ret |= iptables_restore(true, conf, sizeof(conf) - 1); ++ if (opts.use_nft) ++ ret = nft_unlock(); ++ else { ++ ret |= iptables_restore(false, conf, sizeof(conf) - 1); ++ if (kdat.ipv6) ++ ret |= iptables_restore(true, conf, sizeof(conf) - 1); ++ } + + if (restore_ns(nsret, &net_ns_desc)) + ret = -1; +@@ -2644,10 +2653,13 @@ static int network_unlock_internal() + return ret; + } + +-int network_lock(void) ++int network_lock(pid_t tree_id) + { + pr_info("Lock network\n"); + ++ if (opts.use_nft && opts.tcp_established_ok && network_prepare(tree_id) < 0) ++ return -1; ++ + /* Each connection will be locked on dump */ + if (!(root_ns_mask & CLONE_NEWNET)) + return 0; +@@ -2658,7 +2670,7 @@ int network_lock(void) + return network_lock_internal(); + } + +-void network_unlock(void) ++void network_unlock(pid_t tree_id) + { + pr_info("Unlock network\n"); + +@@ -2667,8 +2679,13 @@ void network_unlock(void) + write_reserved_ports(RESERVED_PORTS_PATH); + } + +- cpt_unlock_tcp_connections(); +- rst_unlock_tcp_connections(); ++ if (opts.use_nft && opts.tcp_established_ok) ++ network_delete_rule(tree_id); ++ ++ cpt_unlock_tcp_connections(opts.use_nft); ++ ++ if (!opts.use_nft) ++ rst_unlock_tcp_connections(); + + if (root_ns_mask & CLONE_NEWNET) { + run_scripts(ACT_NET_UNLOCK); +diff --git a/criu/netfilter.c b/criu/netfilter.c +index 368651c..b2ec7ed 100644 +--- a/criu/netfilter.c ++++ b/criu/netfilter.c +@@ -15,6 +15,8 @@ + #include "sk-inet.h" + #include "kerndat.h" + ++#include "nftables.h" ++ + static char buf[512]; + + /* +@@ -129,13 +131,19 @@ static int nf_connection_switch(struct inet_sk_desc *sk, bool lock) + return ret; + } + +-int nf_lock_connection(struct inet_sk_desc *sk) ++int nf_lock_connection(struct inet_sk_desc *sk, pid_t tree_id, bool use_nft) + 
{ +- return nf_connection_switch(sk, true); ++ if (use_nft) ++ return nft_connection_switch(sk, true, tree_id); ++ else ++ return nf_connection_switch(sk, true); + } + +-int nf_unlock_connection(struct inet_sk_desc *sk) ++int nf_unlock_connection(struct inet_sk_desc *sk, bool use_nft) + { ++ if (use_nft) ++ return 0; ++ + return nf_connection_switch(sk, false); + } + +diff --git a/criu/nftables.c b/criu/nftables.c +index 817f157..739aee4 100644 +--- a/criu/nftables.c ++++ b/criu/nftables.c +@@ -653,25 +653,34 @@ int network_prepare(pid_t tree_id) + return mnl_common(network_prepare_internal, NULL, &tree_id); + } + +-static int network_unprepare_internal(struct mnl_params *params, +- batch_func_t _, void *args) ++static int network_delete_rule_internal(struct mnl_params *params, ++ batch_func_t _, void *args) + { + pid_t tree_id = *(pid_t *)args; + +- if (nft_rule_common(params, tree_id, false) < 0) +- return -1; ++ return nft_rule_common(params, tree_id, false); ++} + +- if (nft_set_common(params, tree_id, false) < 0) +- return -1; ++void network_delete_rule(pid_t tree_id) ++{ ++ pr_info("unlock network\n"); + +- return 0; ++ mnl_common(network_delete_rule_internal, NULL, &tree_id); ++} ++ ++static int network_delete_set_internal(struct mnl_params *params, ++ batch_func_t _, void *args) ++{ ++ pid_t tree_id = *(pid_t *)args; ++ ++ return nft_set_common(params, tree_id, false); + } + +-void network_unprepare(pid_t tree_id) ++void network_delete_set(pid_t tree_id) + { +- pr_info("Unprepare network\n"); ++ pr_info("clear nft set\n"); + +- mnl_common(network_unprepare_internal, NULL, &tree_id); ++ mnl_common(network_delete_set_internal, NULL, &tree_id); + } + + static int add_set_elem_internal(struct nftnl_set *s, void *data, size_t len) +diff --git a/criu/sk-tcp.c b/criu/sk-tcp.c +index c58e3f6..ae71e4d 100644 +--- a/criu/sk-tcp.c ++++ b/criu/sk-tcp.c +@@ -55,7 +55,7 @@ static int tcp_repair_established(int fd, struct inet_sk_desc *sk) + } + + if (!(root_ns_mask & 
CLONE_NEWNET)) { +- ret = nf_lock_connection(sk); ++ ret = nf_lock_connection(sk, opts.tree_id, opts.use_nft); + if (ret < 0) + goto err2; + } +@@ -70,21 +70,21 @@ static int tcp_repair_established(int fd, struct inet_sk_desc *sk) + + err3: + if (!(root_ns_mask & CLONE_NEWNET)) +- nf_unlock_connection(sk); ++ nf_unlock_connection(sk, opts.use_nft); + err2: + close(sk->rfd); + err1: + return -1; + } + +-static void tcp_unlock_one(struct inet_sk_desc *sk) ++static void tcp_unlock_one(struct inet_sk_desc *sk, bool use_nft) + { + int ret; + + list_del(&sk->rlist); + +- if (!(root_ns_mask & CLONE_NEWNET)) { +- ret = nf_unlock_connection(sk); ++ if (!(root_ns_mask & CLONE_NEWNET) && !use_nft) { ++ ret = nf_unlock_connection(sk, false); + if (ret < 0) + pr_perror("Failed to unlock TCP connection"); + } +@@ -101,12 +101,12 @@ static void tcp_unlock_one(struct inet_sk_desc *sk) + close(sk->rfd); + } + +-void cpt_unlock_tcp_connections(void) ++void cpt_unlock_tcp_connections(bool use_nft) + { + struct inet_sk_desc *sk, *n; + + list_for_each_entry_safe(sk, n, &cpt_tcp_repair_sockets, rlist) +- tcp_unlock_one(sk); ++ tcp_unlock_one(sk, use_nft); + } + + static int dump_tcp_conn_state(struct inet_sk_desc *sk) +-- +2.34.0 + diff --git a/backport-0036--remove-sigaction-handler-register-in-restorer.patch b/backport-0036--remove-sigaction-handler-register-in-restorer.patch new file mode 100644 index 0000000..43bc120 --- /dev/null +++ b/backport-0036--remove-sigaction-handler-register-in-restorer.patch @@ -0,0 +1,51 @@ +From ad18bcf6d3c61b67d5adb0e089223016c7b515b2 Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Tue, 24 Aug 2021 12:26:35 +0000 +Subject: [PATCH 36/49] remove sigaction handler register in restorer + +The sigaction handler register in restorer will change +the original sigaction handler of restoring app, +We need to remove them or recover them before resuming app. 
+ +Signed-off-by: Jingxian He +--- + criu/pie/restorer.c | 20 -------------------- + 1 file changed, 20 deletions(-) + +diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c +index 5548076..9a9cb14 100644 +--- a/criu/pie/restorer.c ++++ b/criu/pie/restorer.c +@@ -1630,29 +1630,9 @@ long __export_restore_task(struct task_restore_args *args) + pr_err("Failed to set SIGCHLD %ld\n", ret); + goto core_restore_end; + } +- ret = sys_sigaction(SIGSEGV, &act, NULL, sizeof(k_rtsigset_t)); +- if (ret) { +- pr_err("Failed to set SIGCHLD %ld\n", ret); +- goto core_restore_end; +- } +- +- ret = sys_sigaction(SIGBUS, &act, NULL, sizeof(k_rtsigset_t)); +- if (ret) { +- pr_err("Failed to set SIGCHLD %ld\n", ret); +- goto core_restore_end; +- } +- +- ret = sys_sigaction(SIGILL, &act, NULL, sizeof(k_rtsigset_t)); +- if (ret) { +- pr_err("Failed to set SIGCHLD %ld\n", ret); +- goto core_restore_end; +- } + + ksigemptyset(&to_block); + ksigaddset(&to_block, SIGCHLD); +- ksigaddset(&to_block, SIGSEGV); +- ksigaddset(&to_block, SIGBUS); +- ksigaddset(&to_block, SIGILL); + ret = sys_sigprocmask(SIG_UNBLOCK, &to_block, NULL, sizeof(k_rtsigset_t)); + if (ret) { + pr_err("Failed to unblock SIGCHLD %ld\n", ret); +-- +2.34.0 + diff --git a/backport-0037--remove-ignore_special_dump-option.patch b/backport-0037--remove-ignore_special_dump-option.patch new file mode 100644 index 0000000..2ae0ba1 --- /dev/null +++ b/backport-0037--remove-ignore_special_dump-option.patch @@ -0,0 +1,72 @@ +From 15e3ba4e353613ffc8cd55c607c88faddef961b3 Mon Sep 17 00:00:00 2001 +From: root +Date: Wed, 8 Sep 2021 03:45:59 +0000 +Subject: [PATCH 37/49] remove ignore_special_dump option + +Remove the useless ignore_special_dump option. 
+ +Signed-off-by: Jingxian He +--- + criu/config.c | 1 - + criu/cr-dump.c | 2 +- + criu/include/cr_options.h | 1 - + criu/seize.c | 6 +++--- + 4 files changed, 4 insertions(+), 6 deletions(-) + +diff --git a/criu/config.c b/criu/config.c +index 62359ac..a822ef2 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -527,7 +527,6 @@ int parse_options(int argc, char **argv, bool *usage_error, + BOOL_OPT("dump-char-dev", &opts.dump_char_dev), + BOOL_OPT("mask-exit-notify", &opts.mask_exit_notify), + BOOL_OPT("weak-file-check", &opts.weak_file_check), +- BOOL_OPT("ignore-special-dump", &opts.ignore_special_dump), + BOOL_OPT("file-locks-repair", &opts.file_locks_repair), + {"reserve-ports", required_argument, 0, 'P' }, + BOOL_OPT("use-nft", &opts.use_nft), +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index 7e7c2b9..35e80f5 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -1789,7 +1789,7 @@ static int cr_dump_finish(int ret) + + close_service_fd(CR_PROC_FD_OFF); + +- if (ret == 0 && opts.pin_memory && !opts.ignore_special_dump) { ++ if (ret == 0 && opts.pin_memory) { + pr_info("start restore_task_special_pages\n"); + restore_task_special_pages(0); + } +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 32108a1..672070e 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -152,7 +152,6 @@ struct cr_options { + int dump_char_dev; + int mask_exit_notify; + int weak_file_check; +- int ignore_special_dump; + int file_locks_repair; + char *share_dst_ports; + char *share_src_ports; +diff --git a/criu/seize.c b/criu/seize.c +index 73baf40..c9fdd41 100644 +--- a/criu/seize.c ++++ b/criu/seize.c +@@ -557,9 +557,9 @@ static void unseize_task_and_threads(const struct pstree_item *item, int st) + if (item->pid->state == TASK_DEAD) + return; + +- if (opts.pin_memory && !opts.ignore_special_dump) { +- for (i = 0; i < item->nr_threads; i++) +- dump_task_special_pages(item->threads[i].real); ++ if (opts.pin_memory) { ++ for (i = 0; i < 
item->nr_threads; i++) ++ dump_task_special_pages(item->threads[i].real); + } + if (opts.mask_exit_notify) { + ret = mask_task_exit_notify(item->threads[0].real, true); +-- +2.34.0 + diff --git a/backport-0038--add-clear-pin-mem-and-init-page-map-option.patch b/backport-0038--add-clear-pin-mem-and-init-page-map-option.patch new file mode 100644 index 0000000..664aeec --- /dev/null +++ b/backport-0038--add-clear-pin-mem-and-init-page-map-option.patch @@ -0,0 +1,97 @@ +From c51b1d35a65cf3ce3837a91f33abc20f1244933a Mon Sep 17 00:00:00 2001 +From: root +Date: Wed, 8 Sep 2021 08:23:11 +0000 +Subject: [PATCH 38/49] add clear pin mem and init page map option + +Add the 'clear-pin-memory' command for clearing pin memory data, +and the 'init-pagemap-read' command for initializing the buffer for +reading page map info. + +Signed-off-by: Jingxian He +--- + criu/crtools.c | 28 ++++++++++++++++++++++++++++ + criu/include/restorer.h | 4 ++++ + 2 files changed, 32 insertions(+) + +diff --git a/criu/crtools.c b/criu/crtools.c +index ec8e75d..5ae7ca0 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -46,6 +46,26 @@ + + #include "setproctitle.h" + #include "sysctl.h" ++#include "restorer.h" ++ ++int init_pagemap_read(int para) ++{ ++ int fd, ret; ++ ++ fd = open(PIN_MEM_FILE, O_RDWR, 0); ++ if (fd < 0) { ++ pr_warn("error open file: %s\n", PIN_MEM_FILE); ++ return -1; ++ } ++ ++ ret = ioctl(fd, INIT_PAGEMAP_READ, (unsigned long) &para); ++ if (ret < 0) { ++ pr_warn("Init pagemap read fail, errno: %s\n", strerror(errno)); ++ } ++ ++ close(fd); ++ return ret; ++} + + int main(int argc, char *argv[], char *envp[]) + { +@@ -146,6 +166,12 @@ int main(int argc, char *argv[], char *envp[]) + } + } + ++ if (!strcmp(argv[optind], "clear-pin-memory")) ++ return clear_pin_mem(0); ++ ++ if (!strcmp(argv[optind], "init-pagemap-read")) ++ return init_pagemap_read(0); ++ + /* We must not open imgs dir, if service is called */ + if (strcmp(argv[optind], "service")) { + ret = open_image_dir(opts.imgs_dir); +@@ 
-281,6 +307,8 @@ usage: + " dedup remove duplicates in memory dump\n" + " cpuinfo dump writes cpu information into image file\n" + " cpuinfo check validates cpu information read from image file\n" ++" clear-pin-memory clear pin memory manage data\n" ++" init-pagemap-read init data buffer for reading page map info\n" + ); + + if (usage_error) { +diff --git a/criu/include/restorer.h b/criu/include/restorer.h +index b8f74e9..31fd683 100644 +--- a/criu/include/restorer.h ++++ b/criu/include/restorer.h +@@ -334,12 +334,14 @@ enum { + #define _SET_PIN_MEM_AREA 1 + #define _CLEAR_PIN_MEM_AREA 2 + #define _REMAP_PIN_MEM_AREA 3 ++#define _INIT_PAGEMAP_READ 5 + #define _DUMP_SEPCIAL_PAGES 6 + #define _RETORE_SEPCIAL_PAGES 7 + #define _SET_FORK_PID 8 + #define SET_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _SET_PIN_MEM_AREA, struct pin_mem_area_set) + #define CLEAR_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _CLEAR_PIN_MEM_AREA, int) + #define REMAP_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _REMAP_PIN_MEM_AREA, int) ++#define INIT_PAGEMAP_READ _IOW(PIN_MEM_MAGIC, _INIT_PAGEMAP_READ, int) + #define DUMP_SEPCIAL_PAGES _IOW(PIN_MEM_MAGIC, _DUMP_SEPCIAL_PAGES, int) + #define RETORE_SEPCIAL_PAGES _IOW(PIN_MEM_MAGIC, _RETORE_SEPCIAL_PAGES, int) + #define SET_FORK_PID _IOW(PIN_MEM_MAGIC, _SET_FORK_PID, int) +@@ -358,4 +360,6 @@ struct pin_mem_area_set { + struct pin_mem_area mem_area[MAX_PIN_MEM_AREA_NUM]; + }; + ++int clear_pin_mem(int pid); ++ + #endif /* __CR_RESTORER_H__ */ +-- +2.34.0 + diff --git a/backport-0039--mmap-restore-dev-hisi_sec2-deivce-vma.patch b/backport-0039--mmap-restore-dev-hisi_sec2-deivce-vma.patch new file mode 100644 index 0000000..04f0610 --- /dev/null +++ b/backport-0039--mmap-restore-dev-hisi_sec2-deivce-vma.patch @@ -0,0 +1,492 @@ +From b95670afafc92b9a826ecaae1f0f12ac694b5514 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Fri, 10 Sep 2021 16:06:55 +0800 +Subject: [PATCH 39/49] mmap: restore /dev/hisi_sec2* deivce vma + +There are two kinds of vmas: anonymous vma and file-based 
vma. For +anonymous vma, criu just maps the area and fills content into it; for file-based +vma, criu preprocesses it, such as setting the `open_vm()` callback function. + +The `/dev/hisi_sec2*` char device is different from normal ones. The `open`, +`mmap`, and `close` syscalls each have a special meaning: + - `open`: allocate physical resource of the device + - `mmap`: create instance + - `close`: release physical resource +A vma represents an instance in this device. One fd may be associated with +a group of instances: one mmio (vma size is 2 pages, pgoff is 0), one dus +(vma size is 37 pages, pgoff is 0x2000). As for the dus vma, it's split into two +vmas by `mprotect(addr, 0x5000, PROT_READ)`: one of size 0x20000, one +of size 0x5000. + +This patch makes restoring /dev/hisi_sec* possible. Idea: + It's impossible for criu to know the relationship between a vma and the +mapped file fd. Therefore, just count the total number of fds while +collecting /dev/hisi_sec* files, then tag each fd with which +function it is used for during vma restoration, and assign an unused fd to the +specific vma. During the `mmap()` process, the dus vma is split by `mprotect`. + +Note: +- criu uses ino to index the fd. +- the driver for this physical device is hisi_sec2.ko, which is located in + `drivers/crypto/hisilicon/sec2/` of the linux kernel. +- the device name has the prefix "hisi_sec2", which comes from + `drivers/crypto/hisilicon/sec2/sec_main.c`. 
+ +Signed-off-by: fu.lin +--- + criu/files-reg.c | 113 ++++++++++++++++++++++++++++++++++ + criu/files.c | 17 ++++-- + criu/include/files-reg.h | 8 +++ + criu/include/util.h | 8 +++ + criu/include/vma.h | 12 ++++ + criu/pie/restorer.c | 129 ++++++++++++++++++++++++++++++++++++++- + criu/proc_parse.c | 19 +++--- + 7 files changed, 292 insertions(+), 14 deletions(-) + +diff --git a/criu/files-reg.c b/criu/files-reg.c +index 54fa388..345a72e 100644 +--- a/criu/files-reg.c ++++ b/criu/files-reg.c +@@ -2342,6 +2342,109 @@ static int open_filemap(int pid, struct vma_area *vma) + return 0; + } + ++#define MAX_HISI_SEC_SIZE 3 /* one physical device expose three char dev */ ++static struct hlist_head hisi_sec_fds_hash[MAX_HISI_SEC_SIZE]; ++ ++static int collect_hisi_sec_fds(struct list_head *list) ++{ ++ struct fdinfo_list_entry *fle, *tmp; ++ struct chrfile_info *ci; ++ struct file_desc *d; ++ struct hisi_sec_desc *desc; ++ int idx; ++ int nr = 0; ++ ++ for (idx = 0; idx < MAX_HISI_SEC_SIZE; idx++) ++ INIT_HLIST_HEAD(&hisi_sec_fds_hash[idx]); ++ ++ list_for_each_entry_safe(fle, tmp, list, ps_list) { ++ d = fle->desc; ++ ++ if (d->ops->type != FD_TYPES__CHR) ++ continue; ++ ++ ci = container_of(d, struct chrfile_info, d); ++ ++ if (strstr(ci->path, HISI_SEC_DEV) != NULL) { ++ desc = shmalloc(sizeof(*desc)); ++ if (desc == NULL) ++ return -ENOMEM; ++ ++ desc->name = ci->path; ++ desc->fd = fle->fe->fd; ++ desc->mmio = desc->dus = 0; ++ ++ idx = (ci->path[strlen(ci->path)-1] - '0') % MAX_HISI_SEC_SIZE; ++ hlist_add_head(&desc->hash, &hisi_sec_fds_hash[idx]); ++ ++ nr += 1; ++ } ++ } ++ ++ return nr; ++} ++ ++static long delivery_hisi_sec_fd(struct list_head *fds, struct vma_area *vma) ++{ ++ extern unsigned hisi_sec_fds_n; /* defined in criu/files.c */ ++ static bool initialized = false; ++ struct hisi_sec_desc *desc; ++ int fd = -1, idx; ++ ++ if (!initialized) { ++ int nr; ++ ++ pr_info("find %d fds for hisi_sec char device\n", hisi_sec_fds_n); ++ ++ nr = 
collect_hisi_sec_fds(fds); ++ if (nr != hisi_sec_fds_n) { ++ pr_err("Collected fds(%d) aren't equal opened(%d)\n", ++ nr, hisi_sec_fds_n); ++ return -1; ++ } ++ ++ initialized = true; ++ } else if (vma->e->pgoff != HISI_SEC_MMIO && vma->e->pgoff != HISI_SEC_DUS) { ++ /* It's impossible value for fd, just as a tag to show it's a ++ * vma by `mprotect` syscall. ++ */ ++ return LONG_MAX; ++ } ++ ++ idx = (vma->e->name[strlen(vma->e->name)-1] - '0') % MAX_HISI_SEC_SIZE; ++ hlist_for_each_entry(desc, &hisi_sec_fds_hash[idx], hash) { ++ if (strcmp(desc->name, vma->e->name) != 0) ++ continue; ++ ++ if (vma->e->pgoff == HISI_SEC_MMIO && !desc->mmio) { ++ fd = desc->fd; ++ desc->mmio = true; ++ break; ++ } else if (vma->e->pgoff == HISI_SEC_DUS && !desc->dus) { ++ fd = desc->fd; ++ desc->dus = true; ++ break; ++ } ++ } ++ ++ return fd; ++} ++ ++static int handle_hisi_vma(struct list_head *fds, struct vma_area *vma) ++{ ++ long fd = delivery_hisi_sec_fd(fds, vma); ++ ++ if (fd < 0) { ++ pr_err("find fd for char dev vma pgoff %lx named %s failed.\n", ++ vma->e->pgoff, vma->e->name); ++ return -1; ++ } ++ ++ vma->e->fd = fd; ++ ++ return 0; ++} ++ + int collect_chr_map(struct pstree_item *me, struct vma_area *vma) + { + struct list_head *list = &rsti(me)->fds; +@@ -2349,6 +2452,13 @@ int collect_chr_map(struct pstree_item *me, struct vma_area *vma) + struct chrfile_info *ci; + bool exist_fd; + ++ if (strstr(vma->e->name, HISI_SEC_DEV) != NULL) { ++ if (handle_hisi_vma(list, vma) != 0) { ++ return -1; ++ } else ++ goto out; ++ } ++ + list_for_each_entry_safe(fle, tmp, list, ps_list) { + struct file_desc *d = fle->desc; + +@@ -2367,6 +2477,9 @@ int collect_chr_map(struct pstree_item *me, struct vma_area *vma) + if (!exist_fd) + return -EEXIST; + ++out: ++ pr_info(" `- find fd %ld for dev %s at this vma\n", vma->e->fd, vma->e->name); ++ + return 0; + } + +diff --git a/criu/files.c b/criu/files.c +index c9e5d8b..14085a7 100644 +--- a/criu/files.c ++++ b/criu/files.c +@@ -61,6 
+61,8 @@ static LIST_HEAD(fake_master_head); + + static u32 max_file_desc_id = 0; + ++unsigned hisi_sec_fds_n; ++ + static void init_fdesc_hash(void) + { + int i; +@@ -1827,11 +1829,14 @@ out: + static int chrfile_open(struct file_desc *d, int *new_fd) + { + int fd, mntns_root; +- int ret = 0; ++ int ret = -1; + struct chrfile_info *ci; + + ci = container_of(d, struct chrfile_info, d); + ++ pr_info("charfile: Opening %s (repair %d index %d)\n", ++ ci->path, ci->cfe->repair, ci->cfe->index); ++ + mntns_root = open_pid_proc(getpid()); + fd = openat(mntns_root, ci->path, ci->cfe->flags | O_REPAIR); + if (fd < 0){ +@@ -1847,6 +1852,8 @@ static int chrfile_open(struct file_desc *d, int *new_fd) + } + + *new_fd = fd; ++ ret = 0; ++ + return ret; + err: + close(fd); +@@ -1869,10 +1876,12 @@ static int collect_one_chrfile(void *o, ProtobufCMessage *base, struct cr_img *i + else + ci->path = ci->cfe->name; + +- pr_info("Collected chr file: %#x, name: %s\n", ci->cfe->id, ci->path); +- file_desc_add(&ci->d, ci->cfe->id, &chrfile_desc_ops); ++ /* collect `/dev/hisi_sec2*` fds */ ++ if (strstr(ci->path, HISI_SEC_DEV) != NULL) ++ hisi_sec_fds_n += 1; + +- return 0; ++ pr_info("Collected chr file: %#x, name: %s\n", ci->cfe->id, ci->path); ++ return file_desc_add(&ci->d, ci->cfe->id, &chrfile_desc_ops); + } + + struct collect_image_info chrfile_cinfo = { +diff --git a/criu/include/files-reg.h b/criu/include/files-reg.h +index 3e8b93b..0458463 100644 +--- a/criu/include/files-reg.h ++++ b/criu/include/files-reg.h +@@ -33,6 +33,14 @@ struct chrfile_info { + char *path; + }; + ++struct hisi_sec_desc { ++ struct hlist_node hash; ++ char *name; ++ bool mmio; ++ bool dus; ++ int fd; ++}; ++ + extern int open_reg_by_id(u32 id); + extern int open_reg_fd(struct file_desc *); + extern int open_path(struct file_desc *, int (*open_cb)(int ns_root_fd, +diff --git a/criu/include/util.h b/criu/include/util.h +index 6b652b0..bbc19fc 100644 +--- a/criu/include/util.h ++++ b/criu/include/util.h +@@ 
-432,4 +432,12 @@ int mask_task_exit_notify(int pid, bool mask); + + #define RESERVED_PORTS_PATH "/proc/sys/net/ipv4/ip_local_reserved_ports" + ++#define HISI_SEC_DEV "hisi_sec2" /* `/dev/hisi_sec2*` char device */ ++ ++/* here is the selection of offset in `mmap`, they're from drivers */ ++enum hisi_sec_dev { ++ HISI_SEC_MMIO = 0x0, ++ HISI_SEC_DUS = 0x2000, ++}; ++ + #endif /* __CR_UTIL_H__ */ +diff --git a/criu/include/vma.h b/criu/include/vma.h +index 5e3f352..f649a95 100644 +--- a/criu/include/vma.h ++++ b/criu/include/vma.h +@@ -133,4 +133,16 @@ static inline bool vma_entry_can_be_lazy(VmaEntry *e) + !(vma_entry_is(e, VMA_AREA_VSYSCALL))); + } + ++struct vma_attr { ++ int prot; ++ int flags; ++}; ++ ++enum ALIEN_MAP_METHOD { ++ PGOFF_IS_ZERO, ++ MAP_THEN_PROTECT, ++ ++ MAX_ALIEN_MAP_METHOD, ++}; ++ + #endif /* __CR_VMA_H__ */ +diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c +index 9a9cb14..4cc4c31 100644 +--- a/criu/pie/restorer.c ++++ b/criu/pie/restorer.c +@@ -901,6 +901,129 @@ static unsigned long restore_mapping(VmaEntry *vma_entry) + return addr; + } + ++static unsigned long restore_map_then_protect_mapping(VmaEntry *curr, ++ struct vma_attr *curr_attr, ++ VmaEntry *next, ++ struct vma_attr *next_attr) ++{ ++ int retval; ++ unsigned long addr; ++ ++ if (next->fd != LONG_MAX ++ || curr->end != next->start ++ || (vma_entry_len(curr) + curr->pgoff) != next->pgoff ++ || curr->prot == next->prot ++ || curr->flags != next->flags) { ++ pr_err("They looks not currect:\n"); ++ pr_err(" `- vma A: (%x %x %d %lx)\n", ++ curr_attr->prot, curr_attr->flags, ++ (int)curr->fd, curr->pgoff); ++ pr_err(" `- vma B: (%x %x %d %lx)\n", ++ next_attr->prot, next_attr->flags, ++ (int)next->fd, next->pgoff); ++ return -1; ++ } ++ ++ pr_info("\tmmap(%x %x %d %lx) in map then protect mapping\n", ++ curr_attr->prot, curr_attr->flags, ++ (int)curr->fd, curr->pgoff); ++ ++ addr = sys_mmap(decode_pointer(curr->start), ++ vma_entry_len(curr) + vma_entry_len(next), ++ 
curr_attr->prot, curr_attr->flags, curr->fd, curr->pgoff); ++ if (addr != curr->start) { ++ pr_err("%s: mmap failed with code %ld\n", __func__, addr); ++ goto out; ++ } ++ ++ pr_info("\t mprotect(%x)\n", next_attr->prot); ++ retval = sys_mprotect(decode_pointer(next->start), ++ vma_entry_len(next), next_attr->prot); ++ if (retval != 0) { ++ addr = retval; ++ pr_err("%s: mprotect failed with code %d\n", __func__, retval); ++ } ++ ++out: ++ return addr; ++} ++ ++static unsigned long restore_pgoff_is_zero_mapping(VmaEntry *curr, struct vma_attr *attr) ++{ ++ unsigned long addr; ++ ++ pr_debug("\tmmap(%x %x %d %lx) in pgoff is zero mapping\n", ++ attr->prot, attr->flags, (int)curr->fd, curr->pgoff); ++ ++ addr = sys_mmap(decode_pointer(curr->start), ++ vma_entry_len(curr), ++ attr->prot, attr->flags, ++ curr->fd, curr->pgoff); ++ ++ return addr; ++} ++ ++static unsigned long restore_hisi_sec_mapping(struct task_restore_args *args, ++ int i, int *step) ++{ ++ VmaEntry *curr = args->vmas + i; ++ VmaEntry *next = args->vmas + i + 1; ++ struct vma_attr curr_attr = { ++ .prot = curr->prot, ++ .flags = curr->flags | MAP_FIXED, ++ }; ++ struct vma_attr next_attr = { ++ .prot = next->prot, ++ .flags = next->flags | MAP_FIXED, ++ }; ++ unsigned long addr; ++ ++ switch (curr->pgoff) { ++ case HISI_SEC_MMIO: ++ addr = restore_pgoff_is_zero_mapping(curr, &curr_attr); ++ break; ++ case HISI_SEC_DUS: ++ *step = 2; ++ addr = restore_map_then_protect_mapping(curr, &curr_attr, next, &next_attr); ++ break; ++ default: ++ pr_err("invalid pgoff %lx for vma\n", curr->pgoff); ++ return -1; ++ } ++ return addr; ++} ++ ++static bool find(const char *s1, const char *s2) ++{ ++ const char *p, *q; /* cursors for a contiguous match of s2 inside s1 */ ++ ++ if (s1 == NULL || s2 == NULL) ++ return false; ++ ++ for (; *s1 != '\0'; s1++) { /* try every start position in s1 */ ++ p = s1; ++ q = s2; ++ while (*q != '\0' && *p == *q) /* stops at end of s1 too: '\0' != *q */ ++ p++, q++; ++ if (*q == '\0') /* all of s2 consumed: substring found */ ++ return true; ++ } ++ ++ return false; ++} ++ ++static unsigned long distribute_restore_mapping(struct task_restore_args *args, ++ int i, int
*step) ++{ ++ VmaEntry *vma = args->vmas + i; ++ struct vma_names *vma_name = args->vma_names + i; ++ ++ if (vma_entry_is(vma, VMA_AREA_CHR) && find(vma_name->name, HISI_SEC_DEV)) ++ return restore_hisi_sec_mapping(args, i, step); ++ else ++ return restore_mapping(vma); ++} ++ + /* + * This restores aio ring header, content, head and in-kernel position + * of tail. To set tail, we write to /dev/null and use the fact this +@@ -1586,7 +1709,7 @@ int write_fork_pid(int pid) + long __export_restore_task(struct task_restore_args *args) + { + long ret = -1; +- int i; ++ int i, step; + VmaEntry *vma_entry; + unsigned long va; + struct restore_vma_io *rio; +@@ -1736,7 +1859,7 @@ long __export_restore_task(struct task_restore_args *args) + /* + * OK, lets try to map new one. + */ +- for (i = 0; i < args->vmas_n; i++) { ++ for (i = 0, step = 1; i < args->vmas_n; i += step, step = 1) { + vma_entry = args->vmas + i; + vma_name = args->vma_names + i; + +@@ -1754,7 +1877,7 @@ long __export_restore_task(struct task_restore_args *args) + if (vma_entry_is(vma_entry, VMA_PREMMAPED)) + continue; + +- va = restore_mapping(vma_entry); ++ va = distribute_restore_mapping(args, i, &step); + + if (va != vma_entry->start) { + pr_err("Can't restore %"PRIx64" mapping with %lx\n", vma_entry->start, va); +diff --git a/criu/proc_parse.c b/criu/proc_parse.c +index d04a8ff..17d5cbf 100644 +--- a/criu/proc_parse.c ++++ b/criu/proc_parse.c +@@ -630,17 +630,22 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, + /* regular file mapping -- supported */; + else if (S_ISCHR(st_buf->st_mode)) { + /* devzero mapping -- also makes sense */; +- if (opts.dump_char_dev && (strstr(file_path, "uverbs") != NULL)) { +- int len = strlen(file_path) + 1; +- vma_area->e->status |= VMA_AREA_CHR; +- vma_area->e->name = xmalloc(len); +- if (!vma_area->e->name) { ++ ++ if (!opts.dump_char_dev) { ++ /* do nothing, it's original progoss */ ++ } else if (strstr(file_path, "uverbs") != NULL ++ || 
strstr(file_path, HISI_SEC_DEV) != NULL) { ++ int len = strlen(file_path) + 1; ++ ++ vma_area->e->status |= VMA_AREA_CHR; ++ vma_area->e->name = xmalloc(len); ++ if (!vma_area->e->name) { + pr_err("alloc vma area name fail\n"); + goto err; + } + strncpy(vma_area->e->name, file_path, len); +- pr_info("uverbs name content is: %s\n", vma_area->e->name); +- } ++ pr_info("vma name content is: %s\n", vma_area->e->name); ++ } + } else { + pr_err("Can't handle non-regular mapping on %d's map %"PRIx64"\n", pid, vma_area->e->start); + goto err; +-- +2.34.0 + diff --git a/backport-0040--fix-fds-list-restore-and-rollback-problem.patch b/backport-0040--fix-fds-list-restore-and-rollback-problem.patch new file mode 100644 index 0000000..29206dc --- /dev/null +++ b/backport-0040--fix-fds-list-restore-and-rollback-problem.patch @@ -0,0 +1,55 @@ +From 8921694f991d8a2881462c873844f1baae14d86f Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Thu, 16 Sep 2021 13:50:46 +0000 +Subject: [PATCH 40/49] fix fds list restore and rollback problem + +When there exist multi processes need to dump, the child process may +have the same fds as parent process. During the restore processing, +criu choose the process which has the min pid value to be the master +process to recover fds. However, choosing the parent process as the +master process is more suitable. 
+ +Signed-off-by: Jingxian He +--- + criu/cr-restore.c | 5 ++--- + criu/files.c | 7 +------ + 2 files changed, 3 insertions(+), 9 deletions(-) + +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 99d82a8..77cdd0c 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -1957,10 +1957,9 @@ static int restore_task_with_children(void *_arg) + return 0; + + err: +- if (current->parent == NULL) { +- do_notifier_rollback(opts.with_notifier_kup, POST_UPDATE_KERNEL_COMPLETE); ++ do_notifier_rollback(opts.with_notifier_kup, POST_UPDATE_KERNEL_COMPLETE); ++ if (current->parent == NULL) + futex_abort_and_wake(&task_entries->nr_in_progress); +- } + exit(1); + } + +diff --git a/criu/files.c b/criu/files.c +index 14085a7..5ee7971 100644 +--- a/criu/files.c ++++ b/criu/files.c +@@ -905,12 +905,7 @@ static struct fdinfo_list_entry *alloc_fle(int pid, FdinfoEntry *fe) + + static void __collect_desc_fle(struct fdinfo_list_entry *new_le, struct file_desc *fdesc) + { +- struct fdinfo_list_entry *le; +- +- list_for_each_entry_reverse(le, &fdesc->fd_info_head, desc_list) +- if (pid_rst_prio_eq(le->pid, new_le->pid)) +- break; +- list_add(&new_le->desc_list, &le->desc_list); ++ list_add_tail(&new_le->desc_list, &fdesc->fd_info_head); + } + + static void collect_desc_fle(struct fdinfo_list_entry *new_le, +-- +2.34.0 + diff --git a/backport-0041--log-print-error-log-to-dev-kmsg.patch b/backport-0041--log-print-error-log-to-dev-kmsg.patch new file mode 100644 index 0000000..340683a --- /dev/null +++ b/backport-0041--log-print-error-log-to-dev-kmsg.patch @@ -0,0 +1,83 @@ +From 7262b00416e5cba1b06f5a650ea95ffaf1cba32d Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Tue, 19 Oct 2021 20:53:19 +0800 +Subject: [PATCH 41/49] log: print error log to /dev/kmsg + +The criu log can't be flushed to disk when OS crash in storage +environment, therefore, output high level msg to /dev/kmsg. 
+ +Signed-off-by: fu.lin +--- + criu/Makefile.crtools | 1 + + criu/include/log.h | 3 +++ + criu/kmsg.c | 16 ++++++++++++++++ + criu/log.c | 4 ++++ + 4 files changed, 24 insertions(+) + create mode 100644 criu/kmsg.c + +diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools +index 2a8ec61..1c3918e 100644 +--- a/criu/Makefile.crtools ++++ b/criu/Makefile.crtools +@@ -89,6 +89,7 @@ obj-y += vdso.o + obj-y += devname.o + obj-y += mnl.o + obj-y += nftables.o ++obj-y += kmsg.o + obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o + CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 + obj-$(CONFIG_COMPAT) += vdso-compat.o +diff --git a/criu/include/log.h b/criu/include/log.h +index 15787b0..cce34a2 100644 +--- a/criu/include/log.h ++++ b/criu/include/log.h +@@ -71,6 +71,9 @@ void flush_early_log_buffer(int fd); + print_on_level(LOG_DEBUG, \ + LOG_PREFIX fmt, ##__VA_ARGS__) + ++#include ++void write_kmsg(const void *buf, size_t count); ++ + #ifndef CR_NOGLIBC + + #define pr_perror(fmt, ...) \ +diff --git a/criu/kmsg.c b/criu/kmsg.c +new file mode 100644 +index 0000000..c956dfb +--- /dev/null ++++ b/criu/kmsg.c +@@ -0,0 +1,16 @@ ++#include ++#include ++ ++#define SYSLOG_DEV "/dev/kmsg" ++ ++void write_kmsg(const void *buf, size_t count) ++{ ++ int fd; ++ ++ fd = open(SYSLOG_DEV, O_CLOEXEC | O_WRONLY); ++ if (fd < 0) ++ return; ++ ++ write(fd, buf, count); ++ close(fd); ++} +diff --git a/criu/log.c b/criu/log.c +index 8bdf835..ccb1f26 100644 +--- a/criu/log.c ++++ b/criu/log.c +@@ -378,6 +378,10 @@ void vprint_on_level(unsigned int loglevel, const char *format, va_list params) + size += buf_off; + + while (off < size) { ++ if (loglevel <= LOG_WARN) { ++ write_kmsg(buffer + off, size - off); ++ } ++ + ret = write(fd, buffer + off, size - off); + if (ret <= 0) + break; +-- +2.34.0 + diff --git a/backport-0042--improve-char-dev-fd-check-and-repair-method.patch b/backport-0042--improve-char-dev-fd-check-and-repair-method.patch new file mode 100644 index 0000000..705bef4 --- 
/dev/null +++ b/backport-0042--improve-char-dev-fd-check-and-repair-method.patch @@ -0,0 +1,68 @@ +From 049540d884286e9ec001bc9248de2da2f7fe1232 Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Sun, 24 Oct 2021 15:20:27 +0800 +Subject: [PATCH 42/49] improve char dev fd check and repair method + +Some special char dev cannot work in child processes, we make dump fail +when the special char dev fd is in child processes. +In the char dev repair process, user may need recover fd. We should +make the repair process run after the char dev fd is reopened as dumped fd. + +Signed-off-by: Jingxian He +--- + criu/files.c | 22 ++++++++++++---------- + 1 file changed, 12 insertions(+), 10 deletions(-) + +diff --git a/criu/files.c b/criu/files.c +index 5ee7971..f096783 100644 +--- a/criu/files.c ++++ b/criu/files.c +@@ -1255,6 +1255,7 @@ static int open_fd(struct fdinfo_list_entry *fle) + struct file_desc *d = fle->desc; + struct fdinfo_list_entry *flem; + int new_fd = -1, ret; ++ struct chrfile_info *ci; + + pr_info("open file flags:%x\n", fle->fe->flags); + flem = file_master(d); +@@ -1315,6 +1316,17 @@ static int open_fd(struct fdinfo_list_entry *fle) + if (ret != -1 && new_fd >= 0) { + if (setup_and_serve_out(fle, new_fd) < 0) + return -1; ++ if (d->ops->type == FD_TYPES__CHR) { ++ ci = container_of(d, struct chrfile_info, d); ++ if (ci->cfe->repair) { ++ ret = ioctl(fle->fe->fd, IOCTL_CMD_REPAIR , ci->cfe->index); ++ pr_info("repair ioctl return: %d, index: %d\n", ret, ci->cfe->index); ++ if (ret) { ++ close(fle->fe->fd); ++ return -1; ++ } ++ } ++ } + } + out: + if (ret == 0) +@@ -1839,19 +1851,9 @@ static int chrfile_open(struct file_desc *d, int *new_fd) + return -1; + } + +- if (ci->cfe->repair) { +- ret = ioctl(fd, IOCTL_CMD_REPAIR , ci->cfe->index); +- pr_info("repair ioctl return: %d, index: %d\n", ret, ci->cfe->index); +- if (ret) +- goto err; +- } +- + *new_fd = fd; + ret = 0; + +- return ret; +-err: +- close(fd); + return ret; + } + +-- +2.34.0 + diff --git
a/backport-0043--unix-sk-improve-dgram-robustness.patch b/backport-0043--unix-sk-improve-dgram-robustness.patch new file mode 100644 index 0000000..450897e --- /dev/null +++ b/backport-0043--unix-sk-improve-dgram-robustness.patch @@ -0,0 +1,159 @@ +From ac5620191cef4eeb241c6402e0d9ba2cc472b2a7 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Tue, 26 Oct 2021 11:13:27 +0800 +Subject: [PATCH 43/49] unix sk: improve dgram robustness + +We should try our best to ensure the success of criu. As for unix dgram +socket, criu uses re-connect rather than repair, unlike unix stream +socket. Therefore, this patch does the following things: + +- detect unix dgram unix sock file when criu dumps unix dgram socket +- add the fault tolerance of unix dgram socket connecting (focus on the + condition of `/dev/log` disappearance when rsyslog restart) + +Signed-off-by: fu.lin +--- + criu/sk-unix.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 97 insertions(+), 2 deletions(-) + +diff --git a/criu/sk-unix.c b/criu/sk-unix.c +index 049dc84..2e4cada 100644 +--- a/criu/sk-unix.c ++++ b/criu/sk-unix.c +@@ -11,6 +11,7 @@ + #include + #include + #include ++#include + + #include "libnetlink.h" + #include "cr_options.h" +@@ -357,6 +358,58 @@ err: + return -ENOENT; + } + ++static int unix_resolve_dgram_name(int lfd, uint32_t id, struct unix_sk_desc *sk, ++ struct unix_sk_desc *peer) ++{ ++ char *name = peer->name; ++ char rpath[PATH_MAX]; ++ struct stat st; ++ struct ns_id *ns; ++ int mntns_root; ++ ++ /* The unnamed or abstrace unix socket contion. */ ++ if (peer->namelen == 0 || name[0] == '\0') ++ return 0; ++ ++ if (name[0] != '/') { ++ pr_warn("Not support relative path, following the original rule." ++ " socket %#x(%s) ino %d peer %d\n", ++ id, name, sk->sd.ino, sk->peer_ino); ++ return 0; ++ } else if (kdat.sk_unix_file && (root_ns_mask & CLONE_NEWNS)) { ++ pr_warn("Not support mnt namespace, following the original rule."
++ " socket %#x(%s) ino %d peer %d\n", ++ id, name, sk->sd.ino, sk->peer_ino); ++ return 0; ++ } ++ ++ ns = lookup_ns_by_id(root_item->ids->mnt_ns_id, &mnt_ns_desc); ++ if (!ns) { ++ pr_err("Can't find ns. socket %#x(%s) ino %d peer %d\n", ++ id, name, sk->sd.ino, sk->peer_ino); ++ goto err; ++ } ++ ++ mntns_root = mntns_get_root_fd(ns); ++ if (mntns_root < 0) { ++ pr_err("Can't get root fd. socket %#x(%s) ino %d peer %d\n", ++ id, name, sk->sd.ino, sk->peer_ino); ++ goto err; ++ } ++ ++ snprintf(rpath, sizeof(rpath), ".%s", name); ++ if (fstatat(mntns_root, rpath, &st, 0)) { ++ pr_err("Can't stat the connected DGRAM type socket id %#x," ++ " peer ino %d path '%s'\n", ++ id, sk->peer_ino, name); ++ goto err; ++ } ++ ++ return 0; ++err: ++ return -ENOENT; ++} ++ + static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) + { + struct unix_sk_desc *sk, *peer; +@@ -483,6 +536,18 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) + ret = getsockopt(lfd, SOL_TCP, TCP_REPAIR_OPTIONS, &ue->repair_ino, &len); + if (ret < 0) + goto err; ++ } else if (peer->name && sk->type == SOCK_DGRAM) { ++ /* ++ * As for unix stream socket, we can use the kernel] ++ * feature which cmdline is `unix_stream_restore_enable` ++ * to dump/restore it. Because of the feature, we can ++ * not consider the unix stream socket file status. ++ * But as for unix dgram socket, it's different. We ++ * must ensure the existence of the socket file when ++ * dump/restore, otherwise it will fail. ++ */ ++ if (unix_resolve_dgram_name(lfd, id, sk, peer) != 0) ++ goto err; + } + + /* +@@ -1325,6 +1390,33 @@ err: + return -1; + } + ++/* ++ * Sometimes, `/dev/log` will disappear because of the restart of rsyslog when ++ * rotating, criu try to connect `/dev/log` will report error at this time. We ++ * should try our best to ensure the success of criu restoration. Therefore, ++ * retry three times here. 
++ */ ++static int unix_dgram_reconnect(int fd, struct sockaddr_un *addr, int len) ++{ ++ int retval = 0; ++ struct timespec tim = { ++ .tv_sec = 0, ++ .tv_nsec = 5e+8, ++ }; ++ ++ for (int i = 0; i < 3; i++) { ++ nanosleep(&tim, NULL); ++ pr_warn("Can't connect unix socket(%s), %d retry\n", ++ addr->sun_path, i); ++ retval = connect(fd, (struct sockaddr *)addr, ++ sizeof(addr->sun_family) + len); ++ if (retval == 0) ++ break; ++ } ++ ++ return retval; ++} ++ + static int post_open_standalone(struct file_desc *d, int fd) + { + int fdstore_fd = -1, procfs_self_dir = -1, len; +@@ -1406,8 +1498,11 @@ static int post_open_standalone(struct file_desc *d, int fd) + goto err_revert_and_exit; + } + } else if (connect(fd, (struct sockaddr *)&addr, sizeof(addr.sun_family) + len) < 0) { +- pr_perror("Can't connect %d socket", ui->ue->ino); +- goto err_revert_and_exit; ++ if (ui->ue->type != SOCK_DGRAM || errno != ENOENT ++ || unix_dgram_reconnect(fd, &addr, len) != 0) { ++ pr_perror("Can't connect %d socket", ui->ue->ino); ++ goto err_revert_and_exit; ++ } + } + mutex_unlock(mutex_ghost); + +-- +2.34.0 + diff --git a/backport-0044--sk-ignore-the-bind-error-for-icmp-socket.patch b/backport-0044--sk-ignore-the-bind-error-for-icmp-socket.patch new file mode 100644 index 0000000..d27925e --- /dev/null +++ b/backport-0044--sk-ignore-the-bind-error-for-icmp-socket.patch @@ -0,0 +1,44 @@ +From 579f5c8e89a1c799501afe7dcc07caa7a3caa252 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Wed, 27 Oct 2021 11:57:43 +0800 +Subject: [PATCH 44/49] sk: ignore the bind error for icmp socket + +Signed-off-by: fu.lin +--- + criu/sk-inet.c | 20 ++++++++++++++++++-- + 1 file changed, 18 insertions(+), 2 deletions(-) + +diff --git a/criu/sk-inet.c b/criu/sk-inet.c +index cdd8969..42e4828 100644 +--- a/criu/sk-inet.c ++++ b/criu/sk-inet.c +@@ -1151,8 +1151,24 @@ int inet_bind(int sk, struct inet_sk_info *ii) + } + + if (bind(sk, (struct sockaddr *)&addr, addr_size) == -1) { +- pr_perror("Can't bind 
inet socket (id %d)", ii->ie->id); +- return -1; ++ InetSkEntry *ie = ii->ie; ++ ++ /* ++ * Sometimes the ping-like program restoration may appear ++ * `bind()` error when it is specified the address. In view ++ * of the principle that we should try our best to restore the ++ * process, and ping-like program works abnormal can tolerate, ++ * just warn here instead of report error. ++ */ ++ if (ie->proto == IPPROTO_ICMP || ie->proto == IPPROTO_ICMPV6) { ++ pr_warn("Can't bind inet socket (id %d) proto %s\n", ++ ie->id, ++ ie->proto == IPPROTO_ICMP ? ++ "IPPROTO_ICMP" : "IPPROTO_ICMPV6"); ++ } else { ++ pr_perror("Can't bind inet socket (id %d)", ii->ie->id); ++ return -1; ++ } + } + + if (rst_freebind) { +-- +2.34.0 + diff --git a/backport-0045--infiniband-fix-the-infiniband-fd-conflict.patch b/backport-0045--infiniband-fix-the-infiniband-fd-conflict.patch new file mode 100644 index 0000000..cb8cbef --- /dev/null +++ b/backport-0045--infiniband-fix-the-infiniband-fd-conflict.patch @@ -0,0 +1,286 @@ +From 439c04a5529dc94dc69f59334ab3995751861def Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Mon, 8 Nov 2021 15:08:12 +0800 +Subject: [PATCH 45/49] infiniband: fix the infiniband fd conflict + +Phenomenon: + Operating uverbs device will generate anonymous fd named +`anon_inode:[infinibandevent]`. When `anon_inode:[infinibandevent]` fd +is the last opened fd, and some kind of unix socket fd exist, which is +generated by syscalls like `socketpair()` at the same time, +`anon_inode:[infinibandevent]` will fail to restore probabilistically.
+ +log as the following: + +``` +(00.254523) 63959: open file flags:1 +(00.254526) 63959: unix: Opening standalone (stage 0 id 0x1ff ino 1019605 peer 0) +(00.254571) 63959: *******flags: 0 +(00.254575) 63959: Create fd for 1408 # the fake fd +(00.254578) 63959: *******flags: 1 +(00.254580) 63959: Create fd for 445 # the restoration fd +``` + +Reason: + During the restoration of unix socket, `socketpair()` will generate +two fds, one is used to the current restoration, another is called fake +fd which fd nr is owned by `find_unused_fd()`. When +`anon_inode:[infinibandevent]` fd is the last one, criu don't dump the +fd information for `anon_inode:[infinibandevent]` in original +implementation, and criu think the fd nr which should belong to +`anon_inode:[infinibandevent]` isn't used. Therefore, it cause the +`anon_inode:[infinibandevent]` restoration fail. + +This patch fix the above problem. Core: dump +`anon_inode:[infinibandevent]` fd information, make the criu is aware +that that fd nr is used. 
+ +Signed-off-by: fu.lin +--- + criu/Makefile.crtools | 1 + + criu/char.c | 68 ++++++++++++++++++++++++++++++++++++ + criu/files.c | 23 ++++++------ + criu/include/char.h | 17 +++++++++ + criu/include/image-desc.h | 1 + + criu/include/protobuf-desc.h | 1 + + images/chr.proto | 3 ++ + images/fdinfo.proto | 2 ++ + 8 files changed, 103 insertions(+), 13 deletions(-) + create mode 100644 criu/char.c + create mode 100644 criu/include/char.h + +diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools +index 1c3918e..9d8c5a3 100644 +--- a/criu/Makefile.crtools ++++ b/criu/Makefile.crtools +@@ -90,6 +90,7 @@ obj-y += devname.o + obj-y += mnl.o + obj-y += nftables.o + obj-y += kmsg.o ++obj-y += char.o + obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o + CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 + obj-$(CONFIG_COMPAT) += vdso-compat.o +diff --git a/criu/char.c b/criu/char.c +new file mode 100644 +index 0000000..153145f +--- /dev/null ++++ b/criu/char.c +@@ -0,0 +1,68 @@ ++#include "imgset.h" ++#include "char.h" ++#include "log.h" ++ ++#include "protobuf.h" ++ ++static void pr_info_infiniband(char *action, InfinibandEntry *infiniband) ++{ ++ pr_info("%sinfiniband: id %#08x\n", action, infiniband->id); ++} ++ ++/* Checks if file descriptor @lfd is infinibandevent */ ++int is_infiniband_link(char *link) ++{ ++ return is_anon_link_type(link, "[infinibandevent]"); ++} ++ ++static int dump_one_infiniband(int lfd, u32 id, const struct fd_parms *p) ++{ ++ FileEntry fe = FILE_ENTRY__INIT; ++ InfinibandEntry infiniband = INFINIBAND_ENTRY__INIT; ++ ++ infiniband.id = id; ++ ++ fe.type = FD_TYPES__INFINIBAND; ++ fe.id = infiniband.id; ++ fe.infiniband = &infiniband; ++ ++ pr_info_infiniband("Dumping ", &infiniband); ++ ++ return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); ++} ++ ++const struct fdtype_ops infiniband_dump_ops = { ++ .type = FD_TYPES__INFINIBAND, ++ .dump = dump_one_infiniband, ++}; ++ ++static int infiniband_open(struct file_desc *d, int 
*new_fd) { ++ /* ++ * `*new_fd == -1` at this time, it means this open operation shouldn't ++ * be served out, which is why this function does nothing here. ++ */ ++ return 0; ++}; ++ ++static struct file_desc_ops infiniband_desc_ops = { ++ .type = FD_TYPES__INFINIBAND, ++ .open = infiniband_open, ++}; ++ ++static int collect_one_infiniband(void *o, ProtobufCMessage *base, struct cr_img *i) ++{ ++ struct infiniband_file_info *info = o; ++ ++ info->infiniband = pb_msg(base, InfinibandEntry); ++ pr_info_infiniband("Collected ", info->infiniband); ++ ++ /* add the fd to `file_desc_hash` list to prevent from NULL pointer */ ++ return file_desc_add(&info->d, info->infiniband->id, &infiniband_desc_ops); ++} ++ ++struct collect_image_info infiniband_cinfo = { ++ .fd_type = CR_FD_INFINIBAND, ++ .pb_type = PB_INFINIBAND, ++ .priv_size = sizeof(struct infiniband_file_info), ++ .collect = collect_one_infiniband, ++}; +diff --git a/criu/files.c b/criu/files.c +index f096783..200f8f9 100644 +--- a/criu/files.c ++++ b/criu/files.c +@@ -46,6 +46,7 @@ + #include "parasite-syscall.h" + #include "kerndat.h" + #include "fdstore.h" ++#include "char.h" + + #include "protobuf.h" + #include "util.h" +@@ -582,12 +583,6 @@ static int dump_chrdev(struct fd_parms *p, int lfd, FdinfoEntry *e) + return err; + } + +-/* Checks if file descriptor @lfd is infinibandevent */ +-int is_infiniband_link(char *link) +-{ +- return is_anon_link_type(link, "[infinibandevent]"); +-} +- + static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, + struct parasite_ctl *ctl, FdinfoEntry *e, + struct parasite_drain_fd *dfds) +@@ -643,7 +638,7 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, + else if (is_timerfd_link(link)) + ops = &timerfd_dump_ops; + else if (is_infiniband_link(link)) +- return 1; ++ ops = &infiniband_dump_ops; + else + return dump_unsupp_fd(&p, lfd, "anon", link, e); + +@@ -745,11 +740,7 @@ int dump_task_files_seized(struct 
parasite_ctl *ctl, struct pstree_item *item, + lfds[i], opts + i, ctl, &e, dfds); + if (ret < 0) + break; +- /* infiniband link file */ +- if (ret > 0) { +- ret = 0; +- continue; +- } ++ + e.flags |= need_reuse_flag; + pr_info("write fdinfoEntry fd=%d id=%d\n", (&e)->fd, (&e)->id); + ret = pb_write_one(img, &e, PB_FDINFO); +@@ -1844,8 +1835,11 @@ static int chrfile_open(struct file_desc *d, int *new_fd) + pr_info("charfile: Opening %s (repair %d index %d)\n", + ci->path, ci->cfe->repair, ci->cfe->index); + ++ if (ci->cfe->repair) ++ ci->cfe->flags |= O_REPAIR; ++ + mntns_root = open_pid_proc(getpid()); +- fd = openat(mntns_root, ci->path, ci->cfe->flags | O_REPAIR); ++ fd = openat(mntns_root, ci->path, ci->cfe->flags); + if (fd < 0){ + pr_err("open chr file failed\n"); + return -1; +@@ -1963,6 +1957,9 @@ static int collect_one_file(void *o, ProtobufCMessage *base, struct cr_img *i) + case FD_TYPES__CHR: + ret = collect_one_file_entry(fe, fe->chr->id, &fe->chr->base, &chrfile_cinfo); + break; ++ case FD_TYPES__INFINIBAND: ++ ret = collect_one_file_entry(fe, fe->infiniband->id, &fe->infiniband->base, &infiniband_cinfo); ++ break; + } + + return ret; +diff --git a/criu/include/char.h b/criu/include/char.h +new file mode 100644 +index 0000000..c63b8f1 +--- /dev/null ++++ b/criu/include/char.h +@@ -0,0 +1,17 @@ ++#ifndef __CR_CHAR_H__ ++#define __CR_CHAR_H__ ++ ++#include "files.h" ++#include "images/chr.pb-c.h" ++ ++struct infiniband_file_info { ++ InfinibandEntry *infiniband; ++ struct file_desc d; ++}; ++ ++extern const struct fdtype_ops infiniband_dump_ops; ++extern struct collect_image_info infiniband_cinfo; ++ ++int is_infiniband_link(char *link); ++ ++#endif /* __CR_CHAR_H__ */ +diff --git a/criu/include/image-desc.h b/criu/include/image-desc.h +index d5e2ac4..56f565d 100644 +--- a/criu/include/image-desc.h ++++ b/criu/include/image-desc.h +@@ -109,6 +109,7 @@ enum { + + CR_FD_AUTOFS, + CR_FD_CHRFILE, ++ CR_FD_INFINIBAND, + + CR_FD_MAX + }; +diff --git 
a/criu/include/protobuf-desc.h b/criu/include/protobuf-desc.h +index 4112be5..312c6dd 100644 +--- a/criu/include/protobuf-desc.h ++++ b/criu/include/protobuf-desc.h +@@ -62,6 +62,7 @@ enum { + PB_GHOST_CHUNK, + PB_FILE, + PB_CHRFILE, ++ PB_INFINIBAND, + + /* PB_AUTOGEN_STOP */ + +diff --git a/images/chr.proto b/images/chr.proto +index 67929db..ed65005 100644 +--- a/images/chr.proto ++++ b/images/chr.proto +@@ -10,3 +10,6 @@ message chrfile_entry { + required bool repair = 5; + }; + ++message infiniband_entry { ++ required uint32 id = 1; ++}; +diff --git a/images/fdinfo.proto b/images/fdinfo.proto +index c483bd8..a5ffb8d 100644 +--- a/images/fdinfo.proto ++++ b/images/fdinfo.proto +@@ -38,6 +38,7 @@ enum fd_types { + EXT = 16; + TIMERFD = 17; + CHR = 21; ++ INFINIBAND = 22; + + /* Any number above the real used. Not stored to image */ + CTL_TTY = 65534; +@@ -73,4 +74,5 @@ message file_entry { + optional pipe_entry pipe = 18; + optional tty_file_entry tty = 19; + optional chrfile_entry chr = 23; ++ optional infiniband_entry infiniband = 25; + } +-- +2.34.0 + diff --git a/backport-0046--optimization-parallel-collecting-vmas.patch b/backport-0046--optimization-parallel-collecting-vmas.patch new file mode 100644 index 0000000..8bb491c --- /dev/null +++ b/backport-0046--optimization-parallel-collecting-vmas.patch @@ -0,0 +1,520 @@ +From 0026b42e1f407230a80845f696491588fcf0b44e Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Fri, 12 Nov 2021 17:58:50 +0800 +Subject: [PATCH 46/49] optimization: parallel collecting vmas + +collecting smaps has no influence with other processes, we can collect +parallelly early to accelerate speed. + +In order to prevent the concurrency problem by `find_unused_fd`, only +the main root task will run in parallel. + +Usage: + criu --parallel + +Note: ensure criu can use multi-core, otherwise the performance will +deteriorate.
+ +Signed-off-by: fu.lin +--- + criu/Makefile.crtools | 1 + + criu/Makefile.packages | 1 + + criu/config.c | 1 + + criu/cr-dump.c | 53 +++++++++++----- + criu/crtools.c | 1 + + criu/include/cr_options.h | 1 + + criu/include/pstree.h | 3 + + criu/include/taskqueue.h | 50 +++++++++++++++ + criu/namespaces.c | 9 ++- + criu/proc_parse.c | 6 ++ + criu/taskqueue.c | 124 ++++++++++++++++++++++++++++++++++++++ + 11 files changed, 234 insertions(+), 16 deletions(-) + create mode 100644 criu/include/taskqueue.h + create mode 100644 criu/taskqueue.c + +diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools +index 9d8c5a3..3263704 100644 +--- a/criu/Makefile.crtools ++++ b/criu/Makefile.crtools +@@ -91,6 +91,7 @@ obj-y += mnl.o + obj-y += nftables.o + obj-y += kmsg.o + obj-y += char.o ++obj-y += taskqueue.o + obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o + CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 + obj-$(CONFIG_COMPAT) += vdso-compat.o +diff --git a/criu/Makefile.packages b/criu/Makefile.packages +index ce04529..f65f25a 100644 +--- a/criu/Makefile.packages ++++ b/criu/Makefile.packages +@@ -39,6 +39,7 @@ endif + export LIBS += -lprotobuf-c -ldl -lnl-3 -lsoccr -Lsoccr/ -lnet + export LIBS += $(shell pkg-config --libs libmnl) + export LIBS += $(shell pkg-config --libs libnftnl) ++export LIBS += -lpthread + export CFLAGS += $(shell pkg-config --cflags libmnl) + export CFLAGS += $(shell pkg-config --cflags libnftnl) + +diff --git a/criu/config.c b/criu/config.c +index a822ef2..322a79e 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -530,6 +530,7 @@ int parse_options(int argc, char **argv, bool *usage_error, + BOOL_OPT("file-locks-repair", &opts.file_locks_repair), + {"reserve-ports", required_argument, 0, 'P' }, + BOOL_OPT("use-nft", &opts.use_nft), ++ BOOL_OPT("parallel", &opts.parallel), + { }, + }; + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index 35e80f5..6b7f735 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -18,6 +18,7 @@ + + #include + #include 
++#include + + #include "types.h" + #include "protobuf.h" +@@ -81,6 +82,7 @@ + #include "dump.h" + #include "eventpoll.h" + #include "restorer.h" ++#include "taskqueue.h" + + /* + * Architectures can overwrite this function to restore register sets that +@@ -399,7 +401,7 @@ static int dump_pid_misc(pid_t pid, TaskCoreEntry *tc) + return 0; + } + +-static int dump_filemap(struct vma_area *vma_area, int fd) ++int dump_filemap(struct vma_area *vma_area, int fd) + { + struct fd_parms p = FD_PARMS_INIT; + VmaEntry *vma = vma_area->e; +@@ -1227,7 +1229,7 @@ err_cure: + static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + { + pid_t pid = item->pid->real; +- struct vm_area_list vmas; ++ struct vm_area_list *vmas = NULL; + struct parasite_ctl *parasite_ctl; + int ret, exit_code = -1; + struct parasite_dump_misc misc; +@@ -1236,8 +1238,6 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + struct proc_posix_timers_stat proc_args; + struct mem_dump_ctl mdc; + +- vm_area_list_init(&vmas); +- + pr_info("========================================\n"); + pr_info("Dumping task (pid: %d)\n", pid); + pr_info("========================================\n"); +@@ -1248,12 +1248,23 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + */ + return 0; + ++ if (!opts.parallel || root_item->pid->real != item->pid->real ) { ++ vmas = xmalloc(sizeof(struct vm_area_list)); ++ if (vmas == NULL) { ++ pr_err("xmalloc no memory\n"); ++ return -1; ++ } ++ vm_area_list_init(vmas); ++ } else ++ vmas = item->maps_info.vmas; ++ + pr_info("Obtaining task stat ... \n"); + ret = parse_pid_stat(pid, &pps_buf); + if (ret < 0) + goto err; + +- ret = collect_mappings(pid, &vmas, dump_filemap); ++ ret = (opts.parallel && root_item->pid->real == item->pid->real) ? 
++ 0 : collect_mappings(pid, vmas, dump_filemap); + if (ret) { + pr_err("Collect mappings (pid: %d) failed with %d\n", pid, ret); + goto err; +@@ -1287,7 +1298,10 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + goto err; + } + +- parasite_ctl = parasite_infect_seized(pid, item, &vmas); ++ if (opts.parallel && end_collect_mappings_thread(item)) ++ goto err; ++ ++ parasite_ctl = parasite_infect_seized(pid, item, vmas); + if (!parasite_ctl) { + pr_err("Can't infect (pid: %d) with parasite\n", pid); + goto err; +@@ -1311,13 +1325,13 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + goto err_cure_imgset; + } + +- ret = parasite_fixup_vdso(parasite_ctl, pid, &vmas); ++ ret = parasite_fixup_vdso(parasite_ctl, pid, vmas); + if (ret) { + pr_err("Can't fixup vdso VMAs (pid: %d)\n", pid); + goto err_cure_imgset; + } + +- ret = parasite_collect_aios(parasite_ctl, &vmas); /* FIXME -- merge with above */ ++ ret = parasite_collect_aios(parasite_ctl, vmas); /* FIXME -- merge with above */ + if (ret) { + pr_err("Failed to check aio rings (pid: %d)\n", pid); + goto err_cure_imgset; +@@ -1371,7 +1385,7 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + mdc.stat = &pps_buf; + mdc.parent_ie = parent_ie; + +- ret = parasite_dump_pages_seized(item, &vmas, &mdc, parasite_ctl); ++ ret = parasite_dump_pages_seized(item, vmas, &mdc, parasite_ctl); + if (ret) + goto err_cure; + +@@ -1432,7 +1446,7 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + goto err; + } + +- ret = dump_task_mm(pid, &pps_buf, &misc, &vmas, cr_imgset); ++ ret = dump_task_mm(pid, &pps_buf, &misc, vmas, cr_imgset); + if (ret) { + pr_err("Dump mappings (pid: %d) failed with %d\n", pid, ret); + goto err; +@@ -1448,7 +1462,8 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + exit_code = 0; + err: + close_pid_proc(); +- free_mappings(&vmas); ++ free_mappings(vmas); 
++ free(vmas); + xfree(dfds); + return exit_code; + +@@ -1818,6 +1833,13 @@ static int cr_dump_finish(int ret) + write_stats(DUMP_STATS); + pr_info("Dumping finished successfully\n"); + } ++ ++ /* ++ * Don't care threads' status and ignore unfree resources, use ++ * `exit_group()` to ensure exit all threads. ++ */ ++ syscall(SYS_exit_group, post_dump_ret ? : (ret != 0)); ++ + return post_dump_ret ? : (ret != 0); + } + +@@ -1843,6 +1865,9 @@ int cr_dump_tasks(pid_t pid) + if (opts.dump_char_dev && parse_devname() < 0) + goto err; + ++ if (opts.parallel && init_parallel_env() != 0) ++ goto err; ++ + root_item = alloc_pstree_item(); + if (!root_item) + goto err; +@@ -1920,13 +1945,13 @@ int cr_dump_tasks(pid_t pid) + if (collect_file_locks()) + goto err; + +- if (collect_namespaces(true) < 0) +- goto err; +- + glob_imgset = cr_glob_imgset_open(O_DUMP); + if (!glob_imgset) + goto err; + ++ if (collect_namespaces(true) < 0) ++ goto err; ++ + if (seccomp_collect_dump_filters() < 0) + goto err; + +diff --git a/criu/crtools.c b/criu/crtools.c +index 5ae7ca0..e39dc07 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -434,6 +434,7 @@ usage: + " --file-locks-repair Use repair mode to dump and restore file locks\n" + " --reserve-ports Reserve src ports in kernel\n" + " --use-nft Use nft API instead of iptables cmd in network locking\n" ++" --parallel Parallel to accellrate dumping speed\n\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 672070e..7575929 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -157,6 +157,7 @@ struct cr_options { + char *share_src_ports; + int reserve_ports; + int use_nft; ++ int parallel; + }; + + extern struct cr_options opts; +diff --git a/criu/include/pstree.h b/criu/include/pstree.h +index 17d22e7..5a67ab3 100644 +--- a/criu/include/pstree.h ++++ b/criu/include/pstree.h +@@ -1,6 
+1,8 @@ + #ifndef __CR_PSTREE_H__ + #define __CR_PSTREE_H__ + ++#include "taskqueue.h" ++ + #include "common/list.h" + #include "common/lock.h" + #include "pid.h" +@@ -30,6 +32,7 @@ struct pstree_item { + futex_t task_st; + unsigned long task_st_le_bits; + }; ++ struct mappings_info maps_info; + }; + + static inline pid_t vpid(const struct pstree_item *i) +diff --git a/criu/include/taskqueue.h b/criu/include/taskqueue.h +new file mode 100644 +index 0000000..16f9e3d +--- /dev/null ++++ b/criu/include/taskqueue.h +@@ -0,0 +1,50 @@ ++#ifndef __CR_TASKQUEUE_H__ ++#define __CR_TASKQUEUE_H__ ++ ++#include ++#include ++#include ++ ++#include "vma.h" ++#include "pstree.h" ++ ++#include "common/list.h" ++ ++#define TASKQUEUE_HASH_SIZE 8 ++ ++struct taskqueue { ++ pthread_t task; ++ void *(*routine)(void *); ++ void *arg; ++ int result; ++}; ++#define queue_task queue.task ++#define queue_routine queue.routine ++#define queue_arg queue.arg ++#define queue_result queue.result ++ ++int init_parallel_env(void); ++ ++static inline int taskqueue_create(struct taskqueue *queue) ++{ ++ return pthread_create(&queue->task, NULL, queue->routine, queue->arg); ++} ++ ++static inline int taskqueue_join(struct taskqueue *queue) ++{ ++ return pthread_join(queue->task, NULL); ++} ++ ++/* parallel collect smaps */ ++struct mappings_info { ++ struct hlist_node hash; ++ pid_t pid; ++ struct vm_area_list *vmas; ++ dump_filemap_t dump_file; ++ struct taskqueue queue; ++}; ++ ++int start_collect_mappings_thread(void); ++int end_collect_mappings_thread(struct pstree_item *item); ++ ++#endif /* __CR_TASKQUEUE_H__ */ +diff --git a/criu/namespaces.c b/criu/namespaces.c +index 7fef175..f3b9939 100644 +--- a/criu/namespaces.c ++++ b/criu/namespaces.c +@@ -26,6 +26,7 @@ + #include "net.h" + #include "cgroup.h" + #include "fdstore.h" ++#include "taskqueue.h" + + #include "protobuf.h" + #include "util.h" +@@ -1515,11 +1516,15 @@ int collect_namespaces(bool for_dump) + { + int ret; + +- ret = 
collect_user_namespaces(for_dump); ++ ret = collect_mnt_namespaces(for_dump); + if (ret < 0) + return ret; + +- ret = collect_mnt_namespaces(for_dump); ++ /* need mnt info provided by `mntinfo` */ ++ if (opts.parallel && start_collect_mappings_thread()) ++ return -1; ++ ++ ret = collect_user_namespaces(for_dump); + if (ret < 0) + return ret; + +diff --git a/criu/proc_parse.c b/criu/proc_parse.c +index 17d5cbf..d4d707b 100644 +--- a/criu/proc_parse.c ++++ b/criu/proc_parse.c +@@ -63,6 +63,12 @@ + + #define BUF_SIZE 4096 /* Good enough value - can be changed */ + ++/* cancel log to optimize performance because of the lock contention of print */ ++#undef pr_info ++#undef pr_debug ++#define pr_info(fmt, ...) ++#define pr_debug(fmt, ...) ++ + struct buffer { + char buf[BUF_SIZE]; + char end; /* '\0' */ +diff --git a/criu/taskqueue.c b/criu/taskqueue.c +new file mode 100644 +index 0000000..1196a5e +--- /dev/null ++++ b/criu/taskqueue.c +@@ -0,0 +1,124 @@ ++/* ++ * Target: ++ * parallel dump process ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include "pstree.h" ++#include "log.h" ++#include "taskqueue.h" ++ ++/* ++ * Sometimes, only one cpu can be used which is bad for parallel routine. ++ * Therefore, set cpu affinity for criu routine. 
++ */ ++static int set_cpuaffinity(void) ++{ ++ cpu_set_t *set; ++ int num_cpus = get_nprocs_conf(); ++ size_t cpusetsize = CPU_ALLOC_SIZE(num_cpus); ++ int retval; ++ ++ set = CPU_ALLOC(num_cpus); ++ memset(set, 0xff, cpusetsize); ++ ++ retval = sched_setaffinity(getpid(), cpusetsize, set); ++ if (retval != 0) ++ pr_err("sched_setaffinity failed: %s\n", strerror(errno)); ++ ++ CPU_FREE(set); ++ ++ return retval; ++} ++ ++int init_parallel_env(void) ++{ ++ return set_cpuaffinity(); ++} ++ ++static void *collect_mappings_routine(void *_arg) ++{ ++ struct mappings_info *info = _arg; ++ ++ info->queue_result = collect_mappings(info->pid, info->vmas, info->dump_file); ++ return NULL; ++} ++ ++int dump_filemap(struct vma_area *vma_area, int fd); /* defined in criu/cr-dump.c */ ++ ++int start_collect_mappings_thread(void) ++{ ++ struct pstree_item *pi; ++ struct mappings_info *info; ++ ++ for_each_pstree_item(pi) { ++ /* disable parallel collect for non-root item because of the ++ * concurrence. ++ */ ++ if (pi->pid->real != root_item->pid->real) ++ continue; ++ ++ info = &pi->maps_info; ++ ++ info->vmas = xmalloc(sizeof(struct vm_area_list)); ++ if (info->vmas == NULL) { ++ pr_err("xzalloc vmas no memory\n"); ++ return -1; ++ } ++ vm_area_list_init(info->vmas); ++ ++ info->pid = pi->pid->real; ++ info->dump_file = dump_filemap; ++ info->queue_routine = collect_mappings_routine; ++ info->queue_arg = info; ++ ++ pr_info("Start thread to collect %d mappings\n", info->pid); ++ ++ if (taskqueue_create(&info->queue) < 0) { ++ pr_err("parallel_collect_mappings failed: %s\n", strerror(errno)); ++ free(info->vmas); ++ /* ++ * Don't care other threads status, use `exit_group()` ++ * to ensure all threads exit. ++ */ ++ return -1; ++ } ++ } ++ ++ return 0; ++} ++ ++int end_collect_mappings_thread(struct pstree_item *item) ++{ ++ struct mappings_info *info = &item->maps_info; ++ int retval; ++ ++ /* disable parallel collect for non-root item because of the ++ * concurrence. 
++ */ ++ if (root_item->pid->real != item->pid->real) ++ return 0; ++ ++ retval = taskqueue_join(&info->queue); ++ if (retval != 0 || info->queue_result != 0) { ++ pr_err("taskqueue_join failed, retval %d(errno %d: %s)," ++ " queue_result: %d\n", ++ retval, ++ retval == 0 ? 0 : errno, ++ retval == 0 ? "nil" : strerror(errno), ++ info->queue_result); ++ retval = -1; ++ } ++ ++ pr_info("End thread to collect %d mappings\n", info->pid); ++ ++ /* ++ * Don't care other threads status, use `exit_group()` to ensure all ++ * threads exit. ++ */ ++ return retval; ++} +-- +2.34.0 + diff --git a/backport-0047--dump-ignore-children-exit-to-accelerate-speed.patch b/backport-0047--dump-ignore-children-exit-to-accelerate-speed.patch new file mode 100644 index 0000000..a6f41d0 --- /dev/null +++ b/backport-0047--dump-ignore-children-exit-to-accelerate-speed.patch @@ -0,0 +1,38 @@ +From 17cc7bd04401faa10671622dde5509ae909995b6 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Mon, 29 Nov 2021 19:50:39 +0800 +Subject: [PATCH 47/49] dump: ignore children exit to accelerate speed + +don't care the tracee exit status to accelerate dump speed. Just ignore +SIGCHLD signal. + +Theory: +- criu doesn't care about `wait4()` status for tracee: in original process, + criu just complains if the status of `wait4()` is abnormal, no action + will be processed. +- the tracee will be adopted by the tracer's parent if the tracer exits + earlier than the tracee, no zombie tracee will be left. 
+ +Signed-off-by: fu.lin +--- + criu/seize.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/criu/seize.c b/criu/seize.c +index c9fdd41..7875aea 100644 +--- a/criu/seize.c ++++ b/criu/seize.c +@@ -636,7 +636,9 @@ void pstree_switch_state(struct pstree_item *root_item, int st) + for_each_pstree_item(item) + unseize_task_and_threads(item, st); + +- if (st == TASK_DEAD) ++ if (st == TASK_DEAD && opts.parallel) ++ signal(SIGCHLD, SIG_IGN); /* ignore children exit */ ++ else if (st == TASK_DEAD) + pstree_wait(root_item); + } + +-- +2.34.0 + diff --git a/backport-0048--parallel-parallel-nft-delete-set.patch b/backport-0048--parallel-parallel-nft-delete-set.patch new file mode 100644 index 0000000..34b809c --- /dev/null +++ b/backport-0048--parallel-parallel-nft-delete-set.patch @@ -0,0 +1,177 @@ +From 93483156a4a8e48ddd1709d0472a3742c367ac34 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Mon, 29 Nov 2021 16:03:02 +0800 +Subject: [PATCH 48/49] parallel: parallel nft delete set + +The nft has two parts: rules and set. criu deletes nft rules to unlock network +during restoration. The set deletion action consumes about hundreds of ms when +there are too many elements in nft set. Delaying set deletion is helpful +to save restoration time. 
+ +Signed-off-by: fu.lin +--- + criu/cr-dump.c | 1 + + criu/cr-restore.c | 3 ++- + criu/include/taskqueue.h | 12 ++++++++++- + criu/nftables.c | 14 +++++++++++-- + criu/taskqueue.c | 45 ++++++++++++++++++++++++++++++++++++++++ + 5 files changed, 71 insertions(+), 4 deletions(-) + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index 6b7f735..f203615 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -1782,6 +1782,7 @@ static int cr_dump_finish(int ret) + */ + if (ret || post_dump_ret || opts.final_state == TASK_ALIVE) { + network_unlock(opts.tree_id); ++ parallel_nft_clean((long)opts.tree_id); + delete_link_remaps(); + clean_cr_time_mounts(); + } +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 77cdd0c..130ad20 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -2448,7 +2448,7 @@ skip_ns_bouncing: + if (ret != 0) + pr_err("Post-resume script ret code %d\n", ret); + +- network_delete_set(vpid(init)); ++ parallel_nft_clean((long)vpid(init)); + + if (!opts.restore_detach && !opts.exec_cmd) + wait(NULL); +@@ -2457,6 +2457,7 @@ skip_ns_bouncing: + + out_kill_network_unlocked: + pr_err("Killing processes because of failure on restore.\nThe Network was unlocked so some data or a connection may have been lost.\n"); ++ parallel_nft_clean((long)vpid(init)); + out_kill: + /* + * The processes can be killed only when all of them have been created, +diff --git a/criu/include/taskqueue.h b/criu/include/taskqueue.h +index 16f9e3d..906c784 100644 +--- a/criu/include/taskqueue.h ++++ b/criu/include/taskqueue.h +@@ -6,7 +6,6 @@ + #include + + #include "vma.h" +-#include "pstree.h" + + #include "common/list.h" + +@@ -47,4 +46,15 @@ struct mappings_info { + int start_collect_mappings_thread(void); + int end_collect_mappings_thread(struct pstree_item *item); + ++#define STACK_SIZE (1024 *1024) ++typedef void (*daemon_t)(void *); ++int parallel_task(daemon_t fn, void *_arg); ++ ++struct daemon { ++ daemon_t fn; ++ void *arg; ++}; ++ ++void parallel_nft_clean(long 
tree_id); ++ + #endif /* __CR_TASKQUEUE_H__ */ +diff --git a/criu/nftables.c b/criu/nftables.c +index 739aee4..0c529ed 100644 +--- a/criu/nftables.c ++++ b/criu/nftables.c +@@ -16,6 +16,7 @@ + + #include "sk-inet.h" + #include "nftables.h" ++#include "taskqueue.h" + + #include "../soccr/soccr.h" + +@@ -661,10 +662,9 @@ static int network_delete_rule_internal(struct mnl_params *params, + return nft_rule_common(params, tree_id, false); + } + ++/* here split the deletion of rule and set to accelete the restoration process */ + void network_delete_rule(pid_t tree_id) + { +- pr_info("unlock network\n"); +- + mnl_common(network_delete_rule_internal, NULL, &tree_id); + } + +@@ -683,6 +683,16 @@ void network_delete_set(pid_t tree_id) + mnl_common(network_delete_set_internal, NULL, &tree_id); + } + ++void parallel_nft_clean_internal(void *arg) ++{ ++ network_delete_set((long)arg); ++} ++ ++void parallel_nft_clean(long tree_id) ++{ ++ parallel_task(parallel_nft_clean_internal, (void *)tree_id); ++} ++ + static int add_set_elem_internal(struct nftnl_set *s, void *data, size_t len) + { + struct nftnl_set_elem *e; +diff --git a/criu/taskqueue.c b/criu/taskqueue.c +index 1196a5e..7d500e9 100644 +--- a/criu/taskqueue.c ++++ b/criu/taskqueue.c +@@ -122,3 +122,48 @@ int end_collect_mappings_thread(struct pstree_item *item) + */ + return retval; + } ++ ++static int daemonize(void *arg) ++{ ++ struct daemon *d = arg; ++ ++ if (daemon(0, 0) < 0) ++ pr_perror("daemonize failed"); ++ ++ d->fn(d->arg); ++ ++ return 0; ++} ++ ++int parallel_task(daemon_t fn, void *_arg) ++{ ++ struct daemon arg = { ++ .fn = fn, ++ .arg = _arg, ++ }; ++ char *stack; ++ char *stack_top; ++ pid_t pid; ++ ++ stack = mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE, ++ MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0); ++ if (stack == MAP_FAILED) { ++ pr_perror("mmap failed"); ++ return -1; ++ } ++ ++ stack_top = stack + STACK_SIZE; ++ ++ /* ignore SIGCHLD signal */ ++ pid = clone(daemonize, stack_top, 0, &arg); ++ 
if (pid > 0) ++ return 0; /* parent */ ++ else if (pid < 0) { ++ pr_perror("clone failed"); ++ return -1; ++ } ++ ++ /* unreachable */ ++ __builtin_unreachable(); ++ return 0; ++} +-- +2.34.0 + diff --git a/backport-0049--ptrace-trace-specific-syscall.patch b/backport-0049--ptrace-trace-specific-syscall.patch new file mode 100644 index 0000000..87eae45 --- /dev/null +++ b/backport-0049--ptrace-trace-specific-syscall.patch @@ -0,0 +1,707 @@ +From bc64bfe1bbe0e5dbd47723dacebebb503ed06d25 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Tue, 23 Nov 2021 16:08:17 +0800 +Subject: [PATCH 49/49] ptrace: trace specific syscall + +criu uses `ptrace(PTRACE_SYSCALL)` to watch whether the tracee steps in +correct status, it isn't necessary to stop tracee at every syscall. +Therefore, customizing `ptrace(PTRACE_SYSCALL)` to make tracee stop at +the specific syscall can save time (1000 threads consume about 140ms). + +ptrace syntax: + long ptrace(PTRACE_SYSCALL, pid_t pid, void *addr, void *data); + +the argument `addr` is unused in original `ptrace(PTRACE_SYSCALL)`, +use `addr` parameter to give the specific sysno which is wanted to +trace. 
+ +Signed-off-by: fu.lin +--- + compel/Makefile | 1 + + compel/include/uapi/bisect.h | 30 +++++++ + compel/include/uapi/infect.h | 11 ++- + compel/src/lib/bisect.c | 92 +++++++++++++++++++ + compel/src/lib/infect.c | 167 ++++++++++++++++++++++++++++++++--- + criu/config.c | 1 + + criu/cr-dump.c | 2 +- + criu/cr-restore.c | 97 +++++++++++++++++++- + criu/crtools.c | 1 + + criu/include/cr_options.h | 1 + + 10 files changed, 385 insertions(+), 18 deletions(-) + create mode 100644 compel/include/uapi/bisect.h + create mode 100644 compel/src/lib/bisect.c + +diff --git a/compel/Makefile b/compel/Makefile +index de9318c..eea93a7 100644 +--- a/compel/Makefile ++++ b/compel/Makefile +@@ -27,6 +27,7 @@ lib-y += src/lib/infect-rpc.o + lib-y += src/lib/infect-util.o + lib-y += src/lib/infect.o + lib-y += src/lib/ptrace.o ++lib-y += src/lib/bisect.o + + # handle_elf() has no support of ELF relocations on ARM (yet?) + ifneq ($(filter arm aarch64,$(ARCH)),) +diff --git a/compel/include/uapi/bisect.h b/compel/include/uapi/bisect.h +new file mode 100644 +index 0000000..9c00513 +--- /dev/null ++++ b/compel/include/uapi/bisect.h +@@ -0,0 +1,30 @@ ++#ifndef __COMPEL_BISECT_H__ ++#define __COMPEL_BISECT_H__ ++ ++#include ++ ++enum tf { ++ TRACE_INTERRUPT = 0x17173, ++ TRACE_SYSCALL_ENTER, ++ TRACE_SYSCALL_EXIT, ++}; ++ ++struct trace_flag { ++ pid_t key; ++ enum tf flag; /* TODO: enum trace_flags flag */ ++}; ++ ++struct bisect_meta { ++ int size; ++ int used; ++ void *data; /* data pointer array */ ++ void *__data; /* data array */ ++}; ++ ++struct trace_flag *tf_bisect(struct bisect_meta *meta, pid_t key); ++struct trace_flag *tf_insert(struct bisect_meta *meta, pid_t key); ++int tf_create(struct bisect_meta *meta, int len); ++void tf_destroy(struct bisect_meta *meta); ++void tf_clear(struct bisect_meta *meta); ++ ++#endif /* __COMPEL_BISECT_H__ */ +diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h +index dd672bc..c0bf03b 100644 +--- 
a/compel/include/uapi/infect.h ++++ b/compel/include/uapi/infect.h +@@ -8,6 +8,7 @@ + #include + #include + #include ++#include + + #include "common/compiler.h" + +@@ -41,7 +42,7 @@ extern int __must_check compel_infect(struct parasite_ctl *ctl, + extern struct parasite_thread_ctl __must_check *compel_prepare_thread(struct parasite_ctl *ctl, int pid); + extern void compel_release_thread(struct parasite_thread_ctl *); + +-extern int __must_check compel_stop_daemon(struct parasite_ctl *ctl); ++extern int __must_check compel_stop_daemon(struct parasite_ctl *ctl, bool customize); + extern int __must_check compel_cure_remote(struct parasite_ctl *ctl); + extern int __must_check compel_cure_local(struct parasite_ctl *ctl); + extern int __must_check compel_cure(struct parasite_ctl *ctl); +@@ -90,6 +91,14 @@ extern int __must_check compel_stop_pie(pid_t pid, void *addr, + + extern int __must_check compel_unmap(struct parasite_ctl *ctl, unsigned long addr); + ++extern int __must_check compel_stop_on_syscall_customize(int tasks, ++ const int sys_nr, const int exit_sys_nr, struct bisect_meta *meta); ++ ++extern int __must_check compel_stop_pie_customize(pid_t pid, ++ const int sys_nr, struct trace_flag *tf); ++ ++extern int __must_check compel_unmap_customize(struct parasite_ctl *ctl, unsigned long addr); ++ + extern int compel_mode_native(struct parasite_ctl *ctl); + + extern k_rtsigset_t *compel_task_sigmask(struct parasite_ctl *ctl); +diff --git a/compel/src/lib/bisect.c b/compel/src/lib/bisect.c +new file mode 100644 +index 0000000..807a5a9 +--- /dev/null ++++ b/compel/src/lib/bisect.c +@@ -0,0 +1,92 @@ ++#include ++ ++#include "log.h" ++#include "common/xmalloc.h" ++#include "bisect.h" ++ ++struct trace_flag *tf_bisect(struct bisect_meta *meta, pid_t key) ++{ ++ struct trace_flag **tfs = meta->data; ++ int lo = 0, hi = meta->used, mid; ++ ++ if (meta->used <= 0) ++ return NULL; ++ ++ while (lo < hi) { ++ mid = (int)((lo + hi) / 2); ++ if (tfs[mid]->key == key) { ++ return 
tfs[mid]; ++ } else if (tfs[mid]->key > key) { ++ hi = mid; ++ } else { ++ lo = mid + 1; ++ } ++ } ++ ++ return NULL; ++} ++ ++/* used in cr-restore */ ++struct trace_flag *tf_insert(struct bisect_meta *meta, pid_t key) ++{ ++ struct trace_flag **tfs = meta->data; ++ struct trace_flag *tf = &((struct trace_flag *)meta->__data)[meta->used]; ++ int i = 0, j = 0; ++ ++ if (meta->used == meta->size) ++ return NULL; ++ ++ for (i = 0; i < meta->used; i++) { ++ if (tfs[i]->key >= key) /* impossible condition: `tfs[i]->key == key` */ ++ break; ++ } ++ ++ j = meta->used; ++ meta->used += 1; ++ ++ while (j > i) { ++ tfs[j] = tfs[j-1]; ++ j -= 1; ++ } ++ ++ tfs[i] = tf; ++ tf->key = key; ++ ++ return tf; ++} ++ ++int tf_create(struct bisect_meta *meta, int len) ++{ ++ struct trace_flag *tfs; ++ struct trace_flag **tfs_ptr; ++ ++ tfs = xzalloc(sizeof(*tfs) * len); ++ if (tfs == NULL) ++ return -1; ++ ++ tfs_ptr = xmalloc(sizeof(*tfs_ptr) * len); ++ if (tfs_ptr == NULL) ++ goto err; ++ ++ meta->size = len; ++ meta->used = 0; ++ meta->__data = tfs; ++ meta->data = tfs_ptr; ++ ++ return 0; ++err: ++ xfree(tfs); ++ return -1; ++} ++ ++void tf_destroy(struct bisect_meta *meta) ++{ ++ xfree(meta->__data); ++ xfree(meta->data); ++} ++ ++void tf_clear(struct bisect_meta *meta) ++{ ++ meta->used = 0; ++ __builtin_memset(meta->data, 0, sizeof(struct trace_flag **)*meta->size); ++} +diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c +index 19f0d10..33c8fec 100644 +--- a/compel/src/lib/infect.c ++++ b/compel/src/lib/infect.c +@@ -438,7 +438,7 @@ static int restore_child_handler(struct parasite_ctl *ctl) + } + + static int parasite_run(pid_t pid, int cmd, unsigned long ip, void *stack, +- user_regs_struct_t *regs, struct thread_ctx *octx) ++ user_regs_struct_t *regs, struct thread_ctx *octx, void *addr) + { + k_rtsigset_t block; + +@@ -454,7 +454,7 @@ static int parasite_run(pid_t pid, int cmd, unsigned long ip, void *stack, + goto err_regs; + } + +- if (ptrace(cmd, pid, NULL, 
NULL)) { ++ if (ptrace(cmd, pid, addr, NULL)) { + pr_perror("Can't run parasite at %d", pid); + goto err_cont; + } +@@ -561,7 +561,7 @@ int compel_execute_syscall(struct parasite_ctl *ctl, + return -1; + } + +- err = parasite_run(pid, PTRACE_CONT, ctl->ictx.syscall_ip, 0, regs, &ctl->orig); ++ err = parasite_run(pid, PTRACE_CONT, ctl->ictx.syscall_ip, 0, regs, &ctl->orig, NULL); + if (!err) + err = parasite_trap(ctl, pid, regs, &ctl->orig); + +@@ -579,7 +579,7 @@ int compel_run_at(struct parasite_ctl *ctl, unsigned long ip, user_regs_struct_t + user_regs_struct_t regs = ctl->orig.regs; + int ret; + +- ret = parasite_run(ctl->rpid, PTRACE_CONT, ip, 0, &regs, &ctl->orig); ++ ret = parasite_run(ctl->rpid, PTRACE_CONT, ip, 0, &regs, &ctl->orig, NULL); + if (!ret) + ret = parasite_trap(ctl, ctl->rpid, ret_regs ? ret_regs : &regs, &ctl->orig); + return ret; +@@ -628,7 +628,7 @@ static int parasite_init_daemon(struct parasite_ctl *ctl) + goto err; + + regs = ctl->orig.regs; +- if (parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, ctl->rstack, &regs, &ctl->orig)) ++ if (parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, ctl->rstack, &regs, &ctl->orig, NULL)) + goto err; + + futex_wait_while_eq(&args->daemon_connected, 0); +@@ -1212,7 +1212,7 @@ static bool task_in_parasite(struct parasite_ctl *ctl, user_regs_struct_t *regs) + addr < ctl->remote_map + ctl->map_length; + } + +-static int parasite_fini_seized(struct parasite_ctl *ctl) ++static int parasite_fini_seized(struct parasite_ctl *ctl, bool customize) + { + pid_t pid = ctl->rpid; + user_regs_struct_t regs; +@@ -1257,9 +1257,38 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) + if (ret) + return -1; + ++ /* use customize ptrace */ ++ if (customize) { ++ struct trace_flag tf = { .key = pid, .flag = TRACE_SYSCALL_ENTER }; ++ struct trace_flag *tf_ptr[] = { &tf }; ++ struct bisect_meta meta = { ++ .size = 1, ++ .used = 1, ++ .__data = &tf, ++ .data = tf_ptr, ++ }; ++ ++ // TODO: compitable? 
++ ret = compel_stop_pie_customize(pid, __NR(rt_sigreturn, 0), &tf); ++ if (ret < 0) ++ return ret; ++ ++ /* The process is going to execute the required syscall, the ++ * original syscall should be forgot(set `-1`) in ++ * `syscall_trace_enter()` handler in kernel when no other ++ * else operation in tracer. ++ * ++ * Note: -1 means NO_SYSCALL which is defined in ++ * `arch/arm64/include/asm/ptrace.h`. ++ */ ++ return compel_stop_on_syscall_customize(1, ++ __NR(rt_sigreturn, 0), ++ -1, &meta); ++ } ++ + /* Go to sigreturn as closer as we can */ + ret = compel_stop_pie(pid, ctl->sigreturn_addr, &flag, +- ctl->ictx.flags & INFECT_NO_BREAKPOINTS); ++ ctl->ictx.flags & INFECT_NO_BREAKPOINTS); + if (ret < 0) + return ret; + +@@ -1279,7 +1308,7 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) + return 0; + } + +-int compel_stop_daemon(struct parasite_ctl *ctl) ++int compel_stop_daemon(struct parasite_ctl *ctl, bool customize) + { + if (ctl->daemonized) { + /* +@@ -1289,7 +1318,7 @@ int compel_stop_daemon(struct parasite_ctl *ctl) + if (ctl->tsock < 0) + return -1; + +- if (parasite_fini_seized(ctl)) { ++ if (parasite_fini_seized(ctl, customize)) { + close_safe(&ctl->tsock); + return -1; + } +@@ -1305,7 +1334,7 @@ int compel_cure_remote(struct parasite_ctl *ctl) + long ret; + int err; + +- if (compel_stop_daemon(ctl)) ++ if (compel_stop_daemon(ctl, false)) + return -1; + + if (!ctl->remote_map) +@@ -1374,7 +1403,7 @@ int compel_run_in_thread(struct parasite_thread_ctl *tctl, unsigned int cmd) + + *ctl->addr_cmd = cmd; + +- ret = parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, stack, &regs, octx); ++ ret = parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, stack, &regs, octx, NULL); + if (ret == 0) + ret = parasite_trap(ctl, pid, &regs, octx); + if (ret == 0) +@@ -1397,7 +1426,7 @@ int compel_unmap(struct parasite_ctl *ctl, unsigned long addr) + pid_t pid = ctl->rpid; + int ret = -1; + +- ret = parasite_run(pid, PTRACE_SYSCALL, addr, ctl->rstack, &regs, &ctl->orig); ++ ret 
= parasite_run(pid, PTRACE_SYSCALL, addr, ctl->rstack, &regs, &ctl->orig, NULL); + if (ret) + goto err; + +@@ -1410,6 +1439,44 @@ err: + return ret; + } + ++int compel_unmap_customize(struct parasite_ctl *ctl, unsigned long addr) ++{ ++ user_regs_struct_t regs = ctl->orig.regs; ++ pid_t pid = ctl->rpid; ++ int ret = -1; ++ struct trace_flag tf = { .key = pid, .flag = TRACE_SYSCALL_ENTER }; ++ struct trace_flag *tf_ptr[] = { &tf }; ++ struct bisect_meta meta = { ++ .size = 1, ++ .used = 1, ++ .__data = &tf, ++ .data = tf_ptr, ++ }; ++ ++ /* ++ * Here it parasite code. Unlike trap code `compel_stop_pie()`, it ++ * won't let tracee forget the original syscall. In such way, tracer ++ * just trace the syscall called by tracee. The log likes the following: ++ * ++ * [ 817.638332] set pid 1877 ptrace sysno 215 ++ * [ 817.638343] syscall_trace_enter: pid 1877 ptrace_sysno 0 current_sysno 215 ++ * [ 817.638363] (00.006280) Error (compel/src/lib/infect.c:1582): 1877 (native) is going to execute the syscall 215, required is 215 ++ * [ 817.638368] set pid 1877 ptrace sysno 0 ++ * [ 817.638402] syscall_trace_exit: pid 1877 ptrace_sysno 0 current_sysno 215 ++ */ ++ ret = parasite_run(pid, PTRACE_SYSCALL, addr, ctl->rstack, &regs, ++ &ctl->orig, (void *)(long)__NR(munmap, 0)); ++ if (ret) ++ goto err; ++ ++ ret = compel_stop_on_syscall_customize(1, __NR(munmap, 0), 0, &meta); ++ ++ if (restore_thread_ctx(pid, &ctl->orig)) ++ ret = -1; ++err: ++ return ret; ++} ++ + int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp) + { + int ret; +@@ -1445,6 +1512,17 @@ int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp) + return 0; + } + ++int compel_stop_pie_customize(pid_t pid, const int sys_nr, struct trace_flag *tf) ++{ ++ if (ptrace(PTRACE_SYSCALL, pid, sys_nr, NULL)) { ++ pr_perror("Unable to restart the %d process", pid); ++ return -1; ++ } ++ ++ tf->flag = TRACE_SYSCALL_ENTER; ++ return 0; ++} ++ + static bool task_is_trapped(int status, pid_t 
pid) + { + if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP) +@@ -1557,6 +1635,71 @@ goon: + return 0; + } + ++int compel_stop_on_syscall_customize(int tasks, const int sys_nr, ++ const int exit_sys_nr, struct bisect_meta *meta) ++{ ++ struct trace_flag *tf; ++ user_regs_struct_t regs; ++ int status, ret; ++ pid_t pid; ++ ++ while (tasks) { ++ pid = wait4(-1, &status, __WALL, NULL); ++ if (pid == -1) { ++ pr_perror("wait4 failed"); ++ return -1; ++ } ++ ++ if (!task_is_trapped(status, pid)) ++ return -1; ++ ++ tf = tf_bisect(meta, pid); ++ if (tf == NULL) { ++ pr_err("Can't find ptrace status for %d\n", pid); ++ return -1; ++ } ++ ++ switch (tf->flag) { ++ case TRACE_SYSCALL_ENTER: ++ pr_debug("%d was trapped\n", pid); ++ pr_debug("`- Expecting exit\n"); ++ ++ ret = ptrace_get_regs(pid, &regs); ++ if (ret) { ++ pr_perror("ptrace"); ++ return -1; ++ } ++ ++ if (is_required_syscall(&regs, pid, sys_nr, sys_nr)) { ++ ret = ptrace(PTRACE_SYSCALL, pid, exit_sys_nr, NULL); ++ if (ret) { ++ pr_perror("ptrace"); ++ return -1; ++ } ++ tf->flag = TRACE_SYSCALL_EXIT; ++ } else { ++ pr_warn("Impossible condition, check the system, try our best to restore...\n"); ++ ret = ptrace(PTRACE_SYSCALL, pid, sys_nr, NULL); ++ if (ret) { ++ pr_perror("ptrace"); ++ return -1; ++ } ++ } ++ break; ++ case TRACE_SYSCALL_EXIT: ++ pr_debug("%d was stopped\n", pid); ++ tasks--; ++ break; ++ ++ default: ++ pr_err("pid %d invalid status: %d\n", pid, tf->flag); ++ return -1; ++ } ++ } ++ ++ return 0; ++} ++ + int compel_mode_native(struct parasite_ctl *ctl) + { + return user_regs_native(&ctl->orig.regs); +diff --git a/criu/config.c b/criu/config.c +index 322a79e..080d734 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -531,6 +531,7 @@ int parse_options(int argc, char **argv, bool *usage_error, + {"reserve-ports", required_argument, 0, 'P' }, + BOOL_OPT("use-nft", &opts.use_nft), + BOOL_OPT("parallel", &opts.parallel), ++ BOOL_OPT("customize-ptrace", &opts.customize_ptrace), + { }, + }; + +diff 
--git a/criu/cr-dump.c b/criu/cr-dump.c +index f203615..582706a 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -1421,7 +1421,7 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + goto err_cure; + } + +- ret = compel_stop_daemon(parasite_ctl); ++ ret = compel_stop_daemon(parasite_ctl, opts.customize_ptrace); + if (ret) { + pr_err("Can't stop daemon in parasite (pid: %d)\n", pid); + goto err_cure; +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 130ad20..49b15c4 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -2060,6 +2060,64 @@ static int catch_tasks(bool root_seized, enum trace_flags *flag) + return 0; + } + ++static int cache_tasks_customize(bool root_seized, struct bisect_meta *meta) ++{ ++ struct pstree_item *item; ++ struct trace_flag *tf; ++ ++ for_each_pstree_item(item) { ++ int status, i, ret; ++ pid_t pid; ++ ++ if (!task_alive(item)) ++ continue; ++ ++ if (item->nr_threads == 1) { ++ item->threads[0].real = item->pid->real; ++ } else { ++ if (parse_threads(item->pid->real, &item->threads, &item->nr_threads)) ++ return -1; ++ } ++ ++ for (i = 0; i < item->nr_threads; i++) { ++ pid = item->threads[i].real; ++ ++ if (ptrace(PTRACE_INTERRUPT, pid, 0, 0)) { ++ pr_perror("Can't interrupt the %d task", pid); ++ return -1; ++ } ++ ++ tf = tf_insert(meta, pid); ++ if (tf == NULL) { ++ pr_err("Can't find trace flag for %d, used %d\n", ++ pid, meta->used); ++ return -1; ++ } ++ tf->flag = TRACE_INTERRUPT; ++ } ++ ++ for (i = 0; i < item->nr_threads; i++) { ++ pid = wait4(-1, &status, __WALL, NULL); ++ ++ tf = tf_bisect(meta, pid); ++ if (tf == NULL) { ++ pr_err("Can't find trace flag for %d, used %d\n", ++ pid, meta->used); ++ return -1; ++ } ++ ++ ret = compel_stop_pie_customize(pid, ++ __NR(rt_sigreturn, 0), ++ tf); ++ if (ret < 0) ++ return -1; ++ ++ } ++ } ++ ++ return 0; ++} ++ + static int clear_breakpoints() + { + struct pstree_item *item; +@@ -2086,6 +2144,7 @@ static void finalize_restore(void) + 
pid_t pid = item->pid->real; + struct parasite_ctl *ctl; + unsigned long restorer_addr; ++ int retval; + + if (!task_alive(item)) + continue; +@@ -2096,7 +2155,12 @@ static void finalize_restore(void) + continue; + + restorer_addr = (unsigned long)rsti(item)->munmap_restorer; +- if (compel_unmap(ctl, restorer_addr)) ++ if (!opts.customize_ptrace) ++ retval = compel_unmap(ctl, restorer_addr); ++ else ++ retval = compel_unmap_customize(ctl, restorer_addr); ++ ++ if (retval) + pr_err("Failed to unmap restorer from %d\n", pid); + + xfree(ctl); +@@ -2201,11 +2265,18 @@ static int write_restored_pid(void) + + static int restore_root_task(struct pstree_item *init) + { ++ struct bisect_meta tfs_meta; + enum trace_flags flag = TRACE_ALL; + int ret, fd, mnt_ns_fd = -1; + int root_seized = 0; + struct pstree_item *item; + ++ if (opts.customize_ptrace ++ && tf_create(&tfs_meta, task_entries->nr_threads) != 0) { ++ pr_err("Can't alloc memory, tf_create failed\n"); ++ return -1; ++ } ++ + ret = run_scripts(ACT_PRE_RESTORE); + if (ret != 0) { + pr_err("Aborting restore due to pre-restore script ret code %d\n", ret); +@@ -2406,7 +2477,12 @@ skip_ns_bouncing: + + timing_stop(TIME_RESTORE); + +- if (catch_tasks(root_seized, &flag)) { ++ if (!opts.customize_ptrace) ++ ret = catch_tasks(root_seized, &flag); ++ else ++ ret = cache_tasks_customize(root_seized, &tfs_meta); ++ ++ if (ret) { + pr_err("Can't catch all tasks\n"); + goto out_kill_network_unlocked; + } +@@ -2416,8 +2492,14 @@ skip_ns_bouncing: + + __restore_switch_stage(CR_STATE_COMPLETE); + +- ret = compel_stop_on_syscall(task_entries->nr_threads, +- __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag); ++ if (!opts.customize_ptrace) { ++ ret = compel_stop_on_syscall(task_entries->nr_threads, ++ __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag); ++ } else { ++ ret = compel_stop_on_syscall_customize(task_entries->nr_threads, ++ __NR(rt_sigreturn, 0), ++ -1, &tfs_meta); ++ } + if (ret) { + pr_err("Can't stop all tasks on 
rt_sigreturn\n"); + goto out_kill_network_unlocked; +@@ -2453,6 +2535,9 @@ skip_ns_bouncing: + if (!opts.restore_detach && !opts.exec_cmd) + wait(NULL); + ++ if (opts.customize_ptrace) ++ tf_destroy(&tfs_meta); ++ + return 0; + + out_kill_network_unlocked: +@@ -2487,6 +2572,10 @@ out: + stop_usernsd(); + __restore_switch_stage(CR_STATE_FAIL); + pr_err("Restoring FAILED.\n"); ++ ++ if (opts.customize_ptrace) ++ tf_destroy(&tfs_meta); ++ + return -1; + } + +diff --git a/criu/crtools.c b/criu/crtools.c +index e39dc07..81e21d1 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -435,6 +435,7 @@ usage: + " --reserve-ports Reserve src ports in kernel\n" + " --use-nft Use nft API instead of iptables cmd in network locking\n" + " --parallel Parallel to accellrate dumping speed\n\n" ++" --customize-ptrace Use customize ptrace(PTRACE_SYSCALL)\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 7575929..1807a89 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -158,6 +158,7 @@ struct cr_options { + int reserve_ports; + int use_nft; + int parallel; ++ int customize_ptrace; + }; + + extern struct cr_options opts; +-- +2.34.0 + diff --git a/criu.spec b/criu.spec index 9f090af..e87ee50 100644 --- a/criu.spec +++ b/criu.spec @@ -1,5 +1,5 @@ Name: criu -Version: 3.13 +Version: 3.14 Release: 10 Provides: crtools = %{version}-%{release} Obsoletes: crtools <= 1.0-2 @@ -8,25 +8,87 @@ License: GPLv2 URL: http://criu.org/ Source0: http://download.openvz.org/criu/criu-%{version}.tar.bz2 BuildRequires: systemd libnet-devel asciidoc xmlto perl-interpreter libselinux-devel -BuildRequires: protobuf-devel protobuf-c-devel python3-devel libnl3-devel libcap-devel +BuildRequires: gcc bc hostname +BuildRequires: protobuf-devel protobuf-c-devel python3-devel libnl3-devel libcap-devel libmnl-devel libnftnl-devel Recommends: tar 
ExclusiveArch: x86_64 %{arm} ppc64le aarch64 s390x Requires: %{name} = %{version}-%{release} Provides: %{name}-libs = %{version}-%{release} Obsoletes: %{name}-libs < %{version}-%{release} -Patch0001: 0001-Fix-crit-encode-TypeError.patch -Patch0002: 0002-Fix-crit-info-struct-unpack-error.patch -Patch0003: 0003-Fix-crit-x-UnicodeDecodeError.patch -Patch0004: 0004-kerndat-detect-if-system-support-clone3-with-set_tid.patch -Patch0005: 0005-Add-assembler-wrapper-for-clone3.patch -Patch0006: 0006-Use-clone3-with-set_tid-to-create-processes.patch -Patch0007: 0007-clone3-handle-clone3-with-CLONE_PARENT.patch -Patch0008: 0008-aarch64-use-clone3-if-possible.patch -Patch0009: 0009-criu-dump-and-restore-cpu-affinity-of-each-thread.patch -Patch0010: 0010-vdso-fix-segmentation-fault-caused-by-char-pointer-a.patch -Patch0011: 0011-vdso-use-correct-offsets-to-remap-vdso-and-vvar-mapp.patch -Patch0012: 0012-add-pin-memory-method-for-criu.patch +Patch: 0001-Fix-crit-info-struct-unpack-error.patch +Patch: 0002-Fix-crit-x-UnicodeDecodeError.patch +Patch: 0003-kerndat-detect-if-system-support-clone3-with-set_tid.patch +Patch: 0004-Add-assembler-wrapper-for-clone3.patch +Patch: 0005-Use-clone3-with-set_tid-to-create-processes.patch +Patch: 0006-clone3-handle-clone3-with-CLONE_PARENT.patch +Patch: 0007-aarch64-use-clone3-if-possible.patch +Patch: 0008-criu-dump-and-restore-cpu-affinity-of-each-thread.patch +Patch: 0009-vdso-fix-segmentation-fault-caused-by-char-pointer-a.patch +Patch: 0010-vdso-use-correct-offsets-to-remap-vdso-and-vvar-mapp.patch +Patch: 0011-criu-fix-build-failure-against-gcc-10.patch +Patch: 0012-protobuf-remove-leading-underscores-from-protobuf-st.patch +Patch: 0013-images-regfile.proto-adds-additional-fields-to-RegFi.patch +Patch: 0014-criu-files-reg.c-add-build-id-validation-functionali.patch +Patch: 0015-criu-Kill-tasks-even-when-the-network-is-unlocked.patch +Patch: 0016-cr-restore-Warn-if-restorer-can-t-be-unmapped.patch +Patch: 
0017-compel-infect-Warn-if-close-failed-on-memfd.patch +Patch: 0018-lib-infect-Check-if-compel-succeed-in-executing-munm.patch +Patch: 0019-cr-dump-Try-to-cure-remote-on-err-pathes.patch +Patch: 0020-cr-dump-Warn-if-unmapping-local-memfd-failed.patch +Patch: 0021-parasite-syscall-Log-if-can-t-cure-on-failed-infecti.patch +Patch: 0022-compel-criu-Add-__must_check.patch +%ifarch aarch64 +Patch: backport-0001--build-add-secure-compilation-options.patch +Patch: backport-0002--tty-fix-NULL-pointer-access-in-tty.patch +Patch: backport-0003--namespaces-drop-func-address-print-to-make-someone-h.patch +Patch: backport-0004--mm-add-pin-memory-method-for-criu.patch +Patch: backport-0005--pid-add-pid-recover-method-for-criu.patch +Patch: backport-0006--notifier-add-notifier-calling-method-for-checkpoint-.patch +Patch: backport-0007--cred-provide-cred-checkpoint-restore-method.patch +Patch: backport-0008--block-device-dump-block-device-as-reguler-file.patch +Patch: backport-0009--anon-inode-add-support-for-anon-inode-fd.patch +Patch: backport-0010--char_dev-add-support-for-char-device-dump-and-restor.patch +Patch: backport-0011--socket-fix-connect-error-of-invalid-param.patch +Patch: backport-0012--criu-eventpollfd-fix-for-improper-usage-in-appdata.patch +Patch: backport-0013--task_exit_notify-add-task-exit-notify-mask-method-fo.patch +Patch: backport-0014--selinux-fix-selinux-context-lable-check.patch +Patch: backport-0015--unix-socket-add-support-for-unix-stream-socket.patch +Patch: backport-0016--save-and-restore-sigev_notify_thread_id.patch +Patch: backport-0017--sysvshm-add-dump-restore-sysv-shm-in-host-ipc-ns.patch +Patch: backport-0018--add-netlink-repair-modes.patch +Patch: backport-0019--ignore-special-page-dump.patch +Patch: backport-0020--add-O_REPAIR-flag-to-vma-fd.patch +Patch: backport-0021--file-lock-add-repair-mode-to-dump-file-locks.patch +Patch: backport-0022--unlock-network-when-restore-fails.patch +Patch: 
backport-0023--net-add-shared-socket-recover-method-for-criu.patch +Patch: backport-0024--clean-repair-res-when-dump-fail.patch +Patch: backport-0025--save-src-ports-to-ip_local_reserved_ports-when-dump-.patch +Patch: backport-0026--fix-dump-fail-problem-with-null-seek-op.patch +Patch: backport-0027--fix-dump-fail-problem-with-no-access-to-get-socket-f.patch +Patch: backport-0028--proc-parse-fix-vma-offset-value-for-the-sysfs-file-o.patch +Patch: backport-0029--looser-file-mode-and-size-check.patch +Patch: backport-0030--add-reuse-file-method-for-recover-deleted-file-state.patch +Patch: backport-0031--fix-share-sockets-repair-problem.patch +Patch: backport-0032--nftables-add-mnl-api.patch +Patch: backport-0033--nftables-implement-nft-api-for-tcp.patch +Patch: backport-0034--nftables-implement-nft-api-for-lock-net-ns.patch +Patch: backport-0035--criu-switch-to-nftables-api.patch +Patch: backport-0036--remove-sigaction-handler-register-in-restorer.patch +Patch: backport-0037--remove-ignore_special_dump-option.patch +Patch: backport-0038--add-clear-pin-mem-and-init-page-map-option.patch +Patch: backport-0039--mmap-restore-dev-hisi_sec2-deivce-vma.patch +Patch: backport-0040--fix-fds-list-restore-and-rollback-problem.patch +Patch: backport-0041--log-print-error-log-to-dev-kmsg.patch +Patch: backport-0042--improve-char-dev-fd-check-and-repair-method.patch +Patch: backport-0043--unix-sk-improve-dgram-robustness.patch +Patch: backport-0044--sk-ignore-the-bind-error-for-icmp-socket.patch +Patch: backport-0045--infiniband-fix-the-infiniband-fd-conflict.patch +Patch: backport-0046--optimization-parallel-collecting-vmas.patch +Patch: backport-0047--dump-ignore-children-exit-to-accelerate-speed.patch +Patch: backport-0048--parallel-parallel-nft-delete-set.patch +Patch: backport-0049--ptrace-trace-specific-syscall.patch +%endif %description Checkpoint/Restore in Userspace(CRIU),is a software tool for the linux operating system. 
@@ -100,6 +162,15 @@ chmod 0755 %{buildroot}/run/%{name}/ %doc %{_mandir}/man1/{compel.1*,crit.1*} %changelog +* Wed Dec 1 2021 fu.lin - 3.13-14 +- add buildrequires gcc bc hostname +- backports: + * fix multiple build {warnings, errors} on high gcc version + * add compiler attribute to force check compel functions + * kill tasks even when the network is unlocked in restoration progress + * add build-id validation functionality +- kinds of feature/bugfix for the module upgrade of OceanStor Dorado + * Mon Mar 1 2021 Jingxian He - 3.13-10 - Add pin memory method for criu -- Gitee