From 9c2b383d61a8d63e46345d16429f51da43c409ec Mon Sep 17 00:00:00 2001 From: "fu.lin" Date: Wed, 13 Apr 2022 15:05:10 +0800 Subject: [PATCH] criu: backport kinds of features/bugfix Signed-off-by: fu.lin --- ...-restore-cpu-affinity-of-each-thread.patch | 6 +- ...syscall-into-compel-std-plugin-sysca.patch | 18 +- ...r-rseq-syscall-support-Signed-off-by.patch | 6 +- ...nd_ptrace_attach-helper-from-cr-chec.patch | 10 +- ...ace-rseq-conf-dump-feature-Add-get_r.patch | 12 +- ...port-TODO-1.-properly-handle-case-wh.patch | 22 +- ...test-for-rseq-C-R-Signed-off-by-Alex.patch | 8 +- ...whide-based-test-on-Cirrus-We-have-a.patch | 6 +- ...ad_pointer.h-from-Glibc-Implementati.patch | 16 +- ...egister-rseq-at-the-thread-start-for.patch | 8 +- ...00-fix-rseq-test-when-linking-with-a.patch | 8 +- ...rs-to-get-set-instruction-pointer-Si.patch | 22 +- ...read-IP-when-inside-rseq-cs-Signed-o.patch | 8 +- ...ansition-test-for-amd64-Signed-off-b.patch | 8 +- ...seq-flags-field-Userspace-may-config.patch | 10 +- ...transition-test-with-NO_RESTART-CS-f.patch | 10 +- ...fix-zdtm-static-maps00-case-in-arm64.patch | 56 + ...-flush-ipt-rules-after-program-exits.patch | 52 + ...zdtm-fix-cleaning-step-of-zdtm_netns.patch | 48 + 0020-mm-add-pin-memory-method-for-criu.patch | 453 ++++ ...-pid-add-pid-recover-method-for-criu.patch | 213 ++ ...ifier-calling-method-for-checkpoint-.patch | 621 +++++ ...ce-dump-block-device-as-reguler-file.patch | 62 + ...-inode-add-support-for-anon-inode-fd.patch | 316 +++ ...port-for-char-device-dump-and-restor.patch | 784 ++++++ ...-char-dev-fd-check-and-repair-method.patch | 74 + ...map-restore-dev-hisi_sec2-deivce-vma.patch | 472 ++++ ...iband-fix-the-infiniband-fd-conflict.patch | 223 ++ ...ovide-cred-checkpoint-restore-method.patch | 255 ++ ...t-fix-connect-error-of-invalid-param.patch | 93 + ...fd-fix-for-improper-usage-in-appdata.patch | 99 + ...-add-task-exit-notify-mask-method-fo.patch | 193 ++ ...t-add-support-for-unix-stream-socket.patch | 403 ++++ 
...ir-modes-and-clear-resource-when-fai.patch | 104 + ...dump-restore-sysv-shm-in-host-ipc-ns.patch | 114 + 0036-add-O_REPAIR-flag-to-vma-fd.patch | 47 + 0037-looser-file-mode-and-size-check.patch | 90 + ...k-add-repair-mode-to-dump-file-locks.patch | 308 +++ 0039-unlock-network-when-restore-fails.patch | 60 + ...hared-socket-recover-method-for-criu.patch | 332 +++ ...ts-to-ip_local_reserved_ports-when-d.patch | 273 +++ ...-dump-fail-problem-with-null-seek-op.patch | 45 + ...oblem-with-no-access-to-get-socket-f.patch | 39 + ...ma-offset-value-for-the-sysfs-file-o.patch | 139 ++ ...ethod-for-recover-deleted-file-state.patch | 244 ++ ...-sk-fix-share-sockets-repair-problem.patch | 133 ++ ...ear-pin-mem-and-init-page-map-option.patch | 107 + 0048-fds-fix-fds-list-restore.patch | 37 + 0049-log-print-error-log-to-dev-kmsg.patch | 88 + 0050-unix-sk-improve-dgram-robustness.patch | 83 + ...gnore-the-bind-error-for-icmp-socket.patch | 46 + ...ptimization-parallel-collecting-vmas.patch | 505 ++++ ...-mm-add-exec-file-mapping-pin-method.patch | 120 + 0054-ptrace-trace-specific-syscall.patch | 774 ++++++ ...tifier-rollback-when-open-img-failed.patch | 150 ++ ...l-task-when-ptrace-PTRACE_DETACH-ret.patch | 38 + ...build-add-secure-compilation-options.patch | 114 + 0058-nftables-add-mnl-api.patch | 283 +++ 0059-nftables-implement-nft-api-for-tcp.patch | 1011 ++++++++ 0060-net-switch-to-nftables-API.patch | 55 + 0061-zdtm-unlink-kdat-before-testing.patch | 46 + 0062-zdtm-add-host-ns-sysvshm-ipc-case.patch | 302 +++ 0063-zdtm-add-pinmem-testcase.patch | 2091 +++++++++++++++++ 0064-zdtm-init-notifier-testcase.patch | 620 +++++ ...rno-info-when-accessing-.out-failure.patch | 35 + 0066-zdtm-print-more-info-for-fs.c.patch | 43 + 0067-zdtm-add-chardev-testcase.patch | 288 +++ 0068-zdtm-add-infiniband-testcase.patch | 256 ++ 0069-zdtm-add-share-port-testcase.patch | 145 ++ 0070-zdtm-tmp-test-script.patch | 59 + 0071-mod-add-criu-indepent-test.patch | 512 ++++ 
0072-kabichk-add-KABI-check-code.patch | 611 +++++ criu.changes | 25 + criu.spec | 113 +- 74 files changed, 14954 insertions(+), 126 deletions(-) rename 0003-kerndat-check-for-rseq-syscall-support.patch => 0003-kerndat-check-for-rseq-syscall-support-Signed-off-by.patch (91%) rename 0005-cr-check-Add-ptrace-rseq-conf-dump-feature.patch => 0005-cr-check-Add-ptrace-rseq-conf-dump-feature-Add-get_r.patch (93%) rename 0006-rseq-initial-support.patch => 0006-rseq-initial-support-TODO-1.-properly-handle-case-wh.patch (97%) rename 0007-zdtm-add-simple-test-for-rseq-C-R.patch => 0007-zdtm-add-simple-test-for-rseq-C-R-Signed-off-by-Alex.patch (96%) rename 0008-ci-add-Fedora-Rawhide-based-test-on-Cirrus.patch => 0008-ci-add-Fedora-Rawhide-based-test-on-Cirrus-We-have-a.patch (96%) rename 0009-include-add-thread_pointer.h-from-Glibc.patch => 0009-include-add-thread_pointer.h-from-Glibc-Implementati.patch (95%) rename 0012-compel-add-helpers-to-get-set-instruction-pointer.patch => 0012-compel-add-helpers-to-get-set-instruction-pointer-Si.patch (94%) rename 0013-cr-dump-fixup-thread-IP-when-inside-rseq-cs.patch => 0013-cr-dump-fixup-thread-IP-when-inside-rseq-cs-Signed-o.patch (97%) rename 0014-zdtm-add-rseq-transition-test-for-amd64.patch => 0014-zdtm-add-rseq-transition-test-for-amd64-Signed-off-b.patch (96%) rename 0015-cr-dump-handle-rseq-flags-field.patch => 0015-cr-dump-handle-rseq-flags-field-Userspace-may-config.patch (97%) create mode 100644 0017-zdtm-fix-zdtm-static-maps00-case-in-arm64.patch create mode 100644 0018-test-flush-ipt-rules-after-program-exits.patch create mode 100644 0019-zdtm-fix-cleaning-step-of-zdtm_netns.patch create mode 100644 0020-mm-add-pin-memory-method-for-criu.patch create mode 100644 0021-pid-add-pid-recover-method-for-criu.patch create mode 100644 0022-notifier-add-notifier-calling-method-for-checkpoint-.patch create mode 100644 0023-block-device-dump-block-device-as-reguler-file.patch create mode 100644 
0024-anon-inode-add-support-for-anon-inode-fd.patch create mode 100644 0025-char_dev-add-support-for-char-device-dump-and-restor.patch create mode 100644 0026-improve-char-dev-fd-check-and-repair-method.patch create mode 100644 0027-mmap-restore-dev-hisi_sec2-deivce-vma.patch create mode 100644 0028-infiniband-fix-the-infiniband-fd-conflict.patch create mode 100644 0029-cred-provide-cred-checkpoint-restore-method.patch create mode 100644 0030-socket-fix-connect-error-of-invalid-param.patch create mode 100644 0031-criu-eventpollfd-fix-for-improper-usage-in-appdata.patch create mode 100644 0032-task_exit_notify-add-task-exit-notify-mask-method-fo.patch create mode 100644 0033-unix-socket-add-support-for-unix-stream-socket.patch create mode 100644 0034-netlink-add-repair-modes-and-clear-resource-when-fai.patch create mode 100644 0035-sysvshm-add-dump-restore-sysv-shm-in-host-ipc-ns.patch create mode 100644 0036-add-O_REPAIR-flag-to-vma-fd.patch create mode 100644 0037-looser-file-mode-and-size-check.patch create mode 100644 0038-file-lock-add-repair-mode-to-dump-file-locks.patch create mode 100644 0039-unlock-network-when-restore-fails.patch create mode 100644 0040-net-add-shared-socket-recover-method-for-criu.patch create mode 100644 0041-tcp-save-src-ports-to-ip_local_reserved_ports-when-d.patch create mode 100644 0042-reg-file-fix-dump-fail-problem-with-null-seek-op.patch create mode 100644 0043-fix-dump-fail-problem-with-no-access-to-get-socket-f.patch create mode 100644 0044-proc-parse-fix-vma-offset-value-for-the-sysfs-file-o.patch create mode 100644 0045-add-reuse-file-method-for-recover-deleted-file-state.patch create mode 100644 0046-sk-fix-share-sockets-repair-problem.patch create mode 100644 0047-mm-add-clear-pin-mem-and-init-page-map-option.patch create mode 100644 0048-fds-fix-fds-list-restore.patch create mode 100644 0049-log-print-error-log-to-dev-kmsg.patch create mode 100644 0050-unix-sk-improve-dgram-robustness.patch create mode 100644 
0051-sk-ignore-the-bind-error-for-icmp-socket.patch create mode 100644 0052-optimization-parallel-collecting-vmas.patch create mode 100644 0053-mm-add-exec-file-mapping-pin-method.patch create mode 100644 0054-ptrace-trace-specific-syscall.patch create mode 100644 0055-notifier-rollback-when-open-img-failed.patch create mode 100644 0056-detach-don-t-kill-task-when-ptrace-PTRACE_DETACH-ret.patch create mode 100644 0057-build-add-secure-compilation-options.patch create mode 100644 0058-nftables-add-mnl-api.patch create mode 100644 0059-nftables-implement-nft-api-for-tcp.patch create mode 100644 0060-net-switch-to-nftables-API.patch create mode 100644 0061-zdtm-unlink-kdat-before-testing.patch create mode 100644 0062-zdtm-add-host-ns-sysvshm-ipc-case.patch create mode 100644 0063-zdtm-add-pinmem-testcase.patch create mode 100644 0064-zdtm-init-notifier-testcase.patch create mode 100644 0065-zdtm-print-errno-info-when-accessing-.out-failure.patch create mode 100644 0066-zdtm-print-more-info-for-fs.c.patch create mode 100644 0067-zdtm-add-chardev-testcase.patch create mode 100644 0068-zdtm-add-infiniband-testcase.patch create mode 100644 0069-zdtm-add-share-port-testcase.patch create mode 100644 0070-zdtm-tmp-test-script.patch create mode 100644 0071-mod-add-criu-indepent-test.patch create mode 100644 0072-kabichk-add-KABI-check-code.patch create mode 100644 criu.changes diff --git a/0001-criu-dump-and-restore-cpu-affinity-of-each-thread.patch b/0001-criu-dump-and-restore-cpu-affinity-of-each-thread.patch index 474e4c9..5ad6a5c 100644 --- a/0001-criu-dump-and-restore-cpu-affinity-of-each-thread.patch +++ b/0001-criu-dump-and-restore-cpu-affinity-of-each-thread.patch @@ -1,7 +1,7 @@ -From 4a49af49be378835b65016d5465eae44107a52e1 Mon Sep 17 00:00:00 2001 +From 746a5dd20bb688e1d830e216059e1de7e59186a3 Mon Sep 17 00:00:00 2001 From: "fu.lin" Date: Tue, 13 Apr 2021 10:39:45 +0800 -Subject: [PATCH 4/6] criu: dump and restore cpu affinity of each thread +Subject: [PATCH 01/72] 
criu: dump and restore cpu affinity of each thread Criu should dump and restore threads' or processes' cpu affinity. @@ -416,5 +416,5 @@ index 0000000..0d0b8ae @@ -0,0 +1 @@ +{'dopts': '', 'ropts': '--with-cpu-affinity', 'flags': 'reqrst '} -- -2.27.0 +2.34.1 diff --git a/0002-compel-add-rseq-syscall-into-compel-std-plugin-sysca.patch b/0002-compel-add-rseq-syscall-into-compel-std-plugin-sysca.patch index ac103f4..5f72eb9 100644 --- a/0002-compel-add-rseq-syscall-into-compel-std-plugin-sysca.patch +++ b/0002-compel-add-rseq-syscall-into-compel-std-plugin-sysca.patch @@ -1,18 +1,18 @@ -From ee46b1b5755eacf3be02a67934f0dc690293745b Mon Sep 17 00:00:00 2001 +From dc6dbe893f7a8b644b655a56e4a0edfb854c577f Mon Sep 17 00:00:00 2001 From: bb-cat Date: Wed, 2 Mar 2022 13:28:51 +0800 -Subject: [PATCH 02/16] compel: add rseq syscall into compel std plugin syscall +Subject: [PATCH 02/72] compel: add rseq syscall into compel std plugin syscall tables Add rseq syscall numbers for: arm/aarch64, mips64, ppc64le, s390, x86_64/x86 Signed-off-by: Alexander Mikhalitsyn --- - compel/arch/arm/plugins/std/syscalls/syscall.def | 1 + - compel/arch/mips/plugins/std/syscalls/syscall_64.tbl | 1 + - .../compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl | 1 + - .../compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl | 1 + - compel/arch/x86/plugins/std/syscalls/syscall_32.tbl | 1 + - compel/arch/x86/plugins/std/syscalls/syscall_64.tbl | 1 + + compel/arch/arm/plugins/std/syscalls/syscall.def | 1 + + compel/arch/mips/plugins/std/syscalls/syscall_64.tbl | 1 + + compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl | 1 + + compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl | 1 + + compel/arch/x86/plugins/std/syscalls/syscall_32.tbl | 1 + + compel/arch/x86/plugins/std/syscalls/syscall_64.tbl | 1 + 6 files changed, 6 insertions(+) diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def @@ -70,5 +70,5 @@ index c1d119d..323fab1 
100644 __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) +__NR_rseq 334 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) -- -2.30.0 +2.34.1 diff --git a/0003-kerndat-check-for-rseq-syscall-support.patch b/0003-kerndat-check-for-rseq-syscall-support-Signed-off-by.patch similarity index 91% rename from 0003-kerndat-check-for-rseq-syscall-support.patch rename to 0003-kerndat-check-for-rseq-syscall-support-Signed-off-by.patch index 1729b14..4a6ebc1 100644 --- a/0003-kerndat-check-for-rseq-syscall-support.patch +++ b/0003-kerndat-check-for-rseq-syscall-support-Signed-off-by.patch @@ -1,7 +1,7 @@ -From ebd917f395b8bb3c4d6bbe51f9210d1aeca2e1fd Mon Sep 17 00:00:00 2001 +From 35053ab4bb8fe09818da9421a053e2e13c7ad817 Mon Sep 17 00:00:00 2001 From: bb-cat Date: Wed, 2 Mar 2022 13:34:10 +0800 -Subject: [PATCH 03/16] kerndat: check for rseq syscall support Signed-off-by: +Subject: [PATCH 03/72] kerndat: check for rseq syscall support Signed-off-by: Alexander Mikhalitsyn --- @@ -58,5 +58,5 @@ index 0e88ba4..f5a4490 100644 kerndat_mmap_min_addr(); kerndat_files_stat(); -- -2.30.0 +2.34.1 diff --git a/0004-util-move-fork_and_ptrace_attach-helper-from-cr-chec.patch b/0004-util-move-fork_and_ptrace_attach-helper-from-cr-chec.patch index 51457c6..ff73bad 100644 --- a/0004-util-move-fork_and_ptrace_attach-helper-from-cr-chec.patch +++ b/0004-util-move-fork_and_ptrace_attach-helper-from-cr-chec.patch @@ -1,13 +1,13 @@ -From fe1f84eb98092b1aff60ae2be11e351b165f3f43 Mon Sep 17 00:00:00 2001 +From 30381c725f7c6738bd0df0f822aace1e66065b65 Mon Sep 17 00:00:00 2001 From: bb-cat Date: Wed, 2 Mar 2022 13:35:53 +0800 -Subject: [PATCH 04/16] util: move fork_and_ptrace_attach helper from cr-check +Subject: [PATCH 04/72] util: move fork_and_ptrace_attach helper from cr-check Signed-off-by: Alexander Mikhalitsyn --- - criu/cr-check.c | 55 ------------------------------- + criu/cr-check.c | 55 ------------------------------------------- 
criu/include/util.h | 1 + - criu/util.c | 57 +++++++++++++++++++++++++++++++++ + criu/util.c | 57 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 55 deletions(-) diff --git a/criu/cr-check.c b/criu/cr-check.c @@ -157,5 +157,5 @@ index 06124c2..e682161 100644 { char c = 0; -- -2.30.0 +2.34.1 diff --git a/0005-cr-check-Add-ptrace-rseq-conf-dump-feature.patch b/0005-cr-check-Add-ptrace-rseq-conf-dump-feature-Add-get_r.patch similarity index 93% rename from 0005-cr-check-Add-ptrace-rseq-conf-dump-feature.patch rename to 0005-cr-check-Add-ptrace-rseq-conf-dump-feature-Add-get_r.patch index 5a82e08..0375b6d 100644 --- a/0005-cr-check-Add-ptrace-rseq-conf-dump-feature.patch +++ b/0005-cr-check-Add-ptrace-rseq-conf-dump-feature-Add-get_r.patch @@ -1,16 +1,16 @@ -From 3c567693f2e6579109dbabcca0e90c059ce5af25 Mon Sep 17 00:00:00 2001 +From f84bab6b29146ef7fb9867af0324efb90596e12c Mon Sep 17 00:00:00 2001 From: bb-cat Date: Wed, 2 Mar 2022 15:30:18 +0800 -Subject: [PATCH 05/16] cr-check: Add ptrace rseq conf dump feature Add +Subject: [PATCH 05/72] cr-check: Add ptrace rseq conf dump feature Add "get_rseq_conf" feature corresponding to the ptrace(PTRACE_GET_RSEQ_CONFIGURATION) support. 
Signed-off-by: Alexander Mikhalitsyn --- - compel/include/uapi/ptrace.h | 12 +++++++ - criu/cr-check.c | 11 +++++++ + compel/include/uapi/ptrace.h | 12 +++++++++++ + criu/cr-check.c | 11 ++++++++++ criu/include/kerndat.h | 1 + - criu/kerndat.c | 41 ++++++++++++++++++++++++ + criu/kerndat.c | 41 ++++++++++++++++++++++++++++++++++++ 4 files changed, 65 insertions(+) diff --git a/compel/include/uapi/ptrace.h b/compel/include/uapi/ptrace.h @@ -158,5 +158,5 @@ index f5a4490..4841387 100644 kerndat_mmap_min_addr(); kerndat_files_stat(); -- -2.30.0 +2.34.1 diff --git a/0006-rseq-initial-support.patch b/0006-rseq-initial-support-TODO-1.-properly-handle-case-wh.patch similarity index 97% rename from 0006-rseq-initial-support.patch rename to 0006-rseq-initial-support-TODO-1.-properly-handle-case-wh.patch index 4c68985..4c19ba5 100644 --- a/0006-rseq-initial-support.patch +++ b/0006-rseq-initial-support-TODO-1.-properly-handle-case-wh.patch @@ -1,7 +1,7 @@ -From e444c089ebfb03fb2b6d69a40322d31ab33c0597 Mon Sep 17 00:00:00 2001 +From c905adf3aaa116984e28a51700c53917f3651e3b Mon Sep 17 00:00:00 2001 From: bb-cat Date: Wed, 2 Mar 2022 14:52:35 +0800 -Subject: [PATCH 06/16] rseq: initial support TODO: 1. properly handle case +Subject: [PATCH 06/72] rseq: initial support TODO: 1. properly handle case when the kernel has rseq() support but has no ptrace(PTRACE_GET_RSEQ_CONFIGURATION) support and user processes haven't used rseq(). 2. 
properly handle "transient" states, when CRIU comes during rseq @@ -14,19 +14,19 @@ Reported-by: Radostin Stoyanov Suggested-by: Florian Weimer Signed-off-by: Alexander Mikhalitsyn --- - compel/include/uapi/ptrace.h | 16 +-- - criu/cr-dump.c | 99 ++++++++++++++++ - criu/cr-restore.c | 17 +++ - criu/include/linux/rseq.h | 144 +++++++++++++++++++++++ + compel/include/uapi/ptrace.h | 16 ++-- + criu/cr-dump.c | 99 ++++++++++++++++++++++++ + criu/cr-restore.c | 17 +++++ + criu/include/linux/rseq.h | 144 +++++++++++++++++++++++++++++++++++ criu/include/parasite.h | 7 ++ criu/include/restorer.h | 7 ++ criu/kerndat.c | 2 +- - criu/parasite-syscall.c | 11 ++ - criu/pie/parasite.c | 99 ++++++++++++++++ - criu/pie/restorer.c | 24 ++++ + criu/parasite-syscall.c | 11 +++ + criu/pie/parasite.c | 99 ++++++++++++++++++++++++ + criu/pie/restorer.c | 24 ++++++ images/Makefile | 1 + images/core.proto | 2 + - images/rseq.proto | 9 ++ + images/rseq.proto | 9 +++ 13 files changed, 429 insertions(+), 9 deletions(-) create mode 100644 criu/include/linux/rseq.h create mode 100644 images/rseq.proto @@ -698,5 +698,5 @@ index 0000000..be28004 + required uint32 signature = 3; +} -- -2.30.0 +2.34.1 diff --git a/0007-zdtm-add-simple-test-for-rseq-C-R.patch b/0007-zdtm-add-simple-test-for-rseq-C-R-Signed-off-by-Alex.patch similarity index 96% rename from 0007-zdtm-add-simple-test-for-rseq-C-R.patch rename to 0007-zdtm-add-simple-test-for-rseq-C-R-Signed-off-by-Alex.patch index bb317ed..a0ab3f0 100644 --- a/0007-zdtm-add-simple-test-for-rseq-C-R.patch +++ b/0007-zdtm-add-simple-test-for-rseq-C-R-Signed-off-by-Alex.patch @@ -1,12 +1,12 @@ -From 5005c08e32dc29dbf0b3a2a582e75d249c190d96 Mon Sep 17 00:00:00 2001 +From dc83ed27d305237298b8754d1159f2e7f5c926ae Mon Sep 17 00:00:00 2001 From: bb-cat Date: Wed, 2 Mar 2022 14:54:28 +0800 -Subject: [PATCH 07/16] zdtm: add simple test for rseq C/R Signed-off-by: +Subject: [PATCH 07/72] zdtm: add simple test for rseq C/R Signed-off-by: Alexander Mikhalitsyn 
--- test/zdtm/static/Makefile | 1 + - test/zdtm/static/rseq00.c | 174 +++++++++++++++++++++++ + test/zdtm/static/rseq00.c | 174 +++++++++++++++++++++++++++++++++++ test/zdtm/static/rseq00.desc | 1 + 3 files changed, 176 insertions(+) create mode 100644 test/zdtm/static/rseq00.c @@ -213,5 +213,5 @@ index 0000000..0324fa3 @@ -0,0 +1 @@ +{'flavor': 'h', 'arch': 'x86_64', 'feature': 'get_rseq_conf'} -- -2.30.0 +2.34.1 diff --git a/0008-ci-add-Fedora-Rawhide-based-test-on-Cirrus.patch b/0008-ci-add-Fedora-Rawhide-based-test-on-Cirrus-We-have-a.patch similarity index 96% rename from 0008-ci-add-Fedora-Rawhide-based-test-on-Cirrus.patch rename to 0008-ci-add-Fedora-Rawhide-based-test-on-Cirrus-We-have-a.patch index 2f6b642..fc69648 100644 --- a/0008-ci-add-Fedora-Rawhide-based-test-on-Cirrus.patch +++ b/0008-ci-add-Fedora-Rawhide-based-test-on-Cirrus-We-have-a.patch @@ -1,7 +1,7 @@ -From 56fad25776a652e143175a22676a1f909476c880 Mon Sep 17 00:00:00 2001 +From 4ebfba180d44706e50afb525cc992ac708e83883 Mon Sep 17 00:00:00 2001 From: bb-cat Date: Wed, 2 Mar 2022 14:57:16 +0800 -Subject: [PATCH 08/16] ci: add Fedora Rawhide based test on Cirrus We have +Subject: [PATCH 08/72] ci: add Fedora Rawhide based test on Cirrus We have ability to use nested virtualization on Cirrus, and already have "Vagrant Fedora based test (no VDSO)" test, let's do analogical for Fedora Rawhide to get fresh kernel. 
@@ -119,5 +119,5 @@ index 839b100..f961b8d 100755 + $1 -- -2.30.0 +2.34.1 diff --git a/0009-include-add-thread_pointer.h-from-Glibc.patch b/0009-include-add-thread_pointer.h-from-Glibc-Implementati.patch similarity index 95% rename from 0009-include-add-thread_pointer.h-from-Glibc.patch rename to 0009-include-add-thread_pointer.h-from-Glibc-Implementati.patch index 5151349..f10df29 100644 --- a/0009-include-add-thread_pointer.h-from-Glibc.patch +++ b/0009-include-add-thread_pointer.h-from-Glibc-Implementati.patch @@ -1,7 +1,7 @@ -From 99da2f789ca92aa52eeca07b97aee2cbd3d60fca Mon Sep 17 00:00:00 2001 +From 159d2b7c889ae23ece99595af8a12f766c7b1aff Mon Sep 17 00:00:00 2001 From: bb-cat Date: Wed, 2 Mar 2022 15:00:07 +0800 -Subject: [PATCH 09/16] include: add thread_pointer.h from Glibc Implementation +Subject: [PATCH 09/72] include: add thread_pointer.h from Glibc Implementation was taken from the Glibc. https://sourceware.org/git/?p=glibc.git;a=commitdiff;h=8dbeb0561eeb876f557ac9eef5721912ec074ea5 @@ -10,11 +10,11 @@ https://sourceware.org/git/?p=glibc.git;a=commitdiff;h=cb976fba4c51ede7bf8cee503 Signed-off-by: Alexander Mikhalitsyn --- .../arch/aarch64/include/asm/thread_pointer.h | 27 ++++++++++++++ - .../arch/arm/include/asm/thread_pointer.h | 27 ++++++++++++++ - .../arch/mips/include/asm/thread_pointer.h | 27 ++++++++++++++ - .../arch/ppc64/include/asm/thread_pointer.h | 33 +++++++++++++++++ - .../arch/s390/include/asm/thread_pointer.h | 27 ++++++++++++++ - .../arch/x86/include/asm/thread_pointer.h | 37 +++++++++++++++++++ + criu/arch/arm/include/asm/thread_pointer.h | 27 ++++++++++++++ + criu/arch/mips/include/asm/thread_pointer.h | 27 ++++++++++++++ + criu/arch/ppc64/include/asm/thread_pointer.h | 33 +++++++++++++++++ + criu/arch/s390/include/asm/thread_pointer.h | 27 ++++++++++++++ + criu/arch/x86/include/asm/thread_pointer.h | 37 +++++++++++++++++++ 6 files changed, 178 insertions(+) create mode 100644 criu/arch/aarch64/include/asm/thread_pointer.h create 
mode 100644 criu/arch/arm/include/asm/thread_pointer.h @@ -240,5 +240,5 @@ index 0000000..08603ae +#endif /* _SYS_THREAD_POINTER_H */ \ No newline at end of file -- -2.30.0 +2.34.1 diff --git a/0010-clone-noasan-unregister-rseq-at-the-thread-start-for.patch b/0010-clone-noasan-unregister-rseq-at-the-thread-start-for.patch index a8e8e99..e02dfe0 100644 --- a/0010-clone-noasan-unregister-rseq-at-the-thread-start-for.patch +++ b/0010-clone-noasan-unregister-rseq-at-the-thread-start-for.patch @@ -1,7 +1,7 @@ -From d43ad9913c19afa6d80cb8124015d47361152db8 Mon Sep 17 00:00:00 2001 +From 0fdb1cf439c08f6e957e2e7d234a015ef3b84dfc Mon Sep 17 00:00:00 2001 From: bb-cat Date: Wed, 2 Mar 2022 15:00:43 +0800 -Subject: [PATCH 10/16] clone-noasan: unregister rseq at the thread start for +Subject: [PATCH 10/72] clone-noasan: unregister rseq at the thread start for new glibc Fresh glibc does rseq registration by default during start_thread(). [ see https://sourceware.org/git/?p=glibc.git;a=commitdiff;h=95e114a0919d844d8fe07839cb6538b7f5ee920e @@ -18,7 +18,7 @@ https://sourceware.org/git?p=glibc.git;a=commit;h=8dbeb0561eeb876f557ac9eef57219 Signed-off-by: Alexander Mikhalitsyn --- - criu/clone-noasan.c | 42 +++++++++++++++++++++++++++++++-- + criu/clone-noasan.c | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/criu/clone-noasan.c b/criu/clone-noasan.c @@ -98,5 +98,5 @@ index d657ea2..5f8dd1b 100644 return pid; } -- -2.30.0 +2.34.1 diff --git a/0011-zdtm-static-rseq00-fix-rseq-test-when-linking-with-a.patch b/0011-zdtm-static-rseq00-fix-rseq-test-when-linking-with-a.patch index e5745ac..cd99dde 100644 --- a/0011-zdtm-static-rseq00-fix-rseq-test-when-linking-with-a.patch +++ b/0011-zdtm-static-rseq00-fix-rseq-test-when-linking-with-a.patch @@ -1,13 +1,13 @@ -From 4f4d5acc34046954aea9e8ea10b5f71ff5f0fbd5 Mon Sep 17 00:00:00 2001 +From 7cc800d2cfbfb6fe686345a652472b194ca2b9cf Mon Sep 17 00:00:00 2001 From: bb-cat Date: Wed, 2 
Mar 2022 15:01:34 +0800 -Subject: [PATCH 11/16] zdtm/static/rseq00: fix rseq test when linking with a +Subject: [PATCH 11/72] zdtm/static/rseq00: fix rseq test when linking with a fresh Glibc Fresh Glibc does rseq() register by default. We need to unregister rseq before registering our own. Signed-off-by: Alexander Mikhalitsyn --- - test/zdtm/static/rseq00.c | 76 ++++++++++++++++++++------- + test/zdtm/static/rseq00.c | 76 +++++++++++++++++++++++++++++---------- 1 file changed, 58 insertions(+), 18 deletions(-) diff --git a/test/zdtm/static/rseq00.c b/test/zdtm/static/rseq00.c @@ -154,5 +154,5 @@ index 26f41a2..87053b8 100644 if (ret) fail("Failed to increment per-cpu counter"); -- -2.30.0 +2.34.1 diff --git a/0012-compel-add-helpers-to-get-set-instruction-pointer.patch b/0012-compel-add-helpers-to-get-set-instruction-pointer-Si.patch similarity index 94% rename from 0012-compel-add-helpers-to-get-set-instruction-pointer.patch rename to 0012-compel-add-helpers-to-get-set-instruction-pointer-Si.patch index 33acd47..d5d00ed 100644 --- a/0012-compel-add-helpers-to-get-set-instruction-pointer.patch +++ b/0012-compel-add-helpers-to-get-set-instruction-pointer-Si.patch @@ -1,7 +1,7 @@ -From 06cb51057ce1cc31b79c6321273dfa0b4cb7f980 Mon Sep 17 00:00:00 2001 +From 65eb254d6ad2f1b1d36e95f431b05faf9e67524d Mon Sep 17 00:00:00 2001 From: bb-cat Date: Wed, 2 Mar 2022 15:02:08 +0800 -Subject: [PATCH 12/16] compel: add helpers to get/set instruction pointer +Subject: [PATCH 12/72] compel: add helpers to get/set instruction pointer Signed-off-by: Alexander Mikhalitsyn --- @@ -11,14 +11,14 @@ Subject: [PATCH 12/16] compel: add helpers to get/set instruction pointer .../src/lib/include/uapi/asm/infect-types.h | 9 +++++---- .../src/lib/include/uapi/asm/infect-types.h | 7 ++++--- .../src/lib/include/uapi/asm/infect-types.h | 9 +++++---- - compel/include/uapi/infect.h | 6 ++++++ - compel/src/lib/infect.c | 20 +++++++++++++++++++ - .../criu/arch/aarch64/include/asm/types.h | 2 ++ - 
criu/arch/arm/include/asm/types.h | 2 ++ - .../criu/arch/mips/include/asm/types.h | 2 ++ - .../criu/arch/ppc64/include/asm/types.h | 2 ++ - .../criu/arch/s390/include/asm/types.h | 2 ++ - criu/arch/x86/include/asm/types.h | 2 ++ + compel/include/uapi/infect.h | 6 ++++++ + compel/src/lib/infect.c | 20 +++++++++++++++++++ + criu/arch/aarch64/include/asm/types.h | 2 ++ + criu/arch/arm/include/asm/types.h | 2 ++ + criu/arch/mips/include/asm/types.h | 2 ++ + criu/arch/ppc64/include/asm/types.h | 2 ++ + criu/arch/s390/include/asm/types.h | 2 ++ + criu/arch/x86/include/asm/types.h | 2 ++ 14 files changed, 67 insertions(+), 23 deletions(-) diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h @@ -261,5 +261,5 @@ index a0a8ed9..8919d0a 100644 static inline u64 encode_pointer(void *p) -- -2.30.0 +2.34.1 diff --git a/0013-cr-dump-fixup-thread-IP-when-inside-rseq-cs.patch b/0013-cr-dump-fixup-thread-IP-when-inside-rseq-cs-Signed-o.patch similarity index 97% rename from 0013-cr-dump-fixup-thread-IP-when-inside-rseq-cs.patch rename to 0013-cr-dump-fixup-thread-IP-when-inside-rseq-cs-Signed-o.patch index cd8ef17..ca9caf1 100644 --- a/0013-cr-dump-fixup-thread-IP-when-inside-rseq-cs.patch +++ b/0013-cr-dump-fixup-thread-IP-when-inside-rseq-cs-Signed-o.patch @@ -1,11 +1,11 @@ -From 33abfc12b973560b3d98afdbac7554b8c0542c3d Mon Sep 17 00:00:00 2001 +From afe090a86d6634e3620ebae16d32960f2c4933cc Mon Sep 17 00:00:00 2001 From: bb-cat Date: Wed, 2 Mar 2022 15:04:54 +0800 -Subject: [PATCH 13/16] cr-dump: fixup thread IP when inside rseq cs +Subject: [PATCH 13/72] cr-dump: fixup thread IP when inside rseq cs Signed-off-by: Alexander Mikhalitsyn --- - criu/cr-dump.c | 155 +++++++++++++++++++++++++++- + criu/cr-dump.c | 155 ++++++++++++++++++++++++++++++++++++++-- criu/include/parasite.h | 2 + criu/include/pstree.h | 1 + 3 files changed, 154 insertions(+), 4 deletions(-) @@ -244,5 +244,5 @@ index 
c5b0fa7..458e5f9 100644 /* * Although we don't support dumping different struct creds in general, -- -2.30.0 +2.34.1 diff --git a/0014-zdtm-add-rseq-transition-test-for-amd64.patch b/0014-zdtm-add-rseq-transition-test-for-amd64-Signed-off-b.patch similarity index 96% rename from 0014-zdtm-add-rseq-transition-test-for-amd64.patch rename to 0014-zdtm-add-rseq-transition-test-for-amd64-Signed-off-b.patch index d1379bd..f99e1e6 100644 --- a/0014-zdtm-add-rseq-transition-test-for-amd64.patch +++ b/0014-zdtm-add-rseq-transition-test-for-amd64-Signed-off-b.patch @@ -1,12 +1,12 @@ -From f76aa4ade354649e3291b5e7274c368740b05417 Mon Sep 17 00:00:00 2001 +From 961a05f47822444406edeb3d90d9113bba44cdf3 Mon Sep 17 00:00:00 2001 From: bb-cat Date: Wed, 2 Mar 2022 15:05:34 +0800 -Subject: [PATCH 14/16] zdtm: add rseq transition test for amd64 Signed-off-by: +Subject: [PATCH 14/72] zdtm: add rseq transition test for amd64 Signed-off-by: Alexander Mikhalitsyn --- test/zdtm/transition/Makefile | 1 + - test/zdtm/transition/rseq01.c | 208 +++++++++++++++++++ + test/zdtm/transition/rseq01.c | 208 +++++++++++++++++++++++++++++++ test/zdtm/transition/rseq01.desc | 1 + 3 files changed, 210 insertions(+) create mode 100644 test/zdtm/transition/rseq01.c @@ -246,5 +246,5 @@ index 0000000..0324fa3 @@ -0,0 +1 @@ +{'flavor': 'h', 'arch': 'x86_64', 'feature': 'get_rseq_conf'} -- -2.30.0 +2.34.1 diff --git a/0015-cr-dump-handle-rseq-flags-field.patch b/0015-cr-dump-handle-rseq-flags-field-Userspace-may-config.patch similarity index 97% rename from 0015-cr-dump-handle-rseq-flags-field.patch rename to 0015-cr-dump-handle-rseq-flags-field-Userspace-may-config.patch index d544774..0ac2b3e 100644 --- a/0015-cr-dump-handle-rseq-flags-field.patch +++ b/0015-cr-dump-handle-rseq-flags-field-Userspace-may-config.patch @@ -1,13 +1,13 @@ -From deac94521c373c13add63eaf88118187ea3c2cb2 Mon Sep 17 00:00:00 2001 +From 50f04f06eb3ecbdd465e417e8c5c8b19d43ec2f4 Mon Sep 17 00:00:00 2001 From: bb-cat Date: Wed, 2 Mar 
2022 15:09:44 +0800 -Subject: [PATCH 15/16] cr-dump: handle rseq flags field Userspace may +Subject: [PATCH 15/72] cr-dump: handle rseq flags field Userspace may configure rseq critical section by def Signed-off-by: Alexander Mikhalitsyn --- - criu/cr-dump.c | 86 +++++++++++++++++++------------ - criu/cr-restore.c | 63 ++++++++++++++++++++++ + criu/cr-dump.c | 86 +++++++++++++++++++++++++++---------------- + criu/cr-restore.c | 63 +++++++++++++++++++++++++++++++ criu/include/pstree.h | 1 + images/rseq.proto | 1 + 4 files changed, 119 insertions(+), 32 deletions(-) @@ -326,5 +326,5 @@ index be28004..45cb847 100644 + optional uint64 rseq_cs_pointer = 4; } -- -2.30.0 +2.34.1 diff --git a/0016-zdtm-add-rseq02-transition-test-with-NO_RESTART-CS-f.patch b/0016-zdtm-add-rseq02-transition-test-with-NO_RESTART-CS-f.patch index 73038a7..3fe2cc5 100644 --- a/0016-zdtm-add-rseq02-transition-test-with-NO_RESTART-CS-f.patch +++ b/0016-zdtm-add-rseq02-transition-test-with-NO_RESTART-CS-f.patch @@ -1,13 +1,13 @@ -From bb8295ae4f1224db2236fdd3134912e093ed20d9 Mon Sep 17 00:00:00 2001 +From dc5f32571e66ab72842e735259d0c442ed1c603b Mon Sep 17 00:00:00 2001 From: bb-cat Date: Wed, 2 Mar 2022 15:10:24 +0800 -Subject: [PATCH 16/16] zdtm: add rseq02 transition test with NO_RESTART CS +Subject: [PATCH 16/72] zdtm: add rseq02 transition test with NO_RESTART CS flag Signed-off-by: Alexander Mikhalitsyn --- - test/zdtm/transition/Makefile | 2 + - test/zdtm/transition/rseq01.c | 61 +++++++++++++++++++- + test/zdtm/transition/Makefile | 2 ++ + test/zdtm/transition/rseq01.c | 61 ++++++++++++++++++++++++++++++-- test/zdtm/transition/rseq02.c | 1 + test/zdtm/transition/rseq02.desc | 1 + 4 files changed, 63 insertions(+), 2 deletions(-) @@ -173,5 +173,5 @@ index 0000000..b888f0d +rseq01.desc \ No newline at end of file -- -2.30.0 +2.34.1 diff --git a/0017-zdtm-fix-zdtm-static-maps00-case-in-arm64.patch b/0017-zdtm-fix-zdtm-static-maps00-case-in-arm64.patch new file mode 100644 index 
0000000..1da6183 --- /dev/null +++ b/0017-zdtm-fix-zdtm-static-maps00-case-in-arm64.patch @@ -0,0 +1,56 @@ +From 1f760a8bbb539e81b1ef48aeedbebb792d7b74b2 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Fri, 14 Jan 2022 16:39:32 +0800 +Subject: [PATCH 17/72] zdtm: fix zdtm/static/maps00 case in arm64 + +This case sometimes will cause SIGILL signal in arm64 platform. + +<> notes: + The ARM architecture does not require the hardware to ensure coherency + between instruction caches and memory, even for locations of shared + memory. + +Therefore, we need flush dcache and icache for self-modifying code. + +- https://developer.arm.com/documentation/den0024/a/Caches/Point-of-coherency-and-unification + +Signed-off-by: fu.lin +--- + test/zdtm/static/maps00.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git a/test/zdtm/static/maps00.c b/test/zdtm/static/maps00.c +index 10a4cac..5ef8f1a 100644 +--- a/test/zdtm/static/maps00.c ++++ b/test/zdtm/static/maps00.c +@@ -158,7 +158,8 @@ static int check_map(struct map *map) + + if (!sigsetjmp(segv_ret, 1)) { + if (map->prot & PROT_WRITE) { +- memcpy(map->ptr, test_func, getpagesize()); ++ memcpy(map->ptr,test_func, ONE_MAP_SIZE); ++ __builtin___clear_cache(map->ptr, map->ptr+ONE_MAP_SIZE); + } else { + if (!(map->flag & MAP_ANONYMOUS)) { + uint8_t funlen = (uint8_t *)check_map - (uint8_t *)test_func; +@@ -169,14 +170,15 @@ static int check_map(struct map *map) + } + } + } +- if (!(map->flag & MAP_ANONYMOUS) || map->prot & PROT_WRITE) ++ if (!(map->flag & MAP_ANONYMOUS) || (map->prot & PROT_WRITE)) + /* Function body has been copied into the mapping */ + ((int (*)(void))map->ptr)(); /* perform exec access */ +- else ++ else { + /* No way to copy function body into mapping, + * clear exec bit from effective protection + */ + prot &= PROT_WRITE | PROT_READ | !PROT_EXEC; ++ } + } else + prot &= PROT_WRITE | PROT_READ | !PROT_EXEC; + +-- +2.34.1 + diff --git a/0018-test-flush-ipt-rules-after-program-exits.patch 
b/0018-test-flush-ipt-rules-after-program-exits.patch new file mode 100644 index 0000000..9d61eb3 --- /dev/null +++ b/0018-test-flush-ipt-rules-after-program-exits.patch @@ -0,0 +1,52 @@ +From 003edcab5c2dc1a3f00dba7f4b7bcdd017eb34b5 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Thu, 20 Jan 2022 19:45:14 +0800 +Subject: [PATCH 18/72] test: flush ipt rules after program exits + +Signed-off-by: fu.lin +--- + test/zdtm/static/socket-tcp-nfconntrack.desc | 2 +- + test/zdtm/static/socket-tcp.c | 13 +++++++++++++ + 2 files changed, 14 insertions(+), 1 deletion(-) + +diff --git a/test/zdtm/static/socket-tcp-nfconntrack.desc b/test/zdtm/static/socket-tcp-nfconntrack.desc +index add2513..05bdb49 100644 +--- a/test/zdtm/static/socket-tcp-nfconntrack.desc ++++ b/test/zdtm/static/socket-tcp-nfconntrack.desc +@@ -1 +1 @@ +-{'flavor': 'h', 'opts': '--tcp-established', 'flags': 'suid'} ++{'flavor': 'h', 'opts': '--tcp-established', 'flags': 'suid excl'} +diff --git a/test/zdtm/static/socket-tcp.c b/test/zdtm/static/socket-tcp.c +index f6ef473..29b0fce 100644 +--- a/test/zdtm/static/socket-tcp.c ++++ b/test/zdtm/static/socket-tcp.c +@@ -57,6 +57,13 @@ int write_data(int fd, const unsigned char *buf, int size) + return 0; + } + ++#ifdef ZDTM_CONNTRACK ++static void ipt_flush(void) ++{ ++ system("iptables -w --flush"); ++} ++#endif ++ + int main(int argc, char **argv) + { + unsigned char buf[BUF_SIZE]; +@@ -72,6 +79,12 @@ int main(int argc, char **argv) + pr_perror("unshare"); + return 1; + } ++ ++ if (atexit(ipt_flush) != 0) { ++ pr_perror("atexit"); ++ return 1; ++ } ++ + if (system("ip link set up dev lo")) + return 1; + if (system("iptables -w -A INPUT -i lo -p tcp -m state --state NEW,ESTABLISHED -j ACCEPT")) +-- +2.34.1 + diff --git a/0019-zdtm-fix-cleaning-step-of-zdtm_netns.patch b/0019-zdtm-fix-cleaning-step-of-zdtm_netns.patch new file mode 100644 index 0000000..5d7540c --- /dev/null +++ b/0019-zdtm-fix-cleaning-step-of-zdtm_netns.patch @@ -0,0 +1,48 @@ +From 
5e68ba283e442467baef762bfcf87910d84e01ae Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Wed, 19 Jan 2022 10:01:25 +0800 +Subject: [PATCH 19/72] zdtm: fix cleaning step of zdtm_netns + +Signed-off-by: fu.lin +--- + test/zdtm.py | 10 +++++++--- + 1 file changed, 7 insertions(+), 3 deletions(-) + +diff --git a/test/zdtm.py b/test/zdtm.py +index 0a52e1b..0feece0 100755 +--- a/test/zdtm.py ++++ b/test/zdtm.py +@@ -1,4 +1,6 @@ + #!/usr/bin/env python ++# -*- coding: utf-8 -*- ++ + from __future__ import absolute_import, division, print_function, unicode_literals + + import argparse +@@ -2110,7 +2112,8 @@ class Launcher: + + if self.__fail: + print_sep("FAIL", "#") +- sys.exit(1) ++ ++ return self.__fail + + + def all_tests(opts): +@@ -2375,10 +2378,11 @@ def run_tests(opts): + else: + launcher.skip(t, "no flavors") + finally: +- launcher.finish() ++ fail = launcher.finish() + if opts['join_ns']: + subprocess.Popen(["ip", "netns", "delete", "zdtm_netns"]).wait() +- ++ if fail: ++ sys.exit(1) + + sti_fmt = "%-40s%-10s%s" + +-- +2.34.1 + diff --git a/0020-mm-add-pin-memory-method-for-criu.patch b/0020-mm-add-pin-memory-method-for-criu.patch new file mode 100644 index 0000000..6f150ce --- /dev/null +++ b/0020-mm-add-pin-memory-method-for-criu.patch @@ -0,0 +1,453 @@ +From 3858f7e228b15d0e1ce553f530fda4da9aa4efab Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Fri, 23 Apr 2021 21:22:08 +0800 +Subject: [PATCH 20/72] mm: add pin memory method for criu + +Add pin memory method for criu to improve memory recover +speed and avoid user private data saving to files. 
+ +Signed-off-by: Jingxian He +Signed-off-by: fu.lin +--- + criu/Makefile.crtools | 1 + + criu/config.c | 1 + + criu/cr-dump.c | 9 +++ + criu/cr-restore.c | 2 + + criu/crtools.c | 1 + + criu/include/cr_options.h | 1 + + criu/include/pin-mem.h | 49 +++++++++++++ + criu/include/restorer.h | 1 + + criu/mem.c | 16 +++++ + criu/pie/restorer.c | 26 ++++++- + criu/pin-mem.c | 146 ++++++++++++++++++++++++++++++++++++++ + criu/seize.c | 6 ++ + 12 files changed, 258 insertions(+), 1 deletion(-) + create mode 100644 criu/include/pin-mem.h + create mode 100644 criu/pin-mem.c + +diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools +index 50a2fa9..98c4135 100644 +--- a/criu/Makefile.crtools ++++ b/criu/Makefile.crtools +@@ -90,6 +90,7 @@ obj-y += servicefd.o + obj-y += pie-util-vdso.o + obj-y += vdso.o + obj-y += timens.o ++obj-y += pin-mem.o + obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o + obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o + CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 +diff --git a/criu/config.c b/criu/config.c +index 71f99c9..53a5cfd 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -696,6 +696,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, + { "pre-dump-mode", required_argument, 0, 1097 }, + { "file-validation", required_argument, 0, 1098 }, + BOOL_OPT("with-cpu-affinity", &opts.with_cpu_affinity), ++ BOOL_OPT("pin-memory", &opts.pin_memory), + { "lsm-mount-context", required_argument, 0, 1099 }, + { "network-lock", required_argument, 0, 1100 }, + {}, +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index 79387fb..5fac9ce 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -86,6 +86,7 @@ + #include "pidfd-store.h" + #include "apparmor.h" + #include "asm/dump.h" ++#include "pin-mem.h" + + /* + * Architectures can overwrite this function to restore register sets that +@@ -2058,6 +2059,14 @@ static int cr_dump_finish(int ret) + close_service_fd(CR_PROC_FD_OFF); + close_image_dir(); + ++ if (ret == 0 && opts.pin_memory) { 
++ pr_info("start restore_task_special_pages\n"); ++ restore_task_special_pages(0); ++ } else if (ret != 0 && opts.pin_memory) { ++ pr_info("clear pin mem info\n"); ++ clear_pin_mem(0); ++ } ++ + if (ret) { + pr_err("Dumping FAILED.\n"); + } else { +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 864140f..5514c29 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -3885,6 +3885,8 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns + task_args, task_args->t->pid, task_args->nr_threads, task_args->clone_restore_fn, + task_args->thread_args); + ++ task_args->pin_memory = opts.pin_memory; ++ + /* + * An indirect call to task_restore, note it never returns + * and restoring core is extremely destructive. +diff --git a/criu/crtools.c b/criu/crtools.c +index b5a36b9..1b90481 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -447,6 +447,7 @@ usage: + " can be 'filesize' or 'buildid' (default).\n" + " --with-cpu-affinity Allow to restore cpu affinity. 
Only for hosts with\n" + " same cpu quantity.\n" ++ " --pin-memory Use pin memory method for checkpoint and restore.\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 3b50e59..61898fd 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -190,6 +190,7 @@ struct cr_options { + int file_validation_method; + /* restore cpu affinity */ + int with_cpu_affinity; ++ int pin_memory; + }; + + extern struct cr_options opts; +diff --git a/criu/include/pin-mem.h b/criu/include/pin-mem.h +new file mode 100644 +index 0000000..7e53b12 +--- /dev/null ++++ b/criu/include/pin-mem.h +@@ -0,0 +1,49 @@ ++#ifndef __CRIU_PIN_MEM_H__ ++#define __CRIU_PIN_MEM_H__ ++ ++#include ++ ++#include "vma.pb-c.h" ++ ++#if __has_include("linux/pin_memory.h") ++# include ++#else ++ ++#define PIN_MEM_MAGIC 0x59 ++#define _SET_PIN_MEM_AREA 1 ++#define _CLEAR_PIN_MEM_AREA 2 ++#define _REMAP_PIN_MEM_AREA 3 ++#define _DUMP_SEPCIAL_PAGES 6 ++#define _RETORE_SEPCIAL_PAGES 7 ++ ++#define SET_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _SET_PIN_MEM_AREA, struct pin_mem_area_set) ++#define CLEAR_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _CLEAR_PIN_MEM_AREA, int) ++#define REMAP_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _REMAP_PIN_MEM_AREA, int) ++#define DUMP_SPECIAL_PAGES _IOW(PIN_MEM_MAGIC, _DUMP_SEPCIAL_PAGES, int) ++#define RETORE_SPECIAL_PAGES _IOW(PIN_MEM_MAGIC, _RETORE_SEPCIAL_PAGES, int) ++ ++#define MAX_PIN_MEM_AREA_NUM 16 ++ ++struct _pin_mem_area { ++ unsigned long virt_start; ++ unsigned long virt_end; ++}; ++ ++struct pin_mem_area_set { ++ unsigned int pid; ++ unsigned int area_num; ++ struct _pin_mem_area mem_area[MAX_PIN_MEM_AREA_NUM]; ++}; ++ ++#endif /* __has_include("linux/pin_memory.h") */ ++ ++#define PIN_MEM_FILE "/dev/pinmem" ++#define ONCE_PIN_MEM_SIZE_LIMIT (32 * 1024 * 1024) ++ ++bool should_pin_vmae(VmaEntry *vmae); ++int pin_vmae(VmaEntry *vmae, struct 
pstree_item *item); ++int dump_task_special_pages(int pid); ++int restore_task_special_pages(int pid); ++int clear_pin_mem(int pid); ++ ++#endif /* __CRIU_PIN_MEM_H__ */ +diff --git a/criu/include/restorer.h b/criu/include/restorer.h +index c29d869..e0bdc04 100644 +--- a/criu/include/restorer.h ++++ b/criu/include/restorer.h +@@ -232,6 +232,7 @@ struct task_restore_args { + int lsm_type; + int child_subreaper; + bool has_clone3_set_tid; ++ bool pin_memory; + } __aligned(64); + + /* +diff --git a/criu/mem.c b/criu/mem.c +index ca74bfb..07efdbe 100644 +--- a/criu/mem.c ++++ b/criu/mem.c +@@ -31,6 +31,7 @@ + #include "prctl.h" + #include "compel/infect-util.h" + #include "pidfd-store.h" ++#include "pin-mem.h" + + #include "protobuf.h" + #include "images/pagemap.pb-c.h" +@@ -500,6 +501,17 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasit + goto out_xfer; + } + ++ if (opts.pin_memory) { ++ /* pin memory before dump pages */ ++ list_for_each_entry(vma_area, &vma_area_list->h, list) { ++ if (should_pin_vmae(vma_area->e) ++ && pin_vmae(vma_area->e, item) != 0) { ++ exit_code = -1; ++ goto out_xfer; ++ } ++ } ++ } ++ + /* + * Step 1 -- generate the pagemap + */ +@@ -509,6 +521,10 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasit + parent_predump_mode = mdc->parent_ie->pre_dump_mode; + + list_for_each_entry(vma_area, &vma_area_list->h, list) { ++ if (opts.pin_memory && should_pin_vmae(vma_area->e)) { ++ continue; ++ } ++ + ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl, &pmc, has_parent, mdc->pre_dump, + parent_predump_mode); + if (ret < 0) +diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c +index 368b5a0..db01ba5 100644 +--- a/criu/pie/restorer.c ++++ b/criu/pie/restorer.c +@@ -49,6 +49,7 @@ + + #include "shmem.h" + #include "restorer.h" ++#include "pin-mem.h" + + #ifndef PR_SET_PDEATHSIG + #define PR_SET_PDEATHSIG 1 +@@ -1408,6 +1409,24 @@ int cleanup_current_inotify_events(struct 
task_restore_args *task_args) + return 0; + } + ++int remap_vmas(int pid) ++{ ++ int fd, ret = 0; ++ ++ fd = sys_open(PIN_MEM_FILE, O_RDWR, 0); ++ if (fd < 0) { ++ pr_err("open file: %s fail.\n", PIN_MEM_FILE); ++ return -1; ++ } ++ ++ ret = sys_ioctl(fd, REMAP_PIN_MEM_AREA, (unsigned long) &pid); ++ if (ret < 0) ++ pr_err("remap pin mem fail for pid: %d\n", pid); ++ sys_close(fd); ++ return ret; ++} ++ ++ + /* + * The main routine to restore task via sigreturn. + * This one is very special, we never return there +@@ -1577,7 +1596,12 @@ long __export_restore_task(struct task_restore_args *args) + goto core_restore_end; + } + } +- ++ if (args->pin_memory) { ++ if (remap_vmas(my_pid) < 0) { ++ pr_err("Remap vmas fail\n"); ++ goto core_restore_end; ++ } ++ } + /* + * Now read the contents (if any) + */ +diff --git a/criu/pin-mem.c b/criu/pin-mem.c +new file mode 100644 +index 0000000..b18db97 +--- /dev/null ++++ b/criu/pin-mem.c +@@ -0,0 +1,146 @@ ++#include ++#include ++#include ++ ++#include "pstree.h" ++#include "mem.h" ++#include "vma.h" ++#include "pin-mem.h" ++ ++bool should_pin_vmae(VmaEntry *vmae) ++{ ++ /* ++ * vDSO area must be always dumped because on restore ++ * we might need to generate a proxy. ++ */ ++ if (vma_entry_is(vmae, VMA_AREA_VDSO)) ++ return false; ++ /* ++ * In turn VVAR area is special and referenced from ++ * vDSO area by IP addressing (at least on x86) thus ++ * never ever dump its content but always use one provided ++ * by the kernel on restore, ie runtime VVAR area must ++ * be remapped into proper place..
++ */ ++ if (vma_entry_is(vmae, VMA_AREA_VVAR)) ++ return false; ++ ++ if (vma_entry_is(vmae, VMA_AREA_AIORING)) ++ return false; ++ if (vma_entry_is(vmae, VMA_ANON_PRIVATE)) ++ return true; ++ ++ return false; ++} ++ ++static int pin_one_pmas(int fd, unsigned long start, ++ unsigned long *pend, struct pstree_item *item) ++{ ++ int ret; ++ unsigned int index = 0; ++ unsigned long end; ++ unsigned long next = start; ++ struct pin_mem_area_set pmas; ++ struct _pin_mem_area *pma; ++ ++ end = *pend; ++ while (start < end) { ++ next = (start + ONCE_PIN_MEM_SIZE_LIMIT > end) ++ ? end : (start + ONCE_PIN_MEM_SIZE_LIMIT); ++ pma = &(pmas.mem_area[index]); ++ pma->virt_start = start; ++ pma->virt_end = next; ++ index++; ++ start += ONCE_PIN_MEM_SIZE_LIMIT; ++ if (index >= MAX_PIN_MEM_AREA_NUM) ++ break; ++ } ++ ++ *pend = next; ++ pmas.area_num = index; ++ pmas.pid = vpid(item); ++ ++ ret = ioctl(fd, SET_PIN_MEM_AREA, &pmas); ++ if (ret < 0) ++ pr_err("pin mem fail, errno: %s\n", strerror(errno)); ++ return ret; ++} ++ ++int pin_vmae(VmaEntry *vmae, struct pstree_item *item) ++{ ++ int fd; ++ int ret = 0; ++ unsigned long start, end; ++ ++ fd = open(PIN_MEM_FILE, O_RDWR); ++ if (fd < 0) { ++ pr_err("open file: %s fail.\n", PIN_MEM_FILE); ++ return -1; ++ } ++ ++ start = vmae->start; ++ while (start < vmae->end) { ++ end = vmae->end; ++ ret = pin_one_pmas(fd, start, &end, item); ++ if (ret < 0) ++ break; ++ start = end; ++ } ++ close(fd); ++ return ret; ++} ++ ++int dump_task_special_pages(int pid) ++{ ++ int fd, ret; ++ ++ fd = open(PIN_MEM_FILE, O_RDWR, 0); ++ if (fd < 0) { ++ pr_warn("error open file: %s\n", PIN_MEM_FILE); ++ return -1; ++ } ++ ++ ret = ioctl(fd, DUMP_SPECIAL_PAGES, (unsigned long) &pid); ++ if (ret < 0) ++ pr_warn("No need DUMP_SPECIAL_PAGES for %d\n", pid); ++ ++ close(fd); ++ return ret; ++} ++ ++int restore_task_special_pages(int pid) ++{ ++ int fd, ret; ++ ++ fd = open(PIN_MEM_FILE, O_RDWR, 0); ++ if (fd < 0) { ++ pr_warn("error open file: %s\n", 
PIN_MEM_FILE); ++ return -1; ++ } ++ ++ ret = ioctl(fd, RETORE_SPECIAL_PAGES, (unsigned long) &pid); ++ if (ret < 0) ++ pr_warn("No need RETORE_SPECIAL_PAGES for %d\n", pid); ++ ++ close(fd); ++ return ret; ++} ++ ++int clear_pin_mem(int pid) ++{ ++ int fd, ret; ++ ++ fd = open(PIN_MEM_FILE, O_RDWR, 0); ++ if (fd < 0) { ++ pr_warn("error open file: %s\n", PIN_MEM_FILE); ++ return -1; ++ } ++ ++ ret = ioctl(fd, CLEAR_PIN_MEM_AREA, (unsigned long) &pid); ++ if (ret < 0) { ++ pr_warn("clear pin mem fail: %d\n", pid); ++ } ++ ++ close(fd); ++ return ret; ++} +diff --git a/criu/seize.c b/criu/seize.c +index 95bf9ef..8a35c3c 100644 +--- a/criu/seize.c ++++ b/criu/seize.c +@@ -23,6 +23,7 @@ + #include "string.h" + #include "xmalloc.h" + #include "util.h" ++#include "pin-mem.h" + + #define NR_ATTEMPTS 5 + +@@ -640,6 +641,11 @@ static void unseize_task_and_threads(const struct pstree_item *item, int st) + if (item->pid->state == TASK_DEAD) + return; + ++ if (opts.pin_memory) { ++ for (i = 0; i < item->nr_threads; i++) ++ dump_task_special_pages(item->threads[i].real); ++ } ++ + /* + * The st is the state we want to switch tasks into, + * the item->state is the state task was in when we seized one. +-- +2.34.1 + diff --git a/0021-pid-add-pid-recover-method-for-criu.patch b/0021-pid-add-pid-recover-method-for-criu.patch new file mode 100644 index 0000000..b7a9467 --- /dev/null +++ b/0021-pid-add-pid-recover-method-for-criu.patch @@ -0,0 +1,213 @@ +From 2911f505eefcfaea582d457c1fa18df34d151954 Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 19 May 2021 21:33:22 +0800 +Subject: [PATCH 21/72] pid: add pid recover method for criu + +The default pid recover method cannot always recover the task +pid. +We add a new pid recover method that sets the fork_pid of +the parent task struct, and the kernel will allocate the pid from +the fork_pid. +The new pid recover method also prevents other tasks from using +the pids of the tasks being dumped.
+ +Signed-off-by: Jingxian He +--- + criu/config.c | 1 + + criu/cr-restore.c | 27 ++++++++++++++++++++++++++- + criu/crtools.c | 1 + + criu/include/cr_options.h | 1 + + criu/include/pin-mem.h | 4 ++++ + criu/include/restorer.h | 1 + + criu/pie/restorer.c | 25 ++++++++++++++++++++++++- + 7 files changed, 58 insertions(+), 2 deletions(-) + +diff --git a/criu/config.c b/criu/config.c +index 53a5cfd..6dfbb01 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -699,6 +699,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, + BOOL_OPT("pin-memory", &opts.pin_memory), + { "lsm-mount-context", required_argument, 0, 1099 }, + { "network-lock", required_argument, 0, 1100 }, ++ BOOL_OPT("use-fork-pid", &opts.use_fork_pid), + {}, + }; + +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 5514c29..497dd14 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -80,6 +80,7 @@ + #include "timens.h" + #include "bpfmap.h" + #include "apparmor.h" ++#include "pin-mem.h" + + #include "parasite-syscall.h" + #include "files-reg.h" +@@ -1340,6 +1341,23 @@ static int set_next_pid(void *arg) + return 0; + } + ++static int write_fork_pid(int pid) ++{ ++ int fd, ret; ++ ++ fd = open(PIN_MEM_FILE, O_RDWR); ++ if (fd < 0) { ++ pr_warn("error open file: %s\n", PIN_MEM_FILE); ++ return -1; ++ } ++ ret = ioctl(fd, SET_FORK_PID, &pid); ++ if (ret < 0) { ++ pr_warn("write fork pid fail, errno: %s\n", strerror(errno)); ++ } ++ close(fd); ++ return ret; ++} ++ + static inline int fork_with_pid(struct pstree_item *item) + { + struct cr_clone_arg ca; +@@ -1424,7 +1442,7 @@ static inline int fork_with_pid(struct pstree_item *item) + if (!(ca.clone_flags & CLONE_NEWPID)) { + lock_last_pid(); + +- if (!kdat.has_clone3_set_tid) { ++ if (!kdat.has_clone3_set_tid && !opts.use_fork_pid) { + if (external_pidns) { + /* + * Restoring into another namespace requires a helper +@@ -1434,6 +1452,12 @@ static inline int fork_with_pid(struct pstree_item *item) + */ + ret = 
call_in_child_process(set_next_pid, (void *)&pid); + } else { ++ if (opts.use_fork_pid) { ++ ret = write_fork_pid(pid); ++ if (ret < 0) ++ goto err_unlock; ++ } ++ + ret = set_next_pid((void *)&pid); + } + if (ret != 0) { +@@ -3886,6 +3910,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns + task_args->thread_args); + + task_args->pin_memory = opts.pin_memory; ++ task_args->use_fork_pid = opts.use_fork_pid; + + /* + * An indirect call to task_restore, note it never returns +diff --git a/criu/crtools.c b/criu/crtools.c +index 1b90481..502acdf 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -448,6 +448,7 @@ usage: + " --with-cpu-affinity Allow to restore cpu affinity. Only for hosts with\n" + " same cpu quantity.\n" + " --pin-memory Use pin memory method for checkpoint and restore.\n" ++ " --use-fork-pid Allow to restore task pid by setting fork pid of task struct.\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 61898fd..923cc5f 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -191,6 +191,7 @@ struct cr_options { + /* restore cpu affinity */ + int with_cpu_affinity; + int pin_memory; ++ int use_fork_pid; + }; + + extern struct cr_options opts; +diff --git a/criu/include/pin-mem.h b/criu/include/pin-mem.h +index 7e53b12..2b54996 100644 +--- a/criu/include/pin-mem.h ++++ b/criu/include/pin-mem.h +@@ -6,6 +6,7 @@ + #include "vma.pb-c.h" + + #if __has_include("linux/pin_memory.h") ++# define CONFIG_PID_RESERVE + # include + #else + +@@ -35,6 +36,9 @@ struct pin_mem_area_set { + struct _pin_mem_area mem_area[MAX_PIN_MEM_AREA_NUM]; + }; + ++#define _SET_FORK_PID 8 ++#define SET_FORK_PID _IOW(PIN_MEM_MAGIC, _SET_FORK_PID, int) ++ + #endif /* __has_include("linux/pin_memory.h") */ + + #define PIN_MEM_FILE "/dev/pinmem" +diff --git a/criu/include/restorer.h 
b/criu/include/restorer.h +index e0bdc04..93f87f4 100644 +--- a/criu/include/restorer.h ++++ b/criu/include/restorer.h +@@ -233,6 +233,7 @@ struct task_restore_args { + int child_subreaper; + bool has_clone3_set_tid; + bool pin_memory; ++ bool use_fork_pid; + } __aligned(64); + + /* +diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c +index db01ba5..1317582 100644 +--- a/criu/pie/restorer.c ++++ b/criu/pie/restorer.c +@@ -1426,6 +1426,22 @@ int remap_vmas(int pid) + return ret; + } + ++int write_fork_pid(int pid) ++{ ++ int fd, ret; ++ ++ fd = sys_open(PIN_MEM_FILE, O_RDWR, 0); ++ if (fd < 0) { ++ pr_warn("error open file: %s\n", PIN_MEM_FILE); ++ return -1; ++ } ++ ret = sys_ioctl(fd, SET_FORK_PID, (unsigned long) &pid); ++ if (ret < 0) { ++ pr_warn("write fork pid fail fail: %d\n", pid); ++ } ++ sys_close(fd); ++ return ret; ++} + + /* + * The main routine to restore task via sigreturn. +@@ -1815,7 +1831,7 @@ long __export_restore_task(struct task_restore_args *args) + long parent_tid; + int i, fd = -1; + +- if (!args->has_clone3_set_tid) { ++ if (!args->has_clone3_set_tid && !args->use_fork_pid) { + /* One level pid ns hierarhy */ + fd = sys_openat(args->proc_fd, LAST_PID_PATH, O_RDWR, 0); + if (fd < 0) { +@@ -1847,6 +1863,13 @@ long __export_restore_task(struct task_restore_args *args) + pr_debug("Using clone3 to restore the process\n"); + RUN_CLONE3_RESTORE_FN(ret, c_args, sizeof(c_args), &thread_args[i], + args->clone_restore_fn); ++ } else if (args->use_fork_pid) { ++ if (write_fork_pid(thread_args[i].pid) < 0) { ++ pr_err("Clone fail with fork pid\n"); ++ mutex_unlock(&task_entries_local->last_pid_mutex); ++ goto core_restore_end; ++ } ++ RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, thread_args, args->clone_restore_fn); + } else { + last_pid_len = + std_vprint_num(last_pid_buf, sizeof(last_pid_buf), thread_args[i].pid - 1, &s); +-- +2.34.1 + diff --git a/0022-notifier-add-notifier-calling-method-for-checkpoint-.patch 
b/0022-notifier-add-notifier-calling-method-for-checkpoint-.patch new file mode 100644 index 0000000..318c53b --- /dev/null +++ b/0022-notifier-add-notifier-calling-method-for-checkpoint-.patch @@ -0,0 +1,621 @@ +From 33c351e18eddc2517f799c1cac20790ebabddbc8 Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 19 May 2021 21:45:03 +0800 +Subject: [PATCH 22/72] notifier: add notifier calling method for checkpoint + and restore + +Add notifier calling method for checkpoint and restore during kernel module upgrading. + +Signed-off-by: Xiaoguang Li +Signed-off-by: He Jingxian +Signed-off-by: fu.lin +--- + criu/config.c | 1 + + criu/cr-dump.c | 34 +++++++++++ + criu/cr-restore.c | 18 +++++- + criu/crtools.c | 2 + + criu/include/cr_options.h | 1 + + criu/include/notifier.h | 44 +++++++++++++++ + criu/include/restorer.h | 1 + + criu/include/util.h | 2 + + criu/pie/restorer.c | 116 ++++++++++++++++++++++++++++++++++---- + criu/pie/util.c | 91 ++++++++++++++++++++++++++++++ + 10 files changed, 297 insertions(+), 13 deletions(-) + create mode 100644 criu/include/notifier.h + +diff --git a/criu/config.c b/criu/config.c +index 6dfbb01..5d1cff6 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -700,6 +700,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, + { "lsm-mount-context", required_argument, 0, 1099 }, + { "network-lock", required_argument, 0, 1100 }, + BOOL_OPT("use-fork-pid", &opts.use_fork_pid), ++ BOOL_OPT("with-notifier", &opts.with_notifier_kup), + {}, + }; + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index 5fac9ce..50a2f9b 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -87,6 +87,7 @@ + #include "apparmor.h" + #include "asm/dump.h" + #include "pin-mem.h" ++#include "notifier.h" + + /* + * Architectures can overwrite this function to restore register sets that +@@ -1981,6 +1982,8 @@ static int cr_lazy_mem_dump(void) + return ret; + } + ++static enum notifier_state notifier_state = NOTHING_COMPLETE; ++ + static int 
cr_dump_finish(int ret) + { + int post_dump_ret = 0; +@@ -2067,6 +2070,20 @@ static int cr_dump_finish(int ret) + clear_pin_mem(0); + } + ++ if (ret != 0 && opts.with_notifier_kup) { ++ pr_info("call notifier rollback\n"); ++ switch (notifier_state) { ++ case PRE_FREEZE_COMPLETE: ++ notifier_kup(PRE_FREEZE, ROLLBACK, true); ++ break; ++ case FREEZE_TO_KILL_COMPLETE: ++ notifier_kup(FREEZE_TO_KILL, ROLLBACK, true); ++ break; ++ default: ++ break; ++ } ++ } ++ + if (ret) { + pr_err("Dumping FAILED.\n"); + } else { +@@ -2100,6 +2117,14 @@ int cr_dump_tasks(pid_t pid) + goto err; + root_item->pid->real = pid; + ++ if (notifier_kup(PRE_FREEZE, PREPARE, opts.with_notifier_kup)) { ++ /* disable rollback function because we has already rollbacked. */ ++ opts.with_notifier_kup = false; ++ pr_err("call notifier: %d err\n", PRE_FREEZE); ++ goto err; ++ } else ++ notifier_state = PRE_FREEZE_COMPLETE; ++ + pre_dump_ret = run_scripts(ACT_PRE_DUMP); + if (pre_dump_ret != 0) { + pr_err("Pre dump script failed with %d!\n", pre_dump_ret); +@@ -2258,6 +2283,15 @@ int cr_dump_tasks(pid_t pid) + ret = write_img_inventory(&he); + if (ret) + goto err; ++ ++ ret = notifier_kup(FREEZE_TO_KILL, PREPARE, opts.with_notifier_kup); ++ if (ret) { ++ opts.with_notifier_kup = false; ++ pr_err("call notifier:%d err\n", FREEZE_TO_KILL); ++ goto err; ++ } else ++ notifier_state = FREEZE_TO_KILL_COMPLETE; ++ + err: + if (parent_ie) + inventory_entry__free_unpacked(parent_ie, NULL); +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 497dd14..03511b6 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -81,6 +81,7 @@ + #include "bpfmap.h" + #include "apparmor.h" + #include "pin-mem.h" ++#include "notifier.h" + + #include "parasite-syscall.h" + #include "files-reg.h" +@@ -1951,6 +1952,7 @@ static int restore_task_with_children(void *_arg) + return 0; + + err: ++ do_notifier_rollback(opts.with_notifier_kup, POST_UPDATE_KERNEL_COMPLETE); + if (current->parent == NULL) + 
futex_abort_and_wake(&task_entries->nr_in_progress); + exit(1); +@@ -2451,8 +2453,10 @@ skip_ns_bouncing: + */ + attach_to_tasks(root_seized); + +- if (restore_switch_stage(CR_STATE_RESTORE_CREDS)) ++ if (restore_switch_stage(CR_STATE_RESTORE_CREDS)) { ++ pr_err("Can't switch to CR_STATE_RESTORE_CREDS stage\n"); + goto out_kill_network_unlocked; ++ } + + timing_stop(TIME_RESTORE); + +@@ -2631,6 +2635,15 @@ int cr_restore_tasks(void) + goto clean_cgroup; + + ret = restore_root_task(root_item); ++ if (ret) ++ goto err; ++ ++ ret = notifier_kup(POST_RUN, PREPARE, opts.with_notifier_kup); ++ if (ret < 0) { ++ opts.with_notifier_kup = false; ++ pr_err("calling POST_RUN notifier list return err\n"); ++ } ++ + clean_cgroup: + fini_cgroup(); + err: +@@ -3922,6 +3935,9 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns + err: + free_mappings(&self_vmas); + err_nv: ++ if (current->parent == NULL && opts.with_notifier_kup) ++ do_notifier_rollback(true, POST_UPDATE_KERNEL_COMPLETE); ++ + /* Just to be sure */ + exit(1); + return -1; +diff --git a/criu/crtools.c b/criu/crtools.c +index 502acdf..1d08620 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -449,6 +449,8 @@ usage: + " same cpu quantity.\n" + " --pin-memory Use pin memory method for checkpoint and restore.\n" + " --use-fork-pid Allow to restore task pid by setting fork pid of task struct.\n" ++ " --with-notifier Allow to checkpoint/restore kup notifier chain.\n" ++ " This feature needs the kernel assistance.\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 923cc5f..039edba 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -192,6 +192,7 @@ struct cr_options { + int with_cpu_affinity; + int pin_memory; + int use_fork_pid; ++ int with_notifier_kup; + }; + + extern struct cr_options opts; +diff --git 
a/criu/include/notifier.h b/criu/include/notifier.h +new file mode 100644 +index 0000000..e4972a7 +--- /dev/null ++++ b/criu/include/notifier.h +@@ -0,0 +1,44 @@ ++#ifndef __CRIU_NOTIFIER_H__ ++#define __CRIU_NOTIFIER_H__ ++ ++#define NOTIFY_PROC_PATH "/sys/kernel/modrestore/nvwa_notifier" ++ ++#if __has_include("linux/modrestore.h") ++# define CONFIG_EULEROS_MODRESTORE_NOTIFY /* useless, historical factors */ ++# include ++#else ++enum KUP_HOOK_POINT { ++ PRE_FREEZE, ++ FREEZE_TO_KILL, ++ PRE_UPDATE_KERNEL, ++ POST_UPDATE_KERNEL, ++ UNFREEZE_TO_RUN, ++ POST_RUN, ++ ++ KUP_HOOK_MAX, ++}; ++ ++enum nvwa_cmd { ++ PREPARE = 0, ++ ROLLBACK, ++ ++ NVWA_CMD_MAX, ++}; ++#endif ++ ++enum notifier_state { ++ NOTHING_COMPLETE, ++ PRE_FREEZE_COMPLETE, ++ FREEZE_TO_KILL_COMPLETE, ++ PRE_UPDATE_KERNEL_COMPLETE, ++ POST_UPDATE_KERNEL_COMPLETE, ++ UNFREEZE_TO_RUN_COMPLETE, ++ POST_RUN_COMPLETE, ++ ++ NOTIFIER_ROLLBACK_DONE = 0xfc17173b, /* has done rollback */ ++}; ++ ++int notifier_kup(enum KUP_HOOK_POINT, enum nvwa_cmd, bool); ++void do_notifier_rollback(bool, enum notifier_state); ++ ++#endif /* __CRIU_NOTIFIER_H__ */ +diff --git a/criu/include/restorer.h b/criu/include/restorer.h +index 93f87f4..2f7345b 100644 +--- a/criu/include/restorer.h ++++ b/criu/include/restorer.h +@@ -234,6 +234,7 @@ struct task_restore_args { + bool has_clone3_set_tid; + bool pin_memory; + bool use_fork_pid; ++ bool with_notifier_kup; + } __aligned(64); + + /* +diff --git a/criu/include/util.h b/criu/include/util.h +index 1c0b3c7..e0049a6 100644 +--- a/criu/include/util.h ++++ b/criu/include/util.h +@@ -13,6 +13,8 @@ + #include + #include + #include ++#include ++#include + + #include "int.h" + #include "common/compiler.h" +diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c +index 1317582..4a1d38d 100644 +--- a/criu/pie/restorer.c ++++ b/criu/pie/restorer.c +@@ -36,6 +36,7 @@ + #include "vma.h" + #include "uffd.h" + #include "sched.h" ++#include "notifier.h" + + #include "common/lock.h" + #include 
"common/page.h" +@@ -77,6 +78,7 @@ + + static struct task_entries *task_entries_local; + static futex_t thread_inprogress; ++static futex_t thread_start; + static pid_t *helpers; + static int n_helpers; + static pid_t *zombies; +@@ -118,10 +120,28 @@ void parasite_cleanup(void) + + extern void cr_restore_rt(void) asm("__cr_restore_rt") __attribute__((visibility("hidden"))); + ++static int args_with_notifier_kup; ++static enum notifier_state notifier_state = POST_UPDATE_KERNEL_COMPLETE; ++static futex_t notifier_done; ++ + static void sigchld_handler(int signal, siginfo_t *siginfo, void *data) + { + char *r; + int i; ++ rt_sigaction_t act; ++ ++ if (signal == SIGSEGV || signal == SIGBUS || signal == SIGILL) { ++ /* Make sure we exit with the right signal at the end. So for instance ++ * the core will be dumped if enabled. */ ++ pr_info("recv signal: %d\n", signal); ++ do_notifier_rollback(args_with_notifier_kup, notifier_state); ++ ksigemptyset (&act.rt_sa_mask); ++ act.rt_sa_flags = SA_SIGINFO | SA_RESTART; ++ act.rt_sa_handler = (rt_sighandler_t)SIG_DFL; ++ sys_sigaction(signal, &act, NULL, sizeof(k_rtsigset_t)); ++ sys_kill(sys_getpid(),signal); ++ return; ++ } + + /* We can ignore helpers that die, we expect them to after + * CR_STATE_RESTORE is finished. 
*/ +@@ -148,10 +168,14 @@ static void sigchld_handler(int signal, siginfo_t *siginfo, void *data) + + pr_info("Task %d %s %d\n", siginfo->si_pid, r, siginfo->si_status); + ++ pr_info("%s: trace do_notifier_rollback\n", __func__); ++ do_notifier_rollback(args_with_notifier_kup, notifier_state); + futex_abort_and_wake(&task_entries_local->nr_in_progress); + /* sa_restorer may be unmaped, so we can't go back to userspace*/ + sys_kill(sys_getpid(), SIGSTOP); + sys_exit_group(1); ++ ++ /* for notifier, do nothing when receiving SIGCHLD signal */ + } + + static int lsm_set_label(char *label, char *type, int procfd) +@@ -616,6 +640,27 @@ static void noinline rst_sigreturn(unsigned long new_sp, struct rt_sigframe *sig + ARCH_RT_SIGRETURN(new_sp, sigframe); + } + ++/* Only one task calls this, so no locking is needed; on failure notifier_state stays NOTIFIER_ROLLBACK_DONE. */ ++static int do_notifier(bool *notify) ++{ ++ int retval = 0; ++ ++ if (!*notify) ++ return 0; ++ ++ pr_info("unfreeze_to_run restore notifier\n"); ++ retval = notifier_kup(UNFREEZE_TO_RUN, PREPARE, true); ++ if (retval) { ++ *notify = false; ++ notifier_state = NOTIFIER_ROLLBACK_DONE; ++ pr_err("call notifier: %d err\n", UNFREEZE_TO_RUN); ++ } else { ++ notifier_state = UNFREEZE_TO_RUN_COMPLETE; ++ } ++ ++ return retval; ++} ++ + /* + * Threads restoration via sigreturn. Note it's locked + * routine and calls for unlock at the end.
+@@ -654,12 +699,18 @@ long __export_restore_thread(struct thread_restore_args *args) + + pr_info("%ld: Restored\n", sys_gettid()); + +- restore_finish_stage(task_entries_local, CR_STATE_RESTORE); ++ if (!!(restore_finish_stage(task_entries_local, CR_STATE_RESTORE) & FUTEX_ABORT_FLAG)) { ++ pr_err("%s: abort by CR_STATE_RESTORE\n", __func__); ++ goto core_restore_end; ++ } + + if (restore_signals(args->siginfo, args->siginfo_n, false)) + goto core_restore_end; + +- restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD); ++ if (!!(restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD) & FUTEX_ABORT_FLAG)) { ++ pr_err("%s: abort by CR_STATE_RESTORE_SIGCHLD\n", __func__); ++ goto core_restore_end; ++ } + + /* + * Make sure it's before creds, since it's privileged +@@ -674,16 +725,29 @@ long __export_restore_thread(struct thread_restore_args *args) + if (ret) + BUG(); + +- restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS); ++ if (!!(restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS) & FUTEX_ABORT_FLAG)) { ++ pr_err("%s: abort by CR_STATE_RESTORE_CREDS\n", __func__); ++ goto core_restore_end; ++ } + + futex_dec_and_wake(&thread_inprogress); ++ futex_wait_while(&thread_start, 0); ++ if (!!(futex_get(&thread_start) & FUTEX_ABORT_FLAG)) { ++ pr_err("%s: abort by thread_start\n", __func__); ++ goto wait_notifier; ++ } + + new_sp = (long)rt_sigframe + RT_SIGFRAME_OFFSET(rt_sigframe); + rst_sigreturn(new_sp, rt_sigframe); + + core_restore_end: +- pr_err("Restorer abnormal termination for %ld\n", sys_getpid()); +- futex_abort_and_wake(&task_entries_local->nr_in_progress); ++ futex_abort_and_wake(&thread_start); ++ futex_abort_and_wake(&task_entries_local->start); ++ ++wait_notifier: ++ pr_err("%s: Restorer abnormal termination for %ld\n", __func__, sys_getpid()); ++ futex_wait_while(&notifier_done, 0); ++ + sys_exit_group(1); + return -1; + } +@@ -1465,6 +1529,10 @@ long __export_restore_task(struct task_restore_args *args) + 
rt_sigaction_t act; + bool has_vdso_proxy; + ++ futex_set(&thread_inprogress, 1); ++ futex_set(&thread_start, 0); ++ futex_set(&notifier_done, 0); ++ + bootstrap_start = args->bootstrap_start; + bootstrap_len = args->bootstrap_len; + +@@ -1481,6 +1549,7 @@ long __export_restore_task(struct task_restore_args *args) + #ifdef ARCH_HAS_LONG_PAGES + __page_size = args->page_size; + #endif ++ args_with_notifier_kup = args->with_notifier_kup; + + ksigfillset(&act.rt_sa_mask); + act.rt_sa_handler = sigchld_handler; +@@ -1895,7 +1964,8 @@ long __export_restore_task(struct task_restore_args *args) + pr_err("Unable to create a thread: %ld\n", ret); + mutex_unlock(&task_entries_local->last_pid_mutex); + goto core_restore_end; +- } ++ } else ++ futex_inc(&thread_inprogress); + } + + mutex_unlock(&task_entries_local->last_pid_mutex); +@@ -1919,7 +1989,14 @@ long __export_restore_task(struct task_restore_args *args) + + pr_info("%ld: Restored\n", sys_getpid()); + +- restore_finish_stage(task_entries_local, CR_STATE_RESTORE); ++ if (!!(restore_finish_stage(task_entries_local, CR_STATE_RESTORE) & FUTEX_ABORT_FLAG)) { ++ pr_err("%s: abort by CR_STATE_RESTORE\n", __func__); ++ goto core_restore_end; ++ } ++ ++ ret = do_notifier(&args->with_notifier_kup); ++ if (ret) ++ goto core_restore_end; + + if (wait_helpers(args) < 0) + goto core_restore_end; +@@ -1965,7 +2042,8 @@ long __export_restore_task(struct task_restore_args *args) + if (ret) + goto core_restore_end; + +- restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD); ++ if (!!(restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD) & FUTEX_ABORT_FLAG)) ++ goto core_restore_end; + + rst_tcp_socks_all(args); + +@@ -1986,15 +2064,20 @@ long __export_restore_task(struct task_restore_args *args) + ret = ret || restore_pdeath_sig(args->t); + ret = ret || restore_child_subreaper(args->child_subreaper); + +- futex_set_and_wake(&thread_inprogress, args->nr_threads); +- +- restore_finish_stage(task_entries_local, 
CR_STATE_RESTORE_CREDS); ++ if (!!(restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS) & FUTEX_ABORT_FLAG)) ++ goto core_restore_end; + + if (ret) + BUG(); + + /* Wait until children stop to use args->task_entries */ + futex_wait_while_gt(&thread_inprogress, 1); ++ if (!!(futex_get(&thread_start) & FUTEX_ABORT_FLAG)) { ++ pr_err("%s: terminate by main thread futex_start\n", __func__); ++ goto handle_notifier; ++ } ++ ++ futex_set_and_wake(&thread_start, 1); + + sys_close(args->proc_fd); + std_log_set_fd(-1); +@@ -2030,8 +2113,17 @@ long __export_restore_task(struct task_restore_args *args) + rst_sigreturn(new_sp, rt_sigframe); + + core_restore_end: +- futex_abort_and_wake(&task_entries_local->nr_in_progress); ++ futex_abort_and_wake(&thread_start); ++ futex_abort_and_wake(&task_entries_local->start); ++ ++handle_notifier: ++ do_notifier_rollback(args->with_notifier_kup, notifier_state); ++ ++ futex_abort_and_wake(&task_entries_local->nr_in_progress); /* notify the criu main process */ + pr_err("Restorer fail %ld\n", sys_getpid()); ++ ++ futex_set_and_wake(&notifier_done, 1); /* wake all other threads to exit */ ++ + sys_exit_group(1); + return -1; + } +diff --git a/criu/pie/util.c b/criu/pie/util.c +index e7a5a9f..9871db7 100644 +--- a/criu/pie/util.c ++++ b/criu/pie/util.c +@@ -11,6 +11,7 @@ + #include "fcntl.h" + #include "log.h" + #include "util-pie.h" ++#include "notifier.h" + + #ifdef CR_NOGLIBC + #include +@@ -52,3 +53,93 @@ err_close: + __sys(close)(fd); + return -1; + } ++ ++#define KUP_BUF_SIZE 256 ++ ++static int int_to_string(unsigned number, char *buf, size_t total) { ++ unsigned remainder, quotient, i, len; ++ ++ quotient = number; ++ len = 0; ++ do { ++ quotient /= 10; ++ len += 1; ++ } while (quotient > 0); ++ ++ if (len > total - 1) ++ return -1; ++ ++ quotient = number; ++ i = 1; ++ do { ++ remainder = quotient % 10; ++ quotient = quotient / 10; ++ buf[len-i] = '0' + remainder; ++ i++; ++ } while (quotient > 0); ++ buf[len] = '\0'; ++ 
++ return len == 0 ? -1 : len; ++} ++ ++int notifier_kup(enum KUP_HOOK_POINT action, enum nvwa_cmd cmd, bool enable) ++{ ++ int fd, count = 0, retval = 0; ++ char buf[KUP_BUF_SIZE] = {0}; ++ ++ if (!enable) ++ return 0; ++ ++ fd = __sys(open)(NOTIFY_PROC_PATH, O_WRONLY, 0); ++ if (fd == -EACCES) { ++ /* there is no priviledge to open file, ignore this condition. */ ++ pr_info("%s: open %s failed, retval: %d (-EACCES)\n", ++ __func__, NOTIFY_PROC_PATH, -EACCES); ++ return 0; ++ } else if (fd < 0) { ++ __pr_perror("%s: Can't open %s: %d\n", __func__, NOTIFY_PROC_PATH, fd); ++ return fd; ++ } ++ ++ retval = int_to_string(action, buf, sizeof(buf)-count); ++ if (retval <= 0) { ++ __pr_perror("%s: int_to_string error\n", __func__); ++ goto err_close; ++ } ++ ++ buf[retval] = ':'; ++ count = retval + 1; ++ ++ retval = int_to_string(cmd, buf+count, sizeof(buf)-count); ++ if (retval <= 0) { ++ __pr_perror("%s: int_to_string error\n", __func__); ++ goto err_close; ++ } ++ ++ count += retval; ++ retval = __sys(write)(fd, buf, count); ++ if (retval < 0) ++ __pr_perror("%s: Can't write to %s\n", __func__, NOTIFY_PROC_PATH); ++ ++err_close: ++ __sys(close)(fd); ++ ++ return retval < 0 ? 
-1 : 0; ++} ++ ++void do_notifier_rollback(bool rollback, enum notifier_state status) ++{ ++ if (!rollback) ++ return; ++ ++ switch (status) { ++ case POST_UPDATE_KERNEL_COMPLETE: ++ notifier_kup(POST_UPDATE_KERNEL, ROLLBACK, true); ++ break; ++ case UNFREEZE_TO_RUN_COMPLETE: ++ notifier_kup(UNFREEZE_TO_RUN, ROLLBACK, true); ++ break; ++ default: ++ break; ++ } ++} +-- +2.34.1 + diff --git a/0023-block-device-dump-block-device-as-reguler-file.patch b/0023-block-device-dump-block-device-as-reguler-file.patch new file mode 100644 index 0000000..678cfdf --- /dev/null +++ b/0023-block-device-dump-block-device-as-reguler-file.patch @@ -0,0 +1,62 @@ +From 48c6f11d0b3c5f0549ff52cce0c8ce31ad67518f Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 19 May 2021 21:49:15 +0800 +Subject: [PATCH 23/72] block-device: dump block device as reguler file + +Add block device dump and restore method for kernel module upgrading. + +Conflict:NA +Reference:https://gitee.com/src-openeuler/criu/pulls/21 +Signed-off-by: Xiaoguang Li +--- + criu/files.c | 27 +++++++++++++++++++++++++++ + 1 file changed, 27 insertions(+) + +diff --git a/criu/files.c b/criu/files.c +index 93754fb..f262d80 100644 +--- a/criu/files.c ++++ b/criu/files.c +@@ -442,6 +442,30 @@ static const struct fdtype_ops *get_mem_dev_ops(struct fd_parms *p, int minor) + return ops; + } + ++static int dump_blkdev(struct fd_parms *p, int lfd, FdinfoEntry *e) ++{ ++ struct fd_link *link_old = p->link; ++ int maj = major(p->stat.st_rdev); ++ const struct fdtype_ops *ops; ++ int err; ++ ++ switch (maj) { ++ case SCSI_DISK0_MAJOR: ++ ops = &regfile_dump_ops; ++ break; ++ default: { ++ char more[32] = "block_dev"; ++ ++ err = dump_unsupp_fd(p, lfd, "blk", more, e); ++ p->link = link_old; ++ return err; ++ } ++ } ++ err = do_dump_gen_file(p, lfd, ops, e); ++ p->link = link_old; ++ return err; ++} ++ + static int dump_chrdev(struct fd_parms *p, int lfd, FdinfoEntry *e) + { + struct fd_link *link_old = p->link; +@@ -508,6 +532,9 @@ 
static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, + p.fd_ctl = ctl; /* Some dump_opts require this to talk to parasite */ + p.dfds = dfds; /* epoll needs to verify if target fd exist */ + ++ if (S_ISBLK(p.stat.st_mode)) ++ return dump_blkdev(&p, lfd, e); ++ + if (S_ISSOCK(p.stat.st_mode)) + return dump_socket(&p, lfd, e); + +-- +2.34.1 + diff --git a/0024-anon-inode-add-support-for-anon-inode-fd.patch b/0024-anon-inode-add-support-for-anon-inode-fd.patch new file mode 100644 index 0000000..0e2bfc4 --- /dev/null +++ b/0024-anon-inode-add-support-for-anon-inode-fd.patch @@ -0,0 +1,316 @@ +From 9bb9af3189ae8a7eadf975befa2aa30b7227259e Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 19 May 2021 21:52:49 +0800 +Subject: [PATCH 24/72] anon-inode: add support for anon inode fd + +Add support for anon inode fd dump and restore during module upgrade. + +Signed-off-by: Xiaoguang Li +Signed-off-by: Jingxian He +Signed-off-by: fu.lin +--- + criu/cr-restore.c | 3 +++ + criu/files-reg.c | 3 ++- + criu/include/image.h | 1 + + criu/include/mem.h | 1 + + criu/include/restorer.h | 6 ++++++ + criu/mem.c | 23 +++++++++++++++++++++++ + criu/pie/restorer.c | 37 +++++++++++++++++++++++++++++++++++++ + criu/proc_parse.c | 31 ++++++++++++++++++++++++++++--- + images/vma.proto | 1 + + 9 files changed, 102 insertions(+), 4 deletions(-) + +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 03511b6..b805265 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -971,6 +971,8 @@ static int restore_one_alive_task(int pid, CoreEntry *core) + if (prepare_vmas(current, ta)) + return -1; + ++ if (prepare_vma_names(current, ta)) ++ return -1; + /* + * Sockets have to be restored in their network namespaces, + * so a task namespace has to be restored after sockets. 
+@@ -3733,6 +3735,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns + #endif + + RST_MEM_FIXUP_PPTR(task_args->vmas); ++ RST_MEM_FIXUP_PPTR(task_args->vma_names); + RST_MEM_FIXUP_PPTR(task_args->rings); + RST_MEM_FIXUP_PPTR(task_args->tcp_socks); + RST_MEM_FIXUP_PPTR(task_args->timerfd); +diff --git a/criu/files-reg.c b/criu/files-reg.c +index ee54d1d..fbdf811 100644 +--- a/criu/files-reg.c ++++ b/criu/files-reg.c +@@ -2137,7 +2137,7 @@ int do_open_reg_noseek_flags(int ns_root_fd, struct reg_file_info *rfi, void *ar + + /* unnamed temporary files are restored as ghost files */ + flags &= ~O_TMPFILE; +- ++ pr_info("openat path is: %s\n", rfi->path); + fd = openat(ns_root_fd, rfi->path, flags); + if (fd < 0) { + pr_perror("Can't open file %s on restore", rfi->path); +@@ -2307,6 +2307,7 @@ int collect_filemap(struct vma_area *vma) + if (!fd) + return -1; + ++ pr_info("find fd for %lx, shmid: %lx\n", vma->e->start, vma->e->shmid); + vma->vmfd = fd; + vma->vm_open = open_filemap; + return 0; +diff --git a/criu/include/image.h b/criu/include/image.h +index 14659db..f598de7 100644 +--- a/criu/include/image.h ++++ b/criu/include/image.h +@@ -84,6 +84,7 @@ + #define VMA_AREA_VVAR (1 << 12) + #define VMA_AREA_AIORING (1 << 13) + #define VMA_AREA_MEMFD (1 << 14) ++#define VMA_AREA_ANON_INODE (1 << 15) + + #define VMA_CLOSE (1 << 28) + #define VMA_NO_PROT_WRITE (1 << 29) +diff --git a/criu/include/mem.h b/criu/include/mem.h +index 03574ea..ccf8da6 100644 +--- a/criu/include/mem.h ++++ b/criu/include/mem.h +@@ -45,6 +45,7 @@ extern int parasite_dump_pages_seized(struct pstree_item *item, struct vm_area_l + struct task_restore_args; + int open_vmas(struct pstree_item *t); + int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta); ++int prepare_vma_names(struct pstree_item *t, struct task_restore_args *ta); + int unmap_guard_pages(struct pstree_item *t); + int prepare_mappings(struct pstree_item *t); + bool 
should_dump_page(VmaEntry *vmae, u64 pme); +diff --git a/criu/include/restorer.h b/criu/include/restorer.h +index 2f7345b..a81cc1b 100644 +--- a/criu/include/restorer.h ++++ b/criu/include/restorer.h +@@ -134,6 +134,10 @@ struct restore_vma_io { + + #define RIO_SIZE(niovs) (sizeof(struct restore_vma_io) + (niovs) * sizeof(struct iovec)) + ++struct vma_names { ++ char name[PATH_MAX]; ++}; ++ + struct task_restore_args { + struct thread_restore_args *t; /* thread group leader */ + +@@ -157,6 +161,8 @@ struct task_restore_args { + VmaEntry *vmas; + unsigned int vmas_n; + ++ struct vma_names *vma_names; ++ + int vma_ios_fd; + struct restore_vma_io *vma_ios; + unsigned int vma_ios_n; +diff --git a/criu/mem.c b/criu/mem.c +index 07efdbe..00965f0 100644 +--- a/criu/mem.c ++++ b/criu/mem.c +@@ -525,6 +525,9 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasit + continue; + } + ++ if (vma_entry_is(vma_area->e, VMA_AREA_ANON_INODE)) ++ continue; ++ + ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl, &pmc, has_parent, mdc->pre_dump, + parent_predump_mode); + if (ret < 0) +@@ -1355,6 +1358,9 @@ int open_vmas(struct pstree_item *t) + filemap_ctx_init(false); + + list_for_each_entry(vma, &vmas->h, list) { ++ if (vma_area_is(vma, VMA_AREA_ANON_INODE)) ++ continue; ++ + if (!vma_area_is(vma, VMA_AREA_REGULAR) || !vma->vm_open) + continue; + +@@ -1437,3 +1443,20 @@ int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta) + + return prepare_vma_ios(t, ta); + } ++ ++int prepare_vma_names(struct pstree_item *t, struct task_restore_args *ta) ++{ ++ struct vma_area *vma; ++ struct vm_area_list *vmas = &rsti(t)->vmas; ++ ta->vma_names = (struct vma_names *)rst_mem_align_cpos(RM_PRIVATE); ++ ++ list_for_each_entry(vma, &vmas->h, list) { ++ struct vma_names *vma_names; ++ vma_names = rst_mem_alloc(sizeof(*vma_names), RM_PRIVATE); ++ if (!vma_names) ++ return -1; ++ ++ memcpy(vma_names->name, vma->e->name, strlen(vma->e->name) + 1); ++ 
} ++ return 0; ++} +diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c +index 4a1d38d..549bbd6 100644 +--- a/criu/pie/restorer.c ++++ b/criu/pie/restorer.c +@@ -68,6 +68,27 @@ + #define FALLOC_FL_PUNCH_HOLE 0x02 + #endif + ++#define ANON_PROC_PATH "/sys/kernel/modrestore/anon_state_restore" ++ ++static int restore_anon_mapping(VmaEntry *vma_entry, struct vma_names *vma_name) ++{ ++ int fd; ++ ++ fd = sys_open(ANON_PROC_PATH, O_WRONLY, 0); ++ if (fd < 0) { ++ pr_info("anon sys fs open fail:%s\n", ANON_PROC_PATH); ++ return fd; ++ } ++ pr_info("restore anon mapping: %s\n", vma_name->name); ++ ++ if (sys_write(fd, vma_name->name, 4096) < 0) { ++ sys_close(fd); ++ return -1; ++ } ++ sys_close(fd); ++ return 0; ++} ++ + #define sys_prctl_safe(opcode, val1, val2, val3) \ + ({ \ + long __ret = sys_prctl(opcode, val1, val2, val3, 0); \ +@@ -1348,6 +1369,10 @@ static bool can_restore_vdso(struct task_restore_args *args) + } + + /* ++ * pr_info("anon vma name:%s\n", vma_name->name); ++ * if (restore_anon_mapping(vma_entry, vma_name) < 0) ++ * goto core_restore_end; ++ * continue; + * There is a use-case for restoring vvar alone: valgrind (see #488). + * On the other side, we expect that vvar is touched by application + * only from vdso. 
So, we can put a stale page and proceed restore +@@ -1528,6 +1553,7 @@ long __export_restore_task(struct task_restore_args *args) + pid_t my_pid = sys_getpid(); + rt_sigaction_t act; + bool has_vdso_proxy; ++ struct vma_names *vma_name; + + futex_set(&thread_inprogress, 1); + futex_set(&thread_start, 0); +@@ -1667,6 +1693,14 @@ long __export_restore_task(struct task_restore_args *args) + */ + for (i = 0; i < args->vmas_n; i++) { + vma_entry = args->vmas + i; ++ vma_name = args->vma_names + i; ++ ++ if (vma_entry_is(vma_entry, VMA_AREA_ANON_INODE)) { ++ pr_info("anon vma name:%s\n", vma_name->name); ++ if (restore_anon_mapping(vma_entry, vma_name) < 0) ++ goto core_restore_end; ++ continue; ++ } + + if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR) && !vma_entry_is(vma_entry, VMA_AREA_AIORING)) + continue; +@@ -1784,6 +1818,9 @@ long __export_restore_task(struct task_restore_args *args) + if (!vma_entry->has_madv || !vma_entry->madv) + continue; + ++ if (vma_entry_is(vma_entry, VMA_AREA_ANON_INODE)) ++ continue; ++ + for (m = 0; m < sizeof(vma_entry->madv) * 8; m++) { + if (vma_entry->madv & (1ul << m)) { + ret = sys_madvise(vma_entry->start, vma_entry_len(vma_entry), m); +diff --git a/criu/proc_parse.c b/criu/proc_parse.c +index f3491e7..e41d43a 100644 +--- a/criu/proc_parse.c ++++ b/criu/proc_parse.c +@@ -76,6 +76,7 @@ static char *buf = __buf.buf; + */ + + #define AIO_FNAME "/[aio]" ++#define ANON_FNAME "anon_inode" + + /* check the @line starts with "%lx-%lx" format */ + static bool __is_vma_range_fmt(char *line) +@@ -171,8 +172,17 @@ static void parse_vma_vmflags(char *buf, struct vma_area *vma_area) + * only exception is VVAR area that mapped by the kernel as + * VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP + */ +- if (io_pf && !vma_area_is(vma_area, VMA_AREA_VVAR) && !vma_entry_is(vma_area->e, VMA_FILE_SHARED)) +- vma_area->e->status |= VMA_UNSUPP; ++ /* There are many types of io/pf vm_map, not only vvar, but also ++ * anon_inode, and char device. 
++ * For anon_inode and char device, we use anon_notifier to restore ++ * status. Therefore, we disable the broken code here. ++ */ ++// if (io_pf && !vma_area_is(vma_area, VMA_AREA_VVAR) && ++// !vma_area_is(vma_area, VMA_AREA_ANON_INODE)) ++// { ++// pr_info("set current status tp VMA_UNSUPP\n"); ++// vma_area->e->status |= VMA_UNSUPP; ++// } + + if (vma_area->e->madv) + vma_area->e->has_madv = true; +@@ -437,6 +447,21 @@ static int vma_get_mapfile(const char *fname, struct vma_area *vma, DIR *mfd, st + return 0; + } + ++ if (!strncmp(fname, ANON_FNAME, sizeof(ANON_FNAME) - 1)) { ++ /*anon_inode*/ ++ close_safe(vm_file_fd); ++ vma->e->status = VMA_AREA_ANON_INODE; ++ vma->e->name = xmalloc(PATH_MAX); ++ if (!vma->e->name) { ++ pr_err("alloc vma name of anon-inode fail.\n"); ++ return -1; ++ } ++ snprintf(vma->e->name, PATH_MAX - 1, "%"PRIx64"-%"PRIx64 " %s", vma->e->start, vma->e->end, fname); ++ vma->e->name[PATH_MAX - 1] = 0; ++ pr_info("set vma_area status to: %d, name:%s\n", vma->e->status, vma->e->name); ++ return 0; ++ } ++ + pr_err("Unknown shit %o (%s)\n", buf.st_mode, fname); + return -1; + } +@@ -566,6 +591,7 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_pat + vma_area->e->shmid = prev->e->shmid; + vma_area->vmst = prev->vmst; + vma_area->mnt_id = prev->mnt_id; ++ vma_area->e->name = prev->e->name; + + if (!(vma_area->e->status & VMA_AREA_SYSVIPC)) { + vma_area->e->status &= ~(VMA_FILE_PRIVATE | VMA_FILE_SHARED); +@@ -728,7 +754,6 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t du + if (IS_ERR(str)) + goto err; + eof = (str == NULL); +- + if (!eof && !__is_vma_range_fmt(str)) { + if (!strncmp(str, "Nonlinear", 9)) { + BUG_ON(!vma_area); +diff --git a/images/vma.proto b/images/vma.proto +index 0c07d51..1aa30f9 100644 +--- a/images/vma.proto ++++ b/images/vma.proto +@@ -24,4 +24,5 @@ message vma_entry { + + /* file status flags */ + optional uint32 fdflags = 10 [(criu).hex = true]; ++ 
required string name = 11; + } +-- +2.34.1 + diff --git a/0025-char_dev-add-support-for-char-device-dump-and-restor.patch b/0025-char_dev-add-support-for-char-device-dump-and-restor.patch new file mode 100644 index 0000000..8c30f59 --- /dev/null +++ b/0025-char_dev-add-support-for-char-device-dump-and-restor.patch @@ -0,0 +1,784 @@ +From 2eebb9de411333628ce8fc5894f072b6ed6179e0 Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 19 May 2021 21:55:34 +0800 +Subject: [PATCH 25/72] char_dev: add support for char device dump and restore + +Add support for char device dump and restore during module upgrade. + +`/sys/kernel/repairing_device` provides the char device whitelist +with `IOCTL_CMD_{NEEDREPAIR, REPAIR}` command besides the internal +device list. +The device modules could use `mures_{add, del}_devname()` to add, or +delete the char device whitelist dynamically. + +Signed-off-by: Xiaoguang Li +Signed-off-by: Jingxian He +Signed-off-by: fu.lin +--- + criu/Makefile.crtools | 2 + + criu/config.c | 1 + + criu/cr-dump.c | 4 ++ + criu/cr-restore.c | 4 +- + criu/crtools.c | 2 + + criu/devname.c | 130 +++++++++++++++++++++++++++++++++++ + criu/files-chr.c | 104 ++++++++++++++++++++++++++++ + criu/files-reg.c | 6 +- + criu/files.c | 93 ++++++++++++++++++++++++- + criu/include/cr_options.h | 1 + + criu/include/files-chr.h | 25 +++++++ + criu/include/files.h | 6 ++ + criu/include/image-desc.h | 1 + + criu/include/image.h | 1 + + criu/include/protobuf-desc.h | 1 + + criu/mem.c | 7 +- + criu/proc_parse.c | 21 +++++- + images/Makefile | 1 + + images/chr.proto | 12 ++++ + images/fdinfo.proto | 3 + + 20 files changed, 417 insertions(+), 8 deletions(-) + create mode 100644 criu/devname.c + create mode 100644 criu/files-chr.c + create mode 100644 criu/include/files-chr.h + create mode 100644 images/chr.proto + +diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools +index 98c4135..2e82912 100644 +--- a/criu/Makefile.crtools ++++ b/criu/Makefile.crtools +@@ -91,6 +91,8 @@ 
obj-y += pie-util-vdso.o + obj-y += vdso.o + obj-y += timens.o + obj-y += pin-mem.o ++obj-y += devname.o ++obj-y += files-chr.o + obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o + obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o + CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 +diff --git a/criu/config.c b/criu/config.c +index 5d1cff6..03cad66 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -701,6 +701,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, + { "network-lock", required_argument, 0, 1100 }, + BOOL_OPT("use-fork-pid", &opts.use_fork_pid), + BOOL_OPT("with-notifier", &opts.with_notifier_kup), ++ BOOL_OPT("dump-char-dev", &opts.dump_char_dev), + {}, + }; + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index 50a2f9b..fd17413 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -88,6 +88,7 @@ + #include "asm/dump.h" + #include "pin-mem.h" + #include "notifier.h" ++#include "files-chr.h" + + /* + * Architectures can overwrite this function to restore register sets that +@@ -1880,6 +1881,9 @@ int cr_pre_dump_tasks(pid_t pid) + */ + rlimit_unlimit_nofile(); + ++ if (opts.dump_char_dev && parse_devname() < 0) ++ goto err; ++ + root_item = alloc_pstree_item(); + if (!root_item) + goto err; +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index b805265..2904a75 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -332,11 +332,11 @@ static int root_prepare_shared(void) + if (pi->pid->state == TASK_HELPER) + continue; + +- ret = prepare_mm_pid(pi); ++ ret = prepare_fd_pid(pi); + if (ret < 0) + break; + +- ret = prepare_fd_pid(pi); ++ ret = prepare_mm_pid(pi); + if (ret < 0) + break; + +diff --git a/criu/crtools.c b/criu/crtools.c +index 1d08620..dc6d603 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -451,6 +451,8 @@ usage: + " --use-fork-pid Allow to restore task pid by setting fork pid of task struct.\n" + " --with-notifier Allow to checkpoint/restore kup notifier chain.\n" + " This feature needs the kernel 
assistance.\n" ++ " --dump-char-dev Dump char dev files as normal file with repair cmd\n" ++ \ + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/devname.c b/criu/devname.c +new file mode 100644 +index 0000000..5f6fbed +--- /dev/null ++++ b/criu/devname.c +@@ -0,0 +1,130 @@ ++#include ++#include ++#include ++#include ++ ++#include "log.h" ++#include "common/xmalloc.h" ++ ++#define REPAIRING_DEVICE_FILE "/sys/kernel/repairing_device" ++#define ASCII_SIZE 128 ++ ++static void *root_bucket[ASCII_SIZE]; ++ ++static int insert_devname_internal(void *bucket[], const char *name) ++{ ++ void *new = NULL; ++ int idx = *name; ++ ++ if (bucket[idx] != NULL) ++ return insert_devname_internal(bucket[idx], name+1); ++ else if (idx == '\0') { ++ new = xmalloc(sizeof(void *)); ++ if (!new) { ++ pr_perror("alloc devname failed\n"); ++ return -1; ++ } ++ bucket[idx] = new; ++ return 0; ++ } else { ++ new = xmalloc(sizeof(void *) * ASCII_SIZE); ++ if (!new) { ++ pr_perror("alloc devname failed\n"); ++ return -1; ++ } ++ memset(new, 0, sizeof(void *) * ASCII_SIZE); ++ bucket[idx] = new; ++ return insert_devname_internal(bucket[idx], name+1); ++ } ++} ++ ++int insert_devname(const char *devname) ++{ ++ if (devname == NULL || *devname == '\0') // ignore ++ return 0; ++ ++ pr_debug("insert device '%s'\n", devname); ++ return insert_devname_internal(root_bucket, devname); ++} ++ ++int parse_devname(void) ++{ ++ int retval = -1; ++ char *line = NULL; ++ size_t len = 0; ++ ssize_t nread = 0; ++ FILE *fp = NULL; ++ ++ fp = fopen(REPAIRING_DEVICE_FILE, "r"); ++ if (fp == NULL) { ++ pr_info("Unable to open %s, downgrade to use internal whitelist\n", ++ REPAIRING_DEVICE_FILE); ++ return 0; ++ } ++ ++ while ((nread = getline(&line, &len, fp)) != -1) { ++ if (nread <= 1) // ignore empty string ++ continue; ++ ++ line[nread-1] = '\0'; // drop '\n' ++ retval = insert_devname(line); ++ if (retval != 0) ++ goto out; ++ } ++ 
retval = 0; ++ ++out: ++ free(line); ++ fclose(fp); ++ return retval; ++} ++ ++static const char *steal_devname(const char *name, ssize_t len) ++{ ++ ssize_t off = len; ++ ++ for (off -= 1; off > 0; off--) { ++ if (name[off] == '/') ++ break; ++ } ++ ++ return name + off + 1; ++} ++ ++static bool find_devname_internal(void *bucket[], const char *name) ++{ ++ int idx = *name; ++ ++ if (*name == '\0' && bucket[idx] != NULL) ++ return true; ++ else if (bucket[idx] == NULL) ++ return false; ++ else { ++ return find_devname_internal(bucket[idx], name+1); ++ } ++} ++ ++bool find_devname(const char *name) ++{ ++ const char *devname; ++ size_t len = 0; ++ bool found = false; ++ ++ if (name == NULL) ++ return false; ++ else if ((len = strlen(name)) == 0) ++ return false; ++ ++ devname = steal_devname(name, len); ++ found = find_devname_internal(root_bucket, devname); ++ ++ pr_debug("device '%s' (original name '%s') %s found in %s\n", ++ devname, name, found ? "is" : "isn't", REPAIRING_DEVICE_FILE); ++ ++ /* Compatible with the old version, there are still `strstr` branch in the following */ ++ found |= (strstr(name, "uverbs") != NULL ++ || strstr(name, "rdma_cm") != NULL ++ || strstr(name, "umad") != NULL); ++ ++ return found; ++} +diff --git a/criu/files-chr.c b/criu/files-chr.c +new file mode 100644 +index 0000000..2eb023e +--- /dev/null ++++ b/criu/files-chr.c +@@ -0,0 +1,104 @@ ++#include ++ ++#include "imgset.h" ++#include "pstree.h" ++#include "files-chr.h" ++#include "log.h" ++ ++#include "protobuf.h" ++ ++/* Checks if file descriptor @lfd is infinibandevent */ ++int is_infiniband_link(char *link) ++{ ++ return is_anon_link_type(link, "[infinibandevent]"); ++} ++ ++static int chrfile_open(struct file_desc *d, int *new_fd) ++{ ++ int fd, mntns_root; ++ int ret = 0; ++ struct chrfile_info *ci; ++ ++ ci = container_of(d, struct chrfile_info, d); ++ ++ if (ci->cfe->repair) ++ ci->cfe->flags |= O_REPAIR; ++ ++ mntns_root = open_pid_proc(getpid()); ++ fd = 
openat(mntns_root, ci->path, ci->cfe->flags); ++ if (fd < 0){ ++ pr_err("open chr file failed\n"); ++ return -1; ++ } ++ ++ if (ci->cfe->repair) { ++ ret = ioctl(fd, IOCTL_CMD_REPAIR , ci->cfe->index); ++ pr_info("repair ioctl return: %d, index: %d\n", ret, ci->cfe->index); ++ if (ret) ++ goto err; ++ } ++ ++ *new_fd = fd; ++ return ret; ++err: ++ close(fd); ++ return ret; ++} ++ ++static struct file_desc_ops chrfile_desc_ops = { ++ .type = FD_TYPES__CHR, ++ .open = chrfile_open, ++}; ++ ++static int collect_one_chrfile(void *o, ProtobufCMessage *base, struct cr_img *i) ++{ ++ struct chrfile_info *ci = o; ++ static char dot[] = "."; ++ ++ ci->cfe = pb_msg(base, ChrfileEntry); ++ if (ci->cfe->name[1] == '\0') ++ ci->path = dot; ++ else ++ ci->path = ci->cfe->name; ++ ++ pr_info("Collected chr file: %#x, name: %s\n", ci->cfe->id, ci->path); ++ file_desc_add(&ci->d, ci->cfe->id, &chrfile_desc_ops); ++ ++ return 0; ++} ++ ++struct collect_image_info chrfile_cinfo = { ++ .fd_type = CR_FD_CHRFILE, ++ .pb_type = PB_CHRFILE, ++ .priv_size = sizeof(struct chrfile_info), ++ .collect = collect_one_chrfile, ++}; ++ ++int collect_chr_map(struct pstree_item *me, struct vma_area *vma) ++{ ++ struct list_head *list = &rsti(me)->fds; ++ struct fdinfo_list_entry *fle, *tmp; ++ struct chrfile_info *ci; ++ bool exist_fd = false; /* set once a chr fd whose path equals vma->e->name is found */ ++ ++ ++ list_for_each_entry_safe(fle, tmp, list, ps_list) { ++ struct file_desc *d = fle->desc; ++ ++ if (d->ops->type != FD_TYPES__CHR) ++ continue; ++ ++ ci = container_of(d, struct chrfile_info, d); ++ if (!strcmp(vma->e->name, ci->path)) { ++ vma->vmfd = d; ++ vma->e->fd = fle->fe->fd; ++ exist_fd = true; ++ break; ++ } ++ } ++ ++ if (!exist_fd) ++ return -EEXIST; ++ ++ return 0; ++} +diff --git a/criu/files-reg.c b/criu/files-reg.c +index fbdf811..b9576a4 100644 +--- a/criu/files-reg.c ++++ b/criu/files-reg.c +@@ -45,6 +45,7 @@ + #include "fault-injection.h" + #include "external.h" + #include "memfd.h" ++#include "files-chr.h" + + #include "protobuf.h" + 
#include "util.h" +@@ -1640,7 +1641,8 @@ int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) + rfe.has_mnt_id = true; + } + +- pr_info("Dumping path for %d fd via self %d [%s]\n", p->fd, lfd, &link->name[1]); ++ pr_info("Dumping path for %d fd via self %d [%s], id: %d\n", ++ p->fd, lfd, &link->name[1], id); + + /* + * The regular path we can handle should start with slash. +@@ -2373,7 +2375,7 @@ static int collect_one_regfile(void *o, ProtobufCMessage *base, struct cr_img *i + rfi->remap = NULL; + rfi->size_mode_checked = false; + +- pr_info("Collected [%s] ID %#x\n", rfi->path, rfi->rfe->id); ++ pr_info("Collected regfile [%s] ID %#x\n", rfi->path, rfi->rfe->id); + return file_desc_add(&rfi->d, rfi->rfe->id, &reg_desc_ops); + } + +diff --git a/criu/files.c b/criu/files.c +index f262d80..e1681a1 100644 +--- a/criu/files.c ++++ b/criu/files.c +@@ -49,6 +49,7 @@ + #include "kerndat.h" + #include "fdstore.h" + #include "bpfmap.h" ++#include "files-chr.h" + + #include "protobuf.h" + #include "util.h" +@@ -325,10 +326,32 @@ int do_dump_gen_file(struct fd_parms *p, int lfd, const struct fdtype_ops *ops, + e->fd = p->fd; + e->flags = p->fd_flags; + ++ pr_info("fdinfoEntry fd: %d\n", e->fd); + ret = fd_id_generate(p->pid, e, p); + if (ret == 1) /* new ID generated */ + ret = ops->dump(lfd, e->id, p); +- else ++ else if (ops->type == FD_TYPES__CHR) { ++ /* ++ * Sometimes the app_data subprocess may inherit the fd from ++ * app_data. Those fds may result the unconditional oops during ++ * the restoration of app_data. Therefore, prevent the dump in ++ * those condition. 
++ */ ++ struct fd_link _link, *link; ++ ++ if (!p->link) { ++ if (fill_fdlink(lfd, p, &_link)) ++ return -1; ++ link = &_link; ++ } else ++ link = p->link; ++ ++ if (find_devname(link->name)) { ++ pr_err("char dev '%s' fd %d is owned by multi-processes\n", ++ link->name, e->fd); ++ ret = -1; ++ } ++ } else + /* Remove locks generated by the fd before going to the next */ + discard_dup_locks_tail(p->pid, e->fd); + +@@ -466,6 +489,58 @@ static int dump_blkdev(struct fd_parms *p, int lfd, FdinfoEntry *e) + return err; + } + ++static int dump_chr_file(int lfd, u32 id, const struct fd_parms *p) ++{ ++ int ret; ++ struct fd_link _link, *link; ++ struct cr_img *img; ++ FileEntry fe = FILE_ENTRY__INIT; ++ ChrfileEntry cfe = CHRFILE_ENTRY__INIT; ++ ++ if (!p->link) { ++ if (fill_fdlink(lfd, p, &_link)) ++ return -1; ++ link = &_link; ++ } else ++ link = p->link; ++ ++ pr_info("Dumping chr-file fd %d with lfd %d with id %d, name: %s\n", p->fd, lfd, id, link->name); ++ ++ if (strstr(link->name, "(deleted)") != NULL) { ++ pr_err("char device '%s' is deleted\n", link->name); ++ return -ENXIO; ++ } ++ ++ cfe.repair = false; ++ if (find_devname(link->name)) { ++ ret = ioctl(lfd, IOCTL_CMD_NEEDREPAIR, 0); ++ if (ret <= 0) { ++ pr_err("ioctl cmd needrepair failed, errno: %d, %s\n", ret, strerror(errno)); ++ return -1; ++ } else { ++ pr_info("char device needrepair cmd return: %d\n", ret); ++ cfe.index = ret; ++ cfe.repair = true; ++ } ++ } ++ ++ cfe.id = id; ++ cfe.name = &link->name[1]; ++ cfe.flags = p->flags; ++ fe.type = FD_TYPES__CHR; ++ fe.id = cfe.id; ++ fe.chr = &cfe; ++ ++ img = img_from_set(glob_imgset, CR_FD_FILES); ++ ret = pb_write_one(img, &fe, PB_FILE); ++ return ret; ++} ++ ++const struct fdtype_ops chr_dump_ops = { ++ .type = FD_TYPES__CHR, ++ .dump = dump_chr_file, ++}; ++ + static int dump_chrdev(struct fd_parms *p, int lfd, FdinfoEntry *e) + { + struct fd_link *link_old = p->link; +@@ -493,6 +568,10 @@ static int dump_chrdev(struct fd_parms *p, int lfd, 
FdinfoEntry *e) + ops = &tty_dump_ops; + break; + } ++ if (opts.dump_char_dev) { ++ ops = &chr_dump_ops; ++ break; ++ } + + sprintf(more, "%d:%d", maj, minor(p->stat.st_rdev)); + err = dump_unsupp_fd(p, lfd, "chr", more, e); +@@ -559,6 +638,8 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, + ops = &signalfd_dump_ops; + else if (is_timerfd_link(link)) + ops = &timerfd_dump_ops; ++ else if (is_infiniband_link(link)) ++ return 1; + #ifdef CONFIG_HAS_LIBBPF + else if (is_bpfmap_link(link)) + ops = &bpfmap_dump_ops; +@@ -663,6 +744,11 @@ int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, s + ret = dump_one_file(item->pid, dfds->fds[i + off], lfds[i], opts + i, ctl, &e, dfds); + if (ret) + break; ++ /* infiniband link file */ ++ if (ret > 0) { ++ ret = 0; ++ continue; ++ } + + ret = pb_write_one(img, &e, PB_FDINFO); + if (ret) +@@ -917,6 +1003,7 @@ int prepare_fd_pid(struct pstree_item *item) + if (!img) + return -1; + ++ pr_info("prepare_fd_pid\n"); + while (1) { + FdinfoEntry *e; + +@@ -1125,6 +1212,7 @@ int setup_and_serve_out(struct fdinfo_list_entry *fle, int new_fd) + if (reopen_fd_as(fle->fe->fd, new_fd)) + return -1; + ++ pr_info("*******flags: %d",fle->fe->flags); + if (fcntl(fle->fe->fd, F_SETFD, fle->fe->flags) == -1) { + pr_perror("Unable to set file descriptor flags"); + return -1; +@@ -1761,6 +1849,9 @@ static int collect_one_file(void *o, ProtobufCMessage *base, struct cr_img *i) + ret = collect_one_file_entry(fe, fe->bpf->id, &fe->bpf->base, &bpfmap_cinfo); + break; + #endif ++ case FD_TYPES__CHR: ++ ret = collect_one_file_entry(fe, fe->chr->id, &fe->chr->base, &chrfile_cinfo); ++ break; + } + + return ret; +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 039edba..226acb2 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -193,6 +193,7 @@ struct cr_options { + int pin_memory; + int use_fork_pid; + int with_notifier_kup; ++ int dump_char_dev; 
+ }; + + extern struct cr_options opts; +diff --git a/criu/include/files-chr.h b/criu/include/files-chr.h +new file mode 100644 +index 0000000..5be11f5 +--- /dev/null ++++ b/criu/include/files-chr.h +@@ -0,0 +1,25 @@ ++#ifndef __CRIU_FILES_CHR_H__ ++#define __CRIU_FILES_CHR_H__ ++ ++#include "files.h" ++ ++#include "images/chr.pb-c.h" ++ ++struct chrfile_info { ++ struct file_desc d; ++ ChrfileEntry *cfe; ++ char *path; ++}; ++ ++extern struct collect_image_info chrfile_cinfo; ++ ++extern const struct fdtype_ops chr_dump_ops; ++extern int collect_chr_map(struct pstree_item *me, struct vma_area *); ++ ++int parse_devname(void); ++bool find_devname(const char *name); ++ ++int collect_chr_map(struct pstree_item *me, struct vma_area *vma); ++int is_infiniband_link(char *link); ++ ++#endif /* __CRIU_FILES_CHR_H__ */ +diff --git a/criu/include/files.h b/criu/include/files.h +index 96face7..1d979a9 100644 +--- a/criu/include/files.h ++++ b/criu/include/files.h +@@ -15,6 +15,12 @@ + #include "images/fown.pb-c.h" + #include "images/vma.pb-c.h" + ++#ifndef IOCTL_CMD_NEEDREPAIR ++#define IOCTL_CMD_NEEDREPAIR 0x00100000UL ++#define IOCTL_CMD_REPAIR 0x00200000UL ++#define O_REPAIR 040000000 ++#endif ++ + struct parasite_drain_fd; + struct pstree_item; + struct file_desc; +diff --git a/criu/include/image-desc.h b/criu/include/image-desc.h +index 5045bae..e35f8b2 100644 +--- a/criu/include/image-desc.h ++++ b/criu/include/image-desc.h +@@ -115,6 +115,7 @@ enum { + CR_FD_MEMFD_FILE, + + CR_FD_AUTOFS, ++ CR_FD_CHRFILE, + + CR_FD_MAX + }; +diff --git a/criu/include/image.h b/criu/include/image.h +index f598de7..66492c0 100644 +--- a/criu/include/image.h ++++ b/criu/include/image.h +@@ -85,6 +85,7 @@ + #define VMA_AREA_AIORING (1 << 13) + #define VMA_AREA_MEMFD (1 << 14) + #define VMA_AREA_ANON_INODE (1 << 15) ++#define VMA_AREA_CHR (1 << 16) + + #define VMA_CLOSE (1 << 28) + #define VMA_NO_PROT_WRITE (1 << 29) +diff --git a/criu/include/protobuf-desc.h b/criu/include/protobuf-desc.h 
+index 3824de1..2468e8f 100644 +--- a/criu/include/protobuf-desc.h ++++ b/criu/include/protobuf-desc.h +@@ -70,6 +70,7 @@ enum { + PB_BPFMAP_FILE, + PB_BPFMAP_DATA, + PB_APPARMOR, ++ PB_CHRFILE, + + /* PB_AUTOGEN_STOP */ + +diff --git a/criu/mem.c b/criu/mem.c +index 00965f0..b955d66 100644 +--- a/criu/mem.c ++++ b/criu/mem.c +@@ -32,6 +32,7 @@ + #include "compel/infect-util.h" + #include "pidfd-store.h" + #include "pin-mem.h" ++#include "files-chr.h" + + #include "protobuf.h" + #include "images/pagemap.pb-c.h" +@@ -717,7 +718,9 @@ int prepare_mm_pid(struct pstree_item *i) + + pr_info("vma 0x%" PRIx64 " 0x%" PRIx64 "\n", vma->e->start, vma->e->end); + +- if (vma_area_is(vma, VMA_ANON_SHARED)) ++ if (vma_area_is(vma, VMA_AREA_CHR)) ++ ret = collect_chr_map(i, vma); ++ else if (vma_area_is(vma, VMA_ANON_SHARED)) + ret = collect_shmem(pid, vma); + else if (vma_area_is(vma, VMA_FILE_PRIVATE) || vma_area_is(vma, VMA_FILE_SHARED)) + ret = collect_filemap(vma); +@@ -1358,7 +1361,7 @@ int open_vmas(struct pstree_item *t) + filemap_ctx_init(false); + + list_for_each_entry(vma, &vmas->h, list) { +- if (vma_area_is(vma, VMA_AREA_ANON_INODE)) ++ if (vma_area_is(vma, VMA_AREA_ANON_INODE) || vma_area_is(vma, VMA_AREA_CHR)) + continue; + + if (!vma_area_is(vma, VMA_AREA_REGULAR) || !vma->vm_open) +diff --git a/criu/proc_parse.c b/criu/proc_parse.c +index e41d43a..8913d93 100644 +--- a/criu/proc_parse.c ++++ b/criu/proc_parse.c +@@ -603,11 +603,30 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_pat + } else if (*vm_file_fd >= 0) { + struct stat *st_buf = vma_area->vmst; + ++ pr_info("file mode is: %x, st_ino: %ld\n", ++ st_buf->st_mode, st_buf->st_ino); + if (S_ISREG(st_buf->st_mode)) + /* regular file mapping -- supported */; + else if (S_ISCHR(st_buf->st_mode) && (st_buf->st_rdev == DEVZERO)) + /* devzero mapping -- also makes sense */; +- else { ++ else if (S_ISCHR(st_buf->st_mode) && opts.dump_char_dev) { ++ /* NOTICE: if `--dump-char-dev` 
option is set, permmit ++ * all char device memory area dumping. ++ */ ++ if (strstr(file_path, "uverbs") != NULL) { ++ int len = strlen(file_path) + 1; ++ ++ vma_area->e->status |= VMA_AREA_CHR; ++ vma_area->e->name = xmalloc(len); ++ if (!vma_area->e->name) { ++ pr_err("alloc vma area name failed\n"); ++ goto err; ++ strncpy(vma_area->e->name, file_path, len); ++ pr_info("vma name content is: %s\n", ++ vma_area->e->name); ++ } ++ } ++ } else { + pr_err("Can't handle non-regular mapping on %d's map %" PRIx64 "\n", pid, vma_area->e->start); + goto err; + } +diff --git a/images/Makefile b/images/Makefile +index 004e22e..37dff9a 100644 +--- a/images/Makefile ++++ b/images/Makefile +@@ -72,6 +72,7 @@ proto-obj-y += bpfmap-file.o + proto-obj-y += bpfmap-data.o + proto-obj-y += apparmor.o + proto-obj-y += rseq.o ++proto-obj-y += chr.o + + CFLAGS += -iquote $(obj)/ + +diff --git a/images/chr.proto b/images/chr.proto +new file mode 100644 +index 0000000..67929db +--- /dev/null ++++ b/images/chr.proto +@@ -0,0 +1,12 @@ ++syntax = "proto2"; ++ ++import "opts.proto"; ++ ++message chrfile_entry { ++ required uint32 id = 1; ++ required uint32 flags = 2 [(criu).flags = "rfile.flags"]; ++ required uint32 index = 3; ++ required string name = 4; ++ required bool repair = 5; ++}; ++ +diff --git a/images/fdinfo.proto b/images/fdinfo.proto +index 88f1c11..6549472 100644 +--- a/images/fdinfo.proto ++++ b/images/fdinfo.proto +@@ -20,6 +20,7 @@ import "pipe.proto"; + import "tty.proto"; + import "memfd.proto"; + import "bpfmap-file.proto"; ++import "chr.proto"; + + enum fd_types { + UND = 0; +@@ -42,6 +43,7 @@ enum fd_types { + TIMERFD = 17; + MEMFD = 18; + BPFMAP = 19; ++ CHR = 21; + + /* Any number above the real used. 
Not stored to image */ + CTL_TTY = 65534; +@@ -78,4 +80,5 @@ message file_entry { + optional tty_file_entry tty = 19; + optional memfd_file_entry memfd = 20; + optional bpfmap_file_entry bpf = 21; ++ optional chrfile_entry chr = 23; + } +-- +2.34.1 + diff --git a/0026-improve-char-dev-fd-check-and-repair-method.patch b/0026-improve-char-dev-fd-check-and-repair-method.patch new file mode 100644 index 0000000..5d7a9a4 --- /dev/null +++ b/0026-improve-char-dev-fd-check-and-repair-method.patch @@ -0,0 +1,74 @@ +From 539add7149df575d6d8cdce60ad6fb2c2300e27d Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Sun, 24 Oct 2021 15:20:27 +0800 +Subject: [PATCH 26/72] improve char dev fd check and repair method + +Some special char dev cannot work in child processes, we make dump fail +when the special char dev fd is in child processes. +In the char dev repair process, user may need recover fd. We should +make the repair process running after the char dev fd is reopened as dumped fd. + +Conflict:NA +Reference:https://gitee.com/src-openeuler/criu/pulls/21 +Signed-off-by: Jingxian He +--- + criu/files-chr.c | 11 +---------- + criu/files.c | 12 ++++++++++++ + 2 files changed, 13 insertions(+), 10 deletions(-) + +diff --git a/criu/files-chr.c b/criu/files-chr.c +index 2eb023e..315e9c6 100644 +--- a/criu/files-chr.c ++++ b/criu/files-chr.c +@@ -31,17 +31,8 @@ static int chrfile_open(struct file_desc *d, int *new_fd) + return -1; + } + +- if (ci->cfe->repair) { +- ret = ioctl(fd, IOCTL_CMD_REPAIR , ci->cfe->index); +- pr_info("repair ioctl return: %d, index: %d\n", ret, ci->cfe->index); +- if (ret) +- goto err; +- } +- + *new_fd = fd; +- return ret; +-err: +- close(fd); ++ + return ret; + } + +diff --git a/criu/files.c b/criu/files.c +index e1681a1..7b688f5 100644 +--- a/criu/files.c ++++ b/criu/files.c +@@ -1231,6 +1231,7 @@ static int open_fd(struct fdinfo_list_entry *fle) + struct file_desc *d = fle->desc; + struct fdinfo_list_entry *flem; + int new_fd = -1, ret; ++ struct 
chrfile_info *ci; + + flem = file_master(d); + if (fle != flem) { +@@ -1258,6 +1259,17 @@ static int open_fd(struct fdinfo_list_entry *fle) + if (ret != -1 && new_fd >= 0) { + if (setup_and_serve_out(fle, new_fd) < 0) + return -1; ++ if (d->ops->type == FD_TYPES__CHR) { ++ ci = container_of(d, struct chrfile_info, d); ++ if (ci->cfe->repair) { ++ ret = ioctl(fle->fe->fd, IOCTL_CMD_REPAIR , ci->cfe->index); ++ pr_info("repair ioctl return: %d, index: %d\n", ret, ci->cfe->index); ++ if (ret) { ++ close(fle->fe->fd); ++ return -1; ++ } ++ } ++ } + } + out: + if (ret == 0) +-- +2.34.1 + diff --git a/0027-mmap-restore-dev-hisi_sec2-deivce-vma.patch b/0027-mmap-restore-dev-hisi_sec2-deivce-vma.patch new file mode 100644 index 0000000..ed5d313 --- /dev/null +++ b/0027-mmap-restore-dev-hisi_sec2-deivce-vma.patch @@ -0,0 +1,472 @@ +From fe19a2639373175c134fa51a7c1c26ca5306d22c Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Fri, 10 Sep 2021 16:06:55 +0800 +Subject: [PATCH 27/72] mmap: restore /dev/hisi_sec2* deivce vma + +There are two kinds of vmas: anonymous vma and file-based vma. For +anonymous vma, criu just map area and fill content to it; for file-based +vma, criu preprocess it, such as setting `open_vm()` callback function. + +`/dev/hisi_sec2*` char device is different from the normal. The `open`, +`mmap`, and `close` syscall actions has a special meaning. + - `open`: allocate physical resource of the device + - `mmap`: create instance + - `close`: release physical resource +The vma means the instance in this device. One fd may be associated with +a group instances: one mmio (vma size is 2 pages, pgoff is 0), one dus +(vma size is 37 pages, pgoff is 0x2000). As for dus vma, it's split two +vmas by `mprotect(addr, 0x5000, PROT_READ)`: one size is 0x20000, one +size is 0x5000. + +This patch makes the /dev/hisi_sec* restore possible. Idea: + It's impossible for criu to know the relationship between vma and the +mapped file fd. 
Therefore, just collect the total fds number during +collecting /dev/hisi_sec* files, then the fd is tagged that which +function is used during vma restoration, and assign the unused fd to the +specific vma. And during `mmap()` process, dus vma is split by `mprotect`. + +Note: +- criu use ino to index the fd. +- this physical device drivers is hisi_sec2.ko, which is located in + `drivers/crypto/hisilicon/sec2/` of linux kernel. +- this device name has prefix "hisi_sec2" that is found from + `drivers/crypto/hisilicon/sec2/sec_main.c`. + +Conflict:NA +Reference:https://gitee.com/src-openeuler/criu/pulls/21 +Signed-off-by: fu.lin +--- + criu/files-chr.c | 130 +++++++++++++++++++++++++++++++++++++-- + criu/include/files-chr.h | 16 +++++ + criu/include/vma.h | 12 ++++ + criu/pie/restorer.c | 130 ++++++++++++++++++++++++++++++++++++++- + criu/proc_parse.c | 4 +- + 5 files changed, 284 insertions(+), 8 deletions(-) + +diff --git a/criu/files-chr.c b/criu/files-chr.c +index 315e9c6..95d93e1 100644 +--- a/criu/files-chr.c ++++ b/criu/files-chr.c +@@ -6,6 +6,9 @@ + #include "log.h" + + #include "protobuf.h" ++#include "rst-malloc.h" ++ ++static unsigned hisi_sec_fds_n; + + /* Checks if file descriptor @lfd is infinibandevent */ + int is_infiniband_link(char *link) +@@ -16,11 +19,14 @@ int is_infiniband_link(char *link) + static int chrfile_open(struct file_desc *d, int *new_fd) + { + int fd, mntns_root; +- int ret = 0; ++ int ret = -1; + struct chrfile_info *ci; + + ci = container_of(d, struct chrfile_info, d); + ++ pr_info("charfile: Opening %s (repair %d index %d)\n", ++ ci->path, ci->cfe->repair, ci->cfe->index); ++ + if (ci->cfe->repair) + ci->cfe->flags |= O_REPAIR; + +@@ -32,6 +38,7 @@ static int chrfile_open(struct file_desc *d, int *new_fd) + } + + *new_fd = fd; ++ ret = 0; + + return ret; + } +@@ -52,10 +59,12 @@ static int collect_one_chrfile(void *o, ProtobufCMessage *base, struct cr_img *i + else + ci->path = ci->cfe->name; + +- pr_info("Collected chr file: 
%#x, name: %s\n", ci->cfe->id, ci->path); +- file_desc_add(&ci->d, ci->cfe->id, &chrfile_desc_ops); ++ /* collect `/dev/hisi_sec2*` fds */ ++ if (strstr(ci->path, HISI_SEC_DEV) != NULL) ++ hisi_sec_fds_n += 1; + +- return 0; ++ pr_info("Collected chr file: %#x, name: %s\n", ci->cfe->id, ci->path); ++ return file_desc_add(&ci->d, ci->cfe->id, &chrfile_desc_ops); + } + + struct collect_image_info chrfile_cinfo = { +@@ -65,6 +74,7 @@ struct collect_image_info chrfile_cinfo = { + .collect = collect_one_chrfile, + }; + ++static int handle_hisi_vma(struct list_head *fds, struct vma_area *vma); + int collect_chr_map(struct pstree_item *me, struct vma_area *vma) + { + struct list_head *list = &rsti(me)->fds; +@@ -72,6 +82,12 @@ int collect_chr_map(struct pstree_item *me, struct vma_area *vma) + struct chrfile_info *ci; + bool exist_fd; + ++ if (strstr(vma->e->name, HISI_SEC_DEV) != NULL) { ++ if (handle_hisi_vma(list, vma) != 0) { ++ return -1; ++ } else ++ goto out; ++ } + + list_for_each_entry_safe(fle, tmp, list, ps_list) { + struct file_desc *d = fle->desc; +@@ -91,5 +107,111 @@ int collect_chr_map(struct pstree_item *me, struct vma_area *vma) + if (!exist_fd) + return -EEXIST; + ++out: ++ pr_info(" `- find fd %ld for dev %s at this vma\n", vma->e->fd, vma->e->name); ++ ++ return 0; ++} ++ ++#define MAX_HISI_SEC_SIZE 3 /* one physical device expose three char dev */ ++static struct hlist_head hisi_sec_fds_hash[MAX_HISI_SEC_SIZE]; ++ ++static int collect_hisi_sec_fds(struct list_head *list) ++{ ++ struct fdinfo_list_entry *fle, *tmp; ++ struct chrfile_info *ci; ++ struct file_desc *d; ++ struct hisi_sec_desc *desc; ++ int idx; ++ int nr = 0; ++ ++ for (idx = 0; idx < MAX_HISI_SEC_SIZE; idx++) ++ INIT_HLIST_HEAD(&hisi_sec_fds_hash[idx]); ++ ++ list_for_each_entry_safe(fle, tmp, list, ps_list) { ++ d = fle->desc; ++ ++ if (d->ops->type != FD_TYPES__CHR) ++ continue; ++ ++ ci = container_of(d, struct chrfile_info, d); ++ ++ if (strstr(ci->path, HISI_SEC_DEV) != NULL) { ++ 
desc = shmalloc(sizeof(*desc)); ++ if (desc == NULL) ++ return -ENOMEM; ++ ++ desc->name = ci->path; ++ desc->fd = fle->fe->fd; ++ desc->mmio = desc->dus = 0; ++ ++ idx = (ci->path[strlen(ci->path)-1] - '0') % MAX_HISI_SEC_SIZE; ++ hlist_add_head(&desc->hash, &hisi_sec_fds_hash[idx]); ++ ++ nr += 1; ++ } ++ } ++ ++ return nr; ++} ++ ++static long delivery_hisi_sec_fd(struct list_head *fds, struct vma_area *vma) ++{ ++ extern unsigned hisi_sec_fds_n; /* defined in criu/files.c */ ++ static bool initialized = false; ++ struct hisi_sec_desc *desc; ++ int fd = -1, idx; ++ ++ if (!initialized) { ++ int nr; ++ ++ pr_info("find %d fds for hisi_sec char device\n", hisi_sec_fds_n); ++ ++ nr = collect_hisi_sec_fds(fds); ++ if (nr != hisi_sec_fds_n) { ++ pr_err("Collected fds(%d) aren't equal opened(%d)\n", ++ nr, hisi_sec_fds_n); ++ return -1; ++ } ++ ++ initialized = true; ++ } else if (vma->e->pgoff != HISI_SEC_MMIO && vma->e->pgoff != HISI_SEC_DUS) { ++ /* It's impossible value for fd, just as a tag to show it's a ++ * vma by `mprotect` syscall. 
++ */ ++ return LONG_MAX; ++ } ++ ++ idx = (vma->e->name[strlen(vma->e->name)-1] - '0') % MAX_HISI_SEC_SIZE; ++ hlist_for_each_entry(desc, &hisi_sec_fds_hash[idx], hash) { ++ if (strcmp(desc->name, vma->e->name) != 0) ++ continue; ++ ++ if (vma->e->pgoff == HISI_SEC_MMIO && !desc->mmio) { ++ fd = desc->fd; ++ desc->mmio = true; ++ break; ++ } else if (vma->e->pgoff == HISI_SEC_DUS && !desc->dus) { ++ fd = desc->fd; ++ desc->dus = true; ++ break; ++ } ++ } ++ ++ return fd; ++} ++ ++static int handle_hisi_vma(struct list_head *fds, struct vma_area *vma) ++{ ++ long fd = delivery_hisi_sec_fd(fds, vma); ++ ++ if (fd < 0) { ++ pr_err("find fd for char dev vma pgoff %lx named %s failed.\n", ++ vma->e->pgoff, vma->e->name); ++ return -1; ++ } ++ ++ vma->e->fd = fd; ++ + return 0; + } +diff --git a/criu/include/files-chr.h b/criu/include/files-chr.h +index 5be11f5..26b8fb2 100644 +--- a/criu/include/files-chr.h ++++ b/criu/include/files-chr.h +@@ -22,4 +22,20 @@ bool find_devname(const char *name); + int collect_chr_map(struct pstree_item *me, struct vma_area *vma); + int is_infiniband_link(char *link); + ++struct hisi_sec_desc { ++ struct hlist_node hash; ++ char *name; ++ bool mmio; ++ bool dus; ++ int fd; ++}; ++ ++#define HISI_SEC_DEV "hisi_sec2" /* `/dev/hisi_sec2*` char device */ ++ ++/* here is the selection of offset in `mmap`, they're from drivers */ ++enum hisi_sec_dev { ++ HISI_SEC_MMIO = 0x0, ++ HISI_SEC_DUS = 0x2000, ++}; ++ + #endif /* __CRIU_FILES_CHR_H__ */ +diff --git a/criu/include/vma.h b/criu/include/vma.h +index ed9f31e..2b6e86f 100644 +--- a/criu/include/vma.h ++++ b/criu/include/vma.h +@@ -125,4 +125,16 @@ static inline bool vma_entry_can_be_lazy(VmaEntry *e) + !(vma_entry_is(e, VMA_AREA_VDSO)) && !(vma_entry_is(e, VMA_AREA_VSYSCALL))); + } + ++struct vma_attr { ++ int prot; ++ int flags; ++}; ++ ++enum ALIEN_MAP_METHOD { ++ PGOFF_IS_ZERO, ++ MAP_THEN_PROTECT, ++ ++ MAX_ALIEN_MAP_METHOD, ++}; ++ + #endif /* __CR_VMA_H__ */ +diff --git 
a/criu/pie/restorer.c b/criu/pie/restorer.c +index 549bbd6..dcc922e 100644 +--- a/criu/pie/restorer.c ++++ b/criu/pie/restorer.c +@@ -37,6 +37,7 @@ + #include "uffd.h" + #include "sched.h" + #include "notifier.h" ++#include "files-chr.h" + + #include "common/lock.h" + #include "common/page.h" +@@ -861,6 +862,129 @@ static unsigned long restore_mapping(VmaEntry *vma_entry) + return addr; + } + ++static unsigned long restore_map_then_protect_mapping(VmaEntry *curr, ++ struct vma_attr *curr_attr, ++ VmaEntry *next, ++ struct vma_attr *next_attr) ++{ ++ int retval; ++ unsigned long addr; ++ ++ if (next->fd != LONG_MAX ++ || curr->end != next->start ++ || (vma_entry_len(curr) + curr->pgoff) != next->pgoff ++ || curr->prot == next->prot ++ || curr->flags != next->flags) { ++ pr_err("They looks not currect:\n"); ++ pr_err(" `- vma A: (%x %x %d %lx)\n", ++ curr_attr->prot, curr_attr->flags, ++ (int)curr->fd, curr->pgoff); ++ pr_err(" `- vma B: (%x %x %d %lx)\n", ++ next_attr->prot, next_attr->flags, ++ (int)next->fd, next->pgoff); ++ return -1; ++ } ++ ++ pr_info("\tmmap(%x %x %d %lx) in map then protect mapping\n", ++ curr_attr->prot, curr_attr->flags, ++ (int)curr->fd, curr->pgoff); ++ ++ addr = sys_mmap(decode_pointer(curr->start), ++ vma_entry_len(curr) + vma_entry_len(next), ++ curr_attr->prot, curr_attr->flags, curr->fd, curr->pgoff); ++ if (addr != curr->start) { ++ pr_err("%s: mmap failed with code %ld\n", __func__, addr); ++ goto out; ++ } ++ ++ pr_info("\t mprotect(%x)\n", next_attr->prot); ++ retval = sys_mprotect(decode_pointer(next->start), ++ vma_entry_len(next), next_attr->prot); ++ if (retval != 0) { ++ addr = retval; ++ pr_err("%s: mprotect failed with code %d\n", __func__, retval); ++ } ++ ++out: ++ return addr; ++} ++ ++static unsigned long restore_pgoff_is_zero_mapping(VmaEntry *curr, struct vma_attr *attr) ++{ ++ unsigned long addr; ++ ++ pr_debug("\tmmap(%x %x %d %lx) in pgoff is zero mapping\n", ++ attr->prot, attr->flags, (int)curr->fd, 
curr->pgoff); ++ ++ addr = sys_mmap(decode_pointer(curr->start), ++ vma_entry_len(curr), ++ attr->prot, attr->flags, ++ curr->fd, curr->pgoff); ++ ++ return addr; ++} ++ ++static unsigned long restore_hisi_sec_mapping(struct task_restore_args *args, ++ int i, int *step) ++{ ++ VmaEntry *curr = args->vmas + i; ++ VmaEntry *next = args->vmas + i + 1; ++ struct vma_attr curr_attr = { ++ .prot = curr->prot, ++ .flags = curr->flags | MAP_FIXED, ++ }; ++ struct vma_attr next_attr = { ++ .prot = next->prot, ++ .flags = next->flags | MAP_FIXED, ++ }; ++ unsigned long addr; ++ ++ switch (curr->pgoff) { ++ case HISI_SEC_MMIO: ++ addr = restore_pgoff_is_zero_mapping(curr, &curr_attr); ++ break; ++ case HISI_SEC_DUS: ++ *step = 2; ++ addr = restore_map_then_protect_mapping(curr, &curr_attr, next, &next_attr); ++ break; ++ default: ++ pr_err("invalid pgoff %lx for vma\n", curr->pgoff); ++ return -1; ++ } ++ return addr; ++} ++ ++static bool find(const char *s1, const char *s2) ++{ ++ if (s1 == NULL || s2 == NULL) ++ return NULL; ++ ++ while (*s1 != '\0' && *s2 != '\0') { ++ if (*s1 == *s2) { ++ s1 += 1; ++ s2 += 1; ++ } else ++ s1 += 1; ++ ++ if (*s2 == '\0') ++ return true; ++ } ++ ++ return false; ++} ++ ++static unsigned long distribute_restore_mapping(struct task_restore_args *args, ++ int i, int *step) ++{ ++ VmaEntry *vma = args->vmas + i; ++ struct vma_names *vma_name = args->vma_names + i; ++ ++ if (vma_entry_is(vma, VMA_AREA_CHR) && find(vma_name->name, HISI_SEC_DEV)) ++ return restore_hisi_sec_mapping(args, i, step); ++ else ++ return restore_mapping(vma); ++} ++ + /* + * This restores aio ring header, content, head and in-kernel position + * of tail. 
To set tail, we write to /dev/null and use the fact this +@@ -1542,7 +1666,7 @@ int write_fork_pid(int pid) + long __export_restore_task(struct task_restore_args *args) + { + long ret = -1; +- int i; ++ int i, step; + VmaEntry *vma_entry; + unsigned long va; + struct restore_vma_io *rio; +@@ -1691,7 +1815,7 @@ long __export_restore_task(struct task_restore_args *args) + /* + * OK, lets try to map new one. + */ +- for (i = 0; i < args->vmas_n; i++) { ++ for (i = 0, step = 1; i < args->vmas_n; i += step, step = 1) { + vma_entry = args->vmas + i; + vma_name = args->vma_names + i; + +@@ -1708,7 +1832,7 @@ long __export_restore_task(struct task_restore_args *args) + if (vma_entry_is(vma_entry, VMA_PREMMAPED)) + continue; + +- va = restore_mapping(vma_entry); ++ va = distribute_restore_mapping(args, i, &step); + + if (va != vma_entry->start) { + pr_err("Can't restore %" PRIx64 " mapping with %lx\n", vma_entry->start, va); +diff --git a/criu/proc_parse.c b/criu/proc_parse.c +index 8913d93..daa54d9 100644 +--- a/criu/proc_parse.c ++++ b/criu/proc_parse.c +@@ -41,6 +41,7 @@ + #include "path.h" + #include "fault-injection.h" + #include "memfd.h" ++#include "files-chr.h" + + #include "protobuf.h" + #include "images/fdinfo.pb-c.h" +@@ -613,7 +614,8 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_pat + /* NOTICE: if `--dump-char-dev` option is set, permmit + * all char device memory area dumping. 
+ */ +- if (strstr(file_path, "uverbs") != NULL) { ++ if (strstr(file_path, "uverbs") != NULL ++ || strstr(file_path, HISI_SEC_DEV) != NULL) { + int len = strlen(file_path) + 1; + + vma_area->e->status |= VMA_AREA_CHR; +-- +2.34.1 + diff --git a/0028-infiniband-fix-the-infiniband-fd-conflict.patch b/0028-infiniband-fix-the-infiniband-fd-conflict.patch new file mode 100644 index 0000000..45fc13d --- /dev/null +++ b/0028-infiniband-fix-the-infiniband-fd-conflict.patch @@ -0,0 +1,223 @@ +From 5ff0e810f04de4b31f605ba3179dec3b3777978a Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Mon, 8 Nov 2021 15:08:12 +0800 +Subject: [PATCH 28/72] infiniband: fix the infiniband fd conflict + +Phenomenon: + Operating uverbs device will generate anonymous fd named +`anon_inode:[infinibandevent]`. When `anon_inode:[infinibandevent]` fd +is the last opened fd, and some kind of unix socket fd exist, which is +generated by syscalls like `socketpair()` at the same time, +`anon_inode:[infinibandevent]` will restore fail probabilistically. + +log as the following: + +``` +(00.254523) 63959: open file flags:1 +(00.254526) 63959: unix: Opening standalone (stage 0 id 0x1ff ino 1019605 peer 0) +(00.254571) 63959: *******flags: 0 +(00.254575) 63959: Create fd for 1408 # the fake fd +(00.254578) 63959: *******flags: 1 +(00.254580) 63959: Create fd for 445 # the restoration fd +``` + +Reason: + During the restoration of unix socket, `socketpair()` will generate +two fds, one is used to the current restoration, another is called fake +fd which fd nr is owned by `find_unused_fd()`. When +`anon_inode:[infinibandevent]` fd is the last one, criu don't dump the +fd information for `anon_inode:[infinibandevent]` in original +implementation, and criu think the fd nr which should belong to +`anon_inode:[infinibandevent]` isn't used. Therefore, it causes the +`anon_inode:[infinibandevent]` restoration to fail. + +This patch fix the above problem. 
Core: dump +`anon_inode:[infinibandevent]` fd information, make the criu is aware +that that fd nr is used. + +Conflict:NA +Reference:https://gitee.com/src-openeuler/criu/pulls/21 +Signed-off-by: fu.lin +--- + criu/files-chr.c | 57 ++++++++++++++++++++++++++++++++++++ + criu/files.c | 10 +++---- + criu/include/files-chr.h | 8 +++++ + criu/include/image-desc.h | 1 + + criu/include/protobuf-desc.h | 1 + + images/chr.proto | 3 ++ + images/fdinfo.proto | 2 ++ + 7 files changed, 76 insertions(+), 6 deletions(-) + +diff --git a/criu/files-chr.c b/criu/files-chr.c +index 95d93e1..6d87c33 100644 +--- a/criu/files-chr.c ++++ b/criu/files-chr.c +@@ -215,3 +215,60 @@ static int handle_hisi_vma(struct list_head *fds, struct vma_area *vma) + + return 0; + } ++ ++static void pr_info_infiniband(char *action, InfinibandEntry *infiniband) ++{ ++ pr_info("%sinfiniband: id %#08x\n", action, infiniband->id); ++} ++ ++static int dump_one_infiniband(int lfd, u32 id, const struct fd_parms *p) ++{ ++ FileEntry fe = FILE_ENTRY__INIT; ++ InfinibandEntry infiniband = INFINIBAND_ENTRY__INIT; ++ ++ infiniband.id = id; ++ ++ fe.type = FD_TYPES__INFINIBAND; ++ fe.id = infiniband.id; ++ fe.infiniband = &infiniband; ++ ++ pr_info_infiniband("Dumping ", &infiniband); ++ ++ return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); ++} ++ ++const struct fdtype_ops infiniband_dump_ops = { ++ .type = FD_TYPES__INFINIBAND, ++ .dump = dump_one_infiniband, ++}; ++ ++static int infiniband_open(struct file_desc *d, int *new_fd) { ++ /* ++ * `*new_fd == -1` at this time, it means this open operation shouldn't ++ * be served out, which is why this function does nothing here. 
++ */ ++ return 0; ++}; ++ ++static struct file_desc_ops infiniband_desc_ops = { ++ .type = FD_TYPES__INFINIBAND, ++ .open = infiniband_open, ++}; ++ ++static int collect_one_infiniband(void *o, ProtobufCMessage *base, struct cr_img *i) ++{ ++ struct infiniband_file_info *info = o; ++ ++ info->infiniband = pb_msg(base, InfinibandEntry); ++ pr_info_infiniband("Collected ", info->infiniband); ++ ++ /* add the fd to `file_desc_hash` list to prevent from NULL pointer */ ++ return file_desc_add(&info->d, info->infiniband->id, &infiniband_desc_ops); ++} ++ ++struct collect_image_info infiniband_cinfo = { ++ .fd_type = CR_FD_INFINIBAND, ++ .pb_type = PB_INFINIBAND, ++ .priv_size = sizeof(struct infiniband_file_info), ++ .collect = collect_one_infiniband, ++}; +diff --git a/criu/files.c b/criu/files.c +index 7b688f5..1ec5281 100644 +--- a/criu/files.c ++++ b/criu/files.c +@@ -639,7 +639,7 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, + else if (is_timerfd_link(link)) + ops = &timerfd_dump_ops; + else if (is_infiniband_link(link)) +- return 1; ++ ops = &infiniband_dump_ops; + #ifdef CONFIG_HAS_LIBBPF + else if (is_bpfmap_link(link)) + ops = &bpfmap_dump_ops; +@@ -744,11 +744,6 @@ int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, s + ret = dump_one_file(item->pid, dfds->fds[i + off], lfds[i], opts + i, ctl, &e, dfds); + if (ret) + break; +- /* infiniband link file */ +- if (ret > 0) { +- ret = 0; +- continue; +- } + + ret = pb_write_one(img, &e, PB_FDINFO); + if (ret) +@@ -1864,6 +1859,9 @@ static int collect_one_file(void *o, ProtobufCMessage *base, struct cr_img *i) + case FD_TYPES__CHR: + ret = collect_one_file_entry(fe, fe->chr->id, &fe->chr->base, &chrfile_cinfo); + break; ++ case FD_TYPES__INFINIBAND: ++ ret = collect_one_file_entry(fe, fe->infiniband->id, &fe->infiniband->base, &infiniband_cinfo); ++ break; + } + + return ret; +diff --git a/criu/include/files-chr.h b/criu/include/files-chr.h +index 
26b8fb2..261c4b2 100644 +--- a/criu/include/files-chr.h ++++ b/criu/include/files-chr.h +@@ -38,4 +38,12 @@ enum hisi_sec_dev { + HISI_SEC_DUS = 0x2000, + }; + ++struct infiniband_file_info { ++ InfinibandEntry *infiniband; ++ struct file_desc d; ++}; ++ ++extern const struct fdtype_ops infiniband_dump_ops; ++extern struct collect_image_info infiniband_cinfo; ++ + #endif /* __CRIU_FILES_CHR_H__ */ +diff --git a/criu/include/image-desc.h b/criu/include/image-desc.h +index e35f8b2..9ad5fa0 100644 +--- a/criu/include/image-desc.h ++++ b/criu/include/image-desc.h +@@ -116,6 +116,7 @@ enum { + + CR_FD_AUTOFS, + CR_FD_CHRFILE, ++ CR_FD_INFINIBAND, + + CR_FD_MAX + }; +diff --git a/criu/include/protobuf-desc.h b/criu/include/protobuf-desc.h +index 2468e8f..72a9e1d 100644 +--- a/criu/include/protobuf-desc.h ++++ b/criu/include/protobuf-desc.h +@@ -71,6 +71,7 @@ enum { + PB_BPFMAP_DATA, + PB_APPARMOR, + PB_CHRFILE, ++ PB_INFINIBAND, + + /* PB_AUTOGEN_STOP */ + +diff --git a/images/chr.proto b/images/chr.proto +index 67929db..ed65005 100644 +--- a/images/chr.proto ++++ b/images/chr.proto +@@ -10,3 +10,6 @@ message chrfile_entry { + required bool repair = 5; + }; + ++message infiniband_entry { ++ required uint32 id = 1; ++}; +diff --git a/images/fdinfo.proto b/images/fdinfo.proto +index 6549472..eb52f35 100644 +--- a/images/fdinfo.proto ++++ b/images/fdinfo.proto +@@ -44,6 +44,7 @@ enum fd_types { + MEMFD = 18; + BPFMAP = 19; + CHR = 21; ++ INFINIBAND = 22; + + /* Any number above the real used. 
Not stored to image */ + CTL_TTY = 65534; +@@ -81,4 +82,5 @@ message file_entry { + optional memfd_file_entry memfd = 20; + optional bpfmap_file_entry bpf = 21; + optional chrfile_entry chr = 23; ++ optional infiniband_entry infiniband = 25; + } +-- +2.34.1 + diff --git a/0029-cred-provide-cred-checkpoint-restore-method.patch b/0029-cred-provide-cred-checkpoint-restore-method.patch new file mode 100644 index 0000000..2ede5b5 --- /dev/null +++ b/0029-cred-provide-cred-checkpoint-restore-method.patch @@ -0,0 +1,255 @@ +From e522deb5680840e878b8f05c66f040cfd3b49d90 Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 19 May 2021 21:47:28 +0800 +Subject: [PATCH 29/72] cred: provide cred checkpoint restore method + +criu checkpoint/restore the task, it only restore the context instead of +the memory address storing the context. + +To handle the problem resulted by CVE bugfix, details: +- https://nvd.nist.gov/vuln/detail/CVE-2016-4565 +- https://openfabrics.org/images/2018workshop/presentations/113_MRuhl_JourneytoVerbsIOCTL.pdf + +Brief: + Refresh the security context address of file. The infiniband code use +`write()` as bi-directional `ioctl()`, there is `struct cred` address +during `write()` process. However, criu uses some syscall, such as +`capset()` and `setgroups()`, to regenerate the new cred, the file +cred is fixed by `fcntl(F_SETOWN)`, then the address of new cred is +different from the file. + This patch fix the `struct cred` address checking problem resulted by +CVE fixed in infiniband drivers. 
+ +Conflict:NA +Reference:https://gitee.com/src-openeuler/criu/pulls/21 +Signed-off-by: luolongjun +Signed-off-by: fu.lin +--- + criu/config.c | 1 + + criu/cr-restore.c | 35 +++++++++++++++++++++++++++++++++++ + criu/crtools.c | 1 + + criu/include/cr_options.h | 1 + + criu/include/fcntl.h | 4 ++++ + criu/include/prctl.h | 4 ++++ + criu/include/restorer.h | 3 +++ + criu/pie/restorer.c | 38 ++++++++++++++++++++++++++++++++++++++ + 8 files changed, 87 insertions(+) + +diff --git a/criu/config.c b/criu/config.c +index 03cad66..cf99fb1 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -702,6 +702,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, + BOOL_OPT("use-fork-pid", &opts.use_fork_pid), + BOOL_OPT("with-notifier", &opts.with_notifier_kup), + BOOL_OPT("dump-char-dev", &opts.dump_char_dev), ++ BOOL_OPT("with-fd-cred", &opts.with_fd_cred), + {}, + }; + +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 2904a75..ac677a1 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -692,6 +692,28 @@ static int __collect_child_pids(struct pstree_item *p, int state, unsigned int * + return 0; + } + ++static int collect_child_fds(int state, unsigned int *n, struct pstree_item *me) ++{ ++ struct list_head *list = &rsti(me)->fds; ++ struct fdinfo_list_entry *fle, *tmp; ++ ++ *n = 0; ++ list_for_each_entry_safe(fle, tmp, list, ps_list) { ++ if (fle->fe->type == state) { ++ int *child; ++ ++ child = rst_mem_alloc(sizeof(*child), RM_PRIVATE); ++ if (!child) ++ return -1; ++ ++ (*n)++; ++ *child = fle->fe->fd; ++ } ++ } ++ ++ return 0; ++} ++ + static int collect_child_pids(int state, unsigned int *n) + { + struct pstree_item *pi; +@@ -715,6 +737,12 @@ static int collect_child_pids(int state, unsigned int *n) + return __collect_child_pids(current, state, n); + } + ++static int collect_chr_fds(struct pstree_item *me, struct task_restore_args *ta) ++{ ++ ta->setcred_pids = (int *)rst_mem_align_cpos(RM_PRIVATE); ++ return 
collect_child_fds(FD_TYPES__CHR, &ta->setcred_pids_n, me); ++} ++ + static int collect_helper_pids(struct task_restore_args *ta) + { + ta->helpers = (pid_t *)rst_mem_align_cpos(RM_PRIVATE); +@@ -939,6 +967,9 @@ static int restore_one_alive_task(int pid, CoreEntry *core) + if (collect_zombie_pids(ta) < 0) + return -1; + ++ if (opts.with_fd_cred && collect_chr_fds(current, ta) < 0) ++ return -1; ++ + if (collect_inotify_fds(ta) < 0) + return -1; + +@@ -3746,6 +3777,10 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns + RST_MEM_FIXUP_PPTR(task_args->helpers); + RST_MEM_FIXUP_PPTR(task_args->zombies); + RST_MEM_FIXUP_PPTR(task_args->vma_ios); ++ if (opts.with_fd_cred) ++ RST_MEM_FIXUP_PPTR(task_args->setcred_pids); ++ else ++ task_args->setcred_pids_n = UINT_MAX; + RST_MEM_FIXUP_PPTR(task_args->inotify_fds); + + task_args->compatible_mode = core_is_compat(core); +diff --git a/criu/crtools.c b/criu/crtools.c +index dc6d603..ed7bd99 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -453,6 +453,7 @@ usage: + " This feature needs the kernel assistance.\n" + " --dump-char-dev Dump char dev files as normal file with repair cmd\n" + \ ++ " --with-fd-cred Allow to make the restored process has the same cred\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 226acb2..1d6ddcf 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -194,6 +194,7 @@ struct cr_options { + int use_fork_pid; + int with_notifier_kup; + int dump_char_dev; ++ int with_fd_cred; + }; + + extern struct cr_options opts; +diff --git a/criu/include/fcntl.h b/criu/include/fcntl.h +index 35f8805..568977c 100644 +--- a/criu/include/fcntl.h ++++ b/criu/include/fcntl.h +@@ -19,6 +19,10 @@ struct f_owner_ex { + #define F_GETOWNER_UIDS 17 + #endif + ++#ifndef F_SETCRED ++#define F_SETCRED 18 ++#endif ++ + /* + * These 
things are required to compile on CentOS-6 + */ +diff --git a/criu/include/prctl.h b/criu/include/prctl.h +index c843f40..81dda9d 100644 +--- a/criu/include/prctl.h ++++ b/criu/include/prctl.h +@@ -82,4 +82,8 @@ struct prctl_mm_map { + #define PR_GET_THP_DISABLE 42 + #endif + ++#ifndef PR_DEFAULT_CRED ++#define PR_DEFAULT_CRED 54 ++#endif ++ + #endif /* __CR_PRCTL_H__ */ +diff --git a/criu/include/restorer.h b/criu/include/restorer.h +index a81cc1b..60c1dab 100644 +--- a/criu/include/restorer.h ++++ b/criu/include/restorer.h +@@ -193,6 +193,9 @@ struct task_restore_args { + pid_t *zombies; + unsigned int zombies_n; + ++ int *setcred_pids; ++ unsigned int setcred_pids_n; ++ + int *inotify_fds; /* fds to cleanup inotify events at CR_STATE_RESTORE_SIGCHLD stage */ + unsigned int inotify_fds_n; + +diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c +index dcc922e..fde6e30 100644 +--- a/criu/pie/restorer.c ++++ b/criu/pie/restorer.c +@@ -101,6 +101,7 @@ static int restore_anon_mapping(VmaEntry *vma_entry, struct vma_names *vma_name) + static struct task_entries *task_entries_local; + static futex_t thread_inprogress; + static futex_t thread_start; ++static futex_t cred_set; + static pid_t *helpers; + static int n_helpers; + static pid_t *zombies; +@@ -365,6 +366,41 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ + return 0; + } + ++static int update_cred_ref(struct task_restore_args *ta) ++{ ++ int i; ++ int ret; ++ int pid = sys_getpid(); ++ long int tid = sys_gettid(); ++ ++ if (ta->setcred_pids_n == UINT_MAX) { ++ pr_info("no need to keep the same cred \n"); ++ return 0; ++ } ++ ++ if (pid == tid) { ++ /* let main thread finish cred update first */ ++ ret = sys_prctl(PR_DEFAULT_CRED, 0, 0, 0, 0); ++ pr_info("main cred restore \n"); ++ futex_set_and_wake(&cred_set, 1); ++ } else { ++ futex_wait_until(&cred_set, 1); ++ pr_info("other cred restore \n"); ++ ret = sys_prctl(PR_DEFAULT_CRED, 0, 0, 0, 0); ++ } ++ ++ if (ret) ++ return 
ret; ++ ++ pr_info("%ld (%d) is going to update current cred \n", tid, pid); ++ ++ for (i = 0; i < ta->setcred_pids_n; i++) { ++ sys_fcntl(ta->setcred_pids[i], F_SETCRED, 0); ++ } ++ ++ return 0; ++} ++ + /* + * This should be done after creds restore, as + * some creds changes might drop the value back +@@ -742,6 +778,7 @@ long __export_restore_thread(struct thread_restore_args *args) + BUG(); + + ret = restore_creds(args->creds_args, args->ta->proc_fd, args->ta->lsm_type); ++ ret = ret || update_cred_ref(args->ta); + ret = ret || restore_dumpable_flag(&args->ta->mm); + ret = ret || restore_pdeath_sig(args); + if (ret) +@@ -2221,6 +2258,7 @@ long __export_restore_task(struct task_restore_args *args) + * thus restore* creds _after_ all of the above. + */ + ret = restore_creds(args->t->creds_args, args->proc_fd, args->lsm_type); ++ ret = ret || update_cred_ref(args); + ret = ret || restore_dumpable_flag(&args->mm); + ret = ret || restore_pdeath_sig(args->t); + ret = ret || restore_child_subreaper(args->child_subreaper); +-- +2.34.1 + diff --git a/0030-socket-fix-connect-error-of-invalid-param.patch b/0030-socket-fix-connect-error-of-invalid-param.patch new file mode 100644 index 0000000..32a43d8 --- /dev/null +++ b/0030-socket-fix-connect-error-of-invalid-param.patch @@ -0,0 +1,93 @@ +From 8afde209d2a9245d902eabe40ca7c514aeb6ee9a Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 19 May 2021 21:56:16 +0800 +Subject: [PATCH 30/72] socket: fix connect error of invalid param + +Fix connect error of invalid param during module upgrade. 
+ +Conflict:NA +Reference:https://gitee.com/src-openeuler/criu/pulls/21 +Signed-off-by: Xiaoguang Li +Signed-off-by: fu.lin +--- + criu/include/sockets.h | 1 + + criu/sk-inet.c | 13 +++++++++++-- + criu/sockets.c | 5 ++++- + 3 files changed, 16 insertions(+), 3 deletions(-) + +diff --git a/criu/include/sockets.h b/criu/include/sockets.h +index 3e8f3d6..2391b48 100644 +--- a/criu/include/sockets.h ++++ b/criu/include/sockets.h +@@ -27,6 +27,7 @@ struct socket_desc { + extern int dump_socket(struct fd_parms *p, int lfd, FdinfoEntry *); + extern int dump_socket_opts(int sk, SkOptsEntry *soe); + extern int restore_socket_opts(int sk, SkOptsEntry *soe); ++extern int restore_bound_opts(int sk, SkOptsEntry *soe); + extern void release_skopts(SkOptsEntry *); + extern int restore_prepare_socket(int sk); + extern void preload_socket_modules(void); +diff --git a/criu/sk-inet.c b/criu/sk-inet.c +index e52b198..05048c8 100644 +--- a/criu/sk-inet.c ++++ b/criu/sk-inet.c +@@ -100,15 +100,20 @@ static void show_one_inet(const char *act, const struct inet_sk_desc *sk) + static void show_one_inet_img(const char *act, const InetSkEntry *e) + { + char src_addr[INET_ADDR_LEN] = ""; ++ char dst_addr[INET_ADDR_LEN] = ""; + + if (inet_ntop(e->family, (void *)e->src_addr, src_addr, INET_ADDR_LEN) == NULL) { + pr_perror("Failed to translate address"); + } ++ if (inet_ntop(e->family, (void *)e->dst_addr, dst_addr, ++ INET_ADDR_LEN) == NULL) { ++ pr_perror("Failed to translate address"); ++ } + + pr_debug("\t%s: family %-10s type %-14s proto %-16s port %d " +- "state %-16s src_addr %s\n", ++ "state %-16s src_addr %s dst_addr %s\n", + act, ___socket_family_name(e->family), ___socket_type_name(e->type), ___socket_proto_name(e->proto), +- e->src_port, ___tcp_state_name(e->state), src_addr); ++ e->src_port, ___tcp_state_name(e->state), src_addr, dst_addr); + } + + static int can_dump_ipproto(unsigned int ino, int proto, int type) +@@ -852,6 +857,10 @@ static int open_inet_sk(struct file_desc *d, 
int *new_fd) + if (restore_opt(sk, SOL_SOCKET, SO_REUSEPORT, &yes)) + goto err; + ++ if(restore_bound_opts(sk, ie->opts) < 0){ ++ goto err; ++ } ++ + if (tcp_connection(ie)) { + if (!opts.tcp_established_ok && !opts.tcp_close) { + pr_err("Connected TCP socket in image\n"); +diff --git a/criu/sockets.c b/criu/sockets.c +index 9426b5b..2ddf85e 100644 +--- a/criu/sockets.c ++++ b/criu/sockets.c +@@ -586,7 +586,6 @@ int restore_socket_opts(int sk, SkOptsEntry *soe) + tv.tv_usec = soe->so_rcv_tmo_usec; + ret |= restore_opt(sk, SOL_SOCKET, SO_RCVTIMEO, &tv); + +- ret |= restore_bound_dev(sk, soe); + ret |= restore_socket_filter(sk, soe); + + /* The restore of SO_REUSEADDR depends on type of socket */ +@@ -594,6 +593,10 @@ int restore_socket_opts(int sk, SkOptsEntry *soe) + return ret; + } + ++int restore_bound_opts(int sk, SkOptsEntry *soe){ ++ return restore_bound_dev(sk, soe); ++} ++ + int do_dump_opt(int sk, int level, int name, void *val, int len) + { + socklen_t aux = len; +-- +2.34.1 + diff --git a/0031-criu-eventpollfd-fix-for-improper-usage-in-appdata.patch b/0031-criu-eventpollfd-fix-for-improper-usage-in-appdata.patch new file mode 100644 index 0000000..c3e2a6a --- /dev/null +++ b/0031-criu-eventpollfd-fix-for-improper-usage-in-appdata.patch @@ -0,0 +1,99 @@ +From 89eb9deee6da8acc7747e103ee591f299fec2043 Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 19 May 2021 21:56:38 +0800 +Subject: [PATCH 31/72] criu: eventpollfd fix for improper usage in appdata + +Fix eventpollfd problem of improper usage in appdata. 
+ +Conflict:NA +Reference:https://gitee.com/src-openeuler/criu/pulls/21 +Signed-off-by: Jingxian He +Signed-off-by: fu.lin +--- + criu/eventpoll.c | 16 +++++++++++----- + criu/proc_parse.c | 2 ++ + images/eventpoll.proto | 3 +++ + 3 files changed, 16 insertions(+), 5 deletions(-) + +diff --git a/criu/eventpoll.c b/criu/eventpoll.c +index 978dca5..8900d50 100644 +--- a/criu/eventpoll.c ++++ b/criu/eventpoll.c +@@ -67,8 +67,8 @@ int is_eventpoll_link(char *link) + + static void pr_info_eventpoll_tfd(char *action, uint32_t id, EventpollTfdEntry *e) + { +- pr_info("%seventpoll-tfd: id %#08x tfd %8d events %#08x data %#016" PRIx64 "\n", action, id, e->tfd, e->events, +- e->data); ++ pr_info("%seventpoll-tfd: id %#08x tfd %8d events %#08x data %#016" PRIx64 " ignore %d\n", ++ action, id, e->tfd, e->events, e->data, e->ignore); + } + + static void pr_info_eventpoll(char *action, EventpollFileEntry *e) +@@ -144,9 +144,9 @@ int flush_eventpoll_dinfo_queue(void) + }; + struct kid_elem *t = kid_lookup_epoll_tfd(&fd_tree, &ke, &slot); + if (!t) { +- pr_debug("kid_lookup_epoll: no match pid %d efd %d tfd %d toff %u\n", dinfo->pid, +- dinfo->efd, tfde->tfd, dinfo->toff[i].off); +- goto err; ++ pr_info("Drop tfd entry, pid %d efd %d tfd %d toff %u\n", ++ dinfo->pid, dinfo->efd, tfde->tfd, dinfo->toff[i].off); ++ continue; + } + + pr_debug("kid_lookup_epoll: rbsearch match pid %d efd %d tfd %d toff %u -> %d\n", dinfo->pid, +@@ -159,6 +159,7 @@ int flush_eventpoll_dinfo_queue(void) + goto err; + } + ++ pr_info("Change tfd: %d -> %d @ efd=%d\n", tfde->tfd, t->idx, slot.efd); + tfde->tfd = t->idx; + } + +@@ -409,6 +410,11 @@ static int eventpoll_retore_tfd(int fd, int id, EventpollTfdEntry *tdefe) + { + struct epoll_event event; + ++ if (tdefe->ignore) { ++ pr_info_eventpoll_tfd("Ignore ", id, tdefe); ++ return 0; ++ } ++ + pr_info_eventpoll_tfd("Restore ", id, tdefe); + + event.events = tdefe->events; +diff --git a/criu/proc_parse.c b/criu/proc_parse.c +index daa54d9..d13589c 100644 
+--- a/criu/proc_parse.c ++++ b/criu/proc_parse.c +@@ -1895,10 +1895,12 @@ static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg) + e->has_dev = false; + e->has_inode = false; + e->has_pos = false; ++ e->has_ignore = false; + } else if (ret == 6) { + e->has_dev = true; + e->has_inode = true; + e->has_pos = true; ++ e->has_ignore = true; + } else if (ret < 6) { + eventpoll_tfd_entry__free_unpacked(e, NULL); + goto parse_err; +diff --git a/images/eventpoll.proto b/images/eventpoll.proto +index 0f3e8a8..2fd9598 100644 +--- a/images/eventpoll.proto ++++ b/images/eventpoll.proto +@@ -14,6 +14,9 @@ message eventpoll_tfd_entry { + optional uint32 dev = 5; + optional uint64 inode = 6; + optional uint64 pos = 7; ++ ++ /* entry validation */ ++ optional uint32 ignore = 8; + } + + message eventpoll_file_entry { +-- +2.34.1 + diff --git a/0032-task_exit_notify-add-task-exit-notify-mask-method-fo.patch b/0032-task_exit_notify-add-task-exit-notify-mask-method-fo.patch new file mode 100644 index 0000000..f1d2396 --- /dev/null +++ b/0032-task_exit_notify-add-task-exit-notify-mask-method-fo.patch @@ -0,0 +1,193 @@ +From 58a8c9eb07c2cff6232c20f9a59edc634bb1e5e0 Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 19 May 2021 21:59:24 +0800 +Subject: [PATCH 32/72] task_exit_notify: add task exit notify mask method for + criu + +Add task exit notify mask method for criu during kernel module upgrade. 
+ +Signed-off-by: Jingxian He +Signed-off-by: fu.lin +--- + criu/Makefile.crtools | 1 + + criu/config.c | 1 + + criu/cr-restore.c | 10 ++++++++++ + criu/crtools.c | 1 + + criu/exit-notify.c | 34 ++++++++++++++++++++++++++++++++++ + criu/include/cr_options.h | 1 + + criu/include/exit-notify.h | 10 ++++++++++ + criu/seize.c | 10 +++++++++- + 8 files changed, 67 insertions(+), 1 deletion(-) + create mode 100644 criu/exit-notify.c + create mode 100644 criu/include/exit-notify.h + +diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools +index 2e82912..65cc215 100644 +--- a/criu/Makefile.crtools ++++ b/criu/Makefile.crtools +@@ -93,6 +93,7 @@ obj-y += timens.o + obj-y += pin-mem.o + obj-y += devname.o + obj-y += files-chr.o ++obj-y += exit-notify.o + obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o + obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o + CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 +diff --git a/criu/config.c b/criu/config.c +index cf99fb1..bd0f84d 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -703,6 +703,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, + BOOL_OPT("with-notifier", &opts.with_notifier_kup), + BOOL_OPT("dump-char-dev", &opts.dump_char_dev), + BOOL_OPT("with-fd-cred", &opts.with_fd_cred), ++ BOOL_OPT("mask-exit-notify", &opts.mask_exit_notify), + {}, + }; + +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index ac677a1..09f135b 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -82,6 +82,7 @@ + #include "apparmor.h" + #include "pin-mem.h" + #include "notifier.h" ++#include "exit-notify.h" + + #include "parasite-syscall.h" + #include "files-reg.h" +@@ -1542,6 +1543,15 @@ static inline int fork_with_pid(struct pstree_item *item) + pr_debug("PID: real %d virt %d\n", item->pid->real, vpid(item)); + } + ++ if (opts.mask_exit_notify) { ++ int pid = ret; ++ ++ pr_info("Start unmask exit notifier for pid %d\n", pid); ++ ret = mask_task_exit_notify(pid, false); ++ if (ret) ++ pr_err("Can't unmask exit 
notifier for pid %d\n", pid); ++ } ++ + err_unlock: + if (!(ca.clone_flags & CLONE_NEWPID)) + unlock_last_pid(); +diff --git a/criu/crtools.c b/criu/crtools.c +index ed7bd99..1a41be4 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -454,6 +454,7 @@ usage: + " --dump-char-dev Dump char dev files as normal file with repair cmd\n" + \ + " --with-fd-cred Allow to make the restored process has the same cred\n" ++ " --mask-exit-notify Mask task exit notify during dump and restore\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/exit-notify.c b/criu/exit-notify.c +new file mode 100644 +index 0000000..5c86724 +--- /dev/null ++++ b/criu/exit-notify.c +@@ -0,0 +1,34 @@ ++#include ++#include ++#include ++#include ++ ++#include "exit-notify.h" ++#include "log.h" ++ ++int mask_task_exit_notify(int pid, bool mask) ++{ ++ int fd, retval; ++ char buf[PID_BUF_SIZE] = {0}; ++ ++ if (pid <= 0) ++ return -1; ++ ++ snprintf(buf, PID_BUF_SIZE - 1, "%d", pid); ++ if (mask) ++ fd = open(MASK_EXIT_NOTIFY_DIR, O_WRONLY, 0); ++ else ++ fd = open(UNMASK_EXIT_NOTIFY_DIR, O_WRONLY, 0); ++ ++ if (fd < 0) { ++ pr_err("open mask exit notify file fail\n"); ++ return fd; ++ } ++ ++ retval = write(fd, buf, PID_BUF_SIZE); ++ if (retval < 0) ++ pr_err("Write mask exit pid: %s fail\n", buf); ++ close(fd); ++ ++ return retval < 0 ? 
-1 : 0; ++} +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 1d6ddcf..26ae5b6 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -195,6 +195,7 @@ struct cr_options { + int with_notifier_kup; + int dump_char_dev; + int with_fd_cred; ++ int mask_exit_notify; + }; + + extern struct cr_options opts; +diff --git a/criu/include/exit-notify.h b/criu/include/exit-notify.h +new file mode 100644 +index 0000000..34f2c8d +--- /dev/null ++++ b/criu/include/exit-notify.h +@@ -0,0 +1,10 @@ ++#ifndef __CRIU_EXIT_NOTIFY_H__ ++#define __CRIU_EXIT_NOTIFY_H__ ++ ++#define PID_BUF_SIZE 32 ++#define MASK_EXIT_NOTIFY_DIR "/sys/kernel/mask_exit_notify" ++#define UNMASK_EXIT_NOTIFY_DIR "/sys/kernel/unmask_exit_notify" ++ ++int mask_task_exit_notify(int pid, bool mask); ++ ++#endif /* __CRIU_EXIT_NOTIFY_H__ */ +diff --git a/criu/seize.c b/criu/seize.c +index 8a35c3c..1e127ff 100644 +--- a/criu/seize.c ++++ b/criu/seize.c +@@ -24,6 +24,8 @@ + #include "xmalloc.h" + #include "util.h" + #include "pin-mem.h" ++#include "mem.h" ++#include "exit-notify.h" + + #define NR_ATTEMPTS 5 + +@@ -636,7 +638,7 @@ free: + + static void unseize_task_and_threads(const struct pstree_item *item, int st) + { +- int i; ++ int i, ret; + + if (item->pid->state == TASK_DEAD) + return; +@@ -646,6 +648,12 @@ static void unseize_task_and_threads(const struct pstree_item *item, int st) + dump_task_special_pages(item->threads[i].real); + } + ++ if (opts.mask_exit_notify && (st == TASK_DEAD)) { ++ ret = mask_task_exit_notify(item->threads[0].real, true); ++ if (ret) ++ pr_err("mask exit notify for %d fail.\n", item->threads[0].real); ++ } ++ + /* + * The st is the state we want to switch tasks into, + * the item->state is the state task was in when we seized one. 
+-- +2.34.1 + diff --git a/0033-unix-socket-add-support-for-unix-stream-socket.patch b/0033-unix-socket-add-support-for-unix-stream-socket.patch new file mode 100644 index 0000000..2d61915 --- /dev/null +++ b/0033-unix-socket-add-support-for-unix-stream-socket.patch @@ -0,0 +1,403 @@ +From fe39f73462e84a1a59d9b2b81a97e26cd1f2d20c Mon Sep 17 00:00:00 2001 +From: Luo Longjun +Date: Mon, 7 Jun 2021 11:50:42 +0800 +Subject: [PATCH 33/72] unix socket: add support for unix stream socket + +When dumping a unix stream socket with external connections, +we will tell kernel to turn repair mode on for this sock. +And then kernel will keep this sock before restoring it. +In this process, the other socket which communicates with +this sock in repair mode will get EAGAIN or be blocked. + +Signed-off-by: Luo Longjun + +fix unix socket dump and restore err +Fix name-less unix socket dump and restore problem. + +Signed-off-by: Jingxian He + +unix socket: ignore repair error from kernel +leave error for applications to deal with. 
+ +Signed-off-by: Luo Longjun + +- enable this feature by check cmdline `unix_stream_restore_enable` +- don't set repair mode for non-external socket + +Signed-off-by: fu.lin +--- + criu/cr-dump.c | 1 + + criu/include/kerndat.h | 1 + + criu/include/sockets.h | 1 + + criu/kerndat.c | 33 +++++++++ + criu/sk-unix.c | 150 ++++++++++++++++++++++++++++++++++++++--- + images/sk-unix.proto | 1 + + 6 files changed, 178 insertions(+), 9 deletions(-) + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index fd17413..e0e11cc 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -2002,6 +2002,7 @@ static int cr_dump_finish(int ret) + + cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret); + cgp_fini(); ++ unix_stream_unlock(ret); + + if (!ret) { + /* +diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h +index 05abeda..3979939 100644 +--- a/criu/include/kerndat.h ++++ b/criu/include/kerndat.h +@@ -76,6 +76,7 @@ struct kerndat_s { + bool has_nftables_concat; + bool has_rseq; + bool has_ptrace_get_rseq_conf; ++ bool has_unix_sk_repair; + }; + + extern struct kerndat_s kdat; +diff --git a/criu/include/sockets.h b/criu/include/sockets.h +index 2391b48..e43a760 100644 +--- a/criu/include/sockets.h ++++ b/criu/include/sockets.h +@@ -43,6 +43,7 @@ extern int add_fake_unix_queuers(void); + extern int fix_external_unix_sockets(void); + extern int prepare_scms(void); + extern int unix_note_scm_rights(int id_for, uint32_t *file_ids, int *fds, int n_ids); ++extern void unix_stream_unlock(int ret); + + extern struct collect_image_info netlink_sk_cinfo; + +diff --git a/criu/kerndat.c b/criu/kerndat.c +index af7113a..6d6aac1 100644 +--- a/criu/kerndat.c ++++ b/criu/kerndat.c +@@ -1259,6 +1259,36 @@ static int kerndat_has_nftables_concat(void) + #endif + } + ++#define UNIX_STREAM_RESTORE_ENABLE_FILE "/sys/module/kernel/parameters/unix_stream_restore_enable" ++ ++static void kerndat_has_unix_sk_repair(void) ++{ ++ FILE *fp; ++ char ch = 'N'; ++ ++ if (access(UNIX_STREAM_RESTORE_ENABLE_FILE, F_OK) < 
0) { ++ pr_debug("C/R external unix stream socket is not support\n"); ++ return; ++ } ++ ++ fp = fopen(UNIX_STREAM_RESTORE_ENABLE_FILE, "r"); ++ if (fp == NULL) { ++ pr_err("failed to open '%s': %s\n", ++ UNIX_STREAM_RESTORE_ENABLE_FILE, strerror(errno)); ++ return; ++ } ++ ++ fscanf(fp, "%c", &ch); ++ if (ch == 'Y') { ++ pr_debug("enable C/R external unix stream socket support\n"); ++ kdat.has_unix_sk_repair = true; ++ } ++ ++ fclose(fp); ++ ++ return; ++} ++ + int kerndat_init(void) + { + int ret; +@@ -1419,6 +1449,9 @@ int kerndat_init(void) + pr_err("kerndat_has_ptrace_get_rseq_conf failed when initializing kerndat.\n"); + ret = -1; + } ++ ++ kerndat_has_unix_sk_repair(); ++ + kerndat_lsm(); + kerndat_mmap_min_addr(); + kerndat_files_stat(); +diff --git a/criu/sk-unix.c b/criu/sk-unix.c +index f3fe60c..86bfa18 100644 +--- a/criu/sk-unix.c ++++ b/criu/sk-unix.c +@@ -72,6 +72,7 @@ struct unix_sk_desc { + char *name; + unsigned int nr_icons; + unsigned int *icons; ++ int repair_ino; + + unsigned int vfs_dev; + unsigned int vfs_ino; +@@ -89,9 +90,18 @@ struct unix_sk_desc { + struct list_head peer_list; + struct list_head peer_node; + ++ struct list_head repair_list; ++ struct list_head repair_node; ++ struct unix_stream_extern_socket_desc *ext_node; ++ + UnixSkEntry *ue; + }; + ++struct unix_stream_extern_socket_desc { ++ struct list_head list; ++ int fd; ++}; ++ + /* + * The mutex_ghost is accessed from different tasks, + * so make sure it is in shared memory. 
+@@ -100,6 +110,7 @@ static mutex_t *mutex_ghost; + + static LIST_HEAD(unix_sockets); + static LIST_HEAD(unix_ghost_addr); ++static LIST_HEAD(unix_stream_external_sockets); + + static int unix_resolve_name(int lfd, uint32_t id, struct unix_sk_desc *d, UnixSkEntry *ue, const struct fd_parms *p); + +@@ -116,6 +127,26 @@ struct unix_sk_listen_icon { + + static struct unix_sk_listen_icon *unix_listen_icons[SK_HASH_SIZE]; + ++static int unix_stream_repair_on(int fd) ++{ ++ int ret, aux = 1; ++ ret = setsockopt(fd, SOL_TCP, TCP_REPAIR_OPTIONS, &aux, sizeof(aux)); ++ if (ret < 0) ++ pr_err("Can't turn repair mod for unix stream on. \n"); ++ ++ return ret; ++} ++ ++static int unix_stream_repair_off(int fd) ++{ ++ int ret, aux = 0; ++ ret = setsockopt(fd, SOL_TCP, TCP_REPAIR_OPTIONS, &aux, sizeof(aux)); ++ if (ret < 0) ++ pr_err("Can't turn repair mod for unix stream off. \n"); ++ ++ return ret; ++} ++ + static struct unix_sk_listen_icon *lookup_unix_listen_icons(unsigned int peer_ino) + { + struct unix_sk_listen_icon *ic; +@@ -331,6 +362,8 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) + FilePermsEntry *perms; + FownEntry *fown; + void *m; ++ unsigned int len; ++ int ret; + + m = xmalloc(sizeof(UnixSkEntry) + sizeof(SkOptsEntry) + sizeof(FilePermsEntry) + sizeof(FownEntry)); + if (!m) +@@ -372,6 +405,7 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) + ue->fown = fown; + ue->opts = skopts; + ue->uflags = 0; ++ ue->repair_ino = 0; + + if (unix_resolve_name(lfd, id, sk, ue, p)) + goto err; +@@ -419,6 +453,41 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) + goto err; + } + ++ /* ++ * Don't handle non-external unix socket, criu will restore it. ++ * ++ * use `sk->name != NULL || peer->name != NULL` to prevent ++ * `socketpair()` sk condition. 
++ */ ++ if (kdat.has_unix_sk_repair && !sk->sd.already_dumped ++ && (sk->name != NULL || peer->name != NULL) ++ && ue->type == SOCK_STREAM) { ++ struct unix_stream_extern_socket_desc *d; ++ ++ d = xzalloc(sizeof(*d)); ++ if (!d) ++ goto err; ++ ++ /* Attention: used for upgrade in the same machine ++ * May in conflict with original usage ++ */ ++ pr_info("set %d(fd %d) unix stream repair on \n", sk->sd.ino, lfd); ++ ret = unix_stream_repair_on(lfd); ++ if (ret < 0) ++ goto err; ++ ++ d->fd = dup(lfd); ++ pr_info("add %d into unix_stream_external_sockets\n", sk->sd.ino); ++ list_add_tail(&d->list, &unix_stream_external_sockets); ++ list_add(&sk->repair_node, &peer->repair_list); ++ sk->ext_node = d; ++ ++ len = sizeof(ue->repair_ino); ++ ret = getsockopt(lfd, SOL_TCP, TCP_REPAIR_OPTIONS, &ue->repair_ino, &len); ++ if (ret < 0) ++ goto err; ++ } ++ + /* + * Peer should have us as peer or have a name by which + * we can access one. +@@ -520,6 +589,26 @@ dump: + + sk->sd.already_dumped = 1; + ++ while (!list_empty(&sk->repair_list)) { ++ struct unix_sk_desc *psk; ++ struct unix_stream_extern_socket_desc *d; ++ ++ psk = list_first_entry(&sk->repair_list, struct unix_sk_desc, repair_node); ++ list_del_init(&psk->repair_node); ++ ++ pr_info("delete ino %d into unix_stream_external_sockets\n", psk->sd.ino); ++ ++ d = psk->ext_node; ++ list_del_init(&d->list); ++ psk->ext_node = NULL; ++ /* ino start from 1, using 0 to tag the non-repairing socket is safe. 
*/ ++ psk->ue->repair_ino = 0; ++ ++ unix_stream_repair_off(d->fd); ++ close_safe(&d->fd); ++ xfree(d); ++ } ++ + while (!list_empty(&sk->peer_list)) { + struct unix_sk_desc *psk; + psk = list_first_entry(&sk->peer_list, struct unix_sk_desc, peer_node); +@@ -754,6 +843,8 @@ static int unix_collect_one(const struct unix_diag_msg *m, struct nlattr **tb, s + + INIT_LIST_HEAD(&d->peer_list); + INIT_LIST_HEAD(&d->peer_node); ++ INIT_LIST_HEAD(&d->repair_list); ++ INIT_LIST_HEAD(&d->repair_node); + d->fd = -1; + + if (tb[UNIX_DIAG_SHUTDOWN]) +@@ -866,16 +957,18 @@ static int __dump_external_socket(struct unix_sk_desc *sk, struct unix_sk_desc * + return -1; + } + +- if (peer->type != SOCK_DGRAM) { +- show_one_unix("Ext stream not supported", peer); +- pr_err("Can't dump half of stream unix connection.\n"); ++ if (peer->type != SOCK_DGRAM && ++ peer->type != SOCK_STREAM) { ++ show_one_unix("Ext unix type not supported", peer); ++ pr_err("Can't dump this kind of unix connection.\n"); + return -1; + } + +- if (!peer->name) { ++ /* part 1: prevent NULL pointer oops */ ++ if (!peer->name && !sk->name) { + show_one_unix("Ext dgram w/o name", peer); ++ show_one_unix("Ext dgram w/o name", sk); + pr_err("Can't dump name-less external socket.\n"); +- pr_err("%d\n", sk->fd); + return -1; + } + +@@ -921,7 +1014,7 @@ int fix_external_unix_sockets(void) + + fd_id_generate_special(NULL, &e.id); + e.ino = sk->sd.ino; +- e.type = SOCK_DGRAM; ++ e.type = sk->type; + e.state = TCP_LISTEN; + e.name.data = (void *)sk->name; + e.name.len = (size_t)sk->namelen; +@@ -948,6 +1041,20 @@ err: + return -1; + } + ++void unix_stream_unlock(int ret) ++{ ++ struct unix_stream_extern_socket_desc *d; ++ pr_debug("Unlocking unix stream sockets\n"); ++ ++ list_for_each_entry(d, &unix_stream_external_sockets, list) { ++ if (ret) { ++ pr_debug("unlock fd %d \n", d->fd); ++ unix_stream_repair_off(d->fd); ++ } ++ close_safe(&d->fd); ++ } ++} ++ + struct unix_sk_info { + UnixSkEntry *ue; + struct list_head list; 
+@@ -1335,6 +1442,7 @@ static int post_open_standalone(struct file_desc *d, int fd) + struct unix_sk_info *peer; + struct sockaddr_un addr; + int cwd_fd = -1, root_fd = -1, ns_fd = -1; ++ int ret, value; + + ui = container_of(d, struct unix_sk_info, d); + BUG_ON((ui->flags & (USK_PAIR_MASTER | USK_PAIR_SLAVE)) || (ui->ue->uflags & (USK_CALLBACK | USK_INHERIT))); +@@ -1391,7 +1499,28 @@ static int post_open_standalone(struct file_desc *d, int fd) + * while we're connecting in sake of ghost sockets. + */ + mutex_lock(mutex_ghost); +- if (connect(fd, (struct sockaddr *)&addr, sizeof(addr.sun_family) + len) < 0) { ++ ++ /* we handle unix stream with external connections here. ++ * ++ * use `sk->name != NULL || peer->name != NULL` to prevent ++ * `socketpair()` sk condition. ++ */ ++ if (kdat.has_unix_sk_repair && peer->name ++ && (ui->name != NULL || peer->name != NULL) ++ && ui->ue->type == SOCK_STREAM && ui->ue->repair_ino != 0) { ++ value = ui->ue->repair_ino; ++ ret = setsockopt(fd, SOL_TCP, TCP_REPAIR, &value, sizeof(value)); ++ if (ret < 0) { ++ /* permit the unix sk resume successfully when the peer has been ++ * closed, just warn here */ ++ pr_warn("Can't repair %d socket\n", value); ++ } ++ ++ ret = unix_stream_repair_off(fd); ++ if (ret < 0) { ++ goto err_revert_and_exit; ++ } ++ } else if (connect(fd, (struct sockaddr *)&addr, sizeof(addr.sun_family) + len) < 0) { + pr_perror("Can't connect %d socket", ui->ue->ino); + goto err_revert_and_exit; + } +@@ -2068,8 +2197,11 @@ static int init_unix_sk_info(struct unix_sk_info *ui, UnixSkEntry *ue) + } + + ui->name = (void *)ue->name.data; +- } else +- ui->name = NULL; ++ } else { ++ /* part 2: prevent NULL pointer oops */ ++ ui->name = ""; ++ } ++ + ui->name_dir = (void *)ue->name_dir; + + ui->flags = 0; +diff --git a/images/sk-unix.proto b/images/sk-unix.proto +index 8ddbccd..3f77718 100644 +--- a/images/sk-unix.proto ++++ b/images/sk-unix.proto +@@ -54,4 +54,5 @@ message unix_sk_entry { + optional uint32 ns_id = 
16; + optional sint32 mnt_id = 17 [default = -1]; + /* Please, don't use field with number 18. */ ++ required sint32 repair_ino = 19; + } +-- +2.34.1 + diff --git a/0034-netlink-add-repair-modes-and-clear-resource-when-fai.patch b/0034-netlink-add-repair-modes-and-clear-resource-when-fai.patch new file mode 100644 index 0000000..122df5b --- /dev/null +++ b/0034-netlink-add-repair-modes-and-clear-resource-when-fai.patch @@ -0,0 +1,104 @@ +From 9b556899d67d7b20c64422fbde6292528772094d Mon Sep 17 00:00:00 2001 +From: Xiaoguang Li +Date: Mon, 29 Mar 2021 20:58:28 -0400 +Subject: [PATCH 34/72] netlink: add repair modes and clear resource when + failure + +Signed-off-by: Jingxian He +--- + criu/cr-dump.c | 3 +++ + criu/include/net.h | 1 + + criu/sk-netlink.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 49 insertions(+) + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index e0e11cc..b7e0214 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -2073,6 +2073,9 @@ static int cr_dump_finish(int ret) + } else if (ret != 0 && opts.pin_memory) { + pr_info("clear pin mem info\n"); + clear_pin_mem(0); ++ } else if (ret != 0 && opts.with_notifier_kup) { ++ pr_info("repair off netlink fd\n"); ++ netlink_repair_off(); + } + + if (ret != 0 && opts.with_notifier_kup) { +diff --git a/criu/include/net.h b/criu/include/net.h +index 0da4cad..718cc45 100644 +--- a/criu/include/net.h ++++ b/criu/include/net.h +@@ -55,5 +55,6 @@ extern void check_has_netns_ioc(int fd, bool *kdat_val, const char *name); + extern int net_set_ext(struct ns_id *ns); + extern struct ns_id *get_root_netns(void); + extern int read_net_ns_img(void); ++extern int netlink_repair_off(void); + + #endif /* __CR_NET_H__ */ +diff --git a/criu/sk-netlink.c b/criu/sk-netlink.c +index 754eed9..d4b3b7b 100644 +--- a/criu/sk-netlink.c ++++ b/criu/sk-netlink.c +@@ -68,6 +68,47 @@ int netlink_receive_one(struct nlmsghdr *hdr, struct ns_id *ns, void *arg) + return sk_collect_one(m->ndiag_ino, PF_NETLINK, 
&sd->sd, ns); + } + ++struct netlink_repair_fd { ++ int netlink_fd; ++ struct list_head nlist; ++}; ++ ++static LIST_HEAD(netlink_repair_fds); ++ ++static int netlink_repair_on(int fd) ++{ ++ int ret, aux = 1; ++ struct netlink_repair_fd *nrf; ++ ++ ret = setsockopt(fd, SOL_NETLINK, TCP_REPAIR, &aux, sizeof(aux)); ++ if (ret < 0) { ++ pr_err("Can't turn netlink repair mode ON, error: %d\n", ret); ++ return ret; ++ } ++ nrf = malloc(sizeof(*nrf)); ++ if (!nrf) ++ return -ENOMEM; ++ nrf->netlink_fd = dup(fd); ++ list_add_tail(&nrf->nlist, &netlink_repair_fds); ++ return ret; ++} ++ ++int netlink_repair_off(void) ++{ ++ int aux = 0, ret; ++ struct netlink_repair_fd *nrf, *n; ++ ++ list_for_each_entry_safe(nrf, n, &netlink_repair_fds, nlist) { ++ ret = setsockopt(nrf->netlink_fd, SOL_NETLINK, TCP_REPAIR, &aux, sizeof(aux)); ++ if (ret < 0) ++ pr_err("Failed to turn off repair mode on netlink\n"); ++ close(nrf->netlink_fd); ++ list_del(&nrf->nlist); ++ free(nrf); ++ } ++ return 0; ++} ++ + static bool can_dump_netlink_sk(int lfd) + { + int ret; +@@ -90,6 +131,10 @@ static int dump_one_netlink_fd(int lfd, u32 id, const struct fd_parms *p) + if (IS_ERR(sk)) + goto err; + ++ if (netlink_repair_on(lfd) < 0) { ++ goto err; ++ } ++ + ne.id = id; + ne.ino = p->stat.st_ino; + +-- +2.34.1 + diff --git a/0035-sysvshm-add-dump-restore-sysv-shm-in-host-ipc-ns.patch b/0035-sysvshm-add-dump-restore-sysv-shm-in-host-ipc-ns.patch new file mode 100644 index 0000000..6bdbfc4 --- /dev/null +++ b/0035-sysvshm-add-dump-restore-sysv-shm-in-host-ipc-ns.patch @@ -0,0 +1,114 @@ +From 852b4db35a06ed382e287d88cd055fdf20fc031f Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Sat, 26 Jun 2021 15:18:15 +0800 +Subject: [PATCH 35/72] sysvshm: add dump/restore sysv-shm in host ipc ns + +In original criu design, SysVIPC memory segment, which belongs +to host ipcns, shouldn't be dumped because criu requires the +whole ipcns to be dumped. 
During the restoring ipcns, the new +shared memory will be created, and fill the original page data +in it. + +This patch makes the shared-memory in host ipcns restore possible. +Idea: + The SysVIPC memory won't disappear after the task exit. The basic +information can be got from `/proc/sysvipc/shm` as long as the +system doesn't reboot. Compared with restoring the whole ipcns, +the processes of the shared memory creating and page data filling +are ignored. + +Reference: +- https://www.criu.org/What_cannot_be_checkpointed + +Conflict:NA +Reference:https://gitee.com/src-openeuler/criu/pulls/21 +Signed-off-by: fu.lin +--- + criu/cr-dump.c | 8 ++++---- + criu/cr-restore.c | 43 +++++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 47 insertions(+), 4 deletions(-) + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index b7e0214..e7b5787 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -463,11 +463,11 @@ static int dump_filemap(struct vma_area *vma_area, int fd) + + static int check_sysvipc_map_dump(pid_t pid, VmaEntry *vma) + { +- if (root_ns_mask & CLONE_NEWIPC) +- return 0; ++ if (!(root_ns_mask & CLONE_NEWIPC)) ++ pr_info("Task %d with SysVIPC shmem map @%" PRIx64 " doesn't live in IPC ns\n", ++ pid, vma->start); + +- pr_err("Task %d with SysVIPC shmem map @%" PRIx64 " doesn't live in IPC ns\n", pid, vma->start); +- return -1; ++ return 0; + } + + static int get_task_auxv(pid_t pid, MmEntry *mm) +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 09f135b..152bace 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -1818,6 +1818,46 @@ static int create_children_and_session(void) + return 0; + } + ++static int prepare_rootns_sysv_shm(unsigned long clone_flags) ++{ ++ int retval = 0; ++ char *line = NULL; ++ size_t len = 0; ++ FILE *fp; ++ key_t key; ++ int shmid; ++ mode_t mode; ++ size_t size; ++ ++ /* This is completed by `prepare_namespace()` */ ++ if (!!(clone_flags & CLONE_NEWIPC)) ++ return 0; ++ ++ pr_info("Restoring SYSV shm in 
host namespace\n"); ++ ++ fp = fopen("/proc/sysvipc/shm", "r"); ++ if (fp == NULL) { ++ pr_err("Can't open '/proc/sysvipc/shm', errno(%d): %s\n", errno, strerror(errno)); ++ return -1; ++ } ++ ++ while (getline(&line, &len, fp) != -1) { ++ if (sscanf(line, "%d %d %o %lu", &key, &shmid, &mode, &size) != 4) ++ continue; ++ ++ pr_debug("sscanf key: %d shmid: %d mode %o size %lu\n", ++ key, shmid, mode, size); ++ ++ retval = collect_sysv_shmem(shmid, size); ++ if (retval != 0) ++ goto out; ++ } ++ ++out: ++ fclose(fp); ++ return retval; ++} ++ + static int restore_task_with_children(void *_arg) + { + struct cr_clone_arg *ca = _arg; +@@ -1924,6 +1964,9 @@ static int restore_task_with_children(void *_arg) + if (prepare_namespace(current, ca->clone_flags)) + goto err; + ++ if (prepare_rootns_sysv_shm(ca->clone_flags)) ++ goto err; ++ + if (restore_finish_ns_stage(CR_STATE_PREPARE_NAMESPACES, CR_STATE_FORKING) < 0) + goto err; + +-- +2.34.1 + diff --git a/0036-add-O_REPAIR-flag-to-vma-fd.patch b/0036-add-O_REPAIR-flag-to-vma-fd.patch new file mode 100644 index 0000000..3e937f9 --- /dev/null +++ b/0036-add-O_REPAIR-flag-to-vma-fd.patch @@ -0,0 +1,47 @@ +From 92fd13a21e52343b532eb1a163a159303107a6e2 Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Thu, 24 Jun 2021 16:56:02 +0800 +Subject: [PATCH 36/72] add O_REPAIR flag to vma fd + +Add O_REPAIR flag when opening vma fd. 
+ +Conflict:NA +Reference:https://gitee.com/src-openeuler/criu/pulls/21 +Signed-off-by: Jingxian He +--- + criu/files-reg.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/criu/files-reg.c b/criu/files-reg.c +index b9576a4..7bd8592 100644 +--- a/criu/files-reg.c ++++ b/criu/files-reg.c +@@ -2255,6 +2255,7 @@ void filemap_ctx_fini(void) + } + } + ++#define O_REPAIR 040000000 + static int open_filemap(int pid, struct vma_area *vma) + { + u32 flags; +@@ -2267,13 +2268,15 @@ static int open_filemap(int pid, struct vma_area *vma) + */ + + BUG_ON((vma->vmfd == NULL) || !vma->e->has_fdflags); +- flags = vma->e->fdflags; ++ flags = vma->e->fdflags | O_REPAIR; + + if (ctx.flags != flags || ctx.desc != vma->vmfd) { + if (vma->e->status & VMA_AREA_MEMFD) + ret = memfd_open(vma->vmfd, &flags); +- else ++ else { ++ + ret = open_path(vma->vmfd, do_open_reg_noseek_flags, &flags); ++ } + if (ret < 0) + return ret; + +-- +2.34.1 + diff --git a/0037-looser-file-mode-and-size-check.patch b/0037-looser-file-mode-and-size-check.patch new file mode 100644 index 0000000..1948c60 --- /dev/null +++ b/0037-looser-file-mode-and-size-check.patch @@ -0,0 +1,90 @@ +From bb60f8e71ec85dd11666bbb395508fac4403c251 Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Sat, 26 Jun 2021 11:41:18 +0800 +Subject: [PATCH 37/72] looser file mode and size check + +When the file mode and size larger than dump data, +make the restoring process run success. 
+ +Conflict:NA +Reference:https://gitee.com/src-openeuler/criu/pulls/21 +Signed-off-by: Jingxian He +Signed-off-by: fu.lin +--- + criu/config.c | 1 + + criu/crtools.c | 1 + + criu/files-reg.c | 14 +++++++++++--- + criu/include/cr_options.h | 1 + + 4 files changed, 14 insertions(+), 3 deletions(-) + +diff --git a/criu/config.c b/criu/config.c +index bd0f84d..a9eb699 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -704,6 +704,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, + BOOL_OPT("dump-char-dev", &opts.dump_char_dev), + BOOL_OPT("with-fd-cred", &opts.with_fd_cred), + BOOL_OPT("mask-exit-notify", &opts.mask_exit_notify), ++ BOOL_OPT("weak-file-check", &opts.weak_file_check), + {}, + }; + +diff --git a/criu/crtools.c b/criu/crtools.c +index 1a41be4..e1afeca 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -455,6 +455,7 @@ usage: + \ + " --with-fd-cred Allow to make the restored process has the same cred\n" + " --mask-exit-notify Mask task exit notify during dump and restore\n" ++ " --weak-file-check Allow file size and mod larger than dumping value\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/files-reg.c b/criu/files-reg.c +index 7bd8592..1a3b836 100644 +--- a/criu/files-reg.c ++++ b/criu/files-reg.c +@@ -1991,7 +1991,10 @@ static bool validate_file(const int fd, const struct stat *fd_status, const stru + { + int result = 1; + +- if (rfi->rfe->has_size && (fd_status->st_size != rfi->rfe->size)) { ++ /* NOTICE: customize for the storage module upgrade feature */ ++ if (rfi->rfe->has_size ++ && ((!opts.weak_file_check && fd_status->st_size != rfi->rfe->size) ++ || (fd_status->st_size < rfi->rfe->size))) { + pr_err("File %s has bad size %" PRIu64 " (expect %" PRIu64 ")\n", rfi->path, fd_status->st_size, + rfi->rfe->size); + return false; +@@ -2102,8 +2105,13 @@ ext: + if (!validate_file(tmp, &st, rfi)) + return -1; + +- if 
(rfi->rfe->has_mode && (st.st_mode != rfi->rfe->mode)) { +- pr_err("File %s has bad mode 0%o (expect 0%o)\n", rfi->path, (int)st.st_mode, rfi->rfe->mode); ++ /* NOTICE: customize for the storage module upgrade feature */ ++ if (rfi->rfe->has_mode ++ && ((!opts.weak_file_check && st.st_mode != rfi->rfe->mode) ++ || (st.st_mode < rfi->rfe->mode))) { ++ pr_err("File %s has bad mode 0%o (expect 0%o), weak check %d\n", ++ rfi->path, (int)st.st_mode, rfi->rfe->mode, ++ opts.weak_file_check); + return -1; + } + +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 26ae5b6..dec0082 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -196,6 +196,7 @@ struct cr_options { + int dump_char_dev; + int with_fd_cred; + int mask_exit_notify; ++ int weak_file_check; + }; + + extern struct cr_options opts; +-- +2.34.1 + diff --git a/0038-file-lock-add-repair-mode-to-dump-file-locks.patch b/0038-file-lock-add-repair-mode-to-dump-file-locks.patch new file mode 100644 index 0000000..b1f5ccd --- /dev/null +++ b/0038-file-lock-add-repair-mode-to-dump-file-locks.patch @@ -0,0 +1,308 @@ +From 61ca95f5434573e89151d3557185c517cd69447a Mon Sep 17 00:00:00 2001 +From: Sang Yan +Date: Thu, 8 Jul 2021 14:12:42 +0800 +Subject: [PATCH 38/72] file-lock: add repair mode to dump file locks + +Add a new option "--file-locks-repair" to enable repair mode +while dumping file locks. +Repair mode keeps the locks held while the process is killed during +the dump operation; the locks are then resumed from repair mode when +the process is restored. 
+ +Signed-off-by: Sang Yan +Signed-off-by: fu.lin +--- + criu/config.c | 1 + + criu/cr-dump.c | 8 ++++++ + criu/crtools.c | 1 + + criu/file-lock.c | 10 +++++++ + criu/include/cr_options.h | 1 + + criu/include/fcntl.h | 16 +++++++++++ + criu/include/parasite-syscall.h | 2 ++ + criu/include/parasite.h | 10 +++++++ + criu/parasite-syscall.c | 33 +++++++++++++++++++++++ + criu/pie/parasite.c | 48 +++++++++++++++++++++++++++++++++ + 10 files changed, 130 insertions(+) + +diff --git a/criu/config.c b/criu/config.c +index a9eb699..0a0623a 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -705,6 +705,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, + BOOL_OPT("with-fd-cred", &opts.with_fd_cred), + BOOL_OPT("mask-exit-notify", &opts.mask_exit_notify), + BOOL_OPT("weak-file-check", &opts.weak_file_check), ++ BOOL_OPT("file-locks-repair", &opts.file_locks_repair), + {}, + }; + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index e7b5787..607eac2 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -1679,6 +1679,14 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + goto err_cure; + } + ++ if (opts.file_locks_repair) { ++ ret = parasite_dump_file_locks(parasite_ctl, pid); ++ if (ret) { ++ pr_err("Can't parasite dump file locks (pid: %d)\n", pid); ++ goto err_cure; ++ } ++ } ++ + ret = dump_task_core_all(parasite_ctl, item, &pps_buf, cr_imgset, &misc); + if (ret) { + pr_err("Dump core (pid: %d) failed with %d\n", pid, ret); +diff --git a/criu/crtools.c b/criu/crtools.c +index e1afeca..7358918 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -456,6 +456,7 @@ usage: + " --with-fd-cred Allow to make the restored process has the same cred\n" + " --mask-exit-notify Mask task exit notify during dump and restore\n" + " --weak-file-check Allow file size and mod larger than dumping value\n" ++ " --file-locks-repair Use repair mode to dump and restore file locks\n" + "\n" + "Check options:\n" + " Without 
options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/file-lock.c b/criu/file-lock.c +index 6334462..c893083 100644 +--- a/criu/file-lock.c ++++ b/criu/file-lock.c +@@ -424,6 +424,8 @@ void discard_dup_locks_tail(pid_t pid, int fd) + list_for_each_entry_safe_reverse(fl, p, &file_lock_list, list) { + if (fl->owners_fd != fd || pid != fl->fl_holder) + break; ++ if (fl->fl_kind == FL_POSIX) ++ continue; + + list_del(&fl->list); + xfree(fl); +@@ -611,8 +613,12 @@ static int restore_file_lock(FileLockEntry *fle) + cmd = fle->type; + } else if (fle->type == F_RDLCK) { + cmd = LOCK_SH; ++ if (opts.file_locks_repair) ++ cmd = LOCK_REPAIR; + } else if (fle->type == F_WRLCK) { + cmd = LOCK_EX; ++ if (opts.file_locks_repair) ++ cmd = LOCK_REPAIR; + } else if (fle->type == F_UNLCK) { + cmd = LOCK_UN; + } else { +@@ -638,6 +644,10 @@ static int restore_file_lock(FileLockEntry *fle) + flk.l_pid = fle->pid; + flk.l_type = fle->type; + ++ if (opts.file_locks_repair ++ && (fle->type == F_RDLCK || fle->type == F_WRLCK)) ++ flk.l_type = F_REPAIR; ++ + pr_info("(posix)flag: %d, type: %d, pid: %d, fd: %d, " + "start: %8" PRIx64 ", len: %8" PRIx64 "\n", + fle->flag, fle->type, fle->pid, fle->fd, fle->start, fle->len); +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index dec0082..9ec8034 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -197,6 +197,7 @@ struct cr_options { + int with_fd_cred; + int mask_exit_notify; + int weak_file_check; ++ int file_locks_repair; + }; + + extern struct cr_options opts; +diff --git a/criu/include/fcntl.h b/criu/include/fcntl.h +index 568977c..0627818 100644 +--- a/criu/include/fcntl.h ++++ b/criu/include/fcntl.h +@@ -23,6 +23,22 @@ struct f_owner_ex { + #define F_SETCRED 18 + #endif + ++#ifndef F_NEED_REPAIR ++#define F_NEED_REPAIR 16 ++#endif ++ ++#ifndef F_REPAIR ++#define F_REPAIR 32 ++#endif ++ ++#ifndef LOCK_NEED_REPAIR ++#define LOCK_NEED_REPAIR 256 /* REPAIRING 
lock */ ++#endif ++ ++#ifndef LOCK_REPAIR ++#define LOCK_REPAIR 512 /* REPAIR lock */ ++#endif ++ + /* + * These things are required to compile on CentOS-6 + */ +diff --git a/criu/include/parasite-syscall.h b/criu/include/parasite-syscall.h +index 4540e11..9f2d3e0 100644 +--- a/criu/include/parasite-syscall.h ++++ b/criu/include/parasite-syscall.h +@@ -48,4 +48,6 @@ extern int parasite_dump_cgroup(struct parasite_ctl *ctl, struct parasite_dump_c + + extern struct parasite_tty_args *parasite_dump_tty(struct parasite_ctl *ctl, int fd, int type); + ++extern int parasite_dump_file_locks(struct parasite_ctl *ctl, int pid); ++ + #endif /* __CR_PARASITE_SYSCALL_H__ */ +diff --git a/criu/include/parasite.h b/criu/include/parasite.h +index d2a0688..230c453 100644 +--- a/criu/include/parasite.h ++++ b/criu/include/parasite.h +@@ -37,6 +37,7 @@ enum { + PARASITE_CMD_CHECK_VDSO_MARK, + PARASITE_CMD_CHECK_AIOS, + PARASITE_CMD_DUMP_CGROUP, ++ PARASITE_CMD_DUMP_FILELOCKS, + + PARASITE_CMD_MAX, + }; +@@ -244,6 +245,15 @@ struct parasite_dump_cgroup_args { + char contents[1 << 12]; + }; + ++struct parasite_dump_filelocks_args { ++ short kind; ++ short type; ++ long start; ++ long len; ++ int pid; ++ int fd; ++}; ++ + #endif /* !__ASSEMBLY__ */ + + #endif /* __CR_PARASITE_H__ */ +diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c +index ee4fa86..c57f854 100644 +--- a/criu/parasite-syscall.c ++++ b/criu/parasite-syscall.c +@@ -32,6 +32,7 @@ + #include + #include "signal.h" + #include "sigframe.h" ++#include "file-lock.h" + + #include + #include +@@ -654,3 +655,35 @@ struct parasite_ctl *parasite_infect_seized(pid_t pid, struct pstree_item *item, + + return ctl; + } ++ ++int parasite_dump_file_locks(struct parasite_ctl *ctl, int pid) ++{ ++ struct parasite_dump_filelocks_args *args; ++ struct file_lock *fl; ++ int ret; ++ ++ args = compel_parasite_args(ctl, struct parasite_dump_filelocks_args); ++ ++ list_for_each_entry(fl, &file_lock_list, list) { ++ if (fl->real_owner != 
pid) ++ continue; ++ ++ args->pid = fl->real_owner; ++ args->fd = fl->owners_fd; ++ args->kind = fl->fl_kind; ++ args->type = fl->fl_ltype; ++ args->start = fl->start; ++ if (!strncmp(fl->end, "EOF", 3)) ++ args->len = 0; ++ else ++ args->len = (atoll(fl->end) + 1) - fl->start; ++ ++ ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_FILELOCKS, ctl); ++ if (ret < 0) { ++ pr_err("Parasite dump file lock failed! (pid: %d)\n", pid); ++ return ret; ++ } ++ } ++ ++ return 0; ++} +diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c +index e49958b..c781303 100644 +--- a/criu/pie/parasite.c ++++ b/criu/pie/parasite.c +@@ -22,6 +22,7 @@ + #include "criu-log.h" + #include "tty.h" + #include "aio.h" ++#include "file-lock.h" + + #include "asm/parasite.h" + #include "restorer.h" +@@ -769,6 +770,50 @@ static int parasite_dump_cgroup(struct parasite_dump_cgroup_args *args) + return 0; + } + ++static int set_filelocks_needrepair(struct parasite_dump_filelocks_args *args) ++{ ++ int ret; ++ ++ if (args->kind == FL_FLOCK) { ++ if (args->type == F_RDLCK || args->type == F_WRLCK) { ++ int cmd = LOCK_NEED_REPAIR; ++ ++ pr_info("Need Repair flock kind: %d, type: %d, cmd: %d, pid: %d, fd: %d\n", ++ args->kind, args->type, cmd, args->pid, args->fd); ++ ++ ret = sys_flock(args->fd, cmd); ++ if (ret < 0) { ++ pr_err("Can not set NEED_REPAIR flock!\n"); ++ return ret; ++ } ++ } ++ } else if (args->kind == FL_POSIX) { ++ if (args->type == F_RDLCK || args->type == F_WRLCK) { ++ struct flock flk; ++ memset(&flk, 0, sizeof(flk)); ++ ++ flk.l_whence = SEEK_SET; ++ flk.l_start = args->start; ++ flk.l_len = args->len; ++ flk.l_pid = args->pid; ++ flk.l_type = F_NEED_REPAIR; ++ ++ pr_info("Need Repair posix lock kind: %d, type: %d, cmd: %d, pid: %d, fd: %d, " ++ "start: %8"PRIx64", len: %8"PRIx64"\n", ++ args->kind, args->type, flk.l_type, args->pid, args->fd, ++ args->start, args->len); ++ ++ ret = sys_fcntl(args->fd, F_SETLKW, (long)&flk); ++ if (ret < 0) { ++ pr_err("Can not set NEED_REPAIR posix 
lock!\n"); ++ return ret; ++ } ++ } ++ } ++ ++ return 0; ++} ++ + void parasite_cleanup(void) + { + if (mprotect_args) { +@@ -821,6 +866,9 @@ int parasite_daemon_cmd(int cmd, void *args) + case PARASITE_CMD_DUMP_CGROUP: + ret = parasite_dump_cgroup(args); + break; ++ case PARASITE_CMD_DUMP_FILELOCKS: ++ ret = set_filelocks_needrepair(args); ++ break; + default: + pr_err("Unknown command in parasite daemon thread leader: %d\n", cmd); + ret = -1; +-- +2.34.1 + diff --git a/0039-unlock-network-when-restore-fails.patch b/0039-unlock-network-when-restore-fails.patch new file mode 100644 index 0000000..43f6c8b --- /dev/null +++ b/0039-unlock-network-when-restore-fails.patch @@ -0,0 +1,60 @@ +From 5421245cf87bac71cbe999f257ba5b3a96c8733b Mon Sep 17 00:00:00 2001 +From: Liu Chao +Date: Fri, 9 Jul 2021 07:32:20 +0000 +Subject: [PATCH 39/72] unlock network when restore fails + +Conflict:NA +Reference:https://gitee.com/src-openeuler/criu/pulls/21 +Signed-off-by: fu.lin +--- + criu/cr-restore.c | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 152bace..d19768d 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -115,6 +115,9 @@ + #endif + + struct pstree_item *current; ++#define NETWORK_COLLECTED 0x1 ++#define NETWORK_UNLOCK 0x2 ++static int network_status = 0; + + static int restore_task_with_children(void *); + static int sigreturn_restore(pid_t pid, struct task_restore_args *ta, unsigned long alen, CoreEntry *core); +@@ -249,6 +252,7 @@ static int crtools_prepare_shared(void) + /* Connections are unlocked from criu */ + if (!files_collected() && collect_image(&inet_sk_cinfo)) + return -1; ++ network_status |= NETWORK_COLLECTED; + + if (collect_binfmt_misc()) + return -1; +@@ -2525,6 +2529,7 @@ skip_ns_bouncing: + + /* Unlock network before disabling repair mode on sockets */ + network_unlock(); ++ network_status |= NETWORK_UNLOCK; + + /* + * Stop getting sigchld, after we resume the tasks they +@@ 
-2734,6 +2739,14 @@ clean_cgroup: + fini_cgroup(); + err: + cr_plugin_fini(CR_PLUGIN_STAGE__RESTORE, ret); ++ if (ret < 0) { ++ if (!!(network_status & NETWORK_COLLECTED) ++ && !files_collected() && collect_image(&inet_sk_cinfo)) ++ pr_err("collect inet sk cinfo fail\n"); ++ ++ if (!!(network_status & NETWORK_UNLOCK)) ++ network_unlock(); ++ } + return ret; + } + +-- +2.34.1 + diff --git a/0040-net-add-shared-socket-recover-method-for-criu.patch b/0040-net-add-shared-socket-recover-method-for-criu.patch new file mode 100644 index 0000000..8de2a88 --- /dev/null +++ b/0040-net-add-shared-socket-recover-method-for-criu.patch @@ -0,0 +1,332 @@ +From a22542173083d2eeb5dde627c47452ea641c98c1 Mon Sep 17 00:00:00 2001 +From: Sang Yan +Date: Mon, 12 Jul 2021 16:14:45 +0800 +Subject: [PATCH 40/72] net: add shared socket recover method for criu + +When the socket file is shared with another process, +it will not be freed during dumping process. +We can repair the socket file by installing it to +the old fd number. + +Add new options: "--share-dst-ports" and "--share-src-ports" +for user to tell criu which socket ports are shared. 
+ +Conflict:NA +Reference:https://gitee.com/src-openeuler/criu/pulls/21 +Signed-off-by: Jingxian He +--- + criu/config.c | 8 ++ + criu/crtools.c | 3 + + criu/files.c | 18 ++++- + criu/include/cr_options.h | 2 + + criu/include/files.h | 4 + + criu/include/net.h | 1 + + criu/include/sk-inet.h | 3 + + criu/sk-inet.c | 151 ++++++++++++++++++++++++++++++++++++++ + 8 files changed, 189 insertions(+), 1 deletion(-) + +diff --git a/criu/config.c b/criu/config.c +index 0a0623a..7e92731 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -706,6 +706,8 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, + BOOL_OPT("mask-exit-notify", &opts.mask_exit_notify), + BOOL_OPT("weak-file-check", &opts.weak_file_check), + BOOL_OPT("file-locks-repair", &opts.file_locks_repair), ++ { "share-dst-ports", required_argument, 0, 2000 }, ++ { "share-src-ports", required_argument, 0, 2001 }, + {}, + }; + +@@ -1041,6 +1043,12 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, + return 1; + } + break; ++ case 2000: ++ SET_CHAR_OPTS(share_dst_ports, optarg); ++ break; ++ case 2001: ++ SET_CHAR_OPTS(share_src_ports, optarg); ++ break; + case 'V': + pr_msg("Version: %s\n", CRIU_VERSION); + if (strcmp(CRIU_GITID, "0")) +diff --git a/criu/crtools.c b/criu/crtools.c +index 7358918..cfa149a 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -104,6 +104,9 @@ int main(int argc, char *argv[], char *envp[]) + goto usage; + } + ++ if (parse_share_ports()) ++ goto usage; ++ + log_set_loglevel(opts.log_level); + + if (optind < argc && !strcmp(argv[optind], "swrk")) { +diff --git a/criu/files.c b/criu/files.c +index 1ec5281..1c52cf4 100644 +--- a/criu/files.c ++++ b/criu/files.c +@@ -705,6 +705,8 @@ int dump_my_file(int lfd, u32 *id, int *type) + return 0; + } + ++int dst_pid; ++ + int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, struct parasite_drain_fd *dfds) + { + int *lfds = NULL; +@@ -728,7 +730,7 @@ int 
dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, s + img = open_image(CR_FD_FDINFO, O_DUMP, item->ids->files_id); + if (!img) + goto err; +- ++ dst_pid = item->pid->real; + ret = 0; /* Don't fail if nr_fds == 0 */ + for (off = 0; ret == 0 && off < dfds->nr_fds; off += nr_fds) { + if (nr_fds + off > dfds->nr_fds) +@@ -1237,6 +1239,20 @@ static int open_fd(struct fdinfo_list_entry *fle) + goto out; + } + ++ if (d->ops->type == FD_TYPES__INETSK) { ++ if (check_need_repair(d)) { ++ ret = repair_share_socket(d->id); ++ if (!ret) { ++ new_fd = get_share_socket(); ++ pr_info("get share socket:%d\n", new_fd); ++ if (new_fd <= 0 || setup_and_serve_out(fle, new_fd) < 0) ++ return -1; ++ fle->stage = FLE_RESTORED; ++ return 0; ++ } ++ } ++ } ++ + /* + * Open method returns the following values: + * 0 -- restore is successfully finished; +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 9ec8034..b7c1e34 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -198,6 +198,8 @@ struct cr_options { + int mask_exit_notify; + int weak_file_check; + int file_locks_repair; ++ char *share_dst_ports; ++ char *share_src_ports; + }; + + extern struct cr_options opts; +diff --git a/criu/include/files.h b/criu/include/files.h +index 1d979a9..0521c7e 100644 +--- a/criu/include/files.h ++++ b/criu/include/files.h +@@ -201,4 +201,8 @@ extern int open_transport_socket(void); + extern int set_fds_event(pid_t virt); + extern void wait_fds_event(void); + ++extern int repair_share_socket(int id); ++extern int check_need_repair(struct file_desc *d); ++extern int get_share_socket(void); ++ + #endif /* __CR_FILES_H__ */ +diff --git a/criu/include/net.h b/criu/include/net.h +index 718cc45..ec47b61 100644 +--- a/criu/include/net.h ++++ b/criu/include/net.h +@@ -16,6 +16,7 @@ extern int dump_net_ns(struct ns_id *ns); + extern int prepare_net_namespaces(void); + extern void fini_net_namespaces(void); + extern int 
netns_keep_nsfd(void); ++extern int parse_share_ports(void); + + struct pstree_item; + extern int restore_task_net_ns(struct pstree_item *current); +diff --git a/criu/include/sk-inet.h b/criu/include/sk-inet.h +index c832d63..27deceb 100644 +--- a/criu/include/sk-inet.h ++++ b/criu/include/sk-inet.h +@@ -101,4 +101,7 @@ struct rst_tcp_sock { + union libsoccr_addr; + int restore_sockaddr(union libsoccr_addr *sa, int family, u32 pb_port, u32 *pb_addr, u32 ifindex); + ++#define MAX_SHARE_PORT_NUM 64 ++extern int dst_pid; ++ + #endif /* __CR_SK_INET_H__ */ +diff --git a/criu/sk-inet.c b/criu/sk-inet.c +index 05048c8..c7de793 100644 +--- a/criu/sk-inet.c ++++ b/criu/sk-inet.c +@@ -431,6 +431,152 @@ static bool needs_scope_id(uint32_t *src_addr) + return false; + } + ++#define ADD_SHARE_SOCKET_PATH "/sys/kernel/add_share_socket" ++#define REPAIR_SHARE_SOCKET_PATH "/sys/kernel/repair_share_socket" ++#define SHARE_SOCKET_PATH "/sys/kernel/share_socket" ++ ++int add_share_socket(u32 id, int fd, int pid, int port) ++{ ++ int retval; ++ char buf[256] = {0}; ++ ++ retval = snprintf(buf, 256, "%u,%d,%d,%d", id, fd, pid, port); ++ if (retval <= 0) ++ return -EFAULT; ++ ++ fd = open(ADD_SHARE_SOCKET_PATH, O_WRONLY, 0); ++ if (fd < 0) { ++ pr_err("open file:%s fail\n", ADD_SHARE_SOCKET_PATH); ++ return fd; ++ } ++ ++ retval = write(fd, buf, strlen(buf)); ++ close(fd); ++ return retval < 0 ? -1 : 0; ++} ++ ++ ++int repair_share_socket(int id) ++{ ++ int retval, fd; ++ char buf[256] = {0}; ++ ++ retval = snprintf(buf, 256, "%u", id); ++ if (retval <= 0) ++ return -EFAULT; ++ ++ fd = open(REPAIR_SHARE_SOCKET_PATH, O_WRONLY, 0); ++ if (fd < 0) { ++ pr_err("open file:%s fail\n", REPAIR_SHARE_SOCKET_PATH); ++ return fd; ++ } ++ retval = write(fd, buf, strlen(buf)); ++ ++ close(fd); ++ return retval < 0 ? 
-1 : 0; ++} ++ ++int get_share_socket(void) ++{ ++ int fd; ++ ssize_t count; ++ int retval = -1; ++ char buf[32] = {0}; ++ ++ fd = open(SHARE_SOCKET_PATH, O_RDONLY, 0); ++ if (fd < 0) { ++ pr_err("open file:%s fail\n", SHARE_SOCKET_PATH); ++ return fd; ++ } ++ ++ count = read(fd, buf, sizeof(buf) - 1); /* keep buf NUL-terminated for atoi() */ ++ if (count > 0) ++ retval = atoi(buf); ++ ++ close(fd); ++ return retval; ++} ++ ++int g_share_dst_ports[MAX_SHARE_PORT_NUM]; ++int g_share_dst_port_num; ++int g_share_src_ports[MAX_SHARE_PORT_NUM]; ++int g_share_src_port_num; ++ ++int parse_share_ports(void) ++{ ++ char *save, *p; ++ ++ if (opts.share_dst_ports) { ++ p = strtok_r(opts.share_dst_ports, ",", &save); ++ while (p != NULL) { ++ if (g_share_dst_port_num >= MAX_SHARE_PORT_NUM) ++ return -1; ++ g_share_dst_ports[g_share_dst_port_num] = atoi(p); ++ if (!g_share_dst_ports[g_share_dst_port_num]) ++ return -1; ++ g_share_dst_port_num++; ++ p = strtok_r(NULL, ",", &save); ++ } ++ } ++ ++ if (opts.share_src_ports) { ++ p = strtok_r(opts.share_src_ports, ",", &save); ++ while (p != NULL) { ++ if (g_share_src_port_num >= MAX_SHARE_PORT_NUM) ++ return -1; ++ g_share_src_ports[g_share_src_port_num] = atoi(p); ++ if (!g_share_src_ports[g_share_src_port_num]) ++ return -1; ++ g_share_src_port_num++; ++ p = strtok_r(NULL, ",", &save); ++ } ++ } ++ return 0; ++} ++ ++int check_share_dst_port(int dst_port) ++{ ++ int i; ++ int ret = 0; ++ ++ for (i = 0; i < g_share_dst_port_num; i++) { ++ if (dst_port == g_share_dst_ports[i]) { ++ ret = 1; ++ break; ++ } ++ } ++ return ret; ++} ++ ++int check_share_src_port(int src_port) ++{ ++ int i; ++ int ret = 0; ++ ++ for (i = 0; i < g_share_src_port_num; i++) { ++ if (src_port == g_share_src_ports[i]) { ++ ret = 1; ++ break; ++ } ++ } ++ ++ return ret; ++} ++ ++int check_need_repair(struct file_desc *d) ++{ ++ struct inet_sk_info *ii; ++ InetSkEntry *ie; ++ ++ ii = container_of(d, struct inet_sk_info, d); ++ ie = ii->ie; ++ if (check_share_dst_port(ie->dst_port) || ++ 
check_share_src_port(ie->src_port)) ++ return 1; ++ else ++ return 0; ++} ++ + static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int family) + { + struct inet_sk_desc *sk; +@@ -488,6 +634,11 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa + + BUG_ON(sk->sd.already_dumped); + ++ if (check_share_dst_port(sk->dst_port) || check_share_src_port(sk->src_port)) { ++ pr_info("Start add share prot:%d src %d\n", sk->dst_port, sk->src_port); ++ add_share_socket(id, lfd, dst_pid, sk->src_port); ++ } ++ + ie.id = id; + ie.ino = sk->sd.ino; + if (sk->sd.sk_ns) { +-- +2.34.1 + diff --git a/0041-tcp-save-src-ports-to-ip_local_reserved_ports-when-d.patch b/0041-tcp-save-src-ports-to-ip_local_reserved_ports-when-d.patch new file mode 100644 index 0000000..1159098 --- /dev/null +++ b/0041-tcp-save-src-ports-to-ip_local_reserved_ports-when-d.patch @@ -0,0 +1,273 @@ +From aac63cee766bb6840326d008ed1b1993bb7c629a Mon Sep 17 00:00:00 2001 +From: Liu Chao +Date: Mon, 19 Jul 2021 03:19:30 +0000 +Subject: [PATCH 41/72] tcp: save src ports to ip_local_reserved_ports when + dump tasks and retore it when restore tasks + +Signed-off-by: Liu Chao +Signed-off-by: fu.lin +--- + criu/Makefile.crtools | 1 + + criu/config.c | 8 ++- + criu/cr-dump.c | 4 ++ + criu/crtools.c | 1 + + criu/include/cr_options.h | 1 + + criu/include/reserved-ports.h | 10 ++++ + criu/net.c | 6 +++ + criu/reserved-ports.c | 98 +++++++++++++++++++++++++++++++++++ + criu/sk-tcp.c | 2 +- + 9 files changed, 129 insertions(+), 2 deletions(-) + create mode 100644 criu/include/reserved-ports.h + create mode 100644 criu/reserved-ports.c + +diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools +index 65cc215..3e522b4 100644 +--- a/criu/Makefile.crtools ++++ b/criu/Makefile.crtools +@@ -94,6 +94,7 @@ obj-y += pin-mem.o + obj-y += devname.o + obj-y += files-chr.o + obj-y += exit-notify.o ++obj-y += reserved-ports.o + obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o + 
obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o + CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 +diff --git a/criu/config.c b/criu/config.c +index 7e92731..ae5f81e 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -615,7 +615,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, + "no-" OPT_NAME, no_argument, SAVE_TO, false \ + } + +- static const char short_opts[] = "dSsRt:hD:o:v::x::Vr:jJ:lW:L:M:"; ++ static const char short_opts[] = "dSsRt:hD:o:v::x::Vr:jJ:lW:L:M:P:"; + static struct option long_opts[] = { + { "tree", required_argument, 0, 't' }, + { "leave-stopped", no_argument, 0, 's' }, +@@ -708,6 +708,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, + BOOL_OPT("file-locks-repair", &opts.file_locks_repair), + { "share-dst-ports", required_argument, 0, 2000 }, + { "share-src-ports", required_argument, 0, 2001 }, ++ { "reserve-ports", required_argument, 0, 'P' }, + {}, + }; + +@@ -1057,6 +1058,11 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, + case 'h': + *usage_error = false; + return 2; ++ case 'P': ++ opts.reserve_ports = atoi(optarg); ++ if (opts.reserve_ports < 0) ++ goto bad_arg; ++ break; + default: + return 2; + } +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index 607eac2..a8ab61e 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -89,6 +89,7 @@ + #include "pin-mem.h" + #include "notifier.h" + #include "files-chr.h" ++#include "reserved-ports.h" + + /* + * Architectures can overwrite this function to restore register sets that +@@ -2223,6 +2224,9 @@ int cr_dump_tasks(pid_t pid) + goto err; + } + ++ if (opts.reserve_ports > 0) ++ set_reserved_ports(); ++ + if (parent_ie) { + inventory_entry__free_unpacked(parent_ie, NULL); + parent_ie = NULL; +diff --git a/criu/crtools.c b/criu/crtools.c +index cfa149a..ae858e8 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -460,6 +460,7 @@ usage: + " --mask-exit-notify Mask task exit notify during 
dump and restore\n" + " --weak-file-check Allow file size and mod larger than dumping value\n" + " --file-locks-repair Use repair mode to dump and restore file locks\n" ++ " --reserve-ports Reserve src ports in kernel\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index b7c1e34..3b61c6b 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -200,6 +200,7 @@ struct cr_options { + int file_locks_repair; + char *share_dst_ports; + char *share_src_ports; ++ int reserve_ports; + }; + + extern struct cr_options opts; +diff --git a/criu/include/reserved-ports.h b/criu/include/reserved-ports.h +new file mode 100644 +index 0000000..b614482 +--- /dev/null ++++ b/criu/include/reserved-ports.h +@@ -0,0 +1,10 @@ ++#ifndef __CRIU_RESERVED_PORTS_H__ ++#define __CRIU_RESERVED_PORTS_H__ ++ ++#define RESERVED_PORTS_PATH "/proc/sys/net/ipv4/ip_local_reserved_ports" ++ ++extern void read_reserved_ports(char *path); ++extern void write_reserved_ports(char *path); ++extern void set_reserved_ports(void); ++ ++#endif /* __CRIU_RESERVED_PORTS_H__ */ +diff --git a/criu/net.c b/criu/net.c +index 7b45f06..fff4c85 100644 +--- a/criu/net.c ++++ b/criu/net.c +@@ -46,6 +46,7 @@ + #include "external.h" + #include "fdstore.h" + #include "netfilter.h" ++#include "reserved-ports.h" + + #include "protobuf.h" + #include "images/netdev.pb-c.h" +@@ -3193,6 +3194,11 @@ void network_unlock(void) + { + pr_info("Unlock network\n"); + ++ if (opts.reserve_ports) { ++ read_reserved_ports("ip_local_reserved_ports"); ++ write_reserved_ports(RESERVED_PORTS_PATH); ++ } ++ + cpt_unlock_tcp_connections(); + rst_unlock_tcp_connections(); + +diff --git a/criu/reserved-ports.c b/criu/reserved-ports.c +new file mode 100644 +index 0000000..b4996ab +--- /dev/null ++++ b/criu/reserved-ports.c +@@ -0,0 +1,98 @@ ++#include ++#include ++#include ++#include ++#include ++ 
++#include "log.h"
++#include "cr_options.h"
++#include "util.h"
++#include "sk-inet.h"
++#include "reserved-ports.h"
++
++#include "common/list.h"
++
++static char* reserved_ports;
++static int reserved_ports_num;
++extern struct list_head cpt_tcp_repair_sockets;
++
++void read_reserved_ports(char *path)
++{
++	FILE *file;
++	size_t size = 0;
++
++	free(reserved_ports);
++	reserved_ports = NULL;
++
++	file = fopen(path, "r");
++	if (!file) {
++		pr_err("Cannot fopen %s\n", path);
++		return;
++	}
++
++	if (getline(&reserved_ports, &size, file) <= 0) {
++		pr_err("Cannot getline from %s\n", path);
++		/* getline() can allocate even when it fails: drop that buffer */
++		free(reserved_ports);
++		reserved_ports = NULL;
++	} else {
++		reserved_ports[strcspn(reserved_ports, "\n")] = '\0';
++	}
++	fclose(file);
++}
++
++void write_reserved_ports(char *path)
++{
++	int fd;
++
++	/* nothing was read: never overwrite @path with an empty list */
++	if (!reserved_ports)
++		return;
++
++	fd = open(path, O_RDWR | O_CREAT, 0640);
++	if (fd < 0) {
++		pr_perror("Cannot open %s", path);
++		return;
++	}
++
++	cr_system(-1, fd, -1, "/usr/bin/echo",
++			(char *[]) { "echo", reserved_ports, NULL}, 0);
++	close(fd);
++}
++
++static int add_reserved_ports(struct inet_sk_desc *sk)
++{
++	size_t len = strlen(reserved_ports);
++
++	if (reserved_ports_num >= opts.reserve_ports)
++		return -1;
++
++	snprintf(reserved_ports + len, 7, len ? ",%u" : "%u", sk->src_port);
++	reserved_ports_num++;
++	return 0;
++}
++
++void set_reserved_ports(void)
++{
++	struct inet_sk_desc *sk = NULL;
++	size_t size = 0;
++
++	read_reserved_ports(RESERVED_PORTS_PATH);
++	if (!reserved_ports)
++		return;
++
++	/* keep a copy of the current list; network_unlock() writes it back */
++	write_reserved_ports("ip_local_reserved_ports");
++
++	/* add_reserved_ports() appends at most 6 bytes per port, plus the NUL */
++	size = strlen(reserved_ports) + 6 * opts.reserve_ports + 1;
++	if (xrealloc_safe(&reserved_ports, size))
++		exit(1);
++
++	list_for_each_entry(sk, &cpt_tcp_repair_sockets, rlist)
++		add_reserved_ports(sk);
++	write_reserved_ports(RESERVED_PORTS_PATH);
++
++	free(reserved_ports);
++	reserved_ports = NULL;
++}
+diff
--git a/criu/sk-tcp.c b/criu/sk-tcp.c +index 0afecd2..38889d7 100644 +--- a/criu/sk-tcp.c ++++ b/criu/sk-tcp.c +@@ -30,7 +30,7 @@ + #undef LOG_PREFIX + #define LOG_PREFIX "tcp: " + +-static LIST_HEAD(cpt_tcp_repair_sockets); ++LIST_HEAD(cpt_tcp_repair_sockets); + static LIST_HEAD(rst_tcp_repair_sockets); + + static int lock_connection(struct inet_sk_desc *sk) +-- +2.34.1 + diff --git a/0042-reg-file-fix-dump-fail-problem-with-null-seek-op.patch b/0042-reg-file-fix-dump-fail-problem-with-null-seek-op.patch new file mode 100644 index 0000000..76aeb09 --- /dev/null +++ b/0042-reg-file-fix-dump-fail-problem-with-null-seek-op.patch @@ -0,0 +1,45 @@ +From 06a0277c2aab1442c724217957fd5f915ace2753 Mon Sep 17 00:00:00 2001 +From: Zhuling +Date: Thu, 22 Jul 2021 10:15:15 +0800 +Subject: [PATCH 42/72] reg-file: fix dump fail problem with null seek op + +Some customizing `struct file_operations` implementation has +no `llseek`, therefore ignore the no-implementation errno. + +Fix file dumping fail problem when the file seek op is null. + +Signed-off-by: Jingxian He +Signed-off-by: fu.lin +--- + criu/files-reg.c | 15 ++++++++++++--- + 1 file changed, 12 insertions(+), 3 deletions(-) + +diff --git a/criu/files-reg.c b/criu/files-reg.c +index 1a3b836..6dc8745 100644 +--- a/criu/files-reg.c ++++ b/criu/files-reg.c +@@ -2176,9 +2176,18 @@ static int do_open_reg(int ns_root_fd, struct reg_file_info *rfi, void *arg) + */ + if (!(rfi->rfe->flags & O_PATH)) { + if (rfi->rfe->pos != -1ULL && lseek(fd, rfi->rfe->pos, SEEK_SET) < 0) { +- pr_perror("Can't restore file pos"); +- close(fd); +- return -1; ++ /* ++ * Some customizing `struct file_operations` ++ * implementation has no `llseek`, therefore ++ * ignore the no-implementation errno. 
++ */ ++ if (errno == ESPIPE) { ++ pr_warn("No ability to restore file ops\n"); ++ } else { ++ pr_perror("Can't restore file pos"); ++ close(fd); ++ return -1; ++ } + } + } + +-- +2.34.1 + diff --git a/0043-fix-dump-fail-problem-with-no-access-to-get-socket-f.patch b/0043-fix-dump-fail-problem-with-no-access-to-get-socket-f.patch new file mode 100644 index 0000000..754ee2b --- /dev/null +++ b/0043-fix-dump-fail-problem-with-no-access-to-get-socket-f.patch @@ -0,0 +1,39 @@ +From 88274e29aaaec4a53df996ae84c37ad20f36395f Mon Sep 17 00:00:00 2001 +From: Zhuling +Date: Sat, 24 Jul 2021 16:37:17 +0800 +Subject: [PATCH 43/72] fix dump fail problem with no access to get socket + filter + +Someone uses bpf hook by writing the kernel function instead +of the bpf code, it causes the error here. + +Fix socket dumping fail problem when user space has no access +to getting socket filter. + +Signed-off-by: Jingxian He +Signed-off-by: fu.lin +--- + criu/sockets.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/criu/sockets.c b/criu/sockets.c +index 2ddf85e..e412a1d 100644 +--- a/criu/sockets.c ++++ b/criu/sockets.c +@@ -355,7 +355,12 @@ static int dump_socket_filter(int sk, SkOptsEntry *soe) + + ret = getsockopt(sk, SOL_SOCKET, SO_GET_FILTER, NULL, &len); + if (ret) { +- pr_perror("Can't get socket filter len"); ++ pr_warn("Can't get socket filter len"); ++ /* Someone uses bpf hook by writing the kernel function ++ * instead of the bpf code, it causes the error here. 
++ */ ++ if (errno == EACCES) ++ return 0; + return ret; + } + +-- +2.34.1 + diff --git a/0044-proc-parse-fix-vma-offset-value-for-the-sysfs-file-o.patch b/0044-proc-parse-fix-vma-offset-value-for-the-sysfs-file-o.patch new file mode 100644 index 0000000..b363ba7 --- /dev/null +++ b/0044-proc-parse-fix-vma-offset-value-for-the-sysfs-file-o.patch @@ -0,0 +1,139 @@ +From c7f9888e234a626a4d7bf31b89d66b91607f9785 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Tue, 27 Jul 2021 11:40:34 +0800 +Subject: [PATCH 44/72] proc parse: fix vma offset value for the sysfs file of + pci devices + +Some pci devices create bin sysfs file which permit to use `mmap()` +syscall, the 6th parameter `offset` is always 0 when those kinds of +files create file mapping. The value of `offset` will be assign to +`vma->vm_pgoff` in kernel. However, it will be changed to pci address +automically during mmap callback function `pci_mmap_resource_range()`, +and the offset in `/proc//maps` will show non-zero. It will result +criu restore fails. + +There are many of those files. Just retry the mmap action. + +NOTICE: the stragy is try best, not whitelist. 
+ +Signed-off-by: He Jingxian +Signed-off-by: fu.lin +Signed-off-by: fu.lin +--- + criu/include/image.h | 1 + + criu/pie/restorer.c | 22 +++++++++++++++++++--- + criu/proc_parse.c | 32 ++++++++++++++++++++++++++++++++ + 3 files changed, 52 insertions(+), 3 deletions(-) + +diff --git a/criu/include/image.h b/criu/include/image.h +index 66492c0..0156314 100644 +--- a/criu/include/image.h ++++ b/criu/include/image.h +@@ -86,6 +86,7 @@ + #define VMA_AREA_MEMFD (1 << 14) + #define VMA_AREA_ANON_INODE (1 << 15) + #define VMA_AREA_CHR (1 << 16) ++#define VMA_AREA_DEV_SHARE (1 << 17) + + #define VMA_CLOSE (1 << 28) + #define VMA_NO_PROT_WRITE (1 << 29) +diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c +index fde6e30..67b0d4c 100644 +--- a/criu/pie/restorer.c ++++ b/criu/pie/restorer.c +@@ -883,8 +883,9 @@ static unsigned long restore_mapping(VmaEntry *vma_entry) + * that mechanism as it causes the process to be charged for memory + * immediately upon mmap, not later upon preadv(). + */ +- pr_debug("\tmmap(%" PRIx64 " -> %" PRIx64 ", %x %x %d)\n", vma_entry->start, vma_entry->end, prot, flags, +- (int)vma_entry->fd); ++ pr_debug("\tmmap(%" PRIx64 " -> %" PRIx64 ", %x %x %d %lx)\n", ++ vma_entry->start, vma_entry->end, prot, flags, ++ (int)vma_entry->fd, vma_entry->pgoff); + /* + * Should map memory here. Note we map them as + * writable since we're going to restore page +@@ -892,6 +893,20 @@ static unsigned long restore_mapping(VmaEntry *vma_entry) + */ + addr = sys_mmap(decode_pointer(vma_entry->start), vma_entry_len(vma_entry), prot, flags, vma_entry->fd, + vma_entry->pgoff); ++ /* Some drivers implements its own mmap callback, the `mmap()` argument ++ * `offset` has the differet semantic with POSIX standard. Therefore, ++ * try to re-mmap with offset 0. ++ * ++ * NOTICE: the stragy is try best, not whitelist. 
++ */ ++ if (addr == -EINVAL && vma_entry->pgoff != 0) { ++ pr_info("try mmap with offset 0\n"); ++ addr = sys_mmap(decode_pointer(vma_entry->start), ++ vma_entry_len(vma_entry), ++ prot, flags, ++ vma_entry->fd, ++ 0); ++ } + + if ((vma_entry->fd != -1) && (vma_entry->status & VMA_CLOSE)) + sys_close(vma_entry->fd); +@@ -1979,7 +1994,8 @@ long __export_restore_task(struct task_restore_args *args) + if (!vma_entry->has_madv || !vma_entry->madv) + continue; + +- if (vma_entry_is(vma_entry, VMA_AREA_ANON_INODE)) ++ if (vma_entry_is(vma_entry, VMA_AREA_ANON_INODE) || ++ vma_entry_is(vma_entry, VMA_AREA_DEV_SHARE)) + continue; + + for (m = 0; m < sizeof(vma_entry->madv) * 8; m++) { +diff --git a/criu/proc_parse.c b/criu/proc_parse.c +index d13589c..282a2e9 100644 +--- a/criu/proc_parse.c ++++ b/criu/proc_parse.c +@@ -552,6 +552,35 @@ static inline int handle_vvar_vma(struct vma_area *vma) + return 0; + } + ++static bool is_sysfs_resource(const char *path) ++{ ++ char *sub = NULL; ++ const char *prefix = "resource"; ++ const char *suffix = "_wc"; ++ ++ if (strstr(path, "devices/") == NULL) ++ return false; ++ ++ sub = rindex(path, '/'); ++ if (sub == NULL) ++ return false; ++ ++ sub += 1; ++ if (strncmp(sub, prefix, strlen(prefix)) != 0) ++ return false; ++ ++ sub += strlen(prefix); ++ while (*sub != '\0' && (*sub >= '0' && *sub <= '9')) ++ sub += 1; ++ ++ if (*sub == '\0') ++ return true; ++ if (!strcmp(sub, suffix)) ++ return true; ++ else ++ return false; ++} ++ + static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_path, DIR *map_files_dir, + struct vma_file_info *vfi, struct vma_file_info *prev_vfi, int *vm_file_fd) + { +@@ -571,6 +600,9 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_pat + goto err; + } else if (!strcmp(file_path, "[heap]")) { + vma_area->e->status |= VMA_AREA_REGULAR | VMA_AREA_HEAP; ++ } else if (is_sysfs_resource(file_path)) { ++ pr_info("find sys device module share memory\n"); ++ 
vma_area->e->status |= VMA_AREA_REGULAR | VMA_AREA_DEV_SHARE; + } else { + vma_area->e->status = VMA_AREA_REGULAR; + } +-- +2.34.1 + diff --git a/0045-add-reuse-file-method-for-recover-deleted-file-state.patch b/0045-add-reuse-file-method-for-recover-deleted-file-state.patch new file mode 100644 index 0000000..1d8130e --- /dev/null +++ b/0045-add-reuse-file-method-for-recover-deleted-file-state.patch @@ -0,0 +1,244 @@ +From 1328e32ee05c59f7168039211c9d96176ff22791 Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Sat, 14 Aug 2021 16:45:40 +0800 +Subject: [PATCH 45/72] add reuse file method for recover deleted file state + +Orphan inode maybe exist in checkpoint process. Sometimes it can't be +re-linked by `linkat()` syscall, e.g. sysfs. + +Therefore, add reuse file method for recover file state of deleted +files. + +Signed-off-by: Jingxian He +Signed-off-by: fu.lin +--- + criu/Makefile.crtools | 1 + + criu/files-reg.c | 10 ++++-- + criu/files.c | 22 +++++++++++- + criu/include/orphan-inode.h | 16 +++++++++ + criu/orphan-inode.c | 71 +++++++++++++++++++++++++++++++++++++ + 5 files changed, 116 insertions(+), 4 deletions(-) + create mode 100644 criu/include/orphan-inode.h + create mode 100644 criu/orphan-inode.c + +diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools +index 3e522b4..7fee749 100644 +--- a/criu/Makefile.crtools ++++ b/criu/Makefile.crtools +@@ -95,6 +95,7 @@ obj-y += devname.o + obj-y += files-chr.o + obj-y += exit-notify.o + obj-y += reserved-ports.o ++obj-y += orphan-inode.o + obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o + obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o + CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 +diff --git a/criu/files-reg.c b/criu/files-reg.c +index 6dc8745..ed46764 100644 +--- a/criu/files-reg.c ++++ b/criu/files-reg.c +@@ -46,6 +46,7 @@ + #include "external.h" + #include "memfd.h" + #include "files-chr.h" ++#include "orphan-inode.h" + + #include "protobuf.h" + #include "util.h" +@@ -1260,8 +1261,10 @@ static int 
check_path_remap(struct fd_link *link, const struct fd_parms *parms, + */ + + if (errno == ENOENT) { +- link_strip_deleted(link); +- return dump_linked_remap(rpath + 1, plen - 1, ost, lfd, id, nsid); ++ pr_info("Start add no exist file: %s\n", rpath+1); ++ add_reuse_file(id, lfd, dst_pid); ++ need_reuse_flag = O_REUSE; ++ return 0; + } + + pr_perror("Can't stat path"); +@@ -1663,7 +1666,8 @@ ext: + rfe.has_mode = true; + rfe.mode = p->stat.st_mode; + +- if (S_ISREG(p->stat.st_mode) && should_check_size(rfe.flags) && !store_validation_data(&rfe, p, lfd)) ++ if (S_ISREG(p->stat.st_mode) && should_check_size(rfe.flags) ++ && (need_reuse_flag != O_REUSE) && !store_validation_data(&rfe, p, lfd)) + return -1; + + fe.type = FD_TYPES__REG; +diff --git a/criu/files.c b/criu/files.c +index 1c52cf4..e79052e 100644 +--- a/criu/files.c ++++ b/criu/files.c +@@ -50,6 +50,7 @@ + #include "fdstore.h" + #include "bpfmap.h" + #include "files-chr.h" ++#include "orphan-inode.h" + + #include "protobuf.h" + #include "util.h" +@@ -706,6 +707,7 @@ int dump_my_file(int lfd, u32 *id, int *type) + } + + int dst_pid; ++int need_reuse_flag; + + int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, struct parasite_drain_fd *dfds) + { +@@ -743,10 +745,13 @@ int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, s + for (i = 0; i < nr_fds; i++) { + FdinfoEntry e = FDINFO_ENTRY__INIT; + ++ need_reuse_flag = 0; + ret = dump_one_file(item->pid, dfds->fds[i + off], lfds[i], opts + i, ctl, &e, dfds); + if (ret) + break; + ++ e.flags |= need_reuse_flag; ++ pr_info("write fdinfoEntry fd=%d id=%d\n", (&e)->fd, (&e)->id); + ret = pb_write_one(img, &e, PB_FDINFO); + if (ret) + break; +@@ -939,7 +944,8 @@ int collect_fd(int pid, FdinfoEntry *e, struct rst_info *rst_info, bool fake) + { + struct file_desc *fdesc; + +- pr_info("Collect fdinfo pid=%d fd=%d id=%#x\n", pid, e->fd, e->id); ++ pr_info("Collect fdinfo pid=%d fd=%d id=%#x flags: %#x\n", ++ pid, e->fd, 
e->id, e->flags); + + fdesc = find_file_desc(e); + if (fdesc == NULL) { +@@ -1230,6 +1236,7 @@ static int open_fd(struct fdinfo_list_entry *fle) + int new_fd = -1, ret; + struct chrfile_info *ci; + ++ pr_info("open file flags: %#x\n", fle->fe->flags); + flem = file_master(d); + if (fle != flem) { + BUG_ON(fle->stage != FLE_INITIALIZED); +@@ -1251,6 +1258,19 @@ static int open_fd(struct fdinfo_list_entry *fle) + return 0; + } + } ++ } else if (fle->fe->flags & O_REUSE) { ++ pr_info("find reuse file:%d\n", d->id); ++ ret = repair_reuse_file(d->id); ++ if (!ret) { ++ new_fd = get_reuse_file(); ++ pr_info("get reuse file:%d\n", new_fd); ++ if (new_fd <= 0 || setup_and_serve_out(fle, new_fd) < 0) { ++ pr_err("setup reuse file fail\n"); ++ return -1; ++ } ++ fle->stage = FLE_RESTORED; ++ return 0; ++ } + } + + /* +diff --git a/criu/include/orphan-inode.h b/criu/include/orphan-inode.h +new file mode 100644 +index 0000000..bc3b6ae +--- /dev/null ++++ b/criu/include/orphan-inode.h +@@ -0,0 +1,16 @@ ++#ifndef __CRIU_ORPHAN_INODE_H__ ++#define __CRIU_ORPHAN_INODE_H__ ++ ++#define ADD_REUSE_FILE_PATH "/sys/kernel/add_reuse_file" ++#define REPAIR_REUSE_FILE_PATH "/sys/kernel/repair_reuse_file" ++#define REUSE_FILE_PATH "/sys/kernel/reuse_file" ++#define O_REUSE 0100000000 ++ ++extern int dst_pid; ++extern int need_reuse_flag; ++ ++int add_reuse_file(u32 id, int fd, int pid); ++int repair_reuse_file(int id); ++int get_reuse_file(void); ++ ++#endif /* __CRIU_ORPHAN_INODE_H__ */ +diff --git a/criu/orphan-inode.c b/criu/orphan-inode.c +new file mode 100644 +index 0000000..c4e38dc +--- /dev/null ++++ b/criu/orphan-inode.c +@@ -0,0 +1,71 @@ ++#include ++#include ++#include ++#include ++#include ++ ++#include "int.h" ++#include "log.h" ++#include "orphan-inode.h" ++ ++int add_reuse_file(u32 id, int fd, int pid) ++{ ++ int retval; ++ char buf[256] = {0}; ++ ++ retval = snprintf(buf, 256, "%u,%d,%d", id, fd, pid); ++ if (retval <= 0) ++ return -EFAULT; ++ ++ fd = 
open(ADD_REUSE_FILE_PATH, O_WRONLY, 0); ++ if (fd < 0) { ++ pr_err("open file:%s fail\n", ADD_REUSE_FILE_PATH); ++ return fd; ++ } ++ ++ retval = write(fd, buf, strlen(buf)); ++ close(fd); ++ ++ return retval < 0 ? -1 : 0; ++} ++ ++int repair_reuse_file(int id) ++{ ++ int retval, fd; ++ char buf[256] = {0}; ++ ++ retval = snprintf(buf, 256, "%u", id); ++ if (retval <= 0) ++ return -EFAULT; ++ ++ fd = open(REPAIR_REUSE_FILE_PATH, O_WRONLY, 0); ++ if (fd < 0) { ++ pr_err("open file:%s fail\n", REPAIR_REUSE_FILE_PATH); ++ return fd; ++ } ++ retval = write(fd, buf, strlen(buf)); ++ ++ close(fd); ++ return retval < 0 ? -1 : 0; ++} ++ ++int get_reuse_file(void) ++{ ++ int fd; ++ ssize_t count; ++ int retval = -1; ++ char buf[32] = {0}; ++ ++ fd = open(REUSE_FILE_PATH, O_RDONLY , 0); ++ if (fd < 0) { ++ pr_err("open file:%s fail\n", REUSE_FILE_PATH); ++ return fd; ++ } ++ ++ count = read(fd, buf, sizeof(buf)); ++ if (count > 0) ++ retval = atoi(buf); ++ ++ close(fd); ++ return retval; ++} +-- +2.34.1 + diff --git a/0046-sk-fix-share-sockets-repair-problem.patch b/0046-sk-fix-share-sockets-repair-problem.patch new file mode 100644 index 0000000..e1e666a --- /dev/null +++ b/0046-sk-fix-share-sockets-repair-problem.patch @@ -0,0 +1,133 @@ +From 8b1856d5c72c6870c04a87158718d2df62591a6c Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 11 Aug 2021 15:01:27 +0800 +Subject: [PATCH 46/72] sk: fix share sockets repair problem + +Repair off the share sockets after reusing them +to recover the share socket state. 
+ +Signed-off-by: Jingxian He +Signed-off-by: fu.lin +--- + criu/files.c | 33 ++++++++++++++++++++++++++++++++- + criu/sk-inet.c | 7 +++++-- + criu/sk-netlink.c | 5 +++-- + 3 files changed, 40 insertions(+), 5 deletions(-) + +diff --git a/criu/files.c b/criu/files.c +index e79052e..24ed219 100644 +--- a/criu/files.c ++++ b/criu/files.c +@@ -51,6 +51,7 @@ + #include "bpfmap.h" + #include "files-chr.h" + #include "orphan-inode.h" ++#include "sk-inet.h" + + #include "protobuf.h" + #include "util.h" +@@ -1215,7 +1216,7 @@ int setup_and_serve_out(struct fdinfo_list_entry *fle, int new_fd) + if (reopen_fd_as(fle->fe->fd, new_fd)) + return -1; + +- pr_info("*******flags: %d",fle->fe->flags); ++ pr_info("*******flags: %d\n",fle->fe->flags); + if (fcntl(fle->fe->fd, F_SETFD, fle->fe->flags) == -1) { + pr_perror("Unable to set file descriptor flags"); + return -1; +@@ -1229,6 +1230,30 @@ int setup_and_serve_out(struct fdinfo_list_entry *fle, int new_fd) + return 0; + } + ++#define MAX_SHARE_SOCKETS_NUM 25000 ++int repair_share_sockets[MAX_SHARE_SOCKETS_NUM]; ++int repair_share_num; ++ ++int add_repair_share_socket(int fd) ++{ ++ if (repair_share_num >= MAX_SHARE_SOCKETS_NUM) ++ return -1; ++ repair_share_sockets[repair_share_num] = fd; ++ repair_share_num++; ++ return 0; ++} ++ ++void repair_off_share_sockets(void) ++{ ++ int i; ++ ++ for (i = 0; i < repair_share_num; i++) { ++ tcp_repair_off(repair_share_sockets[i]); ++ pr_info("repair off socket:%d\n", repair_share_sockets[i]); ++ } ++ repair_share_num = 0; ++} ++ + static int open_fd(struct fdinfo_list_entry *fle) + { + struct file_desc *d = fle->desc; +@@ -1248,6 +1273,7 @@ static int open_fd(struct fdinfo_list_entry *fle) + + if (d->ops->type == FD_TYPES__INETSK) { + if (check_need_repair(d)) { ++ pr_info("start repair for:%d\n", d->id); + ret = repair_share_socket(d->id); + if (!ret) { + new_fd = get_share_socket(); +@@ -1255,6 +1281,10 @@ static int open_fd(struct fdinfo_list_entry *fle) + if (new_fd <= 0 || 
setup_and_serve_out(fle, new_fd) < 0) + return -1; + fle->stage = FLE_RESTORED; ++ if (add_repair_share_socket(fle->fe->fd)) { ++ pr_perror("add repair share socket fail\n"); ++ return -1; ++ } + return 0; + } + } +@@ -1379,6 +1409,7 @@ static int open_fdinfos(struct pstree_item *me) + wait_fds_event(); + } while (again || progress); + ++ repair_off_share_sockets(); + BUG_ON(!list_empty(list)); + /* + * Fake fles may be used for restore other +diff --git a/criu/sk-inet.c b/criu/sk-inet.c +index c7de793..c0251db 100644 +--- a/criu/sk-inet.c ++++ b/criu/sk-inet.c +@@ -635,8 +635,11 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa + BUG_ON(sk->sd.already_dumped); + + if (check_share_dst_port(sk->dst_port) || check_share_src_port(sk->src_port)) { +- pr_info("Start add share prot:%d src %d\n", sk->dst_port, sk->src_port); +- add_share_socket(id, lfd, dst_pid, sk->src_port); ++ pr_info("Start add share prot:%d-%d dst_pid %d id %d\n", ++ sk->dst_port, sk->src_port, dst_pid, id); ++ ret = add_share_socket(id, lfd, dst_pid, sk->src_port); ++ if (ret) ++ pr_warn("add share socket ret %d\n", ret); + } + + ie.id = id; +diff --git a/criu/sk-netlink.c b/criu/sk-netlink.c +index d4b3b7b..2832060 100644 +--- a/criu/sk-netlink.c ++++ b/criu/sk-netlink.c +@@ -115,9 +115,10 @@ static bool can_dump_netlink_sk(int lfd) + + ret = fd_has_data(lfd); + if (ret == 1) +- pr_err("The socket has data to read\n"); ++ pr_warn("The socket has data to read\n"); + +- return ret == 0; ++ /* ignore netlink socket data */ ++ return true; + } + + static int dump_one_netlink_fd(int lfd, u32 id, const struct fd_parms *p) +-- +2.34.1 + diff --git a/0047-mm-add-clear-pin-mem-and-init-page-map-option.patch b/0047-mm-add-clear-pin-mem-and-init-page-map-option.patch new file mode 100644 index 0000000..3ea17c5 --- /dev/null +++ b/0047-mm-add-clear-pin-mem-and-init-page-map-option.patch @@ -0,0 +1,107 @@ +From 1cb92fe0a930cf862f8a3ecd9a812d5b2e3aea60 Mon Sep 17 00:00:00 2001 
+From: root +Date: Wed, 8 Sep 2021 08:23:11 +0000 +Subject: [PATCH 47/72] mm: add clear pin mem and init page map option + +Add 'clear-pin-mem' option for clearing pin memory data, +and 'init-page-map' option for initializationing buffer for +reading page map info. + +Signed-off-by: Jingxian He +Signed-off-by: fu.lin +--- + criu/crtools.c | 13 ++++++++++++- + criu/include/pin-mem.h | 4 ++++ + criu/pin-mem.c | 20 ++++++++++++++++++++ + 3 files changed, 36 insertions(+), 1 deletion(-) + +diff --git a/criu/crtools.c b/criu/crtools.c +index ae858e8..cc0a18f 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -43,6 +43,7 @@ + #include "fault-injection.h" + #include "proc_parse.h" + #include "kerndat.h" ++#include "pin-mem.h" + + #include "setproctitle.h" + #include "sysctl.h" +@@ -169,6 +170,14 @@ int main(int argc, char *argv[], char *envp[]) + goto usage; + } + ++ if (!strcmp(argv[optind], "clear-pin-memory")) { ++ return clear_pin_mem(0); ++ } ++ ++ if (!strcmp(argv[optind], "init-pagemap-read")) { ++ return init_pagemap_read(0); ++ } ++ + /* We must not open imgs dir, if service is called */ + if (strcmp(argv[optind], "service")) { + ret = open_image_dir(opts.imgs_dir, image_dir_mode(argv, optind)); +@@ -320,7 +329,9 @@ usage: + " service launch service\n" + " dedup remove duplicates in memory dump\n" + " cpuinfo dump writes cpu information into image file\n" +- " cpuinfo check validates cpu information read from image file\n"); ++ " cpuinfo check validates cpu information read from image file\n" ++ " clear-pin-memory clear pin memory manage data\n" ++ " init-pagemap-read init data buffer for reading page map info\n"); + + if (usage_error) { + pr_msg("\nTry -h|--help for more info\n"); +diff --git a/criu/include/pin-mem.h b/criu/include/pin-mem.h +index 2b54996..b28ef3d 100644 +--- a/criu/include/pin-mem.h ++++ b/criu/include/pin-mem.h +@@ -39,6 +39,9 @@ struct pin_mem_area_set { + #define _SET_FORK_PID 8 + #define SET_FORK_PID _IOW(PIN_MEM_MAGIC, _SET_FORK_PID, 
int) + ++#define _INIT_PAGEMAP_READ 5 ++#define INIT_PAGEMAP_READ _IOW(PIN_MEM_MAGIC, _INIT_PAGEMAP_READ, int) ++ + #endif /* __has_include("linux/pin_memory.h") */ + + #define PIN_MEM_FILE "/dev/pinmem" +@@ -49,5 +52,6 @@ int pin_vmae(VmaEntry *vmae, struct pstree_item *item); + int dump_task_special_pages(int pid); + int restore_task_special_pages(int pid); + int clear_pin_mem(int pid); ++int init_pagemap_read(int para); + + #endif /* __CRIU_PIN_MEM_H__ */ +diff --git a/criu/pin-mem.c b/criu/pin-mem.c +index b18db97..96ca2c5 100644 +--- a/criu/pin-mem.c ++++ b/criu/pin-mem.c +@@ -144,3 +144,23 @@ int clear_pin_mem(int pid) + close(fd); + return ret; + } ++ ++int init_pagemap_read(int para) ++{ ++ int fd, ret; ++ ++ fd = open(PIN_MEM_FILE, O_RDWR, 0); ++ if (fd < 0) { ++ pr_warn("error open file: %s\n", PIN_MEM_FILE); ++ return -1; ++ } ++ ++ ret = ioctl(fd, INIT_PAGEMAP_READ, (unsigned long) &para); ++ if (ret < 0) { ++ pr_warn("Init pagemap read fail, errno: %s\n", strerror(errno)); ++ } ++ ++ close(fd); ++ return ret; ++} ++ +-- +2.34.1 + diff --git a/0048-fds-fix-fds-list-restore.patch b/0048-fds-fix-fds-list-restore.patch new file mode 100644 index 0000000..11de7f4 --- /dev/null +++ b/0048-fds-fix-fds-list-restore.patch @@ -0,0 +1,37 @@ +From 803ee02298e0a71b07cf611eee68e23f702d259e Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Thu, 16 Sep 2021 13:50:46 +0000 +Subject: [PATCH 48/72] fds: fix fds list restore + +When there exist multi processes need to dump, the child process may +have the same fds as parent process. During the restore processing, +criu choose the process which has the min pid value to be the master +process to recover fds. However, choosing the parent process as the +master process is more suitable.
+ +Signed-off-by: Jingxian He +--- + criu/files.c | 7 +------ + 1 file changed, 1 insertion(+), 6 deletions(-) + +diff --git a/criu/files.c b/criu/files.c +index 24ed219..6d8b812 100644 +--- a/criu/files.c ++++ b/criu/files.c +@@ -906,12 +906,7 @@ static struct fdinfo_list_entry *alloc_fle(int pid, FdinfoEntry *fe) + + static void __collect_desc_fle(struct fdinfo_list_entry *new_le, struct file_desc *fdesc) + { +- struct fdinfo_list_entry *le; +- +- list_for_each_entry_reverse(le, &fdesc->fd_info_head, desc_list) +- if (pid_rst_prio_eq(le->pid, new_le->pid)) +- break; +- list_add(&new_le->desc_list, &le->desc_list); ++ list_add(&new_le->desc_list, &fdesc->fd_info_head); + } + + static void collect_desc_fle(struct fdinfo_list_entry *new_le, struct file_desc *fdesc, bool force_master) +-- +2.34.1 + diff --git a/0049-log-print-error-log-to-dev-kmsg.patch b/0049-log-print-error-log-to-dev-kmsg.patch new file mode 100644 index 0000000..688bd88 --- /dev/null +++ b/0049-log-print-error-log-to-dev-kmsg.patch @@ -0,0 +1,88 @@ +From bec1445fd5dcfffb24918d725163f3be35f8b634 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Tue, 19 Oct 2021 20:53:19 +0800 +Subject: [PATCH 49/72] log: print error log to /dev/kmsg + +The criu log can't be flushed to disk when OS crash in storage +environment, therefore, output high level msg to /dev/kmsg. 
+ +Signed-off-by: fu.lin +--- + criu/Makefile.crtools | 1 + + criu/include/log.h | 3 +++ + criu/kmsg.c | 16 ++++++++++++++++ + criu/log.c | 4 ++++ + 4 files changed, 24 insertions(+) + create mode 100644 criu/kmsg.c + +diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools +index 7fee749..3bb7c19 100644 +--- a/criu/Makefile.crtools ++++ b/criu/Makefile.crtools +@@ -96,6 +96,7 @@ obj-y += files-chr.o + obj-y += exit-notify.o + obj-y += reserved-ports.o + obj-y += orphan-inode.o ++obj-y += kmsg.o + obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o + obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o + CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 +diff --git a/criu/include/log.h b/criu/include/log.h +index 85e6dc2..aafea95 100644 +--- a/criu/include/log.h ++++ b/criu/include/log.h +@@ -2,6 +2,7 @@ + #define __CR_LOG_H__ + + #include ++#include + + #ifndef CR_NOGLIBC + +@@ -62,4 +63,6 @@ void flush_early_log_buffer(int fd); + + #endif /* CR_NOGLIBC */ + ++void write_kmsg(const void *buf, size_t count); ++ + #endif /* __CR_LOG_H__ */ +diff --git a/criu/kmsg.c b/criu/kmsg.c +new file mode 100644 +index 0000000..c956dfb +--- /dev/null ++++ b/criu/kmsg.c +@@ -0,0 +1,16 @@ ++#include ++#include ++ ++#define SYSLOG_DEV "/dev/kmsg" ++ ++void write_kmsg(const void *buf, size_t count) ++{ ++ int fd; ++ ++ fd = open(SYSLOG_DEV, O_CLOEXEC | O_WRONLY); ++ if (fd < 0) ++ return; ++ ++ write(fd, buf, count); ++ close(fd); ++} +diff --git a/criu/log.c b/criu/log.c +index c4ce90e..ba208f7 100644 +--- a/criu/log.c ++++ b/criu/log.c +@@ -373,6 +373,10 @@ static void vprint_on_level(unsigned int loglevel, const char *format, va_list p + size += buf_off; + + while (off < size) { ++ if (loglevel <= LOG_WARN) { ++ write_kmsg(buffer + off, size - off); ++ } ++ + ret = write(fd, buffer + off, size - off); + if (ret <= 0) + break; +-- +2.34.1 + diff --git a/0050-unix-sk-improve-dgram-robustness.patch b/0050-unix-sk-improve-dgram-robustness.patch new file mode 100644 index 0000000..b201702 --- /dev/null 
+++ b/0050-unix-sk-improve-dgram-robustness.patch @@ -0,0 +1,83 @@ +From 6dde331da8e28e129010aee391e7ef3d757490cd Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Tue, 26 Oct 2021 11:13:27 +0800 +Subject: [PATCH 50/72] unix sk: improve dgram robustness + +We should try our best to ensure the success of criu. As for unix dgram +socket, criu uses re-connect rather than repair, unlike unix stream +sockets. Therefore, this patch does the following things: + +- detect unix dgram unix sock file when criu dumps unix dgram socket +- add the fault tolerance of unix dgram socket connecting (focus on the + condition of `/dev/log` disappearance when rsyslog restarts) + +Conflict:NA +Reference:https://gitee.com/src-openeuler/criu/pulls/21 +Signed-off-by: fu.lin +--- + criu/sk-unix.c | 35 +++++++++++++++++++++++++++++++++-- + 1 file changed, 33 insertions(+), 2 deletions(-) + +diff --git a/criu/sk-unix.c b/criu/sk-unix.c +index 86bfa18..de75425 100644 +--- a/criu/sk-unix.c ++++ b/criu/sk-unix.c +@@ -11,6 +11,7 @@ + #include + #include + #include ++#include + + #include "libnetlink.h" + #include "cr_options.h" +@@ -1435,6 +1436,33 @@ err: + return -1; + } + ++/* ++ * Sometimes, `/dev/log` will disappear because of the restart of rsyslog when ++ * rotating, criu try to connect `/dev/log` will report error at this time. We ++ * should try our best to ensure the success of criu restoration. Therefore, ++ * retry three times here. 
++ */ ++static int unix_dgram_reconnect(int fd, struct sockaddr_un *addr, int len) ++{ ++ int retval = 0; ++ struct timespec tim = { ++ .tv_sec = 0, ++ .tv_nsec = 5e+8, ++ }; ++ ++ for (int i = 0; i < 3; i++) { ++ nanosleep(&tim, NULL); ++ pr_warn("Can't connect unix socket(%s), %d retry\n", ++ addr->sun_path, i); ++ retval = connect(fd, (struct sockaddr *)addr, ++ sizeof(addr->sun_family) + len); ++ if (retval == 0) ++ break; ++ } ++ ++ return retval; ++} ++ + static int post_open_standalone(struct file_desc *d, int fd) + { + int fdstore_fd = -1, procfs_self_dir = -1, len; +@@ -1521,8 +1549,11 @@ static int post_open_standalone(struct file_desc *d, int fd) + goto err_revert_and_exit; + } + } else if (connect(fd, (struct sockaddr *)&addr, sizeof(addr.sun_family) + len) < 0) { +- pr_perror("Can't connect %d socket", ui->ue->ino); +- goto err_revert_and_exit; ++ if (ui->ue->type != SOCK_DGRAM || errno != ENOENT ++ || unix_dgram_reconnect(fd, &addr, len) != 0) { ++ pr_perror("Can't connect %d socket", ui->ue->ino); ++ goto err_revert_and_exit; ++ } + } + mutex_unlock(mutex_ghost); + +-- +2.34.1 + diff --git a/0051-sk-ignore-the-bind-error-for-icmp-socket.patch b/0051-sk-ignore-the-bind-error-for-icmp-socket.patch new file mode 100644 index 0000000..25a071f --- /dev/null +++ b/0051-sk-ignore-the-bind-error-for-icmp-socket.patch @@ -0,0 +1,46 @@ +From a7d5401953c548c9479c386b52fffcba6b49c0e3 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Wed, 27 Oct 2021 11:57:43 +0800 +Subject: [PATCH 51/72] sk: ignore the bind error for icmp socket + +Conflict:NA +Reference:https://gitee.com/src-openeuler/criu/pulls/21 +Signed-off-by: fu.lin +--- + criu/sk-inet.c | 20 ++++++++++++++++++-- + 1 file changed, 18 insertions(+), 2 deletions(-) + +diff --git a/criu/sk-inet.c b/criu/sk-inet.c +index c0251db..96c2d09 100644 +--- a/criu/sk-inet.c ++++ b/criu/sk-inet.c +@@ -1160,8 +1160,24 @@ int inet_bind(int sk, struct inet_sk_info *ii) + } + + if (bind(sk, (struct sockaddr *)&addr, 
addr_size) == -1) { +- pr_perror("Can't bind inet socket (id %d)", ii->ie->id); +- return -1; ++ InetSkEntry *ie = ii->ie; ++ ++ /* ++ * Sometimes the ping-like program restoration may appear ++ * `bind()` error when it is specified the address. In view ++ * of the principle that we should try our best to restore the ++ * process, and ping-like program works abnormal can tolerate, ++ * just warn here instead of report error. ++ */ ++ if (ie->proto == IPPROTO_ICMP || ie->proto == IPPROTO_ICMPV6) { ++ pr_warn("Can't bind inet socket (id %d) proto %s\n", ++ ie->id, ++ ie->proto == IPPROTO_ICMP ? ++ "IPPROTO_ICMP" : "IPPROTO_ICMPV6"); ++ } else { ++ pr_perror("Can't bind inet socket (id %d)", ii->ie->id); ++ return -1; ++ } + } + + if (rst_freebind) { +-- +2.34.1 + diff --git a/0052-optimization-parallel-collecting-vmas.patch b/0052-optimization-parallel-collecting-vmas.patch new file mode 100644 index 0000000..3cdf13c --- /dev/null +++ b/0052-optimization-parallel-collecting-vmas.patch @@ -0,0 +1,505 @@ +From ade879e6ccdc4c74a1c153f0750d2cd87ec8a4ec Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Tue, 30 Nov 2021 10:26:10 +0800 +Subject: [PATCH 52/72] optimization: parallel collecting vmas + +In order to improve criu dump performance, run the vmas collecting +operation in parallel with the other collecting operations. + +In order to prevent the concurrency problem by `find_unused_fd`, only +the main root task is collected in parallel. + +Usage: + criu --parallel + +Note: + Ensure criu can use multi-core, otherwise the performance will +deteriorate. 
+ +Signed-off-by: fu.lin +Signed-off-by: hewenliang +Signed-off-by: Jingxian He +--- + criu/Makefile.crtools | 1 + + criu/Makefile.packages | 1 + + criu/config.c | 1 + + criu/cr-dump.c | 53 +++++++++++----- + criu/crtools.c | 1 + + criu/include/cr_options.h | 1 + + criu/include/pstree.h | 3 + + criu/include/taskqueue.h | 50 +++++++++++++++ + criu/namespaces.c | 9 ++- + criu/taskqueue.c | 124 ++++++++++++++++++++++++++++++++++++++ + 10 files changed, 228 insertions(+), 16 deletions(-) + create mode 100644 criu/include/taskqueue.h + create mode 100644 criu/taskqueue.c + +diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools +index 3bb7c19..2ad0207 100644 +--- a/criu/Makefile.crtools ++++ b/criu/Makefile.crtools +@@ -97,6 +97,7 @@ obj-y += exit-notify.o + obj-y += reserved-ports.o + obj-y += orphan-inode.o + obj-y += kmsg.o ++obj-y += taskqueue.o + obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o + obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o + CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 +diff --git a/criu/Makefile.packages b/criu/Makefile.packages +index 13c346f..851489b 100644 +--- a/criu/Makefile.packages ++++ b/criu/Makefile.packages +@@ -31,6 +31,7 @@ REQ-RPM-PKG-TEST-NAMES += $(PYTHON)-pyyaml + endif + + export LIBS += -lprotobuf-c -ldl -lnl-3 -lsoccr -Lsoccr/ -lnet ++export LIBS += -lpthread + + check-packages-failed: + $(warning Can not find some of the required libraries) +diff --git a/criu/config.c b/criu/config.c +index ae5f81e..fdbc5eb 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -709,6 +709,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, + { "share-dst-ports", required_argument, 0, 2000 }, + { "share-src-ports", required_argument, 0, 2001 }, + { "reserve-ports", required_argument, 0, 'P' }, ++ BOOL_OPT("parallel", &opts.parallel), + {}, + }; + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index a8ab61e..ee826c0 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -17,6 +17,7 @@ + + #include + #include 
++#include + + #include "types.h" + #include "protobuf.h" +@@ -90,6 +91,7 @@ + #include "notifier.h" + #include "files-chr.h" + #include "reserved-ports.h" ++#include "taskqueue.h" + + /* + * Architectures can overwrite this function to restore register sets that +@@ -424,7 +426,7 @@ static int dump_pid_misc(pid_t pid, TaskCoreEntry *tc) + return 0; + } + +-static int dump_filemap(struct vma_area *vma_area, int fd) ++int dump_filemap(struct vma_area *vma_area, int fd) + { + struct fd_parms p = FD_PARMS_INIT; + VmaEntry *vma = vma_area->e; +@@ -1504,7 +1506,7 @@ err_cure: + static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + { + pid_t pid = item->pid->real; +- struct vm_area_list vmas; ++ struct vm_area_list *vmas = NULL; + struct parasite_ctl *parasite_ctl; + int ret, exit_code = -1; + struct parasite_dump_misc misc; +@@ -1513,8 +1515,6 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + struct proc_posix_timers_stat proc_args; + struct mem_dump_ctl mdc; + +- vm_area_list_init(&vmas); +- + pr_info("========================================\n"); + pr_info("Dumping task (pid: %d)\n", pid); + pr_info("========================================\n"); +@@ -1525,12 +1525,23 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + */ + return 0; + ++ if (!opts.parallel || root_item->pid->real != item->pid->real ) { ++ vmas = xmalloc(sizeof(struct vm_area_list)); ++ if (vmas == NULL) { ++ pr_err("xmalloc no memory\n"); ++ return -1; ++ } ++ vm_area_list_init(vmas); ++ } else ++ vmas = item->maps_info.vmas; ++ + pr_info("Obtaining task stat ... \n"); + ret = parse_pid_stat(pid, &pps_buf); + if (ret < 0) + goto err; + +- ret = collect_mappings(pid, &vmas, dump_filemap); ++ ret = (opts.parallel && root_item->pid->real == item->pid->real) ? 
++ 0 : collect_mappings(pid, vmas, dump_filemap); + if (ret) { + pr_err("Collect mappings (pid: %d) failed with %d\n", pid, ret); + goto err; +@@ -1570,7 +1581,10 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + goto err; + } + +- parasite_ctl = parasite_infect_seized(pid, item, &vmas); ++ if (opts.parallel && end_collect_mappings_thread(item)) ++ goto err; ++ ++ parasite_ctl = parasite_infect_seized(pid, item, vmas); + if (!parasite_ctl) { + pr_err("Can't infect (pid: %d) with parasite\n", pid); + goto err; +@@ -1600,13 +1614,13 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + goto err_cure_imgset; + } + +- ret = parasite_fixup_vdso(parasite_ctl, pid, &vmas); ++ ret = parasite_fixup_vdso(parasite_ctl, pid, vmas); + if (ret) { + pr_err("Can't fixup vdso VMAs (pid: %d)\n", pid); + goto err_cure_imgset; + } + +- ret = parasite_collect_aios(parasite_ctl, &vmas); /* FIXME -- merge with above */ ++ ret = parasite_collect_aios(parasite_ctl, vmas); /* FIXME -- merge with above */ + if (ret) { + pr_err("Failed to check aio rings (pid: %d)\n", pid); + goto err_cure_imgset; +@@ -1658,7 +1672,7 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + mdc.stat = &pps_buf; + mdc.parent_ie = parent_ie; + +- ret = parasite_dump_pages_seized(item, &vmas, &mdc, parasite_ctl); ++ ret = parasite_dump_pages_seized(item, vmas, &mdc, parasite_ctl); + if (ret) + goto err_cure; + +@@ -1719,7 +1733,7 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + goto err; + } + +- ret = dump_task_mm(pid, &pps_buf, &misc, &vmas, cr_imgset); ++ ret = dump_task_mm(pid, &pps_buf, &misc, vmas, cr_imgset); + if (ret) { + pr_err("Dump mappings (pid: %d) failed with %d\n", pid, ret); + goto err; +@@ -1735,7 +1749,8 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + exit_code = 0; + err: + close_pid_proc(); +- free_mappings(&vmas); ++ free_mappings(vmas); 
++ free(vmas); + xfree(dfds); + return exit_code; + +@@ -1893,6 +1908,9 @@ int cr_pre_dump_tasks(pid_t pid) + if (opts.dump_char_dev && parse_devname() < 0) + goto err; + ++ if (opts.parallel && init_parallel_env() != 0) ++ goto err; ++ + root_item = alloc_pstree_item(); + if (!root_item) + goto err; +@@ -2107,6 +2125,13 @@ static int cr_dump_finish(int ret) + write_stats(DUMP_STATS); + pr_info("Dumping finished successfully\n"); + } ++ ++ /* ++ * Don't care threads' status and ignore unfree resources, use ++ * `exit_group()` to ensure exit all threads. ++ */ ++ syscall(SYS_exit_group, post_dump_ret ? : (ret != 0)); ++ + return post_dump_ret ?: (ret != 0); + } + +@@ -2203,13 +2228,13 @@ int cr_dump_tasks(pid_t pid) + if (collect_file_locks()) + goto err; + +- if (collect_namespaces(true) < 0) +- goto err; +- + glob_imgset = cr_glob_imgset_open(O_DUMP); + if (!glob_imgset) + goto err; + ++ if (collect_namespaces(true) < 0) ++ goto err; ++ + if (seccomp_collect_dump_filters() < 0) + goto err; + +diff --git a/criu/crtools.c b/criu/crtools.c +index cc0a18f..c20b3b7 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -472,6 +472,7 @@ usage: + " --weak-file-check Allow file size and mod larger than dumping value\n" + " --file-locks-repair Use repair mode to dump and restore file locks\n" + " --reserve-ports Reserve src ports in kernel\n" ++ " --parallel Collect smaps parallel to accellrate dumping speed\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 3b61c6b..6478d4d 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -201,6 +201,7 @@ struct cr_options { + char *share_dst_ports; + char *share_src_ports; + int reserve_ports; ++ int parallel; + }; + + extern struct cr_options opts; +diff --git a/criu/include/pstree.h b/criu/include/pstree.h +index 97bef11..87e4c47 100644 +--- a/criu/include/pstree.h ++++ 
b/criu/include/pstree.h +@@ -1,6 +1,8 @@ + #ifndef __CR_PSTREE_H__ + #define __CR_PSTREE_H__ + ++#include "taskqueue.h" ++ + #include "common/list.h" + #include "common/lock.h" + #include "pid.h" +@@ -31,6 +33,7 @@ struct pstree_item { + futex_t task_st; + unsigned long task_st_le_bits; + }; ++ struct mappings_info maps_info; + }; + + static inline pid_t vpid(const struct pstree_item *i) +diff --git a/criu/include/taskqueue.h b/criu/include/taskqueue.h +new file mode 100644 +index 0000000..16f9e3d +--- /dev/null ++++ b/criu/include/taskqueue.h +@@ -0,0 +1,50 @@ ++#ifndef __CR_TASKQUEUE_H__ ++#define __CR_TASKQUEUE_H__ ++ ++#include ++#include ++#include ++ ++#include "vma.h" ++#include "pstree.h" ++ ++#include "common/list.h" ++ ++#define TASKQUEUE_HASH_SIZE 8 ++ ++struct taskqueue { ++ pthread_t task; ++ void *(*routine)(void *); ++ void *arg; ++ int result; ++}; ++#define queue_task queue.task ++#define queue_routine queue.routine ++#define queue_arg queue.arg ++#define queue_result queue.result ++ ++int init_parallel_env(void); ++ ++static inline int taskqueue_create(struct taskqueue *queue) ++{ ++ return pthread_create(&queue->task, NULL, queue->routine, queue->arg); ++} ++ ++static inline int taskqueue_join(struct taskqueue *queue) ++{ ++ return pthread_join(queue->task, NULL); ++} ++ ++/* parallel collect smaps */ ++struct mappings_info { ++ struct hlist_node hash; ++ pid_t pid; ++ struct vm_area_list *vmas; ++ dump_filemap_t dump_file; ++ struct taskqueue queue; ++}; ++ ++int start_collect_mappings_thread(void); ++int end_collect_mappings_thread(struct pstree_item *item); ++ ++#endif /* __CR_TASKQUEUE_H__ */ +diff --git a/criu/namespaces.c b/criu/namespaces.c +index 7fa5868..05e6732 100644 +--- a/criu/namespaces.c ++++ b/criu/namespaces.c +@@ -28,6 +28,7 @@ + #include "cgroup.h" + #include "fdstore.h" + #include "kerndat.h" ++#include "taskqueue.h" + + #include "protobuf.h" + #include "util.h" +@@ -1607,11 +1608,15 @@ int collect_namespaces(bool for_dump) + 
{ + int ret; + +- ret = collect_user_namespaces(for_dump); ++ ret = collect_mnt_namespaces(for_dump); + if (ret < 0) + return ret; + +- ret = collect_mnt_namespaces(for_dump); ++ /* need mnt info provided by `mntinfo` */ ++ if (opts.parallel && start_collect_mappings_thread()) ++ return -1; ++ ++ ret = collect_user_namespaces(for_dump); + if (ret < 0) + return ret; + +diff --git a/criu/taskqueue.c b/criu/taskqueue.c +new file mode 100644 +index 0000000..1196a5e +--- /dev/null ++++ b/criu/taskqueue.c +@@ -0,0 +1,124 @@ ++/* ++ * Target: ++ * parallel dump process ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include "pstree.h" ++#include "log.h" ++#include "taskqueue.h" ++ ++/* ++ * Sometimes, only one cpu can be used which is bad for parallel routine. ++ * Therefore, set cpu affinity for criu routine. ++ */ ++static int set_cpuaffinity(void) ++{ ++ cpu_set_t *set; ++ int num_cpus = get_nprocs_conf(); ++ size_t cpusetsize = CPU_ALLOC_SIZE(num_cpus); ++ int retval; ++ ++ set = CPU_ALLOC(num_cpus); ++ memset(set, 0xff, cpusetsize); ++ ++ retval = sched_setaffinity(getpid(), cpusetsize, set); ++ if (retval != 0) ++ pr_err("sched_setaffinity failed: %s\n", strerror(errno)); ++ ++ CPU_FREE(set); ++ ++ return retval; ++} ++ ++int init_parallel_env(void) ++{ ++ return set_cpuaffinity(); ++} ++ ++static void *collect_mappings_routine(void *_arg) ++{ ++ struct mappings_info *info = _arg; ++ ++ info->queue_result = collect_mappings(info->pid, info->vmas, info->dump_file); ++ return NULL; ++} ++ ++int dump_filemap(struct vma_area *vma_area, int fd); /* defined in criu/cr-dump.c */ ++ ++int start_collect_mappings_thread(void) ++{ ++ struct pstree_item *pi; ++ struct mappings_info *info; ++ ++ for_each_pstree_item(pi) { ++ /* disable parallel collect for non-root item because of the ++ * concurrence. 
++ */ ++ if (pi->pid->real != root_item->pid->real) ++ continue; ++ ++ info = &pi->maps_info; ++ ++ info->vmas = xmalloc(sizeof(struct vm_area_list)); ++ if (info->vmas == NULL) { ++ pr_err("xzalloc vmas no memory\n"); ++ return -1; ++ } ++ vm_area_list_init(info->vmas); ++ ++ info->pid = pi->pid->real; ++ info->dump_file = dump_filemap; ++ info->queue_routine = collect_mappings_routine; ++ info->queue_arg = info; ++ ++ pr_info("Start thread to collect %d mappings\n", info->pid); ++ ++ if (taskqueue_create(&info->queue) < 0) { ++ pr_err("parallel_collect_mappings failed: %s\n", strerror(errno)); ++ free(info->vmas); ++ /* ++ * Don't care other threads status, use `exit_group()` ++ * to ensure all threads exit. ++ */ ++ return -1; ++ } ++ } ++ ++ return 0; ++} ++ ++int end_collect_mappings_thread(struct pstree_item *item) ++{ ++ struct mappings_info *info = &item->maps_info; ++ int retval; ++ ++ /* disable parallel collect for non-root item because of the ++ * concurrence. ++ */ ++ if (root_item->pid->real != item->pid->real) ++ return 0; ++ ++ retval = taskqueue_join(&info->queue); ++ if (retval != 0 || info->queue_result != 0) { ++ pr_err("taskqueue_join failed, retval %d(errno %d: %s)," ++ " queue_result: %d\n", ++ retval, ++ retval == 0 ? 0 : errno, ++ retval == 0 ? "nil" : strerror(errno), ++ info->queue_result); ++ retval = -1; ++ } ++ ++ pr_info("End thread to collect %d mappings\n", info->pid); ++ ++ /* ++ * Don't care other threads status, use `exit_group()` to ensure all ++ * threads exit. 
++ */ ++ return retval; ++} +-- +2.34.1 + diff --git a/0053-mm-add-exec-file-mapping-pin-method.patch b/0053-mm-add-exec-file-mapping-pin-method.patch new file mode 100644 index 0000000..b6915d2 --- /dev/null +++ b/0053-mm-add-exec-file-mapping-pin-method.patch @@ -0,0 +1,120 @@ +From 5acbfc773177797d954645e40ba8f7ed94a55d60 Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Tue, 30 Nov 2021 11:38:18 +0800 +Subject: [PATCH 53/72] mm: add exec file mapping pin method + +In order to improve criu dump and restore performance, +enable pin method for exec file mapping. + +Signed-off-by: Jingxian He +--- + criu/config.c | 4 ++++ + criu/crtools.c | 1 + + criu/include/cr_options.h | 1 + + criu/mem.c | 12 +++++++++++- + criu/pin-mem.c | 4 ++++ + 5 files changed, 21 insertions(+), 1 deletion(-) + +diff --git a/criu/config.c b/criu/config.c +index fdbc5eb..c0358e5 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -710,6 +710,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, + { "share-src-ports", required_argument, 0, 2001 }, + { "reserve-ports", required_argument, 0, 'P' }, + BOOL_OPT("parallel", &opts.parallel), ++ { "exec-pin-start", required_argument, 0, 2002 }, + {}, + }; + +@@ -1051,6 +1052,9 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, + case 2001: + SET_CHAR_OPTS(share_src_ports, optarg); + break; ++ case 2002: ++ opts.exec_pin_start = atoi(optarg); ++ break; + case 'V': + pr_msg("Version: %s\n", CRIU_VERSION); + if (strcmp(CRIU_GITID, "0")) +diff --git a/criu/crtools.c b/criu/crtools.c +index c20b3b7..40e2d51 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -473,6 +473,7 @@ usage: + " --file-locks-repair Use repair mode to dump and restore file locks\n" + " --reserve-ports Reserve src ports in kernel\n" + " --parallel Collect smaps parallel to accellrate dumping speed\n" ++ " --exec-pin-start Exec file map's pin start index\n" + "\n" + "Check options:\n" + " Without options, \"criu 
check\" checks availability of absolutely required\n" +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 6478d4d..a64e977 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -202,6 +202,7 @@ struct cr_options { + char *share_src_ports; + int reserve_ports; + int parallel; ++ int exec_pin_start; + }; + + extern struct cr_options opts; +diff --git a/criu/mem.c b/criu/mem.c +index b955d66..ccb6ae6 100644 +--- a/criu/mem.c ++++ b/criu/mem.c +@@ -448,6 +448,7 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasit + int possible_pid_reuse = 0; + bool has_parent; + int parent_predump_mode = -1; ++ int dump_iov; + + pr_info("\n"); + pr_info("Dumping pages (type: %d pid: %d)\n", CR_FD_PAGES, item->pid->real); +@@ -521,9 +522,18 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasit + if (mdc->parent_ie) + parent_predump_mode = mdc->parent_ie->pre_dump_mode; + ++ dump_iov = 0; + list_for_each_entry(vma_area, &vma_area_list->h, list) { + if (opts.pin_memory && should_pin_vmae(vma_area->e)) { +- continue; ++ if (opts.exec_pin_start ++ && vma_entry_is(vma_area->e, VMA_FILE_PRIVATE) ++ && ((vma_area->e->prot & PROT_WRITE) ++ || !(vma_area->e->prot & PROT_EXEC))) { ++ dump_iov += 1; ++ if (dump_iov > opts.exec_pin_start + 1) ++ continue; ++ } else ++ continue; + } + + if (vma_entry_is(vma_area->e, VMA_AREA_ANON_INODE)) +diff --git a/criu/pin-mem.c b/criu/pin-mem.c +index 96ca2c5..686217f 100644 +--- a/criu/pin-mem.c ++++ b/criu/pin-mem.c +@@ -2,6 +2,7 @@ + #include + #include + ++#include "cr_options.h" + #include "pstree.h" + #include "mem.h" + #include "vma.h" +@@ -30,6 +31,9 @@ bool should_pin_vmae(VmaEntry *vmae) + if (vma_entry_is(vmae, VMA_ANON_PRIVATE)) + return true; + ++ if (opts.exec_pin_start && vma_entry_is(vmae, VMA_FILE_PRIVATE)) ++ return true; ++ + return false; + } + +-- +2.34.1 + diff --git a/0054-ptrace-trace-specific-syscall.patch 
b/0054-ptrace-trace-specific-syscall.patch new file mode 100644 index 0000000..b94469e --- /dev/null +++ b/0054-ptrace-trace-specific-syscall.patch @@ -0,0 +1,774 @@ +From 47412ba0d9ce6283071973387bf5b34bf876bb9a Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Wed, 1 Dec 2021 09:44:07 +0800 +Subject: [PATCH 54/72] ptrace: trace specific syscall + +criu uses `ptrace(PTRACE_SYSCALL)` to watch whether the tracee steps into +the correct state; it isn't necessary to stop the tracee at every syscall. +Therefore, customizing `ptrace(PTRACE_SYSCALL_NR)` to make tracee stop at +the specific syscall can save time (1000 threads consume about 140ms). + +ptrace syntax: + long ptrace(PTRACE_SYSCALL_NR, pid_t pid, void *addr, void *data); + +The argument `addr` is unused in original `ptrace(PTRACE_SYSCALL)`. +Here `ptrace(PTRACE_SYSCALL_NR)` uses the `addr` parameter to give the +specific sysno to be traced. + +Use `criu check` to generate `/run/criu.kdat` before the first usage of +criu, or auto-check during `criu {dump, restore}`. 
+ +Conflict:NA +Reference:https://gitee.com/src-openeuler/criu/pulls/25 + +Signed-off-by: fu.lin +--- + compel/Makefile | 1 + + compel/include/uapi/bisect.h | 30 +++++++ + compel/include/uapi/infect.h | 15 +++- + compel/src/lib/bisect.c | 92 +++++++++++++++++++ + compel/src/lib/infect.c | 167 ++++++++++++++++++++++++++++++++--- + criu/cr-dump.c | 2 +- + criu/cr-restore.c | 97 +++++++++++++++++++- + criu/include/kerndat.h | 1 + + criu/kerndat.c | 61 +++++++++++++ + 9 files changed, 450 insertions(+), 16 deletions(-) + create mode 100644 compel/include/uapi/bisect.h + create mode 100644 compel/src/lib/bisect.c + +diff --git a/compel/Makefile b/compel/Makefile +index b79aee6..2168a26 100644 +--- a/compel/Makefile ++++ b/compel/Makefile +@@ -27,6 +27,7 @@ lib-y += src/lib/infect-rpc.o + lib-y += src/lib/infect-util.o + lib-y += src/lib/infect.o + lib-y += src/lib/ptrace.o ++lib-y += src/lib/bisect.o + + ifeq ($(ARCH),x86) + lib-y += arch/$(ARCH)/src/lib/thread_area.o +diff --git a/compel/include/uapi/bisect.h b/compel/include/uapi/bisect.h +new file mode 100644 +index 0000000..55ebcbd +--- /dev/null ++++ b/compel/include/uapi/bisect.h +@@ -0,0 +1,30 @@ ++#ifndef __COMPEL_BISECT_H__ ++#define __COMPEL_BISECT_H__ ++ ++#include ++ ++enum tf { ++ TRACE_INTERRUPT, ++ TRACE_SYSCALL_ENTER, ++ TRACE_SYSCALL_EXIT, ++}; ++ ++struct trace_flag { ++ pid_t key; ++ enum tf flag; ++}; ++ ++struct bisect_meta { ++ int size; ++ int used; ++ void *data; /* data pointer array */ ++ void *__data; /* data array */ ++}; ++ ++struct trace_flag *tf_bisect(struct bisect_meta *meta, pid_t key); ++struct trace_flag *tf_insert(struct bisect_meta *meta, pid_t key); ++int tf_create(struct bisect_meta *meta, int len); ++void tf_destroy(struct bisect_meta *meta); ++void tf_clear(struct bisect_meta *meta); ++ ++#endif /* __COMPEL_BISECT_H__ */ +diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h +index 389878e..a23782e 100644 +--- a/compel/include/uapi/infect.h ++++ 
b/compel/include/uapi/infect.h +@@ -8,11 +8,16 @@ + #include + #include + #include ++#include + + #include "common/compiler.h" + + #define PARASITE_START_AREA_MIN (4096) + ++#ifndef PTRACE_SYSCALL_NR ++# define PTRACE_SYSCALL_NR 0xff00 ++#endif ++ + extern int __must_check compel_interrupt_task(int pid); + + struct seize_task_status { +@@ -41,7 +46,7 @@ extern int __must_check compel_infect(struct parasite_ctl *ctl, unsigned long nr + extern struct parasite_thread_ctl __must_check *compel_prepare_thread(struct parasite_ctl *ctl, int pid); + extern void compel_release_thread(struct parasite_thread_ctl *); + +-extern int __must_check compel_stop_daemon(struct parasite_ctl *ctl); ++extern int __must_check compel_stop_daemon(struct parasite_ctl *ctl, bool customize); + extern int __must_check compel_cure_remote(struct parasite_ctl *ctl); + extern int __must_check compel_cure_local(struct parasite_ctl *ctl); + extern int __must_check compel_cure(struct parasite_ctl *ctl); +@@ -83,6 +88,14 @@ extern int __must_check compel_stop_pie(pid_t pid, void *addr, enum trace_flags + + extern int __must_check compel_unmap(struct parasite_ctl *ctl, unsigned long addr); + ++extern int __must_check compel_stop_on_syscall_customize(int tasks, ++ const int sys_nr, const int exit_sys_nr, struct bisect_meta *meta); ++ ++extern int __must_check compel_stop_pie_customize(pid_t pid, ++ const int sys_nr, struct trace_flag *tf); ++ ++extern int __must_check compel_unmap_customize(struct parasite_ctl *ctl, unsigned long addr); ++ + extern int compel_mode_native(struct parasite_ctl *ctl); + + extern k_rtsigset_t *compel_task_sigmask(struct parasite_ctl *ctl); +diff --git a/compel/src/lib/bisect.c b/compel/src/lib/bisect.c +new file mode 100644 +index 0000000..807a5a9 +--- /dev/null ++++ b/compel/src/lib/bisect.c +@@ -0,0 +1,92 @@ ++#include ++ ++#include "log.h" ++#include "common/xmalloc.h" ++#include "bisect.h" ++ ++struct trace_flag *tf_bisect(struct bisect_meta *meta, pid_t key) ++{ ++ 
struct trace_flag **tfs = meta->data; ++ int lo = 0, hi = meta->used, mid; ++ ++ if (meta->used <= 0) ++ return NULL; ++ ++ while (lo < hi) { ++ mid = (int)((lo + hi) / 2); ++ if (tfs[mid]->key == key) { ++ return tfs[mid]; ++ } else if (tfs[mid]->key > key) { ++ hi = mid; ++ } else { ++ lo = mid + 1; ++ } ++ } ++ ++ return NULL; ++} ++ ++/* used in cr-restore */ ++struct trace_flag *tf_insert(struct bisect_meta *meta, pid_t key) ++{ ++ struct trace_flag **tfs = meta->data; ++ struct trace_flag *tf = &((struct trace_flag *)meta->__data)[meta->used]; ++ int i = 0, j = 0; ++ ++ if (meta->used == meta->size) ++ return NULL; ++ ++ for (i = 0; i < meta->used; i++) { ++ if (tfs[i]->key >= key) /* impossible condition: `tfs[i]->key == key` */ ++ break; ++ } ++ ++ j = meta->used; ++ meta->used += 1; ++ ++ while (j > i) { ++ tfs[j] = tfs[j-1]; ++ j -= 1; ++ } ++ ++ tfs[i] = tf; ++ tf->key = key; ++ ++ return tf; ++} ++ ++int tf_create(struct bisect_meta *meta, int len) ++{ ++ struct trace_flag *tfs; ++ struct trace_flag **tfs_ptr; ++ ++ tfs = xzalloc(sizeof(*tfs) * len); ++ if (tfs == NULL) ++ return -1; ++ ++ tfs_ptr = xmalloc(sizeof(*tfs_ptr) * len); ++ if (tfs_ptr == NULL) ++ goto err; ++ ++ meta->size = len; ++ meta->used = 0; ++ meta->__data = tfs; ++ meta->data = tfs_ptr; ++ ++ return 0; ++err: ++ xfree(tfs); ++ return -1; ++} ++ ++void tf_destroy(struct bisect_meta *meta) ++{ ++ xfree(meta->__data); ++ xfree(meta->data); ++} ++ ++void tf_clear(struct bisect_meta *meta) ++{ ++ meta->used = 0; ++ __builtin_memset(meta->data, 0, sizeof(struct trace_flag **)*meta->size); ++} +diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c +index 6a13cc1..f9b8832 100644 +--- a/compel/src/lib/infect.c ++++ b/compel/src/lib/infect.c +@@ -449,7 +449,7 @@ static int restore_child_handler(struct parasite_ctl *ctl) + } + + static int parasite_run(pid_t pid, int cmd, unsigned long ip, void *stack, user_regs_struct_t *regs, +- struct thread_ctx *octx) ++ struct thread_ctx *octx, 
void *addr) + { + k_rtsigset_t block; + +@@ -470,7 +470,7 @@ static int parasite_run(pid_t pid, int cmd, unsigned long ip, void *stack, user_ + goto err_regs; + } + +- if (ptrace(cmd, pid, NULL, NULL)) { ++ if (ptrace(cmd, pid, addr, NULL)) { + pr_perror("Can't run parasite at %d", pid); + goto err_cont; + } +@@ -575,7 +575,7 @@ int compel_execute_syscall(struct parasite_ctl *ctl, user_regs_struct_t *regs, c + return -1; + } + +- err = parasite_run(pid, PTRACE_CONT, ctl->ictx.syscall_ip, 0, regs, &ctl->orig); ++ err = parasite_run(pid, PTRACE_CONT, ctl->ictx.syscall_ip, 0, regs, &ctl->orig, NULL); + if (!err) + err = parasite_trap(ctl, pid, regs, &ctl->orig, false); + +@@ -592,7 +592,7 @@ int compel_run_at(struct parasite_ctl *ctl, unsigned long ip, user_regs_struct_t + user_regs_struct_t regs = ctl->orig.regs; + int ret; + +- ret = parasite_run(ctl->rpid, PTRACE_CONT, ip, 0, ®s, &ctl->orig); ++ ret = parasite_run(ctl->rpid, PTRACE_CONT, ip, 0, ®s, &ctl->orig, NULL); + if (!ret) + ret = parasite_trap(ctl, ctl->rpid, ret_regs ? 
ret_regs : ®s, &ctl->orig, false); + return ret; +@@ -641,7 +641,7 @@ static int parasite_init_daemon(struct parasite_ctl *ctl) + goto err; + + regs = ctl->orig.regs; +- if (parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, ctl->rstack, ®s, &ctl->orig)) ++ if (parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, ctl->rstack, ®s, &ctl->orig, NULL)) + goto err; + + futex_wait_while_eq(&args->daemon_connected, 0); +@@ -1303,7 +1303,7 @@ static bool task_in_parasite(struct parasite_ctl *ctl, user_regs_struct_t *regs) + return addr >= ctl->remote_map && addr < ctl->remote_map + ctl->map_length; + } + +-static int parasite_fini_seized(struct parasite_ctl *ctl) ++static int parasite_fini_seized(struct parasite_ctl *ctl, bool customize) + { + pid_t pid = ctl->rpid; + user_regs_struct_t regs; +@@ -1348,6 +1348,34 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) + if (ret) + return -1; + ++ /* use customize ptrace */ ++ if (customize) { ++ struct trace_flag tf = { .key = pid, .flag = TRACE_SYSCALL_ENTER }; ++ struct trace_flag *tf_ptr[] = { &tf }; ++ struct bisect_meta meta = { ++ .size = 1, ++ .used = 1, ++ .__data = &tf, ++ .data = tf_ptr, ++ }; ++ ++ ret = compel_stop_pie_customize(pid, __NR(rt_sigreturn, 0), &tf); ++ if (ret < 0) ++ return ret; ++ ++ /* The process is going to execute the required syscall, the ++ * original syscall should be forgot(set `-1`) in ++ * `syscall_trace_enter()` handler in kernel when no other ++ * else operation in tracer. ++ * ++ * Note: -1 means NO_SYSCALL which is defined in ++ * `arch/arm64/include/asm/ptrace.h`. 
++ */ ++ return compel_stop_on_syscall_customize(1, ++ __NR(rt_sigreturn, 0), ++ -1, &meta); ++ } ++ + /* Go to sigreturn as closer as we can */ + ret = compel_stop_pie(pid, ctl->sigreturn_addr, &flag, ctl->ictx.flags & INFECT_NO_BREAKPOINTS); + if (ret < 0) +@@ -1368,7 +1396,7 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) + return 0; + } + +-int compel_stop_daemon(struct parasite_ctl *ctl) ++int compel_stop_daemon(struct parasite_ctl *ctl, bool customize) + { + if (ctl->daemonized) { + /* +@@ -1378,7 +1406,7 @@ int compel_stop_daemon(struct parasite_ctl *ctl) + if (ctl->tsock < 0) + return -1; + +- if (parasite_fini_seized(ctl)) { ++ if (parasite_fini_seized(ctl, customize)) { + close_safe(&ctl->tsock); + return -1; + } +@@ -1394,7 +1422,7 @@ int compel_cure_remote(struct parasite_ctl *ctl) + long ret; + int err; + +- if (compel_stop_daemon(ctl)) ++ if (compel_stop_daemon(ctl, false)) + return -1; + + if (!ctl->remote_map) +@@ -1461,7 +1489,7 @@ int compel_run_in_thread(struct parasite_thread_ctl *tctl, unsigned int cmd) + + *ctl->cmd = cmd; + +- ret = parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, stack, ®s, octx); ++ ret = parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, stack, ®s, octx, NULL); + if (ret == 0) + ret = parasite_trap(ctl, pid, ®s, octx, true); + if (ret == 0) +@@ -1484,7 +1512,7 @@ int compel_unmap(struct parasite_ctl *ctl, unsigned long addr) + pid_t pid = ctl->rpid; + int ret = -1; + +- ret = parasite_run(pid, PTRACE_SYSCALL, addr, ctl->rstack, ®s, &ctl->orig); ++ ret = parasite_run(pid, PTRACE_SYSCALL, addr, ctl->rstack, ®s, &ctl->orig, NULL); + if (ret) + goto err; + +@@ -1500,6 +1528,45 @@ err: + return ret; + } + ++int compel_unmap_customize(struct parasite_ctl *ctl, unsigned long addr) ++{ ++ user_regs_struct_t regs = ctl->orig.regs; ++ pid_t pid = ctl->rpid; ++ int ret = -1; ++ struct trace_flag tf = { .key = pid, .flag = TRACE_SYSCALL_ENTER }; ++ struct trace_flag *tf_ptr[] = { &tf }; ++ struct bisect_meta meta = { ++ 
.size = 1,
++		.used = 1,
++		.__data = &tf,
++		.data = tf_ptr,
++	};
++
++	/*
++	 * Here is the parasite code. Unlike the trap code `compel_stop_pie()`, it
++	 * won't let the tracee forget the original syscall. This way, the tracer
++	 * just traces the syscall called by the tracee. The log looks like the
++	 * following if the tracee forgets the syscall:
++	 *
++	 * [ 817.638332] set pid 1877 ptrace sysno 215
++	 * [ 817.638343] syscall_trace_enter: pid 1877 ptrace_sysno 0 current_sysno 215
++	 * [ 817.638363] (00.006280) Error (compel/src/lib/infect.c:1582): 1877 (native) is going to execute the syscall 215, required is 215
++	 * [ 817.638368] set pid 1877 ptrace sysno 0
++	 * [ 817.638402] syscall_trace_exit: pid 1877 ptrace_sysno 0 current_sysno 215
++	 */
++	ret = parasite_run(pid, PTRACE_SYSCALL_NR, addr, ctl->rstack, &regs,
++			   &ctl->orig, (void *)(long)__NR(munmap, 0));
++	if (ret)
++		goto err;
++
++	ret = compel_stop_on_syscall_customize(1, __NR(munmap, 0), 0, &meta);
++
++	if (restore_thread_ctx(pid, &ctl->orig, false))
++		ret = -1;
++err:
++	return ret;
++}
++
+ int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp)
+ {
+ 	int ret;
+@@ -1535,6 +1602,17 @@ int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp)
+ 	return 0;
+ }
+ 
++int compel_stop_pie_customize(pid_t pid, const int sys_nr, struct trace_flag *tf)
++{
++	if (ptrace(PTRACE_SYSCALL_NR, pid, (void *)(long)sys_nr, NULL)) { /* ptrace() is variadic: pass addr as a pointer-sized value */
++		pr_perror("Unable to restart the %d process", pid);
++		return -1;
++	}
++
++	tf->flag = TRACE_SYSCALL_ENTER; /* the next stop of this task is handled as a syscall entry */
++	return 0;
++}
++
+ static bool task_is_trapped(int status, pid_t pid)
+ {
+ 	if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP)
+@@ -1642,6 +1720,73 @@ int compel_stop_on_syscall(int tasks, const int sys_nr, const int sys_nr_compat,
+ 	return 0;
+ }
+ 
++int compel_stop_on_syscall_customize(int tasks, const int sys_nr,
++	const int exit_sys_nr, struct bisect_meta *meta)
++{
++	struct trace_flag *tf; /* per-task trace state, looked up by pid in meta */
++	user_regs_struct_t regs;
++	int status, ret;
++	pid_t pid;
++
++	while (tasks) { /* loop until every task has been seen in TRACE_SYSCALL_EXIT state */
++		pid = wait4(-1, &status, __WALL, NULL);
++		if (pid == -1) {
++			pr_perror("wait4 failed");
++			return -1;
++		}
++
++		tf = tf_bisect(meta, pid);
++		if (tf == NULL) { /* not one of the traced tasks: warn and keep waiting */
++			pr_warn("Unexpected task %d, state %d signal %d: %s\n",
++				pid, WEXITSTATUS(status),
++				WTERMSIG(status), strsignal(WTERMSIG(status)));
++			continue;
++		}
++
++		if (!task_is_trapped(status, pid))
++			return -1;
++
++		switch (tf->flag) {
++		case TRACE_SYSCALL_ENTER:
++			pr_debug("%d was trapped\n", pid);
++			pr_debug("`- Expecting exit\n");
++
++			ret = ptrace_get_regs(pid, &regs);
++			if (ret) {
++				pr_perror("ptrace");
++				return -1;
++			}
++
++			if (is_required_syscall(&regs, pid, sys_nr, sys_nr)) {
++				ret = ptrace(PTRACE_SYSCALL_NR, pid, (void *)(long)exit_sys_nr, NULL);
++				if (ret) {
++					pr_perror("ptrace");
++					return -1;
++				}
++				tf->flag = TRACE_SYSCALL_EXIT;
++			} else { /* some other syscall: re-arm for sys_nr and stay in ENTER state */
++				pr_warn("Impossible condition, check the system, try our best to restore...\n");
++				ret = ptrace(PTRACE_SYSCALL_NR, pid, (void *)(long)sys_nr, NULL);
++				if (ret) {
++					pr_perror("ptrace");
++					return -1;
++				}
++			}
++			break;
++		case TRACE_SYSCALL_EXIT:
++			pr_debug("%d was stopped\n", pid);
++			tasks--; /* this task is done */
++			break;
++
++		default:
++			pr_err("pid %d invalid status: %d\n", pid, tf->flag);
++			return -1;
++		}
++	}
++
++	return 0;
++}
++
+ int compel_mode_native(struct parasite_ctl *ctl)
+ {
+ 	return user_regs_native(&ctl->orig.regs);
+diff --git a/criu/cr-dump.c b/criu/cr-dump.c
+index ee826c0..9253e91 100644
+--- a/criu/cr-dump.c
++++ b/criu/cr-dump.c
+@@ -1708,7 +1708,7 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie)
+ 		goto err_cure;
+ 	}
+ 
+-	ret = compel_stop_daemon(parasite_ctl);
++	ret = compel_stop_daemon(parasite_ctl, kdat.has_customize_ptrace);
+ 	if (ret) {
+ 		pr_err("Can't stop daemon in parasite (pid: %d)\n", pid);
+ 		goto err_cure;
+diff --git a/criu/cr-restore.c b/criu/cr-restore.c
+index d19768d..b0b3d30 100644
+--- a/criu/cr-restore.c
++++ b/criu/cr-restore.c
+@@ -2181,6 +2181,64 @@ static int catch_tasks(bool root_seized, enum
trace_flags *flag) + return 0; + } + ++static int cache_tasks_customize(bool root_seized, struct bisect_meta *meta) ++{ ++ struct pstree_item *item; ++ struct trace_flag *tf; ++ ++ for_each_pstree_item(item) { ++ int status, i, ret; ++ pid_t pid; ++ ++ if (!task_alive(item)) ++ continue; ++ ++ if (item->nr_threads == 1) { ++ item->threads[0].real = item->pid->real; ++ } else { ++ if (parse_threads(item->pid->real, &item->threads, &item->nr_threads)) ++ return -1; ++ } ++ ++ for (i = 0; i < item->nr_threads; i++) { ++ pid = item->threads[i].real; ++ ++ if (ptrace(PTRACE_INTERRUPT, pid, 0, 0)) { ++ pr_perror("Can't interrupt the %d task", pid); ++ return -1; ++ } ++ ++ tf = tf_insert(meta, pid); ++ if (tf == NULL) { ++ pr_err("Can't find trace flag for %d, used %d\n", ++ pid, meta->used); ++ return -1; ++ } ++ tf->flag = TRACE_INTERRUPT; ++ } ++ ++ for (i = 0; i < item->nr_threads; i++) { ++ pid = wait4(-1, &status, __WALL, NULL); ++ ++ tf = tf_bisect(meta, pid); ++ if (tf == NULL) { ++ pr_err("Can't find trace flag for %d, used %d\n", ++ pid, meta->used); ++ return -1; ++ } ++ ++ ret = compel_stop_pie_customize(pid, ++ __NR(rt_sigreturn, 0), ++ tf); ++ if (ret < 0) ++ return -1; ++ ++ } ++ } ++ ++ return 0; ++} ++ + static int clear_breakpoints(void) + { + struct pstree_item *item; +@@ -2207,6 +2265,7 @@ static void finalize_restore(void) + pid_t pid = item->pid->real; + struct parasite_ctl *ctl; + unsigned long restorer_addr; ++ int retval; + + if (!task_alive(item)) + continue; +@@ -2217,7 +2276,12 @@ static void finalize_restore(void) + continue; + + restorer_addr = (unsigned long)rsti(item)->munmap_restorer; +- if (compel_unmap(ctl, restorer_addr)) ++ if (!kdat.has_customize_ptrace) ++ retval = compel_unmap(ctl, restorer_addr); ++ else ++ retval = compel_unmap_customize(ctl, restorer_addr); ++ ++ if (retval) + pr_err("Failed to unmap restorer from %d\n", pid); + + xfree(ctl); +@@ -2333,11 +2397,18 @@ static void reap_zombies(void) + + static int 
restore_root_task(struct pstree_item *init) + { ++ struct bisect_meta tfs_meta; + enum trace_flags flag = TRACE_ALL; + int ret, fd, mnt_ns_fd = -1; + int root_seized = 0; + struct pstree_item *item; + ++ if (kdat.has_customize_ptrace ++ && tf_create(&tfs_meta, task_entries->nr_threads) != 0) { ++ pr_err("Can't alloc memory, tf_create failed\n"); ++ return -1; ++ } ++ + ret = run_scripts(ACT_PRE_RESTORE); + if (ret != 0) { + pr_err("Aborting restore due to pre-restore script ret code %d\n", ret); +@@ -2551,7 +2622,12 @@ skip_ns_bouncing: + + timing_stop(TIME_RESTORE); + +- if (catch_tasks(root_seized, &flag)) { ++ if (!kdat.has_customize_ptrace) ++ ret = catch_tasks(root_seized, &flag); ++ else ++ ret = cache_tasks_customize(root_seized, &tfs_meta); ++ ++ if (ret) { + pr_err("Can't catch all tasks\n"); + goto out_kill_network_unlocked; + } +@@ -2561,7 +2637,15 @@ skip_ns_bouncing: + + __restore_switch_stage(CR_STATE_COMPLETE); + +- ret = compel_stop_on_syscall(task_entries->nr_threads, __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag); ++ if (!kdat.has_customize_ptrace) { ++ ret = compel_stop_on_syscall(task_entries->nr_threads, ++ __NR(rt_sigreturn, 0), ++ __NR(rt_sigreturn, 1), flag); ++ } else { ++ ret = compel_stop_on_syscall_customize(task_entries->nr_threads, ++ __NR(rt_sigreturn, 0), ++ -1, &tfs_meta); ++ } + if (ret) { + pr_err("Can't stop all tasks on rt_sigreturn\n"); + goto out_kill_network_unlocked; +@@ -2600,6 +2684,9 @@ skip_ns_bouncing: + reap_zombies(); + } + ++ if (kdat.has_customize_ptrace) ++ tf_destroy(&tfs_meta); ++ + return 0; + + out_kill_network_unlocked: +@@ -2631,6 +2718,10 @@ out: + stop_usernsd(); + __restore_switch_stage(CR_STATE_FAIL); + pr_err("Restoring FAILED.\n"); ++ ++ if (kdat.has_customize_ptrace) ++ tf_destroy(&tfs_meta); ++ + return -1; + } + +diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h +index 3979939..8034db9 100644 +--- a/criu/include/kerndat.h ++++ b/criu/include/kerndat.h +@@ -77,6 +77,7 @@ struct 
kerndat_s {
+ 	bool has_rseq;
+ 	bool has_ptrace_get_rseq_conf;
+ 	bool has_unix_sk_repair;
++	bool has_customize_ptrace;
+ };
+ 
+ extern struct kerndat_s kdat;
+diff --git a/criu/kerndat.c b/criu/kerndat.c
+index 6d6aac1..630814e 100644
+--- a/criu/kerndat.c
++++ b/criu/kerndat.c
+@@ -1289,6 +1289,66 @@ static void kerndat_has_unix_sk_repair(void)
+ 	return;
+ }
+ 
++static void kerndat_has_customize_ptrace(void)
++{
++	pid_t tracee = fork();
++	int status;
++	int retval;
++
++	if (tracee == 0) {
++		/* ask the kernel to SIGKILL this probe child when its parent dies */
++		prctl(PR_SET_PDEATHSIG, SIGKILL);
++
++		while (true)
++			sleep(1);
++	} else if (tracee > 0) {
++		pr_debug("fork task %d as tracee\n", tracee);
++		retval = ptrace(PTRACE_ATTACH, tracee, 0, 0);
++		if (retval < 0) {
++			pr_perror("Unexpect error from ptrace(PTRACE_ATTACH)");
++			return;
++		}
++
++		retval = wait4(-1, &status, __WALL, NULL);
++		if (retval == -1)
++			pr_perror("Unexpect error from wait");
++		else if (retval != tracee || !(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP))
++			pr_err("Task %d (expect %d) is unexpect, status: %d,"
++			       " stoped: %d signal: %d(%s)\n",
++			       retval, tracee, status,
++			       WIFSTOPPED(status), WSTOPSIG(status),
++			       strsignal(WSTOPSIG(status)));
++		else {
++			retval = ptrace(PTRACE_SYSCALL_NR, tracee, 0, 0);
++			if (retval == 0)
++				kdat.has_customize_ptrace = true;
++			else
++				pr_perror("Unexpect error from ptrace(PTRACE_SYSCALL_NR)");
++		}
++
++		if (kill(tracee, SIGKILL) != 0) {
++			pr_perror("kill tracee %d failed", tracee);
++			return;
++		}
++
++		/*
++		 * Reap the tracee here so that a later wait4(-1) does not pick it
++		 * up unexpectedly when criu.kdat is generated in the dump process.
++		 */
++		retval = waitpid(tracee, &status, 0);
++		if (retval == -1)
++			pr_perror("waitpid() failed");
++		else
++			pr_debug("tracee %d exited, status %d, signal %d(%s)\n",
++				 tracee, WEXITSTATUS(status),
++				 WTERMSIG(status), strsignal(WTERMSIG(status)));
++	} else {
++		pr_perror("Unexpected error from fork");
++	}
++
++	return;
++}
++
+ int kerndat_init(void)
+ {
+ 	int ret;
+@@ -1451,6 +1511,7 @@ int kerndat_init(void)
+ 	}
+ 
+ 	kerndat_has_unix_sk_repair();
++	kerndat_has_customize_ptrace();
+ 
+ 	kerndat_lsm();
+ 	kerndat_mmap_min_addr();
+-- 
+2.34.1
+
diff --git a/0055-notifier-rollback-when-open-img-failed.patch b/0055-notifier-rollback-when-open-img-failed.patch
new file mode 100644
index 0000000..5784aa3
--- /dev/null
+++ b/0055-notifier-rollback-when-open-img-failed.patch
@@ -0,0 +1,150 @@
+From c79a274b378173ac64d42d1c72df1ec594085d66 Mon Sep 17 00:00:00 2001
+From: "fu.lin"
+Date: Mon, 27 Dec 2021 21:34:39 +0800
+Subject: [PATCH 55/72] notifier: rollback when open img failed
+
+Conflict:NA
+Reference:https://gitee.com/src-openeuler/criu/pulls/26
+
+Signed-off-by: fu.lin
+---
+ criu/cr-restore.c     | 69 +++++++++++++++++++++++++++++++++++++++++++
+ criu/include/pstree.h |  1 +
+ criu/pstree.c         |  8 +++++
+ 3 files changed, 78 insertions(+)
+
+diff --git a/criu/cr-restore.c b/criu/cr-restore.c
+index b0b3d30..13f0a93 100644
+--- a/criu/cr-restore.c
++++ b/criu/cr-restore.c
+@@ -1542,6 +1542,9 @@ static inline int fork_with_pid(struct pstree_item *item)
+ 		goto err_unlock;
+ 	}
+ 
++	/* disable criu rollback capability.
*/
++	criu_roll = false;
++
+ 	if (item == root_item) {
+ 		item->pid->real = ret;
+ 		pr_debug("PID: real %d virt %d\n", item->pid->real, vpid(item));
+@@ -2757,6 +2760,71 @@ int prepare_dummy_task_state(struct pstree_item *pi)
+ 	return 0;
+ }
+ 
++static int criu_rollback_internal(void *_arg)
++{
++	bool unmask = *(int *)_arg; /* NOTE(review): reads the argument as int - confirm opts.mask_exit_notify is int-sized */
++	pid_t pid = getpid();
++
++	if (unmask && mask_task_exit_notify(pid, false) != 0)
++		pr_err("unmask exit notify failed for %d\n", pid);
++
++	do_notifier_rollback(true, POST_UPDATE_KERNEL_COMPLETE);
++	return 0;
++}
++
++static void criu_rollback(void)
++{
++	pid_t pid;
++	unsigned long clone_flags;
++	int retval = 0;
++
++	if (!criu_roll || !opts.with_notifier_kup)
++		return;
++
++	pid = vpid(root_item);
++	clone_flags = rsti(root_item)->clone_flags;
++
++	pr_info("do criu rollback\n");
++
++	/* Some rollback notifiers must be called in the specific task context. */
++	if (opts.use_fork_pid)
++		retval = write_fork_pid(vpid(root_item));
++	else if (!kdat.has_clone3_set_tid)
++		retval = set_next_pid((void *)&pid);
++
++	if (retval < 0) {
++		pr_err("set next pid %d failed, can't do rollback.\n", pid);
++		return;
++	}
++
++	if (!kdat.has_clone3_set_tid) {
++		retval = clone_noasan(criu_rollback_internal,
++				      clone_flags | SIGCHLD,
++				      &opts.mask_exit_notify);
++	} else {
++		retval = clone3_with_pid_noasan(criu_rollback_internal,
++						&opts.mask_exit_notify,
++						clone_flags,
++						SIGCHLD, pid);
++	}
++
++	if (retval < 0) {
++		pr_err("Can't fork for %d to do rollback: %s.\n",
++		       pid, strerror(errno));
++	} else {
++		int status;
++
++		if (retval != pid)
++			pr_err("clone pid %d isn't equal with %d\n",
++			       retval, pid);
++
++		if (waitpid(retval, &status, 0) < 0) { /* reap the child that clone actually returned */
++			pr_warn("Unable to wait %d: %s\n",
++				retval, strerror(errno));
++		}
++	}
++}
++
+ int cr_restore_tasks(void)
+ {
+ 	int ret = -1;
+@@ -2831,6 +2899,7 @@ clean_cgroup:
+ err:
+ 	cr_plugin_fini(CR_PLUGIN_STAGE__RESTORE, ret);
+ 	if (ret < 0) {
++		criu_rollback();
+ 		if (!!(network_status & NETWORK_COLLECTED)
+ 		    &&
!files_collected() && collect_image(&inet_sk_cinfo)) + pr_err("collect inet sk cinfo fail\n"); +diff --git a/criu/include/pstree.h b/criu/include/pstree.h +index 87e4c47..6c0765b 100644 +--- a/criu/include/pstree.h ++++ b/criu/include/pstree.h +@@ -46,6 +46,7 @@ enum { + }; + #define FDS_EVENT (1 << FDS_EVENT_BIT) + ++extern bool criu_roll; + extern struct pstree_item *current; + + struct rst_info; +diff --git a/criu/pstree.c b/criu/pstree.c +index 778c884..8992155 100644 +--- a/criu/pstree.c ++++ b/criu/pstree.c +@@ -20,6 +20,11 @@ + #include "images/pstree.pb-c.h" + #include "crtools.h" + ++/* ++ * Sometimes, img may be broken, set flag here to enable roll capibility ++ * before forking restorer. ++ */ ++bool criu_roll; + struct pstree_item *root_item; + static struct rb_root pid_root_rb; + +@@ -638,6 +643,9 @@ static int read_pstree_image(pid_t *pid_max) + if (!img) + return -1; + ++ /* enable rollback capibility when opening img successfully. */ ++ criu_roll = true; ++ + do { + ret = read_one_pstree_item(img, pid_max); + } while (ret > 0); +-- +2.34.1 + diff --git a/0056-detach-don-t-kill-task-when-ptrace-PTRACE_DETACH-ret.patch b/0056-detach-don-t-kill-task-when-ptrace-PTRACE_DETACH-ret.patch new file mode 100644 index 0000000..bc57061 --- /dev/null +++ b/0056-detach-don-t-kill-task-when-ptrace-PTRACE_DETACH-ret.patch @@ -0,0 +1,38 @@ +From 389a410ddfbca241bf724a4e4751fa96499ff6f1 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Thu, 30 Dec 2021 10:45:16 +0800 +Subject: [PATCH 56/72] detach: don't kill task when `ptrace(PTRACE_DETACH)` + return ESRCH + +Conflict:NA +Reference:https://gitee.com/src-openeuler/criu/pulls/26 + +Signed-off-by: fu.lin +--- + criu/cr-restore.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 13f0a93..c3ff65d 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -2317,6 +2317,16 @@ static int finalize_restore_detach(void) + return -1; + } + if (ptrace(PTRACE_DETACH, 
pid, NULL, 0)) { ++ /* ++ * There is delta between task resume and ++ * `ptrace(PTRACE_DETACH)`, task maybe exit ++ * initiative during this time. ++ */ ++ if (errno == ESRCH) { ++ pr_warn("Unable to detach %d, task has dead\n", pid); ++ continue; ++ } ++ + pr_perror("Unable to detach %d", pid); + return -1; + } +-- +2.34.1 + diff --git a/0057-build-add-secure-compilation-options.patch b/0057-build-add-secure-compilation-options.patch new file mode 100644 index 0000000..97c7544 --- /dev/null +++ b/0057-build-add-secure-compilation-options.patch @@ -0,0 +1,114 @@ +From 4a3b351a69083567392a70bfb8d91c3f666e0aff Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 19 May 2021 22:49:57 +0800 +Subject: [PATCH 57/72] build: add secure compilation options + +Add secure compilation options: +-fstack-protector -fstack-protector-all +-Wl,-z,relro,-z,now,-z,noexecstack + +Conflict:NA +Reference:https://gitee.com/src-openeuler/criu/pulls/21 +Signed-off-by: Fu Lin +--- + Makefile | 4 ++++ + criu/Makefile | 2 +- + criu/pie/Makefile | 1 + + criu/pie/Makefile.library | 2 ++ + lib/Makefile | 1 + + lib/c/Makefile | 2 +- + scripts/nmk/scripts/build.mk | 5 +++-- + 7 files changed, 13 insertions(+), 4 deletions(-) + +diff --git a/Makefile b/Makefile +index 08761ef..c1eafdd 100644 +--- a/Makefile ++++ b/Makefile +@@ -80,6 +80,10 @@ ifeq ($(ARCH),mips) + DEFINES := -DCONFIG_MIPS + endif + ++# secure compilation options ++CFLAGS += -fstack-protector-all -fPIE ++LDFLAGS += -pie ++ + # + # CFLAGS_PIE: + # +diff --git a/criu/Makefile b/criu/Makefile +index db4e9d8..3b4d69f 100644 +--- a/criu/Makefile ++++ b/criu/Makefile +@@ -85,7 +85,7 @@ $(obj)/%: pie + + $(obj)/criu: $(PROGRAM-BUILTINS) + $(call msg-link, $@) +- $(Q) $(CC) $(CFLAGS) $^ $(LIBS) $(WRAPFLAGS) $(LDFLAGS) $(GMONLDOPT) -rdynamic -o $@ ++ $(Q) $(CC) $(CFLAGS) $^ $(LIBS) $(WRAPFLAGS) $(LDFLAGS) $(GMONLDOPT) -rdynamic -o $@ -Wl,-z,relro,-z,now,-z,noexecstack -fPIE -pie + + UNIT-BUILTINS += $(obj)/config.o + UNIT-BUILTINS += 
$(obj)/log.o +diff --git a/criu/pie/Makefile b/criu/pie/Makefile +index 265dcf8..40b5804 100644 +--- a/criu/pie/Makefile ++++ b/criu/pie/Makefile +@@ -6,6 +6,7 @@ target := parasite restorer + + CFLAGS := $(filter-out -pg $(CFLAGS-GCOV) $(CFLAGS-ASAN),$(CFLAGS)) + CFLAGS += $(CFLAGS_PIE) ++CFLAGS := $(filter-out -fstack-protector -fstack-protector-all,$(CFLAGS)) + ccflags-y += -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0 + ccflags-y += -Wp,-U_FORTIFY_SOURCE -Wp,-D_FORTIFY_SOURCE=0 + +diff --git a/criu/pie/Makefile.library b/criu/pie/Makefile.library +index da2a2fa..c022d06 100644 +--- a/criu/pie/Makefile.library ++++ b/criu/pie/Makefile.library +@@ -27,3 +27,5 @@ CFLAGS += $(CFLAGS_PIE) + ifeq ($(ARCH),mips) + CFLAGS += -fno-stack-protector -DCR_NOGLIBC -mno-abicalls -fno-pic + endif ++ ++CFLAGS := $(filter-out -fstack-protector -fstack-protector-all,$(CFLAGS)) +diff --git a/lib/Makefile b/lib/Makefile +index 575a7ba..729c298 100644 +--- a/lib/Makefile ++++ b/lib/Makefile +@@ -14,6 +14,7 @@ lib/c/Makefile: ; + lib/c/%: .FORCE + $(Q) $(MAKE) $(build)=lib/c $@ + ++CFLAGS := $(filter-out -fPIE,$(CFLAGS)) + cflags-so += $(CFLAGS) -rdynamic -Wl,-soname,$(CRIU_SO).$(CRIU_SO_VERSION_MAJOR) + ldflags-so += -lprotobuf-c + +diff --git a/lib/c/Makefile b/lib/c/Makefile +index af01467..d7f6491 100644 +--- a/lib/c/Makefile ++++ b/lib/c/Makefile +@@ -4,5 +4,5 @@ obj-y += ./images/rpc.pb-c.o + ccflags-y += -iquote criu/$(ARCH_DIR)/include + ccflags-y += -iquote criu/include + ccflags-y += -iquote images +-ccflags-y += -fPIC -fno-stack-protector ++ccflags-y += -fPIC + ldflags-y += -r -z noexecstack +diff --git a/scripts/nmk/scripts/build.mk b/scripts/nmk/scripts/build.mk +index d01d2b7..6f366d7 100644 +--- a/scripts/nmk/scripts/build.mk ++++ b/scripts/nmk/scripts/build.mk +@@ -15,8 +15,9 @@ lib-name := + lib-target := + hostprogs-y := + libso-y := +-ld_flags := +-ldflags-so := ++ld_flags := -Wl,-z,relro,-z,now,-z,noexecstack ++ldflags-so := -Wl,-z,relro,-z,now,-z,noexecstack ++ldflags-y 
:= -z relro -z now -z noexecstack + arflags-y := + target := + deps-y := +-- +2.34.1 + diff --git a/0058-nftables-add-mnl-api.patch b/0058-nftables-add-mnl-api.patch new file mode 100644 index 0000000..4445acd --- /dev/null +++ b/0058-nftables-add-mnl-api.patch @@ -0,0 +1,283 @@ +From e6dea32c64dfae3a6d06512b45f66416fc974556 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Wed, 11 Aug 2021 16:50:49 +0800 +Subject: [PATCH 58/72] nftables: add mnl api + +libmnl provides the communication between userspace and kernelspace for +netfilter netlink. I abstract here for the next usage. + +Signed-off-by: fu.lin +--- + criu/Makefile | 2 + + criu/Makefile.crtools | 1 + + criu/Makefile.packages | 6 ++ + criu/include/nftables.h | 28 +++++++ + criu/mnl.c | 165 ++++++++++++++++++++++++++++++++++++++++ + 5 files changed, 202 insertions(+) + create mode 100644 criu/include/nftables.h + create mode 100644 criu/mnl.c + +diff --git a/criu/Makefile b/criu/Makefile +index 3b4d69f..8d11bd5 100644 +--- a/criu/Makefile ++++ b/criu/Makefile +@@ -28,6 +28,8 @@ CFLAGS += -iquote images + CFLAGS += -iquote $(ARCH_DIR)/include + CFLAGS += -iquote . 
+ CFLAGS += $(shell $(PKG_CONFIG) --cflags libnl-3.0) ++CFLAGS += $(shell $(PKG_CONFIG) --cflags libnftnl) ++CFLAGS += $(shell $(PKG_CONFIG) --cflags libmnl) + CFLAGS += $(CONFIG-DEFINES) + + ifeq ($(GMON),1) +diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools +index 2ad0207..a132810 100644 +--- a/criu/Makefile.crtools ++++ b/criu/Makefile.crtools +@@ -98,6 +98,7 @@ obj-y += reserved-ports.o + obj-y += orphan-inode.o + obj-y += kmsg.o + obj-y += taskqueue.o ++obj-y += mnl.o + obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o + obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o + CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 +diff --git a/criu/Makefile.packages b/criu/Makefile.packages +index 851489b..76e59ca 100644 +--- a/criu/Makefile.packages ++++ b/criu/Makefile.packages +@@ -7,6 +7,8 @@ REQ-RPM-PKG-NAMES += protobuf-python + REQ-RPM-PKG-NAMES += libnl3-devel + REQ-RPM-PKG-NAMES += libcap-devel + REQ-RPM-PKG-NAMES += $(PYTHON)-future ++REQ-RPM-PKG-NAMES += libmnl-devel ++REQ-RPM-PKG-NAMES += libnftnl-devel + + REQ-RPM-PKG-TEST-NAMES += libaio-devel + +@@ -18,6 +20,8 @@ REQ-DEB-PKG-NAMES += $(PYTHON)-protobuf + REQ-DEB-PKG-NAMES += $(PYTHON)-future + REQ-DEB-PKG-NAMES += libnl-3-dev + REQ-DEB-PKG-NAMES += libcap-dev ++REQ-DEB-PKG-NAMES += libmnl-dev ++REQ-DEB-PKG-NAMES += libnftnl-dev + + REQ-DEB-PKG-TEST-NAMES += $(PYTHON)-yaml + REQ-DEB-PKG-TEST-NAMES += libaio-dev +@@ -32,6 +36,8 @@ endif + + export LIBS += -lprotobuf-c -ldl -lnl-3 -lsoccr -Lsoccr/ -lnet + export LIBS += -lpthread ++export LIBS += $(shell $(PKG_CONFIG) --libs libmnl) ++export LIBS += $(shell $(PKG_CONFIG) --libs libnftnl) + + check-packages-failed: + $(warning Can not find some of the required libraries) +diff --git a/criu/include/nftables.h b/criu/include/nftables.h +new file mode 100644 +index 0000000..0bdab31 +--- /dev/null ++++ b/criu/include/nftables.h +@@ -0,0 +1,28 @@ ++#ifndef __CR_NFTABLES_H__ ++#define __CR_NFTABLES_H__ ++ ++#include ++ ++struct mnl_params { ++ struct mnl_socket *nl; ++ 
char *buf; ++ struct mnl_nlmsg_batch *batch; ++ uint32_t seq; ++}; ++ ++typedef struct nlmsghdr * (*buf_func_t)(struct mnl_params *mnl_params, void *args); ++typedef int (*batch_func_t)(struct mnl_params *mnl_params, void *args); ++typedef int (*mnl_func_t)(struct mnl_params *mnl, batch_func_t cb, void *args); ++ ++struct mnl_cb_params { ++ pid_t tree_id; ++ bool create; ++ bool ipv6; ++}; ++ ++int mnl_sendmsg(batch_func_t batch_cb, void *args); ++int mnl_common(mnl_func_t mnl_cb, void *arg1, void *arg2); ++int mnl_batch_send_and_recv(struct mnl_params *mnl_params, batch_func_t cb, void *args, int *result); ++int mnl_buf_send_and_recv(struct mnl_params *mnl_params, buf_func_t cb, void *args, int *result); ++ ++#endif /* __CR_NFTABLES_H__ */ +diff --git a/criu/mnl.c b/criu/mnl.c +new file mode 100644 +index 0000000..3a03202 +--- /dev/null ++++ b/criu/mnl.c +@@ -0,0 +1,165 @@ ++#include ++#include ++#include ++ ++#include ++ ++#include "nftables.h" ++#include "log.h" ++ ++int mnl_common(mnl_func_t mnl_cb, void *arg1, void *arg2) ++{ ++ char buf[MNL_SOCKET_BUFFER_SIZE]; ++ struct mnl_params mnl = { ++ .seq = time(NULL), ++ }; ++ int retval = -1; ++ ++ mnl.nl = mnl_socket_open(NETLINK_NETFILTER); ++ if (mnl.nl == NULL) { ++ pr_err("mnl_socket_open failed with %d: %s\n", errno, strerror(errno)); ++ return -1; ++ } ++ ++ if (mnl_socket_bind(mnl.nl, 0, MNL_SOCKET_AUTOPID) < 0) { ++ pr_err("mnl_socket_bind failed with %d: %s\n", errno, strerror(errno)); ++ goto err_mnl; ++ } ++ ++ mnl.buf = buf; ++ mnl.batch = mnl_nlmsg_batch_start(buf, sizeof(buf)); ++ if (mnl.batch == NULL) ++ goto err_mnl; ++ ++ if (mnl_cb(&mnl, arg1, arg2) < 0) ++ goto err_batch; ++ ++ retval = 0; ++ ++err_batch: ++ mnl_nlmsg_batch_stop(mnl.batch); ++err_mnl: ++ mnl_socket_close(mnl.nl); ++ ++ return retval; ++} ++ ++static int mnl_sendmsg_internal(struct mnl_params *mnl, batch_func_t cb, void *args) ++{ ++ int retval = -1; ++ ++ nftnl_batch_begin(mnl_nlmsg_batch_current(mnl->batch), mnl->seq++); ++
mnl_nlmsg_batch_next(mnl->batch); ++ ++ if (cb(mnl, args) < 0) ++ goto err_batch; ++ ++ nftnl_batch_end(mnl_nlmsg_batch_current(mnl->batch), mnl->seq++); ++ mnl_nlmsg_batch_next(mnl->batch); ++ ++ if (mnl_socket_sendto(mnl->nl, mnl_nlmsg_batch_head(mnl->batch), ++ mnl_nlmsg_batch_size(mnl->batch)) < 0) { ++ pr_err("%s: mnl_socket_sendto failed with %d: %s\n", ++ __func__, errno, strerror(errno)); ++ goto err_batch; ++ } ++ ++ retval = 0; ++ ++err_batch: ++ return retval; ++} ++ ++int mnl_sendmsg(batch_func_t batch_cb, void *args) ++{ ++ return mnl_common(mnl_sendmsg_internal, batch_cb, args); ++} ++ ++int mnl_batch_send_and_recv(struct mnl_params *mnl_params, batch_func_t cb, ++ void *args, int *result) ++{ ++ struct mnl_socket *nl = mnl_params->nl; ++ struct mnl_nlmsg_batch *batch = mnl_params->batch; ++ uint32_t *seq = &mnl_params->seq; ++ char buf[MNL_SOCKET_BUFFER_SIZE]; ++ int retval; ++ ++ mnl_nlmsg_batch_reset(batch); ++ nftnl_batch_begin(mnl_nlmsg_batch_current(batch), (*seq)++); ++ mnl_nlmsg_batch_next(batch); ++ ++ if (cb(mnl_params, args) < 0) ++ return -1; ++ ++ nftnl_batch_end(mnl_nlmsg_batch_current(batch), (*seq)++); ++ mnl_nlmsg_batch_next(batch); ++ ++ if (mnl_socket_sendto(nl, mnl_nlmsg_batch_head(batch), ++ mnl_nlmsg_batch_size(batch)) < 0) { ++ pr_err("%s: mnl_socket_sendto failed with %d: %s\n", ++ __func__, errno, strerror(errno)); ++ return -1; ++ } ++ ++ /* don't care the netlink retval, and nlmsg hdr flags has no `NLM_F_ACK` */ ++ if (result == NULL) ++ return 0; ++ ++ retval = mnl_socket_recvfrom(nl, buf, sizeof(buf)); ++ while (retval > 0) { ++ retval = mnl_cb_run(buf, retval, 0, mnl_socket_get_portid(nl), NULL, NULL); ++ if (retval <= 0) ++ break; ++ retval = mnl_socket_recvfrom(nl, buf, sizeof(buf)); ++ } ++ ++ if (retval < 0) { ++ pr_err("%s: mnl batch socket recv errno with %d: %s\n", ++ __func__, errno, strerror(errno)); ++ *result = errno; ++ return -1; ++ } ++ ++ *result = 0; ++ return 0; ++} ++ ++int mnl_buf_send_and_recv(struct 
mnl_params *mnl_params, buf_func_t cb, ++ void *args, int *result) ++{ ++ struct mnl_socket *nl = mnl_params->nl; ++ char buf[MNL_SOCKET_BUFFER_SIZE]; ++ struct nlmsghdr *nlh; ++ int retval = 0; ++ ++ if ((nlh = cb(mnl_params, args)) == NULL) ++ return -1; ++ ++ if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { ++ pr_err("%s: mnl_socket_sendto failed with %d: %s\n", ++ __func__, errno, strerror(errno)); ++ return -1; ++ } ++ ++ /* don't care the netlink retval, and nlmsg hdr flags has no `NLM_F_ACK` */ ++ if (result == NULL) ++ return 0; ++ ++ retval = mnl_socket_recvfrom(nl, buf, sizeof(buf)); ++ while (retval > 0) { ++ retval = mnl_cb_run(buf, retval, 0, mnl_socket_get_portid(nl), NULL, NULL); ++ if (retval <= 0) ++ break; ++ retval = mnl_socket_recvfrom(nl, buf, sizeof(buf)); ++ } ++ ++ if (retval < 0) { ++ pr_info("%s: mnl buf socket recv errno with %d: %s\n", ++ __func__, errno, strerror(errno)); ++ *result = errno; ++ return -1; ++ } ++ ++ *result = 0; ++ return 0; ++} +-- +2.34.1 + diff --git a/0059-nftables-implement-nft-api-for-tcp.patch b/0059-nftables-implement-nft-api-for-tcp.patch new file mode 100644 index 0000000..c263200 --- /dev/null +++ b/0059-nftables-implement-nft-api-for-tcp.patch @@ -0,0 +1,1011 @@ +From 099fe7c10a7eaac7df82d268d4d6bd831a68d44b Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Wed, 11 Aug 2021 16:50:49 +0800 +Subject: [PATCH 59/72] nftables: implement nft api for tcp + +Signed-off-by: fu.lin +--- + criu/Makefile.crtools | 1 + + criu/include/nftables.h | 138 +++++++ + criu/nftables.c | 823 ++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 962 insertions(+) + create mode 100644 criu/nftables.c + +diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools +index a132810..b2a7641 100644 +--- a/criu/Makefile.crtools ++++ b/criu/Makefile.crtools +@@ -99,6 +99,7 @@ obj-y += orphan-inode.o + obj-y += kmsg.o + obj-y += taskqueue.o + obj-y += mnl.o ++obj-y += nftables.o + obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o + 
obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o + CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 +diff --git a/criu/include/nftables.h b/criu/include/nftables.h +index 0bdab31..3b51a3d 100644 +--- a/criu/include/nftables.h ++++ b/criu/include/nftables.h +@@ -3,6 +3,99 @@ + + #include + ++#include ++#include ++#include ++#include ++#include ++ ++#define construct_buf(buf, type, family, flags, seq, payload, cb_prefix) \ ++ ({ \ ++ struct nlmsghdr *_nlh; \ ++ \ ++ _nlh = nftnl_##cb_prefix##_nlmsg_build_hdr((buf), \ ++ (type), (family), (flags), (seq)); \ ++ nftnl_##cb_prefix##_nlmsg_build_payload(_nlh, (payload)); \ ++ nftnl_##cb_prefix##_free((payload)); \ ++ _nlh; \ ++ }) ++ ++#define construct_table_buf(buf, type, family, flags, seq, payload) \ ++ construct_buf((buf), (type), (family), (flags), (seq), \ ++ (payload), table) ++ ++#define construct_chain_buf(buf, type, family, flags, seq, payload) \ ++ construct_buf((buf), (type), (family), (flags), (seq), \ ++ (payload), chain) ++ ++#define construct_batch(batch, type, family, flags, seq, payload, cb_prefix) \ ++ { \ ++ struct nlmsghdr *_nlh; \ ++ \ ++ _nlh = nftnl_##cb_prefix##_nlmsg_build_hdr( \ ++ mnl_nlmsg_batch_current(batch), \ ++ (type), (family), (flags), (seq)); \ ++ nftnl_##cb_prefix##_nlmsg_build_payload(_nlh, (payload)); \ ++ nftnl_##cb_prefix##_free((payload)); \ ++ mnl_nlmsg_batch_next((batch)); \ ++ } ++ ++#define construct_table_batch(batch, type, family, flags, seq, payload) \ ++ construct_batch((batch), (type), (family), (flags), (seq), \ ++ (payload), table) ++ ++#define construct_chain_batch(batch, type, family, flags, seq, payload) \ ++ construct_batch((batch), (type), (family), (flags), (seq), \ ++ (payload), chain) ++ ++#define construct_set_batch(batch, type, family, flags, seq, payload) \ ++ construct_batch((batch), (type), (family), (flags), (seq), \ ++ (payload), set) ++ ++#define construct_rule_batch(batch, type, family, flags, seq, payload) \ ++ construct_batch((batch), (type), (family), 
(flags), (seq), \ ++ (payload), rule) ++ ++#define construct_set_elems_batch(batch, type, family, flags, seq, payload) \ ++ { \ ++ struct nlmsghdr *_nlh; \ ++ \ ++ _nlh = nftnl_nlmsg_build_hdr( \ ++ mnl_nlmsg_batch_current(batch), \ ++ (type), (family), (flags), (seq)); \ ++ nftnl_set_elems_nlmsg_build_payload(_nlh, (payload)); \ ++ nftnl_set_free((payload)); \ ++ mnl_nlmsg_batch_next((batch)); \ ++ } ++ ++#define TABLE_NAME "filter" ++#define INPUT_CHAIN_NAME "criu-input" ++#define OUTPUT_CHAIN_NAME "criu-output" ++#define INPUT_IPV4_SET_NAME "criu-input-ipv4-blacklist-%d" ++#define INPUT_IPV6_SET_NAME "criu-input-ipv6-blacklist-%d" ++#define OUTPUT_IPV4_SET_NAME "criu-output-ipv4-blacklist-%d" ++#define OUTPUT_IPV6_SET_NAME "criu-output-ipv6-blacklist-%d" ++ ++/* set key type, see nftables/include/datatypes.h ++ * The rule of the datatype calculation: ++ * Each type occupies 6 bits, type: ++ * - ipaddr: 7, 4 bytes ++ * - ip6addr: 8, 16 bytes ++ * - inet_service: 13, 2 bytes (padding to 4 bytes) ++ * ++ * 0x1cd1cd: 0b 000111 001101 000111 001101 ++ * 0x20d20d: 0b 001000 001101 001000 001101 ++ */ ++#define INET_SERVICE_LEN 2 ++#define IPADDR_LEN 4 ++#define IP6ADDR_LEN 16 ++#define div_round_up(n, d) (((n) + (d) - 1) / (d)) ++ ++#define IPv4_KEY_TYPE 0x1cd1cd ++#define IPv4_KEY_LEN div_round_up(IPADDR_LEN + INET_SERVICE_LEN, 4) * 4 * 2 ++#define IPv6_KEY_TYPE 0x20d20d ++#define IPv6_KEY_LEN div_round_up(IP6ADDR_LEN + INET_SERVICE_LEN, 4) * 4 * 2 ++ + struct mnl_params { + struct mnl_socket *nl; + char *buf; +@@ -25,4 +118,49 @@ int mnl_common(mnl_func_t mnl_cb, void *arg1, void *arg2); + int mnl_batch_send_and_recv(struct mnl_params *mnl_params, batch_func_t cb, void *args, int *result); + int mnl_buf_send_and_recv(struct mnl_params *mnl_params, buf_func_t cb, void *args, int *result); + ++struct nft_chain_params { ++ char *name; ++ uint32_t hooknum; ++ char *type; ++ uint32_t prio; ++ uint32_t policy; ++}; ++ ++struct nft_set_params { ++ char name[128]; ++
uint32_t id; ++ uint32_t datatype; ++ uint32_t key_len; ++}; ++ ++struct nft_rule_params { ++ char *chain_name; ++ char set_name[128]; ++ uint32_t mark; ++ uint16_t mark_op; ++ uint32_t nfproto; ++ uint8_t l4proto; ++ unsigned int stmt; ++ bool ipv6; ++}; ++ ++struct nft_set_elem_params { ++ char set_name[128]; ++ char data[40]; ++ size_t data_len; ++}; ++ ++struct nf_conn_params { ++ uint8_t family; ++ uint32_t *src_addr; ++ uint16_t src_port; ++ uint32_t *dst_addr; ++ uint16_t dst_port; ++ bool lock; ++ pid_t tree_id; ++}; ++ ++struct inet_sk_desc; ++int nft_connection_switch(struct inet_sk_desc *sk, bool lock, pid_t tree_id); ++ + #endif /* __CR_NFTABLES_H__ */ +diff --git a/criu/nftables.c b/criu/nftables.c +new file mode 100644 +index 0000000..57774e6 +--- /dev/null ++++ b/criu/nftables.c +@@ -0,0 +1,823 @@ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "sk-inet.h" ++#include "nftables.h" ++ ++#include "../soccr/soccr.h" ++ ++#include "log.h" ++ ++static struct nftnl_table *setup_table(uint8_t family, const char *table) ++{ ++ struct nftnl_table *t; ++ ++ t = nftnl_table_alloc(); ++ if (t == NULL) ++ return NULL; ++ ++ nftnl_table_set_u32(t, NFTNL_TABLE_FAMILY, family); ++ if (nftnl_table_set_str(t, NFTNL_TABLE_NAME, table) < 0) ++ goto err; ++ ++ return t; ++err: ++ nftnl_table_free(t); ++ return NULL; ++} ++ ++static struct nftnl_chain *setup_chain(const char *table, ++ struct nft_chain_params *params, ++ bool create) ++{ ++ struct nftnl_chain *c; ++ ++ c = nftnl_chain_alloc(); ++ if (c == NULL) ++ return NULL; ++ ++ if (nftnl_chain_set_str(c, NFTNL_CHAIN_TABLE, table) < 0) ++ goto err; ++ if (nftnl_chain_set_str(c, NFTNL_CHAIN_NAME, params->name) < 0) ++ goto err; ++ if (create) { ++ nftnl_chain_set_u32(c, NFTNL_CHAIN_HOOKNUM, params->hooknum); ++ if (nftnl_chain_set_str(c, NFTNL_CHAIN_TYPE, params->type) < 0) ++ goto err; ++ 
nftnl_chain_set_u32(c, NFTNL_CHAIN_PRIO, params->prio); ++ nftnl_chain_set_u32(c, NFTNL_CHAIN_POLICY, params->policy); ++ } ++ ++ return c; ++err: ++ nftnl_chain_free(c); ++ return NULL; ++} ++ ++static struct nftnl_set *setup_set(uint8_t family, const char *table, ++ struct nft_set_params *params, ++ bool create) ++{ ++ struct nftnl_set *s; ++ ++ s = nftnl_set_alloc(); ++ if (s == NULL) ++ return NULL; ++ ++ if (nftnl_set_set_str(s, NFTNL_SET_TABLE, table) < 0) ++ goto err; ++ if (nftnl_set_set_str(s, NFTNL_SET_NAME, params->name) < 0) ++ goto err; ++ if (create) { ++ nftnl_set_set_u32(s, NFTNL_SET_FAMILY, family); ++ nftnl_set_set_u32(s, NFTNL_SET_ID, params->id); ++ ++ nftnl_set_set_u32(s, NFTNL_SET_KEY_TYPE, params->datatype); ++ nftnl_set_set_u32(s, NFTNL_SET_KEY_LEN, params->key_len); ++ } ++ ++ return s; ++err: ++ nftnl_set_free(s); ++ return NULL; ++} ++ ++static int add_mark(struct nftnl_rule *r, uint32_t meta_key, enum nft_registers dreg) ++{ ++ struct nftnl_expr *e; ++ ++ e = nftnl_expr_alloc("meta"); ++ if (e == NULL) ++ return -1; ++ ++ nftnl_expr_set_u32(e, NFTNL_EXPR_META_KEY, meta_key); ++ nftnl_expr_set_u32(e, NFTNL_EXPR_META_DREG, dreg); ++ ++ nftnl_rule_add_expr(r, e); ++ ++ return 0; ++} ++ ++static int add_proto(struct nftnl_rule *r, enum nft_registers dreg) ++{ ++ struct nftnl_expr *e; ++ ++ e = nftnl_expr_alloc("meta"); ++ if (e == NULL) ++ return -1; ++ ++ nftnl_expr_set_u32(e, NFTNL_EXPR_META_KEY, NFT_META_L4PROTO); ++ nftnl_expr_set_u32(e, NFTNL_EXPR_META_DREG, dreg); ++ ++ nftnl_rule_add_expr(r, e); ++ ++ return 0; ++} ++ ++static int add_payload(struct nftnl_rule *r, uint32_t base, uint32_t dreg, ++ uint32_t offset, uint32_t len) ++{ ++ struct nftnl_expr *e; ++ ++ e = nftnl_expr_alloc("payload"); ++ if (e == NULL) ++ return -1; ++ ++ nftnl_expr_set_u32(e, NFTNL_EXPR_PAYLOAD_BASE, base); ++ nftnl_expr_set_u32(e, NFTNL_EXPR_PAYLOAD_DREG, dreg); ++ nftnl_expr_set_u32(e, NFTNL_EXPR_PAYLOAD_OFFSET, offset); ++ nftnl_expr_set_u32(e, 
NFTNL_EXPR_PAYLOAD_LEN, len); ++ ++ nftnl_rule_add_expr(r, e); ++ ++ return 0; ++} ++ ++static int add_cmp(struct nftnl_rule *r, enum nft_registers sreg, uint32_t op, ++ const void *data, uint32_t data_len) ++{ ++ struct nftnl_expr *e; ++ ++ e = nftnl_expr_alloc("cmp"); ++ if (e == NULL) ++ return -1; ++ ++ nftnl_expr_set_u32(e, NFTNL_EXPR_CMP_SREG, sreg); ++ nftnl_expr_set_u32(e, NFTNL_EXPR_CMP_OP, op); ++ nftnl_expr_set(e, NFTNL_EXPR_CMP_DATA, data, data_len); ++ ++ nftnl_rule_add_expr(r, e); ++ ++ return 0; ++} ++ ++static int add_lookup(struct nftnl_rule *r, enum nft_registers sreg, ++ const char *set) ++{ ++ struct nftnl_expr *e; ++ ++ e = nftnl_expr_alloc("lookup"); ++ if (e == NULL) ++ return -1; ++ ++ if (nftnl_expr_set_str(e, NFTNL_EXPR_LOOKUP_SET, set) < 0) ++ goto err; ++ nftnl_expr_set_u32(e, NFTNL_EXPR_LOOKUP_SREG, sreg); ++ ++ nftnl_rule_add_expr(r, e); ++ ++ return 0; ++err: ++ nftnl_expr_free(e); ++ return -1; ++} ++ ++static int add_counter(struct nftnl_rule *r) ++{ ++ struct nftnl_expr *e; ++ ++ e = nftnl_expr_alloc("counter"); ++ if (e == NULL) ++ return -1; ++ ++ nftnl_rule_add_expr(r, e); ++ ++ return 0; ++} ++ ++static int add_verdict(struct nftnl_rule *r, const char *chain, int verdict) ++{ ++ struct nftnl_expr *e; ++ ++ e = nftnl_expr_alloc("immediate"); ++ if (e == NULL) ++ return -1; ++ ++ nftnl_expr_set_u32(e, NFTNL_EXPR_IMM_DREG, NFT_REG_VERDICT); ++ nftnl_expr_set_u32(e, NFTNL_EXPR_IMM_VERDICT, verdict); ++ ++ nftnl_rule_add_expr(r, e); ++ ++ return 0; ++} ++ ++static int __setup_rule(struct nftnl_rule *r, struct nft_rule_params *params) ++{ ++ /* meta nfproto == */ ++ if (add_mark(r, NFT_META_PROTOCOL, NFT_REG32_00) < 0) ++ return -1; ++ if (add_cmp(r, NFT_REG32_00, NFT_CMP_EQ, &params->nfproto, sizeof(uint32_t))< 0) ++ return -1; ++ ++ /* meta l4proto == */ ++ if (add_proto(r, NFT_REG32_00) < 0) ++ return -1; ++ if (add_cmp(r, NFT_REG32_00, NFT_CMP_EQ, &params->l4proto, sizeof(uint8_t)) < 0) ++ return -1; ++ ++ /* ip saddr . sport . daddr .
dport @ */ ++ if (params->ipv6 == false) { ++ if (add_payload(r, NFT_PAYLOAD_NETWORK_HEADER, NFT_REG32_00, ++ offsetof(struct iphdr, saddr), IPADDR_LEN) < 0) ++ return -1; ++ if (add_payload(r, NFT_PAYLOAD_TRANSPORT_HEADER, NFT_REG32_01, ++ offsetof(struct tcphdr, source), INET_SERVICE_LEN) < 0) ++ return -1; ++ if (add_payload(r, NFT_PAYLOAD_NETWORK_HEADER, NFT_REG32_02, ++ offsetof(struct iphdr, daddr), IPADDR_LEN) < 0) ++ return -1; ++ if (add_payload(r, NFT_PAYLOAD_TRANSPORT_HEADER, NFT_REG32_03, ++ offsetof(struct tcphdr, dest), INET_SERVICE_LEN) < 0) ++ return -1; ++ ++ if (add_lookup(r, NFT_REG32_00, params->set_name) < 0) ++ return -1; ++ } else { ++ if (add_payload(r, NFT_PAYLOAD_NETWORK_HEADER, NFT_REG32_00, ++ offsetof(struct ipv6hdr, saddr), IP6ADDR_LEN) < 0) ++ return -1; ++ if (add_payload(r, NFT_PAYLOAD_TRANSPORT_HEADER, NFT_REG32_04, ++ offsetof(struct tcphdr, source), INET_SERVICE_LEN) < 0) ++ return -1; ++ if (add_payload(r, NFT_PAYLOAD_NETWORK_HEADER, NFT_REG32_05, ++ offsetof(struct ipv6hdr, daddr), IP6ADDR_LEN) < 0) ++ return -1; ++ if (add_payload(r, NFT_PAYLOAD_TRANSPORT_HEADER, NFT_REG32_09, ++ offsetof(struct tcphdr, dest), INET_SERVICE_LEN) < 0) ++ return -1; ++ ++ if (add_lookup(r, NFT_REG32_00, params->set_name) < 0) ++ return -1; ++ } ++ ++ /* counter */ ++ if (add_counter(r) < 0) ++ return -1; ++ ++ return 0; ++} ++ ++static struct nftnl_rule *setup_rule(uint8_t family, const char *table, ++ struct nft_rule_params *params, ++ bool create, bool ns) ++{ ++ struct nftnl_rule *r = NULL; ++ ++ r = nftnl_rule_alloc(); ++ if (r == NULL) ++ return NULL; ++ ++ if (nftnl_rule_set_str(r, NFTNL_RULE_TABLE, table) < 0) ++ goto err; ++ nftnl_rule_set_u32(r, NFTNL_RULE_FAMILY, family); ++ if (nftnl_rule_set_str(r, NFTNL_RULE_CHAIN, params->chain_name) < 0) ++ goto err; ++ ++ if (params->mark != 0) { ++ /* meta mark != */ ++ if (add_mark(r, NFT_META_MARK, NFT_REG32_00) < 0) ++ goto err; ++ if (add_cmp(r, NFT_REG32_00, params->mark_op, &params->mark,
sizeof(uint32_t)) < 0) ++ goto err; ++ } ++ ++ if (!ns && __setup_rule(r, params) < 0) ++ goto err; ++ ++ /* drop */ ++ if (add_verdict(r, params->chain_name, params->stmt) < 0) ++ goto err; ++ ++ return r; ++ ++err: ++ nftnl_rule_free(r); ++ return NULL; ++} ++ ++static struct nlmsghdr *nft_table_detect(struct mnl_params *mnl_params, void *args) ++{ ++ struct nftnl_table *table; ++ ++ table = setup_table(NFPROTO_INET, TABLE_NAME); ++ if (table == NULL) ++ return NULL; ++ ++ return construct_table_buf(mnl_params->buf, NFT_MSG_GETTABLE, NFPROTO_INET, ++ NLM_F_ACK, mnl_params->seq++, table); ++} ++ ++static int nft_table_create(struct mnl_params *mnl_params, void *args) ++{ ++ struct nftnl_table *table; ++ ++ table = setup_table(NFPROTO_INET, TABLE_NAME); ++ if (table == NULL) ++ return -1; ++ ++ construct_table_batch(mnl_params->batch, NFT_MSG_NEWTABLE, NFPROTO_INET, ++ NLM_F_CREATE|NLM_F_EXCL|NLM_F_ACK, ++ mnl_params->seq++, table); ++ ++ return 0; ++} ++ ++static int nft_table_prepare(struct mnl_params *mnl_params) ++{ ++ int result = 0; ++ ++ if (mnl_buf_send_and_recv(mnl_params, nft_table_detect, NULL, &result) == 0) ++ return 0; ++ ++ pr_debug("%s: detect table result %d\n", __func__, result); ++ ++ if (result == ENOENT && ++ (mnl_batch_send_and_recv(mnl_params, nft_table_create, NULL, &result) < 0 ++ && (result != 0 && result != EEXIST))) { ++ pr_err("%s: create nftables table failed!\n", __func__); ++ return -1; ++ } else if (result != 0) { ++ pr_err("%s: detect table result %d\n", __func__, -result); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static struct nlmsghdr *nft_chain_detect(struct mnl_params *mnl_params, void *args) ++{ ++ struct nftnl_chain *chain; ++ ++ chain = setup_chain(TABLE_NAME, args, false); ++ if (chain == NULL) ++ return NULL; ++ ++ return construct_chain_buf(mnl_params->buf, NFT_MSG_GETCHAIN, NFPROTO_INET, ++ NLM_F_ACK, mnl_params->seq++, chain); ++} ++ ++static int nft_chain_create(struct mnl_params *mnl_params, void *args) ++{ ++ 
struct nftnl_chain *chain; ++ ++ chain = setup_chain(TABLE_NAME, args, true); ++ if (chain == NULL) ++ return -1; ++ ++ construct_chain_batch(mnl_params->batch, NFT_MSG_NEWCHAIN, NFPROTO_INET, ++ NLM_F_CREATE|NLM_F_EXCL|NLM_F_ACK, mnl_params->seq++, chain); ++ ++ return 0; ++} ++ ++static int nft_chain_prepare_internal(struct mnl_params *mnl_params, ++ struct nft_chain_params *params) ++{ ++ int result = 0; ++ ++ if (mnl_buf_send_and_recv(mnl_params, nft_chain_detect, params, &result) == 0) ++ return 0; ++ ++ pr_debug("%s: detect chain result %d\n", __func__, result); ++ ++ if (result == ENOENT && ++ (mnl_batch_send_and_recv(mnl_params, nft_chain_create, params, &result) < 0 ++ && (result != 0 && result != EEXIST))) { ++ pr_err("%s: nftables create chain %s failed!\n", ++ __func__, params->name); ++ return -1; ++ } else if (result != 0) { ++ pr_err("%s: detect chain result %d\n", __func__, -result); ++ return -1; ++ } ++ ++ return result; ++} ++ ++static int nft_chain_prepare(struct mnl_params *mnl_params) ++{ ++ struct nft_chain_params params = { ++ .type = "filter", ++ .prio = NF_IP_PRI_FILTER, ++ .policy = NF_ACCEPT, ++ }; ++ ++ /* prepare ipv4 input chain in filter table */ ++ params.name = INPUT_CHAIN_NAME; ++ params.hooknum = NF_INET_LOCAL_IN; ++ ++ if (nft_chain_prepare_internal(mnl_params, &params) < 0) ++ return -1; ++ ++ /* prepare ipv4 output chain in filter table */ ++ params.name = OUTPUT_CHAIN_NAME; ++ params.hooknum = NF_INET_LOCAL_OUT; ++ ++ if (nft_chain_prepare_internal(mnl_params, &params) < 0) ++ return -1; ++ ++ return 0; ++} ++ ++static int nft_set_internal(uint8_t family, struct mnl_params *mnl_params, ++ struct nft_set_params *params, bool create) ++{ ++ struct nftnl_set *set; ++ ++ set = setup_set(family, TABLE_NAME, params, create); ++ if (set == NULL) ++ return -1; ++ ++ if (create) { ++ construct_set_batch(mnl_params->batch, NFT_MSG_NEWSET, family, ++ NLM_F_CREATE|NLM_F_EXCL|NLM_F_ACK, mnl_params->seq++, set); ++ } else { ++
construct_set_batch(mnl_params->batch, NFT_MSG_DELSET, family, ++ 0, mnl_params->seq++, set); ++ } ++ ++ return 0; ++} ++ ++static int nft_set_raw(struct mnl_params *mnl_params, ++ struct mnl_cb_params *args, bool input) ++{ ++ const uint32_t set_id_base = input ? 0x12315 : 0x17173; ++ const uint8_t family = NFPROTO_INET; ++ struct nft_set_params params = { 0 }; ++ char *set_name; ++ int idx = 0; ++ ++ if (!args->ipv6) { ++ params.datatype = IPv4_KEY_TYPE; ++ params.key_len = IPv4_KEY_LEN; ++ idx = 4; ++ } else { ++ params.datatype = IPv6_KEY_TYPE; ++ params.key_len = IPv6_KEY_LEN; ++ idx = 6; ++ } ++ ++ if (args->ipv6 && input) ++ set_name = INPUT_IPV6_SET_NAME; ++ else if (args->ipv6 && !input) ++ set_name = OUTPUT_IPV6_SET_NAME; ++ else if (!args->ipv6 && input) ++ set_name = INPUT_IPV4_SET_NAME; ++ else ++ set_name = OUTPUT_IPV4_SET_NAME; ++ ++ snprintf(params.name, sizeof(params.name)-1, set_name, args->tree_id); ++ params.id = set_id_base + args->tree_id + idx; ++ ++ if (nft_set_internal(family, mnl_params, &params, args->create) < 0) { ++ pr_err("%s: create nftables %s %s set failed!\n", __func__, ++ args->ipv6 ? "ipv6" : "ipv4", ++ input ?
"input" : "output"); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static int nft_set(struct mnl_params *mnl_params, void *args) ++{ ++ struct mnl_cb_params *params = args; ++ ++ params->ipv6 = false; ++ if (nft_set_raw(mnl_params, params, true) < 0) ++ return -1; ++ ++ if (nft_set_raw(mnl_params, params, false) < 0) ++ return -1; ++ ++ params->ipv6 = true; ++ if (nft_set_raw(mnl_params, params, true) < 0) ++ return -1; ++ ++ if (nft_set_raw(mnl_params, params, false) < 0) ++ return -1; ++ ++ return 0; ++} ++ ++static int nft_set_common(struct mnl_params *mnl_params, pid_t tree_id, bool create) ++{ ++ struct mnl_cb_params params = { ++ .tree_id = tree_id, ++ .create = create, ++ }; ++ int result = 0; ++ ++ if (create && ++ (mnl_batch_send_and_recv(mnl_params, nft_set, &params, &result) < 0 ++ && (result != 0 && result != EEXIST))) { ++ pr_err("%s: create set failed!\n", __func__); ++ return -1; ++ } else if (!create && ++ mnl_batch_send_and_recv(mnl_params, nft_set, &params, NULL) < 0) { ++ pr_err("%s: delete set failed!\n", __func__); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static int nft_rule_internal(uint8_t family, struct mnl_params *mnl_params, ++ struct nft_rule_params *params, bool create) ++{ ++ struct nftnl_rule *rule; ++ ++ rule = setup_rule(family, TABLE_NAME, params, create, false); ++ if (rule == NULL) ++ return -1; ++ ++ if (create) { ++ construct_rule_batch(mnl_params->batch, NFT_MSG_NEWRULE, family, ++ NLM_F_CREATE|NLM_F_EXCL|NLM_F_ACK, ++ mnl_params->seq++, rule); ++ } else { ++ construct_rule_batch(mnl_params->batch, NFT_MSG_DELRULE, family, ++ 0, mnl_params->seq++, rule); ++ } ++ ++ return 0; ++} ++ ++static int nft_rule_raw(struct mnl_params *mnl_params, struct mnl_cb_params *args, ++ struct nft_rule_params *params) ++{ ++ char *set_name; ++ ++ params->nfproto = params->ipv6 ? htons(ETH_P_IPV6) : htons(ETH_P_IP); ++ ++ set_name = params->ipv6 ?
INPUT_IPV6_SET_NAME : INPUT_IPV4_SET_NAME; ++ params->chain_name = INPUT_CHAIN_NAME; ++ snprintf(params->set_name, sizeof(params->set_name)-1, set_name, args->tree_id); ++ if (nft_rule_internal(NFPROTO_INET, mnl_params, params, args->create) < 0) { ++ pr_err("%s: create nft %s input rule failed!\n", ++ __func__, params->ipv6 ? "ipv6" : "ipv4"); ++ return -1; ++ } ++ ++ set_name = params->ipv6 ? OUTPUT_IPV6_SET_NAME : OUTPUT_IPV4_SET_NAME; ++ params->chain_name = OUTPUT_CHAIN_NAME; ++ snprintf(params->set_name, sizeof(params->set_name)-1, set_name, args->tree_id); ++ if (nft_rule_internal(NFPROTO_INET, mnl_params, params, args->create) < 0) { ++ pr_err("%s: create nftables %s output rule failed!\n", ++ __func__, params->ipv6 ? "ipv6" : "ipv4"); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static int nft_rule(struct mnl_params *mnl_params, void *args) ++{ ++ struct nft_rule_params params = { ++ .l4proto = IPPROTO_TCP, ++ .mark = SOCCR_MARK, ++ .mark_op = NFT_CMP_NEQ, ++ .stmt = NF_DROP, ++ }; ++ ++ params.ipv6 = false; ++ if (nft_rule_raw(mnl_params, args, &params) < 0) ++ return -1; ++ ++ params.ipv6 = true; ++ if (nft_rule_raw(mnl_params, args, &params) < 0) ++ return -1; ++ ++ return 0; ++} ++ ++static int nft_rule_common(struct mnl_params *mnl_params, pid_t tree_id, bool create) ++{ ++ struct mnl_cb_params params = { ++ .tree_id = tree_id, ++ .create = create, ++ }; ++ int result = 0; ++ ++ if (create && ++ (mnl_batch_send_and_recv(mnl_params, nft_rule, &params, &result) < 0 ++ && (result != 0 && result != EEXIST))) { ++ pr_err("%s: create rule failed!\n", __func__); ++ return -1; ++ } else if (!create && ++ mnl_batch_send_and_recv(mnl_params, nft_rule, &params, NULL) < 0) { ++ pr_err("%s: delete rule failed!\n", __func__); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static int network_prepare_internal(struct mnl_params *params, batch_func_t _, void *args) ++{ ++ pid_t tree_id = *(pid_t *)args; ++ ++ if (nft_table_prepare(params) < 0) ++ return -1; ++ ++ if
(nft_chain_prepare(params) < 0) ++ return -1; ++ ++ if (nft_set_common(params, tree_id, true) < 0) ++ return -1; ++ ++ if (nft_rule_common(params, tree_id, true) < 0) ++ return -1; ++ ++ return 0; ++} ++ ++int network_prepare(pid_t tree_id) ++{ ++ pr_info("Prepare network\n"); ++ ++ return mnl_common(network_prepare_internal, NULL, &tree_id); ++} ++ ++static int network_unprepare_internal(struct mnl_params *params, ++ batch_func_t _, void *args) ++{ ++ pid_t tree_id = *(pid_t *)args; ++ ++ if (nft_rule_common(params, tree_id, false) < 0) ++ return -1; ++ ++ if (nft_set_common(params, tree_id, false) < 0) ++ return -1; ++ ++ return 0; ++} ++ ++void network_unprepare(pid_t tree_id) ++{ ++ pr_info("Unprepare network\n"); ++ ++ mnl_common(network_unprepare_internal, NULL, &tree_id); ++} ++ ++static int add_set_elem_internal(struct nftnl_set *s, void *data, size_t len) ++{ ++ struct nftnl_set_elem *e; ++ ++ e = nftnl_set_elem_alloc(); ++ if (e == NULL) ++ return -1; ++ ++ nftnl_set_elem_set(e, NFTNL_SET_ELEM_KEY, data, len); ++ ++ nftnl_set_elem_add(s, e); ++ ++ return 0; ++} ++ ++static struct nftnl_set *add_set_elem(const char *table, const char *set, ++ void *data, size_t len) ++{ ++ struct nftnl_set *s; ++ ++ s = nftnl_set_alloc(); ++ if (s == NULL) ++ return NULL; ++ ++ if (nftnl_set_set_str(s, NFTNL_SET_TABLE, table) < 0) ++ goto err; ++ if (nftnl_set_set_str(s, NFTNL_SET_NAME, set) < 0) ++ goto err; ++ ++ if (add_set_elem_internal(s, data, len) < 0) ++ goto err; ++ ++ return s; ++ ++err: ++ nftnl_set_free(s); ++ return NULL; ++} ++ ++static int nft_set_elem(uint8_t family, struct mnl_params *mnl_param, ++ struct nft_set_elem_params *elem_param, ++ bool lock) ++{ ++ struct nftnl_set *set; ++ ++ set = add_set_elem(TABLE_NAME, elem_param->set_name, ++ elem_param->data, elem_param->data_len); ++ if (set == NULL) ++ return -1; ++ ++ if (lock) { ++ construct_set_elems_batch(mnl_param->batch, NFT_MSG_NEWSETELEM, ++ family, NLM_F_CREATE|NLM_F_EXCL, ++ mnl_param->seq++, 
set); ++ } else { ++ construct_set_elems_batch(mnl_param->batch, NFT_MSG_DELSETELEM, ++ family, 0, mnl_param->seq++, set); ++ } ++ ++ return 0; ++} ++ ++static void construct_set_elem_key(void *data, struct nf_conn_params *param, bool output) ++{ ++ size_t offset = 0; ++ size_t addr_len = param->family == AF_INET ? IPADDR_LEN : IP6ADDR_LEN; ++ ++ memcpy(data+offset, output ? param->src_addr : param->dst_addr, addr_len); ++ offset = addr_len; ++ *(uint32_t *)(data + offset) = htons(output ? param->src_port : param->dst_port); ++ offset += sizeof(uint32_t); ++ memcpy(data+offset, output ? param->dst_addr : param->src_addr, addr_len); ++ offset += addr_len; ++ *(uint32_t *)(data + offset) = htons(output ? param->dst_port : param->src_port); ++} ++ ++static int nf_connection_switch_raw(struct mnl_params *mnl_params, void *args) ++{ ++ struct nf_conn_params *param = args; ++ char *input_set_name, *output_set_name; ++ struct nft_set_elem_params elem; ++ ++ switch (param->family) { ++ case AF_INET: ++ input_set_name = INPUT_IPV4_SET_NAME; ++ output_set_name = OUTPUT_IPV4_SET_NAME; ++ elem.data_len = IPv4_KEY_LEN; ++ break; ++ case AF_INET6: ++ input_set_name = INPUT_IPV6_SET_NAME; ++ output_set_name = OUTPUT_IPV6_SET_NAME; ++ elem.data_len = IPv6_KEY_LEN; ++ break; ++ default: ++ pr_err("Unknown socket family %d\n", param->family); ++ return -1; ++ } ++ ++ construct_set_elem_key(elem.data, param, false); ++ snprintf(elem.set_name, sizeof(elem.set_name)-1, input_set_name, param->tree_id); ++ if (nft_set_elem(NFPROTO_INET, mnl_params, &elem, param->lock) < 0) ++ return -1; ++ ++ construct_set_elem_key(elem.data, param, true); ++ snprintf(elem.set_name, sizeof(elem.set_name)-1, output_set_name, param->tree_id); ++ if (nft_set_elem(NFPROTO_INET, mnl_params, &elem, param->lock) < 0) ++ return -1; ++ ++ return 0; ++} ++ ++/* IPv4-Mapped IPv6 Addresses */ ++static int ipv6_addr_mapped(uint32_t *addr) ++{ ++ return (addr[2] == htonl(0x0000ffff)); ++} ++ ++int 
nft_connection_switch(struct inet_sk_desc *sk, bool lock, pid_t tree_id) ++{ ++ char sip[INET_ADDR_LEN], dip[INET_ADDR_LEN]; ++ struct nf_conn_params param = { ++ .family = sk->sd.family, ++ .src_addr = sk->src_addr, ++ .src_port = sk->src_port, ++ .dst_addr = sk->dst_addr, ++ .dst_port = sk->dst_port, ++ .lock = lock, ++ .tree_id = tree_id, ++ }; ++ ++ if (param.family == AF_INET6 && ipv6_addr_mapped(param.dst_addr)) { ++ param.family = AF_INET; ++ param.src_addr = &param.src_addr[3]; ++ param.dst_addr = &param.dst_addr[3]; ++ } ++ ++ if (!inet_ntop(param.family, (void *)param.src_addr, sip, INET_ADDR_LEN) || ++ !inet_ntop(param.family, (void *)param.dst_addr, dip, INET_ADDR_LEN)) { ++ pr_perror("nf: Can't translate ip addr"); ++ return -1; ++ } ++ ++ pr_info("%s %s:%d - %s:%d connection\n", lock ? "Locked" : "Unlocked", ++ sip, (int)param.src_port, dip, (int)param.dst_port); ++ ++ return mnl_sendmsg(nf_connection_switch_raw, &param); ++} +-- +2.34.1 + diff --git a/0060-net-switch-to-nftables-API.patch b/0060-net-switch-to-nftables-API.patch new file mode 100644 index 0000000..7cc7007 --- /dev/null +++ b/0060-net-switch-to-nftables-API.patch @@ -0,0 +1,55 @@ +From 073ed2ef448fb073aa3c6f0552e120e3e98a8906 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Wed, 13 Apr 2022 14:30:54 +0800 +Subject: [PATCH 60/72] net: switch to nftables API + +This is fake patch + +Signed-off-by: fu.lin +--- + criu/config.c | 2 ++ + criu/crtools.c | 1 + + criu/include/cr_options.h | 2 ++ + 3 files changed, 5 insertions(+) + +diff --git a/criu/config.c b/criu/config.c +index c0358e5..7c4e230 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -711,6 +711,8 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, + { "reserve-ports", required_argument, 0, 'P' }, + BOOL_OPT("parallel", &opts.parallel), + { "exec-pin-start", required_argument, 0, 2002 }, ++ BOOL_OPT("use-nft", &opts.use_nft), ++ BOOL_OPT("async-clear-nft", &opts.async_clear_nft), + {}, + }; + +diff --git
a/criu/crtools.c b/criu/crtools.c +index 40e2d51..c555213 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -474,6 +474,7 @@ usage: + " --reserve-ports Reserve src ports in kernel\n" + " --parallel Collect smaps parallel to accellrate dumping speed\n" + " --exec-pin-start Exec file map's pin start index\n" ++ " --use Use nft API instead of iptables cmd in network locking\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index a64e977..6dadaba 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -203,6 +203,8 @@ struct cr_options { + int reserve_ports; + int parallel; + int exec_pin_start; ++ int use_nft; ++ int async_clear_nft; + }; + + extern struct cr_options opts; +-- +2.34.1 + diff --git a/0061-zdtm-unlink-kdat-before-testing.patch b/0061-zdtm-unlink-kdat-before-testing.patch new file mode 100644 index 0000000..a09395f --- /dev/null +++ b/0061-zdtm-unlink-kdat-before-testing.patch @@ -0,0 +1,46 @@ +From 926affe76a99871f9a95f3381190bd3fb601e6ec Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Fri, 21 Jan 2022 14:46:21 +0800 +Subject: [PATCH 61/72] zdtm: unlink kdat before testing + +Signed-off-by: fu.lin +--- + test/zdtm.py | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/test/zdtm.py b/test/zdtm.py +index 0feece0..1b2c7da 100755 +--- a/test/zdtm.py ++++ b/test/zdtm.py +@@ -24,6 +24,7 @@ import sys + import tempfile + import time + import socket ++import pathlib + from builtins import (input, int, open, range, str, zip) + + import pycriu as crpc +@@ -2662,6 +2663,9 @@ rp.add_argument("--pre-dump-mode", + help="Use splice or read mode of pre-dumping", + choices=['splice', 'read'], + default='splice') ++rp.add_argument("--kdat", ++ help="Path to criu.kdat, default '/run/criu.kdat'", ++ default="/run/criu.kdat") + + lp = sp.add_parser("list", help="List tests") + 
lp.set_defaults(action=list_tests) +@@ -2692,6 +2696,10 @@ if opts['debug']: + + if opts['action'] == 'run': + criu.available() ++ # remove kdat file before testing ++ kdat = pathlib.Path(opts['kdat']) ++ if kdat.exists(): ++ kdat.unlink() + for tst in test_classes.values(): + tst.available() + +-- +2.34.1 + diff --git a/0062-zdtm-add-host-ns-sysvshm-ipc-case.patch b/0062-zdtm-add-host-ns-sysvshm-ipc-case.patch new file mode 100644 index 0000000..5a98f4e --- /dev/null +++ b/0062-zdtm-add-host-ns-sysvshm-ipc-case.patch @@ -0,0 +1,302 @@ +From 3d945368250958f5ebf3b4053e07c816adafba33 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Fri, 21 Jan 2022 17:20:05 +0800 +Subject: [PATCH 62/72] zdtm: add host ns sysvshm ipc case + +--- + test/zdtm/Makefile | 2 +- + test/zdtm/customization/Makefile | 53 ++++++++ + test/zdtm/customization/ipc.c | 202 +++++++++++++++++++++++++++++++ + test/zdtm/customization/ipc.desc | 1 + + 4 files changed, 257 insertions(+), 1 deletion(-) + create mode 100644 test/zdtm/customization/Makefile + create mode 100644 test/zdtm/customization/ipc.c + create mode 100644 test/zdtm/customization/ipc.desc + +diff --git a/test/zdtm/Makefile b/test/zdtm/Makefile +index 24a33f2..8f9857b 100644 +--- a/test/zdtm/Makefile ++++ b/test/zdtm/Makefile +@@ -1,4 +1,4 @@ +-SUBDIRS := lib static transition ++SUBDIRS := lib static transition customization + + all: $(SUBDIRS) + .PHONY: all $(SUBDIRS) +diff --git a/test/zdtm/customization/Makefile b/test/zdtm/customization/Makefile +new file mode 100644 +index 0000000..563b7b1 +--- /dev/null ++++ b/test/zdtm/customization/Makefile +@@ -0,0 +1,53 @@ ++LIBDIR := ../lib ++LIB := $(LIBDIR)/libzdtmtst.a ++LDLIBS += $(LIB) ++CPPFLAGS += -I$(LIBDIR) ++ ++TST = \ ++ ipc ++ ++SRC = $(TST:%=%.c) ++OBJ = $(SRC:%.c=%.o) ++DEP = $(SRC:%.c=%.d) ++PID = $(TST:%=%.pid) ++OUT = $(TST:%=%.out) ++ ++include ../Makefile.inc ++ ++all: $(TST) ++install: all ++.PHONY: all install ++ ++$(TST:%=%.pid): %.pid: % ++ $(/dev/null` 2>/dev/null || 
break; \ ++ sleep 1; \ ++ done ++ ++$(TST): | $(LIB) ++ ++%: %.sh ++ cp $< $@ ++ chmod +x $@ ++ ++$(LIB): force ++ $(Q) $(MAKE) -C $(LIBDIR) ++ ++.PHONY: force start check_start stop wait_stop +diff --git a/test/zdtm/customization/ipc.c b/test/zdtm/customization/ipc.c +new file mode 100644 +index 0000000..2b3c2b1 +--- /dev/null ++++ b/test/zdtm/customization/ipc.c +@@ -0,0 +1,202 @@ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "zdtmtst.h" ++ ++const char *test_doc="Tests ipc sems and shmems migrate fine"; ++const char *test_author="Pavel Emelianov "; ++ ++static struct sembuf unlock = { ++ .sem_op = 1, ++ .sem_num = 0, ++ .sem_flg = 0, ++}; ++ ++static struct sembuf lock = { ++ .sem_op = -1, ++ .sem_num = 0, ++ .sem_flg = 0, ++}; ++ ++#define DEF_MEM_SIZE (40960) ++unsigned int shmem_size = DEF_MEM_SIZE; ++TEST_OPTION(shmem_size, uint, "Size of shared memory segment", 0); ++ ++#define INIT_CRC (~0) ++ ++#define POISON 0xac ++static inline void poison_area(int *mem) ++{ ++ memset(mem, POISON, shmem_size); ++} ++ ++static int child(key_t key) ++{ ++ int sem, shm, ret, res = 0; ++ uint8_t *mem; ++ uint32_t crc; ++ ++ sem = semget(key, 1, 0777); ++ if (sem == -1) ++ return -1; ++ shm = shmget(key, shmem_size, 0777); ++ if (shm == -1) ++ return -2; ++ mem = shmat(shm, NULL, 0); ++ if (mem == (uint8_t *)-1) ++ return -3; ++ ++ while (test_go()) { ++ ret = semop(sem, &lock, 1); ++ if (ret) { ++ if (errno == EINTR) ++ continue; ++ fail("Error in semop lock"); ++ res = errno; ++ break; ++ } ++ crc = INIT_CRC; ++ datagen(mem, shmem_size, &crc); ++ while ((ret = semop(sem, &unlock, 1)) && (errno == EINTR)); ++ if (ret) { ++ fail("Error in semop unlock"); ++ res = errno; ++ break; ++ } ++ } ++ shmdt(mem); ++ return res; ++} ++ ++int main(int argc, char **argv) ++{ ++ key_t key; ++ int sem, shm, pid1, pid2; ++ int fail_count = 0; ++ uint8_t *mem; ++ uint32_t crc; ++ int ret; 
++ ++ test_init(argc, argv); ++ ++ /* using the large number to fill string length */ ++ key = ftok(argv[0], 1822155650); ++ if (key == -1) { ++ pr_perror("Can't make key"); ++ goto out; ++ } ++ ++ sem = semget(key, 1, 0777 | IPC_CREAT | IPC_EXCL); ++ if (sem == -1) { ++ pr_perror("Can't get sem"); ++ goto out; ++ } ++ ++ if (semctl(sem, 0, SETVAL, 1) == -1) { ++ pr_perror("Can't init sem"); ++ fail_count++; ++ goto out_sem; ++ } ++ ++ shm = shmget(key, shmem_size, 0777 | IPC_CREAT | IPC_EXCL); ++ if (shm == -1) { ++ pr_perror("Can't get shm"); ++ fail_count++; ++ goto out_sem; ++ } ++ ++ mem = shmat(shm, NULL, 0); ++ if (mem == (void *)-1) { ++ pr_perror("Can't attach shm"); ++ fail_count++; ++ goto out_shm; ++ } ++ ++ poison_area((int *)mem); ++ ++ pid1 = test_fork(); ++ if (pid1 == -1) { ++ pr_perror("Can't fork 1st time"); ++ goto out_shdt; ++ } else if (pid1 == 0) ++ exit(child(key)); ++ ++ pid2 = test_fork(); ++ if (pid2 == -1) { ++ pr_perror("Can't fork 2nd time"); ++ fail_count++; ++ goto out_child; ++ } else if (pid2 == 0) ++ exit(child(key)); ++ ++ test_daemon(); ++ while (test_go()) { ++ ret = semop(sem, &lock, 1); ++ if (ret) { ++ if (errno == EINTR) ++ continue; ++ fail_count++; ++ fail("Error in semop lock"); ++ break; ++ } ++ if (mem[0] != POISON) { ++ crc = INIT_CRC; ++ if (datachk(mem, shmem_size, &crc)) { ++ fail_count++; ++ fail("Semaphore protection is broken or " ++ "shmem pages are messed"); ++ semop(sem, &unlock, 1); ++ break; ++ } ++ poison_area((int *)mem); ++ } ++ while ((ret = semop(sem, &unlock, 1)) && (errno == EINTR)); ++ if (ret) { ++ fail_count++; ++ fail("Error in semop unlock"); ++ break; ++ } ++ } ++ test_waitsig(); ++ ++ kill(pid2, SIGTERM); ++ waitpid(pid2, &ret, 0); ++ if (!WIFEXITED(ret)) { ++ fail_count++; ++ pr_perror("Child 2 was killed"); ++ } else if (WEXITSTATUS(ret)) { ++ fail_count++; ++ pr_perror("Child 2 couldn't inititalise"); ++ } ++out_child: ++ kill(pid1, SIGTERM); ++ waitpid(pid1, &ret, 0); ++ if 
(!WIFEXITED(ret)) { ++ fail_count++; ++ pr_perror("Child 1 was killed"); ++ } else if (WEXITSTATUS(ret)) { ++ fail_count++; ++ pr_perror("Child 1 couldn't inititalise"); ++ } ++out_shdt: ++ shmdt(mem); ++out_shm: ++ shmctl(shm, IPC_RMID, NULL); ++out_sem: ++ semctl(sem, 1, IPC_RMID); ++ if (fail_count == 0) ++ pass(); ++out: ++ return 0; ++} +diff --git a/test/zdtm/customization/ipc.desc b/test/zdtm/customization/ipc.desc +new file mode 100644 +index 0000000..63df42a +--- /dev/null ++++ b/test/zdtm/customization/ipc.desc +@@ -0,0 +1 @@ ++{'flavor': 'h'} +-- +2.34.1 + diff --git a/0063-zdtm-add-pinmem-testcase.patch b/0063-zdtm-add-pinmem-testcase.patch new file mode 100644 index 0000000..be9a474 --- /dev/null +++ b/0063-zdtm-add-pinmem-testcase.patch @@ -0,0 +1,2091 @@ +From 4f9fed183bcfda1285d7e99136ff02e3778012ba Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Tue, 25 Jan 2022 19:00:33 +0800 +Subject: [PATCH 63/72] zdtm: add pinmem testcase + +Signed-off-by: fu.lin +--- + test/zdtm.py | 68 ++- + test/zdtm/customization/Makefile | 23 +- + test/zdtm/customization/get_smaps_bits.c | 127 +++++ + test/zdtm/customization/get_smaps_bits.h | 6 + + test/zdtm/customization/ipc.desc | 2 +- + test/zdtm/customization/maps00.c | 271 +++++++++++ + test/zdtm/customization/maps00.desc | 1 + + test/zdtm/customization/maps007.c | 178 +++++++ + test/zdtm/customization/maps007.desc | 1 + + test/zdtm/customization/maps008.c | 514 ++++++++++++++++++++ + test/zdtm/customization/maps008.desc | 1 + + test/zdtm/customization/maps01.c | 183 +++++++ + test/zdtm/customization/maps01.desc | 1 + + test/zdtm/customization/maps02.c | 111 +++++ + test/zdtm/customization/maps02.desc | 1 + + test/zdtm/customization/maps04.c | 57 +++ + test/zdtm/customization/maps04.desc | 1 + + test/zdtm/customization/maps05.c | 91 ++++ + test/zdtm/customization/maps05.desc | 1 + + test/zdtm/customization/maps06.c | 70 +++ + test/zdtm/customization/maps06.desc | 1 + + test/zdtm/customization/maps_file_prot.c | 53 ++ 
+ test/zdtm/customization/maps_file_prot.desc | 1 + + test/zdtm_ct.c | 13 +- + 24 files changed, 1766 insertions(+), 10 deletions(-) + create mode 100644 test/zdtm/customization/get_smaps_bits.c + create mode 100644 test/zdtm/customization/get_smaps_bits.h + create mode 100644 test/zdtm/customization/maps00.c + create mode 100644 test/zdtm/customization/maps00.desc + create mode 100644 test/zdtm/customization/maps007.c + create mode 100644 test/zdtm/customization/maps007.desc + create mode 100644 test/zdtm/customization/maps008.c + create mode 100644 test/zdtm/customization/maps008.desc + create mode 100644 test/zdtm/customization/maps01.c + create mode 100644 test/zdtm/customization/maps01.desc + create mode 100644 test/zdtm/customization/maps02.c + create mode 100644 test/zdtm/customization/maps02.desc + create mode 100644 test/zdtm/customization/maps04.c + create mode 100644 test/zdtm/customization/maps04.desc + create mode 100644 test/zdtm/customization/maps05.c + create mode 100644 test/zdtm/customization/maps05.desc + create mode 100644 test/zdtm/customization/maps06.c + create mode 100644 test/zdtm/customization/maps06.desc + create mode 100644 test/zdtm/customization/maps_file_prot.c + create mode 100644 test/zdtm/customization/maps_file_prot.desc + +diff --git a/test/zdtm.py b/test/zdtm.py +index 1b2c7da..d3b146f 100755 +--- a/test/zdtm.py ++++ b/test/zdtm.py +@@ -367,6 +367,9 @@ def test_flag(tdesc, flag): + return flag in tdesc.get('flags', '').split() + + ++def test_value(tdesc, opt, val): ++ return val in tdesc.get(opt, '').split() ++ + # + # Exception thrown when something inside the test goes wrong, + # e.g. 
test doesn't start, criu returns with non zero code or +@@ -1445,6 +1448,24 @@ class criu: + "check", ["--no-default-config", "-v0", "--feature", feature], + opts['criu_bin']) == 0 + ++ @staticmethod ++ def check_cmdline(cmdline): ++ with open("/proc/cmdline") as f: ++ bootparams = f.readline().strip().split() ++ ++ for arg in cmdline.split(): ++ words = [word.strip("'\" ") for word in arg.split('=')] ++ matched = False ++ for param in bootparams: ++ prefix = param.startswith(words[0]) ++ if (len(words) == 1 and prefix) \ ++ or (len(words) == 2 and prefix and param[len(words[0])+1:] == words[1]): ++ matched = True ++ break ++ if not matched: ++ return True ++ return False ++ + @staticmethod + def available(): + if not os.access(opts['criu_bin'], os.X_OK): +@@ -1516,6 +1537,11 @@ def cr(cr_api, test, opts): + + iters = iter_parm(opts['iters'], 1) + for i in iters[0]: ++ if "--pin-memory" in test.getdopts(): ++ print("Clear pin memory space") ++ cmd = [opts["criu_bin"], "clear-pin-memory"] ++ subprocess.run(cmd, shell=False, check=True) ++ + pres = iter_parm(opts['pre'], 0) + for p in pres[0]: + if opts['snaps']: +@@ -1965,6 +1991,21 @@ class Launcher: + testline = u"ok %d - %s # SKIP %s" % (self.__runtest, name, reason) + print(testline, file=self.__file_report) + ++ def modprobe_pin_memory(self, load): ++ if not load: ++ return ++ else: ++ found = False ++ with open("/proc/modules") as f: ++ for line in f.readlines(): ++ if "pin_memory" == line.split()[0]: ++ found = True ++ if not found: ++ subprocess.check_call(["modprobe", "pin_memory"]) ++ ++ cmd = [opts["criu_bin"], "init-pagemap-read"] ++ subprocess.check_call(cmd, shell=False) ++ + def run_test(self, name, desc, flavor): + + if len(self.__subs) >= self.__max: +@@ -1972,7 +2013,8 @@ class Launcher: + + with open("/proc/sys/kernel/tainted") as taintfd: + taint = taintfd.read() +- if self.__taint != taint: ++ # 0x1000 means the out of tree module has been loaded ++ if self.__taint != taint and 
(int(self.__taint) | 0x1000) != int(taint): + raise Exception("The kernel is tainted: %r (%r)" % + (taint, self.__taint)) + +@@ -1997,8 +2039,15 @@ class Launcher: + logf = None + log = None + ++ no_pid_ns = test_value(desc, 'opts', '--use-fork-pid') ++ zdtm_no_pid_ns = "1" if no_pid_ns else "0" ++ # load `pin_memory.ko`,`--pin-memory` option must be used with ++ # `--use-fork-pid`, so don't care `--pin-memory` option ++ self.modprobe_pin_memory(no_pid_ns) ++ + sub = subprocess.Popen(["./zdtm_ct", "zdtm.py"], +- env=dict(os.environ, CR_CT_TEST_INFO=arg), ++ env=dict(os.environ, CR_CT_TEST_INFO=arg, ++ ZDTM_NO_PID_NS=zdtm_no_pid_ns), + stdout=log, + stderr=subprocess.STDOUT, + close_fds=True) +@@ -2009,7 +2058,8 @@ class Launcher: + "start": time.time() + } + +- if test_flag(desc, 'excl'): ++ # pin memory function don't support concurrency ++ if test_flag(desc, 'excl') or test_value(desc, "opts", "--pin-memory"): + self.wait() + + def __wait_one(self, flags): +@@ -2356,6 +2406,12 @@ def run_tests(opts): + launcher.skip(t, "remote lazy pages are not supported") + continue + ++ cmdline = tdesc.get('cmdline', '') ++ if cmdline and criu.check_cmdline(cmdline): ++ launcher.skip( ++ t, f"cmdline '{cmdline}' isn't support, or don't set") ++ continue ++ + test_flavs = tdesc.get('flavor', 'h ns uns').split() + opts_flavs = (opts['flavor'] or 'h,ns,uns').split(',') + if opts_flavs != ['best']: +@@ -2385,6 +2441,7 @@ def run_tests(opts): + if fail: + sys.exit(1) + ++ + sti_fmt = "%-40s%-10s%s" + + +@@ -2664,8 +2721,8 @@ rp.add_argument("--pre-dump-mode", + choices=['splice', 'read'], + default='splice') + rp.add_argument("--kdat", +- help="Path to criu.kdat, default '/run/criu.kdat'", +- default="/run/criu.kdat") ++ help="Path to criu.kdat, default '/run/criu.kdat'", ++ default="/run/criu.kdat") + + lp = sp.add_parser("list", help="List tests") + lp.set_defaults(action=list_tests) +@@ -2700,6 +2757,7 @@ if opts['action'] == 'run': + kdat = pathlib.Path(opts['kdat']) + if 
kdat.exists(): + kdat.unlink() ++ + for tst in test_classes.values(): + tst.available() + +diff --git a/test/zdtm/customization/Makefile b/test/zdtm/customization/Makefile +index 563b7b1..82348f2 100644 +--- a/test/zdtm/customization/Makefile ++++ b/test/zdtm/customization/Makefile +@@ -3,9 +3,21 @@ LIB := $(LIBDIR)/libzdtmtst.a + LDLIBS += $(LIB) + CPPFLAGS += -I$(LIBDIR) + +-TST = \ +- ipc ++TST_NOFILE = \ ++ ipc \ ++ maps01 \ ++ maps02 \ ++ maps04 \ ++ maps05 \ ++ maps007 \ ++ maps008 + ++TST_FILE = \ ++ maps00 \ ++ maps06 \ ++ maps_file_prot ++ ++TST = $(TST_NOFILE) $(TST_FILE) + SRC = $(TST:%=%.c) + OBJ = $(SRC:%.c=%.o) + DEP = $(SRC:%.c=%.d) +@@ -18,9 +30,12 @@ all: $(TST) + install: all + .PHONY: all install + +-$(TST:%=%.pid): %.pid: % ++$(TST_NOFILE:%=%.pid): %.pid: % + $( ++#include ++#include "zdtmtst.h" ++ ++#ifndef MAP_HUGETLB ++# define MAP_HUGETLB 0x40000 ++#endif ++ ++#ifndef MADV_HUGEPAGE ++# define MADV_HUGEPAGE 14 ++#endif ++ ++#ifndef MADV_NOHUGEPAGE ++# define MADV_NOHUGEPAGE 15 ++#endif ++ ++#ifndef MADV_DONTDUMP ++# define MADV_DONTDUMP 16 ++#endif ++ ++static void parse_vmflags(char *buf, unsigned long *flags, unsigned long *madv) ++{ ++ char *tok; ++ ++ if (!buf[0]) ++ return; ++ ++ tok = strtok(buf, " \n"); ++ if (!tok) ++ return; ++ ++#define _vmflag_match(_t, _s) (_t[0] == _s[0] && _t[1] == _s[1]) ++ ++ do { ++ /* mmap() block */ ++ if (_vmflag_match(tok, "gd")) ++ *flags |= MAP_GROWSDOWN; ++ else if (_vmflag_match(tok, "lo")) ++ *flags |= MAP_LOCKED; ++ else if (_vmflag_match(tok, "nr")) ++ *flags |= MAP_NORESERVE; ++ else if (_vmflag_match(tok, "ht")) ++ *flags |= MAP_HUGETLB; ++ ++ /* madvise() block */ ++ if (_vmflag_match(tok, "sr")) ++ *madv |= (1ul << MADV_SEQUENTIAL); ++ else if (_vmflag_match(tok, "rr")) ++ *madv |= (1ul << MADV_RANDOM); ++ else if (_vmflag_match(tok, "dc")) ++ *madv |= (1ul << MADV_DONTFORK); ++ else if (_vmflag_match(tok, "dd")) ++ *madv |= (1ul << MADV_DONTDUMP); ++ else if (_vmflag_match(tok, "mg")) ++ *madv 
|= (1ul << MADV_MERGEABLE); ++ else if (_vmflag_match(tok, "hg")) ++ *madv |= (1ul << MADV_HUGEPAGE); ++ else if (_vmflag_match(tok, "nh")) ++ *madv |= (1ul << MADV_NOHUGEPAGE); ++ ++ /* ++ * Anything else is just ignored. ++ */ ++ } while ((tok = strtok(NULL, " \n"))); ++ ++#undef _vmflag_match ++} ++ ++#define is_hex_digit(c) \ ++ (((c) >= '0' && (c) <= '9') || \ ++ ((c) >= 'a' && (c) <= 'f') || \ ++ ((c) >= 'A' && (c) <= 'F')) ++ ++static int is_vma_range_fmt(char *line, unsigned long *start, unsigned long *end) ++{ ++ char *p = line; ++ while (*line && is_hex_digit(*line)) ++ line++; ++ ++ if (*line++ != '-') ++ return 0; ++ ++ while (*line && is_hex_digit(*line)) ++ line++; ++ ++ if (*line++ != ' ') ++ return 0; ++ ++ sscanf(p, "%lx-%lx", start, end); ++ return 1; ++} ++ ++int get_smaps_bits(unsigned long where, unsigned long *flags, unsigned long *madv) ++{ ++ unsigned long start = 0, end = 0; ++ FILE *smaps = NULL; ++ char buf[1024]; ++ int found = 0; ++ ++ if (!where) ++ return 0; ++ ++ smaps = fopen("/proc/self/smaps", "r"); ++ if (!smaps) { ++ pr_perror("Can't open smaps"); ++ return -1; ++ } ++ ++ while (fgets(buf, sizeof(buf), smaps)) { ++ is_vma_range_fmt(buf, &start, &end); ++ ++ if (!strncmp(buf, "VmFlags: ", 9) && start == where) { ++ found = 1; ++ parse_vmflags(buf, flags, madv); ++ break; ++ } ++ } ++ ++ fclose(smaps); ++ ++ if (!found) { ++ pr_perror("VmFlags not found for %lx", where); ++ return -1; ++ } ++ ++ return 0; ++} +diff --git a/test/zdtm/customization/get_smaps_bits.h b/test/zdtm/customization/get_smaps_bits.h +new file mode 100644 +index 0000000..ce1070d +--- /dev/null ++++ b/test/zdtm/customization/get_smaps_bits.h +@@ -0,0 +1,6 @@ ++#ifndef ZDTM_GET_SMAPS_BITS_H_ ++#define ZDTM_GET_SMAPS_BITS_H_ ++ ++extern int get_smaps_bits(unsigned long where, unsigned long *flags, unsigned long *madv); ++ ++#endif /* ZDTM_GET_SMAPS_BITS_H_ */ +diff --git a/test/zdtm/customization/ipc.desc b/test/zdtm/customization/ipc.desc +index 
63df42a..4c127a0 100644 +--- a/test/zdtm/customization/ipc.desc ++++ b/test/zdtm/customization/ipc.desc +@@ -1 +1 @@ +-{'flavor': 'h'} ++{'arch': 'aarch64', 'flavor': 'h'} +diff --git a/test/zdtm/customization/maps00.c b/test/zdtm/customization/maps00.c +new file mode 100644 +index 0000000..83533f8 +--- /dev/null ++++ b/test/zdtm/customization/maps00.c +@@ -0,0 +1,271 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "zdtmtst.h" ++ ++const char *test_doc = "Create all sorts of maps and compare /proc/pid/maps\n" ++ "before and after migration\n"; ++const char *test_author = "Pavel Emelianov "; ++ ++char *filename; ++TEST_OPTION(filename, string, "file name", 1); ++ ++const static int map_prots[] = { ++ PROT_NONE, ++ PROT_READ, ++ PROT_READ | PROT_WRITE, ++ PROT_READ | PROT_WRITE | PROT_EXEC, ++}; ++#define NUM_MPROTS sizeof(map_prots) / sizeof(int) ++#define RW_PROT(x) ((x) & (PROT_READ | PROT_WRITE)) ++#define X_PROT(x) ((x) & PROT_EXEC) ++ ++int check_prot(int src_prot, int dst_prot) ++{ ++ if (RW_PROT(src_prot) != RW_PROT(dst_prot)) ++ return 0; ++ /* If exec bit will be enabled may depend on NX capability of CPUs of ++ * source and destination nodes. 
In any case, migrated mapping should ++ * not have less permissions than newly created one ++ ** ++ * A is a subset of B iff (A & B) == A ++ */ ++ return (X_PROT(dst_prot) & X_PROT(src_prot)) == X_PROT(dst_prot); ++} ++ ++const static int map_flags[] = { ++ MAP_PRIVATE, ++ MAP_SHARED, ++ MAP_PRIVATE | MAP_ANONYMOUS, ++ MAP_SHARED | MAP_ANONYMOUS ++}; ++#define NUM_MFLAGS sizeof(map_flags) / sizeof(int) ++#define NUM_MAPS NUM_MPROTS * NUM_MFLAGS ++#define ONE_MAP_SIZE 0x2000 ++ ++struct map ++{ ++ int prot; ++ int prot_real; ++ int flag; ++ char filename[256]; ++ int fd; ++ void *ptr; ++}; ++ ++static void init_map(struct map *map, int prot_no, int flag_no) ++{ ++ map->fd = -1; ++ map->prot = map_prots[prot_no]; ++ map->flag = map_flags[flag_no]; ++} ++ ++static int make_map(struct map *map) ++{ ++ uint32_t crc; ++ uint8_t buf[ONE_MAP_SIZE]; ++ static int i = 0; ++ ++ if (!(map->flag & MAP_ANONYMOUS)) { ++ /* need file */ ++ if (snprintf(map->filename, sizeof(map->filename), ++ "%s-%02d", filename, i++) >= sizeof(map->filename)) { ++ pr_perror("filename %s is too long", filename); ++ return -1; ++ } ++ ++ map->fd = open(map->filename, O_RDWR | O_CREAT, 0600); ++ if (map->fd < 0) { ++ pr_perror("can't open %s", map->filename); ++ return -1; ++ } ++ ++ crc = ~0; ++ datagen(buf, sizeof(buf), &crc); ++ if (write(map->fd, buf, sizeof(buf)) != sizeof(buf)) { ++ pr_perror("failed to write %s", map->filename); ++ return -1; ++ } ++ } ++ ++ map->ptr = mmap(NULL, ONE_MAP_SIZE, map->prot, map->flag, map->fd, 0); ++ if (map->ptr == MAP_FAILED) { ++ pr_perror("can't create mapping"); ++ return -1; ++ } ++ ++ if ((map->flag & MAP_ANONYMOUS) && (map->prot & PROT_WRITE)) { ++ /* can't fill it with data otherwise */ ++ crc = ~0; ++ datagen(map->ptr, ONE_MAP_SIZE, &crc); ++ } ++ ++ test_msg("map: ptr %p flag %8x prot %8x\n", ++ map->ptr, map->flag, map->prot); ++ ++ return 0; ++} ++ ++static sigjmp_buf segv_ret; /* we need sig*jmp stuff, otherwise SIGSEGV will reset our handler */ 
++static void segfault(int signo) ++{ ++ siglongjmp(segv_ret, 1); ++} ++ ++/* ++ * after test func should be placed check map, because size of test_func ++ * is calculated as (check_map-test_func) ++ */ ++int test_func(void) ++{ ++ return 1; ++} ++static int check_map(struct map *map) ++{ ++ int prot = PROT_WRITE | PROT_READ | PROT_EXEC; ++ ++ if (signal(SIGSEGV, segfault) == SIG_ERR) ++ { ++ fail("setting SIGSEGV handler failed: %m\n"); ++ return -1; ++ } ++ if (!sigsetjmp(segv_ret, 1)) ++ { ++ uint32_t crc = ~0; ++ if (datachk(map->ptr, ONE_MAP_SIZE, &crc)) /* perform read access */ ++ if (!(map->flag & MAP_ANONYMOUS) || ++ (map->prot & PROT_WRITE)) { /* anon maps could only be filled when r/w */ ++ fail("CRC mismatch: ptr %p flag %8x prot %8x\n", ++ map->ptr, map->flag, map->prot); ++ return -1; ++ } ++ /* prot |= PROT_READ// need barrier before this line, ++ because compiler change order commands. ++ I finded one method: look at next lines*/ ++ } else ++ prot &= PROT_WRITE | !PROT_READ | PROT_EXEC; ++ ++ if (signal(SIGSEGV, segfault) == SIG_ERR) ++ { ++ fail("setting SIGSEGV handler failed: %m\n"); ++ return -1; ++ } ++ ++ if (!sigsetjmp(segv_ret, 1)) ++ { ++ * (int *) (map->ptr) = 1234; /* perform write access */ ++ } else ++ prot &= !PROT_WRITE | PROT_READ | PROT_EXEC; ++ ++ if (signal(SIGSEGV, segfault) == SIG_ERR) ++ { ++ fail("restoring SIGSEGV handler failed: %m\n"); ++ return -1; ++ } ++ ++ if (!sigsetjmp(segv_ret, 1)) ++ { ++ if (map->prot & PROT_WRITE) { ++ memcpy(map->ptr,test_func, ONE_MAP_SIZE); ++ __builtin___clear_cache(map->ptr, map->ptr+ONE_MAP_SIZE); ++ } else { ++ if (!(map->flag & MAP_ANONYMOUS)) { ++ uint8_t funlen = (uint8_t *)check_map - (uint8_t *)test_func; ++ lseek(map->fd,0,SEEK_SET); ++ if (write(map->fd,test_func,funlen)<funlen) { ++ pr_perror("failed to write %s", map->filename); ++ return -1; ++ } ++ } ++ } ++ if (!(map->flag & MAP_ANONYMOUS) || (map->prot & PROT_WRITE)) { ++ /* Function body has been copied into the mapping */ ++ ((int (*)(void))map->ptr)(); /* perform exec access 
*/ ++ } else { ++ /* No way to copy function body into mapping, ++ * clear exec bit from effective protection ++ */ ++ prot &= PROT_WRITE | PROT_READ | !PROT_EXEC; ++ } ++ } else ++ prot &= PROT_WRITE | PROT_READ | !PROT_EXEC; ++ ++ if (signal(SIGSEGV, SIG_DFL) == SIG_ERR) ++ { ++ fail("restoring SIGSEGV handler failed: %m\n"); ++ return -1; ++ } ++ ++ return prot; ++} ++ ++static void destroy_map(struct map *map) ++{ ++ munmap(map->ptr, ONE_MAP_SIZE); ++ ++ if (map->fd >= 0) ++ { ++ close(map->fd); ++ unlink(map->filename); ++ } ++} ++ ++ ++#define MAPS_LEN 0x10000 ++ ++int main(int argc, char ** argv) ++{ ++ struct map maps[NUM_MAPS] = {}, maps_compare[NUM_MAPS] = {}; ++ int i, j, k; ++ test_init(argc, argv); ++ ++ k = 0; ++ for (i = 0; i < NUM_MPROTS; i++) ++ for (j = 0; j < NUM_MFLAGS; j++) ++ init_map(maps + k++, i, j); ++ ++ for (i = 0; i < NUM_MAPS; i++) ++ if (make_map(maps + i)) ++ goto err; ++ ++ test_daemon(); ++ test_waitsig(); ++ ++ for (i = 0; i < NUM_MAPS; i++) ++ if ((maps[i].prot_real=check_map(maps + i))<0) ++ goto err; ++ k=0; ++ for (i = 0; i < NUM_MPROTS; i++) ++ for (j = 0; j < NUM_MFLAGS; j++) ++ init_map(maps_compare + k++, i, j); ++ for (i = 0; i < NUM_MAPS; i++) ++ if (make_map(maps_compare+ i)) ++ goto err; ++ for (i = 0; i < NUM_MAPS; i++) ++ if ((maps_compare[i].prot_real=check_map(maps_compare + i))<0) ++ goto err; ++ for (i = 0; i< NUM_MAPS; i++) ++ if (!check_prot(maps[i].prot_real, maps_compare[i].prot_real)){ ++ fail("protection on %i (flag=%d prot=%d) maps has changed (prot=%d(expected %d))", ++ i, maps[i].flag, maps[i].prot, maps[i].prot_real, maps_compare[i].prot_real); ++ goto err; ++ } ++ ++ pass(); ++ ++ for (i = 0; i < NUM_MAPS; i++) { ++ destroy_map(maps + i); ++ destroy_map(maps_compare + i); ++ } ++ return 0; ++ ++err: ++ return 1; ++} +diff --git a/test/zdtm/customization/maps00.desc b/test/zdtm/customization/maps00.desc +new file mode 100644 +index 0000000..dad462e +--- /dev/null ++++ 
b/test/zdtm/customization/maps00.desc +@@ -0,0 +1 @@ ++{'arch': 'aarch64', 'flavor': 'h', 'opts': '--pin-memory --use-fork-pid', 'flags': 'suid', 'cmdline': 'pinmemory max_pin_pid_num'} +diff --git a/test/zdtm/customization/maps007.c b/test/zdtm/customization/maps007.c +new file mode 100644 +index 0000000..ee5e7c7 +--- /dev/null ++++ b/test/zdtm/customization/maps007.c +@@ -0,0 +1,178 @@ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "zdtmtst.h" ++#include "lock.h" ++ ++#define MAP_SIZE (1UL << 20) ++#define MEM_SIZE (1UL << 29) ++ ++const char *test_doc = "create random mappings and touch memory"; ++ ++int sys_process_vm_readv(pid_t pid, void *addr, void *buf, int size) ++{ ++ struct iovec lvec = {.iov_base = buf, .iov_len = size }; ++ struct iovec rvec = {.iov_base = addr, .iov_len = size }; ++ /* workaround bug in glibc with sixth argument of syscall */ ++ char nop[PAGE_SIZE]; ++ ++ memset(nop, 0, sizeof(nop)); ++ ++ return syscall(__NR_process_vm_readv, pid, &lvec, 1, &rvec, 1, 0); ++} ++ ++/* The child follows the parents two steps behind. 
*/ ++#define MAX_DELTA 1000 ++int main(int argc, char **argv) ++{ ++ void *start, *end, *p; ++ pid_t child; ++ struct { ++ futex_t delta; ++ futex_t stop; ++ } *shm; ++ uint32_t v; ++ unsigned long long count = 0; ++ int i; ++ ++ test_init(argc, argv); ++ ++ /* shared memory for synchronization */ ++ shm = mmap(NULL, PAGE_SIZE, PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_SHARED, -1, 0); ++ if (shm == MAP_FAILED) ++ return -1; ++ ++ /* allocate workspace */ ++ start = mmap(NULL, MEM_SIZE, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); ++ if (start == MAP_FAILED) ++ return -1; ++ ++ test_msg("%p-%p\n", start, start + MEM_SIZE); ++ ++ end = start + MEM_SIZE; ++ ++ v = 0; ++ futex_set(&shm->delta, v); ++ futex_set(&shm->stop, 0); ++ ++ child = fork(); ++ if (child < 0) { ++ pr_perror("fork"); ++ return 1; ++ } ++ ++ while (1) { ++ void *ret; ++ unsigned long size; ++ int prot = PROT_NONE; ++ ++ if (child) { ++ if (!test_go()) ++ break; ++ futex_wait_while_gt(&shm->delta, 2 * MAX_DELTA); ++ futex_inc_and_wake(&shm->delta); ++ } else { ++ if (!futex_get(&shm->stop)) ++ /* shm->delta must be always bigger than MAX_DELTA */ ++ futex_wait_while_lt(&shm->delta, MAX_DELTA + 2); ++ else if (count % 100 == 0) ++ test_msg("count %llu delta %d\n", ++ count, futex_get(&shm->delta)); /* heartbeat */ ++ ++ if (futex_get(&shm->stop) && atomic_get(&shm->delta.raw) == MAX_DELTA) ++ break; ++ futex_dec_and_wake(&shm->delta); ++ } ++ ++ count++; ++ if (child && count == MAX_DELTA + 1) ++ test_daemon(); ++ ++ p = start + ((lrand48() * PAGE_SIZE) % MEM_SIZE); ++ size = lrand48() * PAGE_SIZE; ++ size %= (end - p); ++ size %= MAP_SIZE; ++ if (size == 0) ++ size = PAGE_SIZE; ++ ++ if (lrand48() % 2) ++ prot |= PROT_READ; ++ if (lrand48() % 2) ++ prot |= PROT_EXEC; ++ if (lrand48() % 2) ++ prot |= PROT_WRITE; ++ ++ ret = mmap(p, size, prot, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); ++ if (ret == MAP_FAILED) { ++ pr_perror("%p-%p", p, p + size); ++ goto err; ++ } ++ ++ if (!(prot & 
PROT_WRITE)) ++ continue; ++ ++ for (i = 0; i < lrand48() % 50; i++) { ++ char *t = p + (lrand48() * PAGE_SIZE) % (size); ++ t[0] = lrand48(); ++ } ++ } ++ test_msg("count %llu\n", count); ++ ++ if (child == 0) { ++ if (!test_go()) ++ pr_perror("unexpected state"); ++ futex_set_and_wake(&shm->stop, 2); ++ test_waitsig(); ++ return 0; ++ } else { ++ int readable = 0, status = -1; ++ ++ /* stop the child */ ++ futex_set(&shm->stop, 1); ++ futex_add_and_wake(&shm->delta, MAX_DELTA); ++ /* wait until the child will be in the same point */ ++ futex_wait_until(&shm->stop, 2); ++ ++ /* check that child and parent have the identical content of memory */ ++ for (p = start; p < end; p += PAGE_SIZE) { ++ char rbuf[PAGE_SIZE], lbuf[PAGE_SIZE]; ++ int rret, lret; ++ ++ lret = sys_process_vm_readv(getpid(), p, lbuf, PAGE_SIZE); ++ rret = sys_process_vm_readv(child, p, rbuf, PAGE_SIZE); ++ if (rret != lret) { ++ pr_perror("%p %d %d", p, lret, rret); ++ goto err; ++ } ++ if (lret < 0) ++ continue; ++ readable++; ++ if (memcmp(rbuf, lbuf, PAGE_SIZE)) { ++ pr_perror("%p", p); ++ goto err; ++ } ++ } ++ test_msg("readable %d\n", readable); ++ kill(child, SIGTERM); ++ wait(&status); ++ if (status != 0) { ++ pr_perror("Non-zero exit code: %d", status); ++ goto err; ++ } ++ pass(); ++ } ++ ++ return 0; ++err: ++ kill(child, SIGSEGV); ++ *((volatile int *) 0) = 0; ++ return 1; ++} +diff --git a/test/zdtm/customization/maps007.desc b/test/zdtm/customization/maps007.desc +new file mode 100644 +index 0000000..9ed7e46 +--- /dev/null ++++ b/test/zdtm/customization/maps007.desc +@@ -0,0 +1 @@ ++{'arch': 'aarch64', 'opts': '--pin-memory --use-fork-pid', 'flags': 'suid', 'flavor': 'h', 'cmdline': 'pinmemory max_pin_pid_num'} +diff --git a/test/zdtm/customization/maps008.c b/test/zdtm/customization/maps008.c +new file mode 100644 +index 0000000..7ed7c10 +--- /dev/null ++++ b/test/zdtm/customization/maps008.c +@@ -0,0 +1,514 @@ ++#include ++#include ++#include ++#include ++#include ++#include 
++#include ++#include ++#include ++#include ++#include ++#include ++#include "zdtmtst.h" ++#include "lock.h" ++ ++const char *test_doc = "ps tree with anon shared vmas for dedup"; ++ ++/* ++ * 1. ps tree with non triavial anon shmem vmas is created first. ++ * 2. Each process gets its portion of shmem vmas. ++ * 3. Each process continuously datagens its portion until ++ * criu dump is finished. ++ * 4. Each process datachecks all its shmem portions after restore. ++ * 5. Contents of anon shmem vmas are checked for equality in ++ * different processes. ++ */ ++ ++typedef int (*proc_func_t)(task_waiter_t *setup_waiter); ++ ++static pid_t fork_and_setup(proc_func_t pfunc) ++{ ++ task_waiter_t setup_waiter; ++ pid_t pid; ++ ++ task_waiter_init(&setup_waiter); ++ pid = test_fork(); ++ if (pid < 0) { ++ pr_perror("fork failed"); ++ exit(1); ++ } ++ ++ if (pid == 0) ++ exit(pfunc(&setup_waiter)); ++ ++ task_waiter_wait4(&setup_waiter, pid); ++ task_waiter_fini(&setup_waiter); ++ return pid; ++} ++ ++static void cont_and_wait_child(pid_t pid) ++{ ++ int status; ++ ++ kill(pid, SIGTERM); ++ waitpid(pid, &status, 0); ++ if (WIFEXITED(status)) { ++ if (WEXITSTATUS(status)) ++ exit(WEXITSTATUS(status)); ++ } else ++ exit(1); ++} ++ ++static void *mmap_ashmem(size_t size) ++{ ++ void *mem = mmap(NULL, size, PROT_WRITE | PROT_READ, ++ MAP_SHARED | MAP_ANONYMOUS, -1, 0); ++ if (mem == MAP_FAILED) { ++ pr_perror("Can't map shmem %zx", size); ++ exit(1); ++ } ++ return mem; ++} ++ ++static void *mmap_proc_mem(pid_t pid, unsigned long addr, ++ unsigned long size) ++{ ++ int fd; ++ void *mem; ++ char path[PATH_MAX]; ++ ++ snprintf(path, PATH_MAX, "/proc/%d/map_files/%lx-%lx", ++ (int)pid, addr, addr + size); ++ fd = open(path, O_RDWR); ++ if (fd == -1) { ++ pr_perror("Can't open file %s", path); ++ exit(1); ++ } ++ ++ mem = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0); ++ close(fd); ++ if (mem == MAP_FAILED) { ++ pr_perror("Can't map file %s", path); ++ exit(1); ++ } ++ return mem; 
++} ++ ++static void check_mem_eq(void *addr1, size_t size1, void *addr2, size_t size2) ++{ ++ unsigned long min_size = size1 < size2 ? size1 : size2; ++ ++ if (memcmp(addr1, addr2, min_size)) { ++ pr_err("Mem differs %lx %lx %lx", (unsigned long)addr1, ++ (unsigned long)addr2, min_size); ++ exit(1); ++ } ++} ++ ++static void xmunmap(void *map, size_t size) ++{ ++ if (munmap(map, size)) { ++ pr_err("xmunmap"); ++ exit(1); ++ } ++} ++ ++static void chk_proc_mem_eq(pid_t pid1, void *addr1, unsigned long size1, ++ pid_t pid2, void *addr2, unsigned long size2) ++{ ++ void *map1, *map2; ++ ++ map1 = mmap_proc_mem(pid1, (unsigned long)addr1, size1); ++ map2 = mmap_proc_mem(pid2, (unsigned long)addr2, size2); ++ check_mem_eq(map1, size1, map2, size2); ++ xmunmap(map1, size1); ++ xmunmap(map2, size2); ++} ++ ++/* ++ * ps tree: ++ * proc1_______________ ++ * | | | ++ * proc11___ proc12 proc13 ++ * | | | ++ * proc111 proc112 proc131 ++ */ ++#define PROC_CNT 7 ++ ++#define PROC1_PGIX 0 ++#define PROC11_PGIX 1 ++#define PROC12_PGIX 2 ++#define PROC13_PGIX 3 ++#define PROC111_PGIX 4 ++#define PROC112_PGIX 5 ++#define PROC131_PGIX 6 ++#define ZERO_PGIX 7 ++/* unused pgix: 8 */ ++#define MEM_PERIOD (9 * PAGE_SIZE) ++ ++struct pstree { ++ pid_t proc1; ++ pid_t proc11; ++ pid_t proc12; ++ pid_t proc13; ++ pid_t proc111; ++ pid_t proc112; ++ pid_t proc131; ++}; ++struct pstree *pstree; ++ ++struct test_sync { ++ futex_t datagen; ++ futex_t datagen_exit_cnt; ++}; ++struct test_sync *test_sync; ++ ++size_t mem1_size, mem2_size, mem3_size; ++uint8_t *mem1, *mem2, *mem3; ++ ++#define CRC_EPOCH_OFFSET (PAGE_SIZE - sizeof(uint32_t)) ++ ++static void read_each_pg(volatile uint8_t *mem, size_t size, size_t off) ++{ ++ if (!mem) ++ return; ++ ++ while (off < size) { ++ (mem + off)[0]; ++ off += MEM_PERIOD; ++ } ++} ++ ++void datagen_each_pg(uint8_t *mem, size_t size, size_t off, uint32_t crc_epoch) ++{ ++ if (!mem) ++ return; ++ ++ while (futex_get(&test_sync->datagen) && (off < size)) { ++ 
uint32_t crc = crc_epoch; ++ ++ datagen(mem + off, CRC_EPOCH_OFFSET, &crc); ++ *(uint32_t *)(mem + off + CRC_EPOCH_OFFSET) = crc_epoch; ++ off += MEM_PERIOD; ++ } ++} ++ ++void datachck_each_pg(uint8_t *mem, size_t size, size_t off) ++{ ++ if (!mem) ++ return; ++ ++ while (off < size) { ++ uint32_t crc = *(uint32_t *)(mem + off + CRC_EPOCH_OFFSET); ++ ++ if (datachk(mem + off, CRC_EPOCH_OFFSET, &crc)) ++ exit(1); ++ off += MEM_PERIOD; ++ } ++} ++ ++static void mems_read_each_pgix(size_t pgix) ++{ ++ const size_t off = pgix * PAGE_SIZE; ++ ++ read_each_pg(mem1, mem1_size, off); ++ read_each_pg(mem2, mem2_size, off); ++ read_each_pg(mem3, mem3_size, off); ++} ++ ++static void mems_datagen_each_pgix(size_t pgix, uint32_t *crc_epoch) ++{ ++ const size_t off = pgix * PAGE_SIZE; ++ ++ ++(*crc_epoch); ++ datagen_each_pg(mem1, mem1_size, off, *crc_epoch); ++ datagen_each_pg(mem2, mem2_size, off, *crc_epoch); ++ datagen_each_pg(mem3, mem3_size, off, *crc_epoch); ++} ++ ++static void mems_datachck_each_pgix(size_t pgix) ++{ ++ const size_t off = pgix * PAGE_SIZE; ++ ++ datachck_each_pg(mem1, mem1_size, off); ++ datachck_each_pg(mem2, mem2_size, off); ++ datachck_each_pg(mem3, mem3_size, off); ++} ++ ++static int proc131_func(task_waiter_t *setup_waiter) ++{ ++ uint32_t crc_epoch = 0; ++ ++ pstree->proc131 = getpid(); ++ mems_datagen_each_pgix(PROC131_PGIX, &crc_epoch); ++ task_waiter_complete_current(setup_waiter); ++ ++ while (futex_get(&test_sync->datagen)) ++ mems_datagen_each_pgix(PROC131_PGIX, &crc_epoch); ++ futex_inc_and_wake(&test_sync->datagen_exit_cnt); ++ test_waitsig(); ++ ++ mems_datachck_each_pgix(PROC131_PGIX); ++ return 0; ++} ++ ++static int proc13_func(task_waiter_t *setup_waiter) ++{ ++ size_t MEM1_HOLE_START = 2 * MEM_PERIOD; ++ size_t MEM1_HOLE_SIZE = 1 * MEM_PERIOD; ++ uint32_t crc_epoch = 0; ++ ++ pstree->proc13 = getpid(); ++ xmunmap(mem1 + MEM1_HOLE_START, MEM1_HOLE_SIZE); ++ xmunmap(mem2, mem2_size); ++ xmunmap(mem3, mem3_size); ++ mem2 = mem1 + 
MEM1_HOLE_START + MEM1_HOLE_SIZE; ++ mem2_size = mem1_size - (mem2 - mem1); ++ mem1_size = MEM1_HOLE_START; ++ mem3 = mmap_ashmem(mem3_size); ++ mems_datagen_each_pgix(PROC13_PGIX, &crc_epoch); ++ fork_and_setup(proc131_func); ++ task_waiter_complete_current(setup_waiter); ++ ++ while (futex_get(&test_sync->datagen)) ++ mems_datagen_each_pgix(PROC13_PGIX, &crc_epoch); ++ futex_inc_and_wake(&test_sync->datagen_exit_cnt); ++ test_waitsig(); ++ ++ mems_datachck_each_pgix(PROC13_PGIX); ++ ++ chk_proc_mem_eq(pstree->proc13, mem1, mem1_size, ++ pstree->proc131, mem1, mem1_size); ++ chk_proc_mem_eq(pstree->proc13, mem2, mem2_size, ++ pstree->proc131, mem2, mem2_size); ++ chk_proc_mem_eq(pstree->proc13, mem3, mem3_size, ++ pstree->proc131, mem3, mem3_size); ++ ++ cont_and_wait_child(pstree->proc131); ++ return 0; ++} ++ ++static int proc12_func(task_waiter_t *setup_waiter) ++{ ++ uint32_t crc_epoch = 0; ++ ++ pstree->proc12 = getpid(); ++ mems_datagen_each_pgix(PROC12_PGIX, &crc_epoch); ++ task_waiter_complete_current(setup_waiter); ++ ++ while (futex_get(&test_sync->datagen)) ++ mems_datagen_each_pgix(PROC12_PGIX, &crc_epoch); ++ futex_inc_and_wake(&test_sync->datagen_exit_cnt); ++ test_waitsig(); ++ ++ mems_datachck_each_pgix(PROC12_PGIX); ++ ++ return 0; ++} ++ ++static int proc111_func(task_waiter_t *setup_waiter) ++{ ++ uint32_t crc_epoch = 0; ++ ++ pstree->proc111 = getpid(); ++ mems_datagen_each_pgix(PROC111_PGIX, &crc_epoch); ++ task_waiter_complete_current(setup_waiter); ++ ++ while (futex_get(&test_sync->datagen)) ++ mems_datagen_each_pgix(PROC111_PGIX, &crc_epoch); ++ futex_inc_and_wake(&test_sync->datagen_exit_cnt); ++ test_waitsig(); ++ ++ mems_datachck_each_pgix(PROC111_PGIX); ++ return 0; ++} ++ ++static int proc112_func(task_waiter_t *setup_waiter) ++{ ++ uint32_t crc_epoch = 0; ++ ++ pstree->proc112 = getpid(); ++ mems_datagen_each_pgix(PROC112_PGIX, &crc_epoch); ++ task_waiter_complete_current(setup_waiter); ++ ++ while (futex_get(&test_sync->datagen)) ++ 
mems_datagen_each_pgix(PROC112_PGIX, &crc_epoch); ++ futex_inc_and_wake(&test_sync->datagen_exit_cnt); ++ test_waitsig(); ++ ++ mems_datachck_each_pgix(PROC112_PGIX); ++ return 0; ++} ++ ++static int proc11_func(task_waiter_t *setup_waiter) ++{ ++ const size_t MEM3_START_CUT = 1 * MEM_PERIOD; ++ const size_t MEM3_END_CUT = 2 * MEM_PERIOD; ++ void *mem3_old = mem3; ++ size_t mem3_size_old = mem3_size; ++ uint32_t crc_epoch = 0; ++ uint8_t *proc1_mem3; ++ ++ pstree->proc11 = getpid(); ++ xmunmap(mem3, MEM3_START_CUT); ++ mem3 += MEM3_START_CUT; ++ mem3_size -= MEM3_START_CUT; ++ fork_and_setup(proc111_func); ++ fork_and_setup(proc112_func); ++ xmunmap(mem3 + mem3_size - MEM3_END_CUT, MEM3_END_CUT); ++ mem3_size -= MEM3_END_CUT; ++ mems_datagen_each_pgix(PROC11_PGIX, &crc_epoch); ++ task_waiter_complete_current(setup_waiter); ++ ++ while (futex_get(&test_sync->datagen)) ++ mems_datagen_each_pgix(PROC11_PGIX, &crc_epoch); ++ futex_inc_and_wake(&test_sync->datagen_exit_cnt); ++ test_waitsig(); ++ ++ mems_datachck_each_pgix(PROC11_PGIX); ++ ++ chk_proc_mem_eq(pstree->proc11, mem1, mem1_size, ++ pstree->proc111, mem1, mem1_size); ++ chk_proc_mem_eq(pstree->proc11, mem1, mem1_size, ++ pstree->proc112, mem1, mem1_size); ++ ++ chk_proc_mem_eq(pstree->proc11, mem2, mem2_size, ++ pstree->proc111, mem2, mem2_size); ++ chk_proc_mem_eq(pstree->proc11, mem2, mem2_size, ++ pstree->proc112, mem2, mem2_size); ++ ++ chk_proc_mem_eq(pstree->proc11, mem3, mem3_size, ++ pstree->proc111, mem3, mem3_size + MEM3_END_CUT); ++ chk_proc_mem_eq(pstree->proc11, mem3, mem3_size, ++ pstree->proc112, mem3, mem3_size + MEM3_END_CUT); ++ ++ proc1_mem3 = mmap_proc_mem(pstree->proc1, ++ (unsigned long)mem3_old, mem3_size_old); ++ check_mem_eq(mem3, mem3_size, proc1_mem3 + MEM3_START_CUT, mem3_size); ++ xmunmap(proc1_mem3, mem3_size_old); ++ ++ cont_and_wait_child(pstree->proc111); ++ cont_and_wait_child(pstree->proc112); ++ return 0; ++} ++ ++#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) ++#define MB(n) ((n) * (1UL << 20)) ++ ++static int proc1_func(void) ++{ ++ uint32_t crc_epoch = 0; ++ uint8_t *mem2_old = NULL; ++ ++ /* ++ * Min mem size: ++ * At least 5 mem periods for mem pages and vma holes. ++ * At least 1 MB mem size not to test on tiny working set. ++ */ ++ mem1_size = MEM_PERIOD * MAX(5, MB(1) / MEM_PERIOD + 1); ++ mem2_size = mem1_size * 2; ++ mem3_size = mem2_size * 3; ++ ++ futex_set(&test_sync->datagen, 1); ++ pstree->proc1 = getpid(); ++ mem1 = mmap_ashmem(mem1_size); ++ mem2 = mmap_ashmem(mem2_size); ++ mem3 = mmap_ashmem(mem3_size); ++ mems_datagen_each_pgix(PROC1_PGIX, &crc_epoch); ++ mems_read_each_pgix(ZERO_PGIX); ++ ++ fork_and_setup(proc11_func); ++ fork_and_setup(proc12_func); ++ fork_and_setup(proc13_func); ++ ++ xmunmap(mem1, mem1_size); ++ if (mremap(mem2, mem2_size, mem1_size, MREMAP_MAYMOVE | MREMAP_FIXED, ++ mem1) != mem1) { ++ pr_perror("proc1 mem2 remap"); ++ exit(1); ++ } ++ mem2_old = mem2; ++ mem2 = NULL; ++ ++ test_daemon(); ++ while (test_go()) ++ mems_datagen_each_pgix(PROC1_PGIX, &crc_epoch); ++ test_waitsig(); ++ futex_set(&test_sync->datagen_exit_cnt, 0); ++ futex_set(&test_sync->datagen, 0); ++ futex_wait_while(&test_sync->datagen_exit_cnt, PROC_CNT); ++ ++ mems_datachck_each_pgix(PROC1_PGIX); ++ ++ chk_proc_mem_eq(pstree->proc1, mem1, mem1_size, ++ pstree->proc11, mem2_old, mem2_size); ++ chk_proc_mem_eq(pstree->proc1, mem1, mem1_size, ++ pstree->proc12, mem2_old, mem2_size); ++ ++ chk_proc_mem_eq(pstree->proc1, mem3, mem3_size, ++ pstree->proc12, mem3, mem3_size); ++ ++ cont_and_wait_child(pstree->proc11); ++ cont_and_wait_child(pstree->proc12); ++ cont_and_wait_child(pstree->proc13); ++ ++ pass(); ++ return 0; ++} ++ ++static void kill_pstree_from_root(void) ++{ ++ if (getpid() != pstree->proc1) ++ return; ++ ++ kill(pstree->proc11, SIGKILL); ++ kill(pstree->proc12, SIGKILL); ++ kill(pstree->proc13, SIGKILL); ++ kill(pstree->proc111, SIGKILL); ++ kill(pstree->proc112, SIGKILL); ++ 
kill(pstree->proc131, SIGKILL); ++} ++ ++static void sigchld_hand(int signo, siginfo_t *info, void *ucontext) ++{ ++ if (info->si_code != CLD_EXITED) ++ return; ++ if (!info->si_status) ++ return; ++ ++ /* ++ * If we are not ps tree root then propagate child error to parent. ++ * If we are ps tree root then also call all ++ * atexit handlers set up by zdtm test framework and this test. ++ * exit() is not async signal safe but it's ok for testing purposes. ++ * exit() usage allows us to use very simple error handling ++ * and pstree killing logic. ++ */ ++ exit(info->si_status); ++} ++ ++int main(int argc, char **argv) ++{ ++ struct sigaction sa = { ++ .sa_sigaction = sigchld_hand, ++ .sa_flags = SA_RESTART | SA_SIGINFO | SA_NOCLDSTOP ++ }; ++ sigemptyset(&sa.sa_mask); ++ ++ test_init(argc, argv); ++ ++ pstree = (struct pstree *)mmap_ashmem(PAGE_SIZE); ++ test_sync = (struct test_sync *)mmap_ashmem(sizeof(*test_sync)); ++ ++ if (sigaction(SIGCHLD, &sa, NULL)) { ++ pr_perror("SIGCHLD handler setup"); ++ exit(1); ++ }; ++ ++ if (atexit(kill_pstree_from_root)) { ++ pr_err("Can't setup atexit cleanup func"); ++ exit(1); ++ } ++ return proc1_func(); ++} +diff --git a/test/zdtm/customization/maps008.desc b/test/zdtm/customization/maps008.desc +new file mode 100644 +index 0000000..154ef8c +--- /dev/null ++++ b/test/zdtm/customization/maps008.desc +@@ -0,0 +1 @@ ++{'arch': 'aarch64', 'opts': '--pin-memory --use-fork-pid', 'flavor': 'h', 'flags': 'suid', 'cmdline': 'pinmemory max_pin_pid_num'} +diff --git a/test/zdtm/customization/maps01.c b/test/zdtm/customization/maps01.c +new file mode 100644 +index 0000000..119d7a6 +--- /dev/null ++++ b/test/zdtm/customization/maps01.c +@@ -0,0 +1,183 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "zdtmtst.h" ++ ++#define MEM_SIZE (1LU << 30) ++#define MEM_OFFSET (1LU << 29) ++#define MEM_OFFSET2 (MEM_SIZE - PAGE_SIZE) ++#define MEM_OFFSET3 (20LU * 
PAGE_SIZE) ++ ++const char *test_doc = "Test shared memory"; ++const char *test_author = "Andrew Vagin > 20); ++ goto err; ++ } ++ ++ p = mmap(NULL, MEM_SIZE, PROT_WRITE | PROT_READ, ++ MAP_SHARED | MAP_ANONYMOUS, -1, 0); ++ ++ if (p == MAP_FAILED) { ++ pr_err("Failed to mmap %ld Mb shared anonymous R/W memory\n", ++ MEM_SIZE >> 20); ++ goto err; ++ } ++ ++ p2 = mmap(NULL, MEM_OFFSET, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); ++ if (p2 == MAP_FAILED) { ++ pr_err("Failed to mmap %lu Mb anonymous memory\n", ++ MEM_OFFSET >> 20); ++ goto err; ++ } ++ ++ pid = test_fork(); ++ if (pid < 0) { ++ pr_err("Fork failed with %d\n", pid); ++ goto err; ++ } else if (pid == 0) { ++ void *p3; ++ ++ p3 = mmap(NULL, MEM_OFFSET3, PROT_READ | PROT_WRITE, ++ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); ++ if (p3 == MAP_FAILED) { ++ pr_err("Failed to mmap %lu Mb anonymous R/W memory\n", ++ MEM_OFFSET3 >> 20); ++ goto err; ++ } ++ ++ crc = ~0; ++ datagen(m + MEM_OFFSET, PAGE_SIZE, &crc); ++ crc = ~0; ++ datagen(m + MEM_OFFSET2, PAGE_SIZE, &crc); ++ crc = ~0; ++ datagen(p + MEM_OFFSET + MEM_OFFSET3, PAGE_SIZE, &crc); ++ crc = ~0; ++ datagen(p + MEM_OFFSET + 2 * MEM_OFFSET3, PAGE_SIZE, &crc); ++ crc = ~0; ++ datagen(p + MEM_OFFSET3, PAGE_SIZE, &crc); ++ crc = ~0; ++ datagen(p3, PAGE_SIZE, &crc); ++ ++ task_waiter_complete(&t, 1); ++ ++ test_waitsig(); ++ ++ crc = ~0; ++ status = datachk(m + MEM_OFFSET, PAGE_SIZE, &crc); ++ if (status) ++ return 1; ++ crc = ~0; ++ status = datachk(m + MEM_OFFSET2, PAGE_SIZE, &crc); ++ if (status) ++ return 1; ++ crc = ~0; ++ status = datachk(m + PAGE_SIZE, PAGE_SIZE, &crc); ++ if (status) ++ return 1; ++ crc = ~0; ++ status = datachk(p + MEM_OFFSET + 2 * MEM_OFFSET3, PAGE_SIZE, &crc); ++ if (status) ++ return 1; ++ crc = ~0; ++ status = datachk(p + MEM_OFFSET3, PAGE_SIZE, &crc); ++ if (status) ++ return 1; ++ crc = ~0; ++ status = datachk(p3, PAGE_SIZE, &crc); ++ if (status) ++ return 1; ++ return 0; ++ } ++ task_waiter_wait4(&t, 1); ++ ++ munmap(p, 
MEM_OFFSET); ++ p2 = mremap(p + MEM_OFFSET, MEM_OFFSET, MEM_OFFSET, MREMAP_FIXED | MREMAP_MAYMOVE, p2); ++ if (p2 == MAP_FAILED) ++ goto err; ++ ++ snprintf(path, PATH_MAX, "/proc/self/map_files/%lx-%lx", ++ (unsigned long) m, ++ (unsigned long) m + MEM_SIZE); ++ fd = open(path, O_RDWR); ++ if (fd == -1) { ++ pr_perror("Can't open file %s", path); ++ goto err; ++ } ++ ++ m2 = mmap(NULL, PAGE_SIZE, PROT_WRITE | PROT_READ, MAP_SHARED, fd, MEM_OFFSET3); ++ if (m2 == MAP_FAILED) { ++ pr_perror("Can't map file %s", path); ++ goto err; ++ } ++ close(fd); ++ ++ munmap(m, PAGE_SIZE); ++ munmap(m + PAGE_SIZE * 10, PAGE_SIZE); ++ munmap(m + MEM_OFFSET2, PAGE_SIZE); ++ ++ crc = ~0; ++ datagen(m + PAGE_SIZE, PAGE_SIZE, &crc); ++ ++ crc = ~0; ++ datagen(m2, PAGE_SIZE, &crc); ++ ++ test_daemon(); ++ test_waitsig(); ++ ++ kill(pid, SIGTERM); ++ wait(&status); ++ if (WIFEXITED(status)) { ++ if (WEXITSTATUS(status)) ++ goto err; ++ } else ++ goto err; ++ ++ crc = ~0; ++ if (datachk(m + MEM_OFFSET, PAGE_SIZE, &crc)) ++ goto err; ++ ++ crc = ~0; ++ if (datachk(m2, PAGE_SIZE, &crc)) ++ goto err; ++ ++ crc = ~0; ++ if (datachk(p2 + MEM_OFFSET3, PAGE_SIZE, &crc)) ++ goto err; ++ ++ pass(); ++ ++ return 0; ++err: ++ if (waitpid(-1, NULL, WNOHANG) == 0) { ++ kill(pid, SIGTERM); ++ wait(NULL); ++ } ++ return 1; ++} +diff --git a/test/zdtm/customization/maps01.desc b/test/zdtm/customization/maps01.desc +new file mode 100644 +index 0000000..dad462e +--- /dev/null ++++ b/test/zdtm/customization/maps01.desc +@@ -0,0 +1 @@ ++{'arch': 'aarch64', 'flavor': 'h', 'opts': '--pin-memory --use-fork-pid', 'flags': 'suid', 'cmdline': 'pinmemory max_pin_pid_num'} +diff --git a/test/zdtm/customization/maps02.c b/test/zdtm/customization/maps02.c +new file mode 100644 +index 0000000..eb7c09b +--- /dev/null ++++ b/test/zdtm/customization/maps02.c +@@ -0,0 +1,111 @@ ++#include ++#include "zdtmtst.h" ++#include "get_smaps_bits.h" ++ ++#ifndef MADV_DONTDUMP ++#define MADV_DONTDUMP 16 ++#endif ++ ++const char 
*test_doc = "Test shared memory with advises"; ++const char *test_author = "Cyrill Gorcunov "; ++ ++struct mmap_data { ++ void *start; ++ unsigned long orig_flags; ++ unsigned long orig_madv; ++ unsigned long new_flags; ++ unsigned long new_madv; ++}; ++ ++#define MEM_SIZE (8192) ++ ++static int alloc_anon_mmap(struct mmap_data *m, int flags, int adv) ++{ ++ m->start = mmap(NULL, MEM_SIZE, PROT_READ | PROT_WRITE, ++ flags, -1, 0); ++ if (m->start == MAP_FAILED) { ++ pr_perror("mmap failed"); ++ return -1; ++ } ++ ++ if (madvise(m->start, MEM_SIZE, adv)) { ++ if (errno == EINVAL) { ++ test_msg("madvise failed, no kernel support\n"); ++ munmap(m->start, MEM_SIZE); ++ *m = (struct mmap_data){ }; ++ } else { ++ pr_perror("madvise failed"); ++ return -1; ++ } ++ } ++ ++ return 0; ++} ++ ++int main(int argc, char **argv) ++{ ++ struct mmap_data m[5] = { }; ++ size_t i; ++ ++ test_init(argc, argv); ++ ++ test_msg("Alloc growsdown\n"); ++ if (alloc_anon_mmap(&m[0], MAP_PRIVATE | MAP_ANONYMOUS, MADV_DONTFORK)) ++ return -1; ++ ++ test_msg("Alloc locked/sequential\n"); ++ if (alloc_anon_mmap(&m[1], MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, MADV_SEQUENTIAL)) ++ return -1; ++ ++ test_msg("Alloc noreserve/dontdump\n"); ++ if (alloc_anon_mmap(&m[2], MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, MADV_DONTDUMP)) ++ return -1; ++ ++ test_msg("Alloc hugetlb/hugepage\n"); ++ if (alloc_anon_mmap(&m[3], MAP_PRIVATE | MAP_ANONYMOUS, MADV_HUGEPAGE)) ++ return -1; ++ ++ test_msg("Alloc dontfork/random|mergeable\n"); ++ if (alloc_anon_mmap(&m[4], MAP_PRIVATE | MAP_ANONYMOUS, MADV_MERGEABLE)) ++ return -1; ++ ++ test_msg("Fetch existing flags/adv\n"); ++ for (i = 0; i < sizeof(m)/sizeof(m[0]); i++) { ++ if (get_smaps_bits((unsigned long)m[i].start, ++ &m[i].orig_flags, ++ &m[i].orig_madv)) ++ return -1; ++ } ++ ++ test_daemon(); ++ test_waitsig(); ++ ++ test_msg("Fetch restored flags/adv\n"); ++ for (i = 0; i < sizeof(m)/sizeof(m[0]); i++) { ++ if (get_smaps_bits((unsigned long)m[i].start, 
++ &m[i].new_flags, ++ &m[i].new_madv)) ++ return -1; ++ ++ if (m[i].orig_flags != m[i].new_flags) { ++ pr_perror("Flags are changed %lx %lx -> %lx (%zu)", ++ (unsigned long)m[i].start, ++ m[i].orig_flags, m[i].new_flags, i); ++ fail(); ++ return -1; ++ } ++ ++ if (m[i].orig_madv != m[i].new_madv) { ++ pr_perror("Madvs are changed %lx %lx -> %lx (%zu)", ++ (unsigned long)m[i].start, ++ m[i].orig_madv, m[i].new_madv, i); ++ fail(); ++ return -1; ++ } ++ ++ } ++ ++ pass(); ++ ++ return 0; ++} +diff --git a/test/zdtm/customization/maps02.desc b/test/zdtm/customization/maps02.desc +new file mode 100644 +index 0000000..f14d661 +--- /dev/null ++++ b/test/zdtm/customization/maps02.desc +@@ -0,0 +1 @@ ++{'arch': 'aarch64', 'opts': '--pin-memory --use-fork-pid', 'flavor': 'h', 'cmdline': 'pinmemory max_pin_pid_num'} +diff --git a/test/zdtm/customization/maps04.c b/test/zdtm/customization/maps04.c +new file mode 100644 +index 0000000..780c566 +--- /dev/null ++++ b/test/zdtm/customization/maps04.c +@@ -0,0 +1,57 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "zdtmtst.h" ++ ++#define MEM_SIZE (1L << 29) ++ ++const char *test_doc = "Test big mappings"; ++const char *test_author = "Andrew Vagin ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "zdtmtst.h" ++ ++const char *test_doc = "Create a bunch of small VMAs and test they survive transferring\n"; ++const char *test_author = "Cyrill Gorcunov "; ++ ++#define NR_MAPS 4096 ++ ++#define NR_MAPS_1 (NR_MAPS + 0) ++#define NR_MAPS_2 (NR_MAPS + 1) ++ ++#define MAPS_SIZE_1 (140 << 10) ++#define MAPS_SIZE_2 (8192) ++ ++int main(int argc, char *argv[]) ++{ ++ void *map[NR_MAPS + 2] = { }, *addr; ++ size_t i, summary; ++ ++ test_init(argc, argv); ++ ++ summary = NR_MAPS * 2 * 4096 + MAPS_SIZE_1 + MAPS_SIZE_2 + (1 << 20); ++ ++ addr = mmap(NULL, summary, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, 
-1, 0); ++ if (addr == MAP_FAILED) { ++ pr_perror("Can't mmap"); ++ return 1; ++ } ++ munmap(addr, summary); ++ ++ for (i = 0; i < NR_MAPS; i++) { ++ map[i] = mmap(i > 0 ? map[i - 1] + 8192 : addr, 4096, PROT_READ | PROT_WRITE, ++ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); ++ if (map[i] == MAP_FAILED) { ++ pr_perror("Can't mmap"); ++ return 1; ++ } else { ++ /* Dirtify it */ ++ int *v = (void *)map[i]; ++ *v = i; ++ } ++ } ++ ++ map[NR_MAPS_1] = mmap(map[NR_MAPS_1 - 1] + 8192, MAPS_SIZE_1, PROT_READ | PROT_WRITE | PROT_EXEC, ++ MAP_ANONYMOUS | MAP_PRIVATE | MAP_GROWSDOWN, -1, 0); ++ if (map[NR_MAPS_1] == MAP_FAILED) { ++ pr_perror("Can't mmap"); ++ return 1; ++ } else { ++ /* Dirtify it */ ++ int *v = (void *)map[NR_MAPS_1]; ++ *v = i; ++ test_msg("map-1: %p %p\n", map[NR_MAPS_1], map[NR_MAPS_1] + MAPS_SIZE_1); ++ } ++ ++ map[NR_MAPS_2] = mmap(map[NR_MAPS_1] + MAPS_SIZE_1, MAPS_SIZE_2, PROT_READ | PROT_WRITE, ++ MAP_ANONYMOUS | MAP_PRIVATE | MAP_GROWSDOWN, -1, 0); ++ if (map[NR_MAPS_2] == MAP_FAILED) { ++ pr_perror("Can't mmap"); ++ return 1; ++ } else { ++ /* Dirtify it */ ++ int *v = (void *)map[NR_MAPS_2]; ++ *v = i; ++ test_msg("map-2: %p %p\n", map[NR_MAPS_2], map[NR_MAPS_2] + MAPS_SIZE_2); ++ } ++ ++ test_daemon(); ++ test_waitsig(); ++ ++ for (i = 0; i < NR_MAPS; i++) { ++ int *v = (void *)map[i]; ++ ++ if (*v != i) { ++ fail("Data corrupted at page %lu", (unsigned long)i); ++ return 1; ++ } ++ } ++ ++ pass(); ++ return 0; ++} +diff --git a/test/zdtm/customization/maps05.desc b/test/zdtm/customization/maps05.desc +new file mode 100644 +index 0000000..f14d661 +--- /dev/null ++++ b/test/zdtm/customization/maps05.desc +@@ -0,0 +1 @@ ++{'arch': 'aarch64', 'opts': '--pin-memory --use-fork-pid', 'flavor': 'h', 'cmdline': 'pinmemory max_pin_pid_num'} +diff --git a/test/zdtm/customization/maps06.c b/test/zdtm/customization/maps06.c +new file mode 100644 +index 0000000..7480d6b +--- /dev/null ++++ b/test/zdtm/customization/maps06.c +@@ -0,0 +1,70 @@ ++#include "zdtmtst.h" 
++#include ++#include ++#include ++#include ++ ++const char *test_doc = "Create a lot of file vma-s"; ++const char *test_author = "Andrei Vagin "; ++ ++char *filename; ++TEST_OPTION(filename, string, "file name", 1); ++ ++int main(int argc, char ** argv) ++{ ++ void *start; ++ int fd, i; ++ int ps = sysconf(_SC_PAGESIZE); ++ int test_size; ++ ++ test_init(argc, argv); ++ ++ fd = open(filename, O_RDWR | O_CREAT, 0666); ++ if (fd < 0) ++ return 1; ++ ++ ftruncate(fd, ps); ++ ++ if (ps == 0x1000) ++ test_size = 10240; ++ else ++ test_size = 512; ++ ++ start = mmap(0, ps * test_size * 4, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); ++ if (start == MAP_FAILED) ++ return 1; ++ ++ for (i = 0; i < test_size; i++) { ++ int *addr; ++ addr = mmap(start + i * 3 * ps, ps, ++ PROT_READ | PROT_WRITE, ++ MAP_PRIVATE | MAP_FILE | MAP_FIXED, fd, 0); ++ if (addr == MAP_FAILED) ++ return 1; ++ addr[0] = i * 2; ++ addr = mmap(start + (i * 3 + 1) * ps, ps, ++ PROT_READ | PROT_WRITE, ++ MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); ++ if (addr == MAP_FAILED) ++ return 1; ++ addr[0] = i; ++ } ++ ++ test_daemon(); ++ ++ test_waitsig(); ++ ++ for (i = 0; i < test_size; i++) { ++ int *addr; ++ addr = start + i * 3 * ps; ++ if (addr[0] != i * 2) ++ fail(); ++ addr = start + (i * 3 + 1) * ps; ++ if (addr[0] != i) ++ fail(); ++ } ++ ++ pass(); ++ ++ return 0; ++} +diff --git a/test/zdtm/customization/maps06.desc b/test/zdtm/customization/maps06.desc +new file mode 100644 +index 0000000..f14d661 +--- /dev/null ++++ b/test/zdtm/customization/maps06.desc +@@ -0,0 +1 @@ ++{'arch': 'aarch64', 'opts': '--pin-memory --use-fork-pid', 'flavor': 'h', 'cmdline': 'pinmemory max_pin_pid_num'} +diff --git a/test/zdtm/customization/maps_file_prot.c b/test/zdtm/customization/maps_file_prot.c +new file mode 100644 +index 0000000..3b28c1f +--- /dev/null ++++ b/test/zdtm/customization/maps_file_prot.c +@@ -0,0 +1,53 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include 
"zdtmtst.h" ++ ++const char *test_doc = "Test mappings of same file with different prot"; ++const char *test_author = "Jamie Liu "; ++ ++char *filename; ++TEST_OPTION(filename, string, "file name", 1); ++ ++#define die(fmt, arg...) do { pr_perror(fmt, ## arg); return 1; } while (0) ++ ++int main(int argc, char ** argv) ++{ ++ void *ro_map, *rw_map; ++ int fd; ++ ++ test_init(argc, argv); ++ ++ fd = open(filename, O_RDWR | O_CREAT, 0644); ++ if (fd < 0) ++ die("open failed"); ++ if (ftruncate(fd, 2 * PAGE_SIZE)) ++ die("ftruncate failed"); ++ ++ ro_map = mmap(NULL, 2 * PAGE_SIZE, PROT_READ, MAP_SHARED, fd, 0); ++ if (ro_map == MAP_FAILED) ++ die("mmap failed"); ++ rw_map = ro_map + PAGE_SIZE; ++ if (mprotect(rw_map, PAGE_SIZE, PROT_READ | PROT_WRITE)) ++ die("mprotect failed"); ++ ++ close(fd); ++ ++ test_daemon(); ++ test_waitsig(); ++ ++ /* Check that rw_map is still writeable */ ++ *(volatile char *)rw_map = 1; ++ ++ if (mprotect(ro_map, PAGE_SIZE, PROT_READ | PROT_WRITE)) { ++ fail("mprotect after restore failed"); ++ return 1; ++ } ++ ++ pass(); ++ return 0; ++} +diff --git a/test/zdtm/customization/maps_file_prot.desc b/test/zdtm/customization/maps_file_prot.desc +new file mode 100644 +index 0000000..0ec4023 +--- /dev/null ++++ b/test/zdtm/customization/maps_file_prot.desc +@@ -0,0 +1 @@ ++{'arch': 'aarch64', 'opts': '--pin-memory --use-fork-pid', 'flavor': 'h'} +diff --git a/test/zdtm_ct.c b/test/zdtm_ct.c +index e8d45a9..3bbd3a3 100644 +--- a/test/zdtm_ct.c ++++ b/test/zdtm_ct.c +@@ -9,6 +9,7 @@ + #include + #include + #include ++#include + #include + + #ifndef CLONE_NEWTIME +@@ -95,13 +96,23 @@ int main(int argc, char **argv) + { + pid_t pid; + int status; ++ char *val = getenv("ZDTM_NO_PID_NS"); ++ int flags = CLONE_NEWNS | CLONE_NEWNET | CLONE_NEWIPC; ++ ++ /* ++ * Some customizing mechanism don't support pid namespace, ++ * so every customizing feature testcase will set ++ * 'ZDTM_NO_PID_NS' environment value. 
++ */ ++ if (val == NULL || strcmp(val, "1") != 0) ++ flags |= CLONE_NEWPID; + + /* + * pidns is used to avoid conflicts + * mntns is used to mount /proc + * net is used to avoid conflicts of parasite sockets + */ +- if (unshare(CLONE_NEWNS | CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC)) ++ if (unshare(flags)) + return 1; + pid = fork(); + if (pid == 0) { +-- +2.34.1 + diff --git a/0064-zdtm-init-notifier-testcase.patch b/0064-zdtm-init-notifier-testcase.patch new file mode 100644 index 0000000..6cd6aca --- /dev/null +++ b/0064-zdtm-init-notifier-testcase.patch @@ -0,0 +1,620 @@ +From 8c7cfce7c9f90af9314b96c6ec34c97fb6f9be8a Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Mon, 14 Feb 2022 19:11:15 +0800 +Subject: [PATCH 64/72] zdtm: init notifier testcase + +Signed-off-by: fu.lin +--- + test/zdtm.py | 70 ++++++++-- + test/zdtm/customization/Makefile | 3 +- + test/zdtm/customization/notifier00.c | 68 ++++++++++ + test/zdtm/customization/notifier00.desc | 1 + + test/zdtm/mod/.gitignore | 163 ++++++++++++++++++++++++ + test/zdtm/mod/Makefile | 28 ++++ + test/zdtm/mod/notifier.c | 145 +++++++++++++++++++++ + 7 files changed, 466 insertions(+), 12 deletions(-) + create mode 100644 test/zdtm/customization/notifier00.c + create mode 100644 test/zdtm/customization/notifier00.desc + create mode 100644 test/zdtm/mod/.gitignore + create mode 100644 test/zdtm/mod/Makefile + create mode 100644 test/zdtm/mod/notifier.c + +diff --git a/test/zdtm.py b/test/zdtm.py +index d3b146f..d64a683 100755 +--- a/test/zdtm.py ++++ b/test/zdtm.py +@@ -25,6 +25,7 @@ import tempfile + import time + import socket + import pathlib ++import platform + from builtins import (input, int, open, range, str, zip) + + import pycriu as crpc +@@ -1466,6 +1467,13 @@ class criu: + return True + return False + ++ @staticmethod ++ def check_sysfs(pathes): ++ for path in pathes.split(): ++ if not pathlib.Path(path).exists(): ++ return True ++ return False ++ + @staticmethod + def available(): + if not 
os.access(opts['criu_bin'], os.X_OK): +@@ -1991,21 +1999,49 @@ class Launcher: + testline = u"ok %d - %s # SKIP %s" % (self.__runtest, name, reason) + print(testline, file=self.__file_report) + ++ def check_module(self, mod): ++ found = False ++ with open("/proc/modules") as f: ++ for line in f.readlines(): ++ if "pin_memory" == line.split()[0]: ++ found = True ++ return found ++ + def modprobe_pin_memory(self, load): ++ mod = "pin_memory" + if not load: + return +- else: +- found = False +- with open("/proc/modules") as f: +- for line in f.readlines(): +- if "pin_memory" == line.split()[0]: +- found = True +- if not found: +- subprocess.check_call(["modprobe", "pin_memory"]) ++ elif not self.check_module(mod): ++ subprocess.check_call(["modprobe", mod]) + + cmd = [opts["criu_bin"], "init-pagemap-read"] + subprocess.check_call(cmd, shell=False) + ++ def build_and_load_mod(self, target, kdir): ++ if platform.machine() != "aarch64" or not target: ++ return ++ ++ if not os.access("zdtm/mod", os.R_OK): ++ print("should be executed in the test subdir") ++ sys.exit(0) ++ ++ dirpath = f"MOD={os.getcwd()}/zdtm/mod" ++ build_mod = ["make", "-C", "zdtm/mod", dirpath, target] ++ if kdir: ++ build_mod.append(f"KDIR={kdir}") ++ subprocess.check_call(build_mod) ++ ++ # ensure the module has been unloaded ++ if self.check_module(target.rstrip(".ko")): ++ subprocess.run(["rmmod", target], check=False) ++ ++ modpath = f"zdtm/mod/{target}" ++ subprocess.check_call(["insmod", modpath]) ++ ++ def unload_mod(self, mod): ++ if mod: ++ subprocess.check_call(["rmmod", mod]) ++ + def run_test(self, name, desc, flavor): + + if len(self.__subs) >= self.__max: +@@ -2014,9 +2050,9 @@ class Launcher: + with open("/proc/sys/kernel/tainted") as taintfd: + taint = taintfd.read() + # 0x1000 means the out of tree module has been loaded +- if self.__taint != taint and (int(self.__taint) | 0x1000) != int(taint): ++ if self.__taint != taint and (int(self.__taint) | 0x3000) != int(taint): + raise 
Exception("The kernel is tainted: %r (%r)" % +- (taint, self.__taint)) ++ (taint, str(int(self.__taint) | 0x3000))) + + if test_flag(desc, 'excl'): + self.wait_all() +@@ -2045,6 +2081,8 @@ class Launcher: + # `--use-fork-pid`, so don't care `--pin-memory` option + self.modprobe_pin_memory(no_pid_ns) + ++ self.build_and_load_mod(desc.get("mod", ""), opts["kdir"]) ++ + sub = subprocess.Popen(["./zdtm_ct", "zdtm.py"], + env=dict(os.environ, CR_CT_TEST_INFO=arg, + ZDTM_NO_PID_NS=zdtm_no_pid_ns), +@@ -2059,9 +2097,11 @@ class Launcher: + } + + # pin memory function don't support concurrency +- if test_flag(desc, 'excl') or test_value(desc, "opts", "--pin-memory"): ++ if test_flag(desc, 'excl') or test_value(desc, "opts", "--pin-memory") or desc.get("mod", ""): + self.wait() + ++ self.unload_mod(desc.get("mod", "")) ++ + def __wait_one(self, flags): + pid = -1 + status = -1 +@@ -2412,6 +2452,11 @@ def run_tests(opts): + t, f"cmdline '{cmdline}' isn't support, or don't set") + continue + ++ sysfs = tdesc.get('sysfs', '') ++ if sysfs and criu.check_sysfs(sysfs): ++ launcher.skip(t, f"sysfs file {sysfs} don't exist") ++ continue ++ + test_flavs = tdesc.get('flavor', 'h ns uns').split() + opts_flavs = (opts['flavor'] or 'h,ns,uns').split(',') + if opts_flavs != ['best']: +@@ -2434,6 +2479,7 @@ def run_tests(opts): + launcher.run_test(t, tdesc, run_flavs) + else: + launcher.skip(t, "no flavors") ++ + finally: + fail = launcher.finish() + if opts['join_ns']: +@@ -2723,6 +2769,8 @@ rp.add_argument("--pre-dump-mode", + rp.add_argument("--kdat", + help="Path to criu.kdat, default '/run/criu.kdat'", + default="/run/criu.kdat") ++rp.add_argument( ++ "--kdir", help="specific kernel devel path, the default value is `/lib/modules/$(uname -r)/build`") + + lp = sp.add_parser("list", help="List tests") + lp.set_defaults(action=list_tests) +diff --git a/test/zdtm/customization/Makefile b/test/zdtm/customization/Makefile +index 82348f2..93922c7 100644 +--- 
a/test/zdtm/customization/Makefile ++++ b/test/zdtm/customization/Makefile +@@ -10,7 +10,8 @@ TST_NOFILE = \ + maps04 \ + maps05 \ + maps007 \ +- maps008 ++ maps008 \ ++ notifier00 + + TST_FILE = \ + maps00 \ +diff --git a/test/zdtm/customization/notifier00.c b/test/zdtm/customization/notifier00.c +new file mode 100644 +index 0000000..5fc3d54 +--- /dev/null ++++ b/test/zdtm/customization/notifier00.c +@@ -0,0 +1,68 @@ ++#include ++/* Historical reasons: in order to compatible with R10 */ ++#define CONFIG_EULEROS_MODRESTORE_NOTIFY ++#include ++ ++#include "zdtmtst.h" ++ ++const char *test_doc = "Tests the basic function of the notifiers"; ++static char *nvwa_notifiers[] = { ++ "PRE_FREEZE", ++ "FREEZE_TO_KILL", ++ "PRE_UPDATE_KERNEL", ++ "POST_UPDATE_KERNEL", ++ "UNFREEZE_TO_RUN", ++ "POST_RUN" ++}; ++ ++_Static_assert(sizeof(nvwa_notifiers)/sizeof(nvwa_notifiers[0]) == KUP_HOOK_MAX, "nvwa_notifiers number is wrong!"); ++ ++int main(int argc, char *argv[]) ++{ ++ int orig_values[KUP_HOOK_MAX] = {0}; ++ bool failure = false; ++ FILE *fp; ++ ++ test_init(argc, argv); ++ ++ fp = fopen("/sys/kernel/criu_notifier", "r"); ++ if (fp == NULL) { ++ pr_perror("fopen"); ++ return 1; ++ } ++ ++ for (int i = 0; i < KUP_HOOK_MAX; i++) ++ fscanf(fp, "%d ", orig_values+i); ++ ++ test_daemon(); ++ test_waitsig(); ++ ++ if (fseek(fp, 0, SEEK_SET) != 0) { ++ pr_perror("fseek"); ++ return 2; ++ } ++ ++ for (int i = 0; i < KUP_HOOK_MAX; i++) { ++ int val = 0; ++ int should = orig_values[i]+1; ++ ++ fscanf(fp, "%d ", &val); ++ ++ /* those are not called in criu */ ++ if (i == PRE_UPDATE_KERNEL || i == POST_UPDATE_KERNEL) ++ continue; ++ ++ if (val != should) { ++ pr_err("%s notifier is abnormal, it should be %d, but %d.\n", ++ nvwa_notifiers[i], should, val); ++ failure = true; ++ } ++ } ++ ++ if (failure) ++ fail("notifier is abnormal."); ++ else ++ pass(); ++ ++ return 0; ++} +diff --git a/test/zdtm/customization/notifier00.desc b/test/zdtm/customization/notifier00.desc +new file mode 
100644 +index 0000000..1c6b512 +--- /dev/null ++++ b/test/zdtm/customization/notifier00.desc +@@ -0,0 +1 @@ ++{'arch': 'aarch64', 'opts': '--with-notifier', 'flavor': 'h', 'flags': 'suid', 'sysfs': '/sys/kernel/modrestore/nvwa_notifier', 'mod': 'notifier.ko'} +diff --git a/test/zdtm/mod/.gitignore b/test/zdtm/mod/.gitignore +new file mode 100644 +index 0000000..7afd412 +--- /dev/null ++++ b/test/zdtm/mod/.gitignore +@@ -0,0 +1,163 @@ ++# SPDX-License-Identifier: GPL-2.0-only ++# ++# NOTE! Don't add files that are generated in specific ++# subdirectories here. Add them in the ".gitignore" file ++# in that subdirectory instead. ++# ++# NOTE! Please use 'git ls-files -i --exclude-standard' ++# command after changing this file, to see if there are ++# any tracked files which get ignored after the change. ++# ++# Normal rules (sorted alphabetically) ++# ++.* ++*.a ++*.asn1.[ch] ++*.bin ++*.bz2 ++*.c.[012]*.* ++*.dt.yaml ++*.dtb ++*.dtbo ++*.dtb.S ++*.dwo ++*.elf ++*.gcno ++*.gz ++*.i ++*.ko ++*.lex.c ++*.ll ++*.lst ++*.lz4 ++*.lzma ++*.lzo ++*.mod ++*.mod.c ++*.o ++*.o.* ++*.patch ++*.s ++*.so ++*.so.dbg ++*.su ++*.symtypes ++*.symversions ++*.tab.[ch] ++*.tar ++*.xz ++*.zst ++Module.symvers ++modules.order ++ ++# ++# Top-level generic files ++# ++/linux ++/modules-only.symvers ++/vmlinux ++/vmlinux.32 ++/vmlinux.map ++/vmlinux.symvers ++/vmlinux-gdb.py ++/vmlinuz ++/System.map ++/Module.markers ++/modules.builtin ++/modules.builtin.modinfo ++/modules.nsdeps ++ ++# ++# RPM spec file (make rpm-pkg) ++# ++/*.spec ++ ++# ++# Debian directory (make deb-pkg) ++# ++/debian/ ++ ++# ++# Snap directory (make snap-pkg) ++# ++/snap/ ++ ++# ++# tar directory (make tar*-pkg) ++# ++/tar-install/ ++ ++# ++# We don't want to ignore the following even if they are dot-files ++# ++!.clang-format ++!.cocciconfig ++!.get_maintainer.ignore ++!.gitattributes ++!.gitignore ++!.mailmap ++ ++# ++# Generated include files ++# ++/include/config/ ++/include/generated/ ++/include/ksym/ 
++/arch/*/include/generated/ ++ ++# stgit generated dirs ++patches-* ++ ++# quilt's files ++patches ++series ++ ++# ctags files ++tags ++TAGS ++ ++# cscope files ++cscope.* ++ncscope.* ++ ++# gnu global files ++GPATH ++GRTAGS ++GSYMS ++GTAGS ++ ++# id-utils files ++ID ++ ++*.orig ++*~ ++\#*# ++ ++# ++# Leavings from module signing ++# ++extra_certificates ++signing_key.pem ++signing_key.priv ++signing_key.x509 ++x509.genkey ++ ++# Kconfig presets ++/all.config ++/alldef.config ++/allmod.config ++/allno.config ++/allrandom.config ++/allyes.config ++ ++# Kconfig savedefconfig output ++/defconfig ++ ++# Kdevelop4 ++*.kdev4 ++ ++# Clang's compilation database file ++/compile_commands.json ++ ++# Documentation toolchain ++sphinx_*/ +diff --git a/test/zdtm/mod/Makefile b/test/zdtm/mod/Makefile +new file mode 100644 +index 0000000..10c9c9a +--- /dev/null ++++ b/test/zdtm/mod/Makefile +@@ -0,0 +1,28 @@ ++# notice: ++# `ARCH` var is used in both criu and kernel, but they have the different value ++# for the same architecture(e.g. arm64). Therefore, this Makefile can't be ++# included in the criu Makefile. 
++obj-m += notifier.o ++ ++# specific the kernel devel path ++# example (use `/home/me/kernel` as `KDIR`): ++# $ export KDIR="/home/me/kernel" ++ifeq ($(KDIR),) ++ KDIR := /lib/modules/$(shell uname -r)/build ++endif ++ ++# specific the mod src path ++ifeq ($(MOD),) ++ MOD := $(PWD) ++endif ++ ++all: ++ $(MAKE) -C $(KDIR) M=$(MOD) modules ++ ++clean: ++ $(MAKE) -C $(KDIR) M=$(MOD) clean ++ ++.PHONY: all clean ++ ++notifier.ko: ++ $(MAKE) -C $(KDIR) M=$(MOD) notifier.ko +diff --git a/test/zdtm/mod/notifier.c b/test/zdtm/mod/notifier.c +new file mode 100644 +index 0000000..70a5b33 +--- /dev/null ++++ b/test/zdtm/mod/notifier.c +@@ -0,0 +1,145 @@ ++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt ++ ++#include ++#include ++#include ++ ++static int values[KUP_HOOK_MAX]; ++static char *nvwa_actions[] = { ++ "PREPARE", ++ "ROLLBACK", ++}; ++static char *nvwa_notifiers[] = { ++ "PRE_FREEZE", ++ "FREEZE_TO_KILL", ++ "PRE_UPDATE_KERNEL", ++ "POST_UPDATE_KERNEL", ++ "UNFREEZE_TO_RUN", ++ "POST_RUN" ++}; ++ ++static int nvwa_notifier_func(struct notifier_block *nb, unsigned long val, void *data) ++{ ++ struct nvwa_action *action = data; ++ ++ switch (action->cmd) { ++ case PREPARE: ++ values[val] += 1; ++ break; ++ case ROLLBACK: ++ values[val] -= 1; ++ break; ++ default: ++ pr_err("invalid cmd: %d", action->cmd); ++ return NOTIFY_BAD; ++ } ++ ++ pr_info("nvwa notifier action %s", nvwa_actions[action->cmd]); ++ ++ return NOTIFY_DONE; ++} ++ ++#define DEFINE_NVWA_NB(name) \ ++ static struct notifier_block nvwa_##name##_nb = { \ ++ .notifier_call = nvwa_notifier_func, \ ++ } ++ ++DEFINE_NVWA_NB(pre_freeze); ++DEFINE_NVWA_NB(freeze_to_kill); ++DEFINE_NVWA_NB(pre_update_kernel); ++DEFINE_NVWA_NB(post_update_kernel); ++DEFINE_NVWA_NB(unfreeze_to_run); ++DEFINE_NVWA_NB(post_run); ++ ++static struct notifier_block *nvwa_nbs[] = { ++ &nvwa_pre_freeze_nb, ++ &nvwa_freeze_to_kill_nb, ++ &nvwa_pre_update_kernel_nb, ++ &nvwa_post_update_kernel_nb, ++ &nvwa_unfreeze_to_run_nb, ++ 
&nvwa_post_run_nb, ++}; ++ ++static int register_nvwa_notifiers(void) ++{ ++ int i; ++ ++ BUILD_BUG_ON_MSG(ARRAY_SIZE(nvwa_nbs) != KUP_HOOK_MAX, ++ "wrong nvwa notifier block size!"); ++ ++ for (i = 0; i < ARRAY_SIZE(nvwa_nbs); i++) { ++ if (register_nvwa_notifier(i, nvwa_nbs[i]) != 0) { ++ pr_err("register nvwa %s notifier failed!", nvwa_notifiers[i]); ++ goto error; ++ } ++ } ++ ++ return 0; ++ ++error: ++ ++ for (i -= 1; i >= 0; i -= 1) ++ unregister_nvwa_notifier(i, nvwa_nbs[i]); ++ ++ return -1; ++} ++ ++static void unregister_nvwa_notifiers(void) ++{ ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(nvwa_nbs); i++) ++ unregister_nvwa_notifier(i, nvwa_nbs[i]); ++} ++ ++static ssize_t criu_notifier_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(values); i++) ++ values[i] = 0; ++ ++ return count; ++} ++ ++static ssize_t criu_notifier_show(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ char *buf) ++{ ++ int i; ++ ssize_t count = 0; ++ ++ for (i = 0; i < ARRAY_SIZE(values); i++) ++ count += sprintf(buf+count, "%d ", values[i]); ++ ++ buf[count-1] = '\n'; ++ ++ return count; ++} ++ ++static struct kobj_attribute notifier_file = __ATTR_RW(criu_notifier); ++ ++static int __init notifier_init(void) ++{ ++ if (register_nvwa_notifiers() != 0) ++ return -1; ++ ++ if (sysfs_create_file(kernel_kobj, &notifier_file.attr) != 0) { ++ unregister_nvwa_notifiers(); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static void __exit notifier_exit(void) ++{ ++ sysfs_remove_file(kernel_kobj, &notifier_file.attr); ++ unregister_nvwa_notifiers(); ++} ++ ++module_init(notifier_init); ++module_exit(notifier_exit); ++MODULE_LICENSE("GPL"); +-- +2.34.1 + diff --git a/0065-zdtm-print-errno-info-when-accessing-.out-failure.patch b/0065-zdtm-print-errno-info-when-accessing-.out-failure.patch new file mode 100644 index 0000000..da1bcf5 --- /dev/null +++ 
b/0065-zdtm-print-errno-info-when-accessing-.out-failure.patch @@ -0,0 +1,35 @@ +From d17aedda384cfe6940b9948f4db36643495e0375 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Tue, 15 Feb 2022 11:31:27 +0800 +Subject: [PATCH 65/72] zdtm: print errno info when accessing *.out failure + +The line `Output file *.out appears to exist, aborting` is confusing. +One common cause is a permission-denied error because the test desc +lacks the suid flag. zdtm.py sets `ZDTM_UID` and `ZDTM_GID`, and +the function `test_init()` (in `zdtm/lib/test.c`) switches the tester +itself to that uid and gid when the suid flag is absent. + +Print the errno when accessing *.out fails. + +Signed-off-by: fu.lin +--- + test/zdtm/lib/test.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/test/zdtm/lib/test.c b/test/zdtm/lib/test.c +index 81da81e..471980d 100644 +--- a/test/zdtm/lib/test.c ++++ b/test/zdtm/lib/test.c +@@ -74,7 +74,8 @@ static void test_fini(void) + static void setup_outfile(void) + { + if (!access(outfile, F_OK) || errno != ENOENT) { +- fprintf(stderr, "Output file %s appears to exist, aborting\n", outfile); ++ fprintf(stderr, "Output file %s appears to exist, aborting: %s\n", ++ outfile, strerror(errno)); + exit(1); + } + +-- +2.34.1 + diff --git a/0066-zdtm-print-more-info-for-fs.c.patch b/0066-zdtm-print-more-info-for-fs.c.patch new file mode 100644 index 0000000..5236456 --- /dev/null +++ b/0066-zdtm-print-more-info-for-fs.c.patch @@ -0,0 +1,43 @@ +From af97bc76b1dc1e6ca2b924d7e5666dd04a1847b2 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Wed, 16 Feb 2022 10:39:06 +0800 +Subject: [PATCH 66/72] zdtm: print more info for fs.c + +--- + test/zdtm/lib/fs.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +diff --git a/test/zdtm/lib/fs.c b/test/zdtm/lib/fs.c +index 7b8be5f..a716b40 100644 +--- a/test/zdtm/lib/fs.c ++++ b/test/zdtm/lib/fs.c +@@ -4,6 +4,7 @@ + #include + #include + #include ++#include + + #include "zdtmtst.h" + #include "fs.h" 
+@@ -103,11 +104,15 @@ int get_cwd_check_perm(char **result) + } + + if (access(cwd, X_OK)) { +- pr_err("access check for bit X for current dir path '%s' " +- "failed for uid:%d,gid:%d, error: %d(%s). " ++ struct stat sb; ++ ++ stat(cwd, &sb); ++ pr_err("access check for bit X for current dir path '%s'(uid:%d,gid:%d,mode:%o) " ++ "failed for uid:%d,gid:%d,euid:%d, error: %d(%s). " + "Bit 'x' should be set in all path components of " + "this directory\n", +- cwd, getuid(), getgid(), errno, strerror(errno)); ++ cwd, sb.st_uid, sb.st_gid, sb.st_mode, getuid(), getgid(), ++ geteuid(), errno, strerror(errno)); + return -1; + } + +-- +2.34.1 + diff --git a/0067-zdtm-add-chardev-testcase.patch b/0067-zdtm-add-chardev-testcase.patch new file mode 100644 index 0000000..024f78e --- /dev/null +++ b/0067-zdtm-add-chardev-testcase.patch @@ -0,0 +1,288 @@ +From c44c68028f22751ef12fac02567008a16e992fea Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Thu, 17 Feb 2022 14:30:03 +0800 +Subject: [PATCH 67/72] zdtm: add chardev testcase + +- char dev `ioctl({IOCTL_CMD_NEEDREPAIR, IOCTL_CMD_REPAIR})` + checkpoint/restore test +- anonymous inode checkpoint/restore test +--- + test/zdtm/customization/Makefile | 3 +- + test/zdtm/customization/chardev00.c | 65 +++++++++++ + test/zdtm/customization/chardev00.desc | 1 + + test/zdtm/mod/Makefile | 5 +- + test/zdtm/mod/anon_inode.c | 148 +++++++++++++++++++++++++ + 5 files changed, 220 insertions(+), 2 deletions(-) + create mode 100644 test/zdtm/customization/chardev00.c + create mode 100644 test/zdtm/customization/chardev00.desc + create mode 100644 test/zdtm/mod/anon_inode.c + +diff --git a/test/zdtm/customization/Makefile b/test/zdtm/customization/Makefile +index 93922c7..7d08db3 100644 +--- a/test/zdtm/customization/Makefile ++++ b/test/zdtm/customization/Makefile +@@ -11,7 +11,8 @@ TST_NOFILE = \ + maps05 \ + maps007 \ + maps008 \ +- notifier00 ++ notifier00 \ ++ chardev00 + + TST_FILE = \ + maps00 \ +diff --git 
a/test/zdtm/customization/chardev00.c b/test/zdtm/customization/chardev00.c +new file mode 100644 +index 0000000..c708699 +--- /dev/null ++++ b/test/zdtm/customization/chardev00.c +@@ -0,0 +1,65 @@ ++#include ++#include ++#include ++#include ++#include ++#include "zdtmtst.h" ++ ++#define CHARDEV_PATH "/dev/anon_test" ++ ++const char *test_doc="Tests char dev and anonmous inode map checkpoint/restore"; ++ ++static int check_maps(unsigned long addr) ++{ ++ FILE *fp = fopen("/proc/self/maps", "r"); ++ char *line = NULL; ++ size_t n = 0; ++ unsigned long start = 0; ++ ++ if (fp == NULL) { ++ pr_perror("open self maps failed"); ++ return -1; ++ } ++ ++ while (getline(&line, &n, fp) != -1) { ++ test_msg("%s", line); ++ sscanf(line, "%lx-", &start); ++ if (start == addr) ++ return 0; ++ } ++ ++ return -1; ++} ++ ++int main(int argc, char *argv[]) ++{ ++ int fd, retval = 0; ++ unsigned long addr; ++ ++ test_init(argc, argv); ++ ++ fd = open(CHARDEV_PATH, O_RDWR); ++ if (fd < 0) { ++ pr_perror("open '%s' failed", CHARDEV_PATH); ++ return -1; ++ } ++ ++ retval = ioctl(fd, 0, &addr); ++ if (retval < 0) { ++ pr_perror("create anonymous map failed"); ++ retval = -1; ++ goto out; ++ } ++ test_msg("create anonymous vma start 0x%lx\n", addr); ++ ++ test_daemon(); ++ test_waitsig(); ++ ++ retval = check_maps(addr); ++ if (retval == 0) ++ pass(); ++ else ++ fail("anonymous inode map don't restore"); ++out: ++ return retval; ++} +diff --git a/test/zdtm/customization/chardev00.desc b/test/zdtm/customization/chardev00.desc +new file mode 100644 +index 0000000..9c51ba8 +--- /dev/null ++++ b/test/zdtm/customization/chardev00.desc +@@ -0,0 +1 @@ ++{'arch': 'aarch64', 'opts': '--dump-char-dev', 'flavor': 'h', 'flags': 'suid excl', 'sysfs': '/sys/kernel/modrestore/anon_state_restore /sys/kernel/repairing_device', 'mod': 'anon_inode.ko'} +diff --git a/test/zdtm/mod/Makefile b/test/zdtm/mod/Makefile +index 10c9c9a..0bc89f7 100644 +--- a/test/zdtm/mod/Makefile ++++ b/test/zdtm/mod/Makefile +@@ 
-2,7 +2,7 @@ + # `ARCH` var is used in both criu and kernel, but they have the different value + # for the same architecture(e.g. arm64). Therefore, this Makefile can't be + # included in the criu Makefile. +-obj-m += notifier.o ++obj-m += notifier.o anon_inode.o + + # specific the kernel devel path + # example (use `/home/me/kernel` as `KDIR`): +@@ -26,3 +26,6 @@ clean: + + notifier.ko: + $(MAKE) -C $(KDIR) M=$(MOD) notifier.ko ++ ++anon_inode.ko: ++ $(MAKE) -C $(KDIR) M=$(MOD) anon_inode.ko +diff --git a/test/zdtm/mod/anon_inode.c b/test/zdtm/mod/anon_inode.c +new file mode 100644 +index 0000000..d9c7d2a +--- /dev/null ++++ b/test/zdtm/mod/anon_inode.c +@@ -0,0 +1,148 @@ ++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static int anon_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ pr_info("call %s\n", __func__); ++ return 0; ++} ++ ++static const struct file_operations none_fops = { ++ .owner = THIS_MODULE, ++ .mmap = anon_mmap, ++}; ++ ++static unsigned long create_mmap(void) ++{ ++ struct file *filp; ++ unsigned long start; ++ ++ pr_info("call %s\n", __func__); ++ filp = anon_inode_getfile("test", &none_fops, NULL, O_RDWR); ++ if (IS_ERR(filp)) { ++ pr_warn("anon_inode_getfile('test') failed: %d\n", (int)PTR_ERR(filp)); ++ return PTR_ERR(filp); ++ } ++ ++ start = vm_mmap(filp, 0, 1<<20, PROT_READ | PROT_WRITE, MAP_SHARED, 0); ++ if (IS_ERR_VALUE(start)) { ++ pr_warn("vm_mmap failed with: %d\n", (int)PTR_ERR((void *)start)); ++ } ++ ++ fput(filp); ++ ++ return start; ++} ++ ++static int anon_inode_notifier(struct notifier_block *nb, ++ unsigned long action, void *data) ++{ ++ struct vma_anon_entry *vma_entry = data; ++ struct file *filp; ++ unsigned long start; ++ ++ filp = anon_inode_getfile("test", &none_fops, NULL, O_RDWR); ++ if (IS_ERR(filp)) { ++ pr_warn("anon_inode_getfile('test') failed: %d\n", (int)PTR_ERR(filp)); ++ return 0; ++ } ++ ++ 
start = vm_mmap(filp, vma_entry->start, vma_entry -> end-vma_entry->start, ++ PROT_READ | PROT_WRITE, MAP_SHARED, 0); ++ if (start != vma_entry->start) ++ pr_warn("vm_mmap() failed: %#lx\n", start); ++ ++ fput(filp); ++ return 0; ++} ++ ++static long anon_ioctl(struct file *file, unsigned int cmd, unsigned long argp) ++{ ++ unsigned long start; ++ ++ switch (cmd) { ++ case 0: ++ start = create_mmap(); ++ if (IS_ERR_VALUE(start)) ++ return -EINVAL; ++ if (put_user(start, (unsigned long __user *)argp)) ++ return -EFAULT; ++ break; ++ case IOCTL_CMD_NEEDREPAIR: ++ pr_info("call IOCTL_CMD_NEEDREPAIR"); ++ /* do nothing, just a request slot */ ++ return 17173; ++ case IOCTL_CMD_REPAIR: ++ pr_info("call IOCTL_CMD_REPAIR"); ++ /* do nothing, just a request slot */ ++ break; ++ default: ++ pr_warn("wrong cmd\n"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static const struct file_operations anon_fops = { ++ .owner = THIS_MODULE, ++ .unlocked_ioctl = anon_ioctl, ++ .compat_ioctl = anon_ioctl, ++}; ++ ++static struct miscdevice anon_dev = { ++ .minor = MISC_DYNAMIC_MINOR, ++ .name = "anon_test", ++ .fops = &anon_fops, ++}; ++ ++static struct notifier_block anon_inode_nb = { ++ .notifier_call = anon_inode_notifier, ++}; ++ ++static int __init anon_init(void) ++{ ++ int retval; ++ ++ retval = mures_add_devname(anon_dev.name); ++ if (retval != 0) ++ goto out; ++ ++ retval = register_anon_notifier(&anon_inode_nb); ++ if (retval != 0) ++ goto del_devname; ++ ++ retval = misc_register(&anon_dev); ++ if (retval != 0) ++ goto del_notifier; ++ ++ return 0; ++ ++del_notifier: ++ unregister_anon_notifier(&anon_inode_nb); ++del_devname: ++ mures_del_devname(anon_dev.name); ++out: ++ return retval; ++} ++ ++static void __exit anon_exit(void) ++{ ++ mures_del_devname(anon_dev.name); ++ unregister_anon_notifier(&anon_inode_nb); ++ misc_deregister(&anon_dev); ++ return; ++} ++ ++module_init(anon_init); ++module_exit(anon_exit); ++MODULE_LICENSE("GPL"); +-- +2.34.1 + diff --git 
a/0068-zdtm-add-infiniband-testcase.patch b/0068-zdtm-add-infiniband-testcase.patch new file mode 100644 index 0000000..975c6bb --- /dev/null +++ b/0068-zdtm-add-infiniband-testcase.patch @@ -0,0 +1,256 @@ +From f7e452ffc5318b2aac8aabde5dd8b7bee910c6f7 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Thu, 17 Feb 2022 14:59:37 +0800 +Subject: [PATCH 68/72] zdtm: add infiniband testcase + +--- + test/zdtm/customization/Makefile | 4 +- + .../customization/infiniband_with_unix_sk.c | 55 ++++++++ + .../infiniband_with_unix_sk.desc | 1 + + test/zdtm/mod/Makefile | 5 +- + test/zdtm/mod/infiniband_kern.c | 121 ++++++++++++++++++ + 5 files changed, 184 insertions(+), 2 deletions(-) + create mode 100644 test/zdtm/customization/infiniband_with_unix_sk.c + create mode 100644 test/zdtm/customization/infiniband_with_unix_sk.desc + create mode 100644 test/zdtm/mod/infiniband_kern.c + +diff --git a/test/zdtm/customization/Makefile b/test/zdtm/customization/Makefile +index 7d08db3..728646b 100644 +--- a/test/zdtm/customization/Makefile ++++ b/test/zdtm/customization/Makefile +@@ -12,7 +12,8 @@ TST_NOFILE = \ + maps007 \ + maps008 \ + notifier00 \ +- chardev00 ++ chardev00 \ ++ infiniband_with_unix_sk + + TST_FILE = \ + maps00 \ +@@ -61,6 +62,7 @@ wait_stop: + $(TST): | $(LIB) + + maps02: get_smaps_bits.o ++infiniband_with_unix_sk: LDFLAGS += -lpthread + + %: %.sh + cp $< $@ +diff --git a/test/zdtm/customization/infiniband_with_unix_sk.c b/test/zdtm/customization/infiniband_with_unix_sk.c +new file mode 100644 +index 0000000..4a9e108 +--- /dev/null ++++ b/test/zdtm/customization/infiniband_with_unix_sk.c +@@ -0,0 +1,55 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "zdtmtst.h" ++ ++#define DEV_PATH "/dev/infiniband_test" ++ ++const char *test_doc = "test infiniband fd checkpoint/restore, and the conflict condition with the half-closing anonymous unix socket"; ++ ++static int fd; ++static int sv[2]; ++ ++static void *wait(void 
*arg) { ++ while (true) { ++ test_msg("sleep...\n"); ++ sleep(1); ++ } ++ ++ return NULL; ++} ++ ++int main(int argc, char *argv[]) { ++ pthread_t thread; ++ ++ test_init(argc, argv); ++ ++ if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sv) < 0) { ++ pr_perror("socketpair"); ++ return -1; ++ } ++ printf("sv[0]: %d sv[1]: %d\n", sv[0], sv[1]); ++ ++ if ((fd = open(DEV_PATH, O_RDWR)) < 0) { ++ pr_perror("open"); ++ return -1; ++ } ++ if (close(sv[1]) < 0) { ++ pr_perror("close"); ++ return -1; ++ } ++ ++ pthread_create(&thread, NULL, wait, NULL); ++ ++ test_daemon(); ++ test_waitsig(); ++ ++ pass(); ++ ++ return 0; ++} +diff --git a/test/zdtm/customization/infiniband_with_unix_sk.desc b/test/zdtm/customization/infiniband_with_unix_sk.desc +new file mode 100644 +index 0000000..43a93e6 +--- /dev/null ++++ b/test/zdtm/customization/infiniband_with_unix_sk.desc +@@ -0,0 +1 @@ ++{'arch': 'aarch64', 'opts': '--dump-char-dev', 'flavor': 'h', 'flags': 'suid excl', 'sysfs': '/sys/kernel/repairing_device', 'mod': 'infiniband_kern.ko'} +diff --git a/test/zdtm/mod/Makefile b/test/zdtm/mod/Makefile +index 0bc89f7..58f9a27 100644 +--- a/test/zdtm/mod/Makefile ++++ b/test/zdtm/mod/Makefile +@@ -2,7 +2,7 @@ + # `ARCH` var is used in both criu and kernel, but they have the different value + # for the same architecture(e.g. arm64). Therefore, this Makefile can't be + # included in the criu Makefile. 
+-obj-m += notifier.o anon_inode.o ++obj-m += notifier.o anon_inode.o infiniband_kern.o + + # specific the kernel devel path + # example (use `/home/me/kernel` as `KDIR`): +@@ -29,3 +29,6 @@ notifier.ko: + + anon_inode.ko: + $(MAKE) -C $(KDIR) M=$(MOD) anon_inode.ko ++ ++infiniband_kern.ko: ++ $(MAKE) -C $(KDIR) M=$(MOD) infiniband_kern.ko +diff --git a/test/zdtm/mod/infiniband_kern.c b/test/zdtm/mod/infiniband_kern.c +new file mode 100644 +index 0000000..a61df3a +--- /dev/null ++++ b/test/zdtm/mod/infiniband_kern.c +@@ -0,0 +1,121 @@ ++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static const struct file_operations none_fops = { ++ .owner = THIS_MODULE, ++}; ++ ++static const struct file_operations anonfd_fops = { ++ .owner = THIS_MODULE, ++}; ++ ++static int infiniband_open(struct inode *inode, struct file *filp) ++{ ++ long fd; ++ ++ if (!!(filp->f_flags & O_REPAIR)) { ++ pr_info("reuse\n"); ++ return 0; ++ } ++ ++ fd = anon_inode_getfd("[infinibandevent]", &anonfd_fops, NULL, 0); ++ if (fd < 0) ++ return fd; ++ else ++ filp->private_data = (void *)fd; ++ ++ return 0; ++} ++ ++static int infiniband_repair(struct file *filp, int from) ++{ ++ struct file *fp; ++ long fd; ++ int retval = 0; ++ ++ fp = anon_inode_getfile("[infinibandevent]", &anonfd_fops, NULL, 0); ++ if (IS_ERR(fp)) ++ return PTR_ERR(fp); ++ ++ fd = mures_f_dupfd(from, fp, 0); ++ if (fd != from) { ++ pr_err("different fd, old: %d, dup: %ld\n", from, fd); ++ retval = -EEXIST; ++ } ++ fput(fp); ++ filp->private_data = (long *)fd; ++ ++ return retval; ++} ++ ++static long infiniband_ioctl(struct file *filp, unsigned int cmd, unsigned long argp) ++{ ++ long retval = 0; ++ ++ switch (cmd) { ++ case IOCTL_CMD_NEEDREPAIR: ++ retval = (long )filp->private_data; ++ break; ++ case IOCTL_CMD_REPAIR: ++ retval = infiniband_repair(filp, argp); ++ break; ++ default: ++ pr_warn("wrong cmd\n"); ++ 
return -EINVAL; ++ } ++ return retval; ++} ++ ++static const struct file_operations infiniband_fops = { ++ .owner = THIS_MODULE, ++ .open = infiniband_open, ++ .unlocked_ioctl = infiniband_ioctl, ++ .compat_ioctl = infiniband_ioctl, ++}; ++ ++static struct miscdevice infiniband_dev = { ++ .minor = MISC_DYNAMIC_MINOR, ++ .name = "infiniband_test", ++ .fops = &infiniband_fops, ++}; ++ ++static int __init infiniband_init(void) ++{ ++ int retval; ++ ++ retval = mures_add_devname(infiniband_dev.name); ++ if (retval != 0) ++ goto out; ++ ++ retval = misc_register(&infiniband_dev); ++ if (retval != 0) ++ goto del_devname; ++ ++ return 0; ++ ++del_devname: ++ mures_del_devname(infiniband_dev.name); ++out: ++ return retval; ++} ++ ++static void __exit infiniband_exit(void) ++{ ++ mures_del_devname(infiniband_dev.name); ++ misc_deregister(&infiniband_dev); ++ return; ++} ++ ++module_init(infiniband_init); ++module_exit(infiniband_exit); ++MODULE_LICENSE("GPL"); +-- +2.34.1 + diff --git a/0069-zdtm-add-share-port-testcase.patch b/0069-zdtm-add-share-port-testcase.patch new file mode 100644 index 0000000..a7440b3 --- /dev/null +++ b/0069-zdtm-add-share-port-testcase.patch @@ -0,0 +1,145 @@ +From b766a8d6b04e9c358cd221b68405a205156c1fe2 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Thu, 17 Feb 2022 17:19:46 +0800 +Subject: [PATCH 69/72] zdtm: add share port testcase + +--- + test/zdtm/customization/Makefile | 3 +- + test/zdtm/customization/tcp00.c | 101 +++++++++++++++++++++++++++++ + test/zdtm/customization/tcp00.desc | 1 + + 3 files changed, 104 insertions(+), 1 deletion(-) + create mode 100644 test/zdtm/customization/tcp00.c + create mode 100644 test/zdtm/customization/tcp00.desc + +diff --git a/test/zdtm/customization/Makefile b/test/zdtm/customization/Makefile +index 728646b..1111908 100644 +--- a/test/zdtm/customization/Makefile ++++ b/test/zdtm/customization/Makefile +@@ -13,7 +13,8 @@ TST_NOFILE = \ + maps008 \ + notifier00 \ + chardev00 \ +- infiniband_with_unix_sk 
++ infiniband_with_unix_sk \ ++ tcp00 + + TST_FILE = \ + maps00 \ +diff --git a/test/zdtm/customization/tcp00.c b/test/zdtm/customization/tcp00.c +new file mode 100644 +index 0000000..d1ead82 +--- /dev/null ++++ b/test/zdtm/customization/tcp00.c +@@ -0,0 +1,101 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "zdtmtst.h" ++ ++#define PORT 17173 ++ ++const char *test_doc = "Test TCP SO_REUSEADDR checkpoint/restore using `share_{src,dst}_ports`"; ++ ++static int sock_bind_and_listen(void) ++{ ++ int serv_sk; ++ int optval = 1; ++ struct sockaddr_in serv = { ++ .sin_family = AF_INET, ++ .sin_addr.s_addr = htonl(INADDR_ANY), ++ .sin_port = htons(PORT), ++ }; ++ ++ serv_sk = socket(AF_INET, SOCK_STREAM, 0); ++ if (serv_sk < 0) { ++ pr_perror("server socket failed"); ++ exit(1); ++ } ++ ++ if (setsockopt(serv_sk, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof(optval)) < 0) { ++ pr_perror("setsockopt"); ++ exit(1); ++ } ++ ++ if (bind(serv_sk, (struct sockaddr *)&serv, sizeof(serv)) < 0) { ++ pr_perror("bind"); ++ exit(1); ++ } ++ ++ if (listen(serv_sk, 5) != 0) { ++ pr_perror("listen"); ++ exit(1); ++ } ++ ++ return serv_sk; ++} ++ ++static void client_connect(void) ++{ ++ int sk; ++ struct sockaddr_in sockaddr = { ++ .sin_family = AF_INET, ++ }; ++ ++ sk = socket(AF_INET, SOCK_STREAM, 0); ++ if (sk < 0) { ++ pr_perror("client socket failed"); ++ exit(1); ++ } ++ ++ sockaddr.sin_addr.s_addr = inet_addr("127.0.0.1"); ++ sockaddr.sin_port = htons(PORT); ++ ++ if (connect(sk, (struct sockaddr *)&sockaddr, sizeof(sockaddr)) < 0) { ++ pr_perror("connect failed"); ++ exit(1); ++ } ++ ++ close(sk); ++} ++ ++int main(int argc, char *argv[]) ++{ ++ int serv_sk; ++ int optval = 0; ++ socklen_t len = sizeof(optval); ++ ++ test_init(argc, argv); ++ ++ serv_sk = sock_bind_and_listen(); ++ ++ test_msg("listen 0.0.0.0: %d\n", PORT); ++ /* create CLOSE-WAIT status socket */ ++ client_connect(); ++ ++ test_daemon(); ++ test_waitsig(); ++ ++ 
if (getsockopt(serv_sk, SOL_SOCKET, SO_REUSEADDR, &optval, &len) != 0) { ++ pr_perror("getsockopt failed"); ++ return -1; ++ } ++ ++ if (optval != 1) { ++ pr_err("SO_REUSEADDR flag is %d, should 1", optval); ++ } else ++ pass(); ++ ++ return 0; ++} +\ No newline at end of file +diff --git a/test/zdtm/customization/tcp00.desc b/test/zdtm/customization/tcp00.desc +new file mode 100644 +index 0000000..bc3b4a8 +--- /dev/null ++++ b/test/zdtm/customization/tcp00.desc +@@ -0,0 +1 @@ ++{'arch': 'aarch64', 'opts': '--use-fork-pid --share-src-ports=17173 --share-dst-ports=17173 --skip-in-flight', 'flavor': 'h', 'sysfs': '/sys/kernel/repair_share_socket'} +-- +2.34.1 + diff --git a/0070-zdtm-tmp-test-script.patch b/0070-zdtm-tmp-test-script.patch new file mode 100644 index 0000000..51a1a83 --- /dev/null +++ b/0070-zdtm-tmp-test-script.patch @@ -0,0 +1,59 @@ +From a4f00a225ebfed401aed49956eefad391071d0ce Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Thu, 17 Feb 2022 11:02:08 +0800 +Subject: [PATCH 70/72] zdtm: tmp test script + +--- + test/jenkins/criu-lib.sh | 2 +- + test/jenkins/criu-test.sh | 26 ++++++++++++++++++++++++++ + 2 files changed, 27 insertions(+), 1 deletion(-) + create mode 100644 test/jenkins/criu-test.sh + +diff --git a/test/jenkins/criu-lib.sh b/test/jenkins/criu-lib.sh +index 72d41b5..89dc936 100644 +--- a/test/jenkins/criu-lib.sh ++++ b/test/jenkins/criu-lib.sh +@@ -15,7 +15,7 @@ function prep() + + ulimit -c unlimited && + export CFLAGS=-g +- git clean -dfx && ++# git clean -dfx && + make -j 4 && + make -j 4 -C test/zdtm/ && + make -C test zdtm_ct && +diff --git a/test/jenkins/criu-test.sh b/test/jenkins/criu-test.sh +new file mode 100644 +index 0000000..3035f21 +--- /dev/null ++++ b/test/jenkins/criu-test.sh +@@ -0,0 +1,26 @@ ++#!/bin/bash ++ ++set -e ++source `dirname $0`/criu-lib.sh ++prep ++ ++rm -rf /var/run/criu.kdat ++ ++make zdtm ++ ++if [ -z $(grep 58467 /etc/group) ]; then ++ groupadd -g 58467 zdtm ++fi ++if [ -z $(grep 58467 /etc/passwd) ]; 
then ++ useradd -u 18943 -g 58467 zdtm ++fi ++ ++#./test/zdtm.py run --all --keep-going --report report -f h --ignore-taint --parallel 1 --load-pinmem-dev || fail ++ ++#./test/zdtm.py run -t zdtm/static/del_standalone_un --keep-going -f h --ignore-taint --parallel 1 --load-pinmem-dev --keep-img always ++ ++./test/zdtm.py run -t zdtm/customization/chardev00 -t zdtm/customization/notifier00 --keep-going -f h --ignore-taint --parallel 1 --load-pinmem-dev --keep-img always ++ ++#./test/zdtm.py run -t zdtm/static/socket-tcp-nfconntrack --join-ns --keep-going --ignore-taint --parallel 1 --load-pinmem-dev --keep-img always ++ ++./test/zdtm.py run -t zdtm/customization/tcp00 --keep-going -f h --ignore-taint --parallel 1 --load-pinmem-dev --keep-img always +-- +2.34.1 + diff --git a/0071-mod-add-criu-indepent-test.patch b/0071-mod-add-criu-indepent-test.patch new file mode 100644 index 0000000..f44537b --- /dev/null +++ b/0071-mod-add-criu-indepent-test.patch @@ -0,0 +1,512 @@ +From 03d188c492efe079a520319ca48e40843367ddcf Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Fri, 18 Feb 2022 16:22:00 +0800 +Subject: [PATCH 71/72] mod: add criu-indepent test + +Signed-off-by: fu.lin +--- + test/modules/Makefile | 21 ++++++ + test/modules/idr.c | 79 +++++++++++++++++++++ + test/modules/jump_table.c | 107 ++++++++++++++++++++++++++++ + test/modules/var_kern.c | 72 +++++++++++++++++++ + test/modules/var_user.py | 40 +++++++++++ + test/modules/workqueue_kern.c | 130 ++++++++++++++++++++++++++++++++++ + 6 files changed, 449 insertions(+) + create mode 100644 test/modules/Makefile + create mode 100644 test/modules/idr.c + create mode 100644 test/modules/jump_table.c + create mode 100644 test/modules/var_kern.c + create mode 100644 test/modules/var_user.py + create mode 100644 test/modules/workqueue_kern.c + +diff --git a/test/modules/Makefile b/test/modules/Makefile +new file mode 100644 +index 0000000..9458aa7 +--- /dev/null ++++ b/test/modules/Makefile +@@ -0,0 +1,21 @@ ++obj-m := 
var_kern.o workqueue_kern.o jump_table.o idr.o ++ ++KDIR := /lib/modules/`uname -r`/build ++ ++all: ++ make -C $(KDIR) M=$(PWD) modules ++ ++clean: ++ make -C $(KDIR) M=$(PWD) clean ++ ++var_kern.ko: ++ make -C $(KDIR) M=$(PWD) var_kern.ko ++ ++workqueue_kern.ko: ++ make -C $(KDIR) M=$(PWD) workqueue_kern.ko ++ ++jump_table.ko: ++ make -C $(KDIR) M=$(PWD) jump_table.ko ++ ++idr.ko: ++ make -C $(KDIR) M=$(PWD) idr.ko +diff --git a/test/modules/idr.c b/test/modules/idr.c +new file mode 100644 +index 0000000..67f248e +--- /dev/null ++++ b/test/modules/idr.c +@@ -0,0 +1,79 @@ ++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt ++ ++#include ++#include ++#include ++#include ++ ++DEFINE_IDR(idr_head); ++const int placeholder = 0; ++static int idr_uid = 0; ++ ++static int idr_test_show_internal(int id, void *p, void *data) ++{ ++ pr_info("id: %d p %pK\n", id, p); ++ sprintf(data+strlen(data), "%d\n", id); ++ return 0; ++} ++ ++static ssize_t idr_test_show(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ char *buf) ++{ ++ idr_for_each(&idr_head, idr_test_show_internal, buf); ++ return strlen(buf); ++} ++ ++static ssize_t idr_test_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ const unsigned long max = 65536; ++ unsigned id = 0; ++ int retval; ++ ++ if (sscanf(buf, "%u", &id) != 1) { ++ pr_err("sscanf empty\n"); ++ return -EINVAL; ++ } ++ ++ retval = idr_alloc_u32(&idr_head, (void *)&placeholder, &id, max, GFP_KERNEL); ++ pr_info("alloc idr id %u, errno %d\n", id, retval); ++ return retval < 0 ? 
retval : count; ++} ++ ++static struct kobj_attribute idr_test = __ATTR_RW(idr_test); ++ ++static int __init mod_init(void) ++{ ++ return sysfs_create_file(kernel_kobj, &idr_test.attr); ++} ++ ++static void __exit mod_exit(void) ++{ ++ sysfs_remove_file(kernel_kobj, &idr_test.attr); ++ idr_destroy(&idr_head); ++ return; ++} ++ ++static int __init mod_resume(void) ++{ ++ int retval = mures_restore_idr(idr_uid, &idr_head); ++ ++ if (retval == 0) ++ retval = sysfs_create_file(kernel_kobj, &idr_test.attr); ++ return retval; ++} ++ ++static int __exit mod_suspend(void) ++{ ++ sysfs_remove_file(kernel_kobj, &idr_test.attr); ++ return mures_save_idr(idr_uid, &idr_head); ++} ++ ++module_init(mod_init); ++module_exit(mod_exit); ++module_resume(mod_resume); ++module_suspend(mod_suspend); ++ ++MODULE_LICENSE("GPL"); +\ No newline at end of file +diff --git a/test/modules/jump_table.c b/test/modules/jump_table.c +new file mode 100644 +index 0000000..8648c2a +--- /dev/null ++++ b/test/modules/jump_table.c +@@ -0,0 +1,107 @@ ++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt ++ ++#include ++#include ++#include ++#include ++#include ++ ++struct func_node { ++ struct hlist_node hash; ++ unsigned long key; ++ unsigned long value; ++}; ++ ++static int status __attribute__((section(".resume_0"))); ++ ++/* ++ * `mures_vcall()` can't be used in irq context because of its implementation. ++ * Therefore, we must build a cache up front. 
++ */ ++DEFINE_HASHTABLE(__ro_after_init cache, 2); ++ ++static int foo(void) ++{ ++ status += 1; ++ return status; ++} ++ ++static void *find_func(unsigned long addr); ++ ++static ssize_t jp_test_show(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ char *buf) ++{ ++ int (*func)(void) = find_func((unsigned long)foo); ++ ssize_t count = 0; ++ ++ if (func == NULL) { ++ count = sprintf(buf, "Not Found\n"); ++ } else { ++ count = sprintf(buf, "%d", func()); ++ } ++ ++ return count; ++} ++ ++static struct kobj_attribute jp_test = __ATTR_RO(jp_test); ++ ++struct func_node nodes[] __ro_after_init = { ++ { .key = (unsigned long)foo, }, ++}; ++ ++static void *find_func(unsigned long addr) ++{ ++ struct func_node *obj; ++ int i; ++ ++ pr_info("finding addr: %lx\n", addr); ++ hash_for_each(cache, i, obj, hash) {\ ++ pr_info("found key: %lx, val: %lx\n", obj->key, obj->value); ++ if (obj->key == addr) ++ return (void *)obj->value; ++ } ++ ++ return NULL; ++} ++ ++static void __init build_cache(void) ++{ ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(nodes); i++) { ++ nodes[i].value = mures_vcall(nodes[i].key); ++ hash_add(cache, &nodes[i].hash, nodes[i].key); ++ } ++} ++ ++static int __init mod_init(void) ++{ ++ build_cache(); ++ return sysfs_create_file(kernel_kobj, &jp_test.attr); ++} ++ ++static void __exit mod_exit(void) ++{ ++ sysfs_remove_file(kernel_kobj, &jp_test.attr); ++ return; ++} ++ ++static int __init mod_resume(void) ++{ ++ build_cache(); ++ return sysfs_create_file(kernel_kobj, &jp_test.attr); ++} ++ ++static int __exit mod_suspend(void) ++{ ++ sysfs_remove_file(kernel_kobj, &jp_test.attr); ++ return 0; ++} ++ ++module_init(mod_init); ++module_exit(mod_exit); ++module_resume(mod_resume); ++module_suspend(mod_suspend); ++ ++MODULE_LICENSE("GPL"); +\ No newline at end of file +diff --git a/test/modules/var_kern.c b/test/modules/var_kern.c +new file mode 100644 +index 0000000..4321e3b +--- /dev/null ++++ b/test/modules/var_kern.c +@@ -0,0 +1,72 @@ ++#define 
pr_fmt(fmt) KBUILD_MODNAME ": " fmt ++ ++#include ++#include ++#include ++ ++/* test variable persistence */ ++ ++static int mod_int __attribute__((section(".resume_0"))); ++static char *mod_str1 __attribute__((section(".resume_1"))) = "init"; ++static char *mod_str2 __attribute__((section(".resume_2"))) = "upgrade"; ++static char *mod_str __attribute__((section(".resume_3"))); ++ ++static ssize_t var_test_show(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ char *buf) ++{ ++ ssize_t count = 0; ++ ++ count += sprintf(buf, "%d", mod_int); ++ count += sprintf(buf+count, " %s", mod_str); ++ ++ return count; ++} ++ ++static struct kobj_attribute sysfs_var = __ATTR_RO(var_test); ++ ++static __init int mod1_resume(void) ++{ ++ mod_int += 1; ++ mod_str = mod_str2; ++ ++ pr_info("This is %s, index %d\n", __func__, mod_int); ++ ++ return sysfs_create_file(kernel_kobj, &sysfs_var.attr); ++} ++ ++static __exit int mod1_suspend(void) ++{ ++ mod_int += 1; ++ ++ pr_info("This is %s, index %d\n", __func__, mod_int); ++ sysfs_remove_file(kernel_kobj, &sysfs_var.attr); ++ ++ return 0; ++} ++ ++static __init int mod1_init(void) ++{ ++ mod_int = 0; ++ mod_str = mod_str1; ++ ++ pr_info("This is %s, index %d\n", __func__, mod_int); ++ ++ return sysfs_create_file(kernel_kobj, &sysfs_var.attr); ++} ++ ++static __exit void mod1_exit(void) ++{ ++ mod_int += 1; ++ ++ pr_info("This is %s, index %d\n", __func__, mod_int); ++ sysfs_remove_file(kernel_kobj, &sysfs_var.attr); ++ ++ return; ++} ++ ++module_resume(mod1_resume); ++module_suspend(mod1_suspend); ++module_init(mod1_init); ++module_exit(mod1_exit); ++MODULE_LICENSE("GPL"); +diff --git a/test/modules/var_user.py b/test/modules/var_user.py +new file mode 100644 +index 0000000..98c5193 +--- /dev/null ++++ b/test/modules/var_user.py +@@ -0,0 +1,40 @@ ++import unittest ++import subprocess ++ ++ ++class TestVarMethods(unittest.TestCase): ++ mod_name = "var_kern" ++ ++ def unload_mod(self): ++ with open("/proc/modules") as f: ++ for 
line in f.readlines(): ++ words = line.split() ++ if words[0] == self.mod_name: ++ subprocess.check_call(["rmmod", self.mod_name]) ++ break ++ ++ def setUp(self): ++ subprocess.check_call(["make", "var_kern.ko"]) ++ self.unload_mod() ++ ++ def tearDown(self): ++ mod = f"{self.mod_name}.ko" ++ self.unload_mod() ++ ++ def test_var(self): ++ mod = f"{self.mod_name}.ko" ++ subprocess.check_call(["insmod", mod]) ++ with open("/sys/kernel/var_test") as f: ++ line = f.readline() ++ self.assertEqual(line, "0 init") ++ subprocess.check_call(["rmmod", "-r", mod]) ++ subprocess.check_call(["rmmod", mod]) ++ subprocess.check_call(["insmod", "-r", mod]) ++ with open("/sys/kernel/var_test") as f: ++ line = f.readline() ++ self.assertEqual(line, "2 upgrade") ++ subprocess.check_call(["rmmod", mod]) ++ ++ ++if __name__ == '__main__': ++ unittest.main() +diff --git a/test/modules/workqueue_kern.c b/test/modules/workqueue_kern.c +new file mode 100644 +index 0000000..cecfb8c +--- /dev/null ++++ b/test/modules/workqueue_kern.c +@@ -0,0 +1,130 @@ ++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++struct mod_status { ++ struct workqueue_struct *wq; ++}; ++ ++static struct workqueue_struct *wq; ++static int wq_status __attribute__((section(".resume_0"))); ++ ++static void worker_func(struct work_struct *work) ++{ ++ wq_status += 1; ++ pr_info("worker run...\n"); ++ mdelay(100); ++ pr_info("worker end.\n"); ++ kfree(work); ++} ++ ++static ssize_t wq_test_show(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ char *buf) ++{ ++ flush_workqueue(wq); ++ return sprintf(buf, "%pK %d", wq, wq_status); ++} ++ ++static struct kobj_attribute wq_test = __ATTR_RO(wq_test); ++ ++static int __init mod_init(void) ++{ ++ int retval; ++ ++ retval = sysfs_create_file(kernel_kobj, &wq_test.attr); ++ if (retval != 0) { ++ pr_err("sysfs_create_file failed.\n"); ++ return retval; ++ } ++ ++ wq = alloc_workqueue("workqueue_kern_test", 
WQ_UNBOUND, 0); ++ if (wq == NULL) { ++ pr_err("unable to allocate workqueue\n"); ++ sysfs_remove_file(kernel_kobj, &wq_test.attr); ++ retval = -ENOMEM; ++ goto out; ++ } ++ ++ retval = 0; ++out: ++ return retval; ++} ++ ++static void __exit mod_exit(void) ++{ ++ destroy_workqueue(wq); ++ sysfs_remove_file(kernel_kobj, &wq_test.attr); ++} ++ ++static int __init mod_resume(void) ++{ ++ struct mod_status *data; ++ int retval; ++ ++ data = get_module_state_space(KBUILD_MODNAME, NULL); ++ if (!data) { ++ pr_info("get_module_state_space failure\n"); ++ return -ENOMEM; ++ } ++ wq = data->wq; ++ ++ retval = sysfs_create_file(kernel_kobj, &wq_test.attr); ++ if (retval != 0) { ++ pr_err("sysfs_create_file failed.\n"); ++ return retval; ++ } ++ ++ return resume_workqueue(wq); ++} ++ ++static int __exit queue_worker(void) ++{ ++ struct delayed_work *worker = kzalloc(sizeof(*worker), GFP_KERNEL); /* full delayed_work: INIT_DELAYED_WORK also initialises the timer that follows the embedded work_struct */ ++ ++ if (worker == NULL) { ++ pr_err("alloc worker space failed\n"); ++ return -ENOMEM; ++ } ++ ++ INIT_DELAYED_WORK(worker, worker_func); ++ queue_delayed_work(wq, worker, 100); ++ return 0; ++} ++ ++static int __exit mod_suspend(void) ++{ ++ struct mod_status *data; ++ int retval; ++ ++ data = alloc_module_state_space(KBUILD_MODNAME, sizeof(*data)); ++ if (data == NULL) { ++ pr_err("alloc_module_state_space failed\n"); ++ return -ENOMEM; ++ } ++ ++ data->wq = wq; ++ if (queue_worker() != 0) ++ return -ENOMEM; ++ ++ retval = suspend_workqueue(wq); ++ if (retval != 0) { ++ pr_err("suspend workqueue failed\n"); ++ return retval; ++ } ++ ++ sysfs_remove_file(kernel_kobj, &wq_test.attr); ++ return 0; ++} ++ ++module_init(mod_init); ++module_exit(mod_exit); ++module_resume(mod_resume); ++module_suspend(mod_suspend); ++ ++MODULE_LICENSE("GPL"); +\ No newline at end of file +-- +2.34.1 + diff --git a/0072-kabichk-add-KABI-check-code.patch b/0072-kabichk-add-KABI-check-code.patch new file mode 100644 index 0000000..e468742 --- /dev/null +++
b/0072-kabichk-add-KABI-check-code.patch @@ -0,0 +1,611 @@ +From 57f1017a9c971d8c3a5ef82d04e6c4bc584e9f00 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Fri, 8 Apr 2022 16:14:40 +0800 +Subject: [PATCH 72/72] kabichk: add KABI check code + +Theory: + * The export symbol CRCs source: + - /boot/symvers-$(uname -r).gz for Image and in tree modules: the + ima mechanism could ensure the file credibility and non-tamper. + - ELF section `.symtab` for out of tree modules: the export symbols + has `__crc_` prefix, and `st_shndx` is `SHN_ABS` + * compare CRC value between the known and the module + +Design Details: + - collect export symbols from + * collect in tree symbols from `/boot/symvers-.gz` + * collect out of tree module symbols from the module self + - compare external symbols stored in `__versions` section for each module + +Usage: + python3 -m upgchk.kabichk \ + [[-r ],...] \ + [[-m ],...] \ + -c +Example: + python3 -m upgchk.kabichk -c /lib/modules/$(uname -r)/kernel/fs/mbcache.ko + python3 -m upgchk.kabichk -m notify.ko -c osp_proc.ko + +Note: + The pyelftools library can't be import, therefore using elfutils + wrapper to replace the library. 
+ +Signed-off-by: fu.lin +--- + upgchk/Makefile | 23 ++++ + upgchk/lib/modsym.c | 268 ++++++++++++++++++++++++++++++++++++++ + upgchk/lib/modsym.h | 39 ++++++ + upgchk/setup.py | 20 +++ + upgchk/upgchk/__init__.py | 11 ++ + upgchk/upgchk/kabichk.py | 163 +++++++++++++++++++++++ + 6 files changed, 524 insertions(+) + create mode 100644 upgchk/Makefile + create mode 100644 upgchk/lib/modsym.c + create mode 100644 upgchk/lib/modsym.h + create mode 100644 upgchk/setup.py + create mode 100644 upgchk/upgchk/__init__.py + create mode 100644 upgchk/upgchk/kabichk.py + +diff --git a/upgchk/Makefile b/upgchk/Makefile +new file mode 100644 +index 0000000..df6b60e +--- /dev/null ++++ b/upgchk/Makefile +@@ -0,0 +1,23 @@ ++.PHONY: build install clean ++ ++PYTHON=/usr/bin/python3 ++TEST= ++PARAMETERS= ++ ++build: ++ ${PYTHON} setup.py build ++ ++dist: ++ ${PYTHON} setup.py sdist ++ ++install: ++ ${PYTHON} setup.py install ++ ++clean: ++ ${PYTHON} setup.py clean ++ rm -rf \ ++ build \ ++ dist \ ++ upgchk/__pycache__ \ ++ upgchk/*.so \ ++ upgchk.egg-info +diff --git a/upgchk/lib/modsym.c b/upgchk/lib/modsym.c +new file mode 100644 +index 0000000..eb75f68 +--- /dev/null ++++ b/upgchk/lib/modsym.c +@@ -0,0 +1,268 @@ ++#define PY_SSIZE_T_CLEAN ++#include ++ ++#include ++#include ++#include ++#include ++ ++#include "modsym.h" ++ ++static Elf_Data *get_elf_sec_data(Elf *elf, const char *sec_name) ++{ ++ Elf_Scn *scn = NULL; ++ size_t strndx; ++ GElf_Shdr mem; ++ GElf_Shdr *shdr; ++ const char *name; ++ ++ /* To get the section names. 
*/ ++ if (elf_getshdrstrndx(elf, &strndx) != 0) ++ return NULL; ++ ++ while ((scn = elf_nextscn(elf, scn)) != NULL) { ++ shdr = gelf_getshdr(scn, &mem); ++ name = shdr ? elf_strptr(elf, strndx, shdr->sh_name) : NULL; /* both lookups may fail on a malformed ELF; skip such sections */ ++ ++ if (name != NULL && strcmp(name, sec_name) == 0) ++ return elf_getdata(scn, NULL); ++ } ++ ++ return NULL; ++} ++ ++static void modvers_dealloc(PyObject *obj) ++{ ++ ModVersState *mvgstate = (ModVersState *)obj; ++ ++ elf_end(mvgstate->elf); ++ Py_TYPE(obj)->tp_free(obj); /* tp_dealloc must release the object memory itself, otherwise every iterator leaks */ ++} ++ ++static PyObject *modvers_iternext(PyObject *obj) ++{ ++ ModVersState *mvgstate = (ModVersState *)obj; ++ struct modversion_info *info = mvgstate->d->d_buf; ++ PyObject *elem = NULL; ++ ++ if (mvgstate->seq_index >= 0) { ++ size_t i = mvgstate->enum_index; ++ /* seq_index < 0 means that the generator is exhausted. ++ * Returning NULL in this case is enough. The next() builtin ++ * will raise the StopIteration error for us. ++ */ ++ elem = Py_BuildValue("(sk)", info[i].name, info[i].crc); ++ mvgstate->seq_index -= 1; ++ mvgstate->enum_index += 1; ++ } else { ++ /* The reference to the sequence is cleared in the first ++ * generator call after its exhaustion (after the call that ++ * returned the last element). ++ * Py_CLEAR will be harmless for subsequent calls since it's ++ * idempotent on NULL.
++ */ ++ mvgstate->seq_index = -1; ++ } ++ ++ return elem; ++} ++ ++static PyObject *modvers_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) ++{ ++ ModVersState *mvgstate = NULL; ++ PyObject *file; ++ int fd; ++ Py_ssize_t len; ++ ++ if (!PyArg_ParseTuple(args, "O", &file)) ++ return NULL; ++ ++ fd = PyObject_AsFileDescriptor(file); ++ if (fd < 0) ++ return NULL; ++ ++ mvgstate = (ModVersState *)type->tp_alloc(type, 0); ++ if (mvgstate == NULL) ++ return NULL; ++ ++ elf_version(EV_CURRENT); ++ mvgstate->elf = elf_begin(fd, ELF_C_READ_MMAP, NULL); ++ if (mvgstate->elf == NULL) { ++ PyErr_Format(PyExc_TypeError, "File not usable: %s\n", elf_errmsg(-1)); ++ goto free; ++ } ++ ++ mvgstate->d = get_elf_sec_data(mvgstate->elf, VERS_SEC_NAME); ++ if (mvgstate->d == NULL) { ++ PyErr_Format(PyExc_TypeError, "Can't find ELF section `%s`\n", VERS_SEC_NAME); ++ goto elf_end; ++ } ++ ++ len = mvgstate->d->d_size / sizeof(struct modversion_info); ++ mvgstate->seq_index = len - 1; ++ mvgstate->enum_index = 0; ++ ++ return (PyObject *)mvgstate; ++ ++elf_end: ++ elf_end(mvgstate->elf); ++free: ++ type->tp_free(mvgstate); ++ return NULL; ++} ++ ++PyTypeObject PyModVersGen_Type = { ++ PyVarObject_HEAD_INIT(NULL, 0) ++ .tp_name = "modvers", ++ .tp_basicsize = sizeof(ModVersState), /* size of one instance, not of the type object */ ++ .tp_itemsize = 0, ++ .tp_dealloc = modvers_dealloc, ++ .tp_flags = Py_TPFLAGS_DEFAULT, ++ .tp_iter = PyObject_SelfIter, ++ .tp_iternext = modvers_iternext, ++ .tp_alloc = PyType_GenericAlloc, ++ .tp_new = modvers_new, ++}; ++ ++static void modcrcs_dealloc(PyObject *obj) ++{ ++ ModCRCsState *mcgstate = (ModCRCsState *)obj; ++ ++ elf_end(mcgstate->elf); ++ Py_TYPE(obj)->tp_free(obj); /* tp_dealloc must release the object memory itself, otherwise every iterator leaks */ ++} ++ ++static PyObject *modcrcs_iternext(PyObject *obj) ++{ ++ ModCRCsState *mcgstate = (ModCRCsState *)obj; ++ const char *strtab = mcgstate->strtab->d_buf; ++ GElf_Sym *sym = mcgstate->symtab->d_buf; ++ PyObject *elem = NULL; ++ ++ while (mcgstate->seq_index >= 0) { ++ size_t i = mcgstate->enum_index; ++ const char *name =
strtab + sym[i].st_name; ++ ++ mcgstate->seq_index -= 1; ++ mcgstate->enum_index += 1; ++ ++ /* ++ * If the symbol has '__crc_' prefix and absolute value, ++ * it's export symbol, and has CRC. ++ */ ++ if (strncmp(name, CRC_SYM_PREFIX, strlen(CRC_SYM_PREFIX)) == 0 ++ && sym[i].st_shndx == SHN_ABS) { ++ elem = Py_BuildValue("(sk)", ++ name+strlen(CRC_SYM_PREFIX), ++ sym[i].st_value); ++ break; ++ } ++ } ++ ++ return elem; ++} ++ ++static PyObject *modcrcs_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) ++{ ++ ModCRCsState *mcgstate = NULL; ++ PyObject *file; ++ Elf_Data *d; ++ int fd; ++ Py_ssize_t len; ++ ++ if (!PyArg_ParseTuple(args, "O", &file)) ++ return NULL; ++ ++ fd = PyObject_AsFileDescriptor(file); ++ if (fd < 0) ++ return NULL; ++ ++ mcgstate = (ModCRCsState *)type->tp_alloc(type, 0); ++ if (mcgstate == NULL) ++ return NULL; ++ ++ elf_version(EV_CURRENT); ++ mcgstate->elf = elf_begin(fd, ELF_C_READ_MMAP, NULL); ++ if (mcgstate->elf == NULL) { ++ PyErr_Format(PyExc_TypeError, "File not usable: %s\n", elf_errmsg(-1)); ++ goto free; ++ } ++ ++ mcgstate->strtab = get_elf_sec_data(mcgstate->elf, STRT_SEC_NAME); ++ if (mcgstate->strtab == NULL) { ++ PyErr_Format(PyExc_TypeError, "Can't find ELF section `%s`\n", STRT_SEC_NAME); ++ goto elf_end; ++ } ++ ++ mcgstate->symtab = get_elf_sec_data(mcgstate->elf, SYMT_SEC_NAME); ++ if (mcgstate->symtab == NULL) { ++ PyErr_Format(PyExc_TypeError, "Can't find ELF section `%s`\n", SYMT_SEC_NAME); ++ goto elf_end; ++ } ++ ++ len = mcgstate->symtab->d_size / sizeof(GElf_Sym); ++ mcgstate->seq_index = len - 1; ++ mcgstate->enum_index = 0; ++ ++ return (PyObject *)mcgstate; ++ ++elf_end: ++ elf_end(mcgstate->elf); ++free: ++ type->tp_free(mcgstate); ++ return NULL; ++} ++ ++PyTypeObject PyModCRCsGen_Type = { ++ PyVarObject_HEAD_INIT(NULL, 0) ++ .tp_name = "modcrcs", ++ .tp_basicsize = sizeof(ModCRCsState), /* size of one instance, not of the type object */ ++ .tp_itemsize = 0, ++ .tp_dealloc = modcrcs_dealloc, ++ .tp_flags = Py_TPFLAGS_DEFAULT, ++ .tp_iter =
PyObject_SelfIter, ++ .tp_iternext = modcrcs_iternext, ++ .tp_alloc = PyType_GenericAlloc, ++ .tp_new = modcrcs_new, ++}; ++ ++/* Module structure */ ++/* Module structure */ ++static struct PyModuleDef modvers_module = { ++ PyModuleDef_HEAD_INIT, ++ .m_name = "modsym", ++ .m_doc = "iter `" VERS_SEC_NAME "` section items", ++ .m_size = -1, ++}; ++ ++/* Module initialization function */ ++PyMODINIT_FUNC PyInit_modsym(void) ++{ ++ PyObject *m = PyModule_Create(&modvers_module); ++ if (m == NULL) ++ return NULL; ++ ++ if (PyType_Ready(&PyModVersGen_Type) < 0) ++ return NULL; ++ ++ Py_INCREF(&PyModVersGen_Type); ++ if (PyModule_AddObject(m, PyModVersGen_Type.tp_name, ++ (PyObject *)&PyModVersGen_Type) < 0) ++ goto free_vers; ++ ++ if (PyType_Ready(&PyModCRCsGen_Type) < 0) ++ goto free_vers; ++ ++ Py_INCREF(&PyModCRCsGen_Type); ++ if (PyModule_AddObject(m, PyModCRCsGen_Type.tp_name, ++ (PyObject *)&PyModCRCsGen_Type) < 0) ++ goto free_crcs; ++ ++ return m; ++free_crcs: ++ Py_DECREF(&PyModCRCsGen_Type); ++free_vers: ++ Py_DECREF(&PyModVersGen_Type); ++ Py_DECREF(m); ++ return NULL; ++} +diff --git a/upgchk/lib/modsym.h b/upgchk/lib/modsym.h +new file mode 100644 +index 0000000..b8069c3 +--- /dev/null ++++ b/upgchk/lib/modsym.h +@@ -0,0 +1,39 @@ ++#ifndef __PYTHON_MODSYM_H__ ++#define __PYTHON_MODSYM_H__ ++ ++#include ++ ++typedef struct { ++ PyObject_HEAD ++ Py_ssize_t seq_index; ++ Py_ssize_t enum_index; ++ Elf *elf; ++ Elf_Data *d; ++} ModVersState; ++ ++#define VERS_SEC_NAME "__versions" ++ ++/* --- the following is copied from linux src --- */ ++#define MAX_PARAM_PREFIX_LEN (64 - sizeof(unsigned long)) ++#define MODULE_NAME_LEN MAX_PARAM_PREFIX_LEN ++ ++struct modversion_info { ++ unsigned long crc; ++ char name[MODULE_NAME_LEN]; ++}; ++/* --- end --- */ ++ ++typedef struct { ++ PyObject_HEAD ++ Py_ssize_t seq_index; ++ Py_ssize_t enum_index; ++ Elf *elf; ++ Elf_Data *strtab; ++ Elf_Data *symtab; ++} ModCRCsState; ++ ++#define STRT_SEC_NAME ".strtab" ++#define 
SYMT_SEC_NAME ".symtab" ++#define CRC_SYM_PREFIX "__crc_" ++ ++#endif /* __PYTHON_MODSYM_H__ */ +diff --git a/upgchk/setup.py b/upgchk/setup.py +new file mode 100644 +index 0000000..6758c95 +--- /dev/null ++++ b/upgchk/setup.py +@@ -0,0 +1,20 @@ ++#!/usr/bin/python3 ++# -*- coding: utf-8 -*- ++ ++from setuptools import setup, Extension ++ ++if __name__ == "__main__": ++ ++ setup(name="upgchk", ++ version="0.1", ++ description="Check the kernel upgrading environment", ++ ++ packages=["upgchk"], ++ ext_modules=[ ++ Extension("modsym", ++ sources=["lib/modsym.c"], ++ libraries=["elf"]) ++ ], ++ ++ python_requires='>=3.6', ++ ) +diff --git a/upgchk/upgchk/__init__.py b/upgchk/upgchk/__init__.py +new file mode 100644 +index 0000000..c831e1d +--- /dev/null ++++ b/upgchk/upgchk/__init__.py +@@ -0,0 +1,11 @@ ++# -*- coding: utf-8 -*- ++ ++""" ++.. module:: upgchk ++ :synopsis: Check the kernel upgrading environment ++""" ++ ++__title = "upgchk" ++__description = "Check the upgrade environment" ++__license__ = "GPL-2.0-or-later or LGPL-2.1-only" ++__version__ = "0.1" +diff --git a/upgchk/upgchk/kabichk.py b/upgchk/upgchk/kabichk.py +new file mode 100644 +index 0000000..cccacf3 +--- /dev/null ++++ b/upgchk/upgchk/kabichk.py +@@ -0,0 +1,163 @@ ++#!/usr/bin/python3 ++# -*- coding: utf-8 -*- ++ ++''' ++Theory: ++- compare CRC value between the known and the module ++- The export symbols CRC source: ++ * `/boot/symvers-.gz` for in tree modules and Image ++ - the ima mechanism could ensure the file credibility and non-tamper ++ * The `.symtab` section for out of tree modules ++ - name format: `__crc_` ++ - it's absolute value, means: `sym->st_shndx == SHN_ABS` ++ ++Design Details: ++- collect export symbols from ++ * collect in tree symbols from `/boot/symvers-.gz` ++ * collect out of tree module symbols from the module self ++- compare external symbols stored in `__versions` section for each module ++ ++`__versions` section data format: ++ ++ # define MAX_PARAM_PREFIX_LEN (64 - 
sizeof(unsigned long)) ++ # define MODULE_NAME_LEN MAX_PARAM_PREFIX_LEN ++ ++ struct modversion_info { ++ unsigned long crc; ++ char name[MODULE_NAME_LEN]; ++ }; ++ ++Usage: ++ python3 -m upgchk.kabichk \ ++ [[-r ],...] \ ++ [[-m ],...] \ ++ -c ++Example: ++ python3 -m upgchk.kabichk -c /lib/modules/$(uname -r)/kernel/fs/mbcache.ko ++ python3 -m upgchk.kabichk -m notify.ko -c osp_proc.ko ++''' ++ ++import argparse ++import gzip ++import pathlib ++import platform ++from typing import Tuple ++ ++import modsym ++ ++__all__ = ["KABI"] ++ ++ELF_SELFMAG = 4 ++ELF_ELFMAG = b"\177ELF" ++ ++ ++class KABI: ++ def __init__(self, version: str): ++ """ ++ read all symbols of the specific kernel ++ """ ++ self._symbols = dict() ++ filename = f"symvers-{version}.gz" ++ filepath = pathlib.Path("/boot/").joinpath(filename) ++ ++ with gzip.open(filepath, "rt") as f: ++ for line in f.readlines(): ++ # (crc, sym, loc, type) ++ (_crc, sym, loc, _) = line.split() ++ crc = int(_crc, 16) # convert hex crc to integer ++ self._insert(sym, (crc, sym, loc)) ++ ++ def _insert(self, key: str, val: Tuple[int, str, str]): ++ inst = self._symbols.get(key) ++ if inst is None: ++ self._symbols[key] = val ++ elif inst != val: ++ raise KeyError( ++ f"{key} already exits value {self._symbols[key]}, can't insert new value {val}") ++ ++ def _get(self, key: str) -> Tuple[int, str, str]: ++ return self._symbols.get(key) ++ ++ def _parse_mod_vers(self, filepath: pathlib.Path) -> Tuple[int, str]: ++ with open(filepath, "rb") as f: ++ magic = f.read(ELF_SELFMAG) ++ if magic != ELF_ELFMAG: ++ raise TypeError(f"{filepath} isn't an ELF file") ++ ++ for sym, crc in modsym.modvers(f): ++ yield (sym, crc) ++ ++ def check_mod_syms(self, filepath: pathlib.Path) -> Tuple[bool, str]: ++ if not filepath.exists(): ++ raise FileNotFoundError(f"{filepath} isn't found") ++ ++ for sym, crc in self._parse_mod_vers(filepath): ++ val = self._get(sym) ++ if val is None: ++ msg = f"symbol {sym} isn't known" ++ return (False, msg) 
++ elif val[0] != crc: ++ msg = f"symbol {sym} CRC should be {hex(crc)}, but {hex(val[0])}" ++ return (False, msg) ++ ++ return (True, "") ++ ++ def _parse_mod_crcs(self, filepath: pathlib.Path) -> Tuple[int, str]: ++ with open(filepath, "rb") as f: ++ magic = f.read(ELF_SELFMAG) ++ if magic != ELF_ELFMAG: ++ raise TypeError(f"{filepath} isn't an ELF file") ++ ++ for inst in modsym.modcrcs(f): ++ yield inst ++ ++ def add_mod_crcs(self, filepath: pathlib.Path): ++ if not filepath.exists(): ++ raise FileNotFoundError(f"{filepath} isn't found") ++ ++ modname = filepath.name[:-3] ++ for (sym, crc) in self._parse_mod_crcs(filepath): ++ self._insert(sym, (crc, sym, modname)) ++ ++ ++def parse_argument() -> argparse.Namespace: ++ parser = argparse.ArgumentParser() ++ parser.add_argument("-r", "--release", action="store", ++ required=False, default=platform.release(), ++ help="specific the kernel release version") ++ parser.add_argument("-m", "--module", action="append", ++ required=False, default=[], ++ help="specific the out of tree modules") ++ parser.add_argument("-c", "--check", action="append", ++ required=True, ++ help="specific the checked module, e.g. 
-c a.ko -c b.ko") ++ options = parser.parse_args() ++ return (options.release, options.module, options.check) ++ ++ ++def main(): ++ release, modules, checks = parse_argument() ++ kabi = KABI(release) ++ ++ for mod in modules: ++ filepath = pathlib.Path(mod) ++ kabi.add_mod_crcs(filepath) ++ ++ print("-------------- start check --------------") ++ passed = 0 ++ failed = 0 ++ for mod in checks: ++ filepath = pathlib.Path(mod) ++ modname = filepath.name ++ result, msg = kabi.check_mod_syms(filepath) ++ if not result: ++ print(f"module {modname} fail: {msg}") ++ failed += 1 ++ else: ++ print(f"module {modname} pass") ++ passed += 1 ++ print(f"-------------- {passed} pass, {failed} failed --------------") ++ ++ ++if __name__ == '__main__': ++ main() +-- +2.34.1 + diff --git a/criu.changes b/criu.changes new file mode 100644 index 0000000..35e5439 --- /dev/null +++ b/criu.changes @@ -0,0 +1,25 @@ +%changelog +* Wed Apr 13 2022 fu.lin - 3.16.1-3 +- backport kinds of feature/bugfix +- spec: split changelog + +* Fri Mar 4 2022 ningyu - 3.16.1-2 +- rseq c/r support + +* Thu Dec 2 2021 zhouwenpei - 3.16.1-1 +- upgrade criu version to 3.16.1 + +* Tue Sep 07 2021 chenchen - 3.15-4 +- add "-fstack-protector-strong" for libcriu.so.2.0 + +* Mon May 31 2021 baizhonggui - 3.15-3 +- Add gcc in BuildRequires + +* Thu Apr 08 2021 fu.lin - 3.15-1 +- bump the criu version to v3.15 + +* Tue Sep 22 2020 lingsheng - 3.13-7 +- Fix crit errors + +* Fri Apr 24 2020 wutao - 3.13-6 +- Package init diff --git a/criu.spec b/criu.spec index 9dace21..934eace 100644 --- a/criu.spec +++ b/criu.spec @@ -1,6 +1,6 @@ Name: criu Version: 3.16.1 -Release: 2 +Release: 3 Provides: crtools = %{version}-%{release} Obsoletes: crtools <= 1.0-2 Summary: A tool of Checkpoint/Restore in User-space @@ -9,28 +9,87 @@ URL: http://criu.org/ Source0: http://github.com/chechpoint-restore/criu/archive/v%{version}/%{name}-%{version}.tar.gz BuildRequires: systemd libnet-devel asciidoc xmlto perl-interpreter 
libselinux-devel gcc BuildRequires: protobuf-devel protobuf-c-devel python3-devel libnl3-devel libcap-devel +BuildRequires: libmnl-devel libnftnl-devel Recommends: tar ExclusiveArch: x86_64 %{arm} ppc64le aarch64 s390x Requires: %{name} = %{version}-%{release} Provides: %{name}-libs = %{version}-%{release} Obsoletes: %{name}-libs < %{version}-%{release} -Patch1: 0001-criu-dump-and-restore-cpu-affinity-of-each-thread.patch -Patch2: 0002-compel-add-rseq-syscall-into-compel-std-plugin-sysca.patch -Patch3: 0003-kerndat-check-for-rseq-syscall-support.patch -Patch4: 0004-util-move-fork_and_ptrace_attach-helper-from-cr-chec.patch -Patch5: 0005-cr-check-Add-ptrace-rseq-conf-dump-feature.patch -Patch6: 0006-rseq-initial-support.patch -Patch7: 0007-zdtm-add-simple-test-for-rseq-C-R.patch -Patch8: 0008-ci-add-Fedora-Rawhide-based-test-on-Cirrus.patch -Patch9: 0009-include-add-thread_pointer.h-from-Glibc.patch -Patch10: 0010-clone-noasan-unregister-rseq-at-the-thread-start-for.patch -Patch11: 0011-zdtm-static-rseq00-fix-rseq-test-when-linking-with-a.patch -Patch12: 0012-compel-add-helpers-to-get-set-instruction-pointer.patch -Patch13: 0013-cr-dump-fixup-thread-IP-when-inside-rseq-cs.patch -Patch14: 0014-zdtm-add-rseq-transition-test-for-amd64.patch -Patch15: 0015-cr-dump-handle-rseq-flags-field.patch -Patch16: 0016-zdtm-add-rseq02-transition-test-with-NO_RESTART-CS-f.patch +Patch: 0001-criu-dump-and-restore-cpu-affinity-of-each-thread.patch +Patch: 0002-compel-add-rseq-syscall-into-compel-std-plugin-sysca.patch +Patch: 0003-kerndat-check-for-rseq-syscall-support-Signed-off-by.patch +Patch: 0004-util-move-fork_and_ptrace_attach-helper-from-cr-chec.patch +Patch: 0005-cr-check-Add-ptrace-rseq-conf-dump-feature-Add-get_r.patch +Patch: 0006-rseq-initial-support-TODO-1.-properly-handle-case-wh.patch +Patch: 0007-zdtm-add-simple-test-for-rseq-C-R-Signed-off-by-Alex.patch +Patch: 0008-ci-add-Fedora-Rawhide-based-test-on-Cirrus-We-have-a.patch +Patch: 
0009-include-add-thread_pointer.h-from-Glibc-Implementati.patch +Patch: 0010-clone-noasan-unregister-rseq-at-the-thread-start-for.patch +Patch: 0011-zdtm-static-rseq00-fix-rseq-test-when-linking-with-a.patch +Patch: 0012-compel-add-helpers-to-get-set-instruction-pointer-Si.patch +Patch: 0013-cr-dump-fixup-thread-IP-when-inside-rseq-cs-Signed-o.patch +Patch: 0014-zdtm-add-rseq-transition-test-for-amd64-Signed-off-b.patch +Patch: 0015-cr-dump-handle-rseq-flags-field-Userspace-may-config.patch +Patch: 0016-zdtm-add-rseq02-transition-test-with-NO_RESTART-CS-f.patch +Patch: 0017-zdtm-fix-zdtm-static-maps00-case-in-arm64.patch +Patch: 0018-test-flush-ipt-rules-after-program-exits.patch +Patch: 0019-zdtm-fix-cleaning-step-of-zdtm_netns.patch +%ifarch aarch64 +Patch: 0020-mm-add-pin-memory-method-for-criu.patch +Patch: 0021-pid-add-pid-recover-method-for-criu.patch +Patch: 0022-notifier-add-notifier-calling-method-for-checkpoint-.patch +Patch: 0023-block-device-dump-block-device-as-reguler-file.patch +Patch: 0024-anon-inode-add-support-for-anon-inode-fd.patch +Patch: 0025-char_dev-add-support-for-char-device-dump-and-restor.patch +Patch: 0026-improve-char-dev-fd-check-and-repair-method.patch +Patch: 0027-mmap-restore-dev-hisi_sec2-deivce-vma.patch +Patch: 0028-infiniband-fix-the-infiniband-fd-conflict.patch +Patch: 0029-cred-provide-cred-checkpoint-restore-method.patch +Patch: 0030-socket-fix-connect-error-of-invalid-param.patch +Patch: 0031-criu-eventpollfd-fix-for-improper-usage-in-appdata.patch +Patch: 0032-task_exit_notify-add-task-exit-notify-mask-method-fo.patch +Patch: 0033-unix-socket-add-support-for-unix-stream-socket.patch +Patch: 0034-netlink-add-repair-modes-and-clear-resource-when-fai.patch +Patch: 0035-sysvshm-add-dump-restore-sysv-shm-in-host-ipc-ns.patch +Patch: 0036-add-O_REPAIR-flag-to-vma-fd.patch +Patch: 0037-looser-file-mode-and-size-check.patch +Patch: 0038-file-lock-add-repair-mode-to-dump-file-locks.patch +Patch: 
0039-unlock-network-when-restore-fails.patch +Patch: 0040-net-add-shared-socket-recover-method-for-criu.patch +Patch: 0041-tcp-save-src-ports-to-ip_local_reserved_ports-when-d.patch +Patch: 0042-reg-file-fix-dump-fail-problem-with-null-seek-op.patch +Patch: 0043-fix-dump-fail-problem-with-no-access-to-get-socket-f.patch +Patch: 0044-proc-parse-fix-vma-offset-value-for-the-sysfs-file-o.patch +Patch: 0045-add-reuse-file-method-for-recover-deleted-file-state.patch +Patch: 0046-sk-fix-share-sockets-repair-problem.patch +Patch: 0047-mm-add-clear-pin-mem-and-init-page-map-option.patch +Patch: 0048-fds-fix-fds-list-restore.patch +Patch: 0049-log-print-error-log-to-dev-kmsg.patch +Patch: 0050-unix-sk-improve-dgram-robustness.patch +Patch: 0051-sk-ignore-the-bind-error-for-icmp-socket.patch +Patch: 0052-optimization-parallel-collecting-vmas.patch +Patch: 0053-mm-add-exec-file-mapping-pin-method.patch +Patch: 0054-ptrace-trace-specific-syscall.patch +Patch: 0055-notifier-rollback-when-open-img-failed.patch +Patch: 0056-detach-don-t-kill-task-when-ptrace-PTRACE_DETACH-ret.patch +Patch: 0057-build-add-secure-compilation-options.patch +Patch: 0058-nftables-add-mnl-api.patch +Patch: 0059-nftables-implement-nft-api-for-tcp.patch +Patch: 0060-net-switch-to-nftables-API.patch +Patch: 0061-zdtm-unlink-kdat-before-testing.patch +Patch: 0062-zdtm-add-host-ns-sysvshm-ipc-case.patch +Patch: 0063-zdtm-add-pinmem-testcase.patch +Patch: 0064-zdtm-init-notifier-testcase.patch +Patch: 0065-zdtm-print-errno-info-when-accessing-.out-failure.patch +Patch: 0066-zdtm-print-more-info-for-fs.c.patch +Patch: 0067-zdtm-add-chardev-testcase.patch +Patch: 0068-zdtm-add-infiniband-testcase.patch +Patch: 0069-zdtm-add-share-port-testcase.patch +Patch: 0070-zdtm-tmp-test-script.patch +Patch: 0071-mod-add-criu-indepent-test.patch +Patch: 0072-kabichk-add-KABI-check-code.patch +%endif %description Checkpoint/Restore in Userspace(CRIU),is a software tool for the linux operating system. 
@@ -113,23 +172,3 @@ chmod 0755 %{buildroot}/run/%{name}/ %doc %{_mandir}/man1/{compel.1*,crit.1*,criu-ns.1*} %changelog -* Fri Mar 4 2022 ningyu - 3.16.1-2 -- rseq c/r support - -* Thu Dec 2 2021 zhouwenpei - 3.16.1-1 -- upgrade criu version to 3.16.1 - -* Tue Sep 07 2021 chenchen - 3.15-4 -- add "-fstack-protector-strong" for libcriu.so.2.0 - -* Mon May 31 2021 baizhonggui - 3.15-3 -- Add gcc in BuildRequires - -* Thu Apr 08 2021 fu.lin - 3.15-1 -- bump the criu version to v3.15 - -* Tue Sep 22 2020 lingsheng - 3.13-7 -- Fix crit errors - -* Fri Apr 24 2020 wutao - 3.13-6 -- Package init -- Gitee