diff --git a/0004-criu-dump-and-restore-cpu-affinity-of-each-thread.patch b/backport-0001--criu-dump-and-restore-cpu-affinity-of-each-thread.patch similarity index 91% rename from 0004-criu-dump-and-restore-cpu-affinity-of-each-thread.patch rename to backport-0001--criu-dump-and-restore-cpu-affinity-of-each-thread.patch index 0384f1ed322249f28708a7b755cb351185592962..1cdc5fc3875690a8d0b0b943390d1ffc413a4831 100644 --- a/0004-criu-dump-and-restore-cpu-affinity-of-each-thread.patch +++ b/backport-0001--criu-dump-and-restore-cpu-affinity-of-each-thread.patch @@ -1,7 +1,7 @@ -From 4a49af49be378835b65016d5465eae44107a52e1 Mon Sep 17 00:00:00 2001 +From 7304583016558035439f77bb9b49013c05ac9bab Mon Sep 17 00:00:00 2001 From: "fu.lin" Date: Tue, 13 Apr 2021 10:39:45 +0800 -Subject: [PATCH 4/6] criu: dump and restore cpu affinity of each thread +Subject: [PATCH 01/50] criu: dump and restore cpu affinity of each thread Criu should dump and restore threads' or processes' cpu affinity. @@ -15,25 +15,26 @@ Add option --with-cpu-affinity to enable this function at restore. 
Signed-off-by: Sang Yan +Signed-off-by: fu.lin --- - compel/arch/arm/plugins/std/syscalls/syscall.def | 1 + - .../ppc64/plugins/std/syscalls/syscall-ppc64.tbl | 1 + - .../s390/plugins/std/syscalls/syscall-s390.tbl | 1 + - .../arch/x86/plugins/std/syscalls/syscall_32.tbl | 1 + - .../arch/x86/plugins/std/syscalls/syscall_64.tbl | 1 + - criu/config.c | 1 + - criu/cr-dump.c | 14 ++++++++ - criu/cr-restore.c | 26 ++++++++++++++ - criu/crtools.c | 2 ++ - criu/include/cr_options.h | 2 ++ - criu/include/restorer.h | 3 ++ - criu/pie/restorer.c | 38 ++++++++++++++++++++ - criu/pstree.c | 7 ++++ - images/core.proto | 5 +++ - test/zdtm/static/Makefile | 1 + - test/zdtm/static/cpu-affinity0.c | 42 ++++++++++++++++++++++ - test/zdtm/static/cpu-affinity0.desc | 1 + - 17 files changed, 147 insertions(+) + .../arch/arm/plugins/std/syscalls/syscall.def | 1 + + .../plugins/std/syscalls/syscall-ppc64.tbl | 1 + + .../plugins/std/syscalls/syscall-s390.tbl | 1 + + .../x86/plugins/std/syscalls/syscall_32.tbl | 1 + + .../x86/plugins/std/syscalls/syscall_64.tbl | 1 + + criu/config.c | 1 + + criu/cr-dump.c | 14 +++++++ + criu/cr-restore.c | 29 +++++++++++++ + criu/crtools.c | 2 + + criu/include/cr_options.h | 2 + + criu/include/restorer.h | 3 ++ + criu/pie/restorer.c | 38 +++++++++++++++++ + criu/pstree.c | 7 ++++ + images/core.proto | 5 +++ + test/zdtm/static/Makefile | 1 + + test/zdtm/static/cpu-affinity0.c | 42 +++++++++++++++++++ + test/zdtm/static/cpu-affinity0.desc | 1 + + 17 files changed, 150 insertions(+) create mode 100644 test/zdtm/static/cpu-affinity0.c create mode 100644 test/zdtm/static/cpu-affinity0.desc @@ -133,7 +134,7 @@ index b9d2914..f078c27 100644 } diff --git a/criu/cr-restore.c b/criu/cr-restore.c -index 589087f..da2e53d 100644 +index 589087f..1374a69 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -118,6 +118,7 @@ static int prepare_restorer_blob(void); @@ -154,7 +155,7 @@ index 589087f..da2e53d 100644 if (prepare_posix_timers(pid, ta, core)) return -1; -@@ 
-3196,6 +3200,27 @@ out: +@@ -3196,6 +3200,30 @@ out: return ret; } @@ -167,6 +168,9 @@ index 589087f..da2e53d 100644 + ta->allowed_cpus = (char *)rst_mem_align_cpos(RM_PRIVATE); + + need_cpu_affinity = rst_mem_alloc(sizeof(int), RM_PRIVATE); ++ if (need_cpu_affinity == NULL) ++ return -1; ++ + *need_cpu_affinity = opts.with_cpu_affinity; + + for (i = 0; i < current->nr_threads; i++) { @@ -182,7 +186,7 @@ index 589087f..da2e53d 100644 extern void __gcov_flush(void) __attribute__((weak)); void __gcov_flush(void) {} -@@ -3655,6 +3680,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns +@@ -3655,6 +3683,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns RST_MEM_FIXUP_PPTR(task_args->timerfd); RST_MEM_FIXUP_PPTR(task_args->posix_timers); RST_MEM_FIXUP_PPTR(task_args->siginfo); @@ -191,7 +195,7 @@ index 589087f..da2e53d 100644 RST_MEM_FIXUP_PPTR(task_args->helpers); RST_MEM_FIXUP_PPTR(task_args->zombies); diff --git a/criu/crtools.c b/criu/crtools.c -index 2eb5dba..0f04a85 100644 +index 2eb5dba..949dc9f 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -441,6 +441,8 @@ usage: @@ -414,5 +418,5 @@ index 0000000..0d0b8ae @@ -0,0 +1 @@ +{'dopts': '', 'ropts': '--with-cpu-affinity', 'flags': 'reqrst '} -- -1.8.3.1 +2.34.0 diff --git a/backport-0002--build-add-secure-compilation-options.patch b/backport-0002--build-add-secure-compilation-options.patch new file mode 100644 index 0000000000000000000000000000000000000000..cb5a52c7b3ff9bce253f42ee0f593e1fc3f1571f --- /dev/null +++ b/backport-0002--build-add-secure-compilation-options.patch @@ -0,0 +1,112 @@ +From ddc802a3e4a7086eda1084b94a366ca967fc39fb Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 19 May 2021 22:49:57 +0800 +Subject: [PATCH 02/50] build: add secure compilation options + +Add secure compilation options: +-fstack-protector -fstack-protector-all +-Wl,-z,relro,-z,now,-z,noexecstack + +Signed-off-by: Fu Lin +--- + Makefile | 4 ++++ + 
criu/Makefile | 2 +- + criu/pie/Makefile | 1 + + criu/pie/Makefile.library | 2 ++ + lib/Makefile | 1 + + lib/c/Makefile | 2 +- + scripts/nmk/scripts/build.mk | 5 +++-- + 7 files changed, 13 insertions(+), 4 deletions(-) + +diff --git a/Makefile b/Makefile +index c33494b..a9d7d94 100644 +--- a/Makefile ++++ b/Makefile +@@ -80,6 +80,10 @@ ifeq ($(ARCH),mips) + DEFINES := -DCONFIG_MIPS + endif + ++# secure compilation options ++CFLAGS += -fstack-protector-all -fPIE ++LDFLAGS += -pie ++ + # + # CFLAGS_PIE: + # +diff --git a/criu/Makefile b/criu/Makefile +index ceb49ce..0fabffc 100644 +--- a/criu/Makefile ++++ b/criu/Makefile +@@ -85,7 +85,7 @@ $(obj)/%: pie + + $(obj)/criu: $(PROGRAM-BUILTINS) + $(call msg-link, $@) +- $(Q) $(CC) $(CFLAGS) $^ $(LIBS) $(WRAPFLAGS) $(LDFLAGS) $(GMONLDOPT) -rdynamic -o $@ ++ $(Q) $(CC) $(CFLAGS) $^ $(LIBS) $(WRAPFLAGS) $(LDFLAGS) $(GMONLDOPT) -rdynamic -o $@ -Wl,-z,relro,-z,now,-z,noexecstack -fPIE -pie + + + # +diff --git a/criu/pie/Makefile b/criu/pie/Makefile +index 265dcf8..40b5804 100644 +--- a/criu/pie/Makefile ++++ b/criu/pie/Makefile +@@ -6,6 +6,7 @@ target := parasite restorer + + CFLAGS := $(filter-out -pg $(CFLAGS-GCOV) $(CFLAGS-ASAN),$(CFLAGS)) + CFLAGS += $(CFLAGS_PIE) ++CFLAGS := $(filter-out -fstack-protector -fstack-protector-all,$(CFLAGS)) + ccflags-y += -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0 + ccflags-y += -Wp,-U_FORTIFY_SOURCE -Wp,-D_FORTIFY_SOURCE=0 + +diff --git a/criu/pie/Makefile.library b/criu/pie/Makefile.library +index da2a2fa..c022d06 100644 +--- a/criu/pie/Makefile.library ++++ b/criu/pie/Makefile.library +@@ -27,3 +27,5 @@ CFLAGS += $(CFLAGS_PIE) + ifeq ($(ARCH),mips) + CFLAGS += -fno-stack-protector -DCR_NOGLIBC -mno-abicalls -fno-pic + endif ++ ++CFLAGS := $(filter-out -fstack-protector -fstack-protector-all,$(CFLAGS)) +diff --git a/lib/Makefile b/lib/Makefile +index f9b6670..bc1b513 100644 +--- a/lib/Makefile ++++ b/lib/Makefile +@@ -14,6 +14,7 @@ lib/c/Makefile: ; + lib/c/%: .FORCE + $(Q) $(MAKE) 
$(build)=lib/c $@ + ++CFLAGS := $(filter-out -fPIE,$(CFLAGS)) + cflags-so += $(CFLAGS) -rdynamic -Wl,-soname,$(CRIU_SO).$(CRIU_SO_VERSION_MAJOR) + ldflags-so += -lprotobuf-c + +diff --git a/lib/c/Makefile b/lib/c/Makefile +index af01467..d7f6491 100644 +--- a/lib/c/Makefile ++++ b/lib/c/Makefile +@@ -4,5 +4,5 @@ obj-y += ./images/rpc.pb-c.o + ccflags-y += -iquote criu/$(ARCH_DIR)/include + ccflags-y += -iquote criu/include + ccflags-y += -iquote images +-ccflags-y += -fPIC -fno-stack-protector ++ccflags-y += -fPIC + ldflags-y += -r -z noexecstack +diff --git a/scripts/nmk/scripts/build.mk b/scripts/nmk/scripts/build.mk +index d01d2b7..6f366d7 100644 +--- a/scripts/nmk/scripts/build.mk ++++ b/scripts/nmk/scripts/build.mk +@@ -15,8 +15,9 @@ lib-name := + lib-target := + hostprogs-y := + libso-y := +-ld_flags := +-ldflags-so := ++ld_flags := -Wl,-z,relro,-z,now,-z,noexecstack ++ldflags-so := -Wl,-z,relro,-z,now,-z,noexecstack ++ldflags-y := -z relro -z now -z noexecstack + arflags-y := + target := + deps-y := +-- +2.34.0 + diff --git a/backport-0003--tty-fix-NULL-pointer-access-in-tty.patch b/backport-0003--tty-fix-NULL-pointer-access-in-tty.patch new file mode 100644 index 0000000000000000000000000000000000000000..bdfac3dc8ab406047c1cf3fd25f1d02d6b9ce044 --- /dev/null +++ b/backport-0003--tty-fix-NULL-pointer-access-in-tty.patch @@ -0,0 +1,29 @@ +From 94932898b10ebfadc1013924fbec05740989e49a Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Thu, 15 Jul 2021 11:00:25 +0800 +Subject: [PATCH 03/50] tty: fix NULL pointer access in tty + +Signed-off-by: fu.lin +--- + criu/tty.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/criu/tty.c b/criu/tty.c +index dee8d46..b34cfc2 100644 +--- a/criu/tty.c ++++ b/criu/tty.c +@@ -2023,6 +2023,11 @@ static int dump_one_tty(int lfd, u32 id, const struct fd_parms *p) + pr_info("Dumping tty %d with id %#x\n", lfd, id); + + driver = get_tty_driver(p->stat.st_rdev, p->stat.st_dev); ++ if (driver == NULL) { ++ pr_err("Can't get 
tty driver\n"); ++ return -1; ++ } ++ + if (driver->fd_get_index) + index = driver->fd_get_index(lfd, p); + else +-- +2.34.0 + diff --git a/backport-0004--namespaces-drop-func-address-print-to-make-someone-h.patch b/backport-0004--namespaces-drop-func-address-print-to-make-someone-h.patch new file mode 100644 index 0000000000000000000000000000000000000000..e1886f3ec329cdace4bdbfb12548e8695aae86e9 --- /dev/null +++ b/backport-0004--namespaces-drop-func-address-print-to-make-someone-h.patch @@ -0,0 +1,31 @@ +From 17102bcd96da8b98815a08fc72c93358383780ff Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Thu, 15 Jul 2021 11:10:46 +0800 +Subject: [PATCH 04/50] namespaces: drop func address print to make someone + happy + +Signed-off-by: fu.lin +--- + criu/namespaces.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/criu/namespaces.c b/criu/namespaces.c +index 796f412..9ffcd16 100644 +--- a/criu/namespaces.c ++++ b/criu/namespaces.c +@@ -1294,10 +1294,10 @@ static int usernsd(int sk) + } + + unsc_msg_pid_fd(&um, &pid, &fd); +- pr_debug("uns: daemon calls %p (%d, %d, %x)\n", call, pid, fd, flags); ++ pr_debug("uns: daemon calls (%d, %d, %x)\n", pid, fd, flags); + + if (fd < 0 && flags & UNS_FDOUT) { +- pr_err("uns: bad flags/fd %p %d %x\n", call, fd, flags); ++ pr_err("uns: bad flags/fd %d %x\n", fd, flags); + BUG(); + } + +-- +2.34.0 + diff --git a/0006-criu-add-pin-memory-method.patch b/backport-0005--mm-add-pin-memory-method-for-criu.patch similarity index 53% rename from 0006-criu-add-pin-memory-method.patch rename to backport-0005--mm-add-pin-memory-method-for-criu.patch index e29a0e6a7584a68fa829ca6abc5b045af8c0c4e8..f7353ba3fe742c708d74dad21421e0f90d4cd5a0 100644 --- a/0006-criu-add-pin-memory-method.patch +++ b/backport-0005--mm-add-pin-memory-method-for-criu.patch @@ -1,27 +1,24 @@ -From 4c11832330e6c7b924b96c7ea70c14025fe0d970 Mon Sep 17 00:00:00 2001 -From: "fu.lin" -Date: Tue, 13 Apr 2021 14:10:23 +0800 -Subject: [PATCH 6/6] criu: add 
pin memory method +From cf676eeff74e4f872fc07fcc705b9762c0105a38 Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Fri, 23 Apr 2021 21:22:08 +0800 +Subject: [PATCH 05/50] mm: add pin memory method for criu -We can use the checkpoint and restore in userspace method to dump -and restore tasks when updating the kernel. Currently, criu needs -dump all memory data of tasks to files. When the memory size is -very large (large than 1GiB), the cost time of the dumping data -will be very long (more than 1 min). - -We can pin the memory data of tasks and collect the corresponding -physical pages mapping info in checkpoint process, and remap the -physical pages to restore tasks in restore process. +Add pin memory method for criu to improve memory recover +speed and avoid user private data saving to files. Signed-off-by: Jingxian He --- - criu/config.c | 1 + - criu/cr-restore.c | 5 +++ - criu/include/cr_options.h | 1 + - criu/include/restorer.h | 24 ++++++++++++ - criu/mem.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++- - criu/pie/restorer.c | 21 ++++++++++- - 6 files changed, 146 insertions(+), 2 deletions(-) + criu/config.c | 1 + + criu/cr-dump.c | 5 ++ + criu/cr-restore.c | 5 ++ + criu/crtools.c | 3 +- + criu/include/cr_options.h | 1 + + criu/include/mem.h | 2 + + criu/include/restorer.h | 28 ++++++++ + criu/mem.c | 130 +++++++++++++++++++++++++++++++++++++- + criu/pie/restorer.c | 25 +++++++- + criu/seize.c | 6 ++ + 10 files changed, 203 insertions(+), 3 deletions(-) diff --git a/criu/config.c b/criu/config.c index 5a53256..61b81fa 100644 @@ -35,11 +32,27 @@ index 5a53256..61b81fa 100644 { }, }; +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index f078c27..8575516 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -1778,6 +1778,11 @@ static int cr_dump_finish(int ret) + close_service_fd(CR_PROC_FD_OFF); + close_image_dir(); + ++ if (ret == 0 && opts.pin_memory) { ++ pr_info("start restore_task_special_pages\n"); ++ restore_task_special_pages(0); ++ } ++ + if 
(ret) { + pr_err("Dumping FAILED.\n"); + } else { diff --git a/criu/cr-restore.c b/criu/cr-restore.c -index da2e53d..ff41976 100644 +index 1374a69..27f3c54 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c -@@ -3866,6 +3866,11 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns +@@ -3869,6 +3869,11 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns task_args->clone_restore_fn, task_args->thread_args); @@ -51,6 +64,21 @@ index da2e53d..ff41976 100644 /* * An indirect call to task_restore, note it never returns * and restoring core is extremely destructive. +diff --git a/criu/crtools.c b/criu/crtools.c +index 949dc9f..7bda86d 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -441,8 +441,9 @@ usage: + " --file-validation METHOD\n" + " pass the validation method to be used; argument\n" + " can be 'filesize' or 'buildid' (default).\n" +-" --with-cpu-affinity Allow to restore cpu affinity. Only for hosts with\n" ++" --with-cpu-affinity Allow to restore cpu affinity. 
Only for hosts with\n" + " same cpu quantity.\n" ++" --pin-memory Use pin memory method for checkpoint and restore.\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index fda54a4..a4dc5b8 100644 --- a/criu/include/cr_options.h @@ -63,8 +91,19 @@ index fda54a4..a4dc5b8 100644 }; extern struct cr_options opts; +diff --git a/criu/include/mem.h b/criu/include/mem.h +index 251cb1a..3b3fdf8 100644 +--- a/criu/include/mem.h ++++ b/criu/include/mem.h +@@ -50,4 +50,6 @@ int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta); + int unmap_guard_pages(struct pstree_item *t); + int prepare_mappings(struct pstree_item *t); + bool should_dump_page(VmaEntry *vmae, u64 pme); ++int dump_task_special_pages(int pid); ++int restore_task_special_pages(int pid); + #endif /* __CR_MEM_H__ */ diff --git a/criu/include/restorer.h b/criu/include/restorer.h -index bd6ef6a..fc37e6d 100644 +index bd6ef6a..9614720 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -225,6 +225,7 @@ struct task_restore_args { @@ -75,22 +114,26 @@ index bd6ef6a..fc37e6d 100644 } __aligned(64); /* -@@ -317,4 +318,27 @@ enum { +@@ -317,4 +318,31 @@ enum { #define __r_sym(name) restorer_sym ## name #define restorer_sym(rblob, name) (void*)(rblob + __r_sym(name)) +#define PIN_MEM_FILE "/dev/pinmem" +#define PIN_MEM_MAGIC 0x59 -+#define _SET_PIN_MEM_AREA 1 -+#define _CLEAR_PIN_MEM_AREA 2 -+#define _REMAP_PIN_MEM_AREA 3 -+#define _PIN_MEM_IOC_MAX_NR 4 ++#define _SET_PIN_MEM_AREA 1 ++#define _CLEAR_PIN_MEM_AREA 2 ++#define _REMAP_PIN_MEM_AREA 3 ++#define _DUMP_SEPCIAL_PAGES 6 ++#define _RETORE_SEPCIAL_PAGES 7 +#define SET_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _SET_PIN_MEM_AREA, struct pin_mem_area_set) +#define CLEAR_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _CLEAR_PIN_MEM_AREA, int) +#define REMAP_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _REMAP_PIN_MEM_AREA, int) ++#define 
DUMP_SEPCIAL_PAGES _IOW(PIN_MEM_MAGIC, _DUMP_SEPCIAL_PAGES, int) ++#define RETORE_SEPCIAL_PAGES _IOW(PIN_MEM_MAGIC, _RETORE_SEPCIAL_PAGES, int) + +#define ONCE_PIN_MEM_SIZE_LIMIT 32 * 1024 * 1024 +#define MAX_PIN_MEM_AREA_NUM 16 ++ +struct pin_mem_area { + unsigned long virt_start; + unsigned long virt_end; @@ -104,10 +147,10 @@ index bd6ef6a..fc37e6d 100644 + #endif /* __CR_RESTORER_H__ */ diff --git a/criu/mem.c b/criu/mem.c -index 167838b..709de4e 100644 +index 167838b..2eabb8d 100644 --- a/criu/mem.c +++ b/criu/mem.c -@@ -438,6 +438,88 @@ again: +@@ -438,6 +438,119 @@ again: return ret; } @@ -131,10 +174,8 @@ index 167838b..709de4e 100644 + + if (vma_entry_is(vmae, VMA_AREA_AIORING)) + return false; -+ if (vma_entry_is(vmae, VMA_ANON_PRIVATE)) { -+ pr_debug("find private anon vma: %lx-%lx\n", vmae->start, vmae->end); ++ if (vma_entry_is(vmae, VMA_ANON_PRIVATE)) + return true; -+ } + + return false; +} @@ -155,7 +196,6 @@ index 167838b..709de4e 100644 + pma = &(pmas.mem_area[index]); + pma->virt_start = start; + pma->virt_end = next; -+ pr_info("start pin %lx-%lx\n", start, next); + index++; + start += ONCE_PIN_MEM_SIZE_LIMIT; + if (index >= MAX_PIN_MEM_AREA_NUM) @@ -164,7 +204,6 @@ index 167838b..709de4e 100644 + *pend = next; + pmas.area_num = index; + pmas.pid = vpid(item); -+ pr_info("begin pin memory for pid:%d\n", pmas.pid); + ret = ioctl(fd, SET_PIN_MEM_AREA, &pmas); + if (ret < 0) + pr_err("pin mem fail, errno: %s\n", strerror(errno)); @@ -192,11 +231,46 @@ index 167838b..709de4e 100644 + close(fd); + return ret; +} ++ ++int dump_task_special_pages(int pid) ++{ ++ int fd, ret; ++ ++ fd = open(PIN_MEM_FILE, O_RDWR, 0); ++ if (fd < 0) { ++ pr_warn("error open file: %s\n", PIN_MEM_FILE); ++ return -1; ++ } ++ ret = ioctl(fd, DUMP_SEPCIAL_PAGES, (unsigned long) &pid); ++ if (ret < 0) { ++ pr_warn("No need DUMP_SEPCIAL_PAGES for %d\n", pid); ++ } ++ close(fd); ++ return ret; ++} ++ ++int restore_task_special_pages(int pid) ++{ ++ int fd, ret; ++ ++ fd = 
open(PIN_MEM_FILE, O_RDWR, 0); ++ if (fd < 0) { ++ pr_warn("error open file: %s\n", PIN_MEM_FILE); ++ return -1; ++ } ++ ret = ioctl(fd, RETORE_SEPCIAL_PAGES, (unsigned long) &pid); ++ if (ret < 0) { ++ pr_warn("No need RETORE_SEPCIAL_PAGES for %d\n", pid); ++ } ++ close(fd); ++ return ret; ++} ++ + static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasite_dump_pages_args *args, struct vm_area_list *vma_area_list, -@@ -513,7 +595,16 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, +@@ -513,7 +626,18 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, if (possible_pid_reuse == -1) goto out_xfer; } @@ -206,26 +280,29 @@ index 167838b..709de4e 100644 + list_for_each_entry(vma_area, &vma_area_list->h, list) { + if (should_pin_vmae(vma_area->e)) { + ret = pin_vmae(vma_area->e, item); -+ if (ret) ++ if (ret) { ++ exit_code = -1; + goto out_xfer; ++ } + } + } + } /* * Step 1 -- generate the pagemap -@@ -524,6 +615,9 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, +@@ -524,6 +648,10 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, parent_predump_mode = mdc->parent_ie->pre_dump_mode; list_for_each_entry(vma_area, &vma_area_list->h, list) { -+ if (opts.pin_memory && should_pin_vmae(vma_area->e)) ++ if (opts.pin_memory && should_pin_vmae(vma_area->e)) { + continue; ++ } + ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl, &pmc, has_parent, mdc->pre_dump, parent_predump_mode); diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c -index c63f96b..f3bd541 100644 +index c63f96b..1565e3c 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1414,6 +1414,24 @@ int cleanup_current_inotify_events(struct task_restore_args *task_args) @@ -253,16 +330,44 @@ index c63f96b..f3bd541 100644 /* * The main routine to restore task via sigreturn. 
* This one is very special, we never return there -@@ -1585,7 +1603,8 @@ long __export_restore_task(struct task_restore_args *args) +@@ -1585,7 +1603,12 @@ long __export_restore_task(struct task_restore_args *args) goto core_restore_end; } } - -+ if (args->pin_memory) -+ remap_vmas(my_pid); ++ if (args->pin_memory) { ++ if (remap_vmas(my_pid) < 0) { ++ pr_err("Remap vmas fail\n"); ++ goto core_restore_end; ++ } ++ } /* * Now read the contents (if any) */ +diff --git a/criu/seize.c b/criu/seize.c +index f973806..a661097 100644 +--- a/criu/seize.c ++++ b/criu/seize.c +@@ -23,6 +23,7 @@ + #include "string.h" + #include "xmalloc.h" + #include "util.h" ++#include "mem.h" + + #define NR_ATTEMPTS 5 + +@@ -655,6 +656,11 @@ static void unseize_task_and_threads(const struct pstree_item *item, int st) + if (item->pid->state == TASK_DEAD) + return; + ++ if (opts.pin_memory) { ++ for (i = 0; i < item->nr_threads; i++) ++ dump_task_special_pages(item->threads[i].real); ++ } ++ + /* + * The st is the state we want to switch tasks into, + * the item->state is the state task was in when we seized one. 
-- -1.8.3.1 +2.34.0 diff --git a/0007-criu-add-pid-recover-method-for-criu.patch b/backport-0006--pid-add-pid-recover-method-for-criu.patch similarity index 82% rename from 0007-criu-add-pid-recover-method-for-criu.patch rename to backport-0006--pid-add-pid-recover-method-for-criu.patch index 6e2387330355d8a8bc897c0acf074afa0c0e2a7e..5d1b9f48636e25d0ddbc53cc42e781da01ce6e8a 100644 --- a/0007-criu-add-pid-recover-method-for-criu.patch +++ b/backport-0006--pid-add-pid-recover-method-for-criu.patch @@ -1,7 +1,7 @@ -From 9f32d95524683ae3644066eba4abb3227fe47c65 Mon Sep 17 00:00:00 2001 +From b7524ec1782730465d4c11c3885c244622ba77ac Mon Sep 17 00:00:00 2001 From: Jingxian He -Date: Fri, 23 Jul 2021 10:40:13 +0800 -Subject: [PATCH] add pid recover method +Date: Wed, 19 May 2021 21:33:22 +0800 +Subject: [PATCH 06/50] pid: add pid recover method for criu The default pid recover method cannot recover the task pid at every time. @@ -17,9 +17,9 @@ Signed-off-by: Jingxian He criu/cr-restore.c | 25 ++++++++++++++++++++++++- criu/crtools.c | 1 + criu/include/cr_options.h | 1 + - criu/include/restorer.h | 4 +++- + criu/include/restorer.h | 3 +++ criu/pie/restorer.c | 25 ++++++++++++++++++++++++- - 6 files changed, 54 insertions(+), 3 deletions(-) + 6 files changed, 54 insertions(+), 2 deletions(-) diff --git a/criu/config.c b/criu/config.c index 61b81fa..a5bcf10 100644 @@ -34,7 +34,7 @@ index 61b81fa..a5bcf10 100644 }; diff --git a/criu/cr-restore.c b/criu/cr-restore.c -index ff41976..6977443 100644 +index 27f3c54..e050b88 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1365,6 +1365,23 @@ static int set_next_pid(void *arg) @@ -82,7 +82,7 @@ index ff41976..6977443 100644 /* * Some kernel modules, such as network packet generator * run kernel thread upon net-namespace creation taking -@@ -3870,6 +3892,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns +@@ -3873,6 +3895,7 @@ static int sigreturn_restore(pid_t pid, struct 
task_restore_args *task_args, uns task_args->pin_memory = true; else task_args->pin_memory = false; @@ -91,13 +91,13 @@ index ff41976..6977443 100644 /* * An indirect call to task_restore, note it never returns diff --git a/criu/crtools.c b/criu/crtools.c -index 949dc9f..c33902a 100644 +index 7bda86d..9b3ef33 100644 --- a/criu/crtools.c +++ b/criu/crtools.c -@@ -443,6 +443,7 @@ usage: - " can be 'filesize' or 'buildid' (default).\n" - " --with-cpu-affinity Allow to restore cpu affinity. Only for hosts with\n" +@@ -444,6 +444,7 @@ usage: + " --with-cpu-affinity Allow to restore cpu affinity. Only for hosts with\n" " same cpu quantity.\n" + " --pin-memory Use pin memory method for checkpoint and restore.\n" +" --use-fork-pid Allow to restore task pid by setting fork pid of task struct.\n" "\n" "Check options:\n" @@ -115,7 +115,7 @@ index a4dc5b8..7fad678 100644 extern struct cr_options opts; diff --git a/criu/include/restorer.h b/criu/include/restorer.h -index fc37e6d..3d1a3c0 100644 +index 9614720..8fd47e2 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -226,6 +226,7 @@ struct task_restore_args { @@ -126,21 +126,22 @@ index fc37e6d..3d1a3c0 100644 } __aligned(64); /* -@@ -323,10 +324,11 @@ enum { - #define _SET_PIN_MEM_AREA 1 - #define _CLEAR_PIN_MEM_AREA 2 - #define _REMAP_PIN_MEM_AREA 3 --#define _PIN_MEM_IOC_MAX_NR 4 -+#define _SET_FORK_PID 6 +@@ -325,11 +326,13 @@ enum { + #define _REMAP_PIN_MEM_AREA 3 + #define _DUMP_SEPCIAL_PAGES 6 + #define _RETORE_SEPCIAL_PAGES 7 ++#define _SET_FORK_PID 8 #define SET_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _SET_PIN_MEM_AREA, struct pin_mem_area_set) #define CLEAR_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _CLEAR_PIN_MEM_AREA, int) #define REMAP_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _REMAP_PIN_MEM_AREA, int) -+#define SET_FORK_PID _IOW(PIN_MEM_MAGIC, _SET_FORK_PID, int) + #define DUMP_SEPCIAL_PAGES _IOW(PIN_MEM_MAGIC, _DUMP_SEPCIAL_PAGES, int) + #define RETORE_SEPCIAL_PAGES _IOW(PIN_MEM_MAGIC, _RETORE_SEPCIAL_PAGES, int) 
++#define SET_FORK_PID _IOW(PIN_MEM_MAGIC, _SET_FORK_PID, int) #define ONCE_PIN_MEM_SIZE_LIMIT 32 * 1024 * 1024 #define MAX_PIN_MEM_AREA_NUM 16 diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c -index f3bd541..ce682ac 100644 +index 1565e3c..4ab8a45 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1431,6 +1431,22 @@ int remap_vmas(int pid) @@ -166,7 +167,7 @@ index f3bd541..ce682ac 100644 /* * The main routine to restore task via sigreturn. -@@ -1830,7 +1846,7 @@ long __export_restore_task(struct task_restore_args *args) +@@ -1834,7 +1850,7 @@ long __export_restore_task(struct task_restore_args *args) long parent_tid; int i, fd = -1; @@ -175,7 +176,7 @@ index f3bd541..ce682ac 100644 /* One level pid ns hierarhy */ fd = sys_openat(args->proc_fd, LAST_PID_PATH, O_RDWR, 0); if (fd < 0) { -@@ -1862,6 +1878,13 @@ long __export_restore_task(struct task_restore_args *args) +@@ -1866,6 +1882,13 @@ long __export_restore_task(struct task_restore_args *args) c_args.parent_tid = ptr_to_u64(&parent_tid); pr_debug("Using clone3 to restore the process\n"); RUN_CLONE3_RESTORE_FN(ret, c_args, sizeof(c_args), &thread_args[i], args->clone_restore_fn); @@ -190,5 +191,5 @@ index f3bd541..ce682ac 100644 last_pid_len = std_vprint_num(last_pid_buf, sizeof(last_pid_buf), thread_args[i].pid - 1, &s); sys_lseek(fd, 0, SEEK_SET); -- -2.9.5 +2.34.0 diff --git a/backport-0007--notifier-add-notifier-calling-method-for-checkpoint-.patch b/backport-0007--notifier-add-notifier-calling-method-for-checkpoint-.patch new file mode 100644 index 0000000000000000000000000000000000000000..032893b02f85783fd455c81bb2767623dcd8376c --- /dev/null +++ b/backport-0007--notifier-add-notifier-calling-method-for-checkpoint-.patch @@ -0,0 +1,650 @@ +From 65a04d206be10f4871b0de2a971174bbe55d38d2 Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 19 May 2021 21:45:03 +0800 +Subject: [PATCH 07/50] notifier: add notifier calling method for checkpoint + and restore + +Add notifier calling method 
for checkpoint and restore during kernel module upgrading. + +Signed-off-by: Xiaoguang Li +Signed-off-by: He Jingxian +Signed-off-by: fu.lin +--- + criu/config.c | 1 + + criu/cr-dump.c | 33 ++++++++++ + criu/cr-restore.c | 22 ++++++- + criu/crtools.c | 3 + + criu/include/cr_options.h | 1 + + criu/include/restorer.h | 1 + + criu/include/util.h | 42 ++++++++++++ + criu/pie/restorer.c | 135 ++++++++++++++++++++++++++++++++++---- + criu/pie/util.c | 91 +++++++++++++++++++++++++ + include/common/lock.h | 4 ++ + 10 files changed, 319 insertions(+), 14 deletions(-) + +diff --git a/criu/config.c b/criu/config.c +index a5bcf10..e1de191 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -544,6 +544,7 @@ int parse_options(int argc, char **argv, bool *usage_error, + BOOL_OPT("with-cpu-affinity", &opts.with_cpu_affinity), + BOOL_OPT("pin-memory", &opts.pin_memory), + BOOL_OPT("use-fork-pid", &opts.use_fork_pid), ++ BOOL_OPT("with-notifier", &opts.with_notifier_kup), + { }, + }; + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index 8575516..96c0cd3 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -1699,6 +1699,8 @@ static int cr_lazy_mem_dump(void) + return ret; + } + ++static enum notifier_state notifier_state = NOTHING_COMPLETE; ++ + static int cr_dump_finish(int ret) + { + int post_dump_ret = 0; +@@ -1783,6 +1785,20 @@ static int cr_dump_finish(int ret) + restore_task_special_pages(0); + } + ++ if (ret != 0 && opts.with_notifier_kup) { ++ pr_info("call notifier rollback\n"); ++ switch (notifier_state) { ++ case PRE_FREEZE_COMPLETE: ++ notifier_kup(PRE_FREEZE, ROLLBACK, true); ++ break; ++ case FREEZE_TO_KILL_COMPLETE: ++ notifier_kup(FREEZE_TO_KILL, ROLLBACK, true); ++ break; ++ default: ++ break; ++ } ++ } ++ + if (ret) { + pr_err("Dumping FAILED.\n"); + } else { +@@ -1816,6 +1832,14 @@ int cr_dump_tasks(pid_t pid) + goto err; + root_item->pid->real = pid; + ++ if (notifier_kup(PRE_FREEZE, PREPARE, opts.with_notifier_kup)) { ++ /* disable rollback function because 
we has already rollbacked. */ ++ opts.with_notifier_kup = false; ++ pr_err("call notifier: %d err\n", PRE_FREEZE); ++ goto err; ++ } else ++ notifier_state = PRE_FREEZE_COMPLETE; ++ + pre_dump_ret = run_scripts(ACT_PRE_DUMP); + if (pre_dump_ret != 0) { + pr_err("Pre dump script failed with %d!\n", pre_dump_ret); +@@ -1971,6 +1995,15 @@ int cr_dump_tasks(pid_t pid) + ret = write_img_inventory(&he); + if (ret) + goto err; ++ ++ ret = notifier_kup(FREEZE_TO_KILL, PREPARE, opts.with_notifier_kup); ++ if (ret) { ++ opts.with_notifier_kup = false; ++ pr_err("call notifier:%d err\n", FREEZE_TO_KILL); ++ goto err; ++ } else ++ notifier_state = FREEZE_TO_KILL_COMPLETE; ++ + err: + if (parent_ie) + inventory_entry__free_unpacked(parent_ie, NULL); +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index e050b88..1e2ed9a 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -1977,8 +1977,10 @@ static int restore_task_with_children(void *_arg) + return 0; + + err: +- if (current->parent == NULL) ++ if (current->parent == NULL) { ++ do_notifier_rollback(opts.with_notifier_kup, POST_UPDATE_KERNEL_COMPLETE); + futex_abort_and_wake(&task_entries->nr_in_progress); ++ } + exit(1); + } + +@@ -2421,8 +2423,10 @@ skip_ns_bouncing: + */ + attach_to_tasks(root_seized); + +- if (restore_switch_stage(CR_STATE_RESTORE_CREDS)) ++ if (restore_switch_stage(CR_STATE_RESTORE_CREDS)) { ++ pr_err("Can't switch to CR_STATE_RESTORE_CREDS stage\n"); + goto out_kill_network_unlocked; ++ } + + timing_stop(TIME_RESTORE); + +@@ -2599,6 +2603,15 @@ int cr_restore_tasks(void) + goto err; + + ret = restore_root_task(root_item); ++ if (ret) ++ goto err; ++ ++ ret = notifier_kup(POST_RUN, PREPARE, opts.with_notifier_kup); ++ if (ret < 0) { ++ opts.with_notifier_kup = false; ++ pr_err("calling POST_RUN notifier list return err"); ++ } ++ + err: + cr_plugin_fini(CR_PLUGIN_STAGE__RESTORE, ret); + return ret; +@@ -3861,6 +3874,8 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args 
*task_args, uns + */ + task_args->lsm_type = kdat.lsm; + ++ task_args->with_notifier_kup = opts.with_notifier_kup; ++ + /* + * Make root and cwd restore _that_ late not to break any + * attempts to open files by paths above (e.g. /proc). +@@ -3907,6 +3922,9 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns + err: + free_mappings(&self_vmas); + err_nv: ++ if (current->parent == NULL && opts.with_notifier_kup) ++ do_notifier_rollback(true, POST_UPDATE_KERNEL_COMPLETE); ++ + /* Just to be sure */ + exit(1); + return -1; +diff --git a/criu/crtools.c b/criu/crtools.c +index 9b3ef33..d53be3d 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -445,6 +445,9 @@ usage: + " same cpu quantity.\n" + " --pin-memory Use pin memory method for checkpoint and restore.\n" + " --use-fork-pid Allow to restore task pid by setting fork pid of task struct.\n" ++" --with-notifier Allow to checkout/restore kup notifier chain. This\n" ++" feature needs the kernel's assistance.\n" ++" Only for the host with these feature.\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 7fad678..1acb5ef 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -178,6 +178,7 @@ struct cr_options { + int with_cpu_affinity; + int pin_memory; + int use_fork_pid; ++ int with_notifier_kup; + }; + + extern struct cr_options opts; +diff --git a/criu/include/restorer.h b/criu/include/restorer.h +index 8fd47e2..7152b34 100644 +--- a/criu/include/restorer.h ++++ b/criu/include/restorer.h +@@ -227,6 +227,7 @@ struct task_restore_args { + bool has_clone3_set_tid; + bool pin_memory; + bool use_fork_pid; ++ bool with_notifier_kup; + } __aligned(64); + + /* +diff --git a/criu/include/util.h b/criu/include/util.h +index c2baf27..d226d2c 100644 +--- a/criu/include/util.h ++++ b/criu/include/util.h +@@ -13,6 +13,8 @@ + #include + 
#include + #include ++#include ++#include + + #include "int.h" + #include "common/compiler.h" +@@ -380,4 +382,44 @@ static inline void print_stack_trace(pid_t pid) {} + + extern int mount_detached_fs(const char *fsname); + ++#define NOTIFY_PROC_PATH "/sys/kernel/modrestore/nvwa_notifier" ++ ++#if __has_include("linux/modrestore.h") ++#define CONFIG_EULEROS_MODRESTORE_NOTIFY ++# include ++#else ++enum KUP_HOOK_POINT { ++ PRE_FREEZE, ++ FREEZE_TO_KILL, ++ PRE_UPDATE_KERNEL, ++ POST_UPDATE_KERNEL, ++ UNFREEZE_TO_RUN, ++ POST_RUN, ++ ++ KUP_HOOK_MAX, ++}; ++ ++enum nvwa_cmd { ++ PREPARE = 0, ++ ROLLBACK, ++ ++ NVWA_CMD_MAX, ++}; ++#endif ++ ++enum notifier_state { ++ NOTHING_COMPLETE, ++ PRE_FREEZE_COMPLETE, ++ FREEZE_TO_KILL_COMPLETE, ++ PRE_UPDATE_KERNEL_COMPLETE, ++ POST_UPDATE_KERNEL_COMPLETE, ++ UNFREEZE_TO_RUN_COMPLETE, ++ POST_RUN_COMPLETE, ++ ++ NOTIFIER_ROLLBACK_DONE = 0xfc17173b, /* has done rollback */ ++}; ++ ++int notifier_kup(enum KUP_HOOK_POINT, enum nvwa_cmd, bool); ++void do_notifier_rollback(bool, enum notifier_state); ++ + #endif /* __CR_UTIL_H__ */ +diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c +index 4ab8a45..a6245e4 100644 +--- a/criu/pie/restorer.c ++++ b/criu/pie/restorer.c +@@ -77,6 +77,7 @@ + + static struct task_entries *task_entries_local; + static futex_t thread_inprogress; ++static futex_t thread_start; + static pid_t *helpers; + static int n_helpers; + static pid_t *zombies; +@@ -119,10 +120,28 @@ void parasite_cleanup(void) + extern void cr_restore_rt (void) asm ("__cr_restore_rt") + __attribute__ ((visibility ("hidden"))); + ++static int args_with_notifier_kup; ++static enum notifier_state notifier_state = POST_UPDATE_KERNEL_COMPLETE; ++static futex_t notifier_done; ++ + static void sigchld_handler(int signal, siginfo_t *siginfo, void *data) + { + char *r; + int i; ++ rt_sigaction_t act; ++ ++ if (signal == SIGSEGV || signal == SIGBUS || signal == SIGILL) { ++ /* Make sure we exit with the right signal at the end. 
So for instance ++ * the core will be dumped if enabled. */ ++ pr_info("recv signal: %d\n", signal); ++ do_notifier_rollback(args_with_notifier_kup, notifier_state); ++ ksigemptyset (&act.rt_sa_mask); ++ act.rt_sa_flags = SA_SIGINFO | SA_RESTART; ++ act.rt_sa_handler = (rt_sighandler_t)SIG_DFL; ++ sys_sigaction(signal, &act, NULL, sizeof(k_rtsigset_t)); ++ sys_kill(sys_getpid(),signal); ++ return; ++ } + + /* We can ignore helpers that die, we expect them to after + * CR_STATE_RESTORE is finished. */ +@@ -149,10 +168,14 @@ static void sigchld_handler(int signal, siginfo_t *siginfo, void *data) + + pr_info("Task %d %s %d\n", siginfo->si_pid, r, siginfo->si_status); + ++ pr_info("%s: trace do_notifier_rollback\n", __func__); ++ do_notifier_rollback(args_with_notifier_kup, notifier_state); + futex_abort_and_wake(&task_entries_local->nr_in_progress); + /* sa_restorer may be unmaped, so we can't go back to userspace*/ + sys_kill(sys_getpid(), SIGSTOP); + sys_exit_group(1); ++ ++ /* for notifier, do nothing when receiving SIGCHLD signal */ + } + + static int lsm_set_label(char *label, char *type, int procfd) +@@ -604,6 +627,27 @@ static void noinline rst_sigreturn(unsigned long new_sp, + ARCH_RT_SIGRETURN(new_sp, sigframe); + } + ++/* Notice: only one task, so it isn't necessary to consider concurrent. */ ++static int do_notifier(bool *notify) ++{ ++ int retval = 0; ++ ++ if (!*notify) ++ return 0; ++ ++ pr_info("unfreeze_to_run restore notifier\n"); ++ retval = notifier_kup(UNFREEZE_TO_RUN, PREPARE, true); ++ if (retval) { ++ *notify = false; ++ notifier_state = NOTIFIER_ROLLBACK_DONE; ++ pr_err("call notifier: %d err\n", UNFREEZE_TO_RUN); ++ } ++ ++ notifier_state = UNFREEZE_TO_RUN_COMPLETE; ++ ++ return retval; ++} ++ + /* + * Threads restoration via sigreturn. Note it's locked + * routine and calls for unlock at the end. 
+@@ -642,12 +686,18 @@ long __export_restore_thread(struct thread_restore_args *args) + + pr_info("%ld: Restored\n", sys_gettid()); + +- restore_finish_stage(task_entries_local, CR_STATE_RESTORE); ++ if (!!(restore_finish_stage(task_entries_local, CR_STATE_RESTORE) & FUTEX_ABORT_FLAG)) { ++ pr_err("%s: abort by CR_STATE_RESTORE\n", __func__); ++ goto core_restore_end; ++ } + + if (restore_signals(args->siginfo, args->siginfo_n, false)) + goto core_restore_end; + +- restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD); ++ if (!!(restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD) & FUTEX_ABORT_FLAG)) { ++ pr_err("%s: abort by CR_STATE_RESTORE_SIGCHLD\n", __func__); ++ goto core_restore_end; ++ } + + /* + * Make sure it's before creds, since it's privileged +@@ -663,16 +713,29 @@ long __export_restore_thread(struct thread_restore_args *args) + if (ret) + BUG(); + +- restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS); ++ if (!!(restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS) & FUTEX_ABORT_FLAG)) { ++ pr_err("%s: abort by CR_STATE_RESTORE_CREDS\n", __func__); ++ goto core_restore_end; ++ } + + futex_dec_and_wake(&thread_inprogress); ++ futex_wait_while(&thread_start, 0); ++ if (!!(futex_get(&thread_start) & FUTEX_ABORT_FLAG)) { ++ pr_err("%s: abort by thread_start\n", __func__); ++ goto wait_notifier; ++ } + + new_sp = (long)rt_sigframe + RT_SIGFRAME_OFFSET(rt_sigframe); + rst_sigreturn(new_sp, rt_sigframe); + + core_restore_end: +- pr_err("Restorer abnormal termination for %ld\n", sys_getpid()); +- futex_abort_and_wake(&task_entries_local->nr_in_progress); ++ futex_abort_and_wake(&thread_start); ++ futex_abort_and_wake(&task_entries_local->start); ++ ++wait_notifier: ++ pr_err("%s: Restorer abnormal termination for %ld\n", __func__, sys_getpid()); ++ futex_wait_while(&notifier_done, 0); ++ + sys_exit_group(1); + return -1; + } +@@ -1470,6 +1533,10 @@ long __export_restore_task(struct task_restore_args *args) +
rt_sigaction_t act; + bool has_vdso_proxy; + ++ futex_set(&thread_inprogress, 1); ++ futex_set(&thread_start, 0); ++ futex_set(&notifier_done, 0); ++ + bootstrap_start = args->bootstrap_start; + bootstrap_len = args->bootstrap_len; + +@@ -1486,6 +1553,7 @@ long __export_restore_task(struct task_restore_args *args) + #ifdef ARCH_HAS_LONG_PAGES + __page_size = args->page_size; + #endif ++ args_with_notifier_kup = args->with_notifier_kup; + + ksigfillset(&act.rt_sa_mask); + act.rt_sa_handler = sigchld_handler; +@@ -1496,9 +1564,29 @@ long __export_restore_task(struct task_restore_args *args) + pr_err("Failed to set SIGCHLD %ld\n", ret); + goto core_restore_end; + } ++ ret = sys_sigaction(SIGSEGV, &act, NULL, sizeof(k_rtsigset_t)); ++ if (ret) { ++ pr_err("Failed to set SIGCHLD %ld\n", ret); ++ goto core_restore_end; ++ } ++ ++ ret = sys_sigaction(SIGBUS, &act, NULL, sizeof(k_rtsigset_t)); ++ if (ret) { ++ pr_err("Failed to set SIGCHLD %ld\n", ret); ++ goto core_restore_end; ++ } ++ ++ ret = sys_sigaction(SIGILL, &act, NULL, sizeof(k_rtsigset_t)); ++ if (ret) { ++ pr_err("Failed to set SIGCHLD %ld\n", ret); ++ goto core_restore_end; ++ } + + ksigemptyset(&to_block); + ksigaddset(&to_block, SIGCHLD); ++ ksigaddset(&to_block, SIGSEGV); ++ ksigaddset(&to_block, SIGBUS); ++ ksigaddset(&to_block, SIGILL); + ret = sys_sigprocmask(SIG_UNBLOCK, &to_block, NULL, sizeof(k_rtsigset_t)); + if (ret) { + pr_err("Failed to unblock SIGCHLD %ld\n", ret); +@@ -1912,7 +2000,8 @@ long __export_restore_task(struct task_restore_args *args) + pr_err("Unable to create a thread: %ld\n", ret); + mutex_unlock(&task_entries_local->last_pid_mutex); + goto core_restore_end; +- } ++ } else ++ futex_inc(&thread_inprogress); + } + + mutex_unlock(&task_entries_local->last_pid_mutex); +@@ -1936,7 +2025,14 @@ long __export_restore_task(struct task_restore_args *args) + + pr_info("%ld: Restored\n", sys_getpid()); + +- restore_finish_stage(task_entries_local, CR_STATE_RESTORE); ++ if
(!!(restore_finish_stage(task_entries_local, CR_STATE_RESTORE) & FUTEX_ABORT_FLAG)) { ++ pr_err("%s: abort by CR_STATE_RESTORE\n", __func__); ++ goto core_restore_end; ++ } ++ ++ ret = do_notifier(&args->with_notifier_kup); ++ if (ret) ++ goto core_restore_end; + + if (wait_helpers(args) < 0) + goto core_restore_end; +@@ -1984,7 +2080,8 @@ long __export_restore_task(struct task_restore_args *args) + if (ret) + goto core_restore_end; + +- restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD); ++ if (!!(restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD) & FUTEX_ABORT_FLAG)) ++ goto core_restore_end; + + rst_tcp_socks_all(args); + +@@ -2006,15 +2103,20 @@ long __export_restore_task(struct task_restore_args *args) + ret = ret || restore_pdeath_sig(args->t); + ret = ret || restore_child_subreaper(args->child_subreaper); + +- futex_set_and_wake(&thread_inprogress, args->nr_threads); +- +- restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS); ++ if (!!(restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS) & FUTEX_ABORT_FLAG)) ++ goto core_restore_end; + + if (ret) + BUG(); + + /* Wait until children stop to use args->task_entries */ + futex_wait_while_gt(&thread_inprogress, 1); ++ if (!!(futex_get(&thread_start) & FUTEX_ABORT_FLAG)) { ++ pr_err("%s: terminate by main thread futex_start\n", __func__); ++ goto handle_notifier; ++ } ++ ++ futex_set_and_wake(&thread_start, 1); + + sys_close(args->proc_fd); + std_log_set_fd(-1); +@@ -2052,8 +2154,17 @@ long __export_restore_task(struct task_restore_args *args) + rst_sigreturn(new_sp, rt_sigframe); + + core_restore_end: +- futex_abort_and_wake(&task_entries_local->nr_in_progress); ++ futex_abort_and_wake(&thread_start); ++ futex_abort_and_wake(&task_entries_local->start); ++ ++handle_notifier: ++ do_notifier_rollback(args->with_notifier_kup, notifier_state); ++ ++ futex_abort_and_wake(&task_entries_local->nr_in_progress); /* notifier the criu main process */ + pr_err("Restorer 
fail %ld\n", sys_getpid()); ++ ++ futex_set_and_wake(&notifier_done, 1); /* wake all other threads to exit */ ++ + sys_exit_group(1); + return -1; + } +diff --git a/criu/pie/util.c b/criu/pie/util.c +index 4945483..752e5d0 100644 +--- a/criu/pie/util.c ++++ b/criu/pie/util.c +@@ -11,6 +11,7 @@ + #include "fcntl.h" + #include "log.h" + #include "util-pie.h" ++#include "util.h" + + #ifdef CR_NOGLIBC + # include +@@ -52,3 +53,93 @@ err_close: + __sys(close)(fd); + return -1; + } ++ ++#define KUP_BUF_SIZE 256 ++ ++static int int_to_string(unsigned number, char *buf, size_t total) { ++ unsigned remainder, quotient, i, len; ++ ++ quotient = number; ++ len = 0; ++ do { ++ quotient /= 10; ++ len += 1; ++ } while (quotient > 0); ++ ++ if (len > total - 1) ++ return -1; ++ ++ quotient = number; ++ i = 1; ++ do { ++ remainder = quotient % 10; ++ quotient = quotient / 10; ++ buf[len-i] = '0' + remainder; ++ i++; ++ } while (quotient > 0); ++ buf[len] = '\0'; ++ ++ return len == 0 ? -1 : len; ++} ++ ++int notifier_kup(enum KUP_HOOK_POINT action, enum nvwa_cmd cmd, bool enable) ++{ ++ int fd, count = 0, retval = 0; ++ char buf[KUP_BUF_SIZE] = {0}; ++ ++ if (!enable) ++ return 0; ++ ++ fd = __sys(open)(NOTIFY_PROC_PATH, O_WRONLY, 0); ++ if (fd == -EACCES) { ++ /* there is no privilege to open file, ignore this condition.
*/ ++ pr_info("%s: open %s failed, retval: %d (-EACCES)\n", ++ __func__, NOTIFY_PROC_PATH, -EACCES); ++ return 0; ++ } else if (fd < 0) { ++ __pr_perror("%s: Can't open %s: %d\n", __func__, NOTIFY_PROC_PATH, fd); ++ return fd; ++ } ++ ++ retval = int_to_string(action, buf, sizeof(buf)-count); ++ if (retval <= 0) { ++ __pr_perror("%s: int_to_string error\n", __func__); ++ goto err_close; ++ } ++ ++ buf[retval] = ':'; ++ count = retval + 1; ++ ++ retval = int_to_string(cmd, buf+count, sizeof(buf)-count); ++ if (retval <= 0) { ++ __pr_perror("%s: int_to_string error\n", __func__); ++ goto err_close; ++ } ++ ++ count += retval; ++ retval = __sys(write)(fd, buf, count); ++ if (retval < 0) ++ __pr_perror("%s: Can't write to %s\n", __func__, NOTIFY_PROC_PATH); ++ ++err_close: ++ __sys(close)(fd); ++ ++ return retval < 0 ? -1 : 0; ++} ++ ++void do_notifier_rollback(bool rollback, enum notifier_state status) ++{ ++ if (!rollback) ++ return; ++ ++ switch (status) { ++ case POST_UPDATE_KERNEL_COMPLETE: ++ notifier_kup(POST_UPDATE_KERNEL, ROLLBACK, true); ++ break; ++ case UNFREEZE_TO_RUN_COMPLETE: ++ notifier_kup(UNFREEZE_TO_RUN, ROLLBACK, true); ++ break; ++ default: ++ break; ++ } ++} +diff --git a/include/common/lock.h b/include/common/lock.h +index 4782b63..3db17ae 100644 +--- a/include/common/lock.h ++++ b/include/common/lock.h +@@ -106,6 +106,10 @@ static inline void futex_inc_and_wake(futex_t *f) + LOCK_BUG_ON(sys_futex((uint32_t *)&f->raw.counter, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) < 0); + } + ++static inline uint32_t futex_inc_return(futex_t *f) { ++ return atomic_inc_return(&f->raw); ++} ++ + /* Plain increment futex @f value */ + static inline void futex_inc(futex_t *f) { atomic_inc(&f->raw); } + +-- +2.34.0 + diff --git a/backport-0008--cred-provide-cred-checkpoint-restore-method.patch b/backport-0008--cred-provide-cred-checkpoint-restore-method.patch new file mode 100644 index 0000000000000000000000000000000000000000..409d07780e8087926cf590af35f67dd4cbca5a4e --- 
/dev/null +++ b/backport-0008--cred-provide-cred-checkpoint-restore-method.patch @@ -0,0 +1,254 @@ +From 27d2d541c40ccbd76784057e180d2c39b311c24f Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 19 May 2021 21:47:28 +0800 +Subject: [PATCH 08/50] cred: provide cred checkpoint restore method + +criu checkpoint/restore the task, it only restore the context instead of +the memory address storing the context. + +To handle the problem resulted by CVE bugfix, details: +- https://nvd.nist.gov/vuln/detail/CVE-2016-4565 +- https://openfabrics.org/images/2018workshop/presentations/113_MRuhl_JourneytoVerbsIOCTL.pdf + +Brief: + Refresh the security context address of file. The infiniband code use +`write()` as bi-directional `ioctl()`, there is `struct cred` address +during `write()` process. However, criu uses some syscall, such as +`capset()` and `setgroups()`, to regenerate the new cred, the file +cred is fixed by `fcntl(F_SETOWN)`, then the address of new cred is +different from the file. + This patch fix the `struct cred` address checking problem resulted by +CVE fixed in infiniband drivers.
+ +Signed-off-by: luolongjun +Signed-off-by: fu.lin +--- + criu/config.c | 1 + + criu/cr-restore.c | 35 +++++++++++++++++++++++++++++++++++ + criu/crtools.c | 2 ++ + criu/include/cr_options.h | 1 + + criu/include/fcntl.h | 4 ++++ + criu/include/prctl.h | 4 ++++ + criu/include/restorer.h | 3 +++ + criu/pie/restorer.c | 38 ++++++++++++++++++++++++++++++++++++++ + 8 files changed, 88 insertions(+) + +diff --git a/criu/config.c b/criu/config.c +index e1de191..4d2b709 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -545,6 +545,7 @@ int parse_options(int argc, char **argv, bool *usage_error, + BOOL_OPT("pin-memory", &opts.pin_memory), + BOOL_OPT("use-fork-pid", &opts.use_fork_pid), + BOOL_OPT("with-notifier", &opts.with_notifier_kup), ++ BOOL_OPT("with-fd-cred", &opts.with_fd_cred), + { }, + }; + +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 1e2ed9a..05de2ef 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -709,6 +709,28 @@ static int __collect_child_pids(struct pstree_item *p, int state, unsigned int * + return 0; + } + ++static int collect_child_fds(int state, unsigned int *n, struct pstree_item *me) ++{ ++ struct list_head *list = &rsti(me)->fds; ++ struct fdinfo_list_entry *fle, *tmp; ++ ++ *n = 0; ++ list_for_each_entry_safe(fle, tmp, list, ps_list) { ++ if (fle->fe->type == state) { ++ int *child; ++ ++ child = rst_mem_alloc(sizeof(*child), RM_PRIVATE); ++ if (!child) ++ return -1; ++ ++ (*n)++; ++ *child = fle->fe->fd; ++ } ++ } ++ ++ return 0; ++} ++ + static int collect_child_pids(int state, unsigned int *n) + { + struct pstree_item *pi; +@@ -733,6 +755,12 @@ static int collect_child_pids(int state, unsigned int *n) + return __collect_child_pids(current, state, n); + } + ++static int collect_chr_fds(struct pstree_item *me, struct task_restore_args *ta) ++{ ++ ta->setcred_pids = (int *)rst_mem_align_cpos(RM_PRIVATE); ++ return collect_child_fds(FD_TYPES__CHR, &ta->setcred_pids_n, me); ++} ++ + static int collect_helper_pids(struct 
task_restore_args *ta) + { + ta->helpers = (pid_t *)rst_mem_align_cpos(RM_PRIVATE); +@@ -938,6 +966,9 @@ static int restore_one_alive_task(int pid, CoreEntry *core) + if (collect_zombie_pids(ta) < 0) + return -1; + ++ if (opts.with_fd_cred && collect_chr_fds(current, ta) < 0) ++ return -1; ++ + if (collect_inotify_fds(ta) < 0) + return -1; + +@@ -3723,6 +3754,10 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns + RST_MEM_FIXUP_PPTR(task_args->helpers); + RST_MEM_FIXUP_PPTR(task_args->zombies); + RST_MEM_FIXUP_PPTR(task_args->vma_ios); ++ if (opts.with_fd_cred) ++ RST_MEM_FIXUP_PPTR(task_args->setcred_pids); ++ else ++ task_args->setcred_pids_n = UINT_MAX; + RST_MEM_FIXUP_PPTR(task_args->inotify_fds); + + task_args->compatible_mode = core_is_compat(core); +diff --git a/criu/crtools.c b/criu/crtools.c +index d53be3d..942e683 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -448,6 +448,8 @@ usage: + " --with-notifier Allow to checkout/restore kup notifier chain. 
This\n" + " feature needs the kernel's assistance.\n" + " Only for the host with these feature.\n" ++" --with-fd-cred Allow to make the restored process has the same cred\n" ++" as checkout assisted by kernel.\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 1acb5ef..5b0ff24 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -179,6 +179,7 @@ struct cr_options { + int pin_memory; + int use_fork_pid; + int with_notifier_kup; ++ int with_fd_cred; + }; + + extern struct cr_options opts; +diff --git a/criu/include/fcntl.h b/criu/include/fcntl.h +index ea9d48c..0936337 100644 +--- a/criu/include/fcntl.h ++++ b/criu/include/fcntl.h +@@ -19,6 +19,10 @@ struct f_owner_ex { + #define F_GETOWNER_UIDS 17 + #endif + ++#ifndef F_SETCRED ++#define F_SETCRED 18 ++#endif ++ + /* + * These things are required to compile on CentOS-6 + */ +diff --git a/criu/include/prctl.h b/criu/include/prctl.h +index 8e7fef3..ecbc69a 100644 +--- a/criu/include/prctl.h ++++ b/criu/include/prctl.h +@@ -82,4 +82,8 @@ struct prctl_mm_map { + # define PR_GET_THP_DISABLE 42 + #endif + ++#ifndef PR_DEFAULT_CRED ++# define PR_DEFAULT_CRED 54 ++#endif ++ + #endif /* __CR_PRCTL_H__ */ +diff --git a/criu/include/restorer.h b/criu/include/restorer.h +index 7152b34..4afff1b 100644 +--- a/criu/include/restorer.h ++++ b/criu/include/restorer.h +@@ -180,6 +180,9 @@ struct task_restore_args { + pid_t *zombies; + unsigned int zombies_n; + ++ int *setcred_pids; ++ unsigned int setcred_pids_n; ++ + int *inotify_fds; /* fds to cleanup inotify events at CR_STATE_RESTORE_SIGCHLD stage */ + unsigned int inotify_fds_n; + +diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c +index a6245e4..2173c5e 100644 +--- a/criu/pie/restorer.c ++++ b/criu/pie/restorer.c +@@ -78,6 +78,7 @@ + static struct task_entries *task_entries_local; + static futex_t thread_inprogress; 
+ static futex_t thread_start; ++static futex_t cred_set; + static pid_t *helpers; + static int n_helpers; + static pid_t *zombies; +@@ -345,6 +346,41 @@ static int restore_creds(struct thread_creds_args *args, int procfd, + return 0; + } + ++static int update_cred_ref(struct task_restore_args *ta) ++{ ++ int i; ++ int ret; ++ int pid = sys_getpid(); ++ long int tid = sys_gettid(); ++ ++ if (ta->setcred_pids_n == UINT_MAX) { ++ pr_info("no need to keep the same cred \n"); ++ return 0; ++ } ++ ++ if (pid == tid) { ++ /* let main thread finish cred update first */ ++ ret = sys_prctl(PR_DEFAULT_CRED, 0, 0, 0, 0); ++ pr_info("main cred restore \n"); ++ futex_set_and_wake(&cred_set, 1); ++ } else { ++ futex_wait_until(&cred_set, 1); ++ pr_info("other cred restore \n"); ++ ret = sys_prctl(PR_DEFAULT_CRED, 0, 0, 0, 0); ++ } ++ ++ if (ret) ++ return ret; ++ ++ pr_info("%ld (%d) is going to update current cred \n", tid, pid); ++ ++ for (i = 0; i < ta->setcred_pids_n; i++) { ++ sys_fcntl(ta->setcred_pids[i], F_SETCRED, 0); ++ } ++ ++ return 0; ++} ++ + /* + * This should be done after creds restore, as + * some creds changes might drop the value back +@@ -708,6 +744,7 @@ long __export_restore_thread(struct thread_restore_args *args) + + ret = restore_creds(args->creds_args, args->ta->proc_fd, + args->ta->lsm_type); ++ ret = ret || update_cred_ref(args->ta); + ret = ret || restore_dumpable_flag(&args->ta->mm); + ret = ret || restore_pdeath_sig(args); + if (ret) +@@ -2099,6 +2136,7 @@ long __export_restore_task(struct task_restore_args *args) + */ + ret = restore_creds(args->t->creds_args, args->proc_fd, + args->lsm_type); ++ ret = ret || update_cred_ref(args); + ret = ret || restore_dumpable_flag(&args->mm); + ret = ret || restore_pdeath_sig(args->t); + ret = ret || restore_child_subreaper(args->child_subreaper); +-- +2.34.0 + diff --git a/backport-0009--block-device-dump-block-device-as-reguler-file.patch b/backport-0009--block-device-dump-block-device-as-reguler-file.patch 
new file mode 100644 index 0000000000000000000000000000000000000000..bcf7033891d5aaefb6042275a63df664e738e206 --- /dev/null +++ b/backport-0009--block-device-dump-block-device-as-reguler-file.patch @@ -0,0 +1,60 @@ +From 912a4e39dea7eb71e23f2835e7ad3e055ff63bb8 Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 19 May 2021 21:49:15 +0800 +Subject: [PATCH 09/50] block-device: dump block device as reguler file + +Add block device dump and restore method for kernel module upgrading. + +Signed-off-by: Xiaoguang Li +--- + criu/files.c | 27 +++++++++++++++++++++++++++ + 1 file changed, 27 insertions(+) + +diff --git a/criu/files.c b/criu/files.c +index 0912d1a..6f580af 100644 +--- a/criu/files.c ++++ b/criu/files.c +@@ -449,6 +449,30 @@ static const struct fdtype_ops *get_mem_dev_ops(struct fd_parms *p, int minor) + return ops; + } + ++static int dump_blkdev(struct fd_parms *p, int lfd, FdinfoEntry *e) ++{ ++ struct fd_link *link_old = p->link; ++ int maj = major(p->stat.st_rdev); ++ const struct fdtype_ops *ops; ++ int err; ++ ++ switch (maj) { ++ case SCSI_DISK0_MAJOR: ++ ops = &regfile_dump_ops; ++ break; ++ default: { ++ char more[32] = "block_dev"; ++ ++ err = dump_unsupp_fd(p, lfd, "blk", more, e); ++ p->link = link_old; ++ return err; ++ } ++ } ++ err = do_dump_gen_file(p, lfd, ops, e); ++ p->link = link_old; ++ return err; ++} ++ + static int dump_chrdev(struct fd_parms *p, int lfd, FdinfoEntry *e) + { + struct fd_link *link_old = p->link; +@@ -516,6 +540,9 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, + p.fd_ctl = ctl; /* Some dump_opts require this to talk to parasite */ + p.dfds = dfds; /* epoll needs to verify if target fd exist */ + ++ if (S_ISBLK(p.stat.st_mode)) ++ return dump_blkdev(&p, lfd, e); ++ + if (S_ISSOCK(p.stat.st_mode)) + return dump_socket(&p, lfd, e); + +-- +2.34.0 + diff --git a/backport-0010--anon-inode-add-support-for-anon-inode-fd.patch b/backport-0010--anon-inode-add-support-for-anon-inode-fd.patch
new file mode 100644 index 0000000000000000000000000000000000000000..cb71be06f307a0f5861bd1cc3abec8b8abe72b41 --- /dev/null +++ b/backport-0010--anon-inode-add-support-for-anon-inode-fd.patch @@ -0,0 +1,354 @@ +From daff89bbab6e94d29e1933f97c28c8700bd9ed2b Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 19 May 2021 21:52:49 +0800 +Subject: [PATCH 10/50] anon-inode: add support for anon inode fd + +Add support for anon inode fd dump and restore during module upgrade. + +Signed-off-by: Xiaoguang Li +Signed-off-by: Jingxian He + +Signed-off-by: fu.lin +--- + criu/cr-restore.c | 3 +++ + criu/files-reg.c | 3 ++- + criu/include/image.h | 1 + + criu/include/mem.h | 1 + + criu/include/restorer.h | 6 ++++++ + criu/mem.c | 24 +++++++++++++++++++++++- + criu/pie/restorer.c | 32 ++++++++++++++++++++++++++++++++ + criu/proc_parse.c | 36 ++++++++++++++++++++++++++++++------ + images/vma.proto | 1 + + 9 files changed, 99 insertions(+), 8 deletions(-) + +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 05de2ef..7ceb8fe 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -1001,6 +1001,8 @@ static int restore_one_alive_task(int pid, CoreEntry *core) + if (prepare_vmas(current, ta)) + return -1; + ++ if (prepare_vma_names(current, ta)) ++ return -1; + /* + * Sockets have to be restored in their network namespaces, + * so a task namespace has to be restored after sockets. 
+@@ -3744,6 +3746,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns + #endif + + RST_MEM_FIXUP_PPTR(task_args->vmas); ++ RST_MEM_FIXUP_PPTR(task_args->vma_names); + RST_MEM_FIXUP_PPTR(task_args->rings); + RST_MEM_FIXUP_PPTR(task_args->tcp_socks); + RST_MEM_FIXUP_PPTR(task_args->timerfd); +diff --git a/criu/files-reg.c b/criu/files-reg.c +index aed1e73..4724994 100644 +--- a/criu/files-reg.c ++++ b/criu/files-reg.c +@@ -2215,7 +2215,7 @@ int do_open_reg_noseek_flags(int ns_root_fd, struct reg_file_info *rfi, void *ar + + /* unnamed temporary files are restored as ghost files */ + flags &= ~O_TMPFILE; +- ++ pr_info("openat path is: %s\n", rfi->path); + fd = openat(ns_root_fd, rfi->path, flags); + if (fd < 0) { + pr_perror("Can't open file %s on restore", rfi->path); +@@ -2387,6 +2387,7 @@ int collect_filemap(struct vma_area *vma) + if (!fd) + return -1; + ++ pr_info("find fd for %lx, shmid: %lx\n", vma->e->start, vma->e->shmid); + vma->vmfd = fd; + vma->vm_open = open_filemap; + return 0; +diff --git a/criu/include/image.h b/criu/include/image.h +index 62c8d7b..939db37 100644 +--- a/criu/include/image.h ++++ b/criu/include/image.h +@@ -84,6 +84,7 @@ + #define VMA_AREA_VVAR (1 << 12) + #define VMA_AREA_AIORING (1 << 13) + #define VMA_AREA_MEMFD (1 << 14) ++#define VMA_AREA_ANON_INODE (1 << 15) + + #define VMA_CLOSE (1 << 28) + #define VMA_NO_PROT_WRITE (1 << 29) +diff --git a/criu/include/mem.h b/criu/include/mem.h +index 3b3fdf8..b329c9e 100644 +--- a/criu/include/mem.h ++++ b/criu/include/mem.h +@@ -47,6 +47,7 @@ extern int parasite_dump_pages_seized(struct pstree_item *item, + struct task_restore_args; + int open_vmas(struct pstree_item *t); + int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta); ++int prepare_vma_names(struct pstree_item *t, struct task_restore_args *ta); + int unmap_guard_pages(struct pstree_item *t); + int prepare_mappings(struct pstree_item *t); + bool should_dump_page(VmaEntry *vmae, u64 
pme); +diff --git a/criu/include/restorer.h b/criu/include/restorer.h +index 4afff1b..f6b45d6 100644 +--- a/criu/include/restorer.h ++++ b/criu/include/restorer.h +@@ -127,6 +127,10 @@ struct restore_vma_io { + + #define RIO_SIZE(niovs) (sizeof(struct restore_vma_io) + (niovs) * sizeof(struct iovec)) + ++struct vma_names { ++ char name[PATH_MAX]; ++}; ++ + struct task_restore_args { + struct thread_restore_args *t; /* thread group leader */ + +@@ -150,6 +154,8 @@ struct task_restore_args { + VmaEntry *vmas; + unsigned int vmas_n; + ++ struct vma_names *vma_names; ++ + int vma_ios_fd; + struct restore_vma_io *vma_ios; + unsigned int vma_ios_n; +diff --git a/criu/mem.c b/criu/mem.c +index 2eabb8d..dd64f10 100644 +--- a/criu/mem.c ++++ b/criu/mem.c +@@ -652,6 +652,9 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, + continue; + } + ++ if (vma_entry_is(vma_area->e, VMA_AREA_ANON_INODE)) ++ continue; ++ + ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl, + &pmc, has_parent, mdc->pre_dump, + parent_predump_mode); +@@ -845,7 +848,6 @@ int prepare_mm_pid(struct pstree_item *i) + } + + pr_info("vma 0x%"PRIx64" 0x%"PRIx64"\n", vma->e->start, vma->e->end); +- + if (vma_area_is(vma, VMA_ANON_SHARED)) + ret = collect_shmem(pid, vma); + else if (vma_area_is(vma, VMA_FILE_PRIVATE) || +@@ -1500,6 +1502,9 @@ int open_vmas(struct pstree_item *t) + filemap_ctx_init(false); + + list_for_each_entry(vma, &vmas->h, list) { ++ if (vma_area_is(vma, VMA_AREA_ANON_INODE)) ++ continue; ++ + if (!vma_area_is(vma, VMA_AREA_REGULAR) || !vma->vm_open) + continue; + +@@ -1585,3 +1590,20 @@ int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta) + + return prepare_vma_ios(t, ta); + } ++ ++int prepare_vma_names(struct pstree_item *t, struct task_restore_args *ta) ++{ ++ struct vma_area *vma; ++ struct vm_area_list *vmas = &rsti(t)->vmas; ++ ta->vma_names = (struct vma_names *)rst_mem_align_cpos(RM_PRIVATE); ++ ++ list_for_each_entry(vma, &vmas->h, list) 
{ ++ struct vma_names *vma_names; ++ vma_names = rst_mem_alloc(sizeof(*vma_names), RM_PRIVATE); ++ if (!vma_names) ++ return -1; ++ ++ memcpy(vma_names->name, vma->e->name, strlen(vma->e->name) + 1); ++ } ++ return 0; ++} +diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c +index 2173c5e..0bd220a 100644 +--- a/criu/pie/restorer.c ++++ b/criu/pie/restorer.c +@@ -66,6 +66,7 @@ + #define FALLOC_FL_PUNCH_HOLE 0x02 + #endif + ++#define ANON_PROC_PATH "/sys/kernel/modrestore/anon_state_restore" + + #define sys_prctl_safe(opcode, val1, val2, val3) \ + ({ \ +@@ -798,6 +799,25 @@ unsigned long arch_shmat(int shmid, void *shmaddr, + } + #endif + ++static int restore_anon_mapping(VmaEntry *vma_entry, struct vma_names *vma_name) ++{ ++ int fd; ++ ++ fd = sys_open(ANON_PROC_PATH, O_WRONLY, 0); ++ if (fd < 0) { ++ pr_info("anon sys fs open fail:%s\n", ANON_PROC_PATH); ++ return fd; ++ } ++ pr_info("restore anon mapping: %s\n", vma_name->name); ++ ++ if (sys_write(fd, vma_name->name, 4096) < 0) { ++ sys_close(fd); ++ return -1; ++ } ++ sys_close(fd); ++ return 0; ++} ++ + static unsigned long restore_mapping(VmaEntry *vma_entry) + { + int prot = vma_entry->prot; +@@ -1569,6 +1589,7 @@ long __export_restore_task(struct task_restore_args *args) + pid_t my_pid = sys_getpid(); + rt_sigaction_t act; + bool has_vdso_proxy; ++ struct vma_names *vma_name; + + futex_set(&thread_inprogress, 1); + futex_set(&thread_start, 0); +@@ -1729,6 +1750,14 @@ long __export_restore_task(struct task_restore_args *args) + */ + for (i = 0; i < args->vmas_n; i++) { + vma_entry = args->vmas + i; ++ vma_name = args->vma_names + i; ++ ++ if (vma_entry_is(vma_entry, VMA_AREA_ANON_INODE)) { ++ pr_info("anon vma name:%s\n", vma_name->name); ++ if (restore_anon_mapping(vma_entry, vma_name) < 0) ++ goto core_restore_end; ++ continue; ++ } + + if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR) && + !vma_entry_is(vma_entry, VMA_AREA_AIORING)) +@@ -1853,6 +1882,9 @@ long __export_restore_task(struct 
task_restore_args *args) + if (!vma_entry->has_madv || !vma_entry->madv) + continue; + ++ if (vma_entry_is(vma_entry, VMA_AREA_ANON_INODE)) ++ continue; ++ + for (m = 0; m < sizeof(vma_entry->madv) * 8; m++) { + if (vma_entry->madv & (1ul << m)) { + ret = sys_madvise(vma_entry->start, +diff --git a/criu/proc_parse.c b/criu/proc_parse.c +index ba60832..23db7f3 100644 +--- a/criu/proc_parse.c ++++ b/criu/proc_parse.c +@@ -77,6 +77,7 @@ static char *buf = __buf.buf; + */ + + #define AIO_FNAME "/[aio]" ++#define ANON_FNAME "anon_inode" + + /* check the @line starts with "%lx-%lx" format */ + static bool __is_vma_range_fmt(char *line) +@@ -174,8 +175,19 @@ static void parse_vma_vmflags(char *buf, struct vma_area *vma_area) + * only exception is VVAR area that mapped by the kernel as + * VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP + */ +- if (io_pf && !vma_area_is(vma_area, VMA_AREA_VVAR)) ++ /* There are many types of io/pf vm_map, not only vvar, but also ++ * anon_inode, and char device. ++ * For anon_inode and char device, we use anon_notifier to restore ++ * status. Therefore, we disable the broken code here. 
++ */ ++ /* ++ if (io_pf && !vma_area_is(vma_area, VMA_AREA_VVAR) && ++ !vma_area_is(vma_area, VMA_AREA_ANON_INODE)) ++ { ++ pr_info("set current status tp VMA_UNSUPP\n"); + vma_area->e->status |= VMA_UNSUPP; ++ } ++ */ + + if (vma_area->e->madv) + vma_area->e->has_madv = true; +@@ -435,7 +447,6 @@ static int vma_get_mapfile(const char *fname, struct vma_area *vma, DIR *mfd, + + if (fstatat(dirfd(mfd), path, &buf, 0)) + return -1; +- + if (S_ISSOCK(buf.st_mode)) { + pr_info("Found socket mapping @%"PRIx64"\n", vma->e->start); + vma->vm_socket_id = buf.st_ino; +@@ -450,6 +461,21 @@ static int vma_get_mapfile(const char *fname, struct vma_area *vma, DIR *mfd, + return 0; + } + ++ if (!strncmp(fname, ANON_FNAME, sizeof(ANON_FNAME) - 1)) { ++ /*anon_inode*/ ++ close_safe(vm_file_fd); ++ vma->e->status = VMA_AREA_ANON_INODE; ++ vma->e->name = xmalloc(PATH_MAX); ++ if (!vma->e->name) { ++ pr_err("alloc vma name of anon-inode fail.\n"); ++ return -1; ++ } ++ snprintf(vma->e->name, PATH_MAX - 1, "%"PRIx64"-%"PRIx64 " %s", vma->e->start, vma->e->end, fname); ++ vma->e->name[PATH_MAX - 1] = 0; ++ pr_info("set vma_area status to: %d, name:%s\n", vma->e->status, vma->e->name); ++ return 0; ++ } ++ + pr_err("Unknown shit %o (%s)\n", buf.st_mode, fname); + return -1; + } +@@ -548,7 +574,7 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, + if (vma_get_mapfile(file_path, vma_area, map_files_dir, + vfi, prev_vfi, vm_file_fd)) + goto err_bogus_mapfile; +- ++ pr_info("handle_vam, vma status is: %d\n", vma_area->e->status); + if (vma_area->e->status != 0) + return 0; + +@@ -584,6 +610,7 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, + vma_area->e->shmid = prev->e->shmid; + vma_area->vmst = prev->vmst; + vma_area->mnt_id = prev->mnt_id; ++ vma_area->e->name = prev->e->name; + + if (!(vma_area->e->status & VMA_AREA_SYSVIPC)) { + vma_area->e->status &= ~(VMA_FILE_PRIVATE | VMA_FILE_SHARED); +@@ -753,7 +780,6 @@ int parse_smaps(pid_t pid, struct vm_area_list 
*vma_area_list, + if (IS_ERR(str)) + goto err; + eof = (str == NULL); +- + if (!eof && !__is_vma_range_fmt(str)) { + if (!strncmp(str, "Nonlinear", 9)) { + BUG_ON(!vma_area); +@@ -772,7 +798,6 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, + } else + continue; + } +- + if (vma_area && vma_list_add(vma_area, vma_area_list, + &prev_end, &vfi, &prev_vfi)) + goto err; +@@ -819,7 +844,6 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, + if (handle_vma(pid, vma_area, str + path_off, map_files_dir, + &vfi, &prev_vfi, &vm_file_fd)) + goto err; +- + if (vma_entry_is(vma_area->e, VMA_FILE_PRIVATE) || + vma_entry_is(vma_area->e, VMA_FILE_SHARED)) { + if (dump_filemap && dump_filemap(vma_area, vm_file_fd)) +diff --git a/images/vma.proto b/images/vma.proto +index 7085f42..f1ae4fb 100644 +--- a/images/vma.proto ++++ b/images/vma.proto +@@ -22,4 +22,5 @@ message vma_entry { + + /* file status flags */ + optional uint32 fdflags = 10 [(criu).hex = true]; ++ required string name = 11; + } +-- +2.34.0 + diff --git a/backport-0011--char_dev-add-support-for-char-device-dump-and-restor.patch b/backport-0011--char_dev-add-support-for-char-device-dump-and-restor.patch new file mode 100644 index 0000000000000000000000000000000000000000..97724716d143f20b3db2053f843b79e8d8fd6303 --- /dev/null +++ b/backport-0011--char_dev-add-support-for-char-device-dump-and-restor.patch @@ -0,0 +1,772 @@ +From d30bf77f5b9f47b60fbad8f5af4e2c30ec01beae Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 19 May 2021 21:55:34 +0800 +Subject: [PATCH 11/50] char_dev: add support for char device dump and restore + +Add support for char device dump and restore during module upgrade. + +`/sys/kernel/repairing_device` provides the char device whiltelist +with `IOCTL_CMD_{NEEDREPAIR, REPAIR}` command besides the internal +device list. +The device modules could use `mures_{add, del}_devname()` to add, or +delete the char device whitelist dynamically. 
+ +Signed-off-by: Xiaoguang Li +Signed-off-by: Jingxian He +Signed-off-by: fu.lin +--- + criu/Makefile.crtools | 1 + + criu/config.c | 1 + + criu/cr-dump.c | 3 + + criu/cr-restore.c | 4 +- + criu/crtools.c | 1 + + criu/devname.c | 130 ++++++++++++++++++++++++++++ + criu/files-reg.c | 34 +++++++- + criu/files.c | 159 ++++++++++++++++++++++++++++++++++- + criu/include/cr_options.h | 1 + + criu/include/files-reg.h | 9 ++ + criu/include/files.h | 6 ++ + criu/include/image-desc.h | 1 + + criu/include/image.h | 1 + + criu/include/protobuf-desc.h | 1 + + criu/include/util.h | 3 + + criu/mem.c | 6 +- + criu/proc_parse.c | 16 +++- + images/Makefile | 1 + + images/chr.proto | 12 +++ + images/fdinfo.proto | 3 + + 20 files changed, 382 insertions(+), 11 deletions(-) + create mode 100644 criu/devname.c + create mode 100644 images/chr.proto + +diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools +index dcffb4f..a9008f0 100644 +--- a/criu/Makefile.crtools ++++ b/criu/Makefile.crtools +@@ -89,6 +89,7 @@ obj-y += servicefd.o + obj-y += pie-util-vdso.o + obj-y += vdso.o + obj-y += timens.o ++obj-y += devname.o + obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o + obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o + CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 +diff --git a/criu/config.c b/criu/config.c +index 4d2b709..0ccd2b5 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -546,6 +546,7 @@ int parse_options(int argc, char **argv, bool *usage_error, + BOOL_OPT("use-fork-pid", &opts.use_fork_pid), + BOOL_OPT("with-notifier", &opts.with_notifier_kup), + BOOL_OPT("with-fd-cred", &opts.with_fd_cred), ++ BOOL_OPT("dump-char-dev", &opts.dump_char_dev), + { }, + }; + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index 96c0cd3..9ba27a2 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -1827,6 +1827,9 @@ int cr_dump_tasks(pid_t pid) + */ + rlimit_unlimit_nofile(); + ++ if (opts.dump_char_dev && parse_devname() < 0) ++ goto err; ++ + root_item = alloc_pstree_item(); + if (!root_item) + goto 
err; +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 7ceb8fe..7c198ce 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -349,11 +349,11 @@ static int root_prepare_shared(void) + if (pi->pid->state == TASK_HELPER) + continue; + +- ret = prepare_mm_pid(pi); ++ ret = prepare_fd_pid(pi); + if (ret < 0) + break; + +- ret = prepare_fd_pid(pi); ++ ret = prepare_mm_pid(pi); + if (ret < 0) + break; + +diff --git a/criu/crtools.c b/criu/crtools.c +index 942e683..26010b5 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -450,6 +450,7 @@ usage: + " Only for the host with these feature.\n" + " --with-fd-cred Allow to make the restored process has the same cred\n" + " as checkout assisted by kernel.\n" ++" --dump-char-dev Dump char dev files as normal file with repair cmd\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/devname.c b/criu/devname.c +new file mode 100644 +index 0000000..5f6fbed +--- /dev/null ++++ b/criu/devname.c +@@ -0,0 +1,130 @@ ++#include ++#include ++#include ++#include ++ ++#include "log.h" ++#include "common/xmalloc.h" ++ ++#define REPAIRING_DEVICE_FILE "/sys/kernel/repairing_device" ++#define ASCII_SIZE 128 ++ ++static void *root_bucket[ASCII_SIZE]; ++ ++static int insert_devname_internal(void *bucket[], const char *name) ++{ ++ void *new = NULL; ++ int idx = *name; ++ ++ if (bucket[idx] != NULL) ++ return insert_devname_internal(bucket[idx], name+1); ++ else if (idx == '\0') { ++ new = xmalloc(sizeof(void *)); ++ if (!new) { ++ pr_perror("alloc devname failed\n"); ++ return -1; ++ } ++ bucket[idx] = new; ++ return 0; ++ } else { ++ new = xmalloc(sizeof(void *) * ASCII_SIZE); ++ if (!new) { ++ pr_perror("alloc devname failed\n"); ++ return -1; ++ } ++ memset(new, 0, sizeof(void *) * ASCII_SIZE); ++ bucket[idx] = new; ++ return insert_devname_internal(bucket[idx], name+1); ++ } ++} ++ ++int insert_devname(const char *devname) ++{ ++ if (devname == 
NULL || *devname == '\0') // ignore ++ return 0; ++ ++ pr_debug("insert device '%s'\n", devname); ++ return insert_devname_internal(root_bucket, devname); ++} ++ ++int parse_devname(void) ++{ ++ int retval = -1; ++ char *line = NULL; ++ size_t len = 0; ++ ssize_t nread = 0; ++ FILE *fp = NULL; ++ ++ fp = fopen(REPAIRING_DEVICE_FILE, "r"); ++ if (fp == NULL) { ++ pr_info("Unable to open %s, downgrade to use internal whitelist\n", ++ REPAIRING_DEVICE_FILE); ++ return 0; ++ } ++ ++ while ((nread = getline(&line, &len, fp)) != -1) { ++ if (nread <= 1) // ignore empty string ++ continue; ++ ++ line[nread-1] = '\0'; // drop '\n' ++ retval = insert_devname(line); ++ if (retval != 0) ++ goto out; ++ } ++ retval = 0; ++ ++out: ++ free(line); ++ fclose(fp); ++ return retval; ++} ++ ++static const char *steal_devname(const char *name, ssize_t len) ++{ ++ ssize_t off = len; ++ ++ for (off -= 1; off > 0; off--) { ++ if (name[off] == '/') ++ break; ++ } ++ ++ return name + off + 1; ++} ++ ++static bool find_devname_internal(void *bucket[], const char *name) ++{ ++ int idx = *name; ++ ++ if (*name == '\0' && bucket[idx] != NULL) ++ return true; ++ else if (bucket[idx] == NULL) ++ return false; ++ else { ++ return find_devname_internal(bucket[idx], name+1); ++ } ++} ++ ++bool find_devname(const char *name) ++{ ++ const char *devname; ++ size_t len = 0; ++ bool found = false; ++ ++ if (name == NULL) ++ return false; ++ else if ((len = strlen(name)) == 0) ++ return false; ++ ++ devname = steal_devname(name, len); ++ found = find_devname_internal(root_bucket, devname); ++ ++ pr_debug("device '%s' (original name '%s') %s found in %s\n", ++ devname, name, found ? 
"is" : "isn't", REPAIRING_DEVICE_FILE); ++ ++ /* Compatible with the old version, there are still `strstr` branch in the following */ ++ found |= (strstr(name, "uverbs") != NULL ++ || strstr(name, "rdma_cm") != NULL ++ || strstr(name, "umad") != NULL); ++ ++ return found; ++} +diff --git a/criu/files-reg.c b/criu/files-reg.c +index 4724994..ba78c67 100644 +--- a/criu/files-reg.c ++++ b/criu/files-reg.c +@@ -1701,8 +1701,8 @@ int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) + rfe.has_mnt_id = true; + } + +- pr_info("Dumping path for %d fd via self %d [%s]\n", +- p->fd, lfd, &link->name[1]); ++ pr_info("Dumping path for %d fd via self %d [%s], id: %d\n", ++ p->fd, lfd, &link->name[1], id); + + /* + * The regular path we can handle should start with slash. +@@ -2366,6 +2366,34 @@ static int open_filemap(int pid, struct vma_area *vma) + return 0; + } + ++int collect_chr_map(struct pstree_item *me, struct vma_area *vma) ++{ ++ struct list_head *list = &rsti(me)->fds; ++ struct fdinfo_list_entry *fle, *tmp; ++ struct chrfile_info *ci; ++ bool exist_fd; ++ ++ list_for_each_entry_safe(fle, tmp, list, ps_list) { ++ struct file_desc *d = fle->desc; ++ ++ if (d->ops->type != FD_TYPES__CHR) ++ continue; ++ ++ ci = container_of(d, struct chrfile_info, d); ++ if (!strcmp(vma->e->name, ci->path)) { ++ vma->vmfd = d; ++ vma->e->fd = fle->fe->fd; ++ exist_fd = true; ++ break; ++ } ++ } ++ ++ if (!exist_fd) ++ return -EEXIST; ++ ++ return 0; ++} ++ + int collect_filemap(struct vma_area *vma) + { + struct file_desc *fd; +@@ -2453,7 +2481,7 @@ static int collect_one_regfile(void *o, ProtobufCMessage *base, struct cr_img *i + rfi->remap = NULL; + rfi->size_mode_checked = false; + +- pr_info("Collected [%s] ID %#x\n", rfi->path, rfi->rfe->id); ++ pr_info("Collected regfile [%s] ID %#x\n", rfi->path, rfi->rfe->id); + return file_desc_add(&rfi->d, rfi->rfe->id, ®_desc_ops); + } + +diff --git a/criu/files.c b/criu/files.c +index 6f580af..34aa8be 100644 +--- a/criu/files.c 
++++ b/criu/files.c +@@ -331,10 +331,32 @@ int do_dump_gen_file(struct fd_parms *p, int lfd, + e->fd = p->fd; + e->flags = p->fd_flags; + ++ pr_info("fdinfoEntry fd: %d\n", e->fd); + ret = fd_id_generate(p->pid, e, p); + if (ret == 1) /* new ID generated */ + ret = ops->dump(lfd, e->id, p); +- else ++ else if (ops->type == FD_TYPES__CHR) { ++ /* ++ * Sometimes the app_data subprocess may inherit the fd from ++ * app_data. Those fds may result the unconditional oops during ++ * the restoration of app_data. Therefore, prevent the dump in ++ * those condition. ++ */ ++ struct fd_link _link, *link; ++ ++ if (!p->link) { ++ if (fill_fdlink(lfd, p, &_link)) ++ return -1; ++ link = &_link; ++ } else ++ link = p->link; ++ ++ if (find_devname(link->name)) { ++ pr_err("char dev '%s' fd %d is owned by multi-processes\n", ++ link->name, e->fd); ++ ret = -1; ++ } ++ } else + /* Remove locks generated by the fd before going to the next */ + discard_dup_locks_tail(p->pid, e->fd); + +@@ -473,6 +495,58 @@ static int dump_blkdev(struct fd_parms *p, int lfd, FdinfoEntry *e) + return err; + } + ++static int dump_chr_file(int lfd, u32 id, const struct fd_parms *p) ++{ ++ int ret; ++ struct fd_link _link, *link; ++ struct cr_img *img; ++ FileEntry fe = FILE_ENTRY__INIT; ++ ChrfileEntry cfe = CHRFILE_ENTRY__INIT; ++ ++ if (!p->link) { ++ if (fill_fdlink(lfd, p, &_link)) ++ return -1; ++ link = &_link; ++ } else ++ link = p->link; ++ ++ pr_info("Dumping chr-file fd %d with lfd %d with id %d, name: %s\n", p->fd, lfd, id, link->name); ++ ++ if (strstr(link->name, "(deleted)") != NULL) { ++ pr_err("char device '%s' is deleted\n", link->name); ++ return -ENXIO; ++ } ++ ++ cfe.repair = false; ++ if (find_devname(link->name)) { ++ ret = ioctl(lfd, IOCTL_CMD_NEEDREPAIR, 0); ++ if (ret <= 0) { ++ pr_err("ioctl cmd needrepair failed, errno: %d, %s\n", ret, strerror(errno)); ++ return -1; ++ } else { ++ pr_info("char device needrepair cmd return: %d\n", ret); ++ cfe.index = ret; ++ cfe.repair = 
true; ++ } ++ } ++ ++ cfe.id = id; ++ cfe.name = &link->name[1]; ++ cfe.flags = p->flags; ++ fe.type = FD_TYPES__CHR; ++ fe.id = cfe.id; ++ fe.chr = &cfe; ++ ++ img = img_from_set(glob_imgset, CR_FD_FILES); ++ ret = pb_write_one(img, &fe, PB_FILE); ++ return ret; ++} ++ ++const struct fdtype_ops chr_dump_ops = { ++ .type = FD_TYPES__CHR, ++ .dump = dump_chr_file, ++}; ++ + static int dump_chrdev(struct fd_parms *p, int lfd, FdinfoEntry *e) + { + struct fd_link *link_old = p->link; +@@ -500,6 +574,10 @@ static int dump_chrdev(struct fd_parms *p, int lfd, FdinfoEntry *e) + ops = &tty_dump_ops; + break; + } ++ if (opts.dump_char_dev) { ++ ops = &chr_dump_ops; ++ break; ++ } + + sprintf(more, "%d:%d", maj, minor(p->stat.st_rdev)); + err = dump_unsupp_fd(p, lfd, "chr", more, e); +@@ -513,6 +591,12 @@ static int dump_chrdev(struct fd_parms *p, int lfd, FdinfoEntry *e) + return err; + } + ++/* Checks if file descriptor @lfd is infinibandevent */ ++int is_infiniband_link(char *link) ++{ ++ return is_anon_link_type(link, "[infinibandevent]"); ++} ++ + static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, + struct parasite_ctl *ctl, FdinfoEntry *e, + struct parasite_drain_fd *dfds) +@@ -567,6 +651,8 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, + ops = &signalfd_dump_ops; + else if (is_timerfd_link(link)) + ops = &timerfd_dump_ops; ++ else if (is_infiniband_link(link)) ++ return 1; + #ifdef CONFIG_HAS_LIBBPF + else if (is_bpfmap_link(link)) + ops = &bpfmap_dump_ops; +@@ -673,9 +759,15 @@ int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, + + ret = dump_one_file(item->pid, dfds->fds[i + off], + lfds[i], opts + i, ctl, &e, dfds); +- if (ret) ++ if (ret < 0) + break; ++ /* infiniband link file */ ++ if (ret > 0) { ++ ret = 0; ++ continue; ++ } + ++ pr_info("write fdinfoEntry fd=%d id=%d\n", (&e)->fd, (&e)->id); + ret = pb_write_one(img, &e, PB_FDINFO); + if (ret) + break; +@@ -933,6 
+1025,7 @@ int prepare_fd_pid(struct pstree_item *item) + if (!img) + return -1; + ++ pr_info("prepare_fd_pid\n"); + while (1) { + FdinfoEntry *e; + +@@ -1140,6 +1233,7 @@ int setup_and_serve_out(struct fdinfo_list_entry *fle, int new_fd) + if (reopen_fd_as(fle->fe->fd, new_fd)) + return -1; + ++ pr_info("*******flags: %d",fle->fe->flags); + if (fcntl(fle->fe->fd, F_SETFD, fle->fe->flags) == -1) { + pr_perror("Unable to set file descriptor flags"); + return -1; +@@ -1690,6 +1784,64 @@ out: + return ret; + } + ++static int chrfile_open(struct file_desc *d, int *new_fd) ++{ ++ int fd, mntns_root; ++ int ret = 0; ++ struct chrfile_info *ci; ++ ++ ci = container_of(d, struct chrfile_info, d); ++ ++ mntns_root = open_pid_proc(getpid()); ++ fd = openat(mntns_root, ci->path, ci->cfe->flags | O_REPAIR); ++ if (fd < 0){ ++ pr_err("open chr file failed\n"); ++ return -1; ++ } ++ ++ if (ci->cfe->repair) { ++ ret = ioctl(fd, IOCTL_CMD_REPAIR , ci->cfe->index); ++ pr_info("repair ioctl return: %d, index: %d\n", ret, ci->cfe->index); ++ if (ret) ++ goto err; ++ } ++ ++ *new_fd = fd; ++ return ret; ++err: ++ close(fd); ++ return ret; ++} ++ ++static struct file_desc_ops chrfile_desc_ops = { ++ .type = FD_TYPES__CHR, ++ .open = chrfile_open, ++}; ++ ++static int collect_one_chrfile(void *o, ProtobufCMessage *base, struct cr_img *i) ++{ ++ struct chrfile_info *ci = o; ++ static char dot[] = "."; ++ ++ ci->cfe = pb_msg(base, ChrfileEntry); ++ if (ci->cfe->name[1] == '\0') ++ ci->path = dot; ++ else ++ ci->path = ci->cfe->name; ++ ++ pr_info("Collected chr file: %#x, name: %s\n", ci->cfe->id, ci->path); ++ file_desc_add(&ci->d, ci->cfe->id, &chrfile_desc_ops); ++ ++ return 0; ++} ++ ++struct collect_image_info chrfile_cinfo = { ++ .fd_type = CR_FD_CHRFILE, ++ .pb_type = PB_CHRFILE, ++ .priv_size = sizeof(struct chrfile_info), ++ .collect = collect_one_chrfile, ++}; ++ + static int collect_one_file_entry(FileEntry *fe, u_int32_t id, ProtobufCMessage *base, + struct collect_image_info 
*cinfo) + { +@@ -1770,6 +1922,9 @@ static int collect_one_file(void *o, ProtobufCMessage *base, struct cr_img *i) + ret = collect_one_file_entry(fe, fe->bpf->id, &fe->bpf->base, &bpfmap_cinfo); + break; + #endif ++ case FD_TYPES__CHR: ++ ret = collect_one_file_entry(fe, fe->chr->id, &fe->chr->base, &chrfile_cinfo); ++ break; + } + + return ret; +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 5b0ff24..5ca177a 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -180,6 +180,7 @@ struct cr_options { + int use_fork_pid; + int with_notifier_kup; + int with_fd_cred; ++ int dump_char_dev; + }; + + extern struct cr_options opts; +diff --git a/criu/include/files-reg.h b/criu/include/files-reg.h +index 016d76a..458fe89 100644 +--- a/criu/include/files-reg.h ++++ b/criu/include/files-reg.h +@@ -4,6 +4,7 @@ + #include "files.h" + + #include "images/regfile.pb-c.h" ++#include "images/chr.pb-c.h" + #include "images/ghost-file.pb-c.h" + + struct cr_imgset; +@@ -26,12 +27,19 @@ struct reg_file_info { + char *path; + }; + ++struct chrfile_info { ++ struct file_desc d; ++ ChrfileEntry *cfe; ++ char *path; ++}; ++ + extern int open_reg_by_id(u32 id); + extern int open_reg_fd(struct file_desc *); + extern int open_path(struct file_desc *, int (*open_cb)(int ns_root_fd, + struct reg_file_info *, void *), void *arg); + + extern const struct fdtype_ops regfile_dump_ops; ++extern const struct fdtype_ops chr_dump_ops; + extern int do_open_reg_noseek_flags(int ns_root_fd, struct reg_file_info *rfi, void *arg); + extern int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p); + +@@ -40,6 +48,7 @@ extern struct file_remap *lookup_ghost_remap(u32 dev, u32 ino); + extern struct file_desc *try_collect_special_file(u32 id, int optional); + #define collect_special_file(id) try_collect_special_file(id, 0) + extern int collect_filemap(struct vma_area *); ++extern int collect_chr_map(struct pstree_item *me, struct vma_area *); + extern void 
filemap_ctx_init(bool auto_close); + extern void filemap_ctx_fini(void); + +diff --git a/criu/include/files.h b/criu/include/files.h +index 2c1e1e7..b12d079 100644 +--- a/criu/include/files.h ++++ b/criu/include/files.h +@@ -15,6 +15,12 @@ + #include "images/fown.pb-c.h" + #include "images/vma.pb-c.h" + ++#ifndef IOCTL_CMD_NEEDREPAIR ++#define IOCTL_CMD_NEEDREPAIR 0x00100000UL ++#define IOCTL_CMD_REPAIR 0x00200000UL ++#define O_REPAIR 040000000 ++#endif ++ + struct parasite_drain_fd; + struct pstree_item; + struct file_desc; +diff --git a/criu/include/image-desc.h b/criu/include/image-desc.h +index f69cc58..22676ae 100644 +--- a/criu/include/image-desc.h ++++ b/criu/include/image-desc.h +@@ -114,6 +114,7 @@ enum { + CR_FD_MEMFD_FILE, + + CR_FD_AUTOFS, ++ CR_FD_CHRFILE, + + CR_FD_MAX + }; +diff --git a/criu/include/image.h b/criu/include/image.h +index 939db37..70f17a5 100644 +--- a/criu/include/image.h ++++ b/criu/include/image.h +@@ -85,6 +85,7 @@ + #define VMA_AREA_AIORING (1 << 13) + #define VMA_AREA_MEMFD (1 << 14) + #define VMA_AREA_ANON_INODE (1 << 15) ++#define VMA_AREA_CHR (1 << 16) + + #define VMA_CLOSE (1 << 28) + #define VMA_NO_PROT_WRITE (1 << 29) +diff --git a/criu/include/protobuf-desc.h b/criu/include/protobuf-desc.h +index 35fa1a9..e7df57e 100644 +--- a/criu/include/protobuf-desc.h ++++ b/criu/include/protobuf-desc.h +@@ -69,6 +69,7 @@ enum { + PB_PIDNS, + PB_BPFMAP_FILE, + PB_BPFMAP_DATA, ++ PB_CHRFILE, + + /* PB_AUTOGEN_STOP */ + +diff --git a/criu/include/util.h b/criu/include/util.h +index d226d2c..cf9a8f4 100644 +--- a/criu/include/util.h ++++ b/criu/include/util.h +@@ -422,4 +422,7 @@ enum notifier_state { + int notifier_kup(enum KUP_HOOK_POINT, enum nvwa_cmd, bool); + void do_notifier_rollback(bool, enum notifier_state); + ++int parse_devname(void); ++bool find_devname(const char *name); ++ + #endif /* __CR_UTIL_H__ */ +diff --git a/criu/mem.c b/criu/mem.c +index dd64f10..d56f69e 100644 +--- a/criu/mem.c ++++ b/criu/mem.c +@@ -848,7 +848,9 @@ 
int prepare_mm_pid(struct pstree_item *i) + } + + pr_info("vma 0x%"PRIx64" 0x%"PRIx64"\n", vma->e->start, vma->e->end); +- if (vma_area_is(vma, VMA_ANON_SHARED)) ++ if (vma_area_is(vma, VMA_AREA_CHR)) ++ ret = collect_chr_map(i, vma); ++ else if (vma_area_is(vma, VMA_ANON_SHARED)) + ret = collect_shmem(pid, vma); + else if (vma_area_is(vma, VMA_FILE_PRIVATE) || + vma_area_is(vma, VMA_FILE_SHARED)) +@@ -1502,7 +1504,7 @@ int open_vmas(struct pstree_item *t) + filemap_ctx_init(false); + + list_for_each_entry(vma, &vmas->h, list) { +- if (vma_area_is(vma, VMA_AREA_ANON_INODE)) ++ if (vma_area_is(vma, VMA_AREA_ANON_INODE) || vma_area_is(vma, VMA_AREA_CHR)) + continue; + + if (!vma_area_is(vma, VMA_AREA_REGULAR) || !vma->vm_open) +diff --git a/criu/proc_parse.c b/criu/proc_parse.c +index 23db7f3..4f5bbaa 100644 +--- a/criu/proc_parse.c ++++ b/criu/proc_parse.c +@@ -622,11 +622,23 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, + } else if (*vm_file_fd >= 0) { + struct stat *st_buf = vma_area->vmst; + ++ pr_info("file mode is: %x, st_ino: %ld\n", st_buf->st_mode, st_buf->st_ino); + if (S_ISREG(st_buf->st_mode)) + /* regular file mapping -- supported */; +- else if (S_ISCHR(st_buf->st_mode) && (st_buf->st_rdev == DEVZERO)) ++ else if (S_ISCHR(st_buf->st_mode)) { + /* devzero mapping -- also makes sense */; +- else { ++ if (opts.dump_char_dev && (strstr(file_path, "uverbs") != NULL)) { ++ int len = strlen(file_path) + 1; ++ vma_area->e->status |= VMA_AREA_CHR; ++ vma_area->e->name = xmalloc(len); ++ if (!vma_area->e->name) { ++ pr_err("alloc vma area name fail\n"); ++ goto err; ++ } ++ strncpy(vma_area->e->name, file_path, len); ++ pr_info("uverbs name content is: %s\n", vma_area->e->name); ++ } ++ } else { + pr_err("Can't handle non-regular mapping on %d's map %"PRIx64"\n", pid, vma_area->e->start); + goto err; + } +diff --git a/images/Makefile b/images/Makefile +index 34bc367..efd6fcb 100644 +--- a/images/Makefile ++++ b/images/Makefile +@@ -70,6 +70,7 @@ 
proto-obj-y += timens.o + proto-obj-y += img-streamer.o + proto-obj-y += bpfmap-file.o + proto-obj-y += bpfmap-data.o ++proto-obj-y += chr.o + + CFLAGS += -iquote $(obj)/ + +diff --git a/images/chr.proto b/images/chr.proto +new file mode 100644 +index 0000000..67929db +--- /dev/null ++++ b/images/chr.proto +@@ -0,0 +1,12 @@ ++syntax = "proto2"; ++ ++import "opts.proto"; ++ ++message chrfile_entry { ++ required uint32 id = 1; ++ required uint32 flags = 2 [(criu).flags = "rfile.flags"]; ++ required uint32 index = 3; ++ required string name = 4; ++ required bool repair = 5; ++}; ++ +diff --git a/images/fdinfo.proto b/images/fdinfo.proto +index f5e1895..8561da4 100644 +--- a/images/fdinfo.proto ++++ b/images/fdinfo.proto +@@ -18,6 +18,7 @@ import "pipe.proto"; + import "tty.proto"; + import "memfd.proto"; + import "bpfmap-file.proto"; ++import "chr.proto"; + + enum fd_types { + UND = 0; +@@ -40,6 +41,7 @@ enum fd_types { + TIMERFD = 17; + MEMFD = 18; + BPFMAP = 19; ++ CHR = 21; + + /* Any number above the real used. Not stored to image */ + CTL_TTY = 65534; +@@ -76,4 +78,5 @@ message file_entry { + optional tty_file_entry tty = 19; + optional memfd_file_entry memfd = 20; + optional bpfmap_file_entry bpf = 21; ++ optional chrfile_entry chr = 23; + } +-- +2.34.0 + diff --git a/backport-0012--socket-fix-connect-error-of-invalid-param.patch b/backport-0012--socket-fix-connect-error-of-invalid-param.patch new file mode 100644 index 0000000000000000000000000000000000000000..4b24b4998785f65d228143267c12d9ff09c6727c --- /dev/null +++ b/backport-0012--socket-fix-connect-error-of-invalid-param.patch @@ -0,0 +1,95 @@ +From 1be115f1b1732e2a8490a83495b48aa35df8dd99 Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 19 May 2021 21:56:16 +0800 +Subject: [PATCH 12/50] socket: fix connect error of invalid param + +Fix connect error of invalid param during module upgrade. 
+ +Signed-off-by: Xiaoguang Li +Signed-off-by: fu.lin +--- + criu/include/sockets.h | 1 + + criu/sk-inet.c | 13 +++++++++++-- + criu/sockets.c | 5 ++++- + 3 files changed, 16 insertions(+), 3 deletions(-) + +diff --git a/criu/include/sockets.h b/criu/include/sockets.h +index e971f3e..74c5ae4 100644 +--- a/criu/include/sockets.h ++++ b/criu/include/sockets.h +@@ -27,6 +27,7 @@ struct socket_desc { + extern int dump_socket(struct fd_parms *p, int lfd, FdinfoEntry *); + extern int dump_socket_opts(int sk, SkOptsEntry *soe); + extern int restore_socket_opts(int sk, SkOptsEntry *soe); ++extern int restore_bound_opts(int sk, SkOptsEntry *soe); + extern void release_skopts(SkOptsEntry *); + extern int restore_prepare_socket(int sk); + extern void preload_socket_modules(void); +diff --git a/criu/sk-inet.c b/criu/sk-inet.c +index d90c53b..7a05de2 100644 +--- a/criu/sk-inet.c ++++ b/criu/sk-inet.c +@@ -102,19 +102,24 @@ static void show_one_inet(const char *act, const struct inet_sk_desc *sk) + static void show_one_inet_img(const char *act, const InetSkEntry *e) + { + char src_addr[INET_ADDR_LEN] = ""; ++ char dst_addr[INET_ADDR_LEN] = ""; + + if (inet_ntop(e->family, (void *)e->src_addr, src_addr, + INET_ADDR_LEN) == NULL) { + pr_perror("Failed to translate address"); + } ++ if (inet_ntop(e->family, (void *)e->dst_addr, dst_addr, ++ INET_ADDR_LEN) == NULL) { ++ pr_perror("Failed to translate address"); ++ } + + pr_debug("\t%s: family %-10s type %-14s proto %-16s port %d " +- "state %-16s src_addr %s\n", act, ++ "state %-16s src_addr %s dst_addr %s\n", act, + ___socket_family_name(e->family), + ___socket_type_name(e->type), + ___socket_proto_name(e->proto), + e->src_port, ___tcp_state_name(e->state), +- src_addr); ++ src_addr, dst_addr); + } + + static int can_dump_ipproto(unsigned int ino, int proto, int type) +@@ -876,6 +881,10 @@ static int open_inet_sk(struct file_desc *d, int *new_fd) + if (restore_opt(sk, SOL_SOCKET, SO_REUSEPORT, &yes)) + goto err; + ++ 
if(restore_bound_opts(sk, ie->opts) < 0){ ++ goto err; ++ } ++ + if (tcp_connection(ie)) { + if (!opts.tcp_established_ok && !opts.tcp_close) { + pr_err("Connected TCP socket in image\n"); +diff --git a/criu/sockets.c b/criu/sockets.c +index a73967e..609bfb1 100644 +--- a/criu/sockets.c ++++ b/criu/sockets.c +@@ -605,7 +605,6 @@ int restore_socket_opts(int sk, SkOptsEntry *soe) + tv.tv_usec = soe->so_rcv_tmo_usec; + ret |= restore_opt(sk, SOL_SOCKET, SO_RCVTIMEO, &tv); + +- ret |= restore_bound_dev(sk, soe); + ret |= restore_socket_filter(sk, soe); + + /* The restore of SO_REUSEADDR depends on type of socket */ +@@ -613,6 +612,10 @@ int restore_socket_opts(int sk, SkOptsEntry *soe) + return ret; + } + ++int restore_bound_opts(int sk, SkOptsEntry *soe){ ++ return restore_bound_dev(sk, soe); ++} ++ + int do_dump_opt(int sk, int level, int name, void *val, int len) + { + socklen_t aux = len; +-- +2.34.0 + diff --git a/backport-0013--criu-eventpollfd-fix-for-improper-usage-in-appdata.patch b/backport-0013--criu-eventpollfd-fix-for-improper-usage-in-appdata.patch new file mode 100644 index 0000000000000000000000000000000000000000..8b02d2d65544e06425e25d3d8a2d3665724424ff --- /dev/null +++ b/backport-0013--criu-eventpollfd-fix-for-improper-usage-in-appdata.patch @@ -0,0 +1,97 @@ +From b3a7dd41f8524b2291cc93897c4915afa9ac33c4 Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 19 May 2021 21:56:38 +0800 +Subject: [PATCH 13/50] criu: eventpollfd fix for improper usage in appdata + +Fix eventpollfd problem of improper usage in appdata. 
+ +Signed-off-by: Jingxian He +Signed-off-by: fu.lin +--- + criu/eventpoll.c | 16 +++++++++++----- + criu/proc_parse.c | 2 ++ + images/eventpoll.proto | 3 +++ + 3 files changed, 16 insertions(+), 5 deletions(-) + +diff --git a/criu/eventpoll.c b/criu/eventpoll.c +index 9818f24..6097e42 100644 +--- a/criu/eventpoll.c ++++ b/criu/eventpoll.c +@@ -67,8 +67,8 @@ int is_eventpoll_link(char *link) + + static void pr_info_eventpoll_tfd(char *action, uint32_t id, EventpollTfdEntry *e) + { +- pr_info("%seventpoll-tfd: id %#08x tfd %8d events %#08x data %#016"PRIx64"\n", +- action, id, e->tfd, e->events, e->data); ++ pr_info("%seventpoll-tfd: id %#08x tfd %8d events %#08x data %#016"PRIx64" ignore %d\n", ++ action, id, e->tfd, e->events, e->data, e->ignore); + } + + static void pr_info_eventpoll(char *action, EventpollFileEntry *e) +@@ -146,9 +146,9 @@ int flush_eventpoll_dinfo_queue(void) + }; + struct kid_elem *t = kid_lookup_epoll_tfd(&fd_tree, &ke, &slot); + if (!t) { +- pr_debug("kid_lookup_epoll: no match pid %d efd %d tfd %d toff %u\n", +- dinfo->pid, dinfo->efd, tfde->tfd, dinfo->toff[i].off); +- goto err; ++ tfde->ignore = 1; ++ pr_info("Drop tfd entry, efd=%d, tfd=%d\n", slot.efd, slot.tfd); ++ continue; + } + + pr_debug("kid_lookup_epoll: rbsearch match pid %d efd %d tfd %d toff %u -> %d\n", +@@ -161,6 +161,7 @@ int flush_eventpoll_dinfo_queue(void) + goto err; + } + ++ pr_info("Change tfd: %d -> %d @ efd=%d\n", tfde->tfd, t->idx, slot.efd); + tfde->tfd = t->idx; + } + +@@ -416,6 +417,11 @@ static int eventpoll_retore_tfd(int fd, int id, EventpollTfdEntry *tdefe) + { + struct epoll_event event; + ++ if (tdefe->ignore) { ++ pr_info_eventpoll_tfd("Ignore ", id, tdefe); ++ return 0; ++ } ++ + pr_info_eventpoll_tfd("Restore ", id, tdefe); + + event.events = tdefe->events; +diff --git a/criu/proc_parse.c b/criu/proc_parse.c +index 4f5bbaa..32d84b3 100644 +--- a/criu/proc_parse.c ++++ b/criu/proc_parse.c +@@ -1921,10 +1921,12 @@ static int parse_fdinfo_pid_s(int pid, 
int fd, int type, void *arg) + e->has_dev = false; + e->has_inode = false; + e->has_pos = false; ++ e->has_ignore = false; + } else if (ret == 6) { + e->has_dev = true; + e->has_inode = true; + e->has_pos = true; ++ e->has_ignore = true; + } else if (ret < 6) { + eventpoll_tfd_entry__free_unpacked(e, NULL); + goto parse_err; +diff --git a/images/eventpoll.proto b/images/eventpoll.proto +index 4a8d1b8..20c9a15 100644 +--- a/images/eventpoll.proto ++++ b/images/eventpoll.proto +@@ -12,6 +12,9 @@ message eventpoll_tfd_entry { + optional uint32 dev = 5; + optional uint64 inode = 6; + optional uint64 pos = 7; ++ ++ /* entry validation */ ++ optional uint32 ignore = 8; + } + + message eventpoll_file_entry { +-- +2.34.0 + diff --git a/backport-0014--task_exit_notify-add-task-exit-notify-mask-method-fo.patch b/backport-0014--task_exit_notify-add-task-exit-notify-mask-method-fo.patch new file mode 100644 index 0000000000000000000000000000000000000000..2e9f95a76e2ad9afd919cb92a7a98567b5d9cf45 --- /dev/null +++ b/backport-0014--task_exit_notify-add-task-exit-notify-mask-method-fo.patch @@ -0,0 +1,143 @@ +From 4aea5a9673e882a2ff2b67bca94a3f66f5bffea8 Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 19 May 2021 21:59:24 +0800 +Subject: [PATCH 14/50] task_exit_notify: add task exit notify mask method for + criu + +Add task exit notify mask method for criu during kernel module upgrade. 
+ +Signed-off-by: Jingxian He +--- + criu/config.c | 1 + + criu/cr-restore.c | 8 ++++++++ + criu/crtools.c | 1 + + criu/include/cr_options.h | 1 + + criu/include/util.h | 5 +++++ + criu/seize.c | 33 ++++++++++++++++++++++++++++++++- + 6 files changed, 48 insertions(+), 1 deletion(-) + +diff --git a/criu/config.c b/criu/config.c +index 0ccd2b5..9c4d8ce 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -547,6 +547,7 @@ int parse_options(int argc, char **argv, bool *usage_error, + BOOL_OPT("with-notifier", &opts.with_notifier_kup), + BOOL_OPT("with-fd-cred", &opts.with_fd_cred), + BOOL_OPT("dump-char-dev", &opts.dump_char_dev), ++ BOOL_OPT("mask-exit-notify", &opts.mask_exit_notify), + { }, + }; + +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 7c198ce..ecebdfe 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -1583,6 +1583,14 @@ static inline int fork_with_pid(struct pstree_item *item) + item->pid->real, vpid(item)); + } + ++ if (opts.mask_exit_notify) { ++ int mask_pid = ret; ++ pr_info("start unmask for %d\n", mask_pid); ++ ret = mask_task_exit_notify(mask_pid, false); ++ if (ret) ++ pr_err("unmask exit notify fail for: %d\n", mask_pid); ++ } ++ + err_unlock: + if (!(clone_flags & CLONE_NEWPID)) + unlock_last_pid(); +diff --git a/criu/crtools.c b/criu/crtools.c +index 26010b5..8694ed0 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -451,6 +451,7 @@ usage: + " --with-fd-cred Allow to make the restored process has the same cred\n" + " as checkout assisted by kernel.\n" + " --dump-char-dev Dump char dev files as normal file with repair cmd\n" ++" --mask-exit-notify Mask task exit notify during dump and restore\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 5ca177a..5b3ff86 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -181,6 +181,7 @@ struct cr_options { + int 
with_notifier_kup; + int with_fd_cred; + int dump_char_dev; ++ int mask_exit_notify; + }; + + extern struct cr_options opts; +diff --git a/criu/include/util.h b/criu/include/util.h +index cf9a8f4..3a4b8f9 100644 +--- a/criu/include/util.h ++++ b/criu/include/util.h +@@ -425,4 +425,9 @@ void do_notifier_rollback(bool, enum notifier_state); + int parse_devname(void); + bool find_devname(const char *name); + ++#define PID_BUF_SIZE 32 ++#define MASK_EXIT_NOTIFY_DIR "/sys/kernel/mask_exit_notify" ++#define UNMASK_EXIT_NOTIFY_DIR "/sys/kernel/unmask_exit_notify" ++int mask_task_exit_notify(int pid, bool mask); ++ + #endif /* __CR_UTIL_H__ */ +diff --git a/criu/seize.c b/criu/seize.c +index a661097..387681c 100644 +--- a/criu/seize.c ++++ b/criu/seize.c +@@ -649,9 +649,35 @@ free: + return ret < 0 ? ret : nr_inprogress; + } + ++int mask_task_exit_notify(int pid, bool mask) ++{ ++ int fd, retval; ++ char buf[PID_BUF_SIZE] = {0}; ++ ++ if (pid <= 0) ++ return -1; ++ ++ snprintf(buf, PID_BUF_SIZE - 1, "%d", pid); ++ if (mask) ++ fd = open(MASK_EXIT_NOTIFY_DIR, O_WRONLY, 0); ++ else ++ fd = open(UNMASK_EXIT_NOTIFY_DIR, O_WRONLY, 0); ++ if (fd < 0) { ++ pr_err("open mask exit notify file fail\n"); ++ return fd; ++ } ++ ++ retval = write(fd, buf, PID_BUF_SIZE); ++ if (retval < 0) ++ pr_err("Write mask exit pid: %s fail\n", buf); ++ close(fd); ++ ++ return retval < 0 ? 
-1 : 0; ++} ++ + static void unseize_task_and_threads(const struct pstree_item *item, int st) + { +- int i; ++ int i, ret; + + if (item->pid->state == TASK_DEAD) + return; +@@ -660,6 +686,11 @@ static void unseize_task_and_threads(const struct pstree_item *item, int st) + for (i = 0; i < item->nr_threads; i++) + dump_task_special_pages(item->threads[i].real); + } ++ if (opts.mask_exit_notify) { ++ ret = mask_task_exit_notify(item->threads[0].real, true); ++ if (ret) ++ pr_err("mask exit notify for %d fail.\n", item->threads[0].real); ++ } + + /* + * The st is the state we want to switch tasks into, +-- +2.34.0 + diff --git a/backport-0015--selinux-fix-selinux-context-lable-check.patch b/backport-0015--selinux-fix-selinux-context-lable-check.patch new file mode 100644 index 0000000000000000000000000000000000000000..d1986815600d858d697884c935a0fb29f85c84dc --- /dev/null +++ b/backport-0015--selinux-fix-selinux-context-lable-check.patch @@ -0,0 +1,52 @@ +From 1ead7caf44d6ce9330cf71a735db1b374bb463ea Mon Sep 17 00:00:00 2001 +From: Xiaoguang Li +Date: Tue, 25 May 2021 02:40:30 +0000 +Subject: [PATCH 15/50] selinux: fix selinux context lable check + +Background: + SELinux has three statuses: disabled, permissive, and enforcing. + If the status of the SELinux wasn't disabled, it would configure + the rules using `/etc/selinux/targeted`. However, because of the + nonexistent rules in `/etc/selinux/targeted`, the security label + of processes is `kernel` instead of + `unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023` read + from `/proc/<pid>/attr/current`. It will result in the failure of + criu dumping.
+ +Signed-off-by: lixiaoguang2 +Signed-off-by: fu.lin +--- + criu/lsm.c | 12 +++++++++++- + 1 file changed, 11 insertions(+), 1 deletion(-) + +diff --git a/criu/lsm.c b/criu/lsm.c +index 7cc3604..6713ca7 100644 +--- a/criu/lsm.c ++++ b/criu/lsm.c +@@ -78,12 +78,22 @@ static int selinux_get_label(pid_t pid, char **output) + if (!*output) + goto err; + ++ pos = (char*)ctx; ++ /* ++ * If the SELinux context is not configured, the label may look like ++ * this: ++ * "kernel" ++ */ ++ if (!strstr(pos, ":")) { ++ ret = 0; ++ goto err; ++ } ++ + /* + * Make sure it is a valid SELinux label. It should look like this: + * + * unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 + */ +- pos = (char*)ctx; + for (i = 0; i < 3; i++) { + pos = strstr(pos, ":"); + if (!pos) { +-- +2.34.0 + diff --git a/backport-0016--unix-socket-add-support-for-unix-stream-socket.patch b/backport-0016--unix-socket-add-support-for-unix-stream-socket.patch new file mode 100644 index 0000000000000000000000000000000000000000..e197133c7fb2b8d40f0b69969a58d89b64b7e7ce --- /dev/null +++ b/backport-0016--unix-socket-add-support-for-unix-stream-socket.patch @@ -0,0 +1,268 @@ +From 933965a0ed10918778aa2563e05f276b7e2b0b6a Mon Sep 17 00:00:00 2001 +From: Luo Longjun +Date: Mon, 7 Jun 2021 11:50:42 +0800 +Subject: [PATCH 16/50] unix socket: add support for unix stream socket + +When dumping a unix stream socket with external connections, +we tell the kernel to turn repair mode on for this sock. +The kernel then keeps this sock until it is restored. +In this process, the other socket which communicates with +this sock in repair mode will get EAGAIN or block. + +Signed-off-by: Luo Longjun + +fix unix socket dump and restore err + +Fix name-less unix socket dump and restore problem. + +Signed-off-by: Jingxian He + +unix socket:ignore repair error from kernel + +leave error for applications to deal with.
+ +Signed-off-by: Luo Longjun + +update +--- + criu/cr-dump.c | 1 + + criu/include/sockets.h | 1 + + criu/sk-unix.c | 105 +++++++++++++++++++++++++++++++++++++---- + images/sk-unix.proto | 1 + + 4 files changed, 99 insertions(+), 9 deletions(-) + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index 9ba27a2..2bbcef3 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -1715,6 +1715,7 @@ static int cr_dump_finish(int ret) + + cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret); + cgp_fini(); ++ unix_stream_unlock(ret); + + if (!ret) { + /* +diff --git a/criu/include/sockets.h b/criu/include/sockets.h +index 74c5ae4..c9cf427 100644 +--- a/criu/include/sockets.h ++++ b/criu/include/sockets.h +@@ -43,6 +43,7 @@ extern int add_fake_unix_queuers(void); + extern int fix_external_unix_sockets(void); + extern int prepare_scms(void); + extern int unix_note_scm_rights(int id_for, uint32_t *file_ids, int *fds, int n_ids); ++extern void unix_stream_unlock(int ret); + + extern struct collect_image_info netlink_sk_cinfo; + +diff --git a/criu/sk-unix.c b/criu/sk-unix.c +index 00d09cc..14e8adc 100644 +--- a/criu/sk-unix.c ++++ b/criu/sk-unix.c +@@ -72,6 +72,7 @@ struct unix_sk_desc { + char *name; + unsigned int nr_icons; + unsigned int *icons; ++ int repair_ino; + + unsigned int vfs_dev; + unsigned int vfs_ino; +@@ -92,6 +93,11 @@ struct unix_sk_desc { + UnixSkEntry *ue; + }; + ++struct unix_stream_extern_socket_desc { ++ struct list_head list; ++ int fd; ++}; ++ + /* + * The mutex_ghost is accessed from different tasks, + * so make sure it is in shared memory. 
+@@ -99,6 +105,7 @@ struct unix_sk_desc { + static mutex_t *mutex_ghost; + + static LIST_HEAD(unix_sockets); ++static LIST_HEAD(unix_stream_external_sockets); + static LIST_HEAD(unix_ghost_addr); + + static int unix_resolve_name(int lfd, uint32_t id, struct unix_sk_desc *d, +@@ -117,6 +124,26 @@ struct unix_sk_listen_icon { + + static struct unix_sk_listen_icon *unix_listen_icons[SK_HASH_SIZE]; + ++static int unix_stream_repair_on(int fd) ++{ ++ int ret, aux = 1; ++ ret = setsockopt(fd, SOL_TCP, TCP_REPAIR_OPTIONS, &aux, sizeof(aux)); ++ if (ret < 0) ++ pr_err("Can't turn repair mod for unix stream on. \n"); ++ ++ return ret; ++} ++ ++static int unix_stream_repair_off(int fd) ++{ ++ int ret, aux = 0; ++ ret = setsockopt(fd, SOL_TCP, TCP_REPAIR_OPTIONS, &aux, sizeof(aux)); ++ if (ret < 0) ++ pr_err("Can't turn repair mod for unix stream off. \n"); ++ ++ return ret; ++} ++ + static struct unix_sk_listen_icon *lookup_unix_listen_icons(unsigned int peer_ino) + { + struct unix_sk_listen_icon *ic; +@@ -338,6 +365,8 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) + FilePermsEntry *perms; + FownEntry *fown; + void *m; ++ unsigned int len; ++ int ret; + + m = xmalloc(sizeof(UnixSkEntry) + + sizeof(SkOptsEntry) + +@@ -431,6 +460,31 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) + goto err; + } + ++ if (peer->name && ue->type == SOCK_STREAM) { ++ struct unix_stream_extern_socket_desc *d; ++ ++ /* Attention: used for upgrade in the same machine ++ * May in conflict with original usage ++ */ ++ pr_info("set %d unix stream repair on \n", lfd); ++ ret = unix_stream_repair_on(lfd); ++ if (ret < 0) ++ goto err; ++ ++ d = xzalloc(sizeof(*d)); ++ if (!d) ++ goto err; ++ ++ d->fd = dup(lfd); ++ pr_info("add %d into unix_stream_external_sockets \n", d->fd); ++ list_add_tail(&d->list, &unix_stream_external_sockets); ++ ++ len = sizeof(ue->repair_ino); ++ ret = getsockopt(lfd, SOL_TCP, TCP_REPAIR_OPTIONS, &ue->repair_ino, 
&len); ++ if (ret < 0) ++ goto err; ++ } ++ + /* + * Peer should have us as peer or have a name by which + * we can access one. +@@ -810,16 +864,18 @@ static int __dump_external_socket(struct unix_sk_desc *sk, + return -1; + } + +- if (peer->type != SOCK_DGRAM) { +- show_one_unix("Ext stream not supported", peer); +- pr_err("Can't dump half of stream unix connection.\n"); ++ if (peer->type != SOCK_DGRAM && ++ peer->type != SOCK_STREAM) { ++ show_one_unix("Ext unix type not supported", peer); ++ pr_err("Can't dump this kind of unix connection.\n"); + return -1; + } + +- if (!peer->name) { ++ /* part 1: prevent NULL pointer oops */ ++ if (!peer->name && !sk->name) { + show_one_unix("Ext dgram w/o name", peer); ++ show_one_unix("Ext dgram w/o name", sk); + pr_err("Can't dump name-less external socket.\n"); +- pr_err("%d\n", sk->fd); + return -1; + } + +@@ -866,7 +922,7 @@ int fix_external_unix_sockets(void) + + fd_id_generate_special(NULL, &e.id); + e.ino = sk->sd.ino; +- e.type = SOCK_DGRAM; ++ e.type = sk->type; + e.state = TCP_LISTEN; + e.name.data = (void *)sk->name; + e.name.len = (size_t)sk->namelen; +@@ -893,6 +949,19 @@ err: + return -1; + } + ++void unix_stream_unlock(int ret) ++{ ++ struct unix_stream_extern_socket_desc *d; ++ pr_debug("Unlocking unix stream sockets\n"); ++ list_for_each_entry(d, &unix_stream_external_sockets, list) { ++ if (ret) { ++ pr_debug("unlock fd %d \n", d->fd); ++ unix_stream_repair_off(d->fd); ++ } ++ close_safe(&d->fd); ++ } ++} ++ + struct unix_sk_info { + UnixSkEntry *ue; + struct list_head list; +@@ -1278,6 +1347,7 @@ static int post_open_standalone(struct file_desc *d, int fd) + struct unix_sk_info *peer; + struct sockaddr_un addr; + int cwd_fd = -1, root_fd = -1, ns_fd = -1; ++ int ret, value; + + ui = container_of(d, struct unix_sk_info, d); + BUG_ON((ui->flags & (USK_PAIR_MASTER | USK_PAIR_SLAVE)) || +@@ -1335,7 +1405,22 @@ static int post_open_standalone(struct file_desc *d, int fd) + * while we're connecting in sake of 
ghost sockets. + */ + mutex_lock(mutex_ghost); +- if (connect(fd, (struct sockaddr *)&addr, sizeof(addr.sun_family) + len) < 0) { ++ ++ /* we handle unix stream with external connections here */ ++ if (peer->name && ui->ue->type == SOCK_STREAM) { ++ value = ui->ue->repair_ino; ++ ret = setsockopt(fd, SOL_TCP, TCP_REPAIR, &value, sizeof(value)); ++ if (ret < 0) { ++ /* permit the unix sk resume successfully when the peer has been ++ * closed, just warn here */ ++ pr_warn("Can't repair %d socket\n", value); ++ } ++ ++ ret = unix_stream_repair_off(fd); ++ if (ret < 0) { ++ goto err_revert_and_exit; ++ } ++ } else if (connect(fd, (struct sockaddr *)&addr, sizeof(addr.sun_family) + len) < 0) { + pr_perror("Can't connect %d socket", ui->ue->ino); + goto err_revert_and_exit; + } +@@ -2037,8 +2122,10 @@ static int init_unix_sk_info(struct unix_sk_info *ui, UnixSkEntry *ue) + } + + ui->name = (void *)ue->name.data; +- } else +- ui->name = NULL; ++ } else { ++ /* part 2: prevent NULL pointer oops */ ++ ui->name = ""; ++ } + ui->name_dir = (void *)ue->name_dir; + + ui->flags = 0; +diff --git a/images/sk-unix.proto b/images/sk-unix.proto +index 2a3a7cc..610080a 100644 +--- a/images/sk-unix.proto ++++ b/images/sk-unix.proto +@@ -52,4 +52,5 @@ message unix_sk_entry { + optional uint32 ns_id = 16; + optional sint32 mnt_id = 17 [default = -1]; + /* Please, don't use field with number 18. 
*/ ++ required sint32 repair_ino = 19; + } +-- +2.34.0 + diff --git a/backport-0017--save-and-restore-sigev_notify_thread_id.patch b/backport-0017--save-and-restore-sigev_notify_thread_id.patch new file mode 100644 index 0000000000000000000000000000000000000000..999dd846af1d532c587c3ceae236018a5fd1c624 --- /dev/null +++ b/backport-0017--save-and-restore-sigev_notify_thread_id.patch @@ -0,0 +1,98 @@ +From 9ca791a197eda25501a35dbf9d490e4e9a3e58ea Mon Sep 17 00:00:00 2001 +From: Liu Chao +Date: Mon, 28 Jun 2021 08:17:26 +0000 +Subject: [PATCH 17/50] save and restore sigev_notify_thread_id + +When sigev_notify_thread_id is not set, get_pid will return a NULL +pointer and do_timer_create will return -EINVAL in the kernel. So criu +will fail to create the posix timer: + +(09.806760) pie: 41301: Error (criu/pie/restorer.c:1998): Can't restore posix timers -22 +(09.806824) pie: 41301: Error (criu/pie/restorer.c:2133): Restorer fail 41301 +(09.891880) Error (criu/cr-restore.c:2596): Restoring FAILED. + +Signed-off-by: Liu Chao +--- + criu/cr-restore.c | 1 + + criu/include/posix-timer.h | 1 + + criu/parasite-syscall.c | 1 + + criu/pie/restorer.c | 1 + + criu/proc_parse.c | 1 + + images/timer.proto | 1 + + 6 files changed, 6 insertions(+) + +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index ecebdfe..2ed61d0 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -2848,6 +2848,7 @@ static inline int decode_posix_timer(PosixTimerEntry *pte, + pt->spt.si_signo = pte->si_signo; + pt->spt.it_sigev_notify = pte->it_sigev_notify; + pt->spt.sival_ptr = decode_pointer(pte->sival_ptr); ++ pt->spt.sigev_notify_thread_id = pte->sigev_notify_thread_id; + pt->overrun = pte->overrun; + + return 0; +diff --git a/criu/include/posix-timer.h b/criu/include/posix-timer.h +index fa99d86..11b7618 100644 +--- a/criu/include/posix-timer.h ++++ b/criu/include/posix-timer.h +@@ -8,6 +8,7 @@ struct str_posix_timer { + int clock_id; + int si_signo; + int it_sigev_notify; ++ int
sigev_notify_thread_id; + void * sival_ptr; + }; + +diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c +index c7074c7..8d9e01b 100644 +--- a/criu/parasite-syscall.c ++++ b/criu/parasite-syscall.c +@@ -316,6 +316,7 @@ static void encode_posix_timer(struct posix_timer *v, + pte->si_signo = vp->spt.si_signo; + pte->it_sigev_notify = vp->spt.it_sigev_notify; + pte->sival_ptr = encode_pointer(vp->spt.sival_ptr); ++ pte->sigev_notify_thread_id = vp->spt.sigev_notify_thread_id; + + pte->overrun = v->overrun; + +diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c +index 0bd220a..5e06abb 100644 +--- a/criu/pie/restorer.c ++++ b/criu/pie/restorer.c +@@ -1224,6 +1224,7 @@ static int create_posix_timers(struct task_restore_args *args) + sev.sigev_notify = args->posix_timers[i].spt.it_sigev_notify; + sev.sigev_signo = args->posix_timers[i].spt.si_signo; + sev.sigev_value.sival_ptr = args->posix_timers[i].spt.sival_ptr; ++ sev._sigev_un._tid = args->posix_timers[i].spt.sigev_notify_thread_id; + + while (1) { + ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &next_id); +diff --git a/criu/proc_parse.c b/criu/proc_parse.c +index 32d84b3..c8a18cf 100644 +--- a/criu/proc_parse.c ++++ b/criu/proc_parse.c +@@ -2380,6 +2380,7 @@ int parse_posix_timers(pid_t pid, struct proc_posix_timers_stat *args) + + if ( tidpid[0] == 't') { + timer->spt.it_sigev_notify = SIGEV_THREAD_ID; ++ timer->spt.sigev_notify_thread_id = pid_t; + } else { + switch (sigpid[0]) { + case 's' : +diff --git a/images/timer.proto b/images/timer.proto +index a254a6f..41db460 100644 +--- a/images/timer.proto ++++ b/images/timer.proto +@@ -19,6 +19,7 @@ message posix_timer_entry { + required uint64 insec = 8; + required uint64 vsec = 9; + required uint64 vnsec = 10; ++ required int32 sigev_notify_thread_id = 11; + } + + message task_timers_entry { +-- +2.34.0 + diff --git a/backport-0018--sysvshm-add-dump-restore-sysv-shm-in-host-ipc-ns.patch 
b/backport-0018--sysvshm-add-dump-restore-sysv-shm-in-host-ipc-ns.patch new file mode 100644 index 0000000000000000000000000000000000000000..96f4eff76389e17e6454165aaee6ef7eaecceb9f --- /dev/null +++ b/backport-0018--sysvshm-add-dump-restore-sysv-shm-in-host-ipc-ns.patch @@ -0,0 +1,116 @@ +From d5571166b3ba038a8af527ff375183ba139a4a08 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Sat, 26 Jun 2021 15:18:15 +0800 +Subject: [PATCH 18/50] sysvshm: add dump/restore sysv-shm in host ipc ns + +In original criu design, SysVIPC memory segment, which belongs +to host ipcns, shouldn't be dumped because criu requires the +whole ipcns to be dumped. During the restoring ipcns, the new +shared memory will be created, and fill the original page data +in it. + +This patch makes the shared-memory in host ipcns restore possible. +Idea: + The SysVIPC memory won't disappear after the task exit. The basic +information can be got from `/proc/sysvipc/shm` as long as the +system doesn't reboot. Compared with restoring the whole ipcns, +the processes of the shared memory creating and page data filling +are ignored. 
+ +Reference: +- https://www.criu.org/What_cannot_be_checkpointed + +Signed-off-by: fu.lin +--- + criu/cr-dump.c | 9 ++++----- + criu/cr-restore.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 50 insertions(+), 5 deletions(-) + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index 2bbcef3..e76fe5a 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -444,12 +444,11 @@ static int dump_filemap(struct vma_area *vma_area, int fd) + + static int check_sysvipc_map_dump(pid_t pid, VmaEntry *vma) + { +- if (root_ns_mask & CLONE_NEWIPC) +- return 0; ++ if (!(root_ns_mask & CLONE_NEWIPC)) ++ pr_info("Task %d with SysVIPC shmem map @%"PRIx64" lives in host IPC ns\n", ++ pid, vma->start); + +- pr_err("Task %d with SysVIPC shmem map @%"PRIx64" doesn't live in IPC ns\n", +- pid, vma->start); +- return -1; ++ return 0; + } + + static int get_task_auxv(pid_t pid, MmEntry *mm) +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 2ed61d0..ed82524 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -1840,6 +1840,49 @@ static int create_children_and_session(void) + return 0; + } + ++static int prepare_rootns_sysv_shm(unsigned long clone_flags) ++{ ++ int retval = 0; ++ char *line = NULL; ++ size_t len = 0; ++ FILE *fp; ++ key_t key; ++ int shmid; ++ mode_t mode; ++ size_t size; ++ ++ /* This is completed by `prepare_namespace()` */ ++ if (!!(clone_flags & CLONE_NEWIPC)) ++ return 0; ++ ++ pr_info("Restoring SYSV shm in host namespace\n"); ++ ++ fp = fopen("/proc/sysvipc/shm", "r"); ++ if (fp == NULL) { ++ pr_err("Can't open '/proc/sysvipc/shm', errno(%d): %s\n", errno, strerror(errno)); ++ return -1; ++ } ++ ++#if BITS_PER_LONG <= 32 ++# define SIZE_SPEC "%10lu" ++#else ++# define SIZE_SPEC "%21lu" ++#endif ++ ++ while (getline(&line, &len, fp) != -1) { ++ if (sscanf(line, "%10d %10d %4o" SIZE_SPEC, &key, &shmid, &mode, &size) != 4) ++ continue; ++ ++ retval = collect_sysv_shmem(shmid, size); ++ if (retval != 0) ++ goto out; ++ } ++ ++out: ++ 
fclose(fp); ++ return retval; ++} ++ + static int restore_task_with_children(void *_arg) + { + struct cr_clone_arg *ca = _arg; +@@ -1947,6 +1990,9 @@ static int restore_task_with_children(void *_arg) + if (prepare_namespace(current, ca->clone_flags)) + goto err; + ++ if (prepare_rootns_sysv_shm(ca->clone_flags)) ++ goto err; ++ + if (restore_finish_ns_stage(CR_STATE_PREPARE_NAMESPACES, CR_STATE_FORKING) < 0) + goto err; + +-- +2.34.0 + diff --git a/backport-0019--add-netlink-repair-modes.patch b/backport-0019--add-netlink-repair-modes.patch new file mode 100644 index 0000000000000000000000000000000000000000..0755506eff936174e67281e6a9c5ef042fd0c2af --- /dev/null +++ b/backport-0019--add-netlink-repair-modes.patch @@ -0,0 +1,45 @@ +From 7a1ceeeade68ce31fa77026ea9d68a763c50f974 Mon Sep 17 00:00:00 2001 +From: Xiaoguang Li +Date: Mon, 29 Mar 2021 20:58:28 -0400 +Subject: [PATCH 19/50] add netlink repair modes + +--- + criu/sk-netlink.c | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + +diff --git a/criu/sk-netlink.c b/criu/sk-netlink.c +index 3b86a7d..6d8ab2d 100644 +--- a/criu/sk-netlink.c ++++ b/criu/sk-netlink.c +@@ -68,6 +68,17 @@ int netlink_receive_one(struct nlmsghdr *hdr, struct ns_id *ns, void *arg) + return sk_collect_one(m->ndiag_ino, PF_NETLINK, &sd->sd, ns); + } + ++static int netlink_repair_on(int fd) ++{ ++ int ret, aux = 1; ++ ++ ret = setsockopt(fd, SOL_NETLINK, TCP_REPAIR, &aux, sizeof(aux)); ++ if (ret < 0) ++ pr_err("Can't turn netlink repair mode ON, error: %d\n", ret); ++ ++ return ret; ++} ++ + static bool can_dump_netlink_sk(int lfd) + { + int ret; +@@ -90,6 +101,10 @@ static int dump_one_netlink_fd(int lfd, u32 id, const struct fd_parms *p) + if (IS_ERR(sk)) + goto err; + ++ if (netlink_repair_on(lfd) < 0) { ++ goto err; ++ } ++ + ne.id = id; + ne.ino = p->stat.st_ino; + +-- +2.34.0 + diff --git a/backport-0020--looser-file-mode-and-size-check.patch b/backport-0020--looser-file-mode-and-size-check.patch new file mode 100644 index 
0000000000000000000000000000000000000000..aaf0841868b7c978987f410808befe28a2bddac3 --- /dev/null +++ b/backport-0020--looser-file-mode-and-size-check.patch @@ -0,0 +1,81 @@ +From a8103e911dceb3c3cdfab49802f716993b83dcd9 Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Sat, 26 Jun 2021 11:41:18 +0800 +Subject: [PATCH 20/50] looser file mode and size check + +When the file mode and size larger than dump data, +make the restoring process run success. + +Signed-off-by: Jingxian He +--- + criu/config.c | 1 + + criu/crtools.c | 1 + + criu/files-reg.c | 8 +++++--- + criu/include/cr_options.h | 1 + + 4 files changed, 8 insertions(+), 3 deletions(-) + +diff --git a/criu/config.c b/criu/config.c +index 9c4d8ce..006753a 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -548,6 +548,7 @@ int parse_options(int argc, char **argv, bool *usage_error, + BOOL_OPT("with-fd-cred", &opts.with_fd_cred), + BOOL_OPT("dump-char-dev", &opts.dump_char_dev), + BOOL_OPT("mask-exit-notify", &opts.mask_exit_notify), ++ BOOL_OPT("weak-file-check", &opts.weak_file_check), + { }, + }; + +diff --git a/criu/crtools.c b/criu/crtools.c +index 8694ed0..239464a 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -452,6 +452,7 @@ usage: + " as checkout assisted by kernel.\n" + " --dump-char-dev Dump char dev files as normal file with repair cmd\n" + " --mask-exit-notify Mask task exit notify during dump and restore\n" ++" --weak-file-check Allow file size and mod larger than dumping value\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/files-reg.c b/criu/files-reg.c +index ba78c67..e6ae042 100644 +--- a/criu/files-reg.c ++++ b/criu/files-reg.c +@@ -2061,7 +2061,8 @@ static bool validate_file(const int fd, const struct stat *fd_status, + { + int result = 1; + +- if (rfi->rfe->has_size && (fd_status->st_size != rfi->rfe->size)) { ++ if (rfi->rfe->has_size && ((!opts.weak_file_check && fd_status->st_size != 
rfi->rfe->size) || ++ (fd_status->st_size < rfi->rfe->size))) { + pr_err("File %s has bad size %"PRIu64" (expect %"PRIu64")\n", + rfi->path, fd_status->st_size, rfi->rfe->size); + return false; +@@ -2176,8 +2177,9 @@ ext: + if (!validate_file(tmp, &st, rfi)) + return -1; + +- if (rfi->rfe->has_mode && (st.st_mode != rfi->rfe->mode)) { +- pr_err("File %s has bad mode 0%o (expect 0%o)\n", ++ if (rfi->rfe->has_mode && ((!opts.weak_file_check && st.st_mode != rfi->rfe->mode) || ++ (st.st_mode < rfi->rfe->mode))) { ++ pr_err("%d File %s has bad mode 0%o (expect 0%o)\n", opts.weak_file_check, + rfi->path, (int)st.st_mode, + rfi->rfe->mode); + return -1; +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 5b3ff86..fc7818c 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -182,6 +182,7 @@ struct cr_options { + int with_fd_cred; + int dump_char_dev; + int mask_exit_notify; ++ int weak_file_check; + }; + + extern struct cr_options opts; +-- +2.34.0 + diff --git a/backport-0021--ignore-special-page-dump.patch b/backport-0021--ignore-special-page-dump.patch new file mode 100644 index 0000000000000000000000000000000000000000..566e7df3338ffb359db3756e2ce451c630594663 --- /dev/null +++ b/backport-0021--ignore-special-page-dump.patch @@ -0,0 +1,84 @@ +From a44cfbb1c428c59114e57a187478639b733b1c2d Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Tue, 29 Jun 2021 11:01:38 +0800 +Subject: [PATCH 21/50] ignore special page dump + +The special page dump will cost too long time when +thread num is very large. And special page dump +is not useful at every time. +Provide Ignore method for special page dump. 
+ +Signed-off-by: Jingxian He +--- + criu/config.c | 1 + + criu/cr-dump.c | 2 +- + criu/crtools.c | 1 + + criu/include/cr_options.h | 1 + + criu/seize.c | 2 +- + 5 files changed, 5 insertions(+), 2 deletions(-) + +diff --git a/criu/config.c b/criu/config.c +index 006753a..90d3951 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -549,6 +549,7 @@ int parse_options(int argc, char **argv, bool *usage_error, + BOOL_OPT("dump-char-dev", &opts.dump_char_dev), + BOOL_OPT("mask-exit-notify", &opts.mask_exit_notify), + BOOL_OPT("weak-file-check", &opts.weak_file_check), ++ BOOL_OPT("ignore-special-dump", &opts.ignore_special_dump), + { }, + }; + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index e76fe5a..9627190 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -1780,7 +1780,7 @@ static int cr_dump_finish(int ret) + close_service_fd(CR_PROC_FD_OFF); + close_image_dir(); + +- if (ret == 0 && opts.pin_memory) { ++ if (ret == 0 && opts.pin_memory && !opts.ignore_special_dump) { + pr_info("start restore_task_special_pages\n"); + restore_task_special_pages(0); + } +diff --git a/criu/crtools.c b/criu/crtools.c +index 239464a..0a957b0 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -453,6 +453,7 @@ usage: + " --dump-char-dev Dump char dev files as normal file with repair cmd\n" + " --mask-exit-notify Mask task exit notify during dump and restore\n" + " --weak-file-check Allow file size and mod larger than dumping value\n" ++" --ignore-special-dump Ignore special task tid page dump\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index fc7818c..681b519 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -183,6 +183,7 @@ struct cr_options { + int dump_char_dev; + int mask_exit_notify; + int weak_file_check; ++ int ignore_special_dump; + }; + + extern struct cr_options opts; +diff --git a/criu/seize.c 
b/criu/seize.c +index 387681c..c615971 100644 +--- a/criu/seize.c ++++ b/criu/seize.c +@@ -682,7 +682,7 @@ static void unseize_task_and_threads(const struct pstree_item *item, int st) + if (item->pid->state == TASK_DEAD) + return; + +- if (opts.pin_memory) { ++ if (opts.pin_memory && !opts.ignore_special_dump) { + for (i = 0; i < item->nr_threads; i++) + dump_task_special_pages(item->threads[i].real); + } +-- +2.34.0 + diff --git a/backport-0022--add-O_REPAIR-flag-to-vma-fd.patch b/backport-0022--add-O_REPAIR-flag-to-vma-fd.patch new file mode 100644 index 0000000000000000000000000000000000000000..43775663a90e7f0c45b1d5145b15e6aeaec4f326 --- /dev/null +++ b/backport-0022--add-O_REPAIR-flag-to-vma-fd.patch @@ -0,0 +1,45 @@ +From a4e70d7e215adc8e61e26bcc67fea0ce80e6f0bd Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Thu, 24 Jun 2021 16:56:02 +0800 +Subject: [PATCH 22/50] add O_REPAIR flag to vma fd + +Add the O_REPAIR flag when opening the vma fd. + +Signed-off-by: Jingxian He +--- + criu/files-reg.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/criu/files-reg.c b/criu/files-reg.c +index e6ae042..6747a3a 100644 +--- a/criu/files-reg.c ++++ b/criu/files-reg.c +@@ -2334,6 +2334,7 @@ void filemap_ctx_fini(void) + } + } + ++#define O_REPAIR 040000000 + static int open_filemap(int pid, struct vma_area *vma) + { + u32 flags; +@@ -2346,13 +2347,15 @@ static int open_filemap(int pid, struct vma_area *vma) + */ + + BUG_ON((vma->vmfd == NULL) || !vma->e->has_fdflags); +- flags = vma->e->fdflags; ++ flags = vma->e->fdflags | O_REPAIR; + + if (ctx.flags != flags || ctx.desc != vma->vmfd) { + if (vma->e->status & VMA_AREA_MEMFD) + ret = memfd_open(vma->vmfd, &flags); +- else ++ else { ++ + ret = open_path(vma->vmfd, do_open_reg_noseek_flags, &flags); ++ } + if (ret < 0) + return ret; + +-- +2.34.0 + diff --git a/backport-0023--file-lock-add-repair-mode-to-dump-file-locks.patch b/backport-0023--file-lock-add-repair-mode-to-dump-file-locks.patch new
file mode 100644 index 0000000000000000000000000000000000000000..6ee16478fee18adfc998b0cf609ac69556ae22fb --- /dev/null +++ b/backport-0023--file-lock-add-repair-mode-to-dump-file-locks.patch @@ -0,0 +1,307 @@ +From 0c48ab55547b0298ce7dc22320b369044e57ea91 Mon Sep 17 00:00:00 2001 +From: Sang Yan +Date: Thu, 8 Jul 2021 14:12:42 +0800 +Subject: [PATCH 23/50] file-lock: add repair mode to dump file locks + +Add new options "--file-locks-repair" to enable repair mode +while dumping file locks. +Repair mode keeps locks locked while process were killed in +dumping operation. Then resume the locks from repair mode at +process resuming. + +Signed-off-by: Sang Yan +--- + criu/config.c | 1 + + criu/cr-dump.c | 8 ++++++ + criu/crtools.c | 1 + + criu/file-lock.c | 10 +++++++ + criu/include/cr_options.h | 1 + + criu/include/fcntl.h | 7 +++++ + criu/include/parasite-syscall.h | 2 ++ + criu/include/parasite.h | 10 +++++++ + criu/parasite-syscall.c | 33 ++++++++++++++++++++++ + criu/pie/parasite.c | 50 +++++++++++++++++++++++++++++++++ + 10 files changed, 123 insertions(+) + +diff --git a/criu/config.c b/criu/config.c +index 90d3951..9854d4c 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -550,6 +550,7 @@ int parse_options(int argc, char **argv, bool *usage_error, + BOOL_OPT("mask-exit-notify", &opts.mask_exit_notify), + BOOL_OPT("weak-file-check", &opts.weak_file_check), + BOOL_OPT("ignore-special-dump", &opts.ignore_special_dump), ++ BOOL_OPT("file-locks-repair", &opts.file_locks_repair), + { }, + }; + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index 9627190..9084847 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -1398,6 +1398,14 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + goto err_cure; + } + ++ if (opts.file_locks_repair) { ++ ret = parasite_dump_file_locks(parasite_ctl, pid); ++ if (ret) { ++ pr_err("Can't parasite dump file locks (pid: %d)\n", pid); ++ goto err_cure; ++ } ++ } ++ + ret = 
dump_task_core_all(parasite_ctl, item, &pps_buf, cr_imgset, &misc); + if (ret) { + pr_err("Dump core (pid: %d) failed with %d\n", pid, ret); +diff --git a/criu/crtools.c b/criu/crtools.c +index 0a957b0..bab5b1b 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -454,6 +454,7 @@ usage: + " --mask-exit-notify Mask task exit notify during dump and restore\n" + " --weak-file-check Allow file size and mod larger than dumping value\n" + " --ignore-special-dump Ignore special task tid page dump\n" ++" --file-locks-repair Use repair mode to dump and restore file locks\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/file-lock.c b/criu/file-lock.c +index 8be7589..44ecc92 100644 +--- a/criu/file-lock.c ++++ b/criu/file-lock.c +@@ -428,6 +428,8 @@ void discard_dup_locks_tail(pid_t pid, int fd) + list_for_each_entry_safe_reverse(fl, p, &file_lock_list, list) { + if (fl->owners_fd != fd || pid != fl->fl_holder) + break; ++ if (fl->fl_kind == FL_POSIX) ++ continue; + + list_del(&fl->list); + xfree(fl); +@@ -618,8 +620,12 @@ static int restore_file_lock(FileLockEntry *fle) + cmd = fle->type; + } else if (fle->type == F_RDLCK) { + cmd = LOCK_SH; ++ if (opts.file_locks_repair) ++ cmd = LOCK_REPAIR; + } else if (fle->type == F_WRLCK) { + cmd = LOCK_EX; ++ if (opts.file_locks_repair) ++ cmd = LOCK_REPAIR; + } else if (fle->type == F_UNLCK) { + cmd = LOCK_UN; + } else { +@@ -645,6 +651,10 @@ static int restore_file_lock(FileLockEntry *fle) + flk.l_pid = fle->pid; + flk.l_type = fle->type; + ++ if (opts.file_locks_repair) ++ if (fle->type == F_RDLCK || fle->type == F_WRLCK) ++ flk.l_type = F_REPAIR; ++ + pr_info("(posix)flag: %d, type: %d, pid: %d, fd: %d, " + "start: %8"PRIx64", len: %8"PRIx64"\n", + fle->flag, fle->type, fle->pid, fle->fd, +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 681b519..e227bcd 100644 +--- a/criu/include/cr_options.h ++++ 
b/criu/include/cr_options.h +@@ -184,6 +184,7 @@ struct cr_options { + int mask_exit_notify; + int weak_file_check; + int ignore_special_dump; ++ int file_locks_repair; + }; + + extern struct cr_options opts; +diff --git a/criu/include/fcntl.h b/criu/include/fcntl.h +index 0936337..65f8b36 100644 +--- a/criu/include/fcntl.h ++++ b/criu/include/fcntl.h +@@ -23,6 +23,13 @@ struct f_owner_ex { + #define F_SETCRED 18 + #endif + ++#ifndef F_NEED_REPAIR ++#define F_NEED_REPAIR 16 ++#define F_REPAIR 32 ++#define LOCK_NEED_REPAIR 256 /* REPAIRING lock */ ++#define LOCK_REPAIR 512 /* REPAIR lock */ ++#endif ++ + /* + * These things are required to compile on CentOS-6 + */ +diff --git a/criu/include/parasite-syscall.h b/criu/include/parasite-syscall.h +index c86a724..14e1f31 100644 +--- a/criu/include/parasite-syscall.h ++++ b/criu/include/parasite-syscall.h +@@ -54,4 +54,6 @@ extern int parasite_dump_cgroup(struct parasite_ctl *ctl, struct parasite_dump_c + + extern struct parasite_tty_args *parasite_dump_tty(struct parasite_ctl *ctl, int fd, int type); + ++extern int parasite_dump_file_locks(struct parasite_ctl *ctl, int pid); ++ + #endif /* __CR_PARASITE_SYSCALL_H__ */ +diff --git a/criu/include/parasite.h b/criu/include/parasite.h +index d957094..1c702f0 100644 +--- a/criu/include/parasite.h ++++ b/criu/include/parasite.h +@@ -35,6 +35,7 @@ enum { + PARASITE_CMD_CHECK_VDSO_MARK, + PARASITE_CMD_CHECK_AIOS, + PARASITE_CMD_DUMP_CGROUP, ++ PARASITE_CMD_DUMP_FILELOCKS, + + PARASITE_CMD_MAX, + }; +@@ -236,6 +237,15 @@ struct parasite_dump_cgroup_args { + char contents[1 << 12]; + }; + ++struct parasite_dump_filelocks_args { ++ short kind; ++ short type; ++ long start; ++ long len; ++ int pid; ++ int fd; ++}; ++ + #endif /* !__ASSEMBLY__ */ + + #endif /* __CR_PARASITE_H__ */ +diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c +index 8d9e01b..8fdb475 100644 +--- a/criu/parasite-syscall.c ++++ b/criu/parasite-syscall.c +@@ -32,6 +32,7 @@ + #include + #include 
"signal.h" + #include "sigframe.h" ++#include "file-lock.h" + + #include + #include +@@ -591,3 +592,35 @@ struct parasite_ctl *parasite_infect_seized(pid_t pid, struct pstree_item *item, + + return ctl; + } ++ ++int parasite_dump_file_locks(struct parasite_ctl *ctl, int pid) ++{ ++ struct parasite_dump_filelocks_args *args; ++ struct file_lock *fl; ++ int ret; ++ ++ args = compel_parasite_args(ctl, struct parasite_dump_filelocks_args); ++ ++ list_for_each_entry(fl, &file_lock_list, list) { ++ if (fl->real_owner != pid) ++ continue; ++ ++ args->pid = fl->real_owner; ++ args->fd = fl->owners_fd; ++ args->kind = fl->fl_kind; ++ args->type = fl->fl_ltype; ++ args->start = fl->start; ++ if (!strncmp(fl->end, "EOF", 3)) ++ args->len = 0; ++ else ++ args->len = (atoll(fl->end) + 1) - fl->start; ++ ++ ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_FILELOCKS, ctl); ++ if (ret < 0) { ++ pr_err("Parasite dump file lock failed! (pid: %d)\n", pid); ++ return ret; ++ } ++ } ++ ++ return 0; ++} +diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c +index d839783..635c3f8 100644 +--- a/criu/pie/parasite.c ++++ b/criu/pie/parasite.c +@@ -7,6 +7,8 @@ + #include + #include + #include ++#include ++#include + + #include "common/config.h" + #include "int.h" +@@ -20,6 +22,7 @@ + #include "criu-log.h" + #include "tty.h" + #include "aio.h" ++#include "file-lock.h" + + #include "asm/parasite.h" + #include "restorer.h" +@@ -677,6 +680,50 @@ static int parasite_dump_cgroup(struct parasite_dump_cgroup_args *args) + return 0; + } + ++static int set_filelocks_needrepair(struct parasite_dump_filelocks_args *args) ++{ ++ int ret; ++ ++ if (args->kind == FL_FLOCK) { ++ if (args->type == F_RDLCK || args->type == F_WRLCK) { ++ int cmd = LOCK_NEED_REPAIR; ++ ++ pr_info("Need Repair flock kind: %d, type: %d, cmd: %d, pid: %d, fd: %d\n", ++ args->kind, args->type, cmd, args->pid, args->fd); ++ ++ ret = sys_flock(args->fd, cmd); ++ if (ret < 0) { ++ pr_err("Can not set NEED_REPAIR flock!\n"); ++ return 
ret; ++ } ++ } ++ } else if (args->kind == FL_POSIX) { ++ if (args->type == F_RDLCK || args->type == F_WRLCK) { ++ struct flock flk; ++ memset(&flk, 0, sizeof(flk)); ++ ++ flk.l_whence = SEEK_SET; ++ flk.l_start = args->start; ++ flk.l_len = args->len; ++ flk.l_pid = args->pid; ++ flk.l_type = F_NEED_REPAIR; ++ ++ pr_info("Need Repair posix lock kind: %d, type: %d, cmd: %d, pid: %d, fd: %d, " ++ "start: %8"PRIx64", len: %8"PRIx64"\n", ++ args->kind, args->type, flk.l_type, args->pid, args->fd, ++ args->start, args->len); ++ ++ ret = sys_fcntl(args->fd, F_SETLKW, (long)&flk); ++ if (ret < 0) { ++ pr_err("Can not set NEED_REPAIR posix lock!\n"); ++ return ret; ++ } ++ } ++ } ++ ++ return 0; ++} ++ + void parasite_cleanup(void) + { + if (mprotect_args) { +@@ -729,6 +776,9 @@ int parasite_daemon_cmd(int cmd, void *args) + case PARASITE_CMD_DUMP_CGROUP: + ret = parasite_dump_cgroup(args); + break; ++ case PARASITE_CMD_DUMP_FILELOCKS: ++ ret = set_filelocks_needrepair(args); ++ break; + default: + pr_err("Unknown command in parasite daemon thread leader: %d\n", cmd); + ret = -1; +-- +2.34.0 + diff --git a/backport-0024--unlock-network-when-restore-fails.patch b/backport-0024--unlock-network-when-restore-fails.patch new file mode 100644 index 0000000000000000000000000000000000000000..b428d6bfc0b742e799be5c0fa15fee396bcdcc58 --- /dev/null +++ b/backport-0024--unlock-network-when-restore-fails.patch @@ -0,0 +1,59 @@ +From a0eb4b017d38f8026074b988576aea42f0be72ff Mon Sep 17 00:00:00 2001 +From: Liu Chao +Date: Fri, 9 Jul 2021 07:32:20 +0000 +Subject: [PATCH 24/50] unlock network when restore fails + +Signed-off-by: fu.lin +--- + criu/cr-restore.c | 14 ++++++++++++++ + 1 file changed, 14 insertions(+) + +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index ed82524..4fd29a5 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -111,6 +111,9 @@ + #endif + + struct pstree_item *current; ++#define NETWORK_COLLECTED 0x1 ++#define NETWORK_UNLOCK 0x2 ++static int 
network_status = 0; + + static int restore_task_with_children(void *); + static int sigreturn_restore(pid_t pid, struct task_restore_args *ta, unsigned long alen, CoreEntry *core); +@@ -247,6 +250,7 @@ static int crtools_prepare_shared(void) + /* Connections are unlocked from criu */ + if (!files_collected() && collect_image(&inet_sk_cinfo)) + return -1; ++ network_status |= NETWORK_COLLECTED; + + if (collect_binfmt_misc()) + return -1; +@@ -2496,6 +2500,7 @@ skip_ns_bouncing: + + /* Unlock network before disabling repair mode on sockets */ + network_unlock(); ++ network_status |= NETWORK_UNLOCK; + + /* + * Stop getting sigchld, after we resume the tasks they +@@ -2701,6 +2706,15 @@ int cr_restore_tasks(void) + + err: + cr_plugin_fini(CR_PLUGIN_STAGE__RESTORE, ret); ++ if (ret < 0) { ++ if ((network_status & NETWORK_COLLECTED) == 0) { ++ if (!files_collected() && collect_image(&inet_sk_cinfo)) ++ pr_err("collect inet sk cinfo fail"); ++ } ++ if ((network_status & NETWORK_UNLOCK) == 0) ++ network_unlock(); ++ } ++ + return ret; + } + +-- +2.34.0 + diff --git a/backport-0025--net-add-shared-socket-recover-method-for-criu.patch b/backport-0025--net-add-shared-socket-recover-method-for-criu.patch new file mode 100644 index 0000000000000000000000000000000000000000..0ce7efad877a741d4e6e5dfca03ee3e35b6236c1 --- /dev/null +++ b/backport-0025--net-add-shared-socket-recover-method-for-criu.patch @@ -0,0 +1,330 @@ +From dddf8282218f97b77e0342173621ae1897edb48d Mon Sep 17 00:00:00 2001 +From: Sang Yan +Date: Mon, 12 Jul 2021 16:14:45 +0800 +Subject: [PATCH 25/50] net: add shared socket recover method for criu + +When the socket file is shared with another process, +it will not be freed during dumping process. +We can repair the socket file by installing it to +the old fd number. + +Add new options: "--share-dst-ports" and "--share-src-ports" +for user to tell criu which socket ports are shared. 
+ +Signed-off-by: Jingxian He +--- + criu/config.c | 8 ++ + criu/crtools.c | 3 + + criu/files.c | 18 ++++- + criu/include/cr_options.h | 2 + + criu/include/files.h | 4 + + criu/include/net.h | 1 + + criu/include/sk-inet.h | 3 + + criu/sk-inet.c | 151 ++++++++++++++++++++++++++++++++++++++ + 8 files changed, 189 insertions(+), 1 deletion(-) + +diff --git a/criu/config.c b/criu/config.c +index 9854d4c..6beed04 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -541,6 +541,8 @@ int parse_options(int argc, char **argv, bool *usage_error, + { "cgroup-yard", required_argument, 0, 1096 }, + { "pre-dump-mode", required_argument, 0, 1097}, + { "file-validation", required_argument, 0, 1098 }, ++ { "share-dst-ports", required_argument, 0, 1099 }, ++ { "share-src-ports", required_argument, 0, 1100 }, + BOOL_OPT("with-cpu-affinity", &opts.with_cpu_affinity), + BOOL_OPT("pin-memory", &opts.pin_memory), + BOOL_OPT("use-fork-pid", &opts.use_fork_pid), +@@ -880,6 +882,12 @@ int parse_options(int argc, char **argv, bool *usage_error, + if (parse_file_validation_method(&opts, optarg)) + return 2; + break; ++ case 1099: ++ SET_CHAR_OPTS(share_dst_ports, optarg); ++ break; ++ case 1100: ++ SET_CHAR_OPTS(share_src_ports, optarg); ++ break; + case 'V': + pr_msg("Version: %s\n", CRIU_VERSION); + if (strcmp(CRIU_GITID, "0")) +diff --git a/criu/crtools.c b/criu/crtools.c +index bab5b1b..d16961a 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -107,6 +107,9 @@ int main(int argc, char *argv[], char *envp[]) + goto usage; + } + ++ if (parse_share_ports()) ++ goto usage; ++ + log_set_loglevel(opts.log_level); + + if (optind < argc && !strcmp(argv[optind], "swrk")) { +diff --git a/criu/files.c b/criu/files.c +index 34aa8be..0ebf26e 100644 +--- a/criu/files.c ++++ b/criu/files.c +@@ -719,6 +719,8 @@ int dump_my_file(int lfd, u32 *id, int *type) + return 0; + } + ++int dst_pid; ++ + int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, + struct parasite_drain_fd 
*dfds) + { +@@ -743,7 +745,7 @@ int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, + img = open_image(CR_FD_FDINFO, O_DUMP, item->ids->files_id); + if (!img) + goto err; +- ++ dst_pid = item->pid->real; + ret = 0; /* Don't fail if nr_fds == 0 */ + for (off = 0; ret == 0 && off < dfds->nr_fds; off += nr_fds) { + if (nr_fds + off > dfds->nr_fds) +@@ -1262,6 +1264,20 @@ static int open_fd(struct fdinfo_list_entry *fle) + goto out; + } + ++ if (d->ops->type == FD_TYPES__INETSK) { ++ if (check_need_repair(d)) { ++ ret = repair_share_socket(d->id); ++ if (!ret) { ++ new_fd = get_share_socket(); ++ pr_info("get share socket:%d\n", new_fd); ++ if (new_fd <= 0 || setup_and_serve_out(fle, new_fd) < 0) ++ return -1; ++ fle->stage = FLE_RESTORED; ++ return 0; ++ } ++ } ++ } ++ + /* + * Open method returns the following values: + * 0 -- restore is successfully finished; +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index e227bcd..361aebc 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -185,6 +185,8 @@ struct cr_options { + int weak_file_check; + int ignore_special_dump; + int file_locks_repair; ++ char *share_dst_ports; ++ char *share_src_ports; + }; + + extern struct cr_options opts; +diff --git a/criu/include/files.h b/criu/include/files.h +index b12d079..85ca617 100644 +--- a/criu/include/files.h ++++ b/criu/include/files.h +@@ -210,4 +210,8 @@ extern int open_transport_socket(void); + extern int set_fds_event(pid_t virt); + extern void wait_fds_event(void); + ++extern int repair_share_socket(int id); ++extern int check_need_repair(struct file_desc *d); ++extern int get_share_socket(void); ++ + #endif /* __CR_FILES_H__ */ +diff --git a/criu/include/net.h b/criu/include/net.h +index 0a556f3..795d5e8 100644 +--- a/criu/include/net.h ++++ b/criu/include/net.h +@@ -16,6 +16,7 @@ extern int dump_net_ns(struct ns_id *ns); + extern int prepare_net_namespaces(void); + extern void 
fini_net_namespaces(void); + extern int netns_keep_nsfd(void); ++extern int parse_share_ports(void); + + struct pstree_item; + extern int restore_task_net_ns(struct pstree_item *current); +diff --git a/criu/include/sk-inet.h b/criu/include/sk-inet.h +index dec67ca..2e28444 100644 +--- a/criu/include/sk-inet.h ++++ b/criu/include/sk-inet.h +@@ -102,4 +102,7 @@ union libsoccr_addr; + int restore_sockaddr(union libsoccr_addr *sa, + int family, u32 pb_port, u32 *pb_addr, u32 ifindex); + ++#define MAX_SHARE_PORT_NUM 64 ++extern int dst_pid; ++ + #endif /* __CR_SK_INET_H__ */ +diff --git a/criu/sk-inet.c b/criu/sk-inet.c +index 7a05de2..d29f03b 100644 +--- a/criu/sk-inet.c ++++ b/criu/sk-inet.c +@@ -449,6 +449,152 @@ static bool needs_scope_id(uint32_t *src_addr) + return false; + } + ++#define ADD_SHARE_SOCKET_PATH "/sys/kernel/add_share_socket" ++#define REPAIR_SHARE_SOCKET_PATH "/sys/kernel/repair_share_socket" ++#define SHARE_SOCKET_PATH "/sys/kernel/share_socket" ++ ++int add_share_socket(u32 id, int fd, int pid, int port) ++{ ++ int retval; ++ char buf[256] = {0}; ++ ++ retval = snprintf(buf, 256, "%u,%d,%d,%d", id, fd, pid, port); ++ if (retval <= 0) ++ return -EFAULT; ++ ++ fd = open(ADD_SHARE_SOCKET_PATH, O_WRONLY, 0); ++ if (fd < 0) { ++ pr_err("open file:%s fail\n", ADD_SHARE_SOCKET_PATH); ++ return fd; ++ } ++ ++ retval = write(fd, buf, strlen(buf)); ++ close(fd); ++ return retval < 0 ? -1 : 0; ++} ++ ++ ++int repair_share_socket(int id) ++{ ++ int retval, fd; ++ char buf[256] = {0}; ++ ++ retval = snprintf(buf, 256, "%u", id); ++ if (retval <= 0) ++ return -EFAULT; ++ ++ fd = open(REPAIR_SHARE_SOCKET_PATH, O_WRONLY, 0); ++ if (fd < 0) { ++ pr_err("open file:%s fail\n", REPAIR_SHARE_SOCKET_PATH); ++ return fd; ++ } ++ retval = write(fd, buf, strlen(buf)); ++ ++ close(fd); ++ return retval < 0 ? 
-1 : 0; ++} ++ ++int get_share_socket(void) ++{ ++ int fd; ++ ssize_t count; ++ int retval = -1; ++ char buf[32] = {0}; ++ ++ fd = open(SHARE_SOCKET_PATH, O_RDONLY, 0); ++ if (fd < 0) { ++ pr_err("open file:%s fail\n", SHARE_SOCKET_PATH); ++ return fd; ++ } ++ ++ count = read(fd, buf, sizeof(buf)); ++ if (count > 0) ++ retval = atoi(buf); ++ ++ close(fd); ++ return retval; ++} ++ ++int g_share_dst_ports[MAX_SHARE_PORT_NUM]; ++int g_share_dst_port_num; ++int g_share_src_ports[MAX_SHARE_PORT_NUM]; ++int g_share_src_port_num; ++ ++int parse_share_ports(void) ++{ ++ char *save, *p; ++ ++ if (opts.share_dst_ports) { ++ p = strtok_r(opts.share_dst_ports, ",", &save); ++ while (p != NULL) { ++ if (g_share_dst_port_num >= MAX_SHARE_PORT_NUM) ++ return -1; ++ g_share_dst_ports[g_share_dst_port_num] = atoi(p); ++ if (!g_share_dst_ports[g_share_dst_port_num]) ++ return -1; ++ g_share_dst_port_num++; ++ p = strtok_r(NULL, ",", &save); ++ } ++ } ++ ++ if (opts.share_src_ports) { ++ p = strtok_r(opts.share_src_ports, ",", &save); ++ while (p != NULL) { ++ if (g_share_src_port_num >= MAX_SHARE_PORT_NUM) ++ return -1; ++ g_share_src_ports[g_share_src_port_num] = atoi(p); ++ if (!g_share_src_ports[g_share_src_port_num]) ++ return -1; ++ g_share_src_port_num++; ++ p = strtok_r(NULL, ",", &save); ++ } ++ } ++ return 0; ++} ++ ++int check_share_dst_port(int dst_port) ++{ ++ int i; ++ int ret = 0; ++ ++ for (i = 0; i < g_share_dst_port_num; i++) { ++ if (dst_port == g_share_dst_ports[i]) { ++ ret = 1; ++ break; ++ } ++ } ++ return ret; ++} ++ ++int check_share_src_port(int src_port) ++{ ++ int i; ++ int ret = 0; ++ ++ for (i = 0; i < g_share_src_port_num; i++) { ++ if (src_port == g_share_src_ports[i]) { ++ ret = 1; ++ break; ++ } ++ } ++ ++ return ret; ++} ++ ++int check_need_repair(struct file_desc *d) ++{ ++ struct inet_sk_info *ii; ++ InetSkEntry *ie; ++ ++ ii = container_of(d, struct inet_sk_info, d); ++ ie = ii->ie; ++ if (check_share_dst_port(ie->dst_port) || ++ 
check_share_src_port(ie->src_port)) ++ return 1; ++ else ++ return 0; ++} ++ + static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int family) + { + struct inet_sk_desc *sk; +@@ -507,6 +653,11 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa + + BUG_ON(sk->sd.already_dumped); + ++ if (check_share_dst_port(sk->dst_port) || check_share_src_port(sk->src_port)) { ++ pr_info("Start add share prot:%d src %d\n", sk->dst_port, sk->src_port); ++ add_share_socket(id, lfd, dst_pid, sk->src_port); ++ } ++ + ie.id = id; + ie.ino = sk->sd.ino; + if (sk->sd.sk_ns) { +-- +2.34.0 + diff --git a/backport-0026--clean-repair-res-when-dump-fail.patch b/backport-0026--clean-repair-res-when-dump-fail.patch new file mode 100644 index 0000000000000000000000000000000000000000..0d02a7f866fd5092f44af5773416c01a7c6f09bf --- /dev/null +++ b/backport-0026--clean-repair-res-when-dump-fail.patch @@ -0,0 +1,130 @@ +From 2a849636a14eebe120e682afccde7b76d1f4fed8 Mon Sep 17 00:00:00 2001 +From: Sang Yan +Date: Mon, 19 Jul 2021 14:43:10 +0800 +Subject: [PATCH 26/50] clean repair res when dump fail + +Clean pin mem and netlink repair res when dump fail. 
+ +Signed-off-by: Jingxian He +--- + criu/cr-dump.c | 22 ++++++++++++++++++++++ + criu/include/net.h | 1 + + criu/sk-netlink.c | 40 +++++++++++++++++++++++++++++++++++----- + 3 files changed, 58 insertions(+), 5 deletions(-) + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index 9084847..9e899d0 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -83,6 +83,7 @@ + #include "memfd.h" + #include "timens.h" + #include "img-streamer.h" ++#include "restorer.h" + + /* + * Architectures can overwrite this function to restore register sets that +@@ -1706,6 +1707,23 @@ static int cr_lazy_mem_dump(void) + return ret; + } + ++int clear_pin_mem(int pid) ++{ ++ int fd, ret; ++ ++ fd = open(PIN_MEM_FILE, O_RDWR, 0); ++ if (fd < 0) { ++ pr_warn("error open file: %s\n", PIN_MEM_FILE); ++ return -1; ++ } ++ ret = ioctl(fd, CLEAR_PIN_MEM_AREA, (unsigned long) &pid); ++ if (ret < 0) { ++ pr_warn("clear pin mem fail: %d\n", pid); ++ } ++ close(fd); ++ return ret; ++} ++ + static enum notifier_state notifier_state = NOTHING_COMPLETE; + + static int cr_dump_finish(int ret) +@@ -1794,6 +1812,10 @@ static int cr_dump_finish(int ret) + } + + if (ret != 0 && opts.with_notifier_kup) { ++ pr_info("repair off netlink fd\n"); ++ netlink_repair_off(); ++ pr_info("clear pin mem info\n"); ++ clear_pin_mem(0); + pr_info("call notifier rollback\n"); + switch (notifier_state) { + case PRE_FREEZE_COMPLETE: +diff --git a/criu/include/net.h b/criu/include/net.h +index 795d5e8..bda0ff3 100644 +--- a/criu/include/net.h ++++ b/criu/include/net.h +@@ -54,5 +54,6 @@ extern void check_has_netns_ioc(int fd, bool *kdat_val, const char *name); + extern int net_set_ext(struct ns_id *ns); + extern struct ns_id *get_root_netns(void); + extern int read_net_ns_img(void); ++extern int netlink_repair_off(void); + + #endif /* __CR_NET_H__ */ +diff --git a/criu/sk-netlink.c b/criu/sk-netlink.c +index 6d8ab2d..a6c56ff 100644 +--- a/criu/sk-netlink.c ++++ b/criu/sk-netlink.c +@@ -68,15 +68,45 @@ int 
netlink_receive_one(struct nlmsghdr *hdr, struct ns_id *ns, void *arg) + return sk_collect_one(m->ndiag_ino, PF_NETLINK, &sd->sd, ns); + } + +struct netlink_repair_fd { + int netlink_fd; + struct list_head nlist; +}; + +static LIST_HEAD(netlink_repair_fds); + + static int netlink_repair_on(int fd) + { +- int ret, aux = 1; ++ int ret, aux = 1; ++ struct netlink_repair_fd *nrf; + +- ret = setsockopt(fd, SOL_NETLINK, TCP_REPAIR, &aux, sizeof(aux)); +- if (ret < 0) +- pr_err("Can't turn netlink repair mode ON, error: %d\n", ret); ++ ret = setsockopt(fd, SOL_NETLINK, TCP_REPAIR, &aux, sizeof(aux)); ++ if (ret < 0) { ++ pr_err("Can't turn netlink repair mode ON, error: %d\n", ret); ++ return ret; ++ } ++ nrf = malloc(sizeof(*nrf)); ++ if (!nrf) ++ return -ENOMEM; ++ nrf->netlink_fd = dup(fd); ++ list_add_tail(&nrf->nlist, &netlink_repair_fds); ++ return ret; ++} + +- return ret; ++int netlink_repair_off(void) ++{ ++ int aux = 0, ret; ++ struct netlink_repair_fd *nrf, *n; ++ ++ list_for_each_entry_safe(nrf, n, &netlink_repair_fds, nlist) { ++ ret = setsockopt(nrf->netlink_fd, SOL_NETLINK, TCP_REPAIR, &aux, sizeof(aux)); ++ if (ret < 0) ++ pr_err("Failed to turn off repair mode on netlink\n"); ++ close(nrf->netlink_fd); ++ list_del(&nrf->nlist); ++ free(nrf); ++ } ++ return 0; + } + + static bool can_dump_netlink_sk(int lfd) +-- +2.34.0 + diff --git a/backport-0027--save-src-ports-to-ip_local_reserved_ports-when-dump-.patch b/backport-0027--save-src-ports-to-ip_local_reserved_ports-when-dump-.patch new file mode 100644 index 0000000000000000000000000000000000000000..4fb33ecca9a4ddb4aaf32da0f553a517460f852c --- /dev/null +++ b/backport-0027--save-src-ports-to-ip_local_reserved_ports-when-dump-.patch @@ -0,0 +1,248 @@ +From 0b93b3c8707b27db9284021d2e04e90cbf9543ca Mon Sep 17 00:00:00 2001 +From: Liu Chao +Date: Mon, 19 Jul 2021 03:19:30 +0000 +Subject: [PATCH 27/50] save src ports to ip_local_reserved_ports when dump + tasks and restore it when restore tasks + +--- +
criu/config.c | 8 +++- + criu/cr-dump.c | 3 ++ + criu/crtools.c | 1 + + criu/include/cr_options.h | 1 + + criu/include/sk-inet.h | 4 ++ + criu/include/util.h | 2 + + criu/net.c | 6 ++- + criu/sk-tcp.c | 85 +++++++++++++++++++++++++++++++++++++++ + 8 files changed, 108 insertions(+), 2 deletions(-) + +diff --git a/criu/config.c b/criu/config.c +index 6beed04..9268cd1 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -461,7 +461,7 @@ int parse_options(int argc, char **argv, bool *usage_error, + {OPT_NAME, no_argument, SAVE_TO, true},\ + {"no-" OPT_NAME, no_argument, SAVE_TO, false} + +- static const char short_opts[] = "dSsRt:hD:o:v::x::Vr:jJ:lW:L:M:"; ++ static const char short_opts[] = "dSsRt:hD:o:v::x::Vr:jJ:lW:L:M:P:"; + static struct option long_opts[] = { + { "tree", required_argument, 0, 't' }, + { "leave-stopped", no_argument, 0, 's' }, +@@ -553,6 +553,7 @@ int parse_options(int argc, char **argv, bool *usage_error, + BOOL_OPT("weak-file-check", &opts.weak_file_check), + BOOL_OPT("ignore-special-dump", &opts.ignore_special_dump), + BOOL_OPT("file-locks-repair", &opts.file_locks_repair), ++ {"reserve-ports", required_argument, 0, 'P' }, + { }, + }; + +@@ -896,6 +897,11 @@ int parse_options(int argc, char **argv, bool *usage_error, + case 'h': + *usage_error = false; + return 2; ++ case 'P': ++ opts.reserve_ports = atoi(optarg); ++ if (opts.reserve_ports < 0) ++ goto bad_arg; ++ break; + default: + return 2; + } +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index 9e899d0..695c98a 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -1955,6 +1955,9 @@ int cr_dump_tasks(pid_t pid) + goto err; + } + ++ if (opts.reserve_ports > 0) ++ set_reserved_ports(); ++ + if (parent_ie) { + inventory_entry__free_unpacked(parent_ie, NULL); + parent_ie = NULL; +diff --git a/criu/crtools.c b/criu/crtools.c +index d16961a..888714c 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -458,6 +458,7 @@ usage: + " --weak-file-check Allow file size and mod larger than dumping 
value\n" + " --ignore-special-dump Ignore special task tid page dump\n" + " --file-locks-repair Use repair mode to dump and restore file locks\n" ++" --reserve-ports Reserve src ports in kernel\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 361aebc..85d852f 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -187,6 +187,7 @@ struct cr_options { + int file_locks_repair; + char *share_dst_ports; + char *share_src_ports; ++ int reserve_ports; + }; + + extern struct cr_options opts; +diff --git a/criu/include/sk-inet.h b/criu/include/sk-inet.h +index 2e28444..4181fbe 100644 +--- a/criu/include/sk-inet.h ++++ b/criu/include/sk-inet.h +@@ -83,6 +83,10 @@ extern void tcp_locked_conn_add(struct inet_sk_info *); + extern void rst_unlock_tcp_connections(void); + extern void cpt_unlock_tcp_connections(void); + ++extern void read_reserved_ports(char *path); ++extern void write_reserved_ports(char *path); ++extern void set_reserved_ports(void); ++ + extern int dump_one_tcp(int sk, struct inet_sk_desc *sd, SkOptsEntry *soe); + extern int restore_one_tcp(int sk, struct inet_sk_info *si); + +diff --git a/criu/include/util.h b/criu/include/util.h +index 3a4b8f9..d1510fc 100644 +--- a/criu/include/util.h ++++ b/criu/include/util.h +@@ -430,4 +430,6 @@ bool find_devname(const char *name); + #define UNMASK_EXIT_NOTIFY_DIR "/sys/kernel/unmask_exit_notify" + int mask_task_exit_notify(int pid, bool mask); + ++#define RESERVED_PORTS_PATH "/proc/sys/net/ipv4/ip_local_reserved_ports" ++ + #endif /* __CR_UTIL_H__ */ +diff --git a/criu/net.c b/criu/net.c +index 4f1f7d4..19329cf 100644 +--- a/criu/net.c ++++ b/criu/net.c +@@ -2897,7 +2897,6 @@ static int network_unlock_internal(void) + if (switch_ns(root_item->pid->real, &net_ns_desc, &nsret)) + return -1; + +- + ret |= iptables_restore(false, conf, sizeof(conf) - 1); + if 
(kdat.ipv6) + ret |= iptables_restore(true, conf, sizeof(conf) - 1); +@@ -2926,6 +2925,11 @@ void network_unlock(void) + { + pr_info("Unlock network\n"); + ++ if (opts.reserve_ports) { ++ read_reserved_ports("ip_local_reserved_ports"); ++ write_reserved_ports(RESERVED_PORTS_PATH); ++ } ++ + cpt_unlock_tcp_connections(); + rst_unlock_tcp_connections(); + +diff --git a/criu/sk-tcp.c b/criu/sk-tcp.c +index 0409e22..67846c3 100644 +--- a/criu/sk-tcp.c ++++ b/criu/sk-tcp.c +@@ -23,6 +23,7 @@ + #include "kerndat.h" + #include "restorer.h" + #include "rst-malloc.h" ++#include "xmalloc.h" + + #include "protobuf.h" + #include "images/tcp-stream.pb-c.h" +@@ -33,6 +34,9 @@ + static LIST_HEAD(cpt_tcp_repair_sockets); + static LIST_HEAD(rst_tcp_repair_sockets); + ++static char* reserved_ports; ++static int reserved_ports_num; ++ + static int tcp_repair_established(int fd, struct inet_sk_desc *sk) + { + int ret; +@@ -475,3 +479,84 @@ void rst_unlock_tcp_connections(void) + list_for_each_entry(ii, &rst_tcp_repair_sockets, rlist) + nf_unlock_connection_info(ii); + } ++ ++void read_reserved_ports(char *path) ++{ ++ FILE *file = NULL; ++ char *ch = NULL; ++ size_t size = 0; ++ ++ if (reserved_ports) { ++ free(reserved_ports); ++ reserved_ports = NULL; ++ } ++ ++ file = fopen(path, "r"); ++ if (!file) { ++ pr_err("Cannot fopen %s\n", path); ++ return; ++ } ++ ++ if (getline(&reserved_ports, &size, file) <= 0) ++ pr_err("Cannot getline from %s\n", path); ++ fclose(file); ++ ++ if (!reserved_ports) ++ return; ++ ++ ch = strstr(reserved_ports, "\n"); ++ if (ch) ++ *ch = '\0'; ++} ++ ++void write_reserved_ports(char *path) ++{ ++ int fd = -1; ++ char buf[PATH_MAX]; ++ ++ fd = open(path, O_RDWR | O_CREAT, 0640); ++ if (fd < 0) { ++ pr_err("Cannot open %s ret %d cwd: %s\n", path, fd, buf); ++ return; ++ } ++ ++ cr_system(-1, fd, -1, "/usr/bin/echo", ++ (char *[]) { "echo", reserved_ports, NULL}, 0); ++ close(fd); ++} ++ ++static int add_reserved_ports(struct inet_sk_desc *sk) ++{ ++ if 
(reserved_ports_num >= opts.reserve_ports) ++ return -1; ++ ++ if (strlen(reserved_ports) == 0) ++ snprintf(reserved_ports, 6, "%u", sk->src_port); ++ else ++ snprintf(reserved_ports + strlen(reserved_ports), 7, ",%u", sk->src_port); ++ reserved_ports_num++; ++ ++ return 0; ++} ++ ++void set_reserved_ports(void) ++{ ++ struct inet_sk_desc *sk = NULL; ++ size_t size = 0; ++ ++ read_reserved_ports(RESERVED_PORTS_PATH); ++ ++ write_reserved_ports("ip_local_reserved_ports"); ++ ++ size = strlen(reserved_ports) + 6 * opts.reserve_ports + 1; ++ if (xrealloc_safe(&reserved_ports, size)) ++ exit(1); ++ ++ list_for_each_entry(sk, &cpt_tcp_repair_sockets, rlist) ++ add_reserved_ports(sk); ++ ++ write_reserved_ports(RESERVED_PORTS_PATH); ++ ++ free(reserved_ports); ++ reserved_ports = NULL; ++} +-- +2.34.0 + diff --git a/backport-0028--fix-dump-fail-problem-with-null-seek-op.patch b/backport-0028--fix-dump-fail-problem-with-null-seek-op.patch new file mode 100644 index 0000000000000000000000000000000000000000..25cce42e8dfee39029f31f7ff23a4bd50622d009 --- /dev/null +++ b/backport-0028--fix-dump-fail-problem-with-null-seek-op.patch @@ -0,0 +1,35 @@ +From 3f5b884acd7bd9f0ec964af264c6062b69602488 Mon Sep 17 00:00:00 2001 +From: Zhuling +Date: Thu, 22 Jul 2021 10:15:15 +0800 +Subject: [PATCH 28/50] fix dump fail problem with null seek op + +Fix file dumping fail problem when the file seek op is null. 
+ +Signed-off-by: Jingxian He +--- + criu/files-reg.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +diff --git a/criu/files-reg.c b/criu/files-reg.c +index 6747a3a..6bbcbee 100644 +--- a/criu/files-reg.c ++++ b/criu/files-reg.c +@@ -2247,9 +2247,12 @@ static int do_open_reg(int ns_root_fd, struct reg_file_info *rfi, void *arg) + if (!(rfi->rfe->flags & O_PATH)) { + if (rfi->rfe->pos != -1ULL && + lseek(fd, rfi->rfe->pos, SEEK_SET) < 0) { +- pr_perror("Can't restore file pos"); +- close(fd); +- return -1; ++ pr_info("No ability to restore file pos"); ++ if (errno != ESPIPE) { ++ pr_perror("Can't restore file pos"); ++ close(fd); ++ return -1; ++ } + } + } + +-- +2.34.0 + diff --git a/backport-0029--fix-dump-fail-problem-with-no-access-to-get-socket-f.patch b/backport-0029--fix-dump-fail-problem-with-no-access-to-get-socket-f.patch new file mode 100644 index 0000000000000000000000000000000000000000..5691ec9a252c717c569736d1c14524c8c291fb9e --- /dev/null +++ b/backport-0029--fix-dump-fail-problem-with-no-access-to-get-socket-f.patch @@ -0,0 +1,31 @@ +From e2ce2f672f3e96e0011d853a1a1b23c83df6f620 Mon Sep 17 00:00:00 2001 +From: Zhuling +Date: Sat, 24 Jul 2021 16:37:17 +0800 +Subject: [PATCH 29/50] fix dump fail problem with no access to get socket + filter + +Fix socket dumping fail problem when user space has no access to getting socket filter. 
+ +Signed-off-by: Jingxian He +--- + criu/sockets.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/criu/sockets.c b/criu/sockets.c +index 609bfb1..0c0b0e0 100644 +--- a/criu/sockets.c ++++ b/criu/sockets.c +@@ -372,7 +372,9 @@ static int dump_socket_filter(int sk, SkOptsEntry *soe) + + ret = getsockopt(sk, SOL_SOCKET, SO_GET_FILTER, NULL, &len); + if (ret) { +- pr_perror("Can't get socket filter len"); ++ pr_warn("Can't get socket filter len"); ++ if (errno == EACCES) ++ return 0; + return ret; + } + +-- +2.34.0 + diff --git a/backport-0030--proc-parse-fix-vma-offset-value-for-the-sysfs-file-o.patch b/backport-0030--proc-parse-fix-vma-offset-value-for-the-sysfs-file-o.patch new file mode 100644 index 0000000000000000000000000000000000000000..5c2c31ee3ee58df7ed349791fe32e081f6d78c74 --- /dev/null +++ b/backport-0030--proc-parse-fix-vma-offset-value-for-the-sysfs-file-o.patch @@ -0,0 +1,131 @@ +From 94404049aa5ab7fbda3b81fea7e4b7de42507c3b Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Tue, 27 Jul 2021 11:40:34 +0800 +Subject: [PATCH 30/50] proc parse: fix vma offset value for the sysfs file of + pci devices + +Some pci devices create bin sysfs file which permit to use `mmap()` +syscall, the 6th parameter `offset` is always 0 when those kinds of +files create file mapping. The value of `offset` will be assigned to +`vma->vm_pgoff` in kernel. However, it will be changed to pci address +automatically during mmap callback function `pci_mmap_resource_range()`, +and the offset in `/proc/<pid>/maps` will show non-zero. This makes +criu restore fail. + +There are many of those files. Just retry the mmap action.
+
+Signed-off-by: He Jingxian
+Signed-off-by: fu.lin
+---
+ criu/include/image.h |  1 +
+ criu/pie/restorer.c  | 16 +++++++++++++---
+ criu/proc_parse.c    | 32 ++++++++++++++++++++++++++++++++
+ 3 files changed, 46 insertions(+), 3 deletions(-)
+
+diff --git a/criu/include/image.h b/criu/include/image.h
+index 70f17a5..c929fd0 100644
+--- a/criu/include/image.h
++++ b/criu/include/image.h
+@@ -86,6 +86,7 @@
+ #define VMA_AREA_MEMFD		(1 << 14)
+ #define VMA_AREA_ANON_INODE	(1 << 15)
+ #define VMA_AREA_CHR		(1 << 16)
++#define VMA_AREA_DEV_SHARE	(1 << 17)
+ 
+ #define VMA_CLOSE		(1 << 28)
+ #define VMA_NO_PROT_WRITE	(1 << 29)
+diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
+index 5e06abb..d87236d 100644
+--- a/criu/pie/restorer.c
++++ b/criu/pie/restorer.c
+@@ -871,9 +871,9 @@ static unsigned long restore_mapping(VmaEntry *vma_entry)
+ 	 * that mechanism as it causes the process to be charged for memory
+ 	 * immediately upon mmap, not later upon preadv().
+ 	 */
+-	pr_debug("\tmmap(%"PRIx64" -> %"PRIx64", %x %x %d)\n",
++	pr_debug("\tmmap(%"PRIx64" -> %"PRIx64", %x %x %d %"PRIx64")\n",
+ 			vma_entry->start, vma_entry->end,
+-			prot, flags, (int)vma_entry->fd);
++			prot, flags, (int)vma_entry->fd, vma_entry->pgoff); /* assumes pgoff is 64-bit like start/end - TODO confirm */
+ 	/*
+ 	 * Should map memory here.
Note we map them as
+ 	 * writable since we're going to restore page
+@@ -885,6 +885,15 @@ static unsigned long restore_mapping(VmaEntry *vma_entry)
+ 			vma_entry->fd,
+ 			vma_entry->pgoff);
+ 
++	if (addr == -EINVAL) {
++		pr_info("need try mmap with offset 0\n");
++		addr = sys_mmap(decode_pointer(vma_entry->start),
++				vma_entry_len(vma_entry),
++				prot, flags,
++				vma_entry->fd,
++				0);
++	}
++
+ 	if ((vma_entry->fd != -1) &&
+ 	    (vma_entry->status & VMA_CLOSE))
+ 		sys_close(vma_entry->fd);
+@@ -1883,7 +1892,8 @@ long __export_restore_task(struct task_restore_args *args)
+ 		if (!vma_entry->has_madv || !vma_entry->madv)
+ 			continue;
+ 
+-		if (vma_entry_is(vma_entry, VMA_AREA_ANON_INODE))
++		if (vma_entry_is(vma_entry, VMA_AREA_ANON_INODE) ||
++		    vma_entry_is(vma_entry, VMA_AREA_DEV_SHARE))
+ 			continue;
+ 
+ 		for (m = 0; m < sizeof(vma_entry->madv) * 8; m++) {
+diff --git a/criu/proc_parse.c b/criu/proc_parse.c
+index c8a18cf..2c7b926 100644
+--- a/criu/proc_parse.c
++++ b/criu/proc_parse.c
+@@ -565,6 +565,35 @@ static inline int handle_vvar_vma(struct vma_area *vma)
+ 	return 0;
+ }
+ 
++static bool is_sysfs_resource(const char *path) /* true for ".../devices/.../resource[N][_wc]" */
++{
++	const char *sub = NULL;
++	const char *prefix = "resource";
++	const char *suffix = "_wc";
++
++	if (strstr(path, "devices/") == NULL)
++		return false;
++
++	sub = strrchr(path, '/'); /* last path component */
++	if (sub == NULL)
++		return false;
++
++	sub += 1;
++	if (strncmp(sub, prefix, strlen(prefix)) != 0)
++		return false;
++
++	sub += strlen(prefix);
++	while (*sub != '\0' && (*sub >= '0' && *sub <= '9')) /* skip the optional index digits */
++		sub += 1;
++
++	if (*sub == '\0')
++		return true;
++	if (!strcmp(sub, suffix))
++		return true;
++	else
++		return false;
++}
++
+ static int handle_vma(pid_t pid, struct vma_area *vma_area,
+ 			const char *file_path, DIR *map_files_dir,
+ 			struct vma_file_info *vfi,
+@@ -589,6 +618,9 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area,
+ 			goto err;
+ 	} else if (!strcmp(file_path, "[heap]")) {
+ 		vma_area->e->status |= VMA_AREA_REGULAR | VMA_AREA_HEAP;
++	} else if
(is_sysfs_resource(file_path)) { ++ pr_info("find sys device module share memory\n"); ++ vma_area->e->status |= VMA_AREA_REGULAR | VMA_AREA_DEV_SHARE; + } else { + vma_area->e->status = VMA_AREA_REGULAR; + } +-- +2.34.0 + diff --git a/backport-0031--add-reuse-file-method-for-recover-deleted-file-state.patch b/backport-0031--add-reuse-file-method-for-recover-deleted-file-state.patch new file mode 100644 index 0000000000000000000000000000000000000000..94b3751ccf9afbc92600cb33f5671d4e5310d171 --- /dev/null +++ b/backport-0031--add-reuse-file-method-for-recover-deleted-file-state.patch @@ -0,0 +1,205 @@ +From 98b1a7c6312df27a9db0797aabdc34ec07db67fe Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Sat, 14 Aug 2021 16:45:40 +0800 +Subject: [PATCH 31/50] add reuse file method for recover deleted file state + +Add reuse file method for recover file state of deleted files. + +Signed-off-by: Jingxian He +--- + criu/files-reg.c | 75 +++++++++++++++++++++++++++++++++++++--- + criu/files.c | 24 ++++++++++--- + criu/include/files-reg.h | 9 +++++ + 3 files changed, 99 insertions(+), 9 deletions(-) + +diff --git a/criu/files-reg.c b/criu/files-reg.c +index 6bbcbee..46e9eab 100644 +--- a/criu/files-reg.c ++++ b/criu/files-reg.c +@@ -1184,6 +1184,70 @@ int strip_deleted(struct fd_link *link) + return 0; + } + ++int add_reuse_file(u32 id, int fd, int pid) ++{ ++ int retval; ++ char buf[256] = {0}; ++ ++ retval = snprintf(buf, 256, "%u,%d,%d", id, fd, pid); ++ if (retval <= 0) ++ return -EFAULT; ++ ++ fd = open(ADD_REUSE_FILE_PATH, O_WRONLY, 0); ++ if (fd < 0) { ++ pr_err("open file:%s fail\n", ADD_REUSE_FILE_PATH); ++ return fd; ++ } ++ ++ retval = write(fd, buf, strlen(buf)); ++ close(fd); ++ return retval < 0 ? 
-1 : 0;
++}
++
++
++int repair_reuse_file(int id) /* writes the file id to the "repair" sysfs node */
++{
++	int retval, fd;
++	char buf[256] = {0};
++
++	retval = snprintf(buf, 256, "%u", (unsigned int)id); /* cast: %u needs an unsigned argument */
++	if (retval <= 0)
++		return -EFAULT;
++
++	fd = open(REPAIR_REUSE_FILE_PATH, O_WRONLY, 0);
++	if (fd < 0) {
++		pr_err("open file:%s fail\n", REPAIR_REUSE_FILE_PATH);
++		return fd;
++	}
++	retval = write(fd, buf, strlen(buf));
++
++	close(fd);
++	return retval < 0 ? -1 : 0;
++}
++
++int get_reuse_file(void) /* returns the integer read from the "reuse" sysfs node, or a negative value */
++{
++	int fd;
++	ssize_t count;
++	int retval = -1;
++	char buf[32] = {0};
++
++	fd = open(REUSE_FILE_PATH, O_RDONLY , 0);
++	if (fd < 0) {
++		pr_err("open file:%s fail\n", REUSE_FILE_PATH);
++		return fd;
++	}
++
++	count = read(fd, buf, sizeof(buf) - 1); /* leave the last byte NUL so atoi() stays in bounds */
++	if (count > 0)
++		retval = atoi(buf);
++
++	close(fd);
++	return retval;
++}
++
++extern int dst_pid;
++extern int need_reuse_flag;
+ static int check_path_remap(struct fd_link *link, const struct fd_parms *parms,
+ 			    int lfd, u32 id, struct ns_id *nsid)
+ {
+@@ -1301,9 +1365,12 @@ static int check_path_remap(struct fd_link *link, const struct fd_parms *parms,
+ 		 * name.
+ */ + +- if (errno == ENOENT) +- return dump_linked_remap(rpath + 1, plen - 1, +- ost, lfd, id, nsid); ++ if (errno == ENOENT) { ++ pr_info("start add no exist file:%s\n", rpath + 1); ++ add_reuse_file(id, lfd, dst_pid); ++ need_reuse_flag = O_REUSE; ++ return 0; ++ } + + pr_perror("Can't stat path"); + return -1; +@@ -1724,7 +1791,7 @@ ext: + rfe.mode = p->stat.st_mode; + + if (S_ISREG(p->stat.st_mode) && should_check_size(rfe.flags) && +- !store_validation_data(&rfe, p, lfd)) ++ (need_reuse_flag != O_REUSE) && !store_validation_data(&rfe, p, lfd)) + return -1; + + fe.type = FD_TYPES__REG; +diff --git a/criu/files.c b/criu/files.c +index 0ebf26e..16d34fd 100644 +--- a/criu/files.c ++++ b/criu/files.c +@@ -720,7 +720,7 @@ int dump_my_file(int lfd, u32 *id, int *type) + } + + int dst_pid; +- ++int need_reuse_flag; + int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, + struct parasite_drain_fd *dfds) + { +@@ -758,7 +758,7 @@ int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, + + for (i = 0; i < nr_fds; i++) { + FdinfoEntry e = FDINFO_ENTRY__INIT; +- ++ need_reuse_flag = 0; + ret = dump_one_file(item->pid, dfds->fds[i + off], + lfds[i], opts + i, ctl, &e, dfds); + if (ret < 0) +@@ -768,7 +768,7 @@ int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, + ret = 0; + continue; + } +- ++ e.flags |= need_reuse_flag; + pr_info("write fdinfoEntry fd=%d id=%d\n", (&e)->fd, (&e)->id); + ret = pb_write_one(img, &e, PB_FDINFO); + if (ret) +@@ -964,8 +964,8 @@ int collect_fd(int pid, FdinfoEntry *e, struct rst_info *rst_info, bool fake) + { + struct file_desc *fdesc; + +- pr_info("Collect fdinfo pid=%d fd=%d id=%#x\n", +- pid, e->fd, e->id); ++ pr_info("Collect fdinfo pid=%d fd=%d id=%#x flags:%x\n", ++ pid, e->fd, e->id, e->flags); + + fdesc = find_file_desc(e); + if (fdesc == NULL) { +@@ -1255,6 +1255,7 @@ static int open_fd(struct fdinfo_list_entry *fle) + struct fdinfo_list_entry *flem; + int 
new_fd = -1, ret; + ++ pr_info("open file flags:%x\n", fle->fe->flags); + flem = file_master(d); + if (fle != flem) { + BUG_ON (fle->stage != FLE_INITIALIZED); +@@ -1276,6 +1277,19 @@ static int open_fd(struct fdinfo_list_entry *fle) + return 0; + } + } ++ } else if (fle->fe->flags & O_REUSE) { ++ pr_info("find reuse file:%d\n", d->id); ++ ret = repair_reuse_file(d->id); ++ if (!ret) { ++ new_fd = get_reuse_file(); ++ pr_info("get reuse file:%d\n", new_fd); ++ if (new_fd <= 0 || setup_and_serve_out(fle, new_fd) < 0) { ++ pr_err("setup reuse file fail\n"); ++ return -1; ++ } ++ fle->stage = FLE_RESTORED; ++ return 0; ++ } + } + + /* +diff --git a/criu/include/files-reg.h b/criu/include/files-reg.h +index 458fe89..4ec0e14 100644 +--- a/criu/include/files-reg.h ++++ b/criu/include/files-reg.h +@@ -64,4 +64,13 @@ extern int strip_deleted(struct fd_link *link); + + extern int dead_pid_conflict(void); + ++#define ADD_REUSE_FILE_PATH "/sys/kernel/add_reuse_file" ++#define REPAIR_REUSE_FILE_PATH "/sys/kernel/repair_reuse_file" ++#define REUSE_FILE_PATH "/sys/kernel/reuse_file" ++#define O_REUSE 0100000000 ++ ++extern int add_reuse_file(u32 id, int fd, int pid); ++extern int repair_reuse_file(int id); ++extern int get_reuse_file(void); ++ + #endif /* __CR_FILES_REG_H__ */ +-- +2.34.0 + diff --git a/backport-0032--fix-share-sockets-repair-problem.patch b/backport-0032--fix-share-sockets-repair-problem.patch new file mode 100644 index 0000000000000000000000000000000000000000..aa358a5c6a5369daaa8fc07abc89b044cab13e16 --- /dev/null +++ b/backport-0032--fix-share-sockets-repair-problem.patch @@ -0,0 +1,132 @@ +From 77c420dfe350f35c13c71368affafd4c714357e4 Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Wed, 11 Aug 2021 15:01:27 +0800 +Subject: [PATCH 32/50] fix share sockets repair problem + +Repair off the share sockets after reusing them +to recover the share socket state. 
+ +Signed-off-by: Jingxian He +--- + criu/files.c | 34 ++++++++++++++++++++++++++++++++-- + criu/sk-inet.c | 6 ++++-- + criu/sk-netlink.c | 5 +++-- + 3 files changed, 39 insertions(+), 6 deletions(-) + +diff --git a/criu/files.c b/criu/files.c +index 16d34fd..cc812b9 100644 +--- a/criu/files.c ++++ b/criu/files.c +@@ -54,7 +54,7 @@ + #include "util.h" + #include "images/fs.pb-c.h" + #include "images/ext-file.pb-c.h" +- ++#include "sk-inet.h" + #include "plugin.h" + + #define FDESC_HASH_SIZE 64 +@@ -1235,7 +1235,7 @@ int setup_and_serve_out(struct fdinfo_list_entry *fle, int new_fd) + if (reopen_fd_as(fle->fe->fd, new_fd)) + return -1; + +- pr_info("*******flags: %d",fle->fe->flags); ++ pr_info("*******flags: %d\n",fle->fe->flags); + if (fcntl(fle->fe->fd, F_SETFD, fle->fe->flags) == -1) { + pr_perror("Unable to set file descriptor flags"); + return -1; +@@ -1249,6 +1249,30 @@ int setup_and_serve_out(struct fdinfo_list_entry *fle, int new_fd) + return 0; + } + ++#define MAX_SHARE_SOCKETS_NUM 1024 ++int repair_share_sockets[MAX_SHARE_SOCKETS_NUM]; ++int repair_share_num; ++ ++int add_repair_share_socket(int fd) ++{ ++ if (repair_share_num >= MAX_SHARE_SOCKETS_NUM) ++ return -1; ++ repair_share_sockets[repair_share_num] = fd; ++ repair_share_num++; ++ return 0; ++} ++ ++void repair_off_share_sockets(void) ++{ ++ int i; ++ ++ for (i = 0; i < repair_share_num; i++) { ++ tcp_repair_off(repair_share_sockets[i]); ++ pr_info("repair off socket:%d\n", repair_share_sockets[i]); ++ } ++ repair_share_num = 0; ++} ++ + static int open_fd(struct fdinfo_list_entry *fle) + { + struct file_desc *d = fle->desc; +@@ -1267,6 +1291,7 @@ static int open_fd(struct fdinfo_list_entry *fle) + + if (d->ops->type == FD_TYPES__INETSK) { + if (check_need_repair(d)) { ++ pr_info("start repair for:%d\n", d->id); + ret = repair_share_socket(d->id); + if (!ret) { + new_fd = get_share_socket(); +@@ -1274,6 +1299,10 @@ static int open_fd(struct fdinfo_list_entry *fle) + if (new_fd <= 0 || 
setup_and_serve_out(fle, new_fd) < 0)
+ 					return -1;
+ 				fle->stage = FLE_RESTORED;
++				if (add_repair_share_socket(fle->fe->fd)) {
++					pr_err("add repair share socket fail\n"); /* table full; no errno is set on this path */
++					return -1;
++				}
+ 				return 0;
+ 			}
+ 		}
+@@ -1388,6 +1417,7 @@ static int open_fdinfos(struct pstree_item *me)
+ 		wait_fds_event();
+ 	} while (again || progress);
+ 
++	repair_off_share_sockets();
+ 	BUG_ON(!list_empty(list));
+ 	/*
+ 	 * Fake fles may be used for restore other
+diff --git a/criu/sk-inet.c b/criu/sk-inet.c
+index d29f03b..768c6ed 100644
+--- a/criu/sk-inet.c
++++ b/criu/sk-inet.c
+@@ -654,8 +654,10 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa
+ 	BUG_ON(sk->sd.already_dumped);
+ 
+ 	if (check_share_dst_port(sk->dst_port) || check_share_src_port(sk->src_port)) {
+-		pr_info("Start add share prot:%d src %d\n", sk->dst_port, sk->src_port);
+-		add_share_socket(id, lfd, dst_pid, sk->src_port);
++		pr_info("Start add share port:%d-%d, dst_pid:%d id:%u\n", sk->dst_port, sk->src_port, dst_pid, id);
++		ret = add_share_socket(id, lfd, dst_pid, sk->src_port);
++		if (ret)
++			pr_warn("add share socket ret:%d\n", ret);
+ 	}
+ 
+ 	ie.id = id;
+diff --git a/criu/sk-netlink.c b/criu/sk-netlink.c
+index a6c56ff..70d245a 100644
+--- a/criu/sk-netlink.c
++++ b/criu/sk-netlink.c
+@@ -115,9 +115,10 @@ static bool can_dump_netlink_sk(int lfd)
+ 
+ 	ret = fd_has_data(lfd);
+ 	if (ret == 1)
+-		pr_err("The socket has data to read\n");
++		pr_warn("The socket has data to read\n");
+ 
+-	return ret == 0;
++	/* ignore netlink socket data */
++	return true;
+ }
+ 
+ static int dump_one_netlink_fd(int lfd, u32 id, const struct fd_parms *p)
+-- 
+2.34.0
+
diff --git a/backport-0033--nftables-add-mnl-api.patch b/backport-0033--nftables-add-mnl-api.patch
new file mode 100644
index 0000000000000000000000000000000000000000..6e9b948d64573e614c945544a08d7328e20fe560
--- /dev/null
+++ b/backport-0033--nftables-add-mnl-api.patch
@@ -0,0 +1,271 @@
+From 6a6f3da51b8a938825ab9183e7d7db5e4422e7e0 Mon Sep
17 00:00:00 2001 +From: "fu.lin" +Date: Wed, 11 Aug 2021 16:50:49 +0800 +Subject: [PATCH 1/1] nftables: add mnl api + +libmnl provides the communication between userspace and kernelspace for +netfilter netlink. I abstract here for the next usage. + +Signed-off-by: fu.lin +--- + criu/Makefile.crtools | 1 + + criu/Makefile.packages | 8 ++ + criu/include/nftables.h | 28 +++++++ + criu/mnl.c | 165 ++++++++++++++++++++++++++++++++++++++++ + 4 files changed, 202 insertions(+) + create mode 100644 criu/include/nftables.h + create mode 100644 criu/mnl.c + +diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools +index a9008f0..ff6b597 100644 +--- a/criu/Makefile.crtools ++++ b/criu/Makefile.crtools +@@ -90,6 +90,7 @@ obj-y += pie-util-vdso.o + obj-y += vdso.o + obj-y += timens.o + obj-y += devname.o ++obj-y += mnl.o + obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o + obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o + CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 +diff --git a/criu/Makefile.packages b/criu/Makefile.packages +index 13c346f..9638a3d 100644 +--- a/criu/Makefile.packages ++++ b/criu/Makefile.packages +@@ -7,6 +7,8 @@ REQ-RPM-PKG-NAMES += protobuf-python + REQ-RPM-PKG-NAMES += libnl3-devel + REQ-RPM-PKG-NAMES += libcap-devel + REQ-RPM-PKG-NAMES += $(PYTHON)-future ++REQ-RPM-PKG-NAMES += libmnl-devel ++REQ-RPM-PKG-NAMES += libnftnl-devel + + REQ-RPM-PKG-TEST-NAMES += libaio-devel + +@@ -18,6 +20,8 @@ REQ-DEB-PKG-NAMES += $(PYTHON)-protobuf + REQ-DEB-PKG-NAMES += $(PYTHON)-future + REQ-DEB-PKG-NAMES += libnl-3-dev + REQ-DEB-PKG-NAMES += libcap-dev ++REQ-DEB-PKG-NAMES += libmnl-dev ++REQ-DEB-PKG-NAMES += libnftnl-dev + + REQ-DEB-PKG-TEST-NAMES += $(PYTHON)-yaml + REQ-DEB-PKG-TEST-NAMES += libaio-dev +@@ -31,6 +35,10 @@ REQ-RPM-PKG-TEST-NAMES += $(PYTHON)-pyyaml + endif + + export LIBS += -lprotobuf-c -ldl -lnl-3 -lsoccr -Lsoccr/ -lnet ++export LIBS += $(shell pkg-config --libs libmnl) ++export LIBS += $(shell pkg-config --libs libnftnl) ++export CFLAGS += $(shell 
pkg-config --cflags libmnl) ++export CFLAGS += $(shell pkg-config --cflags libnftnl) + + check-packages-failed: + $(warning Can not find some of the required libraries) +diff --git a/criu/include/nftables.h b/criu/include/nftables.h +new file mode 100644 +index 0000000..0bdab31 +--- /dev/null ++++ b/criu/include/nftables.h +@@ -0,0 +1,28 @@ ++#ifndef __CR_NFTABLES_H__ ++#define __CR_NFTABLES_H__ ++ ++#include ++ ++struct mnl_params { ++ struct mnl_socket *nl; ++ char *buf; ++ struct mnl_nlmsg_batch *batch; ++ uint32_t seq; ++}; ++ ++typedef struct nlmsghdr * (*buf_func_t)(struct mnl_params *mnl_params, void *args); ++typedef int (*batch_func_t)(struct mnl_params *mnl_params, void *args); ++typedef int (*mnl_func_t)(struct mnl_params *mnl, batch_func_t cb, void *args); ++ ++struct mnl_cb_params { ++ pid_t tree_id; ++ bool create; ++ bool ipv6; ++}; ++ ++int mnl_sendmsg(batch_func_t batch_cb, void *args); ++int mnl_common(mnl_func_t mnl_cb, void *arg1, void *arg2); ++int mnl_batch_send_and_recv(struct mnl_params *mnl_params, batch_func_t cb, void *args, int *result); ++int mnl_buf_send_and_recv(struct mnl_params *mnl_params, buf_func_t cb, void *args, int *result); ++ ++#endif /* __CR_NFTABLES_H__ */ +diff --git a/criu/mnl.c b/criu/mnl.c +new file mode 100644 +index 0000000..3a03202 +--- /dev/null ++++ b/criu/mnl.c +@@ -0,0 +1,165 @@ ++#include ++#include ++#include ++ ++#include ++ ++#include "nftables.h" ++#include "log.h" ++ ++int mnl_common(mnl_func_t mnl_cb, void *arg1, void *arg2) ++{ ++ char buf[MNL_SOCKET_BUFFER_SIZE]; ++ struct mnl_params mnl = { ++ .seq = time(NULL), ++ }; ++ int retval = -1; ++ ++ mnl.nl = mnl_socket_open(NETLINK_NETFILTER); ++ if (mnl.nl == NULL) { ++ pr_err("mnl_socket_open failed with %d: %s\n", errno, strerror(errno)); ++ return -1; ++ } ++ ++ if (mnl_socket_bind(mnl.nl, 0, MNL_SOCKET_AUTOPID) < 0) { ++ pr_err("mnl_socket_bind wailed with %d: %s\n", errno, strerror(errno)); ++ goto err_mnl; ++ } ++ ++ mnl.buf = buf; ++ mnl.batch = 
mnl_nlmsg_batch_start(buf, sizeof(buf)); ++ if (mnl.batch == NULL) ++ goto err_mnl; ++ ++ if (mnl_cb(&mnl, arg1, arg2) < 0) ++ goto err_batch; ++ ++ retval = 0; ++ ++err_batch: ++ mnl_nlmsg_batch_stop(mnl.batch); ++err_mnl: ++ mnl_socket_close(mnl.nl); ++ ++ return retval; ++} ++ ++static int mnl_sendmsg_internal(struct mnl_params *mnl, batch_func_t cb, void *args) ++{ ++ int retval = -1; ++ ++ nftnl_batch_begin(mnl_nlmsg_batch_current(mnl->batch), mnl->seq++); ++ mnl_nlmsg_batch_next(mnl->batch); ++ ++ if (cb(mnl, args) < 0) ++ goto err_batch; ++ ++ nftnl_batch_end(mnl_nlmsg_batch_current(mnl->batch), mnl->seq++); ++ mnl_nlmsg_batch_next(mnl->batch); ++ ++ if (mnl_socket_sendto(mnl->nl, mnl_nlmsg_batch_head(mnl->batch), ++ mnl_nlmsg_batch_size(mnl->batch)) < 0) { ++ pr_err("%s: mnl_socket_sendto failed with %d: %s\n", ++ __func__, errno, strerror(errno)); ++ goto err_batch; ++ } ++ ++ retval = 0; ++ ++err_batch: ++ return retval; ++} ++ ++int mnl_sendmsg(batch_func_t batch_cb, void *args) ++{ ++ return mnl_common(mnl_sendmsg_internal, batch_cb, args); ++} ++ ++int mnl_batch_send_and_recv(struct mnl_params *mnl_params, batch_func_t cb, ++ void *args, int *result) ++{ ++ struct mnl_socket *nl = mnl_params->nl; ++ struct mnl_nlmsg_batch *batch = mnl_params->batch; ++ uint32_t *seq = &mnl_params->seq; ++ char buf[MNL_SOCKET_BUFFER_SIZE]; ++ int retval; ++ ++ mnl_nlmsg_batch_reset(batch); ++ nftnl_batch_begin(mnl_nlmsg_batch_current(batch), (*seq)++); ++ mnl_nlmsg_batch_next(batch); ++ ++ if (cb(mnl_params, args) < 0) ++ return -1; ++ ++ nftnl_batch_end(mnl_nlmsg_batch_current(batch), (*seq)++); ++ mnl_nlmsg_batch_next(batch); ++ ++ if (mnl_socket_sendto(nl, mnl_nlmsg_batch_head(batch), ++ mnl_nlmsg_batch_size(batch)) < 0) { ++ pr_err("%s: mnl_socket_sendto failed with %d: %s\n", ++ __func__, errno, strerror(errno)); ++ return -1; ++ } ++ ++ /* don't care the netlink retval, and nlmsg hdr flags has no `NLM_F_ACK` */ ++ if (result == NULL) ++ return 0; ++ ++ retval = 
mnl_socket_recvfrom(nl, buf, sizeof(buf)); ++ while (retval > 0) { ++ retval = mnl_cb_run(buf, retval, 0, mnl_socket_get_portid(nl), NULL, NULL); ++ if (retval <= 0) ++ break; ++ retval = mnl_socket_recvfrom(nl, buf, sizeof(buf)); ++ } ++ ++ if (retval < 0) { ++ pr_err("%s: mnl batch socket recv errno with %d: %s\n", ++ __func__, errno, strerror(errno)); ++ *result = errno; ++ return -1; ++ } ++ ++ *result = 0; ++ return 0; ++} ++ ++int mnl_buf_send_and_recv(struct mnl_params *mnl_params, buf_func_t cb, ++ void *args, int *result) ++{ ++ struct mnl_socket *nl = mnl_params->nl; ++ char buf[MNL_SOCKET_BUFFER_SIZE]; ++ struct nlmsghdr *nlh; ++ int retval = 0; ++ ++ if ((nlh = cb(mnl_params, args)) == NULL) ++ return -1; ++ ++ if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { ++ pr_err("%s: mnl_socket_sendto failed with %d: %s\n", ++ __func__, errno, strerror(errno)); ++ return -1; ++ } ++ ++ /* don't care the netlink retval, and nlmsg hdr flags has no `NLM_F_ACK` */ ++ if (result == NULL) ++ return 0; ++ ++ retval = mnl_socket_recvfrom(nl, buf, sizeof(buf)); ++ while (retval > 0) { ++ retval = mnl_cb_run(buf, retval, 0, mnl_socket_get_portid(nl), NULL, NULL); ++ if (retval <= 0) ++ break; ++ retval = mnl_socket_recvfrom(nl, buf, sizeof(buf)); ++ } ++ ++ if (retval < 0) { ++ pr_info("%s: mnl buf socket recv errno with %d: %s\n", ++ __func__, errno, strerror(errno)); ++ *result = errno; ++ return -1; ++ } ++ ++ *result = 0; ++ return 0; ++} +-- +2.34.0 + diff --git a/backport-0034--nftables-implement-nft-api-for-tcp.patch b/backport-0034--nftables-implement-nft-api-for-tcp.patch new file mode 100644 index 0000000000000000000000000000000000000000..4a408b7da6678f170b5046b81eadd50f0ca3c07b --- /dev/null +++ b/backport-0034--nftables-implement-nft-api-for-tcp.patch @@ -0,0 +1,1011 @@ +From dc230277b4936fc0451d92a4abc644494527a6ea Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Wed, 11 Aug 2021 16:50:49 +0800 +Subject: [PATCH 34/50] nftables: implement nft api for tcp + 
+Signed-off-by: fu.lin +--- + criu/Makefile.crtools | 1 + + criu/include/nftables.h | 138 +++++++ + criu/nftables.c | 823 ++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 962 insertions(+) + create mode 100644 criu/nftables.c + +diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools +index ff6b597..cda5b82 100644 +--- a/criu/Makefile.crtools ++++ b/criu/Makefile.crtools +@@ -91,6 +91,7 @@ obj-y += vdso.o + obj-y += timens.o + obj-y += devname.o + obj-y += mnl.o ++obj-y += nftables.o + obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o + obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o + CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 +diff --git a/criu/include/nftables.h b/criu/include/nftables.h +index 0bdab31..3b51a3d 100644 +--- a/criu/include/nftables.h ++++ b/criu/include/nftables.h +@@ -3,6 +3,99 @@ + + #include + ++#include ++#include ++#include ++#include ++#include ++ ++#define construct_buf(buf, type, family, flags, seq, payload, cb_prefix) \ ++ ({ \ ++ struct nlmsghdr *_nlh; \ ++ \ ++ _nlh = nftnl_##cb_prefix##_nlmsg_build_hdr((buf), \ ++ (type), (family), (flags), (seq)); \ ++ nftnl_##cb_prefix##_nlmsg_build_payload(_nlh, (payload)); \ ++ nftnl_##cb_prefix##_free((payload)); \ ++ _nlh; \ ++ }) ++ ++#define construct_table_buf(buf, type, family, flags, seq, payload) \ ++ construct_buf((buf), (type), (family), (flags), (seq), \ ++ (payload), table) ++ ++#define construct_chain_buf(buf, type, family, flags, seq, payload) \ ++ construct_buf((buf), (type), (family), (flags), (seq), \ ++ (payload), chain) ++ ++#define construct_batch(batch, type, family, flags, seq, payload, cb_prefix) \ ++ { \ ++ struct nlmsghdr *_nlh; \ ++ \ ++ _nlh = nftnl_##cb_prefix##_nlmsg_build_hdr( \ ++ mnl_nlmsg_batch_current(batch), \ ++ (type), (family), (flags), (seq)); \ ++ nftnl_##cb_prefix##_nlmsg_build_payload(_nlh, (payload)); \ ++ nftnl_##cb_prefix##_free((payload)); \ ++ mnl_nlmsg_batch_next((batch)); \ ++ } ++ ++#define construct_table_batch(batch, type, family, flags, seq, 
payload) \ ++ construct_batch((batch), (type), (family), (flags), (seq), \ ++ (payload), table) ++ ++#define construct_chain_batch(batch, type, family, flags, seq, payload) \ ++ construct_batch((batch), (type), (family), (flags), (seq), \ ++ (payload), chain) ++ ++#define construct_set_batch(batch, type, family, flags, seq, payload) \ ++ construct_batch((batch), (type), (family), (flags), (seq), \ ++ (payload), set) ++ ++#define construct_rule_batch(batch, type, family, flags, seq, payload) \ ++ construct_batch((batch), (type), (family), (flags), (seq), \ ++ (payload), rule) ++ ++#define construct_set_elems_batch(batch, type, family, flags, seq, payload) \ ++ { \ ++ struct nlmsghdr *_nlh; \ ++ \ ++ _nlh = nftnl_nlmsg_build_hdr( \ ++ mnl_nlmsg_batch_current(batch), \ ++ (type), (family), (flags), (seq)); \ ++ nftnl_set_elems_nlmsg_build_payload(_nlh, (payload)); \ ++ nftnl_set_free((payload)); \ ++ mnl_nlmsg_batch_next((batch)); \ ++ } ++ ++#define TABLE_NAME "filter" ++#define INPUT_CHAIN_NAME "criu-input" ++#define OUTPUT_CHAIN_NAME "criu-output" ++#define INPUT_IPV4_SET_NAME "criu-input-ipv4-blacklist-%d" ++#define INPUT_IPV6_SET_NAME "criu-input-ipv6-blacklist-%d" ++#define OUTPUT_IPV4_SET_NAME "criu-output-ipv4-blacklist-%d" ++#define OUTPUT_IPV6_SET_NAME "criu-output-ipv6-blacklist-%d" ++ ++/* set key type, see nftables/include/datatypes.h ++ * The rule of the datatype calculation: ++ * Each type occupies 6 bits, type: ++ * - ipaddr: 7, 4 bytes ++ * - ip6addr: 8, 16 types ++ * - inet_service: 13, 2 bytes (pading to 4 bytes) ++ * ++ * 0x1cd1cd: 0b 000111 001101 000111 001101 ++ * 0x20d20d: 0b 001000 001101 001000 001101 ++ */ ++#define INET_SERVICE_LEN 2 ++#define IPADDR_LEN 4 ++#define IP6ADDR_LEN 16 ++#define div_round_up(n, d) (((n) + (d) - 1) / (d)) ++ ++#define IPv4_KEY_TYPE 0x1cd1cd ++#define IPv4_KEY_LEN div_round_up(IPADDR_LEN + INET_SERVICE_LEN, 4) * 4 * 2 ++#define IPv6_KEY_TYPE 0x20d20d ++#define IPv6_KEY_LEN div_round_up(IP6ADDR_LEN + 
INET_SERVICE_LEN, 4) * 4 * 2 ++ + struct mnl_params { + struct mnl_socket *nl; + char *buf; +@@ -25,4 +118,49 @@ int mnl_common(mnl_func_t mnl_cb, void *arg1, void *arg2); + int mnl_batch_send_and_recv(struct mnl_params *mnl_params, batch_func_t cb, void *args, int *result); + int mnl_buf_send_and_recv(struct mnl_params *mnl_params, buf_func_t cb, void *args, int *result); + ++struct nft_chain_params { ++ char *name; ++ uint32_t hooknum; ++ char *type; ++ uint32_t prio; ++ uint32_t policy; ++}; ++ ++struct nft_set_params { ++ char name[128]; ++ uint32_t id; ++ uint32_t datatype; ++ uint32_t key_len; ++}; ++ ++struct nft_rule_params { ++ char *chain_name; ++ char set_name[128]; ++ uint32_t mark; ++ uint16_t mark_op; ++ uint32_t nfproto; ++ uint8_t l4proto; ++ unsigned int stmt; ++ bool ipv6; ++}; ++ ++struct nft_set_elem_params { ++ char set_name[128]; ++ char data[40]; ++ size_t data_len; ++}; ++ ++struct nf_conn_params { ++ uint8_t family; ++ uint32_t *src_addr; ++ uint16_t src_port; ++ uint32_t *dst_addr; ++ uint16_t dst_port; ++ bool lock; ++ pid_t tree_id; ++}; ++ ++struct inet_sk_desc; ++int nft_connection_switch(struct inet_sk_desc *sk, bool lock, pid_t tree_id); ++ + #endif /* __CR_NFTABLES_H__ */ +diff --git a/criu/nftables.c b/criu/nftables.c +new file mode 100644 +index 0000000..57774e6 +--- /dev/null ++++ b/criu/nftables.c +@@ -0,0 +1,823 @@ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "sk-inet.h" ++#include "nftables.h" ++ ++#include "../soccr/soccr.h" ++ ++#include "log.h" ++ ++static struct nftnl_table *setup_table(uint8_t family, const char *table) ++{ ++ struct nftnl_table *t; ++ ++ t = nftnl_table_alloc(); ++ if (t == NULL) ++ return NULL; ++ ++ nftnl_table_set_u32(t, NFTNL_TABLE_FAMILY, family); ++ if (nftnl_table_set_str(t, NFTNL_TABLE_NAME, table) < 0) ++ goto err; ++ ++ return t; ++err: ++ nftnl_table_free(t); ++ return NULL; 
++} ++ ++static struct nftnl_chain *setup_chain(const char *table, ++ struct nft_chain_params *params, ++ bool create) ++{ ++ struct nftnl_chain *c; ++ ++ c = nftnl_chain_alloc(); ++ if (c == NULL) ++ return NULL; ++ ++ if (nftnl_chain_set_str(c, NFTNL_CHAIN_TABLE, table) < 0) ++ goto err; ++ if (nftnl_chain_set_str(c, NFTNL_CHAIN_NAME, params->name) < 0) ++ goto err; ++ if (create) { ++ nftnl_chain_set_u32(c, NFTNL_CHAIN_HOOKNUM, params->hooknum); ++ if (nftnl_chain_set_str(c, NFTNL_CHAIN_TYPE, params->type) < 0) ++ goto err; ++ nftnl_chain_set_u32(c, NFTNL_CHAIN_PRIO, params->prio); ++ nftnl_chain_set_u32(c, NFTNL_CHAIN_POLICY, params->policy); ++ } ++ ++ return c; ++err: ++ nftnl_chain_free(c); ++ return NULL; ++} ++ ++static struct nftnl_set *setup_set(uint8_t family, const char *table, ++ struct nft_set_params *params, ++ bool create) ++{ ++ struct nftnl_set *s; ++ ++ s = nftnl_set_alloc(); ++ if (s == NULL) ++ return NULL; ++ ++ if (nftnl_set_set_str(s, NFTNL_SET_TABLE, table) < 0) ++ goto err; ++ if (nftnl_set_set_str(s, NFTNL_SET_NAME, params->name) < 0) ++ goto err; ++ if (create) { ++ nftnl_set_set_u32(s, NFTNL_SET_FAMILY, family); ++ nftnl_set_set_u32(s, NFTNL_SET_ID, params->id); ++ ++ nftnl_set_set_u32(s, NFTNL_SET_KEY_TYPE, params->datatype); ++ nftnl_set_set_u32(s, NFTNL_SET_KEY_LEN, params->key_len); ++ } ++ ++ return s; ++err: ++ nftnl_set_free(s); ++ return NULL; ++} ++ ++static int add_mark(struct nftnl_rule *r, uint32_t meta_key, enum nft_registers dreg) ++{ ++ struct nftnl_expr *e; ++ ++ e = nftnl_expr_alloc("meta"); ++ if (e == NULL) ++ return -1; ++ ++ nftnl_expr_set_u32(e, NFTNL_EXPR_META_KEY, meta_key); ++ nftnl_expr_set_u32(e, NFTNL_EXPR_META_DREG, dreg); ++ ++ nftnl_rule_add_expr(r, e); ++ ++ return 0; ++} ++ ++static int add_proto(struct nftnl_rule *r, enum nft_registers dreg) ++{ ++ struct nftnl_expr *e; ++ ++ e = nftnl_expr_alloc("meta"); ++ if (e == NULL) ++ return -1; ++ ++ nftnl_expr_set_u32(e, NFTNL_EXPR_META_KEY, 
NFT_META_L4PROTO); ++ nftnl_expr_set_u32(e, NFTNL_EXPR_META_DREG, dreg); ++ ++ nftnl_rule_add_expr(r, e); ++ ++ return 0; ++} ++ ++static int add_payload(struct nftnl_rule *r, uint32_t base, uint32_t dreg, ++ uint32_t offset, uint32_t len) ++{ ++ struct nftnl_expr *e; ++ ++ e = nftnl_expr_alloc("payload"); ++ if (e == NULL) ++ return -1; ++ ++ nftnl_expr_set_u32(e, NFTNL_EXPR_PAYLOAD_BASE, base); ++ nftnl_expr_set_u32(e, NFTNL_EXPR_PAYLOAD_DREG, dreg); ++ nftnl_expr_set_u32(e, NFTNL_EXPR_PAYLOAD_OFFSET, offset); ++ nftnl_expr_set_u32(e, NFTNL_EXPR_PAYLOAD_LEN, len); ++ ++ nftnl_rule_add_expr(r, e); ++ ++ return 0; ++} ++ ++static int add_cmp(struct nftnl_rule *r, enum nft_registers sreg, uint32_t op, ++ const void *data, uint32_t data_len) ++{ ++ struct nftnl_expr *e; ++ ++ e = nftnl_expr_alloc("cmp"); ++ if (e == NULL) ++ return -1; ++ ++ nftnl_expr_set_u32(e, NFTNL_EXPR_CMP_SREG, sreg); ++ nftnl_expr_set_u32(e, NFTNL_EXPR_CMP_OP, op); ++ nftnl_expr_set(e, NFTNL_EXPR_CMP_DATA, data, data_len); ++ ++ nftnl_rule_add_expr(r, e); ++ ++ return 0; ++} ++ ++static int add_lookup(struct nftnl_rule *r, enum nft_registers sreg, ++ const char *set) ++{ ++ struct nftnl_expr *e; ++ ++ e = nftnl_expr_alloc("lookup"); ++ if (e == NULL) ++ return -1; ++ ++ if (nftnl_expr_set_str(e, NFTNL_EXPR_LOOKUP_SET, set) < 0) ++ goto err; ++ nftnl_expr_set_u32(e, NFTNL_EXPR_LOOKUP_SREG, sreg); ++ ++ nftnl_rule_add_expr(r, e); ++ ++ return 0; ++err: ++ nftnl_expr_free(e); ++ return -1; ++} ++ ++static int add_counter(struct nftnl_rule *r) ++{ ++ struct nftnl_expr *e; ++ ++ e = nftnl_expr_alloc("counter"); ++ if (e == NULL) ++ return -1; ++ ++ nftnl_rule_add_expr(r, e); ++ ++ return 0; ++} ++ ++static int add_verdict(struct nftnl_rule *r, const char *chain, int verdict) ++{ ++ struct nftnl_expr *e; ++ ++ e = nftnl_expr_alloc("immediate"); ++ if (e == NULL) ++ return -1; ++ ++ nftnl_expr_set_u32(e, NFTNL_EXPR_IMM_DREG, NFT_REG_VERDICT); ++ nftnl_expr_set_u32(e, NFTNL_EXPR_IMM_VERDICT, verdict); 
++ ++ nftnl_rule_add_expr(r, e); ++ ++ return 0; ++} ++ ++static int __setup_rule(struct nftnl_rule *r, struct nft_rule_params *params) ++{ ++ /* meta nfproto == */ ++ if (add_mark(r, NFT_META_PROTOCOL, NFT_REG32_00) < 0) ++ return -1; ++ if (add_cmp(r, NFT_REG32_00, NFT_CMP_EQ, ¶ms->nfproto, sizeof(uint32_t))< 0) ++ return -1; ++ ++ /* meta l4proto == */ ++ if (add_proto(r, NFT_REG32_00) < 0) ++ return -1; ++ if (add_cmp(r, NFT_REG32_00, NFT_CMP_EQ, ¶ms->l4proto, sizeof(uint8_t)) < 0) ++ return -1; ++ ++ /* ip saddr . sport . daddr . dport @ */ ++ if (params->ipv6 == false) { ++ if (add_payload(r, NFT_PAYLOAD_NETWORK_HEADER, NFT_REG32_00, ++ offsetof(struct iphdr, saddr), IPADDR_LEN) < 0) ++ return -1; ++ if (add_payload(r, NFT_PAYLOAD_TRANSPORT_HEADER, NFT_REG32_01, ++ offsetof(struct tcphdr, source), INET_SERVICE_LEN) < 0) ++ return -1; ++ if (add_payload(r, NFT_PAYLOAD_NETWORK_HEADER, NFT_REG32_02, ++ offsetof(struct iphdr, daddr), IPADDR_LEN) < 0) ++ return -1; ++ if (add_payload(r, NFT_PAYLOAD_TRANSPORT_HEADER, NFT_REG32_03, ++ offsetof(struct tcphdr, dest), INET_SERVICE_LEN) < 0) ++ return -1; ++ ++ if (add_lookup(r, NFT_REG32_00, params->set_name) < 0) ++ return -1; ++ } else { ++ if (add_payload(r, NFT_PAYLOAD_NETWORK_HEADER, NFT_REG32_00, ++ offsetof(struct ipv6hdr, saddr), IP6ADDR_LEN) < 0) ++ return -1; ++ if (add_payload(r, NFT_PAYLOAD_TRANSPORT_HEADER, NFT_REG32_04, ++ offsetof(struct tcphdr, source), INET_SERVICE_LEN) < 0) ++ return -1; ++ if (add_payload(r, NFT_PAYLOAD_NETWORK_HEADER, NFT_REG32_05, ++ offsetof(struct ipv6hdr, daddr), IP6ADDR_LEN) < 0) ++ return -1; ++ if (add_payload(r, NFT_PAYLOAD_TRANSPORT_HEADER, NFT_REG32_09, ++ offsetof(struct tcphdr, dest), INET_SERVICE_LEN) < 0) ++ return -1; ++ ++ if (add_lookup(r, NFT_REG32_00, params->set_name) < 0) ++ return -1; ++ } ++ ++ /* counter */ ++ if (add_counter(r) < 0) ++ return -1; ++ ++ return 0; ++} ++ ++static struct nftnl_rule *setup_rule(uint8_t family, const char *table, ++ struct 
nft_rule_params *params, ++ bool create, bool ns) ++{ ++ struct nftnl_rule *r = NULL; ++ ++ r = nftnl_rule_alloc(); ++ if (r == NULL) ++ return NULL; ++ ++ if (nftnl_rule_set_str(r, NFTNL_RULE_TABLE, table) < 0) ++ goto err; ++ nftnl_rule_set_u32(r, NFTNL_RULE_FAMILY, family); ++ if (nftnl_rule_set_str(r, NFTNL_RULE_CHAIN, params->chain_name) < 0) ++ goto err; ++ ++ if (params->mark != 0) { ++ /* meta mark != */ ++ if (add_mark(r, NFT_META_MARK, NFT_REG32_00) < 0) ++ goto err; ++ if (add_cmp(r, NFT_REG32_00, params->mark_op, ¶ms->mark, sizeof(uint32_t)) < 0) ++ goto err; ++ } ++ ++ if (!ns && __setup_rule(r, params) < 0) ++ goto err; ++ ++ /* drop */ ++ if (add_verdict(r, params->chain_name, params->stmt) < 0) ++ goto err; ++ ++ return r; ++ ++err: ++ nftnl_rule_free(r); ++ return NULL; ++} ++ ++static struct nlmsghdr *nft_table_detect(struct mnl_params *mnl_params, void *args) ++{ ++ struct nftnl_table *table; ++ ++ table = setup_table(NFPROTO_INET, TABLE_NAME); ++ if (table == NULL) ++ return NULL; ++ ++ return construct_table_buf(mnl_params->buf, NFT_MSG_GETTABLE, NFPROTO_INET, ++ NLM_F_ACK, mnl_params->seq++, table); ++} ++ ++static int nft_table_create(struct mnl_params *mnl_params, void *args) ++{ ++ struct nftnl_table *table; ++ ++ table = setup_table(NFPROTO_INET, TABLE_NAME); ++ if (table == NULL) ++ return -1; ++ ++ construct_table_batch(mnl_params->batch, NFT_MSG_NEWTABLE, NFPROTO_INET, ++ NLM_F_CREATE|NLM_F_EXCL|NLM_F_ACK, ++ mnl_params->seq++, table); ++ ++ return 0; ++} ++ ++static int nft_table_prepare(struct mnl_params *mnl_params) ++{ ++ int result = 0; ++ ++ if (mnl_buf_send_and_recv(mnl_params, nft_table_detect, NULL, &result) == 0) ++ return 0; ++ ++ pr_debug("%s: detect table result %d\n", __func__, result); ++ ++ if (result == ENOENT && ++ (mnl_batch_send_and_recv(mnl_params, nft_table_create, NULL, &result) < 0 ++ && (result != 0 && result != EEXIST))) { ++ pr_err("%s: create nftables table failed!\n", __func__); ++ return -1; ++ } else if 
(result != 0) { ++ pr_err("%s: detect table result %d\n", __func__, -result); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static struct nlmsghdr *nft_chain_detect(struct mnl_params *mnl_params, void *args) ++{ ++ struct nftnl_chain *chain; ++ ++ chain = setup_chain(TABLE_NAME, args, false); ++ if (chain == NULL) ++ return NULL; ++ ++ return construct_chain_buf(mnl_params->buf, NFT_MSG_GETCHAIN, NFPROTO_INET, ++ NLM_F_ACK, mnl_params->seq++, chain); ++} ++ ++static int nft_chain_create(struct mnl_params *mnl_params, void *args) ++{ ++ struct nftnl_chain *chain; ++ ++ chain = setup_chain(TABLE_NAME, args, true); ++ if (chain == NULL) ++ return -1; ++ ++ construct_chain_batch(mnl_params->batch, NFT_MSG_NEWCHAIN, NFPROTO_INET, ++ NLM_F_CREATE|NLM_F_EXCL|NLM_F_ACK, mnl_params->seq++, chain); ++ ++ return 0; ++} ++ ++static int nft_chain_prepare_internal(struct mnl_params *mnl_params, ++ struct nft_chain_params *params) ++{ ++ int result = 0; ++ ++ if (mnl_buf_send_and_recv(mnl_params, nft_chain_detect, params, &result) == 0) ++ return 0; ++ ++ pr_debug("%s: detect chain result %d\n", __func__, result); ++ ++ if (result == ENOENT && ++ (mnl_batch_send_and_recv(mnl_params, nft_chain_create, params, &result) < 0 ++ && (result != 0 && result != EEXIST))) { ++ pr_err("%s: nftables create chain %s failed!\n", ++ __func__, params->name); ++ return -1; ++ } else if (result != 0) { ++ pr_err("%s: detect chain result %d\n", __func__, -result); ++ return -1; ++ } ++ ++ return result; ++} ++ ++static int nft_chain_prepare(struct mnl_params *mnl_params) ++{ ++ struct nft_chain_params params = { ++ .type = "filter", ++ .prio = NF_IP_PRI_FILTER, ++ .policy = NF_ACCEPT, ++ }; ++ ++ /* prepare ipv4 input chain in filter table */ ++ params.name = INPUT_CHAIN_NAME; ++ params.hooknum = NF_INET_LOCAL_IN; ++ ++ if (nft_chain_prepare_internal(mnl_params, ¶ms) < 0) ++ return -1; ++ ++ /* prepare ipv4 output chain in filter table */ ++ params.name = OUTPUT_CHAIN_NAME; ++ params.hooknum = 
NF_INET_LOCAL_OUT; ++ ++ if (nft_chain_prepare_internal(mnl_params, ¶ms) < 0) ++ return -1; ++ ++ return 0; ++} ++ ++static int nft_set_internal(uint8_t family, struct mnl_params *mnl_params, ++ struct nft_set_params *params, bool create) ++{ ++ struct nftnl_set *set; ++ ++ set = setup_set(family, TABLE_NAME, params, create); ++ if (set == NULL) ++ return -1; ++ ++ if (create) { ++ construct_set_batch(mnl_params->batch, NFT_MSG_NEWSET, family, ++ NLM_F_CREATE|NLM_F_EXCL|NLM_F_ACK, mnl_params->seq++, set); ++ } else { ++ construct_set_batch(mnl_params->batch, NFT_MSG_DELSET, family, ++ 0, mnl_params->seq++, set); ++ } ++ ++ return 0; ++} ++ ++static int nft_set_raw(struct mnl_params *mnl_params, ++ struct mnl_cb_params *args, bool input) ++{ ++ const uint32_t set_id_base = input ? 0x12315 : 0x17173; ++ const uint8_t family = NFPROTO_INET; ++ struct nft_set_params params = { 0 }; ++ char *set_name; ++ int idx = 0; ++ ++ if (!args->ipv6) { ++ params.datatype = IPv4_KEY_TYPE; ++ params.key_len = IPv4_KEY_LEN; ++ idx = 4; ++ } else { ++ params.datatype = IPv6_KEY_TYPE; ++ params.key_len = IPv6_KEY_LEN; ++ idx = 6; ++ } ++ ++ if (args->ipv6 && input) ++ set_name = INPUT_IPV6_SET_NAME; ++ else if (args->ipv6 && !input) ++ set_name = OUTPUT_IPV6_SET_NAME; ++ else if (!args->ipv6 && input) ++ set_name = INPUT_IPV4_SET_NAME; ++ else ++ set_name = OUTPUT_IPV4_SET_NAME; ++ ++ snprintf(params.name, sizeof(params.name)-1, set_name, args->tree_id); ++ params.id = set_id_base + args->tree_id + idx; ++ ++ if (nft_set_internal(family, mnl_params, ¶ms, args->create) < 0) { ++ pr_err("%s: create nftables %s %s set failed!\n", __func__, ++ args->ipv6 ? "ipv6" : "ipv4", ++ input ? 
"input" : "output"); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static int nft_set(struct mnl_params *mnl_params, void *args) ++{ ++ struct mnl_cb_params *params = args; ++ ++ params->ipv6 = false; ++ if (nft_set_raw(mnl_params, params, true) < 0) ++ return -1; ++ ++ if (nft_set_raw(mnl_params, params, false) < 0) ++ return -1; ++ ++ params->ipv6 = true; ++ if (nft_set_raw(mnl_params, params, true) < 0) ++ return -1; ++ ++ if (nft_set_raw(mnl_params, params, false) < 0) ++ return -1; ++ ++ return 0; ++} ++ ++static int nft_set_common(struct mnl_params *mnl_params, pid_t tree_id, bool create) ++{ ++ struct mnl_cb_params params = { ++ .tree_id = tree_id, ++ .create = create, ++ }; ++ int result = 0; ++ ++ if (create && ++ (mnl_batch_send_and_recv(mnl_params, nft_set, ¶ms, &result) < 0 ++ && (result != 0 && result != EEXIST))) { ++ pr_err("%s: create set failed!\n", __func__); ++ return -1; ++ } else if (!create && ++ mnl_batch_send_and_recv(mnl_params, nft_set, ¶ms, NULL) < 0) { ++ pr_err("%s: delete set failed!\n", __func__); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static int nft_rule_internal(uint8_t family, struct mnl_params *mnl_params, ++ struct nft_rule_params *params, bool create) ++{ ++ struct nftnl_rule *rule; ++ ++ rule = setup_rule(family, TABLE_NAME, params, create, false); ++ if (rule == NULL) ++ return -1; ++ ++ if (create) { ++ construct_rule_batch(mnl_params->batch, NFT_MSG_NEWRULE, family, ++ NLM_F_CREATE|NLM_F_EXCL|NLM_F_ACK, ++ mnl_params->seq++, rule); ++ } else { ++ construct_rule_batch(mnl_params->batch, NFT_MSG_DELRULE, family, ++ 0, mnl_params->seq++, rule); ++ } ++ ++ return 0; ++} ++ ++static int nft_rule_raw(struct mnl_params *mnl_params, struct mnl_cb_params *args, ++ struct nft_rule_params *params) ++{ ++ char *set_name; ++ ++ params->nfproto = params->ipv6 ? htons(ETH_P_IPV6) : htons(ETH_P_IP); ++ ++ set_name = params->ipv6 ? 
INPUT_IPV6_SET_NAME : INPUT_IPV4_SET_NAME; ++ params->chain_name = INPUT_CHAIN_NAME; ++ snprintf(params->set_name, sizeof(params->set_name)-1, set_name, args->tree_id); ++ if (nft_rule_internal(NFPROTO_INET, mnl_params, params, args->create) < 0) { ++ pr_err("%s: create nft %s input rule failed!\n", ++ __func__, params->ipv6 ? "ipv6" : "ipv4"); ++ return -1; ++ } ++ ++ set_name = params->ipv6 ? OUTPUT_IPV6_SET_NAME : OUTPUT_IPV4_SET_NAME; ++ params->chain_name = OUTPUT_CHAIN_NAME; ++ snprintf(params->set_name, sizeof(params->set_name)-1, set_name, args->tree_id); ++ if (nft_rule_internal(NFPROTO_INET, mnl_params, params, args->create) < 0) { ++ pr_err("%s: create nftables %s output rule failed!\n", ++ __func__, params->ipv6 ? "ipv6" : "ipv4"); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static int nft_rule(struct mnl_params *mnl_params, void *args) ++{ ++ struct nft_rule_params params = { ++ .l4proto = IPPROTO_TCP, ++ .mark = SOCCR_MARK, ++ .mark_op = NFT_CMP_NEQ, ++ .stmt = NF_DROP, ++ }; ++ ++ params.ipv6 = false; ++ if (nft_rule_raw(mnl_params, args, ¶ms) < 0) ++ return -1; ++ ++ params.ipv6 = true; ++ if (nft_rule_raw(mnl_params, args, ¶ms) < 0) ++ return -1; ++ ++ return 0; ++} ++ ++static int nft_rule_common(struct mnl_params *mnl_params, pid_t tree_id, bool create) ++{ ++ struct mnl_cb_params params = { ++ .tree_id = tree_id, ++ .create = create, ++ }; ++ int result = 0; ++ ++ if (create && ++ (mnl_batch_send_and_recv(mnl_params, nft_rule, ¶ms, &result) < 0 ++ && (result != 0 && result != EEXIST))) { ++ pr_err("%s: create rule failed!\n", __func__); ++ return -1; ++ } else if (!create && ++ mnl_batch_send_and_recv(mnl_params, nft_rule, ¶ms, NULL) < 0) { ++ pr_err("%s: delete rule failed!\n", __func__); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static int network_prepare_internal(struct mnl_params *params, batch_func_t _, void *args) ++{ ++ pid_t tree_id = *(pid_t *)args; ++ ++ if (nft_table_prepare(params) < 0) ++ return -1; ++ ++ if 
(nft_chain_prepare(params) < 0) ++ return -1; ++ ++ if (nft_set_common(params, tree_id, true) < 0) ++ return -1; ++ ++ if (nft_rule_common(params, tree_id, true) < 0) ++ return -1; ++ ++ return 0; ++} ++ ++int network_prepare(pid_t tree_id) ++{ ++ pr_info("Prepare network\n"); ++ ++ return mnl_common(network_prepare_internal, NULL, &tree_id); ++} ++ ++static int network_unprepare_internal(struct mnl_params *params, ++ batch_func_t _, void *args) ++{ ++ pid_t tree_id = *(pid_t *)args; ++ ++ if (nft_rule_common(params, tree_id, false) < 0) ++ return -1; ++ ++ if (nft_set_common(params, tree_id, false) < 0) ++ return -1; ++ ++ return 0; ++} ++ ++void network_unprepare(pid_t tree_id) ++{ ++ pr_info("Unprepare network\n"); ++ ++ mnl_common(network_unprepare_internal, NULL, &tree_id); ++} ++ ++static int add_set_elem_internal(struct nftnl_set *s, void *data, size_t len) ++{ ++ struct nftnl_set_elem *e; ++ ++ e = nftnl_set_elem_alloc(); ++ if (e == NULL) ++ return -1; ++ ++ nftnl_set_elem_set(e, NFTNL_SET_ELEM_KEY, data, len); ++ ++ nftnl_set_elem_add(s, e); ++ ++ return 0; ++} ++ ++static struct nftnl_set *add_set_elem(const char *table, const char *set, ++ void *data, size_t len) ++{ ++ struct nftnl_set *s; ++ ++ s = nftnl_set_alloc(); ++ if (s == NULL) ++ return NULL; ++ ++ if (nftnl_set_set_str(s, NFTNL_SET_TABLE, table) < 0) ++ goto err; ++ if (nftnl_set_set_str(s, NFTNL_SET_NAME, set) < 0) ++ goto err; ++ ++ if (add_set_elem_internal(s, data, len) < 0) ++ goto err; ++ ++ return s; ++ ++err: ++ nftnl_set_free(s); ++ return NULL; ++} ++ ++static int nft_set_elem(uint8_t family, struct mnl_params *mnl_param, ++ struct nft_set_elem_params *elem_param, ++ bool lock) ++{ ++ struct nftnl_set *set; ++ ++ set = add_set_elem(TABLE_NAME, elem_param->set_name, ++ elem_param->data, elem_param->data_len); ++ if (set == NULL) ++ return -1; ++ ++ if (lock) { ++ construct_set_elems_batch(mnl_param->batch, NFT_MSG_NEWSETELEM, ++ family, NLM_F_CREATE|NLM_F_EXCL, ++ mnl_param->seq++, 
set); ++ } else { ++ construct_set_elems_batch(mnl_param->batch, NFT_MSG_DELSETELEM, ++ family, 0, mnl_param->seq++, set); ++ } ++ ++ return 0; ++} ++ ++static void construct_set_elem_key(void *data, struct nf_conn_params *param, bool output) ++{ ++ size_t offset = 0; ++ size_t addr_len = param->family == AF_INET ? IPADDR_LEN : IP6ADDR_LEN; ++ ++ memcpy(data+offset, output ? param->src_addr : param->dst_addr, addr_len); ++ offset = addr_len; ++ *(uint32_t *)(data + offset) = htons(output ? param->src_port : param->dst_port); ++ offset += sizeof(uint32_t); ++ memcpy(data+offset, output ? param->dst_addr : param->src_addr, addr_len); ++ offset += addr_len; ++ *(uint32_t *)(data + offset) = htons(output ? param->dst_port : param->src_port); ++} ++ ++static int nf_connection_switch_raw(struct mnl_params *mnl_params, void *args) ++{ ++ struct nf_conn_params *param = args; ++ char *input_set_name, *output_set_name; ++ struct nft_set_elem_params elem; ++ ++ switch (param->family) { ++ case AF_INET: ++ input_set_name = INPUT_IPV4_SET_NAME; ++ output_set_name = OUTPUT_IPV4_SET_NAME; ++ elem.data_len = IPv4_KEY_LEN; ++ break; ++ case AF_INET6: ++ input_set_name = INPUT_IPV6_SET_NAME; ++ output_set_name = OUTPUT_IPV6_SET_NAME; ++ elem.data_len = IPv6_KEY_LEN; ++ break; ++ default: ++ pr_err("Unknown socket family %d\n", param->family); ++ return -1; ++ } ++ ++ construct_set_elem_key(elem.data, param, false); ++ snprintf(elem.set_name, sizeof(elem.set_name)-1, input_set_name, param->tree_id); ++ if (nft_set_elem(NFPROTO_INET, mnl_params, &elem, param->lock) < 0) ++ return -1; ++ ++ construct_set_elem_key(elem.data, param, true); ++ snprintf(elem.set_name, sizeof(elem.set_name)-1, output_set_name, param->tree_id); ++ if (nft_set_elem(NFPROTO_INET, mnl_params, &elem, param->lock) < 0) ++ return -1; ++ ++ return 0; ++} ++ ++/* IPv4-Mapped IPv6 Addresses */ ++static int ipv6_addr_mapped(uint32_t *addr) ++{ ++ return (addr[2] == htonl(0x0000ffff)); ++} ++ ++int 
nft_connection_switch(struct inet_sk_desc *sk, bool lock, pid_t tree_id) ++{ ++ char sip[INET_ADDR_LEN], dip[INET_ADDR_LEN]; ++ struct nf_conn_params param = { ++ .family = sk->sd.family, ++ .src_addr = sk->src_addr, ++ .src_port = sk->src_port, ++ .dst_addr = sk->dst_addr, ++ .dst_port = sk->dst_port, ++ .lock = lock, ++ .tree_id = tree_id, ++ }; ++ ++ if (param.family == AF_INET6 && ipv6_addr_mapped(param.dst_addr)) { ++ param.family = AF_INET; ++ param.src_addr = ¶m.src_addr[3]; ++ param.dst_addr = ¶m.dst_addr[3]; ++ } ++ ++ if (!inet_ntop(param.family, (void *)param.src_addr, sip, INET_ADDR_LEN) || ++ !inet_ntop(param.family, (void *)param.dst_addr, dip, INET_ADDR_LEN)) { ++ pr_perror("nf: Can't translate ip addr"); ++ return -1; ++ } ++ ++ pr_info("%s %s:%d - %s:%d connection\n", lock ? "Locked" : "Unlocked", ++ sip, (int)param.src_port, dip, (int)param.dst_port); ++ ++ return mnl_sendmsg(nf_connection_switch_raw, ¶m); ++} +-- +2.34.0 + diff --git a/backport-0035--nftables-implement-nft-api-for-lock-net-ns.patch b/backport-0035--nftables-implement-nft-api-for-lock-net-ns.patch new file mode 100644 index 0000000000000000000000000000000000000000..3d485b6d1155fa3cc6cd3399a65f09ff18602f3b --- /dev/null +++ b/backport-0035--nftables-implement-nft-api-for-lock-net-ns.patch @@ -0,0 +1,146 @@ +From 8640db3bdf0ab589797b83ea5196b46302e1960c Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Wed, 18 Aug 2021 10:59:41 +0800 +Subject: [PATCH 35/50] nftables: implement nft api for lock net ns + +Signed-off-by: fu.lin +--- + criu/include/nftables.h | 2 + + criu/nftables.c | 112 ++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 114 insertions(+) + +diff --git a/criu/include/nftables.h b/criu/include/nftables.h +index 3b51a3d..e462919 100644 +--- a/criu/include/nftables.h ++++ b/criu/include/nftables.h +@@ -162,5 +162,7 @@ struct nf_conn_params { + + struct inet_sk_desc; + int nft_connection_switch(struct inet_sk_desc *sk, bool lock, pid_t tree_id); ++int 
nft_lock(void); ++int nft_unlock(void); + + #endif /* __CR_NFTABLES_H__ */ +diff --git a/criu/nftables.c b/criu/nftables.c +index 57774e6..817f157 100644 +--- a/criu/nftables.c ++++ b/criu/nftables.c +@@ -821,3 +821,115 @@ int nft_connection_switch(struct inet_sk_desc *sk, bool lock, pid_t tree_id) + + return mnl_sendmsg(nf_connection_switch_raw, ¶m); + } ++ ++static int nft_ns_rule_internal(uint8_t family, struct mnl_params *mnl_params, ++ struct nft_rule_params *params, bool create) ++{ ++ struct nftnl_rule *rule; ++ ++ rule = setup_rule(family, TABLE_NAME, params, create, true); ++ if (rule == NULL) ++ return -1; ++ ++ if (create) { ++ construct_rule_batch(mnl_params->batch, NFT_MSG_NEWRULE, family, ++ NLM_F_CREATE|NLM_F_EXCL|NLM_F_ACK, ++ mnl_params->seq++, rule); ++ } else { ++ construct_rule_batch(mnl_params->batch, NFT_MSG_DELRULE, family, ++ 0, mnl_params->seq++, rule); ++ } ++ ++ return 0; ++} ++ ++static int nft_ns_rule_raw(struct mnl_params *mnl_params, struct mnl_cb_params *args, ++ struct nft_rule_params *params) ++{ ++ params->chain_name = INPUT_CHAIN_NAME; ++ if (nft_ns_rule_internal(NFPROTO_INET, mnl_params, params, args->create) < 0) { ++ pr_err("%s: create nft input rule failed!\n", __func__); ++ return -1; ++ } ++ ++ params->chain_name = OUTPUT_CHAIN_NAME; ++ if (nft_ns_rule_internal(NFPROTO_INET, mnl_params, params, args->create) < 0) { ++ pr_err("%s: create nft output rule failed!\n", __func__); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static int nft_ns_rule(struct mnl_params *mnl_params, void *args) ++{ ++ struct nft_rule_params params = { 0 }; ++ ++ params.mark = 0; ++ params.mark_op = NFT_CMP_EQ; ++ params.stmt = NF_DROP; ++ if (nft_ns_rule_raw(mnl_params, args, ¶ms) < 0) ++ return -1; ++ ++ params.mark = SOCCR_MARK; ++ ++ params.stmt = NF_ACCEPT; ++ if (nft_ns_rule_raw(mnl_params, args, ¶ms) < 0) ++ return -1; ++ ++ return 0; ++} ++ ++static int nft_ns_rule_common(struct mnl_params *mnl_params, bool create) ++{ ++ struct mnl_cb_params 
params = { ++ .create = create, ++ }; ++ int result = 0; ++ ++ if (create && ++ (mnl_batch_send_and_recv(mnl_params, nft_ns_rule, ¶ms, &result) < 0 ++ && (result != 0 && result != EEXIST))) { ++ pr_err("%s: crete ns rule failed!\n", __func__); ++ return -1; ++ } else if (!create && ++ (mnl_batch_send_and_recv(mnl_params, nft_ns_rule, ¶ms, NULL) < 0)) { ++ pr_err("%s: delete ns rule failed!\n", __func__); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static int network_lock_internal(struct mnl_params *params, ++ batch_func_t _, void *args) ++{ ++ if (nft_table_prepare(params) < 0) ++ return -1; ++ ++ if (nft_chain_prepare(params) < 0) ++ return -1; ++ ++ if (nft_ns_rule_common(params, true) < 0) ++ return -1; ++ ++ return 0; ++} ++ ++int nft_lock(void) ++{ ++ return mnl_common(network_lock_internal, NULL, NULL); ++} ++ ++static int network_unlock_internal(struct mnl_params *params, batch_func_t _, ++ void *args) ++{ ++ if (nft_ns_rule_common(params, false) < 0) ++ return -1; ++ return 0; ++} ++ ++int nft_unlock(void) ++{ ++ return mnl_common(network_unlock_internal, NULL, NULL); ++} +-- +2.34.0 + diff --git a/backport-0036--criu-switch-to-nftables-api.patch b/backport-0036--criu-switch-to-nftables-api.patch new file mode 100644 index 0000000000000000000000000000000000000000..6506d435e9196d637562f505325d724f024120ae --- /dev/null +++ b/backport-0036--criu-switch-to-nftables-api.patch @@ -0,0 +1,405 @@ +From 372e633f151b5b80f705ab1f45546c98ce38535c Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Thu, 19 Aug 2021 16:29:49 +0800 +Subject: [PATCH 36/50] criu: switch to nftables api + +usage: criu --use-nft + +Signed-off-by: fu.lin +--- + criu/config.c | 1 + + criu/cr-dump.c | 4 ++-- + criu/cr-restore.c | 6 ++++-- + criu/crtools.c | 1 + + criu/include/cr_options.h | 1 + + criu/include/net.h | 7 +++++-- + criu/include/netfilter.h | 7 +++++-- + criu/include/sk-inet.h | 2 +- + criu/kerndat.c | 3 ++- + criu/net.c | 37 +++++++++++++++++++++++++++---------- + criu/netfilter.c | 
14 +++++++++++--- + criu/nftables.c | 29 +++++++++++++++++++---------- + criu/sk-tcp.c | 14 +++++++------- + 13 files changed, 86 insertions(+), 40 deletions(-) + +diff --git a/criu/config.c b/criu/config.c +index 9268cd1..676b424 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -554,6 +554,7 @@ int parse_options(int argc, char **argv, bool *usage_error, + BOOL_OPT("ignore-special-dump", &opts.ignore_special_dump), + BOOL_OPT("file-locks-repair", &opts.file_locks_repair), + {"reserve-ports", required_argument, 0, 'P' }, ++ BOOL_OPT("use-nft", &opts.use_nft), + { }, + }; + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index 695c98a..d54b7a7 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -1782,7 +1782,7 @@ static int cr_dump_finish(int ret) + * start rollback procedure and cleanup everything. + */ + if (ret || post_dump_ret || opts.final_state == TASK_ALIVE) { +- network_unlock(); ++ network_unlock(opts.tree_id); + delete_link_remaps(); + clean_cr_time_mounts(); + } +@@ -1931,7 +1931,7 @@ int cr_dump_tasks(pid_t pid) + if (collect_pstree_ids()) + goto err; + +- if (network_lock()) ++ if (network_lock(opts.tree_id)) + goto err; + + if (collect_file_locks()) +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 4fd29a5..945d984 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -2499,7 +2499,7 @@ skip_ns_bouncing: + goto out_kill; + + /* Unlock network before disabling repair mode on sockets */ +- network_unlock(); ++ network_unlock(vpid(init)); + network_status |= NETWORK_UNLOCK; + + /* +@@ -2567,6 +2567,8 @@ skip_ns_bouncing: + if (ret != 0) + pr_err("Post-resume script ret code %d\n", ret); + ++ network_delete_set(vpid(init)); ++ + if (!opts.restore_detach && !opts.exec_cmd) + wait(NULL); + +@@ -2712,7 +2714,7 @@ err: + pr_err("collect inet sk cinfo fail"); + } + if ((network_status & NETWORK_UNLOCK) == 0) +- network_unlock(); ++ network_unlock(vpid(root_item)); + } + + return ret; +diff --git a/criu/crtools.c b/criu/crtools.c +index 
888714c..aaae44e 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -459,6 +459,7 @@ usage: + " --ignore-special-dump Ignore special task tid page dump\n" + " --file-locks-repair Use repair mode to dump and restore file locks\n" + " --reserve-ports Reserve src ports in kernel\n" ++" --use-nft Use nft API instead of iptables cmd in network locking" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 85d852f..62fbbb1 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -188,6 +188,7 @@ struct cr_options { + char *share_dst_ports; + char *share_src_ports; + int reserve_ports; ++ int use_nft; + }; + + extern struct cr_options opts; +diff --git a/criu/include/net.h b/criu/include/net.h +index bda0ff3..9daea8d 100644 +--- a/criu/include/net.h ++++ b/criu/include/net.h +@@ -30,8 +30,11 @@ struct veth_pair { + + extern int collect_net_namespaces(bool for_dump); + +-extern int network_lock(void); +-extern void network_unlock(void); ++extern int network_prepare(pid_t tree_id); ++extern void network_delete_rule(pid_t tree_id); ++extern void network_delete_set(pid_t tree_id); ++extern int network_lock(pid_t tree_id); ++extern void network_unlock(pid_t tree_id); + extern int network_lock_internal(void); + + extern struct ns_desc net_ns_desc; +diff --git a/criu/include/netfilter.h b/criu/include/netfilter.h +index 35ef262..c92762c 100644 +--- a/criu/include/netfilter.h ++++ b/criu/include/netfilter.h +@@ -1,9 +1,12 @@ + #ifndef __CR_NETFILTER_H__ + #define __CR_NETFILTER_H__ + ++#include ++#include ++ + struct inet_sk_desc; +-extern int nf_lock_connection(struct inet_sk_desc *); +-extern int nf_unlock_connection(struct inet_sk_desc *); ++extern int nf_lock_connection(struct inet_sk_desc *, pid_t, bool); ++extern int nf_unlock_connection(struct inet_sk_desc *, bool); + + struct inet_sk_info; + extern int 
nf_unlock_connection_info(struct inet_sk_info *); +diff --git a/criu/include/sk-inet.h b/criu/include/sk-inet.h +index 4181fbe..88e0881 100644 +--- a/criu/include/sk-inet.h ++++ b/criu/include/sk-inet.h +@@ -81,7 +81,7 @@ static inline void tcp_repair_off(int fd) + + extern void tcp_locked_conn_add(struct inet_sk_info *); + extern void rst_unlock_tcp_connections(void); +-extern void cpt_unlock_tcp_connections(void); ++extern void cpt_unlock_tcp_connections(bool); + + extern void read_reserved_ports(char *path); + extern void write_reserved_ports(char *path); +diff --git a/criu/kerndat.c b/criu/kerndat.c +index b2c47c5..59570a2 100644 +--- a/criu/kerndat.c ++++ b/criu/kerndat.c +@@ -1065,7 +1065,8 @@ int kerndat_init(void) + memset(&kdat, 0, sizeof(kdat)); + + preload_socket_modules(); +- preload_netfilter_modules(); ++ if (!opts.use_nft) ++ preload_netfilter_modules(); + + if (check_pagemap()) { + pr_err("check_pagemap failed when initializing kerndat.\n"); +diff --git a/criu/net.c b/criu/net.c +index 19329cf..2bd6f77 100644 +--- a/criu/net.c ++++ b/criu/net.c +@@ -45,6 +45,7 @@ + #include "util.h" + #include "external.h" + #include "fdstore.h" ++#include "nftables.h" + + #include "protobuf.h" + #include "images/netdev.pb-c.h" +@@ -2868,9 +2869,13 @@ int network_lock_internal(void) + return -1; + + +- ret |= iptables_restore(false, conf, sizeof(conf) - 1); +- if (kdat.ipv6) +- ret |= iptables_restore(true, conf, sizeof(conf) - 1); ++ if (opts.use_nft) ++ ret = nft_lock(); ++ else { ++ ret |= iptables_restore(false, conf, sizeof(conf) - 1); ++ if (kdat.ipv6) ++ ret |= iptables_restore(true, conf, sizeof(conf) - 1); ++ } + + if (ret) + pr_err("Locking network failed: iptables-restore returned %d. 
" +@@ -2897,9 +2902,13 @@ static int network_unlock_internal(void) + if (switch_ns(root_item->pid->real, &net_ns_desc, &nsret)) + return -1; + +- ret |= iptables_restore(false, conf, sizeof(conf) - 1); +- if (kdat.ipv6) +- ret |= iptables_restore(true, conf, sizeof(conf) - 1); ++ if (opts.use_nft) ++ ret = nft_unlock(); ++ else { ++ ret |= iptables_restore(false, conf, sizeof(conf) - 1); ++ if (kdat.ipv6) ++ ret |= iptables_restore(true, conf, sizeof(conf) - 1); ++ } + + if (restore_ns(nsret, &net_ns_desc)) + ret = -1; +@@ -2907,10 +2916,13 @@ static int network_unlock_internal(void) + return ret; + } + +-int network_lock(void) ++int network_lock(pid_t tree_id) + { + pr_info("Lock network\n"); + ++ if (opts.use_nft && opts.tcp_established_ok && network_prepare(tree_id) < 0) ++ return -1; ++ + /* Each connection will be locked on dump */ + if (!(root_ns_mask & CLONE_NEWNET)) + return 0; +@@ -2921,7 +2933,7 @@ int network_lock(void) + return network_lock_internal(); + } + +-void network_unlock(void) ++void network_unlock(pid_t tree_id) + { + pr_info("Unlock network\n"); + +@@ -2930,8 +2942,13 @@ void network_unlock(void) + write_reserved_ports(RESERVED_PORTS_PATH); + } + +- cpt_unlock_tcp_connections(); +- rst_unlock_tcp_connections(); ++ if (opts.use_nft && opts.tcp_established_ok) ++ network_delete_rule(tree_id); ++ ++ cpt_unlock_tcp_connections(opts.use_nft); ++ ++ if (!opts.use_nft) ++ rst_unlock_tcp_connections(); + + if (root_ns_mask & CLONE_NEWNET) { + run_scripts(ACT_NET_UNLOCK); +diff --git a/criu/netfilter.c b/criu/netfilter.c +index 368651c..b2ec7ed 100644 +--- a/criu/netfilter.c ++++ b/criu/netfilter.c +@@ -15,6 +15,8 @@ + #include "sk-inet.h" + #include "kerndat.h" + ++#include "nftables.h" ++ + static char buf[512]; + + /* +@@ -129,13 +131,19 @@ static int nf_connection_switch(struct inet_sk_desc *sk, bool lock) + return ret; + } + +-int nf_lock_connection(struct inet_sk_desc *sk) ++int nf_lock_connection(struct inet_sk_desc *sk, pid_t tree_id, bool 
use_nft) + { +- return nf_connection_switch(sk, true); ++ if (use_nft) ++ return nft_connection_switch(sk, true, tree_id); ++ else ++ return nf_connection_switch(sk, true); + } + +-int nf_unlock_connection(struct inet_sk_desc *sk) ++int nf_unlock_connection(struct inet_sk_desc *sk, bool use_nft) + { ++ if (use_nft) ++ return 0; ++ + return nf_connection_switch(sk, false); + } + +diff --git a/criu/nftables.c b/criu/nftables.c +index 817f157..739aee4 100644 +--- a/criu/nftables.c ++++ b/criu/nftables.c +@@ -653,25 +653,34 @@ int network_prepare(pid_t tree_id) + return mnl_common(network_prepare_internal, NULL, &tree_id); + } + +-static int network_unprepare_internal(struct mnl_params *params, +- batch_func_t _, void *args) ++static int network_delete_rule_internal(struct mnl_params *params, ++ batch_func_t _, void *args) + { + pid_t tree_id = *(pid_t *)args; + +- if (nft_rule_common(params, tree_id, false) < 0) +- return -1; ++ return nft_rule_common(params, tree_id, false); ++} + +- if (nft_set_common(params, tree_id, false) < 0) +- return -1; ++void network_delete_rule(pid_t tree_id) ++{ ++ pr_info("unlock network\n"); + +- return 0; ++ mnl_common(network_delete_rule_internal, NULL, &tree_id); ++} ++ ++static int network_delete_set_internal(struct mnl_params *params, ++ batch_func_t _, void *args) ++{ ++ pid_t tree_id = *(pid_t *)args; ++ ++ return nft_set_common(params, tree_id, false); + } + +-void network_unprepare(pid_t tree_id) ++void network_delete_set(pid_t tree_id) + { +- pr_info("Unprepare network\n"); ++ pr_info("clear nft set\n"); + +- mnl_common(network_unprepare_internal, NULL, &tree_id); ++ mnl_common(network_delete_set_internal, NULL, &tree_id); + } + + static int add_set_elem_internal(struct nftnl_set *s, void *data, size_t len) +diff --git a/criu/sk-tcp.c b/criu/sk-tcp.c +index 67846c3..a9a9047 100644 +--- a/criu/sk-tcp.c ++++ b/criu/sk-tcp.c +@@ -55,7 +55,7 @@ static int tcp_repair_established(int fd, struct inet_sk_desc *sk) + } + + if 
(!(root_ns_mask & CLONE_NEWNET)) { +- ret = nf_lock_connection(sk); ++ ret = nf_lock_connection(sk, opts.tree_id, opts.use_nft); + if (ret < 0) + goto err2; + } +@@ -70,21 +70,21 @@ static int tcp_repair_established(int fd, struct inet_sk_desc *sk) + + err3: + if (!(root_ns_mask & CLONE_NEWNET)) +- nf_unlock_connection(sk); ++ nf_unlock_connection(sk, opts.use_nft); + err2: + close(sk->rfd); + err1: + return -1; + } + +-static void tcp_unlock_one(struct inet_sk_desc *sk) ++static void tcp_unlock_one(struct inet_sk_desc *sk, bool use_nft) + { + int ret; + + list_del(&sk->rlist); + +- if (!(root_ns_mask & CLONE_NEWNET)) { +- ret = nf_unlock_connection(sk); ++ if (!(root_ns_mask & CLONE_NEWNET) && !use_nft) { ++ ret = nf_unlock_connection(sk, false); + if (ret < 0) + pr_perror("Failed to unlock TCP connection"); + } +@@ -101,12 +101,12 @@ static void tcp_unlock_one(struct inet_sk_desc *sk) + close(sk->rfd); + } + +-void cpt_unlock_tcp_connections(void) ++void cpt_unlock_tcp_connections(bool use_nft) + { + struct inet_sk_desc *sk, *n; + + list_for_each_entry_safe(sk, n, &cpt_tcp_repair_sockets, rlist) +- tcp_unlock_one(sk); ++ tcp_unlock_one(sk, use_nft); + } + + static int dump_tcp_conn_state(struct inet_sk_desc *sk) +-- +2.34.0 + diff --git a/backport-0037--remove-sigaction-handler-register-in-restorer.patch b/backport-0037--remove-sigaction-handler-register-in-restorer.patch new file mode 100644 index 0000000000000000000000000000000000000000..83c7ebdf3a40b594c60f82eb873b058eda90a4bc --- /dev/null +++ b/backport-0037--remove-sigaction-handler-register-in-restorer.patch @@ -0,0 +1,51 @@ +From ffd254baa6f8ff43fb58d9b7c745684608cc4541 Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Tue, 24 Aug 2021 12:26:35 +0000 +Subject: [PATCH 37/50] remove sigaction handler register in restorer + +The sigaction handler register in restorer will change +the original sigaction handler of restoring app, +We need to remove them or recover them before resuming app. 
+ +Signed-off-by: Jingxian He +--- + criu/pie/restorer.c | 20 -------------------- + 1 file changed, 20 deletions(-) + +diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c +index d87236d..603cbee 100644 +--- a/criu/pie/restorer.c ++++ b/criu/pie/restorer.c +@@ -1632,29 +1632,9 @@ long __export_restore_task(struct task_restore_args *args) + pr_err("Failed to set SIGCHLD %ld\n", ret); + goto core_restore_end; + } +- ret = sys_sigaction(SIGSEGV, &act, NULL, sizeof(k_rtsigset_t)); +- if (ret) { +- pr_err("Failed to set SIGCHLD %ld\n", ret); +- goto core_restore_end; +- } +- +- ret = sys_sigaction(SIGBUS, &act, NULL, sizeof(k_rtsigset_t)); +- if (ret) { +- pr_err("Failed to set SIGCHLD %ld\n", ret); +- goto core_restore_end; +- } +- +- ret = sys_sigaction(SIGILL, &act, NULL, sizeof(k_rtsigset_t)); +- if (ret) { +- pr_err("Failed to set SIGCHLD %ld\n", ret); +- goto core_restore_end; +- } + + ksigemptyset(&to_block); + ksigaddset(&to_block, SIGCHLD); +- ksigaddset(&to_block, SIGSEGV); +- ksigaddset(&to_block, SIGBUS); +- ksigaddset(&to_block, SIGILL); + ret = sys_sigprocmask(SIG_UNBLOCK, &to_block, NULL, sizeof(k_rtsigset_t)); + if (ret) { + pr_err("Failed to unblock SIGCHLD %ld\n", ret); +-- +2.34.0 + diff --git a/backport-0038--remove-ignore_special_dump-option.patch b/backport-0038--remove-ignore_special_dump-option.patch new file mode 100644 index 0000000000000000000000000000000000000000..b8d9ea5467cc81fde0a84e425e9fa276786b55f2 --- /dev/null +++ b/backport-0038--remove-ignore_special_dump-option.patch @@ -0,0 +1,72 @@ +From c5521fc5761a2fab580a7ac1545bd8bb0d5296e5 Mon Sep 17 00:00:00 2001 +From: root +Date: Wed, 8 Sep 2021 03:45:59 +0000 +Subject: [PATCH 38/50] remove ignore_special_dump option + +Remove the useless ignore_special_dump option. 
+ +Signed-off-by: Jingxian He +--- + criu/config.c | 1 - + criu/cr-dump.c | 2 +- + criu/include/cr_options.h | 1 - + criu/seize.c | 6 +++--- + 4 files changed, 4 insertions(+), 6 deletions(-) + +diff --git a/criu/config.c b/criu/config.c +index 676b424..cdafe17 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -551,7 +551,6 @@ int parse_options(int argc, char **argv, bool *usage_error, + BOOL_OPT("dump-char-dev", &opts.dump_char_dev), + BOOL_OPT("mask-exit-notify", &opts.mask_exit_notify), + BOOL_OPT("weak-file-check", &opts.weak_file_check), +- BOOL_OPT("ignore-special-dump", &opts.ignore_special_dump), + BOOL_OPT("file-locks-repair", &opts.file_locks_repair), + {"reserve-ports", required_argument, 0, 'P' }, + BOOL_OPT("use-nft", &opts.use_nft), +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index d54b7a7..2e940d5 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -1806,7 +1806,7 @@ static int cr_dump_finish(int ret) + close_service_fd(CR_PROC_FD_OFF); + close_image_dir(); + +- if (ret == 0 && opts.pin_memory && !opts.ignore_special_dump) { ++ if (ret == 0 && opts.pin_memory) { + pr_info("start restore_task_special_pages\n"); + restore_task_special_pages(0); + } +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 62fbbb1..236d1c7 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -183,7 +183,6 @@ struct cr_options { + int dump_char_dev; + int mask_exit_notify; + int weak_file_check; +- int ignore_special_dump; + int file_locks_repair; + char *share_dst_ports; + char *share_src_ports; +diff --git a/criu/seize.c b/criu/seize.c +index c615971..056454d 100644 +--- a/criu/seize.c ++++ b/criu/seize.c +@@ -682,9 +682,9 @@ static void unseize_task_and_threads(const struct pstree_item *item, int st) + if (item->pid->state == TASK_DEAD) + return; + +- if (opts.pin_memory && !opts.ignore_special_dump) { +- for (i = 0; i < item->nr_threads; i++) +- dump_task_special_pages(item->threads[i].real); ++ if (opts.pin_memory) { 
++ for (i = 0; i < item->nr_threads; i++) ++ dump_task_special_pages(item->threads[i].real); + } + if (opts.mask_exit_notify) { + ret = mask_task_exit_notify(item->threads[0].real, true); +-- +2.34.0 + diff --git a/backport-0039--add-clear-pin-mem-and-init-page-map-option.patch b/backport-0039--add-clear-pin-mem-and-init-page-map-option.patch new file mode 100644 index 0000000000000000000000000000000000000000..2419b20921ae236f1873902950ca39ed8f93de79 --- /dev/null +++ b/backport-0039--add-clear-pin-mem-and-init-page-map-option.patch @@ -0,0 +1,106 @@ +From c242956d19c615d4a6985f77a47822f15e2acc60 Mon Sep 17 00:00:00 2001 +From: root +Date: Wed, 8 Sep 2021 08:23:11 +0000 +Subject: [PATCH 39/50] add clear pin mem and init page map option + +Add 'clear-pin-mem' option for clearing pin memory data, +and 'init-page-map' option for initializationing buffer for +reading page map info. + +Signed-off-by: Jingxian He +--- + criu/crtools.c | 30 ++++++++++++++++++++++++++++++ + criu/include/restorer.h | 4 ++++ + 2 files changed, 34 insertions(+) + +diff --git a/criu/crtools.c b/criu/crtools.c +index aaae44e..d95d903 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -46,6 +46,7 @@ + + #include "setproctitle.h" + #include "sysctl.h" ++#include "restorer.h" + + void flush_early_log_to_stderr(void) __attribute__((destructor)); + +@@ -68,6 +69,25 @@ static int image_dir_mode(char *argv[], int optind) + return -1; + } + ++int init_pagemap_read(int para) ++{ ++ int fd, ret; ++ ++ fd = open(PIN_MEM_FILE, O_RDWR, 0); ++ if (fd < 0) { ++ pr_warn("error open file: %s\n", PIN_MEM_FILE); ++ return -1; ++ } ++ ++ ret = ioctl(fd, INIT_PAGEMAP_READ, (unsigned long) ¶); ++ if (ret < 0) { ++ pr_warn("Init pagemap read fail, errno: %s\n", strerror(errno)); ++ } ++ ++ close(fd); ++ return ret; ++} ++ + int main(int argc, char *argv[], char *envp[]) + { + int ret = -1; +@@ -173,6 +193,14 @@ int main(int argc, char *argv[], char *envp[]) + goto usage; + } + ++ if (!strcmp(argv[optind], 
"clear-pin-memory")) { ++ return clear_pin_mem(0); ++ } ++ ++ if (!strcmp(argv[optind], "init-pagemap-read")) { ++ return init_pagemap_read(0); ++ } ++ + /* We must not open imgs dir, if service is called */ + if (strcmp(argv[optind], "service")) { + ret = open_image_dir(opts.imgs_dir, image_dir_mode(argv, optind)); +@@ -324,6 +352,8 @@ usage: + " dedup remove duplicates in memory dump\n" + " cpuinfo dump writes cpu information into image file\n" + " cpuinfo check validates cpu information read from image file\n" ++" clear-pin-memory clear pin memory manage data\n" ++" init-pagemap-read init data buffer for reading page map info\n" + ); + + if (usage_error) { +diff --git a/criu/include/restorer.h b/criu/include/restorer.h +index f6b45d6..affc155 100644 +--- a/criu/include/restorer.h ++++ b/criu/include/restorer.h +@@ -334,12 +334,14 @@ enum { + #define _SET_PIN_MEM_AREA 1 + #define _CLEAR_PIN_MEM_AREA 2 + #define _REMAP_PIN_MEM_AREA 3 ++#define _INIT_PAGEMAP_READ 5 + #define _DUMP_SEPCIAL_PAGES 6 + #define _RETORE_SEPCIAL_PAGES 7 + #define _SET_FORK_PID 8 + #define SET_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _SET_PIN_MEM_AREA, struct pin_mem_area_set) + #define CLEAR_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _CLEAR_PIN_MEM_AREA, int) + #define REMAP_PIN_MEM_AREA _IOW(PIN_MEM_MAGIC, _REMAP_PIN_MEM_AREA, int) ++#define INIT_PAGEMAP_READ _IOW(PIN_MEM_MAGIC, _INIT_PAGEMAP_READ, int) + #define DUMP_SEPCIAL_PAGES _IOW(PIN_MEM_MAGIC, _DUMP_SEPCIAL_PAGES, int) + #define RETORE_SEPCIAL_PAGES _IOW(PIN_MEM_MAGIC, _RETORE_SEPCIAL_PAGES, int) + #define SET_FORK_PID _IOW(PIN_MEM_MAGIC, _SET_FORK_PID, int) +@@ -358,4 +360,6 @@ struct pin_mem_area_set { + struct pin_mem_area mem_area[MAX_PIN_MEM_AREA_NUM]; + }; + ++int clear_pin_mem(int pid); ++ + #endif /* __CR_RESTORER_H__ */ +-- +2.34.0 + diff --git a/backport-0040--mmap-restore-dev-hisi_sec2-deivce-vma.patch b/backport-0040--mmap-restore-dev-hisi_sec2-deivce-vma.patch new file mode 100644 index 
0000000000000000000000000000000000000000..6bbabb664372686d6aed65d6f7d32615e655162b --- /dev/null +++ b/backport-0040--mmap-restore-dev-hisi_sec2-deivce-vma.patch @@ -0,0 +1,492 @@ +From 628608e49d9652f3ba48688c544a6840af453795 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Fri, 10 Sep 2021 16:06:55 +0800 +Subject: [PATCH 40/50] mmap: restore /dev/hisi_sec2* deivce vma + +There are two kinds of vmas: anonymous vma and file-based vma. For +anonymous vma, criu just maps the area and fills content into it; for file-based +vma, criu preprocesses it, such as setting the `open_vm()` callback function. + +`/dev/hisi_sec2*` char device is different from the normal. The `open`, +`mmap`, and `close` syscall actions have a special meaning. + - `open`: allocate physical resource of the device + - `mmap`: create instance + - `close`: release physical resource +The vma means the instance in this device. One fd may be associated with +a group of instances: one mmio (vma size is 2 pages, pgoff is 0), one dus +(vma size is 37 pages, pgoff is 0x2000). As for dus vma, it's split into two +vmas by `mprotect(addr, 0x5000, PROT_READ)`: one size is 0x20000, one +size is 0x5000. + +This patch makes the /dev/hisi_sec* restore possible. Idea: + It's impossible for criu to know the relationship between vma and the +mapped file fd. Therefore, just collect the total fds number during +collecting /dev/hisi_sec* files, then the fd is tagged with which +function is used during vma restoration, and assign the unused fd to the +specific vma. And during `mmap()` process, dus vma is split by `mprotect`. + +Note: +- criu uses ino to index the fd. +- this physical device driver is hisi_sec2.ko, which is located in + `drivers/crypto/hisilicon/sec2/` of linux kernel. +- this device name has prefix "hisi_sec2" that is found from + `drivers/crypto/hisilicon/sec2/sec_main.c`.
+ +Signed-off-by: fu.lin +--- + criu/files-reg.c | 113 ++++++++++++++++++++++++++++++++++ + criu/files.c | 17 ++++-- + criu/include/files-reg.h | 8 +++ + criu/include/util.h | 8 +++ + criu/include/vma.h | 12 ++++ + criu/pie/restorer.c | 129 ++++++++++++++++++++++++++++++++++++++- + criu/proc_parse.c | 19 +++--- + 7 files changed, 292 insertions(+), 14 deletions(-) + +diff --git a/criu/files-reg.c b/criu/files-reg.c +index 46e9eab..01e0895 100644 +--- a/criu/files-reg.c ++++ b/criu/files-reg.c +@@ -2441,6 +2441,109 @@ static int open_filemap(int pid, struct vma_area *vma) + return 0; + } + ++#define MAX_HISI_SEC_SIZE 3 /* one physical device expose three char dev */ ++static struct hlist_head hisi_sec_fds_hash[MAX_HISI_SEC_SIZE]; ++ ++static int collect_hisi_sec_fds(struct list_head *list) ++{ ++ struct fdinfo_list_entry *fle, *tmp; ++ struct chrfile_info *ci; ++ struct file_desc *d; ++ struct hisi_sec_desc *desc; ++ int idx; ++ int nr = 0; ++ ++ for (idx = 0; idx < MAX_HISI_SEC_SIZE; idx++) ++ INIT_HLIST_HEAD(&hisi_sec_fds_hash[idx]); ++ ++ list_for_each_entry_safe(fle, tmp, list, ps_list) { ++ d = fle->desc; ++ ++ if (d->ops->type != FD_TYPES__CHR) ++ continue; ++ ++ ci = container_of(d, struct chrfile_info, d); ++ ++ if (strstr(ci->path, HISI_SEC_DEV) != NULL) { ++ desc = shmalloc(sizeof(*desc)); ++ if (desc == NULL) ++ return -ENOMEM; ++ ++ desc->name = ci->path; ++ desc->fd = fle->fe->fd; ++ desc->mmio = desc->dus = 0; ++ ++ idx = (ci->path[strlen(ci->path)-1] - '0') % MAX_HISI_SEC_SIZE; ++ hlist_add_head(&desc->hash, &hisi_sec_fds_hash[idx]); ++ ++ nr += 1; ++ } ++ } ++ ++ return nr; ++} ++ ++static long delivery_hisi_sec_fd(struct list_head *fds, struct vma_area *vma) ++{ ++ extern unsigned hisi_sec_fds_n; /* defined in criu/files.c */ ++ static bool initialized = false; ++ struct hisi_sec_desc *desc; ++ int fd = -1, idx; ++ ++ if (!initialized) { ++ int nr; ++ ++ pr_info("find %d fds for hisi_sec char device\n", hisi_sec_fds_n); ++ ++ nr = 
collect_hisi_sec_fds(fds); ++ if (nr != hisi_sec_fds_n) { ++ pr_err("Collected fds(%d) aren't equal opened(%d)\n", ++ nr, hisi_sec_fds_n); ++ return -1; ++ } ++ ++ initialized = true; ++ } else if (vma->e->pgoff != HISI_SEC_MMIO && vma->e->pgoff != HISI_SEC_DUS) { ++ /* It's impossible value for fd, just as a tag to show it's a ++ * vma by `mprotect` syscall. ++ */ ++ return LONG_MAX; ++ } ++ ++ idx = (vma->e->name[strlen(vma->e->name)-1] - '0') % MAX_HISI_SEC_SIZE; ++ hlist_for_each_entry(desc, &hisi_sec_fds_hash[idx], hash) { ++ if (strcmp(desc->name, vma->e->name) != 0) ++ continue; ++ ++ if (vma->e->pgoff == HISI_SEC_MMIO && !desc->mmio) { ++ fd = desc->fd; ++ desc->mmio = true; ++ break; ++ } else if (vma->e->pgoff == HISI_SEC_DUS && !desc->dus) { ++ fd = desc->fd; ++ desc->dus = true; ++ break; ++ } ++ } ++ ++ return fd; ++} ++ ++static int handle_hisi_vma(struct list_head *fds, struct vma_area *vma) ++{ ++ long fd = delivery_hisi_sec_fd(fds, vma); ++ ++ if (fd < 0) { ++ pr_err("find fd for char dev vma pgoff %lx named %s failed.\n", ++ vma->e->pgoff, vma->e->name); ++ return -1; ++ } ++ ++ vma->e->fd = fd; ++ ++ return 0; ++} ++ + int collect_chr_map(struct pstree_item *me, struct vma_area *vma) + { + struct list_head *list = &rsti(me)->fds; +@@ -2448,6 +2551,13 @@ int collect_chr_map(struct pstree_item *me, struct vma_area *vma) + struct chrfile_info *ci; + bool exist_fd; + ++ if (strstr(vma->e->name, HISI_SEC_DEV) != NULL) { ++ if (handle_hisi_vma(list, vma) != 0) { ++ return -1; ++ } else ++ goto out; ++ } ++ + list_for_each_entry_safe(fle, tmp, list, ps_list) { + struct file_desc *d = fle->desc; + +@@ -2466,6 +2576,9 @@ int collect_chr_map(struct pstree_item *me, struct vma_area *vma) + if (!exist_fd) + return -EEXIST; + ++out: ++ pr_info(" `- find fd %ld for dev %s at this vma\n", vma->e->fd, vma->e->name); ++ + return 0; + } + +diff --git a/criu/files.c b/criu/files.c +index cc812b9..3efbea2 100644 +--- a/criu/files.c ++++ b/criu/files.c +@@ -64,6 
+64,8 @@ static LIST_HEAD(fake_master_head); + + static u32 max_file_desc_id = 0; + ++unsigned hisi_sec_fds_n; ++ + static void init_fdesc_hash(void) + { + int i; +@@ -1847,11 +1849,14 @@ out: + static int chrfile_open(struct file_desc *d, int *new_fd) + { + int fd, mntns_root; +- int ret = 0; ++ int ret = -1; + struct chrfile_info *ci; + + ci = container_of(d, struct chrfile_info, d); + ++ pr_info("charfile: Opening %s (repair %d index %d)\n", ++ ci->path, ci->cfe->repair, ci->cfe->index); ++ + mntns_root = open_pid_proc(getpid()); + fd = openat(mntns_root, ci->path, ci->cfe->flags | O_REPAIR); + if (fd < 0){ +@@ -1867,6 +1872,8 @@ static int chrfile_open(struct file_desc *d, int *new_fd) + } + + *new_fd = fd; ++ ret = 0; ++ + return ret; + err: + close(fd); +@@ -1889,10 +1896,12 @@ static int collect_one_chrfile(void *o, ProtobufCMessage *base, struct cr_img *i + else + ci->path = ci->cfe->name; + +- pr_info("Collected chr file: %#x, name: %s\n", ci->cfe->id, ci->path); +- file_desc_add(&ci->d, ci->cfe->id, &chrfile_desc_ops); ++ /* collect `/dev/hisi_sec2*` fds */ ++ if (strstr(ci->path, HISI_SEC_DEV) != NULL) ++ hisi_sec_fds_n += 1; + +- return 0; ++ pr_info("Collected chr file: %#x, name: %s\n", ci->cfe->id, ci->path); ++ return file_desc_add(&ci->d, ci->cfe->id, &chrfile_desc_ops); + } + + struct collect_image_info chrfile_cinfo = { +diff --git a/criu/include/files-reg.h b/criu/include/files-reg.h +index 4ec0e14..6c15a19 100644 +--- a/criu/include/files-reg.h ++++ b/criu/include/files-reg.h +@@ -33,6 +33,14 @@ struct chrfile_info { + char *path; + }; + ++struct hisi_sec_desc { ++ struct hlist_node hash; ++ char *name; ++ bool mmio; ++ bool dus; ++ int fd; ++}; ++ + extern int open_reg_by_id(u32 id); + extern int open_reg_fd(struct file_desc *); + extern int open_path(struct file_desc *, int (*open_cb)(int ns_root_fd, +diff --git a/criu/include/util.h b/criu/include/util.h +index d1510fc..c176981 100644 +--- a/criu/include/util.h ++++ b/criu/include/util.h +@@ 
-432,4 +432,12 @@ int mask_task_exit_notify(int pid, bool mask); + + #define RESERVED_PORTS_PATH "/proc/sys/net/ipv4/ip_local_reserved_ports" + ++#define HISI_SEC_DEV "hisi_sec2" /* `/dev/hisi_sec2*` char device */ ++ ++/* here is the selection of offset in `mmap`, they're from drivers */ ++enum hisi_sec_dev { ++ HISI_SEC_MMIO = 0x0, ++ HISI_SEC_DUS = 0x2000, ++}; ++ + #endif /* __CR_UTIL_H__ */ +diff --git a/criu/include/vma.h b/criu/include/vma.h +index 5e3f352..f649a95 100644 +--- a/criu/include/vma.h ++++ b/criu/include/vma.h +@@ -133,4 +133,16 @@ static inline bool vma_entry_can_be_lazy(VmaEntry *e) + !(vma_entry_is(e, VMA_AREA_VSYSCALL))); + } + ++struct vma_attr { ++ int prot; ++ int flags; ++}; ++ ++enum ALIEN_MAP_METHOD { ++ PGOFF_IS_ZERO, ++ MAP_THEN_PROTECT, ++ ++ MAX_ALIEN_MAP_METHOD, ++}; ++ + #endif /* __CR_VMA_H__ */ +diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c +index 603cbee..949384e 100644 +--- a/criu/pie/restorer.c ++++ b/criu/pie/restorer.c +@@ -901,6 +901,129 @@ static unsigned long restore_mapping(VmaEntry *vma_entry) + return addr; + } + ++static unsigned long restore_map_then_protect_mapping(VmaEntry *curr, ++ struct vma_attr *curr_attr, ++ VmaEntry *next, ++ struct vma_attr *next_attr) ++{ ++ int retval; ++ unsigned long addr; ++ ++ if (next->fd != LONG_MAX ++ || curr->end != next->start ++ || (vma_entry_len(curr) + curr->pgoff) != next->pgoff ++ || curr->prot == next->prot ++ || curr->flags != next->flags) { ++ pr_err("They looks not currect:\n"); ++ pr_err(" `- vma A: (%x %x %d %lx)\n", ++ curr_attr->prot, curr_attr->flags, ++ (int)curr->fd, curr->pgoff); ++ pr_err(" `- vma B: (%x %x %d %lx)\n", ++ next_attr->prot, next_attr->flags, ++ (int)next->fd, next->pgoff); ++ return -1; ++ } ++ ++ pr_info("\tmmap(%x %x %d %lx) in map then protect mapping\n", ++ curr_attr->prot, curr_attr->flags, ++ (int)curr->fd, curr->pgoff); ++ ++ addr = sys_mmap(decode_pointer(curr->start), ++ vma_entry_len(curr) + vma_entry_len(next), ++ 
curr_attr->prot, curr_attr->flags, curr->fd, curr->pgoff); ++ if (addr != curr->start) { ++ pr_err("%s: mmap failed with code %ld\n", __func__, addr); ++ goto out; ++ } ++ ++ pr_info("\t mprotect(%x)\n", next_attr->prot); ++ retval = sys_mprotect(decode_pointer(next->start), ++ vma_entry_len(next), next_attr->prot); ++ if (retval != 0) { ++ addr = retval; ++ pr_err("%s: mprotect failed with code %d\n", __func__, retval); ++ } ++ ++out: ++ return addr; ++} ++ ++static unsigned long restore_pgoff_is_zero_mapping(VmaEntry *curr, struct vma_attr *attr) ++{ ++ unsigned long addr; ++ ++ pr_debug("\tmmap(%x %x %d %lx) in pgoff is zero mapping\n", ++ attr->prot, attr->flags, (int)curr->fd, curr->pgoff); ++ ++ addr = sys_mmap(decode_pointer(curr->start), ++ vma_entry_len(curr), ++ attr->prot, attr->flags, ++ curr->fd, curr->pgoff); ++ ++ return addr; ++} ++ ++static unsigned long restore_hisi_sec_mapping(struct task_restore_args *args, ++ int i, int *step) ++{ ++ VmaEntry *curr = args->vmas + i; ++ VmaEntry *next = args->vmas + i + 1; ++ struct vma_attr curr_attr = { ++ .prot = curr->prot, ++ .flags = curr->flags | MAP_FIXED, ++ }; ++ struct vma_attr next_attr = { ++ .prot = next->prot, ++ .flags = next->flags | MAP_FIXED, ++ }; ++ unsigned long addr; ++ ++ switch (curr->pgoff) { ++ case HISI_SEC_MMIO: ++ addr = restore_pgoff_is_zero_mapping(curr, &curr_attr); ++ break; ++ case HISI_SEC_DUS: ++ *step = 2; ++ addr = restore_map_then_protect_mapping(curr, &curr_attr, next, &next_attr); ++ break; ++ default: ++ pr_err("invalid pgoff %lx for vma\n", curr->pgoff); ++ return -1; ++ } ++ return addr; ++} ++ ++static bool find(const char *s1, const char *s2) ++{ ++ if (s1 == NULL || s2 == NULL) ++ return NULL; ++ ++ while (*s1 != '\0' && *s2 != '\0') { ++ if (*s1 == *s2) { ++ s1 += 1; ++ s2 += 1; ++ } else ++ s1 += 1; ++ ++ if (*s2 == '\0') ++ return true; ++ } ++ ++ return false; ++} ++ ++static unsigned long distribute_restore_mapping(struct task_restore_args *args, ++ int i, int 
*step) ++{ ++ VmaEntry *vma = args->vmas + i; ++ struct vma_names *vma_name = args->vma_names + i; ++ ++ if (vma_entry_is(vma, VMA_AREA_CHR) && find(vma_name->name, HISI_SEC_DEV)) ++ return restore_hisi_sec_mapping(args, i, step); ++ else ++ return restore_mapping(vma); ++} ++ + /* + * This restores aio ring header, content, head and in-kernel position + * of tail. To set tail, we write to /dev/null and use the fact this +@@ -1588,7 +1711,7 @@ int write_fork_pid(int pid) + long __export_restore_task(struct task_restore_args *args) + { + long ret = -1; +- int i; ++ int i, step; + VmaEntry *vma_entry; + unsigned long va; + struct restore_vma_io *rio; +@@ -1738,7 +1861,7 @@ long __export_restore_task(struct task_restore_args *args) + /* + * OK, lets try to map new one. + */ +- for (i = 0; i < args->vmas_n; i++) { ++ for (i = 0, step = 1; i < args->vmas_n; i += step, step = 1) { + vma_entry = args->vmas + i; + vma_name = args->vma_names + i; + +@@ -1756,7 +1879,7 @@ long __export_restore_task(struct task_restore_args *args) + if (vma_entry_is(vma_entry, VMA_PREMMAPED)) + continue; + +- va = restore_mapping(vma_entry); ++ va = distribute_restore_mapping(args, i, &step); + + if (va != vma_entry->start) { + pr_err("Can't restore %"PRIx64" mapping with %lx\n", vma_entry->start, va); +diff --git a/criu/proc_parse.c b/criu/proc_parse.c +index 2c7b926..b3d1c0b 100644 +--- a/criu/proc_parse.c ++++ b/criu/proc_parse.c +@@ -659,17 +659,22 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, + /* regular file mapping -- supported */; + else if (S_ISCHR(st_buf->st_mode)) { + /* devzero mapping -- also makes sense */; +- if (opts.dump_char_dev && (strstr(file_path, "uverbs") != NULL)) { +- int len = strlen(file_path) + 1; +- vma_area->e->status |= VMA_AREA_CHR; +- vma_area->e->name = xmalloc(len); +- if (!vma_area->e->name) { ++ ++ if (!opts.dump_char_dev) { ++ /* do nothing, it's original progoss */ ++ } else if (strstr(file_path, "uverbs") != NULL ++ || 
strstr(file_path, HISI_SEC_DEV) != NULL) { ++ int len = strlen(file_path) + 1; ++ ++ vma_area->e->status |= VMA_AREA_CHR; ++ vma_area->e->name = xmalloc(len); ++ if (!vma_area->e->name) { + pr_err("alloc vma area name fail\n"); + goto err; + } + strncpy(vma_area->e->name, file_path, len); +- pr_info("uverbs name content is: %s\n", vma_area->e->name); +- } ++ pr_info("vma name content is: %s\n", vma_area->e->name); ++ } + } else { + pr_err("Can't handle non-regular mapping on %d's map %"PRIx64"\n", pid, vma_area->e->start); + goto err; +-- +2.34.0 + diff --git a/backport-0041--fix-fds-list-restore-and-rollback-problem.patch b/backport-0041--fix-fds-list-restore-and-rollback-problem.patch new file mode 100644 index 0000000000000000000000000000000000000000..31c75e2caf6d4ef15b4bef8d75bace3cc29927cc --- /dev/null +++ b/backport-0041--fix-fds-list-restore-and-rollback-problem.patch @@ -0,0 +1,55 @@ +From 96b0adc3140c6c68c9818f08f2e538cbed6a0f04 Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Thu, 16 Sep 2021 13:50:46 +0000 +Subject: [PATCH 41/50] fix fds list restore and rollback problem + +When there exist multi processes need to dump, the child process may +have the same fds as parent process. During the restore processing, +criu choose the process which has the min pid value to be the master +process to recover fds. However, choosing the parent process as the +master process is more suitable. 
+ +Signed-off-by: Jingxian He +--- + criu/cr-restore.c | 5 ++--- + criu/files.c | 7 +------ + 2 files changed, 3 insertions(+), 9 deletions(-) + +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 945d984..e94d5a6 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -2068,10 +2068,9 @@ static int restore_task_with_children(void *_arg) + return 0; + + err: +- if (current->parent == NULL) { +- do_notifier_rollback(opts.with_notifier_kup, POST_UPDATE_KERNEL_COMPLETE); ++ do_notifier_rollback(opts.with_notifier_kup, POST_UPDATE_KERNEL_COMPLETE); ++ if (current->parent == NULL) + futex_abort_and_wake(&task_entries->nr_in_progress); +- } + exit(1); + } + +diff --git a/criu/files.c b/criu/files.c +index 3efbea2..ec0dcec 100644 +--- a/criu/files.c ++++ b/criu/files.c +@@ -925,12 +925,7 @@ static struct fdinfo_list_entry *alloc_fle(int pid, FdinfoEntry *fe) + + static void __collect_desc_fle(struct fdinfo_list_entry *new_le, struct file_desc *fdesc) + { +- struct fdinfo_list_entry *le; +- +- list_for_each_entry_reverse(le, &fdesc->fd_info_head, desc_list) +- if (pid_rst_prio_eq(le->pid, new_le->pid)) +- break; +- list_add(&new_le->desc_list, &le->desc_list); ++ list_add_tail(&new_le->desc_list, &fdesc->fd_info_head); + } + + static void collect_desc_fle(struct fdinfo_list_entry *new_le, +-- +2.34.0 + diff --git a/backport-0042--log-print-error-log-to-dev-kmsg.patch b/backport-0042--log-print-error-log-to-dev-kmsg.patch new file mode 100644 index 0000000000000000000000000000000000000000..dd51158658285af8dd591fef10de37e908dfc330 --- /dev/null +++ b/backport-0042--log-print-error-log-to-dev-kmsg.patch @@ -0,0 +1,83 @@ +From e0b30e8a6f5c9fe2f00291a55f6b684ef7b4d1f5 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Tue, 19 Oct 2021 20:53:19 +0800 +Subject: [PATCH 1/1] log: print error log to /dev/kmsg + +The criu log can't be flushed to disk when OS crash in storage +environment, therefore, output high level msg to /dev/kmsg. 
+ +Signed-off-by: fu.lin +--- + criu/Makefile.crtools | 1 + + criu/include/log.h | 3 +++ + criu/kmsg.c | 16 ++++++++++++++++ + criu/log.c | 4 ++++ + 4 files changed, 24 insertions(+) + create mode 100644 criu/kmsg.c + +diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools +index cda5b82..0bea576 100644 +--- a/criu/Makefile.crtools ++++ b/criu/Makefile.crtools +@@ -92,6 +92,7 @@ obj-y += timens.o + obj-y += devname.o + obj-y += mnl.o + obj-y += nftables.o ++obj-y += kmsg.o + obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o + obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o + CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 +diff --git a/criu/include/log.h b/criu/include/log.h +index 58d0123..11acbf9 100644 +--- a/criu/include/log.h ++++ b/criu/include/log.h +@@ -72,6 +72,9 @@ void flush_early_log_buffer(int fd); + print_on_level(LOG_DEBUG, \ + LOG_PREFIX fmt, ##__VA_ARGS__) + ++#include ++void write_kmsg(const void *buf, size_t count); ++ + #ifndef CR_NOGLIBC + + #define pr_perror(fmt, ...) \ +diff --git a/criu/kmsg.c b/criu/kmsg.c +new file mode 100644 +index 0000000..c956dfb +--- /dev/null ++++ b/criu/kmsg.c +@@ -0,0 +1,16 @@ ++#include ++#include ++ ++#define SYSLOG_DEV "/dev/kmsg" ++ ++void write_kmsg(const void *buf, size_t count) ++{ ++ int fd; ++ ++ fd = open(SYSLOG_DEV, O_CLOEXEC | O_WRONLY); ++ if (fd < 0) ++ return; ++ ++ write(fd, buf, count); ++ close(fd); ++} +diff --git a/criu/log.c b/criu/log.c +index 439a899..4254f72 100644 +--- a/criu/log.c ++++ b/criu/log.c +@@ -379,6 +379,10 @@ static void vprint_on_level(unsigned int loglevel, const char *format, va_list p + size += buf_off; + + while (off < size) { ++ if (loglevel <= LOG_WARN) { ++ write_kmsg(buffer + off, size - off); ++ } ++ + ret = write(fd, buffer + off, size - off); + if (ret <= 0) + break; +-- +2.34.0 + diff --git a/backport-0043--improve-char-dev-fd-check-and-repair-method.patch b/backport-0043--improve-char-dev-fd-check-and-repair-method.patch new file mode 100644 index 
0000000000000000000000000000000000000000..75205a343c255a3fb032c8b9746e36f0a2a49c32 --- /dev/null +++ b/backport-0043--improve-char-dev-fd-check-and-repair-method.patch @@ -0,0 +1,68 @@ +From 51f5191755e180324cba798bd3f8a68f15c0344e Mon Sep 17 00:00:00 2001 +From: Jingxian He +Date: Sun, 24 Oct 2021 15:20:27 +0800 +Subject: [PATCH 43/50] improve char dev fd check and repair method + +Some special char dev cannot work in child processes, we make dump fail +when the special char dev fd is in child processes. +In the char dev repair process, user may need recover fd. We should +make thre repair process running after the char dev fd is reopened as dumped fd. + +Signed-off-by: Jingxian He +--- + criu/files.c | 22 ++++++++++++---------- + 1 file changed, 12 insertions(+), 10 deletions(-) + +diff --git a/criu/files.c b/criu/files.c +index ec0dcec..cb50aca 100644 +--- a/criu/files.c ++++ b/criu/files.c +@@ -1275,6 +1275,7 @@ static int open_fd(struct fdinfo_list_entry *fle) + struct file_desc *d = fle->desc; + struct fdinfo_list_entry *flem; + int new_fd = -1, ret; ++ struct chrfile_info *ci; + + pr_info("open file flags:%x\n", fle->fe->flags); + flem = file_master(d); +@@ -1335,6 +1336,17 @@ static int open_fd(struct fdinfo_list_entry *fle) + if (ret != -1 && new_fd >= 0) { + if (setup_and_serve_out(fle, new_fd) < 0) + return -1; ++ if (d->ops->type == FD_TYPES__CHR) { ++ ci = container_of(d, struct chrfile_info, d); ++ if (ci->cfe->repair) { ++ ret = ioctl(fle->fe->fd, IOCTL_CMD_REPAIR , ci->cfe->index); ++ pr_info("repair ioctl return: %d, index: %d\n", ret, ci->cfe->index); ++ if (ret) { ++ close(fle->fe->fd); ++ return -1; ++ } ++ } ++ } + } + out: + if (ret == 0) +@@ -1859,19 +1871,9 @@ static int chrfile_open(struct file_desc *d, int *new_fd) + return -1; + } + +- if (ci->cfe->repair) { +- ret = ioctl(fd, IOCTL_CMD_REPAIR , ci->cfe->index); +- pr_info("repair ioctl return: %d, index: %d\n", ret, ci->cfe->index); +- if (ret) +- goto err; +- } +- + *new_fd = fd; + ret 
= 0; + +- return ret; +-err: +- close(fd); + return ret; + } + +-- +2.34.0 + diff --git a/backport-0044--unix-sk-improve-dgram-robustness.patch b/backport-0044--unix-sk-improve-dgram-robustness.patch new file mode 100644 index 0000000000000000000000000000000000000000..e95905c848d14c4de423448d35d3b8929fccbea2 --- /dev/null +++ b/backport-0044--unix-sk-improve-dgram-robustness.patch @@ -0,0 +1,159 @@ +From a2308891c2c02c4cd492f5b693fbb146e89b7547 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Tue, 26 Oct 2021 11:13:27 +0800 +Subject: [PATCH 44/50] unix sk: improve dgram robustness + +We should try out best to ensure the success of criu. As for unix dgram +socket, criu use re-connect instead of repair instead of unix stream +socket. Therefore, this patch does the following things: + +- detect unix dgram unix sock file when criu dumps unix dgram socket +- add the fault tolerance of unix dgram socket connecting (focus on the + condition of `/dev/log` disappearance when rsyslog restart) + +Signed-off-by: fu.lin +--- + criu/sk-unix.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 97 insertions(+), 2 deletions(-) + +diff --git a/criu/sk-unix.c b/criu/sk-unix.c +index 14e8adc..a9424df 100644 +--- a/criu/sk-unix.c ++++ b/criu/sk-unix.c +@@ -11,6 +11,7 @@ + #include + #include + #include ++#include + + #include "libnetlink.h" + #include "cr_options.h" +@@ -357,6 +358,58 @@ err: + return -ENOENT; + } + ++static int unix_resolve_dgram_name(int lfd, uint32_t id, struct unix_sk_desc *sk, ++ struct unix_sk_desc *peer) ++{ ++ char *name = peer->name; ++ char rpath[PATH_MAX]; ++ struct stat st; ++ struct ns_id *ns; ++ int mntns_root; ++ ++ /* The unnamed or abstrace unix socket contion. */ ++ if (peer->namelen == 0 || name[0] == '\0') ++ return 0; ++ ++ if (name[0] != '/') { ++ pr_warn("Not support relative path, following the original rule." 
++ " socket %#x(%s) ino %d peer %d\n", ++ id, name, sk->sd.ino, sk->peer_ino); ++ return 0; ++ } else if (kdat.sk_unix_file && (root_ns_mask & CLONE_NEWNS)) { ++ pr_warn("Not support mnt namespace, following the original rule." ++ " socket %#x(%s) ino %d peer %d\n", ++ id, name, sk->sd.ino, sk->peer_ino); ++ return 0; ++ } ++ ++ ns = lookup_ns_by_id(root_item->ids->mnt_ns_id, &mnt_ns_desc); ++ if (!ns) { ++ pr_err("Can't find ns. socket %#x(%s) ino %d peer %d\n", ++ id, name, sk->sd.ino, sk->peer_ino); ++ goto err; ++ } ++ ++ mntns_root = mntns_get_root_fd(ns); ++ if (mntns_root < 0) { ++ pr_err("Can't get root fd. socket %#x(%s) ino %d peer %d\n", ++ id, name, sk->sd.ino, sk->peer_ino); ++ goto err; ++ } ++ ++ snprintf(rpath, sizeof(rpath), ".%s", name); ++ if (fstatat(mntns_root, rpath, &st, 0)) { ++ pr_err("Can't stat the connected DGRAM type socket id %#x," ++ " peer ino %d path '%s'\n", ++ id, sk->peer_ino, name); ++ goto err; ++ } ++ ++ return 0; ++err: ++ return -ENOENT; ++} ++ + static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) + { + struct unix_sk_desc *sk, *peer; +@@ -483,6 +536,18 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) + ret = getsockopt(lfd, SOL_TCP, TCP_REPAIR_OPTIONS, &ue->repair_ino, &len); + if (ret < 0) + goto err; ++ } else if (peer->name && sk->type == SOCK_DGRAM) { ++ /* ++ * As for unix stream socket, we can use the kernel] ++ * feature which cmdline is `unix_stream_restore_enable` ++ * to dump/restore it. Because of the feature, we can ++ * not consider the unix stream socket file status. ++ * But as for unix dgram socket, it's different. We ++ * must ensure the existence of the socket file when ++ * dump/restore, otherwise it will fail. 
++ */ ++ if (unix_resolve_dgram_name(lfd, id, sk, peer) != 0) ++ goto err; + } + + /* +@@ -1340,6 +1405,33 @@ err: + return -1; + } + ++/* ++ * Sometimes, `/dev/log` will disappear because of the restart of rsyslog when ++ * rotating, criu try to connect `/dev/log` will report error at this time. We ++ * should try our best to ensure the success of criu restoration. Therefore, ++ * retry three times here. ++ */ ++static int unix_dgram_reconnect(int fd, struct sockaddr_un *addr, int len) ++{ ++ int retval = 0; ++ struct timespec tim = { ++ .tv_sec = 0, ++ .tv_nsec = 5e+8, ++ }; ++ ++ for (int i = 0; i < 3; i++) { ++ nanosleep(&tim, NULL); ++ pr_warn("Can't connect unix socket(%s), %d retry\n", ++ addr->sun_path, i); ++ retval = connect(fd, (struct sockaddr *)addr, ++ sizeof(addr->sun_family) + len); ++ if (retval == 0) ++ break; ++ } ++ ++ return retval; ++} ++ + static int post_open_standalone(struct file_desc *d, int fd) + { + int fdstore_fd = -1, procfs_self_dir = -1, len; +@@ -1421,8 +1513,11 @@ static int post_open_standalone(struct file_desc *d, int fd) + goto err_revert_and_exit; + } + } else if (connect(fd, (struct sockaddr *)&addr, sizeof(addr.sun_family) + len) < 0) { +- pr_perror("Can't connect %d socket", ui->ue->ino); +- goto err_revert_and_exit; ++ if (ui->ue->type != SOCK_DGRAM || errno != ENOENT ++ || unix_dgram_reconnect(fd, &addr, len) != 0) { ++ pr_perror("Can't connect %d socket", ui->ue->ino); ++ goto err_revert_and_exit; ++ } + } + mutex_unlock(mutex_ghost); + +-- +2.34.0 + diff --git a/backport-0045--sk-ignore-the-bind-error-for-icmp-socket.patch b/backport-0045--sk-ignore-the-bind-error-for-icmp-socket.patch new file mode 100644 index 0000000000000000000000000000000000000000..863738fe4cc1a00f793500646451679e8c80c05d --- /dev/null +++ b/backport-0045--sk-ignore-the-bind-error-for-icmp-socket.patch @@ -0,0 +1,44 @@ +From cd91608de2b57be8987eecbbc3baa290105fc20f Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Wed, 27 Oct 2021 11:57:43 +0800 
+Subject: [PATCH 45/50] sk: ignore the bind error for icmp socket + +Signed-off-by: fu.lin +--- + criu/sk-inet.c | 20 ++++++++++++++++++-- + 1 file changed, 18 insertions(+), 2 deletions(-) + +diff --git a/criu/sk-inet.c b/criu/sk-inet.c +index 768c6ed..b614cec 100644 +--- a/criu/sk-inet.c ++++ b/criu/sk-inet.c +@@ -1187,8 +1187,24 @@ int inet_bind(int sk, struct inet_sk_info *ii) + } + + if (bind(sk, (struct sockaddr *)&addr, addr_size) == -1) { +- pr_perror("Can't bind inet socket (id %d)", ii->ie->id); +- return -1; ++ InetSkEntry *ie = ii->ie; ++ ++ /* ++ * Sometimes the ping-like program restoration may appear ++ * `bind()` error when it is specified the address. In view ++ * of the principle that we should try our best to restore the ++ * process, and ping-like program works abnormal can tolerate, ++ * just warn here instead of report error. ++ */ ++ if (ie->proto == IPPROTO_ICMP || ie->proto == IPPROTO_ICMPV6) { ++ pr_warn("Can't bind inet socket (id %d) proto %s\n", ++ ie->id, ++ ie->proto == IPPROTO_ICMP ? ++ "IPPROTO_ICMP" : "IPPROTO_ICMPV6"); ++ } else { ++ pr_perror("Can't bind inet socket (id %d)", ii->ie->id); ++ return -1; ++ } + } + + if (rst_freebind) { +-- +2.34.0 + diff --git a/backport-0046--infiniband-fix-the-infiniband-fd-conflict.patch b/backport-0046--infiniband-fix-the-infiniband-fd-conflict.patch new file mode 100644 index 0000000000000000000000000000000000000000..5ca5efcba3b8b3f41f02bc61e8b87de0b6b9565a --- /dev/null +++ b/backport-0046--infiniband-fix-the-infiniband-fd-conflict.patch @@ -0,0 +1,286 @@ +From d4e3e321fb3a299ce3fcfcc3ccda1a32d546e849 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Mon, 8 Nov 2021 15:08:12 +0800 +Subject: [PATCH 46/50] infiniband: fix the infiniband fd conflict + +Phenomenon: + Operating uverbs device will generate anonymous fd named +`anon_inode:[infinibandevent]`. 
When `anon_inode:[infinibandevent]` fd +is the last opened fd, and some kind of unix socket fd exists, which is +generated by syscalls like `socketpair()` at the same time, +`anon_inode:[infinibandevent]` will fail to restore probabilistically. + +The log is as follows: + +``` +(00.254523) 63959: open file flags:1 +(00.254526) 63959: unix: Opening standalone (stage 0 id 0x1ff ino 1019605 peer 0) +(00.254571) 63959: *******flags: 0 +(00.254575) 63959: Create fd for 1408 # the fake fd +(00.254578) 63959: *******flags: 1 +(00.254580) 63959: Create fd for 445 # the restoration fd +``` + +Reason: + During the restoration of unix socket, `socketpair()` will generate +two fds, one is used for the current restoration, another is called fake +fd whose fd nr is picked by `find_unused_fd()`. When +`anon_inode:[infinibandevent]` fd is the last one, criu doesn't dump the +fd information for `anon_inode:[infinibandevent]` in the original +implementation, and criu thinks the fd nr which should belong to +`anon_inode:[infinibandevent]` isn't used. Therefore, it causes the +`anon_inode:[infinibandevent]` restoration to fail. + +This patch fixes the above problem. Core: dump +`anon_inode:[infinibandevent]` fd information, making criu aware +that the fd nr is used. 
+ +Signed-off-by: fu.lin +--- + criu/Makefile.crtools | 1 + + criu/char.c | 68 ++++++++++++++++++++++++++++++++++++ + criu/files.c | 23 ++++++------ + criu/include/char.h | 17 +++++++++ + criu/include/image-desc.h | 1 + + criu/include/protobuf-desc.h | 1 + + images/chr.proto | 3 ++ + images/fdinfo.proto | 2 ++ + 8 files changed, 103 insertions(+), 13 deletions(-) + create mode 100644 criu/char.c + create mode 100644 criu/include/char.h + +diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools +index 0bea576..70d1b73 100644 +--- a/criu/Makefile.crtools ++++ b/criu/Makefile.crtools +@@ -93,6 +93,7 @@ obj-y += devname.o + obj-y += mnl.o + obj-y += nftables.o + obj-y += kmsg.o ++obj-y += char.o + obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o + obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o + CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 +diff --git a/criu/char.c b/criu/char.c +new file mode 100644 +index 0000000..153145f +--- /dev/null ++++ b/criu/char.c +@@ -0,0 +1,68 @@ ++#include "imgset.h" ++#include "char.h" ++#include "log.h" ++ ++#include "protobuf.h" ++ ++static void pr_info_infiniband(char *action, InfinibandEntry *infiniband) ++{ ++ pr_info("%sinfiniband: id %#08x\n", action, infiniband->id); ++} ++ ++/* Checks if file descriptor @lfd is infinibandevent */ ++int is_infiniband_link(char *link) ++{ ++ return is_anon_link_type(link, "[infinibandevent]"); ++} ++ ++static int dump_one_infiniband(int lfd, u32 id, const struct fd_parms *p) ++{ ++ FileEntry fe = FILE_ENTRY__INIT; ++ InfinibandEntry infiniband = INFINIBAND_ENTRY__INIT; ++ ++ infiniband.id = id; ++ ++ fe.type = FD_TYPES__INFINIBAND; ++ fe.id = infiniband.id; ++ fe.infiniband = &infiniband; ++ ++ pr_info_infiniband("Dumping ", &infiniband); ++ ++ return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); ++} ++ ++const struct fdtype_ops infiniband_dump_ops = { ++ .type = FD_TYPES__INFINIBAND, ++ .dump = dump_one_infiniband, ++}; ++ ++static int infiniband_open(struct file_desc *d, int 
*new_fd) { ++ /* ++ * `*new_fd == -1` at this time, it means this open operation shouldn't ++ * be served out, which is why this function does nothing here. ++ */ ++ return 0; ++}; ++ ++static struct file_desc_ops infiniband_desc_ops = { ++ .type = FD_TYPES__INFINIBAND, ++ .open = infiniband_open, ++}; ++ ++static int collect_one_infiniband(void *o, ProtobufCMessage *base, struct cr_img *i) ++{ ++ struct infiniband_file_info *info = o; ++ ++ info->infiniband = pb_msg(base, InfinibandEntry); ++ pr_info_infiniband("Collected ", info->infiniband); ++ ++ /* add the fd to `file_desc_hash` list to prevent from NULL pointer */ ++ return file_desc_add(&info->d, info->infiniband->id, &infiniband_desc_ops); ++} ++ ++struct collect_image_info infiniband_cinfo = { ++ .fd_type = CR_FD_INFINIBAND, ++ .pb_type = PB_INFINIBAND, ++ .priv_size = sizeof(struct infiniband_file_info), ++ .collect = collect_one_infiniband, ++}; +diff --git a/criu/files.c b/criu/files.c +index cb50aca..34c4e18 100644 +--- a/criu/files.c ++++ b/criu/files.c +@@ -49,6 +49,7 @@ + #include "kerndat.h" + #include "fdstore.h" + #include "bpfmap.h" ++#include "char.h" + + #include "protobuf.h" + #include "util.h" +@@ -593,12 +594,6 @@ static int dump_chrdev(struct fd_parms *p, int lfd, FdinfoEntry *e) + return err; + } + +-/* Checks if file descriptor @lfd is infinibandevent */ +-int is_infiniband_link(char *link) +-{ +- return is_anon_link_type(link, "[infinibandevent]"); +-} +- + static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, + struct parasite_ctl *ctl, FdinfoEntry *e, + struct parasite_drain_fd *dfds) +@@ -654,7 +649,7 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, + else if (is_timerfd_link(link)) + ops = &timerfd_dump_ops; + else if (is_infiniband_link(link)) +- return 1; ++ ops = &infiniband_dump_ops; + #ifdef CONFIG_HAS_LIBBPF + else if (is_bpfmap_link(link)) + ops = &bpfmap_dump_ops; +@@ -765,11 +760,7 @@ int 
dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, + lfds[i], opts + i, ctl, &e, dfds); + if (ret < 0) + break; +- /* infiniband link file */ +- if (ret > 0) { +- ret = 0; +- continue; +- } ++ + e.flags |= need_reuse_flag; + pr_info("write fdinfoEntry fd=%d id=%d\n", (&e)->fd, (&e)->id); + ret = pb_write_one(img, &e, PB_FDINFO); +@@ -1864,8 +1855,11 @@ static int chrfile_open(struct file_desc *d, int *new_fd) + pr_info("charfile: Opening %s (repair %d index %d)\n", + ci->path, ci->cfe->repair, ci->cfe->index); + ++ if (ci->cfe->repair) ++ ci->cfe->flags |= O_REPAIR; ++ + mntns_root = open_pid_proc(getpid()); +- fd = openat(mntns_root, ci->path, ci->cfe->flags | O_REPAIR); ++ fd = openat(mntns_root, ci->path, ci->cfe->flags); + if (fd < 0){ + pr_err("open chr file failed\n"); + return -1; +@@ -1991,6 +1985,9 @@ static int collect_one_file(void *o, ProtobufCMessage *base, struct cr_img *i) + case FD_TYPES__CHR: + ret = collect_one_file_entry(fe, fe->chr->id, &fe->chr->base, &chrfile_cinfo); + break; ++ case FD_TYPES__INFINIBAND: ++ ret = collect_one_file_entry(fe, fe->infiniband->id, &fe->infiniband->base, &infiniband_cinfo); ++ break; + } + + return ret; +diff --git a/criu/include/char.h b/criu/include/char.h +new file mode 100644 +index 0000000..c63b8f1 +--- /dev/null ++++ b/criu/include/char.h +@@ -0,0 +1,17 @@ ++#ifndef __CR_CHAR_H__ ++#define __CR_CHAR_H__ ++ ++#include "files.h" ++#include "images/chr.pb-c.h" ++ ++struct infiniband_file_info { ++ InfinibandEntry *infiniband; ++ struct file_desc d; ++}; ++ ++extern const struct fdtype_ops infiniband_dump_ops; ++extern struct collect_image_info infiniband_cinfo; ++ ++int is_infiniband_link(char *link); ++ ++#endif /* __CR_CHAR_H__ */ +diff --git a/criu/include/image-desc.h b/criu/include/image-desc.h +index 22676ae..4231716 100644 +--- a/criu/include/image-desc.h ++++ b/criu/include/image-desc.h +@@ -115,6 +115,7 @@ enum { + + CR_FD_AUTOFS, + CR_FD_CHRFILE, ++ CR_FD_INFINIBAND, + + 
CR_FD_MAX + }; +diff --git a/criu/include/protobuf-desc.h b/criu/include/protobuf-desc.h +index e7df57e..023bbfc 100644 +--- a/criu/include/protobuf-desc.h ++++ b/criu/include/protobuf-desc.h +@@ -70,6 +70,7 @@ enum { + PB_BPFMAP_FILE, + PB_BPFMAP_DATA, + PB_CHRFILE, ++ PB_INFINIBAND, + + /* PB_AUTOGEN_STOP */ + +diff --git a/images/chr.proto b/images/chr.proto +index 67929db..ed65005 100644 +--- a/images/chr.proto ++++ b/images/chr.proto +@@ -10,3 +10,6 @@ message chrfile_entry { + required bool repair = 5; + }; + ++message infiniband_entry { ++ required uint32 id = 1; ++}; +diff --git a/images/fdinfo.proto b/images/fdinfo.proto +index 8561da4..2fa34f8 100644 +--- a/images/fdinfo.proto ++++ b/images/fdinfo.proto +@@ -42,6 +42,7 @@ enum fd_types { + MEMFD = 18; + BPFMAP = 19; + CHR = 21; ++ INFINIBAND = 22; + + /* Any number above the real used. Not stored to image */ + CTL_TTY = 65534; +@@ -79,4 +80,5 @@ message file_entry { + optional memfd_file_entry memfd = 20; + optional bpfmap_file_entry bpf = 21; + optional chrfile_entry chr = 23; ++ optional infiniband_entry infiniband = 25; + } +-- +2.34.0 + diff --git a/backport-0047--optimization-parallel-collecting-vmas.patch b/backport-0047--optimization-parallel-collecting-vmas.patch new file mode 100644 index 0000000000000000000000000000000000000000..37df3187fb7b7c384f1a10c7e275e71866306060 --- /dev/null +++ b/backport-0047--optimization-parallel-collecting-vmas.patch @@ -0,0 +1,523 @@ +From 79cdedf23250137a56b8c9188133e073399648f0 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Fri, 12 Nov 2021 17:58:50 +0800 +Subject: [PATCH 1/1] optimization: parallel collecting vmas + +collecting smaps has no influence with other processes, we can collect +parallelly early to accelerate speed. + +In order to prevent the concurrency problem by `find_unused_fd`, only +the main root task will parallel. + +Usage: + criu --parallel + +Note: ensure criu can use multi-core, otherwise the performance will +deterioration. 
+ +Signed-off-by: fu.lin +--- + criu/Makefile.crtools | 1 + + criu/Makefile.packages | 1 + + criu/config.c | 1 + + criu/cr-dump.c | 54 ++++++++++++----- + criu/crtools.c | 3 +- + criu/include/cr_options.h | 1 + + criu/include/pstree.h | 3 + + criu/include/taskqueue.h | 50 +++++++++++++++ + criu/namespaces.c | 9 ++- + criu/proc_parse.c | 6 ++ + criu/taskqueue.c | 124 ++++++++++++++++++++++++++++++++++++++ + 11 files changed, 236 insertions(+), 17 deletions(-) + create mode 100644 criu/include/taskqueue.h + create mode 100644 criu/taskqueue.c + +diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools +index 70d1b73..6deb855 100644 +--- a/criu/Makefile.crtools ++++ b/criu/Makefile.crtools +@@ -94,6 +94,7 @@ obj-y += mnl.o + obj-y += nftables.o + obj-y += kmsg.o + obj-y += char.o ++obj-y += taskqueue.o + obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o + obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o + CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 +diff --git a/criu/Makefile.packages b/criu/Makefile.packages +index 9638a3d..b9d1dea 100644 +--- a/criu/Makefile.packages ++++ b/criu/Makefile.packages +@@ -39,6 +39,7 @@ export LIBS += $(shell pkg-config --libs libmnl) + export LIBS += $(shell pkg-config --libs libnftnl) + export CFLAGS += $(shell pkg-config --cflags libmnl) + export CFLAGS += $(shell pkg-config --cflags libnftnl) ++export LIBS += -lpthread + + check-packages-failed: + $(warning Can not find some of the required libraries) +diff --git a/criu/config.c b/criu/config.c +index cdafe17..e09445a 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -554,6 +554,7 @@ int parse_options(int argc, char **argv, bool *usage_error, + BOOL_OPT("file-locks-repair", &opts.file_locks_repair), + {"reserve-ports", required_argument, 0, 'P' }, + BOOL_OPT("use-nft", &opts.use_nft), ++ BOOL_OPT("parallel", &opts.parallel), + { }, + }; + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index 2e940d5..30bbc5b 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -18,6 +18,7 @@ + + #include 
+ #include ++#include + + #include "types.h" + #include "protobuf.h" +@@ -85,6 +86,8 @@ + #include "img-streamer.h" + #include "restorer.h" + ++#include "taskqueue.h" ++ + /* + * Architectures can overwrite this function to restore register sets that + * are not covered by ptrace_set/get_regs(). +@@ -404,7 +407,7 @@ static int dump_pid_misc(pid_t pid, TaskCoreEntry *tc) + return 0; + } + +-static int dump_filemap(struct vma_area *vma_area, int fd) ++int dump_filemap(struct vma_area *vma_area, int fd) + { + struct fd_parms p = FD_PARMS_INIT; + VmaEntry *vma = vma_area->e; +@@ -1233,7 +1236,7 @@ err_cure: + static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + { + pid_t pid = item->pid->real; +- struct vm_area_list vmas; ++ struct vm_area_list *vmas = NULL; + struct parasite_ctl *parasite_ctl; + int ret, exit_code = -1; + struct parasite_dump_misc misc; +@@ -1242,8 +1245,6 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + struct proc_posix_timers_stat proc_args; + struct mem_dump_ctl mdc; + +- vm_area_list_init(&vmas); +- + pr_info("========================================\n"); + pr_info("Dumping task (pid: %d)\n", pid); + pr_info("========================================\n"); +@@ -1254,12 +1255,23 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + */ + return 0; + ++ if (!opts.parallel || root_item->pid->real != item->pid->real ) { ++ vmas = xmalloc(sizeof(struct vm_area_list)); ++ if (vmas == NULL) { ++ pr_err("xmalloc no memory\n"); ++ return -1; ++ } ++ vm_area_list_init(vmas); ++ } else ++ vmas = item->maps_info.vmas; ++ + pr_info("Obtaining task stat ... \n"); + ret = parse_pid_stat(pid, &pps_buf); + if (ret < 0) + goto err; + +- ret = collect_mappings(pid, &vmas, dump_filemap); ++ ret = (opts.parallel && root_item->pid->real == item->pid->real) ? 
++ 0 : collect_mappings(pid, vmas, dump_filemap); + if (ret) { + pr_err("Collect mappings (pid: %d) failed with %d\n", pid, ret); + goto err; +@@ -1293,7 +1305,10 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + goto err; + } + +- parasite_ctl = parasite_infect_seized(pid, item, &vmas); ++ if (opts.parallel && end_collect_mappings_thread(item)) ++ goto err; ++ ++ parasite_ctl = parasite_infect_seized(pid, item, vmas); + if (!parasite_ctl) { + pr_err("Can't infect (pid: %d) with parasite\n", pid); + goto err; +@@ -1317,13 +1332,13 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + goto err_cure_imgset; + } + +- ret = parasite_fixup_vdso(parasite_ctl, pid, &vmas); ++ ret = parasite_fixup_vdso(parasite_ctl, pid, vmas); + if (ret) { + pr_err("Can't fixup vdso VMAs (pid: %d)\n", pid); + goto err_cure_imgset; + } + +- ret = parasite_collect_aios(parasite_ctl, &vmas); /* FIXME -- merge with above */ ++ ret = parasite_collect_aios(parasite_ctl, vmas); /* FIXME -- merge with above */ + if (ret) { + pr_err("Failed to check aio rings (pid: %d)\n", pid); + goto err_cure_imgset; +@@ -1377,7 +1392,7 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + mdc.stat = &pps_buf; + mdc.parent_ie = parent_ie; + +- ret = parasite_dump_pages_seized(item, &vmas, &mdc, parasite_ctl); ++ ret = parasite_dump_pages_seized(item, vmas, &mdc, parasite_ctl); + if (ret) + goto err_cure; + +@@ -1438,7 +1453,7 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + goto err; + } + +- ret = dump_task_mm(pid, &pps_buf, &misc, &vmas, cr_imgset); ++ ret = dump_task_mm(pid, &pps_buf, &misc, vmas, cr_imgset); + if (ret) { + pr_err("Dump mappings (pid: %d) failed with %d\n", pid, ret); + goto err; +@@ -1454,7 +1469,8 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + exit_code = 0; + err: + close_pid_proc(); +- free_mappings(&vmas); ++ free_mappings(vmas); 
++ free(vmas); + xfree(dfds); + return exit_code; + +@@ -1835,6 +1851,13 @@ static int cr_dump_finish(int ret) + write_stats(DUMP_STATS); + pr_info("Dumping finished successfully\n"); + } ++ ++ /* ++ * Don't care threads' status and ignore unfree resources, use ++ * `exit_group()` to ensure exit all threads. ++ */ ++ syscall(SYS_exit_group, post_dump_ret ? : (ret != 0)); ++ + return post_dump_ret ? : (ret != 0); + } + +@@ -1860,6 +1883,9 @@ int cr_dump_tasks(pid_t pid) + if (opts.dump_char_dev && parse_devname() < 0) + goto err; + ++ if (opts.parallel && init_parallel_env() != 0) ++ goto err; ++ + root_item = alloc_pstree_item(); + if (!root_item) + goto err; +@@ -1937,13 +1963,13 @@ int cr_dump_tasks(pid_t pid) + if (collect_file_locks()) + goto err; + +- if (collect_namespaces(true) < 0) +- goto err; +- + glob_imgset = cr_glob_imgset_open(O_DUMP); + if (!glob_imgset) + goto err; + ++ if (collect_namespaces(true) < 0) ++ goto err; ++ + if (seccomp_collect_dump_filters() < 0) + goto err; + +diff --git a/criu/crtools.c b/criu/crtools.c +index d95d903..5127cde 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -489,7 +489,8 @@ usage: + " --ignore-special-dump Ignore special task tid page dump\n" + " --file-locks-repair Use repair mode to dump and restore file locks\n" + " --reserve-ports Reserve src ports in kernel\n" +-" --use-nft Use nft API instead of iptables cmd in network locking" ++" --use-nft Use nft API instead of iptables cmd in network locking\n" ++" --parallel Parallel to accellrate dumping speed\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index 236d1c7..cc8d1ae 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -188,6 +188,7 @@ struct cr_options { + char *share_src_ports; + int reserve_ports; + int use_nft; ++ int parallel; + }; + + extern struct cr_options opts; +diff --git 
a/criu/include/pstree.h b/criu/include/pstree.h +index 61ab0ce..be0942a 100644 +--- a/criu/include/pstree.h ++++ b/criu/include/pstree.h +@@ -1,6 +1,8 @@ + #ifndef __CR_PSTREE_H__ + #define __CR_PSTREE_H__ + ++#include "taskqueue.h" ++ + #include "common/list.h" + #include "common/lock.h" + #include "pid.h" +@@ -30,6 +32,7 @@ struct pstree_item { + futex_t task_st; + unsigned long task_st_le_bits; + }; ++ struct mappings_info maps_info; + }; + + static inline pid_t vpid(const struct pstree_item *i) +diff --git a/criu/include/taskqueue.h b/criu/include/taskqueue.h +new file mode 100644 +index 0000000..16f9e3d +--- /dev/null ++++ b/criu/include/taskqueue.h +@@ -0,0 +1,50 @@ ++#ifndef __CR_TASKQUEUE_H__ ++#define __CR_TASKQUEUE_H__ ++ ++#include ++#include ++#include ++ ++#include "vma.h" ++#include "pstree.h" ++ ++#include "common/list.h" ++ ++#define TASKQUEUE_HASH_SIZE 8 ++ ++struct taskqueue { ++ pthread_t task; ++ void *(*routine)(void *); ++ void *arg; ++ int result; ++}; ++#define queue_task queue.task ++#define queue_routine queue.routine ++#define queue_arg queue.arg ++#define queue_result queue.result ++ ++int init_parallel_env(void); ++ ++static inline int taskqueue_create(struct taskqueue *queue) ++{ ++ return pthread_create(&queue->task, NULL, queue->routine, queue->arg); ++} ++ ++static inline int taskqueue_join(struct taskqueue *queue) ++{ ++ return pthread_join(queue->task, NULL); ++} ++ ++/* parallel collect smaps */ ++struct mappings_info { ++ struct hlist_node hash; ++ pid_t pid; ++ struct vm_area_list *vmas; ++ dump_filemap_t dump_file; ++ struct taskqueue queue; ++}; ++ ++int start_collect_mappings_thread(void); ++int end_collect_mappings_thread(struct pstree_item *item); ++ ++#endif /* __CR_TASKQUEUE_H__ */ +diff --git a/criu/namespaces.c b/criu/namespaces.c +index 9ffcd16..e71817f 100644 +--- a/criu/namespaces.c ++++ b/criu/namespaces.c +@@ -27,6 +27,7 @@ + #include "net.h" + #include "cgroup.h" + #include "fdstore.h" ++#include "taskqueue.h" + 
+ #include "protobuf.h" + #include "util.h" +@@ -1570,11 +1571,15 @@ int collect_namespaces(bool for_dump) + { + int ret; + +- ret = collect_user_namespaces(for_dump); ++ ret = collect_mnt_namespaces(for_dump); + if (ret < 0) + return ret; + +- ret = collect_mnt_namespaces(for_dump); ++ /* need mnt info provided by `mntinfo` */ ++ if (opts.parallel && start_collect_mappings_thread()) ++ return -1; ++ ++ ret = collect_user_namespaces(for_dump); + if (ret < 0) + return ret; + +diff --git a/criu/proc_parse.c b/criu/proc_parse.c +index b3d1c0b..4a6a598 100644 +--- a/criu/proc_parse.c ++++ b/criu/proc_parse.c +@@ -64,6 +64,12 @@ + + #define BUF_SIZE 4096 /* Good enough value - can be changed */ + ++/* cancel log to optimize performance because of the lock contention of print */ ++#undef pr_info ++#undef pr_debug ++#define pr_info(fmt, ...) ++#define pr_debug(fmt, ...) ++ + struct buffer { + char buf[BUF_SIZE]; + char end; /* '\0' */ +diff --git a/criu/taskqueue.c b/criu/taskqueue.c +new file mode 100644 +index 0000000..1196a5e +--- /dev/null ++++ b/criu/taskqueue.c +@@ -0,0 +1,124 @@ ++/* ++ * Target: ++ * parallel dump process ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include "pstree.h" ++#include "log.h" ++#include "taskqueue.h" ++ ++/* ++ * Sometimes, only one cpu can be used which is bad for parallel routine. ++ * Therefore, set cpu affinity for criu routine. 
++ */ ++static int set_cpuaffinity(void) ++{ ++ cpu_set_t *set; ++ int num_cpus = get_nprocs_conf(); ++ size_t cpusetsize = CPU_ALLOC_SIZE(num_cpus); ++ int retval; ++ ++ set = CPU_ALLOC(num_cpus); ++ memset(set, 0xff, cpusetsize); ++ ++ retval = sched_setaffinity(getpid(), cpusetsize, set); ++ if (retval != 0) ++ pr_err("sched_setaffinity failed: %s\n", strerror(errno)); ++ ++ CPU_FREE(set); ++ ++ return retval; ++} ++ ++int init_parallel_env(void) ++{ ++ return set_cpuaffinity(); ++} ++ ++static void *collect_mappings_routine(void *_arg) ++{ ++ struct mappings_info *info = _arg; ++ ++ info->queue_result = collect_mappings(info->pid, info->vmas, info->dump_file); ++ return NULL; ++} ++ ++int dump_filemap(struct vma_area *vma_area, int fd); /* defined in criu/cr-dump.c */ ++ ++int start_collect_mappings_thread(void) ++{ ++ struct pstree_item *pi; ++ struct mappings_info *info; ++ ++ for_each_pstree_item(pi) { ++ /* disable parallel collect for non-root item because of the ++ * concurrence. ++ */ ++ if (pi->pid->real != root_item->pid->real) ++ continue; ++ ++ info = &pi->maps_info; ++ ++ info->vmas = xmalloc(sizeof(struct vm_area_list)); ++ if (info->vmas == NULL) { ++ pr_err("xzalloc vmas no memory\n"); ++ return -1; ++ } ++ vm_area_list_init(info->vmas); ++ ++ info->pid = pi->pid->real; ++ info->dump_file = dump_filemap; ++ info->queue_routine = collect_mappings_routine; ++ info->queue_arg = info; ++ ++ pr_info("Start thread to collect %d mappings\n", info->pid); ++ ++ if (taskqueue_create(&info->queue) < 0) { ++ pr_err("parallel_collect_mappings failed: %s\n", strerror(errno)); ++ free(info->vmas); ++ /* ++ * Don't care other threads status, use `exit_group()` ++ * to ensure all threads exit. ++ */ ++ return -1; ++ } ++ } ++ ++ return 0; ++} ++ ++int end_collect_mappings_thread(struct pstree_item *item) ++{ ++ struct mappings_info *info = &item->maps_info; ++ int retval; ++ ++ /* disable parallel collect for non-root item because of the ++ * concurrence. 
++ */ ++ if (root_item->pid->real != item->pid->real) ++ return 0; ++ ++ retval = taskqueue_join(&info->queue); ++ if (retval != 0 || info->queue_result != 0) { ++ pr_err("taskqueue_join failed, retval %d(errno %d: %s)," ++ " queue_result: %d\n", ++ retval, ++ retval == 0 ? 0 : errno, ++ retval == 0 ? "nil" : strerror(errno), ++ info->queue_result); ++ retval = -1; ++ } ++ ++ pr_info("End thread to collect %d mappings\n", info->pid); ++ ++ /* ++ * Don't care other threads status, use `exit_group()` to ensure all ++ * threads exit. ++ */ ++ return retval; ++} +-- +2.34.0 + diff --git a/backport-0048--dump-ignore-children-exit-to-accelerate-speed.patch b/backport-0048--dump-ignore-children-exit-to-accelerate-speed.patch new file mode 100644 index 0000000000000000000000000000000000000000..2cc099703a2c22e436d4527e069f5bd5d1aff1aa --- /dev/null +++ b/backport-0048--dump-ignore-children-exit-to-accelerate-speed.patch @@ -0,0 +1,38 @@ +From 8122e6f9bc8940587ff89e653d06c9b0c7aae032 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Mon, 29 Nov 2021 19:50:39 +0800 +Subject: [PATCH 48/50] dump: ignore children exit to accelerate speed + +don't care the tracee exit status to accelerate dump speed. Just ignore +SIGCHLD signal. + +Theory: +- criu don't care about `wait4()` status for tracee: in original process, + criu just complains if the status of `wait4()` is abnormal, no action + will be processed. +- the tracee will be adopted by the tracer's parent if the tracer exited + is early than tracee, no zombie tracee will be left. 
+ +Signed-off-by: fu.lin +--- + criu/seize.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/criu/seize.c b/criu/seize.c +index 056454d..cad6919 100644 +--- a/criu/seize.c ++++ b/criu/seize.c +@@ -761,7 +761,9 @@ void pstree_switch_state(struct pstree_item *root_item, int st) + for_each_pstree_item(item) + unseize_task_and_threads(item, st); + +- if (st == TASK_DEAD) ++ if (st == TASK_DEAD && opts.parallel) ++ signal(SIGCHLD, SIG_IGN); /* ignore children exit */ ++ else if (st == TASK_DEAD) + pstree_wait(root_item); + } + +-- +2.34.0 + diff --git a/backport-0049--parallel-parallel-nft-delete-set.patch b/backport-0049--parallel-parallel-nft-delete-set.patch new file mode 100644 index 0000000000000000000000000000000000000000..9234d6a4d91d7e1f558ef1f13343f2730d72cad5 --- /dev/null +++ b/backport-0049--parallel-parallel-nft-delete-set.patch @@ -0,0 +1,177 @@ +From c31657313ea135906d3faeaf86212056ea1a76e9 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Mon, 29 Nov 2021 16:03:02 +0800 +Subject: [PATCH 49/50] parallel: parallel nft delete set + +The nft has two parts: rules and set. criu deletes nft rules to unlock network +during restoration. The set deletion action consumes about hundreds of ms when +there are too many elements in nft set. Delaying set deletion is helpful +to save restoration time. 
+ +Signed-off-by: fu.lin +--- + criu/cr-dump.c | 1 + + criu/cr-restore.c | 3 ++- + criu/include/taskqueue.h | 12 ++++++++++- + criu/nftables.c | 14 +++++++++++-- + criu/taskqueue.c | 45 ++++++++++++++++++++++++++++++++++++++++ + 5 files changed, 71 insertions(+), 4 deletions(-) + +diff --git a/criu/cr-dump.c b/criu/cr-dump.c +index 30bbc5b..24b0ef5 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -1799,6 +1799,7 @@ static int cr_dump_finish(int ret) + */ + if (ret || post_dump_ret || opts.final_state == TASK_ALIVE) { + network_unlock(opts.tree_id); ++ parallel_nft_clean((long)opts.tree_id); + delete_link_remaps(); + clean_cr_time_mounts(); + } +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index e94d5a6..83bdc38 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -2566,7 +2566,7 @@ skip_ns_bouncing: + if (ret != 0) + pr_err("Post-resume script ret code %d\n", ret); + +- network_delete_set(vpid(init)); ++ parallel_nft_clean((long)vpid(init)); + + if (!opts.restore_detach && !opts.exec_cmd) + wait(NULL); +@@ -2575,6 +2575,7 @@ skip_ns_bouncing: + + out_kill_network_unlocked: + pr_err("Killing processes because of failure on restore.\nThe Network was unlocked so some data or a connection may have been lost.\n"); ++ parallel_nft_clean((long)vpid(init)); + out_kill: + /* + * The processes can be killed only when all of them have been created, +diff --git a/criu/include/taskqueue.h b/criu/include/taskqueue.h +index 16f9e3d..906c784 100644 +--- a/criu/include/taskqueue.h ++++ b/criu/include/taskqueue.h +@@ -6,7 +6,6 @@ + #include + + #include "vma.h" +-#include "pstree.h" + + #include "common/list.h" + +@@ -47,4 +46,15 @@ struct mappings_info { + int start_collect_mappings_thread(void); + int end_collect_mappings_thread(struct pstree_item *item); + ++#define STACK_SIZE (1024 *1024) ++typedef void (*daemon_t)(void *); ++int parallel_task(daemon_t fn, void *_arg); ++ ++struct daemon { ++ daemon_t fn; ++ void *arg; ++}; ++ ++void parallel_nft_clean(long 
tree_id); ++ + #endif /* __CR_TASKQUEUE_H__ */ +diff --git a/criu/nftables.c b/criu/nftables.c +index 739aee4..0c529ed 100644 +--- a/criu/nftables.c ++++ b/criu/nftables.c +@@ -16,6 +16,7 @@ + + #include "sk-inet.h" + #include "nftables.h" ++#include "taskqueue.h" + + #include "../soccr/soccr.h" + +@@ -661,10 +662,9 @@ static int network_delete_rule_internal(struct mnl_params *params, + return nft_rule_common(params, tree_id, false); + } + ++/* here split the deletion of rule and set to accelete the restoration process */ + void network_delete_rule(pid_t tree_id) + { +- pr_info("unlock network\n"); +- + mnl_common(network_delete_rule_internal, NULL, &tree_id); + } + +@@ -683,6 +683,16 @@ void network_delete_set(pid_t tree_id) + mnl_common(network_delete_set_internal, NULL, &tree_id); + } + ++void parallel_nft_clean_internal(void *arg) ++{ ++ network_delete_set((long)arg); ++} ++ ++void parallel_nft_clean(long tree_id) ++{ ++ parallel_task(parallel_nft_clean_internal, (void *)tree_id); ++} ++ + static int add_set_elem_internal(struct nftnl_set *s, void *data, size_t len) + { + struct nftnl_set_elem *e; +diff --git a/criu/taskqueue.c b/criu/taskqueue.c +index 1196a5e..7d500e9 100644 +--- a/criu/taskqueue.c ++++ b/criu/taskqueue.c +@@ -122,3 +122,48 @@ int end_collect_mappings_thread(struct pstree_item *item) + */ + return retval; + } ++ ++static int daemonize(void *arg) ++{ ++ struct daemon *d = arg; ++ ++ if (daemon(0, 0) < 0) ++ pr_perror("daemonize failed"); ++ ++ d->fn(d->arg); ++ ++ return 0; ++} ++ ++int parallel_task(daemon_t fn, void *_arg) ++{ ++ struct daemon arg = { ++ .fn = fn, ++ .arg = _arg, ++ }; ++ char *stack; ++ char *stack_top; ++ pid_t pid; ++ ++ stack = mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE, ++ MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0); ++ if (stack == MAP_FAILED) { ++ pr_perror("mmap failed"); ++ return -1; ++ } ++ ++ stack_top = stack + STACK_SIZE; ++ ++ /* ignore SIGCHLD signal */ ++ pid = clone(daemonize, stack_top, 0, &arg); ++ 
if (pid > 0) ++ return 0; /* parent */ ++ else if (pid < 0) { ++ pr_perror("clone failed"); ++ return -1; ++ } ++ ++ /* unreachable */ ++ __builtin_unreachable(); ++ return 0; ++} +-- +2.34.0 + diff --git a/backport-0050--ptrace-trace-specific-syscall.patch b/backport-0050--ptrace-trace-specific-syscall.patch new file mode 100644 index 0000000000000000000000000000000000000000..9cb1a3b94bffdf7de210abc978739c6dace836fc --- /dev/null +++ b/backport-0050--ptrace-trace-specific-syscall.patch @@ -0,0 +1,707 @@ +From d8e5833fbfa1f02b891630032f4b4bb7c7c5bfe7 Mon Sep 17 00:00:00 2001 +From: "fu.lin" +Date: Tue, 23 Nov 2021 16:08:17 +0800 +Subject: [PATCH 50/50] ptrace: trace specific syscall + +criu uses `ptrace(PTRACE_SYSCALL)` to watch whether the tracee steps in +correct status; it isn't necessary to stop tracee at every syscall. +Therefore, customizing `ptrace(PTRACE_SYSCALL)` to make tracee stop at +the specific syscall can save time (1000 threads consume about 140ms). + +ptrace syntax: + long ptrace(PTRACE_SYSCALL, pid_t pid, void *addr, void *data); + +the argument `addr` is unused in original `ptrace(PTRACE_SYSCALL)`, +use `addr` parameter to give the specific sysno which is wanted to +trace. 
+ +Signed-off-by: fu.lin +--- + compel/Makefile | 1 + + compel/include/uapi/bisect.h | 30 +++++++ + compel/include/uapi/infect.h | 11 ++- + compel/src/lib/bisect.c | 92 +++++++++++++++++++ + compel/src/lib/infect.c | 167 ++++++++++++++++++++++++++++++++--- + criu/config.c | 1 + + criu/cr-dump.c | 2 +- + criu/cr-restore.c | 97 +++++++++++++++++++- + criu/crtools.c | 1 + + criu/include/cr_options.h | 1 + + 10 files changed, 385 insertions(+), 18 deletions(-) + create mode 100644 compel/include/uapi/bisect.h + create mode 100644 compel/src/lib/bisect.c + +diff --git a/compel/Makefile b/compel/Makefile +index de9318c..eea93a7 100644 +--- a/compel/Makefile ++++ b/compel/Makefile +@@ -27,6 +27,7 @@ lib-y += src/lib/infect-rpc.o + lib-y += src/lib/infect-util.o + lib-y += src/lib/infect.o + lib-y += src/lib/ptrace.o ++lib-y += src/lib/bisect.o + + # handle_elf() has no support of ELF relocations on ARM (yet?) + ifneq ($(filter arm aarch64,$(ARCH)),) +diff --git a/compel/include/uapi/bisect.h b/compel/include/uapi/bisect.h +new file mode 100644 +index 0000000..9c00513 +--- /dev/null ++++ b/compel/include/uapi/bisect.h +@@ -0,0 +1,30 @@ ++#ifndef __COMPEL_BISECT_H__ ++#define __COMPEL_BISECT_H__ ++ ++#include ++ ++enum tf { ++ TRACE_INTERRUPT = 0x17173, ++ TRACE_SYSCALL_ENTER, ++ TRACE_SYSCALL_EXIT, ++}; ++ ++struct trace_flag { ++ pid_t key; ++ enum tf flag; /* TODO: enum trace_flags flag */ ++}; ++ ++struct bisect_meta { ++ int size; ++ int used; ++ void *data; /* data pointer array */ ++ void *__data; /* data array */ ++}; ++ ++struct trace_flag *tf_bisect(struct bisect_meta *meta, pid_t key); ++struct trace_flag *tf_insert(struct bisect_meta *meta, pid_t key); ++int tf_create(struct bisect_meta *meta, int len); ++void tf_destroy(struct bisect_meta *meta); ++void tf_clear(struct bisect_meta *meta); ++ ++#endif /* __COMPEL_BISECT_H__ */ +diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h +index 257658a..99448a9 100644 +--- 
a/compel/include/uapi/infect.h ++++ b/compel/include/uapi/infect.h +@@ -8,6 +8,7 @@ + #include + #include + #include ++#include + + #include "common/compiler.h" + +@@ -41,7 +42,7 @@ extern int __must_check compel_infect(struct parasite_ctl *ctl, + extern struct parasite_thread_ctl __must_check *compel_prepare_thread(struct parasite_ctl *ctl, int pid); + extern void compel_release_thread(struct parasite_thread_ctl *); + +-extern int __must_check compel_stop_daemon(struct parasite_ctl *ctl); ++extern int __must_check compel_stop_daemon(struct parasite_ctl *ctl, bool customize); + extern int __must_check compel_cure_remote(struct parasite_ctl *ctl); + extern int __must_check compel_cure_local(struct parasite_ctl *ctl); + extern int __must_check compel_cure(struct parasite_ctl *ctl); +@@ -90,6 +91,14 @@ extern int __must_check compel_stop_pie(pid_t pid, void *addr, + + extern int __must_check compel_unmap(struct parasite_ctl *ctl, unsigned long addr); + ++extern int __must_check compel_stop_on_syscall_customize(int tasks, ++ const int sys_nr, const int exit_sys_nr, struct bisect_meta *meta); ++ ++extern int __must_check compel_stop_pie_customize(pid_t pid, ++ const int sys_nr, struct trace_flag *tf); ++ ++extern int __must_check compel_unmap_customize(struct parasite_ctl *ctl, unsigned long addr); ++ + extern int compel_mode_native(struct parasite_ctl *ctl); + + extern k_rtsigset_t *compel_task_sigmask(struct parasite_ctl *ctl); +diff --git a/compel/src/lib/bisect.c b/compel/src/lib/bisect.c +new file mode 100644 +index 0000000..807a5a9 +--- /dev/null ++++ b/compel/src/lib/bisect.c +@@ -0,0 +1,92 @@ ++#include ++ ++#include "log.h" ++#include "common/xmalloc.h" ++#include "bisect.h" ++ ++struct trace_flag *tf_bisect(struct bisect_meta *meta, pid_t key) ++{ ++ struct trace_flag **tfs = meta->data; ++ int lo = 0, hi = meta->used, mid; ++ ++ if (meta->used <= 0) ++ return NULL; ++ ++ while (lo < hi) { ++ mid = (int)((lo + hi) / 2); ++ if (tfs[mid]->key == key) { ++ return 
tfs[mid]; ++ } else if (tfs[mid]->key > key) { ++ hi = mid; ++ } else { ++ lo = mid + 1; ++ } ++ } ++ ++ return NULL; ++} ++ ++/* used in cr-restore */ ++struct trace_flag *tf_insert(struct bisect_meta *meta, pid_t key) ++{ ++ struct trace_flag **tfs = meta->data; ++ struct trace_flag *tf = &((struct trace_flag *)meta->__data)[meta->used]; ++ int i = 0, j = 0; ++ ++ if (meta->used == meta->size) ++ return NULL; ++ ++ for (i = 0; i < meta->used; i++) { ++ if (tfs[i]->key >= key) /* impossible condition: `tfs[i]->key == key` */ ++ break; ++ } ++ ++ j = meta->used; ++ meta->used += 1; ++ ++ while (j > i) { ++ tfs[j] = tfs[j-1]; ++ j -= 1; ++ } ++ ++ tfs[i] = tf; ++ tf->key = key; ++ ++ return tf; ++} ++ ++int tf_create(struct bisect_meta *meta, int len) ++{ ++ struct trace_flag *tfs; ++ struct trace_flag **tfs_ptr; ++ ++ tfs = xzalloc(sizeof(*tfs) * len); ++ if (tfs == NULL) ++ return -1; ++ ++ tfs_ptr = xmalloc(sizeof(*tfs_ptr) * len); ++ if (tfs_ptr == NULL) ++ goto err; ++ ++ meta->size = len; ++ meta->used = 0; ++ meta->__data = tfs; ++ meta->data = tfs_ptr; ++ ++ return 0; ++err: ++ xfree(tfs); ++ return -1; ++} ++ ++void tf_destroy(struct bisect_meta *meta) ++{ ++ xfree(meta->__data); ++ xfree(meta->data); ++} ++ ++void tf_clear(struct bisect_meta *meta) ++{ ++ meta->used = 0; ++ __builtin_memset(meta->data, 0, sizeof(struct trace_flag **)*meta->size); ++} +diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c +index 38846c2..2582cd2 100644 +--- a/compel/src/lib/infect.c ++++ b/compel/src/lib/infect.c +@@ -442,7 +442,7 @@ static int restore_child_handler(struct parasite_ctl *ctl) + } + + static int parasite_run(pid_t pid, int cmd, unsigned long ip, void *stack, +- user_regs_struct_t *regs, struct thread_ctx *octx) ++ user_regs_struct_t *regs, struct thread_ctx *octx, void *addr) + { + k_rtsigset_t block; + +@@ -458,7 +458,7 @@ static int parasite_run(pid_t pid, int cmd, unsigned long ip, void *stack, + goto err_regs; + } + +- if (ptrace(cmd, pid, NULL, 
NULL)) { ++ if (ptrace(cmd, pid, addr, NULL)) { + pr_perror("Can't run parasite at %d", pid); + goto err_cont; + } +@@ -565,7 +565,7 @@ int compel_execute_syscall(struct parasite_ctl *ctl, + return -1; + } + +- err = parasite_run(pid, PTRACE_CONT, ctl->ictx.syscall_ip, 0, regs, &ctl->orig); ++ err = parasite_run(pid, PTRACE_CONT, ctl->ictx.syscall_ip, 0, regs, &ctl->orig, NULL); + if (!err) + err = parasite_trap(ctl, pid, regs, &ctl->orig); + +@@ -583,7 +583,7 @@ int compel_run_at(struct parasite_ctl *ctl, unsigned long ip, user_regs_struct_t + user_regs_struct_t regs = ctl->orig.regs; + int ret; + +- ret = parasite_run(ctl->rpid, PTRACE_CONT, ip, 0, ®s, &ctl->orig); ++ ret = parasite_run(ctl->rpid, PTRACE_CONT, ip, 0, ®s, &ctl->orig, NULL); + if (!ret) + ret = parasite_trap(ctl, ctl->rpid, ret_regs ? ret_regs : ®s, &ctl->orig); + return ret; +@@ -632,7 +632,7 @@ static int parasite_init_daemon(struct parasite_ctl *ctl) + goto err; + + regs = ctl->orig.regs; +- if (parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, ctl->rstack, ®s, &ctl->orig)) ++ if (parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, ctl->rstack, ®s, &ctl->orig, NULL)) + goto err; + + futex_wait_while_eq(&args->daemon_connected, 0); +@@ -1272,7 +1272,7 @@ static bool task_in_parasite(struct parasite_ctl *ctl, user_regs_struct_t *regs) + addr < ctl->remote_map + ctl->map_length; + } + +-static int parasite_fini_seized(struct parasite_ctl *ctl) ++static int parasite_fini_seized(struct parasite_ctl *ctl, bool customize) + { + pid_t pid = ctl->rpid; + user_regs_struct_t regs; +@@ -1317,9 +1317,38 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) + if (ret) + return -1; + ++ /* use customize ptrace */ ++ if (customize) { ++ struct trace_flag tf = { .key = pid, .flag = TRACE_SYSCALL_ENTER }; ++ struct trace_flag *tf_ptr[] = { &tf }; ++ struct bisect_meta meta = { ++ .size = 1, ++ .used = 1, ++ .__data = &tf, ++ .data = tf_ptr, ++ }; ++ ++ // TODO: compitable? 
++ ret = compel_stop_pie_customize(pid, __NR(rt_sigreturn, 0), &tf); ++ if (ret < 0) ++ return ret; ++ ++ /* The process is going to execute the required syscall, the ++ * original syscall should be forgot(set `-1`) in ++ * `syscall_trace_enter()` handler in kernel when no other ++ * else operation in tracer. ++ * ++ * Note: -1 means NO_SYSCALL which is defined in ++ * `arch/arm64/include/asm/ptrace.h`. ++ */ ++ return compel_stop_on_syscall_customize(1, ++ __NR(rt_sigreturn, 0), ++ -1, &meta); ++ } ++ + /* Go to sigreturn as closer as we can */ + ret = compel_stop_pie(pid, ctl->sigreturn_addr, &flag, +- ctl->ictx.flags & INFECT_NO_BREAKPOINTS); ++ ctl->ictx.flags & INFECT_NO_BREAKPOINTS); + if (ret < 0) + return ret; + +@@ -1339,7 +1368,7 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) + return 0; + } + +-int compel_stop_daemon(struct parasite_ctl *ctl) ++int compel_stop_daemon(struct parasite_ctl *ctl, bool customize) + { + if (ctl->daemonized) { + /* +@@ -1349,7 +1378,7 @@ int compel_stop_daemon(struct parasite_ctl *ctl) + if (ctl->tsock < 0) + return -1; + +- if (parasite_fini_seized(ctl)) { ++ if (parasite_fini_seized(ctl, customize)) { + close_safe(&ctl->tsock); + return -1; + } +@@ -1365,7 +1394,7 @@ int compel_cure_remote(struct parasite_ctl *ctl) + long ret; + int err; + +- if (compel_stop_daemon(ctl)) ++ if (compel_stop_daemon(ctl, false)) + return -1; + + if (!ctl->remote_map) +@@ -1434,7 +1463,7 @@ int compel_run_in_thread(struct parasite_thread_ctl *tctl, unsigned int cmd) + + *ctl->cmd = cmd; + +- ret = parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, stack, ®s, octx); ++ ret = parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, stack, ®s, octx, NULL); + if (ret == 0) + ret = parasite_trap(ctl, pid, ®s, octx); + if (ret == 0) +@@ -1457,7 +1486,7 @@ int compel_unmap(struct parasite_ctl *ctl, unsigned long addr) + pid_t pid = ctl->rpid; + int ret = -1; + +- ret = parasite_run(pid, PTRACE_SYSCALL, addr, ctl->rstack, ®s, &ctl->orig); ++ ret = 
parasite_run(pid, PTRACE_SYSCALL, addr, ctl->rstack, ®s, &ctl->orig, NULL); + if (ret) + goto err; + +@@ -1470,6 +1499,44 @@ err: + return ret; + } + ++int compel_unmap_customize(struct parasite_ctl *ctl, unsigned long addr) ++{ ++ user_regs_struct_t regs = ctl->orig.regs; ++ pid_t pid = ctl->rpid; ++ int ret = -1; ++ struct trace_flag tf = { .key = pid, .flag = TRACE_SYSCALL_ENTER }; ++ struct trace_flag *tf_ptr[] = { &tf }; ++ struct bisect_meta meta = { ++ .size = 1, ++ .used = 1, ++ .__data = &tf, ++ .data = tf_ptr, ++ }; ++ ++ /* ++ * Here it parasite code. Unlike trap code `compel_stop_pie()`, it ++ * won't let tracee forget the original syscall. In such way, tracer ++ * just trace the syscall called by tracee. The log likes the following: ++ * ++ * [ 817.638332] set pid 1877 ptrace sysno 215 ++ * [ 817.638343] syscall_trace_enter: pid 1877 ptrace_sysno 0 current_sysno 215 ++ * [ 817.638363] (00.006280) Error (compel/src/lib/infect.c:1582): 1877 (native) is going to execute the syscall 215, required is 215 ++ * [ 817.638368] set pid 1877 ptrace sysno 0 ++ * [ 817.638402] syscall_trace_exit: pid 1877 ptrace_sysno 0 current_sysno 215 ++ */ ++ ret = parasite_run(pid, PTRACE_SYSCALL, addr, ctl->rstack, ®s, ++ &ctl->orig, (void *)(long)__NR(munmap, 0)); ++ if (ret) ++ goto err; ++ ++ ret = compel_stop_on_syscall_customize(1, __NR(munmap, 0), 0, &meta); ++ ++ if (restore_thread_ctx(pid, &ctl->orig)) ++ ret = -1; ++err: ++ return ret; ++} ++ + int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp) + { + int ret; +@@ -1505,6 +1572,17 @@ int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp) + return 0; + } + ++int compel_stop_pie_customize(pid_t pid, const int sys_nr, struct trace_flag *tf) ++{ ++ if (ptrace(PTRACE_SYSCALL, pid, sys_nr, NULL)) { ++ pr_perror("Unable to restart the %d process", pid); ++ return -1; ++ } ++ ++ tf->flag = TRACE_SYSCALL_ENTER; ++ return 0; ++} ++ + static bool task_is_trapped(int status, pid_t 
pid) + { + if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP) +@@ -1617,6 +1695,71 @@ goon: + return 0; + } + ++int compel_stop_on_syscall_customize(int tasks, const int sys_nr, ++ const int exit_sys_nr, struct bisect_meta *meta) ++{ ++ struct trace_flag *tf; ++ user_regs_struct_t regs; ++ int status, ret; ++ pid_t pid; ++ ++ while (tasks) { ++ pid = wait4(-1, &status, __WALL, NULL); ++ if (pid == -1) { ++ pr_perror("wait4 failed"); ++ return -1; ++ } ++ ++ if (!task_is_trapped(status, pid)) ++ return -1; ++ ++ tf = tf_bisect(meta, pid); ++ if (tf == NULL) { ++ pr_err("Can't find ptrace status for %d\n", pid); ++ return -1; ++ } ++ ++ switch (tf->flag) { ++ case TRACE_SYSCALL_ENTER: ++ pr_debug("%d was trapped\n", pid); ++ pr_debug("`- Expecting exit\n"); ++ ++ ret = ptrace_get_regs(pid, ®s); ++ if (ret) { ++ pr_perror("ptrace"); ++ return -1; ++ } ++ ++ if (is_required_syscall(®s, pid, sys_nr, sys_nr)) { ++ ret = ptrace(PTRACE_SYSCALL, pid, exit_sys_nr, NULL); ++ if (ret) { ++ pr_perror("ptrace"); ++ return -1; ++ } ++ tf->flag = TRACE_SYSCALL_EXIT; ++ } else { ++ pr_warn("Impossible condition, check the system, try our best to restore...\n"); ++ ret = ptrace(PTRACE_SYSCALL, pid, sys_nr, NULL); ++ if (ret) { ++ pr_perror("ptrace"); ++ return -1; ++ } ++ } ++ break; ++ case TRACE_SYSCALL_EXIT: ++ pr_debug("%d was stopped\n", pid); ++ tasks--; ++ break; ++ ++ default: ++ pr_err("pid %d invalid status: %d\n", pid, tf->flag); ++ return -1; ++ } ++ } ++ ++ return 0; ++} ++ + int compel_mode_native(struct parasite_ctl *ctl) + { + return user_regs_native(&ctl->orig.regs); +diff --git a/criu/config.c b/criu/config.c +index e09445a..b0d3639 100644 +--- a/criu/config.c ++++ b/criu/config.c +@@ -555,6 +555,7 @@ int parse_options(int argc, char **argv, bool *usage_error, + {"reserve-ports", required_argument, 0, 'P' }, + BOOL_OPT("use-nft", &opts.use_nft), + BOOL_OPT("parallel", &opts.parallel), ++ BOOL_OPT("customize-ptrace", &opts.customize_ptrace), + { }, + }; + +diff 
--git a/criu/cr-dump.c b/criu/cr-dump.c +index 24b0ef5..21ae657 100644 +--- a/criu/cr-dump.c ++++ b/criu/cr-dump.c +@@ -1428,7 +1428,7 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) + goto err_cure; + } + +- ret = compel_stop_daemon(parasite_ctl); ++ ret = compel_stop_daemon(parasite_ctl, opts.customize_ptrace); + if (ret) { + pr_err("Can't stop daemon in parasite (pid: %d)\n", pid); + goto err_cure; +diff --git a/criu/cr-restore.c b/criu/cr-restore.c +index 83bdc38..33f56a3 100644 +--- a/criu/cr-restore.c ++++ b/criu/cr-restore.c +@@ -2171,6 +2171,64 @@ static int catch_tasks(bool root_seized, enum trace_flags *flag) + return 0; + } + ++static int cache_tasks_customize(bool root_seized, struct bisect_meta *meta) ++{ ++ struct pstree_item *item; ++ struct trace_flag *tf; ++ ++ for_each_pstree_item(item) { ++ int status, i, ret; ++ pid_t pid; ++ ++ if (!task_alive(item)) ++ continue; ++ ++ if (item->nr_threads == 1) { ++ item->threads[0].real = item->pid->real; ++ } else { ++ if (parse_threads(item->pid->real, &item->threads, &item->nr_threads)) ++ return -1; ++ } ++ ++ for (i = 0; i < item->nr_threads; i++) { ++ pid = item->threads[i].real; ++ ++ if (ptrace(PTRACE_INTERRUPT, pid, 0, 0)) { ++ pr_perror("Can't interrupt the %d task", pid); ++ return -1; ++ } ++ ++ tf = tf_insert(meta, pid); ++ if (tf == NULL) { ++ pr_err("Can't find trace flag for %d, used %d\n", ++ pid, meta->used); ++ return -1; ++ } ++ tf->flag = TRACE_INTERRUPT; ++ } ++ ++ for (i = 0; i < item->nr_threads; i++) { ++ pid = wait4(-1, &status, __WALL, NULL); ++ ++ tf = tf_bisect(meta, pid); ++ if (tf == NULL) { ++ pr_err("Can't find trace flag for %d, used %d\n", ++ pid, meta->used); ++ return -1; ++ } ++ ++ ret = compel_stop_pie_customize(pid, ++ __NR(rt_sigreturn, 0), ++ tf); ++ if (ret < 0) ++ return -1; ++ ++ } ++ } ++ ++ return 0; ++} ++ + static int clear_breakpoints(void) + { + struct pstree_item *item; +@@ -2197,6 +2255,7 @@ static void 
finalize_restore(void) + pid_t pid = item->pid->real; + struct parasite_ctl *ctl; + unsigned long restorer_addr; ++ int retval; + + if (!task_alive(item)) + continue; +@@ -2207,7 +2266,12 @@ static void finalize_restore(void) + continue; + + restorer_addr = (unsigned long)rsti(item)->munmap_restorer; +- if (compel_unmap(ctl, restorer_addr)) ++ if (!opts.customize_ptrace) ++ retval = compel_unmap(ctl, restorer_addr); ++ else ++ retval = compel_unmap_customize(ctl, restorer_addr); ++ ++ if (retval) + pr_err("Failed to unmap restorer from %d\n", pid); + + xfree(ctl); +@@ -2312,11 +2376,18 @@ static int write_restored_pid(void) + + static int restore_root_task(struct pstree_item *init) + { ++ struct bisect_meta tfs_meta; + enum trace_flags flag = TRACE_ALL; + int ret, fd, mnt_ns_fd = -1; + int root_seized = 0; + struct pstree_item *item; + ++ if (opts.customize_ptrace ++ && tf_create(&tfs_meta, task_entries->nr_threads) != 0) { ++ pr_err("Can't alloc memory, tf_create failed\n"); ++ return -1; ++ } ++ + ret = run_scripts(ACT_PRE_RESTORE); + if (ret != 0) { + pr_err("Aborting restore due to pre-restore script ret code %d\n", ret); +@@ -2521,7 +2592,12 @@ skip_ns_bouncing: + + timing_stop(TIME_RESTORE); + +- if (catch_tasks(root_seized, &flag)) { ++ if (!opts.customize_ptrace) ++ ret = catch_tasks(root_seized, &flag); ++ else ++ ret = cache_tasks_customize(root_seized, &tfs_meta); ++ ++ if (ret) { + pr_err("Can't catch all tasks\n"); + goto out_kill_network_unlocked; + } +@@ -2531,8 +2607,14 @@ skip_ns_bouncing: + + __restore_switch_stage(CR_STATE_COMPLETE); + +- ret = compel_stop_on_syscall(task_entries->nr_threads, +- __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag); ++ if (!opts.customize_ptrace) { ++ ret = compel_stop_on_syscall(task_entries->nr_threads, ++ __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag); ++ } else { ++ ret = compel_stop_on_syscall_customize(task_entries->nr_threads, ++ __NR(rt_sigreturn, 0), ++ -1, &tfs_meta); ++ } + if (ret) { + 
pr_err("Can't stop all tasks on rt_sigreturn\n"); + goto out_kill_network_unlocked; +@@ -2571,6 +2653,9 @@ skip_ns_bouncing: + if (!opts.restore_detach && !opts.exec_cmd) + wait(NULL); + ++ if (opts.customize_ptrace) ++ tf_destroy(&tfs_meta); ++ + return 0; + + out_kill_network_unlocked: +@@ -2605,6 +2690,10 @@ out: + stop_usernsd(); + __restore_switch_stage(CR_STATE_FAIL); + pr_err("Restoring FAILED.\n"); ++ ++ if (opts.customize_ptrace) ++ tf_destroy(&tfs_meta); ++ + return -1; + } + +diff --git a/criu/crtools.c b/criu/crtools.c +index 5127cde..0ef76b5 100644 +--- a/criu/crtools.c ++++ b/criu/crtools.c +@@ -491,6 +491,7 @@ usage: + " --reserve-ports Reserve src ports in kernel\n" + " --use-nft Use nft API instead of iptables cmd in network locking\n" + " --parallel Parallel to accellrate dumping speed\n" ++" --customize-ptrace Use customize ptrace(PTRACE_SYSCALL)\n" + "\n" + "Check options:\n" + " Without options, \"criu check\" checks availability of absolutely required\n" +diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h +index cc8d1ae..b7080ca 100644 +--- a/criu/include/cr_options.h ++++ b/criu/include/cr_options.h +@@ -189,6 +189,7 @@ struct cr_options { + int reserve_ports; + int use_nft; + int parallel; ++ int customize_ptrace; + }; + + extern struct cr_options opts; +-- +2.34.0 + diff --git a/criu.spec b/criu.spec index 11ef92799b7d791b9e3b4b34d4c783505ec338ad..c510d0e8d753b96b8ad277c4d8620e58b1d39e88 100644 --- a/criu.spec +++ b/criu.spec @@ -1,27 +1,76 @@ Name: criu Version: 3.15 -Release: 3 +Release: 4 Provides: crtools = %{version}-%{release} Obsoletes: crtools <= 1.0-2 Summary: A tool of Checkpoint/Restore in User-space License: GPL-2.0-or-later or LGPL-2.1-only URL: http://criu.org/ Source0: http://download.openvz.org/criu/criu-%{version}.tar.bz2 -BuildRequires: systemd libnet-devel asciidoc xmlto perl-interpreter libselinux-devel -BuildRequires: protobuf-devel protobuf-c-devel python3-devel libnl3-devel libcap-devel +BuildRequires: 
systemd libnet-devel asciidoc xmlto perl-interpreter libselinux-devel gcc +BuildRequires: protobuf-devel protobuf-c-devel python3-devel libnl3-devel libcap-devel libnftnl-devel libmnl-devel Recommends: tar ExclusiveArch: x86_64 %{arm} ppc64le aarch64 s390x Requires: %{name} = %{version}-%{release} Provides: %{name}-libs = %{version}-%{release} Obsoletes: %{name}-libs < %{version}-%{release} -Patch0001: 0001-Fix-crit-encode-TypeError.patch -Patch0002: 0002-Fix-crit-info-struct-unpack-error.patch -Patch0003: 0003-Fix-crit-x-UnicodeDecodeError.patch -Patch0004: 0004-criu-dump-and-restore-cpu-affinity-of-each-thread.patch -Patch0005: 0005-vdso-fix-segmentation-fault-caused-by-char-pointer-a.patch -Patch0006: 0006-criu-add-pin-memory-method.patch -Patch0007: 0007-criu-add-pid-recover-method-for-criu.patch +Patch: 0001-Fix-crit-encode-TypeError.patch +Patch: 0002-Fix-crit-info-struct-unpack-error.patch +Patch: 0003-Fix-crit-x-UnicodeDecodeError.patch +Patch: 0005-vdso-fix-segmentation-fault-caused-by-char-pointer-a.patch +%ifarch aarch64 +Patch: backport-0001--criu-dump-and-restore-cpu-affinity-of-each-thread.patch +Patch: backport-0002--build-add-secure-compilation-options.patch +Patch: backport-0003--tty-fix-NULL-pointer-access-in-tty.patch +Patch: backport-0004--namespaces-drop-func-address-print-to-make-someone-h.patch +Patch: backport-0005--mm-add-pin-memory-method-for-criu.patch +Patch: backport-0006--pid-add-pid-recover-method-for-criu.patch +Patch: backport-0007--notifier-add-notifier-calling-method-for-checkpoint-.patch +Patch: backport-0008--cred-provide-cred-checkpoint-restore-method.patch +Patch: backport-0009--block-device-dump-block-device-as-reguler-file.patch +Patch: backport-0010--anon-inode-add-support-for-anon-inode-fd.patch +Patch: backport-0011--char_dev-add-support-for-char-device-dump-and-restor.patch +Patch: backport-0012--socket-fix-connect-error-of-invalid-param.patch +Patch: 
backport-0013--criu-eventpollfd-fix-for-improper-usage-in-appdata.patch +Patch: backport-0014--task_exit_notify-add-task-exit-notify-mask-method-fo.patch +Patch: backport-0015--selinux-fix-selinux-context-lable-check.patch +Patch: backport-0016--unix-socket-add-support-for-unix-stream-socket.patch +Patch: backport-0017--save-and-restore-sigev_notify_thread_id.patch +Patch: backport-0018--sysvshm-add-dump-restore-sysv-shm-in-host-ipc-ns.patch +Patch: backport-0019--add-netlink-repair-modes.patch +Patch: backport-0020--looser-file-mode-and-size-check.patch +Patch: backport-0021--ignore-special-page-dump.patch +Patch: backport-0022--add-O_REPAIR-flag-to-vma-fd.patch +Patch: backport-0023--file-lock-add-repair-mode-to-dump-file-locks.patch +Patch: backport-0024--unlock-network-when-restore-fails.patch +Patch: backport-0025--net-add-shared-socket-recover-method-for-criu.patch +Patch: backport-0026--clean-repair-res-when-dump-fail.patch +Patch: backport-0027--save-src-ports-to-ip_local_reserved_ports-when-dump-.patch +Patch: backport-0028--fix-dump-fail-problem-with-null-seek-op.patch +Patch: backport-0029--fix-dump-fail-problem-with-no-access-to-get-socket-f.patch +Patch: backport-0030--proc-parse-fix-vma-offset-value-for-the-sysfs-file-o.patch +Patch: backport-0031--add-reuse-file-method-for-recover-deleted-file-state.patch +Patch: backport-0032--fix-share-sockets-repair-problem.patch +Patch: backport-0033--nftables-add-mnl-api.patch +Patch: backport-0034--nftables-implement-nft-api-for-tcp.patch +Patch: backport-0035--nftables-implement-nft-api-for-lock-net-ns.patch +Patch: backport-0036--criu-switch-to-nftables-api.patch +Patch: backport-0037--remove-sigaction-handler-register-in-restorer.patch +Patch: backport-0038--remove-ignore_special_dump-option.patch +Patch: backport-0039--add-clear-pin-mem-and-init-page-map-option.patch +Patch: backport-0040--mmap-restore-dev-hisi_sec2-deivce-vma.patch +Patch: backport-0041--fix-fds-list-restore-and-rollback-problem.patch 
+Patch: backport-0042--log-print-error-log-to-dev-kmsg.patch +Patch: backport-0043--improve-char-dev-fd-check-and-repair-method.patch +Patch: backport-0044--unix-sk-improve-dgram-robustness.patch +Patch: backport-0045--sk-ignore-the-bind-error-for-icmp-socket.patch +Patch: backport-0046--infiniband-fix-the-infiniband-fd-conflict.patch +Patch: backport-0047--optimization-parallel-collecting-vmas.patch +Patch: backport-0048--dump-ignore-children-exit-to-accelerate-speed.patch +Patch: backport-0049--parallel-parallel-nft-delete-set.patch +Patch: backport-0050--ptrace-trace-specific-syscall.patch +%endif %description Checkpoint/Restore in Userspace(CRIU),is a software tool for the linux operating system. @@ -95,6 +144,10 @@ chmod 0755 %{buildroot}/run/%{name}/ %doc %{_mandir}/man1/{compel.1*,crit.1*} %changelog +* Wed Dec 01 2021 fu.lin - 3.15-4 +- backport kinds of feature/bugfix for the module upgrade of OceanStor Dorado +- add buildrequires gcc + * Fri Jul 23 2021 snoweay - 3.15-3 - Add pid recover method for criu