From a0a80800166bad4c7cd04d6f8d24fbe3e6971910 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 25 Jun 2021 14:21:25 +0800 Subject: [PATCH] merge --- CRIU_code/.gitignore | 44 + CRIU_code/.mailmap | 6 + CRIU_code/.travis.yml | 42 + CRIU_code/COPYING | 860 ++++ CRIU_code/CREDITS | 16 + CRIU_code/Documentation/.gitattributes | 1 + CRIU_code/Documentation/.gitignore | 6 + CRIU_code/Documentation/HOWTO.cross-compile | 39 + CRIU_code/Documentation/Makefile | 98 + CRIU_code/Documentation/asciidoc.conf | 1 + CRIU_code/Documentation/crit.txt | 58 + CRIU_code/Documentation/criu.txt | 748 ++++ CRIU_code/Documentation/custom.xsl | 8 + CRIU_code/INSTALL.md | 32 + CRIU_code/Makefile | 400 ++ CRIU_code/Makefile.compel | 77 + CRIU_code/Makefile.config | 71 + CRIU_code/Makefile.install | 57 + CRIU_code/Makefile.versions | 31 + README.en.md => CRIU_code/README.en.md | 4 +- CRIU_code/README.md | 121 + CRIU_code/README.md.orig | 122 + CRIU_code/compel/.gitignore | 14 + CRIU_code/compel/Makefile | 81 + .../aarch64/plugins/include/asm/prologue.h | 1 + .../plugins/include/asm/syscall-types.h | 28 + .../arch/aarch64/plugins/include/features.h | 4 + .../arch/aarch64/plugins/std/parasite-head.S | 20 + .../plugins/std/syscalls/Makefile.syscalls | 1 + .../plugins/std/syscalls/gen-sys-exec-tbl.pl | 1 + .../plugins/std/syscalls/gen-syscalls.pl | 1 + .../plugins/std/syscalls/syscall-aux.S | 37 + .../plugins/std/syscalls/syscall-aux.h | 3 + .../plugins/std/syscalls/syscall-common.S | 19 + .../aarch64/plugins/std/syscalls/syscall.def | 1 + .../arch/aarch64/scripts/compel-pack.lds.S | 36 + CRIU_code/compel/arch/aarch64/src/lib/cpu.c | 106 + .../arch/aarch64/src/lib/handle-elf-host.c | 1 + .../compel/arch/aarch64/src/lib/handle-elf.c | 35 + .../compel/arch/aarch64/src/lib/include/cpu.h | 6 + .../arch/aarch64/src/lib/include/handle-elf.h | 11 + .../arch/aarch64/src/lib/include/syscall.h | 4 + .../src/lib/include/uapi/asm/.gitignore | 0 .../src/lib/include/uapi/asm/breakpoints.h | 15 + 
.../aarch64/src/lib/include/uapi/asm/cpu.h | 487 +++ .../aarch64/src/lib/include/uapi/asm/fpu.h | 4 + .../src/lib/include/uapi/asm/infect-types.h | 32 + .../lib/include/uapi/asm/processor-flags.h | 4 + .../src/lib/include/uapi/asm/sigframe.h | 69 + .../compel/arch/aarch64/src/lib/infect.c | 178 + .../arch/arm/plugins/include/asm/prologue.h | 1 + .../arm/plugins/include/asm/syscall-types.h | 28 + .../arch/arm/plugins/include/features.h | 4 + .../arch/arm/plugins/std/parasite-head.S | 22 + .../plugins/std/syscalls/Makefile.syscalls | 59 + .../plugins/std/syscalls/gen-sys-exec-tbl.pl | 43 + .../arm/plugins/std/syscalls/gen-syscalls.pl | 99 + .../arm/plugins/std/syscalls/syscall-aux.S | 13 + .../arm/plugins/std/syscalls/syscall-aux.h | 27 + .../arm/plugins/std/syscalls/syscall-common.S | 34 + .../arch/arm/plugins/std/syscalls/syscall.def | 113 + .../compel/arch/arm/scripts/compel-pack.lds.S | 36 + CRIU_code/compel/arch/arm/src/lib/cpu.c | 1 + .../compel/arch/arm/src/lib/handle-elf-host.c | 1 + .../compel/arch/arm/src/lib/handle-elf.c | 22 + .../compel/arch/arm/src/lib/include/cpu.h | 0 .../arch/arm/src/lib/include/handle-elf.h | 11 + .../compel/arch/arm/src/lib/include/syscall.h | 4 + .../arm/src/lib/include/uapi/asm/.gitignore | 0 .../src/lib/include/uapi/asm/breakpoints.h | 15 + .../arch/arm/src/lib/include/uapi/asm/cpu.h | 6 + .../arch/arm/src/lib/include/uapi/asm/fpu.h | 4 + .../src/lib/include/uapi/asm/infect-types.h | 66 + .../lib/include/uapi/asm/processor-flags.h | 42 + .../arm/src/lib/include/uapi/asm/sigframe.h | 90 + CRIU_code/compel/arch/arm/src/lib/infect.c | 195 + .../arch/ppc64/plugins/include/asm/prologue.h | 1 + .../ppc64/plugins/include/asm/syscall-types.h | 28 + .../arch/ppc64/plugins/include/features.h | 7 + .../compel/arch/ppc64/plugins/std/memcmp.S | 236 + .../compel/arch/ppc64/plugins/std/memcpy.S | 212 + .../arch/ppc64/plugins/std/parasite-head.S | 45 + .../plugins/std/syscalls/Makefile.syscalls | 57 + .../std/syscalls/syscall-common-ppc64.S | 
24 + .../plugins/std/syscalls/syscall-ppc64.tbl | 109 + .../arch/ppc64/scripts/compel-pack.lds.S | 40 + CRIU_code/compel/arch/ppc64/src/lib/cpu.c | 79 + .../arch/ppc64/src/lib/handle-elf-host.c | 1 + .../compel/arch/ppc64/src/lib/handle-elf.c | 35 + .../compel/arch/ppc64/src/lib/include/cpu.h | 0 .../arch/ppc64/src/lib/include/handle-elf.h | 13 + .../arch/ppc64/src/lib/include/syscall.h | 4 + .../ppc64/src/lib/include/uapi/asm/.gitignore | 0 .../src/lib/include/uapi/asm/breakpoints.h | 15 + .../arch/ppc64/src/lib/include/uapi/asm/cpu.h | 10 + .../arch/ppc64/src/lib/include/uapi/asm/fpu.h | 4 + .../src/lib/include/uapi/asm/infect-types.h | 86 + .../lib/include/uapi/asm/processor-flags.h | 4 + .../src/lib/include/uapi/asm/processor.h | 4 + .../ppc64/src/lib/include/uapi/asm/sigframe.h | 79 + CRIU_code/compel/arch/ppc64/src/lib/infect.c | 481 +++ .../arch/riscv/plugins/include/asm/prologue.h | 1 + .../riscv/plugins/include/asm/syscall-types.h | 29 + .../arch/riscv/plugins/include/features.h | 6 + .../compel/arch/riscv/plugins/std/memcpy.S | 18 + .../arch/riscv/plugins/std/parasite-head.S | 11 + .../plugins/std/syscalls/Makefile.syscalls | 117 + .../std/syscalls/syscall-common-riscv-64.S | 12 + .../riscv/plugins/std/syscalls/syscall_64.tbl | 110 + .../arch/riscv/scripts/compel-pack.lds.S | 33 + CRIU_code/compel/arch/riscv/src/lib/cpu.c | 37 + .../compel/arch/riscv/src/lib/handle-elf.c | 11 + .../arch/riscv/src/lib/include/handle-elf.h | 8 + .../arch/riscv/src/lib/include/syscall.h | 7 + .../src/lib/include/uapi/asm/breakpoints.h | 6 + .../arch/riscv/src/lib/include/uapi/asm/cpu.h | 5 + .../arch/riscv/src/lib/include/uapi/asm/fpu.h | 4 + .../src/lib/include/uapi/asm/infect-types.h | 103 + .../riscv/src/lib/include/uapi/asm/sigframe.h | 76 + .../riscv/src/lib/include/uapi/asm/siginfo.h | 116 + CRIU_code/compel/arch/riscv/src/lib/infect.c | 241 ++ .../arch/s390/plugins/include/asm/prologue.h | 1 + .../s390/plugins/include/asm/syscall-types.h | 34 + 
.../arch/s390/plugins/std/parasite-head.S | 26 + .../plugins/std/syscalls/Makefile.syscalls | 58 + .../std/syscalls/syscall-common-s390.S | 37 + .../plugins/std/syscalls/syscall-s390.tbl | 109 + .../s390/plugins/std/syscalls/syscalls-s390.c | 26 + .../arch/s390/scripts/compel-pack.lds.S | 40 + CRIU_code/compel/arch/s390/src/lib/cpu.c | 78 + .../arch/s390/src/lib/handle-elf-host.c | 1 + .../compel/arch/s390/src/lib/handle-elf.c | 22 + .../arch/s390/src/lib/include/handle-elf.h | 13 + .../arch/s390/src/lib/include/syscall.h | 8 + .../src/lib/include/uapi/asm/breakpoints.h | 15 + .../arch/s390/src/lib/include/uapi/asm/cpu.h | 10 + .../arch/s390/src/lib/include/uapi/asm/fpu.h | 14 + .../src/lib/include/uapi/asm/infect-types.h | 87 + .../s390/src/lib/include/uapi/asm/sigframe.h | 80 + CRIU_code/compel/arch/s390/src/lib/infect.c | 715 +++ .../arch/x86/plugins/include/asm/prologue.h | 36 + .../x86/plugins/include/asm/syscall-types.h | 60 + .../arch/x86/plugins/include/features.h | 6 + .../compel/arch/x86/plugins/std/memcpy.S | 28 + .../arch/x86/plugins/std/parasite-head.S | 52 + .../compel/arch/x86/plugins/std/prologue.S | 33 + .../plugins/std/syscalls/Makefile.syscalls | 122 + .../std/syscalls/syscall-common-x86-32.S | 36 + .../std/syscalls/syscall-common-x86-64.S | 21 + .../arch/x86/plugins/std/syscalls/syscall32.c | 85 + .../x86/plugins/std/syscalls/syscall_32.tbl | 97 + .../x86/plugins/std/syscalls/syscall_64.tbl | 108 + .../arch/x86/scripts/compel-pack-compat.lds.S | 41 + .../compel/arch/x86/scripts/compel-pack.lds.S | 41 + CRIU_code/compel/arch/x86/src/lib/cpu.c | 489 +++ .../compel/arch/x86/src/lib/handle-elf-host.c | 1 + .../compel/arch/x86/src/lib/handle-elf.c | 22 + .../compel/arch/x86/src/lib/include/cpu.h | 67 + .../arch/x86/src/lib/include/handle-elf.h | 22 + .../compel/arch/x86/src/lib/include/syscall.h | 13 + .../x86/src/lib/include/uapi/asm/.gitignore | 0 .../src/lib/include/uapi/asm/breakpoints.h | 6 + .../arch/x86/src/lib/include/uapi/asm/cpu.h | 350 ++ 
.../arch/x86/src/lib/include/uapi/asm/fpu.h | 321 ++ .../src/lib/include/uapi/asm/infect-types.h | 127 + .../lib/include/uapi/asm/processor-flags.h | 28 + .../x86/src/lib/include/uapi/asm/sigframe.h | 220 + CRIU_code/compel/arch/x86/src/lib/infect.c | 592 +++ CRIU_code/compel/compel-host | 8 + CRIU_code/compel/include/compel-cpu.h | 12 + CRIU_code/compel/include/elf32-types.h | 16 + CRIU_code/compel/include/elf64-types.h | 16 + CRIU_code/compel/include/errno.h | 9 + CRIU_code/compel/include/infect-priv.h | 71 + CRIU_code/compel/include/log.h | 64 + CRIU_code/compel/include/piegen.h | 28 + CRIU_code/compel/include/ptrace.h | 13 + CRIU_code/compel/include/rpc-pie-priv.h | 50 + CRIU_code/compel/include/shmem.h | 10 + CRIU_code/compel/include/uapi/asm | 1 + CRIU_code/compel/include/uapi/common | 1 + CRIU_code/compel/include/uapi/compel | 1 + CRIU_code/compel/include/uapi/compel.h | 14 + CRIU_code/compel/include/uapi/cpu.h | 17 + CRIU_code/compel/include/uapi/handle-elf.h | 15 + CRIU_code/compel/include/uapi/infect-rpc.h | 17 + CRIU_code/compel/include/uapi/infect-util.h | 6 + CRIU_code/compel/include/uapi/infect.h | 171 + CRIU_code/compel/include/uapi/ksigset.h | 25 + CRIU_code/compel/include/uapi/log.h | 11 + CRIU_code/compel/include/uapi/loglevels.h | 20 + CRIU_code/compel/include/uapi/plugins | 1 + CRIU_code/compel/include/uapi/plugins.h | 35 + CRIU_code/compel/include/uapi/ptrace.h | 82 + .../compel/include/uapi/sigframe-common.h | 62 + CRIU_code/compel/include/uapi/task-state.h | 19 + CRIU_code/compel/plugins/Makefile | 102 + CRIU_code/compel/plugins/fds/fds.c | 25 + CRIU_code/compel/plugins/include/std-priv.h | 6 + .../compel/plugins/include/uapi/plugin-fds.h | 7 + CRIU_code/compel/plugins/include/uapi/shmem.h | 17 + CRIU_code/compel/plugins/include/uapi/std.h | 11 + .../plugins/include/uapi/std/asm/.gitignore | 1 + .../compel/plugins/include/uapi/std/fds.h | 7 + .../compel/plugins/include/uapi/std/infect.h | 20 + .../compel/plugins/include/uapi/std/log.h | 15 + 
.../compel/plugins/include/uapi/std/string.h | 32 + .../plugins/include/uapi/std/syscall-types.h | 72 + CRIU_code/compel/plugins/shmem/shmem.c | 38 + CRIU_code/compel/plugins/std/fds.c | 16 + CRIU_code/compel/plugins/std/infect.c | 207 + CRIU_code/compel/plugins/std/log.c | 360 ++ CRIU_code/compel/plugins/std/std.c | 85 + CRIU_code/compel/plugins/std/string.c | 302 ++ CRIU_code/compel/src/lib/handle-elf-host.c | 1 + CRIU_code/compel/src/lib/handle-elf.c | 650 +++ CRIU_code/compel/src/lib/infect-rpc.c | 101 + CRIU_code/compel/src/lib/infect-util.c | 32 + CRIU_code/compel/src/lib/infect.c | 1589 +++++++ CRIU_code/compel/src/lib/log-host.c | 1 + CRIU_code/compel/src/lib/log.c | 38 + CRIU_code/compel/src/lib/ptrace.c | 99 + CRIU_code/compel/src/main-host.c | 1 + CRIU_code/compel/src/main.c | 420 ++ CRIU_code/compel/test/fdspy/.gitignore | 4 + CRIU_code/compel/test/fdspy/Makefile | 28 + CRIU_code/compel/test/fdspy/parasite.c | 20 + CRIU_code/compel/test/fdspy/spy.c | 169 + CRIU_code/compel/test/fdspy/victim.c | 12 + CRIU_code/compel/test/infect/.gitignore | 4 + CRIU_code/compel/test/infect/Makefile | 28 + CRIU_code/compel/test/infect/parasite.c | 33 + CRIU_code/compel/test/infect/spy.c | 178 + CRIU_code/compel/test/infect/victim.c | 16 + CRIU_code/compel/test/rsys/.gitignore | 2 + CRIU_code/compel/test/rsys/Makefile | 16 + CRIU_code/compel/test/rsys/spy.c | 136 + CRIU_code/compel/test/rsys/victim.c | 16 + CRIU_code/contrib/debian/dev-packages.lst | 20 + CRIU_code/contrib/docker_cr.sh | 466 ++ CRIU_code/coredump/criu-coredump | 40 + CRIU_code/coredump/criu_coredump/.gitignore | 1 + CRIU_code/coredump/criu_coredump/__init__.py | 2 + CRIU_code/coredump/criu_coredump/coredump.py | 830 ++++ CRIU_code/coredump/criu_coredump/elf.py | 526 +++ CRIU_code/coredump/pycriu | 1 + CRIU_code/crit/Makefile | 13 + CRIU_code/crit/crit-python2 | 6 + CRIU_code/crit/crit-python3 | 6 + CRIU_code/crit/pycriu | 1 + CRIU_code/criu/Makefile | 135 + CRIU_code/criu/Makefile.crtools | 105 + 
CRIU_code/criu/Makefile.packages | 53 + CRIU_code/criu/action-scripts.c | 171 + CRIU_code/criu/aio.c | 152 + CRIU_code/criu/arch/aarch64/Makefile | 8 + CRIU_code/criu/arch/aarch64/bitops.S | 18 + CRIU_code/criu/arch/aarch64/cpu.c | 125 + CRIU_code/criu/arch/aarch64/crtools.c | 139 + .../criu/arch/aarch64/include/asm/dump.h | 16 + CRIU_code/criu/arch/aarch64/include/asm/int.h | 6 + .../criu/arch/aarch64/include/asm/kerndat.h | 7 + .../aarch64/include/asm/parasite-syscall.h | 6 + .../criu/arch/aarch64/include/asm/parasite.h | 11 + .../criu/arch/aarch64/include/asm/restore.h | 28 + .../criu/arch/aarch64/include/asm/restorer.h | 76 + .../criu/arch/aarch64/include/asm/types.h | 32 + .../criu/arch/aarch64/include/asm/vdso.h | 31 + CRIU_code/criu/arch/aarch64/intraprocedure.S | 22 + CRIU_code/criu/arch/aarch64/restorer.c | 14 + CRIU_code/criu/arch/aarch64/sigframe.c | 9 + CRIU_code/criu/arch/aarch64/vdso-pie.c | 34 + CRIU_code/criu/arch/arm/Makefile | 8 + CRIU_code/criu/arch/arm/aeabi-helpers.S | 96 + CRIU_code/criu/arch/arm/bitops.S | 24 + CRIU_code/criu/arch/arm/cpu.c | 40 + CRIU_code/criu/arch/arm/crtools.c | 142 + CRIU_code/criu/arch/arm/include/asm/dump.h | 16 + CRIU_code/criu/arch/arm/include/asm/int.h | 6 + CRIU_code/criu/arch/arm/include/asm/kerndat.h | 7 + .../arch/arm/include/asm/parasite-syscall.h | 6 + .../criu/arch/arm/include/asm/parasite.h | 9 + CRIU_code/criu/arch/arm/include/asm/restore.h | 29 + .../criu/arch/arm/include/asm/restorer.h | 89 + CRIU_code/criu/arch/arm/include/asm/types.h | 31 + CRIU_code/criu/arch/arm/include/asm/vdso.h | 17 + CRIU_code/criu/arch/arm/pie-cacheflush.c | 7 + CRIU_code/criu/arch/arm/restorer.c | 73 + CRIU_code/criu/arch/arm/sigframe.c | 9 + CRIU_code/criu/arch/arm/vdso-pie.c | 58 + CRIU_code/criu/arch/ppc64/Makefile | 7 + CRIU_code/criu/arch/ppc64/cpu.c | 142 + CRIU_code/criu/arch/ppc64/crtools.c | 505 +++ CRIU_code/criu/arch/ppc64/include/asm/dump.h | 13 + CRIU_code/criu/arch/ppc64/include/asm/int.h | 6 + 
.../criu/arch/ppc64/include/asm/kerndat.h | 7 + .../arch/ppc64/include/asm/parasite-syscall.h | 6 + .../criu/arch/ppc64/include/asm/parasite.h | 7 + .../criu/arch/ppc64/include/asm/restore.h | 31 + .../criu/arch/ppc64/include/asm/restorer.h | 75 + CRIU_code/criu/arch/ppc64/include/asm/types.h | 42 + CRIU_code/criu/arch/ppc64/include/asm/vdso.h | 28 + CRIU_code/criu/arch/ppc64/misc.S | 197 + CRIU_code/criu/arch/ppc64/restorer.c | 62 + CRIU_code/criu/arch/ppc64/sigframe.c | 48 + CRIU_code/criu/arch/ppc64/vdso-pie.c | 154 + CRIU_code/criu/arch/ppc64/vdso-trampoline.S | 11 + CRIU_code/criu/arch/riscv/Makefile | 14 + CRIU_code/criu/arch/riscv/cpu.c | 53 + CRIU_code/criu/arch/riscv/crtools.c | 240 ++ CRIU_code/criu/arch/riscv/include/asm/dump.h | 14 + CRIU_code/criu/arch/riscv/include/asm/int.h | 6 + .../criu/arch/riscv/include/asm/kerndat.h | 7 + .../arch/riscv/include/asm/parasite-syscall.h | 8 + .../criu/arch/riscv/include/asm/parasite.h | 9 + .../criu/arch/riscv/include/asm/restore.h | 28 + .../criu/arch/riscv/include/asm/restorer.h | 79 + .../criu/arch/riscv/include/asm/syscall32.h | 17 + CRIU_code/criu/arch/riscv/include/asm/types.h | 31 + CRIU_code/criu/arch/riscv/include/asm/vdso.h | 19 + CRIU_code/criu/arch/riscv/restorer.c | 43 + CRIU_code/criu/arch/riscv/sigaction_compat.c | 18 + CRIU_code/criu/arch/riscv/sigframe.c | 13 + CRIU_code/criu/arch/riscv/vdso-pie.c | 56 + CRIU_code/criu/arch/s390/Makefile | 7 + CRIU_code/criu/arch/s390/cpu.c | 158 + CRIU_code/criu/arch/s390/crtools.c | 782 ++++ CRIU_code/criu/arch/s390/include/asm/dump.h | 12 + CRIU_code/criu/arch/s390/include/asm/int.h | 6 + .../criu/arch/s390/include/asm/kerndat.h | 7 + .../arch/s390/include/asm/parasite-syscall.h | 6 + .../criu/arch/s390/include/asm/parasite.h | 7 + .../criu/arch/s390/include/asm/restore.h | 27 + .../criu/arch/s390/include/asm/restorer.h | 65 + CRIU_code/criu/arch/s390/include/asm/types.h | 37 + CRIU_code/criu/arch/s390/include/asm/vdso.h | 23 + 
CRIU_code/criu/arch/s390/restorer.c | 37 + CRIU_code/criu/arch/s390/sigframe.c | 20 + CRIU_code/criu/arch/s390/vdso-pie.c | 65 + CRIU_code/criu/arch/x86/Makefile | 14 + CRIU_code/criu/arch/x86/cpu.c | 470 ++ CRIU_code/criu/arch/x86/crtools.c | 638 +++ CRIU_code/criu/arch/x86/include/asm/compat.h | 68 + CRIU_code/criu/arch/x86/include/asm/dump.h | 34 + CRIU_code/criu/arch/x86/include/asm/int.h | 6 + CRIU_code/criu/arch/x86/include/asm/kerndat.h | 8 + .../arch/x86/include/asm/parasite-syscall.h | 8 + .../criu/arch/x86/include/asm/parasite.h | 77 + CRIU_code/criu/arch/x86/include/asm/restore.h | 58 + .../criu/arch/x86/include/asm/restorer.h | 112 + .../criu/arch/x86/include/asm/syscall32.h | 17 + CRIU_code/criu/arch/x86/include/asm/types.h | 52 + CRIU_code/criu/arch/x86/include/asm/vdso.h | 72 + CRIU_code/criu/arch/x86/kerndat.c | 258 ++ CRIU_code/criu/arch/x86/restorer.c | 116 + CRIU_code/criu/arch/x86/restorer_unmap.S | 13 + CRIU_code/criu/arch/x86/sigaction_compat.c | 56 + .../criu/arch/x86/sigaction_compat_pie.c | 1 + CRIU_code/criu/arch/x86/sigframe.c | 36 + CRIU_code/criu/arch/x86/sys-exec-tbl.c | 44 + CRIU_code/criu/arch/x86/vdso-pie.c | 76 + CRIU_code/criu/autofs.c | 1090 +++++ CRIU_code/criu/bfd.c | 333 ++ CRIU_code/criu/bitmap.c | 54 + CRIU_code/criu/cgroup-props.c | 578 +++ CRIU_code/criu/cgroup.c | 1915 +++++++++ CRIU_code/criu/clone-noasan.c | 31 + CRIU_code/criu/config.c | 892 ++++ CRIU_code/criu/cr-check.c | 1529 +++++++ CRIU_code/criu/cr-dedup.c | 106 + CRIU_code/criu/cr-dump.c | 1942 +++++++++ CRIU_code/criu/cr-errno.c | 12 + CRIU_code/criu/cr-restore.c | 3587 ++++++++++++++++ CRIU_code/criu/cr-service.c | 1403 ++++++ CRIU_code/criu/crtools.c | 478 +++ CRIU_code/criu/eventfd.c | 117 + CRIU_code/criu/eventpoll.c | 502 +++ CRIU_code/criu/external.c | 94 + CRIU_code/criu/fault-injection.c | 22 + CRIU_code/criu/fdstore.c | 128 + CRIU_code/criu/fifo.c | 184 + CRIU_code/criu/file-ids.c | 113 + CRIU_code/criu/file-lock.c | 718 ++++ CRIU_code/criu/files-ext.c 
| 98 + CRIU_code/criu/files-reg.c | 2037 +++++++++ CRIU_code/criu/files.c | 1735 ++++++++ CRIU_code/criu/filesystems.c | 870 ++++ CRIU_code/criu/fsnotify.c | 934 ++++ CRIU_code/criu/image-desc.c | 120 + CRIU_code/criu/image.c | 731 ++++ CRIU_code/criu/img-cache.c | 56 + CRIU_code/criu/img-proxy.c | 45 + CRIU_code/criu/img-remote.c | 1159 +++++ CRIU_code/criu/include/action-scripts.h | 28 + CRIU_code/criu/include/aio.h | 35 + CRIU_code/criu/include/asm-generic/int.h | 15 + CRIU_code/criu/include/asm-generic/vdso.h | 15 + CRIU_code/criu/include/atomic.h | 4 + CRIU_code/criu/include/autofs.h | 234 + CRIU_code/criu/include/bfd.h | 40 + CRIU_code/criu/include/bitmap.h | 7 + CRIU_code/criu/include/bitops.h | 4 + CRIU_code/criu/include/bitsperlong.h | 4 + CRIU_code/criu/include/cgroup-props.h | 20 + CRIU_code/criu/include/cgroup.h | 95 + CRIU_code/criu/include/clone-noasan.h | 6 + CRIU_code/criu/include/cpu.h | 12 + CRIU_code/criu/include/cr-errno.h | 17 + CRIU_code/criu/include/cr-service-const.h | 6 + CRIU_code/criu/include/cr-service.h | 14 + CRIU_code/criu/include/cr_options.h | 157 + CRIU_code/criu/include/criu-log.h | 50 + CRIU_code/criu/include/criu-plugin.h | 132 + CRIU_code/criu/include/crtools.h | 47 + CRIU_code/criu/include/dump.h | 7 + CRIU_code/criu/include/eventfd.h | 10 + CRIU_code/criu/include/eventpoll.h | 13 + CRIU_code/criu/include/external.h | 28 + CRIU_code/criu/include/fault-injection.h | 51 + CRIU_code/criu/include/fcntl.h | 49 + CRIU_code/criu/include/fdinfo.h | 22 + CRIU_code/criu/include/fdstore.h | 17 + CRIU_code/criu/include/fifo.h | 11 + CRIU_code/criu/include/file-ids.h | 21 + CRIU_code/criu/include/file-lock.h | 79 + CRIU_code/criu/include/files-reg.h | 59 + CRIU_code/criu/include/files.h | 207 + CRIU_code/criu/include/filesystems.h | 33 + CRIU_code/criu/include/fs-magic.h | 56 + CRIU_code/criu/include/fsnotify.h | 24 + CRIU_code/criu/include/image-desc.h | 124 + CRIU_code/criu/include/image.h | 169 + CRIU_code/criu/include/img-remote.h | 
146 + CRIU_code/criu/include/imgset.h | 38 + CRIU_code/criu/include/inet_diag.h | 136 + CRIU_code/criu/include/infect-pie.h | 7 + CRIU_code/criu/include/int.h | 4 + CRIU_code/criu/include/ipc_ns.h | 9 + CRIU_code/criu/include/irmap.h | 13 + CRIU_code/criu/include/kcmp-ids.h | 36 + CRIU_code/criu/include/kcmp.h | 26 + CRIU_code/criu/include/kerndat.h | 96 + CRIU_code/criu/include/libnetlink.h | 24 + CRIU_code/criu/include/linux/userfaultfd.h | 219 + CRIU_code/criu/include/log.h | 81 + CRIU_code/criu/include/lsm.h | 55 + CRIU_code/criu/include/magic.h | 125 + CRIU_code/criu/include/mem.h | 53 + CRIU_code/criu/include/mman.h | 17 + CRIU_code/criu/include/mount.h | 148 + CRIU_code/criu/include/namespaces.h | 224 + CRIU_code/criu/include/net.h | 57 + CRIU_code/criu/include/netfilter.h | 13 + CRIU_code/criu/include/netlink_diag.h | 42 + CRIU_code/criu/include/packet_diag.h | 76 + CRIU_code/criu/include/page-pipe.h | 160 + CRIU_code/criu/include/page-xfer.h | 74 + CRIU_code/criu/include/page.h | 4 + CRIU_code/criu/include/pagemap-cache.h | 29 + CRIU_code/criu/include/pagemap.h | 155 + CRIU_code/criu/include/parasite-syscall.h | 57 + CRIU_code/criu/include/parasite-vdso.h | 95 + CRIU_code/criu/include/parasite.h | 240 ++ CRIU_code/criu/include/path.h | 41 + CRIU_code/criu/include/pid.h | 62 + CRIU_code/criu/include/pipes.h | 63 + CRIU_code/criu/include/plugin.h | 46 + CRIU_code/criu/include/posix-timer.h | 27 + CRIU_code/criu/include/prctl.h | 85 + CRIU_code/criu/include/proc_parse.h | 105 + CRIU_code/criu/include/protobuf-desc.h | 99 + CRIU_code/criu/include/protobuf.h | 55 + CRIU_code/criu/include/pstree.h | 126 + CRIU_code/criu/include/ptrace-compat.h | 16 + CRIU_code/criu/include/rbtree.h | 88 + CRIU_code/criu/include/restore.h | 10 + CRIU_code/criu/include/restorer.h | 312 ++ CRIU_code/criu/include/rst-malloc.h | 81 + CRIU_code/criu/include/rst_info.h | 87 + CRIU_code/criu/include/seccomp.h | 74 + CRIU_code/criu/include/seize.h | 9 + CRIU_code/criu/include/servicefd.h 
| 48 + CRIU_code/criu/include/setproctitle.h | 19 + CRIU_code/criu/include/shmem.h | 21 + CRIU_code/criu/include/sigframe.h | 16 + CRIU_code/criu/include/signalfd.h | 10 + CRIU_code/criu/include/sk-inet.h | 105 + CRIU_code/criu/include/sk-packet.h | 40 + CRIU_code/criu/include/sk-queue.h | 8 + CRIU_code/criu/include/sockets.h | 123 + CRIU_code/criu/include/stats.h | 55 + CRIU_code/criu/include/string.h | 20 + CRIU_code/criu/include/sysctl.h | 41 + CRIU_code/criu/include/sysfs_parse.h | 17 + CRIU_code/criu/include/timerfd.h | 52 + CRIU_code/criu/include/tls.h | 26 + CRIU_code/criu/include/tty.h | 40 + CRIU_code/criu/include/tun.h | 22 + CRIU_code/criu/include/types.h | 5 + CRIU_code/criu/include/uffd.h | 13 + CRIU_code/criu/include/unix_diag.h | 65 + CRIU_code/criu/include/util-pie.h | 20 + CRIU_code/criu/include/util-vdso.h | 99 + CRIU_code/criu/include/util.h | 381 ++ CRIU_code/criu/include/uts_ns.h | 9 + CRIU_code/criu/include/vdso.h | 26 + CRIU_code/criu/include/vma.h | 137 + CRIU_code/criu/include/xmalloc.h | 2 + CRIU_code/criu/ipc_ns.c | 946 ++++ CRIU_code/criu/irmap.c | 494 +++ CRIU_code/criu/kcmp-ids.c | 208 + CRIU_code/criu/kerndat.c | 1097 +++++ CRIU_code/criu/libnetlink.c | 226 + CRIU_code/criu/log.c | 416 ++ CRIU_code/criu/lsm.c | 351 ++ CRIU_code/criu/mem.c | 1346 ++++++ CRIU_code/criu/mount.c | 3822 +++++++++++++++++ CRIU_code/criu/namespaces.c | 1754 ++++++++ CRIU_code/criu/net.c | 3283 ++++++++++++++ CRIU_code/criu/netfilter.c | 158 + CRIU_code/criu/page-pipe.c | 476 ++ CRIU_code/criu/page-xfer.c | 1331 ++++++ CRIU_code/criu/pagemap-cache.c | 193 + CRIU_code/criu/pagemap.c | 870 ++++ CRIU_code/criu/parasite-syscall.c | 577 +++ CRIU_code/criu/path.c | 105 + CRIU_code/criu/pie-util-vdso-elf32.c | 1 + CRIU_code/criu/pie-util-vdso.c | 1 + CRIU_code/criu/pie-util.c | 1 + CRIU_code/criu/pie/Makefile | 53 + CRIU_code/criu/pie/Makefile.library | 25 + CRIU_code/criu/pie/parasite-vdso.c | 299 ++ CRIU_code/criu/pie/parasite.c | 717 ++++ 
CRIU_code/criu/pie/pie-relocs.h | 12 + CRIU_code/criu/pie/restorer.c | 1796 ++++++++ CRIU_code/criu/pie/util-vdso-elf32.c | 1 + CRIU_code/criu/pie/util-vdso.c | 329 ++ CRIU_code/criu/pie/util.c | 54 + CRIU_code/criu/pipes.c | 526 +++ CRIU_code/criu/plugin.c | 260 ++ CRIU_code/criu/proc_parse.c | 2657 ++++++++++++ CRIU_code/criu/protobuf-desc.c | 104 + CRIU_code/criu/protobuf.c | 258 ++ CRIU_code/criu/pstree.c | 1038 +++++ CRIU_code/criu/rbtree.c | 357 ++ CRIU_code/criu/rst-malloc.c | 259 ++ CRIU_code/criu/seccomp.c | 509 +++ CRIU_code/criu/seize.c | 841 ++++ CRIU_code/criu/servicefd.c | 308 ++ CRIU_code/criu/shmem.c | 821 ++++ CRIU_code/criu/sigframe.c | 48 + CRIU_code/criu/signalfd.c | 112 + CRIU_code/criu/sk-inet.c | 1024 +++++ CRIU_code/criu/sk-netlink.c | 273 ++ CRIU_code/criu/sk-packet.c | 583 +++ CRIU_code/criu/sk-queue.c | 394 ++ CRIU_code/criu/sk-tcp.c | 448 ++ CRIU_code/criu/sk-unix.c | 2331 ++++++++++ CRIU_code/criu/sockets.c | 969 +++++ CRIU_code/criu/stats.c | 218 + CRIU_code/criu/string.c | 60 + CRIU_code/criu/sysctl.c | 472 ++ CRIU_code/criu/sysfs_parse.c | 326 ++ CRIU_code/criu/timerfd.c | 186 + CRIU_code/criu/tls.c | 370 ++ CRIU_code/criu/tty.c | 2495 +++++++++++ CRIU_code/criu/tun.c | 547 +++ CRIU_code/criu/uffd.c | 1476 +++++++ CRIU_code/criu/util.c | 1361 ++++++ CRIU_code/criu/uts_ns.c | 71 + CRIU_code/criu/vdso-compat.c | 74 + CRIU_code/criu/vdso.c | 703 +++ CRIU_code/images/Makefile | 123 + CRIU_code/images/autofs.proto | 15 + CRIU_code/images/binfmt-misc.proto | 12 + CRIU_code/images/cgroup.proto | 41 + CRIU_code/images/core-aarch64.proto | 23 + CRIU_code/images/core-arm.proto | 39 + CRIU_code/images/core-ppc64.proto | 71 + CRIU_code/images/core-riscv.proto | 53 + CRIU_code/images/core-s390.proto | 51 + CRIU_code/images/core-x86.proto | 108 + CRIU_code/images/core.proto | 122 + CRIU_code/images/cpuinfo.proto | 63 + CRIU_code/images/creds.proto | 24 + CRIU_code/images/eventfd.proto | 10 + CRIU_code/images/eventpoll.proto | 22 + 
CRIU_code/images/ext-file.proto | 8 + CRIU_code/images/fdinfo.proto | 73 + CRIU_code/images/fh.proto | 23 + CRIU_code/images/fifo.proto | 7 + CRIU_code/images/file-lock.proto | 10 + CRIU_code/images/fown.proto | 9 + CRIU_code/images/fs.proto | 7 + CRIU_code/images/fsnotify.proto | 60 + CRIU_code/images/ghost-file.proto | 23 + .../images/google/protobuf/descriptor.proto | 1 + CRIU_code/images/inventory.proto | 19 + CRIU_code/images/ipc-desc.proto | 11 + CRIU_code/images/ipc-msg.proto | 14 + CRIU_code/images/ipc-sem.proto | 8 + CRIU_code/images/ipc-shm.proto | 9 + CRIU_code/images/ipc-var.proto | 21 + CRIU_code/images/macvlan.proto | 6 + CRIU_code/images/mm.proto | 33 + CRIU_code/images/mnt.proto | 58 + CRIU_code/images/netdev.proto | 74 + CRIU_code/images/ns.proto | 8 + CRIU_code/images/opts.proto | 18 + CRIU_code/images/packet-sock.proto | 47 + CRIU_code/images/pagemap.proto | 14 + CRIU_code/images/pipe-data.proto | 7 + CRIU_code/images/pipe.proto | 11 + CRIU_code/images/pstree.proto | 9 + CRIU_code/images/regfile.proto | 16 + CRIU_code/images/remap-file-path.proto | 16 + CRIU_code/images/remote-image.proto | 22 + CRIU_code/images/rlimit.proto | 6 + CRIU_code/images/rpc.proto | 227 + CRIU_code/images/sa.proto | 11 + CRIU_code/images/seccomp.proto | 11 + CRIU_code/images/siginfo.proto | 9 + CRIU_code/images/signalfd.proto | 11 + CRIU_code/images/sit.proto | 22 + CRIU_code/images/sk-inet.proto | 53 + CRIU_code/images/sk-netlink.proto | 22 + CRIU_code/images/sk-opts.proto | 33 + CRIU_code/images/sk-packet.proto | 16 + CRIU_code/images/sk-unix.proto | 54 + CRIU_code/images/stats.proto | 38 + CRIU_code/images/sysctl.proto | 13 + CRIU_code/images/tcp-stream.proto | 27 + CRIU_code/images/time.proto | 6 + CRIU_code/images/timer.proto | 29 + CRIU_code/images/timerfd.proto | 19 + CRIU_code/images/tty.proto | 90 + CRIU_code/images/tun.proto | 18 + CRIU_code/images/userns.proto | 12 + CRIU_code/images/utsns.proto | 6 + CRIU_code/images/vma.proto | 25 + 
.../include/common/arch/aarch64/asm/atomic.h | 99 + .../include/common/arch/aarch64/asm/bitops.h | 9 + .../common/arch/aarch64/asm/bitsperlong.h | 6 + .../include/common/arch/aarch64/asm/linkage.h | 24 + .../include/common/arch/aarch64/asm/page.h | 44 + .../include/common/arch/arm/asm/atomic.h | 133 + .../include/common/arch/arm/asm/bitops.h | 9 + .../include/common/arch/arm/asm/bitsperlong.h | 6 + .../include/common/arch/arm/asm/linkage.h | 28 + CRIU_code/include/common/arch/arm/asm/page.h | 19 + .../include/common/arch/arm/asm/processor.h | 28 + .../include/common/arch/ppc64/asm/atomic.h | 134 + .../include/common/arch/ppc64/asm/bitops.h | 215 + .../common/arch/ppc64/asm/bitsperlong.h | 6 + .../include/common/arch/ppc64/asm/cmpxchg.h | 96 + .../include/common/arch/ppc64/asm/linkage.h | 301 ++ .../include/common/arch/ppc64/asm/page.h | 44 + CRIU_code/include/common/arch/riscv/.keep | 0 CRIU_code/include/common/arch/riscv/asm/.keep | 0 .../include/common/arch/riscv/asm/atomic.h | 338 ++ .../include/common/arch/riscv/asm/bitops.h | 69 + .../common/arch/riscv/asm/bitsperlong.h | 6 + .../include/common/arch/riscv/asm/linkage.h | 55 + .../include/common/arch/riscv/asm/page.h | 39 + .../include/common/arch/s390/asm/atomic.h | 67 + .../include/common/arch/s390/asm/atomic_ops.h | 74 + .../include/common/arch/s390/asm/bitops.h | 158 + .../common/arch/s390/asm/bitsperlong.h | 6 + .../include/common/arch/s390/asm/linkage.h | 22 + CRIU_code/include/common/arch/s390/asm/page.h | 19 + .../include/common/arch/x86/asm/atomic.h | 76 + .../include/common/arch/x86/asm/bitops.h | 132 + .../include/common/arch/x86/asm/bitsperlong.h | 10 + .../include/common/arch/x86/asm/cmpxchg.h | 107 + .../include/common/arch/x86/asm/linkage.h | 27 + CRIU_code/include/common/arch/x86/asm/page.h | 19 + CRIU_code/include/common/asm-generic/bitops.h | 113 + CRIU_code/include/common/bitops.h | 23 + CRIU_code/include/common/bitsperlong.h | 4 + CRIU_code/include/common/bug.h | 41 + 
CRIU_code/include/common/compiler.h | 102 + CRIU_code/include/common/err.h | 53 + CRIU_code/include/common/list.h | 421 ++ CRIU_code/include/common/lock.h | 164 + CRIU_code/include/common/page.h | 4 + CRIU_code/include/common/scm-code.c | 121 + CRIU_code/include/common/scm.h | 54 + CRIU_code/include/common/xmalloc.h | 69 + CRIU_code/lib/Makefile | 76 + CRIU_code/lib/c/Makefile | 8 + CRIU_code/lib/c/criu.c | 1631 +++++++ CRIU_code/lib/c/criu.h | 224 + CRIU_code/lib/c/criu.pc.in | 8 + CRIU_code/lib/py/.gitignore | 2 + CRIU_code/lib/py/Makefile | 19 + CRIU_code/lib/py/__init__.py | 3 + CRIU_code/lib/py/cli.py | 342 ++ CRIU_code/lib/py/criu.py | 332 ++ CRIU_code/lib/py/images/.gitignore | 4 + CRIU_code/lib/py/images/Makefile | 25 + CRIU_code/lib/py/images/__init__.py | 5 + CRIU_code/lib/py/images/images.py | 596 +++ CRIU_code/lib/py/images/pb2dict.py | 378 ++ .../scripts/build/Dockerfile.aarch64.hdr | 3 + .../scripts/build/Dockerfile.aarch64.tmpl | 1 + CRIU_code/scripts/build/Dockerfile.alpine | 46 + .../scripts/build/Dockerfile.armv7hf.hdr | 3 + .../scripts/build/Dockerfile.armv7hf.tmpl | 1 + CRIU_code/scripts/build/Dockerfile.centos | 48 + .../scripts/build/Dockerfile.fedora-asan.hdr | 2 + .../scripts/build/Dockerfile.fedora-asan.tmpl | 1 + .../Dockerfile.fedora-rawhide-aarch64.hdr | 3 + .../Dockerfile.fedora-rawhide-aarch64.tmpl | 1 + .../build/Dockerfile.fedora-rawhide.hdr | 1 + .../build/Dockerfile.fedora-rawhide.tmpl | 1 + .../scripts/build/Dockerfile.fedora.tmpl | 59 + .../scripts/build/Dockerfile.ppc64le.hdr | 5 + .../scripts/build/Dockerfile.ppc64le.tmpl | 1 + CRIU_code/scripts/build/Dockerfile.s390x.hdr | 6 + CRIU_code/scripts/build/Dockerfile.s390x.tmpl | 1 + CRIU_code/scripts/build/Dockerfile.tmpl | 45 + CRIU_code/scripts/build/Dockerfile.x86_64.hdr | 4 + .../scripts/build/Dockerfile.x86_64.tmpl | 1 + CRIU_code/scripts/build/Makefile | 52 + CRIU_code/scripts/build/binfmt_misc | 13 + CRIU_code/scripts/build/extract-deb-pkg | 36 + 
CRIU_code/scripts/crit-setup.py | 12 + CRIU_code/scripts/criu-ns | 252 ++ CRIU_code/scripts/fake-restore.sh | 15 + CRIU_code/scripts/feature-tests.mak | 138 + CRIU_code/scripts/flake8.cfg | 10 + CRIU_code/scripts/install-debian-pkgs.sh | 25 + CRIU_code/scripts/magic-gen.py | 61 + CRIU_code/scripts/nmk/.gitignore | 3 + CRIU_code/scripts/nmk/Documentation/Makefile | 50 + CRIU_code/scripts/nmk/Documentation/nmk.txt | 70 + CRIU_code/scripts/nmk/Makefile | 35 + CRIU_code/scripts/nmk/README.md | 5 + CRIU_code/scripts/nmk/scripts/build.mk | 330 ++ CRIU_code/scripts/nmk/scripts/include.mk | 58 + CRIU_code/scripts/nmk/scripts/macro.mk | 33 + CRIU_code/scripts/nmk/scripts/main.mk | 28 + CRIU_code/scripts/nmk/scripts/msg.mk | 71 + CRIU_code/scripts/nmk/scripts/tools.mk | 43 + CRIU_code/scripts/nmk/scripts/utils.mk | 35 + CRIU_code/scripts/protobuf-gen.sh | 19 + CRIU_code/scripts/systemd-autofs-restart.sh | 175 + CRIU_code/scripts/tmp-files.sh | 47 + CRIU_code/scripts/travis/Makefile | 42 + CRIU_code/scripts/travis/asan.sh | 21 + CRIU_code/scripts/travis/docker-test.sh | 66 + CRIU_code/scripts/travis/docker.env | 4 + CRIU_code/scripts/travis/travis-after_success | 10 + CRIU_code/scripts/travis/travis-tests | 174 + CRIU_code/soccr/Makefile | 2 + CRIU_code/soccr/soccr.c | 942 ++++ CRIU_code/soccr/soccr.h | 233 + CRIU_code/soccr/test/Makefile | 27 + CRIU_code/soccr/test/local.sh | 1 + CRIU_code/soccr/test/run.py | 62 + CRIU_code/soccr/test/tcp-conn-v6.c | 1 + CRIU_code/soccr/test/tcp-conn.c | 168 + CRIU_code/soccr/test/tcp-constructor.c | 151 + CRIU_code/soccr/test/tcp-test.py | 20 + CRIU_code/test/.gitignore | 16 + CRIU_code/test/Makefile | 61 + CRIU_code/test/abrt.sh | 35 + CRIU_code/test/check_actions.py | 40 + CRIU_code/test/compel/Makefile | 18 + .../aarch64/include/arch_test_handle_binary.h | 24 + .../arm/include/arch_test_handle_binary.h | 21 + .../ppc64/include/arch_test_handle_binary.h | 24 + .../x86/include/arch_test_handle_binary.h | 49 + 
CRIU_code/test/compel/handle_binary.c | 99 + CRIU_code/test/compel/handle_binary_32.c | 1 + CRIU_code/test/compel/main.c | 57 + CRIU_code/test/crit-recode.py | 75 + CRIU_code/test/empty-netns-prep.sh | 16 + CRIU_code/test/exhaustive/pipe.py | 270 ++ CRIU_code/test/exhaustive/unix.py | 754 ++++ CRIU_code/test/groups.desc | 1 + CRIU_code/test/inhfd.desc | 1 + CRIU_code/test/inhfd/fifo.py | 39 + CRIU_code/test/inhfd/fifo.py.desc | 1 + CRIU_code/test/inhfd/pipe.py | 17 + CRIU_code/test/inhfd/pipe.py.desc | 1 + CRIU_code/test/inhfd/socket.py | 21 + CRIU_code/test/inhfd/socket.py.desc | 1 + CRIU_code/test/inhfd/tty.py | 37 + CRIU_code/test/inhfd/tty.py.desc | 1 + CRIU_code/test/jenkins/_run_ct | 8 + CRIU_code/test/jenkins/actions.sh | 8 + CRIU_code/test/jenkins/crit.sh | 7 + CRIU_code/test/jenkins/criu-btrfs.sh | 6 + CRIU_code/test/jenkins/criu-by-id.sh | 12 + CRIU_code/test/jenkins/criu-dedup.sh | 14 + CRIU_code/test/jenkins/criu-dump.sh | 6 + CRIU_code/test/jenkins/criu-fault.sh | 25 + CRIU_code/test/jenkins/criu-fcg.sh | 13 + CRIU_code/test/jenkins/criu-groups.sh | 7 + CRIU_code/test/jenkins/criu-inhfd.sh | 5 + CRIU_code/test/jenkins/criu-iter.sh | 6 + CRIU_code/test/jenkins/criu-join-ns.sh | 7 + CRIU_code/test/jenkins/criu-lazy-common.sh | 11 + .../test/jenkins/criu-lazy-migration.pipeline | 35 + CRIU_code/test/jenkins/criu-lazy-migration.sh | 20 + CRIU_code/test/jenkins/criu-lazy-pages.sh | 17 + CRIU_code/test/jenkins/criu-lib.sh | 42 + CRIU_code/test/jenkins/criu-other.sh | 4 + CRIU_code/test/jenkins/criu-overlay.sh | 7 + CRIU_code/test/jenkins/criu-pre-dump.sh | 7 + .../test/jenkins/criu-remote-lazy-pages.sh | 17 + CRIU_code/test/jenkins/criu-sibling.sh | 6 + CRIU_code/test/jenkins/criu-snap.sh | 7 + CRIU_code/test/jenkins/criu-stop.sh | 5 + CRIU_code/test/jenkins/criu-user.sh | 6 + CRIU_code/test/jenkins/criu.sh | 5 + CRIU_code/test/jenkins/run_ct | 3 + CRIU_code/test/others/app-emu.sh | 29 + .../app-emu/java/HelloWorld/HelloWorld.java | 20 + 
.../others/app-emu/java/HelloWorld/run.sh | 37 + CRIU_code/test/others/app-emu/job/Makefile | 12 + CRIU_code/test/others/app-emu/job/job.c | 100 + CRIU_code/test/others/app-emu/job/job.exp | 59 + CRIU_code/test/others/app-emu/job/run.sh | 3 + .../test/others/app-emu/lxc/network-script.sh | 57 + CRIU_code/test/others/app-emu/lxc/run.sh | 64 + CRIU_code/test/others/app-emu/make/Makefile | 13 + CRIU_code/test/others/app-emu/make/run.sh | 59 + CRIU_code/test/others/app-emu/make/tmpl.c | 16 + CRIU_code/test/others/app-emu/screen/run.sh | 30 + CRIU_code/test/others/app-emu/tarbz/run.sh | 73 + CRIU_code/test/others/app-emu/vnc/run.sh | 31 + .../test/others/app-emu/vnc/vnc-server.sh | 13 + CRIU_code/test/others/bers/Makefile | 55 + CRIU_code/test/others/bers/bers.c | 418 ++ CRIU_code/test/others/bers/bers.txt | 74 + CRIU_code/test/others/crit/.gitignore | 5 + CRIU_code/test/others/crit/Makefile | 5 + CRIU_code/test/others/crit/loop.sh | 4 + CRIU_code/test/others/crit/test.sh | 49 + .../test/others/criu-coredump/.gitignore | 6 + CRIU_code/test/others/criu-coredump/Makefile | 5 + CRIU_code/test/others/criu-coredump/loop.sh | 4 + CRIU_code/test/others/criu-coredump/test.sh | 50 + CRIU_code/test/others/env.sh | 8 + CRIU_code/test/others/exec/Makefile | 2 + CRIU_code/test/others/exec/run.sh | 16 + CRIU_code/test/others/ext-links/Makefile | 4 + CRIU_code/test/others/ext-links/addmv.sh | 8 + CRIU_code/test/others/ext-links/addmv_raw.sh | 6 + CRIU_code/test/others/ext-links/mvlink.c | 28 + CRIU_code/test/others/ext-links/run.sh | 58 + CRIU_code/test/others/ext-links/run_ns.sh | 10 + CRIU_code/test/others/ext-links/run_wait.sh | 15 + CRIU_code/test/others/ext-tty/run.py | 36 + CRIU_code/test/others/functions.sh | 16 + CRIU_code/test/others/libcriu/.gitignore | 6 + CRIU_code/test/others/libcriu/Makefile | 26 + CRIU_code/test/others/libcriu/lib.c | 47 + CRIU_code/test/others/libcriu/lib.h | 2 + CRIU_code/test/others/libcriu/run.sh | 45 + CRIU_code/test/others/libcriu/test_errno.c | 
154 + CRIU_code/test/others/libcriu/test_iters.c | 143 + CRIU_code/test/others/libcriu/test_notify.c | 97 + CRIU_code/test/others/libcriu/test_self.c | 96 + CRIU_code/test/others/libcriu/test_sub.c | 107 + CRIU_code/test/others/make/Makefile | 5 + CRIU_code/test/others/make/uninstall.sh | 22 + CRIU_code/test/others/mem-snap/Makefile | 2 + .../test/others/mem-snap/run-predump-2.sh | 66 + CRIU_code/test/others/mem-snap/run-predump.sh | 77 + .../others/mem-snap/run-snap-auto-dedup.sh | 93 + .../mem-snap/run-snap-dedup-on-restore.sh | 87 + .../test/others/mem-snap/run-snap-dedup.sh | 99 + .../test/others/mem-snap/run-snap-maps04.sh | 68 + CRIU_code/test/others/mem-snap/run-snap.sh | 74 + CRIU_code/test/others/mem-snap/run.sh | 13 + CRIU_code/test/others/mnt-ext-dev/Makefile | 2 + CRIU_code/test/others/mnt-ext-dev/run.sh | 17 + CRIU_code/test/others/mounts/ext/Makefile | 13 + CRIU_code/test/others/mounts/ext/ext-mount.c | 101 + CRIU_code/test/others/mounts/ext/ns_init.c | 143 + CRIU_code/test/others/mounts/ext/run.sh | 125 + CRIU_code/test/others/mounts/mounts.py | 31 + CRIU_code/test/others/mounts/mounts.sh | 27 + CRIU_code/test/others/mounts/run.sh | 24 + CRIU_code/test/others/netns_ext/Makefile | 2 + CRIU_code/test/others/netns_ext/_run.sh | 4 + CRIU_code/test/others/netns_ext/run.sh | 40 + CRIU_code/test/others/overlayfs/Makefile | 6 + CRIU_code/test/others/overlayfs/run.sh | 58 + CRIU_code/test/others/pipes/Makefile | 14 + CRIU_code/test/others/pipes/pipe.c | 693 +++ CRIU_code/test/others/rpc/.gitignore | 3 + CRIU_code/test/others/rpc/Makefile | 45 + CRIU_code/test/others/rpc/config_file.py | 192 + CRIU_code/test/others/rpc/errno.py | 135 + CRIU_code/test/others/rpc/loop.sh | 4 + CRIU_code/test/others/rpc/ps_test.py | 74 + CRIU_code/test/others/rpc/read.py | 17 + CRIU_code/test/others/rpc/restore-loop.py | 45 + CRIU_code/test/others/rpc/rpc.proto | 1 + CRIU_code/test/others/rpc/run.sh | 86 + CRIU_code/test/others/rpc/test-c.c | 170 + 
CRIU_code/test/others/rpc/test.py | 81 + CRIU_code/test/others/rpc/version.py | 47 + CRIU_code/test/others/security/Makefile | 34 + CRIU_code/test/others/security/loop.sh | 13 + CRIU_code/test/others/security/run.sh | 89 + CRIU_code/test/others/shell-job/Makefile | 2 + CRIU_code/test/others/shell-job/run.py | 64 + CRIU_code/test/others/socketpairs/Makefile | 9 + .../test/others/socketpairs/socketpair.c | 600 +++ CRIU_code/test/others/tcp/Makefile | 11 + CRIU_code/test/others/tcp/cln.c | 122 + CRIU_code/test/others/tcp/run.sh | 67 + CRIU_code/test/others/tcp/srv.c | 112 + CRIU_code/test/others/unix-callback/Makefile | 22 + CRIU_code/test/others/unix-callback/run.sh | 48 + .../test/others/unix-callback/syslog-lib.c | 66 + .../test/others/unix-callback/unix-client.c | 121 + .../test/others/unix-callback/unix-lib.c | 187 + .../test/others/unix-callback/unix-server.c | 104 + .../test/others/unix-callback/unix.proto | 6 + CRIU_code/test/pki/cacert.pem | 23 + CRIU_code/test/pki/cert.pem | 24 + CRIU_code/test/pki/key.pem | 182 + CRIU_code/test/pycriu | 1 + CRIU_code/test/show_action.sh | 3 + CRIU_code/test/umount2.c | 16 + CRIU_code/test/zdtm.desc | 1 + CRIU_code/test/zdtm.py | 2354 ++++++++++ CRIU_code/test/zdtm/.gitignore | 14 + CRIU_code/test/zdtm/Makefile | 13 + CRIU_code/test/zdtm/Makefile.inc | 108 + CRIU_code/test/zdtm/lib/Makefile | 30 + .../lib/arch/aarch64/include/asm/atomic.h | 73 + .../zdtm/lib/arch/arm/include/asm/atomic.h | 68 + .../zdtm/lib/arch/ppc64/include/asm/atomic.h | 87 + .../zdtm/lib/arch/s390/include/asm/atomic.h | 68 + .../zdtm/lib/arch/x86/include/asm/atomic.h | 49 + CRIU_code/test/zdtm/lib/cpuid.h | 39 + CRIU_code/test/zdtm/lib/datagen.c | 140 + CRIU_code/test/zdtm/lib/fs.c | 96 + CRIU_code/test/zdtm/lib/fs.h | 53 + CRIU_code/test/zdtm/lib/groups.c | 45 + CRIU_code/test/zdtm/lib/groups.desc | 1 + CRIU_code/test/zdtm/lib/lock.c | 85 + CRIU_code/test/zdtm/lib/lock.h | 160 + CRIU_code/test/zdtm/lib/msg.c | 69 + CRIU_code/test/zdtm/lib/ns.c | 456 ++ 
CRIU_code/test/zdtm/lib/ns.h | 17 + CRIU_code/test/zdtm/lib/parseargs.c | 175 + CRIU_code/test/zdtm/lib/parseargs.sh | 92 + CRIU_code/test/zdtm/lib/stop_and_chk.sh | 54 + CRIU_code/test/zdtm/lib/streamutil.c | 75 + CRIU_code/test/zdtm/lib/tcp.c | 132 + CRIU_code/test/zdtm/lib/test.c | 413 ++ CRIU_code/test/zdtm/lib/zdtmtst.h | 171 + CRIU_code/test/zdtm/static/Makefile | 574 +++ CRIU_code/test/zdtm/static/aio00.c | 36 + CRIU_code/test/zdtm/static/aio00.desc | 1 + CRIU_code/test/zdtm/static/aio01.c | 114 + CRIU_code/test/zdtm/static/aio01.desc | 1 + CRIU_code/test/zdtm/static/apparmor.c | 90 + CRIU_code/test/zdtm/static/apparmor.checkskip | 4 + CRIU_code/test/zdtm/static/apparmor.desc | 1 + CRIU_code/test/zdtm/static/apparmor.profile | 8 + CRIU_code/test/zdtm/static/arm-neon00.c | 67 + CRIU_code/test/zdtm/static/arm-neon00.desc | 1 + CRIU_code/test/zdtm/static/auto_dev-ioctl.h | 228 + CRIU_code/test/zdtm/static/autofs.c | 939 ++++ CRIU_code/test/zdtm/static/autofs.desc | 1 + CRIU_code/test/zdtm/static/bind-mount.c | 62 + CRIU_code/test/zdtm/static/bind-mount.desc | 1 + CRIU_code/test/zdtm/static/binfmt_misc.c | 199 + CRIU_code/test/zdtm/static/binfmt_misc.desc | 1 + CRIU_code/test/zdtm/static/binfmt_misc.hook | 22 + CRIU_code/test/zdtm/static/bridge.c | 113 + CRIU_code/test/zdtm/static/bridge.desc | 7 + CRIU_code/test/zdtm/static/busyloop00.c | 18 + CRIU_code/test/zdtm/static/caps00.c | 178 + CRIU_code/test/zdtm/static/caps00.desc | 1 + CRIU_code/test/zdtm/static/cgroup00.c | 205 + CRIU_code/test/zdtm/static/cgroup00.desc | 1 + CRIU_code/test/zdtm/static/cgroup00.hook | 20 + CRIU_code/test/zdtm/static/cgroup01.c | 116 + CRIU_code/test/zdtm/static/cgroup01.desc | 1 + CRIU_code/test/zdtm/static/cgroup01.hook | 21 + CRIU_code/test/zdtm/static/cgroup02.c | 176 + CRIU_code/test/zdtm/static/cgroup02.desc | 4 + CRIU_code/test/zdtm/static/cgroup02.hook | 32 + CRIU_code/test/zdtm/static/cgroup03.c | 171 + CRIU_code/test/zdtm/static/cgroup03.desc | 1 + 
CRIU_code/test/zdtm/static/cgroup03.hook | 14 + CRIU_code/test/zdtm/static/cgroup04.c | 196 + CRIU_code/test/zdtm/static/cgroup04.desc | 1 + CRIU_code/test/zdtm/static/cgroup04.hook | 15 + CRIU_code/test/zdtm/static/cgroup_ifpriomap.c | 364 ++ .../test/zdtm/static/cgroup_ifpriomap.desc | 1 + .../test/zdtm/static/cgroup_ifpriomap.hook | 22 + CRIU_code/test/zdtm/static/cgroup_stray.c | 230 + CRIU_code/test/zdtm/static/cgroup_stray.desc | 4 + CRIU_code/test/zdtm/static/cgroupns.c | 213 + CRIU_code/test/zdtm/static/cgroupns.desc | 4 + .../test/zdtm/static/child_opened_proc.c | 63 + CRIU_code/test/zdtm/static/chroot-file.c | 166 + CRIU_code/test/zdtm/static/chroot-file.desc | 1 + CRIU_code/test/zdtm/static/chroot.c | 164 + CRIU_code/test/zdtm/static/chroot.desc | 1 + CRIU_code/test/zdtm/static/clean_mntns.c | 25 + CRIU_code/test/zdtm/static/clean_mntns.desc | 1 + CRIU_code/test/zdtm/static/clone_fs.c | 104 + CRIU_code/test/zdtm/static/cmdlinenv00.c | 123 + CRIU_code/test/zdtm/static/cmdlinenv00.desc | 1 + .../test/zdtm/static/config_inotify_irmap.c | 91 + .../zdtm/static/config_inotify_irmap.desc | 3 + CRIU_code/test/zdtm/static/conntracks | 57 + CRIU_code/test/zdtm/static/conntracks.desc | 1 + CRIU_code/test/zdtm/static/console.c | 59 + CRIU_code/test/zdtm/static/console.desc | 1 + CRIU_code/test/zdtm/static/cow00.c | 113 + CRIU_code/test/zdtm/static/cow00.desc | 2 + CRIU_code/test/zdtm/static/cow01.c | 510 +++ CRIU_code/test/zdtm/static/cow01.desc | 1 + CRIU_code/test/zdtm/static/cr_veth.c | 69 + CRIU_code/test/zdtm/static/cr_veth.checkskip | 2 + CRIU_code/test/zdtm/static/cr_veth.desc | 4 + CRIU_code/test/zdtm/static/cr_veth.hook | 40 + CRIU_code/test/zdtm/static/criu-rtc.c | 124 + CRIU_code/test/zdtm/static/criu-rtc.proto | 5 + CRIU_code/test/zdtm/static/cwd00.c | 68 + CRIU_code/test/zdtm/static/cwd01.c | 101 + CRIU_code/test/zdtm/static/cwd02.c | 92 + .../test/zdtm/static/del_standalone_un.c | 124 + .../test/zdtm/static/del_standalone_un.desc | 1 + 
CRIU_code/test/zdtm/static/deleted_dev.c | 76 + CRIU_code/test/zdtm/static/deleted_dev.desc | 1 + .../test/zdtm/static/deleted_unix_sock.c | 193 + CRIU_code/test/zdtm/static/different_creds.c | 148 + .../test/zdtm/static/different_creds.desc | 1 + CRIU_code/test/zdtm/static/dumpable01.c | 48 + CRIU_code/test/zdtm/static/dumpable02.c | 208 + CRIU_code/test/zdtm/static/dumpable02.desc | 1 + CRIU_code/test/zdtm/static/env00.c | 39 + CRIU_code/test/zdtm/static/epoll.c | 137 + CRIU_code/test/zdtm/static/epoll.desc | 1 + CRIU_code/test/zdtm/static/eventfs00.c | 98 + CRIU_code/test/zdtm/static/fanotify00.c | 319 ++ CRIU_code/test/zdtm/static/fanotify00.desc | 1 + CRIU_code/test/zdtm/static/fd.c | 109 + CRIU_code/test/zdtm/static/fd01.c | 117 + CRIU_code/test/zdtm/static/fd01.desc | 1 + CRIU_code/test/zdtm/static/fdt_shared.c | 206 + CRIU_code/test/zdtm/static/fifo-ghost.c | 78 + CRIU_code/test/zdtm/static/fifo-rowo-pair.c | 158 + CRIU_code/test/zdtm/static/fifo.c | 83 + CRIU_code/test/zdtm/static/fifo_ro.c | 91 + CRIU_code/test/zdtm/static/fifo_wronly.c | 119 + CRIU_code/test/zdtm/static/file_append.c | 61 + CRIU_code/test/zdtm/static/file_attr.c | 121 + CRIU_code/test/zdtm/static/file_fown.c | 182 + CRIU_code/test/zdtm/static/file_fown.desc | 1 + CRIU_code/test/zdtm/static/file_lease00.c | 84 + CRIU_code/test/zdtm/static/file_lease00.desc | 1 + CRIU_code/test/zdtm/static/file_lease01.c | 88 + CRIU_code/test/zdtm/static/file_lease01.desc | 1 + CRIU_code/test/zdtm/static/file_lease02.c | 145 + CRIU_code/test/zdtm/static/file_lease02.desc | 1 + CRIU_code/test/zdtm/static/file_lease03.c | 146 + CRIU_code/test/zdtm/static/file_lease03.desc | 1 + CRIU_code/test/zdtm/static/file_lease04.c | 132 + CRIU_code/test/zdtm/static/file_lease04.desc | 1 + CRIU_code/test/zdtm/static/file_locks00.c | 197 + CRIU_code/test/zdtm/static/file_locks00.desc | 1 + CRIU_code/test/zdtm/static/file_locks01.c | 194 + CRIU_code/test/zdtm/static/file_locks01.desc | 1 + 
CRIU_code/test/zdtm/static/file_locks02.c | 105 + CRIU_code/test/zdtm/static/file_locks02.desc | 1 + CRIU_code/test/zdtm/static/file_locks03.c | 111 + CRIU_code/test/zdtm/static/file_locks03.desc | 1 + CRIU_code/test/zdtm/static/file_locks04.c | 120 + CRIU_code/test/zdtm/static/file_locks04.desc | 1 + CRIU_code/test/zdtm/static/file_locks05.c | 50 + CRIU_code/test/zdtm/static/file_locks05.desc | 1 + CRIU_code/test/zdtm/static/file_locks06.c | 65 + .../test/zdtm/static/file_locks06.checkskip | 19 + CRIU_code/test/zdtm/static/file_locks06.desc | 1 + CRIU_code/test/zdtm/static/file_locks07.c | 99 + .../test/zdtm/static/file_locks07.checkskip | 1 + CRIU_code/test/zdtm/static/file_locks07.desc | 1 + CRIU_code/test/zdtm/static/file_locks08.c | 90 + .../test/zdtm/static/file_locks08.checkskip | 1 + CRIU_code/test/zdtm/static/file_locks08.desc | 1 + CRIU_code/test/zdtm/static/file_shared.c | 117 + CRIU_code/test/zdtm/static/fpu00.c | 87 + CRIU_code/test/zdtm/static/fpu00.desc | 1 + CRIU_code/test/zdtm/static/fpu01.c | 119 + CRIU_code/test/zdtm/static/fpu01.desc | 1 + CRIU_code/test/zdtm/static/fpu02.c | 88 + CRIU_code/test/zdtm/static/fpu02.desc | 1 + CRIU_code/test/zdtm/static/futex-rl.c | 126 + CRIU_code/test/zdtm/static/futex.c | 88 + CRIU_code/test/zdtm/static/get_smaps_bits.c | 127 + CRIU_code/test/zdtm/static/get_smaps_bits.h | 6 + CRIU_code/test/zdtm/static/ghost_holes00.c | 168 + CRIU_code/test/zdtm/static/ghost_holes01.c | 1 + CRIU_code/test/zdtm/static/ghost_holes02.c | 1 + CRIU_code/test/zdtm/static/ghost_on_rofs.c | 179 + CRIU_code/test/zdtm/static/ghost_on_rofs.desc | 1 + CRIU_code/test/zdtm/static/groups.c | 64 + CRIU_code/test/zdtm/static/groups.desc | 1 + CRIU_code/test/zdtm/static/grow_map.c | 69 + CRIU_code/test/zdtm/static/grow_map.desc | 1 + CRIU_code/test/zdtm/static/grow_map02.c | 63 + CRIU_code/test/zdtm/static/grow_map02.desc | 1 + CRIU_code/test/zdtm/static/grow_map03.c | 40 + CRIU_code/test/zdtm/static/grow_map03.desc | 1 + 
.../test/zdtm/static/helper_zombie_child.c | 109 + .../test/zdtm/static/helper_zombie_child.desc | 1 + CRIU_code/test/zdtm/static/inotify00.c | 255 ++ CRIU_code/test/zdtm/static/inotify00.desc | 1 + CRIU_code/test/zdtm/static/inotify01.c | 1 + CRIU_code/test/zdtm/static/inotify01.desc | 1 + CRIU_code/test/zdtm/static/inotify02.c | 98 + CRIU_code/test/zdtm/static/inotify02.desc | 1 + CRIU_code/test/zdtm/static/inotify_irmap.c | 76 + CRIU_code/test/zdtm/static/inotify_irmap.desc | 1 + CRIU_code/test/zdtm/static/inotify_irmap.hook | 19 + CRIU_code/test/zdtm/static/inotify_system.c | 391 ++ .../test/zdtm/static/inotify_system.desc | 1 + .../test/zdtm/static/inotify_system_nodel.c | 1 + .../zdtm/static/inotify_system_nodel.desc | 1 + CRIU_code/test/zdtm/static/ipc_namespace.c | 414 ++ CRIU_code/test/zdtm/static/ipc_namespace.desc | 1 + CRIU_code/test/zdtm/static/jobctl00.c | 301 ++ CRIU_code/test/zdtm/static/link10.c | 80 + CRIU_code/test/zdtm/static/loginuid.c | 99 + CRIU_code/test/zdtm/static/loginuid.desc | 1 + CRIU_code/test/zdtm/static/macvlan.c | 70 + CRIU_code/test/zdtm/static/macvlan.desc | 8 + CRIU_code/test/zdtm/static/macvlan.hook | 33 + CRIU_code/test/zdtm/static/maps00.c | 268 ++ CRIU_code/test/zdtm/static/maps01.c | 183 + CRIU_code/test/zdtm/static/maps01.desc | 1 + CRIU_code/test/zdtm/static/maps02.c | 111 + CRIU_code/test/zdtm/static/maps03.c | 47 + CRIU_code/test/zdtm/static/maps03.desc | 1 + CRIU_code/test/zdtm/static/maps04.c | 57 + CRIU_code/test/zdtm/static/maps04.desc | 1 + CRIU_code/test/zdtm/static/maps05.c | 91 + CRIU_code/test/zdtm/static/maps06.c | 70 + CRIU_code/test/zdtm/static/maps_file_prot.c | 53 + CRIU_code/test/zdtm/static/mem-touch.c | 62 + CRIU_code/test/zdtm/static/mem-touch.desc | 1 + CRIU_code/test/zdtm/static/mlock_setuid.c | 57 + CRIU_code/test/zdtm/static/mlock_setuid.desc | 1 + CRIU_code/test/zdtm/static/mmx00.c | 99 + CRIU_code/test/zdtm/static/mmx00.desc | 1 + CRIU_code/test/zdtm/static/mnt_enablefs.c | 43 + 
.../test/zdtm/static/mnt_enablefs.checkskip | 3 + CRIU_code/test/zdtm/static/mnt_enablefs.desc | 4 + CRIU_code/test/zdtm/static/mnt_ext_auto.c | 200 + CRIU_code/test/zdtm/static/mnt_ext_auto.desc | 1 + CRIU_code/test/zdtm/static/mnt_ext_dev.c | 108 + CRIU_code/test/zdtm/static/mnt_ext_dev.desc | 6 + CRIU_code/test/zdtm/static/mnt_ext_manual.c | 1 + .../test/zdtm/static/mnt_ext_manual.desc | 4 + CRIU_code/test/zdtm/static/mnt_ext_master.c | 71 + .../test/zdtm/static/mnt_ext_master.desc | 3 + CRIU_code/test/zdtm/static/mnt_ro_bind.c | 84 + CRIU_code/test/zdtm/static/mnt_ro_bind.desc | 1 + CRIU_code/test/zdtm/static/mnt_tracefs.c | 72 + .../test/zdtm/static/mnt_tracefs.checkskip | 5 + CRIU_code/test/zdtm/static/mnt_tracefs.desc | 3 + CRIU_code/test/zdtm/static/mnt_tracefs.hook | 5 + CRIU_code/test/zdtm/static/mntns-deleted-dst | 0 CRIU_code/test/zdtm/static/mntns_deleted.c | 102 + CRIU_code/test/zdtm/static/mntns_deleted.desc | 1 + CRIU_code/test/zdtm/static/mntns_ghost.c | 115 + CRIU_code/test/zdtm/static/mntns_ghost.desc | 1 + CRIU_code/test/zdtm/static/mntns_ghost01.c | 120 + CRIU_code/test/zdtm/static/mntns_ghost01.desc | 1 + CRIU_code/test/zdtm/static/mntns_link_ghost.c | 1 + .../test/zdtm/static/mntns_link_ghost.desc | 1 + CRIU_code/test/zdtm/static/mntns_link_remap.c | 250 ++ .../test/zdtm/static/mntns_link_remap.desc | 1 + CRIU_code/test/zdtm/static/mntns_open.c | 139 + CRIU_code/test/zdtm/static/mntns_open.desc | 1 + CRIU_code/test/zdtm/static/mntns_overmount.c | 69 + .../test/zdtm/static/mntns_overmount.desc | 1 + CRIU_code/test/zdtm/static/mntns_remap.c | 100 + CRIU_code/test/zdtm/static/mntns_remap.desc | 1 + CRIU_code/test/zdtm/static/mntns_ro_root.c | 69 + CRIU_code/test/zdtm/static/mntns_ro_root.desc | 1 + CRIU_code/test/zdtm/static/mntns_root_bind.c | 125 + .../test/zdtm/static/mntns_root_bind.desc | 1 + .../test/zdtm/static/mntns_root_bind02.c | 1 + .../test/zdtm/static/mntns_root_bind02.desc | 1 + CRIU_code/test/zdtm/static/mntns_rw_ro_rw.c | 46 + 
.../test/zdtm/static/mntns_rw_ro_rw.desc | 1 + .../test/zdtm/static/mntns_shared_bind.c | 130 + .../test/zdtm/static/mntns_shared_bind.desc | 1 + .../test/zdtm/static/mntns_shared_bind02.c | 1 + .../test/zdtm/static/mntns_shared_bind02.desc | 1 + .../test/zdtm/static/mntns_shared_bind03.c | 123 + .../test/zdtm/static/mntns_shared_bind03.desc | 1 + .../zdtm/static/mntns_shared_vs_private.c | 117 + .../zdtm/static/mntns_shared_vs_private.desc | 1 + CRIU_code/test/zdtm/static/mount_paths.c | 57 + CRIU_code/test/zdtm/static/mount_paths.desc | 1 + CRIU_code/test/zdtm/static/mountpoints.c | 304 ++ CRIU_code/test/zdtm/static/mountpoints.desc | 1 + CRIU_code/test/zdtm/static/mprotect00.c | 116 + CRIU_code/test/zdtm/static/msgque.c | 137 + CRIU_code/test/zdtm/static/msgque.desc | 1 + CRIU_code/test/zdtm/static/mtime_mmap.c | 115 + CRIU_code/test/zdtm/static/netns-dev.c | 503 +++ CRIU_code/test/zdtm/static/netns-dev.desc | 1 + CRIU_code/test/zdtm/static/netns-nf.c | 48 + CRIU_code/test/zdtm/static/netns-nf.desc | 6 + CRIU_code/test/zdtm/static/netns.c | 55 + CRIU_code/test/zdtm/static/netns.desc | 3 + CRIU_code/test/zdtm/static/netns_sub.c | 208 + CRIU_code/test/zdtm/static/netns_sub.desc | 1 + CRIU_code/test/zdtm/static/netns_sub_veth.c | 124 + .../test/zdtm/static/netns_sub_veth.desc | 6 + .../static/non_uniform_share_propagation.c | 131 + .../static/non_uniform_share_propagation.desc | 1 + CRIU_code/test/zdtm/static/ofd_file_locks.c | 194 + CRIU_code/test/zdtm/static/ofd_file_locks.h | 21 + CRIU_code/test/zdtm/static/oom_score_adj.c | 94 + CRIU_code/test/zdtm/static/overmount_dev.c | 93 + CRIU_code/test/zdtm/static/overmount_dev.desc | 1 + CRIU_code/test/zdtm/static/overmount_fifo.c | 90 + .../test/zdtm/static/overmount_fifo.desc | 1 + CRIU_code/test/zdtm/static/overmount_file.c | 73 + .../test/zdtm/static/overmount_file.desc | 1 + CRIU_code/test/zdtm/static/overmount_sock.c | 207 + .../test/zdtm/static/overmount_sock.desc | 1 + .../static/overmount_with_shared_parent.c | 
69 + .../static/overmount_with_shared_parent.desc | 1 + CRIU_code/test/zdtm/static/overmounted_file.c | 109 + .../test/zdtm/static/overmounted_file.desc | 1 + CRIU_code/test/zdtm/static/packet_sock.c | 301 ++ CRIU_code/test/zdtm/static/packet_sock.desc | 1 + CRIU_code/test/zdtm/static/packet_sock_mmap.c | 104 + .../test/zdtm/static/packet_sock_mmap.desc | 1 + CRIU_code/test/zdtm/static/packet_sock_spkt.c | 88 + .../test/zdtm/static/packet_sock_spkt.desc | 1 + CRIU_code/test/zdtm/static/pdeath_sig.c | 109 + CRIU_code/test/zdtm/static/pid00.c | 93 + CRIU_code/test/zdtm/static/pid00.desc | 1 + CRIU_code/test/zdtm/static/pid_file.c | 52 + CRIU_code/test/zdtm/static/pipe00.c | 121 + CRIU_code/test/zdtm/static/pipe01.c | 132 + CRIU_code/test/zdtm/static/pipe02.c | 61 + CRIU_code/test/zdtm/static/pipe03.c | 54 + CRIU_code/test/zdtm/static/poll.c | 138 + CRIU_code/test/zdtm/static/poll.desc | 1 + CRIU_code/test/zdtm/static/posix_timers.c | 443 ++ .../zdtm/static/private_bind_propagation.c | 116 + .../zdtm/static/private_bind_propagation.desc | 1 + CRIU_code/test/zdtm/static/proc-self.c | 78 + CRIU_code/test/zdtm/static/pstree.c | 87 + CRIU_code/test/zdtm/static/pthread00.c | 185 + CRIU_code/test/zdtm/static/pthread01.c | 209 + CRIU_code/test/zdtm/static/pthread02.c | 43 + CRIU_code/test/zdtm/static/pthread02.desc | 1 + CRIU_code/test/zdtm/static/ptrace_sig.c | 165 + CRIU_code/test/zdtm/static/ptrace_sig.desc | 1 + CRIU_code/test/zdtm/static/pty-console.c | 1 + CRIU_code/test/zdtm/static/pty-console.desc | 1 + CRIU_code/test/zdtm/static/pty00.c | 138 + CRIU_code/test/zdtm/static/pty01.c | 131 + CRIU_code/test/zdtm/static/pty01.desc | 1 + CRIU_code/test/zdtm/static/pty02.c | 103 + CRIU_code/test/zdtm/static/pty03.c | 83 + CRIU_code/test/zdtm/static/pty03.desc | 1 + CRIU_code/test/zdtm/static/pty04.c | 64 + CRIU_code/test/zdtm/static/remap_dead_pid.c | 78 + .../test/zdtm/static/remap_dead_pid.desc | 1 + .../test/zdtm/static/remap_dead_pid_root.c | 1 + 
.../test/zdtm/static/remap_dead_pid_root.desc | 1 + CRIU_code/test/zdtm/static/rlimits00.c | 66 + CRIU_code/test/zdtm/static/rmdir_open.c | 73 + CRIU_code/test/zdtm/static/route_rules | 73 + CRIU_code/test/zdtm/static/rtc.c | 62 + CRIU_code/test/zdtm/static/rtc.desc | 1 + CRIU_code/test/zdtm/static/s390x_gs_threads.c | 187 + CRIU_code/test/zdtm/static/s390x_mmap_high.c | 64 + .../test/zdtm/static/s390x_mmap_high.desc | 1 + CRIU_code/test/zdtm/static/s390x_regs_check.c | 575 +++ .../test/zdtm/static/s390x_regs_check.desc | 1 + .../test/zdtm/static/s390x_runtime_instr.c | 214 + CRIU_code/test/zdtm/static/sched_policy00.c | 88 + .../test/zdtm/static/sched_policy00.desc | 1 + CRIU_code/test/zdtm/static/sched_prio00.c | 79 + CRIU_code/test/zdtm/static/sched_prio00.desc | 1 + CRIU_code/test/zdtm/static/scm00.c | 162 + CRIU_code/test/zdtm/static/scm01.c | 1 + CRIU_code/test/zdtm/static/scm02.c | 1 + CRIU_code/test/zdtm/static/scm03.c | 154 + CRIU_code/test/zdtm/static/scm04.c | 1 + CRIU_code/test/zdtm/static/scm05.c | 139 + CRIU_code/test/zdtm/static/scm06.c | 147 + CRIU_code/test/zdtm/static/scm06.desc | 1 + CRIU_code/test/zdtm/static/seccomp_filter.c | 200 + .../test/zdtm/static/seccomp_filter.desc | 1 + .../zdtm/static/seccomp_filter_inheritance.c | 190 + .../static/seccomp_filter_inheritance.desc | 1 + .../test/zdtm/static/seccomp_filter_threads.c | 225 + .../zdtm/static/seccomp_filter_threads.desc | 1 + .../test/zdtm/static/seccomp_filter_tsync.c | 215 + .../zdtm/static/seccomp_filter_tsync.desc | 1 + CRIU_code/test/zdtm/static/seccomp_strict.c | 135 + .../test/zdtm/static/seccomp_strict.desc | 1 + CRIU_code/test/zdtm/static/selfexe00.c | 60 + CRIU_code/test/zdtm/static/selinux00.c | 142 + .../test/zdtm/static/selinux00.checkskip | 25 + CRIU_code/test/zdtm/static/selinux00.desc | 1 + CRIU_code/test/zdtm/static/selinux00.hook | 32 + CRIU_code/test/zdtm/static/selinux01.c | 200 + .../test/zdtm/static/selinux01.checkskip | 1 + CRIU_code/test/zdtm/static/selinux01.desc | 
1 + CRIU_code/test/zdtm/static/selinux01.hook | 1 + CRIU_code/test/zdtm/static/selinux02.c | 1 + .../test/zdtm/static/selinux02.checkskip | 1 + CRIU_code/test/zdtm/static/selinux02.desc | 1 + CRIU_code/test/zdtm/static/selinux02.hook | 1 + CRIU_code/test/zdtm/static/sem.c | 186 + CRIU_code/test/zdtm/static/sem.desc | 1 + CRIU_code/test/zdtm/static/session00.c | 236 + CRIU_code/test/zdtm/static/session00.desc | 1 + CRIU_code/test/zdtm/static/session01.c | 337 ++ CRIU_code/test/zdtm/static/session01.desc | 1 + CRIU_code/test/zdtm/static/session02.c | 327 ++ CRIU_code/test/zdtm/static/session02.desc | 1 + CRIU_code/test/zdtm/static/session03.c | 376 ++ CRIU_code/test/zdtm/static/session03.desc | 1 + .../zdtm/static/shared_mount_propagation.c | 119 + .../zdtm/static/shared_mount_propagation.desc | 1 + .../zdtm/static/shared_slave_mount_children.c | 125 + .../static/shared_slave_mount_children.desc | 1 + CRIU_code/test/zdtm/static/shm-mp.c | 115 + CRIU_code/test/zdtm/static/shm-mp.desc | 1 + CRIU_code/test/zdtm/static/shm-unaligned.c | 1 + CRIU_code/test/zdtm/static/shm-unaligned.desc | 1 + CRIU_code/test/zdtm/static/shm.c | 197 + CRIU_code/test/zdtm/static/shm.desc | 1 + CRIU_code/test/zdtm/static/sigaltstack.c | 169 + CRIU_code/test/zdtm/static/signalfd00.c | 71 + CRIU_code/test/zdtm/static/sigpending.c | 306 ++ CRIU_code/test/zdtm/static/sit.c | 58 + CRIU_code/test/zdtm/static/sit.desc | 3 + .../test/zdtm/static/sk-freebind-false.c | 1 + CRIU_code/test/zdtm/static/sk-freebind.c | 77 + CRIU_code/test/zdtm/static/sk-netlink.c | 160 + CRIU_code/test/zdtm/static/sk-netlink.desc | 1 + CRIU_code/test/zdtm/static/sk-unix-mntns.c | 162 + CRIU_code/test/zdtm/static/sk-unix-mntns.desc | 5 + CRIU_code/test/zdtm/static/sk-unix-rel.c | 111 + CRIU_code/test/zdtm/static/sk-unix-unconn.c | 69 + CRIU_code/test/zdtm/static/sk-unix01.c | 375 ++ CRIU_code/test/zdtm/static/sk-unix01.desc | 1 + CRIU_code/test/zdtm/static/skip-me.c | 12 + CRIU_code/test/zdtm/static/sleeping00.c | 18 + 
CRIU_code/test/zdtm/static/sock_filter.c | 115 + CRIU_code/test/zdtm/static/sock_opts00.c | 91 + CRIU_code/test/zdtm/static/sock_opts00.desc | 1 + CRIU_code/test/zdtm/static/sock_opts01.c | 61 + CRIU_code/test/zdtm/static/sock_opts01.desc | 1 + CRIU_code/test/zdtm/static/sock_peercred.c | 127 + CRIU_code/test/zdtm/static/sock_peercred.desc | 1 + CRIU_code/test/zdtm/static/socket-ext.c | 128 + CRIU_code/test/zdtm/static/socket-ext.desc | 1 + CRIU_code/test/zdtm/static/socket-raw.c | 408 ++ CRIU_code/test/zdtm/static/socket-raw.desc | 1 + .../test/zdtm/static/socket-tcp-close-wait.c | 287 ++ .../zdtm/static/socket-tcp-close-wait.desc | 1 + .../zdtm/static/socket-tcp-close-wait.hook | 1 + .../test/zdtm/static/socket-tcp-close0.c | 75 + .../test/zdtm/static/socket-tcp-close0.desc | 1 + .../test/zdtm/static/socket-tcp-close1.c | 53 + .../test/zdtm/static/socket-tcp-close1.desc | 1 + .../zdtm/static/socket-tcp-closed-last-ack.c | 1 + .../static/socket-tcp-closed-last-ack.desc | 10 + .../static/socket-tcp-closed-last-ack.hook | 1 + .../test/zdtm/static/socket-tcp-closed.c | 146 + .../test/zdtm/static/socket-tcp-closed.desc | 1 + .../test/zdtm/static/socket-tcp-closed.hook | 1 + .../test/zdtm/static/socket-tcp-closing.c | 243 ++ .../test/zdtm/static/socket-tcp-closing.desc | 1 + .../test/zdtm/static/socket-tcp-closing.hook | 1 + .../test/zdtm/static/socket-tcp-fin-wait1.c | 232 + .../zdtm/static/socket-tcp-fin-wait1.desc | 1 + .../zdtm/static/socket-tcp-fin-wait1.hook | 70 + .../test/zdtm/static/socket-tcp-fin-wait2.c | 1 + .../zdtm/static/socket-tcp-fin-wait2.desc | 1 + .../zdtm/static/socket-tcp-fin-wait2.hook | 1 + .../test/zdtm/static/socket-tcp-last-ack.c | 1 + .../test/zdtm/static/socket-tcp-last-ack.desc | 1 + .../test/zdtm/static/socket-tcp-last-ack.hook | 1 + CRIU_code/test/zdtm/static/socket-tcp-local.c | 1 + .../test/zdtm/static/socket-tcp-local.desc | 1 + .../test/zdtm/static/socket-tcp-local.hook | 1 + .../test/zdtm/static/socket-tcp-nfconntrack.c | 1 + 
.../zdtm/static/socket-tcp-nfconntrack.desc | 1 + .../test/zdtm/static/socket-tcp-reseted.c | 96 + .../test/zdtm/static/socket-tcp-reseted.desc | 10 + .../test/zdtm/static/socket-tcp-reseted.hook | 1 + .../test/zdtm/static/socket-tcp-reuseport.c | 171 + .../zdtm/static/socket-tcp-reuseport.desc | 1 + .../zdtm/static/socket-tcp-skip-in-flight.c | 89 + .../static/socket-tcp-skip-in-flight.desc | 1 + .../test/zdtm/static/socket-tcp-syn-sent.c | 143 + .../test/zdtm/static/socket-tcp-syn-sent.desc | 9 + .../test/zdtm/static/socket-tcp-syn-sent.hook | 1 + .../test/zdtm/static/socket-tcp-unconn.c | 122 + .../test/zdtm/static/socket-tcp-unconn.desc | 1 + CRIU_code/test/zdtm/static/socket-tcp.c | 219 + CRIU_code/test/zdtm/static/socket-tcp.desc | 1 + .../zdtm/static/socket-tcp4v6-close-wait.c | 1 + .../zdtm/static/socket-tcp4v6-close-wait.desc | 1 + .../test/zdtm/static/socket-tcp4v6-closed.c | 1 + .../zdtm/static/socket-tcp4v6-closed.desc | 1 + .../test/zdtm/static/socket-tcp4v6-closing.c | 1 + .../zdtm/static/socket-tcp4v6-closing.desc | 1 + .../zdtm/static/socket-tcp4v6-fin-wait1.c | 1 + .../zdtm/static/socket-tcp4v6-fin-wait1.desc | 1 + .../zdtm/static/socket-tcp4v6-fin-wait2.c | 1 + .../zdtm/static/socket-tcp4v6-fin-wait2.desc | 1 + .../test/zdtm/static/socket-tcp4v6-last-ack.c | 1 + .../zdtm/static/socket-tcp4v6-last-ack.desc | 1 + .../test/zdtm/static/socket-tcp4v6-local.c | 1 + .../test/zdtm/static/socket-tcp4v6-local.desc | 1 + CRIU_code/test/zdtm/static/socket-tcp4v6.c | 1 + CRIU_code/test/zdtm/static/socket-tcp4v6.desc | 1 + .../test/zdtm/static/socket-tcp6-close-wait.c | 1 + .../zdtm/static/socket-tcp6-close-wait.desc | 1 + .../test/zdtm/static/socket-tcp6-closed.c | 1 + .../test/zdtm/static/socket-tcp6-closed.desc | 1 + .../test/zdtm/static/socket-tcp6-closing.c | 1 + .../test/zdtm/static/socket-tcp6-closing.desc | 1 + .../test/zdtm/static/socket-tcp6-closing.hook | 1 + .../test/zdtm/static/socket-tcp6-fin-wait1.c | 1 + 
.../zdtm/static/socket-tcp6-fin-wait1.desc | 1 + .../test/zdtm/static/socket-tcp6-fin-wait2.c | 1 + .../zdtm/static/socket-tcp6-fin-wait2.desc | 1 + .../test/zdtm/static/socket-tcp6-last-ack.c | 1 + .../zdtm/static/socket-tcp6-last-ack.desc | 1 + .../test/zdtm/static/socket-tcp6-local.c | 1 + .../test/zdtm/static/socket-tcp6-local.desc | 1 + .../test/zdtm/static/socket-tcp6-unconn.c | 1 + .../test/zdtm/static/socket-tcp6-unconn.desc | 1 + CRIU_code/test/zdtm/static/socket-tcp6.c | 1 + CRIU_code/test/zdtm/static/socket-tcp6.desc | 1 + .../test/zdtm/static/socket-tcpbuf-local.c | 1 + .../test/zdtm/static/socket-tcpbuf-local.desc | 1 + CRIU_code/test/zdtm/static/socket-tcpbuf.c | 321 ++ CRIU_code/test/zdtm/static/socket-tcpbuf.desc | 1 + .../test/zdtm/static/socket-tcpbuf6-local.c | 1 + .../zdtm/static/socket-tcpbuf6-local.desc | 1 + CRIU_code/test/zdtm/static/socket-tcpbuf6.c | 1 + .../test/zdtm/static/socket-tcpbuf6.desc | 1 + CRIU_code/test/zdtm/static/socket6_udp.c | 124 + CRIU_code/test/zdtm/static/socket_aio.c | 145 + CRIU_code/test/zdtm/static/socket_aio.desc | 1 + .../test/zdtm/static/socket_close_data.c | 43 + .../test/zdtm/static/socket_close_data01.c | 115 + .../test/zdtm/static/socket_dgram_data.c | 81 + CRIU_code/test/zdtm/static/socket_listen.c | 123 + CRIU_code/test/zdtm/static/socket_listen4v6.c | 1 + CRIU_code/test/zdtm/static/socket_listen6.c | 1 + CRIU_code/test/zdtm/static/socket_queues.c | 108 + CRIU_code/test/zdtm/static/socket_snd_addr.c | 100 + .../test/zdtm/static/socket_snd_addr.desc | 1 + .../test/zdtm/static/socket_udp-broadcast.c | 47 + .../test/zdtm/static/socket_udp-corked.c | 76 + .../test/zdtm/static/socket_udp-corked.desc | 1 + CRIU_code/test/zdtm/static/socket_udp.c | 129 + .../test/zdtm/static/socket_udp_shutdown.c | 128 + CRIU_code/test/zdtm/static/socket_udplite.c | 185 + CRIU_code/test/zdtm/static/sockets00.c | 164 + CRIU_code/test/zdtm/static/sockets00.desc | 1 + CRIU_code/test/zdtm/static/sockets01.c | 148 + 
CRIU_code/test/zdtm/static/sockets02.c | 65 + CRIU_code/test/zdtm/static/sockets03.c | 121 + CRIU_code/test/zdtm/static/sockets03.desc | 1 + CRIU_code/test/zdtm/static/sockets_dgram.c | 210 + CRIU_code/test/zdtm/static/sockets_spair.c | 56 + CRIU_code/test/zdtm/static/sse00.c | 88 + CRIU_code/test/zdtm/static/sse00.desc | 1 + CRIU_code/test/zdtm/static/sse20.c | 88 + CRIU_code/test/zdtm/static/sse20.desc | 1 + CRIU_code/test/zdtm/static/stopped.c | 87 + CRIU_code/test/zdtm/static/stopped01.c | 1 + CRIU_code/test/zdtm/static/stopped02.c | 1 + CRIU_code/test/zdtm/static/stopped12.c | 1 + CRIU_code/test/zdtm/static/tempfs.c | 111 + CRIU_code/test/zdtm/static/tempfs.desc | 1 + .../test/zdtm/static/tempfs_overmounted.c | 68 + .../test/zdtm/static/tempfs_overmounted.desc | 1 + .../test/zdtm/static/tempfs_overmounted01.c | 118 + .../zdtm/static/tempfs_overmounted01.desc | 1 + CRIU_code/test/zdtm/static/tempfs_ro.c | 78 + CRIU_code/test/zdtm/static/tempfs_ro.desc | 1 + CRIU_code/test/zdtm/static/tempfs_ro02.c | 50 + CRIU_code/test/zdtm/static/tempfs_ro02.desc | 1 + CRIU_code/test/zdtm/static/tempfs_subns.c | 135 + CRIU_code/test/zdtm/static/tempfs_subns.desc | 1 + CRIU_code/test/zdtm/static/thp_disable.c | 66 + .../zdtm/static/thread_different_uid_gid.c | 163 + .../zdtm/static/thread_different_uid_gid.desc | 1 + CRIU_code/test/zdtm/static/timerfd.c | 166 + CRIU_code/test/zdtm/static/timerfd.desc | 1 + CRIU_code/test/zdtm/static/timers.c | 93 + CRIU_code/test/zdtm/static/tty00.c | 108 + CRIU_code/test/zdtm/static/tty02.c | 53 + CRIU_code/test/zdtm/static/tty03.c | 121 + CRIU_code/test/zdtm/static/tun.c | 238 + CRIU_code/test/zdtm/static/tun.desc | 1 + CRIU_code/test/zdtm/static/tun_ns.c | 1 + CRIU_code/test/zdtm/static/tun_ns.desc | 1 + CRIU_code/test/zdtm/static/uffd-events.c | 187 + CRIU_code/test/zdtm/static/umask00.c | 30 + CRIU_code/test/zdtm/static/unbound_sock.c | 42 + CRIU_code/test/zdtm/static/unhashed_proc.c | 81 + CRIU_code/test/zdtm/static/unhashed_proc.desc | 1 
+ CRIU_code/test/zdtm/static/unlink_fifo.c | 50 + .../test/zdtm/static/unlink_fifo_wronly.c | 60 + CRIU_code/test/zdtm/static/unlink_fstat00.c | 173 + .../test/zdtm/static/unlink_fstat00.hook | 11 + CRIU_code/test/zdtm/static/unlink_fstat01+.c | 1 + CRIU_code/test/zdtm/static/unlink_fstat01.c | 93 + CRIU_code/test/zdtm/static/unlink_fstat02.c | 115 + CRIU_code/test/zdtm/static/unlink_fstat03.c | 111 + .../test/zdtm/static/unlink_fstat03.desc | 1 + CRIU_code/test/zdtm/static/unlink_fstat04.c | 1 + .../test/zdtm/static/unlink_fstat04.desc | 1 + CRIU_code/test/zdtm/static/unlink_fstat041.c | 1 + CRIU_code/test/zdtm/static/unlink_largefile.c | 59 + .../test/zdtm/static/unlink_largefile.desc | 1 + CRIU_code/test/zdtm/static/unlink_mmap00.c | 78 + CRIU_code/test/zdtm/static/unlink_mmap00.desc | 1 + CRIU_code/test/zdtm/static/unlink_mmap01.c | 102 + CRIU_code/test/zdtm/static/unlink_mmap01.desc | 1 + CRIU_code/test/zdtm/static/unlink_mmap02.c | 77 + CRIU_code/test/zdtm/static/unlink_mmap02.desc | 1 + .../zdtm/static/unlink_multiple_largefiles.c | 267 ++ .../static/unlink_multiple_largefiles.desc | 1 + CRIU_code/test/zdtm/static/unlink_regular00.c | 110 + .../test/zdtm/static/unlink_regular00.desc | 1 + .../static/unsupported_children_collision.c | 110 + .../unsupported_children_collision.desc | 1 + CRIU_code/test/zdtm/static/uptime_grow.c | 51 + CRIU_code/test/zdtm/static/uptime_grow.desc | 1 + CRIU_code/test/zdtm/static/utsname.c | 46 + CRIU_code/test/zdtm/static/utsname.desc | 1 + CRIU_code/test/zdtm/static/vdso-proxy.c | 170 + CRIU_code/test/zdtm/static/vdso00.c | 34 + CRIU_code/test/zdtm/static/vdso01.c | 446 ++ CRIU_code/test/zdtm/static/vdso01.desc | 1 + CRIU_code/test/zdtm/static/vdso02.c | 231 + CRIU_code/test/zdtm/static/vfork00.c | 80 + CRIU_code/test/zdtm/static/vfork00.desc | 1 + CRIU_code/test/zdtm/static/vsx.c | 400 ++ CRIU_code/test/zdtm/static/vsx.desc | 1 + CRIU_code/test/zdtm/static/vt.c | 64 + CRIU_code/test/zdtm/static/vt.desc | 1 + 
CRIU_code/test/zdtm/static/wait00.c | 61 + CRIU_code/test/zdtm/static/write_read00.c | 61 + CRIU_code/test/zdtm/static/write_read01.c | 69 + CRIU_code/test/zdtm/static/write_read02.c | 80 + CRIU_code/test/zdtm/static/write_read10.c | 130 + CRIU_code/test/zdtm/static/xids00.c | 127 + CRIU_code/test/zdtm/static/zombie00.c | 110 + CRIU_code/test/zdtm/static/zombie01.c | 80 + CRIU_code/test/zdtm/static/zombie01.desc | 1 + CRIU_code/test/zdtm/transition/Makefile | 91 + CRIU_code/test/zdtm/transition/epoll.c | 200 + CRIU_code/test/zdtm/transition/fifo_dyn.c | 151 + CRIU_code/test/zdtm/transition/fifo_dyn.desc | 1 + CRIU_code/test/zdtm/transition/fifo_loop.c | 194 + CRIU_code/test/zdtm/transition/file_aio.c | 102 + CRIU_code/test/zdtm/transition/file_read.c | 240 ++ CRIU_code/test/zdtm/transition/fork.c | 94 + CRIU_code/test/zdtm/transition/fork2.c | 1 + CRIU_code/test/zdtm/transition/ipc.c | 201 + CRIU_code/test/zdtm/transition/ipc.desc | 1 + CRIU_code/test/zdtm/transition/lazy-thp.c | 63 + CRIU_code/test/zdtm/transition/maps007.c | 178 + CRIU_code/test/zdtm/transition/maps007.desc | 1 + CRIU_code/test/zdtm/transition/maps008.c | 512 +++ CRIU_code/test/zdtm/transition/maps008.desc | 1 + CRIU_code/test/zdtm/transition/netlink00.c | 305 ++ CRIU_code/test/zdtm/transition/netlink00.desc | 1 + CRIU_code/test/zdtm/transition/pid_reuse.c | 116 + CRIU_code/test/zdtm/transition/pid_reuse.desc | 1 + CRIU_code/test/zdtm/transition/pipe_loop00.c | 176 + .../test/zdtm/transition/pipe_shared00.c | 140 + CRIU_code/test/zdtm/transition/ptrace.c | 123 + CRIU_code/test/zdtm/transition/ptrace.desc | 1 + CRIU_code/test/zdtm/transition/shmem.c | 81 + CRIU_code/test/zdtm/transition/socket-tcp.c | 1 + .../test/zdtm/transition/socket-tcp.desc | 1 + CRIU_code/test/zdtm/transition/socket-tcp6.c | 1 + .../test/zdtm/transition/socket-tcp6.desc | 1 + .../test/zdtm/transition/socket_loop00.c | 187 + CRIU_code/test/zdtm/transition/thread-bomb.c | 76 + .../test/zdtm/transition/thread-bomb.desc | 1 + 
CRIU_code/test/zdtm/transition/unix_sock.c | 288 ++ CRIU_code/test/zdtm_ct.c | 67 + CRIU_code/test/zdtm_mount_cgroups | 16 + CRIU_code/test/zdtm_umount_cgroups | 16 + README.md | 53 +- 1617 files changed, 173908 insertions(+), 32 deletions(-) create mode 100644 CRIU_code/.gitignore create mode 100644 CRIU_code/.mailmap create mode 100644 CRIU_code/.travis.yml create mode 100644 CRIU_code/COPYING create mode 100644 CRIU_code/CREDITS create mode 100644 CRIU_code/Documentation/.gitattributes create mode 100644 CRIU_code/Documentation/.gitignore create mode 100644 CRIU_code/Documentation/HOWTO.cross-compile create mode 100644 CRIU_code/Documentation/Makefile create mode 100644 CRIU_code/Documentation/asciidoc.conf create mode 100644 CRIU_code/Documentation/crit.txt create mode 100644 CRIU_code/Documentation/criu.txt create mode 100644 CRIU_code/Documentation/custom.xsl create mode 100644 CRIU_code/INSTALL.md create mode 100644 CRIU_code/Makefile create mode 100644 CRIU_code/Makefile.compel create mode 100644 CRIU_code/Makefile.config create mode 100644 CRIU_code/Makefile.install create mode 100644 CRIU_code/Makefile.versions rename README.en.md => CRIU_code/README.en.md (86%) create mode 100644 CRIU_code/README.md create mode 100644 CRIU_code/README.md.orig create mode 100644 CRIU_code/compel/.gitignore create mode 100644 CRIU_code/compel/Makefile create mode 100644 CRIU_code/compel/arch/aarch64/plugins/include/asm/prologue.h create mode 100644 CRIU_code/compel/arch/aarch64/plugins/include/asm/syscall-types.h create mode 100644 CRIU_code/compel/arch/aarch64/plugins/include/features.h create mode 100644 CRIU_code/compel/arch/aarch64/plugins/std/parasite-head.S create mode 100644 CRIU_code/compel/arch/aarch64/plugins/std/syscalls/Makefile.syscalls create mode 100644 CRIU_code/compel/arch/aarch64/plugins/std/syscalls/gen-sys-exec-tbl.pl create mode 100644 CRIU_code/compel/arch/aarch64/plugins/std/syscalls/gen-syscalls.pl create mode 100644 
CRIU_code/compel/arch/aarch64/plugins/std/syscalls/syscall-aux.S create mode 100644 CRIU_code/compel/arch/aarch64/plugins/std/syscalls/syscall-aux.h create mode 100644 CRIU_code/compel/arch/aarch64/plugins/std/syscalls/syscall-common.S create mode 100644 CRIU_code/compel/arch/aarch64/plugins/std/syscalls/syscall.def create mode 100644 CRIU_code/compel/arch/aarch64/scripts/compel-pack.lds.S create mode 100644 CRIU_code/compel/arch/aarch64/src/lib/cpu.c create mode 100644 CRIU_code/compel/arch/aarch64/src/lib/handle-elf-host.c create mode 100644 CRIU_code/compel/arch/aarch64/src/lib/handle-elf.c create mode 100644 CRIU_code/compel/arch/aarch64/src/lib/include/cpu.h create mode 100644 CRIU_code/compel/arch/aarch64/src/lib/include/handle-elf.h create mode 100644 CRIU_code/compel/arch/aarch64/src/lib/include/syscall.h create mode 100644 CRIU_code/compel/arch/aarch64/src/lib/include/uapi/asm/.gitignore create mode 100644 CRIU_code/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h create mode 100644 CRIU_code/compel/arch/aarch64/src/lib/include/uapi/asm/cpu.h create mode 100644 CRIU_code/compel/arch/aarch64/src/lib/include/uapi/asm/fpu.h create mode 100644 CRIU_code/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h create mode 100644 CRIU_code/compel/arch/aarch64/src/lib/include/uapi/asm/processor-flags.h create mode 100644 CRIU_code/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h create mode 100644 CRIU_code/compel/arch/aarch64/src/lib/infect.c create mode 100644 CRIU_code/compel/arch/arm/plugins/include/asm/prologue.h create mode 100644 CRIU_code/compel/arch/arm/plugins/include/asm/syscall-types.h create mode 100644 CRIU_code/compel/arch/arm/plugins/include/features.h create mode 100644 CRIU_code/compel/arch/arm/plugins/std/parasite-head.S create mode 100644 CRIU_code/compel/arch/arm/plugins/std/syscalls/Makefile.syscalls create mode 100644 CRIU_code/compel/arch/arm/plugins/std/syscalls/gen-sys-exec-tbl.pl create mode 100644 
CRIU_code/compel/arch/arm/plugins/std/syscalls/gen-syscalls.pl create mode 100644 CRIU_code/compel/arch/arm/plugins/std/syscalls/syscall-aux.S create mode 100644 CRIU_code/compel/arch/arm/plugins/std/syscalls/syscall-aux.h create mode 100644 CRIU_code/compel/arch/arm/plugins/std/syscalls/syscall-common.S create mode 100644 CRIU_code/compel/arch/arm/plugins/std/syscalls/syscall.def create mode 100644 CRIU_code/compel/arch/arm/scripts/compel-pack.lds.S create mode 100644 CRIU_code/compel/arch/arm/src/lib/cpu.c create mode 100644 CRIU_code/compel/arch/arm/src/lib/handle-elf-host.c create mode 100644 CRIU_code/compel/arch/arm/src/lib/handle-elf.c create mode 100644 CRIU_code/compel/arch/arm/src/lib/include/cpu.h create mode 100644 CRIU_code/compel/arch/arm/src/lib/include/handle-elf.h create mode 100644 CRIU_code/compel/arch/arm/src/lib/include/syscall.h create mode 100644 CRIU_code/compel/arch/arm/src/lib/include/uapi/asm/.gitignore create mode 100644 CRIU_code/compel/arch/arm/src/lib/include/uapi/asm/breakpoints.h create mode 100644 CRIU_code/compel/arch/arm/src/lib/include/uapi/asm/cpu.h create mode 100644 CRIU_code/compel/arch/arm/src/lib/include/uapi/asm/fpu.h create mode 100644 CRIU_code/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h create mode 100644 CRIU_code/compel/arch/arm/src/lib/include/uapi/asm/processor-flags.h create mode 100644 CRIU_code/compel/arch/arm/src/lib/include/uapi/asm/sigframe.h create mode 100644 CRIU_code/compel/arch/arm/src/lib/infect.c create mode 100644 CRIU_code/compel/arch/ppc64/plugins/include/asm/prologue.h create mode 100644 CRIU_code/compel/arch/ppc64/plugins/include/asm/syscall-types.h create mode 100644 CRIU_code/compel/arch/ppc64/plugins/include/features.h create mode 100644 CRIU_code/compel/arch/ppc64/plugins/std/memcmp.S create mode 100644 CRIU_code/compel/arch/ppc64/plugins/std/memcpy.S create mode 100644 CRIU_code/compel/arch/ppc64/plugins/std/parasite-head.S create mode 100644 
CRIU_code/compel/arch/ppc64/plugins/std/syscalls/Makefile.syscalls create mode 100644 CRIU_code/compel/arch/ppc64/plugins/std/syscalls/syscall-common-ppc64.S create mode 100644 CRIU_code/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl create mode 100644 CRIU_code/compel/arch/ppc64/scripts/compel-pack.lds.S create mode 100644 CRIU_code/compel/arch/ppc64/src/lib/cpu.c create mode 100644 CRIU_code/compel/arch/ppc64/src/lib/handle-elf-host.c create mode 100644 CRIU_code/compel/arch/ppc64/src/lib/handle-elf.c create mode 100644 CRIU_code/compel/arch/ppc64/src/lib/include/cpu.h create mode 100644 CRIU_code/compel/arch/ppc64/src/lib/include/handle-elf.h create mode 100644 CRIU_code/compel/arch/ppc64/src/lib/include/syscall.h create mode 100644 CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/.gitignore create mode 100644 CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/breakpoints.h create mode 100644 CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/cpu.h create mode 100644 CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/fpu.h create mode 100644 CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h create mode 100644 CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/processor-flags.h create mode 100644 CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/processor.h create mode 100644 CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h create mode 100644 CRIU_code/compel/arch/ppc64/src/lib/infect.c create mode 100644 CRIU_code/compel/arch/riscv/plugins/include/asm/prologue.h create mode 100644 CRIU_code/compel/arch/riscv/plugins/include/asm/syscall-types.h create mode 100644 CRIU_code/compel/arch/riscv/plugins/include/features.h create mode 100644 CRIU_code/compel/arch/riscv/plugins/std/memcpy.S create mode 100644 CRIU_code/compel/arch/riscv/plugins/std/parasite-head.S create mode 100644 CRIU_code/compel/arch/riscv/plugins/std/syscalls/Makefile.syscalls create mode 100644 
CRIU_code/compel/arch/riscv/plugins/std/syscalls/syscall-common-riscv-64.S create mode 100644 CRIU_code/compel/arch/riscv/plugins/std/syscalls/syscall_64.tbl create mode 100644 CRIU_code/compel/arch/riscv/scripts/compel-pack.lds.S create mode 100644 CRIU_code/compel/arch/riscv/src/lib/cpu.c create mode 100644 CRIU_code/compel/arch/riscv/src/lib/handle-elf.c create mode 100644 CRIU_code/compel/arch/riscv/src/lib/include/handle-elf.h create mode 100644 CRIU_code/compel/arch/riscv/src/lib/include/syscall.h create mode 100644 CRIU_code/compel/arch/riscv/src/lib/include/uapi/asm/breakpoints.h create mode 100644 CRIU_code/compel/arch/riscv/src/lib/include/uapi/asm/cpu.h create mode 100644 CRIU_code/compel/arch/riscv/src/lib/include/uapi/asm/fpu.h create mode 100644 CRIU_code/compel/arch/riscv/src/lib/include/uapi/asm/infect-types.h create mode 100644 CRIU_code/compel/arch/riscv/src/lib/include/uapi/asm/sigframe.h create mode 100644 CRIU_code/compel/arch/riscv/src/lib/include/uapi/asm/siginfo.h create mode 100644 CRIU_code/compel/arch/riscv/src/lib/infect.c create mode 100644 CRIU_code/compel/arch/s390/plugins/include/asm/prologue.h create mode 100644 CRIU_code/compel/arch/s390/plugins/include/asm/syscall-types.h create mode 100644 CRIU_code/compel/arch/s390/plugins/std/parasite-head.S create mode 100644 CRIU_code/compel/arch/s390/plugins/std/syscalls/Makefile.syscalls create mode 100644 CRIU_code/compel/arch/s390/plugins/std/syscalls/syscall-common-s390.S create mode 100644 CRIU_code/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl create mode 100644 CRIU_code/compel/arch/s390/plugins/std/syscalls/syscalls-s390.c create mode 100644 CRIU_code/compel/arch/s390/scripts/compel-pack.lds.S create mode 100644 CRIU_code/compel/arch/s390/src/lib/cpu.c create mode 100644 CRIU_code/compel/arch/s390/src/lib/handle-elf-host.c create mode 100644 CRIU_code/compel/arch/s390/src/lib/handle-elf.c create mode 100644 CRIU_code/compel/arch/s390/src/lib/include/handle-elf.h create mode 
100644 CRIU_code/compel/arch/s390/src/lib/include/syscall.h create mode 100644 CRIU_code/compel/arch/s390/src/lib/include/uapi/asm/breakpoints.h create mode 100644 CRIU_code/compel/arch/s390/src/lib/include/uapi/asm/cpu.h create mode 100644 CRIU_code/compel/arch/s390/src/lib/include/uapi/asm/fpu.h create mode 100644 CRIU_code/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h create mode 100644 CRIU_code/compel/arch/s390/src/lib/include/uapi/asm/sigframe.h create mode 100644 CRIU_code/compel/arch/s390/src/lib/infect.c create mode 100644 CRIU_code/compel/arch/x86/plugins/include/asm/prologue.h create mode 100644 CRIU_code/compel/arch/x86/plugins/include/asm/syscall-types.h create mode 100644 CRIU_code/compel/arch/x86/plugins/include/features.h create mode 100644 CRIU_code/compel/arch/x86/plugins/std/memcpy.S create mode 100644 CRIU_code/compel/arch/x86/plugins/std/parasite-head.S create mode 100644 CRIU_code/compel/arch/x86/plugins/std/prologue.S create mode 100644 CRIU_code/compel/arch/x86/plugins/std/syscalls/Makefile.syscalls create mode 100644 CRIU_code/compel/arch/x86/plugins/std/syscalls/syscall-common-x86-32.S create mode 100644 CRIU_code/compel/arch/x86/plugins/std/syscalls/syscall-common-x86-64.S create mode 100644 CRIU_code/compel/arch/x86/plugins/std/syscalls/syscall32.c create mode 100644 CRIU_code/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl create mode 100644 CRIU_code/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl create mode 100644 CRIU_code/compel/arch/x86/scripts/compel-pack-compat.lds.S create mode 100644 CRIU_code/compel/arch/x86/scripts/compel-pack.lds.S create mode 100644 CRIU_code/compel/arch/x86/src/lib/cpu.c create mode 100644 CRIU_code/compel/arch/x86/src/lib/handle-elf-host.c create mode 100644 CRIU_code/compel/arch/x86/src/lib/handle-elf.c create mode 100644 CRIU_code/compel/arch/x86/src/lib/include/cpu.h create mode 100644 CRIU_code/compel/arch/x86/src/lib/include/handle-elf.h create mode 100644 
CRIU_code/compel/arch/x86/src/lib/include/syscall.h create mode 100644 CRIU_code/compel/arch/x86/src/lib/include/uapi/asm/.gitignore create mode 100644 CRIU_code/compel/arch/x86/src/lib/include/uapi/asm/breakpoints.h create mode 100644 CRIU_code/compel/arch/x86/src/lib/include/uapi/asm/cpu.h create mode 100644 CRIU_code/compel/arch/x86/src/lib/include/uapi/asm/fpu.h create mode 100644 CRIU_code/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h create mode 100644 CRIU_code/compel/arch/x86/src/lib/include/uapi/asm/processor-flags.h create mode 100644 CRIU_code/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h create mode 100644 CRIU_code/compel/arch/x86/src/lib/infect.c create mode 100644 CRIU_code/compel/compel-host create mode 100644 CRIU_code/compel/include/compel-cpu.h create mode 100644 CRIU_code/compel/include/elf32-types.h create mode 100644 CRIU_code/compel/include/elf64-types.h create mode 100644 CRIU_code/compel/include/errno.h create mode 100644 CRIU_code/compel/include/infect-priv.h create mode 100644 CRIU_code/compel/include/log.h create mode 100644 CRIU_code/compel/include/piegen.h create mode 100644 CRIU_code/compel/include/ptrace.h create mode 100644 CRIU_code/compel/include/rpc-pie-priv.h create mode 100644 CRIU_code/compel/include/shmem.h create mode 100644 CRIU_code/compel/include/uapi/asm create mode 100644 CRIU_code/compel/include/uapi/common create mode 100644 CRIU_code/compel/include/uapi/compel create mode 100644 CRIU_code/compel/include/uapi/compel.h create mode 100644 CRIU_code/compel/include/uapi/cpu.h create mode 100644 CRIU_code/compel/include/uapi/handle-elf.h create mode 100644 CRIU_code/compel/include/uapi/infect-rpc.h create mode 100644 CRIU_code/compel/include/uapi/infect-util.h create mode 100644 CRIU_code/compel/include/uapi/infect.h create mode 100644 CRIU_code/compel/include/uapi/ksigset.h create mode 100644 CRIU_code/compel/include/uapi/log.h create mode 100644 CRIU_code/compel/include/uapi/loglevels.h create mode 100644 
CRIU_code/compel/include/uapi/plugins create mode 100644 CRIU_code/compel/include/uapi/plugins.h create mode 100644 CRIU_code/compel/include/uapi/ptrace.h create mode 100644 CRIU_code/compel/include/uapi/sigframe-common.h create mode 100644 CRIU_code/compel/include/uapi/task-state.h create mode 100644 CRIU_code/compel/plugins/Makefile create mode 100644 CRIU_code/compel/plugins/fds/fds.c create mode 100644 CRIU_code/compel/plugins/include/std-priv.h create mode 100644 CRIU_code/compel/plugins/include/uapi/plugin-fds.h create mode 100644 CRIU_code/compel/plugins/include/uapi/shmem.h create mode 100644 CRIU_code/compel/plugins/include/uapi/std.h create mode 100644 CRIU_code/compel/plugins/include/uapi/std/asm/.gitignore create mode 100644 CRIU_code/compel/plugins/include/uapi/std/fds.h create mode 100644 CRIU_code/compel/plugins/include/uapi/std/infect.h create mode 100644 CRIU_code/compel/plugins/include/uapi/std/log.h create mode 100644 CRIU_code/compel/plugins/include/uapi/std/string.h create mode 100644 CRIU_code/compel/plugins/include/uapi/std/syscall-types.h create mode 100644 CRIU_code/compel/plugins/shmem/shmem.c create mode 100644 CRIU_code/compel/plugins/std/fds.c create mode 100644 CRIU_code/compel/plugins/std/infect.c create mode 100644 CRIU_code/compel/plugins/std/log.c create mode 100644 CRIU_code/compel/plugins/std/std.c create mode 100644 CRIU_code/compel/plugins/std/string.c create mode 100644 CRIU_code/compel/src/lib/handle-elf-host.c create mode 100644 CRIU_code/compel/src/lib/handle-elf.c create mode 100644 CRIU_code/compel/src/lib/infect-rpc.c create mode 100644 CRIU_code/compel/src/lib/infect-util.c create mode 100644 CRIU_code/compel/src/lib/infect.c create mode 100644 CRIU_code/compel/src/lib/log-host.c create mode 100644 CRIU_code/compel/src/lib/log.c create mode 100644 CRIU_code/compel/src/lib/ptrace.c create mode 100644 CRIU_code/compel/src/main-host.c create mode 100644 CRIU_code/compel/src/main.c create mode 100644 
CRIU_code/compel/test/fdspy/.gitignore create mode 100644 CRIU_code/compel/test/fdspy/Makefile create mode 100644 CRIU_code/compel/test/fdspy/parasite.c create mode 100644 CRIU_code/compel/test/fdspy/spy.c create mode 100644 CRIU_code/compel/test/fdspy/victim.c create mode 100644 CRIU_code/compel/test/infect/.gitignore create mode 100644 CRIU_code/compel/test/infect/Makefile create mode 100644 CRIU_code/compel/test/infect/parasite.c create mode 100644 CRIU_code/compel/test/infect/spy.c create mode 100644 CRIU_code/compel/test/infect/victim.c create mode 100644 CRIU_code/compel/test/rsys/.gitignore create mode 100644 CRIU_code/compel/test/rsys/Makefile create mode 100644 CRIU_code/compel/test/rsys/spy.c create mode 100644 CRIU_code/compel/test/rsys/victim.c create mode 100644 CRIU_code/contrib/debian/dev-packages.lst create mode 100644 CRIU_code/contrib/docker_cr.sh create mode 100644 CRIU_code/coredump/criu-coredump create mode 100644 CRIU_code/coredump/criu_coredump/.gitignore create mode 100644 CRIU_code/coredump/criu_coredump/__init__.py create mode 100644 CRIU_code/coredump/criu_coredump/coredump.py create mode 100644 CRIU_code/coredump/criu_coredump/elf.py create mode 100644 CRIU_code/coredump/pycriu create mode 100644 CRIU_code/crit/Makefile create mode 100644 CRIU_code/crit/crit-python2 create mode 100644 CRIU_code/crit/crit-python3 create mode 100644 CRIU_code/crit/pycriu create mode 100644 CRIU_code/criu/Makefile create mode 100644 CRIU_code/criu/Makefile.crtools create mode 100644 CRIU_code/criu/Makefile.packages create mode 100644 CRIU_code/criu/action-scripts.c create mode 100644 CRIU_code/criu/aio.c create mode 100644 CRIU_code/criu/arch/aarch64/Makefile create mode 100644 CRIU_code/criu/arch/aarch64/bitops.S create mode 100644 CRIU_code/criu/arch/aarch64/cpu.c create mode 100644 CRIU_code/criu/arch/aarch64/crtools.c create mode 100644 CRIU_code/criu/arch/aarch64/include/asm/dump.h create mode 100644 CRIU_code/criu/arch/aarch64/include/asm/int.h create 
mode 100644 CRIU_code/criu/arch/aarch64/include/asm/kerndat.h create mode 100644 CRIU_code/criu/arch/aarch64/include/asm/parasite-syscall.h create mode 100644 CRIU_code/criu/arch/aarch64/include/asm/parasite.h create mode 100644 CRIU_code/criu/arch/aarch64/include/asm/restore.h create mode 100644 CRIU_code/criu/arch/aarch64/include/asm/restorer.h create mode 100644 CRIU_code/criu/arch/aarch64/include/asm/types.h create mode 100644 CRIU_code/criu/arch/aarch64/include/asm/vdso.h create mode 100644 CRIU_code/criu/arch/aarch64/intraprocedure.S create mode 100644 CRIU_code/criu/arch/aarch64/restorer.c create mode 100644 CRIU_code/criu/arch/aarch64/sigframe.c create mode 100644 CRIU_code/criu/arch/aarch64/vdso-pie.c create mode 100644 CRIU_code/criu/arch/arm/Makefile create mode 100644 CRIU_code/criu/arch/arm/aeabi-helpers.S create mode 100644 CRIU_code/criu/arch/arm/bitops.S create mode 100644 CRIU_code/criu/arch/arm/cpu.c create mode 100644 CRIU_code/criu/arch/arm/crtools.c create mode 100644 CRIU_code/criu/arch/arm/include/asm/dump.h create mode 100644 CRIU_code/criu/arch/arm/include/asm/int.h create mode 100644 CRIU_code/criu/arch/arm/include/asm/kerndat.h create mode 100644 CRIU_code/criu/arch/arm/include/asm/parasite-syscall.h create mode 100644 CRIU_code/criu/arch/arm/include/asm/parasite.h create mode 100644 CRIU_code/criu/arch/arm/include/asm/restore.h create mode 100644 CRIU_code/criu/arch/arm/include/asm/restorer.h create mode 100644 CRIU_code/criu/arch/arm/include/asm/types.h create mode 100644 CRIU_code/criu/arch/arm/include/asm/vdso.h create mode 100644 CRIU_code/criu/arch/arm/pie-cacheflush.c create mode 100644 CRIU_code/criu/arch/arm/restorer.c create mode 100644 CRIU_code/criu/arch/arm/sigframe.c create mode 100644 CRIU_code/criu/arch/arm/vdso-pie.c create mode 100644 CRIU_code/criu/arch/ppc64/Makefile create mode 100644 CRIU_code/criu/arch/ppc64/cpu.c create mode 100644 CRIU_code/criu/arch/ppc64/crtools.c create mode 100644 
CRIU_code/criu/arch/ppc64/include/asm/dump.h create mode 100644 CRIU_code/criu/arch/ppc64/include/asm/int.h create mode 100644 CRIU_code/criu/arch/ppc64/include/asm/kerndat.h create mode 100644 CRIU_code/criu/arch/ppc64/include/asm/parasite-syscall.h create mode 100644 CRIU_code/criu/arch/ppc64/include/asm/parasite.h create mode 100644 CRIU_code/criu/arch/ppc64/include/asm/restore.h create mode 100644 CRIU_code/criu/arch/ppc64/include/asm/restorer.h create mode 100644 CRIU_code/criu/arch/ppc64/include/asm/types.h create mode 100644 CRIU_code/criu/arch/ppc64/include/asm/vdso.h create mode 100644 CRIU_code/criu/arch/ppc64/misc.S create mode 100644 CRIU_code/criu/arch/ppc64/restorer.c create mode 100644 CRIU_code/criu/arch/ppc64/sigframe.c create mode 100644 CRIU_code/criu/arch/ppc64/vdso-pie.c create mode 100644 CRIU_code/criu/arch/ppc64/vdso-trampoline.S create mode 100644 CRIU_code/criu/arch/riscv/Makefile create mode 100644 CRIU_code/criu/arch/riscv/cpu.c create mode 100644 CRIU_code/criu/arch/riscv/crtools.c create mode 100644 CRIU_code/criu/arch/riscv/include/asm/dump.h create mode 100644 CRIU_code/criu/arch/riscv/include/asm/int.h create mode 100644 CRIU_code/criu/arch/riscv/include/asm/kerndat.h create mode 100644 CRIU_code/criu/arch/riscv/include/asm/parasite-syscall.h create mode 100644 CRIU_code/criu/arch/riscv/include/asm/parasite.h create mode 100644 CRIU_code/criu/arch/riscv/include/asm/restore.h create mode 100644 CRIU_code/criu/arch/riscv/include/asm/restorer.h create mode 100644 CRIU_code/criu/arch/riscv/include/asm/syscall32.h create mode 100644 CRIU_code/criu/arch/riscv/include/asm/types.h create mode 100644 CRIU_code/criu/arch/riscv/include/asm/vdso.h create mode 100644 CRIU_code/criu/arch/riscv/restorer.c create mode 100644 CRIU_code/criu/arch/riscv/sigaction_compat.c create mode 100644 CRIU_code/criu/arch/riscv/sigframe.c create mode 100644 CRIU_code/criu/arch/riscv/vdso-pie.c create mode 100644 CRIU_code/criu/arch/s390/Makefile create mode 
100644 CRIU_code/criu/arch/s390/cpu.c create mode 100644 CRIU_code/criu/arch/s390/crtools.c create mode 100644 CRIU_code/criu/arch/s390/include/asm/dump.h create mode 100644 CRIU_code/criu/arch/s390/include/asm/int.h create mode 100644 CRIU_code/criu/arch/s390/include/asm/kerndat.h create mode 100644 CRIU_code/criu/arch/s390/include/asm/parasite-syscall.h create mode 100644 CRIU_code/criu/arch/s390/include/asm/parasite.h create mode 100644 CRIU_code/criu/arch/s390/include/asm/restore.h create mode 100644 CRIU_code/criu/arch/s390/include/asm/restorer.h create mode 100644 CRIU_code/criu/arch/s390/include/asm/types.h create mode 100644 CRIU_code/criu/arch/s390/include/asm/vdso.h create mode 100644 CRIU_code/criu/arch/s390/restorer.c create mode 100644 CRIU_code/criu/arch/s390/sigframe.c create mode 100644 CRIU_code/criu/arch/s390/vdso-pie.c create mode 100644 CRIU_code/criu/arch/x86/Makefile create mode 100644 CRIU_code/criu/arch/x86/cpu.c create mode 100644 CRIU_code/criu/arch/x86/crtools.c create mode 100644 CRIU_code/criu/arch/x86/include/asm/compat.h create mode 100644 CRIU_code/criu/arch/x86/include/asm/dump.h create mode 100644 CRIU_code/criu/arch/x86/include/asm/int.h create mode 100644 CRIU_code/criu/arch/x86/include/asm/kerndat.h create mode 100644 CRIU_code/criu/arch/x86/include/asm/parasite-syscall.h create mode 100644 CRIU_code/criu/arch/x86/include/asm/parasite.h create mode 100644 CRIU_code/criu/arch/x86/include/asm/restore.h create mode 100644 CRIU_code/criu/arch/x86/include/asm/restorer.h create mode 100644 CRIU_code/criu/arch/x86/include/asm/syscall32.h create mode 100644 CRIU_code/criu/arch/x86/include/asm/types.h create mode 100644 CRIU_code/criu/arch/x86/include/asm/vdso.h create mode 100644 CRIU_code/criu/arch/x86/kerndat.c create mode 100644 CRIU_code/criu/arch/x86/restorer.c create mode 100644 CRIU_code/criu/arch/x86/restorer_unmap.S create mode 100644 CRIU_code/criu/arch/x86/sigaction_compat.c create mode 100644 
CRIU_code/criu/arch/x86/sigaction_compat_pie.c create mode 100644 CRIU_code/criu/arch/x86/sigframe.c create mode 100644 CRIU_code/criu/arch/x86/sys-exec-tbl.c create mode 100644 CRIU_code/criu/arch/x86/vdso-pie.c create mode 100644 CRIU_code/criu/autofs.c create mode 100644 CRIU_code/criu/bfd.c create mode 100644 CRIU_code/criu/bitmap.c create mode 100644 CRIU_code/criu/cgroup-props.c create mode 100644 CRIU_code/criu/cgroup.c create mode 100644 CRIU_code/criu/clone-noasan.c create mode 100644 CRIU_code/criu/config.c create mode 100644 CRIU_code/criu/cr-check.c create mode 100644 CRIU_code/criu/cr-dedup.c create mode 100644 CRIU_code/criu/cr-dump.c create mode 100644 CRIU_code/criu/cr-errno.c create mode 100644 CRIU_code/criu/cr-restore.c create mode 100644 CRIU_code/criu/cr-service.c create mode 100644 CRIU_code/criu/crtools.c create mode 100644 CRIU_code/criu/eventfd.c create mode 100644 CRIU_code/criu/eventpoll.c create mode 100644 CRIU_code/criu/external.c create mode 100644 CRIU_code/criu/fault-injection.c create mode 100644 CRIU_code/criu/fdstore.c create mode 100644 CRIU_code/criu/fifo.c create mode 100644 CRIU_code/criu/file-ids.c create mode 100644 CRIU_code/criu/file-lock.c create mode 100644 CRIU_code/criu/files-ext.c create mode 100644 CRIU_code/criu/files-reg.c create mode 100644 CRIU_code/criu/files.c create mode 100644 CRIU_code/criu/filesystems.c create mode 100644 CRIU_code/criu/fsnotify.c create mode 100644 CRIU_code/criu/image-desc.c create mode 100644 CRIU_code/criu/image.c create mode 100644 CRIU_code/criu/img-cache.c create mode 100644 CRIU_code/criu/img-proxy.c create mode 100644 CRIU_code/criu/img-remote.c create mode 100644 CRIU_code/criu/include/action-scripts.h create mode 100644 CRIU_code/criu/include/aio.h create mode 100644 CRIU_code/criu/include/asm-generic/int.h create mode 100644 CRIU_code/criu/include/asm-generic/vdso.h create mode 100644 CRIU_code/criu/include/atomic.h create mode 100644 CRIU_code/criu/include/autofs.h create mode 
100644 CRIU_code/criu/include/bfd.h create mode 100644 CRIU_code/criu/include/bitmap.h create mode 100644 CRIU_code/criu/include/bitops.h create mode 100644 CRIU_code/criu/include/bitsperlong.h create mode 100644 CRIU_code/criu/include/cgroup-props.h create mode 100644 CRIU_code/criu/include/cgroup.h create mode 100644 CRIU_code/criu/include/clone-noasan.h create mode 100644 CRIU_code/criu/include/cpu.h create mode 100644 CRIU_code/criu/include/cr-errno.h create mode 100644 CRIU_code/criu/include/cr-service-const.h create mode 100644 CRIU_code/criu/include/cr-service.h create mode 100644 CRIU_code/criu/include/cr_options.h create mode 100644 CRIU_code/criu/include/criu-log.h create mode 100644 CRIU_code/criu/include/criu-plugin.h create mode 100644 CRIU_code/criu/include/crtools.h create mode 100644 CRIU_code/criu/include/dump.h create mode 100644 CRIU_code/criu/include/eventfd.h create mode 100644 CRIU_code/criu/include/eventpoll.h create mode 100644 CRIU_code/criu/include/external.h create mode 100644 CRIU_code/criu/include/fault-injection.h create mode 100644 CRIU_code/criu/include/fcntl.h create mode 100644 CRIU_code/criu/include/fdinfo.h create mode 100644 CRIU_code/criu/include/fdstore.h create mode 100644 CRIU_code/criu/include/fifo.h create mode 100644 CRIU_code/criu/include/file-ids.h create mode 100644 CRIU_code/criu/include/file-lock.h create mode 100644 CRIU_code/criu/include/files-reg.h create mode 100644 CRIU_code/criu/include/files.h create mode 100644 CRIU_code/criu/include/filesystems.h create mode 100644 CRIU_code/criu/include/fs-magic.h create mode 100644 CRIU_code/criu/include/fsnotify.h create mode 100644 CRIU_code/criu/include/image-desc.h create mode 100644 CRIU_code/criu/include/image.h create mode 100644 CRIU_code/criu/include/img-remote.h create mode 100644 CRIU_code/criu/include/imgset.h create mode 100644 CRIU_code/criu/include/inet_diag.h create mode 100644 CRIU_code/criu/include/infect-pie.h create mode 100644 
CRIU_code/criu/include/int.h create mode 100644 CRIU_code/criu/include/ipc_ns.h create mode 100644 CRIU_code/criu/include/irmap.h create mode 100644 CRIU_code/criu/include/kcmp-ids.h create mode 100644 CRIU_code/criu/include/kcmp.h create mode 100644 CRIU_code/criu/include/kerndat.h create mode 100644 CRIU_code/criu/include/libnetlink.h create mode 100644 CRIU_code/criu/include/linux/userfaultfd.h create mode 100644 CRIU_code/criu/include/log.h create mode 100644 CRIU_code/criu/include/lsm.h create mode 100644 CRIU_code/criu/include/magic.h create mode 100644 CRIU_code/criu/include/mem.h create mode 100644 CRIU_code/criu/include/mman.h create mode 100644 CRIU_code/criu/include/mount.h create mode 100644 CRIU_code/criu/include/namespaces.h create mode 100644 CRIU_code/criu/include/net.h create mode 100644 CRIU_code/criu/include/netfilter.h create mode 100644 CRIU_code/criu/include/netlink_diag.h create mode 100644 CRIU_code/criu/include/packet_diag.h create mode 100644 CRIU_code/criu/include/page-pipe.h create mode 100644 CRIU_code/criu/include/page-xfer.h create mode 100644 CRIU_code/criu/include/page.h create mode 100644 CRIU_code/criu/include/pagemap-cache.h create mode 100644 CRIU_code/criu/include/pagemap.h create mode 100644 CRIU_code/criu/include/parasite-syscall.h create mode 100644 CRIU_code/criu/include/parasite-vdso.h create mode 100644 CRIU_code/criu/include/parasite.h create mode 100644 CRIU_code/criu/include/path.h create mode 100644 CRIU_code/criu/include/pid.h create mode 100644 CRIU_code/criu/include/pipes.h create mode 100644 CRIU_code/criu/include/plugin.h create mode 100644 CRIU_code/criu/include/posix-timer.h create mode 100644 CRIU_code/criu/include/prctl.h create mode 100644 CRIU_code/criu/include/proc_parse.h create mode 100644 CRIU_code/criu/include/protobuf-desc.h create mode 100644 CRIU_code/criu/include/protobuf.h create mode 100644 CRIU_code/criu/include/pstree.h create mode 100644 CRIU_code/criu/include/ptrace-compat.h create mode 
100644 CRIU_code/criu/include/rbtree.h create mode 100644 CRIU_code/criu/include/restore.h create mode 100644 CRIU_code/criu/include/restorer.h create mode 100644 CRIU_code/criu/include/rst-malloc.h create mode 100644 CRIU_code/criu/include/rst_info.h create mode 100644 CRIU_code/criu/include/seccomp.h create mode 100644 CRIU_code/criu/include/seize.h create mode 100644 CRIU_code/criu/include/servicefd.h create mode 100644 CRIU_code/criu/include/setproctitle.h create mode 100644 CRIU_code/criu/include/shmem.h create mode 100644 CRIU_code/criu/include/sigframe.h create mode 100644 CRIU_code/criu/include/signalfd.h create mode 100644 CRIU_code/criu/include/sk-inet.h create mode 100644 CRIU_code/criu/include/sk-packet.h create mode 100644 CRIU_code/criu/include/sk-queue.h create mode 100644 CRIU_code/criu/include/sockets.h create mode 100644 CRIU_code/criu/include/stats.h create mode 100644 CRIU_code/criu/include/string.h create mode 100644 CRIU_code/criu/include/sysctl.h create mode 100644 CRIU_code/criu/include/sysfs_parse.h create mode 100644 CRIU_code/criu/include/timerfd.h create mode 100644 CRIU_code/criu/include/tls.h create mode 100644 CRIU_code/criu/include/tty.h create mode 100644 CRIU_code/criu/include/tun.h create mode 100644 CRIU_code/criu/include/types.h create mode 100644 CRIU_code/criu/include/uffd.h create mode 100644 CRIU_code/criu/include/unix_diag.h create mode 100644 CRIU_code/criu/include/util-pie.h create mode 100644 CRIU_code/criu/include/util-vdso.h create mode 100644 CRIU_code/criu/include/util.h create mode 100644 CRIU_code/criu/include/uts_ns.h create mode 100644 CRIU_code/criu/include/vdso.h create mode 100644 CRIU_code/criu/include/vma.h create mode 100644 CRIU_code/criu/include/xmalloc.h create mode 100644 CRIU_code/criu/ipc_ns.c create mode 100644 CRIU_code/criu/irmap.c create mode 100644 CRIU_code/criu/kcmp-ids.c create mode 100644 CRIU_code/criu/kerndat.c create mode 100644 CRIU_code/criu/libnetlink.c create mode 100644 
CRIU_code/criu/log.c create mode 100644 CRIU_code/criu/lsm.c create mode 100644 CRIU_code/criu/mem.c create mode 100644 CRIU_code/criu/mount.c create mode 100644 CRIU_code/criu/namespaces.c create mode 100644 CRIU_code/criu/net.c create mode 100644 CRIU_code/criu/netfilter.c create mode 100644 CRIU_code/criu/page-pipe.c create mode 100644 CRIU_code/criu/page-xfer.c create mode 100644 CRIU_code/criu/pagemap-cache.c create mode 100644 CRIU_code/criu/pagemap.c create mode 100644 CRIU_code/criu/parasite-syscall.c create mode 100644 CRIU_code/criu/path.c create mode 100644 CRIU_code/criu/pie-util-vdso-elf32.c create mode 100644 CRIU_code/criu/pie-util-vdso.c create mode 100644 CRIU_code/criu/pie-util.c create mode 100644 CRIU_code/criu/pie/Makefile create mode 100644 CRIU_code/criu/pie/Makefile.library create mode 100644 CRIU_code/criu/pie/parasite-vdso.c create mode 100644 CRIU_code/criu/pie/parasite.c create mode 100644 CRIU_code/criu/pie/pie-relocs.h create mode 100644 CRIU_code/criu/pie/restorer.c create mode 100644 CRIU_code/criu/pie/util-vdso-elf32.c create mode 100644 CRIU_code/criu/pie/util-vdso.c create mode 100644 CRIU_code/criu/pie/util.c create mode 100644 CRIU_code/criu/pipes.c create mode 100644 CRIU_code/criu/plugin.c create mode 100644 CRIU_code/criu/proc_parse.c create mode 100644 CRIU_code/criu/protobuf-desc.c create mode 100644 CRIU_code/criu/protobuf.c create mode 100644 CRIU_code/criu/pstree.c create mode 100644 CRIU_code/criu/rbtree.c create mode 100644 CRIU_code/criu/rst-malloc.c create mode 100644 CRIU_code/criu/seccomp.c create mode 100644 CRIU_code/criu/seize.c create mode 100644 CRIU_code/criu/servicefd.c create mode 100644 CRIU_code/criu/shmem.c create mode 100644 CRIU_code/criu/sigframe.c create mode 100644 CRIU_code/criu/signalfd.c create mode 100644 CRIU_code/criu/sk-inet.c create mode 100644 CRIU_code/criu/sk-netlink.c create mode 100644 CRIU_code/criu/sk-packet.c create mode 100644 CRIU_code/criu/sk-queue.c create mode 100644 
CRIU_code/criu/sk-tcp.c create mode 100644 CRIU_code/criu/sk-unix.c create mode 100644 CRIU_code/criu/sockets.c create mode 100644 CRIU_code/criu/stats.c create mode 100644 CRIU_code/criu/string.c create mode 100644 CRIU_code/criu/sysctl.c create mode 100644 CRIU_code/criu/sysfs_parse.c create mode 100644 CRIU_code/criu/timerfd.c create mode 100644 CRIU_code/criu/tls.c create mode 100644 CRIU_code/criu/tty.c create mode 100644 CRIU_code/criu/tun.c create mode 100644 CRIU_code/criu/uffd.c create mode 100644 CRIU_code/criu/util.c create mode 100644 CRIU_code/criu/uts_ns.c create mode 100644 CRIU_code/criu/vdso-compat.c create mode 100644 CRIU_code/criu/vdso.c create mode 100644 CRIU_code/images/Makefile create mode 100644 CRIU_code/images/autofs.proto create mode 100644 CRIU_code/images/binfmt-misc.proto create mode 100644 CRIU_code/images/cgroup.proto create mode 100644 CRIU_code/images/core-aarch64.proto create mode 100644 CRIU_code/images/core-arm.proto create mode 100644 CRIU_code/images/core-ppc64.proto create mode 100644 CRIU_code/images/core-riscv.proto create mode 100644 CRIU_code/images/core-s390.proto create mode 100644 CRIU_code/images/core-x86.proto create mode 100644 CRIU_code/images/core.proto create mode 100644 CRIU_code/images/cpuinfo.proto create mode 100644 CRIU_code/images/creds.proto create mode 100644 CRIU_code/images/eventfd.proto create mode 100644 CRIU_code/images/eventpoll.proto create mode 100644 CRIU_code/images/ext-file.proto create mode 100644 CRIU_code/images/fdinfo.proto create mode 100644 CRIU_code/images/fh.proto create mode 100644 CRIU_code/images/fifo.proto create mode 100644 CRIU_code/images/file-lock.proto create mode 100644 CRIU_code/images/fown.proto create mode 100644 CRIU_code/images/fs.proto create mode 100644 CRIU_code/images/fsnotify.proto create mode 100644 CRIU_code/images/ghost-file.proto create mode 100644 CRIU_code/images/google/protobuf/descriptor.proto create mode 100644 CRIU_code/images/inventory.proto create mode 
100644 CRIU_code/images/ipc-desc.proto create mode 100644 CRIU_code/images/ipc-msg.proto create mode 100644 CRIU_code/images/ipc-sem.proto create mode 100644 CRIU_code/images/ipc-shm.proto create mode 100644 CRIU_code/images/ipc-var.proto create mode 100644 CRIU_code/images/macvlan.proto create mode 100644 CRIU_code/images/mm.proto create mode 100644 CRIU_code/images/mnt.proto create mode 100644 CRIU_code/images/netdev.proto create mode 100644 CRIU_code/images/ns.proto create mode 100644 CRIU_code/images/opts.proto create mode 100644 CRIU_code/images/packet-sock.proto create mode 100644 CRIU_code/images/pagemap.proto create mode 100644 CRIU_code/images/pipe-data.proto create mode 100644 CRIU_code/images/pipe.proto create mode 100644 CRIU_code/images/pstree.proto create mode 100644 CRIU_code/images/regfile.proto create mode 100644 CRIU_code/images/remap-file-path.proto create mode 100644 CRIU_code/images/remote-image.proto create mode 100644 CRIU_code/images/rlimit.proto create mode 100644 CRIU_code/images/rpc.proto create mode 100644 CRIU_code/images/sa.proto create mode 100644 CRIU_code/images/seccomp.proto create mode 100644 CRIU_code/images/siginfo.proto create mode 100644 CRIU_code/images/signalfd.proto create mode 100644 CRIU_code/images/sit.proto create mode 100644 CRIU_code/images/sk-inet.proto create mode 100644 CRIU_code/images/sk-netlink.proto create mode 100644 CRIU_code/images/sk-opts.proto create mode 100644 CRIU_code/images/sk-packet.proto create mode 100644 CRIU_code/images/sk-unix.proto create mode 100644 CRIU_code/images/stats.proto create mode 100644 CRIU_code/images/sysctl.proto create mode 100644 CRIU_code/images/tcp-stream.proto create mode 100644 CRIU_code/images/time.proto create mode 100644 CRIU_code/images/timer.proto create mode 100644 CRIU_code/images/timerfd.proto create mode 100644 CRIU_code/images/tty.proto create mode 100644 CRIU_code/images/tun.proto create mode 100644 CRIU_code/images/userns.proto create mode 100644 
CRIU_code/images/utsns.proto create mode 100644 CRIU_code/images/vma.proto create mode 100644 CRIU_code/include/common/arch/aarch64/asm/atomic.h create mode 100644 CRIU_code/include/common/arch/aarch64/asm/bitops.h create mode 100644 CRIU_code/include/common/arch/aarch64/asm/bitsperlong.h create mode 100644 CRIU_code/include/common/arch/aarch64/asm/linkage.h create mode 100644 CRIU_code/include/common/arch/aarch64/asm/page.h create mode 100644 CRIU_code/include/common/arch/arm/asm/atomic.h create mode 100644 CRIU_code/include/common/arch/arm/asm/bitops.h create mode 100644 CRIU_code/include/common/arch/arm/asm/bitsperlong.h create mode 100644 CRIU_code/include/common/arch/arm/asm/linkage.h create mode 100644 CRIU_code/include/common/arch/arm/asm/page.h create mode 100644 CRIU_code/include/common/arch/arm/asm/processor.h create mode 100644 CRIU_code/include/common/arch/ppc64/asm/atomic.h create mode 100644 CRIU_code/include/common/arch/ppc64/asm/bitops.h create mode 100644 CRIU_code/include/common/arch/ppc64/asm/bitsperlong.h create mode 100644 CRIU_code/include/common/arch/ppc64/asm/cmpxchg.h create mode 100644 CRIU_code/include/common/arch/ppc64/asm/linkage.h create mode 100644 CRIU_code/include/common/arch/ppc64/asm/page.h create mode 100644 CRIU_code/include/common/arch/riscv/.keep create mode 100644 CRIU_code/include/common/arch/riscv/asm/.keep create mode 100644 CRIU_code/include/common/arch/riscv/asm/atomic.h create mode 100644 CRIU_code/include/common/arch/riscv/asm/bitops.h create mode 100644 CRIU_code/include/common/arch/riscv/asm/bitsperlong.h create mode 100644 CRIU_code/include/common/arch/riscv/asm/linkage.h create mode 100644 CRIU_code/include/common/arch/riscv/asm/page.h create mode 100644 CRIU_code/include/common/arch/s390/asm/atomic.h create mode 100644 CRIU_code/include/common/arch/s390/asm/atomic_ops.h create mode 100644 CRIU_code/include/common/arch/s390/asm/bitops.h create mode 100644 CRIU_code/include/common/arch/s390/asm/bitsperlong.h create 
mode 100644 CRIU_code/include/common/arch/s390/asm/linkage.h create mode 100644 CRIU_code/include/common/arch/s390/asm/page.h create mode 100644 CRIU_code/include/common/arch/x86/asm/atomic.h create mode 100644 CRIU_code/include/common/arch/x86/asm/bitops.h create mode 100644 CRIU_code/include/common/arch/x86/asm/bitsperlong.h create mode 100644 CRIU_code/include/common/arch/x86/asm/cmpxchg.h create mode 100644 CRIU_code/include/common/arch/x86/asm/linkage.h create mode 100644 CRIU_code/include/common/arch/x86/asm/page.h create mode 100644 CRIU_code/include/common/asm-generic/bitops.h create mode 100644 CRIU_code/include/common/bitops.h create mode 100644 CRIU_code/include/common/bitsperlong.h create mode 100644 CRIU_code/include/common/bug.h create mode 100644 CRIU_code/include/common/compiler.h create mode 100644 CRIU_code/include/common/err.h create mode 100644 CRIU_code/include/common/list.h create mode 100644 CRIU_code/include/common/lock.h create mode 100644 CRIU_code/include/common/page.h create mode 100644 CRIU_code/include/common/scm-code.c create mode 100644 CRIU_code/include/common/scm.h create mode 100644 CRIU_code/include/common/xmalloc.h create mode 100644 CRIU_code/lib/Makefile create mode 100644 CRIU_code/lib/c/Makefile create mode 100644 CRIU_code/lib/c/criu.c create mode 100644 CRIU_code/lib/c/criu.h create mode 100644 CRIU_code/lib/c/criu.pc.in create mode 100644 CRIU_code/lib/py/.gitignore create mode 100644 CRIU_code/lib/py/Makefile create mode 100644 CRIU_code/lib/py/__init__.py create mode 100644 CRIU_code/lib/py/cli.py create mode 100644 CRIU_code/lib/py/criu.py create mode 100644 CRIU_code/lib/py/images/.gitignore create mode 100644 CRIU_code/lib/py/images/Makefile create mode 100644 CRIU_code/lib/py/images/__init__.py create mode 100644 CRIU_code/lib/py/images/images.py create mode 100644 CRIU_code/lib/py/images/pb2dict.py create mode 100644 CRIU_code/scripts/build/Dockerfile.aarch64.hdr create mode 100644 
CRIU_code/scripts/build/Dockerfile.aarch64.tmpl create mode 100644 CRIU_code/scripts/build/Dockerfile.alpine create mode 100644 CRIU_code/scripts/build/Dockerfile.armv7hf.hdr create mode 100644 CRIU_code/scripts/build/Dockerfile.armv7hf.tmpl create mode 100644 CRIU_code/scripts/build/Dockerfile.centos create mode 100644 CRIU_code/scripts/build/Dockerfile.fedora-asan.hdr create mode 100644 CRIU_code/scripts/build/Dockerfile.fedora-asan.tmpl create mode 100644 CRIU_code/scripts/build/Dockerfile.fedora-rawhide-aarch64.hdr create mode 100644 CRIU_code/scripts/build/Dockerfile.fedora-rawhide-aarch64.tmpl create mode 100644 CRIU_code/scripts/build/Dockerfile.fedora-rawhide.hdr create mode 100644 CRIU_code/scripts/build/Dockerfile.fedora-rawhide.tmpl create mode 100644 CRIU_code/scripts/build/Dockerfile.fedora.tmpl create mode 100644 CRIU_code/scripts/build/Dockerfile.ppc64le.hdr create mode 100644 CRIU_code/scripts/build/Dockerfile.ppc64le.tmpl create mode 100644 CRIU_code/scripts/build/Dockerfile.s390x.hdr create mode 100644 CRIU_code/scripts/build/Dockerfile.s390x.tmpl create mode 100644 CRIU_code/scripts/build/Dockerfile.tmpl create mode 100644 CRIU_code/scripts/build/Dockerfile.x86_64.hdr create mode 100644 CRIU_code/scripts/build/Dockerfile.x86_64.tmpl create mode 100644 CRIU_code/scripts/build/Makefile create mode 100644 CRIU_code/scripts/build/binfmt_misc create mode 100644 CRIU_code/scripts/build/extract-deb-pkg create mode 100644 CRIU_code/scripts/crit-setup.py create mode 100644 CRIU_code/scripts/criu-ns create mode 100644 CRIU_code/scripts/fake-restore.sh create mode 100644 CRIU_code/scripts/feature-tests.mak create mode 100644 CRIU_code/scripts/flake8.cfg create mode 100644 CRIU_code/scripts/install-debian-pkgs.sh create mode 100644 CRIU_code/scripts/magic-gen.py create mode 100644 CRIU_code/scripts/nmk/.gitignore create mode 100644 CRIU_code/scripts/nmk/Documentation/Makefile create mode 100644 CRIU_code/scripts/nmk/Documentation/nmk.txt create mode 100644 
CRIU_code/scripts/nmk/Makefile create mode 100644 CRIU_code/scripts/nmk/README.md create mode 100644 CRIU_code/scripts/nmk/scripts/build.mk create mode 100644 CRIU_code/scripts/nmk/scripts/include.mk create mode 100644 CRIU_code/scripts/nmk/scripts/macro.mk create mode 100644 CRIU_code/scripts/nmk/scripts/main.mk create mode 100644 CRIU_code/scripts/nmk/scripts/msg.mk create mode 100644 CRIU_code/scripts/nmk/scripts/tools.mk create mode 100644 CRIU_code/scripts/nmk/scripts/utils.mk create mode 100644 CRIU_code/scripts/protobuf-gen.sh create mode 100644 CRIU_code/scripts/systemd-autofs-restart.sh create mode 100644 CRIU_code/scripts/tmp-files.sh create mode 100644 CRIU_code/scripts/travis/Makefile create mode 100644 CRIU_code/scripts/travis/asan.sh create mode 100644 CRIU_code/scripts/travis/docker-test.sh create mode 100644 CRIU_code/scripts/travis/docker.env create mode 100644 CRIU_code/scripts/travis/travis-after_success create mode 100644 CRIU_code/scripts/travis/travis-tests create mode 100644 CRIU_code/soccr/Makefile create mode 100644 CRIU_code/soccr/soccr.c create mode 100644 CRIU_code/soccr/soccr.h create mode 100644 CRIU_code/soccr/test/Makefile create mode 100644 CRIU_code/soccr/test/local.sh create mode 100644 CRIU_code/soccr/test/run.py create mode 100644 CRIU_code/soccr/test/tcp-conn-v6.c create mode 100644 CRIU_code/soccr/test/tcp-conn.c create mode 100644 CRIU_code/soccr/test/tcp-constructor.c create mode 100644 CRIU_code/soccr/test/tcp-test.py create mode 100644 CRIU_code/test/.gitignore create mode 100644 CRIU_code/test/Makefile create mode 100644 CRIU_code/test/abrt.sh create mode 100644 CRIU_code/test/check_actions.py create mode 100644 CRIU_code/test/compel/Makefile create mode 100644 CRIU_code/test/compel/arch/aarch64/include/arch_test_handle_binary.h create mode 100644 CRIU_code/test/compel/arch/arm/include/arch_test_handle_binary.h create mode 100644 CRIU_code/test/compel/arch/ppc64/include/arch_test_handle_binary.h create mode 100644 
CRIU_code/test/compel/arch/x86/include/arch_test_handle_binary.h create mode 100644 CRIU_code/test/compel/handle_binary.c create mode 100644 CRIU_code/test/compel/handle_binary_32.c create mode 100644 CRIU_code/test/compel/main.c create mode 100644 CRIU_code/test/crit-recode.py create mode 100644 CRIU_code/test/empty-netns-prep.sh create mode 100644 CRIU_code/test/exhaustive/pipe.py create mode 100644 CRIU_code/test/exhaustive/unix.py create mode 100644 CRIU_code/test/groups.desc create mode 100644 CRIU_code/test/inhfd.desc create mode 100644 CRIU_code/test/inhfd/fifo.py create mode 100644 CRIU_code/test/inhfd/fifo.py.desc create mode 100644 CRIU_code/test/inhfd/pipe.py create mode 100644 CRIU_code/test/inhfd/pipe.py.desc create mode 100644 CRIU_code/test/inhfd/socket.py create mode 100644 CRIU_code/test/inhfd/socket.py.desc create mode 100644 CRIU_code/test/inhfd/tty.py create mode 100644 CRIU_code/test/inhfd/tty.py.desc create mode 100644 CRIU_code/test/jenkins/_run_ct create mode 100644 CRIU_code/test/jenkins/actions.sh create mode 100644 CRIU_code/test/jenkins/crit.sh create mode 100644 CRIU_code/test/jenkins/criu-btrfs.sh create mode 100644 CRIU_code/test/jenkins/criu-by-id.sh create mode 100644 CRIU_code/test/jenkins/criu-dedup.sh create mode 100644 CRIU_code/test/jenkins/criu-dump.sh create mode 100644 CRIU_code/test/jenkins/criu-fault.sh create mode 100644 CRIU_code/test/jenkins/criu-fcg.sh create mode 100644 CRIU_code/test/jenkins/criu-groups.sh create mode 100644 CRIU_code/test/jenkins/criu-inhfd.sh create mode 100644 CRIU_code/test/jenkins/criu-iter.sh create mode 100644 CRIU_code/test/jenkins/criu-join-ns.sh create mode 100644 CRIU_code/test/jenkins/criu-lazy-common.sh create mode 100644 CRIU_code/test/jenkins/criu-lazy-migration.pipeline create mode 100644 CRIU_code/test/jenkins/criu-lazy-migration.sh create mode 100644 CRIU_code/test/jenkins/criu-lazy-pages.sh create mode 100644 CRIU_code/test/jenkins/criu-lib.sh create mode 100644 
CRIU_code/test/jenkins/criu-other.sh create mode 100644 CRIU_code/test/jenkins/criu-overlay.sh create mode 100644 CRIU_code/test/jenkins/criu-pre-dump.sh create mode 100644 CRIU_code/test/jenkins/criu-remote-lazy-pages.sh create mode 100644 CRIU_code/test/jenkins/criu-sibling.sh create mode 100644 CRIU_code/test/jenkins/criu-snap.sh create mode 100644 CRIU_code/test/jenkins/criu-stop.sh create mode 100644 CRIU_code/test/jenkins/criu-user.sh create mode 100644 CRIU_code/test/jenkins/criu.sh create mode 100644 CRIU_code/test/jenkins/run_ct create mode 100644 CRIU_code/test/others/app-emu.sh create mode 100644 CRIU_code/test/others/app-emu/java/HelloWorld/HelloWorld.java create mode 100644 CRIU_code/test/others/app-emu/java/HelloWorld/run.sh create mode 100644 CRIU_code/test/others/app-emu/job/Makefile create mode 100644 CRIU_code/test/others/app-emu/job/job.c create mode 100644 CRIU_code/test/others/app-emu/job/job.exp create mode 100644 CRIU_code/test/others/app-emu/job/run.sh create mode 100644 CRIU_code/test/others/app-emu/lxc/network-script.sh create mode 100644 CRIU_code/test/others/app-emu/lxc/run.sh create mode 100644 CRIU_code/test/others/app-emu/make/Makefile create mode 100644 CRIU_code/test/others/app-emu/make/run.sh create mode 100644 CRIU_code/test/others/app-emu/make/tmpl.c create mode 100644 CRIU_code/test/others/app-emu/screen/run.sh create mode 100644 CRIU_code/test/others/app-emu/tarbz/run.sh create mode 100644 CRIU_code/test/others/app-emu/vnc/run.sh create mode 100644 CRIU_code/test/others/app-emu/vnc/vnc-server.sh create mode 100644 CRIU_code/test/others/bers/Makefile create mode 100644 CRIU_code/test/others/bers/bers.c create mode 100644 CRIU_code/test/others/bers/bers.txt create mode 100644 CRIU_code/test/others/crit/.gitignore create mode 100644 CRIU_code/test/others/crit/Makefile create mode 100644 CRIU_code/test/others/crit/loop.sh create mode 100644 CRIU_code/test/others/crit/test.sh create mode 100644 
CRIU_code/test/others/criu-coredump/.gitignore create mode 100644 CRIU_code/test/others/criu-coredump/Makefile create mode 100644 CRIU_code/test/others/criu-coredump/loop.sh create mode 100644 CRIU_code/test/others/criu-coredump/test.sh create mode 100644 CRIU_code/test/others/env.sh create mode 100644 CRIU_code/test/others/exec/Makefile create mode 100644 CRIU_code/test/others/exec/run.sh create mode 100644 CRIU_code/test/others/ext-links/Makefile create mode 100644 CRIU_code/test/others/ext-links/addmv.sh create mode 100644 CRIU_code/test/others/ext-links/addmv_raw.sh create mode 100644 CRIU_code/test/others/ext-links/mvlink.c create mode 100644 CRIU_code/test/others/ext-links/run.sh create mode 100644 CRIU_code/test/others/ext-links/run_ns.sh create mode 100644 CRIU_code/test/others/ext-links/run_wait.sh create mode 100644 CRIU_code/test/others/ext-tty/run.py create mode 100644 CRIU_code/test/others/functions.sh create mode 100644 CRIU_code/test/others/libcriu/.gitignore create mode 100644 CRIU_code/test/others/libcriu/Makefile create mode 100644 CRIU_code/test/others/libcriu/lib.c create mode 100644 CRIU_code/test/others/libcriu/lib.h create mode 100644 CRIU_code/test/others/libcriu/run.sh create mode 100644 CRIU_code/test/others/libcriu/test_errno.c create mode 100644 CRIU_code/test/others/libcriu/test_iters.c create mode 100644 CRIU_code/test/others/libcriu/test_notify.c create mode 100644 CRIU_code/test/others/libcriu/test_self.c create mode 100644 CRIU_code/test/others/libcriu/test_sub.c create mode 100644 CRIU_code/test/others/make/Makefile create mode 100644 CRIU_code/test/others/make/uninstall.sh create mode 100644 CRIU_code/test/others/mem-snap/Makefile create mode 100644 CRIU_code/test/others/mem-snap/run-predump-2.sh create mode 100644 CRIU_code/test/others/mem-snap/run-predump.sh create mode 100644 CRIU_code/test/others/mem-snap/run-snap-auto-dedup.sh create mode 100644 CRIU_code/test/others/mem-snap/run-snap-dedup-on-restore.sh create mode 100644 
CRIU_code/test/others/mem-snap/run-snap-dedup.sh create mode 100644 CRIU_code/test/others/mem-snap/run-snap-maps04.sh create mode 100644 CRIU_code/test/others/mem-snap/run-snap.sh create mode 100644 CRIU_code/test/others/mem-snap/run.sh create mode 100644 CRIU_code/test/others/mnt-ext-dev/Makefile create mode 100644 CRIU_code/test/others/mnt-ext-dev/run.sh create mode 100644 CRIU_code/test/others/mounts/ext/Makefile create mode 100644 CRIU_code/test/others/mounts/ext/ext-mount.c create mode 100644 CRIU_code/test/others/mounts/ext/ns_init.c create mode 100644 CRIU_code/test/others/mounts/ext/run.sh create mode 100644 CRIU_code/test/others/mounts/mounts.py create mode 100644 CRIU_code/test/others/mounts/mounts.sh create mode 100644 CRIU_code/test/others/mounts/run.sh create mode 100644 CRIU_code/test/others/netns_ext/Makefile create mode 100644 CRIU_code/test/others/netns_ext/_run.sh create mode 100644 CRIU_code/test/others/netns_ext/run.sh create mode 100644 CRIU_code/test/others/overlayfs/Makefile create mode 100644 CRIU_code/test/others/overlayfs/run.sh create mode 100644 CRIU_code/test/others/pipes/Makefile create mode 100644 CRIU_code/test/others/pipes/pipe.c create mode 100644 CRIU_code/test/others/rpc/.gitignore create mode 100644 CRIU_code/test/others/rpc/Makefile create mode 100644 CRIU_code/test/others/rpc/config_file.py create mode 100644 CRIU_code/test/others/rpc/errno.py create mode 100644 CRIU_code/test/others/rpc/loop.sh create mode 100644 CRIU_code/test/others/rpc/ps_test.py create mode 100644 CRIU_code/test/others/rpc/read.py create mode 100644 CRIU_code/test/others/rpc/restore-loop.py create mode 100644 CRIU_code/test/others/rpc/rpc.proto create mode 100644 CRIU_code/test/others/rpc/run.sh create mode 100644 CRIU_code/test/others/rpc/test-c.c create mode 100644 CRIU_code/test/others/rpc/test.py create mode 100644 CRIU_code/test/others/rpc/version.py create mode 100644 CRIU_code/test/others/security/Makefile create mode 100644 
CRIU_code/test/others/security/loop.sh create mode 100644 CRIU_code/test/others/security/run.sh create mode 100644 CRIU_code/test/others/shell-job/Makefile create mode 100644 CRIU_code/test/others/shell-job/run.py create mode 100644 CRIU_code/test/others/socketpairs/Makefile create mode 100644 CRIU_code/test/others/socketpairs/socketpair.c create mode 100644 CRIU_code/test/others/tcp/Makefile create mode 100644 CRIU_code/test/others/tcp/cln.c create mode 100644 CRIU_code/test/others/tcp/run.sh create mode 100644 CRIU_code/test/others/tcp/srv.c create mode 100644 CRIU_code/test/others/unix-callback/Makefile create mode 100644 CRIU_code/test/others/unix-callback/run.sh create mode 100644 CRIU_code/test/others/unix-callback/syslog-lib.c create mode 100644 CRIU_code/test/others/unix-callback/unix-client.c create mode 100644 CRIU_code/test/others/unix-callback/unix-lib.c create mode 100644 CRIU_code/test/others/unix-callback/unix-server.c create mode 100644 CRIU_code/test/others/unix-callback/unix.proto create mode 100644 CRIU_code/test/pki/cacert.pem create mode 100644 CRIU_code/test/pki/cert.pem create mode 100644 CRIU_code/test/pki/key.pem create mode 100644 CRIU_code/test/pycriu create mode 100644 CRIU_code/test/show_action.sh create mode 100644 CRIU_code/test/umount2.c create mode 100644 CRIU_code/test/zdtm.desc create mode 100644 CRIU_code/test/zdtm.py create mode 100644 CRIU_code/test/zdtm/.gitignore create mode 100644 CRIU_code/test/zdtm/Makefile create mode 100644 CRIU_code/test/zdtm/Makefile.inc create mode 100644 CRIU_code/test/zdtm/lib/Makefile create mode 100644 CRIU_code/test/zdtm/lib/arch/aarch64/include/asm/atomic.h create mode 100644 CRIU_code/test/zdtm/lib/arch/arm/include/asm/atomic.h create mode 100644 CRIU_code/test/zdtm/lib/arch/ppc64/include/asm/atomic.h create mode 100644 CRIU_code/test/zdtm/lib/arch/s390/include/asm/atomic.h create mode 100644 CRIU_code/test/zdtm/lib/arch/x86/include/asm/atomic.h create mode 100644 
CRIU_code/test/zdtm/lib/cpuid.h create mode 100644 CRIU_code/test/zdtm/lib/datagen.c create mode 100644 CRIU_code/test/zdtm/lib/fs.c create mode 100644 CRIU_code/test/zdtm/lib/fs.h create mode 100644 CRIU_code/test/zdtm/lib/groups.c create mode 100644 CRIU_code/test/zdtm/lib/groups.desc create mode 100644 CRIU_code/test/zdtm/lib/lock.c create mode 100644 CRIU_code/test/zdtm/lib/lock.h create mode 100644 CRIU_code/test/zdtm/lib/msg.c create mode 100644 CRIU_code/test/zdtm/lib/ns.c create mode 100644 CRIU_code/test/zdtm/lib/ns.h create mode 100644 CRIU_code/test/zdtm/lib/parseargs.c create mode 100644 CRIU_code/test/zdtm/lib/parseargs.sh create mode 100644 CRIU_code/test/zdtm/lib/stop_and_chk.sh create mode 100644 CRIU_code/test/zdtm/lib/streamutil.c create mode 100644 CRIU_code/test/zdtm/lib/tcp.c create mode 100644 CRIU_code/test/zdtm/lib/test.c create mode 100644 CRIU_code/test/zdtm/lib/zdtmtst.h create mode 100644 CRIU_code/test/zdtm/static/Makefile create mode 100644 CRIU_code/test/zdtm/static/aio00.c create mode 100644 CRIU_code/test/zdtm/static/aio00.desc create mode 100644 CRIU_code/test/zdtm/static/aio01.c create mode 100644 CRIU_code/test/zdtm/static/aio01.desc create mode 100644 CRIU_code/test/zdtm/static/apparmor.c create mode 100644 CRIU_code/test/zdtm/static/apparmor.checkskip create mode 100644 CRIU_code/test/zdtm/static/apparmor.desc create mode 100644 CRIU_code/test/zdtm/static/apparmor.profile create mode 100644 CRIU_code/test/zdtm/static/arm-neon00.c create mode 100644 CRIU_code/test/zdtm/static/arm-neon00.desc create mode 100644 CRIU_code/test/zdtm/static/auto_dev-ioctl.h create mode 100644 CRIU_code/test/zdtm/static/autofs.c create mode 100644 CRIU_code/test/zdtm/static/autofs.desc create mode 100644 CRIU_code/test/zdtm/static/bind-mount.c create mode 100644 CRIU_code/test/zdtm/static/bind-mount.desc create mode 100644 CRIU_code/test/zdtm/static/binfmt_misc.c create mode 100644 CRIU_code/test/zdtm/static/binfmt_misc.desc create mode 100644 
CRIU_code/test/zdtm/static/binfmt_misc.hook create mode 100644 CRIU_code/test/zdtm/static/bridge.c create mode 100644 CRIU_code/test/zdtm/static/bridge.desc create mode 100644 CRIU_code/test/zdtm/static/busyloop00.c create mode 100644 CRIU_code/test/zdtm/static/caps00.c create mode 100644 CRIU_code/test/zdtm/static/caps00.desc create mode 100644 CRIU_code/test/zdtm/static/cgroup00.c create mode 100644 CRIU_code/test/zdtm/static/cgroup00.desc create mode 100644 CRIU_code/test/zdtm/static/cgroup00.hook create mode 100644 CRIU_code/test/zdtm/static/cgroup01.c create mode 100644 CRIU_code/test/zdtm/static/cgroup01.desc create mode 100644 CRIU_code/test/zdtm/static/cgroup01.hook create mode 100644 CRIU_code/test/zdtm/static/cgroup02.c create mode 100644 CRIU_code/test/zdtm/static/cgroup02.desc create mode 100644 CRIU_code/test/zdtm/static/cgroup02.hook create mode 100644 CRIU_code/test/zdtm/static/cgroup03.c create mode 100644 CRIU_code/test/zdtm/static/cgroup03.desc create mode 100644 CRIU_code/test/zdtm/static/cgroup03.hook create mode 100644 CRIU_code/test/zdtm/static/cgroup04.c create mode 100644 CRIU_code/test/zdtm/static/cgroup04.desc create mode 100644 CRIU_code/test/zdtm/static/cgroup04.hook create mode 100644 CRIU_code/test/zdtm/static/cgroup_ifpriomap.c create mode 100644 CRIU_code/test/zdtm/static/cgroup_ifpriomap.desc create mode 100644 CRIU_code/test/zdtm/static/cgroup_ifpriomap.hook create mode 100644 CRIU_code/test/zdtm/static/cgroup_stray.c create mode 100644 CRIU_code/test/zdtm/static/cgroup_stray.desc create mode 100644 CRIU_code/test/zdtm/static/cgroupns.c create mode 100644 CRIU_code/test/zdtm/static/cgroupns.desc create mode 100644 CRIU_code/test/zdtm/static/child_opened_proc.c create mode 100644 CRIU_code/test/zdtm/static/chroot-file.c create mode 100644 CRIU_code/test/zdtm/static/chroot-file.desc create mode 100644 CRIU_code/test/zdtm/static/chroot.c create mode 100644 CRIU_code/test/zdtm/static/chroot.desc create mode 100644 
CRIU_code/test/zdtm/static/clean_mntns.c create mode 100644 CRIU_code/test/zdtm/static/clean_mntns.desc create mode 100644 CRIU_code/test/zdtm/static/clone_fs.c create mode 100644 CRIU_code/test/zdtm/static/cmdlinenv00.c create mode 100644 CRIU_code/test/zdtm/static/cmdlinenv00.desc create mode 100644 CRIU_code/test/zdtm/static/config_inotify_irmap.c create mode 100644 CRIU_code/test/zdtm/static/config_inotify_irmap.desc create mode 100644 CRIU_code/test/zdtm/static/conntracks create mode 100644 CRIU_code/test/zdtm/static/conntracks.desc create mode 100644 CRIU_code/test/zdtm/static/console.c create mode 100644 CRIU_code/test/zdtm/static/console.desc create mode 100644 CRIU_code/test/zdtm/static/cow00.c create mode 100644 CRIU_code/test/zdtm/static/cow00.desc create mode 100644 CRIU_code/test/zdtm/static/cow01.c create mode 100644 CRIU_code/test/zdtm/static/cow01.desc create mode 100644 CRIU_code/test/zdtm/static/cr_veth.c create mode 100644 CRIU_code/test/zdtm/static/cr_veth.checkskip create mode 100644 CRIU_code/test/zdtm/static/cr_veth.desc create mode 100644 CRIU_code/test/zdtm/static/cr_veth.hook create mode 100644 CRIU_code/test/zdtm/static/criu-rtc.c create mode 100644 CRIU_code/test/zdtm/static/criu-rtc.proto create mode 100644 CRIU_code/test/zdtm/static/cwd00.c create mode 100644 CRIU_code/test/zdtm/static/cwd01.c create mode 100644 CRIU_code/test/zdtm/static/cwd02.c create mode 100644 CRIU_code/test/zdtm/static/del_standalone_un.c create mode 100644 CRIU_code/test/zdtm/static/del_standalone_un.desc create mode 100644 CRIU_code/test/zdtm/static/deleted_dev.c create mode 100644 CRIU_code/test/zdtm/static/deleted_dev.desc create mode 100644 CRIU_code/test/zdtm/static/deleted_unix_sock.c create mode 100644 CRIU_code/test/zdtm/static/different_creds.c create mode 100644 CRIU_code/test/zdtm/static/different_creds.desc create mode 100644 CRIU_code/test/zdtm/static/dumpable01.c create mode 100644 CRIU_code/test/zdtm/static/dumpable02.c create mode 100644 
CRIU_code/test/zdtm/static/dumpable02.desc create mode 100644 CRIU_code/test/zdtm/static/env00.c create mode 100644 CRIU_code/test/zdtm/static/epoll.c create mode 100644 CRIU_code/test/zdtm/static/epoll.desc create mode 100644 CRIU_code/test/zdtm/static/eventfs00.c create mode 100644 CRIU_code/test/zdtm/static/fanotify00.c create mode 100644 CRIU_code/test/zdtm/static/fanotify00.desc create mode 100644 CRIU_code/test/zdtm/static/fd.c create mode 100644 CRIU_code/test/zdtm/static/fd01.c create mode 100644 CRIU_code/test/zdtm/static/fd01.desc create mode 100644 CRIU_code/test/zdtm/static/fdt_shared.c create mode 100644 CRIU_code/test/zdtm/static/fifo-ghost.c create mode 100644 CRIU_code/test/zdtm/static/fifo-rowo-pair.c create mode 100644 CRIU_code/test/zdtm/static/fifo.c create mode 100644 CRIU_code/test/zdtm/static/fifo_ro.c create mode 100644 CRIU_code/test/zdtm/static/fifo_wronly.c create mode 100644 CRIU_code/test/zdtm/static/file_append.c create mode 100644 CRIU_code/test/zdtm/static/file_attr.c create mode 100644 CRIU_code/test/zdtm/static/file_fown.c create mode 100644 CRIU_code/test/zdtm/static/file_fown.desc create mode 100644 CRIU_code/test/zdtm/static/file_lease00.c create mode 100644 CRIU_code/test/zdtm/static/file_lease00.desc create mode 100644 CRIU_code/test/zdtm/static/file_lease01.c create mode 100644 CRIU_code/test/zdtm/static/file_lease01.desc create mode 100644 CRIU_code/test/zdtm/static/file_lease02.c create mode 100644 CRIU_code/test/zdtm/static/file_lease02.desc create mode 100644 CRIU_code/test/zdtm/static/file_lease03.c create mode 100644 CRIU_code/test/zdtm/static/file_lease03.desc create mode 100644 CRIU_code/test/zdtm/static/file_lease04.c create mode 100644 CRIU_code/test/zdtm/static/file_lease04.desc create mode 100644 CRIU_code/test/zdtm/static/file_locks00.c create mode 100644 CRIU_code/test/zdtm/static/file_locks00.desc create mode 100644 CRIU_code/test/zdtm/static/file_locks01.c create mode 100644 
CRIU_code/test/zdtm/static/file_locks01.desc create mode 100644 CRIU_code/test/zdtm/static/file_locks02.c create mode 100644 CRIU_code/test/zdtm/static/file_locks02.desc create mode 100644 CRIU_code/test/zdtm/static/file_locks03.c create mode 100644 CRIU_code/test/zdtm/static/file_locks03.desc create mode 100644 CRIU_code/test/zdtm/static/file_locks04.c create mode 100644 CRIU_code/test/zdtm/static/file_locks04.desc create mode 100644 CRIU_code/test/zdtm/static/file_locks05.c create mode 100644 CRIU_code/test/zdtm/static/file_locks05.desc create mode 100644 CRIU_code/test/zdtm/static/file_locks06.c create mode 100644 CRIU_code/test/zdtm/static/file_locks06.checkskip create mode 100644 CRIU_code/test/zdtm/static/file_locks06.desc create mode 100644 CRIU_code/test/zdtm/static/file_locks07.c create mode 100644 CRIU_code/test/zdtm/static/file_locks07.checkskip create mode 100644 CRIU_code/test/zdtm/static/file_locks07.desc create mode 100644 CRIU_code/test/zdtm/static/file_locks08.c create mode 100644 CRIU_code/test/zdtm/static/file_locks08.checkskip create mode 100644 CRIU_code/test/zdtm/static/file_locks08.desc create mode 100644 CRIU_code/test/zdtm/static/file_shared.c create mode 100644 CRIU_code/test/zdtm/static/fpu00.c create mode 100644 CRIU_code/test/zdtm/static/fpu00.desc create mode 100644 CRIU_code/test/zdtm/static/fpu01.c create mode 100644 CRIU_code/test/zdtm/static/fpu01.desc create mode 100644 CRIU_code/test/zdtm/static/fpu02.c create mode 100644 CRIU_code/test/zdtm/static/fpu02.desc create mode 100644 CRIU_code/test/zdtm/static/futex-rl.c create mode 100644 CRIU_code/test/zdtm/static/futex.c create mode 100644 CRIU_code/test/zdtm/static/get_smaps_bits.c create mode 100644 CRIU_code/test/zdtm/static/get_smaps_bits.h create mode 100644 CRIU_code/test/zdtm/static/ghost_holes00.c create mode 100644 CRIU_code/test/zdtm/static/ghost_holes01.c create mode 100644 CRIU_code/test/zdtm/static/ghost_holes02.c create mode 100644 
CRIU_code/test/zdtm/static/ghost_on_rofs.c create mode 100644 CRIU_code/test/zdtm/static/ghost_on_rofs.desc create mode 100644 CRIU_code/test/zdtm/static/groups.c create mode 100644 CRIU_code/test/zdtm/static/groups.desc create mode 100644 CRIU_code/test/zdtm/static/grow_map.c create mode 100644 CRIU_code/test/zdtm/static/grow_map.desc create mode 100644 CRIU_code/test/zdtm/static/grow_map02.c create mode 100644 CRIU_code/test/zdtm/static/grow_map02.desc create mode 100644 CRIU_code/test/zdtm/static/grow_map03.c create mode 100644 CRIU_code/test/zdtm/static/grow_map03.desc create mode 100644 CRIU_code/test/zdtm/static/helper_zombie_child.c create mode 100644 CRIU_code/test/zdtm/static/helper_zombie_child.desc create mode 100644 CRIU_code/test/zdtm/static/inotify00.c create mode 100644 CRIU_code/test/zdtm/static/inotify00.desc create mode 100644 CRIU_code/test/zdtm/static/inotify01.c create mode 100644 CRIU_code/test/zdtm/static/inotify01.desc create mode 100644 CRIU_code/test/zdtm/static/inotify02.c create mode 100644 CRIU_code/test/zdtm/static/inotify02.desc create mode 100644 CRIU_code/test/zdtm/static/inotify_irmap.c create mode 100644 CRIU_code/test/zdtm/static/inotify_irmap.desc create mode 100644 CRIU_code/test/zdtm/static/inotify_irmap.hook create mode 100644 CRIU_code/test/zdtm/static/inotify_system.c create mode 100644 CRIU_code/test/zdtm/static/inotify_system.desc create mode 100644 CRIU_code/test/zdtm/static/inotify_system_nodel.c create mode 100644 CRIU_code/test/zdtm/static/inotify_system_nodel.desc create mode 100644 CRIU_code/test/zdtm/static/ipc_namespace.c create mode 100644 CRIU_code/test/zdtm/static/ipc_namespace.desc create mode 100644 CRIU_code/test/zdtm/static/jobctl00.c create mode 100644 CRIU_code/test/zdtm/static/link10.c create mode 100644 CRIU_code/test/zdtm/static/loginuid.c create mode 100644 CRIU_code/test/zdtm/static/loginuid.desc create mode 100644 CRIU_code/test/zdtm/static/macvlan.c create mode 100644 
CRIU_code/test/zdtm/static/macvlan.desc create mode 100644 CRIU_code/test/zdtm/static/macvlan.hook create mode 100644 CRIU_code/test/zdtm/static/maps00.c create mode 100644 CRIU_code/test/zdtm/static/maps01.c create mode 100644 CRIU_code/test/zdtm/static/maps01.desc create mode 100644 CRIU_code/test/zdtm/static/maps02.c create mode 100644 CRIU_code/test/zdtm/static/maps03.c create mode 100644 CRIU_code/test/zdtm/static/maps03.desc create mode 100644 CRIU_code/test/zdtm/static/maps04.c create mode 100644 CRIU_code/test/zdtm/static/maps04.desc create mode 100644 CRIU_code/test/zdtm/static/maps05.c create mode 100644 CRIU_code/test/zdtm/static/maps06.c create mode 100644 CRIU_code/test/zdtm/static/maps_file_prot.c create mode 100644 CRIU_code/test/zdtm/static/mem-touch.c create mode 100644 CRIU_code/test/zdtm/static/mem-touch.desc create mode 100644 CRIU_code/test/zdtm/static/mlock_setuid.c create mode 100644 CRIU_code/test/zdtm/static/mlock_setuid.desc create mode 100644 CRIU_code/test/zdtm/static/mmx00.c create mode 100644 CRIU_code/test/zdtm/static/mmx00.desc create mode 100644 CRIU_code/test/zdtm/static/mnt_enablefs.c create mode 100644 CRIU_code/test/zdtm/static/mnt_enablefs.checkskip create mode 100644 CRIU_code/test/zdtm/static/mnt_enablefs.desc create mode 100644 CRIU_code/test/zdtm/static/mnt_ext_auto.c create mode 100644 CRIU_code/test/zdtm/static/mnt_ext_auto.desc create mode 100644 CRIU_code/test/zdtm/static/mnt_ext_dev.c create mode 100644 CRIU_code/test/zdtm/static/mnt_ext_dev.desc create mode 100644 CRIU_code/test/zdtm/static/mnt_ext_manual.c create mode 100644 CRIU_code/test/zdtm/static/mnt_ext_manual.desc create mode 100644 CRIU_code/test/zdtm/static/mnt_ext_master.c create mode 100644 CRIU_code/test/zdtm/static/mnt_ext_master.desc create mode 100644 CRIU_code/test/zdtm/static/mnt_ro_bind.c create mode 100644 CRIU_code/test/zdtm/static/mnt_ro_bind.desc create mode 100644 CRIU_code/test/zdtm/static/mnt_tracefs.c create mode 100644 
CRIU_code/test/zdtm/static/mnt_tracefs.checkskip create mode 100644 CRIU_code/test/zdtm/static/mnt_tracefs.desc create mode 100644 CRIU_code/test/zdtm/static/mnt_tracefs.hook create mode 100644 CRIU_code/test/zdtm/static/mntns-deleted-dst create mode 100644 CRIU_code/test/zdtm/static/mntns_deleted.c create mode 100644 CRIU_code/test/zdtm/static/mntns_deleted.desc create mode 100644 CRIU_code/test/zdtm/static/mntns_ghost.c create mode 100644 CRIU_code/test/zdtm/static/mntns_ghost.desc create mode 100644 CRIU_code/test/zdtm/static/mntns_ghost01.c create mode 100644 CRIU_code/test/zdtm/static/mntns_ghost01.desc create mode 100644 CRIU_code/test/zdtm/static/mntns_link_ghost.c create mode 100644 CRIU_code/test/zdtm/static/mntns_link_ghost.desc create mode 100644 CRIU_code/test/zdtm/static/mntns_link_remap.c create mode 100644 CRIU_code/test/zdtm/static/mntns_link_remap.desc create mode 100644 CRIU_code/test/zdtm/static/mntns_open.c create mode 100644 CRIU_code/test/zdtm/static/mntns_open.desc create mode 100644 CRIU_code/test/zdtm/static/mntns_overmount.c create mode 100644 CRIU_code/test/zdtm/static/mntns_overmount.desc create mode 100644 CRIU_code/test/zdtm/static/mntns_remap.c create mode 100644 CRIU_code/test/zdtm/static/mntns_remap.desc create mode 100644 CRIU_code/test/zdtm/static/mntns_ro_root.c create mode 100644 CRIU_code/test/zdtm/static/mntns_ro_root.desc create mode 100644 CRIU_code/test/zdtm/static/mntns_root_bind.c create mode 100644 CRIU_code/test/zdtm/static/mntns_root_bind.desc create mode 100644 CRIU_code/test/zdtm/static/mntns_root_bind02.c create mode 100644 CRIU_code/test/zdtm/static/mntns_root_bind02.desc create mode 100644 CRIU_code/test/zdtm/static/mntns_rw_ro_rw.c create mode 100644 CRIU_code/test/zdtm/static/mntns_rw_ro_rw.desc create mode 100644 CRIU_code/test/zdtm/static/mntns_shared_bind.c create mode 100644 CRIU_code/test/zdtm/static/mntns_shared_bind.desc create mode 100644 CRIU_code/test/zdtm/static/mntns_shared_bind02.c create mode 
100644 CRIU_code/test/zdtm/static/mntns_shared_bind02.desc create mode 100644 CRIU_code/test/zdtm/static/mntns_shared_bind03.c create mode 100644 CRIU_code/test/zdtm/static/mntns_shared_bind03.desc create mode 100644 CRIU_code/test/zdtm/static/mntns_shared_vs_private.c create mode 100644 CRIU_code/test/zdtm/static/mntns_shared_vs_private.desc create mode 100644 CRIU_code/test/zdtm/static/mount_paths.c create mode 100644 CRIU_code/test/zdtm/static/mount_paths.desc create mode 100644 CRIU_code/test/zdtm/static/mountpoints.c create mode 100644 CRIU_code/test/zdtm/static/mountpoints.desc create mode 100644 CRIU_code/test/zdtm/static/mprotect00.c create mode 100644 CRIU_code/test/zdtm/static/msgque.c create mode 100644 CRIU_code/test/zdtm/static/msgque.desc create mode 100644 CRIU_code/test/zdtm/static/mtime_mmap.c create mode 100644 CRIU_code/test/zdtm/static/netns-dev.c create mode 100644 CRIU_code/test/zdtm/static/netns-dev.desc create mode 100644 CRIU_code/test/zdtm/static/netns-nf.c create mode 100644 CRIU_code/test/zdtm/static/netns-nf.desc create mode 100644 CRIU_code/test/zdtm/static/netns.c create mode 100644 CRIU_code/test/zdtm/static/netns.desc create mode 100644 CRIU_code/test/zdtm/static/netns_sub.c create mode 100644 CRIU_code/test/zdtm/static/netns_sub.desc create mode 100644 CRIU_code/test/zdtm/static/netns_sub_veth.c create mode 100644 CRIU_code/test/zdtm/static/netns_sub_veth.desc create mode 100644 CRIU_code/test/zdtm/static/non_uniform_share_propagation.c create mode 100644 CRIU_code/test/zdtm/static/non_uniform_share_propagation.desc create mode 100644 CRIU_code/test/zdtm/static/ofd_file_locks.c create mode 100644 CRIU_code/test/zdtm/static/ofd_file_locks.h create mode 100644 CRIU_code/test/zdtm/static/oom_score_adj.c create mode 100644 CRIU_code/test/zdtm/static/overmount_dev.c create mode 100644 CRIU_code/test/zdtm/static/overmount_dev.desc create mode 100644 CRIU_code/test/zdtm/static/overmount_fifo.c create mode 100644 
CRIU_code/test/zdtm/static/overmount_fifo.desc create mode 100644 CRIU_code/test/zdtm/static/overmount_file.c create mode 100644 CRIU_code/test/zdtm/static/overmount_file.desc create mode 100644 CRIU_code/test/zdtm/static/overmount_sock.c create mode 100644 CRIU_code/test/zdtm/static/overmount_sock.desc create mode 100644 CRIU_code/test/zdtm/static/overmount_with_shared_parent.c create mode 100644 CRIU_code/test/zdtm/static/overmount_with_shared_parent.desc create mode 100644 CRIU_code/test/zdtm/static/overmounted_file.c create mode 100644 CRIU_code/test/zdtm/static/overmounted_file.desc create mode 100644 CRIU_code/test/zdtm/static/packet_sock.c create mode 100644 CRIU_code/test/zdtm/static/packet_sock.desc create mode 100644 CRIU_code/test/zdtm/static/packet_sock_mmap.c create mode 100644 CRIU_code/test/zdtm/static/packet_sock_mmap.desc create mode 100644 CRIU_code/test/zdtm/static/packet_sock_spkt.c create mode 100644 CRIU_code/test/zdtm/static/packet_sock_spkt.desc create mode 100644 CRIU_code/test/zdtm/static/pdeath_sig.c create mode 100644 CRIU_code/test/zdtm/static/pid00.c create mode 100644 CRIU_code/test/zdtm/static/pid00.desc create mode 100644 CRIU_code/test/zdtm/static/pid_file.c create mode 100644 CRIU_code/test/zdtm/static/pipe00.c create mode 100644 CRIU_code/test/zdtm/static/pipe01.c create mode 100644 CRIU_code/test/zdtm/static/pipe02.c create mode 100644 CRIU_code/test/zdtm/static/pipe03.c create mode 100644 CRIU_code/test/zdtm/static/poll.c create mode 100644 CRIU_code/test/zdtm/static/poll.desc create mode 100644 CRIU_code/test/zdtm/static/posix_timers.c create mode 100644 CRIU_code/test/zdtm/static/private_bind_propagation.c create mode 100644 CRIU_code/test/zdtm/static/private_bind_propagation.desc create mode 100644 CRIU_code/test/zdtm/static/proc-self.c create mode 100644 CRIU_code/test/zdtm/static/pstree.c create mode 100644 CRIU_code/test/zdtm/static/pthread00.c create mode 100644 CRIU_code/test/zdtm/static/pthread01.c create mode 100644 
CRIU_code/test/zdtm/static/pthread02.c create mode 100644 CRIU_code/test/zdtm/static/pthread02.desc create mode 100644 CRIU_code/test/zdtm/static/ptrace_sig.c create mode 100644 CRIU_code/test/zdtm/static/ptrace_sig.desc create mode 100644 CRIU_code/test/zdtm/static/pty-console.c create mode 100644 CRIU_code/test/zdtm/static/pty-console.desc create mode 100644 CRIU_code/test/zdtm/static/pty00.c create mode 100644 CRIU_code/test/zdtm/static/pty01.c create mode 100644 CRIU_code/test/zdtm/static/pty01.desc create mode 100644 CRIU_code/test/zdtm/static/pty02.c create mode 100644 CRIU_code/test/zdtm/static/pty03.c create mode 100644 CRIU_code/test/zdtm/static/pty03.desc create mode 100644 CRIU_code/test/zdtm/static/pty04.c create mode 100644 CRIU_code/test/zdtm/static/remap_dead_pid.c create mode 100644 CRIU_code/test/zdtm/static/remap_dead_pid.desc create mode 100644 CRIU_code/test/zdtm/static/remap_dead_pid_root.c create mode 100644 CRIU_code/test/zdtm/static/remap_dead_pid_root.desc create mode 100644 CRIU_code/test/zdtm/static/rlimits00.c create mode 100644 CRIU_code/test/zdtm/static/rmdir_open.c create mode 100644 CRIU_code/test/zdtm/static/route_rules create mode 100644 CRIU_code/test/zdtm/static/rtc.c create mode 100644 CRIU_code/test/zdtm/static/rtc.desc create mode 100644 CRIU_code/test/zdtm/static/s390x_gs_threads.c create mode 100644 CRIU_code/test/zdtm/static/s390x_mmap_high.c create mode 100644 CRIU_code/test/zdtm/static/s390x_mmap_high.desc create mode 100644 CRIU_code/test/zdtm/static/s390x_regs_check.c create mode 100644 CRIU_code/test/zdtm/static/s390x_regs_check.desc create mode 100644 CRIU_code/test/zdtm/static/s390x_runtime_instr.c create mode 100644 CRIU_code/test/zdtm/static/sched_policy00.c create mode 100644 CRIU_code/test/zdtm/static/sched_policy00.desc create mode 100644 CRIU_code/test/zdtm/static/sched_prio00.c create mode 100644 CRIU_code/test/zdtm/static/sched_prio00.desc create mode 100644 CRIU_code/test/zdtm/static/scm00.c create mode 
100644 CRIU_code/test/zdtm/static/scm01.c create mode 100644 CRIU_code/test/zdtm/static/scm02.c create mode 100644 CRIU_code/test/zdtm/static/scm03.c create mode 100644 CRIU_code/test/zdtm/static/scm04.c create mode 100644 CRIU_code/test/zdtm/static/scm05.c create mode 100644 CRIU_code/test/zdtm/static/scm06.c create mode 100644 CRIU_code/test/zdtm/static/scm06.desc create mode 100644 CRIU_code/test/zdtm/static/seccomp_filter.c create mode 100644 CRIU_code/test/zdtm/static/seccomp_filter.desc create mode 100644 CRIU_code/test/zdtm/static/seccomp_filter_inheritance.c create mode 100644 CRIU_code/test/zdtm/static/seccomp_filter_inheritance.desc create mode 100644 CRIU_code/test/zdtm/static/seccomp_filter_threads.c create mode 100644 CRIU_code/test/zdtm/static/seccomp_filter_threads.desc create mode 100644 CRIU_code/test/zdtm/static/seccomp_filter_tsync.c create mode 100644 CRIU_code/test/zdtm/static/seccomp_filter_tsync.desc create mode 100644 CRIU_code/test/zdtm/static/seccomp_strict.c create mode 100644 CRIU_code/test/zdtm/static/seccomp_strict.desc create mode 100644 CRIU_code/test/zdtm/static/selfexe00.c create mode 100644 CRIU_code/test/zdtm/static/selinux00.c create mode 100644 CRIU_code/test/zdtm/static/selinux00.checkskip create mode 100644 CRIU_code/test/zdtm/static/selinux00.desc create mode 100644 CRIU_code/test/zdtm/static/selinux00.hook create mode 100644 CRIU_code/test/zdtm/static/selinux01.c create mode 100644 CRIU_code/test/zdtm/static/selinux01.checkskip create mode 100644 CRIU_code/test/zdtm/static/selinux01.desc create mode 100644 CRIU_code/test/zdtm/static/selinux01.hook create mode 100644 CRIU_code/test/zdtm/static/selinux02.c create mode 100644 CRIU_code/test/zdtm/static/selinux02.checkskip create mode 100644 CRIU_code/test/zdtm/static/selinux02.desc create mode 100644 CRIU_code/test/zdtm/static/selinux02.hook create mode 100644 CRIU_code/test/zdtm/static/sem.c create mode 100644 CRIU_code/test/zdtm/static/sem.desc create mode 100644 
CRIU_code/test/zdtm/static/session00.c create mode 100644 CRIU_code/test/zdtm/static/session00.desc create mode 100644 CRIU_code/test/zdtm/static/session01.c create mode 100644 CRIU_code/test/zdtm/static/session01.desc create mode 100644 CRIU_code/test/zdtm/static/session02.c create mode 100644 CRIU_code/test/zdtm/static/session02.desc create mode 100644 CRIU_code/test/zdtm/static/session03.c create mode 100644 CRIU_code/test/zdtm/static/session03.desc create mode 100644 CRIU_code/test/zdtm/static/shared_mount_propagation.c create mode 100644 CRIU_code/test/zdtm/static/shared_mount_propagation.desc create mode 100644 CRIU_code/test/zdtm/static/shared_slave_mount_children.c create mode 100644 CRIU_code/test/zdtm/static/shared_slave_mount_children.desc create mode 100644 CRIU_code/test/zdtm/static/shm-mp.c create mode 100644 CRIU_code/test/zdtm/static/shm-mp.desc create mode 100644 CRIU_code/test/zdtm/static/shm-unaligned.c create mode 100644 CRIU_code/test/zdtm/static/shm-unaligned.desc create mode 100644 CRIU_code/test/zdtm/static/shm.c create mode 100644 CRIU_code/test/zdtm/static/shm.desc create mode 100644 CRIU_code/test/zdtm/static/sigaltstack.c create mode 100644 CRIU_code/test/zdtm/static/signalfd00.c create mode 100644 CRIU_code/test/zdtm/static/sigpending.c create mode 100644 CRIU_code/test/zdtm/static/sit.c create mode 100644 CRIU_code/test/zdtm/static/sit.desc create mode 100644 CRIU_code/test/zdtm/static/sk-freebind-false.c create mode 100644 CRIU_code/test/zdtm/static/sk-freebind.c create mode 100644 CRIU_code/test/zdtm/static/sk-netlink.c create mode 100644 CRIU_code/test/zdtm/static/sk-netlink.desc create mode 100644 CRIU_code/test/zdtm/static/sk-unix-mntns.c create mode 100644 CRIU_code/test/zdtm/static/sk-unix-mntns.desc create mode 100644 CRIU_code/test/zdtm/static/sk-unix-rel.c create mode 100644 CRIU_code/test/zdtm/static/sk-unix-unconn.c create mode 100644 CRIU_code/test/zdtm/static/sk-unix01.c create mode 100644 
CRIU_code/test/zdtm/static/sk-unix01.desc create mode 100644 CRIU_code/test/zdtm/static/skip-me.c create mode 100644 CRIU_code/test/zdtm/static/sleeping00.c create mode 100644 CRIU_code/test/zdtm/static/sock_filter.c create mode 100644 CRIU_code/test/zdtm/static/sock_opts00.c create mode 100644 CRIU_code/test/zdtm/static/sock_opts00.desc create mode 100644 CRIU_code/test/zdtm/static/sock_opts01.c create mode 100644 CRIU_code/test/zdtm/static/sock_opts01.desc create mode 100644 CRIU_code/test/zdtm/static/sock_peercred.c create mode 100644 CRIU_code/test/zdtm/static/sock_peercred.desc create mode 100644 CRIU_code/test/zdtm/static/socket-ext.c create mode 100644 CRIU_code/test/zdtm/static/socket-ext.desc create mode 100644 CRIU_code/test/zdtm/static/socket-raw.c create mode 100644 CRIU_code/test/zdtm/static/socket-raw.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-close-wait.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-close-wait.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-close-wait.hook create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-close0.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-close0.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-close1.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-close1.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-closed-last-ack.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-closed-last-ack.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-closed-last-ack.hook create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-closed.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-closed.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-closed.hook create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-closing.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-closing.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-closing.hook create mode 100644 
CRIU_code/test/zdtm/static/socket-tcp-fin-wait1.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-fin-wait1.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-fin-wait1.hook create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-fin-wait2.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-fin-wait2.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-fin-wait2.hook create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-last-ack.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-last-ack.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-last-ack.hook create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-local.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-local.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-local.hook create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-nfconntrack.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-nfconntrack.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-reseted.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-reseted.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-reseted.hook create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-reuseport.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-reuseport.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-skip-in-flight.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-skip-in-flight.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-syn-sent.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-syn-sent.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-syn-sent.hook create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-unconn.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp-unconn.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp.desc create mode 100644 
CRIU_code/test/zdtm/static/socket-tcp4v6-close-wait.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp4v6-close-wait.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp4v6-closed.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp4v6-closed.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp4v6-closing.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp4v6-closing.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp4v6-fin-wait1.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp4v6-fin-wait1.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp4v6-fin-wait2.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp4v6-fin-wait2.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp4v6-last-ack.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp4v6-last-ack.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp4v6-local.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp4v6-local.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp4v6.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp4v6.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp6-close-wait.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp6-close-wait.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp6-closed.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp6-closed.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp6-closing.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp6-closing.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp6-closing.hook create mode 100644 CRIU_code/test/zdtm/static/socket-tcp6-fin-wait1.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp6-fin-wait1.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp6-fin-wait2.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp6-fin-wait2.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp6-last-ack.c create mode 100644 
CRIU_code/test/zdtm/static/socket-tcp6-last-ack.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp6-local.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp6-local.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp6-unconn.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp6-unconn.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcp6.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcp6.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcpbuf-local.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcpbuf-local.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcpbuf.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcpbuf.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcpbuf6-local.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcpbuf6-local.desc create mode 100644 CRIU_code/test/zdtm/static/socket-tcpbuf6.c create mode 100644 CRIU_code/test/zdtm/static/socket-tcpbuf6.desc create mode 100644 CRIU_code/test/zdtm/static/socket6_udp.c create mode 100644 CRIU_code/test/zdtm/static/socket_aio.c create mode 100644 CRIU_code/test/zdtm/static/socket_aio.desc create mode 100644 CRIU_code/test/zdtm/static/socket_close_data.c create mode 100644 CRIU_code/test/zdtm/static/socket_close_data01.c create mode 100644 CRIU_code/test/zdtm/static/socket_dgram_data.c create mode 100644 CRIU_code/test/zdtm/static/socket_listen.c create mode 100644 CRIU_code/test/zdtm/static/socket_listen4v6.c create mode 100644 CRIU_code/test/zdtm/static/socket_listen6.c create mode 100644 CRIU_code/test/zdtm/static/socket_queues.c create mode 100644 CRIU_code/test/zdtm/static/socket_snd_addr.c create mode 100644 CRIU_code/test/zdtm/static/socket_snd_addr.desc create mode 100644 CRIU_code/test/zdtm/static/socket_udp-broadcast.c create mode 100644 CRIU_code/test/zdtm/static/socket_udp-corked.c create mode 100644 CRIU_code/test/zdtm/static/socket_udp-corked.desc create mode 100644 
CRIU_code/test/zdtm/static/socket_udp.c create mode 100644 CRIU_code/test/zdtm/static/socket_udp_shutdown.c create mode 100644 CRIU_code/test/zdtm/static/socket_udplite.c create mode 100644 CRIU_code/test/zdtm/static/sockets00.c create mode 100644 CRIU_code/test/zdtm/static/sockets00.desc create mode 100644 CRIU_code/test/zdtm/static/sockets01.c create mode 100644 CRIU_code/test/zdtm/static/sockets02.c create mode 100644 CRIU_code/test/zdtm/static/sockets03.c create mode 100644 CRIU_code/test/zdtm/static/sockets03.desc create mode 100644 CRIU_code/test/zdtm/static/sockets_dgram.c create mode 100644 CRIU_code/test/zdtm/static/sockets_spair.c create mode 100644 CRIU_code/test/zdtm/static/sse00.c create mode 100644 CRIU_code/test/zdtm/static/sse00.desc create mode 100644 CRIU_code/test/zdtm/static/sse20.c create mode 100644 CRIU_code/test/zdtm/static/sse20.desc create mode 100644 CRIU_code/test/zdtm/static/stopped.c create mode 100644 CRIU_code/test/zdtm/static/stopped01.c create mode 100644 CRIU_code/test/zdtm/static/stopped02.c create mode 100644 CRIU_code/test/zdtm/static/stopped12.c create mode 100644 CRIU_code/test/zdtm/static/tempfs.c create mode 100644 CRIU_code/test/zdtm/static/tempfs.desc create mode 100644 CRIU_code/test/zdtm/static/tempfs_overmounted.c create mode 100644 CRIU_code/test/zdtm/static/tempfs_overmounted.desc create mode 100644 CRIU_code/test/zdtm/static/tempfs_overmounted01.c create mode 100644 CRIU_code/test/zdtm/static/tempfs_overmounted01.desc create mode 100644 CRIU_code/test/zdtm/static/tempfs_ro.c create mode 100644 CRIU_code/test/zdtm/static/tempfs_ro.desc create mode 100644 CRIU_code/test/zdtm/static/tempfs_ro02.c create mode 100644 CRIU_code/test/zdtm/static/tempfs_ro02.desc create mode 100644 CRIU_code/test/zdtm/static/tempfs_subns.c create mode 100644 CRIU_code/test/zdtm/static/tempfs_subns.desc create mode 100644 CRIU_code/test/zdtm/static/thp_disable.c create mode 100644 CRIU_code/test/zdtm/static/thread_different_uid_gid.c create 
mode 100644 CRIU_code/test/zdtm/static/thread_different_uid_gid.desc create mode 100644 CRIU_code/test/zdtm/static/timerfd.c create mode 100644 CRIU_code/test/zdtm/static/timerfd.desc create mode 100644 CRIU_code/test/zdtm/static/timers.c create mode 100644 CRIU_code/test/zdtm/static/tty00.c create mode 100644 CRIU_code/test/zdtm/static/tty02.c create mode 100644 CRIU_code/test/zdtm/static/tty03.c create mode 100644 CRIU_code/test/zdtm/static/tun.c create mode 100644 CRIU_code/test/zdtm/static/tun.desc create mode 100644 CRIU_code/test/zdtm/static/tun_ns.c create mode 100644 CRIU_code/test/zdtm/static/tun_ns.desc create mode 100644 CRIU_code/test/zdtm/static/uffd-events.c create mode 100644 CRIU_code/test/zdtm/static/umask00.c create mode 100644 CRIU_code/test/zdtm/static/unbound_sock.c create mode 100644 CRIU_code/test/zdtm/static/unhashed_proc.c create mode 100644 CRIU_code/test/zdtm/static/unhashed_proc.desc create mode 100644 CRIU_code/test/zdtm/static/unlink_fifo.c create mode 100644 CRIU_code/test/zdtm/static/unlink_fifo_wronly.c create mode 100644 CRIU_code/test/zdtm/static/unlink_fstat00.c create mode 100644 CRIU_code/test/zdtm/static/unlink_fstat00.hook create mode 100644 CRIU_code/test/zdtm/static/unlink_fstat01+.c create mode 100644 CRIU_code/test/zdtm/static/unlink_fstat01.c create mode 100644 CRIU_code/test/zdtm/static/unlink_fstat02.c create mode 100644 CRIU_code/test/zdtm/static/unlink_fstat03.c create mode 100644 CRIU_code/test/zdtm/static/unlink_fstat03.desc create mode 100644 CRIU_code/test/zdtm/static/unlink_fstat04.c create mode 100644 CRIU_code/test/zdtm/static/unlink_fstat04.desc create mode 100644 CRIU_code/test/zdtm/static/unlink_fstat041.c create mode 100644 CRIU_code/test/zdtm/static/unlink_largefile.c create mode 100644 CRIU_code/test/zdtm/static/unlink_largefile.desc create mode 100644 CRIU_code/test/zdtm/static/unlink_mmap00.c create mode 100644 CRIU_code/test/zdtm/static/unlink_mmap00.desc create mode 100644 
CRIU_code/test/zdtm/static/unlink_mmap01.c create mode 100644 CRIU_code/test/zdtm/static/unlink_mmap01.desc create mode 100644 CRIU_code/test/zdtm/static/unlink_mmap02.c create mode 100644 CRIU_code/test/zdtm/static/unlink_mmap02.desc create mode 100644 CRIU_code/test/zdtm/static/unlink_multiple_largefiles.c create mode 100644 CRIU_code/test/zdtm/static/unlink_multiple_largefiles.desc create mode 100644 CRIU_code/test/zdtm/static/unlink_regular00.c create mode 100644 CRIU_code/test/zdtm/static/unlink_regular00.desc create mode 100644 CRIU_code/test/zdtm/static/unsupported_children_collision.c create mode 100644 CRIU_code/test/zdtm/static/unsupported_children_collision.desc create mode 100644 CRIU_code/test/zdtm/static/uptime_grow.c create mode 100644 CRIU_code/test/zdtm/static/uptime_grow.desc create mode 100644 CRIU_code/test/zdtm/static/utsname.c create mode 100644 CRIU_code/test/zdtm/static/utsname.desc create mode 100644 CRIU_code/test/zdtm/static/vdso-proxy.c create mode 100644 CRIU_code/test/zdtm/static/vdso00.c create mode 100644 CRIU_code/test/zdtm/static/vdso01.c create mode 100644 CRIU_code/test/zdtm/static/vdso01.desc create mode 100644 CRIU_code/test/zdtm/static/vdso02.c create mode 100644 CRIU_code/test/zdtm/static/vfork00.c create mode 100644 CRIU_code/test/zdtm/static/vfork00.desc create mode 100644 CRIU_code/test/zdtm/static/vsx.c create mode 100644 CRIU_code/test/zdtm/static/vsx.desc create mode 100644 CRIU_code/test/zdtm/static/vt.c create mode 100644 CRIU_code/test/zdtm/static/vt.desc create mode 100644 CRIU_code/test/zdtm/static/wait00.c create mode 100644 CRIU_code/test/zdtm/static/write_read00.c create mode 100644 CRIU_code/test/zdtm/static/write_read01.c create mode 100644 CRIU_code/test/zdtm/static/write_read02.c create mode 100644 CRIU_code/test/zdtm/static/write_read10.c create mode 100644 CRIU_code/test/zdtm/static/xids00.c create mode 100644 CRIU_code/test/zdtm/static/zombie00.c create mode 100644 CRIU_code/test/zdtm/static/zombie01.c 
create mode 100644 CRIU_code/test/zdtm/static/zombie01.desc create mode 100644 CRIU_code/test/zdtm/transition/Makefile create mode 100644 CRIU_code/test/zdtm/transition/epoll.c create mode 100644 CRIU_code/test/zdtm/transition/fifo_dyn.c create mode 100644 CRIU_code/test/zdtm/transition/fifo_dyn.desc create mode 100644 CRIU_code/test/zdtm/transition/fifo_loop.c create mode 100644 CRIU_code/test/zdtm/transition/file_aio.c create mode 100644 CRIU_code/test/zdtm/transition/file_read.c create mode 100644 CRIU_code/test/zdtm/transition/fork.c create mode 100644 CRIU_code/test/zdtm/transition/fork2.c create mode 100644 CRIU_code/test/zdtm/transition/ipc.c create mode 100644 CRIU_code/test/zdtm/transition/ipc.desc create mode 100644 CRIU_code/test/zdtm/transition/lazy-thp.c create mode 100644 CRIU_code/test/zdtm/transition/maps007.c create mode 100644 CRIU_code/test/zdtm/transition/maps007.desc create mode 100644 CRIU_code/test/zdtm/transition/maps008.c create mode 100644 CRIU_code/test/zdtm/transition/maps008.desc create mode 100644 CRIU_code/test/zdtm/transition/netlink00.c create mode 100644 CRIU_code/test/zdtm/transition/netlink00.desc create mode 100644 CRIU_code/test/zdtm/transition/pid_reuse.c create mode 100644 CRIU_code/test/zdtm/transition/pid_reuse.desc create mode 100644 CRIU_code/test/zdtm/transition/pipe_loop00.c create mode 100644 CRIU_code/test/zdtm/transition/pipe_shared00.c create mode 100644 CRIU_code/test/zdtm/transition/ptrace.c create mode 100644 CRIU_code/test/zdtm/transition/ptrace.desc create mode 100644 CRIU_code/test/zdtm/transition/shmem.c create mode 100644 CRIU_code/test/zdtm/transition/socket-tcp.c create mode 100644 CRIU_code/test/zdtm/transition/socket-tcp.desc create mode 100644 CRIU_code/test/zdtm/transition/socket-tcp6.c create mode 100644 CRIU_code/test/zdtm/transition/socket-tcp6.desc create mode 100644 CRIU_code/test/zdtm/transition/socket_loop00.c create mode 100644 CRIU_code/test/zdtm/transition/thread-bomb.c create mode 100644 
CRIU_code/test/zdtm/transition/thread-bomb.desc create mode 100644 CRIU_code/test/zdtm/transition/unix_sock.c create mode 100644 CRIU_code/test/zdtm_ct.c create mode 100644 CRIU_code/test/zdtm_mount_cgroups create mode 100644 CRIU_code/test/zdtm_umount_cgroups diff --git a/CRIU_code/.gitignore b/CRIU_code/.gitignore new file mode 100644 index 0000000..c231104 --- /dev/null +++ b/CRIU_code/.gitignore @@ -0,0 +1,44 @@ +.config +*.o +*.d +*.a +*.img +*.bin +*.elf +*.out +*.swp +*.swo +*.so +.git-ignore +*.patch +*.pyc +cscope* +tags +TAGS +Makefile.local +compel/compel +compel/compel-host-bin +images/*.c +images/*.h +images/google/protobuf/*.c +images/google/protobuf/*.h +.gitid +criu/criu +crit/crit +criu/arch/*/sys-exec-tbl*.c +# x86 syscalls-table is not generated +!criu/arch/x86/sys-exec-tbl.c +criu/arch/*/syscalls*.S +criu/include/syscall-codes*.h +criu/include/syscall*.h +criu/include/version.h +criu/pie/restorer-blob.h +criu/pie/parasite-blob.h +criu/protobuf-desc-gen.h +lib/build/ +lib/c/criu.pc +scripts/build/qemu-user-static/* +lib/.crit-setup.files +compel/include/asm +include/common/asm +include/common/config.h diff --git a/CRIU_code/.mailmap b/CRIU_code/.mailmap new file mode 100644 index 0000000..d8c3f59 --- /dev/null +++ b/CRIU_code/.mailmap @@ -0,0 +1,6 @@ +Stanislav Kinsbursky +Pavel Emelyanov +Andrey Vagin +Andrey Vagin +Andrey Vagin Andrew Vagin +Cyrill Gorcunov diff --git a/CRIU_code/.travis.yml b/CRIU_code/.travis.yml new file mode 100644 index 0000000..37db394 --- /dev/null +++ b/CRIU_code/.travis.yml @@ -0,0 +1,42 @@ +language: c +sudo: required +dist: xenial +cache: ccache +services: + - docker +env: + - TR_ARCH=local + - TR_ARCH=local CLANG=1 + - TR_ARCH=local COMPAT_TEST=y + - TR_ARCH=local CLANG=1 COMPAT_TEST=y + - TR_ARCH=alpine + - TR_ARCH=fedora-asan + - TR_ARCH=x86_64 + - TR_ARCH=x86_64 CLANG=1 + - TR_ARCH=armv7hf + - TR_ARCH=aarch64 + - TR_ARCH=ppc64le + - TR_ARCH=s390x + - TR_ARCH=armv7hf CLANG=1 + - TR_ARCH=aarch64 CLANG=1 + - 
TR_ARCH=ppc64le CLANG=1 + - TR_ARCH=alpine CLANG=1 + - TR_ARCH=docker-test + - TR_ARCH=fedora-rawhide + - TR_ARCH=fedora-rawhide-aarch64 + - TR_ARCH=centos +matrix: + allow_failures: + - env: TR_ARCH=docker-test + - env: TR_ARCH=fedora-rawhide + - env: TR_ARCH=fedora-rawhide-aarch64 + - env: TR_ARCH=s390x + - env: TR_ARCH=local GCOV=1 + - env: TR_ARCH=local COMPAT_TEST=y + - env: TR_ARCH=local CLANG=1 COMPAT_TEST=y +script: + - sudo make CCACHE=1 -C scripts/travis $TR_ARCH +after_success: + - ccache -s + - make -C scripts/travis after_success +group: deprecated-2017Q2 diff --git a/CRIU_code/COPYING b/CRIU_code/COPYING new file mode 100644 index 0000000..b04304e --- /dev/null +++ b/CRIU_code/COPYING @@ -0,0 +1,860 @@ +This software is licensed under the GNU GENERAL PUBLIC LICENCE Version +2. Except that any software in the lib/ directory is for the creation of a +linkable library to the tools and is licensed under the GNU LESSER GENERAL +PUBLIC LICENCE Version 2.1. Contributing Authors agree that their code is +submitted under the licence appropriate for its location within the source +tree (GPL except for LGPL in lib/) and agree that any future patches, provided +they are accepted into the project, may change the licence of their code from +GPL to LGPL by moving pieces of it into lib/ or LGPL to GPL by moving pieces +of it out of lib/ + +Note that the only valid version of the GPL is THIS particular version +of the license (ie v2, not v2.2 or v3.x or whatever), unless explicitly +otherwise stated. +---------------------------------------- + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. 
By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. 
If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License.
Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. + +--------------------------------------- + + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it.
You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. 
+ + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. 
In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. 
A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. 
You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. 
But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. 
You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. 
+ + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. 
(It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. 
You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. 
Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. 
Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. 
Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms of the +ordinary General Public License). + + To apply these terms, attach the following notices to the library. 
It is +safest to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James Random Hacker. + + , 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! diff --git a/CRIU_code/CREDITS b/CRIU_code/CREDITS new file mode 100644 index 0000000..e91ade4 --- /dev/null +++ b/CRIU_code/CREDITS @@ -0,0 +1,16 @@ +The following people provided invaluable help to CRIU project +(in alphabetical order) +------------------------------------------------------------------- + +Andrew Morton +David Miller +Eric Dumazet +Eric W. Biederman +H. 
Peter Anvin +Kees Cook +KOSAKI Motohiro +Li Yu +Linus Torvalds +Oleg Nesterov +Serge Hallyn +Tejun Heo diff --git a/CRIU_code/Documentation/.gitattributes b/CRIU_code/Documentation/.gitattributes new file mode 100644 index 0000000..ddb0301 --- /dev/null +++ b/CRIU_code/Documentation/.gitattributes @@ -0,0 +1 @@ +*.txt whitespace diff --git a/CRIU_code/Documentation/.gitignore b/CRIU_code/Documentation/.gitignore new file mode 100644 index 0000000..b4f3931 --- /dev/null +++ b/CRIU_code/Documentation/.gitignore @@ -0,0 +1,6 @@ +*.xml +*.html +*.[1-8] +*.pdf +*.ps +footer.txt diff --git a/CRIU_code/Documentation/HOWTO.cross-compile b/CRIU_code/Documentation/HOWTO.cross-compile new file mode 100644 index 0000000..f1b1784 --- /dev/null +++ b/CRIU_code/Documentation/HOWTO.cross-compile @@ -0,0 +1,39 @@ +This HOWTO explains how to cross-compile CRIU on x86 + + 1. Download the protobuf sources. + 2. Apply the patch http://16918.selcdn.ru/crtools/aarch64/0001-protobuf-added-the-support-for-the-acrchitecture-AAr.patch + 3. Configure protobuf to be compiled for the target architecture: + + ./configure --prefix=$X86_PREFIX --disable-shared --enable-static + + 4. Compile protobuf. + 5. Download protobuf-c sources. + 6. Configure protobuf-c for the architecture x86: + + export PATH=$PATH:$X86_PREFIX/bin + export PKG_CONFIG_PATH=$X86_PREFIX/lib/pkgconfig + CPPFLAGS=`pkg-config --cflags protobuf` LDFLAGS=`pkg-config --libs protobuf` ./configure --prefix=$X86_PREFIX --disable-shared --enable-static + + 7. Compile and install protobuf-c. + 8. Configure protobuf to be compiled for the target architecture: + + ./configure --prefix=$ARCH_PREFIX --disable-shared --enable-static --with-protoc=protoc --host=$TARGET + + 9. Compile and install protobuf. +10. Let PKG_CONFIG_PATH=$ARCH_PREFIX/lib/pkgconfig. +11. 
Configure protobuf-c to be compiled for the target architecture: + + CPPFLAGS=`pkg-config --cflags protobuf` LDFLAGS=`pkg-config --libs protobuf` ./configure --prefix=$ARCH_PREFIX --disable-shared --enable-static --disable-protoc --host=$TARGET + +12. Compile and install protobuf-c. +13. Compile CRIU: + + ARCH=<target arch> CROSS_COMPILE=$TARGET- CFLAGS=`pkg-config --cflags libprotobuf-c` LDFLAGS="`pkg-config --libs libprotobuf-c`" make + +Special notes for Android NDK cross compile: + +1, Android NDK doesn't have some headers required by CRIU build, they are <aio.h>, <sys/fanotify.h> + +2, Android NDK doesn't have some functions required by CRIU build, they are aio*, fanotify_init, fanotify_mark, pivot_root, index. + +3, in order to pass build with Android NDK, you must implement them yourself, and link them to CRIU. diff --git a/CRIU_code/Documentation/Makefile b/CRIU_code/Documentation/Makefile new file mode 100644 index 0000000..aa5d3eb --- /dev/null +++ b/CRIU_code/Documentation/Makefile @@ -0,0 +1,98 @@ +__nmk_dir ?= ../scripts/nmk/scripts/ +include $(__nmk_dir)include.mk +include $(__nmk_dir)macro.mk + +ifneq ($(USE_ASCIIDOCTOR),) +ASCIIDOC := asciidoctor +XMLTO := +else +ASCIIDOC := asciidoc +XMLTO := xmlto +endif + +FOOTER := footer.txt +SRC1 += crit.txt +SRC8 += criu.txt +SRC := $(SRC1) $(SRC8) +XMLS := $(patsubst %.txt,%.xml,$(SRC)) +MAN1S := $(patsubst %.txt,%.1,$(SRC1)) +MAN8S := $(patsubst %.txt,%.8,$(SRC8)) +MANS := $(MAN1S) $(MAN8S) +MAN1DIR := $(MANDIR)/man1 +MAN8DIR := $(MANDIR)/man8 + +GROFF :=groff +PAPER :=$(shell paperconf 2>/dev/null || echo letter) +GROFF_OPTS := -Tps -t -dpaper=$(PAPER) -P-p$(PAPER) -man -msafer -rC1 -rD1 -rS11 +PSS := $(patsubst %,%.ps,$(basename $(MANS))) +PDFS := $(patsubst %,%.pdf,$(basename $(MANS))) + +all: check $(MANS) +ps: $(PSS) +pdf: $(PDFS) +.PHONY: all ps pdf check + +check: + $(Q) for B in $(ASCIIDOC) $(XMLTO); do \ + $$B --version > /dev/null || exit 1; \ + done + +ifeq ($(CRIU_VERSION),) + include ../Makefile.versions +endif +$(FOOTER):
../Makefile.versions + $(call msg-gen, $@) + $(Q) echo ":doctype: manpage" > $@ + $(Q) echo ":man source: criu" >> $@ + $(Q) echo ":man version: $(CRIU_VERSION)" >> $@ + $(Q) echo ":man manual: CRIU Manual" >> $@ + +%.1: %.txt $(FOOTER) custom.xsl + $(call msg-gen, $@) +ifneq ($(USE_ASCIIDOCTOR),) + $(Q) $(ASCIIDOC) -b manpage -d manpage -o $@ $< +else + $(Q) $(ASCIIDOC) -b docbook -d manpage -o $(patsubst %.1,%.xml,$@) $< + $(Q) $(XMLTO) man -m custom.xsl $(patsubst %.1,%.xml,$@) 2>/dev/null +endif + +%.8: %.txt $(FOOTER) custom.xsl + $(call msg-gen, $@) +ifneq ($(USE_ASCIIDOCTOR),) + $(Q) $(ASCIIDOC) -b manpage -d manpage -o $@ $< +else + $(Q) $(ASCIIDOC) -b docbook -d manpage -o $(patsubst %.8,%.xml,$@) $< + $(Q) $(XMLTO) man -m custom.xsl $(patsubst %.8,%.xml,$@) 2>/dev/null +endif + +%.ps: %.1 + $(call msg-gen, $@) + $(Q) $(GROFF) $(GROFF_OPTS) $^ > $@ + +%.ps: %.8 + $(call msg-gen, $@) + $(Q) $(GROFF) $(GROFF_OPTS) $^ > $@ + +%.pdf: %.ps + $(call msg-gen, $@) + $(Q) ps2pdf $< $@ + +clean: + $(call msg-clean, "Documentation") + $(Q) rm -f $(XMLS) $(MANS) $(PSS) $(PDFS) $(FOOTER) + +install: check $(MANS) + $(E) " INSTALL " $(MAN8S) + $(Q) mkdir -p $(DESTDIR)$(MAN8DIR) + $(Q) install -m 644 $(MAN8S) $(DESTDIR)$(MAN8DIR) + $(E) " INSTALL " $(MAN1S) + $(Q) mkdir -p $(DESTDIR)$(MAN1DIR) + $(Q) install -m 644 $(MAN1S) $(DESTDIR)$(MAN1DIR) + +uninstall: + $(E) " UNINSTALL" $(MAN1S) + $(Q) $(RM) $(addprefix $(DESTDIR)$(MAN1DIR)/,$(MAN1S)) + $(E) " UNINSTALL" $(MAN8S) + $(Q) $(RM) $(addprefix $(DESTDIR)$(MAN8DIR)/,$(MAN8S)) + +.PHONY: clean install uninstall diff --git a/CRIU_code/Documentation/asciidoc.conf b/CRIU_code/Documentation/asciidoc.conf new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/CRIU_code/Documentation/asciidoc.conf @@ -0,0 +1 @@ + diff --git a/CRIU_code/Documentation/crit.txt b/CRIU_code/Documentation/crit.txt new file mode 100644 index 0000000..32636c5 --- /dev/null +++ b/CRIU_code/Documentation/crit.txt @@ -0,0 +1,58 @@ +CRIT(1) 
+======= +include::footer.txt[] + +NAME +---- +crit - CRiu Image Tool + +SYNOPSIS +-------- +*crit* 'decode' [-h] [-i IN] [-o OUT] [--pretty] + +*crit* 'encode' [-h] [-i IN] [-o OUT] + +*crit* 'info' [-h] in + +*crit* 'x' [-h] dir {ps,fds,mems} + +*crit* 'show' [-h] in + +DESCRIPTION +----------- +*crit* is a feature-rich replacement for existing *criu* show. + +ARGUMENTS +--------- + +Positional Arguments +~~~~~~~~~~~~~~~~~~~~ + +*decode*:: + convert *criu* image from binary type to JSON + +*encode*:: + convert *criu* image from JSON type to binary + +*info*:: + show info about image + +*x*:: + explore image directory + +*show*:: + convert *criu* image from binary to human-readable JSON + +Optional Arguments +~~~~~~~~~~~~~~~~~~ + +*-h*, *--help*:: + Print some help and exit + +SEE ALSO +-------- +criu(8) + +AUTHOR +------ +The CRIU team diff --git a/CRIU_code/Documentation/criu.txt b/CRIU_code/Documentation/criu.txt new file mode 100644 index 0000000..94fc542 --- /dev/null +++ b/CRIU_code/Documentation/criu.txt @@ -0,0 +1,748 @@ +CRIU(8) +======= +include::footer.txt[] + +NAME +---- +criu - checkpoint/restore in userspace + + +SYNOPSIS +-------- +*criu* 'command' ['option' ...] + + +DESCRIPTION +----------- +*criu* is a tool for checkpointing and restoring running applications. +It does this by saving their state as a collection of files (see the *dump* +command) and creating equivalent processes from those files (see the *restore* +command). The restore operation can be performed at a later time, +on a different system, or both. + + +OPTIONS +------- + +Most of the true / false long options (the ones without arguments) can be +prefixed with *--no-* to negate the option (example: *--display-stats* +and *--no-display-stats*). + +Common options +~~~~~~~~~~~~~~ +Common options are applicable to any 'command'. + +*-v*[*v*...], *--verbosity*:: + Increase verbosity up from the default level. Multiple *v* can be used, + each increasing verbosity by one level.
Using long option without argument + increases verbosity by one level. + +*-v*'num', *--verbosity*='num':: + Set verbosity level to 'num'. The higher the level, the more output + is produced. + + +The following levels are available: + * *-v0* + no output; + * *-v1* + only errors; + * *-v2* + above plus warnings (this is the default level); + * *-v3* + above plus information messages and timestamps; + * *-v4* + above plus lots of debug. + +*--config* 'file':: + Pass a specific configuration file to criu. + +*--no-default-config*:: + Forbid parsing of default configuration files. + +*--pidfile* 'file':: + Write root task, service or page-server pid into a 'file'. + +*-o*, *--log-file* 'file':: + Write logging messages to 'file'. + +*--display-stats*:: + During dump as well as during restore *criu* collects information + like the time required to dump or restore the process or the + number of pages dumped or restored. This information is always + written to the files 'stats-dump' and 'stats-restore' and can + be easily displayed using *crit*. The option *--display-stats* + additionally prints out this information on the console at the end + of a dump or a restore. + +*-D*, *--images-dir* 'path':: + Use 'path' as a base directory where to look for sets of image files. + +*--prev-images-dir* 'path':: + Use 'path' as a parent directory where to look for sets of image files. + This option makes sense in case of incremental dumps. + +*-W*, *--work-dir* 'dir':: + Use directory 'dir' for putting logs, pidfiles and statistics. If not + specified, 'path' from *-D* option is taken. + +*--close* 'fd':: + Close file descriptor 'fd' before performing any actions. + +*-L*, *--libdir* 'path':: + Path to plugins directory. + +*--action-script* 'script':: + Add an external action script to be executed at certain stages. 
+ The environment variable *CRTOOLS_SCRIPT_ACTION* is available + to the script to find out which action is being executed, and + its value can be one of the following: + *pre-dump*::: + run prior to beginning a *dump* + + *post-dump*::: + run upon *dump* completion + + *pre-restore*::: + run prior to beginning a *restore* + + *post-restore*::: + run upon *restore* completion + + *pre-resume*::: + run when all processes and resources are + restored but tasks are stopped waiting for + final kick to run. Must not fail. + + *post-resume*::: + called at the very end, when everything is + restored and processes were resumed + + *network-lock*::: + run to lock network in a target network namespace + + *network-unlock*::: + run to unlock network in a target network namespace + + *setup-namespaces*::: + run once root task has just been created + with required namespaces. Note it is an early stage + of restore, when nothing is restored yet, except for + namespaces themselves + + *post-setup-namespaces*::: + called after the namespaces are configured + + *orphan-pts-master*::: + called after master pty is opened and unlocked. This + hook can be used only in the RPC mode, and the + notification message contains a file descriptor for + the master pty + +*-V*, *--version*:: + Print program version and exit. + +*-h*, *--help*:: + Print some help and exit. + +*pre-dump* +~~~~~~~~~~ +Performs the pre-dump procedure, during which *criu* creates a snapshot of +memory changes since the previous *pre-dump*. Note that during this +*criu* also creates the fsnotify cache which speeds up the *restore* +procedure. *pre-dump* requires at least *-t* option (see *dump* below). +In addition, *page-server* options may be specified. + +*--track-mem*:: + Turn on memory changes tracker in the kernel. If the option is + not passed the memory tracker get turned on implicitly. + +*dump* +~~~~~~ +Performs a checkpoint procedure. 
+ +*-t*, *--tree* 'pid':: + Checkpoint the whole process tree starting from 'pid'. + +*-R*, *--leave-running*:: + Leave tasks in running state after checkpoint, instead of killing. This + option is pretty dangerous and should be used only if you understand + what you are doing. ++ +Note that if a task continues to run after being checkpointed, it can modify +TCP connections, delete files and do other dangerous actions. Therefore, +*criu* can not guarantee that the next *restore* action will succeed. +Most likely if this option is used, at least the file system snapshot +must be made with the help of *post-dump* action script. ++ +In other words, do not use it unless really needed. + +*-s*, *--leave-stopped*:: + Leave tasks in stopped state after checkpoint, instead of killing. + +*--external* 'type'*[*'id'*]:*'value':: + Dump an instance of an external resource. The generic syntax is + 'type' of resource, followed by resource 'id' (enclosed in literal + square brackets), and optional 'value' (prepended by a literal colon). + The following resource types are currently supported: *mnt*, *dev*, + *file*, *tty*, *unix*. Syntax depends on type. + Note to restore external resources, either *--external* or *--inherit-fd* + is used, depending on resource type. + +*--external mnt[*'mountpoint'*]:*'name':: + Dump an external bind mount referenced by 'mountpoint', saving it + to image under the identifier 'name'. + +*--external mnt[]:*'flags':: + Dump all external bind mounts, autodetecting those. Optional 'flags' + can contain *m* to also dump external master mounts, *s* to also + dump external shared mounts (default behavior is to abort dumping + if such mounts are found). If 'flags' are not provided, colon + is optional. + +*--external dev[*'major'*/*'minor'*]:*'name':: + Allow to dump a mount namespace having a real block device mounted. + A block device is identified by its 'major' and 'minor' numbers, + and *criu* saves its information to image under the identifier 'name'.
+ +*--external file[*'mnt_id'*:*'inode'*]*:: + Dump an external file, i.e. an opened file that cannot be resolved + from the current mount namespace, which can not be dumped without using + this option. The file is identified by 'mnt_id' (a field obtained from + */proc/*'pid'*/fdinfo/*'N') and 'inode' (as returned by *stat*(2)). + +*--external tty[*'rdev'*:*'dev'*]*:: + Dump an external TTY, identified by *st_rdev* and *st_dev* fields + returned by *stat*(2). + +*--external unix[*'id'*]*:: + Tell *criu* that one end of a pair of UNIX sockets (created by + *socketpair*(2)) with 'id' is OK to be disconnected. + +*--freeze-cgroup*:: + Use cgroup freezer to collect processes. + +*--manage-cgroups*:: + Collect cgroups into the image so that they are restored later. + Without this option, *criu* will not save cgroups configuration + associated with a task. + +*--cgroup-props* 'spec':: + Specify controllers and their properties to be saved into the + image file. *criu* predefines specifications for common controllers, + but since the kernel can add new controllers and modify their + properties, there should be a way to specify ones that match the kernel. ++ +'spec' argument describes the controller and properties specification in +a simplified YAML form: ++ +---------- +"c1": + - "strategy": "merge" + - "properties": ["a", "b"] +"c2": + - "strategy": "replace" + - "properties": ["c", "d"] +---------- ++ +where 'c1' and 'c2' are controllers names, and 'a', 'b', 'c', 'd' are +their properties. ++ +Note the format: double quotes, spaces and new lines are required. +The 'strategy' specifies what to do if a controller specified already +exists as a built-in one: *criu* can either *merge* or *replace* such.
++ +For example, the command line for the above example should look like this: ++ +---------- +--cgroup-props "\"c1\":\n - \"strategy\": \"merge\"\n - \"properties\": [\"a\", \"b\"]\n \"c2\":\n - \"strategy\": \"replace\"\n - \"properties\": [\"c\", \"d\"]" +---------- + +*--cgroup-props-file* 'file':: + Same as *--cgroup-props*, except the specification is read from + the 'file'. + +*--cgroup-dump-controller* 'name':: + Dump a controller with 'name' only, skipping anything else that was + discovered automatically (usually via */proc*). This option is + useful when one needs *criu* to skip some controllers. + +*--cgroup-props-ignore-default*:: + When combined with *--cgroup-props*, makes *criu* substitute + a predefined controller property with the new one shipped. If the option + is not used, the predefined properties are merged with the provided ones. + +*--tcp-established*:: + Checkpoint established TCP connections. + +*--skip-in-flight*:: + This option skips in-flight TCP connections. If any TCP connections + that are not yet completely established are found, *criu* ignores + these connections, rather than errors out. + The TCP stack on the client side is expected to handle the + re-connect gracefully. + +*--evasive-devices*:: + Use any path to a device file if the original one is inaccessible. + +*--page-server*:: + Send pages to a page server (see the *page-server* command). + +*--force-irmap*:: + Force resolving names for inotify and fsnotify watches. + +*--auto-dedup*:: + Deduplicate "old" data in pages images of previous *dump*. This option + implies incremental *dump* mode (see the *pre-dump* command). + +*-l*, *--file-locks*:: + Dump file locks. It is necessary to make sure that all file lock users + are taken into dump, so it is only safe to use this for enclosed containers + where locks are not held by any processes outside of dumped process tree. 
+ +*--link-remap*:: + Allows to link unlinked files back, if possible (modifies filesystem + during *restore*). + +*--ghost-limit* 'size':: + Set the maximum size of deleted file to be carried inside image. + By default, up to 1M file is allowed. Using this + option allows to not put big deleted files inside images. Argument + 'size' may be postfixed with a *K*, *M* or *G*, which stands for kilo-, + mega, and gigabytes, accordingly. + +*-j*, *--shell-job*:: + Allow one to dump shell jobs. This implies the restored task will + inherit session and process group ID from the *criu* itself. + This option also allows to migrate a single external tty connection, + to migrate applications like *top*. If used with *dump* command, + it must be specified with *restore* as well. + +*--cpu-cap* ['cap'[,'cap'...]]:: + Specify CPU capabilities to write to an image file. The argument is a + comma-separated list of: ++ + - *none* to ignore capabilities at all; the image will not be produced + on dump, neither any check performed on restore; + - *fpu* to check if FPU module is compatible; + - *ins* to check if CPU supports all instructions required; + - *cpu* to check if CPU capabilities are exactly matching; + - *all* for all above set. + ++ +By default the option is set to *fpu* and *ins*. + +*--cgroup-root* ['controller':]/'newroot':: + Change the root for the controller that will be dumped. By default, *criu* + simply dumps everything below where any of the tasks live. However, if a + container moves all of its tasks into a cgroup directory below the container + engine's default directory for tasks, permissions will not be preserved on + the upper directories with no tasks in them, which may cause problems. + +*--lazy-pages*:: + Perform the dump procedure without writing memory pages into the + image files and prepare to service page requests over the + network. 
When *dump* runs in this mode it presumes that + *lazy-pages* daemon will connect to it and fetch memory pages to + lazily inject them into the restored process address space. This + option is intended for post-copy (lazy) migration and should be + used in conjunction with *restore* with appropriate options. + +*restore* +~~~~~~~~~ +Restores previously checkpointed processes. + +*--inherit-fd* *fd[*'N'*]:*'resource':: + Inherit a file descriptor. This option lets *criu* use an already opened + file descriptor 'N' for restoring a file identified by 'resource'. + This option can be used to restore an external resource dumped + with the help of *--external* *file*, *tty*, and *unix* options. ++ +The 'resource' argument can be one of the following: ++ + - *tty[*'rdev'*:*'dev'*]* + - *pipe[*'inode'*]* + - *socket[*'inode'*]* + - *file[*'mnt_id'*:*'inode'*]* + - 'path/to/file' + ++ +Note that square brackets used in this option arguments are literals and +usually need to be escaped from shell. + +*-d*, *--restore-detached*:: + Detach *criu* itself once restore is complete. + +*-s*, *--leave-stopped*:: + Leave tasks in stopped state after restore (rather than resuming + their execution). + +*-S*, *--restore-sibling*:: + Restore root task as a sibling (makes sense only with + *--restore-detached*). + +*--log-pid*:: + Write separate logging files per each pid. + +*-r*, *--root* 'path':: + Change the root filesystem to 'path' (when run in a mount namespace). + +*--external* 'type'*[*'id'*]:*'value':: + Restore an instance of an external resource. The generic syntax is + 'type' of resource, followed by resource 'id' (enclosed in literal + square brackets), and optional 'value' (prepended by a literal colon). + The following resource types are currently supported: *mnt*, *dev*, + *veth*, *macvlan*. Syntax depends on type. 
Note to restore external + resources dealing with opened file descriptors (such as those dumped with + the help of *--external* *file*, *tty*, and *unix* options), option + *--inherit-fd* should be used. + +*--external mnt[*'name'*]:*'mountpoint':: + Restore an external bind mount referenced in the image by 'name', + bind-mounting it from the host 'mountpoint' to a proper mount point. + +*--external mnt[]*:: + Restore all external bind mounts (dumped with the help of + *--external mnt[]* auto-detection). + +*--external dev[*'name'*]:*'/dev/path':: + Restore an external mount device, identified in the image by 'name', + using the existing block device '/dev/path'. + +*--external veth[*'inner_dev'*]:*'outer_dev'*@*'bridge':: + Set the outer VETH device name (corresponding to 'inner_dev' being + restored) to 'outer_dev'. If optional *@*'bridge' is specified, + 'outer_dev' is added to that bridge. If the option is not used, + 'outer_dev' will be autogenerated by the kernel. + +*--external macvlan[*'inner_dev'*]:*'outer_dev':: + When restoring an image that has a MacVLAN device in it, this option + must be used to specify to which 'outer_dev' (an existing network device + in CRIU namespace) the restored 'inner_dev' should be bound. + +*--manage-cgroups* ['mode']:: + Restore cgroups configuration associated with a task from the image. + Controllers are always restored in an optimistic way -- if already present + in system, *criu* reuses it, otherwise it will be created. + +The 'mode' may be one of the following: + + *none*::: Do not restore cgroup properties but require cgroup to + pre-exist at the moment of *restore* procedure. + + *props*::: Restore cgroup properties and require cgroup to pre-exist. + + *soft*::: Restore cgroup properties only if cgroup has been created + by *criu*, otherwise do not restore properties. This is the + default if mode is unspecified. + + *full*::: Always restore all cgroups and their properties.
+ + *strict*::: Restore all cgroups and their properties from scratch, + requiring them to not be present in the system. + + *ignore*::: Don't deal with cgroups and pretend that they don't exist. + +*--cgroup-root* ['controller'*:*]/'newroot':: + Change the root cgroup the controller will be installed into. No controller + means that root is the default for all controllers not specified. + +*--tcp-established*:: + Restore previously dumped established TCP connections. This implies that + the network has been locked between *dump* and *restore* phases so the other + side of a connection simply notices a kind of lag. + +*--tcp-close*:: + Restore connected TCP sockets in closed state. + +*--veth-pair* 'IN'*=*'OUT':: + Correspondence between outside and inside names of veth devices. + +*-l*, *--file-locks*:: + Restore file locks from the image. + +*--lsm-profile* 'type'*:*'name':: + Specify an LSM profile to be used during restore. The `type` can be + either *apparmor* or *selinux*. + +*--auto-dedup*:: + As soon as a page is restored it gets punched out from the image. + +*-j*, *--shell-job*:: + Restore shell jobs, in other words inherit session and process group + ID from the criu itself. + +*--cpu-cap* ['cap'[,'cap'...]]:: + Specify CPU capabilities to be present on the CPU the process is + restoring. To invert a capability, prefix it with *^*. This option implies + that *--cpu-cap* has been passed on *dump* as well, except *fpu* option + case. The 'cap' argument can be the following (or a set of comma-separated + values): + + *all*::: Require all capabilities. This is *default* mode if *--cpu-cap* + is passed without arguments. Most safe mode. + + *cpu*::: Require the CPU to have all capabilities in image to match + runtime CPU. + + *fpu*::: Require the CPU to have compatible FPU. For example the process + might be dumped with xsave capability but attempted to restore + without it present on target CPU. In such case we refuse to + proceed.
This is *default* mode if *--cpu-cap* is not present + in command line. Note this argument might be passed even if + on the *dump* no *--cpu-cap* has been specified because FPU + frames are always encoded into images. + + *ins*::: Require CPU compatibility on instructions level. + + *none*::: Ignore capabilities. Most dangerous mode. The behaviour is + implementation dependent. Do not use it unless really + required. ++ +For example, this option can be used in case *--cpu-cap=cpu* was used +during *dump*, and images are migrated to a less capable CPU and are +to be restored. By default, *criu* shows an error that CPU capabilities +are not adequate, but this can be suppressed by using *--cpu-cap=none*. + +*--weak-sysctls*:: + Silently skip restoring sysctls that are not available. This allows + restoring on an older kernel, or a kernel configured without some + options. + +*--lazy-pages*:: + Restore the processes without filling out the entire memory + contents. When this option is used, *restore* sets up the + infrastructure required to fill memory pages either on demand when + the process accesses them or in the background without stopping the + restored process. + This option requires running *lazy-pages* daemon. + +*check* +~~~~~~~ +Checks whether the kernel supports the features needed by *criu* to +dump and restore a process tree. + +There are three categories of kernel support, as described below. *criu +check* always checks Category 1 features unless *--feature* is specified +which only checks a specified feature. + +*Category 1*::: Absolutely required. These are features like support for + */proc/PID/map_files*, *NETLINK_SOCK_DIAG* socket + monitoring, */proc/sys/kernel/ns_last_pid* etc. + +*Category 2*::: Required only for specific cases. These are features + like AIO remap, */dev/net/tun* and others that are only + required if a process being dumped or restored + is using those. + +*Category 3*::: Experimental.
These are features like *task-diag* that + are used for experimental purposes (mostly + during development). + +If there are no errors or warnings, *criu* prints "Looks good." and its +exit code is 0. + +A missing Category 1 feature causes *criu* to print "Does not look good." +and its exit code is non-zero. + +Missing Category 2 and 3 features cause *criu* to print "Looks good but +..." and its exit code is non-zero. + +Without any options, *criu check* checks Category 1 features. This +behavior can be changed by using the following options: + +*--extra*:: + Check kernel support for Category 2 features. + +*--experimental*:: + Check kernel support for Category 3 features. + +*--all*:: + Check kernel support for Category 1, 2, and 3 features. + +*--feature* 'name':: + Check a specific feature. If 'name' is *list*, a list of valid + kernel feature names that can be checked will be printed. + +*page-server* +~~~~~~~~~~~~~ +Launches *criu* in page server mode. + +*--daemon*:: + Runs page server as a daemon (background process). + +*--status-fd*:: + Write \0 to the FD and close it once page-server is ready to handle + requests. The status-fd allows to not daemonize a process and get its + exit code at the end. + --daemon and --status-fd are not supposed to be used together. + +*--address* 'address':: + Page server IP address or hostname. + +*--port* 'number':: + Page server port number. + +*--ps-socket* 'fd':: + Use provided file descriptor as socket for incoming connection. + In this case --address and --port are ignored. + Useful for intercepting page-server traffic e.g. to add encryption + or authentication. + +*--lazy-pages*:: + Serve local memory dump to a remote *lazy-pages* daemon. In this + mode the *page-server* reads local memory dump and allows the + remote *lazy-pages* daemon to request memory pages in random + order.
+ +*--tls-cacert* 'file':: + Specifies the path to a trusted Certificate Authority (CA) certificate + file to be used for verification of a client or server certificate. + The 'file' must be in PEM format. When this option is used only the + specified CA is used for verification. Otherwise, the system's trusted CAs + and, if present, '/etc/pki/CA/cacert.pem' will be used. + +*--tls-cacrl* 'file':: + Specifies a path to a Certificate Revocation List (CRL) 'file' which + contains a list of revoked certificates that should no longer be trusted. + The 'file' must be in PEM format. When this option is not specified, the + file '/etc/pki/CA/cacrl.pem' will be used, if present. + +*--tls-cert* 'file':: + Specifies a path to a file that contains an X.509 certificate to present + to the remote entity. The 'file' must be in PEM format. When this option + is not specified, the default location ('/etc/pki/criu/cert.pem') will be + used. + +*--tls-key* 'file':: + Specifies a path to a file that contains TLS private key. The 'file' must + be in PEM format. When this option is not specified, the default location + ('/etc/pki/criu/private/key.pem') will be used. + +*--tls*:: + Use TLS to secure remote connections. + +*lazy-pages* +~~~~~~~~~~~~ +Launches *criu* in lazy-pages daemon mode. + +The *lazy-pages* daemon is responsible for managing user-level demand +paging for the restored processes. It gets information required to +fill the process memory pages from the *restore* and from the +checkpoint directory. When a restored process accesses a certain memory +page for the first time, the *lazy-pages* daemon injects its contents +into the process address space. The memory pages that are not yet +requested by the restored processes are injected in the background. + +*exec* +~~~~~~ +Executes a system call inside a destination task\'s context. This functionality +is deprecated; please use *Compel* instead.
+ +*service* +~~~~~~~~~ +Launches *criu* in RPC daemon mode, where *criu* is listening for +RPC commands over socket to perform. This is convenient for a +case where daemon itself is running in a privileged (superuser) mode +but clients are not. + +dedup +~~~~~ +Starts pagemap data deduplication procedure, where *criu* scans over all +pagemap files and tries to minimize the number of pagemap entries by +obtaining the references from a parent pagemap image. + +cpuinfo dump +~~~~~~~~~~~~ +Fetches current CPU features and writes them into an image file. + +cpuinfo check +~~~~~~~~~~~~~ +Fetches current CPU features (i.e. CPU the *criu* is running on) and tests whether +they are compatible with the ones present in an image file. + + +CONFIGURATION FILES +------------------- +*Criu* supports usage of configuration files to avoid the need of writing every +option on command line, which is useful especially with repeated usage of +same options. A specific configuration file can be passed with +the "*--config* 'file'" option. If no file is passed, the default configuration +files '/etc/criu/default.conf' and '$HOME/.criu/default.conf' are parsed (if +present on the system). If the environment variable CRIU_CONFIG_FILE is set, +it will also be parsed. + +The options passed to CRIU via CLI, RPC or configuration file are evaluated +in the following order: + + - apply_config(/etc/criu/default.conf) + - apply_config($HOME/.criu/default.conf) + - apply_config(CRIU_CONFIG_FILE) + - apply_config(*--config* 'file') + - apply_config(CLI) or apply_config(RPC) + - apply_config(RPC configuration file) (only for RPC mode) + +Default configuration file parsing can be deactivated +with "*--no-default-config*" if needed. Parsed configuration files are merged +with command line options, which allows overriding boolean options. + +Configuration file syntax +~~~~~~~~~~~~~~~~~~~~~~~~~ +Comments are supported using \'#' sign. The rest of the line is ignored.
+Options are the same as command line options without the \'--' prefix, use +one option per line (with corresponding argument if applicable, divided by +whitespaces). If needed, the argument can be provided in double quotes (this +should be needed only if the argument contains whitespaces). In case this type +of argument contains a literal double quote as well, it can be escaped using +the \'\' sign. Usage of commands is disallowed and all other escape sequences +are interpreted literally. + +Example of configuration file to illustrate syntax: +--------------- +$ cat ~/.criu/default.conf +tcp-established +work-dir "/home/USERNAME/criu/my \"work\" directory" +#this is a comment +no-restore-sibling # this is another comment +--------------- + +Configuration files in RPC mode +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Not only does *criu* evaluate configuration files in CLI mode, it also +evaluates configuration files in RPC mode. Just as in CLI mode the +configuration file values are evaluated first. This means that any option +set via RPC will overwrite the configuration file setting. The user can +thus change *criu*'s default behavior but it is not possible to change +settings which are explicitly set by the RPC client. + +The RPC client can, however, specify an additional configuration file +which will be evaluated after the RPC options (see above for option evaluation +order). The RPC client can specify this additional configuration file +via "req.opts.config_file = '/path/to/file'". The values from this +configuration file will overwrite all other configuration file settings +or RPC options. 
*This can lead to undesired behavior of criu and +should only be used carefully.* + +EXAMPLES +-------- +To checkpoint a program with pid of *1234* and write all image files into +directory *checkpoint*: + +---------- + criu dump -D checkpoint -t 1234 +---------- + +To restore this program detaching criu itself: + +---------- + criu restore -d -D checkpoint +---------- + + +AUTHOR +------ +The CRIU team. + + +COPYRIGHT +--------- +Copyright \(C) 2011-2016, Parallels Holdings, Inc. diff --git a/CRIU_code/Documentation/custom.xsl b/CRIU_code/Documentation/custom.xsl new file mode 100644 index 0000000..663717e --- /dev/null +++ b/CRIU_code/Documentation/custom.xsl @@ -0,0 +1,8 @@ + + + + 1 + 1 + 1 + + diff --git a/CRIU_code/INSTALL.md b/CRIU_code/INSTALL.md new file mode 100644 index 0000000..d786d06 --- /dev/null +++ b/CRIU_code/INSTALL.md @@ -0,0 +1,32 @@ +## Installing CRIU from source code + +Once CRIU is built one can easily setup the complete CRIU package +(which includes executable itself, CRIT tool, libraries, manual +and etc) simply typing + + make install + +this command accepts the following variables: + + * **DESTDIR**, to specify global root where all components will be placed under (empty by default); + * **PREFIX**, to specify additional prefix for path of every component installed (`/usr/local` by default); + * **BINDIR**, to specify where to put CRIT tool (`$(PREFIX)/bin` by default); + * **SBINDIR**, to specify where to put CRIU executable (`$(PREFIX)/sbin` by default); + * **MANDIR**, to specify directory for manual pages (`$(PREFIX)/share/man` by default); + * **LIBDIR**, to specify directory where to put libraries (guess the correct path by default). + +Thus one can type + + make DESTDIR=/some/new/place install + +and get everything installed under `/some/new/place`. + +## Uninstalling CRIU + +To clean up previously installed CRIU instance one can type + + make uninstall + +and everything should be removed. 
Note though that if some variable (**DESTDIR**, **BINDIR** +and such) has been used during installation procedure, the same *must* be passed with +uninstall action. diff --git a/CRIU_code/Makefile b/CRIU_code/Makefile new file mode 100644 index 0000000..e27a280 --- /dev/null +++ b/CRIU_code/Makefile @@ -0,0 +1,400 @@ +__nmk_dir=$(CURDIR)/scripts/nmk/scripts/ +export __nmk_dir + +# +# No need to try to remake our Makefiles +Makefile: ; +Makefile.%: ; +scripts/%.mak: ; +$(__nmk_dir)%.mk: ; + +# +# Import the build engine +include $(__nmk_dir)include.mk +include $(__nmk_dir)macro.mk + +ifeq ($(origin HOSTCFLAGS), undefined) + HOSTCFLAGS := $(CFLAGS) $(USERCFLAGS) +endif + +UNAME-M := $(shell uname -m) + +# +# Supported Architectures +ifneq ($(filter-out x86 arm aarch64 ppc64 s390 riscv64,$(ARCH)),) + $(error "The architecture $(ARCH) isn't supported") +endif + +# The PowerPC 64 bits architecture could be big or little endian. +# They are handled in the same way. +ifeq ($(UNAME-M),ppc64) + error := $(error ppc64 big endian is not yet supported) +endif + +# +# Architecture specific options. +ifeq ($(ARCH),arm) + ARMV := $(shell echo $(UNAME-M) | sed -nr 's/armv([[:digit:]]).*/\1/p; t; i7') + DEFINES := -DCONFIG_ARMV$(ARMV) -DCONFIG_VDSO_32 + + ifeq ($(ARMV),6) + USERCFLAGS += -march=armv6 + endif + + ifeq ($(ARMV),7) + USERCFLAGS += -march=armv7-a + endif + + PROTOUFIX := y + # For simplicity - compile code in Arm mode without interwork. + # We could choose Thumb mode as default instead - but a dirty + # experiment shows that with 90Kb PIEs Thumb code doesn't save + # even one page. So, let's stick so far to Arm mode as it's more + # universal around all different Arm variations, until someone + # will find any use for Thumb mode. 
-dima + CFLAGS_PIE := -marm +endif + +ifeq ($(ARCH),aarch64) + DEFINES := -DCONFIG_AARCH64 +endif + +ifeq ($(ARCH),ppc64) + LDARCH := powerpc:common64 + DEFINES := -DCONFIG_PPC64 -D__SANE_USERSPACE_TYPES__ +endif + +ifeq ($(ARCH),x86) + LDARCH := i386:x86-64 + DEFINES := -DCONFIG_X86_64 +endif + +ifeq ($(ARCH),riscv64) + + DEFINES := -DCONFIG_RISCV64 +endif +# +# CFLAGS_PIE: +# +# Ensure with -fno-optimize-sibling-calls that we don't create GOT +# (Global Offset Table) relocations with gcc compilers that don't have +# commit "S/390: Fix 64 bit sibcall". +ifeq ($(ARCH),s390) + ARCH := s390 + SRCARCH := s390 + DEFINES := -DCONFIG_S390 + CFLAGS_PIE := -fno-optimize-sibling-calls +endif + +CFLAGS_PIE += -DCR_NOGLIBC +export CFLAGS_PIE + +LDARCH ?= $(SRCARCH) +export LDARCH +export PROTOUFIX DEFINES + +# +# Independent options for all tools. +DEFINES += -D_FILE_OFFSET_BITS=64 +DEFINES += -D_GNU_SOURCE + +WARNINGS := -Wall -Wformat-security + +CFLAGS-GCOV := --coverage -fno-exceptions -fno-inline -fprofile-update=atomic +export CFLAGS-GCOV + +ifneq ($(GCOV),) + LDFLAGS += -lgcov + CFLAGS += $(CFLAGS-GCOV) +endif + +ifeq ($(ASAN),1) + CFLAGS-ASAN := -fsanitize=address + export CFLAGS-ASAN + CFLAGS += $(CFLAGS-ASAN) +endif + +ifneq ($(WERROR),0) + WARNINGS += -Werror +endif + +ifeq ($(DEBUG),1) + DEFINES += -DCR_DEBUG + CFLAGS += -O0 -ggdb3 +else + CFLAGS += -O2 -g +endif + +ifeq ($(GMON),1) + CFLAGS += -pg + GMONLDOPT += -pg +export GMON GMONLDOPT +endif + +AFLAGS += -D__ASSEMBLY__ +CFLAGS += $(USERCFLAGS) $(WARNINGS) $(DEFINES) -iquote include/ +HOSTCFLAGS += $(WARNINGS) $(DEFINES) -iquote include/ +export AFLAGS CFLAGS USERCFLAGS HOSTCFLAGS + +# Default target +all: criu lib crit +.PHONY: all + +# +# Version headers.
+include Makefile.versions + +VERSION_HEADER := criu/include/version.h +GITID_FILE := .gitid +GITID := $(shell if [ -d ".git" ]; then git describe --always; fi) + +# Git repository wasn't inited in CRIU folder +ifeq ($(GITID),) + GITID := 0 +else + GITID_FILE_VALUE := $(shell if [ -f '$(GITID_FILE)' ]; then if [ `cat '$(GITID_FILE)'` = $(GITID) ]; then echo y; fi; fi) + ifneq ($(GITID_FILE_VALUE),y) + .PHONY: $(GITID_FILE) + endif +endif + +$(GITID_FILE): + $(call msg-gen, $@) + $(Q) echo "$(GITID)" > $(GITID_FILE) + +$(VERSION_HEADER): Makefile.versions $(GITID_FILE) + $(call msg-gen, $@) + $(Q) echo "/* Autogenerated, do not edit */" > $@ + $(Q) echo "#ifndef __CR_VERSION_H__" >> $@ + $(Q) echo "#define __CR_VERSION_H__" >> $@ + $(Q) echo "#define CRIU_VERSION \"$(CRIU_VERSION)\"" >> $@ + $(Q) echo "#define CRIU_VERSION_MAJOR " $(CRIU_VERSION_MAJOR) >> $@ + $(Q) echo "#define CRIU_VERSION_MINOR " $(CRIU_VERSION_MINOR) >> $@ +ifneq ($(CRIU_VERSION_SUBLEVEL),) + $(Q) echo "#define CRIU_VERSION_SUBLEVEL " $(CRIU_VERSION_SUBLEVEL) >> $@ +endif +ifneq ($(CRIU_VERSION_EXTRA),) + $(Q) echo "#define CRIU_VERSION_EXTRA " $(CRIU_VERSION_EXTRA) >> $@ +endif + $(Q) echo "#define CRIU_GITID \"$(GITID)\"" >> $@ + $(Q) echo "#endif /* __CR_VERSION_H__ */" >> $@ + +criu-deps += $(VERSION_HEADER) + +# +# Setup proper link for asm headers in common code. +include/common/asm: include/common/arch/$(ARCH)/asm + $(call msg-gen, $@) + $(Q) ln -s ./arch/$(ARCH)/asm $@ + +criu-deps += include/common/asm + +# +# Configure variables. +export CONFIG_HEADER := include/common/config.h +ifeq ($(filter tags etags cscope clean mrproper,$(MAKECMDGOALS)),) +include Makefile.config +else +# To clean all files, enable make/build options here +export CONFIG_COMPAT := y +export CONFIG_GNUTLS := y +endif + +# +# Protobuf images first, they are not depending +# on anything else. 
+$(eval $(call gen-built-in,images)) +criu-deps += images/built-in.o + +# +# Compel get used by CRIU, build it earlier +include Makefile.compel + +# +# Next the socket CR library +# +SOCCR_A := soccr/libsoccr.a +soccr/Makefile: ; +soccr/%: $(CONFIG_HEADER) .FORCE + $(Q) $(MAKE) $(build)=soccr $@ +soccr/built-in.o: $(CONFIG_HEADER) .FORCE + $(Q) $(MAKE) $(build)=soccr all +$(SOCCR_A): |soccr/built-in.o +criu-deps += $(SOCCR_A) + +# +# CRIU building done in own directory +# with slightly different rules so we +# can't use nmk engine directly (we +# build syscalls library and such). +# +# But note that we're already included +# the nmk so we can reuse it there. +criu/Makefile: ; +criu/Makefile.packages: ; +criu/Makefile.crtools: ; +criu/%: $(criu-deps) .FORCE + $(Q) $(MAKE) $(build)=criu $@ +criu: $(criu-deps) + $(Q) $(MAKE) $(build)=criu all +.PHONY: criu + +crit/Makefile: ; +crit/%: criu .FORCE + $(Q) $(MAKE) $(build)=crit $@ +crit: criu + $(Q) $(MAKE) $(build)=crit all +.PHONY: crit + + +# +# Libraries next once crit it ready +# (we might generate headers and such +# when building criu itself). 
+lib/Makefile: ; +lib/%: crit .FORCE + $(Q) $(MAKE) $(build)=lib $@ +lib: crit + $(Q) $(MAKE) $(build)=lib all +.PHONY: lib + +clean mrproper: + $(Q) $(MAKE) $(build)=images $@ + $(Q) $(MAKE) $(build)=criu $@ + $(Q) $(MAKE) $(build)=soccr $@ + $(Q) $(MAKE) $(build)=lib $@ + $(Q) $(MAKE) $(build)=compel $@ + $(Q) $(MAKE) $(build)=compel/plugins $@ + $(Q) $(MAKE) $(build)=lib $@ + $(Q) $(MAKE) $(build)=crit $@ +.PHONY: clean mrproper + +clean-top: + $(Q) $(MAKE) -C Documentation clean + $(Q) $(MAKE) $(build)=test/compel clean + $(Q) $(RM) .gitid +.PHONY: clean-top + +clean: clean-top + +mrproper-top: clean-top + $(Q) $(RM) $(CONFIG_HEADER) + $(Q) $(RM) $(VERSION_HEADER) + $(Q) $(RM) $(COMPEL_VERSION_HEADER) + $(Q) $(RM) include/common/asm + $(Q) $(RM) compel/include/asm + $(Q) $(RM) cscope.* + $(Q) $(RM) tags TAGS +.PHONY: mrproper-top + +mrproper: mrproper-top + +# +# Non-CRIU stuff. +# + +docs: + $(Q) $(MAKE) -s -C Documentation all +.PHONY: docs + +zdtm: all + $(Q) $(MAKE) -C test/zdtm all +.PHONY: zdtm + +test: zdtm + $(Q) $(MAKE) -C test +.PHONY: test + +# +# Generating tar requires tag matched CRIU_VERSION. +# If not found then simply use GIT's describe with +# "v" prefix stripped. +head-name := $(shell git tag -l v$(CRIU_VERSION)) +ifeq ($(head-name),) + head-name := $(shell git describe 2>/dev/null) +endif +# If no git tag could describe current commit, +# use pre-defined CRIU_VERSION with GITID (if any). +ifeq ($(head-name),) + ifneq ($(GITID),) + head-name := $(CRIU_VERSION)-$(GITID) + else + head-name := $(CRIU_VERSION) + endif +endif +tar-name := $(shell echo $(head-name) | sed -e 's/^v//g') +criu-$(tar-name).tar.bz2: + git archive --format tar --prefix 'criu-$(tar-name)/' $(head-name) | bzip2 > $@ +dist tar: criu-$(tar-name).tar.bz2 ; +.PHONY: dist tar + +TAGS_FILES_REGEXP := . -name '*.[hcS]' ! -path './.*' \( ! 
-path './test/*' -o -path './test/zdtm/lib/*' \) +tags: + $(call msg-gen, $@) + $(Q) $(RM) tags + $(Q) $(FIND) $(TAGS_FILES_REGEXP) -print | xargs $(CTAGS) -a +.PHONY: tags + +etags: + $(call msg-gen, $@) + $(Q) $(RM) TAGS + $(Q) $(FIND) $(TAGS_FILES_REGEXP) -print | xargs $(ETAGS) -a +.PHONY: etags + + +cscope: + $(call msg-gen, $@) + $(Q) $(FIND) $(TAGS_FILES_REGEXP) ! -type l -print > cscope.files + $(Q) $(CSCOPE) -bkqu +.PHONY: cscope + +gcov: + $(E) " GCOV" + $(Q) test -d gcov || mkdir gcov && \ + geninfo --output-filename gcov/criu.info --no-recursion criu/ && \ + cd gcov && \ + genhtml --rc lcov_branch_coverage=1 --output-directory html criu.info + @echo "Code coverage report is in `pwd`/gcov/html/ directory." +.PHONY: gcov + +docker-build: + $(MAKE) -C scripts/build/ x86_64 +.PHONY: docker-build + +docker-test: + docker run --rm -it --privileged criu-x86_64 ./test/zdtm.py run -a -x tcp6 -x tcpbuf6 -x static/rtc -x cgroup +.PHONY: docker-test + +help: + @echo ' Targets:' + @echo ' all - Build all [*] targets' + @echo ' * criu - Build criu' + @echo ' zdtm - Build zdtm test-suite' + @echo ' docs - Build documentation' + @echo ' install - Install CRIU (see INSTALL.md)' + @echo ' uninstall - Uninstall CRIU' + @echo ' dist - Create a source tarball' + @echo ' clean - Clean most, but leave enough to navigate' + @echo ' mrproper - Delete all compiled/generated files' + @echo ' tags - Generate tags file (ctags)' + @echo ' etags - Generate TAGS file (etags)' + @echo ' cscope - Generate cscope database' + @echo ' test - Run zdtm test-suite' + @echo ' gcov - Make code coverage report' +.PHONY: help + +lint: + flake8 --config=scripts/flake8.cfg test/zdtm.py + flake8 --config=scripts/flake8.cfg test/inhfd/*.py + flake8 --config=scripts/flake8.cfg test/others/rpc/config_file.py + +include Makefile.install + +.DEFAULT_GOAL := all + +# Disable implicit rules in _this_ Makefile. +.SUFFIXES: + +# +# Optional local include. 
+-include Makefile.local diff --git a/CRIU_code/Makefile.compel b/CRIU_code/Makefile.compel new file mode 100644 index 0000000..764afad --- /dev/null +++ b/CRIU_code/Makefile.compel @@ -0,0 +1,77 @@ +COMPEL_BIN := ./compel/compel-host +export COMPEL_BIN + +COMPEL_VERSION_HEADER := compel/include/version.h + +$(COMPEL_VERSION_HEADER): Makefile.versions + $(call msg-gen, $(COMPEL_VERSION_HEADER)) + $(Q) echo "/* Autogenerated, do not edit */" > $(COMPEL_VERSION_HEADER) + $(Q) echo "#ifndef COMPEL_SO_VERSION_H__" >> $(COMPEL_VERSION_HEADER) + $(Q) echo "#define COMPEL_SO_VERSION_H__" >> $(COMPEL_VERSION_HEADER) + $(Q) echo "#define COMPEL_SO_VERSION \"$(COMPEL_SO_VERSION)\"" >> $(COMPEL_VERSION_HEADER) + $(Q) echo "#define COMPEL_SO_VERSION_MAJOR " $(COMPEL_SO_VERSION_MAJOR) >> $(COMPEL_VERSION_HEADER) + $(Q) echo "#define COMPEL_SO_VERSION_MINOR " $(COMPEL_SO_VERSION_MINOR) >> $(COMPEL_VERSION_HEADER) + $(Q) echo "#define COMPEL_SO_VERSION_SUBLEVEL " $(COMPEL_SO_VERSION_SUBLEVEL) >> $(COMPEL_VERSION_HEADER) + $(Q) echo "#endif /* COMPEL_SO_VERSION_H__ */" >> $(COMPEL_VERSION_HEADER) + +compel/include/asm: + $(call msg-gen, $@) + $(Q) ln -s ../arch/$(ARCH)/src/lib/include $@ + +compel-deps += compel/include/asm +compel-deps += $(COMPEL_VERSION_HEADER) +compel-deps += $(CONFIG_HEADER) +compel-deps += include/common/asm +compel-plugins += compel/plugins/std.lib.a compel/plugins/fds.lib.a + +LIBCOMPEL_SO := libcompel.so +LIBCOMPEL_A := libcompel.a +export LIBCOMPEL_SO LIBCOMPEL_A + +# +# Compel itself. +compel/Makefile: ; +compel/%: $(compel-deps) $(compel-plugins) .FORCE + $(Q) $(MAKE) $(build)=compel $@ + +criu-deps += compel/compel-host-bin + +# +# Make sure the host program is ready after the +# library and plugins are built. 
+compel/compel-host-bin: | compel/$(LIBCOMPEL_A) $(compel-plugins) +$(COMPEL_BIN): compel/compel-host-bin + +# +# Plugins +compel/plugins/Makefile: ; +compel/plugins/%: $(compel-deps) .FORCE + $(Q) $(MAKE) $(build)=compel/plugins $@ + +# +# GNU make 4.x supports targets matching via wide +# match targeting, where GNU make 3.x series (used on +# Travis) is not, so we have to write them here explicitly. +compel/plugins/std.lib.a: $(compel-deps) .FORCE + $(Q) $(MAKE) $(build)=compel/plugins $@ + +compel/plugins/shmem.lib.a: $(compel-deps) compel/plugins/std.lib.a .FORCE + $(Q) $(MAKE) $(build)=compel/plugins $@ + +compel/plugins/fds.lib.a: $(compel-deps) compel/plugins/std.lib.a .FORCE + $(Q) $(MAKE) $(build)=compel/plugins $@ + +compel/compel: compel/built-in.o compel/$(LIBCOMPEL_A) | $(compel-deps) + $(call msg-link, $@) + $(Q) $(CC) $(CFLAGS) $^ $(WRAPFLAGS) $(LDFLAGS) -rdynamic -o $@ + +# +# And compel library. +LIBCOMPEL_SO_CFLAGS += $(CFLAGS) -rdynamic -Wl,-soname,$(LIBCOMPEL_SO).$(COMPEL_SO_VERSION_MAJOR) +compel/$(LIBCOMPEL_SO): compel/$(LIBCOMPEL_A) + $(call msg-link, $@) + $(Q) $(CC) -shared $(LIBCOMPEL_SO_CFLAGS) -o $@ -Wl,--whole-archive $^ -Wl,--no-whole-archive $(LDFLAGS) + +compel-install-targets += compel/$(LIBCOMPEL_SO) +compel-install-targets += compel/compel +compel-install-targets += $(compel-plugins) diff --git a/CRIU_code/Makefile.config b/CRIU_code/Makefile.config new file mode 100644 index 0000000..6f73240 --- /dev/null +++ b/CRIU_code/Makefile.config @@ -0,0 +1,71 @@ +include $(__nmk_dir)utils.mk +include $(__nmk_dir)msg.mk +include scripts/feature-tests.mak + +ifeq ($(call try-cc,$(FEATURE_TEST_LIBBSD_DEV),-lbsd),true) + LIBS_FEATURES += -lbsd + FEATURE_DEFINES += -DCONFIG_HAS_LIBBSD +else + $(info Note: Building without setproctitle() and strlcpy() support.) + $(info $(info) To enable these features, please install libbsd-devel (RPM) / libbsd-dev (DEB).) 
+endif + +ifeq ($(call pkg-config-check,libselinux),y) + LIBS_FEATURES += -lselinux + FEATURE_DEFINES += -DCONFIG_HAS_SELINUX +endif + +ifeq ($(NO_GNUTLS)x$(call pkg-config-check,gnutls),xy) + LIBS_FEATURES += -lgnutls + export CONFIG_GNUTLS := y + FEATURE_DEFINES += -DCONFIG_GNUTLS +else + $(info Note: Building without GnuTLS support) +endif + +export LIBS += $(LIBS_FEATURES) + +CONFIG_FILE = .config + +$(CONFIG_FILE): + touch $(CONFIG_FILE) + +ifeq ($(SRCARCH),x86) +# CONFIG_COMPAT is only for x86 now, no need for compile-test other archs +ifeq ($(call try-asm,$(FEATURE_TEST_X86_COMPAT)),true) + export CONFIG_COMPAT := y + FEATURE_DEFINES += -DCONFIG_COMPAT +else + $(info Note: Building without ia32 C/R, missed ia32 support in gcc) + $(info $(info) That may be related to missing gcc-multilib in your) + $(info $(info) distribution or you may have Debian with buggy toolchain) + $(info $(info) (issue https://github.com/xemul/criu/issues/315)) +endif +endif + +export DEFINES += $(FEATURE_DEFINES) +export CFLAGS += $(FEATURE_DEFINES) + +FEATURES_LIST := TCP_REPAIR STRLCPY STRLCAT PTRACE_PEEKSIGINFO \ + SETPROCTITLE_INIT MEMFD TCP_REPAIR_WINDOW + +# $1 - config name +define gen-feature-test +ifeq ($$(call try-cc,$$(FEATURE_TEST_$(1)),$$(LIBS_FEATURES),$$(DEFINES)),true) + $(Q) echo '#define CONFIG_HAS_$(1)' >> $$@ + $(Q) echo '' >> $$@ +endif +endef + +define config-header-rule +$(CONFIG_HEADER): scripts/feature-tests.mak $(CONFIG_FILE) + $(call msg-gen, $$@) + $(Q) echo '#ifndef __CR_CONFIG_H__' > $$@ + $(Q) echo '#define __CR_CONFIG_H__' >> $$@ + $(Q) echo '' >> $$@ +$(call map,gen-feature-test,$(FEATURES_LIST)) + $(Q) cat $(CONFIG_FILE) | sed -n -e '/^[^#]/s/^/#define CONFIG_/p' >> $$@ + $(Q) echo '#endif /* __CR_CONFIG_H__ */' >> $$@ +endef + +$(eval $(config-header-rule)) diff --git a/CRIU_code/Makefile.install b/CRIU_code/Makefile.install new file mode 100644 index 0000000..3987bcc --- /dev/null +++ b/CRIU_code/Makefile.install @@ -0,0 +1,57 @@ +# +# Installation 
paths. +PREFIX ?= /usr/local +BINDIR ?= $(PREFIX)/bin +SBINDIR ?= $(PREFIX)/sbin +MANDIR ?= $(PREFIX)/share/man +INCLUDEDIR ?= $(PREFIX)/include +LIBEXECDIR ?= $(PREFIX)/libexec +RUNDIR ?= /run + +# +# For recent Debian/Ubuntu with multiarch support. +DEB_HOST_MULTIARCH := $(shell dpkg-architecture -qDEB_HOST_MULTIARCH 2>/dev/null) +ifneq "$(DEB_HOST_MULTIARCH)" "" + LIBDIR ?= $(PREFIX)/lib/$(DEB_HOST_MULTIARCH) +else + # + # For most other systems + ifeq "$(shell uname -m)" "x86_64" + LIBDIR ?= $(PREFIX)/lib64 + endif +endif + +# +# LIBDIR falls back to the standard path. +LIBDIR ?= $(PREFIX)/lib + +export PREFIX BINDIR SBINDIR MANDIR RUNDIR +export LIBDIR INCLUDEDIR LIBEXECDIR + +install-man: + $(Q) $(MAKE) -C Documentation install +.PHONY: install-man + +install-lib: lib + $(Q) $(MAKE) $(build)=lib install +.PHONY: install-lib + +install-criu: criu + $(Q) $(MAKE) $(build)=criu install +.PHONY: install-criu + +install-compel: $(compel-install-targets) + $(Q) $(MAKE) $(build)=compel install + $(Q) $(MAKE) $(build)=compel/plugins install +.PHONY: install-compel + +install: install-man install-lib install-criu install-compel ; +.PHONY: install + +uninstall: + $(Q) $(MAKE) -C Documentation $@ + $(Q) $(MAKE) $(build)=lib $@ + $(Q) $(MAKE) $(build)=criu $@ + $(Q) $(MAKE) $(build)=compel $@ + $(Q) $(MAKE) $(build)=compel/plugins $@ +.PHONY: uninstall diff --git a/CRIU_code/Makefile.versions b/CRIU_code/Makefile.versions new file mode 100644 index 0000000..6d4e15e --- /dev/null +++ b/CRIU_code/Makefile.versions @@ -0,0 +1,31 @@ +# +# CRIU version. 
+CRIU_VERSION_MAJOR := 3 +CRIU_VERSION_MINOR := 12 +CRIU_VERSION_SUBLEVEL := +CRIU_VERSION_EXTRA := +CRIU_VERSION_NAME := Ice Penguin +CRIU_VERSION := $(CRIU_VERSION_MAJOR)$(if $(CRIU_VERSION_MINOR),.$(CRIU_VERSION_MINOR))$(if $(CRIU_VERSION_SUBLEVEL),.$(CRIU_VERSION_SUBLEVEL))$(if $(CRIU_VERSION_EXTRA),.$(CRIU_VERSION_EXTRA)) + +export CRIU_VERSION_MAJOR CRIU_VERSION_MINOR CRIU_VERSION_SUBLEVEL +export CRIU_VERSION_EXTRA CRIU_VERSION_NAME CRIU_VERSION + +# +# C library for CRIU. +CRIU_SO_VERSION_MAJOR := 2 +CRIU_SO_VERSION_MINOR := 0 + +export CRIU_SO_VERSION_MAJOR CRIU_SO_VERSION_MINOR + +# +# SOCCR library. +SOCCR_SO_VERSION_MAJOR := 1 +SOCCR_SO_VERSION_MINOR := 0 + +export SOCCR_SO_VERSION_MAJOR SOCCR_SO_VERSION_MINOR + +COMPEL_SO_VERSION_MAJOR := 1 +COMPEL_SO_VERSION_MINOR := 0 +COMPEL_SO_VERSION_SUBLEVEL := 0 + +export COMPEL_SO_VERSION_MAJOR COMPEL_SO_VERSION_MINOR COMPEL_SO_VERSION_SUBLEVEL diff --git a/README.en.md b/CRIU_code/README.en.md similarity index 86% rename from README.en.md rename to CRIU_code/README.en.md index bf587cf..430a78f 100644 --- a/README.en.md +++ b/CRIU_code/README.en.md @@ -1,7 +1,7 @@ -# DyscheOS-utils +# 13-蝴蝶队 #### Description -It provides utility tools for DyscheOS, including management tools, scripts, user-guide and kernel modules. +TOPIC_ID:13, TEAM_ID:1243952768, TEAM_NAME:蝴蝶队. 
#### Software Architecture Software architecture description diff --git a/CRIU_code/README.md b/CRIU_code/README.md new file mode 100644 index 0000000..6fe32af --- /dev/null +++ b/CRIU_code/README.md @@ -0,0 +1,121 @@ +<<<<<<< HEAD +<<<<<<< HEAD +[![master](https://travis-ci.org/checkpoint-restore/criu.svg?branch=master)](https://travis-ci.org/checkpoint-restore/criu) +[![development](https://travis-ci.org/checkpoint-restore/criu.svg?branch=criu-dev)](https://travis-ci.org/checkpoint-restore/criu) +[![Codacy Badge](https://api.codacy.com/project/badge/Grade/55251ec7db28421da4481fc7c1cb0cee)](https://www.codacy.com/app/xemul/criu?utm_source=github.com&utm_medium=referral&utm_content=xemul/criu&utm_campaign=Badge_Grade) +

+ +## CRIU -- A project to implement checkpoint/restore functionality for Linux + +CRIU (stands for Checkpoint and Restore in Userspace) is a utility to checkpoint/restore Linux tasks. + +Using this tool, you can freeze a running application (or part of it) and checkpoint +it to a hard drive as a collection of files. You can then use the files to restore and run the +application from the point it was frozen at. The distinctive feature of the CRIU +project is that it is mainly implemented in user space. There are some more projects +doing C/R for Linux, and so far CRIU [appears to be](https://criu.org/Comparison_to_other_CR_projects) +the most feature-rich and up-to-date with the kernel. + +The project [started](https://criu.org/History) as the way to do live migration for OpenVZ +Linux containers, but later grew into a more sophisticated and flexible tool. It is currently +used by (integrated into) OpenVZ, LXC/LXD, Docker, and other software; the project gets tremendous +help from the community, and its packages are included in many Linux distributions. + +The project home is at http://criu.org. This wiki contains all the knowledge base for CRIU we have. +Pages worth starting with are: +- [Installation instructions](http://criu.org/Installation) +- [A simple example of usage](http://criu.org/Simple_loop) +- [Examples of more advanced usage](https://criu.org/Category:HOWTO) +- Troubleshooting can be hard, some help can be found [here](https://criu.org/When_C/R_fails), [here](https://criu.org/What_cannot_be_checkpointed) and [here](https://criu.org/FAQ) + +### Checkpoint and restore of simple loop process +[

](https://asciinema.org/a/232445) + +## Advanced features + +As main usage for CRIU is live migration, there's a library for it called P.Haul. Also the +project exposes two cool core features as standalone libraries. These are libcompel for parasite code +injection and libsoccr for TCP connections checkpoint-restore. + +### Live migration + +True [live migration](https://criu.org/Live_migration) using CRIU is possible, but doing +all the steps by hands might be complicated. The [phaul sub-project](https://criu.org/P.Haul) +provides a Go library that encapsulates most of the complexity. This library and the Go bindings +for CRIU are stored in the [go-criu](https://github.com/checkpoint-restore/go-criu) repository. + + +### Parasite code injection + +In order to get state of the running process CRIU needs to make this process execute +some code, that would fetch the required information. To make this happen without +killing the application itself, CRIU uses the [parasite code injection](https://criu.org/Parasite_code) +technique, which is also available as a standalone library called [libcompel](https://criu.org/Compel). + +### TCP sockets checkpoint-restore + +One of the CRIU features is the ability to save and restore state of a TCP socket +without breaking the connection. This functionality is considered to be useful by +itself, and we have it available as the [libsoccr library](https://criu.org/Libsoccr). + +## How to contribute + +CRIU project is (almost) the never-ending story, because we have to always keep up with the +Linux kernel supporting checkpoint and restore for all the features it provides. Thus we're +looking for contributors of all kinds -- feedback, bug reports, testing, coding, writing, etc. +Here are some useful hints to get involved. 
+ +* We have both -- [very simple](https://github.com/xemul/criu/issues?q=is%3Aissue+is%3Aopen+label%3Aenhancement) and [more sophisticated](https://github.com/xemul/criu/issues?q=is%3Aissue+is%3Aopen+label%3A%22new+feature%22) coding tasks; +* CRIU does need [extensive testing](https://github.com/xemul/criu/issues?q=is%3Aissue+is%3Aopen+label%3Atesting); +* Documentation is always hard, we have [some information](https://criu.org/Category:Empty_articles) that is to be extracted from people's heads into wiki pages as well as [some texts](https://criu.org/Category:Editor_help_needed) that all need to be converted into useful articles; +* Feedback is expected on the github issues page and on the [mailing list](https://lists.openvz.org/mailman/listinfo/criu); +* For historical reasons we do not accept PRs, instead [patches are welcome](http://criu.org/How_to_submit_patches); +* Spread the word about CRIU in [social networks](http://criu.org/Contacts); +* If you're giving a talk about CRIU -- let us know, we'll mention it on the [wiki main page](https://criu.org/News/events); + +## Licence + +The project is licensed under GPLv2 (though files sitting in the lib/ directory are LGPLv2.1). +======= +======= +>>>>>>> eb42ab3a1a28c6ca4b2918cdaaa61a48a1e0c7d1 +# 13-蝴蝶队 + +#### 介绍 +TOPIC_ID:13, TEAM_ID:1243952768, TEAM_NAME:蝴蝶队. + +#### 软件架构 +软件架构说明 + + +#### 安装教程 + +1. xxxx +2. xxxx +3. xxxx + +#### 使用说明 + +1. xxxx +2. xxxx +3. xxxx + +#### 参与贡献 + +1. Fork 本仓库 +2. 新建 Feat_xxx 分支 +3. 提交代码 +4. 新建 Pull Request + + +#### 特技 + +1. 使用 Readme\_XXX.md 来支持不同的语言,例如 Readme\_en.md, Readme\_zh.md +2. Gitee 官方博客 [blog.gitee.com](https://blog.gitee.com) +3. 你可以 [https://gitee.com/explore](https://gitee.com/explore) 这个地址来了解 Gitee 上的优秀开源项目 +4. [GVP](https://gitee.com/gvp) 全称是 Gitee 最有价值开源项目,是综合评定出的优秀开源项目 +5. Gitee 官方提供的使用手册 [https://gitee.com/help](https://gitee.com/help) +6. 
Gitee 封面人物是一档用来展示 Gitee 会员风采的栏目 [https://gitee.com/gitee-stars/](https://gitee.com/gitee-stars/) +<<<<<<< HEAD +>>>>>>> eb42ab3a1a28c6ca4b2918cdaaa61a48a1e0c7d1 +======= diff --git a/CRIU_code/README.md.orig b/CRIU_code/README.md.orig new file mode 100644 index 0000000..23faa02 --- /dev/null +++ b/CRIU_code/README.md.orig @@ -0,0 +1,122 @@ +<<<<<<< HEAD +<<<<<<< HEAD +[![master](https://travis-ci.org/checkpoint-restore/criu.svg?branch=master)](https://travis-ci.org/checkpoint-restore/criu) +[![development](https://travis-ci.org/checkpoint-restore/criu.svg?branch=criu-dev)](https://travis-ci.org/checkpoint-restore/criu) +[![Codacy Badge](https://api.codacy.com/project/badge/Grade/55251ec7db28421da4481fc7c1cb0cee)](https://www.codacy.com/app/xemul/criu?utm_source=github.com&utm_medium=referral&utm_content=xemul/criu&utm_campaign=Badge_Grade) +

+ +## CRIU -- A project to implement checkpoint/restore functionality for Linux + +CRIU (stands for Checkpoint and Restore in Userspace) is a utility to checkpoint/restore Linux tasks. + +Using this tool, you can freeze a running application (or part of it) and checkpoint +it to a hard drive as a collection of files. You can then use the files to restore and run the +application from the point it was frozen at. The distinctive feature of the CRIU +project is that it is mainly implemented in user space. There are some more projects +doing C/R for Linux, and so far CRIU [appears to be](https://criu.org/Comparison_to_other_CR_projects) +the most feature-rich and up-to-date with the kernel. + +The project [started](https://criu.org/History) as the way to do live migration for OpenVZ +Linux containers, but later grew into a more sophisticated and flexible tool. It is currently +used by (integrated into) OpenVZ, LXC/LXD, Docker, and other software; the project gets tremendous +help from the community, and its packages are included in many Linux distributions. + +The project home is at http://criu.org. This wiki contains all the knowledge base for CRIU we have. +Pages worth starting with are: +- [Installation instructions](http://criu.org/Installation) +- [A simple example of usage](http://criu.org/Simple_loop) +- [Examples of more advanced usage](https://criu.org/Category:HOWTO) +- Troubleshooting can be hard, some help can be found [here](https://criu.org/When_C/R_fails), [here](https://criu.org/What_cannot_be_checkpointed) and [here](https://criu.org/FAQ) + +### Checkpoint and restore of simple loop process +[

](https://asciinema.org/a/232445) + +## Advanced features + +As main usage for CRIU is live migration, there's a library for it called P.Haul. Also the +project exposes two cool core features as standalone libraries. These are libcompel for parasite code +injection and libsoccr for TCP connections checkpoint-restore. + +### Live migration + +True [live migration](https://criu.org/Live_migration) using CRIU is possible, but doing +all the steps by hands might be complicated. The [phaul sub-project](https://criu.org/P.Haul) +provides a Go library that encapsulates most of the complexity. This library and the Go bindings +for CRIU are stored in the [go-criu](https://github.com/checkpoint-restore/go-criu) repository. + + +### Parasite code injection + +In order to get state of the running process CRIU needs to make this process execute +some code, that would fetch the required information. To make this happen without +killing the application itself, CRIU uses the [parasite code injection](https://criu.org/Parasite_code) +technique, which is also available as a standalone library called [libcompel](https://criu.org/Compel). + +### TCP sockets checkpoint-restore + +One of the CRIU features is the ability to save and restore state of a TCP socket +without breaking the connection. This functionality is considered to be useful by +itself, and we have it available as the [libsoccr library](https://criu.org/Libsoccr). + +## How to contribute + +CRIU project is (almost) the never-ending story, because we have to always keep up with the +Linux kernel supporting checkpoint and restore for all the features it provides. Thus we're +looking for contributors of all kinds -- feedback, bug reports, testing, coding, writing, etc. +Here are some useful hints to get involved. 
+ +* We have both -- [very simple](https://github.com/xemul/criu/issues?q=is%3Aissue+is%3Aopen+label%3Aenhancement) and [more sophisticated](https://github.com/xemul/criu/issues?q=is%3Aissue+is%3Aopen+label%3A%22new+feature%22) coding tasks; +* CRIU does need [extensive testing](https://github.com/xemul/criu/issues?q=is%3Aissue+is%3Aopen+label%3Atesting); +* Documentation is always hard, we have [some information](https://criu.org/Category:Empty_articles) that is to be extracted from people's heads into wiki pages as well as [some texts](https://criu.org/Category:Editor_help_needed) that all need to be converted into useful articles; +* Feedback is expected on the github issues page and on the [mailing list](https://lists.openvz.org/mailman/listinfo/criu); +* For historical reasons we do not accept PRs, instead [patches are welcome](http://criu.org/How_to_submit_patches); +* Spread the word about CRIU in [social networks](http://criu.org/Contacts); +* If you're giving a talk about CRIU -- let us know, we'll mention it on the [wiki main page](https://criu.org/News/events); + +## Licence + +The project is licensed under GPLv2 (though files sitting in the lib/ directory are LGPLv2.1). +======= +======= +>>>>>>> eb42ab3a1a28c6ca4b2918cdaaa61a48a1e0c7d1 +# 13-蝴蝶队 + +#### 介绍 +TOPIC_ID:13, TEAM_ID:1243952768, TEAM_NAME:蝴蝶队. + +#### 软件架构 +软件架构说明 + + +#### 安装教程 + +1. xxxx +2. xxxx +3. xxxx + +#### 使用说明 + +1. xxxx +2. xxxx +3. xxxx + +#### 参与贡献 + +1. Fork 本仓库 +2. 新建 Feat_xxx 分支 +3. 提交代码 +4. 新建 Pull Request + + +#### 特技 + +1. 使用 Readme\_XXX.md 来支持不同的语言,例如 Readme\_en.md, Readme\_zh.md +2. Gitee 官方博客 [blog.gitee.com](https://blog.gitee.com) +3. 你可以 [https://gitee.com/explore](https://gitee.com/explore) 这个地址来了解 Gitee 上的优秀开源项目 +4. [GVP](https://gitee.com/gvp) 全称是 Gitee 最有价值开源项目,是综合评定出的优秀开源项目 +5. Gitee 官方提供的使用手册 [https://gitee.com/help](https://gitee.com/help) +6. 
Gitee 封面人物是一档用来展示 Gitee 会员风采的栏目 [https://gitee.com/gitee-stars/](https://gitee.com/gitee-stars/) +<<<<<<< HEAD +>>>>>>> eb42ab3a1a28c6ca4b2918cdaaa61a48a1e0c7d1 +======= +>>>>>>> eb42ab3a1a28c6ca4b2918cdaaa61a48a1e0c7d1 diff --git a/CRIU_code/compel/.gitignore b/CRIU_code/compel/.gitignore new file mode 100644 index 0000000..eab3337 --- /dev/null +++ b/CRIU_code/compel/.gitignore @@ -0,0 +1,14 @@ +arch/x86/plugins/std/sys-exec-tbl-64.c +arch/x86/plugins/std/syscalls-64.S +arch/arm/plugins/std/syscalls/syscalls.S +arch/aarch64/plugins/std/syscalls/syscalls.S +arch/s390/plugins/std/syscalls/syscalls.S +arch/ppc64/plugins/std/syscalls/syscalls.S +include/version.h +plugins/include/uapi/std/asm/syscall-types.h +plugins/include/uapi/std/syscall-64.h +plugins/include/uapi/std/syscall-codes-64.h +plugins/include/uapi/std/syscall-codes.h +plugins/include/uapi/std/syscall.h +plugins/include/uapi/std/syscall-aux.h +plugins/include/uapi/std/syscall-aux.S diff --git a/CRIU_code/compel/Makefile b/CRIU_code/compel/Makefile new file mode 100644 index 0000000..de9318c --- /dev/null +++ b/CRIU_code/compel/Makefile @@ -0,0 +1,81 @@ +include Makefile.versions + +COMPEL_SO_VERSION := $(COMPEL_SO_VERSION_MAJOR)$(if $(COMPEL_SO_VERSION_MINOR),.$(COMPEL_SO_VERSION_MINOR))$(if $(COMPEL_SO_VERSION_SUBLEVEL),.$(COMPEL_SO_VERSION_SUBLEVEL)) +COMPEL_SO_VERSION_CODE := $(shell expr $(COMPEL_SO_VERSION_MAJOR) \* 65536 \+ $(COMPEL_SO_VERSION_MINOR) \* 256 \+ $(COMPEL_SO_VERSION_SUBLEVEL)) +ccflags-y += -DINCLUDEDIR=\"$(INCLUDEDIR)\" +ccflags-y += -DLIBEXECDIR=\"$(LIBEXECDIR)\" +ccflags-y += -DLIBDIR=\"$(LIBDIR)\" +ccflags-y += -DSTATIC_LIB=\"$(LIBCOMPEL_A)\" +ccflags-y += -DDYN_LIB=\"$(LIBCOMPEL_SO).$(COMPEL_SO_VERSION_MAJOR)\" +ccflags-y += -iquote compel/arch/$(ARCH)/src/lib/include +ccflags-y += -iquote compel/include +ccflags-y += -fno-strict-aliasing +ccflags-y += -fPIC +ldflags-y += -r + +# +# UAPI inclusion, referred as +ccflags-y += -I compel/include/uapi + +lib-name := $(LIBCOMPEL_A) 
+lib-y += src/lib/log.o +host-lib-y += src/lib/log.o + +lib-y += arch/$(ARCH)/src/lib/cpu.o +lib-y += arch/$(ARCH)/src/lib/infect.o +lib-y += src/lib/infect-rpc.o +lib-y += src/lib/infect-util.o +lib-y += src/lib/infect.o +lib-y += src/lib/ptrace.o + +# handle_elf() has no support of ELF relocations on ARM (yet?) +ifneq ($(filter arm aarch64,$(ARCH)),) +CFLAGS += -DNO_RELOCS +HOSTCFLAGS += -DNO_RELOCS +endif + +obj-y += src/main.o +obj-y += arch/$(ARCH)/src/lib/handle-elf.o +obj-y += src/lib/handle-elf.o + +host-ccflags-y += $(ccflags-y) + +hostprogs-y += compel-host-bin +compel-host-bin-objs := $(patsubst %.o,%-host.o,$(obj-y) $(host-lib-y)) + +cleanup-y += compel/compel +cleanup-y += compel/compel-host-bin +cleanup-y += compel/libcompel.so + +install: compel/compel compel/$(LIBCOMPEL_SO) compel/$(LIBCOMPEL_A) + $(E) " INSTALL " compel + $(Q) mkdir -p $(DESTDIR)$(BINDIR) + $(Q) install -m 755 compel/compel $(DESTDIR)$(BINDIR) + $(E) " INSTALL " $(LIBCOMPEL_SO) + $(Q) mkdir -p $(DESTDIR)$(LIBDIR) + $(Q) install -m 0644 compel/$(LIBCOMPEL_SO) $(DESTDIR)$(LIBDIR) + $(Q) install -m 755 compel/$(LIBCOMPEL_SO) $(DESTDIR)$(LIBDIR)/$(LIBCOMPEL_SO).$(COMPEL_SO_VERSION_MAJOR).$(COMPEL_SO_VERSION_MINOR) + $(Q) ln -fns $(LIBCOMPEL_SO).$(COMPEL_SO_VERSION_MAJOR).$(COMPEL_SO_VERSION_MINOR) $(DESTDIR)$(LIBDIR)/$(LIBCOMPEL_SO).$(COMPEL_SO_VERSION_MAJOR) + $(Q) ln -fns $(LIBCOMPEL_SO).$(COMPEL_SO_VERSION_MAJOR).$(COMPEL_SO_VERSION_MINOR) $(DESTDIR)$(LIBDIR)/$(LIBCOMPEL_SO) + $(E) " INSTALL " $(LIBCOMPEL_A) + $(Q) install -m 0644 compel/$(LIBCOMPEL_A) $(DESTDIR)$(LIBDIR) + $(E) " INSTALL " compel uapi + $(Q) mkdir -p $(DESTDIR)$(INCLUDEDIR)/compel/asm + $(Q) cp compel/include/uapi/*.h $(DESTDIR)$(INCLUDEDIR)/compel/ + $(Q) cp compel/include/uapi/asm/*.h $(DESTDIR)$(INCLUDEDIR)/compel/asm/ + $(Q) mkdir -p $(DESTDIR)$(INCLUDEDIR)/compel/common/asm + $(Q) cp include/common/compiler.h $(DESTDIR)$(INCLUDEDIR)/compel/common/ +.PHONY: install + +uninstall: + $(E) " UNINSTALL" compel + 
$(Q) $(RM) $(addprefix $(DESTDIR)$(BINDIR)/,compel) + $(E) " UNINSTALL" $(LIBCOMPEL_SO) + $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBDIR)/,$(LIBCOMPEL_SO)) + $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBDIR)/,$(LIBCOMPEL_SO).$(COMPEL_SO_VERSION_MAJOR)) + $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBDIR)/,$(LIBCOMPEL_SO).$(COMPEL_SO_VERSION_MAJOR).$(COMPEL_SO_VERSION_MINOR)) + $(E) " UNINSTALL" $(LIBCOMPEL_A) + $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBDIR)/,$(LIBCOMPEL_A)) + $(E) " UNINSTALL" compel uapi + $(Q) $(RM) -rf $(addprefix $(DESTDIR)$(INCLUDEDIR)/,compel/*) +.PHONY: uninstall diff --git a/CRIU_code/compel/arch/aarch64/plugins/include/asm/prologue.h b/CRIU_code/compel/arch/aarch64/plugins/include/asm/prologue.h new file mode 100644 index 0000000..e0275e3 --- /dev/null +++ b/CRIU_code/compel/arch/aarch64/plugins/include/asm/prologue.h @@ -0,0 +1 @@ +../../../../../arch/x86/plugins/include/asm/prologue.h \ No newline at end of file diff --git a/CRIU_code/compel/arch/aarch64/plugins/include/asm/syscall-types.h b/CRIU_code/compel/arch/aarch64/plugins/include/asm/syscall-types.h new file mode 100644 index 0000000..ee0e218 --- /dev/null +++ b/CRIU_code/compel/arch/aarch64/plugins/include/asm/syscall-types.h @@ -0,0 +1,28 @@ +#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ +#define COMPEL_ARCH_SYSCALL_TYPES_H__ + +#define SA_RESTORER 0x04000000 + +typedef void rt_signalfn_t(int, siginfo_t *, void *); +typedef rt_signalfn_t *rt_sighandler_t; + +typedef void rt_restorefn_t(void); +typedef rt_restorefn_t *rt_sigrestore_t; + +#define _KNSIG 64 +#define _NSIG_BPW 64 + +#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) + +typedef struct { + unsigned long sig[_KNSIG_WORDS]; +} k_rtsigset_t; + +typedef struct { + rt_sighandler_t rt_sa_handler; + unsigned long rt_sa_flags; + rt_sigrestore_t rt_sa_restorer; + k_rtsigset_t rt_sa_mask; +} rt_sigaction_t; + +#endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ diff --git a/CRIU_code/compel/arch/aarch64/plugins/include/features.h 
b/CRIU_code/compel/arch/aarch64/plugins/include/features.h new file mode 100644 index 0000000..b4a3cde --- /dev/null +++ b/CRIU_code/compel/arch/aarch64/plugins/include/features.h @@ -0,0 +1,4 @@ +#ifndef __COMPEL_ARCH_FEATURES_H +#define __COMPEL_ARCH_FEATURES_H + +#endif /* __COMPEL_ARCH_FEATURES_H */ diff --git a/CRIU_code/compel/arch/aarch64/plugins/std/parasite-head.S b/CRIU_code/compel/arch/aarch64/plugins/std/parasite-head.S new file mode 100644 index 0000000..5e7067f --- /dev/null +++ b/CRIU_code/compel/arch/aarch64/plugins/std/parasite-head.S @@ -0,0 +1,20 @@ +#include "common/asm/linkage.h" + + .section .head.text, "ax" +ENTRY(__export_parasite_head_start) + adr x2, __export_parasite_head_start // get the address of this instruction + + ldr x0, __export_parasite_cmd + + ldr x1, parasite_args_ptr + add x1, x1, x2 // fixup __export_parasite_args + + bl parasite_service + brk #0 // the instruction BRK #0 generates the signal SIGTRAP in Linux + +parasite_args_ptr: + .quad __export_parasite_args + +__export_parasite_cmd: + .quad 0 +END(__export_parasite_head_start) diff --git a/CRIU_code/compel/arch/aarch64/plugins/std/syscalls/Makefile.syscalls b/CRIU_code/compel/arch/aarch64/plugins/std/syscalls/Makefile.syscalls new file mode 100644 index 0000000..eba4d98 --- /dev/null +++ b/CRIU_code/compel/arch/aarch64/plugins/std/syscalls/Makefile.syscalls @@ -0,0 +1 @@ +../../../../arm/plugins/std/syscalls/Makefile.syscalls \ No newline at end of file diff --git a/CRIU_code/compel/arch/aarch64/plugins/std/syscalls/gen-sys-exec-tbl.pl b/CRIU_code/compel/arch/aarch64/plugins/std/syscalls/gen-sys-exec-tbl.pl new file mode 100644 index 0000000..8d7e897 --- /dev/null +++ b/CRIU_code/compel/arch/aarch64/plugins/std/syscalls/gen-sys-exec-tbl.pl @@ -0,0 +1 @@ +../../../../arm/plugins/std/syscalls/gen-sys-exec-tbl.pl \ No newline at end of file diff --git a/CRIU_code/compel/arch/aarch64/plugins/std/syscalls/gen-syscalls.pl 
b/CRIU_code/compel/arch/aarch64/plugins/std/syscalls/gen-syscalls.pl new file mode 100644 index 0000000..5c95636 --- /dev/null +++ b/CRIU_code/compel/arch/aarch64/plugins/std/syscalls/gen-syscalls.pl @@ -0,0 +1 @@ +../../../../arm/plugins/std/syscalls/gen-syscalls.pl \ No newline at end of file diff --git a/CRIU_code/compel/arch/aarch64/plugins/std/syscalls/syscall-aux.S b/CRIU_code/compel/arch/aarch64/plugins/std/syscalls/syscall-aux.S new file mode 100644 index 0000000..00ccf79 --- /dev/null +++ b/CRIU_code/compel/arch/aarch64/plugins/std/syscalls/syscall-aux.S @@ -0,0 +1,37 @@ +/** + * This source contains emulation of syscalls + * that are not implemented in the AArch64 Linux kernel + */ + +ENTRY(sys_open) + mov x3, x2 + mov x2, x1 + mov x1, x0 + mov x0, #-100 + b sys_openat +END(sys_open) + + +ENTRY(sys_mkdir) + mov x3, x2 + mov x2, x1 + mov x1, x0 + mov x0, #-100 + b sys_mkdirat +END(sys_mkdir) + + +ENTRY(sys_rmdir) + mov x2, #0x200 // flags = AT_REMOVEDIR + mov x1, x0 + mov x0, #-100 + b sys_unlinkat +END(sys_rmdir) + + +ENTRY(sys_unlink) + mov x2, #0 // flags = 0 + mov x1, x0 + mov x0, #-100 + b sys_unlinkat +END(sys_unlink) diff --git a/CRIU_code/compel/arch/aarch64/plugins/std/syscalls/syscall-aux.h b/CRIU_code/compel/arch/aarch64/plugins/std/syscalls/syscall-aux.h new file mode 100644 index 0000000..6272bf3 --- /dev/null +++ b/CRIU_code/compel/arch/aarch64/plugins/std/syscalls/syscall-aux.h @@ -0,0 +1,3 @@ +#ifndef __NR_openat +# define __NR_openat 56 +#endif diff --git a/CRIU_code/compel/arch/aarch64/plugins/std/syscalls/syscall-common.S b/CRIU_code/compel/arch/aarch64/plugins/std/syscalls/syscall-common.S new file mode 100644 index 0000000..aeb89ea --- /dev/null +++ b/CRIU_code/compel/arch/aarch64/plugins/std/syscalls/syscall-common.S @@ -0,0 +1,19 @@ +#include "common/asm/linkage.h" + +syscall_common: + svc #0 + ret + + +.macro syscall name, nr + ENTRY(\name) + mov x8, \nr + b syscall_common + END(\name) +.endm + + +ENTRY(__cr_restore_rt) + mov x8, 
__NR_rt_sigreturn + svc #0 +END(__cr_restore_rt) diff --git a/CRIU_code/compel/arch/aarch64/plugins/std/syscalls/syscall.def b/CRIU_code/compel/arch/aarch64/plugins/std/syscalls/syscall.def new file mode 100644 index 0000000..ebecde3 --- /dev/null +++ b/CRIU_code/compel/arch/aarch64/plugins/std/syscalls/syscall.def @@ -0,0 +1 @@ +../../../../arm/plugins/std/syscalls/syscall.def \ No newline at end of file diff --git a/CRIU_code/compel/arch/aarch64/scripts/compel-pack.lds.S b/CRIU_code/compel/arch/aarch64/scripts/compel-pack.lds.S new file mode 100644 index 0000000..eba89cd --- /dev/null +++ b/CRIU_code/compel/arch/aarch64/scripts/compel-pack.lds.S @@ -0,0 +1,36 @@ +OUTPUT_ARCH(aarch64) +EXTERN(__export_parasite_head_start) + +SECTIONS +{ + .crblob 0x0 : { + *(.head.text) + ASSERT(DEFINED(__export_parasite_head_start), + "Symbol __export_parasite_head_start is missing"); + *(.text*) + . = ALIGN(32); + *(.data*) + . = ALIGN(32); + *(.rodata*) + . = ALIGN(32); + *(.bss*) + . = ALIGN(32); + *(.got*) + . = ALIGN(32); + *(.toc*) + . = ALIGN(32); + } =0x00000000, + + /DISCARD/ : { + *(.debug*) + *(.comment*) + *(.note*) + *(.group*) + *(.eh_frame*) + *(*) + } + +/* Parasite args should have 4 bytes align, as we have futex inside. */ +. 
= ALIGN(4); +__export_parasite_args = .; +} diff --git a/CRIU_code/compel/arch/aarch64/src/lib/cpu.c b/CRIU_code/compel/arch/aarch64/src/lib/cpu.c new file mode 100644 index 0000000..e6898d4 --- /dev/null +++ b/CRIU_code/compel/arch/aarch64/src/lib/cpu.c @@ -0,0 +1,106 @@ +#include <string.h> +#include <stdbool.h> + +#include "compel-cpu.h" + +#include "common/bitops.h" + +#include "log.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +#ifdef CONFIG_SMP +#define PER_CPU_BASE_SECTION ".data..percpu" +#else +#define PER_CPU_BASE_SECTION ".data" +#endif +#define PER_CPU_ATTRIBUTES +#define __PCPU_ATTRS(sec) \ + __percpu __attribute__((section(PER_CPU_BASE_SECTION sec))) \ + PER_CPU_ATTRIBUTES +#define DEFINE_PER_CPU_SECTION(type, name, sec) \ + __PCPU_ATTRS(sec) __typeof__(type) name +#define DEFINE_PER_CPU(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, "") +DEFINE_PER_CPU(struct cpuinfo_arm64, cpu_data); + + +static compel_cpuinfo_t rt_info; + +static void fetch_rt_cpuinfo(void) +{ + static bool rt_info_done = false; + + if (!rt_info_done) { + compel_cpuid(&rt_info); + rt_info_done = true; + } +} + +void compel_set_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ + if (likely(feature < ARM64_NCAPS)) + set_bit(feature, (unsigned long *)info->cpu_hwcaps); +} + +void compel_clear_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ + if (likely(feature < ARM64_NCAPS)) + clear_bit(feature, (unsigned long *)info->cpu_hwcaps); +} + +int compel_test_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ + if (likely(feature < ARM64_NCAPS)) + return test_bit(feature, (unsigned long *)info->cpu_hwcaps); + return 0; +} + +int compel_test_fpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ + + return 0; +} + +int compel_cpuid(compel_cpuinfo_t *info) +{ + info->cpuinfo_arm64 = &per_cpu(cpu_data, 0); + return 0; +} + +bool compel_cpu_has_feature(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return compel_test_cpu_cap(&rt_info, feature); +} + +bool
compel_fpu_has_feature(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return compel_test_fpu_cap(&rt_info, feature); +} + +uint32_t compel_fpu_feature_size(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return 0; /* constant: this implementation reports no per-feature size */ +} + +uint32_t compel_fpu_feature_offset(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return 0; /* constant: this implementation reports no per-feature offset */ +} + +void compel_cpu_clear_feature(unsigned int feature) /* clear 'feature' in the cached runtime CPU info */ +{ + fetch_rt_cpuinfo(); + compel_clear_cpu_cap(&rt_info, feature); /* callee is void: nothing to return */ +} + +void compel_cpu_copy_cpuinfo(compel_cpuinfo_t *c) /* copy the cached runtime CPU info into *c */ +{ + fetch_rt_cpuinfo(); + memcpy(c, &rt_info, sizeof(rt_info)); +} diff --git a/CRIU_code/compel/arch/aarch64/src/lib/handle-elf-host.c b/CRIU_code/compel/arch/aarch64/src/lib/handle-elf-host.c new file mode 100644 index 0000000..fe46118 --- /dev/null +++ b/CRIU_code/compel/arch/aarch64/src/lib/handle-elf-host.c @@ -0,0 +1 @@ +handle-elf.c \ No newline at end of file diff --git a/CRIU_code/compel/arch/aarch64/src/lib/handle-elf.c b/CRIU_code/compel/arch/aarch64/src/lib/handle-elf.c new file mode 100644 index 0000000..1c3686c --- /dev/null +++ b/CRIU_code/compel/arch/aarch64/src/lib/handle-elf.c @@ -0,0 +1,35 @@ +#include + +#include "uapi/compel.h" + +#include "handle-elf.h" +#include "piegen.h" +#include "log.h" + +static const unsigned char __maybe_unused +elf_ident_64_le[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const unsigned char __maybe_unused +elf_ident_64_be[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x02, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +int handle_binary(void *mem, size_t size) /* hand a 64-bit ELF image of host byte order to handle_elf_aarch64(); -EINVAL otherwise */ +{ + const unsigned char *elf_ident = +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + elf_ident_64_le; +#else + elf_ident_64_be; +#endif + + if (size >= sizeof(elf_ident_64_le) && memcmp(mem, elf_ident, sizeof(elf_ident_64_le)) == 0) /* never compare past a buffer shorter than the ident */ + return handle_elf_aarch64(mem, size); + + pr_err("Unsupported Elf format detected\n"); + return -EINVAL; +} diff --git 
a/CRIU_code/compel/arch/aarch64/src/lib/include/cpu.h b/CRIU_code/compel/arch/aarch64/src/lib/include/cpu.h new file mode 100644 index 0000000..1b5885e --- /dev/null +++ b/CRIU_code/compel/arch/aarch64/src/lib/include/cpu.h @@ -0,0 +1,6 @@ +#ifndef __COMPEL_ASM_CPU_H__ +#define __COMPEL_ASM_CPU_H__ + + +#endif + diff --git a/CRIU_code/compel/arch/aarch64/src/lib/include/handle-elf.h b/CRIU_code/compel/arch/aarch64/src/lib/include/handle-elf.h new file mode 100644 index 0000000..0f64b34 --- /dev/null +++ b/CRIU_code/compel/arch/aarch64/src/lib/include/handle-elf.h @@ -0,0 +1,11 @@ +#ifndef COMPEL_HANDLE_ELF_H__ +#define COMPEL_HANDLE_ELF_H__ + +#include "elf64-types.h" + +#define __handle_elf handle_elf_aarch64 +#define arch_is_machine_supported(e_machine) (e_machine == EM_AARCH64) + +extern int handle_elf_aarch64(void *mem, size_t size); + +#endif /* COMPEL_HANDLE_ELF_H__ */ diff --git a/CRIU_code/compel/arch/aarch64/src/lib/include/syscall.h b/CRIU_code/compel/arch/aarch64/src/lib/include/syscall.h new file mode 100644 index 0000000..e2ec127 --- /dev/null +++ b/CRIU_code/compel/arch/aarch64/src/lib/include/syscall.h @@ -0,0 +1,4 @@ +#ifndef __COMPEL_SYSCALL_H__ +#define __COMPEL_SYSCALL_H__ +#define __NR(syscall, compat) __NR_##syscall +#endif diff --git a/CRIU_code/compel/arch/aarch64/src/lib/include/uapi/asm/.gitignore b/CRIU_code/compel/arch/aarch64/src/lib/include/uapi/asm/.gitignore new file mode 100644 index 0000000..e69de29 diff --git a/CRIU_code/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h b/CRIU_code/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h new file mode 100644 index 0000000..5f09049 --- /dev/null +++ b/CRIU_code/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h @@ -0,0 +1,15 @@ +#ifndef __COMPEL_BREAKPOINTS_H__ +#define __COMPEL_BREAKPOINTS_H__ +#define ARCH_SI_TRAP TRAP_BRKPT + +static inline int ptrace_set_breakpoint(pid_t pid, void *addr) +{ + return 0; +} + +static inline int ptrace_flush_breakpoints(pid_t 
pid) +{ + return 0; +} + +#endif diff --git a/CRIU_code/compel/arch/aarch64/src/lib/include/uapi/asm/cpu.h b/CRIU_code/compel/arch/aarch64/src/lib/include/uapi/asm/cpu.h new file mode 100644 index 0000000..02de8d3 --- /dev/null +++ b/CRIU_code/compel/arch/aarch64/src/lib/include/uapi/asm/cpu.h @@ -0,0 +1,487 @@ +#ifndef UAPI_COMPEL_ASM_CPU_H__ +#define UAPI_COMPEL_ASM_CPU_H__ + +#include +#include +typedef unsigned short int u16; +typedef unsigned char u8; +typedef unsigned int u32; +typedef unsigned long long u64; + +#define INVALID_HWID ULONG_MAX + +#define MPIDR_UP_BITMASK (0x1 << 30) +#define MPIDR_MT_BITMASK (0x1 << 24) +#define MPIDR_HWID_BITMASK UL(0xff00ffffff) + +#define MPIDR_LEVEL_BITS_SHIFT 3 +#define MPIDR_LEVEL_BITS (1 << MPIDR_LEVEL_BITS_SHIFT) +#define MPIDR_LEVEL_MASK ((1 << MPIDR_LEVEL_BITS) - 1) + +#define MPIDR_LEVEL_SHIFT(level) \ + (((1 << level) >> 1) << MPIDR_LEVEL_BITS_SHIFT) + +#define MPIDR_AFFINITY_LEVEL(mpidr, level) \ + ((mpidr >> MPIDR_LEVEL_SHIFT(level)) & MPIDR_LEVEL_MASK) + +#define MIDR_REVISION_MASK 0xf +#define MIDR_REVISION(midr) ((midr) & MIDR_REVISION_MASK) +#define MIDR_PARTNUM_SHIFT 4 +#define MIDR_PARTNUM_MASK (0xfff << MIDR_PARTNUM_SHIFT) +#define MIDR_PARTNUM(midr) \ + (((midr) & MIDR_PARTNUM_MASK) >> MIDR_PARTNUM_SHIFT) +#define MIDR_ARCHITECTURE_SHIFT 16 +#define MIDR_ARCHITECTURE_MASK (0xf << MIDR_ARCHITECTURE_SHIFT) +#define MIDR_ARCHITECTURE(midr) \ + (((midr) & MIDR_ARCHITECTURE_MASK) >> MIDR_ARCHITECTURE_SHIFT) +#define MIDR_VARIANT_SHIFT 20 +#define MIDR_VARIANT_MASK (0xf << MIDR_VARIANT_SHIFT) +#define MIDR_VARIANT(midr) \ + (((midr) & MIDR_VARIANT_MASK) >> MIDR_VARIANT_SHIFT) +#define MIDR_IMPLEMENTOR_SHIFT 24 +#define MIDR_IMPLEMENTOR_MASK (0xff << MIDR_IMPLEMENTOR_SHIFT) +#define MIDR_IMPLEMENTOR(midr) \ + (((midr) & MIDR_IMPLEMENTOR_MASK) >> MIDR_IMPLEMENTOR_SHIFT) + +#define MIDR_CPU_MODEL(imp, partnum) \ + (((imp) << MIDR_IMPLEMENTOR_SHIFT) | \ + (0xf << MIDR_ARCHITECTURE_SHIFT) | \ + ((partnum) << 
MIDR_PARTNUM_SHIFT)) + +#define MIDR_CPU_VAR_REV(var, rev) \ + (((var) << MIDR_VARIANT_SHIFT) | (rev)) + +#define MIDR_CPU_MODEL_MASK (MIDR_IMPLEMENTOR_MASK | MIDR_PARTNUM_MASK | \ + MIDR_ARCHITECTURE_MASK) + +#define ARM_CPU_IMP_ARM 0x41 +#define ARM_CPU_IMP_APM 0x50 +#define ARM_CPU_IMP_CAVIUM 0x43 +#define ARM_CPU_IMP_BRCM 0x42 +#define ARM_CPU_IMP_QCOM 0x51 +#define ARM_CPU_IMP_NVIDIA 0x4E +#define ARM_CPU_IMP_FUJITSU 0x46 +#define ARM_CPU_IMP_HISI 0x48 + +#define ARM_CPU_PART_AEM_V8 0xD0F +#define ARM_CPU_PART_FOUNDATION 0xD00 +#define ARM_CPU_PART_CORTEX_A57 0xD07 +#define ARM_CPU_PART_CORTEX_A72 0xD08 +#define ARM_CPU_PART_CORTEX_A53 0xD03 +#define ARM_CPU_PART_CORTEX_A73 0xD09 +#define ARM_CPU_PART_CORTEX_A75 0xD0A +#define ARM_CPU_PART_CORTEX_A35 0xD04 +#define ARM_CPU_PART_CORTEX_A55 0xD05 +#define ARM_CPU_PART_CORTEX_A76 0xD0B +#define ARM_CPU_PART_NEOVERSE_N1 0xD0C + +#define APM_CPU_PART_POTENZA 0x000 + +#define CAVIUM_CPU_PART_THUNDERX 0x0A1 +#define CAVIUM_CPU_PART_THUNDERX_81XX 0x0A2 +#define CAVIUM_CPU_PART_THUNDERX_83XX 0x0A3 +#define CAVIUM_CPU_PART_THUNDERX2 0x0AF + +#define BRCM_CPU_PART_BRAHMA_B53 0x100 +#define BRCM_CPU_PART_VULCAN 0x516 + +#define QCOM_CPU_PART_FALKOR_V1 0x800 +#define QCOM_CPU_PART_FALKOR 0xC00 +#define QCOM_CPU_PART_KRYO 0x200 + +#define NVIDIA_CPU_PART_DENVER 0x003 +#define NVIDIA_CPU_PART_CARMEL 0x004 + +#define FUJITSU_CPU_PART_A64FX 0x001 + +#define HISI_CPU_PART_TSV110 0xD01 + +#define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53) +#define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57) +#define MIDR_CORTEX_A72 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72) +#define MIDR_CORTEX_A73 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A73) +#define MIDR_CORTEX_A75 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A75) +#define MIDR_CORTEX_A35 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A35) +#define MIDR_CORTEX_A55 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, 
ARM_CPU_PART_CORTEX_A55) +#define MIDR_CORTEX_A76 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A76) +#define MIDR_NEOVERSE_N1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N1) +#define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) +#define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) +#define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX) +#define MIDR_CAVIUM_THUNDERX2 MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX2) +#define MIDR_BRAHMA_B53 MIDR_CPU_MODEL(ARM_CPU_IMP_BRCM, BRCM_CPU_PART_BRAHMA_B53) +#define MIDR_BRCM_VULCAN MIDR_CPU_MODEL(ARM_CPU_IMP_BRCM, BRCM_CPU_PART_VULCAN) +#define MIDR_QCOM_FALKOR_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR_V1) +#define MIDR_QCOM_FALKOR MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR) +#define MIDR_QCOM_KRYO MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO) +#define MIDR_NVIDIA_DENVER MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_DENVER) +#define MIDR_NVIDIA_CARMEL MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_CARMEL) +#define MIDR_FUJITSU_A64FX MIDR_CPU_MODEL(ARM_CPU_IMP_FUJITSU, FUJITSU_CPU_PART_A64FX) +#define MIDR_HISI_TSV110 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_TSV110) +#define ARM64_CPUCAP_SCOPE_LOCAL_CPU ((u16)BIT(0)) +#define ARM64_CPUCAP_SCOPE_SYSTEM ((u16)BIT(1)) +/* + * The capability is detected on the boot CPU and is used by the kernel + * during early boot, i.e. the capability should be "detected" and + * "enabled" as early as possible on all booting CPUs. 
+ */ +#define ARM64_CPUCAP_SCOPE_BOOT_CPU ((u16)BIT(2)) +#define ARM64_CPUCAP_SCOPE_MASK \ + (ARM64_CPUCAP_SCOPE_SYSTEM | \ + ARM64_CPUCAP_SCOPE_LOCAL_CPU | \ + ARM64_CPUCAP_SCOPE_BOOT_CPU) + +#define SCOPE_SYSTEM ARM64_CPUCAP_SCOPE_SYSTEM +#define SCOPE_LOCAL_CPU ARM64_CPUCAP_SCOPE_LOCAL_CPU +#define SCOPE_BOOT_CPU ARM64_CPUCAP_SCOPE_BOOT_CPU +#define SCOPE_ALL ARM64_CPUCAP_SCOPE_MASK + +/* + * Is it permitted for a late CPU to have this capability when system + * hasn't already enabled it ? + */ +#define ARM64_CPUCAP_PERMITTED_FOR_LATE_CPU ((u16)BIT(4)) +/* Is it safe for a late CPU to miss this capability when system has it */ +#define ARM64_CPUCAP_OPTIONAL_FOR_LATE_CPU ((u16)BIT(5)) + +/* + * CPU errata workarounds that need to be enabled at boot time if one or + * more CPUs in the system requires it. When one of these capabilities + * has been enabled, it is safe to allow any CPU to boot that doesn't + * require the workaround. However, it is not safe if a "late" CPU + * requires a workaround and the system hasn't enabled it already. + */ +#define ARM64_CPUCAP_LOCAL_CPU_ERRATUM \ + (ARM64_CPUCAP_SCOPE_LOCAL_CPU | ARM64_CPUCAP_OPTIONAL_FOR_LATE_CPU) +/* + * CPU feature detected at boot time based on system-wide value of a + * feature. It is safe for a late CPU to have this feature even though + * the system hasn't enabled it, although the feature will not be used + * by Linux in this case. If the system has enabled this feature already, + * then every late CPU must have it. + */ +#define ARM64_CPUCAP_SYSTEM_FEATURE \ + (ARM64_CPUCAP_SCOPE_SYSTEM | ARM64_CPUCAP_PERMITTED_FOR_LATE_CPU) +/* + * CPU feature detected at boot time based on feature of one or more CPUs. + * All possible conflicts for a late CPU are ignored. + */ +#define ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE \ + (ARM64_CPUCAP_SCOPE_LOCAL_CPU | \ + ARM64_CPUCAP_OPTIONAL_FOR_LATE_CPU | \ + ARM64_CPUCAP_PERMITTED_FOR_LATE_CPU) + +/* + * CPU feature detected at boot time, on one or more CPUs. 
A late CPU + * is not allowed to have the capability when the system doesn't have it. + * It is Ok for a late CPU to miss the feature. + */ +#define ARM64_CPUCAP_BOOT_RESTRICTED_CPU_LOCAL_FEATURE \ + (ARM64_CPUCAP_SCOPE_LOCAL_CPU | \ + ARM64_CPUCAP_OPTIONAL_FOR_LATE_CPU) + +/* + * CPU feature used early in the boot based on the boot CPU. All secondary + * CPUs must match the state of the capability as detected by the boot CPU. + */ +#define ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE ARM64_CPUCAP_SCOPE_BOOT_CPU + +#define ARM64_WORKAROUND_CLEAN_CACHE 0 +#define ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE 1 +#define ARM64_WORKAROUND_845719 2 +#define ARM64_HAS_SYSREG_GIC_CPUIF 3 +#define ARM64_HAS_PAN 4 +#define ARM64_HAS_LSE_ATOMICS 5 +#define ARM64_WORKAROUND_CAVIUM_23154 6 +#define ARM64_WORKAROUND_834220 7 +#define ARM64_HAS_NO_HW_PREFETCH 8 +#define ARM64_HAS_VIRT_HOST_EXTN 11 +#define ARM64_WORKAROUND_CAVIUM_27456 12 +#define ARM64_HAS_32BIT_EL0 13 +#define ARM64_SPECTRE_V3A 14 +#define ARM64_HAS_CNP 15 +#define ARM64_HAS_NO_FPSIMD 16 +#define ARM64_WORKAROUND_REPEAT_TLBI 17 +#define ARM64_WORKAROUND_QCOM_FALKOR_E1003 18 +#define ARM64_WORKAROUND_858921 19 +#define ARM64_WORKAROUND_CAVIUM_30115 20 +#define ARM64_HAS_DCPOP 21 +#define ARM64_SVE 22 +#define ARM64_UNMAP_KERNEL_AT_EL0 23 +#define ARM64_SPECTRE_V2 24 +#define ARM64_HAS_RAS_EXTN 25 +#define ARM64_WORKAROUND_843419 26 +#define ARM64_HAS_CACHE_IDC 27 +#define ARM64_HAS_CACHE_DIC 28 +#define ARM64_HW_DBM 29 +#define ARM64_SPECTRE_V4 30 +#define ARM64_MISMATCHED_CACHE_TYPE 31 +#define ARM64_HAS_STAGE2_FWB 32 +#define ARM64_HAS_CRC32 33 +#define ARM64_SSBS 34 +#define ARM64_WORKAROUND_1418040 35 +#define ARM64_HAS_SB 36 +#define ARM64_WORKAROUND_SPECULATIVE_AT 37 +#define ARM64_HAS_ADDRESS_AUTH_ARCH 38 +#define ARM64_HAS_ADDRESS_AUTH_IMP_DEF 39 +#define ARM64_HAS_GENERIC_AUTH_ARCH 40 +#define ARM64_HAS_GENERIC_AUTH_IMP_DEF 41 +#define ARM64_HAS_IRQ_PRIO_MASKING 42 +#define ARM64_HAS_DCPODP 43 +#define 
ARM64_WORKAROUND_1463225 44 +#define ARM64_WORKAROUND_CAVIUM_TX2_219_TVM 45 +#define ARM64_WORKAROUND_CAVIUM_TX2_219_PRFM 46 +#define ARM64_WORKAROUND_1542419 47 +#define ARM64_HAS_E0PD 48 +#define ARM64_HAS_RNG 49 +#define ARM64_HAS_AMU_EXTN 50 +#define ARM64_HAS_ADDRESS_AUTH 51 +#define ARM64_HAS_GENERIC_AUTH 52 +#define ARM64_HAS_32BIT_EL1 53 +#define ARM64_BTI 54 +#define ARM64_HAS_ARMv8_4_TTL 55 +#define ARM64_HAS_TLB_RANGE 56 +#define ARM64_MTE 57 +#define ARM64_WORKAROUND_1508412 58 +#define ARM64_HAS_LDAPR 59 +#define ARM64_KVM_PROTECTED_MODE 60 + +#define ARM64_NCAPS 61 + +#define COMPAT_HWCAP_SWP (1 << 0) +#define COMPAT_HWCAP_HALF (1 << 1) +#define COMPAT_HWCAP_THUMB (1 << 2) +#define COMPAT_HWCAP_26BIT (1 << 3) +#define COMPAT_HWCAP_FAST_MULT (1 << 4) +#define COMPAT_HWCAP_FPA (1 << 5) +#define COMPAT_HWCAP_VFP (1 << 6) +#define COMPAT_HWCAP_EDSP (1 << 7) +#define COMPAT_HWCAP_JAVA (1 << 8) +#define COMPAT_HWCAP_IWMMXT (1 << 9) +#define COMPAT_HWCAP_CRUNCH (1 << 10) +#define COMPAT_HWCAP_THUMBEE (1 << 11) +#define COMPAT_HWCAP_NEON (1 << 12) +#define COMPAT_HWCAP_VFPv3 (1 << 13) +#define COMPAT_HWCAP_VFPV3D16 (1 << 14) +#define COMPAT_HWCAP_TLS (1 << 15) +#define COMPAT_HWCAP_VFPv4 (1 << 16) +#define COMPAT_HWCAP_IDIVA (1 << 17) +#define COMPAT_HWCAP_IDIVT (1 << 18) +#define COMPAT_HWCAP_IDIV (COMPAT_HWCAP_IDIVA|COMPAT_HWCAP_IDIVT) +#define COMPAT_HWCAP_VFPD32 (1 << 19) +#define COMPAT_HWCAP_LPAE (1 << 20) +#define COMPAT_HWCAP_EVTSTRM (1 << 21) + +#define COMPAT_HWCAP2_AES (1 << 0) +#define COMPAT_HWCAP2_PMULL (1 << 1) +#define COMPAT_HWCAP2_SHA1 (1 << 2) +#define COMPAT_HWCAP2_SHA2 (1 << 3) +#define COMPAT_HWCAP2_CRC32 (1 << 4) +#define __khwcap_feature(x) const_ilog2(HWCAP_ ## x) +#define KERNEL_HWCAP_FP __khwcap_feature(FP) +#define KERNEL_HWCAP_ASIMD __khwcap_feature(ASIMD) +#define KERNEL_HWCAP_EVTSTRM __khwcap_feature(EVTSTRM) +#define KERNEL_HWCAP_AES __khwcap_feature(AES) +#define KERNEL_HWCAP_PMULL __khwcap_feature(PMULL) +#define 
KERNEL_HWCAP_SHA1 __khwcap_feature(SHA1) +#define KERNEL_HWCAP_SHA2 __khwcap_feature(SHA2) +#define KERNEL_HWCAP_CRC32 __khwcap_feature(CRC32) +#define KERNEL_HWCAP_ATOMICS __khwcap_feature(ATOMICS) +#define KERNEL_HWCAP_FPHP __khwcap_feature(FPHP) +#define KERNEL_HWCAP_ASIMDHP __khwcap_feature(ASIMDHP) +#define KERNEL_HWCAP_CPUID __khwcap_feature(CPUID) +#define KERNEL_HWCAP_ASIMDRDM __khwcap_feature(ASIMDRDM) +#define KERNEL_HWCAP_JSCVT __khwcap_feature(JSCVT) +#define KERNEL_HWCAP_FCMA __khwcap_feature(FCMA) +#define KERNEL_HWCAP_LRCPC __khwcap_feature(LRCPC) +#define KERNEL_HWCAP_DCPOP __khwcap_feature(DCPOP) +#define KERNEL_HWCAP_SHA3 __khwcap_feature(SHA3) +#define KERNEL_HWCAP_SM3 __khwcap_feature(SM3) +#define KERNEL_HWCAP_SM4 __khwcap_feature(SM4) +#define KERNEL_HWCAP_ASIMDDP __khwcap_feature(ASIMDDP) +#define KERNEL_HWCAP_SHA512 __khwcap_feature(SHA512) +#define KERNEL_HWCAP_SVE __khwcap_feature(SVE) +#define KERNEL_HWCAP_ASIMDFHM __khwcap_feature(ASIMDFHM) +#define KERNEL_HWCAP_DIT __khwcap_feature(DIT) +#define KERNEL_HWCAP_USCAT __khwcap_feature(USCAT) +#define KERNEL_HWCAP_ILRCPC __khwcap_feature(ILRCPC) +#define KERNEL_HWCAP_FLAGM __khwcap_feature(FLAGM) +#define KERNEL_HWCAP_SSBS __khwcap_feature(SSBS) +#define KERNEL_HWCAP_SB __khwcap_feature(SB) +#define KERNEL_HWCAP_PACA __khwcap_feature(PACA) +#define KERNEL_HWCAP_PACG __khwcap_feature(PACG) + +#define __khwcap2_feature(x) (const_ilog2(HWCAP2_ ## x) + 32) +#define KERNEL_HWCAP_DCPODP __khwcap2_feature(DCPODP) +#define KERNEL_HWCAP_SVE2 __khwcap2_feature(SVE2) +#define KERNEL_HWCAP_SVEAES __khwcap2_feature(SVEAES) +#define KERNEL_HWCAP_SVEPMULL __khwcap2_feature(SVEPMULL) +#define KERNEL_HWCAP_SVEBITPERM __khwcap2_feature(SVEBITPERM) +#define KERNEL_HWCAP_SVESHA3 __khwcap2_feature(SVESHA3) +#define KERNEL_HWCAP_SVESM4 __khwcap2_feature(SVESM4) +#define KERNEL_HWCAP_FLAGM2 __khwcap2_feature(FLAGM2) +#define KERNEL_HWCAP_FRINT __khwcap2_feature(FRINT) +#define KERNEL_HWCAP_SVEI8MM 
__khwcap2_feature(SVEI8MM) +#define KERNEL_HWCAP_SVEF32MM __khwcap2_feature(SVEF32MM) +#define KERNEL_HWCAP_SVEF64MM __khwcap2_feature(SVEF64MM) +#define KERNEL_HWCAP_SVEBF16 __khwcap2_feature(SVEBF16) +#define KERNEL_HWCAP_I8MM __khwcap2_feature(I8MM) +#define KERNEL_HWCAP_BF16 __khwcap2_feature(BF16) +#define KERNEL_HWCAP_DGH __khwcap2_feature(DGH) +#define KERNEL_HWCAP_RNG __khwcap2_feature(RNG) +#define KERNEL_HWCAP_BTI __khwcap2_feature(BTI) +#define KERNEL_HWCAP_MTE __khwcap2_feature(MTE) +#define ELF_HWCAP cpu_get_elf_hwcap() +#define ELF_HWCAP2 cpu_get_elf_hwcap2() + +#ifdef CONFIG_COMPAT +#define COMPAT_ELF_HWCAP (compat_elf_hwcap) +#define COMPAT_ELF_HWCAP2 (compat_elf_hwcap2) + +#ifdef CONFIG_COMPAT +#define COMPAT_KERNEL_HWCAP(x) const_ilog2(COMPAT_HWCAP_ ## x) + +static const char *const hwcap_str[] = { + [KERNEL_HWCAP_FP] = "fp", + [KERNEL_HWCAP_ASIMD] = "asimd", + [KERNEL_HWCAP_EVTSTRM] = "evtstrm", + [KERNEL_HWCAP_AES] = "aes", + [KERNEL_HWCAP_PMULL] = "pmull", + [KERNEL_HWCAP_SHA1] = "sha1", + [KERNEL_HWCAP_SHA2] = "sha2", + [KERNEL_HWCAP_CRC32] = "crc32", + [KERNEL_HWCAP_ATOMICS] = "atomics", + [KERNEL_HWCAP_FPHP] = "fphp", + [KERNEL_HWCAP_ASIMDHP] = "asimdhp", + [KERNEL_HWCAP_CPUID] = "cpuid", + [KERNEL_HWCAP_ASIMDRDM] = "asimdrdm", + [KERNEL_HWCAP_JSCVT] = "jscvt", + [KERNEL_HWCAP_FCMA] = "fcma", + [KERNEL_HWCAP_LRCPC] = "lrcpc", + [KERNEL_HWCAP_DCPOP] = "dcpop", + [KERNEL_HWCAP_SHA3] = "sha3", + [KERNEL_HWCAP_SM3] = "sm3", + [KERNEL_HWCAP_SM4] = "sm4", + [KERNEL_HWCAP_ASIMDDP] = "asimddp", + [KERNEL_HWCAP_SHA512] = "sha512", + [KERNEL_HWCAP_SVE] = "sve", + [KERNEL_HWCAP_ASIMDFHM] = "asimdfhm", + [KERNEL_HWCAP_DIT] = "dit", + [KERNEL_HWCAP_USCAT] = "uscat", + [KERNEL_HWCAP_ILRCPC] = "ilrcpc", + [KERNEL_HWCAP_FLAGM] = "flagm", + [KERNEL_HWCAP_SSBS] = "ssbs", + [KERNEL_HWCAP_SB] = "sb", + [KERNEL_HWCAP_PACA] = "paca", + [KERNEL_HWCAP_PACG] = "pacg", + [KERNEL_HWCAP_DCPODP] = "dcpodp", + [KERNEL_HWCAP_SVE2] = "sve2", + [KERNEL_HWCAP_SVEAES] = 
"sveaes", + [KERNEL_HWCAP_SVEPMULL] = "svepmull", + [KERNEL_HWCAP_SVEBITPERM] = "svebitperm", + [KERNEL_HWCAP_SVESHA3] = "svesha3", + [KERNEL_HWCAP_SVESM4] = "svesm4", + [KERNEL_HWCAP_FLAGM2] = "flagm2", + [KERNEL_HWCAP_FRINT] = "frint", + [KERNEL_HWCAP_SVEI8MM] = "svei8mm", + [KERNEL_HWCAP_SVEF32MM] = "svef32mm", + [KERNEL_HWCAP_SVEF64MM] = "svef64mm", + [KERNEL_HWCAP_SVEBF16] = "svebf16", + [KERNEL_HWCAP_I8MM] = "i8mm", + [KERNEL_HWCAP_BF16] = "bf16", + [KERNEL_HWCAP_DGH] = "dgh", + [KERNEL_HWCAP_RNG] = "rng", + [KERNEL_HWCAP_BTI] = "bti", + [KERNEL_HWCAP_MTE] = "mte", +}; + +#ifdef CONFIG_COMPAT +#define COMPAT_KERNEL_HWCAP(x) const_ilog2(COMPAT_HWCAP_ ## x) +static const char *const compat_hwcap_str[] = { + [COMPAT_KERNEL_HWCAP(SWP)] = "swp", + [COMPAT_KERNEL_HWCAP(HALF)] = "half", + [COMPAT_KERNEL_HWCAP(THUMB)] = "thumb", + [COMPAT_KERNEL_HWCAP(26BIT)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(FAST_MULT)] = "fastmult", + [COMPAT_KERNEL_HWCAP(FPA)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(VFP)] = "vfp", + [COMPAT_KERNEL_HWCAP(EDSP)] = "edsp", + [COMPAT_KERNEL_HWCAP(JAVA)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(IWMMXT)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(CRUNCH)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(THUMBEE)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(NEON)] = "neon", + [COMPAT_KERNEL_HWCAP(VFPv3)] = "vfpv3", + [COMPAT_KERNEL_HWCAP(VFPV3D16)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(TLS)] = "tls", + [COMPAT_KERNEL_HWCAP(VFPv4)] = "vfpv4", + [COMPAT_KERNEL_HWCAP(IDIVA)] = "idiva", + [COMPAT_KERNEL_HWCAP(IDIVT)] = "idivt", + [COMPAT_KERNEL_HWCAP(VFPD32)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(LPAE)] = "lpae", + [COMPAT_KERNEL_HWCAP(EVTSTRM)] = "evtstrm", +}; + +#define COMPAT_KERNEL_HWCAP2(x) const_ilog2(COMPAT_HWCAP2_ ## x) +static const char *const compat_hwcap2_str[] = { + [COMPAT_KERNEL_HWCAP2(AES)] = "aes", 
+ [COMPAT_KERNEL_HWCAP2(PMULL)] = "pmull", + [COMPAT_KERNEL_HWCAP2(SHA1)] = "sha1", + [COMPAT_KERNEL_HWCAP2(SHA2)] = "sha2", + [COMPAT_KERNEL_HWCAP2(CRC32)] = "crc32", +}; +#endif + +struct cpuinfo_arm64 +{ + struct cpu cpu; + struct kobject kobj; + unsigned long cpu_hwcaps; + u32 reg_ctr; + u32 reg_cntfrq; + u32 reg_dczid; + u32 reg_midr; + u32 reg_revidr; + + u64 reg_id_aa64dfr0; + u64 reg_id_aa64dfr1; + u64 reg_id_aa64isar0; + u64 reg_id_aa64isar1; + u64 reg_id_aa64mmfr0; + u64 reg_id_aa64mmfr1; + u64 reg_id_aa64mmfr2; + u64 reg_id_aa64pfr0; + u64 reg_id_aa64pfr1; + u64 reg_id_aa64zfr0; + + u32 reg_id_dfr0; + u32 reg_id_dfr1; + u32 reg_id_isar0; + u32 reg_id_isar1; + u32 reg_id_isar2; + u32 reg_id_isar3; + u32 reg_id_isar4; + u32 reg_id_isar5; + u32 reg_id_isar6; + u32 reg_id_mmfr0; + u32 reg_id_mmfr1; + u32 reg_id_mmfr2; + u32 reg_id_mmfr3; + u32 reg_id_mmfr4; + u32 reg_id_mmfr5; + u32 reg_id_pfr0; + u32 reg_id_pfr1; + u32 reg_id_pfr2; + + u32 reg_mvfr0; + u32 reg_mvfr1; + u32 reg_mvfr2; + + /* pseudo-ZCR for recording maximum ZCR_EL1 LEN value: */ + u64 reg_zcr; +}; + + +typedef struct cpuinfo_arm64 compel_cpuinfo_t; + +#endif /* UAPI_COMPEL_ASM_CPU_H__ */ diff --git a/CRIU_code/compel/arch/aarch64/src/lib/include/uapi/asm/fpu.h b/CRIU_code/compel/arch/aarch64/src/lib/include/uapi/asm/fpu.h new file mode 100644 index 0000000..7f476d5 --- /dev/null +++ b/CRIU_code/compel/arch/aarch64/src/lib/include/uapi/asm/fpu.h @@ -0,0 +1,4 @@ +#ifndef __CR_ASM_FPU_H__ +#define __CR_ASM_FPU_H__ + +#endif /* __CR_ASM_FPU_H__ */ diff --git a/CRIU_code/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h b/CRIU_code/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h new file mode 100644 index 0000000..4662f76 --- /dev/null +++ b/CRIU_code/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h @@ -0,0 +1,32 @@ +#ifndef UAPI_COMPEL_ASM_TYPES_H__ +#define UAPI_COMPEL_ASM_TYPES_H__ + +#include +#include +#include +#include + +#define SIGMAX 64 +#define 
SIGMAX_OLD 31 + +/* + * Copied from the Linux kernel header arch/arm64/include/uapi/asm/ptrace.h + * + * A thread's ARM CPU context + */ + +typedef struct user_pt_regs user_regs_struct_t; +typedef struct user_fpsimd_state user_fpregs_struct_t; + +#define REG_RES(r) ((uint64_t)(r).regs[0]) +#define REG_IP(r) ((uint64_t)(r).pc) +#define REG_SP(r) ((uint64_t)((r).sp)) +#define REG_SYSCALL_NR(r) ((uint64_t)(r).regs[8]) + +#define user_regs_native(pregs) true + +#define ARCH_SI_TRAP TRAP_BRKPT + +#define __NR(syscall, compat) __NR_##syscall + +#endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/CRIU_code/compel/arch/aarch64/src/lib/include/uapi/asm/processor-flags.h b/CRIU_code/compel/arch/aarch64/src/lib/include/uapi/asm/processor-flags.h new file mode 100644 index 0000000..1571918 --- /dev/null +++ b/CRIU_code/compel/arch/aarch64/src/lib/include/uapi/asm/processor-flags.h @@ -0,0 +1,4 @@ +#ifndef UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ +#define UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ + +#endif /* UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ */ diff --git a/CRIU_code/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h b/CRIU_code/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h new file mode 100644 index 0000000..bff714c --- /dev/null +++ b/CRIU_code/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h @@ -0,0 +1,69 @@ +#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ +#define UAPI_COMPEL_ASM_SIGFRAME_H__ + +#include +#include + +#include + +/* Copied from the kernel header arch/arm64/include/uapi/asm/sigcontext.h */ + +#define FPSIMD_MAGIC 0x46508001 + +typedef struct fpsimd_context fpu_state_t; + +struct aux_context { + struct fpsimd_context fpsimd; + /* additional context to be added before "end" */ + struct _aarch64_ctx end; +}; + +// XXX: the identifier rt_sigcontext is expected to be a struct by the CRIU code +#define rt_sigcontext sigcontext + +#include + +/* Copied from the kernel source arch/arm64/kernel/signal.c */ + +struct rt_sigframe { + siginfo_t info; + ucontext_t uc; + 
uint64_t fp; + uint64_t lr; +}; + +#define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ + asm volatile( \ + "mov sp, %0 \n" \ + "mov x8, #"__stringify(__NR_rt_sigreturn)" \n" \ + "svc #0 \n" \ + : \ + : "r"(new_sp) \ + : "x8", "memory") + +/* cr_sigcontext is copied from arch/arm64/include/uapi/asm/sigcontext.h */ +struct cr_sigcontext { + __u64 fault_address; + /* AArch64 registers */ + __u64 regs[31]; + __u64 sp; + __u64 pc; + __u64 pstate; + /* 4K reserved for FP/SIMD state and future expansion */ + __u8 __reserved[4096] __attribute__((__aligned__(16))); +}; + +#define RT_SIGFRAME_UC(rt_sigframe) (&rt_sigframe->uc) +#define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(rt_sigframe)->uc.uc_mcontext.pc) +#define RT_SIGFRAME_HAS_FPU(rt_sigframe) (1) +#define RT_SIGFRAME_SIGCONTEXT(rt_sigframe) ((struct cr_sigcontext *)&(rt_sigframe)->uc.uc_mcontext) +#define RT_SIGFRAME_AUX_CONTEXT(rt_sigframe) ((struct aux_context*)&(RT_SIGFRAME_SIGCONTEXT(rt_sigframe)->__reserved)) +#define RT_SIGFRAME_FPU(rt_sigframe) (&RT_SIGFRAME_AUX_CONTEXT(rt_sigframe)->fpsimd) +#define RT_SIGFRAME_OFFSET(rt_sigframe) 0 + +#define rt_sigframe_erase_sigset(sigframe) \ + memset(&sigframe->uc.uc_sigmask, 0, sizeof(k_rtsigset_t)) +#define rt_sigframe_copy_sigset(sigframe, from) \ + memcpy(&sigframe->uc.uc_sigmask, from, sizeof(k_rtsigset_t)) + +#endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ diff --git a/CRIU_code/compel/arch/aarch64/src/lib/infect.c b/CRIU_code/compel/arch/aarch64/src/lib/infect.c new file mode 100644 index 0000000..4b59390 --- /dev/null +++ b/CRIU_code/compel/arch/aarch64/src/lib/infect.c @@ -0,0 +1,178 @@ +#include +#include +#include +#include +#include +#include +#include "common/page.h" +#include "uapi/compel/asm/infect-types.h" +#include "log.h" +#include "errno.h" +#include "infect.h" +#include "infect-priv.h" + +unsigned __page_size = 0; +unsigned __page_shift = 0; + +/* + * Injected syscall instruction + */ +const char code_syscall[] = { + 0x01, 0x00, 0x00, 0xd4, /* SVC #0 
*/ + 0x00, 0x00, 0x20, 0xd4 /* BRK #0 */ +}; + +static const int +code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long)); + +static inline void __always_unused __check_code_syscall(void) +{ + BUILD_BUG_ON(code_syscall_aligned != BUILTIN_SYSCALL_SIZE); + BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); +} + +int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, + user_regs_struct_t *regs, + user_fpregs_struct_t *fpregs) +{ + struct fpsimd_context *fpsimd = RT_SIGFRAME_FPU(sigframe); + + memcpy(sigframe->uc.uc_mcontext.regs, regs->regs, sizeof(regs->regs)); + + sigframe->uc.uc_mcontext.sp = regs->sp; + sigframe->uc.uc_mcontext.pc = regs->pc; + sigframe->uc.uc_mcontext.pstate = regs->pstate; + + memcpy(fpsimd->vregs, fpregs->vregs, 32 * sizeof(__uint128_t)); + + fpsimd->fpsr = fpregs->fpsr; + fpsimd->fpcr = fpregs->fpcr; + + fpsimd->head.magic = FPSIMD_MAGIC; + fpsimd->head.size = sizeof(*fpsimd); + + return 0; +} + +int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, + struct rt_sigframe *rsigframe) +{ + return 0; +} + +int get_task_regs(pid_t pid, user_regs_struct_t *regs, save_regs_t save, + void *arg, __maybe_unused unsigned long flags) +{ + struct iovec iov; + user_fpregs_struct_t fpsimd; + int ret; + + pr_info("Dumping GP/FPU registers for %d\n", pid); + + iov.iov_base = regs; + iov.iov_len = sizeof(user_regs_struct_t); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov))) { + pr_perror("Failed to obtain CPU registers for %d", pid); + goto err; + } + + iov.iov_base = &fpsimd; + iov.iov_len = sizeof(fpsimd); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov))) { + pr_perror("Failed to obtain FPU registers for %d", pid); + goto err; + } + + ret = save(arg, regs, &fpsimd); +err: + return ret; +} + +int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, + unsigned long arg1, + unsigned long arg2, + unsigned long arg3, + unsigned long arg4, + unsigned long arg5, + unsigned long arg6) +{ + user_regs_struct_t 
regs = ctl->orig.regs; /* work on a copy of the tracee's original registers */ + int err; + + regs.regs[8] = (unsigned long)nr; /* x8: syscall number (same slot as REG_SYSCALL_NR) */ + regs.regs[0] = arg1; + regs.regs[1] = arg2; + regs.regs[2] = arg3; + regs.regs[3] = arg4; + regs.regs[4] = arg5; + regs.regs[5] = arg6; + regs.regs[6] = 0; + regs.regs[7] = 0; + + err = compel_execute_syscall(ctl, &regs, code_syscall); + + *ret = regs.regs[0]; /* x0: value left by the call (same slot as REG_RES) */ + return err; +} + +void *remote_mmap(struct parasite_ctl *ctl, + void *addr, size_t length, int prot, + int flags, int fd, off_t offset) +{ + long map; + int err; + + err = compel_syscall(ctl, __NR_mmap, &map, + (unsigned long)addr, length, prot, flags, fd, offset); + if (err < 0 || (long)map < 0) /* either failure mode is reported to the caller as NULL */ + map = 0; + + return (void *)map; +} + +void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) +{ + regs->pc = new_ip; + if (stack) /* a NULL stack leaves regs->sp untouched */ + regs->sp = (unsigned long)stack; +} + +bool arch_can_dump_task(struct parasite_ctl *ctl) +{ + /* + * TODO: Add proper check here + */ + return true; +} + +int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) +{ + long ret; + int err; + + err = compel_syscall(ctl, __NR_sigaltstack, + &ret, 0, (unsigned long)&s->uc.uc_stack, + 0, 0, 0, 0); + return err ? 
err : ret; +} + +/* + * Range for task size calculated from the following Linux kernel files: + * arch/arm64/include/asm/memory.h + * arch/arm64/Kconfig + * + * TODO: handle 32 bit tasks + */ +#define TASK_SIZE_MIN (1UL << 39) +#define TASK_SIZE_MAX (1UL << 48) + +unsigned long compel_task_size(void) +{ + unsigned long task_size; + + for (task_size = TASK_SIZE_MIN; task_size < TASK_SIZE_MAX; task_size <<= 1) + if (munmap((void *)task_size, page_size())) + break; + return task_size; +} + diff --git a/CRIU_code/compel/arch/arm/plugins/include/asm/prologue.h b/CRIU_code/compel/arch/arm/plugins/include/asm/prologue.h new file mode 100644 index 0000000..e0275e3 --- /dev/null +++ b/CRIU_code/compel/arch/arm/plugins/include/asm/prologue.h @@ -0,0 +1 @@ +../../../../../arch/x86/plugins/include/asm/prologue.h \ No newline at end of file diff --git a/CRIU_code/compel/arch/arm/plugins/include/asm/syscall-types.h b/CRIU_code/compel/arch/arm/plugins/include/asm/syscall-types.h new file mode 100644 index 0000000..cdb03ef --- /dev/null +++ b/CRIU_code/compel/arch/arm/plugins/include/asm/syscall-types.h @@ -0,0 +1,28 @@ +#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ +#define COMPEL_ARCH_SYSCALL_TYPES_H__ + +#define SA_RESTORER 0x04000000 + +typedef void rt_signalfn_t(int, siginfo_t *, void *); +typedef rt_signalfn_t *rt_sighandler_t; + +typedef void rt_restorefn_t(void); +typedef rt_restorefn_t *rt_sigrestore_t; + +#define _KNSIG 64 +#define _NSIG_BPW 32 + +#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) + +typedef struct { + unsigned long sig[_KNSIG_WORDS]; +} k_rtsigset_t; + +typedef struct { + rt_sighandler_t rt_sa_handler; + unsigned long rt_sa_flags; + rt_sigrestore_t rt_sa_restorer; + k_rtsigset_t rt_sa_mask; +} rt_sigaction_t; + +#endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ diff --git a/CRIU_code/compel/arch/arm/plugins/include/features.h b/CRIU_code/compel/arch/arm/plugins/include/features.h new file mode 100644 index 0000000..b4a3cde --- /dev/null +++ 
b/CRIU_code/compel/arch/arm/plugins/include/features.h @@ -0,0 +1,4 @@ +#ifndef __COMPEL_ARCH_FEATURES_H +#define __COMPEL_ARCH_FEATURES_H + +#endif /* __COMPEL_ARCH_FEATURES_H */ diff --git a/CRIU_code/compel/arch/arm/plugins/std/parasite-head.S b/CRIU_code/compel/arch/arm/plugins/std/parasite-head.S new file mode 100644 index 0000000..e72646b --- /dev/null +++ b/CRIU_code/compel/arch/arm/plugins/std/parasite-head.S @@ -0,0 +1,22 @@ +#include "common/asm/linkage.h" + + .section .head.text, "ax" +ENTRY(__export_parasite_head_start) + sub r2, pc, #8 @ get the address of this instruction + + adr r0, __export_parasite_cmd + ldr r0, [r0] + + adr r1, parasite_args_ptr + ldr r1, [r1] + add r1, r1, r2 @ fixup __export_parasite_args + + bl parasite_service + .byte 0xf0, 0x01, 0xf0, 0xe7 @ the instruction UDF #32 generates the signal SIGTRAP in Linux + +parasite_args_ptr: + .long __export_parasite_args + +__export_parasite_cmd: + .long 0 +END(__export_parasite_head_start) diff --git a/CRIU_code/compel/arch/arm/plugins/std/syscalls/Makefile.syscalls b/CRIU_code/compel/arch/arm/plugins/std/syscalls/Makefile.syscalls new file mode 100644 index 0000000..c89f1a5 --- /dev/null +++ b/CRIU_code/compel/arch/arm/plugins/std/syscalls/Makefile.syscalls @@ -0,0 +1,59 @@ +ccflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ +asflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ + +sys-types := $(obj)/include/uapi/std/syscall-types.h +sys-codes := $(obj)/include/uapi/std/syscall-codes.h +sys-proto := $(obj)/include/uapi/std/syscall.h + +sys-def := $(PLUGIN_ARCH_DIR)/std/syscalls/syscall.def +sys-asm-common-name := std/syscalls/syscall-common.S +sys-asm-common := $(PLUGIN_ARCH_DIR)/$(sys-asm-common-name) +sys-asm-types := $(obj)/include/uapi/std/asm/syscall-types.h +sys-exec-tbl = $(PLUGIN_ARCH_DIR)/std/sys-exec-tbl.c + +sys-gen := $(PLUGIN_ARCH_DIR)/std/syscalls/gen-syscalls.pl +sys-gen-tbl := $(PLUGIN_ARCH_DIR)/std/syscalls/gen-sys-exec-tbl.pl + +sys-asm := 
./$(PLUGIN_ARCH_DIR)/std/syscalls/syscalls.S +std-lib-y += $(sys-asm:.S=).o + +ifeq ($(ARCH),arm) +arch_bits := 32 +else +arch_bits := 64 +endif + +sys-exec-tbl := sys-exec-tbl.c + +$(sys-asm) $(sys-types) $(sys-codes) $(sys-proto): $(sys-gen) $(sys-def) $(sys-asm-common) $(sys-asm-types) + $(E) " GEN " $@ + $(Q) perl \ + $(sys-gen) \ + $(sys-def) \ + $(sys-codes) \ + $(sys-proto) \ + $(sys-asm) \ + $(sys-asm-common-name) \ + $(sys-types) \ + $(arch_bits) + +$(sys-asm:.S=).o: $(sys-asm) + +$(sys-exec-tbl): $(sys-gen-tbl) $(sys-def) + $(E) " GEN " $@ + $(Q) perl \ + $(sys-gen-tbl) \ + $(sys-def) \ + $(sys-exec-tbl) \ + $(arch_bits) + +$(sys-asm-types): $(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h + $(call msg-gen, $@) + $(Q) ln -s ../../../../../../$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(sys-asm-types) + $(Q) ln -s ../../../../../$(PLUGIN_ARCH_DIR)/std/syscalls/syscall-aux.S $(obj)/include/uapi/std/syscall-aux.S + $(Q) ln -s ../../../../../$(PLUGIN_ARCH_DIR)/std/syscalls/syscall-aux.h $(obj)/include/uapi/std/syscall-aux.h + +std-headers-deps += $(sys-asm) $(sys-codes) $(sys-proto) $(sys-asm-types) +mrproper-y += $(std-headers-deps) +mrproper-y += $(obj)/include/uapi/std/syscall-aux.S +mrproper-y += $(obj)/include/uapi/std/syscall-aux.h diff --git a/CRIU_code/compel/arch/arm/plugins/std/syscalls/gen-sys-exec-tbl.pl b/CRIU_code/compel/arch/arm/plugins/std/syscalls/gen-sys-exec-tbl.pl new file mode 100644 index 0000000..2f90c13 --- /dev/null +++ b/CRIU_code/compel/arch/arm/plugins/std/syscalls/gen-sys-exec-tbl.pl @@ -0,0 +1,43 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +my $in = $ARGV[0]; +my $tblout = $ARGV[1]; +my $bits = $ARGV[2]; + +my $code = "code$bits"; + +open TBLOUT, ">", $tblout or die $!; +open IN, "<", $in or die $!; + +print TBLOUT "/* Autogenerated, don't edit */\n"; +print TBLOUT "static struct syscall_exec_desc sc_exec_table[] = {\n"; + +for () { + if ($_ =~ /\#/) { + next; + } + + my $sys_name; + my $sys_num; + + if 
(/(?\S+)\s+(?\S+)\s+(?\d+|\!)\s+(?(?:\d+|\!))\s+\((?.+)\)/) { + $sys_name = $+{alias}; + } elsif (/(?\S+)\s+(?\d+|\!)\s+(?(?:\d+|\!))\s+\((?.+)\)/) { + $sys_name = $+{name}; + } else { + unlink $tblout; + die "Invalid syscall definition file: invalid entry $_\n"; + } + + $sys_num = $+{$code}; + + if ($sys_num ne "!") { + print TBLOUT "SYSCALL($sys_name, $sys_num)\n"; + } +} + +print TBLOUT " { }, /* terminator */"; +print TBLOUT "};" diff --git a/CRIU_code/compel/arch/arm/plugins/std/syscalls/gen-syscalls.pl b/CRIU_code/compel/arch/arm/plugins/std/syscalls/gen-syscalls.pl new file mode 100644 index 0000000..a094211 --- /dev/null +++ b/CRIU_code/compel/arch/arm/plugins/std/syscalls/gen-syscalls.pl @@ -0,0 +1,99 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +my $in = $ARGV[0]; +my $codesout = $ARGV[1]; +my $codes = $ARGV[1]; +$codes =~ s/.*include\/uapi\//compel\/plugins\//g; +my $protosout = $ARGV[2]; +my $protos = $ARGV[2]; +$protos =~ s/.*include\/uapi\//compel\/plugins\//g; +my $asmout = $ARGV[3]; +my $asmcommon = $ARGV[4]; +my $prototypes = $ARGV[5]; +$prototypes =~ s/.*include\/uapi\//compel\/plugins\//g; +my $bits = $ARGV[6]; + +my $codesdef = $codes; +$codesdef =~ tr/.\-\//_/; +my $protosdef = $protos; +$protosdef =~ tr/.\-\//_/; +my $code = "code$bits"; +my $need_aux = 0; + +unlink $codesout; +unlink $protosout; +unlink $asmout; + +open CODESOUT, ">", $codesout or die $!; +open PROTOSOUT, ">", $protosout or die $!; +open ASMOUT, ">", $asmout or die $!; +open IN, "<", $in or die $!; + +print CODESOUT <<"END"; +/* Autogenerated, don't edit */ +#ifndef $codesdef +#define $codesdef +END + +print PROTOSOUT <<"END"; +/* Autogenerated, don't edit */ +#ifndef $protosdef +#define $protosdef +#include <$prototypes> +#include <$codes> +END + +print ASMOUT <<"END"; +/* Autogenerated, don't edit */ +#include <$codes> +#include "$asmcommon" +END + + +for () { + if ($_ =~ /\#/) { + next; + } + + my $code_macro; + my $sys_macro; + my $sys_name; + + if 
(/(?\S+)\s+(?\S+)\s+(?\d+|\!)\s+(?(?:\d+|\!))\s+\((?.+)\)/) { + $code_macro = "__NR_$+{name}"; + $sys_macro = "SYS_$+{name}"; + $sys_name = "sys_$+{alias}"; + } elsif (/(?\S+)\s+(?\d+|\!)\s+(?(?:\d+|\!))\s+\((?.+)\)/) { + $code_macro = "__NR_$+{name}"; + $sys_macro = "SYS_$+{name}"; + $sys_name = "sys_$+{name}"; + } else { + unlink $codesout; + unlink $protosout; + unlink $asmout; + + die "Invalid syscall definition file: invalid entry $_\n"; + } + + if ($+{$code} ne "!") { + print CODESOUT "#ifndef $code_macro\n#define $code_macro $+{$code}\n#endif\n"; + print CODESOUT "#ifndef $sys_macro\n#define $sys_macro $code_macro\n#endif\n"; + print ASMOUT "syscall $sys_name, $code_macro\n"; + + } else { + $need_aux = 1; + } + + print PROTOSOUT "extern long $sys_name($+{args});\n"; +} + +if ($need_aux == 1) { + print ASMOUT "#include \n"; + print CODESOUT "#include \n"; +} + +print CODESOUT "#endif /* $codesdef */"; +print PROTOSOUT "#endif /* $protosdef */"; diff --git a/CRIU_code/compel/arch/arm/plugins/std/syscalls/syscall-aux.S b/CRIU_code/compel/arch/arm/plugins/std/syscalls/syscall-aux.S new file mode 100644 index 0000000..22cc328 --- /dev/null +++ b/CRIU_code/compel/arch/arm/plugins/std/syscalls/syscall-aux.S @@ -0,0 +1,13 @@ +nr_sys_mmap: + .long 192 + +ENTRY(sys_mmap) + push {r4, r5, r7, lr} + ldr r4, [sp, #16] + ldr r5, [sp, #20] + lsr r5, #12 + adr r7, nr_sys_mmap + ldr r7, [r7] + svc 0x00000000 + pop {r4, r5, r7, pc} +END(sys_mmap) diff --git a/CRIU_code/compel/arch/arm/plugins/std/syscalls/syscall-aux.h b/CRIU_code/compel/arch/arm/plugins/std/syscalls/syscall-aux.h new file mode 100644 index 0000000..3d2056b --- /dev/null +++ b/CRIU_code/compel/arch/arm/plugins/std/syscalls/syscall-aux.h @@ -0,0 +1,27 @@ +#ifndef __NR_mmap2 +# define __NR_mmap2 192 +#endif + +#ifndef __ARM_NR_BASE +# define __ARM_NR_BASE 0x0f0000 +#endif + +#ifndef __ARM_NR_breakpoint +# define __ARM_NR_breakpoint (__ARM_NR_BASE+1) +#endif + +#ifndef __ARM_NR_cacheflush +# define 
__ARM_NR_cacheflush (__ARM_NR_BASE+2) +#endif + +#ifndef __ARM_NR_usr26 +# define __ARM_NR_usr26 (__ARM_NR_BASE+3) +#endif + +#ifndef __ARM_NR_usr32 +# define __ARM_NR_usr32 (__ARM_NR_BASE+4) +#endif + +#ifndef __ARM_NR_set_tls +# define __ARM_NR_set_tls (__ARM_NR_BASE+5) +#endif diff --git a/CRIU_code/compel/arch/arm/plugins/std/syscalls/syscall-common.S b/CRIU_code/compel/arch/arm/plugins/std/syscalls/syscall-common.S new file mode 100644 index 0000000..9ac53bd --- /dev/null +++ b/CRIU_code/compel/arch/arm/plugins/std/syscalls/syscall-common.S @@ -0,0 +1,34 @@ +#include "common/asm/linkage.h" + +@ We use the register R8 unlike libc that uses R12. +@ This avoids corruption of the register by the stub +@ for the syscall sys_munmap() when syscalls are hooked +@ by ptrace(). However we have to make sure that +@ the compiler doesn't use the register on the route +@ between parasite_service() and sys_munmap(). + +syscall_common: + ldr r7, [r7] + add r8, sp, #24 + ldm r8, {r4, r5, r6} + svc 0x00000000 + pop {r4, r5, r6, r7, r8, pc} + + +.macro syscall name, nr + .nr_\name : + .long \nr + + ENTRY(\name) + push {r4, r5, r6, r7, r8, lr} + adr r7, .nr_\name + b syscall_common + END(\name) +.endm + + +ENTRY(__cr_restore_rt) + adr r7, .nr_sys_rt_sigreturn + ldr r7, [r7] + svc #0 +END(__cr_restore_rt) diff --git a/CRIU_code/compel/arch/arm/plugins/std/syscalls/syscall.def b/CRIU_code/compel/arch/arm/plugins/std/syscalls/syscall.def new file mode 100644 index 0000000..653a753 --- /dev/null +++ b/CRIU_code/compel/arch/arm/plugins/std/syscalls/syscall.def @@ -0,0 +1,113 @@ +# +# System calls table, please make sure the table consists of only the syscalls +# really used somewhere in the project. +# +# The template is (name and arguments are optional if you need only __NR_x +# defined, but no real entry point in syscalls lib). 
+# +# name/alias code64 code32 arguments +# ----------------------------------------------------------------------- +# +read 63 3 (int fd, void *buf, unsigned long count) +write 64 4 (int fd, const void *buf, unsigned long count) +open ! 5 (const char *filename, unsigned long flags, unsigned long mode) +close 57 6 (int fd) +lseek 62 19 (int fd, unsigned long offset, unsigned long origin) +mmap 222 ! (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) +mprotect 226 125 (const void *addr, unsigned long len, unsigned long prot) +munmap 215 91 (void *addr, unsigned long len) +brk 214 45 (void *addr) +rt_sigaction sigaction 134 174 (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) +rt_sigprocmask sigprocmask 135 175 (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) +rt_sigreturn 139 173 (void) +ioctl 29 54 (unsigned int fd, unsigned int cmd, unsigned long arg) +pread64 67 180 (unsigned int fd, char *buf, size_t count, loff_t pos) +ptrace 117 26 (long request, pid_t pid, void *addr, void *data) +mremap 216 163 (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flag, unsigned long new_addr) +mincore 232 219 (void *addr, unsigned long size, unsigned char *vec) +madvise 233 220 (unsigned long start, size_t len, int behavior) +shmat 196 305 (int shmid, void *shmaddr, int shmflag) +pause 1061 29 (void) +nanosleep 101 162 (struct timespec *req, struct timespec *rem) +getitimer 102 105 (int which, const struct itimerval *val) +setitimer 103 104 (int which, const struct itimerval *val, struct itimerval *old) +getpid 172 20 (void) +socket 198 281 (int domain, int type, int protocol) +connect 203 283 (int sockfd, struct sockaddr *addr, int addrlen) +sendto 206 290 (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) +recvfrom 207 292 (int sockfd, void *ubuf, size_t size, unsigned int flags, struct 
sockaddr *addr, int *addr_len) +sendmsg 211 296 (int sockfd, const struct msghdr *msg, int flags) +recvmsg 212 297 (int sockfd, struct msghdr *msg, int flags) +shutdown 210 293 (int sockfd, int how) +bind 235 282 (int sockfd, const struct sockaddr *addr, int addrlen) +setsockopt 208 294 (int sockfd, int level, int optname, const void *optval, socklen_t optlen) +getsockopt 209 295 (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) +clone 220 120 (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) +exit 93 1 (unsigned long error_code) +wait4 260 114 (int pid, int *status, int options, struct rusage *ru) +waitid 95 280 (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) +kill 129 37 (long pid, int sig) +fcntl 25 55 (int fd, int type, long arg) +flock 32 143 (int fd, unsigned long cmd) +mkdir ! 39 (const char *name, int mode) +rmdir ! 40 (const char *name) +unlink ! 10 (char *pathname) +readlinkat 78 332 (int fd, const char *path, char *buf, int bufsize) +umask 166 60 (int mask) +getgroups 158 205 (int gsize, unsigned int *groups) +setgroups 159 206 (int gsize, unsigned int *groups) +setresuid 147 164 (int uid, int euid, int suid) +getresuid 148 165 (int *uid, int *euid, int *suid) +setresgid 149 170 (int gid, int egid, int sgid) +getresgid 150 171 (int *gid, int *egid, int *sgid) +getpgid 155 132 (pid_t pid) +setfsuid 151 138 (int fsuid) +setfsgid 152 139 (int fsgid) +getsid 156 147 (void) +capget 90 184 (struct cap_header *h, struct cap_data *d) +capset 91 185 (struct cap_header *h, struct cap_data *d) +rt_sigqueueinfo 138 178 (pid_t pid, int sig, siginfo_t *info) +setpriority 140 97 (int which, int who, int nice) +sched_setscheduler 119 156 (int pid, int policy, struct sched_param *p) +sigaltstack 132 186 (const void *uss, void *uoss) +personality 92 136 (unsigned int personality) +prctl 167 172 (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, 
unsigned long arg5) +arch_prctl ! 17 (int option, unsigned long addr) +setrlimit 164 75 (int resource, struct krlimit *rlim) +mount 40 21 (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) +umount2 39 52 (char *name, int flags) +gettid 178 224 (void) +futex 98 240 (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) +set_tid_address 96 256 (int *tid_addr) +restart_syscall 128 0 (void) +timer_create 107 257 (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) +timer_settime 110 258 (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) +timer_gettime 108 259 (int timer_id, const struct itimerspec *setting) +timer_getoverrun 109 260 (int timer_id) +timer_delete 111 261 (kernel_timer_t timer_id) +clock_gettime 113 263 (const clockid_t which_clock, const struct timespec *tp) +exit_group 94 248 (int error_code) +set_robust_list 99 338 (struct robust_list_head *head, size_t len) +get_robust_list 100 339 (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) +signalfd4 74 355 (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) +rt_tgsigqueueinfo 240 363 (pid_t tgid, pid_t pid, int sig, siginfo_t *info) +vmsplice 75 343 (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) +timerfd_settime 86 353 (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) +fanotify_init 262 367 (unsigned int flags, unsigned int event_f_flags) +fanotify_mark 263 368 (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) +open_by_handle_at 265 371 (int mountdirfd, struct file_handle *handle, int flags) +setns 268 375 (int fd, int nstype) +kcmp 272 378 (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) +openat 56 322 (int dirfd, const char *pathname, int flags, mode_t mode) +mkdirat 34 323 (int dirfd, const char *pathname, 
mode_t mode) +unlinkat 35 328 (int dirfd, const char *pathname, int flags) +memfd_create 279 385 (const char *name, unsigned int flags) +io_setup 0 243 (unsigned nr_events, aio_context_t *ctx) +io_submit 2 246 (aio_context_t ctx_id, long nr, struct iocb **iocbpp) +io_getevents 4 245 (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) +seccomp 277 383 (unsigned int op, unsigned int flags, const char *uargs) +gettimeofday 169 78 (struct timeval *tv, struct timezone *tz) +preadv_raw 69 361 (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) +userfaultfd 282 388 (int flags) +fallocate 47 352 (int fd, int mode, loff_t offset, loff_t len) +cacheflush ! 983042 (void *start, void *end, int flags) diff --git a/CRIU_code/compel/arch/arm/scripts/compel-pack.lds.S b/CRIU_code/compel/arch/arm/scripts/compel-pack.lds.S new file mode 100644 index 0000000..f8a4739 --- /dev/null +++ b/CRIU_code/compel/arch/arm/scripts/compel-pack.lds.S @@ -0,0 +1,36 @@ +OUTPUT_ARCH(arm) +EXTERN(__export_parasite_head_start) + +SECTIONS +{ + .crblob 0x0 : { + *(.head.text) + ASSERT(DEFINED(__export_parasite_head_start), + "Symbol __export_parasite_head_start is missing"); + *(.text*) + . = ALIGN(32); + *(.data*) + . = ALIGN(32); + *(.rodata*) + . = ALIGN(32); + *(.bss*) + . = ALIGN(32); + *(.got*) + . = ALIGN(32); + *(.toc*) + . = ALIGN(32); + } =0x00000000, + + /DISCARD/ : { + *(.debug*) + *(.comment*) + *(.note*) + *(.group*) + *(.eh_frame*) + *(*) + } + +/* Parasite args should have 4 bytes align, as we have futex inside. */ +. 
= ALIGN(4); +__export_parasite_args = .; +} diff --git a/CRIU_code/compel/arch/arm/src/lib/cpu.c b/CRIU_code/compel/arch/arm/src/lib/cpu.c new file mode 100644 index 0000000..ceb924b --- /dev/null +++ b/CRIU_code/compel/arch/arm/src/lib/cpu.c @@ -0,0 +1 @@ +../../../aarch64/src/lib/cpu.c \ No newline at end of file diff --git a/CRIU_code/compel/arch/arm/src/lib/handle-elf-host.c b/CRIU_code/compel/arch/arm/src/lib/handle-elf-host.c new file mode 100644 index 0000000..fe46118 --- /dev/null +++ b/CRIU_code/compel/arch/arm/src/lib/handle-elf-host.c @@ -0,0 +1 @@ +handle-elf.c \ No newline at end of file diff --git a/CRIU_code/compel/arch/arm/src/lib/handle-elf.c b/CRIU_code/compel/arch/arm/src/lib/handle-elf.c new file mode 100644 index 0000000..8abf8da --- /dev/null +++ b/CRIU_code/compel/arch/arm/src/lib/handle-elf.c @@ -0,0 +1,22 @@ +#include + +#include "uapi/compel.h" + +#include "handle-elf.h" +#include "piegen.h" +#include "log.h" + +static const unsigned char __maybe_unused +elf_ident_32[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x01, 0x01, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +int handle_binary(void *mem, size_t size) +{ + if (memcmp(mem, elf_ident_32, sizeof(elf_ident_32)) == 0) + return handle_elf_arm(mem, size); + + pr_err("Unsupported Elf format detected\n"); + return -EINVAL; +} diff --git a/CRIU_code/compel/arch/arm/src/lib/include/cpu.h b/CRIU_code/compel/arch/arm/src/lib/include/cpu.h new file mode 100644 index 0000000..e69de29 diff --git a/CRIU_code/compel/arch/arm/src/lib/include/handle-elf.h b/CRIU_code/compel/arch/arm/src/lib/include/handle-elf.h new file mode 100644 index 0000000..e5971f3 --- /dev/null +++ b/CRIU_code/compel/arch/arm/src/lib/include/handle-elf.h @@ -0,0 +1,11 @@ +#ifndef COMPEL_HANDLE_ELF_H__ +#define COMPEL_HANDLE_ELF_H__ + +#include "elf32-types.h" + +#define __handle_elf handle_elf_arm +#define arch_is_machine_supported(e_machine) (e_machine == EM_ARM) + +extern int handle_elf_arm(void *mem, 
size_t size); + +#endif /* COMPEL_HANDLE_ELF_H__ */ diff --git a/CRIU_code/compel/arch/arm/src/lib/include/syscall.h b/CRIU_code/compel/arch/arm/src/lib/include/syscall.h new file mode 100644 index 0000000..e2ec127 --- /dev/null +++ b/CRIU_code/compel/arch/arm/src/lib/include/syscall.h @@ -0,0 +1,4 @@ +#ifndef __COMPEL_SYSCALL_H__ +#define __COMPEL_SYSCALL_H__ +#define __NR(syscall, compat) __NR_##syscall +#endif diff --git a/CRIU_code/compel/arch/arm/src/lib/include/uapi/asm/.gitignore b/CRIU_code/compel/arch/arm/src/lib/include/uapi/asm/.gitignore new file mode 100644 index 0000000..e69de29 diff --git a/CRIU_code/compel/arch/arm/src/lib/include/uapi/asm/breakpoints.h b/CRIU_code/compel/arch/arm/src/lib/include/uapi/asm/breakpoints.h new file mode 100644 index 0000000..5f09049 --- /dev/null +++ b/CRIU_code/compel/arch/arm/src/lib/include/uapi/asm/breakpoints.h @@ -0,0 +1,15 @@ +#ifndef __COMPEL_BREAKPOINTS_H__ +#define __COMPEL_BREAKPOINTS_H__ +#define ARCH_SI_TRAP TRAP_BRKPT + +static inline int ptrace_set_breakpoint(pid_t pid, void *addr) +{ + return 0; +} + +static inline int ptrace_flush_breakpoints(pid_t pid) +{ + return 0; +} + +#endif diff --git a/CRIU_code/compel/arch/arm/src/lib/include/uapi/asm/cpu.h b/CRIU_code/compel/arch/arm/src/lib/include/uapi/asm/cpu.h new file mode 100644 index 0000000..c35460e --- /dev/null +++ b/CRIU_code/compel/arch/arm/src/lib/include/uapi/asm/cpu.h @@ -0,0 +1,6 @@ +#ifndef UAPI_COMPEL_ASM_CPU_H__ +#define UAPI_COMPEL_ASM_CPU_H__ + +typedef struct { } compel_cpuinfo_t; + +#endif /* UAPI_COMPEL_ASM_CPU_H__ */ diff --git a/CRIU_code/compel/arch/arm/src/lib/include/uapi/asm/fpu.h b/CRIU_code/compel/arch/arm/src/lib/include/uapi/asm/fpu.h new file mode 100644 index 0000000..7f476d5 --- /dev/null +++ b/CRIU_code/compel/arch/arm/src/lib/include/uapi/asm/fpu.h @@ -0,0 +1,4 @@ +#ifndef __CR_ASM_FPU_H__ +#define __CR_ASM_FPU_H__ + +#endif /* __CR_ASM_FPU_H__ */ diff --git 
a/CRIU_code/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h b/CRIU_code/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h new file mode 100644 index 0000000..b8286d4 --- /dev/null +++ b/CRIU_code/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h @@ -0,0 +1,66 @@ +#ifndef UAPI_COMPEL_ASM_TYPES_H__ +#define UAPI_COMPEL_ASM_TYPES_H__ + +#include +#include + +#define SIGMAX 64 +#define SIGMAX_OLD 31 + +/* + * Copied from the Linux kernel header arch/arm/include/asm/ptrace.h + * + * A thread ARM CPU context + */ + +typedef struct { + long uregs[18]; +} user_regs_struct_t; + +typedef struct user_vfp user_fpregs_struct_t; + +#define ARM_cpsr uregs[16] +#define ARM_pc uregs[15] +#define ARM_lr uregs[14] +#define ARM_sp uregs[13] +#define ARM_ip uregs[12] +#define ARM_fp uregs[11] +#define ARM_r10 uregs[10] +#define ARM_r9 uregs[9] +#define ARM_r8 uregs[8] +#define ARM_r7 uregs[7] +#define ARM_r6 uregs[6] +#define ARM_r5 uregs[5] +#define ARM_r4 uregs[4] +#define ARM_r3 uregs[3] +#define ARM_r2 uregs[2] +#define ARM_r1 uregs[1] +#define ARM_r0 uregs[0] +#define ARM_ORIG_r0 uregs[17] + + +/* Copied from arch/arm/include/asm/user.h */ + +struct user_vfp { + unsigned long long fpregs[32]; + unsigned long fpscr; +}; + +struct user_vfp_exc { + unsigned long fpexc; + unsigned long fpinst; + unsigned long fpinst2; +}; + +#define REG_RES(regs) ((regs).ARM_r0) +#define REG_IP(regs) ((regs).ARM_pc) +#define REG_SP(regs) ((regs).ARM_sp) +#define REG_SYSCALL_NR(regs) ((regs).ARM_r7) + +#define user_regs_native(pregs) true + +#define ARCH_SI_TRAP TRAP_BRKPT + +#define __NR(syscall, compat) __NR_##syscall + +#endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/CRIU_code/compel/arch/arm/src/lib/include/uapi/asm/processor-flags.h b/CRIU_code/compel/arch/arm/src/lib/include/uapi/asm/processor-flags.h new file mode 100644 index 0000000..8745f44 --- /dev/null +++ b/CRIU_code/compel/arch/arm/src/lib/include/uapi/asm/processor-flags.h @@ -0,0 +1,42 @@ +#ifndef 
__CR_PROCESSOR_FLAGS_H__ +#define __CR_PROCESSOR_FLAGS_H__ + +/* Copied from the Linux kernel header arch/arm/include/uapi/asm/ptrace.h */ + +/* + * PSR bits + */ +#define USR26_MODE 0x00000000 +#define FIQ26_MODE 0x00000001 +#define IRQ26_MODE 0x00000002 +#define SVC26_MODE 0x00000003 +#define USR_MODE 0x00000010 +#define FIQ_MODE 0x00000011 +#define IRQ_MODE 0x00000012 +#define SVC_MODE 0x00000013 +#define ABT_MODE 0x00000017 +#define UND_MODE 0x0000001b +#define SYSTEM_MODE 0x0000001f +#define MODE32_BIT 0x00000010 +#define MODE_MASK 0x0000001f +#define PSR_T_BIT 0x00000020 +#define PSR_F_BIT 0x00000040 +#define PSR_I_BIT 0x00000080 +#define PSR_A_BIT 0x00000100 +#define PSR_E_BIT 0x00000200 +#define PSR_J_BIT 0x01000000 +#define PSR_Q_BIT 0x08000000 +#define PSR_V_BIT 0x10000000 +#define PSR_C_BIT 0x20000000 +#define PSR_Z_BIT 0x40000000 +#define PSR_N_BIT 0x80000000 + +/* + * Groups of PSR bits + */ +#define PSR_f 0xff000000 /* Flags */ +#define PSR_s 0x00ff0000 /* Status */ +#define PSR_x 0x0000ff00 /* Extension */ +#define PSR_c 0x000000ff /* Control */ + +#endif diff --git a/CRIU_code/compel/arch/arm/src/lib/include/uapi/asm/sigframe.h b/CRIU_code/compel/arch/arm/src/lib/include/uapi/asm/sigframe.h new file mode 100644 index 0000000..23ada50 --- /dev/null +++ b/CRIU_code/compel/arch/arm/src/lib/include/uapi/asm/sigframe.h @@ -0,0 +1,90 @@ +#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ +#define UAPI_COMPEL_ASM_SIGFRAME_H__ + +#include + +/* Copied from the Linux kernel header arch/arm/include/asm/sigcontext.h */ + +struct rt_sigcontext { + unsigned long trap_no; + unsigned long error_code; + unsigned long oldmask; + unsigned long arm_r0; + unsigned long arm_r1; + unsigned long arm_r2; + unsigned long arm_r3; + unsigned long arm_r4; + unsigned long arm_r5; + unsigned long arm_r6; + unsigned long arm_r7; + unsigned long arm_r8; + unsigned long arm_r9; + unsigned long arm_r10; + unsigned long arm_fp; + unsigned long arm_ip; + unsigned long arm_sp; + unsigned long 
arm_lr; + unsigned long arm_pc; + unsigned long arm_cpsr; + unsigned long fault_address; +}; + +/* Copied from the Linux kernel header arch/arm/include/asm/ucontext.h */ + +#define VFP_MAGIC 0x56465001 +#define VFP_STORAGE_SIZE sizeof(struct vfp_sigframe) + +struct vfp_sigframe { + unsigned long magic; + unsigned long size; + struct user_vfp ufp; + struct user_vfp_exc ufp_exc; +}; + +typedef struct vfp_sigframe fpu_state_t; + +struct aux_sigframe { + /* + struct crunch_sigframe crunch; + struct iwmmxt_sigframe iwmmxt; + */ + + struct vfp_sigframe vfp; + unsigned long end_magic; +} __attribute__((aligned(8))); + +#include + +struct sigframe { + struct rt_ucontext uc; + unsigned long retcode[2]; +}; + +struct rt_sigframe { + struct rt_siginfo info; + struct sigframe sig; +}; + + +#define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ + asm volatile( \ + "mov sp, %0 \n" \ + "mov r7, #"__stringify(__NR_rt_sigreturn)" \n" \ + "svc #0 \n" \ + : \ + : "r"(new_sp) \ + : "memory") + +#define RT_SIGFRAME_UC(rt_sigframe) (&rt_sigframe->sig.uc) +#define RT_SIGFRAME_REGIP(rt_sigframe) (rt_sigframe)->sig.uc.uc_mcontext.arm_ip +#define RT_SIGFRAME_HAS_FPU(rt_sigframe) 1 +#define RT_SIGFRAME_AUX_SIGFRAME(rt_sigframe) ((struct aux_sigframe *)&(rt_sigframe)->sig.uc.uc_regspace) +#define RT_SIGFRAME_FPU(rt_sigframe) (&RT_SIGFRAME_AUX_SIGFRAME(rt_sigframe)->vfp) +#define RT_SIGFRAME_OFFSET(rt_sigframe) 0 + +#define rt_sigframe_erase_sigset(sigframe) \ + memset(&sigframe->sig.uc.uc_sigmask, 0, sizeof(k_rtsigset_t)) +#define rt_sigframe_copy_sigset(sigframe, from) \ + memcpy(&sigframe->sig.uc.uc_sigmask, from, sizeof(k_rtsigset_t)) + +#endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ diff --git a/CRIU_code/compel/arch/arm/src/lib/infect.c b/CRIU_code/compel/arch/arm/src/lib/infect.c new file mode 100644 index 0000000..c17cb9c --- /dev/null +++ b/CRIU_code/compel/arch/arm/src/lib/infect.c @@ -0,0 +1,195 @@ +#include +#include +#include +#include +#include +#include "common/page.h" +#include 
"uapi/compel/asm/infect-types.h" +#include "log.h" +#include "errno.h" +#include "infect.h" +#include "infect-priv.h" + +/* + * Injected syscall instruction + */ +const char code_syscall[] = { + 0x00, 0x00, 0x00, 0xef, /* SVC #0 */ + 0xf0, 0x01, 0xf0, 0xe7 /* UDF #32 */ +}; + +static const int +code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long)); + +static inline __always_unused void __check_code_syscall(void) +{ + BUILD_BUG_ON(code_syscall_aligned != BUILTIN_SYSCALL_SIZE); + BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); +} + +int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, + user_regs_struct_t *regs, + user_fpregs_struct_t *fpregs) +{ + struct aux_sigframe *aux = (struct aux_sigframe *)(void *)&sigframe->sig.uc.uc_regspace; + + sigframe->sig.uc.uc_mcontext.arm_r0 = regs->ARM_r0; + sigframe->sig.uc.uc_mcontext.arm_r1 = regs->ARM_r1; + sigframe->sig.uc.uc_mcontext.arm_r2 = regs->ARM_r2; + sigframe->sig.uc.uc_mcontext.arm_r3 = regs->ARM_r3; + sigframe->sig.uc.uc_mcontext.arm_r4 = regs->ARM_r4; + sigframe->sig.uc.uc_mcontext.arm_r5 = regs->ARM_r5; + sigframe->sig.uc.uc_mcontext.arm_r6 = regs->ARM_r6; + sigframe->sig.uc.uc_mcontext.arm_r7 = regs->ARM_r7; + sigframe->sig.uc.uc_mcontext.arm_r8 = regs->ARM_r8; + sigframe->sig.uc.uc_mcontext.arm_r9 = regs->ARM_r9; + sigframe->sig.uc.uc_mcontext.arm_r10 = regs->ARM_r10; + sigframe->sig.uc.uc_mcontext.arm_fp = regs->ARM_fp; + sigframe->sig.uc.uc_mcontext.arm_ip = regs->ARM_ip; + sigframe->sig.uc.uc_mcontext.arm_sp = regs->ARM_sp; + sigframe->sig.uc.uc_mcontext.arm_lr = regs->ARM_lr; + sigframe->sig.uc.uc_mcontext.arm_pc = regs->ARM_pc; + sigframe->sig.uc.uc_mcontext.arm_cpsr = regs->ARM_cpsr; + + memcpy(&aux->vfp.ufp.fpregs, &fpregs->fpregs, sizeof(aux->vfp.ufp.fpregs)); + aux->vfp.ufp.fpscr = fpregs->fpscr; + aux->vfp.magic = VFP_MAGIC; + aux->vfp.size = VFP_STORAGE_SIZE; + + return 0; +} + +int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, + struct rt_sigframe *rsigframe) +{ + 
return 0; +} + +#define PTRACE_GETVFPREGS 27 +int get_task_regs(pid_t pid, user_regs_struct_t *regs, save_regs_t save, + void *arg, __maybe_unused unsigned long flags) +{ + user_fpregs_struct_t vfp; + int ret = -1; + + pr_info("Dumping GP/FPU registers for %d\n", pid); + + if (ptrace(PTRACE_GETVFPREGS, pid, NULL, &vfp)) { + pr_perror("Can't obtain FPU registers for %d", pid); + goto err; + } + + /* Did we come from a system call? */ + if ((int)regs->ARM_ORIG_r0 >= 0) { + /* Restart the system call */ + switch ((long)(int)regs->ARM_r0) { + case -ERESTARTNOHAND: + case -ERESTARTSYS: + case -ERESTARTNOINTR: + regs->ARM_r0 = regs->ARM_ORIG_r0; + regs->ARM_pc -= 4; + break; + case -ERESTART_RESTARTBLOCK: + regs->ARM_r0 = __NR_restart_syscall; + regs->ARM_pc -= 4; + break; + } + } + + ret = save(arg, regs, &vfp); +err: + return ret; +} + +int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, + unsigned long arg1, + unsigned long arg2, + unsigned long arg3, + unsigned long arg4, + unsigned long arg5, + unsigned long arg6) +{ + user_regs_struct_t regs = ctl->orig.regs; + int err; + + regs.ARM_r7 = (unsigned long)nr; + regs.ARM_r0 = arg1; + regs.ARM_r1 = arg2; + regs.ARM_r2 = arg3; + regs.ARM_r3 = arg4; + regs.ARM_r4 = arg5; + regs.ARM_r5 = arg6; + + err = compel_execute_syscall(ctl, ®s, code_syscall); + + *ret = regs.ARM_r0; + return err; +} + +void *remote_mmap(struct parasite_ctl *ctl, + void *addr, size_t length, int prot, + int flags, int fd, off_t offset) +{ + long map; + int err; + + if (offset & ~PAGE_MASK) + return 0; + + err = compel_syscall(ctl, __NR_mmap2, &map, + (unsigned long)addr, length, prot, flags, fd, offset >> 12); + if (err < 0 || map > ctl->ictx.task_size) + map = 0; + + return (void *)map; +} + +void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) +{ + regs->ARM_pc = new_ip; + if (stack) + regs->ARM_sp = (unsigned long)stack; + + /* Make sure flags are in known state */ + regs->ARM_cpsr &= PSR_f | PSR_s | 
PSR_x | MODE32_BIT; +} + +bool arch_can_dump_task(struct parasite_ctl *ctl) +{ + /* + * TODO: Add proper check here + */ + return true; +} + +int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) +{ + long ret; + int err; + + err = compel_syscall(ctl, __NR_sigaltstack, + &ret, 0, (unsigned long)&s->sig.uc.uc_stack, + 0, 0, 0, 0); + return err ? err : ret; +} + +/* + * Range for task size calculated from the following Linux kernel files: + * arch/arm/include/asm/memory.h + * arch/arm/Kconfig (PAGE_OFFSET values in Memory split section) + */ +#define TASK_SIZE_MIN 0x3f000000 +#define TASK_SIZE_MAX 0xbf000000 +#define SZ_1G 0x40000000 + +unsigned long compel_task_size(void) +{ + unsigned long task_size; + + for (task_size = TASK_SIZE_MIN; task_size < TASK_SIZE_MAX; task_size += SZ_1G) + if (munmap((void *)task_size, page_size())) + break; + + return task_size; +} + diff --git a/CRIU_code/compel/arch/ppc64/plugins/include/asm/prologue.h b/CRIU_code/compel/arch/ppc64/plugins/include/asm/prologue.h new file mode 100644 index 0000000..e0275e3 --- /dev/null +++ b/CRIU_code/compel/arch/ppc64/plugins/include/asm/prologue.h @@ -0,0 +1 @@ +../../../../../arch/x86/plugins/include/asm/prologue.h \ No newline at end of file diff --git a/CRIU_code/compel/arch/ppc64/plugins/include/asm/syscall-types.h b/CRIU_code/compel/arch/ppc64/plugins/include/asm/syscall-types.h new file mode 100644 index 0000000..7754721 --- /dev/null +++ b/CRIU_code/compel/arch/ppc64/plugins/include/asm/syscall-types.h @@ -0,0 +1,28 @@ +#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ +#define COMPEL_ARCH_SYSCALL_TYPES_H__ + +#define SA_RESTORER 0x04000000U + +typedef void rt_signalfn_t(int, siginfo_t *, void *); +typedef rt_signalfn_t *rt_sighandler_t; + +typedef void rt_restorefn_t(void); +typedef rt_restorefn_t *rt_sigrestore_t; + +#define _KNSIG 64 +#define _NSIG_BPW 64 + +#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) + +typedef struct { + unsigned long sig[_KNSIG_WORDS]; +} k_rtsigset_t; + +typedef struct 
{ + rt_sighandler_t rt_sa_handler; + unsigned long rt_sa_flags; + rt_sigrestore_t rt_sa_restorer; + k_rtsigset_t rt_sa_mask; +} rt_sigaction_t; + +#endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ diff --git a/CRIU_code/compel/arch/ppc64/plugins/include/features.h b/CRIU_code/compel/arch/ppc64/plugins/include/features.h new file mode 100644 index 0000000..d7dd507 --- /dev/null +++ b/CRIU_code/compel/arch/ppc64/plugins/include/features.h @@ -0,0 +1,7 @@ +#ifndef __COMPEL_ARCH_FEATURES_H +#define __COMPEL_ARCH_FEATURES_H + +#define ARCH_HAS_MEMCPY +#define ARCH_HAS_MEMCMP + +#endif /* __COMPEL_ARCH_FEATURES_H */ diff --git a/CRIU_code/compel/arch/ppc64/plugins/std/memcmp.S b/CRIU_code/compel/arch/ppc64/plugins/std/memcmp.S new file mode 100644 index 0000000..7f7fe91 --- /dev/null +++ b/CRIU_code/compel/arch/ppc64/plugins/std/memcmp.S @@ -0,0 +1,236 @@ +/* + * Author: Anton Blanchard + * Copyright 2015 IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * -- + * Copied from the Linux file arch/powerpc/lib/memcmp_64.S + */ +#include "common/asm/linkage.h" + +#define off8 r6 +#define off16 r7 +#define off24 r8 + +#define rA r9 +#define rB r10 +#define rC r11 +#define rD r27 +#define rE r28 +#define rF r29 +#define rG r30 +#define rH r31 + +#ifdef __LITTLE_ENDIAN__ +#define LD ldbrx +#else +#define LD ldx +#endif + +ENTRY(memcmp) + cmpdi cr1,r5,0 + + /* Use the short loop if both strings are not 8B aligned */ + or r6,r3,r4 + andi. r6,r6,7 + + /* Use the short loop if length is less than 32B */ + cmpdi cr6,r5,31 + + beq cr1,.Lzero + bne .Lshort + bgt cr6,.Llong + +.Lshort: + mtctr r5 + +1: lbz rA,0(r3) + lbz rB,0(r4) + subf. rC,rB,rA + bne .Lnon_zero + bdz .Lzero + + lbz rA,1(r3) + lbz rB,1(r4) + subf.
rC,rB,rA + bne .Lnon_zero + bdz .Lzero + + lbz rA,2(r3) + lbz rB,2(r4) + subf. rC,rB,rA + bne .Lnon_zero + bdz .Lzero + + lbz rA,3(r3) + lbz rB,3(r4) + subf. rC,rB,rA + bne .Lnon_zero + + addi r3,r3,4 + addi r4,r4,4 + + bdnz 1b + +.Lzero: + li r3,0 + blr + +.Lnon_zero: + mr r3,rC + blr + +.Llong: + li off8,8 + li off16,16 + li off24,24 + + std r31,-8(r1) + std r30,-16(r1) + std r29,-24(r1) + std r28,-32(r1) + std r27,-40(r1) + + srdi r0,r5,5 + mtctr r0 + andi. r5,r5,31 + + LD rA,0,r3 + LD rB,0,r4 + + LD rC,off8,r3 + LD rD,off8,r4 + + LD rE,off16,r3 + LD rF,off16,r4 + + LD rG,off24,r3 + LD rH,off24,r4 + cmpld cr0,rA,rB + + addi r3,r3,32 + addi r4,r4,32 + + bdz .Lfirst32 + + LD rA,0,r3 + LD rB,0,r4 + cmpld cr1,rC,rD + + LD rC,off8,r3 + LD rD,off8,r4 + cmpld cr6,rE,rF + + LD rE,off16,r3 + LD rF,off16,r4 + cmpld cr7,rG,rH + bne cr0,.LcmpAB + + LD rG,off24,r3 + LD rH,off24,r4 + cmpld cr0,rA,rB + bne cr1,.LcmpCD + + addi r3,r3,32 + addi r4,r4,32 + + bdz .Lsecond32 + + .balign 16 + +1: LD rA,0,r3 + LD rB,0,r4 + cmpld cr1,rC,rD + bne cr6,.LcmpEF + + LD rC,off8,r3 + LD rD,off8,r4 + cmpld cr6,rE,rF + bne cr7,.LcmpGH + + LD rE,off16,r3 + LD rF,off16,r4 + cmpld cr7,rG,rH + bne cr0,.LcmpAB + + LD rG,off24,r3 + LD rH,off24,r4 + cmpld cr0,rA,rB + bne cr1,.LcmpCD + + addi r3,r3,32 + addi r4,r4,32 + + bdnz 1b + +.Lsecond32: + cmpld cr1,rC,rD + bne cr6,.LcmpEF + + cmpld cr6,rE,rF + bne cr7,.LcmpGH + + cmpld cr7,rG,rH + bne cr0,.LcmpAB + + bne cr1,.LcmpCD + bne cr6,.LcmpEF + bne cr7,.LcmpGH + +.Ltail: + ld r31,-8(r1) + ld r30,-16(r1) + ld r29,-24(r1) + ld r28,-32(r1) + ld r27,-40(r1) + + cmpdi r5,0 + beq .Lzero + b .Lshort + +.Lfirst32: + cmpld cr1,rC,rD + cmpld cr6,rE,rF + cmpld cr7,rG,rH + + bne cr0,.LcmpAB + bne cr1,.LcmpCD + bne cr6,.LcmpEF + bne cr7,.LcmpGH + + b .Ltail + +.LcmpAB: + li r3,1 + bgt cr0,.Lout + li r3,-1 + b .Lout + +.LcmpCD: + li r3,1 + bgt cr1,.Lout + li r3,-1 + b .Lout + +.LcmpEF: + li r3,1 + bgt cr6,.Lout + li r3,-1 + b .Lout + +.LcmpGH: + li r3,1 + bgt 
cr7,.Lout + li r3,-1 + +.Lout: + ld r31,-8(r1) + ld r30,-16(r1) + ld r29,-24(r1) + ld r28,-32(r1) + ld r27,-40(r1) + blr diff --git a/CRIU_code/compel/arch/ppc64/plugins/std/memcpy.S b/CRIU_code/compel/arch/ppc64/plugins/std/memcpy.S new file mode 100644 index 0000000..e1afb7e --- /dev/null +++ b/CRIU_code/compel/arch/ppc64/plugins/std/memcpy.S @@ -0,0 +1,212 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard + * + * -- + * Copied from the kernel file arch/powerpc/lib/memcpy_power7.S + * Altivec support has been removed so we don't taint the restored process. + */ +#include "common/asm/linkage.h" + +/* + * When building the parasite code, the compiler may rely on the C library + * service memcpy to initialise big local variables on the stack.
+ */ +ENTRY(memcpy) + cmpldi r5,16 + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + blt .Lshort_copy + +.Lnonvmx_copy: + /* Get the source 8B aligned */ + neg r6,r4 + mtocrf 0x01,r6 + clrldi r6,r6,(64-3) + + bf cr7*4+3,1f + lbz r0,0(r4) + addi r4,r4,1 + stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +3: sub r5,r5,r6 + cmpldi r5,128 + blt 5f + + mflr r0 + stdu r1,-STACKFRAMESIZE(r1) + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + std r17,STK_REG(R17)(r1) + std r18,STK_REG(R18)(r1) + std r19,STK_REG(R19)(r1) + std r20,STK_REG(R20)(r1) + std r21,STK_REG(R21)(r1) + std r22,STK_REG(R22)(r1) + std r0,STACKFRAMESIZE+16(r1) + + srdi r6,r5,7 + mtctr r6 + + /* Now do cacheline (128B) sized loads and stores. */ + .align 5 +4: + ld r0,0(r4) + ld r6,8(r4) + ld r7,16(r4) + ld r8,24(r4) + ld r9,32(r4) + ld r10,40(r4) + ld r11,48(r4) + ld r12,56(r4) + ld r14,64(r4) + ld r15,72(r4) + ld r16,80(r4) + ld r17,88(r4) + ld r18,96(r4) + ld r19,104(r4) + ld r20,112(r4) + ld r21,120(r4) + addi r4,r4,128 + std r0,0(r3) + std r6,8(r3) + std r7,16(r3) + std r8,24(r3) + std r9,32(r3) + std r10,40(r3) + std r11,48(r3) + std r12,56(r3) + std r14,64(r3) + std r15,72(r3) + std r16,80(r3) + std r17,88(r3) + std r18,96(r3) + std r19,104(r3) + std r20,112(r3) + std r21,120(r3) + addi r3,r3,128 + bdnz 4b + + clrldi r5,r5,(64-7) + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + ld r17,STK_REG(R17)(r1) + ld r18,STK_REG(R18)(r1) + ld r19,STK_REG(R19)(r1) + ld r20,STK_REG(R20)(r1) + ld r21,STK_REG(R21)(r1) + ld r22,STK_REG(R22)(r1) + addi r1,r1,STACKFRAMESIZE + + /* Up to 127B to go */ +5: srdi r6,r5,4 + mtocrf 0x01,r6 + +6: bf cr7*4+1,7f + ld r0,0(r4) + ld r6,8(r4) + ld r7,16(r4) + ld r8,24(r4) + ld r9,32(r4) + ld r10,40(r4) + ld r11,48(r4) + ld r12,56(r4) + addi r4,r4,64 + std r0,0(r3) + std r6,8(r3) + 
std r7,16(r3) + std r8,24(r3) + std r9,32(r3) + std r10,40(r3) + std r11,48(r3) + std r12,56(r3) + addi r3,r3,64 + + /* Up to 63B to go */ +7: bf cr7*4+2,8f + ld r0,0(r4) + ld r6,8(r4) + ld r7,16(r4) + ld r8,24(r4) + addi r4,r4,32 + std r0,0(r3) + std r6,8(r3) + std r7,16(r3) + std r8,24(r3) + addi r3,r3,32 + + /* Up to 31B to go */ +8: bf cr7*4+3,9f + ld r0,0(r4) + ld r6,8(r4) + addi r4,r4,16 + std r0,0(r3) + std r6,8(r3) + addi r3,r3,16 + +9: clrldi r5,r5,(64-4) + + /* Up to 15B to go */ +.Lshort_copy: + mtocrf 0x01,r5 + bf cr7*4+0,12f + lwz r0,0(r4) /* Less chance of a reject with word ops */ + lwz r6,4(r4) + addi r4,r4,8 + stw r0,0(r3) + stw r6,4(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f + lbz r0,0(r4) + stb r0,0(r3) + +15: ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + blr + +.Lunwind_stack_nonvmx_copy: + addi r1,r1,STACKFRAMESIZE + b .Lnonvmx_copy + diff --git a/CRIU_code/compel/arch/ppc64/plugins/std/parasite-head.S b/CRIU_code/compel/arch/ppc64/plugins/std/parasite-head.S new file mode 100644 index 0000000..c870efd --- /dev/null +++ b/CRIU_code/compel/arch/ppc64/plugins/std/parasite-head.S @@ -0,0 +1,45 @@ +#include "common/asm/linkage.h" + + .section .head.text + .align 8 + +ENTRY(__export_parasite_head_start) + + // int __used parasite_service(unsigned int cmd, void *args) + // cmd = r3 = *__export_parasite_cmd (u32 ?) 
+ // args = r4 = @parasite_args_ptr + @pc + bl 0f +0: mflr r2 + +#define LOAD_REG_ADDR(reg, name) \ + addis reg,r2,(name - 0b)@ha; \ + addi reg,r2,(name - 0b)@l; + + LOAD_REG_ADDR(r3,__export_parasite_cmd) + lwz r3,0(r3) + + LOAD_REG_ADDR(r4,parasite_args_ptr) + ld r4,0(r4) + + LOAD_REG_ADDR(r12,parasite_service_ptr) + ld r12,0(r12) + mtctr r12 + + bctrl // call parasite_service + twi 31,0,0 // Should generate SIGTRAP + +parasite_args_ptr: + .quad __export_parasite_args + +parasite_service_ptr: + // We want to run the function prototype to set r2. + // Since the relocation will prefer the local entry + // point, we force it to the global one which is 2 + // instructions above the local one. + // FIXME: There should be a way to specify the global entry here. + .quad parasite_service - 8 + +__export_parasite_cmd: + .long 0 + +END(__export_parasite_head_start) diff --git a/CRIU_code/compel/arch/ppc64/plugins/std/syscalls/Makefile.syscalls b/CRIU_code/compel/arch/ppc64/plugins/std/syscalls/Makefile.syscalls new file mode 100644 index 0000000..c0c22bf --- /dev/null +++ b/CRIU_code/compel/arch/ppc64/plugins/std/syscalls/Makefile.syscalls @@ -0,0 +1,57 @@ +ccflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ +asflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ + +sys-types := $(obj)/include/uapi/std/syscall-types.h +sys-codes := $(obj)/include/uapi/std/syscall-codes.h +sys-proto := $(obj)/include/uapi/std/syscall.h + +sys-def := $(PLUGIN_ARCH_DIR)/std/syscalls/syscall-ppc64.tbl +sys-asm-common-name := std/syscalls/syscall-common-ppc64.S +sys-asm-common := $(PLUGIN_ARCH_DIR)/$(sys-asm-common-name) +sys-asm-types := $(obj)/include/uapi/std/asm/syscall-types.h +sys-exec-tbl = $(PLUGIN_ARCH_DIR)/std/sys-exec-tbl.c + +sys-asm := ./$(PLUGIN_ARCH_DIR)/std/syscalls/syscalls.S +std-lib-y += $(sys-asm:.S=).o + +$(sys-codes): $(sys-def) + $(E) " GEN " $@ + $(Q) echo "/* Autogenerated, don't edit */" > $@ + $(Q) echo "#ifndef __ASM_CR_SYSCALL_CODES_H__" >> $@ + $(Q) echo "#define 
__ASM_CR_SYSCALL_CODES_H__" >> $@ + $(Q) cat $< | awk '/^__NR/{SYSN=$$1; sub("^__NR", "SYS", SYSN);'\ + 'print "\n#ifndef ", $$1, "\n#define", $$1, $$2, "\n#endif";'\ + 'print "#ifndef ", SYSN, "\n#define ", SYSN, $$1, "\n#endif"}' >> $@ + $(Q) echo "#endif /* __ASM_CR_SYSCALL_CODES_H__ */" >> $@ + +$(sys-proto): $(sys-def) + $(E) " GEN " $@ + $(Q) echo "/* Autogenerated, don't edit */" > $@ + $(Q) echo "#ifndef __ASM_CR_SYSCALL_PROTO_H__" >> $@ + $(Q) echo "#define __ASM_CR_SYSCALL_PROTO_H__" >> $@ + $(Q) echo "#include " >> $@ + $(Q) echo "#include " >> $@ + $(Q) cat $< | awk '/^__NR/{print "extern long", $$3, substr($$0, index($$0,$$4)), ";"}' >> $@ + $(Q) echo "#endif /* __ASM_CR_SYSCALL_PROTO_H__ */" >> $@ + +$(sys-asm): $(sys-def) $(sys-asm-common) $(sys-codes) $(sys-proto) + $(E) " GEN " $@ + $(Q) echo "/* Autogenerated, don't edit */" > $@ + $(Q) echo "#include " >> $@ + $(Q) echo "#include \"$(sys-asm-common-name)\"" >> $@ + $(Q) cat $< | awk '/^__NR/{print "SYSCALL(", $$3, ",", $$2, ")"}' >> $@ + +$(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) + $(E) " GEN " $@ + $(Q) echo "/* Autogenerated, don't edit */" > $@ + $(Q) echo "static struct syscall_exec_desc sc_exec_table[] = {" >> $@ + $(Q) cat $< | awk '/^__NR/{print "SYSCALL(", substr($$3, 5), ",", $$2, ")"}' >> $@ + $(Q) echo " { }, /* terminator */" >> $@ + $(Q) echo "};" >> $@ + +$(sys-asm-types): $(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h + $(call msg-gen, $@) + $(Q) ln -s ../../../../../../$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(sys-asm-types) + +std-headers-deps += $(sys-asm) $(sys-codes) $(sys-proto) $(sys-asm-types) +mrproper-y += $(std-headers-deps) diff --git a/CRIU_code/compel/arch/ppc64/plugins/std/syscalls/syscall-common-ppc64.S b/CRIU_code/compel/arch/ppc64/plugins/std/syscalls/syscall-common-ppc64.S new file mode 100644 index 0000000..6d2a8d2 --- /dev/null +++ b/CRIU_code/compel/arch/ppc64/plugins/std/syscalls/syscall-common-ppc64.S @@ -0,0 +1,24 @@ +#include 
"common/asm/linkage.h" +#include /* for __NR_ipc */ + +#define SYSCALL(name, opcode) \ + ENTRY(name); \ + li r0, opcode; \ + b __syscall_common; \ + END(name) + + .text + .align 4 + +ENTRY(__syscall_common) + sc + bnslr+ /* if no error return to LR */ + neg r3,r3 /* r3 = -r3 to return -errno value */ + blr +END(__syscall_common) + +ENTRY(__cr_restore_rt) + li r0, __NR_rt_sigreturn + b __syscall_common +END(__cr_restore_rt) + diff --git a/CRIU_code/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl b/CRIU_code/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl new file mode 100644 index 0000000..62e0bc1 --- /dev/null +++ b/CRIU_code/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl @@ -0,0 +1,109 @@ +# +# System calls table, please make sure the table consists of only the syscalls +# really used somewhere in the project. +# +# The template is (name and arguments are optional if you need only __NR_x +# defined, but no real entry point in syscalls lib). +# +# name code name arguments +# ----------------------------------------------------------------------- +# +__NR_read 3 sys_read (int fd, void *buf, unsigned long count) +__NR_write 4 sys_write (int fd, const void *buf, unsigned long count) +__NR_open 5 sys_open (const char *filename, unsigned long flags, unsigned long mode) +__NR_close 6 sys_close (int fd) +__NR_lseek 19 sys_lseek (int fd, unsigned long offset, unsigned long origin) +__NR_mmap 90 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) +__NR_mprotect 125 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) +__NR_munmap 91 sys_munmap (void *addr, unsigned long len) +__NR_brk 45 sys_brk (void *addr) +__NR_rt_sigaction 173 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) +__NR_rt_sigprocmask 174 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) +__NR_rt_sigreturn 172 
sys_rt_sigreturn (void) +__NR_ioctl 54 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) +__NR_pread64 179 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos) +__NR_ptrace 26 sys_ptrace (long request, pid_t pid, void *addr, void *data) +__NR_mremap 163 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) +__NR_mincore 206 sys_mincore (void *addr, unsigned long size, unsigned char *vec) +__NR_madvise 205 sys_madvise (unsigned long start, size_t len, int behavior) +__NR_pause 29 sys_pause (void) +__NR_nanosleep 162 sys_nanosleep (struct timespec *req, struct timespec *rem) +__NR_getitimer 105 sys_getitimer (int which, const struct itimerval *val) +__NR_setitimer 104 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old) +__NR_getpid 20 sys_getpid (void) +__NR_socket 326 sys_socket (int domain, int type, int protocol) +__NR_connect 328 sys_connect (int sockfd, struct sockaddr *addr, int addrlen) +__NR_sendto 335 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) +__NR_recvfrom 337 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) +__NR_sendmsg 341 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags) +__NR_recvmsg 342 sys_recvmsg (int sockfd, struct msghdr *msg, int flags) +__NR_shutdown 338 sys_shutdown (int sockfd, int how) +__NR_bind 327 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen) +__NR_setsockopt 339 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen) +__NR_getsockopt 340 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) +__NR_clone 120 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) +__NR_exit 1 sys_exit (unsigned long error_code) +__NR_wait4 114 sys_wait4 (int pid, int 
*status, int options, struct rusage *ru) +__NR_kill 37 sys_kill (long pid, int sig) +__NR_fcntl 55 sys_fcntl (int fd, int type, long arg) +__NR_flock 143 sys_flock (int fd, unsigned long cmd) +__NR_mkdir 39 sys_mkdir (const char *name, int mode) +__NR_rmdir 40 sys_rmdir (const char *name) +__NR_unlink 10 sys_unlink (char *pathname) +__NR_readlinkat 296 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) +__NR_umask 60 sys_umask (int mask) +__NR_getgroups 80 sys_getgroups (int gsize, unsigned int *groups) +__NR_setgroups 81 sys_setgroups (int gsize, unsigned int *groups) +__NR_setresuid 164 sys_setresuid (int uid, int euid, int suid) +__NR_getresuid 165 sys_getresuid (int *uid, int *euid, int *suid) +__NR_setresgid 169 sys_setresgid (int gid, int egid, int sgid) +__NR_getresgid 170 sys_getresgid (int *gid, int *egid, int *sgid) +__NR_getpgid 132 sys_getpgid (pid_t pid) +__NR_setfsuid 138 sys_setfsuid (int fsuid) +__NR_setfsgid 139 sys_setfsgid (int fsgid) +__NR_getsid 147 sys_getsid (void) +__NR_capget 183 sys_capget (struct cap_header *h, struct cap_data *d) +__NR_capset 184 sys_capset (struct cap_header *h, struct cap_data *d) +__NR_rt_sigqueueinfo 177 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info) +__NR_sigaltstack 185 sys_sigaltstack (const void *uss, void *uoss) +__NR_personality 136 sys_personality (unsigned int personality) +__NR_setpriority 97 sys_setpriority (int which, int who, int nice) +__NR_sched_setscheduler 156 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) +__NR_prctl 171 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) +__NR_setrlimit 75 sys_setrlimit (int resource, struct krlimit *rlim) +__NR_mount 21 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) +__NR_umount2 52 sys_umount2 (char *name, int flags) +__NR_gettid 207 sys_gettid (void) +__NR_futex 221 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct 
timespec *utime, uint32_t *uaddr2, uint32_t val3) +__NR_set_tid_address 232 sys_set_tid_address (int *tid_addr) +__NR_restart_syscall 0 sys_restart_syscall (void) +__NR_sys_timer_create 240 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) +__NR_sys_timer_settime 241 sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) +__NR_sys_timer_gettime 242 sys_timer_gettime (int timer_id, const struct itimerspec *setting) +__NR_sys_timer_getoverrun 243 sys_timer_getoverrun (int timer_id) +__NR_sys_timer_delete 244 sys_timer_delete (kernel_timer_t timer_id) +__NR_clock_gettime 246 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) +__NR_exit_group 234 sys_exit_group (int error_code) +__NR_waitid 272 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) +__NR_set_robust_list 300 sys_set_robust_list (struct robust_list_head *head, size_t len) +__NR_get_robust_list 299 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) +__NR_vmsplice 285 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) +__NR_openat 286 sys_openat (int dfd, const char *filename, int flags, int mode) +__NR_fallocate 309 sys_fallocate (int fd, int mode, loff_t offset, loff_t len) +__NR_timerfd_settime 311 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) +__NR_signalfd4 313 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) +__NR_rt_tgsigqueueinfo 322 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) +__NR_fanotify_init 323 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) +__NR_fanotify_mark 324 sys_fanotify_mark (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) +__NR_open_by_handle_at 346 sys_open_by_handle_at (int 
mountdirfd, struct file_handle *handle, int flags) +__NR_setns 350 sys_setns (int fd, int nstype) +__NR_kcmp 354 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) +__NR_seccomp 358 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) +__NR_memfd_create 360 sys_memfd_create (const char *name, unsigned int flags) +__NR_io_setup 227 sys_io_setup (unsigned nr_events, aio_context_t *ctx_idp) +__NR_io_getevents 229 sys_io_getevents (aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout) +__NR_io_submit 230 sys_io_submit (aio_context_t ctx_id, long nr, struct iocb **iocbpp) +__NR_ipc 117 sys_ipc (unsigned int call, int first, unsigned long second, unsigned long third, const void *ptr, long fifth) +__NR_gettimeofday 78 sys_gettimeofday (struct timeval *tv, struct timezone *tz) +__NR_preadv 320 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) +__NR_userfaultfd 364 sys_userfaultfd (int flags) diff --git a/CRIU_code/compel/arch/ppc64/scripts/compel-pack.lds.S b/CRIU_code/compel/arch/ppc64/scripts/compel-pack.lds.S new file mode 100644 index 0000000..e0f826d --- /dev/null +++ b/CRIU_code/compel/arch/ppc64/scripts/compel-pack.lds.S @@ -0,0 +1,40 @@ +OUTPUT_ARCH(powerpc:common64) +EXTERN(__export_parasite_head_start) + +SECTIONS +{ + .text : { + *(.head.text) + ASSERT(DEFINED(__export_parasite_head_start), + "Symbol __export_parasite_head_start is missing"); + *(.text*) + *(.compel.exit) + *(.compel.init) + } + + .data : { + *(.data*) + *(.bss*) + } + + .rodata : { + *(.rodata*) + *(.got*) + } + + .toc : ALIGN(8) { + *(.toc*) + } + + /DISCARD/ : { + *(.debug*) + *(.comment*) + *(.note*) + *(.group*) + *(.eh_frame*) + } + +/* Parasite args should have 4 bytes align, as we have futex inside. */ +. 
= ALIGN(4); +__export_parasite_args = .; +} diff --git a/CRIU_code/compel/arch/ppc64/src/lib/cpu.c b/CRIU_code/compel/arch/ppc64/src/lib/cpu.c new file mode 100644 index 0000000..338ab48 --- /dev/null +++ b/CRIU_code/compel/arch/ppc64/src/lib/cpu.c @@ -0,0 +1,79 @@ +#include +#include +#include +#include + +#include "compel-cpu.h" + +#include "common/bitops.h" + +#include "log.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +static compel_cpuinfo_t rt_info; + +static void fetch_rt_cpuinfo(void) +{ + static bool rt_info_done = false; + + if (!rt_info_done) { + compel_cpuid(&rt_info); + rt_info_done = true; + } +} + +void compel_set_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) { } +void compel_clear_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) { } +int compel_test_fpu_cap(compel_cpuinfo_t *info, unsigned int feature) { return 0; } +int compel_test_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) { return 0; } + +int compel_cpuid(compel_cpuinfo_t *info) +{ + info->hwcap[0] = getauxval(AT_HWCAP); + info->hwcap[1] = getauxval(AT_HWCAP2); + + if (!info->hwcap[0] || !info->hwcap[1]) { + pr_err("Can't read the hardware capabilities\n"); + return -1; + } + + return 0; +} + +bool compel_cpu_has_feature(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return compel_test_cpu_cap(&rt_info, feature); +} + +bool compel_fpu_has_feature(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return compel_test_fpu_cap(&rt_info, feature); +} + +uint32_t compel_fpu_feature_size(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return 0; +} + +uint32_t compel_fpu_feature_offset(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return 0; +} + +void compel_cpu_clear_feature(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return compel_clear_cpu_cap(&rt_info, feature); +} + +void compel_cpu_copy_cpuinfo(compel_cpuinfo_t *c) +{ + fetch_rt_cpuinfo(); + memcpy(c, &rt_info, sizeof(rt_info)); +} diff --git a/CRIU_code/compel/arch/ppc64/src/lib/handle-elf-host.c 
b/CRIU_code/compel/arch/ppc64/src/lib/handle-elf-host.c new file mode 100644 index 0000000..fe46118 --- /dev/null +++ b/CRIU_code/compel/arch/ppc64/src/lib/handle-elf-host.c @@ -0,0 +1 @@ +handle-elf.c \ No newline at end of file diff --git a/CRIU_code/compel/arch/ppc64/src/lib/handle-elf.c b/CRIU_code/compel/arch/ppc64/src/lib/handle-elf.c new file mode 100644 index 0000000..3d4020f --- /dev/null +++ b/CRIU_code/compel/arch/ppc64/src/lib/handle-elf.c @@ -0,0 +1,35 @@ +#include + +#include "uapi/compel.h" + +#include "handle-elf.h" +#include "piegen.h" +#include "log.h" + +static const unsigned char __maybe_unused +elf_ident_64_le[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const unsigned char __maybe_unused +elf_ident_64_be[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x02, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +int handle_binary(void *mem, size_t size) +{ + const unsigned char *elf_ident = +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + elf_ident_64_le; +#else + elf_ident_64_be; +#endif + + if (memcmp(mem, elf_ident, sizeof(elf_ident_64_le)) == 0) + return handle_elf_ppc64(mem, size); + + pr_err("Unsupported Elf format detected\n"); + return -EINVAL; +} diff --git a/CRIU_code/compel/arch/ppc64/src/lib/include/cpu.h b/CRIU_code/compel/arch/ppc64/src/lib/include/cpu.h new file mode 100644 index 0000000..e69de29 diff --git a/CRIU_code/compel/arch/ppc64/src/lib/include/handle-elf.h b/CRIU_code/compel/arch/ppc64/src/lib/include/handle-elf.h new file mode 100644 index 0000000..1a8217e --- /dev/null +++ b/CRIU_code/compel/arch/ppc64/src/lib/include/handle-elf.h @@ -0,0 +1,13 @@ +#ifndef COMPEL_HANDLE_ELF_H__ +#define COMPEL_HANDLE_ELF_H__ + +#include "elf64-types.h" + +#define ELF_PPC64 + +#define __handle_elf handle_elf_ppc64 +#define arch_is_machine_supported(e_machine) (e_machine == EM_PPC64) + +extern int handle_elf_ppc64(void *mem, size_t size); + 
+#endif /* COMPEL_HANDLE_ELF_H__ */ diff --git a/CRIU_code/compel/arch/ppc64/src/lib/include/syscall.h b/CRIU_code/compel/arch/ppc64/src/lib/include/syscall.h new file mode 100644 index 0000000..e2ec127 --- /dev/null +++ b/CRIU_code/compel/arch/ppc64/src/lib/include/syscall.h @@ -0,0 +1,4 @@ +#ifndef __COMPEL_SYSCALL_H__ +#define __COMPEL_SYSCALL_H__ +#define __NR(syscall, compat) __NR_##syscall +#endif diff --git a/CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/.gitignore b/CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/.gitignore new file mode 100644 index 0000000..e69de29 diff --git a/CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/breakpoints.h b/CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/breakpoints.h new file mode 100644 index 0000000..5f09049 --- /dev/null +++ b/CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/breakpoints.h @@ -0,0 +1,15 @@ +#ifndef __COMPEL_BREAKPOINTS_H__ +#define __COMPEL_BREAKPOINTS_H__ +#define ARCH_SI_TRAP TRAP_BRKPT + +static inline int ptrace_set_breakpoint(pid_t pid, void *addr) +{ + return 0; +} + +static inline int ptrace_flush_breakpoints(pid_t pid) +{ + return 0; +} + +#endif diff --git a/CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/cpu.h b/CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/cpu.h new file mode 100644 index 0000000..5992586 --- /dev/null +++ b/CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/cpu.h @@ -0,0 +1,10 @@ +#ifndef UAPI_COMPEL_ASM_CPU_H__ +#define UAPI_COMPEL_ASM_CPU_H__ + +#include + +typedef struct { + uint64_t hwcap[2]; +} compel_cpuinfo_t; + +#endif /* UAPI_COMPEL_ASM_CPU_H__ */ diff --git a/CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/fpu.h b/CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/fpu.h new file mode 100644 index 0000000..7f476d5 --- /dev/null +++ b/CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/fpu.h @@ -0,0 +1,4 @@ +#ifndef __CR_ASM_FPU_H__ +#define __CR_ASM_FPU_H__ + +#endif /* __CR_ASM_FPU_H__ */ diff --git 
a/CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h b/CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h new file mode 100644 index 0000000..89fc4aa --- /dev/null +++ b/CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h @@ -0,0 +1,86 @@ +#ifndef UAPI_COMPEL_ASM_TYPES_H__ +#define UAPI_COMPEL_ASM_TYPES_H__ + +#include +#include +#include + +#define SIGMAX_OLD 31 +#define SIGMAX 64 + +/* + * Copied from kernel header arch/powerpc/include/uapi/asm/ptrace.h + */ +typedef struct { + unsigned long gpr[32]; + unsigned long nip; + unsigned long msr; + unsigned long orig_gpr3; /* Used for restarting system calls */ + unsigned long ctr; + unsigned long link; + unsigned long xer; + unsigned long ccr; + unsigned long softe; /* Soft enabled/disabled */ + unsigned long trap; /* Reason for being here */ + /* + * N.B. for critical exceptions on 4xx, the dar and dsisr + * fields are overloaded to hold srr0 and srr1. + */ + unsigned long dar; /* Fault registers */ + unsigned long dsisr; /* on 4xx/Book-E used for ESR */ + unsigned long result; /* Result of a system call */ +} user_regs_struct_t; + +#define NVSXREG 32 + +#define USER_FPREGS_FL_FP 0x00001 +#define USER_FPREGS_FL_ALTIVEC 0x00002 +#define USER_FPREGS_FL_VSX 0x00004 +#define USER_FPREGS_FL_TM 0x00010 + +#ifndef NT_PPC_TM_SPR +# define NT_PPC_TM_CGPR 0x108 /* TM checkpointed GPR Registers */ +# define NT_PPC_TM_CFPR 0x109 /* TM checkpointed FPR Registers */ +# define NT_PPC_TM_CVMX 0x10a /* TM checkpointed VMX Registers */ +# define NT_PPC_TM_CVSX 0x10b /* TM checkpointed VSX Registers */ +# define NT_PPC_TM_SPR 0x10c /* TM Special Purpose Registers */ +#endif + +#define MSR_TMA (1UL<<34) /* bit 29 Trans Mem state: Transactional */ +#define MSR_TMS (1UL<<33) /* bit 30 Trans Mem state: Suspended */ +#define MSR_TM (1UL<<32) /* bit 31 Trans Mem Available */ +#define MSR_VEC (1UL<<25) +#define MSR_VSX (1UL<<23) + +#define MSR_TM_ACTIVE(x) ((((x) & MSR_TM) && 
((x)&(MSR_TMA|MSR_TMS))) != 0) + +typedef struct { + uint64_t fpregs[NFPREG]; + __vector128 vrregs[NVRREG]; + uint64_t vsxregs[NVSXREG]; + + int flags; + struct tm_regs { + int flags; + struct { + uint64_t tfhar, texasr, tfiar; + } tm_spr_regs; + user_regs_struct_t regs; + uint64_t fpregs[NFPREG]; + __vector128 vrregs[NVRREG]; + uint64_t vsxregs[NVSXREG]; + } tm; +} user_fpregs_struct_t; + +#define REG_RES(regs) ((uint64_t)(regs).gpr[3]) +#define REG_IP(regs) ((uint64_t)(regs).nip) +#define REG_SP(regs) ((uint64_t)(regs).gpr[1]) +#define REG_SYSCALL_NR(regs) ((uint64_t)(regs).gpr[0]) + +#define user_regs_native(pregs) true + +#define ARCH_SI_TRAP TRAP_BRKPT + +#define __NR(syscall, compat) __NR_##syscall + +#endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/processor-flags.h b/CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/processor-flags.h new file mode 100644 index 0000000..1571918 --- /dev/null +++ b/CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/processor-flags.h @@ -0,0 +1,4 @@ +#ifndef UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ +#define UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ + +#endif /* UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ */ diff --git a/CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/processor.h b/CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/processor.h new file mode 100644 index 0000000..7376f88 --- /dev/null +++ b/CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/processor.h @@ -0,0 +1,4 @@ +#ifndef UAPI_COMPEL_ASM_PROCESSOR_H__ +#define UAPI_COMPEL_ASM_PROCESSOR_H__ + +#endif /* UAPI_COMPEL_ASM_PROCESSOR_H__ */ diff --git a/CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h b/CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h new file mode 100644 index 0000000..9467a1b --- /dev/null +++ b/CRIU_code/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h @@ -0,0 +1,79 @@ +#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ +#define UAPI_COMPEL_ASM_SIGFRAME_H__ + +#include 
+#include +#include + +/* + * sigcontext structure defined in file + * /usr/include/powerpc64le-linux-gnu/bits/sigcontext.h, + * included from /usr/include/signal.h + * + * Kernel definition can be found in arch/powerpc/include/uapi/asm/sigcontext.h + */ +#include + +// XXX: the identifier rt_sigcontext is expected to be struct by the CRIU code +#define rt_sigcontext sigcontext + +#include + +#define RT_SIGFRAME_OFFSET(rt_sigframe) 0 + +/* Copied from the Linux kernel header arch/powerpc/include/asm/ptrace.h */ +#define USER_REDZONE_SIZE 512 + +/* Copied from the Linux kernel source file arch/powerpc/kernel/signal_64.c */ +#define TRAMP_SIZE 6 + +/* + * ucontext_t defined in /usr/include/powerpc64le-linux-gnu/sys/ucontext.h + */ +struct rt_sigframe { + /* sys_rt_sigreturn requires the ucontext be the first field */ + ucontext_t uc; + ucontext_t uc_transact; /* Transactional state */ + unsigned long _unused[2]; + unsigned int tramp[TRAMP_SIZE]; + struct rt_siginfo *pinfo; + void *puc; + struct rt_siginfo info; + /* New 64 bit little-endian ABI allows redzone of 512 bytes below sp */ + char abigap[USER_REDZONE_SIZE]; +} __attribute__((aligned(16))); + +#define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ + asm volatile( \ + "mr 1, %0 \n" \ + "li 0, "__stringify(__NR_rt_sigreturn)" \n" \ + "sc \n" \ + : \ + : "r"(new_sp) \ + : "1", "memory") + +#if _CALL_ELF != 2 +# error Only supporting ABIv2. 
+#else +# define FRAME_MIN_SIZE_PARM 96 +#endif + +#define RT_SIGFRAME_UC(rt_sigframe) (&(rt_sigframe)->uc) +#define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(rt_sigframe)->uc.uc_mcontext.gp_regs[PT_NIP]) +#define RT_SIGFRAME_HAS_FPU(rt_sigframe) (1) +#define RT_SIGFRAME_FPU(rt_sigframe) (&(rt_sigframe)->uc.uc_mcontext) + +#define rt_sigframe_erase_sigset(sigframe) \ + memset(&sigframe->uc.uc_sigmask, 0, sizeof(k_rtsigset_t)) +#define rt_sigframe_copy_sigset(sigframe, from) \ + memcpy(&sigframe->uc.uc_sigmask, from, sizeof(k_rtsigset_t)) + +#define MSR_TMA (1UL<<34) /* bit 29 Trans Mem state: Transactional */ +#define MSR_TMS (1UL<<33) /* bit 30 Trans Mem state: Suspended */ +#define MSR_TM (1UL<<32) /* bit 31 Trans Mem Available */ +#define MSR_VEC (1UL<<25) +#define MSR_VSX (1UL<<23) + +#define MSR_TM_ACTIVE(x) ((((x) & MSR_TM) && ((x)&(MSR_TMA|MSR_TMS))) != 0) + +#endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ diff --git a/CRIU_code/compel/arch/ppc64/src/lib/infect.c b/CRIU_code/compel/arch/ppc64/src/lib/infect.c new file mode 100644 index 0000000..defed3d --- /dev/null +++ b/CRIU_code/compel/arch/ppc64/src/lib/infect.c @@ -0,0 +1,481 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "uapi/compel/asm/infect-types.h" +#include "errno.h" +#include "log.h" +#include "common/bug.h" +#include "common/page.h" +#include "infect.h" +#include "infect-priv.h" + +#ifndef NT_PPC_TM_SPR +#define NT_PPC_TM_CGPR 0x108 /* TM checkpointed GPR Registers */ +#define NT_PPC_TM_CFPR 0x109 /* TM checkpointed FPR Registers */ +#define NT_PPC_TM_CVMX 0x10a /* TM checkpointed VMX Registers */ +#define NT_PPC_TM_CVSX 0x10b /* TM checkpointed VSX Registers */ +#define NT_PPC_TM_SPR 0x10c /* TM Special Purpose Registers */ +#endif + +unsigned __page_size = 0; +unsigned __page_shift = 0; + +/* + * Injected syscall instruction + */ +const uint32_t code_syscall[] = { + 0x44000002, /* sc */ + 0x0fe00000 /* twi 31,0,0 */ +}; + +static inline 
__always_unused void __check_code_syscall(void) +{ + BUILD_BUG_ON(sizeof(code_syscall) != BUILTIN_SYSCALL_SIZE); + BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); +} + +static void prep_gp_regs(mcontext_t *dst, user_regs_struct_t *regs) +{ + memcpy(dst->gp_regs, regs->gpr, sizeof(regs->gpr)); + + dst->gp_regs[PT_NIP] = regs->nip; + dst->gp_regs[PT_MSR] = regs->msr; + dst->gp_regs[PT_ORIG_R3] = regs->orig_gpr3; + dst->gp_regs[PT_CTR] = regs->ctr; + dst->gp_regs[PT_LNK] = regs->link; + dst->gp_regs[PT_XER] = regs->xer; + dst->gp_regs[PT_CCR] = regs->ccr; + dst->gp_regs[PT_TRAP] = regs->trap; +} + +static void put_fpu_regs(mcontext_t *mc, uint64_t *fpregs) +{ + uint64_t *mcfp = (uint64_t *)mc->fp_regs; + + memcpy(mcfp, fpregs, sizeof(*fpregs) * NFPREG); +} + +static void put_altivec_regs(mcontext_t *mc, __vector128 *vrregs) +{ + vrregset_t *v_regs = (vrregset_t *)(((unsigned long)mc->vmx_reserve + 15) & ~0xful); + + memcpy(&v_regs->vrregs[0][0], vrregs, sizeof(uint64_t) * 2 * (NVRREG - 1)); + v_regs->vrsave = *((uint32_t *)&vrregs[NVRREG - 1]); + mc->v_regs = v_regs; +} + +static void put_vsx_regs(mcontext_t *mc, uint64_t *vsxregs) +{ + memcpy((uint64_t *)(mc->v_regs + 1), vsxregs, sizeof(*vsxregs) * NVSXREG); +} + +int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, + user_regs_struct_t *regs, + user_fpregs_struct_t *fpregs) +{ + mcontext_t *dst_tc = &sigframe->uc_transact.uc_mcontext; + mcontext_t *dst = &sigframe->uc.uc_mcontext; + + if (fpregs->flags & USER_FPREGS_FL_TM) { + prep_gp_regs(&sigframe->uc_transact.uc_mcontext, &fpregs->tm.regs); + prep_gp_regs(&sigframe->uc.uc_mcontext, &fpregs->tm.regs); + } else { + prep_gp_regs(&sigframe->uc.uc_mcontext, regs); + } + + if (fpregs->flags & USER_FPREGS_FL_TM) + sigframe->uc.uc_link = &sigframe->uc_transact; + + if (fpregs->flags & USER_FPREGS_FL_FP) { + if (fpregs->flags & USER_FPREGS_FL_TM) { + put_fpu_regs(&sigframe->uc_transact.uc_mcontext, fpregs->tm.fpregs); + put_fpu_regs(&sigframe->uc.uc_mcontext, 
fpregs->tm.fpregs); + } else { + put_fpu_regs(&sigframe->uc.uc_mcontext, fpregs->fpregs); + } + } + + if (fpregs->flags & USER_FPREGS_FL_ALTIVEC) { + if (fpregs->flags & USER_FPREGS_FL_TM) { + put_altivec_regs(&sigframe->uc_transact.uc_mcontext, fpregs->tm.vrregs); + put_altivec_regs(&sigframe->uc.uc_mcontext, fpregs->tm.vrregs); + + dst_tc->gp_regs[PT_MSR] |= MSR_VEC; + } else { + put_altivec_regs(&sigframe->uc.uc_mcontext, fpregs->vrregs); + } + + dst->gp_regs[PT_MSR] |= MSR_VEC; + + if (fpregs->flags & USER_FPREGS_FL_VSX) { + if (fpregs->flags & USER_FPREGS_FL_TM) { + put_vsx_regs(&sigframe->uc_transact.uc_mcontext, fpregs->tm.vsxregs); + put_vsx_regs(&sigframe->uc.uc_mcontext, fpregs->tm.vsxregs); + + dst_tc->gp_regs[PT_MSR] |= MSR_VSX; + } else { + put_vsx_regs(&sigframe->uc.uc_mcontext, fpregs->vsxregs); + } + dst->gp_regs[PT_MSR] |= MSR_VSX; + } + } + + return 0; +} + +static void update_vregs(mcontext_t *lcontext, mcontext_t *rcontext) +{ + if (lcontext->v_regs) { + uint64_t offset = (uint64_t)(lcontext->v_regs) - (uint64_t)lcontext; + lcontext->v_regs = (vrregset_t *)((uint64_t)rcontext + offset); + + pr_debug("Updated v_regs:%llx (rcontext:%llx)\n", + (unsigned long long)lcontext->v_regs, + (unsigned long long)rcontext); + } +} + +int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *frame, + struct rt_sigframe *rframe) +{ + uint64_t msr = frame->uc.uc_mcontext.gp_regs[PT_MSR]; + + update_vregs(&frame->uc.uc_mcontext, &rframe->uc.uc_mcontext); + + /* Sanity check: If TM so uc_link should be set, otherwise not */ + if (MSR_TM_ACTIVE(msr) ^ (!!(frame->uc.uc_link))) { + BUG(); + return -1; + } + + /* Updating the transactional state address if any */ + if (frame->uc.uc_link) { + update_vregs(&frame->uc_transact.uc_mcontext, + &rframe->uc_transact.uc_mcontext); + frame->uc.uc_link = &rframe->uc_transact; + } + + return 0; +} + +/* This is the layout of the POWER7 VSX registers and the way they + * overlap with the existing FPR and VMX registers. 
+ * + * VSR doubleword 0 VSR doubleword 1 + * ---------------------------------------------------------------- + * VSR[0] | FPR[0] | | + * ---------------------------------------------------------------- + * VSR[1] | FPR[1] | | + * ---------------------------------------------------------------- + * | ... | | + * ---------------------------------------------------------------- + * VSR[30] | FPR[30] | | + * ---------------------------------------------------------------- + * VSR[31] | FPR[31] | | + * ---------------------------------------------------------------- + * VSR[32] | VR[0] | + * ---------------------------------------------------------------- + * VSR[33] | VR[1] | + * ---------------------------------------------------------------- + * | ... | + * ---------------------------------------------------------------- + * VSR[62] | VR[30] | + * ---------------------------------------------------------------- + * VSR[63] | VR[31] | + * ---------------------------------------------------------------- + * + * PTRACE_GETFPREGS returns FPR[0..31] + FPSCR + * PTRACE_GETVRREGS returns VR[0..31] + VSCR + VRSAVE + * PTRACE_GETVSRREGS returns VSR[0..31] + * + * PTRACE_GETVSRREGS and PTRACE_GETFPREGS are required since we need + * to save FPSCR too. + * + * There 32 VSX double word registers to save since the 32 first VSX double + * word registers are saved through FPR[0..32] and the remaining registers + * are saved when saving the Altivec registers VR[0..32]. + */ + +static int get_fpu_regs(pid_t pid, user_fpregs_struct_t *fp) +{ + if (ptrace(PTRACE_GETFPREGS, pid, 0, (void *)&fp->fpregs) < 0) { + pr_perror("Couldn't get floating-point registers"); + return -1; + } + fp->flags |= USER_FPREGS_FL_FP; + + return 0; +} + +static int get_altivec_regs(pid_t pid, user_fpregs_struct_t *fp) +{ + if (ptrace(PTRACE_GETVRREGS, pid, 0, (void*)&fp->vrregs) < 0) { + /* PTRACE_GETVRREGS returns EIO if Altivec is not supported. + * This should not happen if msr_vec is set. 
*/ + if (errno != EIO) { + pr_perror("Couldn't get Altivec registers"); + return -1; + } + pr_debug("Altivec not supported\n"); + } + else { + pr_debug("Dumping Altivec registers\n"); + fp->flags |= USER_FPREGS_FL_ALTIVEC; + } + return 0; +} + +/* + * Since the FPR[0-31] is stored in the first double word of VSR[0-31] and + * FPR are saved through the FP state, there is no need to save the upper part + * of the first 32 VSX registers. + * Furthermore, the 32 last VSX registers are also the 32 Altivec registers + * already saved, so no need to save them. + * As a consequence, only the doubleword 1 of the 32 first VSX registers have + * to be saved (the ones returned by PTRACE_GETVSRREGS). + */ +static int get_vsx_regs(pid_t pid, user_fpregs_struct_t *fp) +{ + if (ptrace(PTRACE_GETVSRREGS, pid, 0, (void*)fp->vsxregs) < 0) { + /* + * EIO is returned in the case PTRACE_GETVSRREGS is not + * supported. + */ + if (errno != EIO) { + pr_perror("Couldn't get VSX registers"); + return -1; + } + pr_debug("VSX register's dump not supported.\n"); + } + else { + pr_debug("Dumping VSX registers\n"); + fp->flags |= USER_FPREGS_FL_VSX; + } + return 0; +} + +static int get_tm_regs(pid_t pid, user_fpregs_struct_t *fpregs) +{ + struct iovec iov; + + pr_debug("Dumping TM registers\n"); + +#define TM_REQUIRED 0 +#define TM_OPTIONAL 1 +#define PTRACE_GET_TM(s,n,c,u) do { \ + iov.iov_base = &s; \ + iov.iov_len = sizeof(s); \ + if (ptrace(PTRACE_GETREGSET, pid, c, &iov)) { \ + if (!u || errno != EIO) { \ + pr_perror("Couldn't get TM "n); \ + pr_err("Your kernel seems to not support the " \ + "new TM ptrace API (>= 4.8)\n"); \ + goto out_free; \ + } \ + pr_debug("TM "n" not supported.\n"); \ + iov.iov_base = NULL; \ + } \ +} while(0) + + /* Get special registers */ + PTRACE_GET_TM(fpregs->tm.tm_spr_regs, "SPR", NT_PPC_TM_SPR, TM_REQUIRED); + + /* Get checkpointed regular registers */ + PTRACE_GET_TM(fpregs->tm.regs, "GPR", NT_PPC_TM_CGPR, TM_REQUIRED); + + /* Get checkpointed FP 
registers */ + PTRACE_GET_TM(fpregs->tm.fpregs, "FPR", NT_PPC_TM_CFPR, TM_OPTIONAL); + if (iov.iov_base) + fpregs->tm.flags |= USER_FPREGS_FL_FP; + + /* Get checkpointed VMX (Altivec) registers */ + PTRACE_GET_TM(fpregs->tm.vrregs, "VMX", NT_PPC_TM_CVMX, TM_OPTIONAL); + if (iov.iov_base) + fpregs->tm.flags |= USER_FPREGS_FL_ALTIVEC; + + /* Get checkpointed VSX registers */ + PTRACE_GET_TM(fpregs->tm.vsxregs, "VSX", NT_PPC_TM_CVSX, TM_OPTIONAL); + if (iov.iov_base) + fpregs->tm.flags |= USER_FPREGS_FL_VSX; + + return 0; + +out_free: + return -1; /* still failing the checkpoint */ +} + +static int __get_task_regs(pid_t pid, user_regs_struct_t *regs, + user_fpregs_struct_t *fpregs) +{ + pr_info("Dumping GP/FPU registers for %d\n", pid); + + /* + * This is inspired by kernel function check_syscall_restart in + * arch/powerpc/kernel/signal.c + */ +#ifndef TRAP +#define TRAP(r) ((r).trap & ~0xF) +#endif + + if (TRAP(*regs) == 0x0C00 && regs->ccr & 0x10000000) { + /* Restart the system call */ + switch (regs->gpr[3]) { + case ERESTARTNOHAND: + case ERESTARTSYS: + case ERESTARTNOINTR: + regs->gpr[3] = regs->orig_gpr3; + regs->nip -= 4; + break; + case ERESTART_RESTARTBLOCK: + regs->gpr[0] = __NR_restart_syscall; + regs->nip -= 4; + break; + } + } + + /* Resetting trap since we are now coming from user space. */ + regs->trap = 0; + + fpregs->flags = 0; + /* + * Check for Transactional Memory operation in progress. + * Until we have support of TM register's state through the ptrace API, + * we can't checkpoint process with TM operation in progress (almost + * impossible) or suspended (easy to get). + */ + if (MSR_TM_ACTIVE(regs->msr)) { + pr_debug("Task %d has %s TM operation at 0x%lx\n", + pid, + (regs->msr & MSR_TMS) ? 
"a suspended" : "an active", + regs->nip); + if (get_tm_regs(pid, fpregs)) + return -1; + fpregs->flags = USER_FPREGS_FL_TM; + } + + if (get_fpu_regs(pid, fpregs)) + return -1; + + if (get_altivec_regs(pid, fpregs)) + return -1; + + if (fpregs->flags & USER_FPREGS_FL_ALTIVEC) { + /* + * Save the VSX registers if Altivec registers are supported + */ + if (get_vsx_regs(pid, fpregs)) + return -1; + } + return 0; +} + +int get_task_regs(pid_t pid, user_regs_struct_t *regs, save_regs_t save, + void *arg, __maybe_unused unsigned long flags) +{ + user_fpregs_struct_t fpregs; + int ret; + + ret = __get_task_regs(pid, regs, &fpregs); + if (ret) + return ret; + + return save(arg, regs, &fpregs); +} + +int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, + unsigned long arg1, + unsigned long arg2, + unsigned long arg3, + unsigned long arg4, + unsigned long arg5, + unsigned long arg6) +{ + user_regs_struct_t regs = ctl->orig.regs; + int err; + + regs.gpr[0] = (unsigned long)nr; + regs.gpr[3] = arg1; + regs.gpr[4] = arg2; + regs.gpr[5] = arg3; + regs.gpr[6] = arg4; + regs.gpr[7] = arg5; + regs.gpr[8] = arg6; + + err = compel_execute_syscall(ctl, &regs, (char*)code_syscall); + + *ret = regs.gpr[3]; + return err; +} + +void *remote_mmap(struct parasite_ctl *ctl, + void *addr, size_t length, int prot, + int flags, int fd, off_t offset) +{ + long map = 0; + int err; + + err = compel_syscall(ctl, __NR_mmap, &map, + (unsigned long)addr, length, prot, flags, fd, offset); + if (err < 0 || (long)map < 0) + map = 0; + + return (void *)map; +} + +void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) +{ + /* + * OpenPOWER ABI requires that r12 is set to the calling function address + * to compute the TOC pointer. 
+ */ + regs->gpr[12] = new_ip; + regs->nip = new_ip; + if (stack) + regs->gpr[1] = (unsigned long) stack; + regs->trap = 0; +} + +bool arch_can_dump_task(struct parasite_ctl *ctl) +{ + /* + * TODO: We should detect 32bit task when BE support is done. + */ + return true; +} + +int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) +{ + long ret; + int err; + + err = compel_syscall(ctl, __NR_sigaltstack, + &ret, 0, (unsigned long)&s->uc.uc_stack, + 0, 0, 0, 0); + return err ? err : ret; +} + +/* + * Copied from the Linux kernel arch/powerpc/include/asm/processor.h + * + * NOTE: 32bit tasks are not supported. + */ +#define TASK_SIZE_64TB (0x0000400000000000UL) +#define TASK_SIZE_512TB (0x0002000000000000UL) + +#define TASK_SIZE_MIN TASK_SIZE_64TB +#define TASK_SIZE_MAX TASK_SIZE_512TB + +unsigned long compel_task_size(void) +{ + unsigned long task_size; + + for (task_size = TASK_SIZE_MIN; task_size < TASK_SIZE_MAX; task_size <<= 1) + if (munmap((void *)task_size, page_size())) + break; + return task_size; +} diff --git a/CRIU_code/compel/arch/riscv/plugins/include/asm/prologue.h b/CRIU_code/compel/arch/riscv/plugins/include/asm/prologue.h new file mode 100644 index 0000000..e0275e3 --- /dev/null +++ b/CRIU_code/compel/arch/riscv/plugins/include/asm/prologue.h @@ -0,0 +1 @@ +../../../../../arch/x86/plugins/include/asm/prologue.h \ No newline at end of file diff --git a/CRIU_code/compel/arch/riscv/plugins/include/asm/syscall-types.h b/CRIU_code/compel/arch/riscv/plugins/include/asm/syscall-types.h new file mode 100644 index 0000000..d473083 --- /dev/null +++ b/CRIU_code/compel/arch/riscv/plugins/include/asm/syscall-types.h @@ -0,0 +1,29 @@ +#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ +#define COMPEL_ARCH_SYSCALL_TYPES_H__ + +#define SA_RESTORER 0x04000000 + +typedef void rt_signalfn_t(int, siginfo_t *, void *); +typedef rt_signalfn_t *rt_sighandler_t; + +typedef void rt_restorefn_t(void); +typedef rt_restorefn_t *rt_sigrestore_t; + +#define _KNSIG 64 +#define 
_NSIG_BPW 64 + +#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) + +typedef struct { + unsigned long sig[_KNSIG_WORDS]; +} k_rtsigset_t; + +typedef struct { + rt_sighandler_t rt_sa_handler; + unsigned long rt_sa_flags; + rt_sigrestore_t rt_sa_restorer; + k_rtsigset_t rt_sa_mask; +} rt_sigaction_t; + + +#endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ \ No newline at end of file diff --git a/CRIU_code/compel/arch/riscv/plugins/include/features.h b/CRIU_code/compel/arch/riscv/plugins/include/features.h new file mode 100644 index 0000000..e8b66c3 --- /dev/null +++ b/CRIU_code/compel/arch/riscv/plugins/include/features.h @@ -0,0 +1,6 @@ +#ifndef __COMPEL_ARCH_FEATURES_H +#define __COMPEL_ARCH_FEATURES_H + +#define ARCH_HAS_MEMCPY + +#endif /* __COMPEL_ARCH_FEATURES_H */ \ No newline at end of file diff --git a/CRIU_code/compel/arch/riscv/plugins/std/memcpy.S b/CRIU_code/compel/arch/riscv/plugins/std/memcpy.S new file mode 100644 index 0000000..b1e53f2 --- /dev/null +++ b/CRIU_code/compel/arch/riscv/plugins/std/memcpy.S @@ -0,0 +1,18 @@ +#include "common/asm/linkage.h" + + .section .head.text, "ax" +ENTRY(memcpy) + add t0,zero,a0 + addi t1,zero,0 +loop: + beq t1,a2,exit + lb t2,0(a1) + sb t2,0(a0) + addi t1,t1,1 + addi a0,a0,1 + addi a1,a1,1 + j loop +exit: + mv a0, t0 + jr ra +END(memcpy) \ No newline at end of file diff --git a/CRIU_code/compel/arch/riscv/plugins/std/parasite-head.S b/CRIU_code/compel/arch/riscv/plugins/std/parasite-head.S new file mode 100644 index 0000000..4609b73 --- /dev/null +++ b/CRIU_code/compel/arch/riscv/plugins/std/parasite-head.S @@ -0,0 +1,11 @@ +#include "common/asm/linkage.h" + + .section .head.text, "ax" +ENTRY(__export_parasite_head_start) + lw a0, __export_parasite_cmd + la a1, __export_parasite_args + jal parasite_service + EBREAK +__export_parasite_cmd: + .long 0 +END(__export_parasite_head_start) \ No newline at end of file diff --git a/CRIU_code/compel/arch/riscv/plugins/std/syscalls/Makefile.syscalls 
b/CRIU_code/compel/arch/riscv/plugins/std/syscalls/Makefile.syscalls new file mode 100644 index 0000000..ad66e14 --- /dev/null +++ b/CRIU_code/compel/arch/riscv/plugins/std/syscalls/Makefile.syscalls @@ -0,0 +1,117 @@ +std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/syscalls-64.o +sys-proto-types := $(obj)/include/uapi/std/syscall-types.h +sys-proto-generic := $(obj)/include/uapi/std/syscall.h +sys-codes-generic := $(obj)/include/uapi/std/syscall-codes.h +sys-codes = $(obj)/include/uapi/std/syscall-codes-$(1).h +sys-proto = $(obj)/include/uapi/std/syscall-$(1).h +sys-def = $(PLUGIN_ARCH_DIR)/std/syscalls/syscall_$(1).tbl +sys-asm = $(PLUGIN_ARCH_DIR)/std/syscalls-$(1).S +sys-asm-common-name = std/syscalls/syscall-common-riscv-$(1).S +sys-asm-common = $(PLUGIN_ARCH_DIR)/$(sys-asm-common-name) +sys-asm-types := $(obj)/include/uapi/std/asm/syscall-types.h +sys-exec-tbl = $(PLUGIN_ARCH_DIR)/std/sys-exec-tbl-$(1).c + +sys-bits := 64 + +AV := $$$$ + +define gen-rule-sys-codes +$(sys-codes): $(sys-def) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) echo "#ifndef ASM_SYSCALL_CODES_H_$(1)__" >> $$@ + $(Q) echo "#define ASM_SYSCALL_CODES_H_$(1)__" >> $$@ + $(Q) cat $$< | awk '/^__NR/{SYSN=$(AV)1; \ + sub("^__NR", "SYS", SYSN); \ + print "\n#ifndef ", $(AV)1; \ + print "#define", $(AV)1, $(AV)2; \ + print "#endif"; \ + print "\n#ifndef ", SYSN; \ + print "#define ", SYSN, $(AV)1; \ + print "#endif";}' >> $$@ + $(Q) echo "#endif /* ASM_SYSCALL_CODES_H_$(1)__ */" >> $$@ +endef + +define gen-rule-sys-proto +$(sys-proto): $(sys-def) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) echo "#ifndef ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ + $(Q) echo "#define ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ + $(Q) echo '#include ' >> $$@ + $(Q) echo '#include ' >> $$@ +ifeq ($(1),32) + $(Q) echo '#include "asm/syscall32.h"' >> $$@ +endif + $(Q) cat $$< | awk '/^__NR/{print "extern long", $(AV)3, \ + 
substr($(AV)0, index($(AV)0,$(AV)4)), ";"}' >> $$@ + $(Q) echo "#endif /* ASM_SYSCALL_PROTO_H_$(1)__ */" >> $$@ +endef + +define gen-rule-sys-asm +$(sys-asm): $(sys-def) $(sys-asm-common) $(sys-codes) $(sys-proto) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) echo '#include ' >> $$@ + $(Q) echo '#include "$(sys-asm-common-name)"' >> $$@ + $(Q) cat $$< | awk '/^__NR/{print "SYSCALL(", $(AV)3, ",", $(AV)2, ")"}' >> $$@ +endef + +define gen-rule-sys-exec-tbl +$(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(sys-proto-generic) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) cat $$< | awk '/^__NR/{print \ + "SYSCALL(", substr($(AV)3, 5), ",", $(AV)2, ")"}' >> $$@ +endef + +$(sys-codes-generic): $(sys-proto-types) + $(call msg-gen, $@) + $(Q) echo "/* Autogenerated, don't edit */" > $@ + $(Q) echo "#ifndef __ASM_CR_SYSCALL_CODES_H__" >> $@ + $(Q) echo "#define __ASM_CR_SYSCALL_CODES_H__" >> $@ + $(Q) echo '#include ' >> $@ + $(Q) cat $< | awk '/^__NR/{NR32=$$1; \ + sub("^__NR", "__NR32", NR32); \ + print "\n#ifndef ", NR32; \ + print "#define ", NR32, $$2; \ + print "#endif";}' >> $@ + $(Q) echo "#endif /* __ASM_CR_SYSCALL_CODES_H__ */" >> $@ +mrproper-y += $(sys-codes-generic) + +$(sys-proto-generic): $(strip $(call map,sys-proto,$(sys-bits))) $(sys-proto-types) + $(call msg-gen, $@) + $(Q) echo "/* Autogenerated, don't edit */" > $@ + $(Q) echo "#ifndef __ASM_CR_SYSCALL_PROTO_H__" >> $@ + $(Q) echo "#define __ASM_CR_SYSCALL_PROTO_H__" >> $@ + $(Q) echo "" >> $@ + $(Q) echo '#include ' >> $@ + $(Q) echo "" >> $@ + $(Q) echo "#endif /* __ASM_CR_SYSCALL_PROTO_H__ */" >> $@ +mrproper-y += $(sys-proto-generic) + +define gen-rule-sys-exec-tbl +$(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(sys-proto-generic) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) cat $$< | awk '/^__NR/{print \ + "SYSCALL(", substr($(AV)3, 5), 
",", $(AV)2, ")"}' >> $$@ +endef + +$(eval $(call map,gen-rule-sys-codes,$(sys-bits))) +$(eval $(call map,gen-rule-sys-proto,$(sys-bits))) +$(eval $(call map,gen-rule-sys-asm,$(sys-bits))) +$(eval $(call map,gen-rule-sys-exec-tbl,$(sys-bits))) + +$(sys-asm-types): $(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h + $(call msg-gen, $@) + $(Q) ln -s ../../../../../../$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(sys-asm-types) + +std-headers-deps += $(call sys-codes,$(sys-bits)) +std-headers-deps += $(call sys-proto,$(sys-bits)) +std-headers-deps += $(call sys-asm,$(sys-bits)) +std-headers-deps += $(call sys-exec-tbl,$(sys-bits)) +std-headers-deps += $(sys-codes-generic) +std-headers-deps += $(sys-proto-generic) +std-headers-deps += $(sys-asm-types) +mrproper-y += $(std-headers-deps) \ No newline at end of file diff --git a/CRIU_code/compel/arch/riscv/plugins/std/syscalls/syscall-common-riscv-64.S b/CRIU_code/compel/arch/riscv/plugins/std/syscalls/syscall-common-riscv-64.S new file mode 100644 index 0000000..cedb8ba --- /dev/null +++ b/CRIU_code/compel/arch/riscv/plugins/std/syscalls/syscall-common-riscv-64.S @@ -0,0 +1,12 @@ +#include "common/asm/linkage.h" + +#define SYSCALL(name, opcode) \ + ENTRY(name); \ + li a0, opcode; \ + syscall; \ + jr ra; \ + nop; \ + END(name) + +ENTRY(__cr_restore_rt) +END(__cr_restore_rt) \ No newline at end of file diff --git a/CRIU_code/compel/arch/riscv/plugins/std/syscalls/syscall_64.tbl b/CRIU_code/compel/arch/riscv/plugins/std/syscalls/syscall_64.tbl new file mode 100644 index 0000000..389d8d2 --- /dev/null +++ b/CRIU_code/compel/arch/riscv/plugins/std/syscalls/syscall_64.tbl @@ -0,0 +1,110 @@ +# __NR_name code name arguments +# ------------------------------------------------------------------------------------------------------------------------------------------------------------- +__NR_read 5000 sys_read (int fd, void *buf, unsigned long count) +__NR_write 5001 sys_write (int fd, const void *buf, unsigned long count) 
+__NR_open 5002 sys_open (const char *filename, unsigned long flags, unsigned long mode) +__NR_close 5003 sys_close (int fd) +__NR_lseek 5008 sys_lseek (int fd, unsigned long offset, unsigned long origin) +__NR_mmap 5009 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) +__NR_mprotect 5010 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) +__NR_munmap 5011 sys_munmap (void *addr, unsigned long len) +__NR_brk 5012 sys_brk (void *addr) +__NR_rt_sigaction 5013 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) +__NR_rt_sigprocmask 5014 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) +__NR_rt_sigreturn 5211 sys_rt_sigreturn (void) +__NR_ioctl 5015 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) +__NR_pread64 5016 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos) +__NR_mremap 5024 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) +__NR_mincore 5026 sys_mincore (void *addr, unsigned long size, unsigned char *vec) +__NR_madvise 5027 sys_madvise (unsigned long start, size_t len, int behavior) +__NR_shmat 5029 sys_shmat (int shmid, void *shmaddr, int shmflag) +__NR_dup2 5032 sys_dup2 (int oldfd, int newfd) +__NR_nanosleep 5034 sys_nanosleep (struct timespec *req, struct timespec *rem) +__NR_getitimer 5035 sys_getitimer (int which, const struct itimerval *val) +__NR_setitimer 5036 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old) +__NR_getpid 5038 sys_getpid (void) +__NR_socket 5040 sys_socket (int domain, int type, int protocol) +__NR_connect 5041 sys_connect (int sockfd, struct sockaddr *addr, int addrlen) +__NR_sendto 5043 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) +__NR_recvfrom 5044 sys_recvfrom (int sockfd, 
void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) +__NR_sendmsg 5045 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags) +__NR_recvmsg 5046 sys_recvmsg (int sockfd, struct msghdr *msg, int flags) +__NR_shutdown 5047 sys_shutdown (int sockfd, int how) +__NR_bind 5048 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen) +__NR_setsockopt 5053 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen) +__NR_getsockopt 5054 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) +__NR_clone 5055 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) +__NR_exit 5058 sys_exit (unsigned long error_code) +__NR_wait4 5059 sys_wait4 (int pid, int *status, int options, struct rusage *ru) +__NR_kill 5060 sys_kill (long pid, int sig) +__NR_fcntl 5070 sys_fcntl (int fd, int type, long arg) +__NR_flock 5071 sys_flock (int fd, unsigned long cmd) +__NR_mkdir 5081 sys_mkdir (const char *name, int mode) +__NR_rmdir 5082 sys_rmdir (const char *name) +__NR_unlink 5085 sys_unlink (char *pathname) +__NR_umask 5093 sys_umask (int mask) +__NR_gettimeofday 5094 sys_gettimeofday (struct timeval *tv, struct timezone *tz) +__NR_ptrace 5099 sys_ptrace (long request, pid_t pid, void *addr, void *data) +__NR_getgroups 5113 sys_getgroups (int gsize, unsigned int *groups) +__NR_setgroups 5114 sys_setgroups (int gsize, unsigned int *groups) +__NR_setresuid 5115 sys_setresuid (int uid, int euid, int suid) +__NR_getresuid 5116 sys_getresuid (int *uid, int *euid, int *suid) +__NR_setresgid 5117 sys_setresgid (int gid, int egid, int sgid) +__NR_getresgid 5118 sys_getresgid (int *gid, int *egid, int *sgid) +__NR_getpgid 5119 sys_getpgid (pid_t pid) +__NR_setfsuid 5120 sys_setfsuid (int fsuid) +__NR_setfsgid 5121 sys_setfsgid (int fsgid) +__NR_getsid 5122 sys_getsid (void) +__NR_capget 5123 sys_capget (struct cap_header *h, struct 
cap_data *d) +__NR_capset 5124 sys_capset (struct cap_header *h, struct cap_data *d) +__NR_rt_sigqueueinfo 5127 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info) +__NR_sigaltstack 5129 sys_sigaltstack (const void *uss, void *uoss) +__NR_personality 5132 sys_personality (unsigned int personality) +__NR_setpriority 5138 sys_setpriority (int which, int who, int nice) +__NR_sched_setscheduler 5141 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) +__NR_prctl 5153 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) +__NR_setrlimit 5155 sys_setrlimit (int resource, struct krlimit *rlim) +__NR_mount 5160 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) +__NR_umount2 5161 sys_umount2 (char *name, int flags) +__NR_gettid 5178 sys_gettid (void) +__NR_futex 5194 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) +__NR_cacheflush 5197 sys_cacheflush (char *addr, int nbytes, int cache) +__NR_io_setup 5200 sys_io_setup (unsigned nr_events, aio_context_t *ctx) +__NR_io_getevents 5202 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) +__NR_io_submit 5203 sys_io_submit (aio_context_t ctx, long nr, struct iocb **iocbpp) +__NR_set_tid_address 5212 sys_set_tid_address (int *tid_addr) +__NR_restart_syscall 5213 sys_restart_syscall (void) +__NR_sys_timer_create 5216 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) +__NR_sys_timer_settime 5217 sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) +__NR_sys_timer_gettime 5218 sys_timer_gettime (int timer_id, const struct itimerspec *setting) +__NR_sys_timer_getoverrun 5219 sys_timer_getoverrun (int timer_id) +__NR_sys_timer_delete 5220 sys_timer_delete (kernel_timer_t timer_id) 
+__NR_clock_gettime 5222 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) +__NR_exit_group 5205 sys_exit_group (int error_code) +__NR_set_thread_area 5242 sys_set_thread_area (unsigned long *addr) +__NR_openat 5247 sys_openat (int dfd, const char *filename, int flags, int mode) +__NR_waitid 5237 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) +__NR_readlinkat 5257 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) +__NR_ppoll 5261 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) +__NR_set_robust_list 5268 sys_set_robust_list (struct robust_list_head *head, size_t len) +__NR_get_robust_list 5269 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) +__NR_fallocate 5279 sys_fallocate (int fd, int mode, loff_t offset, loff_t len) +__NR_seccomp 5312 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) +__NR_vmsplice 5266 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) +__NR_timerfd_settime 5282 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) +__NR_signalfd4 5283 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) +__NR_preadv 5289 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) +__NR_rt_tgsigqueueinfo 5291 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) +__NR_fanotify_init 5295 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) +__NR_fanotify_mark 5296 sys_fanotify_mark (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) +__NR_open_by_handle_at 5299 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) +__NR_setns 5303 sys_setns (int fd, int nstype) +__NR_kcmp 5306 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned 
long idx1, unsigned long idx2) +__NR_memfd_create 5314 sys_memfd_create (const char *name, unsigned int flags) +__NR_userfaultfd 5317 sys_userfaultfd (int flags) + +##TODO for kernel +__NR_fsopen 5430 sys_fsopen (char *fsname, unsigned int flags) +__NR_fsconfig 5431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) +__NR_fsmount 5432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) +__NR_clone3 5435 sys_clone3 (struct clone_args *uargs, size_t size) \ No newline at end of file diff --git a/CRIU_code/compel/arch/riscv/scripts/compel-pack.lds.S b/CRIU_code/compel/arch/riscv/scripts/compel-pack.lds.S new file mode 100644 index 0000000..0a045dd --- /dev/null +++ b/CRIU_code/compel/arch/riscv/scripts/compel-pack.lds.S @@ -0,0 +1,33 @@ +OUTPUT_ARCH(riscv) +EXTERN(__export_parasite_head_start) + +SECTIONS +{ + .text : { + *(.head.text) + ASSERT(DEFINED(__export_parasite_head_start), + "Symbol __export_parasite_head_start is missing"); + *(.text*) + *(.compel.exit) + *(.compel.init) + /* .rodata section*/ + *(.rodata*) + *(.got*) + /* .data section */ + *(.data*) + *(.bss*) + *(.sbss*) + *(.toc*) + } + + /DISCARD/ : { /*segments need to discard */ + *(.debug*) + *(.pdr) + *(.comment*) + *(.note*) + *(.group*) + *(.eh_frame*) + *(.RISCV.options) + *(.gnu.attributes) + } +} \ No newline at end of file diff --git a/CRIU_code/compel/arch/riscv/src/lib/cpu.c b/CRIU_code/compel/arch/riscv/src/lib/cpu.c new file mode 100644 index 0000000..e70f448 --- /dev/null +++ b/CRIU_code/compel/arch/riscv/src/lib/cpu.c @@ -0,0 +1,37 @@ + +#include +#include + +#include "compel-cpu.h" +#include "common/bitops.h" +#include "common/compiler.h" +#include "log.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +static compel_cpuinfo_t rt_info; +static bool rt_info_done = false; + +void compel_set_cpu_cap(compel_cpuinfo_t *c, unsigned int feature){ } + +void compel_clear_cpu_cap(compel_cpuinfo_t *c, unsigned int feature){ } + +int 
compel_test_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) +{ + return 0; +} + +int compel_cpuid(compel_cpuinfo_t *c){ + return 0; +} + +bool compel_cpu_has_feature(unsigned int feature) +{ + if (!rt_info_done) { + compel_cpuid(&rt_info); + rt_info_done = true; + } + + return compel_test_cpu_cap(&rt_info, feature); +} \ No newline at end of file diff --git a/CRIU_code/compel/arch/riscv/src/lib/handle-elf.c b/CRIU_code/compel/arch/riscv/src/lib/handle-elf.c new file mode 100644 index 0000000..385ce9f --- /dev/null +++ b/CRIU_code/compel/arch/riscv/src/lib/handle-elf.c @@ -0,0 +1,11 @@ +#ifndef COMPEL_HANDLE_ELF_H__ +#define COMPEL_HANDLE_ELF_H__ + +#include "elf64-types.h" + +#define __handle_elf handle_elf_riscv +#define arch_is_machine_supported(e_machine) (e_machine == EM_RISCV) + +extern int handle_elf_riscv(void *mem, size_t size); + +#endif /* COMPEL_HANDLE_ELF_H__ */ \ No newline at end of file diff --git a/CRIU_code/compel/arch/riscv/src/lib/include/handle-elf.h b/CRIU_code/compel/arch/riscv/src/lib/include/handle-elf.h new file mode 100644 index 0000000..2eeaae1 --- /dev/null +++ b/CRIU_code/compel/arch/riscv/src/lib/include/handle-elf.h @@ -0,0 +1,8 @@ +#ifndef COMPEL_HANDLE_ELF_H__ +#define COMPEL_HANDLE_ELF_H__ + +#include "elf64-types.h" + +#define arch_is_machine_supported(e_machine) (e_machine == EM_RISCV) + +#endif /* COMPEL_HANDLE_ELF_H__ */ \ No newline at end of file diff --git a/CRIU_code/compel/arch/riscv/src/lib/include/syscall.h b/CRIU_code/compel/arch/riscv/src/lib/include/syscall.h new file mode 100644 index 0000000..a6aab1a --- /dev/null +++ b/CRIU_code/compel/arch/riscv/src/lib/include/syscall.h @@ -0,0 +1,7 @@ +#ifndef __COMPEL_SYSCALL_H__ +#define __COMPEL_SYSCALL_H__ + +#ifndef SIGSTKFLT +#define SIGSTKFLT 16 +#endif +#endif \ No newline at end of file diff --git a/CRIU_code/compel/arch/riscv/src/lib/include/uapi/asm/breakpoints.h b/CRIU_code/compel/arch/riscv/src/lib/include/uapi/asm/breakpoints.h new file mode 100644 index 
0000000..ac167f5 --- /dev/null +++ b/CRIU_code/compel/arch/riscv/src/lib/include/uapi/asm/breakpoints.h @@ -0,0 +1,6 @@ +#ifndef __COMPEL_BREAKPOINTS_H__ +#define __COMPEL_BREAKPOINTS_H__ +#define ARCH_SI_TRAP TRAP_BRKPT +extern int ptrace_set_breakpoint(pid_t pid, void *addr); +extern int ptrace_flush_breakpoints(pid_t pid); +#endif \ No newline at end of file diff --git a/CRIU_code/compel/arch/riscv/src/lib/include/uapi/asm/cpu.h b/CRIU_code/compel/arch/riscv/src/lib/include/uapi/asm/cpu.h new file mode 100644 index 0000000..d32fe56 --- /dev/null +++ b/CRIU_code/compel/arch/riscv/src/lib/include/uapi/asm/cpu.h @@ -0,0 +1,5 @@ +#ifndef __CR_ASM_CPU_H__ +#define __CR_ASM_CPU_H__ + +typedef struct { } compel_cpuinfo_t; +#endif /* __CR_ASM_CPU_H__ */ \ No newline at end of file diff --git a/CRIU_code/compel/arch/riscv/src/lib/include/uapi/asm/fpu.h b/CRIU_code/compel/arch/riscv/src/lib/include/uapi/asm/fpu.h new file mode 100644 index 0000000..a74decc --- /dev/null +++ b/CRIU_code/compel/arch/riscv/src/lib/include/uapi/asm/fpu.h @@ -0,0 +1,4 @@ +#ifndef __CR_ASM_FPU_H__ +#define __CR_ASM_FPU_H__ + +#endif /* __CR_ASM_FPU_H__ */ \ No newline at end of file diff --git a/CRIU_code/compel/arch/riscv/src/lib/include/uapi/asm/infect-types.h b/CRIU_code/compel/arch/riscv/src/lib/include/uapi/asm/infect-types.h new file mode 100644 index 0000000..3a42c9c --- /dev/null +++ b/CRIU_code/compel/arch/riscv/src/lib/include/uapi/asm/infect-types.h @@ -0,0 +1,103 @@ +#ifndef UAPI_COMPEL_ASM_TYPES_H__ +#define UAPI_COMPEL_ASM_TYPES_H__ + +#include +#include +#include +#include +#include + +#define SIGMAX 64 +#define SIGMAX_OLD 31 + + +struct user_regs_struct { + unsigned long pc; + unsigned long ra; + unsigned long sp; + unsigned long gp; + unsigned long tp; + unsigned long t0; + unsigned long t1; + unsigned long t2; + unsigned long s0; + unsigned long s1; + unsigned long a0; + unsigned long a1; + unsigned long a2; + unsigned long a3; + unsigned long a4; + unsigned long a5; + 
unsigned long a6; + unsigned long a7; + unsigned long s2; + unsigned long s3; + unsigned long s4; + unsigned long s5; + unsigned long s6; + unsigned long s7; + unsigned long s8; + unsigned long s9; + unsigned long s10; + unsigned long s11; + unsigned long t3; + unsigned long t4; + unsigned long t5; + unsigned long t6; +}; + +typedef struct user_regs_struct user_regs_struct_t; + +struct __riscv_f_ext_state { + __u32 f[32]; + __u32 fcsr; +}; + +struct __riscv_d_ext_state { + __u64 f[32]; + __u32 fcsr; +}; + +struct __riscv_q_ext_state { + __u64 f[64] __attribute__((aligned(16))); + __u32 fcsr; + /* + * Reserved for expansion of sigcontext structure. Currently zeroed + * upon signal, and must be zero upon sigreturn. + */ + __u32 reserved[3]; +}; + +union __riscv_fp_state { + struct __riscv_f_ext_state f; + struct __riscv_d_ext_state d; + struct __riscv_q_ext_state q; +}; + +typedef union __riscv_fp_state user_fpregs_struct_t; /* __riscv_fp_state is declared as a union above */ + +#define RISCV_a0 a0 /* user_regs_struct has named fields, there is no regs[] array */ +#define RISCV_t0 t0 /* first temporary register */ +#define RISCV_v0 a0 /* result register */ +#define RISCV_v1 a1 +#define RISCV_sp sp +#define RISCV_ra ra + + +#define NATIVE_MAGIC 0x0A +#define COMPAT_MAGIC 0x0C +static inline bool user_regs_native(user_regs_struct_t *pregs) +{ + return true; +} + + +#define REG_RES(regs) ((regs).a0) /* syscall result comes back in a0 */ +#define REG_IP(regs) ((regs).pc) +#define REG_SP(regs) ((regs).sp) +#define REG_SYSCALL_NR(regs) ((regs).a7) /* syscall number is passed in a7 */ + +//#define __NR(syscall, compat) ((compat) ? 
__NR32_##syscall : __NR_##syscall) +#define __NR(syscall, compat) __NR_##syscall + +#endif /* UAPI_COMPEL_ASM_TYPES_H__ */ \ No newline at end of file diff --git a/CRIU_code/compel/arch/riscv/src/lib/include/uapi/asm/sigframe.h b/CRIU_code/compel/arch/riscv/src/lib/include/uapi/asm/sigframe.h new file mode 100644 index 0000000..6b48d0b --- /dev/null +++ b/CRIU_code/compel/arch/riscv/src/lib/include/uapi/asm/sigframe.h @@ -0,0 +1,76 @@ +#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ +#define UAPI_COMPEL_ASM_SIGFRAME_H__ + +#include +#include + +#include +#include +#include "siginfo.h" +#include "infect-types.h" +#include +#include +#define u32 __u32 + +/* sigcontext defined in /usr/include/asm/sigcontext.h*/ +#define rt_sigcontext sigcontext + + +#include + +struct sigcontext { + struct user_regs_struct sc_regs; + union __riscv_fp_state sc_fpregs; +}; + +typedef struct sigaltstack { + void __user *ss_sp; + int ss_flags; + size_t ss_size; +} stack_t; + +struct ucontext { + unsigned long uc_flags; + struct ucontext *uc_link; + stack_t uc_stack; + struct sigcontext uc_mcontext; + sigset_t uc_sigmask; /* mask last for extensibility; NOTE(review): the riscv kernel ucontext appears to place uc_sigmask (plus padding) before uc_mcontext - confirm layout */ +}; + + +struct rt_sigframe { + struct siginfo info; + struct ucontext uc; +#ifndef CONFIG_MMU + u32 sigreturn_code[2]; +#endif +}; + + +#define RT_SIGFRAME_UC(rt_sigframe) (&rt_sigframe->uc) +#define RT_SIGFRAME_UC_SIGMASK(rt_sigframe) ((k_rtsigset_t *)(void *)&rt_sigframe->uc.uc_sigmask) +#define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(rt_sigframe)->uc.uc_mcontext.sc_regs.pc) /* saved program counter of the frame */ +#define RT_SIGFRAME_FPU(rt_sigframe) +#define RT_SIGFRAME_HAS_FPU(rt_sigframe) 1 + + +#define RT_SIGFRAME_OFFSET(rt_sigframe) 0 + + +#define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ + asm volatile( \ + "mv sp, %0 \n" /* switch the stack pointer to new_sp */ \ + "li a7, "__stringify(__NR_rt_sigreturn)" \n" /* syscall number goes in a7 */ \ + "ecall \n" /* RISC-V system call instruction */ \ + : \ + : "r"(new_sp) \ + : "a7","memory") + +int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, + struct rt_sigframe *rsigframe); + +#define rt_sigframe_erase_sigset(sigframe) \ + 
memset(&(sigframe)->uc.uc_sigmask, 0, sizeof(k_rtsigset_t)) /* struct rt_sigframe names its ucontext member uc */ +#define rt_sigframe_copy_sigset(sigframe, from) \ + memcpy(&(sigframe)->uc.uc_sigmask, from, sizeof(k_rtsigset_t)) /* struct rt_sigframe names its ucontext member uc */ +#endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ \ No newline at end of file diff --git a/CRIU_code/compel/arch/riscv/src/lib/include/uapi/asm/siginfo.h b/CRIU_code/compel/arch/riscv/src/lib/include/uapi/asm/siginfo.h new file mode 100644 index 0000000..ba80745 --- /dev/null +++ b/CRIU_code/compel/arch/riscv/src/lib/include/uapi/asm/siginfo.h @@ -0,0 +1,116 @@ +#ifndef _UAPI_ASM_SIGINFO_H +#define _UAPI_ASM_SIGINFO_H + + +#define __ARCH_SIGEV_PREAMBLE_SIZE (sizeof(long) + 2*sizeof(int)) +#undef __ARCH_SI_TRAPNO /* exception code needs to fill this ... */ + +#define HAVE_ARCH_SIGINFO_T + +/* + * Careful to keep union _sifields from shifting ... + */ + +#define __ARCH_SI_PREAMBLE_SIZE (4 * sizeof(int)) + +#define __ARCH_SIGSYS + +#define SI_MAX_SIZE 128 +#define SI_PAD_SIZE ((SI_MAX_SIZE - __ARCH_SI_PREAMBLE_SIZE) / sizeof(int)) +#define __ARCH_SI_UID_T __kernel_uid32_t + +#ifndef __ARCH_SI_UID_T +#define __ARCH_SI_UID_T __kernel_uid32_t +#endif + +#ifndef __ARCH_SI_BAND_T +#define __ARCH_SI_BAND_T long +#endif + +#ifndef __ARCH_SI_CLOCK_T +#define __ARCH_SI_CLOCK_T __kernel_clock_t +#endif + +#ifndef __ARCH_SI_ATTRIBUTES +#define __ARCH_SI_ATTRIBUTES +#endif + +typedef struct siginfo { + int si_signo; + int si_errno; + int si_code; + + union { + int _pad[SI_PAD_SIZE]; + + /* kill() */ + struct { + __kernel_pid_t _pid; /* sender's pid */ + __ARCH_SI_UID_T _uid; /* sender's uid */ + } _kill; + + /* POSIX.1b timers */ + struct { + __kernel_timer_t _tid; /* timer id */ + int _overrun; /* overrun count */ + char _pad[sizeof( __ARCH_SI_UID_T) - sizeof(int)]; + sigval_t _sigval; /* same as below */ + int _sys_private; /* not to be passed to user */ + } _timer; + + /* POSIX.1b signals */ + struct { + __kernel_pid_t _pid; /* sender's pid */ + __ARCH_SI_UID_T _uid; /* sender's uid */ + sigval_t _sigval; + } 
_rt; + + /* SIGCHLD */ + struct { + __kernel_pid_t _pid; /* which child */ + __ARCH_SI_UID_T _uid; /* sender's uid */ + int _status; /* exit code */ + __ARCH_SI_CLOCK_T _utime; + __ARCH_SI_CLOCK_T _stime; + } _sigchld; + + /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */ + struct { + void *_addr; /* faulting insn/memory ref. */ +#ifdef __ARCH_SI_TRAPNO + int _trapno; /* TRAP # which caused the signal */ +#endif + short _addr_lsb; /* LSB of the reported address */ +#ifndef __GENKSYMS__ + struct { + void *_lower; + void *_upper; + } _addr_bnd; +#endif + } _sigfault; + + /* SIGPOLL */ + struct { + __ARCH_SI_BAND_T _band; /* POLL_IN, POLL_OUT, POLL_MSG */ + int _fd; + } _sigpoll; + + /* SIGSYS */ + struct { + void *_call_addr; /* calling user insn */ + int _syscall; /* triggering system call number */ + unsigned int _arch; /* AUDIT_ARCH_* of syscall */ + } _sigsys; + } _sifields; +} __ARCH_SI_ATTRIBUTES siginfo_t; + +/* + * si_code values + * Again these have been chosen to be IRIX compatible. + */ +#undef SI_ASYNCIO +#undef SI_TIMER +#undef SI_MESGQ +#define SI_ASYNCIO -2 /* sent by AIO completion */ + +#endif /* _UAPI_ASM_SIGINFO_H */ \ No newline at end of file diff --git a/CRIU_code/compel/arch/riscv/src/lib/infect.c b/CRIU_code/compel/arch/riscv/src/lib/infect.c new file mode 100644 index 0000000..e0bcc83 --- /dev/null +++ b/CRIU_code/compel/arch/riscv/src/lib/infect.c @@ -0,0 +1,241 @@ +#include +#include +#include +#include +#include +#include +#include "errno.h" +#include +#include +#include "common/err.h" +#include "common/page.h" +#include "asm/infect-types.h" +#include "ptrace.h" +#include "infect.h" +#include "infect-priv.h" +#include "log.h" +#include "common/bug.h" +#include "asm/sigframe.h" +/* + * Injected syscall instruction + * riscv is Little Endian + */ +const char code_syscall[] = { + 0x73, 0x00, 0x00, 0x00, /* ECALL */ + 0x73, 0x00, 0x10, 0x00 /* EBREAK */ +}; + +/* 10-byte legacy floating point register */ +struct fpreg { + uint16_t significand[4]; + 
uint16_t exponent; +}; + +/* 16-byte floating point register */ +struct fpxreg { + uint16_t significand[4]; + uint16_t exponent; + uint16_t padding[3]; +}; + + +int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, + user_regs_struct_t *regs, + user_fpregs_struct_t *fpregs) +{ + sigframe->uc.uc_mcontext.sc_regs.pc = regs->pc; /* user_regs_struct fields are direct members, there is no nested regs */ + sigframe->uc.uc_mcontext.sc_regs.ra = regs->ra; + sigframe->uc.uc_mcontext.sc_regs.sp = regs->sp; + sigframe->uc.uc_mcontext.sc_regs.gp = regs->gp; + sigframe->uc.uc_mcontext.sc_regs.tp = regs->tp; + sigframe->uc.uc_mcontext.sc_regs.t0 = regs->t0; + sigframe->uc.uc_mcontext.sc_regs.t1 = regs->t1; + sigframe->uc.uc_mcontext.sc_regs.t2 = regs->t2; + sigframe->uc.uc_mcontext.sc_regs.s0 = regs->s0; + sigframe->uc.uc_mcontext.sc_regs.s1 = regs->s1; + sigframe->uc.uc_mcontext.sc_regs.a0 = regs->a0; + sigframe->uc.uc_mcontext.sc_regs.a1 = regs->a1; + sigframe->uc.uc_mcontext.sc_regs.a2 = regs->a2; + sigframe->uc.uc_mcontext.sc_regs.a3 = regs->a3; + sigframe->uc.uc_mcontext.sc_regs.a4 = regs->a4; + sigframe->uc.uc_mcontext.sc_regs.a5 = regs->a5; + sigframe->uc.uc_mcontext.sc_regs.a6 = regs->a6; + sigframe->uc.uc_mcontext.sc_regs.a7 = regs->a7; + sigframe->uc.uc_mcontext.sc_regs.s2 = regs->s2; + sigframe->uc.uc_mcontext.sc_regs.s3 = regs->s3; + sigframe->uc.uc_mcontext.sc_regs.s4 = regs->s4; + sigframe->uc.uc_mcontext.sc_regs.s5 = regs->s5; + sigframe->uc.uc_mcontext.sc_regs.s6 = regs->s6; + sigframe->uc.uc_mcontext.sc_regs.s7 = regs->s7; + sigframe->uc.uc_mcontext.sc_regs.s8 = regs->s8; + sigframe->uc.uc_mcontext.sc_regs.s9 = regs->s9; + sigframe->uc.uc_mcontext.sc_regs.s10 = regs->s10; + sigframe->uc.uc_mcontext.sc_regs.s11 = regs->s11; + sigframe->uc.uc_mcontext.sc_regs.t3 = regs->t3; + sigframe->uc.uc_mcontext.sc_regs.t4 = regs->t4; + sigframe->uc.uc_mcontext.sc_regs.t5 = regs->t5; + 
sigframe->uc.uc_mcontext.sc_regs.t6 = regs->t6; + + sigframe->uc.uc_mcontext.sc_fpregs.f.f[0] = fpregs->f.f[0]; /* NOTE(review): only the 32-bit f.f[] view is copied and fcsr is never copied - confirm this is enough when the D extension is in use */ + sigframe->uc.uc_mcontext.sc_fpregs.f.f[1] = fpregs->f.f[1]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[2] = fpregs->f.f[2]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[3] = fpregs->f.f[3]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[4] = fpregs->f.f[4]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[5] = fpregs->f.f[5]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[6] = fpregs->f.f[6]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[7] = fpregs->f.f[7]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[8] = fpregs->f.f[8]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[9] = fpregs->f.f[9]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[10] = fpregs->f.f[10]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[11] = fpregs->f.f[11]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[12] = fpregs->f.f[12]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[13] = fpregs->f.f[13]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[14] = fpregs->f.f[14]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[15] = fpregs->f.f[15]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[16] = fpregs->f.f[16]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[17] = fpregs->f.f[17]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[18] = fpregs->f.f[18]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[19] = fpregs->f.f[19]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[20] = fpregs->f.f[20]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[21] = fpregs->f.f[21]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[22] = fpregs->f.f[22]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[23] = fpregs->f.f[23]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[24] = fpregs->f.f[24]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[25] = fpregs->f.f[25]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[26] = fpregs->f.f[26]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[27] = fpregs->f.f[27]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[28] = fpregs->f.f[28]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[29] = fpregs->f.f[29]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[30] 
= fpregs->f.f[30]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[31] = fpregs->f.f[31]; + + return 0; +} + +int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, + struct rt_sigframe *rsigframe) +{ + return 0; +} + +int get_task_regs(pid_t pid, user_regs_struct_t *regs, save_regs_t save, + void *arg, __maybe_unused unsigned long flags) +{ + user_fpregs_struct_t xsave = { }, *xs = NULL; + int ret = -1; + + if (ptrace(PTRACE_GETFPREGS, pid, NULL, &xsave)) { /* NOTE(review): riscv kernels are believed to implement only PTRACE_GETREGSET (NT_PRFPREG) - confirm this request succeeds */ + pr_perror("Can't obtain FPU registers for %d", pid); + return ret; + } + + xs = &xsave; + ret = save(arg, regs, xs); + return ret; +} + +int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, + unsigned long arg1, + unsigned long arg2, + unsigned long arg3, + unsigned long arg4, + unsigned long arg5, + unsigned long arg6) +{ + + user_regs_struct_t regs = ctl->orig.regs; + int err; + + regs.a7 = (unsigned long)nr; /* RISC-V Linux ABI: syscall number in a7 */ + regs.a0 = arg1; /* arguments go in a0-a5 */ + regs.a1 = arg2; + regs.a2 = arg3; + regs.a3 = arg4; + regs.a4 = arg5; + regs.a5 = arg6; + + err = compel_execute_syscall(ctl, &regs, code_syscall); + *ret = regs.a0; /* result comes back in a0 */ + + return err; +} + +void *remote_mmap(struct parasite_ctl *ctl, + void *addr, size_t length, int prot, + int flags, int fd, off_t offset) +{ + long map; + int err; + + err = compel_syscall(ctl, __NR_mmap, &map, + (unsigned long)addr, length, prot, flags, fd, offset); /* mmap takes a byte offset; only mmap2 takes page units */ + + if (err < 0 || IS_ERR_VALUE(map)) { + pr_err("remote mmap() failed: %s\n", strerror(-map)); + return NULL; + } + + return (void *)map; +} + +/* + * regs must be inited when calling this function from original context + */ +void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) +{ + regs->pc = new_ip; /* user_regs_struct has pc, not epc */ + if (stack){ + regs->sp = (unsigned long)stack; + } +} + +bool arch_can_dump_task(struct parasite_ctl *ctl) +{ + return true; +} + +int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) +{ + long ret; + int 
err; + + err = compel_syscall(ctl, __NR_sigaltstack, + &ret, 0, (unsigned long)&s->uc.uc_stack, + 0, 0, 0, 0); + return err ? err : ret; +} + + +int ptrace_set_breakpoint(pid_t pid, void *addr) +{ + return 0; +} + +int ptrace_flush_breakpoints(pid_t pid) +{ + return 0; +} + + +#define TASK_SIZE32 0x7fff8000UL +#define TASK_SIZE64 0x10000000000UL +#define TASK_SIZE TASK_SIZE64 + +unsigned long compel_task_size(void) { return TASK_SIZE; } + +/* + * Get task registers (overwrites weak function) + * + */ +int ptrace_get_regs(int pid, user_regs_struct_t *regs) +{ + return ptrace(PTRACE_GETREGS, pid, NULL, regs); +} + +/* + * Set task registers (overwrites weak function) + */ +int ptrace_set_regs(int pid, user_regs_struct_t *regs) +{ + return ptrace(PTRACE_SETREGS, pid, NULL, regs); +} \ No newline at end of file diff --git a/CRIU_code/compel/arch/s390/plugins/include/asm/prologue.h b/CRIU_code/compel/arch/s390/plugins/include/asm/prologue.h new file mode 100644 index 0000000..e0275e3 --- /dev/null +++ b/CRIU_code/compel/arch/s390/plugins/include/asm/prologue.h @@ -0,0 +1 @@ +../../../../../arch/x86/plugins/include/asm/prologue.h \ No newline at end of file diff --git a/CRIU_code/compel/arch/s390/plugins/include/asm/syscall-types.h b/CRIU_code/compel/arch/s390/plugins/include/asm/syscall-types.h new file mode 100644 index 0000000..55d7ddb --- /dev/null +++ b/CRIU_code/compel/arch/s390/plugins/include/asm/syscall-types.h @@ -0,0 +1,34 @@ +#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ +#define COMPEL_ARCH_SYSCALL_TYPES_H__ + +#define SA_RESTORER 0x04000000U + +typedef void rt_signalfn_t(int, siginfo_t *, void *); +typedef rt_signalfn_t *rt_sighandler_t; + +typedef void rt_restorefn_t(void); +typedef rt_restorefn_t *rt_sigrestore_t; + +#define _KNSIG 64 +#define _NSIG_BPW 64 + +#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) + +typedef struct { + unsigned long sig[_KNSIG_WORDS]; +} k_rtsigset_t; + +/* + * Used for rt_sigaction() system call - see kernel "struct sigaction" in + * 
include/linux/signal.h. + */ +typedef struct { + rt_sighandler_t rt_sa_handler; + unsigned long rt_sa_flags; + rt_sigrestore_t rt_sa_restorer; + k_rtsigset_t rt_sa_mask; +} rt_sigaction_t; + +struct mmap_arg_struct; + +#endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ diff --git a/CRIU_code/compel/arch/s390/plugins/std/parasite-head.S b/CRIU_code/compel/arch/s390/plugins/std/parasite-head.S new file mode 100644 index 0000000..f4cb372 --- /dev/null +++ b/CRIU_code/compel/arch/s390/plugins/std/parasite-head.S @@ -0,0 +1,26 @@ +#include "common/asm/linkage.h" + + .section .head.text, "ax" + +/* + * Entry point for parasite_service() + * + * Addresses of symbols are exported in auto-generated criu/pie/parasite-blob.h + * + * Function is called via parasite_run(). The command for parasite_service() + * is stored in global variable __export_parasite_cmd. + * + * Load parameters for parasite_service(unsigned int cmd, void *args): + * + * - Parameter 1 (cmd) : %r2 = *(uint32 *)(__export_parasite_cmd + pc) + * - Parameter 2 (args): %r3 = __export_parasite_args + pc + */ +ENTRY(__export_parasite_head_start) + larl %r14,__export_parasite_cmd + llgf %r2,0(%r14) + larl %r3,__export_parasite_args + brasl %r14,parasite_service + .long 0x00010001 /* S390_BREAKPOINT_U16: Generates SIGTRAP */ +__export_parasite_cmd: + .long 0 +END(__export_parasite_head_start) diff --git a/CRIU_code/compel/arch/s390/plugins/std/syscalls/Makefile.syscalls b/CRIU_code/compel/arch/s390/plugins/std/syscalls/Makefile.syscalls new file mode 100644 index 0000000..f03b7cc --- /dev/null +++ b/CRIU_code/compel/arch/s390/plugins/std/syscalls/Makefile.syscalls @@ -0,0 +1,58 @@ +ccflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ +asflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ + +sys-types := $(obj)/include/uapi/std/syscall-types.h +sys-codes := $(obj)/include/uapi/std/syscall-codes.h +sys-proto := $(obj)/include/uapi/std/syscall.h + +sys-def := $(PLUGIN_ARCH_DIR)/std/syscalls/syscall-s390.tbl 
+sys-asm-common-name := std/syscalls/syscall-common-s390.S +sys-asm-common := $(PLUGIN_ARCH_DIR)/$(sys-asm-common-name) +sys-asm-types := $(obj)/include/uapi/std/asm/syscall-types.h +sys-exec-tbl = $(PLUGIN_ARCH_DIR)/std/sys-exec-tbl.c + +sys-asm := ./$(PLUGIN_ARCH_DIR)/std/syscalls/syscalls.S +std-lib-y += $(sys-asm:.S=).o +std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/syscalls/syscalls-s390.o + +$(sys-codes): $(sys-def) + $(E) " GEN " $@ + $(Q) echo "/* Autogenerated, don't edit */" > $@ + $(Q) echo "#ifndef __ASM_CR_SYSCALL_CODES_H__" >> $@ + $(Q) echo "#define __ASM_CR_SYSCALL_CODES_H__" >> $@ + $(Q) cat $< | awk '/^__NR/{SYSN=$$1; sub("^__NR", "SYS", SYSN);'\ + 'print "\n#ifndef ", $$1, "\n#define", $$1, $$2, "\n#endif";'\ + 'print "#ifndef ", SYSN, "\n#define ", SYSN, $$1, "\n#endif"}' >> $@ + $(Q) echo "#endif /* __ASM_CR_SYSCALL_CODES_H__ */" >> $@ + +$(sys-proto): $(sys-def) + $(E) " GEN " $@ + $(Q) echo "/* Autogenerated, don't edit */" > $@ + $(Q) echo "#ifndef __ASM_CR_SYSCALL_PROTO_H__" >> $@ + $(Q) echo "#define __ASM_CR_SYSCALL_PROTO_H__" >> $@ + $(Q) echo "#include " >> $@ + $(Q) echo "#include " >> $@ + $(Q) cat $< | awk '/^__NR/{print "extern long", $$3, substr($$0, index($$0,$$4)), ";"}' >> $@ + $(Q) echo "#endif /* __ASM_CR_SYSCALL_PROTO_H__ */" >> $@ + +$(sys-asm): $(sys-def) $(sys-asm-common) $(sys-codes) $(sys-proto) + $(E) " GEN " $@ + $(Q) echo "/* Autogenerated, don't edit */" > $@ + $(Q) echo "#include " >> $@ + $(Q) echo "#include \"$(sys-asm-common-name)\"" >> $@ + $(Q) cat $< | awk '/^__NR/{print "SYSCALL(", $$3, ",", $$2, ")"}' >> $@ + +$(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) + $(E) " GEN " $@ + $(Q) echo "/* Autogenerated, don't edit */" > $@ + $(Q) echo "static struct syscall_exec_desc sc_exec_table[] = {" >> $@ + $(Q) cat $< | awk '/^__NR/{print "SYSCALL(", substr($$3, 5), ",", $$2, ")"}' >> $@ + $(Q) echo " { }, /* terminator */" >> $@ + $(Q) echo "};" >> $@ + +$(sys-asm-types): $(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h 
+ $(call msg-gen, $@) + $(Q) ln -s ../../../../../../$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(sys-asm-types) + +std-headers-deps += $(sys-asm) $(sys-codes) $(sys-proto) $(sys-asm-types) +mrproper-y += $(std-headers-deps) diff --git a/CRIU_code/compel/arch/s390/plugins/std/syscalls/syscall-common-s390.S b/CRIU_code/compel/arch/s390/plugins/std/syscalls/syscall-common-s390.S new file mode 100644 index 0000000..79e3b8e --- /dev/null +++ b/CRIU_code/compel/arch/s390/plugins/std/syscalls/syscall-common-s390.S @@ -0,0 +1,37 @@ +#include "common/asm/linkage.h" + +/* + * Define a system call + * + * C-ABI on s390: + * - Parameters 1-5 are passed in %r2-%r6 + * - Parameter 6 is passed on the stack 160(%r15) + * - Return value is in %r2 + * - Return address is in %r14 + * - Registers %r0-%r6,%r14 are call-clobbered + * - Registers %r7-%r13,%r15 are call-saved + * + * SVC ABI on s390: + * - For SVC 0 the system call number is passed in %r1 + * - Parameters 1-6 are passed in %r2-%r7 + * - Return value is passed in %r2 + * - Besides of %r2 all registers are call-saved + */ +#define SYSCALL(name, opcode) \ +ENTRY(name); \ + lgr %r0,%r7; /* Save %r7 */ \ + lg %r7,160(%r15); /* Load 6th parameter */ \ + lghi %r1,opcode; /* Load SVC number */ \ + svc 0; /* Issue SVC 0 */ \ + lgr %r7,%r0; /* Restore %r7 */ \ + br %r14; /* Return to caller */ \ +END(name) \ + +/* + * Issue rt_sigreturn system call for sa_restorer + */ +ENTRY(__cr_restore_rt) + lghi %r1,__NR_rt_sigreturn + svc 0 +END(__cr_restore_rt) + diff --git a/CRIU_code/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl b/CRIU_code/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl new file mode 100644 index 0000000..3521e91 --- /dev/null +++ b/CRIU_code/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl @@ -0,0 +1,109 @@ +# +# System calls table, please make sure the table consists of only the syscalls +# really used somewhere in the project. 
+# +# The template is (name and arguments are optional if you need only __NR_x +# defined, but no real entry point in syscalls lib). +# +# name code name arguments +# ----------------------------------------------------------------------- +# +__NR_read 3 sys_read (int fd, void *buf, unsigned long count) +__NR_write 4 sys_write (int fd, const void *buf, unsigned long count) +__NR_open 5 sys_open (const char *filename, unsigned long flags, unsigned long mode) +__NR_close 6 sys_close (int fd) +__NR_lseek 19 sys_lseek (int fd, unsigned long offset, unsigned long origin) +__NR_mmap 90 sys_old_mmap (struct mmap_arg_struct *) +__NR_mprotect 125 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) +__NR_munmap 91 sys_munmap (void *addr, unsigned long len) +__NR_brk 45 sys_brk (void *addr) +__NR_rt_sigaction 174 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) +__NR_rt_sigprocmask 175 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) +__NR_rt_sigreturn 173 sys_rt_sigreturn (void) +__NR_ioctl 54 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) +__NR_pread64 180 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos) +__NR_ptrace 26 sys_ptrace (long request, pid_t pid, void *addr, void *data) +__NR_mremap 163 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) +__NR_mincore 218 sys_mincore (void *addr, unsigned long size, unsigned char *vec) +__NR_madvise 219 sys_madvise (unsigned long start, size_t len, int behavior) +__NR_pause 29 sys_pause (void) +__NR_nanosleep 162 sys_nanosleep (struct timespec *req, struct timespec *rem) +__NR_getitimer 105 sys_getitimer (int which, const struct itimerval *val) +__NR_setitimer 104 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old) +__NR_getpid 20 sys_getpid (void) +__NR_socket 359 sys_socket (int domain, int type, int 
protocol) +__NR_connect 362 sys_connect (int sockfd, struct sockaddr *addr, int addrlen) +__NR_sendto 369 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) +__NR_recvfrom 371 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) +__NR_sendmsg 370 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags) +__NR_recvmsg 372 sys_recvmsg (int sockfd, struct msghdr *msg, int flags) +__NR_shutdown 373 sys_shutdown (int sockfd, int how) +__NR_bind 361 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen) +__NR_setsockopt 366 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen) +__NR_getsockopt 365 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) +__NR_clone 120 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, void *child_tid, void *tls) +__NR_exit 1 sys_exit (unsigned long error_code) +__NR_wait4 114 sys_wait4 (int pid, int *status, int options, struct rusage *ru) +__NR_kill 37 sys_kill (long pid, int sig) +__NR_fcntl 55 sys_fcntl (int fd, int type, long arg) +__NR_flock 143 sys_flock (int fd, unsigned long cmd) +__NR_mkdir 39 sys_mkdir (const char *name, int mode) +__NR_rmdir 40 sys_rmdir (const char *name) +__NR_unlink 10 sys_unlink (char *pathname) +__NR_readlinkat 298 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) +__NR_umask 60 sys_umask (int mask) +__NR_getgroups 205 sys_getgroups (int gsize, unsigned int *groups) +__NR_setgroups 206 sys_setgroups (int gsize, unsigned int *groups) +__NR_setresuid 208 sys_setresuid (int uid, int euid, int suid) +__NR_getresuid 209 sys_getresuid (int *uid, int *euid, int *suid) +__NR_setresgid 210 sys_setresgid (int gid, int egid, int sgid) +__NR_getresgid 211 sys_getresgid (int *gid, int *egid, int *sgid) +__NR_getpgid 132 sys_getpgid (pid_t pid) +__NR_setfsuid 215 sys_setfsuid (int fsuid) 
+__NR_setfsgid 216 sys_setfsgid (int fsgid) +__NR_getsid 147 sys_getsid (void) +__NR_capget 184 sys_capget (struct cap_header *h, struct cap_data *d) +__NR_capset 185 sys_capset (struct cap_header *h, struct cap_data *d) +__NR_rt_sigqueueinfo 178 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info) +__NR_sigaltstack 186 sys_sigaltstack (const void *uss, void *uoss) +__NR_personality 136 sys_personality (unsigned int personality) +__NR_setpriority 97 sys_setpriority (int which, int who, int nice) +__NR_sched_setscheduler 156 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) +__NR_prctl 172 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) +__NR_setrlimit 75 sys_setrlimit (int resource, struct krlimit *rlim) +__NR_mount 21 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) +__NR_umount2 52 sys_umount2 (char *name, int flags) +__NR_gettid 236 sys_gettid (void) +__NR_futex 238 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) +__NR_set_tid_address 252 sys_set_tid_address (int *tid_addr) +__NR_restart_syscall 7 sys_restart_syscall (void) +__NR_sys_timer_create 254 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) +__NR_sys_timer_settime 255 sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) +__NR_sys_timer_gettime 256 sys_timer_gettime (int timer_id, const struct itimerspec *setting) +__NR_sys_timer_getoverrun 257 sys_timer_getoverrun (int timer_id) +__NR_sys_timer_delete 258 sys_timer_delete (kernel_timer_t timer_id) +__NR_clock_gettime 260 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) +__NR_exit_group 248 sys_exit_group (int error_code) +__NR_waitid 281 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) 
+__NR_set_robust_list 304 sys_set_robust_list (struct robust_list_head *head, size_t len) +__NR_get_robust_list 305 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) +__NR_vmsplice 309 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) +__NR_openat 288 sys_openat (int dfd, const char *filename, int flags, int mode) +__NR_fallocate 314 sys_fallocate (int fd, int mode, loff_t offset, loff_t len) +__NR_timerfd_settime 320 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) +__NR_signalfd4 322 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) +__NR_rt_tgsigqueueinfo 330 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) +__NR_fanotify_init 332 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) +__NR_fanotify_mark 333 sys_fanotify_mark (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) +__NR_open_by_handle_at 336 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) +__NR_setns 339 sys_setns (int fd, int nstype) +__NR_kcmp 343 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) +__NR_seccomp 348 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) +__NR_memfd_create 350 sys_memfd_create (const char *name, unsigned int flags) +__NR_io_setup 243 sys_io_setup (unsigned nr_events, aio_context_t *ctx_idp) +__NR_io_getevents 245 sys_io_getevents (aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout) +__NR_io_submit 246 sys_io_submit (aio_context_t ctx_id, long nr, struct iocb **iocbpp) +__NR_ipc 117 sys_ipc (unsigned int call, int first, unsigned long second, unsigned long third, const void *ptr, long fifth) +__NR_userfaultfd 355 sys_userfaultfd (int flags) +__NR_preadv 328 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, 
unsigned long pos_h) +__NR_gettimeofday 78 sys_gettimeofday (struct timeval *tv, struct timezone *tz) diff --git a/CRIU_code/compel/arch/s390/plugins/std/syscalls/syscalls-s390.c b/CRIU_code/compel/arch/s390/plugins/std/syscalls/syscalls-s390.c new file mode 100644 index 0000000..2b35cca --- /dev/null +++ b/CRIU_code/compel/arch/s390/plugins/std/syscalls/syscalls-s390.c @@ -0,0 +1,26 @@ +#include "asm/infect-types.h" + +/* + * Define prototype because of compile error if we include uapi/std/syscall.h + */ +long sys_old_mmap (struct mmap_arg_struct *); + +/* + * On s390 we have defined __ARCH_WANT_SYS_OLD_MMAP - Therefore implement + * system call with one parameter "mmap_arg_struct". + */ +unsigned long sys_mmap(void *addr, unsigned long len, unsigned long prot, + unsigned long flags, unsigned long fd, + unsigned long offset) +{ + struct mmap_arg_struct arg_struct; + + arg_struct.addr = (unsigned long)addr; + arg_struct.len = len; + arg_struct.prot = prot; + arg_struct.flags = flags; + arg_struct.fd = fd; + arg_struct.offset = offset; + + return sys_old_mmap(&arg_struct); +} diff --git a/CRIU_code/compel/arch/s390/scripts/compel-pack.lds.S b/CRIU_code/compel/arch/s390/scripts/compel-pack.lds.S new file mode 100644 index 0000000..91ffbda --- /dev/null +++ b/CRIU_code/compel/arch/s390/scripts/compel-pack.lds.S @@ -0,0 +1,40 @@ +OUTPUT_ARCH(s390:64-bit) +EXTERN(__export_parasite_head_start) + +SECTIONS +{ + .text : { + *(.head.text) + ASSERT(DEFINED(__export_parasite_head_start), + "Symbol __export_parasite_head_start is missing"); + *(.text*) + *(.compel.exit) + *(.compel.init) + } + + .data : { + *(.data*) + *(.bss*) + } + + .rodata : { + *(.rodata*) + *(.got*) + } + + .toc : ALIGN(8) { + *(.toc*) + } + + /DISCARD/ : { + *(.debug*) + *(.comment*) + *(.note*) + *(.group*) + *(.eh_frame*) + } + +/* Parasite args should have 4 bytes align, as we have futex inside. */ +. 
= ALIGN(4); +__export_parasite_args = .; +} diff --git a/CRIU_code/compel/arch/s390/src/lib/cpu.c b/CRIU_code/compel/arch/s390/src/lib/cpu.c new file mode 100644 index 0000000..5d86bf2 --- /dev/null +++ b/CRIU_code/compel/arch/s390/src/lib/cpu.c @@ -0,0 +1,78 @@ +#include + +#include +#include + +#include "compel-cpu.h" +#include "common/bitops.h" +#include "common/compiler.h" + +#include "log.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +static compel_cpuinfo_t rt_info; +static bool rt_info_done = false; + +static void fetch_rt_cpuinfo(void) +{ + if (!rt_info_done) { + compel_cpuid(&rt_info); + rt_info_done = true; + } +} + +void compel_set_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) { } +void compel_clear_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) { } +int compel_test_fpu_cap(compel_cpuinfo_t *info, unsigned int feature) { return 0; } +int compel_test_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) { return 0; } + +int compel_cpuid(compel_cpuinfo_t *info) +{ + info->hwcap[0] = getauxval(AT_HWCAP); + info->hwcap[1] = getauxval(AT_HWCAP2); + + if (!info->hwcap[0]) { + pr_err("Can't read the hardware capabilities\n"); + return -1; + } + + return 0; +} + +bool compel_cpu_has_feature(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return compel_test_cpu_cap(&rt_info, feature); +} + +bool compel_fpu_has_feature(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return compel_test_fpu_cap(&rt_info, feature); +} + +uint32_t compel_fpu_feature_offset(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return 0; +} + +uint32_t compel_fpu_feature_size(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return 0; +} + +void compel_cpu_clear_feature(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return compel_clear_cpu_cap(&rt_info, feature); +} + +void compel_cpu_copy_cpuinfo(compel_cpuinfo_t *c) +{ + fetch_rt_cpuinfo(); + memcpy(c, &rt_info, sizeof(rt_info)); +} diff --git a/CRIU_code/compel/arch/s390/src/lib/handle-elf-host.c 
b/CRIU_code/compel/arch/s390/src/lib/handle-elf-host.c new file mode 100644 index 0000000..fe46118 --- /dev/null +++ b/CRIU_code/compel/arch/s390/src/lib/handle-elf-host.c @@ -0,0 +1 @@ +handle-elf.c \ No newline at end of file diff --git a/CRIU_code/compel/arch/s390/src/lib/handle-elf.c b/CRIU_code/compel/arch/s390/src/lib/handle-elf.c new file mode 100644 index 0000000..01a8bf4 --- /dev/null +++ b/CRIU_code/compel/arch/s390/src/lib/handle-elf.c @@ -0,0 +1,22 @@ +#include + +#include "uapi/compel.h" + +#include "handle-elf.h" +#include "piegen.h" +#include "log.h" + +static const unsigned char __maybe_unused +elf_ident_64[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x02, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +int handle_binary(void *mem, size_t size) +{ + if (memcmp(mem, elf_ident_64, sizeof(elf_ident_64)) == 0) + return handle_elf_s390(mem, size); + + pr_err("Unsupported Elf format detected\n"); + return -EINVAL; +} diff --git a/CRIU_code/compel/arch/s390/src/lib/include/handle-elf.h b/CRIU_code/compel/arch/s390/src/lib/include/handle-elf.h new file mode 100644 index 0000000..cd13574 --- /dev/null +++ b/CRIU_code/compel/arch/s390/src/lib/include/handle-elf.h @@ -0,0 +1,13 @@ +#ifndef COMPEL_HANDLE_ELF_H__ +#define COMPEL_HANDLE_ELF_H__ + +#include "elf64-types.h" + +#define ELF_S390 + +#define __handle_elf handle_elf_s390 +#define arch_is_machine_supported(e_machine) (e_machine == EM_S390) + +int handle_elf_s390(void *mem, size_t size); + +#endif /* COMPEL_HANDLE_ELF_H__ */ diff --git a/CRIU_code/compel/arch/s390/src/lib/include/syscall.h b/CRIU_code/compel/arch/s390/src/lib/include/syscall.h new file mode 100644 index 0000000..57d4912 --- /dev/null +++ b/CRIU_code/compel/arch/s390/src/lib/include/syscall.h @@ -0,0 +1,8 @@ +#ifndef __COMPEL_SYSCALL_H__ +#define __COMPEL_SYSCALL_H__ + +unsigned long sys_mmap(void *addr, unsigned long len, unsigned long prot, + unsigned long flags, unsigned long fd, + unsigned long offset); + 
+#endif diff --git a/CRIU_code/compel/arch/s390/src/lib/include/uapi/asm/breakpoints.h b/CRIU_code/compel/arch/s390/src/lib/include/uapi/asm/breakpoints.h new file mode 100644 index 0000000..5f09049 --- /dev/null +++ b/CRIU_code/compel/arch/s390/src/lib/include/uapi/asm/breakpoints.h @@ -0,0 +1,15 @@ +#ifndef __COMPEL_BREAKPOINTS_H__ +#define __COMPEL_BREAKPOINTS_H__ +#define ARCH_SI_TRAP TRAP_BRKPT + +static inline int ptrace_set_breakpoint(pid_t pid, void *addr) +{ + return 0; +} + +static inline int ptrace_flush_breakpoints(pid_t pid) +{ + return 0; +} + +#endif diff --git a/CRIU_code/compel/arch/s390/src/lib/include/uapi/asm/cpu.h b/CRIU_code/compel/arch/s390/src/lib/include/uapi/asm/cpu.h new file mode 100644 index 0000000..b01db51 --- /dev/null +++ b/CRIU_code/compel/arch/s390/src/lib/include/uapi/asm/cpu.h @@ -0,0 +1,10 @@ +#ifndef UAPI_COMPEL_ASM_CPU_H__ +#define UAPI_COMPEL_ASM_CPU_H__ + +#include + +typedef struct { + uint64_t hwcap[2]; +} compel_cpuinfo_t; + +#endif /* __CR_ASM_CPU_H__ */ diff --git a/CRIU_code/compel/arch/s390/src/lib/include/uapi/asm/fpu.h b/CRIU_code/compel/arch/s390/src/lib/include/uapi/asm/fpu.h new file mode 100644 index 0000000..49c9078 --- /dev/null +++ b/CRIU_code/compel/arch/s390/src/lib/include/uapi/asm/fpu.h @@ -0,0 +1,14 @@ +#ifndef __CR_ASM_FPU_H__ +#define __CR_ASM_FPU_H__ + +#include +#include + +/* + * This one is used in restorer + */ +typedef struct { + bool has_fpu; +} fpu_state_t; + +#endif /* __CR_ASM_FPU_H__ */ diff --git a/CRIU_code/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h b/CRIU_code/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h new file mode 100644 index 0000000..fddf65d --- /dev/null +++ b/CRIU_code/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h @@ -0,0 +1,87 @@ +#ifndef UAPI_COMPEL_ASM_TYPES_H__ +#define UAPI_COMPEL_ASM_TYPES_H__ + +#include +#include +#include +#include +#include "common/page.h" + +#define SIGMAX 64 +#define SIGMAX_OLD 31 + +/* + * Definitions from 
/usr/include/asm/ptrace.h: + * + * typedef struct + * { + * __u32 fpc; + * freg_t fprs[NUM_FPRS]; + * } s390_fp_regs; + * + * typedef struct + * { + * psw_t psw; + * unsigned long gprs[NUM_GPRS]; + * unsigned int acrs[NUM_ACRS]; + * unsigned long orig_gpr2; + * } s390_regs; + */ +typedef struct { + uint64_t part1; + uint64_t part2; +} vector128_t; + +struct prfpreg { + uint32_t fpc; + uint64_t fprs[16]; +}; + +#define USER_FPREGS_VXRS 0x000000001 +/* Guarded-storage control block */ +#define USER_GS_CB 0x000000002 +/* Guarded-storage broadcast control block */ +#define USER_GS_BC 0x000000004 +/* Runtime-instrumentation control block */ +#define USER_RI_CB 0x000000008 +/* Runtime-instrumentation bit set */ +#define USER_RI_ON 0x000000010 + +typedef struct { + uint32_t flags; + struct prfpreg prfpreg; + uint64_t vxrs_low[16]; + vector128_t vxrs_high[16]; + uint64_t gs_cb[4]; + uint64_t gs_bc[4]; + uint64_t ri_cb[8]; +} user_fpregs_struct_t; + +typedef struct { + s390_regs prstatus; + uint32_t system_call; +} user_regs_struct_t; + +#define REG_RES(r) ((uint64_t)(r).prstatus.gprs[2]) +#define REG_IP(r) ((uint64_t)(r).prstatus.psw.addr) +#define REG_SP(r) ((uint64_t)(r).prstatus.gprs[15]) +/* + * We assume that REG_SYSCALL_NR() is only used for pie code where we + * always use svc 0 with opcode in %r1. 
+ */ +#define REG_SYSCALL_NR(r) ((uint64_t)(r).prstatus.gprs[1]) + +#define user_regs_native(pregs) true + +#define __NR(syscall, compat) __NR_##syscall + +struct mmap_arg_struct { + unsigned long addr; + unsigned long len; + unsigned long prot; + unsigned long flags; + unsigned long fd; + unsigned long offset; +}; + +#endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/CRIU_code/compel/arch/s390/src/lib/include/uapi/asm/sigframe.h b/CRIU_code/compel/arch/s390/src/lib/include/uapi/asm/sigframe.h new file mode 100644 index 0000000..b6b8944 --- /dev/null +++ b/CRIU_code/compel/arch/s390/src/lib/include/uapi/asm/sigframe.h @@ -0,0 +1,80 @@ + +#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ +#define UAPI_COMPEL_ASM_SIGFRAME_H__ + +#include +#include + +#include +#include + +// XXX: the identifier rt_sigcontext is expected to be struct by the CRIU code +#define rt_sigcontext sigcontext + +#include + +#define RT_SIGFRAME_OFFSET(rt_sigframe) 0 + +/* + * From /usr/include/asm/sigcontext.h + * + * Redefine _sigregs_ext to be able to compile on older systems + */ +#ifndef __NUM_VXRS_LOW +typedef struct { + __u32 u[4]; +} __vector128; + +typedef struct { + unsigned long long vxrs_low[16]; + __vector128 vxrs_high[16]; + unsigned char __reserved[128]; +} _sigregs_ext; +#endif + +/* + * From /usr/include/uapi/asm/ucontext.h + */ +struct ucontext_extended { + unsigned long uc_flags; + ucontext_t *uc_link; + stack_t uc_stack; + _sigregs uc_mcontext; + sigset_t uc_sigmask; + /* Allow for uc_sigmask growth. Glibc uses a 1024-bit sigset_t. 
*/ + unsigned char __unused[128 - sizeof(sigset_t)]; + _sigregs_ext uc_mcontext_ext; +}; + +/* + * Signal stack frame for RT sigreturn + */ +struct rt_sigframe { + uint8_t callee_used_stack[160]; + uint8_t retcode[2]; + siginfo_t info; + struct ucontext_extended uc; +}; + +/* + * Do rt_sigreturn SVC + */ +#define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ + asm volatile( \ + "lgr %%r15,%0\n" \ + "lghi %%r1,173\n" \ + "svc 0\n" \ + : \ + : "d" (new_sp) \ + : "15", "memory") + +#define RT_SIGFRAME_UC(rt_sigframe) (&rt_sigframe->uc) +#define RT_SIGFRAME_REGIP(rt_sigframe) (rt_sigframe)->uc.uc_mcontext.regs.psw.addr +#define RT_SIGFRAME_HAS_FPU(rt_sigframe) (1) + +#define rt_sigframe_erase_sigset(sigframe) \ + memset(&sigframe->uc.uc_sigmask, 0, sizeof(k_rtsigset_t)) +#define rt_sigframe_copy_sigset(sigframe, from) \ + memcpy(&sigframe->uc.uc_sigmask, from, sizeof(k_rtsigset_t)) + +#endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ diff --git a/CRIU_code/compel/arch/s390/src/lib/infect.c b/CRIU_code/compel/arch/s390/src/lib/infect.c new file mode 100644 index 0000000..00e9c36 --- /dev/null +++ b/CRIU_code/compel/arch/s390/src/lib/infect.c @@ -0,0 +1,715 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "uapi/compel/asm/infect-types.h" +#include "errno.h" +#include "log.h" +#include "common/bug.h" +#include "infect.h" +#include "ptrace.h" +#include "infect-priv.h" + +#define NT_PRFPREG 2 +#define NT_S390_VXRS_LOW 0x309 +#define NT_S390_VXRS_HIGH 0x30a +#define NT_S390_GS_CB 0x30b +#define NT_S390_GS_BC 0x30c +#define NT_S390_RI_CB 0x30d + +/* + * Print general purpose and access registers + */ +static void print_user_regs_struct(const char *msg, int pid, + user_regs_struct_t *regs) +{ + int i; + + pr_debug("%s: Registers for pid=%d\n", msg, pid); + pr_debug("system_call %08lx\n", (unsigned long) regs->system_call); + pr_debug(" psw %016lx %016lx\n", regs->prstatus.psw.mask, + regs->prstatus.psw.addr); + pr_debug(" 
orig_gpr2 %016lx\n", regs->prstatus.orig_gpr2); + for (i = 0; i < 16; i++) + pr_debug(" g%02d %016lx\n", i, regs->prstatus.gprs[i]); + for (i = 0; i < 16; i++) + pr_debug(" a%02d %08x\n", i, regs->prstatus.acrs[i]); +} + +/* + * Print vector registers + */ +static void print_vxrs(user_fpregs_struct_t *fpregs) +{ + int i; + + if (!(fpregs->flags & USER_FPREGS_VXRS)) { + pr_debug(" No VXRS\n"); + return; + } + for (i = 0; i < 16; i++) + pr_debug(" vx_low%02d %016lx\n", i, fpregs->vxrs_low[i]); + for (i = 0; i < 16; i++) + pr_debug(" vx_high%02d %016lx %016lx\n", i, + fpregs->vxrs_high[i].part1, + fpregs->vxrs_high[i].part2); +} + +/* + * Print guarded-storage control block + */ +static void print_gs_cb(user_fpregs_struct_t *fpregs) +{ + int i; + + if (!(fpregs->flags & USER_GS_CB)) { + pr_debug(" No GS_CB\n"); + return; + } + for (i = 0; i < 4; i++) + pr_debug(" gs_cb%02d %016lx\n", i, fpregs->gs_cb[i]); +} + +/* + * Print guarded-storage broadcast control block + */ +static void print_gs_bc(user_fpregs_struct_t *fpregs) +{ + int i; + + if (!(fpregs->flags & USER_GS_BC)) { + pr_debug(" No GS_BC\n"); + return; + } + for (i = 0; i < 4; i++) + pr_debug(" gs_bc%02d %016lx\n", i, fpregs->gs_bc[i]); +} + +/* + * Print runtime-instrumentation control block + */ +static void print_ri_cb(user_fpregs_struct_t *fpregs) +{ + int i; + + if (!(fpregs->flags & USER_RI_CB)) { + pr_debug(" No RI_CB\n"); + return; + } + for (i = 0; i < 8; i++) + pr_debug(" ri_cb%02d %016lx\n", i, fpregs->ri_cb[i]); +} + +/* + * Print FP registers, VX registers, guarded-storage, and + * runtime-instrumentation + */ +static void print_user_fpregs_struct(const char *msg, int pid, + user_fpregs_struct_t *fpregs) +{ + int i; + + pr_debug("%s: FP registers for pid=%d\n", msg, pid); + pr_debug(" fpc %08x\n", fpregs->prfpreg.fpc); + for (i = 0; i < 16; i++) + pr_debug(" f%02d %016lx\n", i, fpregs->prfpreg.fprs[i]); + print_vxrs(fpregs); + print_gs_cb(fpregs); + print_gs_bc(fpregs); + print_ri_cb(fpregs); +} + 
+int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, + user_regs_struct_t *regs, + user_fpregs_struct_t *fpregs) +{ + _sigregs_ext *dst_ext = &sigframe->uc.uc_mcontext_ext; + _sigregs *dst = &sigframe->uc.uc_mcontext; + + memcpy(dst->regs.gprs, regs->prstatus.gprs, + sizeof(regs->prstatus.gprs)); + memcpy(dst->regs.acrs, regs->prstatus.acrs, + sizeof(regs->prstatus.acrs)); + memcpy(&dst->regs.psw, &regs->prstatus.psw, + sizeof(regs->prstatus.psw)); + memcpy(&dst->fpregs.fpc, &fpregs->prfpreg.fpc, + sizeof(fpregs->prfpreg.fpc)); + memcpy(&dst->fpregs.fprs, &fpregs->prfpreg.fprs, + sizeof(fpregs->prfpreg.fprs)); + if (fpregs->flags & USER_FPREGS_VXRS) { + memcpy(&dst_ext->vxrs_low, &fpregs->vxrs_low, + sizeof(fpregs->vxrs_low)); + memcpy(&dst_ext->vxrs_high, &fpregs->vxrs_high, + sizeof(fpregs->vxrs_high)); + } else { + memset(&dst_ext->vxrs_low, 0, sizeof(dst_ext->vxrs_low)); + memset(&dst_ext->vxrs_high, 0, sizeof(dst_ext->vxrs_high)); + } + return 0; +} + +int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, + struct rt_sigframe *rsigframe) +{ + return 0; +} + +/* + * Rewind the psw for 'bytes' bytes + */ +static inline void rewind_psw(psw_t *psw, unsigned long bytes) +{ + unsigned long mask; + + pr_debug("Rewind psw: %016lx bytes=%lu\n", psw->addr, bytes); + mask = (psw->mask & PSW_MASK_EA) ? -1UL : + (psw->mask & PSW_MASK_BA) ? (1UL << 31) - 1 : + (1UL << 24) - 1; + psw->addr = (psw->addr - bytes) & mask; +} + +/* + * Get vector registers + */ +int get_vx_regs(pid_t pid, user_fpregs_struct_t *fpregs) +{ + struct iovec iov; + + fpregs->flags &= ~USER_FPREGS_VXRS; + iov.iov_base = &fpregs->vxrs_low; + iov.iov_len = sizeof(fpregs->vxrs_low); + if (ptrace(PTRACE_GETREGSET, pid, NT_S390_VXRS_LOW, &iov) < 0) { + /* + * If the kernel does not support vector registers, we get + * EINVAL. With kernel support and old hardware, we get ENODEV.
+ */ + if (errno == EINVAL || errno == ENODEV) { + memset(fpregs->vxrs_low, 0, sizeof(fpregs->vxrs_low)); + memset(fpregs->vxrs_high, 0, sizeof(fpregs->vxrs_high)); + pr_debug("VXRS registers not supported\n"); + return 0; + } + pr_perror("Couldn't get VXRS_LOW\n"); + return -1; + } + iov.iov_base = &fpregs->vxrs_high; + iov.iov_len = sizeof(fpregs->vxrs_high); + if (ptrace(PTRACE_GETREGSET, pid, NT_S390_VXRS_HIGH, &iov) < 0) { + pr_perror("Couldn't get VXRS_HIGH\n"); + return -1; + } + fpregs->flags |= USER_FPREGS_VXRS; + return 0; +} + +/* + * Get guarded-storage control block + */ +int get_gs_cb(pid_t pid, user_fpregs_struct_t *fpregs) +{ + struct iovec iov; + + fpregs->flags &= ~(USER_GS_CB | USER_GS_BC); + iov.iov_base = &fpregs->gs_cb; + iov.iov_len = sizeof(fpregs->gs_cb); + if (ptrace(PTRACE_GETREGSET, pid, NT_S390_GS_CB, &iov) < 0) { + switch (errno) { + case EINVAL: + case ENODEV: + memset(&fpregs->gs_cb, 0, sizeof(fpregs->gs_cb)); + memset(&fpregs->gs_bc, 0, sizeof(fpregs->gs_bc)); + pr_debug("GS_CB not supported\n"); + return 0; + case ENODATA: + pr_debug("GS_CB not set\n"); + break; + default: + return -1; + } + } else { + fpregs->flags |= USER_GS_CB; + } + iov.iov_base = &fpregs->gs_bc; + iov.iov_len = sizeof(fpregs->gs_bc); + if (ptrace(PTRACE_GETREGSET, pid, NT_S390_GS_BC, &iov) < 0) { + if (errno == ENODATA) { + pr_debug("GS_BC not set\n"); + return 0; + } + pr_perror("Couldn't get GS_BC\n"); + return -1; + } + fpregs->flags |= USER_GS_BC; + + return 0; +} + +/* + * Get runtime-instrumentation control block + */ +int get_ri_cb(pid_t pid, user_fpregs_struct_t *fpregs) +{ + user_regs_struct_t regs; + struct iovec iov; + psw_t *psw; + + fpregs->flags &= ~(USER_RI_CB | USER_RI_ON); + iov.iov_base = &fpregs->ri_cb; + iov.iov_len = sizeof(fpregs->ri_cb); + if (ptrace(PTRACE_GETREGSET, pid, NT_S390_RI_CB, &iov) < 0) { + switch (errno) { + case EINVAL: + case ENODEV: + memset(&fpregs->ri_cb, 0, sizeof(fpregs->ri_cb)); + pr_debug("RI_CB not supported\n"); + 
return 0; + case ENODATA: + pr_debug("RI_CB not set\n"); + return 0; + default: + pr_perror("Couldn't get RI_CB\n"); + return -1; + } + } + fpregs->flags |= USER_RI_CB; + + /* Get PSW and check if runtime-instrumentation bit is enabled */ + iov.iov_base = &regs.prstatus; + iov.iov_len = sizeof(regs.prstatus); + if (ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov) < 0) + return -1; + psw = &regs.prstatus.psw; + if (psw->mask & PSW_MASK_RI) + fpregs->flags |= USER_RI_ON; + + return 0; +} + +/* + * Disable runtime-instrumentation bit + */ +static int s390_disable_ri_bit(pid_t pid, user_regs_struct_t *regs) +{ + struct iovec iov; + psw_t *psw; + + iov.iov_base = &regs->prstatus; + iov.iov_len = sizeof(regs->prstatus); + psw = &regs->prstatus.psw; + psw->mask &= ~PSW_MASK_RI; + return ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov); +} + +/* + * Prepare task registers for restart + */ +int get_task_regs(pid_t pid, user_regs_struct_t *regs, save_regs_t save, + void *arg, __maybe_unused unsigned long flags) +{ + user_fpregs_struct_t fpregs; + struct iovec iov; + int rewind; + + print_user_regs_struct("get_task_regs", pid, regs); + + memset(&fpregs, 0, sizeof(fpregs)); + iov.iov_base = &fpregs.prfpreg; + iov.iov_len = sizeof(fpregs.prfpreg); + if (ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov) < 0) { + pr_perror("Couldn't get floating-point registers"); + return -1; + } + if (get_vx_regs(pid, &fpregs)) { + pr_perror("Couldn't get vector registers"); + return -1; + } + if (get_gs_cb(pid, &fpregs)) { + pr_perror("Couldn't get guarded-storage"); + return -1; + } + if (get_ri_cb(pid, &fpregs)) { + pr_perror("Couldn't get runtime-instrumentation"); + return -1; + } + /* + * If the runtime-instrumentation bit is set, we have to disable it + * before we execute parasite code. Otherwise parasite operations + * would be recorded.
+ */ + if (fpregs.flags & USER_RI_ON) + s390_disable_ri_bit(pid, regs); + + print_user_fpregs_struct("get_task_regs", pid, &fpregs); + /* Check for system call restarting. */ + if (regs->system_call) { + rewind = regs->system_call >> 16; + /* see arch/s390/kernel/signal.c: do_signal() */ + switch ((long)regs->prstatus.gprs[2]) { + case -ERESTARTNOHAND: + case -ERESTARTSYS: + case -ERESTARTNOINTR: + regs->prstatus.gprs[2] = regs->prstatus.orig_gpr2; + rewind_psw(&regs->prstatus.psw, rewind); + pr_debug("New gpr2: %016lx\n", regs->prstatus.gprs[2]); + break; + case -ERESTART_RESTARTBLOCK: + pr_warn("Will restore %d with interrupted system call\n", pid); + regs->prstatus.gprs[2] = -EINTR; + break; + } + } + /* Call save_task_regs() */ + return save(arg, regs, &fpregs); +} + +/* + * Injected syscall instruction + */ +const char code_syscall[] = { + 0x0a, 0x00, /* svc 0 */ + 0x00, 0x01, /* S390_BREAKPOINT_U16 */ + 0x00, 0x01, /* S390_BREAKPOINT_U16 */ + 0x00, 0x01, /* S390_BREAKPOINT_U16 */ +}; + +static inline void __check_code_syscall(void) +{ + BUILD_BUG_ON(sizeof(code_syscall) != BUILTIN_SYSCALL_SIZE); + BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); +} + +/* + * Issue s390 system call + */ +int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, + unsigned long arg1, + unsigned long arg2, + unsigned long arg3, + unsigned long arg4, + unsigned long arg5, + unsigned long arg6) +{ + user_regs_struct_t regs = ctl->orig.regs; + int err; + + /* Load syscall number into %r1 */ + regs.prstatus.gprs[1] = (unsigned long) nr; + /* Load parameter registers %r2-%r7 */ + regs.prstatus.gprs[2] = arg1; + regs.prstatus.gprs[3] = arg2; + regs.prstatus.gprs[4] = arg3; + regs.prstatus.gprs[5] = arg4; + regs.prstatus.gprs[6] = arg5; + regs.prstatus.gprs[7] = arg6; + + err = compel_execute_syscall(ctl, &regs, (char *) code_syscall); + + /* Return code from system call is in %r2 */ + if (ret) + *ret = regs.prstatus.gprs[2]; + return err; +} + +/* + * Issue s390 mmap call + */ +void
*remote_mmap(struct parasite_ctl *ctl, + void *addr, size_t length, int prot, + int flags, int fd, off_t offset) +{ + void *where = (void *)ctl->ictx.syscall_ip + BUILTIN_SYSCALL_SIZE; + struct mmap_arg_struct arg_struct; + pid_t pid = ctl->rpid; + long map = 0; + int err; + + /* Setup s390 mmap data */ + arg_struct.addr = (unsigned long)addr; + arg_struct.len = length; + arg_struct.prot = prot; + arg_struct.flags = flags; + arg_struct.fd = fd; + arg_struct.offset = offset; + + /* Move args to process */ + if (ptrace_swap_area(pid, where, &arg_struct, sizeof(arg_struct))) { + pr_err("Can't inject memfd args (pid: %d)\n", pid); + return NULL; + } + + /* Do syscall */ + err = compel_syscall(ctl, __NR_mmap, &map, (unsigned long) where, + 0, 0, 0, 0, 0); + if (err < 0 || (long)map < 0) + map = 0; + + /* Restore data */ + if (ptrace_poke_area(pid, &arg_struct, where, sizeof(arg_struct))) { + pr_err("Can't restore mmap args (pid: %d)\n", pid); + if (map != 0) { + compel_syscall(ctl, __NR_munmap, NULL, map, + length, 0, 0, 0, 0); + map = 0; + } + } + + return (void *)map; +} + +/* + * Setup registers for parasite call + */ +void parasite_setup_regs(unsigned long new_ip, void *stack, + user_regs_struct_t *regs) +{ + regs->prstatus.psw.addr = new_ip; + if (!stack) + return; + regs->prstatus.gprs[15] = ((unsigned long) stack) - + STACK_FRAME_OVERHEAD; +} + +/* + * Check if we have all kernel and CRIU features to dump the task + */ +bool arch_can_dump_task(struct parasite_ctl *ctl) +{ + user_fpregs_struct_t fpregs; + user_regs_struct_t regs; + pid_t pid = ctl->rpid; + char str[8]; + psw_t *psw; + + if (ptrace_get_regs(pid, &regs)) + return false; + psw = &regs.prstatus.psw; + /* Check if the kernel supports RI ptrace interface */ + if (psw->mask & PSW_MASK_RI) { + if (get_ri_cb(pid, &fpregs) < 0) { + pr_perror("Can't dump process with RI bit active"); + return false; + } + } + /* We don't support 24 and 31 bit mode - only 64 bit */ + if (psw->mask & PSW_MASK_EA) { + if (psw->mask &
PSW_MASK_BA) + return true; + else + sprintf(str, "??"); + } else { + if (psw->mask & PSW_MASK_BA) + sprintf(str, "31"); + else + sprintf(str, "24"); + } + pr_err("Pid %d is %s bit: Only 64 bit tasks are supported\n", pid, str); + return false; +} + +/* + * Return current alternate signal stack + */ +int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) +{ + long ret; + int err; + + err = compel_syscall(ctl, __NR_sigaltstack, + &ret, 0, (unsigned long)&s->uc.uc_stack, + 0, 0, 0, 0); + return err ? err : ret; +} + +/* + * Find last mapped address of current process + */ +static unsigned long max_mapped_addr(void) +{ + unsigned long addr_end, addr_max = 0; + char line[128]; + FILE *fp; + + fp = fopen("/proc/self/maps", "r"); + if (!fp) + goto out; + + /* Parse lines like: 3fff415f000-3fff4180000 rw-p 00000000 00:00 0 */ + while (fgets(line, sizeof(line), fp)) { + char *ptr; + /* First skip start address */ + strtoul(&line[0], &ptr, 16); + addr_end = strtoul(ptr + 1, NULL, 16); + addr_max = max(addr_max, addr_end); + } + fclose(fp); +out: + return addr_max - 1; +} + +/* + * Kernel task size level + * + * We have (dynamic) 4 level page tables for 64 bit since linux 2.6.25: + * + * 5a216a2083 ("[S390] Add four level page tables for CONFIG_64BIT=y.") + * 6252d702c5 ("[S390] dynamic page tables.") + * + * The code below is already prepared for future (dynamic) 5 level page tables. + * + * Besides that there is one problematic kernel bug that has been fixed for + * linux 4.11 by the following commit: + * + * ee71d16d22 ("s390/mm: make TASK_SIZE independent from the number + * of page table levels") + * + * A 64 bit process on s390x always starts with 3 levels and upgrades to 4 + * levels for mmap(> 4 TB) and to 5 levels for mmap(> 16 EB). + * + * Unfortunately before fix ee71d16d22 for a 3 level process munmap() + * and mremap() fail for addresses > 4 TB. 
CRIU uses the task size, + * to unmap() all memory from a starting point to task size to get rid of + * unwanted mappings. CRIU uses mremap() to establish the final mappings + * which also fails if we want to restore mappings > 4 TB and the initial + * restore process still runs with 3 levels. + * + * To support the current CRIU design on s390 we return task size = 4 TB when + * a kernel without fix ee71d16d22 is detected. In this case we can dump at + * least processes with < 4 TB which is the most likely case anyway. + * + * For kernels with fix ee71d16d22 we are fully functional. + */ +enum kernel_ts_level { + /* Kernel with 4 level page tables without fix ee71d16d22 */ + KERNEL_TS_LEVEL_4_FIX_NO, + /* Kernel with 4 level page tables with fix ee71d16d22 */ + KERNEL_TS_LEVEL_4_FIX_YES, + /* Kernel with 4 level page tables with or without fix ee71d16d22 */ + KERNEL_TS_LEVEL_4_FIX_UNKN, + /* Kernel with 5 level page tables */ + KERNEL_TS_LEVEL_5, +}; + +/* See arch/s390/include/asm/processor.h */ +#define TASK_SIZE_LEVEL_3 0x40000000000UL /* 4 TB */ +#define TASK_SIZE_LEVEL_4 0x20000000000000UL /* 8 PB */ +#define TASK_SIZE_LEVEL_5 0xffffffffffffefffUL /* 16 EB - 0x1000 */ + +/* + * Return detected kernel version regarding task size level + * + * We use unmap() to probe the maximum possible page table level of kernel + */ +static enum kernel_ts_level get_kernel_ts_level(void) +{ + unsigned long criu_end_addr = max_mapped_addr(); + + /* Check for 5 levels */ + if (criu_end_addr >= TASK_SIZE_LEVEL_4) + return KERNEL_TS_LEVEL_5; + else if (munmap((void *) TASK_SIZE_LEVEL_4, 0x1000) == 0) + return KERNEL_TS_LEVEL_5; + + if (criu_end_addr < TASK_SIZE_LEVEL_3) { + /* Check for 4 level kernel with fix */ + if (munmap((void *) TASK_SIZE_LEVEL_3, 0x1000) == 0) + return KERNEL_TS_LEVEL_4_FIX_YES; + else + return KERNEL_TS_LEVEL_4_FIX_NO; + } + /* We can't find out if kernel has the fix */ + return KERNEL_TS_LEVEL_4_FIX_UNKN; +} + +/* + * Log detected level + */ +static void 
pr_levels(const char *str) +{ + pr_debug("Max user page table levels (task size): %s\n", str); +} + +/* + * Return last address (+1) of biggest possible user address space for + * current kernel + */ +unsigned long compel_task_size(void) +{ + switch (get_kernel_ts_level()) { + case KERNEL_TS_LEVEL_4_FIX_NO: + pr_levels("KERNEL_TS_LEVEL_4_FIX_NO"); + return TASK_SIZE_LEVEL_3; + case KERNEL_TS_LEVEL_4_FIX_YES: + pr_levels("KERNEL_TS_LEVEL_4_FIX_YES"); + return TASK_SIZE_LEVEL_4; + case KERNEL_TS_LEVEL_4_FIX_UNKN: + pr_levels("KERNEL_TS_LEVEL_4_FIX_UNKN"); + return TASK_SIZE_LEVEL_3; + default: /* KERNEL_TS_LEVEL_5 */ + pr_levels("KERNEL_TS_LEVEL_5"); + return TASK_SIZE_LEVEL_5; + } +} + +/* + * Get task registers (overwrites weak function) + * + * We don't store floating point and vector registers here because we + * assume that compel/pie code does not change them. + * + * For verification, issue: + * + * $ objdump -S criu/pie/parasite.built-in.bin.o | grep "%f" + * $ objdump -S criu/pie/restorer.built-in.bin.o | grep "%f" + */ +int ptrace_get_regs(int pid, user_regs_struct_t *regs) +{ + struct iovec iov; + int rc; + + pr_debug("ptrace_get_regs: pid=%d\n", pid); + + iov.iov_base = &regs->prstatus; + iov.iov_len = sizeof(regs->prstatus); + rc = ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov); + if (rc != 0) + return rc; + + iov.iov_base = &regs->system_call; + iov.iov_len = sizeof(regs->system_call); + return ptrace(PTRACE_GETREGSET, pid, NT_S390_SYSTEM_CALL, &iov); +} + +/* + * Set task registers (overwrites weak function) + */ +int ptrace_set_regs(int pid, user_regs_struct_t *regs) +{ + uint32_t system_call = 0; + struct iovec iov; + int rc; + + pr_debug("ptrace_set_regs: pid=%d\n", pid); + + iov.iov_base = &regs->prstatus; + iov.iov_len = sizeof(regs->prstatus); + rc = ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov); + if (rc) + return rc; + + /* + * If we attached to an inferior that is sleeping in a restarting + * system call like futex_wait(), we have to reset the 
system_call + * to 0. Otherwise the kernel would try to finish the interrupted + * system call after PTRACE_CONT and we could not run the + * parasite code. + */ + iov.iov_base = &system_call; + iov.iov_len = sizeof(system_call); + return ptrace(PTRACE_SETREGSET, pid, NT_S390_SYSTEM_CALL, &iov); +} diff --git a/CRIU_code/compel/arch/x86/plugins/include/asm/prologue.h b/CRIU_code/compel/arch/x86/plugins/include/asm/prologue.h new file mode 100644 index 0000000..9d812ee --- /dev/null +++ b/CRIU_code/compel/arch/x86/plugins/include/asm/prologue.h @@ -0,0 +1,36 @@ +#ifndef __ASM_PROLOGUE_H__ +#define __ASM_PROLOGUE_H__ + +#ifndef __ASSEMBLY__ + +#include +#include +#include + +#include + +#define sys_recv(sockfd, ubuf, size, flags) \ + sys_recvfrom(sockfd, ubuf, size, flags, NULL, NULL) + +typedef struct prologue_init_args { + struct sockaddr_un ctl_sock_addr; + unsigned int ctl_sock_addr_len; + + unsigned int arg_s; + void *arg_p; + + void *sigframe; +} prologue_init_args_t; + +#endif /* __ASSEMBLY__ */ + +/* + * Reserve enough space for sigframe. + * + * FIXME: this should rather be taken from the sigframe header. 
+ */ +#define PROLOGUE_SGFRAME_SIZE 4096 + +#define PROLOGUE_INIT_ARGS_SIZE 1024 + +#endif /* __ASM_PROLOGUE_H__ */ diff --git a/CRIU_code/compel/arch/x86/plugins/include/asm/syscall-types.h b/CRIU_code/compel/arch/x86/plugins/include/asm/syscall-types.h new file mode 100644 index 0000000..9874fd0 --- /dev/null +++ b/CRIU_code/compel/arch/x86/plugins/include/asm/syscall-types.h @@ -0,0 +1,60 @@ +#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ +#define COMPEL_ARCH_SYSCALL_TYPES_H__ + +/* Types for sigaction, sigprocmask syscalls */ +typedef void rt_signalfn_t(int, siginfo_t *, void *); +typedef rt_signalfn_t *rt_sighandler_t; + +typedef void rt_restorefn_t(void); +typedef rt_restorefn_t *rt_sigrestore_t; + +#define SA_RESTORER 0x04000000 + +#define _KNSIG 64 +#define _NSIG_BPW 64 + +#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) + +/* + * Note: as k_rtsigset_t is the same size for 32-bit and 64-bit, + * sig defined as uint64_t rather than (unsigned long) - for the + * purpose if we ever going to support native 32-bit compilation. + */ +typedef struct { + uint64_t sig[_KNSIG_WORDS]; +} k_rtsigset_t; + +typedef struct { + rt_sighandler_t rt_sa_handler; + unsigned long rt_sa_flags; + rt_sigrestore_t rt_sa_restorer; + k_rtsigset_t rt_sa_mask; +} rt_sigaction_t; + +/* + * Note: there is unaligned access on x86_64 and it's fine. + * However, when porting this code -- keep in mind about possible issues + * with unaligned rt_sa_mask. 
+ */ +typedef struct __attribute__((packed)) { + unsigned int rt_sa_handler; + unsigned int rt_sa_flags; + unsigned int rt_sa_restorer; + k_rtsigset_t rt_sa_mask; +} rt_sigaction_t_compat; + +/* Types for set_thread_area, get_thread_area syscalls */ +typedef struct { + unsigned int entry_number; + unsigned int base_addr; + unsigned int limit; + unsigned int seg_32bit:1; + unsigned int contents:2; + unsigned int read_exec_only:1; + unsigned int limit_in_pages:1; + unsigned int seg_not_present:1; + unsigned int useable:1; + unsigned int lm:1; +} user_desc_t; + +#endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ diff --git a/CRIU_code/compel/arch/x86/plugins/include/features.h b/CRIU_code/compel/arch/x86/plugins/include/features.h new file mode 100644 index 0000000..0f35725 --- /dev/null +++ b/CRIU_code/compel/arch/x86/plugins/include/features.h @@ -0,0 +1,6 @@ +#ifndef __COMPEL_ARCH_FEATURES_H +#define __COMPEL_ARCH_FEATURES_H + +#define ARCH_HAS_MEMCPY + +#endif /* __COMPEL_ARCH_FEATURES_H */ diff --git a/CRIU_code/compel/arch/x86/plugins/std/memcpy.S b/CRIU_code/compel/arch/x86/plugins/std/memcpy.S new file mode 100644 index 0000000..2496cb9 --- /dev/null +++ b/CRIU_code/compel/arch/x86/plugins/std/memcpy.S @@ -0,0 +1,28 @@ +#include "common/asm/linkage.h" + +/* The following code is taken from Linux kernel (arch/x86/lib/memcpy_64.S). + * There are 3 implementations in there, we use the one that relies on + * X86_FEATURE_REP_GOOD ("rep microcode works well"). + */ + +/* + * memcpy - Copy a memory block. 
+ * + * Input: + * rdi destination + * rsi source + * rdx count + * + * Output: + * rax original destination + */ +ENTRY(memcpy) + movq %rdi, %rax + movq %rdx, %rcx + shrq $3, %rcx + andl $7, %edx + rep movsq + movl %edx, %ecx + rep movsb + ret +END(memcpy) diff --git a/CRIU_code/compel/arch/x86/plugins/std/parasite-head.S b/CRIU_code/compel/arch/x86/plugins/std/parasite-head.S new file mode 100644 index 0000000..a988de9 --- /dev/null +++ b/CRIU_code/compel/arch/x86/plugins/std/parasite-head.S @@ -0,0 +1,52 @@ +#include "common/asm/linkage.h" + + .section .head.text, "ax" + +#ifndef CONFIG_X86_64 +# error 64-bit parasite should compile with CONFIG_X86_64 +#endif + +.macro PARASITE_ENTRY num + subq $16, %rsp + andq $~15, %rsp + pushq $\num + movq %rsp, %rbp + movl __export_parasite_cmd(%rip), %edi + leaq __export_parasite_args(%rip), %rsi + call parasite_service +.endm + +#ifdef CONFIG_COMPAT +.code32 +ENTRY(__export_parasite_head_start_compat) + /* A long jump to 64-bit parasite. */ + jmp $__USER_CS,$1f +1: +.code64 + PARASITE_ENTRY 0 + pushq $__USER32_CS + pushq $2f + lretq +2: +.code32 + /* + * parasite_service() can run commands in non-daemon mode + * with parasite_trap_cmd(): it waits that after return there + * is a software break. + * compel_run_in_thread() uses this and after hitting the break, + * it restores register set - that's the reason, why we should + * stop in 32-bit mode for compat tasks here. 
+ */ + int $0x03 +END(__export_parasite_head_start_compat) +.code64 +#endif + +ENTRY(__export_parasite_head_start) + PARASITE_ENTRY 0 + int $0x03 +END(__export_parasite_head_start) + +.align 8 +GLOBAL(__export_parasite_cmd) + .long 0 diff --git a/CRIU_code/compel/arch/x86/plugins/std/prologue.S b/CRIU_code/compel/arch/x86/plugins/std/prologue.S new file mode 100644 index 0000000..79ad1f6 --- /dev/null +++ b/CRIU_code/compel/arch/x86/plugins/std/prologue.S @@ -0,0 +1,33 @@ +#include "common/asm/linkage.h" +#include "asm/prologue.h" + +#include "uapi/std/syscall-codes.h" + + .section .compel.prologue.text, "ax" +ENTRY(__export_std_prologue_start) + push %rsp + + leaq __export_std_prologue_init_args(%rip), %rdi + movq __export_std_plugin_begin(%rip), %rsi + movq __export_std_plugin_size(%rip), %rdx + call __export_std_compel_start + +do_rt_sigreturn: + leaq __export_std_prologue_sigframe(%rip), %rax + addq $8, %rax + movq %rax, %rsp # we can't use sys_rt_sigreturn here + mov $__NR_rt_sigreturn, %eax # because we're adjusting stack + syscall + +GLOBAL(__export_std_prologue_init_args) + .space PROLOGUE_INIT_ARGS_SIZE, 0 + +GLOBAL(__export_std_plugin_begin) + .space 8, 0 +GLOBAL(__export_std_plugin_size) + .space 8, 0 + + .align 64 +GLOBAL(__export_std_prologue_sigframe) + .space PROLOGUE_SGFRAME_SIZE, 0 +END(__export_std_prologue_start) diff --git a/CRIU_code/compel/arch/x86/plugins/std/syscalls/Makefile.syscalls b/CRIU_code/compel/arch/x86/plugins/std/syscalls/Makefile.syscalls new file mode 100644 index 0000000..4ba4b56 --- /dev/null +++ b/CRIU_code/compel/arch/x86/plugins/std/syscalls/Makefile.syscalls @@ -0,0 +1,122 @@ +std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/syscalls-64.o + +sys-proto-types := $(obj)/include/uapi/std/syscall-types.h +sys-proto-generic := $(obj)/include/uapi/std/syscall.h +sys-codes-generic := $(obj)/include/uapi/std/syscall-codes.h +sys-codes = $(obj)/include/uapi/std/syscall-codes-$(1).h +sys-proto = $(obj)/include/uapi/std/syscall-$(1).h +sys-def = 
$(PLUGIN_ARCH_DIR)/std/syscalls/syscall_$(1).tbl +sys-asm = $(PLUGIN_ARCH_DIR)/std/syscalls-$(1).S +sys-asm-common-name = std/syscalls/syscall-common-x86-$(1).S +sys-asm-common = $(PLUGIN_ARCH_DIR)/$(sys-asm-common-name) +sys-asm-types := $(obj)/include/uapi/std/asm/syscall-types.h +sys-exec-tbl = $(PLUGIN_ARCH_DIR)/std/sys-exec-tbl-$(1).c + +sys-bits := 64 + +AV := $$$$ + +define gen-rule-sys-codes +$(sys-codes): $(sys-def) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) echo "#ifndef ASM_SYSCALL_CODES_H_$(1)__" >> $$@ + $(Q) echo "#define ASM_SYSCALL_CODES_H_$(1)__" >> $$@ + $(Q) cat $$< | awk '/^__NR/{SYSN=$(AV)1; \ + sub("^__NR", "SYS", SYSN); \ + print "\n#ifndef ", $(AV)1; \ + print "#define", $(AV)1, $(AV)2; \ + print "#endif"; \ + print "\n#ifndef ", SYSN; \ + print "#define ", SYSN, $(AV)1; \ + print "#endif";}' >> $$@ + $(Q) echo "#endif /* ASM_SYSCALL_CODES_H_$(1)__ */" >> $$@ +endef + +define gen-rule-sys-proto +$(sys-proto): $(sys-def) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) echo "#ifndef ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ + $(Q) echo "#define ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ + $(Q) echo '#include ' >> $$@ + $(Q) echo '#include ' >> $$@ +ifeq ($(1),32) + $(Q) echo '#include "asm/syscall32.h"' >> $$@ +endif + $(Q) cat $$< | awk '/^__NR/{print "extern long", $(AV)3, \ + substr($(AV)0, index($(AV)0,$(AV)4)), ";"}' >> $$@ + $(Q) echo "#endif /* ASM_SYSCALL_PROTO_H_$(1)__ */" >> $$@ +endef + +define gen-rule-sys-asm +$(sys-asm): $(sys-def) $(sys-asm-common) $(sys-codes) $(sys-proto) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) echo '#include ' >> $$@ + $(Q) echo '#include "$(sys-asm-common-name)"' >> $$@ + $(Q) cat $$< | awk '/^__NR/{print "SYSCALL(", $(AV)3, ",", $(AV)2, ")"}' >> $$@ +endef + +define gen-rule-sys-exec-tbl +$(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) 
$(sys-proto-generic) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) cat $$< | awk '/^__NR/{print \ + "SYSCALL(", substr($(AV)3, 5), ",", $(AV)2, ")"}' >> $$@ +endef + +$(sys-codes-generic): $(PLUGIN_ARCH_DIR)/std/syscalls/syscall_32.tbl $(sys-proto-types) + $(call msg-gen, $@) + $(Q) echo "/* Autogenerated, don't edit */" > $@ + $(Q) echo "#ifndef __ASM_CR_SYSCALL_CODES_H__" >> $@ + $(Q) echo "#define __ASM_CR_SYSCALL_CODES_H__" >> $@ + $(Q) echo '#include ' >> $@ + $(Q) cat $< | awk '/^__NR/{NR32=$$1; \ + sub("^__NR", "__NR32", NR32); \ + print "\n#ifndef ", NR32; \ + print "#define ", NR32, $$2; \ + print "#endif";}' >> $@ + $(Q) echo "#endif /* __ASM_CR_SYSCALL_CODES_H__ */" >> $@ +mrproper-y += $(sys-codes-generic) + +$(sys-proto-generic): $(strip $(call map,sys-proto,$(sys-bits))) $(sys-proto-types) + $(call msg-gen, $@) + $(Q) echo "/* Autogenerated, don't edit */" > $@ + $(Q) echo "#ifndef __ASM_CR_SYSCALL_PROTO_H__" >> $@ + $(Q) echo "#define __ASM_CR_SYSCALL_PROTO_H__" >> $@ + $(Q) echo "" >> $@ + $(Q) echo "#ifdef CONFIG_X86_32" >> $@ + $(Q) echo '#include ' >> $@ + $(Q) echo "#else" >> $@ + $(Q) echo '#include ' >> $@ + $(Q) echo "#endif /* CONFIG_X86_32 */" >> $@ + $(Q) echo "" >> $@ + $(Q) echo "#endif /* __ASM_CR_SYSCALL_PROTO_H__ */" >> $@ +mrproper-y += $(sys-proto-generic) + +define gen-rule-sys-exec-tbl +$(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(sys-proto-generic) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) cat $$< | awk '/^__NR/{print \ + "SYSCALL(", substr($(AV)3, 5), ",", $(AV)2, ")"}' >> $$@ +endef + +$(eval $(call map,gen-rule-sys-codes,$(sys-bits))) +$(eval $(call map,gen-rule-sys-proto,$(sys-bits))) +$(eval $(call map,gen-rule-sys-asm,$(sys-bits))) +$(eval $(call map,gen-rule-sys-exec-tbl,$(sys-bits))) + +$(sys-asm-types): $(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h + $(call msg-gen, $@) + $(Q) ln -s 
../../../../../../$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(sys-asm-types) + +std-headers-deps += $(call sys-codes,$(sys-bits)) +std-headers-deps += $(call sys-proto,$(sys-bits)) +std-headers-deps += $(call sys-asm,$(sys-bits)) +std-headers-deps += $(call sys-exec-tbl,$(sys-bits)) +std-headers-deps += $(sys-codes-generic) +std-headers-deps += $(sys-proto-generic) +std-headers-deps += $(sys-asm-types) +mrproper-y += $(std-headers-deps) diff --git a/CRIU_code/compel/arch/x86/plugins/std/syscalls/syscall-common-x86-32.S b/CRIU_code/compel/arch/x86/plugins/std/syscalls/syscall-common-x86-32.S new file mode 100644 index 0000000..c1a7261 --- /dev/null +++ b/CRIU_code/compel/arch/x86/plugins/std/syscalls/syscall-common-x86-32.S @@ -0,0 +1,36 @@ +#include "common/asm/linkage.h" + +#define SYSCALL(name, opcode) \ + ENTRY(name); \ + movl $opcode, %eax; \ + jmp __syscall_common; \ + END(name) + +ENTRY(__syscall_common) + pushl %ebx + pushl %esi + pushl %edi + pushl %ebp + +#define __arg(n) (4 * (n) + 20)(%esp) + movl __arg(0),%ebx + movl __arg(1),%ecx + movl __arg(2),%edx + movl __arg(3),%esi + movl __arg(4),%edi + movl __arg(5),%ebp +#undef __arg + + int $0x80 + + popl %ebp + popl %edi + popl %esi + popl %ebx + ret +END(__syscall_common) + +ENTRY(__cr_restore_rt) + movl $__NR_rt_sigreturn, %eax + jmp __syscall_common +END(__cr_restore_rt) diff --git a/CRIU_code/compel/arch/x86/plugins/std/syscalls/syscall-common-x86-64.S b/CRIU_code/compel/arch/x86/plugins/std/syscalls/syscall-common-x86-64.S new file mode 100644 index 0000000..74465c3 --- /dev/null +++ b/CRIU_code/compel/arch/x86/plugins/std/syscalls/syscall-common-x86-64.S @@ -0,0 +1,21 @@ +#include "common/asm/linkage.h" + +#define SYSCALL(name, opcode) \ + ENTRY(name); \ + movl $opcode, %eax; \ + jmp __syscall_common; \ + END(name) + + .text + .align 4 + +ENTRY(__syscall_common) + movq %rcx, %r10 + syscall + ret +END(__syscall_common) + +ENTRY(__cr_restore_rt) + movq $__NR_rt_sigreturn, %rax + syscall 
+END(__cr_restore_rt) diff --git a/CRIU_code/compel/arch/x86/plugins/std/syscalls/syscall32.c b/CRIU_code/compel/arch/x86/plugins/std/syscalls/syscall32.c new file mode 100644 index 0000000..e172cac --- /dev/null +++ b/CRIU_code/compel/arch/x86/plugins/std/syscalls/syscall32.c @@ -0,0 +1,85 @@ +#include "asm/types.h" +#include "syscall-32.h" + +#define SYS_SOCKET 1 /* sys_socket(2) */ +#define SYS_BIND 2 /* sys_bind(2) */ +#define SYS_CONNECT 3 /* sys_connect(2) */ +#define SYS_SENDTO 11 /* sys_sendto(2) */ +#define SYS_RECVFROM 12 /* sys_recvfrom(2) */ +#define SYS_SHUTDOWN 13 /* sys_shutdown(2) */ +#define SYS_SETSOCKOPT 14 /* sys_setsockopt(2) */ +#define SYS_GETSOCKOPT 15 /* sys_getsockopt(2) */ +#define SYS_SENDMSG 16 /* sys_sendmsg(2) */ +#define SYS_RECVMSG 17 /* sys_recvmsg(2) */ + +long sys_socket(int domain, int type, int protocol) +{ + uint32_t a[] = { (uint32_t)domain, (uint32_t)type, (uint32_t)protocol }; + return sys_socketcall(SYS_SOCKET, (unsigned long *)a); +} + +long sys_connect(int sockfd, struct sockaddr *addr, int addrlen) +{ + uint32_t a[] = {(uint32_t)sockfd, (uint32_t)addr, (uint32_t)addrlen}; + return sys_socketcall(SYS_CONNECT, (unsigned long *)a); +} + +long sys_sendto(int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) +{ + uint32_t a[] = {(uint32_t)sockfd, (uint32_t)buff, (uint32_t)len, (uint32_t)flags, (uint32_t)addr, (uint32_t)addr_len}; + return sys_socketcall(SYS_SENDTO, (unsigned long *)a); +} + +long sys_recvfrom(int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) +{ + uint32_t a[] = {(uint32_t)sockfd, (uint32_t)ubuf, (uint32_t)size, (uint32_t)flags, (uint32_t)addr, (uint32_t)addr_len}; + return sys_socketcall(SYS_RECVFROM, (unsigned long *)a); +} + +long sys_sendmsg(int sockfd, const struct msghdr *msg, int flags) +{ + uint32_t a[] = {(uint32_t)sockfd, (uint32_t)msg, (uint32_t)flags}; + return sys_socketcall(SYS_SENDMSG, (unsigned long *)a); +} + 
+long sys_recvmsg(int sockfd, struct msghdr *msg, int flags) +{ + uint32_t a[] = {(uint32_t)sockfd, (uint32_t)msg, (uint32_t)flags}; + return sys_socketcall(SYS_RECVMSG, (unsigned long *)a); +} + +long sys_shutdown(int sockfd, int how) +{ + uint32_t a[] = {(uint32_t)sockfd, (uint32_t)how}; + return sys_socketcall(SYS_SHUTDOWN, (unsigned long *)a); +} + +long sys_bind(int sockfd, const struct sockaddr *addr, int addrlen) +{ + uint32_t a[] = {(uint32_t)sockfd, (uint32_t)addr, (uint32_t)addrlen}; + return sys_socketcall(SYS_BIND, (unsigned long *)a); +} + +long sys_setsockopt(int sockfd, int level, int optname, const void *optval, unsigned int optlen) +{ + uint32_t a[] = {(uint32_t)sockfd, (uint32_t)level, (uint32_t)optname, (uint32_t)optval, (uint32_t)optlen}; + return sys_socketcall(SYS_SETSOCKOPT, (unsigned long *)a); +} + +long sys_getsockopt(int sockfd, int level, int optname, const void *optval, unsigned int *optlen) +{ + uint32_t a[] = {(uint32_t)sockfd, (uint32_t)level, (uint32_t)optname, (uint32_t)optval, (uint32_t)optlen}; + return sys_socketcall(SYS_GETSOCKOPT, (unsigned long *)a); +} + +#define SHMAT 21 + +long sys_shmat(int shmid, void *shmaddr, int shmflag) +{ + return sys_ipc(SHMAT, shmid, shmflag, 0, shmaddr, 0); +} + +long sys_pread(unsigned int fd, char *ubuf, uint32_t count, uint64_t pos) +{ + return sys_pread64(fd, ubuf, count, (uint32_t)(pos & 0xffffffffu), (uint32_t)(pos >> 32)); +} diff --git a/CRIU_code/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl b/CRIU_code/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl new file mode 100644 index 0000000..a6c55b8 --- /dev/null +++ b/CRIU_code/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl @@ -0,0 +1,97 @@ +# +# System calls table, please make sure the table consist only the syscalls +# really used somewhere in project. 
+# +# code name arguments +# ------------------------------------------------------------------------------------------------------------------------------------------------------------- +__NR_restart_syscall 0 sys_restart_syscall (void) +__NR_exit 1 sys_exit (unsigned long error_code) +__NR_read 3 sys_read (int fd, void *buf, unsigned long count) +__NR_write 4 sys_write (int fd, const void *buf, unsigned long count) +__NR_open 5 sys_open (const char *filename, int flags, unsigned int mode) +__NR_close 6 sys_close (int fd) +__NR_unlink 10 sys_unlink (char *pathname) +__NR_lseek 19 sys_lseek (int fd, int32_t offset, unsigned int origin) +__NR_getpid 20 sys_getpid (void) +__NR_mount 21 sys_mount (const char *dev_name, const char *dir_name, const char *type, unsigned long flags, const void *data) +__NR_ptrace 26 sys_ptrace (long request, pid_t pid, void *addr, void *data) +__NR_kill 37 sys_kill (long pid, int sig) +__NR_mkdir 39 sys_mkdir (const char *name, int mode) +__NR_rmdir 40 sys_rmdir (const char *name) +__NR_brk 45 sys_brk (void *addr) +__NR_umount2 52 sys_umount2 (char *name, int flags) +__NR_ioctl 54 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) +__NR_fcntl 55 sys_fcntl (unsigned int fd, unsigned int cmd, unsigned long arg) +__NR_umask 60 sys_umask (int mask) +__NR_setrlimit 75 sys_setrlimit (unsigned int resource, struct krlimit *rlim) +__NR_gettimeofday 78 sys_gettimeofday (struct timeval *tv, struct timezone *tz) +__NR_munmap 91 sys_munmap (void *addr, unsigned long len) +__NR_setpriority 97 sys_setpriority (int which, int who, int nice) +__NR_socketcall 102 sys_socketcall (int call, unsigned long *args) +__NR_setitimer 104 sys_setitimer (int which, struct itimerval *in, struct itimerval *out) +__NR_getitimer 105 sys_getitimer (int which, struct itimerval *it) +__NR_wait4 114 sys_wait4 (pid_t pid, int *stat_addr, int options, struct rusage *ru) +__NR_ipc 117 sys_ipc (unsigned int call, int first, unsigned long second, unsigned long 
third, void *ptr, long fifth) +__NR_clone 120 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) +__NR_mprotect 125 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) +__NR_getpgid 132 sys_getpgid (pid_t pid) +__NR_personality 136 sys_personality (unsigned int personality) +__NR_flock 143 sys_flock (int fd, unsigned long cmd) +__NR_getsid 147 sys_getsid (void) +__NR_sched_setscheduler 156 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) +__NR_nanosleep 162 sys_nanosleep (struct timespec *rqtp, struct timespec *rmtp) +__NR_mremap 163 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) +__NR_prctl 172 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) +__NR_rt_sigreturn 173 sys_rt_sigreturn (void) +__NR_rt_sigaction 174 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) +__NR_rt_sigprocmask 175 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *oset, size_t sigsetsize) +__NR_rt_sigqueueinfo 178 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *uinfo) +__NR_pread64 180 sys_pread64 (unsigned int fd, char *ubuf, uint32_t count, uint32_t poslo, uint32_t poshi) +__NR_capget 184 sys_capget (struct cap_header *h, struct cap_data *d) +__NR_capset 185 sys_capset (struct cap_header *h, struct cap_data *d) +__NR_sigaltstack 186 sys_sigaltstack (const void *uss_ptr, void *uoss_ptr) +__NR_mmap2 192 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) +__NR_getgroups32 205 sys_getgroups (int gsize, unsigned int *groups) +__NR_setgroups32 206 sys_setgroups (int gsize, unsigned int *groups) +__NR_setresuid32 208 sys_setresuid (int uid, int euid, int suid) +__NR_getresuid32 209 sys_getresuid (int *uid, int *euid, int *suid) +__NR_setresgid32 210 
sys_setresgid (int gid, int egid, int sgid) +__NR_getresgid32 211 sys_getresgid (int *gid, int *egid, int *sgid) +__NR_setfsuid32 215 sys_setfsuid (int fsuid) +__NR_setfsgid32 216 sys_setfsgid (int fsgid) +__NR_mincore 218 sys_mincore (void *addr, unsigned long size, unsigned char *vec) +__NR_madvise 219 sys_madvise (unsigned long start, size_t len, int behavior) +__NR_gettid 224 sys_gettid (void) +__NR_futex 240 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) +__NR_set_thread_area 243 sys_set_thread_area (user_desc_t *info) +__NR_get_thread_area 244 sys_get_thread_area (user_desc_t *info) +__NR_io_setup 245 sys_io_setup (unsigned nr_reqs, aio_context_t *ctx32p) +__NR_io_getevents 247 sys_io_getevents (aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout) +__NR_io_submit 248 sys_io_submit (aio_context_t ctx_id, long nr, struct iocb **iocbpp) +__NR_exit_group 252 sys_exit_group (int error_code) +__NR_set_tid_address 258 sys_set_tid_address (int *tid_addr) +__NR_timer_create 259 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) +__NR_timer_settime 260 sys_timer_settime (kernel_timer_t timer_id, int flags, struct itimerspec *new, struct itimerspec *old) +__NR_timer_gettime 261 sys_timer_gettime (int timer_id, struct itimerspec *setting) +__NR_timer_getoverrun 262 sys_timer_getoverrun (int timer_id) +__NR_timer_delete 263 sys_timer_delete (kernel_timer_t timer_id) +__NR_clock_gettime 265 sys_clock_gettime (int which_clock, struct timespec *tp) +__NR_waitid 284 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) +__NR_openat 295 sys_openat (int dfd, const char *filename, int flags, int mode) +__NR_readlinkat 305 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) +__NR_set_robust_list 311 sys_set_robust_list (struct robust_list_head *head, size_t len) 
+__NR_get_robust_list 312 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) +__NR_vmsplice 316 sys_vmsplice (int fd, const struct iovec *iov, unsigned int nr_segs, unsigned int flags) +__NR_signalfd 321 sys_signalfd (int ufd, const k_rtsigset_t *sigmask, size_t sigsetsize) +__NR_fallocate 324 sys_fallocate (int fd, int mode, loff_t offset, loff_t len) +__NR_timerfd_settime 325 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) +__NR_preadv 333 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) +__NR_rt_tgsigqueueinfo 335 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *uinfo) +__NR_fanotify_init 338 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) +__NR_fanotify_mark 339 sys_fanotify_mark (int fanotify_fd, unsigned int flag, uint32_t mask, int dfd, const char *pathname) +__NR_open_by_handle_at 342 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) +__NR_setns 346 sys_setns (int fd, int nstype) +__NR_kcmp 349 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) +__NR_seccomp 354 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) +__NR_memfd_create 356 sys_memfd_create (const char *name, unsigned int flags) +__NR_userfaultfd 374 sys_userfaultfd (int flags) diff --git a/CRIU_code/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/CRIU_code/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl new file mode 100644 index 0000000..6427151 --- /dev/null +++ b/CRIU_code/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl @@ -0,0 +1,108 @@ +# +# System calls table, please make sure the table consist only the syscalls +# really used somewhere in project. 
+# +# __NR_name code name arguments +# ------------------------------------------------------------------------------------------------------------------------------------------------------------- +__NR_read 0 sys_read (int fd, void *buf, unsigned long count) +__NR_write 1 sys_write (int fd, const void *buf, unsigned long count) +__NR_open 2 sys_open (const char *filename, unsigned long flags, unsigned long mode) +__NR_close 3 sys_close (int fd) +__NR_lseek 8 sys_lseek (int fd, unsigned long offset, unsigned long origin) +__NR_mmap 9 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) +__NR_mprotect 10 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) +__NR_munmap 11 sys_munmap (void *addr, unsigned long len) +__NR_brk 12 sys_brk (void *addr) +__NR_rt_sigaction 13 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) +__NR_rt_sigprocmask 14 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) +__NR_rt_sigreturn 15 sys_rt_sigreturn (void) +__NR_ioctl 16 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) +__NR_pread64 17 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos) +__NR_mremap 25 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) +__NR_mincore 27 sys_mincore (void *addr, unsigned long size, unsigned char *vec) +__NR_madvise 28 sys_madvise (unsigned long start, size_t len, int behavior) +__NR_shmat 30 sys_shmat (int shmid, void *shmaddr, int shmflag) +__NR_dup2 33 sys_dup2 (int oldfd, int newfd) +__NR_nanosleep 35 sys_nanosleep (struct timespec *req, struct timespec *rem) +__NR_getitimer 36 sys_getitimer (int which, const struct itimerval *val) +__NR_setitimer 38 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old) +__NR_getpid 39 sys_getpid (void) +__NR_socket 41 sys_socket 
(int domain, int type, int protocol) +__NR_connect 42 sys_connect (int sockfd, struct sockaddr *addr, int addrlen) +__NR_sendto 44 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) +__NR_recvfrom 45 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) +__NR_sendmsg 46 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags) +__NR_recvmsg 47 sys_recvmsg (int sockfd, struct msghdr *msg, int flags) +__NR_shutdown 48 sys_shutdown (int sockfd, int how) +__NR_bind 49 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen) +__NR_setsockopt 54 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen) +__NR_getsockopt 55 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) +__NR_clone 56 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, void *child_tid, unsigned long new_tls) +__NR_exit 60 sys_exit (unsigned long error_code) +__NR_wait4 61 sys_wait4 (int pid, int *status, int options, struct rusage *ru) +__NR_kill 62 sys_kill (long pid, int sig) +__NR_fcntl 72 sys_fcntl (int fd, int type, long arg) +__NR_flock 73 sys_flock (int fd, unsigned long cmd) +__NR_mkdir 83 sys_mkdir (const char *name, int mode) +__NR_rmdir 84 sys_rmdir (const char *name) +__NR_unlink 87 sys_unlink (char *pathname) +__NR_umask 95 sys_umask (int mask) +__NR_gettimeofday 96 sys_gettimeofday (struct timeval *tv, struct timezone *tz) +__NR_ptrace 101 sys_ptrace (long request, pid_t pid, void *addr, void *data) +__NR_getgroups 115 sys_getgroups (int gsize, unsigned int *groups) +__NR_setgroups 116 sys_setgroups (int gsize, unsigned int *groups) +__NR_setresuid 117 sys_setresuid (int uid, int euid, int suid) +__NR_getresuid 118 sys_getresuid (int *uid, int *euid, int *suid) +__NR_setresgid 119 sys_setresgid (int gid, int egid, int sgid) +__NR_getresgid 120 sys_getresgid (int *gid, int *egid, int 
*sgid) +__NR_getpgid 121 sys_getpgid (pid_t pid) +__NR_setfsuid 122 sys_setfsuid (int fsuid) +__NR_setfsgid 123 sys_setfsgid (int fsgid) +__NR_getsid 124 sys_getsid (void) +__NR_capget 125 sys_capget (struct cap_header *h, struct cap_data *d) +__NR_capset 126 sys_capset (struct cap_header *h, struct cap_data *d) +__NR_rt_sigqueueinfo 129 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info) +__NR_sigaltstack 131 sys_sigaltstack (const void *uss, void *uoss) +__NR_personality 135 sys_personality (unsigned int personality) +__NR_setpriority 141 sys_setpriority (int which, int who, int nice) +__NR_sched_setscheduler 144 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) +__NR_prctl 157 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) +__NR_arch_prctl 158 sys_arch_prctl (int option, unsigned long addr) +__NR_setrlimit 160 sys_setrlimit (int resource, struct krlimit *rlim) +__NR_mount 165 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) +__NR_umount2 166 sys_umount2 (char *name, int flags) +__NR_gettid 186 sys_gettid (void) +__NR_futex 202 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) +__NR_set_thread_area 205 sys_set_thread_area (user_desc_t *info) +__NR_io_setup 206 sys_io_setup (unsigned nr_events, aio_context_t *ctx) +__NR_io_getevents 208 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) +__NR_io_submit 209 sys_io_submit (aio_context_t ctx, long nr, struct iocb **iocbpp) +__NR_get_thread_area 211 sys_get_thread_area (user_desc_t *info) +__NR_set_tid_address 218 sys_set_tid_address (int *tid_addr) +__NR_restart_syscall 219 sys_restart_syscall (void) +__NR_sys_timer_create 222 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) +__NR_sys_timer_settime 223 sys_timer_settime 
(kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) +__NR_sys_timer_gettime 224 sys_timer_gettime (int timer_id, const struct itimerspec *setting) +__NR_sys_timer_getoverrun 225 sys_timer_getoverrun (int timer_id) +__NR_sys_timer_delete 226 sys_timer_delete (kernel_timer_t timer_id) +__NR_clock_gettime 228 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) +__NR_exit_group 231 sys_exit_group (int error_code) +__NR_openat 257 sys_openat (int dfd, const char *filename, int flags, int mode) +__NR_waitid 247 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) +__NR_readlinkat 267 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) +__NR_set_robust_list 273 sys_set_robust_list (struct robust_list_head *head, size_t len) +__NR_get_robust_list 274 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) +__NR_seccomp 317 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) +__NR_vmsplice 278 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) +__NR_fallocate 285 sys_fallocate (int fd, int mode, loff_t offset, loff_t len) +__NR_timerfd_settime 286 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) +__NR_signalfd4 289 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) +__NR_preadv 295 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) +__NR_rt_tgsigqueueinfo 297 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) +__NR_fanotify_init 300 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) +__NR_fanotify_mark 301 sys_fanotify_mark (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) +__NR_open_by_handle_at 304 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) 
+__NR_setns 308 sys_setns (int fd, int nstype) +__NR_kcmp 312 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) +__NR_memfd_create 319 sys_memfd_create (const char *name, unsigned int flags) +__NR_userfaultfd 323 sys_userfaultfd (int flags) diff --git a/CRIU_code/compel/arch/x86/scripts/compel-pack-compat.lds.S b/CRIU_code/compel/arch/x86/scripts/compel-pack-compat.lds.S new file mode 100644 index 0000000..ff9c2c6 --- /dev/null +++ b/CRIU_code/compel/arch/x86/scripts/compel-pack-compat.lds.S @@ -0,0 +1,41 @@ +OUTPUT_ARCH(i386) +TARGET(elf32-i386) +EXTERN(__export_parasite_head_start) + +SECTIONS +{ + .text : { + *(.head.text) + ASSERT(DEFINED(__export_parasite_head_start), + "Symbol __export_parasite_head_start is missing"); + *(.text*) + *(.compel.exit) + *(.compel.init) + } + + .data : { + *(.data*) + *(.bss*) + } + + .rodata : { + *(.rodata*) + *(.got*) + } + + .toc : ALIGN(8) { + *(.toc*) + } + + /DISCARD/ : { + *(.debug*) + *(.comment*) + *(.note*) + *(.group*) + *(.eh_frame*) + } + +/* Parasite args should have 4 bytes align, as we have futex inside. */ +. = ALIGN(4); +__export_parasite_args = .; +} diff --git a/CRIU_code/compel/arch/x86/scripts/compel-pack.lds.S b/CRIU_code/compel/arch/x86/scripts/compel-pack.lds.S new file mode 100644 index 0000000..0c936f8 --- /dev/null +++ b/CRIU_code/compel/arch/x86/scripts/compel-pack.lds.S @@ -0,0 +1,41 @@ +OUTPUT_ARCH(i386:x86-64) +TARGET(elf64-x86-64) +EXTERN(__export_parasite_head_start) + +SECTIONS +{ + .text : { + *(.head.text) + ASSERT(DEFINED(__export_parasite_head_start), + "Symbol __export_parasite_head_start is missing"); + *(.text*) + *(.compel.exit) + *(.compel.init) + } + + .data : { + *(.data*) + *(.bss*) + } + + .rodata : { + *(.rodata*) + *(.got*) + } + + .toc : ALIGN(8) { + *(.toc*) + } + + /DISCARD/ : { + *(.debug*) + *(.comment*) + *(.note*) + *(.group*) + *(.eh_frame*) + } + +/* Parasite args should have 4 bytes align, as we have futex inside. */ +. 
= ALIGN(4); +__export_parasite_args = .; +} diff --git a/CRIU_code/compel/arch/x86/src/lib/cpu.c b/CRIU_code/compel/arch/x86/src/lib/cpu.c new file mode 100644 index 0000000..6175121 --- /dev/null +++ b/CRIU_code/compel/arch/x86/src/lib/cpu.c @@ -0,0 +1,489 @@ +#include +#include + +#include "compel-cpu.h" +#include "common/bitops.h" +#include "common/compiler.h" + +#include "log.h" +#include "common/bug.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +static compel_cpuinfo_t rt_info; + +static void fetch_rt_cpuinfo(void) +{ + static bool rt_info_done = false; + + if (!rt_info_done) { + compel_cpuid(&rt_info); + rt_info_done = true; + } +} + +/* + * Although we spell it out in here, the Processor Trace + * xfeature is completely unused. We use other mechanisms + * to save/restore PT state in Linux. + */ + +static const char * const xfeature_names[] = { + "x87 floating point registers" , + "SSE registers" , + "AVX registers" , + "MPX bounds registers" , + "MPX CSR" , + "AVX-512 opmask" , + "AVX-512 Hi256" , + "AVX-512 ZMM_Hi256" , + "Processor Trace" , + "Protection Keys User registers", + "Hardware Duty Cycling" , +}; + +static short xsave_cpuid_features[] = { + X86_FEATURE_FPU, + X86_FEATURE_XMM, + X86_FEATURE_AVX, + X86_FEATURE_MPX, + X86_FEATURE_MPX, + X86_FEATURE_AVX512F, + X86_FEATURE_AVX512F, + X86_FEATURE_AVX512F, + X86_FEATURE_INTEL_PT, + X86_FEATURE_PKU, + X86_FEATURE_HDC, +}; + +void compel_set_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) +{ + if (likely(feature < NCAPINTS_BITS)) + set_bit(feature, (unsigned long *)c->x86_capability); +} + +void compel_clear_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) +{ + if (likely(feature < NCAPINTS_BITS)) + clear_bit(feature, (unsigned long *)c->x86_capability); +} + +int compel_test_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) +{ + if (likely(feature < NCAPINTS_BITS)) + return test_bit(feature, (unsigned long *)c->x86_capability); + return 0; +} + +int compel_test_fpu_cap(compel_cpuinfo_t *c, 
unsigned int feature) +{ + if (likely(feature < XFEATURE_MAX)) + return (c->xfeatures_mask & (1UL << feature)); + return 0; +} + +static int compel_fpuid(compel_cpuinfo_t *c) +{ + unsigned int last_good_offset; + uint32_t eax, ebx, ecx, edx; + size_t i; + + BUILD_BUG_ON(ARRAY_SIZE(xsave_cpuid_features) != + ARRAY_SIZE(xfeature_names)); + + if (!compel_test_cpu_cap(c, X86_FEATURE_FPU)) { + pr_err("fpu: No FPU detected\n"); + return -1; + } + + if (!compel_test_cpu_cap(c, X86_FEATURE_XSAVE)) { + pr_info("fpu: x87 FPU will use %s\n", + compel_test_cpu_cap(c, X86_FEATURE_FXSR) ? + "FXSAVE" : "FSAVE"); + return 0; + } + + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + c->xfeatures_mask = eax + ((uint64_t)edx << 32); + + if ((c->xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { + /* + * This indicates that something really unexpected happened + * with the enumeration. + */ + pr_err("fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx\n", + (unsigned long long)c->xfeatures_mask); + return -1; + } + + /* + * Clear XSAVE features that are disabled in the normal CPUID. + */ + for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { + if (!compel_test_cpu_cap(c, xsave_cpuid_features[i])) + c->xfeatures_mask &= ~(1 << i); + } + + c->xfeatures_mask &= XCNTXT_MASK; + c->xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR; + + /* + * xsaves is not enabled in userspace, so + * xsaves is mostly for debug purpose. 
+ */ + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + c->xsave_size = ebx; + c->xsave_size_max = ecx; + + cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); + c->xsaves_size = ebx; + + pr_debug("fpu: xfeatures_mask 0x%llx xsave_size %u xsave_size_max %u xsaves_size %u\n", + (unsigned long long)c->xfeatures_mask, + c->xsave_size, c->xsave_size_max, c->xsaves_size); + + if (c->xsave_size_max > sizeof(struct xsave_struct)) + pr_warn_once("fpu: max xsave frame exceed xsave_struct (%u %u)\n", + c->xsave_size_max, (unsigned)sizeof(struct xsave_struct)); + + memset(c->xstate_offsets, 0xff, sizeof(c->xstate_offsets)); + memset(c->xstate_sizes, 0xff, sizeof(c->xstate_sizes)); + memset(c->xstate_comp_offsets, 0xff, sizeof(c->xstate_comp_offsets)); + memset(c->xstate_comp_sizes, 0xff, sizeof(c->xstate_comp_sizes)); + + /* start at the beginning of the "extended state" */ + last_good_offset = offsetof(struct xsave_struct, extended_state_area); + + /* + * The FP xstates and SSE xstates are legacy states. They are always + * in the fixed offsets in the xsave area in either compacted form + * or standard form. + */ + c->xstate_offsets[0] = 0; + c->xstate_sizes[0] = offsetof(struct i387_fxsave_struct, xmm_space); + c->xstate_offsets[1] = c->xstate_sizes[0]; + c->xstate_sizes[1] = FIELD_SIZEOF(struct i387_fxsave_struct, xmm_space); + + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + if (!(c->xfeatures_mask & (1UL << i))) + continue; + + /* + * If an xfeature is supervisor state, the offset + * in EBX is invalid. We leave it to -1. + * + * SDM says: If state component 'i' is a user state component, + * ECX[0] returns 0; if state component i is a supervisor + * state component, ECX[0] returns 1. 
+ */ + cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); + if (!(ecx & 1)) + c->xstate_offsets[i] = ebx; + + c->xstate_sizes[i] = eax; + + /* + * In our xstate size checks, we assume that the + * highest-numbered xstate feature has the + * highest offset in the buffer. Ensure it does. + */ + if (last_good_offset > c->xstate_offsets[i]) + pr_warn_once("fpu: misordered xstate %d %d\n", + last_good_offset, c->xstate_offsets[i]); + + last_good_offset = c->xstate_offsets[i]; + } + + BUILD_BUG_ON(sizeof(c->xstate_offsets) != sizeof(c->xstate_sizes)); + BUILD_BUG_ON(sizeof(c->xstate_comp_offsets) != sizeof(c->xstate_comp_sizes)); + + c->xstate_comp_offsets[0] = 0; + c->xstate_comp_sizes[0] = offsetof(struct i387_fxsave_struct, xmm_space); + c->xstate_comp_offsets[1] = c->xstate_comp_sizes[0]; + c->xstate_comp_sizes[1] = FIELD_SIZEOF(struct i387_fxsave_struct, xmm_space); + + if (!compel_test_cpu_cap(c, X86_FEATURE_XSAVES)) { + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + if ((c->xfeatures_mask & (1UL << i))) { + c->xstate_comp_offsets[i] = c->xstate_offsets[i]; + c->xstate_comp_sizes[i] = c->xstate_sizes[i]; + } + } + } else { + c->xstate_comp_offsets[FIRST_EXTENDED_XFEATURE] = + FXSAVE_SIZE + XSAVE_HDR_SIZE; + + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + if ((c->xfeatures_mask & (1UL << i))) + c->xstate_comp_sizes[i] = c->xstate_sizes[i]; + else + c->xstate_comp_sizes[i] = 0; + + if (i > FIRST_EXTENDED_XFEATURE) { + c->xstate_comp_offsets[i] = c->xstate_comp_offsets[i-1] + + c->xstate_comp_sizes[i-1]; + + /* + * The value returned by ECX[1] indicates the alignment + * of state component 'i' when the compacted format + * of the extended region of an XSAVE area is used: + */ + cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); + if (ecx & 2) + c->xstate_comp_offsets[i] = ALIGN(c->xstate_comp_offsets[i], 64); + } + } + } + + if (!pr_quelled(COMPEL_LOG_DEBUG)) { + for (i = 0; i < ARRAY_SIZE(c->xstate_offsets); i++) { + if 
(!(c->xfeatures_mask & (1UL << i))) + continue; + pr_debug("fpu: %-32s xstate_offsets %6d / %-6d xstate_sizes %6d / %-6d\n", + xfeature_names[i], c->xstate_offsets[i], c->xstate_comp_offsets[i], + c->xstate_sizes[i], c->xstate_comp_sizes[i]); + } + } + + return 0; +} + +int compel_cpuid(compel_cpuinfo_t *c) +{ + uint32_t eax, ebx, ecx, edx; + + /* + * See cpu_detect() in the kernel, also + * read cpuid specs not only from general + * SDM but for extended instructions set + * reference. + */ + + /* Get vendor name */ + cpuid(0x00000000, + (unsigned int *)&c->cpuid_level, + (unsigned int *)&c->x86_vendor_id[0], + (unsigned int *)&c->x86_vendor_id[8], + (unsigned int *)&c->x86_vendor_id[4]); + + if (!strcmp(c->x86_vendor_id, "GenuineIntel")) { + c->x86_vendor = X86_VENDOR_INTEL; + } else if (!strcmp(c->x86_vendor_id, "AuthenticAMD") || + !strcmp(c->x86_vendor_id, "HygonGenuine")) { + c->x86_vendor = X86_VENDOR_AMD; + } else { + pr_err("Unsupported CPU vendor %s\n", + c->x86_vendor_id); + return -1; + } + + c->x86_family = 4; + + /* Intel-defined flags: level 0x00000001 */ + if (c->cpuid_level >= 0x00000001) { + cpuid(0x00000001, &eax, &ebx, &ecx, &edx); + c->x86_family = (eax >> 8) & 0xf; + c->x86_model = (eax >> 4) & 0xf; + c->x86_mask = eax & 0xf; + + if (c->x86_family == 0xf) + c->x86_family += (eax >> 20) & 0xff; + if (c->x86_family >= 0x6) + c->x86_model += ((eax >> 16) & 0xf) << 4; + + c->x86_capability[CPUID_1_EDX] = edx; + c->x86_capability[CPUID_1_ECX] = ecx; + } + + /* Thermal and Power Management Leaf: level 0x00000006 (eax) */ + if (c->cpuid_level >= 0x00000006) + c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006); + + /* Additional Intel-defined flags: level 0x00000007 */ + if (c->cpuid_level >= 0x00000007) { + cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); + c->x86_capability[CPUID_7_0_EBX] = ebx; + c->x86_capability[CPUID_7_0_ECX] = ecx; + c->x86_capability[CPUID_7_0_EDX] = edx; + } + + /* Extended state features: level 0x0000000d */ + if 
(c->cpuid_level >= 0x0000000d) { + cpuid_count(0x0000000d, 1, &eax, &ebx, &ecx, &edx); + c->x86_capability[CPUID_D_1_EAX] = eax; + } + + /* Additional Intel-defined flags: level 0x0000000F */ + if (c->cpuid_level >= 0x0000000F) { + /* QoS sub-leaf, EAX=0Fh, ECX=0 */ + cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx); + c->x86_capability[CPUID_F_0_EDX] = edx; + + if (compel_test_cpu_cap(c, X86_FEATURE_CQM_LLC)) { + /* QoS sub-leaf, EAX=0Fh, ECX=1 */ + cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx); + c->x86_capability[CPUID_F_1_EDX] = edx; + } + } + + /* AMD-defined flags: level 0x80000001 */ + eax = cpuid_eax(0x80000000); + c->extended_cpuid_level = eax; + + if ((eax & 0xffff0000) == 0x80000000) { + if (eax >= 0x80000001) { + cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + + c->x86_capability[CPUID_8000_0001_ECX] = ecx; + c->x86_capability[CPUID_8000_0001_EDX] = edx; + } + } + + /* + * We don't care about scattered features for now, + * otherwise look into init_scattered_cpuid_features() + * in kernel. + * + * Same applies to speculation control. Look into + * init_speculation_control() otherwise. 
+ */ + + if (c->extended_cpuid_level >= 0x80000004) { + unsigned int *v; + char *p, *q; + v = (unsigned int *)c->x86_model_id; + cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); + cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); + cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); + c->x86_model_id[48] = 0; + + /* + * Intel chips right-justify this string for some dumb reason; + * undo that brain damage: + */ + p = q = &c->x86_model_id[0]; + while (*p == ' ') + p++; + if (p != q) { + while (*p) + *q++ = *p++; + while (q <= &c->x86_model_id[48]) + *q++ = '\0'; /* Zero-pad the rest */ + } + } + + if (c->extended_cpuid_level >= 0x80000007) { + cpuid(0x80000007, &eax, &ebx, &ecx, &edx); + + c->x86_capability[CPUID_8000_0007_EBX] = ebx; + c->x86_power = edx; + } + + if (c->extended_cpuid_level >= 0x8000000a) + c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a); + + if (c->extended_cpuid_level >= 0x80000008) + c->x86_capability[CPUID_8000_0008_EBX] = cpuid_ebx(0x80000008); + + /* On x86-64 CPUID is always present */ + compel_set_cpu_cap(c, X86_FEATURE_CPUID); + + /* On x86-64 NOP is always present */ + compel_set_cpu_cap(c, X86_FEATURE_NOPL); + + /* + * On x86-64 syscalls32 are enabled but we don't + * set it yet for backward compatibility reason + */ + //compel_set_cpu_cap(c, X86_FEATURE_SYSCALL32); + + /* See filter_cpuid_features in kernel */ + if ((int32_t)c->cpuid_level < (int32_t)0x0000000d) + compel_clear_cpu_cap(c, X86_FEATURE_XSAVE); + + /* + * We only care about small subset from c_early_init: + * early_init_amd and early_init_intel + */ + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + /* + * Strictly speaking we need to read MSR_IA32_MISC_ENABLE + * here but on ring3 it's impossible. 
+ */ + if (c->x86_family == 15) { + compel_clear_cpu_cap(c, X86_FEATURE_REP_GOOD); + compel_clear_cpu_cap(c, X86_FEATURE_ERMS); + } else if (c->x86_family == 6) { + /* On x86-64 rep is fine */ + compel_set_cpu_cap(c, X86_FEATURE_REP_GOOD); + } + + break; + case X86_VENDOR_AMD: + /* + * Bit 31 in normal CPUID used for nonstandard 3DNow ID; + * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway + */ + compel_clear_cpu_cap(c, 0 * 32 + 31); + if (c->x86_family >= 0x10) + compel_set_cpu_cap(c, X86_FEATURE_REP_GOOD); + if (c->x86_family == 0xf) { + uint32_t level; + + /* On C+ stepping K8 rep microcode works well for copy/memset */ + level = cpuid_eax(1); + if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) + compel_set_cpu_cap(c, X86_FEATURE_REP_GOOD); + } + break; + } + + pr_debug("x86_family %u x86_vendor_id %s x86_model_id %s\n", + c->x86_family, c->x86_vendor_id, c->x86_model_id); + + return compel_fpuid(c); +} + +bool compel_cpu_has_feature(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return compel_test_cpu_cap(&rt_info, feature); +} + +bool compel_fpu_has_feature(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return compel_test_fpu_cap(&rt_info, feature); +} + +uint32_t compel_fpu_feature_size(unsigned int feature) +{ + fetch_rt_cpuinfo(); + if (feature >= FIRST_EXTENDED_XFEATURE && + feature < XFEATURE_MAX) + return rt_info.xstate_sizes[feature]; + return 0; +} + +uint32_t compel_fpu_feature_offset(unsigned int feature) +{ + fetch_rt_cpuinfo(); + if (feature >= FIRST_EXTENDED_XFEATURE && + feature < XFEATURE_MAX) + return rt_info.xstate_offsets[feature]; + return 0; +} + +void compel_cpu_clear_feature(unsigned int feature) +{ + fetch_rt_cpuinfo(); /* populate rt_info on first use */ + compel_clear_cpu_cap(&rt_info, feature); /* void helper: nothing to return */ +} + +void compel_cpu_copy_cpuinfo(compel_cpuinfo_t *c) +{ + fetch_rt_cpuinfo(); + memcpy(c, &rt_info, sizeof(rt_info)); +} diff --git a/CRIU_code/compel/arch/x86/src/lib/handle-elf-host.c b/CRIU_code/compel/arch/x86/src/lib/handle-elf-host.c new
file mode 100644 index 0000000..fe46118 --- /dev/null +++ b/CRIU_code/compel/arch/x86/src/lib/handle-elf-host.c @@ -0,0 +1 @@ +handle-elf.c \ No newline at end of file diff --git a/CRIU_code/compel/arch/x86/src/lib/handle-elf.c b/CRIU_code/compel/arch/x86/src/lib/handle-elf.c new file mode 100644 index 0000000..62fb28f --- /dev/null +++ b/CRIU_code/compel/arch/x86/src/lib/handle-elf.c @@ -0,0 +1,22 @@ +#include + +#include "uapi/compel.h" + +#include "handle-elf.h" +#include "piegen.h" +#include "log.h" + +static const unsigned char __maybe_unused +elf_ident_64_le[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +int handle_binary(void *mem, size_t size) +{ + if (memcmp(mem, elf_ident_64_le, sizeof(elf_ident_64_le)) == 0) + return handle_elf_x86_64(mem, size); + + pr_err("Unsupported Elf format detected\n"); + return -EINVAL; +} diff --git a/CRIU_code/compel/arch/x86/src/lib/include/cpu.h b/CRIU_code/compel/arch/x86/src/lib/include/cpu.h new file mode 100644 index 0000000..60b7d24 --- /dev/null +++ b/CRIU_code/compel/arch/x86/src/lib/include/cpu.h @@ -0,0 +1,67 @@ +#ifndef __COMPEL_ASM_CPU_H__ +#define __COMPEL_ASM_CPU_H__ + +static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* ecx is often an input as well as an output. 
*/ + asm volatile("cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx) + : "memory"); +} + +static inline void cpuid(unsigned int op, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = 0; + native_cpuid(eax, ebx, ecx, edx); +} + +static inline void cpuid_count(unsigned int op, int count, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = count; + native_cpuid(eax, ebx, ecx, edx); +} + +static inline unsigned int cpuid_eax(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + return eax; +} + +static inline unsigned int cpuid_ebx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + return ebx; +} + +static inline unsigned int cpuid_ecx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + return ecx; +} + +static inline unsigned int cpuid_edx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + return edx; +} + +#endif diff --git a/CRIU_code/compel/arch/x86/src/lib/include/handle-elf.h b/CRIU_code/compel/arch/x86/src/lib/include/handle-elf.h new file mode 100644 index 0000000..e68fe3b --- /dev/null +++ b/CRIU_code/compel/arch/x86/src/lib/include/handle-elf.h @@ -0,0 +1,22 @@ +#ifndef COMPEL_HANDLE_ELF_H__ +#define COMPEL_HANDLE_ELF_H__ + +#include "elf64-types.h" + +#define ELF_X86_64 + +#ifndef R_X86_64_GOTPCRELX +# define R_X86_64_GOTPCRELX 41 +#endif + +#ifndef R_X86_64_REX_GOTPCRELX +# define R_X86_64_REX_GOTPCRELX 42 +#endif + +#define __handle_elf handle_elf_x86_64 +#define arch_is_machine_supported(e_machine) (e_machine == EM_X86_64) + +extern int handle_elf_x86_32(void *mem, size_t size); +extern int handle_elf_x86_64(void *mem, size_t size); + +#endif /* COMPEL_HANDLE_ELF_H__ */ diff --git a/CRIU_code/compel/arch/x86/src/lib/include/syscall.h 
b/CRIU_code/compel/arch/x86/src/lib/include/syscall.h new file mode 100644 index 0000000..9af1b1f --- /dev/null +++ b/CRIU_code/compel/arch/x86/src/lib/include/syscall.h @@ -0,0 +1,13 @@ +#ifndef __COMPEL_SYSCALL_H__ +#define __COMPEL_SYSCALL_H__ +#define __NR(syscall, compat) ((compat) ? __NR32_##syscall : __NR_##syscall) + +/* + * For x86_32 __NR_mmap inside the kernel represents old_mmap system + * call, but since we didn't use it yet lets go further and simply + * define own alias for __NR_mmap2 which would allow us to unify code + * between 32 and 64 bits version. + */ +#define __NR32_mmap __NR32_mmap2 + +#endif diff --git a/CRIU_code/compel/arch/x86/src/lib/include/uapi/asm/.gitignore b/CRIU_code/compel/arch/x86/src/lib/include/uapi/asm/.gitignore new file mode 100644 index 0000000..e69de29 diff --git a/CRIU_code/compel/arch/x86/src/lib/include/uapi/asm/breakpoints.h b/CRIU_code/compel/arch/x86/src/lib/include/uapi/asm/breakpoints.h new file mode 100644 index 0000000..980f25d --- /dev/null +++ b/CRIU_code/compel/arch/x86/src/lib/include/uapi/asm/breakpoints.h @@ -0,0 +1,6 @@ +#ifndef __COMPEL_BREAKPOINTS_H__ +#define __COMPEL_BREAKPOINTS_H__ +#define ARCH_SI_TRAP SI_KERNEL +extern int ptrace_set_breakpoint(pid_t pid, void *addr); +extern int ptrace_flush_breakpoints(pid_t pid); +#endif diff --git a/CRIU_code/compel/arch/x86/src/lib/include/uapi/asm/cpu.h b/CRIU_code/compel/arch/x86/src/lib/include/uapi/asm/cpu.h new file mode 100644 index 0000000..bb1914d --- /dev/null +++ b/CRIU_code/compel/arch/x86/src/lib/include/uapi/asm/cpu.h @@ -0,0 +1,350 @@ +#ifndef __CR_ASM_CPU_H__ +#define __CR_ASM_CPU_H__ + +#include + +#include + +/* + * Adopted from linux kernel and enhanced from Intel/AMD manuals. + * Note these bits are not ABI for linux kernel but they _are_ + * for us, so make sure they are at proper position between + * versions. + * + * In particular since we already used leaf 11 we have + * to keep it here, since it's an ABI now. 
+ */ +enum cpuid_leafs { + CPUID_1_EDX = 0, + CPUID_8000_0001_EDX = 1, + CPUID_8086_0001_EDX = 2, + CPUID_LNX_1 = 3, + CPUID_1_ECX = 4, + CPUID_C000_0001_EDX = 5, + CPUID_8000_0001_ECX = 6, + CPUID_LNX_2 = 7, + CPUID_LNX_3 = 8, + CPUID_7_0_EBX = 9, + CPUID_D_1_EAX = 10, + CPUID_7_0_ECX = 11, + CPUID_F_1_EDX = 12, + CPUID_8000_0008_EBX = 13, + CPUID_6_EAX = 14, + CPUID_8000_000A_EDX = 15, + CPUID_F_0_EDX = 16, + CPUID_8000_0007_EBX = 17, + CPUID_7_0_EDX = 18, +}; + +#define NCAPINTS_V1 12 +#define NCAPINTS_V2 19 + +#define NCAPINTS (NCAPINTS_V2) /* N 32-bit words worth of info */ +#define NCAPINTS_BITS (NCAPINTS * 32) + +/* Intel-defined CPU features, CPUID level 0x00000001 (EDX), word 0 */ +#define X86_FEATURE_FPU (0*32+ 0) /* Onboard FPU */ +#define X86_FEATURE_VME (0*32+ 1) /* Virtual Mode Extensions */ +#define X86_FEATURE_DE (0*32+ 2) /* Debugging Extensions */ +#define X86_FEATURE_PSE (0*32+ 3) /* Page Size Extensions */ +#define X86_FEATURE_TSC (0*32+ 4) /* Time Stamp Counter */ +#define X86_FEATURE_MSR (0*32+ 5) /* Model-Specific Registers */ +#define X86_FEATURE_PAE (0*32+ 6) /* Physical Address Extensions */ +#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Exception */ +#define X86_FEATURE_CX8 (0*32+ 8) /* CMPXCHG8 instruction */ +#define X86_FEATURE_APIC (0*32+ 9) /* Onboard APIC */ +#define X86_FEATURE_SEP (0*32+11) /* SYSENTER/SYSEXIT */ +#define X86_FEATURE_MTRR (0*32+12) /* Memory Type Range Registers */ +#define X86_FEATURE_PGE (0*32+13) /* Page Global Enable */ +#define X86_FEATURE_MCA (0*32+14) /* Machine Check Architecture */ +#define X86_FEATURE_CMOV (0*32+15) /* CMOV instructions (plus FCMOVcc, FCOMI with FPU) */ +#define X86_FEATURE_PAT (0*32+16) /* Page Attribute Table */ +#define X86_FEATURE_PSE36 (0*32+17) /* 36-bit PSEs */ +#define X86_FEATURE_PN (0*32+18) /* Processor serial number */ +#define X86_FEATURE_CLFLUSH (0*32+19) /* CLFLUSH instruction */ +#define X86_FEATURE_DS (0*32+21) /* "dts" Debug Store */ +#define X86_FEATURE_ACPI 
(0*32+22) /* ACPI via MSR */ +#define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */ +#define X86_FEATURE_FXSR (0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ +#define X86_FEATURE_XMM (0*32+25) /* "sse" */ +#define X86_FEATURE_XMM2 (0*32+26) /* "sse2" */ +#define X86_FEATURE_SELFSNOOP (0*32+27) /* "ss" CPU self snoop */ +#define X86_FEATURE_HT (0*32+28) /* Hyper-Threading */ +#define X86_FEATURE_ACC (0*32+29) /* "tm" Automatic clock control */ +#define X86_FEATURE_IA64 (0*32+30) /* IA-64 processor */ +#define X86_FEATURE_PBE (0*32+31) /* Pending Break Enable */ + +/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ +/* Don't duplicate feature flags which are redundant with Intel! */ +#define X86_FEATURE_SYSCALL (1*32+11) /* SYSCALL/SYSRET */ +#define X86_FEATURE_MP (1*32+19) /* MP Capable */ +#define X86_FEATURE_NX (1*32+20) /* Execute Disable */ +#define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ +#define X86_FEATURE_FXSR_OPT (1*32+25) /* FXSAVE/FXRSTOR optimizations */ +#define X86_FEATURE_GBPAGES (1*32+26) /* "pdpe1gb" GB pages */ +#define X86_FEATURE_RDTSCP (1*32+27) /* RDTSCP */ +#define X86_FEATURE_LM (1*32+29) /* Long Mode (x86-64, 64-bit support) */ +#define X86_FEATURE_3DNOWEXT (1*32+30) /* AMD 3DNow extensions */ +#define X86_FEATURE_3DNOW (1*32+31) /* 3DNow */ + +/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ +#define X86_FEATURE_RECOVERY (2*32+ 0) /* CPU in recovery mode */ +#define X86_FEATURE_LONGRUN (2*32+ 1) /* Longrun power control */ +#define X86_FEATURE_LRTI (2*32+ 3) /* LongRun table interface */ + +/* Other features, Linux-defined mapping, word 3 */ +/* This range is used for feature bits which conflict or are synthesized */ +#define X86_FEATURE_CXMMX (3*32+ 0) /* Cyrix MMX extensions */ +#define X86_FEATURE_K6_MTRR (3*32+ 1) /* AMD K6 nonstandard MTRRs */ +#define X86_FEATURE_CYRIX_ARR (3*32+ 2) /* Cyrix ARRs (= MTRRs) */ +#define X86_FEATURE_CENTAUR_MCR (3*32+ 3) /* Centaur MCRs (= MTRRs) */ + +/* 
CPU types for specific tunings: */ +#define X86_FEATURE_K8 (3*32+ 4) /* "" Opteron, Athlon64 */ +#define X86_FEATURE_K7 (3*32+ 5) /* "" Athlon */ +#define X86_FEATURE_P3 (3*32+ 6) /* "" P3 */ +#define X86_FEATURE_P4 (3*32+ 7) /* "" P4 */ +#define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */ +#define X86_FEATURE_UP (3*32+ 9) /* SMP kernel running on UP */ +#define X86_FEATURE_ART (3*32+10) /* Always running timer (ART) */ +#define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */ +#define X86_FEATURE_PEBS (3*32+12) /* Precise-Event Based Sampling */ +#define X86_FEATURE_BTS (3*32+13) /* Branch Trace Store */ +#define X86_FEATURE_SYSCALL32 (3*32+14) /* "" syscall in IA32 userspace */ +#define X86_FEATURE_SYSENTER32 (3*32+15) /* "" sysenter in IA32 userspace */ +#define X86_FEATURE_REP_GOOD (3*32+16) /* REP microcode works well */ +#define X86_FEATURE_MFENCE_RDTSC (3*32+17) /* "" MFENCE synchronizes RDTSC */ +#define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" LFENCE synchronizes RDTSC */ +#define X86_FEATURE_ACC_POWER (3*32+19) /* AMD Accumulated Power Mechanism */ +#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */ +#define X86_FEATURE_ALWAYS (3*32+21) /* "" Always-present feature */ +#define X86_FEATURE_XTOPOLOGY (3*32+22) /* CPU topology enum extensions */ +#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */ +#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ +#define X86_FEATURE_CPUID (3*32+25) /* CPU has CPUID instruction itself */ +#define X86_FEATURE_EXTD_APICID (3*32+26) /* Extended APICID (8 bits) */ +#define X86_FEATURE_AMD_DCM (3*32+27) /* AMD multi-node processor */ +#define X86_FEATURE_APERFMPERF (3*32+28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */ +#define X86_FEATURE_NONSTOP_TSC_S3 (3*32+30) /* TSC doesn't stop in S3 state */ +#define X86_FEATURE_TSC_KNOWN_FREQ (3*32+31) /* TSC has known frequency */ + +/* 
Intel-defined CPU features, CPUID level 0x00000001 (ECX), word 4 */ +#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ +#define X86_FEATURE_PCLMULQDQ (4*32+ 1) /* PCLMULQDQ instruction */ +#define X86_FEATURE_DTES64 (4*32+ 2) /* 64-bit Debug Store */ +#define X86_FEATURE_MWAIT (4*32+ 3) /* "monitor" MONITOR/MWAIT support */ +#define X86_FEATURE_DSCPL (4*32+ 4) /* "ds_cpl" CPL-qualified (filtered) Debug Store */ +#define X86_FEATURE_VMX (4*32+ 5) /* Hardware virtualization */ +#define X86_FEATURE_SMX (4*32+ 6) /* Safer Mode eXtensions */ +#define X86_FEATURE_EST (4*32+ 7) /* Enhanced SpeedStep */ +#define X86_FEATURE_TM2 (4*32+ 8) /* Thermal Monitor 2 */ +#define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */ +#define X86_FEATURE_CID (4*32+10) /* Context ID */ +#define X86_FEATURE_SDBG (4*32+11) /* Silicon Debug */ +#define X86_FEATURE_FMA (4*32+12) /* Fused multiply-add */ +#define X86_FEATURE_CX16 (4*32+13) /* CMPXCHG16B instruction */ +#define X86_FEATURE_XTPR (4*32+14) /* Send Task Priority Messages */ +#define X86_FEATURE_PDCM (4*32+15) /* Perf/Debug Capabilities MSR */ +#define X86_FEATURE_PCID (4*32+17) /* Process Context Identifiers */ +#define X86_FEATURE_DCA (4*32+18) /* Direct Cache Access */ +#define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */ +#define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */ +#define X86_FEATURE_X2APIC (4*32+21) /* X2APIC */ +#define X86_FEATURE_MOVBE (4*32+22) /* MOVBE instruction */ +#define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */ +#define X86_FEATURE_TSC_DEADLINE_TIMER (4*32+24) /* TSC deadline timer */ +#define X86_FEATURE_AES (4*32+25) /* AES instructions */ +#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV instructions */ +#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE instruction enabled in the OS */ +#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ +#define X86_FEATURE_F16C (4*32+29) /* 16-bit FP conversions */ +#define X86_FEATURE_RDRAND (4*32+30) 
/* RDRAND instruction */ +#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */ + +/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ +#define X86_FEATURE_XSTORE (5*32+ 2) /* "rng" RNG present (xstore) */ +#define X86_FEATURE_XSTORE_EN (5*32+ 3) /* "rng_en" RNG enabled */ +#define X86_FEATURE_XCRYPT (5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ +#define X86_FEATURE_XCRYPT_EN (5*32+ 7) /* "ace_en" on-CPU crypto enabled */ +#define X86_FEATURE_ACE2 (5*32+ 8) /* Advanced Cryptography Engine v2 */ +#define X86_FEATURE_ACE2_EN (5*32+ 9) /* ACE v2 enabled */ +#define X86_FEATURE_PHE (5*32+10) /* PadLock Hash Engine */ +#define X86_FEATURE_PHE_EN (5*32+11) /* PHE enabled */ +#define X86_FEATURE_PMM (5*32+12) /* PadLock Montgomery Multiplier */ +#define X86_FEATURE_PMM_EN (5*32+13) /* PMM enabled */ + +/* More extended AMD flags: CPUID level 0x80000001, ECX, word 6 */ +#define X86_FEATURE_LAHF_LM (6*32+ 0) /* LAHF/SAHF in long mode */ +#define X86_FEATURE_CMP_LEGACY (6*32+ 1) /* If yes HyperThreading not valid */ +#define X86_FEATURE_SVM (6*32+ 2) /* Secure Virtual Machine */ +#define X86_FEATURE_EXTAPIC (6*32+ 3) /* Extended APIC space */ +#define X86_FEATURE_CR8_LEGACY (6*32+ 4) /* CR8 in 32-bit mode */ +#define X86_FEATURE_ABM (6*32+ 5) /* Advanced bit manipulation */ +#define X86_FEATURE_SSE4A (6*32+ 6) /* SSE-4A */ +#define X86_FEATURE_MISALIGNSSE (6*32+ 7) /* Misaligned SSE mode */ +#define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */ +#define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */ +#define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */ +#define X86_FEATURE_XOP (6*32+11) /* extended AVX instructions */ +#define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */ +#define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */ +#define X86_FEATURE_LWP (6*32+15) /* Light Weight Profiling */ +#define X86_FEATURE_FMA4 (6*32+16) /* 4 operands MAC instructions */ +#define 
X86_FEATURE_TCE (6*32+17) /* Translation Cache Extension */ +#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */ +#define X86_FEATURE_TBM (6*32+21) /* Trailing Bit Manipulations */ +#define X86_FEATURE_TOPOEXT (6*32+22) /* Topology extensions CPUID leafs */ +#define X86_FEATURE_PERFCTR_CORE (6*32+23) /* Core performance counter extensions */ +#define X86_FEATURE_PERFCTR_NB (6*32+24) /* NB performance counter extensions */ +#define X86_FEATURE_BPEXT (6*32+26) /* Data breakpoint extension */ +#define X86_FEATURE_PTSC (6*32+27) /* Performance time-stamp counter */ +#define X86_FEATURE_PERFCTR_LLC (6*32+28) /* Last Level Cache performance counter extensions */ +#define X86_FEATURE_MWAITX (6*32+29) /* MWAIT extension (MONITORX/MWAITX instructions) */ + +/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ +#define X86_FEATURE_FSGSBASE (9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ +#define X86_FEATURE_TSC_ADJUST (9*32+ 1) /* TSC adjustment MSR 0x3B */ +#define X86_FEATURE_BMI1 (9*32+ 3) /* 1st group bit manipulation extensions */ +#define X86_FEATURE_HLE (9*32+ 4) /* Hardware Lock Elision */ +#define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */ +#define X86_FEATURE_SMEP (9*32+ 7) /* Supervisor Mode Execution Protection */ +#define X86_FEATURE_BMI2 (9*32+ 8) /* 2nd group bit manipulation extensions */ +#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB instructions */ +#define X86_FEATURE_INVPCID (9*32+10) /* Invalidate Processor Context ID */ +#define X86_FEATURE_RTM (9*32+11) /* Restricted Transactional Memory */ +#define X86_FEATURE_CQM (9*32+12) /* Cache QoS Monitoring */ +#define X86_FEATURE_MPX (9*32+14) /* Memory Protection Extension */ +#define X86_FEATURE_RDT_A (9*32+15) /* Resource Director Technology Allocation */ +#define X86_FEATURE_AVX512F (9*32+16) /* AVX-512 Foundation */ +#define X86_FEATURE_AVX512DQ (9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ +#define X86_FEATURE_RDSEED 
(9*32+18) /* RDSEED instruction */ +#define X86_FEATURE_ADX (9*32+19) /* ADCX and ADOX instructions */ +#define X86_FEATURE_SMAP (9*32+20) /* Supervisor Mode Access Prevention */ +#define X86_FEATURE_AVX512IFMA (9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ +#define X86_FEATURE_CLFLUSHOPT (9*32+23) /* CLFLUSHOPT instruction */ +#define X86_FEATURE_CLWB (9*32+24) /* CLWB instruction */ +#define X86_FEATURE_INTEL_PT (9*32+25) /* Intel Processor Trace */ +#define X86_FEATURE_AVX512PF (9*32+26) /* AVX-512 Prefetch */ +#define X86_FEATURE_AVX512ER (9*32+27) /* AVX-512 Exponential and Reciprocal */ +#define X86_FEATURE_AVX512CD (9*32+28) /* AVX-512 Conflict Detection */ +#define X86_FEATURE_SHA_NI (9*32+29) /* SHA1/SHA256 Instruction Extensions */ +#define X86_FEATURE_AVX512BW (9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */ +#define X86_FEATURE_AVX512VL (9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */ + +/* Extended state features, CPUID level 0x0000000d:1 (EAX), word 10 */ +#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT instruction */ +#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */ +#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */ +#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */ + +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 11 */ +#define X86_FEATURE_PREFETCHWT1 (11*32+ 0) /* PREFETCHWT1 Intel® Xeon PhiTM only */ +#define X86_FEATURE_AVX512VBMI (11*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ +#define X86_FEATURE_UMIP (11*32+ 2) /* User Mode Instruction Protection */ +#define X86_FEATURE_PKU (11*32+ 3) /* Protection Keys for Userspace */ +#define X86_FEATURE_OSPKE (11*32+ 4) /* OS Protection Keys Enable */ +#define X86_FEATURE_AVX512_VBMI2 (11*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ +#define X86_FEATURE_GFNI (11*32+ 8) /* Galois Field New Instructions */ +#define X86_FEATURE_VAES (11*32+ 9) /* 
Vector AES */ +#define X86_FEATURE_VPCLMULQDQ (11*32+10) /* Carry-Less Multiplication Double Quadword */ +#define X86_FEATURE_AVX512_VNNI (11*32+11) /* Vector Neural Network Instructions */ +#define X86_FEATURE_AVX512_BITALG (11*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */ +#define X86_FEATURE_TME (11*32+13) /* Intel Total Memory Encryption */ +#define X86_FEATURE_AVX512_VPOPCNTDQ (11*32+14) /* POPCNT for vectors of DW/QW */ +#define X86_FEATURE_LA57 (11*32+16) /* 5-level page tables */ +#define X86_FEATURE_RDPID (11*32+22) /* RDPID instruction */ +#define X86_FEATURE_CLDEMOTE (11*32+25) /* CLDEMOTE instruction */ + +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (EDX), word 12 */ +#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring */ +#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */ +#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */ + +/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ +#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ +#define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ +#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ +#define X86_FEATURE_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ +#define X86_FEATURE_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */ +#define X86_FEATURE_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */ + +/* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ +#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ +#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ +#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ +#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ +#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ +#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ 
+#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ +#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ +#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */ +#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ +#define X86_FEATURE_HDC (14*32+13) /* HDC base registers present */ + +/* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */ +#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ +#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ +#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ +#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ +#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ +#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ +#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ +#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ +#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ +#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ +#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ +#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */ +#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ + +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (EDX), word 16 */ +#define X86_FEATURE_CQM_LLC (16*32+ 1) /* LLC QoS if 1 */ + +/* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */ +#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */ +#define X86_FEATURE_SUCCOR (17*32+ 1) /* Uncorrectable error containment and recovery */ +#define X86_FEATURE_SMCA (17*32+ 3) /* Scalable MCA */ + +/* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ +#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ +#define 
X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ +#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ +#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ +#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */ + +enum { + X86_VENDOR_INTEL = 0, + X86_VENDOR_AMD = 1, + + X86_VENDOR_MAX +}; + +struct cpuinfo_x86 { + /* cpu context */ + uint8_t x86_family; + uint8_t x86_vendor; + uint8_t x86_model; + uint8_t x86_mask; + uint32_t x86_capability[NCAPINTS]; + uint32_t x86_power; + uint32_t extended_cpuid_level; + int cpuid_level; + char x86_vendor_id[16]; + char x86_model_id[64]; + + /* fpu context */ + uint64_t xfeatures_mask; + uint32_t xsave_size_max; + uint32_t xsave_size; + uint32_t xstate_offsets[XFEATURE_MAX]; + uint32_t xstate_sizes[XFEATURE_MAX]; + + uint32_t xsaves_size; + uint32_t xstate_comp_offsets[XFEATURE_MAX]; + uint32_t xstate_comp_sizes[XFEATURE_MAX]; +}; + +typedef struct cpuinfo_x86 compel_cpuinfo_t; + +#endif /* __CR_ASM_CPU_H__ */ diff --git a/CRIU_code/compel/arch/x86/src/lib/include/uapi/asm/fpu.h b/CRIU_code/compel/arch/x86/src/lib/include/uapi/asm/fpu.h new file mode 100644 index 0000000..509f448 --- /dev/null +++ b/CRIU_code/compel/arch/x86/src/lib/include/uapi/asm/fpu.h @@ -0,0 +1,321 @@ +#ifndef __CR_ASM_FPU_H__ +#define __CR_ASM_FPU_H__ + +#include +#include +#include + +#include + +#define FP_MIN_ALIGN_BYTES 64 +#define FXSAVE_ALIGN_BYTES 16 + +#define FP_XSTATE_MAGIC1 0x46505853U +#define FP_XSTATE_MAGIC2 0x46505845U +#ifndef FP_XSTATE_MAGIC2_SIZE +#define FP_XSTATE_MAGIC2_SIZE sizeof(FP_XSTATE_MAGIC2) +#endif + +#define XSTATE_FP 0x1 +#define XSTATE_SSE 0x2 +#define XSTATE_YMM 0x4 + +#define FXSAVE_SIZE 512 +#define XSAVE_SIZE 4096 + +#define 
XSAVE_HDR_SIZE 64 +#define XSAVE_HDR_OFFSET FXSAVE_SIZE + +#define XSAVE_YMM_SIZE 256 +#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) + +/* + * List of XSAVE features Linux knows about: + */ +enum xfeature { + XFEATURE_FP, + XFEATURE_SSE, + /* + * Values above here are "legacy states". + * Those below are "extended states". + */ + XFEATURE_YMM, + XFEATURE_BNDREGS, + XFEATURE_BNDCSR, + XFEATURE_OPMASK, + XFEATURE_ZMM_Hi256, + XFEATURE_Hi16_ZMM, + XFEATURE_PT, + XFEATURE_PKRU, + XFEATURE_HDC, + + XFEATURE_MAX, +}; + +#define XSTATE_CPUID 0x0000000d + +#define XFEATURE_MASK_FP (1 << XFEATURE_FP) +#define XFEATURE_MASK_SSE (1 << XFEATURE_SSE) +#define XFEATURE_MASK_YMM (1 << XFEATURE_YMM) +#define XFEATURE_MASK_BNDREGS (1 << XFEATURE_BNDREGS) +#define XFEATURE_MASK_BNDCSR (1 << XFEATURE_BNDCSR) +#define XFEATURE_MASK_OPMASK (1 << XFEATURE_OPMASK) +#define XFEATURE_MASK_ZMM_Hi256 (1 << XFEATURE_ZMM_Hi256) +#define XFEATURE_MASK_Hi16_ZMM (1 << XFEATURE_Hi16_ZMM) +#define XFEATURE_MASK_PT (1 << XFEATURE_PT) +#define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU) +#define XFEATURE_MASK_HDC (1 << XFEATURE_HDC) +#define XFEATURE_MASK_MAX (1 << XFEATURE_MAX) + +#define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE) +#define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK | XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM) + +#define FIRST_EXTENDED_XFEATURE XFEATURE_YMM + +/* Supervisor features */ +#define XFEATURE_MASK_SUPERVISOR (XFEATURE_MASK_PT | XFEATURE_HDC) + +/* All currently supported features */ +#define XCNTXT_MASK \ + (XFEATURE_MASK_FP | XFEATURE_MASK_SSE | \ + XFEATURE_MASK_YMM | XFEATURE_MASK_OPMASK | \ + XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM | \ + XFEATURE_MASK_PKRU | XFEATURE_MASK_BNDREGS | \ + XFEATURE_MASK_BNDCSR) + +struct fpx_sw_bytes { + uint32_t magic1; + uint32_t extended_size; + uint64_t xstate_bv; + uint32_t xstate_size; + uint32_t padding[7]; +}; + +struct i387_fxsave_struct { + uint16_t cwd; /* Control Word */ + uint16_t 
swd; /* Status Word */ + uint16_t twd; /* Tag Word */ + uint16_t fop; /* Last Instruction Opcode */ + union { + struct { + uint64_t rip; /* Instruction Pointer */ + uint64_t rdp; /* Data Pointer */ + }; + struct { + uint32_t fip; /* FPU IP Offset */ + uint32_t fcs; /* FPU IP Selector */ + uint32_t foo; /* FPU Operand Offset */ + uint32_t fos; /* FPU Operand Selector */ + }; + }; + uint32_t mxcsr; /* MXCSR Register State */ + uint32_t mxcsr_mask; /* MXCSR Mask */ + + /* 8*16 bytes for each FP-reg = 128 bytes */ + uint32_t st_space[32]; + + /* 16*16 bytes for each XMM-reg = 256 bytes */ + uint32_t xmm_space[64]; + + uint32_t padding[12]; + + union { + uint32_t padding1[12]; + uint32_t sw_reserved[12]; + }; + +} __aligned(FXSAVE_ALIGN_BYTES); + +struct xsave_hdr_struct { + uint64_t xstate_bv; + uint64_t xcomp_bv; + uint64_t reserved[6]; +} __packed; + +/* + * xstate_header.xcomp_bv[63] indicates that the extended_state_area + * is in compacted format. + */ +#define XCOMP_BV_COMPACTED_FORMAT ((uint64_t)1 << 63) + +/* + * State component 2: + * + * There are 16x 256-bit AVX registers named YMM0-YMM15. + * The low 128 bits are aliased to the 16 SSE registers (XMM0-XMM15) + * and are stored in 'struct fxregs_state::xmm_space[]' in the + * "legacy" area. + * + * The high 128 bits are stored here. + */ +struct ymmh_struct { + uint32_t ymmh_space[64]; +} __packed; + +/* Intel MPX support: */ + +struct mpx_bndreg { + uint64_t lower_bound; + uint64_t upper_bound; +} __packed; + +/* + * State component 3 is used for the 4 128-bit bounds registers + */ +struct mpx_bndreg_state { + struct mpx_bndreg bndreg[4]; +} __packed; + +/* + * State component 4 is used for the 64-bit user-mode MPX + * configuration register BNDCFGU and the 64-bit MPX status + * register BNDSTATUS. We call the pair "BNDCSR". + */ +struct mpx_bndcsr { + uint64_t bndcfgu; + uint64_t bndstatus; +} __packed; + +/* + * The BNDCSR state is padded out to be 64-bytes in size. 
+ */ +struct mpx_bndcsr_state { + union { + struct mpx_bndcsr bndcsr; + uint8_t pad_to_64_bytes[64]; + }; +} __packed; + +/* AVX-512 Components: */ + +/* + * State component 5 is used for the 8 64-bit opmask registers + * k0-k7 (opmask state). + */ +struct avx_512_opmask_state { + uint64_t opmask_reg[8]; +} __packed; + +/* + * State component 6 is used for the upper 256 bits of the + * registers ZMM0-ZMM15. These 16 256-bit values are denoted + * ZMM0_H-ZMM15_H (ZMM_Hi256 state). + */ +struct avx_512_zmm_uppers_state { + uint64_t zmm_upper[16 * 4]; +} __packed; + +/* + * State component 7 is used for the 16 512-bit registers + * ZMM16-ZMM31 (Hi16_ZMM state). + */ +struct avx_512_hi16_state { + uint64_t hi16_zmm[16 * 8]; +} __packed; + +/* + * State component 9: 32-bit PKRU register. The state is + * 8 bytes long but only 4 bytes is used currently. + */ +struct pkru_state { + uint32_t pkru; + uint32_t pad; +} __packed; + +/* + * This is our most modern FPU state format, as saved by the XSAVE + * and restored by the XRSTOR instructions. + * + * It consists of a legacy fxregs portion, an xstate header and + * subsequent areas as defined by the xstate header. Not all CPUs + * support all the extensions, so the size of the extended area + * can vary quite a bit between CPUs. + * + * + * One page should be enough for the whole xsave state. + */ +#define EXTENDED_STATE_AREA_SIZE (4096 - sizeof(struct i387_fxsave_struct) - sizeof(struct xsave_hdr_struct)) + +/* + * cpu requires it to be 64 byte aligned + */ +struct xsave_struct { + struct i387_fxsave_struct i387; + struct xsave_hdr_struct xsave_hdr; + union { + /* + * This ymmh is unneeded, kept for + * backward compatibility. 
+ */ + struct ymmh_struct ymmh; + uint8_t extended_state_area[EXTENDED_STATE_AREA_SIZE]; + }; +} __aligned(FP_MIN_ALIGN_BYTES) __packed; + +struct xsave_struct_ia32 { + struct i387_fxsave_struct i387; + struct xsave_hdr_struct xsave_hdr; + union { + /* + * This ymmh is unneeded, kept for + * backward compatibility. + */ + struct ymmh_struct ymmh; + uint8_t extended_state_area[EXTENDED_STATE_AREA_SIZE]; + }; +} __aligned(FXSAVE_ALIGN_BYTES); + +typedef struct { + /* + * The FPU xsave area must be contiguous and FP_MIN_ALIGN_BYTES + * aligned, thus make sure the compiler won't insert any hole here. + */ + + union { + struct xsave_struct xsave; + uint8_t __pad[sizeof(struct xsave_struct) + FP_XSTATE_MAGIC2_SIZE]; + }; + + uint8_t has_fpu; +} fpu_state_64_t; + +struct user_i387_ia32_struct { + uint32_t cwd; /* FPU Control Word */ + uint32_t swd; /* FPU Status Word */ + uint32_t twd; /* FPU Tag Word */ + uint32_t fip; /* FPU IP Offset */ + uint32_t fcs; /* FPU IP Selector */ + uint32_t foo; /* FPU Operand Pointer Offset */ + uint32_t fos; /* FPU Operand Pointer Selector */ + uint32_t st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ +}; + +typedef struct { + struct { + struct user_i387_ia32_struct i387_ia32; + + /* Software status information [not touched by FSAVE]: */ + uint32_t status; + } fregs_state; + union { + struct xsave_struct_ia32 xsave; + uint8_t __pad[sizeof(struct xsave_struct) + FP_XSTATE_MAGIC2_SIZE]; + } __aligned(FXSAVE_ALIGN_BYTES); +} __aligned(FXSAVE_ALIGN_BYTES) fpu_state_ia32_t; + +/* + * This one is used in restorer. 
+ */ +typedef struct { + union { + fpu_state_64_t fpu_state_64; + fpu_state_ia32_t fpu_state_ia32; + }; + + uint8_t has_fpu; +} fpu_state_t; + +extern void compel_convert_from_fxsr(struct user_i387_ia32_struct *env, + struct i387_fxsave_struct *fxsave); + +#endif /* __CR_ASM_FPU_H__ */ diff --git a/CRIU_code/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h b/CRIU_code/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h new file mode 100644 index 0000000..e6d3949 --- /dev/null +++ b/CRIU_code/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h @@ -0,0 +1,127 @@ +#ifndef UAPI_COMPEL_ASM_TYPES_H__ +#define UAPI_COMPEL_ASM_TYPES_H__ + +#include +#include +#include +#include + +#define SIGMAX 64 +#define SIGMAX_OLD 31 + +typedef struct { + uint64_t r15; + uint64_t r14; + uint64_t r13; + uint64_t r12; + uint64_t bp; + uint64_t bx; + uint64_t r11; + uint64_t r10; + uint64_t r9; + uint64_t r8; + uint64_t ax; + uint64_t cx; + uint64_t dx; + uint64_t si; + uint64_t di; + uint64_t orig_ax; + uint64_t ip; + uint64_t cs; + uint64_t flags; + uint64_t sp; + uint64_t ss; + uint64_t fs_base; + uint64_t gs_base; + uint64_t ds; + uint64_t es; + uint64_t fs; + uint64_t gs; +} user_regs_struct64; + +typedef struct { + uint32_t bx; + uint32_t cx; + uint32_t dx; + uint32_t si; + uint32_t di; + uint32_t bp; + uint32_t ax; + uint32_t ds; + uint32_t es; + uint32_t fs; + uint32_t gs; + uint32_t orig_ax; + uint32_t ip; + uint32_t cs; + uint32_t flags; + uint32_t sp; + uint32_t ss; +} user_regs_struct32; + +/* + * To be sure that we rely on inited reg->__is_native, this member + * is (short int) instead of initial (bool). The right way to + * check if regs are native or compat is to use user_regs_native() macro. 
+ * This should cost nothing, as *usually* sizeof(bool) == sizeof(short) + */ +typedef struct { + union { + user_regs_struct64 native; + user_regs_struct32 compat; + }; + short __is_native; /* use user_regs_native macro to check it */ +} user_regs_struct_t; + +#define NATIVE_MAGIC 0x0A +#define COMPAT_MAGIC 0x0C +static inline bool user_regs_native(user_regs_struct_t *pregs) +{ + return pregs->__is_native == NATIVE_MAGIC; +} + +#define get_user_reg(pregs, name) \ + ((user_regs_native(pregs)) ? \ + ((pregs)->native.name) : \ + ((pregs)->compat.name)) + +#define set_user_reg(pregs, name, val) \ + ((user_regs_native(pregs)) ? \ + ((pregs)->native.name = (val)) : \ + ((pregs)->compat.name = (val))) + +#if 0 +typedef struct { + unsigned short cwd; + unsigned short swd; + unsigned short twd; /* Note this is not the same as + the 32bit/x87/FSAVE twd */ + unsigned short fop; + u64 rip; + u64 rdp; + u32 mxcsr; + u32 mxcsr_mask; + u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ + u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ + u32 padding[24]; +} user_fpregs_struct_t; +#endif + +typedef struct xsave_struct user_fpregs_struct_t; + +#define REG_RES(regs) get_user_reg(&regs, ax) +#define REG_IP(regs) get_user_reg(&regs, ip) +#define REG_SP(regs) get_user_reg(&regs, sp) +#define REG_SYSCALL_NR(regs) get_user_reg(&regs, orig_ax) + +#define __NR(syscall, compat) ((compat) ? __NR32_##syscall : __NR_##syscall) + +/* + * For x86_32 __NR_mmap inside the kernel represents old_mmap system + * call, but since we didn't use it yet lets go further and simply + * define own alias for __NR_mmap2 which would allow us to unify code + * between 32 and 64 bits version. 
+ */ +#define __NR32_mmap __NR32_mmap2 + +#endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/CRIU_code/compel/arch/x86/src/lib/include/uapi/asm/processor-flags.h b/CRIU_code/compel/arch/x86/src/lib/include/uapi/asm/processor-flags.h new file mode 100644 index 0000000..9f1bccd --- /dev/null +++ b/CRIU_code/compel/arch/x86/src/lib/include/uapi/asm/processor-flags.h @@ -0,0 +1,28 @@ +#ifndef __CR_PROCESSOR_FLAGS_H__ +#define __CR_PROCESSOR_FLAGS_H__ + +/* Taken from linux kernel headers */ + +/* + * EFLAGS bits + */ +#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ +#define X86_EFLAGS_BIT1 0x00000002 /* Bit 1 - always on */ +#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ +#define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */ +#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ +#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ +#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ +#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */ +#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */ +#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */ +#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */ +#define X86_EFLAGS_NT 0x00004000 /* Nested Task */ +#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */ +#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */ +#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */ +#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */ +#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ +#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ + +#endif /* __CR_PROCESSOR_FLAGS_H__ */ diff --git a/CRIU_code/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h b/CRIU_code/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h new file mode 100644 index 0000000..51ca023 --- /dev/null +++ b/CRIU_code/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h @@ -0,0 +1,220 @@ +#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ +#define UAPI_COMPEL_ASM_SIGFRAME_H__ + +#include +#include +#include + +#include +#include + +#define SIGFRAME_MAX_OFFSET 
8 + +struct rt_sigcontext { + uint64_t r8; + uint64_t r9; + uint64_t r10; + uint64_t r11; + uint64_t r12; + uint64_t r13; + uint64_t r14; + uint64_t r15; + uint64_t rdi; + uint64_t rsi; + uint64_t rbp; + uint64_t rbx; + uint64_t rdx; + uint64_t rax; + uint64_t rcx; + uint64_t rsp; + uint64_t rip; + uint64_t eflags; + uint16_t cs; + uint16_t gs; + uint16_t fs; + uint16_t ss; + uint64_t err; + uint64_t trapno; + uint64_t oldmask; + uint64_t cr2; + uint64_t fpstate; + uint64_t reserved1[8]; +}; + +struct rt_sigcontext_32 { + uint32_t gs; + uint32_t fs; + uint32_t es; + uint32_t ds; + uint32_t di; + uint32_t si; + uint32_t bp; + uint32_t sp; + uint32_t bx; + uint32_t dx; + uint32_t cx; + uint32_t ax; + uint32_t trapno; + uint32_t err; + uint32_t ip; + uint32_t cs; + uint32_t flags; + uint32_t sp_at_signal; + uint32_t ss; + + uint32_t fpstate; + uint32_t oldmask; + uint32_t cr2; +}; + +#include + +/* + * XXX: move declarations to generic sigframe.h or sigframe-compat.h + * when (if) other architectures will support compatible C/R + */ + +typedef uint32_t compat_uptr_t; +typedef uint32_t compat_size_t; +typedef uint32_t compat_sigset_word; + +typedef struct compat_siginfo { + int si_signo; + int si_errno; + int si_code; + int _pad[128/sizeof(int) - 3]; +} compat_siginfo_t; + +typedef struct compat_sigaltstack { + compat_uptr_t ss_sp; + int ss_flags; + compat_size_t ss_size; +} compat_stack_t; + +#define _COMPAT_NSIG 64 +#define _COMPAT_NSIG_BPW 32 +#define _COMPAT_NSIG_WORDS (_COMPAT_NSIG / _COMPAT_NSIG_BPW) + +typedef struct { + compat_sigset_word sig[_COMPAT_NSIG_WORDS]; +} compat_sigset_t; + +struct ucontext_ia32 { + unsigned int uc_flags; + unsigned int uc_link; + compat_stack_t uc_stack; + struct rt_sigcontext_32 uc_mcontext; + compat_sigset_t uc_sigmask; /* mask last for extensibility */ +}; + +struct rt_sigframe_ia32 { + uint32_t pretcode; + int32_t sig; + uint32_t pinfo; + uint32_t puc; + compat_siginfo_t info; + struct ucontext_ia32 uc; + char retcode[8]; + + /* 
fp state follows here */ + fpu_state_t fpu_state; +}; + +struct rt_sigframe_64 { + char *pretcode; + struct rt_ucontext uc; + struct rt_siginfo info; + + /* fp state follows here */ + fpu_state_t fpu_state; +}; + +struct rt_sigframe { + union { + struct rt_sigframe_ia32 compat; + struct rt_sigframe_64 native; + }; + bool is_native; +}; + +static inline +void rt_sigframe_copy_sigset(struct rt_sigframe *to, k_rtsigset_t *from) +{ + size_t sz = sizeof(k_rtsigset_t); + + BUILD_BUG_ON(sz != sizeof(compat_sigset_t)); + if (to->is_native) + memcpy(&to->native.uc.uc_sigmask, from, sz); + else + memcpy(&to->compat.uc.uc_sigmask, from, sz); +} + +static inline +void rt_sigframe_erase_sigset(struct rt_sigframe *sigframe) +{ + size_t sz = sizeof(k_rtsigset_t); + + if (sigframe->is_native) + memset(&sigframe->native.uc.uc_sigmask, 0, sz); + else + memset(&sigframe->compat.uc.uc_sigmask, 0, sz); +} + +#define RT_SIGFRAME_REGIP(rt_sigframe) \ + ((rt_sigframe->is_native) ? \ + (rt_sigframe)->native.uc.uc_mcontext.rip : \ + (rt_sigframe)->compat.uc.uc_mcontext.ip) + +#define RT_SIGFRAME_FPU(rt_sigframe) \ + ((rt_sigframe->is_native) ? \ + (&(rt_sigframe)->native.fpu_state) : \ + (&(rt_sigframe)->compat.fpu_state)) + +#define RT_SIGFRAME_HAS_FPU(rt_sigframe) (RT_SIGFRAME_FPU(rt_sigframe)->has_fpu) + +/* + * Sigframe offset is different for native/compat tasks. + * Offsets calculations one may see at kernel: + * - compatible is in sys32_rt_sigreturn at arch/x86/ia32/ia32_signal.c + * - native is in sys_rt_sigreturn at arch/x86/kernel/signal.c + */ +#define RT_SIGFRAME_OFFSET(rt_sigframe) (((rt_sigframe)->is_native) ? 
8 : 4 ) + +#define USER32_CS 0x23 + +#define ARCH_RT_SIGRETURN_NATIVE(new_sp) \ + asm volatile( \ + "movq %0, %%rax \n" \ + "movq %%rax, %%rsp \n" \ + "movl $"__stringify(__NR_rt_sigreturn)", %%eax \n" \ + "syscall \n" \ + : \ + : "r"(new_sp) \ + : "rax","memory") +#define ARCH_RT_SIGRETURN_COMPAT(new_sp) \ + asm volatile( \ + "pushq $"__stringify(USER32_CS)" \n" \ + "pushq $1f \n" \ + "lretq \n" \ + "1: \n" \ + ".code32 \n" \ + "movl %%edi, %%esp \n" \ + "movl $"__stringify(__NR32_rt_sigreturn)",%%eax \n" \ + "int $0x80 \n" \ + ".code64 \n" \ + : \ + : "rdi"(new_sp) \ + : "eax", "r8", "r9", "r10", "r11", "memory") + +#define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ +do { \ + if ((rt_sigframe)->is_native) \ + ARCH_RT_SIGRETURN_NATIVE(new_sp); \ + else \ + ARCH_RT_SIGRETURN_COMPAT(new_sp); \ +} while (0) + +int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, + struct rt_sigframe *rsigframe); + +#endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ diff --git a/CRIU_code/compel/arch/x86/src/lib/infect.c b/CRIU_code/compel/arch/x86/src/lib/infect.c new file mode 100644 index 0000000..11e7f4c --- /dev/null +++ b/CRIU_code/compel/arch/x86/src/lib/infect.c @@ -0,0 +1,592 @@ +#include +#include +#include +#include +#include + +#include + +#include "asm/cpu.h" + +#include +#include +#include "errno.h" +#include +#include +#include "common/err.h" +#include "asm/infect-types.h" +#include "ptrace.h" +#include "infect.h" +#include "infect-priv.h" +#include "log.h" + +#ifndef NT_X86_XSTATE +#define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ +#endif +#ifndef NT_PRSTATUS +#define NT_PRSTATUS 1 /* Contains copy of prstatus struct */ +#endif + +/* + * Injected syscall instruction + */ +const char code_syscall[] = { + 0x0f, 0x05, /* syscall */ + 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc /* int 3, ... */ +}; + +const char code_int_80[] = { + 0xcd, 0x80, /* int $0x80 */ + 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc /* int 3, ... 
*/ +}; + +static const int +code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long)); +static const int +code_int_80_aligned = round_up(sizeof(code_int_80), sizeof(long)); /* sized from its own array so the check below really covers code_int_80 */ + +static inline __always_unused void __check_code_syscall(void) +{ + BUILD_BUG_ON(code_int_80_aligned != BUILTIN_SYSCALL_SIZE); + BUILD_BUG_ON(code_syscall_aligned != BUILTIN_SYSCALL_SIZE); + BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); +} + +/* 10-byte legacy floating point register */ +struct fpreg { + uint16_t significand[4]; + uint16_t exponent; +}; + +/* 16-byte floating point register */ +struct fpxreg { + uint16_t significand[4]; + uint16_t exponent; + uint16_t padding[3]; +}; + +#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16) +#define FP_EXP_TAG_VALID 0 +#define FP_EXP_TAG_ZERO 1 +#define FP_EXP_TAG_SPECIAL 2 +#define FP_EXP_TAG_EMPTY 3 + +static inline uint32_t twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) +{ + struct fpxreg *st; + uint32_t tos = (fxsave->swd >> 11) & 7; + uint32_t twd = (unsigned long)fxsave->twd; + uint32_t tag; + uint32_t ret = 0xffff0000u; + int i; + + for (i = 0; i < 8; i++, twd >>= 1) { + if (twd & 0x1) { + st = FPREG_ADDR(fxsave, (i - tos) & 7); + + switch (st->exponent & 0x7fff) { + case 0x7fff: + tag = FP_EXP_TAG_SPECIAL; + break; + case 0x0000: + if (!st->significand[0] && + !st->significand[1] && + !st->significand[2] && + !st->significand[3]) + tag = FP_EXP_TAG_ZERO; + else + tag = FP_EXP_TAG_SPECIAL; + break; + default: + if (st->significand[3] & 0x8000) + tag = FP_EXP_TAG_VALID; + else + tag = FP_EXP_TAG_SPECIAL; + break; + } + } else { + tag = FP_EXP_TAG_EMPTY; + } + ret |= tag << (2 * i); + } + return ret; +} + +void compel_convert_from_fxsr(struct user_i387_ia32_struct *env, + struct i387_fxsave_struct *fxsave) +{ + struct fpxreg *from = (struct fpxreg *)&fxsave->st_space[0]; + struct fpreg *to = (struct fpreg *)env->st_space; + int i; + + env->cwd = fxsave->cwd | 0xffff0000u; + env->swd = fxsave->swd | 0xffff0000u; +
env->twd = twd_fxsr_to_i387(fxsave); + + env->fip = fxsave->rip; + env->foo = fxsave->rdp; + /* + * should be actually ds/cs at fpu exception time, but + * that information is not available in 64bit mode. + */ + env->fcs = 0x23; /* __USER32_CS */ + env->fos = 0x2b; /* __USER32_DS */ + env->fos |= 0xffff0000; + + for (i = 0; i < 8; ++i) + memcpy(&to[i], &from[i], sizeof(to[0])); +} + +int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, + user_regs_struct_t *regs, + user_fpregs_struct_t *fpregs) +{ + bool is_native = user_regs_native(regs); + fpu_state_t *fpu_state = is_native ? + &sigframe->native.fpu_state : + &sigframe->compat.fpu_state; + if (is_native) { +#define cpreg64_native(d, s) sigframe->native.uc.uc_mcontext.d = regs->native.s + cpreg64_native(rdi, di); + cpreg64_native(rsi, si); + cpreg64_native(rbp, bp); + cpreg64_native(rsp, sp); + cpreg64_native(rbx, bx); + cpreg64_native(rdx, dx); + cpreg64_native(rcx, cx); + cpreg64_native(rip, ip); + cpreg64_native(rax, ax); + cpreg64_native(r8, r8); + cpreg64_native(r9, r9); + cpreg64_native(r10, r10); + cpreg64_native(r11, r11); + cpreg64_native(r12, r12); + cpreg64_native(r13, r13); + cpreg64_native(r14, r14); + cpreg64_native(r15, r15); + cpreg64_native(cs, cs); + cpreg64_native(eflags, flags); + + sigframe->is_native = true; +#undef cpreg64_native + } else { +#define cpreg32_compat(d) sigframe->compat.uc.uc_mcontext.d = regs->compat.d + cpreg32_compat(gs); + cpreg32_compat(fs); + cpreg32_compat(es); + cpreg32_compat(ds); + cpreg32_compat(di); + cpreg32_compat(si); + cpreg32_compat(bp); + cpreg32_compat(sp); + cpreg32_compat(bx); + cpreg32_compat(dx); + cpreg32_compat(cx); + cpreg32_compat(ip); + cpreg32_compat(ax); + cpreg32_compat(cs); + cpreg32_compat(ss); + cpreg32_compat(flags); +#undef cpreg32_compat + sigframe->is_native = false; + } + + fpu_state->has_fpu = true; + if (is_native) { + memcpy(&fpu_state->fpu_state_64.xsave, fpregs, sizeof(*fpregs)); + } else { + 
memcpy(&fpu_state->fpu_state_ia32.xsave, fpregs, sizeof(*fpregs)); + compel_convert_from_fxsr(&fpu_state->fpu_state_ia32.fregs_state.i387_ia32, + &fpu_state->fpu_state_ia32.xsave.i387); + } + + return 0; +} + +int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, + struct rt_sigframe *rsigframe) +{ + fpu_state_t *fpu_state = (sigframe->is_native) ? + &rsigframe->native.fpu_state : + &rsigframe->compat.fpu_state; + + if (sigframe->is_native) { + unsigned long addr = (unsigned long)(void *)&fpu_state->fpu_state_64.xsave; + + if ((addr % 64ul)) { + pr_err("Unaligned address passed: %lx (native %d)\n", + addr, sigframe->is_native); + return -1; + } + + sigframe->native.uc.uc_mcontext.fpstate = (uint64_t)addr; + } else if (!sigframe->is_native) { + sigframe->compat.uc.uc_mcontext.fpstate = + (uint32_t)(unsigned long)(void *)&fpu_state->fpu_state_ia32; + } + + return 0; +} + +#define get_signed_user_reg(pregs, name) \ + ((user_regs_native(pregs)) ? (int64_t)((pregs)->native.name) : \ + (int32_t)((pregs)->compat.name)) + +static int get_task_xsave(pid_t pid, user_fpregs_struct_t *xsave) +{ + struct iovec iov; + + iov.iov_base = xsave; + iov.iov_len = sizeof(*xsave); + + if (ptrace(PTRACE_GETREGSET, pid, (unsigned int)NT_X86_XSTATE, &iov) < 0) { + pr_perror("Can't obtain FPU registers for %d", pid); + return -1; + } + + return 0; +} + +static int get_task_fpregs(pid_t pid, user_fpregs_struct_t *xsave) +{ + if (ptrace(PTRACE_GETFPREGS, pid, NULL, xsave)) { + pr_perror("Can't obtain FPU registers for %d", pid); + return -1; + } + + return 0; +} + +int get_task_regs(pid_t pid, user_regs_struct_t *regs, save_regs_t save, + void *arg, unsigned long flags) +{ + user_fpregs_struct_t xsave = { }, *xs = NULL; + int ret = -1; + + pr_info("Dumping general registers for %d in %s mode\n", pid, + user_regs_native(regs) ? "native" : "compat"); + + /* Did we come from a system call? 
*/ + if (get_signed_user_reg(regs, orig_ax) >= 0) { + /* Restart the system call */ + switch (get_signed_user_reg(regs, ax)) { + case -ERESTARTNOHAND: + case -ERESTARTSYS: + case -ERESTARTNOINTR: + set_user_reg(regs, ax, get_user_reg(regs, orig_ax)); + set_user_reg(regs, ip, get_user_reg(regs, ip) - 2); + break; + case -ERESTART_RESTARTBLOCK: + pr_warn("Will restore %d with interrupted system call\n", pid); + set_user_reg(regs, ax, -EINTR); + break; + } + } + + if (!compel_cpu_has_feature(X86_FEATURE_FPU)) + goto out; + + /* + * FPU fetched either via fxsave or via xsave, + * thus decode it accordingly. + */ + + pr_info("Dumping GP/FPU registers for %d\n", pid); + + if (!compel_cpu_has_feature(X86_FEATURE_OSXSAVE)) { + ret = get_task_fpregs(pid, &xsave); + } else if (unlikely(flags & INFECT_X86_PTRACE_MXCSR_BUG)) { + /* + * get_task_fpregs() will fill FP state, + * get_task_xsave() will overwrite rightly sse/mmx/etc + */ + pr_warn("Skylake xsave fpu bug workaround used\n"); + ret = get_task_fpregs(pid, &xsave); + if (!ret) + ret = get_task_xsave(pid, &xsave); + } else { + ret = get_task_xsave(pid, &xsave); + } + + if (ret) + goto err; + + xs = &xsave; +out: + ret = save(arg, regs, xs); +err: + return ret; +} + +int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, + unsigned long arg1, + unsigned long arg2, + unsigned long arg3, + unsigned long arg4, + unsigned long arg5, + unsigned long arg6) +{ + user_regs_struct_t regs = ctl->orig.regs; /* work on a local copy of the saved registers */ + bool native = user_regs_native(&regs); + int err; + + if (native) { + user_regs_struct64 *r = &regs.native; + + r->ax = (uint64_t)nr; + r->di = arg1; + r->si = arg2; + r->dx = arg3; + r->r10 = arg4; + r->r8 = arg5; + r->r9 = arg6; + + err = compel_execute_syscall(ctl, &regs, code_syscall); + } else { + user_regs_struct32 *r = &regs.compat; + + r->ax = (uint32_t)nr; + r->bx = arg1; + r->cx = arg2; + r->dx = arg3; + r->si = arg4; + r->di = arg5; + r->bp = arg6; + + err = compel_execute_syscall(ctl, &regs, code_int_80); + } + + *ret =
native ? + (long)get_user_reg(&regs, ax) : + (int)get_user_reg(&regs, ax); + return err; +} + +void *remote_mmap(struct parasite_ctl *ctl, + void *addr, size_t length, int prot, + int flags, int fd, off_t offset) +{ + long map; + int err; + bool compat_task = !user_regs_native(&ctl->orig.regs); + + err = compel_syscall(ctl, __NR(mmap, compat_task), &map, + (unsigned long)addr, length, prot, flags, fd, offset); + if (err < 0) + return NULL; + + if (map == -EACCES && (prot & PROT_WRITE) && (prot & PROT_EXEC)) { + pr_warn("mmap(PROT_WRITE | PROT_EXEC) failed for %d, " + "check selinux execmem policy\n", ctl->rpid); + return NULL; + } + if (IS_ERR_VALUE(map)) { + pr_err("remote mmap() failed: %s\n", strerror(-map)); + return NULL; + } + + /* + * For compat tasks the address in the foreign process + * must fit in 4 bytes, so keep only the low 32 bits. + */ + if (compat_task) + map &= 0xfffffffful; + + return (void *)map; +} + +/* + * regs must be inited when calling this function from original context + */ +void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) +{ + set_user_reg(regs, ip, new_ip); + if (stack) + set_user_reg(regs, sp, (unsigned long) stack); + + /* Avoid end of syscall processing */ + set_user_reg(regs, orig_ax, -1); + + /* Make sure flags are in known state */ + set_user_reg(regs, flags, get_user_reg(regs, flags) & + ~(X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_IF)); +} + +#define USER32_CS 0x23 +#define USER_CS 0x33 + +static bool ldt_task_selectors(pid_t pid) +{ + unsigned long cs; + + errno = 0; + /* + * Offset of register must be from 64-bit set even for + * compatible tasks.
Fix this to support native i386 tasks + */ + cs = ptrace(PTRACE_PEEKUSER, pid, offsetof(user_regs_struct64, cs), 0); + if (errno != 0) { + pr_perror("Can't get CS register for %d", pid); + return true; /* bool function: on failure report the task as not dumpable (same value the old -1 converted to) */ + } + + return cs != USER_CS && cs != USER32_CS; +} + +static int arch_task_compatible(pid_t pid) +{ + user_regs_struct_t r; + int ret = ptrace_get_regs(pid, &r); + + if (ret) + return -1; + + return !user_regs_native(&r); +} + +bool arch_can_dump_task(struct parasite_ctl *ctl) +{ + pid_t pid = ctl->rpid; + int ret; + + ret = arch_task_compatible(pid); + if (ret < 0) + return false; + + if (ret && !(ctl->ictx.flags & INFECT_COMPATIBLE)) { + pr_err("Can't dump task %d running in 32-bit mode\n", pid); + return false; + } + + if (ldt_task_selectors(pid)) { + pr_err("Can't dump task %d with LDT descriptors\n", pid); + return false; + } + + return true; +} + +int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) +{ + int native = compel_mode_native(ctl); + void *where = native ? + (void *)&s->native.uc.uc_stack : + (void *)&s->compat.uc.uc_stack; + long ret; + int err; + + err = compel_syscall(ctl, __NR(sigaltstack, !native), + &ret, 0, (unsigned long)where, + 0, 0, 0, 0); + return err ? err : ret; +} + +/* Copied from the gdb header gdb/nat/x86-dregs.h */ + +/* Debug registers' indices. */ +#define DR_FIRSTADDR 0 +#define DR_LASTADDR 3 +#define DR_NADDR 4 /* The number of debug address registers. */ +#define DR_STATUS 6 /* Index of debug status register (DR6). */ +#define DR_CONTROL 7 /* Index of debug control register (DR7). */ + +#define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit. */ +#define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit. */ +#define DR_ENABLE_SIZE 2 /* Two enable bits per debug register. */ + +/* Locally enable the break/watchpoint in the I'th debug register.
*/ +#define X86_DR_LOCAL_ENABLE(i) (1 << (DR_LOCAL_ENABLE_SHIFT + DR_ENABLE_SIZE * (i))) + +int ptrace_set_breakpoint(pid_t pid, void *addr) +{ + int ret; + + /* Set a breakpoint */ + if (ptrace(PTRACE_POKEUSER, pid, + offsetof(struct user, u_debugreg[DR_FIRSTADDR]), + addr)) { + pr_perror("Unable to setup a breakpoint into %d", pid); + return -1; + } + + /* Enable the breakpoint */ + if (ptrace(PTRACE_POKEUSER, pid, + offsetof(struct user, u_debugreg[DR_CONTROL]), + X86_DR_LOCAL_ENABLE(DR_FIRSTADDR))) { + pr_perror("Unable to enable the breakpoint for %d", pid); + return -1; + } + + ret = ptrace(PTRACE_CONT, pid, NULL, NULL); + if (ret) { + pr_perror("Unable to restart the stopped tracee process %d", pid); + return -1; + } + + return 1; +} + +int ptrace_flush_breakpoints(pid_t pid) +{ + /* Disable the breakpoint */ + if (ptrace(PTRACE_POKEUSER, pid, + offsetof(struct user, u_debugreg[DR_CONTROL]), + 0)) { + pr_perror("Unable to disable the breakpoint for %d", pid); + return -1; + } + + return 0; +} + +int ptrace_get_regs(pid_t pid, user_regs_struct_t *regs) +{ + struct iovec iov; + int ret; + + iov.iov_base = &regs->native; + iov.iov_len = sizeof(user_regs_struct64); + + ret = ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov); + if (ret == -1) { + pr_perror("PTRACE_GETREGSET failed"); + return -1; + } + + if (iov.iov_len == sizeof(regs->native)) { /* returned length tells which register set was read */ + regs->__is_native = NATIVE_MAGIC; + return ret; + } + if (iov.iov_len == sizeof(regs->compat)) { + regs->__is_native = COMPAT_MAGIC; + return ret; + } + + pr_err("PTRACE_GETREGSET read %zu bytes for pid %d, but native/compat regs sizes are %zu/%zu bytes\n", + iov.iov_len, pid, + sizeof(regs->native), sizeof(regs->compat)); + return -1; +} + +int ptrace_set_regs(pid_t pid, user_regs_struct_t *regs) +{ + struct iovec iov; + + if (user_regs_native(regs)) { + iov.iov_base = &regs->native; + iov.iov_len = sizeof(user_regs_struct64); + } else { + iov.iov_base = &regs->compat; + iov.iov_len = sizeof(user_regs_struct32); + } + return
ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov); +} + +#define TASK_SIZE ((1UL << 47) - PAGE_SIZE) +/* + * Task size may be limited to 3G but we need a + * higher limit, because it's backward compatible. + */ +#define TASK_SIZE_IA32 (0xffffe000) + +unsigned long compel_task_size(void) { return TASK_SIZE; } diff --git a/CRIU_code/compel/compel-host b/CRIU_code/compel/compel-host new file mode 100644 index 0000000..bf78862 --- /dev/null +++ b/CRIU_code/compel/compel-host @@ -0,0 +1,8 @@ +#!/bin/sh +# +# A wrapper to use compel-host right from the source dir +# (i.e. when it is not yet installed). + +COMPEL_UNINSTALLED_ROOTDIR=$(dirname "$0") +export COMPEL_UNINSTALLED_ROOTDIR +exec "${COMPEL_UNINSTALLED_ROOTDIR}/compel-host-bin" "$@" diff --git a/CRIU_code/compel/include/compel-cpu.h b/CRIU_code/compel/include/compel-cpu.h new file mode 100644 index 0000000..f30afa0 --- /dev/null +++ b/CRIU_code/compel/include/compel-cpu.h @@ -0,0 +1,12 @@ +#ifndef __COMPEL_CPU_H__ +#define __COMPEL_CPU_H__ + +#include +#include "asm/cpu.h" + +extern void compel_set_cpu_cap(compel_cpuinfo_t *info, unsigned int feature); +extern void compel_clear_cpu_cap(compel_cpuinfo_t *info, unsigned int feature); +extern int compel_test_cpu_cap(compel_cpuinfo_t *info, unsigned int feature); +extern int compel_test_fpu_cap(compel_cpuinfo_t *c, unsigned int feature); + +#endif diff --git a/CRIU_code/compel/include/elf32-types.h b/CRIU_code/compel/include/elf32-types.h new file mode 100644 index 0000000..b516ba1 --- /dev/null +++ b/CRIU_code/compel/include/elf32-types.h @@ -0,0 +1,16 @@ +#ifndef COMPEL_ELF32_TYPES_H__ +#define COMPEL_ELF32_TYPES_H__ + +#define Elf_Ehdr Elf32_Ehdr +#define Elf_Shdr Elf32_Shdr +#define Elf_Sym Elf32_Sym +#define Elf_Rel Elf32_Rel +#define Elf_Rela Elf32_Rela + +#define ELF_ST_TYPE ELF32_ST_TYPE +#define ELF_ST_BIND ELF32_ST_BIND + +#define ELF_R_SYM ELF32_R_SYM +#define ELF_R_TYPE ELF32_R_TYPE + +#endif /* COMPEL_ELF32_TYPES_H__ */ diff --git 
a/CRIU_code/compel/include/elf64-types.h b/CRIU_code/compel/include/elf64-types.h new file mode 100644 index 0000000..c4d5f1c --- /dev/null +++ b/CRIU_code/compel/include/elf64-types.h @@ -0,0 +1,16 @@ +#ifndef COMPEL_ELF64_TYPES_H__ +#define COMPEL_ELF64_TYPES_H__ + +#define Elf_Ehdr Elf64_Ehdr +#define Elf_Shdr Elf64_Shdr +#define Elf_Sym Elf64_Sym +#define Elf_Rel Elf64_Rel +#define Elf_Rela Elf64_Rela + +#define ELF_ST_TYPE ELF64_ST_TYPE +#define ELF_ST_BIND ELF64_ST_BIND + +#define ELF_R_SYM ELF64_R_SYM +#define ELF_R_TYPE ELF64_R_TYPE + +#endif /* COMPEL_ELF64_TYPES_H__ */ diff --git a/CRIU_code/compel/include/errno.h b/CRIU_code/compel/include/errno.h new file mode 100644 index 0000000..d41fd53 --- /dev/null +++ b/CRIU_code/compel/include/errno.h @@ -0,0 +1,9 @@ +#ifndef __COMPEL_ERRNO_H__ +#define __COMPEL_ERRNO_H__ + +#define ERESTARTSYS 512 +#define ERESTARTNOINTR 513 +#define ERESTARTNOHAND 514 +#define ERESTART_RESTARTBLOCK 516 + +#endif /* __COMPEL_ERRNO_H__ */ diff --git a/CRIU_code/compel/include/infect-priv.h b/CRIU_code/compel/include/infect-priv.h new file mode 100644 index 0000000..ec6dd45 --- /dev/null +++ b/CRIU_code/compel/include/infect-priv.h @@ -0,0 +1,71 @@ +#ifndef __COMPEL_INFECT_PRIV_H__ +#define __COMPEL_INFECT_PRIV_H__ + +#include + +#define BUILTIN_SYSCALL_SIZE 8 + +struct thread_ctx { + k_rtsigset_t sigmask; + user_regs_struct_t regs; +}; + +/* parasite control block */ +struct parasite_ctl { + int rpid; /* Real pid of the victim */ + void *remote_map; + void *local_map; + void *sigreturn_addr; /* A place for the breakpoint */ + unsigned long map_length; + + struct infect_ctx ictx; + + /* thread leader data */ + bool daemonized; + + struct thread_ctx orig; + + void *rstack; /* thread leader stack */ + struct rt_sigframe *sigframe; + struct rt_sigframe *rsigframe; /* address in a parasite */ + + void *r_thread_stack; /* stack for non-leader threads */ + + unsigned long parasite_ip; /* service routine start ip */ + + unsigned int
*addr_cmd; /* addr for command */ + void *addr_args; /* address for arguments */ + unsigned long args_size; + int tsock; /* transport socket for transferring fds */ + + struct parasite_blob_desc pblob; +}; + +struct parasite_thread_ctl { + int tid; + struct parasite_ctl *ctl; + struct thread_ctx th; +}; + +#define MEMFD_FNAME "CRIUMFD" +#define MEMFD_FNAME_SZ sizeof(MEMFD_FNAME) + +struct ctl_msg; +int parasite_wait_ack(int sockfd, unsigned int cmd, struct ctl_msg *m); + +extern void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs); +extern void *remote_mmap(struct parasite_ctl *ctl, + void *addr, size_t length, int prot, + int flags, int fd, off_t offset); +extern bool arch_can_dump_task(struct parasite_ctl *ctl); +extern int get_task_regs(pid_t pid, user_regs_struct_t *regs, save_regs_t save, + void *arg, unsigned long flags); +extern int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s); +extern int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, + user_regs_struct_t *regs, + user_fpregs_struct_t *fpregs); +extern int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, + struct rt_sigframe *rsigframe); +extern int compel_execute_syscall(struct parasite_ctl *ctl, + user_regs_struct_t *regs, const char *code_syscall); +#endif diff --git a/CRIU_code/compel/include/log.h b/CRIU_code/compel/include/log.h new file mode 100644 index 0000000..559f909 --- /dev/null +++ b/CRIU_code/compel/include/log.h @@ -0,0 +1,64 @@ +#ifndef COMPEL_LOG_H__ +#define COMPEL_LOG_H__ + +#include "uapi/compel/compel.h" +#include "uapi/compel/loglevels.h" + +#ifndef LOG_PREFIX +# define LOG_PREFIX +#endif + +static inline int pr_quelled(unsigned int loglevel) +{ + return compel_log_get_loglevel() < loglevel + && loglevel != COMPEL_LOG_MSG; +} + +extern void compel_print_on_level(unsigned int loglevel, + const char *format, ...) + __attribute__ ((__format__ (__printf__, 2, 3))); + +#define pr_msg(fmt, ...) 
\ + compel_print_on_level(COMPEL_LOG_MSG, \ + fmt, ##__VA_ARGS__) + +#define pr_info(fmt, ...) \ + compel_print_on_level(COMPEL_LOG_INFO, \ + LOG_PREFIX fmt, ##__VA_ARGS__) + +#define pr_err(fmt, ...) \ + compel_print_on_level(COMPEL_LOG_ERROR, \ + "Error (%s:%d): " LOG_PREFIX fmt, \ + __FILE__, __LINE__, ##__VA_ARGS__) + +#define pr_err_once(fmt, ...) \ + do { \ + static bool __printed; \ + if (!__printed) { \ + pr_err(fmt, ##__VA_ARGS__); \ + __printed = 1; \ + } \ + } while (0) + +#define pr_warn(fmt, ...) \ + compel_print_on_level(COMPEL_LOG_WARN, \ + "Warn (%s:%d): " LOG_PREFIX fmt, \ + __FILE__, __LINE__, ##__VA_ARGS__) + +#define pr_warn_once(fmt, ...) \ + do { \ + static bool __printed; \ + if (!__printed) { \ + pr_warn(fmt, ##__VA_ARGS__); \ + __printed = 1; \ + } \ + } while (0) + +#define pr_debug(fmt, ...) \ + compel_print_on_level(COMPEL_LOG_DEBUG, \ + LOG_PREFIX fmt, ##__VA_ARGS__) + +#define pr_perror(fmt, ...) \ + pr_err(fmt ": %m\n", ##__VA_ARGS__) + +#endif /* COMPEL_LOG_H__ */ diff --git a/CRIU_code/compel/include/piegen.h b/CRIU_code/compel/include/piegen.h new file mode 100644 index 0000000..fd72f9c --- /dev/null +++ b/CRIU_code/compel/include/piegen.h @@ -0,0 +1,28 @@ +#ifndef COMPEL_PIEGEN_H__ +#define COMPEL_PIEGEN_H__ + +#include +#include + +#include + +#include "common/compiler.h" + +typedef struct { + char *input_filename; + char *output_filename; + char *prefix; + FILE *fout; +} piegen_opt_t; + +extern piegen_opt_t opts; + +#define pr_out(fmt, ...) 
\ +do { \ + if (opts.fout) \ + fprintf(opts.fout, fmt, ##__VA_ARGS__); \ +} while (0) + +extern int handle_binary(void *mem, size_t size); + +#endif /* COMPEL_PIEGEN_H__ */ diff --git a/CRIU_code/compel/include/ptrace.h b/CRIU_code/compel/include/ptrace.h new file mode 100644 index 0000000..01f55c4 --- /dev/null +++ b/CRIU_code/compel/include/ptrace.h @@ -0,0 +1,13 @@ +#ifndef COMPEL_PTRACE_H__ +#define COMPEL_PTRACE_H__ + +#include +#include +#include + +#define PTRACE_SI_EVENT(_si_code) (((_si_code) & 0xFFFF) >> 8) + +extern int ptrace_get_regs(pid_t pid, user_regs_struct_t *regs); +extern int ptrace_set_regs(pid_t pid, user_regs_struct_t *regs); + +#endif /* COMPEL_PTRACE_H__ */ diff --git a/CRIU_code/compel/include/rpc-pie-priv.h b/CRIU_code/compel/include/rpc-pie-priv.h new file mode 100644 index 0000000..15f5b14 --- /dev/null +++ b/CRIU_code/compel/include/rpc-pie-priv.h @@ -0,0 +1,50 @@ +#ifndef __COMPEL_RPC_H__ +#define __COMPEL_RPC_H__ +struct ctl_msg { + uint32_t cmd; /* command itself */ + uint32_t ack; /* ack on command */ + int32_t err; /* error code on reply */ +}; + +#define ctl_msg_cmd(_cmd) \ + (struct ctl_msg){.cmd = _cmd, } + +#define ctl_msg_ack(_cmd, _err) \ + (struct ctl_msg){.cmd = _cmd, .ack = _cmd, .err = _err, } + +/* + * NOTE: each command's args should have an arch-independent size. + * If you want to use one of the standard types, declare + * an alternative type for compatible tasks in parasite-compat.h + */ +enum { + PARASITE_CMD_IDLE = 0, + PARASITE_CMD_ACK, + + PARASITE_CMD_INIT_DAEMON, + + /* + * This must be greater than INITs.
+ */ + PARASITE_CMD_FINI, + + __PARASITE_END_CMDS, +}; + +struct parasite_init_args { + int32_t h_addr_len; + struct sockaddr_un h_addr; + int32_t log_level; + uint64_t sigreturn_addr; + uint64_t sigframe; /* pointer to sigframe */ + futex_t daemon_connected; +#ifdef ARCH_HAS_LONG_PAGES + uint32_t page_size; +#endif +}; + +struct parasite_unmap_args { + uint64_t parasite_start; + uint64_t parasite_len; +}; +#endif diff --git a/CRIU_code/compel/include/shmem.h b/CRIU_code/compel/include/shmem.h new file mode 100644 index 0000000..b6f9946 --- /dev/null +++ b/CRIU_code/compel/include/shmem.h @@ -0,0 +1,10 @@ +#ifndef __COMPEL_PLUGIN_SHMEM_PRIV_H__ +#define __COMPEL_PLUGIN_SHMEM_PRIV_H__ + +struct shmem_plugin_msg { + unsigned long start; + unsigned long len; +}; + +#endif /* __COMPEL_PLUGIN_SHMEM_PRIV_H__ */ + diff --git a/CRIU_code/compel/include/uapi/asm b/CRIU_code/compel/include/uapi/asm new file mode 100644 index 0000000..36f9e04 --- /dev/null +++ b/CRIU_code/compel/include/uapi/asm @@ -0,0 +1 @@ +../asm/uapi/asm \ No newline at end of file diff --git a/CRIU_code/compel/include/uapi/common b/CRIU_code/compel/include/uapi/common new file mode 100644 index 0000000..33f00f4 --- /dev/null +++ b/CRIU_code/compel/include/uapi/common @@ -0,0 +1 @@ +../../../include/common \ No newline at end of file diff --git a/CRIU_code/compel/include/uapi/compel b/CRIU_code/compel/include/uapi/compel new file mode 100644 index 0000000..945c9b4 --- /dev/null +++ b/CRIU_code/compel/include/uapi/compel @@ -0,0 +1 @@ +. 
\ No newline at end of file diff --git a/CRIU_code/compel/include/uapi/compel.h b/CRIU_code/compel/include/uapi/compel.h new file mode 100644 index 0000000..318a472 --- /dev/null +++ b/CRIU_code/compel/include/uapi/compel.h @@ -0,0 +1,14 @@ +#ifndef UAPI_COMPEL_H__ +#define UAPI_COMPEL_H__ + +#include +#include + +#include +#include +#include +#include +#include +#include + +#endif /* UAPI_COMPEL_H__ */ diff --git a/CRIU_code/compel/include/uapi/cpu.h b/CRIU_code/compel/include/uapi/cpu.h new file mode 100644 index 0000000..6f827d4 --- /dev/null +++ b/CRIU_code/compel/include/uapi/cpu.h @@ -0,0 +1,17 @@ +#ifndef UAPI_COMPEL_CPU_H__ +#define UAPI_COMPEL_CPU_H__ + +#include +#include + +#include + +extern int compel_cpuid(compel_cpuinfo_t *info); +extern bool compel_cpu_has_feature(unsigned int feature); +extern bool compel_fpu_has_feature(unsigned int feature); +extern uint32_t compel_fpu_feature_size(unsigned int feature); +extern uint32_t compel_fpu_feature_offset(unsigned int feature); +extern void compel_cpu_clear_feature(unsigned int feature); +extern void compel_cpu_copy_cpuinfo(compel_cpuinfo_t *c); + +#endif /* UAPI_COMPEL_CPU_H__ */ diff --git a/CRIU_code/compel/include/uapi/handle-elf.h b/CRIU_code/compel/include/uapi/handle-elf.h new file mode 100644 index 0000000..ddeecb0 --- /dev/null +++ b/CRIU_code/compel/include/uapi/handle-elf.h @@ -0,0 +1,15 @@ +#ifndef __COMPEL_UAPI_HANDLE_ELF__ +#define __COMPEL_UAPI_HANDLE_ELF__ + +#define COMPEL_TYPE_INT (1u << 0) +#define COMPEL_TYPE_LONG (1u << 1) +#define COMPEL_TYPE_GOTPCREL (1u << 2) + +typedef struct { + unsigned int offset; + unsigned int type; + long addend; + long value; +} compel_reloc_t; + +#endif diff --git a/CRIU_code/compel/include/uapi/infect-rpc.h b/CRIU_code/compel/include/uapi/infect-rpc.h new file mode 100644 index 0000000..0176c11 --- /dev/null +++ b/CRIU_code/compel/include/uapi/infect-rpc.h @@ -0,0 +1,17 @@ +#ifndef __COMPEL_INFECT_RPC_H__ +#define __COMPEL_INFECT_RPC_H__ + +#include 
+#include +#include + +struct parasite_ctl; +extern int compel_rpc_sync(unsigned int cmd, struct parasite_ctl *ctl); +extern int compel_rpc_call(unsigned int cmd, struct parasite_ctl *ctl); +extern int compel_rpc_call_sync(unsigned int cmd, struct parasite_ctl *ctl); +extern int compel_rpc_sock(struct parasite_ctl *ctl); + +#define PARASITE_USER_CMDS 64 + + +#endif diff --git a/CRIU_code/compel/include/uapi/infect-util.h b/CRIU_code/compel/include/uapi/infect-util.h new file mode 100644 index 0000000..7307ba5 --- /dev/null +++ b/CRIU_code/compel/include/uapi/infect-util.h @@ -0,0 +1,6 @@ +#ifndef __COMPEL_INFECT_UTIL_H__ +#define __COMPEL_INFECT_UTIL_H__ +struct parasite_ctl; +extern int compel_util_send_fd(struct parasite_ctl *ctl, int fd); +extern int compel_util_recv_fd(struct parasite_ctl *ctl, int *pfd); +#endif diff --git a/CRIU_code/compel/include/uapi/infect.h b/CRIU_code/compel/include/uapi/infect.h new file mode 100644 index 0000000..08beaff --- /dev/null +++ b/CRIU_code/compel/include/uapi/infect.h @@ -0,0 +1,171 @@ +#ifndef __COMPEL_INFECT_H__ +#define __COMPEL_INFECT_H__ + +#include + +#include +#include +#include +#include +#include + +#include "common/compiler.h" + +#define PARASITE_START_AREA_MIN (4096) + +extern int compel_interrupt_task(int pid); + +struct seize_task_status { + unsigned long long sigpnd; + unsigned long long shdpnd; + char state; + int ppid; + int seccomp_mode; +}; + +extern int compel_wait_task(int pid, int ppid, + int (*get_status)(int pid, struct seize_task_status *, void *data), + void (*free_status)(int pid, struct seize_task_status *, void *data), + struct seize_task_status *st, void *data); + +extern int compel_stop_task(int pid); +extern int compel_resume_task(pid_t pid, int orig_state, int state); + +struct parasite_ctl; +struct parasite_thread_ctl; + +extern struct parasite_ctl *compel_prepare(int pid); +extern struct parasite_ctl *compel_prepare_noctx(int pid); +extern int compel_infect(struct parasite_ctl *ctl, 
unsigned long nr_threads, unsigned long args_size); +extern struct parasite_thread_ctl *compel_prepare_thread(struct parasite_ctl *ctl, int pid); +extern void compel_release_thread(struct parasite_thread_ctl *); + +extern int compel_stop_daemon(struct parasite_ctl *ctl); +extern int compel_cure_remote(struct parasite_ctl *ctl); +extern int compel_cure_local(struct parasite_ctl *ctl); +extern int compel_cure(struct parasite_ctl *ctl); + +#define PARASITE_ARG_SIZE_MIN ( 1 << 12) + +#define compel_parasite_args(ctl, type) \ + ({ \ + void *___ret; \ + BUILD_BUG_ON(sizeof(type) > PARASITE_ARG_SIZE_MIN); \ + ___ret = compel_parasite_args_p(ctl); \ + ___ret; \ + }) + +extern void *compel_parasite_args_p(struct parasite_ctl *ctl); +extern void *compel_parasite_args_s(struct parasite_ctl *ctl, unsigned long args_size); + +extern int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, + unsigned long arg1, + unsigned long arg2, + unsigned long arg3, + unsigned long arg4, + unsigned long arg5, + unsigned long arg6); +extern int compel_run_in_thread(struct parasite_thread_ctl *tctl, unsigned int cmd); +extern int compel_run_at(struct parasite_ctl *ctl, unsigned long ip, user_regs_struct_t *ret_regs); + +/* + * The PTRACE_SYSCALL will trap task twice -- on + * enter into and on exit from syscall. If we trace + * a single task, we may skip half of all getregs + * calls -- on exit we don't need them. 
+ */ +enum trace_flags { + TRACE_ALL, + TRACE_ENTER, + TRACE_EXIT, +}; + +extern int compel_stop_on_syscall(int tasks, int sys_nr, + int sys_nr_compat, enum trace_flags trace); + +extern int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp); + +extern int compel_unmap(struct parasite_ctl *ctl, unsigned long addr); + +extern int compel_mode_native(struct parasite_ctl *ctl); + +extern k_rtsigset_t *compel_task_sigmask(struct parasite_ctl *ctl); +extern k_rtsigset_t *compel_thread_sigmask(struct parasite_thread_ctl *tctl); + +struct rt_sigframe; + +typedef int (*open_proc_fn)(int pid, int mode, const char *fmt, ...) + __attribute__ ((__format__ (__printf__, 3, 4))); +typedef int (*save_regs_t)(void *, user_regs_struct_t *, user_fpregs_struct_t *); +typedef int (*make_sigframe_t)(void *, struct rt_sigframe *, struct rt_sigframe *, k_rtsigset_t *); + +struct infect_ctx { + int sock; + + /* + * Regs manipulation context. + */ + save_regs_t save_regs; + make_sigframe_t make_sigframe; + void *regs_arg; + + unsigned long task_size; + unsigned long syscall_ip; /* entry point of infection */ + unsigned long flags; /* fine-tune (e.g. faults) */ + + void (*child_handler)(int, siginfo_t *, void *); /* handler for SIGCHLD deaths */ + struct sigaction orig_handler; + + open_proc_fn open_proc; + + int log_fd; /* fd for parasite code to send messages to */ +}; + +extern struct infect_ctx *compel_infect_ctx(struct parasite_ctl *); + +/* Don't use memfd() */ +#define INFECT_NO_MEMFD (1UL << 0) +/* Make parasite connect() fail */ +#define INFECT_FAIL_CONNECT (1UL << 1) +/* No breakpoints in pie tracking */ +#define INFECT_NO_BREAKPOINTS (1UL << 2) +/* Can run parasite inside compat tasks */ +#define INFECT_COMPATIBLE (1UL << 3) +/* Workaround for ptrace bug on Skylake CPUs with kernels older than v4.14 */ +#define INFECT_X86_PTRACE_MXCSR_BUG (1UL << 4) + +/* + * There are several ways to describe a blob to compel + * library. 
The simplest one derived from criu is to + * provide it from .h files. + */ +#define COMPEL_BLOB_CHEADER 0x1 + +struct parasite_blob_desc { + unsigned parasite_type; + union { + struct { + const void *mem; + size_t bsize; + size_t nr_gotpcrel; + unsigned long parasite_ip_off; + unsigned long addr_cmd_off; + unsigned long addr_arg_off; + compel_reloc_t *relocs; + unsigned int nr_relocs; + } hdr; + }; +}; + +extern struct parasite_blob_desc *compel_parasite_blob_desc(struct parasite_ctl *); + +extern int compel_get_thread_regs(struct parasite_thread_ctl *, save_regs_t, void *); + +extern void compel_relocs_apply(void *mem, void *vbase, size_t size, compel_reloc_t *elf_relocs, size_t nr_relocs); + +extern unsigned long compel_task_size(void); + +extern uint64_t compel_get_leader_sp(struct parasite_ctl *ctl); +extern uint64_t compel_get_thread_sp(struct parasite_thread_ctl *tctl); + +#endif diff --git a/CRIU_code/compel/include/uapi/ksigset.h b/CRIU_code/compel/include/uapi/ksigset.h new file mode 100644 index 0000000..f6b124b --- /dev/null +++ b/CRIU_code/compel/include/uapi/ksigset.h @@ -0,0 +1,25 @@ +#ifndef __COMPEL_KSIGSET_H__ +#define __COMPEL_KSIGSET_H__ + +#include + +static inline void ksigfillset(k_rtsigset_t *set) +{ + int i; + for (i = 0; i < _KNSIG_WORDS; i++) + set->sig[i] = (unsigned long)-1; +} + +static inline void ksigemptyset(k_rtsigset_t *set) +{ + int i; + for (i = 0; i < _KNSIG_WORDS; i++) + set->sig[i] = 0; +} + +static inline void ksigaddset(k_rtsigset_t *set, int _sig) +{ + int sig = _sig - 1; + set->sig[sig / _NSIG_BPW] |= 1UL << (sig % _NSIG_BPW); +} +#endif diff --git a/CRIU_code/compel/include/uapi/log.h b/CRIU_code/compel/include/uapi/log.h new file mode 100644 index 0000000..79dd1f4 --- /dev/null +++ b/CRIU_code/compel/include/uapi/log.h @@ -0,0 +1,11 @@ +#ifndef __COMPEL_UAPI_LOG_H__ +#define __COMPEL_UAPI_LOG_H__ + +#include +#include + +typedef void (*compel_log_fn)(unsigned int lvl, const char *fmt, va_list parms); +extern void 
compel_log_init(compel_log_fn log_fn, unsigned int level); +extern unsigned int compel_log_get_loglevel(void); + +#endif diff --git a/CRIU_code/compel/include/uapi/loglevels.h b/CRIU_code/compel/include/uapi/loglevels.h new file mode 100644 index 0000000..7bf8847 --- /dev/null +++ b/CRIU_code/compel/include/uapi/loglevels.h @@ -0,0 +1,20 @@ +#ifndef UAPI_COMPEL_LOGLEVELS_H__ +#define UAPI_COMPEL_LOGLEVELS_H__ + +/* + * Log levels used by compel itself (see compel_log_init()), + * also by log functions in the std plugin. + */ + +enum __compel_log_levels +{ + COMPEL_LOG_MSG, /* Print message regardless of log level */ + COMPEL_LOG_ERROR, /* Errors only, when we're in trouble */ + COMPEL_LOG_WARN, /* Warnings */ + COMPEL_LOG_INFO, /* Informative, everything is fine */ + COMPEL_LOG_DEBUG, /* Debug only */ + + COMPEL_DEFAULT_LOGLEVEL = COMPEL_LOG_WARN +}; + +#endif /* UAPI_COMPEL_LOGLEVELS_H__ */ diff --git a/CRIU_code/compel/include/uapi/plugins b/CRIU_code/compel/include/uapi/plugins new file mode 100644 index 0000000..7ff4c60 --- /dev/null +++ b/CRIU_code/compel/include/uapi/plugins @@ -0,0 +1 @@ +../../plugins/include/uapi \ No newline at end of file diff --git a/CRIU_code/compel/include/uapi/plugins.h b/CRIU_code/compel/include/uapi/plugins.h new file mode 100644 index 0000000..e9ebfb6 --- /dev/null +++ b/CRIU_code/compel/include/uapi/plugins.h @@ -0,0 +1,35 @@ +#ifndef UAPI_COMPEL_PLUGIN_H__ +#define UAPI_COMPEL_PLUGIN_H__ + +#define __init __attribute__((__used__)) __attribute__ ((__section__(".compel.init"))) +#define __exit __attribute__((__used__)) __attribute__ ((__section__(".compel.exit"))) + +#ifndef __ASSEMBLY__ + +typedef struct { + const char *name; + int (*init)(void); + void (*exit)(void); +} plugin_init_t; + +#define plugin_register(___desc) \ + static const plugin_init_t * const \ + ___ptr__##___desc __init = &___desc; + +#define PLUGIN_REGISTER(___id, ___name, ___init, ___exit) \ + static const plugin_init_t __plugin_desc_##___id = { \ + .name = 
___name, \ + .init = ___init, \ + .exit = ___exit, \ + }; \ + plugin_register(__plugin_desc_##___id); + +#define PLUGIN_REGISTER_DUMMY(___id) \ + static const plugin_init_t __plugin_desc_##___id = { \ + .name = #___id, \ + }; \ + plugin_register(__plugin_desc_##___id); + +#endif /* __ASSEMBLY__ */ + +#endif /* UAPI_COMPEL_PLUGIN_H__ */ diff --git a/CRIU_code/compel/include/uapi/ptrace.h b/CRIU_code/compel/include/uapi/ptrace.h new file mode 100644 index 0000000..4df00b6 --- /dev/null +++ b/CRIU_code/compel/include/uapi/ptrace.h @@ -0,0 +1,82 @@ +#ifndef UAPI_COMPEL_PTRACE_H__ +#define UAPI_COMPEL_PTRACE_H__ + +/* + * We'd want to include both sys/ptrace.h and linux/ptrace.h, + * hoping that most definitions come from either one or another. + * Alas, on Alpine/musl both files declare struct ptrace_peeksiginfo_args, + * so there is no way they can be used together. Let's rely on libc one. + */ +#include +#include + +#include + +/* + * Some constants for ptrace that might be missing from the + * standard library includes due to being (relatively) new. 
+ */ + +#ifndef PTRACE_SEIZE +# define PTRACE_SEIZE 0x4206 +#endif + +#ifndef PTRACE_O_SUSPEND_SECCOMP +# define PTRACE_O_SUSPEND_SECCOMP (1 << 21) +#endif + +#ifndef PTRACE_INTERRUPT +# define PTRACE_INTERRUPT 0x4207 +#endif + +#ifndef PTRACE_PEEKSIGINFO +#define PTRACE_PEEKSIGINFO 0x4209 + +/* Read signals from a shared (process wide) queue */ +#define PTRACE_PEEKSIGINFO_SHARED (1 << 0) +#endif + +#ifndef PTRACE_GETREGSET +# define PTRACE_GETREGSET 0x4204 +# define PTRACE_SETREGSET 0x4205 +#endif + +#ifndef PTRACE_GETSIGMASK +# define PTRACE_GETSIGMASK 0x420a +# define PTRACE_SETSIGMASK 0x420b +#endif + +#ifndef PTRACE_SECCOMP_GET_FILTER +#define PTRACE_SECCOMP_GET_FILTER 0x420c +#endif + +#ifndef PTRACE_SECCOMP_GET_METADATA +# define PTRACE_SECCOMP_GET_METADATA 0x420d +#endif /* PTRACE_SECCOMP_GET_METADATA */ + +/* + * struct seccomp_metadata is not yet + * settled down well in headers so use + * own identical definition for a while. + */ +typedef struct { + uint64_t filter_off; /* Input: which filter */ + uint64_t flags; /* Output: filter's flags */ +} seccomp_metadata_t; + +#ifdef PTRACE_EVENT_STOP +# if PTRACE_EVENT_STOP == 7 /* Bad value from Linux 3.1-3.3, fixed in 3.4 */ +# undef PTRACE_EVENT_STOP +# endif +#endif +#ifndef PTRACE_EVENT_STOP +# define PTRACE_EVENT_STOP 128 +#endif + +extern int ptrace_suspend_seccomp(pid_t pid); + +extern int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes); +extern int ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes); +extern int ptrace_swap_area(pid_t pid, void *dst, void *src, long bytes); + +#endif /* UAPI_COMPEL_PTRACE_H__ */ diff --git a/CRIU_code/compel/include/uapi/sigframe-common.h b/CRIU_code/compel/include/uapi/sigframe-common.h new file mode 100644 index 0000000..fc93c54 --- /dev/null +++ b/CRIU_code/compel/include/uapi/sigframe-common.h @@ -0,0 +1,62 @@ +/* + * Don't include it directly but use "arch-sigframe.h" instead. 
+ */ +#ifndef UAPI_COMPEL_SIGFRAME_COMMON_H__ +#define UAPI_COMPEL_SIGFRAME_COMMON_H__ + +#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ +# error "Direct inclusion is forbidden, use instead" +#endif + +#include +#include + +struct rt_sigframe; + +#ifndef SIGFRAME_MAX_OFFSET +# define SIGFRAME_MAX_OFFSET RT_SIGFRAME_OFFSET(0) +#endif + +#define RESTORE_STACK_ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1)) + +/* sigframe should be aligned on 64 byte for x86 and 8 bytes for arm */ +#define RESTORE_STACK_SIGFRAME \ + RESTORE_STACK_ALIGN(sizeof(struct rt_sigframe) + SIGFRAME_MAX_OFFSET, 64) + +#ifndef __ARCH_SI_PREAMBLE_SIZE +# define __ARCH_SI_PREAMBLE_SIZE (3 * sizeof(int)) +#endif + +#define SI_MAX_SIZE 128 + +#ifndef SI_PAD_SIZE +# define SI_PAD_SIZE ((SI_MAX_SIZE - __ARCH_SI_PREAMBLE_SIZE) / sizeof(int)) +#endif + +typedef struct rt_siginfo { + int si_signo; + int si_errno; + int si_code; + int _pad[SI_PAD_SIZE]; +} rt_siginfo_t; + +typedef struct rt_sigaltstack { + void *ss_sp; + int ss_flags; + size_t ss_size; +} rt_stack_t; + +struct rt_ucontext { + unsigned long uc_flags; + struct rt_ucontext *uc_link; + rt_stack_t uc_stack; + struct rt_sigcontext uc_mcontext; + k_rtsigset_t uc_sigmask; /* mask last for extensibility */ + int _unused[32 - (sizeof (k_rtsigset_t) / sizeof (int))]; + unsigned long uc_regspace[128] __attribute__((aligned(8))); +}; + +extern int sigreturn_prep_fpu_frame(struct rt_sigframe *frame, + struct rt_sigframe *rframe); + +#endif /* UAPI_COMPEL_SIGFRAME_COMMON_H__ */ diff --git a/CRIU_code/compel/include/uapi/task-state.h b/CRIU_code/compel/include/uapi/task-state.h new file mode 100644 index 0000000..84a2a0b --- /dev/null +++ b/CRIU_code/compel/include/uapi/task-state.h @@ -0,0 +1,19 @@ +#ifndef __COMPEL_UAPI_TASK_STATE_H__ +#define __COMPEL_UAPI_TASK_STATE_H__ + +/* + * Task state, as returned by compel_wait_task() + * and used in arguments to compel_resume_task(). 
+ */ +enum __compel_task_state +{ + COMPEL_TASK_ALIVE = 0x01, + COMPEL_TASK_DEAD = 0x02, + COMPEL_TASK_STOPPED = 0x03, + COMPEL_TASK_ZOMBIE = 0x06, + /* Don't ever change the above values, they are used by CRIU! */ + + COMPEL_TASK_MAX = 0x7f +}; + +#endif /* __COMPEL_UAPI_TASK_STATE_H__ */ diff --git a/CRIU_code/compel/plugins/Makefile b/CRIU_code/compel/plugins/Makefile new file mode 100644 index 0000000..a326e2a --- /dev/null +++ b/CRIU_code/compel/plugins/Makefile @@ -0,0 +1,102 @@ +CFLAGS := $(filter-out -pg $(CFLAGS-GCOV) $(CFLAGS-ASAN),$(CFLAGS)) +CFLAGS += -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0 +CFLAGS += -Wp,-U_FORTIFY_SOURCE -Wp,-D_FORTIFY_SOURCE=0 + +PLUGIN_ARCH_DIR := compel/arch/$(ARCH)/plugins + +# +# CFLAGS, ASFLAGS, LDFLAGS + +# Required for pie code +ccflags-y += $(CFLAGS_PIE) + +# UAPI inclusion, referred as +ccflags-y += -I compel/include/uapi +asflags-y += -I compel/include/uapi + +# General compel includes +ccflags-y += -iquote compel/include +ccflags-y += -fpie -fno-stack-protector + +# General compel/plugins includes +ccflags-y += -iquote $(obj)/include +asflags-y += -iquote $(obj)/include + +# Arch compel/plugins includes +ccflags-y += -iquote $(PLUGIN_ARCH_DIR)/include +asflags-y += -iquote $(PLUGIN_ARCH_DIR)/include +asflags-y += -iquote $(PLUGIN_ARCH_DIR) + +# General flags for assembly +asflags-y += -fpie -Wstrict-prototypes +asflags-y += -nostdlib -fomit-frame-pointer +asflags-y += -fno-stack-protector +ldflags-y += -z noexecstack + +# +# Shmem plugin +target += shmem +shmem-lib-y += shmem/shmem.o + +# +# STD plugin +target += std +std-lib-y += std/std.o +std-lib-y += std/fds.o +std-lib-y += std/log.o +std-lib-y += std/string.o +std-lib-y += std/infect.o +std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/parasite-head.o + +# +# FDS plugin +target += fds +fds-lib-y += fds/fds.o + +ifeq ($(SRCARCH),x86) + std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/memcpy.o +endif + +ifeq ($(SRCARCH),ppc64) + std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/memcpy.o + std-lib-y += 
./$(PLUGIN_ARCH_DIR)/std/memcmp.o +endif + +include ./$(PLUGIN_ARCH_DIR)/std/syscalls/Makefile.syscalls + +define syscall-priority +$(addprefix $(obj)/,$($(1):%.o=%.d)): | $($(2)) +$(addprefix $(obj)/,$($(1):%.o=%.i)): | $($(2)) +$(addprefix $(obj)/,$($(1):%.o=%.s)): | $($(2)) +$(addprefix $(obj)/,$($(1))): | $($(2)) +endef + +# +# Almost all plugins depend on syscall headers +# and definitions so we have to order their +# generation manually. +$(foreach t,$(target),$(eval $(call syscall-priority,$(t)-lib-y,std-headers-deps))) + +# +# FIXME syscall-types.h should be set up earlier +# +install: compel/plugins/std.lib.a compel/plugins/fds.lib.a + $(E) " INSTALL " compel plugins + $(Q) mkdir -p $(DESTDIR)$(LIBEXECDIR)/compel/ + $(Q) install -m 0644 $^ $(DESTDIR)$(LIBEXECDIR)/compel/ + $(Q) mkdir -p $(DESTDIR)$(LIBEXECDIR)/compel/scripts + $(Q) install -m 0644 compel/arch/$(ARCH)/scripts/compel-pack.lds.S $(DESTDIR)$(LIBEXECDIR)/compel/scripts + $(E) " INSTALL " compel plugins uapi + $(Q) mkdir -p $(DESTDIR)$(INCLUDEDIR)/compel/plugins/std/asm + $(Q) cp -fL compel/plugins/include/uapi/*.h $(DESTDIR)$(INCLUDEDIR)/compel/plugins/ + $(Q) cp -fL compel/plugins/include/uapi/std/*.h $(DESTDIR)$(INCLUDEDIR)/compel/plugins/std/ + $(Q) cp -fL compel/plugins/include/uapi/std/asm/*.h $(DESTDIR)$(INCLUDEDIR)/compel/plugins/std/asm/ +.PHONY: install + +uninstall: + $(E) " UNINSTALL" compel plugins + $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBEXECDIR)/compel/,*.lib.a) + $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBEXECDIR)/compel/scripts/,compel-pack.lds.S) + $(E) " UNINSTALL" compel and plugins uapi + $(Q) $(RM) -rf $(addprefix $(DESTDIR)$(INCLUDEDIR)/,compel/plugins) +.PHONY: uninstall diff --git a/CRIU_code/compel/plugins/fds/fds.c b/CRIU_code/compel/plugins/fds/fds.c new file mode 100644 index 0000000..7ed9450 --- /dev/null +++ b/CRIU_code/compel/plugins/fds/fds.c @@ -0,0 +1,25 @@ +#include + +#include "uapi/plugins.h" +#include "uapi/plugins/std.h" +#include + +#define pr_err(fmt, ...) 
+ +#include "common/compiler.h" +#include "common/bug.h" + +#define __sys(foo) sys_##foo +#define __sys_err(ret) ret + +#include "common/scm.h" + +int fds_send_fd(int fd) +{ + return send_fd(parasite_get_rpc_sock(), NULL, 0, fd); +} + +int fds_recv_fd(void) +{ + return recv_fd(parasite_get_rpc_sock()); +} diff --git a/CRIU_code/compel/plugins/include/std-priv.h b/CRIU_code/compel/plugins/include/std-priv.h new file mode 100644 index 0000000..3fc3041 --- /dev/null +++ b/CRIU_code/compel/plugins/include/std-priv.h @@ -0,0 +1,6 @@ +#ifndef __COMPEL_PLUGIN_STD_PRIV_H__ +#define __COMPEL_PLUGIN_STD_PRIV_H__ + +extern int std_ctl_sock(void); + +#endif /* __COMPEL_PLUGIN_STD_PRIV_H__ */ diff --git a/CRIU_code/compel/plugins/include/uapi/plugin-fds.h b/CRIU_code/compel/plugins/include/uapi/plugin-fds.h new file mode 100644 index 0000000..cececb2 --- /dev/null +++ b/CRIU_code/compel/plugins/include/uapi/plugin-fds.h @@ -0,0 +1,7 @@ +#ifndef COMPEL_PLUGIN_FDS_H__ +#define COMPEL_PLUGIN_FDS_H__ + +extern int fds_send_fd(int fd); +extern int fds_recv_fd(void); + +#endif /* COMPEL_PLUGIN_FDS_H__ */ diff --git a/CRIU_code/compel/plugins/include/uapi/shmem.h b/CRIU_code/compel/plugins/include/uapi/shmem.h new file mode 100644 index 0000000..7e58509 --- /dev/null +++ b/CRIU_code/compel/plugins/include/uapi/shmem.h @@ -0,0 +1,17 @@ +#ifndef __COMPEL_PLUGIN_SHMEM_H__ +#define __COMPEL_PLUGIN_SHMEM_H__ + +/* + * Creates local shmem mapping and announces it + * to the peer. Peer can later "receive" one. The + * local area should be munmap()-ed at the end. + */ +extern void *shmem_create(unsigned long size); +/* + * "Receives" shmem from peer and maps it. 
The + * locally mapped area should be munmap()-ed at + * the end + */ +extern void *shmem_receive(unsigned long *size); + +#endif /* __COMPEL_PLUGIN_SHMEM_H__ */ diff --git a/CRIU_code/compel/plugins/include/uapi/std.h b/CRIU_code/compel/plugins/include/uapi/std.h new file mode 100644 index 0000000..d05fc94 --- /dev/null +++ b/CRIU_code/compel/plugins/include/uapi/std.h @@ -0,0 +1,11 @@ +#ifndef COMPEL_PLUGIN_STD_STD_H__ +#define COMPEL_PLUGIN_STD_STD_H__ + +#include +#include +#include +#include +#include +#include + +#endif /* COMPEL_PLUGIN_STD_STD_H__ */ diff --git a/CRIU_code/compel/plugins/include/uapi/std/asm/.gitignore b/CRIU_code/compel/plugins/include/uapi/std/asm/.gitignore new file mode 100644 index 0000000..5ca2354 --- /dev/null +++ b/CRIU_code/compel/plugins/include/uapi/std/asm/.gitignore @@ -0,0 +1 @@ +# Dear git, please keep this directory diff --git a/CRIU_code/compel/plugins/include/uapi/std/fds.h b/CRIU_code/compel/plugins/include/uapi/std/fds.h new file mode 100644 index 0000000..ed695ee --- /dev/null +++ b/CRIU_code/compel/plugins/include/uapi/std/fds.h @@ -0,0 +1,7 @@ +#ifndef COMPEL_PLUGIN_STD_FDS_H__ +#define COMPEL_PLUGIN_STD_FDS_H__ + +#include +#include + +#endif /* COMPEL_PLUGIN_STD_FDS_H__ */ diff --git a/CRIU_code/compel/plugins/include/uapi/std/infect.h b/CRIU_code/compel/plugins/include/uapi/std/infect.h new file mode 100644 index 0000000..800df25 --- /dev/null +++ b/CRIU_code/compel/plugins/include/uapi/std/infect.h @@ -0,0 +1,20 @@ +#ifndef COMPEL_PLUGIN_STD_INFECT_H__ +#define COMPEL_PLUGIN_STD_INFECT_H__ + +extern int parasite_get_rpc_sock(void); +extern int parasite_service(unsigned int cmd, void *args); + +/* + * Must be supplied by user plugins. + */ +extern int parasite_daemon_cmd(int cmd, void *args); +extern int parasite_trap_cmd(int cmd, void *args); +extern void parasite_cleanup(void); + +/* + * FIXME: Should be supplied by log module. 
+ */ +extern void log_set_fd(int fd); +extern void log_set_loglevel(unsigned int level); + +#endif /* COMPEL_PLUGIN_STD_INFECT_H__ */ diff --git a/CRIU_code/compel/plugins/include/uapi/std/log.h b/CRIU_code/compel/plugins/include/uapi/std/log.h new file mode 100644 index 0000000..fbd1803 --- /dev/null +++ b/CRIU_code/compel/plugins/include/uapi/std/log.h @@ -0,0 +1,15 @@ +#ifndef COMPEL_PLUGIN_STD_LOG_H__ +#define COMPEL_PLUGIN_STD_LOG_H__ + +#define STD_LOG_SIMPLE_CHUNK 256 + +extern void std_log_set_fd(int fd); +extern void std_log_set_loglevel(unsigned int level); +extern void std_log_set_start(struct timeval *tv); +extern int std_vprint_num(char *buf, int blen, int num, char **ps); +extern void std_sprintf(char output[STD_LOG_SIMPLE_CHUNK], const char *format, ...) + __attribute__ ((__format__ (__printf__, 2, 3))); +extern void print_on_level(unsigned int loglevel, const char *format, ...) + __attribute__ ((__format__ (__printf__, 2, 3))); + +#endif /* COMPEL_PLUGIN_STD_LOG_H__ */ diff --git a/CRIU_code/compel/plugins/include/uapi/std/string.h b/CRIU_code/compel/plugins/include/uapi/std/string.h new file mode 100644 index 0000000..c2e4b93 --- /dev/null +++ b/CRIU_code/compel/plugins/include/uapi/std/string.h @@ -0,0 +1,32 @@ +#ifndef COMPEL_PLUGIN_STD_STRING_H__ +#define COMPEL_PLUGIN_STD_STRING_H__ + +#include +#include +#include + +/* Standard file descriptors. */ +#define STDIN_FILENO 0 /* Standard input. */ +#define STDOUT_FILENO 1 /* Standard output. */ +#define STDERR_FILENO 2 /* Standard error output. */ + + +extern void std_dputc(int fd, char c); +extern void std_dputs(int fd, const char *s); +extern void std_vdprintf(int fd, const char *format, va_list args); +extern void std_dprintf(int fd, const char *format, ...) + __attribute__ ((__format__ (__printf__, 2, 3))); + +#define std_printf(fmt, ...) 
std_dprintf(STDOUT_FILENO, fmt, ##__VA_ARGS__) +#define std_puts(s) std_dputs(STDOUT_FILENO, s) +#define std_putchar(c) std_dputc(STDOUT_FILENO, c) + +extern unsigned long std_strtoul(const char *nptr, char **endptr, int base); +extern int std_strcmp(const char *cs, const char *ct); +extern int std_strncmp(const char *cs, const char *ct, size_t n); + +extern void *memcpy(void *dest, const void *src, size_t n); +extern int memcmp(const void *s1, const void *s2, size_t n); +extern void *memset(void *s, int c, size_t n); + +#endif /* COMPEL_PLUGIN_STD_STRING_H__ */ diff --git a/CRIU_code/compel/plugins/include/uapi/std/syscall-types.h b/CRIU_code/compel/plugins/include/uapi/std/syscall-types.h new file mode 100644 index 0000000..ddb740c --- /dev/null +++ b/CRIU_code/compel/plugins/include/uapi/std/syscall-types.h @@ -0,0 +1,72 @@ +/* + * Please add here type definitions if + * syscall prototypes need them. + */ + +#ifndef COMPEL_SYSCALL_TYPES_H__ +#define COMPEL_SYSCALL_TYPES_H__ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common/bitsperlong.h" + +struct cap_header { + uint32_t version; + int pid; +}; + +struct cap_data { + uint32_t eff; + uint32_t prm; + uint32_t inh; +}; + +struct robust_list_head; +struct file_handle; +struct itimerspec; +struct io_event; +struct sockaddr; +struct timespec; +struct siginfo; +struct msghdr; +struct rusage; +struct iocb; + +typedef unsigned long aio_context_t; + +#ifndef F_GETFD +# define F_GETFD 1 +#endif + +struct krlimit { + unsigned long rlim_cur; + unsigned long rlim_max; +}; + +/* Type of timers in the kernel. 
*/ +typedef int kernel_timer_t; + +#include + + +extern long sys_preadv_raw(int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h); + +static inline long sys_preadv(int fd, struct iovec *iov, unsigned long nr, off_t off) +{ +#if BITS_PER_LONG == 64 + return sys_preadv_raw(fd, iov, nr, off, 0); +#elif BITS_PER_LONG == 32 + return sys_preadv_raw(fd, iov, nr, off, ((uint64_t)off) >> 32); +#else +# error "BITS_PER_LONG isn't defined" +#endif +} + +#endif /* COMPEL_SYSCALL_TYPES_H__ */ diff --git a/CRIU_code/compel/plugins/shmem/shmem.c b/CRIU_code/compel/plugins/shmem/shmem.c new file mode 100644 index 0000000..695d193 --- /dev/null +++ b/CRIU_code/compel/plugins/shmem/shmem.c @@ -0,0 +1,38 @@ +#include + +#include +#include +#include +#include "shmem.h" +#include "std-priv.h" + +void *shmem_create(unsigned long size) +{ + int ret; + void *mem; + struct shmem_plugin_msg spi; + + mem = (void *)sys_mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, 0, 0); + if (mem == MAP_FAILED) + return NULL; + + spi.start = (unsigned long)mem; + spi.len = size; + + ret = sys_write(std_ctl_sock(), &spi, sizeof(spi)); + if (ret != sizeof(spi)) { + sys_munmap(mem, size); + return NULL; + } + + return mem; +} + +void *shmem_receive(unsigned long *size) +{ + /* master -> parasite not implemented yet */ + return NULL; +} + +PLUGIN_REGISTER_DUMMY(shmem) diff --git a/CRIU_code/compel/plugins/std/fds.c b/CRIU_code/compel/plugins/std/fds.c new file mode 100644 index 0000000..4991027 --- /dev/null +++ b/CRIU_code/compel/plugins/std/fds.c @@ -0,0 +1,16 @@ +#include + +#include +#include + +#include "std-priv.h" + +#define pr_err(fmt, ...) 
+ +#include "common/compiler.h" +#include "common/bug.h" + +#define __sys(foo) sys_##foo +#define __sys_err(ret) ret + +#include "common/scm-code.c" diff --git a/CRIU_code/compel/plugins/std/infect.c b/CRIU_code/compel/plugins/std/infect.c new file mode 100644 index 0000000..d5e1b43 --- /dev/null +++ b/CRIU_code/compel/plugins/std/infect.c @@ -0,0 +1,207 @@ +#include + +#include "common/scm.h" +#include "common/compiler.h" +#include "common/lock.h" +#include "common/page.h" + +#define pr_err(fmt, ...) print_on_level(1, fmt, ##__VA_ARGS__) +#define pr_info(fmt, ...) print_on_level(3, fmt, ##__VA_ARGS__) +#define pr_debug(fmt, ...) print_on_level(4, fmt, ##__VA_ARGS__) + +#include "common/bug.h" + +#include "uapi/compel/asm/sigframe.h" +#include "uapi/compel/infect-rpc.h" + +#include "rpc-pie-priv.h" + +static int tsock = -1; + +static struct rt_sigframe *sigframe; + +#ifdef ARCH_HAS_LONG_PAGES +/* + * XXX: Make it compel's std plugin global variable. Drop parasite_size(). + * Hint: compel on aarch64 shall learn relocs for that. 
+ */ +static unsigned __page_size; + +unsigned __attribute((weak)) page_size(void) +{ + return __page_size; +} +#endif + +int parasite_get_rpc_sock(void) +{ + return tsock; +} + +/* RPC helpers */ +static int __parasite_daemon_reply_ack(unsigned int cmd, int err) +{ + struct ctl_msg m; + int ret; + + m = ctl_msg_ack(cmd, err); + ret = sys_sendto(tsock, &m, sizeof(m), 0, NULL, 0); + if (ret != sizeof(m)) { + pr_err("Sent only %d bytes while %zu expected\n", ret, sizeof(m)); + return -1; + } + + pr_debug("__sent ack msg: %d %d %d\n", + m.cmd, m.ack, m.err); + + return 0; +} + +static int __parasite_daemon_wait_msg(struct ctl_msg *m) +{ + int ret; + + pr_debug("Daemon waits for command\n"); + + while (1) { + *m = (struct ctl_msg){ }; + ret = sys_recvfrom(tsock, m, sizeof(*m), MSG_WAITALL, NULL, 0); + if (ret != sizeof(*m)) { + pr_err("Trimmed message received (%d/%d)\n", + (int)sizeof(*m), ret); + return -1; + } + + pr_debug("__fetched msg: %d %d %d\n", + m->cmd, m->ack, m->err); + return 0; + } + + return -1; +} + +/* Core infect code */ + +static noinline void fini_sigreturn(unsigned long new_sp) +{ + ARCH_RT_SIGRETURN(new_sp, sigframe); +} + +static int fini(void) +{ + unsigned long new_sp; + + parasite_cleanup(); + + new_sp = (long)sigframe + RT_SIGFRAME_OFFSET(sigframe); + pr_debug("%ld: new_sp=%lx ip %lx\n", sys_gettid(), + new_sp, RT_SIGFRAME_REGIP(sigframe)); + + sys_close(tsock); + std_log_set_fd(-1); + + fini_sigreturn(new_sp); + + BUG(); + + return -1; +} + +static noinline __used int noinline parasite_daemon(void *args) +{ + struct ctl_msg m; + int ret = -1; + + pr_debug("Running daemon thread leader\n"); + + /* Reply we're alive */ + if (__parasite_daemon_reply_ack(PARASITE_CMD_INIT_DAEMON, 0)) + goto out; + + ret = 0; + + while (1) { + if (__parasite_daemon_wait_msg(&m)) + break; + + if (ret && m.cmd != PARASITE_CMD_FINI) { + pr_err("Command rejected\n"); + continue; + } + + if (m.cmd == PARASITE_CMD_FINI) + goto out; + + ret = parasite_daemon_cmd(m.cmd, 
args); + + if (__parasite_daemon_reply_ack(m.cmd, ret)) + break; + + if (ret) { + pr_err("Close the control socket for writing\n"); + sys_shutdown(tsock, SHUT_WR); + } + } + +out: + fini(); + + return 0; +} + +static noinline __used int parasite_init_daemon(void *data) +{ + struct parasite_init_args *args = data; + int ret; + + args->sigreturn_addr = (uint64_t)(uintptr_t)fini_sigreturn; + sigframe = (void*)(uintptr_t)args->sigframe; +#ifdef ARCH_HAS_LONG_PAGES + __page_size = args->page_size; +#endif + + ret = tsock = sys_socket(PF_UNIX, SOCK_SEQPACKET, 0); + if (tsock < 0) { + pr_err("Can't create socket: %d\n", tsock); + goto err; + } + + ret = sys_connect(tsock, (struct sockaddr *)&args->h_addr, args->h_addr_len); + if (ret < 0) { + pr_err("Can't connect the control socket\n"); + goto err; + } + + futex_set_and_wake(&args->daemon_connected, 1); + + ret = recv_fd(tsock); + if (ret >= 0) { + std_log_set_fd(ret); + std_log_set_loglevel(args->log_level); + ret = 0; + } else + goto err; + + parasite_daemon(data); + +err: + futex_set_and_wake(&args->daemon_connected, ret); + fini(); + BUG(); + + return -1; +} + +#ifndef __parasite_entry +# define __parasite_entry +#endif + +int __used __parasite_entry parasite_service(unsigned int cmd, void *args) +{ + pr_info("Parasite cmd %d/%x process\n", cmd, cmd); + + if (cmd == PARASITE_CMD_INIT_DAEMON) + return parasite_init_daemon(args); + + return parasite_trap_cmd(cmd, args); +} diff --git a/CRIU_code/compel/plugins/std/log.c b/CRIU_code/compel/plugins/std/log.c new file mode 100644 index 0000000..403ea46 --- /dev/null +++ b/CRIU_code/compel/plugins/std/log.c @@ -0,0 +1,360 @@ +#include + +#include "common/bitsperlong.h" +#include +#include +#include +#include + +struct simple_buf { + char buf[STD_LOG_SIMPLE_CHUNK]; + char *bp; + int prefix_len; + void (*flush)(struct simple_buf *b); +}; + +static int logfd = -1; +static int cur_loglevel = COMPEL_DEFAULT_LOGLEVEL; +static struct timeval start; + +static void 
sbuf_log_flush(struct simple_buf *b); + +static inline void timediff(struct timeval *from, struct timeval *to) +{ + to->tv_sec -= from->tv_sec; + if (to->tv_usec >= from->tv_usec) + to->tv_usec -= from->tv_usec; + else { + to->tv_sec--; + to->tv_usec += 1000000 - from->tv_usec; + } +} + +static inline void pad_num(char **s, int *n, int nr) +{ + while (*n < nr) { + (*s)--; + (*n)++; + **s = '0'; + } +} + +static void sbuf_log_init(struct simple_buf *b) +{ + char pbuf[12], *s; + int n; + + /* + * Format: + * + * (time)pie: pid: string-itself + */ + b->bp = b->buf; + + if (start.tv_sec != 0) { + struct timeval now; + + sys_gettimeofday(&now, NULL); + timediff(&start, &now); + + /* Seconds */ + n = std_vprint_num(pbuf, sizeof(pbuf), (unsigned)now.tv_sec, &s); + pad_num(&s, &n, 2); + b->bp[0] = '('; + memcpy(b->bp + 1, s, n); + b->bp[n + 1] = '.'; + b->bp += n + 2; + + /* Mu-seconds */ + n = std_vprint_num(pbuf, sizeof(pbuf), (unsigned)now.tv_usec, &s); + pad_num(&s, &n, 6); + memcpy(b->bp, s, n); + b->bp[n++] = ')'; + b->bp[n++] = ' '; + b->bp += n; + } + + n = std_vprint_num(pbuf, sizeof(pbuf), sys_gettid(), &s); + b->bp[0] = 'p'; + b->bp[1] = 'i'; + b->bp[2] = 'e'; + b->bp[3] = ':'; + b->bp[4] = ' '; + memcpy(b->bp + 5, s, n); + b->bp[n + 5] = ':'; + b->bp[n + 6] = ' '; + b->bp += n + 7; + b->prefix_len = b->bp - b->buf; + b->flush = sbuf_log_flush; +} + +static void sbuf_log_flush(struct simple_buf *b) +{ + if (b->bp == b->buf + b->prefix_len) + return; + + sys_write(logfd, b->buf, b->bp - b->buf); + b->bp = b->buf + b->prefix_len; +} + +static void sbuf_putc(struct simple_buf *b, char c) +{ + /* TODO: maybe some warning or error here? 
*/ + if (b->bp - b->buf >= STD_LOG_SIMPLE_CHUNK) + return; + + *b->bp = c; + b->bp++; + if (b->bp - b->buf >= STD_LOG_SIMPLE_CHUNK - 2) { + b->bp[0] = '>'; + b->bp[1] = '\n'; + b->bp += 2; + if (b->flush) + b->flush(b); + } +} + +void std_log_set_fd(int fd) +{ + sys_close(logfd); + logfd = fd; +} + +void std_log_set_loglevel(unsigned int level) +{ + cur_loglevel = level; +} + +void std_log_set_start(struct timeval *s) +{ + start = *s; +} + +static void print_string(const char *msg, struct simple_buf *b) +{ + while (*msg) { + sbuf_putc(b, *msg); + msg++; + } +} + +int std_vprint_num(char *buf, int blen, int num, char **ps) +{ + int neg = 0; + char *s; + + s = &buf[blen - 1]; + *s-- = 0; /* make sure the returned string is NULL terminated */ + + if (num < 0) { + neg = 1; + num = -num; + } else if (num == 0) { + *s = '0'; + s--; + goto done; + } + + while (num > 0) { + *s = (num % 10) + '0'; + s--; + num /= 10; + } + + if (neg) { + *s = '-'; + s--; + } +done: + s++; + *ps = s; + return blen - (s - buf) - 1; +} + +static void print_num(int num, struct simple_buf *b) +{ + char buf[12], *s; + + std_vprint_num(buf, sizeof(buf), num, &s); + print_string(s, b); +} + +static void print_num_l(long num, struct simple_buf *b) +{ + int neg = 0; + char buf[22], *s; + + buf[21] = '\0'; + s = &buf[20]; + + if (num < 0) { + neg = 1; + num = -num; + } else if (num == 0) { + *s = '0'; + s--; + goto done; + } + + while (num > 0) { + *s = (num % 10) + '0'; + s--; + num /= 10; + } + + if (neg) { + *s = '-'; + s--; + } +done: + s++; + print_string(s, b); +} + +static void hexdigit(unsigned int v, char *to, char **z) +{ + *to = "0123456789abcdef"[v & 0xf]; + if (*to != '0') + *z = to; +} + +static void print_hex(unsigned int num, struct simple_buf *b) +{ + char buf[11], *z = &buf[9]; + + buf[10] = '\0'; + hexdigit(num >> 0, &buf[9], &z); + hexdigit(num >> 4, &buf[8], &z); + hexdigit(num >> 8, &buf[7], &z); + hexdigit(num >> 12, &buf[6], &z); + hexdigit(num >> 16, &buf[5], &z); + 
hexdigit(num >> 20, &buf[4], &z); + hexdigit(num >> 24, &buf[3], &z); + hexdigit(num >> 28, &buf[2], &z); + z -= 2; + z[0] = '0'; + z[1] = 'x'; + + print_string(z, b); +} + +static void print_hex_l(unsigned long num, struct simple_buf *b) +{ + char buf[19], *z = &buf[17]; + + buf[18] = '\0'; + hexdigit(num >> 0, &buf[17], &z); + hexdigit(num >> 4, &buf[16], &z); + hexdigit(num >> 8, &buf[15], &z); + hexdigit(num >> 12, &buf[14], &z); + hexdigit(num >> 16, &buf[13], &z); + hexdigit(num >> 20, &buf[12], &z); + hexdigit(num >> 24, &buf[11], &z); + hexdigit(num >> 28, &buf[10], &z); + +#if BITS_PER_LONG == 64 + hexdigit(num >> 32, &buf[9], &z); + hexdigit(num >> 36, &buf[8], &z); + hexdigit(num >> 40, &buf[7], &z); + hexdigit(num >> 44, &buf[6], &z); + hexdigit(num >> 48, &buf[5], &z); + hexdigit(num >> 52, &buf[4], &z); + hexdigit(num >> 56, &buf[3], &z); + hexdigit(num >> 60, &buf[2], &z); +#endif + + z -= 2; + z[0] = '0'; + z[1] = 'x'; + + print_string(z, b); +} + +static void sbuf_printf(struct simple_buf *b, const char *format, va_list args) +{ + const char *s = format; + while (1) { + int along = 0; + + if (*s == '\0') + break; + + if (*s != '%') { + sbuf_putc(b, *s); + s++; + continue; + } + + s++; + if (*s == 'l') { + along = 1; + s++; + if (*s == 'l') + s++; + } else if (*s == 'z') { + along = (sizeof(size_t) > sizeof(int)); + s++; + } + + switch (*s) { + case 's': + print_string(va_arg(args, char *), b); + break; + case 'd': + if (along) + print_num_l(va_arg(args, long), b); + else + print_num(va_arg(args, int), b); + break; + case 'x': + if (along) + print_hex_l(va_arg(args, long), b); + else + print_hex(va_arg(args, unsigned int), b); + break; + case 'p': + print_hex_l((unsigned long)va_arg(args, void *), b); + break; + default: + print_string("UNKNOWN FORMAT ", b); + sbuf_putc(b, *s); + break; + } + s++; + } +} + +void print_on_level(unsigned int loglevel, const char *format, ...) 
+{ + va_list args; + struct simple_buf b; + + if (loglevel > cur_loglevel) + return; + + sbuf_log_init(&b); + + va_start(args, format); + sbuf_printf(&b, format, args); + va_end(args); + + sbuf_log_flush(&b); +} + +void std_sprintf(char output[STD_LOG_SIMPLE_CHUNK], const char *format, ...) +{ + va_list args; + struct simple_buf b; + char *p; + + b.bp = b.buf; + b.flush = NULL; + + va_start(args, format); + sbuf_printf(&b, format, args); + va_end(args); + *b.bp = 0; + + for (p = b.buf; p <= b.bp; p++) + output[p - b.buf] = *p; +} diff --git a/CRIU_code/compel/plugins/std/std.c b/CRIU_code/compel/plugins/std/std.c new file mode 100644 index 0000000..82f51ea --- /dev/null +++ b/CRIU_code/compel/plugins/std/std.c @@ -0,0 +1,85 @@ +#include + +#include +#include + +#include "asm/prologue.h" + +static struct prologue_init_args *init_args; +static int ctl_socket = -1; + +int std_ctl_sock(void) +{ + return ctl_socket; +} + +static int init_socket(struct prologue_init_args *args) +{ + int ret; + + ctl_socket = sys_socket(PF_UNIX, SOCK_SEQPACKET, 0); + if (ctl_socket < 0) + return ctl_socket; + + ret = sys_connect(ctl_socket, (struct sockaddr *)&args->ctl_sock_addr, args->ctl_sock_addr_len); + if (ret < 0) + return ret; + + return 0; +} + +static int fini_socket(void) +{ + char buf[32]; + int ret = 0; + + ret = sys_shutdown(ctl_socket, SHUT_WR); + if (ret) + goto err; + + ret = sys_recv(ctl_socket, buf, sizeof(buf), MSG_WAITALL); + if (ret) + goto err; +err: + sys_close(ctl_socket); + ctl_socket = -1; + return ret; +} + +#define plugin_init_count(size) ((size) / (sizeof(plugin_init_t *))) + +int __export_std_compel_start(struct prologue_init_args *args, + const plugin_init_t * const *init_array, + size_t init_size) +{ + unsigned int i; + int ret = 0; + + init_args = args; + + ret = init_socket(args); + if (ret) + return ret; + + for (i = 0; i < plugin_init_count(init_size); i++) { + const plugin_init_t *d = init_array[i]; + + if (d && d->init) { + ret = d->init(); + if 
(ret) + break; + } + } + + for (; i > 0; i--) { + const plugin_init_t *d = init_array[i - 1]; + + if (d && d->exit) + d->exit(); + } + + fini_socket(); + return ret; +} + +PLUGIN_REGISTER_DUMMY(std) diff --git a/CRIU_code/compel/plugins/std/string.c b/CRIU_code/compel/plugins/std/string.c new file mode 100644 index 0000000..85bede8 --- /dev/null +++ b/CRIU_code/compel/plugins/std/string.c @@ -0,0 +1,302 @@ +#include +#include +#include + +#include +#include + +#include "features.h" + +static const char conv_tab[] = "0123456789abcdefghijklmnopqrstuvwxyz"; + +void std_dputc(int fd, char c) +{ + sys_write(fd, &c, 1); +} + +void std_dputs(int fd, const char *s) +{ + for (; *s; s++) + std_dputc(fd, *s); +} + +static size_t __std_vprint_long_hex(char *buf, size_t blen, unsigned long num, char **ps) +{ + char *s = &buf[blen - 2]; + + buf[blen - 1] = '\0'; + + if (num == 0) { + *s = '0', s--; + goto done; + } + + while (num > 0) { + *s = conv_tab[num % 16], s--; + num /= 16; + } + +done: + s++; + *ps = s; + return blen - (s - buf); +} + +static size_t __std_vprint_long(char *buf, size_t blen, long num, char **ps) +{ + char *s = &buf[blen - 2]; + int neg = 0; + + buf[blen - 1] = '\0'; + + if (num < 0) { + neg = 1; + num = -num; + } else if (num == 0) { + *s = '0'; + s--; + goto done; + } + + while (num > 0) { + *s = (num % 10) + '0'; + s--; + num /= 10; + } + + if (neg) { + *s = '-'; + s--; + } +done: + s++; + *ps = s; + return blen - (s - buf); +} + +void std_vdprintf(int fd, const char *format, va_list args) +{ + const char *s = format; + + for (; *s != '\0'; s++) { + char buf[32], *t; + int along = 0; + + if (*s != '%') { + std_dputc(fd, *s); + continue; + } + + s++; + if (*s == 'l') { + along = 1; + s++; + if (*s == 'l') + s++; + } + + switch (*s) { + case 's': + std_dputs(fd, va_arg(args, char *)); + break; + case 'd': + __std_vprint_long(buf, sizeof(buf), + along ? 
+ va_arg(args, long) : + (long)va_arg(args, int), + &t); + std_dputs(fd, t); + break; + case 'x': + __std_vprint_long_hex(buf, sizeof(buf), + along ? + va_arg(args, long) : + (long)va_arg(args, int), + &t); + std_dputs(fd, t); + break; + } + } +} + +void std_dprintf(int fd, const char *format, ...) +{ + va_list args; + + va_start(args, format); + std_vdprintf(fd, format, args); + va_end(args); +} + +static inline bool __isspace(unsigned char c) +{ + return c == ' ' || c == '\f' || + c == '\n' || c == '\r' || + c == '\t' || c == '\v'; +} + +static unsigned char __tolower(unsigned char c) +{ + return (c <= 'Z' && c >= 'A') ? c - 'A' + 'a' : c; +} + +static inline bool __isalpha(unsigned char c) +{ + return ((c <= 'Z' && c >= 'A') || + (c <= 'z' && c >= 'a')); +} + +static inline bool __isdigit(unsigned char c) +{ + return (c <= '9' && c >= '0'); +} + +static inline bool __isalnum(unsigned char c) +{ + return (__isalpha(c) || __isdigit(c)); +} + +static unsigned int __conv_val(unsigned char c) +{ + if (__isdigit(c)) + return c - '0'; + else if (__isalpha(c)) + return __tolower(c) - 'a' + 10; /* letters are the digits 10..35, case-insensitively */ + return -1u; +} + +unsigned long std_strtoul(const char *nptr, char **endptr, int base) +{ + const char *s = nptr; + bool neg = false; + unsigned int v; + long num = 0; + + if (base < 0 || base == 1 || base > 36) + goto fin; + + while (__isspace(*s)) + s++; + if (!*s) + goto fin; + + if (*s == '-') + neg = true, s++; + + if (base == 0) { + if (s[0] == '0') { + unsigned char p = __tolower(s[1]); + switch (p) { + case 'b': + base = 2, s += 2; + break; + case 'x': + base = 16, s += 2; + break; + default: + base = 8, s += 1; + break; + } + } else + base = 10; + } else if (base == 16) { + if (s[0] == '0' && __tolower(s[1]) == 'x') + s += 2; + } + + for (; *s; s++) { + if (__isspace(*s)) + continue; + if (!__isalnum(*s)) + goto fin; + v = __conv_val(*s); + if (v == -1u || v >= (unsigned int)base) /* a valid digit is strictly below the base */ + goto fin; + num *= base; + num += v; + } + +fin: + if (endptr) + *endptr = (char *)s; +
return neg ? (unsigned long)-num : (unsigned long)num; +} + + +/* + * C compiler is free to insert implicit calls to memcmp, memset, + * memcpy and memmove, assuming they are available during linking. + * As the parasite code is not linked with libc, it must provide + * its own implementations of the above functions. + * Surely, these functions can also be called explicitly. + * + * Note: for now, not having memmove() seems OK for both gcc and clang. + */ + +#ifndef ARCH_HAS_MEMCPY +void *memcpy(void *to, const void *from, size_t n) +{ + size_t i; + unsigned char *cto = to; + const unsigned char *cfrom = from; + + for (i = 0; i < n; ++i, ++cto, ++cfrom) + *cto = *cfrom; + + return to; +} +#endif + +#ifndef ARCH_HAS_MEMCMP +int memcmp(const void *cs, const void *ct, size_t count) +{ + const unsigned char *su1, *su2; + int res = 0; + + for (su1 = cs, su2 = ct; 0 < count; ++su1, ++su2, count--) + if ((res = *su1 - *su2) != 0) + break; + return res; +} +#endif + +#ifndef ARCH_HAS_MEMSET +void *memset(void *s, const int c, size_t count) +{ + volatile char *dest = s; + size_t i = 0; + + while (i < count) + dest[i++] = (char) c; + + return s; +} +#endif + +int std_strcmp(const char *cs, const char *ct) +{ + unsigned char c1, c2; + + while (1) { + c1 = *cs++; + c2 = *ct++; + if (c1 != c2) + return c1 < c2 ? -1 : 1; + if (!c1) + break; + } + return 0; +} + +int std_strncmp(const char *cs, const char *ct, size_t count) +{ + size_t i; + + for (i = 0; i < count; i++) { + if (cs[i] != ct[i]) /* order bytes as unsigned char, same as std_strcmp() */ + return (unsigned char)cs[i] < (unsigned char)ct[i] ?
-1 : 1; + if (!cs[i]) + break; + } + return 0; +} diff --git a/CRIU_code/compel/src/lib/handle-elf-host.c b/CRIU_code/compel/src/lib/handle-elf-host.c new file mode 100644 index 0000000..fe46118 --- /dev/null +++ b/CRIU_code/compel/src/lib/handle-elf-host.c @@ -0,0 +1 @@ +handle-elf.c \ No newline at end of file diff --git a/CRIU_code/compel/src/lib/handle-elf.c b/CRIU_code/compel/src/lib/handle-elf.c new file mode 100644 index 0000000..ca7c53b --- /dev/null +++ b/CRIU_code/compel/src/lib/handle-elf.c @@ -0,0 +1,650 @@ +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "uapi/compel.h" + +#include "handle-elf.h" +#include "piegen.h" +#include "log.h" + +piegen_opt_t opts = {}; + +/* Check if pointer is out-of-bound */ +static bool __ptr_oob(const uintptr_t ptr, const uintptr_t start, const size_t size) +{ + uintptr_t end = start + size; + + return ptr >= end || ptr < start; +} + +/* Check if pointed structure's end is out-of-bound */ +static bool __ptr_struct_end_oob(const uintptr_t ptr, const size_t struct_size, + const uintptr_t start, const size_t size) +{ + /* the last byte of the structure should be inside [begin, end) */ + return __ptr_oob(ptr + struct_size - 1, start, size); +} + +/* Check if pointed structure is out-of-bound */ +static bool __ptr_struct_oob(const uintptr_t ptr, const size_t struct_size, + const uintptr_t start, const size_t size) +{ + return __ptr_oob(ptr, start, size) || + __ptr_struct_end_oob(ptr, struct_size, start, size); +} + +static bool test_pointer(const void *ptr, const void *start, const size_t size, + const char *name, const char *file, const int line) +{ + if (__ptr_oob((const uintptr_t)ptr, (const uintptr_t)start, size)) { + pr_err("Corrupted pointer %p (%s) at %s:%d\n", + ptr, name, file, line); + return true; + } + return false; +} + +#define ptr_func_exit(__ptr) \ + do { \ + if (test_pointer((__ptr), mem, size, #__ptr, \ + __FILE__, __LINE__)) { \ + 
free(sec_hdrs); \ + return -1; \ + } \ + } while (0) + +#ifdef ELF_PPC64 +static int do_relative_toc(long value, uint16_t *location, + unsigned long mask, int complain_signed) +{ + if (complain_signed && (value + 0x8000 > 0xffff)) { + pr_err("TOC16 relocation overflows (%ld)\n", value); + return -1; + } + + if ((~mask & 0xffff) & value) { + pr_err("bad TOC16 relocation (%ld) (0x%lx)\n", + value, (~mask & 0xffff) & value); + return -1; + } + + *location = (*location & ~mask) | (value & mask); + return 0; +} +#endif + +static bool is_header_supported(Elf_Ehdr *hdr) +{ + if (!arch_is_machine_supported(hdr->e_machine)) + return false; + if ((hdr->e_type != ET_REL +#ifdef NO_RELOCS + && hdr->e_type != ET_EXEC +#endif + ) || hdr->e_version != EV_CURRENT) + return false; + return true; +} + +static const char *get_strings_section(Elf_Ehdr *hdr, uintptr_t mem, size_t size) +{ + size_t sec_table_size = ((size_t) hdr->e_shentsize) * hdr->e_shnum; + uintptr_t sec_table = mem + hdr->e_shoff; + Elf_Shdr *secstrings_hdr; + uintptr_t addr; + + if (__ptr_struct_oob(sec_table, sec_table_size, mem, size)) { + pr_err("Section table [%#zx, %#zx) is out of [%#zx, %#zx)\n", + sec_table, sec_table + sec_table_size, mem, mem + size); + return NULL; + } + + /* + * strings section header's offset in section headers table is + * (size of section header * index of string section header) + */ + addr = sec_table + ((size_t) hdr->e_shentsize) * hdr->e_shstrndx; + if (__ptr_struct_oob(addr, sizeof(Elf_Shdr), + sec_table, sec_table_size)) { + pr_err("String section header @%#zx is out of [%#zx, %#zx)\n", + addr, sec_table, sec_table + sec_table_size); + return NULL; + } + secstrings_hdr = (void*)addr; + + addr = mem + secstrings_hdr->sh_offset; + if (__ptr_struct_oob(addr, secstrings_hdr->sh_size, mem, size)) { + pr_err("String section @%#zx size %#lx is out of [%#zx, %#zx)\n", + addr, (unsigned long)secstrings_hdr->sh_size, + mem, mem + size); + return NULL; + } + + return (void*)addr; +} + +/* + 
* This name @__handle_elf get renamed into + * @handle_elf_ppc64 or say @handle_elf_x86_64 + * depending on the architecture it's compiled + * under. + */ +int __handle_elf(void *mem, size_t size) +{ + const char *symstrings = NULL; + Elf_Shdr *symtab_hdr = NULL; + Elf_Sym *symbols = NULL; + Elf_Ehdr *hdr = mem; + + Elf_Shdr *strtab_hdr = NULL; + Elf_Shdr **sec_hdrs = NULL; + const char *secstrings; + + size_t i, k, nr_gotpcrel = 0; +#ifdef ELF_PPC64 + int64_t toc_offset = 0; +#endif + int ret = -EINVAL; + + pr_debug("Header\n"); + pr_debug("------------\n"); + pr_debug("\ttype 0x%x machine 0x%x version 0x%x\n", + (unsigned)hdr->e_type, (unsigned)hdr->e_machine, + (unsigned)hdr->e_version); + + if (!is_header_supported(hdr)) { + pr_err("Unsupported header detected\n"); + goto err; + } + + sec_hdrs = malloc(sizeof(*sec_hdrs) * hdr->e_shnum); + if (!sec_hdrs) { + pr_err("No memory for section headers\n"); + ret = -ENOMEM; + goto err; + } + + secstrings = get_strings_section(hdr, (uintptr_t)mem, size); + if (!secstrings) + goto err; + + pr_debug("Sections\n"); + pr_debug("------------\n"); + for (i = 0; i < hdr->e_shnum; i++) { + Elf_Shdr *sh = mem + hdr->e_shoff + hdr->e_shentsize * i; + ptr_func_exit(sh); + + if (sh->sh_type == SHT_SYMTAB) + symtab_hdr = sh; + + ptr_func_exit(&secstrings[sh->sh_name]); + pr_debug("\t index %-2zd type 0x%-2x name %s\n", i, + (unsigned)sh->sh_type, &secstrings[sh->sh_name]); + + sec_hdrs[i] = sh; + +#ifdef ELF_PPC64 + if (!strcmp(&secstrings[sh->sh_name], ".toc")) { + toc_offset = sh->sh_addr + 0x8000; + pr_debug("\t\tTOC offset 0x%lx\n", toc_offset); + } +#endif + } + + if (!symtab_hdr) { + pr_err("No symbol table present\n"); + goto err; + } + + if (!symtab_hdr->sh_link || symtab_hdr->sh_link >= hdr->e_shnum) { + pr_err("Corrupted symtab header\n"); + goto err; + } + + pr_debug("Symbols\n"); + pr_debug("------------\n"); + strtab_hdr = sec_hdrs[symtab_hdr->sh_link]; + ptr_func_exit(strtab_hdr); + + symbols = mem + 
symtab_hdr->sh_offset; + ptr_func_exit(symbols); + symstrings = mem + strtab_hdr->sh_offset; + ptr_func_exit(symstrings); + + if (sizeof(*symbols) != symtab_hdr->sh_entsize) { + pr_err("Symbol table align differ\n"); + goto err; + } + + pr_out("/* Autogenerated from %s */\n", opts.input_filename); + pr_out("#include \n"); + + for (i = 0; i < symtab_hdr->sh_size / symtab_hdr->sh_entsize; i++) { + Elf_Sym *sym = &symbols[i]; + const char *name; + Elf_Shdr *sh_src; + + ptr_func_exit(sym); + name = &symstrings[sym->st_name]; + ptr_func_exit(name); + + if (!*name) + continue; + + pr_debug("\ttype 0x%-2x bind 0x%-2x shndx 0x%-4x value 0x%-2lx name %s\n", + (unsigned)ELF_ST_TYPE(sym->st_info), (unsigned)ELF_ST_BIND(sym->st_info), + (unsigned)sym->st_shndx, (unsigned long)sym->st_value, name); +#ifdef ELF_PPC64 + if (!sym->st_value && !strncmp(name, ".TOC.", 6)) { + if (!toc_offset) { + pr_err("No TOC pointer\n"); + goto err; + } + sym->st_value = toc_offset; + continue; + } +#endif + if (strncmp(name, "__export", 8)) + continue; + if ((sym->st_shndx && sym->st_shndx < hdr->e_shnum) || + sym->st_shndx == SHN_ABS) { + if (sym->st_shndx == SHN_ABS) { + sh_src = NULL; + } else { + sh_src = sec_hdrs[sym->st_shndx]; + ptr_func_exit(sh_src); + } + pr_out("#define %s_sym%s 0x%lx\n", + opts.prefix, name, + (unsigned long)(sym->st_value + + (sh_src ? 
sh_src->sh_addr : 0))); + } + } + + pr_out("static __maybe_unused compel_reloc_t %s_relocs[] = {\n", opts.prefix); +#ifndef NO_RELOCS + pr_debug("Relocations\n"); + pr_debug("------------\n"); + for (i = 0; i < hdr->e_shnum; i++) { + Elf_Shdr *sh = sec_hdrs[i]; + Elf_Shdr *sh_rel; + + if (sh->sh_type != SHT_REL && sh->sh_type != SHT_RELA) + continue; + + sh_rel = sec_hdrs[sh->sh_info]; + ptr_func_exit(sh_rel); + + pr_debug("\tsection %2zd type 0x%-2x link 0x%-2x info 0x%-2x name %s\n", i, + (unsigned)sh->sh_type, (unsigned)sh->sh_link, + (unsigned)sh->sh_info, &secstrings[sh->sh_name]); + + for (k = 0; k < sh->sh_size / sh->sh_entsize; k++) { + int64_t __maybe_unused addend64, __maybe_unused value64; + int32_t __maybe_unused addend32, __maybe_unused value32; + unsigned long place; + const char *name; + void *where; + Elf_Sym *sym; + + union { + Elf_Rel rel; + Elf_Rela rela; + } *r = mem + sh->sh_offset + sh->sh_entsize * k; + ptr_func_exit(r); + + sym = &symbols[ELF_R_SYM(r->rel.r_info)]; + ptr_func_exit(sym); + + name = &symstrings[sym->st_name]; + ptr_func_exit(name); + + where = mem + sh_rel->sh_offset + r->rel.r_offset; + ptr_func_exit(where); + + pr_debug("\t\tr_offset 0x%-4lx r_info 0x%-4lx / sym 0x%-2lx type 0x%-2lx symsecoff 0x%-4lx\n", + (unsigned long)r->rel.r_offset, (unsigned long)r->rel.r_info, + (unsigned long)ELF_R_SYM(r->rel.r_info), + (unsigned long)ELF_R_TYPE(r->rel.r_info), + (unsigned long)sh_rel->sh_addr); + + if (sym->st_shndx == SHN_UNDEF) { +#ifdef ELF_PPC64 + /* On PowerPC, TOC symbols appear to be + * undefined but should be processed as well. + * Their type is STT_NOTYPE, so report any + * other one. + */ + if (ELF32_ST_TYPE(sym->st_info) != STT_NOTYPE + || strncmp(name, ".TOC.", 6)) { + pr_err("Unexpected undefined symbol:%s\n", name); + goto err; + } +#else + pr_err("Unexpected undefined symbol: `%s'. 
External symbol in PIE?\n", name); + goto err; +#endif + } + + if (sh->sh_type == SHT_REL) { + addend32 = *(int32_t *)where; + addend64 = *(int64_t *)where; + } else { + addend32 = (int32_t)r->rela.r_addend; + addend64 = (int64_t)r->rela.r_addend; + } + + place = sh_rel->sh_addr + r->rel.r_offset; + + pr_debug("\t\t\tvalue 0x%-8lx addend32 %-4d addend64 %-8ld place %-8lx symname %s\n", + (unsigned long)sym->st_value, addend32, (long)addend64, (long)place, name); + + if (sym->st_shndx == SHN_ABS) { + value32 = (int32_t)sym->st_value; + value64 = (int64_t)sym->st_value; + } else { + Elf_Shdr *sh_src; + + if ((unsigned)sym->st_shndx >= (unsigned)hdr->e_shnum) { /* sec_hdrs[] holds e_shnum entries */ + pr_err("Unexpected symbol section index %u/%u\n", + (unsigned)sym->st_shndx, hdr->e_shnum); + goto err; + } + sh_src = sec_hdrs[sym->st_shndx]; + ptr_func_exit(sh_src); + + value32 = (int32_t)sh_src->sh_addr + (int32_t)sym->st_value; + value64 = (int64_t)sh_src->sh_addr + (int64_t)sym->st_value; + } + +#ifdef ELF_PPC64 +/* + * Snippet from the OpenPOWER ABI for Linux Supplement: + * + * The OpenPOWER ABI uses the three most-significant bits in the symbol + * st_other field specifies the number of instructions between a function's + * global entry point and local entry point. The global entry point is used + * when it is necessary to set up the TOC pointer (r2) for the function. The + * local entry point is used when r2 is known to already be valid for the + * function. A value of zero in these bits asserts that the function does + * not use r2. + * + * The st_other values have the following meanings: + * 0 and 1, the local and global entry points are the same. + * 2, the local entry point is at 1 instruction past the global entry point. + * 3, the local entry point is at 2 instructions past the global entry point. + * 4, the local entry point is at 4 instructions past the global entry point. + * 5, the local entry point is at 8 instructions past the global entry point.
+ * 6, the local entry point is at 16 instructions past the global entry point. + * 7, reserved. + * + * Here we are only handle the case '3' which is the most commonly seen. + */ +#define LOCAL_OFFSET(s) ((s->st_other >> 5) & 0x7) + if (LOCAL_OFFSET(sym)) { + if (LOCAL_OFFSET(sym) != 3) { + pr_err("Unexpected local offset value %d\n", + LOCAL_OFFSET(sym)); + goto err; + } + pr_debug("\t\t\tUsing local offset\n"); + value64 += 8; + value32 += 8; + } +#endif + + switch (ELF_R_TYPE(r->rel.r_info)) { +#ifdef ELF_PPC64 + case R_PPC64_REL24: + /* Update PC relative offset, linker has not done this yet */ + pr_debug("\t\t\tR_PPC64_REL24 at 0x%-4lx val 0x%lx\n", + place, value64); + /* Convert value to relative */ + value64 -= place; + if (value64 + 0x2000000 > 0x3ffffff || (value64 & 3) != 0) { + pr_err("REL24 %li out of range!\n", (long int)value64); + goto err; + } + /* Only replace bits 2 through 26 */ + *(uint32_t *)where = (*(uint32_t *)where & ~0x03fffffc) | + (value64 & 0x03fffffc); + break; + + case R_PPC64_ADDR32: + case R_PPC64_REL32: + pr_debug("\t\t\tR_PPC64_ADDR32 at 0x%-4lx val 0x%x\n", + place, (unsigned int)(value32 + addend32)); + pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_INT, " + " .addend = %-8d, .value = 0x%-16x, " + "}, /* R_PPC64_ADDR32 */\n", + (unsigned int) place, addend32, value32); + break; + + case R_PPC64_ADDR64: + case R_PPC64_REL64: + pr_debug("\t\t\tR_PPC64_ADDR64 at 0x%-4lx val 0x%lx\n", + place, value64 + addend64); + pr_out("\t{ .offset = 0x%-8x, .type = COMPEL_TYPE_LONG," + " .addend = %-8ld, .value = 0x%-16lx, " + "}, /* R_PPC64_ADDR64 */\n", + (unsigned int) place, (long)addend64, (long)value64); + break; + + case R_PPC64_TOC16_HA: + pr_debug("\t\t\tR_PPC64_TOC16_HA at 0x%-4lx val 0x%lx\n", + place, value64 + addend64 - toc_offset + 0x8000); + if (do_relative_toc((value64 + addend64 - toc_offset + 0x8000) >> 16, + where, 0xffff, 1)) + goto err; + break; + + case R_PPC64_TOC16_LO: + pr_debug("\t\t\tR_PPC64_TOC16_LO at 0x%-4lx 
val 0x%lx\n", + place, value64 + addend64 - toc_offset); + if (do_relative_toc(value64 + addend64 - toc_offset, + where, 0xffff, 1)) + goto err; + break; + + case R_PPC64_TOC16_LO_DS: + pr_debug("\t\t\tR_PPC64_TOC16_LO_DS at 0x%-4lx val 0x%lx\n", + place, value64 + addend64 - toc_offset); + if (do_relative_toc(value64 + addend64 - toc_offset, + where, 0xfffc, 0)) + goto err; + break; + + case R_PPC64_REL16_HA: + value64 += addend64 - place; + pr_debug("\t\t\tR_PPC64_REL16_HA at 0x%-4lx val 0x%lx\n", + place, value64); + /* check that we are dealing with the addis 2,12 instruction */ + if (((*(uint32_t*)where) & 0xffff0000) != 0x3c4c0000) { + pr_err("Unexpected instruction for R_PPC64_REL16_HA\n"); + goto err; + } + *(uint16_t *)where = ((value64 + 0x8000) >> 16) & 0xffff; + break; + + case R_PPC64_REL16_LO: + value64 += addend64 - place; + pr_debug("\t\t\tR_PPC64_REL16_LO at 0x%-4lx val 0x%lx\n", + place, value64); + /* check that we are dealing with the addi 2,2 instruction */ + if (((*(uint32_t*)where) & 0xffff0000) != 0x38420000) { + pr_err("Unexpected instruction for R_PPC64_REL16_LO\n"); + goto err; + } + *(uint16_t *)where = value64 & 0xffff; + break; + +#endif /* ELF_PPC64 */ + +#ifdef ELF_X86_64 + case R_X86_64_32: /* Symbol + Addend (4 bytes) */ + case R_X86_64_32S: /* Symbol + Addend (4 bytes) */ + pr_debug("\t\t\t\tR_X86_64_32 at 0x%-4lx val 0x%x\n", place, value32); + pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_INT, " + ".addend = %-8d, .value = 0x%-16x, }, /* R_X86_64_32 */\n", + (unsigned int)place, addend32, value32); + break; + case R_X86_64_64: /* Symbol + Addend (8 bytes) */ + pr_debug("\t\t\t\tR_X86_64_64 at 0x%-4lx val 0x%lx\n", place, (long)value64); + pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_LONG, " + ".addend = %-8ld, .value = 0x%-16lx, }, /* R_X86_64_64 */\n", + (unsigned int)place, (long)addend64, (long)value64); + break; + case R_X86_64_PC32: /* Symbol + Addend - Place (4 bytes) */ + pr_debug("\t\t\t\tR_X86_64_PC32 at 0x%-4lx 
val 0x%x\n", place, value32 + addend32 - (int32_t)place); + /* + * R_X86_64_PC32 are relative, patch them inplace. + */ + *((int32_t *)where) = value32 + addend32 - place; + break; + case R_X86_64_PLT32: /* ProcLinkage + Addend - Place (4 bytes) */ + pr_debug("\t\t\t\tR_X86_64_PLT32 at 0x%-4lx val 0x%x\n", place, value32 + addend32 - (int32_t)place); + /* + * R_X86_64_PLT32 are relative, patch them inplace. + */ + *((int32_t *)where) = value32 + addend32 - place; + break; + case R_X86_64_GOTPCRELX: + case R_X86_64_REX_GOTPCRELX: + case R_X86_64_GOTPCREL: /* SymbolOffsetInGot + GOT + Addend - Place (4 bytes) */ + pr_debug("\t\t\t\tR_X86_64_GOTPCREL at 0x%-4lx val 0x%x\n", place, value32); + pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_LONG | COMPEL_TYPE_GOTPCREL, " + ".addend = %-8d, .value = 0x%-16x, }, /* R_X86_64_GOTPCREL */\n", + (unsigned int)place, addend32, value32); + nr_gotpcrel++; + break; +#endif + +#ifdef ELF_X86_32 + case R_386_32: /* Symbol + Addend */ + pr_debug("\t\t\t\tR_386_32 at 0x%-4lx val 0x%x\n", place, value32 + addend32); + pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_INT, " + ".addend = %-4d, .value = 0x%x, },\n", + (unsigned int)place, addend32, value32); + break; + case R_386_PC32: /* Symbol + Addend - Place */ + pr_debug("\t\t\t\tR_386_PC32 at 0x%-4lx val 0x%x\n", place, value32 + addend32 - (int32_t)place); + /* + * R_386_PC32 are relative, patch them inplace. + */ + *((int32_t *)where) = value32 + addend32 - place; + break; +#endif + +#ifdef ELF_S390 + /* + * See also arch/s390/kernel/module.c/apply_rela(): + * A PLT reads the GOT (global offset table). We can handle it like + * R_390_PC32DBL because we have linked statically.
+ */ + case R_390_PLT32DBL: /* PC relative on a PLT (procedure linkage table) */ + pr_debug("\t\t\t\tR_390_PLT32DBL at 0x%-4lx val 0x%x\n", place, value32 + addend32); + *((int32_t *)where) = (value64 + addend64 - place) >> 1; + break; + case R_390_PC32DBL: /* PC relative on a symbol */ + pr_debug("\t\t\t\tR_390_PC32DBL at 0x%-4lx val 0x%x\n", place, value32 + addend32); + *((int32_t *)where) = (value64 + addend64 - place) >> 1; + break; + case R_390_64: /* 64 bit absolute address */ + pr_debug("\t\t\t\tR_390_64 at 0x%-4lx val 0x%lx\n", place, (long)value64); + pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_LONG, " + ".addend = %-8ld, .value = 0x%-16lx, }, /* R_390_64 */\n", + (unsigned int)place, (long)addend64, (long)value64); + break; + case R_390_PC64: /* 64 bit relative address */ + *((int64_t *)where) = value64 + addend64 - place; + pr_debug("\t\t\t\tR_390_PC64 at 0x%-4lx val 0x%lx\n", place, (long)value64); + break; +#endif + default: + pr_err("Unsupported relocation of type %lu\n", + (unsigned long)ELF_R_TYPE(r->rel.r_info)); + goto err; + } + } + } +#endif /* !NO_RELOCS */ + pr_out("};\n"); + pr_out("static __maybe_unused size_t %s_nr_gotpcrel = %zd;\n", opts.prefix, nr_gotpcrel); + + pr_out("static __maybe_unused const char %s_blob[] = {\n\t", opts.prefix); + + for (i = 0, k = 0; i < hdr->e_shnum; i++) { + Elf_Shdr *sh = sec_hdrs[i]; + unsigned char *shdata; + size_t j; + + if (!(sh->sh_flags & SHF_ALLOC) || !sh->sh_size) + continue; + + shdata = mem + sh->sh_offset; + pr_debug("Copying section '%s'\n" + "\tstart:0x%lx (gap:0x%lx) size:0x%lx\n", + &secstrings[sh->sh_name], (unsigned long) sh->sh_addr, + (unsigned long)(sh->sh_addr - k), (unsigned long) sh->sh_size); + + /* write 0 in the gap between the 2 sections */ + for (; k < sh->sh_addr; k++) { + if (k && (k % 8) == 0) + pr_out("\n\t"); + pr_out("0x00,"); + } + + for (j = 0; j < sh->sh_size; j++, k++) { + if (k && (k % 8) == 0) + pr_out("\n\t"); + pr_out("0x%02x,", shdata[j]); + } + } + pr_out("};\n");
+ pr_out("\n"); + pr_out("static void __maybe_unused %s_setup_c_header(struct parasite_ctl *ctl)\n", + opts.prefix); + pr_out( +"{\n" +" struct parasite_blob_desc *pbd;\n" +"\n" +" pbd = compel_parasite_blob_desc(ctl);\n" +" pbd->parasite_type = COMPEL_BLOB_CHEADER;\n" +); + pr_out("\tpbd->hdr.mem = %s_blob;\n", opts.prefix); + pr_out("\tpbd->hdr.bsize = sizeof(%s_blob);\n", + opts.prefix); + pr_out("\tpbd->hdr.nr_gotpcrel = %s_nr_gotpcrel;\n", opts.prefix); + pr_out("\tif (compel_mode_native(ctl))\n"); + pr_out("\t\tpbd->hdr.parasite_ip_off = " + "%s_sym__export_parasite_head_start;\n", opts.prefix); + pr_out("#ifdef CONFIG_COMPAT\n"); + pr_out("\telse\n"); + pr_out("\t\tpbd->hdr.parasite_ip_off = " + "%s_sym__export_parasite_head_start_compat;\n", opts.prefix); + pr_out("#endif /* CONFIG_COMPAT */\n"); + pr_out("\tpbd->hdr.addr_cmd_off = " + "%s_sym__export_parasite_cmd;\n", opts.prefix); + pr_out("\tpbd->hdr.addr_arg_off = " + "%s_sym__export_parasite_args;\n", opts.prefix); + pr_out("\tpbd->hdr.relocs = %s_relocs;\n", opts.prefix); + pr_out("\tpbd->hdr.nr_relocs = " + "sizeof(%s_relocs) / sizeof(%s_relocs[0]);\n", + opts.prefix, opts.prefix); + pr_out("}\n"); + ret = 0; +err: + free(sec_hdrs); + return ret; +} diff --git a/CRIU_code/compel/src/lib/infect-rpc.c b/CRIU_code/compel/src/lib/infect-rpc.c new file mode 100644 index 0000000..265a4ad --- /dev/null +++ b/CRIU_code/compel/src/lib/infect-rpc.c @@ -0,0 +1,101 @@ +#include "log.h" +#include "common/bug.h" +#include "common/xmalloc.h" +#include "common/lock.h" + +#include "infect.h" +#include "infect-priv.h" +#include "infect-rpc.h" +#include "rpc-pie-priv.h" + +static int __parasite_send_cmd(int sockfd, struct ctl_msg *m) +{ + int ret; + + BUILD_BUG_ON(PARASITE_USER_CMDS < __PARASITE_END_CMDS); + + ret = send(sockfd, m, sizeof(*m), 0); + if (ret == -1) { + pr_perror("Failed to send command %d to daemon", m->cmd); + return -1; + } else if (ret != sizeof(*m)) { + pr_err("Message to daemon is trimmed 
(%d/%d)\n", + (int)sizeof(*m), ret); + return -1; + } + + pr_debug("Sent msg to daemon %d %d %d\n", m->cmd, m->ack, m->err); + return 0; +} + +int parasite_wait_ack(int sockfd, unsigned int cmd, struct ctl_msg *m) +{ + int ret; + + pr_debug("Wait for ack %d on daemon socket\n", cmd); + + while (1) { + memzero(m, sizeof(*m)); + + ret = recv(sockfd, m, sizeof(*m), MSG_WAITALL); + if (ret == -1) { + pr_perror("Failed to read ack"); + return -1; + } else if (ret != sizeof(*m)) { + pr_err("Message reply from daemon is trimmed (%d/%d)\n", + (int)sizeof(*m), ret); + return -1; + } + pr_debug("Fetched ack: %d %d %d\n", + m->cmd, m->ack, m->err); + + if (m->cmd != cmd || m->ack != cmd) { + pr_err("Communication error, this is not " + "the ack we expected\n"); + return -1; + } + return 0; + } + + return -1; +} + +int compel_rpc_sync(unsigned int cmd, struct parasite_ctl *ctl) +{ + struct ctl_msg m; + + if (parasite_wait_ack(ctl->tsock, cmd, &m)) + return -1; + + if (m.err != 0) { + pr_err("Command %d for daemon failed with %d\n", + cmd, m.err); + return -1; + } + + return 0; +} + +int compel_rpc_call(unsigned int cmd, struct parasite_ctl *ctl) +{ + struct ctl_msg m; + + m = ctl_msg_cmd(cmd); + return __parasite_send_cmd(ctl->tsock, &m); +} + +int compel_rpc_call_sync(unsigned int cmd, struct parasite_ctl *ctl) +{ + int ret; + + ret = compel_rpc_call(cmd, ctl); + if (!ret) + ret = compel_rpc_sync(cmd, ctl); + + return ret; +} + +int compel_rpc_sock(struct parasite_ctl *ctl) +{ + return ctl->tsock; +} diff --git a/CRIU_code/compel/src/lib/infect-util.c b/CRIU_code/compel/src/lib/infect-util.c new file mode 100644 index 0000000..5d6d0dd --- /dev/null +++ b/CRIU_code/compel/src/lib/infect-util.c @@ -0,0 +1,32 @@ +#include "log.h" +#include "common/bug.h" +#include "common/lock.h" + +#include "uapi/compel/plugins/std/fds.h" + +#include "infect-rpc.h" +#include "infect-util.h" + +int compel_util_send_fd(struct parasite_ctl *ctl, int fd) +{ + int sk; + + sk = compel_rpc_sock(ctl); 
/*
 * compel_util_recv_fd - receive a file descriptor over the parasite
 * RPC socket.
 *
 * @ctl: parasite control block; its socket is taken from compel_rpc_sock()
 * @pfd: receives the value returned by recv_fd() (negative on failure)
 *
 * Returns 0 on success, -1 when recv_fd() reported an error.
 */
int compel_util_recv_fd(struct parasite_ctl *ctl, int *pfd)
{
	int sk = compel_rpc_sock(ctl);

	*pfd = recv_fd(sk);
	if (*pfd < 0) {
		/* This is the receive path; the message must not say "send". */
		pr_perror("Can't receive file descriptor");
		return -1;
	}

	return 0;
}
/*
 * compel_interrupt_task - attach to @pid with PTRACE_SEIZE and request
 * a stop with PTRACE_INTERRUPT.
 *
 * Returns the result of the failing ptrace() call when one of the two
 * requests fails, otherwise the (zero) result of PTRACE_INTERRUPT.
 * When the interrupt request fails the task is detached again before
 * returning.
 */
int compel_interrupt_task(int pid)
{
	int err;

	err = ptrace(PTRACE_SEIZE, pid, NULL, 0);
	if (err != 0) {
		/*
		 * The ptrace API gives no way to tell "attaching to a
		 * zombie" apart from other errors, so only warn here;
		 * compel_wait_task() sorts the errors out.
		 */
		pr_warn("Unable to interrupt task: %d (%s)\n", pid, strerror(errno));
		return err;
	}

	/*
	 * The task is seized -- stop it before anyone reads its state
	 * from proc.  Otherwise it may die _while_ that happens and the
	 * seize/state pair would be inconsistent.
	 *
	 * A task that dies between the seize above and this interrupt
	 * is noticed later via proc.
	 */
	err = ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);
	if (err >= 0)
		return err;

	pr_warn("SEIZE %d: can't interrupt task: %s\n", pid, strerror(errno));
	if (ptrace(PTRACE_DETACH, pid, NULL, NULL))
		pr_perror("Unable to detach from %d", pid);

	return err;
}
/*
 * compel_wait_task - wait for a task that was seized with ptrace and
 * classify the state it was caught in.
 *
 * @pid:         task to wait for
 * @ppid:        expected parent pid, or -1 to skip the parent check
 * @get_status:  callback that fills @ss for @pid; a non-zero return
 *               aborts the wait
 * @free_status: optional callback (may be NULL) invoked on @ss before
 *               the wait is retried
 * @ss:          status storage handed to the callbacks
 * @data:        opaque pointer handed to the callbacks
 *
 * Returns COMPEL_TASK_ZOMBIE, COMPEL_TASK_DEAD, COMPEL_TASK_STOPPED or
 * COMPEL_TASK_ALIVE on success and -1 on failure.  On the "err" and
 * "err_stop" paths the task is detached before returning; the early
 * "unseizable non-zombie" return does not detach.
 */
int compel_wait_task(int pid, int ppid,
		int (*get_status)(int pid, struct seize_task_status *, void *),
		void (*free_status)(int pid, struct seize_task_status *, void *),
		struct seize_task_status *ss, void *data)
{
	siginfo_t si;
	int status, nr_sigstop;
	int ret = 0, ret2, wait_errno = 0;

	/*
	 * The ptrace API does not let us distinguish attaching to a
	 * zombie from other errors, so the task state is obtained
	 * through get_status() and consulted whenever wait4() fails or
	 * reports that the task is gone.
	 */

try_again:

	ret = wait4(pid, &status, __WALL, NULL);
	if (ret < 0) {
		/*
		 * wait4() is expected to fail only on the first pass,
		 * when the task is a zombie.  Arriving here again via
		 * try_again means we are tracing the task, so this
		 * branch is taken at most once per call.
		 *
		 * errno is saved now because get_status() runs before
		 * it gets reported.
		 */
		wait_errno = errno;
	}

	ret2 = get_status(pid, ss, data);
	if (ret2)
		goto err;

	/* "ret < 0" is tested first: status is not valid when wait4() failed. */
	if (ret < 0 || WIFEXITED(status) || WIFSIGNALED(status)) {
		if (ss->state != 'Z') {
			if (pid == getpid())
				pr_err("The criu itself is within dumped tree.\n");
			else
				pr_err("Unseizable non-zombie %d found, state %c, err %d/%d\n",
						pid, ss->state, ret, wait_errno);
			return -1;
		}

		/* State is 'Z': wait4() failure => zombie, otherwise dead. */
		if (ret < 0)
			return COMPEL_TASK_ZOMBIE;
		else
			return COMPEL_TASK_DEAD;
	}

	/* A different parent than expected means the pid was reused. */
	if ((ppid != -1) && (ss->ppid != ppid)) {
		pr_err("Task pid reused while suspending (%d: %d -> %d)\n",
				pid, ppid, ss->ppid);
		goto err;
	}

	if (!WIFSTOPPED(status)) {
		pr_err("SEIZE %d: task not stopped after seize\n", pid);
		goto err;
	}

	ret = ptrace(PTRACE_GETSIGINFO, pid, NULL, &si);
	if (ret < 0) {
		pr_perror("SEIZE %d: can't read signfo", pid);
		goto err;
	}

	if (PTRACE_SI_EVENT(si.si_code) != PTRACE_EVENT_STOP) {
		/*
		 * The kernel reports that the seized task received an
		 * event other than the STOP one, i.e. a signal.  Let
		 * the task handle it (the signal number is re-injected
		 * via PTRACE_CONT) and wait again.
		 */

		if (ptrace(PTRACE_CONT, pid, NULL,
					(void *)(unsigned long)si.si_signo)) {
			pr_perror("Can't continue signal handling, aborting");
			goto err;
		}

		/* Release what get_status() produced before it is called again. */
		if (free_status)
			free_status(pid, ss, data);
		goto try_again;
	}

	if (ss->seccomp_mode != SECCOMP_MODE_DISABLED && ptrace_suspend_seccomp(pid) < 0)
		goto err;

	/*
	 * Count the SIGSTOPs that have to be consumed: one pending on
	 * the thread, one pending on the group, and the one the task is
	 * currently stopped with.
	 */
	nr_sigstop = 0;
	if (ss->sigpnd & (1 << (SIGSTOP - 1)))
		nr_sigstop++;
	if (ss->shdpnd & (1 << (SIGSTOP - 1)))
		nr_sigstop++;
	if (si.si_signo == SIGSTOP)
		nr_sigstop++;

	if (nr_sigstop) {
		if (skip_sigstop(pid, nr_sigstop))
			goto err_stop;

		return COMPEL_TASK_STOPPED;
	}

	if (si.si_signo == SIGTRAP)
		return COMPEL_TASK_ALIVE;
	else {
		pr_err("SEIZE %d: unsupported stop signal %d\n", pid, si.si_signo);
		goto err;
	}

err_stop:
	/* Re-send SIGSTOP to the task before detaching from it. */
	kill(pid, SIGSTOP);
err:
	if (ptrace(PTRACE_DETACH, pid, NULL, NULL))
		pr_perror("Unable to detach from %d", pid);
	return -1;
}
+ */ + if (orig_st == COMPEL_TASK_STOPPED) + kill(pid, SIGSTOP); + } else + pr_err("Unknown final state %d\n", st); + + if (ptrace(PTRACE_DETACH, pid, NULL, NULL)) { + pr_perror("Unable to detach from %d", pid); + return -1; + } + + return 0; +} + +static int gen_parasite_saddr(struct sockaddr_un *saddr, int key) +{ + int sun_len; + + saddr->sun_family = AF_UNIX; + snprintf(saddr->sun_path, UNIX_PATH_MAX, + "X/crtools-pr-%d", key); + + sun_len = SUN_LEN(saddr); + *saddr->sun_path = '\0'; + + return sun_len; +} + +static int prepare_tsock(struct parasite_ctl *ctl, pid_t pid, + struct parasite_init_args *args) +{ + int ssock = -1; + socklen_t sk_len; + struct sockaddr_un addr; + + pr_info("Putting tsock into pid %d\n", pid); + args->h_addr_len = gen_parasite_saddr(&args->h_addr, getpid()); + + ssock = ctl->ictx.sock; + sk_len = sizeof(addr); + + if (ssock == -1) { + pr_err("No socket in ictx\n"); + goto err; + } + + if (getsockname(ssock, (struct sockaddr *) &addr, &sk_len) < 0) { + pr_perror("Unable to get name for a socket"); + return -1; + } + + if (sk_len == sizeof(addr.sun_family)) { + if (bind(ssock, (struct sockaddr *)&args->h_addr, args->h_addr_len) < 0) { + pr_perror("Can't bind socket"); + goto err; + } + + if (listen(ssock, 1)) { + pr_perror("Can't listen on transport socket"); + goto err; + } + } + + /* Check a case when parasite can't initialize a command socket */ + if (ctl->ictx.flags & INFECT_FAIL_CONNECT) + args->h_addr_len = gen_parasite_saddr(&args->h_addr, getpid() + 1); + + /* + * Set to -1 to prevent any accidental misuse. The + * only valid user of it is accept_tsock(). 
+ */ + ctl->tsock = -ssock; + return 0; +err: + close_safe(&ssock); + return -1; +} + +static int setup_child_handler(struct parasite_ctl *ctl) +{ + struct sigaction sa = { + .sa_sigaction = ctl->ictx.child_handler, + .sa_flags = SA_SIGINFO | SA_RESTART, + }; + + sigemptyset(&sa.sa_mask); + sigaddset(&sa.sa_mask, SIGCHLD); + if (sigaction(SIGCHLD, &sa, NULL)) { + pr_perror("Unable to setup SIGCHLD handler"); + return -1; + } + + return 0; +} + +static int restore_child_handler(struct parasite_ctl *ctl) +{ + if (sigaction(SIGCHLD, &ctl->ictx.orig_handler, NULL)) { + pr_perror("Unable to setup SIGCHLD handler"); + return -1; + } + + return 0; +} + +static int parasite_run(pid_t pid, int cmd, unsigned long ip, void *stack, + user_regs_struct_t *regs, struct thread_ctx *octx) +{ + k_rtsigset_t block; + + ksigfillset(&block); + if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &block)) { + pr_perror("Can't block signals for %d", pid); + goto err_sig; + } + + parasite_setup_regs(ip, stack, regs); + if (ptrace_set_regs(pid, regs)) { + pr_perror("Can't set registers for %d", pid); + goto err_regs; + } + + if (ptrace(cmd, pid, NULL, NULL)) { + pr_perror("Can't run parasite at %d", pid); + goto err_cont; + } + + return 0; + +err_cont: + if (ptrace_set_regs(pid, &octx->regs)) + pr_perror("Can't restore regs for %d", pid); +err_regs: + if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &octx->sigmask)) + pr_perror("Can't restore sigmask for %d", pid); +err_sig: + return -1; +} + +static int restore_thread_ctx(int pid, struct thread_ctx *ctx) +{ + int ret = 0; + + if (ptrace_set_regs(pid, &ctx->regs)) { + pr_perror("Can't restore registers (pid: %d)", pid); + ret = -1; + } + if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &ctx->sigmask)) { + pr_perror("Can't block signals"); + ret = -1; + } + + return ret; +} + + +/* we run at @regs->ip */ +static int parasite_trap(struct parasite_ctl *ctl, pid_t pid, + user_regs_struct_t *regs, + struct thread_ctx *octx) 
+{ + siginfo_t siginfo; + int status; + int ret = -1; + + /* + * Most ideas are taken from Tejun Heo's parasite thread + * https://code.google.com/p/ptrace-parasite/ + */ + + if (wait4(pid, &status, __WALL, NULL) != pid) { + pr_perror("Waited pid mismatch (pid: %d)", pid); + goto err; + } + + if (!WIFSTOPPED(status)) { + pr_err("Task is still running (pid: %d)\n", pid); + goto err; + } + + if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &siginfo)) { + pr_perror("Can't get siginfo (pid: %d)", pid); + goto err; + } + + if (ptrace_get_regs(pid, regs)) { + pr_perror("Can't obtain registers (pid: %d)", pid); + goto err; + } + + if (WSTOPSIG(status) != SIGTRAP || siginfo.si_code != ARCH_SI_TRAP) { + pr_debug("** delivering signal %d si_code=%d\n", + siginfo.si_signo, siginfo.si_code); + + pr_err("Unexpected %d task interruption, aborting\n", pid); + goto err; + } + + /* + * We've reached this point if int3 is triggered inside our + * parasite code. So we're done. + */ + ret = 0; +err: + if (restore_thread_ctx(pid, octx)) + ret = -1; + + return ret; +} + + +int compel_execute_syscall(struct parasite_ctl *ctl, + user_regs_struct_t *regs, const char *code_syscall) +{ + pid_t pid = ctl->rpid; + int err; + uint8_t code_orig[BUILTIN_SYSCALL_SIZE]; + + /* + * Inject syscall instruction and remember original code, + * we will need it to restore original program content. 
+ */ + memcpy(code_orig, code_syscall, sizeof(code_orig)); + if (ptrace_swap_area(pid, (void *)ctl->ictx.syscall_ip, + (void *)code_orig, sizeof(code_orig))) { + pr_err("Can't inject syscall blob (pid: %d)\n", pid); + return -1; + } + + err = parasite_run(pid, PTRACE_CONT, ctl->ictx.syscall_ip, 0, regs, &ctl->orig); + if (!err) + err = parasite_trap(ctl, pid, regs, &ctl->orig); + + if (ptrace_poke_area(pid, (void *)code_orig, + (void *)ctl->ictx.syscall_ip, sizeof(code_orig))) { + pr_err("Can't restore syscall blob (pid: %d)\n", ctl->rpid); + err = -1; + } + + return err; +} + +int compel_run_at(struct parasite_ctl *ctl, unsigned long ip, user_regs_struct_t *ret_regs) +{ + user_regs_struct_t regs = ctl->orig.regs; + int ret; + + ret = parasite_run(ctl->rpid, PTRACE_CONT, ip, 0, ®s, &ctl->orig); + if (!ret) + ret = parasite_trap(ctl, ctl->rpid, ret_regs ? ret_regs : ®s, &ctl->orig); + return ret; +} + +static int accept_tsock(struct parasite_ctl *ctl) +{ + int sock; + int ask = -ctl->tsock; /* this '-' is explained above */ + + sock = accept(ask, NULL, 0); + if (sock < 0) { + pr_perror("Can't accept connection to the transport socket"); + close(ask); + return -1; + } + + ctl->tsock = sock; + return 0; +} + +static int parasite_init_daemon(struct parasite_ctl *ctl) +{ + struct parasite_init_args *args; + pid_t pid = ctl->rpid; + user_regs_struct_t regs; + struct ctl_msg m = { }; + + *ctl->addr_cmd = PARASITE_CMD_INIT_DAEMON; + + args = compel_parasite_args(ctl, struct parasite_init_args); + + args->sigframe = (uintptr_t)ctl->rsigframe; + args->log_level = compel_log_get_loglevel(); +#ifdef ARCH_HAS_LONG_PAGES + args->page_size = PAGE_SIZE; +#endif + + futex_set(&args->daemon_connected, 0); + + if (prepare_tsock(ctl, pid, args)) + goto err; + + /* after this we can catch parasite errors in chld handler */ + if (setup_child_handler(ctl)) + goto err; + + regs = ctl->orig.regs; + if (parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, ctl->rstack, ®s, &ctl->orig)) + goto 
err; + + futex_wait_while_eq(&args->daemon_connected, 0); + if (futex_get(&args->daemon_connected) != 1) { + errno = -(int)futex_get(&args->daemon_connected); + pr_perror("Unable to connect a transport socket"); + goto err; + } + + if (accept_tsock(ctl) < 0) + goto err; + + if (compel_util_send_fd(ctl, ctl->ictx.log_fd)) + goto err; + + pr_info("Wait for parasite being daemonized...\n"); + + if (parasite_wait_ack(ctl->tsock, PARASITE_CMD_INIT_DAEMON, &m)) { + pr_err("Can't switch parasite %d to daemon mode %d\n", + pid, m.err); + goto err; + } + + ctl->sigreturn_addr = (void*)(uintptr_t)args->sigreturn_addr; + ctl->daemonized = true; + pr_info("Parasite %d has been switched to daemon mode\n", pid); + return 0; +err: + return -1; +} + +static int parasite_start_daemon(struct parasite_ctl *ctl) +{ + pid_t pid = ctl->rpid; + struct infect_ctx *ictx = &ctl->ictx; + + /* + * Get task registers before going daemon, since the + * compel_get_task_regs needs to call ptrace on _stopped_ task, + * while in daemon it is not such. 
+ */ + + if (get_task_regs(pid, &ctl->orig.regs, ictx->save_regs, + ictx->regs_arg, ictx->flags)) { + pr_err("Can't obtain regs for thread %d\n", pid); + return -1; + } + + if (ictx->make_sigframe(ictx->regs_arg, ctl->sigframe, ctl->rsigframe, &ctl->orig.sigmask)) + return -1; + + if (parasite_init_daemon(ctl)) + return -1; + + return 0; +} + +static int parasite_mmap_exchange(struct parasite_ctl *ctl, unsigned long size) +{ + int fd; + + ctl->remote_map = remote_mmap(ctl, NULL, size, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_ANONYMOUS | MAP_SHARED, -1, 0); + if (!ctl->remote_map) { + pr_err("Can't allocate memory for parasite blob (pid: %d)\n", ctl->rpid); + return -1; + } + + ctl->map_length = round_up(size, page_size()); + + fd = ctl->ictx.open_proc(ctl->rpid, O_RDWR, "map_files/%lx-%lx", + (long)ctl->remote_map, (long)ctl->remote_map + ctl->map_length); + if (fd < 0) + return -1; + + ctl->local_map = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FILE, fd, 0); + close(fd); + + if (ctl->local_map == MAP_FAILED) { + ctl->local_map = NULL; + pr_perror("Can't map remote parasite map"); + return -1; + } + + return 0; +} + +static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size) +{ + void *where = (void *)ctl->ictx.syscall_ip + BUILTIN_SYSCALL_SIZE; + uint8_t orig_code[MEMFD_FNAME_SZ] = MEMFD_FNAME; + pid_t pid = ctl->rpid; + long sret = -ENOSYS; + int ret, fd, lfd; + bool __maybe_unused compat_task = !compel_mode_native(ctl); + + if (ctl->ictx.flags & INFECT_NO_MEMFD) + return 1; + + BUILD_BUG_ON(sizeof(orig_code) < sizeof(long)); + + if (ptrace_swap_area(pid, where, (void *)orig_code, sizeof(orig_code))) { + pr_err("Can't inject memfd args (pid: %d)\n", pid); + return -1; + } + + ret = compel_syscall(ctl, __NR(memfd_create, compat_task), &sret, + (unsigned long)where, 0, 0, 0, 0, 0); + + if (ptrace_poke_area(pid, orig_code, where, sizeof(orig_code))) { + fd = (int)(long)sret; + if (fd >= 0) + compel_syscall(ctl, __NR(close, 
compat_task), &sret, + fd, 0, 0, 0, 0, 0); + pr_err("Can't restore memfd args (pid: %d)\n", pid); + return -1; + } + + if (ret < 0) + return ret; + + fd = (int)(long)sret; + if (fd == -ENOSYS) + return 1; + if (fd < 0) { + errno = -fd; + pr_perror("Can't create memfd in victim"); + return fd; + } + + ctl->map_length = round_up(size, page_size()); + lfd = ctl->ictx.open_proc(ctl->rpid, O_RDWR, "fd/%d", fd); + if (lfd < 0) + goto err_cure; + + if (ftruncate(lfd, ctl->map_length) < 0) { + pr_perror("Fail to truncate memfd for parasite"); + goto err_cure; + } + + ctl->remote_map = remote_mmap(ctl, NULL, size, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_FILE | MAP_SHARED, fd, 0); + if (!ctl->remote_map) { + pr_err("Can't rmap memfd for parasite blob\n"); + goto err_curef; + } + + ctl->local_map = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FILE, lfd, 0); + if (ctl->local_map == MAP_FAILED) { + ctl->local_map = NULL; + pr_perror("Can't lmap memfd for parasite blob"); + goto err_curef; + } + + compel_syscall(ctl, __NR(close, compat_task), &sret, fd, 0, 0, 0, 0, 0); + close(lfd); + + pr_info("Set up parasite blob using memfd\n"); + return 0; + +err_curef: + close(lfd); +err_cure: + compel_syscall(ctl, __NR(close, compat_task), &sret, fd, 0, 0, 0, 0, 0); + return -1; +} + +void compel_relocs_apply(void *mem, void *vbase, size_t size, compel_reloc_t *elf_relocs, size_t nr_relocs) +{ + size_t i, j; + + for (i = 0, j = 0; i < nr_relocs; i++) { + if (elf_relocs[i].type & COMPEL_TYPE_LONG) { + long *where = mem + elf_relocs[i].offset; + long *p = mem + size; + + if (elf_relocs[i].type & COMPEL_TYPE_GOTPCREL) { + int *value = (int *)where; + int rel; + + p[j] = (long)vbase + elf_relocs[i].value; + rel = (unsigned)((void *)&p[j] - (void *)mem) - elf_relocs[i].offset + elf_relocs[i].addend; + + *value = rel; + j++; + } else + *where = elf_relocs[i].value + elf_relocs[i].addend + (unsigned long)vbase; + } else if (elf_relocs[i].type & COMPEL_TYPE_INT) { + int *where 
/*
 * compel_map_exchange - set up the memory area shared with the victim.
 *
 * The memfd based method is tried first.  When it returns 1 the legacy
 * mmap based method is used instead; any other result (success or
 * failure) is passed to the caller unchanged.
 */
static int compel_map_exchange(struct parasite_ctl *ctl, unsigned long size)
{
	int rc = parasite_memfd_exchange(ctl, size);

	if (rc != 1)
		return rc;

	pr_info("MemFD parasite doesn't work, goto legacy mmap\n");
	return parasite_mmap_exchange(ctl, size);
}
+ */ + + parasite_size = total_pie_size(ctl->pblob.hdr.bsize); + + ctl->args_size = round_up(args_size, PAGE_SIZE); + parasite_size += ctl->args_size; + + map_exchange_size = parasite_size; + map_exchange_size += RESTORE_STACK_SIGFRAME + PARASITE_STACK_SIZE; + if (nr_threads > 1) + map_exchange_size += PARASITE_STACK_SIZE; + + ret = compel_map_exchange(ctl, map_exchange_size); + if (ret) + goto err; + + pr_info("Putting parasite blob into %p->%p\n", ctl->local_map, ctl->remote_map); + + ctl->parasite_ip = (unsigned long)(ctl->remote_map + ctl->pblob.hdr.parasite_ip_off); + ctl->addr_cmd = ctl->local_map + ctl->pblob.hdr.addr_cmd_off; + ctl->addr_args = ctl->local_map + ctl->pblob.hdr.addr_arg_off; + + memcpy(ctl->local_map, ctl->pblob.hdr.mem, ctl->pblob.hdr.bsize); + if (ctl->pblob.hdr.nr_relocs) + compel_relocs_apply(ctl->local_map, ctl->remote_map, ctl->pblob.hdr.bsize, + ctl->pblob.hdr.relocs, ctl->pblob.hdr.nr_relocs); + + p = parasite_size; + + ctl->rsigframe = ctl->remote_map + p; + ctl->sigframe = ctl->local_map + p; + + p += RESTORE_STACK_SIGFRAME; + p += PARASITE_STACK_SIZE; + ctl->rstack = ctl->remote_map + p; + + if (nr_threads > 1) { + p += PARASITE_STACK_SIZE; + ctl->r_thread_stack = ctl->remote_map + p; + } + + ret = arch_fetch_sas(ctl, ctl->rsigframe); + if (ret) { + pr_err("Can't fetch sigaltstack for task %d (ret %d)\n", + ctl->rpid, ret); + goto err; + } + + if (parasite_start_daemon(ctl)) + goto err; + + return 0; + +err: + return -1; +} + +struct parasite_thread_ctl *compel_prepare_thread(struct parasite_ctl *ctl, int pid) +{ + struct parasite_thread_ctl *tctl; + + tctl = xmalloc(sizeof(*tctl)); + if (tctl) { + if (prepare_thread(pid, &tctl->th)) { + xfree(tctl); + tctl = NULL; + } else { + tctl->tid = pid; + tctl->ctl = ctl; + } + } + + return tctl; +} + +static int prepare_thread(int pid, struct thread_ctx *ctx) +{ + if (ptrace(PTRACE_GETSIGMASK, pid, sizeof(k_rtsigset_t), &ctx->sigmask)) { + pr_perror("can't get signal blocking mask for %d", 
pid); + return -1; + } + + if (ptrace_get_regs(pid, &ctx->regs)) { + pr_perror("Can't obtain registers (pid: %d)", pid); + return -1; + } + + return 0; +} + +void compel_release_thread(struct parasite_thread_ctl *tctl) +{ + /* + * No stuff to cure in thread here, all routines leave the + * guy intact (for now) + */ + xfree(tctl); +} + +struct parasite_ctl *compel_prepare_noctx(int pid) +{ + struct parasite_ctl *ctl = NULL; + + /* + * Control block early setup. + */ + ctl = xzalloc(sizeof(*ctl)); + if (!ctl) { + pr_err("Parasite control block allocation failed (pid: %d)\n", pid); + goto err; + } + + ctl->tsock = -1; + ctl->ictx.log_fd = -1; + + if (prepare_thread(pid, &ctl->orig)) + goto err; + + ctl->rpid = pid; + + BUILD_BUG_ON(PARASITE_START_AREA_MIN < BUILTIN_SYSCALL_SIZE + MEMFD_FNAME_SZ); + + return ctl; + +err: + xfree(ctl); + return NULL; +} + +/* + * Find first executable VMA that would fit the initial + * syscall injection. + */ +static unsigned long find_executable_area(int pid) +{ + char aux[128]; + FILE *f; + unsigned long ret = (unsigned long)MAP_FAILED; + + sprintf(aux, "/proc/%d/maps", pid); + f = fopen(aux, "r"); + if (!f) + goto out; + + while (fgets(aux, sizeof(aux), f)) { + unsigned long start, end; + char *f; + + start = strtoul(aux, &f, 16); + end = strtoul(f + 1, &f, 16); + + /* f now points at " rwx" (yes, with space) part */ + if (f[3] == 'x') { + BUG_ON(end - start < PARASITE_START_AREA_MIN); + ret = start; + break; + } + } + + fclose(f); +out: + return ret; +} + +/* + * This routine is to create PF_UNIX/SOCK_SEQPACKET socket + * in the target net namespace + */ +static int make_sock_for(int pid) +{ + int ret, mfd, fd, sk = -1; + char p[32]; + + pr_debug("Preparing seqsk for %d\n", pid); + + sprintf(p, "/proc/%d/ns/net", pid); + fd = open(p, O_RDONLY); + if (fd < 0) { + pr_perror("Can't open %p", p); + goto out; + } + + mfd = open("/proc/self/ns/net", O_RDONLY); + if (mfd < 0) { + pr_perror("Can't open self netns"); + goto out_c; + } + + if 
(setns(fd, CLONE_NEWNET)) { + pr_perror("Can't setup target netns"); + goto out_cm; + } + + sk = socket(PF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK, 0); + if (sk < 0) + pr_perror("Can't create seqsk"); + + ret = setns(mfd, CLONE_NEWNET); + if (ret) { + pr_perror("Can't restore former netns"); + if (sk >= 0) + close(sk); + sk = -1; + } +out_cm: + close(mfd); +out_c: + close(fd); +out: + return sk; +} + +static int simple_open_proc(int pid, int mode, const char *fmt, ...) +{ + int l; + char path[128]; + va_list args; + + l = sprintf(path, "/proc/%d/", pid); + + va_start(args, fmt); + vsnprintf(path + l, sizeof(path) - l, fmt, args); + va_end(args); + + return open(path, mode); +} + +static void handle_sigchld(int signal, siginfo_t *siginfo, void *data) +{ + int pid, status; + + pid = waitpid(-1, &status, WNOHANG); + if (pid <= 0) + return; + + pr_err("si_code=%d si_pid=%d si_status=%d\n", + siginfo->si_code, siginfo->si_pid, siginfo->si_status); + + if (WIFEXITED(status)) + pr_err("%d exited with %d unexpectedly\n", pid, WEXITSTATUS(status)); + else if (WIFSIGNALED(status)) + pr_err("%d was killed by %d unexpectedly: %s\n", + pid, WTERMSIG(status), strsignal(WTERMSIG(status))); + else if (WIFSTOPPED(status)) + pr_err("%d was stopped by %d unexpectedly\n", pid, WSTOPSIG(status)); + + /* FIXME Should we exit? */ + /* exit(1); */ +} + +struct plain_regs_struct { + user_regs_struct_t regs; + user_fpregs_struct_t fpregs; +}; + +static int save_regs_plain(void *to, user_regs_struct_t *r, user_fpregs_struct_t *f) +{ + struct plain_regs_struct *prs = to; + + prs->regs = *r; + prs->fpregs = *f; + + return 0; +} + +static int make_sigframe_plain(void *from, struct rt_sigframe *f, struct rt_sigframe *rtf, k_rtsigset_t *b) +{ + struct plain_regs_struct *prs = from; + + /* + * Make sure it's zeroified. 
+ */ + memset(f, 0, sizeof(*f)); + + if (sigreturn_prep_regs_plain(f, &prs->regs, &prs->fpregs)) + return -1; + + if (b) + rt_sigframe_copy_sigset(f, b); + + if (RT_SIGFRAME_HAS_FPU(f)) { + if (sigreturn_prep_fpu_frame_plain(f, rtf)) + return -1; + } + + /* + * FIXME What about sas? + * setup_sas(sigframe, core->thread_core->sas); + */ + + return 0; +} + +struct parasite_ctl *compel_prepare(int pid) +{ + struct parasite_ctl *ctl; + struct infect_ctx *ictx; + + ctl = compel_prepare_noctx(pid); + if (ctl == NULL) + goto out; + + ictx = &ctl->ictx; + ictx->task_size = compel_task_size(); + ictx->open_proc = simple_open_proc; + ictx->syscall_ip = find_executable_area(pid); + ictx->child_handler = handle_sigchld; + sigaction(SIGCHLD, NULL, &ictx->orig_handler); + + ictx->save_regs = save_regs_plain; + ictx->make_sigframe = make_sigframe_plain; + ictx->regs_arg = xmalloc(sizeof(struct plain_regs_struct)); + if (ictx->regs_arg == NULL) + goto err; + + if (ictx->syscall_ip == (unsigned long)MAP_FAILED) + goto err; + ictx->sock = make_sock_for(pid); + if (ictx->sock < 0) + goto err; + +out: + return ctl; + +err: + xfree(ictx->regs_arg); + xfree(ctl); + ctl = NULL; + goto out; +} + +static bool task_in_parasite(struct parasite_ctl *ctl, user_regs_struct_t *regs) +{ + void *addr = (void *) REG_IP(*regs); + return addr >= ctl->remote_map && + addr < ctl->remote_map + ctl->map_length; +} + +static int parasite_fini_seized(struct parasite_ctl *ctl) +{ + pid_t pid = ctl->rpid; + user_regs_struct_t regs; + int status, ret = 0; + enum trace_flags flag; + + /* stop getting chld from parasite -- we're about to step-by-step it */ + if (restore_child_handler(ctl)) + return -1; + + /* Start to trace syscalls for each thread */ + if (ptrace(PTRACE_INTERRUPT, pid, NULL, NULL)) { + pr_perror("Unable to interrupt the process"); + return -1; + } + + pr_debug("Waiting for %d to trap\n", pid); + if (wait4(pid, &status, __WALL, NULL) != pid) { + pr_perror("Waited pid mismatch (pid: %d)", pid); 
+ return -1; + } + + pr_debug("Daemon %d exited trapping\n", pid); + if (!WIFSTOPPED(status)) { + pr_err("Task is still running (pid: %d)\n", pid); + return -1; + } + + ret = ptrace_get_regs(pid, ®s); + if (ret) { + pr_perror("Unable to get registers"); + return -1; + } + + if (!task_in_parasite(ctl, ®s)) { + pr_err("The task is not in parasite code\n"); + return -1; + } + + ret = compel_rpc_call(PARASITE_CMD_FINI, ctl); + close_safe(&ctl->tsock); + if (ret) + return -1; + + /* Go to sigreturn as closer as we can */ + ret = compel_stop_pie(pid, ctl->sigreturn_addr, &flag, + ctl->ictx.flags & INFECT_NO_BREAKPOINTS); + if (ret < 0) + return ret; + + if (compel_stop_on_syscall(1, __NR(rt_sigreturn, 0), + __NR(rt_sigreturn, 1), flag)) + return -1; + + if (ptrace_flush_breakpoints(pid)) + return -1; + + /* + * All signals are unblocked now. The kernel notifies about leaving + * syscall before starting to deliver signals. All parasite code are + * executed with blocked signals, so we can sefly unmap a parasite blob. + */ + + return 0; +} + +int compel_stop_daemon(struct parasite_ctl *ctl) +{ + if (ctl->daemonized) { + /* + * Looks like a previous attempt failed, we should do + * nothing in this case. parasite will try to cure itself. 
+ */ + if (ctl->tsock < 0) + return -1; + + if (parasite_fini_seized(ctl)) { + close_safe(&ctl->tsock); + return -1; + } + } + + ctl->daemonized = false; + + return 0; +} + +int compel_cure_remote(struct parasite_ctl *ctl) +{ + long ret; + + if (compel_stop_daemon(ctl)) + return -1; + + if (!ctl->remote_map) + return 0; + + compel_syscall(ctl, __NR(munmap, !compel_mode_native(ctl)), &ret, + (unsigned long)ctl->remote_map, ctl->map_length, + 0, 0, 0, 0); + if (ret) { + pr_err("munmap for remote map %p, %lu returned %lu\n", + ctl->remote_map, ctl->map_length, ret); + return -1; + } + + return 0; +} + +int compel_cure_local(struct parasite_ctl *ctl) +{ + int ret = 0; + + if (ctl->local_map) { + if (munmap(ctl->local_map, ctl->map_length)) { + pr_err("munmap failed (pid: %d)\n", ctl->rpid); + ret = -1; + } + } + + free(ctl); + return ret; +} + +int compel_cure(struct parasite_ctl *ctl) +{ + int ret; + + ret = compel_cure_remote(ctl); + if (!ret) + ret = compel_cure_local(ctl); + + return ret; +} + +void *compel_parasite_args_p(struct parasite_ctl *ctl) +{ + return ctl->addr_args; +} + +void *compel_parasite_args_s(struct parasite_ctl *ctl, unsigned long args_size) +{ + BUG_ON(args_size > ctl->args_size); + return compel_parasite_args_p(ctl); +} + +int compel_run_in_thread(struct parasite_thread_ctl *tctl, unsigned int cmd) +{ + int pid = tctl->tid; + struct parasite_ctl *ctl = tctl->ctl; + struct thread_ctx *octx = &tctl->th; + void *stack = ctl->r_thread_stack; + user_regs_struct_t regs = octx->regs; + int ret; + + *ctl->addr_cmd = cmd; + + ret = parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, stack, ®s, octx); + if (ret == 0) + ret = parasite_trap(ctl, pid, ®s, octx); + if (ret == 0) + ret = (int)REG_RES(regs); + + if (ret) + pr_err("Parasite exited with %d\n", ret); + + return ret; +} + +/* + * compel_unmap() is used for unmapping parasite and restorer blobs. 
/*
 * Tell whether a wait status describes a task stopped by SIGTRAP.
 *
 * Any other state is logged in detail (exit code, terminating or
 * stopping signal, continued) and reported as false.
 */
static bool task_is_trapped(int status, pid_t pid)
{
	const bool stopped = WIFSTOPPED(status);

	if (stopped && WSTOPSIG(status) == SIGTRAP)
		return true;

	pr_err("Task %d is in unexpected state: %x\n", pid, status);

	if (WIFEXITED(status))
		pr_err("Task exited with %d\n", WEXITSTATUS(status));

	if (WIFSIGNALED(status)) {
		const int term_sig = WTERMSIG(status);

		pr_err("Task signaled with %d: %s\n",
		       term_sig, strsignal(term_sig));
	}

	if (stopped) {
		const int stop_sig = WSTOPSIG(status);

		pr_err("Task stopped with %d: %s\n",
		       stop_sig, strsignal(stop_sig));
	}

	if (WIFCONTINUED(status))
		pr_err("Task continued\n");

	return false;
}
"native" : "compat"; + int req_sysnr = user_regs_native(regs) ? sys_nr : sys_nr_compat; + + pr_debug("%d (%s) is going to execute the syscall %lu, required is %d\n", + pid, mode, REG_SYSCALL_NR(*regs), req_sysnr); + + return (REG_SYSCALL_NR(*regs) == req_sysnr); +} + +/* + * Trap tasks on the exit from the specified syscall + * + * tasks - number of processes, which should be trapped + * sys_nr - the required syscall number + * sys_nr_compat - the required compatible syscall number + */ +int compel_stop_on_syscall(int tasks, + const int sys_nr, const int sys_nr_compat, + enum trace_flags trace) +{ + user_regs_struct_t regs; + int status, ret; + pid_t pid; + + if (tasks > 1) + trace = TRACE_ALL; + + /* Stop all threads on the enter point in sys_rt_sigreturn */ + while (tasks) { + pid = wait4(-1, &status, __WALL, NULL); + if (pid == -1) { + pr_perror("wait4 failed"); + return -1; + } + + if (!task_is_trapped(status, pid)) + return -1; + + pr_debug("%d was trapped\n", pid); + + if (trace == TRACE_EXIT) { + trace = TRACE_ENTER; + pr_debug("`- Expecting exit\n"); + goto goon; + } + if (trace == TRACE_ENTER) + trace = TRACE_EXIT; + + ret = ptrace_get_regs(pid, ®s); + if (ret) { + pr_perror("ptrace"); + return -1; + } + + if (is_required_syscall(®s, pid, sys_nr, sys_nr_compat)) { + /* + * The process is going to execute the required syscall, + * the next stop will be on the exit from this syscall + */ + ret = ptrace(PTRACE_SYSCALL, pid, NULL, NULL); + if (ret) { + pr_perror("ptrace"); + return -1; + } + + pid = wait4(pid, &status, __WALL, NULL); + if (pid == -1) { + pr_perror("wait4 failed"); + return -1; + } + + if (!task_is_trapped(status, pid)) + return -1; + + pr_debug("%d was stopped\n", pid); + tasks--; + continue; + } +goon: + ret = ptrace(PTRACE_SYSCALL, pid, NULL, NULL); + if (ret) { + pr_perror("ptrace"); + return -1; + } + } + + return 0; +} + +int compel_mode_native(struct parasite_ctl *ctl) +{ + return user_regs_native(&ctl->orig.regs); +} + +static inline 
k_rtsigset_t *thread_ctx_sigmask(struct thread_ctx *tctx) +{ + return &tctx->sigmask; +} + +k_rtsigset_t *compel_thread_sigmask(struct parasite_thread_ctl *tctl) +{ + return thread_ctx_sigmask(&tctl->th); +} + +k_rtsigset_t *compel_task_sigmask(struct parasite_ctl *ctl) +{ + return thread_ctx_sigmask(&ctl->orig); +} + +int compel_get_thread_regs(struct parasite_thread_ctl *tctl, save_regs_t save, void * arg) +{ + return get_task_regs(tctl->tid, &tctl->th.regs, save, arg, tctl->ctl->ictx.flags); +} + +struct infect_ctx *compel_infect_ctx(struct parasite_ctl *ctl) +{ + return &ctl->ictx; +} + +struct parasite_blob_desc *compel_parasite_blob_desc(struct parasite_ctl *ctl) +{ + return &ctl->pblob; +} + +uint64_t compel_get_leader_sp(struct parasite_ctl *ctl) +{ + return REG_SP(ctl->orig.regs); +} + +uint64_t compel_get_thread_sp(struct parasite_thread_ctl *tctl) +{ + return REG_SP(tctl->th.regs); +} diff --git a/CRIU_code/compel/src/lib/log-host.c b/CRIU_code/compel/src/lib/log-host.c new file mode 100644 index 0000000..918e3d3 --- /dev/null +++ b/CRIU_code/compel/src/lib/log-host.c @@ -0,0 +1 @@ +log.c \ No newline at end of file diff --git a/CRIU_code/compel/src/lib/log.c b/CRIU_code/compel/src/lib/log.c new file mode 100644 index 0000000..d195343 --- /dev/null +++ b/CRIU_code/compel/src/lib/log.c @@ -0,0 +1,38 @@ +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "log.h" + +static unsigned int current_loglevel = COMPEL_DEFAULT_LOGLEVEL; +static compel_log_fn logfn; + +void compel_log_init(compel_log_fn log_fn, unsigned int level) +{ + logfn = log_fn; + current_loglevel = level; +} + +unsigned int compel_log_get_loglevel(void) +{ + return current_loglevel; +} + +void compel_print_on_level(unsigned int loglevel, const char *format, ...) 
/*
 * ptrace_peek_area() - read @bytes bytes at @addr of task @pid into @dst.
 *
 * The area is read word by word with PTRACE_PEEKDATA, so @bytes has to
 * be a non-negative multiple of sizeof(long).
 *
 * Returns 0 on success (errno is left untouched), -1 when @bytes is not
 * acceptable and -2 when ptrace() fails, in which case errno describes
 * the failure.
 */
int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes)
{
	unsigned long *d = dst, *a = addr;
	unsigned long w, nr_words;
	int saved_errno = errno;

	if (bytes < 0 || (bytes & (sizeof(long) - 1)))
		return -1;

	nr_words = bytes / sizeof(long);
	for (w = 0; w < nr_words; w++) {
		long word;

		/*
		 * -1 is a legal value for a peeked word, so errno is the
		 * only reliable failure indicator: clear it before the
		 * call and compare the full-width return value.
		 */
		errno = 0;
		word = ptrace(PTRACE_PEEKDATA, pid, a + w, NULL);
		if (word == -1 && errno)
			return -2;

		d[w] = (unsigned long)word;
	}

	errno = saved_errno;
	return 0;
}
iov; + + iov.iov_base = regs; + iov.iov_len = sizeof(user_regs_struct_t); + return ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov); +} + +int __attribute__((weak)) ptrace_set_regs(int pid, user_regs_struct_t *regs) +{ + struct iovec iov; + + iov.iov_base = regs; + iov.iov_len = sizeof(user_regs_struct_t); + return ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov); +} diff --git a/CRIU_code/compel/src/main-host.c b/CRIU_code/compel/src/main-host.c new file mode 100644 index 0000000..8a03e94 --- /dev/null +++ b/CRIU_code/compel/src/main-host.c @@ -0,0 +1 @@ +main.c \ No newline at end of file diff --git a/CRIU_code/compel/src/main.c b/CRIU_code/compel/src/main.c new file mode 100644 index 0000000..51bac09 --- /dev/null +++ b/CRIU_code/compel/src/main.c @@ -0,0 +1,420 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "uapi/compel/compel.h" + +#include "version.h" +#include "piegen.h" +#include "log.h" + +#define CFLAGS_DEFAULT_SET \ + "-Wstrict-prototypes " \ + "-fno-stack-protector -nostdlib -fomit-frame-pointer " + +#define COMPEL_CFLAGS_PIE CFLAGS_DEFAULT_SET "-fpie" +#define COMPEL_CFLAGS_NOPIC CFLAGS_DEFAULT_SET "-fno-pic" + +#ifdef NO_RELOCS +#define COMPEL_LDFLAGS_COMMON "-z noexecstack -T " +#else +#define COMPEL_LDFLAGS_COMMON "-r -z noexecstack -T " +#endif + +typedef struct { + const char *arch; // dir name under arch/ + const char *cflags; + const char *cflags_compat; +} flags_t; + +static const flags_t flags = { +#if defined CONFIG_X86_64 + .arch = "x86", + .cflags = COMPEL_CFLAGS_PIE, + .cflags_compat = COMPEL_CFLAGS_NOPIC, +#elif defined CONFIG_AARCH64 + .arch = "aarch64", + .cflags = COMPEL_CFLAGS_PIE, +#elif defined(CONFIG_ARMV6) || defined(CONFIG_ARMV7) + .arch = "arm", + .cflags = COMPEL_CFLAGS_PIE, +#elif defined CONFIG_PPC64 + .arch = "ppc64", + .cflags = COMPEL_CFLAGS_PIE, +#elif defined CONFIG_S390 + .arch = "s390", + .cflags = COMPEL_CFLAGS_PIE, +#else 
+#error "CONFIG_ not defined, or unsupported ARCH" +#endif +}; + +const char *uninst_root; + +static int piegen(void) +{ + struct stat st; + void *mem; + int fd, ret = -1; + + fd = open(opts.input_filename, O_RDONLY); + if (fd < 0) { + pr_perror("Can't open file %s", opts.input_filename); + return -1; + } + + if (fstat(fd, &st)) { + pr_perror("Can't stat file %s", opts.input_filename); + goto err; + } + + opts.fout = fopen(opts.output_filename, "w"); + if (opts.fout == NULL) { + pr_perror("Can't open %s", opts.output_filename); + goto err; + } + + mem = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FILE, fd, 0); + if (mem == MAP_FAILED) { + pr_perror("Can't mmap file %s", opts.input_filename); + goto err; + } + + if (handle_binary(mem, st.st_size)) { + close(fd), fd = -1; + unlink(opts.output_filename); + goto err; + } + + ret = 0; + +err: + if (fd >= 0) + close(fd); + if (opts.fout) + fclose(opts.fout); + if (!ret) + pr_info("%s generated successfully.\n", opts.output_filename); + return ret; +} + +static void cli_log(unsigned int lvl, const char *fmt, va_list parms) +{ + FILE *f = stdout; + + if (pr_quelled(lvl)) + return; + + if ((lvl == COMPEL_LOG_ERROR) || (lvl == COMPEL_LOG_WARN)) + f = stderr; + + vfprintf(f, fmt, parms); +} + +static int usage(int rc) { + FILE *out = (rc == 0) ? 
stdout : stderr; + + fprintf(out, +"Usage:\n" +" compel [--compat] includes | cflags | ldflags\n" +" compel plugins [PLUGIN_NAME ...]\n" +" compel [--compat] [--static] libs\n" +" compel -f FILE -o FILE [-p NAME] [-l N] hgen\n" +" -f, --file FILE input (parasite object) file name\n" +" -o, --output FILE output (header) file name\n" +" -p, --prefix NAME prefix for var names\n" +" -l, --log-level NUM log level (default: %d)\n" +" compel -h|--help\n" +" compel -V|--version\n" +, COMPEL_DEFAULT_LOGLEVEL +); + + return rc; +} + +static void print_includes(void) +{ + int i; + /* list of standard include dirs (built into C preprocessor) */ + const char *standard_includes[] = { + "/usr/include", + "/usr/local/include", + }; + + /* I am not installed, called via a wrapper */ + if (uninst_root) { + printf("-I %s/include/uapi\n", uninst_root); + return; + } + + /* I am installed + * Make sure to not print banalities */ + for (i = 0; i < ARRAY_SIZE(standard_includes); i++) + if (strcmp(INCLUDEDIR, standard_includes[i]) == 0) + return; + + /* Finally, print our non-standard include path */ + printf("%s\n", "-I " INCLUDEDIR); +} + +static void print_cflags(bool compat) +{ + printf("%s\n", compat ? flags.cflags_compat : flags.cflags); + print_includes(); +} + +static void print_ldflags(bool compat) +{ + const char *compat_str = (compat) ? 
/*
 * Print the library path of every plugin to link with: first the
 * built-in ones, then the NULL-terminated @list supplied by the user.
 */
static void print_plugins(char *const list[])
{
	static const char *const builtin[] = { "std" };
	size_t i;

	for (i = 0; i < sizeof(builtin) / sizeof(builtin[0]); i++)
		print_plugin(builtin[i]);

	for (; *list != NULL; list++)
		print_plugin(*list);
}
/*
 * gen_prefix() - derive a C identifier from a file name.
 *
 * Takes the last path component of @path, cuts it at its first dot and
 * checks that the remainder is a valid C identifier, replacing '-' with
 * '_' along the way. C reserved keywords are not checked, as the result
 * is only used as a prefix.
 *
 * Returns a newly allocated string the caller has to free(), or NULL
 * when @path is NULL, the stem is empty or contains a character that
 * is not allowed in an identifier, or memory cannot be allocated.
 *
 * Example: gen_prefix("./some/path/to/file.c") ==> "file"
 */
static char *gen_prefix(const char *path)
{
	const char *base, *dot;
	size_t len, i;
	char *ret;

	if (path == NULL)
		return NULL;

	/* Last path component ... */
	base = strrchr(path, '/');
	base = base ? base + 1 : path;

	/* ... up to its first dot, if any */
	dot = strchr(base, '.');
	len = dot ? (size_t)(dot - base) : strlen(base);
	if (len == 0)
		return NULL;	/* "", "dir/", ".hidden": nothing to build on */

	ret = malloc(len + 1);
	if (ret == NULL)
		return NULL;

	for (i = 0; i < len; i++) {
		/* <ctype.h> functions require an unsigned char value */
		const unsigned char c = (unsigned char)base[i];

		if (isalpha(c) || c == '_') {
			ret[i] = (char)c;
		} else if (isdigit(c) && i > 0) {
			/* digit is fine, except the first character */
			ret[i] = (char)c;
		} else if (c == '-') {
			/* as a courtesy, replace - with _ */
			ret[i] = '_';
		} else {
			/* invalid character! */
			free(ret);
			return NULL;
		}
	}
	ret[len] = '\0';

	return ret;
}
+ // error message already printed by getopt_long() + return usage(1); + break; + } + } + + if (optind >= argc) { + fprintf(stderr, "Error: action argument required\n"); + return usage(1); + } + action = argv[optind++]; + + if (!strcmp(action, "includes")) { + print_includes(); + return 0; + } + if (!strcmp(action, "cflags")) { + print_cflags(compat); + return 0; + } + + if (!strcmp(action, "ldflags")) { + print_ldflags(compat); + return 0; + } + + if (!strcmp(action, "plugins")) { + print_plugins(argv + optind); + return 0; + } + + if (!strcmp(action, "libs")) { + return print_libs(is_static); + } + + if (!strcmp(action, "hgen")) { + if (!opts.input_filename) { + fprintf(stderr, "Error: option --file required\n"); + return usage(1); + } + if (!opts.output_filename) { + fprintf(stderr, "Error: option --output required\n"); + return usage(1); + } + if (!opts.prefix) { + // prefix not provided, let's autogenerate + opts.prefix = gen_prefix(opts.input_filename); + if (!opts.prefix) + opts.prefix = gen_prefix(opts.output_filename); + if (!opts.prefix) { + fprintf(stderr, "Error: can't autogenerate " + "prefix (supply --prefix)"); + return 2; + } + } + compel_log_init(&cli_log, log_level); + return piegen(); + } + + fprintf(stderr, "Error: unknown action '%s'\n", action); + return usage(1); +} diff --git a/CRIU_code/compel/test/fdspy/.gitignore b/CRIU_code/compel/test/fdspy/.gitignore new file mode 100644 index 0000000..0a55475 --- /dev/null +++ b/CRIU_code/compel/test/fdspy/.gitignore @@ -0,0 +1,4 @@ +parasite.h +parasite.po +spy +victim diff --git a/CRIU_code/compel/test/fdspy/Makefile b/CRIU_code/compel/test/fdspy/Makefile new file mode 100644 index 0000000..027c373 --- /dev/null +++ b/CRIU_code/compel/test/fdspy/Makefile @@ -0,0 +1,28 @@ +CC := gcc +CFLAGS ?= -O2 -g -Wall -Werror + +COMPEL := ../../../compel/compel-host + +all: victim spy + +clean: + rm -f victim + rm -f spy + rm -f parasite.h + rm -f parasite.po + rm -f parasite.o + +victim: victim.c + $(CC) 
$(CFLAGS) -o $@ $^ + +spy: spy.c parasite.h + $(CC) $(CFLAGS) $(shell $(COMPEL) includes) -o $@ $< $(shell $(COMPEL) --static libs) + +parasite.h: parasite.po + $(COMPEL) hgen -o $@ -f $< + +parasite.po: parasite.o + ld $(shell $(COMPEL) ldflags) -o $@ $^ $(shell $(COMPEL) plugins fds) + +parasite.o: parasite.c + $(CC) $(CFLAGS) -c $(shell $(COMPEL) cflags) -o $@ $^ diff --git a/CRIU_code/compel/test/fdspy/parasite.c b/CRIU_code/compel/test/fdspy/parasite.c new file mode 100644 index 0000000..c14064b --- /dev/null +++ b/CRIU_code/compel/test/fdspy/parasite.c @@ -0,0 +1,20 @@ +#include + +#include +#include + +/* + * Stubs for std compel plugin. + */ +int compel_main(void *arg_p, unsigned int arg_s) { return 0; } +int parasite_trap_cmd(int cmd, void *args) { return 0; } +void parasite_cleanup(void) { } + +#define PARASITE_CMD_GETFD PARASITE_USER_CMDS + +int parasite_daemon_cmd(int cmd, void *args) +{ + if (cmd == PARASITE_CMD_GETFD) + fds_send_fd(2); + return 0; +} diff --git a/CRIU_code/compel/test/fdspy/spy.c b/CRIU_code/compel/test/fdspy/spy.c new file mode 100644 index 0000000..258e3ab --- /dev/null +++ b/CRIU_code/compel/test/fdspy/spy.c @@ -0,0 +1,169 @@ +#include +#include +#include +#include +#include +#include + +#include +#include "parasite.h" + +#define PARASITE_CMD_GETFD PARASITE_USER_CMDS + +static void print_vmsg(unsigned int lvl, const char *fmt, va_list parms) +{ + printf("\tLC%u: ", lvl); + vprintf(fmt, parms); +} + +static int do_infection(int pid, int *stolen_fd) +{ +#define err_and_ret(msg) do { fprintf(stderr, msg); return -1; } while (0) + + int state; + struct parasite_ctl *ctl; + struct infect_ctx *ictx; + + compel_log_init(print_vmsg, COMPEL_LOG_DEBUG); + + printf("Stopping task\n"); + state = compel_stop_task(pid); + if (state < 0) + err_and_ret("Can't stop task"); + + printf("Preparing parasite ctl\n"); + ctl = compel_prepare(pid); + if (!ctl) + err_and_ret("Can't prepare for infection"); + + printf("Configuring contexts\n"); + + /* + * 
/*
 * check_pipe_ends() - verify that @wfd and @rfd are two ends of one pipe.
 *
 * First compares the device and inode numbers reported by fstat() for
 * both descriptors, then pushes a two byte probe ('1' and a NUL)
 * through @wfd and expects to read it back from @rfd.
 *
 * Returns 1 when both checks pass, 0 otherwise. A failed write is
 * reported right away instead of blocking in read() on data that will
 * never arrive.
 */
static int check_pipe_ends(int wfd, int rfd)
{
	static const char probe[] = "1";	/* '1' plus the terminating NUL */
	struct stat r, w;
	char aux[4] = { '0', '0', '0', '0' };
	ssize_t n;

	printf("Check pipe ends are at hands\n");
	if (fstat(wfd, &w) < 0) {
		perror("Can't stat wfd");
		return 0;
	}

	if (fstat(rfd, &r) < 0) {
		perror("Can't stat rfd");
		return 0;
	}

	if (w.st_dev != r.st_dev || w.st_ino != r.st_ino) {
		/* Not a libc failure: errno carries nothing useful here. */
		fprintf(stderr, "Pipe's not the same\n");
		return 0;
	}

	printf("Check pipe ends are connected\n");
	n = write(wfd, probe, sizeof(probe));
	if (n < 0) {
		perror("Can't write to pipe");
		return 0;
	}
	if (n != (ssize_t)sizeof(probe)) {
		fprintf(stderr, "Short write to pipe\n");
		return 0;
	}

	n = read(rfd, aux, sizeof(aux));
	if (n < 0) {
		perror("Can't read from pipe");
		return 0;
	}
	if (n < (ssize_t)sizeof(probe) || aux[0] != '1' || aux[1] != '\0') {
		fprintf(stderr, "Pipe connectivity lost\n");
		return 0;
	}

	return 1;
}
close(p_err[0]); dup2(p_err[1], 2); close(p_err[1]); + execl("./victim", "victim", NULL); + exit(1); + } + + close(p_in[0]); close(p_out[1]); close(p_err[1]); + + /* + * Now do the infection with parasite.c + */ + + printf("Infecting the victim\n"); + if (do_infection(pid, &stolen_fd)) + return 1; + + /* + * Stop the victim and check the infection went well + */ + printf("Closing victim stdin\n"); + close(p_in[1]); + printf("Waiting for victim to die\n"); + wait(NULL); + + printf("Checking the result\n"); + /* + * Stolen fd is the stderr of the task + * Check these are the ends of the same pipe + * and message passing works OK + */ + + pass = check_pipe_ends(stolen_fd, p_err[0]); + + if (pass) + printf("All OK\n"); + else + printf("Something went WRONG\n"); + + return 0; +} diff --git a/CRIU_code/compel/test/fdspy/victim.c b/CRIU_code/compel/test/fdspy/victim.c new file mode 100644 index 0000000..3dbd274 --- /dev/null +++ b/CRIU_code/compel/test/fdspy/victim.c @@ -0,0 +1,12 @@ +#include + +int main(int argc, char **argv) +{ + int i, aux; + + do { + i = read(0, &aux, 1); + } while (i > 0); + + return 0; +} diff --git a/CRIU_code/compel/test/infect/.gitignore b/CRIU_code/compel/test/infect/.gitignore new file mode 100644 index 0000000..0a55475 --- /dev/null +++ b/CRIU_code/compel/test/infect/.gitignore @@ -0,0 +1,4 @@ +parasite.h +parasite.po +spy +victim diff --git a/CRIU_code/compel/test/infect/Makefile b/CRIU_code/compel/test/infect/Makefile new file mode 100644 index 0000000..4dedf33 --- /dev/null +++ b/CRIU_code/compel/test/infect/Makefile @@ -0,0 +1,28 @@ +CC := gcc +CFLAGS ?= -O2 -g -Wall -Werror + +COMPEL := ../../../compel/compel-host + +all: victim spy + +clean: + rm -f victim + rm -f spy + rm -f parasite.h + rm -f parasite.po + rm -f parasite.o + +victim: victim.c + $(CC) $(CFLAGS) -o $@ $^ + +spy: spy.c parasite.h + $(CC) $(CFLAGS) $(shell $(COMPEL) includes) -o $@ $< $(shell $(COMPEL) --static libs) + +parasite.h: parasite.po + $(COMPEL) hgen -o $@ -f $< 
+ +parasite.po: parasite.o + ld $(shell $(COMPEL) ldflags) -o $@ $^ $(shell $(COMPEL) plugins) + +parasite.o: parasite.c + $(CC) $(CFLAGS) -c $(shell $(COMPEL) cflags) -o $@ $^ diff --git a/CRIU_code/compel/test/infect/parasite.c b/CRIU_code/compel/test/infect/parasite.c new file mode 100644 index 0000000..f185809 --- /dev/null +++ b/CRIU_code/compel/test/infect/parasite.c @@ -0,0 +1,33 @@ +#include + +#include +#include + +/* + * Stubs for std compel plugin. + */ +int parasite_trap_cmd(int cmd, void *args) { return 0; } +void parasite_cleanup(void) { } + +#define PARASITE_CMD_INC PARASITE_USER_CMDS +#define PARASITE_CMD_DEC PARASITE_USER_CMDS + 1 + +int parasite_daemon_cmd(int cmd, void *args) +{ + int v; + + switch (cmd) { + case PARASITE_CMD_INC: + v = (*(int *)args) + 1; + break; + case PARASITE_CMD_DEC: + v = (*(int *)args) - 1; + break; + default: + v = -1; + break; + } + + sys_write(1, &v, sizeof(int)); + return 0; +} diff --git a/CRIU_code/compel/test/infect/spy.c b/CRIU_code/compel/test/infect/spy.c new file mode 100644 index 0000000..a5aba73 --- /dev/null +++ b/CRIU_code/compel/test/infect/spy.c @@ -0,0 +1,178 @@ +#include +#include +#include +#include + +#include +#include "parasite.h" + +#define PARASITE_CMD_INC PARASITE_USER_CMDS +#define PARASITE_CMD_DEC PARASITE_USER_CMDS + 1 + +static void print_vmsg(unsigned int lvl, const char *fmt, va_list parms) +{ + printf("\tLC%u: ", lvl); + vprintf(fmt, parms); +} + +static int do_infection(int pid) +{ +#define err_and_ret(msg) do { fprintf(stderr, msg); return -1; } while (0) + + int state; + struct parasite_ctl *ctl; + struct infect_ctx *ictx; + int *arg; + + compel_log_init(print_vmsg, COMPEL_LOG_DEBUG); + + printf("Stopping task\n"); + state = compel_stop_task(pid); + if (state < 0) + err_and_ret("Can't stop task"); + + printf("Preparing parasite ctl\n"); + ctl = compel_prepare(pid); + if (!ctl) + err_and_ret("Can't prepare for infection"); + + printf("Configuring contexts\n"); + + /* + * First -- the 
/*
 * Test driver: starts ./victim with its stdio redirected to pipes,
 * checks that it echoes numbers, infects it with the parasite, checks
 * that it still echoes numbers and that the parasite's answers arrived.
 *
 * Returns 0 only when every check passed; non-zero on any failure.
 */
int main(int argc, char **argv)
{
	int p_in[2], p_out[2], p_err[2], pid, i, pass;

	/*
	 * Prepare IO-s and fork the victim binary
	 */
	if (pipe(p_in) || pipe(p_out) || pipe(p_err)) {
		perror("Can't make pipe");
		return -1;
	}

	pid = vfork();
	if (pid < 0) {
		perror("Can't fork");
		return -1;
	}
	if (pid == 0) {
		close(p_in[1]);  dup2(p_in[0], 0);  close(p_in[0]);
		close(p_out[0]); dup2(p_out[1], 1); close(p_out[1]);
		close(p_err[0]); dup2(p_err[1], 2); close(p_err[1]);
		execl("./victim", "victim", NULL);
		/* A vfork()ed child must not run the parent's exit handlers. */
		_exit(1);
	}

	close(p_in[0]); close(p_out[1]); close(p_err[1]);

	/*
	 * Tell the little guy some numbers
	 */
	i = 1;
	if (write(p_in[1], &i, sizeof(i)) != sizeof(i))
		return 1;
	i = 42;
	if (write(p_in[1], &i, sizeof(i)) != sizeof(i))
		return 1;

	printf("Checking the victim alive\n");
	/* Accumulate: a later success must not hide an earlier failure. */
	pass = chk(p_out[0], 1);
	pass &= chk(p_out[0], 42);
	if (!pass)
		return 1;

	/*
	 * Now do the infection with parasite.c
	 */

	printf("Infecting the victim\n");
	if (do_infection(pid))
		return 1;

	/*
	 * Tell the victim some more stuff to check it's alive
	 */
	i = 1234;
	if (write(p_in[1], &i, sizeof(i)) != sizeof(i))
		return 1;
	i = 4096;
	if (write(p_in[1], &i, sizeof(i)) != sizeof(i))
		return 1;

	/*
	 * Stop the victim and check the infection went well
	 */
	printf("Closing victim stdin\n");
	close(p_in[1]);
	printf("Waiting for victim to die\n");
	wait(NULL);

	printf("Checking the result\n");

	/* These two came from parasite */
	pass = chk(p_out[0], 138);
	pass &= chk(p_out[0], 403);

	/* These two came from post-infect */
	pass &= chk(p_out[0], 1234);
	pass &= chk(p_out[0], 4096);

	if (pass)
		printf("All OK\n");
	else
		printf("Something went WRONG\n");

	return pass ? 0 : 1;
}
/*
 * Log callback handed to compel_log_init(): prints the message on
 * stdout, prefixed with a tab and the numeric log level ("LC<lvl>: ").
 */
static void print_vmsg(unsigned int lvl, const char *fmt, va_list parms)
{
	fprintf(stdout, "\tLC%u: ", lvl);
	vfprintf(stdout, fmt, parms);
}
/*
 * chk() - read one int from @fd and compare it with @val.
 *
 * Returns 1 when a complete int was read and it equals @val, 0
 * otherwise. A failed or short read (including end of file) is a
 * mismatch, so a closed pipe is never mistaken for the value 0.
 */
static inline int chk(int fd, int val)
{
	int v = 0;

	if (read(fd, &v, sizeof(v)) != (ssize_t)sizeof(v))
		return 0;

	printf("%d, want %d\n", v, val);
	return v == val;
}
!= sizeof(i)) + break; + + i = getsid(0); + write(1, &i, sizeof(i)); + } + + return 0; +} diff --git a/CRIU_code/contrib/debian/dev-packages.lst b/CRIU_code/contrib/debian/dev-packages.lst new file mode 100644 index 0000000..b0b664f --- /dev/null +++ b/CRIU_code/contrib/debian/dev-packages.lst @@ -0,0 +1,20 @@ +# Required packages for development in Debian +build-essential +libprotobuf-dev +libprotobuf-c0-dev +protobuf-c-compiler +protobuf-compiler +python-protobuf +libnet-dev + +# Extra packages, required for testing and building other tools +pkg-config +libnl-3-dev +python-ipaddr +libbsd0 +libbsd-dev +iproute2 +libcap-dev +libaio-dev +python-yaml +libnl-route-3-dev diff --git a/CRIU_code/contrib/docker_cr.sh b/CRIU_code/contrib/docker_cr.sh new file mode 100644 index 0000000..9b43d8b --- /dev/null +++ b/CRIU_code/contrib/docker_cr.sh @@ -0,0 +1,466 @@ +#!/bin/bash + +# +# A convenience shell script to call criu for checkpointing and restoring +# a Docker container. +# +# This script saves the user from having to remember all the command +# line options, some of which are very long. Note that once Docker +# has native support for checkpoint and restore, there will no longer +# be a need for this particular shell script. +# + +set -o errexit +set -o nounset +set -o pipefail + +# +# These can be set in the environment to override their defaults. +# Note that while the default value of CRIU_IMG_DIR in this script +# is a directory in DOCKER_HOME, it doesn't have to be tied to +# DOCKER_HOME. For example, it can be /var/spool/criu_img. +# +: ${DOCKER_HOME=/var/lib/docker} +: ${DOCKER_BINARY=docker} +: ${CRIU_IMG_DIR=${DOCKER_HOME}/criu_img} +: ${CRIU_BINARY=criu} +: ${DOCKERINIT_BINARY=} + +# +# Patterns for different filesystem types in dump.log. 
+# +readonly AUFS_PATTERN='/sys/fs/aufs/si_' +readonly OVERLAYFS_PATTERN='type.*source.*options.*lowerdir=.*upperdir=.*workdir=' +readonly UNIONFS_PATTERN='type.*source.*options.*dirs=' + +# +# These globals will be set by init_container_vars() +# +declare CID +declare CONTAINER_IMG_DIR +declare CONTAINER_DUMP_LOG + +declare -A BIND_MOUNT +BIND_MOUNT[/etc/resolv.conf]=.ResolvConfPath +BIND_MOUNT[/etc/hosts]=.HostsPath +BIND_MOUNT[/etc/hostname]=.HostnamePath +MOUNT_MAP_ARGS=() + +# +# The default mode is non-verbose, printing only a short message +# saying if the command succeeded or failed. For the verbose mode, +# we could have used set -o xtrace but this option would have +# generated excessive output suitable for debugging, not normal +# usage. So we set ${ECHO} to echo in the verbose mode to print +# selected messages. +# +VERBOSE="" +ECHO=":" +CMD="" +PGNAME=$(basename "$0") + +usage() { + local rv=0 + + if [[ -n "${1-}" ]]; then + rv=1 + echo -e "${PGNAME}: $1\n" >&2 + fi + + cat <] + -c, --checkpoint checkpoint container + -h, --help print help message + -r, --restore restore container + -v, --verbose enable verbose mode + +Environment: + DOCKER_HOME (default ${DOCKER_HOME}) + CRIU_IMG_DIR (default ${CRIU_IMG_DIR}) + DOCKER_BINARY (default ${DOCKER_BINARY}) + DOCKERINIT_BINARY (default \${DOCKER_HOME}/init/dockerinit--dev) + CRIU_BINARY (default ${CRIU_BINARY}) +EOF + exit ${rv} +} + +# +# If the user has not specified a bind mount file for the container's +# /.dockerinit, try to determine it from the Docker version. +# +find_dockerinit() { + local v + + if [[ -z "${DOCKERINIT_BINARY}" ]]; then + v=$("${DOCKER_BINARY}" --version | sed -e 's/.*version \(.*\),.*/\1/') + DOCKERINIT_BINARY="${DOCKER_HOME}/init/dockerinit-${v}" + elif [[ "${DOCKERINIT_BINARY}" != /* ]]; then + DOCKERINIT_BINARY="${DOCKER_HOME}/init/${DOCKERINIT_BINARY}" + fi + + if [[ ! 
-x "${DOCKERINIT_BINARY}" ]]; then + echo "${DOCKERINIT_BINARY} does not exist" + exit 1 + fi + + BIND_MOUNT[/.dockerinit]="${DOCKERINIT_BINARY}" +} + +parse_args() { + local args + local flags + + args=$(getopt --options 'chrv' \ + --longoptions 'checkpoint help restore verbose' -- "$@") + [[ $? == 0 ]] || usage + eval set -- "${args}" + + while :; do + arg="${1}" + shift + case "${arg}" in + -c|--checkpoint) CMD="dump" ;; + -h|--help) usage ;; + -r|--restore) CMD="restore" ;; + -v|--verbose) VERBOSE="-v"; ECHO="echo" ;; + --) break ;; + *) usage "internal error parsing arguments!" ;; + esac + done + + [[ "${CMD}" == "" ]] && usage "need either -c or -r" + [[ $# -gt 1 ]] && usage "$# too many arguments" + + # if no container id in args, prompt the user + if [[ $# -eq 1 ]]; then + CID="$1" + else + if [[ "${CMD}" == "dump" ]]; then + flags="" + else + # we need -a only for restore + flags="-a" + fi + "${DOCKER_BINARY}" ps ${flags} + read -rp $'\nContainer ID: ' CID + fi +} + +execute() { + # since commands are pretty long and can wrap around + # several lines, print a blank line to make it visually + # easier to see + ${ECHO} -e "\n$*" + "$@" +} + +init_container_vars() { + local d + + CID=$(get_container_conf .Id) + + d=$("${DOCKER_BINARY}" info 2> /dev/null | awk '/Storage Driver:/ { print $3 }') + if [[ "${d}" == "vfs" ]]; then + CONTAINER_ROOT_DIR="${DOCKER_HOME}/${d}/dir/${CID}" + elif [[ "${d}" == "aufs" || "${d}" == "unionfs" ]]; then + CONTAINER_ROOT_DIR="${DOCKER_HOME}/${d}/mnt/${CID}" + elif [[ "${d}" == "overlay" ]]; then + CONTAINER_ROOT_DIR="${DOCKER_HOME}/${d}/${CID}/merged" + else + echo "${d}: unknown filesystem type" + return 1 + fi + + CONTAINER_IMG_DIR="${CRIU_IMG_DIR}/${CID}" + CONTAINER_DUMP_LOG="${CONTAINER_IMG_DIR}/dump.log" +} + +get_container_conf() { + local val + + val=$("${DOCKER_BINARY}" inspect --format "{{$1}}" "${CID}") + [[ "${val}" == "" ]] && exit 1 + echo "${val//}" +} + +setup_mount_map() { + local key + + if [[ "$1" == "dump" 
]]; then + for key in "${!BIND_MOUNT[@]}"; do + MOUNT_MAP_ARGS+=(--ext-mount-map "${key}:${key}") + done + else + for key in "${!BIND_MOUNT[@]}"; do + if [[ "${key}" == "/.dockerinit" ]]; then + MOUNT_MAP_ARGS+=("--ext-mount-map" "${key}:${BIND_MOUNT[$key]}") + else + MOUNT_MAP_ARGS+=("--ext-mount-map" "${key}:$(get_container_conf "${BIND_MOUNT[$key]}")") + fi + done + fi +} + +fs_mounted() { + if grep -wq "$1" /proc/self/mountinfo; then + ${ECHO} "container root directory already mounted" + return 0 + fi + ${ECHO} "container root directory not mounted" + return 1 +} + +# +# Pretty print the mount command in verbose mode by putting each branch +# pathname on a single line for easier visual inspection. +# +pp_mount() { + ${ECHO} -e "\nmount -t $1 -o" + ${ECHO} "${2}" | tr ':,' '\n' + ${ECHO} "${3}" + ${ECHO} "${4}" +} + +# +# Reconstruct the AUFS filesystem from information in CRIU's dump log. +# The dump log has a series of branch entries for each process in the +# entire process tree in the following form: +# +# (00.014075) /sys/fs/aufs/si_f598876b0855b883/br0 : /var/lib/docker/aufs/diff/ +# +# Note that this script assumes that all processes in the process +# tree have the same AUFS filesystem. This assumption is fairly +# safe for typical Docker containers. 
+# +setup_aufs() { + local -r tmpf="${CONTAINER_IMG_DIR}/aufs.br" + local br + local branches + + # nothing to do if filesystem already mounted + fs_mounted "${CONTAINER_ROOT_DIR}" && return + + # create a temporary file with branches listed in + # ascending order (line 1 is branch 0) + awk '/aufs.si_/ { print $2, $4 }' "${CONTAINER_DUMP_LOG}" | \ + sort | uniq | awk '{ print $2 }' > "${tmpf}" + + # construct the mount option string from branches + branches="" + while read br; do + branches+="${branches:+:}${br}" + done < "${tmpf}" + + # mount the container's filesystem + pp_mount "aufs" "${branches}" "none" "${CONTAINER_ROOT_DIR}" + mount -t aufs -o br="${branches}" none "${CONTAINER_ROOT_DIR}" + rm -f "${tmpf}" +} + +setup_overlayfs() { + local lowerdir + local upperdir + local workdir + local ovlydirs + local -r f="${CONTAINER_DUMP_LOG}" + + # nothing to do if filesystem already mounted + fs_mounted "${CONTAINER_ROOT_DIR}" && return + + lowerdir=$(grep "${OVERLAYFS_PATTERN}" "${f}" | sed -n -e 's/.*lowerdir=\([^,]*\).*/\1/p') + upperdir=$(grep "${OVERLAYFS_PATTERN}" "${f}" | sed -n -e 's/.*upperdir=\([^,]*\).*/\1/p') + workdir=$(grep "${OVERLAYFS_PATTERN}" "${f}" | sed -n -e 's/.*workdir=\([^,]*\).*/\1/p') + ovlydirs="lowerdir=${lowerdir},upperdir=${upperdir},workdir=${workdir}" + + # mount the container's filesystem + pp_mount "overlay" "${ovlydirs}" "overlay" "${CONTAINER_ROOT_DIR}" + mount -t overlay -o "${ovlydirs}" overlay "${CONTAINER_ROOT_DIR}" +} + +# +# Reconstruct the UnionFS filesystem from information in CRIU's dump log. +# The dump log has the mountinfo root entry for the filesystem. The +# options field contains the list of directories that make up the UnionFS. +# +# Note that this script assumes that all processes in the process +# tree have the same UnionFS filesystem. This assumption is fairly +# safe for typical Docker containers. +# +# XXX If /dev/null was manually created by Docker (i.e., it's not in +# a branch), create it. 
Although this has worked so far, it needs +# a deeper look as I am not sure if /dev/null should be created as +# a regular file to be the target of a bind mount or created as a +# device file by mknod. +# +setup_unionfs() { + local dirs + + # nothing to do if filesystem already mounted + fs_mounted "${CONTAINER_ROOT_DIR}" && return + + dirs=$(sed -n -e 's/.*type.*dirs=/dirs=/p' "${CONTAINER_DUMP_LOG}") + [[ "${dirs}" = "" ]] && echo "do not have branch information" && exit 1 + + # mount the container's filesystem + pp_mount "unionfs" "${dirs}" "none" "${CONTAINER_ROOT_DIR}" + mount -t unionfs -o "${dirs}" none "${CONTAINER_ROOT_DIR}" + + # see comment at the beginning of the function + if [[ ! -e "${CONTAINER_ROOT_DIR}/dev/null" ]]; then + execute touch "${CONTAINER_ROOT_DIR}/dev/null" + fi +} + +prep_dump() { + local pid + + pid=$(get_container_conf .State.Pid) + + # docker returns 0 for containers it thinks have exited + # (i.e., dumping a restored container again) + if [[ ${pid} -eq 0 ]]; then + echo -e "\nCheckpointing a restored container?" + read -p "Process ID: " pid + fi + + # remove files previously created by criu but not others files (if any) + mkdir -p "${CONTAINER_IMG_DIR}" + rm -f "${CONTAINER_IMG_DIR}"/*.{img,log,pid} "${CONTAINER_IMG_DIR}"/stats-restore + + CMD_ARGS=("-t" "${pid}") + + # we need --root only for aufs to compensate for the + # erroneous information in /proc//map_files + if [[ "${CONTAINER_ROOT_DIR}" == *aufs* ]]; then + CMD_ARGS+=("--root" "${CONTAINER_ROOT_DIR}") + fi +} + +# +# Set up container's root filesystem if not already set up. +# +prep_restore() { + local -r f="${CONTAINER_DUMP_LOG}" + + if [[ ! -f "${f}" ]]; then + echo "${f} does not exist" + return 1 + fi + + if grep -q "${AUFS_PATTERN}" "${f}"; then + setup_aufs + elif grep -q "${OVERLAYFS_PATTERN}" "${f}"; then + setup_overlayfs + elif grep -q "${UNIONFS_PATTERN}" "${f}"; then + setup_unionfs + fi + + # criu requires this (due to container using pivot_root) + if ! 
grep -qw "${CONTAINER_ROOT_DIR}" /proc/self/mountinfo; then + execute mount --rbind "${CONTAINER_ROOT_DIR}" "${CONTAINER_ROOT_DIR}" + MOUNTED=1 + else + MOUNTED=0 + fi + + CMD_ARGS=("-d" "--root" "${CONTAINER_ROOT_DIR}" "--pidfile" "${CONTAINER_IMG_DIR}/restore.pid") +} + +# +# Since this function produces output string (either in the +# verbose mode or from ${CRIU_BINARY}), we set the return value +# in parameter 1. +# +run_criu() { + local -a common_args=("-v4" "-D" "${CONTAINER_IMG_DIR}" \ + "-o" "${CMD}.log" \ + "--manage-cgroups" \ + "--evasive-devices") + + setup_mount_map "${CMD}" + common_args+=("${MOUNT_MAP_ARGS[@]}") + + # we do not want to exit if there's an error + execute "${CRIU_BINARY}" "${CMD}" "${common_args[@]}" "${CMD_ARGS[@]}" +} + +wrap_up() { + local -r logf="${CONTAINER_IMG_DIR}/${CMD}.log" + local -r pidf="${CONTAINER_IMG_DIR}/restore.pid" + + if [[ $1 -eq 0 ]]; then + ${ECHO} -e "\n" + echo "${CMD} successful" + else + ${ECHO} -e "\n" + echo "${CMD} failed" + fi + + if [[ "${VERBOSE}" == "-v" && -e "${logf}" ]]; then + if ! 
grep "finished successfully" "${logf}"; then + grep Error "${logf}" + fi + fi + + if [[ "${CMD}" == "restore" ]]; then + if [[ ${MOUNTED} -eq 1 ]]; then + execute umount "${CONTAINER_ROOT_DIR}" + fi + + if [[ -e "${pidf}" ]]; then + ${ECHO} -e "\n$(ps -f -p "$(cat "${pidf}")" --no-headers)" + fi + fi +} + +resolve_path() { + local p + + p="${2}" + if which realpath > /dev/null; then + p=$(realpath "${p}") + fi + ${ECHO} "${1}: ${p}" +} + +resolve_cmd() { + local cpath + + cpath=$(which "${2}") + resolve_path "${1}" "${cpath}" +} + +main() { + local rv=0 + + if [[ $(id -u) -ne 0 ]]; then + echo "not running as root" + exit 1 + fi + + parse_args "$@" + find_dockerinit + init_container_vars + + if [[ "${VERBOSE}" == "-v" ]]; then + echo + resolve_cmd "docker binary" "${DOCKER_BINARY}" + resolve_cmd "dockerinit binary" "${DOCKERINIT_BINARY}" + resolve_cmd "criu binary" "${CRIU_BINARY}" + resolve_path "image directory" "${CONTAINER_IMG_DIR}" + resolve_path "container root directory" "${CONTAINER_ROOT_DIR}" + fi + + if [[ "${CMD}" == "dump" ]]; then + prep_dump + else + prep_restore + fi + + run_criu || rv=$? 
+ wrap_up ${rv} + exit ${rv} +} + +main "$@" diff --git a/CRIU_code/coredump/criu-coredump b/CRIU_code/coredump/criu-coredump new file mode 100644 index 0000000..25c188c --- /dev/null +++ b/CRIU_code/coredump/criu-coredump @@ -0,0 +1,40 @@ +#!/usr/bin/env python2 +import argparse +import os + +import criu_coredump + +def coredump(opts): + generator = criu_coredump.coredump_generator() + cores = generator(os.path.realpath(opts['in'])) + for pid in cores: + if opts['pid'] and pid != opts['pid']: + continue + with open(os.path.realpath(opts['out'])+"/core."+str(pid), 'w+') as f: + cores[pid].write(f) + + +def main(): + desc = 'CRIU core dump' + parser = argparse.ArgumentParser(description=desc, + formatter_class=argparse.RawTextHelpFormatter) + + parser.add_argument('-i', + '--in', + default = '.', + help = 'directory where to get images from') + parser.add_argument('-p', + '--pid', + type = int, + help = 'generate coredump for specific pid(all pids py default)') + parser.add_argument('-o', + '--out', + default = '.', + help = 'directory to write coredumps to') + + opts = vars(parser.parse_args()) + + coredump(opts) + +if __name__ == '__main__': + main() diff --git a/CRIU_code/coredump/criu_coredump/.gitignore b/CRIU_code/coredump/criu_coredump/.gitignore new file mode 100644 index 0000000..0d20b64 --- /dev/null +++ b/CRIU_code/coredump/criu_coredump/.gitignore @@ -0,0 +1 @@ +*.pyc diff --git a/CRIU_code/coredump/criu_coredump/__init__.py b/CRIU_code/coredump/criu_coredump/__init__.py new file mode 100644 index 0000000..213af42 --- /dev/null +++ b/CRIU_code/coredump/criu_coredump/__init__.py @@ -0,0 +1,2 @@ +from coredump import * +import elf diff --git a/CRIU_code/coredump/criu_coredump/coredump.py b/CRIU_code/coredump/criu_coredump/coredump.py new file mode 100644 index 0000000..2b0c37f --- /dev/null +++ b/CRIU_code/coredump/criu_coredump/coredump.py @@ -0,0 +1,830 @@ +# Functions and classes for creating core dump from criu images. 
+# Code is inspired by outdated google coredumper(RIP) [1] and +# fs/binfmt_elf.h from Linux kernel [2]. +# +# [1] https://code.google.com/p/google-coredumper/ +# probably already dead, so consider trying: +# https://github.com/efiop/google-coredumper/ +# [2] https://www.kernel.org/ +# +# On my x86_64 systems with fresh kernel ~3.17 core dump looks like: +# +# 1) Elf file header; +# 2) PT_NOTE program header describing notes section; +# 3) PT_LOAD program headers for (almost?) each vma; +# 4) NT_PRPSINFO note with elf_prpsinfo inside; +# 5) An array of notes for each thread of the process: +# NT_PRSTATUS note with elf_prstatus inside; +# NT_FPREGSET note with elf_fpregset inside; +# NT_X86_XSTATE note with x86 extended state using xsave; +# NT_SIGINFO note with siginfo_t inside; +# 6) NT_AUXV note with auxv; +# 7) NT_FILE note with mapped files; +# 8) VMAs themselves; +# +# Or, you can represent it in less details as: +# 1) Elf file header; +# 2) Program table; +# 3) Notes; +# 4) VMAs contents; +# +import io +import elf +import ctypes +from pycriu import images + +# Some memory-related constants +PAGESIZE = 4096 +status = { + "VMA_AREA_NONE" : 0 << 0, + "VMA_AREA_REGULAR" : 1 << 0, + "VMA_AREA_STACK" : 1 << 1, + "VMA_AREA_VSYSCALL" : 1 << 2, + "VMA_AREA_VDSO" : 1 << 3, + "VMA_FORCE_READ" : 1 << 4, + "VMA_AREA_HEAP" : 1 << 5, + "VMA_FILE_PRIVATE" : 1 << 6, + "VMA_FILE_SHARED" : 1 << 7, + "VMA_ANON_SHARED" : 1 << 8, + "VMA_ANON_PRIVATE" : 1 << 9, + "VMA_AREA_SYSVIPC" : 1 << 10, + "VMA_AREA_SOCKET" : 1 << 11, + "VMA_AREA_VVAR" : 1 << 12, + "VMA_AREA_AIORING" : 1 << 13, + "VMA_AREA_UNSUPP" : 1 << 31 +} + +prot = { + "PROT_READ" : 0x1, + "PROT_WRITE" : 0x2, + "PROT_EXEC" : 0x4 +} + +class elf_note: + nhdr = None # Elf_Nhdr; + owner = None # i.e. CORE or LINUX; + data = None # Ctypes structure with note data; + + +class coredump: + """ + A class to keep elf core dump components inside and + functions to properly write them to file. 
+ """ + ehdr = None # Elf ehdr; + phdrs = [] # Array of Phdrs; + notes = [] # Array of elf_notes; + vmas = [] # Array of BytesIO with memory content; + # FIXME keeping all vmas in memory is a bad idea; + + def write(self, f): + """ + Write core dump to file f. + """ + buf = io.BytesIO() + buf.write(self.ehdr) + + for phdr in self.phdrs: + buf.write(phdr) + + for note in self.notes: + buf.write(note.nhdr) + buf.write(note.owner) + buf.write("\0"*(8-len(note.owner))) + buf.write(note.data) + + offset = ctypes.sizeof(elf.Elf64_Ehdr()) + offset += (len(self.vmas) + 1)*ctypes.sizeof(elf.Elf64_Phdr()) + + filesz = 0 + for note in self.notes: + filesz += ctypes.sizeof(note.nhdr) + ctypes.sizeof(note.data) + 8 + + note_align = PAGESIZE - ((offset + filesz) % PAGESIZE) + + if note_align == PAGESIZE: + note_align = 0 + + if note_align != 0: + scratch = (ctypes.c_char * note_align)() + ctypes.memset(ctypes.addressof(scratch), 0, ctypes.sizeof(scratch)) + buf.write(scratch) + + for vma in self.vmas: + buf.write(vma.data) + + buf.seek(0) + f.write(buf.read()) + + +class coredump_generator: + """ + Generate core dump from criu images. + """ + coredumps = {} # coredumps by pid; + + pstree = {} # process info by pid; + cores = {} # cores by pid; + mms = {} # mm by pid; + reg_files = None # reg-files; + pagemaps = {} # pagemap by pid; + + def _img_open_and_strip(self, name, single = False, pid = None): + """ + Load criu image and strip it from magic and redundant list. + """ + path = self._imgs_dir + "/" + name + if pid: + path += "-"+str(pid) + path += ".img" + + with open(path) as f: + img = images.load(f) + + if single: + return img["entries"][0] + else: + return img["entries"] + + + def __call__(self, imgs_dir): + """ + Parse criu images stored in directory imgs_dir to fill core dumps. 
+ """ + self._imgs_dir = imgs_dir + pstree = self._img_open_and_strip("pstree") + + for p in pstree: + pid = p['pid'] + + self.pstree[pid] = p + for tid in p['threads']: + self.cores[tid] = self._img_open_and_strip("core", True, tid) + self.mms[pid] = self._img_open_and_strip("mm", True, pid) + self.pagemaps[pid] = self._img_open_and_strip("pagemap", False, pid) + + files = self._img_open_and_strip("files", False) + self.reg_files = [ x["reg"] for x in files if x["type"]=="REG" ] + + for pid in self.pstree: + self.coredumps[pid] = self._gen_coredump(pid) + + return self.coredumps + + + def write(self, coredumps_dir, pid = None): + """ + Write core dumpt to cores_dir directory. Specify pid to choose + core dump of only one process. + """ + for p in self.coredumps: + if pid and p != pid: + continue + with open(coredumps_dir+"/"+"core."+str(p), 'w+') as f: + self.coredumps[p].write(f) + + def _gen_coredump(self, pid): + """ + Generate core dump for pid. + """ + cd = coredump() + + # Generate everything backwards so it is easier to calculate offset. + cd.vmas = self._gen_vmas(pid) + cd.notes = self._gen_notes(pid) + cd.phdrs = self._gen_phdrs(pid, cd.notes, cd.vmas) + cd.ehdr = self._gen_ehdr(pid, cd.phdrs) + + return cd + + def _gen_ehdr(self, pid, phdrs): + """ + Generate elf header for process pid with program headers phdrs. 
+ """ + ehdr = elf.Elf64_Ehdr() + + ctypes.memset(ctypes.addressof(ehdr), 0, ctypes.sizeof(ehdr)) + ehdr.e_ident[elf.EI_MAG0] = elf.ELFMAG0 + ehdr.e_ident[elf.EI_MAG1] = elf.ELFMAG1 + ehdr.e_ident[elf.EI_MAG2] = elf.ELFMAG2 + ehdr.e_ident[elf.EI_MAG3] = elf.ELFMAG3 + ehdr.e_ident[elf.EI_CLASS] = elf.ELFCLASS64 + ehdr.e_ident[elf.EI_DATA] = elf.ELFDATA2LSB + ehdr.e_ident[elf.EI_VERSION] = elf.EV_CURRENT + + ehdr.e_type = elf.ET_CORE + ehdr.e_machine = elf.EM_X86_64 + ehdr.e_version = elf.EV_CURRENT + ehdr.e_phoff = ctypes.sizeof(elf.Elf64_Ehdr()) + ehdr.e_ehsize = ctypes.sizeof(elf.Elf64_Ehdr()) + ehdr.e_phentsize = ctypes.sizeof(elf.Elf64_Phdr()) + #FIXME Case len(phdrs) > PN_XNUM should be handled properly. + # See fs/binfmt_elf.c from linux kernel. + ehdr.e_phnum = len(phdrs) + + return ehdr + + def _gen_phdrs(self, pid, notes, vmas): + """ + Generate program headers for process pid. + """ + phdrs = [] + + offset = ctypes.sizeof(elf.Elf64_Ehdr()) + offset += (len(vmas) + 1)*ctypes.sizeof(elf.Elf64_Phdr()) + + filesz = 0 + for note in notes: + filesz += ctypes.sizeof(note.nhdr) + ctypes.sizeof(note.data) + 8 + + # PT_NOTE + phdr = elf.Elf64_Phdr() + ctypes.memset(ctypes.addressof(phdr), 0, ctypes.sizeof(phdr)) + phdr.p_type = elf.PT_NOTE + phdr.p_offset = offset + phdr.p_filesz = filesz + + phdrs.append(phdr) + + note_align = PAGESIZE - ((offset + filesz) % PAGESIZE) + + if note_align == PAGESIZE: + note_align = 0 + + offset += note_align + + # VMA phdrs + + for vma in vmas: + offset += filesz + filesz = vma.filesz + phdr = elf.Elf64_Phdr() + ctypes.memset(ctypes.addressof(phdr), 0, ctypes.sizeof(phdr)) + phdr.p_type = elf.PT_LOAD + phdr.p_align = PAGESIZE + phdr.p_paddr = 0 + phdr.p_offset = offset + phdr.p_vaddr = vma.start + phdr.p_memsz = vma.memsz + phdr.p_filesz = vma.filesz + phdr.p_flags = vma.flags + + phdrs.append(phdr) + + return phdrs + + def _gen_prpsinfo(self, pid): + """ + Generate NT_PRPSINFO note for process pid. 
+ """ + pstree = self.pstree[pid] + core = self.cores[pid] + + prpsinfo = elf.elf_prpsinfo() + ctypes.memset(ctypes.addressof(prpsinfo), 0, ctypes.sizeof(prpsinfo)) + + # FIXME TASK_ALIVE means that it is either running or sleeping, need to + # teach criu to distinguish them. + TASK_ALIVE = 0x1 + # XXX A bit of confusion here, as in ps "dead" and "zombie" + # state are two separate states, and we use TASK_DEAD for zombies. + TASK_DEAD = 0x2 + TASK_STOPPED = 0x3 + if core["tc"]["task_state"] == TASK_ALIVE: + prpsinfo.pr_state = 0 + if core["tc"]["task_state"] == TASK_DEAD: + prpsinfo.pr_state = 4 + if core["tc"]["task_state"] == TASK_STOPPED: + prpsinfo.pr_state = 3 + # Don't even ask me why it is so, just borrowed from linux + # source and made pr_state match. + prpsinfo.pr_sname = '.' if prpsinfo.pr_state > 5 else "RSDTZW"[prpsinfo.pr_state] + prpsinfo.pr_zomb = 1 if prpsinfo.pr_state == 4 else 0 + prpsinfo.pr_nice = core["thread_core"]["sched_prio"] if "sched_prio" in core["thread_core"] else 0 + prpsinfo.pr_flag = core["tc"]["flags"] + prpsinfo.pr_uid = core["thread_core"]["creds"]["uid"] + prpsinfo.pr_gid = core["thread_core"]["creds"]["gid"] + prpsinfo.pr_pid = pid + prpsinfo.pr_ppid = pstree["ppid"] + prpsinfo.pr_pgrp = pstree["pgid"] + prpsinfo.pr_sid = pstree["sid"] + prpsinfo.pr_fname = core["tc"]["comm"] + prpsinfo.pr_psargs = self._gen_cmdline(pid) + + nhdr = elf.Elf64_Nhdr() + nhdr.n_namesz = 5 + nhdr.n_descsz = ctypes.sizeof(elf.elf_prpsinfo()) + nhdr.n_type = elf.NT_PRPSINFO + + note = elf_note() + note.data = prpsinfo + note.owner = "CORE" + note.nhdr = nhdr + + return note + + def _gen_prstatus(self, pid, tid): + """ + Generate NT_PRSTATUS note for thread tid of process pid. + """ + core = self.cores[tid] + regs = core["thread_info"]["gpregs"] + pstree = self.pstree[pid] + + prstatus = elf.elf_prstatus() + + ctypes.memset(ctypes.addressof(prstatus), 0, ctypes.sizeof(prstatus)) + + #FIXME setting only some of the fields for now. Revisit later. 
+ prstatus.pr_pid = tid + prstatus.pr_ppid = pstree["ppid"] + prstatus.pr_pgrp = pstree["pgid"] + prstatus.pr_sid = pstree["sid"] + + prstatus.pr_reg.r15 = regs["r15"] + prstatus.pr_reg.r14 = regs["r14"] + prstatus.pr_reg.r13 = regs["r13"] + prstatus.pr_reg.r12 = regs["r12"] + prstatus.pr_reg.rbp = regs["bp"] + prstatus.pr_reg.rbx = regs["bx"] + prstatus.pr_reg.r11 = regs["r11"] + prstatus.pr_reg.r10 = regs["r10"] + prstatus.pr_reg.r9 = regs["r9"] + prstatus.pr_reg.r8 = regs["r8"] + prstatus.pr_reg.rax = regs["ax"] + prstatus.pr_reg.rcx = regs["cx"] + prstatus.pr_reg.rdx = regs["dx"] + prstatus.pr_reg.rsi = regs["si"] + prstatus.pr_reg.rdi = regs["di"] + prstatus.pr_reg.orig_rax = regs["orig_ax"] + prstatus.pr_reg.rip = regs["ip"] + prstatus.pr_reg.cs = regs["cs"] + prstatus.pr_reg.eflags = regs["flags"] + prstatus.pr_reg.rsp = regs["sp"] + prstatus.pr_reg.ss = regs["ss"] + prstatus.pr_reg.fs_base = regs["fs_base"] + prstatus.pr_reg.gs_base = regs["gs_base"] + prstatus.pr_reg.ds = regs["ds"] + prstatus.pr_reg.es = regs["es"] + prstatus.pr_reg.fs = regs["fs"] + prstatus.pr_reg.gs = regs["gs"] + + nhdr = elf.Elf64_Nhdr() + nhdr.n_namesz = 5 + nhdr.n_descsz = ctypes.sizeof(elf.elf_prstatus()) + nhdr.n_type = elf.NT_PRSTATUS + + note = elf_note() + note.data = prstatus + note.owner = "CORE" + note.nhdr = nhdr + + return note + + def _gen_fpregset(self, pid, tid): + """ + Generate NT_FPREGSET note for thread tid of process pid. 
+ """ + core = self.cores[tid] + regs = core["thread_info"]["fpregs"] + + fpregset = elf.elf_fpregset_t() + ctypes.memset(ctypes.addressof(fpregset), 0, ctypes.sizeof(fpregset)) + + fpregset.cwd = regs["cwd"] + fpregset.swd = regs["swd"] + fpregset.ftw = regs["twd"] + fpregset.fop = regs["fop"] + fpregset.rip = regs["rip"] + fpregset.rdp = regs["rdp"] + fpregset.mxcsr = regs["mxcsr"] + fpregset.mxcr_mask = regs["mxcsr_mask"] + fpregset.st_space = (ctypes.c_uint * len(regs["st_space"]))(*regs["st_space"]) + fpregset.xmm_space = (ctypes.c_uint * len(regs["xmm_space"]))(*regs["xmm_space"]) + #fpregset.padding = regs["padding"] unused + + nhdr = elf.Elf64_Nhdr() + nhdr.n_namesz = 5 + nhdr.n_descsz = ctypes.sizeof(elf.elf_fpregset_t()) + nhdr.n_type = elf.NT_FPREGSET + + note = elf_note() + note.data = fpregset + note.owner = "CORE" + note.nhdr = nhdr + + return note + + def _gen_x86_xstate(self, pid, tid): + """ + Generate NT_X86_XSTATE note for thread tid of process pid. + """ + core = self.cores[tid] + fpregs = core["thread_info"]["fpregs"] + + data = elf.elf_xsave_struct() + ctypes.memset(ctypes.addressof(data), 0, ctypes.sizeof(data)) + + data.i387.cwd = fpregs["cwd"] + data.i387.swd = fpregs["swd"] + data.i387.twd = fpregs["twd"] + data.i387.fop = fpregs["fop"] + data.i387.rip = fpregs["rip"] + data.i387.rdp = fpregs["rdp"] + data.i387.mxcsr = fpregs["mxcsr"] + data.i387.mxcsr_mask = fpregs["mxcsr_mask"] + data.i387.st_space = (ctypes.c_uint * len(fpregs["st_space"]))(*fpregs["st_space"]) + data.i387.xmm_space = (ctypes.c_uint * len(fpregs["xmm_space"]))(*fpregs["xmm_space"]) + + if "xsave" in fpregs: + data.xsave_hdr.xstate_bv = fpregs["xsave"]["xstate_bv"] + data.ymmh.ymmh_space = (ctypes.c_uint * len(fpregs["xsave"]["ymmh_space"]))(*fpregs["xsave"]["ymmh_space"]) + + nhdr = elf.Elf64_Nhdr() + nhdr.n_namesz = 6 + nhdr.n_descsz = ctypes.sizeof(data) + nhdr.n_type = elf.NT_X86_XSTATE + + note = elf_note() + note.data = data + note.owner = "LINUX" + note.nhdr = 
nhdr + + return note + + def _gen_siginfo(self, pid, tid): + """ + Generate NT_SIGINFO note for thread tid of process pid. + """ + siginfo = elf.siginfo_t() + # FIXME zeroify everything for now + ctypes.memset(ctypes.addressof(siginfo), 0, ctypes.sizeof(siginfo)) + + nhdr = elf.Elf64_Nhdr() + nhdr.n_namesz = 5 + nhdr.n_descsz = ctypes.sizeof(elf.siginfo_t()) + nhdr.n_type = elf.NT_SIGINFO + + note = elf_note() + note.data = siginfo + note.owner = "CORE" + note.nhdr = nhdr + + return note + + def _gen_auxv(self, pid): + """ + Generate NT_AUXV note for thread tid of process pid. + """ + mm = self.mms[pid] + num_auxv = len(mm["mm_saved_auxv"])/2 + + class elf_auxv(ctypes.Structure): + _fields_ = [("auxv", elf.Elf64_auxv_t*num_auxv)] + + auxv = elf_auxv() + for i in range(num_auxv): + auxv.auxv[i].a_type = mm["mm_saved_auxv"][i] + auxv.auxv[i].a_val = mm["mm_saved_auxv"][i+1] + + nhdr = elf.Elf64_Nhdr() + nhdr.n_namesz = 5 + nhdr.n_descsz = ctypes.sizeof(elf_auxv()) + nhdr.n_type = elf.NT_AUXV + + note = elf_note() + note.data = auxv + note.owner = "CORE" + note.nhdr = nhdr + + return note + + def _gen_files(self, pid): + """ + Generate NT_FILE note for process pid. 
+        """
+        mm = self.mms[pid]
+
+        class mmaped_file_info:
+            start = None
+            end = None
+            file_ofs = None
+            name = None
+
+        infos = []
+        for vma in mm["vmas"]:
+            if vma["shmid"] == 0:
+                # shmid == 0 means that it is not a file
+                continue
+
+            shmid = vma["shmid"]
+            size = vma["end"] - vma["start"]
+            off = vma["pgoff"]/PAGESIZE
+
+            files = self.reg_files
+            fname = filter(lambda x: x["id"] == shmid, files)[0]["name"]
+
+            info = mmaped_file_info()
+            info.start = vma["start"]
+            info.end = vma["end"]
+            info.file_ofs = off
+            info.name = fname
+
+            infos.append(info)
+
+        # /*
+        #  * Format of NT_FILE note:
+        #  *
+        #  * long count     -- how many files are mapped
+        #  * long page_size -- units for file_ofs
+        #  * array of [COUNT] elements of
+        #  *   long start
+        #  *   long end
+        #  *   long file_ofs
+        #  * followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL...
+        #  */
+        fields = []
+        fields.append(("count", ctypes.c_long))
+        fields.append(("page_size", ctypes.c_long))
+        for i in range(len(infos)):
+            fields.append(("start"+str(i), ctypes.c_long))
+            fields.append(("end"+str(i), ctypes.c_long))
+            fields.append(("file_ofs"+str(i), ctypes.c_long))
+        for i in range(len(infos)):
+            fields.append(("name"+str(i), ctypes.c_char*(len(infos[i].name)+1)))
+
+        class elf_files(ctypes.Structure):
+            _fields_ = fields
+
+        data = elf_files()
+        data.count = len(infos)
+        data.page_size = PAGESIZE
+        for i in range(len(infos)):
+            info = infos[i]
+            setattr(data, "start"+str(i), info.start)
+            setattr(data, "end"+str(i), info.end)
+            setattr(data, "file_ofs"+str(i), info.file_ofs)
+            setattr(data, "name"+str(i), info.name)
+
+        nhdr = elf.Elf64_Nhdr()
+
+        nhdr.n_namesz = 5#XXX strlen + 1
+        nhdr.n_descsz = ctypes.sizeof(elf_files())
+        nhdr.n_type = elf.NT_FILE
+
+        note = elf_note()
+        note.nhdr = nhdr
+        note.owner = "CORE"
+        note.data = data
+
+        return note
+
+    def _gen_thread_notes(self, pid, tid):
+        notes = []
+
+        notes.append(self._gen_prstatus(pid, tid))
+        notes.append(self._gen_fpregset(pid, tid))
+        notes.append(self._gen_x86_xstate(pid, tid))
+        notes.append(self._gen_siginfo(pid, tid))
+
+        return notes
+
+    def _gen_notes(self, pid):
+        """
+        Generate notes for core dump of process pid.
+        """
+        notes = []
+
+        notes.append(self._gen_prpsinfo(pid))
+
+        threads = self.pstree[pid]["threads"]
+
+        # Main thread first
+        notes += self._gen_thread_notes(pid, pid)
+
+        # Then other threads
+        for tid in threads:
+            if tid == pid:
+                continue
+
+            notes += self._gen_thread_notes(pid, tid)
+
+        notes.append(self._gen_auxv(pid))
+        notes.append(self._gen_files(pid))
+
+        return notes
+
+    def _get_page(self, pid, page_no):
+        """
+        Try to find memory page page_no in pages.img image for process pid.
+        """
+        pagemap = self.pagemaps[pid]
+
+        # First entry is pagemap_head, we will need it later to open
+        # proper pages.img.
+        pages_id = pagemap[0]["pages_id"]
+        off = 0# in pages
+        for m in pagemap[1:]:
+            found = False
+            for i in range(m["nr_pages"]):
+                if m["vaddr"] + i*PAGESIZE == page_no*PAGESIZE:
+                    found = True
+                    break
+                off += 1
+
+            if not found:
+                continue
+
+            if "in_parent" in m and m["in_parent"] == True:
+                ppid = self.pstree[pid]["ppid"]
+                return self._get_page(ppid, page_no)
+            else:
+                with open(self._imgs_dir+"/"+"pages-"+str(pages_id)+".img") as f:
+                    f.seek(off*PAGESIZE)
+                    return f.read(PAGESIZE)
+
+        return None
+
+    def _gen_mem_chunk(self, pid, vma, size):
+        """
+        Obtain vma contents for process pid.
+        """
+        f = None
+
+        if size == 0:
+            return ""
+
+        if vma["status"] & status["VMA_AREA_VVAR"]:
+            #FIXME this is what gdb does, as vvar vma
+            # is not readable from userspace?
+            return "\0"*size
+        elif vma["status"] & status["VMA_AREA_VSYSCALL"]:
+            #FIXME need to dump it with criu or read from
+            # current process.
+            return "\0"*size
+
+        if vma["status"] & status["VMA_FILE_SHARED"] or \
+           vma["status"] & status["VMA_FILE_PRIVATE"]:
+            # Open file before iterating vma pages
+            shmid = vma["shmid"]
+            off = vma["pgoff"]
+
+            files = self.reg_files
+            fname = filter(lambda x: x["id"] == shmid, files)[0]["name"]
+
+            f = open(fname)
+            f.seek(off)
+
+        start = vma["start"]
+        end = vma["start"] + size
+
+        # Split requested memory chunk into pages, so it could be
+        # pictured as:
+        #
+        # "----" -- part of page with memory outside of our vma;
+        # "XXXX" -- memory from our vma;
+        #
+        #  Start page     Pages in the middle        End page
+        # [-----XXXXX]...[XXXXXXXXXX][XXXXXXXXXX]...[XXX-------]
+        #
+        # Each page could be found in pages.img or in a standalone
+        # file described by shmid field in vma entry and
+        # corresponding entry in reg-files.img.
+        # For VMA_FILE_PRIVATE vma, unchanged pages are taken from
+        # a file, and changed ones -- from pages.img.
+        # Finally, if no page is found neither in pages.img nor
+        # in file, hole is inserted -- a page filled with zeroes.
+        start_page = start/PAGESIZE
+        end_page = end/PAGESIZE
+
+        buf = ""
+        for page_no in range(start_page, end_page+1):
+            page = None
+
+            # Search for needed page in pages.img and reg-files.img
+            # and choose appropriate.
+            page_mem = self._get_page(pid, page_no)
+
+            if f != None:
+                page = f.read(PAGESIZE)
+
+            if page_mem != None:
+                # Page from pages.img has higher priority
+                # than one from mapped file on disk.
+                page = page_mem
+
+            if page == None:
+                # Hole
+                page = PAGESIZE*"\0"
+
+            # If it is a start or end page, we need to read
+            # only part of it.
+            if page_no == start_page:
+                n_skip = start - page_no*PAGESIZE
+                if start_page == end_page:
+                    n_read = size
+                else:
+                    n_read = PAGESIZE - n_skip
+            elif page_no == end_page:
+                n_skip = 0
+                n_read = end - page_no*PAGESIZE
+            else:
+                n_skip = 0
+                n_read = PAGESIZE
+
+            buf += page[n_skip : n_skip + n_read]
+
+        # Don't forget to close file.
+        if f != None:
+            f.close()
+
+        return buf
+
+    def _gen_cmdline(self, pid):
+        """
+        Generate full command with arguments.
+        """
+        mm = self.mms[pid]
+
+        vma = {}
+        vma["start"] = mm["mm_arg_start"]
+        vma["end"] = mm["mm_arg_end"]
+        # Dummy flags and status.
+        vma["flags"] = 0
+        vma["status"] = 0
+        size = vma["end"] - vma["start"]
+
+        chunk = self._gen_mem_chunk(pid, vma, size)
+
+        # Replace all '\0's with spaces.
+        return chunk.replace('\0', ' ')
+
+    def _get_vma_dump_size(self, vma):
+        """
+        Calculate amount of vma to put into core dump.
+        """
+        if vma["status"] & status["VMA_AREA_VVAR"] or \
+           vma["status"] & status["VMA_AREA_VSYSCALL"] or \
+           vma["status"] & status["VMA_AREA_VDSO"]:
+            size = vma["end"] - vma["start"]
+        elif vma["prot"] == 0:
+            size = 0
+        elif vma["prot"] & prot["PROT_READ"] and \
+             vma["prot"] & prot["PROT_EXEC"]:
+            size = PAGESIZE
+        elif vma["status"] & status["VMA_ANON_SHARED"] or \
+             vma["status"] & status["VMA_FILE_SHARED"] or \
+             vma["status"] & status["VMA_ANON_PRIVATE"] or \
+             vma["status"] & status["VMA_FILE_PRIVATE"]:
+            size = vma["end"] - vma["start"]
+        else:
+            size = 0
+
+        return size
+
+    def _get_vma_flags(self, vma):
+        """
+        Convert vma flags into elf flags.
+        """
+        flags = 0
+
+        if vma['prot'] & prot["PROT_READ"]:
+            flags = flags | elf.PF_R
+
+        if vma['prot'] & prot["PROT_WRITE"]:
+            flags = flags | elf.PF_W
+
+        if vma['prot'] & prot["PROT_EXEC"]:
+            flags = flags | elf.PF_X
+
+        return flags
+
+    def _gen_vmas(self, pid):
+        """
+        Generate vma contents for core dump for process pid.
+        """
+        mm = self.mms[pid]
+
+        class vma_class:
+            data = None
+            filesz = None
+            memsz = None
+            flags = None
+            start = None
+
+        vmas = []
+        for vma in mm["vmas"]:
+            size = self._get_vma_dump_size(vma)
+
+            chunk = self._gen_mem_chunk(pid, vma, size)
+
+            v = vma_class()
+            v.filesz = size  # reuse the dump size computed above
+            v.data = chunk  # reuse the chunk read above; a second _gen_mem_chunk call re-read every page
+            v.memsz = vma["end"] - vma["start"]
+            v.start = vma["start"]
+            v.flags = self._get_vma_flags(vma)
+
+            vmas.append(v)
+
+        return vmas
diff --git a/CRIU_code/coredump/criu_coredump/elf.py b/CRIU_code/coredump/criu_coredump/elf.py
new file mode 100644
index 0000000..1da06a6
--- /dev/null
+++ b/CRIU_code/coredump/criu_coredump/elf.py
@@ -0,0 +1,526 @@
+# Define structures and constants for generating elf file.
+import ctypes
+
+Elf64_Half = ctypes.c_uint16  # typedef uint16_t Elf64_Half;
+Elf64_Word = ctypes.c_uint32  # typedef uint32_t Elf64_Word;
+Elf64_Addr = ctypes.c_uint64  # typedef uint64_t Elf64_Addr;
+Elf64_Off = ctypes.c_uint64  # typedef uint64_t Elf64_Off;
+Elf64_Xword = ctypes.c_uint64  # typedef uint64_t Elf64_Xword;
+
+# Elf64_Ehdr related constants.
+
+# e_ident size.
+EI_NIDENT = 16 # #define EI_NIDENT (16) + +EI_MAG0 = 0 # #define EI_MAG0 0 /* File identification byte 0 index */ +ELFMAG0 = 0x7f # #define ELFMAG0 0x7f /* Magic number byte 0 */ + +EI_MAG1 = 1 # #define EI_MAG1 1 /* File identification byte 1 index */ +ELFMAG1 = ord('E') # #define ELFMAG1 'E' /* Magic number byte 1 */ + +EI_MAG2 = 2 # #define EI_MAG2 2 /* File identification byte 2 index */ +ELFMAG2 = ord('L') # #define ELFMAG2 'L' /* Magic number byte 2 */ + +EI_MAG3 = 3 # #define EI_MAG3 3 /* File identification byte 3 index */ +ELFMAG3 = ord('F') # #define ELFMAG3 'F' /* Magic number byte 3 */ + +EI_CLASS = 4 # #define EI_CLASS 4 /* File class byte index */ + +EI_DATA = 5 # #define EI_DATA 5 /* Data encoding byte index */ + +EI_VERSION = 6 # #define EI_VERSION 6 /* File version byte index */ + +ELFDATA2LSB = 1 # #define ELFDATA2LSB 1 /* 2's complement, little endian */ + +ELFCLASS64 = 2 # #define ELFCLASS64 2 /* 64-bit objects */ + +# Legal values for e_type (object file type). +ET_CORE = 4 # #define ET_CORE 4 /* Core file */ + +# Legal values for e_machine (architecture). +EM_X86_64 = 62 # #define EM_X86_64 62 /* AMD x86-64 architecture */ + +# Legal values for e_version (version). 
+EV_CURRENT = 1 # #define EV_CURRENT 1 /* Current version */ + +class Elf64_Ehdr(ctypes.Structure): # typedef struct + _fields_ = [ # { + ("e_ident", ctypes.c_ubyte*EI_NIDENT), # unsigned char e_ident[EI_NIDENT]; + ("e_type", Elf64_Half), # Elf64_Half e_type; + ("e_machine", Elf64_Half), # Elf64_Half e_machine; + ("e_version", Elf64_Word), # Elf64_Word e_version; + ("e_entry", Elf64_Addr), # Elf64_Addr e_entry; + ("e_phoff", Elf64_Off), # Elf64_Off e_phoff; + ("e_shoff", Elf64_Off), # Elf64_Off e_shoff; + ("e_flags", Elf64_Word), # Elf64_Word e_flags; + ("e_ehsize", Elf64_Half), # Elf64_Half e_ehsize; + ("e_phentsize", Elf64_Half), # Elf64_Half e_phentsize; + ("e_phnum", Elf64_Half), # Elf64_Half e_phnum; + ("e_shentsize", Elf64_Half), # Elf64_Half e_shentsize; + ("e_shnum", Elf64_Half), # Elf64_Half e_shnum; + ("e_shstrndx", Elf64_Half) # Elf64_Half e_shstrndx; + ] # } Elf64_Ehdr; + + +# Elf64_Phdr related constants. + +# Legal values for p_type (segment type). +PT_LOAD = 1 # #define PT_LOAD 1 /* Loadable program segment */ +PT_NOTE = 4 # #define PT_NOTE 4 /* Auxiliary information */ + +# Legal values for p_flags (segment flags). +PF_X = 1 # #define PF_X (1 << 0) /* Segment is executable */ +PF_W = 1 << 1 # #define PF_W (1 << 1) /* Segment is writable */ +PF_R = 1 << 2 # #define PF_R (1 << 2) /* Segment is readable */ + +class Elf64_Phdr(ctypes.Structure): # typedef struct + _fields_ = [ # { + ("p_type", Elf64_Word), # Elf64_Word p_type; + ("p_flags", Elf64_Word), # Elf64_Word p_flags; + ("p_offset", Elf64_Off), # Elf64_Off p_offset; + ("p_vaddr", Elf64_Addr), # Elf64_Addr p_vaddr; + ("p_paddr", Elf64_Addr), # Elf64_Addr p_paddr; + ("p_filesz", Elf64_Xword), # Elf64_Xword p_filesz; + ("p_memsz", Elf64_Xword), # Elf64_Xword p_memsz; + ("p_align", Elf64_Xword), # Elf64_Xword p_align; + ] # } Elf64_Phdr; + + +# Elf64_auxv_t related constants. 
+ +class _Elf64_auxv_t_U(ctypes.Union): + _fields_ = [ + ("a_val", ctypes.c_uint64) + ] + +class Elf64_auxv_t(ctypes.Structure): # typedef struct + _fields_ = [ # { + ("a_type", ctypes.c_uint64), # uint64_t a_type; /* Entry type */ + ("a_un", _Elf64_auxv_t_U) # union + # { + # uint64_t a_val; /* Integer value */ + # /* We use to have pointer elements added here. We cannot do that, + # though, since it does not work when using 32-bit definitions + # on 64-bit platforms and vice versa. */ + # } a_un; + ] # } Elf64_auxv_t; + + +# Elf64_Nhdr related constants. + +NT_PRSTATUS = 1 # #define NT_PRSTATUS 1 /* Contains copy of prstatus struct */ +NT_FPREGSET = 2 # #define NT_FPREGSET 2 /* Contains copy of fpregset struct */ +NT_PRPSINFO = 3 # #define NT_PRPSINFO 3 /* Contains copy of prpsinfo struct */ +NT_AUXV = 6 # #define NT_AUXV 6 /* Contains copy of auxv array */ +NT_SIGINFO = 0x53494749 # #define NT_SIGINFO 0x53494749 /* Contains copy of siginfo_t, +# size might increase */ +NT_FILE = 0x46494c45 # #define NT_FILE 0x46494c45 /* Contains information about mapped +# files */ +NT_X86_XSTATE = 0x202 # #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ + +class Elf64_Nhdr(ctypes.Structure): # typedef struct + _fields_ = [ # { + ("n_namesz", Elf64_Word), # Elf64_Word n_namesz; /* Length of the note's name. */ + ("n_descsz", Elf64_Word), # Elf64_Word n_descsz; /* Length of the note's descriptor. */ + ("n_type", Elf64_Word), # Elf64_Word n_type; /* Type of the note. */ + ] # } Elf64_Nhdr; + + +# Elf64_Shdr related constants. 
+
+class Elf64_Shdr(ctypes.Structure):  # typedef struct
+    _fields_ = [  # {
+        ("sh_name", Elf64_Word),  # Elf64_Word sh_name; /* Section name (string tbl index) */
+        ("sh_type", Elf64_Word),  # Elf64_Word sh_type; /* Section type */
+        ("sh_flags", Elf64_Xword),  # Elf64_Xword sh_flags; /* Section flags */
+        ("sh_addr", Elf64_Addr),  # Elf64_Addr sh_addr; /* Section virtual addr at execution */
+        ("sh_offset", Elf64_Off),  # Elf64_Off sh_offset; /* Section file offset */
+        ("sh_size", Elf64_Xword),  # Elf64_Xword sh_size; /* Section size in bytes */
+        ("sh_link", Elf64_Word),  # Elf64_Word sh_link; /* Link to another section */
+        ("sh_info", Elf64_Word),  # Elf64_Word sh_info; /* Additional section information */
+        ("sh_addralign",Elf64_Xword),  # Elf64_Xword sh_addralign; /* Section alignment */
+        ("sh_entsize", Elf64_Xword)  # Elf64_Xword sh_entsize; /* Entry size if section holds table */
+    ]  # } Elf64_Shdr;
+
+
+# elf_prstatus related constants.
+
+# Signal info.
+class elf_siginfo(ctypes.Structure):  # struct elf_siginfo
+    _fields_ = [  # {
+        ("si_signo", ctypes.c_int),  # int si_signo; /* Signal number. */
+        ("si_code", ctypes.c_int),  # int si_code; /* Extra code. */
+        ("si_errno", ctypes.c_int)  # int si_errno; /* Errno. */
+    ]  # };
+
+# A time value that is accurate to the nearest
+# microsecond but also has a range of years.
+class timeval(ctypes.Structure):  # struct timeval
+    _fields_ = [  # {
+        ("tv_sec", ctypes.c_long),  # __time_t tv_sec; /* Seconds. */
+        ("tv_usec", ctypes.c_long)  # __suseconds_t tv_usec; /* Microseconds. */
+    ]  # };
+
+class user_regs_struct(ctypes.Structure):  # struct user_regs_struct
+    _fields_ = [  # {
+        ("r15", ctypes.c_ulonglong),  # __extension__ unsigned long long int r15;
+        ("r14", ctypes.c_ulonglong),  # __extension__ unsigned long long int r14;
+        ("r13", ctypes.c_ulonglong),  # __extension__ unsigned long long int r13;
+        ("r12", ctypes.c_ulonglong),  # __extension__ unsigned long long int r12;
+        ("rbp", ctypes.c_ulonglong),  # __extension__ unsigned long long int rbp;
+        ("rbx", ctypes.c_ulonglong),  # __extension__ unsigned long long int rbx;
+        ("r11", ctypes.c_ulonglong),  # __extension__ unsigned long long int r11;
+        ("r10", ctypes.c_ulonglong),  # __extension__ unsigned long long int r10;
+        ("r9", ctypes.c_ulonglong),  # __extension__ unsigned long long int r9;
+        ("r8", ctypes.c_ulonglong),  # __extension__ unsigned long long int r8;
+        ("rax", ctypes.c_ulonglong),  # __extension__ unsigned long long int rax;
+        ("rcx", ctypes.c_ulonglong),  # __extension__ unsigned long long int rcx;
+        ("rdx", ctypes.c_ulonglong),  # __extension__ unsigned long long int rdx;
+        ("rsi", ctypes.c_ulonglong),  # __extension__ unsigned long long int rsi;
+        ("rdi", ctypes.c_ulonglong),  # __extension__ unsigned long long int rdi;
+        ("orig_rax", ctypes.c_ulonglong),  # __extension__ unsigned long long int orig_rax;
+        ("rip", ctypes.c_ulonglong),  # __extension__ unsigned long long int rip;
+        ("cs", ctypes.c_ulonglong),  # __extension__ unsigned long long int cs;
+        ("eflags", ctypes.c_ulonglong),  # __extension__ unsigned long long int eflags;
+        ("rsp", ctypes.c_ulonglong),  # __extension__ unsigned long long int rsp;
+        ("ss", ctypes.c_ulonglong),  # __extension__ unsigned long long int ss;
+        ("fs_base", ctypes.c_ulonglong),  # __extension__ unsigned long long int fs_base;
+        ("gs_base", ctypes.c_ulonglong),  # __extension__ unsigned long long int gs_base;
+        ("ds", ctypes.c_ulonglong),  # __extension__ unsigned long long int ds;
+        ("es", ctypes.c_ulonglong),  # __extension__ unsigned long long int es;
+        ("fs", ctypes.c_ulonglong),  # __extension__ unsigned long long int fs;
+        ("gs", ctypes.c_ulonglong)  # __extension__ unsigned long long int gs;
+    ]  # };
+
+#elf_greg_t = ctypes.c_ulonglong
+#ELF_NGREG = ctypes.sizeof(user_regs_struct)/ctypes.sizeof(elf_greg_t)
+#elf_gregset_t = elf_greg_t*ELF_NGREG
+elf_gregset_t = user_regs_struct
+
+class elf_prstatus(ctypes.Structure):  # struct elf_prstatus
+    _fields_ = [  # {
+        ("pr_info", elf_siginfo),  # struct elf_siginfo pr_info; /* Info associated with signal. */
+        ("pr_cursig", ctypes.c_short),  # short int pr_cursig; /* Current signal. */
+        ("pr_sigpend", ctypes.c_ulong),  # unsigned long int pr_sigpend; /* Set of pending signals. */
+        ("pr_sighold", ctypes.c_ulong),  # unsigned long int pr_sighold; /* Set of held signals. */
+        ("pr_pid", ctypes.c_int),  # __pid_t pr_pid;
+        ("pr_ppid", ctypes.c_int),  # __pid_t pr_ppid;
+        ("pr_pgrp", ctypes.c_int),  # __pid_t pr_pgrp;
+        ("pr_sid", ctypes.c_int),  # __pid_t pr_sid;
+        ("pr_utime", timeval),  # struct timeval pr_utime; /* User time. */
+        ("pr_stime", timeval),  # struct timeval pr_stime; /* System time. */
+        ("pr_cutime", timeval),  # struct timeval pr_cutime; /* Cumulative user time. */
+        ("pr_cstime", timeval),  # struct timeval pr_cstime; /* Cumulative system time. */
+        ("pr_reg", elf_gregset_t),  # elf_gregset_t pr_reg; /* GP registers. */
+        ("pr_fpvalid", ctypes.c_int)  # int pr_fpvalid; /* True if math copro being used. */
+    ]  # };
+
+
+# elf_prpsinfo related constants.
+
+ELF_PRARGSZ = 80  # #define ELF_PRARGSZ (80) /* Number of chars for args. */
+
+class elf_prpsinfo(ctypes.Structure):  # struct elf_prpsinfo
+    _fields_ = [  # {
+        ("pr_state", ctypes.c_byte),  # char pr_state; /* Numeric process state. */
+        ("pr_sname", ctypes.c_char),  # char pr_sname; /* Char for pr_state. */
+        ("pr_zomb", ctypes.c_byte),  # char pr_zomb; /* Zombie. */
+        ("pr_nice", ctypes.c_byte),  # char pr_nice; /* Nice val. */
+        ("pr_flag", ctypes.c_ulong),  # unsigned long int pr_flag; /* Flags. */
+        # #if __WORDSIZE == 32
+        # unsigned short int pr_uid;
+        # unsigned short int pr_gid;
+        # #else
+        ("pr_uid", ctypes.c_uint),  # unsigned int pr_uid;
+        ("pr_gid", ctypes.c_uint),  # unsigned int pr_gid;
+        # #endif
+        ("pr_pid", ctypes.c_int),  # int pr_pid, pr_ppid, pr_pgrp, pr_sid;
+        ("pr_ppid", ctypes.c_int),
+        ("pr_pgrp", ctypes.c_int),
+        ("pr_sid", ctypes.c_int),
+        # /* Lots missing */
+        ("pr_fname", ctypes.c_char*16),  # char pr_fname[16]; /* Filename of executable. */
+        ("pr_psargs", ctypes.c_char*ELF_PRARGSZ)  # char pr_psargs[ELF_PRARGSZ]; /* Initial part of arg list. */
+    ]  # };
+
+
+class user_fpregs_struct(ctypes.Structure):  # struct user_fpregs_struct
+    _fields_ = [  # {
+        ("cwd", ctypes.c_ushort),  # unsigned short int cwd;
+        ("swd", ctypes.c_ushort),  # unsigned short int swd;
+        ("ftw", ctypes.c_ushort),  # unsigned short int ftw;
+        ("fop", ctypes.c_ushort),  # unsigned short int fop;
+        ("rip", ctypes.c_ulonglong),  # __extension__ unsigned long long int rip;
+        ("rdp", ctypes.c_ulonglong),  # __extension__ unsigned long long int rdp;
+        ("mxcsr", ctypes.c_uint),  # unsigned int mxcsr;
+        ("mxcr_mask", ctypes.c_uint),  # unsigned int mxcr_mask;
+        ("st_space", ctypes.c_uint*32),  # unsigned int st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
+        ("xmm_space", ctypes.c_uint*64),  # unsigned int xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
+        ("padding", ctypes.c_uint*24),  # unsigned int padding[24];
+    ]  # };
+
+
+elf_fpregset_t = user_fpregs_struct
+
+
+# siginfo_t related constants.
+
+_SI_MAX_SIZE = 128
+_SI_PAD_SIZE = (_SI_MAX_SIZE//ctypes.sizeof(ctypes.c_int)) - 4  # floor division: a ctypes array length must be an int, '/' yields a float on Python 3
+
+# /* kill(). */
+class _siginfo_t_U_kill(ctypes.Structure):  # struct
+    _fields_ = [  # {
+        ("si_pid", ctypes.c_int),  # __pid_t si_pid; /* Sending process ID. */
+        ("si_uid", ctypes.c_uint)  # __uid_t si_uid; /* Real user ID of sending process. */
+    ]  # } _kill;
+
+
+
+# Type for data associated with a signal.
+class sigval_t(ctypes.Union):  # typedef union sigval
+    _fields_ = [  # {
+        ("sival_int", ctypes.c_int),  # int sival_int;
+        ("sical_ptr", ctypes.c_void_p),  # void *sival_ptr;
+    ]  # } sigval_t;
+
+# /* POSIX.1b timers. */
+class _siginfo_t_U_timer(ctypes.Structure):  # struct
+    _fields_ = [  # {
+        ("si_tid", ctypes.c_int),  # int si_tid; /* Timer ID. */
+        ("si_overrun", ctypes.c_int),  # int si_overrun; /* Overrun count. */
+        ("si_sigval", sigval_t)  # sigval_t si_sigval; /* Signal value. */
+    ]  # } _timer;
+
+
+# /* POSIX.1b signals. */
+class _siginfo_t_U_rt(ctypes.Structure):  # struct
+    _fields_ = [  # {
+        ("si_pid", ctypes.c_int),  # __pid_t si_pid; /* Sending process ID. */
+        ("si_uid", ctypes.c_uint),  # __uid_t si_uid; /* Real user ID of sending process. */
+        ("si_sigval", sigval_t)  # sigval_t si_sigval; /* Signal value. */
+    ]  # } _rt;
+
+
+# /* SIGCHLD. */
+class _siginfo_t_U_sigchld(ctypes.Structure):  # struct
+    _fields_ = [  # {
+        ("si_pid", ctypes.c_int),  # __pid_t si_pid; /* Which child. */
+        ("si_uid", ctypes.c_uint),  # __uid_t si_uid; /* Real user ID of sending process. */
+        ("si_status", ctypes.c_int),  # int si_status; /* Exit value or signal. */
+        ("si_utime", ctypes.c_long),  # __sigchld_clock_t si_utime;
+        ("si_stime", ctypes.c_long)  # __sigchld_clock_t si_stime;
+    ]  # } _sigchld;
+
+# /* SIGILL, SIGFPE, SIGSEGV, SIGBUS. */
+class _siginfo_t_U_sigfault(ctypes.Structure):  # struct
+    _fields_ = [  # {
+        ("si_addr", ctypes.c_void_p),  # void *si_addr; /* Faulting insn/memory ref. */
+        ("si_addr_lsb", ctypes.c_short)  # short int si_addr_lsb; /* Valid LSB of the reported address. */
+    ]  # } _sigfault;
+
+# /* SIGPOLL. */
+class _siginfo_t_U_sigpoll(ctypes.Structure):  # struct
+    _fields_ = [  # {
+        ("si_band", ctypes.c_long),  # long int si_band; /* Band event for SIGPOLL. */
+        ("si_fd", ctypes.c_int)  # int si_fd;
+    ]  # } _sigpoll;
+
+
+# /* SIGSYS. */
+class _siginfo_t_U_sigsys(ctypes.Structure):  # struct
+    _fields_ = [  # {
+        ("_call_addr", ctypes.c_void_p),  # void *_call_addr; /* Calling user insn. */
+        ("_syscall", ctypes.c_int),  # int _syscall; /* Triggering system call number. */
+        ("_arch", ctypes.c_uint)  # unsigned int _arch; /* AUDIT_ARCH_* of syscall. */
+    ]  # } _sigsys;
+
+
+class _siginfo_t_U(ctypes.Union):  # union
+    _fields_ = [  # {
+        ("_pad", ctypes.c_int*_SI_PAD_SIZE),  # int _pad[__SI_PAD_SIZE];
+        #
+        # /* kill(). */
+        ("_kill", _siginfo_t_U_kill),  # struct
+        #   {
+        #     __pid_t si_pid; /* Sending process ID. */
+        #     __uid_t si_uid; /* Real user ID of sending process. */
+        #   } _kill;
+        #
+        # /* POSIX.1b timers. */
+        ("_timer", _siginfo_t_U_timer),  # struct
+        #   {
+        #     int si_tid; /* Timer ID. */
+        #     int si_overrun; /* Overrun count. */
+        #     sigval_t si_sigval; /* Signal value. */
+        #   } _timer;
+        #
+        # /* POSIX.1b signals. */
+        ("_rt", _siginfo_t_U_rt),  # struct
+        #   {
+        #     __pid_t si_pid; /* Sending process ID. */
+        #     __uid_t si_uid; /* Real user ID of sending process. */
+        #     sigval_t si_sigval; /* Signal value. */
+        #   } _rt;
+        #
+        # /* SIGCHLD. */
+        ("_sigchld", _siginfo_t_U_sigchld),  # struct
+        #   {
+        #     __pid_t si_pid; /* Which child. */
+        #     __uid_t si_uid; /* Real user ID of sending process. */
+        #     int si_status; /* Exit value or signal. */
+        #     __sigchld_clock_t si_utime;
+        #     __sigchld_clock_t si_stime;
+        #   } _sigchld;
+        #
+        # /* SIGILL, SIGFPE, SIGSEGV, SIGBUS. */
+        ("_sigfault", _siginfo_t_U_sigfault),  # struct
+        #   {
+        #     void *si_addr; /* Faulting insn/memory ref. */
+        #     short int si_addr_lsb; /* Valid LSB of the reported address. */
+        #   } _sigfault;
+        #
+        # /* SIGPOLL. */
+        ("_sigpoll", _siginfo_t_U_sigpoll),  # struct
+        #   {
+        #     long int si_band; /* Band event for SIGPOLL. */
+        #     int si_fd;
+        #   } _sigpoll;
+        #
+        # /* SIGSYS. */
+        ("_sigsys", _siginfo_t_U_sigsys)  # struct
+        #   {
+        #     void *_call_addr; /* Calling user insn. */
+        #     int _syscall; /* Triggering system call number. */
+        #     unsigned int _arch; /* AUDIT_ARCH_* of syscall. */
+        #   } _sigsys;
+    ]  # } _sifields;
+
+class siginfo_t(ctypes.Structure):  # typedef struct
+    _fields_ = [  # {
+        ("si_signo", ctypes.c_int),  # int si_signo; /* Signal number. */
+        ("si_errno", ctypes.c_int),  # int si_errno; /* If non-zero, an errno value associated with
+        #                 this signal, as defined in <errno.h>. */
+        ("si_code", ctypes.c_int),  # int si_code; /* Signal code. */
+        #
+        ("_sifields", _siginfo_t_U)  # union
+        #   {
+        #     int _pad[__SI_PAD_SIZE];
+        #
+        #     /* kill(). */
+        #     struct
+        #       {
+        #         __pid_t si_pid; /* Sending process ID. */
+        #         __uid_t si_uid; /* Real user ID of sending process. */
+        #       } _kill;
+        #
+        #     /* POSIX.1b timers. */
+        #     struct
+        #       {
+        #         int si_tid; /* Timer ID. */
+        #         int si_overrun; /* Overrun count. */
+        #         sigval_t si_sigval; /* Signal value. */
+        #       } _timer;
+        #
+        #     /* POSIX.1b signals. */
+        #     struct
+        #       {
+        #         __pid_t si_pid; /* Sending process ID. */
+        #         __uid_t si_uid; /* Real user ID of sending process. */
+        #         sigval_t si_sigval; /* Signal value. */
+        #       } _rt;
+        #
+        #     /* SIGCHLD. */
+        #     struct
+        #       {
+        #         __pid_t si_pid; /* Which child. */
+        #         __uid_t si_uid; /* Real user ID of sending process. */
+        #         int si_status; /* Exit value or signal. */
+        #         __sigchld_clock_t si_utime;
+        #         __sigchld_clock_t si_stime;
+        #       } _sigchld;
+        #
+        #     /* SIGILL, SIGFPE, SIGSEGV, SIGBUS. */
+        #     struct
+        #       {
+        #         void *si_addr; /* Faulting insn/memory ref. */
+        #         short int si_addr_lsb; /* Valid LSB of the reported address. */
+        #       } _sigfault;
+        #
+        #     /* SIGPOLL. */
+        #     struct
+        #       {
+        #         long int si_band; /* Band event for SIGPOLL. */
+        #         int si_fd;
+        #       } _sigpoll;
+        #
+        #     /* SIGSYS. */
+        #     struct
+        #       {
+        #         void *_call_addr; /* Calling user insn. */
+        #         int _syscall; /* Triggering system call number. */
+        #         unsigned int _arch; /* AUDIT_ARCH_* of syscall. */
+        #       } _sigsys;
+        #   } _sifields;
+    ]  # } siginfo_t __SI_ALIGNMENT;
+
+
+# xsave related.
+ +class ymmh_struct(ctypes.Structure): # struct ymmh_struct { + _fields_ = [ + ("ymmh_space", 64*ctypes.c_uint) # u32 ymmh_space[64]; + ] # } __packed; + + +class xsave_hdr_struct(ctypes.Structure): # struct xsave_hdr_struct { + _fields_ = [ + ("xstate_bv", ctypes.c_ulonglong), # u64 xstate_bv; + ("reserved1", ctypes.c_ulonglong*2), # u64 reserved1[2]; + ("reserved2", ctypes.c_ulonglong*5) # u64 reserved2[5]; + ] # } __packed; + + +class i387_fxsave_struct(ctypes.Structure): # struct i387_fxsave_struct { + _fields_ = [ + ("cwd", ctypes.c_ushort), # u16 cwd; /* Control Word */ + ("swd", ctypes.c_ushort), # u16 swd; /* Status Word */ + ("twd", ctypes.c_ushort), # u16 twd; /* Tag Word */ + ("fop", ctypes.c_ushort), # u16 fop; /* Last Instruction Opcode */ + # union { + # struct { + ("rip", ctypes.c_ulonglong), # u64 rip; /* Instruction Pointer */ + ("rdp", ctypes.c_ulonglong), # u64 rdp; /* Data Pointer */ + # }; + # struct { + # u32 fip; /* FPU IP Offset */ + # u32 fcs; /* FPU IP Selector */ + # u32 foo; /* FPU Operand Offset */ + # u32 fos; /* FPU Operand Selector */ + # }; + # }; + ("mxcsr", ctypes.c_uint), # u32 mxcsr; /* MXCSR Register State */ + ("mxcsr_mask", ctypes.c_uint), # u32 mxcsr_mask; /* MXCSR Mask */ + # + # /* 8*16 bytes for each FP-reg = 128 bytes */ + ("st_space", ctypes.c_uint*32), # u32 st_space[32]; +# + # /* 16*16 bytes for each XMM-reg = 256 bytes */ + ("xmm_space", ctypes.c_uint*64), # u32 xmm_space[64]; + # + ("padding", ctypes.c_uint*12), # u32 padding[12]; + # + # union { + ("padding1", ctypes.c_uint*12) # u32 padding1[12]; + # u32 sw_reserved[12]; + # }; + # + ] # } __aligned(16); + + +class elf_xsave_struct(ctypes.Structure): # struct xsave_struct { + _fields_ = [ + ("i387", i387_fxsave_struct), # struct i387_fxsave_struct i387; + ("xsave_hdr", xsave_hdr_struct), # struct xsave_hdr_struct xsave_hdr; + ("ymmh", ymmh_struct) # struct ymmh_struct ymmh; + ] # } __aligned(FP_MIN_ALIGN_BYTES) __packed; diff --git a/CRIU_code/coredump/pycriu 
b/CRIU_code/coredump/pycriu new file mode 100644 index 0000000..d13a879 --- /dev/null +++ b/CRIU_code/coredump/pycriu @@ -0,0 +1 @@ +../lib/py/ \ No newline at end of file diff --git a/CRIU_code/crit/Makefile b/CRIU_code/crit/Makefile new file mode 100644 index 0000000..988b481 --- /dev/null +++ b/CRIU_code/crit/Makefile @@ -0,0 +1,13 @@ + +all-y += crit + +crit/crit: crit/crit-$(PYTHON) + $(Q) cp $^ $@ +crit: crit/crit +.PHONY: crit + +clean-crit: + $(Q) $(RM) crit/crit +.PHONY: clean-crit +clean: clean-crit +mrproper: clean diff --git a/CRIU_code/crit/crit-python2 b/CRIU_code/crit/crit-python2 new file mode 100644 index 0000000..b0b7d3c --- /dev/null +++ b/CRIU_code/crit/crit-python2 @@ -0,0 +1,6 @@ +#!/usr/bin/env python2 + +from pycriu import cli + +if __name__ == '__main__': + cli.main() diff --git a/CRIU_code/crit/crit-python3 b/CRIU_code/crit/crit-python3 new file mode 100644 index 0000000..80467cb --- /dev/null +++ b/CRIU_code/crit/crit-python3 @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 + +from pycriu import cli + +if __name__ == '__main__': + cli.main() diff --git a/CRIU_code/crit/pycriu b/CRIU_code/crit/pycriu new file mode 100644 index 0000000..d13a879 --- /dev/null +++ b/CRIU_code/crit/pycriu @@ -0,0 +1 @@ +../lib/py/ \ No newline at end of file diff --git a/CRIU_code/criu/Makefile b/CRIU_code/criu/Makefile new file mode 100644 index 0000000..4134e50 --- /dev/null +++ b/CRIU_code/criu/Makefile @@ -0,0 +1,135 @@ +# here is a workaround for a bug in libnl-3: +# 6a8d90f5fec4 "attr: Allow attribute type 0" +WRAPFLAGS += -Wl,--wrap=nla_parse,--wrap=nlmsg_parse + +ARCH_DIR := criu/arch/$(SRCARCH) +PIE_DIR := criu/pie +export ARCH_DIR PIE_DIR + +ifeq ($(filter clean mrproper,$(MAKECMDGOALS)),) + CFLAGS += $(shell $(COMPEL_BIN) includes) + COMPEL_LIBS := $(shell $(COMPEL_BIN) --static libs) + CFLAGS_PIE += $(shell $(COMPEL_BIN) cflags) +endif + +# +# Configuration file paths +CONFIG-DEFINES += -DSYSCONFDIR='"/etc"' +CONFIG-DEFINES += 
-DGLOBAL_CONFIG_DIR='"/etc/criu/"' +CONFIG-DEFINES += -DDEFAULT_CONFIG_FILENAME='"default.conf"' +CONFIG-DEFINES += -DUSER_CONFIG_DIR='".criu/"' + +# +# General flags. +CFLAGS += -fno-strict-aliasing +CFLAGS += -iquote criu/include +CFLAGS += -iquote include +CFLAGS += -iquote images +CFLAGS += -iquote $(ARCH_DIR)/include +CFLAGS += -iquote . +CFLAGS += $(shell pkg-config --cflags libnl-3.0) +CFLAGS += $(CONFIG-DEFINES) + +ifeq ($(GMON),1) + CFLAGS += -pg + GMONLDOPT := -pg +endif + +# msg-* printing +include $(__nmk_dir)msg.mk + +# +# Needed libraries checks +include criu/Makefile.packages + +# +# Architecture dependent part. +ARCH-LIB := $(ARCH_DIR)/crtools.built-in.o +$(ARCH-LIB): .FORCE + $(Q) $(MAKE) $(build)=$(ARCH_DIR) all + +# +# PIE library code. +criu/pie/pie.lib.a: $(ARCH-LIB) .FORCE + $(Q) $(MAKE) $(call build-as,Makefile.library,criu/pie) all + +# +# PIE code blobs themseves. +pie: criu/pie/pie.lib.a + $(Q) $(MAKE) $(build)=criu/pie all +.PHONY: pie + +criu/pie/Makefile: ; +criu/pie/Makefile.library: ; +criu/pie/%: pie ; + +# +# CRIU executable +PROGRAM-BUILTINS += criu/pie/pie.lib.a +PROGRAM-BUILTINS += images/built-in.o +PROGRAM-BUILTINS += $(obj)/built-in.o +PROGRAM-BUILTINS += $(ARCH-LIB) +PROGRAM-BUILTINS += soccr/libsoccr.a +PROGRAM-BUILTINS += $(COMPEL_LIBS) + +$(obj)/built-in.o: pie + $(Q) $(MAKE) $(call build-as,Makefile.crtools,criu) all + + +$(obj)/Makefile: ; +$(obj)/Makefile.crtools: ; +$(obj)/Makefile.packages: ; + +$(obj)/%: pie + $(Q) $(MAKE) $(call build-as,Makefile.crtools,criu) $@ + +$(obj)/criu: $(PROGRAM-BUILTINS) + $(call msg-link, $@) + $(Q) $(CC) $(CFLAGS) $^ $(LIBS) $(WRAPFLAGS) $(LDFLAGS) $(GMONLDOPT) -rdynamic -o $@ + + +# +# Clean the most, except generated c files +subclean: + $(Q) $(RM) $(obj)/*.{gcda,gcno,gcov} + $(Q) $(RM) $(obj)/pie/*.{gcda,gcno,gcov} + $(Q) $(RM) -r $(obj)/gcov + $(Q) $(MAKE) $(build)=$(ARCH_DIR) clean + $(Q) $(MAKE) $(call build-as,Makefile.library,$(PIE_DIR)) clean + $(Q) $(MAKE) $(call 
build-as,Makefile.crtools,criu) clean + $(Q) $(MAKE) $(build)=$(PIE_DIR) clean +.PHONY: subclean +cleanup-y += $(obj)/criu +clean: subclean + +# +# Delete all generated files +subproper: + $(Q) $(MAKE) $(build)=$(ARCH_DIR) mrproper + $(Q) $(MAKE) $(call build-as,Makefile.library,$(PIE_DIR)) mrproper + $(Q) $(MAKE) $(call build-as,Makefile.crtools,criu) mrproper + $(Q) $(MAKE) $(build)=$(PIE_DIR) mrproper +.PHONY: subproper +mrproper: subproper + +UAPI_HEADERS := criu/include/criu-plugin.h +UAPI_HEADERS += criu/include/criu-log.h + +install: $(obj)/criu + $(E) " INSTALL " $(obj)/criu + $(Q) mkdir -p $(DESTDIR)$(SBINDIR) + $(Q) install -m 755 $(obj)/criu $(DESTDIR)$(SBINDIR) + $(Q) mkdir -p $(DESTDIR)$(INCLUDEDIR)/criu/ + $(Q) install -m 644 $(UAPI_HEADERS) $(DESTDIR)$(INCLUDEDIR)/criu/ + $(Q) mkdir -p $(DESTDIR)$(LIBEXECDIR)/criu/scripts + $(Q) install -m 755 scripts/systemd-autofs-restart.sh $(DESTDIR)$(LIBEXECDIR)/criu/scripts +.PHONY: install + +uninstall: + $(E) " UNINSTALL" criu + $(Q) $(RM) $(addprefix $(DESTDIR)$(SBINDIR)/,criu) + $(Q) $(RM) $(addprefix $(DESTDIR)$(INCLUDEDIR)/criu/,$(notdir $(UAPI_HEADERS))) + $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBEXECDIR)/criu/scripts/,systemd-autofs-restart.sh) +.PHONY: uninstall + +all-y += check-packages $(obj)/criu diff --git a/CRIU_code/criu/Makefile.crtools b/CRIU_code/criu/Makefile.crtools new file mode 100644 index 0000000..d19ff81 --- /dev/null +++ b/CRIU_code/criu/Makefile.crtools @@ -0,0 +1,105 @@ +CFLAGS_REMOVE_clone-noasan.o += $(CFLAGS-ASAN) +CFLAGS_kerndat.o += -DKDAT_MAGIC_2=${shell echo $${SOURCE_DATE_EPOCH:-$$(date +%s)}} -DKDAT_RUNDIR=\"$(RUNDIR)\" +ldflags-y += -r + +obj-y += action-scripts.o +obj-y += external.o +obj-y += aio.o +obj-y += bfd.o +obj-y += bitmap.o +obj-y += cgroup.o +obj-y += cgroup-props.o +obj-y += clone-noasan.o +obj-y += cr-check.o +obj-y += cr-dedup.o +obj-y += cr-dump.o +obj-y += cr-errno.o +obj-y += cr-restore.o +obj-y += cr-service.o +obj-y += crtools.o +obj-y += eventfd.o +obj-y 
+= eventpoll.o +obj-y += fault-injection.o +obj-y += fifo.o +obj-y += file-ids.o +obj-y += file-lock.o +obj-y += files-ext.o +obj-y += files.o +obj-y += files-reg.o +obj-y += fsnotify.o +obj-y += image-desc.o +obj-y += image.o +obj-y += img-remote.o +obj-y += img-proxy.o +obj-y += img-cache.o +obj-y += ipc_ns.o +obj-y += irmap.o +obj-y += kcmp-ids.o +obj-y += kerndat.o +obj-y += libnetlink.o +obj-y += log.o +obj-y += lsm.o +obj-y += mem.o +obj-y += mount.o +obj-y += filesystems.o +obj-y += namespaces.o +obj-y += netfilter.o +obj-y += net.o +obj-y += pagemap-cache.o +obj-y += page-pipe.o +obj-y += pagemap.o +obj-y += page-xfer.o +obj-y += parasite-syscall.o +obj-y += pie-util.o +obj-y += pipes.o +obj-y += plugin.o +obj-y += proc_parse.o +obj-y += protobuf-desc.o +obj-y += protobuf.o +obj-y += pstree.o +obj-y += rbtree.o +obj-y += rst-malloc.o +obj-y += seccomp.o +obj-y += seize.o +obj-y += shmem.o +obj-y += sigframe.o +obj-y += signalfd.o +obj-y += sk-inet.o +obj-y += sk-netlink.o +obj-y += sk-packet.o +obj-y += sk-queue.o +obj-y += sk-tcp.o +obj-y += sk-unix.o +obj-y += sockets.o +obj-y += stats.o +obj-y += string.o +obj-y += sysctl.o +obj-y += sysfs_parse.o +obj-y += timerfd.o +obj-$(CONFIG_GNUTLS) += tls.o +obj-y += tty.o +obj-y += tun.o +obj-y += util.o +obj-y += uts_ns.o +obj-y += path.o +obj-y += autofs.o +obj-y += fdstore.o +obj-y += uffd.o +obj-y += config.o +obj-y += servicefd.o +obj-y += pie-util-vdso.o +obj-y += vdso.o +obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o +CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 +obj-$(CONFIG_COMPAT) += vdso-compat.o +CFLAGS_REMOVE_vdso-compat.o += $(CFLAGS-ASAN) $(CFLAGS-GCOV) + +PROTOBUF_GEN := scripts/protobuf-gen.sh + +$(obj)/protobuf-desc.d: $(obj)/protobuf-desc-gen.h + +$(obj)/protobuf-desc-gen.h: $(PROTOBUF_GEN) criu/include/protobuf-desc.h + $(call msg-gen, $@) + $(Q) $(SH) $(PROTOBUF_GEN) > $@ + +mrproper-y += $(obj)/protobuf-desc-gen.h diff --git a/CRIU_code/criu/Makefile.packages 
b/CRIU_code/criu/Makefile.packages new file mode 100644 index 0000000..b01b4b0 --- /dev/null +++ b/CRIU_code/criu/Makefile.packages @@ -0,0 +1,53 @@ +REQ-RPM-PKG-NAMES += protobuf +REQ-RPM-PKG-NAMES += protobuf-c +REQ-RPM-PKG-NAMES += protobuf-c-devel +REQ-RPM-PKG-NAMES += protobuf-compiler +REQ-RPM-PKG-NAMES += protobuf-devel +REQ-RPM-PKG-NAMES += protobuf-python +REQ-RPM-PKG-NAMES += libnl3-devel +REQ-RPM-PKG-NAMES += libcap-devel +REQ-RPM-PKG-NAMES += $(PYTHON)-future + +REQ-RPM-PKG-TEST-NAMES += libaio-devel + +REQ-DEB-PKG-NAMES += libprotobuf-dev +REQ-DEB-PKG-NAMES += libprotobuf-c0-dev +REQ-DEB-PKG-NAMES += protobuf-c-compiler +REQ-DEB-PKG-NAMES += protobuf-compiler +REQ-DEB-PKG-NAMES += python-protobuf +REQ-DEB-PKG-NAMES += libnl-3-dev +REQ-DEB-PKG-NAMES += libcap-dev + +REQ-DEB-PKG-TEST-NAMES += python-yaml +REQ-DEB-PKG-TEST-NAMES += libaio-dev + +ifeq ($(PYTHON),python3) +REQ-DEB-PKG-NAMES += $(PYTHON)-future +REQ-DEB-PKG-TEST-NAMES += libaio-dev + +REQ-RPM-PKG-TEST-NAMES += $(PYTHON)-PyYAML +else +REQ-DEB-PKG-NAMES += python-future + +REQ-RPM-PKG-TEST-NAMES += $(PYTHON)-pyyaml +endif + +export LIBS += -lprotobuf-c -ldl -lnl-3 -lsoccr -Lsoccr/ -lnet + +check-packages-failed: + $(warning Can not find some of the required libraries) + $(warning Make sure the following packages are installed) + $(warning RPM based distros: $(REQ-RPM-PKG-NAMES)) + $(warning DEB based distros: $(REQ-DEB-PKG-NAMES)) + $(warning To run tests the following packages are needed) + $(warning RPM based distros: $(REQ-RPM-PKG-TEST-NAMES)) + $(warning DEB based distros: $(REQ-DEB-PKG-TEST-NAMES)) + $(error Compilation aborted) + +# +# Make sure all required libs are installed +PROGRAM_STUB := int main(int argc, char **argv) { return 0; } +check-packages: + $(Q) $(call try-cc,$(PROGRAM_STUB),$(LIBS)) \ + || $(MAKE) -f $(obj)/Makefile.packages check-packages-failed +.PHONY: check-packages-failed check-packages diff --git a/CRIU_code/criu/action-scripts.c b/CRIU_code/criu/action-scripts.c 
new file mode 100644 index 0000000..2f7617c --- /dev/null +++ b/CRIU_code/criu/action-scripts.c @@ -0,0 +1,171 @@ +#include +#include +#include +#include + +#include "cr_options.h" +#include "common/list.h" +#include "xmalloc.h" +#include "log.h" +#include "servicefd.h" +#include "cr-service.h" +#include "action-scripts.h" +#include "pstree.h" +#include "common/bug.h" +#include "util.h" +#include +#include +#include "common/scm.h" + +static const char *action_names[ACT_MAX] = { + [ ACT_PRE_DUMP ] = "pre-dump", + [ ACT_POST_DUMP ] = "post-dump", + [ ACT_PRE_RESTORE ] = "pre-restore", + [ ACT_POST_RESTORE ] = "post-restore", + [ ACT_NET_LOCK ] = "network-lock", + [ ACT_NET_UNLOCK ] = "network-unlock", + [ ACT_SETUP_NS ] = "setup-namespaces", + [ ACT_POST_SETUP_NS ] = "post-setup-namespaces", + [ ACT_PRE_RESUME ] = "pre-resume", + [ ACT_POST_RESUME ] = "post-resume", + [ ACT_ORPHAN_PTS_MASTER ] = "orphan-pts-master", +}; + +struct script { + struct list_head node; + char *path; +}; + +enum { + SCRIPTS_NONE, + SCRIPTS_SHELL, + SCRIPTS_RPC +}; + +static int scripts_mode = SCRIPTS_NONE; +static LIST_HEAD(scripts); + +static int run_shell_scripts(const char *action) +{ + int retval = 0; + struct script *script; + static unsigned env_set = 0; + +#define ENV_IMGDIR 0x1 +#define ENV_ROOTPID 0x2 + + if (setenv("CRTOOLS_SCRIPT_ACTION", action, 1)) { + pr_perror("Can't set CRTOOLS_SCRIPT_ACTION=%s", action); + return -1; + } + + if (!(env_set & ENV_IMGDIR)) { + char image_dir[PATH_MAX]; + sprintf(image_dir, "/proc/%ld/fd/%d", (long) getpid(), get_service_fd(IMG_FD_OFF)); + if (setenv("CRTOOLS_IMAGE_DIR", image_dir, 1)) { + pr_perror("Can't set CRTOOLS_IMAGE_DIR=%s", image_dir); + return -1; + } + env_set |= ENV_IMGDIR; + } + + if (!(env_set & ENV_ROOTPID) && root_item) { + int pid; + + pid = root_item->pid->real; + if (pid != -1) { + char root_item_pid[16]; + snprintf(root_item_pid, sizeof(root_item_pid), "%d", pid); + if (setenv("CRTOOLS_INIT_PID", root_item_pid, 1)) { + 
pr_perror("Can't set CRTOOLS_INIT_PID=%s", root_item_pid); + return -1; + } + env_set |= ENV_ROOTPID; + } + } + + list_for_each_entry(script, &scripts, node) { + int err; + pr_debug("\t[%s]\n", script->path); + err = cr_system(-1, -1, -1, script->path, + (char *[]) { script->path, NULL }, 0); + if (err) + pr_err("Script %s exited with %d\n", script->path, err); + retval |= err; + } + + unsetenv("CRTOOLS_SCRIPT_ACTION"); + + return retval; +} + +int rpc_send_fd(enum script_actions act, int fd) +{ + const char *action = action_names[act]; + int rpc_sk; + + if (scripts_mode != SCRIPTS_RPC) + return -1; + + rpc_sk = get_service_fd(RPC_SK_OFF); + if (rpc_sk < 0) + return -1; + + pr_debug("\tRPC\n"); + return send_criu_rpc_script(act, (char *)action, rpc_sk, fd); +} + +int run_scripts(enum script_actions act) +{ + int ret = 0; + const char *action = action_names[act]; + + pr_debug("Running %s scripts\n", action); + + if (scripts_mode == SCRIPTS_NONE) + return 0; + + if (scripts_mode == SCRIPTS_RPC) { + ret = rpc_send_fd(act, -1); + goto out; + } + + if (scripts_mode == SCRIPTS_SHELL) { + ret = run_shell_scripts(action); + goto out; + } + + BUG(); +out: + if (ret) + pr_err("One of more action scripts failed\n"); + return ret; +} + +int add_script(char *path) +{ + struct script *script; + + BUG_ON(scripts_mode == SCRIPTS_RPC); + scripts_mode = SCRIPTS_SHELL; + + script = xmalloc(sizeof(struct script)); + if (script == NULL) + return 1; + + script->path = path; + list_add(&script->node, &scripts); + + return 0; +} + +int add_rpc_notify(int sk) +{ + BUG_ON(scripts_mode == SCRIPTS_SHELL); + scripts_mode = SCRIPTS_RPC; + + if (install_service_fd(RPC_SK_OFF, dup(sk)) < 0) + return -1; + + return 0; +} diff --git a/CRIU_code/criu/aio.c b/CRIU_code/criu/aio.c new file mode 100644 index 0000000..45651f2 --- /dev/null +++ b/CRIU_code/criu/aio.c @@ -0,0 +1,152 @@ +#include +#include +#include +#include "vma.h" +#include "xmalloc.h" +#include "pstree.h" +#include "restorer.h" 
+#include "aio.h" +#include "rst_info.h" +#include "rst-malloc.h" +#include "parasite.h" +#include "parasite-syscall.h" +#include "images/mm.pb-c.h" +#include + +#define NR_IOEVENTS_IN_NPAGES(npages) ((PAGE_SIZE * (npages) - sizeof(struct aio_ring)) / sizeof(struct io_event)) + +int dump_aio_ring(MmEntry *mme, struct vma_area *vma) +{ + int nr = mme->n_aios; + AioRingEntry *re; + + mme->aios = xrealloc(mme->aios, (nr + 1) * sizeof(re)); + if (!mme->aios) + return -1; + + re = xmalloc(sizeof(*re)); + if (!re) + return -1; + + aio_ring_entry__init(re); + re->id = vma->e->start; + re->ring_len = vma->e->end - vma->e->start; + re->nr_req = aio_estimate_nr_reqs(re->ring_len); + if (!re->nr_req) { + xfree(re); + return -1; + } + mme->aios[nr] = re; + mme->n_aios = nr + 1; + pr_info("Dumping AIO ring @%"PRIx64"-%"PRIx64"\n", + vma->e->start, vma->e->end); + return 0; +} + +void free_aios(MmEntry *mme) +{ + int i; + + if (mme->aios) { + for (i = 0; i < mme->n_aios; i++) + xfree(mme->aios[i]); + xfree(mme->aios); + } +} + +unsigned int aio_estimate_nr_reqs(unsigned int size) +{ + unsigned int k_max_reqs = NR_IOEVENTS_IN_NPAGES(size/PAGE_SIZE); + + if (size & ~PAGE_MASK) { + pr_err("Ring size is not aligned\n"); + return 0; + } + /* + * Kernel does + * + * nr_reqs = max(nr_reqs, nr_cpus * 4) + * nr_reqs *= 2 + * nr_reqs += 2 + * ring = roundup(sizeof(head) + nr_reqs * sizeof(req)) + * nr_reqs = (ring - sizeof(head)) / sizeof(req) + * + * And the k_max_reqs here is the resulting value. + * + * We need to get the initial nr_reqs that would grow + * up back to the k_max_reqs. 
+ */ + + return (k_max_reqs - 2) / 2; +} + +unsigned long aio_rings_args_size(struct vm_area_list *vmas) +{ + return sizeof(struct parasite_check_aios_args) + + vmas->nr_aios * sizeof(struct parasite_aio); +} + +int parasite_collect_aios(struct parasite_ctl *ctl, struct vm_area_list *vmas) +{ + struct vma_area *vma; + struct parasite_check_aios_args *aa; + struct parasite_aio *pa; + + if (!vmas->nr_aios) + return 0; + + pr_info("Checking AIO rings\n"); + + /* + * Go to parasite and + * a) check that no requests are currently pengind + * b) get the maximum number of requests kernel handles + * to estimate what was the user request on ring + * creation. + */ + + aa = compel_parasite_args_s(ctl, aio_rings_args_size(vmas)); + pa = &aa->ring[0]; + list_for_each_entry(vma, &vmas->h, list) { + if (!vma_area_is(vma, VMA_AREA_AIORING)) + continue; + + pr_debug(" `- Ring #%ld @%"PRIx64"\n", + (long)(pa - &aa->ring[0]), vma->e->start); + pa->ctx = vma->e->start; + pa->size = vma->e->end - vma->e->start; + pa++; + } + aa->nr_rings = vmas->nr_aios; + + if (compel_rpc_call_sync(PARASITE_CMD_CHECK_AIOS, ctl)) + return -1; + + return 0; +} + +int prepare_aios(struct pstree_item *t, struct task_restore_args *ta) +{ + int i; + MmEntry *mm = rsti(t)->mm; + /* + * Put info about AIO rings, they will get remapped + */ + + ta->rings = (struct rst_aio_ring *)rst_mem_align_cpos(RM_PRIVATE); + ta->rings_n = mm->n_aios; + + for (i = 0; i < mm->n_aios; i++) { + struct rst_aio_ring *raio; + + raio = rst_mem_alloc(sizeof(*raio), RM_PRIVATE); + if (!raio) + return -1; + + raio->addr = mm->aios[i]->id; + raio->nr_req = mm->aios[i]->nr_req; + raio->len = mm->aios[i]->ring_len; + } + + return 0; +} diff --git a/CRIU_code/criu/arch/aarch64/Makefile b/CRIU_code/criu/arch/aarch64/Makefile new file mode 100644 index 0000000..b264873 --- /dev/null +++ b/CRIU_code/criu/arch/aarch64/Makefile @@ -0,0 +1,8 @@ +builtin-name := crtools.built-in.o + +ldflags-y += -r + +obj-y += cpu.o +obj-y += crtools.o 
/*
 * test_and_set_bit: set one bit in a bitmap and return its previous value.
 *
 * On entry w0 holds the bit number and x1 the bitmap base address
 * (presumably the first two C arguments -- confirm against the prototype
 * in bitops.h).  On return x0 is 0 or 1, the value the bit had before.
 *
 * The update is a load-exclusive / store-exclusive retry loop: the store
 * is repeated until stlxr reports success in w5.
 */
ENTRY(test_and_set_bit)
	and	w3, w0, #63		/* w3 = bit offset inside its 64-bit word */
	eor	w0, w0, w3		/* w0 = bit number with the low 6 bits cleared */
	mov	x2, #1
	add	x1, x1, x0, lsr #3	/* x1 = address of the 64-bit word holding the bit */
	lsl	x4, x2, x3		/* x4 = mask with only the target bit set */
1:	ldaxr	x2, [x1]		/* exclusive load (acquire) of the current word */
	lsr	x0, x2, x3		/* move the old bit value down to bit 0 */
	orr	x2, x2, x4		/* set the target bit */
	stlxr	w5, x2, [x1]		/* exclusive store (release); w5 != 0 on failure */
	cbnz	w5, 1b			/* lost exclusivity: retry */
	and	x0, x0, #1		/* return only the old bit value */
3:	ret				/* label 3 is not referenced inside this routine */
END(test_and_set_bit)
return 0; +} + +int cpu_validate_cpuinfo(void) +{ + compel_cpuinfo_t *cpu_info = NULL; + CpuinfoAarch64Entry *img_aarch64_entry; + CpuinfoEntry *img_cpu_info; + struct cr_img *img; + int ret = -1; + img = open_image(CR_FD_CPUINFO, O_RSTR); + if (!img) + return -1; + if (pb_read_one(img, &img_cpu_info, PB_CPUINFO) < 0) + goto err; + if (img_cpu_info->n_aarch64_entry != 1) + { + pr_err("No aarch64 related cpuinfo in image, " + "corruption (n_aarch64_entry = %zi)\n", + img_cpu_info->n_aarch64_entry); + goto err; + } + img_aarch64_entry = img_cpu_info->aarch64_entry[0]; + cpu_info = img_to_cpuinfo(img_aarch64_entry); + +err: + xfree(cpu_info); + close_image(img); + return 0; +} + +static compel_cpuinfo_t *img_to_cpuinfo(CpuinfoAarch64Entry *img_aarch64_entry) +{ + compel_cpuinfo_t *cpu_info; + + cpu_info = xzalloc(sizeof(*cpu_info)); + if (!cpu_info) + return NULL; + + /* + * Copy caps from image and fill the left ones from + * run-time information for easier compatibility testing. + */ + cpu_info->reg_midr = img_aarch64_entry->reg_midr; + cpu_info->reg_ctr = img_aarch64_entry->reg_ctr; + cpu_info->reg_cntfrq = img_aarch64_entry->reg_cntfrq; + cpu_info->reg_dczid = img_aarch64_entry->reg_dczid; + cpu_info->reg_revidr = img_aarch64_entry->reg_revidr; + + return cpu_info; +} + +int cpuinfo_dump(void) +{ + if (cpu_init()) + return -1; + if (cpu_dump_cpuinfo()) + return -1; + return 0; +} + +int cpuinfo_check(void) +{ + if (cpu_init()) + return 1; + if (cpu_validate_cpuinfo()) + return 1; + return 0; +} diff --git a/CRIU_code/criu/arch/aarch64/crtools.c b/CRIU_code/criu/arch/aarch64/crtools.c new file mode 100644 index 0000000..f98743a --- /dev/null +++ b/CRIU_code/criu/arch/aarch64/crtools.c @@ -0,0 +1,139 @@ +#include +#include + +#include + +#include "types.h" +#include + +#include +#include "asm/restorer.h" +#include "common/compiler.h" +#include +#include "asm/dump.h" +#include "protobuf.h" +#include "images/core.pb-c.h" +#include "images/creds.pb-c.h" +#include 
"parasite-syscall.h" +#include "log.h" +#include "util.h" +#include "cpu.h" +#include "restorer.h" +#include + +#define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))(src)->e + +int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd) +{ + int i; + CoreEntry *core = x; + + // Save the Aarch64 CPU state + for (i = 0; i < 31; ++i) + assign_reg(core->ti_aarch64->gpregs, regs, regs[i]); + assign_reg(core->ti_aarch64->gpregs, regs, sp); + assign_reg(core->ti_aarch64->gpregs, regs, pc); + assign_reg(core->ti_aarch64->gpregs, regs, pstate); + + + // Save the FP/SIMD state + for (i = 0; i < 32; ++i) + { + core->ti_aarch64->fpsimd->vregs[2*i] = fpsimd->vregs[i]; + core->ti_aarch64->fpsimd->vregs[2*i + 1] = fpsimd->vregs[i] >> 64; + } + assign_reg(core->ti_aarch64->fpsimd, fpsimd, fpsr); + assign_reg(core->ti_aarch64->fpsimd, fpsimd, fpcr); + + return 0; +} + +int arch_alloc_thread_info(CoreEntry *core) +{ + ThreadInfoAarch64 *ti_aarch64; + UserAarch64RegsEntry *gpregs; + UserAarch64FpsimdContextEntry *fpsimd; + + ti_aarch64 = xmalloc(sizeof(*ti_aarch64)); + if (!ti_aarch64) + goto err; + thread_info_aarch64__init(ti_aarch64); + core->ti_aarch64 = ti_aarch64; + + gpregs = xmalloc(sizeof(*gpregs)); + if (!gpregs) + goto err; + user_aarch64_regs_entry__init(gpregs); + + gpregs->regs = xmalloc(31*sizeof(uint64_t)); + if (!gpregs->regs) + goto err; + gpregs->n_regs = 31; + + ti_aarch64->gpregs = gpregs; + + fpsimd = xmalloc(sizeof(*fpsimd)); + if (!fpsimd) + goto err; + user_aarch64_fpsimd_context_entry__init(fpsimd); + ti_aarch64->fpsimd = fpsimd; + fpsimd->vregs = xmalloc(64*sizeof(fpsimd->vregs[0])); + fpsimd->n_vregs = 64; + if (!fpsimd->vregs) + goto err; + + return 0; +err: + return -1; +} + +void arch_free_thread_info(CoreEntry *core) +{ + if (CORE_THREAD_ARCH_INFO(core)) { + if (CORE_THREAD_ARCH_INFO(core)->fpsimd) { + xfree(CORE_THREAD_ARCH_INFO(core)->fpsimd->vregs); + xfree(CORE_THREAD_ARCH_INFO(core)->fpsimd); + } + 
xfree(CORE_THREAD_ARCH_INFO(core)->gpregs->regs); + xfree(CORE_THREAD_ARCH_INFO(core)->gpregs); + xfree(CORE_THREAD_ARCH_INFO(core)); + CORE_THREAD_ARCH_INFO(core) = NULL; + } +} + +int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) +{ + int i; + struct fpsimd_context *fpsimd = RT_SIGFRAME_FPU(sigframe); + + if (core->ti_aarch64->fpsimd->n_vregs != 64) + return 1; + + for (i = 0; i < 32; ++i) + fpsimd->vregs[i] = (__uint128_t)core->ti_aarch64->fpsimd->vregs[2*i] | + ((__uint128_t)core->ti_aarch64->fpsimd->vregs[2*i + 1] << 64); + assign_reg(fpsimd, core->ti_aarch64->fpsimd, fpsr); + assign_reg(fpsimd, core->ti_aarch64->fpsimd, fpcr); + + fpsimd->head.magic = FPSIMD_MAGIC; + fpsimd->head.size = sizeof(*fpsimd); + + return 0; +} + +int restore_gpregs(struct rt_sigframe *f, UserRegsEntry *r) +{ +#define CPREG1(d) f->uc.uc_mcontext.d = r->d + + int i; + + for (i = 0; i < 31; ++i) + CPREG1(regs[i]); + CPREG1(sp); + CPREG1(pc); + CPREG1(pstate); + +#undef CPREG1 + + return 0; +} diff --git a/CRIU_code/criu/arch/aarch64/include/asm/dump.h b/CRIU_code/criu/arch/aarch64/include/asm/dump.h new file mode 100644 index 0000000..bc3dbcf --- /dev/null +++ b/CRIU_code/criu/arch/aarch64/include/asm/dump.h @@ -0,0 +1,16 @@ +#ifndef __CR_ASM_DUMP_H__ +#define __CR_ASM_DUMP_H__ + +extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int arch_alloc_thread_info(CoreEntry *core); +extern void arch_free_thread_info(CoreEntry *core); + + +static inline void core_put_tls(CoreEntry *core, tls_t tls) +{ + core->ti_aarch64->tls = tls; +} + +#define get_task_futex_robust_list_compat(pid, info) -1 + +#endif diff --git a/CRIU_code/criu/arch/aarch64/include/asm/int.h b/CRIU_code/criu/arch/aarch64/include/asm/int.h new file mode 100644 index 0000000..642804e --- /dev/null +++ b/CRIU_code/criu/arch/aarch64/include/asm/int.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_INT_H__ +#define __CR_ASM_INT_H__ + +#include "asm-generic/int.h" + +#endif /* __CR_ASM_INT_H__ 
/*
 * Store the current value of the tpidr_el0 system register in *ptls.
 *
 * NOTE(review): tpidr_el0 is presumably the user-space thread pointer on
 * Linux/aarch64 -- confirm against the architecture manual.
 */
static inline void arch_get_tls(tls_t *ptls)
{
	tls_t tls;
	/* mrs copies a system register into a general-purpose register. */
	asm("mrs %0, tpidr_el0" : "=r" (tls));
	*ptls = tls;
}
/*
 * RUN_CLONE_RESTORE_FN - issue the clone syscall directly and start the
 * new thread in clone_restore_fn.
 *
 * Sequence, as written in the asm below:
 *   - x1 = new_sp rounded down to 16 bytes, minus 16; clone_restore_fn and
 *     &thread_args[i] are stored in those 16 bytes.
 *   - x0 = clone_flags, x2 = &parent_tid, x3 = &thread_args[i].pid,
 *     x8 = __NR_clone, then "svc #0".
 *   - When x0 is non-zero after the syscall, it is copied to ret and
 *     control falls out at clone_end.
 *   - When x0 is zero, the two stored values are popped from the new stack
 *     (x1 = function, x0 = its argument) and control branches to x1.
 *
 * Caveats visible in the macro text:
 *   - It reads thread_args[i], but "i" is not a macro parameter: the
 *     expansion site must have a variable named i in scope.
 *   - The labels clone_emul, thread_run and clone_end are plain (non-local)
 *     asm labels, so the macro can be expanded only once per object file.
 *
 * NOTE(review): x3 carries &thread_args[i].pid and x4 is never set --
 * confirm this matches the arm64 clone argument order (tls vs. child_tid)
 * for the flags actually passed.
 */
#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \
			     thread_args, clone_restore_fn) \
	asm volatile( \
			"clone_emul: \n" \
			"ldr x1, %2 \n" \
			"and x1, x1, #~15 \n" \
			"sub x1, x1, #16 \n" \
			"stp %5, %6, [x1] \n" \
			"mov x0, %1 \n" \
			"mov x2, %3 \n" \
			"mov x3, %4 \n" \
			"mov x8, #"__stringify(__NR_clone)" \n" \
			"svc #0 \n" \
			\
			"cbz x0, thread_run \n" \
			\
			"mov %0, x0 \n" \
			"b clone_end \n" \
			\
			"thread_run: \n" \
			"ldp x1, x0, [sp] \n" \
			"br x1 \n" \
			\
			"clone_end: \n" \
			: "=r"(ret) \
			: "r"(clone_flags), \
			  "m"(new_sp), \
			  "r"(&parent_tid), \
			  "r"(&thread_args[i].pid), \
			  "r"(clone_restore_fn), \
			  "r"(&thread_args[i]) \
			: "x0", "x1", "x2", "x3", "x8", "memory")
/* Turn a 64-bit integer back into the pointer it was produced from. */
static inline void *decode_pointer(uint64_t v)
{
	void *ptr = (void*)v;

	return ptr;
}

/* Represent a pointer as a 64-bit integer. */
static inline uint64_t encode_pointer(void *p)
{
	uint64_t raw = (uint64_t)p;

	return raw;
}
.global write_intraprocedure_branch

/*
 * write_intraprocedure_branch: write a 16-byte trampoline at "from" that
 * jumps to "to".
 *
 * to is x0, from is x1
 *
 * Layout written at "from":
 *   +0  the two instructions found at loadbranch (load x16, branch to x16)
 *   +8  the 64-bit destination address
 * The copied "ldr x16" is expected to pick up the address stored at +8,
 * which relies on the literal emitted for "=destination" sitting directly
 * after the two template instructions.
 *
 * NOTE(review): the cache maintenance below is issued for the single
 * address in x1; confirm that "from" never lets the 16 bytes straddle two
 * cache lines.
 */
write_intraprocedure_branch:
	/* load two 32-bit instructions */
	ldr x2, loadbranch
	/* store 64 bits of instructions and 64 bits of destination address */
	stp x2, x0, [x1]
	/* perform required cache maintenance and synchronization operations */
	dc cvau, x1
	dsb ish
	ic ivau, x1
	dsb ish
	isb
	ret

/* intraprocedure trampoline instructions */
loadbranch:
	ldr x16, =destination
	br x16
/* label to get relative position of literal pool */
destination:
+#include "asm/sigframe.h" + +int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, + struct rt_sigframe *rsigframe) +{ + return 0; +} diff --git a/CRIU_code/criu/arch/aarch64/vdso-pie.c b/CRIU_code/criu/arch/aarch64/vdso-pie.c new file mode 100644 index 0000000..53d83cb --- /dev/null +++ b/CRIU_code/criu/arch/aarch64/vdso-pie.c @@ -0,0 +1,34 @@ +#include + +#include "asm/types.h" + +#include +#include "parasite-vdso.h" +#include "log.h" +#include "common/bug.h" + +#ifdef LOG_PREFIX +# undef LOG_PREFIX +#endif +#define LOG_PREFIX "vdso: " + +int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, + struct vdso_symtable *to, struct vdso_symtable *from, + bool __always_unused compat_vdso) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(to->symbols); i++) { + if (vdso_symbol_empty(&from->symbols[i])) + continue; + + pr_debug("br: %lx/%lx -> %lx/%lx (index %d)\n", + base_from, from->symbols[i].offset, + base_to, to->symbols[i].offset, i); + + write_intraprocedure_branch(base_to + to->symbols[i].offset, + base_from + from->symbols[i].offset); + } + + return 0; +} diff --git a/CRIU_code/criu/arch/arm/Makefile b/CRIU_code/criu/arch/arm/Makefile new file mode 100644 index 0000000..b111e59 --- /dev/null +++ b/CRIU_code/criu/arch/arm/Makefile @@ -0,0 +1,8 @@ +builtin-name := crtools.built-in.o + +ldflags-y += -r -z noexecstack + +obj-y += cpu.o +obj-y += crtools.o +obj-y += sigframe.o +obj-y += bitops.o diff --git a/CRIU_code/criu/arch/arm/aeabi-helpers.S b/CRIU_code/criu/arch/arm/aeabi-helpers.S new file mode 100644 index 0000000..ea8561d --- /dev/null +++ b/CRIU_code/criu/arch/arm/aeabi-helpers.S @@ -0,0 +1,96 @@ +/* + * Code borrowed from gcc, arm/lib1funcs.S + * and adapted to CRIU macros. + */ + +#if defined(__thumb__) +/* + * We don't support compiling PIEs in Thumb mode, + * see top Makefile for details (ARM CFLAGS_PIE section). 
+*/ +#error Unsupported Thumb mode +#endif + +#include "common/asm/linkage.h" + +#define RET bx lr +#define RETc(x) bx##x lr +#define LSYM(x) .x + +.macro do_it cond, suffix="" +.endm + +.macro ARM_DIV2_ORDER divisor, order + clz \order, \divisor + rsb \order, \order, #31 +.endm + +.macro ARM_DIV_BODY dividend, divisor, result, curbit + clz \curbit, \dividend + clz \result, \divisor + sub \curbit, \result, \curbit + rsbs \curbit, \curbit, #31 + addne \curbit, \curbit, \curbit, lsl #1 + mov \result, #0 + addne pc, pc, \curbit, lsl #2 + nop + .set shift, 32 + .rept 32 + .set shift, shift - 1 + cmp \dividend, \divisor, lsl #shift + adc \result, \result, \result + subcs \dividend, \dividend, \divisor, lsl #shift + .endr +.endm + +/* + * XXX: as an optimization add udiv instruction based version. + * It's possible to check if CPU supports the instruction by + * reading Instruction Set Attribute Register (ID_ISAR0) + * and checking fields "Divide_instrs". + */ +ENTRY(__aeabi_uidiv) + /* Note: if called via udivsi3_skip_div0_test, this will unnecessarily + check for division-by-zero a second time. 
/*
 * test_and_set_bit: set one bit in a bitmap and return its previous value.
 *
 * On entry r0 holds the bit number and r1 the bitmap base address
 * (presumably the first two C arguments -- confirm against the prototype
 * in bitops.h).  On return r0 is 0 or 1, the value the bit had before.
 *
 * The update is an ldrex/strex retry loop with a "dmb ish" barrier before
 * and after it.
 */
ENTRY(test_and_set_bit)
	ands	ip, r1, #3
	strbne	r1, [ip]		@ assert word-aligned: store to address 1..3 (presumably faults)
	mov	r2, #1
	and	r3, r0, #31		@ Get bit offset
	mov	r0, r0, lsr #5
	add	r1, r1, r0, lsl #2	@ Get word offset
	mov	r3, r2, lsl r3		@ create mask
	dmb	ish
1:	ldrex	r2, [r1]
	ands	r0, r2, r3		@ save old value of bit
	orreq	r2, r2, r3		@ set the bit (only when it was clear)
	strex	ip, r2, [r1]		@ ip != 0 when the exclusive store failed
	cmp	ip, #0
	bne	1b			@ retry until the store succeeds
	dmb	ish
	cmp	r0, #0
	movne	r0, #1			@ normalise the old bit value to 0/1
2:	bx	lr			@ label 2 is not referenced inside this routine
END(test_and_set_bit)
+int cpuinfo_check(void)
+{
+	return -ENOTSUP;
+}
diff --git a/CRIU_code/criu/arch/arm/crtools.c b/CRIU_code/criu/arch/arm/crtools.c
new file mode 100644
index 0000000..c216cdc
--- /dev/null
+++ b/CRIU_code/criu/arch/arm/crtools.c
@@ -0,0 +1,160 @@
+#include
+#include
+
+#include "types.h"
+#include
+
+#include
+#include "asm/restorer.h"
+#include "common/compiler.h"
+#include "asm/dump.h"
+#include
+#include "protobuf.h"
+#include "images/core.pb-c.h"
+#include "images/creds.pb-c.h"
+#include "log.h"
+#include "util.h"
+#include "cpu.h"
+#include "elf.h"
+#include "parasite-syscall.h"
+#include "restorer.h"
+
+#include
+
+#define assign_reg(dst, src, e)		dst->e = (__typeof__(dst->e))((src)->ARM_##e)
+
+/*
+ * Copy the general-purpose registers from @regs and the VFP state from
+ * @fpregs into the ARM thread info of the CoreEntry passed as @x.
+ * The gpregs and fpstate entries must already be allocated. Returns 0.
+ */
+int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs)
+{
+	CoreEntry *core = x;
+
+	// Save the ARM CPU state
+
+	assign_reg(core->ti_arm->gpregs, regs, r0);
+	assign_reg(core->ti_arm->gpregs, regs, r1);
+	assign_reg(core->ti_arm->gpregs, regs, r2);
+	assign_reg(core->ti_arm->gpregs, regs, r3);
+	assign_reg(core->ti_arm->gpregs, regs, r4);
+	assign_reg(core->ti_arm->gpregs, regs, r5);
+	assign_reg(core->ti_arm->gpregs, regs, r6);
+	assign_reg(core->ti_arm->gpregs, regs, r7);
+	assign_reg(core->ti_arm->gpregs, regs, r8);
+	assign_reg(core->ti_arm->gpregs, regs, r9);
+	assign_reg(core->ti_arm->gpregs, regs, r10);
+	assign_reg(core->ti_arm->gpregs, regs, fp);
+	assign_reg(core->ti_arm->gpregs, regs, ip);
+	assign_reg(core->ti_arm->gpregs, regs, sp);
+	assign_reg(core->ti_arm->gpregs, regs, lr);
+	assign_reg(core->ti_arm->gpregs, regs, pc);
+	assign_reg(core->ti_arm->gpregs, regs, cpsr);
+	core->ti_arm->gpregs->orig_r0 = regs->ARM_ORIG_r0;
+
+
+	// Save the VFP state
+
+	memcpy(CORE_THREAD_ARCH_INFO(core)->fpstate->vfp_regs, &fpregs->fpregs, sizeof(fpregs->fpregs));
+	CORE_THREAD_ARCH_INFO(core)->fpstate->fpscr = fpregs->fpscr;
+
+	return 0;
+}
+
+/*
+ * Allocate and initialise the ARM thread info of @core together with its
+ * gpregs entry and its fpstate entry (room for 32 VFP registers).
+ *
+ * Returns 0 on success, -1 when an allocation fails. Nothing is freed on
+ * the failure path: whatever was already allocated stays attached to @core
+ * and is left for the caller to release.
+ */
+int arch_alloc_thread_info(CoreEntry *core)
+{
+	ThreadInfoArm *ti_arm;
+	UserArmRegsEntry *gpregs;
+	UserArmVfpstateEntry *fpstate;
+
+	ti_arm = xmalloc(sizeof(*ti_arm));
+	if (!ti_arm)
+		goto err;
+	thread_info_arm__init(ti_arm);
+	core->ti_arm = ti_arm;
+
+	gpregs = xmalloc(sizeof(*gpregs));
+	if (!gpregs)	/* must not reach user_arm_regs_entry__init() with NULL */
+		goto err;
+	user_arm_regs_entry__init(gpregs);
+	ti_arm->gpregs = gpregs;
+
+	fpstate = xmalloc(sizeof(*fpstate));
+	if (!fpstate)
+		goto err;
+	user_arm_vfpstate_entry__init(fpstate);
+	ti_arm->fpstate = fpstate;
+	fpstate->vfp_regs = xmalloc(32*sizeof(unsigned long long));
+	if (!fpstate->vfp_regs)
+		goto err;
+	fpstate->n_vfp_regs = 32;	/* only advertise the registers once they exist */
+
+	return 0;
+err:
+	return -1;
+}
+
+/* Release the thread info attached to @core, including gpregs and fpstate. */
+void arch_free_thread_info(CoreEntry *core)
+{
+	if (CORE_THREAD_ARCH_INFO(core)) {
+		if (CORE_THREAD_ARCH_INFO(core)->fpstate) {
+			xfree(CORE_THREAD_ARCH_INFO(core)->fpstate->vfp_regs);
+			xfree(CORE_THREAD_ARCH_INFO(core)->fpstate);
+		}
+		xfree(CORE_THREAD_ARCH_INFO(core)->gpregs);
+		xfree(CORE_THREAD_ARCH_INFO(core));
+		CORE_THREAD_ARCH_INFO(core) = NULL;
+	}
+}
+
+/* Fill the VFP part of @sigframe's uc_regspace from the fpstate saved in @core. */
+int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core)
+{
+	struct aux_sigframe *aux = (struct aux_sigframe *)&sigframe->sig.uc.uc_regspace;
+
+	memcpy(&aux->vfp.ufp.fpregs, CORE_THREAD_ARCH_INFO(core)->fpstate->vfp_regs, sizeof(aux->vfp.ufp.fpregs));
+	aux->vfp.ufp.fpscr = CORE_THREAD_ARCH_INFO(core)->fpstate->fpscr;
+	aux->vfp.magic = VFP_MAGIC;
+	aux->vfp.size = VFP_STORAGE_SIZE;
+	return 0;
+}
+
+/* Copy r0-r10, fp, ip, sp, lr, pc and cpsr from @r into @f's mcontext. */
+int restore_gpregs(struct rt_sigframe *f, UserArmRegsEntry *r)
+{
+#define CPREG1(d)	f->sig.uc.uc_mcontext.arm_##d = r->d
+#define CPREG2(d, s)	f->sig.uc.uc_mcontext.arm_##d = r->s
+
+	CPREG1(r0);
+	CPREG1(r1);
+	CPREG1(r2);
+	CPREG1(r3);
+	CPREG1(r4);
+	CPREG1(r5);
+	CPREG1(r6);
+	CPREG1(r7);
+	CPREG1(r8);
+	CPREG1(r9);
+	CPREG1(r10);
+	CPREG1(fp);
+	CPREG1(ip);
+	CPREG1(sp);
+	CPREG1(lr);
+	CPREG1(pc);
+	CPREG1(cpsr);
+
+#undef CPREG1
+#undef CPREG2
+
+	return 0;
+}
diff --git a/CRIU_code/criu/arch/arm/include/asm/dump.h b/CRIU_code/criu/arch/arm/include/asm/dump.h
new file mode 100644
index 0000000..2382ba4
--- /dev/null
+++
b/CRIU_code/criu/arch/arm/include/asm/dump.h
@@ -0,0 +1,16 @@
+#ifndef __CR_ASM_DUMP_H__
+#define __CR_ASM_DUMP_H__
+
+extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *);
+extern int arch_alloc_thread_info(CoreEntry *core);
+extern void arch_free_thread_info(CoreEntry *core);
+
+
+static inline void core_put_tls(CoreEntry *core, tls_t tls)
+{
+	core->ti_arm->tls = tls;	/* record the TLS value in the ARM thread info */
+}
+
+#define get_task_futex_robust_list_compat(pid, info) -1	/* always evaluates to -1 on ARM */
+
+#endif
diff --git a/CRIU_code/criu/arch/arm/include/asm/int.h b/CRIU_code/criu/arch/arm/include/asm/int.h
new file mode 100644
index 0000000..642804e
--- /dev/null
+++ b/CRIU_code/criu/arch/arm/include/asm/int.h
@@ -0,0 +1,6 @@
+#ifndef __CR_ASM_INT_H__
+#define __CR_ASM_INT_H__
+
+#include "asm-generic/int.h"
+
+#endif /* __CR_ASM_INT_H__ */
diff --git a/CRIU_code/criu/arch/arm/include/asm/kerndat.h b/CRIU_code/criu/arch/arm/include/asm/kerndat.h
new file mode 100644
index 0000000..60956b5
--- /dev/null
+++ b/CRIU_code/criu/arch/arm/include/asm/kerndat.h
@@ -0,0 +1,7 @@
+#ifndef __CR_ASM_KERNDAT_H__
+#define __CR_ASM_KERNDAT_H__
+
+#define kdat_compatible_cr()			0	/* constant 0 on ARM */
+#define kdat_can_map_vdso()			0	/* constant 0 on ARM */
+
+#endif /* __CR_ASM_KERNDAT_H__ */
diff --git a/CRIU_code/criu/arch/arm/include/asm/parasite-syscall.h b/CRIU_code/criu/arch/arm/include/asm/parasite-syscall.h
new file mode 100644
index 0000000..6008c37
--- /dev/null
+++ b/CRIU_code/criu/arch/arm/include/asm/parasite-syscall.h
@@ -0,0 +1,6 @@
+#ifndef __CR_ASM_PARASITE_SYSCALL_H__
+#define __CR_ASM_PARASITE_SYSCALL_H__
+
+struct parasite_ctl;	/* forward declaration only; nothing else is declared in this header */
+
+#endif
diff --git a/CRIU_code/criu/arch/arm/include/asm/parasite.h b/CRIU_code/criu/arch/arm/include/asm/parasite.h
new file mode 100644
index 0000000..0ed320b
--- /dev/null
+++ b/CRIU_code/criu/arch/arm/include/asm/parasite.h
@@ -0,0 +1,9 @@
+#ifndef __ASM_PARASITE_H__
+#define __ASM_PARASITE_H__
+
+static inline void arch_get_tls(tls_t *ptls)
+{
+	*ptls = ((tls_t (*)(void))0xffff0fe0)();	/* NOTE(review): calls code at a fixed address, presumably the ARM kuser get-TLS helper -- confirm */
+}
+
+#endif
diff --git
a/CRIU_code/criu/arch/arm/include/asm/restore.h b/CRIU_code/criu/arch/arm/include/asm/restore.h
new file mode 100644
index 0000000..4c64d58
--- /dev/null
+++ b/CRIU_code/criu/arch/arm/include/asm/restore.h
@@ -0,0 +1,29 @@
+#ifndef __CR_ASM_RESTORE_H__
+#define __CR_ASM_RESTORE_H__
+
+#include "asm/restorer.h"
+
+#include "images/core.pb-c.h"
+
+#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start,	\
+			      task_args)			\
+	asm volatile(						\
+		     "mov sp, %0 \n"	/* sp = new_sp */	\
+		     "mov r1, %1 \n"	/* r1 = restore_task_exec_start */	\
+		     "mov r0, %2 \n"	/* r0 = task_args */	\
+		     "bx r1 \n"		/* branch to the address in r1 */	\
+		     :						\
+		     : "r"(new_sp),				\
+		       "r"(restore_task_exec_start),		\
+		       "r"(task_args)				\
+		     : "sp", "r0", "r1", "memory")
+
+static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls)
+{
+	*ptls = pcore->ti_arm->tls;	/* read back the value stored by core_put_tls() */
+}
+
+
+int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core);
+
+#endif
diff --git a/CRIU_code/criu/arch/arm/include/asm/restorer.h b/CRIU_code/criu/arch/arm/include/asm/restorer.h
new file mode 100644
index 0000000..217d920
--- /dev/null
+++ b/CRIU_code/criu/arch/arm/include/asm/restorer.h
@@ -0,0 +1,89 @@
+#ifndef __CR_ASM_RESTORER_H__
+#define __CR_ASM_RESTORER_H__
+
+#include "asm/types.h"
+#include "images/core.pb-c.h"
+
+#include
+
+#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid,	\
+			     thread_args, clone_restore_fn)		\
+	asm volatile(							\
+		     "clone_emul: \n"					\
+		     "ldr r1, %2 \n"	/* r1 = new_sp */		\
+		     "sub r1, #16 \n"	/* reserve 16 bytes below it */	\
+		     "mov r0, %6 \n"					\
+		     "str r0, [r1, #4] \n"	/* [r1 + 4] = &thread_args[i] */	\
+		     "mov r0, %5 \n"					\
+		     "str r0, [r1] \n"	/* [r1] = clone_restore_fn */	\
+		     "mov r0, %1 \n"	/* r0 = clone_flags */		\
+		     "mov r2, %3 \n"	/* r2 = &parent_tid */		\
+		     "mov r3, %4 \n"	/* r3 = &thread_args[i].pid */	\
+		     "mov r7, #"__stringify(__NR_clone)" \n"		\
+		     "svc #0 \n"					\
+									\
+		     "cmp r0, #0 \n"	/* r0 == 0 takes the thread_run path */	\
+		     "beq thread_run \n"				\
+									\
+		     "mov %0, r0 \n"	/* otherwise hand r0 back through ret */	\
+		     "b clone_end \n"					\
+									\
+		     "thread_run: \n"					\
+		     "pop { r1 } \n"	/* r1 = clone_restore_fn stored above */	\
+		     "pop { r0 } \n"	/* r0 = &thread_args[i] stored above */	\
+		     "bx r1 \n"						\
+									\
+		     "clone_end: \n"					\
+		     : "=r"(ret)					\
+		     : "r"(clone_flags),				\
+		       "m"(new_sp),					\
+		       "r"(&parent_tid),				\
+		       "r"(&thread_args[i].pid),	/* uses 'i' from the expansion site */	\
+		       "r"(clone_restore_fn),				\
+		       "r"(&thread_args[i])				\
+		     : "r0", "r1", "r2", "r3", "r7", "memory")
+
+#define ARCH_FAIL_CORE_RESTORE					\
+	asm volatile(						\
+		     "mov sp, %0 \n"	/* sp = ret */		\
+		     "mov r0, #0 \n"				\
+		     "bx r0 \n"		/* branch to address 0 */	\
+		     :						\
+		     : "r"(ret)					\
+		     : "memory")
+
+
+#define arch_map_vdso(map, compat)		-1	/* always evaluates to -1 on ARM */
+
+int restore_gpregs(struct rt_sigframe *f, UserArmRegsEntry *r);
+int restore_nonsigframe_gpregs(UserArmRegsEntry *r);
+#define ARCH_HAS_SHMAT_HOOK
+unsigned long arch_shmat(int shmid, void *shmaddr,
+			int shmflg, unsigned long size);
+
+static inline void restore_tls(tls_t *ptls) {
+	asm (
+	     "mov r7, #15 \n"
+	     "lsl r7, #16 \n"
+	     "mov r0, #5 \n"
+	     "add r7, r0 \n"	/* r7 = (15 << 16) + 5 = 0xF0005; NOTE(review): presumably the ARM set_tls private syscall -- confirm */
+	     "ldr r0, [%0] \n"	/* r0 = *ptls */
+	     "svc #0 \n"
+	     :
+	     : "r"(ptls)
+	     : "r0", "r7"
+	);
+}
+
+static inline void *alloc_compat_syscall_stack(void) { return NULL; }
+static inline void free_compat_syscall_stack(void *stack32) { }
+static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act)
+{
+	return -1;	/* stub: always fails */
+}
+static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len)
+{
+	return -1;	/* stub: always fails */
+}
+
+#endif
diff --git a/CRIU_code/criu/arch/arm/include/asm/types.h b/CRIU_code/criu/arch/arm/include/asm/types.h
new file mode 100644
index 0000000..32612a6
--- /dev/null
+++ b/CRIU_code/criu/arch/arm/include/asm/types.h
@@ -0,0 +1,31 @@
+#ifndef __CR_ASM_TYPES_H__
+#define __CR_ASM_TYPES_H__
+
+#include
+#include
+#include "images/core.pb-c.h"
+
+#include "page.h"
+#include "bitops.h"
+#include "asm/int.h"
+
+#include
+
+#define core_is_compat(core)			false
+
+typedef UserArmRegsEntry UserRegsEntry;
+
+#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__ARM
+
+#define CORE_THREAD_ARCH_INFO(core) core->ti_arm
+
+#define TI_SP(core) ((core)->ti_arm->gpregs->sp)
+
+static inline void *decode_pointer(u64 v) { return (void*)(u32)v; }	/* keeps only the low 32 bits of v */
+static inline u64 encode_pointer(void *p) { return (u32)p; }
+
+#define AT_VECTOR_SIZE 40
+typedef uint32_t auxv_t;
+typedef uint32_t tls_t;
+
+#endif /* __CR_ASM_TYPES_H__ */
diff --git a/CRIU_code/criu/arch/arm/include/asm/vdso.h b/CRIU_code/criu/arch/arm/include/asm/vdso.h
new file mode 100644
index 0000000..cf9d500
--- /dev/null
+++ b/CRIU_code/criu/arch/arm/include/asm/vdso.h
@@ -0,0 +1,17 @@
+#ifndef __CR_ASM_VDSO_H__
+#define __CR_ASM_VDSO_H__
+
+#include "asm/int.h"
+#include "asm-generic/vdso.h"
+
+/* This definition is used in pie/util-vdso.c to initialize the vdso symbol
+ * name string table 'vdso_symbols'
+ *
+ * Poke from kernel file arch/arm/vdso/vdso.lds.S
+ */
+#define VDSO_SYMBOL_MAX		2
+#define ARCH_VDSO_SYMBOLS			\
+	"__vdso_clock_gettime",			\
+	"__vdso_gettimeofday"
+
+#endif /* __CR_ASM_VDSO_H__ */
diff --git a/CRIU_code/criu/arch/arm/pie-cacheflush.c b/CRIU_code/criu/arch/arm/pie-cacheflush.c
new file mode 100644
index 0000000..e6fd71f
--- /dev/null
+++ b/CRIU_code/criu/arch/arm/pie-cacheflush.c
@@ -0,0 +1,7 @@
+#include
+
+/* That's __builtin___clear_cache() to flush CPU cache */
+void __clear_cache(void *start, void *end)
+{
+	sys_cacheflush(start, end, 0);
+}
diff --git a/CRIU_code/criu/arch/arm/restorer.c b/CRIU_code/criu/arch/arm/restorer.c
new file mode 100644
index 0000000..588c1c0
--- /dev/null
+++ b/CRIU_code/criu/arch/arm/restorer.c
@@ -0,0 +1,73 @@
+#include
+
+#include "restorer.h"
+#include "asm/restorer.h"
+
+#include
+#include "log.h"
+#include
+#include "cpu.h"
+#include "page.h"
+#include "common/err.h"
+
+int restore_nonsigframe_gpregs(UserArmRegsEntry *r)
+{
+	return 0;
+}
+
+/*
+ * On ARMv6 CPUs with VIPT caches there are aliasing issues:
+ * if two different cache line indexes correspond to the same physical
+ * address, then changes made to one of the alias might be lost or they
+ * can overwrite each other. To overcome aliasing issues, page coloring
+ * with 4 pages align for shared mappings was introduced (SHMLBA) in kernel.
+ * Which resulted in unique physical address after any tag in cache
+ * (because two upper bits corresponding to page address get unused in tags).
+ *
+ * The problem here is in shmat() syscall:
+ * 1. if shmaddr is NULL then do_shmat() uses arch_get_unmapped_area()
+ *    to allocate shared mapping. Which checks if CPU cache is VIPT
+ *    and only then use SHMLBA alignment.
+ * 2. if shmaddr is specified then do_shmat() checks that address has
+ *    SHMLBA alignment regardless to CPU cache aliasing.
+ *
+ * All above means that on non-VIPT CPU (like any ARMv7) we can get
+ * non-SHMLBA, but page-aligned address with shmat(shmid, NULL, shmflg),
+ * but we can't restore it with shmat(shmid, shmaddr, shmflg).
+ * Which results that we can dump e.g., application with shmem aligned
+ * on 2 pages, but can't restore it on the same ARMv7 CPU.
+ *
+ * To workaround this kernel feature, use mremap() on shmem mapping,
+ * allocated with shmat(shmid, NULL, shmflg).
+ */
+#define SHMLBA (4UL * PAGE_SIZE)
+unsigned long arch_shmat(int shmid, void *shmaddr,
+			int shmflg, unsigned long size)
+{
+	unsigned long smap;
+
+	/* SHMLBA-aligned, direct call shmat() */
+	if (!((unsigned long)shmaddr & (SHMLBA - 1)))
+		return sys_shmat(shmid, shmaddr, shmflg);
+
+	smap = sys_shmat(shmid, NULL, shmflg);
+	if (IS_ERR_VALUE(smap)) {
+		pr_err("shmat() with NULL shmaddr failed: %d\n", (int)smap);
+		return smap;
+	}
+
+	/* We're lucky! */
+	if (smap == (unsigned long)shmaddr)
+		return smap;
+
+	/* Warn ALOUD */
+	pr_warn("Restoring shmem %p unaligned to SHMLBA.\n", shmaddr);
+	pr_warn("Make sure that you don't migrate shmem from non-VIPT cached CPU to VIPT cached (e.g., ARMv7 -> ARMv6)\n");
+	pr_warn("Otherwise YOU HAVE A CHANCE OF DATA CORRUPTIONS in writeable shmem\n");
+
+	smap = sys_mremap(smap, size, size,
+			MREMAP_FIXED | MREMAP_MAYMOVE, (unsigned long)shmaddr);
+	if (IS_ERR_VALUE(smap))
+		pr_err("mremap() for shmem failed: %d\n", (int)smap);
+	return smap;
+}
diff --git a/CRIU_code/criu/arch/arm/sigframe.c b/CRIU_code/criu/arch/arm/sigframe.c
new file mode 100644
index 0000000..be57c16
--- /dev/null
+++ b/CRIU_code/criu/arch/arm/sigframe.c
@@ -0,0 +1,9 @@
+#include "asm/types.h"
+#include
+#include "asm/sigframe.h"
+
+int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe,
+		struct rt_sigframe *rsigframe)
+{
+	return 0;
+}
diff --git a/CRIU_code/criu/arch/arm/vdso-pie.c b/CRIU_code/criu/arch/arm/vdso-pie.c
new file mode 100644
index 0000000..0ec8bd9
--- /dev/null
+++ b/CRIU_code/criu/arch/arm/vdso-pie.c
@@ -0,0 +1,58 @@
+#include
+
+#include "asm/types.h"
+
+#include
+#include
+#include "parasite-vdso.h"
+#include "log.h"
+#include "common/bug.h"
+
+#ifdef LOG_PREFIX
+# undef LOG_PREFIX
+#endif
+#define LOG_PREFIX "vdso: "
+
+static void insert_trampoline(uintptr_t from, uintptr_t to)
+{
+	struct {
+		uint32_t ldr_pc;
+		uint32_t imm32;
+		uint32_t guards;
+	} __packed jmp = {
+		.ldr_pc = 0xe51ff004,	/* ldr pc, [pc, #-4] */
+		.imm32 = to,
+		.guards = 0xe1200070,	/* bkpt 0x0000 */
+	};
+	void *iflush_start = (void *)from;
+	void *iflush_end = iflush_start + sizeof(jmp);
+
+	memcpy((void *)from, &jmp, sizeof(jmp));
+
+	__builtin___clear_cache(iflush_start, iflush_end);
+}
+
+int vdso_redirect_calls(unsigned long base_to, unsigned long base_from,
+			struct vdso_symtable *sto, struct vdso_symtable *sfrom,
+			bool compat_vdso)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(sto->symbols); i++) {
+		uintptr_t from, to;
+
+		if (vdso_symbol_empty(&sfrom->symbols[i]))
+			continue;
+
+		pr_debug("jmp: %lx/%lx -> %lx/%lx (index %d)\n",
+			 base_from, sfrom->symbols[i].offset,
+			 base_to, sto->symbols[i].offset, i);
+
+		from = base_from + sfrom->symbols[i].offset;
+		to = base_to + sto->symbols[i].offset;
+
+		insert_trampoline(from, to);
+	}
+
+	return 0;
+}
diff --git a/CRIU_code/criu/arch/ppc64/Makefile b/CRIU_code/criu/arch/ppc64/Makefile
new file mode 100644
index 0000000..f37337f
--- /dev/null
+++ b/CRIU_code/criu/arch/ppc64/Makefile
@@ -0,0 +1,7 @@
+builtin-name		:= crtools.built-in.o
+
+ldflags-y		+= -r
+
+obj-y			+= cpu.o
+obj-y			+= crtools.o
+obj-y			+= sigframe.o
diff --git a/CRIU_code/criu/arch/ppc64/cpu.c b/CRIU_code/criu/arch/ppc64/cpu.c
new file mode 100644
index 0000000..4fcfb06
--- /dev/null
+++ b/CRIU_code/criu/arch/ppc64/cpu.c
@@ -0,0 +1,149 @@
+#undef LOG_PREFIX
+#define LOG_PREFIX "cpu: "
+
+#include
+#include
+#include
+
+#include "asm/types.h"
+
+#include "cr_options.h"
+#include "image.h"
+#include "util.h"
+#include "log.h"
+#include "cpu.h"
+
+#include "protobuf.h"
+#include "images/cpuinfo.pb-c.h"
+
+static compel_cpuinfo_t rt_cpuinfo;
+
+/* Endianness of this build, as stored in CpuinfoPpc64Entry.endian. */
+#ifdef __LITTLE_ENDIAN__
+#define CURRENT_ENDIANNESS	CPUINFO_PPC64_ENTRY__ENDIANNESS__LITTLEENDIAN
+#else
+#define CURRENT_ENDIANNESS	CPUINFO_PPC64_ENTRY__ENDIANNESS__BIGENDIAN
+#endif
+
+int cpu_init(void)
+{
+	return compel_cpuid(&rt_cpuinfo);
+}
+
+/* Write the host endianness and the two hwcap words to the CPUINFO image. */
+int cpu_dump_cpuinfo(void)
+{
+	CpuinfoEntry cpu_info = CPUINFO_ENTRY__INIT;
+	CpuinfoPpc64Entry cpu_ppc64_info = CPUINFO_PPC64_ENTRY__INIT;
+	CpuinfoPpc64Entry *cpu_ppc64_info_ptr = &cpu_ppc64_info;
+	struct cr_img *img;
+	int ret = -1;
+
+	img = open_image(CR_FD_CPUINFO, O_DUMP);
+	if (!img)
+		return -1;
+
+	cpu_info.ppc64_entry = &cpu_ppc64_info_ptr;
+	cpu_info.n_ppc64_entry = 1;
+
+	cpu_ppc64_info.endian = CURRENT_ENDIANNESS;
+	cpu_ppc64_info.n_hwcap = 2;
+	cpu_ppc64_info.hwcap = rt_cpuinfo.hwcap;
+
+	ret = pb_write_one(img, &cpu_info, PB_CPUINFO);
+
+	close_image(img);
+	return ret;
+}
+
+/*
+ * Check the CPUINFO image against the running host: same endianness, the
+ * mandatory features present in the image, and no optional feature used by
+ * the image that the host lacks. Returns 0 when compatible, -1 otherwise.
+ */
+int cpu_validate_cpuinfo(void)
+{
+	CpuinfoEntry *cpu_info;
+	CpuinfoPpc64Entry *cpu_ppc64_entry;
+	struct cr_img *img;
+	int ret = -1;
+	img = open_image(CR_FD_CPUINFO, O_RSTR);
+	if (!img)
+		return -1;
+
+	if (pb_read_one(img, &cpu_info, PB_CPUINFO) < 0)
+		goto error;
+
+	if (cpu_info->n_ppc64_entry != 1) {
+		pr_err("No PPC64 related entry in image\n");
+		goto error;
+	}
+	cpu_ppc64_entry = cpu_info->ppc64_entry[0];
+
+	if (cpu_ppc64_entry->endian != CURRENT_ENDIANNESS) {
+		pr_err("Bad endianness\n");
+		goto error;
+	}
+
+	if (cpu_ppc64_entry->n_hwcap != 2) {
+		pr_err("Hardware capabilities information missing\n");
+		goto error;
+	}
+
+#define CHECK_FEATURE(s,f) do {						\
+		if ((cpu_ppc64_entry->hwcap[s] & f) &&			\
+		    !(rt_cpuinfo.hwcap[s] & f)) {			\
+			pr_err("CPU Feature %s required by image "	\
+			       "is not supported on host.\n", #f);	\
+			goto error;					\
+		}							\
+	} while(0)
+
+#define REQUIRE_FEATURE(s,f) do {					\
+		if (!(cpu_ppc64_entry->hwcap[s] & f)) {			\
+			pr_err("CPU Feature %s missing in image.\n", #f); \
+			goto error;					\
+		}							\
+	} while(0)
+
+	REQUIRE_FEATURE(0, PPC_FEATURE_64);
+	REQUIRE_FEATURE(0, PPC_FEATURE_HAS_FPU);
+	REQUIRE_FEATURE(0, PPC_FEATURE_HAS_MMU);
+	REQUIRE_FEATURE(0, PPC_FEATURE_HAS_VSX);
+	REQUIRE_FEATURE(1, PPC_FEATURE2_ARCH_2_07);
+
+	CHECK_FEATURE(0, PPC_FEATURE_TRUE_LE);
+	CHECK_FEATURE(1, PPC_FEATURE2_HTM);
+	CHECK_FEATURE(1, PPC_FEATURE2_DSCR);
+	CHECK_FEATURE(1, PPC_FEATURE2_EBB);
+	CHECK_FEATURE(1, PPC_FEATURE2_ISEL);
+	CHECK_FEATURE(1, PPC_FEATURE2_TAR);
+	CHECK_FEATURE(1, PPC_FEATURE2_VEC_CRYPTO);
+
+	ret = 0;
+error:
+	close_image(img);
+	return ret;
+}
+
+int cpuinfo_dump(void)
+{
+	if (cpu_init())
+		return -1;
+
+	if (cpu_dump_cpuinfo())
+		return -1;
+
+	return 0;
+}
+
+int cpuinfo_check(void)
+{
+	if (cpu_init())
+		return -1;
+
+	if (cpu_validate_cpuinfo())
+		return 1;
+
+	return 0;
+}
diff --git a/CRIU_code/criu/arch/ppc64/crtools.c b/CRIU_code/criu/arch/ppc64/crtools.c
new file mode 100644
index
0000000..5a5966a
--- /dev/null
+++ b/CRIU_code/criu/arch/ppc64/crtools.c
@@ -0,0 +1,529 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "types.h"
+#include
+#include "asm/restorer.h"
+#include "asm/dump.h"
+
+#include "cr_options.h"
+#include "common/compiler.h"
+#include
+#include "parasite-syscall.h"
+#include "log.h"
+#include "util.h"
+#include "cpu.h"
+#include
+
+#include "protobuf.h"
+#include "images/core.pb-c.h"
+#include "images/creds.pb-c.h"
+
+static UserPpc64FpstateEntry *copy_fp_regs(uint64_t *fpregs)
+{
+	UserPpc64FpstateEntry *fpe;
+	int i;
+
+	fpe = xmalloc(sizeof(UserPpc64FpstateEntry));
+	if (!fpe)
+		return NULL;
+	user_ppc64_fpstate_entry__init(fpe);
+
+	fpe->n_fpregs = NFPREG;
+	fpe->fpregs = xmalloc(fpe->n_fpregs * sizeof(fpe->fpregs[0]));
+	if (!fpe->fpregs) {
+		xfree(fpe);
+		return NULL;
+	}
+
+	/* FPSCR is the last (33rd) register in the set */
+	for (i = 0; i < NFPREG; i++)
+		fpe->fpregs[i] = fpregs[i];
+
+	return fpe;
+}
+
+/*
+ * Copy the saved FP registers into the signal frame context @mc.
+ * Returns 0 on success, -1 when the image carries more entries than
+ * copy_fp_regs() ever writes (NFPREG); such an image is treated as corrupted.
+ */
+static int put_fpu_regs(mcontext_t *mc, UserPpc64FpstateEntry *fpe)
+{
+	uint64_t *mcfp = (uint64_t *)mc->fp_regs;
+	size_t i;
+
+	if (fpe->n_fpregs > NFPREG) {
+		pr_err("Corrupted FPU dump data\n");
+		return -1;
+	}
+
+	for (i = 0; i < fpe->n_fpregs; i++)
+		mcfp[i] = fpe->fpregs[i];
+
+	return 0;
+}
+
+static UserPpc64VrstateEntry *copy_altivec_regs(__vector128 *vrregs)
+{
+	UserPpc64VrstateEntry *vse;
+	uint64_t *p64;
+	uint32_t *p32;
+	int i;
+
+	vse = xmalloc(sizeof(*vse));
+	if (!vse)
+		return NULL;
+	user_ppc64_vrstate_entry__init(vse);
+
+	/* protocol buffer store only 64bit entries and we need 128bit */
+	vse->n_vrregs = (NVRREG-1) * 2;
+	vse->vrregs = xmalloc(vse->n_vrregs * sizeof(vse->vrregs[0]));
+	if (!vse->vrregs) {
+		xfree(vse);
+		return NULL;
+	}
+
+	/* Vectors are 2*64bits entries */
+	for (i = 0; i < (NVRREG-1); i++) {
+		p64 = (uint64_t*) &vrregs[i];
+		vse->vrregs[i*2] = p64[0];
+		vse->vrregs[i*2 + 1] = p64[1];
+	}
+
+	p32 = (uint32_t*) &vrregs[NVRREG-1];
+	vse->vrsave = *p32;
+
+	return vse;
+}
+
+static int put_altivec_regs(mcontext_t *mc, UserPpc64VrstateEntry *vse)
+{
+	vrregset_t *v_regs = (vrregset_t *)(((unsigned long)mc->vmx_reserve + 15) & ~0xful);
+
+	pr_debug("Restoring Altivec registers\n");
+
+	if (vse->n_vrregs != (NVRREG-1)*2) {
+		pr_err("Corrupted Altivec dump data\n");
+		return -1;
+	}
+
+	/* Note that this should only be done in the case MSR_VEC is set but
+	 * this is not a big deal to do that in all cases.
+	 */
+	memcpy(&v_regs->vrregs[0][0], vse->vrregs,
+	       sizeof(uint64_t) * 2 * (NVRREG-1));
+	/* vscr has been restored with the previous memcpy which copied 32
+	 * 128bits registers + a 128bits field containing the vscr value in
+	 * the low part.
+	 */
+
+	v_regs->vrsave = vse->vrsave;
+	mc->v_regs = v_regs;
+
+	return 0;
+}
+
+static UserPpc64VsxstateEntry* copy_vsx_regs(uint64_t *vsregs)
+{
+	UserPpc64VsxstateEntry *vse;
+	int i;
+
+	vse = xmalloc(sizeof(*vse));
+	if (!vse)
+		return NULL;
+
+	user_ppc64_vsxstate_entry__init(vse);
+	vse->n_vsxregs = NVSXREG;
+
+	vse->vsxregs = xmalloc(vse->n_vsxregs*sizeof(vse->vsxregs[0]));
+	if (!vse->vsxregs) {
+		xfree(vse);
+		return NULL;
+	}
+
+	for (i = 0; i < vse->n_vsxregs; i++)
+		vse->vsxregs[i] = vsregs[i];
+
+	return vse;
+}
+
+static int put_vsx_regs(mcontext_t *mc, UserPpc64VsxstateEntry *vse)
+{
+	uint64_t *buf;
+	int i;
+
+	pr_debug("Restoring VSX registers\n");
+	if (!mc->v_regs) {
+		/* VSX implies Altivec so v_regs should be set */
+		pr_err("Internal error\n");
+		return -1;
+	}
+
+	/* copy_vsx_regs() never stores more than NVSXREG entries */
+	if (vse->n_vsxregs > NVSXREG) {
+		pr_err("Corrupted VSX dump data\n");
+		return -1;
+	}
+
+	/* point after the Altivec registers */
+	buf = (uint64_t*) (mc->v_regs + 1);
+
+	/* Copy the value saved by get_vsx_regs in the sigframe */
+	for (i=0; i < vse->n_vsxregs; i++)
+		buf[i] = vse->vsxregs[i];
+
+	return 0;
+}
+
+
+static void copy_gp_regs(UserPpc64RegsEntry *dst, user_regs_struct_t *src)
+{
+	int i;
+
+#define assign_reg(e) do {			\
+	dst->e = (__typeof__(dst->e))src->e;	\
+} while (0)
+
+	for (i=0; i<32; i++)
+		assign_reg(gpr[i]);
+	assign_reg(nip);
+	assign_reg(msr);
+	assign_reg(orig_gpr3);
+	assign_reg(ctr);
+	assign_reg(link);
+	assign_reg(xer);
+	assign_reg(ccr);
+	assign_reg(trap);
+#undef assign_reg
+}
+
+static void restore_gp_regs(mcontext_t *dst, UserPpc64RegsEntry *src)
+{
+	int i;
+
+	/* r0 to r31 */
+	for (i=0; i<32; i++)
+		dst->gp_regs[i] = src->gpr[i];
+
+	dst->gp_regs[PT_NIP] = src->nip;
+	dst->gp_regs[PT_MSR] = src->msr;
+	dst->gp_regs[PT_ORIG_R3] = src->orig_gpr3;
+	dst->gp_regs[PT_CTR] = src->ctr;
+	dst->gp_regs[PT_LNK] = src->link;
+	dst->gp_regs[PT_XER] = src->xer;
+	dst->gp_regs[PT_CCR] = src->ccr;
+	dst->gp_regs[PT_TRAP] = src->trap;
+}
+
+static UserPpc64RegsEntry *allocate_gp_regs(void)
+{
+	UserPpc64RegsEntry *gpregs;
+
+	gpregs = xmalloc(sizeof(*gpregs));
+	if (!gpregs)
+		return NULL;
+	user_ppc64_regs_entry__init(gpregs);
+
+	gpregs->n_gpr = 32;
+	gpregs->gpr = xmalloc(32 * sizeof(uint64_t));
+	if (!gpregs->gpr) {
+		xfree(gpregs);
+		return NULL;
+	}
+
+	return gpregs;
+}
+
+/****************************************************************************
+ * TRANSACTIONAL MEMORY SUPPORT
+ */
+static void xfree_tm_state(UserPpc64TmRegsEntry *tme)
+{
+	if (tme) {
+		if (tme->fpstate) {
+			xfree(tme->fpstate->fpregs);
+			xfree(tme->fpstate);
+		}
+		if (tme->vrstate) {
+			xfree(tme->vrstate->vrregs);
+			xfree(tme->vrstate);
+		}
+		if (tme->vsxstate) {
+			xfree(tme->vsxstate->vsxregs);
+			xfree(tme->vsxstate);
+		}
+		if (tme->gpregs) {
+			if (tme->gpregs->gpr)
+				xfree(tme->gpregs->gpr);
+			xfree(tme->gpregs);
+		}
+		xfree(tme);
+	}
+}
+
+static int put_tm_regs(struct rt_sigframe *f, UserPpc64TmRegsEntry *tme)
+{
+/*
+ * WARNING: As stated in kernel's restore_tm_sigcontexts, TEXASR has to be
+ * restored by the process itself :
+ *   TEXASR was set by the signal delivery reclaim, as was TFIAR.
+ *   Users doing anything abhorrent like thread-switching w/ signals for
+ *   TM-Suspended code will have to back TEXASR/TFIAR up themselves.
+ *   For the case of getting a signal and simply returning from it,
+ *   we don't need to re-copy them here.
+ */
+	ucontext_t *tm_uc = &f->uc_transact;
+
+	pr_debug("Restoring TM registers FP:%d VR:%d VSX:%d\n",
+		 !!(tme->fpstate), !!(tme->vrstate), !!(tme->vsxstate));
+
+	restore_gp_regs(&tm_uc->uc_mcontext, tme->gpregs);
+
+	if (tme->fpstate && put_fpu_regs(&tm_uc->uc_mcontext, tme->fpstate))
+		return -1;
+
+	if (tme->vrstate && put_altivec_regs(&tm_uc->uc_mcontext,
+					     tme->vrstate))
+		return -1;
+
+	if (tme->vsxstate && put_vsx_regs(&tm_uc->uc_mcontext,
+					  tme->vsxstate))
+		return -1;
+
+	f->uc.uc_link = tm_uc;
+	return 0;
+}
+
+/****************************************************************************/
+static int copy_tm_regs(user_regs_struct_t *regs, user_fpregs_struct_t *fpregs,
+			CoreEntry *core)
+{
+	UserPpc64TmRegsEntry *tme;
+	UserPpc64RegsEntry *gpregs = core->ti_ppc64->gpregs;
+
+	pr_debug("Copying TM registers\n");
+	tme = xmalloc(sizeof(*tme));
+	if (!tme)
+		return -1;
+
+	user_ppc64_tm_regs_entry__init(tme);
+
+	tme->gpregs = allocate_gp_regs();
+	if (!tme->gpregs)
+		goto out_free;
+
+	gpregs->has_tfhar		= true;
+	gpregs->tfhar			= fpregs->tm.tm_spr_regs.tfhar;
+	gpregs->has_texasr		= true;
+	gpregs->texasr			= fpregs->tm.tm_spr_regs.texasr;
+	gpregs->has_tfiar		= true;
+	gpregs->tfiar			= fpregs->tm.tm_spr_regs.tfiar;
+
+
+	/* This is the checkpointed state, we must save it in place of the
+	 * current state because the signal handler is made in this way.
+	 * We invert the 2 states instead of when building the signal frame,
+	 * because we can't modify the gpregs manipulated by the common layer.
+	 */
+	copy_gp_regs(gpregs, &fpregs->tm.regs);
+
+	if (fpregs->tm.flags & USER_FPREGS_FL_FP) {
+		core->ti_ppc64->fpstate = copy_fp_regs(fpregs->tm.fpregs);
+		if (!core->ti_ppc64->fpstate)
+			goto out_free;
+	}
+
+	if (fpregs->tm.flags & USER_FPREGS_FL_ALTIVEC) {
+		core->ti_ppc64->vrstate = copy_altivec_regs(fpregs->tm.vrregs);
+		if (!core->ti_ppc64->vrstate)
+			goto out_free;
+
+		/*
+		 * Force the MSR_VEC bit of the restored MSR otherwise the
+		 * kernel will not restore them from the signal frame.
+		 */
+		gpregs->msr |= MSR_VEC;
+
+		if (fpregs->tm.flags & USER_FPREGS_FL_VSX) {
+			core->ti_ppc64->vsxstate = copy_vsx_regs(fpregs->tm.vsxregs);
+			if (!core->ti_ppc64->vsxstate)
+				goto out_free;
+			/*
+			 * Force the MSR_VSX bit of the restored MSR otherwise
+			 * the kernel will not restore them from the signal
+			 * frame.
+			 */
+			gpregs->msr |= MSR_VSX;
+		}
+	}
+
+	core->ti_ppc64->tmstate = tme;
+	return 0;
+
+out_free:
+	xfree_tm_state(tme);
+	return -1;
+}
+
+static int __copy_task_regs(user_regs_struct_t *regs,
+			    user_fpregs_struct_t *fpregs,
+			    CoreEntry *core)
+{
+	UserPpc64RegsEntry *gpregs;
+	UserPpc64FpstateEntry **fpstate;
+	UserPpc64VrstateEntry **vrstate;
+	UserPpc64VsxstateEntry **vsxstate;
+
+	/* Copy retrieved registers in the proto data
+	 * If TM is in the loop we switch the saved register set because
+	 * the signal frame is built with checkpointed registers on top to not
+	 * confuse TM-unaware processes, while ptrace is retrieving the
+	 * checkpointed set through the TM specific ELF notes.
+	 */
+	if (fpregs->flags & USER_FPREGS_FL_TM) {
+		if (copy_tm_regs(regs, fpregs, core))
+			return -1;
+		gpregs = core->ti_ppc64->tmstate->gpregs;
+		fpstate = &(core->ti_ppc64->tmstate->fpstate);
+		vrstate = &(core->ti_ppc64->tmstate->vrstate);
+		vsxstate = &(core->ti_ppc64->tmstate->vsxstate);
+	}
+	else {
+		gpregs = core->ti_ppc64->gpregs;
+		fpstate = &(core->ti_ppc64->fpstate);
+		vrstate = &(core->ti_ppc64->vrstate);
+		vsxstate = &(core->ti_ppc64->vsxstate);
+	}
+
+	copy_gp_regs(gpregs, regs);
+	if (fpregs->flags & USER_FPREGS_FL_FP) {
+		*fpstate = copy_fp_regs(fpregs->fpregs);
+		if (!*fpstate)
+			return -1;
+	}
+	if (fpregs->flags & USER_FPREGS_FL_ALTIVEC) {
+		*vrstate = copy_altivec_regs(fpregs->vrregs);
+		if (!*vrstate)
+			return -1;
+		/*
+		 * Force the MSR_VEC bit of the restored MSR otherwise the
+		 * kernel will not restore them from the signal frame.
+		 */
+		gpregs->msr |= MSR_VEC;
+
+		if (fpregs->flags & USER_FPREGS_FL_VSX) {
+			*vsxstate = copy_vsx_regs(fpregs->vsxregs);
+			if (!*vsxstate)
+				return -1;
+			/*
+			 * Force the MSR_VSX bit of the restored MSR otherwise
+			 * the kernel will not restore them from the signal
+			 * frame.
+			 */
+			gpregs->msr |= MSR_VSX;
+		}
+	}
+	return 0;
+}
+
+int save_task_regs(void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f)
+{
+	return __copy_task_regs(u, f, (CoreEntry *)arg);
+}
+
+/****************************************************************************/
+int arch_alloc_thread_info(CoreEntry *core)
+{
+	ThreadInfoPpc64 *ti_ppc64;
+
+	ti_ppc64 = xmalloc(sizeof(*ti_ppc64));
+	if(!ti_ppc64)
+		return -1;
+
+	thread_info_ppc64__init(ti_ppc64);
+
+	ti_ppc64->gpregs = allocate_gp_regs();
+	if (!ti_ppc64->gpregs) {
+		xfree(ti_ppc64);
+		return -1;
+	}
+
+	CORE_THREAD_ARCH_INFO(core) = ti_ppc64;
+	return 0;
+}
+
+void arch_free_thread_info(CoreEntry *core)
+{
+	if (CORE_THREAD_ARCH_INFO(core)) {
+		if (CORE_THREAD_ARCH_INFO(core)->fpstate) {
+			xfree(CORE_THREAD_ARCH_INFO(core)->fpstate->fpregs);
+			xfree(CORE_THREAD_ARCH_INFO(core)->fpstate);
+		}
+		if (CORE_THREAD_ARCH_INFO(core)->vrstate) {
+			xfree(CORE_THREAD_ARCH_INFO(core)->vrstate->vrregs);
+			xfree(CORE_THREAD_ARCH_INFO(core)->vrstate);
+		}
+		if (CORE_THREAD_ARCH_INFO(core)->vsxstate) {
+			xfree(CORE_THREAD_ARCH_INFO(core)->vsxstate->vsxregs);
+			xfree(CORE_THREAD_ARCH_INFO(core)->vsxstate);
+		}
+		xfree_tm_state(CORE_THREAD_ARCH_INFO(core)->tmstate);
+		xfree(CORE_THREAD_ARCH_INFO(core)->gpregs->gpr);
+		xfree(CORE_THREAD_ARCH_INFO(core)->gpregs);
+		xfree(CORE_THREAD_ARCH_INFO(core));
+		CORE_THREAD_ARCH_INFO(core) = NULL;
+	}
+}
+
+/*
+ * Rebuild the FP, Altivec, VSX and TM parts of @sigframe from @core.
+ * Returns 0 on success, -1 on inconsistent or corrupted register data.
+ */
+int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core)
+{
+	int ret = 0;
+
+	if (CORE_THREAD_ARCH_INFO(core)->fpstate)
+		ret = put_fpu_regs(&sigframe->uc.uc_mcontext,
+				   CORE_THREAD_ARCH_INFO(core)->fpstate);
+	if (ret)
+		return ret;
+
+	if (CORE_THREAD_ARCH_INFO(core)->vrstate)
+		ret = put_altivec_regs(&sigframe->uc.uc_mcontext,
+				       CORE_THREAD_ARCH_INFO(core)->vrstate);
+	else if (core->ti_ppc64->gpregs->msr & MSR_VEC) {
+		pr_err("Register's data mismatch, corrupted image ?\n");
+		ret = -1;
+	}
+
+	if (!ret && CORE_THREAD_ARCH_INFO(core)->vsxstate)
+		ret = put_vsx_regs(&sigframe->uc.uc_mcontext,
+				   CORE_THREAD_ARCH_INFO(core)->vsxstate);
+	else if (core->ti_ppc64->gpregs->msr & MSR_VSX) {
+		pr_err("VSX register's data mismatch, corrupted image ?\n");
+		ret = -1;
+	}
+
+	if (!ret && CORE_THREAD_ARCH_INFO(core)->tmstate)
+		ret = put_tm_regs(sigframe,
+				  CORE_THREAD_ARCH_INFO(core)->tmstate);
+	else if (MSR_TM_ACTIVE(core->ti_ppc64->gpregs->msr)) {
+		pr_err("TM register's data mismatch, corrupted image ?\n");
+		ret = -1;
+	}
+
+	return ret;
+}
+
+int restore_gpregs(struct rt_sigframe *f, UserPpc64RegsEntry *r)
+{
+	restore_gp_regs(&f->uc.uc_mcontext, r);
+
+	return 0;
+}
diff --git a/CRIU_code/criu/arch/ppc64/include/asm/dump.h b/CRIU_code/criu/arch/ppc64/include/asm/dump.h
new file mode 100644
index 0000000..a81ee02
--- /dev/null
+++ b/CRIU_code/criu/arch/ppc64/include/asm/dump.h
@@ -0,0 +1,13 @@
+#ifndef __CR_ASM_DUMP_H__
+#define __CR_ASM_DUMP_H__
+
+extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *);
+extern int arch_alloc_thread_info(CoreEntry *core);
+extern void arch_free_thread_info(CoreEntry *core);
+
+
+#define core_put_tls(core, tls)
+
+#define get_task_futex_robust_list_compat(pid, info) -1
+
+#endif
diff --git a/CRIU_code/criu/arch/ppc64/include/asm/int.h b/CRIU_code/criu/arch/ppc64/include/asm/int.h
new file mode 100644
index 0000000..642804e
--- /dev/null
+++ b/CRIU_code/criu/arch/ppc64/include/asm/int.h
@@ -0,0 +1,6 @@
+#ifndef __CR_ASM_INT_H__
+#define __CR_ASM_INT_H__
+
+#include "asm-generic/int.h"
+
+#endif /* __CR_ASM_INT_H__ */
diff --git a/CRIU_code/criu/arch/ppc64/include/asm/kerndat.h b/CRIU_code/criu/arch/ppc64/include/asm/kerndat.h
new file mode 100644
index 0000000..60956b5
--- /dev/null
+++ b/CRIU_code/criu/arch/ppc64/include/asm/kerndat.h
@@ -0,0 +1,7 @@
+#ifndef __CR_ASM_KERNDAT_H__
+#define __CR_ASM_KERNDAT_H__
+
+#define kdat_compatible_cr()			0
+#define kdat_can_map_vdso()			0
+
+#endif /* __CR_ASM_KERNDAT_H__ */
diff --git a/CRIU_code/criu/arch/ppc64/include/asm/parasite-syscall.h
b/CRIU_code/criu/arch/ppc64/include/asm/parasite-syscall.h new file mode 100644 index 0000000..6008c37 --- /dev/null +++ b/CRIU_code/criu/arch/ppc64/include/asm/parasite-syscall.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_PARASITE_SYSCALL_H__ +#define __CR_ASM_PARASITE_SYSCALL_H__ + +struct parasite_ctl; + +#endif diff --git a/CRIU_code/criu/arch/ppc64/include/asm/parasite.h b/CRIU_code/criu/arch/ppc64/include/asm/parasite.h new file mode 100644 index 0000000..fdbc340 --- /dev/null +++ b/CRIU_code/criu/arch/ppc64/include/asm/parasite.h @@ -0,0 +1,7 @@ +#ifndef __ASM_PARASITE_H__ +#define __ASM_PARASITE_H__ + +/* TLS is accessed through r13, which is already processed */ +static inline void arch_get_tls(tls_t *ptls) { (void)ptls; } + +#endif diff --git a/CRIU_code/criu/arch/ppc64/include/asm/restore.h b/CRIU_code/criu/arch/ppc64/include/asm/restore.h new file mode 100644 index 0000000..8d45160 --- /dev/null +++ b/CRIU_code/criu/arch/ppc64/include/asm/restore.h @@ -0,0 +1,31 @@ +#ifndef __CR_ASM_RESTORE_H__ +#define __CR_ASM_RESTORE_H__ + +#include "asm/restorer.h" + +#include "images/core.pb-c.h" + +/* + * Set R2 to blob + 8000 which is the default value + * Jump to restore_task_exec_start + 8 since R2 is already set (local call) + */ +#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \ + task_args) \ + asm volatile( \ + "mr 1,%0 \n" \ + "mr 12,%1 \n" \ + "mtctr 12 \n" \ + "mr 3,%2 \n" \ + "bctr \n" \ + : \ + : "r"(new_sp), \ + "r"((unsigned long)restore_task_exec_start), \ + "r"(task_args) \ + : "1", "3", "12") + +/* There is nothing to do since TLS is accessed through r13 */ +#define core_get_tls(pcore, ptls) + +int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); + +#endif /* __CR_ASM_RESTORE_H__ */ diff --git a/CRIU_code/criu/arch/ppc64/include/asm/restorer.h b/CRIU_code/criu/arch/ppc64/include/asm/restorer.h new file mode 100644 index 0000000..d48d833 --- /dev/null +++ b/CRIU_code/criu/arch/ppc64/include/asm/restorer.h @@ -0,0 +1,75 @@ +#ifndef 
__CR_ASM_RESTORER_H__ +#define __CR_ASM_RESTORER_H__ + +#include +#include +#include +#include "asm/types.h" +#include + +#include + +/* + * Clone trampoline + * + * See glibc sysdeps/powerpc/powerpc64/sysdep.h for FRAME_MIN_SIZE defines + */ +#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ + thread_args, clone_restore_fn) \ + asm volatile( \ + "clone_emul: \n" \ + "/* Save fn, args, stack across syscall. */ \n" \ + "mr 14, %5 /* clone_restore_fn in r14 */ \n" \ + "mr 15, %6 /* &thread_args[i] in r15 */ \n" \ + "mr 3, %1 /* clone_flags */ \n" \ + "ld 4, %2 /* new_sp */ \n" \ + "mr 5, %3 /* &parent_tid */ \n" \ + "li 6, 0 /* tls = 0 ? */ \n" \ + "mr 7, %4 /* &thread_args[i].pid */ \n" \ + "li 0,"__stringify(__NR_clone)" \n" \ + "sc \n" \ + "/* Check for child process. */ \n" \ + "cmpdi cr1,3,0 \n" \ + "crandc cr1*4+eq,cr1*4+eq,cr0*4+so \n" \ + "bne- cr1,clone_end \n" \ + "/* child */ \n" \ + "addi 14, 14, 8 /* jump over r2 fixup */ \n" \ + "mtctr 14 \n" \ + "mr 3,15 \n" \ + "bctr \n" \ + "clone_end: \n" \ + "mr %0,3 \n" \ + : "=r"(ret) /* %0 */ \ + : "r"(clone_flags), /* %1 */ \ + "m"(new_sp), /* %2 */ \ + "r"(&parent_tid), /* %3 */ \ + "r"(&thread_args[i].pid), /* %4 */ \ + "r"(clone_restore_fn), /* %5 */ \ + "r"(&thread_args[i]) /* %6 */ \ + : "memory","0","3","4","5","6","7","14","15") + +#define arch_map_vdso(map, compat) -1 + +int restore_gpregs(struct rt_sigframe *f, UserPpc64RegsEntry *r); +int restore_nonsigframe_gpregs(UserPpc64RegsEntry *r); + +/* Nothing to do, TLS is accessed through r13 */ +static inline void restore_tls(tls_t *ptls) { (void)ptls; } + +/* + * Defined in arch/ppc64/syscall-common-ppc64.S + */ +unsigned long sys_shmat(int shmid, const void *shmaddr, int shmflg); + +static inline void *alloc_compat_syscall_stack(void) { return NULL; } +static inline void free_compat_syscall_stack(void *stack32) { } +static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) +{ + return -1; +} +static inline int 
set_compat_robust_list(uint32_t head_ptr, uint32_t len) +{ + return -1; +} + +#endif /*__CR_ASM_RESTORER_H__*/ diff --git a/CRIU_code/criu/arch/ppc64/include/asm/types.h b/CRIU_code/criu/arch/ppc64/include/asm/types.h new file mode 100644 index 0000000..8f3af86 --- /dev/null +++ b/CRIU_code/criu/arch/ppc64/include/asm/types.h @@ -0,0 +1,42 @@ +#ifndef __CR_ASM_TYPES_H__ +#define __CR_ASM_TYPES_H__ + +#include +#include +#include "images/core.pb-c.h" + +#include "page.h" +#include "bitops.h" +#include "asm/int.h" + +#include + +typedef UserPpc64RegsEntry UserRegsEntry; + +#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__PPC64 + +#define core_is_compat(core) false + +#define CORE_THREAD_ARCH_INFO(core) core->ti_ppc64 + +static inline void *decode_pointer(uint64_t v) { return (void*)v; } +static inline uint64_t encode_pointer(void *p) { return (uint64_t)p; } + +/* + * Copied from the following kernel header files : + * include/linux/auxvec.h + * arch/powerpc/include/uapi/asm/auxvec.h + * include/linux/mm_types.h + */ +#define AT_VECTOR_SIZE_BASE 20 +#if !defined AT_VECTOR_SIZE_ARCH +#define AT_VECTOR_SIZE_ARCH 6 +#endif +#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) + +typedef uint64_t auxv_t; + +/* Not used but the structure parasite_dump_thread needs a tls_t field */ +typedef uint64_t tls_t; + +#endif /* __CR_ASM_TYPES_H__ */ diff --git a/CRIU_code/criu/arch/ppc64/include/asm/vdso.h b/CRIU_code/criu/arch/ppc64/include/asm/vdso.h new file mode 100644 index 0000000..9546e24 --- /dev/null +++ b/CRIU_code/criu/arch/ppc64/include/asm/vdso.h @@ -0,0 +1,28 @@ +#ifndef __CR_ASM_VDSO_H__ +#define __CR_ASM_VDSO_H__ + +#include "asm/int.h" +#include "asm-generic/vdso.h" + +/* This definition is used in pie/util-vdso.c to initialize the vdso symbol + * name string table 'vdso_symbols' + * + * Poke from kernel file arch/powerpc/kernel/vdso64/vdso64.lds.S + * + * Note that '__kernel_datapage_offset' is not a service but mostly a data + * inside the text 
page which should not be used as is from user space. + */ +#define VDSO_SYMBOL_MAX 10 +#define ARCH_VDSO_SYMBOLS \ + "__kernel_clock_getres", \ + "__kernel_clock_gettime", \ + "__kernel_get_syscall_map", \ + "__kernel_get_tbfreq", \ + "__kernel_getcpu", \ + "__kernel_gettimeofday", \ + "__kernel_sigtramp_rt64", \ + "__kernel_sync_dicache", \ + "__kernel_sync_dicache_p5", \ + "__kernel_time" + +#endif /* __CR_ASM_VDSO_H__ */ diff --git a/CRIU_code/criu/arch/ppc64/misc.S b/CRIU_code/criu/arch/ppc64/misc.S new file mode 100644 index 0000000..4ee188d --- /dev/null +++ b/CRIU_code/criu/arch/ppc64/misc.S @@ -0,0 +1,197 @@ +/* + * This is from linux/arch/powerpc/lib/crtsavres.S: + * + * Special support for eabi and SVR4 + * + * Copyright (C) 1995, 1996, 1998, 2000, 2001 Free Software Foundation, Inc. + * Copyright 2008 Freescale Semiconductor, Inc. + * Written By Michael Meissner + * + * Based on gcc/config/rs6000/crtsavres.asm from gcc + * 64 bit additions from reading the PPC elf64abi document. + * + * This file is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * In addition to the permissions in the GNU General Public License, the + * Free Software Foundation gives you unlimited permission to link the + * compiled version of this file with other programs, and to distribute + * those programs without any restriction coming from the use of this + * file. (The General Public License restrictions do apply in other + * respects; for example, they cover modification of the file, and + * distribution when not linked into another program.) + * + * This file is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + * As a special exception, if you link this library with files + * compiled with GCC to produce an executable, this does not cause + * the resulting executable to be covered by the GNU General Public License. + * This exception does not however invalidate any other reasons why + * the executable file might be covered by the GNU General Public License. + */ + +#define r0 0 +#define r1 1 +#define r2 2 +#define r3 3 +#define r4 4 +#define r5 5 +#define r6 6 +#define r7 7 +#define r8 8 +#define r9 9 +#define r10 10 +#define r11 11 +#define r12 12 +#define r13 13 +#define r14 14 +#define r15 15 +#define r16 16 +#define r17 17 +#define r18 18 +#define r19 19 +#define r20 20 +#define r21 21 +#define r22 22 +#define r23 23 +#define r24 24 +#define r25 25 +#define r26 26 +#define r27 27 +#define r28 28 +#define r29 29 +#define r30 30 +#define r31 31 + + .text + +.globl _savegpr0_14 +_savegpr0_14: + std r14,-144(r1) +.globl _savegpr0_15 +_savegpr0_15: + std r15,-136(r1) +.globl _savegpr0_16 +_savegpr0_16: + std r16,-128(r1) +.globl _savegpr0_17 +_savegpr0_17: + std r17,-120(r1) +.globl _savegpr0_18 +_savegpr0_18: + std r18,-112(r1) +.globl _savegpr0_19 +_savegpr0_19: + std r19,-104(r1) +.globl _savegpr0_20 +_savegpr0_20: + std r20,-96(r1) +.globl _savegpr0_21 +_savegpr0_21: + std r21,-88(r1) +.globl _savegpr0_22 +_savegpr0_22: + std r22,-80(r1) +.globl _savegpr0_23 +_savegpr0_23: + std r23,-72(r1) +.globl _savegpr0_24 +_savegpr0_24: + std r24,-64(r1) +.globl _savegpr0_25 +_savegpr0_25: + std r25,-56(r1) +.globl _savegpr0_26 +_savegpr0_26: + std r26,-48(r1) +.globl _savegpr0_27 +_savegpr0_27: + std r27,-40(r1) +.globl _savegpr0_28 +_savegpr0_28: + std r28,-32(r1) +.globl _savegpr0_29 +_savegpr0_29: + std r29,-24(r1) +.globl 
_savegpr0_30
+_savegpr0_30:
+	std	r30,-16(r1)
+.globl _savegpr0_31
+_savegpr0_31:
+	std	r31,-8(r1)
+	std	r0,16(r1)
+	blr
+
+.globl _restgpr0_14
+_restgpr0_14:
+	ld	r14,-144(r1)
+.globl _restgpr0_15
+_restgpr0_15:
+	ld	r15,-136(r1)
+.globl _restgpr0_16
+_restgpr0_16:
+	ld	r16,-128(r1)
+.globl _restgpr0_17
+_restgpr0_17:
+	ld	r17,-120(r1)
+.globl _restgpr0_18
+_restgpr0_18:
+	ld	r18,-112(r1)
+.globl _restgpr0_19
+_restgpr0_19:
+	ld	r19,-104(r1)
+.globl _restgpr0_20
+_restgpr0_20:
+	ld	r20,-96(r1)
+.globl _restgpr0_21
+_restgpr0_21:
+	ld	r21,-88(r1)
+.globl _restgpr0_22
+_restgpr0_22:
+	ld	r22,-80(r1)
+.globl _restgpr0_23
+_restgpr0_23:
+	ld	r23,-72(r1)
+.globl _restgpr0_24
+_restgpr0_24:
+	ld	r24,-64(r1)
+.globl _restgpr0_25
+_restgpr0_25:
+	ld	r25,-56(r1)
+.globl _restgpr0_26
+_restgpr0_26:
+	ld	r26,-48(r1)
+.globl _restgpr0_27
+_restgpr0_27:
+	ld	r27,-40(r1)
+.globl _restgpr0_28
+_restgpr0_28:
+	ld	r28,-32(r1)
+.globl _restgpr0_29
+_restgpr0_29:
+	ld	r0,16(r1)
+	ld	r29,-24(r1)
+	mtlr	r0
+	ld	r30,-16(r1)
+	ld	r31,-8(r1)
+	blr
+
+.globl _restgpr0_30
+_restgpr0_30:
+	ld	r30,-16(r1)
+.globl _restgpr0_31
+_restgpr0_31:
+	ld	r0,16(r1)
+	ld	r31,-8(r1)
+	mtlr	r0
+	blr
diff --git a/CRIU_code/criu/arch/ppc64/restorer.c b/CRIU_code/criu/arch/ppc64/restorer.c
new file mode 100644
index 0000000..7172e44
--- /dev/null
+++ b/CRIU_code/criu/arch/ppc64/restorer.c
@@ -0,0 +1,68 @@
+#include
+
+#include "restorer.h"
+#include "asm/restorer.h"
+#include
+
+#include
+#include "log.h"
+
+/*
+ * Restore the transactional-memory SPRs TFHAR, TFIAR and TEXASR from the
+ * image entry. Each value the entry carries is loaded into r3 and moved to
+ * its SPR; the entry is only read, so it is handed to the asm as an "m"
+ * input operand rather than as an output. Always returns 0.
+ */
+int restore_nonsigframe_gpregs(UserPpc64RegsEntry *r)
+{
+#define SPRN_TFHAR 128
+#define SPRN_TFIAR 129
+#define SPRN_TEXASR 130
+
+	if (r->has_tfhar) {
+		asm __volatile__ (
+			"ld 3, %[value] ;"
+			"mtspr %[sprn],3 ;"
+			: /* no output */
+			: [value]"m"(r->tfhar), [sprn]"i"(SPRN_TFHAR)
+			: "r3");
+	}
+
+	if (r->has_tfiar) {
+		asm __volatile__ (
+			"ld 3, %[value] ;"
+			"mtspr %[sprn],3 ;"
+			: /* no output */
+			: [value]"m"(r->tfiar), [sprn]"i"(SPRN_TFIAR)
+			: "r3");
+	}
+
+	if (r->has_texasr) {
+		asm __volatile__ (
+			"ld 3, %[value] ;"
+			"mtspr %[sprn],3 ;"
+			: /* no output */
+			: [value]"m"(r->texasr), [sprn]"i"(SPRN_TEXASR)
+			: "r3");
+	}
+
+	return 0;
+}
+
+unsigned long sys_shmat(int shmid, const void *shmaddr, int shmflg)
+{
+	unsigned long raddr;
+	int ret;
+
+	ret = sys_ipc(21 /*SHMAT */,
+		      shmid, /* first */
+		      shmflg, /* second */
+		      (unsigned long)&raddr, /* third */
+		      shmaddr, /* ptr */
+		      0 /* fifth not used */);
+
+	if (ret)
+		raddr = (unsigned long) ret;
+
+	return raddr;
+}
diff --git a/CRIU_code/criu/arch/ppc64/sigframe.c b/CRIU_code/criu/arch/ppc64/sigframe.c
new file mode 100644
index 0000000..52fad2e
--- /dev/null
+++ b/CRIU_code/criu/arch/ppc64/sigframe.c
@@ -0,0 +1,48 @@
+#include
+#include
+
+#include "asm/sigframe.h"
+#include "asm/types.h"
+
+#include "log.h"
+#include "common/bug.h"
+
+/*
+ * The signal frame has been built using local addresses. Since it has to be
+ * used in the context of the checkpointed process, the v_regs pointer in the
+ * signal frame must be updated to match the address in the remote stack.
+ */
+static inline void update_vregs(mcontext_t *lcontext, mcontext_t *rcontext)
+{
+	if (lcontext->v_regs) {
+		uint64_t offset = (uint64_t)(lcontext->v_regs) - (uint64_t)lcontext;
+		lcontext->v_regs = (vrregset_t *)((uint64_t)rcontext + offset);
+
+		pr_debug("Updated v_regs:%llx (rcontext:%llx)\n",
+			 (unsigned long long) lcontext->v_regs,
+			 (unsigned long long) rcontext);
+	}
+}
+
+int sigreturn_prep_fpu_frame(struct rt_sigframe *frame,
+			     struct rt_sigframe *rframe)
+{
+	uint64_t msr = frame->uc.uc_mcontext.gp_regs[PT_MSR];
+
+	update_vregs(&frame->uc.uc_mcontext, &rframe->uc.uc_mcontext);
+
+	/* Sanity check: If TM so uc_link should be set, otherwise not */
+	if (MSR_TM_ACTIVE(msr) ^ (!!(frame->uc.uc_link))) {
+		BUG();
+		return 1;
+	}
+
+	/* Updating the transactional state address if any */
+	if (frame->uc.uc_link) {
+		update_vregs(&frame->uc_transact.uc_mcontext,
+			     &rframe->uc_transact.uc_mcontext);
+		frame->uc.uc_link = &rframe->uc_transact;
+	}
+
+	return 0;
+}
diff --git a/CRIU_code/criu/arch/ppc64/vdso-pie.c b/CRIU_code/criu/arch/ppc64/vdso-pie.c
new file mode 100644
index 0000000..910c3d3
--- /dev/null
+++ b/CRIU_code/criu/arch/ppc64/vdso-pie.c
@@ -0,0 +1,154 @@
+#include
+
+#include "asm/types.h"
+
+#include
+#include
+#include "parasite-vdso.h"
+#include "log.h"
+#include "common/bug.h"
+
+#ifdef LOG_PREFIX
+# undef LOG_PREFIX
+#endif
+#define LOG_PREFIX "vdso: "
+
+/* These symbols are defined in vdso-trampoline.S */
+extern char *vdso_trampoline, *vdso_trampoline_end;
+
+static inline void invalidate_caches(unsigned long at)
+{
+	asm volatile("isync \n" \
+		     "li 3,0 \n" \
+		     "dcbf 3,%0 \n" \
+		     "sync \n" \
+		     "icbi 3,%0 \n" \
+		     "isync \n" \
+		     : /* no output */ \
+		     : "r"(at) \
+		     :"memory", "r3");
+}
+
+/* This is the size of the trampoline call :
+ * mflr r0
+ * bl trampoline
+ * <64 bit address>
+ */
+#define TRAMP_CALL_SIZE (2*sizeof(uint32_t) + sizeof(uint64_t))
+
+/*
+ * put_trampoline does 2 things :
+ *
+ * 1.
it looks for a place in the checkpointed vDSO where to put the + * trampoline code (see vdso-trampoline.S). + * + * 2. for each symbol from the checkpointed vDSO, it checks that there are + * enough place to put the call to the vDSO trampoline (see + * TRAMP_CALL_SIZE's comment above). + * This done by checking that there is no interesting symbols in the range + * of current one's offset -> (current one's offset + TRAMP_CALL_SIZE). + * Unfortunately the symbols are not sorted by address so we have to look + * for the complete table all the time. Since the vDSO is small, this is + * not a big issue. + */ +static unsigned long put_trampoline(unsigned long at, struct vdso_symtable *sym) +{ + int i,j; + unsigned long size; + unsigned long trampoline = 0; + + /* First of all we have to find a place where to put the trampoline + * code. + */ + size = (unsigned long)&vdso_trampoline_end + - (unsigned long)&vdso_trampoline; + + for (i = 0; i < ARRAY_SIZE(sym->symbols); i++) { + if (vdso_symbol_empty(&sym->symbols[i])) + continue; + + pr_debug("Checking '%s' at %lx\n", sym->symbols[i].name, + sym->symbols[i].offset); + + /* find the nearest following symbol we are interested in */ + for (j=0; j < ARRAY_SIZE(sym->symbols); j++) { + if (i==j || vdso_symbol_empty(&sym->symbols[j])) + continue; + + if (sym->symbols[j].offset <= sym->symbols[i].offset) + /* this symbol is above the current one */ + continue; + + if ((sym->symbols[i].offset+TRAMP_CALL_SIZE) > + sym->symbols[j].offset) { + /* we have a major issue here since we cannot + * even put the trampoline call for this symbol + */ + pr_err("Can't handle small vDSO symbol %s\n", + sym->symbols[i].name); + return 0; + } + + if (trampoline) + /* no need to put it twice */ + continue; + + if ((sym->symbols[j].offset - + (sym->symbols[i].offset+TRAMP_CALL_SIZE)) <= size) + /* not enough place */ + continue; + + /* We can put the trampoline there */ + trampoline = at + sym->symbols[i].offset; + trampoline += TRAMP_CALL_SIZE; + + 
pr_debug("Putting vDSO trampoline in %s at %lx\n", + sym->symbols[i].name, trampoline); + memcpy((void *)trampoline, &vdso_trampoline, + size); + invalidate_caches(trampoline); + } + } + + return trampoline; +} + +static inline void put_trampoline_call(unsigned long at, unsigned long to, + unsigned long tr) +{ + uint32_t *addr = (uint32_t *)at; + + *addr++ = 0x7C0802a6; /* mflr r0 */ + *addr++ = 0x48000001 | ((long)(tr-at-4) & 0x3fffffc); /* bl tr */ + *(uint64_t *)addr = to; /* the address to read by the trampoline */ + + invalidate_caches(at); +} + +int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, + struct vdso_symtable *to, struct vdso_symtable *from, + bool __always_unused compat_vdso) +{ + unsigned int i; + unsigned long trampoline; + + trampoline = (unsigned long)put_trampoline(base_from, from); + if (!trampoline) + return 1; + + for (i = 0; i < ARRAY_SIZE(to->symbols); i++) { + if (vdso_symbol_empty(&from->symbols[i])) + continue; + + pr_debug("br: %lx/%lx -> %lx/%lx (index %d) '%s'\n", + base_from, from->symbols[i].offset, + base_to, to->symbols[i].offset, i, + from->symbols[i].name); + + put_trampoline_call(base_from + from->symbols[i].offset, + base_to + to->symbols[i].offset, + trampoline); + } + + return 0; +} diff --git a/CRIU_code/criu/arch/ppc64/vdso-trampoline.S b/CRIU_code/criu/arch/ppc64/vdso-trampoline.S new file mode 100644 index 0000000..116fc3a --- /dev/null +++ b/CRIU_code/criu/arch/ppc64/vdso-trampoline.S @@ -0,0 +1,11 @@ +#include "common/asm/linkage.h" + + .section .text + +GLOBAL(vdso_trampoline) + mflr r12 /* r12 vdso_ptr's address */ + mtlr r0 /* restore lr */ + ld r12,0(r12) /* read value store in vdso_ptr */ + mtctr r12 /* branch to it */ + bctr +GLOBAL(vdso_trampoline_end) diff --git a/CRIU_code/criu/arch/riscv/Makefile b/CRIU_code/criu/arch/riscv/Makefile new file mode 100644 index 0000000..a9a58c3 --- /dev/null +++ b/CRIU_code/criu/arch/riscv/Makefile @@ -0,0 +1,14 @@ +builtin-name := crtools.built-in.o + 
+ccflags-y += -iquote $(obj)/include +ccflags-y += -iquote criu/include -iquote include +ccflags-y += $(COMPEL_UAPI_INCLUDES) + +asflags-y += -Wstrict-prototypes +asflags-y += -D__ASSEMBLY__ -nostdlib -fomit-frame-pointer +asflags-y += -iquote $(obj)/include +ldflags-y += -r -z noexecstack + +obj-y += cpu.o +obj-y += crtools.o +obj-y += sigframe.o \ No newline at end of file diff --git a/CRIU_code/criu/arch/riscv/cpu.c b/CRIU_code/criu/arch/riscv/cpu.c new file mode 100644 index 0000000..e964876 --- /dev/null +++ b/CRIU_code/criu/arch/riscv/cpu.c @@ -0,0 +1,53 @@ +#include +#include +#include +#include +#include +#include + +#include "bitops.h" +#include "asm/types.h" +#include "asm/cpu.h" +#include +#include + +#include "common/compiler.h" +#include "cr_options.h" +#include "image.h" +#include "util.h" +#include "log.h" +#include "cpu.h" +#include "protobuf.h" +#include "images/cpuinfo.pb-c.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +int cpu_init(void) +{ + return 0; +} + +int cpu_dump_cpuinfo(void) +{ + return 0; +} + +int cpu_validate_cpuinfo(void) +{ + return 0; +} + +int cpuinfo_dump(void) +{ + if (cpu_init()) + return -1; + if (cpu_dump_cpuinfo()) + return -1; + return 0; +} + +int cpuinfo_check(void) +{ + return 0; +} \ No newline at end of file diff --git a/CRIU_code/criu/arch/riscv/crtools.c b/CRIU_code/criu/arch/riscv/crtools.c new file mode 100644 index 0000000..449129e --- /dev/null +++ b/CRIU_code/criu/arch/riscv/crtools.c @@ -0,0 +1,240 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "types.h" +#include "log.h" +#include "asm/parasite-syscall.h" +#include "asm/restorer.h" +#include +#include "asm/dump.h" +#include "cr_options.h" +#include "common/compiler.h" +#include "restorer.h" +#include "parasite-syscall.h" +#include "util.h" +#include "cpu.h" +#include +#include "kerndat.h" + +#include "protobuf.h" +#include "images/core.pb-c.h" +#include "images/creds.pb-c.h" + + +int 
save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +{ + CoreEntry *core = x; + + /* Save the RISCV CPU state */ + core->ti_riscv->gpregs->pc = regs->pc; + core->ti_riscv->gpregs->ra = regs->ra; + core->ti_riscv->gpregs->sp = regs->sp; + core->ti_riscv->gpregs->gp = regs->gp; + core->ti_riscv->gpregs->tp = regs->tp; + core->ti_riscv->gpregs->t0 = regs->t0; + core->ti_riscv->gpregs->t1 = regs->t1; + core->ti_riscv->gpregs->t2 = regs->t2; + core->ti_riscv->gpregs->s0 = regs->s0; + core->ti_riscv->gpregs->s1 = regs->s1; + core->ti_riscv->gpregs->a0 = regs->a0; + core->ti_riscv->gpregs->a1 = regs->a1; + core->ti_riscv->gpregs->a2 = regs->a2; + core->ti_riscv->gpregs->a3 = regs->a3; + core->ti_riscv->gpregs->a4 = regs->a4; + core->ti_riscv->gpregs->a5 = regs->a5; + core->ti_riscv->gpregs->a6 = regs->a6; + core->ti_riscv->gpregs->a7 = regs->a7; + core->ti_riscv->gpregs->s2 = regs->s2; + core->ti_riscv->gpregs->s3 = regs->s3; + core->ti_riscv->gpregs->s4 = regs->s4; + core->ti_riscv->gpregs->s5 = regs->s5; + core->ti_riscv->gpregs->s6 = regs->s6; + core->ti_riscv->gpregs->s7 = regs->s7; + core->ti_riscv->gpregs->s8 = regs->s8; + core->ti_riscv->gpregs->s9 = regs->s9; + core->ti_riscv->gpregs->s10 = regs->s10; + core->ti_riscv->gpregs->s11 = regs->s11; + core->ti_riscv->gpregs->t3 = regs->t3; + core->ti_riscv->gpregs->t4 = regs->t4; + core->ti_riscv->gpregs->t5 = regs->t5; + core->ti_riscv->gpregs->t6 = regs->t6; + + core->ti_riscv->fpregs->fregs[0]= fpregs->f->f[0]; + core->ti_riscv->fpregs->fregs[1]= fpregs->f->f[1]; + core->ti_riscv->fpregs->fregs[2]= fpregs->f->f[2]; + core->ti_riscv->fpregs->fregs[3]= fpregs->f->f[3]; + core->ti_riscv->fpregs->fregs[4]= fpregs->f->f[4]; + core->ti_riscv->fpregs->fregs[5]= fpregs->f->f[5]; + core->ti_riscv->fpregs->fregs[6]= fpregs->f->f[6]; + core->ti_riscv->fpregs->fregs[7]= fpregs->f->f[7]; + core->ti_riscv->fpregs->fregs[8]= fpregs->f->f[8]; + core->ti_riscv->fpregs->fregs[9]= fpregs->f->f[9]; + 
core->ti_riscv->fpregs->fregs[10] = fpregs->f->f[10];
+	core->ti_riscv->fpregs->fregs[11] = fpregs->f->f[11];
+	core->ti_riscv->fpregs->fregs[12] = fpregs->f->f[12];
+	core->ti_riscv->fpregs->fregs[13] = fpregs->f->f[13];
+	core->ti_riscv->fpregs->fregs[14] = fpregs->f->f[14];
+	core->ti_riscv->fpregs->fregs[15] = fpregs->f->f[15];
+	core->ti_riscv->fpregs->fregs[16] = fpregs->f->f[16];
+	core->ti_riscv->fpregs->fregs[17] = fpregs->f->f[17];
+	core->ti_riscv->fpregs->fregs[18] = fpregs->f->f[18];
+	core->ti_riscv->fpregs->fregs[19] = fpregs->f->f[19];
+	core->ti_riscv->fpregs->fregs[20] = fpregs->f->f[20];
+	core->ti_riscv->fpregs->fregs[21] = fpregs->f->f[21];
+	core->ti_riscv->fpregs->fregs[22] = fpregs->f->f[22];
+	core->ti_riscv->fpregs->fregs[23] = fpregs->f->f[23];
+	core->ti_riscv->fpregs->fregs[24] = fpregs->f->f[24];
+	core->ti_riscv->fpregs->fregs[25] = fpregs->f->f[25];
+	core->ti_riscv->fpregs->fregs[26] = fpregs->f->f[26];
+	core->ti_riscv->fpregs->fregs[27] = fpregs->f->f[27];
+	core->ti_riscv->fpregs->fregs[28] = fpregs->f->f[28];
+	core->ti_riscv->fpregs->fregs[29] = fpregs->f->f[29];
+	core->ti_riscv->fpregs->fregs[30] = fpregs->f->f[30];
+	core->ti_riscv->fpregs->fregs[31] = fpregs->f->f[31];
+	core->ti_riscv->fpregs->fcsr = fpregs->f->fcsr;
+
+	return 0;
+}
+
+/*
+ * Allocate the RISC-V thread info, including the 32-entry fregs array that
+ * save_task_regs() fills. @core is only updated when everything succeeded.
+ */
+int arch_alloc_thread_info(CoreEntry *core)
+{
+	ThreadInfoRiscv *ti_riscv;
+
+	ti_riscv = xmalloc(sizeof(*ti_riscv));
+	if (!ti_riscv)
+		return -1;
+	thread_info_riscv__init(ti_riscv);
+
+	ti_riscv->gpregs = xmalloc(sizeof(*ti_riscv->gpregs));
+	if (!ti_riscv->gpregs)
+		goto err;
+	user_riscv_regs_entry__init(ti_riscv->gpregs);
+
+	ti_riscv->fpregs = xmalloc(sizeof(*ti_riscv->fpregs));
+	if (!ti_riscv->fpregs)
+		goto err;
+	user_riscv_fpregs_entry__init(ti_riscv->fpregs);
+
+	ti_riscv->fpregs->fregs = xmalloc(32 * sizeof(ti_riscv->fpregs->fregs[0]));
+	if (!ti_riscv->fpregs->fregs)
+		goto err;
+	ti_riscv->fpregs->n_fregs = 32;
+
+	core->ti_riscv = ti_riscv;
+	return 0;
+err:
+	xfree(ti_riscv->fpregs);
+	xfree(ti_riscv->gpregs);
+	xfree(ti_riscv);
+	return -1;
+}
+
+void arch_free_thread_info(CoreEntry *core)
+{
+	if (!core->ti_riscv)
+		return;
+	xfree(core->ti_riscv->gpregs);
+	if (core->ti_riscv->fpregs) {
+		xfree(core->ti_riscv->fpregs->fregs);
+		xfree(core->ti_riscv->fpregs);
+	}
+	xfree(core->ti_riscv);
+	core->ti_riscv = NULL;
+}
+
+int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core)
+{
+	struct rt_sigframe *f = sigframe;
+	UserRiscvFpregsEntry *r = core->ti_riscv->fpregs;
+
+	sigframe->uc.uc_mcontext.sc_fpregs.f.f[0] = r->fregs[0];
+	sigframe->uc.uc_mcontext.sc_fpregs.f.f[1] = r->fregs[1];
+	sigframe->uc.uc_mcontext.sc_fpregs.f.f[2] = r->fregs[2];
+	sigframe->uc.uc_mcontext.sc_fpregs.f.f[3] = r->fregs[3];
+	sigframe->uc.uc_mcontext.sc_fpregs.f.f[4] = r->fregs[4];
+	sigframe->uc.uc_mcontext.sc_fpregs.f.f[5] = r->fregs[5];
+	sigframe->uc.uc_mcontext.sc_fpregs.f.f[6] = r->fregs[6];
+	sigframe->uc.uc_mcontext.sc_fpregs.f.f[7] = r->fregs[7];
+	sigframe->uc.uc_mcontext.sc_fpregs.f.f[8] = r->fregs[8];
+	sigframe->uc.uc_mcontext.sc_fpregs.f.f[9] = r->fregs[9];
+	sigframe->uc.uc_mcontext.sc_fpregs.f.f[10] = r->fregs[10];
+	sigframe->uc.uc_mcontext.sc_fpregs.f.f[11] = r->fregs[11];
+	sigframe->uc.uc_mcontext.sc_fpregs.f.f[12] = r->fregs[12];
+	sigframe->uc.uc_mcontext.sc_fpregs.f.f[13] = r->fregs[13];
+	sigframe->uc.uc_mcontext.sc_fpregs.f.f[14] = r->fregs[14];
+	sigframe->uc.uc_mcontext.sc_fpregs.f.f[15] = r->fregs[15];
+	sigframe->uc.uc_mcontext.sc_fpregs.f.f[16] = r->fregs[16];
+	sigframe->uc.uc_mcontext.sc_fpregs.f.f[17] = r->fregs[17];
+	sigframe->uc.uc_mcontext.sc_fpregs.f.f[18] = r->fregs[18];
+	sigframe->uc.uc_mcontext.sc_fpregs.f.f[19] = r->fregs[19];
+	sigframe->uc.uc_mcontext.sc_fpregs.f.f[20] = r->fregs[20];
+	sigframe->uc.uc_mcontext.sc_fpregs.f.f[21] = r->fregs[21];
+	sigframe->uc.uc_mcontext.sc_fpregs.f.f[22] = r->fregs[22];
+	sigframe->uc.uc_mcontext.sc_fpregs.f.f[23] = r->fregs[23];
+	sigframe->uc.uc_mcontext.sc_fpregs.f.f[24] = r->fregs[24];
+	sigframe->uc.uc_mcontext.sc_fpregs.f.f[25] = r->fregs[25];
+	sigframe->uc.uc_mcontext.sc_fpregs.f.f[26] = r->fregs[26];
+
sigframe->uc.uc_mcontext.sc_fpregs.f.f[27] = r->fregs[27]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[28] = r->fregs[28]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[29] = r->fregs[29]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[30] = r->fregs[30]; + sigframe->uc.uc_mcontext.sc_fpregs.f.f[31] = r->fregs[31]; + + return 0; +} + + +int restore_gpregs(struct rt_sigframe *f, UserRiscvRegsEntry *r) +{ + f->uc.uc_mcontext.sc_regs.pc = r->pc; + f->uc.uc_mcontext.sc_regs.ra = r->ra; + f->uc.uc_mcontext.sc_regs.sp = r->sp; + f->uc.uc_mcontext.sc_regs.gp = r->gp; + f->uc.uc_mcontext.sc_regs.tp = r->tp; + f->uc.uc_mcontext.sc_regs.t0 = r->t0; + f->uc.uc_mcontext.sc_regs.t1 = r->t1; + f->uc.uc_mcontext.sc_regs.t2 = r->t2; + f->uc.uc_mcontext.sc_regs.s0 = r->s0; + f->uc.uc_mcontext.sc_regs.s1 = r->s1; + f->uc.uc_mcontext.sc_regs.a0 = r->a0; + f->uc.uc_mcontext.sc_regs.a1 = r->a1; + f->uc.uc_mcontext.sc_regs.a2 = r->a2; + f->uc.uc_mcontext.sc_regs.a3 = r->a3; + f->uc.uc_mcontext.sc_regs.a4 = r->a4; + f->uc.uc_mcontext.sc_regs.a5 = r->a5; + f->uc.uc_mcontext.sc_regs.a6 = r->a6; + f->uc.uc_mcontext.sc_regs.a7 = r->a7; + f->uc.uc_mcontext.sc_regs.s2 = r->s2; + f->uc.uc_mcontext.sc_regs.s3 = r->s3; + f->uc.uc_mcontext.sc_regs.s4 = r->s4; + f->uc.uc_mcontext.sc_regs.s5 = r->s5; + f->uc.uc_mcontext.sc_regs.s6 = r->s6; + f->uc.uc_mcontext.sc_regs.s7 = r->s7; + f->uc.uc_mcontext.sc_regs.s8 = r->s8; + f->uc.uc_mcontext.sc_regs.s9 = r->s9; + f->uc.uc_mcontext.sc_regs.s10 = r->s10; + f->uc.uc_mcontext.sc_regs.s11 = r->s11; + f->uc.uc_mcontext.sc_regs.t3 = r->t3; + f->uc.uc_mcontext.sc_regs.t4 = r->t4; + f->uc.uc_mcontext.sc_regs.t5 = r->t5; + f->uc.uc_mcontext.sc_regs.t6 = r->t6; + + return 0; +} + +int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info) +{ + return 0; +} \ No newline at end of file diff --git a/CRIU_code/criu/arch/riscv/include/asm/dump.h b/CRIU_code/criu/arch/riscv/include/asm/dump.h new file mode 100644 index 0000000..e9b23af --- /dev/null +++ 
b/CRIU_code/criu/arch/riscv/include/asm/dump.h @@ -0,0 +1,14 @@ +#ifndef __CR_ASM_DUMP_H__ +#define __CR_ASM_DUMP_H__ + +extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int arch_alloc_thread_info(CoreEntry *core); +extern void arch_free_thread_info(CoreEntry *core); +extern int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info); + +static inline void core_put_tls(CoreEntry *core, tls_t tls) +{ + core->ti_riscv->tls = tls; +} + +#endif \ No newline at end of file diff --git a/CRIU_code/criu/arch/riscv/include/asm/int.h b/CRIU_code/criu/arch/riscv/include/asm/int.h new file mode 100644 index 0000000..6ddf0d0 --- /dev/null +++ b/CRIU_code/criu/arch/riscv/include/asm/int.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_INT_H__ +#define __CR_ASM_INT_H__ + +#include "asm-generic/int.h" + +#endif /* __CR_ASM_INT_H__ */ \ No newline at end of file diff --git a/CRIU_code/criu/arch/riscv/include/asm/kerndat.h b/CRIU_code/criu/arch/riscv/include/asm/kerndat.h new file mode 100644 index 0000000..0a3f2f0 --- /dev/null +++ b/CRIU_code/criu/arch/riscv/include/asm/kerndat.h @@ -0,0 +1,7 @@ +#ifndef __CR_ASM_KERNDAT_H__ +#define __CR_ASM_KERNDAT_H__ + +#define kdat_compatible_cr() 0 +#define kdat_can_map_vdso() 0 + +#endif /* __CR_ASM_KERNDAT_H__ */ \ No newline at end of file diff --git a/CRIU_code/criu/arch/riscv/include/asm/parasite-syscall.h b/CRIU_code/criu/arch/riscv/include/asm/parasite-syscall.h new file mode 100644 index 0000000..6087df2 --- /dev/null +++ b/CRIU_code/criu/arch/riscv/include/asm/parasite-syscall.h @@ -0,0 +1,8 @@ +#ifndef __CR_ASM_PARASITE_SYSCALL_H__ +#define __CR_ASM_PARASITE_SYSCALL_H__ + +#include "asm/types.h" + +struct parasite_ctl; + +#endif \ No newline at end of file diff --git a/CRIU_code/criu/arch/riscv/include/asm/parasite.h b/CRIU_code/criu/arch/riscv/include/asm/parasite.h new file mode 100644 index 0000000..21be68e --- /dev/null +++ b/CRIU_code/criu/arch/riscv/include/asm/parasite.h @@ -0,0 +1,9 @@ 
+#ifndef __ASM_PARASITE_H__
+#define __ASM_PARASITE_H__
+
+static inline void arch_get_tls(tls_t *ptls)
+{
+	asm("mv %0, tp" : "=r"(*ptls));
+}
+
+#endif
\ No newline at end of file
diff --git a/CRIU_code/criu/arch/riscv/include/asm/restore.h b/CRIU_code/criu/arch/riscv/include/asm/restore.h
new file mode 100644
index 0000000..8736935
--- /dev/null
+++ b/CRIU_code/criu/arch/riscv/include/asm/restore.h
@@ -0,0 +1,28 @@
+#ifndef __CR_ASM_RESTORE_H__
+#define __CR_ASM_RESTORE_H__
+
+#include "asm/restorer.h"
+#include "images/core.pb-c.h"
+
+#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start,	\
+			      task_args)				\
+	asm volatile(						\
+		"mv a0, %0 \n"	/* a0 = task_args */		\
+		"mv t2, %1 \n"	/* t2 = blob entry point */	\
+		"mv a1, %2 \n"					\
+		"mv sp, a1 \n"	/* switch to the new stack */	\
+		"jalr t2 \n"					\
+		:						\
+		:"r"(task_args),"r"(restore_task_exec_start),	\
+		 "r"(new_sp)	/* mv needs a register operand */	\
+		: "sp", "t2", "a0","a1")
+
+static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls)
+{
+	*ptls = pcore->ti_riscv->tls;
+}
+
+
+int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core);
+
+#endif
\ No newline at end of file
diff --git a/CRIU_code/criu/arch/riscv/include/asm/restorer.h b/CRIU_code/criu/arch/riscv/include/asm/restorer.h
new file mode 100644
index 0000000..a916f83
--- /dev/null
+++ b/CRIU_code/criu/arch/riscv/include/asm/restorer.h
@@ -0,0 +1,79 @@
+#ifndef __CR_ASM_RESTORER_H__
+#define __CR_ASM_RESTORER_H__
+
+#include "asm/types.h"
+#include
+#include "images/core.pb-c.h"
+#include
+#include
+
+/* RISC-V has no set_thread_area syscall: the TLS base lives in tp. */
+static inline void restore_tls(tls_t *ptls)
+{
+	asm volatile("mv tp, %0" : : "r"(*ptls));
+}
+static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act)
+{
+	return -1;
+}
+static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len)
+{
+	return -1;
+}
+
+/*
+ * Clone trampoline: the child's entry point and argument are stashed on
+ * the new stack before the syscall and reloaded from sp in the child.
+ * RISC-V reports a failed syscall as a negative value in a0, which is
+ * passed back to the caller through ret.
+ */
+#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid,	\
+			     thread_args, clone_restore_fn)		\
+	asm volatile(							\
+		"ld a1,%2 \n"		/* a1 = new_sp */		\
+		"addi a1,a1,-32 \n"	/* room for fn, arg, flags */	\
+		"sd %5,0(a1) \n"					\
+		"sd %6,8(a1) \n"					\
+		"sd %1,16(a1) \n"					\
+		"mv a0,%1 \n"		/* a0=flags */			\
+		"mv a2,%3 \n"		/* a2=parent_tid */		\
+		"li a3,0 \n"		/* a3 = tls is 0 */		\
+		"mv a4,%4 \n"		/* a4 = child_tid */		\
+		"li a7, "__stringify(__NR_clone)" \n"			\
+		"ecall \n"		/* syscall */			\
+		"beqz a0,thread_start \n"				\
+		"mv %0,a0 \n"		/* parent: tid or -errno */	\
+		"j end \n"						\
+		"thread_start: \n"					\
+		"ld t2,0(sp) \n"					\
+		"ld a0,8(sp) \n"					\
+		"jalr t2 \n"						\
+		"nop \n"						\
+		"end: \n"						\
+		: "=r"(ret)						\
+		: "r"(clone_flags),					\
+		  "m"(new_sp),						\
+		  "r"(&parent_tid),					\
+		  "r"(&thread_args[i].pid),				\
+		  "r"(clone_restore_fn),				\
+		  "r"(&thread_args[i])					\
+		:"a0","a1","a2","a3","a4","a7","t2","sp","memory")
+
+#define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \
+			      clone_restore_fn) do { \
+	pr_err("This architecture does not support clone3() with set_tid, yet!\n"); \
+	ret = -1; \
+} while (0)
+
+#define kdat_compatible_cr() 0
+#define arch_map_vdso(map, compat) -1
+
+static inline void *alloc_compat_syscall_stack(void) { return NULL; }
+static inline void free_compat_syscall_stack(void *stack32) { }
+int restore_gpregs(struct rt_sigframe *f, UserRiscvRegsEntry *r);
+int restore_nonsigframe_gpregs(UserRiscvRegsEntry *r);
+
+#define ARCH_HAS_SHMAT_HOOK
+unsigned long arch_shmat(int shmid, void *shmaddr,
+			 int shmflg, unsigned long size);
+#endif
\ No newline at end of file
diff --git a/CRIU_code/criu/arch/riscv/include/asm/syscall32.h b/CRIU_code/criu/arch/riscv/include/asm/syscall32.h
new file mode 100644
index 0000000..fad38e4
--- /dev/null
+++ b/CRIU_code/criu/arch/riscv/include/asm/syscall32.h
@@ -0,0 +1,17 @@
+#ifndef __CR_SYSCALL32_H__
+#define __CR_SYSCALL32_H__
+
+extern long sys_socket(int domain, int type, int protocol);
+extern long sys_connect(int sockfd, struct sockaddr *addr, int addrlen);
+extern long sys_sendto(int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len);
+extern long sys_recvfrom(int sockfd, void *ubuf,
size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len); +extern long sys_sendmsg(int sockfd, const struct msghdr *msg, int flags); +extern long sys_recvmsg(int sockfd, struct msghdr *msg, int flags); +extern long sys_shutdown(int sockfd, int how); +extern long sys_bind(int sockfd, const struct sockaddr *addr, int addrlen); +extern long sys_setsockopt(int sockfd, int level, int optname, const void *optval, unsigned int optlen); +extern long sys_getsockopt(int sockfd, int level, int optname, const void *optval, unsigned int *optlen); +extern long sys_shmat(int shmid, void *shmaddr, int shmflag); +extern long sys_pread(unsigned int fd, char *ubuf, u32 count, u64 pos); + +#endif /* __CR_SYSCALL32_H__ */ \ No newline at end of file diff --git a/CRIU_code/criu/arch/riscv/include/asm/types.h b/CRIU_code/criu/arch/riscv/include/asm/types.h new file mode 100644 index 0000000..58fc94e --- /dev/null +++ b/CRIU_code/criu/arch/riscv/include/asm/types.h @@ -0,0 +1,31 @@ +#ifndef __CR_ASM_TYPES_H__ +#define __CR_ASM_TYPES_H__ + +#include +#include + +#include "page.h" +#include "bitops.h" +#include "asm/int.h" + +#include + +#include "images/core.pb-c.h" + +#define core_is_compat(core) false + +#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__RISCV + +#define CORE_THREAD_ARCH_INFO(core) core->ti_riscv + +typedef UserRiscvRegsEntry UserRegsEntry; + +static inline u64 encode_pointer(void *p) { return (u64)p; } +static inline void *decode_pointer(u64 v) { return (void*)v; } + + +#define AT_VECTOR_SIZE 44 +typedef uint64_t auxv_t; +typedef unsigned long tls_t; + +#endif /* __CR_ASM_TYPES_H__ */ \ No newline at end of file diff --git a/CRIU_code/criu/arch/riscv/include/asm/vdso.h b/CRIU_code/criu/arch/riscv/include/asm/vdso.h new file mode 100644 index 0000000..eeb9e34 --- /dev/null +++ b/CRIU_code/criu/arch/riscv/include/asm/vdso.h @@ -0,0 +1,19 @@ +#ifndef __CR_ASM_VDSO_H__ +#define __CR_ASM_VDSO_H__ + +#include "asm/int.h" +#include "asm-generic/vdso.h" + +/* + * This 
is a minimal amount of symbols + * we should support at the moment. + */ +#define VDSO_SYMBOL_MAX 3 +#define VDSO_SYMBOL_GTOD 0 +#define ARCH_VDSO_SYMBOLS \ + "__vdso_clock_gettime", \ + "__vdso_gettimeofday", \ + "__vdso_clock_getres" + + +#endif /* __CR_ASM_VDSO_H__ */ \ No newline at end of file diff --git a/CRIU_code/criu/arch/riscv/restorer.c b/CRIU_code/criu/arch/riscv/restorer.c new file mode 100644 index 0000000..d92e902 --- /dev/null +++ b/CRIU_code/criu/arch/riscv/restorer.c @@ -0,0 +1,43 @@ +#include + +#include "types.h" +#include "restorer.h" +#include "asm/restorer.h" +#include + +#include +#include +#include +#include "log.h" +#include "cpu.h" + +int restore_nonsigframe_gpregs(UserRiscvRegsEntry *r) +{ + return 0; +} + +unsigned long arch_shmat(int shmid, void *shmaddr, + int shmflg, unsigned long size) +{ + unsigned long smap; + + smap = sys_shmat(shmid, NULL, shmflg); + if (IS_ERR_VALUE(smap)) { + pr_err("shmat() with NULL shmaddr failed: %d\n", (int)smap); + return smap; + } + + if (smap == (unsigned long)shmaddr) + return smap; + + /* Warn ALOUD */ + pr_warn("Restoring shmem %p unaligned to SHMLBA.\n", shmaddr); + pr_warn("Make sure that you don't migrate shmem from non-VIPT cached CPU to VIPT cached \n"); + pr_warn("Otherwise YOU HAVE A CHANCE OF DATA CORRUPTIONS in writeable shmem\n"); + + smap = sys_mremap(smap, size, size, + MREMAP_FIXED | MREMAP_MAYMOVE, (unsigned long)shmaddr); + if (IS_ERR_VALUE(smap)) + pr_err("mremap() for shmem failed: %d\n", (int)smap); + return smap; +} \ No newline at end of file diff --git a/CRIU_code/criu/arch/riscv/sigaction_compat.c b/CRIU_code/criu/arch/riscv/sigaction_compat.c new file mode 100644 index 0000000..73acff1 --- /dev/null +++ b/CRIU_code/criu/arch/riscv/sigaction_compat.c @@ -0,0 +1,18 @@ +#include "log.h" +#include "asm/restorer.h" +#include +#include "asm/compat.h" +#include + +#ifdef CR_NOGLIBC +# include +#endif + +#include "cpu.h" + +extern char restore_rt_sigaction; + +int 
arch_compat_rt_sigaction(void *stack32, int sig, rt_sigaction_t_compat *act) +{ + return 0; +} \ No newline at end of file diff --git a/CRIU_code/criu/arch/riscv/sigframe.c b/CRIU_code/criu/arch/riscv/sigframe.c new file mode 100644 index 0000000..3d0ab39 --- /dev/null +++ b/CRIU_code/criu/arch/riscv/sigframe.c @@ -0,0 +1,13 @@ +#include +#include + +#include "asm/sigframe.h" +#include "asm/types.h" + +#include "log.h" +#include +int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, + struct rt_sigframe *rsigframe) +{ + return 0; +} \ No newline at end of file diff --git a/CRIU_code/criu/arch/riscv/vdso-pie.c b/CRIU_code/criu/arch/riscv/vdso-pie.c new file mode 100644 index 0000000..5d4f15c --- /dev/null +++ b/CRIU_code/criu/arch/riscv/vdso-pie.c @@ -0,0 +1,56 @@ +#include +#include "asm/types.h" + +#include +#include +#include "parasite-vdso.h" +#include "log.h" +#include "common/bug.h" + +#ifdef LOG_PREFIX +# undef LOG_PREFIX +#endif +#define LOG_PREFIX "vdso: " +static void insert_trampoline(uintptr_t from, uintptr_t to) +{ + struct { + uint32_t ldr_pc; + uint32_t imm32; + uint32_t guards; + } __packed jmp = { + .ldr_pc = 0xffffe06f, /* b -4 */ + .imm32 = to, + .guards = 0x00100073, /* ebreak */ + }; + void *iflush_start = (void *)from; + void *iflush_end = iflush_start + sizeof(jmp); + + memcpy((void *)from, &jmp, sizeof(jmp)); + + sys_cacheflush(iflush_start, sizeof(jmp), 0); +} + +int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, + struct vdso_symtable *sto, struct vdso_symtable *sfrom, + bool compat_vdso) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(sto->symbols); i++) { + uintptr_t from, to; + + if (vdso_symbol_empty(&sfrom->symbols[i])) + continue; + + pr_debug("jmp: %lx/%lx -> %lx/%lx (index %d)\n", + base_from, sfrom->symbols[i].offset, + base_to, sto->symbols[i].offset, i); + + from = base_from + sfrom->symbols[i].offset; + to = base_to + sto->symbols[i].offset; + + insert_trampoline(from, to); + } + + return 0; +} \ No 
newline at end of file diff --git a/CRIU_code/criu/arch/s390/Makefile b/CRIU_code/criu/arch/s390/Makefile new file mode 100644 index 0000000..f37337f --- /dev/null +++ b/CRIU_code/criu/arch/s390/Makefile @@ -0,0 +1,7 @@ +builtin-name := crtools.built-in.o + +ldflags-y += -r + +obj-y += cpu.o +obj-y += crtools.o +obj-y += sigframe.o diff --git a/CRIU_code/criu/arch/s390/cpu.c b/CRIU_code/criu/arch/s390/cpu.c new file mode 100644 index 0000000..f93666e --- /dev/null +++ b/CRIU_code/criu/arch/s390/cpu.c @@ -0,0 +1,158 @@ +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +#include +#include + +#include "asm/types.h" + +#include "cr_options.h" +#include "image.h" +#include "util.h" +#include "log.h" +#include "cpu.h" + +#include "protobuf.h" +#include "images/cpuinfo.pb-c.h" + +static compel_cpuinfo_t rt_cpuinfo; + +static const char *hwcap_str1[64] = { + "HWCAP_S390_ESAN3", + "HWCAP_S390_ZARCH", + "HWCAP_S390_STFLE", + "HWCAP_S390_MSA", + "HWCAP_S390_LDISP", + "HWCAP_S390_EIMM", + "HWCAP_S390_DFP", + "HWCAP_S390_HPAGE", + "HWCAP_S390_ETF3EH", + "HWCAP_S390_HIGH_GPRS", + "HWCAP_S390_TE", + "HWCAP_S390_VXRS", + "HWCAP_S390_VXRS_BCD", + "HWCAP_S390_VXRS_EXT", +}; +static const char *hwcap_str2[64] = { }; + +static const char **hwcap_str[2] = { hwcap_str1, hwcap_str2 }; + +static void print_hwcaps(const char *msg, unsigned long hwcap[2]) +{ + int nr, cap; + + pr_debug("%s: Capabilities: %016lx %016lx\n", msg, hwcap[0], hwcap[1]); + for (nr = 0; nr < 2; nr++) { + for (cap = 0; cap < 64; cap++) { + if (!(hwcap[nr] & (1UL << cap))) /* 1UL: hwcap words are unsigned long and cap reaches 63 */ + continue; + if (hwcap_str[nr][cap]) + pr_debug("%s\n", hwcap_str[nr][cap]); + else + pr_debug("Capability %d/0x%lx\n", nr, 1UL << cap); + } + } +} + +int cpu_init(void) +{ + int ret; + + ret = compel_cpuid(&rt_cpuinfo); + print_hwcaps("Host (init)", rt_cpuinfo.hwcap); + return ret; +} + +int cpu_dump_cpuinfo(void) +{ + CpuinfoS390Entry cpu_s390_info = CPUINFO_S390_ENTRY__INIT; + CpuinfoS390Entry *cpu_s390_info_ptr = &cpu_s390_info; + CpuinfoEntry
cpu_info = CPUINFO_ENTRY__INIT; + struct cr_img *img; + int ret = -1; + + img = open_image(CR_FD_CPUINFO, O_DUMP); + if (!img) + return -1; + + cpu_info.s390_entry = &cpu_s390_info_ptr; + cpu_info.n_s390_entry = 1; + + cpu_s390_info.n_hwcap = 2; + cpu_s390_info.hwcap = rt_cpuinfo.hwcap; + + ret = pb_write_one(img, &cpu_info, PB_CPUINFO); + + close_image(img); + return ret; +} + +int cpu_validate_cpuinfo(void) +{ + CpuinfoS390Entry *cpu_s390_entry; + CpuinfoEntry *cpu_info; + struct cr_img *img; + int cap, nr, ret; + + img = open_image(CR_FD_CPUINFO, O_RSTR); + if (!img) + return -1; + + ret = -1; /* every early exit through "error" below is a failure */ + if (pb_read_one(img, &cpu_info, PB_CPUINFO) < 0) + goto error; + + if (cpu_info->n_s390_entry != 1) { + pr_err("No S390 related entry in image\n"); + goto error; + } + cpu_s390_entry = cpu_info->s390_entry[0]; + + if (cpu_s390_entry->n_hwcap != 2) { + pr_err("Hardware capabilities information missing\n"); + goto error; + } + + print_hwcaps("Host", rt_cpuinfo.hwcap); + print_hwcaps("Image", cpu_s390_entry->hwcap); + ret = 0; /* image is well-formed; only a missing host capability fails from here on */ + + for (nr = 0; nr < 2; nr++) { + for (cap = 0; cap < 64; cap++) { + if (!(cpu_s390_entry->hwcap[nr] & (1UL << cap))) /* 1UL: 64-bit mask, cap reaches 63 */ + continue; + if (rt_cpuinfo.hwcap[nr] & (1UL << cap)) + continue; + if (hwcap_str[nr][cap]) + pr_err("CPU Feature %s not supported on host\n", + hwcap_str[nr][cap]); + else + pr_err("CPU Feature %d/%lx not supported on host\n", + nr, 1UL << cap); + ret = -1; + } + } + if (ret == -1) + pr_err("See also: /usr/include/bits/hwcap.h\n"); +error: + close_image(img); + return ret; +} + +int cpuinfo_dump(void) +{ + if (cpu_init()) + return -1; + if (cpu_dump_cpuinfo()) + return -1; + return 0; +} + +int cpuinfo_check(void) +{ + if (cpu_init()) + return 1; + if (cpu_validate_cpuinfo()) + return 1; + return 0; +} diff --git a/CRIU_code/criu/arch/s390/crtools.c b/CRIU_code/criu/arch/s390/crtools.c new file mode 100644 index 0000000..238035b --- /dev/null +++ b/CRIU_code/criu/arch/s390/crtools.c @@ -0,0 +1,782 @@ +#include +#include +#include +#include
+#include +#include + +#include "types.h" +#include +#include "asm/restorer.h" +#include "asm/dump.h" + +#include "cr_options.h" +#include "common/compiler.h" +#include +#include "parasite-syscall.h" +#include "log.h" +#include "util.h" +#include "cpu.h" +#include + +#include "protobuf.h" +#include "images/core.pb-c.h" +#include "images/creds.pb-c.h" +#include "ptrace.h" +#include "pstree.h" +#include "image.h" + +#define NT_PRFPREG 2 +#define NT_S390_VXRS_LOW 0x309 +#define NT_S390_VXRS_HIGH 0x30a +#define NT_S390_GS_CB 0x30b +#define NT_S390_GS_BC 0x30c +#define NT_S390_RI_CB 0x30d + +/* + * Print general purpose and access registers + */ +static void print_core_gpregs(const char *msg, UserS390RegsEntry *gpregs) +{ + int i; + + pr_debug("%s: General purpose registers\n", msg); + pr_debug(" psw %016lx %016lx\n", + gpregs->psw_mask, gpregs->psw_addr); + pr_debug(" orig_gpr2 %016lx\n", gpregs->orig_gpr2); + for (i = 0; i < 16; i++) + pr_debug(" g%02d %016lx\n", i, gpregs->gprs[i]); + for (i = 0; i < 16; i++) + pr_debug(" a%02d %08x\n", i, gpregs->acrs[i]); +} + +/* + * Print vector registers + */ +static void print_core_vx_regs(CoreEntry *core) +{ + UserS390VxrsHighEntry *vxrs_high; + UserS390VxrsLowEntry *vxrs_low; + int i; + + vxrs_high = CORE_THREAD_ARCH_INFO(core)->vxrs_high; + vxrs_low = CORE_THREAD_ARCH_INFO(core)->vxrs_low; + + if (vxrs_low == NULL) { + pr_debug(" No VXRS\n"); + return; + } + for (i = 0; i < 16; i++) + pr_debug(" vx_low%02d %016lx\n", i, vxrs_low->regs[i]); + for (i = 0; i < 32; i += 2) + pr_debug(" vx_high%02d %016lx %016lx\n", i / 2, + vxrs_high->regs[i], vxrs_high->regs[i + 1]); +} + +/* + * Print guarded-storage control block + */ +static void print_core_gs_cb(CoreEntry *core) +{ + UserS390GsCbEntry *gs_cb; + int i; + + gs_cb = CORE_THREAD_ARCH_INFO(core)->gs_cb; + if (!gs_cb) { + pr_debug(" No GS_CB\n"); + return; + } + for (i = 0; i < 4; i++) + pr_debug(" gs_cb%d %lx\n", i, gs_cb->regs[i]); +} + +/* + * Print guarded-storage broadcast 
control block + */ +static void print_core_gs_bc(CoreEntry *core) +{ + UserS390GsCbEntry *gs_bc; + int i; + + gs_bc = CORE_THREAD_ARCH_INFO(core)->gs_bc; + + if (!gs_bc) { + pr_debug(" No GS_BC\n"); + return; + } + for (i = 0; i < 4; i++) + pr_debug(" gs_bc%d %lx\n", i, gs_bc->regs[i]); +} + +/* + * Print runtime-instrumentation control block + */ +static void print_core_ri_cb(CoreEntry *core) +{ + UserS390RiEntry *ri_cb; + int i; + + ri_cb = CORE_THREAD_ARCH_INFO(core)->ri_cb; + if (!ri_cb) { + pr_debug(" No RI_CB\n"); + return; + } + for (i = 0; i < 8; i++) + pr_debug(" ri_cb%d %lx\n", i, ri_cb->regs[i]); +} +/* + * Print architecture registers + */ +static void print_core_fp_regs(const char *msg, CoreEntry *core) +{ + UserS390FpregsEntry *fpregs; + int i; + + fpregs = CORE_THREAD_ARCH_INFO(core)->fpregs; + + pr_debug("%s: Floating point registers\n", msg); + pr_debug(" fpc %08x\n", fpregs->fpc); + for (i = 0; i < 16; i++) + pr_debug(" f%02d %016lx\n", i, fpregs->fprs[i]); + print_core_vx_regs(core); + print_core_gs_cb(core); + print_core_gs_bc(core); + print_core_ri_cb(core); +} + +/* + * Allocate VxrsLow registers + */ +static UserS390VxrsLowEntry *allocate_vxrs_low_regs(void) +{ + UserS390VxrsLowEntry *vxrs_low; + + vxrs_low = xmalloc(sizeof(*vxrs_low)); + if (!vxrs_low) + return NULL; + user_s390_vxrs_low_entry__init(vxrs_low); + + vxrs_low->n_regs = 16; + vxrs_low->regs = xzalloc(16 * sizeof(uint64_t)); + if (!vxrs_low->regs) + goto fail_free_vxrs_low; + return vxrs_low; + +fail_free_vxrs_low: + xfree(vxrs_low); + return NULL; +} + +/* + * Free VxrsLow registers + */ +static void free_vxrs_low_regs(UserS390VxrsLowEntry *vxrs_low) +{ + if (vxrs_low) { + xfree(vxrs_low->regs); + xfree(vxrs_low); + } +} + +/* + * Allocate VxrsHigh registers + */ +static UserS390VxrsHighEntry *allocate_vxrs_high_regs(void) +{ + UserS390VxrsHighEntry *vxrs_high; + + vxrs_high = xmalloc(sizeof(*vxrs_high)); + if (!vxrs_high) + return NULL; + 
user_s390_vxrs_high_entry__init(vxrs_high); + + vxrs_high->n_regs = 32; + vxrs_high->regs = xzalloc(32 * sizeof(uint64_t)); + if (!vxrs_high->regs) + goto fail_free_vxrs_high; + return vxrs_high; + +fail_free_vxrs_high: + xfree(vxrs_high); + return NULL; +} + +/* + * Free VxrsHigh registers + */ +static void free_vxrs_high_regs(UserS390VxrsHighEntry *vxrs_high) +{ + if (vxrs_high) { + xfree(vxrs_high->regs); + xfree(vxrs_high); + } +} + +/* + * Allocate guarded-storage control block (GS_CB and GS_BC) + */ +static UserS390GsCbEntry *allocate_gs_cb(void) +{ + UserS390GsCbEntry *gs_cb; + + gs_cb = xmalloc(sizeof(*gs_cb)); + if (!gs_cb) + return NULL; + user_s390_gs_cb_entry__init(gs_cb); + + gs_cb->n_regs = 4; + gs_cb->regs = xzalloc(4 * sizeof(uint64_t)); + if (!gs_cb->regs) + goto fail_free_gs_cb; + return gs_cb; + +fail_free_gs_cb: + xfree(gs_cb); + return NULL; +} + +/* + * Free Guareded Storage control blocks + */ +static void free_gs_cb(UserS390GsCbEntry *gs_cb) +{ + if (gs_cb) { + xfree(gs_cb->regs); + xfree(gs_cb); + } +} + +/* + * Allocate runtime-instrumentation control block + */ +static UserS390RiEntry *allocate_ri_cb(void) +{ + UserS390RiEntry *ri_cb; + + ri_cb = xmalloc(sizeof(*ri_cb)); + if (!ri_cb) + return NULL; + user_s390_ri_entry__init(ri_cb); + + ri_cb->ri_on = 0; + ri_cb->n_regs = 8; + ri_cb->regs = xzalloc(8 * sizeof(uint64_t)); + if (!ri_cb->regs) + goto fail_free_ri_cb; + return ri_cb; + +fail_free_ri_cb: + xfree(ri_cb); + return NULL; +} + +/* + * Free runtime-instrumentation control block + */ +static void free_ri_cb(UserS390RiEntry *ri_cb) +{ + if (ri_cb) { + xfree(ri_cb->regs); + xfree(ri_cb); + } +} + +/* + * Copy internal structures into Google Protocol Buffers + */ +int save_task_regs(void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) +{ + UserS390VxrsHighEntry *vxrs_high = NULL; + UserS390VxrsLowEntry *vxrs_low = NULL; + UserS390FpregsEntry *fpregs = NULL; + UserS390RegsEntry *gpregs = NULL; + UserS390GsCbEntry *gs_cb = NULL; + 
UserS390GsCbEntry *gs_bc = NULL; + UserS390RiEntry *ri_cb = NULL; + CoreEntry *core = arg; + + gpregs = CORE_THREAD_ARCH_INFO(core)->gpregs; + fpregs = CORE_THREAD_ARCH_INFO(core)->fpregs; + + /* Vector registers */ + if (f->flags & USER_FPREGS_VXRS) { + vxrs_low = allocate_vxrs_low_regs(); + if (!vxrs_low) + return -1; + vxrs_high = allocate_vxrs_high_regs(); + if (!vxrs_high) + goto fail_free_vxrs_low; + memcpy(vxrs_low->regs, &f->vxrs_low, sizeof(f->vxrs_low)); + memcpy(vxrs_high->regs, &f->vxrs_high, sizeof(f->vxrs_high)); + CORE_THREAD_ARCH_INFO(core)->vxrs_low = vxrs_low; + CORE_THREAD_ARCH_INFO(core)->vxrs_high = vxrs_high; + } + /* Guarded-storage control block */ + if (f->flags & USER_GS_CB) { + gs_cb = allocate_gs_cb(); + if (!gs_cb) + goto fail_free_gs_cb; + memcpy(gs_cb->regs, &f->gs_cb, sizeof(f->gs_cb)); + CORE_THREAD_ARCH_INFO(core)->gs_cb = gs_cb; + } + /* Guarded-storage broadcast control block */ + if (f->flags & USER_GS_BC) { + gs_bc = allocate_gs_cb(); + if (!gs_bc) + goto fail_free_gs_bc; + memcpy(gs_bc->regs, &f->gs_bc, sizeof(f->gs_bc)); + CORE_THREAD_ARCH_INFO(core)->gs_bc = gs_bc; + } + /* Runtime-instrumentation control block */ + if (f->flags & USER_RI_CB) { + ri_cb = allocate_ri_cb(); + if (!ri_cb) + goto fail_free_ri_cb; + memcpy(ri_cb->regs, &f->ri_cb, sizeof(f->ri_cb)); + CORE_THREAD_ARCH_INFO(core)->ri_cb = ri_cb; + /* We need to remember that the RI bit was on */ + if (f->flags & USER_RI_ON) + ri_cb->ri_on = 1; + } + /* General purpose registers */ + memcpy(gpregs->gprs, u->prstatus.gprs, sizeof(u->prstatus.gprs)); + gpregs->psw_mask = u->prstatus.psw.mask; + gpregs->psw_addr = u->prstatus.psw.addr; + /* Access registers */ + memcpy(gpregs->acrs, u->prstatus.acrs, sizeof(u->prstatus.acrs)); + /* System call */ + gpregs->system_call = u->system_call; + /* Floating point registers */ + fpregs->fpc = f->prfpreg.fpc; + memcpy(fpregs->fprs, f->prfpreg.fprs, sizeof(f->prfpreg.fprs)); + return 0; +fail_free_ri_cb: + free_ri_cb(ri_cb); 
+fail_free_gs_cb: + free_gs_cb(gs_cb); +fail_free_gs_bc: + free_gs_cb(gs_bc); +fail_free_vxrs_low: + free_vxrs_low_regs(vxrs_low); + return -1; +} + +/* + * Copy general and access registers to signal frame + */ +int restore_gpregs(struct rt_sigframe *f, UserS390RegsEntry *src) +{ + _sigregs *dst = &f->uc.uc_mcontext; + + dst->regs.psw.mask = src->psw_mask; + dst->regs.psw.addr = src->psw_addr; + memcpy(dst->regs.gprs, src->gprs, sizeof(dst->regs.gprs)); + memcpy(dst->regs.acrs, src->acrs, sizeof(dst->regs.acrs)); + + print_core_gpregs("restore_gpregs_regs", src); + return 0; +} + +/* + * Copy floating point and vector registers to mcontext + */ +int restore_fpu(struct rt_sigframe *f, CoreEntry *core) +{ + UserS390VxrsHighEntry *vxrs_high; + UserS390VxrsLowEntry *vxrs_low; + UserS390FpregsEntry *fpregs; + _sigregs *dst = &f->uc.uc_mcontext; + _sigregs_ext *dst_ext = &f->uc.uc_mcontext_ext; + + fpregs = CORE_THREAD_ARCH_INFO(core)->fpregs; + vxrs_high = CORE_THREAD_ARCH_INFO(core)->vxrs_high; + vxrs_low = CORE_THREAD_ARCH_INFO(core)->vxrs_low; + + dst->fpregs.fpc = fpregs->fpc; + memcpy(dst->fpregs.fprs, fpregs->fprs, sizeof(dst->fpregs.fprs)); + if (vxrs_low) { + memcpy(&dst_ext->vxrs_low, vxrs_low->regs, + sizeof(dst_ext->vxrs_low)); + memcpy(&dst_ext->vxrs_high, vxrs_high->regs, + sizeof(dst_ext->vxrs_high)); + } + return 0; +} + +/* + * Allocate floating point registers + */ +static UserS390FpregsEntry *allocate_fp_regs(void) +{ + UserS390FpregsEntry *fpregs; + + fpregs = xmalloc(sizeof(*fpregs)); + if (!fpregs) + return NULL; + user_s390_fpregs_entry__init(fpregs); + + fpregs->n_fprs = 16; + fpregs->fprs = xzalloc(16 * sizeof(uint64_t)); + if (!fpregs->fprs) + goto fail_free_fpregs; + return fpregs; + +fail_free_fpregs: + xfree(fpregs); + return NULL; +} + +/* + * Free floating point registers + */ +static void free_fp_regs(UserS390FpregsEntry *fpregs) +{ + xfree(fpregs->fprs); + xfree(fpregs); +} + +/* + * Allocate general purpose and access registers + */ 
+static UserS390RegsEntry *allocate_gp_regs(void) +{ + UserS390RegsEntry *gpregs; + + gpregs = xmalloc(sizeof(*gpregs)); + if (!gpregs) + return NULL; + user_s390_regs_entry__init(gpregs); + + gpregs->n_gprs = 16; + gpregs->gprs = xzalloc(16 * sizeof(uint64_t)); + if (!gpregs->gprs) + goto fail_free_gpregs; + + gpregs->n_acrs = 16; + gpregs->acrs = xzalloc(16 * sizeof(uint32_t)); + if (!gpregs->acrs) + goto fail_free_gprs; + return gpregs; + +fail_free_gprs: + xfree(gpregs->gprs); +fail_free_gpregs: + xfree(gpregs); + return NULL; +} + +/* + * Free general purpose and access registers + */ +static void free_gp_regs(UserS390RegsEntry *gpregs) +{ + xfree(gpregs->gprs); + xfree(gpregs->acrs); + xfree(gpregs); +} + +/* + * Allocate thread info + */ +int arch_alloc_thread_info(CoreEntry *core) +{ + ThreadInfoS390 *ti_s390; + + ti_s390 = xmalloc(sizeof(*ti_s390)); + if (!ti_s390) + return -1; + + thread_info_s390__init(ti_s390); + + ti_s390->gpregs = allocate_gp_regs(); + if (!ti_s390->gpregs) + goto fail_free_ti_s390; + ti_s390->fpregs = allocate_fp_regs(); + if (!ti_s390->fpregs) + goto fail_free_gp_regs; + + CORE_THREAD_ARCH_INFO(core) = ti_s390; + return 0; + +fail_free_gp_regs: + free_gp_regs(ti_s390->gpregs); +fail_free_ti_s390: + xfree(ti_s390); + return -1; +} + +/* + * Free thread info + */ +void arch_free_thread_info(CoreEntry *core) +{ + if (!CORE_THREAD_ARCH_INFO(core)) + return; + free_gp_regs(CORE_THREAD_ARCH_INFO(core)->gpregs); + free_fp_regs(CORE_THREAD_ARCH_INFO(core)->fpregs); + free_vxrs_low_regs(CORE_THREAD_ARCH_INFO(core)->vxrs_low); + free_vxrs_high_regs(CORE_THREAD_ARCH_INFO(core)->vxrs_high); + free_gs_cb(CORE_THREAD_ARCH_INFO(core)->gs_cb); + free_gs_cb(CORE_THREAD_ARCH_INFO(core)->gs_bc); + free_ri_cb(CORE_THREAD_ARCH_INFO(core)->ri_cb); + xfree(CORE_THREAD_ARCH_INFO(core)); + CORE_THREAD_ARCH_INFO(core) = NULL; +} + +/* + * Set regset for pid + */ +static int setregset(int pid, int set, const char *set_str, struct iovec *iov) +{ + if 
(ptrace(PTRACE_SETREGSET, pid, set, iov) == 0) + return 0; + pr_perror("Couldn't set %s registers for pid %d", set_str, pid); + return -1; +} + +/* + * Set floating point registers for pid from fpregs + */ +static int set_fp_regs(pid_t pid, user_fpregs_struct_t *fpregs) +{ + struct iovec iov; + + iov.iov_base = &fpregs->prfpreg; + iov.iov_len = sizeof(fpregs->prfpreg); + return setregset(pid, NT_PRFPREG, "PRFPREG", &iov); +} + +/* + * Set vector registers + */ +static int set_vx_regs(pid_t pid, user_fpregs_struct_t *fpregs) +{ + struct iovec iov; + + if (!(fpregs->flags & USER_FPREGS_VXRS)) + return 0; + + iov.iov_base = &fpregs->vxrs_low; + iov.iov_len = sizeof(fpregs->vxrs_low); + if (setregset(pid, NT_S390_VXRS_LOW, "S390_VXRS_LOW", &iov)) + return -1; + + iov.iov_base = &fpregs->vxrs_high; + iov.iov_len = sizeof(fpregs->vxrs_high); + return setregset(pid, NT_S390_VXRS_HIGH, "S390_VXRS_HIGH", &iov); +} + +/* + * Set guarded-storage control block + */ +static int set_gs_cb(pid_t pid, user_fpregs_struct_t *fpregs) +{ + struct iovec iov; + + if (fpregs->flags & USER_GS_CB) { + iov.iov_base = &fpregs->gs_cb; + iov.iov_len = sizeof(fpregs->gs_cb); + if (setregset(pid, NT_S390_GS_CB, "S390_GS_CB", &iov)) + return -1; + } + + if (!(fpregs->flags & USER_GS_BC)) + return 0; + iov.iov_base = &fpregs->gs_bc; + iov.iov_len = sizeof(fpregs->gs_bc); + return setregset(pid, NT_S390_GS_BC, "S390_GS_BC", &iov); +} + +/* + * Set runtime-instrumentation control block + */ +static int set_ri_cb(pid_t pid, user_fpregs_struct_t *fpregs) +{ + struct iovec iov; + + if (!(fpregs->flags & USER_RI_CB)) + return 0; + + iov.iov_base = &fpregs->ri_cb; + iov.iov_len = sizeof(fpregs->ri_cb); + return setregset(pid, NT_S390_RI_CB, "S390_RI_CB", &iov); +} + +/* + * Set runtime-instrumentation bit + * + * The CPU collects information when the RI bit of the PSW is set. + * The RI control block is not part of the signal frame. Therefore during + * sigreturn it is not set. 
If the RI control block is present, the CPU + * writes into undefined storage. Hence, we have disabled the RI bit in + * the sigreturn PSW and set this bit after sigreturn by modifying the PSW + * of the task. + */ +static int set_ri_bit(pid_t pid) +{ + user_regs_struct_t regs; + struct iovec iov; + psw_t *psw; + + iov.iov_base = &regs.prstatus; + iov.iov_len = sizeof(regs.prstatus); + if (ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov) < 0) { + pr_perror("Fail to activate RI bit"); + return -1; + } + psw = &regs.prstatus.psw; + psw->mask |= PSW_MASK_RI; + + return ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov); +} + +/* + * Restore registers not present in sigreturn signal frame + */ +static int set_task_regs_nosigrt(pid_t pid, CoreEntry *core) +{ + user_fpregs_struct_t fpregs; + UserS390GsCbEntry *cgs_cb; + UserS390GsCbEntry *cgs_bc; + UserS390RiEntry *cri_cb; + int ret = 0; + + memset(&fpregs, 0, sizeof(fpregs)); + /* Guarded-storage control block (optional) */ + cgs_cb = CORE_THREAD_ARCH_INFO(core)->gs_cb; + if (cgs_cb != NULL) { + fpregs.flags |= USER_GS_CB; + memcpy(&fpregs.gs_cb, cgs_cb->regs, sizeof(fpregs.gs_cb)); + } + /* Guarded-storage broadcast control block (optional) */ + cgs_bc = CORE_THREAD_ARCH_INFO(core)->gs_bc; + if (cgs_bc != NULL) { + fpregs.flags |= USER_GS_BC; + memcpy(&fpregs.gs_bc, cgs_bc->regs, sizeof(fpregs.gs_bc)); + } + if (set_gs_cb(pid, &fpregs) < 0) + return -1; + /* Runtime-instrumentation control block (optional) */ + cri_cb = CORE_THREAD_ARCH_INFO(core)->ri_cb; + if (cri_cb != NULL) { + fpregs.flags |= USER_RI_CB; + memcpy(&fpregs.ri_cb, cri_cb->regs, sizeof(fpregs.ri_cb)); + if (set_ri_cb(pid, &fpregs) < 0) + return -1; + if (cri_cb->ri_on) { + fpregs.flags |= USER_RI_ON; + ret = set_ri_bit(pid); + } + } + return ret; +} + +/* + * Restore registers for pid from core + */ +static int set_task_regs(pid_t pid, CoreEntry *core) +{ + UserS390VxrsHighEntry *cvxrs_high; + UserS390VxrsLowEntry *cvxrs_low; + UserS390FpregsEntry *cfpregs; +
user_fpregs_struct_t fpregs; + + memset(&fpregs, 0, sizeof(fpregs)); + /* Floating point registers */ + cfpregs = CORE_THREAD_ARCH_INFO(core)->fpregs; + if (!cfpregs) + return -1; + fpregs.prfpreg.fpc = cfpregs->fpc; + memcpy(fpregs.prfpreg.fprs, cfpregs->fprs, sizeof(fpregs.prfpreg.fprs)); + if (set_fp_regs(pid, &fpregs) < 0) + return -1; + /* Vector registers (optional) */ + cvxrs_low = CORE_THREAD_ARCH_INFO(core)->vxrs_low; + if (cvxrs_low != NULL) { + cvxrs_high = CORE_THREAD_ARCH_INFO(core)->vxrs_high; + if (!cvxrs_high) + return -1; + fpregs.flags |= USER_FPREGS_VXRS; + memcpy(&fpregs.vxrs_low, cvxrs_low->regs, + sizeof(fpregs.vxrs_low)); + memcpy(&fpregs.vxrs_high, cvxrs_high->regs, + sizeof(fpregs.vxrs_high)); + if (set_vx_regs(pid, &fpregs) < 0) + return -1; + } + return set_task_regs_nosigrt(pid, core); +} + +/* + * Restore registers for all threads: + * - Floating point registers + * - Vector registers + * - Guarded-storage control block + * - Guarded-storage broadcast control block + * - Runtime-instrumentation control block + */ +int arch_set_thread_regs(struct pstree_item *item, bool with_threads) +{ + int i; + + for_each_pstree_item(item) { + if (item->pid->state == TASK_DEAD || + item->pid->state == TASK_ZOMBIE) + continue; + for (i = 0; i < item->nr_threads; i++) { + if (item->threads[i].state == TASK_DEAD || + item->threads[i].state == TASK_ZOMBIE) + continue; + if (!with_threads && i > 0) + continue; + if (set_task_regs(item->threads[i].real, + item->core[i])) { + pr_perror("Not set registers for task %d", + item->threads[i].real); + return -1; + } + } + } + return 0; +} + +static int open_core(int pid, CoreEntry **pcore) +{ + struct cr_img *img; + int ret; + + img = open_image(CR_FD_CORE, O_RSTR, pid); + if (!img) { + pr_err("Can't open core data for %d\n", pid); + return -1; + } + ret = pb_read_one(img, pcore, PB_CORE); + close_image(img); + + return ret <= 0 ? 
-1 : 0; +} + +/* + * Restore all registers not present in sigreturn signal frame + * + * - Guarded-storage control block + * - Guarded-storage broadcast control block + * - Runtime-instrumentation control block + */ +int arch_set_thread_regs_nosigrt(struct pid *pid) +{ + CoreEntry *core = NULL; /* filled in by pb_read_one() via open_core(); no pre-allocation needed */ + int ret; + + if (open_core(pid->ns[0].virt, &core) < 0) { + pr_perror("Cannot open core for virt pid %d", pid->ns[0].virt); + return -1; + } + ret = set_task_regs_nosigrt(pid->real, core); + if (ret < 0) + pr_perror("Set register for pid %d", pid->real); + else + print_core_fp_regs("restore_fp_regs", core); + core_entry__free_unpacked(core, NULL); /* release the unpacked image on success and failure alike */ + return ret < 0 ? -1 : 0; +} diff --git a/CRIU_code/criu/arch/s390/include/asm/dump.h b/CRIU_code/criu/arch/s390/include/asm/dump.h new file mode 100644 index 0000000..53aaac9 --- /dev/null +++ b/CRIU_code/criu/arch/s390/include/asm/dump.h @@ -0,0 +1,12 @@ +#ifndef __CR_ASM_DUMP_H__ +#define __CR_ASM_DUMP_H__ + +int save_task_regs(void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f); +int arch_alloc_thread_info(CoreEntry *core); +void arch_free_thread_info(CoreEntry *core); + +static inline void core_put_tls(CoreEntry *core, tls_t tls) { } + +#define get_task_futex_robust_list_compat(pid, info) -1 + +#endif diff --git a/CRIU_code/criu/arch/s390/include/asm/int.h b/CRIU_code/criu/arch/s390/include/asm/int.h new file mode 100644 index 0000000..642804e --- /dev/null +++ b/CRIU_code/criu/arch/s390/include/asm/int.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_INT_H__ +#define __CR_ASM_INT_H__ + +#include "asm-generic/int.h" + +#endif /* __CR_ASM_INT_H__ */ diff --git a/CRIU_code/criu/arch/s390/include/asm/kerndat.h b/CRIU_code/criu/arch/s390/include/asm/kerndat.h new file mode 100644 index 0000000..60956b5 --- /dev/null +++ b/CRIU_code/criu/arch/s390/include/asm/kerndat.h @@ -0,0 +1,7 @@ +#ifndef __CR_ASM_KERNDAT_H__ +#define __CR_ASM_KERNDAT_H__ + +#define kdat_compatible_cr() 0 +#define kdat_can_map_vdso() 0 + +#endif /* __CR_ASM_KERNDAT_H__ */ diff --git
a/CRIU_code/criu/arch/s390/include/asm/parasite-syscall.h b/CRIU_code/criu/arch/s390/include/asm/parasite-syscall.h new file mode 100644 index 0000000..6008c37 --- /dev/null +++ b/CRIU_code/criu/arch/s390/include/asm/parasite-syscall.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_PARASITE_SYSCALL_H__ +#define __CR_ASM_PARASITE_SYSCALL_H__ + +struct parasite_ctl; + +#endif diff --git a/CRIU_code/criu/arch/s390/include/asm/parasite.h b/CRIU_code/criu/arch/s390/include/asm/parasite.h new file mode 100644 index 0000000..0b02689 --- /dev/null +++ b/CRIU_code/criu/arch/s390/include/asm/parasite.h @@ -0,0 +1,7 @@ +#ifndef __ASM_PARASITE_H__ +#define __ASM_PARASITE_H__ + +/* TLS is accessed through %a01, which is already processed */ +static inline void arch_get_tls(tls_t *ptls) { (void)ptls; } + +#endif diff --git a/CRIU_code/criu/arch/s390/include/asm/restore.h b/CRIU_code/criu/arch/s390/include/asm/restore.h new file mode 100644 index 0000000..6463d8e --- /dev/null +++ b/CRIU_code/criu/arch/s390/include/asm/restore.h @@ -0,0 +1,27 @@ +#ifndef __CR_ASM_RESTORE_H__ +#define __CR_ASM_RESTORE_H__ + +#include "asm/restorer.h" + +#include "images/core.pb-c.h" +/* + * Load stack to %r15, return address in %r14 and argument 1 into %r2 + */ +#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \ + task_args) \ + asm volatile( \ + "lgr %%r15,%0\n" \ + "lgr %%r14,%1\n" \ + "lgr %%r2,%2\n" \ + "basr %%r14,%%r14\n" \ + : \ + : "d" (new_sp), \ + "d"((unsigned long)restore_task_exec_start), \ + "d" (task_args) \ + : "2", "14", "15", "memory") + +/* There is nothing to do since TLS is accessed through %a01 */ +#define core_get_tls(pcore, ptls) + +int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); +#endif diff --git a/CRIU_code/criu/arch/s390/include/asm/restorer.h b/CRIU_code/criu/arch/s390/include/asm/restorer.h new file mode 100644 index 0000000..cfdefca --- /dev/null +++ b/CRIU_code/criu/arch/s390/include/asm/restorer.h @@ -0,0 +1,65 @@ +#ifndef __CR_ASM_RESTORER_H__ 
+#define __CR_ASM_RESTORER_H__ + +#include +#include + +#include "asm/types.h" + +#include "sigframe.h" + +/* + * Clone trampoline - see glibc sysdeps/unix/sysv/linux/s390/s390-64/clone.S + */ +#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ + thread_args, clone_restore_fn) \ + asm volatile( \ + "lgr %%r0,%6\n" /* Save thread_args in %r0 */ \ + "lgr %%r1,%5\n" /* Save clone_restore_fn in %r1 */ \ + "lgr %%r2,%2\n" /* Parm 1: new_sp (child stack) */ \ + "lgr %%r3,%1\n" /* Parm 2: clone_flags */ \ + "lgr %%r4,%3\n" /* Parm 3: &parent_tid */ \ + "lgr %%r5,%4\n" /* Parm 4: &thread_args[i].pid */ \ + "lghi %%r6,0\n" /* Parm 5: tls = 0 */ \ + "svc "__stringify(__NR_clone)"\n" \ + "ltgr %0,%%r2\n" /* Set and check "ret" */ \ + "jnz 0f\n" /* ret != 0: Continue caller */ \ + "lgr %%r2,%%r0\n" /* Parm 1: &thread_args */ \ + "aghi %%r15,-160\n" /* Prepare stack frame */ \ + "xc 0(8,%%r15),0(%%r15)\n" \ + "basr %%r14,%%r1\n" /* Jump to clone_restore_fn() */ \ + "j .+2\n" /* BUG(): Force PGM check */ \ +"0:\n" /* Continue caller */ \ + : "=d"(ret) \ + : "d"(clone_flags), \ + "a"(new_sp), \ + "d"(&parent_tid), \ + "d"(&thread_args[i].pid), \ + "d"(clone_restore_fn), \ + "d"(&thread_args[i]) \ + : "0", "1", "2", "3", "4", "5", "6", "cc", "memory") + +#define arch_map_vdso(map, compat) -1 + +int restore_gpregs(struct rt_sigframe *f, UserS390RegsEntry *r); +int restore_nonsigframe_gpregs(UserS390RegsEntry *r); + +unsigned long sys_shmat(int shmid, const void *shmaddr, int shmflg); +unsigned long sys_mmap(void *addr, unsigned long len, unsigned long prot, + unsigned long flags, unsigned long fd, + unsigned long offset); + +static inline void restore_tls(tls_t *ptls) { (void)ptls; } +static inline void *alloc_compat_syscall_stack(void) { return NULL; } +static inline void free_compat_syscall_stack(void *stack32) { } +static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) +{ + return -1; +} + +static inline int set_compat_robust_list(uint32_t 
head_ptr, uint32_t len) +{ + return -1; +} + +#endif /*__CR_ASM_RESTORER_H__*/ diff --git a/CRIU_code/criu/arch/s390/include/asm/types.h b/CRIU_code/criu/arch/s390/include/asm/types.h new file mode 100644 index 0000000..4f36c13 --- /dev/null +++ b/CRIU_code/criu/arch/s390/include/asm/types.h @@ -0,0 +1,37 @@ +#ifndef _UAPI_S390_TYPES_H +#define _UAPI_S390_TYPES_H + +#include +#include +#include "images/core.pb-c.h" + +#include "page.h" +#include "bitops.h" +#include "asm/int.h" + +#include + +typedef UserS390RegsEntry UserRegsEntry; + +#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__S390 + +#define core_is_compat(core) false + +#define CORE_THREAD_ARCH_INFO(core) core->ti_s390 + +static inline u64 encode_pointer(void *p) { return (u64) p; } +static inline void *decode_pointer(u64 v) { return (void *) v; } + +/* + * See also: + * * arch/s390/include/uapi/asm/auxvec.h + * * include/linux/auxvec.h + */ +#define AT_VECTOR_SIZE_BASE 20 +#define AT_VECTOR_SIZE_ARCH 1 +#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) + +typedef uint64_t auxv_t; +typedef uint64_t tls_t; + +#endif /* _UAPI_S390_TYPES_H */ diff --git a/CRIU_code/criu/arch/s390/include/asm/vdso.h b/CRIU_code/criu/arch/s390/include/asm/vdso.h new file mode 100644 index 0000000..63e7e04 --- /dev/null +++ b/CRIU_code/criu/arch/s390/include/asm/vdso.h @@ -0,0 +1,23 @@ +#ifndef __CR_ASM_VDSO_H__ +#define __CR_ASM_VDSO_H__ + +#include "asm/int.h" +#include "asm-generic/vdso.h" + +/* + * This is a minimal amount of symbols + * we should support at the moment. 
+ */ +#define VDSO_SYMBOL_MAX 4 + +/* + * This definition is used in pie/util-vdso.c to initialize the vdso symbol + * name string table 'vdso_symbols' + */ +#define ARCH_VDSO_SYMBOLS \ + "__kernel_gettimeofday", \ + "__kernel_clock_gettime", \ + "__kernel_clock_getres", \ + "__kernel_getcpu" + +#endif /* __CR_ASM_VDSO_H__ */ diff --git a/CRIU_code/criu/arch/s390/restorer.c b/CRIU_code/criu/arch/s390/restorer.c new file mode 100644 index 0000000..3823fda --- /dev/null +++ b/CRIU_code/criu/arch/s390/restorer.c @@ -0,0 +1,37 @@ +#include + +#include "restorer.h" +#include "asm/restorer.h" +#include + +#include +#include "log.h" + +/* + * All registers are restored by sigreturn - nothing to do here + */ +int restore_nonsigframe_gpregs(UserS390RegsEntry *r) +{ + return 0; +} + +/* + * Call underlying ipc system call for shmat + */ +unsigned long sys_shmat(int shmid, const void *shmaddr, int shmflg) +{ + unsigned long raddr; + int ret; + + ret = sys_ipc(21 /*SHMAT */, + shmid, /* first */ + shmflg, /* second */ + (unsigned long)&raddr, /* third */ + shmaddr, /* ptr */ + 0 /* fifth not used */); + + if (ret) + raddr = (unsigned long) ret; + + return raddr; +} diff --git a/CRIU_code/criu/arch/s390/sigframe.c b/CRIU_code/criu/arch/s390/sigframe.c new file mode 100644 index 0000000..03f206a --- /dev/null +++ b/CRIU_code/criu/arch/s390/sigframe.c @@ -0,0 +1,20 @@ +#include +#include + +#include "asm/sigframe.h" +#include "asm/types.h" + +#include "log.h" + +/* + * Nothing to do since we don't have any pointers to adjust + * in the signal frame. 
+ * + * - sigframe : Pointer to local signal frame + * - rsigframe: Pointer to remote signal frame of inferior + */ +int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, + struct rt_sigframe *rsigframe) +{ + return 0; +} diff --git a/CRIU_code/criu/arch/s390/vdso-pie.c b/CRIU_code/criu/arch/s390/vdso-pie.c new file mode 100644 index 0000000..0667668 --- /dev/null +++ b/CRIU_code/criu/arch/s390/vdso-pie.c @@ -0,0 +1,65 @@ +#include + +#include "asm/types.h" + +#include +#include +#include "parasite-vdso.h" +#include "log.h" +#include "common/bug.h" + +#ifdef LOG_PREFIX +# undef LOG_PREFIX +#endif +#define LOG_PREFIX "vdso: " + +/* + * Trampoline instruction sequence + */ +typedef struct { + u8 larl[6]; /* Load relative address of imm64 */ + u8 lg[6]; /* Load %r1 with imm64 */ + u8 br[2]; /* Branch to %r1 */ + u64 addr; /* Jump address */ + u32 guards; /* Guard bytes */ +} __packed jmp_t; + +/* + * Trampoline template: Use %r1 to jump + */ +jmp_t jmp = { + /* larl %r1,e (addr) */ + .larl = {0xc0, 0x10, 0x00, 0x00, 0x00, 0x07}, + /* lg %r1,0(%r1) */ + .lg = {0xe3, 0x10, 0x10, 0x00, 0x00, 0x04}, + /* br %r1 */ + .br = {0x07, 0xf1}, + .guards = 0xcccccccc, +}; + +/* + * Insert trampoline code into old vdso entry points to + * jump to new vdso functions. 
+ */ +int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, + struct vdso_symtable *to, struct vdso_symtable *from, + bool __always_unused compat_vdso) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(to->symbols); i++) { + if (vdso_symbol_empty(&from->symbols[i])) + continue; + + pr_debug("jmp: %s: %lx/%lx -> %lx/%lx (index %d)\n", + from->symbols[i].name, base_from, + from->symbols[i].offset, + base_to, to->symbols[i].offset, i); + + jmp.addr = base_to + to->symbols[i].offset; + memcpy((void *)(base_from + from->symbols[i].offset), &jmp, + sizeof(jmp)); + } + + return 0; +} diff --git a/CRIU_code/criu/arch/x86/Makefile b/CRIU_code/criu/arch/x86/Makefile new file mode 100644 index 0000000..618e85b --- /dev/null +++ b/CRIU_code/criu/arch/x86/Makefile @@ -0,0 +1,14 @@ +builtin-name := crtools.built-in.o + +asflags-y += -Wstrict-prototypes +asflags-y += -nostdlib -fomit-frame-pointer +asflags-y += -iquote $(obj)/include +ldflags-y += -r -z noexecstack + +obj-y += cpu.o +obj-y += crtools.o +obj-y += kerndat.o +obj-y += sigframe.o +ifeq ($(CONFIG_COMPAT),y) + obj-y += sigaction_compat.o +endif diff --git a/CRIU_code/criu/arch/x86/cpu.c b/CRIU_code/criu/arch/x86/cpu.c new file mode 100644 index 0000000..3808b9d --- /dev/null +++ b/CRIU_code/criu/arch/x86/cpu.c @@ -0,0 +1,470 @@ +#include +#include +#include +#include +#include + +#include + +#include "bitops.h" +#include "asm/cpu.h" +#include +#include + +#include "common/compiler.h" + +#include "cr_options.h" +#include "image.h" +#include "util.h" +#include "log.h" + +#include "cpu.h" + +#include "protobuf.h" +#include "images/cpuinfo.pb-c.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +static compel_cpuinfo_t rt_cpu_info; + +static int cpu_has_unsupported_features(void) +{ + /* + * Put any unsupported features here. 
+ */ + return 0; +} + +int cpu_init(void) +{ + compel_cpu_copy_cpuinfo(&rt_cpu_info); + + BUILD_BUG_ON(sizeof(struct xsave_struct) != XSAVE_SIZE); + BUILD_BUG_ON(sizeof(struct i387_fxsave_struct) != FXSAVE_SIZE); + + /* + * Make sure that at least FPU is onboard + * and fxsave is supported. + */ + if (compel_cpu_has_feature(X86_FEATURE_FPU)) { + if (!compel_cpu_has_feature(X86_FEATURE_FXSR)) { + pr_err("missing support fxsave/restore insns\n"); + return -1; + } + } + + pr_debug("fpu:%d fxsr:%d xsave:%d xsaveopt:%d xsavec:%d xgetbv1:%d xsaves:%d\n", + !!compel_cpu_has_feature(X86_FEATURE_FPU), + !!compel_cpu_has_feature(X86_FEATURE_FXSR), + !!compel_cpu_has_feature(X86_FEATURE_OSXSAVE), + !!compel_cpu_has_feature(X86_FEATURE_XSAVEOPT), + !!compel_cpu_has_feature(X86_FEATURE_XSAVEC), + !!compel_cpu_has_feature(X86_FEATURE_XGETBV1), + !!compel_cpu_has_feature(X86_FEATURE_XSAVES)); + + return cpu_has_unsupported_features() ? -1 : 0; +} + +int cpu_dump_cpuinfo(void) +{ + CpuinfoEntry cpu_info = CPUINFO_ENTRY__INIT; + CpuinfoX86Entry cpu_x86_info = CPUINFO_X86_ENTRY__INIT; + CpuinfoX86Entry *cpu_x86_info_ptr = &cpu_x86_info; + struct cr_img *img; + + img = open_image(CR_FD_CPUINFO, O_DUMP); + if (!img) + return -1; + + cpu_info.x86_entry = &cpu_x86_info_ptr; + cpu_info.n_x86_entry = 1; + + cpu_x86_info.vendor_id = (rt_cpu_info.x86_vendor == X86_VENDOR_INTEL) ? 
+ CPUINFO_X86_ENTRY__VENDOR__INTEL : + CPUINFO_X86_ENTRY__VENDOR__AMD; + + cpu_x86_info.cpu_family = rt_cpu_info.x86_family; + cpu_x86_info.model = rt_cpu_info.x86_model; + cpu_x86_info.stepping = rt_cpu_info.x86_mask; + cpu_x86_info.capability_ver = 2; + cpu_x86_info.n_capability = ARRAY_SIZE(rt_cpu_info.x86_capability); + cpu_x86_info.capability = (void *)rt_cpu_info.x86_capability; + cpu_x86_info.has_xfeatures_mask = true; + cpu_x86_info.xfeatures_mask = rt_cpu_info.xfeatures_mask; + cpu_x86_info.has_xsave_size = true; + cpu_x86_info.xsave_size = rt_cpu_info.xsave_size; + cpu_x86_info.has_xsave_size_max = true; + cpu_x86_info.xsave_size_max = rt_cpu_info.xsave_size_max; + + if (rt_cpu_info.x86_model_id[0]) + cpu_x86_info.model_id = rt_cpu_info.x86_model_id; + + if (pb_write_one(img, &cpu_info, PB_CPUINFO) < 0) { + close_image(img); + return -1; + } + + close_image(img); + return 0; +} + +#define __ins_bit(__l, __v) (1u << ((__v) - 32u * (__l))) + +static uint32_t x86_ins_capability_mask[NCAPINTS] = { + [CPUID_1_EDX] = + __ins_bit(CPUID_1_EDX, X86_FEATURE_FPU) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_TSC) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_CX8) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_SEP) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_CMOV) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_CLFLUSH) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_MMX) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_FXSR) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_XMM) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_XMM2), + + [CPUID_8000_0001_EDX] = + __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_SYSCALL) | + __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_MMXEXT) | + __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_RDTSCP) | + __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_3DNOWEXT) | + __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_3DNOW), + + [CPUID_LNX_1] = + __ins_bit(CPUID_LNX_1, X86_FEATURE_REP_GOOD) | + __ins_bit(CPUID_LNX_1, X86_FEATURE_NOPL), + + [CPUID_1_ECX] = + __ins_bit(CPUID_1_ECX, X86_FEATURE_XMM3) | + __ins_bit(CPUID_1_ECX, 
X86_FEATURE_PCLMULQDQ) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_MWAIT) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_SSSE3) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_CX16) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_XMM4_1) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_XMM4_2) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_MOVBE) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_POPCNT) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_AES) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_XSAVE) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_OSXSAVE) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_AVX) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_F16C) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_RDRAND), + + [CPUID_8000_0001_ECX] = + __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_ABM) | + __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_SSE4A) | + __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_MISALIGNSSE) | + __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_3DNOWPREFETCH) | + __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_XOP) | + __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_FMA4) | + __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_TBM), + + [CPUID_7_0_EBX] = + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_FSGSBASE) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_BMI1) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_HLE) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX2) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_BMI2) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_ERMS) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_RTM) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_MPX) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512F) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512DQ) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_RDSEED) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_ADX) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_CLFLUSHOPT) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512PF) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512ER) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512CD) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_SHA_NI) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512BW) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512VL), + + 
[CPUID_D_1_EAX] = + __ins_bit(CPUID_D_1_EAX, X86_FEATURE_XSAVEOPT) | + __ins_bit(CPUID_D_1_EAX, X86_FEATURE_XSAVEC) | + __ins_bit(CPUID_D_1_EAX, X86_FEATURE_XGETBV1), + + [CPUID_7_0_ECX] = + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512VBMI) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_VBMI2) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_GFNI) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_VAES) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_VPCLMULQDQ) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_VNNI) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_BITALG) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_TME) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_VPOPCNTDQ) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_RDPID), + + [CPUID_8000_0008_EBX] = + __ins_bit(CPUID_8000_0008_EBX, X86_FEATURE_CLZERO), + + [CPUID_7_0_EDX] = + __ins_bit(CPUID_7_0_EDX, X86_FEATURE_AVX512_4VNNIW) | + __ins_bit(CPUID_7_0_EDX, X86_FEATURE_AVX512_4FMAPS), +}; + +#undef __ins_bit + +static int cpu_validate_ins_features(compel_cpuinfo_t *cpu_info) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(cpu_info->x86_capability); i++) { + uint32_t s = cpu_info->x86_capability[i] & x86_ins_capability_mask[i]; + uint32_t d = rt_cpu_info.x86_capability[i] & x86_ins_capability_mask[i]; + + /* + * Destination might be more feature rich + * but not the reverse. + */ + if (s & ~d) { + pr_err("CPU instruction capabilities do not match run time\n"); + return -1; + } + } + + return 0; +} + +static int cpu_validate_features(compel_cpuinfo_t *cpu_info) +{ + if (cpu_has_unsupported_features()) + return -1; + + if (opts.cpu_cap & CPU_CAP_FPU) { + /* + * If we're requested to check FPU only ignore + * any other bit. It's up to a user if the + * rest of mismatches won't cause problems. 
+ */ + +#define __mismatch_fpu_bit(__bit) \ + (test_bit(__bit, (void *)cpu_info->x86_capability) && \ + !compel_cpu_has_feature(__bit)) + if (__mismatch_fpu_bit(X86_FEATURE_FPU) || + __mismatch_fpu_bit(X86_FEATURE_FXSR) || + __mismatch_fpu_bit(X86_FEATURE_OSXSAVE) || + __mismatch_fpu_bit(X86_FEATURE_XSAVES)) { + pr_err("FPU feature required by image " + "is not supported on host " + "(fpu:%d fxsr:%d osxsave:%d xsaves:%d)\n", + __mismatch_fpu_bit(X86_FEATURE_FPU), + __mismatch_fpu_bit(X86_FEATURE_FXSR), + __mismatch_fpu_bit(X86_FEATURE_OSXSAVE), + __mismatch_fpu_bit(X86_FEATURE_XSAVES)); + return -1; + } +#undef __mismatch_fpu_bit + + /* + * Make sure the xsave features are compatible. We already hit the + * issue with libc where we've checkpointed the container on old + * machine but restored on more modern one and libc fetched new + * xsave frame size directly by xsave instruction with greedy + * feature mask causing programs to misbehave. + */ + if (cpu_info->xfeatures_mask > rt_cpu_info.xfeatures_mask) { + uint64_t m = cpu_info->xfeatures_mask & ~rt_cpu_info.xfeatures_mask; + pr_err("CPU xfeatures has unsupported bits (%#llx)\n", + (unsigned long long)m); + return -1; + } else if (cpu_info->xsave_size != rt_cpu_info.xsave_size) { + pr_err("CPU xsave size mismatch (%u/%u)\n", + cpu_info->xsave_size, rt_cpu_info.xsave_size); + return -1; + } else if (cpu_info->xsave_size_max != rt_cpu_info.xsave_size_max) { + pr_err("CPU xsave max size mismatch (%u/%u)\n", + cpu_info->xsave_size_max, rt_cpu_info.xsave_size_max); + return -1; + } + } + + /* + * Capability on instructions level only. + */ + if (opts.cpu_cap & CPU_CAP_INS) { + if (cpu_validate_ins_features(cpu_info)) + return -1; + } + + /* + * Strict capability mode. Everything must match. 
+ */ + if (opts.cpu_cap & CPU_CAP_CPU) { + if (memcmp(cpu_info->x86_capability, rt_cpu_info.x86_capability, + sizeof(cpu_info->x86_capability))) { + pr_err("CPU capabilities do not match run time\n"); + return -1; + } + } + + return 0; +} + +static const struct { + const uint32_t capability_ver; + const uint32_t ncapints; +} ncapints[] = { + { .capability_ver = 1, .ncapints = NCAPINTS_V1 }, + { .capability_ver = 2, .ncapints = NCAPINTS_V2 }, +}; + +static compel_cpuinfo_t *img_to_cpuinfo(CpuinfoX86Entry *img_x86_entry) +{ + compel_cpuinfo_t *cpu_info; + size_t size, i; + + BUILD_BUG_ON(sizeof(img_x86_entry->capability[0]) != + sizeof(cpu_info->x86_capability[0])); + BUILD_BUG_ON(ARRAY_SIZE(rt_cpu_info.x86_capability) != NCAPINTS); + + if (img_x86_entry->vendor_id != CPUINFO_X86_ENTRY__VENDOR__INTEL && + img_x86_entry->vendor_id != CPUINFO_X86_ENTRY__VENDOR__AMD) { + pr_err("Image carries unknown vendor %u\n", + (unsigned)img_x86_entry->vendor_id); + return NULL; + } + + for (i = 0; i < ARRAY_SIZE(ncapints); i++) { + if (img_x86_entry->capability_ver == ncapints[i].capability_ver) { + if (img_x86_entry->n_capability != ncapints[i].ncapints) { + pr_err("Image carries %u words while %u expected\n", + (unsigned)img_x86_entry->n_capability, + (unsigned)ncapints[i].ncapints); + return NULL; + } + break; + } + } + + if (i >= ARRAY_SIZE(ncapints)) { + pr_err("Image carries unknown capability version %d\n", + (unsigned)img_x86_entry->capability_ver); + return NULL; + } + + cpu_info = xzalloc(sizeof(*cpu_info)); + if (!cpu_info) + return NULL; + + /* + * Copy caps from image and fill the left ones from + * run-time information for easier compatibility testing. 
+ */ + size = sizeof(img_x86_entry->capability[0]) * img_x86_entry->n_capability; + memcpy(cpu_info->x86_capability, img_x86_entry->capability, size); + if (img_x86_entry->capability_ver == 1) { + memcpy(&cpu_info->x86_capability[NCAPINTS_V1], + &rt_cpu_info.x86_capability[NCAPINTS_V1], + (NCAPINTS_V2 - NCAPINTS_V1) * sizeof(rt_cpu_info.x86_capability[0])); + } + + if (img_x86_entry->vendor_id == CPUINFO_X86_ENTRY__VENDOR__INTEL) + cpu_info->x86_vendor = X86_VENDOR_INTEL; + else + cpu_info->x86_vendor = X86_VENDOR_AMD; + cpu_info->x86_family = img_x86_entry->cpu_family; + cpu_info->x86_model = img_x86_entry->model; + cpu_info->x86_mask = img_x86_entry->stepping; + cpu_info->extended_cpuid_level = rt_cpu_info.extended_cpuid_level; + cpu_info->cpuid_level = rt_cpu_info.cpuid_level; + cpu_info->x86_power = rt_cpu_info.x86_power; + + memcpy(cpu_info->x86_vendor_id, rt_cpu_info.x86_vendor_id, sizeof(cpu_info->x86_vendor_id)); + strncpy(cpu_info->x86_model_id, img_x86_entry->model_id ? img_x86_entry->model_id : "", sizeof(cpu_info->x86_model_id) - 1); + + /* + * For old images where no xfeatures_mask present we + * simply fetch runtime cpu mask because later we will + * do either instruction capability check, either strict + * check for capabilities. + */ + if (!img_x86_entry->has_xfeatures_mask) { + cpu_info->xfeatures_mask = rt_cpu_info.xfeatures_mask; + } else + cpu_info->xfeatures_mask = img_x86_entry->xfeatures_mask; + + /* + * Same for other fields. 
+ */ + if (!img_x86_entry->has_xsave_size) + cpu_info->xsave_size = rt_cpu_info.xsave_size; + else + cpu_info->xsave_size = img_x86_entry->xsave_size; + if (!img_x86_entry->has_xsave_size_max) + cpu_info->xsave_size_max = rt_cpu_info.xsave_size_max; + else + cpu_info->xsave_size_max = img_x86_entry->xsave_size_max; + + return cpu_info; +} + +int cpu_validate_cpuinfo(void) +{ + compel_cpuinfo_t *cpu_info = NULL; + CpuinfoX86Entry *img_x86_entry; + CpuinfoEntry *img_cpu_info; + struct cr_img *img; + int ret = -1; + + img = open_image(CR_FD_CPUINFO, O_RSTR); + if (!img) + return -1; + + if (pb_read_one(img, &img_cpu_info, PB_CPUINFO) < 0) + goto err; + + if (img_cpu_info->n_x86_entry != 1) { + pr_err("No x86 related cpuinfo in image, " + "corruption (n_x86_entry = %zi)\n", + img_cpu_info->n_x86_entry); + goto err; + } + + img_x86_entry = img_cpu_info->x86_entry[0]; + if (img_x86_entry->vendor_id != CPUINFO_X86_ENTRY__VENDOR__INTEL && + img_x86_entry->vendor_id != CPUINFO_X86_ENTRY__VENDOR__AMD) { + pr_err("Unknown cpu vendor %d\n", img_x86_entry->vendor_id); + goto err; + } + + cpu_info = img_to_cpuinfo(img_x86_entry); + if (cpu_info) + ret = cpu_validate_features(cpu_info); +err: + xfree(cpu_info); + close_image(img); + return ret; +} + +int cpuinfo_dump(void) +{ + if (cpu_init()) + return -1; + if (cpu_dump_cpuinfo()) + return -1; + return 0; +} + +int cpuinfo_check(void) +{ + if (cpu_init()) + return 1; + + /* + * Force to check all caps if empty passed, + * still allow to check instructions only + * and etc. 
+ */ + if (opts.cpu_cap == CPU_CAP_NONE) + opts.cpu_cap = CPU_CAP_ALL; + + if (cpu_validate_cpuinfo()) + return 1; + + return 0; +} diff --git a/CRIU_code/criu/arch/x86/crtools.c b/CRIU_code/criu/arch/x86/crtools.c new file mode 100644 index 0000000..efc23e5 --- /dev/null +++ b/CRIU_code/criu/arch/x86/crtools.c @@ -0,0 +1,638 @@ +#include "compel/asm/fpu.h" +#include "compel/compel.h" +#include "compel/plugins/std/syscall-codes.h" +#include "cpu.h" +#include "cr_options.h" +#include "images/core.pb-c.h" +#include "log.h" +#include "protobuf.h" +#include "types.h" + +#include "asm/compat.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "x86: " + +#define XSAVE_PB_NELEMS(__s, __obj, __member) \ + (sizeof(__s) / sizeof(*(__obj)->__member)) + +int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +{ + CoreEntry *core = x; + UserX86RegsEntry *gpregs = core->thread_info->gpregs; + +#define assign_reg(dst, src, e) do { dst->e = (__typeof__(dst->e))src.e; } while (0) +#define assign_array(dst, src, e) memcpy(dst->e, &src.e, sizeof(src.e)) +#define assign_xsave(feature, xsave, member, area) \ + do { \ + if (compel_fpu_has_feature(feature)) { \ + uint32_t off = compel_fpu_feature_offset(feature); \ + void *from = &area[off]; \ + size_t size = pb_repeated_size(xsave, member); \ + size_t xsize = (size_t)compel_fpu_feature_size(feature); \ + if (xsize != size) { \ + pr_err("%s reported %zu bytes (expecting %zu)\n", \ + # feature, xsize, size); \ + return -1; \ + } \ + memcpy(xsave->member, from, size); \ + } \ + } while (0) + + if (user_regs_native(regs)) { + assign_reg(gpregs, regs->native, r15); + assign_reg(gpregs, regs->native, r14); + assign_reg(gpregs, regs->native, r13); + assign_reg(gpregs, regs->native, r12); + assign_reg(gpregs, regs->native, bp); + assign_reg(gpregs, regs->native, bx); + assign_reg(gpregs, regs->native, r11); + assign_reg(gpregs, regs->native, r10); + assign_reg(gpregs, regs->native, r9); + assign_reg(gpregs, regs->native, 
r8); + assign_reg(gpregs, regs->native, ax); + assign_reg(gpregs, regs->native, cx); + assign_reg(gpregs, regs->native, dx); + assign_reg(gpregs, regs->native, si); + assign_reg(gpregs, regs->native, di); + assign_reg(gpregs, regs->native, orig_ax); + assign_reg(gpregs, regs->native, ip); + assign_reg(gpregs, regs->native, cs); + assign_reg(gpregs, regs->native, flags); + assign_reg(gpregs, regs->native, sp); + assign_reg(gpregs, regs->native, ss); + assign_reg(gpregs, regs->native, fs_base); + assign_reg(gpregs, regs->native, gs_base); + assign_reg(gpregs, regs->native, ds); + assign_reg(gpregs, regs->native, es); + assign_reg(gpregs, regs->native, fs); + assign_reg(gpregs, regs->native, gs); + gpregs->mode = USER_X86_REGS_MODE__NATIVE; + } else { + assign_reg(gpregs, regs->compat, bx); + assign_reg(gpregs, regs->compat, cx); + assign_reg(gpregs, regs->compat, dx); + assign_reg(gpregs, regs->compat, si); + assign_reg(gpregs, regs->compat, di); + assign_reg(gpregs, regs->compat, bp); + assign_reg(gpregs, regs->compat, ax); + assign_reg(gpregs, regs->compat, ds); + assign_reg(gpregs, regs->compat, es); + assign_reg(gpregs, regs->compat, fs); + assign_reg(gpregs, regs->compat, gs); + assign_reg(gpregs, regs->compat, orig_ax); + assign_reg(gpregs, regs->compat, ip); + assign_reg(gpregs, regs->compat, cs); + assign_reg(gpregs, regs->compat, flags); + assign_reg(gpregs, regs->compat, sp); + assign_reg(gpregs, regs->compat, ss); + gpregs->mode = USER_X86_REGS_MODE__COMPAT; + } + gpregs->has_mode = true; + + if (!fpregs) + return 0; + + assign_reg(core->thread_info->fpregs, fpregs->i387, cwd); + assign_reg(core->thread_info->fpregs, fpregs->i387, swd); + assign_reg(core->thread_info->fpregs, fpregs->i387, twd); + assign_reg(core->thread_info->fpregs, fpregs->i387, fop); + assign_reg(core->thread_info->fpregs, fpregs->i387, rip); + assign_reg(core->thread_info->fpregs, fpregs->i387, rdp); + assign_reg(core->thread_info->fpregs, fpregs->i387, mxcsr); + 
assign_reg(core->thread_info->fpregs, fpregs->i387, mxcsr_mask); + + /* Make sure we have enough space */ + BUG_ON(core->thread_info->fpregs->n_st_space != ARRAY_SIZE(fpregs->i387.st_space)); + BUG_ON(core->thread_info->fpregs->n_xmm_space != ARRAY_SIZE(fpregs->i387.xmm_space)); + + assign_array(core->thread_info->fpregs, fpregs->i387, st_space); + assign_array(core->thread_info->fpregs, fpregs->i387, xmm_space); + + if (compel_cpu_has_feature(X86_FEATURE_OSXSAVE)) { + UserX86XsaveEntry *xsave = core->thread_info->fpregs->xsave; + uint8_t *extended_state_area = (void *)fpregs; + + /* + * xcomp_bv is designated for compacted format but user + * space never use it, thus we can simply ignore. + */ + assign_reg(xsave, fpregs->xsave_hdr, xstate_bv); + + assign_xsave(XFEATURE_YMM, xsave, ymmh_space, extended_state_area); + assign_xsave(XFEATURE_BNDREGS, xsave, bndreg_state, extended_state_area); + assign_xsave(XFEATURE_BNDCSR, xsave, bndcsr_state, extended_state_area); + assign_xsave(XFEATURE_OPMASK, xsave, opmask_reg, extended_state_area); + assign_xsave(XFEATURE_ZMM_Hi256,xsave, zmm_upper, extended_state_area); + assign_xsave(XFEATURE_Hi16_ZMM, xsave, hi16_zmm, extended_state_area); + assign_xsave(XFEATURE_PKRU, xsave, pkru, extended_state_area); + } + +#undef assign_reg +#undef assign_array +#undef assign_xsave + + return 0; +} + +static void alloc_tls(ThreadInfoX86 *ti, void **mempool) +{ + int i; + + ti->tls = xptr_pull_s(mempool, GDT_ENTRY_TLS_NUM*sizeof(UserDescT*)); + ti->n_tls = GDT_ENTRY_TLS_NUM; + for (i = 0; i < GDT_ENTRY_TLS_NUM; i++) { + ti->tls[i] = xptr_pull(mempool, UserDescT); + user_desc_t__init(ti->tls[i]); + } +} + +static int alloc_xsave_extends(UserX86XsaveEntry *xsave) +{ + if (compel_fpu_has_feature(XFEATURE_YMM)) { + xsave->n_ymmh_space = XSAVE_PB_NELEMS(struct ymmh_struct, xsave, ymmh_space); + xsave->ymmh_space = xzalloc(pb_repeated_size(xsave, ymmh_space)); + if (!xsave->ymmh_space) + goto err; + } + + if 
(compel_fpu_has_feature(XFEATURE_BNDREGS)) { + xsave->n_bndreg_state = XSAVE_PB_NELEMS(struct mpx_bndreg_state, xsave, bndreg_state); + xsave->bndreg_state = xzalloc(pb_repeated_size(xsave, bndreg_state)); + if (!xsave->bndreg_state) + goto err; + } + + if (compel_fpu_has_feature(XFEATURE_BNDCSR)) { + xsave->n_bndcsr_state = XSAVE_PB_NELEMS(struct mpx_bndcsr_state, xsave, bndcsr_state); + xsave->bndcsr_state = xzalloc(pb_repeated_size(xsave, bndcsr_state)); + if (!xsave->bndcsr_state) + goto err; + } + + if (compel_fpu_has_feature(XFEATURE_OPMASK)) { + xsave->n_opmask_reg = XSAVE_PB_NELEMS(struct avx_512_opmask_state, xsave, opmask_reg); + xsave->opmask_reg = xzalloc(pb_repeated_size(xsave, opmask_reg)); + if (!xsave->opmask_reg) + goto err; + } + + if (compel_fpu_has_feature(XFEATURE_ZMM_Hi256)) { + xsave->n_zmm_upper = XSAVE_PB_NELEMS(struct avx_512_zmm_uppers_state, xsave, zmm_upper); + xsave->zmm_upper = xzalloc(pb_repeated_size(xsave, zmm_upper)); + if (!xsave->zmm_upper) + goto err; + } + + if (compel_fpu_has_feature(XFEATURE_Hi16_ZMM)) { + xsave->n_hi16_zmm = XSAVE_PB_NELEMS(struct avx_512_hi16_state, xsave, hi16_zmm); + xsave->hi16_zmm = xzalloc(pb_repeated_size(xsave, hi16_zmm)); + if (!xsave->hi16_zmm) + goto err; + } + + if (compel_fpu_has_feature(XFEATURE_PKRU)) { + xsave->n_pkru = XSAVE_PB_NELEMS(struct pkru_state, xsave, pkru); + xsave->pkru = xzalloc(pb_repeated_size(xsave, pkru)); + if (!xsave->pkru) + goto err; + } + + return 0; +err: + return -1; +} + +int arch_alloc_thread_info(CoreEntry *core) +{ + size_t sz; + bool with_fpu, with_xsave = false; + void *m; + ThreadInfoX86 *ti = NULL; + + + with_fpu = compel_cpu_has_feature(X86_FEATURE_FPU); + + sz = sizeof(ThreadInfoX86) + sizeof(UserX86RegsEntry) + + GDT_ENTRY_TLS_NUM*sizeof(UserDescT) + + GDT_ENTRY_TLS_NUM*sizeof(UserDescT*); + if (with_fpu) { + sz += sizeof(UserX86FpregsEntry); + with_xsave = compel_cpu_has_feature(X86_FEATURE_OSXSAVE); + if (with_xsave) + sz += sizeof(UserX86XsaveEntry); + } 
+ + m = xmalloc(sz); + if (!m) + return -1; + + ti = core->thread_info = xptr_pull(&m, ThreadInfoX86); + thread_info_x86__init(ti); + ti->gpregs = xptr_pull(&m, UserX86RegsEntry); + user_x86_regs_entry__init(ti->gpregs); + alloc_tls(ti, &m); + + if (with_fpu) { + UserX86FpregsEntry *fpregs; + + fpregs = ti->fpregs = xptr_pull(&m, UserX86FpregsEntry); + user_x86_fpregs_entry__init(fpregs); + + /* These are numbers from kernel */ + fpregs->n_st_space = 32; + fpregs->n_xmm_space = 64; + + fpregs->st_space = xzalloc(pb_repeated_size(fpregs, st_space)); + fpregs->xmm_space = xzalloc(pb_repeated_size(fpregs, xmm_space)); + + if (!fpregs->st_space || !fpregs->xmm_space) + goto err; + + if (with_xsave) { + UserX86XsaveEntry *xsave; + + xsave = fpregs->xsave = xptr_pull(&m, UserX86XsaveEntry); + user_x86_xsave_entry__init(xsave); + + if (alloc_xsave_extends(xsave)) + goto err; + } + } + + return 0; +err: + return -1; +} + +void arch_free_thread_info(CoreEntry *core) +{ + if (!core->thread_info) + return; + + if (core->thread_info->fpregs->xsave) { + xfree(core->thread_info->fpregs->xsave->ymmh_space); + xfree(core->thread_info->fpregs->xsave->pkru); + xfree(core->thread_info->fpregs->xsave->hi16_zmm); + xfree(core->thread_info->fpregs->xsave->zmm_upper); + xfree(core->thread_info->fpregs->xsave->opmask_reg); + xfree(core->thread_info->fpregs->xsave->bndcsr_state); + xfree(core->thread_info->fpregs->xsave->bndreg_state); + } + + xfree(core->thread_info->fpregs->st_space); + xfree(core->thread_info->fpregs->xmm_space); + xfree(core->thread_info); +} + +static bool valid_xsave_frame(CoreEntry *core) +{ + UserX86XsaveEntry *xsave = core->thread_info->fpregs->xsave; + struct xsave_struct *x = NULL; + + if (core->thread_info->fpregs->n_st_space < ARRAY_SIZE(x->i387.st_space)) { + pr_err("Corruption in FPU st_space area " + "(got %li but %li expected)\n", + (long)core->thread_info->fpregs->n_st_space, + (long)ARRAY_SIZE(x->i387.st_space)); + return false; + } + + if 
(core->thread_info->fpregs->n_xmm_space < ARRAY_SIZE(x->i387.xmm_space)) { + pr_err("Corruption in FPU xmm_space area " + "(got %li but %li expected)\n", + (long)core->thread_info->fpregs->n_xmm_space, + (long)ARRAY_SIZE(x->i387.xmm_space)); + return false; + } + + if (compel_cpu_has_feature(X86_FEATURE_OSXSAVE)) { + if (xsave) { + size_t i; + struct { + const char *name; + size_t expected; + size_t obtained; + void *ptr; + } features[] = { + { + .name = __stringify_1(XFEATURE_YMM), + .expected = XSAVE_PB_NELEMS(struct ymmh_struct, xsave, ymmh_space), + .obtained = xsave->n_ymmh_space, + .ptr = xsave->ymmh_space, + }, { + .name = __stringify_1(XFEATURE_BNDREGS), + .expected = XSAVE_PB_NELEMS(struct mpx_bndreg_state, xsave, bndreg_state), + .obtained = xsave->n_bndreg_state, + .ptr = xsave->bndreg_state, + }, { + .name = __stringify_1(XFEATURE_BNDCSR), + .expected = XSAVE_PB_NELEMS(struct mpx_bndcsr_state, xsave, bndcsr_state), + .obtained = xsave->n_bndcsr_state, + .ptr = xsave->bndcsr_state, + }, { + .name = __stringify_1(XFEATURE_OPMASK), + .expected = XSAVE_PB_NELEMS(struct avx_512_opmask_state, xsave, opmask_reg), + .obtained = xsave->n_opmask_reg, + .ptr = xsave->opmask_reg, + }, { + .name = __stringify_1(XFEATURE_ZMM_Hi256), + .expected = XSAVE_PB_NELEMS(struct avx_512_zmm_uppers_state, xsave, zmm_upper), + .obtained = xsave->n_zmm_upper, + .ptr = xsave->zmm_upper, + }, { + .name = __stringify_1(XFEATURE_Hi16_ZMM), + .expected = XSAVE_PB_NELEMS(struct avx_512_hi16_state, xsave, hi16_zmm), + .obtained = xsave->n_hi16_zmm, + .ptr = xsave->hi16_zmm, + }, { + .name = __stringify_1(XFEATURE_PKRU), + .expected = XSAVE_PB_NELEMS(struct pkru_state, xsave, pkru), + .obtained = xsave->n_pkru, + .ptr = xsave->pkru, + }, + }; + + for (i = 0; i < ARRAY_SIZE(features); i++) { + if (!features[i].ptr) + continue; + + if (features[i].expected > features[i].obtained) { + pr_err("Corruption in %s area (expected %zu but %zu obtained)\n", + features[i].name, features[i].expected, 
features[i].obtained); + return false; + } + } + } + } else { + /* + * If the image has xsave area present then CPU we're restoring + * on must have X86_FEATURE_OSXSAVE feature until explicitly + * stated in options. + */ + if (xsave) { + if (opts.cpu_cap & CPU_CAP_FPU) { + pr_err("FPU xsave area present, " + "but host cpu doesn't support it\n"); + return false; + } else + pr_warn_once("FPU is about to restore ignoring xsave state!\n"); + } + } + + return true; +} + +static void show_rt_xsave_frame(struct xsave_struct *x) +{ + struct fpx_sw_bytes *fpx = (void *)&x->i387.sw_reserved; + struct xsave_hdr_struct *xsave_hdr = &x->xsave_hdr; + struct i387_fxsave_struct *i387 = &x->i387; + + pr_debug("xsave runtime structure\n"); + pr_debug("-----------------------\n"); + + pr_debug("cwd:%#x swd:%#x twd:%#x fop:%#x mxcsr:%#x mxcsr_mask:%#x\n", + (int)i387->cwd, (int)i387->swd, (int)i387->twd, + (int)i387->fop, (int)i387->mxcsr, (int)i387->mxcsr_mask); + + pr_debug("magic1:%#x extended_size:%u xstate_bv:%#lx xstate_size:%u\n", + fpx->magic1, fpx->extended_size, (long)fpx->xstate_bv, fpx->xstate_size); + pr_debug("xstate_bv: %#lx\n", (long)xsave_hdr->xstate_bv); + + pr_debug("-----------------------\n"); +} + +int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) +{ + fpu_state_t *fpu_state = core_is_compat(core) ? + &sigframe->compat.fpu_state : + &sigframe->native.fpu_state; + struct xsave_struct *x = core_is_compat(core) ? + (void *)&fpu_state->fpu_state_ia32.xsave : + (void *)&fpu_state->fpu_state_64.xsave; + + /* + * If no FPU information provided -- we're restoring + * old image which has no FPU support, or the dump simply + * has no FPU support at all. 
+ */ + if (!core->thread_info->fpregs) { + fpu_state->has_fpu = false; + return 0; + } + + if (!valid_xsave_frame(core)) + return -1; + + fpu_state->has_fpu = true; + +#define assign_reg(dst, src, e) do { dst.e = (__typeof__(dst.e))src->e; } while (0) +#define assign_array(dst, src, e) memcpy(dst.e, (src)->e, sizeof(dst.e)) +#define assign_xsave(feature, xsave, member, area) \ + do { \ + if (compel_fpu_has_feature(feature)) { \ + uint32_t off = compel_fpu_feature_offset(feature); \ + void *to = &area[off]; \ + void *from = xsave->member; \ + size_t size = pb_repeated_size(xsave, member); \ + size_t xsize = (size_t)compel_fpu_feature_size(feature); \ + if (xsize != size) { \ + if (size) { \ + pr_err("%s reported %zu bytes (expecting %zu)\n",\ + # feature, xsize, size); \ + return -1; \ + } else { \ + pr_debug("%s is not present in image, ignore\n",\ + # feature); \ + } \ + } \ + xstate_bv |= (1UL << feature); \ + xstate_size += xsize; \ + memcpy(to, from, size); \ + } \ + } while (0) + + assign_reg(x->i387, core->thread_info->fpregs, cwd); + assign_reg(x->i387, core->thread_info->fpregs, swd); + assign_reg(x->i387, core->thread_info->fpregs, twd); + assign_reg(x->i387, core->thread_info->fpregs, fop); + assign_reg(x->i387, core->thread_info->fpregs, rip); + assign_reg(x->i387, core->thread_info->fpregs, rdp); + assign_reg(x->i387, core->thread_info->fpregs, mxcsr); + assign_reg(x->i387, core->thread_info->fpregs, mxcsr_mask); + + assign_array(x->i387, core->thread_info->fpregs, st_space); + assign_array(x->i387, core->thread_info->fpregs, xmm_space); + + if (core_is_compat(core)) + compel_convert_from_fxsr(&fpu_state->fpu_state_ia32.fregs_state.i387_ia32, + &fpu_state->fpu_state_ia32.xsave.i387); + + if (compel_cpu_has_feature(X86_FEATURE_OSXSAVE)) { + struct fpx_sw_bytes *fpx_sw = (void *)&x->i387.sw_reserved; + size_t xstate_size = XSAVE_YMM_OFFSET; + uint32_t xstate_bv = 0; + void *magic2; + + xstate_bv = XFEATURE_MASK_FP | XFEATURE_MASK_SSE; + + /* + * 
fpregs->xsave pointer might not present on image so we + * simply clear out everything. + */ + if (core->thread_info->fpregs->xsave) { + UserX86XsaveEntry *xsave = core->thread_info->fpregs->xsave; + uint8_t *extended_state_area = (void *)x; + + assign_xsave(XFEATURE_YMM, xsave, ymmh_space, extended_state_area); + assign_xsave(XFEATURE_BNDREGS, xsave, bndreg_state, extended_state_area); + assign_xsave(XFEATURE_BNDCSR, xsave, bndcsr_state, extended_state_area); + assign_xsave(XFEATURE_OPMASK, xsave, opmask_reg, extended_state_area); + assign_xsave(XFEATURE_ZMM_Hi256,xsave, zmm_upper, extended_state_area); + assign_xsave(XFEATURE_Hi16_ZMM, xsave, hi16_zmm, extended_state_area); + assign_xsave(XFEATURE_PKRU, xsave, pkru, extended_state_area); + } + + x->xsave_hdr.xstate_bv = xstate_bv; + + fpx_sw->magic1 = FP_XSTATE_MAGIC1; + fpx_sw->xstate_bv = xstate_bv; + fpx_sw->xstate_size = xstate_size; + fpx_sw->extended_size = xstate_size + FP_XSTATE_MAGIC2_SIZE; + + /* + * This should be at the end of xsave frame. 
+ */ + magic2 = (void *)x + xstate_size; + *(u32 *)magic2 = FP_XSTATE_MAGIC2; + } + + show_rt_xsave_frame(x); + +#undef assign_reg +#undef assign_array +#undef assign_xsave + + return 0; +} + +#define CPREG32(d) f->compat.uc.uc_mcontext.d = r->d +static void restore_compat_gpregs(struct rt_sigframe *f, UserX86RegsEntry *r) +{ + CPREG32(gs); + CPREG32(fs); + CPREG32(es); + CPREG32(ds); + + CPREG32(di); CPREG32(si); CPREG32(bp); CPREG32(sp); CPREG32(bx); + CPREG32(dx); CPREG32(cx); CPREG32(ip); CPREG32(ax); + CPREG32(cs); + CPREG32(ss); + CPREG32(flags); + + f->is_native = false; +} +#undef CPREG32 + +#define CPREG64(d, s) f->native.uc.uc_mcontext.d = r->s +static void restore_native_gpregs(struct rt_sigframe *f, UserX86RegsEntry *r) +{ + CPREG64(rdi, di); + CPREG64(rsi, si); + CPREG64(rbp, bp); + CPREG64(rsp, sp); + CPREG64(rbx, bx); + CPREG64(rdx, dx); + CPREG64(rcx, cx); + CPREG64(rip, ip); + CPREG64(rax, ax); + + CPREG64(r8, r8); + CPREG64(r9, r9); + CPREG64(r10, r10); + CPREG64(r11, r11); + CPREG64(r12, r12); + CPREG64(r13, r13); + CPREG64(r14, r14); + CPREG64(r15, r15); + + CPREG64(cs, cs); + + CPREG64(eflags, flags); + + f->is_native = true; +} +#undef CPREG64 + +int restore_gpregs(struct rt_sigframe *f, UserX86RegsEntry *r) +{ + switch (r->mode) { + case USER_X86_REGS_MODE__NATIVE: + restore_native_gpregs(f, r); + break; + case USER_X86_REGS_MODE__COMPAT: + restore_compat_gpregs(f, r); + break; + default: + pr_err("Can't prepare rt_sigframe: registers mode corrupted (%d)\n", r->mode); + return -1; + } + return 0; +} + +static int get_robust_list32(pid_t pid, uintptr_t head, uintptr_t len) +{ + struct syscall_args32 s = { + .nr = __NR32_get_robust_list, + .arg0 = pid, + .arg1 = (uint32_t)head, + .arg2 = (uint32_t)len, + }; + + do_full_int80(&s); + return (int)s.nr; +} + +static int set_robust_list32(uint32_t head, uint32_t len) +{ + struct syscall_args32 s = { + .nr = __NR32_set_robust_list, + .arg0 = head, + .arg1 = len, + }; + + do_full_int80(&s); + return 
(int)s.nr; +} + +int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info) +{ + void *mmap32; + int ret = -1; + + mmap32 = alloc_compat_syscall_stack(); + if (!mmap32) + return -1; + + ret = get_robust_list32(pid, (uintptr_t)mmap32, (uintptr_t)mmap32 + 4); + + if (ret == -ENOSYS) { + /* Check native get_task_futex_robust_list() for details. */ + if (set_robust_list32(0, 0) == (uint32_t)-ENOSYS) { + info->futex_rla = 0; + info->futex_rla_len = 0; + ret = 0; + } + } else if (ret == 0) { + uint32_t *arg1 = (uint32_t*)mmap32; + + info->futex_rla = *arg1; + info->futex_rla_len = *(arg1 + 1); + ret = 0; + } + + + free_compat_syscall_stack(mmap32); + return ret; +} diff --git a/CRIU_code/criu/arch/x86/include/asm/compat.h b/CRIU_code/criu/arch/x86/include/asm/compat.h new file mode 100644 index 0000000..cd1ae47 --- /dev/null +++ b/CRIU_code/criu/arch/x86/include/asm/compat.h @@ -0,0 +1,68 @@ +#ifndef __CR_ASM_COMPAT_H__ +#define __CR_ASM_COMPAT_H__ + +#ifdef CR_NOGLIBC +# include +# include +#else +# define sys_mmap mmap +# define sys_munmap munmap +#endif + +#include + +static inline void *alloc_compat_syscall_stack(void) +{ + void *mem = (void*)sys_mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_32BIT | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + + if ((uintptr_t)mem % PAGE_SIZE) { + int err = (~(uint32_t)(uintptr_t)mem) + 1; + + pr_err("mmap() of compat syscall stack failed with %d\n", err); + return 0; + } + return mem; +} + +static inline void free_compat_syscall_stack(void *mem) +{ + long int ret = sys_munmap(mem, PAGE_SIZE); + + if (ret) + pr_err("munmap() of compat addr %p failed with %ld\n", + mem, ret); +} + +struct syscall_args32 { + uint32_t nr, arg0, arg1, arg2, arg3, arg4, arg5; +}; + +static inline void do_full_int80(struct syscall_args32 *args) +{ + /* + * r8-r11 registers are cleared during returning to userspace + * from syscall - that's x86_64 ABI to avoid leaking kernel + * pointers. 
+ * + * Other than that - %rbp can't be listed in clobbers (GCC's inline asm + * forbids it), so it is saved before the syscall and restored afterward. + * "memory" is clobbered as the syscall may access buffers via pointers. + */ + asm volatile ("pushq %%rbp\n\t" + "mov %6, %%ebp\n\t" + "int $0x80\n\t" + "mov %%ebp, %6\n\t" + "popq %%rbp\n\t" + : "+a" (args->nr), + "+b" (args->arg0), "+c" (args->arg1), "+d" (args->arg2), + "+S" (args->arg3), "+D" (args->arg4), "+g" (args->arg5) + : : "r8", "r9", "r10", "r11", "memory"); +} + +#ifndef CR_NOGLIBC +# undef sys_mmap +# undef sys_munmap +#endif + +#endif diff --git a/CRIU_code/criu/arch/x86/include/asm/dump.h b/CRIU_code/criu/arch/x86/include/asm/dump.h new file mode 100644 index 0000000..c79e0df --- /dev/null +++ b/CRIU_code/criu/arch/x86/include/asm/dump.h @@ -0,0 +1,34 @@ +#ifndef __CR_ASM_DUMP_H__ +#define __CR_ASM_DUMP_H__ + +extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int arch_alloc_thread_info(CoreEntry *core); +extern void arch_free_thread_info(CoreEntry *core); +extern int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info); + +static inline void core_put_tls(CoreEntry *core, tls_t tls) +{ + ThreadInfoX86 *ti = core->thread_info; + int i; + + for (i = 0; i < GDT_ENTRY_TLS_NUM; i++) + { + user_desc_t *from = &tls.desc[i]; + UserDescT *to = ti->tls[i]; + +#define COPY_TLS(field) to->field = from->field + COPY_TLS(entry_number); + COPY_TLS(base_addr); + COPY_TLS(limit); + COPY_TLS(seg_32bit); + to->contents_h = from->contents & 0x2; + to->contents_l = from->contents & 0x1; + COPY_TLS(read_exec_only); + COPY_TLS(limit_in_pages); + COPY_TLS(seg_not_present); + COPY_TLS(useable); +#undef COPY_TLS + } +} + +#endif diff --git a/CRIU_code/criu/arch/x86/include/asm/int.h b/CRIU_code/criu/arch/x86/include/asm/int.h new file mode 100644 index 0000000..642804e --- /dev/null +++ b/CRIU_code/criu/arch/x86/include/asm/int.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_INT_H__ +#define __CR_ASM_INT_H__ + +#include
"asm-generic/int.h" + +#endif /* __CR_ASM_INT_H__ */ diff --git a/CRIU_code/criu/arch/x86/include/asm/kerndat.h b/CRIU_code/criu/arch/x86/include/asm/kerndat.h new file mode 100644 index 0000000..903bc80 --- /dev/null +++ b/CRIU_code/criu/arch/x86/include/asm/kerndat.h @@ -0,0 +1,8 @@ +#ifndef __CR_ASM_KERNDAT_H__ +#define __CR_ASM_KERNDAT_H__ + +extern int kdat_compatible_cr(void); +extern int kdat_can_map_vdso(void); +extern int kdat_x86_has_ptrace_fpu_xsave_bug(void); + +#endif /* __CR_ASM_KERNDAT_H__ */ diff --git a/CRIU_code/criu/arch/x86/include/asm/parasite-syscall.h b/CRIU_code/criu/arch/x86/include/asm/parasite-syscall.h new file mode 100644 index 0000000..a2b5e75 --- /dev/null +++ b/CRIU_code/criu/arch/x86/include/asm/parasite-syscall.h @@ -0,0 +1,8 @@ +#ifndef __CR_ASM_PARASITE_SYSCALL_H__ +#define __CR_ASM_PARASITE_SYSCALL_H__ + +#include "asm/types.h" + +struct parasite_ctl; + +#endif diff --git a/CRIU_code/criu/arch/x86/include/asm/parasite.h b/CRIU_code/criu/arch/x86/include/asm/parasite.h new file mode 100644 index 0000000..6b4d4ac --- /dev/null +++ b/CRIU_code/criu/arch/x86/include/asm/parasite.h @@ -0,0 +1,77 @@ +#ifndef __ASM_PARASITE_H__ +#define __ASM_PARASITE_H__ + +#include +#include +#include "asm/compat.h" + +static int arch_get_user_desc(user_desc_t *desc) +{ + int ret = __NR32_get_thread_area; + /* + * For 64-bit applications, TLS (fs_base for Glibc) is + * in MSR, which are dumped with the help of arch_prctl(). + * + * But SET_FS_BASE will update GDT if base pointer fits in 4 bytes. + * Otherwise it will set only MSR, which allows for mixed 64/32-bit + * code to use: 2 MSRs as TLS base _and_ 3 GDT entries. + * Having in sum 5 TLS pointers, 3 of which are four bytes and + * other two bigger than four bytes: + * struct thread_struct { + * struct desc_struct tls_array[3]; + * ... + * #ifdef CONFIG_X86_64 + * unsigned long fsbase; + * unsigned long gsbase; + * #endif + * ... 
+ * }; + */ + asm volatile ( + " mov %0,%%eax \n" + " mov %1,%%rbx \n" + " int $0x80 \n" + " mov %%eax,%0 \n" + : "+m"(ret) + : "m"(desc) + : "rax", "rbx", "r8", "r9", "r10", "r11", "memory"); + + if (ret) + pr_err("Failed to dump TLS descriptor #%d: %d\n", + desc->entry_number, ret); + return ret; +} + +static void arch_get_tls(tls_t *ptls) +{ + void *syscall_mem; + int i; + + syscall_mem = alloc_compat_syscall_stack(); + if (!syscall_mem) { + pr_err("Failed to allocate memory <4Gb for compat syscall\n"); + + for (i = 0; i < GDT_ENTRY_TLS_NUM; i++) { + user_desc_t *d = &ptls->desc[i]; + + d->seg_not_present = 1; + d->entry_number = GDT_ENTRY_TLS_MIN + i; + } + return; + } + + for (i = 0; i < GDT_ENTRY_TLS_NUM; i++) + { + user_desc_t *d = syscall_mem; + + memset(d, 0, sizeof(user_desc_t)); + d->seg_not_present = 1; + d->entry_number = GDT_ENTRY_TLS_MIN + i; + arch_get_user_desc(d); + memcpy(&ptls->desc[i], d, sizeof(user_desc_t)); + } + + free_compat_syscall_stack(syscall_mem); +} + +#endif diff --git a/CRIU_code/criu/arch/x86/include/asm/restore.h b/CRIU_code/criu/arch/x86/include/asm/restore.h new file mode 100644 index 0000000..21787a7 --- /dev/null +++ b/CRIU_code/criu/arch/x86/include/asm/restore.h @@ -0,0 +1,58 @@ +#ifndef __CR_ASM_RESTORE_H__ +#define __CR_ASM_RESTORE_H__ + +#include "asm/restorer.h" + +#include "images/core.pb-c.h" + +#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \ + task_args) \ + asm volatile( \ + "movq %0, %%rbx \n" \ + "movq %1, %%rax \n" \ + "movq %2, %%rdi \n" \ + "movq %%rbx, %%rsp \n" \ + "callq *%%rax \n" \ + : \ + : "g"(new_sp), \ + "g"(restore_task_exec_start), \ + "g"(task_args) \ + : "rdi", "rsi", "rbx", "rax", "memory") + +static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) +{ + ThreadInfoX86 *ti = pcore->thread_info; + size_t i; + + for (i = 0; i < GDT_ENTRY_TLS_NUM; i++) { + user_desc_t *to = &ptls->desc[i]; + UserDescT *from; + + /* + * If proto image has lesser TLS entries, + * mark them as not 
present (and thus skip restore). + */ + if (i >= ti->n_tls) { + to->seg_not_present = 1; + continue; + } + + from = ti->tls[i]; +#define COPY_TLS(field) to->field = from->field + COPY_TLS(entry_number); + COPY_TLS(base_addr); + COPY_TLS(limit); + COPY_TLS(seg_32bit); + to->contents = ((u32)from->contents_h << 1) | from->contents_l; + COPY_TLS(read_exec_only); + COPY_TLS(limit_in_pages); + COPY_TLS(seg_not_present); + COPY_TLS(useable); +#undef COPY_TLS + } +} + + +int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); + +#endif diff --git a/CRIU_code/criu/arch/x86/include/asm/restorer.h b/CRIU_code/criu/arch/x86/include/asm/restorer.h new file mode 100644 index 0000000..25559b5 --- /dev/null +++ b/CRIU_code/criu/arch/x86/include/asm/restorer.h @@ -0,0 +1,112 @@ +#ifndef __CR_ASM_RESTORER_H__ +#define __CR_ASM_RESTORER_H__ + +#include "asm/types.h" +#include +#include "images/core.pb-c.h" +#include +#include +#include "asm/compat.h" + +#ifdef CONFIG_COMPAT +extern void restore_tls(tls_t *ptls); +extern int arch_compat_rt_sigaction(void *stack32, int sig, + rt_sigaction_t_compat *act); +extern int set_compat_robust_list(uint32_t head_ptr, uint32_t len); +#else /* CONFIG_COMPAT */ +static inline void restore_tls(tls_t *ptls) { } +static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) +{ + return -1; +} +static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len) +{ + return -1; +} +#endif /* !CONFIG_COMPAT */ + +#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ + thread_args, clone_restore_fn) \ + asm volatile( \ + "clone_emul: \n" \ + "movq %2, %%rsi \n" \ + "subq $16, %%rsi \n" \ + "movq %6, %%rdi \n" \ + "movq %%rdi, 8(%%rsi) \n" \ + "movq %5, %%rdi \n" \ + "movq %%rdi, 0(%%rsi) \n" \ + "movq %1, %%rdi \n" \ + "movq %3, %%rdx \n" \ + "movq %4, %%r10 \n" \ + "movl $"__stringify(__NR_clone)", %%eax \n" \ + "syscall \n" \ + \ + "testq %%rax,%%rax \n" \ + "jz thread_run \n" \ + \ + "movq %%rax, %0 \n" \ + 
"jmp clone_end \n" \ + \ + "thread_run: \n" \ + "xorq %%rbp, %%rbp \n" \ + "popq %%rax \n" \ + "popq %%rdi \n" \ + "callq *%%rax \n" \ + \ + "clone_end: \n" \ + : "=r"(ret) \ + : "g"(clone_flags), \ + "g"(new_sp), \ + "g"(&parent_tid), \ + "g"(&thread_args[i].pid), \ + "g"(clone_restore_fn), \ + "g"(&thread_args[i]) \ + : "rax", "rcx", "rdi", "rsi", "rdx", "r10", "r11", "memory") + +#define ARCH_FAIL_CORE_RESTORE \ + asm volatile( \ + "movq %0, %%rsp \n" \ + "movq 0, %%rax \n" \ + "jmp *%%rax \n" \ + : \ + : "r"(ret) \ + : "memory") + +static inline void +__setup_sas_compat(struct ucontext_ia32* uc, ThreadSasEntry *sas) +{ + uc->uc_stack.ss_sp = (compat_uptr_t)(sas)->ss_sp; + uc->uc_stack.ss_flags = (int)(sas)->ss_flags; + uc->uc_stack.ss_size = (compat_size_t)(sas)->ss_size; +} + +static inline void +__setup_sas(struct rt_sigframe* sigframe, ThreadSasEntry *sas) +{ + if (sigframe->is_native) { + struct rt_ucontext *uc = &sigframe->native.uc; + + uc->uc_stack.ss_sp = (void *)decode_pointer((sas)->ss_sp); + uc->uc_stack.ss_flags = (int)(sas)->ss_flags; + uc->uc_stack.ss_size = (size_t)(sas)->ss_size; + } else { + __setup_sas_compat(&sigframe->compat.uc, sas); + } +} + +static inline void _setup_sas(struct rt_sigframe* sigframe, ThreadSasEntry *sas) +{ + if (sas) + __setup_sas(sigframe, sas); +} +#define setup_sas _setup_sas + +int restore_gpregs(struct rt_sigframe *f, UserX86RegsEntry *r); +int restore_nonsigframe_gpregs(UserX86RegsEntry *r); + +int ptrace_set_breakpoint(pid_t pid, void *addr); +int ptrace_flush_breakpoints(pid_t pid); + +extern int arch_map_vdso(unsigned long map_at, bool compatible); + +#endif diff --git a/CRIU_code/criu/arch/x86/include/asm/syscall32.h b/CRIU_code/criu/arch/x86/include/asm/syscall32.h new file mode 100644 index 0000000..a6e2982 --- /dev/null +++ b/CRIU_code/criu/arch/x86/include/asm/syscall32.h @@ -0,0 +1,17 @@ +#ifndef __CR_SYSCALL32_H__ +#define __CR_SYSCALL32_H__ + +extern long sys_socket(int domain, int type, int protocol); 
+extern long sys_connect(int sockfd, struct sockaddr *addr, int addrlen); +extern long sys_sendto(int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len); +extern long sys_recvfrom(int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len); +extern long sys_sendmsg(int sockfd, const struct msghdr *msg, int flags); +extern long sys_recvmsg(int sockfd, struct msghdr *msg, int flags); +extern long sys_shutdown(int sockfd, int how); +extern long sys_bind(int sockfd, const struct sockaddr *addr, int addrlen); +extern long sys_setsockopt(int sockfd, int level, int optname, const void *optval, unsigned int optlen); +extern long sys_getsockopt(int sockfd, int level, int optname, const void *optval, unsigned int *optlen); +extern long sys_shmat(int shmid, void *shmaddr, int shmflag); +extern long sys_pread(unsigned int fd, char *ubuf, u32 count, u64 pos); + +#endif /* __CR_SYSCALL32_H__ */ diff --git a/CRIU_code/criu/arch/x86/include/asm/types.h b/CRIU_code/criu/arch/x86/include/asm/types.h new file mode 100644 index 0000000..3ff7fc6 --- /dev/null +++ b/CRIU_code/criu/arch/x86/include/asm/types.h @@ -0,0 +1,52 @@ +#ifndef __CR_ASM_TYPES_H__ +#define __CR_ASM_TYPES_H__ + +#include +#include + +#include "page.h" +#include "bitops.h" +#include "asm/int.h" + +#include + +#include "images/core.pb-c.h" + +static inline int core_is_compat(CoreEntry *c) +{ + switch (c->thread_info->gpregs->mode) { + case USER_X86_REGS_MODE__NATIVE: + return 0; + case USER_X86_REGS_MODE__COMPAT: + return 1; + default: + return -1; + } +} + +#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__X86_64 + +#define CORE_THREAD_ARCH_INFO(core) core->thread_info + +typedef UserX86RegsEntry UserRegsEntry; + +static inline u64 encode_pointer(void *p) { return (u64)(long)p; } +static inline void *decode_pointer(u64 v) { return (void*)(long)v; } + +#define AT_VECTOR_SIZE 44 +typedef uint64_t auxv_t; + +/* + * Linux preserves three TLS segments in 
GDT. + * Offsets in GDT differ between 32-bit and 64-bit machines. + * For 64-bit x86 those GDT offsets are the same + * for native and compat tasks. + */ +#define GDT_ENTRY_TLS_MIN 12 +#define GDT_ENTRY_TLS_MAX 14 +#define GDT_ENTRY_TLS_NUM 3 +typedef struct { + user_desc_t desc[GDT_ENTRY_TLS_NUM]; +} tls_t; + +#endif /* __CR_ASM_TYPES_H__ */ diff --git a/CRIU_code/criu/arch/x86/include/asm/vdso.h b/CRIU_code/criu/arch/x86/include/asm/vdso.h new file mode 100644 index 0000000..046db23 --- /dev/null +++ b/CRIU_code/criu/arch/x86/include/asm/vdso.h @@ -0,0 +1,72 @@ +#ifndef __CR_ASM_VDSO_H__ +#define __CR_ASM_VDSO_H__ + +#include "asm/int.h" +#include "asm-generic/vdso.h" + +/* This definition is used in pie/util-vdso.c to initialize the vdso symbol + * name string table 'vdso_symbols' + */ + +/* + * This is a minimal amount of symbols + * we should support at the moment. + */ +#define VDSO_SYMBOL_MAX 6 + +/* + * XXX: we don't patch __kernel_vsyscall as it's too small: + * + * byte *before* *after* + * 0x0 push %ecx mov $[rt-vdso],%eax + * 0x1 push %edx ^ + * 0x2 push %ebp ^ + * 0x3 mov %esp,%ebp ^ + * 0x5 sysenter jmp *%eax + * 0x7 int $0x80 int3 + * 0x9 pop %ebp int3 + * 0xa pop %edx int3 + * 0xb pop %ecx pop %ecx + * 0xc ret ret + * + * As restarting a syscall is quite likely after restore, + * the patched version quietly crashes.
+ * vsyscall will be patched again when addressing: + * https://github.com/checkpoint-restore/criu/issues/512 + */ +#define ARCH_VDSO_SYMBOLS \ + "__vdso_clock_gettime", \ + "__vdso_getcpu", \ + "__vdso_gettimeofday", \ + "__vdso_time", \ + "__kernel_sigreturn", \ + "__kernel_rt_sigreturn" + +/* "__kernel_vsyscall", */ + +#ifndef ARCH_MAP_VDSO_32 +# define ARCH_MAP_VDSO_32 0x2002 +#endif + +#ifndef ARCH_MAP_VDSO_64 +# define ARCH_MAP_VDSO_64 0x2003 +#endif + +#if defined(CONFIG_COMPAT) && !defined(__ASSEMBLY__) +struct vdso_symtable; +extern int vdso_fill_symtable(uintptr_t mem, size_t size, + struct vdso_symtable *t); +extern int vdso_fill_symtable_compat(uintptr_t mem, size_t size, + struct vdso_symtable *t); + +static inline int __vdso_fill_symtable(uintptr_t mem, size_t size, + struct vdso_symtable *t, bool compat_vdso) +{ + if (compat_vdso) + return vdso_fill_symtable_compat(mem, size, t); + else + return vdso_fill_symtable(mem, size, t); +} +#endif + +#endif /* __CR_ASM_VDSO_H__ */ diff --git a/CRIU_code/criu/arch/x86/kerndat.c b/CRIU_code/criu/arch/x86/kerndat.c new file mode 100644 index 0000000..f759325 --- /dev/null +++ b/CRIU_code/criu/arch/x86/kerndat.c @@ -0,0 +1,258 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "compel/asm/fpu.h" +#include "compel/plugins/std/syscall-codes.h" +#include "cpu.h" +#include "kerndat.h" +#include "log.h" +#include "types.h" + +#include "asm/compat.h" +#include "asm/dump.h" + +int kdat_can_map_vdso(void) +{ + pid_t child; + int stat; + + /* + * Running under fork so if vdso_64 is disabled - don't create + * it for criu accidentally. + */ + child = fork(); + if (child < 0) { + pr_perror("%s(): failed to fork()", __func__); + return -1; + } + + if (child == 0) { + int ret; + + ret = syscall(SYS_arch_prctl, ARCH_MAP_VDSO_32, 0); + if (ret == 0) + exit(1); + /* + * Mapping vDSO while have not unmap it yet: + * this is restricted by API if ARCH_MAP_VDSO_* is supported. 
+ */ + if (ret == -1 && errno == EEXIST) + exit(1); + exit(0); + } + + if (waitpid(child, &stat, 0) != child) { + pr_err("Failed to wait for arch_prctl() test\n"); + kill(child, SIGKILL); + return -1; + } + + if (!WIFEXITED(stat)) + return -1; + + return WEXITSTATUS(stat); + +} + +#ifdef CONFIG_COMPAT +void *mmap_ia32(void *addr, size_t len, int prot, + int flags, int fildes, off_t off) +{ + struct syscall_args32 s; + + s.nr = __NR32_mmap2; + s.arg0 = (uint32_t)(uintptr_t)addr; + s.arg1 = (uint32_t)len; + s.arg2 = prot; + s.arg3 = flags; + s.arg4 = fildes; + s.arg5 = (uint32_t)off; + + do_full_int80(&s); + + return (void *)(uintptr_t)s.nr; +} + +/* + * The idea of the test: + * From kernel's top-down allocator we assume here that + * 1. A = mmap(0, ...); munmap(A); + * 2. B = mmap(0, ...); + * results in A == B. + * ...but if we have 32-bit mmap() bug, then A will have only lower + * 4 bytes of 64-bit address allocated with mmap(). + * That means, that the next mmap() will return B != A + * (as munmap(A) hasn't really unmapped A mapping). + * + * As mapping with lower 4 bytes of A may really exist, we run + * this test under fork(). + * + * Another approach to test bug's presence would be to parse + * /proc/self/maps before and after 32-bit mmap(), but that would + * be soo slow. 
+ */ +static void mmap_bug_test(void) +{ + void *map1, *map2; + int err; + + map1 = mmap_ia32(0, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + /* 32-bit error, not sign-extended - can't use IS_ERR_VALUE() here */ + err = (uintptr_t)map1 % PAGE_SIZE; + if (err) { + pr_err("ia32 mmap() failed: %d\n", err); + exit(1); + } + + if (munmap(map1, PAGE_SIZE)) { + pr_err("Failed to unmap() 32-bit mapping: %m\n"); + exit(1); + } + + map2 = mmap_ia32(0, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + err = (uintptr_t)map2 % PAGE_SIZE; + if (err) { + pr_err("ia32 mmap() failed: %d\n", err); + exit(1); + } + + if (map1 != map2) + exit(1); + exit(0); +} + +/* + * Pre v4.12 kernels have a bug: for a process started as 64-bit + * 32-bit mmap() may return 8 byte pointer. + * Which is fatal for us: after 32-bit C/R a task will map 64-bit + * addresses, cut upper 4 bytes and try to use lower 4 bytes. + * This is a check if the bug was fixed in the kernel. + */ +static int has_32bit_mmap_bug(void) +{ + pid_t child = fork(); + int stat; + + if (child < 0) { + pr_perror("%s(): failed to fork()", __func__); + return -1; + } + + if (child == 0) + mmap_bug_test(); + + if (waitpid(child, &stat, 0) != child) { + pr_err("Failed to wait for mmap test\n"); + kill(child, SIGKILL); + return -1; + } + + if (!WIFEXITED(stat) || WEXITSTATUS(stat) != 0) + return 1; + return 0; +} + +int kdat_compatible_cr(void) +{ + if (!kdat.can_map_vdso) + return 0; + + if (has_32bit_mmap_bug()) + return 0; + + return 1; +} +#else /* !CONFIG_COMPAT */ +int kdat_compatible_cr(void) +{ + return 0; +} +#endif + +static int kdat_x86_has_ptrace_fpu_xsave_bug_child(void *arg) +{ + if (ptrace(PTRACE_TRACEME, 0, 0, 0)) { + pr_perror("%d: ptrace(PTRACE_TRACEME) failed", getpid()); + _exit(1); + } + + if (kill(getpid(), SIGSTOP)) + pr_perror("%d: failed to kill myself", getpid()); + + pr_err("Continue after SIGSTOP.. 
Urr what?\n"); + _exit(1); +} + +/* + * Pre v4.14 kernels have a bug on Skylake CPUs: + * copyout_from_xsaves() creates fpu state for + * ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov) + * without MXCSR and MXCSR_FLAGS if there is SSE/YMM state, but no FP state. + * That is xfeatures had either/both XFEATURE_MASK_{SSE,YMM} set, but not + * XFEATURE_MASK_FP. + * But we *really* need to C/R MXCSR & MXCSR_FLAGS if SSE/YMM active, + * as MXCSR stores part of the state. + */ +int kdat_x86_has_ptrace_fpu_xsave_bug(void) +{ + user_fpregs_struct_t xsave = { }; + struct iovec iov; + char stack[PAGE_SIZE]; + int flags = CLONE_VM | CLONE_FILES | CLONE_UNTRACED | SIGCHLD; + int ret = -1; + pid_t child; + int stat; + + /* OSXSAVE can't be changed during boot. */ + if (!compel_cpu_has_feature(X86_FEATURE_OSXSAVE)) + return 0; + + child = clone(kdat_x86_has_ptrace_fpu_xsave_bug_child, + stack + ARRAY_SIZE(stack), flags, 0); + if (child < 0) { + pr_perror("%s(): failed to clone()", __func__); + return -1; + } + + if (waitpid(child, &stat, WUNTRACED) != child) { + /* + * waitpid() may end with ECHILD if SIGCHLD == SIG_IGN, + * and the child has stopped already. + */ + pr_perror("Failed to wait for %s() test", __func__); + goto out_kill; + } + + if (!WIFSTOPPED(stat)) { + pr_err("Born child is unstoppable! (might be dead)\n"); + goto out_kill; + } + + iov.iov_base = &xsave; + iov.iov_len = sizeof(xsave); + + if (ptrace(PTRACE_GETREGSET, child, (unsigned)NT_X86_XSTATE, &iov) < 0) { + pr_perror("Can't obtain FPU registers for %d", child); + goto out_kill; + } + /* + * MXCSR should be never 0x0: e.g., it should contain either: + * R+/R-/RZ/RN to determine rounding model.
+ */ + ret = !xsave.i387.mxcsr; + +out_kill: + if (kill(child, SIGKILL)) + pr_perror("Failed to kill my own child"); + if (waitpid(child, &stat, 0) < 0) + pr_perror("Failed wait for a dead child"); + + return ret; +} diff --git a/CRIU_code/criu/arch/x86/restorer.c b/CRIU_code/criu/arch/x86/restorer.c new file mode 100644 index 0000000..2d335d5 --- /dev/null +++ b/CRIU_code/criu/arch/x86/restorer.c @@ -0,0 +1,116 @@ +#include +#include + +#include "types.h" +#include "restorer.h" +#include "asm/compat.h" +#include "asm/restorer.h" +#include + +#include +#include +#include +#include "log.h" +#include "cpu.h" + +int arch_map_vdso(unsigned long map_at, bool compatible) +{ + int vdso_type = compatible ? ARCH_MAP_VDSO_32 : ARCH_MAP_VDSO_64; + + pr_debug("Mapping %s vDSO at %lx\n", + compatible ? "compatible" : "native", map_at); + + return sys_arch_prctl(vdso_type, map_at); +} + +int restore_nonsigframe_gpregs(UserX86RegsEntry *r) +{ + long ret; + unsigned long fsgs_base; + + fsgs_base = r->fs_base; + ret = sys_arch_prctl(ARCH_SET_FS, fsgs_base); + if (ret) { + pr_info("SET_FS fail %ld\n", ret); + return -1; + } + + fsgs_base = r->gs_base; + ret = sys_arch_prctl(ARCH_SET_GS, fsgs_base); + if (ret) { + pr_info("SET_GS fail %ld\n", ret); + return -1; + } + return 0; +} + +#ifdef CONFIG_COMPAT + +int set_compat_robust_list(uint32_t head_ptr, uint32_t len) +{ + struct syscall_args32 s = { + .nr = __NR32_set_robust_list, + .arg0 = head_ptr, + .arg1 = len, + }; + + do_full_int80(&s); + return (int)s.nr; +} + +static int prepare_stack32(void **stack32) +{ + if (*stack32) + return 0; + + *stack32 = alloc_compat_syscall_stack(); + if (!*stack32) { + pr_err("Failed to allocate stack for 32-bit TLS restore\n"); + return -1; + } + + return 0; +} + +void restore_tls(tls_t *ptls) +{ + /* + * We need here compatible stack, because 32-bit syscalls get + * 4-byte pointer and _usually_ restorer is also under 4Gb, but + * it can be upper and then pointers are messed up.
+ * (we lose high 4 bytes and... BANG!) + * Nothing serious, but syscall will return -EFAULT - or if we're + * lucky and lower 4 bytes points on some writeable VMA - corruption). + */ + void *stack32 = NULL; + unsigned i; + + for (i = 0; i < GDT_ENTRY_TLS_NUM; i++) { + user_desc_t *desc = &ptls->desc[i]; + int ret; + + if (desc->seg_not_present) + continue; + + if (prepare_stack32(&stack32) < 0) + return; + + memcpy(stack32, desc, sizeof(user_desc_t)); + asm volatile ( + " mov %1,%%eax \n" + " mov %2,%%ebx \n" + " int $0x80 \n" + " mov %%eax,%0 \n" + : "=g"(ret) + : "r"(__NR32_set_thread_area), "r"((uint32_t)(uintptr_t)stack32) + : "eax", "ebx", "r8", "r9", "r10", "r11", "memory"); + + if (ret) + pr_err("Failed to restore TLS descriptor %u in GDT: %d\n", + desc->entry_number, ret); + } + + if (stack32) + free_compat_syscall_stack(stack32); +} +#endif diff --git a/CRIU_code/criu/arch/x86/restorer_unmap.S b/CRIU_code/criu/arch/x86/restorer_unmap.S new file mode 100644 index 0000000..d721eaf --- /dev/null +++ b/CRIU_code/criu/arch/x86/restorer_unmap.S @@ -0,0 +1,13 @@ +#include "common/asm/linkage.h" +#include "compel/plugins/std/syscall-codes.h" + + .text +ENTRY(__export_unmap_compat) + .code32 + mov bootstrap_start, %ebx + mov bootstrap_len, %ecx + sub vdso_rt_size, %ecx + movl $__NR32_munmap, %eax + int $0x80 + int $0x03 /* Guard */ + .code64 diff --git a/CRIU_code/criu/arch/x86/sigaction_compat.c b/CRIU_code/criu/arch/x86/sigaction_compat.c new file mode 100644 index 0000000..b38ba80 --- /dev/null +++ b/CRIU_code/criu/arch/x86/sigaction_compat.c @@ -0,0 +1,56 @@ +#include "log.h" +#include "asm/restorer.h" +#include +#include "asm/compat.h" +#include + +#ifdef CR_NOGLIBC +# include +#endif +#include "cpu.h" + +asm ( " .pushsection .text \n" + " .global restore_rt_sigaction \n" + " .code32 \n" + "restore_rt_sigaction: \n" + " mov %edx, %esi \n" + " mov $0, %edx \n" + " movl $"__stringify(__NR32_rt_sigaction)",%eax \n" + " int $0x80 \n" + " ret \n" + " .popsection 
\n" + " .code64"); +extern char restore_rt_sigaction; + +/* + * Call raw rt_sigaction syscall through int80 - so the ABI kernel chooses + * to deliver this signal would be i386. + */ +int arch_compat_rt_sigaction(void *stack32, int sig, rt_sigaction_t_compat *act) +{ + int ret; + struct syscall_args32 arg = {}; + unsigned long act_stack = (unsigned long)stack32; + + /* To make sure the 32-bit stack was allocated in caller */ + if (act_stack >= (uint32_t)-1) { + pr_err("compat rt_sigaction without 32-bit stack\n"); + return -1; + } + + /* + * To be sure that the sigaction pointer lies under 4G, + * copy it to the bottom of the stack. + */ + memcpy(stack32, act, sizeof(rt_sigaction_t_compat)); + arg.nr = __NR32_rt_sigaction; + arg.arg0 = sig; + arg.arg1 = (uint32_t)act_stack; /* act */ + arg.arg2 = 0; /* oldact */ + arg.arg3 = (uint32_t)sizeof(act->rt_sa_mask); /* sigsetsize */ + + do_full_int80(&arg); + ret = (int)arg.nr; /* do_full_int80() writes %eax back to arg.nr */ + return ret; +} + diff --git a/CRIU_code/criu/arch/x86/sigaction_compat_pie.c b/CRIU_code/criu/arch/x86/sigaction_compat_pie.c new file mode 100644 index 0000000..009ac3a --- /dev/null +++ b/CRIU_code/criu/arch/x86/sigaction_compat_pie.c @@ -0,0 +1 @@ +sigaction_compat.c \ No newline at end of file diff --git a/CRIU_code/criu/arch/x86/sigframe.c b/CRIU_code/criu/arch/x86/sigframe.c new file mode 100644 index 0000000..11b0d64 --- /dev/null +++ b/CRIU_code/criu/arch/x86/sigframe.c @@ -0,0 +1,36 @@ +#include +#include + +#include "asm/sigframe.h" +#include "asm/types.h" + +#include "log.h" + +int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, + struct rt_sigframe *rsigframe) +{ + /* + * Use local sigframe to check native/compat type, + * but set address for rsigframe. + */ + fpu_state_t *fpu_state = (sigframe->is_native) ?
+ &rsigframe->native.fpu_state : + &rsigframe->compat.fpu_state; + + if (sigframe->is_native) { + unsigned long addr = (unsigned long)(void *)&fpu_state->fpu_state_64.xsave; + + if ((addr % 64ul)) { + pr_err("Unaligned address passed: %lx (native %d)\n", + addr, sigframe->is_native); + return -1; + } + + sigframe->native.uc.uc_mcontext.fpstate = (uint64_t)addr; + } else if (!sigframe->is_native) { + sigframe->compat.uc.uc_mcontext.fpstate = + (uint32_t)(unsigned long)(void *)&fpu_state->fpu_state_ia32; + } + + return 0; +} diff --git a/CRIU_code/criu/arch/x86/sys-exec-tbl.c b/CRIU_code/criu/arch/x86/sys-exec-tbl.c new file mode 100644 index 0000000..608dc25 --- /dev/null +++ b/CRIU_code/criu/arch/x86/sys-exec-tbl.c @@ -0,0 +1,44 @@ +#include + +static struct syscall_exec_desc sc_exec_table_64[] = { +#include "sys-exec-tbl-64.c" + { }, /* terminator */ +}; + +#ifdef CONFIG_COMPAT +static struct syscall_exec_desc sc_exec_table_32[] = { +#include "sys-exec-tbl-32.c" + { }, /* terminator */ +}; +#endif + +struct syscall_exec_desc; + +static inline struct syscall_exec_desc * +find_syscall_table(char *name, struct syscall_exec_desc *tbl) +{ + int i; + + for (i = 0; tbl[i].name != NULL; i++) + if (!strcmp(tbl[i].name, name)) + return &tbl[i]; + return NULL; +} + +#define ARCH_HAS_FIND_SYSCALL +/* overwrite default to search in two tables above */ +#ifdef CONFIG_COMPAT +struct syscall_exec_desc * find_syscall(char *name, struct parasite_ctl *ctl) +{ + if (compel_mode_native(ctl)) + return find_syscall_table(name, sc_exec_table_64); + else + return find_syscall_table(name, sc_exec_table_32); +} +#else +struct syscall_exec_desc * +find_syscall(char *name, __always_unused struct parasite_ctl *ctl) +{ + return find_syscall_table(name, sc_exec_table_64); +} +#endif diff --git a/CRIU_code/criu/arch/x86/vdso-pie.c b/CRIU_code/criu/arch/x86/vdso-pie.c new file mode 100644 index 0000000..988cf08 --- /dev/null +++ b/CRIU_code/criu/arch/x86/vdso-pie.c @@ -0,0 +1,76 @@ +#include + 
+#include "asm/types.h" + +#include +#include +#include "parasite-vdso.h" +#include "log.h" +#include "common/bug.h" + +#ifdef LOG_PREFIX +# undef LOG_PREFIX +#endif +#define LOG_PREFIX "vdso: " + +static void insert_trampoline32(uintptr_t from, uintptr_t to) +{ + struct { + u8 movl; + u32 imm32; + u16 jmp_eax; + u32 guards; + } __packed jmp = { + .movl = 0xb8, + .imm32 = (uint32_t)to, + .jmp_eax = 0xe0ff, + .guards = 0xcccccccc, + }; + + memcpy((void *)from, &jmp, sizeof(jmp)); +} + +static void insert_trampoline64(uintptr_t from, uintptr_t to) +{ + struct { + u16 movabs; + u64 imm64; + u16 jmp_rax; + u32 guards; + } __packed jmp = { + .movabs = 0xb848, + .imm64 = to, + .jmp_rax = 0xe0ff, + .guards = 0xcccccccc, + }; + + memcpy((void *)from, &jmp, sizeof(jmp)); +} + +int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, + struct vdso_symtable *sto, struct vdso_symtable *sfrom, + bool compat_vdso) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(sto->symbols); i++) { + uintptr_t from, to; + + if (vdso_symbol_empty(&sfrom->symbols[i])) + continue; + + pr_debug("jmp: %lx/%lx -> %lx/%lx (index %d)\n", + base_from, sfrom->symbols[i].offset, + base_to, sto->symbols[i].offset, i); + + from = base_from + sfrom->symbols[i].offset; + to = base_to + sto->symbols[i].offset; + + if (!compat_vdso) + insert_trampoline64(from, to); + else + insert_trampoline32(from, to); + } + + return 0; +} diff --git a/CRIU_code/criu/autofs.c b/CRIU_code/criu/autofs.c new file mode 100644 index 0000000..a2dc60f --- /dev/null +++ b/CRIU_code/criu/autofs.c @@ -0,0 +1,1090 @@ +#include +#include +#include +#include +#include + +#include "int.h" +#include "fdinfo.h" +#include "autofs.h" +#include "rst-malloc.h" +#include "mount.h" +#include "pstree.h" +#include "namespaces.h" +#include "protobuf.h" +#include "pipes.h" +#include "crtools.h" +#include "util.h" + +#include "images/autofs.pb-c.h" + +#define AUTOFS_OPT_UNKNOWN INT_MIN + +#define AUTOFS_MODE_DIRECT 0 +#define 
AUTOFS_MODE_INDIRECT 1 +#define AUTOFS_MODE_OFFSET 2 + +#define AUTOFS_CATATONIC_FD -1 + +static int autofs_mnt_open(const char *mnt_path, dev_t devid); + +struct autofs_pipe_s { + struct list_head list; + unsigned long inode; +}; + +struct list_head autofs_pipes = LIST_HEAD_INIT(autofs_pipes); + +bool is_autofs_pipe(unsigned long inode) +{ + struct autofs_pipe_s *p; + + list_for_each_entry(p, &autofs_pipes, list) { + if (p->inode == inode) + return true; + } + return false; +} + +static int autofs_gather_pipe(unsigned long inode) +{ + struct autofs_pipe_s *pipe; + + pipe = xmalloc(sizeof(*pipe)); + if (!pipe) + return -1; + pipe->inode = inode; + list_add_tail(&pipe->list, &autofs_pipes); + return 0; +} + +int autofs_parse(struct mount_info *pm) +{ + long pipe_ino = AUTOFS_OPT_UNKNOWN; + char **opts; + int nr_opts, i; + + split(pm->options, ',', &opts, &nr_opts); + if (!opts) + return -1; + for (i = 0; i < nr_opts; i++) { + if (!strncmp(opts[i], "pipe_ino=", strlen("pipe_ino="))) + pipe_ino = atoi(opts[i] + strlen("pipe_ino=")); + } + for (i = 0; i < nr_opts; i++) + xfree(opts[i]); + free(opts); + + if (pipe_ino == AUTOFS_OPT_UNKNOWN) { + pr_warn("Failed to find pipe_ino option (old kernel?)\n"); + return 0; + } + + return autofs_gather_pipe(pipe_ino); +} + +static int autofs_check_fd_stat(struct stat *stat, int prgp, int fd, + long ino, int *mode) +{ + struct fdinfo_common fdinfo; + + if (!S_ISFIFO(stat->st_mode)) + return 0; + if (stat->st_ino != ino) + return 0; + if (parse_fdinfo_pid(prgp, fd, FD_TYPES__UND, &fdinfo)) + return -1; + + *mode = fdinfo.flags & O_WRONLY; + return 1; +} + +static int autofs_kernel_pipe_alive(int pgrp, int fd, int ino) +{ + struct stat buf; + char *path; + int ret, fd_mode; + + path = xsprintf("/proc/%d/fd/%d", pgrp, fd); + if (!path) + return -1; + + if (stat(path, &buf) < 0) { + if (errno == ENOENT) { + xfree(path); + return 0; + } + pr_perror("Failed to stat %s", path); + return -1; + } + + xfree(path); + + ret = 
/*
 * Helper of autofs_find_read_fd(): scan the "fd" proc directory of @pgrp
 * for a FIFO with inode @ino that is opened read-only.
 *
 * On success 0 is returned and *@read_fd holds the descriptor number, or
 * -1 when no such descriptor exists. A negative value is returned when the
 * directory cannot be opened or an entry cannot be examined.
 */
static int autofs_find_pipe_read_end(int pgrp, long ino, int *read_fd)
{
	DIR *dir;
	struct dirent *de;
	int ret = -1;

	dir = opendir_proc(pgrp, "fd");
	if (dir == NULL)
		return -1;

	*read_fd = -1;

	while ((de = readdir(dir))) {
		struct stat buf;
		int found, mode, fd;

		if (dir_dots(de))
			continue;

		if (fstatat(dirfd(dir), de->d_name, &buf, 0) < 0) {
			pr_perror("Failed to fstatat");
			goto out;
		}

		/*
		 * Do not store the conversion result in ret: a successful
		 * conversion would reset it to 0 and turn every later
		 * failure in this loop into a bogus "success".
		 */
		if (xatoi(de->d_name, &fd))
			goto out;

		found = autofs_check_fd_stat(&buf, pgrp, fd, ino, &mode);
		if (found < 0)
			goto out;
		if (found && (mode == O_RDONLY)) {
			*read_fd = fd;
			break;
		}
	}

	ret = 0;

out:
	closedir(dir);
	close_pid_proc();

	return ret;
}

/*
 * autofs_find_read_fd - find the read end of the autofs control pipe.
 *
 * Looks in process @pgrp for the read end of the pipe with inode @pipe_ino
 * and checks that no data is pending in it.
 *
 * Returns the descriptor number on success, -ENOENT when the process does
 * not have the read end opened, and -1 on any other failure (including a
 * non-empty pipe).
 */
static int autofs_find_read_fd(int pgrp, long pipe_ino)
{
	int read_fd, fd, has_data;

	/* We need to find read end and make sure, that it's empty */
	if (autofs_find_pipe_read_end(pgrp, pipe_ino, &read_fd) < 0) {
		pr_err("Failed to find read pipe fd (ino %ld) "
			"in process %d\n", pipe_ino, pgrp);
		return -1;
	}

	if (read_fd == -1) {
		pr_err("Master %d doesn't have a read end of the pipe with "
			"inode %ld opened\n", pgrp, pipe_ino);
		pr_err("Abandoned mount or control was delegated to child?\n");
		return -ENOENT;
	}

	/* Let's check, that read end is empty */
	fd = open_proc(pgrp, "fd/%d", read_fd);
	if (fd < 0)
		return -1;

	/* Close our probe descriptor on every path, not only on success */
	has_data = fd_has_data(fd);
	close(fd);

	if (has_data) {
		pr_err("Process %d autofs pipe fd %d is not empty.\n", pgrp,
				read_fd);
		pr_err("Try again later.\n");
		return -1;
	}

	return read_fd;
}
AUTOFS_OPT_UNKNOWN; + entry->pgrp = AUTOFS_OPT_UNKNOWN; + entry->uid = AUTOFS_OPT_UNKNOWN; + entry->gid = AUTOFS_OPT_UNKNOWN; + *pipe_ino = AUTOFS_OPT_UNKNOWN; + + split(options, ',', &opts, &nr_opts); + if (!opts) + return -1; + + for (i = 0; i < nr_opts; i++) { + char *opt = opts[i]; + int err = 0; + + if (!strncmp(opt, "fd=", strlen("fd="))) + err = xatoi(opt + strlen("fd="), &entry->fd); + else if (!strncmp(opt, "pipe_ino=", strlen("pipe_ino="))) + err = xatol(opt + strlen("pipe_ino="), pipe_ino); + else if (!strncmp(opt, "pgrp=", strlen("pgrp="))) + err = xatoi(opt + strlen("pgrp="), &entry->pgrp); + else if (!strncmp(opt, "timeout=", strlen("timeout="))) + err = xatoi(opt + strlen("timeout="), &entry->timeout); + else if (!strncmp(opt, "minproto=", strlen("minproto="))) + err = xatoi(opt + strlen("minproto="), &entry->minproto); + else if (!strncmp(opt, "maxproto=", strlen("maxproto="))) + err = xatoi(opt + strlen("maxproto="), &entry->maxproto); + else if (!strcmp(opt, "indirect")) + entry->mode = AUTOFS_MODE_INDIRECT; + else if (!strcmp(opt, "offset")) + entry->mode = AUTOFS_MODE_OFFSET; + else if (!strcmp(opt, "direct")) + entry->mode = AUTOFS_MODE_DIRECT; + else if (!strncmp(opt, "uid=", strlen("uid="))) + err = xatoi(opt + strlen("uid="), &entry->uid); + else if (!strncmp(opt, "gid=", strlen("gid="))) + err = xatoi(opt + strlen("gid="), &entry->gid); + + if (err) { + parse_error = 1; + break; + } + } + + for (i = 0; i < nr_opts; i++) + xfree(opts[i]); + xfree(opts); + + if (parse_error) + return -1; + + if (entry->fd == AUTOFS_OPT_UNKNOWN) { + pr_err("Failed to find fd option\n"); + return -1; + } + if (entry->pgrp == AUTOFS_OPT_UNKNOWN) { + pr_err("Failed to find pgrp option\n"); + return -1; + } + if (entry->timeout == AUTOFS_OPT_UNKNOWN) { + pr_err("Failed to find timeout option\n"); + return -1; + } + if (entry->minproto == AUTOFS_OPT_UNKNOWN) { + pr_err("Failed to find minproto option\n"); + return -1; + } + if (entry->maxproto == 
AUTOFS_OPT_UNKNOWN) { + pr_err("Failed to find maxproto option\n"); + return -1; + } + if (entry->mode == AUTOFS_OPT_UNKNOWN) { + pr_err("Failed to find mode (direct,indirect,offset) option\n"); + return -1; + } + if (*pipe_ino == AUTOFS_OPT_UNKNOWN) { + pr_err("Failed to find pipe_ino option (old kernel?)\n"); + return -1; + } + + return 0; +} + +static int autofs_revisit_options(struct mount_info *pm) +{ + FILE *f; + char *str; + int ret = -ENOMEM; + + str = xmalloc(1024); + if (!str) { + return -ENOMEM; + } + + f = fopen_proc(getpid(), "mountinfo"); + if (!f) + goto free_str; + + while (fgets(str, 1024, f)) { + int mnt_id = -1; + char *token; + + /* Removing '/n' */ + str[strlen(str)-1] = '\0'; + + while ((token = strsep(&str, " ")) != NULL) { + if (mnt_id == -1) { + ret = xatoi(token, &mnt_id); + if (ret) + goto close_proc; + if (mnt_id != pm->mnt_id) + break; + } else if (strstr(token, "pipe_ino=")) { + ret = 0; + free(pm->options); + + pm->options = xstrdup(token); + if (!pm->options) + pr_err("failed to duplicate string\n"); + else + ret = 0; + goto close_proc; + } + } + } + + pr_err("failed to find autofs mount with mnt_id %d\n", pm->mnt_id); + ret = -ENOENT; + +close_proc: + fclose(f); +free_str: + free(str); + return ret; +} + +/* + * To access the mount point we have to set proper mount namespace. + * But, unfortunately, we have to set proper pid namespace as well, + * because otherwise autofs driver won't find the autofs master. + */ +static int access_autofs_mount(struct mount_info *pm) +{ + const char *mnt_path = pm->mountpoint + 1; + dev_t dev_id = pm->s_dev; + int new_pid_ns = -1, old_pid_ns = -1; + int old_mnt_ns; + int autofs_mnt; + int err = -1; + int pid, status; + + /* + * To be able to set proper pid namespace, we must open fd before + * switching to the mount namespace. + * The same applies to pid namespace fd to restore back. 
+ */ + new_pid_ns = open_proc(pm->nsid->ns_pid, "ns/pid"); + if (new_pid_ns < 0) + return -1; + + old_pid_ns = open_proc(PROC_SELF, "ns/pid"); + if (old_pid_ns < 0) + goto close_new_pid_ns; + + if (switch_ns(pm->nsid->ns_pid, &mnt_ns_desc, &old_mnt_ns)) { + pr_err("failed to switch to mount namespace\n"); + goto close_old_pid_ns; + } + + err = restore_ns(new_pid_ns, &pid_ns_desc); + new_pid_ns = -1; + if (err) { + pr_err("failed to restore pid namespace\n"); + goto restore_mnt_ns; + } + + autofs_mnt = autofs_mnt_open(mnt_path, dev_id); + if (autofs_mnt < 0) + goto restore_pid_ns; + + pid = fork(); + switch (pid) { + case -1: + pr_err("failed to fork\n"); + goto close_autofs_mnt; + case 0: + /* We don't care about results. + * All we need is to "touch" */ + openat(autofs_mnt, mnt_path, O_RDONLY|O_NONBLOCK|O_DIRECTORY); + _exit(0); + + } + /* Here we also don't care about results */ + waitpid(pid, &status, 0); + + err = autofs_revisit_options(pm); + +close_autofs_mnt: + close(autofs_mnt); +restore_pid_ns: + if (restore_ns(old_pid_ns, &pid_ns_desc)) { + pr_err("failed to restore pid namespace\n"); + err = -1; + } + old_pid_ns = -1; +restore_mnt_ns: + if (restore_ns(old_mnt_ns, &mnt_ns_desc)) { + pr_err("failed to restore mount namespace\n"); + err = -1; + } +close_old_pid_ns: + if (old_pid_ns >= 0) + close(old_pid_ns); +close_new_pid_ns: + if (new_pid_ns >= 0) + close(new_pid_ns); + return err; +} + +static int autofs_create_entry(struct mount_info *pm, AutofsEntry *entry) +{ + long pipe_ino; + + if (parse_options(pm->options, entry, &pipe_ino)) + return -1; + + if (entry->uid != AUTOFS_OPT_UNKNOWN) + entry->has_uid = true; + if (entry->gid != AUTOFS_OPT_UNKNOWN) + entry->has_gid = true; + + if (entry->fd != AUTOFS_CATATONIC_FD) { + int found, read_fd, virt_pgrp; + + read_fd = autofs_find_read_fd(entry->pgrp, pipe_ino); + if (read_fd < 0) { + if (read_fd != -ENOENT) + return -1; + + /* Ok, our read end doesn't exist. 
+ * There can be a case, when mount looks normal, but + * it's a "hidden" or "abandoned" catatonic mount in + * reality. + * This can happen if: + * 1) autofs master process has exited without switching + * the mount to catatonic mode (or was killed). + * 2) mount point was unmounted, but not propagated to + * nested mount namespace with private mounts. + * We can try handle these cases by accessing the mount + * point. If it's catatonic, it will update it's + * options, then we can read them again and dump it. + */ + if (access_autofs_mount(pm)) { + pr_err("failed to access autofs %s\n", + pm->mountpoint + 1); + return -1; + } + if (parse_options(pm->options, entry, &pipe_ino)) + return -1; + if (entry->fd == AUTOFS_CATATONIC_FD) + return 0; + pr_err("Autofs %d is alive, but unreachable.\n", + pm->mnt_id); + return -1; + } + + /* Let' check whether write end is still open */ + found = autofs_kernel_pipe_alive(entry->pgrp, entry->fd, pipe_ino); + if (found < 0) { + pr_err("Failed to check fd %d in process %d\n", + entry->fd, entry->pgrp); + return -1; + } + /* Write end is absent. we need to carry read end to restore. 
*/ + if (!found) { + entry->has_read_fd = true; + entry->read_fd = read_fd; + } + + /* We need to get virtual pgrp to restore mount */ + virt_pgrp = pid_to_virt(entry->pgrp); + if (!virt_pgrp) { + pr_err("failed to find pstree item with pid %d\n", + entry->pgrp); + pr_err("Non-catatonic mount without master?\n"); + return -1; + } + entry->pgrp = virt_pgrp; + } + return 0; +} + +static int autofs_dump_entry(struct mount_info *pm, AutofsEntry *entry) +{ + struct cr_img *img; + int ret = -1; + + img = open_image(CR_FD_AUTOFS, O_DUMP, pm->s_dev); + if (img) { + ret = pb_write_one(img, entry, PB_AUTOFS); + close_image(img); + } + return ret; +} + + +int autofs_dump(struct mount_info *pm) +{ + AutofsEntry *entry; + int err; + + entry = xmalloc(sizeof(*entry)); + if (!entry) + return -1; + autofs_entry__init(entry); + + err = autofs_create_entry(pm, entry); + if (err) + goto free_entry; + + err = autofs_dump_entry(pm, entry); + +free_entry: + free(entry); + return err < 0 ? err : 0; +} + +typedef struct autofs_info_s { + struct pipe_info pi; + AutofsEntry *entry; + char *mnt_path; + dev_t mnt_dev; + struct mount_info *mi; + struct pprep_head ph; +} autofs_info_t; + +static int dup_pipe_info(struct pipe_info *pi, int flags, + struct file_desc_ops *ops) +{ + struct pipe_info *new; + PipeEntry *pe; + + new = shmalloc(sizeof(*new)); + if (!new) + return -1; + + pe = shmalloc(sizeof(*pe)); + if (!pe) + return -1; + + pe->id = pi->pe->id; + pe->pipe_id = pi->pe->pipe_id; + pe->fown = pi->pe->fown; + pe->flags = flags; + + if (collect_one_pipe_ops(new, &pe->base, ops) < 0) { + pr_err("Failed to add pipe info for write end\n"); + return -1; + } + + return 0; +} + +static int autofs_dup_pipe(struct pstree_item *task, + struct fdinfo_list_entry *ple, + int new_fd) +{ + struct pipe_info *pi = container_of(ple->desc, struct pipe_info, d); + unsigned flags = O_WRONLY; + + new_fd = find_unused_fd(task, new_fd); + + if (dup_pipe_info(pi, flags, pi->d.ops) < 0) { + pr_err("Failed to dup 
pipe entry ID %#x PIPE_ID %#x\n", + pi->pe->id, pi->pe->pipe_id); + return -1; + } + + if (dup_fle(task, ple, new_fd, flags) < 0) { + pr_err("Failed to add fd %d to process %d\n", + new_fd, vpid(task)); + return -1; + } + + pr_info("autofs: added pipe fd %d, flags %#x to %d\n", + new_fd, flags, vpid(task)); + return new_fd; +} + + +static int autofs_ioctl(const char *path, int fd, int cmd, const void *param) +{ + int err; + + err = ioctl(fd, cmd, param); + if (err) + pr_perror("%s ioctl failed", path); + + return err; +} + +static int autofs_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) +{ + char *path = "/dev/"AUTOFS_DEVICE_NAME; + int fd, err; + + fd = open(path, O_RDONLY); + if (fd == -1) { + pr_perror("failed to open %s", path); + return -1; + } + + err = autofs_ioctl(path, fd, cmd, param); + + close(fd); + return err; +} + +static int autofs_mnt_make_catatonic(const char *mnt_path, int mnt_fd) +{ + pr_info("%s: set %s catatonic\n", __func__, mnt_path); + return autofs_ioctl(mnt_path, mnt_fd, AUTOFS_IOC_CATATONIC, NULL); +} + +static int autofs_mnt_set_timeout(time_t timeout, + const char *mnt_path, int mnt_fd) +{ + pr_info("%s: set timeout %ld for %s\n", __func__, timeout, mnt_path); + return autofs_ioctl(mnt_path, mnt_fd, AUTOFS_IOC_SETTIMEOUT, &timeout); +} + +static int autofs_mnt_set_pipefd(const autofs_info_t *i, int mnt_fd) +{ + struct autofs_dev_ioctl param; + + /* Restore pipe and pgrp only for non-catatonic mounts */ + if (i->entry->fd == AUTOFS_CATATONIC_FD) + return 0; + + pr_info("%s: set pipe fd %d (pgrp %d) for mount %s\n", __func__, + i->entry->fd, getpgrp(), i->mnt_path); + + init_autofs_dev_ioctl(¶m); + param.ioctlfd = mnt_fd; + param.setpipefd.pipefd = i->entry->fd; + + return autofs_dev_ioctl(AUTOFS_DEV_IOCTL_SETPIPEFD, ¶m); +} + +static int autofs_mnt_close(const char *mnt_path, int mnt_fd) +{ + struct autofs_dev_ioctl param; + + pr_info("%s: closing fd %d for mount %s\n", __func__, mnt_fd, + mnt_path); + + init_autofs_dev_ioctl(¶m); + 
param.ioctlfd = mnt_fd; + + return autofs_dev_ioctl(AUTOFS_DEV_IOCTL_CLOSEMOUNT, ¶m); +} + +static int autofs_mnt_open(const char *mnt_path, dev_t devid) +{ + struct autofs_dev_ioctl *param; + int err; + size_t size, fd; + + pr_info("%s: open mount %s\n", __func__, mnt_path); + + size = sizeof(*param) + strlen(mnt_path) + 1; + param = xmalloc(size); + if (!param) + return -1; + + init_autofs_dev_ioctl(param); + param->size = size; + strcpy(param->path, mnt_path); + param->openmount.devid = devid; + + err = autofs_dev_ioctl(AUTOFS_DEV_IOCTL_OPENMOUNT, param); + fd = param->ioctlfd; + free(param); + if (err < 0) { + pr_err("Failed to get %s fd (devid: %ld)\n", + mnt_path, (long)devid); + return -1; + } + return fd; +} + +static int autofs_create_dentries(const struct mount_info *mi, char *mnt_path) +{ + struct mount_info *c; + + list_for_each_entry(c, &mi->children, siblings) { + char *path, *basename; + + basename = strrchr(c->mountpoint, '/'); + if (!basename) { + pr_info("%s: mount path \"%s\" doesn't have '/'\n", + __func__, c->mountpoint); + return -1; + } + path = xsprintf("%s%s", mnt_path, basename); + if (!path) + return -1; + if (mkdir(path, 0555) < 0) { + pr_perror("Failed to create autofs dentry %s", path); + free(path); + return -1; + } + free(path); + } + return 0; +} + +static int autofs_populate_mount(const struct mount_info *mi, + const AutofsEntry *entry) +{ + if (entry->mode != AUTOFS_MODE_INDIRECT) + return 0; + + return autofs_create_dentries(mi, mi->mountpoint); +} + +static int autofs_post_mount(const char *mnt_path, dev_t mnt_dev, + time_t timeout) +{ + int mnt_fd; + + pr_info("%s: set timeout for %s and make it catatonic\n", + __func__, mnt_path); + + mnt_fd = autofs_mnt_open(mnt_path, mnt_dev); + if (mnt_fd < 0) { + pr_err("Failed to open %s\n", mnt_path); + return -1; + } + + if (autofs_mnt_set_timeout(timeout, mnt_path, mnt_fd)) { + pr_err("Failed to set timeout %ld for %s\n", + timeout, mnt_path); + return -1; + } + + if 
(autofs_mnt_make_catatonic(mnt_path, mnt_fd)) { + pr_err("Failed to set %s catatonic\n", mnt_path); + return -1; + } + + if (autofs_mnt_close(mnt_path, mnt_fd) < 0) { + pr_err("Failed to close %s\n", mnt_path); + return -1; + } + + return 0; +} + +/* Here to fixup Autofs mount */ +static int autofs_post_open(struct file_desc *d, int fd) +{ + struct pipe_info *pi = container_of(d, struct pipe_info, d); + autofs_info_t *i = container_of(pi, autofs_info_t, pi); + int mnt_fd; + + pr_info("%s: restoring %s\n", __func__, i->mnt_path); + + mnt_fd = autofs_mnt_open(i->mnt_path, i->mnt_dev); + if (mnt_fd < 0) { + pr_err("Failed to open %s\n", i->mnt_path); + return -1; + } + + if (autofs_mnt_set_pipefd(i, mnt_fd)) { + pr_err("Failed to set %s owner\n", i->mnt_path); + return -1; + } + + if (autofs_mnt_close(i->mnt_path, mnt_fd) < 0) { + pr_err("Failed to close %s\n", i->mnt_path); + return -1; + } + + pr_info("autofs mount %s owner restored: pgrp=%d, fd=%d\n", + i->mnt_path, getpgrp(), i->entry->fd); + + if (i->entry->has_read_fd) { + pr_info("%s: pid %d, closing write end %d\n", __func__, + getpid(), i->entry->fd); + close(i->entry->fd); + } + + pr_info("%s: pid %d, closing artificial pipe end %d\n", __func__, + getpid(), fd); + close(fd); + return 0; +} + +static autofs_info_t *autofs_create_info(const struct mount_info *mi, + const struct file_desc *desc, + const autofs_info_t *info) +{ + autofs_info_t *i; + + i = shmalloc(sizeof(*i)); + if (!i) + return NULL; + + i->mnt_path = shmalloc(strlen(mi->ns_mountpoint) + 1); + if (!i->mnt_path) + return NULL; + + /* Here we copy autofs dev_id and entry from private data to shared. + * See autofs_mount(). + */ + i->entry = shmalloc(sizeof(*info->entry)); + if (!i->entry) + return NULL; + memcpy(i->entry, info->entry, sizeof(*info->entry)); + i->mnt_dev = info->mnt_dev; + + /* We need mountpoint to be able to open mount in autofs_post_open() + * callback. 
And this have to be internal path, because process cwd + * will be changed already. That's why ns_mountpoint is used. */ + strcpy(i->mnt_path, mi->ns_mountpoint); + + return i; +} + +static struct fdinfo_list_entry *autofs_pipe_le(struct pstree_item *master, + AutofsEntry *entry) +{ + struct fdinfo_list_entry *ple; + int pipe_fd = entry->fd; + + if (entry->has_read_fd) + pipe_fd = entry->read_fd; + + ple = find_used_fd(master, pipe_fd); + if (!ple) { + pr_err("Failed to find pipe fd %d in process %d\n", + pipe_fd, vpid(master)); + return NULL; + } + if (ple->fe->type != FD_TYPES__PIPE) { + pr_err("Fd %d in process %d is not a pipe: %d\n", pipe_fd, + vpid(master), ple->fe->type); + return NULL; + } + return ple; +} + +static int autofs_open_pipefd(struct file_desc *d, int *new_fd) +{ + struct fdinfo_list_entry *fle = file_master(d); + int ret; + + if (fle->stage < FLE_OPEN) { + ret = open_pipe(d, new_fd); + if (ret != 0) + return ret; + set_fds_event(fle->pid); + return 1; + } + + return autofs_post_open(d, fle->fe->fd); +} + +static int autofs_create_pipe(struct pstree_item *task, autofs_info_t *i, + struct fdinfo_list_entry *ple) +{ + struct pipe_info *pi = container_of(ple->desc, struct pipe_info, d); + int fd = -1; + FdinfoEntry *fe; + unsigned flags = O_RDONLY; + struct file_desc_ops *ops; + PipeEntry *pe; + + fd = find_unused_fd(task, fd); + + ops = shmalloc(sizeof(*ops)); + if (!ops) + return -1; + memcpy(ops, pi->d.ops, sizeof(*ops)); + ops->open = autofs_open_pipefd; + ops->type = FD_TYPES__AUTOFS_PIPE; + + pe = shmalloc(sizeof(*pe)); + if (!pe) + return -1; + + pe->id = pi->pe->id; + pe->pipe_id = pi->pe->pipe_id; + pe->fown = pi->pe->fown; + pe->flags = flags; + + if (collect_one_pipe_ops(&i->pi, &pe->base, ops) < 0) { + pr_err("Failed to add pipe info for write end\n"); + return -1; + } + + fe = dup_fdinfo(ple->fe, fd, flags); + if (!fe) + return -1; + fe->type = FD_TYPES__AUTOFS_PIPE; + + pr_info("autofs: adding pipe fd %d, flags %#x to %d (with 
post_open)\n", + fe->fd, fe->flags, vpid(task)); + return collect_fd(vpid(task), fe, rsti(task), false); +} + +static int autofs_add_mount_info(struct pprep_head *ph) +{ + autofs_info_t *ai = container_of(ph, autofs_info_t, ph); + struct mount_info *mi = ai->mi; + autofs_info_t *info = mi->private; + AutofsEntry *entry = info->entry; + autofs_info_t *i; + struct pstree_item *master; + struct fdinfo_list_entry *ple; + + if (entry->fd == -1) + /* Catatonic mounts have no owner. Keep them with init. */ + master = pstree_item_by_virt(getpid()); + else + master = pstree_item_by_virt(entry->pgrp); + BUG_ON(!master); + + ple = autofs_pipe_le(master, entry); + if (!ple) + return -1; + + if (entry->has_read_fd) { + /* Original pipe write end was closed. + * We need create one to be able to fixup AutoFS mount. */ + + entry->fd = autofs_dup_pipe(master, ple, entry->fd); + if (entry->fd < 0) { + pr_err("Failed to find free fd in process %d\n", + vpid(master)); + return -1; + } + } + + i = autofs_create_info(mi, ple->desc, info); + if (!i) + return -1; + + /* Another pipe descriptor is needed to call post_open callback */ + if (autofs_create_pipe(master, i, ple)) + return -1; + + mi->private = i; + + return 0; +} + +static int autofs_restore_entry(struct mount_info *mi, AutofsEntry **entry) +{ + struct cr_img *img; + int ret; + + img = open_image(CR_FD_AUTOFS, O_RSTR, mi->s_dev); + if (!img) + return -1; + if (empty_image(img)) { + close_image(img); + return -1; + } + + ret = pb_read_one_eof(img, entry, PB_AUTOFS); + + close_image(img); + if (ret < 0) + return -1; + return 0; +} + +int autofs_mount(struct mount_info *mi, const char *source, const + char *filesystemtype, unsigned long mountflags) +{ + AutofsEntry *entry; + autofs_info_t *info; + char *opts, *mode; + int control_pipe[2], ret = -1; + struct stat buf; + + if (autofs_restore_entry(mi, &entry) < 0) + return -1; + + if (pipe(control_pipe) < 0) { + pr_perror("Can't create pipe"); + return -1; + } + + mode = "direct"; + 
if (entry->mode == AUTOFS_MODE_INDIRECT) + mode = "indirect"; + if (entry->mode == AUTOFS_MODE_OFFSET) + mode = "offset"; + + opts = xsprintf("fd=%d,pgrp=%d,minproto=%d,maxproto=%d,%s", + control_pipe[1], getpgrp(), entry->minproto, + entry->maxproto, mode); + if (opts && entry->has_uid) + opts = xstrcat(opts, ",uid=%d", entry->uid); + if (opts && entry->has_gid) + opts = xstrcat(opts, ",gid=%d", entry->gid); + if (!opts) { + pr_err("Failed to create options string\n"); + goto close_pipe; + } + + pr_info("autofs: mounting to %s with options: \"%s\"\n", + mi->mountpoint, opts); + + if (mount(source, mi->mountpoint, filesystemtype, mountflags, opts) < 0) { + pr_perror("Failed to mount autofs to %s", mi->mountpoint); + goto free_opts; + } + + info = xmalloc(sizeof(*info)); + if (!info) + goto umount; + info->entry = entry; + + /* We need autofs dev_id to be able to open direct mount point. + * But we can't call stat in autofs_add_mount_info(), because autofs + * mount can be overmounted. Thus we have to call it here. But shared + * data is not ready yet. So, let's put in on mi->private and copy to + * shared data in autofs_add_mount_info(). 
+ */ + if (stat(mi->mountpoint, &buf) < 0) { + pr_perror("Failed to stat %s", mi->mountpoint); + goto free_info; + } + info->mnt_dev = buf.st_dev; + + /* We need to create dentries for nested mounts */ + ret = autofs_populate_mount(mi, entry); + if (ret < 0) + goto free_info; + + /* In case of catatonic mounts all we need as the function call below */ + ret = autofs_post_mount(mi->mountpoint, buf.st_dev, entry->timeout); + if (ret < 0) + goto free_info; + + /* Otherwise we have to add shared object creation callback */ + if (entry->fd != AUTOFS_CATATONIC_FD) { + info->ph.actor = autofs_add_mount_info; + add_post_prepare_cb(&info->ph); + } + + info->mi = mi; + mi->private = info; + +free_opts: + free(opts); +close_pipe: + close(control_pipe[1]); + close(control_pipe[0]); + return ret; + +free_info: + free(info); +umount: + if (umount(mi->mountpoint) < 0) + pr_perror("Failed to umount %s", mi->mountpoint); + goto close_pipe; +} + diff --git a/CRIU_code/criu/bfd.c b/CRIU_code/criu/bfd.c new file mode 100644 index 0000000..0582455 --- /dev/null +++ b/CRIU_code/criu/bfd.c @@ -0,0 +1,333 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "int.h" +#include "log.h" +#include "common/bug.h" +#include "bfd.h" +#include "common/list.h" +#include "util.h" +#include "xmalloc.h" +#include "page.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "bfd: " + +/* + * Kernel doesn't produce more than one page of + * date per one read call on proc files. 
+ */ +#define BUFSIZE (PAGE_SIZE) + +struct bfd_buf { + char *mem; + struct list_head l; +}; + +static LIST_HEAD(bufs); + +#define BUFBATCH (16) + +static int buf_get(struct xbuf *xb) +{ + struct bfd_buf *b; + + if (list_empty(&bufs)) { + void *mem; + int i; + + mem = mmap(NULL, BUFBATCH * BUFSIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (mem == MAP_FAILED) { + pr_perror("No buf"); + return -1; + } + + for (i = 0; i < BUFBATCH; i++) { + b = xmalloc(sizeof(*b)); + if (!b) { + if (i == 0) { + pr_err("No buffer for bfd\n"); + return -1; + } + + pr_warn("BFD buffers partial refil!\n"); + break; + } + + b->mem = mem + i * BUFSIZE; + list_add_tail(&b->l, &bufs); + } + } + + b = list_first_entry(&bufs, struct bfd_buf, l); + list_del_init(&b->l); + + xb->mem = b->mem; + xb->data = xb->mem; + xb->sz = 0; + xb->buf = b; + return 0; +} + +static void buf_put(struct xbuf *xb) +{ + /* + * Don't unmap buffer back, it will get reused + * by next bfdopen call + */ + list_add(&xb->buf->l, &bufs); + xb->buf = NULL; + xb->mem = NULL; + xb->data = NULL; +} + +static int bfdopen(struct bfd *f, bool writable) +{ + if (buf_get(&f->b)) { + close_safe(&f->fd); + return -1; + } + + f->writable = writable; + return 0; +} + +int bfdopenr(struct bfd *f) +{ + return bfdopen(f, false); +} + +int bfdopenw(struct bfd *f) +{ + return bfdopen(f, true); +} + +static int bflush(struct bfd *bfd); +static bool flush_failed = false; + +int bfd_flush_images(void) +{ + return flush_failed ? -1 : 0; +} + +void bclose(struct bfd *f) +{ + if (bfd_buffered(f)) { + if (f->writable && bflush(f) < 0) { + /* + * This is to propagate error up. It's + * hardly possible by returning and + * checking it, but setting a static + * flag, failing further bfdopen-s and + * checking one at the end would work. 
+ */ + flush_failed = true; + pr_perror("Error flushing image"); + } + + buf_put(&f->b); + } + close_safe(&f->fd); +} + +static int brefill(struct bfd *f) +{ + int ret; + struct xbuf *b = &f->b; + + memmove(b->mem, b->data, b->sz); + b->data = b->mem; + + ret = read(f->fd, b->mem + b->sz, BUFSIZE - b->sz); + if (ret < 0) { + pr_perror("Error reading file"); + return -1; + } + + if (ret == 0) + return 0; + + b->sz += ret; + return 1; +} + +static char *strnchr(char *str, unsigned int len, char c) +{ + while (len > 0 && *str != c) { + str++; + len--; + } + + return len == 0 ? NULL : str; +} + +char *breadline(struct bfd *f) +{ + return breadchr(f, '\n'); +} + +char *breadchr(struct bfd *f, char c) +{ + struct xbuf *b = &f->b; + bool refilled = false; + char *n; + unsigned int ss = 0; + +again: + n = strnchr(b->data + ss, b->sz - ss, c); + if (n) { + char *ret; + + ret = b->data; + b->data = n + 1; /* skip the \n found */ + *n = '\0'; + b->sz -= (b->data - ret); + return ret; + } + + if (refilled) { + if (!b->sz) + return NULL; + + if (b->sz == BUFSIZE) { + pr_err("The bfd buffer is too small\n"); + ERR_PTR(-EIO); + return NULL; + } + /* + * Last bytes may lack the \n at the + * end, need to report this as full + * line anyway + */ + b->data[b->sz] = '\0'; + + /* + * The b->data still points to old data, + * but we say that no bytes left there + * so next call to breadline will not + * "find" these bytes again. + */ + b->sz = 0; + return b->data; + } + + /* + * small optimization -- we've scanned b->sz + * symbols already, no need to re-scan them after + * the buffer refill. 
+ */ + ss = b->sz; + + /* no full line in the buffer -- refill one */ + if (brefill(f) < 0) + return ERR_PTR(-EIO); + + refilled = true; + + goto again; +} + +static int bflush(struct bfd *bfd) +{ + struct xbuf *b = &bfd->b; + int ret; + + if (!b->sz) + return 0; + + ret = write(bfd->fd, b->data, b->sz); + if (ret != b->sz) + return -1; + + b->sz = 0; + return 0; +} + +static int __bwrite(struct bfd *bfd, const void *buf, int size) +{ + struct xbuf *b = &bfd->b; + + if (b->sz + size > BUFSIZE) { + int ret; + ret = bflush(bfd); + if (ret < 0) + return ret; + } + + if (size > BUFSIZE) + return write(bfd->fd, buf, size); + + memcpy(b->data + b->sz, buf, size); + b->sz += size; + return size; +} + +int bwrite(struct bfd *bfd, const void *buf, int size) +{ + if (!bfd_buffered(bfd)) + return write(bfd->fd, buf, size); + + return __bwrite(bfd, buf, size); +} + +int bwritev(struct bfd *bfd, const struct iovec *iov, int cnt) +{ + int i, written = 0; + + if (!bfd_buffered(bfd)) + return writev(bfd->fd, iov, cnt); + + for (i = 0; i < cnt; i++) { + int ret; + + ret = __bwrite(bfd, (const void *)iov[i].iov_base, iov[i].iov_len); + if (ret < 0) + return ret; + + written += ret; + if (ret < iov[i].iov_len) + break; + } + + return written; +} + +int bread(struct bfd *bfd, void *buf, int size) +{ + struct xbuf *b = &bfd->b; + int more = 1, filled = 0; + + if (!bfd_buffered(bfd)) + return read(bfd->fd, buf, size); + + while (more > 0) { + int chunk; + + chunk = size - filled; + if (chunk > b->sz) + chunk = b->sz; + + if (chunk) { + memcpy(buf + filled, b->data, chunk); + b->data += chunk; + b->sz -= chunk; + filled += chunk; + } + + if (filled < size) + more = brefill(bfd); + else { + BUG_ON(filled > size); + more = 0; + } + } + + return more < 0 ? 
more : filled; +} diff --git a/CRIU_code/criu/bitmap.c b/CRIU_code/criu/bitmap.c new file mode 100644 index 0000000..a28a89d --- /dev/null +++ b/CRIU_code/criu/bitmap.c @@ -0,0 +1,54 @@ +#include "common/bitsperlong.h" + +#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) + +#define BITMAP_FIRST_WORD_MASK(start) (~0ul << ((start) % BITS_PER_LONG)) + +#define BITMAP_LAST_WORD_MASK(nbits) \ +( \ + ((nbits) % BITS_PER_LONG) ? \ + (1ul << ((nbits) % BITS_PER_LONG)) - 1 : ~0ul \ +) + +#define small_const_nbits(nbits) \ + (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG) + +void bitmap_set(unsigned long *map, int start, int nr) +{ + unsigned long *p = map + BIT_WORD(start); + const int size = start + nr; + int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); + unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start); + + while (nr - bits_to_set >= 0) { + *p |= mask_to_set; + nr -= bits_to_set; + bits_to_set = BITS_PER_LONG; + mask_to_set = ~0UL; + p++; + } + if (nr) { + mask_to_set &= BITMAP_LAST_WORD_MASK(size); + *p |= mask_to_set; + } +} + +void bitmap_clear(unsigned long *map, int start, int nr) +{ + unsigned long *p = map + BIT_WORD(start); + const int size = start + nr; + int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); + unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); + + while (nr - bits_to_clear >= 0) { + *p &= ~mask_to_clear; + nr -= bits_to_clear; + bits_to_clear = BITS_PER_LONG; + mask_to_clear = ~0UL; + p++; + } + if (nr) { + mask_to_clear &= BITMAP_LAST_WORD_MASK(size); + *p &= ~mask_to_clear; + } +} diff --git a/CRIU_code/criu/cgroup-props.c b/CRIU_code/criu/cgroup-props.c new file mode 100644 index 0000000..ecd9593 --- /dev/null +++ b/CRIU_code/criu/cgroup-props.c @@ -0,0 +1,578 @@ +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "int.h" +#include "common/config.h" +#include "common/compiler.h" +#include "cgroup-props.h" +#include "cr_options.h" +#include "xmalloc.h" +#include 
"string.h" +#include "util.h" +#include "common/list.h" +#include "log.h" +#include "common/bug.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "cg-prop: " + +enum { + CGP_MERGE, + CGP_REPLACE, +}; + +static const char *____criu_global_props____[] = { + "cgroup.clone_children", + "notify_on_release", + "cgroup.procs", + "tasks", +}; + +cgp_t cgp_global = { + .name = "____criu_global_props____", + .nr_props = ARRAY_SIZE(____criu_global_props____), + .props = ____criu_global_props____, +}; + +typedef struct { + struct list_head list; + cgp_t cgp; +} cgp_list_entry_t; + +static LIST_HEAD(cgp_list); + +static void cgp_free(cgp_list_entry_t *p) +{ + size_t i; + + if (p) { + for (i = 0; i < p->cgp.nr_props; i++) + xfree((void *)p->cgp.props[i]); + xfree((void *)p->cgp.name); + xfree((void *)p->cgp.props); + xfree(p); + } +} + +static int cgp_merge_props(cgp_list_entry_t *d, cgp_list_entry_t *s) +{ + size_t nr_props, i, j; + + nr_props = d->cgp.nr_props + s->cgp.nr_props; + if (xrealloc_safe(&d->cgp.props, nr_props * sizeof(char *))) + return -ENOMEM; + + /* + * FIXME: Check for duplicates in propties? + */ + for (i = d->cgp.nr_props, j = 0; i < nr_props; i++, j++) { + d->cgp.props[i] = xstrdup(s->cgp.props[j]); + if (!d->cgp.props[i]) + return -ENOMEM; + d->cgp.nr_props++; + } + + return 0; +} + +static int cgp_handle_props(cgp_list_entry_t **p, int strategy) +{ + cgp_list_entry_t *s = *p; + cgp_list_entry_t *t; + + list_for_each_entry(t, &cgp_list, list) { + if (strcmp(t->cgp.name, s->cgp.name)) + continue; + + pr_debug("%s \"%s\" controller properties\n", + strategy == CGP_MERGE ? + "Merging" : "Replacing", + s->cgp.name); + + if (strategy == CGP_MERGE) { + int ret; + + ret = cgp_merge_props(t, s); + cgp_free(s); + *p = NULL; + return ret; + } else if (strategy == CGP_REPLACE) { + /* + * Simply drop out previous instance. + */ + list_del(&t->list); + cgp_free(t); + break; + } else + BUG(); + } + + /* + * New controller, simply add it. 
/*
 * Consume the single character @sym from the front of @stream.
 *
 * With @skip_ws set, skip_spaces() is applied to the stream first.
 * On a match the stream is advanced by one character and @len is reduced
 * accordingly.
 *
 * Returns true when @sym was consumed, false otherwise.
 */
static bool eat_symbol(char **stream, size_t *len, char sym, bool skip_ws)
{
	char *p = skip_ws ? skip_spaces(stream, len) : (stream ? *stream : NULL);

	/*
	 * Test the remaining length before looking at *p: skipping spaces may
	 * have used up the whole stream, in which case p points one past the
	 * last valid byte.
	 */
	if (!p || !*len || *p != sym)
		return false;
	(*stream) = p + 1;
	(*len)--;
	return true;
}
*stream : NULL); + char *from = p + 1; + char *dst; + + if (!p || *p != '\"') + return NULL; + + for (p = from, (*len)--; (*len); p++, (*len)--) { + if (*p == '\"') { + if (p == from) + break; + dst = xmalloc(p - from + 1); + if (!dst) + break; + + memcpy(dst, from, p - from); + dst[p - from] = '\0'; + + (*stream) = p + 1; + (*len)--; + return dst; + } + } + + return NULL; +} + +static int cgp_parse_stream(char *stream, size_t len) +{ + cgp_list_entry_t *cgp_entry = NULL; + int strategy; + int ret = 0; + char *p; + + /* + * We expect the following format here + * (very simplified YAML!) + * + * "cpu": + * - "strategy": "replace" + * - "properties": ["cpu.shares", "cpu.cfs_period_us"] + * "memory": + * - "strategy": "merge" + * - "properties": ["memory.limit_in_bytes", "memory.memsw.limit_in_bytes"] + * + * and etc. + */ + + while (len) { + /* + * Controller name. + */ + p = get_quoted(&stream, &len, false); + if (!p) { + pr_err("Expecting controller name\n"); + goto err_parse; + } + + pr_info("Parsing controller \"%s\"\n", p); + + cgp_entry = xzalloc(sizeof(*cgp_entry)); + if (cgp_entry) { + INIT_LIST_HEAD(&cgp_entry->list); + cgp_entry->cgp.name = p; + } else { + pr_err("Can't allocate memory for controller %s\n", p); + xfree(p); + return -ENOMEM; + } + + if (!eat_symbols(&stream, &len, ":\n - ", 5, true)) { + pr_err("Expected \':\\n - \' sequence controller's %s stream\n", + cgp_entry->cgp.name); + goto err_parse; + } + + if (!eat_word(&stream, &len, "\"strategy\":", 11, true)) { + pr_err("Expected \'strategy:\' keyword in controller's %s stream\n", + cgp_entry->cgp.name); + goto err_parse; + } + + p = get_quoted(&stream, &len, true); + if (!p) { + pr_err("Expected strategy in controller's %s stream\n", + cgp_entry->cgp.name); + goto err_parse; + }; + + if (!strcmp(p, "merge")) { + strategy = CGP_MERGE; + } else if (!strcmp(p, "replace")) { + strategy = CGP_REPLACE; + } else { + pr_err("Unknown strategy \"%s\" in controller's %s stream\n", + p, 
cgp_entry->cgp.name); + xfree(p); + goto err_parse; + } + + pr_info("\tStrategy \"%s\"\n", p); + xfree(p); + + if (!eat_symbols(&stream, &len, "\n - ", 4, true)) { + pr_err("Expected \':\\n - \' sequence controller's %s stream\n", + cgp_entry->cgp.name); + goto err_parse; + } + + if (!eat_word(&stream, &len, "\"properties\":", 13, true)) { + pr_err("Expected \"properties:\" keyword in controller's %s stream\n", + cgp_entry->cgp.name); + goto err_parse; + } + + if (!eat_symbol(&stream, &len, '[', true)) { + pr_err("Expected \'[\' sequence controller's %s properties stream\n", + cgp_entry->cgp.name); + goto err_parse; + } + + while ((p = get_quoted(&stream, &len, true))) { + if (!p) { + pr_err("Expected property name for controller %s\n", + cgp_entry->cgp.name); + goto err_parse; + } + + if (xrealloc_safe(&cgp_entry->cgp.props, + (cgp_entry->cgp.nr_props + 1) * sizeof(char *))) { + pr_err("Can't allocate property for controller %s\n", + cgp_entry->cgp.name); + goto err_parse; + } + + cgp_entry->cgp.props[cgp_entry->cgp.nr_props++] = p; + pr_info("\tProperty \"%s\"\n", p); + + if (!eat_symbol(&stream, &len, ',', true)) { + if (stream[0] == ']') { + stream++, len--; + break; + } + pr_err("Expected ']' in controller's %s stream\n", + cgp_entry->cgp.name); + goto err_parse; + } + } + + if (cgp_entry->cgp.nr_props == 0 && !eat_symbol(&stream, &len, ']', true)) { + pr_err("Expected ']' in empty property list for %s\n", cgp_entry->cgp.name); + goto err_parse; + } + + if (!eat_symbol(&stream, &len, '\n', true) && len) { + pr_err("Expected \'\\n\' symbol in controller's %s stream\n", + cgp_entry->cgp.name); + goto err_parse; + } + + if (cgp_handle_props(&cgp_entry, strategy)) + goto err_parse; + + cgp_entry = NULL; + } + + ret = 0; +out: + return ret; + +err_parse: + cgp_free(cgp_entry); + ret = -EINVAL; + goto out; +} + +static int cgp_parse_file(char *path) +{ + void *mem = MAP_FAILED; + int fd = -1, ret = -1; + struct stat st; + + fd = open(path, O_RDONLY); + if (fd < 0) { 
+ pr_perror("Can't open file %s", path); + goto err; + } + + if (fstat(fd, &st)) { + pr_perror("Can't stat file %s", path); + goto err; + } + + mem = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FILE, fd, 0); + if (mem == MAP_FAILED) { + pr_perror("Can't mmap file %s", path); + goto err; + } + + if (cgp_parse_stream(mem, st.st_size)) { + pr_err("Failed to parse file `%s'\n", path); + goto err; + } + + ret = 0; +err: + if (mem != MAP_FAILED) + munmap(mem, st.st_size); + close_safe(&fd); + return ret; +} + +static int cgp_parse_builtins(void) +{ + static const char predefined_stream[] = + "\"cpu\":\n" + " - \"strategy\": \"replace\"\n" + " - \"properties\": " + "[ " + "\"cpu.shares\", " + "\"cpu.cfs_period_us\", " + "\"cpu.cfs_quota_us\", " + "\"cpu.rt_period_us\", " + "\"cpu.rt_runtime_us\" " + "]\n" + /* limit_in_bytes and memsw.limit_in_bytes must be set in this order */ + "\"memory\":\n" + " - \"strategy\": \"replace\"\n" + " - \"properties\": " + "[ " + "\"memory.limit_in_bytes\", " + "\"memory.memsw.limit_in_bytes\", " + "\"memory.swappiness\", " + "\"memory.soft_limit_in_bytes\", " + "\"memory.move_charge_at_immigrate\", " + "\"memory.oom_control\", " + "\"memory.use_hierarchy\", " + "\"memory.kmem.limit_in_bytes\", " + "\"memory.kmem.tcp.limit_in_bytes\" " + "]\n" + /* + * cpuset.cpus and cpuset.mems must be set before the process moves + * into its cgroup; they are "initialized" below to whatever the root + * values are in copy_special_cg_props so as not to cause ENOSPC when + * values are restored via this code. 
/*
 * Load controller property descriptions.
 *
 * The built-in table is parsed first, then the optional in-memory @stream
 * of @len bytes, then the optional file at @path.  Processing stops at the
 * first step that fails.
 *
 * Returns 0 on success or the non-zero result of the failing step.
 */
int cgp_init(char *stream, size_t len, char *path)
{
	int err;

	err = cgp_parse_builtins();
	if (err)
		return err;

	if (stream != NULL && len != 0) {
		err = cgp_parse_stream(stream, len);
		if (err)
			return err;
	}

	return path != NULL ? cgp_parse_file(path) : 0;
}
dump\n", name); + nr_dump_controllers++; + return true; +} + +bool cgp_should_skip_controller(const char *name) +{ + size_t i; + + /* + * Dump all by default. + */ + if (!nr_dump_controllers) + return false; + + for (i = 0; i < nr_dump_controllers; i++) { + if (!strcmp(name, dump_controllers[i])) + return false; + } + + return true; +} + +const cgp_t *cgp_get_props(const char *name) +{ + cgp_list_entry_t *p; + + list_for_each_entry(p, &cgp_list, list) { + if (!strcmp(p->cgp.name, name)) + return &p->cgp; + } + + return NULL; +} + +void cgp_fini(void) +{ + cgp_list_entry_t *p, *t; + size_t i; + + list_for_each_entry_safe(p, t, &cgp_list, list) + cgp_free(p); + INIT_LIST_HEAD(&cgp_list); + + for (i = 0; i < nr_dump_controllers; i++) + xfree(dump_controllers[i]); + xfree(dump_controllers); + nr_dump_controllers = 0; +} diff --git a/CRIU_code/criu/cgroup.c b/CRIU_code/criu/cgroup.c new file mode 100644 index 0000000..332c79f --- /dev/null +++ b/CRIU_code/criu/cgroup.c @@ -0,0 +1,1915 @@ +#define LOG_PREFIX "cg: " +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "common/list.h" +#include "xmalloc.h" +#include "cgroup.h" +#include "cgroup-props.h" +#include "cr_options.h" +#include "pstree.h" +#include "criu-log.h" +#include "util.h" +#include "imgset.h" +#include "util-pie.h" +#include "namespaces.h" +#include "seize.h" +#include "string.h" +#include "protobuf.h" +#include "images/core.pb-c.h" +#include "images/cgroup.pb-c.h" + +/* + * This structure describes set of controller groups + * a task lives in. The cg_ctl entries are stored in + * the @ctls list sorted by the .name field and then + * by the .path field. 
+ */ + +struct cg_set { + u32 id; + struct list_head l; + unsigned int n_ctls; + struct list_head ctls; +}; + +static LIST_HEAD(cg_sets); +static unsigned int n_sets; +static CgSetEntry **rst_sets; +static unsigned int n_controllers; +static CgControllerEntry **controllers; +static char *cg_yard; +static struct cg_set *root_cgset; /* Set root item lives in */ +static struct cg_set *criu_cgset; /* Set criu process lives in */ +static u32 cg_set_ids = 1; + +static LIST_HEAD(cgroups); +static unsigned int n_cgroups; + +static CgSetEntry *find_rst_set_by_id(u32 id) +{ + int i; + + for (i = 0; i < n_sets; i++) + if (rst_sets[i]->id == id) + return rst_sets[i]; + + return NULL; +} + +#define CGCMP_MATCH 1 /* check for exact match */ +#define CGCMP_ISSUB 2 /* check set is subset of ctls */ + +static bool cg_set_compare(struct cg_set *set, struct list_head *ctls, int what) +{ + struct list_head *l1 = &set->ctls, *l2 = ctls; + + while (1) { + struct cg_ctl *c1 = NULL, *c2 = NULL; + + if (l1->next != &set->ctls) + c1 = list_first_entry(l1, struct cg_ctl, l); + if (l2->next != ctls) + c2 = list_first_entry(l2, struct cg_ctl, l); + + if (!c1 || !c2) /* Nowhere to move next */ + return !c1 && !c2; /* Both lists scanned -- match */ + + if (strcmp(c1->name, c2->name)) + return false; + + switch (what) { + case CGCMP_MATCH: + /* must have the same cgns prefix to be considered equal */ + if (c1->cgns_prefix != c2->cgns_prefix) + return false; + + if (strcmp(c1->path, c2->path)) + return false; + + break; + case CGCMP_ISSUB: + if (!strstartswith(c1->path, c2->path)) + return false; + + break; + } + + l1 = l1->next; + l2 = l2->next; + } +} + +static int collect_cgroups(struct list_head *ctls); + +static struct cg_set *get_cg_set(struct list_head *ctls, unsigned int n_ctls, bool collect) +{ + struct cg_set *cs; + + list_for_each_entry(cs, &cg_sets, l) + if (cg_set_compare(cs, ctls, CGCMP_MATCH)) { + pr_debug(" `- Existing css %d found\n", cs->id); + put_ctls(ctls); + return cs; + } + 
+ pr_debug(" `- New css ID %d\n", cg_set_ids); + cs = xmalloc(sizeof(*cs)); + if (cs) { + cs->id = cg_set_ids++; + INIT_LIST_HEAD(&cs->ctls); + list_splice_init(ctls, &cs->ctls); + cs->n_ctls = n_ctls; + list_add_tail(&cs->l, &cg_sets); + n_sets++; + + if (!pr_quelled(LOG_DEBUG)) { + struct cg_ctl *ctl; + + list_for_each_entry(ctl, &cs->ctls, l) + pr_debug(" `- [%s] -> [%s] [%u]\n", ctl->name, ctl->path, ctl->cgns_prefix); + } + + if (collect && collect_cgroups(&cs->ctls)) { + list_del(&cs->l); + n_sets--; + put_ctls(&cs->ctls); + xfree(cs); + return NULL; + } + } + + return cs; +} + +struct cg_controller *new_controller(const char *name) +{ + struct cg_controller *nc = xmalloc(sizeof(*nc)); + if (!nc) + return NULL; + + nc->controllers = xmalloc(sizeof(char *)); + if (!nc->controllers) { + xfree(nc); + return NULL; + } + + nc->controllers[0] = xstrdup(name); + if (!nc->controllers[0]) { + xfree(nc->controllers); + xfree(nc); + return NULL; + } + + nc->n_controllers = 1; + + nc->n_heads = 0; + INIT_LIST_HEAD(&nc->heads); + + return nc; +} + +int parse_cg_info(void) +{ + if (collect_controllers(&cgroups, &n_cgroups) < 0) + return -1; + + return 0; +} + +/* Check that co-mounted controllers from /proc/cgroups (e.g. cpu and cpuacct) + * are contained in a comma separated string (e.g. from /proc/self/cgroup or + * mount options). 
/*
 * Remove one trailing '\n' from @str, in place, if there is one.
 * Returns @str.
 */
static inline char *strip(char *str)
{
	size_t n = strlen(str);

	if (n > 0 && str[n - 1] == '\n')
		str[n - 1] = '\0';

	return str;
}
+ */ +static int read_cgroup_prop(struct cgroup_prop *property, const char *fullpath) +{ + char buf[1024]; + int fd, ret; + struct stat sb; + + fd = open(fullpath, O_RDONLY); + if (fd == -1) { + property->value = NULL; + pr_perror("Failed opening %s", fullpath); + return -1; + } + + if (fstat(fd, &sb) < 0) { + pr_perror("failed statting cgroup prop %s", fullpath); + close(fd); + return -1; + } + + property->mode = sb.st_mode; + property->uid = sb.st_uid; + property->gid = sb.st_gid; + + /* skip dumping the value of these, since it doesn't make sense (we + * just want to restore the perms) */ + if (!strcmp(property->name, "cgroup.procs") || !strcmp(property->name, "tasks")) { + ret = 0; + /* libprotobuf segfaults if we leave a null pointer in a + * string, so let's not do that */ + property->value = xstrdup(""); + if (!property->value) + ret = -1; + + close(fd); + return ret; + } + + ret = read(fd, buf, sizeof(buf) - 1); + if (ret == -1) { + pr_err("Failed scanning %s\n", fullpath); + close(fd); + return -1; + } + close(fd); + + buf[ret] = 0; + + if (strtoll(buf, NULL, 10) == LLONG_MAX) + strcpy(buf, "-1"); + + property->value = xstrdup(strip(buf)); + if (!property->value) + return -1; + return 0; +} + +static struct cgroup_prop *create_cgroup_prop(const char *name) +{ + struct cgroup_prop *property; + + property = xmalloc(sizeof(*property)); + if (!property) + return NULL; + + property->name = xstrdup(name); + if (!property->name) { + xfree(property); + return NULL; + } + + property->value = NULL; + return property; +} + +static void free_cgroup_prop(struct cgroup_prop *prop) +{ + xfree(prop->name); + xfree(prop->value); + xfree(prop); +} + +static void free_all_cgroup_props(struct cgroup_dir *ncd) +{ + struct cgroup_prop *prop, *t; + + list_for_each_entry_safe(prop, t, &ncd->properties, list) { + list_del(&prop->list); + free_cgroup_prop(prop); + } + + INIT_LIST_HEAD(&ncd->properties); + ncd->n_properties = 0; +} + +static int dump_cg_props_array(const char 
*fpath, struct cgroup_dir *ncd, const cgp_t *cgp) +{ + int j; + char buf[PATH_MAX]; + struct cgroup_prop *prop; + + for (j = 0; cgp && j < cgp->nr_props; j++) { + if (snprintf(buf, PATH_MAX, "%s/%s", fpath, cgp->props[j]) >= PATH_MAX) { + pr_err("snprintf output was truncated\n"); + return -1; + } + + if (access(buf, F_OK) < 0 && errno == ENOENT) { + pr_info("Couldn't open %s. This cgroup property may not exist on this kernel\n", buf); + continue; + } + + prop = create_cgroup_prop(cgp->props[j]); + if (!prop) { + free_all_cgroup_props(ncd); + return -1; + } + + if (read_cgroup_prop(prop, buf) < 0) { + free_cgroup_prop(prop); + free_all_cgroup_props(ncd); + return -1; + } + + if (!strcmp("memory.oom_control", cgp->props[j])) { + char *new; + int disable; + + if (sscanf(prop->value, "oom_kill_disable %d\n", &disable) != 1) { + pr_err("couldn't scan oom state from %s\n", prop->value); + free_cgroup_prop(prop); + free_all_cgroup_props(ncd); + return -1; + } + + if (asprintf(&new, "%d", disable) < 0) { + pr_err("couldn't allocate new oom value\n"); + free_cgroup_prop(prop); + free_all_cgroup_props(ncd); + return -1; + } + + xfree(prop->value); + prop->value = new; + } + + pr_info("Dumping value %s from %s/%s\n", prop->value, fpath, prop->name); + list_add_tail(&prop->list, &ncd->properties); + ncd->n_properties++; + } + + return 0; +} + +static int add_cgroup_properties(const char *fpath, struct cgroup_dir *ncd, + struct cg_controller *controller) +{ + int i; + + for (i = 0; i < controller->n_controllers; ++i) { + const cgp_t *cgp = cgp_get_props(controller->controllers[i]); + + if (dump_cg_props_array(fpath, ncd, cgp) < 0) { + pr_err("dumping known properties failed\n"); + return -1; + } + + if (dump_cg_props_array(fpath, ncd, &cgp_global) < 0) { + pr_err("dumping global properties failed\n"); + return -1; + } + } + + return 0; +} + +static int add_cgroup(const char *fpath, const struct stat *sb, int typeflag) +{ + struct cgroup_dir *ncd = NULL, *match; + int exit_code 
= -1; + + if (typeflag == FTW_D) { + int mtype; + + pr_info("adding cgroup %s\n", fpath); + + ncd = xmalloc(sizeof(*ncd)); + if (!ncd) + goto out; + + ncd->mode = sb->st_mode; + ncd->uid = sb->st_uid; + ncd->gid = sb->st_gid; + + /* chop off the first "/proc/self/fd/N" str */ + if (fpath[path_pref_len] == '\0') + ncd->path = xstrdup("/"); + else + ncd->path = xstrdup(fpath + path_pref_len); + + if (!ncd->path) + goto out; + + mtype = find_dir(ncd->path, ¤t_controller->heads, &match); + + switch (mtype) { + /* ignore co-mounted cgroups and already dumped cgroups */ + case EXACT_MATCH: + exit_code = 0; + goto out; + case PARENT_MATCH: + list_add_tail(&ncd->siblings, &match->children); + match->n_children++; + break; + case NO_MATCH: + list_add_tail(&ncd->siblings, ¤t_controller->heads); + current_controller->n_heads++; + break; + default: + BUG(); + } + + INIT_LIST_HEAD(&ncd->children); + ncd->n_children = 0; + + INIT_LIST_HEAD(&ncd->properties); + ncd->n_properties = 0; + if (add_cgroup_properties(fpath, ncd, current_controller) < 0) { + list_del(&ncd->siblings); + if (mtype == PARENT_MATCH) + match->n_children--; + else if (mtype == NO_MATCH) + current_controller->n_heads--; + goto out; + } + } + + return 0; + +out: + if (ncd) + xfree(ncd->path); + xfree(ncd); + return exit_code; +} + +static int add_freezer_state(struct cg_controller *controller) +{ + struct cgroup_dir *it; + + /* There is one more case, that cgroup namespaces might + * generate "multiple" heads if nothing is actually in the + * root freezer cgroup, e.g. --freeze-cgroup=/lxc/foo and all + * tasks in either /lxc/foo/a or /lxc/foo/b. 
+ * + * In this case + */ + list_for_each_entry(it, &controller->heads, siblings) { + struct cgroup_dir *cg_head; + struct cgroup_prop *prop; + + cg_head = list_first_entry(&controller->heads, struct cgroup_dir, siblings); + + prop = create_cgroup_prop("freezer.state"); + if (!prop) + return -1; + prop->value = xstrdup(get_real_freezer_state()); + if (!prop->value) { + free_cgroup_prop(prop); + return -1; + } + + list_add_tail(&prop->list, &cg_head->properties); + cg_head->n_properties++; + } + + return 0; +} + +static int collect_cgroups(struct list_head *ctls) +{ + struct cg_ctl *cc; + int ret = 0; + int fd = -1; + + list_for_each_entry(cc, ctls, l) { + char path[PATH_MAX], mopts[1024], *root; + char prefix[] = ".criu.cgmounts.XXXXXX"; + struct cg_controller *cg; + struct cg_root_opt *o; + + current_controller = NULL; + + /* We should get all the "real" (i.e. not name=systemd type) + * controller from parse_cgroups(), so find that controller if + * it exists. */ + list_for_each_entry(cg, &cgroups, l) { + if (cgroup_contains(cg->controllers, cg->n_controllers, cc->name, NULL)) { + current_controller = cg; + break; + } + } + + if (!current_controller) { + /* only allow "fake" controllers to be created this way */ + if (!strstartswith(cc->name, "name=")) { + pr_err("controller %s not found\n", cc->name); + return -1; + } else { + struct cg_controller *nc; + + nc = new_controller(cc->name); + if (!nc) + return -1; + list_add_tail(&nc->l, &cg->l); + n_cgroups++; + current_controller = nc; + } + } + + if (!opts.manage_cgroups) + continue; + + if (strstartswith(cc->name, "name=")) + snprintf(mopts, sizeof(mopts), "none,%s", cc->name); + else + snprintf(mopts, sizeof(mopts), "%s", cc->name); + + if (mkdtemp(prefix) == NULL) { + pr_perror("can't make dir for cg mounts"); + return -1; + } + + if (mount("none", prefix, "cgroup", 0, mopts) < 0) { + pr_perror("couldn't mount %s", mopts); + rmdir(prefix); + return -1; + } + + fd = open_detach_mount(prefix); + if (fd < 0) + 
return -1; + + path_pref_len = snprintf(path, PATH_MAX, "/proc/self/fd/%d", fd); + + root = cc->path; + if (opts.new_global_cg_root) + root = opts.new_global_cg_root; + + list_for_each_entry(o, &opts.new_cgroup_roots, node) { + if (!strcmp(cc->name, o->controller)) + root = o->newroot; + } + + snprintf(path + path_pref_len, PATH_MAX - path_pref_len, "%s", root); + + ret = ftw(path, add_cgroup, 4); + if (ret < 0) + pr_perror("failed walking %s for empty cgroups", path); + + close_safe(&fd); + + if (ret < 0) + return ret; + + if (opts.freeze_cgroup && !strcmp(cc->name, "freezer") && + add_freezer_state(current_controller)) + return -1; + } + + return 0; +} + +int dump_task_cgroup(struct pstree_item *item, u32 *cg_id, struct parasite_dump_cgroup_args *args) +{ + int pid; + LIST_HEAD(ctls); + unsigned int n_ctls = 0; + struct cg_set *cs; + + if (item) + pid = item->pid->real; + else + pid = getpid(); + + pr_info("Dumping cgroups for %d\n", pid); + if (parse_task_cgroup(pid, args, &ctls, &n_ctls)) + return -1; + + cs = get_cg_set(&ctls, n_ctls, item); + if (!cs) + return -1; + + if (!item) { + BUG_ON(criu_cgset); + criu_cgset = cs; + pr_info("Set %d is criu one\n", cs->id); + } else { + if (item == root_item) { + BUG_ON(root_cgset); + root_cgset = cs; + pr_info("Set %d is root one\n", cs->id); + } else { + struct cg_ctl *root, *stray; + + BUG_ON(!root_cgset); + pr_info("Set %d is a stray\n", cs->id); + + /* Copy the cgns prefix from the root cgset for each + * controller. This is ok because we know that there is + * only one cgroup namespace. 
+ */ + list_for_each_entry(root, &root_cgset->ctls, l) { + list_for_each_entry(stray, &cs->ctls, l) { + if (strcmp(root->name, stray->name)) + continue; + + if (strlen(stray->path) < root->cgns_prefix) { + pr_err("cg %s shorter than path prefix %d?\n", stray->path, root->cgns_prefix); + return -1; + } + + stray->cgns_prefix = root->cgns_prefix; + } + } + } + } + + *cg_id = cs->id; + return 0; +} + +static int dump_cg_dir_props(struct list_head *props, size_t n_props, + CgroupPropEntry ***ents) +{ + struct cgroup_prop *prop_cur; + CgroupPropEntry *cpe; + void *m; + int i = 0; + + m = xmalloc(n_props * (sizeof(CgroupPropEntry *) + sizeof(CgroupPropEntry))); + *ents = m; + if (!m) + return -1; + + cpe = m + n_props * sizeof(CgroupPropEntry *); + + list_for_each_entry(prop_cur, props, list) { + cgroup_prop_entry__init(cpe); + + cpe->perms = xmalloc(sizeof(*cpe->perms)); + if (!cpe->perms) + goto error; + cgroup_perms__init(cpe->perms); + + cpe->name = xstrdup(prop_cur->name); + cpe->value = xstrdup(prop_cur->value); + if (!cpe->name || !cpe->value) + goto error; + cpe->perms->mode = prop_cur->mode; + cpe->perms->uid = prop_cur->uid; + cpe->perms->gid = prop_cur->gid; + + (*ents)[i++] = cpe++; + } + + return 0; + +error: + while (i >= 0) { + xfree(cpe->name); + xfree(cpe->value); + --cpe; + --i; + } + + xfree(*ents); + return -1; +} + +static int dump_cg_dirs(struct list_head *dirs, size_t n_dirs, CgroupDirEntry ***ents, int poff) +{ + struct cgroup_dir *cur; + CgroupDirEntry *cde; + void *m; + int i = 0; + + m = xmalloc(n_dirs * (sizeof(CgroupDirEntry *) + sizeof(CgroupDirEntry))); + *ents = m; + if (!m) + return -1; + + cde = m + n_dirs * sizeof(CgroupDirEntry *); + + list_for_each_entry(cur, dirs, siblings) { + cgroup_dir_entry__init(cde); + + cde->dir_perms = xmalloc(sizeof(*cde->dir_perms)); + if (!cde->dir_perms) + return -1; + cgroup_perms__init(cde->dir_perms); + + cde->dir_perms->mode = cur->mode; + cde->dir_perms->uid = cur->uid; + cde->dir_perms->gid = 
cur->gid; + + cde->dir_name = cur->path + poff; + if (poff != 1) /* parent isn't "/" */ + cde->dir_name++; /* leading / */ + cde->n_children = cur->n_children; + if (cur->n_children > 0) + if (dump_cg_dirs(&cur->children, cur->n_children, &cde->children, strlen(cur->path)) < 0) { + xfree(*ents); + return -1; + } + + cde->n_properties = cur->n_properties; + if (cde->n_properties > 0) { + if (dump_cg_dir_props(&cur->properties, + cde->n_properties, &cde->properties) < 0) { + xfree(*ents); + return -1; + } + } + + (*ents)[i++] = cde++; + } + + return 0; +} + +static int dump_controllers(CgroupEntry *cg) +{ + struct cg_controller *cur; + CgControllerEntry *ce; + void *m; + int i; + + cg->n_controllers = n_cgroups; + m = xmalloc(n_cgroups * (sizeof(CgControllerEntry *) + sizeof(CgControllerEntry))); + cg->controllers = m; + ce = m + cg->n_controllers * sizeof(CgControllerEntry *); + if (!m) + return -1; + + i = 0; + list_for_each_entry(cur, &cgroups, l) { + cg_controller_entry__init(ce); + + ce->cnames = cur->controllers; + ce->n_cnames = cur->n_controllers; + ce->n_dirs = cur->n_heads; + if (ce->n_dirs > 0) + if (dump_cg_dirs(&cur->heads, cur->n_heads, &ce->dirs, 0) < 0) { + xfree(cg->controllers); + cg->controllers = NULL; + return -1; + } + cg->controllers[i++] = ce++; + } + + return 0; +} + +static void free_sets(CgroupEntry *cg, unsigned nr) +{ + unsigned i; + + for (i = 0; i < nr; i++) + xfree(cg->sets[i]->ctls); + xfree(cg->sets); +} + + +static int dump_sets(CgroupEntry *cg) +{ + struct cg_set *set; + struct cg_ctl *ctl; + unsigned s, c; + void *m; + CgSetEntry *se; + CgMemberEntry *ce; + + pr_info("Dumping %d sets\n", n_sets - 1); + + cg->n_sets = n_sets - 1; + m = xmalloc(cg->n_sets * (sizeof(CgSetEntry *) + sizeof(CgSetEntry))); + cg->sets = m; + se = m + cg->n_sets * sizeof(CgSetEntry *); + if (!m) + return -1; + + s = 0; + list_for_each_entry(set, &cg_sets, l) { + if (set == criu_cgset) + continue; + + /* + * Now encode them onto the image entry + */ + + 
cg_set_entry__init(se); + se->id = set->id; + + se->n_ctls = set->n_ctls; + m = xmalloc(se->n_ctls * (sizeof(CgMemberEntry *) + sizeof(CgMemberEntry))); + se->ctls = m; + ce = m + se->n_ctls * sizeof(CgMemberEntry *); + if (!m) { + free_sets(cg, s); + return -1; + } + + c = 0; + list_for_each_entry(ctl, &set->ctls, l) { + pr_info(" `- Dumping %s of %s\n", ctl->name, ctl->path); + cg_member_entry__init(ce); + ce->name = ctl->name; + ce->path = ctl->path; + if (ctl->cgns_prefix > 0) { + ce->has_cgns_prefix = true; + ce->cgns_prefix = ctl->cgns_prefix; + } + se->ctls[c++] = ce++; + } + + cg->sets[s++] = se++; + } + + return 0; +} + +int dump_cgroups(void) +{ + CgroupEntry cg = CGROUP_ENTRY__INIT; + int ret = -1; + + BUG_ON(!criu_cgset || !root_cgset); + + /* + * Check whether root task lives in its own set as compared + * to criu. If yes, we should not dump anything. Note that + * list_is_singular() is slightly wrong here: if the criu cgset has + * empty cgroups, those will not be restored on the target host, since + * we're not dumping anything here. + */ + + if (root_cgset == criu_cgset && list_is_singular(&cg_sets)) { + pr_info("All tasks in criu's cgroups. 
Nothing to dump.\n"); + return 0; + } + + if (dump_sets(&cg)) + return -1; + if (dump_controllers(&cg)) { + goto err; + } + + pr_info("Writing CG image\n"); + ret = pb_write_one(img_from_set(glob_imgset, CR_FD_CGROUP), &cg, PB_CGROUP); +err: + free_sets(&cg, cg.n_sets); + xfree(cg.controllers); + return ret; +} + +static int ctrl_dir_and_opt(CgControllerEntry *ctl, char *dir, int ds, + char *opt, int os) +{ + int i, doff = 0, ooff = 0; + bool none_opt = false; + + for (i = 0; i < ctl->n_cnames; i++) { + char *n; + + n = ctl->cnames[i]; + if (strstartswith(n, "name=")) { + n += 5; + if (opt && !none_opt) { + ooff += snprintf(opt + ooff, os - ooff, "none,"); + none_opt = true; + } + } + + doff += snprintf(dir + doff, ds - doff, "%s,", n); + if (opt) + ooff += snprintf(opt + ooff, os - ooff, "%s,", ctl->cnames[i]); + } + + /* Chop the trailing ','-s */ + dir[--doff] = '\0'; + if (opt) + opt[ooff - 1] = '\0'; + + return doff; +} + +/* Some properties cannot be restored after the cgroup has children or tasks in + * it. We restore these properties as soon as the cgroup is created. 
+ */ +static const char *special_props[] = { + "cpuset.cpus", + "cpuset.mems", + "devices.list", + "memory.kmem.limit_in_bytes", + "memory.swappiness", + "memory.oom_control", + "memory.use_hierarchy", + NULL, +}; + +bool is_special_property(const char *prop) +{ + size_t i = 0; + + for (i = 0; special_props[i]; i++) + if (strcmp(prop, special_props[i]) == 0) + return true; + + return false; +} + +static int userns_move(void *arg, int fd, pid_t pid) +{ + char pidbuf[32]; + int cg, len, err; + + len = snprintf(pidbuf, sizeof(pidbuf), "%d", pid); + + if (len >= sizeof(pidbuf)) { + pr_err("pid printing failed: %d\n", pid); + return -1; + } + + cg = get_service_fd(CGROUP_YARD); + err = fd = openat(cg, arg, O_WRONLY); + if (fd >= 0) { + err = write(fd, pidbuf, len); + close(fd); + } + + if (err < 0) { + pr_perror("Can't move %s into %s (%d/%d)", pidbuf, (char *)arg, err, fd); + return -1; + } + + return 0; +} + +static int prepare_cgns(CgSetEntry *se) +{ + int i; + bool do_unshare = false; + + for (i = 0; i < se->n_ctls; i++) { + char aux[PATH_MAX]; + int j, aux_off; + CgMemberEntry *ce = se->ctls[i]; + CgControllerEntry *ctrl = NULL; + + for (j = 0; j < n_controllers; j++) { + CgControllerEntry *cur = controllers[j]; + if (cgroup_contains(cur->cnames, cur->n_cnames, ce->name, NULL)) { + ctrl = cur; + break; + } + } + + if (!ctrl) { + pr_err("No cg_controller_entry found for %s/%s\n", ce->name, ce->path); + return -1; + } + + aux_off = ctrl_dir_and_opt(ctrl, aux, sizeof(aux), NULL, 0); + + /* We need to do an unshare() here as unshare() pins the root + * of the cgroup namespace to whatever the current cgroups are. + * For example, consider a task in a cgroup (according to the + * host): + * + * /unsprefix/insidecontainer + * + * If the task first moved itself into /unsprefix, then did unshare(), + * when the task examines its own /proc/self/cgroup file it will see /, + * but to the host it is really in /unsprefix. 
Then if it further enters + * /insidecontainer here, the full host path will be + * /unsprefix/insidecontianer. There is no way to say "set the cgroup + * namespace boundary at /unsprefix" without first entering that, doing + * the unshare, and then entering the rest of the path. + */ + if (ce->has_cgns_prefix) { + char tmp = ce->path[ce->cgns_prefix]; + ce->path[ce->cgns_prefix] = '\0'; + + pr_info("setting cgns prefix to %s\n", ce->path); + snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/tasks", ce->path); + ce->path[ce->cgns_prefix] = tmp; + if (userns_call(userns_move, 0, aux, strlen(aux) + 1, -1) < 0) { + pr_perror("couldn't set cgns prefix %s", aux); + return -1; + } + + do_unshare = true; + } + + } + + if (do_unshare && unshare(CLONE_NEWCGROUP) < 0) { + pr_perror("couldn't unshare cgns"); + return -1; + } + + return 0; +} + +static int move_in_cgroup(CgSetEntry *se, bool setup_cgns) +{ + int i; + + pr_info("Move into %d\n", se->id); + + if (setup_cgns && prepare_cgns(se) < 0) { + pr_err("failed preparing cgns\n"); + return -1; + } + + for (i = 0; i < se->n_ctls; i++) { + char aux[PATH_MAX]; + int fd = -1, err, j, aux_off; + CgMemberEntry *ce = se->ctls[i]; + CgControllerEntry *ctrl = NULL; + + for (j = 0; j < n_controllers; j++) { + CgControllerEntry *cur = controllers[j]; + if (cgroup_contains(cur->cnames, cur->n_cnames, ce->name, NULL)) { + ctrl = cur; + break; + } + } + + if (!ctrl) { + pr_err("No cg_controller_entry found for %s/%s\n", ce->name, ce->path); + return -1; + } + + aux_off = ctrl_dir_and_opt(ctrl, aux, sizeof(aux), NULL, 0); + + /* Note that unshare(CLONE_NEWCGROUP) doesn't change the view + * of previously mounted cgroupfses; since we're restoring via + * a dirfd pointing to the cg yard set up by when criu was in + * the root cgns, we still want to use the full path here when + * we move into the cgroup. 
+ */ + snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/tasks", ce->path); + pr_debug(" `-> %s\n", aux); + err = userns_call(userns_move, 0, aux, strlen(aux) + 1, -1); + if (err < 0) { + pr_perror("Can't move into %s (%d/%d)", aux, err, fd); + return -1; + } + } + + return 0; +} + +int prepare_task_cgroup(struct pstree_item *me) +{ + CgSetEntry *se; + u32 current_cgset; + + if (!rsti(me)->cg_set) + return 0; + + if (me->parent) + current_cgset = rsti(me->parent)->cg_set; + else + current_cgset = root_cg_set; + + if (rsti(me)->cg_set == current_cgset) { + pr_info("Cgroups %d inherited from parent\n", current_cgset); + return 0; + } + + se = find_rst_set_by_id(rsti(me)->cg_set); + if (!se) { + pr_err("No set %d found\n", rsti(me)->cg_set); + return -1; + } + + /* Since don't support nesting of cgroup namespaces, let's only set up + * the cgns (if it exists) in the init task. In the future, we should + * just check that the cgns prefix string matches for all the entries + * in the cgset, and only unshare if that's true. + */ + + return move_in_cgroup(se, !me->parent); +} + +void fini_cgroup(void) +{ + if (!cg_yard) + return; + + close_service_fd(CGROUP_YARD); + if (umount2(cg_yard, MNT_DETACH)) + pr_perror("Unable to umount %s", cg_yard); + if (rmdir(cg_yard)) + pr_perror("Unable to remove %s", cg_yard); + xfree(cg_yard); + cg_yard = NULL; +} + +static int restore_perms(int fd, const char *path, CgroupPerms *perms) +{ + struct stat sb; + + if (perms) { + if (fstat(fd, &sb) < 0) { + pr_perror("stat of property %s failed", path); + return -1; + } + + /* only chmod/chown if the perms are actually different: we aren't + * allowed to chmod some cgroup props (e.g. the read only ones), so we + * don't want to try if the perms already match. 
+ */ + if (sb.st_mode != (mode_t) perms->mode && fchmod(fd, perms->mode) < 0) { + pr_perror("chmod of %s failed", path); + return -1; + } + + if ((sb.st_uid != perms->uid || sb.st_gid != perms->gid) && + fchown(fd, perms->uid, perms->gid)) { + pr_perror("chown of %s failed", path); + return -1; + } + } + + return 0; +} + +static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, + char *path, int off, bool split_lines, bool skip_fails) +{ + int cg, fd, ret = -1; + CgroupPerms *perms = cg_prop_entry_p->perms; + + if (!cg_prop_entry_p->value) { + pr_err("cg_prop_entry->value was empty when should have had a value\n"); + return -1; + } + + if (snprintf(path + off, PATH_MAX - off, "/%s", cg_prop_entry_p->name) >= PATH_MAX) { + pr_err("snprintf output was truncated for %s\n", cg_prop_entry_p->name); + return -1; + } + + pr_info("Restoring cgroup property value [%s] to [%s]\n", cg_prop_entry_p->value, path); + + cg = get_service_fd(CGROUP_YARD); + fd = openat(cg, path, O_WRONLY); + if (fd < 0) { + pr_perror("bad cgroup path: %s", path); + return -1; + } + + if (restore_perms(fd, path, perms) < 0) + goto out; + + /* skip these two since restoring their values doesn't make sense */ + if (!strcmp(cg_prop_entry_p->name, "cgroup.procs") || !strcmp(cg_prop_entry_p->name, "tasks")) { + ret = 0; + goto out; + } + + if (split_lines) { + char *line = cg_prop_entry_p->value; + char *next_line; + size_t len; + + do { + next_line = strchrnul(line, '\n'); + len = next_line - line; + + if (write(fd, line, len) != len) { + pr_perror("Failed writing %s to %s", line, path); + if (!skip_fails) + goto out; + } + line = next_line + 1; + } while(*next_line != '\0'); + } else { + size_t len = strlen(cg_prop_entry_p->value); + + if (write(fd, cg_prop_entry_p->value, len) != len) { + pr_perror("Failed writing %s to %s", cg_prop_entry_p->value, path); + if (!skip_fails) + goto out; + } + } + + ret = 0; + +out: + if (close(fd) != 0) + pr_perror("Failed closing %s", path); + + return 
ret; +} + +static CgroupPropEntry *freezer_state_entry; +static char freezer_path[PATH_MAX]; + +int restore_freezer_state(void) +{ + size_t freezer_path_len; + + if (!freezer_state_entry) + return 0; + + freezer_path_len = strlen(freezer_path); + return restore_cgroup_prop(freezer_state_entry, freezer_path, + freezer_path_len, false, false); +} + +static void add_freezer_state_for_restore(CgroupPropEntry *entry, char *path, size_t path_len) +{ + BUG_ON(path_len >= sizeof(freezer_path)); + + if (freezer_state_entry) { + int max_len, i; + + max_len = strlen(freezer_path); + if (max_len > path_len) + max_len = path_len; + + /* If there are multiple freezer.state properties, that means they had + * one common path prefix with no tasks in it. Let's find that common + * prefix. + */ + for (i = 0; i < max_len; i++) { + if (freezer_path[i] != path[i]) { + freezer_path[i] = 0; + return; + } + } + } + + freezer_state_entry = entry; + /* Path is not null terminated at path_len */ + strncpy(freezer_path, path, path_len); + freezer_path[path_len] = 0; +} + +/* + * Filter out ifpriomap interfaces which have 0 as priority. + * As by default new ifpriomap has 0 as a priority for each + * interface, this will save up some write()'s. + * As this property is used rarely, this may save a whole bunch + * of syscalls, skipping all ifpriomap restore. 
+ */ +static int filter_ifpriomap(char *out, char *line) +{ + char *next_line, *space; + bool written = false; + size_t len; + + if (*line == '\0') + return 0; + + do { + next_line = strchrnul(line, '\n'); + len = next_line - line; + + space = strchr(line, ' '); + if (!space) { + pr_err("Invalid value for ifpriomap: `%s'\n", line); + return -1; + } + + if (!strtol(space, NULL, 10)) + goto next; + + /* Copying with last \n or \0 */ + strncpy(out, line, len + 1); + out += len + 1; + written = true; +next: + line = next_line + 1; + } while(*next_line != '\0'); + + if (written) + *(out - 1) = '\0'; + + return 0; +} + +static int restore_cgroup_ifpriomap(CgroupPropEntry *cpe, char *path, int off) +{ + CgroupPropEntry priomap = *cpe; + int ret = -1; + + priomap.value = xmalloc(strlen(cpe->value) + 1); + priomap.value[0] = '\0'; + + if (filter_ifpriomap(priomap.value, cpe->value)) + goto out; + + if (strlen(priomap.value)) + ret = restore_cgroup_prop(&priomap, path, off, true, true); + else + ret = 0; + +out: + xfree(priomap.value); + return ret; +} + +static int prepare_cgroup_dir_properties(char *path, int off, CgroupDirEntry **ents, + unsigned int n_ents) +{ + unsigned int i, j; + + for (i = 0; i < n_ents; i++) { + CgroupDirEntry *e = ents[i]; + size_t off2 = off; + + if (strcmp(e->dir_name, "") == 0) + goto skip; /* skip root cgroups */ + + off2 += sprintf(path + off, "/%s", e->dir_name); + for (j = 0; j < e->n_properties; ++j) { + CgroupPropEntry *p = e->properties[j]; + + if (!strcmp(p->name, "freezer.state")) { + add_freezer_state_for_restore(p, path, off2); + continue; /* skip restore now */ + } + + /* Skip restoring special cpuset props now. + * They were restored earlier, and can cause + * the restore to fail if some other task has + * entered the cgroup. + */ + if (is_special_property(p->name)) + continue; + + /* + * The kernel can't handle it in one write() + * Number of network interfaces on host may differ. 
+ */ + if (strcmp(p->name, "net_prio.ifpriomap") == 0) { + if (restore_cgroup_ifpriomap(p, path, off2)) + return -1; + continue; + } + + if (restore_cgroup_prop(p, path, off2, false, false) < 0) + return -1; + } +skip: + if (prepare_cgroup_dir_properties(path, off2, e->children, e->n_children) < 0) + return -1; + } + + return 0; +} + +int prepare_cgroup_properties(void) +{ + char cname_path[PATH_MAX]; + unsigned int i, off; + + for (i = 0; i < n_controllers; i++) { + CgControllerEntry *c = controllers[i]; + + if (c->n_cnames < 1) { + pr_err("Each CgControllerEntry should have at least 1 cname\n"); + return -1; + } + + off = ctrl_dir_and_opt(c, cname_path, sizeof(cname_path), NULL, 0); + if (prepare_cgroup_dir_properties(cname_path, off, c->dirs, c->n_dirs) < 0) + return -1; + } + + return 0; +} + +/* + * The devices cgroup must be restored in a special way: + * only the contents of devices.list can be read, and it is a whitelist + * of all the devices the cgroup is allowed to create. To re-create + * this whitelist, we firstly deny everything via devices.deny, + * and then write the list back into devices.allow. + * + * Further, we must have a write() call for each line, because the kernel + * only parses the first line of any write(). + */ +static int restore_devices_list(char *paux, size_t off, CgroupPropEntry *pr) +{ + CgroupPropEntry dev_allow = *pr; + CgroupPropEntry dev_deny = *pr; + int ret; + + dev_allow.name = "devices.allow"; + dev_deny.name = "devices.deny"; + dev_deny.value = "a"; + + ret = restore_cgroup_prop(&dev_deny, paux, off, false, false); + + /* + * An empty string here means nothing is allowed, + * and the kernel disallows writing an "" to devices.allow, + * so let's just keep going. 
+ */ + if (!strcmp(dev_allow.value, "")) + return 0; + + if (ret < 0) + return -1; + + return restore_cgroup_prop(&dev_allow, paux, off, true, false); +} + +static int restore_special_property(char *paux, size_t off, CgroupPropEntry *pr) +{ + /* + * XXX: we can drop this hack and make memory.swappiness and + * memory.oom_control regular properties when we drop support for + * kernels < 3.16. See 3dae7fec5. + */ + if (!strcmp(pr->name, "memory.swappiness") && !strcmp(pr->value, "60")) + return 0; + if (!strcmp(pr->name, "memory.oom_control") && !strcmp(pr->value, "0")) + return 0; + + if (!strcmp(pr->name, "devices.list")) { + /* + * A bit of a fudge here. These are write only by owner + * by default, but the container engine could have changed + * the perms. We should come up with a better way to + * restore all of this stuff. + */ + pr->perms->mode = 0200; + return restore_devices_list(paux, off, pr); + } + + return restore_cgroup_prop(pr, paux, off, false, false); +} + +static int restore_special_props(char *paux, size_t off, CgroupDirEntry *e) +{ + unsigned int j; + + pr_info("Restore special props\n"); + + for (j = 0; j < e->n_properties; j++) { + CgroupPropEntry *prop = e->properties[j]; + + if (!is_special_property(prop->name)) + continue; + + if (restore_special_property(paux, off, prop) < 0) { + pr_err("Restoring %s special property failed\n", prop->name); + return -1; + } + } + + return 0; +} + +static int prepare_dir_perms(int cg, char *path, CgroupPerms *perms) +{ + int fd, ret; + + fd = openat(cg, path, O_DIRECTORY); + if (fd < 0) { + pr_perror("failed to open cg dir fd (%s) for chowning", path); + return -1; + } + + ret = restore_perms(fd, path, perms); + close(fd); + return ret; +} + +static int prepare_cgroup_dirs(char **controllers, int n_controllers, char *paux, size_t off, + CgroupDirEntry **ents, size_t n_ents) +{ + size_t i, j; + CgroupDirEntry *e; + int cg = get_service_fd(CGROUP_YARD); + + for (i = 0; i < n_ents; i++) { + size_t off2 = off; + 
e = ents[i]; + + off2 += sprintf(paux + off, "/%s", e->dir_name); + + if (faccessat(cg, paux, F_OK, 0) < 0) { + if (errno != ENOENT) { + pr_perror("Failed accessing cgroup dir %s", paux); + return -1; + } + + if (opts.manage_cgroups & (CG_MODE_NONE | CG_MODE_PROPS)) { + pr_err("Cgroup dir %s doesn't exist\n", paux); + return -1; + } + + if (mkdirpat(cg, paux, 0755)) { + pr_perror("Can't make cgroup dir %s", paux); + return -1; + } + pr_info("Created cgroup dir %s\n", paux); + + if (prepare_dir_perms(cg, paux, e->dir_perms) < 0) + return -1; + + for (j = 0; j < n_controllers; j++) { + if (!strcmp(controllers[j], "cpuset") + || !strcmp(controllers[j], "memory") + || !strcmp(controllers[j], "devices")) { + if (restore_special_props(paux, off2, e) < 0) { + pr_err("Restoring special cpuset props failed!\n"); + return -1; + } + } + } + } else { + pr_info("Determined cgroup dir %s already exist\n", paux); + + if (opts.manage_cgroups & CG_MODE_STRICT) { + pr_err("Abort restore of existing cgroups\n"); + return -1; + } + + if (opts.manage_cgroups & (CG_MODE_SOFT | CG_MODE_NONE)) { + pr_info("Skip restoring properties on cgroup dir %s\n", paux); + if (e->n_properties > 0) { + xfree(e->properties); + e->properties = NULL; + e->n_properties = 0; + } + } + + if (!(opts.manage_cgroups & CG_MODE_NONE) && + prepare_dir_perms(cg, paux, e->dir_perms) < 0) + return -1; + } + + if (prepare_cgroup_dirs(controllers, n_controllers, paux, off2, + e->children, e->n_children) < 0) + return -1; + } + + return 0; +} + +/* + * Prepare the CGROUP_YARD service descriptor. This guy is + * tmpfs mount with the set of ctl->name directories each + * one having the respective cgroup mounted. + * + * It's required for two reasons. + * + * First, if we move more than one task into cgroups it's + * faster to have cgroup tree visible by them all in sime + * single place. 
Searching for this thing existing in the + * criu's space is not nice, as parsing /proc/mounts is not + * very fast, other than this not all cgroups may be mounted. + * + * Second, when we have user-namespaces support we will + * loose the ability to mount cgroups on-demand, so prepare + * them in advance. + */ + +static int prepare_cgroup_sfd(CgroupEntry *ce) +{ + int off, i, ret; + char paux[PATH_MAX]; + + if (!opts.manage_cgroups) + return 0; + + pr_info("Preparing cgroups yard (cgroups restore mode %#x)\n", + opts.manage_cgroups); + + off = sprintf(paux, ".criu.cgyard.XXXXXX"); + if (mkdtemp(paux) == NULL) { + pr_perror("Can't make temp cgyard dir"); + return -1; + } + + cg_yard = xstrdup(paux); + if (!cg_yard) { + rmdir(paux); + return -1; + } + + if (make_yard(cg_yard)) + goto err; + + pr_debug("Opening %s as cg yard\n", cg_yard); + i = open(cg_yard, O_DIRECTORY); + if (i < 0) { + pr_perror("Can't open cgyard"); + goto err; + } + + ret = install_service_fd(CGROUP_YARD, i); + if (ret < 0) + goto err; + + paux[off++] = '/'; + + for (i = 0; i < ce->n_controllers; i++) { + int ctl_off = off, yard_off; + char opt[128], *yard; + CgControllerEntry *ctrl = ce->controllers[i]; + + if (ctrl->n_cnames < 1) { + pr_err("Each cg_controller_entry must have at least 1 controller\n"); + goto err; + } + + ctl_off += ctrl_dir_and_opt(ctrl, + paux + ctl_off, sizeof(paux) - ctl_off, + opt, sizeof(opt)); + + /* Create controller if not yet present */ + if (access(paux, F_OK)) { + pr_debug("\tMaking controller dir %s (%s)\n", paux, opt); + if (mkdir(paux, 0700)) { + pr_perror("\tCan't make controller dir %s", paux); + return -1; + } + if (mount("none", paux, "cgroup", 0, opt) < 0) { + pr_perror("\tCan't mount controller dir %s", paux); + return -1; + } + } + + /* + * Finally handle all cgroups for this controller. 
+ */ + yard = paux + strlen(cg_yard) + 1; + yard_off = ctl_off - (strlen(cg_yard) + 1); + if (opts.manage_cgroups && + prepare_cgroup_dirs(ctrl->cnames, ctrl->n_cnames, yard, yard_off, + ctrl->dirs, ctrl->n_dirs)) + goto err; + } + + return 0; + +err: + fini_cgroup(); + return -1; +} + +static int rewrite_cgsets(CgroupEntry *cge, char **controllers, int n_controllers, + char **dir_name, char *newroot) +{ + size_t dirlen = strlen(*dir_name); + char *dir = *dir_name; + char *dirnew = NULL; + size_t i, j; + + /* + * For example we may have the following in the image: + * + * set + * name "hugetlb" + * path "/300" + * + * controller + * cnames hugetlb + * dirs + * dirname "300" + * properties ... + * + * when we're switching to a new root we need to change + * @path and don't forget to update the @dirname into + * new state. + */ + + for (i = 0; i < cge->n_sets; i++) { + CgSetEntry *set = cge->sets[i]; + + for (j = 0; j < set->n_ctls; j++) { + CgMemberEntry *cg = set->ctls[j]; + + /* + * Make sure if it's same controller + * and its path with stripping leading + * "/" is matching to be renamed. + */ + if (!(cgroup_contains(controllers, n_controllers, cg->name, NULL) && + strstartswith(cg->path + 1, dir))) + continue; + + if (cg->has_cgns_prefix && cg->cgns_prefix) { + char *prev = cg->path; + + cg->path = xsprintf("%s%s", newroot, cg->path + cg->cgns_prefix); + if (!cg->path) { + cg->path = prev; + return -ENOMEM; + } + xfree(prev); + + if (!dirnew) { + /* -1 because cgns_prefix includes leading "/" */ + dirnew = xsprintf("%s%s", newroot, dir + cg->cgns_prefix - 1); + if (!dirnew) + return -ENOMEM; + } + cg->cgns_prefix = strlen(newroot); + } else { + char *prev = cg->path; + /* + * If no prefix present simply rename the + * root but make sure the rest of path is + * untouched. 
+ */ + cg->path = xsprintf("%s%s", newroot, + cg->path + dirlen + 1); + if (!cg->path) { + cg->path = prev; + return -ENOMEM; + } + xfree(prev); + if (!dirnew) { + dirnew = xstrdup(newroot); + if (!dirnew) + return -ENOMEM; + } + } + } + } + + if (dirnew) { + xfree(dir); + *dir_name = dirnew; + } + return 0; +} + +static int rewrite_cgroup_roots(CgroupEntry *cge) +{ + int i, j; + struct cg_root_opt *o; + + for (i = 0; i < cge->n_controllers; i++) { + CgControllerEntry *ctrl = cge->controllers[i]; + u64 ctrl_mask = (1ULL << ctrl->n_cnames) - 1; + char *newroot = NULL; + + list_for_each_entry(o, &opts.new_cgroup_roots, node) { + unsigned old_mask = ctrl_mask; + + cgroup_contains(ctrl->cnames, ctrl->n_cnames, + o->controller, &ctrl_mask); + if (old_mask != ctrl_mask) { + if (newroot && strcmp(newroot, o->newroot)) { + pr_err("CG paths mismatch: %s %s\n", + newroot, o->newroot); + return -1; + } + newroot = o->newroot; + } + if (!ctrl_mask) + break; + } + + if (!newroot) + newroot = opts.new_global_cg_root; + + if (newroot) { + for (j = 0; j < ctrl->n_dirs; j++) { + CgroupDirEntry *cgde = ctrl->dirs[j]; + + pr_info("rewriting %s to %s\n", cgde->dir_name, newroot); + if (rewrite_cgsets(cge, ctrl->cnames, ctrl->n_cnames, &cgde->dir_name, newroot)) + return -1; + } + } + } + + return 0; +} + +int prepare_cgroup(void) +{ + int ret; + struct cr_img *img; + CgroupEntry *ce; + + img = open_image(CR_FD_CGROUP, O_RSTR); + if (!img) + return -1; + + ret = pb_read_one_eof(img, &ce, PB_CGROUP); + close_image(img); + if (ret <= 0) /* Zero is OK -- no sets there. */ + return ret; + + if (rewrite_cgroup_roots(ce)) + return -1; + + n_sets = ce->n_sets; + rst_sets = ce->sets; + n_controllers = ce->n_controllers; + controllers = ce->controllers; + + if (n_sets) + /* + * We rely on the fact that all sets contain the same + * set of controllers. This is checked during dump + * with cg_set_compare(CGCMP_ISSUB) call. 
+ */ + ret = prepare_cgroup_sfd(ce); + else + ret = 0; + + return ret; +} + +int new_cg_root_add(char *controller, char *newroot) +{ + struct cg_root_opt *o; + + if (!controller) { + SET_CHAR_OPTS(new_global_cg_root, newroot); + return 0; + } + + o = xmalloc(sizeof(*o)); + if (!o) + return -1; + + o->controller = controller; + o->newroot = newroot; + list_add(&o->node, &opts.new_cgroup_roots); + return 0; +} + +struct ns_desc cgroup_ns_desc = NS_DESC_ENTRY(CLONE_NEWCGROUP, "cgroup"); diff --git a/CRIU_code/criu/clone-noasan.c b/CRIU_code/criu/clone-noasan.c new file mode 100644 index 0000000..5ca280e --- /dev/null +++ b/CRIU_code/criu/clone-noasan.c @@ -0,0 +1,31 @@ +#include +#include "common/compiler.h" +#include "log.h" +#include "common/bug.h" + +/* + * ASan doesn't play nicely with clone if we use current stack for + * child task. ASan puts local variables on the fake stack + * to catch use-after-return bug: + * https://github.com/google/sanitizers/wiki/AddressSanitizerUseAfterReturn#algorithm + * + * So it's become easy to overflow this fake stack frame in cloned child. + * We need a real stack for clone(). + * + * To workaround this we add clone_noasan() not-instrumented wrapper for + * clone(). Unfortunately we can't use __attribute__((no_sanitize_address)) + * for this because of bug in GCC > 6: + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69863 + * + * So the only way is to put this wrapper in separate non-instrumented file + */ +int clone_noasan(int (*fn)(void *), int flags, void *arg) +{ + void *stack_ptr = (void *)round_down((unsigned long)&stack_ptr - 1024, 16); + BUG_ON((flags & CLONE_VM) && !(flags & CLONE_VFORK)); + /* + * Reserve some bytes for clone() internal needs + * and use as stack the address above this area. 
+ */ + return clone(fn, stack_ptr, flags, arg); +} diff --git a/CRIU_code/criu/config.c b/CRIU_code/criu/config.c new file mode 100644 index 0000000..3a54afd --- /dev/null +++ b/CRIU_code/criu/config.c @@ -0,0 +1,892 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "log.h" +#include "common/list.h" + +#include "action-scripts.h" +#include "cgroup.h" +#include "cgroup-props.h" +#include "common/bug.h" +#include "cpu.h" +#include "crtools.h" +#include "cr_options.h" +#include "filesystems.h" +#include "file-lock.h" +#include "irmap.h" +#include "mount.h" +#include "namespaces.h" +#include "net.h" +#include "sk-inet.h" +#include "sockets.h" +#include "tty.h" +#include "version.h" + +#include "common/xmalloc.h" + +struct cr_options opts; + +static int count_elements(char **to_count) +{ + int count = 0; + if (to_count != NULL) + while (to_count[count] != NULL) + count++; + return count; +} + +/* Parse one statement in configuration file */ +int parse_statement(int i, char *line, char **configuration) +{ + int offset = 0, len = 0; + bool was_newline = true; + char *tmp_string, *quoted, *quotedptr; + + while (1) { + /* Ignore white-space */ + while ((isspace(*(line + offset)) && (*(line + offset) != '\n'))) offset++; + + /* Read a single word. A word is everything + * that doesn't contain white-space characters. 
*/ + if (sscanf(line + offset, "%m[^ \t\n]s", &configuration[i]) != 1) { + configuration[i] = NULL; + break; + } + + /* Ignore comments - everything between '#' and '\n' */ + if (configuration[i][0] == '#') { + configuration[i] = NULL; + break; + } + + if ((configuration[i][0] == '\"') && (strchr(line + offset + 1, '"'))) { + /* Handle empty strings which strtok ignores */ + if (!strcmp(configuration[i], "\"\"")) { + configuration[i] = ""; + offset += strlen("\"\""); + } else if ((configuration[i] = strtok_r(line + offset, "\"", "edptr))) { + /* Handle escaping of quotes in quoted string */ + while (configuration[i][strlen(configuration[i]) - 1] == '\\') { + offset++; + len = strlen(configuration[i]); + configuration[i][len - 1] = '"'; + if (*quotedptr == '"') { + quotedptr++; + break; + } + quoted = strtok_r(NULL, "\"", "edptr); + tmp_string = xmalloc(len + strlen(quoted) + 1); + if (tmp_string == NULL) + return -1; + + memmove(tmp_string, configuration[i], len); + memmove(tmp_string + len, quoted, strlen(quoted) + 1); + configuration[i] = tmp_string; + } + offset += 2; + } + } + + offset += strlen(configuration[i]); + + if (was_newline) { + was_newline = false; + len = strlen(configuration[i]); + tmp_string = xrealloc(configuration[i], len + strlen("--") + 1); + if (tmp_string == NULL) + return -1; + + memmove(tmp_string + strlen("--"), tmp_string, len + 1); + memmove(tmp_string, "--", strlen("--")); + configuration[i] = tmp_string; + } + i++; + } + + return i; +} + +/* Parse a configuration file */ +static char ** parse_config(char *filepath) +{ +#define DEFAULT_CONFIG_SIZE 10 + FILE* configfile = fopen(filepath, "r"); + int config_size = DEFAULT_CONFIG_SIZE; + int i = 1; + size_t line_size = 0; + char *line = NULL; + char **configuration; + + if (!configfile) + return NULL; + + configuration = xmalloc(config_size * sizeof(char *)); + if (configuration == NULL) { + fclose(configfile); + exit(1); + } + /* + * Initialize first element, getopt ignores it. 
+ */ + configuration[0] = "criu"; + + while (getline(&line, &line_size, configfile) != -1) { + /* Extend configuration buffer if necessary */ + if (i >= config_size - 1) { + config_size *= 2; + configuration = xrealloc(configuration, config_size * sizeof(char *)); + if (configuration == NULL) { + fclose(configfile); + exit(1); + } + } + + i = parse_statement(i, line, configuration); + if (i < 0) { + fclose(configfile); + exit(1); + } + + free(line); + line = NULL; + } + /* Initialize the last element */ + configuration[i] = NULL; + + free(line); + fclose(configfile); + return configuration; +} + +static int next_config(char **argv, char ***_argv, bool no_default_config, + int state, char *cfg_file) +{ + char local_filepath[PATH_MAX + 1]; + char *home_dir = NULL; + char *cfg_from_env = NULL; + + if (state >= PARSING_LAST) + return 0; + + switch(state) { + case PARSING_GLOBAL_CONF: + if (no_default_config) + break; + *_argv = parse_config(GLOBAL_CONFIG_DIR DEFAULT_CONFIG_FILENAME); + break; + case PARSING_USER_CONF: + if (no_default_config) + break; + home_dir = getenv("HOME"); + if (!home_dir) { + pr_info("Unable to get $HOME directory, local configuration file will not be used."); + } else { + snprintf(local_filepath, PATH_MAX, "%s/%s%s", + home_dir, USER_CONFIG_DIR, DEFAULT_CONFIG_FILENAME); + *_argv = parse_config(local_filepath); + } + break; + case PARSING_ENV_CONF: + cfg_from_env = getenv("CRIU_CONFIG_FILE"); + if (!cfg_from_env) + break; + *_argv = parse_config(cfg_from_env); + break; + case PARSING_CMDLINE_CONF: + if (!cfg_file) + break; + *_argv = parse_config(cfg_file); + break; + case PARSING_ARGV: + *_argv = argv; + break; + case PARSING_RPC_CONF: + if (!rpc_cfg_file) + break; + *_argv = parse_config(rpc_cfg_file); + break; + default: + break; + } + + return ++state; +} + +static int pre_parse(int argc, char **argv, bool *usage_error, bool *no_default_config, + char **cfg_file) +{ + int i; + /* + * We are running before getopt(), so we need to pre-parse 
+ * the command line. + * + * Check for --help / -h on commandline before parsing, otherwise + * the help message won't be displayed if there is an error in + * configuration file syntax. Checks are kept in parser in case of + * option being put in the configuration file itself. + * + * Check also whether default configfiles are forbidden to lower + * number of argv iterations, but checks for help have higher priority. + */ + for (i = 0; i < argc; i++) { + if ((!strcmp(argv[i], "--help")) || (!strcmp(argv[i], "-h"))) { + *usage_error = false; + return 1; + } else if (!strcmp(argv[i], "--no-default-config")) { + *no_default_config = true; + } else if (!strcmp(argv[i], "--config")) { + /* + * getopt takes next string as required + * argument automatically, we do the same + */ + *cfg_file = argv[i + 1]; + *no_default_config = true; + } else if (strstr(argv[i], "--config=") != NULL) { + *cfg_file = argv[i] + strlen("--config="); + *no_default_config = true; + } + } + + return 0; +} + +void init_opts(void) +{ + memset(&opts, 0, sizeof(opts)); + + /* Default options */ + opts.final_state = TASK_DEAD; + INIT_LIST_HEAD(&opts.ext_mounts); + INIT_LIST_HEAD(&opts.inherit_fds); + INIT_LIST_HEAD(&opts.external); + INIT_LIST_HEAD(&opts.join_ns); + INIT_LIST_HEAD(&opts.new_cgroup_roots); + INIT_LIST_HEAD(&opts.irmap_scan_paths); + + opts.cpu_cap = CPU_CAP_DEFAULT; + opts.manage_cgroups = CG_MODE_DEFAULT; + opts.ps_socket = -1; + opts.ghost_limit = DEFAULT_GHOST_LIMIT; + opts.timeout = DEFAULT_TIMEOUT; + opts.empty_ns = 0; + opts.status_fd = -1; + opts.log_level = DEFAULT_LOGLEVEL; +} + +bool deprecated_ok(char *what) +{ + if (opts.deprecated_ok) + return true; + + pr_err("Deprecated functionality (%s) rejected.\n", what); + pr_err("Use the --deprecated option or set CRIU_DEPRECATED environment.\n"); + pr_err("For details visit https://criu.org/Deprecation\n"); + return false; +} + +static int parse_cpu_cap(struct cr_options *opts, const char *optarg) +{ + bool inverse = false; + 
+#define ____cpu_set_cap(__opts, __cap, __inverse) \ + do { \ + if ((__inverse)) \ + (__opts)->cpu_cap &= ~(__cap); \ + else \ + (__opts)->cpu_cap |= (__cap); \ + } while (0) + + if (!optarg) { + ____cpu_set_cap(opts, CPU_CAP_ALL, false); + ____cpu_set_cap(opts, CPU_CAP_IMAGE, false); + return 0; + } + + while (*optarg) { + if (optarg[0] == '^') { + inverse = !inverse; + optarg++; + continue; + } else if (optarg[0] == ',') { + inverse = false; + optarg++; + continue; + } + + if (!strncmp(optarg, "fpu", 3)) { + ____cpu_set_cap(opts, CPU_CAP_FPU, inverse); + optarg += 3; + } else if (!strncmp(optarg, "all", 3)) { + ____cpu_set_cap(opts, CPU_CAP_ALL, inverse); + optarg += 3; + } else if (!strncmp(optarg, "none", 4)) { + if (inverse) + opts->cpu_cap = CPU_CAP_ALL; + else + opts->cpu_cap = CPU_CAP_NONE; + optarg += 4; + } else if (!strncmp(optarg, "cpu", 3)) { + ____cpu_set_cap(opts, CPU_CAP_CPU, inverse); + optarg += 3; + } else if (!strncmp(optarg, "ins", 3)) { + ____cpu_set_cap(opts, CPU_CAP_INS, inverse); + optarg += 3; + } else + goto Esyntax; + } + + if (opts->cpu_cap != CPU_CAP_NONE) + ____cpu_set_cap(opts, CPU_CAP_IMAGE, false); +#undef ____cpu_set_cap + + return 0; + +Esyntax: + pr_err("Unknown FPU mode `%s' selected\n", optarg); + return -1; +} + +static int parse_manage_cgroups(struct cr_options *opts, const char *optarg) +{ + if (!optarg) { + opts->manage_cgroups = CG_MODE_SOFT; + return 0; + } + + if (!strcmp(optarg, "none")) { + opts->manage_cgroups = CG_MODE_NONE; + } else if (!strcmp(optarg, "props")) { + opts->manage_cgroups = CG_MODE_PROPS; + } else if (!strcmp(optarg, "soft")) { + opts->manage_cgroups = CG_MODE_SOFT; + } else if (!strcmp(optarg, "full")) { + opts->manage_cgroups = CG_MODE_FULL; + } else if (!strcmp(optarg, "strict")) { + opts->manage_cgroups = CG_MODE_STRICT; + } else if (!strcmp(optarg, "ignore")) { + opts->manage_cgroups = CG_MODE_IGNORE; + } else + goto Esyntax; + + return 0; + +Esyntax: + pr_err("Unknown cgroups mode `%s' 
selected\n", optarg); + return -1; +} + +extern char *index(const char *s, int c); + +static size_t parse_size(char *optarg) +{ + if (index(optarg, 'K')) + return (size_t)KILO(atol(optarg)); + else if (index(optarg, 'M')) + return (size_t)MEGA(atol(optarg)); + else if (index(optarg, 'G')) + return (size_t)GIGA(atol(optarg)); + return (size_t)atol(optarg); +} + +static int parse_join_ns(const char *ptr) +{ + char *aux, *ns_file, *extra_opts = NULL; + + aux = strchr(ptr, ':'); + if (aux == NULL) + return -1; + *aux = '\0'; + + ns_file = aux + 1; + aux = strchr(ns_file, ','); + if (aux != NULL) { + *aux = '\0'; + extra_opts = aux + 1; + } else { + extra_opts = NULL; + } + if (join_ns_add(ptr, ns_file, extra_opts)) + return -1; + + return 0; +} + +/* + * parse_options() is the point where the getopt parsing happens. The CLI + * parsing as well as the configuration file parsing happens here. + * This used to be all part of main() and to integrate the new code flow + * in main() this function (parse_options()) returns '0' if everything is + * correct, '1' if something failed and '2' if the CRIU help text should + * be displayed. 
+ */ +int parse_options(int argc, char **argv, bool *usage_error, + bool *has_exec_cmd, int state) +{ + int ret; + int opt = -1; + int idx; + bool no_default_config = false; + char *cfg_file = NULL; + char **_argv = NULL; + int _argc = 0; + + +#define BOOL_OPT(OPT_NAME, SAVE_TO) \ + {OPT_NAME, no_argument, SAVE_TO, true},\ + {"no-" OPT_NAME, no_argument, SAVE_TO, false} + + static const char short_opts[] = "dSsRt:hD:o:v::x::Vr:jJ:lW:L:M:"; + static struct option long_opts[] = { + { "tree", required_argument, 0, 't' }, + { "leave-stopped", no_argument, 0, 's' }, + { "leave-running", no_argument, 0, 'R' }, + BOOL_OPT("restore-detached", &opts.restore_detach), + BOOL_OPT("restore-sibling", &opts.restore_sibling), + BOOL_OPT("daemon", &opts.restore_detach), + { "images-dir", required_argument, 0, 'D' }, + { "work-dir", required_argument, 0, 'W' }, + { "log-file", required_argument, 0, 'o' }, + { "join-ns", required_argument, 0, 'J' }, + { "root", required_argument, 0, 'r' }, + { USK_EXT_PARAM, optional_argument, 0, 'x' }, + { "help", no_argument, 0, 'h' }, + BOOL_OPT(SK_EST_PARAM, &opts.tcp_established_ok), + { "close", required_argument, 0, 1043 }, + BOOL_OPT("log-pid", &opts.log_file_per_pid), + { "version", no_argument, 0, 'V' }, + BOOL_OPT("evasive-devices", &opts.evasive_devices), + { "pidfile", required_argument, 0, 1046 }, + { "veth-pair", required_argument, 0, 1047 }, + { "action-script", required_argument, 0, 1049 }, + BOOL_OPT(LREMAP_PARAM, &opts.link_remap_ok), + BOOL_OPT(OPT_SHELL_JOB, &opts.shell_job), + BOOL_OPT(OPT_FILE_LOCKS, &opts.handle_file_locks), + BOOL_OPT("page-server", &opts.use_page_server), + { "address", required_argument, 0, 1051 }, + { "port", required_argument, 0, 1052 }, + { "prev-images-dir", required_argument, 0, 1053 }, + { "ms", no_argument, 0, 1054 }, + BOOL_OPT("track-mem", &opts.track_mem), + BOOL_OPT("auto-dedup", &opts.auto_dedup), + { "libdir", required_argument, 0, 'L' }, + { "cpu-cap", optional_argument, 0, 1057 }, + 
BOOL_OPT("force-irmap", &opts.force_irmap), + { "ext-mount-map", required_argument, 0, 'M' }, + { "exec-cmd", no_argument, 0, 1059 }, + { "manage-cgroups", optional_argument, 0, 1060 }, + { "cgroup-root", required_argument, 0, 1061 }, + { "inherit-fd", required_argument, 0, 1062 }, + { "feature", required_argument, 0, 1063 }, + { "skip-mnt", required_argument, 0, 1064 }, + { "enable-fs", required_argument, 0, 1065 }, + { "enable-external-sharing", no_argument, &opts.enable_external_sharing, true }, + { "enable-external-masters", no_argument, &opts.enable_external_masters, true }, + { "freeze-cgroup", required_argument, 0, 1068 }, + { "ghost-limit", required_argument, 0, 1069 }, + { "irmap-scan-path", required_argument, 0, 1070 }, + { "lsm-profile", required_argument, 0, 1071 }, + { "timeout", required_argument, 0, 1072 }, + { "external", required_argument, 0, 1073 }, + { "empty-ns", required_argument, 0, 1074 }, + { "lazy-pages", no_argument, 0, 1076 }, + BOOL_OPT("extra", &opts.check_extra_features), + BOOL_OPT("experimental", &opts.check_experimental_features), + { "all", no_argument, 0, 1079 }, + { "cgroup-props", required_argument, 0, 1080 }, + { "cgroup-props-file", required_argument, 0, 1081 }, + { "cgroup-dump-controller", required_argument, 0, 1082 }, + BOOL_OPT(SK_INFLIGHT_PARAM, &opts.tcp_skip_in_flight), + BOOL_OPT("deprecated", &opts.deprecated_ok), + BOOL_OPT("display-stats", &opts.display_stats), + BOOL_OPT("weak-sysctls", &opts.weak_sysctls), + { "status-fd", required_argument, 0, 1088 }, + BOOL_OPT(SK_CLOSE_PARAM, &opts.tcp_close), + { "verbosity", optional_argument, 0, 'v' }, + { "ps-socket", required_argument, 0, 1091}, + BOOL_OPT("remote", &opts.remote), + { "config", required_argument, 0, 1089}, + { "no-default-config", no_argument, 0, 1090}, + { "tls-cacert", required_argument, 0, 1092}, + { "tls-cacrl", required_argument, 0, 1093}, + { "tls-cert", required_argument, 0, 1094}, + { "tls-key", required_argument, 0, 1095}, + BOOL_OPT("tls", 
&opts.tls), + {"tls-no-cn-verify", no_argument, &opts.tls_no_cn_verify, true}, + { }, + }; + +#undef BOOL_OPT + + ret = pre_parse(argc, argv, usage_error, &no_default_config, + &cfg_file); + + if (ret) + return 2; + + while (1) { + idx = -1; + /* Only if opt is -1 we are going to the next configuration input */ + if (opt == -1) { + /* Do not free any memory if it points to argv */ + if (state != PARSING_ARGV + 1) { + int i; + for (i=1; i < _argc; i++) { + free(_argv[i]); + } + free(_argv); + } + /* This needs to be reset for a new getopt() run */ + _argc = 0; + _argv = NULL; + + state = next_config(argv, &_argv, no_default_config, state, cfg_file); + + /* if next_config() returns 0 it means no more configs found */ + if (state == 0) + break; + + if (!_argv) + continue; + + _argc = count_elements(_argv); + optind = 0; + } + + opt = getopt_long(_argc, _argv, short_opts, long_opts, &idx); + + /* + * The end of the current _argv has been reached, + * let's go to the next _argv + */ + if (opt == -1) + continue; + + /* + * If opt == 0 then getopt will directly fill out the corresponding + * field in CRIU's opts structure. 
+		 */
+		if (!opt)
+			continue;
+
+		switch (opt) {
+		case 's':
+			opts.final_state = TASK_STOPPED;
+			break;
+		case 'R':
+			opts.final_state = TASK_ALIVE;
+			break;
+		case 'x':
+			/* The argument is optional; the flag is set either way */
+			if (optarg && unix_sk_ids_parse(optarg) < 0)
+				return 1;
+			opts.ext_unix_sk = true;
+			break;
+		case 't':
+			opts.tree_id = atoi(optarg);
+			if (opts.tree_id <= 0)
+				goto bad_arg;
+			break;
+		case 'r':
+			SET_CHAR_OPTS(root, optarg);
+			break;
+		case 'd':
+			opts.restore_detach = true;
+			break;
+		case 'S':
+			opts.restore_sibling = true;
+			break;
+		case 'D':
+			SET_CHAR_OPTS(imgs_dir, optarg);
+			break;
+		case 'W':
+			SET_CHAR_OPTS(work_dir, optarg);
+			break;
+		case 'o':
+			SET_CHAR_OPTS(output, optarg);
+			break;
+		case 'J':
+			if (parse_join_ns(optarg))
+				goto bad_arg;
+			break;
+		case 'v':
+			if (optarg) {
+				if (optarg[0] == 'v')
+					/* handle -vvvvv */
+					opts.log_level += strlen(optarg) + 1;
+				else
+					opts.log_level = atoi(optarg);
+			} else
+				opts.log_level++;
+			break;
+		case 1043: {
+			int fd;
+
+			fd = atoi(optarg);
+			pr_info("Closing fd %d\n", fd);
+			close(fd);
+			break;
+		}
+		case 1046:
+			SET_CHAR_OPTS(pidfile, optarg);
+			break;
+		case 1047:
+			{
+				char *aux;
+
+				/* Expected form is "<a>=<b>"; split in place */
+				aux = strchr(optarg, '=');
+				if (aux == NULL)
+					goto bad_arg;
+
+				*aux = '\0';
+				if (veth_pair_add(optarg, aux + 1))
+					return 1;
+			}
+			break;
+		case 1049:
+			if (add_script(optarg))
+				return 1;
+			break;
+		case 1051:
+			SET_CHAR_OPTS(addr, optarg);
+			break;
+		case 1052:
+			opts.port = atoi(optarg);
+			if (!opts.port)
+				goto bad_arg;
+			break;
+		case 'j':
+			opts.shell_job = true;
+			break;
+		case 'l':
+			opts.handle_file_locks = true;
+			break;
+		case 1053:
+			SET_CHAR_OPTS(img_parent, optarg);
+			break;
+		case 1057:
+			if (parse_cpu_cap(&opts, optarg))
+				return 2;
+			break;
+		case 1058:
+			opts.force_irmap = true;
+			break;
+		case 1054:
+			pr_err("--ms is deprecated; see \"Check options\" of criu --help\n");
+			return 1;
+		case 'L':
+			/*
+			 * Store libdir the same way as every other string
+			 * option. It must not be re-pointed at optarg
+			 * afterwards: for configuration-file input the
+			 * _argv[] strings are freed when the parser moves on
+			 * to the next source, which would leave opts.libdir
+			 * dangling (and drop the value SET_CHAR_OPTS stored).
+			 */
+			SET_CHAR_OPTS(libdir, optarg);
+			break;
+		case 1059:
+			*has_exec_cmd = true;
+			break;
+		case 1060:
+			if
(parse_manage_cgroups(&opts, optarg))
+				return 2;
+			break;
+		case 1061:
+			{
+				char *path, *ctl;
+
+				/* "[<controller>:]<path>", split in place */
+				path = strchr(optarg, ':');
+				if (path) {
+					*path = '\0';
+					path++;
+					ctl = optarg;
+				} else {
+					path = optarg;
+					ctl = NULL;
+				}
+
+				/*
+				 * Failures are reported as 1, like every
+				 * other case here: parse_options() documents
+				 * only 0, 1 and 2 as return values.
+				 */
+				if (new_cg_root_add(ctl, path))
+					return 1;
+			}
+			break;
+		case 1062:
+			if (inherit_fd_parse(optarg) < 0)
+				return 1;
+			break;
+		case 1063:
+			ret = check_add_feature(optarg);
+			if (ret < 0)	/* invalid kernel feature name */
+				return 1;
+			if (ret > 0)	/* list kernel features and exit */
+				return 0;
+			break;
+		case 1064:
+			if (!add_skip_mount(optarg))
+				return 1;
+			break;
+		case 1065:
+			if (!add_fsname_auto(optarg))
+				return 1;
+			break;
+		case 1068:
+			SET_CHAR_OPTS(freeze_cgroup, optarg);
+			break;
+		case 1069:
+			opts.ghost_limit = parse_size(optarg);
+			break;
+		case 1070:
+			/* Same 0/1/2 return contract as above */
+			if (irmap_scan_path_add(optarg))
+				return 1;
+			break;
+		case 1071:
+			SET_CHAR_OPTS(lsm_profile, optarg);
+			opts.lsm_supplied = true;
+			break;
+		case 1072:
+			opts.timeout = atoi(optarg);
+			break;
+		case 1076:
+			opts.lazy_pages = true;
+			break;
+		case 'M':
+			{
+				char *aux;
+
+				if (strcmp(optarg, "auto") == 0) {
+					opts.autodetect_ext_mounts = true;
+					break;
+				}
+
+				/* Otherwise "<key>:<value>", split in place */
+				aux = strchr(optarg, ':');
+				if (aux == NULL)
+					goto bad_arg;
+
+				*aux = '\0';
+				if (ext_mount_add(optarg, aux + 1))
+					return 1;
+			}
+			break;
+		case 1073:
+			if (add_external(optarg))
+				return 1;
+			break;
+		case 1074:
+			if (!strcmp("net", optarg))
+				opts.empty_ns |= CLONE_NEWNET;
+			else {
+				pr_err("Unsupported empty namespace: %s\n",
+						optarg);
+				return 1;
+			}
+			break;
+		case 1079:
+			opts.check_extra_features = true;
+			opts.check_experimental_features = true;
+			break;
+		case 1080:
+			SET_CHAR_OPTS(cgroup_props, optarg);
+			break;
+		case 1081:
+			SET_CHAR_OPTS(cgroup_props_file, optarg);
+			break;
+		case 1082:
+			if (!cgp_add_dump_controller(optarg))
+				return 1;
+			break;
+		case 1088:
+			if (sscanf(optarg, "%d", &opts.status_fd) != 1) {
+				pr_err("Unable to parse a value of --status-fd\n");
+				return 1;
+			}
+			break;
+
case 1089: + break; + case 1090: + break; + case 1091: + opts.ps_socket = atoi(optarg); + break; + case 1092: + SET_CHAR_OPTS(tls_cacert, optarg); + break; + case 1093: + SET_CHAR_OPTS(tls_cacrl, optarg); + break; + case 1094: + SET_CHAR_OPTS(tls_cert, optarg); + break; + case 1095: + SET_CHAR_OPTS(tls_key, optarg); + break; + case 'V': + pr_msg("Version: %s\n", CRIU_VERSION); + if (strcmp(CRIU_GITID, "0")) + pr_msg("GitID: %s\n", CRIU_GITID); + exit(0); + case 'h': + *usage_error = false; + return 2; + default: + return 2; + } + } + + return 0; + +bad_arg: + if (idx < 0) /* short option */ + pr_msg("Error: invalid argument for -%c: %s\n", + opt, optarg); + else /* long option */ + pr_msg("Error: invalid argument for --%s: %s\n", + long_opts[idx].name, optarg); + return 1; +} + +int check_options() +{ + if (opts.tcp_established_ok) + pr_info("Will dump/restore TCP connections\n"); + if (opts.tcp_skip_in_flight) + pr_info("Will skip in-flight TCP connections\n"); + if (opts.tcp_close) + pr_info("Will drop all TCP connections on restore\n"); + if (opts.link_remap_ok) + pr_info("Will allow link remaps on FS\n"); + if (opts.weak_sysctls) + pr_info("Will skip non-existant sysctls on restore\n"); + + if (opts.deprecated_ok) + pr_info("Turn deprecated stuff ON\n"); + else if (getenv("CRIU_DEPRECATED")) { + pr_info("Turn deprecated stuff ON via env\n"); + opts.deprecated_ok = true; + } + + if (!opts.restore_detach && opts.restore_sibling) { + pr_err("--restore-sibling only makes sense with --restore-detach\n"); + return 1; + } + + if (opts.ps_socket != -1) { + if (opts.addr || opts.port) + pr_warn("Using --address or --port in " + "combination with --ps-socket is obsolete\n"); + if (opts.ps_socket <= STDERR_FILENO && opts.daemon_mode) { + pr_err("Standard file descriptors will be closed" + " in daemon mode\n"); + return 1; + } + } + +#ifndef CONFIG_GNUTLS + if (opts.tls) { + pr_err("CRIU was built without TLS support\n"); + return 1; + } +#endif + + if 
(check_namespace_opts()) { + pr_err("Error: namespace flags conflict\n"); + return 1; + } + + return 0; +} diff --git a/CRIU_code/criu/cr-check.c b/CRIU_code/criu/cr-check.c new file mode 100644 index 0000000..75a665c --- /dev/null +++ b/CRIU_code/criu/cr-check.c @@ -0,0 +1,1529 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../soccr/soccr.h" + +#include "types.h" +#include "fdinfo.h" +#include "sockets.h" +#include "crtools.h" +#include "log.h" +#include "util-pie.h" +#include "prctl.h" +#include "files.h" +#include "sk-inet.h" +#include "proc_parse.h" +#include "mount.h" +#include "tty.h" +#include +#include "ptrace-compat.h" +#include "kerndat.h" +#include "timerfd.h" +#include "util.h" +#include "tun.h" +#include "namespaces.h" +#include "pstree.h" +#include "cr_options.h" +#include "libnetlink.h" +#include "net.h" +#include "restorer.h" +#include "uffd.h" + +static char *feature_name(int (*func)()); + +static int check_tty(void) +{ + int master = -1, slave = -1; + const int lock = 1; + struct termios t; + char *slavename; + int ret = -1; + + if (ARRAY_SIZE(t.c_cc) < TERMIOS_NCC) { + pr_msg("struct termios has %d @c_cc while " + "at least %d expected.\n", + (int)ARRAY_SIZE(t.c_cc), + TERMIOS_NCC); + goto out; + } + + master = open("/dev/ptmx", O_RDWR); + if (master < 0) { + pr_perror("Can't open /dev/ptmx"); + goto out; + } + + if (ioctl(master, TIOCSPTLCK, &lock)) { + pr_perror("Can't lock pty master"); + goto out; + } + + slavename = ptsname(master); + slave = open(slavename, O_RDWR); + if (slave < 0) { + if (errno != EIO) { + pr_perror("Unexpected error on locked pty"); + goto out; + } + } else { + pr_err("Managed to open locked pty.\n"); + goto out; + } + + ret = 0; +out: + close_safe(&master); + close_safe(&slave); + return ret; +} + +static int 
check_map_files(void) +{ + int ret; + + ret = access("/proc/self/map_files", R_OK); + if (!ret) + return 0; + + pr_perror("/proc//map_files is inaccessible"); + return -1; +} + +static int check_sock_diag(void) +{ + int ret; + struct ns_id ns; + + ns.ns_pid = 0; + ns.type = NS_CRIU; + ns.net.nlsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG); + if (ns.net.nlsk < 0) { + pr_perror("Can't make diag socket for check"); + return -1; + } + + ret = collect_sockets(&ns); + if (!ret) + return 0; + + pr_msg("The sock diag infrastructure is incomplete.\n"); + pr_msg("Make sure you have:\n"); + pr_msg(" 1. *_DIAG kernel config options turned on;\n"); + pr_msg(" 2. *_diag.ko modules loaded (if compiled as modules).\n"); + return -1; +} + +static int check_ns_last_pid(void) +{ + int ret; + + ret = access("/proc/" LAST_PID_PATH, W_OK); + if (!ret) + return 0; + + pr_perror("%s sysctl is inaccessible", LAST_PID_PATH); + return -1; +} + +static int check_sock_peek_off(void) +{ + int sk; + int ret, off, sz; + + sk = socket(PF_UNIX, SOCK_DGRAM, 0); + if (sk < 0) { + pr_perror("Can't create unix socket for check"); + return -1; + } + + sz = sizeof(off); + ret = getsockopt(sk, SOL_SOCKET, SO_PEEK_OFF, &off, (socklen_t *)&sz); + close(sk); + + if ((ret == 0) && (off == -1) && (sz == sizeof(int))) + return 0; + + pr_msg("SO_PEEK_OFF sockoption doesn't work.\n"); + return -1; +} + +static int check_kcmp(void) +{ + int ret = syscall(SYS_kcmp, getpid(), -1, -1, -1, -1); + + if (ret < 0 && errno == ENOSYS) { + pr_perror("System call kcmp is not supported"); + return -1; + } + + return 0; +} + +static int check_prctl_cat1(void) +{ + unsigned long user_auxv = 0; + unsigned int *tid_addr; + unsigned int size = 0; + int ret; + + ret = prctl(PR_GET_TID_ADDRESS, (unsigned long)&tid_addr, 0, 0, 0); + if (ret < 0) { + pr_msg("prctl: PR_GET_TID_ADDRESS is not supported: %m"); + return -1; + } + + /* + * It's OK if the new interface is not supported because it's + * a Category 2 feature, but the 
old interface has to be supported. + */ + ret = prctl(PR_SET_MM, PR_SET_MM_MAP_SIZE, (unsigned long)&size, 0, 0); + if (ret < 0) { + pr_msg("Info prctl: PR_SET_MM_MAP_SIZE is not supported\n"); + ret = prctl(PR_SET_MM, PR_SET_MM_BRK, (unsigned long)sbrk(0), 0, 0); + if (ret < 0) { + if (errno == EPERM) + pr_msg("prctl: One needs CAP_SYS_RESOURCE capability to perform testing\n"); + else + pr_msg("prctl: PR_SET_MM_BRK is not supported: %m\n"); + return -1; + } + + ret = prctl(PR_SET_MM, PR_SET_MM_EXE_FILE, -1, 0, 0); + if (ret < 0 && errno != EBADF) { + pr_msg("prctl: PR_SET_MM_EXE_FILE is not supported: %m\n"); + return -1; + } + + ret = prctl(PR_SET_MM, PR_SET_MM_AUXV, (long)&user_auxv, sizeof(user_auxv), 0); + if (ret < 0) { + pr_msg("prctl: PR_SET_MM_AUXV is not supported: %m\n"); + return -1; + } + } + + return 0; +} + +static int check_prctl_cat2(void) +{ + unsigned int size = 0; + int ret; + + ret = prctl(PR_SET_MM, PR_SET_MM_MAP_SIZE, (unsigned long)&size, 0, 0); + if (ret) { + pr_warn("prctl: PR_SET_MM_MAP_SIZE is not supported\n"); + return -1; + } + return 0; +} + +static int check_fcntl(void) +{ + u32 v[2]; + int fd; + + fd = open_proc(PROC_SELF, "comm"); + if (fd < 0) + return -1; + + if (fcntl(fd, F_GETOWNER_UIDS, (long)v)) { + pr_perror("Can't fetch file owner UIDs"); + close(fd); + return -1; + } + + close(fd); + return 0; +} + +static int check_proc_stat(void) +{ + struct proc_pid_stat stat; + int ret; + + ret = parse_pid_stat(getpid(), &stat); + if (ret) { + pr_msg("procfs: stat extension is not supported\n"); + return -1; + } + + return 0; +} + +static int check_fdinfo_eventfd(void) +{ + int fd, ret; + int cnt = 13; + EventfdFileEntry fe = EVENTFD_FILE_ENTRY__INIT; + + fd = eventfd(cnt, 0); + if (fd < 0) { + pr_perror("Can't make eventfd"); + return -1; + } + + ret = parse_fdinfo(fd, FD_TYPES__EVENTFD, &fe); + close(fd); + + if (ret) { + pr_err("Error parsing proc fdinfo\n"); + return -1; + } + + if (fe.counter != cnt) { + pr_err("Counter mismatch 
(or not met) %d want %d\n", + (int)fe.counter, cnt); + return -1; + } + + pr_info("Eventfd fdinfo works OK (%d vs %d)\n", cnt, (int)fe.counter); + return 0; +} + +int check_mnt_id(void) +{ + struct fdinfo_common fdinfo = { .mnt_id = -1 }; + int ret; + + ret = parse_fdinfo(get_service_fd(LOG_FD_OFF), FD_TYPES__UND, &fdinfo); + if (ret < 0) + return -1; + + if (fdinfo.mnt_id == -1) { + pr_err("fdinfo doesn't contain the mnt_id field\n"); + return -1; + } + + return 0; +} + +static int check_fdinfo_signalfd(void) +{ + int fd, ret; + sigset_t mask; + SignalfdEntry sfd = SIGNALFD_ENTRY__INIT; + + sigemptyset(&mask); + sigaddset(&mask, SIGUSR1); + fd = signalfd(-1, &mask, 0); + if (fd < 0) { + pr_perror("Can't make signalfd"); + return -1; + } + + ret = parse_fdinfo(fd, FD_TYPES__SIGNALFD, &sfd); + close(fd); + + if (ret) { + pr_err("Error parsing proc fdinfo\n"); + return -1; + } + + return 0; +} + +static int check_fdinfo_eventpoll(void) +{ + int efd, pfd[2], ret = -1; + struct epoll_event ev; + EventpollFileEntry efe = EVENTPOLL_FILE_ENTRY__INIT; + + if (pipe(pfd)) { + pr_perror("Can't make pipe to watch"); + return -1; + } + + efd = epoll_create(1); + if (efd < 0) { + pr_perror("Can't make epoll fd"); + goto pipe_err; + } + + memset(&ev, 0, sizeof(ev)); + ev.events = EPOLLIN | EPOLLOUT; + + if (epoll_ctl(efd, EPOLL_CTL_ADD, pfd[0], &ev)) { + pr_perror("Can't add epoll tfd"); + goto epoll_err; + } + + ret = parse_fdinfo(efd, FD_TYPES__EVENTPOLL, &efe); + if (ret) { + pr_err("Error parsing proc fdinfo\n"); + goto epoll_err; + } + + if (efe.n_tfd != 1 || efe.tfd[0]->tfd != pfd[0]) { + pr_err("TFD mismatch (or not met)\n"); + ret = -1; + goto epoll_err; + } + + pr_info("Epoll fdinfo works OK\n"); + +epoll_err: + close(efd); +pipe_err: + close(pfd[0]); + close(pfd[1]); + + return ret; +} + +static int check_fdinfo_inotify(void) +{ + int ifd, wd, ret; + InotifyFileEntry ify = INOTIFY_FILE_ENTRY__INIT; + + ifd = inotify_init1(0); + if (ifd < 0) { + pr_perror("Can't make 
inotify fd"); + return -1; + } + + wd = inotify_add_watch(ifd, ".", IN_ALL_EVENTS); + if (wd < 0) { + pr_perror("Can't add watch"); + close(ifd); + return -1; + } + + ret = parse_fdinfo(ifd, FD_TYPES__INOTIFY, &ify); + close(ifd); + + if (ret < 0) { + pr_err("Error parsing proc fdinfo\n"); + return -1; + } + + if (ify.n_wd != 1 || ify.wd[0]->wd != wd) { + pr_err("WD mismatch (or not met)\n"); + return -1; + } + + pr_info("Inotify fdinfo works OK\n"); + return 0; +} + +static int check_fdinfo_ext(void) +{ + int ret = 0; + + ret |= check_fdinfo_eventfd(); + ret |= check_fdinfo_eventpoll(); + ret |= check_fdinfo_signalfd(); + ret |= check_fdinfo_inotify(); + + return ret; +} + +static int check_unaligned_vmsplice(void) +{ + int p[2], ret; + char buf; /* :) */ + struct iovec iov; + + ret = pipe(p); + if (ret < 0) { + pr_perror("Can't create pipe"); + return ret; + } + iov.iov_base = &buf; + iov.iov_len = sizeof(buf); + ret = vmsplice(p[1], &iov, 1, SPLICE_F_GIFT | SPLICE_F_NONBLOCK); + if (ret < 0) { + pr_perror("Unaligned vmsplice doesn't work"); + goto err; + } + + pr_info("Unaligned vmsplice works OK\n"); + ret = 0; +err: + close(p[0]); + close(p[1]); + + return ret; +} + +#ifndef SO_GET_FILTER +#define SO_GET_FILTER SO_ATTACH_FILTER +#endif + +static int check_so_gets(void) +{ + int sk, ret = -1; + socklen_t len; + char name[IFNAMSIZ]; + + sk = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); + if (sk < 0) { + pr_perror("No socket"); + return -1; + } + + len = 0; + if (getsockopt(sk, SOL_SOCKET, SO_GET_FILTER, NULL, &len)) { + pr_perror("Can't get socket filter"); + goto err; + } + + len = sizeof(name); + if (getsockopt(sk, SOL_SOCKET, SO_BINDTODEVICE, name, &len)) { + pr_perror("Can't get socket bound dev"); + goto err; + } + + ret = 0; +err: + close(sk); + return ret; +} + +static int check_ipc(void) +{ + int ret; + + ret = access("/proc/sys/kernel/sem_next_id", R_OK | W_OK); + if (!ret) + return 0; + + pr_perror("/proc/sys/kernel/sem_next_id is inaccessible"); + return 
-1;
+}
+
+/*
+ * Check that rt_sigqueueinfo accepts a siginfo with a positive si_code
+ * sent to ourselves. SIGUSR1 is set to SIG_IGN first.
+ */
+static int check_sigqueuinfo()
+{
+	siginfo_t info = { .si_code = 1 };
+
+	signal(SIGUSR1, SIG_IGN);
+
+	if (syscall(SYS_rt_sigqueueinfo, getpid(), SIGUSR1, &info) < 0) {
+		pr_perror("Unable to send siginfo with positive si_code to itself");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Fork a child, optionally run @child_setup in it, wait until the child
+ * signals readiness over a socketpair and then PTRACE_ATTACH to it.
+ *
+ * Returns the pid of the attached (and waited-for) child, or -1 on
+ * failure. On every failure path the sockets are closed and a child
+ * that was created is killed and reaped. On success the caller is
+ * responsible for killing the child.
+ */
+static pid_t fork_and_ptrace_attach(int (*child_setup)(void))
+{
+	pid_t pid;
+	int sk_pair[2], sk;
+	char c = 0;
+
+	if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) {
+		pr_perror("socketpair");
+		return -1;
+	}
+
+	pid = fork();
+	if (pid < 0) {
+		pr_perror("fork");
+		/* No child exists, so both ends are ours to release */
+		close(sk_pair[0]);
+		close(sk_pair[1]);
+		return -1;
+	} else if (pid == 0) {
+		sk = sk_pair[1];
+		close(sk_pair[0]);
+
+		if (child_setup && child_setup() != 0)
+			exit(1);
+
+		/* Tell the parent that setup is done */
+		if (write(sk, &c, 1) != 1) {
+			pr_perror("write");
+			exit(1);
+		}
+
+		/* Park until the parent kills us */
+		while (1)
+			sleep(1000);
+		exit(1);
+	}
+
+	sk = sk_pair[0];
+	close(sk_pair[1]);
+
+	if (read(sk, &c, 1) != 1) {
+		/* Report first: close()/kill() may overwrite errno */
+		pr_perror("read");
+		close(sk);
+		kill(pid, SIGKILL);
+		waitpid(pid, NULL, 0);
+		return -1;
+	}
+
+	close(sk);
+
+	if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) == -1) {
+		pr_perror("Unable to ptrace the child");
+		kill(pid, SIGKILL);
+		/* Reap the child so no zombie is left behind */
+		waitpid(pid, NULL, 0);
+		return -1;
+	}
+
+	waitpid(pid, NULL, 0);
+
+	return pid;
+}
+
+/*
+ * Attach to a helper child and try PTRACE_PEEKSIGINFO and
+ * PTRACE_GETSIGMASK on it. Returns 0 if both requests succeed, -1
+ * otherwise; the child is killed in either case.
+ */
+static int check_ptrace_peeksiginfo(void)
+{
+	struct ptrace_peeksiginfo_args arg;
+	siginfo_t siginfo;
+	pid_t pid;
+	int ret = 0;	/* a status code, not a pid */
+	k_rtsigset_t mask;
+
+	pid = fork_and_ptrace_attach(NULL);
+	if (pid < 0)
+		return -1;
+
+	arg.flags = 0;
+	arg.off = 0;
+	arg.nr = 1;
+
+	if (ptrace(PTRACE_PEEKSIGINFO, pid, &arg, &siginfo) != 0) {
+		pr_perror("Unable to dump pending signals");
+		ret = -1;
+	}
+
+	if (ptrace(PTRACE_GETSIGMASK, pid, sizeof(mask), &mask) != 0) {
+		pr_perror("Unable to dump signal blocking mask");
+		ret = -1;
+	}
+
+	kill(pid, SIGKILL);
+	return ret;
+}
+
+/* One named mapping looked up in /proc/self/maps */
+struct special_mapping {
+	const char	*name;	/* trailing field of the maps line, including '\n' */
+	void		*addr;	/* start address, MAP_FAILED while not found */
+	size_t		size;	/* end - start */
+};
+
+/*
+ * Scan /proc/self/maps and fill in addr/size for each of the @nr entries
+ * in @vmas whose name matches a line. Returns 0 on success, -1 on a
+ * malformed line or when a name is seen twice.
+ */
+static int parse_special_maps(struct special_mapping *vmas, size_t nr)
+{
+	FILE *maps;
+	char buf[256];
+	int ret = 0;
+
+	maps = fopen_proc(PROC_SELF, "maps");
+	if (!maps)
+		return -1;
+
+ while (fgets(buf, sizeof(buf), maps)) { + unsigned long start, end; + int r, tail; + size_t i; + + r = sscanf(buf, "%lx-%lx %*s %*s %*s %*s %n\n", + &start, &end, &tail); + if (r != 2) { + fclose(maps); + pr_err("Bad maps format %d.%d (%s)\n", r, tail, buf + tail); + return -1; + } + + for (i = 0; i < nr; i++) { + if (strcmp(buf + tail, vmas[i].name) != 0) + continue; + if (vmas[i].addr != MAP_FAILED) { + pr_err("Special mapping meet twice: %s\n", vmas[i].name); + ret = -1; + goto out; + } + vmas[i].addr = (void *)start; + vmas[i].size = end - start; + } + } + +out: + fclose(maps); + return ret; +} + +static void dummy_sighandler(int sig) +{ +} + +/* + * The idea of test is checking if the kernel correctly tracks positions + * of special_mappings: vdso/vvar/sigpage/... + * Per-architecture commits added handling for mremap() somewhere between + * v4.8...v4.14. If the kernel doesn't have one of those patches, + * a process will crash after receiving a signal (we use SIGUSR1 for + * the test here). That's because after processing a signal the kernel + * needs a "landing" to return to userspace, which is based on vdso/sigpage. + * If the kernel doesn't track the position of mapping - we land in the void. + * And we definitely mremap() support by the fact that those special_mappings + * are subjects for ASLR. 
(See #288 as a reference) + */ +static void check_special_mapping_mremap_child(struct special_mapping *vmas, + size_t nr) +{ + size_t i, parking_size = 0; + void *parking_lot; + pid_t self = getpid(); + + for (i = 0; i < nr; i++) { + if (vmas[i].addr != MAP_FAILED) + parking_size += vmas[i].size; + } + + if (signal(SIGUSR1, dummy_sighandler) == SIG_ERR) { + pr_perror("signal() failed"); + exit(1); + } + + parking_lot = mmap(NULL, parking_size, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (parking_lot == MAP_FAILED) { + pr_perror("mmap(%zu) failed", parking_size); + exit(1); + } + + for (i = 0; i < nr; i++) { + unsigned long ret; + + if (vmas[i].addr == MAP_FAILED) + continue; + + ret = syscall(__NR_mremap, (unsigned long)vmas[i].addr, + vmas[i].size, vmas[i].size, + MREMAP_FIXED | MREMAP_MAYMOVE, + (unsigned long)parking_lot); + if (ret != (unsigned long)parking_lot) + syscall(__NR_exit, 1); + parking_lot += vmas[i].size; + } + + syscall(__NR_kill, self, SIGUSR1); + syscall(__NR_exit, 0); +} + +static int check_special_mapping_mremap(void) +{ + struct special_mapping special_vmas[] = { + { + .name = "[vvar]\n", + .addr = MAP_FAILED, + }, + { + .name = "[vdso]\n", + .addr = MAP_FAILED, + }, + { + .name = "[sigpage]\n", + .addr = MAP_FAILED, + }, + /* XXX: { .name = "[uprobes]\n" }, */ + /* + * Not subjects for ASLR, skipping: + * { .name = "[vectors]\n", }, + * { .name = "[vsyscall]\n" }, + */ + }; + size_t vmas_nr = ARRAY_SIZE(special_vmas); + pid_t child; + int stat; + + if (parse_special_maps(special_vmas, vmas_nr)) + return -1; + + child = fork(); + if (child < 0) { + pr_perror("%s(): failed to fork()", __func__); + return -1; + } + + if (child == 0) + check_special_mapping_mremap_child(special_vmas, vmas_nr); + + if (waitpid(child, &stat, 0) != child) { + if (errno == ECHILD) { + pr_err("BUG: Someone waited for the child already\n"); + return -1; + } + /* Probably, we're interrupted with a signal - cleanup */ + pr_err("Failed to wait for a child 
%d\n", errno); + kill(child, SIGKILL); + return -1; + } + + if (WIFSIGNALED(stat)) { + pr_err("Child killed by signal %d\n", WTERMSIG(stat)); + pr_err("Your kernel probably lacks the support for mremapping special mappings\n"); + return -1; + } else if (WIFEXITED(stat)) { + if (WEXITSTATUS(stat) == 0) + return 0; + pr_err("Child exited with %d\n", WEXITSTATUS(stat)); + return -1; + } + + pr_err("BUG: waitpid() returned stat=%d\n", stat); + /* We're not killing the child here - it's predestined to die anyway. */ + return -1; +} + +static int check_ptrace_suspend_seccomp(void) +{ + pid_t pid; + int ret = 0; + + pid = fork_and_ptrace_attach(NULL); + if (pid < 0) + return -1; + + if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP) < 0) { + if (errno == EINVAL) { + pr_err("Kernel doesn't support PTRACE_O_SUSPEND_SECCOMP\n"); + } else { + pr_perror("couldn't suspend seccomp"); + } + ret = -1; + } + + kill(pid, SIGKILL); + return ret; +} + +static int setup_seccomp_filter(void) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr)), + /* Allow all syscalls except ptrace */ + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_ptrace, 0, 1), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), + }; + + struct sock_fprog bpf_prog = { + .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), + .filter = filter, + }; + + if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, (long) &bpf_prog, 0, 0) < 0) + return -1; + + return 0; +} + +static int check_ptrace_dump_seccomp_filters(void) +{ + pid_t pid; + int ret = 0, len; + + pid = fork_and_ptrace_attach(setup_seccomp_filter); + if (pid < 0) + return -1; + + len = ptrace(PTRACE_SECCOMP_GET_FILTER, pid, 0, NULL); + if (len < 0) { + ret = -1; + pr_perror("Dumping seccomp filters not supported"); + } + + kill(pid, SIGKILL); + return ret; +} + +static int check_mem_dirty_track(void) +{ + if (!kdat.has_dirty_track) { + pr_warn("Dirty tracking is OFF. 
Memory snapshot will not work.\n"); + return -1; + } + return 0; +} + +static int check_posix_timers(void) +{ + int ret; + + ret = access("/proc/self/timers", R_OK); + if (!ret) + return 0; + + pr_msg("/proc/<pid>/timers file is missing.\n"); + return -1; +} + +static unsigned long get_ring_len(unsigned long addr) +{ + FILE *maps; + char buf[256]; + + maps = fopen_proc(PROC_SELF, "maps"); + if (!maps) + return 0; + + while (fgets(buf, sizeof(buf), maps)) { + unsigned long start, end; + int r, tail = 0; /* %n stores nothing when sscanf() stops early */ + + r = sscanf(buf, "%lx-%lx %*s %*s %*s %*s %n\n", &start, &end, &tail); + if (r != 2) { + fclose(maps); + pr_err("Bad maps format %d.%d (%s)\n", r, tail, buf + tail); + return 0; + } + + if (start == addr) { + fclose(maps); + if (strcmp(buf + tail, "/[aio] (deleted)\n")) + goto notfound; + + return end - start; + } + } + + fclose(maps); +notfound: + pr_err("No AIO ring at expected location\n"); + return 0; +} + +static int check_aio_remap(void) +{ + aio_context_t ctx = 0; + unsigned long len; + void *naddr; + int r; + + if (syscall(SYS_io_setup, 16, &ctx) < 0) { + pr_err("No AIO syscall: %m\n"); + return -1; + } + + len = get_ring_len((unsigned long) ctx); + if (!len) + return -1; + + naddr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (naddr == MAP_FAILED) { + pr_perror("Can't find place for new AIO ring"); + return -1; + } + + if (mremap((void *)ctx, len, len, MREMAP_FIXED | MREMAP_MAYMOVE, naddr) == MAP_FAILED) { + pr_perror("Can't remap AIO ring"); + return -1; + } + + ctx = (aio_context_t)naddr; + r = syscall(SYS_io_getevents, ctx, 0, 1, NULL, NULL); + if (r < 0) { + pr_err("AIO remap doesn't work properly: %m\n"); + return -1; + } + + return 0; +} + +static int check_fdinfo_lock(void) +{ + if (!kdat.has_fdinfo_lock) { + pr_err("fdinfo doesn't contain the lock field\n"); + return -1; + } + + return 0; +} + +struct clone_arg { + /* + * Reserve some space for clone() to locate arguments + * and retcode in this place + */ + char 
stack[128] __stack_aligned__; + char stack_ptr[0]; +}; + +static int clone_cb(void *_arg) { + exit(0); +} + +static int check_clone_parent_vs_pid() +{ + struct clone_arg ca; + pid_t pid; + + pid = clone(clone_cb, ca.stack_ptr, CLONE_NEWPID | CLONE_PARENT, &ca); + if (pid < 0) { + pr_err("CLONE_PARENT | CLONE_NEWPID don't work together\n"); + return -1; + } + + return 0; +} + +static int check_autofs_pipe_ino(void) +{ + FILE *f; + char str[1024]; + int ret = -ENOENT; + + f = fopen_proc(PROC_SELF, "mountinfo"); + if (!f) + return -1; + + while (fgets(str, sizeof(str), f)) { + if (strstr(str, " autofs ")) { + if (strstr(str, "pipe_ino=")) + ret = 0; + else { + pr_err("autofs not supported.\n"); + ret = -ENOTSUP; + } + break; + } + } + + fclose(f); + return ret; +} + +static int check_autofs(void) +{ + char *dir, *options, template[] = "/tmp/.criu.mnt.XXXXXX"; + int ret, pfd[2]; + + ret = check_autofs_pipe_ino(); + if (ret != -ENOENT) + return ret; + + if (pipe(pfd) < 0) { + pr_perror("failed to create pipe"); + return -1; + } + + ret = -1; + + options = xsprintf("fd=%d,pgrp=%d,minproto=5,maxproto=5,direct", + pfd[1], getpgrp()); + if (!options) { + pr_err("failed to allocate autofs options\n"); + goto close_pipe; + } + + dir = mkdtemp(template); + if (!dir) { + pr_perror("failed to construct temporary name"); + goto free_options; + } + + if (mount("criu", dir, "autofs", 0, options) < 0) { + pr_perror("failed to mount autofs"); + goto unlink_dir; + } + + ret = check_autofs_pipe_ino(); + + if (umount(dir)) + pr_perror("failed to umount %s", dir); + +unlink_dir: + if (rmdir(dir)) + pr_perror("failed to unlink %s", dir); +free_options: + free(options); +close_pipe: + close(pfd[0]); + close(pfd[1]); + return ret; +} + +static int check_cgroupns(void) +{ + int ret; + + ret = access("/proc/self/ns/cgroup", F_OK); + if (ret < 0) { + pr_err("cgroupns not supported. 
This is not fatal.\n"); + return -1; + } + + return 0; +} + +static int check_tcp(void) +{ + socklen_t optlen; + int sk, ret; + int val; + + sk = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sk < 0) { + pr_perror("Can't create TCP socket :("); + return -1; + } + + val = 1; + ret = setsockopt(sk, SOL_TCP, TCP_REPAIR, &val, sizeof(val)); + if (ret < 0) { + pr_perror("Can't turn TCP repair mode ON"); + goto out; + } + + optlen = sizeof(val); + ret = getsockopt(sk, SOL_TCP, TCP_TIMESTAMP, &val, &optlen); + if (ret) + pr_perror("Can't get TCP_TIMESTAMP"); + +out: + close(sk); + + return ret; +} + +static int check_tcp_halt_closed(void) +{ + if (!kdat.has_tcp_half_closed) { + pr_err("TCP_REPAIR can't be enabled for half-closed sockets\n"); + return -1; + } + + return 0; +} + +static int kerndat_tcp_repair_window(void) +{ + struct tcp_repair_window opt; + socklen_t optlen = sizeof(opt); + int sk, val = 1; + + sk = socket(AF_INET, SOCK_STREAM, 0); + if (sk < 0) { + pr_perror("Unable to create inet socket"); + goto errn; + } + + if (setsockopt(sk, SOL_TCP, TCP_REPAIR, &val, sizeof(val))) { + if (errno == EPERM) { + pr_warn("TCP_REPAIR isn't available to unprivileged users\n"); + goto now; + } + pr_perror("Unable to set TCP_REPAIR"); + goto err; + } + + if (getsockopt(sk, SOL_TCP, TCP_REPAIR_WINDOW, &opt, &optlen)) { + if (errno != ENOPROTOOPT) { + pr_perror("Unable to set TCP_REPAIR_WINDOW"); + goto err; + } +now: + val = 0; + } else + val = 1; + + close(sk); + return val; + +err: + close(sk); +errn: + return -1; +} + +static int check_tcp_window(void) +{ + int ret; + + ret = kerndat_tcp_repair_window(); + if (ret < 0) + return -1; + + if (ret == 0) { + pr_err("The TCP_REPAIR_WINDOW option isn't supported.\n"); + return -1; + } + + return 0; +} + +static int check_userns(void) +{ + int ret; + unsigned long size = 0; + + ret = access("/proc/self/ns/user", F_OK); + if (ret) { + pr_perror("No userns proc file"); + return -1; + } + + ret = prctl(PR_SET_MM, 
PR_SET_MM_MAP_SIZE, (unsigned long)&size, 0, 0); + if (ret < 0) { + pr_perror("prctl: PR_SET_MM_MAP_SIZE is not supported"); + return -1; + } + + return 0; +} + +static int check_loginuid(void) +{ + if (kdat.luid != LUID_FULL) { + pr_warn("Loginuid restore is OFF.\n"); + return -1; + } + + return 0; +} + +static int check_compat_cr(void) +{ +#ifdef CONFIG_COMPAT + if (kdat_compatible_cr()) + return 0; + pr_warn("compat_cr is not supported. Requires kernel >= v4.12\n"); +#else + pr_warn("CRIU built without CONFIG_COMPAT - can't C/R compatible tasks\n"); +#endif + return -1; +} + +static int check_uffd(void) +{ + if (!kdat.has_uffd) { + pr_err("UFFD is not supported\n"); + return -1; + } + + return 0; +} + +static int check_uffd_noncoop(void) +{ + if (check_uffd()) + return -1; + + if (!uffd_noncooperative()) { + pr_err("Non-cooperative UFFD is not supported\n"); + return -1; + } + + return 0; +} + +static int check_can_map_vdso(void) +{ + if (kdat_can_map_vdso() == 1) + return 0; + pr_warn("Do not have API to map vDSO - will use mremap() to restore vDSO\n"); + return -1; +} + +static int check_sk_netns(void) +{ + if (!kdat.sk_ns) + return -1; + + return 0; +} + +static int check_sk_unix_file(void) +{ + if (!kdat.sk_unix_file) + return -1; + + return 0; +} + +static int check_kcmp_epoll(void) +{ + if (!kdat.has_kcmp_epoll_tfd) + return -1; + + return 0; +} + +static int check_net_diag_raw(void) +{ + check_sock_diag(); + return (socket_test_collect_bit(AF_INET, IPPROTO_RAW) && + socket_test_collect_bit(AF_INET6, IPPROTO_RAW)) ? 0 : -1; +} + +static int (*chk_feature)(void); + +/* + * There are three categories of kernel features: + * + * 1. Absolutely required (/proc/pid/map_files, ptrace PEEKSIGINFO, etc.). + * 2. Required only for specific cases (aio remap, tun, etc.). + * Checked when --extra or --all is specified. + * 3. Experimental (task-diag). + * Checked when --experimental or --all is specified. 
+ * + * We fail if any feature in category 1 is missing but tolerate failures + * in the other categories. Currently, there is nothing in category 3. + */ +#define CHECK_GOOD "Looks good." +#define CHECK_BAD "Does not look good." +#define CHECK_MAYBE "Looks good but some kernel features are missing\n" \ + "which, depending on your process tree, may cause\n" \ + "dump or restore failure." +#define CHECK_CAT1(fn) do { \ + if ((ret = fn) != 0) { \ + print_on_level(DEFAULT_LOGLEVEL, "%s\n", CHECK_BAD); \ + return ret; \ + } \ + } while (0) +int cr_check(void) +{ + struct ns_id *ns; + int ret = 0; + + if (!is_root_user()) + return -1; + + root_item = alloc_pstree_item(); + if (root_item == NULL) + return -1; + + root_item->pid->real = getpid(); + + if (collect_pstree_ids()) + return -1; + + ns = lookup_ns_by_id(root_item->ids->mnt_ns_id, &mnt_ns_desc); + if (ns == NULL) + return -1; + + mntinfo = collect_mntinfo(ns, false); + if (mntinfo == NULL) + return -1; + + if (chk_feature) { + if (chk_feature()) + return -1; + print_on_level(DEFAULT_LOGLEVEL, "%s is supported\n", + feature_name(chk_feature)); + return 0; + } + + /* + * Category 1 - absolutely required. + * So that the user can see clearly what's missing, we exit with + * non-zero status on the first failure because it gets very + * confusing when there are many warnings and error messages. + */ + CHECK_CAT1(check_map_files()); + CHECK_CAT1(check_sock_diag()); + CHECK_CAT1(check_ns_last_pid()); + CHECK_CAT1(check_sock_peek_off()); + CHECK_CAT1(check_kcmp()); + CHECK_CAT1(check_prctl_cat1()); + CHECK_CAT1(check_fcntl()); + CHECK_CAT1(check_proc_stat()); + CHECK_CAT1(check_tcp()); + CHECK_CAT1(check_fdinfo_ext()); + CHECK_CAT1(check_unaligned_vmsplice()); + CHECK_CAT1(check_tty()); + CHECK_CAT1(check_so_gets()); + CHECK_CAT1(check_ipc()); + CHECK_CAT1(check_sigqueuinfo()); + CHECK_CAT1(check_ptrace_peeksiginfo()); + CHECK_CAT1(check_special_mapping_mremap()); + + /* + * Category 2 - required for specific cases. 
+ * Unlike Category 1 features, we don't exit with non-zero status + * on a failure because CRIU may still work. + */ + if (opts.check_extra_features) { + ret |= check_prctl_cat2(); + ret |= check_ptrace_suspend_seccomp(); + ret |= check_ptrace_dump_seccomp_filters(); + ret |= check_mem_dirty_track(); + ret |= check_posix_timers(); + ret |= check_tun_cr(0); + ret |= check_timerfd(); + ret |= check_mnt_id(); + ret |= check_aio_remap(); + ret |= check_fdinfo_lock(); + ret |= check_clone_parent_vs_pid(); + ret |= check_cgroupns(); + ret |= check_tcp_window(); + ret |= check_tcp_halt_closed(); + ret |= check_userns(); + ret |= check_loginuid(); + ret |= check_can_map_vdso(); + ret |= check_uffd(); + ret |= check_uffd_noncoop(); + ret |= check_sk_netns(); + ret |= check_kcmp_epoll(); + ret |= check_net_diag_raw(); + } + + /* + * Category 3 - experimental. + */ + if (opts.check_experimental_features) { + ret |= check_autofs(); + ret |= check_compat_cr(); + } + + print_on_level(DEFAULT_LOGLEVEL, "%s\n", ret ? CHECK_MAYBE : CHECK_GOOD); + return ret; +} +#undef CHECK_GOOD +#undef CHECK_BAD +#undef CHECK_MAYBE +#undef CHECK_CAT1 + +static int check_tun(void) +{ + /* + * In case there's no TUN support at all we + * should report error. Unlike this plain criu + * check would report "Looks good" in this case + * since C/R effectively works, just not for TUN. + */ + return check_tun_cr(-1); +} + +static int check_tun_netns(void) +{ + bool has = false; + check_tun_netns_cr(&has); + return has ? 0 : -1; +} + +static int check_nsid(void) +{ + if (!kdat.has_nsid) { + pr_warn("NSID isn't supported\n"); + return -1; + } + + return 0; +} + +static int check_link_nsid(void) +{ + if (!kdat.has_link_nsid) { + pr_warn("Link NSID isn't supported\n"); /* distinct from the check_nsid() warning */ + return -1; + } + + return 0; +} + +static int check_external_net_ns(void) +{ + /* + * This is obviously not a real check. 
This only exists, so that + * CRIU clients/users can check if this CRIU version supports the + * external network namespace feature. Theoretically the CRIU client + * or user could also parse the version, but especially for CLI users + * version comparison in the shell is not easy. + * This feature check does not exist for RPC as RPC has a special + * version call which does not require string parsing and the external + * network namespace feature is available for all CRIU versions newer + * than 3.9. + */ + return 0; +} + +struct feature_list { + char *name; + int (*func)(); +}; + +static struct feature_list feature_list[] = { + { "mnt_id", check_mnt_id }, + { "mem_dirty_track", check_mem_dirty_track }, + { "aio_remap", check_aio_remap }, + { "timerfd", check_timerfd }, + { "tun", check_tun }, + { "tun_ns", check_tun_netns }, + { "userns", check_userns }, + { "fdinfo_lock", check_fdinfo_lock }, + { "seccomp_suspend", check_ptrace_suspend_seccomp }, + { "seccomp_filters", check_ptrace_dump_seccomp_filters }, + { "loginuid", check_loginuid }, + { "cgroupns", check_cgroupns }, + { "autofs", check_autofs }, + { "tcp_half_closed", check_tcp_halt_closed }, + { "compat_cr", check_compat_cr }, + { "uffd", check_uffd }, + { "uffd-noncoop", check_uffd_noncoop }, + { "can_map_vdso", check_can_map_vdso}, + { "sk_ns", check_sk_netns }, + { "sk_unix_file", check_sk_unix_file }, + { "net_diag_raw", check_net_diag_raw }, + { "nsid", check_nsid }, + { "link_nsid", check_link_nsid}, + { "kcmp_epoll", check_kcmp_epoll}, + { "external_net_ns", check_external_net_ns}, + { NULL, NULL }, +}; + +void pr_check_features(const char *offset, const char *sep, int width) +{ + struct feature_list *fl; + int pos = width + 1; + int sep_len = strlen(sep); + int offset_len = strlen(offset); + + for (fl = feature_list; fl->name; fl++) { + int len = strlen(fl->name); + + if (pos + len + sep_len > width) { + pr_msg("\n%s", offset); + pos = offset_len; + } + pr_msg("%s", fl->name); + pos += len; + if 
((fl + 1)->name) { // not the last item + pr_msg("%s", sep); + pos += sep_len; + } + } + pr_msg("\n"); +} + +int check_add_feature(char *feat) +{ + struct feature_list *fl; + + for (fl = feature_list; fl->name; fl++) { + if (!strcmp(feat, fl->name)) { + chk_feature = fl->func; + return 0; + } + } + pr_err("Unknown feature %s\n", feat); + return -1; +} + +static char *feature_name(int (*func)()) +{ + struct feature_list *fl; + + for (fl = feature_list; fl->func; fl++) { + if (fl->func == func) + return fl->name; + } + return NULL; +} diff --git a/CRIU_code/criu/cr-dedup.c b/CRIU_code/criu/cr-dedup.c new file mode 100644 index 0000000..71b7a9c --- /dev/null +++ b/CRIU_code/criu/cr-dedup.c @@ -0,0 +1,106 @@ +#include +#include +#include +#include + +#include "int.h" +#include "crtools.h" +#include "pagemap.h" +#include "restorer.h" + +static int cr_dedup_one_pagemap(unsigned long img_id, int flags); + +int cr_dedup(void) +{ + int close_ret, matched, ret = 0; + unsigned long img_id; + DIR * dirp; + struct dirent *ent; + + dirp = opendir(CR_PARENT_LINK); + if (dirp == NULL) { + pr_perror("Can't enter previous snapshot folder, error=%d", errno); + ret = -1; + goto err; + } + + while (1) { + errno = 0; + ent = readdir(dirp); + if (ent == NULL) { + if (errno) { + pr_perror("Failed readdir, error=%d", errno); + ret = -1; + goto err; + } + break; + } + + matched = sscanf(ent->d_name, "pagemap-%lu.img", &img_id); /* sscanf() may return EOF; keep it out of ret */ + if (matched == 1) { + pr_info("pid=%lu\n", img_id); + ret = cr_dedup_one_pagemap(img_id, PR_TASK); + if (ret < 0) + break; + } + + matched = sscanf(ent->d_name, "pagemap-shmem-%lu.img", &img_id); + if (matched == 1) { + pr_info("shmid=%lu\n", img_id); + ret = cr_dedup_one_pagemap(img_id, PR_SHMEM); + if (ret < 0) + break; + } + } + +err: + if (dirp) { + close_ret = closedir(dirp); + if (close_ret == -1) + return close_ret; + } + + if (ret < 0) + return ret; + + pr_info("Deduplicated\n"); + return 0; +} + +static int cr_dedup_one_pagemap(unsigned long img_id, int flags) +{ + int ret; + 
struct page_read pr; + struct page_read * prp; + + flags |= PR_MOD; + ret = open_page_read(img_id, &pr, flags); + if (ret <= 0) + return -1; + + prp = pr.parent; + if (!prp) + goto exit; + + while (1) { + ret = pr.advance(&pr); + if (ret <= 0) + goto exit; + + pr_debug("dedup iovec base=%"PRIx64", len=%lu\n", + pr.pe->vaddr, pagemap_len(pr.pe)); + if (!pagemap_in_parent(pr.pe)) { + ret = dedup_one_iovec(prp, pr.pe->vaddr, + pagemap_len(pr.pe)); + if (ret) + goto exit; + } + } +exit: + pr.close(&pr); + + if (ret < 0) + return ret; + + return 0; +} diff --git a/CRIU_code/criu/cr-dump.c b/CRIU_code/criu/cr-dump.c new file mode 100644 index 0000000..7f2e5ed --- /dev/null +++ b/CRIU_code/criu/cr-dump.c @@ -0,0 +1,1942 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + + +#include +#include + +#include "types.h" +#include "protobuf.h" +#include "images/fdinfo.pb-c.h" +#include "images/fs.pb-c.h" +#include "images/mm.pb-c.h" +#include "images/creds.pb-c.h" +#include "images/core.pb-c.h" +#include "images/file-lock.pb-c.h" +#include "images/rlimit.pb-c.h" +#include "images/siginfo.pb-c.h" + +#include "common/list.h" +#include "imgset.h" +#include "file-ids.h" +#include "kcmp-ids.h" +#include "common/compiler.h" +#include "crtools.h" +#include "cr_options.h" +#include "servicefd.h" +#include "string.h" +#include "ptrace-compat.h" +#include "util.h" +#include "namespaces.h" +#include "image.h" +#include "proc_parse.h" +#include "parasite.h" +#include "parasite-syscall.h" +#include "files.h" +#include "files-reg.h" +#include "shmem.h" +#include "sk-inet.h" +#include "pstree.h" +#include "mount.h" +#include "tty.h" +#include "net.h" +#include "sk-packet.h" +#include "cpu.h" +#include "elf.h" +#include "cgroup.h" +#include "cgroup-props.h" +#include "file-lock.h" +#include "page-xfer.h" +#include "kerndat.h" +#include "stats.h" +#include "mem.h" +#include "page-pipe.h" +#include 
"posix-timer.h" +#include "vdso.h" +#include "vma.h" +#include "cr-service.h" +#include "plugin.h" +#include "irmap.h" +#include "sysfs_parse.h" +#include "action-scripts.h" +#include "aio.h" +#include "lsm.h" +#include "seccomp.h" +#include "seize.h" +#include "fault-injection.h" +#include "dump.h" +#include "eventpoll.h" +#include "img-remote.h" + +/* + * Architectures can overwrite this function to restore register sets that + * are not covered by ptrace_set/get_regs(). + * + * with_threads = false: Only the register sets of the tasks are restored + * with_threads = true : The register sets of the tasks with all their threads + * are restored + */ +int __attribute__((weak)) arch_set_thread_regs(struct pstree_item *item, + bool with_threads) +{ + return 0; +} + +#define PERSONALITY_LENGTH 9 +static char loc_buf[PERSONALITY_LENGTH]; + +void free_mappings(struct vm_area_list *vma_area_list) +{ + struct vma_area *vma_area, *p; + + list_for_each_entry_safe(vma_area, p, &vma_area_list->h, list) { + if (!vma_area->file_borrowed) + free(vma_area->vmst); + free(vma_area); + } + + INIT_LIST_HEAD(&vma_area_list->h); + vma_area_list->nr = 0; +} + +int collect_mappings(pid_t pid, struct vm_area_list *vma_area_list, + dump_filemap_t dump_file) +{ + int ret = -1; + + pr_info("\n"); + pr_info("Collecting mappings (pid: %d)\n", pid); + pr_info("----------------------------------------\n"); + + ret = parse_smaps(pid, vma_area_list, dump_file); + if (ret < 0) + goto err; + + pr_info("Collected, longest area occupies %lu pages\n", + vma_area_list->priv_longest); + pr_info_vma_list(&vma_area_list->h); + + pr_info("----------------------------------------\n"); +err: + return ret; +} + +static int dump_sched_info(int pid, ThreadCoreEntry *tc) +{ + int ret; + struct sched_param sp; + + BUILD_BUG_ON(SCHED_OTHER != 0); /* default in proto message */ + + /* + * In musl-libc sched_getscheduler and sched_getparam don't call + * syscalls and instead the always return -ENOSYS + */ + ret = 
syscall(__NR_sched_getscheduler, pid); + if (ret < 0) { + pr_perror("Can't get sched policy for %d", pid); + return -1; + } + + pr_info("%d has %d sched policy\n", pid, ret); + tc->has_sched_policy = true; + tc->sched_policy = ret; + + if ((ret == SCHED_RR) || (ret == SCHED_FIFO)) { + ret = syscall(__NR_sched_getparam, pid, &sp); + if (ret < 0) { + pr_perror("Can't get sched param for %d", pid); + return -1; + } + + pr_info("\tdumping %d prio for %d\n", sp.sched_priority, pid); + tc->has_sched_prio = true; + tc->sched_prio = sp.sched_priority; + } + + /* + * The nice is ignored for RT sched policies, but is stored + * in kernel. Thus we have to take it with us in the image. + */ + + errno = 0; + ret = getpriority(PRIO_PROCESS, pid); + if (ret == -1 && errno) { + pr_perror("Can't get nice for %d ret %d", pid, ret); + return -1; + } + + pr_info("\tdumping %d nice for %d\n", ret, pid); + tc->has_sched_nice = true; + tc->sched_nice = ret; + + return 0; +} + +struct cr_imgset *glob_imgset; + +static int collect_fds(pid_t pid, struct parasite_drain_fd **dfds) +{ + struct dirent *de; + DIR *fd_dir; + int size = 0, ret = -1; + int n; + + pr_info("\n"); + pr_info("Collecting fds (pid: %d)\n", pid); + pr_info("----------------------------------------\n"); + + fd_dir = opendir_proc(pid, "fd"); + if (!fd_dir) + return -1; + + n = 0; + while ((de = readdir(fd_dir))) { + if (dir_dots(de)) + continue; + + if (sizeof(struct parasite_drain_fd) + sizeof(int) * (n + 1) > size) { + struct parasite_drain_fd *t; + + size += PAGE_SIZE; + t = xrealloc(*dfds, size); + if (!t) + goto out; /* fd_dir must still be closed */ + *dfds = t; + } + + (*dfds)->fds[n++] = atoi(de->d_name); + } + + (*dfds)->nr_fds = n; + pr_info("Found %d file descriptors\n", n); + pr_info("----------------------------------------\n"); + ret = 0; +out: + closedir(fd_dir); + return ret; +} + +static int fill_fd_params_special(int fd, struct fd_parms *p) +{ + *p = FD_PARMS_INIT; + + if (fstat(fd, &p->stat) < 0) { + pr_perror("Can't fstat exe link"); + return -1; + } + + if 
(get_fd_mntid(fd, &p->mnt_id)) + return -1; + + return 0; +} + +static long get_fs_type(int lfd) +{ + struct statfs fst; + + if (fstatfs(lfd, &fst)) { + pr_perror("Unable to statfs fd %d", lfd); + return -1; + } + return fst.f_type; +} + +static int dump_one_reg_file_cond(int lfd, u32 *id, struct fd_parms *parms) +{ + if (fd_id_generate_special(parms, id)) { + parms->fs_type = get_fs_type(lfd); + if (parms->fs_type < 0) + return -1; + return dump_one_reg_file(lfd, *id, parms); + } + return 0; +} + +static int dump_task_exe_link(pid_t pid, MmEntry *mm) +{ + struct fd_parms params; + int fd, ret = 0; + + fd = open_proc_path(pid, "exe"); + if (fd < 0) + return -1; + + if (fill_fd_params_special(fd, &params)) + ret = -1; /* fall through so that fd is closed */ + else + ret = dump_one_reg_file_cond(fd, &mm->exe_file_id, &params); + + close(fd); + return ret; +} + +static int dump_task_fs(pid_t pid, struct parasite_dump_misc *misc, struct cr_imgset *imgset) +{ + struct fd_parms p; + FsEntry fe = FS_ENTRY__INIT; + int fd, ret; + + fe.has_umask = true; + fe.umask = misc->umask; + + fd = open_proc_path(pid, "cwd"); + if (fd < 0) + return -1; + + ret = fill_fd_params_special(fd, &p); + if (!ret) + ret = dump_one_reg_file_cond(fd, &fe.cwd_id, &p); + + close(fd); /* closed on the error paths too */ + + if (ret < 0) + return ret; + + fd = open_proc_path(pid, "root"); + if (fd < 0) + return -1; + + ret = fill_fd_params_special(fd, &p); + if (!ret) + ret = dump_one_reg_file_cond(fd, &fe.root_id, &p); + + close(fd); /* closed on the error paths too */ + + if (ret < 0) + return ret; + + pr_info("Dumping task cwd id %#x root id %#x\n", + fe.cwd_id, fe.root_id); + + return pb_write_one(img_from_set(imgset, CR_FD_FS), &fe, PB_FS); +} + +static inline rlim_t encode_rlim(rlim_t val) +{ + return val == RLIM_INFINITY ? 
-1 : val; +} + +static int dump_task_rlimits(int pid, TaskRlimitsEntry *rls) +{ + int res; + + for (res = 0; res < rls->n_rlimits; res++) { + struct rlimit64 lim; + + if (syscall(__NR_prlimit64, pid, res, NULL, &lim)) { + pr_perror("Can't get rlimit %d", res); + return -1; + } + + rls->rlimits[res]->cur = encode_rlim(lim.rlim_cur); + rls->rlimits[res]->max = encode_rlim(lim.rlim_max); + } + + return 0; +} + +static int dump_pid_misc(pid_t pid, TaskCoreEntry *tc) +{ + int ret; + + if (kdat.luid != LUID_NONE) { + pr_info("dumping /proc/%d/loginuid\n", pid); + + tc->has_loginuid = true; + tc->loginuid = parse_pid_loginuid(pid, &ret, false); + tc->loginuid = userns_uid(tc->loginuid); + /* + * loginuid dumping is critical, as if not correctly + * restored, you may lose the ability to login via SSH to CT + */ + if (ret < 0) + return ret; + } else { + tc->has_loginuid = false; + } + + pr_info("dumping /proc/%d/oom_score_adj\n", pid); + + tc->oom_score_adj = parse_pid_oom_score_adj(pid, &ret); + /* + * oom_score_adj dumping is not very critical, as it will affect + * on victim in OOM situation and one will find dumping error in log + */ + if (ret < 0) + tc->has_oom_score_adj = false; + else + tc->has_oom_score_adj = true; + + return 0; +} + +static int dump_filemap(struct vma_area *vma_area, int fd) +{ + struct fd_parms p = FD_PARMS_INIT; + VmaEntry *vma = vma_area->e; + int ret = 0; + u32 id; + + BUG_ON(!vma_area->vmst); + p.stat = *vma_area->vmst; + p.mnt_id = vma_area->mnt_id; + + /* + * AUFS support to compensate for the kernel bug + * exposing branch pathnames in map_files. + * + * If the link found in vma_get_mapfile() pointed + * inside a branch, we should use the pathname + * from root that was saved in vma_area->aufs_rpath. 
+ */ + if (vma_area->aufs_rpath) { + struct fd_link aufs_link; + + strlcpy(aufs_link.name, vma_area->aufs_rpath, + sizeof(aufs_link.name)); + aufs_link.len = strlen(aufs_link.name); + p.link = &aufs_link; + } + + /* Flags will be set during restore in open_filmap() */ + + ret = dump_one_reg_file_cond(fd, &id, &p); + + vma->shmid = id; + return ret; +} + +static int check_sysvipc_map_dump(pid_t pid, VmaEntry *vma) +{ + if (root_ns_mask & CLONE_NEWIPC) + return 0; + + pr_err("Task %d with SysVIPC shmem map @%"PRIx64" doesn't live in IPC ns\n", + pid, vma->start); + return -1; +} + +static int get_task_auxv(pid_t pid, MmEntry *mm) +{ + auxv_t mm_saved_auxv[AT_VECTOR_SIZE]; + int fd, i, ret; + + pr_info("Obtaining task auvx ...\n"); + + fd = open_proc(pid, "auxv"); + if (fd < 0) + return -1; + + ret = read(fd, mm_saved_auxv, sizeof(mm_saved_auxv)); + if (ret < 0) { + ret = -1; + pr_perror("Error reading %d's auxv", pid); + goto err; + } else { + mm->n_mm_saved_auxv = ret / sizeof(auxv_t); + for (i = 0; i < mm->n_mm_saved_auxv; i++) + mm->mm_saved_auxv[i] = (u64)mm_saved_auxv[i]; + } + + ret = 0; +err: + close_safe(&fd); + return ret; +} + +static int dump_task_mm(pid_t pid, const struct proc_pid_stat *stat, + const struct parasite_dump_misc *misc, + const struct vm_area_list *vma_area_list, + const struct cr_imgset *imgset) +{ + MmEntry mme = MM_ENTRY__INIT; + struct vma_area *vma_area; + int ret = -1, i = 0; + + pr_info("\n"); + pr_info("Dumping mm (pid: %d)\n", pid); + pr_info("----------------------------------------\n"); + + mme.n_vmas = vma_area_list->nr; + mme.vmas = xmalloc(mme.n_vmas * sizeof(VmaEntry *)); + if (!mme.vmas) + return -1; + + list_for_each_entry(vma_area, &vma_area_list->h, list) { + VmaEntry *vma = vma_area->e; + + pr_info_vma(vma_area); + + if (!vma_entry_is(vma, VMA_AREA_REGULAR)) + ret = 0; + else if (vma_entry_is(vma, VMA_AREA_SYSVIPC)) + ret = check_sysvipc_map_dump(pid, vma); + else if (vma_entry_is(vma, VMA_AREA_SOCKET)) + ret = 
dump_socket_map(vma_area); + else + ret = 0; + if (ret) + goto err; + + mme.vmas[i++] = vma; + + if (vma_entry_is(vma, VMA_AREA_AIORING)) { + ret = dump_aio_ring(&mme, vma_area); + if (ret) + goto err; + } + } + + mme.mm_start_code = stat->start_code; + mme.mm_end_code = stat->end_code; + mme.mm_start_data = stat->start_data; + mme.mm_end_data = stat->end_data; + mme.mm_start_stack = stat->start_stack; + mme.mm_start_brk = stat->start_brk; + + mme.mm_arg_start = stat->arg_start; + mme.mm_arg_end = stat->arg_end; + mme.mm_env_start = stat->env_start; + mme.mm_env_end = stat->env_end; + + mme.mm_brk = misc->brk; + + mme.dumpable = misc->dumpable; + mme.has_dumpable = true; + + mme.thp_disabled = misc->thp_disabled; + mme.has_thp_disabled = true; + + mme.n_mm_saved_auxv = AT_VECTOR_SIZE; + mme.mm_saved_auxv = xmalloc(pb_repeated_size(&mme, mm_saved_auxv)); + ret = -1; /* the VMA loop above leaves ret == 0 */ + if (!mme.mm_saved_auxv) + goto err; + + ret = get_task_auxv(pid, &mme); + if (!ret) + ret = dump_task_exe_link(pid, &mme); + if (!ret) + ret = pb_write_one(img_from_set(imgset, CR_FD_MM), &mme, PB_MM); + + xfree(mme.mm_saved_auxv); + free_aios(&mme); +err: + xfree(mme.vmas); + return ret; +} + +static int get_task_futex_robust_list(pid_t pid, ThreadCoreEntry *info) +{ + struct robust_list_head *head = NULL; + size_t len = 0; + int ret; + + ret = syscall(SYS_get_robust_list, pid, &head, &len); + if (ret < 0 && errno == ENOSYS) { + /* + * If the kernel says get_robust_list is not implemented, then + * check whether set_robust_list is also not implemented, in + * that case we can assume it is empty, since set_robust_list + * is the only way to populate it. This case is possible when + * "futex_cmpxchg_enabled" is unset in the kernel. + * + * The following system call should always fail, even if it is + * implemented, in which case it will return -EINVAL because + * len should be greater than zero. 
+ */ + ret = syscall(SYS_set_robust_list, NULL, 0); + if (ret == 0 || (ret < 0 && errno != ENOSYS)) + goto err; + + head = NULL; + len = 0; + } else if (ret) { + goto err; + } + + info->futex_rla = encode_pointer(head); + info->futex_rla_len = (u32)len; + + return 0; + +err: + pr_err("Failed obtaining futex robust list on %d\n", pid); + return -1; +} + +static int get_task_personality(pid_t pid, u32 *personality) +{ + int fd, ret = -1; + + pr_info("Obtaining personality ... \n"); + + fd = open_proc(pid, "personality"); + if (fd < 0) + goto err; + + ret = read(fd, loc_buf, sizeof(loc_buf) - 1); + close(fd); + + if (ret >= 0) { + loc_buf[ret] = '\0'; + *personality = atoi(loc_buf); /* NOTE(review): proc(5) prints this value in hex, atoi() parses decimal - confirm against the restore side */ + } +err: + return ret; +} + +static DECLARE_KCMP_TREE(vm_tree, KCMP_VM); +static DECLARE_KCMP_TREE(fs_tree, KCMP_FS); +static DECLARE_KCMP_TREE(files_tree, KCMP_FILES); +static DECLARE_KCMP_TREE(sighand_tree, KCMP_SIGHAND); + +static int dump_task_kobj_ids(struct pstree_item *item) +{ + int new; + struct kid_elem elem; + int pid = item->pid->real; + TaskKobjIdsEntry *ids = item->ids; + + elem.pid = pid; + elem.idx = 0; /* really 0 for all */ + elem.genid = 0; /* FIXME optimize */ + + new = 0; + ids->vm_id = kid_generate_gen(&vm_tree, &elem, &new); + if (!ids->vm_id || !new) { + pr_err("Can't make VM id for %d\n", pid); + return -1; + } + + new = 0; + ids->fs_id = kid_generate_gen(&fs_tree, &elem, &new); + if (!ids->fs_id || !new) { + pr_err("Can't make FS id for %d\n", pid); + return -1; + } + + new = 0; + ids->files_id = kid_generate_gen(&files_tree, &elem, &new); + if (!ids->files_id || (!new && !shared_fdtable(item))) { + pr_err("Can't make FILES id for %d\n", pid); + return -1; + } + + new = 0; + ids->sighand_id = kid_generate_gen(&sighand_tree, &elem, &new); + if (!ids->sighand_id || !new) { + pr_err("Can't make SIGHAND id for %d\n", pid); + return -1; + } + + return 0; +} + +int get_task_ids(struct pstree_item *item) +{ + int ret; + + item->ids = xmalloc(sizeof(*item->ids)); + if 
(!item->ids) + goto err; + + task_kobj_ids_entry__init(item->ids); + + if (item->pid->state != TASK_DEAD) { + ret = dump_task_kobj_ids(item); + if (ret) + goto err_free; + + ret = dump_task_ns_ids(item); + if (ret) + goto err_free; + } + + return 0; + +err_free: + xfree(item->ids); + item->ids = NULL; +err: + return -1; +} + +static int dump_task_ids(struct pstree_item *item, const struct cr_imgset *cr_imgset) +{ + return pb_write_one(img_from_set(cr_imgset, CR_FD_IDS), item->ids, PB_IDS); +} + +int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread *ti) + +{ + int ret; + ThreadCoreEntry *tc = core->thread_core; + + ret = collect_lsm_profile(pid, tc->creds); + if (!ret) { + /* + * XXX: It's possible to set two: 32-bit and 64-bit + * futex list's heads. That makes about no sense, but + * it's possible. Until we meet such application, dump + * only one: native or compat futex's list pointer. + */ + if (!core_is_compat(core)) + ret = get_task_futex_robust_list(pid, tc); + else + ret = get_task_futex_robust_list_compat(pid, tc); + } + if (!ret) + ret = dump_sched_info(pid, tc); + if (!ret) { + core_put_tls(core, ti->tls); + CORE_THREAD_ARCH_INFO(core)->clear_tid_addr = + encode_pointer(ti->tid_addr); + BUG_ON(!tc->sas); + copy_sas(tc->sas, &ti->sas); + if (ti->pdeath_sig) { + tc->has_pdeath_sig = true; + tc->pdeath_sig = ti->pdeath_sig; + } + tc->comm = xstrdup(ti->comm); + if (tc->comm == NULL) + return -1; + } + if (!ret) + ret = seccomp_dump_thread(pid, tc); + + return ret; +} + +static int dump_task_core_all(struct parasite_ctl *ctl, + struct pstree_item *item, + const struct proc_pid_stat *stat, + const struct cr_imgset *cr_imgset) +{ + struct cr_img *img; + CoreEntry *core = item->core[0]; + pid_t pid = item->pid->real; + int ret = -1; + struct parasite_dump_cgroup_args cgroup_args, *info = NULL; + + BUILD_BUG_ON(sizeof(cgroup_args) < PARASITE_ARG_SIZE_MIN); + + pr_info("\n"); + pr_info("Dumping core (pid: %d)\n", pid); + 
pr_info("----------------------------------------\n"); + + ret = get_task_personality(pid, &core->tc->personality); + if (ret < 0) + goto err; + + strlcpy((char *)core->tc->comm, stat->comm, TASK_COMM_LEN); + core->tc->flags = stat->flags; + core->tc->task_state = item->pid->state; + core->tc->exit_code = 0; + + ret = parasite_dump_thread_leader_seized(ctl, pid, core); + if (ret) + goto err; + + ret = dump_pid_misc(pid, core->tc); + if (ret) + goto err; + + ret = dump_task_rlimits(pid, core->tc->rlimits); + if (ret) + goto err; + + /* For now, we only need to dump the root task's cgroup ns, because we + * know all the tasks are in the same cgroup namespace because we don't + * allow nesting. + */ + if (item->ids->has_cgroup_ns_id && !item->parent) { + info = &cgroup_args; + ret = parasite_dump_cgroup(ctl, &cgroup_args); + if (ret) + goto err; + } + + core->tc->has_cg_set = true; + ret = dump_task_cgroup(item, &core->tc->cg_set, info); + if (ret) + goto err; + + img = img_from_set(cr_imgset, CR_FD_CORE); + ret = pb_write_one(img, core, PB_CORE); + if (ret < 0) + goto err; + +err: + pr_info("----------------------------------------\n"); + + return ret; +} + +static int collect_pstree_ids_predump(void) +{ + struct pstree_item *item; + struct pid pid; + struct { + struct pstree_item i; + struct dmp_info d; + } crt = { .i.pid = &pid, }; + + /* + * This thing is normally done inside + * write_img_inventory(). 
+ */ + + crt.i.pid->state = TASK_ALIVE; + crt.i.pid->real = getpid(); + + if (predump_task_ns_ids(&crt.i)) + return -1; + + for_each_pstree_item(item) { + if (item->pid->state == TASK_DEAD) + continue; + + if (predump_task_ns_ids(item)) + return -1; + } + + return 0; +} + +int collect_pstree_ids(void) +{ + struct pstree_item *item; + + for_each_pstree_item(item) + if (get_task_ids(item)) + return -1; + + return 0; +} + +static int collect_file_locks(void) +{ + return parse_file_locks(); +} + +static int dump_task_thread(struct parasite_ctl *parasite_ctl, + const struct pstree_item *item, int id) +{ + struct parasite_thread_ctl *tctl = dmpi(item)->thread_ctls[id]; + struct pid *tid = &item->threads[id]; + CoreEntry *core = item->core[id]; + pid_t pid = tid->real; + int ret = -1; + struct cr_img *img; + + pr_info("\n"); + pr_info("Dumping core for thread (pid: %d)\n", pid); + pr_info("----------------------------------------\n"); + + ret = parasite_dump_thread_seized(tctl, parasite_ctl, id, tid, core); + if (ret) { + pr_err("Can't dump thread for pid %d\n", pid); + goto err; + } + pstree_insert_pid(tid); + + img = open_image(CR_FD_CORE, O_DUMP, tid->ns[0].virt); + if (!img) + goto err; + + ret = pb_write_one(img, core, PB_CORE); + + close_image(img); +err: + pr_info("----------------------------------------\n"); + return ret; +} + +static int dump_one_zombie(const struct pstree_item *item, + const struct proc_pid_stat *pps) +{ + CoreEntry *core; + int ret = -1; + struct cr_img *img; + + core = core_entry_alloc(0, 1); + if (!core) + return -1; + + strlcpy((char *)core->tc->comm, pps->comm, TASK_COMM_LEN); + core->tc->task_state = TASK_DEAD; + core->tc->exit_code = pps->exit_code; + + img = open_image(CR_FD_CORE, O_DUMP, vpid(item)); + if (!img) + goto err; + + ret = pb_write_one(img, core, PB_CORE); + close_image(img); +err: + core_entry_free(core); + return ret; +} + +#define SI_BATCH 32 + +static int dump_signal_queue(pid_t tid, SignalQueueEntry **sqe, bool group) 
+{
+	/*
+	 * Peek the pending signals of @tid (the shared queue when @group
+	 * is set, the private one otherwise) in batches of SI_BATCH and
+	 * collect them into a freshly allocated queue handed back via @sqe.
+	 *
+	 * Returns 0 on success, including the case where ptrace fails
+	 * with EIO (logged as PTRACE_PEEKSIGINFO being unsupported), and
+	 * a negative value on error. Except when the queue itself cannot
+	 * be allocated, *@sqe is always set, and n_signals counts only
+	 * fully initialized entries.
+	 */
+	struct ptrace_peeksiginfo_args arg;
+	int ret;
+	SignalQueueEntry *queue = NULL;
+
+	pr_debug("Dump %s signals of %d\n", group ? "shared" : "private", tid);
+
+	arg.nr = SI_BATCH;
+	arg.flags = 0;
+	if (group)
+		arg.flags |= PTRACE_PEEKSIGINFO_SHARED;
+	arg.off = 0;
+
+	queue = xmalloc(sizeof(*queue));
+	if (!queue)
+		return -1;
+
+	signal_queue_entry__init(queue);
+
+	while (1) {
+		int nr, si_pos;
+		siginfo_t *si, *batch;
+		void *grown;
+
+		batch = xmalloc(SI_BATCH * sizeof(*batch));
+		if (!batch) {
+			ret = -1;
+			break;
+		}
+
+		nr = ret = ptrace(PTRACE_PEEKSIGINFO, tid, &arg, batch);
+		if (ret == 0) {
+			/* Finished; nothing references this buffer */
+			xfree(batch);
+			break;
+		}
+
+		if (ret < 0) {
+			/* Report first: errno must still be the ptrace one */
+			if (errno == EIO) {
+				pr_warn("ptrace doesn't support PTRACE_PEEKSIGINFO\n");
+				ret = 0;
+			} else
+				pr_perror("ptrace");
+
+			xfree(batch);
+			break;
+		}
+
+		/*
+		 * Grow through a temporary so that a failed xrealloc leaves
+		 * the old array and count intact (assumes xrealloc has
+		 * realloc semantics on failure -- TODO confirm).
+		 */
+		grown = xrealloc(queue->signals,
+				 sizeof(*queue->signals) * (queue->n_signals + nr));
+		if (!grown) {
+			xfree(batch);
+			ret = -1;
+			break;
+		}
+		queue->signals = grown;
+		queue->n_signals += nr;
+
+		si = batch;
+		for (si_pos = queue->n_signals - nr;
+				si_pos < queue->n_signals; si_pos++) {
+			SiginfoEntry *se;
+
+			se = xmalloc(sizeof(*se));
+			if (!se) {
+				/* Count only the slots that were filled in */
+				queue->n_signals = si_pos;
+				/* The batch is unreferenced if nothing used it */
+				if (si == batch)
+					xfree(batch);
+				ret = -1;
+				break;
+			}
+
+			siginfo_entry__init(se);
+			se->siginfo.len = sizeof(siginfo_t);
+			se->siginfo.data = (void *)si++; /* XXX we don't free cores, but when
+							  * we will, this would cause problems
+							  */
+			queue->signals[si_pos] = se;
+		}
+
+		if (ret < 0)
+			break;
+
+		arg.off += nr;
+	}
+
+	*sqe = queue;
+	return ret;
+}
+
+/*
+ * Dump the private signal queue of every thread of @item and then
+ * the shared queue of the task @pid. Returns 0 on success, -1 on
+ * the first failure.
+ */
+static int dump_task_signals(pid_t pid, struct pstree_item *item)
+{
+	int i, ret;
+
+	/* Dump private signals for each thread */
+	for (i = 0; i < item->nr_threads; i++) {
+		ret = dump_signal_queue(item->threads[i].real, &item->core[i]->thread_core->signals_p, false);
+		if (ret) {
+			pr_err("Can't dump private signals for thread %d\n", item->threads[i].real);
+			return -1;
+		}
+	}
+
+	/* Dump shared signals */
+	ret = dump_signal_queue(pid, &item->core[0]->tc->signals_s, true);
+	if (ret) {
+		pr_err("Can't dump shared signals (pid: %d)\n", pid);
+		return -1;
+	}
+
+	return 0;
+}
+
+static
struct proc_pid_stat pps_buf;
+
+static int dump_task_threads(struct parasite_ctl *parasite_ctl,
+			     const struct pstree_item *item)
+{
+	int i;
+
+	for (i = 0; i < item->nr_threads; i++) {
+		/* Leader is already dumped; only record its virtual pid */
+		if (item->pid->real == item->threads[i].real) {
+			item->threads[i].ns[0].virt = vpid(item);
+			continue;
+		}
+		if (dump_task_thread(parasite_ctl, item, i))
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * All this routine does is read the pids of dead
+ * tasks in item's children list from item's ns proc.
+ *
+ * It does *not* find which real pid corresponds to
+ * which virtual one, but that is not required -- all we
+ * need to dump for a zombie can be found in the same
+ * ns proc.
+ */
+
+static int fill_zombies_pids(struct pstree_item *item)
+{
+	struct pstree_item *child;
+	int i, nr;
+	pid_t *ch;
+
+	/*
+	 * Pids read here are virtual -- caller has set up
+	 * the proc of target pid namespace.
+	 */
+	if (parse_children(vpid(item), &ch, &nr) < 0)
+		return -1;
+
+	/*
+	 * Step 1 -- filter the pids of alive tasks out of ch
+	 */
+	list_for_each_entry(child, &item->children, sibling) {
+		if (vpid(child) < 0)
+			continue;
+		for (i = 0; i < nr; i++) {
+			if (ch[i] == vpid(child)) {
+				ch[i] = -1;
+				break;
+			}
+		}
+	}
+
+	/*
+	 * Step 2 -- assign remaining pids from ch on
+	 * children's items in arbitrary order. The caller
+	 * will then re-read everything needed to dump
+	 * zombies using newly obtained virtual pids.
+ */ + i = 0; + list_for_each_entry(child, &item->children, sibling) { + if (vpid(child) > 0) + continue; + for (; i < nr; i++) { + if (ch[i] < 0) + continue; + child->pid->ns[0].virt = ch[i]; + ch[i] = -1; + break; + } + BUG_ON(i == nr); + } + + xfree(ch); + + return 0; +} + +static int dump_zombies(void) +{ + struct pstree_item *item; + int ret = -1; + int pidns = root_ns_mask & CLONE_NEWPID; + + if (pidns && set_proc_fd(get_service_fd(CR_PROC_FD_OFF))) + return -1; + + /* + * We dump zombies separately because for pid-ns case + * we'd have to resolve their pids w/o parasite via + * target ns' proc. + */ + + for_each_pstree_item(item) { + if (item->pid->state != TASK_DEAD) + continue; + + if (vpid(item) < 0) { + if (!pidns) + item->pid->ns[0].virt = item->pid->real; + else if (root_item == item) { + pr_err("A root task is dead\n"); + goto err; + } else if (fill_zombies_pids(item->parent)) + goto err; + } + + pr_info("Obtaining zombie stat ... \n"); + if (parse_pid_stat(vpid(item), &pps_buf) < 0) + goto err; + + item->sid = pps_buf.sid; + item->pgid = pps_buf.pgid; + + BUG_ON(!list_empty(&item->children)); + if (dump_one_zombie(item, &pps_buf) < 0) + goto err; + } + + ret = 0; +err: + if (pidns) + close_proc(); + + return ret; +} + +static int pre_dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) +{ + pid_t pid = item->pid->real; + struct vm_area_list vmas; + struct parasite_ctl *parasite_ctl; + int ret = -1; + struct parasite_dump_misc misc; + struct mem_dump_ctl mdc; + + INIT_LIST_HEAD(&vmas.h); + vmas.nr = 0; + + pr_info("========================================\n"); + pr_info("Pre-dumping task (pid: %d)\n", pid); + pr_info("========================================\n"); + + if (item->pid->state == TASK_STOPPED) { + pr_warn("Stopped tasks are not supported\n"); + return 0; + } + + if (item->pid->state == TASK_DEAD) + return 0; + + ret = collect_mappings(pid, &vmas, NULL); + if (ret) { + pr_err("Collect mappings (pid: %d) failed with %d\n", pid, 
ret); + goto err; + } + + ret = -1; + parasite_ctl = parasite_infect_seized(pid, item, &vmas); + if (!parasite_ctl) { + pr_err("Can't infect (pid: %d) with parasite\n", pid); + goto err_free; + } + + ret = parasite_fixup_vdso(parasite_ctl, pid, &vmas); + if (ret) { + pr_err("Can't fixup vdso VMAs (pid: %d)\n", pid); + goto err_cure; + } + + ret = parasite_dump_misc_seized(parasite_ctl, &misc); + if (ret) { + pr_err("Can't dump misc (pid: %d)\n", pid); + goto err_cure; + } + + ret = predump_task_files(pid); + if (ret) { + pr_err("Pre-dumping files failed (pid: %d)\n", pid); + goto err_cure; + } + + item->pid->ns[0].virt = misc.pid; + + mdc.pre_dump = true; + mdc.lazy = false; + mdc.stat = NULL; + mdc.parent_ie = parent_ie; + + ret = parasite_dump_pages_seized(item, &vmas, &mdc, parasite_ctl); + if (ret) + goto err_cure; + + if (compel_cure_remote(parasite_ctl)) + pr_err("Can't cure (pid: %d) from parasite\n", pid); +err_free: + free_mappings(&vmas); +err: + return ret; + +err_cure: + if (compel_cure(parasite_ctl)) + pr_err("Can't cure (pid: %d) from parasite\n", pid); + goto err_free; +} + +static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) +{ + pid_t pid = item->pid->real; + struct vm_area_list vmas; + struct parasite_ctl *parasite_ctl; + int ret, exit_code = -1; + struct parasite_dump_misc misc; + struct cr_imgset *cr_imgset = NULL; + struct parasite_drain_fd *dfds = NULL; + struct proc_posix_timers_stat proc_args; + struct mem_dump_ctl mdc; + + INIT_LIST_HEAD(&vmas.h); + vmas.nr = 0; + + pr_info("========================================\n"); + pr_info("Dumping task (pid: %d)\n", pid); + pr_info("========================================\n"); + + if (item->pid->state == TASK_DEAD) + /* + * zombies are dumped separately in dump_zombies() + */ + return 0; + + pr_info("Obtaining task stat ... 
\n"); + ret = parse_pid_stat(pid, &pps_buf); + if (ret < 0) + goto err; + + ret = collect_mappings(pid, &vmas, dump_filemap); + if (ret) { + pr_err("Collect mappings (pid: %d) failed with %d\n", pid, ret); + goto err; + } + + if (!shared_fdtable(item)) { + dfds = xmalloc(sizeof(*dfds)); + if (!dfds) + goto err; + + ret = collect_fds(pid, &dfds); + if (ret) { + pr_err("Collect fds (pid: %d) failed with %d\n", pid, ret); + goto err; + } + + parasite_ensure_args_size(drain_fds_size(dfds)); + } + + ret = parse_posix_timers(pid, &proc_args); + if (ret < 0) { + pr_err("Can't read posix timers file (pid: %d)\n", pid); + goto err; + } + + parasite_ensure_args_size(posix_timers_dump_size(proc_args.timer_n)); + + ret = dump_task_signals(pid, item); + if (ret) { + pr_err("Dump %d signals failed %d\n", pid, ret); + goto err; + } + + parasite_ctl = parasite_infect_seized(pid, item, &vmas); + if (!parasite_ctl) { + pr_err("Can't infect (pid: %d) with parasite\n", pid); + goto err; + } + + if (fault_injected(FI_DUMP_EARLY)) { + pr_info("fault: CRIU sudden detach\n"); + kill(getpid(), SIGKILL); + } + + if (root_ns_mask & CLONE_NEWPID && root_item == item) { + int pfd; + + pfd = parasite_get_proc_fd_seized(parasite_ctl); + if (pfd < 0) { + pr_err("Can't get proc fd (pid: %d)\n", pid); + goto err_cure_imgset; + } + + if (install_service_fd(CR_PROC_FD_OFF, pfd) < 0) + goto err_cure_imgset; + } + + ret = parasite_fixup_vdso(parasite_ctl, pid, &vmas); + if (ret) { + pr_err("Can't fixup vdso VMAs (pid: %d)\n", pid); + goto err_cure_imgset; + } + + ret = parasite_collect_aios(parasite_ctl, &vmas); /* FIXME -- merge with above */ + if (ret) { + pr_err("Failed to check aio rings (pid: %d)\n", pid); + goto err_cure_imgset; + } + + ret = parasite_dump_misc_seized(parasite_ctl, &misc); + if (ret) { + pr_err("Can't dump misc (pid: %d)\n", pid); + goto err_cure_imgset; + } + + item->pid->ns[0].virt = misc.pid; + pstree_insert_pid(item->pid); + item->sid = misc.sid; + item->pgid = misc.pgid; + + 
pr_info("sid=%d pgid=%d pid=%d\n", + item->sid, item->pgid, vpid(item)); + + if (item->sid == 0) { + pr_err("A session leader of %d(%d) is outside of its pid namespace\n", + item->pid->real, vpid(item)); + goto err_cure; + } + + cr_imgset = cr_task_imgset_open(vpid(item), O_DUMP); + if (!cr_imgset) + goto err_cure; + + ret = dump_task_ids(item, cr_imgset); + if (ret) { + pr_err("Dump ids (pid: %d) failed with %d\n", pid, ret); + goto err_cure; + } + + if (dfds) { + ret = dump_task_files_seized(parasite_ctl, item, dfds); + if (ret) { + pr_err("Dump files (pid: %d) failed with %d\n", pid, ret); + goto err_cure; + } + ret = flush_eventpoll_dinfo_queue(); + if (ret) { + pr_err("Dump eventpoll (pid: %d) failed with %d\n", pid, ret); + goto err_cure; + } + } + + mdc.pre_dump = false; + mdc.lazy = opts.lazy_pages; + mdc.stat = &pps_buf; + mdc.parent_ie = parent_ie; + + ret = parasite_dump_pages_seized(item, &vmas, &mdc, parasite_ctl); + if (ret) + goto err_cure; + + ret = parasite_dump_sigacts_seized(parasite_ctl, item); + if (ret) { + pr_err("Can't dump sigactions (pid: %d) with parasite\n", pid); + goto err_cure; + } + + ret = parasite_dump_itimers_seized(parasite_ctl, item); + if (ret) { + pr_err("Can't dump itimers (pid: %d)\n", pid); + goto err_cure; + } + + ret = parasite_dump_posix_timers_seized(&proc_args, parasite_ctl, item); + if (ret) { + pr_err("Can't dump posix timers (pid: %d)\n", pid); + goto err_cure; + } + + ret = dump_task_core_all(parasite_ctl, item, &pps_buf, cr_imgset); + if (ret) { + pr_err("Dump core (pid: %d) failed with %d\n", pid, ret); + goto err_cure; + } + + ret = compel_stop_daemon(parasite_ctl); + if (ret) { + pr_err("Can't cure (pid: %d) from parasite\n", pid); + goto err; + } + + ret = dump_task_threads(parasite_ctl, item); + if (ret) { + pr_err("Can't dump threads\n"); + goto err; + } + + if (opts.lazy_pages) + ret = compel_cure_remote(parasite_ctl); + else + ret = compel_cure(parasite_ctl); + if (ret) { + pr_err("Can't cure (pid: %d) 
from parasite\n", pid);
+		goto err;
+	}
+
+	ret = dump_task_mm(pid, &pps_buf, &misc, &vmas, cr_imgset);
+	if (ret) {
+		pr_err("Dump mappings (pid: %d) failed with %d\n", pid, ret);
+		goto err;
+	}
+
+	ret = dump_task_fs(pid, &misc, cr_imgset);
+	if (ret) {
+		pr_err("Dump fs (pid: %d) failed with %d\n", pid, ret);
+		goto err;
+	}
+
+	close_cr_imgset(&cr_imgset);
+	exit_code = 0;
+err:
+	close_pid_proc();
+	free_mappings(&vmas);
+	xfree(dfds);
+	return exit_code;
+
+err_cure:
+	close_cr_imgset(&cr_imgset);
+err_cure_imgset:
+	compel_cure(parasite_ctl);
+	goto err;
+}
+
+static int alarm_attempts = 0;
+
+bool alarm_timeouted() {
+	return alarm_attempts > 0;
+}
+
+static void alarm_handler(int signo)
+{
+
+	pr_err("Timeout reached. Try to interrupt: %d\n", alarm_attempts);
+	if (alarm_attempts++ < 5) {
+		alarm(1);
+		/* The current syscall will be interrupted and fail with EINTR */
+		return;
+	}
+	pr_err("FATAL: Unable to interrupt the current operation\n");
+	BUG();
+}
+
+static int setup_alarm_handler()
+{
+	struct sigaction sa = {
+		.sa_handler	= alarm_handler,
+		.sa_flags	= 0, /* Don't restart syscalls */
+	};
+
+	sigemptyset(&sa.sa_mask);
+	sigaddset(&sa.sa_mask, SIGALRM);
+	if (sigaction(SIGALRM, &sa, NULL)) {
+		pr_perror("Unable to setup SIGALRM handler");
+		return -1;
+	}
+
+	return 0;
+}
+
+static int cr_pre_dump_finish(int status)
+{
+	InventoryEntry he = INVENTORY_ENTRY__INIT;
+	struct pstree_item *item;
+	int ret;
+
+	/*
+	 * Restore registers for tasks only. The threads have not been
+	 * infected. Therefore, the thread register sets have not been changed.
+ */ + ret = arch_set_thread_regs(root_item, false); + if (ret) + goto err; + + ret = inventory_save_uptime(&he); + if (ret) + goto err; + + pstree_switch_state(root_item, TASK_ALIVE); + + timing_stop(TIME_FROZEN); + + if (status < 0) { + ret = status; + goto err; + } + + pr_info("Pre-dumping tasks' memory\n"); + for_each_pstree_item(item) { + struct parasite_ctl *ctl = dmpi(item)->parasite_ctl; + struct page_pipe *mem_pp; + struct page_xfer xfer; + + if (!ctl) + continue; + + pr_info("\tPre-dumping %d\n", vpid(item)); + timing_start(TIME_MEMWRITE); + ret = open_page_xfer(&xfer, CR_FD_PAGEMAP, vpid(item)); + if (ret < 0) + goto err; + + mem_pp = dmpi(item)->mem_pp; + ret = page_xfer_dump_pages(&xfer, mem_pp); + + xfer.close(&xfer); + + if (ret) + goto err; + + timing_stop(TIME_MEMWRITE); + + destroy_page_pipe(mem_pp); + compel_cure_local(ctl); + } + + free_pstree(root_item); + seccomp_free_entries(); + + if (irmap_predump_run()) { + ret = -1; + goto err; + } + +err: + if (disconnect_from_page_server()) + ret = -1; + + if (bfd_flush_images()) + ret = -1; + + if (write_img_inventory(&he)) + ret = -1; + + if (ret) + pr_err("Pre-dumping FAILED.\n"); + else { + write_stats(DUMP_STATS); + pr_info("Pre-dumping finished successfully\n"); + } + return ret; +} + +int cr_pre_dump_tasks(pid_t pid) +{ + InventoryEntry *parent_ie = NULL; + struct pstree_item *item; + int ret = -1; + + /* + * We might need a lot of pipes to fetch huge number of pages to dump. 
+ */ + rlimit_unlimit_nofile(); + + if (opts.remote && push_snapshot_id() < 0) { + pr_err("Failed to push image namespace.\n"); + goto err; + } + + root_item = alloc_pstree_item(); + if (!root_item) + goto err; + root_item->pid->real = pid; + + if (!opts.track_mem) { + pr_info("Enforcing memory tracking for pre-dump.\n"); + opts.track_mem = true; + } + + if (opts.final_state == TASK_DEAD) { + pr_info("Enforcing tasks run after pre-dump.\n"); + opts.final_state = TASK_ALIVE; + } + + if (init_stats(DUMP_STATS)) + goto err; + + if (cr_plugin_init(CR_PLUGIN_STAGE__PRE_DUMP)) + goto err; + + if (lsm_check_opts()) + goto err; + + if (irmap_load_cache()) + goto err; + + if (cpu_init()) + goto err; + + if (vdso_init_dump()) + goto err; + + if (connect_to_page_server_to_send() < 0) + goto err; + + if (setup_alarm_handler()) + goto err; + + if (collect_pstree()) + goto err; + + if (collect_pstree_ids_predump()) + goto err; + + if (collect_namespaces(false) < 0) + goto err; + + /* Errors handled later in detect_pid_reuse */ + parent_ie = get_parent_inventory(); + + for_each_pstree_item(item) + if (pre_dump_one_task(item, parent_ie)) + goto err; + + if (parent_ie) { + inventory_entry__free_unpacked(parent_ie, NULL); + parent_ie = NULL; + } + + ret = cr_dump_shmem(); + if (ret) + goto err; + + if (irmap_predump_prep()) + goto err; + + ret = 0; +err: + if (parent_ie) + inventory_entry__free_unpacked(parent_ie, NULL); + + return cr_pre_dump_finish(ret); +} + +static int cr_lazy_mem_dump(void) +{ + struct pstree_item *item; + int ret = 0; + + pr_info("Starting lazy pages server\n"); + ret = cr_page_server(false, true, -1); + + for_each_pstree_item(item) { + if (item->pid->state != TASK_DEAD) { + destroy_page_pipe(dmpi(item)->mem_pp); + compel_cure_local(dmpi(item)->parasite_ctl); + } + } + + if (ret) + pr_err("Lazy pages transfer FAILED.\n"); + else + pr_info("Lazy pages transfer finished successfully\n"); + + return ret; +} + +static int cr_dump_finish(int ret) +{ + int 
post_dump_ret = 0; + + if (disconnect_from_page_server()) + ret = -1; + + close_cr_imgset(&glob_imgset); + + if (bfd_flush_images()) + ret = -1; + + cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret); + cgp_fini(); + + if (!ret) { + /* + * It might be a migration case, where we're asked + * to dump everything, then some script transfer + * image on a new node and we're supposed to kill + * dumpee because it continue running somewhere + * else. + * + * Thus ask user via script if we're to break + * checkpoint. + */ + post_dump_ret = run_scripts(ACT_POST_DUMP); + if (post_dump_ret) { + post_dump_ret = WEXITSTATUS(post_dump_ret); + pr_info("Post dump script passed with %d\n", post_dump_ret); + } + } + + /* + * Dump is complete at this stage. To choose what + * to do next we need to consider the following + * scenarios + * + * - error happened during checkpoint: just clean up + * everything and continue execution of the dumpee; + * + * - dump succeeded but post-dump script returned + * some ret code: same as in previous scenario -- + * just clean up everything and continue execution, + * we will return script ret code back to criu caller + * and it's up to a caller what to do with running instance + * of the dumpee -- either kill it, or continue running; + * + * - dump succeeded but -R option passed, pointing that + * we're asked to continue execution of the dumpee. It's + * assumed that a user will use post-dump script to keep + * consistency of the FS and other resources, we simply + * start rollback procedure and cleanup everything. + */ + if (ret || post_dump_ret || opts.final_state == TASK_ALIVE) { + network_unlock(); + delete_link_remaps(); + clean_cr_time_mounts(); + } + + if (!ret && opts.lazy_pages) + ret = cr_lazy_mem_dump(); + + if (arch_set_thread_regs(root_item, true) < 0) + return -1; + pstree_switch_state(root_item, + (ret || post_dump_ret) ? 
+ TASK_ALIVE : opts.final_state); + timing_stop(TIME_FROZEN); + free_pstree(root_item); + seccomp_free_entries(); + free_file_locks(); + free_link_remaps(); + free_aufs_branches(); + free_userns_maps(); + + close_service_fd(CR_PROC_FD_OFF); + + if (opts.remote && (finish_remote_dump() < 0)) { + pr_err("Finish remote dump failed.\n"); + return post_dump_ret ? : 1; + } + + if (ret) { + pr_err("Dumping FAILED.\n"); + } else { + write_stats(DUMP_STATS); + pr_info("Dumping finished successfully\n"); + } + return post_dump_ret ? : (ret != 0); +} + +int cr_dump_tasks(pid_t pid) +{ + InventoryEntry he = INVENTORY_ENTRY__INIT; + InventoryEntry *parent_ie = NULL; + struct pstree_item *item; + int pre_dump_ret = 0; + int ret = -1; + + pr_info("========================================\n"); + pr_info("Dumping processes (pid: %d)\n", pid); + pr_info("========================================\n"); + + /* + * We will fetch all file descriptors for each task, their number can + * be bigger than a default file limit, so we need to raise it to the + * maximum. + */ + rlimit_unlimit_nofile(); + + if (opts.remote && push_snapshot_id() < 0) { + pr_err("Failed to push image namespace.\n"); + goto err; + } + + root_item = alloc_pstree_item(); + if (!root_item) + goto err; + root_item->pid->real = pid; + + pre_dump_ret = run_scripts(ACT_PRE_DUMP); + if (pre_dump_ret != 0) { + pr_err("Pre dump script failed with %d!\n", pre_dump_ret); + goto err; + } + if (init_stats(DUMP_STATS)) + goto err; + + if (cr_plugin_init(CR_PLUGIN_STAGE__DUMP)) + goto err; + + if (lsm_check_opts()) + goto err; + + if (irmap_load_cache()) + goto err; + + if (cpu_init()) + goto err; + + if (vdso_init_dump()) + goto err; + + if (cgp_init(opts.cgroup_props, + opts.cgroup_props ? 
+ strlen(opts.cgroup_props) : 0, + opts.cgroup_props_file)) + goto err; + + if (parse_cg_info()) + goto err; + + if (prepare_inventory(&he)) + goto err; + + if (opts.cpu_cap & CPU_CAP_IMAGE) { + if (cpu_dump_cpuinfo()) + goto err; + } + + if (connect_to_page_server_to_send() < 0) + goto err; + + if (setup_alarm_handler()) + goto err; + + /* + * The collect_pstree will also stop (PTRACE_SEIZE) the tasks + * thus ensuring that they don't modify anything we collect + * afterwards. + */ + + if (collect_pstree()) + goto err; + + if (collect_pstree_ids()) + goto err; + + if (network_lock()) + goto err; + + if (collect_file_locks()) + goto err; + + if (collect_namespaces(true) < 0) + goto err; + + glob_imgset = cr_glob_imgset_open(O_DUMP); + if (!glob_imgset) + goto err; + + if (seccomp_collect_dump_filters() < 0) + goto err; + + /* Errors handled later in detect_pid_reuse */ + parent_ie = get_parent_inventory(); + + for_each_pstree_item(item) { + if (dump_one_task(item, parent_ie)) + goto err; + } + + if (parent_ie) { + inventory_entry__free_unpacked(parent_ie, NULL); + parent_ie = NULL; + } + + /* + * It may happen that a process has completed but its files in + * /proc/PID/ are still open by another process. If the PID has been + * given to some newer thread since then, we may be unable to dump + * all this. + */ + if (dead_pid_conflict()) + goto err; + + /* MNT namespaces are dumped after files to save remapped links */ + if (dump_mnt_namespaces() < 0) + goto err; + + if (dump_file_locks()) + goto err; + + if (dump_verify_tty_sids()) + goto err; + + if (dump_zombies()) + goto err; + + if (dump_pstree(root_item)) + goto err; + + /* + * TODO: cr_dump_shmem has to be called before dump_namespaces(), + * because page_ids is a global variable and it is used to dump + * ipc shared memory, but an ipc namespace is dumped in a child + * process. 
+ */ + ret = cr_dump_shmem(); + if (ret) + goto err; + + if (root_ns_mask) { + ret = dump_namespaces(root_item, root_ns_mask); + if (ret) + goto err; + } + + ret = dump_cgroups(); + if (ret) + goto err; + + ret = fix_external_unix_sockets(); + if (ret) + goto err; + + ret = tty_post_actions(); + if (ret) + goto err; + + ret = inventory_save_uptime(&he); + if (ret) + goto err; + + ret = write_img_inventory(&he); + if (ret) + goto err; +err: + if (parent_ie) + inventory_entry__free_unpacked(parent_ie, NULL); + + return cr_dump_finish(ret); +} diff --git a/CRIU_code/criu/cr-errno.c b/CRIU_code/criu/cr-errno.c new file mode 100644 index 0000000..b62bb54 --- /dev/null +++ b/CRIU_code/criu/cr-errno.c @@ -0,0 +1,12 @@ +static int cr_errno; + +int get_cr_errno(void) +{ + return cr_errno; +} + +void set_cr_errno(int new_err) +{ + if (!cr_errno) + cr_errno = new_err; +} diff --git a/CRIU_code/criu/cr-restore.c b/CRIU_code/criu/cr-restore.c new file mode 100644 index 0000000..ecfee12 --- /dev/null +++ b/CRIU_code/criu/cr-restore.c @@ -0,0 +1,3587 @@ +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "types.h" +#include +#include "common/compiler.h" + +#include "clone-noasan.h" +#include "cr_options.h" +#include "servicefd.h" +#include "image.h" +#include "img-remote.h" +#include "util.h" +#include "util-pie.h" +#include "criu-log.h" +#include "restorer.h" +#include "sockets.h" +#include "sk-packet.h" +#include "common/lock.h" +#include "files.h" +#include "pipes.h" +#include "fifo.h" +#include "sk-inet.h" +#include "eventfd.h" +#include "eventpoll.h" +#include "signalfd.h" +#include "proc_parse.h" +#include "pie/restorer-blob.h" +#include "crtools.h" +#include "uffd.h" +#include "namespaces.h" +#include "mem.h" +#include "mount.h" +#include "fsnotify.h" +#include "pstree.h" +#include "net.h" +#include "tty.h" +#include "cpu.h" +#include 
"file-lock.h" +#include "vdso.h" +#include "stats.h" +#include "tun.h" +#include "vma.h" +#include "kerndat.h" +#include "rst-malloc.h" +#include "plugin.h" +#include "cgroup.h" +#include "timerfd.h" +#include "action-scripts.h" +#include "shmem.h" +#include +#include "aio.h" +#include "lsm.h" +#include "seccomp.h" +#include "fault-injection.h" +#include "sk-queue.h" +#include "sigframe.h" +#include "fdstore.h" + +#include "parasite-syscall.h" +#include "files-reg.h" +#include +#include "compel/include/asm/syscall.h" + +#include "protobuf.h" +#include "images/sa.pb-c.h" +#include "images/timer.pb-c.h" +#include "images/vma.pb-c.h" +#include "images/rlimit.pb-c.h" +#include "images/pagemap.pb-c.h" +#include "images/siginfo.pb-c.h" + +#include "restore.h" + +#include "cr-errno.h" + +#include "pie/pie-relocs.h" + +#ifndef arch_export_restore_thread +#define arch_export_restore_thread __export_restore_thread +#endif + +#ifndef arch_export_restore_task +#define arch_export_restore_task __export_restore_task +#endif + +#ifndef arch_export_unmap +#define arch_export_unmap __export_unmap +#define arch_export_unmap_compat __export_unmap_compat +#endif + +struct pstree_item *current; + +static int restore_task_with_children(void *); +static int sigreturn_restore(pid_t pid, struct task_restore_args *ta, unsigned long alen, CoreEntry *core); +static int prepare_restorer_blob(void); +static int prepare_rlimits(int pid, struct task_restore_args *, CoreEntry *core); +static int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core); +static int prepare_signals(int pid, struct task_restore_args *, CoreEntry *core); + +/* + * Architectures can overwrite this function to restore registers that are not + * present in the sigreturn signal frame. 
+ */ +int __attribute__((weak)) arch_set_thread_regs_nosigrt(struct pid *pid) +{ + return 0; +} + +static inline int stage_participants(int next_stage) +{ + switch (next_stage) { + case CR_STATE_FAIL: + return 0; + case CR_STATE_ROOT_TASK: + case CR_STATE_PREPARE_NAMESPACES: + return 1; + case CR_STATE_FORKING: + return task_entries->nr_tasks + task_entries->nr_helpers; + case CR_STATE_RESTORE: + return task_entries->nr_threads + task_entries->nr_helpers; + case CR_STATE_RESTORE_SIGCHLD: + case CR_STATE_RESTORE_CREDS: + return task_entries->nr_threads; + } + + BUG(); + return -1; +} + +static inline int stage_current_participants(int next_stage) +{ + switch (next_stage) { + case CR_STATE_FORKING: + return 1; + case CR_STATE_RESTORE: + /* + * Each thread has to be reported about this stage, + * so if we want to wait all other tasks, we have to + * exclude all threads of the current process. + * It is supposed that we will wait other tasks, + * before creating threads of the current task. 
+ */ + return current->nr_threads; + } + + BUG(); + return -1; +} + +static int __restore_wait_inprogress_tasks(int participants) +{ + int ret; + futex_t *np = &task_entries->nr_in_progress; + + futex_wait_while_gt(np, participants); + ret = (int)futex_get(np); + if (ret < 0) { + set_cr_errno(get_task_cr_err()); + return ret; + } + + return 0; +} + +static int restore_wait_inprogress_tasks() +{ + return __restore_wait_inprogress_tasks(0); +} + +/* Wait all tasks except the current one */ +static int restore_wait_other_tasks() +{ + int participants, stage; + + stage = futex_get(&task_entries->start); + participants = stage_current_participants(stage); + + return __restore_wait_inprogress_tasks(participants); +} + +static inline void __restore_switch_stage_nw(int next_stage) +{ + futex_set(&task_entries->nr_in_progress, + stage_participants(next_stage)); + futex_set(&task_entries->start, next_stage); +} + +static inline void __restore_switch_stage(int next_stage) +{ + if (next_stage != CR_STATE_COMPLETE) + futex_set(&task_entries->nr_in_progress, + stage_participants(next_stage)); + futex_set_and_wake(&task_entries->start, next_stage); +} + +static int restore_switch_stage(int next_stage) +{ + __restore_switch_stage(next_stage); + return restore_wait_inprogress_tasks(); +} + +static int restore_finish_ns_stage(int from, int to) +{ + if (root_ns_mask) + return restore_finish_stage(task_entries, from); + + /* Nobody waits for this stage change, just go ahead */ + __restore_switch_stage_nw(to); + return 0; +} + +static int crtools_prepare_shared(void) +{ + if (prepare_files()) + return -1; + + /* We might want to remove ghost files on failed restore */ + if (collect_remaps_and_regfiles()) + return -1; + + /* Connections are unlocked from criu */ + if (!files_collected() && collect_image(&inet_sk_cinfo)) + return -1; + + if (collect_binfmt_misc()) + return -1; + + if (tty_prep_fds()) + return -1; + + if (prepare_cgroup()) + return -1; + + return 0; +} + +/* + * Collect 
order information: + * - reg_file should be before remap, as the latter needs + * to find file_desc objects + * - per-pid collects (mm and fd) should be after remap and + * reg_file since both per-pid ones need to get fdesc-s + * and bump counters on remaps if they exist + */ + +static struct collect_image_info *cinfos[] = { + &file_locks_cinfo, + &pipe_data_cinfo, + &fifo_data_cinfo, + &sk_queues_cinfo, +}; + +static struct collect_image_info *cinfos_files[] = { + &unix_sk_cinfo, + &fifo_cinfo, + &pipe_cinfo, + &nsfile_cinfo, + &packet_sk_cinfo, + &netlink_sk_cinfo, + &eventfd_cinfo, + &epoll_cinfo, + &epoll_tfd_cinfo, + &signalfd_cinfo, + &tunfile_cinfo, + &timerfd_cinfo, + &inotify_cinfo, + &inotify_mark_cinfo, + &fanotify_cinfo, + &fanotify_mark_cinfo, + &ext_file_cinfo, +}; + +/* These images are required to restore namespaces */ +static struct collect_image_info *before_ns_cinfos[] = { + &tty_info_cinfo, /* Restore devpts content */ + &tty_cdata, +}; + +static struct pprep_head *post_prepare_heads = NULL; + +void add_post_prepare_cb(struct pprep_head *ph) +{ + ph->next = post_prepare_heads; + post_prepare_heads = ph; +} + +static int run_post_prepare(void) +{ + struct pprep_head *ph; + + for (ph = post_prepare_heads; ph != NULL; ph = ph->next) + if (ph->actor(ph)) + return -1; + + return 0; +} + +static int root_prepare_shared(void) +{ + int ret = 0; + struct pstree_item *pi; + + pr_info("Preparing info about shared resources\n"); + + if (prepare_remaps()) + return -1; + + if (seccomp_read_image()) + return -1; + + if (collect_images(cinfos, ARRAY_SIZE(cinfos))) + return -1; + + if (!files_collected() && + collect_images(cinfos_files, ARRAY_SIZE(cinfos_files))) + return -1; + + for_each_pstree_item(pi) { + if (pi->pid->state == TASK_HELPER) + continue; + + ret = prepare_mm_pid(pi); + if (ret < 0) + break; + + ret = prepare_fd_pid(pi); + if (ret < 0) + break; + + ret = prepare_fs_pid(pi); + if (ret < 0) + break; + } + + if (ret < 0) + goto err; + + 
prepare_cow_vmas(); + + ret = prepare_restorer_blob(); + if (ret) + goto err; + + /* + * This should be called with all packets collected AND all + * fdescs and fles prepared BUT post-prep-s not run. + */ + ret = prepare_scms(); + if (ret) + goto err; + + ret = run_post_prepare(); + if (ret) + goto err; + + ret = unix_prepare_root_shared(); + if (ret) + goto err; + + ret = add_fake_unix_queuers(); + if (ret) + goto err; + + show_saved_files(); +err: + return ret; +} + +/* This actually populates and occupies ROOT_FD_OFF sfd */ +static int populate_root_fd_off(void) +{ + struct ns_id *mntns = NULL; + int ret; + + if (root_ns_mask & CLONE_NEWNS) { + mntns = lookup_ns_by_id(root_item->ids->mnt_ns_id, &mnt_ns_desc); + BUG_ON(!mntns); + } + + ret = mntns_get_root_fd(mntns); + if (ret < 0) + pr_err("Can't get root fd\n"); + return ret >= 0 ? 0 : -1; +} + +static int populate_pid_proc(void) +{ + if (open_pid_proc(vpid(current)) < 0) { + pr_err("Can't open PROC_SELF\n"); + return -1; + } + return 0; +} + +static rt_sigaction_t sigchld_act; +/* + * If parent's sigaction has blocked SIGKILL (which is non-sense), + * this parent action is non-valid and shouldn't be inherited. + * Used to mark parent_act* no more valid. + */ +static rt_sigaction_t parent_act[SIGMAX]; +#ifdef CONFIG_COMPAT +static rt_sigaction_t_compat parent_act_compat[SIGMAX]; +#endif + +static bool sa_inherited(int sig, rt_sigaction_t *sa) +{ + rt_sigaction_t *pa; + int i; + + if (current == root_item) + return false; /* XXX -- inherit from CRIU? 
/*
 * Install the native (non-compat) signal action described by image
 * entry @e for signal @sig.
 *
 * Returns 0 when the action was only stashed (SIGCHLD), 1 when the
 * signal is considered restored (either inherited from the parent or
 * installed here), and a negative value if the rt_sigaction syscall
 * failed.
 */
static int restore_native_sigaction(int sig, SaEntry *e)
{
	rt_sigaction_t act;
	int ret;

	/* Build the kernel-layout sigaction from the image entry */
	ASSIGN_TYPED(act.rt_sa_handler, decode_pointer(e->sigaction));
	ASSIGN_TYPED(act.rt_sa_flags, e->flags);
	ASSIGN_TYPED(act.rt_sa_restorer, decode_pointer(e->restorer));
	BUILD_BUG_ON(sizeof(e->mask) != sizeof(act.rt_sa_mask.sig));
	memcpy(act.rt_sa_mask.sig, &e->mask, sizeof(act.rt_sa_mask.sig));

	/*
	 * SIGCHLD is not installed here: the action is only remembered in
	 * sigchld_act and nothing is counted as restored.
	 */
	if (sig == SIGCHLD) {
		sigchld_act = act;
		return 0;
	}

	/*
	 * parent_act[] is indexed by sig - 1. If the recorded parent action
	 * is identical, the syscall is skipped.
	 */
	if (sa_inherited(sig - 1, &act))
		return 1;

	/*
	 * A pure syscall is used, because glibc
	 * sigaction overwrites se_restorer.
	 */
	ret = syscall(SYS_rt_sigaction, sig, &act, NULL, sizeof(k_rtsigset_t));
	if (ret < 0) {
		pr_perror("Can't restore sigaction");
		return ret;
	}

	/* Remember what was installed so later comparisons can see it */
	parent_act[sig - 1] = act;
	/* Mark SIGKILL blocked which makes compat sigaction non-valid */
#ifdef CONFIG_COMPAT
	parent_act_compat[sig - 1].rt_sa_mask.sig[0] |= 1 << SIGKILL;
#endif

	return 1;
}
rt_sigaction_t_compat act; + int ret; + + ASSIGN_TYPED(act.rt_sa_handler, (u32)e->sigaction); + ASSIGN_TYPED(act.rt_sa_flags, e->flags); + ASSIGN_TYPED(act.rt_sa_restorer, (u32)e->restorer); + BUILD_BUG_ON(sizeof(e->mask) != sizeof(act.rt_sa_mask.sig)); + memcpy(act.rt_sa_mask.sig, &e->mask, sizeof(act.rt_sa_mask.sig)); + + if (sig == SIGCHLD) { + memcpy(&sigchld_act, &act, sizeof(rt_sigaction_t_compat)); + return 0; + } + + if (sa_compat_inherited(sig - 1, &act)) + return 1; + + if (!stack32) { + stack32 = alloc_compat_syscall_stack(); + if (!stack32) + return -1; + } + + ret = arch_compat_rt_sigaction(stack32, sig, &act); + if (ret < 0) { + pr_err("Can't restore compat sigaction: %d\n", ret); + return ret; + } + + parent_act_compat[sig - 1] = act; + /* Mark SIGKILL blocked which makes native sigaction non-valid */ + parent_act[sig - 1].rt_sa_mask.sig[0] |= 1 << SIGKILL; + + return 1; +} +#else +static int restore_compat_sigaction(int sig, SaEntry *e) +{ + return -1; +} +#endif + +static int prepare_sigactions_from_core(TaskCoreEntry *tc) +{ + int sig, i; + + if (tc->n_sigactions != SIGMAX - 2) { + pr_err("Bad number of sigactions in the image (%d, want %d)\n", + (int)tc->n_sigactions, SIGMAX - 2); + return -1; + } + + pr_info("Restore on-core sigactions for %d\n", vpid(current)); + + for (sig = 1, i = 0; sig <= SIGMAX; sig++) { + int ret; + SaEntry *e; + bool sigaction_is_compat; + + if (sig == SIGKILL || sig == SIGSTOP) + continue; + + e = tc->sigactions[i++]; + sigaction_is_compat = e->has_compat_sigaction && e->compat_sigaction; + if (sigaction_is_compat) + ret = restore_compat_sigaction(sig, e); + else + ret = restore_native_sigaction(sig, e); + + if (ret < 0) + return ret; + } + + return 0; +} + +/* Returns number of restored signals, -1 or negative errno on fail */ +static int restore_one_sigaction(int sig, struct cr_img *img, int pid) +{ + bool sigaction_is_compat; + SaEntry *e; + int ret = 0; + + BUG_ON(sig == SIGKILL || sig == SIGSTOP); + + ret = 
pb_read_one_eof(img, &e, PB_SIGACT); + if (ret == 0) { + if (sig != SIGMAX_OLD + 1) { /* backward compatibility */ + pr_err("Unexpected EOF %d\n", sig); + return -1; + } + pr_warn("This format of sigacts-%d.img is deprecated\n", pid); + return -1; + } + if (ret < 0) + return ret; + + sigaction_is_compat = e->has_compat_sigaction && e->compat_sigaction; + if (sigaction_is_compat) + ret = restore_compat_sigaction(sig, e); + else + ret = restore_native_sigaction(sig, e); + + sa_entry__free_unpacked(e, NULL); + + return ret; +} + +static int prepare_sigactions_from_image(void) +{ + int pid = vpid(current); + struct cr_img *img; + int sig, rst = 0; + int ret = 0; + + pr_info("Restore sigacts for %d\n", pid); + + img = open_image(CR_FD_SIGACT, O_RSTR, pid); + if (!img) + return -1; + + for (sig = 1; sig <= SIGMAX; sig++) { + if (sig == SIGKILL || sig == SIGSTOP) + continue; + + ret = restore_one_sigaction(sig, img, pid); + if (ret < 0) + break; + if (ret) + rst++; + } + + pr_info("Restored %d/%d sigacts\n", rst, + SIGMAX - 3 /* KILL, STOP and CHLD */); + + close_image(img); + return ret; +} + +static int prepare_sigactions(CoreEntry *core) +{ + int ret; + + if (!task_alive(current)) + return 0; + + if (core->tc->n_sigactions != 0) + ret = prepare_sigactions_from_core(core->tc); + else + ret = prepare_sigactions_from_image(); + + if (stack32) { + free_compat_syscall_stack(stack32); + stack32 = NULL; + } + + return ret; +} + +static int __collect_child_pids(struct pstree_item *p, int state, unsigned int *n) +{ + struct pstree_item *pi; + + list_for_each_entry(pi, &p->children, sibling) { + pid_t *child; + + if (pi->pid->state != state) + continue; + + child = rst_mem_alloc(sizeof(*child), RM_PRIVATE); + if (!child) + return -1; + + (*n)++; + *child = vpid(pi); + } + + return 0; +} + +static int collect_child_pids(int state, unsigned int *n) +{ + struct pstree_item *pi; + + *n = 0; + + /* + * All children of helpers and zombies will be reparented to the init + * process and 
they have to be collected too. + */ + + if (current == root_item) { + for_each_pstree_item(pi) { + if (pi->pid->state != TASK_HELPER && + pi->pid->state != TASK_DEAD) + continue; + if (__collect_child_pids(pi, state, n)) + return -1; + } + } + + return __collect_child_pids(current, state, n); +} + +static int collect_helper_pids(struct task_restore_args *ta) +{ + ta->helpers = (pid_t *)rst_mem_align_cpos(RM_PRIVATE); + return collect_child_pids(TASK_HELPER, &ta->helpers_n); +} + +static int collect_zombie_pids(struct task_restore_args *ta) +{ + ta->zombies = (pid_t *)rst_mem_align_cpos(RM_PRIVATE); + return collect_child_pids(TASK_DEAD, &ta->zombies_n); +} + +static int open_core(int pid, CoreEntry **pcore) +{ + int ret; + struct cr_img *img; + + img = open_image(CR_FD_CORE, O_RSTR, pid); + if (!img) { + pr_err("Can't open core data for %d\n", pid); + return -1; + } + + ret = pb_read_one(img, pcore, PB_CORE); + close_image(img); + + return ret <= 0 ? -1 : 0; +} + +static int open_cores(int pid, CoreEntry *leader_core) +{ + int i, tpid; + CoreEntry **cores = NULL; + + cores = xmalloc(sizeof(*cores)*current->nr_threads); + if (!cores) + goto err; + + for (i = 0; i < current->nr_threads; i++) { + tpid = current->threads[i].ns[0].virt; + + if (tpid == pid) + cores[i] = leader_core; + else if (open_core(tpid, &cores[i])) + goto err; + } + + current->core = cores; + + /* + * Walk over all threads and if one them is having + * active seccomp mode we will suspend filtering + * on the whole group until restore complete. + * + * Otherwise any criu code which might use same syscall + * if present inside a filter chain would take filter + * action and might break restore procedure. 
+ */ + for (i = 0; i < current->nr_threads; i++) { + ThreadCoreEntry *thread_core = cores[i]->thread_core; + if (thread_core->seccomp_mode != SECCOMP_MODE_DISABLED) { + rsti(current)->has_seccomp = true; + break; + } + } + + return 0; +err: + xfree(cores); + return -1; +} + +static int prepare_oom_score_adj(int value) +{ + int fd, ret = 0; + char buf[11]; + + fd = open_proc_rw(PROC_SELF, "oom_score_adj"); + if (fd < 0) + return -1; + + snprintf(buf, 11, "%d", value); + + if (write(fd, buf, 11) < 0) { + pr_perror("Write %s to /proc/self/oom_score_adj failed", buf); + ret = -1; + } + + close(fd); + return ret; +} + +static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc) +{ + int ret; + + /* loginuid value is critical to restore */ + if (kdat.luid == LUID_FULL && tc->has_loginuid && + tc->loginuid != INVALID_UID) { + ret = prepare_loginuid(tc->loginuid, LOG_ERROR); + if (ret < 0) + return ret; + } + + /* oom_score_adj is not critical: only log errors */ + if (tc->has_oom_score_adj && tc->oom_score_adj != 0) + prepare_oom_score_adj(tc->oom_score_adj); + + return 0; +} + +static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core); +static int prepare_mm(pid_t pid, struct task_restore_args *args); + +static int restore_one_alive_task(int pid, CoreEntry *core) +{ + unsigned args_len; + struct task_restore_args *ta; + pr_info("Restoring resources\n"); + + rst_mem_switch_to_private(); + + args_len = round_up(sizeof(*ta) + sizeof(struct thread_restore_args) * + current->nr_threads, page_size()); + ta = mmap(NULL, args_len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (!ta) + return -1; + + memzero(ta, args_len); + + if (prepare_fds(current)) + return -1; + + if (prepare_file_locks(pid)) + return -1; + + if (open_vmas(current)) + return -1; + + if (prepare_aios(current, ta)) + return -1; + + if (fixup_sysv_shmems()) + return -1; + + if (open_cores(pid, core)) + return -1; + + if (prepare_signals(pid, ta, core)) + return -1; + 
+ if (prepare_posix_timers(pid, ta, core)) + return -1; + + if (prepare_rlimits(pid, ta, core) < 0) + return -1; + + if (collect_helper_pids(ta) < 0) + return -1; + + if (collect_zombie_pids(ta) < 0) + return -1; + + if (prepare_proc_misc(pid, core->tc)) + return -1; + + /* + * Get all the tcp sockets fds into rst memory -- restorer + * will turn repair off before going sigreturn + */ + if (prepare_tcp_socks(ta)) + return -1; + + /* + * Copy timerfd params for restorer args, we need to proceed + * timer setting at the very late. + */ + if (prepare_timerfds(ta)) + return -1; + + if (seccomp_prepare_threads(current, ta) < 0) + return -1; + + if (prepare_itimers(pid, ta, core) < 0) + return -1; + + if (prepare_mm(pid, ta)) + return -1; + + if (prepare_vmas(current, ta)) + return -1; + + /* + * Sockets have to be restored in their network namespaces, + * so a task namespace has to be restored after sockets. + */ + if (restore_task_net_ns(current)) + return -1; + + if (setup_uffd(pid, ta)) + return -1; + + return sigreturn_restore(pid, ta, args_len, core); +} + +static void zombie_prepare_signals(void) +{ + sigset_t blockmask; + int sig; + struct sigaction act; + + sigfillset(&blockmask); + sigprocmask(SIG_UNBLOCK, &blockmask, NULL); + + memset(&act, 0, sizeof(act)); + act.sa_handler = SIG_DFL; + + for (sig = 1; sig <= SIGMAX; sig++) + sigaction(sig, &act, NULL); +} + +#define SIG_FATAL_MASK ( \ + (1 << SIGHUP) |\ + (1 << SIGINT) |\ + (1 << SIGQUIT) |\ + (1 << SIGILL) |\ + (1 << SIGTRAP) |\ + (1 << SIGABRT) |\ + (1 << SIGIOT) |\ + (1 << SIGBUS) |\ + (1 << SIGFPE) |\ + (1 << SIGKILL) |\ + (1 << SIGUSR1) |\ + (1 << SIGSEGV) |\ + (1 << SIGUSR2) |\ + (1 << SIGPIPE) |\ + (1 << SIGALRM) |\ + (1 << SIGTERM) |\ + (1 << SIGXCPU) |\ + (1 << SIGXFSZ) |\ + (1 << SIGVTALRM)|\ + (1 << SIGPROF) |\ + (1 << SIGPOLL) |\ + (1 << SIGIO) |\ + (1 << SIGSYS) |\ + (1 << SIGSTKFLT)|\ + (1 << SIGPWR) \ + ) + +static inline int sig_fatal(int sig) +{ + return (sig > 0) && (sig < SIGMAX) && 
(SIG_FATAL_MASK & (1UL << sig)); +} + +struct task_entries *task_entries; +static unsigned long task_entries_pos; + +static int wait_on_helpers_zombies(void) +{ + struct pstree_item *pi; + + list_for_each_entry(pi, ¤t->children, sibling) { + pid_t pid = vpid(pi); + int status; + + switch (pi->pid->state) { + case TASK_DEAD: + if (waitid(P_PID, pid, NULL, WNOWAIT | WEXITED) < 0) { + pr_perror("Wait on %d zombie failed", pid); + return -1; + } + break; + case TASK_HELPER: + if (waitpid(pid, &status, 0) != pid) { + pr_perror("waitpid for helper %d failed", pid); + return -1; + } + break; + } + } + + return 0; +} + +static int wait_exiting_children(void); + +static int restore_one_zombie(CoreEntry *core) +{ + int exit_code = core->tc->exit_code; + + pr_info("Restoring zombie with %d code\n", exit_code); + + if (prepare_fds(current)) + return -1; + + if (lazy_pages_setup_zombie(vpid(current))) + return -1; + + prctl(PR_SET_NAME, (long)(void *)core->tc->comm, 0, 0, 0); + + if (task_entries != NULL) { + wait_exiting_children(); + zombie_prepare_signals(); + } + + if (exit_code & 0x7f) { + int signr; + + /* prevent generating core files */ + if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0)) + pr_perror("Can't drop the dumpable flag"); + + signr = exit_code & 0x7F; + if (!sig_fatal(signr)) { + pr_warn("Exit with non fatal signal ignored\n"); + signr = SIGABRT; + } + + if (kill(vpid(current), signr) < 0) + pr_perror("Can't kill myself, will just exit"); + + exit_code = 0; + } + + exit((exit_code >> 8) & 0x7f); + + /* never reached */ + BUG_ON(1); + return -1; +} + +static int setup_newborn_fds(struct pstree_item *me) +{ + if (clone_service_fd(me)) + return -1; + + if (!me->parent || + (rsti(me->parent)->fdt && !(rsti(me)->clone_flags & CLONE_FILES))) { + /* + * When our parent has shared fd table, some of the table owners + * may be already created. Files, they open, will be inherited + * by current process, and here we close them. Also, service fds + * of parent are closed here. 
And root_item closes the files, + * that were inherited from criu process. + */ + if (close_old_fds()) + return -1; + } + + return 0; +} + +static int check_core(CoreEntry *core, struct pstree_item *me) +{ + int ret = -1; + + if (core->mtype != CORE_ENTRY__MARCH) { + pr_err("Core march mismatch %d\n", (int)core->mtype); + goto out; + } + + if (!core->tc) { + pr_err("Core task state data missed\n"); + goto out; + } + + if (core->tc->task_state != TASK_DEAD) { + if (!core->ids && !me->ids) { + pr_err("Core IDS data missed for non-zombie\n"); + goto out; + } + + if (!CORE_THREAD_ARCH_INFO(core)) { + pr_err("Core info data missed for non-zombie\n"); + goto out; + } + + /* + * Seccomp are moved to per-thread origin, + * so for old images we need to move per-task + * data into proper place. + */ + if (core->tc->has_old_seccomp_mode) { + core->thread_core->has_seccomp_mode = core->tc->has_old_seccomp_mode; + core->thread_core->seccomp_mode = core->tc->old_seccomp_mode; + } + if (core->tc->has_old_seccomp_filter) { + core->thread_core->has_seccomp_filter = core->tc->has_old_seccomp_filter; + core->thread_core->seccomp_filter = core->tc->old_seccomp_filter; + rsti(me)->has_old_seccomp_filter = true; + } + } + + ret = 0; +out: + return ret; +} + +/* + * Find if there are children which are zombies or helpers - processes + * which are expected to die during the restore. + */ +static bool child_death_expected(void) +{ + struct pstree_item *pi; + + list_for_each_entry(pi, ¤t->children, sibling) { + switch (pi->pid->state) { + case TASK_DEAD: + case TASK_HELPER: + return true; + } + } + + return false; +} + +static int wait_exiting_children(void) +{ + siginfo_t info; + + if (!child_death_expected()) { + /* + * Restoree has no children that should die, during restore, + * wait for the next stage on futex. + * The default SIGCHLD handler will handle an unexpected + * child's death and abort the restore if someone dies. 
+ */ + restore_finish_stage(task_entries, CR_STATE_RESTORE); + return 0; + } + + /* + * The restoree has children which will die - decrement itself from + * nr. of tasks processing the stage and wait for anyone to die. + * Tasks may die only when they're on the following stage. + * If one dies earlier - that's unexpected - treat it as an error + * and abort the restore. + */ + if (block_sigmask(NULL, SIGCHLD)) + return -1; + + /* Finish CR_STATE_RESTORE, but do not wait for the next stage. */ + futex_dec_and_wake(&task_entries->nr_in_progress); + + if (waitid(P_ALL, 0, &info, WEXITED | WNOWAIT)) { + pr_perror("Failed to wait\n"); + return -1; + } + + if (futex_get(&task_entries->start) == CR_STATE_RESTORE) { + pr_err("Child %d died too early\n", info.si_pid); + return -1; + } + + if (wait_on_helpers_zombies()) { + pr_err("Failed to wait on helpers and zombies\n"); + return -1; + } + + return 0; +} + +/* + * Restore a helper process - artificially created by criu + * to restore attributes of process tree. + * - sessions for each leaders are dead + * - process groups with dead leaders + * - dead tasks for which /proc//... 
is opened by restoring task + * - whatnot + */ +static int restore_one_helper(void) +{ + int i; + + if (prepare_fds(current)) + return -1; + + if (wait_exiting_children()) + return -1; + + sfds_protected = false; + close_image_dir(); + close_proc(); + for (i = SERVICE_FD_MIN + 1; i < SERVICE_FD_MAX; i++) + close_service_fd(i); + + return 0; +} + +static int restore_one_task(int pid, CoreEntry *core) +{ + int ret; + + /* No more fork()-s => no more per-pid logs */ + + if (task_alive(current)) + ret = restore_one_alive_task(pid, core); + else if (current->pid->state == TASK_DEAD) + ret = restore_one_zombie(core); + else if (current->pid->state == TASK_HELPER) { + ret = restore_one_helper(); + } else { + pr_err("Unknown state in code %d\n", (int)core->tc->task_state); + ret = -1; + } + + if (core) + core_entry__free_unpacked(core, NULL); + return ret; +} + +/* All arguments should be above stack, because it grows down */ +struct cr_clone_arg { + struct pstree_item *item; + unsigned long clone_flags; + + CoreEntry *core; +}; + +static void maybe_clone_parent(struct pstree_item *item, + struct cr_clone_arg *ca) +{ + /* + * zdtm runs in kernel 3.11, which has the problem described below. We + * avoid this by including the pdeath_sig test. Once users/zdtm migrate + * off of 3.11, this condition can be simplified to just test the + * options and not have the pdeath_sig test. + */ + if (opts.restore_sibling) { + /* + * This means we're called from lib's criu_restore_child(). + * In that case create the root task as the child one to+ + * the caller. This is the only way to correctly restore the + * pdeath_sig of the root task. But also looks nice. + * + * Alternatively, if we are --restore-detached, a similar trick is + * needed to correctly restore pdeath_sig and prevent processes from + * dying once restored. 
+ * + * There were a problem in kernel 3.11 -- CLONE_PARENT can't be + * set together with CLONE_NEWPID, which has been solved in further + * versions of the kernels, but we treat 3.11 as a base, so at + * least warn a user about potential problems. + */ + rsti(item)->clone_flags |= CLONE_PARENT; + if (rsti(item)->clone_flags & CLONE_NEWPID) + pr_warn("Set CLONE_PARENT | CLONE_NEWPID but it might cause restore problem," + "because not all kernels support such clone flags combinations!\n"); + } else if (opts.restore_detach) { + if (ca->core->thread_core->pdeath_sig) + pr_warn("Root task has pdeath_sig configured, so it will receive one _right_" + "after restore on CRIU exit\n"); + } +} + +static bool needs_prep_creds(struct pstree_item *item) +{ + /* + * Before the 4.13 kernel, it was impossible to set + * an exe_file if uid or gid isn't zero. + */ + return (!item->parent && ((root_ns_mask & CLONE_NEWUSER) || getuid())); +} + +static inline int fork_with_pid(struct pstree_item *item) +{ + struct cr_clone_arg ca; + int ret = -1; + pid_t pid = vpid(item); + + if (item->pid->state != TASK_HELPER) { + if (open_core(pid, &ca.core)) + return -1; + + if (check_core(ca.core, item)) + return -1; + + item->pid->state = ca.core->tc->task_state; + rsti(item)->cg_set = ca.core->tc->cg_set; + + if (item->pid->state != TASK_DEAD && !task_alive(item)) { + pr_err("Unknown task state %d\n", item->pid->state); + return -1; + } + + /* + * By default we assume that seccomp is not + * used at all (especially on dead task). Later + * we will walk over all threads and check in + * details if filter is present setting up + * this flag as appropriate. + */ + rsti(item)->has_seccomp = false; + + if (unlikely(item == root_item)) + maybe_clone_parent(item, &ca); + } else { + /* + * Helper entry will not get moved around and thus + * will live in the parent's cgset. 
+ */ + rsti(item)->cg_set = rsti(item->parent)->cg_set; + ca.core = NULL; + } + + ret = -1; + + ca.item = item; + ca.clone_flags = rsti(item)->clone_flags; + + BUG_ON(ca.clone_flags & CLONE_VM); + + pr_info("Forking task with %d pid (flags 0x%lx)\n", pid, ca.clone_flags); + + if (!(ca.clone_flags & CLONE_NEWPID)) { + char buf[32]; + int len; + int fd; + + fd = open_proc_rw(PROC_GEN, LAST_PID_PATH); + if (fd < 0) + goto err; + + lock_last_pid(); + + len = snprintf(buf, sizeof(buf), "%d", pid - 1); + if (write(fd, buf, len) != len) { + pr_perror("%d: Write %s to %s", pid, buf, LAST_PID_PATH); + close(fd); + goto err_unlock; + } + close(fd); + } else { + BUG_ON(pid != INIT_PID); + } + + /* + * Some kernel modules, such as network packet generator + * run kernel thread upon net-namespace creattion taking + * the @pid we've been requeting via LAST_PID_PATH interface + * so that we can't restore a take with pid needed. + * + * Here is an idea -- unhare net namespace in callee instead. + */ + /* + * The cgroup namespace is also unshared explicitly in the + * move_in_cgroup(), so drop this flag here as well. + */ + close_pid_proc(); + ret = clone_noasan(restore_task_with_children, + (ca.clone_flags & ~(CLONE_NEWNET | CLONE_NEWCGROUP)) | SIGCHLD, &ca); + if (ret < 0) { + pr_perror("Can't fork for %d", pid); + goto err_unlock; + } + + + if (item == root_item) { + item->pid->real = ret; + pr_debug("PID: real %d virt %d\n", + item->pid->real, vpid(item)); + } + +err_unlock: + if (!(ca.clone_flags & CLONE_NEWPID)) + unlock_last_pid(); +err: + if (ca.core) + core_entry__free_unpacked(ca.core, NULL); + return ret; +} + +static void sigchld_handler(int signal, siginfo_t *siginfo, void *data) +{ + int status, pid, exit; + + while (1) { + pid = waitpid(-1, &status, WNOHANG); + if (pid <= 0) + return; + + if (!current && WIFSTOPPED(status) && + WSTOPSIG(status) == SIGCHLD) { + /* The root task is ptraced. 
Allow it to handle SIGCHLD */ + if (ptrace(PTRACE_CONT, pid, 0, SIGCHLD)) + pr_perror("Unable to resume %d", pid); + return; + } + + exit = WIFEXITED(status); + status = exit ? WEXITSTATUS(status) : WTERMSIG(status); + + break; + } + + if (exit) + pr_err("%d exited, status=%d\n", pid, status); + else + pr_err("%d killed by signal %d: %s\n", + pid, status, strsignal(status)); + + futex_abort_and_wake(&task_entries->nr_in_progress); +} + +static int criu_signals_setup(void) +{ + int ret; + struct sigaction act; + sigset_t blockmask; + + ret = sigaction(SIGCHLD, NULL, &act); + if (ret < 0) { + pr_perror("sigaction() failed"); + return -1; + } + + act.sa_flags |= SA_NOCLDSTOP | SA_SIGINFO | SA_RESTART; + act.sa_sigaction = sigchld_handler; + sigemptyset(&act.sa_mask); + sigaddset(&act.sa_mask, SIGCHLD); + + ret = sigaction(SIGCHLD, &act, NULL); + if (ret < 0) { + pr_perror("sigaction() failed"); + return -1; + } + + /* + * The block mask will be restored in sigreturn. + * + * TODO: This code should be removed, when a freezer will be added. + */ + sigfillset(&blockmask); + sigdelset(&blockmask, SIGCHLD); + + /* + * Here we use SIG_SETMASK instead of SIG_BLOCK to avoid the case where + * we've been forked from a parent who had blocked SIGCHLD. If SIGCHLD + * is blocked when a task dies (e.g. if the task fails to restore + * somehow), we hang because our SIGCHLD handler is never run. Since we + * depend on SIGCHLD being unblocked, let's set the mask explicitly. + */ + ret = sigprocmask(SIG_SETMASK, &blockmask, NULL); + if (ret < 0) { + pr_perror("Can't block signals"); + return -1; + } + + return 0; +} + +static void restore_sid(void) +{ + pid_t sid; + + /* + * SID can only be reset to pid or inherited from parent. + * Thus we restore it right here to let our kids inherit + * one in case they need it. + * + * PGIDs are restored late when all tasks are forked and + * we can call setpgid() on custom values. 
+ */ + + if (vpid(current) == current->sid) { + pr_info("Restoring %d to %d sid\n", vpid(current), current->sid); + sid = setsid(); + if (sid != current->sid) { + pr_perror("Can't restore sid (%d)", sid); + exit(1); + } + } else { + sid = getsid(0); + if (sid != current->sid) { + /* Skip the root task if it's not init */ + if (current == root_item && vpid(root_item) != INIT_PID) + return; + pr_err("Requested sid %d doesn't match inherited %d\n", + current->sid, sid); + exit(1); + } + } +} + +static void restore_pgid(void) +{ + /* + * Unlike sessions, process groups (a.k.a. pgids) can be joined + * by any task, provided the task with pid == pgid (group leader) + * exists. Thus, in order to restore pgid we must make sure that + * group leader was born and created the group, then join one. + * + * We do this _before_ finishing the forking stage to make sure + * helpers are still with us. + */ + + pid_t pgid, my_pgid = current->pgid; + + pr_info("Restoring %d to %d pgid\n", vpid(current), my_pgid); + + pgid = getpgrp(); + if (my_pgid == pgid) + return; + + if (my_pgid != vpid(current)) { + struct pstree_item *leader; + + /* + * Wait for leader to become such. + * Missing leader means we're going to crtools + * group (-j option). 
+ */ + + leader = rsti(current)->pgrp_leader; + if (leader) { + BUG_ON(my_pgid != vpid(leader)); + futex_wait_until(&rsti(leader)->pgrp_set, 1); + } + } + + pr_info("\twill call setpgid, mine pgid is %d\n", pgid); + if (setpgid(0, my_pgid) != 0) { + pr_perror("Can't restore pgid (%d/%d->%d)", vpid(current), pgid, current->pgid); + exit(1); + } + + if (my_pgid == vpid(current)) + futex_set_and_wake(&rsti(current)->pgrp_set, 1); +} + +static int mount_proc(void) +{ + int fd, ret; + char proc_mountpoint[] = "crtools-proc.XXXXXX"; + + if (root_ns_mask == 0) + fd = ret = open("/proc", O_DIRECTORY); + else { + if (mkdtemp(proc_mountpoint) == NULL) { + pr_perror("mkdtemp failed %s", proc_mountpoint); + return -1; + } + + pr_info("Mount procfs in %s\n", proc_mountpoint); + if (mount("proc", proc_mountpoint, "proc", MS_MGC_VAL | MS_NOSUID | MS_NOEXEC | MS_NODEV, NULL)) { + pr_perror("mount failed"); + rmdir(proc_mountpoint); + return -1; + } + + ret = fd = open_detach_mount(proc_mountpoint); + } + + if (fd >= 0) { + ret = set_proc_fd(fd); + close(fd); + } + + return ret; +} + +/* + * Tasks cannot change sid (session id) arbitrary, but can either + * inherit one from ancestor, or create a new one with id equal to + * their pid. Thus sid-s restore is tied with children creation. 
+ */ + +static int create_children_and_session(void) +{ + int ret; + struct pstree_item *child; + + pr_info("Restoring children in alien sessions:\n"); + list_for_each_entry(child, ¤t->children, sibling) { + if (!restore_before_setsid(child)) + continue; + + BUG_ON(child->born_sid != -1 && getsid(0) != child->born_sid); + + ret = fork_with_pid(child); + if (ret < 0) + return ret; + } + + if (current->parent) + restore_sid(); + + pr_info("Restoring children in our session:\n"); + list_for_each_entry(child, ¤t->children, sibling) { + if (restore_before_setsid(child)) + continue; + + ret = fork_with_pid(child); + if (ret < 0) + return ret; + } + + return 0; +} + +static int restore_task_with_children(void *_arg) +{ + struct cr_clone_arg *ca = _arg; + pid_t pid; + int ret; + + current = ca->item; + + if (current != root_item) { + char buf[12]; + int fd; + + /* Determine PID in CRIU's namespace */ + fd = get_service_fd(CR_PROC_FD_OFF); + if (fd < 0) + goto err; + + ret = readlinkat(fd, "self", buf, sizeof(buf) - 1); + if (ret < 0) { + pr_perror("Unable to read the /proc/self link"); + goto err; + } + buf[ret] = '\0'; + + current->pid->real = atoi(buf); + pr_debug("PID: real %d virt %d\n", + current->pid->real, vpid(current)); + } + + pid = getpid(); + if (vpid(current) != pid) { + pr_err("Pid %d do not match expected %d\n", pid, vpid(current)); + set_task_cr_err(EEXIST); + goto err; + } + + if (log_init_by_pid(vpid(current))) + return -1; + + if (current->parent == NULL) { + /* + * The root task has to be in its namespaces before executing + * ACT_SETUP_NS scripts, so the root netns has to be created here + */ + if (root_ns_mask & CLONE_NEWNET) { + struct ns_id *ns = net_get_root_ns(); + if (ns->ext_key) + ret = net_set_ext(ns); + else + ret = unshare(CLONE_NEWNET); + if (ret) { + pr_perror("Can't unshare net-namespace"); + goto err; + } + } + + /* Wait prepare_userns */ + if (restore_finish_ns_stage(CR_STATE_ROOT_TASK, CR_STATE_PREPARE_NAMESPACES) < 0) + goto err; + } 
+ + if (needs_prep_creds(current) && (prepare_userns_creds())) + goto err; + + /* + * Call this _before_ forking to optimize cgroups + * restore -- if all tasks live in one set of cgroups + * we will only move the root one there, others will + * just have it inherited. + */ + if (prepare_task_cgroup(current) < 0) + goto err; + + /* Restore root task */ + if (current->parent == NULL) { + if (join_namespaces()) { + pr_perror("Join namespaces failed"); + goto err; + } + + pr_info("Calling restore_sid() for init\n"); + restore_sid(); + + /* + * We need non /proc proc mount for restoring pid and mount + * namespaces and do not care for the rest of the cases. + * Thus -- mount proc at custom location for any new namespace + */ + if (mount_proc()) + goto err; + + if (!files_collected() && collect_image(&tty_cinfo)) + goto err; + if (collect_images(before_ns_cinfos, ARRAY_SIZE(before_ns_cinfos))) + goto err; + + if (prepare_namespace(current, ca->clone_flags)) + goto err; + + if (restore_finish_ns_stage(CR_STATE_PREPARE_NAMESPACES, CR_STATE_FORKING) < 0) + goto err; + + if (root_prepare_shared()) + goto err; + + if (populate_root_fd_off()) + goto err; + } + + if (setup_newborn_fds(current)) + goto err; + + if (restore_task_mnt_ns(current)) + goto err; + + if (prepare_mappings(current)) + goto err; + + if (prepare_sigactions(ca->core) < 0) + goto err; + + if (fault_injected(FI_RESTORE_ROOT_ONLY)) { + pr_info("fault: Restore root task failure!\n"); + kill(getpid(), SIGKILL); + } + + if (open_transport_socket()) + goto err; + + timing_start(TIME_FORK); + + if (create_children_and_session()) + goto err; + + timing_stop(TIME_FORK); + + if (populate_pid_proc()) + goto err; + + sfds_protected = true; + + if (unmap_guard_pages(current)) + goto err; + + restore_pgid(); + + if (current->parent == NULL) { + /* + * Wait when all tasks passed the CR_STATE_FORKING stage. + * The stage was started by criu, but now it waits for + * the CR_STATE_RESTORE to finish. 
See comment near the + * CR_STATE_FORKING macro for details. + * + * It means that all tasks entered into their namespaces. + */ + if (restore_wait_other_tasks()) + goto err; + fini_restore_mntns(); + __restore_switch_stage(CR_STATE_RESTORE); + } else { + if (restore_finish_stage(task_entries, CR_STATE_FORKING) < 0) + goto err; + } + + if (restore_one_task(vpid(current), ca->core)) + goto err; + + return 0; + +err: + if (current->parent == NULL) + futex_abort_and_wake(&task_entries->nr_in_progress); + exit(1); +} + +static int attach_to_tasks(bool root_seized) +{ + struct pstree_item *item; + + for_each_pstree_item(item) { + int status, i; + + if (!task_alive(item)) + continue; + + if (item->nr_threads == 1) { + item->threads[0].real = item->pid->real; + } else { + if (parse_threads(item->pid->real, &item->threads, &item->nr_threads)) + return -1; + } + + for (i = 0; i < item->nr_threads; i++) { + pid_t pid = item->threads[i].real; + + if (item != root_item || !root_seized || i != 0) { + if (ptrace(PTRACE_SEIZE, pid, 0, 0)) { + pr_perror("Can't attach to %d", pid); + return -1; + } + } + if (ptrace(PTRACE_INTERRUPT, pid, 0, 0)) { + pr_perror("Can't interrupt the %d task", pid); + return -1; + } + + + if (wait4(pid, &status, __WALL, NULL) != pid) { + pr_perror("waitpid(%d) failed", pid); + return -1; + } + + /* + * Suspend seccomp if necessary. We need to do this because + * although seccomp is restored at the very end of the + * restorer blob (and the final sigreturn is ok), here we're + * doing an munmap in the process, which may be blocked by + * seccomp and cause the task to be killed. 
+ */ + if (rsti(item)->has_seccomp && ptrace_suspend_seccomp(pid) < 0) + pr_err("failed to suspend seccomp, restore will probably fail...\n"); + + if (ptrace(PTRACE_CONT, pid, NULL, NULL) ) { + pr_perror("Unable to resume %d", pid); + return -1; + } + } + } + + return 0; +} + +static int catch_tasks(bool root_seized, enum trace_flags *flag) +{ + struct pstree_item *item; + + for_each_pstree_item(item) { + int status, i, ret; + + if (!task_alive(item)) + continue; + + if (item->nr_threads == 1) { + item->threads[0].real = item->pid->real; + } else { + if (parse_threads(item->pid->real, &item->threads, &item->nr_threads)) + return -1; + } + + for (i = 0; i < item->nr_threads; i++) { + pid_t pid = item->threads[i].real; + + if (ptrace(PTRACE_INTERRUPT, pid, 0, 0)) { + pr_perror("Can't interrupt the %d task", pid); + return -1; + } + + if (wait4(pid, &status, __WALL, NULL) != pid) { + pr_perror("waitpid(%d) failed", pid); + return -1; + } + + ret = compel_stop_pie(pid, rsti(item)->breakpoint, + flag, fault_injected(FI_NO_BREAKPOINTS)); + if (ret < 0) + return -1; + } + } + + return 0; +} + +static int clear_breakpoints() +{ + struct pstree_item *item; + int ret = 0, i; + + if (fault_injected(FI_NO_BREAKPOINTS)) + return 0; + + for_each_pstree_item(item) { + if (!task_alive(item)) + continue; + for (i = 0; i < item->nr_threads; i++) + ret |= ptrace_flush_breakpoints(item->threads[i].real); + } + + return ret; +} + +static void finalize_restore(void) +{ + struct pstree_item *item; + + for_each_pstree_item(item) { + pid_t pid = item->pid->real; + struct parasite_ctl *ctl; + + if (!task_alive(item)) + continue; + + /* Unmap the restorer blob */ + ctl = compel_prepare_noctx(pid); + if (ctl == NULL) + continue; + + compel_unmap(ctl, (unsigned long)rsti(item)->munmap_restorer); + + xfree(ctl); + + if ((item->pid->state == TASK_STOPPED) || + (opts.final_state == TASK_STOPPED)) + kill(item->pid->real, SIGSTOP); + } +} + +static void finalize_restore_detach(int status) +{ + 
struct pstree_item *item; + + for_each_pstree_item(item) { + pid_t pid; + int i; + + if (!task_alive(item)) + continue; + + for (i = 0; i < item->nr_threads; i++) { + pid = item->threads[i].real; + if (pid < 0) { + BUG_ON(status >= 0); + break; + } + + if (arch_set_thread_regs_nosigrt(&item->threads[i])) + pr_perror("Restoring regs for %d failed", pid); + if (ptrace(PTRACE_DETACH, pid, NULL, 0)) + pr_perror("Unable to execute %d", pid); + } + } +} + +static void ignore_kids(void) +{ + struct sigaction sa = { .sa_handler = SIG_DFL }; + + if (sigaction(SIGCHLD, &sa, NULL) < 0) + pr_perror("Restoring CHLD sigaction failed"); +} + +static unsigned int saved_loginuid; + +static int prepare_userns_hook(void) +{ + int ret; + + if (kdat.luid != LUID_FULL) + return 0; + /* + * Save old loginuid and set it to INVALID_UID: + * this value means that loginuid is unset and it will be inherited. + * After you set some value to /proc/<>/loginuid it can't be changed + * inside container due to permissions. + * But you still can set this value if it was unset. 
+ */ + saved_loginuid = parse_pid_loginuid(getpid(), &ret, false); + if (ret < 0) + return -1; + + if (prepare_loginuid(INVALID_UID, LOG_ERROR) < 0) { + pr_err("Setting loginuid for CT init task failed, CAP_AUDIT_CONTROL?\n"); + return -1; + } + return 0; +} + +static void restore_origin_ns_hook(void) +{ + if (kdat.luid != LUID_FULL) + return; + + /* not critical: it does not affect CT in any way */ + if (prepare_loginuid(saved_loginuid, LOG_ERROR) < 0) + pr_err("Restore original /proc/self/loginuid failed\n"); +} + +static int write_restored_pid(void) +{ + int pid; + + if (!opts.pidfile) + return 0; + + pid = root_item->pid->real; + + if (write_pidfile(pid) < 0) { + pr_perror("Can't write pidfile"); + return -1; + } + + return 0; +} + +static int restore_root_task(struct pstree_item *init) +{ + enum trace_flags flag = TRACE_ALL; + int ret, fd, mnt_ns_fd = -1; + int root_seized = 0; + struct pstree_item *item; + + ret = run_scripts(ACT_PRE_RESTORE); + if (ret != 0) { + pr_err("Aborting restore due to pre-restore script ret code %d\n", ret); + return -1; + } + + fd = open("/proc", O_DIRECTORY | O_RDONLY); + if (fd < 0) { + pr_perror("Unable to open /proc"); + return -1; + } + + ret = install_service_fd(CR_PROC_FD_OFF, fd); + if (ret < 0) + return -1; + + /* + * FIXME -- currently we assume that all the tasks live + * in the same set of namespaces. This is done to debug + * the ns contents dumping/restoring. Need to revisit + * this later. 
+ */ + + if (vpid(init) == INIT_PID) { + if (!(root_ns_mask & CLONE_NEWPID)) { + pr_err("This process tree can only be restored " + "in a new pid namespace.\n" + "criu should be re-executed with the " + "\"--namespace pid\" option.\n"); + return -1; + } + } else if (root_ns_mask & CLONE_NEWPID) { + pr_err("Can't restore pid namespace without the process init\n"); + return -1; + } + + if (prepare_userns_hook()) + return -1; + + if (prepare_namespace_before_tasks()) + return -1; + + __restore_switch_stage_nw(CR_STATE_ROOT_TASK); + + ret = fork_with_pid(init); + if (ret < 0) + goto out; + + restore_origin_ns_hook(); + + if (rsti(init)->clone_flags & CLONE_PARENT) { + struct sigaction act; + + root_seized = 1; + /* + * Root task will be our sibling. This means, that + * we will not notice when (if) it dies in SIGCHLD + * handler, but we should. To do this -- attach to + * the guy with ptrace (below) and (!) make the kernel + * deliver us the signal when it will get stopped. + * It will in case of e.g. segfault before handling + * the signal. + */ + sigaction(SIGCHLD, NULL, &act); + act.sa_flags &= ~SA_NOCLDSTOP; + sigaction(SIGCHLD, &act, NULL); + + if (ptrace(PTRACE_SEIZE, init->pid->real, 0, 0)) { + pr_perror("Can't attach to init"); + goto out_kill; + } + } + + if (!root_ns_mask) + goto skip_ns_bouncing; + + /* + * uid_map and gid_map must be filled from a parent user namespace. + * prepare_userns_creds() must be called after filling mappings. 
+ */ + if ((root_ns_mask & CLONE_NEWUSER) && prepare_userns(init)) + goto out_kill; + + pr_info("Wait until namespaces are created\n"); + ret = restore_wait_inprogress_tasks(); + if (ret) + goto out_kill; + + ret = run_scripts(ACT_SETUP_NS); + if (ret) + goto out_kill; + + ret = restore_switch_stage(CR_STATE_PREPARE_NAMESPACES); + if (ret) + goto out_kill; + + if (root_ns_mask & CLONE_NEWNS) { + mnt_ns_fd = open_proc(init->pid->real, "ns/mnt"); + if (mnt_ns_fd < 0) + goto out_kill; + } + + if (root_ns_mask & opts.empty_ns & CLONE_NEWNET) { + /* + * Local TCP connections were locked by network_lock_internal() + * on dump and normally should have been C/R-ed by respectively + * dump_iptables() and restore_iptables() in net.c. However in + * the '--empty-ns net' mode no iptables C/R is done and we + * need to return these rules by hands. + */ + ret = network_lock_internal(); + if (ret) + goto out_kill; + } + + ret = run_scripts(ACT_POST_SETUP_NS); + if (ret) + goto out_kill; + + __restore_switch_stage(CR_STATE_FORKING); + +skip_ns_bouncing: + + ret = restore_wait_inprogress_tasks(); + if (ret < 0) + goto out_kill; + + /* + * Zombies die after CR_STATE_RESTORE which is switched + * by root task, not by us. See comment before CR_STATE_FORKING + * in the header for details. 
+ */ + for_each_pstree_item(item) { + if (item->pid->state == TASK_DEAD) + task_entries->nr_threads--; + } + + ret = restore_switch_stage(CR_STATE_RESTORE_SIGCHLD); + if (ret < 0) + goto out_kill; + + ret = stop_usernsd(); + if (ret < 0) + goto out_kill; + + ret = move_veth_to_bridge(); + if (ret < 0) + goto out_kill; + + ret = prepare_cgroup_properties(); + if (ret < 0) + goto out_kill; + + if (fault_injected(FI_POST_RESTORE)) + goto out_kill; + + ret = run_scripts(ACT_POST_RESTORE); + if (ret != 0) { + pr_err("Aborting restore due to post-restore script ret code %d\n", ret); + timing_stop(TIME_RESTORE); + write_stats(RESTORE_STATS); + goto out_kill; + } + + /* + * There is no need to call try_clean_remaps() after this point, + * as restore went OK and all ghosts were removed by the openers. + */ + if (depopulate_roots_yard(mnt_ns_fd, false)) + goto out_kill; + + close_safe(&mnt_ns_fd); + + if (write_restored_pid()) + goto out_kill; + + /* Unlock network before disabling repair mode on sockets */ + network_unlock(); + + /* + * Stop getting sigchld, after we resume the tasks they + * may start to exit poking criu in vain. + */ + ignore_kids(); + + /* + * ------------------------------------------------------------- + * Below this line nothing should fail, because network is unlocked + */ + attach_to_tasks(root_seized); + + ret = restore_switch_stage(CR_STATE_RESTORE_CREDS); + BUG_ON(ret); + + timing_stop(TIME_RESTORE); + + ret = catch_tasks(root_seized, &flag); + + if (lazy_pages_finish_restore()) + goto out_kill; + + pr_info("Restore finished successfully. 
Resuming tasks.\n"); + __restore_switch_stage(CR_STATE_COMPLETE); + + if (ret == 0) + ret = compel_stop_on_syscall(task_entries->nr_threads, + __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag); + + if (clear_breakpoints()) + pr_err("Unable to flush breakpoints\n"); + + if (ret == 0) + finalize_restore(); + + ret = run_scripts(ACT_PRE_RESUME); + if (ret) + pr_err("Pre-resume script ret code %d\n", ret); + + if (restore_freezer_state()) + pr_err("Unable to restore freezer state\n"); + + fini_cgroup(); + + /* Detaches from processes and they continue run through sigreturn. */ + finalize_restore_detach(ret); + + write_stats(RESTORE_STATS); + + ret = run_scripts(ACT_POST_RESUME); + if (ret != 0) + pr_err("Post-resume script ret code %d\n", ret); + + if (!opts.restore_detach && !opts.exec_cmd) + wait(NULL); + + return 0; + +out_kill: + /* + * The processes can be killed only when all of them have been created, + * otherwise an external processes can be killed. + */ + if (root_ns_mask & CLONE_NEWPID) { + int status; + + /* Kill init */ + if (root_item->pid->real > 0) + kill(root_item->pid->real, SIGKILL); + + if (waitpid(root_item->pid->real, &status, 0) < 0) + pr_warn("Unable to wait %d: %s\n", + root_item->pid->real, strerror(errno)); + } else { + struct pstree_item *pi; + + for_each_pstree_item(pi) + if (pi->pid->real > 0) + kill(pi->pid->real, SIGKILL); + } + +out: + fini_cgroup(); + depopulate_roots_yard(mnt_ns_fd, true); + stop_usernsd(); + __restore_switch_stage(CR_STATE_FAIL); + pr_err("Restoring FAILED.\n"); + return -1; +} + +int prepare_task_entries(void) +{ + task_entries_pos = rst_mem_align_cpos(RM_SHREMAP); + task_entries = rst_mem_alloc(sizeof(*task_entries), RM_SHREMAP); + if (!task_entries) { + pr_perror("Can't map shmem"); + return -1; + } + + task_entries->nr_threads = 0; + task_entries->nr_tasks = 0; + task_entries->nr_helpers = 0; + futex_set(&task_entries->start, CR_STATE_FAIL); + mutex_init(&task_entries->userns_sync_lock); + 
mutex_init(&task_entries->last_pid_mutex); + + return 0; +} + +int prepare_dummy_task_state(struct pstree_item *pi) +{ + CoreEntry *core; + + if (open_core(vpid(pi), &core)) + return -1; + + pi->pid->state = core->tc->task_state; + core_entry__free_unpacked(core, NULL); + + return 0; +} + +int cr_restore_tasks(void) +{ + int ret = -1; + + if (init_service_fd()) + return 1; + + if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE)) + return -1; + + if (check_img_inventory() < 0) + goto err; + + if (init_stats(RESTORE_STATS)) + goto err; + + if (lsm_check_opts()) + goto err; + + timing_start(TIME_RESTORE); + + if (cpu_init() < 0) + goto err; + + if (vdso_init_restore()) + goto err; + + if (opts.cpu_cap & CPU_CAP_IMAGE) { + if (cpu_validate_cpuinfo()) + goto err; + } + + if (prepare_task_entries() < 0) + goto err; + + if (prepare_pstree() < 0) + goto err; + + if (fdstore_init()) + goto err; + + if (inherit_fd_move_to_fdstore()) + goto err; + + if (crtools_prepare_shared() < 0) + goto err; + + if (criu_signals_setup() < 0) + goto err; + + if (prepare_lazy_pages_socket() < 0) + goto err; + + ret = restore_root_task(root_item); + + if (opts.remote && (finish_remote_restore() < 0)) { + pr_err("Finish remote restore failed.\n"); + goto err; + } +err: + cr_plugin_fini(CR_PLUGIN_STAGE__RESTORE, ret); + return ret; +} + +static long restorer_get_vma_hint(struct list_head *tgt_vma_list, + struct list_head *self_vma_list, long vma_len) +{ + struct vma_area *t_vma, *s_vma; + long prev_vma_end = 0; + struct vma_area end_vma; + VmaEntry end_e; + + end_vma.e = &end_e; + end_e.start = end_e.end = kdat.task_size; + prev_vma_end = kdat.mmap_min_addr; + + s_vma = list_first_entry(self_vma_list, struct vma_area, list); + t_vma = list_first_entry(tgt_vma_list, struct vma_area, list); + + while (1) { + if (prev_vma_end + vma_len > s_vma->e->start) { + if (s_vma->list.next == self_vma_list) { + s_vma = &end_vma; + continue; + } + if (s_vma == &end_vma) + break; + if (prev_vma_end < s_vma->e->end) + 
prev_vma_end = s_vma->e->end; + s_vma = vma_next(s_vma); + continue; + } + + if (prev_vma_end + vma_len > t_vma->e->start) { + if (t_vma->list.next == tgt_vma_list) { + t_vma = &end_vma; + continue; + } + if (t_vma == &end_vma) + break; + if (prev_vma_end < t_vma->e->end) + prev_vma_end = t_vma->e->end; + t_vma = vma_next(t_vma); + continue; + } + + return prev_vma_end; + } + + return -1; +} + +static inline int timeval_valid(struct timeval *tv) +{ + return (tv->tv_sec >= 0) && ((unsigned long)tv->tv_usec < USEC_PER_SEC); +} + +static inline int decode_itimer(char *n, ItimerEntry *ie, struct itimerval *val) +{ + if (ie->isec == 0 && ie->iusec == 0) { + memzero_p(val); + return 0; + } + + val->it_interval.tv_sec = ie->isec; + val->it_interval.tv_usec = ie->iusec; + + if (!timeval_valid(&val->it_interval)) { + pr_err("Invalid timer interval\n"); + return -1; + } + + if (ie->vsec == 0 && ie->vusec == 0) { + /* + * Remaining time was too short. Set it to + * interval to make the timer armed and work. 
+ */ + val->it_value.tv_sec = ie->isec; + val->it_value.tv_usec = ie->iusec; + } else { + val->it_value.tv_sec = ie->vsec; + val->it_value.tv_usec = ie->vusec; + } + + if (!timeval_valid(&val->it_value)) { + pr_err("Invalid timer value\n"); + return -1; + } + + pr_info("Restored %s timer to %ld.%ld -> %ld.%ld\n", n, + val->it_value.tv_sec, val->it_value.tv_usec, + val->it_interval.tv_sec, val->it_interval.tv_usec); + + return 0; +} + +/* + * Legacy itimers restore from CR_FD_ITIMERS + */ + +static int prepare_itimers_from_fd(int pid, struct task_restore_args *args) +{ + int ret = -1; + struct cr_img *img; + ItimerEntry *ie; + + if (!deprecated_ok("Itimers")) + return -1; + + img = open_image(CR_FD_ITIMERS, O_RSTR, pid); + if (!img) + return -1; + + ret = pb_read_one(img, &ie, PB_ITIMER); + if (ret < 0) + goto out; + ret = decode_itimer("real", ie, &args->itimers[0]); + itimer_entry__free_unpacked(ie, NULL); + if (ret < 0) + goto out; + + ret = pb_read_one(img, &ie, PB_ITIMER); + if (ret < 0) + goto out; + ret = decode_itimer("virt", ie, &args->itimers[1]); + itimer_entry__free_unpacked(ie, NULL); + if (ret < 0) + goto out; + + ret = pb_read_one(img, &ie, PB_ITIMER); + if (ret < 0) + goto out; + ret = decode_itimer("prof", ie, &args->itimers[2]); + itimer_entry__free_unpacked(ie, NULL); + if (ret < 0) + goto out; +out: + close_image(img); + return ret; +} + +static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core) +{ + int ret = 0; + TaskTimersEntry *tte = core->tc->timers; + + if (!tte) + return prepare_itimers_from_fd(pid, args); + + ret |= decode_itimer("real", tte->real, &args->itimers[0]); + ret |= decode_itimer("virt", tte->virt, &args->itimers[1]); + ret |= decode_itimer("prof", tte->prof, &args->itimers[2]); + + return ret; +} + +static inline int timespec_valid(struct timespec *ts) +{ + return (ts->tv_sec >= 0) && ((unsigned long)ts->tv_nsec < NSEC_PER_SEC); +} + +static inline int decode_posix_timer(PosixTimerEntry *pte, + 
struct restore_posix_timer *pt) +{ + pt->val.it_interval.tv_sec = pte->isec; + pt->val.it_interval.tv_nsec = pte->insec; + + if (!timespec_valid(&pt->val.it_interval)) { + pr_err("Invalid timer interval(posix)\n"); + return -1; + } + + if (pte->vsec == 0 && pte->vnsec == 0) { + /* + * Remaining time was too short. Set it to + * interval to make the timer armed and work. + */ + pt->val.it_value.tv_sec = pte->isec; + pt->val.it_value.tv_nsec = pte->insec; + } else { + pt->val.it_value.tv_sec = pte->vsec; + pt->val.it_value.tv_nsec = pte->vnsec; + } + + if (!timespec_valid(&pt->val.it_value)) { + pr_err("Invalid timer value(posix)\n"); + return -1; + } + + pt->spt.it_id = pte->it_id; + pt->spt.clock_id = pte->clock_id; + pt->spt.si_signo = pte->si_signo; + pt->spt.it_sigev_notify = pte->it_sigev_notify; + pt->spt.sival_ptr = decode_pointer(pte->sival_ptr); + pt->overrun = pte->overrun; + + return 0; +} + +static int cmp_posix_timer_proc_id(const void *p1, const void *p2) +{ + return ((struct restore_posix_timer *)p1)->spt.it_id - ((struct restore_posix_timer *)p2)->spt.it_id; +} + +static void sort_posix_timers(struct task_restore_args *ta) +{ + void *tmem; + + /* + * This is required for restorer's create_posix_timers(), + * it will probe them one-by-one for the desired ID, since + * kernel doesn't provide another API for timer creation + * with given ID. 
+ */ + + if (ta->posix_timers_n > 0) { + tmem = rst_mem_remap_ptr((unsigned long)ta->posix_timers, RM_PRIVATE); + qsort(tmem, ta->posix_timers_n, + sizeof(struct restore_posix_timer), + cmp_posix_timer_proc_id); + } +} + +/* + * Legacy posix timers restoration from CR_FD_POSIX_TIMERS + */ + +static int prepare_posix_timers_from_fd(int pid, struct task_restore_args *ta) +{ + struct cr_img *img; + int ret = -1; + struct restore_posix_timer *t; + + if (!deprecated_ok("Posix timers")) + return -1; + + img = open_image(CR_FD_POSIX_TIMERS, O_RSTR, pid); + if (!img) + return -1; + + ta->posix_timers_n = 0; + while (1) { + PosixTimerEntry *pte; + + ret = pb_read_one_eof(img, &pte, PB_POSIX_TIMER); + if (ret <= 0) + break; + + t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); + if (!t) + break; + + ret = decode_posix_timer(pte, t); + if (ret < 0) + break; + + posix_timer_entry__free_unpacked(pte, NULL); + ta->posix_timers_n++; + } + + close_image(img); + if (!ret) + sort_posix_timers(ta); + + return ret; +} + +static int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core) +{ + int i, ret = -1; + TaskTimersEntry *tte = core->tc->timers; + struct restore_posix_timer *t; + + ta->posix_timers = (struct restore_posix_timer *)rst_mem_align_cpos(RM_PRIVATE); + + if (!tte) + return prepare_posix_timers_from_fd(pid, ta); + + ta->posix_timers_n = tte->n_posix; + for (i = 0; i < ta->posix_timers_n; i++) { + t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); + if (!t) + goto out; + + if (decode_posix_timer(tte->posix[i], t)) + goto out; + } + + ret = 0; + sort_posix_timers(ta); +out: + return ret; +} + +static inline int verify_cap_size(CredsEntry *ce) +{ + return ((ce->n_cap_inh == CR_CAP_SIZE) && (ce->n_cap_eff == CR_CAP_SIZE) && + (ce->n_cap_prm == CR_CAP_SIZE) && (ce->n_cap_bnd == CR_CAP_SIZE)); +} + +static int prepare_mm(pid_t pid, struct task_restore_args *args) +{ + int exe_fd, i, ret = -1; + MmEntry *mm = 
rsti(current)->mm; + + args->mm = *mm; + args->mm.n_mm_saved_auxv = 0; + args->mm.mm_saved_auxv = NULL; + + if (mm->n_mm_saved_auxv > AT_VECTOR_SIZE) { + pr_err("Image corrupted on pid %d\n", pid); + goto out; + } + + args->mm_saved_auxv_size = mm->n_mm_saved_auxv*sizeof(auxv_t); + for (i = 0; i < mm->n_mm_saved_auxv; ++i) { + args->mm_saved_auxv[i] = (auxv_t)mm->mm_saved_auxv[i]; + } + + exe_fd = open_reg_by_id(mm->exe_file_id); + if (exe_fd < 0) + goto out; + + args->fd_exe_link = exe_fd; + + args->has_thp_enabled = rsti(current)->has_thp_enabled; + + ret = 0; +out: + return ret; +} + +static void *restorer; +static unsigned long restorer_len; + +static int prepare_restorer_blob(void) +{ + /* + * We map anonymous mapping, not mremap the restorer itself later. + * Otherwise the restorer vma would be tied to criu binary which + * in turn will lead to set-exe-file prctl to fail with EBUSY. + */ + + restorer_len = pie_size(restorer); + restorer = mmap(NULL, restorer_len, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (restorer == MAP_FAILED) { + pr_perror("Can't map restorer code"); + return -1; + } + + memcpy(restorer, &restorer_blob, sizeof(restorer_blob)); + return 0; +} + +static int remap_restorer_blob(void *addr) +{ + void *mem; + + mem = mremap(restorer, restorer_len, restorer_len, + MREMAP_FIXED | MREMAP_MAYMOVE, addr); + if (mem != addr) { + pr_perror("Can't remap restorer blob"); + return -1; + } + + compel_relocs_apply(addr, addr, sizeof(restorer_blob), + restorer_relocs, ARRAY_SIZE(restorer_relocs)); + + return 0; +} + +static int validate_sched_parm(struct rst_sched_param *sp) +{ + if ((sp->nice < -20) || (sp->nice > 19)) + return 0; + + switch (sp->policy) { + case SCHED_RR: + case SCHED_FIFO: + return ((sp->prio > 0) && (sp->prio < 100)); + case SCHED_IDLE: + case SCHED_OTHER: + case SCHED_BATCH: + return sp->prio == 0; + } + + return 0; +} + +static int prep_sched_info(struct rst_sched_param *sp, ThreadCoreEntry *tc) 
+{ + if (!tc->has_sched_policy) { + sp->policy = SCHED_OTHER; + sp->nice = 0; + return 0; + } + + sp->policy = tc->sched_policy; + sp->nice = tc->sched_nice; + sp->prio = tc->sched_prio; + + if (!validate_sched_parm(sp)) { + pr_err("Inconsistent sched params received (%d.%d.%d)\n", + sp->policy, sp->nice, sp->prio); + return -1; + } + + return 0; +} + +static rlim_t decode_rlim(rlim_t ival) +{ + return ival == -1 ? RLIM_INFINITY : ival; +} + +/* + * Legacy rlimits restore from CR_FD_RLIMIT + */ + +static int prepare_rlimits_from_fd(int pid, struct task_restore_args *ta) +{ + struct rlimit *r; + int ret; + struct cr_img *img; + + if (!deprecated_ok("Rlimits")) + return -1; + + /* + * Old image -- read from the file. + */ + img = open_image(CR_FD_RLIMIT, O_RSTR, pid); + if (!img) + return -1; + + ta->rlims_n = 0; + while (1) { + RlimitEntry *re; + + ret = pb_read_one_eof(img, &re, PB_RLIMIT); + if (ret <= 0) + break; + + r = rst_mem_alloc(sizeof(*r), RM_PRIVATE); + if (!r) { + pr_err("Can't allocate memory for resource %d\n", + ta->rlims_n); + return -1; + } + + r->rlim_cur = decode_rlim(re->cur); + r->rlim_max = decode_rlim(re->max); + if (r->rlim_cur > r->rlim_max) { + pr_err("Can't restore cur > max for %d.%d\n", + pid, ta->rlims_n); + r->rlim_cur = r->rlim_max; + } + + rlimit_entry__free_unpacked(re, NULL); + + ta->rlims_n++; + } + + close_image(img); + + return 0; +} + +static int prepare_rlimits(int pid, struct task_restore_args *ta, CoreEntry *core) +{ + int i; + TaskRlimitsEntry *rls = core->tc->rlimits; + struct rlimit64 *r; + + ta->rlims = (struct rlimit64 *)rst_mem_align_cpos(RM_PRIVATE); + + if (!rls) + return prepare_rlimits_from_fd(pid, ta); + + for (i = 0; i < rls->n_rlimits; i++) { + r = rst_mem_alloc(sizeof(*r), RM_PRIVATE); + if (!r) { + pr_err("Can't allocate memory for resource %d\n", i); + return -1; + } + + r->rlim_cur = decode_rlim(rls->rlimits[i]->cur); + r->rlim_max = decode_rlim(rls->rlimits[i]->max); + + if (r->rlim_cur > r->rlim_max) { + 
pr_warn("Can't restore cur > max for %d.%d\n", pid, i); + r->rlim_cur = r->rlim_max; + } + } + + ta->rlims_n = rls->n_rlimits; + return 0; +} + +static int signal_to_mem(SiginfoEntry *sie) +{ + siginfo_t *info, *t; + + info = (siginfo_t *) sie->siginfo.data; + t = rst_mem_alloc(sizeof(siginfo_t), RM_PRIVATE); + if (!t) + return -1; + + memcpy(t, info, sizeof(*info)); + + return 0; +} + +static int open_signal_image(int type, pid_t pid, unsigned int *nr) +{ + int ret; + struct cr_img *img; + + img = open_image(type, O_RSTR, pid); + if (!img) + return -1; + + *nr = 0; + while (1) { + SiginfoEntry *sie; + + ret = pb_read_one_eof(img, &sie, PB_SIGINFO); + if (ret <= 0) + break; + if (sie->siginfo.len != sizeof(siginfo_t)) { + pr_err("Unknown image format\n"); + ret = -1; + break; + } + + ret = signal_to_mem(sie); + if (ret) + break; + + (*nr)++; + + siginfo_entry__free_unpacked(sie, NULL); + } + + close_image(img); + + return ret ? : 0; +} + +static int prepare_one_signal_queue(SignalQueueEntry *sqe, unsigned int *nr) +{ + int i; + + for (i = 0; i < sqe->n_signals; i++) + if (signal_to_mem(sqe->signals[i])) + return -1; + + *nr = sqe->n_signals; + + return 0; +} + +static unsigned int *siginfo_priv_nr; /* FIXME -- put directly on thread_args */ + +static int prepare_signals(int pid, struct task_restore_args *ta, CoreEntry *leader_core) +{ + int ret = -1, i; + + ta->siginfo = (siginfo_t *)rst_mem_align_cpos(RM_PRIVATE); + siginfo_priv_nr = xmalloc(sizeof(int) * current->nr_threads); + if (siginfo_priv_nr == NULL) + goto out; + + /* Prepare shared signals */ + if (!leader_core->tc->signals_s)/*backward compatibility*/ + ret = open_signal_image(CR_FD_SIGNAL, pid, &ta->siginfo_n); + else + ret = prepare_one_signal_queue(leader_core->tc->signals_s, &ta->siginfo_n); + + if (ret < 0) + goto out; + + for (i = 0; i < current->nr_threads; i++) { + if (!current->core[i]->thread_core->signals_p)/*backward compatibility*/ + ret = open_signal_image(CR_FD_PSIGNAL, + 
current->threads[i].ns[0].virt, &siginfo_priv_nr[i]); + else + ret = prepare_one_signal_queue(current->core[i]->thread_core->signals_p, + &siginfo_priv_nr[i]); + if (ret < 0) + goto out; + } +out: + return ret; +} + +extern void __gcov_flush(void) __attribute__((weak)); +void __gcov_flush(void) {} + +static void rst_reloc_creds(struct thread_restore_args *thread_args, + unsigned long *creds_pos_next) +{ + struct thread_creds_args *args; + + if (unlikely(!*creds_pos_next)) + return; + + args = rst_mem_remap_ptr(*creds_pos_next, RM_PRIVATE); + + if (args->lsm_profile) + args->lsm_profile = rst_mem_remap_ptr(args->mem_lsm_profile_pos, RM_PRIVATE); + if (args->lsm_sockcreate) + args->lsm_sockcreate = rst_mem_remap_ptr(args->mem_lsm_sockcreate_pos, RM_PRIVATE); + if (args->groups) + args->groups = rst_mem_remap_ptr(args->mem_groups_pos, RM_PRIVATE); + + *creds_pos_next = args->mem_pos_next; + thread_args->creds_args = args; +} + +static struct thread_creds_args * +rst_prep_creds_args(CredsEntry *ce, unsigned long *prev_pos) +{ + unsigned long this_pos; + struct thread_creds_args *args; + + if (!verify_cap_size(ce)) { + pr_err("Caps size mismatch %d %d %d %d\n", + (int)ce->n_cap_inh, (int)ce->n_cap_eff, + (int)ce->n_cap_prm, (int)ce->n_cap_bnd); + return ERR_PTR(-EINVAL); + } + + this_pos = rst_mem_align_cpos(RM_PRIVATE); + + args = rst_mem_alloc(sizeof(*args), RM_PRIVATE); + if (!args) + return ERR_PTR(-ENOMEM); + + args->cap_last_cap = kdat.last_cap; + memcpy(&args->creds, ce, sizeof(args->creds)); + + if (ce->lsm_profile || opts.lsm_supplied) { + char *rendered = NULL, *profile; + + profile = ce->lsm_profile; + if (opts.lsm_supplied) + profile = opts.lsm_profile; + + if (validate_lsm(profile) < 0) + return ERR_PTR(-EINVAL); + + if (profile && render_lsm_profile(profile, &rendered)) { + return ERR_PTR(-EINVAL); + } + + if (rendered) { + size_t lsm_profile_len; + char *lsm_profile; + + args->mem_lsm_profile_pos = rst_mem_align_cpos(RM_PRIVATE); + lsm_profile_len = 
strlen(rendered); + lsm_profile = rst_mem_alloc(lsm_profile_len + 1, RM_PRIVATE); + if (!lsm_profile) { + xfree(rendered); + return ERR_PTR(-ENOMEM); + } + + args = rst_mem_remap_ptr(this_pos, RM_PRIVATE); + args->lsm_profile = lsm_profile; + strncpy(args->lsm_profile, rendered, lsm_profile_len); + xfree(rendered); + } + } else { + args->lsm_profile = NULL; + args->mem_lsm_profile_pos = 0; + } + + if (ce->lsm_sockcreate) { + char *rendered = NULL; + char *profile; + + profile = ce->lsm_sockcreate; + + if (validate_lsm(profile) < 0) + return ERR_PTR(-EINVAL); + + if (profile && render_lsm_profile(profile, &rendered)) { + return ERR_PTR(-EINVAL); + } + if (rendered) { + size_t lsm_sockcreate_len; + char *lsm_sockcreate; + + args->mem_lsm_sockcreate_pos = rst_mem_align_cpos(RM_PRIVATE); + lsm_sockcreate_len = strlen(rendered); + lsm_sockcreate = rst_mem_alloc(lsm_sockcreate_len + 1, RM_PRIVATE); + if (!lsm_sockcreate) { + xfree(rendered); + return ERR_PTR(-ENOMEM); + } + + args = rst_mem_remap_ptr(this_pos, RM_PRIVATE); + args->lsm_sockcreate = lsm_sockcreate; + strncpy(args->lsm_sockcreate, rendered, lsm_sockcreate_len); + xfree(rendered); + } + } else { + args->lsm_sockcreate = NULL; + args->mem_lsm_sockcreate_pos = 0; + } + + /* + * Zap fields which we can't use. 
+ */ + args->creds.cap_inh = NULL; + args->creds.cap_eff = NULL; + args->creds.cap_prm = NULL; + args->creds.cap_bnd = NULL; + args->creds.groups = NULL; + args->creds.lsm_profile = NULL; + + memcpy(args->cap_inh, ce->cap_inh, sizeof(args->cap_inh)); + memcpy(args->cap_eff, ce->cap_eff, sizeof(args->cap_eff)); + memcpy(args->cap_prm, ce->cap_prm, sizeof(args->cap_prm)); + memcpy(args->cap_bnd, ce->cap_bnd, sizeof(args->cap_bnd)); + + if (ce->n_groups) { + unsigned int *groups; + + args->mem_groups_pos = rst_mem_align_cpos(RM_PRIVATE); + groups = rst_mem_alloc(ce->n_groups * sizeof(u32), RM_PRIVATE); + if (!groups) + return ERR_PTR(-ENOMEM); + args = rst_mem_remap_ptr(this_pos, RM_PRIVATE); + args->groups = groups; + memcpy(args->groups, ce->groups, ce->n_groups * sizeof(u32)); + } else { + args->groups = NULL; + args->mem_groups_pos = 0; + } + + args->mem_pos_next = 0; + + if (prev_pos) { + if (*prev_pos) { + struct thread_creds_args *prev; + + prev = rst_mem_remap_ptr(*prev_pos, RM_PRIVATE); + prev->mem_pos_next = this_pos; + } + *prev_pos = this_pos; + } + return args; +} + +static int rst_prep_creds_from_img(pid_t pid) +{ + CredsEntry *ce = NULL; + struct cr_img *img; + int ret; + + img = open_image(CR_FD_CREDS, O_RSTR, pid); + if (!img) + return -ENOENT; + + ret = pb_read_one(img, &ce, PB_CREDS); + close_image(img); + + if (ret > 0) { + struct thread_creds_args *args; + + args = rst_prep_creds_args(ce, NULL); + if (IS_ERR(args)) + ret = PTR_ERR(args); + else + ret = 0; + } + creds_entry__free_unpacked(ce, NULL); + return ret; +} + +static int rst_prep_creds(pid_t pid, CoreEntry *core, unsigned long *creds_pos) +{ + struct thread_creds_args *args = NULL; + unsigned long this_pos = 0; + size_t i; + + /* + * This is _really_ very old image + * format where @thread_core were not + * present. It means we don't have + * creds either, just ignore and exit + * early. 
+ */ + if (unlikely(!core->thread_core)) { + *creds_pos = 0; + return 0; + } + + *creds_pos = rst_mem_align_cpos(RM_PRIVATE); + + /* + * Old format: one Creds per task carried in own image file. + */ + if (!core->thread_core->creds) + return rst_prep_creds_from_img(pid); + + for (i = 0; i < current->nr_threads; i++) { + CredsEntry *ce = current->core[i]->thread_core->creds; + + args = rst_prep_creds_args(ce, &this_pos); + if (IS_ERR(args)) + return PTR_ERR(args); + } + + return 0; +} + +static void *restorer_munmap_addr(CoreEntry *core, void *restorer_blob) +{ +#ifdef CONFIG_COMPAT + if (core_is_compat(core)) + return restorer_sym(restorer_blob, arch_export_unmap_compat); +#endif + return restorer_sym(restorer_blob, arch_export_unmap); +} + +static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, unsigned long alen, CoreEntry *core) +{ + void *mem = MAP_FAILED; + void *restore_task_exec_start; + + long new_sp; + long ret; + + long rst_mem_size; + long memzone_size; + + struct thread_restore_args *thread_args; + struct restore_mem_zone *mz; + + struct vdso_maps vdso_maps_rt; + unsigned long vdso_rt_size = 0; + + struct vm_area_list self_vmas; + struct vm_area_list *vmas = &rsti(current)->vmas; + int i, siginfo_n; + + unsigned long creds_pos = 0; + unsigned long creds_pos_next; + + sigset_t blockmask; + + pr_info("Restore via sigreturn\n"); + + /* pr_info_vma_list(&self_vma_list); */ + + BUILD_BUG_ON(sizeof(struct task_restore_args) & 1); + BUILD_BUG_ON(sizeof(struct thread_restore_args) & 1); + + /* + * Read creds info for every thread and allocate memory + * needed so we can use this data inside restorer. 
+ */ + if (rst_prep_creds(pid, core, &creds_pos)) + goto err_nv; + + if (current->parent == NULL) { + /* Wait when all tasks restored all files */ + if (restore_wait_other_tasks()) + goto err_nv; + if (root_ns_mask & CLONE_NEWNS && + remount_readonly_mounts()) + goto err_nv; + } + + /* + * We're about to search for free VM area and inject the restorer blob + * into it. No irrelevant mmaps/mremaps beyond this point, otherwise + * this unwanted mapping might get overlapped by the restorer. + */ + + ret = parse_self_maps_lite(&self_vmas); + if (ret < 0) + goto err; + + rst_mem_size = rst_mem_lock(); + memzone_size = round_up(sizeof(struct restore_mem_zone) * current->nr_threads, page_size()); + task_args->bootstrap_len = restorer_len + memzone_size + alen + rst_mem_size; + BUG_ON(task_args->bootstrap_len & (PAGE_SIZE - 1)); + pr_info("%d threads require %ldK of memory\n", + current->nr_threads, KBYTES(task_args->bootstrap_len)); + + if (core_is_compat(core)) + vdso_maps_rt = vdso_maps_compat; + else + vdso_maps_rt = vdso_maps; + /* + * Figure out how much memory runtime vdso and vvar will need. + */ + vdso_rt_size = vdso_maps_rt.sym.vdso_size; + if (vdso_rt_size && vdso_maps_rt.sym.vvar_size) + vdso_rt_size += ALIGN(vdso_maps_rt.sym.vvar_size, PAGE_SIZE); + task_args->bootstrap_len += vdso_rt_size; + + /* + * Restorer is a blob (code + args) that will get mapped in some + * place, that should _not_ intersect with both -- current mappings + * and mappings of the task we're restoring here. The subsequent + * call finds the start address for the restorer. + * + * After the start address is found we populate it with the restorer + * parts one by one (some are remap-ed, some are mmap-ed and copied + * or inited from scratch). 
+ */ + + mem = (void *)restorer_get_vma_hint(&vmas->h, &self_vmas.h, + task_args->bootstrap_len); + if (mem == (void *)-1) { + pr_err("No suitable area for task_restore bootstrap (%ldK)\n", + task_args->bootstrap_len); + goto err; + } + + pr_info("Found bootstrap VMA hint at: %p (needs ~%ldK)\n", + mem, KBYTES(task_args->bootstrap_len)); + + ret = remap_restorer_blob(mem); + if (ret < 0) + goto err; + + /* + * Prepare a memory map for restorer. Note a thread space + * might be completely unused so it's here just for convenience. + */ + task_args->clone_restore_fn = restorer_sym(mem, arch_export_restore_thread); + restore_task_exec_start = restorer_sym(mem, arch_export_restore_task); + rsti(current)->munmap_restorer = restorer_munmap_addr(core, mem); + + task_args->bootstrap_start = mem; + mem += restorer_len; + + /* VMA we need for stacks and sigframes for threads */ + if (mmap(mem, memzone_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, 0, 0) != mem) { + pr_err("Can't mmap section for restore code\n"); + goto err; + } + + memzero(mem, memzone_size); + mz = mem; + mem += memzone_size; + + /* New home for task_restore_args and thread_restore_args */ + task_args = mremap(task_args, alen, alen, MREMAP_MAYMOVE|MREMAP_FIXED, mem); + if (task_args != mem) { + pr_perror("Can't move task args"); + goto err; + } + + task_args->rst_mem = mem; + task_args->rst_mem_size = rst_mem_size + alen; + thread_args = (struct thread_restore_args *)(task_args + 1); + + /* + * And finally -- the rest arguments referenced by task_ and + * thread_restore_args. Pointers will get remapped below. 
+ */ + mem += alen; + if (rst_mem_remap(mem)) + goto err; + + /* + * At this point we've found a gap in VM that fits in both -- current + * and target tasks' mappings -- and its structure is + * + * | restorer code | memzone (stacks and sigframes) | arguments | + * + * Arguments is task_restore_args, thread_restore_args-s and all + * the bunch of objects allocated with rst_mem_alloc(). + * Note, that the task_args itself is inside the 3rd section and (!) + * it gets unmapped at the very end of __export_restore_task + */ + + task_args->proc_fd = dup(get_service_fd(PROC_FD_OFF)); + if (task_args->proc_fd < 0) { + pr_perror("can't dup proc fd"); + goto err; + } + + task_args->breakpoint = &rsti(current)->breakpoint; + task_args->fault_strategy = fi_strategy; + + sigemptyset(&blockmask); + sigaddset(&blockmask, SIGCHLD); + + if (sigprocmask(SIG_BLOCK, &blockmask, NULL) == -1) { + pr_perror("Can not set mask of blocked signals"); + return -1; + } + + task_args->task_entries = rst_mem_remap_ptr(task_entries_pos, RM_SHREMAP); + + task_args->premmapped_addr = (unsigned long)rsti(current)->premmapped_addr; + task_args->premmapped_len = rsti(current)->premmapped_len; + + task_args->task_size = kdat.task_size; +#ifdef ARCH_HAS_LONG_PAGES + task_args->page_size = PAGE_SIZE; +#endif + + RST_MEM_FIXUP_PPTR(task_args->vmas); + RST_MEM_FIXUP_PPTR(task_args->rings); + RST_MEM_FIXUP_PPTR(task_args->tcp_socks); + RST_MEM_FIXUP_PPTR(task_args->timerfd); + RST_MEM_FIXUP_PPTR(task_args->posix_timers); + RST_MEM_FIXUP_PPTR(task_args->siginfo); + RST_MEM_FIXUP_PPTR(task_args->rlims); + RST_MEM_FIXUP_PPTR(task_args->helpers); + RST_MEM_FIXUP_PPTR(task_args->zombies); + RST_MEM_FIXUP_PPTR(task_args->vma_ios); + + task_args->compatible_mode = core_is_compat(core); + /* + * Arguments for task restoration. 
+ */ + + BUG_ON(core->mtype != CORE_ENTRY__MARCH); + + task_args->logfd = log_get_fd(); + task_args->loglevel = log_get_loglevel(); + log_get_logstart(&task_args->logstart); + task_args->sigchld_act = sigchld_act; + + strncpy(task_args->comm, core->tc->comm, TASK_COMM_LEN - 1); + task_args->comm[TASK_COMM_LEN - 1] = 0; + + /* + * Fill up per-thread data. + */ + creds_pos_next = creds_pos; + siginfo_n = task_args->siginfo_n; + for (i = 0; i < current->nr_threads; i++) { + CoreEntry *tcore; + struct rt_sigframe *sigframe; + k_rtsigset_t *blkset = NULL; + + thread_args[i].pid = current->threads[i].ns[0].virt; + thread_args[i].siginfo_n = siginfo_priv_nr[i]; + thread_args[i].siginfo = task_args->siginfo; + thread_args[i].siginfo += siginfo_n; + siginfo_n += thread_args[i].siginfo_n; + + /* skip self */ + if (thread_args[i].pid == pid) { + task_args->t = thread_args + i; + tcore = core; + blkset = (void *)&tcore->tc->blk_sigset; + } else { + tcore = current->core[i]; + if (tcore->thread_core->has_blk_sigset) + blkset = (void *)&tcore->thread_core->blk_sigset; + } + + if ((tcore->tc || tcore->ids) && thread_args[i].pid != pid) { + pr_err("Thread has optional fields present %d\n", + thread_args[i].pid); + ret = -1; + } + + if (ret < 0) { + pr_err("Can't read core data for thread %d\n", + thread_args[i].pid); + goto err; + } + + thread_args[i].ta = task_args; + thread_args[i].gpregs = *CORE_THREAD_ARCH_INFO(tcore)->gpregs; + thread_args[i].clear_tid_addr = CORE_THREAD_ARCH_INFO(tcore)->clear_tid_addr; + core_get_tls(tcore, &thread_args[i].tls); + + rst_reloc_creds(&thread_args[i], &creds_pos_next); + + thread_args[i].futex_rla = tcore->thread_core->futex_rla; + thread_args[i].futex_rla_len = tcore->thread_core->futex_rla_len; + thread_args[i].pdeath_sig = tcore->thread_core->pdeath_sig; + if (tcore->thread_core->pdeath_sig > _KNSIG) { + pr_err("Pdeath signal is too big\n"); + goto err; + } + + ret = prep_sched_info(&thread_args[i].sp, tcore->thread_core); + if (ret) + goto 
err; + + seccomp_rst_reloc(&thread_args[i]); + thread_args[i].seccomp_force_tsync = rsti(current)->has_old_seccomp_filter; + + thread_args[i].mz = mz + i; + sigframe = (struct rt_sigframe *)&mz[i].rt_sigframe; + + if (construct_sigframe(sigframe, sigframe, blkset, tcore)) + goto err; + + if (tcore->thread_core->comm) + strncpy(thread_args[i].comm, tcore->thread_core->comm, TASK_COMM_LEN - 1); + else + strncpy(thread_args[i].comm, core->tc->comm, TASK_COMM_LEN - 1); + thread_args[i].comm[TASK_COMM_LEN - 1] = 0; + + if (thread_args[i].pid != pid) + core_entry__free_unpacked(tcore, NULL); + + pr_info("Thread %4d stack %8p rt_sigframe %8p\n", + i, mz[i].stack, mz[i].rt_sigframe); + + } + + /* + * Restorer needs own copy of vdso parameters. Runtime + * vdso must be kept non intersecting with anything else, + * since we need it being accessible even when own + * self-vmas are unmaped. + */ + mem += rst_mem_size; + task_args->vdso_rt_parked_at = (unsigned long)mem; + task_args->vdso_maps_rt = vdso_maps_rt; + task_args->vdso_rt_size = vdso_rt_size; + task_args->can_map_vdso = kdat.can_map_vdso; + + new_sp = restorer_stack(task_args->t->mz); + + /* No longer need it */ + core_entry__free_unpacked(core, NULL); + xfree(current->core); + + /* + * Now prepare run-time data for threads restore. + */ + task_args->nr_threads = current->nr_threads; + task_args->thread_args = thread_args; + + task_args->auto_dedup = opts.auto_dedup; + + /* + * In the restorer we need to know if it is SELinux or not. For SELinux + * we must change the process context before creating threads. For + * Apparmor we can change each thread after they have been created. + */ + task_args->lsm_type = kdat.lsm; + + /* + * Make root and cwd restore _that_ late not to break any + * attempts to open files by paths above (e.g. /proc). 
+ */ + + if (restore_fs(current)) + goto err; + + sfds_protected = false; + close_image_dir(); + close_proc(); + close_service_fd(TRANSPORT_FD_OFF); + close_service_fd(CR_PROC_FD_OFF); + close_service_fd(ROOT_FD_OFF); + close_service_fd(USERNSD_SK); + close_service_fd(FDSTORE_SK_OFF); + close_service_fd(RPC_SK_OFF); + + __gcov_flush(); + + pr_info("task_args: %p\n" + "task_args->pid: %d\n" + "task_args->nr_threads: %d\n" + "task_args->clone_restore_fn: %p\n" + "task_args->thread_args: %p\n", + task_args, task_args->t->pid, + task_args->nr_threads, + task_args->clone_restore_fn, + task_args->thread_args); + + /* + * An indirect call to task_restore, note it never returns + * and restoring core is extremely destructive. + */ + + JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, task_args); + +err: + free_mappings(&self_vmas); +err_nv: + /* Just to be sure */ + exit(1); + return -1; +} diff --git a/CRIU_code/criu/cr-service.c b/CRIU_code/criu/cr-service.c new file mode 100644 index 0000000..0938db0 --- /dev/null +++ b/CRIU_code/criu/cr-service.c @@ -0,0 +1,1403 @@ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "version.h" +#include "crtools.h" +#include "cr_options.h" +#include "external.h" +#include "util.h" +#include "criu-log.h" +#include "cpu.h" +#include "files.h" +#include "pstree.h" +#include "cr-service.h" +#include "cr-service-const.h" +#include "page-xfer.h" +#include "net.h" +#include "mount.h" +#include "filesystems.h" +#include "cgroup.h" +#include "cgroup-props.h" +#include "action-scripts.h" +#include "sockets.h" +#include "irmap.h" +#include "kerndat.h" +#include "proc_parse.h" +#include "common/scm.h" +#include "uffd.h" + +#include "setproctitle.h" + +#include "cr-errno.h" +#include "namespaces.h" + +unsigned int service_sk_ino = -1; + +static int recv_criu_msg(int socket_fd, CriuReq **req) +{ + unsigned char 
*buf; + int len; + + len = recv(socket_fd, NULL, 0, MSG_TRUNC | MSG_PEEK); + if (len == -1) { + pr_perror("Can't read request"); + return -1; + } + + buf = xmalloc(len); + if (!buf) + return -ENOMEM; + + len = recv(socket_fd, buf, len, MSG_TRUNC); + if (len == -1) { + pr_perror("Can't read request"); + goto err; + } + + if (len == 0) { + pr_info("Client exited unexpectedly\n"); + errno = ECONNRESET; + goto err; + } + + *req = criu_req__unpack(NULL, len, buf); + if (!*req) { + pr_perror("Failed unpacking request"); + goto err; + } + + xfree(buf); + return 0; +err: + xfree(buf); + return -1; +} + +static int send_criu_msg_with_fd(int socket_fd, CriuResp *msg, int fd) +{ + unsigned char *buf; + int len, ret; + + len = criu_resp__get_packed_size(msg); + + buf = xmalloc(len); + if (!buf) + return -ENOMEM; + + if (criu_resp__pack(msg, buf) != len) { + pr_perror("Failed packing response"); + goto err; + } + + if (fd >= 0) { + ret = send_fds(socket_fd, NULL, 0, &fd, 1, buf, len); + } else + ret = write(socket_fd, buf, len); + if (ret < 0) { + pr_perror("Can't send response"); + goto err; + } + + xfree(buf); + return 0; +err: + xfree(buf); + return -1; +} + +static int send_criu_msg(int socket_fd, CriuResp *msg) +{ + return send_criu_msg_with_fd(socket_fd, msg, -1); +} + +static void set_resp_err(CriuResp *resp) +{ + resp->cr_errno = get_cr_errno(); + resp->has_cr_errno = resp->cr_errno ? 
true : false; + resp->cr_errmsg = log_first_err(); +} + +static void send_criu_err(int sk, char *msg) +{ + CriuResp resp = CRIU_RESP__INIT; + + pr_perror("RPC error: %s", msg); + + resp.type = CRIU_REQ_TYPE__EMPTY; + resp.success = false; + set_resp_err(&resp); + + send_criu_msg(sk, &resp); +} + +int send_criu_dump_resp(int socket_fd, bool success, bool restored) +{ + CriuResp msg = CRIU_RESP__INIT; + CriuDumpResp resp = CRIU_DUMP_RESP__INIT; + + msg.type = CRIU_REQ_TYPE__DUMP; + msg.success = success; + set_resp_err(&msg); + msg.dump = &resp; + + resp.has_restored = true; + resp.restored = restored; + + return send_criu_msg(socket_fd, &msg); +} + +static int send_criu_pre_dump_resp(int socket_fd, bool success) +{ + CriuResp msg = CRIU_RESP__INIT; + + msg.type = CRIU_REQ_TYPE__PRE_DUMP; + msg.success = success; + set_resp_err(&msg); + + return send_criu_msg(socket_fd, &msg); +} + +int send_criu_restore_resp(int socket_fd, bool success, int pid) +{ + CriuResp msg = CRIU_RESP__INIT; + CriuRestoreResp resp = CRIU_RESTORE_RESP__INIT; + + msg.type = CRIU_REQ_TYPE__RESTORE; + msg.success = success; + set_resp_err(&msg); + msg.restore = &resp; + + resp.pid = pid; + + return send_criu_msg(socket_fd, &msg); +} + +int send_criu_rpc_script(enum script_actions act, char *name, int sk, int fd) +{ + int ret; + CriuResp msg = CRIU_RESP__INIT; + CriuReq *req; + CriuNotify cn = CRIU_NOTIFY__INIT; + + msg.type = CRIU_REQ_TYPE__NOTIFY; + msg.success = true; + msg.notify = &cn; + cn.script = name; + + switch (act) { + case ACT_SETUP_NS: + case ACT_POST_RESTORE: + /* + * FIXME pid is required only once on + * restore. Need some more sane way of + * checking this. 
+ */ + cn.has_pid = true; + cn.pid = root_item->pid->real; + break; + default: + break; + } + + ret = send_criu_msg_with_fd(sk, &msg, fd); + if (ret < 0) + return ret; + + ret = recv_criu_msg(sk, &req); + if (ret < 0) + return ret; + + if (req->type != CRIU_REQ_TYPE__NOTIFY || !req->notify_success) { + pr_err("RPC client reported script error\n"); + return -1; + } + + criu_req__free_unpacked(req, NULL); + return 0; +} + +static char images_dir[PATH_MAX]; + +static int setup_opts_from_req(int sk, CriuOpts *req) +{ + struct ucred ids; + struct stat st; + socklen_t ids_len = sizeof(struct ucred); + char images_dir_path[PATH_MAX]; + char work_dir_path[PATH_MAX]; + char status_fd[PATH_MAX]; + bool output_changed_by_rpc_conf = false; + bool work_changed_by_rpc_conf = false; + bool imgs_changed_by_rpc_conf = false; + int i; + bool dummy = false; + + if (getsockopt(sk, SOL_SOCKET, SO_PEERCRED, &ids, &ids_len)) { + pr_perror("Can't get socket options"); + goto err; + } + + if (fstat(sk, &st)) { + pr_perror("Can't get socket stat"); + goto err; + } + + BUG_ON(st.st_ino == -1); + service_sk_ino = st.st_ino; + + /* + * Evaluate an additional configuration file if specified. + * This needs to happen twice, because it is needed early to detect + * things like work_dir, imgs_dir and logfile. The second parsing + * of the optional RPC configuration file happens at the end and + * overwrites all options set via RPC. 
+ */ + if (req->config_file) { + char *tmp_output = opts.output; + char *tmp_work = opts.work_dir; + char *tmp_imgs = opts.imgs_dir; + + opts.output = NULL; + opts.work_dir = NULL; + opts.imgs_dir = NULL; + + rpc_cfg_file = req->config_file; + i = parse_options(0, NULL, &dummy, &dummy, PARSING_RPC_CONF); + if (i) { + xfree(tmp_output); + xfree(tmp_work); + xfree(tmp_imgs); + goto err; + } + /* If this is non-NULL, the RPC configuration file had a value, use it.*/ + if (opts.output) + output_changed_by_rpc_conf = true; + /* If this is NULL, use the old value if it was set. */ + if (!opts.output && tmp_output) { + opts.output = tmp_output; + tmp_output = NULL; + } + + if (opts.work_dir) + work_changed_by_rpc_conf = true; + if (!opts.work_dir && tmp_work) { + opts.work_dir = tmp_work; + tmp_work = NULL; + } + + if (opts.imgs_dir) + imgs_changed_by_rpc_conf = true; + /* + * As the images directory is a required RPC setting, it is not + * necessary to use the value from other configuration files. + * Either it is set in the RPC configuration file or it is set + * via RPC. + */ + xfree(tmp_output); + xfree(tmp_work); + xfree(tmp_imgs); + } + + /* + * open images_dir - images_dir_fd is a required RPC parameter + * + * This assumes that if opts.imgs_dir is set we have a value + * from the configuration file parser. The test to see that + * imgs_changed_by_rpc_conf is true is used to make sure the value + * is from the RPC configuration file. 
+ * The idea is that only the RPC configuration file is able to + * overwrite RPC settings: + * * apply_config(global_conf) + * * apply_config(user_conf) + * * apply_config(environment variable) + * * apply_rpc_options() + * * apply_config(rpc_conf) + */ + if (imgs_changed_by_rpc_conf) + strncpy(images_dir_path, opts.imgs_dir, PATH_MAX - 1); + else + sprintf(images_dir_path, "/proc/%d/fd/%d", ids.pid, req->images_dir_fd); + + if (req->parent_img) + SET_CHAR_OPTS(img_parent, req->parent_img); + + if (open_image_dir(images_dir_path) < 0) { + pr_perror("Can't open images directory"); + goto err; + } + + /* get full path to images_dir to use in process title */ + if (readlink(images_dir_path, images_dir, PATH_MAX) == -1) { + pr_perror("Can't readlink %s", images_dir_path); + goto err; + } + + /* chdir to work dir */ + if (work_changed_by_rpc_conf) + /* Use the value from the RPC configuration file first. */ + strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); + else if (req->has_work_dir_fd) + /* Use the value set via RPC. */ + sprintf(work_dir_path, "/proc/%d/fd/%d", ids.pid, req->work_dir_fd); + else if (opts.work_dir) + /* Use the value from one of the other configuration files. */ + strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); + else + /* Use the images directory a work directory. */ + strcpy(work_dir_path, images_dir_path); + + if (chdir(work_dir_path)) { + pr_perror("Can't chdir to work_dir"); + goto err; + } + + /* initiate log file in work dir */ + if (req->log_file && !output_changed_by_rpc_conf) { + /* + * If RPC sets a log file and if there nothing from the + * RPC configuration file, use the RPC value. 
+ */ + if (strchr(req->log_file, '/')) { + pr_perror("No subdirs are allowed in log_file name"); + goto err; + } + + SET_CHAR_OPTS(output, req->log_file); + } else if (!opts.output) { + SET_CHAR_OPTS(output, DEFAULT_LOG_FILENAME); + } + + /* This is needed later to correctly set the log_level */ + opts.log_level = req->log_level; + log_set_loglevel(req->log_level); + if (log_init(opts.output) == -1) { + pr_perror("Can't initiate log"); + goto err; + } + + if (req->config_file) { + pr_debug("Overwriting RPC settings with values from %s\n", req->config_file); + } + + if (kerndat_init()) + return 1; + + if (log_keep_err()) { + pr_perror("Can't tune log"); + goto err; + } + + /* checking flags from client */ + if (req->has_leave_running && req->leave_running) + opts.final_state = TASK_ALIVE; + + if (!req->has_pid) { + req->has_pid = true; + req->pid = ids.pid; + } + + if (req->has_ext_unix_sk) { + opts.ext_unix_sk = req->ext_unix_sk; + for (i = 0; i < req->n_unix_sk_ino; i++) { + if (unix_sk_id_add((unsigned int)req->unix_sk_ino[i]->inode) < 0) + goto err; + } + } + + if (req->root) + SET_CHAR_OPTS(root, req->root); + + if (req->has_rst_sibling) { + if (!opts.swrk_restore) { + pr_err("rst_sibling is not allowed in standalone service\n"); + goto err; + } + + opts.restore_sibling = req->rst_sibling; + } + + if (req->has_tcp_established) + opts.tcp_established_ok = req->tcp_established; + + if (req->has_tcp_skip_in_flight) + opts.tcp_skip_in_flight = req->tcp_skip_in_flight; + + if (req->has_tcp_close) + opts.tcp_close = req->tcp_close; + + if (req->has_weak_sysctls) + opts.weak_sysctls = req->weak_sysctls; + + if (req->has_evasive_devices) + opts.evasive_devices = req->evasive_devices; + + if (req->has_shell_job) + opts.shell_job = req->shell_job; + + if (req->has_file_locks) + opts.handle_file_locks = req->file_locks; + + if (req->has_track_mem) + opts.track_mem = req->track_mem; + + if (req->has_link_remap) + opts.link_remap_ok = req->link_remap; + + if 
(req->has_auto_dedup) + opts.auto_dedup = req->auto_dedup; + + if (req->has_force_irmap) + opts.force_irmap = req->force_irmap; + + if (req->n_exec_cmd > 0) { + opts.exec_cmd = xmalloc((req->n_exec_cmd + 1) * sizeof(char *)); + memcpy(opts.exec_cmd, req->exec_cmd, req->n_exec_cmd * sizeof(char *)); + opts.exec_cmd[req->n_exec_cmd] = NULL; + } + + if (req->has_lazy_pages) { + opts.lazy_pages = req->lazy_pages; + } + + if (req->ps) { + opts.port = (short)req->ps->port; + + if (!opts.lazy_pages) { + opts.use_page_server = true; + if (req->ps->address) + SET_CHAR_OPTS(addr, req->ps->address); + else + opts.addr = NULL; + + if (req->ps->has_fd) { + if (!opts.swrk_restore) + goto err; + + opts.ps_socket = req->ps->fd; + } + } + } + + if (req->notify_scripts && add_rpc_notify(sk)) + goto err; + + for (i = 0; i < req->n_veths; i++) { + if (veth_pair_add(req->veths[i]->if_in, req->veths[i]->if_out)) + goto err; + } + + for (i = 0; i < req->n_ext_mnt; i++) { + if (ext_mount_add(req->ext_mnt[i]->key, req->ext_mnt[i]->val)) + goto err; + } + + for (i = 0; i < req->n_join_ns; i++) { + if (join_ns_add(req->join_ns[i]->ns, req->join_ns[i]->ns_file, req->join_ns[i]->extra_opt)) + goto err; + } + + if (req->n_inherit_fd && !opts.swrk_restore) { + pr_err("inherit_fd is not allowed in standalone service\n"); + goto err; + } + for (i = 0; i < req->n_inherit_fd; i++) { + if (inherit_fd_add(req->inherit_fd[i]->fd, req->inherit_fd[i]->key)) + goto err; + } + + for (i = 0; i < req->n_external; i++) + if (add_external(req->external[i])) + goto err; + + for (i = 0; i < req->n_cg_root; i++) { + if (new_cg_root_add(req->cg_root[i]->ctrl, + req->cg_root[i]->path)) + goto err; + } + + for (i = 0; i < req->n_enable_fs; i++) { + if (!add_fsname_auto(req->enable_fs[i])) + goto err; + } + + for (i = 0; i < req->n_skip_mnt; i++) { + if (!add_skip_mount(req->skip_mnt[i])) + goto err; + } + + if (req->has_cpu_cap) { + opts.cpu_cap = req->cpu_cap; + opts.cpu_cap |= CPU_CAP_IMAGE; + } + + /* + * FIXME: 
For backward compatibility we setup + * soft mode here, need to enhance to support + * other modes as well via separate option + * probably. + */ + if (req->has_manage_cgroups) + opts.manage_cgroups = req->manage_cgroups ? CG_MODE_SOFT : CG_MODE_IGNORE; + + /* Override the manage_cgroup if mode is set explicitly */ + if (req->has_manage_cgroups_mode) { + unsigned int mode; + + switch (req->manage_cgroups_mode) { + case CRIU_CG_MODE__IGNORE: + mode = CG_MODE_IGNORE; + break; + case CRIU_CG_MODE__CG_NONE: + mode = CG_MODE_NONE; + break; + case CRIU_CG_MODE__PROPS: + mode = CG_MODE_PROPS; + break; + case CRIU_CG_MODE__SOFT: + mode = CG_MODE_SOFT; + break; + case CRIU_CG_MODE__FULL: + mode = CG_MODE_FULL; + break; + case CRIU_CG_MODE__STRICT: + mode = CG_MODE_STRICT; + break; + case CRIU_CG_MODE__DEFAULT: + mode = CG_MODE_DEFAULT; + break; + default: + goto err; + } + + opts.manage_cgroups = mode; + } + + if (req->freeze_cgroup) + SET_CHAR_OPTS(freeze_cgroup, req->freeze_cgroup); + + if (req->lsm_profile) { + opts.lsm_supplied = true; + SET_CHAR_OPTS(lsm_profile, req->lsm_profile); + } + + if (req->has_timeout) + opts.timeout = req->timeout; + + if (req->cgroup_props) + SET_CHAR_OPTS(cgroup_props, req->cgroup_props); + + if (req->cgroup_props_file) + SET_CHAR_OPTS(cgroup_props_file, req->cgroup_props_file); + + for (i = 0; i < req->n_cgroup_dump_controller; i++) { + if (!cgp_add_dump_controller(req->cgroup_dump_controller[i])) + goto err; + } + + if (req->tls_cacert) + SET_CHAR_OPTS(tls_cacert, req->tls_cacert); + if (req->tls_cacrl) + SET_CHAR_OPTS(tls_cacrl, req->tls_cacrl); + if (req->tls_cert) + SET_CHAR_OPTS(tls_cert, req->tls_cert); + if (req->tls_key) + SET_CHAR_OPTS(tls_key, req->tls_key); + if (req->tls) + opts.tls = req->tls; + if (req->tls_no_cn_verify) + opts.tls_no_cn_verify = req->tls_no_cn_verify; + + if (req->has_auto_ext_mnt) + opts.autodetect_ext_mounts = req->auto_ext_mnt; + + if (req->has_ext_sharing) + opts.enable_external_sharing = 
req->ext_sharing; + + if (req->has_ext_masters) + opts.enable_external_masters = req->ext_masters; + + if (req->has_ghost_limit) + opts.ghost_limit = req->ghost_limit; + + if (req->has_empty_ns) { + opts.empty_ns = req->empty_ns; + if (req->empty_ns & ~(CLONE_NEWNET)) + goto err; + } + + if (req->n_irmap_scan_paths) { + for (i = 0; i < req->n_irmap_scan_paths; i++) { + if (irmap_scan_path_add(req->irmap_scan_paths[i])) + goto err; + } + } + + if (req->has_status_fd) { + sprintf(status_fd, "/proc/%d/fd/%d", ids.pid, req->status_fd); + opts.status_fd = open(status_fd, O_WRONLY); + if (opts.status_fd < 0) + goto err; + } + + if (req->orphan_pts_master) + opts.orphan_pts_master = true; + + + /* Evaluate additional configuration file a second time to overwrite + * all RPC settings. */ + if (req->config_file) { + rpc_cfg_file = req->config_file; + i = parse_options(0, NULL, &dummy, &dummy, PARSING_RPC_CONF); + if (i) + goto err; + } + + log_set_loglevel(opts.log_level); + if (check_options()) + goto err; + + return 0; + +err: + set_cr_errno(EBADRQC); + return -1; +} + +static int dump_using_req(int sk, CriuOpts *req) +{ + bool success = false; + bool self_dump = !req->pid; + + if (setup_opts_from_req(sk, req)) + goto exit; + + setproctitle("dump --rpc -t %d -D %s", req->pid, images_dir); + + /* + * FIXME -- cr_dump_tasks() may return code from custom + * scripts, that can be positive. However, right now we + * don't have ability to push scripts via RPC, so positive + * ret values are impossible here. + */ + if (cr_dump_tasks(req->pid)) + goto exit; + + success = true; +exit: + if (req->leave_running || !self_dump || !success) { + if (send_criu_dump_resp(sk, success, false) == -1) { + pr_perror("Can't send response"); + success = false; + } + } + + return success ? 0 : 1; +} + +static int restore_using_req(int sk, CriuOpts *req) +{ + bool success = false; + + /* + * We can't restore processes under arbitrary task yet. 
+ * Thus for now we force the detached restore under the + * cr service task. + */ + + opts.restore_detach = true; + + if (setup_opts_from_req(sk, req)) + goto exit; + + setproctitle("restore --rpc -D %s", images_dir); + + if (cr_restore_tasks()) + goto exit; + + success = true; +exit: + if (send_criu_restore_resp(sk, success, + root_item ? root_item->pid->real : -1) == -1) { + pr_perror("Can't send response"); + success = false; + } + + if (success && opts.exec_cmd) { + int logfd; + + logfd = log_get_fd(); + if (dup2(logfd, STDOUT_FILENO) == -1 || dup2(logfd, STDERR_FILENO) == -1) { + pr_perror("Failed to redirect stdout and stderr to the logfile"); + return 1; + } + + close_pid_proc(); + close(sk); + + execvp(opts.exec_cmd[0], opts.exec_cmd); + pr_perror("Failed to exec cmd %s", opts.exec_cmd[0]); + success = false; + } + + return success ? 0 : 1; +} + +static int check(int sk, CriuOpts *req) +{ + int pid, status; + CriuResp resp = CRIU_RESP__INIT; + + resp.type = CRIU_REQ_TYPE__CHECK; + + pid = fork(); + if (pid < 0) { + pr_perror("Can't fork"); + goto out; + } + + if (pid == 0) { + setproctitle("check --rpc"); + + if (setup_opts_from_req(sk, req)) + exit(1); + + exit(!!cr_check()); + } + if (waitpid(pid, &status, 0) != pid) { + pr_perror("Unable to wait %d", pid); + goto out; + } + if (status) + goto out; + + resp.success = true; +out: + return send_criu_msg(sk, &resp); +} + +static int pre_dump_using_req(int sk, CriuOpts *req) +{ + int pid, status; + bool success = false; + + pid = fork(); + if (pid < 0) { + pr_perror("Can't fork"); + goto out; + } + + if (pid == 0) { + int ret = 1; + + if (setup_opts_from_req(sk, req)) + goto cout; + + setproctitle("pre-dump --rpc -t %d -D %s", req->pid, images_dir); + + if (cr_pre_dump_tasks(req->pid)) + goto cout; + + ret = 0; +cout: + exit(ret); + } + + if (waitpid(pid, &status, 0) != pid) { + pr_perror("Unable to wait %d", pid); + goto out; + } + if (status != 0) + goto out; + + success = true; +out: + if 
(send_criu_pre_dump_resp(sk, success) == -1) { + pr_perror("Can't send pre-dump resp"); + success = false; + } + + return success ? 0 : -1; +} + +static int pre_dump_loop(int sk, CriuReq *msg) +{ + int ret; + + do { + ret = pre_dump_using_req(sk, msg->opts); + if (ret < 0) + return ret; + + criu_req__free_unpacked(msg, NULL); + if (recv_criu_msg(sk, &msg) == -1) { + pr_perror("Can't recv request"); + return -1; + } + } while (msg->type == CRIU_REQ_TYPE__PRE_DUMP); + + if (msg->type != CRIU_REQ_TYPE__DUMP) { + send_criu_err(sk, "Bad req seq"); + return -1; + } + + return dump_using_req(sk, msg->opts); +} + +static int start_page_server_req(int sk, CriuOpts *req, bool daemon_mode) +{ + int ret = -1, pid, start_pipe[2]; + ssize_t count; + bool success = false; + CriuResp resp = CRIU_RESP__INIT; + CriuPageServerInfo ps = CRIU_PAGE_SERVER_INFO__INIT; + struct ps_info info; + + if (pipe(start_pipe)) { + pr_perror("No start pipe"); + goto out; + } + + pid = fork(); + if (pid == 0) { + close(start_pipe[0]); + + if (setup_opts_from_req(sk, req)) + goto out_ch; + + setproctitle("page-server --rpc --address %s --port %hu", opts.addr, opts.port); + + pr_debug("Starting page server\n"); + + pid = cr_page_server(daemon_mode, false, start_pipe[1]); + if (pid < 0) + goto out_ch; + + if (daemon_mode) { + info.pid = pid; + info.port = opts.port; + + count = write(start_pipe[1], &info, sizeof(info)); + if (count != sizeof(info)) + goto out_ch; + } + + ret = 0; +out_ch: + if (daemon_mode && ret < 0 && pid > 0) + kill(pid, SIGKILL); + close(start_pipe[1]); + exit(ret); + } + + close(start_pipe[1]); + + if (daemon_mode) { + if (waitpid(pid, &ret, 0) != pid) { + pr_perror("Unable to wait %d", pid); + goto out; + } + if (WIFEXITED(ret)) { + if (WEXITSTATUS(ret)) { + pr_err("Child exited with an error\n"); + goto out; + } + } else { + pr_err("Child wasn't terminated normally\n"); + goto out; + } + } + + count = read(start_pipe[0], &info, sizeof(info)); + close(start_pipe[0]); + if (count 
!= sizeof(info)) + goto out; + + ps.pid = info.pid; + ps.has_port = true; + ps.port = info.port; + + success = true; + ps.has_pid = true; + resp.ps = &ps; + + pr_debug("Page server started\n"); +out: + resp.type = CRIU_REQ_TYPE__PAGE_SERVER; + resp.success = success; + return send_criu_msg(sk, &resp); +} + +static int chk_keepopen_req(CriuReq *msg) +{ + if (!msg->keep_open) + return 0; + + /* + * Service may (well, it will) leave some + * resources leaked after processing e.g. + * dump or restore requests. Before we audit + * the code for this, let's first enable + * mreq RPCs for those requests we know do + * good work + */ + + if (msg->type == CRIU_REQ_TYPE__PAGE_SERVER) + /* This just fork()-s so no leaks */ + return 0; + if (msg->type == CRIU_REQ_TYPE__PAGE_SERVER_CHLD) + /* This just fork()-s so no leaks */ + return 0; + else if (msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP || + msg->type == CRIU_REQ_TYPE__CPUINFO_CHECK) + return 0; + else if (msg->type == CRIU_REQ_TYPE__FEATURE_CHECK) + return 0; + else if (msg->type == CRIU_REQ_TYPE__VERSION) + return 0; + + return -1; +} + +/* + * Return the version information, depending on the information + * available in version.h + */ +static int handle_version(int sk, CriuReq * msg) +{ + CriuResp resp = CRIU_RESP__INIT; + CriuVersion version = CRIU_VERSION__INIT; + + /* This assumes we will always have a major and minor version */ + version.major_number = CRIU_VERSION_MAJOR; + version.minor_number = CRIU_VERSION_MINOR; + if (strcmp(CRIU_GITID, "0")) { + version.gitid = CRIU_GITID; + } +#ifdef CRIU_VERSION_SUBLEVEL + version.has_sublevel = 1; + version.sublevel = CRIU_VERSION_SUBLEVEL; +#endif +#ifdef CRIU_VERSION_EXTRA + version.has_extra = 1; + version.extra = CRIU_VERSION_EXTRA; +#endif +#ifdef CRIU_VERSION_NAME + /* This is not actually exported in version.h */ + version.name = CRIU_VERSION_NAME; +#endif + resp.type = msg->type; + resp.success = true; + resp.version = &version; + return send_criu_msg(sk, &resp); +} + 
+/*
+ * Generic function to handle CRIU_REQ_TYPE__FEATURE_CHECK.
+ *
+ * The function will have resp.success = true for most cases
+ * and the actual result will be in resp.features.
+ *
+ * For each feature which has been requested in msg->features
+ * the corresponding parameter will be set in resp.features.
+ *
+ * The check itself runs in a forked child, which also sends the
+ * response when it gets that far. The parent only answers (with
+ * resp.success = false) when the child could not.
+ *
+ * Returns 0 when the child has sent the response, otherwise the
+ * result of sending the failure response.
+ */
+static int handle_feature_check(int sk, CriuReq * msg)
+{
+	CriuResp resp = CRIU_RESP__INIT;
+	CriuFeatures feat = CRIU_FEATURES__INIT;
+	int pid, status;
+	int ret;
+
+	/* The child dereferences msg->features, so refuse a request without it */
+	if (!msg->features) {
+		pr_err("No features in the feature-check request\n");
+		goto out;
+	}
+
+	/* enable setting of an optional message */
+	feat.has_mem_track = 1;
+	feat.mem_track = false;
+	feat.has_lazy_pages = 1;
+	feat.lazy_pages = false;
+
+	pid = fork();
+	if (pid < 0) {
+		pr_perror("Can't fork");
+		goto out;
+	}
+
+	if (pid == 0) {
+		/* kerndat_init() is called from setup_opts_from_req() */
+		if (setup_opts_from_req(sk, msg->opts))
+			exit(1);
+
+		setproctitle("feature-check --rpc");
+
+		if ((msg->features->has_mem_track == 1) &&
+		    (msg->features->mem_track == true))
+			feat.mem_track = kdat.has_dirty_track;
+
+		if ((msg->features->has_lazy_pages == 1) &&
+		    (msg->features->lazy_pages == true))
+			feat.lazy_pages = kdat.has_uffd && uffd_noncooperative();
+
+		resp.features = &feat;
+		resp.type = msg->type;
+		/* The feature check is working, actual results are in resp.features */
+		resp.success = true;
+
+		/*
+		 * If this point is reached the information about the features
+		 * is transmitted from the forked CRIU process (here).
+		 * If an error occurred earlier, the feature check response will
+		 * be sent from the parent process.
+		 */
+		ret = send_criu_msg(sk, &resp);
+		exit(!!ret);
+	}
+	if (waitpid(pid, &status, 0) != pid) {
+		pr_perror("Unable to wait %d", pid);
+		goto out;
+	}
+
+	/*
+	 * The child exited with 0, so it has already sent the response.
+	 * Sending another one from here would leave a stale failure
+	 * message queued on the socket.
+	 */
+	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
+		return 0;
+
+	/*
+	 * The child process was not able to send an answer. Tell
+	 * the RPC client that something did not work as expected.
+	 */
+out:
+	resp.type = msg->type;
+	resp.success = false;
+
+	return send_criu_msg(sk, &resp);
+}
+
+/*
+ * Handle CRIU_REQ_TYPE__WAIT_PID: wait for @pid and report the raw
+ * wait status to the client, or errno in resp.cr_errno if waiting
+ * failed. Returns the result of send_criu_msg().
+ */
+static int handle_wait_pid(int sk, int pid)
+{
+	CriuResp resp = CRIU_RESP__INIT;
+	bool success = false;
+	int status;
+
+	if (waitpid(pid, &status, 0) == -1) {
+		/* cr_errno is optional: without the has_ flag it is not sent */
+		resp.has_cr_errno = true;
+		resp.cr_errno = errno;
+		pr_perror("Unable to wait %d", pid);
+		goto out;
+	}
+
+	resp.status = status;
+	resp.has_status = true;
+
+	success = true;
+out:
+	resp.type = CRIU_REQ_TYPE__WAIT_PID;
+	resp.success = success;
+
+	return send_criu_msg(sk, &resp);
+}
+
+/*
+ * Handle CRIU_REQ_TYPE__CPUINFO_DUMP and CRIU_REQ_TYPE__CPUINFO_CHECK.
+ *
+ * The work is done in a forked child whose exit status carries the
+ * result of cpuinfo_dump() / cpuinfo_check(). The parent turns that
+ * status into the response: 0 means success, (-ENOTSUP & 0xff) is
+ * reported as cr_errno = ENOTSUP, anything else is a plain failure.
+ * Returns the result of send_criu_msg().
+ */
+static int handle_cpuinfo(int sk, CriuReq *msg)
+{
+	CriuResp resp = CRIU_RESP__INIT;
+	bool success = false;
+	int pid, status;
+
+	pid = fork();
+	if (pid < 0) {
+		pr_perror("Can't fork");
+		goto out;
+	}
+
+	if (pid == 0) {
+		int ret = 1;
+
+		if (setup_opts_from_req(sk, msg->opts))
+			goto cout;
+
+		setproctitle("cpuinfo %s --rpc -D %s",
+			     msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP ?
+			     "dump" : "check",
+			     images_dir);
+
+		if (msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP)
+			ret = cpuinfo_dump();
+		else
+			ret = cpuinfo_check();
+cout:
+		exit(ret);
+	}
+
+	if (waitpid(pid, &status, 0) != pid) {
+		pr_perror("Unable to wait %d", pid);
+		goto out;
+	}
+	if (!WIFEXITED(status))
+		goto out;
+	switch (WEXITSTATUS(status)) {
+	case (-ENOTSUP & 0xff):
+		resp.has_cr_errno = 1;
+		/*
+		 * Let's return the actual error code and
+		 * not just (-ENOTSUP & 0xff)
+		 */
+		resp.cr_errno = ENOTSUP;
+		break;
+	case 0:
+		success = true;
+		break;
+	default:
+		break;
+	}
+
+out:
+	resp.type = msg->type;
+	resp.success = success;
+
+	return send_criu_msg(sk, &resp);
+}
+
+/*
+ * Serve RPC requests arriving on the connected socket @sk: receive a
+ * request, reject it if keep_open is set for a type that does not
+ * allow it, then dispatch on the request type.
+ */
+int cr_service_work(int sk)
+{
+	int ret = -1;
+	CriuReq *msg = 0;
+
+more:
+	if (recv_criu_msg(sk, &msg) != 0) {
+		pr_perror("Can't recv request");
+		goto err;
+	}
+
+	if (chk_keepopen_req(msg))
+		goto err;
+
+	switch (msg->type) {
+	case CRIU_REQ_TYPE__DUMP:
+		ret = dump_using_req(sk, msg->opts);
+		break;
+	case CRIU_REQ_TYPE__RESTORE:
+		ret = restore_using_req(sk, msg->opts);
+		break;
+	case
CRIU_REQ_TYPE__CHECK: + ret = check(sk, msg->opts); + break; + case CRIU_REQ_TYPE__PRE_DUMP: + ret = pre_dump_loop(sk, msg); + break; + case CRIU_REQ_TYPE__PAGE_SERVER: + ret = start_page_server_req(sk, msg->opts, true); + break; + case CRIU_REQ_TYPE__PAGE_SERVER_CHLD: + ret = start_page_server_req(sk, msg->opts, false); + break; + case CRIU_REQ_TYPE__WAIT_PID: + ret = handle_wait_pid(sk, msg->pid); + break; + case CRIU_REQ_TYPE__CPUINFO_DUMP: + case CRIU_REQ_TYPE__CPUINFO_CHECK: + ret = handle_cpuinfo(sk, msg); + break; + case CRIU_REQ_TYPE__FEATURE_CHECK: + ret = handle_feature_check(sk, msg); + break; + case CRIU_REQ_TYPE__VERSION: + ret = handle_version(sk, msg); + break; + + default: + send_criu_err(sk, "Invalid req"); + break; + } + + if (!ret && msg->keep_open) { + criu_req__free_unpacked(msg, NULL); + ret = -1; + goto more; + } + +err: + return ret; +} + +static void reap_worker(int signo) +{ + int saved_errno; + int status; + pid_t pid; + + saved_errno = errno; + + /* + * As we block SIGCHLD, lets wait for every child that has + * already changed state. 
+ */ + while (1) { + pid = waitpid(-1, &status, WNOHANG); + + if (pid <= 0) { + errno = saved_errno; + return; + } + + if (WIFEXITED(status)) + pr_info("Worker(pid %d) exited with %d\n", + pid, WEXITSTATUS(status)); + else if (WIFSIGNALED(status)) + pr_info("Worker(pid %d) was killed by %d: %s\n", pid, + WTERMSIG(status), strsignal(WTERMSIG(status))); + } +} + +static int setup_sigchld_handler() +{ + struct sigaction action; + + sigemptyset(&action.sa_mask); + sigaddset(&action.sa_mask, SIGCHLD); + action.sa_handler = reap_worker; + action.sa_flags = SA_RESTART; + + if (sigaction(SIGCHLD, &action, NULL)) { + pr_perror("Can't setup SIGCHLD handler"); + return -1; + } + + return 0; +} + +static int restore_sigchld_handler() +{ + struct sigaction action; + + sigemptyset(&action.sa_mask); + sigaddset(&action.sa_mask, SIGCHLD); + action.sa_handler = SIG_DFL; + action.sa_flags = SA_RESTART; + + if (sigaction(SIGCHLD, &action, NULL)) { + pr_perror("Can't restore SIGCHLD handler"); + return -1; + } + + return 0; +} + +int cr_service(bool daemon_mode) +{ + int server_fd = -1; + int child_pid; + + struct sockaddr_un client_addr; + socklen_t client_addr_len; + + { + struct sockaddr_un server_addr; + socklen_t server_addr_len; + + server_fd = socket(AF_LOCAL, SOCK_SEQPACKET, 0); + if (server_fd == -1) { + pr_perror("Can't initialize service socket"); + goto err; + } + + memset(&server_addr, 0, sizeof(server_addr)); + memset(&client_addr, 0, sizeof(client_addr)); + server_addr.sun_family = AF_LOCAL; + + if (opts.addr == NULL) { + pr_warn("Binding to local dir address!\n"); + SET_CHAR_OPTS(addr, CR_DEFAULT_SERVICE_ADDRESS); + } + + strncpy(server_addr.sun_path, opts.addr, + sizeof(server_addr.sun_path) - 1); + + server_addr_len = strlen(server_addr.sun_path) + + sizeof(server_addr.sun_family); + client_addr_len = sizeof(client_addr); + + unlink(server_addr.sun_path); + + if (bind(server_fd, (struct sockaddr *) &server_addr, + server_addr_len) == -1) { + pr_perror("Can't bind"); 
+ goto err; + } + + pr_info("The service socket is bound to %s\n", server_addr.sun_path); + + /* change service socket permissions, so anyone can connect to it */ + if (chmod(server_addr.sun_path, 0666)) { + pr_perror("Can't change permissions of the service socket"); + goto err; + } + + if (listen(server_fd, 16) == -1) { + pr_perror("Can't listen for socket connections"); + goto err; + } + } + + if (daemon_mode) { + if (daemon(1, 0) == -1) { + pr_perror("Can't run service server in the background"); + goto err; + } + } + + if (opts.pidfile) { + if (write_pidfile(getpid()) == -1) { + pr_perror("Can't write pidfile"); + goto err; + } + } + + if (setup_sigchld_handler()) + goto err; + + if (close_status_fd()) + goto err; + + while (1) { + int sk; + + pr_info("Waiting for connection...\n"); + + sk = accept(server_fd, (struct sockaddr *)&client_addr, &client_addr_len); + if (sk == -1) { + pr_perror("Can't accept connection"); + goto err; + } + + pr_info("Connected.\n"); + child_pid = fork(); + if (child_pid == 0) { + int ret; + + if (restore_sigchld_handler()) + exit(1); + + close(server_fd); + init_opts(); + ret = cr_service_work(sk); + close(sk); + exit(ret != 0); + } + + if (child_pid < 0) + pr_perror("Can't fork a child"); + + close(sk); + } + +err: + close_safe(&server_fd); + + return 1; +} diff --git a/CRIU_code/criu/crtools.c b/CRIU_code/criu/crtools.c new file mode 100644 index 0000000..97a6d6d --- /dev/null +++ b/CRIU_code/criu/crtools.c @@ -0,0 +1,478 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include + +#include + +#include + +#include "int.h" +#include "page.h" +#include "common/compiler.h" +#include "crtools.h" +#include "cr_options.h" +#include "external.h" +#include "files.h" +#include "sk-inet.h" +#include "net.h" +#include "page-xfer.h" +#include "tty.h" +#include "file-lock.h" +#include "cr-service.h" +#include "plugin.h" +#include "criu-log.h" +#include 
"util.h" +#include "protobuf-desc.h" +#include "namespaces.h" +#include "cgroup.h" +#include "cpu.h" +#include "fault-injection.h" +#include "proc_parse.h" +#include "kerndat.h" + +#include "setproctitle.h" +#include "sysctl.h" +#include "img-remote.h" + +int main(int argc, char *argv[], char *envp[]) +{ + int ret = -1; + bool usage_error = true; + bool has_exec_cmd = false; + bool has_sub_command; + int state = PARSING_GLOBAL_CONF; + + BUILD_BUG_ON(CTL_32 != SYSCTL_TYPE__CTL_32); + BUILD_BUG_ON(__CTL_STR != SYSCTL_TYPE__CTL_STR); + /* We use it for fd overlap handling in clone_service_fd() */ + BUG_ON(get_service_fd(SERVICE_FD_MIN+1) < + get_service_fd(SERVICE_FD_MAX-1)); + + if (fault_injection_init()) + return 1; + + cr_pb_init(); + setproctitle_init(argc, argv, envp); + + if (argc < 2) + goto usage; + + init_opts(); + + + ret = parse_options(argc, argv, &usage_error, &has_exec_cmd, state); + + if (ret == 1) + return 1; + if (ret == 2) + goto usage; + + log_set_loglevel(opts.log_level); + + if (!strcmp(argv[1], "swrk")) { + if (argc < 3) + goto usage; + /* + * This is to start criu service worker from libcriu calls. + * The usage is "criu swrk " and is not for CLI/scripts. + * The arguments semantics can change at any time with the + * corresponding lib call change. 
+ */ + opts.swrk_restore = true; + return cr_service_work(atoi(argv[2])); + } + + if (check_options()) { + flush_early_log_buffer(STDERR_FILENO); + return 1; + } + + if (opts.imgs_dir == NULL) + SET_CHAR_OPTS(imgs_dir, "."); + + if (opts.work_dir == NULL) + SET_CHAR_OPTS(work_dir, opts.imgs_dir); + + if (optind >= argc) { + pr_msg("Error: command is required\n"); + goto usage; + } + + has_sub_command = (argc - optind) > 1; + + if (has_exec_cmd) { + if (!has_sub_command) { + pr_msg("Error: --exec-cmd requires a command\n"); + goto usage; + } + + if (strcmp(argv[optind], "restore")) { + pr_msg("Error: --exec-cmd is available for the restore command only\n"); + goto usage; + } + + if (opts.restore_detach) { + pr_msg("Error: --restore-detached and --exec-cmd cannot be used together\n"); + goto usage; + } + + opts.exec_cmd = xmalloc((argc - optind) * sizeof(char *)); + if (!opts.exec_cmd) + return 1; + memcpy(opts.exec_cmd, &argv[optind + 1], (argc - optind - 1) * sizeof(char *)); + opts.exec_cmd[argc - optind - 1] = NULL; + } else { + /* No subcommands except for cpuinfo and restore --exec-cmd */ + if (strcmp(argv[optind], "cpuinfo") && has_sub_command) { + pr_msg("Error: excessive parameter%s for command %s\n", + (argc - optind) > 2 ? 
"s" : "", argv[optind]); + goto usage; + } + } + + /* We must not open imgs dir, if service is called */ + if (strcmp(argv[optind], "service")) { + ret = open_image_dir(opts.imgs_dir); + if (ret < 0) + return 1; + } + + /* + * When a process group becomes an orphan, + * its processes are sent a SIGHUP signal + */ + if (!strcmp(argv[optind], "restore") && + opts.restore_detach && + opts.final_state == TASK_STOPPED && + opts.shell_job) + pr_warn("Stopped and detached shell job will get SIGHUP from OS.\n"); + + if (chdir(opts.work_dir)) { + pr_perror("Can't change directory to %s", opts.work_dir); + return 1; + } + + if (log_init(opts.output)) + return 1; + + if (kerndat_init()) + return 1; + + if (opts.deprecated_ok) + pr_debug("DEPRECATED ON\n"); + + if (!list_empty(&opts.inherit_fds)) { + if (strcmp(argv[optind], "restore")) { + pr_err("--inherit-fd is restore-only option\n"); + return 1; + } + /* now that log file is set up, print inherit fd list */ + inherit_fd_log(); + } + + if (opts.img_parent) + pr_info("Will do snapshot from %s\n", opts.img_parent); + + if (!strcmp(argv[optind], "dump")) { + if (!opts.tree_id) + goto opt_pid_missing; + return cr_dump_tasks(opts.tree_id); + } + + if (!strcmp(argv[optind], "pre-dump")) { + if (!opts.tree_id) + goto opt_pid_missing; + + if (opts.lazy_pages) { + pr_err("Cannot pre-dump with --lazy-pages\n"); + return 1; + } + + return cr_pre_dump_tasks(opts.tree_id) != 0; + } + + if (!strcmp(argv[optind], "restore")) { + if (opts.tree_id) + pr_warn("Using -t with criu restore is obsoleted\n"); + + ret = cr_restore_tasks(); + if (ret == 0 && opts.exec_cmd) { + close_pid_proc(); + execvp(opts.exec_cmd[0], opts.exec_cmd); + pr_perror("Failed to exec command %s", opts.exec_cmd[0]); + ret = 1; + } + + return ret != 0; + } + + if (!strcmp(argv[optind], "lazy-pages")) + return cr_lazy_pages(opts.daemon_mode) != 0; + + if (!strcmp(argv[optind], "check")) + return cr_check() != 0; + + if (!strcmp(argv[optind], "page-server")) + return 
cr_page_server(opts.daemon_mode, false, -1) != 0; + + if (!strcmp(argv[optind], "image-cache")) { + if (!opts.port) + goto opt_port_missing; + return image_cache(opts.daemon_mode, DEFAULT_CACHE_SOCKET); + } + + if (!strcmp(argv[optind], "image-proxy")) { + if (!opts.addr) { + pr_msg("Error: address not specified\n"); + return 1; + } + if (!opts.port) + goto opt_port_missing; + return image_proxy(opts.daemon_mode, DEFAULT_PROXY_SOCKET); + } + + if (!strcmp(argv[optind], "service")) + return cr_service(opts.daemon_mode); + + if (!strcmp(argv[optind], "dedup")) + return cr_dedup() != 0; + + if (!strcmp(argv[optind], "cpuinfo")) { + if (!argv[optind + 1]) { + pr_msg("Error: cpuinfo requires an action: dump or check\n"); + goto usage; + } + if (!strcmp(argv[optind + 1], "dump")) + return cpuinfo_dump(); + else if (!strcmp(argv[optind + 1], "check")) + return cpuinfo_check(); + } + + if (!strcmp(argv[optind], "exec")) { + pr_msg("The \"exec\" action is deprecated by the Compel library.\n"); + return -1; + } + + if (!strcmp(argv[optind], "show")) { + pr_msg("The \"show\" action is deprecated by the CRIT utility.\n"); + pr_msg("To view an image use the \"crit decode -i $name --pretty\" command.\n"); + return -1; + } + + pr_msg("Error: unknown command: %s\n", argv[optind]); +usage: + pr_msg("\n" +"Usage:\n" +" criu dump|pre-dump -t PID []\n" +" criu restore []\n" +" criu check [--feature FEAT]\n" +" criu page-server\n" +" criu service []\n" +" criu dedup\n" +" criu lazy-pages -D DIR []\n" +" criu image-cache []\n" +" criu image-proxy []\n" +"\n" +"Commands:\n" +" dump checkpoint a process/tree identified by pid\n" +" pre-dump pre-dump task(s) minimizing their frozen time\n" +" restore restore a process/tree\n" +" check checks whether the kernel support is up-to-date\n" +" page-server launch page server\n" +" service launch service\n" +" dedup remove duplicates in memory dump\n" +" cpuinfo dump writes cpu information into image file\n" +" cpuinfo check validates cpu 
information read from image file\n" +" image-proxy launch dump-side proxy to sent images\n" +" image-cache launch restore-side cache to receive images\n" + ); + + if (usage_error) { + pr_msg("\nTry -h|--help for more info\n"); + return 1; + } + + pr_msg("\n" + +"Most of the true / false long options (the ones without arguments) can be\n" +"prefixed with --no- to negate the option (example: --display-stats and\n" +"--no-display-stats).\n" +"\n" +"Dump/Restore options:\n" +"\n" +"* Generic:\n" +" -t|--tree PID checkpoint a process tree identified by PID\n" +" -d|--restore-detached detach after restore\n" +" -S|--restore-sibling restore root task as sibling\n" +" -s|--leave-stopped leave tasks in stopped state after checkpoint\n" +" -R|--leave-running leave tasks in running state after checkpoint\n" +" -D|--images-dir DIR directory for image files\n" +" --pidfile FILE write root task, service or page-server pid to FILE\n" +" -W|--work-dir DIR directory to cd and write logs/pidfiles/stats to\n" +" (if not specified, value of --images-dir is used)\n" +" --cpu-cap [CAP] CPU capabilities to write/check. CAP is comma-separated\n" +" list of: cpu, fpu, all, ins, none. To disable\n" +" a capability, use ^CAP. 
Empty argument implies all\n" +" --exec-cmd execute the command specified after '--' on successful\n" +" restore making it the parent of the restored process\n" +" --freeze-cgroup use cgroup freezer to collect processes\n" +" --weak-sysctls skip restoring sysctls that are not available\n" +" --lazy-pages restore pages on demand\n" +" this requires running a second instance of criu\n" +" in lazy-pages mode: 'criu lazy-pages -D DIR'\n" +" --lazy-pages and lazy-pages mode require userfaultfd\n" +"\n" +"* External resources support:\n" +" --external RES dump objects from this list as external resources:\n" +" Formats of RES on dump:\n" +" tty[rdev:dev]\n" +" file[mnt_id:inode]\n" +" dev[major/minor]:NAME\n" +" unix[ino]\n" +" mnt[MOUNTPOINT]:COOKIE\n" +" mnt[]{:AUTO_OPTIONS}\n" +" Formats of RES on restore:\n" +" dev[NAME]:DEVPATH\n" +" veth[IFNAME]:OUTNAME{@BRIDGE}\n" +" macvlan[IFNAME]:OUTNAME\n" +" mnt[COOKIE]:ROOT\n" +"\n" +" --remote dump/restore images directly to/from remote node using\n" +" image-proxy/image-cache\n" +"* Special resources support:\n" +" --" SK_EST_PARAM " checkpoint/restore established TCP connections\n" +" --" SK_INFLIGHT_PARAM " skip (ignore) in-flight TCP connections\n" +" --" SK_CLOSE_PARAM " restore connected TCP sockets in closed state\n" +" -r|--root PATH change the root filesystem (when run in mount namespace)\n" +" --evasive-devices use any path to a device file if the original one\n" +" is inaccessible\n" +" --link-remap allow one to link unlinked files back when possible\n" +" --ghost-limit size limit max size of deleted file contents inside image\n" +" --action-script FILE add an external action script\n" +" -j|--" OPT_SHELL_JOB " allow one to dump and restore shell jobs\n" +" -l|--" OPT_FILE_LOCKS " handle file locks, for safety, only used for container\n" +" -L|--libdir path to a plugin directory (by default " CR_PLUGIN_DEFAULT ")\n" +" --force-irmap force resolving names for inotify/fsnotify watches\n" +" --irmap-scan-path 
FILE\n" +" add a path the irmap hints to scan\n" +" --manage-cgroups [m] dump/restore process' cgroups; argument can be one of\n" +" 'none', 'props', 'soft' (default), 'full', 'strict'\n" +" or 'ignore'\n" +" --cgroup-root [controller:]/newroot\n" +" on dump: change the root for the controller that will\n" +" be dumped. By default, only the paths with tasks in\n" +" them and below will be dumped.\n" +" on restore: change the root cgroup the controller will\n" +" be installed into. No controller means that root is the\n" +" default for all controllers not specified\n" +" --cgroup-props STRING\n" +" define cgroup controllers and properties\n" +" to be checkpointed, which are described\n" +" via STRING using simplified YAML format\n" +" --cgroup-props-file FILE\n" +" same as --cgroup-props, but taking description\n" +" from the path specified\n" +" --cgroup-dump-controller NAME\n" +" define cgroup controller to be dumped\n" +" and skip anything else present in system\n" +" --lsm-profile TYPE:NAME\n" +" Specify an LSM profile to be used during restore.\n" +" The type can be either 'apparmor' or 'selinux'.\n" +" --skip-mnt PATH ignore this mountpoint when dumping the mount namespace\n" +" --enable-fs FSNAMES a comma separated list of filesystem names or \"all\"\n" +" force criu to (try to) dump/restore these filesystem's\n" +" mountpoints even if fs is not supported\n" +" --inherit-fd fd[NUM]:RES\n" +" Inherit file descriptors, treating fd NUM as being\n" +" already opened via an existing RES, which can be:\n" +" tty[rdev:dev]\n" +" pipe[inode]\n" +" socket[inode]\n" +" file[mnt_id:inode]\n" +" path/to/file\n" +" --empty-ns net Create a namespace, but don't restore its properties\n" +" (assuming it will be restored by action scripts)\n" +" -J|--join-ns NS:{PID|NS_FILE}[,OPTIONS]\n" +" Join existing namespace and restore process in it.\n" +" Namespace can be specified as either pid or file path.\n" +" OPTIONS can be used to specify parameters for userns:\n" +" 
user:PID,UID,GID\n" +"\n" +"Check options:\n" +" Without options, \"criu check\" checks availability of absolutely required\n" +" kernel features, critical for performing dump and restore.\n" +" --extra add check for extra kernel features\n" +" --experimental add check for experimental kernel features\n" +" --all same as --extra --experimental\n" +" --feature FEAT only check a particular feature, one of:" + ); + pr_check_features(" ", ", ", 80); + pr_msg( +"\n" +"* Logging:\n" +" -o|--log-file FILE log file name\n" +" --log-pid enable per-process logging to separate FILE.pid files\n" +" -v[v...]|--verbosity increase verbosity (can use multiple v)\n" +" -vNUM|--verbosity=NUM set verbosity to NUM (higher level means more output):\n" +" -v1 - only errors and messages\n" +" -v2 - also warnings (default level)\n" +" -v3 - also information messages and timestamps\n" +" -v4 - lots of debug\n" +" --display-stats print out dump/restore stats\n" +"\n" +"* Memory dumping options:\n" +" --track-mem turn on memory changes tracker in kernel\n" +" --prev-images-dir DIR path to images from previous dump (relative to -D)\n" +" --page-server send pages to page server (see options below as well)\n" +" --auto-dedup when used on dump it will deduplicate \"old\" data in\n" +" pages images of previous dump\n" +" when used on restore, as soon as page is restored, it\n" +" will be punched from the image\n" +"\n" +"Page/Service server options:\n" +" --address ADDR address of server or service\n" +" --port PORT port of page server\n" +" --ps-socket FD use specified FD as page server socket\n" +" -d|--daemon run in the background after creating socket\n" +" --status-fd FD write \\0 to the FD and close it once process is ready\n" +" to handle requests\n" +" --tls-cacert FILE trust certificates signed only by this CA\n" +" --tls-cacrl FILE path to CA certificate revocation list file\n" +" --tls-cert FILE path to TLS certificate file\n" +" --tls-key FILE path to TLS private key file\n" +" --tls 
use TLS to secure remote connection\n" +" --tls-no-cn-verify do not verify common name in server certificate\n" +"\n" +"Configuration file options:\n" +" --config FILEPATH pass a specific configuration file\n" +" --no-default-config forbid usage of default configuration files\n" +"\n" +"Other options:\n" +" -h|--help show this text\n" +" -V|--version show version\n" + ); + + return 0; + +opt_port_missing: + pr_msg("Error: port not specified\n"); + return 1; + +opt_pid_missing: + pr_msg("Error: pid not specified\n"); + return 1; +} diff --git a/CRIU_code/criu/eventfd.c b/CRIU_code/criu/eventfd.c new file mode 100644 index 0000000..da31ce9 --- /dev/null +++ b/CRIU_code/criu/eventfd.c @@ -0,0 +1,117 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common/compiler.h" +#include "imgset.h" +#include "eventfd.h" +#include "fdinfo.h" +#include "image.h" +#include "util.h" +#include "log.h" + +#include "protobuf.h" +#include "images/eventfd.pb-c.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "eventfd: " + +struct eventfd_file_info { + EventfdFileEntry *efe; + struct file_desc d; +}; + +/* Checks if file descriptor @lfd is eventfd */ +int is_eventfd_link(char *link) +{ + return is_anon_link_type(link, "[eventfd]"); +} + +static void pr_info_eventfd(char *action, EventfdFileEntry *efe) +{ + pr_info("%s: id %#08x flags %#04x counter %#016"PRIx64"\n", + action, efe->id, efe->flags, efe->counter); +} + +static int dump_one_eventfd(int lfd, u32 id, const struct fd_parms *p) +{ + EventfdFileEntry efd = EVENTFD_FILE_ENTRY__INIT; + FileEntry fe = FILE_ENTRY__INIT; + + if (parse_fdinfo(lfd, FD_TYPES__EVENTFD, &efd)) + return -1; + + efd.id = id; + efd.flags = p->flags; + efd.fown = (FownEntry *)&p->fown; + + fe.type = FD_TYPES__EVENTFD; + fe.id = efd.id; + fe.efd = &efd; + + pr_info_eventfd("Dumping ", &efd); + return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); +} + +const 
struct fdtype_ops eventfd_dump_ops = { + .type = FD_TYPES__EVENTFD, + .dump = dump_one_eventfd, +}; + +static int eventfd_open(struct file_desc *d, int *new_fd) +{ + struct eventfd_file_info *info; + int tmp; + + info = container_of(d, struct eventfd_file_info, d); + + tmp = eventfd(info->efe->counter, 0); + if (tmp < 0) { + pr_perror("Can't create eventfd %#08x", + info->efe->id); + return -1; + } + + if (rst_file_params(tmp, info->efe->fown, info->efe->flags)) { + pr_perror("Can't restore params on eventfd %#08x", + info->efe->id); + goto err_close; + } + + *new_fd = tmp; + return 0; + +err_close: + close(tmp); + return -1; +} + +static struct file_desc_ops eventfd_desc_ops = { + .type = FD_TYPES__EVENTFD, + .open = eventfd_open, +}; + +static int collect_one_efd(void *obj, ProtobufCMessage *msg, struct cr_img *i) +{ + struct eventfd_file_info *info = obj; + + info->efe = pb_msg(msg, EventfdFileEntry); + pr_info_eventfd("Collected ", info->efe); + return file_desc_add(&info->d, info->efe->id, &eventfd_desc_ops); +} + +struct collect_image_info eventfd_cinfo = { + .fd_type = CR_FD_EVENTFD_FILE, + .pb_type = PB_EVENTFD_FILE, + .priv_size = sizeof(struct eventfd_file_info), + .collect = collect_one_efd, +}; diff --git a/CRIU_code/criu/eventpoll.c b/CRIU_code/criu/eventpoll.c new file mode 100644 index 0000000..e1384fa --- /dev/null +++ b/CRIU_code/criu/eventpoll.c @@ -0,0 +1,502 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "types.h" +#include "crtools.h" +#include "common/compiler.h" +#include "imgset.h" +#include "rst_info.h" +#include "eventpoll.h" +#include "fdinfo.h" +#include "image.h" +#include "util.h" +#include "log.h" +#include "pstree.h" +#include "parasite.h" +#include "kerndat.h" +#include "file-ids.h" +#include "kcmp-ids.h" + +#include "protobuf.h" +#include "images/eventpoll.pb-c.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "epoll: " + +static 
LIST_HEAD(dinfo_list); + +typedef struct { + uint32_t tfd; + uint32_t off; + uint32_t idx; +} toff_t; + +struct eventpoll_dinfo { + struct list_head list; + + FileEntry *fe; + EventpollFileEntry *e; + + toff_t *toff; + FownEntry fown; + + pid_t pid; + int efd; +}; + +struct eventpoll_file_info { + EventpollFileEntry *efe; + struct file_desc d; +}; + +/* Checks if @link is an anon link of type [eventpoll] */ +int is_eventpoll_link(char *link) +{ + return is_anon_link_type(link, "[eventpoll]"); +} + +static void pr_info_eventpoll_tfd(char *action, uint32_t id, EventpollTfdEntry *e) +{ + pr_info("%seventpoll-tfd: id %#08x tfd %8d events %#08x data %#016"PRIx64"\n", + action, id, e->tfd, e->events, e->data); +} + +static void pr_info_eventpoll(char *action, EventpollFileEntry *e) +{ + pr_info("%seventpoll: id %#08x flags %#04x\n", action, e->id, e->flags); +} + +static int queue_dinfo(FileEntry **fe, EventpollFileEntry **e, toff_t **toff, const struct fd_parms *p) +{ + struct eventpoll_dinfo *dinfo; + + pr_info_eventpoll("Queueing ", *e); + + dinfo = xmalloc(sizeof(*dinfo)); + if (!dinfo) + return -ENOMEM; + + memcpy(&dinfo->fown, &p->fown, sizeof(dinfo->fown)); + + INIT_LIST_HEAD(&dinfo->list); + + dinfo->fe = *fe; + dinfo->e = *e; + dinfo->toff = *toff; + dinfo->e->fown = &dinfo->fown; + dinfo->pid = p->pid; + dinfo->efd = p->fd; + + *fe = NULL; + *e = NULL; + *toff = NULL; + + list_add_tail(&dinfo->list, &dinfo_list); + return 0; +} + +static void dequeue_dinfo(struct eventpoll_dinfo *dinfo) +{ + ssize_t i; + + for (i = 0; i < dinfo->e->n_tfd; i++) + eventpoll_tfd_entry__free_unpacked(dinfo->e->tfd[i], NULL); + + xfree(dinfo->fe); + xfree(dinfo->e->tfd); + xfree(dinfo->e); + xfree(dinfo->toff); + + list_del(&dinfo->list); + + xfree(dinfo); +} + +int flush_eventpoll_dinfo_queue(void) +{ + struct eventpoll_dinfo *dinfo, *t; + ssize_t i; + + list_for_each_entry_safe(dinfo, t, &dinfo_list, list) { + EventpollFileEntry *e = dinfo->e; + + for (i = 0; i < e->n_tfd; i++) { + 
EventpollTfdEntry *tfde = e->tfd[i]; + struct kid_elem ke = { + .pid = dinfo->pid, + .genid = make_gen_id(tfde->dev, + tfde->inode, + tfde->pos), + .idx = tfde->tfd, + }; + kcmp_epoll_slot_t slot = { + .efd = dinfo->efd, + .tfd = tfde->tfd, + .toff = dinfo->toff[i].off, + }; + struct kid_elem *t = kid_lookup_epoll_tfd(&fd_tree, &ke, &slot); + if (!t) { + pr_debug("kid_lookup_epoll: no match pid %d efd %d tfd %d toff %u\n", + dinfo->pid, dinfo->efd, tfde->tfd, dinfo->toff[i].off); + goto err; + } + + pr_debug("kid_lookup_epoll: rbsearch match pid %d efd %d tfd %d toff %u -> %d\n", + dinfo->pid, dinfo->efd, tfde->tfd, dinfo->toff[i].off, t->idx); + + /* Make sure the pid matches */ + if (t->pid != dinfo->pid) { + pr_debug("kid_lookup_epoll: pid mismatch %d %d efd %d tfd %d toff %u\n", + dinfo->pid, t->pid, dinfo->efd, tfde->tfd, dinfo->toff[i].off); + goto err; + } + + tfde->tfd = t->idx; + } + + pr_info_eventpoll("Dumping ", e); + if (pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), dinfo->fe, PB_FILE)) + goto err; + + for (i = 0; i < e->n_tfd; i++) + pr_info_eventpoll_tfd("Dumping: ", e->id, e->tfd[i]); + + dequeue_dinfo(dinfo); + } + + return 0; + +err: + list_for_each_entry_safe(dinfo, t, &dinfo_list, list) + dequeue_dinfo(dinfo); + + return -1; +} + +static int tfd_cmp(const void *a, const void *b) +{ + if (((int *)a)[0] > ((int *)b)[0]) + return 1; + if (((int *)a)[0] < ((int *)b)[0]) + return -1; + return 0; +} + +static int toff_cmp(const void *a, const void *b) +{ + if (((toff_t *)a)[0].tfd > ((toff_t *)b)[0].tfd) + return 1; + if (((toff_t *)a)[0].tfd < ((toff_t *)b)[0].tfd) + return -1; + if (((toff_t *)a)[0].idx > ((toff_t *)b)[0].idx) + return 1; + if (((toff_t *)a)[0].idx < ((toff_t *)b)[0].idx) + return -1; + return 0; +} + +/* + * fds in fd_parms are sorted so we can use binary search + * for better performance. 
+ */ +static int find_tfd_bsearch(pid_t pid, int efd, int fds[], size_t nr_fds, + int tfd, unsigned int toff) +{ + kcmp_epoll_slot_t slot = { + .efd = efd, + .tfd = tfd, + .toff = toff, + }; + int *tfd_found; + + pr_debug("find_tfd_bsearch: pid %d efd %d tfd %d toff %u\n", pid, efd, tfd, toff); + + /* + * Optimistic case: the target fd belongs to us + * and wasn't dup'ed. + */ + tfd_found = bsearch(&tfd, fds, nr_fds, sizeof(int), tfd_cmp); + if (tfd_found) { + if (kdat.has_kcmp_epoll_tfd) { + if (syscall(SYS_kcmp, pid, pid, KCMP_EPOLL_TFD, tfd, &slot) == 0) { + pr_debug("find_tfd_bsearch (kcmp-yes): bsearch match pid %d efd %d tfd %d toff %u\n", + pid, efd, tfd, toff); + return tfd; + } + } else { + pr_debug("find_tfd_bsearch (kcmp-no): bsearch match pid %d efd %d tfd %d toff %u\n", + pid, efd, tfd, toff); + return tfd; + } + } + + pr_debug("find_tfd_bsearch: no match pid %d efd %d tfd %d toff %u\n", + pid, efd, tfd, toff); + return -1; +} + +static int dump_one_eventpoll(int lfd, u32 id, const struct fd_parms *p) +{ + toff_t *toff_base, *toff = NULL; + EventpollFileEntry *e = NULL; + FileEntry *fe = NULL; + int ret = -1; + ssize_t i; + + e = xmalloc(sizeof(*e)); + if (!e) + goto out; + eventpoll_file_entry__init(e); + + fe = xmalloc(sizeof(*fe)); + if (!fe) + goto out; + file_entry__init(fe); + + e->id = id; + e->flags = p->flags; + e->fown = (FownEntry *)&p->fown; + + if (parse_fdinfo(lfd, FD_TYPES__EVENTPOLL, e)) + goto out; + + fe->type = FD_TYPES__EVENTPOLL; + fe->id = e->id; + fe->epfd = e; + + /* + * In regular case there is no so many dup'ed + * descriptors so instead of complex mappings + * lets rather walk over members with O(n^2) + */ + if (p->dfds) { + toff = xmalloc(sizeof(*toff) * e->n_tfd); + if (!toff) + goto out; + for (i = 0; i < e->n_tfd; i++) { + toff[i].idx = i; + toff[i].tfd = e->tfd[i]->tfd; + toff[i].off = 0; + } + + qsort(toff, e->n_tfd, sizeof(*toff), toff_cmp); + + toff_base = NULL; + for (i = 1; i < e->n_tfd; i++) { + if (toff[i].tfd == 
toff[i - 1].tfd) { + if (!toff_base) + toff_base = &toff[i - 1]; + toff[i].off = toff[i].idx - toff_base->idx; + } else + toff_base = NULL; + } + } + + /* + * Handling dup'ed or transferred target + * files is tricky: we need to use kcmp + * to find out where file came from. Until + * it's implemented lets use simpler approach + * just check the targets are belonging to the + * pid's file set. + */ + if (p->dfds) { + for (i = 0; i < e->n_tfd; i++) { + int tfd = find_tfd_bsearch(p->pid, p->fd, p->dfds->fds, + p->dfds->nr_fds, e->tfd[i]->tfd, toff[i].off); + if (tfd == -1) { + if (kdat.has_kcmp_epoll_tfd) { + ret = queue_dinfo(&fe, &e, &toff, p); + } else { + pr_err("Escaped/closed fd descriptor %d on pid %d\n", + e->tfd[i]->tfd, p->pid); + } + goto out; + } + } + } else + pr_warn_once("Unix SCM files are not verified\n"); + + pr_info_eventpoll("Dumping ", e); + ret = pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), fe, PB_FILE); + if (!ret) { + for (i = 0; i < e->n_tfd; i++) + pr_info_eventpoll_tfd("Dumping: ", e->id, e->tfd[i]); + } + +out: + for (i = 0; e && i < e->n_tfd; i++) + eventpoll_tfd_entry__free_unpacked(e->tfd[i], NULL); + xfree(fe); + if (e) + xfree(e->tfd); + xfree(e); + xfree(toff); + + return ret; +} + +const struct fdtype_ops eventpoll_dump_ops = { + .type = FD_TYPES__EVENTPOLL, + .dump = dump_one_eventpoll, +}; + +static int eventpoll_post_open(struct file_desc *d, int fd); + +static int eventpoll_open(struct file_desc *d, int *new_fd) +{ + struct fdinfo_list_entry *fle = file_master(d); + struct eventpoll_file_info *info; + int tmp; + + info = container_of(d, struct eventpoll_file_info, d); + + if (fle->stage >= FLE_OPEN) + return eventpoll_post_open(d, fle->fe->fd); + + pr_info_eventpoll("Restore ", info->efe); + + tmp = epoll_create(1); + if (tmp < 0) { + pr_perror("Can't create epoll %#08x", + info->efe->id); + return -1; + } + + if (rst_file_params(tmp, info->efe->fown, info->efe->flags)) { + pr_perror("Can't restore file params on epoll 
%#08x", + info->efe->id); + goto err_close; + } + + *new_fd = tmp; + return 1; +err_close: + close(tmp); + return -1; +} + +static int epoll_not_ready_tfd(EventpollTfdEntry *tdefe) +{ + struct fdinfo_list_entry *fle; + + list_for_each_entry(fle, &rsti(current)->fds, ps_list) { + if (tdefe->tfd != fle->fe->fd) + continue; + + if (fle->desc->ops->type == FD_TYPES__EVENTPOLL) + return (fle->stage < FLE_OPEN); + else + return (fle->stage != FLE_RESTORED); + } + + /* + * If tgt fle is not on the fds list, it's already + * restored (see open_fdinfos), so we're ready. + */ + return 0; +} + +static int eventpoll_retore_tfd(int fd, int id, EventpollTfdEntry *tdefe) +{ + struct epoll_event event; + + pr_info_eventpoll_tfd("Restore ", id, tdefe); + + event.events = tdefe->events; + event.data.u64 = tdefe->data; + if (epoll_ctl(fd, EPOLL_CTL_ADD, tdefe->tfd, &event)) { + pr_perror("Can't add event on %#08x", id); + return -1; + } + + return 0; +} + +static int eventpoll_post_open(struct file_desc *d, int fd) +{ + struct eventpoll_file_info *info; + int i; + + info = container_of(d, struct eventpoll_file_info, d); + + for (i = 0; i < info->efe->n_tfd; i++) { + if (epoll_not_ready_tfd(info->efe->tfd[i])) + return 1; + } + for (i = 0; i < info->efe->n_tfd; i++) { + if (eventpoll_retore_tfd(fd, info->efe->id, info->efe->tfd[i])) + return -1; + } + + return 0; +} + +static struct file_desc_ops desc_ops = { + .type = FD_TYPES__EVENTPOLL, + .open = eventpoll_open, +}; + +static int collect_one_epoll_tfd(void *o, ProtobufCMessage *msg, struct cr_img *i) +{ + EventpollTfdEntry *tfde; + struct file_desc *d; + struct eventpoll_file_info *ef; + EventpollFileEntry *efe; + int n_tfd; + + if (!deprecated_ok("Epoll TFD image")) + return -1; + + tfde = pb_msg(msg, EventpollTfdEntry); + d = find_file_desc_raw(FD_TYPES__EVENTPOLL, tfde->id); + if (!d) { + pr_err("No epoll FD for %u\n", tfde->id); + return -1; + } + + ef = container_of(d, struct eventpoll_file_info, d); + efe = ef->efe; + + n_tfd 
= efe->n_tfd + 1; + if (xrealloc_safe(&efe->tfd, n_tfd * sizeof(EventpollTfdEntry *))) + return -1; + + efe->tfd[efe->n_tfd] = tfde; + efe->n_tfd = n_tfd; + + return 0; +} + +struct collect_image_info epoll_tfd_cinfo = { + .fd_type = CR_FD_EVENTPOLL_TFD, + .pb_type = PB_EVENTPOLL_TFD, + .collect = collect_one_epoll_tfd, + .flags = COLLECT_NOFREE, +}; + +static int collect_one_epoll(void *o, ProtobufCMessage *msg, struct cr_img *i) +{ + struct eventpoll_file_info *info = o; + + info->efe = pb_msg(msg, EventpollFileEntry); + pr_info_eventpoll("Collected ", info->efe); + return file_desc_add(&info->d, info->efe->id, &desc_ops); +} + +struct collect_image_info epoll_cinfo = { + .fd_type = CR_FD_EVENTPOLL_FILE, + .pb_type = PB_EVENTPOLL_FILE, + .priv_size = sizeof(struct eventpoll_file_info), + .collect = collect_one_epoll, +}; diff --git a/CRIU_code/criu/external.c b/CRIU_code/criu/external.c new file mode 100644 index 0000000..96e6768 --- /dev/null +++ b/CRIU_code/criu/external.c @@ -0,0 +1,94 @@ +#include "common/err.h" +#include "common/list.h" +#include "cr_options.h" +#include "xmalloc.h" +#include "mount.h" +#include "external.h" +#include "util.h" + +#include "net.h" + +int add_external(char *key) +{ + struct external *ext; + + ext = xmalloc(sizeof(*ext)); + if (!ext) + return -1; + ext->id = key; + + if (strstartswith(key, "macvlan") && macvlan_ext_add(ext) < 0) { + xfree(ext); + return -1; + } + + if (strstartswith(key, "mnt[]")) { + xfree(ext); + return ext_mount_parse_auto(key + 5); + } + + list_add(&ext->node, &opts.external); + + return 0; +} + +bool external_lookup_id(char *id) +{ + struct external *ext; + + list_for_each_entry(ext, &opts.external, node) + if (!strcmp(ext->id, id)) + return true; + return false; +} + +void *external_lookup_data(char *key) +{ + struct external *ext; + int len = strlen(key); + + list_for_each_entry(ext, &opts.external, node) { + if (strncmp(ext->id, key, len)) + continue; + + return ext->data; + } + + return 
ERR_PTR(-ENOENT); +} + +char *external_lookup_by_key(char *key) +{ + struct external *ext; + int len = strlen(key); + + list_for_each_entry(ext, &opts.external, node) { + if (strncmp(ext->id, key, len)) + continue; + if (ext->id[len] == ':') + return ext->id + len + 1; + else if (ext->id[len] == '\0') + return NULL; + } + return ERR_PTR(-ENOENT); +} + +int external_for_each_type(char *type, int (*cb)(struct external *, void *), void *arg) +{ + struct external *ext; + int ln = strlen(type); + int ret = 0; + + list_for_each_entry(ext, &opts.external, node) { + if (strncmp(ext->id, type, ln)) + continue; + if (ext->id[ln] != '[') + continue; + + ret = cb(ext, arg); + if (ret) + break; + } + + return ret; +} diff --git a/CRIU_code/criu/fault-injection.c b/CRIU_code/criu/fault-injection.c new file mode 100644 index 0000000..4128814 --- /dev/null +++ b/CRIU_code/criu/fault-injection.c @@ -0,0 +1,22 @@ +#include +#include "fault-injection.h" + +enum faults fi_strategy; + +int fault_injection_init() +{ + char *val; + int start; + + val = getenv("CRIU_FAULT"); + if (val == NULL) + return 0; + + start = atoi(val); + + if (start <= 0 || start >= FI_MAX) + return -1; + + fi_strategy = start; + return 0; +} diff --git a/CRIU_code/criu/fdstore.c b/CRIU_code/criu/fdstore.c new file mode 100644 index 0000000..a4583fd --- /dev/null +++ b/CRIU_code/criu/fdstore.c @@ -0,0 +1,128 @@ +#include +#include +#include +#include +#include +#include + +#include "common/scm.h" +#include "common/lock.h" +#include "servicefd.h" +#include "fdstore.h" +#include "xmalloc.h" +#include "rst-malloc.h" +#include "log.h" + +static struct fdstore_desc { + int next_id; + mutex_t lock; /* to protect a peek offset */ +} *desc; + +int fdstore_init(void) +{ + /* In kernel a bufsize has type int and a value is doubled. 
*/ + uint32_t buf[2] = { INT_MAX / 2, INT_MAX / 2 }; + struct sockaddr_un addr; + unsigned int addrlen; + struct stat st; + int sk, ret; + + desc = shmalloc(sizeof(*desc)); + if (!desc) + return -1; + + desc->next_id = 0; + mutex_init(&desc->lock); + + sk = socket(AF_UNIX, SOCK_DGRAM | SOCK_NONBLOCK, 0); + if (sk < 0) { + pr_perror("Unable to create a socket"); + return -1; + } + + if (fstat(sk, &st)) { + pr_perror("Unable to stat a file descriptor"); + close(sk); + return -1; + } + + if (setsockopt(sk, SOL_SOCKET, SO_SNDBUFFORCE, &buf[0], sizeof(buf[0])) < 0 || + setsockopt(sk, SOL_SOCKET, SO_RCVBUFFORCE, &buf[1], sizeof(buf[1])) < 0) { + pr_perror("Unable to set SO_SNDBUFFORCE/SO_RCVBUFFORCE"); + close(sk); + return -1; + } + + addr.sun_family = AF_UNIX; + addrlen = snprintf(addr.sun_path, sizeof(addr.sun_path), "X/criu-fdstore-%"PRIx64, st.st_ino); + addrlen += sizeof(addr.sun_family); + + addr.sun_path[0] = 0; + + /* + * This socket is connected to itself, so all messages are queued to + * its receive queue. Here we are going to use this socket to store + * file descriptors. For that we need to send a file descriptor in + * a queue and remember its sequence number. Then we can set SO_PEEK_OFF + * to get a file descriptor without dequeuing it. 
+ */ + if (bind(sk, (struct sockaddr *) &addr, addrlen)) { + pr_perror("Unable to bind a socket"); + close(sk); + return -1; + } + if (connect(sk, (struct sockaddr *) &addr, addrlen)) { + pr_perror("Unable to connect a socket"); + close(sk); + return -1; + } + + ret = install_service_fd(FDSTORE_SK_OFF, sk); + if (ret < 0) + return -1; + + return 0; +} + +int fdstore_add(int fd) /* sends @fd into the store socket under desc->lock; returns its id or -1 */ +{ + int sk = get_service_fd(FDSTORE_SK_OFF); + int id, ret; + + mutex_lock(&desc->lock); + + ret = send_fd(sk, NULL, 0, fd); + if (ret) { + pr_perror("Can't send fd %d into store", fd); + mutex_unlock(&desc->lock); + return -1; + } + + id = desc->next_id++; + + mutex_unlock(&desc->lock); + + return id; +} + +int fdstore_get(int id) /* peeks the fd queued at offset @id (SO_PEEK_OFF + MSG_PEEK) under desc->lock; returns it or -1 */ +{ + int sk = get_service_fd(FDSTORE_SK_OFF); + int fd; + + mutex_lock(&desc->lock); + if (setsockopt(sk, SOL_SOCKET, SO_PEEK_OFF, &id, sizeof(id))) { + mutex_unlock(&desc->lock); + pr_perror("Unable to set a peek offset"); + return -1; + } + + if (__recv_fds(sk, &fd, 1, NULL, 0, MSG_PEEK) < 0) { + mutex_unlock(&desc->lock); + pr_perror("Unable to get a file descriptor with the %d id", id); + return -1; + } + mutex_unlock(&desc->lock); + + return fd; +} diff --git a/CRIU_code/criu/fifo.c b/CRIU_code/criu/fifo.c new file mode 100644 index 0000000..a269343 --- /dev/null +++ b/CRIU_code/criu/fifo.c @@ -0,0 +1,184 @@ +#include +#include +#include +#include +#include + +#include "imgset.h" +#include "image.h" +#include "files.h" +#include "files-reg.h" +#include "file-ids.h" +#include "pipes.h" + +#include "fifo.h" + +#include "protobuf.h" +#include "images/regfile.pb-c.h" +#include "images/fifo.pb-c.h" + +/* + * FIFO checkpoint and restore is done in a bit unusual manner. + * We use files-reg.c engine to save fifo path and flags, + * thus regular files image will contain fifo descriptors which + * are useless for reg-files engine itself but needed for our fifo + * engine. 
+ * + * In particular we dump fifo-entry automatically and appropriate + * reg-file entry manually, thus on restore we need to ask reg-file + * engine to restore fifo path and flags via direct call. + */ + +struct fifo_info { + struct list_head list; + struct file_desc d; + FifoEntry *fe; + bool restore_data; +}; + +static LIST_HEAD(fifo_head); +static struct pipe_data_dump pd_fifo = { .img_type = CR_FD_FIFO_DATA, }; + +static int dump_one_fifo(int lfd, u32 id, const struct fd_parms *p) +{ + struct cr_img *img = img_from_set(glob_imgset, CR_FD_FILES); + FileEntry fe = FILE_ENTRY__INIT; + FifoEntry e = FIFO_ENTRY__INIT; + u32 rf_id; + + fd_id_generate_special(NULL, &rf_id); + + /* + * It's a trick here, we use regular files dumping + * code to save path to a fifo, then we reuse it + * on restore. + */ + if (dump_one_reg_file(lfd, rf_id, p)) + return -1; + + pr_info("Dumping fifo %d with id %#x pipe_id %#x\n", + lfd, id, pipe_id(p)); + + e.id = id; + e.pipe_id = pipe_id(p); + e.has_regf_id = true; + e.regf_id = rf_id; + + fe.type = FD_TYPES__FIFO; + fe.id = e.id; + fe.fifo = &e; + + if (pb_write_one(img, &fe, PB_FILE)) + return -1; + + return dump_one_pipe_data(&pd_fifo, lfd, p); +} + +const struct fdtype_ops fifo_dump_ops = { + .type = FD_TYPES__FIFO, + .dump = dump_one_fifo, +}; + +static struct pipe_data_rst *pd_hash_fifo[PIPE_DATA_HASH_SIZE]; + +static int do_open_fifo(int ns_root_fd, struct reg_file_info *rfi, void *arg) +{ + struct fifo_info *info = arg; + int new_fifo, fake_fifo = -1; + + /* + * The fifos (except read-write fifos) do wait until + * another pipe-end get connected, so to be able to + * proceed the restoration procedure we open a fake + * fifo here. 
+ */ + fake_fifo = openat(ns_root_fd, rfi->path, O_RDWR); + if (fake_fifo < 0) { + pr_perror("Can't open fake fifo %#x [%s]", info->fe->id, rfi->path); + return -1; + } + + new_fifo = openat(ns_root_fd, rfi->path, rfi->rfe->flags); + if (new_fifo < 0) { + pr_perror("Can't open fifo %#x [%s]", info->fe->id, rfi->path); + goto out; + } + + if (info->restore_data) + if (restore_pipe_data(CR_FD_FIFO_DATA, fake_fifo, + info->fe->pipe_id, pd_hash_fifo)) { + close(new_fifo); + new_fifo = -1; + } + +out: + close(fake_fifo); + return new_fifo; +} + +static int open_fifo_fd(struct file_desc *d, int *new_fd) +{ + struct fifo_info *info = container_of(d, struct fifo_info, d); + struct file_desc *reg_d; + int fd; + + reg_d = collect_special_file(info->fe->has_regf_id ? + info->fe->regf_id : info->fe->id); + if (!reg_d) + return -1; + + fd = open_path(reg_d, do_open_fifo, info); + if (fd < 0) + return -1; + *new_fd = fd; + return 0; +} + +static struct file_desc_ops fifo_desc_ops = { + .type = FD_TYPES__FIFO, + .open = open_fifo_fd, +}; + +static int collect_one_fifo(void *o, ProtobufCMessage *base, struct cr_img *i) +{ + struct fifo_info *info = o, *f; + + info->fe = pb_msg(base, FifoEntry); + pr_info("Collected fifo entry ID %#x PIPE ID %#x\n", + info->fe->id, info->fe->pipe_id); + + /* check who will restore the fifo data */ + list_for_each_entry(f, &fifo_head, list) + if (f->fe->pipe_id == info->fe->pipe_id) + break; + + if (&f->list == &fifo_head) { + list_add(&info->list, &fifo_head); + info->restore_data = true; + } else { + INIT_LIST_HEAD(&info->list); + info->restore_data = false; + } + + return file_desc_add(&info->d, info->fe->id, &fifo_desc_ops); + +} + +struct collect_image_info fifo_cinfo = { + .fd_type = CR_FD_FIFO, + .pb_type = PB_FIFO, + .priv_size = sizeof(struct fifo_info), + .collect = collect_one_fifo, +}; + +static int collect_fifo_data(void *obj, ProtobufCMessage *msg, struct cr_img *img) +{ + return do_collect_pipe_data(obj, msg, img, pd_hash_fifo); +} + 
+struct collect_image_info fifo_data_cinfo = { + .fd_type = CR_FD_FIFO_DATA, + .pb_type = PB_PIPE_DATA, + .priv_size = sizeof(struct pipe_data_rst), + .collect = collect_fifo_data, +}; diff --git a/CRIU_code/criu/file-ids.c b/CRIU_code/criu/file-ids.c new file mode 100644 index 0000000..006e47d --- /dev/null +++ b/CRIU_code/criu/file-ids.c @@ -0,0 +1,113 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "int.h" +#include "file-ids.h" +#include "rbtree.h" +#include "kcmp-ids.h" +#include "common/compiler.h" +#include "image.h" +#include "util.h" +#include "irmap.h" +#include "files.h" + +DECLARE_KCMP_TREE(fd_tree, KCMP_FILE); + +#define FDID_BITS 5 +#define FDID_SIZE (1 << FDID_BITS) +#define FDID_MASK (FDID_SIZE - 1) + +static inline int fdid_hashfn(unsigned int s_dev, unsigned long i_ino) +{ + return (s_dev + i_ino) & FDID_MASK; +} + +struct fd_id { + int mnt_id; + unsigned int dev; + unsigned long ino; + u32 id; + struct fd_id *n; +}; + +static struct fd_id *fd_id_cache[FDID_SIZE]; + +static void fd_id_cache_one(u32 id, struct fd_parms *p) +{ + struct fd_id *fi; + unsigned hv; + + fi = xmalloc(sizeof(*fi)); + if (fi) { + fi->dev = p->stat.st_dev; + fi->ino = p->stat.st_ino; + fi->mnt_id = p->mnt_id; + fi->id = id; + + hv = fdid_hashfn(p->stat.st_dev, p->stat.st_ino); + fi->n = fd_id_cache[hv]; + fd_id_cache[hv] = fi; + } +} + +static struct fd_id *fd_id_cache_lookup(struct fd_parms *p) +{ + struct stat *st = &p->stat; + struct fd_id *fi; + + for (fi = fd_id_cache[fdid_hashfn(st->st_dev, st->st_ino)]; + fi; fi = fi->n) + if (fi->dev == st->st_dev && + fi->ino == st->st_ino && + fi->mnt_id == p->mnt_id) + return fi; + + return NULL; +} + +int fd_id_generate_special(struct fd_parms *p, u32 *id) +{ + if (p) { + struct fd_id *fi; + + fi = fd_id_cache_lookup(p); + if (fi) { + *id = fi->id; + return 0; + } + } + + *id = fd_tree.subid++; + if (p) + fd_id_cache_one(*id, p); + return 1; +} + +int 
fd_id_generate(pid_t pid, FdinfoEntry *fe, struct fd_parms *p) +{ + u32 id; + struct kid_elem e; + int new_id = 0; + + e.pid = pid; + e.genid = fe->id; + e.idx = fe->fd; + + id = kid_generate_gen(&fd_tree, &e, &new_id); + if (!id) + return -ENOMEM; + + if (new_id) + fd_id_cache_one(id, p); + + fe->id = id; + return new_id; +} diff --git a/CRIU_code/criu/file-lock.c b/CRIU_code/criu/file-lock.c new file mode 100644 index 0000000..8be7589 --- /dev/null +++ b/CRIU_code/criu/file-lock.c @@ -0,0 +1,718 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cr_options.h" +#include "imgset.h" +#include "files.h" +#include "fs-magic.h" +#include "kerndat.h" +#include "image.h" +#include "util.h" +#include "mount.h" +#include "proc_parse.h" +#include "servicefd.h" +#include "file-lock.h" +#include "pstree.h" +#include "files-reg.h" + +struct file_lock_rst { + FileLockEntry *fle; + struct list_head l; +}; + +struct list_head file_lock_list = LIST_HEAD_INIT(file_lock_list); + +static int collect_one_file_lock(void *o, ProtobufCMessage *m, struct cr_img *i) +{ + struct file_lock_rst *lr = o; + + lr->fle = pb_msg(m, FileLockEntry); + list_add_tail(&lr->l, &file_lock_list); + + return 0; +} + +struct collect_image_info file_locks_cinfo = { + .fd_type = CR_FD_FILE_LOCKS, + .pb_type = PB_FILE_LOCK, + .priv_size = sizeof(struct file_lock_rst), + .collect = collect_one_file_lock, +}; + +struct file_lock *alloc_file_lock(void) +{ + struct file_lock *flock; + + flock = xzalloc(sizeof(*flock)); + if (!flock) + return NULL; + + INIT_LIST_HEAD(&flock->list); + flock->real_owner = -1; + flock->owners_fd = -1; + flock->fl_holder = -1; + + return flock; +} + +void free_file_locks(void) +{ + struct file_lock *flock, *tmp; + + list_for_each_entry_safe(flock, tmp, &file_lock_list, list) { + xfree(flock); + } + + INIT_LIST_HEAD(&file_lock_list); +} + +static int dump_one_file_lock(FileLockEntry *fle) +{ + pr_info("LOCK flag: %d,type: 
%d,pid: %d,fd: %d,start: %8"PRIx64",len: %8"PRIx64"\n", + fle->flag, fle->type, fle->pid, fle->fd, fle->start, fle->len); + + return pb_write_one(img_from_set(glob_imgset, CR_FD_FILE_LOCKS), + fle, PB_FILE_LOCK); +} + +static void fill_flock_entry(FileLockEntry *fle, int fl_kind, int fl_ltype) +{ + fle->flag |= fl_kind; + fle->type = fl_ltype; +} + +int dump_file_locks(void) +{ + FileLockEntry fle; + struct file_lock *fl; + int ret = 0; + + pr_info("Dumping file-locks\n"); + + list_for_each_entry(fl, &file_lock_list, list) { + if (fl->real_owner == -1) { + if (fl->fl_kind == FL_POSIX) { + pr_err("Unresolved lock found pid %d ino %ld\n", + fl->fl_owner, fl->i_no); + return -1; + } + + continue; + } + + file_lock_entry__init(&fle); + fle.pid = fl->real_owner; + fle.fd = fl->owners_fd; + fill_flock_entry(&fle, fl->fl_kind, fl->fl_ltype); + fle.start = fl->start; + if (!strncmp(fl->end, "EOF", 3)) + fle.len = 0; + else + fle.len = (atoll(fl->end) + 1) - fl->start; + + ret = dump_one_file_lock(&fle); + if (ret) { + pr_err("Dump file lock failed!\n"); + goto err; + } + } + +err: + return ret; +} + +static int lock_btrfs_file_match(pid_t pid, int fd, struct file_lock *fl, struct fd_parms *p) +{ + int phys_dev = MKKDEV(fl->maj, fl->min); + char link[PATH_MAX], t[32]; + struct ns_id *ns; + int ret; + + snprintf(t, sizeof(t), "/proc/%d/fd/%d", pid, fd); + ret = readlink(t, link, sizeof(link)) - 1; + if (ret < 0) { + pr_perror("Can't read link of fd %d", fd); + return -1; + } else if ((size_t)ret == sizeof(link)) { + pr_err("Buffer for read link of fd %d is too small\n", fd); + return -1; + } + link[ret] = 0; + + ns = lookup_nsid_by_mnt_id(p->mnt_id); + return phys_stat_dev_match(p->stat.st_dev, phys_dev, ns, link); +} + +static inline int lock_file_match(pid_t pid, int fd, struct file_lock *fl, struct fd_parms *p) +{ + dev_t dev = p->stat.st_dev; + + if (fl->i_no != p->stat.st_ino) + return 0; + + /* + * Get the right devices for BTRFS. 
Look at phys_stat_resolve_dev() + * for more details. + */ + if (p->fs_type == BTRFS_SUPER_MAGIC) { + if (p->mnt_id != -1) { + struct mount_info *m; + + m = lookup_mnt_id(p->mnt_id); + BUG_ON(m == NULL); + dev = kdev_to_odev(m->s_dev); + } else /* old kernel */ + return lock_btrfs_file_match(pid, fd, fl, p); + } + + return makedev(fl->maj, fl->min) == dev; +} + +static int lock_check_fd(int lfd, struct file_lock *fl) +{ + int ret; + + if (fl->fl_ltype & LOCK_MAND) + ret = flock(lfd, LOCK_MAND | LOCK_RW); + else + ret = flock(lfd, LOCK_EX | LOCK_NB); + pr_debug(" `- %d/%d\n", ret, errno); + if (ret != 0) { + if (errno != EAGAIN) { + pr_err("Bogus lock test result %d\n", ret); + return -1; + } + + return 0; + } else { + /* + * The ret == 0 means, that new lock doesn't conflict + * with any others on the file. But since we do know, + * that there should be some other one (file is found + * in /proc/locks), it means that the lock is already + * on file pointed by fd. + */ + pr_debug(" `- downgrading lock back\n"); + if (fl->fl_ltype & LOCK_MAND) + ret = flock(lfd, fl->fl_ltype); + else if (fl->fl_ltype == F_RDLCK) + ret = flock(lfd, LOCK_SH); + if (ret) { + pr_err("Can't downgrade lock back %d\n", ret); + return -1; + } + } + + return 1; +} + +static int lock_ofd_check_fd(int lfd, struct file_lock *fl) +{ + int ret; + + struct flock lck = { + .l_whence = SEEK_SET, + .l_type = F_WRLCK, + .l_start = fl->start + }; + if (strcmp(fl->end, "EOF")) { + unsigned long end; + + ret = sscanf(fl->end, "%lu", &end); + if (ret <= 0) { + pr_err("Invalid lock entry\n"); + return -1; + } + lck.l_len = end - fl->start + 1; + } else { + lck.l_len = 0; + } + + ret = fcntl(lfd, F_OFD_SETLK, &lck); + pr_debug(" `- %d/%d\n", ret, errno); + if (ret != 0) { + if (errno != EAGAIN) { + pr_err("Bogus lock test result %d\n", ret); + return -1; + } + + return 0; + } else { + /* + * The ret == 0 means, that new lock doesn't conflict + * with any others on the file. 
But since we do know, + * that there should be some other one (file is found + * in /proc/locks), it means that the lock is already + * on file pointed by fd. + */ + pr_debug(" `- downgrading lock back\n"); + if (fl->fl_ltype & LOCK_WRITE) + lck.l_type = F_WRLCK; + else + lck.l_type = F_RDLCK; + + ret = fcntl(lfd, F_OFD_SETLK, &lck); + if (ret) { + pr_err("Can't downgrade lock back %d\n", ret); + return -1; + } + } + + return 1; +} + +static int lease_check_fd(int fd, int file_flags, struct file_lock *fl) +{ + int file_lease_type, err; + int lease_type = fl->fl_ltype & (~LEASE_BREAKING); + + if ((file_flags & O_ACCMODE) != O_RDONLY) { + /* + * Write OFD conflicts with any lease not associated + * with it, therefore there can't be any other lease + * or OFD for this file. + */ + return 1; + } + + file_lease_type = fcntl(fd, F_GETLEASE); + if (file_lease_type < 0) { + pr_err("Can't get lease type\n"); + return -1; + } + + /* + * Only read OFDs can be present for the file. If + * read and write OFDs with at least one lease had + * been present, it would have conflicted. + */ + if (fl->fl_ltype & LEASE_BREAKING) { + /* + * Only read leases are possible for read OFDs + * and they all should be in breaking state, + * because the current one is. + */ + int compatible_type = file_lease_type; + + if (compatible_type != F_UNLCK) { + pr_err("Lease doesn't conflicts but breaks\n"); + return -1; + } + /* + * Due to activated breaking sequence we can't + * get actual lease type with F_GETLEASE. + * The err == 0 after lease upgrade means, that + * there is already read lease on OFD. Otherwise + * it would fail, because current read lease is + * still set and breaking. + */ + err = fcntl(fd, F_SETLEASE, F_RDLCK); + if (err < 0) { + if (errno != EAGAIN) { + pr_perror("Can't set lease (fd %i)", fd); + return -1; + } + return 0; + } + return 1; + } else { + /* + * The file can have only non-breaking read + * leases, because otherwise the current one + * would also be breaking. 
+ */ + if (lease_type != F_RDLCK) { + pr_err("Incorrect lease type\n"); + return -1; + } + + if (file_lease_type == F_UNLCK) + return 0; + if (file_lease_type == F_RDLCK) + return 1; + pr_err("Invalid file lease type\n"); + return -1; + } +} + +int note_file_lock(struct pid *pid, int fd, int lfd, struct fd_parms *p) +{ + struct file_lock *fl; + int ret; + + if (kdat.has_fdinfo_lock) + return 0; + + list_for_each_entry(fl, &file_lock_list, list) { + ret = lock_file_match(pid->real, fd, fl, p); + if (ret < 0) + return -1; + if (ret == 0) + continue; + + if (!opts.handle_file_locks) { + pr_err("Some file locks are hold by dumping tasks!" + "You can try --" OPT_FILE_LOCKS " to dump them.\n"); + return -1; + } + + if (fl->fl_kind == FL_POSIX) { + /* + * POSIX locks cannot belong to anyone + * but creator. + */ + if (fl->fl_owner != pid->real) + continue; + } else if (fl->fl_kind == FL_LEASE) { + if (fl->owners_fd >= 0) + continue; + if (fl->fl_owner != pid->real && + fl->real_owner != -1) + continue; + + ret = lease_check_fd(lfd, p->flags, fl); + if (ret < 0) + return ret; + if (ret == 0) + continue; + } else /* fl->fl_kind == FL_FLOCK || fl->fl_kind == FL_OFD */ { + int ret; + + /* + * OFD locks & FLOCKs can be inherited across fork, + * thus we can have any task as lock + * owner. But the creator is preferred + * anyway. 
+ */ + + if (fl->fl_owner != pid->real && + fl->real_owner != -1) + continue; + + pr_debug("Checking lock holder %d:%d\n", pid->real, fd); + if (fl->fl_kind == FL_FLOCK) + ret = lock_check_fd(lfd, fl); + else + ret = lock_ofd_check_fd(lfd, fl); + + if (ret < 0) + return ret; + if (ret == 0) + continue; + } + + fl->fl_holder = pid->real; + fl->real_owner = pid->ns[0].virt; + fl->owners_fd = fd; + + pr_info("Found lock entry %d.%d %d vs %d\n", + pid->real, pid->ns[0].virt, fd, + fl->fl_owner); + } + + return 0; +} + +void discard_dup_locks_tail(pid_t pid, int fd) +{ + struct file_lock *fl, *p; + + list_for_each_entry_safe_reverse(fl, p, &file_lock_list, list) { + if (fl->owners_fd != fd || pid != fl->fl_holder) + break; + + list_del(&fl->list); + xfree(fl); + } +} + +int correct_file_leases_type(struct pid *pid, int fd, int lfd) +{ + struct file_lock *fl; + int target_type; + + list_for_each_entry(fl, &file_lock_list, list) { + /* owners_fd should be set before usage */ + if (fl->fl_holder != pid->real || fl->owners_fd != fd) + continue; + + if (fl->fl_kind == FL_LEASE && + (fl->fl_ltype & LEASE_BREAKING)) { + /* + * Set lease type to actual 'target lease type' + * instead of 'READ' returned by procfs. 
+			 */
+			target_type = fcntl(lfd, F_GETLEASE);
+			if (target_type < 0) {
+				/* Log through pr_perror() like the other lease helpers in this file. */
+				pr_perror("Can't get lease type");
+				return -1;
+			}
+			fl->fl_ltype &= ~O_ACCMODE;
+			fl->fl_ltype |= target_type;
+			break;
+		}
+	}
+	return 0;
+}
+
+/*
+ * open_path() callback that provokes a lease break.
+ *
+ * Opens rfi->path relative to ns_root_fd with the flags passed through
+ * @arg (a pointer to int) plus O_NONBLOCK.  The open is expected to fail
+ * with EWOULDBLOCK; an open that succeeds means no conflicting lease was
+ * there, which is reported as an error and the descriptor is closed.
+ *
+ * Returns 0 when the open failed with EWOULDBLOCK, -1 in every other case.
+ */
+static int open_break_cb(int ns_root_fd, struct reg_file_info *rfi, void *arg)
+{
+	int fd, flags = *(int *)arg | O_NONBLOCK;
+
+	fd = openat(ns_root_fd, rfi->path, flags);
+	if (fd >= 0) {
+		pr_err("Conflicting lease wasn't found\n");
+		close(fd);
+		return -1;
+	} else if (errno != EWOULDBLOCK) {
+		/* No "\n" here: pr_perror() calls elsewhere in this file omit it. */
+		pr_perror("Can't break lease");
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * Put the lease of @desc into the breaking state.
+ *
+ * @lease_type is the lease type with LEASE_BREAKING possibly set; the flag
+ * is masked off to get the target type.  Returns the result of open_path()
+ * run with open_break_cb(), or -1 for an unsupported target type.
+ */
+static int break_lease(int lease_type, struct file_desc *desc)
+{
+	int target_type = lease_type & (~LEASE_BREAKING);
+	int break_flags;
+
+	/*
+	 * Flags for open call chosen in a way to even
+	 * 'target lease type' returned by fcntl(F_GETLEASE)
+	 * and lease type from the image.
+	 */
+	if (target_type == F_UNLCK) {
+		break_flags = O_WRONLY;
+	} else if (target_type == F_RDLCK) {
+		break_flags = O_RDONLY;
+	} else {
+		pr_err("Incorrect target lease type\n");
+		return -1;
+	}
+	return open_path(desc, open_break_cb, (void *)&break_flags);
+}
+
+/*
+ * Take a lease of the given @type on @fd.
+ *
+ * The fsuid is switched to the owner of the file for the duration of the
+ * fcntl(F_SETLEASE) call and restored afterwards.
+ *
+ * Returns the fcntl() result, or -1 if the file cannot be stat-ed.
+ */
+static int set_file_lease(int fd, int type)
+{
+	int old_fsuid, ret;
+	struct stat st;
+
+	if (fstat(fd, &st)) {
+		pr_perror("Can't get file stat (%i)", fd);
+		return -1;
+	}
+
+	/*
+	 * An unprivileged process may take out a lease only if
+	 * uid of the file matches the fsuid of the process.
+	 */
+	old_fsuid = setfsuid(st.st_uid);
+
+	ret = fcntl(fd, F_SETLEASE, type);
+	if (ret < 0)
+		pr_perror("Can't set lease");
+
+	setfsuid(old_fsuid);
+	return ret;
+}
+
+static int restore_lease_prebreaking_state(int fd, int fd_type)
+{
+	int access_flags = fd_type & O_ACCMODE;
+	int lease_type = (access_flags == O_RDONLY) ? 
F_RDLCK : F_WRLCK; + + return set_file_lease(fd, lease_type); +} + +static struct fdinfo_list_entry *find_fd_unordered(struct pstree_item *task, + int fd) +{ + struct list_head *head = &rsti(task)->fds; + struct fdinfo_list_entry *fle; + + list_for_each_entry_reverse(fle, head, ps_list) { + if (fle->fe->fd == fd) + return fle; + } + return NULL; +} + +static int restore_breaking_file_lease(FileLockEntry *fle) +{ + struct fdinfo_list_entry *fdle; + int ret; + + fdle = find_fd_unordered(current, fle->fd); + if (fdle == NULL) { + pr_err("Can't get file description\n"); + return -1; + } + + ret = restore_lease_prebreaking_state(fle->fd, fdle->desc->ops->type); + if (ret) + return ret; + + /* + * It could be broken by 2 types of open call: + * 1. non-blocking: It failed because of the lease. + * 2. blocking: It had been blocked at the moment + * of dumping, otherwise lease wouldn't be broken. + * Thus, it was canceled by CRIU. + * + * There are no files or leases in image, which will + * conflict with each other. Therefore we should explicitly + * break leases. Restoring can be done in any order. + */ + return break_lease(fle->type, fdle->desc); +} + +static int restore_file_lease(FileLockEntry *fle) +{ + sigset_t blockmask, oldmask; + int signum_fcntl, signum, ret; + + if (fle->type & LEASE_BREAKING) { + signum_fcntl = fcntl(fle->fd, F_GETSIG); + signum = signum_fcntl ? 
signum_fcntl : SIGIO; + if (signum_fcntl < 0) { + pr_perror("Can't get file i/o signum\n"); + return -1; + } + if (sigemptyset(&blockmask) || + sigaddset(&blockmask, signum) || + sigprocmask(SIG_BLOCK, &blockmask, &oldmask)) { + pr_perror("Can't block file i/o signal\n"); + return -1; + } + + ret = restore_breaking_file_lease(fle); + + if (sigprocmask(SIG_SETMASK, &oldmask, NULL)) { + pr_perror("Can't restore sigmask\n"); + ret = -1; + } + return ret; + } else { + ret = set_file_lease(fle->fd, fle->type); + if (ret < 0) + pr_perror("Can't restore non breaking lease"); + return ret; + } +} + +static int restore_file_lock(FileLockEntry *fle) +{ + int ret = -1; + unsigned int cmd; + + if (fle->flag & FL_FLOCK) { + if (fle->type & LOCK_MAND) { + cmd = fle->type; + } else if (fle->type == F_RDLCK) { + cmd = LOCK_SH; + } else if (fle->type == F_WRLCK) { + cmd = LOCK_EX; + } else if (fle->type == F_UNLCK) { + cmd = LOCK_UN; + } else { + pr_err("Unknown flock type!\n"); + goto err; + } + + pr_info("(flock)flag: %d, type: %d, cmd: %d, pid: %d, fd: %d\n", + fle->flag, fle->type, cmd, fle->pid, fle->fd); + + ret = flock(fle->fd, cmd); + if (ret < 0) { + pr_err("Can not set flock!\n"); + goto err; + } + } else if (fle->flag & FL_POSIX) { + struct flock flk; + memset(&flk, 0, sizeof(flk)); + + flk.l_whence = SEEK_SET; + flk.l_start = fle->start; + flk.l_len = fle->len; + flk.l_pid = fle->pid; + flk.l_type = fle->type; + + pr_info("(posix)flag: %d, type: %d, pid: %d, fd: %d, " + "start: %8"PRIx64", len: %8"PRIx64"\n", + fle->flag, fle->type, fle->pid, fle->fd, + fle->start, fle->len); + + ret = fcntl(fle->fd, F_SETLKW, &flk); + if (ret < 0) { + pr_err("Can not set posix lock!\n"); + goto err; + } + } else if (fle->flag & FL_OFD) { + struct flock flk = { + .l_whence = SEEK_SET, + .l_start = fle->start, + .l_len = fle->len, + .l_pid = 0, + .l_type = fle->type + }; + + pr_info("(ofd)flag: %d, type: %d, pid: %d, fd: %d, " + "start: %8"PRIx64", len: %8"PRIx64"\n", + fle->flag, 
fle->type, fle->pid, fle->fd, + fle->start, fle->len); + + ret = fcntl(fle->fd, F_OFD_SETLK, &flk); + if (ret < 0) { + pr_err("Can not set ofd lock!\n"); + goto err; + } + } else if (fle->flag & FL_LEASE) { + pr_info("(lease)flag: %d, type: %d, pid: %d, fd: %d, " + "start: %8"PRIx64", len: %8"PRIx64"\n", + fle->flag, fle->type, fle->pid, fle->fd, + fle->start, fle->len); + ret = restore_file_lease(fle); + if (ret < 0) + goto err; + } else { + pr_err("Unknown file lock style!\n"); + goto err; + } + + return 0; +err: + return ret; +} + +static int restore_file_locks(int pid) +{ + int ret = 0; + struct file_lock_rst *lr; + + list_for_each_entry(lr, &file_lock_list, l) { + if (lr->fle->pid == pid) { + ret = restore_file_lock(lr->fle); + if (ret) + break; + } + } + + return ret; +} + +int prepare_file_locks(int pid) +{ + if (!opts.handle_file_locks) + return 0; + + return restore_file_locks(pid); + +} diff --git a/CRIU_code/criu/files-ext.c b/CRIU_code/criu/files-ext.c new file mode 100644 index 0000000..a6247d6 --- /dev/null +++ b/CRIU_code/criu/files-ext.c @@ -0,0 +1,98 @@ +/* An external file is a file, which is dumped with help a plugin */ + +#include + +#include "imgset.h" +#include "files.h" +#include "plugin.h" + +#include "protobuf.h" +#include "images/ext-file.pb-c.h" + +static int dump_one_ext_file(int lfd, u32 id, const struct fd_parms *p) +{ + int ret; + struct cr_img *rimg; + FileEntry fe = FILE_ENTRY__INIT; + ExtFileEntry xfe = EXT_FILE_ENTRY__INIT; + + ret = run_plugins(DUMP_EXT_FILE, lfd, id); + if (ret < 0) + return ret; + + xfe.id = id; + xfe.fown = (FownEntry *)&p->fown; + + fe.type = FD_TYPES__EXT; + fe.id = xfe.id; + fe.ext = &xfe; + + rimg = img_from_set(glob_imgset, CR_FD_FILES); + return pb_write_one(rimg, &fe, PB_FILE); +} + +const struct fdtype_ops ext_dump_ops = { + .type = FD_TYPES__EXT, + .dump = dump_one_ext_file, +}; + +struct ext_file_info { + struct file_desc d; + ExtFileEntry *xfe; +}; + +static int open_fd(struct file_desc *d, int 
*new_fd) +{ + struct ext_file_info *xfi; + int fd; + + xfi = container_of(d, struct ext_file_info, d); + + fd = run_plugins(RESTORE_EXT_FILE, xfi->xfe->id); + if (fd < 0) { + pr_err("Unable to restore %#x\n", xfi->xfe->id); + return -1; + } + + if (restore_fown(fd, xfi->xfe->fown)) + return -1; + + *new_fd = fd; + return 0; +} + +static struct file_desc_ops ext_desc_ops = { + .type = FD_TYPES__EXT, + .open = open_fd, +}; + +static int collect_one_ext(void *o, ProtobufCMessage *base, struct cr_img *i) +{ + struct ext_file_info *xfi = o; + + xfi->xfe = pb_msg(base, ExtFileEntry); + + pr_info("Collected external file with ID %#x\n", xfi->xfe->id); + return file_desc_add(&xfi->d, xfi->xfe->id, &ext_desc_ops); +} + +struct collect_image_info ext_file_cinfo = { + .fd_type = CR_FD_EXT_FILES, + .pb_type = PB_EXT_FILE, + .priv_size = sizeof(struct ext_file_info), + .collect = collect_one_ext, +}; + +int dump_unsupp_fd(struct fd_parms *p, int lfd, + char *more, char *info, FdinfoEntry *e) +{ + int ret; + + ret = do_dump_gen_file(p, lfd, &ext_dump_ops, e); + if (ret == 0) + return 0; + if (ret == -ENOTSUP) + pr_err("Can't dump file %d of that type [%o] (%s %s)\n", + p->fd, p->stat.st_mode, more, info); + return -1; +} diff --git a/CRIU_code/criu/files-reg.c b/CRIU_code/criu/files-reg.c new file mode 100644 index 0000000..d982126 --- /dev/null +++ b/CRIU_code/criu/files-reg.c @@ -0,0 +1,2037 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef SEEK_DATA +#define SEEK_DATA 3 +#define SEEK_HOLE 4 +#endif + +/* Stolen from kernel/fs/nfs/unlink.c */ +#define SILLYNAME_PREF ".nfs" +#define SILLYNAME_SUFF_LEN (((unsigned)sizeof(u64) << 1) + ((unsigned)sizeof(unsigned int) << 1)) + +#include "cr_options.h" +#include "imgset.h" +#include "file-ids.h" +#include "mount.h" +#include "files.h" +#include "common/list.h" +#include "rst-malloc.h" +#include "fs-magic.h" +#include 
"namespaces.h" +#include "proc_parse.h" +#include "pstree.h" +#include "fault-injection.h" +#include "external.h" + +#include "protobuf.h" +#include "util.h" +#include "images/regfile.pb-c.h" +#include "images/remap-file-path.pb-c.h" + +#include "files-reg.h" +#include "plugin.h" + +int setfsuid(uid_t fsuid); +int setfsgid(gid_t fsuid); + +/* + * Ghost files are those not visible from the FS. Dumping them is + * nasty and the only way we have -- just carry its contents with + * us. Any brave soul to implement link unlinked file back? + */ +struct ghost_file { + struct list_head list; + u32 id; + + u32 dev; + u32 ino; + + struct file_remap remap; +}; + +static u32 ghost_file_ids = 1; +static LIST_HEAD(ghost_files); + +/* + * When opening remaps we first create a link on the remap + * target, then open one, then unlink. In case the remap + * source has more than one instance, these tree steps + * should be serialized with each other. + */ +static mutex_t *remap_open_lock; + +static inline int init_remap_lock(void) +{ + remap_open_lock = shmalloc(sizeof(*remap_open_lock)); + if (!remap_open_lock) + return -1; + + mutex_init(remap_open_lock); + return 0; +} + +static LIST_HEAD(remaps); + +/* + * Remember the name to delete it if needed on error or + * rollback action. Note we don't expect that there will + * be a HUGE number of link remaps, so in a sake of speed + * we keep all data in memory. 
+ */ +struct link_remap_rlb { + struct list_head list; + struct ns_id *mnt_ns; + char *path; +}; + +static int note_link_remap(char *path, struct ns_id *nsid) +{ + struct link_remap_rlb *rlb; + + rlb = xmalloc(sizeof(*rlb)); + if (!rlb) + goto err; + + rlb->path = xstrdup(path); + if (!rlb->path) + goto err2; + + rlb->mnt_ns = nsid; + list_add(&rlb->list, &remaps); + + return 0; + +err2: + xfree(rlb); +err: + pr_err("Can't note link remap for %s\n", path); + return -1; +} + +/* Trim "a/b/c/d" to "a/b/d" */ +static int trim_last_parent(char *path) +{ + char *fname, *p; + + p = strrchr(path, '/'); + fname = p + 1; + if (!p || *fname == '\0') + return -1; + + while (p >= path && *p == '/') + p--; + + if (p < path) + return -1; + + while (p >= path && *p != '/') + p--; + p++; + + while (*fname != '\0') + *p++ = *fname++; + *p = '\0'; + + return 0; +} + +#define BUFSIZE (4096) + +static int copy_chunk_from_file(int fd, int img, off_t off, size_t len) +{ + char *buf = NULL; + int ret; + + if (opts.remote) { + buf = xmalloc(BUFSIZE); + if (!buf) + return -1; + } + + while (len > 0) { + if (opts.remote) { + ret = pread(fd, buf, min_t(size_t, BUFSIZE, len), off); + if (ret <= 0) { + pr_perror("Can't read from ghost file"); + xfree(buf); + return -1; + } + if (write(img, buf, ret) != ret) { + pr_perror("Can't write to image"); + xfree(buf); + return -1; + } + off += ret; + } else { + ret = sendfile(img, fd, &off, len); + if (ret <= 0) { + pr_perror("Can't send ghost to image"); + return -1; + } + } + + len -= ret; + } + + xfree(buf); + return 0; +} + +static int copy_file_to_chunks(int fd, struct cr_img *img, size_t file_size) +{ + GhostChunkEntry ce = GHOST_CHUNK_ENTRY__INIT; + off_t data, hole = 0; + + while (hole < file_size) { + data = lseek(fd, hole, SEEK_DATA); + if (data < 0) { + if (errno == ENXIO) + /* No data */ + break; + else if (hole == 0) { + /* No SEEK_HOLE/DATA by FS */ + data = 0; + hole = file_size; + } else { + pr_perror("Can't seek file data"); + return 
-1;
+			}
+		} else {
+			hole = lseek(fd, data, SEEK_HOLE);
+			if (hole < 0) {
+				pr_perror("Can't seek file hole");
+				return -1;
+			}
+		}
+
+		ce.len = hole - data;
+		ce.off = data;
+
+		if (pb_write_one(img, &ce, PB_GHOST_CHUNK))
+			return -1;
+
+		if (copy_chunk_from_file(fd, img_raw_fd(img), ce.off, ce.len))
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Copy @len bytes from the image descriptor @img into @fd at offset @off.
+ *
+ * With opts.remote set the data goes through a BUFSIZE bounce buffer
+ * (read() + pwrite()); otherwise @fd is positioned with lseek() and the
+ * data is moved with sendfile().
+ *
+ * Returns 0 on success, -1 on any I/O error or if the image ends early.
+ */
+static int copy_chunk_to_file(int img, int fd, off_t off, size_t len)
+{
+	char *buf = NULL;
+	int ret;
+
+	if (opts.remote) {
+		buf = xmalloc(BUFSIZE);
+		if (!buf)
+			return -1;
+	}
+
+	while (len > 0) {
+		if (opts.remote) {
+			ret = read(img, buf, min_t(size_t, BUFSIZE, len));
+			if (ret <= 0) {
+				pr_perror("Can't read from image");
+				xfree(buf);
+				return -1;
+			}
+			if (pwrite(fd, buf, ret, off) != ret) {
+				pr_perror("Can't write to file");
+				xfree(buf);
+				return -1;
+			}
+		} else {
+			if (lseek(fd, off, SEEK_SET) < 0) {
+				pr_perror("Can't seek file");
+				return -1;
+			}
+			ret = sendfile(fd, img, NULL, len);
+			if (ret < 0) {
+				pr_perror("Can't send data");
+				return -1;
+			}
+			if (ret == 0) {
+				/*
+				 * Nothing was transferred, so len would never
+				 * shrink and the loop would spin forever.
+				 */
+				pr_err("Unexpected end of image while sending data\n");
+				return -1;
+			}
+		}
+
+		off += ret;
+		len -= ret;
+	}
+
+	xfree(buf);
+	return 0;
+}
+
+/*
+ * Rebuild a chunked ghost file: size @fd to @file_size, then read
+ * GhostChunkEntry records from @img until EOF and copy each chunk's data
+ * to its offset in @fd.
+ *
+ * Returns 0 at end of image, a negative value on error.
+ */
+static int copy_file_from_chunks(struct cr_img *img, int fd, size_t file_size)
+{
+	if (ftruncate(fd, file_size) < 0) {
+		pr_perror("Can't make file size");
+		return -1;
+	}
+
+	while (1) {
+		int ret;
+		GhostChunkEntry *ce;
+
+		ret = pb_read_one_eof(img, &ce, PB_GHOST_CHUNK);
+		if (ret <= 0)
+			return ret;
+
+		ret = copy_chunk_to_file(img_raw_fd(img), fd, ce->off, ce->len);
+		/* Release the entry on the failure path too. */
+		ghost_chunk_entry__free_unpacked(ce, NULL);
+		if (ret)
+			return -1;
+	}
+}
+
+static int mkreg_ghost(char *path, GhostFileEntry *gfe, struct cr_img *img)
+{
+	int gfd, ret;
+
+	gfd = open(path, O_WRONLY | O_CREAT | O_EXCL, gfe->mode);
+	if (gfd < 0)
+		return -1;
+
+	if (gfe->chunks) {
+		if (!gfe->has_size) {
+			pr_err("Corrupted ghost image -> no size\n");
+			close(gfd);
+			return -1;
+		}
+
+		ret = copy_file_from_chunks(img, gfd, gfe->size);
+	} else
+		ret = copy_file(img_raw_fd(img), gfd, 0);
+	if (ret < 0)
+		unlink(path);
+	close(gfd);
+
+	
return ret; +} + +static int ghost_apply_metadata(const char *path, GhostFileEntry *gfe) +{ + struct timeval tv[2]; + int ret = -1; + + if (chown(path, gfe->uid, gfe->gid) < 0) { + pr_perror("Can't reset user/group on ghost %s", path); + goto err; + } + + if (chmod(path, gfe->mode)) { + pr_perror("Can't set perms %o on ghost %s", gfe->mode, path); + goto err; + } + + if (gfe->atim) { + tv[0].tv_sec = gfe->atim->tv_sec; + tv[0].tv_usec = gfe->atim->tv_usec; + tv[1].tv_sec = gfe->mtim->tv_sec; + tv[1].tv_usec = gfe->mtim->tv_usec; + if (lutimes(path, tv)) { + pr_perror("Can't set access and modification times on ghost %s", path); + goto err; + } + } + + ret = 0; +err: + return ret; +} + +static int create_ghost(struct ghost_file *gf, GhostFileEntry *gfe, struct cr_img *img) +{ + struct mount_info *mi; + char path[PATH_MAX]; + int ret, root_len; + char *msg; + + root_len = ret = rst_get_mnt_root(gf->remap.rmnt_id, path, sizeof(path)); + if (ret < 0) { + pr_err("The %d mount is not found for ghost\n", gf->remap.rmnt_id); + goto err; + } + + /* Add a '/' only if we have no at the end */ + if (path[root_len-1] != '/') { + path[root_len++] = '/'; + path[root_len] = '\0'; + } + + snprintf(path + root_len, sizeof(path) - root_len, "%s", gf->remap.rpath); + ret = -1; + + mi = lookup_mnt_id(gf->remap.rmnt_id); + /* We get here while in service mntns */ + if (mi && try_remount_writable(mi, false)) + goto err; +again: + if (S_ISFIFO(gfe->mode)) { + if ((ret = mknod(path, gfe->mode, 0)) < 0) + msg = "Can't create node for ghost file"; + } else if (S_ISCHR(gfe->mode) || S_ISBLK(gfe->mode)) { + if (!gfe->has_rdev) { + pr_err("No rdev for ghost device\n"); + goto err; + } + if ((ret = mknod(path, gfe->mode, gfe->rdev)) < 0) + msg = "Can't create node for ghost dev"; + } else if (S_ISDIR(gfe->mode)) { + if ((ret = mkdirpat(AT_FDCWD, path, gfe->mode)) < 0) + msg = "Can't make ghost dir"; + } else { + if ((ret = mkreg_ghost(path, gfe, img)) < 0) + msg = "Can't create ghost regfile"; + 
} + + if (ret < 0) { + /* Use grand parent, if parent directory does not exist */ + if (errno == ENOENT) { + if (trim_last_parent(path) < 0) { + pr_err("trim failed: @%s@\n", path); + goto err; + } + goto again; + } + + pr_perror("%s", msg); + goto err; + } + + strcpy(gf->remap.rpath, path + root_len); + pr_debug("Remap rpath is %s\n", gf->remap.rpath); + + ret = -1; + if (ghost_apply_metadata(path, gfe)) + goto err; + + ret = 0; +err: + return ret; +} + +static inline void ghost_path(char *path, int plen, + struct reg_file_info *rfi, RemapFilePathEntry *rpe) +{ + snprintf(path, plen, "%s.cr.%x.ghost", rfi->path, rpe->remap_id); +} + +static int collect_remap_ghost(struct reg_file_info *rfi, + RemapFilePathEntry *rpe) +{ + struct ghost_file *gf; + + list_for_each_entry(gf, &ghost_files, list) + if (gf->id == rpe->remap_id) + goto gf_found; + + /* + * Ghost not found. We will create one in the same dir + * as the very first client of it thus resolving any + * issues with cross-device links. + */ + + pr_info("Opening ghost file %#x for %s\n", rpe->remap_id, rfi->path); + + gf = shmalloc(sizeof(*gf)); + if (!gf) + return -1; + + /* + * The rpath is shmalloc-ed because we create the ghost + * file in root task context and generate its path there. + * However the path should be visible by the criu task + * in order to remove the ghost files from root FS (see + * try_clean_remaps()). 
+ */ + gf->remap.rpath = shmalloc(PATH_MAX); + if (!gf->remap.rpath) + return -1; + gf->remap.rpath[0] = 0; + gf->id = rpe->remap_id; + list_add_tail(&gf->list, &ghost_files); + +gf_found: + rfi->is_dir = gf->remap.is_dir; + rfi->remap = &gf->remap; + return 0; +} + +static int open_remap_ghost(struct reg_file_info *rfi, + RemapFilePathEntry *rpe) +{ + struct ghost_file *gf = container_of(rfi->remap, struct ghost_file, remap); + GhostFileEntry *gfe = NULL; + struct cr_img *img; + + if (rfi->remap->rpath[0]) + return 0; + + img = open_image(CR_FD_GHOST_FILE, O_RSTR, rpe->remap_id); + if (!img) + goto err; + + if (pb_read_one(img, &gfe, PB_GHOST_FILE) < 0) + goto close_ifd; + + /* + * For old formats where optional has_[dev|ino] is + * not present we will have zeros here which is quite + * a sign for "absent" fields. + */ + gf->dev = gfe->dev; + gf->ino = gfe->ino; + gf->remap.rmnt_id = rfi->rfe->mnt_id; + + if (S_ISDIR(gfe->mode)) + strncpy(gf->remap.rpath, rfi->path, PATH_MAX); + else + ghost_path(gf->remap.rpath, PATH_MAX, rfi, rpe); + + if (create_ghost(gf, gfe, img)) + goto close_ifd; + + close_image(img); + + gf->remap.is_dir = S_ISDIR(gfe->mode); + gf->remap.uid = gfe->uid; + gf->remap.gid = gfe->gid; + ghost_file_entry__free_unpacked(gfe, NULL); + + return 0; + +close_ifd: + close_image(img); +err: + if (gfe) + ghost_file_entry__free_unpacked(gfe, NULL); + return -1; +} + +static int collect_remap_linked(struct reg_file_info *rfi, + RemapFilePathEntry *rpe) +{ + struct file_remap *rm; + struct file_desc *rdesc; + struct reg_file_info *rrfi; + + rdesc = find_file_desc_raw(FD_TYPES__REG, rpe->remap_id); + if (!rdesc) { + pr_err("Can't find target file %x\n", rpe->remap_id); + return -1; + } + + rm = xmalloc(sizeof(*rm)); + if (!rm) + return -1; + + rrfi = container_of(rdesc, struct reg_file_info, d); + pr_info("Remapped %s -> %s\n", rfi->path, rrfi->path); + + rm->rpath = rrfi->path; + rm->is_dir = false; + rm->uid = -1; + rm->gid = -1; + rm->rmnt_id = 
rfi->rfe->mnt_id; + rfi->remap = rm; + return 0; +} + +static int open_remap_linked(struct reg_file_info *rfi) +{ + if (root_ns_mask & CLONE_NEWUSER) { + int rfd; + struct stat st; + + rfd = mntns_get_root_by_mnt_id(rfi->rfe->mnt_id); + if (fstatat(rfd, rfi->remap->rpath, &st, AT_SYMLINK_NOFOLLOW)) { + pr_perror("Can't get owner of link remap %s", rfi->remap->rpath); + return -1; + } + + rfi->remap->uid = st.st_uid; + rfi->remap->gid = st.st_gid; + } + + return 0; +} + +static int collect_remap_dead_process(struct reg_file_info *rfi, + RemapFilePathEntry *rfe) +{ + struct pstree_item *helper; + + helper = lookup_create_item(rfe->remap_id); + if (!helper) + return -1; + + if (helper->pid->state != TASK_UNDEF) { + pr_info("Skipping helper for restoring /proc/%d; pid exists\n", rfe->remap_id); + return 0; + } + + + helper->sid = root_item->sid; + helper->pgid = root_item->pgid; + helper->pid->ns[0].virt = rfe->remap_id; + helper->parent = root_item; + helper->ids = root_item->ids; + if (init_pstree_helper(helper)) { + pr_err("Can't init helper\n"); + return -1; + } + list_add_tail(&helper->sibling, &root_item->children); + + pr_info("Added a helper for restoring /proc/%d\n", vpid(helper)); + + return 0; +} + +struct remap_info { + struct list_head list; + RemapFilePathEntry *rpe; + struct reg_file_info *rfi; +}; + +static int collect_one_remap(void *obj, ProtobufCMessage *msg, struct cr_img *i) +{ + struct remap_info *ri = obj; + RemapFilePathEntry *rpe; + struct file_desc *fdesc; + + ri->rpe = rpe = pb_msg(msg, RemapFilePathEntry); + + if (!rpe->has_remap_type) { + rpe->has_remap_type = true; + /* backward compatibility with images */ + if (rpe->remap_id & REMAP_GHOST) { + rpe->remap_id &= ~REMAP_GHOST; + rpe->remap_type = REMAP_TYPE__GHOST; + } else + rpe->remap_type = REMAP_TYPE__LINKED; + } + + fdesc = find_file_desc_raw(FD_TYPES__REG, rpe->orig_id); + if (fdesc == NULL) { + pr_err("Remap for non existing file %#x\n", rpe->orig_id); + return -1; + } + + ri->rfi = 
container_of(fdesc, struct reg_file_info, d); + + switch (rpe->remap_type) { + case REMAP_TYPE__GHOST: + if (collect_remap_ghost(ri->rfi, ri->rpe)) + return -1; + break; + case REMAP_TYPE__LINKED: + if (collect_remap_linked(ri->rfi, ri->rpe)) + return -1; + break; + case REMAP_TYPE__PROCFS: + if (collect_remap_dead_process(ri->rfi, rpe) < 0) + return -1; + break; + default: + break; + } + + list_add_tail(&ri->list, &remaps); + + return 0; +} + +static int prepare_one_remap(struct remap_info *ri) +{ + int ret = -1; + RemapFilePathEntry *rpe = ri->rpe; + struct reg_file_info *rfi = ri->rfi; + + pr_info("Configuring remap %#x -> %#x\n", rfi->rfe->id, rpe->remap_id); + + switch (rpe->remap_type) { + case REMAP_TYPE__LINKED: + ret = open_remap_linked(rfi); + break; + case REMAP_TYPE__GHOST: + ret = open_remap_ghost(rfi, rpe); + break; + case REMAP_TYPE__PROCFS: + /* handled earlier by collect_remap_dead_process */ + ret = 0; + break; + default: + pr_err("unknown remap type %u\n", rpe->remap_type); + goto out; + } + +out: + return ret; +} + +int prepare_remaps(void) +{ + struct remap_info *ri; + int ret = 0; + + ret = init_remap_lock(); + if (ret) + return ret; + + list_for_each_entry(ri, &remaps, list) { + ret = prepare_one_remap(ri); + if (ret) + break; + } + + return ret; +} + +static int clean_one_remap(struct remap_info *ri) +{ + struct file_remap *remap = ri->rfi->remap; + int mnt_id, ret, rmntns_root; + struct mount_info *mi; + char path[PATH_MAX]; + + if (remap->rpath[0] == 0) + return 0; + + mnt_id = ri->rfi->rfe->mnt_id; /* rirfirfe %) */ + ret = rst_get_mnt_root(mnt_id, path, sizeof(path)); + if (ret < 0) + return -1; + if (ret >= sizeof(path) - 1) { + pr_err("The path buffer is too small\n"); + return -1; + } + + rmntns_root = open(path, O_RDONLY); + if (rmntns_root < 0) { + pr_perror("Unable to open %s", path); + return -1; + } + + mi = lookup_mnt_id(mnt_id); + /* We get here while in service mntns */ + if (mi && try_remount_writable(mi, false)) { + 
close(rmntns_root); + return -1; + } + + pr_info("Unlink remap %s\n", remap->rpath); + + ret = unlinkat(rmntns_root, remap->rpath, remap->is_dir ? AT_REMOVEDIR : 0); + if (ret < 0) { + close(rmntns_root); + pr_perror("Couldn't unlink remap %s %s", path, remap->rpath); + return -1; + } + close(rmntns_root); + remap->rpath[0] = 0; + + return 0; +} + +int try_clean_remaps(bool only_ghosts) +{ + struct remap_info *ri; + int ret = 0; + + list_for_each_entry(ri, &remaps, list) { + if (ri->rpe->remap_type == REMAP_TYPE__GHOST) + ret |= clean_one_remap(ri); + else if (only_ghosts) + continue; + else if (ri->rpe->remap_type == REMAP_TYPE__LINKED) + ret |= clean_one_remap(ri); + } + + return ret; +} + +static struct collect_image_info remap_cinfo = { + .fd_type = CR_FD_REMAP_FPATH, + .pb_type = PB_REMAP_FPATH, + .priv_size = sizeof(struct remap_info), + .collect = collect_one_remap, +}; + +/* Tiny files don't need to generate chunks in ghost image. */ +#define GHOST_CHUNKS_THRESH (3 * 4096) + +static int dump_ghost_file(int _fd, u32 id, const struct stat *st, dev_t phys_dev) +{ + struct cr_img *img; + GhostFileEntry gfe = GHOST_FILE_ENTRY__INIT; + Timeval atim = TIMEVAL__INIT, mtim = TIMEVAL__INIT; + + pr_info("Dumping ghost file contents (id %#x)\n", id); + + img = open_image(CR_FD_GHOST_FILE, O_DUMP, id); + if (!img) + return -1; + + gfe.uid = userns_uid(st->st_uid); + gfe.gid = userns_gid(st->st_gid); + gfe.mode = st->st_mode; + + gfe.atim = &atim; + gfe.mtim = &mtim; + gfe.atim->tv_sec = st->st_atim.tv_sec; + gfe.atim->tv_usec = st->st_atim.tv_nsec / 1000; + gfe.mtim->tv_sec = st->st_mtim.tv_sec; + gfe.mtim->tv_usec = st->st_mtim.tv_nsec / 1000; + + gfe.has_dev = gfe.has_ino = true; + gfe.dev = phys_dev; + gfe.ino = st->st_ino; + + if (S_ISCHR(st->st_mode) || S_ISBLK(st->st_mode)) { + gfe.has_rdev = true; + gfe.rdev = st->st_rdev; + } + + if (S_ISREG(st->st_mode) && (st->st_size >= GHOST_CHUNKS_THRESH)) { + gfe.has_chunks = gfe.chunks = true; + gfe.has_size = true; + 
gfe.size = st->st_size;
+	}
+
+	/* From here on every failure path must close img before returning. */
+	if (pb_write_one(img, &gfe, PB_GHOST_FILE)) {
+		close_image(img);
+		return -1;
+	}
+
+	if (S_ISREG(st->st_mode)) {
+		int fd, ret;
+		char lpath[PSFDS];
+
+		/*
+		 * Reopen file locally since it may have no read
+		 * permissions when drained
+		 */
+		snprintf(lpath, sizeof(lpath), "/proc/self/fd/%d", _fd);
+		fd = open(lpath, O_RDONLY);
+		if (fd < 0) {
+			pr_perror("Can't open ghost original file");
+			close_image(img);
+			return -1;
+		}
+
+		if (gfe.chunks)
+			ret = copy_file_to_chunks(fd, img, st->st_size);
+		else
+			ret = copy_file(fd, img_raw_fd(img), st->st_size);
+		close(fd);
+		if (ret) {
+			close_image(img);
+			return -1;
+		}
+	}
+
+	close_image(img);
+	return 0;
+}
+
+/*
+ * Find the remap of an already known ghost file by its device and inode.
+ *
+ * Returns a pointer to the remap stored in the matching ghost_files entry,
+ * or NULL when no entry has both the same @dev and @ino.
+ */
+struct file_remap *lookup_ghost_remap(u32 dev, u32 ino)
+{
+	struct ghost_file *gf;
+
+	list_for_each_entry(gf, &ghost_files, list) {
+		if (gf->ino == ino && (gf->dev == dev)) {
+			return &gf->remap;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Record a ghost remap for the file with id @id.
+ *
+ * Files larger than opts.ghost_limit are refused.  The ghost_files list is
+ * searched for an entry with the same physical device and inode; if none
+ * exists a new entry gets the next ghost id, is appended to the list and
+ * its contents are dumped with dump_ghost_file().  In both cases a
+ * REMAP_TYPE__GHOST entry mapping @id to the ghost id is written.
+ *
+ * Returns the pb_write_one() result, or -1 on an earlier failure.
+ */
+static int dump_ghost_remap(char *path, const struct stat *st,
+				int lfd, u32 id, struct ns_id *nsid)
+{
+	struct ghost_file *gf;
+	RemapFilePathEntry rpe = REMAP_FILE_PATH_ENTRY__INIT;
+	dev_t phys_dev;
+
+	pr_info("Dumping ghost file for fd %d id %#x\n", lfd, id);
+
+	if (st->st_size > opts.ghost_limit) {
+		pr_err("Can't dump ghost file %s of %"PRIu64" size, increase limit\n",
+				path, st->st_size);
+		return -1;
+	}
+
+	phys_dev = phys_stat_resolve_dev(nsid, st->st_dev, path);
+	list_for_each_entry(gf, &ghost_files, list)
+		if ((gf->dev == phys_dev) && (gf->ino == st->st_ino))
+			goto dump_entry;
+
+	gf = xmalloc(sizeof(*gf));
+	if (gf == NULL)
+		return -1;
+
+	gf->dev = phys_dev;
+	gf->ino = st->st_ino;
+	gf->id = ghost_file_ids++;
+	list_add_tail(&gf->list, &ghost_files);
+
+	if (dump_ghost_file(lfd, gf->id, st, phys_dev))
+		return -1;
+
+dump_entry:
+	rpe.orig_id = id;
+	rpe.remap_id = gf->id;
+	rpe.has_remap_type = true;
+	rpe.remap_type = REMAP_TYPE__GHOST;
+
+	return pb_write_one(img_from_set(glob_imgset, CR_FD_REMAP_FPATH),
+			&rpe, PB_REMAP_FPATH);
+}
+
+static void __rollback_link_remaps(bool do_unlink)
+{
+	struct link_remap_rlb 
*rlb, *tmp; + int mntns_root; + + list_for_each_entry_safe(rlb, tmp, &remaps, list) { + if (do_unlink) { + mntns_root = mntns_get_root_fd(rlb->mnt_ns); + if (mntns_root >= 0) + unlinkat(mntns_root, rlb->path, 0); + else + pr_err("Failed to clenaup %s link remap\n", rlb->path); + } + + list_del(&rlb->list); + xfree(rlb->path); + xfree(rlb); + } +} + +void delete_link_remaps(void) { __rollback_link_remaps(true); } +void free_link_remaps(void) { __rollback_link_remaps(false); } +static int linkat_hard(int odir, char *opath, int ndir, char *npath, uid_t uid, gid_t gid, int flags); + +static int create_link_remap(char *path, int len, int lfd, + u32 *idp, struct ns_id *nsid, + const struct stat *st) +{ + char link_name[PATH_MAX], *tmp; + FileEntry fe = FILE_ENTRY__INIT; + RegFileEntry rfe = REG_FILE_ENTRY__INIT; + FownEntry fwn = FOWN_ENTRY__INIT; + int mntns_root; + int ret; + + if (!opts.link_remap_ok) { + pr_err("Can't create link remap for %s. " + "Use " LREMAP_PARAM " option.\n", path); + return -1; + } + + /* + * Linked remapping -- we create a hard link on a removed file + * in the directory original file used to sit. + * + * Bad news is than we can't easily open lfd's parent dir. Thus + * we have to just generate an absolute path and use it. The linkat + * will fail if we chose the bad one. + */ + + link_name[0] = '.'; + memcpy(link_name + 1, path, len); + tmp = link_name + len; + while (*tmp != '/') { + BUG_ON(tmp == link_name); + tmp--; + } + + fd_id_generate_special(NULL, idp); + rfe.id = *idp; + rfe.flags = 0; + rfe.pos = 0; + rfe.fown = &fwn; + rfe.name = link_name + 1; + + /* Any 'unique' name works here actually. Remap works by reg-file ids. 
*/ + snprintf(tmp + 1, sizeof(link_name) - (size_t)(tmp - link_name - 1), "link_remap.%d", rfe.id); + + mntns_root = mntns_get_root_fd(nsid); + +again: + ret = linkat_hard(lfd, "", mntns_root, link_name, + st->st_uid, st->st_gid, AT_EMPTY_PATH); + if (ret < 0 && errno == ENOENT) { + /* Use grand parent, if parent directory does not exist. */ + if (trim_last_parent(link_name) < 0) { + pr_err("trim failed: @%s@\n", link_name); + return -1; + } + goto again; + } else if (ret < 0) { + pr_perror("Can't link remap to %s", path); + return -1; + } + + if (note_link_remap(link_name, nsid)) + return -1; + + fe.type = FD_TYPES__REG; + fe.id = rfe.id; + fe.reg = &rfe; + + return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); +} + +static int dump_linked_remap(char *path, int len, const struct stat *ost, + int lfd, u32 id, struct ns_id *nsid) +{ + u32 lid; + RemapFilePathEntry rpe = REMAP_FILE_PATH_ENTRY__INIT; + + if (create_link_remap(path, len, lfd, &lid, nsid, ost)) + return -1; + + rpe.orig_id = id; + rpe.remap_id = lid; + rpe.has_remap_type = true; + rpe.remap_type = REMAP_TYPE__LINKED; + + return pb_write_one(img_from_set(glob_imgset, CR_FD_REMAP_FPATH), + &rpe, PB_REMAP_FPATH); +} + +static pid_t *dead_pids; +static int n_dead_pids; + +int dead_pid_conflict(void) +{ + int i; + + for (i = 0; i < n_dead_pids; i++) { + struct pid *node; + pid_t pid = dead_pids[i]; + + node = pstree_pid_by_virt(pid); + if (!node) + continue; + + /* Main thread */ + if (node->state != TASK_THREAD) + continue; + + pr_err("Conflict with a dead task with the same PID as of this thread (virt %d, real %d).\n", + node->ns[0].virt, node->real); + return -1; + } + + return 0; +} + +static int have_seen_dead_pid(pid_t pid) +{ + int i; + + for (i = 0; i < n_dead_pids; i++) { + if (dead_pids[i] == pid) + return 1; + } + + if (xrealloc_safe(&dead_pids, sizeof(*dead_pids) * (n_dead_pids + 1))) + return -1; + dead_pids[n_dead_pids++] = pid; + + return 0; +} + +static int 
dump_dead_process_remap(pid_t pid, u32 id) +{ + RemapFilePathEntry rpe = REMAP_FILE_PATH_ENTRY__INIT; + int ret; + + ret = have_seen_dead_pid(pid); + if (ret < 0) + return -1; + if (ret) { + pr_info("Found dead pid %d already, skipping remap\n", pid); + return 0; + } + + rpe.orig_id = id; + rpe.remap_id = pid; + rpe.has_remap_type = true; + rpe.remap_type = REMAP_TYPE__PROCFS; + + return pb_write_one(img_from_set(glob_imgset, CR_FD_REMAP_FPATH), + &rpe, PB_REMAP_FPATH); +} + +static bool is_sillyrename_name(char *name) +{ + int i; + + name = strrchr(name, '/'); + BUG_ON(name == NULL); /* see check in dump_one_reg_file */ + name++; + + /* + * Strictly speaking this check is not bullet-proof. User + * can create file with this name by hands and we have no + * API to distinguish really-silly-renamed files from those + * fake names :( + * + * But since NFS people expect .nfsXXX files to be unstable, + * we treat them as such too. + */ + + if (strncmp(name, SILLYNAME_PREF, sizeof(SILLYNAME_PREF) - 1)) + return false; + + name += sizeof(SILLYNAME_PREF) - 1; + for (i = 0; i < SILLYNAME_SUFF_LEN; i++) + if (!isxdigit(name[i])) + return false; + + return true; +} + +static inline bool nfs_silly_rename(char *rpath, const struct fd_parms *parms) +{ + return (parms->fs_type == NFS_SUPER_MAGIC) && is_sillyrename_name(rpath); +} + +int strip_deleted(struct fd_link *link) +{ + struct dcache_prepends { + const char *str; + size_t len; + } static const prepends[] = { + { + .str = " (deleted)", + .len = 10, + }, { + .str = "//deleted", + .len = 9, + } + }; + size_t i; + + for (i = 0; i < ARRAY_SIZE(prepends); i++) { + size_t at; + + if (link->len <= prepends[i].len) + continue; + + at = link->len - prepends[i].len; + if (!strcmp(&link->name[at], prepends[i].str)) { + pr_debug("Strip '%s' tag from '%s'\n", + prepends[i].str, link->name); + link->name[at] = '\0'; + link->len -= prepends[i].len; + return 1; + } + } + return 0; +} + +static int check_path_remap(struct fd_link *link, 
const struct fd_parms *parms, + int lfd, u32 id, struct ns_id *nsid) +{ + char *rpath = link->name; + int plen = link->len; + int ret, mntns_root; + struct stat pst; + const struct stat *ost = &parms->stat; + + if (parms->fs_type == PROC_SUPER_MAGIC) { + /* The file points to /proc/pid/ where pid is a dead + * process. We remap this file by adding this pid to be + * fork()ed into a TASK_HELPER state so that we can point to it + * on restore. + */ + pid_t pid; + char *start, *end; + + /* skip "./proc/" */ + start = strstr(rpath, "/"); + if (!start) + return -1; + start = strstr(start + 1, "/"); + if (!start) /* it's /proc */ + return 0; + pid = strtol(start + 1, &end, 10); + + /* If strtol didn't convert anything, then we are looking at + * something like /proc/kmsg, which we shouldn't mess with. + * Anything under /proc/ (including that directory itself) + * can be c/r'd with a dead pid remap, so let's allow all such + * cases. + */ + if (pid != 0) { + bool is_dead = strip_deleted(link); + mntns_root = mntns_get_root_fd(nsid); + if (mntns_root < 0) + return -1; + + /* /proc/ will be "/proc/1 (deleted)" when it is + * dead, but a path like /proc/1/mountinfo won't have + * the suffix, since it isn't actually deleted (still + * exists, but the parent dir is deleted). So, if we + * have a path like /proc/1/mountinfo, test if /proc/1 + * exists instead, since this is what CRIU will need to + * open on restore. + */ + if (!is_dead) { + *end = 0; + is_dead = faccessat(mntns_root, rpath, F_OK, 0); + *end = '/'; + } + + if (is_dead) { + pr_info("Dumping dead process remap of %d\n", pid); + return dump_dead_process_remap(pid, id); + } + } + + return 0; + } else if (parms->fs_type == DEVPTS_SUPER_MAGIC) { + /* + * It's safe to call stripping here because + * file paths are having predefined format for + * this FS and can't have a valid " (deleted)" + * postfix as a part of not deleted filename. 
+ */ + strip_deleted(link); + /* + * Devpts devices/files are generated by the + * kernel itself so we should not try to generate + * any kind of ghost files here even if file is + * no longer exist. + */ + return 0; + } + + if (ost->st_nlink == 0) { + /* + * Unpleasant, but easy case. File is completely invisible + * from the FS. Just dump its contents and that's it. But + * be careful whether anybody still has any of its hardlinks + * also open. + */ + strip_deleted(link); + return dump_ghost_remap(rpath + 1, ost, lfd, id, nsid); + } + + if (nfs_silly_rename(rpath, parms)) { + /* + * If this is NFS silly-rename file the path we have at hands + * will be accessible by fstat(), but once we kill the dumping + * tasks it will disappear. So we just go ahead an dump it as + * linked-remap file (NFS will allow us to create more hard + * links on it) to have some persistent name at hands. + */ + pr_debug("Dump silly-rename linked remap for %x\n", id); + return dump_linked_remap(rpath + 1, plen - 1, ost, lfd, id, nsid); + } + + mntns_root = mntns_get_root_fd(nsid); + if (mntns_root < 0) + return -1; + + ret = fstatat(mntns_root, rpath, &pst, 0); + if (ret < 0) { + /* + * Linked file, but path is not accessible (unless any + * other error occurred). We can create a temporary link to it + * using linkat with AT_EMPTY_PATH flag and remap it to this + * name. + */ + + if (errno == ENOENT) + return dump_linked_remap(rpath + 1, plen - 1, + ost, lfd, id, nsid); + + pr_perror("Can't stat path"); + return -1; + } + + if ((pst.st_ino != ost->st_ino) || (pst.st_dev != ost->st_dev)) { + if (opts.evasive_devices && + (S_ISCHR(ost->st_mode) || S_ISBLK(ost->st_mode)) && + pst.st_rdev == ost->st_rdev) + return 0; + /* + * FIXME linked file, but the name we see it by is reused + * by somebody else. 
We can dump it with linked remaps, but + * we'll have difficulties on restore -- we will have to + * move the existing file aside, then restore this one, + * unlink, then move the original file back. It's fairly + * easy to do, but we don't do it now, since unlinked files + * have the "(deleted)" suffix in proc and name conflict + * is unlikely :) + */ + pr_err("Unaccessible path opened %u:%u, need %u:%u\n", + (int)pst.st_dev, (int)pst.st_ino, + (int)ost->st_dev, (int)ost->st_ino); + return -1; + } + + /* + * File is linked and visible by the name it is opened by + * this task. Go ahead and dump it. + */ + return 0; +} + +static bool should_check_size(int flags) +{ + /* Skip size if file has O_APPEND and O_WRONLY flags (e.g. log file). */ + if (((flags & O_ACCMODE) == O_WRONLY) && + (flags & O_APPEND)) + return false; + + return true; +} + +int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) +{ + struct fd_link _link, *link; + struct mount_info *mi; + struct cr_img *rimg; + char ext_id[64]; + FileEntry fe = FILE_ENTRY__INIT; + RegFileEntry rfe = REG_FILE_ENTRY__INIT; + + if (!p->link) { + if (fill_fdlink(lfd, p, &_link)) + return -1; + link = &_link; + } else + link = p->link; + + + + snprintf(ext_id, sizeof(ext_id), "file[%x:%"PRIx64"]", p->mnt_id, p->stat.st_ino); + if (external_lookup_id(ext_id)) { + /* the first symbol will be cut on restore to get an relative path*/ + rfe.name = xstrdup(ext_id); + rfe.ext = true; + rfe.has_ext = true; + goto ext; + } + + mi = lookup_mnt_id(p->mnt_id); + if (mi == NULL) { + pr_err("Can't lookup mount=%d for fd=%d path=%s\n", + p->mnt_id, p->fd, link->name + 1); + return -1; + } + + if (mnt_is_overmounted(mi)) { + pr_err("Open files on overmounted mounts are not supported yet\n"); + return -1; + } + + if (p->mnt_id >= 0 && (root_ns_mask & CLONE_NEWNS)) { + rfe.mnt_id = p->mnt_id; + rfe.has_mnt_id = true; + } + + pr_info("Dumping path for %d fd via self %d [%s]\n", + p->fd, lfd, &link->name[1]); + + /* + * The 
regular path we can handle should start with slash. + */ + if (link->name[1] != '/') { + pr_err("The path [%s] is not supported\n", &link->name[1]); + return -1; + } + + if (check_path_remap(link, p, lfd, id, mi->nsid)) + return -1; + rfe.name = &link->name[1]; +ext: + rfe.id = id; + rfe.flags = p->flags; + rfe.pos = p->pos; + rfe.fown = (FownEntry *)&p->fown; + rfe.has_mode = true; + rfe.mode = p->stat.st_mode; + + if (S_ISREG(p->stat.st_mode) && should_check_size(rfe.flags)) { + rfe.has_size = true; + rfe.size = p->stat.st_size; + } + + fe.type = FD_TYPES__REG; + fe.id = rfe.id; + fe.reg = &rfe; + + rimg = img_from_set(glob_imgset, CR_FD_FILES); + return pb_write_one(rimg, &fe, PB_FILE); +} + +const struct fdtype_ops regfile_dump_ops = { + .type = FD_TYPES__REG, + .dump = dump_one_reg_file, +}; + +static void convert_path_from_another_mp(char *src, char *dst, int dlen, + struct mount_info *smi, + struct mount_info *dmi) +{ + int off; + + /* + * mi->mountpoint ./foo/bar + * mi->ns_mountpoint /foo/bar + * rfi->path foo/bar/baz + */ + off = strlen(smi->ns_mountpoint + 1); + BUG_ON(strlen(smi->root) < strlen(dmi->root)); + + /* + * Create paths relative to this mount. + * Absolute path to the mount point + difference between source + * and destination roots + path relative to the mountpoint. 
+ */ + snprintf(dst, dlen, "./%s/%s/%s", + dmi->ns_mountpoint + 1, + smi->root + strlen(dmi->root), + src + off); +} + +static int linkat_hard(int odir, char *opath, int ndir, char *npath, uid_t uid, gid_t gid, int flags) +{ + struct __user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3]; + struct __user_cap_header_struct hdr; + int ret, old_fsuid = -1, old_fsgid = -1; + int errno_save; + + ret = linkat(odir, opath, ndir, npath, flags); + if (ret == 0) + return 0; + + if (!( (errno == EPERM || errno == EOVERFLOW) && (root_ns_mask & CLONE_NEWUSER) )) { + errno_save = errno; + pr_warn("Can't link %s -> %s\n", opath, npath); + errno = errno_save; + return ret; + } + + /* + * Kernel before 4.3 has strange security restrictions about + * linkat. If the fsuid of the caller doesn't equals + * the uid of the file and the file is not "safe" + * one, then only global CAP_CHOWN will be allowed + * to link(). + * + * Next, when we're in user namespace we're ns root, + * but not global CAP_CHOWN. Thus, even though we + * ARE ns root, we will not be allowed to link() at + * files that belong to regular users %) + * + * Fortunately, the setfsuid() requires ns-level + * CAP_SETUID which we have. + * + * Starting with 4.8 the kernel doesn't allow to create inodes + * with a uid or gid unknown to an user namespace. 
+ * 036d523641c66 ("vfs: Don't create inodes with a uid or gid unknown to the vfs") + */ + + old_fsuid = setfsuid(uid); + old_fsgid = setfsgid(gid); + + /* AT_EMPTY_PATH requires CAP_DAC_READ_SEARCH */ + if (flags & AT_EMPTY_PATH) { + hdr.version = _LINUX_CAPABILITY_VERSION_3; + hdr.pid = 0; + + if (capget(&hdr, data) < 0) { + errno_save = errno; + pr_perror("capget"); + goto out; + } + data[0].effective = data[0].permitted; + data[1].effective = data[1].permitted; + if (capset(&hdr, data) < 0) { + errno_save = errno; + pr_perror("capset"); + goto out; + } + } + + ret = linkat(odir, opath, ndir, npath, flags); + errno_save = errno; + if (ret < 0) + pr_perror("Can't link %s -> %s", opath, npath); + +out: + setfsuid(old_fsuid); + setfsgid(old_fsgid); + if (setfsuid(-1) != old_fsuid) { + pr_warn("Failed to restore old fsuid!\n"); + /* + * Don't fail here. We still have chances to run till + * the pie/restorer, and if _this_ guy fails to set + * the proper fsuid, then we'll abort the restore. 
+ */ + } + + /* + * Restoring PR_SET_DUMPABLE flag is required after setfsuid, + * as if it not set, proc inode will be created with root cred + * (see proc_pid_make_inode), which will result in permission + * check fail when trying to access files in /proc/self/ + */ + prctl(PR_SET_DUMPABLE, 1, 0); + + errno = errno_save; + + return ret; +} + +static void rm_parent_dirs(int mntns_root, char *path, int count) +{ + char *p, *prev = NULL; + + if (!count) + return; + + while (count > 0) { + count -= 1; + p = strrchr(path, '/'); + if (p) + *p = '\0'; + if (prev) + *prev = '/'; + + if (unlinkat(mntns_root, path, AT_REMOVEDIR)) + pr_perror("Can't remove %s AT %d", path, mntns_root); + else + pr_debug("Unlinked parent dir: %s AT %d\n", path, mntns_root); + prev = p; + } + + if (prev) + *prev = '/'; +} + +/* Construct parent dir name and mkdir parent/grandparents if they're not exist */ +static int make_parent_dirs_if_need(int mntns_root, char *path) +{ + char *p, *last_delim; + int err, count = 0; + struct stat st; + + p = last_delim = strrchr(path, '/'); + if (!p) + return 0; + *p = '\0'; + + if (fstatat(mntns_root, path, &st, AT_EMPTY_PATH) == 0) + goto out; + if (errno != ENOENT) { + pr_perror("Can't stat %s", path); + count = -1; + goto out; + } + + p = path; + do { + p = strchr(p, '/'); + if (p) + *p = '\0'; + + err = mkdirat(mntns_root, path, 0777); + if (err && errno != EEXIST) { + pr_perror("Can't create dir: %s AT %d", path, mntns_root); + rm_parent_dirs(mntns_root, path, count); + count = -1; + goto out; + } else if (!err) { + pr_debug("Created parent dir: %s AT %d\n", path, mntns_root); + count++; + } + + if (p) + *p++ = '/'; + } while (p); +out: + *last_delim = '/'; + return count; +} + +/* + * This routine properly resolves d's path handling ghost/link-remaps. + * The open_cb is a routine that does actual open, it differs for + * files, directories, fifos, etc. 
+ */ + +static int rfi_remap(struct reg_file_info *rfi, int *level) +{ + struct mount_info *mi, *rmi, *tmi; + char _path[PATH_MAX], *path = _path; + char _rpath[PATH_MAX], *rpath = _rpath; + int mntns_root; + + if (rfi->rfe->mnt_id == -1) { + /* Know nothing about mountpoints */ + mntns_root = mntns_get_root_by_mnt_id(-1); + path = rfi->path; + rpath = rfi->remap->rpath; + goto out_root; + } + + mi = lookup_mnt_id(rfi->rfe->mnt_id); + if (mi == NULL) + return -1; + + if (rfi->rfe->mnt_id == rfi->remap->rmnt_id) { + /* Both links on the same mount point */ + tmi = mi; + path = rfi->path; + rpath = rfi->remap->rpath; + goto out; + } + + rmi = lookup_mnt_id(rfi->remap->rmnt_id); + if (rmi == NULL) + return -1; + + /* + * Find the common bind-mount. We know that one mount point was + * really mounted and all other were bind-mounted from it, so the + * lowest mount must contains all bind-mounts. + */ + for (tmi = mi; tmi->bind; tmi = tmi->bind) + ; + + BUG_ON(tmi->s_dev != rmi->s_dev); + BUG_ON(tmi->s_dev != mi->s_dev); + + /* Calcalate paths on the device (root mount) */ + convert_path_from_another_mp(rfi->path, path, sizeof(_path), mi, tmi); + convert_path_from_another_mp(rfi->remap->rpath, rpath, sizeof(_rpath), rmi, tmi); + +out: + mntns_root = mntns_get_root_fd(tmi->nsid); + + /* We get here while in task's mntns */ + if (try_remount_writable(tmi, true)) + return -1; + + pr_debug("%d: Link %s -> %s\n", tmi->mnt_id, rpath, path); +out_root: + *level = make_parent_dirs_if_need(mntns_root, path); + if (*level < 0) + return -1; + + if (linkat_hard(mntns_root, rpath, mntns_root, path, + rfi->remap->uid, rfi->remap->gid, 0) < 0) { + int errno_saved = errno; + rm_parent_dirs(mntns_root, path, *level); + errno = errno_saved; + return -1; + } + + return 0; +} + +int open_path(struct file_desc *d, + int(*open_cb)(int mntns_root, struct reg_file_info *, void *), void *arg) +{ + int tmp, mntns_root, level = 0; + struct reg_file_info *rfi; + char *orig_path = NULL; + char 
path[PATH_MAX]; + int inh_fd = -1; + + if (inherited_fd(d, &tmp)) + return tmp; + + rfi = container_of(d, struct reg_file_info, d); + + if (rfi->rfe->ext) { + tmp = inherit_fd_lookup_id(rfi->rfe->name); + if (tmp >= 0) { + inh_fd = tmp; + /* + * PROC_SELF isn't used, because only service + * descriptors can be used here. + */ + mntns_root = open_pid_proc(getpid()); + snprintf(path, sizeof(path), "fd/%d", tmp); + orig_path = rfi->path; + rfi->path = path; + goto ext; + } + } + + if (rfi->remap) { + if (fault_injected(FI_RESTORE_OPEN_LINK_REMAP)) { + pr_info("fault: Open link-remap failure!\n"); + kill(getpid(), SIGKILL); + } + + mutex_lock(remap_open_lock); + if (rfi->remap->is_dir) { + /* + * FIXME Can't make directory under new name. + * Will have to open it under the ghost one :( + */ + orig_path = rfi->path; + rfi->path = rfi->remap->rpath; + } else if (rfi_remap(rfi, &level) < 0) { + static char tmp_path[PATH_MAX]; + + if (errno != EEXIST) { + pr_perror("Can't link %s -> %s", + rfi->remap->rpath, rfi->path); + return -1; + } + + /* + * The file whose name we're trying to create + * exists. Need to pick some other one, we're + * going to remove it anyway. + * + * Strictly speaking, this is cheating, file + * name shouldn't change. But since NFS with + * its silly-rename doesn't care, why should we? 
+ */ + + orig_path = rfi->path; + rfi->path = tmp_path; + snprintf(tmp_path, sizeof(tmp_path), "%s.cr_link", orig_path); + pr_debug("Fake %s -> %s link\n", rfi->path, rfi->remap->rpath); + + if (rfi_remap(rfi, &level) < 0) { + pr_perror("Can't create even fake link!"); + return -1; + } + } + } + + mntns_root = mntns_get_root_by_mnt_id(rfi->rfe->mnt_id); +ext: + tmp = open_cb(mntns_root, rfi, arg); + if (tmp < 0) { + pr_perror("Can't open file %s", rfi->path); + close_safe(&inh_fd); + return -1; + } + close_safe(&inh_fd); + + if ((rfi->rfe->has_size || rfi->rfe->has_mode) && + !rfi->size_mode_checked) { + struct stat st; + + if (fstat(tmp, &st) < 0) { + pr_perror("Can't fstat opened file"); + return -1; + } + + if (rfi->rfe->has_size && (st.st_size != rfi->rfe->size)) { + pr_err("File %s has bad size %"PRIu64" (expect %"PRIu64")\n", + rfi->path, st.st_size, + rfi->rfe->size); + return -1; + } + + if (rfi->rfe->has_mode && (st.st_mode != rfi->rfe->mode)) { + pr_err("File %s has bad mode 0%o (expect 0%o)\n", + rfi->path, (int)st.st_mode, + rfi->rfe->mode); + return -1; + } + + /* + * This is only visible in the current process, so + * change w/o locks. Other tasks sharing the same + * file will get one via unix sockets. 
+ */ + rfi->size_mode_checked = true; + } + + if (rfi->remap) { + if (!rfi->remap->is_dir) { + unlinkat(mntns_root, rfi->path, 0); + rm_parent_dirs(mntns_root, rfi->path, level); + } + + mutex_unlock(remap_open_lock); + } + if (orig_path) + rfi->path = orig_path; + + if (restore_fown(tmp, rfi->rfe->fown)) + return -1; + + return tmp; +} + +int do_open_reg_noseek_flags(int ns_root_fd, struct reg_file_info *rfi, void *arg) +{ + u32 flags = *(u32 *)arg; + int fd; + + /* unnamed temporary files are restored as ghost files */ + flags &= ~O_TMPFILE; + + fd = openat(ns_root_fd, rfi->path, flags); + if (fd < 0) { + pr_perror("Can't open file %s on restore", rfi->path); + return fd; + } + + return fd; +} + +static int do_open_reg_noseek(int ns_root_fd, struct reg_file_info *rfi, void *arg) +{ + return do_open_reg_noseek_flags(ns_root_fd, rfi, &rfi->rfe->flags); +} + +static int do_open_reg(int ns_root_fd, struct reg_file_info *rfi, void *arg) +{ + int fd; + + fd = do_open_reg_noseek(ns_root_fd, rfi, arg); + if (fd < 0) + return fd; + + if ((rfi->rfe->pos != -1ULL) && + lseek(fd, rfi->rfe->pos, SEEK_SET) < 0) { + pr_perror("Can't restore file pos"); + close(fd); + return -1; + } + + return fd; +} + +int open_reg_fd(struct file_desc *fd) +{ + return open_path(fd, do_open_reg_noseek, NULL); +} + +int open_reg_by_id(u32 id) +{ + struct file_desc *fd; + + /* + * This one gets called by exe link, chroot and cwd + * restoring code. No need in calling lseek on either + * of them. + */ + + fd = find_file_desc_raw(FD_TYPES__REG, id); + if (fd == NULL) { + pr_err("Can't find regfile for %#x\n", id); + return -1; + } + + return open_reg_fd(fd); +} + +struct filemap_ctx { + u32 flags; + struct file_desc *desc; + int fd; + /* + * Whether or not to close the fd when we're about to + * put a new one into ctx. + * + * True is used by premap, so that it just calls vm_open + * in sequence, immediately mmap()s the file, then it + * can be closed. 
+ * + * False is used by open_vmas() which pre-opens the files + * for restorer, and the latter mmap()s them and closes. + * + * ... + */ + bool close; + /* ... + * + * but closing all vmas won't work, as some of them share + * the descriptor, so only the ones that terminate the + * fd-sharing chain are marked with VMA_CLOSE flag, saying + * restorer to close the vma's fd. + * + * Said that, this vma pointer references the previously + * seen vma, so that once fd changes, this one gets the + * closing flag. + */ + struct vma_area *vma; +}; + +static struct filemap_ctx ctx; + +void filemap_ctx_init(bool auto_close) +{ + ctx.desc = NULL; /* to fail the first comparison in open_ */ + ctx.fd = -1; /* not to close random fd in _fini */ + ctx.vma = NULL; /* not to put spurious VMA_CLOSE in _fini */ + /* flags may remain any */ + ctx.close = auto_close; +} + +void filemap_ctx_fini(void) +{ + if (ctx.close) { + if (ctx.fd >= 0) + close(ctx.fd); + } else { + if (ctx.vma) + ctx.vma->e->status |= VMA_CLOSE; + } +} + +static int open_filemap(int pid, struct vma_area *vma) +{ + u32 flags; + int ret; + + /* + * The vma->fd should have been assigned in collect_filemap + * + * We open file w/o lseek, as mappings don't care about it + */ + + BUG_ON((vma->vmfd == NULL) || !vma->e->has_fdflags); + flags = vma->e->fdflags; + + if (ctx.flags != flags || ctx.desc != vma->vmfd) { + ret = open_path(vma->vmfd, do_open_reg_noseek_flags, &flags); + if (ret < 0) + return ret; + + filemap_ctx_fini(); + + ctx.flags = flags; + ctx.desc = vma->vmfd; + ctx.fd = ret; + } + + ctx.vma = vma; + vma->e->fd = ctx.fd; + return 0; +} + +int collect_filemap(struct vma_area *vma) +{ + struct file_desc *fd; + + if (!vma->e->has_fdflags) { + /* Make a wild guess for the fdflags */ + vma->e->has_fdflags = true; + if ((vma->e->prot & PROT_WRITE) && + vma_area_is(vma, VMA_FILE_SHARED)) + vma->e->fdflags = O_RDWR; + else + vma->e->fdflags = O_RDONLY; + } + + fd = collect_special_file(vma->e->shmid); + if (!fd) + 
return -1; + + vma->vmfd = fd; + vma->vm_open = open_filemap; + return 0; +} + +static int open_fe_fd(struct file_desc *fd, int *new_fd) +{ + int tmp; + + tmp = open_path(fd, do_open_reg, NULL); + if (tmp < 0) + return -1; + *new_fd = tmp; + return 0; +} + +static char *reg_file_path(struct file_desc *d, char *buf, size_t s) +{ + struct reg_file_info *rfi; + + rfi = container_of(d, struct reg_file_info, d); + return rfi->path; +} + +static struct file_desc_ops reg_desc_ops = { + .type = FD_TYPES__REG, + .open = open_fe_fd, + .name = reg_file_path, +}; + +struct file_desc *try_collect_special_file(u32 id, int optional) +{ + struct file_desc *fdesc; + + /* + * Files dumped for vmas/exe links can have remaps + * configured. Need to bump-up users for them, otherwise + * the open_path() would unlink the remap file after + * the very first open. + */ + + fdesc = find_file_desc_raw(FD_TYPES__REG, id); + if (fdesc == NULL) { + if (!optional) + pr_err("No entry for reg-file-ID %#x\n", id); + return NULL; + } + + return fdesc; +} + +static int collect_one_regfile(void *o, ProtobufCMessage *base, struct cr_img *i) +{ + struct reg_file_info *rfi = o; + static char dot[] = "."; + + rfi->rfe = pb_msg(base, RegFileEntry); + /* change "/foo" into "foo" and "/" into "." 
*/ + if (rfi->rfe->name[1] == '\0') + rfi->path = dot; + else + rfi->path = rfi->rfe->name + 1; + rfi->remap = NULL; + rfi->size_mode_checked = false; + + pr_info("Collected [%s] ID %#x\n", rfi->path, rfi->rfe->id); + return file_desc_add(&rfi->d, rfi->rfe->id, ®_desc_ops); +} + +struct collect_image_info reg_file_cinfo = { + .fd_type = CR_FD_REG_FILES, + .pb_type = PB_REG_FILE, + .priv_size = sizeof(struct reg_file_info), + .collect = collect_one_regfile, + .flags = COLLECT_SHARED, +}; + +int collect_remaps_and_regfiles(void) +{ + if (!files_collected() && collect_image(®_file_cinfo)) + return -1; + + if (collect_image(&remap_cinfo)) + return -1; + + return 0; +} diff --git a/CRIU_code/criu/files.c b/CRIU_code/criu/files.c new file mode 100644 index 0000000..ffdaa45 --- /dev/null +++ b/CRIU_code/criu/files.c @@ -0,0 +1,1735 @@ +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "types.h" +#include "files.h" +#include "file-ids.h" +#include "files-reg.h" +#include "file-lock.h" +#include "image.h" +#include "common/list.h" +#include "rst-malloc.h" +#include "util-pie.h" +#include "common/lock.h" +#include "sockets.h" +#include "pstree.h" +#include "tty.h" +#include "pipes.h" +#include "fifo.h" +#include "eventfd.h" +#include "eventpoll.h" +#include "fsnotify.h" +#include "sk-packet.h" +#include "mount.h" +#include "signalfd.h" +#include "namespaces.h" +#include "tun.h" +#include "timerfd.h" +#include "imgset.h" +#include "fs-magic.h" +#include "fdinfo.h" +#include "cr_options.h" +#include "autofs.h" +#include "parasite.h" +#include "parasite-syscall.h" +#include "kerndat.h" +#include "fdstore.h" + +#include "protobuf.h" +#include "util.h" +#include "images/fs.pb-c.h" +#include "images/ext-file.pb-c.h" + +#include "plugin.h" + +#define FDESC_HASH_SIZE 64 +static struct hlist_head file_desc_hash[FDESC_HASH_SIZE]; +/* file_desc's, which fle is not owned by a process, that is able to 
open them */ +static LIST_HEAD(fake_master_head); + +static u32 max_file_desc_id = 0; + +static void init_fdesc_hash(void) +{ + int i; + + for (i = 0; i < FDESC_HASH_SIZE; i++) + INIT_HLIST_HEAD(&file_desc_hash[i]); +} + +void file_desc_init(struct file_desc *d, u32 id, struct file_desc_ops *ops) +{ + INIT_LIST_HEAD(&d->fd_info_head); + INIT_LIST_HEAD(&d->fake_master_list); + INIT_HLIST_NODE(&d->hash); + + d->id = id; + d->ops = ops; +} + +int file_desc_add(struct file_desc *d, u32 id, struct file_desc_ops *ops) +{ + file_desc_init(d, id, ops); + hlist_add_head(&d->hash, &file_desc_hash[id % FDESC_HASH_SIZE]); + + if (id > max_file_desc_id) + max_file_desc_id = id; + + return 0; /* this is to make tail-calls in collect_one_foo look nice */ +} + +struct file_desc *find_file_desc_raw(int type, u32 id) +{ + struct file_desc *d; + struct hlist_head *chain; + + chain = &file_desc_hash[id % FDESC_HASH_SIZE]; + hlist_for_each_entry(d, chain, hash) + if ((d->id == id) && + (d->ops->type == type || type == FD_TYPES__UND)) + /* + * Warning -- old CRIU might generate matching IDs + * for different file types! So any code that uses + * FD_TYPES__UND for fdesc search MUST make sure it's + * dealing with the merged files images where all + * descs are forced to have different IDs. 
+ */ + return d; + + return NULL; +} + +static inline struct file_desc *find_file_desc(FdinfoEntry *fe) +{ + return find_file_desc_raw(fe->type, fe->id); +} + +u32 find_unused_file_desc_id(void) +{ + return max_file_desc_id + 1; +} + +struct fdinfo_list_entry *find_used_fd(struct pstree_item *task, int fd) +{ + struct list_head *head; + struct fdinfo_list_entry *fle; + + head = &rsti(task)->fds; + list_for_each_entry_reverse(fle, head, ps_list) { + if (fle->fe->fd == fd) + return fle; + /* List is ordered, so let's stop */ + if (fle->fe->fd < fd) + break; + } + return NULL; +} + +static void collect_task_fd(struct fdinfo_list_entry *new_fle, struct rst_info *ri) +{ + struct fdinfo_list_entry *fle; + + /* + * fles in fds list are ordered by fd. Fds are restored from img files + * in ascending order, so it is faster to insert them from the end of + * the list. + */ + list_for_each_entry_reverse(fle, &ri->fds, ps_list) { + if (fle->fe->fd < new_fle->fe->fd) + break; + } + + list_add(&new_fle->ps_list, &fle->ps_list); +} + +unsigned int find_unused_fd(struct pstree_item *task, int hint_fd) +{ + struct list_head *head; + struct fdinfo_list_entry *fle; + int fd = 0, prev_fd; + + if ((hint_fd >= 0) && (!find_used_fd(task, hint_fd))) { + fd = hint_fd; + goto out; + } + + prev_fd = service_fd_min_fd(task) - 1; + head = &rsti(task)->fds; + + list_for_each_entry_reverse(fle, head, ps_list) { + fd = fle->fe->fd; + if (prev_fd > fd) { + fd++; + goto out; + } + prev_fd = fd - 1; + } + BUG(); +out: + return fd; +} + +int set_fds_event(pid_t virt) +{ + struct pstree_item *item; + bool is_set; + + item = pstree_item_by_virt(virt); + BUG_ON(!item); + + is_set = !!test_and_set_bit_le(FDS_EVENT_BIT, &item->task_st_le_bits); + + if (!is_set) + futex_wake(&item->task_st); + return 0; +} + +void clear_fds_event(void) +{ + clear_bit_le(FDS_EVENT_BIT, ¤t->task_st_le_bits); +} + +void wait_fds_event(void) +{ + futex_t *f = ¤t->task_st; + int value; + + value = htole32(FDS_EVENT); + 
futex_wait_if_cond(f, value, &); + clear_fds_event(); +} + +struct fdinfo_list_entry *try_file_master(struct file_desc *d) +{ + if (list_empty(&d->fd_info_head)) + return NULL; + + return list_first_entry(&d->fd_info_head, + struct fdinfo_list_entry, desc_list); +} + +struct fdinfo_list_entry *file_master(struct file_desc *d) +{ + struct fdinfo_list_entry *fle; + + fle = try_file_master(d); + if (!fle) { + pr_err("Empty list on file desc id %#x(%d)\n", d->id, + d->ops ? d->ops->type : -1); + BUG(); + } + + return fle; +} + +void show_saved_files(void) +{ + int i; + struct file_desc *fd; + + pr_info("File descs:\n"); + for (i = 0; i < FDESC_HASH_SIZE; i++) + hlist_for_each_entry(fd, &file_desc_hash[i], hash) { + struct fdinfo_list_entry *le; + + pr_info(" `- type %d ID %#x\n", fd->ops->type, fd->id); + list_for_each_entry(le, &fd->fd_info_head, desc_list) + pr_info(" `- FD %d pid %d\n", le->fe->fd, le->pid); + } +} + +/* + * Workaround for the OverlayFS bug present before Kernel 4.2 + * + * This is here only to support the Linux Kernel between versions + * 3.18 and 4.2. After that, this workaround is not needed anymore, + * but it will work properly on both a kernel with and without the bug. + * + * When a process has a file open in an OverlayFS directory, + * the information in /proc//fd/ and /proc//fdinfo/ + * is wrong. We can't even rely on stat()-ing /proc//fd/ since + * this will show us the wrong filesystem type. + * + * So we grab that information from the mountinfo table instead. This is done + * every time fill_fdlink is called. See lookup_overlayfs for more details. 
+ *
+ */
+static int fixup_overlayfs(struct fd_parms *p, struct fd_link *link)
+{
+	struct mount_info *ovl;
+	char orig[PATH_MAX];
+	int written;
+
+	/* No link, nothing to fix */
+	if (!link)
+		return 0;
+
+	ovl = lookup_overlayfs(link->name, p->stat.st_dev, p->stat.st_ino, p->mnt_id);
+	if (IS_ERR(ovl))
+		return -1;
+	if (ovl == NULL)
+		return 0;
+
+	/* From now on the file is accounted to the mount found above */
+	p->mnt_id = ovl->mnt_id;
+
+	/*
+	 * With the bug in place the path read from /proc/<pid>/fd/<fd>
+	 * lacks the mountpoint, so the latter is put in front of it.
+	 * This is not needed for the "./" mountpoint.
+	 */
+	if (strcmp("./", ovl->mountpoint) == 0)
+		return 0;
+
+	/* Keep a copy of the name, it is rewritten in place below */
+	strncpy(orig, link->name, PATH_MAX);
+	orig[PATH_MAX - 1] = 0;
+
+	/* The first two characters of the old name (presumably "./") are dropped */
+	written = snprintf(link->name, PATH_MAX, "%s/%s", ovl->mountpoint, orig + 2);
+	if (written >= PATH_MAX) {
+		pr_err("Not enough space to replace %s\n", orig);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * The gen_id thing is used to optimize the comparison of shared files.
+ * If two files have different gen_ids, then they are different for sure.
+ * If it matches, we don't know it and have to call sys_kcmp().
+ *
+ * The kcmp-ids.c engine does this trick, see comments in it for more info.
+ */ + +uint32_t make_gen_id(uint32_t st_dev, uint32_t st_ino, uint64_t pos) +{ + uint32_t pos_hi = pos >> 32; + uint32_t pos_low = pos & 0xffffffff; + + return st_dev ^ st_ino ^ pos_hi ^ pos_low; +} + +int do_dump_gen_file(struct fd_parms *p, int lfd, + const struct fdtype_ops *ops, FdinfoEntry *e) +{ + int ret = -1; + + e->type = ops->type; + e->id = make_gen_id((uint32_t)p->stat.st_dev, + (uint32_t)p->stat.st_ino, + (uint64_t)p->pos); + e->fd = p->fd; + e->flags = p->fd_flags; + + ret = fd_id_generate(p->pid, e, p); + if (ret == 1) /* new ID generated */ + ret = ops->dump(lfd, e->id, p); + else + /* Remove locks generated by the fd before going to the next */ + discard_dup_locks_tail(p->pid, e->fd); + + return ret; +} + +int fill_fdlink(int lfd, const struct fd_parms *p, struct fd_link *link) +{ + int len; + + link->name[0] = '.'; + + len = read_fd_link(lfd, &link->name[1], sizeof(link->name) - 1); + if (len < 0) { + pr_err("Can't read link for pid %d fd %d\n", p->pid, p->fd); + return -1; + } + + link->len = len + 1; + + if (opts.overlayfs) + if (fixup_overlayfs((struct fd_parms *)p, link) < 0) + return -1; + return 0; +} + +static int fill_fd_params(struct pid *owner_pid, int fd, int lfd, + struct fd_opts *opts, struct fd_parms *p) +{ + int ret; + struct statfs fsbuf; + struct fdinfo_common fdinfo = { .mnt_id = -1, .owner = owner_pid->ns[0].virt }; + + if (fstat(lfd, &p->stat) < 0) { + pr_perror("Can't stat fd %d", lfd); + return -1; + } + + if (fstatfs(lfd, &fsbuf) < 0) { + pr_perror("Can't statfs fd %d", lfd); + return -1; + } + + if (parse_fdinfo_pid(owner_pid->real, fd, FD_TYPES__UND, &fdinfo)) + return -1; + + p->fs_type = fsbuf.f_type; + p->fd = fd; + p->pos = fdinfo.pos; + p->flags = fdinfo.flags; + p->mnt_id = fdinfo.mnt_id; + p->pid = owner_pid->real; + p->fd_flags = opts->flags; + + fown_entry__init(&p->fown); + + pr_info("%d fdinfo %d: pos: %#16"PRIx64" flags: %16o/%#x\n", + owner_pid->real, fd, p->pos, p->flags, (int)p->fd_flags); + + ret = 
fcntl(lfd, F_GETSIG, 0); + if (ret < 0) { + pr_perror("Can't get owner signum on %d", lfd); + return -1; + } + p->fown.signum = ret; + + if (opts->fown.pid == 0) + return 0; + + p->fown.pid = opts->fown.pid; + p->fown.pid_type = opts->fown.pid_type; + p->fown.uid = opts->fown.uid; + p->fown.euid = opts->fown.euid; + + return 0; +} + +static const struct fdtype_ops *get_misc_dev_ops(int minor) +{ + switch (minor) { + case TUN_MINOR: + return &tunfile_dump_ops; + case AUTOFS_MINOR: + return ®file_dump_ops; + }; + + return NULL; +} + +static const struct fdtype_ops *get_mem_dev_ops(struct fd_parms *p, int minor) +{ + const struct fdtype_ops *ops = NULL; + + /* + * If /dev/kmsg is opened in write-only mode the file position + * should not be set up upon restore, kernel doesn't allow that. + */ + if (minor == 11 && (p->flags & O_ACCMODE) == O_WRONLY && p->pos == 0) + p->pos = -1ULL; + + ops = ®file_dump_ops; + + return ops; +} + +static int dump_chrdev(struct fd_parms *p, int lfd, FdinfoEntry *e) +{ + struct fd_link *link_old = p->link; + int maj = major(p->stat.st_rdev); + const struct fdtype_ops *ops; + struct fd_link link; + int err; + + switch (maj) { + case MEM_MAJOR: + ops = get_mem_dev_ops(p, minor(p->stat.st_rdev)); + break; + case MISC_MAJOR: + ops = get_misc_dev_ops(minor(p->stat.st_rdev)); + if (ops) + break; + /* fallthrough */ + default: { + char more[32]; + + if (is_tty(p->stat.st_rdev, p->stat.st_dev)) { + if (fill_fdlink(lfd, p, &link)) + return -1; + p->link = &link; + ops = &tty_dump_ops; + break; + } + + sprintf(more, "%d:%d", maj, minor(p->stat.st_rdev)); + err = dump_unsupp_fd(p, lfd, "chr", more, e); + p->link = link_old; + return err; + } + } + + err = do_dump_gen_file(p, lfd, ops, e); + p->link = link_old; + return err; +} + +static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, + struct parasite_ctl *ctl, FdinfoEntry *e, + struct parasite_drain_fd *dfds) +{ + struct fd_parms p = FD_PARMS_INIT; + const struct fdtype_ops 
*ops; + struct fd_link link; + + if (fill_fd_params(pid, fd, lfd, opts, &p) < 0) { + pr_err("Can't get stat on %d\n", fd); + return -1; + } + + if (note_file_lock(pid, fd, lfd, &p)) + return -1; + + /* Lease can be set only on regular file */ + if (S_ISREG(p.stat.st_mode)) { + int ret = correct_file_leases_type(pid, fd, lfd); + + if (ret < 0) + return ret; + } + + p.fd_ctl = ctl; /* Some dump_opts require this to talk to parasite */ + p.dfds = dfds; /* epoll needs to verify if target fd exist */ + + if (S_ISSOCK(p.stat.st_mode)) + return dump_socket(&p, lfd, e); + + if (S_ISCHR(p.stat.st_mode)) + return dump_chrdev(&p, lfd, e); + + if (p.fs_type == ANON_INODE_FS_MAGIC) { + char link[32]; + + if (read_fd_link(lfd, link, sizeof(link)) < 0) + return -1; + + if (is_eventfd_link(link)) + ops = &eventfd_dump_ops; + else if (is_eventpoll_link(link)) + ops = &eventpoll_dump_ops; + else if (is_inotify_link(link)) + ops = &inotify_dump_ops; + else if (is_fanotify_link(link)) + ops = &fanotify_dump_ops; + else if (is_signalfd_link(link)) + ops = &signalfd_dump_ops; + else if (is_timerfd_link(link)) + ops = &timerfd_dump_ops; + else + return dump_unsupp_fd(&p, lfd, "anon", link, e); + + return do_dump_gen_file(&p, lfd, ops, e); + } + + if (S_ISREG(p.stat.st_mode) || S_ISDIR(p.stat.st_mode)) { + if (fill_fdlink(lfd, &p, &link)) + return -1; + + p.link = &link; + if (link.name[1] == '/') + return do_dump_gen_file(&p, lfd, ®file_dump_ops, e); + + if (check_ns_proc(&link)) + return do_dump_gen_file(&p, lfd, &nsfile_dump_ops, e); + + return dump_unsupp_fd(&p, lfd, "reg", link.name + 1, e); + } + + if (S_ISFIFO(p.stat.st_mode)) { + if (p.fs_type == PIPEFS_MAGIC) + ops = &pipe_dump_ops; + else + ops = &fifo_dump_ops; + + return do_dump_gen_file(&p, lfd, ops, e); + } + + /* + * For debug purpose -- at least show the link + * file pointing to when reporting unsupported file. + * On error simply empty string here. 
+ */ + if (fill_fdlink(lfd, &p, &link)) + memzero(&link, sizeof(link)); + + return dump_unsupp_fd(&p, lfd, "unknown", link.name + 1, e); +} + +int dump_my_file(int lfd, u32 *id, int *type) +{ + struct pid me = {}; + struct fd_opts fo = {}; + FdinfoEntry e = FDINFO_ENTRY__INIT; + + me.real = getpid(); + me.ns[0].virt = -1; /* FIXME */ + + if (dump_one_file(&me, lfd, lfd, &fo, NULL, &e, NULL)) + return -1; + + *id = e.id; + *type = e.type; + return 0; +} + +int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, + struct parasite_drain_fd *dfds) +{ + int *lfds = NULL; + struct cr_img *img = NULL; + struct fd_opts *opts = NULL; + int i, ret = -1; + int off, nr_fds = min((int) PARASITE_MAX_FDS, dfds->nr_fds); + + pr_info("\n"); + pr_info("Dumping opened files (pid: %d)\n", item->pid->real); + pr_info("----------------------------------------\n"); + + lfds = xmalloc(nr_fds * sizeof(int)); + if (!lfds) + goto err; + + opts = xmalloc(nr_fds * sizeof(struct fd_opts)); + if (!opts) + goto err; + + img = open_image(CR_FD_FDINFO, O_DUMP, item->ids->files_id); + if (!img) + goto err; + + ret = 0; /* Don't fail if nr_fds == 0 */ + for (off = 0; ret == 0 && off < dfds->nr_fds; off += nr_fds) { + if (nr_fds + off > dfds->nr_fds) + nr_fds = dfds->nr_fds - off; + + ret = parasite_drain_fds_seized(ctl, dfds, nr_fds, + off, lfds, opts); + if (ret) + goto err; + + for (i = 0; i < nr_fds; i++) { + FdinfoEntry e = FDINFO_ENTRY__INIT; + + ret = dump_one_file(item->pid, dfds->fds[i + off], + lfds[i], opts + i, ctl, &e, dfds); + if (ret) + break; + + ret = pb_write_one(img, &e, PB_FDINFO); + if (ret) + break; + } + + for (i = 0; i < nr_fds; i++) + close(lfds[i]); + } + + pr_info("----------------------------------------\n"); +err: + if (img) + close_image(img); + xfree(opts); + xfree(lfds); + return ret; +} + +static int predump_one_fd(int pid, int fd) +{ + const struct fdtype_ops *ops; + char link[PATH_MAX], t[32]; + int ret = 0; + + snprintf(t, sizeof(t), 
"/proc/%d/fd/%d", pid, fd); + ret = readlink(t, link, sizeof(link)); + if (ret < 0) { + pr_perror("Can't read link of fd %d", fd); + return -1; + } else if ((size_t)ret == sizeof(link)) { + pr_err("Buffer for read link of fd %d is too small\n", fd); + return -1; + } + link[ret] = 0; + + ret = 0; + if (is_inotify_link(link)) + ops = &inotify_dump_ops; + else if (is_fanotify_link(link)) + ops = &fanotify_dump_ops; + else + goto out; + + pr_debug("Pre-dumping %d's %d fd\n", pid, fd); + ret = ops->pre_dump(pid, fd); +out: + return ret; +} + +int predump_task_files(int pid) +{ + struct dirent *de; + DIR *fd_dir; + int ret = -1; + + pr_info("Pre-dump fds for %d)\n", pid); + + fd_dir = opendir_proc(pid, "fd"); + if (!fd_dir) + return -1; + + while ((de = readdir(fd_dir))) { + if (dir_dots(de)) + continue; + + if (predump_one_fd(pid, atoi(de->d_name))) + goto out; + } + + ret = 0; +out: + closedir(fd_dir); + return ret; +} + +int restore_fown(int fd, FownEntry *fown) +{ + struct f_owner_ex owner; + uid_t uids[3]; + + if (fown->signum) { + if (fcntl(fd, F_SETSIG, fown->signum)) { + pr_perror("Can't set signal"); + return -1; + } + } + + /* May be untouched */ + if (!fown->pid) + return 0; + + if (getresuid(&uids[0], &uids[1], &uids[2])) { + pr_perror("Can't get current UIDs"); + return -1; + } + + if (setresuid(fown->uid, fown->euid, uids[2])) { + pr_perror("Can't set UIDs"); + return -1; + } + + owner.type = fown->pid_type; + owner.pid = fown->pid; + + if (fcntl(fd, F_SETOWN_EX, &owner)) { + pr_perror("Can't setup %d file owner pid", fd); + return -1; + } + + if (setresuid(uids[0], uids[1], uids[2])) { + pr_perror("Can't revert UIDs back"); + return -1; + } + + if (prctl(PR_SET_DUMPABLE, 1, 0)) + pr_perror("Unable to set PR_SET_DUMPABLE"); + + return 0; +} + +int rst_file_params(int fd, FownEntry *fown, int flags) +{ + if (set_fd_flags(fd, flags) < 0) + return -1; + if (restore_fown(fd, fown) < 0) + return -1; + return 0; +} + +static struct fdinfo_list_entry 
*alloc_fle(int pid, FdinfoEntry *fe) +{ + struct fdinfo_list_entry *fle; + + fle = shmalloc(sizeof(*fle)); + if (!fle) + return NULL; + fle->pid = pid; + fle->fe = fe; + fle->received = 0; + fle->fake = 0; + fle->stage = FLE_INITIALIZED; + fle->task = pstree_item_by_virt(pid); + if (!fle->task) { + pr_err("Can't find task with pid %d\n", pid); + shfree_last(fle); + return NULL; + } + + return fle; +} + +static void __collect_desc_fle(struct fdinfo_list_entry *new_le, struct file_desc *fdesc) +{ + struct fdinfo_list_entry *le; + + list_for_each_entry_reverse(le, &fdesc->fd_info_head, desc_list) + if (pid_rst_prio_eq(le->pid, new_le->pid)) + break; + list_add(&new_le->desc_list, &le->desc_list); +} + +static void collect_desc_fle(struct fdinfo_list_entry *new_le, + struct file_desc *fdesc, bool force_master) +{ + new_le->desc = fdesc; + + if (!force_master) + __collect_desc_fle(new_le, fdesc); + else { + /* Link as first entry */ + list_add(&new_le->desc_list, &fdesc->fd_info_head); + } +} + +struct fdinfo_list_entry *collect_fd_to(int pid, FdinfoEntry *e, + struct rst_info *rst_info, struct file_desc *fdesc, + bool fake, bool force_master) +{ + struct fdinfo_list_entry *new_le; + + new_le = alloc_fle(pid, e); + if (new_le) { + new_le->fake = (!!fake); + collect_desc_fle(new_le, fdesc, force_master); + collect_task_fd(new_le, rst_info); + } + + return new_le; +} + +int collect_fd(int pid, FdinfoEntry *e, struct rst_info *rst_info, bool fake) +{ + struct file_desc *fdesc; + + pr_info("Collect fdinfo pid=%d fd=%d id=%#x\n", + pid, e->fd, e->id); + + fdesc = find_file_desc(e); + if (fdesc == NULL) { + pr_err("No file for fd %d id %#x\n", e->fd, e->id); + return -1; + } + + if (!collect_fd_to(pid, e, rst_info, fdesc, fake, false)) + return -1; + + return 0; +} + +FdinfoEntry *dup_fdinfo(FdinfoEntry *old, int fd, unsigned flags) +{ + FdinfoEntry *e; + + e = shmalloc(sizeof(*e)); + if (!e) + return NULL; + + fdinfo_entry__init(e); + + e->id = old->id; + e->type = 
old->type; + e->fd = fd; + e->flags = flags; + return e; +} + +int dup_fle(struct pstree_item *task, struct fdinfo_list_entry *ple, + int fd, unsigned flags) +{ + FdinfoEntry *e; + + e = dup_fdinfo(ple->fe, fd, flags); + if (!e) + return -1; + + return collect_fd(vpid(task), e, rsti(task), false); +} + +int prepare_fd_pid(struct pstree_item *item) +{ + int ret = 0; + struct cr_img *img; + pid_t pid = vpid(item); + struct rst_info *rst_info = rsti(item); + + INIT_LIST_HEAD(&rst_info->fds); + + if (item->ids == NULL) /* zombie */ + return 0; + + if (rsti(item)->fdt && rsti(item)->fdt->pid != vpid(item)) + return 0; + + img = open_image(CR_FD_FDINFO, O_RSTR, item->ids->files_id); + if (!img) + return -1; + + while (1) { + FdinfoEntry *e; + + ret = pb_read_one_eof(img, &e, PB_FDINFO); + if (ret <= 0) + break; + + if (e->fd >= kdat.sysctl_nr_open) { + ret = -1; + pr_err("Too big FD number to restore %d\n", e->fd); + break; + } + + ret = collect_fd(pid, e, rst_info, false); + if (ret < 0) { + fdinfo_entry__free_unpacked(e, NULL); + break; + } + } + + close_image(img); + return ret; +} + +#define SETFL_MASK (O_APPEND | O_ASYNC | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) +int set_fd_flags(int fd, int flags) +{ + int ret; + + ret = fcntl(fd, F_GETFL, 0); + if (ret < 0) + goto err; + + flags = (SETFL_MASK & flags) | (ret & ~SETFL_MASK); + + ret = fcntl(fd, F_SETFL, flags); + if (ret < 0) + goto err; + + /* Let's check, that now actual flags contains those we need */ + ret = fcntl(fd, F_GETFL, 0); + if (ret < 0) + goto err; + + if (ret != flags) { + pr_err("fcntl call on fd %d (flags %#o) succeeded, " + "but some flags were dropped: %#o\n", fd, flags, ret); + return -1; + } + return 0; + +err: + pr_perror("fcntl call on fd %d (flags %x) failed", fd, flags); + return -1; +} + +struct fd_open_state { + char *name; + int (*cb)(int, struct fdinfo_list_entry *); +}; + +static int receive_fd(struct fdinfo_list_entry *fle); + +static void transport_name_gen(struct sockaddr_un 
*addr, int *len, int pid) +{ + addr->sun_family = AF_UNIX; + snprintf(addr->sun_path, UNIX_PATH_MAX, "x/crtools-fd-%d", pid); + *len = SUN_LEN(addr); + *addr->sun_path = '\0'; +} + +static bool task_fle(struct pstree_item *task, struct fdinfo_list_entry *fle) +{ + struct fdinfo_list_entry *tmp; + + list_for_each_entry(tmp, &rsti(task)->fds, ps_list) + if (fle == tmp) + return true; + return false; +} + +static int plant_fd(struct fdinfo_list_entry *fle, int fd) +{ + BUG_ON(fle->received); + fle->received = 1; + return reopen_fd_as(fle->fe->fd, fd); +} + +static int recv_fd_from_peer(struct fdinfo_list_entry *fle) +{ + struct fdinfo_list_entry *tmp; + int fd, ret, tsock; + + if (fle->received) + return 0; + + tsock = get_service_fd(TRANSPORT_FD_OFF); + do { + ret = __recv_fds(tsock, &fd, 1, (void *)&tmp, sizeof(struct fdinfo_list_entry *), MSG_DONTWAIT); + if (ret == -EAGAIN || ret == -EWOULDBLOCK) + return 1; + else if (ret) + return -1; + + pr_info("Further fle=%p, pid=%d\n", tmp, fle->pid); + if (!task_fle(current, tmp)) { + pr_err("Unexpected fle %p, pid=%d\n", tmp, vpid(current)); + return -1; + } + if (plant_fd(tmp, fd)) + return -1; + } while (tmp != fle); + + return 0; +} + +static int send_fd_to_peer(int fd, struct fdinfo_list_entry *fle) +{ + struct sockaddr_un saddr; + int len, sock, ret; + + sock = get_service_fd(TRANSPORT_FD_OFF); + + transport_name_gen(&saddr, &len, fle->pid); + pr_info("\t\tSend fd %d to %s\n", fd, saddr.sun_path + 1); + ret = send_fds(sock, &saddr, len, &fd, 1, (void *)&fle, sizeof(struct fdinfo_list_entry *)); + if (ret < 0) + return -1; + return set_fds_event(fle->pid); +} + +/* + * Helpers to scatter file_desc across users for those files, that + * create two descriptors from a single system call at once (e.g. + * ... or better i.e. 
-- pipes, socketpairs and ttys) + */ +int recv_desc_from_peer(struct file_desc *d, int *fd) +{ + struct fdinfo_list_entry *fle; + + fle = file_master(d); + *fd = fle->fe->fd; + return recv_fd_from_peer(fle); +} + +int send_desc_to_peer(int fd, struct file_desc *d) +{ + return send_fd_to_peer(fd, file_master(d)); +} + +static int send_fd_to_self(int fd, struct fdinfo_list_entry *fle) +{ + int dfd = fle->fe->fd; + + if (fd == dfd) + return 0; + + BUG_ON(dfd == get_service_fd(TRANSPORT_FD_OFF)); + + pr_info("\t\t\tGoing to dup %d into %d\n", fd, dfd); + if (dup2(fd, dfd) != dfd) { + pr_perror("Can't dup local fd %d -> %d", fd, dfd); + return -1; + } + + if (fcntl(dfd, F_SETFD, fle->fe->flags) == -1) { + pr_perror("Unable to set file descriptor flags"); + return -1; + } + + fle->received = 1; + + return 0; +} + +static int serve_out_fd(int pid, int fd, struct file_desc *d) +{ + int ret; + struct fdinfo_list_entry *fle; + + pr_info("\t\tCreate fd for %d\n", fd); + + list_for_each_entry(fle, &d->fd_info_head, desc_list) { + if (pid == fle->pid) + ret = send_fd_to_self(fd, fle); + else + ret = send_fd_to_peer(fd, fle); + + if (ret) { + pr_err("Can't sent fd %d to %d\n", fd, fle->pid); + goto out; + } + } + + ret = 0; +out: + return ret; +} + +int setup_and_serve_out(struct fdinfo_list_entry *fle, int new_fd) +{ + struct file_desc *d = fle->desc; + pid_t pid = fle->pid; + + if (reopen_fd_as(fle->fe->fd, new_fd)) + return -1; + + if (fcntl(fle->fe->fd, F_SETFD, fle->fe->flags) == -1) { + pr_perror("Unable to set file descriptor flags"); + return -1; + } + + BUG_ON(fle->stage != FLE_INITIALIZED); + fle->stage = FLE_OPEN; + + if (serve_out_fd(pid, fle->fe->fd, d)) + return -1; + return 0; +} + +static int open_fd(struct fdinfo_list_entry *fle) +{ + struct file_desc *d = fle->desc; + struct fdinfo_list_entry *flem; + int new_fd = -1, ret; + + flem = file_master(d); + if (fle != flem) { + BUG_ON (fle->stage != FLE_INITIALIZED); + ret = receive_fd(fle); + if (ret != 0) + return 
ret; + goto out; + } + + /* + * Open method returns the following values: + * 0 -- restore is successfully finished; + * 1 -- restore is in process or can't be started + * yet, because of it depends on another fles, + * so the method should be called once again; + * -1 -- restore failed. + * In case of 0 and 1 return values, new_fd may + * be not negative. In this case it contains newly + * opened file descriptor, which may be served out. + * For every fle, new_fd is populated only once. + * See setup_and_serve_out() BUG_ON for the details. + */ + ret = d->ops->open(d, &new_fd); + if (ret != -1 && new_fd >= 0) { + if (setup_and_serve_out(fle, new_fd) < 0) + return -1; + } +out: + if (ret == 0) + fle->stage = FLE_RESTORED; + return ret; +} + +static int receive_fd(struct fdinfo_list_entry *fle) +{ + int ret; + + pr_info("\tReceive fd for %d\n", fle->fe->fd); + + ret = recv_fd_from_peer(fle); + if (ret != 0) { + if (ret != 1) + pr_err("Can't get fd=%d, pid=%d\n", fle->fe->fd, fle->pid); + return ret; + } + + if (fcntl(fle->fe->fd, F_SETFD, fle->fe->flags) == -1) { + pr_perror("Unable to set file descriptor flags"); + return -1; + } + + return 0; +} + +static void close_fdinfos(struct list_head *list) +{ + struct fdinfo_list_entry *fle; + + list_for_each_entry(fle, list, ps_list) + close(fle->fe->fd); +} + +static int open_fdinfos(struct pstree_item *me) +{ + struct list_head *list = &rsti(me)->fds; + struct fdinfo_list_entry *fle, *tmp; + LIST_HEAD(completed); + LIST_HEAD(fake); + bool progress, again; + int st, ret = 0; + + do { + progress = again = false; + clear_fds_event(); + + list_for_each_entry_safe(fle, tmp, list, ps_list) { + st = fle->stage; + BUG_ON(st == FLE_RESTORED); + ret = open_fd(fle); + if (ret == -1) { + pr_err("Unable to open fd=%d id=%#x\n", + fle->fe->fd, fle->fe->id); + goto splice; + } + if (st != fle->stage || ret == 0) + progress = true; + if (ret == 0) { + /* + * We delete restored items from fds list, + * so open() methods may base on this 
feature + * and reduce number of fles in their checks. + */ + list_del(&fle->ps_list); + if (!fle->fake) + list_add(&fle->ps_list, &completed); + else + list_add(&fle->ps_list, &fake); + } + if (ret == 1) + again = true; + } + if (!progress && again) + wait_fds_event(); + } while (again || progress); + + BUG_ON(!list_empty(list)); + /* + * Fake fles may be used for restore other + * file types, so their closing is delayed. + */ + close_fdinfos(&fake); +splice: + list_splice(&fake, list); + list_splice(&completed, list); + + return ret; +} + +int close_old_fds(void) +{ + DIR *dir; + struct dirent *de; + int fd, ret; + + dir = opendir_proc(PROC_SELF, "fd"); + if (dir == NULL) + return -1; + + while ((de = readdir(dir))) { + if (dir_dots(de)) + continue; + + ret = sscanf(de->d_name, "%d", &fd); + if (ret != 1) { + pr_err("Can't parse %s\n", de->d_name); + closedir(dir); + close_pid_proc(); + return -1; + } + + if ((!is_any_service_fd(fd)) && (dirfd(dir) != fd)) + close_safe(&fd); + } + + closedir(dir); + close_pid_proc(); + + return 0; +} + +int prepare_fds(struct pstree_item *me) +{ + u32 ret = 0; + + pr_info("Opening fdinfo-s\n"); + + /* + * This must be done after forking to allow child + * to get the cgroup fd so it can move into the + * correct /tasks file if it is in a different cgroup + * set than its parent + */ + sfds_protected = false; + close_service_fd(CGROUP_YARD); + sfds_protected = true; + set_proc_self_fd(-1); /* flush any proc cached fds we may have */ + + if (rsti(me)->fdt) { + struct fdt *fdt = rsti(me)->fdt; + + /* + * Wait all tasks, who share a current fd table. + * We should be sure, that nobody use any file + * descriptor while fdtable is being restored. 
+ */ + futex_inc_and_wake(&fdt->fdt_lock); + futex_wait_while_lt(&fdt->fdt_lock, fdt->nr); + + if (fdt->pid != vpid(me)) { + pr_info("File descriptor table is shared with %d\n", fdt->pid); + futex_wait_until(&fdt->fdt_lock, fdt->nr + 1); + goto out; + } + } + + BUG_ON(current->pid->state == TASK_HELPER); + ret = open_fdinfos(me); + + if (rsti(me)->fdt) + futex_inc_and_wake(&rsti(me)->fdt->fdt_lock); +out: + return ret; +} + +static int fchroot(int fd) +{ + /* + * There's no such thing in syscalls. We can emulate + * it using fchdir() + */ + + if (fchdir(fd) < 0) { + pr_perror("Can't chdir to proc"); + return -1; + } + + pr_debug("Going to chroot into /proc/self/fd/%d\n", fd); + return chroot("."); +} + +int restore_fs(struct pstree_item *me) +{ + int dd_root = -1, dd_cwd = -1, ret, err = -1; + struct rst_info *ri = rsti(me); + + /* + * First -- open both descriptors. We will not + * be able to open the cwd one after we chroot. + */ + + dd_root = open_reg_fd(ri->root); + if (dd_root < 0) { + pr_err("Can't open root\n"); + goto out; + } + + dd_cwd = open_reg_fd(ri->cwd); + if (dd_cwd < 0) { + pr_err("Can't open cwd\n"); + goto out; + } + + /* + * Now do chroot/chdir. Chroot goes first as it calls chdir into + * dd_root so we'd need to fix chdir after it anyway. 
+ */ + + ret = fchroot(dd_root); + if (ret < 0) { + pr_perror("Can't change root"); + goto out; + } + + ret = fchdir(dd_cwd); + if (ret < 0) { + pr_perror("Can't change cwd"); + goto out; + } + + if (ri->has_umask) { + pr_info("Restoring umask to %o\n", ri->umask); + umask(ri->umask); + } + + err = 0; +out: + if (dd_cwd >= 0) + close(dd_cwd); + if (dd_root >= 0) + close(dd_root); + + return err; +} + +int prepare_fs_pid(struct pstree_item *item) +{ + pid_t pid = vpid(item); + struct rst_info *ri = rsti(item); + struct cr_img *img; + FsEntry *fe; + int ret = -1; + + img = open_image(CR_FD_FS, O_RSTR, pid); + if (!img) + goto out; + + ret = pb_read_one_eof(img, &fe, PB_FS); + close_image(img); + if (ret <= 0) + goto out; + + ri->cwd = collect_special_file(fe->cwd_id); + if (!ri->cwd) { + pr_err("Can't find task cwd file\n"); + goto out_f; + } + + ri->root = collect_special_file(fe->root_id); + if (!ri->root) { + pr_err("Can't find task root file\n"); + goto out_f; + } + + ri->has_umask = fe->has_umask; + ri->umask = fe->umask; + + ret = 0; +out_f: + fs_entry__free_unpacked(fe, NULL); +out: + return ret; +} + +int shared_fdt_prepare(struct pstree_item *item) +{ + struct pstree_item *parent = item->parent; + struct fdt *fdt; + + if (!rsti(parent)->fdt) { + fdt = shmalloc(sizeof(*rsti(item)->fdt)); + if (fdt == NULL) + return -1; + + rsti(parent)->fdt = fdt; + + futex_init(&fdt->fdt_lock); + fdt->nr = 1; + fdt->pid = vpid(parent); + } else + fdt = rsti(parent)->fdt; + + rsti(item)->fdt = fdt; + rsti(item)->service_fd_id = fdt->nr; + fdt->nr++; + + return 0; +} + +/* + * Inherit fd support. + * + * There are cases where a process's file descriptor cannot be restored + * from the checkpointed image. For example, a pipe file descriptor with + * one end in the checkpointed process and the other end in a separate + * process (that was not part of the checkpointed process tree) cannot be + * restored because after checkpoint the pipe would be broken and removed. 
+ *
+ * There are also cases where the user wants to use a new file during
+ * restore instead of the original file in the checkpointed image. For
+ * example, the user wants to change the log file of a process from
+ * /path/to/oldlog to /path/to/newlog.
+ *
+ * In these cases, criu's caller should set up a new file descriptor to be
+ * inherited by the restored process and specify it with the --inherit-fd
+ * command line option. The argument of --inherit-fd has the format
+ * fd[%d]:%s, where %d tells criu which of its own file descriptors to use
+ * for restoring the file identified by %s.
+ *
+ * As a debugging aid, if the argument has the format debug[%d]:%s, it tells
+ * criu to write out the string after colon to the file descriptor %d. This
+ * can be used to leave a "restore marker" in the output stream of the process.
+ *
+ * It's important to note that inherit fd support breaks applications
+ * that depend on the state of the file descriptor being inherited. So,
+ * consider inherit fd only for specific use cases that you know for sure
+ * won't break the application.
+ *
+ * For examples please visit http://criu.org/Category:HOWTO.
+ */
+
+struct inherit_fd {
+	struct list_head inh_list;
+	char *inh_id;		/* file identifier */
+	int inh_fd;		/* criu's descriptor to inherit */
+	int inh_fd_id;		/* value from fdstore_add(), -1 until then */
+};
+
+/* Largest descriptor passed to inherit_fd_add() so far, -1 if none. */
+int inh_fd_max = -1;
+
+/*
+ * Parse one inherit fd argument of the form fd[N]:ID or debug[N]:TEXT.
+ *
+ * For the "fd" form the pair (N, ID) is handed to inherit_fd_add(); ID
+ * points into optarg, it is not copied. For the "debug" form TEXT is
+ * written to descriptor N and nothing is added to the list.
+ *
+ * Returns 0 on success, -1 on a malformed argument or on failure of the
+ * write / inherit_fd_add() step.
+ */
+int inherit_fd_parse(char *optarg)
+{
+	char *cp = NULL;
+	int n = -1;
+	int fd = -1;
+	int end = -1;
+	int dbg = 0;
+
+	/*
+	 * Parse the argument.
+	 */
+	if (!strncmp(optarg, "fd", 2))
+		cp = &optarg[2];
+	else if (!strncmp(optarg, "debug", 5)) {
+		cp = &optarg[5];
+		dbg = 1;
+	}
+	if (cp) {
+		/*
+		 * sscanf() reports 1 as soon as %d converts, even when the
+		 * closing "]:" is missing. %n is stored only after the whole
+		 * "[N]:" prefix has matched, so 'end' staying negative means
+		 * the argument is malformed (e.g. "fd[3junk:x").
+		 */
+		n = sscanf(cp, "[%d]:%n", &fd, &end);
+		cp = strchr(optarg, ':');
+	}
+	if (n != 1 || end < 0 || fd < 0 || !cp || !cp[1]) {
+		pr_err("Invalid inherit fd argument: %s\n", optarg);
+		return -1;
+	}
+
+	/*
+	 * If the argument is a debug string, write it to fd.
+	 * Otherwise, add it to the inherit fd list.
+	 */
+	cp++;
+	if (dbg) {
+		n = strlen(cp);
+		if (write(fd, cp, n) != n) {
+			pr_err("Can't write debug message %s to inherit fd %d\n",
+				cp, fd);
+			return -1;
+		}
+		return 0;
+	}
+
+	return inherit_fd_add(fd, cp);
+}
+
+/*
+ * Append (fd, key) to opts.inherit_fds and track the largest fd seen in
+ * inh_fd_max. The key pointer is stored as-is, not duplicated, so the
+ * caller must keep the string alive. Returns 0 on success, -1 if fd
+ * cannot be fstat()-ed or the entry cannot be allocated.
+ */
+int inherit_fd_add(int fd, char *key)
+{
+	struct inherit_fd *inh;
+	struct stat sbuf;
+
+	/* fstat() is used only to check that fd is an open descriptor. */
+	if (fstat(fd, &sbuf) == -1) {
+		pr_perror("Can't fstat inherit fd %d", fd);
+		return -1;
+	}
+
+	inh = xmalloc(sizeof *inh);
+	if (inh == NULL)
+		return -1;
+
+	if (fd > inh_fd_max)
+		inh_fd_max = fd;
+
+	inh->inh_id = key;
+	inh->inh_fd = fd;
+	/* Not in the fdstore yet; avoid leaving the field indeterminate. */
+	inh->inh_fd_id = -1;
+	list_add_tail(&inh->inh_list, &opts.inherit_fds);
+	return 0;
+}
+
+/*
+ * Log the inherit fd list. Called for diagnostics purposes
+ * after the log file is initialized.
+ */
+void inherit_fd_log(void)
+{
+	struct inherit_fd *inh;
+
+	list_for_each_entry(inh, &opts.inherit_fds, inh_list) {
+		pr_info("File %s will be restored from inherit fd %d\n",
+			inh->inh_id, inh->inh_fd);
+	}
+}
+
+/*
+ * Hand every listed descriptor to fdstore_add(), remember the returned
+ * value in inh_fd_id and close the original descriptor. Stops and
+ * returns -1 at the first fdstore_add() failure, 0 otherwise.
+ */
+int inherit_fd_move_to_fdstore(void)
+{
+	struct inherit_fd *inh;
+
+	list_for_each_entry(inh, &opts.inherit_fds, inh_list) {
+		inh->inh_fd_id = fdstore_add(inh->inh_fd);
+		if (inh->inh_fd_id < 0)
+			return -1;
+		close_safe(&inh->inh_fd);
+	}
+
+	return 0;
+}
+
+/*
+ * Look up the inherit fd list by a file identifier.
+ */ +int inherit_fd_lookup_id(char *id) +{ + int ret; + struct inherit_fd *inh; + + ret = -1; + list_for_each_entry(inh, &opts.inherit_fds, inh_list) { + if (!strcmp(inh->inh_id, id)) { + ret = fdstore_get(inh->inh_fd_id); + pr_debug("Found id %s (fd %d) in inherit fd list\n", + id, ret); + break; + } + } + return ret; +} + +bool inherited_fd(struct file_desc *d, int *fd_p) +{ + char buf[32], *id_str; + int i_fd; + + if (!d->ops->name) + return false; + + id_str = d->ops->name(d, buf, sizeof(buf)); + i_fd = inherit_fd_lookup_id(id_str); + if (i_fd < 0) + return false; + + if (fd_p == NULL) + return true; + + *fd_p = i_fd; + pr_info("File %s will be restored from fd %d dumped " + "from inherit fd %d\n", id_str, *fd_p, i_fd); + return true; +} + +int open_transport_socket(void) +{ + pid_t pid = vpid(current); + struct sockaddr_un saddr; + int sock, slen, ret = -1; + + sock = socket(PF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0); + if (sock < 0) { + pr_perror("Can't create socket"); + goto out; + } + + transport_name_gen(&saddr, &slen, pid); + if (bind(sock, (struct sockaddr *)&saddr, slen) < 0) { + pr_perror("Can't bind transport socket %s", saddr.sun_path + 1); + close(sock); + goto out; + } + + if (install_service_fd(TRANSPORT_FD_OFF, sock) < 0) + goto out; + ret = 0; +out: + return ret; +} + +static int collect_one_file_entry(FileEntry *fe, u_int32_t id, ProtobufCMessage *base, + struct collect_image_info *cinfo) +{ + if (fe->id != id) { + pr_err("ID mismatch %u != %u\n", fe->id, id); + return -1; + } + + return collect_entry(base, cinfo); +} + +static int collect_one_file(void *o, ProtobufCMessage *base, struct cr_img *i) +{ + int ret = 0; + FileEntry *fe; + + fe = pb_msg(base, FileEntry); + switch (fe->type) { + default: + pr_err("Unknown file type %d\n", fe->type); + return -1; + case FD_TYPES__REG: + ret = collect_one_file_entry(fe, fe->reg->id, &fe->reg->base, ®_file_cinfo); + break; + case FD_TYPES__INETSK: + ret = collect_one_file_entry(fe, fe->isk->id, 
&fe->isk->base, &inet_sk_cinfo); + break; + case FD_TYPES__NS: + ret = collect_one_file_entry(fe, fe->nsf->id, &fe->nsf->base, &nsfile_cinfo); + break; + case FD_TYPES__PACKETSK: + ret = collect_one_file_entry(fe, fe->psk->id, &fe->psk->base, &packet_sk_cinfo); + break; + case FD_TYPES__NETLINKSK: + ret = collect_one_file_entry(fe, fe->nlsk->id, &fe->nlsk->base, &netlink_sk_cinfo); + break; + case FD_TYPES__EVENTFD: + ret = collect_one_file_entry(fe, fe->efd->id, &fe->efd->base, &eventfd_cinfo); + break; + case FD_TYPES__EVENTPOLL: + ret = collect_one_file_entry(fe, fe->epfd->id, &fe->epfd->base, &epoll_cinfo); + break; + case FD_TYPES__SIGNALFD: + ret = collect_one_file_entry(fe, fe->sgfd->id, &fe->sgfd->base, &signalfd_cinfo); + break; + case FD_TYPES__TUNF: + ret = collect_one_file_entry(fe, fe->tunf->id, &fe->tunf->base, &tunfile_cinfo); + break; + case FD_TYPES__TIMERFD: + ret = collect_one_file_entry(fe, fe->tfd->id, &fe->tfd->base, &timerfd_cinfo); + break; + case FD_TYPES__INOTIFY: + ret = collect_one_file_entry(fe, fe->ify->id, &fe->ify->base, &inotify_cinfo); + break; + case FD_TYPES__FANOTIFY: + ret = collect_one_file_entry(fe, fe->ffy->id, &fe->ffy->base, &fanotify_cinfo); + break; + case FD_TYPES__EXT: + ret = collect_one_file_entry(fe, fe->ext->id, &fe->ext->base, &ext_file_cinfo); + break; + case FD_TYPES__UNIXSK: + ret = collect_one_file_entry(fe, fe->usk->id, &fe->usk->base, &unix_sk_cinfo); + break; + case FD_TYPES__FIFO: + ret = collect_one_file_entry(fe, fe->fifo->id, &fe->fifo->base, &fifo_cinfo); + break; + case FD_TYPES__PIPE: + ret = collect_one_file_entry(fe, fe->pipe->id, &fe->pipe->base, &pipe_cinfo); + break; + case FD_TYPES__TTY: + ret = collect_one_file_entry(fe, fe->tty->id, &fe->tty->base, &tty_cinfo); + break; + } + + return ret; +} + +struct collect_image_info files_cinfo = { + .fd_type = CR_FD_FILES, + .pb_type = PB_FILE, + .priv_size = 0, + .collect = collect_one_file, + .flags = COLLECT_NOFREE, +}; + +int prepare_files(void) +{ 
+ init_fdesc_hash(); + return collect_image(&files_cinfo); +} diff --git a/CRIU_code/criu/filesystems.c b/CRIU_code/criu/filesystems.c new file mode 100644 index 0000000..1e4550b --- /dev/null +++ b/CRIU_code/criu/filesystems.c @@ -0,0 +1,870 @@ +#include +#include +#include +#include +#include + +#include "common/config.h" +#include "int.h" +#include "common/compiler.h" +#include "xmalloc.h" +#include "cr_options.h" +#include "filesystems.h" +#include "namespaces.h" +#include "mount.h" +#include "pstree.h" +#include "kerndat.h" +#include "protobuf.h" +#include "autofs.h" +#include "util.h" +#include "fs-magic.h" +#include "tty.h" + +#include "images/mnt.pb-c.h" +#include "images/binfmt-misc.pb-c.h" + +static int attach_option(struct mount_info *pm, char *opt) +{ + if (pm->options[0] == '\0') + pm->options = xstrcat(pm->options, "%s", opt); + else + pm->options = xstrcat(pm->options, ",%s", opt); + return pm->options ? 0 : -1; +} + +#ifdef CONFIG_BINFMT_MISC_VIRTUALIZED +struct binfmt_misc_info { + BinfmtMiscEntry *bme; + struct list_head list; +}; + +LIST_HEAD(binfmt_misc_list); + +static int binfmt_misc_parse_or_collect(struct mount_info *pm) +{ + opts.has_binfmt_misc = true; + return 0; + +} + +static int binfmt_misc_virtual(struct mount_info *pm) +{ + return kerndat_fs_virtualized(KERNDAT_FS_STAT_BINFMT_MISC, pm->s_dev); +} + +static int parse_binfmt_misc_entry(struct bfd *f, BinfmtMiscEntry *bme) +{ + while (1) { + char *str; + + str = breadline(f); + if (IS_ERR(str)) + return -1; + if (!str) + break; + + if (!strncmp(str, "enabled", 7)) { + bme->enabled = true; + continue; + } + + if (!strncmp(str, "disabled", 8)) + continue; + + if (!strncmp(str, "offset ", 7)) { + if (sscanf(str + 7, "%i", &bme->offset) != 1) + return -1; + bme->has_offset = true; + continue; + } + +#define DUP_EQUAL_AS(key, member) \ + if (!strncmp(str, key, strlen(key))) { \ + bme->member = xstrdup(str + strlen(key)); \ + if (!bme->member) \ + return -1; \ + continue; \ + } + 
DUP_EQUAL_AS("interpreter ", interpreter) + DUP_EQUAL_AS("flags: ", flags) + DUP_EQUAL_AS("extension .", extension) + DUP_EQUAL_AS("magic ", magic) + DUP_EQUAL_AS("mask ", mask) +#undef DUP_EQUAL_AS + + pr_perror("binfmt_misc: unsupported feature %s", str); + return -1; + } + + return 0; +} + +static int dump_binfmt_misc_entry(int dfd, char *name, struct cr_img *img) +{ + BinfmtMiscEntry bme = BINFMT_MISC_ENTRY__INIT; + struct bfd f; + int ret = -1; + + f.fd = openat(dfd, name, O_RDONLY); + if (f.fd < 0) { + pr_perror("binfmt_misc: can't open %s", name); + return -1; + } + + if (bfdopenr(&f)) + return -1; + + if (parse_binfmt_misc_entry(&f, &bme)) + goto err; + + bme.name = name; + + if (pb_write_one(img, &bme, PB_BINFMT_MISC)) + goto err; + ret = 0; +err: + free(bme.interpreter); + free(bme.flags); + free(bme.extension); + free(bme.magic); + free(bme.mask); + bclose(&f); + return ret; + +} + +static int binfmt_misc_dump(struct mount_info *pm) +{ + static bool dumped = false; + struct cr_img *img = NULL; + struct dirent *de; + DIR *fdir = NULL; + int fd, ret; + + ret = binfmt_misc_virtual(pm); + if (ret <= 0) + return ret; + + if (dumped) { + pr_err("Second binfmt_misc superblock\n"); + return -1; + } + dumped = true; + + fd = open_mountpoint(pm); + if (fd < 0) + return fd; + + fdir = fdopendir(fd); + if (fdir == NULL) { + close(fd); + return -1; + } + + ret = -1; + while ((de = readdir(fdir))) { + if (dir_dots(de)) + continue; + if (!strcmp(de->d_name, "register")) + continue; + if (!strcmp(de->d_name, "status")) + continue; + + if (!img) { + /* Create image only if an entry exists, i.e. 
here */ + img = open_image(CR_FD_BINFMT_MISC, O_DUMP); + if (!img) + goto out; + } + + if (dump_binfmt_misc_entry(fd, de->d_name, img)) + goto out; + } + + ret = 0; +out: + if (img) + close_image(img); + closedir(fdir); + return ret; +} + +static int write_binfmt_misc_entry(char *mp, char *buf, BinfmtMiscEntry *bme) +{ + int fd, len, ret = -1; + char path[PATH_MAX+1]; + + snprintf(path, PATH_MAX, "%s/register", mp); + + fd = open(path, O_WRONLY); + if (fd < 0) { + pr_perror("binfmt_misc: can't open %s", path); + return -1; + } + + len = strlen(buf); + + if (write(fd, buf, len) != len) { + pr_perror("binfmt_misc: can't write to %s", path); + goto close; + } + + if (!bme->enabled) { + close(fd); + snprintf(path, PATH_MAX, "%s/%s", mp, bme->name); + + fd = open(path, O_WRONLY); + if (fd < 0) { + pr_perror("binfmt_misc: can't open %s", path); + goto out; + } + if (write(fd, "0", 1) != 1) { + pr_perror("binfmt_misc: can't write to %s", path); + goto close; + } + } + + ret = 0; +close: + close(fd); +out: + return ret; +} + +#define BINFMT_MISC_STR (1920 + 1) +static int make_bfmtm_magic_str(char *buf, BinfmtMiscEntry *bme) +{ + int i, len; + + /* + * Format is ":name:type(M):offset:magic:mask:interpreter:flags". + * Magic and mask are special fields. Kernel outputs them as + * a sequence of hexadecimal numbers (abc -> 616263), and we + * dump them without changes. But for registering a new entry + * it expects every byte is prepended with \x, i.e. \x61\x62\x63. + */ + len = strlen(bme->name) + 3 /* offset < 128 */ + 2 * strlen(bme->magic) + + (bme->mask ? 2 * strlen(bme->mask) : 0) + strlen(bme->interpreter) + + (bme->flags ? 
strlen(bme->flags) : 0) + strlen(":::::::"); + + if ((len > BINFMT_MISC_STR - 1) || bme->offset > 128) + return -1; + + buf += sprintf(buf, ":%s:M:%d:", bme->name, bme->offset); + + len = strlen(bme->magic); + for (i = 0; i < len; i += 2) + buf += sprintf(buf, "\\x%c%c", bme->magic[i], bme->magic[i + 1]); + + buf += sprintf(buf, ":"); + + if (bme->mask) { + len = strlen(bme->mask); + for (i = 0; i < len; i += 2) + buf += sprintf(buf, "\\x%c%c", bme->mask[i], bme->mask[i + 1]); + } + + sprintf(buf, ":%s:%s", bme->interpreter, bme->flags ? : "\0"); + + return 1; +} + +static int binfmt_misc_restore_bme(struct mount_info *mi, BinfmtMiscEntry *bme, char *buf) +{ + int ret; + + if (!bme->name || !bme->interpreter) + goto bad_dump; + + /* Either magic or extension should be there */ + if (bme->magic) { + ret = make_bfmtm_magic_str(buf, bme); + } else if (bme->extension) { + /* :name:E::extension::interpreter:flags */ + ret = snprintf(buf, BINFMT_MISC_STR, ":%s:E::%s::%s:%s", + bme->name, bme->extension, bme->interpreter, + bme->flags ? 
: "\0"); + if (ret >= BINFMT_MISC_STR) /* output truncated */ + ret = -1; + } else + ret = -1; + + if (ret < 0) + goto bad_dump; + + pr_debug("binfmt_misc_pattern=%s\n", buf); + ret = write_binfmt_misc_entry(mi->mountpoint, buf, bme); + + return ret; + +bad_dump: + pr_perror("binfmt_misc: bad dump"); + return -1; +} + +static int binfmt_misc_restore(struct mount_info *mi) +{ + struct cr_img *img; + char *buf; + int ret = -1; + + buf = xmalloc(BINFMT_MISC_STR); + if (!buf) + return -1; + + if (!list_empty(&binfmt_misc_list)) { + struct binfmt_misc_info *bmi; + + list_for_each_entry(bmi, &binfmt_misc_list, list) { + ret = binfmt_misc_restore_bme(mi, bmi->bme, buf); + if (ret) + break; + } + goto free_buf; + } + + img = open_image(CR_FD_BINFMT_MISC_OLD, O_RSTR, mi->s_dev); + if (!img) { + pr_err("Can't open binfmt_misc_old image\n"); + goto free_buf; + } else if (empty_image(img)) { + close_image(img); + ret = 0; + goto free_buf; + } + + ret = 0; + while (ret == 0) { + BinfmtMiscEntry *bme; + + ret = pb_read_one_eof(img, &bme, PB_BINFMT_MISC); + if (ret <= 0) + break; + + ret = binfmt_misc_restore_bme(mi, bme, buf); + + binfmt_misc_entry__free_unpacked(bme, NULL); + } + + close_image(img); +free_buf: + free(buf); + return ret; +} + +static int collect_one_binfmt_misc_entry(void *o, ProtobufCMessage *msg, struct cr_img *img) +{ + struct binfmt_misc_info *bmi = o; + + bmi->bme = pb_msg(msg, BinfmtMiscEntry); + list_add_tail(&bmi->list, &binfmt_misc_list); + + return 0; +} + +struct collect_image_info binfmt_misc_cinfo = { + .fd_type = CR_FD_BINFMT_MISC, + .pb_type = PB_BINFMT_MISC, + .priv_size = sizeof(struct binfmt_misc_info), + .collect = collect_one_binfmt_misc_entry, +}; + +int collect_binfmt_misc(void) +{ + return collect_image(&binfmt_misc_cinfo); +} +#else +#define binfmt_misc_dump NULL +#define binfmt_misc_restore NULL +#define binfmt_misc_parse_or_collect NULL +#endif + +static int tmpfs_dump(struct mount_info *pm) +{ + int ret = -1, fd = -1, userns_pid = -1; 
+ struct cr_img *img; + int tmp_fds[3], ntmp_fds = 0, i; + + fd = open_mountpoint(pm); + if (fd < 0) + return MNT_UNREACHABLE; + + /* + * fd should not be one of standard descriptors, because + * cr_system_userns will override them. + */ + for (i = 0; i < 3; i++) { + if (fd > 2) + break; + tmp_fds[ntmp_fds++] = fd; + fd = dup(fd); + if (fd < 0) { + pr_perror("Unable to duplicate a file descriptor"); + goto out; + } + } + + if (move_fd_from(&fd, STDIN_FILENO) < 0) + goto out; + + if (fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) & ~FD_CLOEXEC) == -1) { + pr_perror("Can not drop FD_CLOEXEC"); + goto out; + } + + img = open_image(CR_FD_TMPFS_DEV, O_DUMP, pm->s_dev); + if (!img) + goto out; + + if (root_ns_mask & CLONE_NEWUSER) + userns_pid = root_item->pid->real; + + ret = cr_system_userns(fd, img_raw_fd(img), -1, "tar", (char *[]) + { "tar", "--create", + "--gzip", + "--no-unquote", + "--no-wildcards", + "--one-file-system", + "--check-links", + "--preserve-permissions", + "--sparse", + "--numeric-owner", + "--directory", "/proc/self/fd/0", ".", NULL }, 0, userns_pid); + + if (ret) + pr_err("Can't dump tmpfs content\n"); + + close_image(img); +out: + for (i = 0; i < ntmp_fds; i++) + close(tmp_fds[i]); + close_safe(&fd); + return ret; +} + +static int tmpfs_restore(struct mount_info *pm) +{ + int ret; + struct cr_img *img; + + img = open_image(CR_FD_TMPFS_DEV, O_RSTR, pm->s_dev); + if (empty_image(img)) { + close_image(img); + img = open_image(CR_FD_TMPFS_IMG, O_RSTR, pm->mnt_id); + } + if (!img) + return -1; + if (empty_image(img)) { + close_image(img); + return -1; + } + + ret = cr_system(img_raw_fd(img), -1, -1, "tar", + (char *[]) {"tar", "--extract", "--gzip", + "--no-unquote", "--no-wildcards", + "--directory", pm->mountpoint, NULL}, 0); + close_image(img); + + if (ret) { + pr_err("Can't restore tmpfs content\n"); + return -1; + } + + return 0; +} + +/* + * Virtualized devtmpfs on any side (dump or restore) + * means, that we should try to handle it as a plain + * 
tmpfs. + * + * Interesting case -- shared on dump and virtual on + * restore -- will fail, since no tarball with the fs + * contents will be found. + */ + +static int devtmpfs_virtual(struct mount_info *pm) +{ + return kerndat_fs_virtualized(KERNDAT_FS_STAT_DEVTMPFS, pm->s_dev); +} + +static int devtmpfs_dump(struct mount_info *pm) +{ + int ret; + + ret = devtmpfs_virtual(pm); + if (ret == 1) + ret = tmpfs_dump(pm); + + return ret; +} + +static int devtmpfs_restore(struct mount_info *pm) +{ + int ret; + + ret = devtmpfs_virtual(pm); + if (ret == 1) + ret = tmpfs_restore(pm); + + return ret; +} + +/* Is it mounted w or w/o the newinstance option */ +static int devpts_parse(struct mount_info *pm) +{ + int ret; + + ret = kerndat_fs_virtualized(KERNDAT_FS_STAT_DEVPTS, pm->s_dev); + if (ret <= 0) + return ret; + + /* + * Kernel hides this option, but if the fs instance + * is new (virtualized) we know that it was created + * with -o newinstance. + */ + return attach_option(pm, "newinstance"); +} + +static int fusectl_dump(struct mount_info *pm) +{ + int fd, ret = -1; + struct dirent *de; + DIR *fdir = NULL; + + fd = open_mountpoint(pm); + if (fd < 0) + return fd; + + fdir = fdopendir(fd); + if (fdir == NULL) { + close(fd); + return -1; + } + + while ((de = readdir(fdir))) { + int id; + struct mount_info *it; + + if (dir_dots(de)) + continue; + + if (sscanf(de->d_name, "%d", &id) != 1) { + pr_err("wrong number of items scanned in fusectl dump\n"); + goto out; + } + + for (it = mntinfo; it; it = it->next) { + if (it->fstype->code == FSTYPE__FUSE && + id == kdev_minor(it->s_dev) && !it->external) { + pr_err("%s is a fuse mount but not external\n", it->mountpoint); + goto out; + } + } + } + + ret = 0; +out: + closedir(fdir); + return ret; +} + +static int debugfs_parse(struct mount_info *pm) +{ + /* tracefs is automounted underneath debugfs sometimes, and the + * kernel's overmounting protection prevents us from mounting debugfs + * first without tracefs, so let's always 
mount debugfs MS_REC. + */ + pm->flags |= MS_REC; + + return 0; +} + +static int tracefs_parse(struct mount_info *pm) +{ + return 1; +} + +static bool cgroup_sb_equal(struct mount_info *a, struct mount_info *b) +{ + if (a->private && b->private && + strcmp(a->private, b->private)) + return false; + if (strcmp(a->options, b->options)) + return false; + + return true; +} + +static int cgroup_parse(struct mount_info *pm) +{ + if (!(root_ns_mask & CLONE_NEWCGROUP)) + return 0; + + /* cgroup namespaced mounts don't look rooted to CRIU, so let's fake it + * here. + */ + pm->private = pm->root; + pm->root = xstrdup("/"); + if (!pm->root) + return -1; + + return 0; +} + +static bool btrfs_sb_equal(struct mount_info *a, struct mount_info *b) +{ + /* There is a btrfs bug where it doesn't emit subvol= correctly when + * files are bind mounted, so let's ignore it for now. + * https://marc.info/?l=linux-btrfs&m=145857372803614&w=2 + */ + + char *posa = strstr(a->options, "subvol="), *posb = strstr(b->options, "subvol="); + bool equal; + + if (!posa || !posb) { + pr_err("invalid btrfs options, no subvol argument\n"); + return false; + } + + *posa = *posb = 0; + equal = !strcmp(a->options, b->options); + *posa = *posb = 's'; + + if (!equal) + return false; + + posa = strchr(posa, ','); + posb = strchr(posb, ','); + + if ((posa && !posb) || (!posa && posb)) + return false; + + if (posa && strcmp(posa, posb)) + return false; + + return true; +} + +static int dump_empty_fs(struct mount_info *pm) +{ + int fd, ret = -1; + + fd = open_mountpoint(pm); + if (fd < 0) + return fd; + + ret = is_empty_dir(fd); + close(fd); + if (ret < 0) { + pr_err("%s isn't empty\n", pm->fstype->name); + return -1; + } + + return ret ? 0 : -1; +} + +/* + * Some fses (fuse) cannot be dumped, so we should always fail on dump/restore + * of these fses. 
+ */ +static int always_fail(struct mount_info *pm) +{ + pr_err("failed to dump fs %s (%s): always fail\n", pm->mountpoint, + pm->fstype->name); + return -1; +} + +static struct fstype fstypes[] = { + { + .name = "unsupported", + .code = FSTYPE__UNSUPPORTED, + }, { + .name = "auto_cr", + .code = FSTYPE__AUTO, + }, { + .name = "proc", + .code = FSTYPE__PROC, + }, { + .name = "sysfs", + .code = FSTYPE__SYSFS, + }, { + .name = "devtmpfs", + .code = FSTYPE__DEVTMPFS, + .dump = devtmpfs_dump, + .restore = devtmpfs_restore, + }, { + .name = "binfmt_misc", + .parse = binfmt_misc_parse_or_collect, + .collect = binfmt_misc_parse_or_collect, + .code = FSTYPE__BINFMT_MISC, + .dump = binfmt_misc_dump, + .restore = binfmt_misc_restore, + }, { + .name = "tmpfs", + .code = FSTYPE__TMPFS, + .dump = tmpfs_dump, + .restore = tmpfs_restore, + }, { + .name = "devpts", + .parse = devpts_parse, + .code = FSTYPE__DEVPTS, + .restore = devpts_restore, + .check_bindmount = devpts_check_bindmount, + }, { + .name = "simfs", + .code = FSTYPE__SIMFS, + }, { + .name = "btrfs", + .code = FSTYPE__UNSUPPORTED, + .sb_equal = btrfs_sb_equal, + }, { + .name = "pstore", + .dump = dump_empty_fs, + .code = FSTYPE__PSTORE, + }, { + .name = "mqueue", + .dump = dump_empty_fs, + .code = FSTYPE__MQUEUE, + }, { + .name = "securityfs", + .code = FSTYPE__SECURITYFS, + }, { + .name = "fusectl", + .dump = fusectl_dump, + .code = FSTYPE__FUSECTL, + }, { + .name = "debugfs", + .code = FSTYPE__DEBUGFS, + .parse = debugfs_parse, + }, { + .name = "tracefs", + .code = FSTYPE__TRACEFS, + .parse = tracefs_parse, + }, { + .name = "cgroup", + .code = FSTYPE__CGROUP, + .parse = cgroup_parse, + .sb_equal = cgroup_sb_equal, + }, { + .name = "aufs", + .code = FSTYPE__AUFS, + .parse = aufs_parse, + }, { + .name = "fuse", + .code = FSTYPE__FUSE, + .dump = always_fail, + .restore = always_fail, + }, { + .name = "overlay", + .code = FSTYPE__OVERLAYFS, + .parse = overlayfs_parse, + }, { + .name = "autofs", + .code = FSTYPE__AUTOFS, 
+ .parse = autofs_parse, + .dump = autofs_dump, + .mount = autofs_mount, + }, +}; + +struct fstype *fstype_auto(void) { return &fstypes[1]; } + +static char fsauto_all[] = "all"; +static char *fsauto_names; + +static bool css_contains(const char *css, const char *str) +{ + int len = strlen(str); + const char *cur; + + if (!len) + return false; + + for (cur = css; (cur = strstr(cur, str)); cur += len) { + if (cur > css && cur[-1] != ',') + continue; + if (cur[len] && cur[len] != ',') + continue; + return true; + } + + return false; +} + +static bool fsname_is_auto(const char *name) +{ + if (!fsauto_names) + return false; + + if (fsauto_names == fsauto_all) + return true; + + return css_contains(fsauto_names, name); +} + +bool add_fsname_auto(const char *names) +{ + char *old = fsauto_names; + + if (old == fsauto_all) + return true; + + if (css_contains(names, fsauto_all)) + fsauto_names = fsauto_all; + else if (!old) + fsauto_names = xstrdup(names); + else { + if (asprintf(&fsauto_names, "%s,%s", old, names) < 0) + fsauto_names = NULL; + } + + xfree(old); + return fsauto_names != NULL; +} + +struct fstype *find_fstype_by_name(char *fst) +{ + int i; + + /* + * This fn is required for two things. 
+ * 1st -- to check supported filesystems (as just mounting + * anything is wrong, almost every fs has its own features) + * 2nd -- save some space in the image (since we scan all + * names anyway) + */ + for (i = 1; i < ARRAY_SIZE(fstypes); i++) { + struct fstype *fstype = fstypes + i; + + if (!strcmp(fstype->name, fst)) + return fstype; + } + + if (fsname_is_auto(fst)) + return &fstypes[1]; + + return &fstypes[0]; +} + +struct fstype *decode_fstype(u32 fst) +{ + int i; + + if (fst == FSTYPE__UNSUPPORTED) + goto uns; + + for (i = 1; i < ARRAY_SIZE(fstypes); i++) { + struct fstype *fstype = fstypes + i; + + if (!fstype->name) + break; + + if (fstype->code == fst) + return fstype; + } +uns: + return &fstypes[0]; +} + diff --git a/CRIU_code/criu/fsnotify.c b/CRIU_code/criu/fsnotify.c new file mode 100644 index 0000000..09093c0 --- /dev/null +++ b/CRIU_code/criu/fsnotify.c @@ -0,0 +1,934 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "common/compiler.h" +#include "imgset.h" +#include "fsnotify.h" +#include "fdinfo.h" +#include "mount.h" +#include "filesystems.h" +#include "image.h" +#include "util.h" +#include "crtools.h" +#include "files.h" +#include "files-reg.h" +#include "file-ids.h" +#include "criu-log.h" +#include "kerndat.h" +#include "common/list.h" +#include "common/lock.h" +#include "irmap.h" +#include "cr_options.h" +#include "namespaces.h" +#include "pstree.h" +#include "fault-injection.h" +#include + +#include "protobuf.h" +#include "images/fsnotify.pb-c.h" +#include "images/mnt.pb-c.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "fsnotify: " + +struct fsnotify_mark_info { + struct list_head list; + union { + InotifyWdEntry *iwe; + FanotifyMarkEntry *fme; + }; + struct pprep_head prep; /* XXX union with remap */ + struct file_remap *remap; +}; + +struct fsnotify_file_info { + union { + 
InotifyFileEntry *ife; + FanotifyFileEntry *ffe; + }; + struct list_head marks; + struct file_desc d; +}; + +/* File handle */ +typedef struct { + u32 bytes; + u32 type; + u64 __handle[16]; +} fh_t; + +/* Checks if file descriptor @lfd is inotify */ +int is_inotify_link(char *link) +{ + return is_anon_link_type(link, "inotify"); +} + +/* Checks if file descriptor @lfd is fanotify */ +int is_fanotify_link(char *link) +{ + return is_anon_link_type(link, "[fanotify]"); +} + +static void decode_handle(fh_t *handle, FhEntry *img) +{ + memzero(handle, sizeof(*handle)); + + handle->type = img->type; + handle->bytes = img->bytes; + + memcpy(handle->__handle, img->handle, + min(pb_repeated_size(img, handle), + sizeof(handle->__handle))); +} + +static int open_by_handle(void *arg, int fd, int pid) +{ + return syscall(__NR_open_by_handle_at, fd, arg, O_PATH); +} + +static char *alloc_openable(unsigned int s_dev, unsigned long i_ino, FhEntry *f_handle) +{ + struct mount_info *m; + fh_t handle; + int fd = -1; + char *path; + + decode_handle(&handle, f_handle); + + /* + * We gonna try to open the handle and then + * depending on command line options and type + * of the filesystem (tmpfs/devtmpfs do not + * preserve their inodes between mounts) we + * might need to find out an openable path + * get used on restore as a watch destination. + */ + for (m = mntinfo; m; m = m->next) { + char buf[PATH_MAX], *__path; + int mntfd, openable_fd; + struct stat st; + + if (m->s_dev != s_dev) + continue; + if (!mnt_is_dir(m)) + continue; + + mntfd = __open_mountpoint(m, -1); + pr_debug("\t\tTrying via mntid %d root %s ns_mountpoint @%s (%d)\n", + m->mnt_id, m->root, m->ns_mountpoint, mntfd); + if (mntfd < 0) + continue; + + fd = userns_call(open_by_handle, UNS_FDOUT, &handle, + sizeof(handle), mntfd); + close(mntfd); + if (fd < 0) + continue; + + if (read_fd_link(fd, buf, sizeof(buf)) < 0) { + close(fd); + goto err; + } + close(fd); + + /* + * Convert into a relative path. 
+ */ + __path = (buf[1] != '\0') ? buf + 1 : "."; + pr_debug("\t\t\tlink as %s\n", __path); + + mntfd = mntns_get_root_fd(m->nsid); + if (mntfd < 0) + goto err; + + openable_fd = openat(mntfd, __path, O_PATH); + if (openable_fd >= 0) { + if (fstat(openable_fd, &st)) { + pr_perror("Can't stat on %s", __path); + close(openable_fd); + return ERR_PTR(-errno); + } + close(openable_fd); + + pr_debug("\t\t\topenable (inode %s) as %s\n", + st.st_ino == i_ino ? + "match" : "don't match", __path); + + if (st.st_ino == i_ino) { + path = xstrdup(buf); + if (path == NULL) + return ERR_PTR(-ENOMEM); + if (root_ns_mask & CLONE_NEWNS) { + f_handle->has_mnt_id = true; + f_handle->mnt_id = m->mnt_id; + } + return path; + } + } else + pr_debug("\t\t\tnot openable as %s (%m)\n", __path); + } + + return ERR_PTR(-ENOENT); +err: + return ERR_PTR(-1); +} + +static int open_handle(unsigned int s_dev, unsigned long i_ino, + FhEntry *f_handle) +{ + struct mount_info *m; + int mntfd, fd = -1; + fh_t handle; + + decode_handle(&handle, f_handle); + + pr_debug("Opening fhandle %x:%llx...\n", + s_dev, (unsigned long long)handle.__handle[0]); + + for (m = mntinfo; m; m = m->next) { + if (m->s_dev != s_dev || !mnt_is_dir(m)) + continue; + + mntfd = __open_mountpoint(m, -1); + if (mntfd < 0) { + pr_err("Can't open mount for s_dev %x, continue\n", s_dev); + continue; + } + + fd = userns_call(open_by_handle, UNS_FDOUT, &handle, sizeof(handle), mntfd); + if (fd >= 0) { + close(mntfd); + goto out; + } + close(mntfd); + } +out: + return fd; +} + +int check_open_handle(unsigned int s_dev, unsigned long i_ino, + FhEntry *f_handle) +{ + char *path, *irmap_path; + int fd = -1; + + if (fault_injected(FI_CHECK_OPEN_HANDLE)) { + fd = -1; + goto fault; + } + + fd = open_handle(s_dev, i_ino, f_handle); +fault: + if (fd >= 0) { + struct mount_info *mi; + + pr_debug("\tHandle 0x%x:0x%lx is openable\n", s_dev, i_ino); + + mi = lookup_mnt_sdev(s_dev); + if (mi == NULL) { + pr_err("Unable to lookup a mount by dev 
0x%x\n", s_dev); + goto err; + } + + /* + * Always try to fetch watchee path first. There are several reasons: + * + * - tmpfs/devtmps do not save inode numbers between mounts, + * so it is critical to have the complete path under our + * hands for restore purpose; + * + * - in case of migration the inodes might be changed as well + * so the only portable solution is to carry the whole path + * to the watchee inside image. + */ + path = alloc_openable(s_dev, i_ino, f_handle); + if (!IS_ERR_OR_NULL(path)) + goto out; + else if (IS_ERR(path) && PTR_ERR(path) == -ENOMEM) + goto err; + + if ((mi->fstype->code == FSTYPE__TMPFS) || + (mi->fstype->code == FSTYPE__DEVTMPFS)) { + pr_err("Can't find suitable path for handle (dev %#x ino %#lx): %d\n", + s_dev, i_ino, (int)PTR_ERR(path)); + goto err; + } + + if (!opts.force_irmap) + /* + * If we're not forced to do irmap, then + * say we have no path for watch. Otherwise + * do irmap scan even if the handle is + * working. + * + * FIXME -- no need to open-by-handle if + * we are in force-irmap and not on tempfs + */ + goto out_nopath; + } + + pr_warn("\tHandle 0x%x:0x%lx cannot be opened\n", s_dev, i_ino); + irmap_path = irmap_lookup(s_dev, i_ino); + if (!irmap_path) { + pr_err("\tCan't dump that handle\n"); + return -1; + } + path = xstrdup(irmap_path); + if (!path) + goto err; +out: + pr_debug("\tDumping %s as path for handle\n", path); + f_handle->path = path; +out_nopath: + close_safe(&fd); + return 0; +err: + close_safe(&fd); + return -1; +} + +static int check_one_wd(InotifyWdEntry *we) +{ + pr_info("wd: wd %#08x s_dev %#08x i_ino %#16"PRIx64" mask %#08x\n", + we->wd, we->s_dev, we->i_ino, we->mask); + pr_info("\t[fhandle] bytes %#08x type %#08x __handle %#016"PRIx64":%#016"PRIx64"\n", + we->f_handle->bytes, we->f_handle->type, + we->f_handle->handle[0], we->f_handle->handle[1]); + + if (we->mask & KERNEL_FS_EVENT_ON_CHILD) + pr_warn_once("\t\tDetected FS_EVENT_ON_CHILD bit " + "in mask (will be ignored on restore)\n"); 
+ + if (check_open_handle(we->s_dev, we->i_ino, we->f_handle)) + return -1; + + return 0; +} + +static int dump_one_inotify(int lfd, u32 id, const struct fd_parms *p) +{ + FileEntry fe = FILE_ENTRY__INIT; + InotifyFileEntry ie = INOTIFY_FILE_ENTRY__INIT; + int exit_code = -1, i, ret; + + ret = fd_has_data(lfd); + if (ret < 0) + return -1; + else if (ret > 0) + pr_warn("The %#08x inotify events will be dropped\n", id); + + ie.id = id; + ie.flags = p->flags; + ie.fown = (FownEntry *)&p->fown; + + if (parse_fdinfo(lfd, FD_TYPES__INOTIFY, &ie)) + goto free; + + for (i = 0; i < ie.n_wd; i++) + if (check_one_wd(ie.wd[i])) + goto free; + + fe.type = FD_TYPES__INOTIFY; + fe.id = ie.id; + fe.ify = &ie; + + pr_info("id %#08x flags %#08x\n", ie.id, ie.flags); + if (pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE)) + goto free; + + exit_code = 0; +free: + for (i = 0; i < ie.n_wd; i++) + xfree(ie.wd[i]); + xfree(ie.wd); + + return exit_code; +} + +static int pre_dump_one_inotify(int pid, int lfd) +{ + InotifyFileEntry ie = INOTIFY_FILE_ENTRY__INIT; + int i; + + if (parse_fdinfo_pid(pid, lfd, FD_TYPES__INOTIFY, &ie)) + return -1; + + for (i = 0; i < ie.n_wd; i++) { + InotifyWdEntry *we = ie.wd[i]; + + if (irmap_queue_cache(we->s_dev, we->i_ino, we->f_handle)) + return -1; + + xfree(we); + } + + return 0; +} + +const struct fdtype_ops inotify_dump_ops = { + .type = FD_TYPES__INOTIFY, + .dump = dump_one_inotify, + .pre_dump = pre_dump_one_inotify, +}; + +static int check_one_mark(FanotifyMarkEntry *fme) +{ + if (fme->type == MARK_TYPE__INODE) { + + BUG_ON(!fme->ie); + + pr_info("mark: s_dev %#08x i_ino %#016"PRIx64" mask %#08x\n", + fme->s_dev, fme->ie->i_ino, fme->mask); + + pr_info("\t[fhandle] bytes %#08x type %#08x __handle %#016"PRIx64":%#016"PRIx64"\n", + fme->ie->f_handle->bytes, fme->ie->f_handle->type, + fme->ie->f_handle->handle[0], fme->ie->f_handle->handle[1]); + + if (check_open_handle(fme->s_dev, fme->ie->i_ino, fme->ie->f_handle)) + return -1; + } 
+ + if (fme->type == MARK_TYPE__MOUNT) { + struct mount_info *m; + + BUG_ON(!fme->me); + + m = lookup_mnt_id(fme->me->mnt_id); + if (!m) { + pr_err("Can't find mnt_id 0x%x\n", fme->me->mnt_id); + return -1; + } + if (!(root_ns_mask & CLONE_NEWNS)) + fme->me->path = m->mountpoint + 1; + fme->s_dev = m->s_dev; + + pr_info("mark: s_dev %#08x mnt_id %#08x mask %#08x\n", + fme->s_dev, fme->me->mnt_id, fme->mask); + + } + + return 0; +} + +static int dump_one_fanotify(int lfd, u32 id, const struct fd_parms *p) +{ + FileEntry fle = FILE_ENTRY__INIT; + FanotifyFileEntry fe = FANOTIFY_FILE_ENTRY__INIT; + int ret = -1, i; + + ret = fd_has_data(lfd); + if (ret < 0) + return -1; + else if (ret > 0) + pr_warn("The %#08x fanotify events will be dropped\n", id); + ret = -1; + + fe.id = id; + fe.flags = p->flags; + fe.fown = (FownEntry *)&p->fown; + + if (parse_fdinfo(lfd, FD_TYPES__FANOTIFY, &fe) < 0) + goto free; + + for (i = 0; i < fe.n_mark; i++) + if (check_one_mark(fe.mark[i])) + goto free; + + pr_info("id %#08x flags %#08x\n", fe.id, fe.flags); + + fle.type = FD_TYPES__FANOTIFY; + fle.id = fe.id; + fle.ffy = &fe; + + ret = pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fle, PB_FILE); +free: + for (i = 0; i < fe.n_mark; i++) + xfree(fe.mark[i]); + xfree(fe.mark); + return ret; +} + +static int pre_dump_one_fanotify(int pid, int lfd) +{ + FanotifyFileEntry fe = FANOTIFY_FILE_ENTRY__INIT; + int i; + + if (parse_fdinfo_pid(pid, lfd, FD_TYPES__FANOTIFY, &fe)) + return -1; + + for (i = 0; i < fe.n_mark; i++) { + FanotifyMarkEntry *me = fe.mark[i]; + + if (me->type == MARK_TYPE__INODE && + irmap_queue_cache(me->s_dev, me->ie->i_ino, + me->ie->f_handle)) + return -1; + + xfree(me); + } + xfree(fe.mark); + return 0; +} + +const struct fdtype_ops fanotify_dump_ops = { + .type = FD_TYPES__FANOTIFY, + .dump = dump_one_fanotify, + .pre_dump = pre_dump_one_fanotify, +}; + +static char *get_mark_path(const char *who, struct file_remap *remap, + FhEntry *f_handle, unsigned long 
i_ino, + unsigned int s_dev, char *buf, int *target) +{ + char *path = NULL; + + if (remap) { + int mntns_root; + + mntns_root = mntns_get_root_by_mnt_id(remap->rmnt_id); + + pr_debug("\t\tRestore %s watch for %#08x:%#016lx (via %s)\n", + who, s_dev, i_ino, remap->rpath); + *target = openat(mntns_root, remap->rpath, O_PATH); + } else if (f_handle->path) { + int mntns_root; + char *path = "."; + uint32_t mnt_id = f_handle->has_mnt_id ? f_handle->mnt_id : -1; + + /* irmap cache is collected in the root namespaces. */ + mntns_root = mntns_get_root_by_mnt_id(mnt_id); + + /* change "/foo" into "foo" and "/" into "." */ + if (f_handle->path[1] != '\0') + path = f_handle->path + 1; + + pr_debug("\t\tRestore with path hint %d:%s\n", mnt_id, path); + *target = openat(mntns_root, path, O_PATH); + } else + *target = open_handle(s_dev, i_ino, f_handle); + + if (*target < 0) { + pr_perror("Unable to open %s", f_handle->path); + goto err; + } + + /* + * fanotify/inotify open syscalls want path to attach + * watch to. But the only thing we have is an FD obtained + * via fhandle. Fortunatelly, when trying to attach the + * /proc/pid/fd/ link, we will watch the inode the link + * points to, i.e. -- just what we want. 
+ */ + + sprintf(buf, "/proc/self/fd/%d", *target); + path = buf; + + if (!pr_quelled(LOG_DEBUG)) { + char link[PATH_MAX]; + + if (read_fd_link(*target, link, sizeof(link)) < 0) + link[0] = '\0'; + + pr_debug("\t\tRestore %s watch for %#08x:%#016lx (via %s -> %s)\n", + who, s_dev, i_ino, path, link); + } +err: + return path; +} + +static int restore_one_inotify(int inotify_fd, struct fsnotify_mark_info *info) +{ + InotifyWdEntry *iwe = info->iwe; + int ret = -1, target = -1; + char buf[PSFDS], *path; + uint32_t mask; + + path = get_mark_path("inotify", info->remap, iwe->f_handle, + iwe->i_ino, iwe->s_dev, buf, &target); + if (!path) + goto err; + + mask = iwe->mask & IN_ALL_EVENTS; + if (iwe->mask & ~IN_ALL_EVENTS) { + pr_info("\t\tfilter event mask %#x -> %#x\n", + iwe->mask, mask); + } + + if (kdat.has_inotify_setnextwd) { + if (ioctl(inotify_fd, INOTIFY_IOC_SETNEXTWD, iwe->wd)) { + pr_perror("Can't set next inotify wd"); + return -1; + } + } + + while (1) { + int wd; + + wd = inotify_add_watch(inotify_fd, path, mask); + if (wd < 0) { + pr_perror("Can't add watch for 0x%x with 0x%x", inotify_fd, iwe->wd); + break; + } else if (wd == iwe->wd) { + ret = 0; + break; + } else if (wd > iwe->wd) { + pr_err("Unsorted watch 0x%x found for 0x%x with 0x%x\n", wd, inotify_fd, iwe->wd); + break; + } + + if (kdat.has_inotify_setnextwd) + return -1; + + inotify_rm_watch(inotify_fd, wd); + } + +err: + close_safe(&target); + return ret; +} + +static int restore_one_fanotify(int fd, struct fsnotify_mark_info *mark) +{ + FanotifyMarkEntry *fme = mark->fme; + unsigned int flags = FAN_MARK_ADD; + int ret = -1, target = -1; + char buf[PSFDS], *path = NULL; + + if (fme->type == MARK_TYPE__MOUNT) { + struct mount_info *m; + int mntns_root; + char *p = fme->me->path; + struct ns_id *nsid = NULL; + + if (root_ns_mask & CLONE_NEWNS) { + m = lookup_mnt_id(fme->me->mnt_id); + if (!m) { + pr_err("Can't find mount mnt_id 0x%x\n", fme->me->mnt_id); + return -1; + } + nsid = m->nsid; + p = 
m->ns_mountpoint; + } + + mntns_root = mntns_get_root_fd(nsid); + + target = openat(mntns_root, p, O_PATH); + if (target == -1) { + pr_perror("Unable to open %s", p); + goto err; + } + + flags |= FAN_MARK_MOUNT; + snprintf(buf, sizeof(buf), "/proc/self/fd/%d", target); + path = buf; + } else if (fme->type == MARK_TYPE__INODE) { + path = get_mark_path("fanotify", mark->remap, + fme->ie->f_handle, fme->ie->i_ino, + fme->s_dev, buf, &target); + if (!path) + goto err; + } else { + pr_err("Bad fsnotify mark type 0x%x\n", fme->type); + goto err; + } + + flags |= fme->mflags; + + if (mark->fme->mask) { + ret = fanotify_mark(fd, flags, fme->mask, AT_FDCWD, path); + if (ret) { + pr_err("Adding fanotify mask 0x%x on 0x%x/%s failed (%d)\n", + fme->mask, fme->id, path, ret); + goto err; + } + } + + if (fme->ignored_mask) { + ret = fanotify_mark(fd, flags | FAN_MARK_IGNORED_MASK, + fme->ignored_mask, AT_FDCWD, path); + if (ret) { + pr_err("Adding fanotify ignored-mask 0x%x on 0x%x/%s failed (%d)\n", + fme->ignored_mask, fme->id, path, ret); + goto err; + } + } + +err: + close_safe(&target); + return ret; +} + +static int open_inotify_fd(struct file_desc *d, int *new_fd) +{ + struct fsnotify_file_info *info; + struct fsnotify_mark_info *wd_info; + int tmp; + + info = container_of(d, struct fsnotify_file_info, d); + + tmp = inotify_init1(info->ife->flags); + if (tmp < 0) { + pr_perror("Can't create inotify for %#08x", info->ife->id); + return -1; + } + + list_for_each_entry(wd_info, &info->marks, list) { + pr_info("\tRestore 0x%x wd for %#08x\n", wd_info->iwe->wd, wd_info->iwe->id); + if (restore_one_inotify(tmp, wd_info)) { + close_safe(&tmp); + return -1; + } + pr_info("\t 0x%x wd for %#08x is restored\n", wd_info->iwe->wd, wd_info->iwe->id); + } + + if (restore_fown(tmp, info->ife->fown)) + close_safe(&tmp); + + *new_fd = tmp; + return 0; +} + +static int open_fanotify_fd(struct file_desc *d, int *new_fd) +{ + struct fsnotify_file_info *info; + struct fsnotify_mark_info *mark; 
+ unsigned int flags = 0; + int ret; + + info = container_of(d, struct fsnotify_file_info, d); + + flags = info->ffe->faflags; + if (info->ffe->flags & O_CLOEXEC) + flags |= FAN_CLOEXEC; + if (info->ffe->flags & O_NONBLOCK) + flags |= FAN_NONBLOCK; + + ret = fanotify_init(flags, info->ffe->evflags); + if (ret < 0) { + pr_perror("Can't init fanotify mark (%d)", ret); + return -1; + } + + list_for_each_entry(mark, &info->marks, list) { + pr_info("\tRestore fanotify for %#08x\n", mark->fme->id); + if (restore_one_fanotify(ret, mark)) { + close_safe(&ret); + return -1; + } + } + + if (restore_fown(ret, info->ffe->fown)) + close_safe(&ret); + + *new_fd = ret; + return 0; +} + +static struct file_desc_ops inotify_desc_ops = { + .type = FD_TYPES__INOTIFY, + .open = open_inotify_fd, +}; + +static struct file_desc_ops fanotify_desc_ops = { + .type = FD_TYPES__FANOTIFY, + .open = open_fanotify_fd, +}; + +static int inotify_resolve_remap(struct pprep_head *ph) +{ + struct fsnotify_mark_info *m; + + m = container_of(ph, struct fsnotify_mark_info, prep); + m->remap = lookup_ghost_remap(m->iwe->s_dev, m->iwe->i_ino); + return 0; +} + +static int fanotify_resolve_remap(struct pprep_head *ph) +{ + struct fsnotify_mark_info *m; + + m = container_of(ph, struct fsnotify_mark_info, prep); + m->remap = lookup_ghost_remap(m->fme->s_dev, m->fme->ie->i_ino); + return 0; +} + +static int __collect_inotify_mark(struct fsnotify_file_info *p, struct fsnotify_mark_info *mark) +{ + struct fsnotify_mark_info *m; + + /* + * We should put marks in wd ascending order. See comment + * in restore_one_inotify() for explanation. 
+ */ + list_for_each_entry(m, &p->marks, list) + if (m->iwe->wd > mark->iwe->wd) + break; + + list_add_tail(&mark->list, &m->list); + mark->prep.actor = inotify_resolve_remap; + add_post_prepare_cb(&mark->prep); + return 0; +} + +static int __collect_fanotify_mark(struct fsnotify_file_info *p, + struct fsnotify_mark_info *mark) +{ + list_add(&mark->list, &p->marks); + if (mark->fme->type == MARK_TYPE__INODE) { + mark->prep.actor = fanotify_resolve_remap; + add_post_prepare_cb(&mark->prep); + } + return 0; +} + +static int collect_one_inotify(void *o, ProtobufCMessage *msg, struct cr_img *img) +{ + struct fsnotify_file_info *info = o; + int i; + + info->ife = pb_msg(msg, InotifyFileEntry); + INIT_LIST_HEAD(&info->marks); + pr_info("Collected id %#08x flags %#08x\n", info->ife->id, info->ife->flags); + + for (i = 0; i < info->ife->n_wd; i++) { + struct fsnotify_mark_info *mark; + + mark = xmalloc(sizeof(*mark)); + if (!mark) + return -1; + + mark->iwe = info->ife->wd[i]; + INIT_LIST_HEAD(&mark->list); + mark->remap = NULL; + + if (__collect_inotify_mark(info, mark)) + return -1; + } + + return file_desc_add(&info->d, info->ife->id, &inotify_desc_ops); +} + +struct collect_image_info inotify_cinfo = { + .fd_type = CR_FD_INOTIFY_FILE, + .pb_type = PB_INOTIFY_FILE, + .priv_size = sizeof(struct fsnotify_file_info), + .collect = collect_one_inotify, +}; + +static int collect_one_fanotify(void *o, ProtobufCMessage *msg, struct cr_img *img) +{ + struct fsnotify_file_info *info = o; + int i; + + info->ffe = pb_msg(msg, FanotifyFileEntry); + INIT_LIST_HEAD(&info->marks); + pr_info("Collected id %#08x flags %#08x\n", info->ffe->id, info->ffe->flags); + + for (i = 0; i < info->ffe->n_mark; i++) { + struct fsnotify_mark_info *mark; + + mark = xmalloc(sizeof(*mark)); + if (!mark) + return -1; + + mark->fme = info->ffe->mark[i]; + INIT_LIST_HEAD(&mark->list); + mark->remap = NULL; + + if (__collect_fanotify_mark(info, mark)) + return -1; + } + + return file_desc_add(&info->d, 
info->ffe->id, &fanotify_desc_ops); +} + +struct collect_image_info fanotify_cinfo = { + .fd_type = CR_FD_FANOTIFY_FILE, + .pb_type = PB_FANOTIFY_FILE, + .priv_size = sizeof(struct fsnotify_file_info), + .collect = collect_one_fanotify, +}; + +static int collect_one_inotify_mark(void *o, ProtobufCMessage *msg, struct cr_img *i) +{ + struct fsnotify_mark_info *mark = o; + struct file_desc *d; + + if (!deprecated_ok("separate images for fsnotify marks")) + return -1; + + mark->iwe = pb_msg(msg, InotifyWdEntry); + INIT_LIST_HEAD(&mark->list); + mark->remap = NULL; + + /* + * The kernel prior 4.3 might export internal event + * mask bits which are not part of user-space API. It + * is fixed in kernel but we have to keep backward + * compatibility with old images. So mask out + * inappropriate bits (in particular fdinfo might + * have FS_EVENT_ON_CHILD bit set). + */ + mark->iwe->mask &= ~KERNEL_FS_EVENT_ON_CHILD; + + d = find_file_desc_raw(FD_TYPES__INOTIFY, mark->iwe->id); + if (!d) { + pr_err("Can't find inotify with id %#08x\n", mark->iwe->id); + return -1; + } + + return __collect_inotify_mark(container_of(d, struct fsnotify_file_info, d), mark); +} + +struct collect_image_info inotify_mark_cinfo = { + .fd_type = CR_FD_INOTIFY_WD, + .pb_type = PB_INOTIFY_WD, + .priv_size = sizeof(struct fsnotify_mark_info), + .collect = collect_one_inotify_mark, +}; + +static int collect_one_fanotify_mark(void *o, ProtobufCMessage *msg, struct cr_img *i) +{ + struct fsnotify_mark_info *mark = o; + struct file_desc *d; + + if (!deprecated_ok("separate images for fsnotify marks")) + return -1; + + mark->fme = pb_msg(msg, FanotifyMarkEntry); + INIT_LIST_HEAD(&mark->list); + mark->remap = NULL; + + d = find_file_desc_raw(FD_TYPES__FANOTIFY, mark->fme->id); + if (!d) { + pr_err("Can't find fanotify with id %#08x\n", mark->fme->id); + return -1; + } + + return __collect_fanotify_mark(container_of(d, struct fsnotify_file_info, d), mark); +} + +struct collect_image_info fanotify_mark_cinfo 
/*
 * FD_ENTRY() expands to one designated initializer for imgset_template[]:
 * slot CR_FD_<_name> gets the file-name format _fmt with ".img" appended
 * and the magic constant <_name>_MAGIC.
 */
#define FD_ENTRY(_name, _fmt)			\
	[CR_FD_##_name] = {			\
		.fmt	= _fmt ".img",		\
		.magic	= _name##_MAGIC,	\
	}

/* Same as FD_ENTRY(), additionally setting the slot's .oflags to _f. */
#define FD_ENTRY_F(_name, _fmt, _f)		\
	[CR_FD_##_name] = {			\
		.fmt	= _fmt ".img",		\
		.magic	= _name##_MAGIC,	\
		.oflags	= _f,			\
	}
"netlinksk"), + FD_ENTRY_F(SK_QUEUES, "sk-queues", O_NOBUF), /* lseeks the image */ + FD_ENTRY(ITIMERS, "itimers-%u"), + FD_ENTRY(POSIX_TIMERS, "posix-timers-%u"), + FD_ENTRY(CREDS, "creds-%u"), + FD_ENTRY(UTSNS, "utsns-%u"), + FD_ENTRY(IPC_VAR, "ipcns-var-%u"), + FD_ENTRY_F(IPCNS_SHM, "ipcns-shm-%u", O_NOBUF), /* writes segments of data */ + FD_ENTRY(IPCNS_MSG, "ipcns-msg-%u"), + FD_ENTRY(IPCNS_SEM, "ipcns-sem-%u"), + FD_ENTRY(FS, "fs-%u"), + FD_ENTRY(REMAP_FPATH, "remap-fpath"), + FD_ENTRY_F(GHOST_FILE, "ghost-file-%x", O_NOBUF), + FD_ENTRY(TCP_STREAM, "tcp-stream-%x"), + FD_ENTRY(MNTS, "mountpoints-%u"), + FD_ENTRY(NETDEV, "netdev-%u"), + FD_ENTRY(NETNS, "netns-%u"), + FD_ENTRY_F(IFADDR, "ifaddr-%u", O_NOBUF), + FD_ENTRY_F(ROUTE, "route-%u", O_NOBUF), + FD_ENTRY_F(ROUTE6, "route6-%u", O_NOBUF), + FD_ENTRY_F(RULE, "rule-%u", O_NOBUF), + FD_ENTRY_F(IPTABLES, "iptables-%u", O_NOBUF), + FD_ENTRY_F(IP6TABLES, "ip6tables-%u", O_NOBUF), + FD_ENTRY_F(TMPFS_IMG, "tmpfs-%u.tar.gz", O_NOBUF), + FD_ENTRY_F(TMPFS_DEV, "tmpfs-dev-%u.tar.gz", O_NOBUF), + FD_ENTRY_F(AUTOFS, "autofs-%u", O_NOBUF), + FD_ENTRY(BINFMT_MISC_OLD, "binfmt-misc-%u"), + FD_ENTRY(BINFMT_MISC, "binfmt-misc"), + FD_ENTRY(TTY_FILES, "tty"), + FD_ENTRY(TTY_INFO, "tty-info"), + FD_ENTRY_F(TTY_DATA, "tty-data", O_NOBUF), + FD_ENTRY(FILE_LOCKS, "filelocks"), + FD_ENTRY(RLIMIT, "rlimit-%u"), + FD_ENTRY_F(PAGES, "pages-%u", O_NOBUF), + FD_ENTRY_F(PAGES_OLD, "pages-%d", O_NOBUF), + FD_ENTRY_F(SHM_PAGES_OLD, "pages-shmem-%ld", O_NOBUF), + FD_ENTRY(SIGNAL, "signal-s-%u"), + FD_ENTRY(PSIGNAL, "signal-p-%u"), + FD_ENTRY(TUNFILE, "tunfile"), + FD_ENTRY(CGROUP, "cgroup"), + FD_ENTRY(TIMERFD, "timerfd"), + FD_ENTRY(CPUINFO, "cpuinfo"), + FD_ENTRY(SECCOMP, "seccomp"), + FD_ENTRY(USERNS, "userns-%u"), + FD_ENTRY(NETNF_CT, "netns-ct-%u"), + FD_ENTRY(NETNF_EXP, "netns-exp-%u"), + FD_ENTRY(FILES, "files"), + + [CR_FD_STATS] = { + .fmt = "stats-%s", + .magic = STATS_MAGIC, + .oflags = O_SERVICE | O_FORCE_LOCAL, + }, + + 
[CR_FD_IRMAP_CACHE] = { + .fmt = "irmap-cache", + .magic = IRMAP_CACHE_MAGIC, + .oflags = O_SERVICE | O_FORCE_LOCAL, + }, + + [CR_FD_FILE_LOCKS_PID] = { + .fmt = "filelocks-%u.img", + .magic = FILE_LOCKS_MAGIC, + }, +}; diff --git a/CRIU_code/criu/image.c b/CRIU_code/criu/image.c new file mode 100644 index 0000000..78947ab --- /dev/null +++ b/CRIU_code/criu/image.c @@ -0,0 +1,731 @@ +#include +#include +#include +#include +#include +#include +#include "crtools.h" +#include "cr_options.h" +#include "imgset.h" +#include "image.h" +#include "pstree.h" +#include "stats.h" +#include "cgroup.h" +#include "lsm.h" +#include "protobuf.h" +#include "xmalloc.h" +#include "images/inventory.pb-c.h" +#include "images/pagemap.pb-c.h" +#include "proc_parse.h" +#include "img-remote.h" +#include "namespaces.h" + +bool ns_per_id = false; +bool img_common_magic = true; +TaskKobjIdsEntry *root_ids; +u32 root_cg_set; +Lsmtype image_lsm; + +int check_img_inventory(void) +{ + int ret = -1; + struct cr_img *img; + InventoryEntry *he; + + img = open_image(CR_FD_INVENTORY, O_RSTR); + if (!img) + return -1; + + if (pb_read_one(img, &he, PB_INVENTORY) < 0) + goto out_close; + + if (!he->has_fdinfo_per_id || !he->fdinfo_per_id) { + pr_err("Too old image, no longer supported\n"); + goto out_close; + } + + ns_per_id = he->has_ns_per_id ? he->ns_per_id : false; + + if (he->root_ids) { + root_ids = xmalloc(sizeof(*root_ids)); + if (!root_ids) + goto out_err; + + memcpy(root_ids, he->root_ids, sizeof(*root_ids)); + } + + if (he->has_root_cg_set) { + if (he->root_cg_set == 0) { + pr_err("Corrupted root cgset\n"); + goto out_err; + } + + root_cg_set = he->root_cg_set; + } + + if (he->has_lsmtype) + image_lsm = he->lsmtype; + else + image_lsm = LSMTYPE__NO_LSM; + + switch (he->img_version) { + case CRTOOLS_IMAGES_V1: + /* good old images. 
OK */ + img_common_magic = false; + break; + case CRTOOLS_IMAGES_V1_1: + /* newer images with extra magic in the head */ + break; + default: + pr_err("Not supported images version %u\n", he->img_version); + goto out_err; + } + + ret = 0; + +out_err: + inventory_entry__free_unpacked(he, NULL); +out_close: + close_image(img); + return ret; +} + +int write_img_inventory(InventoryEntry *he) +{ + struct cr_img *img; + int ret; + + pr_info("Writing image inventory (version %u)\n", CRTOOLS_IMAGES_V1); + + img = open_image(CR_FD_INVENTORY, O_DUMP); + if (!img) + return -1; + + ret = pb_write_one(img, he, PB_INVENTORY); + + xfree(he->root_ids); + close_image(img); + if (ret < 0) + return -1; + return 0; +} + +int inventory_save_uptime(InventoryEntry *he) +{ + if (!opts.track_mem) + return 0; + + /* + * dump_uptime is used to detect whether a process was handled + * before or it is a new process with the same pid. + */ + if (parse_uptime(&he->dump_uptime)) + return -1; + + he->has_dump_uptime = true; + return 0; +} + +InventoryEntry *get_parent_inventory(void) +{ + struct cr_img *img; + InventoryEntry *ie; + int dir; + + dir = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY); + if (dir == -1) { + pr_warn("Failed to open parent directory\n"); + return NULL; + } + + img = open_image_at(dir, CR_FD_INVENTORY, O_RSTR); + if (!img) { + pr_warn("Failed to open parent pre-dump inventory image\n"); + close(dir); + return NULL; + } + + if (pb_read_one(img, &ie, PB_INVENTORY) < 0) { + pr_warn("Failed to read parent pre-dump inventory entry\n"); + close_image(img); + close(dir); + return NULL; + } + + if (!ie->has_dump_uptime) { + pr_warn("Parent pre-dump inventory has no uptime\n"); + inventory_entry__free_unpacked(ie, NULL); + ie = NULL; + } + + close_image(img); + close(dir); + return ie; +} + +int prepare_inventory(InventoryEntry *he) +{ + struct pid pid; + struct { + struct pstree_item i; + struct dmp_info d; + } crt = { .i.pid = &pid }; + + pr_info("Perparing image 
inventory (version %u)\n", CRTOOLS_IMAGES_V1); + + he->img_version = CRTOOLS_IMAGES_V1_1; + he->fdinfo_per_id = true; + he->has_fdinfo_per_id = true; + he->ns_per_id = true; + he->has_ns_per_id = true; + he->has_lsmtype = true; + he->lsmtype = host_lsm_type(); + + crt.i.pid->state = TASK_ALIVE; + crt.i.pid->real = getpid(); + if (get_task_ids(&crt.i)) + return -1; + + he->has_root_cg_set = true; + if (dump_task_cgroup(NULL, &he->root_cg_set, NULL)) + return -1; + + he->root_ids = crt.i.ids; + + return 0; +} + +static struct cr_imgset *alloc_cr_imgset(int nr) +{ + struct cr_imgset *cr_imgset; + unsigned int i; + + cr_imgset = xmalloc(sizeof(*cr_imgset)); + if (cr_imgset == NULL) + return NULL; + + cr_imgset->_imgs = xmalloc(nr * sizeof(struct cr_img *)); + if (cr_imgset->_imgs == NULL) { + xfree(cr_imgset); + return NULL; + } + + for (i = 0; i < nr; i++) + cr_imgset->_imgs[i] = NULL; + cr_imgset->fd_nr = nr; + return cr_imgset; +} + +static void __close_cr_imgset(struct cr_imgset *cr_imgset) +{ + unsigned int i; + + if (!cr_imgset) + return; + + for (i = 0; i < cr_imgset->fd_nr; i++) { + if (!cr_imgset->_imgs[i]) + continue; + close_image(cr_imgset->_imgs[i]); + cr_imgset->_imgs[i] = NULL; + } +} + +void close_cr_imgset(struct cr_imgset **cr_imgset) +{ + if (!cr_imgset || !*cr_imgset) + return; + + __close_cr_imgset(*cr_imgset); + + xfree((*cr_imgset)->_imgs); + xfree(*cr_imgset); + *cr_imgset = NULL; +} + +struct cr_imgset *cr_imgset_open_range(int pid, int from, int to, + unsigned long flags) +{ + struct cr_imgset *imgset; + unsigned int i; + + imgset = alloc_cr_imgset(to - from); + if (!imgset) + goto err; + + from++; + imgset->fd_off = from; + for (i = from; i < to; i++) { + struct cr_img *img; + + img = open_image(i, flags, pid); + if (!img) { + if (!(flags & O_CREAT)) + /* caller should check himself */ + continue; + goto err; + } + + imgset->_imgs[i - from] = img; + } + + return imgset; + +err: + close_cr_imgset(&imgset); + return NULL; +} + +struct cr_imgset 
*cr_task_imgset_open(int pid, int mode) +{ + return cr_imgset_open(pid, TASK, mode); +} + +struct cr_imgset *cr_glob_imgset_open(int mode) +{ + return cr_imgset_open(-1 /* ignored */, GLOB, mode); +} + +static int do_open_image(struct cr_img *img, int dfd, int type, unsigned long flags, char *path); + +struct cr_img *open_image_at(int dfd, int type, unsigned long flags, ...) +{ + struct cr_img *img; + unsigned long oflags; + char path[PATH_MAX]; + va_list args; + bool lazy = false; + + if (dfd == -1) { + dfd = get_service_fd(IMG_FD_OFF); + lazy = (flags & O_CREAT); + } + + img = xmalloc(sizeof(*img)); + if (!img) + return NULL; + + oflags = flags | imgset_template[type].oflags; + + va_start(args, flags); + vsnprintf(path, PATH_MAX, imgset_template[type].fmt, args); + va_end(args); + + if (lazy) { + img->fd = LAZY_IMG_FD; + img->type = type; + img->oflags = oflags; + img->path = xstrdup(path); + return img; + } else + img->fd = EMPTY_IMG_FD; + + if (do_open_image(img, dfd, type, oflags, path)) { + close_image(img); + return NULL; + } + + return img; +} + +static inline u32 head_magic(int oflags) +{ + return oflags & O_SERVICE ? 
IMG_SERVICE_MAGIC : IMG_COMMON_MAGIC; +} + +static int img_check_magic(struct cr_img *img, int oflags, int type, char *path) +{ + u32 magic; + + if (read_img(img, &magic) < 0) + return -1; + + if (img_common_magic && (type != CR_FD_INVENTORY)) { + if (magic != head_magic(oflags)) { + pr_err("Head magic doesn't match for %s\n", path); + return -1; + } + + if (read_img(img, &magic) < 0) + return -1; + } + + if (magic != imgset_template[type].magic) { + pr_err("Magic doesn't match for %s\n", path); + return -1; + } + + return 0; +} + +static int img_write_magic(struct cr_img *img, int oflags, int type) +{ + if (img_common_magic && (type != CR_FD_INVENTORY)) { + u32 cmagic; + + cmagic = head_magic(oflags); + if (write_img(img, &cmagic)) + return -1; + } + + return write_img(img, &imgset_template[type].magic); +} + +int do_open_remote_image(int dfd, char *path, int flags) +{ + char *snapshot_id = NULL; + int ret, save; + + /* When using namespaces, the current dir is changed so we need to + * change to previous working dir and back to correctly open the image + * proxy and cache sockets. 
*/ + save = open(".", O_RDONLY); + if (save < 0) { + pr_perror("unable to open current working directory"); + return -1; + } + + if (fchdir(get_service_fd(IMG_FD_OFF)) < 0) { + pr_perror("fchdir to dfd failed!\n"); + close(save); + return -1; + } + + snapshot_id = get_snapshot_id_from_idx(dfd); + + if (snapshot_id == NULL) + ret = -1; + else if (flags == O_RDONLY) { + pr_debug("do_open_remote_image RDONLY path=%s snapshot_id=%s\n", + path, snapshot_id); + ret = read_remote_image_connection(snapshot_id, path); + } else { + pr_debug("do_open_remote_image WRONLY path=%s snapshot_id=%s\n", + path, snapshot_id); + ret = write_remote_image_connection(snapshot_id, path, O_WRONLY); + } + + if (fchdir(save) < 0) { + pr_perror("fchdir to save failed"); + close(save); + return -1; + } + close(save); + + return ret; +} + +struct openat_args { + char path[PATH_MAX]; + int flags; + int err; + int mode; +}; + +static int userns_openat(void *arg, int dfd, int pid) +{ + struct openat_args *pa = (struct openat_args *)arg; + int ret; + + ret = openat(dfd, pa->path, pa->flags, pa->mode); + if (ret < 0) + pa->err = errno; + + return ret; +} + +static int do_open_image(struct cr_img *img, int dfd, int type, unsigned long oflags, char *path) +{ + int ret, flags; + + flags = oflags & ~(O_NOBUF | O_SERVICE | O_FORCE_LOCAL); + + if (opts.remote && !(oflags & O_FORCE_LOCAL)) + ret = do_open_remote_image(dfd, path, flags); + else { + /* + * For pages images dedup we need to open images read-write on + * restore, that may require proper capabilities, so we ask + * usernsd to do it for us + */ + if (root_ns_mask & CLONE_NEWUSER && + type == CR_FD_PAGES && oflags & O_RDWR) { + struct openat_args pa = { + .flags = flags, + .err = 0, + .mode = CR_FD_PERM, + }; + snprintf(pa.path, PATH_MAX, "%s", path); + ret = userns_call(userns_openat, UNS_FDOUT, &pa, sizeof(struct openat_args), dfd); + if (ret < 0) + errno = pa.err; + } else + ret = openat(dfd, path, flags, CR_FD_PERM); + } + if (ret < 0) { + if 
(!(flags & O_CREAT) && (errno == ENOENT || ret == -ENOENT)) { + pr_info("No %s image\n", path); + img->_x.fd = EMPTY_IMG_FD; + goto skip_magic; + } + + pr_perror("Unable to open %s", path); + goto err; + } + + img->_x.fd = ret; + if (oflags & O_NOBUF) + bfd_setraw(&img->_x); + else { + if (flags == O_RDONLY) + ret = bfdopenr(&img->_x); + else + ret = bfdopenw(&img->_x); + + if (ret) + goto err; + } + + if (imgset_template[type].magic == RAW_IMAGE_MAGIC) + goto skip_magic; + + if (flags == O_RDONLY) + ret = img_check_magic(img, oflags, type, path); + else + ret = img_write_magic(img, oflags, type); + if (ret) + goto err; + +skip_magic: + return 0; + +err: + return -1; +} + +int open_image_lazy(struct cr_img *img) +{ + int dfd; + char *path = img->path; + + img->path = NULL; + + dfd = get_service_fd(IMG_FD_OFF); + if (do_open_image(img, dfd, img->type, img->oflags, path)) { + xfree(path); + return -1; + } + + xfree(path); + return 0; +} + +void close_image(struct cr_img *img) +{ + if (lazy_image(img)) { + /* + * Remove the image file if it's there so that + * subsequent restore doesn't read wrong or fake + * data from it. 
+ */ + unlinkat(get_service_fd(IMG_FD_OFF), img->path, 0); + xfree(img->path); + } else if (!empty_image(img)) + bclose(&img->_x); + + xfree(img); +} + +struct cr_img *img_from_fd(int fd) +{ + struct cr_img *img; + + img = xmalloc(sizeof(*img)); + if (img) { + img->_x.fd = fd; + bfd_setraw(&img->_x); + } + + return img; +} + +int open_image_dir(char *dir) +{ + int fd, ret; + + fd = open(dir, O_RDONLY); + if (fd < 0) { + pr_perror("Can't open dir %s", dir); + return -1; + } + + ret = install_service_fd(IMG_FD_OFF, fd); + if (ret < 0) + return -1; + fd = ret; + + if (opts.remote) { + init_snapshot_id(dir); + } else if (opts.img_parent) { + ret = symlinkat(opts.img_parent, fd, CR_PARENT_LINK); + if (ret < 0 && errno != EEXIST) { + pr_perror("Can't link parent snapshot"); + goto err; + } + + if (opts.img_parent[0] == '/') + pr_warn("Absolute paths for parent links " + "may not work on restore!\n"); + } + + return 0; + +err: + close_image_dir(); + return -1; +} + +void close_image_dir(void) +{ + close_service_fd(IMG_FD_OFF); +} + +static unsigned long page_ids = 1; + +void up_page_ids_base(void) +{ + /* + * When page server and criu dump work on + * the same dir, the shmem pagemaps and regular + * pagemaps may have IDs conflicts. Fix this by + * making page server produce page images with + * higher IDs. 
+ */ + + BUG_ON(page_ids != 1); + page_ids += 0x10000; +} + +struct cr_img *open_pages_image_at(int dfd, unsigned long flags, struct cr_img *pmi, u32 *id) +{ + if (flags == O_RDONLY || flags == O_RDWR) { + PagemapHead *h; + if (pb_read_one(pmi, &h, PB_PAGEMAP_HEAD) < 0) + return NULL; + *id = h->pages_id; + pagemap_head__free_unpacked(h, NULL); + } else { + PagemapHead h = PAGEMAP_HEAD__INIT; + *id = h.pages_id = page_ids++; + if (pb_write_one(pmi, &h, PB_PAGEMAP_HEAD) < 0) + return NULL; + } + + return open_image_at(dfd, CR_FD_PAGES, flags, *id); +} + +struct cr_img *open_pages_image(unsigned long flags, struct cr_img *pmi, u32 *id) +{ + return open_pages_image_at(get_service_fd(IMG_FD_OFF), flags, pmi, id); +} + +/* + * Write buffer @ptr of @size bytes into @fd file + * Returns + * 0 on success + * -1 on error (error message is printed) + */ +int write_img_buf(struct cr_img *img, const void *ptr, int size) +{ + int ret; + + ret = bwrite(&img->_x, ptr, size); + if (ret == size) + return 0; + + if (ret < 0) + pr_perror("Can't write img file"); + else + pr_err("Img trimmed %d/%d\n", ret, size); + return -1; +} + +/* + * Read buffer @ptr of @size bytes from @fd file + * Returns + * 1 on success + * 0 on EOF (silently) + * -1 on error (error message is printed) + */ +int read_img_buf_eof(struct cr_img *img, void *ptr, int size) +{ + int ret; + + ret = bread(&img->_x, ptr, size); + if (ret == size) + return 1; + if (ret == 0) + return 0; + + if (ret < 0) + pr_perror("Can't read img file"); + else + pr_err("Img trimmed %d/%d\n", ret, size); + return -1; +} + +/* + * Read buffer @ptr of @size bytes from @fd file + * Returns + * 1 on success + * -1 on error or EOF (error message is printed) + */ +int read_img_buf(struct cr_img *img, void *ptr, int size) +{ + int ret; + + ret = read_img_buf_eof(img, ptr, size); + if (ret == 0) { + pr_err("Unexpected EOF\n"); + ret = -1; + } + + return ret; +} + +/* + * read_img_str -- same as read_img_buf, but allocates memory for + * the 
/*
 * read_img_str -- same as read_img_buf, but allocates memory for the
 * buffer and puts the '\0' at the end.
 *
 * @img:  image to read from
 * @pstr: receives the newly allocated, NUL-terminated string; the caller
 *        owns it and must release it with xfree()
 * @size: number of bytes to read, must not be negative
 *
 * Returns 0 on success, -1 on invalid size, allocation failure or read
 * error (in which case *pstr is left untouched).
 */
int read_img_str(struct cr_img *img, char **pstr, int size)
{
	int ret;
	char *str;

	/*
	 * A negative size would under-allocate the buffer and make the
	 * terminating str[size] store land outside of it.
	 */
	if (size < 0) {
		pr_err("Invalid string size %d\n", size);
		return -1;
	}

	str = xmalloc((size_t)size + 1);
	if (!str)
		return -1;

	ret = read_img_buf(img, str, size);
	if (ret < 0) {
		xfree(str);
		return -1;
	}

	str[size] = '\0';
	*pstr = str;
	return 0;
}
+ tmp = accept(remote_sk, NULL, 0); + if (tmp < 0) { + pr_perror("Unable to accept remote image connection" + " from image proxy"); + close(remote_sk); + return -1; + } + remote_sk = tmp; + } + + pr_info("Cache is connected to Proxy through fd %d\n", remote_sk); + + local_sk = setup_UNIX_server_socket(local_cache_path); + if (local_sk < 0) { + pr_perror("Unable to open cache to proxy UNIX socket"); + close(remote_sk); + return -1; + + } + + if (background) { + if (daemon(1, 0) == -1) { + pr_perror("Can't run service server in the background"); + return -1; + } + } + + accept_image_connections(); + pr_info("Finished image cache."); + return 0; +} diff --git a/CRIU_code/criu/img-proxy.c b/CRIU_code/criu/img-proxy.c new file mode 100644 index 0000000..f15bd7c --- /dev/null +++ b/CRIU_code/criu/img-proxy.c @@ -0,0 +1,45 @@ +#include + +#include "cr_options.h" +#include "criu-log.h" +#include "img-remote.h" +#include "util.h" + +int image_proxy(bool background, char *local_proxy_path) +{ + pr_info("CRIU to Proxy Path: %s, Cache Address %s:%u\n", + local_proxy_path, opts.addr, opts.port); + restoring = false; + + local_sk = setup_UNIX_server_socket(local_proxy_path); + if (local_sk < 0) { + pr_perror("Unable to open CRIU to proxy UNIX socket"); + return -1; + } + + if (opts.ps_socket != -1) { + remote_sk = opts.ps_socket; + pr_info("Re-using ps socket %d\n", remote_sk); + } else { + remote_sk = setup_tcp_client(); + if (remote_sk < 0) { + pr_perror("Unable to open proxy to cache TCP socket"); + close(local_sk); + return -1; + } + } + + pr_info("Proxy is connected to Cache through fd %d\n", remote_sk); + + if (background) { + if (daemon(1, 0) == -1) { + pr_perror("Can't run service server in the background"); + return -1; + } + } + + // TODO - local_sk and remote_sk send as args. 
+ accept_image_connections(); + pr_info("Finished image proxy."); + return 0; +} diff --git a/CRIU_code/criu/img-remote.c b/CRIU_code/criu/img-remote.c new file mode 100644 index 0000000..433c012 --- /dev/null +++ b/CRIU_code/criu/img-remote.c @@ -0,0 +1,1159 @@ +#include +#include +#include +#include +#include + +#include "cr_options.h" +#include "img-remote.h" +#include "image.h" +#include "images/remote-image.pb-c.h" +#include "protobuf.h" +#include "servicefd.h" +#include "xmalloc.h" + +#define EPOLL_MAX_EVENTS 50 + +#define strflags(f) ((f) == O_RDONLY ? "read" : \ + (f) == O_APPEND ? "append" : "write") + +// List of images already in memory. +static LIST_HEAD(rimg_head); + +// List of local operations currently in-progress. +static LIST_HEAD(rop_inprogress); + +// List of local operations pending (reads on the restore side for images that +// still haven't arrived). +static LIST_HEAD(rop_pending); + +// List of images waiting to be forwarded. The head of the list is currently +// being forwarded. +static LIST_HEAD(rop_forwarding); + +// List of snapshots (useful when doing incremental restores/dumps) +static LIST_HEAD(snapshot_head); + +// Snapshot id (setup at launch time by dump or restore). +static char *snapshot_id; + +// True if restoring (cache := true; proxy := false). +bool restoring = true; + +// True if the proxy to cache socket is being used (receiving or sending). +static bool forwarding = false; + +// True if the local dump or restore is finished. +static bool finished_local = false; + +// True if the communication between the proxy and cache can be closed. +static bool finished_remote = false; + +// Proxy to cache socket fd; Local dump or restore servicing fd. +int remote_sk; +int local_sk; + +// Epoll fd and event array. +static int epoll_fd; +static struct epoll_event *events; + +static int64_t recv_image_async(struct roperation *op); +static int64_t send_image_async(struct roperation *op); + +/* A snapshot is a dump or pre-dump operation. 
// Applies epoll_ctl operation @op for @fd on @epoll_fd, requesting @events
// and attaching @data as the event's user pointer. Returns the epoll_ctl()
// result; a message is printed when it is non-zero.
static int event_set(int epoll_fd, int op, int fd, uint32_t events, void *data)
{
	struct epoll_event ev = {
		.events = events,
		.data.ptr = data,
	};
	int rc = epoll_ctl(epoll_fd, op, fd, &ev);

	if (rc != 0)
		pr_perror("[fd=%d] Unable to set event", fd);

	return rc;
}
// Creates a stream UNIX socket and connects it to the socket at @path.
// Returns the connected descriptor, or -1 on failure (a message is printed
// and the socket is closed).
static int setup_UNIX_client_socket(char *path)
{
	struct sockaddr_un addr;
	int sk;

	sk = socket(AF_UNIX, SOCK_STREAM, 0);
	if (sk < 0) {
		pr_perror("Unable to open local image socket");
		return -1;
	}

	memset(&addr, 0, sizeof(addr));
	addr.sun_family = AF_UNIX;
	strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1);

	if (connect(sk, (struct sockaddr *)&addr, sizeof(addr)) >= 0)
		return sk;

	pr_perror("Unable to connect to local socket: %s", path);
	close(sk);
	return -1;
}
PB_LOCAL_IMAGE);
+
+	if (ret > 0) {
+		strncpy(snapshot_id, li->snapshot_id, PATH_MAX - 1);
+		snapshot_id[PATH_MAX - 1] = 0;
+		strncpy(path, li->name, PATH_MAX - 1);
+		path[PATH_MAX - 1] = 0;
+		*flags = li->open_mode;
+	}
+	free(li);
+	return ret;
+}
+
+/*
+ * Reads one LocalImageReplyEntry from fd and stores its error code in
+ * *error. Returns the pb_read_obj() result; *error is written only when
+ * that result is positive.
+ */
+static inline int64_t read_reply_header(int fd, int *error)
+{
+	/*
+	 * Start from NULL so the unconditional free() below is a no-op if
+	 * pb_read_obj() fails before storing a message pointer (its failure
+	 * contract is not visible from this file).
+	 */
+	LocalImageReplyEntry *lir = NULL;
+	int ret = pb_read_obj(fd, (void **)&lir, PB_LOCAL_IMAGE_REPLY);
+
+	if (ret > 0)
+		*error = lir->error;
+	free(lir);
+	return ret;
+}
+
+/*
+ * Reads one RemoteImageEntry from fd.
+ *
+ * On a positive pb_read_obj() result, copies the snapshot id and image name
+ * into the caller's buffers (each must hold PATH_MAX bytes) and stores the
+ * open mode and size. Returns the pb_read_obj() result unchanged.
+ */
+static inline int64_t read_remote_header(int fd, char *snapshot_id, char *path,
+	int *flags, uint64_t *size)
+{
+	/* See read_reply_header(): NULL keeps the final free() safe. */
+	RemoteImageEntry *ri = NULL;
+	int ret = pb_read_obj(fd, (void **)&ri, PB_REMOTE_IMAGE);
+
+	if (ret > 0) {
+		/*
+		 * strncpy() does not terminate the destination when the source
+		 * is PATH_MAX - 1 bytes or longer, and callers pass
+		 * uninitialised stack arrays, so terminate explicitly exactly
+		 * as read_header() does.
+		 */
+		strncpy(snapshot_id, ri->snapshot_id, PATH_MAX - 1);
+		snapshot_id[PATH_MAX - 1] = 0;
+		strncpy(path, ri->name, PATH_MAX - 1);
+		path[PATH_MAX - 1] = 0;
+		*flags = ri->open_mode;
+		*size = ri->size;
+	}
+	free(ri);
+	return ret;
+}
+
+/*
+ * Allocates a remote image named path:snapshot_id together with its first
+ * buffer. The buffer is linked into buf_head and becomes the current
+ * forwarding buffer. Returns NULL if either allocation fails.
+ */
+static struct rimage *new_remote_image(char *path, char *snapshot_id)
+{
+	struct rimage *rimg = xzalloc(sizeof(struct rimage));
+	struct rbuf *buf = xzalloc(sizeof(struct rbuf));
+
+	if (rimg == NULL || buf == NULL)
+		goto err;
+
+	strncpy(rimg->path, path, PATH_MAX -1 );
+	strncpy(rimg->snapshot_id, snapshot_id, PATH_MAX - 1);
+	rimg->path[PATH_MAX - 1] = '\0';
+	rimg->snapshot_id[PATH_MAX - 1] = '\0';
+	INIT_LIST_HEAD(&(rimg->buf_head));
+	list_add_tail(&(buf->l), &(rimg->buf_head));
+	rimg->curr_fwd_buf = buf;
+
+	return rimg;
+err:
+	xfree(rimg);
+	xfree(buf);
+	return NULL;
+}
+
+/*
+ * Allocates a remote operation for path:snapshot_id bound to cli_fd.
+ * flags is the open mode of the request; close_fd is stored on the
+ * operation for the send/receive code to consult. Returns NULL on
+ * allocation failure.
+ */
+static struct roperation *new_remote_operation(char *path,
+	char *snapshot_id, int cli_fd, int flags, bool close_fd)
+{
+	struct roperation *rop = xzalloc(sizeof(struct roperation));
+
+	if (rop == NULL)
+		return NULL;
+
+	strncpy(rop->path, path, PATH_MAX -1 );
+	strncpy(rop->snapshot_id, snapshot_id, PATH_MAX - 1);
+	rop->path[PATH_MAX - 1] = '\0';
+	rop->snapshot_id[PATH_MAX - 1] = '\0';
+	rop->fd = cli_fd;
+	rop->flags = flags;
+	rop->close_fd = close_fd;
+
+	return rop;
+}
+
+static inline void rop_set_rimg(struct
roperation *rop, struct rimage *rimg) +{ + rop->rimg = rimg; + rop->size = rimg->size; + if (rop->flags == O_APPEND) { + // Image forward on append must start where the last fwd finished. + if (rop->fd == remote_sk) { + rop->curr_sent_buf = rimg->curr_fwd_buf; + rop->curr_sent_bytes = rimg->curr_fwd_bytes; + } else { + // For local appends, just write at the end. + rop->curr_sent_buf = list_entry(rimg->buf_head.prev, struct rbuf, l); + rop->curr_sent_bytes = rop->curr_sent_buf->nbytes; + } + // On the receiver size, we just append + rop->curr_recv_buf = list_entry(rimg->buf_head.prev, struct rbuf, l); + } else { + // Writes or reads are simple. Just do it from the beginning. + rop->curr_recv_buf = list_entry(rimg->buf_head.next, struct rbuf, l); + rop->curr_sent_buf = list_entry(rimg->buf_head.next, struct rbuf, l); + rop->curr_sent_bytes = 0; + } +} + +/* Clears a remote image struct for reusing it. */ +static inline struct rimage *clear_remote_image(struct rimage *rimg) +{ + while (!list_is_singular(&(rimg->buf_head))) { + struct rbuf *buf = list_entry(rimg->buf_head.prev, struct rbuf, l); + + list_del(rimg->buf_head.prev); + xfree(buf); + } + + list_entry(rimg->buf_head.next, struct rbuf, l)->nbytes = 0; + rimg->size = 0; + + return rimg; +} + +static struct roperation *handle_accept_write(int cli_fd, char *snapshot_id, + char *path, int flags, bool close_fd, uint64_t size) +{ + struct roperation *rop = NULL; + struct rimage *rimg = get_rimg_by_name(snapshot_id, path); + + if (rimg == NULL) { + rimg = new_remote_image(path, snapshot_id); + if (rimg == NULL) { + pr_perror("Error preparing remote image"); + goto err; + } + } else { + list_del(&(rimg->l)); + if (flags == O_APPEND) + clear_remote_image(rimg); + } + + rop = new_remote_operation(path, snapshot_id, cli_fd, flags, close_fd); + if (rop == NULL) { + pr_perror("Error preparing remote operation"); + goto err; + } + + rop_set_rimg(rop, rimg); + rop->size = size; + return rop; +err: + xfree(rimg); + 
xfree(rop); + return NULL; +} + +static inline struct roperation *handle_accept_proxy_write(int cli_fd, + char *snapshot_id, char *path, int flags) +{ + return handle_accept_write(cli_fd, snapshot_id, path, flags, true, 0); +} + +static struct roperation *handle_accept_proxy_read(int cli_fd, + char *snapshot_id, char *path, int flags) +{ + struct roperation *rop = NULL; + struct rimage *rimg = NULL; + + rimg = get_rimg_by_name(snapshot_id, path); + + // Check if we already have the image. + if (rimg == NULL) { + pr_info("No image %s:%s.\n", path, snapshot_id); + if (write_reply_header(cli_fd, ENOENT) < 0) { + pr_perror("Error writing reply header for unexisting image"); + goto err; + } + close(cli_fd); + return NULL; + } + + if (write_reply_header(cli_fd, 0) < 0) { + pr_perror("Error writing reply header for %s:%s", + path, snapshot_id); + goto err; + } + + rop = new_remote_operation(path, snapshot_id, cli_fd, flags, true); + if (rop == NULL) { + pr_perror("Error preparing remote operation"); + goto err; + } + + rop_set_rimg(rop, rimg); + return rop; +err: + close(cli_fd); + return NULL; +} + +static inline void finish_local() +{ + int ret; + finished_local = true; + ret = event_set(epoll_fd, EPOLL_CTL_DEL, local_sk, 0, 0); + if (ret) { + pr_perror("Failed to del local fd from epoll"); + } +} + +static struct roperation *handle_accept_cache_read(int cli_fd, + char *snapshot_id, char *path, int flags) +{ + struct rimage *rimg = NULL; + struct roperation *rop = NULL; + + rop = new_remote_operation(path, snapshot_id, cli_fd, flags, true); + if (rop == NULL) { + pr_perror("Error preparing remote operation"); + close(cli_fd); + return NULL; + } + + // Check if we already have the image. 
+ rimg = get_rimg_by_name(snapshot_id, path); + if (rimg != NULL && rimg->size > 0) { + if (write_reply_header(cli_fd, 0) < 0) { + pr_perror("Error writing reply header for %s:%s", + path, snapshot_id); + close(rop->fd); + xfree(rop); + } + rop_set_rimg(rop, rimg); + return rop; + } else if (finished_remote) { + // The file does not exist. + pr_info("No image %s:%s.\n", path, snapshot_id); + if (write_reply_header(cli_fd, ENOENT) < 0) + pr_perror("Error writing reply header for unexisting image"); + close(cli_fd); + xfree(rop); + } + return NULL; +} + +static void forward_remote_image(struct roperation *rop) +{ + int64_t ret = 0; + + // Set blocking during the setup. + fd_set_nonblocking(rop->fd, false); + + ret = write_remote_header( + rop->fd, rop->snapshot_id, rop->path, rop->flags, rop->size); + + if (ret < 0) { + pr_perror("Error writing header for %s:%s", + rop->path, rop->snapshot_id); + return; + } + + pr_info("[fd=%d] Forwarding %s request for %s:%s (%" PRIu64 " bytes\n", + rop->fd, strflags(rop->flags), rop->path, rop->snapshot_id, + rop->size); + + // Go back to non-blocking + fd_set_nonblocking(rop->fd, true); + + forwarding = true; + event_set(epoll_fd, EPOLL_CTL_ADD, rop->fd, EPOLLOUT, rop); +} + +static void handle_remote_accept(int fd) +{ + char path[PATH_MAX]; + char snapshot_id[PATH_MAX]; + int flags = 0; + uint64_t size = 0; + int64_t ret; + struct roperation* rop = NULL; + + // Set blocking during the setup. + fd_set_nonblocking(fd, false); + + ret = read_remote_header(fd, snapshot_id, path, &flags, &size); + if (ret < 0) { + pr_perror("Unable to receive remote header from image proxy"); + goto err; + } + /* This means that the no more images are coming. 
*/ + else if (!ret) { + finished_remote = true; + pr_info("Image Proxy connection closed.\n"); + return; + } + + // Go back to non-blocking + fd_set_nonblocking(fd, true); + + pr_info("[fd=%d] Received %s request for %s:%s with %" PRIu64 " bytes\n", + fd, strflags(flags), path, snapshot_id, size); + + + forwarding = true; + rop = handle_accept_write(fd, snapshot_id, path, flags, false, size); + + if (rop != NULL) { + list_add_tail(&(rop->l), &rop_inprogress); + event_set(epoll_fd, EPOLL_CTL_ADD, rop->fd, EPOLLIN, rop); + } + return; +err: + close(fd); +} + +static void handle_local_accept(int fd) +{ + int cli_fd; + char path[PATH_MAX]; + char snapshot_id[PATH_MAX]; + int flags = 0; + struct sockaddr_in cli_addr; + socklen_t clilen = sizeof(cli_addr); + struct roperation *rop = NULL; + + cli_fd = accept(fd, (struct sockaddr *) &cli_addr, &clilen); + if (cli_fd < 0) { + pr_perror("Unable to accept local image connection"); + return; + } + + if (read_header(cli_fd, snapshot_id, path, &flags) < 0) { + pr_err("Error reading local image header\n"); + goto err; + } + + if (snapshot_id[0] == NULL_SNAPSHOT_ID && path[0] == FINISH) { + close(cli_fd); + finish_local(); + return; + } + + pr_info("[fd=%d] Received %s request for %s:%s\n", + cli_fd, strflags(flags), path, snapshot_id); + + // Write/Append case (only possible in img-proxy). + if (flags != O_RDONLY) { + rop = handle_accept_proxy_write(cli_fd, snapshot_id, path, flags); + } else if (restoring) { + // Read case while restoring (img-cache). + rop = handle_accept_cache_read(cli_fd, snapshot_id, path, flags); + } else { + // Read case while dumping (img-proxy). + rop = handle_accept_proxy_read(cli_fd, snapshot_id, path, flags); + } + + // If we have an operation. Check if we are ready to start or not. + if (rop != NULL) { + if (rop->rimg != NULL) { + list_add_tail(&(rop->l), &rop_inprogress); + event_set( + epoll_fd, + EPOLL_CTL_ADD, + rop->fd, + rop->flags == O_RDONLY ? 
EPOLLOUT : EPOLLIN,
+				rop);
+		} else {
+			list_add_tail(&(rop->l), &rop_pending);
+		}
+		fd_set_nonblocking(rop->fd, false);
+	}
+
+	return;
+err:
+	close(cli_fd);
+}
+
+/*
+ * Proxy side, called when a send finished. Only acts when the operation was
+ * a forward over remote_sk: records how far the image has been forwarded,
+ * clears the forwarding flag and starts the next queued forward, if any.
+ */
+static inline void finish_proxy_read(struct roperation *rop)
+{
+	// If finished forwarding image
+	if (rop->fd == remote_sk) {
+		// Update fwd buffer and byte count on rimg.
+		rop->rimg->curr_fwd_buf = rop->curr_sent_buf;
+		rop->rimg->curr_fwd_bytes = rop->curr_sent_bytes;
+
+		forwarding = false;
+
+		// If there are images waiting to be forwarded, forward the next.
+		if (!list_empty(&rop_forwarding)) {
+			forward_remote_image(list_entry(rop_forwarding.next, struct roperation, l));
+		}
+	}
+}
+
+/*
+ * Proxy side, called when a local image was fully received. Publishes the
+ * image on rimg_head and queues an operation that forwards it over
+ * remote_sk, starting it immediately when nothing else is queued.
+ */
+static inline void finish_proxy_write(struct roperation *rop)
+{
+	// Normal image received, forward it.
+	struct roperation *rop_to_forward = new_remote_operation(
+		rop->path, rop->snapshot_id, remote_sk, rop->flags, false);
+
+	// Add image to list of images.
+	list_add_tail(&(rop->rimg->l), &rimg_head);
+
+	/*
+	 * new_remote_operation() returns NULL on allocation failure; without
+	 * this check rop_set_rimg() below would dereference a NULL pointer.
+	 * The image itself stays published so local readers can still get it.
+	 */
+	if (rop_to_forward == NULL) {
+		pr_perror("Error preparing remote operation");
+		return;
+	}
+
+	rop_set_rimg(rop_to_forward, rop->rimg);
+	if (list_empty(&rop_forwarding)) {
+		forward_remote_image(rop_to_forward);
+	}
+	list_add_tail(&(rop_to_forward->l), &rop_forwarding);
+}
+
+/*
+ * Cache side, called when a forwarded image was fully received. Re-arms
+ * polling on remote_sk, publishes the image on rimg_head and, if a local
+ * read for the same snapshot_id/path is waiting on rop_pending, replies to
+ * it and moves it to rop_inprogress.
+ */
+static void finish_cache_write(struct roperation *rop)
+{
+	struct roperation *prop = get_rop_by_name(
+		&rop_pending, rop->snapshot_id, rop->path);
+
+	forwarding = false;
+	event_set(epoll_fd, EPOLL_CTL_ADD, remote_sk, EPOLLIN, &remote_sk);
+
+	// Add image to list of images.
+	list_add_tail(&(rop->rimg->l), &rimg_head);
+
+	if (prop != NULL) {
+		pr_info("\t[fd=%d] Resuming pending %s for %s:%s\n",
+			prop->fd, strflags(prop->flags),
+			prop->snapshot_id, prop->path);
+
+		// Write header for pending image.
+		if (write_reply_header(prop->fd, 0) < 0) {
+			pr_perror("Error writing reply header for %s:%s",
+					prop->path, prop->snapshot_id);
+			close(prop->fd);
+			/*
+			 * prop is still linked in rop_pending; unlink it before
+			 * freeing so the list never points at freed memory.
+			 */
+			list_del(&(prop->l));
+			xfree(prop);
+			return;
+		}
+
+		rop_set_rimg(prop, rop->rimg);
+		list_del(&(prop->l));
+		list_add_tail(&(prop->l), &rop_inprogress);
+		event_set(epoll_fd, EPOLL_CTL_ADD, prop->fd, EPOLLOUT, prop);
+	}
+}
+
+static void handle_roperation(struct epoll_event *event,
+	struct roperation *rop)
+{
+	int64_t ret = (EPOLLOUT & event->events) ?
+		send_image_async(rop) :
+		recv_image_async(rop);
+
+	if (ret > 0 || ret == EAGAIN || ret == EWOULDBLOCK) {
+		event_set(
+			epoll_fd,
+			EPOLL_CTL_ADD,
+			rop->fd,
+			event->events,
+			rop);
+		return;
+	}
+
+	// Remove rop from list (either in progress or forwarding).
+	list_del(&(rop->l));
+
+	// Operation is finished.
+	if (ret < 0) {
+		pr_perror("Unable to %s %s:%s (returned %" PRId64 ")",
+			event->events & EPOLLOUT ? "send" : "receive",
+			rop->rimg->path, rop->rimg->snapshot_id, ret);
+		goto err;
+	} else {
+		pr_info("[fd=%d] Finished %s %s:%s to CRIU (size %" PRIu64 ")\n",
+			rop->fd,
+			event->events & EPOLLOUT ? "sending" : "receiving",
+			rop->rimg->path, rop->rimg->snapshot_id, rop->rimg->size);
+	}
+
+	// If receive operation is finished
+	if (event->events & EPOLLIN) {
+		// Cached side (finished receiving forwarded image)
+		if (restoring) {
+			finish_cache_write(rop);
+		} else {
+			// Proxy side (finished receiving local image)
+			finish_proxy_write(rop);
+		}
+	} else {
+		// Proxy side (Finished forwarding image or reading it locally).
+		if (!restoring)
+			finish_proxy_read(rop);
+		// Nothing to be done when a read is finished on the cache side.
+ } +err: + xfree(rop); +} + +static void check_pending() +{ + struct roperation *rop = NULL; + struct rimage *rimg = NULL; + + list_for_each_entry(rop, &rop_pending, l) { + rimg = get_rimg_by_name(rop->snapshot_id, rop->path); + if (rimg != NULL) { + rop_set_rimg(rop, rimg); + if (restoring) { + event_set(epoll_fd, EPOLL_CTL_ADD, rop->fd, EPOLLOUT, rop); + } else { + forward_remote_image(rop); + return; + } + } + } +} + +void accept_image_connections() { + int ret; + + epoll_fd = epoll_create(EPOLL_MAX_EVENTS); + if (epoll_fd < 0) { + pr_perror("Unable to open epoll"); + return; + } + + events = calloc(EPOLL_MAX_EVENTS, sizeof(struct epoll_event)); + if (events == NULL) { + pr_perror("Failed to allocated epoll events"); + goto end; + } + + ret = event_set(epoll_fd, EPOLL_CTL_ADD, local_sk, EPOLLIN, &local_sk); + if (ret) { + pr_perror("Failed to add local fd to epoll"); + goto end; + } + + // Only if we are restoring (cache-side) we need to add the remote sock to + // the epoll. + if (restoring) { + ret = event_set(epoll_fd, EPOLL_CTL_ADD, remote_sk, + EPOLLIN, &remote_sk); + if (ret) { + pr_perror("Failed to add proxy to cache fd to epoll"); + goto end; + } + } + + while (1) { + int n_events, i; + + n_events = epoll_wait(epoll_fd, events, EPOLL_MAX_EVENTS, 250); + + /* epoll_wait isn't restarted after interrupted by a signal */ + if (n_events < 0 && errno != EINTR) { + pr_perror("Failed to epoll wait"); + goto end; + } + + for (i = 0; i < n_events; i++) { + // Accept from local dump/restore? 
+ if (events[i].data.ptr == &local_sk) { + if (events[i].events & EPOLLHUP || + events[i].events & EPOLLERR) { + if (!finished_local) + pr_perror("Unable to accept more local image connections"); + goto end; + } + handle_local_accept(local_sk); + } else if (restoring && !forwarding && events[i].data.ptr == &remote_sk) { + event_set(epoll_fd, EPOLL_CTL_DEL, remote_sk, 0, 0); + handle_remote_accept(remote_sk); + } else { + struct roperation *rop = + (struct roperation*)events[i].data.ptr; + event_set(epoll_fd, EPOLL_CTL_DEL, rop->fd, 0, 0); + handle_roperation(&events[i], rop); + } + } + + // Check if there are any pending operations + if (restoring || !forwarding) + check_pending(); + + // Check if we can close the tcp socket (this will unblock the cache + // to answer "no image" to restore). + if (!restoring && + finished_local && + !finished_remote && + list_empty(&rop_forwarding)) { + close(remote_sk); + finished_remote = true; + } + + // If both local and remote sockets are closed, leave. + if (finished_local && finished_remote) { + pr_info("Finished both local and remote, exiting\n"); + goto end; + } + } +end: + close(epoll_fd); + close(local_sk); + free(events); +} + + +/* Note: size is a limit on how much we want to read from the socket. Zero means + * read until the socket is closed. + */ +static int64_t recv_image_async(struct roperation *op) +{ + int fd = op->fd; + struct rimage *rimg = op->rimg; + uint64_t size = op->size; + bool close_fd = op->close_fd; + struct rbuf *curr_buf = op->curr_recv_buf; + int n; + + n = read(fd, + curr_buf->buffer + curr_buf->nbytes, + size ? 
+ min((int) (size - rimg->size), BUF_SIZE - curr_buf->nbytes) : + BUF_SIZE - curr_buf->nbytes); + if (n == 0) { + if (close_fd) + close(fd); + return n; + } else if (n > 0) { + curr_buf->nbytes += n; + rimg->size += n; + if (curr_buf->nbytes == BUF_SIZE) { + struct rbuf *buf = xmalloc(sizeof(struct rbuf)); + if (buf == NULL) { + if (close_fd) + close(fd); + return -1; + } + buf->nbytes = 0; + list_add_tail(&(buf->l), &(rimg->buf_head)); + op->curr_recv_buf = buf; + return n; + } + if (size && rimg->size == size) { + if (close_fd) + close(fd); + return 0; + } + } else if (errno == EAGAIN || errno == EWOULDBLOCK) { + return errno; + } else { + pr_perror("Read for %s:%s socket on fd=%d failed", + rimg->path, rimg->snapshot_id, fd); + if (close_fd) + close(fd); + return -1; + } + return n; +} + +static int64_t send_image_async(struct roperation *op) +{ + int fd = op->fd; + struct rimage *rimg = op->rimg; + bool close_fd = op->close_fd; + int n; + + n = write( + fd, + op->curr_sent_buf->buffer + op->curr_sent_bytes, + min(BUF_SIZE, op->curr_sent_buf->nbytes) - op->curr_sent_bytes); + + if (n > -1) { + op->curr_sent_bytes += n; + if (op->curr_sent_bytes == BUF_SIZE) { + op->curr_sent_buf = + list_entry(op->curr_sent_buf->l.next, struct rbuf, l); + op->curr_sent_bytes = 0; + return n; + } else if (op->curr_sent_bytes == op->curr_sent_buf->nbytes) { + if (close_fd) + close(fd); + return 0; + } + return n; + } else if (errno == EPIPE || errno == ECONNRESET) { + pr_warn("Connection for %s:%s was closed early than expected\n", + rimg->path, rimg->snapshot_id); + return 0; + } else if (errno == EAGAIN || errno == EWOULDBLOCK) { + return errno; + } else { + pr_perror("Write on %s:%s socket failed", + rimg->path, rimg->snapshot_id); + return -1; + } +} + +int read_remote_image_connection(char *snapshot_id, char *path) +{ + int error = 0; + int sockfd = setup_UNIX_client_socket(restoring ? 
DEFAULT_CACHE_SOCKET: DEFAULT_PROXY_SOCKET); + + if (sockfd < 0) { + pr_err("Error opening local connection for %s:%s\n", + path, snapshot_id); + return -1; + } + + if (write_header(sockfd, snapshot_id, path, O_RDONLY) < 0) { + pr_err("Error writing header for %s:%s\n", path, snapshot_id); + return -1; + } + + if (read_reply_header(sockfd, &error) < 0) { + pr_err("Error reading reply header for %s:%s\n", + path, snapshot_id); + return -1; + } + + if (!error || (snapshot_id[0] == NULL_SNAPSHOT_ID && path[0] != FINISH)) + return sockfd; + + if (error == ENOENT) { + pr_info("Image does not exist (%s:%s)\n", path, snapshot_id); + close(sockfd); + return -ENOENT; + } + pr_err("Unexpected error returned: %d (%s:%s)\n", + error, path, snapshot_id); + close(sockfd); + return -1; +} + +int write_remote_image_connection(char *snapshot_id, char *path, int flags) +{ + int sockfd = setup_UNIX_client_socket(DEFAULT_PROXY_SOCKET); + + if (sockfd < 0) + return -1; + + if (write_header(sockfd, snapshot_id, path, flags) < 0) { + pr_err("Error writing header for %s:%s\n", path, snapshot_id); + return -1; + } + return sockfd; +} + +int finish_remote_dump(void) +{ + pr_info("Dump side is calling finish\n"); + int fd = write_remote_image_connection(NULL_SNAPSHOT_ID, FINISH, O_WRONLY); + + if (fd == -1) { + pr_err("Unable to open finish dump connection"); + return -1; + } + + close(fd); + return 0; +} + +int finish_remote_restore(void) +{ + pr_info("Restore side is calling finish\n"); + int fd = read_remote_image_connection(NULL_SNAPSHOT_ID, FINISH); + + if (fd == -1) { + pr_err("Unable to open finish restore connection\n"); + return -1; + } + + close(fd); + return 0; +} + +int skip_remote_bytes(int fd, unsigned long len) +{ + static char buf[4096]; + int n = 0; + unsigned long curr = 0; + + for (; curr < len; ) { + n = read(fd, buf, min(len - curr, (unsigned long)4096)); + if (n == 0) { + pr_perror("Unexpected end of stream (skipping %lx/%lx bytes)", + curr, len); + return -1; + } else 
if (n > 0) { + curr += n; + } else { + pr_perror("Error while skipping bytes from stream (%lx/%lx)", + curr, len); + return -1; + } + } + + if (curr != len) { + pr_err("Unable to skip the current number of bytes: %lx instead of %lx\n", + curr, len); + return -1; + } + return 0; +} + +static int pull_snapshot_ids(void) +{ + int n, sockfd; + SnapshotIdEntry *ls; + struct snapshot *s = NULL; + + sockfd = read_remote_image_connection(NULL_SNAPSHOT_ID, PARENT_IMG); + + /* The connection was successful but there is not file. */ + if (sockfd < 0) { + if (errno != ENOENT) { + pr_err("Unable to open snapshot id read connection\n"); + return -1; + } + return 0; + } + + while (1) { + n = pb_read_obj(sockfd, (void **)&ls, PB_SNAPSHOT_ID); + if (!n) { + close(sockfd); + return n; + } else if (n < 0) { + pr_err("Unable to read remote snapshot ids\n"); + close(sockfd); + return n; + } + + s = new_snapshot(ls->snapshot_id); + if (!s) { + close(sockfd); + return -1; + } + add_snapshot(s); + pr_info("[read_snapshot ids] parent = %s\n", ls->snapshot_id); + } + free(ls); + close(sockfd); + return n; +} + +int push_snapshot_id(void) +{ + int n; + restoring = false; + SnapshotIdEntry rn = SNAPSHOT_ID_ENTRY__INIT; + int sockfd = write_remote_image_connection(NULL_SNAPSHOT_ID, PARENT_IMG, O_APPEND); + + if (sockfd < 0) { + pr_err("Unable to open snapshot id push connection\n"); + return -1; + } + + rn.snapshot_id = xmalloc(sizeof(char) * PATH_MAX); + if (!rn.snapshot_id) { + close(sockfd); + return -1; + } + strncpy(rn.snapshot_id, snapshot_id, PATH_MAX); + + n = pb_write_obj(sockfd, &rn, PB_SNAPSHOT_ID); + + xfree(rn.snapshot_id); + close(sockfd); + return n; +} + +void init_snapshot_id(char *si) +{ + snapshot_id = si; +} + +char *get_curr_snapshot_id(void) +{ + return snapshot_id; +} + +int get_curr_snapshot_id_idx(void) +{ + struct snapshot *si; + int idx = 0; + + if (list_empty(&snapshot_head)) + pull_snapshot_ids(); + + list_for_each_entry(si, &snapshot_head, l) { + if 
(!strncmp(si->snapshot_id, snapshot_id, PATH_MAX)) + return idx; + idx++; + } + + pr_err("Error, could not find current snapshot id (%s) fd\n", + snapshot_id); + return -1; +} + +char *get_snapshot_id_from_idx(int idx) +{ + struct snapshot *si; + + if (list_empty(&snapshot_head)) + pull_snapshot_ids(); + + /* Note: if idx is the service fd then we need the current + * snapshot_id idx. Else we need a parent snapshot_id idx. + */ + if (idx == get_service_fd(IMG_FD_OFF)) + idx = get_curr_snapshot_id_idx(); + + list_for_each_entry(si, &snapshot_head, l) { + if (!idx) + return si->snapshot_id; + idx--; + } + + pr_err("Error, could not find snapshot id for idx %d\n", idx); + return NULL; +} + +int get_curr_parent_snapshot_id_idx(void) +{ + return get_curr_snapshot_id_idx() - 1; +} diff --git a/CRIU_code/criu/include/action-scripts.h b/CRIU_code/criu/include/action-scripts.h new file mode 100644 index 0000000..40b09b1 --- /dev/null +++ b/CRIU_code/criu/include/action-scripts.h @@ -0,0 +1,28 @@ +#ifndef __CR_ACTION_SCRIPTS_H__ +#define __CR_ACTION_SCRIPTS_H__ + +#include "asm/int.h" + +enum script_actions { + ACT_PRE_DUMP, + ACT_POST_DUMP, + ACT_PRE_RESTORE, + ACT_POST_RESTORE, + ACT_NET_LOCK, + ACT_NET_UNLOCK, + ACT_SETUP_NS, + ACT_POST_SETUP_NS, + ACT_POST_RESUME, + ACT_PRE_RESUME, + ACT_ORPHAN_PTS_MASTER, + + ACT_MAX +}; + +extern int add_script(char *path); +extern int add_rpc_notify(int sk); +extern int run_scripts(enum script_actions); +extern int rpc_send_fd(enum script_actions, int fd); +extern int send_criu_rpc_script(enum script_actions act, char *name, int sk, int fd); + +#endif /* __CR_ACTION_SCRIPTS_H__ */ diff --git a/CRIU_code/criu/include/aio.h b/CRIU_code/criu/include/aio.h new file mode 100644 index 0000000..858ccd3 --- /dev/null +++ b/CRIU_code/criu/include/aio.h @@ -0,0 +1,35 @@ +#ifndef __CR_AIO_H__ +#define __CR_AIO_H__ + +#include +#include "images/mm.pb-c.h" +unsigned int aio_estimate_nr_reqs(unsigned int size); +int dump_aio_ring(MmEntry *mme, 
struct vma_area *vma); +void free_aios(MmEntry *mme); +struct parasite_ctl; +int parasite_collect_aios(struct parasite_ctl *, struct vm_area_list *); +unsigned long aio_rings_args_size(struct vm_area_list *); +struct task_restore_args; +int prepare_aios(struct pstree_item *t, struct task_restore_args *ta); + +struct aio_ring { + unsigned id; /* kernel internal index number */ + unsigned nr; /* number of io_events */ + unsigned head; /* Written to by userland or under ring_lock + * mutex by aio_read_events_ring(). */ + unsigned tail; + + unsigned magic; + unsigned compat_features; + unsigned incompat_features; + unsigned header_length; /* size of aio_ring */ + + struct io_event io_events[0]; +}; + +struct rst_aio_ring { + unsigned long addr; + unsigned long len; + unsigned int nr_req; +}; +#endif /* __CR_AIO_H__ */ diff --git a/CRIU_code/criu/include/asm-generic/int.h b/CRIU_code/criu/include/asm-generic/int.h new file mode 100644 index 0000000..ac3088d --- /dev/null +++ b/CRIU_code/criu/include/asm-generic/int.h @@ -0,0 +1,15 @@ +#ifndef __CR_INT_H__ +#define __CR_INT_H__ + +#include + +typedef uint64_t u64; +typedef int64_t s64; +typedef uint32_t u32; +typedef int32_t s32; +typedef uint16_t u16; +typedef int16_t s16; +typedef uint8_t u8; +typedef int8_t s8; + +#endif /* __CR_INT_H__ */ diff --git a/CRIU_code/criu/include/asm-generic/vdso.h b/CRIU_code/criu/include/asm-generic/vdso.h new file mode 100644 index 0000000..6c3e3d1 --- /dev/null +++ b/CRIU_code/criu/include/asm-generic/vdso.h @@ -0,0 +1,15 @@ +#ifndef __CR_ASM_GENERIC_VDSO_H__ +#define __CR_ASM_GENERIC_VDSO_H__ + +#define VDSO_PROT (PROT_READ | PROT_EXEC) +#define VVAR_PROT (PROT_READ) + +/* Just in case of LPAE system PFN is u64. 
*/ +#define VDSO_BAD_PFN (-1ull) +#define VVAR_BAD_PFN (-1ull) +#define VDSO_BAD_ADDR (-1ul) +#define VVAR_BAD_ADDR (-1ul) +#define VDSO_BAD_SIZE (-1ul) +#define VVAR_BAD_SIZE (-1ul) + +#endif /* __CR_ASM_GENERIC_VDSO_H__ */ diff --git a/CRIU_code/criu/include/atomic.h b/CRIU_code/criu/include/atomic.h new file mode 100644 index 0000000..41ac632 --- /dev/null +++ b/CRIU_code/criu/include/atomic.h @@ -0,0 +1,4 @@ +#ifndef __CR_INC_ATOMIC_H__ +#define __CR_INC_ATOMIC_H__ +#include "common/asm/atomic.h" +#endif diff --git a/CRIU_code/criu/include/autofs.h b/CRIU_code/criu/include/autofs.h new file mode 100644 index 0000000..c461885 --- /dev/null +++ b/CRIU_code/criu/include/autofs.h @@ -0,0 +1,234 @@ +#ifndef __CR_AUTOFS_H__ +#define __CR_AUTOFS_H__ + +#ifndef AUTOFS_MINOR +#define AUTOFS_MINOR 235 +#endif + +#include + +bool is_autofs_pipe(unsigned long inode); + +struct mount_info; +int autofs_parse(struct mount_info *pm); +int autofs_dump(struct mount_info *pm); +int autofs_mount(struct mount_info *mi, const char *source, const + char *filesystemtype, unsigned long mountflags); + +#include +#include + +#include + +#define AUTOFS_DEVICE_NAME "autofs" + +#define AUTOFS_DEV_IOCTL_VERSION_MAJOR 1 +#define AUTOFS_DEV_IOCTL_VERSION_MINOR 0 + +#define AUTOFS_DEVID_LEN 16 + +#define AUTOFS_DEV_IOCTL_SIZE sizeof(struct autofs_dev_ioctl) + +/* + * An ioctl interface for autofs mount point control. 
+ */ + +struct args_protover { + __u32 version; +}; + +struct args_protosubver { + __u32 sub_version; +}; + +struct args_openmount { + __u32 devid; +}; + +struct args_ready { + __u32 token; +}; + +struct args_fail { + __u32 token; + __s32 status; +}; + +struct args_setpipefd { + __s32 pipefd; +}; + +struct args_timeout { + __u64 timeout; +}; + +struct args_requester { + __u32 uid; + __u32 gid; +}; + +struct args_expire { + __u32 how; +}; + + +struct args_askumount { + __u32 may_umount; +}; + +struct args_ismountpoint { + union { + struct args_in { + __u32 type; + } in; + struct args_out { + __u32 devid; + __u32 magic; + } out; + }; +}; + +/* + * All the ioctls use this structure. + * When sending a path size must account for the total length + * of the chunk of memory otherwise is is the size of the + * structure. + */ + +struct autofs_dev_ioctl { + __u32 ver_major; + __u32 ver_minor; + __u32 size; /* total size of data passed in + * including this struct */ + __s32 ioctlfd; /* automount command fd */ + + /* Command parameters */ + + union { + struct args_protover protover; + struct args_protosubver protosubver; + struct args_openmount openmount; + struct args_ready ready; + struct args_fail fail; + struct args_setpipefd setpipefd; + struct args_timeout timeout; + struct args_requester requester; + struct args_expire expire; + struct args_askumount askumount; + struct args_ismountpoint ismountpoint; + }; + + char path[0]; +}; + +static inline void init_autofs_dev_ioctl(struct autofs_dev_ioctl *in) +{ + memset(in, 0, sizeof(struct autofs_dev_ioctl)); + in->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; + in->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; + in->size = sizeof(struct autofs_dev_ioctl); + in->ioctlfd = -1; + return; +} + + +/* + * If you change this make sure you make the corresponding change + * to autofs-dev-ioctl.c:lookup_ioctl() + */ +enum { + /* Get various version info */ + AUTOFS_DEV_IOCTL_VERSION_CMD = 0x71, + AUTOFS_DEV_IOCTL_PROTOVER_CMD, + 
AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD, + + /* Open mount ioctl fd */ + AUTOFS_DEV_IOCTL_OPENMOUNT_CMD, + + /* Close mount ioctl fd */ + AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD, + + /* Mount/expire status returns */ + AUTOFS_DEV_IOCTL_READY_CMD, + AUTOFS_DEV_IOCTL_FAIL_CMD, + + /* Activate/deactivate autofs mount */ + AUTOFS_DEV_IOCTL_SETPIPEFD_CMD, + AUTOFS_DEV_IOCTL_CATATONIC_CMD, + + /* Expiry timeout */ + AUTOFS_DEV_IOCTL_TIMEOUT_CMD, + + /* Get mount last requesting uid and gid */ + AUTOFS_DEV_IOCTL_REQUESTER_CMD, + + /* Check for eligible expire candidates */ + AUTOFS_DEV_IOCTL_EXPIRE_CMD, + + /* Request busy status */ + AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD, + + /* Check if path is a mountpoint */ + AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD, +}; + +#define AUTOFS_IOCTL 0x93 + +#define AUTOFS_DEV_IOCTL_VERSION \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_VERSION_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_PROTOVER \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_PROTOVER_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_PROTOSUBVER \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_OPENMOUNT \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_OPENMOUNT_CMD, struct autofs_dev_ioctl) + + +#define AUTOFS_DEV_IOCTL_CLOSEMOUNT \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_READY \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_READY_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_FAIL \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_FAIL_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_SETPIPEFD \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_SETPIPEFD_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_CATATONIC \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_CATATONIC_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_TIMEOUT \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_TIMEOUT_CMD, struct autofs_dev_ioctl) + +#define 
AUTOFS_DEV_IOCTL_REQUESTER \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_REQUESTER_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_EXPIRE \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_EXPIRE_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_ASKUMOUNT \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_ISMOUNTPOINT \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD, struct autofs_dev_ioctl) + +#endif diff --git a/CRIU_code/criu/include/bfd.h b/CRIU_code/criu/include/bfd.h new file mode 100644 index 0000000..9f4bdb2 --- /dev/null +++ b/CRIU_code/criu/include/bfd.h @@ -0,0 +1,40 @@ +#ifndef __CR_BFD_H__ +#define __CR_BFD_H__ + +#include "common/err.h" + +struct bfd_buf; +struct xbuf { + char *mem; /* buffer */ + char *data; /* position we see bytes at */ + unsigned int sz; /* bytes sitting after b->pos */ + struct bfd_buf *buf; +}; + +struct bfd { + int fd; + bool writable; + struct xbuf b; +}; + +static inline bool bfd_buffered(struct bfd *b) +{ + return b->b.mem != NULL; +} + +static inline void bfd_setraw(struct bfd *b) +{ + b->b.mem = NULL; +} + +int bfdopenr(struct bfd *f); +int bfdopenw(struct bfd *f); +void bclose(struct bfd *f); +char *breadline(struct bfd *f); +char *breadchr(struct bfd *f, char c); +int bwrite(struct bfd *f, const void *buf, int sz); +struct iovec; +int bwritev(struct bfd *f, const struct iovec *iov, int cnt); +int bread(struct bfd *f, void *buf, int sz); +int bfd_flush_images(void); +#endif diff --git a/CRIU_code/criu/include/bitmap.h b/CRIU_code/criu/include/bitmap.h new file mode 100644 index 0000000..9e701b6 --- /dev/null +++ b/CRIU_code/criu/include/bitmap.h @@ -0,0 +1,7 @@ +#ifndef __CR_BITMAP_H__ +#define __CR_BITMAP_H__ + +extern void bitmap_set(unsigned long *map, int start, int nr); +extern void bitmap_clear(unsigned long *map, int start, int nr); + +#endif /* __CR_BITMAP_H__ */ diff --git a/CRIU_code/criu/include/bitops.h 
b/CRIU_code/criu/include/bitops.h new file mode 100644 index 0000000..cfc4991 --- /dev/null +++ b/CRIU_code/criu/include/bitops.h @@ -0,0 +1,4 @@ +#ifndef __CR_INC_BITOPS_H__ +#define __CR_INC_BITOPS_H__ +#include "common/bitops.h" +#endif diff --git a/CRIU_code/criu/include/bitsperlong.h b/CRIU_code/criu/include/bitsperlong.h new file mode 100644 index 0000000..0e530d5 --- /dev/null +++ b/CRIU_code/criu/include/bitsperlong.h @@ -0,0 +1,4 @@ +#ifndef __CR_INC_BITSPERLONG_H__ +#define __CR_INC_BITSPERLONG_H__ +#include "common/bitsperlong.h" +#endif diff --git a/CRIU_code/criu/include/cgroup-props.h b/CRIU_code/criu/include/cgroup-props.h new file mode 100644 index 0000000..0e52010 --- /dev/null +++ b/CRIU_code/criu/include/cgroup-props.h @@ -0,0 +1,20 @@ +#ifndef __CR_CGROUP_PROPS_H__ +#define __CR_CGROUP_PROPS_H__ + +#include + +typedef struct { + const char *name; + size_t nr_props; + const char **props; +} cgp_t; + +extern cgp_t cgp_global; +extern const cgp_t *cgp_get_props(const char *name); +extern bool cgp_should_skip_controller(const char *name); +extern bool cgp_add_dump_controller(const char *name); + +extern int cgp_init(char *stream, size_t len, char *path); +extern void cgp_fini(void); + +#endif /* __CR_CGROUP_PROPS_H__ */ diff --git a/CRIU_code/criu/include/cgroup.h b/CRIU_code/criu/include/cgroup.h new file mode 100644 index 0000000..949266d --- /dev/null +++ b/CRIU_code/criu/include/cgroup.h @@ -0,0 +1,95 @@ +#ifndef __CR_CGROUP_H__ +#define __CR_CGROUP_H__ + +#include "int.h" +#include "images/core.pb-c.h" + +struct pstree_item; +struct parasite_dump_cgroup_args; +extern u32 root_cg_set; +int dump_task_cgroup(struct pstree_item *, u32 *, struct parasite_dump_cgroup_args *args); +int dump_cgroups(void); +int prepare_task_cgroup(struct pstree_item *); +int prepare_cgroup(void); +/* Restore things like cpu_limit in known cgroups. 
*/ +int prepare_cgroup_properties(void); +int restore_freezer_state(void); +void fini_cgroup(void); + +struct cg_controller; + +struct cgroup_prop { + char *name; + char *value; + mode_t mode; + uid_t uid; + gid_t gid; + struct list_head list; +}; + +/* This describes a particular cgroup path, e.g. the '/lxc/u1' part of + * 'blkio/lxc/u1' and any properties it has. + */ +struct cgroup_dir { + char *path; + mode_t mode; + uid_t uid; + gid_t gid; + + struct list_head properties; + unsigned int n_properties; + + /* this is how children are linked together */ + struct list_head siblings; + + /* more cgroup_dirs */ + struct list_head children; + unsigned int n_children; +}; + +/* This describes a particular cgroup controller, e.g. blkio or cpuset. + * The heads are subdirectories organized in their tree format. + */ +struct cg_controller { + unsigned int n_controllers; + char **controllers; + + /* cgroup_dirs */ + struct list_head heads; + unsigned int n_heads; + + /* for cgroup list in cgroup.c */ + struct list_head l; +}; +struct cg_controller *new_controller(const char *name); + +/* parse all global cgroup information into structures */ +int parse_cg_info(void); +int new_cg_root_add(char *controller, char *newroot); + +extern struct ns_desc cgroup_ns_desc; + +/* + * This struct describes a group controlled by one controller. + * The @name is the controller name or 'name=...' for named cgroups. + * The @path is the path from the hierarchy root. 
+ */ + +struct cg_ctl { + struct list_head l; + char *name; + char *path; + u32 cgns_prefix; +}; + +/* + * Returns the list of cg_ctl-s sorted by name + */ +struct list_head; +struct parasite_dump_cgroup_args; +extern int parse_task_cgroup(int pid, struct parasite_dump_cgroup_args *args, struct list_head *l, unsigned int *n); +extern void put_ctls(struct list_head *); + +int collect_controllers(struct list_head *cgroups, unsigned int *n_cgroups); + +#endif /* __CR_CGROUP_H__ */ diff --git a/CRIU_code/criu/include/clone-noasan.h b/CRIU_code/criu/include/clone-noasan.h new file mode 100644 index 0000000..8ef75fa --- /dev/null +++ b/CRIU_code/criu/include/clone-noasan.h @@ -0,0 +1,6 @@ +#ifndef __CR_CLONE_NOASAN_H__ +#define __CR_CLONE_NOASAN_H__ + +int clone_noasan(int (*fn)(void *), int flags, void *arg); + +#endif /* __CR_CLONE_NOASAN_H__ */ diff --git a/CRIU_code/criu/include/cpu.h b/CRIU_code/criu/include/cpu.h new file mode 100644 index 0000000..e306967 --- /dev/null +++ b/CRIU_code/criu/include/cpu.h @@ -0,0 +1,12 @@ +#ifndef __CR_CPU_H__ +#define __CR_CPU_H__ + +#include + +extern int cpu_init(void); +extern int cpu_dump_cpuinfo(void); +extern int cpu_validate_cpuinfo(void); +extern int cpuinfo_dump(void); +extern int cpuinfo_check(void); + +#endif /* __CR_CPU_H__ */ diff --git a/CRIU_code/criu/include/cr-errno.h b/CRIU_code/criu/include/cr-errno.h new file mode 100644 index 0000000..1f94988 --- /dev/null +++ b/CRIU_code/criu/include/cr-errno.h @@ -0,0 +1,17 @@ +#ifndef __CR_ERRNO_H__ +#define __CR_ERRNO_H__ + +void set_cr_errno(int err); +int get_cr_errno(void); + +/* + * List of symbolic error names: + * ESRCH - no process can be found corresponding to that specified by pid + * EEXIST - process with such pid already exists + * EBADRQC - bad options + */ + +#define set_task_cr_err(new_err) atomic_cmpxchg(&task_entries->cr_err, 0, new_err) +#define get_task_cr_err() atomic_read(&task_entries->cr_err) + +#endif /* __CR_ERRNO_H__ */ diff --git 
a/CRIU_code/criu/include/cr-service-const.h b/CRIU_code/criu/include/cr-service-const.h new file mode 100644 index 0000000..c6d2e39 --- /dev/null +++ b/CRIU_code/criu/include/cr-service-const.h @@ -0,0 +1,6 @@ +#ifndef __CR_SERVICE_CONST_H__ +#define __CR_SERVICE_CONST_H__ + +#define CR_DEFAULT_SERVICE_ADDRESS "./criu_service.socket" + +#endif /* __CR_SERVICE_CONST_H__ */ diff --git a/CRIU_code/criu/include/cr-service.h b/CRIU_code/criu/include/cr-service.h new file mode 100644 index 0000000..37cf7b6 --- /dev/null +++ b/CRIU_code/criu/include/cr-service.h @@ -0,0 +1,14 @@ +#ifndef __CR_SERVICE_H__ +#define __CR_SERVICE_H__ + +#include "images/rpc.pb-c.h" + +extern int cr_service(bool daemon_mode); +int cr_service_work(int sk); + +extern int send_criu_dump_resp(int socket_fd, bool success, bool restored); + +extern struct _cr_service_client *cr_service_client; +extern unsigned int service_sk_ino; + +#endif /* __CR_SERVICE_H__ */ diff --git a/CRIU_code/criu/include/cr_options.h b/CRIU_code/criu/include/cr_options.h new file mode 100644 index 0000000..c519c74 --- /dev/null +++ b/CRIU_code/criu/include/cr_options.h @@ -0,0 +1,157 @@ +#ifndef __CR_OPTIONS_H__ +#define __CR_OPTIONS_H__ + +#include +#include +#include "common/config.h" +#include "common/list.h" + +/* Configuration and CLI parsing order defines */ +#define PARSING_GLOBAL_CONF 1 +#define PARSING_USER_CONF 2 +#define PARSING_ENV_CONF 3 +#define PARSING_CMDLINE_CONF 4 +#define PARSING_ARGV 5 +#define PARSING_RPC_CONF 6 +#define PARSING_LAST 7 + +#define SET_CHAR_OPTS(__dest, __src) \ + do { \ + free(opts.__dest); \ + opts.__dest = xstrdup(__src); \ + } while(0) + +/* + * CPU capability options. 
+ */ +#define CPU_CAP_NONE (0u << 0) /* Don't check capability at all */ +#define CPU_CAP_FPU (1u << 0) /* Only FPU capability required */ +#define CPU_CAP_CPU (1u << 1) /* Strict CPU capability required */ +#define CPU_CAP_INS (1u << 2) /* Instructions CPU capability */ +#define CPU_CAP_IMAGE (1u << 3) /* Write capability on dump and read on restore*/ +#define CPU_CAP_ALL (CPU_CAP_FPU | CPU_CAP_CPU | CPU_CAP_INS) +#define CPU_CAP_DEFAULT (CPU_CAP_FPU | CPU_CAP_INS) + +struct cg_root_opt { + struct list_head node; + char *controller; + char *newroot; +}; + +/* + * Cgroup management options. + */ +#define CG_MODE_IGNORE (0u << 0) /* Zero is important here */ +#define CG_MODE_NONE (1u << 0) +#define CG_MODE_PROPS (1u << 1) +#define CG_MODE_SOFT (1u << 2) +#define CG_MODE_FULL (1u << 3) +#define CG_MODE_STRICT (1u << 4) + +#define CG_MODE_DEFAULT (CG_MODE_SOFT) + +/* + * Ghost file size we allow to carry by default. + */ +#define DEFAULT_GHOST_LIMIT (1 << 20) + +#define DEFAULT_TIMEOUT 10 + +struct irmap; + +struct irmap_path_opt { + struct list_head node; + struct irmap *ir; +}; + +struct cr_options { + int final_state; + int check_extra_features; + int check_experimental_features; + union { + int restore_detach; + bool daemon_mode; + }; + int restore_sibling; + bool ext_unix_sk; + int shell_job; + int handle_file_locks; + int tcp_established_ok; + int tcp_close; + int evasive_devices; + int link_remap_ok; + int log_file_per_pid; + bool swrk_restore; + char *output; + char *root; + char *pidfile; + char *freeze_cgroup; + struct list_head ext_mounts; + struct list_head inherit_fds; + struct list_head external; + struct list_head join_ns; + char *libdir; + int use_page_server; + unsigned short port; + char *addr; + int ps_socket; + int track_mem; + char *img_parent; + int auto_dedup; + unsigned int cpu_cap; + int force_irmap; + char **exec_cmd; + unsigned int manage_cgroups; + char *new_global_cg_root; + char *cgroup_props; + char *cgroup_props_file; + struct list_head 
new_cgroup_roots; + bool autodetect_ext_mounts; + int enable_external_sharing; + int enable_external_masters; + bool aufs; /* auto-detected, not via cli */ + bool overlayfs; +#ifdef CONFIG_BINFMT_MISC_VIRTUALIZED + bool has_binfmt_misc; /* auto-detected */ +#endif + size_t ghost_limit; + struct list_head irmap_scan_paths; + bool lsm_supplied; + char *lsm_profile; + unsigned int timeout; + unsigned int empty_ns; + int tcp_skip_in_flight; + bool lazy_pages; + char *work_dir; + + /* + * When we schedule some functionality for removal we first + * deprecate it and it sits in criu for some time. By default + * the deprecated stuff is not working, but it's still possible + * to turn one ON while the code is in. + */ + int deprecated_ok; + int display_stats; + int weak_sysctls; + int status_fd; + bool orphan_pts_master; + int remote; + pid_t tree_id; + int log_level; + char *imgs_dir; + char *tls_cacert; + char *tls_cacrl; + char *tls_cert; + char *tls_key; + int tls; + int tls_no_cn_verify; +}; + +extern struct cr_options opts; +char *rpc_cfg_file; + +extern int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, int state); +extern int check_options(); +extern void init_opts(); + +#endif /* __CR_OPTIONS_H__ */ diff --git a/CRIU_code/criu/include/criu-log.h b/CRIU_code/criu/include/criu-log.h new file mode 100644 index 0000000..21ef543 --- /dev/null +++ b/CRIU_code/criu/include/criu-log.h @@ -0,0 +1,50 @@ +/* + This file defines types and macros for CRIU plugins. + Copyright (C) 2013 Parallels, Inc + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef __CRIU_LOG_H__ +#define __CRIU_LOG_H__ + +#include "log.h" + +struct timeval; + +extern int log_init(const char *output); +extern void log_fini(void); +extern int log_init_by_pid(pid_t pid); +extern void log_closedir(void); +extern int log_keep_err(void); +extern char *log_first_err(void); + +extern void log_set_fd(int fd); +extern int log_get_fd(void); + +extern void log_set_loglevel(unsigned int loglevel); +extern unsigned int log_get_loglevel(void); +struct timeval; +extern void log_get_logstart(struct timeval *); + +extern int write_pidfile(int pid); + +#define DEFAULT_LOG_FILENAME "criu.log" + +static inline int pr_quelled(unsigned int loglevel) +{ + return log_get_loglevel() < loglevel && loglevel != LOG_MSG; +} +#endif /* __CRIU_LOG_H__ */ diff --git a/CRIU_code/criu/include/criu-plugin.h b/CRIU_code/criu/include/criu-plugin.h new file mode 100644 index 0000000..b76f5f8 --- /dev/null +++ b/CRIU_code/criu/include/criu-plugin.h @@ -0,0 +1,132 @@ +/* + * This file defines types and macros for CRIU plugins. + * Copyright (C) 2013-2014 Parallels, Inc + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef __CRIU_PLUGIN_H__ +#define __CRIU_PLUGIN_H__ + +#include +#include + +#define CRIU_PLUGIN_GEN_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c)) +#define CRIU_PLUGIN_VERSION_MAJOR 0 +#define CRIU_PLUGIN_VERSION_MINOR 2 +#define CRIU_PLUGIN_VERSION_SUBLEVEL 0 + +#define CRIU_PLUGIN_VERSION_OLD CRIU_PLUGIN_GEN_VERSION(0,1,0) + +#define CRIU_PLUGIN_VERSION \ + CRIU_PLUGIN_GEN_VERSION(CRIU_PLUGIN_VERSION_MAJOR, \ + CRIU_PLUGIN_VERSION_MINOR, \ + CRIU_PLUGIN_VERSION_SUBLEVEL) + +/* + * Plugin hook points and their arguments in hooks. + */ +enum { + CR_PLUGIN_HOOK__DUMP_UNIX_SK = 0, + CR_PLUGIN_HOOK__RESTORE_UNIX_SK = 1, + + CR_PLUGIN_HOOK__DUMP_EXT_FILE = 2, + CR_PLUGIN_HOOK__RESTORE_EXT_FILE = 3, + + CR_PLUGIN_HOOK__DUMP_EXT_MOUNT = 4, + CR_PLUGIN_HOOK__RESTORE_EXT_MOUNT = 5, + + CR_PLUGIN_HOOK__DUMP_EXT_LINK = 6, + + CR_PLUGIN_HOOK__MAX +}; + +#define DECLARE_PLUGIN_HOOK_ARGS(__hook, ...) \ + typedef int (__hook ##_t)(__VA_ARGS__) + +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_UNIX_SK, int fd, int id); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_UNIX_SK, int id); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_FILE, int fd, int id); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_FILE, int id); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_MOUNT, char *mountpoint, int id); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_MOUNT, int id, char *mountpoint, char *old_root, int *is_file); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_LINK, int index, int type, char *kind); + +enum { + CR_PLUGIN_STAGE__DUMP, + CR_PLUGIN_STAGE__PRE_DUMP, + CR_PLUGIN_STAGE__RESTORE, + + CR_PLUGIN_STAGE_MAX +}; + +/* + * Plugin descriptor. 
+ */ +typedef struct { + const char *name; + int (*init)(int stage); + void (*exit)(int stage, int ret); + unsigned int version; + unsigned int max_hooks; + void *hooks[CR_PLUGIN_HOOK__MAX]; +} cr_plugin_desc_t; + +extern cr_plugin_desc_t CR_PLUGIN_DESC; + +#define CR_PLUGIN_REGISTER(___name, ___init, ___exit) \ + cr_plugin_desc_t CR_PLUGIN_DESC = { \ + .name = ___name, \ + .init = ___init, \ + .exit = ___exit, \ + .version = CRIU_PLUGIN_VERSION, \ + .max_hooks = CR_PLUGIN_HOOK__MAX, \ + }; + +static inline int cr_plugin_dummy_init(int stage) { return 0; } +static inline void cr_plugin_dummy_exit(int stage, int ret) { } + +#define CR_PLUGIN_REGISTER_DUMMY(___name) \ + cr_plugin_desc_t CR_PLUGIN_DESC = { \ + .name = ___name, \ + .init = cr_plugin_dummy_init, \ + .exit = cr_plugin_dummy_exit, \ + .version = CRIU_PLUGIN_VERSION, \ + .max_hooks = CR_PLUGIN_HOOK__MAX, \ + }; + +#define CR_PLUGIN_REGISTER_HOOK(__hook, __func) \ +static void __attribute__((constructor)) cr_plugin_register_hook_##__func (void) \ +{ \ + CR_PLUGIN_DESC.hooks[__hook] = (void *)__func; \ +} + +/* Public API */ +extern int criu_get_image_dir(void); + +/* + * Deprecated, will be removed in next version. 
+ */ +typedef int (cr_plugin_init_t)(void); +typedef void (cr_plugin_fini_t)(void); +typedef int (cr_plugin_dump_unix_sk_t)(int fd, int id); +typedef int (cr_plugin_restore_unix_sk_t)(int id); +typedef int (cr_plugin_dump_file_t)(int fd, int id); +typedef int (cr_plugin_restore_file_t)(int id); +typedef int (cr_plugin_dump_ext_mount_t)(char *mountpoint, int id); +typedef int (cr_plugin_restore_ext_mount_t)(int id, char *mountpoint, char *old_root, int *is_file); +typedef int (cr_plugin_dump_ext_link_t)(int index, int type, char *kind); + +#endif /* __CRIU_PLUGIN_H__ */ diff --git a/CRIU_code/criu/include/crtools.h b/CRIU_code/criu/include/crtools.h new file mode 100644 index 0000000..c5a5b64 --- /dev/null +++ b/CRIU_code/criu/include/crtools.h @@ -0,0 +1,47 @@ +#ifndef __CR_CRTOOLS_H__ +#define __CR_CRTOOLS_H__ + +#include + +#include "common/list.h" +#include "servicefd.h" + +#include "images/inventory.pb-c.h" + +#define CR_FD_PERM (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH) + +extern int check_img_inventory(void); +extern int write_img_inventory(InventoryEntry *he); +extern int inventory_save_uptime(InventoryEntry *he); +extern InventoryEntry *get_parent_inventory(void); +extern int prepare_inventory(InventoryEntry *he); +struct pprep_head { + int (*actor)(struct pprep_head *); + struct pprep_head *next; +}; +extern void add_post_prepare_cb(struct pprep_head *); +extern bool deprecated_ok(char *what); +extern int cr_dump_tasks(pid_t pid); +extern int cr_pre_dump_tasks(pid_t pid); +extern int cr_restore_tasks(void); +extern int convert_to_elf(char *elf_path, int fd_core); +extern int cr_check(void); +extern int cr_dedup(void); +extern int cr_lazy_pages(bool daemon); + +extern int check_add_feature(char *arg); +extern void pr_check_features(const char *offset, const char *sep, int width); + +#define PPREP_HEAD_INACTIVE ((struct pprep_head *)-1) + +#define add_post_prepare_cb_once(phead) do { \ + if ((phead)->next == PPREP_HEAD_INACTIVE)\ + add_post_prepare_cb(phead); \ 
+ } while (0) + +#define MAKE_PPREP_HEAD(name) struct pprep_head name = { \ + .next = PPREP_HEAD_INACTIVE, \ + .actor = name##_cb, \ + } + +#endif /* __CR_CRTOOLS_H__ */ diff --git a/CRIU_code/criu/include/dump.h b/CRIU_code/criu/include/dump.h new file mode 100644 index 0000000..1c14468 --- /dev/null +++ b/CRIU_code/criu/include/dump.h @@ -0,0 +1,7 @@ +#ifndef __CR_INC_DUMP_H__ +#define __CR_INC_DUMP_H__ +#include "asm/dump.h" + +extern int arch_set_thread_regs(struct pstree_item *item, bool with_threads); + +#endif diff --git a/CRIU_code/criu/include/eventfd.h b/CRIU_code/criu/include/eventfd.h new file mode 100644 index 0000000..65e0af7 --- /dev/null +++ b/CRIU_code/criu/include/eventfd.h @@ -0,0 +1,10 @@ +#ifndef __CR_EVENTFD_H__ +#define __CR_EVENTFD_H__ + +#include "files.h" + +extern int is_eventfd_link(char *link); +extern const struct fdtype_ops eventfd_dump_ops; +extern struct collect_image_info eventfd_cinfo; + +#endif /* __CR_EVENTFD_H__ */ diff --git a/CRIU_code/criu/include/eventpoll.h b/CRIU_code/criu/include/eventpoll.h new file mode 100644 index 0000000..411c5c9 --- /dev/null +++ b/CRIU_code/criu/include/eventpoll.h @@ -0,0 +1,13 @@ +#ifndef __CR_EVENTPOLL_H__ +#define __CR_EVENTPOLL_H__ + +#include "files.h" + +extern int is_eventpoll_link(char *link); +extern int flush_eventpoll_dinfo_queue(void); + +extern const struct fdtype_ops eventpoll_dump_ops; +extern struct collect_image_info epoll_tfd_cinfo; +extern struct collect_image_info epoll_cinfo; + +#endif /* __CR_EVENTPOLL_H__ */ diff --git a/CRIU_code/criu/include/external.h b/CRIU_code/criu/include/external.h new file mode 100644 index 0000000..d284b79 --- /dev/null +++ b/CRIU_code/criu/include/external.h @@ -0,0 +1,28 @@ +#ifndef __CR_EXTERNAL_H__ +#define __CR_EXTERNAL_H__ +struct external { + struct list_head node; + char *id; + void *data; +}; + +extern int add_external(char *key); +extern bool external_lookup_id(char *id); +extern char *external_lookup_by_key(char *id); +extern void 
*external_lookup_data(char *id); +extern int external_for_each_type(char *type, int (*cb)(struct external *, void *), void *arg); + +static inline char *external_val(struct external *e) +{ + char *aux; + + aux = strchr(e->id, '['); + if (aux) { + aux = strchr(aux + 1, ']'); + if (aux && aux[1] == ':') + return aux + 2; + } + + return NULL; +} +#endif diff --git a/CRIU_code/criu/include/fault-injection.h b/CRIU_code/criu/include/fault-injection.h new file mode 100644 index 0000000..852d271 --- /dev/null +++ b/CRIU_code/criu/include/fault-injection.h @@ -0,0 +1,51 @@ +#ifndef __CR_FAULT_INJECTION_H__ +#define __CR_FAULT_INJECTION_H__ +#include + +enum faults { + FI_NONE = 0, + FI_DUMP_EARLY, + FI_RESTORE_ROOT_ONLY, + FI_DUMP_PAGES, + FI_RESTORE_OPEN_LINK_REMAP, + FI_PARASITE_CONNECT, + FI_POST_RESTORE, + /* not fatal */ + FI_VDSO_TRAMPOLINES = 127, + FI_CHECK_OPEN_HANDLE = 128, + FI_NO_MEMFD = 129, + FI_NO_BREAKPOINTS = 130, + FI_PARTIAL_PAGES = 131, + FI_HUGE_ANON_SHMEM_ID = 132, + FI_MAX, +}; + +static inline bool __fault_injected(enum faults f, enum faults fi_strategy) +{ + /* + * Temporary workaround for Xen guests. Breakpoints degrade + * performance linearly, so until we find out the reason, + * let's disable them. 
+ */ + if (f == FI_NO_BREAKPOINTS) + return true; + + return fi_strategy == f; +} + +#define FI_HUGE_ANON_SHMEM_ID_BASE (0xfffffffflu) + +#ifndef CR_NOGLIBC + +extern enum faults fi_strategy; +#define fault_injected(f) __fault_injected(f, fi_strategy) + +extern int fault_injection_init(void); + +#else /* CR_NOGLIBC */ + +extern bool fault_injected(enum faults f); + +#endif + +#endif diff --git a/CRIU_code/criu/include/fcntl.h b/CRIU_code/criu/include/fcntl.h new file mode 100644 index 0000000..d9c5c5e --- /dev/null +++ b/CRIU_code/criu/include/fcntl.h @@ -0,0 +1,49 @@ +#ifndef __CR_ASM_GENERIC_FCNTL_H__ +#define __CR_ASM_GENERIC_FCNTL_H__ + +#include +#include + +#ifndef F_SETOWN_EX +#define F_SETOWN_EX 15 +#define F_GETOWN_EX 16 + +struct f_owner_ex { + int type; + pid_t pid; +}; + +#endif + +#ifndef F_GETOWNER_UIDS +#define F_GETOWNER_UIDS 17 +#endif + +/* + * These things are required to compile on CentOS-6 + */ +#ifndef F_LINUX_SPECIFIC_BASE +# define F_LINUX_SPECIFIC_BASE 1024 +#endif + +#ifndef F_SETPIPE_SZ +# define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7) +#endif + +#ifndef F_GETPIPE_SZ +# define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8) +#endif + +#ifndef O_PATH +# define O_PATH 010000000 +#endif + +#ifndef __O_TMPFILE +#define __O_TMPFILE 020000000 +#endif + +#ifndef O_TMPFILE +#define O_TMPFILE (__O_TMPFILE | O_DIRECTORY) +#endif + +#endif /* __CR_ASM_GENERIC_FCNTL_H__ */ diff --git a/CRIU_code/criu/include/fdinfo.h b/CRIU_code/criu/include/fdinfo.h new file mode 100644 index 0000000..10fb31f --- /dev/null +++ b/CRIU_code/criu/include/fdinfo.h @@ -0,0 +1,22 @@ +#ifndef __CR_FDINFO_H__ +#define __CR_FDINFO_H__ + +#include "common/list.h" + +#include "images/eventfd.pb-c.h" +#include "images/eventpoll.pb-c.h" +#include "images/signalfd.pb-c.h" +#include "images/fsnotify.pb-c.h" +#include "images/timerfd.pb-c.h" + +struct fdinfo_common { + off64_t pos; + int flags; + int mnt_id; + int owner; +}; + +extern int parse_fdinfo(int fd, int type, void *arg); 
+extern int parse_fdinfo_pid(int pid, int fd, int type, void *arg); + +#endif diff --git a/CRIU_code/criu/include/fdstore.h b/CRIU_code/criu/include/fdstore.h new file mode 100644 index 0000000..bdfb5fe --- /dev/null +++ b/CRIU_code/criu/include/fdstore.h @@ -0,0 +1,17 @@ +#ifndef __CRIU_FDSTORE_H__ +#define __CRIU_FDSTORE_H__ + +/* + * fdstore is a storage for file descriptors which is shared + * between processes. + */ + +int fdstore_init(void); + +/* Add a file descriptor to the storage and return its id */ +int fdstore_add(int fd); + +/* Get a file descriptor from a storage by id */ +int fdstore_get(int id); + +#endif diff --git a/CRIU_code/criu/include/fifo.h b/CRIU_code/criu/include/fifo.h new file mode 100644 index 0000000..9560a7b --- /dev/null +++ b/CRIU_code/criu/include/fifo.h @@ -0,0 +1,11 @@ +#ifndef __CR_FIFO_H__ +#define __CR_FIFO_H__ + +struct fd_parms; +struct cr_imgset; + +extern const struct fdtype_ops fifo_dump_ops; +extern struct collect_image_info fifo_cinfo; +extern struct collect_image_info fifo_data_cinfo; + +#endif /* __CR_FIFO_H__ */ diff --git a/CRIU_code/criu/include/file-ids.h b/CRIU_code/criu/include/file-ids.h new file mode 100644 index 0000000..9a39f0d --- /dev/null +++ b/CRIU_code/criu/include/file-ids.h @@ -0,0 +1,21 @@ +#ifndef __CR_FILE_IDS_H__ +#define __CR_FILE_IDS_H__ + +#include "common/compiler.h" +#include "rbtree.h" + +#include "images/fdinfo.pb-c.h" + +#define FD_PID_INVALID (-2U) +#define FD_DESC_INVALID (-3U) + +struct fdinfo_entry; +struct stat; + +struct fd_parms; +extern int fd_id_generate(pid_t pid, FdinfoEntry *fe, struct fd_parms *p); +extern int fd_id_generate_special(struct fd_parms *p, u32 *id); + +extern struct kid_tree fd_tree; + +#endif /* __CR_FILE_IDS_H__ */ diff --git a/CRIU_code/criu/include/file-lock.h b/CRIU_code/criu/include/file-lock.h new file mode 100644 index 0000000..dc4f382 --- /dev/null +++ b/CRIU_code/criu/include/file-lock.h @@ -0,0 +1,79 @@ +#ifndef __FILE_LOCK_H__ +#define __FILE_LOCK_H__ 
+ +#include "common/list.h" + +#include "protobuf.h" +#include "images/file-lock.pb-c.h" + +#define FL_UNKNOWN -1 +#define FL_POSIX 1 +#define FL_FLOCK 2 +#define FL_OFD 4 +#define FL_LEASE 8 + +/* for posix fcntl() and lockf() */ +#ifndef F_RDLCK +#define F_RDLCK 0 +#define F_WRLCK 1 +#define F_UNLCK 2 +#endif + +/* for OFD locks fcntl() */ +#ifndef F_OFD_GETLK +#define F_OFD_GETLK 36 +#define F_OFD_SETLK 37 +#define F_OFD_SETLKW 38 +#endif + +/* operations for bsd flock(), also used by the kernel implementation */ +#define LOCK_SH 1 /* shared lock */ +#define LOCK_EX 2 /* exclusive lock */ +#define LOCK_NB 4 /* or'd with one of the above to prevent + blocking */ +#define LOCK_UN 8 /* remove lock */ + +#define LOCK_MAND 32 /* This is a mandatory flock ... */ +#define LOCK_READ 64 /* which allows concurrent read operations */ +#define LOCK_WRITE 128 /* which allows concurrent write operations */ +#define LOCK_RW 192 /* which allows concurrent read & write ops */ + +/* for leases */ +#define LEASE_BREAKING 4 + +struct file_lock { + long long fl_id; + int fl_kind; + int fl_ltype; + + pid_t fl_owner; /* process, which created the lock */ + pid_t fl_holder; /* pid of fd on whose the lock is found */ + int maj, min; + unsigned long i_no; + long long start; + char end[32]; + + struct list_head list; /* list of all file locks */ + + int real_owner; + int owners_fd; +}; + +extern struct list_head file_lock_list; + +extern struct file_lock *alloc_file_lock(void); +extern void free_file_locks(void); + +extern int prepare_file_locks(int pid); +extern struct collect_image_info file_locks_cinfo; + +struct pid; +struct fd_parms; +extern void discard_dup_locks_tail(pid_t pid, int fd); +extern int correct_file_leases_type(struct pid *, int fd, int lfd); +extern int note_file_lock(struct pid *, int fd, int lfd, struct fd_parms *); +extern int dump_file_locks(void); + +#define OPT_FILE_LOCKS "file-locks" + +#endif /* __FILE_LOCK_H__ */ diff --git a/CRIU_code/criu/include/files-reg.h 
b/CRIU_code/criu/include/files-reg.h new file mode 100644 index 0000000..7a22d4d --- /dev/null +++ b/CRIU_code/criu/include/files-reg.h @@ -0,0 +1,59 @@ +#ifndef __CR_FILES_REG_H__ +#define __CR_FILES_REG_H__ + +#include "files.h" + +#include "images/regfile.pb-c.h" +#include "images/ghost-file.pb-c.h" + +struct cr_imgset; +struct fd_parms; + +struct file_remap { + char *rpath; + bool is_dir; + int rmnt_id; + uid_t uid; + gid_t gid; +}; + +struct reg_file_info { + struct file_desc d; + RegFileEntry *rfe; + struct file_remap *remap; + bool size_mode_checked; + bool is_dir; + char *path; +}; + +extern int open_reg_by_id(u32 id); +extern int open_reg_fd(struct file_desc *); +extern int open_path(struct file_desc *, int (*open_cb)(int ns_root_fd, + struct reg_file_info *, void *), void *arg); +extern void clear_ghost_files(void); + +extern const struct fdtype_ops regfile_dump_ops; +extern int do_open_reg_noseek_flags(int ns_root_fd, struct reg_file_info *rfi, void *arg); +extern int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p); + +extern struct file_remap *lookup_ghost_remap(u32 dev, u32 ino); + +extern struct file_desc *try_collect_special_file(u32 id, int optional); +#define collect_special_file(id) try_collect_special_file(id, 0) +extern int collect_filemap(struct vma_area *); +extern void filemap_ctx_init(bool auto_close); +extern void filemap_ctx_fini(void); + +extern struct collect_image_info reg_file_cinfo; +extern int collect_remaps_and_regfiles(void); + +extern void delete_link_remaps(void); +extern void free_link_remaps(void); +extern int prepare_remaps(void); +extern int try_clean_remaps(bool only_ghosts); + +extern int strip_deleted(struct fd_link *link); + +extern int dead_pid_conflict(void); + +#endif /* __CR_FILES_REG_H__ */ diff --git a/CRIU_code/criu/include/files.h b/CRIU_code/criu/include/files.h new file mode 100644 index 0000000..2c1e1e7 --- /dev/null +++ b/CRIU_code/criu/include/files.h @@ -0,0 +1,207 @@ +#ifndef __CR_FILES_H__ 
+#define __CR_FILES_H__ + +#include + +#include "int.h" +#include "common/compiler.h" +#include "fcntl.h" +#include "common/lock.h" +#include "common/list.h" +#include "pid.h" +#include "rst_info.h" + +#include "images/fdinfo.pb-c.h" +#include "images/fown.pb-c.h" +#include "images/vma.pb-c.h" + +struct parasite_drain_fd; +struct pstree_item; +struct file_desc; +struct cr_imgset; +struct rst_info; +struct parasite_ctl; + +struct fd_link { + union { + /* Link info for generic file (path) */ + struct { + char name[PATH_MAX]; + size_t len; + }; + + /* Link info for proc-ns file */ + struct { + struct ns_desc *ns_d; + unsigned int ns_kid; + }; + }; +}; + +struct fd_parms { + int fd; + off_t pos; + unsigned int flags; + char fd_flags; + struct stat stat; + pid_t pid; + FownEntry fown; + struct fd_link *link; + long fs_type; + int mnt_id; + + struct parasite_ctl *fd_ctl; + struct parasite_drain_fd *dfds; +}; + +#define FD_PARMS_INIT \ +(struct fd_parms) { \ + .fd = FD_DESC_INVALID, \ + .fown = FOWN_ENTRY__INIT, \ + .link = NULL, \ + .mnt_id = -1, \ +} + +extern int fill_fdlink(int lfd, const struct fd_parms *p, struct fd_link *link); +extern uint32_t make_gen_id(uint32_t st_dev, uint32_t st_ino, uint64_t pos); + +struct file_desc; + +enum { + FLE_INITIALIZED, + /* + * FLE is open (via open(), socket() or similar syscalls), and + * common file settings are set up (type-specific ones are not yet). + * Most likely, the master was already served out. + */ + FLE_OPEN, + /* + * File-type specific settings and preparations are finished, + * and FLE is completely restored. 
+ */ + FLE_RESTORED, +}; + +struct fdinfo_list_entry { + struct list_head desc_list; /* To chain on @fd_info_head */ + struct file_desc *desc; /* Associated file descriptor */ + struct list_head ps_list; /* To chain per-task files */ + struct pstree_item *task; + FdinfoEntry *fe; + int pid; + u8 received:1; + u8 stage:3; + u8 fake:1; +}; + +extern int inh_fd_max; + +/* reports whether fd_a takes prio over fd_b */ +static inline int fdinfo_rst_prio(struct fdinfo_list_entry *fd_a, struct fdinfo_list_entry *fd_b) +{ + return pid_rst_prio(fd_a->pid, fd_b->pid) || + ((fd_a->pid == fd_b->pid) && (fd_a->fe->fd < fd_b->fe->fd)); +} + +struct file_desc_ops { + /* fd_types from images/fdinfo.proto */ + unsigned int type; + /* + * Opens a file by whatever syscall is required for that. + * The returned descriptor may be closed (dup2-ed to another) + * so it shouldn't be saved for any post-actions. + */ + int (*open)(struct file_desc *d, int *new_fd); + char * (*name)(struct file_desc *, char *b, size_t s); +}; + +int collect_fd(int pid, FdinfoEntry *e, struct rst_info *rst_info, bool ghost); +struct fdinfo_list_entry *collect_fd_to(int pid, FdinfoEntry *e, + struct rst_info *rst_info, struct file_desc *fdesc, + bool fake, bool force_master); + +u32 find_unused_file_desc_id(void); +unsigned int find_unused_fd(struct pstree_item *, int hint_fd); +struct fdinfo_list_entry *find_used_fd(struct pstree_item *, int fd); + +struct file_desc { + u32 id; /* File id, unique */ + struct hlist_node hash; /* Descriptor hashing and lookup */ + struct list_head fd_info_head; /* Chain of fdinfo_list_entry-s with same ID and type but different pids */ + struct file_desc_ops *ops; /* Associated operations */ + struct list_head fake_master_list;/* To chain in the list of file_desc, which don't + have a fle in a task, that having permissions */ +}; + +struct fdtype_ops { + unsigned int type; + int (*dump)(int lfd, u32 id, const struct fd_parms *p); + int (*pre_dump)(int pid, int lfd); +}; + 
+struct cr_img; + +extern int dump_my_file(int lfd, u32 *, int *type); +extern int do_dump_gen_file(struct fd_parms *p, int lfd, + const struct fdtype_ops *ops, + FdinfoEntry *e); +struct parasite_drain_fd; +int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, + struct parasite_drain_fd *dfds); +int predump_task_files(int pid); + +extern void file_desc_init(struct file_desc *d, u32 id, struct file_desc_ops *ops); +extern int file_desc_add(struct file_desc *d, u32 id, struct file_desc_ops *ops); +extern struct fdinfo_list_entry *try_file_master(struct file_desc *d); +extern struct fdinfo_list_entry *file_master(struct file_desc *d); +extern struct file_desc *find_file_desc_raw(int type, u32 id); + +extern int setup_and_serve_out(struct fdinfo_list_entry *fle, int new_fd); +extern int recv_desc_from_peer(struct file_desc *d, int *fd); +extern int send_desc_to_peer(int fd, struct file_desc *d); +extern int restore_fown(int fd, FownEntry *fown); +extern int rst_file_params(int fd, FownEntry *fown, int flags); + +extern void show_saved_files(void); + +extern int prepare_fds(struct pstree_item *me); +extern int prepare_fd_pid(struct pstree_item *me); +extern int prepare_files(void); +extern int restore_fs(struct pstree_item *); +extern int prepare_fs_pid(struct pstree_item *); +extern int set_fd_flags(int fd, int flags); + +extern struct collect_image_info files_cinfo; +#define files_collected() (files_cinfo.flags & COLLECT_HAPPENED) + +extern int close_old_fds(void); +#ifndef AT_EMPTY_PATH +#define AT_EMPTY_PATH 0x1000 +#endif + +#define LREMAP_PARAM "link-remap" + +extern int shared_fdt_prepare(struct pstree_item *item); + +extern struct collect_image_info ext_file_cinfo; +extern int dump_unsupp_fd(struct fd_parms *p, int lfd, + char *more, char *info, FdinfoEntry *); + +extern int inherit_fd_parse(char *optarg); +extern int inherit_fd_add(int fd, char *key); +extern void inherit_fd_log(void); +extern int inherit_fd_move_to_fdstore(void); + 
+extern int inherit_fd_lookup_id(char *id); + +extern bool inherited_fd(struct file_desc *, int *fdp); + +extern FdinfoEntry *dup_fdinfo(FdinfoEntry *old, int fd, unsigned flags); +int dup_fle(struct pstree_item *task, struct fdinfo_list_entry *ple, + int fd, unsigned flags); + +extern int open_transport_socket(void); +extern int set_fds_event(pid_t virt); +extern void wait_fds_event(void); + +#endif /* __CR_FILES_H__ */ diff --git a/CRIU_code/criu/include/filesystems.h b/CRIU_code/criu/include/filesystems.h new file mode 100644 index 0000000..bd79806 --- /dev/null +++ b/CRIU_code/criu/include/filesystems.h @@ -0,0 +1,33 @@ +#ifndef __CR_FILESYSTEMS_H__ +#define __CR_FILESYSTEMS_H__ +extern struct fstype *find_fstype_by_name(char *fst); +extern struct fstype *decode_fstype(u32 fst); +extern bool add_fsname_auto(const char *names); + +struct mount_info; +typedef int (*mount_fn_t)(struct mount_info *mi, const char *src, const + char *fstype, unsigned long mountflags); + +struct fstype { + char *name; + int code; + int (*dump)(struct mount_info *pm); + int (*restore)(struct mount_info *pm); + int (*check_bindmount)(struct mount_info *pm); + int (*parse)(struct mount_info *pm); + int (*collect)(struct mount_info *pm); + bool (*sb_equal)(struct mount_info *a, struct mount_info *b); + mount_fn_t mount; +}; + +extern struct fstype *fstype_auto(void); + +/* callback for AUFS support */ +extern int aufs_parse(struct mount_info *mi); + +/* callback for OverlayFS support */ +extern int overlayfs_parse(struct mount_info *mi); + +/* FIXME -- remove */ +extern struct list_head binfmt_misc_list; +#endif diff --git a/CRIU_code/criu/include/fs-magic.h b/CRIU_code/criu/include/fs-magic.h new file mode 100644 index 0000000..ced3377 --- /dev/null +++ b/CRIU_code/criu/include/fs-magic.h @@ -0,0 +1,56 @@ +#ifndef __CR_FS_MAGIC_H__ +#define __CR_FS_MAGIC_H__ + +#include + +/* + * Gather magic numbers in case if distros + * do not provide appropriate entry in + * linux/magic.h. 
+ */ + +#ifndef NFS_SUPER_MAGIC +# define NFS_SUPER_MAGIC 0x6969 +#endif + +#ifndef PIPEFS_MAGIC +# define PIPEFS_MAGIC 0x50495045 +#endif + +#ifndef ANON_INODE_FS_MAGIC +# define ANON_INODE_FS_MAGIC 0x09041934 +#endif + +#ifndef TMPFS_MAGIC +# define TMPFS_MAGIC 0x01021994 +#endif + +#ifndef SOCKFS_MAGIC +# define SOCKFS_MAGIC 0x534f434b +#endif + +#ifndef DEVPTS_SUPER_MAGIC +#define DEVPTS_SUPER_MAGIC 0x1cd1 +#endif + +#ifndef BTRFS_SUPER_MAGIC +#define BTRFS_SUPER_MAGIC 0x9123683E +#endif + +#ifndef AUFS_SUPER_MAGIC +#define AUFS_SUPER_MAGIC 0x61756673 +#endif + +#ifndef PROC_SUPER_MAGIC +#define PROC_SUPER_MAGIC 0x9fa0 +#endif + +#ifndef BINFMTFS_MAGIC +#define BINFMTFS_MAGIC 0x42494e4d +#endif + +#ifndef AUTOFS_SUPER_MAGIC +#define AUTOFS_SUPER_MAGIC 0x0187 +#endif + +#endif /* __CR_FS_MAGIC_H__ */ diff --git a/CRIU_code/criu/include/fsnotify.h b/CRIU_code/criu/include/fsnotify.h new file mode 100644 index 0000000..935dd60 --- /dev/null +++ b/CRIU_code/criu/include/fsnotify.h @@ -0,0 +1,24 @@ +#ifndef __CR_FSNOTIFY_H__ +#define __CR_FSNOTIFY_H__ + +#include "files.h" + +#include "protobuf.h" +#include "images/fsnotify.pb-c.h" + +#define KERNEL_FS_EVENT_ON_CHILD 0x08000000 + +#ifndef INOTIFY_IOC_SETNEXTWD +#define INOTIFY_IOC_SETNEXTWD _IOW('I', 0, __s32) +#endif + +extern int is_inotify_link(char *link); +extern int is_fanotify_link(char *link); +extern const struct fdtype_ops inotify_dump_ops; +extern const struct fdtype_ops fanotify_dump_ops; +extern struct collect_image_info inotify_cinfo; +extern struct collect_image_info inotify_mark_cinfo; +extern struct collect_image_info fanotify_cinfo; +extern struct collect_image_info fanotify_mark_cinfo; + +#endif /* __CR_FSNOTIFY_H__ */ diff --git a/CRIU_code/criu/include/image-desc.h b/CRIU_code/criu/include/image-desc.h new file mode 100644 index 0000000..3135f56 --- /dev/null +++ b/CRIU_code/criu/include/image-desc.h @@ -0,0 +1,124 @@ +#ifndef __CR_IMAGE_DESC_H__ +#define __CR_IMAGE_DESC_H__ + +#include "int.h" 
+ +enum { + CR_FD_INVENTORY, + CR_FD_STATS, + /* + * Task entries + */ + + _CR_FD_TASK_FROM, + CR_FD_CORE, + CR_FD_IDS, + CR_FD_MM, + CR_FD_CREDS, + CR_FD_FS, + _CR_FD_TASK_TO, + + CR_FD_PAGEMAP, + + /* + * NS entries + */ + CR_FD_UTSNS, + CR_FD_MNTS, + CR_FD_USERNS, + + _CR_FD_IPCNS_FROM, + CR_FD_IPC_VAR, + CR_FD_IPCNS_SHM, + CR_FD_IPCNS_MSG, + CR_FD_IPCNS_SEM, + _CR_FD_IPCNS_TO, + + _CR_FD_NETNS_FROM, + CR_FD_NETDEV, + CR_FD_IFADDR, + CR_FD_ROUTE, + CR_FD_ROUTE6, + CR_FD_RULE, + CR_FD_IPTABLES, + CR_FD_IP6TABLES, + CR_FD_NETNS, + CR_FD_NETNF_CT, + CR_FD_NETNF_EXP, + _CR_FD_NETNS_TO, + + CR_FD_PSTREE, + CR_FD_SHMEM_PAGEMAP, + CR_FD_GHOST_FILE, + CR_FD_TCP_STREAM, + CR_FD_FDINFO, + + _CR_FD_GLOB_FROM, + CR_FD_FILES, + CR_FD_SK_QUEUES, + CR_FD_PIPES_DATA, + CR_FD_FIFO_DATA, + CR_FD_TTY_INFO, + CR_FD_TTY_DATA, + CR_FD_REMAP_FPATH, + CR_FD_CGROUP, + CR_FD_FILE_LOCKS, + CR_FD_SECCOMP, + _CR_FD_GLOB_TO, + + CR_FD_TMPFS_IMG, + CR_FD_TMPFS_DEV, + CR_FD_BINFMT_MISC, + CR_FD_BINFMT_MISC_OLD, + CR_FD_PAGES, + + CR_FD_SIGACT, + CR_FD_VMAS, + CR_FD_PAGES_OLD, + CR_FD_SHM_PAGES_OLD, + CR_FD_RLIMIT, + CR_FD_ITIMERS, + CR_FD_POSIX_TIMERS, + CR_FD_FILE_LOCKS_PID, + + CR_FD_IRMAP_CACHE, + CR_FD_CPUINFO, + + CR_FD_SIGNAL, + CR_FD_PSIGNAL, + CR_FD_INOTIFY_WD, + CR_FD_FANOTIFY_MARK, + CR_FD_EVENTPOLL_TFD, + CR_FD_REG_FILES, + CR_FD_INETSK, + CR_FD_NS_FILES, + CR_FD_PACKETSK, + CR_FD_NETLINK_SK, + CR_FD_EVENTFD_FILE, + CR_FD_EVENTPOLL_FILE, + CR_FD_SIGNALFD, + CR_FD_TUNFILE, + CR_FD_TIMERFD, + CR_FD_INOTIFY_FILE, + CR_FD_FANOTIFY_FILE, + CR_FD_EXT_FILES, + CR_FD_UNIXSK, + CR_FD_FIFO, + CR_FD_PIPES, + CR_FD_TTY_FILES, + + CR_FD_AUTOFS, + + CR_FD_MAX +}; + +/* file descriptors template */ +struct cr_fd_desc_tmpl { + const char *fmt; /* format for the name */ + u32 magic; /* magic in the header */ + int oflags; /* flags for image_open */ +}; + +extern struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX]; + +#endif /* __CR_IMAGE_DESC_H__ */ diff --git a/CRIU_code/criu/include/image.h 
b/CRIU_code/criu/include/image.h new file mode 100644 index 0000000..2baa394 --- /dev/null +++ b/CRIU_code/criu/include/image.h @@ -0,0 +1,169 @@ +#ifndef __CR_IMAGE_H__ +#define __CR_IMAGE_H__ + +#include + +#include "common/compiler.h" +#include "servicefd.h" +#include "image-desc.h" +#include "fcntl.h" +#include "magic.h" +#include "bfd.h" +#include "log.h" +#include "common/bug.h" + +#define PAGE_RSS 1 +#define PAGE_ANON 2 + +/* + * Top bit set in the tgt id means we've remapped + * to a ghost file. + */ +#define REMAP_GHOST (1 << 31) + +/* + * VMA_AREA status: + * + * - none + * VmaEntry is just allocated and has not been used + * for anything yet + * - regular + * VmaEntry represent some memory area which should be + * dumped and restored; this is a general sign that we + * should not skip the area content from processing in + * comparison with special areas such as vsyscall + * - stack + * the memory area is used in application stack so we + * should be careful about guard page here + * - vsyscall + * special memory area injected into the task memory + * space by the kernel itself, represent virtual syscall + * implementation and it is specific to every kernel version, + * its contents should not be dumped ever + * - vdso,vvar + * the vDSO area, it might require additional memory + * contents modification especially when tasks are + * migrating between different kernel versions + * - heap + * "heap" area in application, currently for information only + * - file private + * stands for privately memory mapped files + * - file shared + * stands for shared memory mapped files + * - anon shared + * represent shared anonymous memory areas + * - anon private + * represent private anonymous memory areas + * - SysV IPC + * IPC shared memory area + * - socket + * memory map for socket + * - AIO ring + * memory area serves AIO buffers + * - unsupported + * stands for any unknown memory areas, usually means + * we don't know how to work with it and should stop + *
processing exiting with error; while the rest of bits + * are part of image ABI, this particular one must never + * be used in image. + */ +#define VMA_AREA_NONE (0 << 0) +#define VMA_AREA_REGULAR (1 << 0) +#define VMA_AREA_STACK (1 << 1) +#define VMA_AREA_VSYSCALL (1 << 2) +#define VMA_AREA_VDSO (1 << 3) +#define VMA_AREA_HEAP (1 << 5) + +#define VMA_FILE_PRIVATE (1 << 6) +#define VMA_FILE_SHARED (1 << 7) +#define VMA_ANON_SHARED (1 << 8) +#define VMA_ANON_PRIVATE (1 << 9) + +#define VMA_AREA_SYSVIPC (1 << 10) +#define VMA_AREA_SOCKET (1 << 11) +#define VMA_AREA_VVAR (1 << 12) +#define VMA_AREA_AIORING (1 << 13) + +#define VMA_CLOSE (1 << 28) +#define VMA_NO_PROT_WRITE (1 << 29) +#define VMA_PREMMAPED (1 << 30) +#define VMA_UNSUPP (1 << 31) + +#define CR_CAP_SIZE 2 + +#define TASK_COMM_LEN 16 + +#define CR_PARENT_LINK "parent" + +extern bool ns_per_id; +extern bool img_common_magic; + +#define O_NOBUF (O_DIRECT) +#define O_SERVICE (O_DIRECTORY) +#define O_DUMP (O_WRONLY | O_CREAT | O_TRUNC) +#define O_RSTR (O_RDONLY) +#define O_FORCE_LOCAL (O_SYNC) + +struct cr_img { + union { + struct bfd _x; + struct { + int fd; /* should be first to coincide with _x.fd */ + int type; + unsigned long oflags; + char *path; + }; + }; +}; + +#define EMPTY_IMG_FD (-404) +#define LAZY_IMG_FD (-505) + +static inline bool empty_image(struct cr_img *img) +{ + return img && img->_x.fd == EMPTY_IMG_FD; +} + +static inline bool lazy_image(struct cr_img *img) +{ + return img->_x.fd == LAZY_IMG_FD; +} + +extern int open_image_lazy(struct cr_img *img); + +static inline int img_raw_fd(struct cr_img *img) +{ + if (!img) + return -1; + if (lazy_image(img) && open_image_lazy(img)) + return -1; + + BUG_ON(bfd_buffered(&img->_x)); + return img->_x.fd; +} + +extern off_t img_raw_size(struct cr_img *img); + +extern int open_image_dir(char *dir); +extern void close_image_dir(void); + +extern struct cr_img *open_image_at(int dfd, int type, unsigned long flags, ...); +#define open_image(typ, flags, ...) 
open_image_at(-1, typ, flags, ##__VA_ARGS__) +extern int open_image_lazy(struct cr_img *img); +extern struct cr_img *open_pages_image(unsigned long flags, struct cr_img *pmi, u32 *pages_id); +extern struct cr_img *open_pages_image_at(int dfd, unsigned long flags, struct cr_img *pmi, u32 *pages_id); +extern void up_page_ids_base(void); + +extern struct cr_img *img_from_fd(int fd); /* for cr-show mostly */ + +extern int write_img_buf(struct cr_img *, const void *ptr, int size); +#define write_img(img, ptr) write_img_buf((img), (ptr), sizeof(*(ptr))) +extern int read_img_buf_eof(struct cr_img *, void *ptr, int size); +#define read_img_eof(img, ptr) read_img_buf_eof((img), (ptr), sizeof(*(ptr))) +extern int read_img_buf(struct cr_img *, void *ptr, int size); +#define read_img(img, ptr) read_img_buf((img), (ptr), sizeof(*(ptr))) +extern int read_img_str(struct cr_img *, char **pstr, int size); + +extern void close_image(struct cr_img *); + +#endif /* __CR_IMAGE_H__ */ diff --git a/CRIU_code/criu/include/img-remote.h b/CRIU_code/criu/include/img-remote.h new file mode 100644 index 0000000..66d75b9 --- /dev/null +++ b/CRIU_code/criu/include/img-remote.h @@ -0,0 +1,146 @@ +#include +#include + +#include +#include "common/list.h" +#include +#include + +#ifndef IMAGE_REMOTE_H +#define IMAGE_REMOTE_H + +#define FINISH 0 +#define PARENT_IMG "parent" +#define NULL_SNAPSHOT_ID 0 +#define DEFAULT_CACHE_SOCKET "img-cache.sock" +#define DEFAULT_PROXY_SOCKET "img-proxy.sock" + +#define DEFAULT_LISTEN 50 +#define BUF_SIZE 4096 + +struct rbuf { + char buffer[BUF_SIZE]; + int nbytes; /* How many bytes are in the buffer. */ + struct list_head l; +}; + +struct rimage { + /* Path and snapshot id identify the image. */ + char path[PATH_MAX]; + char snapshot_id[PATH_MAX]; + /* List anchor. */ + struct list_head l; + /* List of buffers that compose the image. */ + struct list_head buf_head; + /* Number of bytes. */ + uint64_t size; + /* Note: forward (send) operation only. 
Buffer to start forwarding. */ + struct rbuf *curr_fwd_buf; + /* Note: forward (send) operation only. Number of fwd bytes in 'curr_fwd_buf'. */ + uint64_t curr_fwd_bytes; +}; + +/* Structure that describes the state of a remote operation on remote images. */ +struct roperation { + /* List anchor. */ + struct list_head l; + /* File descriptor being used. */ + int fd; + /* Path and snapshot id identify the required image. */ + char path[PATH_MAX]; + char snapshot_id[PATH_MAX]; + /* Remote image being used (may be null if the operation is pending). */ + struct rimage *rimg; + /* Flags for the operation. */ + int flags; + /* If fd should be closed when the operation is done. */ + bool close_fd; + /* Note: recv operation only. How many bytes should be received. */ + uint64_t size; + /* Note: recv operation only. Buffer being written. */ + struct rbuf *curr_recv_buf; // TODO - needed? Could be replaced by list.last! + /* Note: send operation only. Pointer to buffer being sent. */ + struct rbuf *curr_sent_buf; + /* Note: send operation only. Number of bytes sent in 'curr_sent_buf'. */ + uint64_t curr_sent_bytes; +}; + +/* This is the proxy to cache TCP socket FD. */ +extern int remote_sk; +/* This is the unix socket used to fulfill local requests. */ +extern int local_sk; +/* True if we are running the cache/restore, false if proxy/dump. */ +extern bool restoring; + +void accept_image_connections(); +struct rimage *get_rimg_by_name(const char *snapshot_id, const char *path); + +int setup_UNIX_server_socket(char *path); + +/* Called by restore to get the fd corresponding to a particular path. This call + * will block until the connection is received. + */ +int read_remote_image_connection(char *snapshot_id, char *path); + +/* Called by dump to create a socket connection to the restore side. The socket + * fd is returned for further writing operations.
+ */ +int write_remote_image_connection(char *snapshot_id, char *path, int flags); + +/* Called by dump/restore when everything is dumped/restored. This function + * creates a new connection with a special control name. The receiver side uses + * it to ack that no more files are coming. + */ +int finish_remote_dump(); +int finish_remote_restore(); + +/* Starts an image proxy daemon (dump side). It receives image files through + * socket connections and forwards them to the image cache (restore side). + */ +int image_proxy(bool background, char *local_proxy_path); + +/* Starts an image cache daemon (restore side). It receives image files through + * socket connections and caches them until they are requested by the restore + * process. + */ +int image_cache(bool background, char *local_cache_path); + +/* Reads (discards) 'len' bytes from fd. This is used to emulate the function + * lseek, which is used to advance the file needle. + */ +int skip_remote_bytes(int fd, unsigned long len); + +/* To support iterative migration, the concept of snapshot_id is introduced + * (only when remote migration is enabled). Each image is tagged with one + * snapshot_id. The snapshot_id is the image directory used for the operation + * that creates the image (either predump or dump). Images stored in memory + * (both in Image Proxy and Image Cache) are identified by their name and + * snapshot_id. Snapshot_ids are ordered so that we can find parent pagemaps + * (that will be used when restoring the process). + */ + +/* Sets the current snapshot_id */ +void init_snapshot_id(char *ns); + +/* Returns the current snapshot_id. */ +char *get_curr_snapshot_id(); + +/* Returns the snapshot_id index representing the current snapshot_id. This + * index represents the hierarchy position. For example: images tagged with + * the snapshot_id with index 1 are more recent than the images tagged with + * the snapshot_id with index 0. 
+ */ +int get_curr_snapshot_id_idx(); + +/* Returns the snapshot_id associated with the snapshot_id index. */ +char *get_snapshot_id_from_idx(int idx); + +/* Pushes the current snapshot_id into the snapshot_id hierarchy (into the Image + * Proxy and Image Cache). + */ +int push_snapshot_id(); + +/* Returns the snapshot id index that precedes the current snapshot_id. */ +int get_curr_parent_snapshot_id_idx(); + +#endif diff --git a/CRIU_code/criu/include/imgset.h b/CRIU_code/criu/include/imgset.h new file mode 100644 index 0000000..02ad169 --- /dev/null +++ b/CRIU_code/criu/include/imgset.h @@ -0,0 +1,38 @@ +#ifndef __CR_IMGSET_H__ +#define __CR_IMGSET_H__ + +#include "image-desc.h" +#include "log.h" +#include "common/bug.h" +#include "image.h" + +struct cr_imgset { + int fd_off; + int fd_nr; + struct cr_img **_imgs; +}; + +static inline struct cr_img *img_from_set(const struct cr_imgset *imgset, int type) +{ + int idx; + + idx = type - imgset->fd_off; + BUG_ON(idx > imgset->fd_nr); + + return imgset->_imgs[idx]; +} + +extern struct cr_imgset *glob_imgset; + +extern struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX]; + +extern struct cr_imgset *cr_task_imgset_open(int pid, int mode); +extern struct cr_imgset *cr_imgset_open_range(int pid, int from, int to, + unsigned long flags); +#define cr_imgset_open(pid, type, flags) cr_imgset_open_range(pid, \ + _CR_FD_##type##_FROM, _CR_FD_##type##_TO, flags) +extern struct cr_imgset *cr_glob_imgset_open(int mode); + +extern void close_cr_imgset(struct cr_imgset **cr_imgset); + +#endif /* __CR_IMGSET_H__ */ diff --git a/CRIU_code/criu/include/inet_diag.h b/CRIU_code/criu/include/inet_diag.h new file mode 100644 index 0000000..95be2c1 --- /dev/null +++ b/CRIU_code/criu/include/inet_diag.h @@ -0,0 +1,136 @@ +#ifndef __CR_INET_DIAG_H__ +#define __CR_INET_DIAG_H__ + +#include + +/* Just some random number */ +#define TCPDIAG_GETSOCK 18 +#define DCCPDIAG_GETSOCK 19 + +#define INET_DIAG_GETSOCK_MAX 24 + +/* Socket identity */ +struct 
inet_diag_sockid { + __be16 idiag_sport; + __be16 idiag_dport; + __be32 idiag_src[4]; + __be32 idiag_dst[4]; + __u32 idiag_if; + __u32 idiag_cookie[2]; +#define INET_DIAG_NOCOOKIE (~0U) +}; + +/* Request structure */ + +struct inet_diag_req_compat { + __u8 idiag_family; /* Family of addresses. */ + __u8 idiag_src_len; + __u8 idiag_dst_len; + __u8 idiag_ext; /* Query extended information */ + + struct inet_diag_sockid id; + + __u32 idiag_states; /* States to dump */ + __u32 idiag_dbs; /* Tables to dump (NI) */ +}; + +struct inet_diag_req_v2 { + __u8 sdiag_family; + __u8 sdiag_protocol; + __u8 idiag_ext; + __u8 pad; + __u32 idiag_states; + struct inet_diag_sockid id; +}; + +enum { + INET_DIAG_REQ_NONE, + INET_DIAG_REQ_BYTECODE, +}; + +#define INET_DIAG_REQ_MAX INET_DIAG_REQ_BYTECODE + +/* Bytecode is sequence of 4 byte commands followed by variable arguments. + * All the commands identified by "code" are conditional jumps forward: + * to offset cc+"yes" or to offset cc+"no". "yes" is supposed to be + * length of the command and its arguments. + */ + +struct inet_diag_bc_op { + unsigned char code; + unsigned char yes; + unsigned short no; +}; + +enum { + INET_DIAG_BC_NOP, + INET_DIAG_BC_JMP, + INET_DIAG_BC_S_GE, + INET_DIAG_BC_S_LE, + INET_DIAG_BC_D_GE, + INET_DIAG_BC_D_LE, + INET_DIAG_BC_AUTO, + INET_DIAG_BC_S_COND, + INET_DIAG_BC_D_COND, +}; + +struct inet_diag_hostcond { + __u8 family; + __u8 prefix_len; + int port; + __be32 addr[0]; +}; + +/* Base info structure. It contains socket identity (addrs/ports/cookie) + * and, alas, the information shown by netstat. 
*/ +struct inet_diag_msg { + __u8 idiag_family; + __u8 idiag_state; + __u8 idiag_timer; + __u8 idiag_retrans; + + struct inet_diag_sockid id; + + __u32 idiag_expires; + __u32 idiag_rqueue; + __u32 idiag_wqueue; + __u32 idiag_uid; + __u32 idiag_inode; +}; + +/* Extensions */ + +enum { + INET_DIAG_NONE, + INET_DIAG_MEMINFO, + INET_DIAG_INFO, + INET_DIAG_VEGASINFO, + INET_DIAG_CONG, + INET_DIAG_TOS, + INET_DIAG_TCLASS, + INET_DIAG_SKMEMINFO, + INET_DIAG_SHUTDOWN, +}; + +#define INET_DIAG_MAX INET_DIAG_SHUTDOWN + + +/* INET_DIAG_MEM */ + +struct inet_diag_meminfo { + __u32 idiag_rmem; + __u32 idiag_wmem; + __u32 idiag_fmem; + __u32 idiag_tmem; +}; + +/* INET_DIAG_VEGASINFO */ + +struct tcpvegas_info { + __u32 tcpv_enabled; + __u32 tcpv_rttcnt; + __u32 tcpv_rtt; + __u32 tcpv_minrtt; +}; + +#endif /* __CR_INET_DIAG_H__ */ diff --git a/CRIU_code/criu/include/infect-pie.h b/CRIU_code/criu/include/infect-pie.h new file mode 100644 index 0000000..b00d7dd --- /dev/null +++ b/CRIU_code/criu/include/infect-pie.h @@ -0,0 +1,7 @@ +#ifndef __CR_INFECT_PIE_H__ +#define __CR_INFECT_PIE_H__ +extern int parasite_daemon_cmd(int cmd, void *args); +extern int parasite_trap_cmd(int cmd, void *args); +extern void parasite_cleanup(void); +extern int parasite_get_rpc_sock(void); +#endif diff --git a/CRIU_code/criu/include/int.h b/CRIU_code/criu/include/int.h new file mode 100644 index 0000000..5776ab6 --- /dev/null +++ b/CRIU_code/criu/include/int.h @@ -0,0 +1,4 @@ +#ifndef __CR_INC_INT_H__ +#define __CR_INC_INT_H__ +#include "asm/int.h" +#endif diff --git a/CRIU_code/criu/include/ipc_ns.h b/CRIU_code/criu/include/ipc_ns.h new file mode 100644 index 0000000..c890989 --- /dev/null +++ b/CRIU_code/criu/include/ipc_ns.h @@ -0,0 +1,9 @@ +#ifndef __CR_IPC_NS_H__ +#define __CR_IPC_NS_H__ + +extern int dump_ipc_ns(int ns_id); +extern int prepare_ipc_ns(int pid); + +extern struct ns_desc ipc_ns_desc; + +#endif /* __CR_IPC_NS_H__ */ diff --git a/CRIU_code/criu/include/irmap.h 
b/CRIU_code/criu/include/irmap.h new file mode 100644 index 0000000..033f71e --- /dev/null +++ b/CRIU_code/criu/include/irmap.h @@ -0,0 +1,13 @@ +#ifndef __CR_IRMAP__H__ +#define __CR_IRMAP__H__ +char *irmap_lookup(unsigned int s_dev, unsigned long i_ino); +struct _FhEntry; +int irmap_queue_cache(unsigned int dev, unsigned long ino, + struct _FhEntry *fh); +int irmap_predump_prep(void); +int irmap_predump_run(void); +int check_open_handle(unsigned int s_dev, unsigned long i_ino, + struct _FhEntry *f_handle); +int irmap_load_cache(void); +int irmap_scan_path_add(char *path); +#endif diff --git a/CRIU_code/criu/include/kcmp-ids.h b/CRIU_code/criu/include/kcmp-ids.h new file mode 100644 index 0000000..a37622c --- /dev/null +++ b/CRIU_code/criu/include/kcmp-ids.h @@ -0,0 +1,36 @@ +#ifndef __CR_KCMP_IDS_H__ +#define __CR_KCMP_IDS_H__ + +#include +#include + +#include "kcmp.h" + +struct kid_tree { + struct rb_root root; + unsigned int kcmp_type; + unsigned long subid; + +}; + +#define DECLARE_KCMP_TREE(name, type) \ + struct kid_tree name = { \ + .root = RB_ROOT, \ + .kcmp_type = type, \ + .subid = 1, \ + } + +struct kid_elem { + pid_t pid; + unsigned int genid; + unsigned int idx; +}; + +extern uint32_t kid_generate_gen(struct kid_tree *tree, + struct kid_elem *elem, int *new_id); + +extern struct kid_elem *kid_lookup_epoll_tfd(struct kid_tree *tree, + struct kid_elem *elem, + kcmp_epoll_slot_t *slot); + +#endif /* __CR_KCMP_IDS_H__ */ diff --git a/CRIU_code/criu/include/kcmp.h b/CRIU_code/criu/include/kcmp.h new file mode 100644 index 0000000..f1c898d --- /dev/null +++ b/CRIU_code/criu/include/kcmp.h @@ -0,0 +1,26 @@ +#ifndef __CR_KCMP_H__ +#define __CR_KCMP_H__ + +#include + +enum kcmp_type { + KCMP_FILE, + KCMP_VM, + KCMP_FILES, + KCMP_FS, + KCMP_SIGHAND, + KCMP_IO, + KCMP_SYSVSEM, + KCMP_EPOLL_TFD, + + KCMP_TYPES, +}; + +/* Slot for KCMP_EPOLL_TFD */ +typedef struct { + uint32_t efd; /* epoll file descriptor */ + uint32_t tfd; /* target file number */ + uint32_t 
toff; /* target offset within same numbered sequence */ +} kcmp_epoll_slot_t; + +#endif /* __CR_KCMP_H__ */ diff --git a/CRIU_code/criu/include/kerndat.h b/CRIU_code/criu/include/kerndat.h new file mode 100644 index 0000000..75e2130 --- /dev/null +++ b/CRIU_code/criu/include/kerndat.h @@ -0,0 +1,96 @@ +#ifndef __CR_KERNDAT_H__ +#define __CR_KERNDAT_H__ + +#include +#include "int.h" +#include "common/config.h" +#include "asm/kerndat.h" +#include "util-vdso.h" + +struct stat; + +/* + * kerndat stands for "kernel data" and is a collection + * of run-time information about current kernel + */ + +extern int kerndat_init(void); +extern int kerndat_get_dirty_track(void); +extern int kerndat_fdinfo_has_lock(void); +extern int kerndat_loginuid(void); +extern int kerndat_files_stat(bool early); + +enum pagemap_func { + PM_UNKNOWN, + PM_DISABLED, /* /proc/pid/pagemap doesn't open (user mode) */ + PM_FLAGS_ONLY, /* pagemap zeroes pfn part (user mode) */ + PM_FULL, +}; + +enum loginuid_func { + LUID_NONE, + LUID_READ, + LUID_FULL, +}; + +struct kerndat_s { + u32 magic1, magic2; + dev_t shmem_dev; + int last_cap; + u64 zero_page_pfn; + bool has_dirty_track; + bool has_memfd; + bool has_fdinfo_lock; + unsigned long task_size; + bool ipv6; + enum loginuid_func luid; + bool compat_cr; + bool sk_ns; + bool sk_unix_file; + bool tun_ns; + enum pagemap_func pmap; + unsigned int has_xtlocks; + unsigned long mmap_min_addr; + bool has_tcp_half_closed; + bool stack_guard_gap_hidden; + int lsm; + bool has_uffd; + unsigned long uffd_features; + bool has_thp_disable; + bool can_map_vdso; + bool vdso_hint_reliable; + struct vdso_symtable vdso_sym; +#ifdef CONFIG_COMPAT + struct vdso_symtable vdso_sym_compat; +#endif + bool has_nsid; + bool has_link_nsid; + unsigned int sysctl_nr_open; + unsigned long files_stat_max_files; + bool x86_has_ptrace_fpu_xsave_bug; + bool has_inotify_setnextwd; + bool has_kcmp_epoll_tfd; +}; + +extern struct kerndat_s kdat; + +enum { + KERNDAT_FS_STAT_DEVPTS, + 
KERNDAT_FS_STAT_DEVTMPFS, + KERNDAT_FS_STAT_BINFMT_MISC, + + KERNDAT_FS_STAT_MAX +}; + +/* + * Check whether the fs @which with kdevice @kdev + * is the same as host's. If yes, this means that + * the fs mount is shared with host, if no -- it's + * a new (likely virtualized) fs instance. + */ +extern int kerndat_fs_virtualized(unsigned int which, u32 kdev); + +extern int kerndat_tcp_repair(); +extern int kerndat_uffd(void); + +#endif /* __CR_KERNDAT_H__ */ diff --git a/CRIU_code/criu/include/libnetlink.h b/CRIU_code/criu/include/libnetlink.h new file mode 100644 index 0000000..f21a0e7 --- /dev/null +++ b/CRIU_code/criu/include/libnetlink.h @@ -0,0 +1,24 @@ +#ifndef __CR_LIBNETLINK_H__ +#define __CR_LIBNETLINK_H__ + +#define CR_NLMSG_SEQ 24680 /* arbitrarily chosen */ + +struct ns_id; +extern int do_rtnl_req(int nl, void *req, int size, + int (*receive_callback)(struct nlmsghdr *h, struct ns_id *ns, void *), + int (*error_callback)(int err, struct ns_id *ns, void *), struct ns_id *ns, void *); + +extern int addattr_l(struct nlmsghdr *n, int maxlen, int type, + const void *data, int alen); + +extern int32_t nla_get_s32(const struct nlattr *nla); + +#define NLMSG_TAIL(nmsg) \ + ((struct rtattr *) (((void *) (nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len))) + +#ifndef NETNS_RTA +#define NETNS_RTA(r) \ + ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct rtgenmsg)))) +#endif + +#endif /* __CR_LIBNETLINK_H__ */ diff --git a/CRIU_code/criu/include/linux/userfaultfd.h b/CRIU_code/criu/include/linux/userfaultfd.h new file mode 100644 index 0000000..3b05953 --- /dev/null +++ b/CRIU_code/criu/include/linux/userfaultfd.h @@ -0,0 +1,219 @@ +/* + * include/linux/userfaultfd.h + * + * Copyright (C) 2007 Davide Libenzi + * Copyright (C) 2015 Red Hat, Inc. + * + */ + +#ifndef _LINUX_USERFAULTFD_H +#define _LINUX_USERFAULTFD_H + +#include + +/* + * If the UFFDIO_API is upgraded someday, the UFFDIO_UNREGISTER and + * UFFDIO_WAKE ioctls should be defined as _IOW and not as _IOR.
In + * userfaultfd.h we assumed the kernel was reading (instead _IOC_READ + * means the userland is reading). + */ +#define UFFD_API ((__u64)0xAA) +#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ + UFFD_FEATURE_EVENT_REMAP | \ + UFFD_FEATURE_EVENT_REMOVE | \ + UFFD_FEATURE_EVENT_UNMAP | \ + UFFD_FEATURE_MISSING_HUGETLBFS | \ + UFFD_FEATURE_MISSING_SHMEM) +#define UFFD_API_IOCTLS \ + ((__u64)1 << _UFFDIO_REGISTER | \ + (__u64)1 << _UFFDIO_UNREGISTER | \ + (__u64)1 << _UFFDIO_API) +#define UFFD_API_RANGE_IOCTLS \ + ((__u64)1 << _UFFDIO_WAKE | \ + (__u64)1 << _UFFDIO_COPY | \ + (__u64)1 << _UFFDIO_ZEROPAGE) +#define UFFD_API_RANGE_IOCTLS_BASIC \ + ((__u64)1 << _UFFDIO_WAKE | \ + (__u64)1 << _UFFDIO_COPY) + +/* + * Valid ioctl command number range with this API is from 0x00 to + * 0x3F. UFFDIO_API is the fixed number, everything else can be + * changed by implementing a different UFFD_API. If sticking to the + * same UFFD_API more ioctl can be added and userland will be aware of + * which ioctl the running kernel implements through the ioctl command + * bitmask written by the UFFDIO_API. 
+ */ +#define _UFFDIO_REGISTER (0x00) +#define _UFFDIO_UNREGISTER (0x01) +#define _UFFDIO_WAKE (0x02) +#define _UFFDIO_COPY (0x03) +#define _UFFDIO_ZEROPAGE (0x04) +#define _UFFDIO_API (0x3F) + +/* userfaultfd ioctl ids */ +#define UFFDIO 0xAA +#define UFFDIO_API _IOWR(UFFDIO, _UFFDIO_API, \ + struct uffdio_api) +#define UFFDIO_REGISTER _IOWR(UFFDIO, _UFFDIO_REGISTER, \ + struct uffdio_register) +#define UFFDIO_UNREGISTER _IOR(UFFDIO, _UFFDIO_UNREGISTER, \ + struct uffdio_range) +#define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, \ + struct uffdio_range) +#define UFFDIO_COPY _IOWR(UFFDIO, _UFFDIO_COPY, \ + struct uffdio_copy) +#define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \ + struct uffdio_zeropage) + +/* read() structure */ +struct uffd_msg { + __u8 event; + + __u8 reserved1; + __u16 reserved2; + __u32 reserved3; + + union { + struct { + __u64 flags; + __u64 address; + } pagefault; + + struct { + __u32 ufd; + } fork; + + struct { + __u64 from; + __u64 to; + __u64 len; + } remap; + + struct { + __u64 start; + __u64 end; + } remove; + + struct { + /* unused reserved fields */ + __u64 reserved1; + __u64 reserved2; + __u64 reserved3; + } reserved; + } arg; +} __packed; + +/* + * Start at 0x12 and not at 0 to be more strict against bugs. + */ +#define UFFD_EVENT_PAGEFAULT 0x12 +#define UFFD_EVENT_FORK 0x13 +#define UFFD_EVENT_REMAP 0x14 +#define UFFD_EVENT_REMOVE 0x15 +#define UFFD_EVENT_UNMAP 0x16 + +/* flags for UFFD_EVENT_PAGEFAULT */ +#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ +#define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */ + +struct uffdio_api { + /* userland asks for an API number and the features to enable */ + __u64 api; + /* + * Kernel answers below with the all available features for + * the API, this notifies userland of which events and/or + * which flags for each event are enabled in the current + * kernel. 
+ * + * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE + * are to be considered implicitly always enabled in all kernels as + * long as the uffdio_api.api requested matches UFFD_API. + * + * UFFD_FEATURE_MISSING_HUGETLBFS means an UFFDIO_REGISTER + * with UFFDIO_REGISTER_MODE_MISSING mode will succeed on + * hugetlbfs virtual memory ranges. Adding or not adding + * UFFD_FEATURE_MISSING_HUGETLBFS to uffdio_api.features has + * no real functional effect after UFFDIO_API returns, but + * it's only useful for an initial feature set probe at + * UFFDIO_API time. There are two ways to use it: + * + * 1) by adding UFFD_FEATURE_MISSING_HUGETLBFS to the + * uffdio_api.features before calling UFFDIO_API, an error + * will be returned by UFFDIO_API on a kernel without + * hugetlbfs missing support + * + * 2) the UFFD_FEATURE_MISSING_HUGETLBFS can not be added in + * uffdio_api.features and instead it will be set by the + * kernel in the uffdio_api.features if the kernel supports + * it, so userland can later check if the feature flag is + * present in uffdio_api.features after UFFDIO_API + * succeeded. + * + * UFFD_FEATURE_MISSING_SHMEM works the same as + * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem + * (i.e. tmpfs and other shmem based APIs). + */ +#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) +#define UFFD_FEATURE_EVENT_FORK (1<<1) +#define UFFD_FEATURE_EVENT_REMAP (1<<2) +#define UFFD_FEATURE_EVENT_REMOVE (1<<3) +#define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) +#define UFFD_FEATURE_MISSING_SHMEM (1<<5) +#define UFFD_FEATURE_EVENT_UNMAP (1<<6) + __u64 features; + + __u64 ioctls; +}; + +struct uffdio_range { + __u64 start; + __u64 len; +}; + +struct uffdio_register { + struct uffdio_range range; +#define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0) +#define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1) + __u64 mode; + + /* + * kernel answers which ioctl commands are available for the + * range, keep at the end as the last 8 bytes aren't read. 
+ */ + __u64 ioctls; +}; + +struct uffdio_copy { + __u64 dst; + __u64 src; + __u64 len; + /* + * There will be a wrprotection flag later that allows to map + * pages wrprotected on the fly. And such a flag will be + * available if the wrprotection ioctl are implemented for the + * range according to the uffdio_register.ioctls. + */ +#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0) + __u64 mode; + + /* + * "copy" is written by the ioctl and must be at the end: the + * copy_from_user will not read the last 8 bytes. + */ + __s64 copy; +}; + +struct uffdio_zeropage { + struct uffdio_range range; +#define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0) + __u64 mode; + + /* + * "zeropage" is written by the ioctl and must be at the end: + * the copy_from_user will not read the last 8 bytes. + */ + __s64 zeropage; +}; + +#endif /* _LINUX_USERFAULTFD_H */ diff --git a/CRIU_code/criu/include/log.h b/CRIU_code/criu/include/log.h new file mode 100644 index 0000000..15787b0 --- /dev/null +++ b/CRIU_code/criu/include/log.h @@ -0,0 +1,81 @@ +#ifndef __CR_LOG_H__ +#define __CR_LOG_H__ + +#include + +#ifndef CR_NOGLIBC + +#include +#include +#include + +extern void vprint_on_level(unsigned int loglevel, const char *format, + va_list params); + +#endif /* CR_NOGLIBC */ + +#define LOG_UNSET (-1) +#define LOG_MSG (0) /* Print message regardless of log level */ +#define LOG_ERROR (1) /* Errors only, when we're in trouble */ +#define LOG_WARN (2) /* Warnings, dazed and confused but trying to continue */ +#define LOG_INFO (3) /* Informative, everything is fine */ +#define LOG_DEBUG (4) /* Debug only */ + +#define DEFAULT_LOGLEVEL LOG_WARN + +extern void print_on_level(unsigned int loglevel, const char *format, ...) + __attribute__ ((__format__ (__printf__, 2, 3))); + +#ifndef LOG_PREFIX +# define LOG_PREFIX +#endif + +void flush_early_log_buffer(int fd); + +#define print_once(loglevel, fmt, ...) 
\ + do { \ + static bool __printed; \ + if (!__printed) { \ + print_on_level(loglevel, fmt, ##__VA_ARGS__); \ + __printed = 1; \ + } \ + } while (0) + +#define pr_msg(fmt, ...) \ + print_on_level(LOG_MSG, \ + fmt, ##__VA_ARGS__) + +#define pr_info(fmt, ...) \ + print_on_level(LOG_INFO, \ + LOG_PREFIX fmt, ##__VA_ARGS__) + +#define pr_err(fmt, ...) \ + print_on_level(LOG_ERROR, \ + "Error (%s:%d): " LOG_PREFIX fmt, \ + __FILE__, __LINE__, ##__VA_ARGS__) + +#define pr_err_once(fmt, ...) \ + print_once(LOG_ERROR, fmt, ##__VA_ARGS__) + +#define pr_warn(fmt, ...) \ + print_on_level(LOG_WARN, \ + "Warn (%s:%d): " LOG_PREFIX fmt, \ + __FILE__, __LINE__, ##__VA_ARGS__) + +#define pr_warn_once(fmt, ...) \ + print_once(LOG_WARN, \ + "Warn (%s:%d): " LOG_PREFIX fmt, \ + __FILE__, __LINE__, ##__VA_ARGS__) + +#define pr_debug(fmt, ...) \ + print_on_level(LOG_DEBUG, \ + LOG_PREFIX fmt, ##__VA_ARGS__) + +#ifndef CR_NOGLIBC + +#define pr_perror(fmt, ...) \ + pr_err(fmt ": %s\n", ##__VA_ARGS__, strerror(errno)) + +#endif /* CR_NOGLIBC */ + +#endif /* __CR_LOG_H__ */ diff --git a/CRIU_code/criu/include/lsm.h b/CRIU_code/criu/include/lsm.h new file mode 100644 index 0000000..3b82712 --- /dev/null +++ b/CRIU_code/criu/include/lsm.h @@ -0,0 +1,55 @@ +#ifndef __CR_LSM_H__ +#define __CR_LSM_H__ + +#include "images/inventory.pb-c.h" +#include "images/creds.pb-c.h" +#include "images/fdinfo.pb-c.h" + +#define AA_SECURITYFS_PATH "/sys/kernel/security/apparmor" + +/* + * Get the Lsmtype for the current host. + */ +extern Lsmtype host_lsm_type(void); + +/* + * Initialize the Lsmtype for the current host + */ +extern void kerndat_lsm(void); + +/* + * Read the LSM profile for the pstree item + */ +extern int collect_lsm_profile(pid_t, CredsEntry *); + +/* + * Validate that the LSM profiles can be correctly applied (must happen after + * pstree is set up). + */ +int validate_lsm(char *profile); + +/* + * Render the profile name in the way that the LSM wants it written to + * /proc/PID/attr/current. 
+ */ +int render_lsm_profile(char *profile, char **val); + +extern int lsm_check_opts(void); + +#ifdef CONFIG_HAS_SELINUX +int dump_xattr_security_selinux(int fd, FdinfoEntry *e); +int run_setsockcreatecon(FdinfoEntry *e); +int reset_setsockcreatecon(); +#else +static inline int dump_xattr_security_selinux(int fd, FdinfoEntry *e) { + return 0; +} +static inline int run_setsockcreatecon(FdinfoEntry *e) { + return 0; +} +static inline int reset_setsockcreatecon() { + return 0; +} +#endif + +#endif /* __CR_LSM_H__ */ diff --git a/CRIU_code/criu/include/magic.h b/CRIU_code/criu/include/magic.h new file mode 100644 index 0000000..05101f4 --- /dev/null +++ b/CRIU_code/criu/include/magic.h @@ -0,0 +1,125 @@ +#ifndef __CR_MAGIC_H__ +#define __CR_MAGIC_H__ + +/* + * Basic multi-file images + */ + +#define CRTOOLS_IMAGES_V1 1 +/* + * v1.1 has common magic in the head of each image file, + * except for inventory + */ +#define CRTOOLS_IMAGES_V1_1 2 + +/* + * Raw images are images in which data is stored in some + * non-crtool format (ip tool dumps, tarballs, etc.) + */ + +#define RAW_IMAGE_MAGIC 0x0 + +/* + * Images have the IMG_COMMON_MAGIC in the head. Service files + * such as stats and irmap-cache have the IMG_SERVICE_MAGIC. + */ + +#define IMG_COMMON_MAGIC 0x54564319 /* Sarov (a.k.a. Arzamas-16) */ +#define IMG_SERVICE_MAGIC 0x55105940 /* Zlatoust */ + +/* + * The magic-s below correspond to coordinates + * of various Russian towns in the NNNNEEEE form. 
+ */ + +#define INVENTORY_MAGIC 0x58313116 /* Veliky Novgorod */ +#define PSTREE_MAGIC 0x50273030 /* Kyiv */ +#define FDINFO_MAGIC 0x56213732 /* Dmitrov */ +#define PAGEMAP_MAGIC 0x56084025 /* Vladimir */ +#define SHMEM_PAGEMAP_MAGIC PAGEMAP_MAGIC +#define PAGES_MAGIC RAW_IMAGE_MAGIC +#define CORE_MAGIC 0x55053847 /* Kolomna */ +#define IDS_MAGIC 0x54432030 /* Konigsberg */ +#define VMAS_MAGIC 0x54123737 /* Tula */ +#define PIPES_MAGIC 0x56513555 /* Tver */ +#define PIPES_DATA_MAGIC 0x56453709 /* Dubna */ +#define FIFO_MAGIC 0x58364939 /* Kirov */ +#define FIFO_DATA_MAGIC 0x59333054 /* Tosno */ +#define SIGACT_MAGIC 0x55344201 /* Murom */ +#define UNIXSK_MAGIC 0x54373943 /* Ryazan */ +#define INETSK_MAGIC 0x56443851 /* Pereslavl */ +#define PACKETSK_MAGIC 0x60454618 /* Veliky Ustyug */ +#define ITIMERS_MAGIC 0x57464056 /* Kostroma */ +#define POSIX_TIMERS_MAGIC 0x52603957 /* Lipetsk */ +#define SK_QUEUES_MAGIC 0x56264026 /* Suzdal */ +#define UTSNS_MAGIC 0x54473203 /* Smolensk */ +#define CREDS_MAGIC 0x54023547 /* Kozelsk */ +#define IPC_VAR_MAGIC 0x53115007 /* Samara */ +#define IPCNS_SHM_MAGIC 0x46283044 /* Odessa */ +#define IPCNS_MSG_MAGIC 0x55453737 /* Moscow */ +#define IPCNS_SEM_MAGIC 0x59573019 /* St. 
Petersburg */ +#define REG_FILES_MAGIC 0x50363636 /* Belgorod */ +#define EXT_FILES_MAGIC 0x59255641 /* Usolye */ +#define FS_MAGIC 0x51403912 /* Voronezh */ +#define MM_MAGIC 0x57492820 /* Pskov */ +#define REMAP_FPATH_MAGIC 0x59133954 /* Vologda */ +#define GHOST_FILE_MAGIC 0x52583605 /* Oryol */ +#define TCP_STREAM_MAGIC 0x51465506 /* Orenburg */ +#define EVENTFD_FILE_MAGIC 0x44523722 /* Anapa */ +#define EVENTPOLL_FILE_MAGIC 0x45023858 /* Krasnodar */ +#define EVENTPOLL_TFD_MAGIC 0x44433746 /* Novorossiysk */ +#define SIGNALFD_MAGIC 0x57323820 /* Uglich */ +#define INOTIFY_FILE_MAGIC 0x48424431 /* Volgograd */ +#define INOTIFY_WD_MAGIC 0x54562009 /* Svetlogorsk (Rauschen) */ +#define MNTS_MAGIC 0x55563928 /* Petushki */ +#define NETDEV_MAGIC 0x57373951 /* Yaroslavl */ +#define NETNS_MAGIC 0x55933752 /* Dolgoprudny */ +#define TTY_FILES_MAGIC 0x59433025 /* Pushkin */ +#define TTY_INFO_MAGIC 0x59453036 /* Kolpino */ +#define TTY_DATA_MAGIC 0x59413026 /* Pavlovsk */ +#define FILE_LOCKS_MAGIC 0x54323616 /* Kaluga */ +#define RLIMIT_MAGIC 0x57113925 /* Rostov */ +#define FANOTIFY_FILE_MAGIC 0x55096122 /* Chelyabinsk */ +#define FANOTIFY_MARK_MAGIC 0x56506035 /* Yekaterinburg */ +#define SIGNAL_MAGIC 0x59255647 /* Berezniki */ +#define PSIGNAL_MAGIC SIGNAL_MAGIC +#define NETLINK_SK_MAGIC 0x58005614 /* Perm */ +#define NS_FILES_MAGIC 0x61394011 /* Nyandoma */ +#define TUNFILE_MAGIC 0x57143751 /* Kalyazin */ +#define CGROUP_MAGIC 0x59383330 /* Tikhvin */ +#define TIMERFD_MAGIC 0x50493712 /* Korocha */ +#define CPUINFO_MAGIC 0x61404013 /* Nyandoma */ +#define USERNS_MAGIC 0x55474906 /* Kazan */ +#define SECCOMP_MAGIC 0x64413049 /* Kostomuksha */ +#define BINFMT_MISC_MAGIC 0x67343323 /* Apatity */ +#define AUTOFS_MAGIC 0x49353943 /* Sochi */ +#define FILES_MAGIC 0x56303138 /* Toropets */ + +#define IFADDR_MAGIC RAW_IMAGE_MAGIC +#define ROUTE_MAGIC RAW_IMAGE_MAGIC +#define ROUTE6_MAGIC RAW_IMAGE_MAGIC +#define RULE_MAGIC RAW_IMAGE_MAGIC +#define TMPFS_IMG_MAGIC 
RAW_IMAGE_MAGIC +#define TMPFS_DEV_MAGIC RAW_IMAGE_MAGIC +#define IPTABLES_MAGIC RAW_IMAGE_MAGIC +#define IP6TABLES_MAGIC RAW_IMAGE_MAGIC +#define NETNF_CT_MAGIC RAW_IMAGE_MAGIC +#define NETNF_EXP_MAGIC RAW_IMAGE_MAGIC + +#define PAGES_OLD_MAGIC PAGEMAP_MAGIC +#define SHM_PAGES_OLD_MAGIC PAGEMAP_MAGIC +#define BINFMT_MISC_OLD_MAGIC BINFMT_MISC_MAGIC + +/* + * These are special files, not exactly images + */ +#define STATS_MAGIC 0x57093306 /* Ostashkov */ +#define IRMAP_CACHE_MAGIC 0x57004059 /* Ivanovo */ + +/* + * Main magic for kerndat_s structure. + */ + +#define KDAT_MAGIC 0x57023458 /* Torzhok */ + +#endif /* __CR_MAGIC_H__ */ diff --git a/CRIU_code/criu/include/mem.h b/CRIU_code/criu/include/mem.h new file mode 100644 index 0000000..251cb1a --- /dev/null +++ b/CRIU_code/criu/include/mem.h @@ -0,0 +1,53 @@ +#ifndef __CR_MEM_H__ +#define __CR_MEM_H__ + +#include +#include "int.h" +#include "vma.pb-c.h" +#include "pid.h" +#include "proc_parse.h" +#include "inventory.pb-c.h" + +struct parasite_ctl; +struct vm_area_list; +struct page_pipe; +struct pstree_item; +struct vma_area; + +struct mem_dump_ctl { + bool pre_dump; + bool lazy; + struct proc_pid_stat *stat; + InventoryEntry *parent_ie; +}; + +extern bool vma_has_guard_gap_hidden(struct vma_area *vma); +extern bool page_is_zero(u64 pme); +extern bool page_in_parent(bool dirty); +extern int prepare_mm_pid(struct pstree_item *i); +extern void prepare_cow_vmas(void); +extern int do_task_reset_dirty_track(int pid); +extern unsigned long dump_pages_args_size(struct vm_area_list *vmas); +extern int parasite_dump_pages_seized(struct pstree_item *item, + struct vm_area_list *vma_area_list, + struct mem_dump_ctl *mdc, + struct parasite_ctl *ctl); + +#define PME_PRESENT (1ULL << 63) +#define PME_SWAP (1ULL << 62) +#define PME_FILE (1ULL << 61) +#define PME_SOFT_DIRTY (1ULL << 55) +#define PME_PSHIFT_BITS (6) +#define PME_STATUS_BITS (3) +#define PME_STATUS_OFFSET (64 - PME_STATUS_BITS) +#define PME_PSHIFT_OFFSET 
(PME_STATUS_OFFSET - PME_PSHIFT_BITS) +#define PME_PFRAME_MASK ((1ULL << PME_PSHIFT_OFFSET) - 1) +#define PME_PFRAME(x) ((x) & PME_PFRAME_MASK) + +struct task_restore_args; +int open_vmas(struct pstree_item *t); +int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta); +int unmap_guard_pages(struct pstree_item *t); +int prepare_mappings(struct pstree_item *t); +bool should_dump_page(VmaEntry *vmae, u64 pme); +#endif /* __CR_MEM_H__ */ diff --git a/CRIU_code/criu/include/mman.h b/CRIU_code/criu/include/mman.h new file mode 100644 index 0000000..340d369 --- /dev/null +++ b/CRIU_code/criu/include/mman.h @@ -0,0 +1,17 @@ +#ifndef __CR_MMAN_H__ +#define __CR_MMAN_H__ + +#ifndef MAP_HUGETLB +# define MAP_HUGETLB 0x40000 +#endif +#ifndef MADV_HUGEPAGE +# define MADV_HUGEPAGE 14 +#endif +#ifndef MADV_NOHUGEPAGE +# define MADV_NOHUGEPAGE 15 +#endif +#ifndef MADV_DONTDUMP +# define MADV_DONTDUMP 16 +#endif + +#endif /* __CR_MMAN_H__ */ diff --git a/CRIU_code/criu/include/mount.h b/CRIU_code/criu/include/mount.h new file mode 100644 index 0000000..d9b375f --- /dev/null +++ b/CRIU_code/criu/include/mount.h @@ -0,0 +1,148 @@ +#ifndef __CR_MOUNT_H__ +#define __CR_MOUNT_H__ + +#include + +#include "common/list.h" + +struct proc_mountinfo; +struct pstree_item; +struct fstype; +struct ns_id; + +#define MOUNT_INVALID_DEV (0) + +#define MNT_UNREACHABLE INT_MIN + +/* + * We have remounted this mount writable temporarily, and we + * should return it back to readonly at the end of file restore. + */ +#define REMOUNTED_RW 1 +/* + * We have remounted this mount writable in service mount namespace, + * thus we shouldn't return it back to readonly, as service mntns + * will be destroyed anyway. + */ +#define REMOUNTED_RW_SERVICE 2 + +struct mount_info { + int mnt_id; + int parent_mnt_id; + unsigned int s_dev; + unsigned int s_dev_rt; + char *root; + /* + * During dump mountpoint contains path with dot at the + * beginning. 
It allows to use openat, statat, etc without + * creating a temporary copy of the path. + * + * On restore mountpoint is prepended with so called ns + * root path -- it's a place in fs where the namespace + * mount tree is constructed. Check mnt_roots for details. + * The ns_mountpoint contains path w/o this prefix. + */ + char *mountpoint; + char *ns_mountpoint; + int fd; + unsigned flags; + unsigned sb_flags; + int master_id; + int shared_id; + struct fstype *fstype; + char *source; + char *options; + char *fsname; + union { + bool mounted; + bool dumped; + }; + bool need_plugin; + bool is_ns_root; + bool deleted; + struct mount_info *next; + struct ns_id *nsid; + + char *external; + bool internal_sharing; + + /* tree linkage */ + struct mount_info *parent; + struct mount_info *bind; + struct list_head children; + struct list_head siblings; + + struct list_head mnt_bind; /* circular list of derivatives of one real mount */ + struct list_head mnt_share; /* circular list of shared mounts */ + struct list_head mnt_slave_list; /* list of slave mounts */ + struct list_head mnt_slave; /* slave list entry */ + struct mount_info *mnt_master; /* slave is on master->mnt_slave_list */ + struct list_head mnt_propagate; /* circular list of mounts which propagate from each other */ + struct list_head mnt_notprop; /* temporary list used in can_mount_now */ + + struct list_head postpone; + + int is_overmounted; + int remounted_rw; + + void *private; /* associated filesystem data */ +}; + +extern struct mount_info *mntinfo; +extern struct ns_desc mnt_ns_desc; +#ifdef CONFIG_BINFMT_MISC_VIRTUALIZED +extern int collect_binfmt_misc(void); +#else +static inline int collect_binfmt_misc(void) { return 0; } +#endif + +extern struct mount_info *mnt_entry_alloc(); +extern void mnt_entry_free(struct mount_info *mi); + +extern int __mntns_get_root_fd(pid_t pid); +extern int mntns_get_root_fd(struct ns_id *ns); +extern int mntns_get_root_by_mnt_id(int mnt_id); +extern struct ns_id 
*lookup_nsid_by_mnt_id(int mnt_id); + +extern int open_mount(unsigned int s_dev); +extern int __open_mountpoint(struct mount_info *pm, int mnt_fd); +extern int mnt_is_dir(struct mount_info *pm); +extern int open_mountpoint(struct mount_info *pm); + +extern struct mount_info *collect_mntinfo(struct ns_id *ns, bool for_dump); +extern int prepare_mnt_ns(void); + +extern int pivot_root(const char *new_root, const char *put_old); + +extern struct mount_info *lookup_overlayfs(char *rpath, unsigned int s_dev, + unsigned int st_ino, unsigned int mnt_id); +extern struct mount_info *lookup_mnt_id(unsigned int id); +extern struct mount_info *lookup_mnt_sdev(unsigned int s_dev); + +extern dev_t phys_stat_resolve_dev(struct ns_id *, dev_t st_dev, const char *path); +extern bool phys_stat_dev_match(dev_t st_dev, dev_t phys_dev, + struct ns_id *, const char *path); + +extern int restore_task_mnt_ns(struct pstree_item *current); +extern void fini_restore_mntns(void); +extern int depopulate_roots_yard(int mntns_root, bool clean_remaps); + +extern int rst_get_mnt_root(int mnt_id, char *path, int plen); +extern int ext_mount_add(char *key, char *val); +extern int ext_mount_parse_auto(char *key); +extern int mntns_maybe_create_roots(void); +extern int read_mnt_ns_img(void); +extern void cleanup_mnt_ns(void); +extern void clean_cr_time_mounts(void); + +extern bool add_skip_mount(const char *mountpoint); +struct ns_id; +extern struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool for_dump); + +extern int check_mnt_id(void); + +extern int remount_readonly_mounts(void); +extern int try_remount_writable(struct mount_info *mi, bool ns); +extern bool mnt_is_overmounted(struct mount_info *mi); + +#endif /* __CR_MOUNT_H__ */ diff --git a/CRIU_code/criu/include/namespaces.h b/CRIU_code/criu/include/namespaces.h new file mode 100644 index 0000000..287abb3 --- /dev/null +++ b/CRIU_code/criu/include/namespaces.h @@ -0,0 +1,224 @@ +#ifndef __CR_NS_H__ +#define __CR_NS_H__ + 
+#include "common/compiler.h" +#include "files.h" +#include "common/list.h" +#include "images/netdev.pb-c.h" + +#ifndef CLONE_NEWNS +#define CLONE_NEWNS 0x00020000 +#endif + +#ifndef CLONE_NEWPID +#define CLONE_NEWPID 0x20000000 +#endif + +#ifndef CLONE_NEWUTS +#define CLONE_NEWUTS 0x04000000 +#endif + +#ifndef CLONE_NEWIPC +#define CLONE_NEWIPC 0x08000000 +#endif + +#ifndef CLONE_NEWNET +#define CLONE_NEWNET 0x40000000 +#endif + +#ifndef CLONE_NEWUSER +#define CLONE_NEWUSER 0x10000000 +#endif + +#ifndef CLONE_NEWCGROUP +#define CLONE_NEWCGROUP 0x02000000 +#endif + +#define CLONE_ALLNS (CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWCGROUP) + +/* Nested namespaces are supported only for these types */ +#define CLONE_SUBNS (CLONE_NEWNS | CLONE_NEWNET) + +#define EXTRA_SIZE 20 + +struct ns_desc { + unsigned int cflag; + char *str; + size_t len; +}; + +struct user_ns_extra { + char *uid; + char *gid; +}; + +/* struct join_ns is used for storing parameters specified by --join-ns */ +struct join_ns { + struct list_head list; + char *ns_file; + struct ns_desc *nd; /* namespace descriptor */ + int ns_fd; + /* extra options of --join-ns, like uid&gid in user namespace */ + union { + struct user_ns_extra user_extra; + char *common_extra; + } extra_opts; +}; + +enum ns_type { + NS_UNKNOWN = 0, + NS_CRIU, + NS_ROOT, + NS_OTHER, +}; + +struct netns_id { + unsigned target_ns_id; + unsigned netnsid_value; + struct list_head node; +}; + +struct net_link { + NetDeviceEntry *nde; + bool created; + struct list_head node; +}; + +struct ns_id { + unsigned int kid; + unsigned int id; + pid_t ns_pid; + struct ns_desc *nd; + struct ns_id *next; + enum ns_type type; + char *ext_key; + + /* + * For mount namespaces on restore -- indicates that + * the namespace in question is created (all mounts + * are mounted) and other tasks may do setns on it + * and proceed. 
+ */ + bool ns_populated; + + union { + struct { + struct mount_info *mntinfo_list; + struct mount_info *mntinfo_tree; + int nsfd_id; + int root_fd_id; + } mnt; + + struct { + + /* + * ns_fd is used when network namespaces are being + * restored. On this stage we access these file + * descriptors many times and it is more efficient to + * have them opened rather than to get them from fdstore. + * + * nsfd_id is used to restore sockets. On this stage we + * can't use random file descriptors to not conflict + * with restored file descriptors. + */ + union { + int nsfd_id; /* a namespace descriptor id in fdstore */ + int ns_fd; /* a namespace file descriptor */ + }; + int nlsk; /* for sockets collection */ + int seqsk; /* to talk to parasite daemons */ + struct list_head ids; + struct list_head links; + NetnsEntry *netns; + } net; + }; +}; +extern struct ns_id *ns_ids; + +#define NS_DESC_ENTRY(_cflag, _str) \ + { \ + .cflag = _cflag, \ + .str = _str, \ + .len = sizeof(_str) - 1, \ + } + +extern bool check_ns_proc(struct fd_link *link); + +extern struct ns_desc pid_ns_desc; +extern struct ns_desc user_ns_desc; +extern unsigned long root_ns_mask; + +extern const struct fdtype_ops nsfile_dump_ops; +extern struct collect_image_info nsfile_cinfo; + +extern int walk_namespaces(struct ns_desc *nd, int (*cb)(struct ns_id *, void *), void *oarg); +extern int collect_namespaces(bool for_dump); +extern int collect_mnt_namespaces(bool for_dump); +extern int dump_mnt_namespaces(void); +extern int dump_namespaces(struct pstree_item *item, unsigned int ns_flags); +extern int prepare_namespace_before_tasks(void); +extern int prepare_namespace(struct pstree_item *item, unsigned long clone_flags); +extern int prepare_userns_creds(void); + +extern int switch_ns(int pid, struct ns_desc *nd, int *rst); +extern int switch_ns_by_fd(int nsfd, struct ns_desc *nd, int *rst); +extern int restore_ns(int rst, struct ns_desc *nd); + +extern int dump_task_ns_ids(struct pstree_item *); +extern int 
predump_task_ns_ids(struct pstree_item *); +extern struct ns_id *rst_new_ns_id(unsigned int id, pid_t pid, struct ns_desc *nd, enum ns_type t); +extern int rst_add_ns_id(unsigned int id, struct pstree_item *, struct ns_desc *nd); +extern struct ns_id *lookup_ns_by_id(unsigned int id, struct ns_desc *nd); + +extern int collect_user_namespaces(bool for_dump); +extern int prepare_userns(struct pstree_item *item); +extern int stop_usernsd(void); + +extern uid_t userns_uid(uid_t uid); +extern gid_t userns_gid(gid_t gid); + +extern int dump_user_ns(pid_t pid, int ns_id); +extern void free_userns_maps(void); +extern int join_ns_add(const char *type, char *ns_file, char *extra_opts); +extern int check_namespace_opts(void); +extern int join_namespaces(void); + +typedef int (*uns_call_t)(void *arg, int fd, pid_t pid); +/* + * Async call -- The call is guaranteed to be done till the + * CR_STATE_COMPLETE happens. The function may return even + * before the call starts. + * W/o flag the call is synchronous -- this function returns + * strictly after the call finishes. + */ +#define UNS_ASYNC 0x1 +/* + * The call returns an FD which should be sent back. Conflicts + * with UNS_ASYNC. + */ +#define UNS_FDOUT 0x2 + +#define MAX_UNSFD_MSG_SIZE 8192 + +/* + * When we're restoring inside user namespace, some things are + * not allowed to be done there due to insufficient capabilities. + * If the operation in question can be offloaded to another process, + * this call allows to do that. + * + * In case we're not in userns, just call the callback immediately + * in the context of calling task. 
+ */ +extern int __userns_call(const char *func_name, uns_call_t call, int flags, + void *arg, size_t arg_size, int fd); + +#define userns_call(__call, __flags, __arg, __arg_size, __fd) \ + __userns_call(__stringify(__call), __call, __flags, \ + __arg, __arg_size, __fd) + +extern int add_ns_shared_cb(int (*actor)(void *data), void *data); + +extern struct ns_id *get_socket_ns(int lfd); +extern struct ns_id *lookup_ns_by_kid(unsigned int kid, struct ns_desc *nd); + +#endif /* __CR_NS_H__ */ diff --git a/CRIU_code/criu/include/net.h b/CRIU_code/criu/include/net.h new file mode 100644 index 0000000..9976f6e --- /dev/null +++ b/CRIU_code/criu/include/net.h @@ -0,0 +1,57 @@ +#ifndef __CR_NET_H__ +#define __CR_NET_H__ + +#include + +#include "common/list.h" +#include "external.h" + +#ifndef RTM_GETNSID +#define RTM_GETNSID 90 +#endif + +struct cr_imgset; +struct ns_id; +extern int dump_net_ns(struct ns_id *ns); +extern int prepare_net_namespaces(void); +extern void fini_net_namespaces(void); +extern int netns_keep_nsfd(void); + +struct pstree_item; +extern int restore_task_net_ns(struct pstree_item *current); + +struct veth_pair { + struct list_head node; + char *inside; + char *outside; + char *bridge; +}; + +extern int collect_net_namespaces(bool for_dump); + +extern int network_lock(void); +extern void network_unlock(void); +extern int network_lock_internal(); + +extern struct ns_desc net_ns_desc; + +#include "images/netdev.pb-c.h" +extern int write_netdev_img(NetDeviceEntry *nde, struct cr_imgset *fds, struct nlattr **info); +extern int read_ns_sys_file(char *path, char *buf, int len); +struct net_link; +extern int restore_link_parms(struct net_link *link, int nlsk); + +extern int veth_pair_add(char *in, char *out); +extern int macvlan_ext_add(struct external *ext); +extern int move_veth_to_bridge(void); + +extern int kerndat_link_nsid(void); +extern int net_get_nsid(int rtsk, int fd, int *nsid); +extern struct ns_id *net_get_root_ns(); +extern int 
kerndat_nsid(void); +extern void check_has_netns_ioc(int fd, bool *kdat_val, const char *name); +extern int net_set_ext(struct ns_id *ns); +extern struct ns_id *get_root_netns(); +extern int read_net_ns_img(); + +#endif /* __CR_NET_H__ */ diff --git a/CRIU_code/criu/include/netfilter.h b/CRIU_code/criu/include/netfilter.h new file mode 100644 index 0000000..35ef262 --- /dev/null +++ b/CRIU_code/criu/include/netfilter.h @@ -0,0 +1,13 @@ +#ifndef __CR_NETFILTER_H__ +#define __CR_NETFILTER_H__ + +struct inet_sk_desc; +extern int nf_lock_connection(struct inet_sk_desc *); +extern int nf_unlock_connection(struct inet_sk_desc *); + +struct inet_sk_info; +extern int nf_unlock_connection_info(struct inet_sk_info *); + +extern void preload_netfilter_modules(void); + +#endif /* __CR_NETFILTER_H__ */ diff --git a/CRIU_code/criu/include/netlink_diag.h b/CRIU_code/criu/include/netlink_diag.h new file mode 100644 index 0000000..14ca403 --- /dev/null +++ b/CRIU_code/criu/include/netlink_diag.h @@ -0,0 +1,42 @@ +#ifndef __CR_NETLINK_DIAG_H__ +#define __CR_NETLINK_DIAG_H__ + +#include + +struct netlink_diag_req { + __u8 sdiag_family; + __u8 sdiag_protocol; + __u16 pad; + __u32 ndiag_ino; + __u32 ndiag_show; + __u32 ndiag_cookie[2]; +}; + +struct netlink_diag_msg { + __u8 ndiag_family; + __u8 ndiag_type; + __u8 ndiag_protocol; + __u8 ndiag_state; + + __u32 ndiag_portid; + __u32 ndiag_dst_portid; + __u32 ndiag_dst_group; + __u32 ndiag_ino; + __u32 ndiag_cookie[2]; +}; + +enum { + NETLINK_DIAG_MEMINFO, + NETLINK_DIAG_GROUPS, + + __NETLINK_DIAG_MAX, +}; + +#define NETLINK_DIAG_MAX (__NETLINK_DIAG_MAX - 1) + +#define NDIAG_PROTO_ALL ((__u8) ~0) + +#define NDIAG_SHOW_MEMINFO 0x00000001 /* show memory info of a socket */ +#define NDIAG_SHOW_GROUPS 0x00000002 /* show groups of a netlink socket */ + +#endif /* __CR_NETLINK_DIAG_H__ */ diff --git a/CRIU_code/criu/include/packet_diag.h b/CRIU_code/criu/include/packet_diag.h new file mode 100644 index 0000000..287de84 --- /dev/null +++ 
b/CRIU_code/criu/include/packet_diag.h @@ -0,0 +1,76 @@ +#ifndef __CR_PACKET_DIAG_H__ +#define __CR_PACKET_DIAG_H__ + +#include + +struct packet_diag_req { + __u8 sdiag_family; + __u8 sdiag_protocol; + __u16 pad; + __u32 pdiag_ino; + __u32 pdiag_show; + __u32 pdiag_cookie[2]; +}; + +#define PACKET_SHOW_INFO 0x00000001 /* Basic packet_sk information */ +#define PACKET_SHOW_MCLIST 0x00000002 /* A set of packet_diag_mclist-s */ +#define PACKET_SHOW_RING_CFG 0x00000004 /* Rings configuration parameters */ +#define PACKET_SHOW_FANOUT 0x00000008 + +struct packet_diag_msg { + __u8 pdiag_family; + __u8 pdiag_type; + __u16 pdiag_num; + + __u32 pdiag_ino; + __u32 pdiag_cookie[2]; +}; + +enum { + PACKET_DIAG_INFO, + PACKET_DIAG_MCLIST, + PACKET_DIAG_RX_RING, + PACKET_DIAG_TX_RING, + PACKET_DIAG_FANOUT, + + PACKET_DIAG_MAX, +}; + +struct packet_diag_info { + __u32 pdi_index; + __u32 pdi_version; + __u32 pdi_reserve; + __u32 pdi_copy_thresh; + __u32 pdi_tstamp; + __u32 pdi_flags; + +#define PDI_RUNNING 0x1 +#define PDI_AUXDATA 0x2 +#define PDI_ORIGDEV 0x4 +#define PDI_VNETHDR 0x8 +#define PDI_LOSS 0x10 +}; + +#ifndef MAX_ADDR_LEN +#define MAX_ADDR_LEN 32 +#endif + +struct packet_diag_mclist { + __u32 pdmc_index; + __u32 pdmc_count; + __u16 pdmc_type; + __u16 pdmc_alen; + __u8 pdmc_addr[MAX_ADDR_LEN]; +}; + +struct packet_diag_ring { + __u32 pdr_block_size; + __u32 pdr_block_nr; + __u32 pdr_frame_size; + __u32 pdr_frame_nr; + __u32 pdr_retire_tmo; + __u32 pdr_sizeof_priv; + __u32 pdr_features; +}; + +#endif /* __CR_PACKET_DIAG_H__ */ diff --git a/CRIU_code/criu/include/page-pipe.h b/CRIU_code/criu/include/page-pipe.h new file mode 100644 index 0000000..80e5958 --- /dev/null +++ b/CRIU_code/criu/include/page-pipe.h @@ -0,0 +1,160 @@ +#ifndef __CR_PAGE_PIPE_H__ +#define __CR_PAGE_PIPE_H__ + +#include +#include "common/list.h" + +#define PAGE_ALLOC_COSTLY_ORDER 3 /* from the kernel source code */ +struct kernel_pipe_buffer { + struct page *page; + unsigned int offset, len; + const 
struct pipe_buf_operations *ops; + unsigned int flags; + unsigned long private; +}; + +/* + * The kernel allocates the linear chunk of memory for pipe buffers. + * Allocation of chunks with size more than PAGE_ALLOC_COSTLY_ORDER + * fails very often, so we need to restrict the pipe capacity to not + * allocate big chunks. + */ +#define PIPE_MAX_SIZE ((1 << PAGE_ALLOC_COSTLY_ORDER) * PAGE_SIZE / \ + sizeof(struct kernel_pipe_buffer)) + +/* The number of pipes for one chunk */ +#define NR_PIPES_PER_CHUNK 8 + +/* + * page_pipe is a descriptor of task's virtual memory + * with pipes, containing pages. + * + * A page-pipe may contain holes -- these are pagemap + * entries without pages. Holes are stored in separate + * array to optimize paged iovs feed into vmsplice -- + * they will be sent there in one go. + * + * A hole is a pagemap entry that doesn't have pages + * in it, since they are present in previous (parent) + * snapshot. + * + * + * This page-pipe vs holes vs task vmem vs image layout + * is described below. + * + * Task memory: (+ present, - not present pages) + * 0 0 0 0 1 1 1 + * 0 3 6 B 1 8 C + * ---+++-----++++++-------++++---- + * + * Page-pipe iovs: + * + * bufs = 03:3,0B:6,18:4 + * holes = + * + * The pagemap.img would purely contain page-pipe bufs. + * + * Pages image will contain pages at + * + * 03,04,05,0B,0C,0D,0E,0F,10,18,19,1A,1B + * + * stored one by one. + * + * Now let's imagine task touches some pages and its mem + * looks like: (+ present, = old present, - non present) + * + * 0 0 0 0 11 11 1 + * 0 3 6 B 12 78 C + * ---==+-----====+++-----++===---- + * + * (note new pages at 11 and 17 vaddrs) + * + * The new --snapshot'ed page-pipe would look like + * + * bufs = 05:1,0F:3,17:2 + * holes = 03:2,0B:4,19:3 + * + * So the pagemap.img would look like + * + * 03:2:P,05:1,0B:4:P,0F:3,17:2,19:3:P + * + * (the page_xfer_dump_pages generates one) + * + * where P means "in parent", i.e.
respective pages should + * be looked up in the parent pagemap (not pages.img, but + * the pagemap, and then the offset in previous pages.img + * should be calculated, see the read_pagemap_page routine). + * + * New pages.img file would contain only pages for + * + * 05,0F,10,11,17,18 + */ + +struct page_pipe_buf { + int p[2]; /* pipe with pages */ + unsigned int pipe_size; /* how many pages can be fit into pipe */ + unsigned int pipe_off; /* where this buf is started in a pipe */ + unsigned int pages_in; /* how many pages are there */ + unsigned int nr_segs; /* how many iov-s are busy */ +#define PPB_LAZY (1 << 0) + unsigned int flags; + struct iovec *iov; /* vaddr:len map */ + struct list_head l; /* links into page_pipe->bufs */ +}; + +/* + * Page pipe buffers with different flags cannot share the same pipe. + * We track the last ppb that was used for each type separately in the + * prev[] array in the struct page_pipe (below). + * Currently we have 2 types: the buffers that are always stored in + * the images and the buffers that are lazily migrated + */ +#define PP_PIPE_TYPES 2 + +#define PP_HOLE_PARENT (1 << 0) + +struct page_pipe { + unsigned int nr_pipes; /* how many page_pipe_bufs in there */ + struct list_head bufs; /* list of bufs */ + struct list_head free_bufs; /* list of bufs */ + struct page_pipe_buf *prev[PP_PIPE_TYPES]; /* last ppb of each type + for pipe sharing */ + unsigned int nr_iovs; /* number of iovs */ + unsigned int free_iov; /* first free iov */ + struct iovec *iovs; /* iovs. 
They are provided into create_page_pipe + and all bufs have their iov-s in there */ + + unsigned int nr_holes; /* number of holes allocated */ + unsigned int free_hole; /* number of holes in use */ + struct iovec *holes; /* holes */ + unsigned int *hole_flags; + unsigned flags; /* PP_FOO flags below */ +}; + +#define PP_CHUNK_MODE 0x1 /* Restrict the maximum buffer size of pipes + and dump memory for a few iterations */ +#define PP_OWN_IOVS 0x4 /* create_page_pipe allocated IOVs memory */ + +struct page_pipe *create_page_pipe(unsigned int nr_segs, struct iovec *iovs, unsigned flags); +extern void destroy_page_pipe(struct page_pipe *p); +extern int page_pipe_add_page(struct page_pipe *p, unsigned long addr, + unsigned int flags); +extern int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr, + unsigned int flags); + +extern void debug_show_page_pipe(struct page_pipe *pp); +void page_pipe_reinit(struct page_pipe *pp); + +extern void page_pipe_destroy_ppb(struct page_pipe_buf *ppb); + +struct pipe_read_dest { + int p[2]; + int sink_fd; +}; + +extern int pipe_read_dest_init(struct pipe_read_dest *prd); +extern int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, + unsigned long addr, unsigned int *nr_pages, + unsigned int ppb_flags); + +#endif /* __CR_PAGE_PIPE_H__ */ diff --git a/CRIU_code/criu/include/page-xfer.h b/CRIU_code/criu/include/page-xfer.h new file mode 100644 index 0000000..fa72273 --- /dev/null +++ b/CRIU_code/criu/include/page-xfer.h @@ -0,0 +1,74 @@ +#ifndef __CR_PAGE_XFER__H__ +#define __CR_PAGE_XFER__H__ +#include "pagemap.h" + +struct ps_info { + int pid; + unsigned short port; +}; + +extern int cr_page_server(bool daemon_mode, bool lazy_dump, int cfd); + +/* + * page_xfer -- transfer pages into image file. + * Two images backends are implemented -- local image file + * and page-server image file. 
+ */ + +struct page_xfer { + /* transfers one vaddr:len entry */ + int (*write_pagemap)(struct page_xfer *self, struct iovec *iov, u32 flags); + /* transfers pages related to previous pagemap */ + int (*write_pages)(struct page_xfer *self, int pipe, unsigned long len); + void (*close)(struct page_xfer *self); + + /* + * In case we need to dump pagemaps not as-is, but + * relative to some address. Used, e.g. by shmem. + */ + unsigned long offset; + bool transfer_lazy; + + /* private data for every page-xfer engine */ + union { + struct /* local */ { + struct cr_img *pmi; /* pagemaps */ + struct cr_img *pi; /* pages */ + }; + + struct /* page-server */ { + int sk; + u64 dst_id; + }; + }; + + struct page_read *parent; +}; + +extern int open_page_xfer(struct page_xfer *xfer, int fd_type, unsigned long id); +struct page_pipe; +extern int page_xfer_dump_pages(struct page_xfer *, struct page_pipe *); +extern int connect_to_page_server_to_send(void); +extern int connect_to_page_server_to_recv(int epfd); +extern int disconnect_from_page_server(void); + +extern int check_parent_page_xfer(int fd_type, unsigned long id); + +/* + * The post-copy migration makes it necessary to receive pages from + * remote dump. 
The protocol we use for that is quite simple: + * - lazy-pages sends request containing PS_IOV_GET(nr_pages, vaddr, pid) + * - dump-side page server responds with PS_IOV_ADD(nr_pages, vaddr, + pid) or PS_IOV_ADD(0, 0, 0) if it failed to locate the required + pages + * - dump-side page server sends the raw page data + */ + +/* async request/receive of remote pages */ +extern int request_remote_pages(unsigned long img_id, unsigned long addr, int nr_pages); + +typedef int (*ps_async_read_complete)(unsigned long img_id, unsigned long vaddr, int nr_pages, void *); +extern int page_server_start_read(void *buf, int nr_pages, + ps_async_read_complete complete, void *priv, unsigned flags); + +#endif /* __CR_PAGE_XFER__H__ */ diff --git a/CRIU_code/criu/include/page.h b/CRIU_code/criu/include/page.h new file mode 100644 index 0000000..47e6808 --- /dev/null +++ b/CRIU_code/criu/include/page.h @@ -0,0 +1,4 @@ +#ifndef __CR_INC_PAGE_H__ +#define __CR_INC_PAGE_H__ +#include "common/page.h" +#endif diff --git a/CRIU_code/criu/include/pagemap-cache.h b/CRIU_code/criu/include/pagemap-cache.h new file mode 100644 index 0000000..d3ace24 --- /dev/null +++ b/CRIU_code/criu/include/pagemap-cache.h @@ -0,0 +1,29 @@ +#ifndef __CR_PAGEMAP_H__ +#define __CR_PAGEMAP_H__ + +#include +#include "int.h" + +#include "common/list.h" + +struct vma_area; + +#define PAGEMAP_PFN_OFF(addr) (PAGE_PFN(addr) * sizeof(u64)) + +typedef struct { + pid_t pid; /* which process it belongs */ + unsigned long start; /* start of area */ + unsigned long end; /* end of area */ + const struct list_head *vma_head; /* list head of VMAs we're serving */ + u64 *map; /* local buffer */ + size_t map_len; /* length of a buffer */ + int fd; /* file to read PMs from */ +} pmc_t; + +#define PMC_INIT (pmc_t){ } + +extern int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t size); +extern u64 *pmc_get_map(pmc_t *pmc, const struct vma_area *vma); +extern void pmc_fini(pmc_t *pmc); + +#endif /* 
__CR_PAGEMAP_H__ */ diff --git a/CRIU_code/criu/include/pagemap.h b/CRIU_code/criu/include/pagemap.h new file mode 100644 index 0000000..45284b8 --- /dev/null +++ b/CRIU_code/criu/include/pagemap.h @@ -0,0 +1,155 @@ +#ifndef __CR_PAGE_READ_H__ +#define __CR_PAGE_READ_H__ + +#include "common/list.h" +#include "images/pagemap.pb-c.h" +#include "page.h" + +/* + * page_read -- engine, that reads pages from image file(s) + * + * Several page-read's can be arranged in a chain to read + * pages from a series of snapshots. + * + * A task's address space vs pagemaps+page image pairs can + * look like this (taken from comment in page-pipe.h): + * + * task: + * + * 0 0 0 0 1 1 1 + * 0 3 6 B 2 7 C + * ---+++-----+++++++-----+++++---- + * pm1: ---+++-----++++++-------++++---- + * pm2: ---==+-----====+++-----++===---- + * + * Here + is present page, - is not present, = is present, + * but is not modified from last snapshot. + * + * Thus pagemap.img and pages.img entries are + * + * pm1: 03:3,0B:6,18:4 + * pm2: 03:2:P,05:1,0B:4:P,0F:3,17:2,19:3:P + * + * where P means "page is in parent pagemap". + * + * pg1: 03,04,05,0B,0C,0D,0E,0F,10,18,19,1A,1B + * pg2: 05,0F,10,11,17,18 + * + * When trying to restore from these 4 files we'd have + * to carefully scan pagemap.img's one by one and read or + * skip pages from pages.img where appropriate. + * + * All this is implemented in read_pagemap_page.
+ */ + +struct page_read { + /* reads page from current pagemap */ + int (*read_pages)(struct page_read *, unsigned long vaddr, int nr, + void *, unsigned flags); + /* Advance page_read to the next entry */ + int (*advance)(struct page_read *pr); + void (*close)(struct page_read *); + void (*skip_pages)(struct page_read *, unsigned long len); + int (*sync)(struct page_read *pr); + int (*seek_pagemap)(struct page_read *pr, unsigned long vaddr); + void (*reset)(struct page_read *pr); + int (*io_complete)(struct page_read *, unsigned long vaddr, int nr); + int (*maybe_read_page)(struct page_read *pr, unsigned long vaddr, + int nr, void *buf, unsigned flags); + + /* Whether or not pages can be read in PIE code */ + bool pieok; + + /* Private data of reader */ + struct cr_img *pmi; + struct cr_img *pi; + u32 pages_img_id; + + PagemapEntry *pe; /* current pagemap we are on */ + struct page_read *parent; /* parent pagemap (if ->in_parent + pagemap is met in image, then + go to this guy for page, see + read_pagemap_page */ + unsigned long cvaddr; /* vaddr we are on */ + off_t pi_off; /* current offset in pages file */ + + struct iovec bunch; /* record consequent neighbour + iovecs to punch together */ + unsigned id; /* for logging */ + unsigned long img_id; /* pagemap image file ID */ + + PagemapEntry **pmes; + int nr_pmes; + int curr_pme; + + struct list_head async; +}; + +/* flags for ->read_pages */ +#define PR_ASYNC 0x1 /* may exit w/o data in the buffer */ +#define PR_ASAP 0x2 /* PR_ASYNC, but start the IO right now */ + +/* flags for open_page_read */ +#define PR_SHMEM 0x1 +#define PR_TASK 0x2 + +#define PR_TYPE_MASK 0x3 +#define PR_MOD 0x4 /* Will need to modify */ +#define PR_REMOTE 0x8 + +/* + * -1 -- error + * 0 -- no images + * 1 -- opened + */ +extern int open_page_read(unsigned long id, struct page_read *, int pr_flags); +extern int open_page_read_at(int dfd, unsigned long id, struct page_read *pr, + int pr_flags); + +struct task_restore_args; + +int 
pagemap_enqueue_iovec(struct page_read *pr, void *buf, + unsigned long len, struct list_head *to); +int pagemap_render_iovec(struct list_head *from, struct task_restore_args *ta); + +/* + * Create a shallow copy of page_read object. + * The new object shares the pagemap structures with the original, but + * maintains its own set of references to those structures. + */ +extern void dup_page_read(struct page_read *src, struct page_read *dst); + +extern int dedup_one_iovec(struct page_read *pr, unsigned long base, + unsigned long len); + +static inline unsigned long pagemap_len(PagemapEntry *pe) +{ + return pe->nr_pages * PAGE_SIZE; +} + +static inline bool page_read_has_parent(struct page_read *pr) +{ + return pr->parent != NULL; +} + +/* Pagemap flags */ +#define PE_PARENT (1 << 0) /* pages are in parent snapshot */ +#define PE_LAZY (1 << 1) /* pages can be lazily restored */ +#define PE_PRESENT (1 << 2) /* pages are present in pages*img */ + +static inline bool pagemap_in_parent(PagemapEntry *pe) +{ + return !!(pe->flags & PE_PARENT); +} + +static inline bool pagemap_lazy(PagemapEntry *pe) +{ + return !!(pe->flags & PE_LAZY); +} + +static inline bool pagemap_present(PagemapEntry *pe) +{ + return !!(pe->flags & PE_PRESENT); +} + +#endif /* __CR_PAGE_READ_H__ */ diff --git a/CRIU_code/criu/include/parasite-syscall.h b/CRIU_code/criu/include/parasite-syscall.h new file mode 100644 index 0000000..c86a724 --- /dev/null +++ b/CRIU_code/criu/include/parasite-syscall.h @@ -0,0 +1,57 @@ +#ifndef __CR_PARASITE_SYSCALL_H__ +#define __CR_PARASITE_SYSCALL_H__ + +#include "pid.h" +#include "common/list.h" +#include "common/config.h" +#include "asm/parasite-syscall.h" + +struct parasite_dump_thread; +struct parasite_dump_misc; +struct parasite_drain_fd; +struct vm_area_list; +struct pstree_item; +struct _CredsEntry; +struct _CoreEntry; +struct list_head; +struct cr_imgset; +struct fd_opts; +struct pid; +struct parasite_dump_cgroup_args; +struct rt_sigframe; + +struct 
parasite_ctl; +struct parasite_thread_ctl; + +extern int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *); +extern int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *); + +struct proc_posix_timers_stat; +extern int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args, + struct parasite_ctl *ctl, struct pstree_item *); + +extern int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_misc *misc); +extern int parasite_dump_creds(struct parasite_ctl *ctl, struct _CredsEntry *ce); +extern int parasite_dump_thread_leader_seized(struct parasite_ctl *ctl, int pid, struct _CoreEntry *core); +extern int parasite_dump_thread_seized(struct parasite_thread_ctl *tctl, + struct parasite_ctl *ctl, int id, + struct pid *tid, struct _CoreEntry *core); +extern int dump_thread_core(int pid, CoreEntry *core, + const struct parasite_dump_thread *dt); + +extern int parasite_drain_fds_seized(struct parasite_ctl *ctl, + struct parasite_drain_fd *dfds, int nr_fds, int off, + int *lfds, struct fd_opts *flags); +extern int parasite_get_proc_fd_seized(struct parasite_ctl *ctl); + +extern struct parasite_ctl *parasite_infect_seized(pid_t pid, + struct pstree_item *item, + struct vm_area_list *vma_area_list); +extern void parasite_ensure_args_size(unsigned long sz); +extern unsigned long get_exec_start(struct vm_area_list *); + +extern int parasite_dump_cgroup(struct parasite_ctl *ctl, struct parasite_dump_cgroup_args *cgroup); + +extern struct parasite_tty_args *parasite_dump_tty(struct parasite_ctl *ctl, int fd, int type); + +#endif /* __CR_PARASITE_SYSCALL_H__ */ diff --git a/CRIU_code/criu/include/parasite-vdso.h b/CRIU_code/criu/include/parasite-vdso.h new file mode 100644 index 0000000..3cf67bb --- /dev/null +++ b/CRIU_code/criu/include/parasite-vdso.h @@ -0,0 +1,95 @@ +#ifndef __CR_PARASITE_VDSO_H__ +#define __CR_PARASITE_VDSO_H__ + +#include "common/config.h" +#include "util-vdso.h" 
+#include "images/vma.pb-c.h" + +struct parasite_ctl; +struct vm_area_list; + +/* Check if symbol is missing from symtable (bad offset and empty name) */ +static inline bool vdso_symbol_empty(struct vdso_symbol *s) +{ + return s->offset == VDSO_BAD_ADDR && s->name[0] == '\0'; +} + +/* + * Special mark which allows to identify runtime vdso (rt-vdso) where + * calls from proxy (original) vdso are redirected. This mark is usually + * placed at the start of vdso area where Elf header lives. + * Since such runtime vdso is solely used by the proxy and + * nobody else is supposed to access it, it's more or less + * safe to screw the Elf header with @signature and + * vvar/vdso addresses for next dumping. + * + * The @orig_vdso_addr and @orig_vvar_addr members deserve a few comments. When we redirect the calls + * from the original vdso to runtime vdso, on next checkpoint it won't + * be possible to find original vdso/vvar pair, thus we save their + * addresses in these members. + * + * As on the following dumps we need to drop rt-{vvar,vdso} pair + * from list of VMAs to save in images, we save rt-vvar address also.
+ */ +struct vdso_mark { + u64 signature; + unsigned long orig_vdso_addr; + unsigned long version; + unsigned long orig_vvar_addr; + unsigned long rt_vvar_addr; +}; + +#define VDSO_MARK_SIGNATURE_V1 (0x6f73647675697263ULL) /* Magic number (criuvdso) */ +#define VDSO_MARK_SIGNATURE_V2 (0x4f53447675697263ULL) /* Magic number (criuvDSO) */ +#define VDSO_MARK_SIGNATURE_V3 (0x4f53447655495243ULL) /* Magic number (CRIUvDSO) */ +#define VDSO_MARK_CUR_VERSION (3) + +static inline void vdso_put_mark(void *where, unsigned long rt_vvar_addr, + unsigned long orig_vdso_addr, unsigned long orig_vvar_addr) +{ + struct vdso_mark *m = where; + + m->signature = VDSO_MARK_SIGNATURE_V3; + m->orig_vdso_addr = orig_vdso_addr; + m->version = VDSO_MARK_CUR_VERSION; + m->orig_vvar_addr = orig_vvar_addr; + m->rt_vvar_addr = rt_vvar_addr; +} + +static inline bool is_vdso_mark(void *addr) +{ + struct vdso_mark *m = addr; + + switch (m->signature) { + case VDSO_MARK_SIGNATURE_V3: + return true; + /* + * Old formats -- simply extend the mark up + * to the version we support. 
+ */ + case VDSO_MARK_SIGNATURE_V2: + vdso_put_mark(m, VVAR_BAD_ADDR, + m->orig_vdso_addr, m->orig_vvar_addr); + return true; + + case VDSO_MARK_SIGNATURE_V1: + vdso_put_mark(m, VVAR_BAD_ADDR, + m->orig_vdso_addr, VVAR_BAD_ADDR); + return true; + } + + return false; +} + +extern int vdso_do_park(struct vdso_maps *rt, unsigned long park_at, + unsigned long park_size); +extern int vdso_map_compat(unsigned long map_at); +extern int vdso_proxify(struct vdso_symtable *sym_rt, + unsigned long vdso_rt_parked_at, + VmaEntry *vmas, size_t nr_vmas, + bool compat_vdso, bool force_trampolines); +extern int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, + struct vdso_symtable *to, struct vdso_symtable *from, + bool compat_vdso); + +#endif /* __CR_PARASITE_VDSO_H__ */ diff --git a/CRIU_code/criu/include/parasite.h b/CRIU_code/criu/include/parasite.h new file mode 100644 index 0000000..0a62f24 --- /dev/null +++ b/CRIU_code/criu/include/parasite.h @@ -0,0 +1,240 @@ +#ifndef __CR_PARASITE_H__ +#define __CR_PARASITE_H__ + +#define PARASITE_MAX_SIZE (64 << 10) + +#ifndef __ASSEMBLY__ + +#include +#include +#include +#include + +#include "image.h" +#include "util-pie.h" +#include "common/lock.h" +#include "infect-rpc.h" + +#include "images/vma.pb-c.h" +#include "images/tty.pb-c.h" + +#define __head __used __section(.head.text) + +enum { + PARASITE_CMD_DUMP_THREAD = PARASITE_USER_CMDS, + PARASITE_CMD_MPROTECT_VMAS, + PARASITE_CMD_DUMPPAGES, + + PARASITE_CMD_DUMP_SIGACTS, + PARASITE_CMD_DUMP_ITIMERS, + PARASITE_CMD_DUMP_POSIX_TIMERS, + PARASITE_CMD_DUMP_MISC, + PARASITE_CMD_DRAIN_FDS, + PARASITE_CMD_GET_PROC_FD, + PARASITE_CMD_DUMP_TTY, + PARASITE_CMD_CHECK_VDSO_MARK, + PARASITE_CMD_CHECK_AIOS, + PARASITE_CMD_DUMP_CGROUP, + + PARASITE_CMD_MAX, +}; + +struct parasite_vma_entry +{ + unsigned long start; + unsigned long len; + int prot; +}; + +struct parasite_vdso_vma_entry { + unsigned long start; + unsigned long len; + unsigned long orig_vdso_addr; + unsigned long 
orig_vvar_addr; + unsigned long rt_vvar_addr; + int is_marked; + bool try_fill_symtable; + bool is_vdso; +}; + +struct parasite_dump_pages_args { + unsigned int nr_vmas; + unsigned int add_prot; + unsigned int off; + unsigned int nr_segs; + unsigned int nr_pages; +}; + +static inline struct parasite_vma_entry *pargs_vmas(struct parasite_dump_pages_args *a) +{ + return (struct parasite_vma_entry *)(a + 1); +} + +static inline struct iovec *pargs_iovs(struct parasite_dump_pages_args *a) +{ + return (struct iovec *)(pargs_vmas(a) + a->nr_vmas); +} + +struct parasite_dump_sa_args { + rt_sigaction_t sas[SIGMAX]; +}; + +struct parasite_dump_itimers_args { + struct itimerval real; + struct itimerval virt; + struct itimerval prof; +}; + +struct posix_timer { + int it_id; + struct itimerspec val; + int overrun; +}; + +struct parasite_dump_posix_timers_args { + int timer_n; + struct posix_timer timer[0]; +}; + +struct parasite_aio { + unsigned long ctx; + unsigned int size; +}; + +struct parasite_check_aios_args { + unsigned nr_rings; + struct parasite_aio ring[0]; +}; + +static inline int posix_timers_dump_size(int timer_n) +{ + return sizeof(struct parasite_dump_posix_timers_args) + sizeof(struct posix_timer) * timer_n; /* header size includes alignment padding before timer[] */ +} + +/* + * Misc stuff, that is too small for separate file, but cannot + * be read w/o using parasite + */ + +struct parasite_dump_misc { + unsigned long brk; + + u32 pid; + u32 sid; + u32 pgid; + u32 umask; + + int dumpable; + int thp_disabled; +}; + +/* + * Calculate how long we can make the groups array in parasite_dump_creds + * and still fit the struct in one page + */ +#define PARASITE_MAX_GROUPS \ + ((PAGE_SIZE - sizeof(struct parasite_dump_thread) - \ + offsetof(struct parasite_dump_creds, groups)) / sizeof(unsigned int)) /* groups */ + +struct parasite_dump_creds { + unsigned int cap_last_cap; + + u32 cap_inh[CR_CAP_SIZE]; + u32 cap_prm[CR_CAP_SIZE]; + u32 cap_eff[CR_CAP_SIZE]; + u32 cap_bnd[CR_CAP_SIZE]; + + int uids[4]; + int gids[4]; + unsigned int secbits; + unsigned int
ngroups; + /* + * FIXME -- this structure is passed to parasite code + * through parasite args area so in parasite_dump_creds() + * call we check for size of this data fits the size of + * the area. Unfortunately, we _actually_ use more bytes + * than the sizeof() -- we put PARASITE_MAX_GROUPS int-s + * in there, so the size check is not correct. + * + * However, all this works simply because we make sure + * the PARASITE_MAX_GROUPS is so, that the total amount + * of memory in use doesn't exceed the PAGE_SIZE and the + * args area is at least one page (PARASITE_ARG_SIZE_MIN). + */ + unsigned int groups[0]; +}; + +struct parasite_dump_thread { + unsigned int *tid_addr; + pid_t tid; + tls_t tls; + stack_t sas; + int pdeath_sig; + char comm[TASK_COMM_LEN]; + struct parasite_dump_creds creds[0]; +}; + +static inline void copy_sas(ThreadSasEntry *dst, const stack_t *src) +{ + dst->ss_sp = encode_pointer(src->ss_sp); + dst->ss_size = (u64)src->ss_size; + dst->ss_flags = src->ss_flags; +} + +/* + * How many descriptors can be transferred from parasite: + * + * 1) struct parasite_drain_fd + all descriptors should fit into one page + * 2) The value should be a multiple of CR_SCM_MAX_FD, because descriptors + * are transferred with help of send_fds and recv_fds. 
+ * 3) criu should work with a default value of the file limit (1024) + */ +#define PARASITE_MAX_FDS (CR_SCM_MAX_FD * 3) + +struct parasite_drain_fd { + int nr_fds; + int fds[0]; +}; + +struct fd_opts { + char flags; + struct { + uint32_t uid; + uint32_t euid; + uint32_t signum; + uint32_t pid_type; + uint32_t pid; + } fown; +}; + +static inline int drain_fds_size(struct parasite_drain_fd *dfds) +{ + int nr_fds = min((int)PARASITE_MAX_FDS, dfds->nr_fds); + return sizeof(*dfds) + nr_fds * (sizeof(dfds->fds[0]) + sizeof(struct fd_opts)); +} + +struct parasite_tty_args { + int fd; + int type; + + int sid; + int pgrp; + bool hangup; + + int st_pckt; + int st_lock; + int st_excl; +}; + +struct parasite_dump_cgroup_args { + /* + * 4K should be enough for most cases. + * + * The string is null terminated. + */ + char contents[1 << 12]; +}; + +#endif /* !__ASSEMBLY__ */ + +#endif /* __CR_PARASITE_H__ */ diff --git a/CRIU_code/criu/include/path.h b/CRIU_code/criu/include/path.h new file mode 100644 index 0000000..c475986 --- /dev/null +++ b/CRIU_code/criu/include/path.h @@ -0,0 +1,41 @@ +#ifndef __CR_PATH_H__ +#define __CR_PATH_H__ + +#include "namespaces.h" +#include "pstree.h" + +/* Absolute paths are used on dump and relative paths are used on restore */ +static inline int is_root(char *p) +{ + return (!strcmp(p, "/")); +} + +/* True for the root mount (the topmost one) */ +static inline int is_root_mount(struct mount_info *mi) +{ + return mi->parent == NULL && mi->nsid->id == root_item->ids->mnt_ns_id; +} + +/* + * True if the mountpoint target is root on its FS. + * + * This is used to determine whether we need to postpone + * mounting. E.g. one can bind mount some subdir from a + * disk, and in this case we'll have to get the root disk + * mount first, then bind-mount it. See do_mount_one().
+ */ +static inline int fsroot_mounted(struct mount_info *mi) +{ + return is_root(mi->root); +} + +char *cut_root_for_bind(char *target_root, char *source_root); + +/* + * Get a mount point for a sibling of m if m->parent and p are in the same + * shared group. + */ +char *mnt_get_sibling_path(struct mount_info *m, + struct mount_info *p, char *buf, int len); + +#endif diff --git a/CRIU_code/criu/include/pid.h b/CRIU_code/criu/include/pid.h new file mode 100644 index 0000000..c749176 --- /dev/null +++ b/CRIU_code/criu/include/pid.h @@ -0,0 +1,62 @@ +#ifndef __CR_PID_H__ +#define __CR_PID_H__ + +#include +#include "stdbool.h" +#include "rbtree.h" + +/* + * Task states, used in e.g. struct pid's state. + */ +enum __criu_task_state +{ + /* Values shared with compel */ + TASK_ALIVE = COMPEL_TASK_ALIVE, + TASK_DEAD = COMPEL_TASK_DEAD, + TASK_STOPPED = COMPEL_TASK_STOPPED, + TASK_ZOMBIE = COMPEL_TASK_ZOMBIE, + /* Own internal states */ + TASK_HELPER = COMPEL_TASK_MAX + 1, + TASK_THREAD, + /* new values are to be added before this line */ + TASK_UNDEF = 0xff +}; + +struct pid { + struct pstree_item *item; + /* + * The @real pid is used to fetch tasks during dumping stage, + * This is a global pid seen from the context where the dumping + * is running. + */ + pid_t real; + + int state; /* TASK_XXX constants */ + + /* + * The @virt pid is the one which is used in the image itself and keeps + * the pid value to be restored. This pid is fetched from the + * dumpee context, because the dumpee might have own pid namespace. + */ + struct { + pid_t virt; + struct rb_node node; + } ns[1]; /* Must be at the end of struct pid */ +}; + +/* + * When we have to restore a shared resource, we must select which + * task should do it, and make other(s) wait for it. In order to + * avoid deadlocks, always make task with lower pid be the restorer.
+ */ +static inline bool pid_rst_prio(unsigned pid_a, unsigned pid_b) +{ + return pid_a < pid_b; +} + +static inline bool pid_rst_prio_eq(unsigned pid_a, unsigned pid_b) +{ + return pid_a <= pid_b; +} + +#endif /* __CR_PID_H__ */ diff --git a/CRIU_code/criu/include/pipes.h b/CRIU_code/criu/include/pipes.h new file mode 100644 index 0000000..83fb71c --- /dev/null +++ b/CRIU_code/criu/include/pipes.h @@ -0,0 +1,63 @@ +#ifndef __CR_PIPES_H__ +#define __CR_PIPES_H__ + +#include "images/pipe-data.pb-c.h" +#include "images/pipe.pb-c.h" + +extern struct collect_image_info pipe_cinfo; +extern struct collect_image_info pipe_data_cinfo; +extern const struct fdtype_ops pipe_dump_ops; + +static inline u32 pipe_id(const struct fd_parms *p) +{ + return p->stat.st_ino; +} + +#define NR_PIPES_WITH_DATA 1024 + +struct pipe_data_dump { + int img_type; + unsigned int nr; + u32 ids[NR_PIPES_WITH_DATA]; +}; + +extern int dump_one_pipe_data(struct pipe_data_dump *pd, int lfd, const struct fd_parms *p); + +struct pipe_data_rst { + PipeDataEntry *pde; + void *data; + struct pipe_data_rst *next; +}; + +#define PIPE_DATA_HASH_BITS 5 +#define PIPE_DATA_HASH_SIZE (1 << PIPE_DATA_HASH_BITS) +#define PIPE_DATA_HASH_MASK (PIPE_DATA_HASH_SIZE - 1) + +extern int do_collect_pipe_data(struct pipe_data_rst *, + ProtobufCMessage *, struct cr_img *, struct pipe_data_rst **hash); +extern int restore_pipe_data(int img_type, int pfd, u32 id, struct pipe_data_rst **hash); + +/* + * The sequence of objects which should be restored: + * pipe -> files struct-s -> fd-s. + * pipe_entry describes pipe's file structs-s. + * A pipe doesn't have own properties, so it has no object. 
+ */ + +#include "images/pipe.pb-c.h" + +struct pipe_info { + PipeEntry *pe; + struct list_head pipe_list; /* All pipe_info with the same pipe_id + * This is pure circular list without head */ + struct list_head list; /* global list of pipes */ + struct file_desc d; + unsigned int create : 1, + reopen : 1; +}; + +extern int collect_one_pipe_ops(void *o, ProtobufCMessage *base, + struct file_desc_ops *ops); +extern int open_pipe(struct file_desc *d, int *new_fd); + +#endif /* __CR_PIPES_H__ */ diff --git a/CRIU_code/criu/include/plugin.h b/CRIU_code/criu/include/plugin.h new file mode 100644 index 0000000..82a6723 --- /dev/null +++ b/CRIU_code/criu/include/plugin.h @@ -0,0 +1,46 @@ +#ifndef __CR_PLUGIN_H__ +#define __CR_PLUGIN_H__ + +#include "criu-plugin.h" +#include "common/compiler.h" +#include "common/list.h" + +#define CR_PLUGIN_DEFAULT "/var/lib/criu/" + +void cr_plugin_fini(int stage, int err); +int cr_plugin_init(int stage); + +typedef struct { + struct list_head head; + struct list_head hook_chain[CR_PLUGIN_HOOK__MAX]; +} cr_plugin_ctl_t; + +extern cr_plugin_ctl_t cr_plugin_ctl; + +typedef struct { + cr_plugin_desc_t *d; + struct list_head list; + void *dlhandle; + struct list_head link[CR_PLUGIN_HOOK__MAX]; +} plugin_desc_t; + +#define run_plugins(__hook, ...) 
\ +({ \ + plugin_desc_t *this; \ + int __ret = -ENOTSUP; \ + \ + list_for_each_entry(this, &cr_plugin_ctl.hook_chain[CR_PLUGIN_HOOK__ ##__hook], \ + link[CR_PLUGIN_HOOK__ ##__hook]) { \ + pr_debug("plugin: `%s' hook %u -> %p\n", \ + this->d->name, CR_PLUGIN_HOOK__ ##__hook, \ + this->d->hooks[CR_PLUGIN_HOOK__ ##__hook]); \ + __ret = ((CR_PLUGIN_HOOK__ ##__hook ##_t *) \ + this->d->hooks[CR_PLUGIN_HOOK__ ##__hook])(__VA_ARGS__); \ + if (__ret == -ENOTSUP) \ + continue; \ + break; \ + } \ + __ret; \ +}) + +#endif diff --git a/CRIU_code/criu/include/posix-timer.h b/CRIU_code/criu/include/posix-timer.h new file mode 100644 index 0000000..fa99d86 --- /dev/null +++ b/CRIU_code/criu/include/posix-timer.h @@ -0,0 +1,27 @@ +#ifndef __CR_PROC_POSIX_TIMER_H__ +#define __CR_PROC_POSIX_TIMER_H__ + +#include "common/list.h" + +struct str_posix_timer { + long it_id; + int clock_id; + int si_signo; + int it_sigev_notify; + void * sival_ptr; +}; + +struct proc_posix_timer { + struct list_head list; + struct str_posix_timer spt; +}; + +struct proc_posix_timers_stat { + int timer_n; + struct list_head timers; +}; + +extern int parse_posix_timers(pid_t pid, struct proc_posix_timers_stat * args); +void free_posix_timers(struct proc_posix_timers_stat *st); + +#endif /* __CR_PROC_POSIX_TIMER_H__ */ diff --git a/CRIU_code/criu/include/prctl.h b/CRIU_code/criu/include/prctl.h new file mode 100644 index 0000000..8e7fef3 --- /dev/null +++ b/CRIU_code/criu/include/prctl.h @@ -0,0 +1,85 @@ +#ifndef __CR_PRCTL_H__ +#define __CR_PRCTL_H__ + +#include "int.h" + +#ifndef PR_SET_NAME +# define PR_SET_NAME 15 +#endif +#ifndef PR_GET_NAME +# define PR_GET_NAME 16 +#endif +#ifndef PR_SET_SECCOMP +# define PR_SET_SECCOMP 22 +#endif +#ifndef PR_CAPBSET_READ +# define PR_CAPBSET_READ 23 +#endif +#ifndef PR_CAPBSET_DROP +# define PR_CAPBSET_DROP 24 +#endif +#ifndef PR_GET_SECUREBITS +# define PR_GET_SECUREBITS 27 +#endif +#ifndef PR_SET_SECUREBITS +# define PR_SET_SECUREBITS 28 +#endif +#ifndef 
PR_GET_DUMPABLE +# define PR_GET_DUMPABLE 3 +#endif +#ifndef PR_SET_DUMPABLE +# define PR_SET_DUMPABLE 4 +#endif + +#ifndef PR_SET_MM +#define PR_SET_MM 35 +# define PR_SET_MM_START_CODE 1 +# define PR_SET_MM_END_CODE 2 +# define PR_SET_MM_START_DATA 3 +# define PR_SET_MM_END_DATA 4 +# define PR_SET_MM_START_STACK 5 +# define PR_SET_MM_START_BRK 6 +# define PR_SET_MM_BRK 7 +# define PR_SET_MM_ARG_START 8 +# define PR_SET_MM_ARG_END 9 +# define PR_SET_MM_ENV_START 10 +# define PR_SET_MM_ENV_END 11 +# define PR_SET_MM_AUXV 12 +# define PR_SET_MM_EXE_FILE 13 +#endif + +#ifndef PR_SET_MM_MAP +# define PR_SET_MM_MAP 14 +# define PR_SET_MM_MAP_SIZE 15 + +struct prctl_mm_map { + u64 start_code; + u64 end_code; + u64 start_data; + u64 end_data; + u64 start_brk; + u64 brk; + u64 start_stack; + u64 arg_start; + u64 arg_end; + u64 env_start; + u64 env_end; + u64 *auxv; + u32 auxv_size; + u32 exe_fd; +}; +#endif + +#ifndef PR_GET_TID_ADDRESS +# define PR_GET_TID_ADDRESS 40 +#endif + +#ifndef PR_SET_THP_DISABLE +# define PR_SET_THP_DISABLE 41 +#endif + +#ifndef PR_GET_THP_DISABLE +# define PR_GET_THP_DISABLE 42 +#endif + +#endif /* __CR_PRCTL_H__ */ diff --git a/CRIU_code/criu/include/proc_parse.h b/CRIU_code/criu/include/proc_parse.h new file mode 100644 index 0000000..96a097b --- /dev/null +++ b/CRIU_code/criu/include/proc_parse.h @@ -0,0 +1,105 @@ +#ifndef __CR_PROC_PARSE_H__ +#define __CR_PROC_PARSE_H__ + +#include + +#include + +#define PROC_TASK_COMM_LEN 32 +#define PROC_TASK_COMM_LEN_FMT "(%31s" + +struct proc_pid_stat { + int pid; + char comm[PROC_TASK_COMM_LEN]; + char state; + int ppid; + int pgid; + int sid; + int tty_nr; + int tty_pgrp; + unsigned int flags; + unsigned long min_flt; + unsigned long cmin_flt; + unsigned long maj_flt; + unsigned long cmaj_flt; + unsigned long utime; + unsigned long stime; + long cutime; + long cstime; + long priority; + long nice; + int num_threads; + int zero0; + unsigned long long start_time; + unsigned long vsize; + long mm_rss; + 
unsigned long rsslim; + unsigned long start_code; + unsigned long end_code; + unsigned long start_stack; + unsigned long esp; + unsigned long eip; + unsigned long sig_pending; + unsigned long sig_blocked; + unsigned long sig_ignored; + unsigned long sig_handled; + unsigned long wchan; + unsigned long zero1; + unsigned long zero2; + int exit_signal; + int task_cpu; + unsigned int rt_priority; + unsigned int policy; + unsigned long long delayacct_blkio_ticks; + unsigned long gtime; + long cgtime; + unsigned long start_data; + unsigned long end_data; + unsigned long start_brk; + unsigned long arg_start; + unsigned long arg_end; + unsigned long env_start; + unsigned long env_end; + int exit_code; +}; + +#define PROC_CAP_SIZE 2 + +struct proc_status_creds { + struct seize_task_status s; + + unsigned int uids[4]; + unsigned int gids[4]; + + u32 last_filter; + + /* + * Keep them at the end of structure + * for fast comparison reason. + */ + u32 cap_inh[PROC_CAP_SIZE]; + u32 cap_prm[PROC_CAP_SIZE]; + u32 cap_eff[PROC_CAP_SIZE]; + u32 cap_bnd[PROC_CAP_SIZE]; +}; + +#define INVALID_UID ((uid_t)-1) + +extern int parse_pid_stat(pid_t pid, struct proc_pid_stat *s); +extern unsigned int parse_pid_loginuid(pid_t pid, int *err, bool ignore_noent); +extern int parse_pid_oom_score_adj(pid_t pid, int *err); +extern int prepare_loginuid(unsigned int value, unsigned int loglevel); +extern int parse_pid_status(pid_t pid, struct seize_task_status *, void *data); +extern int parse_file_locks(void); +extern int get_fd_mntid(int fd, int *mnt_id); + +struct pid; +extern int parse_threads(int pid, struct pid **_t, int *_n); + +int parse_children(pid_t pid, pid_t **_c, int *_n); + +extern bool is_vma_range_fmt(char *line); +extern void parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf); +extern int parse_uptime(uint64_t *upt); + +#endif /* __CR_PROC_PARSE_H__ */ diff --git a/CRIU_code/criu/include/protobuf-desc.h b/CRIU_code/criu/include/protobuf-desc.h new file mode 100644 index 
0000000..21ba271 --- /dev/null +++ b/CRIU_code/criu/include/protobuf-desc.h @@ -0,0 +1,99 @@ +#ifndef __CR_PROTOBUF_DESC_H__ +#define __CR_PROTOBUF_DESC_H__ + +#include +#include + +enum { + /* PB_AUTOGEN_START */ + PB_INVENTORY, /* 0 */ + PB_STATS, + PB_FDINFO, + PB_CORE, + PB_MM, + PB_VMA, + PB_ITIMER, + PB_POSIX_TIMER, + PB_CREDS, + PB_FS, + PB_UTSNS, /* 10 */ + PB_IPC_VAR, + PB_IPC_SHM, + PB_IPC_SEM, + PB_MNT, + PB_PSTREE, + PB_GHOST_FILE, + PB_TCP_STREAM, + PB_REG_FILE, + PB_EXT_FILE, + PB_NS_FILE, /* 20 */ + PB_INET_SK, + PB_UNIX_SK, + PB_PACKET_SOCK, + PB_NETLINK_SK, + PB_PIPE, + PB_FIFO, + PB_PIPE_DATA, + PB_EVENTFD_FILE, + PB_EVENTPOLL_FILE, + PB_EVENTPOLL_TFD, /* 30 */ + PB_SIGNALFD, + PB_INOTIFY_FILE, + PB_INOTIFY_WD, + PB_FANOTIFY_FILE, + PB_FANOTIFY_MARK, + PB_TTY_FILE, + PB_TTY_INFO, + PB_FILE_LOCK, + PB_RLIMIT, + PB_PAGEMAP, /* 40 */ + PB_SIGINFO, + PB_TUNFILE, + PB_IRMAP_CACHE, + PB_CGROUP, + PB_SECCOMP, + PB_TIMERFD, + PB_CPUINFO, + PB_USERNS, + PB_NETNS, + PB_BINFMT_MISC, /* 50 */ + PB_TTY_DATA, + PB_AUTOFS, + PB_GHOST_CHUNK, + PB_FILE, + PB_REMOTE_IMAGE, /* Header for images sent from proxy to cache.*/ + PB_LOCAL_IMAGE, /* Header for reading/writing images from/to proxy or cache. */ + PB_LOCAL_IMAGE_REPLY, /* Header for reading/writing images reply. */ + PB_SNAPSHOT_ID, /* Contains a single id. Used for reading/writing ids from proxy or cache. 
*/ + + /* PB_AUTOGEN_STOP */ + + PB_PAGEMAP_HEAD, + PB_IDS, + PB_SIGACT, + PB_NETDEV, + PB_REMAP_FPATH, + PB_SK_QUEUES, + PB_IPCNS_MSG, + PB_IPCNS_MSG_ENT, + + PB_MAX, +}; + +typedef size_t (*pb_getpksize_t)(void *obj); +typedef size_t (*pb_pack_t)(void *obj, void *where); +typedef void *(*pb_unpack_t)(void *allocator, size_t size, void *from); +typedef void (*pb_free_t)(void *obj, void *allocator); + +struct cr_pb_message_desc { + pb_getpksize_t getpksize; + pb_pack_t pack; + pb_unpack_t unpack; + pb_free_t free; + const ProtobufCMessageDescriptor *pb_desc; +}; + +extern void cr_pb_init(void); +extern struct cr_pb_message_desc cr_pb_descs[PB_MAX]; + +#endif /* __CR_PROTOBUF_DESC_H__ */ diff --git a/CRIU_code/criu/include/protobuf.h b/CRIU_code/criu/include/protobuf.h new file mode 100644 index 0000000..fb7489e --- /dev/null +++ b/CRIU_code/criu/include/protobuf.h @@ -0,0 +1,55 @@ +#ifndef __CR_PROTOBUF_H__ +#define __CR_PROTOBUF_H__ + +#include + +#include "protobuf-desc.h" +#include "common/compiler.h" +#include "util.h" + +struct cr_img; + +extern int do_pb_read_one(struct cr_img *, void **objp, int type, bool eof); + +#define pb_read_one(fd, objp, type) do_pb_read_one(fd, (void **)objp, type, false) +#define pb_read_one_eof(fd, objp, type) do_pb_read_one(fd, (void **)objp, type, true) + +extern int pb_write_one(struct cr_img *, void *obj, int type); + +#define pb_pksize(__obj, __proto_message_name) \ + (__proto_message_name ##__get_packed_size(__obj) + sizeof(u32)) + +#define pb_repeated_size(__obj, __member) \ + ((size_t)(sizeof(*(__obj)->__member) * (__obj)->n_ ##__member)) + +#define pb_msg(__base, __type) \ + container_of(__base, __type, base) + +#include + +struct collect_image_info { + int fd_type; + int pb_type; + unsigned int priv_size; + int (*collect)(void *, ProtobufCMessage *, struct cr_img *); + unsigned flags; +}; + +#define COLLECT_SHARED 0x1 /* use shared memory for obj-s */ +#define COLLECT_NOFREE 0x2 /* don't free entry after callback */ 
+#define COLLECT_HAPPENED 0x4 /* image was opened and collected */ + +extern int collect_image(struct collect_image_info *); +extern int collect_entry(ProtobufCMessage *base, struct collect_image_info *cinfo); + +static inline int collect_images(struct collect_image_info **array, unsigned size) +{ + unsigned i; + for (i = 0; i < size; i++) { + if (collect_image(array[i])) + return -1; + } + return 0; +} + +#endif /* __CR_PROTOBUF_H__ */ diff --git a/CRIU_code/criu/include/pstree.h b/CRIU_code/criu/include/pstree.h new file mode 100644 index 0000000..7303c1f --- /dev/null +++ b/CRIU_code/criu/include/pstree.h @@ -0,0 +1,126 @@ +#ifndef __CR_PSTREE_H__ +#define __CR_PSTREE_H__ + +#include "common/list.h" +#include "common/lock.h" +#include "pid.h" +#include "xmalloc.h" +#include "images/core.pb-c.h" + +/* + * That's the init process which usually inherits + * all orphaned children in the system. + */ +#define INIT_PID (1) +struct pstree_item { + struct pstree_item *parent; + struct list_head children; /* list of my children */ + struct list_head sibling; /* linkage in my parent's children list */ + + struct pid *pid; + pid_t pgid; + pid_t sid; + pid_t born_sid; + + int nr_threads; /* number of threads */ + struct pid *threads; /* array of threads */ + CoreEntry **core; + TaskKobjIdsEntry *ids; + union { + futex_t task_st; + unsigned long task_st_le_bits; + }; +}; + +static inline pid_t vpid(const struct pstree_item *i) +{ + return i->pid->ns[0].virt; +} + +enum { + FDS_EVENT_BIT = 0, +}; +#define FDS_EVENT (1 << FDS_EVENT_BIT) + +extern struct pstree_item *current; + +struct rst_info; +/* See alloc_pstree_item() for details */ +static inline struct rst_info *rsti(struct pstree_item *i) +{ + return (struct rst_info *)(i + 1); +} + +struct ns_id; +struct dmp_info { + struct ns_id *netns; + struct page_pipe *mem_pp; + struct parasite_ctl *parasite_ctl; + struct parasite_thread_ctl **thread_ctls; + uint64_t *thread_sp; +}; + +static inline struct dmp_info *dmpi(const struct 
pstree_item *i) +{ + return (struct dmp_info *)(i + 1); +} + +/* ids is allocated and initialized for all alive tasks */ +static inline int shared_fdtable(struct pstree_item *item) +{ + return (item->parent && + item->ids->files_id == item->parent->ids->files_id); +} + +static inline bool is_alive_state(int state) +{ + return (state == TASK_ALIVE) || (state == TASK_STOPPED); +} + +static inline bool task_alive(struct pstree_item *i) +{ + return is_alive_state(i->pid->state); +} + +extern void free_pstree(struct pstree_item *root_item); +extern struct pstree_item *__alloc_pstree_item(bool rst); +#define alloc_pstree_item() __alloc_pstree_item(false) +extern int init_pstree_helper(struct pstree_item *ret); + +extern struct pstree_item *lookup_create_item(pid_t pid); +extern void pstree_insert_pid(struct pid *pid_node); +extern struct pid *pstree_pid_by_virt(pid_t pid); + +extern struct pstree_item *root_item; +extern struct pstree_item *pstree_item_next(struct pstree_item *item); +#define for_each_pstree_item(pi) \ + for (pi = root_item; pi != NULL; pi = pstree_item_next(pi)) + +extern bool restore_before_setsid(struct pstree_item *child); +extern int prepare_pstree(void); +extern int prepare_dummy_pstree(void); + +extern int dump_pstree(struct pstree_item *root_item); + +struct pstree_item *pstree_item_by_real(pid_t virt); +struct pstree_item *pstree_item_by_virt(pid_t virt); + +extern int pid_to_virt(pid_t pid); + +struct task_entries; +extern struct task_entries *task_entries; +extern int prepare_task_entries(void); +extern int prepare_dummy_task_state(struct pstree_item *pi); + +extern int get_task_ids(struct pstree_item *); +extern struct _TaskKobjIdsEntry *root_ids; + +extern void core_entry_free(CoreEntry *core); +extern CoreEntry *core_entry_alloc(int alloc_thread_info, int alloc_tc); +extern int pstree_alloc_cores(struct pstree_item *item); +extern void pstree_free_cores(struct pstree_item *item); + +extern int collect_pstree_ids(void); + +extern int 
preorder_pstree_traversal(struct pstree_item *item, int (*f)(struct pstree_item *)); +#endif /* __CR_PSTREE_H__ */ diff --git a/CRIU_code/criu/include/ptrace-compat.h b/CRIU_code/criu/include/ptrace-compat.h new file mode 100644 index 0000000..e16fef0 --- /dev/null +++ b/CRIU_code/criu/include/ptrace-compat.h @@ -0,0 +1,16 @@ +#ifndef __CR_PTRACE_H__ +#define __CR_PTRACE_H__ + +#include +#include +#include "common/config.h" + +#ifndef CONFIG_HAS_PTRACE_PEEKSIGINFO +struct ptrace_peeksiginfo_args { + __u64 off; /* from which siginfo to start */ + __u32 flags; + __u32 nr; /* how many siginfos to take */ +}; +#endif + +#endif /* __CR_PTRACE_H__ */ diff --git a/CRIU_code/criu/include/rbtree.h b/CRIU_code/criu/include/rbtree.h new file mode 100644 index 0000000..0079506 --- /dev/null +++ b/CRIU_code/criu/include/rbtree.h @@ -0,0 +1,88 @@ +/* + * RBtree implementation adopted from the Linux kernel sources. + */ + +#ifndef __CR_RBTREE_H__ +#define __CR_RBTREE_H__ + +#include + +#include "common/compiler.h" + +#define RB_RED 0 +#define RB_BLACK 1 +#define RB_MASK 3 + +struct rb_node { + unsigned long rb_parent_color; /* Keeps both parent and color */ + struct rb_node *rb_right; + struct rb_node *rb_left; +} __aligned(sizeof(long)); + +struct rb_root { + struct rb_node *rb_node; +}; + +#define rb_parent(r) ((struct rb_node *)((r)->rb_parent_color & ~RB_MASK)) +#define rb_color(r) ((r)->rb_parent_color & RB_BLACK) +#define rb_is_red(r) (!rb_color(r)) +#define rb_is_black(r) (rb_color(r)) +#define rb_set_red(r) do { (r)->rb_parent_color &= ~RB_BLACK; } while (0) +#define rb_set_black(r) do { (r)->rb_parent_color |= RB_BLACK; } while (0) + +static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) +{ + rb->rb_parent_color = (rb->rb_parent_color & RB_MASK) | (unsigned long)p; +} + +static inline void rb_set_color(struct rb_node *rb, int color) +{ + rb->rb_parent_color = (rb->rb_parent_color & ~RB_BLACK) | color; +} + +#define RB_ROOT (struct rb_root){ NULL, } 
+#define rb_entry(ptr, type, member) container_of(ptr, type, member) + +#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) +#define RB_EMPTY_NODE(node) (rb_parent(node) == node) +#define RB_CLEAR_NODE(node) (rb_set_parent(node, node)) + +static inline void rb_init_node(struct rb_node *node) +{ + *node = (struct rb_node){ }; + + RB_CLEAR_NODE(node); +} + +extern void rb_insert_color(struct rb_node *node, struct rb_root *root); +extern void rb_erase(struct rb_node *node, struct rb_root *root); + +/* Find logical next and previous nodes in a tree */ +extern struct rb_node *rb_first(const struct rb_root *root); +extern struct rb_node *rb_last(const struct rb_root *root); +extern struct rb_node *rb_next(const struct rb_node *node); +extern struct rb_node *rb_prev(const struct rb_node *node); + +/* Fast replacement of a single node without remove/rebalance/add/rebalance */ +extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, + struct rb_root *root); + +static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, + struct rb_node **rb_link) +{ + node->rb_parent_color = (unsigned long)parent; + node->rb_left = node->rb_right = NULL; + + *rb_link = node; +} + +static inline void rb_link_and_balance(struct rb_root *root, + struct rb_node *node, + struct rb_node *parent, + struct rb_node **rb_link) +{ + rb_link_node(node, parent, rb_link); + rb_insert_color(node, root); +} + +#endif /* __CR_RBTREE_H__ */ diff --git a/CRIU_code/criu/include/restore.h b/CRIU_code/criu/include/restore.h new file mode 100644 index 0000000..8ef0dbd --- /dev/null +++ b/CRIU_code/criu/include/restore.h @@ -0,0 +1,10 @@ +#ifndef __CR_INC_RESTORE_H__ +#define __CR_INC_RESTORE_H__ + +#include "pid.h" +#include "types.h" +#include "asm/restore.h" + +extern int arch_set_thread_regs_nosigrt(struct pid *pid); + +#endif diff --git a/CRIU_code/criu/include/restorer.h b/CRIU_code/criu/include/restorer.h new file mode 100644 index 0000000..effbc36 --- /dev/null +++ 
b/CRIU_code/criu/include/restorer.h @@ -0,0 +1,312 @@ +#ifndef __CR_RESTORER_H__ +#define __CR_RESTORER_H__ + +#include +#include +#include +#include + +#include "common/config.h" +#include "types.h" +#include "int.h" +#include "types.h" +#include "common/compiler.h" +#include +#include "common/lock.h" +#include "util.h" +#include "asm/restorer.h" +#include "posix-timer.h" +#include "timerfd.h" +#include "shmem.h" +#include "parasite-vdso.h" +#include "fault-injection.h" + +#include + +#include "images/mm.pb-c.h" + +/* + * These *must* be power of two values. + */ +#define RESTORE_ARGS_SIZE (512) +#define RESTORE_STACK_REDZONE (128) +#define RESTORE_STACK_SIZE (KILO(32)) + +struct restore_mem_zone { + u8 redzone[RESTORE_STACK_REDZONE]; + u8 stack[RESTORE_STACK_SIZE]; + u8 rt_sigframe[RESTORE_STACK_SIGFRAME]; +} __stack_aligned__; + +struct rst_sched_param { + int policy; + int nice; + int prio; +}; + +struct restore_posix_timer { + struct str_posix_timer spt; + struct itimerspec val; + int overrun; +}; + +/* + * We should be able to construct fpu sigframe in sigreturn_prep_fpu_frame, + * so the mem_zone.rt_sigframe should be 64-bytes aligned. To make things + * simpler, force both _args alignment be 64 bytes. 
+ */ + +struct thread_creds_args { + CredsEntry creds; + + unsigned int cap_last_cap; + + u32 cap_inh[CR_CAP_SIZE]; + u32 cap_prm[CR_CAP_SIZE]; + u32 cap_eff[CR_CAP_SIZE]; + u32 cap_bnd[CR_CAP_SIZE]; + + unsigned int secbits; + char *lsm_profile; + unsigned int *groups; + char *lsm_sockcreate; + + unsigned long mem_lsm_profile_pos; + unsigned long mem_lsm_sockcreate_pos; + unsigned long mem_groups_pos; + + unsigned long mem_pos_next; +}; + +struct thread_seccomp_filter { + struct sock_fprog sock_fprog; + unsigned int flags; +}; + +struct thread_restore_args { + struct restore_mem_zone *mz; + + int pid; + UserRegsEntry gpregs; + u64 clear_tid_addr; + + u64 futex_rla; + u32 futex_rla_len; + + struct rst_sched_param sp; + + struct task_restore_args *ta; + + tls_t tls; + + siginfo_t *siginfo; + unsigned int siginfo_n; + + int pdeath_sig; + + struct thread_creds_args *creds_args; + + int seccomp_mode; + unsigned long seccomp_filters_pos; + struct thread_seccomp_filter *seccomp_filters; + void *seccomp_filters_data; + unsigned int seccomp_filters_n; + bool seccomp_force_tsync; + + char comm[TASK_COMM_LEN]; +} __aligned(64); + +typedef long (*thread_restore_fcall_t) (struct thread_restore_args *args); + +struct restore_vma_io { + int nr_iovs; + loff_t off; + struct iovec iovs[0]; +}; + +#define RIO_SIZE(niovs) (sizeof(struct restore_vma_io) + (niovs) * sizeof(struct iovec)) + +struct task_restore_args { + struct thread_restore_args *t; /* thread group leader */ + + int fd_exe_link; /* opened self->exe file */ + int logfd; + unsigned int loglevel; + struct timeval logstart; + + int uffd; + bool has_thp_enabled; + + /* threads restoration */ + int nr_threads; /* number of threads */ + thread_restore_fcall_t clone_restore_fn; /* helper address for clone() call */ + struct thread_restore_args *thread_args; /* array of thread arguments */ + struct task_entries *task_entries; + void *rst_mem; + unsigned long rst_mem_size; + + /* Below arrays get remapped from RM_PRIVATE in 
sigreturn_restore */ + VmaEntry *vmas; + unsigned int vmas_n; + + int vma_ios_fd; + struct restore_vma_io *vma_ios; + unsigned int vma_ios_n; + + struct restore_posix_timer *posix_timers; + unsigned int posix_timers_n; + + struct restore_timerfd *timerfd; + unsigned int timerfd_n; + + siginfo_t *siginfo; + unsigned int siginfo_n; + + struct rst_tcp_sock *tcp_socks; + unsigned int tcp_socks_n; + + struct rst_aio_ring *rings; + unsigned int rings_n; + + struct rlimit64 *rlims; + unsigned int rlims_n; + + pid_t *helpers /* the TASK_HELPERS to wait on at the end of restore */; + unsigned int helpers_n; + + pid_t *zombies; + unsigned int zombies_n; + + /* * * * * * * * * * * * * * * * * * * * */ + + unsigned long task_size; + unsigned long premmapped_addr; + unsigned long premmapped_len; + rt_sigaction_t sigchld_act; + + void *bootstrap_start; + unsigned long bootstrap_len; + + struct itimerval itimers[3]; + + MmEntry mm; + auxv_t mm_saved_auxv[AT_VECTOR_SIZE]; + u32 mm_saved_auxv_size; + char comm[TASK_COMM_LEN]; + + /* + * proc_fd is a handle to /proc that the restorer blob can use to open + * files there, because some of them can't be opened before the + * restorer blob is called. + */ + int proc_fd; + + int seccomp_mode; + + bool compatible_mode; + + bool can_map_vdso; + bool auto_dedup; + unsigned long vdso_rt_size; + struct vdso_maps vdso_maps_rt; /* runtime vdso symbols */ + unsigned long vdso_rt_parked_at; /* safe place to keep vdso */ + void **breakpoint; + + enum faults fault_strategy; +#ifdef ARCH_HAS_LONG_PAGES + unsigned page_size; +#endif + int lsm_type; +} __aligned(64); + +/* + * For arm64 stack needs to aligned to 16 bytes. + * Hence align to 16 bytes for all +*/ +#define RESTORE_ALIGN_STACK(start, size) \ + (ALIGN((start) + (size) - 16, 16)) + +static inline unsigned long restorer_stack(struct restore_mem_zone *mz) +{ + return RESTORE_ALIGN_STACK((long)&mz->stack, RESTORE_STACK_SIZE); +} + +enum { + /* + * Restore stages. 
The stage is started by criu process, then + * confirmed by all tasks involved in it. Then criu does some + * actions and starts the next stage. + * + * The first stated stage is CR_STATE_ROOT_TASK which is started + * right before calling fork_with_pid() for the root_item. + */ + CR_STATE_FAIL = -1, + /* + * Root task is created and does some pre-checks. + * After the stage ACT_SETUP_NS scripts are performed. + */ + CR_STATE_ROOT_TASK = 0, + /* + * The prepare_namespace() is called. + * After the stage criu opens root task's mntns and + * calls ACT_POST_SETUP_NS scripts. + */ + CR_STATE_PREPARE_NAMESPACES, + /* + * All tasks fork and call open_transport_socket(). + * Stage is needed to make sure they all have the socket. + * Also this stage is a sync point after which the + * fini_restore_mntns() can be called. + * + * This stage is a little bit special. Normally all stages + * are controlled by criu process, but when this stage + * starts criu process starts waiting for the tasks to + * finish it, but by the time it gets woken up the stage + * finished is CR_STATE_RESTORE. The forking stage is + * barrier-ed by the root task, this task is also the one + * that switches the stage (into restoring). + * + * The above is done to lower the amount of context + * switches from root task to criu and back, since the + * separate forking stage is not needed by criu, it's + * purely to make sure all tasks be in sync. + */ + CR_STATE_FORKING, + /* + * Main restore stage. By the end of it all tasks are + * almost ready and what's left is: + * pick up zombies and helpers + * restore sigchild handlers used to detect restore errors + * restore credentials, seccomp, dumpable and pdeath_sig + */ + CR_STATE_RESTORE, + /* + * Tasks restore sigchild handlers. + * Stage is needed to synchronize the change in error + * propagation via sigchild. + */ + CR_STATE_RESTORE_SIGCHLD, + /* + * Final stage. 
+ * For security reasons processes can be resumed only when all + * credentials are restored. Otherwise someone can attach to a + * process whose credentials are not yet restored and execute + * some code. + * Seccomp needs to be restored after creds. + * Dumpable and pdeath signal are restored after seccomp. + */ + CR_STATE_RESTORE_CREDS, + CR_STATE_COMPLETE +}; + +#define restore_finish_stage(__v, __stage) ({ \ + futex_dec_and_wake(&(__v)->nr_in_progress); \ + futex_wait_while(&(__v)->start, __stage); \ + (s32) futex_get(&(__v)->start); \ + }) + + +#define __r_sym(name) restorer_sym ## name +#define restorer_sym(rblob, name) (void*)(rblob + __r_sym(name)) + +#endif /* __CR_RESTORER_H__ */ diff --git a/CRIU_code/criu/include/rst-malloc.h b/CRIU_code/criu/include/rst-malloc.h new file mode 100644 index 0000000..67391ba --- /dev/null +++ b/CRIU_code/criu/include/rst-malloc.h @@ -0,0 +1,81 @@ +#ifndef __CR_RST_MALLOC__H__ +#define __CR_RST_MALLOC__H__ + +/* + * On restore we need different types of memory allocation. + * Here's an engine that tries to generalize them all. The + * main difference is in how the buffer with objects is being + * grown up. + * + * Buffers, that are to be used by restorer will be remapped + * into restorer address space with rst_mem_remap() call. Thus + * we have to either keep track of all the buffers and objects, + * or keep objects one-by-one in a plain linear buffer. The + * engine uses the 2nd approach. + */ + +enum { + /* + * Shared non-remappable allocations. These can happen only + * in "global" context, i.e. when objects are allocated to + * be used by any process to be restored. The objects are + * not going to be used in restorer blob, thus allocation + * engine grows buffers in a simple manner. + */ + RM_SHARED, + /* + * Shared objects, that are about to be used in restorer + * blob. For these the *_remap_* stuff below is used to get + * the actual pointer on any object. 
Growing a buffer is + * done with mremap, so that we don't have to keep track + * of all the buffer chunks and can remap them in restorer + * in one call. + */ + RM_SHREMAP, + /* + * Privately used objects. Buffer grow and remap is the + * same as for SHREMAP, but memory regions are MAP_PRIVATE. + */ + RM_PRIVATE, + + RST_MEM_TYPES, +}; + +/* + * Disables SHARED and SHREMAP allocations, turns on PRIVATE + */ +extern void rst_mem_switch_to_private(void); +/* + * Reports a cookie of a current shared buffer position, that + * can later be used in rst_mem_remap_ptr() to find out the object + * pointer in the restorer blob. + */ +extern unsigned long rst_mem_align_cpos(int type); +extern void *rst_mem_remap_ptr(unsigned long pos, int type); +#define RST_MEM_FIXUP_PPTR(ptr) do { \ + ptr = rst_mem_remap_ptr((unsigned long)ptr, RM_PRIVATE);\ +} while (0) + +/* + * Allocate and free objects. We don't need to free arbitrary + * object, thus allocation is simple (linear) and only the + * last object can be freed (pop-ed from buffer). + */ +extern void *rst_mem_alloc(unsigned long size, int type); +extern void rst_mem_free_last(int type); + +/* Word-align the current freelist pointer for the next allocation. If we don't + * align pointers, some futex and atomic operations can fail. 
+ */ +extern void rst_mem_align(int type); + +/* + * Routines to remap SHREMAP and PRIVATE into restorer address space + */ +extern unsigned long rst_mem_lock(void); +extern int rst_mem_remap(void *to); + +extern void *shmalloc(size_t bytes); +extern void shfree_last(void *ptr); + +#endif /* __CR_RST_MALLOC__H__ */ diff --git a/CRIU_code/criu/include/rst_info.h b/CRIU_code/criu/include/rst_info.h new file mode 100644 index 0000000..07c634f --- /dev/null +++ b/CRIU_code/criu/include/rst_info.h @@ -0,0 +1,87 @@ +#ifndef __CR_RST_INFO_H__ +#define __CR_RST_INFO_H__ + +#include "common/lock.h" +#include "common/list.h" +#include "vma.h" + +struct task_entries { + int nr_threads, nr_tasks, nr_helpers; + futex_t nr_in_progress; + futex_t start; + atomic_t cr_err; + mutex_t userns_sync_lock; + mutex_t last_pid_mutex; +}; + +struct fdt { + int nr; /* How many tasks share this fd table */ + pid_t pid; /* Who should restore this fd table */ + /* + * The fd table is ready for restoring, if fdt_lock is equal to nr + * The fdt table was restored, if fdt_lock is equal to nr + 1 + */ + futex_t fdt_lock; +}; + +struct _MmEntry; + +struct rst_info { + struct list_head fds; + + void *premmapped_addr; + unsigned long premmapped_len; + unsigned long clone_flags; + + void *munmap_restorer; + + int service_fd_id; + struct fdt *fdt; + + struct vm_area_list vmas; + struct _MmEntry *mm; + struct list_head vma_io; + unsigned int pages_img_id; + + u32 cg_set; + + union { + struct pstree_item *pgrp_leader; + futex_t pgrp_set; + }; + + struct file_desc *cwd; + struct file_desc *root; + bool has_umask; + u32 umask; + + /* + * We set this flag when process has seccomp filters + * so that we know to suspend them before we unmap the + * restorer blob. + */ + bool has_seccomp; + /* + * To be compatible with old images where filters + * are bound to group leader and we need to use tsync flag. 
+ */ + bool has_old_seccomp_filter; + + bool has_thp_enabled; + + void *breakpoint; +}; + +extern struct task_entries *task_entries; + +static inline void lock_last_pid(void) +{ + mutex_lock(&task_entries->last_pid_mutex); +} + +static inline void unlock_last_pid(void) +{ + mutex_unlock(&task_entries->last_pid_mutex); +} + +#endif /* __CR_RST_INFO_H__ */ diff --git a/CRIU_code/criu/include/seccomp.h b/CRIU_code/criu/include/seccomp.h new file mode 100644 index 0000000..8e20012 --- /dev/null +++ b/CRIU_code/criu/include/seccomp.h @@ -0,0 +1,74 @@ +#ifndef __CR_SECCOMP_H__ +#define __CR_SECCOMP_H__ + +#include +#include + +#include "images/seccomp.pb-c.h" +#include "images/core.pb-c.h" + +#ifndef SECCOMP_MODE_DISABLED +#define SECCOMP_MODE_DISABLED 0 +#endif + +#ifndef SECCOMP_MODE_STRICT +#define SECCOMP_MODE_STRICT 1 +#endif + +#ifndef SECCOMP_MODE_FILTER +#define SECCOMP_MODE_FILTER 2 +#endif + +#ifndef SECCOMP_SET_MODE_FILTER +#define SECCOMP_SET_MODE_FILTER 1 +#endif + +#ifndef SECCOMP_FILTER_FLAG_TSYNC +#define SECCOMP_FILTER_FLAG_TSYNC 1 +#endif + +struct thread_restore_args; +struct task_restore_args; +struct pstree_item; +struct rb_node; + +/* + * seccomp filters are bound to @current->seccomp.filter + * in the kernel, ie they are per thread structures. + * + * If filter is assigned then every subsequent call + * to fork() makes a copy of this @current->seccomp.filter + * pointer into child process. + * + * The thread group can share a filter if the filter + * is assigned with SECCOMP_FILTER_FLAG_TSYNC on group + * which has no filters yet. 
+ */ +struct seccomp_filter_chain { + struct seccomp_filter_chain *prev; + SeccompFilter filter; +}; + +struct seccomp_entry { + struct rb_node node; + struct seccomp_entry *next; + pid_t tid_real; + size_t img_filter_pos; + unsigned int mode; + + struct seccomp_filter_chain *chain; + size_t nr_chains; +}; + +extern struct seccomp_entry *seccomp_lookup(pid_t tid_real, bool create, bool mandatory); +#define seccomp_find_entry(tid_real) seccomp_lookup(tid_real, false, true) +extern int seccomp_collect_entry(pid_t tid_real, unsigned int mode); +extern void seccomp_free_entries(void); +extern int seccomp_dump_thread(pid_t tid_real, ThreadCoreEntry *thread_core); +extern int seccomp_collect_dump_filters(void); + +extern int seccomp_read_image(void); +extern int seccomp_prepare_threads(struct pstree_item *item, struct task_restore_args *ta); +extern void seccomp_rst_reloc(struct thread_restore_args *thread_arg); + +#endif diff --git a/CRIU_code/criu/include/seize.h b/CRIU_code/criu/include/seize.h new file mode 100644 index 0000000..cf7366c --- /dev/null +++ b/CRIU_code/criu/include/seize.h @@ -0,0 +1,9 @@ +#ifndef __CR_SEIZE_H__ +#define __CR_SEIZE_H__ + +extern int collect_pstree(void); +extern void pstree_switch_state(struct pstree_item *root_item, int st); +extern const char *get_real_freezer_state(void); +extern bool alarm_timeouted(void); + +#endif diff --git a/CRIU_code/criu/include/servicefd.h b/CRIU_code/criu/include/servicefd.h new file mode 100644 index 0000000..986c46a --- /dev/null +++ b/CRIU_code/criu/include/servicefd.h @@ -0,0 +1,48 @@ +#ifndef __CR_SERVICE_FD_H__ +#define __CR_SERVICE_FD_H__ + +#include +#include +#include +#include +#include + +#include "criu-log.h" + +enum sfd_type { + SERVICE_FD_MIN, + + LOG_FD_OFF, + IMG_FD_OFF, + PROC_FD_OFF, /* fd with /proc for all proc_ calls */ + PROC_PID_FD_OFF, + CR_PROC_FD_OFF, /* some other's proc fd: + * - For dump -- target ns' proc + * - For restore -- CRIU ns' proc + */ + ROOT_FD_OFF, /* Root of the 
namespace we dump/restore */ + CGROUP_YARD, + USERNSD_SK, /* Socket for usernsd */ + NS_FD_OFF, /* Node's net namespace fd */ + TRANSPORT_FD_OFF, /* to transfer file descriptors */ + RPC_SK_OFF, + FDSTORE_SK_OFF, + + SERVICE_FD_MAX +}; + +struct pstree_item; +extern bool sfds_protected; + + +extern const char *sfd_type_name(enum sfd_type type); +extern int init_service_fd(void); +extern int get_service_fd(enum sfd_type type); +extern bool is_any_service_fd(int fd); +extern bool is_service_fd(int fd, enum sfd_type type); +extern int service_fd_min_fd(struct pstree_item *item); +extern int install_service_fd(enum sfd_type type, int fd); +extern int close_service_fd(enum sfd_type type); +extern int clone_service_fd(struct pstree_item *me); + +#endif /* __CR_SERVICE_FD_H__ */ diff --git a/CRIU_code/criu/include/setproctitle.h b/CRIU_code/criu/include/setproctitle.h new file mode 100644 index 0000000..bc63433 --- /dev/null +++ b/CRIU_code/criu/include/setproctitle.h @@ -0,0 +1,19 @@ +#ifndef __CR_SETPROCTITLE_H__ +#define __CR_SETPROCTITLE_H__ + +#ifdef CONFIG_HAS_LIBBSD +#include +#else + +/* + * setproctitle_init is in the libbsd since v0.6.0. This macro allows to + * compile criu with libbsd<0.6.0. + */ +#ifndef CONFIG_HAS_SETPROCTITLE_INIT +#define setproctitle_init(argc, argv, envp) +#endif + +#define setproctitle(fmt, ...) 
+#endif + +#endif /* __CR_SETPROCTITLE_H__ */ diff --git a/CRIU_code/criu/include/shmem.h b/CRIU_code/criu/include/shmem.h new file mode 100644 index 0000000..04ab8d0 --- /dev/null +++ b/CRIU_code/criu/include/shmem.h @@ -0,0 +1,21 @@ +#ifndef __CR_SHMEM_H__ +#define __CR_SHMEM_H__ + +#include "int.h" +#include "common/lock.h" +#include "images/vma.pb-c.h" + +struct _VmaEntry; +struct vma_area; + +extern int collect_shmem(int pid, struct vma_area *vma); +extern int collect_sysv_shmem(unsigned long shmid, unsigned long size); +extern int cr_dump_shmem(void); +extern int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map); +extern int fixup_sysv_shmems(void); +extern int dump_one_sysv_shmem(void *addr, unsigned long size, unsigned long shmid); +extern int restore_sysv_shmem_content(void *addr, unsigned long size, unsigned long shmid); + +#define SYSV_SHMEM_SKIP_FD (0x7fffffff) + +#endif /* __CR_SHMEM_H__ */ diff --git a/CRIU_code/criu/include/sigframe.h b/CRIU_code/criu/include/sigframe.h new file mode 100644 index 0000000..b63d9f0 --- /dev/null +++ b/CRIU_code/criu/include/sigframe.h @@ -0,0 +1,16 @@ +/* + * Generic sigframe bits. 
+ */ + +#ifndef __CR_SIGFRAME_H__ +#define __CR_SIGFRAME_H__ + +#include +#include "images/core.pb-c.h" + +extern int construct_sigframe(struct rt_sigframe *sigframe, + struct rt_sigframe *rsigframe, + k_rtsigset_t *blkset, + CoreEntry *core); + +#endif /* __CR_SIGFRAME_H__ */ diff --git a/CRIU_code/criu/include/signalfd.h b/CRIU_code/criu/include/signalfd.h new file mode 100644 index 0000000..c7af819 --- /dev/null +++ b/CRIU_code/criu/include/signalfd.h @@ -0,0 +1,10 @@ +#ifndef __CR_SIGNALFD_H__ +#define __CR_SIGNALFD_H__ + +struct cr_imgset; +struct fd_parms; +extern int is_signalfd_link(char *link); +extern const struct fdtype_ops signalfd_dump_ops; +extern struct collect_image_info signalfd_cinfo; + +#endif /* __CR_SIGNALFD_H__ */ diff --git a/CRIU_code/criu/include/sk-inet.h b/CRIU_code/criu/include/sk-inet.h new file mode 100644 index 0000000..7996651 --- /dev/null +++ b/CRIU_code/criu/include/sk-inet.h @@ -0,0 +1,105 @@ +#ifndef __CR_SK_INET_H__ +#define __CR_SK_INET_H__ + +#include + +#include "sockets.h" +#include "files.h" +#include "common/list.h" +#include "images/sk-inet.pb-c.h" + +#define INET_ADDR_LEN 48 /* max of INET_ADDRSTRLEN and INET6_ADDRSTRLEN */ +#ifndef TCP_REPAIR +#define TCP_REPAIR 19 /* TCP sock is under repair right now */ +#define TCP_REPAIR_QUEUE 20 +#define TCP_QUEUE_SEQ 21 +#define TCP_REPAIR_OPTIONS 22 +#endif + +#ifndef IP_HDRINCL +# define IP_HDRINCL 3 +#endif + +#ifndef IP_NODEFRAG +# define IP_NODEFRAG 22 +#endif + +#ifndef IPV6_HDRINCL +# define IPV6_HDRINCL 36 +#endif + +struct inet_sk_desc { + struct socket_desc sd; + unsigned int type; + unsigned int src_port; + unsigned int dst_port; + unsigned int state; + unsigned int rqlen; + unsigned int wqlen; /* sent + unsent data */ + unsigned int uwqlen; /* unsent data */ + unsigned int src_addr[4]; + unsigned int dst_addr[4]; + unsigned short shutdown; + bool cork; + + int rfd; + int cpt_reuseaddr; + struct list_head rlist; + + void *priv; +}; + +struct inet_port; +struct 
inet_sk_info { + InetSkEntry *ie; + struct file_desc d; + struct inet_port *port; + struct list_head port_list; + /* + * This is an fd by which the socket is opened. + * It will be carried down to restorer code to + * repair-off the socket at the very end. + */ + int sk_fd; + struct list_head rlist; +}; + +extern int inet_bind(int sk, struct inet_sk_info *); +extern int inet_connect(int sk, struct inet_sk_info *); + +#ifdef CR_NOGLIBC +#define setsockopt sys_setsockopt +#endif +static inline void tcp_repair_off(int fd) +{ + int aux = 0, ret; + + ret = setsockopt(fd, SOL_TCP, TCP_REPAIR, &aux, sizeof(aux)); + if (ret < 0) + pr_err("Failed to turn off repair mode on socket: %m\n"); +} + +extern void tcp_locked_conn_add(struct inet_sk_info *); +extern void rst_unlock_tcp_connections(void); +extern void cpt_unlock_tcp_connections(void); + +extern int dump_one_tcp(int sk, struct inet_sk_desc *sd); +extern int restore_one_tcp(int sk, struct inet_sk_info *si); + +#define SK_EST_PARAM "tcp-established" +#define SK_INFLIGHT_PARAM "skip-in-flight" +#define SK_CLOSE_PARAM "tcp-close" + +struct task_restore_args; +int prepare_tcp_socks(struct task_restore_args *); + +struct rst_tcp_sock { + int sk; + bool reuseaddr; +}; + +union libsoccr_addr; +int restore_sockaddr(union libsoccr_addr *sa, + int family, u32 pb_port, u32 *pb_addr, u32 ifindex); + +#endif /* __CR_SK_INET_H__ */ diff --git a/CRIU_code/criu/include/sk-packet.h b/CRIU_code/criu/include/sk-packet.h new file mode 100644 index 0000000..a0738ae --- /dev/null +++ b/CRIU_code/criu/include/sk-packet.h @@ -0,0 +1,40 @@ +#ifndef __CR_SK_PACKET_H__ +#define __CR_SK_PACKET_H__ + +#ifndef PACKET_TIMESTAMP +#define PACKET_TIMESTAMP 17 +#endif + +struct cr_imgset; +struct fd_parms; +struct vma_area; + +extern struct collect_image_info packet_sk_cinfo; + +extern int dump_socket_map(struct vma_area *vma); +extern int collect_socket_map(struct vma_area *); + +struct nlmsghdr; +extern int packet_receive_one(struct nlmsghdr *h, 
#ifndef TPACKET3_HDRLEN
/*
 * Local definition of struct tpacket_req3, compiled only when the headers
 * included so far do not define TPACKET3_HDRLEN.
 *
 * NOTE(review): presumably mirrors the kernel UAPI layout from
 * <linux/if_packet.h> -- confirm field order and types against that header.
 */
struct tpacket_req3 {
	unsigned int tp_block_size;
	unsigned int tp_block_nr;
	unsigned int tp_frame_size;
	unsigned int tp_frame_nr;
	unsigned int tp_retire_blk_tov;
	unsigned int tp_sizeof_priv;
	unsigned int tp_feature_req_word;
};
#endif
/*
 * Translate a shutdown value (as stored by sk_encode_shutdown()) into the
 * @how argument of shutdown(2).
 *
 * @val: index into the table below: 1 -> SHUT_RD, 2 -> SHUT_WR,
 *       3 -> SHUT_RDWR.
 *
 * Returns the matching SHUT_* constant, or -1 when @val is 0 or lies
 * outside the table. The range check matters because @val comes from data
 * the caller does not control here; without it an unexpected value would
 * index past the end of hows[].
 */
static inline int sk_decode_shutdown(int val)
{
	static const int hows[] = {-1, SHUT_RD, SHUT_WR, SHUT_RDWR};

	if (val < 0 || val >= (int)(sizeof(hows) / sizeof(hows[0])))
		return -1;

	return hows[val];
}
/*
 * Call the name helper @__h for value @__v, handing it a 32-byte scratch
 * buffer obtained with alloca(), and evaluate to the helper's return value.
 *
 * The scratch buffer belongs to the stack frame of the function in which
 * this macro is expanded. If the helper returns a pointer into that buffer,
 * the result must not be used after that function returns. Each expansion
 * performs a fresh alloca(), so avoid expanding it inside a long loop.
 */
#define __socket_info_helper(__h, __v)			\
	({						\
		char *__nm = alloca(32);		\
		const char *__r = __h(__v, __nm, 32);	\
		__r;					\
	})
/*
 * One request passed (as an array element) to sysctl_op().
 *
 * NOTE(review): the per-field notes are inferred from the CTL_* macros
 * defined in this header -- confirm against the sysctl_op() implementation.
 */
struct sysctl_req {
	char	*name;
	void	*arg;
	int	type;	/* presumably a CTL_* code, optionally with a length folded in via CTL_SHIFT */
	int	flags;	/* presumably a mask of CTL_FLAGS_* bits */
};
/*
 * Some entries might be missing; mark them as optional.
 */
/*
 * TLS transport helpers.
 *
 * With CONFIG_GNUTLS defined the real functions are declared here.
 * Without it every helper collapses to a constant: tls_x509_init()
 * evaluates to 0, the send/receive helpers evaluate to -1 and
 * tls_terminate_session() expands to nothing, so callers need no #ifdefs.
 */
# ifdef CONFIG_GNUTLS

int tls_x509_init(int sockfd, bool is_server);
/* (void) makes this a real prototype; "()" would accept any arguments */
void tls_terminate_session(void);

ssize_t tls_send(const void *buf, size_t len, int flags);
ssize_t tls_recv(void *buf, size_t len, int flags);

int tls_send_data_from_fd(int fd, unsigned long len);
int tls_recv_data_to_fd(int fd, unsigned long len);

# else /* CONFIG_GNUTLS */

#define tls_x509_init(sockfd, is_server) (0)
#define tls_send(buf, len, flags) (-1)
#define tls_recv(buf, len, flags) (-1)
#define tls_send_data_from_fd(fd, len) (-1)
#define tls_recv_data_to_fd(fd, len) (-1)
#define tls_terminate_session()

#endif /* CONFIG_GNUTLS */
/*
 * Forward declaration only: the prototypes in this header just need the
 * type name. The previous "struct ns_id *ns;" was a tentative definition
 * of a global variable called ns, emitted into every file that includes
 * this header, which breaks linking when common symbols are disabled.
 */
struct ns_id;
/*
 * Capacity of sockaddr_un.sun_path in bytes, for systems whose headers do
 * not provide UNIX_PATH_MAX.
 *
 * sizeof on the member is an unevaluated operand, so no null pointer is
 * dereferenced and no pointer is converted to an integer. The older form
 * subtracted the member's address (taken through a null pointer) from
 * sizeof(struct sockaddr_un); the two agree whenever sun_path is the last
 * member with no trailing padding.
 * NOTE(review): that layout holds for Linux's sockaddr_un -- confirm
 * before building on another platform.
 */
#ifndef UNIX_PATH_MAX
#define UNIX_PATH_MAX	(sizeof(((struct sockaddr_un *)0)->sun_path))
#endif
/*
 * Sizes of the vdso and vvar areas plus one struct vdso_symbol slot per
 * managed symbol. VDSO_SYMBOL_MAX is supplied per architecture by
 * asm/vdso.h.
 */
struct vdso_symtable {
	unsigned long		vdso_size;
	unsigned long		vvar_size;
	struct vdso_symbol	symbols[VDSO_SYMBOL_MAX];
	bool			vdso_before_vvar; /* order of vdso/vvar pair */
};
/*
 * PREF_SHIFT_OP(pref, op, size) applies the shift operator @op to @size,
 * using <pref>BYTES_SHIFT as the shift count (K = 10, M = 20, G = 30).
 */
#define PREF_SHIFT_OP(pref, op, size)	((size) op (pref ##BYTES_SHIFT))
#define KBYTES_SHIFT	10
#define MBYTES_SHIFT	20
#define GBYTES_SHIFT	30

/* Right shifts: a byte count expressed in whole K/M/G units, remainder dropped. */
#define KBYTES(size)	PREF_SHIFT_OP(K, >>, size)
#define MBYTES(size)	PREF_SHIFT_OP(M, >>, size)
#define GBYTES(size)	PREF_SHIFT_OP(G, >>, size)

/*
 * Left shifts: K/M/G units expanded to a plain count. No widening is
 * applied, so the result has the (promoted) type of @size -- pass a wide
 * enough operand when the product may not fit in an int.
 */
#define KILO(size)	PREF_SHIFT_OP(K, <<, size)
#define MEGA(size)	PREF_SHIFT_OP(M, <<, size)
#define GIGA(size)	PREF_SHIFT_OP(G, <<, size)
/*
 * Open a procfs entry through do_open_proc() and evaluate to the fd it
 * returned.
 *
 * @pid:   forwarded to do_open_proc() (see PROC_SELF / PROC_GEN above)
 * @ier:   "ignored errno": when the open fails with errno equal to this
 *         value nothing is logged; any other failure is reported with
 *         pr_perror()
 * @flags: forwarded to do_open_proc()
 * @fmt:   printf-style format of the path, followed by its arguments
 *
 * @pid and the variadic arguments are expanded a second time in the error
 * message, so they should be free of side effects.
 */
#define __open_proc(pid, ier, flags, fmt, ...)				\
	({								\
		int __fd = do_open_proc(pid, flags,			\
					fmt, ##__VA_ARGS__);		\
		if (__fd < 0 && (errno != (ier)))			\
			pr_perror("Can't open %d/" fmt " on procfs",	\
				  pid, ##__VA_ARGS__);			\
									\
		__fd;							\
	})
/*
 * DIR *opendir_proc(pid_t pid, const char *fmt, ...);
 *
 * Open a procfs directory with open_proc() and wrap the fd in a DIR
 * stream. Evaluates to NULL when either step fails. A failed fdopendir()
 * leaves the descriptor open, so it is closed here instead of being
 * leaked; the error is logged first so errno still describes the
 * fdopendir() failure. Expansion sites need close() declared.
 */
#define opendir_proc(pid, fmt, ...)					\
	({								\
		int __fd = open_proc(pid, fmt, ##__VA_ARGS__);		\
		DIR *__d = NULL;					\
									\
		if (__fd >= 0) {					\
			__d = fdopendir(__fd);				\
			if (__d == NULL) {				\
				pr_perror("Can't fdopendir %d "		\
					  "(%d/" fmt " on procfs)",	\
					  __fd, pid, ##__VA_ARGS__);	\
				close(__fd);				\
			}						\
		}							\
		__d;							\
	})

/*
 * FILE *fopen_proc(pid_t pid, const char *fmt, ...);
 *
 * Same as opendir_proc() but yields a read-only stdio stream. The fd is
 * likewise closed when fdopen() fails.
 */
#define fopen_proc(pid, fmt, ...)					\
	({								\
		int __fd = open_proc(pid, fmt, ##__VA_ARGS__);		\
		FILE *__f = NULL;					\
									\
		if (__fd >= 0) {					\
			__f = fdopen(__fd, "r");			\
			if (__f == NULL) {				\
				pr_perror("Can't fdopen %d "		\
					  "(%d/" fmt " on procfs)",	\
					  __fd, pid, ##__VA_ARGS__);	\
				close(__fd);				\
			}						\
		}							\
		__f;							\
	})
/*
 * Tell whether a directory entry is one of the two special names "." or
 * "..". Only de->d_name is examined.
 */
static inline bool dir_dots(const struct dirent *de)
{
	const char *entry = de->d_name;

	if (strcmp(entry, ".") == 0)
		return true;

	return strcmp(entry, "..") == 0;
}
Since fd is an integer, we can easily estimate one :) + */ +#define PSFDS (sizeof("/proc/self/fd/2147483647")) + +extern int read_fd_link(int lfd, char *buf, size_t size); + +#define USEC_PER_SEC 1000000L +#define NSEC_PER_SEC 1000000000L + +int vaddr_to_pfn(int fd, unsigned long vaddr, u64 *pfn); + +/* + * Check whether @str starts with @sub and report the + * next character of @str in @end + */ +static inline bool strstartswith2(const char *str, const char *sub, char *end) +{ + while (1) { + if (*sub == '\0') /* end of sub -- match */ { + if (end) { + if (*(sub-1) == '/') /* "/", "./" or "path/" */ + *end = '/'; + else + *end = *str; + } + + return true; + } + if (*str == '\0') /* end of str, sub is NOT ended -- miss */ + return false; + if (*str != *sub) + return false; + + str++; + sub++; + } +} + +static inline bool strstartswith(const char *str, const char *sub) +{ + return strstartswith2(str, sub, NULL); +} + +/* + * Checks whether the @path has @sub_path as a sub path, i.e. + * sub_path is the beginning of path and the last component + * match is full (next character terminates path component). + * + * Paths shouldn't contain excessive /-s, i.e. only one slash + * between path components and no slash at the end (except for + * the "/" path. This is pretty good assumption to what paths + * are used by criu. + */ + +static inline bool issubpath(const char *path, const char *sub_path) +{ + char end; + return strstartswith2(path, sub_path, &end) && + (end == '/' || end == '\0'); +} + +/* + * mkdir -p + */ +int mkdirpat(int fd, const char *path, int mode); + +/* + * Tests whether a path is a prefix of another path. This is different than + * strstartswith because "/foo" is _not_ a path prefix of "/foobar", since they + * refer to different directories. 
/*
 * Block, with no timeout, until poll() reports an event for @sk while
 * waiting for POLLIN. Evaluates to poll()'s own return value.
 */
static inline int sk_wait_data(int sk)
{
	struct pollfd waiter = {
		.fd	 = sk,
		.events	 = POLLIN,
		.revents = 0,
	};

	return poll(&waiter, 1, -1);
}
/*
 * block_sigmask() and restore_sigmask() are defined once, earlier in this
 * header (right after PID_MAX_PATH). The token-for-token second copy that
 * used to sit here was a merge artefact. It is dropped so the two
 * definitions cannot drift apart, which would turn a harmless duplicate
 * into a macro redefinition error.
 */
struct vdso_maps vdso_maps; +extern struct vdso_maps vdso_maps_compat; + +extern int vdso_init_dump(void); +extern int vdso_init_restore(void); +extern int kerndat_vdso_fill_symtable(void); +extern int kerndat_vdso_preserves_hint(void); + +extern int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid, + struct vm_area_list *vma_area_list); + +#ifdef CONFIG_COMPAT +extern void compat_vdso_helper(struct vdso_maps *native, int pipe_fd, + int err_fd, void *vdso_buf, size_t buf_size); +#endif + +#endif /* __CR_VDSO_H__ */ diff --git a/CRIU_code/criu/include/vma.h b/CRIU_code/criu/include/vma.h new file mode 100644 index 0000000..c297c0d --- /dev/null +++ b/CRIU_code/criu/include/vma.h @@ -0,0 +1,137 @@ +#ifndef __CR_VMA_H__ +#define __CR_VMA_H__ + +#include "image.h" +#include "common/list.h" + +#include "images/vma.pb-c.h" + +#include + +struct vm_area_list { + struct list_head h; + unsigned nr; + unsigned int nr_aios; + unsigned long priv_size; /* nr of pages in private VMAs */ + unsigned long priv_longest; /* nr of pages in longest private VMA */ + unsigned long shared_longest; /* nr of pages in longest shared VMA */ +}; + +#define VM_AREA_LIST(name) struct vm_area_list name = { .h = LIST_HEAD_INIT(name.h), .nr = 0, } + +static inline void vm_area_list_init(struct vm_area_list *vml) +{ + INIT_LIST_HEAD(&vml->h); + vml->nr = 0; + vml->priv_size = 0; + vml->priv_longest = 0; + vml->shared_longest = 0; +} + +struct file_desc; + +struct vma_area { + struct list_head list; + VmaEntry *e; + + union { + struct /* for dump */ { + int vm_socket_id; + + char *aufs_rpath; /* path from aufs root */ + char *aufs_fpath; /* full path from global root */ + + /* + * When several subsequent vmas have the same + * dev:ino pair all 'tail' ones set this to true + * and the vmst points to the head's stat buf. 
+ */ + bool file_borrowed; + struct stat *vmst; + int mnt_id; + }; + + struct /* for restore */ { + int (*vm_open)(int pid, struct vma_area *vma); + struct file_desc *vmfd; + struct vma_area *pvma; /* parent for inherited VMAs */ + unsigned long *page_bitmap; /* existent pages */ + unsigned long premmaped_addr; /* restore only */ + + /* + * Some notes about pvma, page_bitmap and premmaped_addr bits + * above. + * + * The pvma is set in prepare_cow_vmas() when we resolve which + * VMAs _may_ inherit pages from each other. + * The page_bitmap and premmaped_addr are set in prepare_mappings() + * when the respective VMAs get mmap-ed or mremap-ed. + * These VMAs are then inherited during fork_with_pid()-s + * called from create_children_and_session(). + */ + }; + }; +}; + +#define VMA_COW_ROOT ((struct vma_area *)1) + +typedef int (*dump_filemap_t)(struct vma_area *vma_area, int fd); + +extern struct vma_area *alloc_vma_area(void); +extern int collect_mappings(pid_t pid, + struct vm_area_list *vma_area_list, dump_filemap_t cb); +extern void free_mappings(struct vm_area_list *vma_area_list); + +extern int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t cb); +extern int parse_self_maps_lite(struct vm_area_list *vms); + +#define vma_area_is(vma_area, s) vma_entry_is((vma_area)->e, s) +#define vma_area_len(vma_area) vma_entry_len((vma_area)->e) +#define vma_entry_is(vma, s) (((vma)->status & (s)) == (s)) +#define vma_entry_len(vma) ((vma)->end - (vma)->start) + +/* + * vma_premmaped_start() can be used only in restorer. + * In other cases vma_area->premmaped_addr must be used. + * This hack is required, because vma_area isn't transferred in restorer and + * shmid is used to determine which vma-s are cowed. 
+ */ +#define vma_premmaped_start(vma) ((vma)->shmid) + +static inline int in_vma_area(struct vma_area *vma, unsigned long addr) +{ + return addr >= (unsigned long)vma->e->start && + addr < (unsigned long)vma->e->end; +} + +static inline bool vma_entry_is_private(VmaEntry *entry, + unsigned long task_size) +{ + return (vma_entry_is(entry, VMA_AREA_REGULAR) && + (vma_entry_is(entry, VMA_ANON_PRIVATE) || + vma_entry_is(entry, VMA_FILE_PRIVATE)) && + (entry->end <= task_size)) || + vma_entry_is(entry, VMA_AREA_AIORING); +} + +static inline bool vma_area_is_private(struct vma_area *vma, + unsigned long task_size) +{ + return vma_entry_is_private(vma->e, task_size); +} + +static inline struct vma_area *vma_next(struct vma_area *vma) +{ + return list_entry(vma->list.next, struct vma_area, list); +} + +static inline bool vma_entry_can_be_lazy(VmaEntry *e) +{ + return ((e->flags & MAP_ANONYMOUS) && + (e->flags & MAP_PRIVATE) && + !(e->flags & MAP_LOCKED) && + !(vma_entry_is(e, VMA_AREA_VDSO)) && + !(vma_entry_is(e, VMA_AREA_VSYSCALL))); +} + +#endif /* __CR_VMA_H__ */ diff --git a/CRIU_code/criu/include/xmalloc.h b/CRIU_code/criu/include/xmalloc.h new file mode 100644 index 0000000..09838b5 --- /dev/null +++ b/CRIU_code/criu/include/xmalloc.h @@ -0,0 +1,2 @@ +#include "log.h" +#include "common/xmalloc.h" diff --git a/CRIU_code/criu/ipc_ns.c b/CRIU_code/criu/ipc_ns.c new file mode 100644 index 0000000..d8590fa --- /dev/null +++ b/CRIU_code/criu/ipc_ns.c @@ -0,0 +1,946 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util.h" +#include "cr_options.h" +#include "imgset.h" +#include "namespaces.h" +#include "sysctl.h" +#include "ipc_ns.h" +#include "shmem.h" + +#include "protobuf.h" +#include "images/ipc-var.pb-c.h" +#include "images/ipc-shm.pb-c.h" +#include "images/ipc-sem.pb-c.h" +#include "images/ipc-msg.pb-c.h" + +#if defined (__GLIBC__) && __GLIBC__ >= 2 +#define KEY __key +#else +#define KEY key +#endif + +#ifndef 
MSGMAX +#define MSGMAX 8192 +#endif + +#ifndef MSG_COPY +#define MSG_COPY 040000 +#endif + +static void pr_ipc_desc_entry(unsigned int loglevel, const IpcDescEntry *desc) +{ + print_on_level(loglevel, "id: %-10d key: %#08x uid: %-10d gid: %-10d " + "cuid: %-10d cgid: %-10d mode: %-10o ", + desc->id, desc->key, desc->uid, desc->gid, + desc->cuid, desc->cgid, desc->mode); +} + +static void fill_ipc_desc(int id, IpcDescEntry *desc, const struct ipc_perm *ipcp) +{ + desc->id = id; + desc->key = ipcp->KEY; + desc->uid = userns_uid(ipcp->uid); + desc->gid = userns_gid(ipcp->gid); + desc->cuid = userns_uid(ipcp->cuid); + desc->cgid = userns_gid(ipcp->cgid); + desc->mode = ipcp->mode; +} + +static void pr_ipc_sem_array(unsigned int loglevel, int nr, u16 *values) +{ + while (nr--) + print_on_level(loglevel, " %-5d", values[nr]); + print_on_level(loglevel, "\n"); +} + +#define pr_info_ipc_sem_array(nr, values) pr_ipc_sem_array(LOG_INFO, nr, values) + +static void pr_info_ipc_sem_entry(const IpcSemEntry *sem) +{ + pr_ipc_desc_entry(LOG_INFO, sem->desc); + print_on_level(LOG_INFO, "nsems: %-10d\n", sem->nsems); +} + +static int dump_ipc_sem_set(struct cr_img *img, const IpcSemEntry *sem) +{ + size_t rounded; + int ret, size; + u16 *values; + + size = sizeof(u16) * sem->nsems; + rounded = round_up(size, sizeof(u64)); + values = xmalloc(rounded); + if (values == NULL) { + pr_err("Failed to allocate memory for semaphore set values\n"); + ret = -ENOMEM; + goto out; + } + ret = semctl(sem->desc->id, 0, GETALL, values); + if (ret < 0) { + pr_perror("Failed to get semaphore set values"); + ret = -errno; + goto out; + } + pr_info_ipc_sem_array(sem->nsems, values); + + memzero((void *)values + size, rounded - size); + ret = write_img_buf(img, values, rounded); + if (ret < 0) { + pr_err("Failed to write IPC message data\n"); + goto out; + } +out: + xfree(values); + return ret; +} + +static int dump_ipc_sem_desc(struct cr_img *img, int id, const struct semid_ds *ds) +{ + IpcSemEntry sem 
= IPC_SEM_ENTRY__INIT; + IpcDescEntry desc = IPC_DESC_ENTRY__INIT; + int ret; + + sem.desc = &desc; + sem.nsems = ds->sem_nsems; + + fill_ipc_desc(id, sem.desc, &ds->sem_perm); + pr_info_ipc_sem_entry(&sem); + + ret = pb_write_one(img, &sem, PB_IPC_SEM); + if (ret < 0) { + pr_err("Failed to write IPC semaphores set\n"); + return ret; + } + return dump_ipc_sem_set(img, &sem); +} + +static int dump_ipc_sem(struct cr_img *img) +{ + int i, maxid; + struct seminfo info; + int slot; + + maxid = semctl(0, 0, SEM_INFO, &info); + if (maxid < 0) { + pr_perror("semctl failed"); + return -errno; + } + + pr_info("IPC semaphore sets: %d\n", info.semusz); + for (i = 0, slot = 0; i <= maxid; i++) { + struct semid_ds ds; + int id, ret; + + id = semctl(i, 0, SEM_STAT, &ds); + if (id < 0) { + if (errno == EINVAL) + continue; + pr_perror("Failed to get stats for IPC semaphore set"); + break; + } + ret = dump_ipc_sem_desc(img, id, &ds); + if (!ret) + slot++; + } + if (slot != info.semusz) { + pr_err("Failed to collect %d (only %d succeeded)\n", info.semusz, slot); + return -EFAULT; + } + return info.semusz; +} + +static void pr_info_ipc_msg(int nr, const IpcMsg *msg) +{ + print_on_level(LOG_INFO, " %-5d: type: %-20"PRId64" size: %-10d\n", + nr++, msg->mtype, msg->msize); +} + +static void pr_info_ipc_msg_entry(const IpcMsgEntry *msg) +{ + pr_ipc_desc_entry(LOG_INFO, msg->desc); + print_on_level(LOG_INFO, "qbytes: %-10d qnum: %-10d\n", + msg->qbytes, msg->qnum); +} + +static int dump_ipc_msg_queue_messages(struct cr_img *img, const IpcMsgEntry *msq, + unsigned int msg_nr) +{ + struct msgbuf *message = NULL; + unsigned int msgmax; + int ret, msg_cnt = 0; + struct sysctl_req req[] = { + { "kernel/msgmax", &msgmax, CTL_U32 }, + }; + + ret = sysctl_op(req, ARRAY_SIZE(req), CTL_READ, CLONE_NEWIPC); + if (ret < 0) { + pr_err("Failed to read max IPC message size\n"); + goto err; + } + + msgmax += sizeof(struct msgbuf); + message = xmalloc(round_up(msgmax, sizeof(u64))); + if (message == NULL) 
{ + pr_err("Failed to allocate memory for IPC message\n"); + return -ENOMEM; + } + + for (msg_cnt = 0; msg_cnt < msg_nr; msg_cnt++) { + IpcMsg msg = IPC_MSG__INIT; + size_t rounded; + + ret = msgrcv(msq->desc->id, message, msgmax, msg_cnt, IPC_NOWAIT | MSG_COPY); + if (ret < 0) { + pr_perror("Failed to copy IPC message"); + goto err; + } + + msg.msize = ret; + msg.mtype = message->mtype; + + pr_info_ipc_msg(msg_cnt, &msg); + + ret = pb_write_one(img, &msg, PB_IPCNS_MSG); + if (ret < 0) { + pr_err("Failed to write IPC message header\n"); + break; + } + + rounded = round_up(msg.msize, sizeof(u64)); + memzero(((void *)message->mtext + msg.msize), rounded - msg.msize); + ret = write_img_buf(img, message->mtext, rounded); + if (ret < 0) { + pr_err("Failed to write IPC message data\n"); + break; + } + } + ret = 0; +err: + xfree(message); + return ret; +} + +static int dump_ipc_msg_queue(struct cr_img *img, int id, const struct msqid_ds *ds) +{ + IpcMsgEntry msg = IPC_MSG_ENTRY__INIT; + IpcDescEntry desc = IPC_DESC_ENTRY__INIT; + int ret; + + msg.desc = &desc; + fill_ipc_desc(id, msg.desc, &ds->msg_perm); + msg.qbytes = ds->msg_qbytes; + msg.qnum = ds->msg_qnum; + + pr_info_ipc_msg_entry(&msg); + + ret = pb_write_one(img, &msg, PB_IPCNS_MSG_ENT); + if (ret < 0) { + pr_err("Failed to write IPC message queue\n"); + return ret; + } + return dump_ipc_msg_queue_messages(img, &msg, ds->msg_qnum); +} + +static int dump_ipc_msg(struct cr_img *img) +{ + int i, maxid; + struct msginfo info; + int slot; + + maxid = msgctl(0, MSG_INFO, (struct msqid_ds *)&info); + if (maxid < 0) { + pr_perror("msgctl failed"); + return -errno; + } + + pr_info("IPC message queues: %d\n", info.msgpool); + for (i = 0, slot = 0; i <= maxid; i++) { + struct msqid_ds ds; + int id, ret; + + id = msgctl(i, MSG_STAT, &ds); + if (id < 0) { + if (errno == EINVAL) + continue; + pr_perror("Failed to get stats for IPC message queue"); + break; + } + ret = dump_ipc_msg_queue(img, id, &ds); + if (!ret) + slot++; + } 
+ if (slot != info.msgpool) { + pr_err("Failed to collect %d message queues (only %d succeeded)\n", info.msgpool, slot); + return -EFAULT; + } + return info.msgpool; +} + +static void pr_info_ipc_shm(const IpcShmEntry *shm) +{ + pr_ipc_desc_entry(LOG_INFO, shm->desc); + print_on_level(LOG_INFO, "size: %-10"PRIu64"\n", shm->size); +} + +#define NR_MANDATORY_IPC_SYSCTLS 9 + +static int ipc_sysctl_req(IpcVarEntry *e, int op) +{ + struct sysctl_req req[] = { + { "kernel/sem", e->sem_ctls, CTL_U32A(e->n_sem_ctls) }, + { "kernel/msgmax", &e->msg_ctlmax, CTL_U32 }, + { "kernel/msgmnb", &e->msg_ctlmnb, CTL_U32 }, + { "kernel/auto_msgmni", &e->auto_msgmni, CTL_U32 }, + { "kernel/msgmni", &e->msg_ctlmni, CTL_U32 }, + { "kernel/shmmax", &e->shm_ctlmax, CTL_U64 }, + { "kernel/shmall", &e->shm_ctlall, CTL_U64 }, + { "kernel/shmmni", &e->shm_ctlmni, CTL_U32 }, + { "kernel/shm_rmid_forced", &e->shm_rmid_forced, CTL_U32 }, + /* We have 9 mandatory sysctls above and 8 optional below */ + { "fs/mqueue/queues_max", &e->mq_queues_max, CTL_U32 }, + { "fs/mqueue/msg_max", &e->mq_msg_max, CTL_U32 }, + { "fs/mqueue/msgsize_max", &e->mq_msgsize_max, CTL_U32 }, + { "fs/mqueue/msg_default", &e->mq_msg_default, CTL_U32 }, + { "fs/mqueue/msgsize_default", &e->mq_msgsize_default, CTL_U32 }, + { "kernel/msg_next_id", &e->msg_next_id, CTL_U32 }, + { "kernel/sem_next_id", &e->sem_next_id, CTL_U32 }, + { "kernel/shm_next_id", &e->shm_next_id, CTL_U32 }, + }; + + int nr = NR_MANDATORY_IPC_SYSCTLS; + + /* Skip sysctls which can't be set or haven't existed on dump */ + if (access("/proc/sys/fs/mqueue", X_OK)) + pr_info("Mqueue sysctls are missing\n"); + else { + nr += 3; + if (e->has_mq_msg_default) { + req[nr++] = req[12]; + req[nr++] = req[13]; + } + } + if (e->has_msg_next_id) + req[nr++] = req[14]; + if (e->has_sem_next_id) + req[nr++] = req[15]; + if (e->has_shm_next_id) + req[nr++] = req[16]; + + return sysctl_op(req, nr, op, CLONE_NEWIPC); +} + +static int dump_ipc_shm_pages(const IpcShmEntry 
*shm) +{ + int ret; + void *data; + + data = shmat(shm->desc->id, NULL, SHM_RDONLY); + if (data == (void *)-1) { + pr_perror("Failed to attach IPC shared memory"); + return -errno; + } + + ret = dump_one_sysv_shmem(data, shm->size, shm->desc->id); + + if (shmdt(data)) { + pr_perror("Failed to detach IPC shared memory"); + return -errno; + } + return ret; +} + +static int dump_ipc_shm_seg(struct cr_img *img, int id, const struct shmid_ds *ds) +{ + IpcShmEntry shm = IPC_SHM_ENTRY__INIT; + IpcDescEntry desc = IPC_DESC_ENTRY__INIT; + int ret; + + shm.desc = &desc; + shm.size = ds->shm_segsz; + shm.has_in_pagemaps = true; + shm.in_pagemaps = true; + fill_ipc_desc(id, shm.desc, &ds->shm_perm); + pr_info_ipc_shm(&shm); + + ret = pb_write_one(img, &shm, PB_IPC_SHM); + if (ret < 0) { + pr_err("Failed to write IPC shared memory segment\n"); + return ret; + } + return dump_ipc_shm_pages(&shm); +} + +static int dump_ipc_shm(struct cr_img *img) +{ + int i, maxid, slot; + struct shm_info info; + + maxid = shmctl(0, SHM_INFO, (void *)&info); + if (maxid < 0) { + pr_perror("shmctl(SHM_INFO) failed"); + return -errno; + } + + pr_info("IPC shared memory segments: %d\n", info.used_ids); + for (i = 0, slot = 0; i <= maxid; i++) { + struct shmid_ds ds; + int id, ret; + + id = shmctl(i, SHM_STAT, &ds); + if (id < 0) { + if (errno == EINVAL) + continue; + pr_perror("Failed to get stats for IPC shared memory"); + break; + } + + ret = dump_ipc_shm_seg(img, id, &ds); + if (ret < 0) + return ret; + slot++; + } + if (slot != info.used_ids) { + pr_err("Failed to collect %d (only %d succeeded)\n", + info.used_ids, slot); + return -EFAULT; + } + return 0; +} + +static int dump_ipc_var(struct cr_img *img) +{ + IpcVarEntry var = IPC_VAR_ENTRY__INIT; + int ret = -1; + + var.n_sem_ctls = 4; + var.sem_ctls = xmalloc(pb_repeated_size(&var, sem_ctls)); + if (!var.sem_ctls) + goto err; + var.has_mq_msg_default = true; + var.has_mq_msgsize_default = true; + var.has_msg_next_id = true; + 
var.has_sem_next_id = true; + var.has_shm_next_id = true; + + ret = ipc_sysctl_req(&var, CTL_READ); + if (ret < 0) { + pr_err("Failed to read IPC variables\n"); + goto err; + } + + /* + * One can not write to msg_next_xxx sysctls -1, + * which is their initial value + */ + if (var.msg_next_id == -1) + var.has_msg_next_id = false; + if (var.sem_next_id == -1) + var.has_sem_next_id = false; + if (var.shm_next_id == -1) + var.has_shm_next_id = false; + + ret = pb_write_one(img, &var, PB_IPC_VAR); + if (ret < 0) { + pr_err("Failed to write IPC variables\n"); + goto err; + } + +err: + xfree(var.sem_ctls); + return ret; +} + +static int dump_ipc_data(const struct cr_imgset *imgset) +{ + int ret; + + ret = dump_ipc_var(img_from_set(imgset, CR_FD_IPC_VAR)); + if (ret < 0) + return ret; + ret = dump_ipc_shm(img_from_set(imgset, CR_FD_IPCNS_SHM)); + if (ret < 0) + return ret; + ret = dump_ipc_msg(img_from_set(imgset, CR_FD_IPCNS_MSG)); + if (ret < 0) + return ret; + ret = dump_ipc_sem(img_from_set(imgset, CR_FD_IPCNS_SEM)); + if (ret < 0) + return ret; + return 0; +} + +int dump_ipc_ns(int ns_id) +{ + int ret; + struct cr_imgset *imgset; + + imgset = cr_imgset_open(ns_id, IPCNS, O_DUMP); + if (imgset == NULL) + return -1; + + ret = dump_ipc_data(imgset); + if (ret < 0) { + pr_err("Failed to write IPC namespace data\n"); + goto err; + } + +err: + close_cr_imgset(&imgset); + return ret < 0 ? 
-1 : 0;
+}
+
+/*
+ * Read the saved values of one semaphore set from the image and
+ * install them with semctl(SETALL).
+ *
+ * The buffer read is nsems u16 values rounded up to a multiple of
+ * sizeof(u64).
+ *
+ * Returns the semctl() result on success, a negative error code
+ * otherwise.
+ */
+static int prepare_ipc_sem_values(struct cr_img *img, const IpcSemEntry *sem)
+{
+	int ret, size;
+	u16 *values;
+
+	size = round_up(sizeof(u16) * sem->nsems, sizeof(u64));
+	values = xmalloc(size);
+	if (values == NULL) {
+		pr_err("Failed to allocate memory for semaphores set values\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = read_img_buf(img, values, size);
+	if (ret < 0) {
+		/* This is an image read failure, not an allocation failure */
+		pr_err("Failed to read semaphores set values\n");
+		ret = -EIO;
+		goto out;
+	}
+
+	pr_info_ipc_sem_array(sem->nsems, values);
+
+	ret = semctl(sem->desc->id, 0, SETALL, values);
+	if (ret < 0) {
+		/* Latch errno first: logging may overwrite it */
+		ret = -errno;
+		pr_perror("Failed to set semaphores set values");
+	}
+out:
+	xfree(values);
+	return ret;
+}
+
+/*
+ * Recreate one semaphore set under its original id: request the id
+ * through the kernel/sem_next_id sysctl, create the set with semget(),
+ * verify the id, restore uid/gid with IPC_STAT + IPC_SET and finally
+ * load the semaphore values from the image.
+ *
+ * Returns 0 on success, a negative error code otherwise.
+ */
+static int prepare_ipc_sem_desc(struct cr_img *img, const IpcSemEntry *sem)
+{
+	int ret, id;
+	struct sysctl_req req[] = {
+		{ "kernel/sem_next_id", &sem->desc->id, CTL_U32 },
+	};
+	struct semid_ds semid;
+
+	ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWIPC);
+	if (ret < 0) {
+		pr_err("Failed to set desired IPC sem ID\n");
+		return ret;
+	}
+
+	id = semget(sem->desc->key, sem->nsems,
+		     sem->desc->mode | IPC_CREAT | IPC_EXCL);
+	if (id == -1) {
+		pr_perror("Failed to create sem set");
+		return -errno;
+	}
+
+	if (id != sem->desc->id) {
+		pr_err("Failed to restore sem id (%d instead of %d)\n",
+							id, sem->desc->id);
+		return -EFAULT;
+	}
+
+	ret = semctl(id, sem->nsems, IPC_STAT, &semid);
+	if (ret == -1) {
+		pr_err("Failed to get sem stat structure\n");
+		return -EFAULT;
+	}
+
+	semid.sem_perm.uid = sem->desc->uid;
+	semid.sem_perm.gid = sem->desc->gid;
+
+	ret = semctl(id, sem->nsems, IPC_SET, &semid);
+	if (ret == -1) {
+		pr_err("Failed to set sem uid and gid\n");
+		return -EFAULT;
+	}
+
+	ret = prepare_ipc_sem_values(img, sem);
+	if (ret < 0) {
+		pr_err("Failed to update sem pages\n");
+		return ret;
+	}
+	return 0;
+}
+
+static int prepare_ipc_sem(int pid)
+{
+	int ret;
+	struct cr_img *img;
+
+	pr_info("Restoring IPC semaphores 
sets\n");
+	img = open_image(CR_FD_IPCNS_SEM, O_RSTR, pid);
+	if (!img)
+		return -1;
+
+	while (1) {
+		IpcSemEntry *sem;
+
+		ret = pb_read_one_eof(img, &sem, PB_IPC_SEM);
+		if (ret < 0) {
+			ret = -EIO;
+			goto err;
+		}
+		if (ret == 0)
+			break;
+
+		pr_info_ipc_sem_entry(sem);
+
+		ret = prepare_ipc_sem_desc(img, sem);
+		ipc_sem_entry__free_unpacked(sem, NULL);
+
+		if (ret < 0) {
+			pr_err("Failed to prepare semaphores set\n");
+			goto err;
+		}
+	}
+
+	close_image(img);
+	return 0;
+
+err:
+	close_image(img);
+	return ret;
+}
+
+/*
+ * Re-send the msq->qnum messages stored in the image to the freshly
+ * created queue msq->desc->id.
+ *
+ * Each message is an IpcMsg header followed by its payload padded up
+ * to a multiple of sizeof(u64). Payloads larger than MSGMAX are
+ * rejected.
+ *
+ * Returns 0 on success, a negative value otherwise.
+ */
+static int prepare_ipc_msg_queue_messages(struct cr_img *img, const IpcMsgEntry *msq)
+{
+	IpcMsg *msg = NULL;
+	int msg_nr = 0;
+	int ret = 0;
+
+	while (msg_nr < msq->qnum) {
+		struct msgbuf {
+			long mtype;
+			char mtext[MSGMAX];
+		} data;
+
+		ret = pb_read_one(img, &msg, PB_IPCNS_MSG);
+		if (ret <= 0)
+			return -EIO;
+
+		pr_info_ipc_msg(msg_nr, msg);
+
+		if (msg->msize > MSGMAX) {
+			ret = -1;
+			pr_err("Unsupported message size: %d (MAX: %d)\n",
+						msg->msize, MSGMAX);
+			break;
+		}
+
+		ret = read_img_buf(img, data.mtext, round_up(msg->msize, sizeof(u64)));
+		if (ret < 0) {
+			pr_err("Failed to read IPC message data\n");
+			break;
+		}
+
+		data.mtype = msg->mtype;
+		ret = msgsnd(msq->desc->id, &data, msg->msize, IPC_NOWAIT);
+		if (ret < 0) {
+			/* Latch errno first: logging may overwrite it */
+			ret = -errno;
+			pr_perror("Failed to send IPC message");
+			break;
+		}
+		msg_nr++;
+
+		/*
+		 * Release the header before the next pb_read_one()
+		 * overwrites the pointer, otherwise every message but
+		 * the last one is leaked.
+		 */
+		ipc_msg__free_unpacked(msg, NULL);
+		msg = NULL;
+	}
+
+	/* Only set here when the loop was left through an error break */
+	if (msg)
+		ipc_msg__free_unpacked(msg, NULL);
+	return ret;
+}
+
+static int prepare_ipc_msg_queue(struct cr_img *img, const IpcMsgEntry *msq)
+{
+	int ret, id;
+	struct sysctl_req req[] = {
+		{ "kernel/msg_next_id", &msq->desc->id, CTL_U32 },
+	};
+	struct msqid_ds msqid;
+
+	ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWIPC);
+	if (ret < 0) {
+		pr_err("Failed to set desired IPC msg ID\n");
+		return ret;
+	}
+
+	id = msgget(msq->desc->key, msq->desc->mode | IPC_CREAT | IPC_EXCL);
+	if (id == -1) {
+		pr_perror("Failed to create msg set");
+		return -errno;
+	}
+
+	if (id != msq->desc->id) {
+		pr_err("Failed to 
restore msg id (%d instead of %d)\n",
+							id, msq->desc->id);
+		return -EFAULT;
+	}
+
+	ret = msgctl(id, IPC_STAT, &msqid);
+	if (ret == -1) {
+		pr_err("Failed to get msq stat structure\n");
+		return -EFAULT;
+	}
+
+	msqid.msg_perm.uid = msq->desc->uid;
+	msqid.msg_perm.gid = msq->desc->gid;
+
+	ret = msgctl(id, IPC_SET, &msqid);
+	if (ret == -1) {
+		pr_err("Failed to set msq queue uid and gid\n");
+		return -EFAULT;
+	}
+
+	ret = prepare_ipc_msg_queue_messages(img, msq);
+	if (ret < 0) {
+		pr_err("Failed to update message queue messages\n");
+		return ret;
+	}
+	return 0;
+}
+
+static int prepare_ipc_msg(int pid)
+{
+	int ret;
+	struct cr_img *img;
+
+	pr_info("Restoring IPC message queues\n");
+	img = open_image(CR_FD_IPCNS_MSG, O_RSTR, pid);
+	if (!img)
+		return -1;
+
+	while (1) {
+		IpcMsgEntry *msq;
+
+		ret = pb_read_one_eof(img, &msq, PB_IPCNS_MSG_ENT);
+		if (ret < 0) {
+			pr_err("Failed to read IPC messages queue\n");
+			ret = -EIO;
+			goto err;
+		}
+		if (ret == 0)
+			break;
+
+		pr_info_ipc_msg_entry(msq);
+
+		ret = prepare_ipc_msg_queue(img, msq);
+		ipc_msg_entry__free_unpacked(msq, NULL);
+
+		if (ret < 0) {
+			pr_err("Failed to prepare messages queue\n");
+			goto err;
+		}
+	}
+
+	close_image(img);
+	return 0;
+err:
+	close_image(img);
+	return ret;
+}
+
+/*
+ * Fill an attached SysV shm segment from the raw image descriptor.
+ *
+ * Reads shm->size rounded up to sizeof(u32) bytes into data, looping
+ * over short reads.
+ *
+ * Returns 0 once everything is read, -1 on a read error or when the
+ * image ends before the whole segment has been read.
+ */
+static int restore_content(void *data, struct cr_img *img, const IpcShmEntry *shm)
+{
+	int ifd;
+	ssize_t size, off;
+
+	ifd = img_raw_fd(img);
+	size = round_up(shm->size, sizeof(u32));
+	off = 0;
+	while (off < size) {
+		ssize_t ret;
+
+		ret = read(ifd, data + off, size - off);
+		if (ret < 0) {
+			pr_perror("Failed to read IPC shared memory data");
+			return -1;
+		}
+		if (ret == 0) {
+			/*
+			 * EOF before the segment is complete: errno is
+			 * meaningless here and returning 0 would report
+			 * a truncated image as a successful restore.
+			 */
+			pr_err("Unexpected EOF in IPC shared memory data (%zd of %zd bytes)\n",
+			       off, size);
+			return -1;
+		}
+
+		off += ret;
+	}
+
+	return 0;
+}
+
+static int prepare_ipc_shm_pages(struct cr_img *img, const IpcShmEntry *shm)
+{
+	int ret;
+	void *data;
+
+	data = shmat(shm->desc->id, NULL, 0);
+	if (data == (void *)-1) {
+		pr_perror("Failed to attach IPC shared memory");
+		return -errno;
+	}
+
+	if (shm->has_in_pagemaps && shm->in_pagemaps)
+		ret 
= restore_sysv_shmem_content(data, shm->size, shm->desc->id); + else + ret = restore_content(data, img, shm); + + if (shmdt(data)) { + pr_perror("Failed to detach IPC shared memory"); + return -errno; + } + return ret; +} + +static int prepare_ipc_shm_seg(struct cr_img *img, const IpcShmEntry *shm) +{ + int ret, id; + struct sysctl_req req[] = { + { "kernel/shm_next_id", &shm->desc->id, CTL_U32 }, + }; + struct shmid_ds shmid; + + if (collect_sysv_shmem(shm->desc->id, shm->size)) + return -1; + + ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWIPC); + if (ret < 0) { + pr_err("Failed to set desired IPC shm ID\n"); + return ret; + } + + id = shmget(shm->desc->key, shm->size, + shm->desc->mode | IPC_CREAT | IPC_EXCL); + if (id == -1) { + pr_perror("Failed to create shm set"); + return -errno; + } + + if (id != shm->desc->id) { + pr_err("Failed to restore shm id (%d instead of %d)\n", + id, shm->desc->id); + return -EFAULT; + } + + ret = shmctl(id, IPC_STAT, &shmid); + if (ret == -1) { + pr_err("Failed to get shm stat structure\n"); + return -EFAULT; + } + + shmid.shm_perm.uid = shm->desc->uid; + shmid.shm_perm.gid = shm->desc->gid; + + ret = shmctl(id, IPC_SET, &shmid); + if (ret == -1) { + pr_err("Failed to set shm uid and gid\n"); + return -EFAULT; + } + + ret = prepare_ipc_shm_pages(img, shm); + if (ret < 0) { + pr_err("Failed to update shm pages\n"); + return ret; + } + return 0; +} + +static int prepare_ipc_shm(int pid) +{ + int ret; + struct cr_img *img; + + pr_info("Restoring IPC shared memory\n"); + img = open_image(CR_FD_IPCNS_SHM, O_RSTR, pid); + if (!img) + return -1; + + while (1) { + IpcShmEntry *shm; + + ret = pb_read_one_eof(img, &shm, PB_IPC_SHM); + if (ret < 0) { + pr_err("Failed to read IPC shared memory segment\n"); + ret = -EIO; + goto err; + } + if (ret == 0) + break; + + pr_info_ipc_shm(shm); + + ret = prepare_ipc_shm_seg(img, shm); + ipc_shm_entry__free_unpacked(shm, NULL); + + if (ret < 0) { + pr_err("Failed to prepare shm 
segment\n"); + goto err; + } + } + + close_image(img); + return 0; +err: + close_image(img); + return ret; +} + +static int prepare_ipc_var(int pid) +{ + int ret; + struct cr_img *img; + IpcVarEntry *var; + + pr_info("Restoring IPC variables\n"); + img = open_image(CR_FD_IPC_VAR, O_RSTR, pid); + if (!img) + return -1; + + ret = pb_read_one(img, &var, PB_IPC_VAR); + close_image(img); + if (ret <= 0) { + pr_err("Failed to read IPC namespace variables\n"); + return -EFAULT; + } + + ret = ipc_sysctl_req(var, CTL_WRITE); + ipc_var_entry__free_unpacked(var, NULL); + + if (ret < 0) { + pr_err("Failed to prepare IPC namespace variables\n"); + return -EFAULT; + } + + return 0; +} + +int prepare_ipc_ns(int pid) +{ + int ret; + + pr_info("Restoring IPC namespace\n"); + ret = prepare_ipc_var(pid); + if (ret < 0) + return ret; + ret = prepare_ipc_shm(pid); + if (ret < 0) + return ret; + ret = prepare_ipc_msg(pid); + if (ret < 0) + return ret; + ret = prepare_ipc_sem(pid); + if (ret < 0) + return ret; + return 0; +} + +struct ns_desc ipc_ns_desc = NS_DESC_ENTRY(CLONE_NEWIPC, "ipc"); diff --git a/CRIU_code/criu/irmap.c b/CRIU_code/criu/irmap.c new file mode 100644 index 0000000..e729842 --- /dev/null +++ b/CRIU_code/criu/irmap.c @@ -0,0 +1,494 @@ +/* + * IRMAP -- inode reverse mapping. + * + * Helps us to map inode number (and device) back to path + * so that we can restore inotify/fanotify-s. + * + * Scanning _is_ slow, so we limit it with hints, which are + * heuristically known places where notifies are typically put. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "xmalloc.h" +#include "irmap.h" +#include "mount.h" +#include "log.h" +#include "util.h" +#include "image.h" +#include "stats.h" +#include "pstree.h" +#include "cr_options.h" + +#include "protobuf.h" +#include "images/fsnotify.pb-c.h" +#include "images/fh.pb-c.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "irmap: " + +#define IRMAP_CACHE_BITS 5 +#define IRMAP_CACHE_SIZE (1 << IRMAP_CACHE_BITS) +#define IRMAP_CACHE_MASK (IRMAP_CACHE_SIZE - 1) + +static inline int irmap_hashfn(unsigned int s_dev, unsigned long i_ino) +{ + return (s_dev + i_ino) & IRMAP_CACHE_MASK; +} + +struct irmap { + unsigned int dev; + unsigned long ino; + char *path; + struct irmap *next; + bool revalidate; + int nr_kids; + struct irmap *kids; +}; + +static struct irmap *cache[IRMAP_CACHE_SIZE]; + +static struct irmap hints[] = { + { .path = "/etc", .nr_kids = -1, }, + { .path = "/var/spool", .nr_kids = -1, }, + { .path = "/var/log", .nr_kids = -1, }, + { .path = "/usr/share/dbus-1/system-services", .nr_kids = -1 }, + { .path = "/var/lib/polkit-1/localauthority", .nr_kids = -1 }, + { .path = "/usr/share/polkit-1/actions", .nr_kids = -1 }, + { .path = "/lib/udev", .nr_kids = -1, }, + { .path = "/.", .nr_kids = 0, }, + { .path = "/no-such-path", .nr_kids = -1, }, + { }, +}; + +/* + * Update inode (and device) number and cache the entry + */ +static int irmap_update_stat(struct irmap *i) +{ + struct stat st; + int mntns_root; + unsigned hv; + + if (i->ino) + return 0; + + mntns_root = get_service_fd(ROOT_FD_OFF); + + pr_debug("Refresh stat for %s\n", i->path); + if (fstatat(mntns_root, i->path + 1, &st, AT_SYMLINK_NOFOLLOW)) { + pr_perror("Can't stat %s", i->path); + return -1; + } + + i->revalidate = false; + i->dev = MKKDEV(major(st.st_dev), minor(st.st_dev)); + i->ino = st.st_ino; + if (!S_ISDIR(st.st_mode)) + i->nr_kids = 0; /* don't irmap_update_dir */ + + hv = irmap_hashfn(i->dev, i->ino); + i->next = 
cache[hv];
+	cache[hv] = i;
+
+	return 0;
+}
+
+/*
+ * Update list of children, but don't cache any. Later
+ * we'll scan them one-by-one and cache.
+ *
+ * Does nothing when the list is already filled (nr_kids >= 0).
+ * On failure the partially built list is released and t->kids is
+ * reset, so a later call can start over.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int irmap_update_dir(struct irmap *t)
+{
+	int fd, nr = 0, mntns_root;
+	DIR *dfd;
+	struct dirent *de;
+
+	if (t->nr_kids >= 0)
+		return 0;
+
+	mntns_root = get_service_fd(ROOT_FD_OFF);
+
+	pr_debug("Refilling %s dir\n", t->path);
+	fd = openat(mntns_root, t->path + 1, O_RDONLY);
+	if (fd < 0) {
+		pr_perror("Can't open %s", t->path);
+		return -1;
+	}
+
+	dfd = fdopendir(fd);
+	if (!dfd) {
+		pr_perror("Can't opendir %s", t->path);
+		/* fdopendir() failed, so fd is still ours to close */
+		close(fd);
+		return -1;
+	}
+
+	errno = 0;
+	while ((de = readdir(dfd)) != NULL) {
+		struct irmap *k;
+
+		if (dir_dots(de))
+			continue;
+
+		if (xrealloc_safe(&t->kids, (nr + 1) * sizeof(struct irmap)))
+			goto out_err;
+
+		k = &t->kids[nr];
+
+		k->kids = NULL;		/* for xrealloc above */
+		k->ino = 0;		/* for irmap_update_stat */
+		k->nr_kids = -1;	/* for irmap_update_dir */
+		k->path = xsprintf("%s/%s", t->path, de->d_name);
+		if (!k->path)
+			goto out_err;
+
+		/* Count the kid only once it is fully set up */
+		nr++;
+	}
+
+	if (errno) {
+		pr_perror("Readdir failed");
+		goto out_err;
+	}
+
+	/*
+	 * closedir() also closes the descriptor handed to fdopendir(),
+	 * so fd must not be closed a second time.
+	 */
+	closedir(dfd);
+	t->nr_kids = nr;
+	return 0;
+
+out_err:
+	/* Release the paths of the kids collected so far */
+	while (nr-- > 0)
+		xfree(t->kids[nr].path);
+	xfree(t->kids);
+	/* nr_kids is still -1: don't leave a dangling pointer for a retry */
+	t->kids = NULL;
+	closedir(dfd);
+	return -1;
+}
+
+/*
+ * Depth-first search for dev:ino starting at t: refresh t's stat,
+ * compare, then refill t's children and recurse into each of them.
+ *
+ * Returns the matching entry, or NULL when nothing matches or a
+ * stat/readdir step fails.
+ */
+static struct irmap *irmap_scan(struct irmap *t, unsigned int dev, unsigned long ino)
+{
+	struct irmap *c;
+	int i;
+
+	if (irmap_update_stat(t))
+		return NULL;
+
+	if (t->dev == dev && t->ino == ino)
+		return t;
+
+	if (irmap_update_dir(t))
+		return NULL;
+
+	for (i = 0; i < t->nr_kids; i++) {
+		c = irmap_scan(&t->kids[i], dev, ino);
+		if (c)
+			return c;
+	}
+
+	return NULL;
+}
+
+static int irmap_revalidate(struct irmap *c, struct irmap **p)
+{
+	struct stat st;
+	int mntns_root;
+
+	mntns_root = get_service_fd(ROOT_FD_OFF);
+
+	pr_debug("Revalidate stat for %s\n", c->path);
+	if (fstatat(mntns_root, c->path + 1, &st, AT_SYMLINK_NOFOLLOW)) {
+		/* File can be (re)moved, so just treat it as invalid */
+		pr_perror("Can't stat 
%s", c->path); + goto invalid; + } + + if (c->dev != MKKDEV(major(st.st_dev), minor(st.st_dev))) + goto invalid; + if (c->ino != st.st_ino) + goto invalid; + + c->revalidate = false; + return 0; + +invalid: + pr_debug("\t%x:%lx is invalid\n", c->dev, c->ino); + *p = c->next; + xfree(c->path); + xfree(c); + return 1; +} + +static bool doing_predump = false; + +char *irmap_lookup(unsigned int s_dev, unsigned long i_ino) +{ + struct irmap *c, *h, **p; + char *path = NULL; + int hv; + struct irmap_path_opt *o; + + pr_debug("Resolving %x:%lx path\n", s_dev, i_ino); + + /* + * If we're in predump, then processes already run + * and the root_item is already freed by that time. + * But the root service fd is already set by the + * irmap_predump_prep, so we just go ahead and scan. + */ + if (!doing_predump && + __mntns_get_root_fd(root_item->pid->real) < 0) + goto out; + + timing_start(TIME_IRMAP_RESOLVE); + + hv = irmap_hashfn(s_dev, i_ino); + for (p = &cache[hv]; *p; ) { + c = *p; + if (!(c->dev == s_dev && c->ino == i_ino)) { + p = &(*p)->next; + continue; + } + + if (c->revalidate && irmap_revalidate(c, p)) + continue; + + pr_debug("\tFound %s in cache\n", c->path); + path = c->path; + goto out; + } + + /* Let's scan any user provided paths first; since the user told us + * about them, hopefully they're more interesting than our hints. 
+ */ + list_for_each_entry(o, &opts.irmap_scan_paths, node) { + c = irmap_scan(o->ir, s_dev, i_ino); + if (c) { + pr_debug("\tScanned %s\n", c->path); + path = c->path; + goto out; + } + } + + for (h = hints; h->path; h++) { + pr_debug("Scanning %s hint\n", h->path); + c = irmap_scan(h, s_dev, i_ino); + if (c) { + pr_debug("\tScanned %s\n", c->path); + path = c->path; + goto out; + } + } + +out: + timing_stop(TIME_IRMAP_RESOLVE); + return path; +} + +/* + * IRMAP pre-cache -- do early irmap scan on pre-dump to reduce + * the freeze time on dump + */ + +struct irmap_predump { + unsigned int dev; + unsigned long ino; + FhEntry fh; + struct irmap_predump *next; +}; + +static struct irmap_predump *predump_queue; + +int irmap_queue_cache(unsigned int dev, unsigned long ino, + FhEntry *fh) +{ + struct irmap_predump *ip; + + ip = xmalloc(sizeof(*ip)); + if (!ip) + return -1; + + ip->dev = dev; + ip->ino = ino; + ip->fh = *fh; + ip->fh.handle = xmemdup(fh->handle, + FH_ENTRY_SIZES__min_entries * sizeof(uint64_t)); + if (!ip->fh.handle) { + xfree(ip); + return -1; + } + + pr_debug("Queue %x:%lx for pre-dump\n", dev, ino); + + ip->next = predump_queue; + predump_queue = ip; + return 0; +} + +int irmap_predump_prep(void) +{ + /* + * Tasks are about to get released soon, but + * we'll need to do FS scan for irmaps. In this + * scan we will need to know the root dir tasks + * live in. Need to make sure the respective fd + * (service) is set to that root, so that the + * scan works and doesn't race with the tasks + * dying or changind root. + */ + + doing_predump = true; + return __mntns_get_root_fd(root_item->pid->real) < 0 ? 
-1 : 0; +} + +int irmap_predump_run(void) +{ + int ret = 0; + struct cr_img *img; + struct irmap_predump *ip; + + img = open_image_at(AT_FDCWD, CR_FD_IRMAP_CACHE, O_DUMP); + if (!img) + return -1; + + pr_info("Running irmap pre-dump\n"); + + for (ip = predump_queue; ip; ip = ip->next) { + pr_debug("\tchecking %x:%lx\n", ip->dev, ip->ino); + ret = check_open_handle(ip->dev, ip->ino, &ip->fh); + if (ret) { + pr_err("Failed to resolve %x:%lx\n", ip->dev, ip->ino); + break; + } + + if (ip->fh.path) { + IrmapCacheEntry ic = IRMAP_CACHE_ENTRY__INIT; + + pr_info("Irmap cache %x:%lx -> %s\n", ip->dev, ip->ino, ip->fh.path); + ic.dev = ip->dev; + ic.inode = ip->ino; + ic.path = ip->fh.path; + + ret = pb_write_one(img, &ic, PB_IRMAP_CACHE); + if (ret) + break; + } + } + + close_image(img); + return ret; +} + +static int irmap_cache_one(IrmapCacheEntry *ie) +{ + struct irmap *ic; + unsigned hv; + + ic = xmalloc(sizeof(*ic)); + if (!ic) + return -1; + + ic->dev = ie->dev; + ic->ino = ie->inode; + ic->path = xstrdup(ie->path); + if (!ic->path) { /* check the xstrdup() copy, not the source string */ + xfree(ic); + return -1; + } + + ic->nr_kids = 0; + /* + * We've loaded entry from cache, thus we'll need to check + * whether it's still valid when find it in cache. + */ + ic->revalidate = true; + + pr_debug("Pre-cache %x:%lx -> %s\n", ic->dev, ic->ino, ic->path); + + hv = irmap_hashfn(ic->dev, ic->ino); + ic->next = cache[hv]; + cache[hv] = ic; + + return 0; +} + +static int open_irmap_cache(struct cr_img **img) +{ + int dir = AT_FDCWD; + + pr_info("Searching irmap cache in work dir\n"); +in: + *img = open_image_at(dir, CR_FD_IRMAP_CACHE, O_RSTR); + if (dir != AT_FDCWD) + close(dir); + + if (empty_image(*img)) { + close_image(*img); + if (dir == AT_FDCWD) { + pr_info("Searching irmap cache in parent\n"); + dir = openat(get_service_fd(IMG_FD_OFF), + CR_PARENT_LINK, O_RDONLY); + if (dir >= 0) + goto in; + if (errno != ENOENT) + return -1; + } + + pr_info("No irmap cache\n"); + return 0; + } + + if (!*img) + return -1; + + pr_info("... 
done\n"); + return 1; +} + +int irmap_load_cache(void) +{ + int ret; + struct cr_img *img; + + ret = open_irmap_cache(&img); + if (ret <= 0) + return ret; + + pr_info("Loading irmap cache\n"); + while (1) { + IrmapCacheEntry *ic; + + ret = pb_read_one_eof(img, &ic, PB_IRMAP_CACHE); + if (ret <= 0) + break; + + ret = irmap_cache_one(ic); + if (ret < 0) + break; + + irmap_cache_entry__free_unpacked(ic, NULL); + } + + close_image(img); + return ret; +} + +int irmap_scan_path_add(char *path) +{ + struct irmap_path_opt *o; + + o = xzalloc(sizeof(*o)); + if (!o) + return -1; + + o->ir = xzalloc(sizeof(*o->ir)); + if (!o->ir) { + xfree(o); + return -1; + } + + o->ir->path = path; + o->ir->nr_kids = -1; + list_add(&o->node, &opts.irmap_scan_paths); + return 0; +} diff --git a/CRIU_code/criu/kcmp-ids.c b/CRIU_code/criu/kcmp-ids.c new file mode 100644 index 0000000..4fde10e --- /dev/null +++ b/CRIU_code/criu/kcmp-ids.c @@ -0,0 +1,208 @@ +#include +#include +#include + +#include "log.h" +#include "xmalloc.h" + +#include "common/compiler.h" +#include "common/bug.h" + +#include "rbtree.h" +#include "kcmp-ids.h" + +/* + * We track shared files by global rbtree, where each node might + * be a root for subtree. The reason for that is the nature of data + * we obtain from operating system. + * + * Basically OS provides us two ways to distinguish files + * + * - information obtained from fstat call + * - shiny new sys_kcmp system call (which may compare the file descriptor + * pointers inside the kernel and provide us order info) + * + * So, to speedup procedure of searching for shared file descriptors + * we use both techniques. From fstat call we get that named general file + * IDs (genid) which are carried in the main rbtree. + * + * In case if two genid are the same -- we need to use a second way and + * call for sys_kcmp. 
Thus, if kernel tells us that files have identical + * genid but in real they are different from kernel point of view -- we assign + * a second unique key (subid) to such file descriptor and put it into a subtree. + * + * So the tree will look like + * + * (root) + * genid-1 + * / \ + * genid-2 genid-3 + * / \ / \ + * + * Where each genid node might be a sub-rbtree as well + * + * (genid-N) + * / \ + * subid-1 subid-2 + * / \ / \ + * + * Carrying two rbtree at once allow us to minimize the number + * of sys_kcmp syscalls, also to collect and dump file descriptors + * in one pass. + */ + +struct kid_entry { + struct rb_node node; + + struct rb_root subtree_root; + struct rb_node subtree_node; + + uint32_t subid; /* subid is always unique */ + struct kid_elem elem; +} __aligned(sizeof(long)); + +static struct kid_entry *alloc_kid_entry(struct kid_tree *tree, struct kid_elem *elem) +{ + struct kid_entry *e; + + e = xmalloc(sizeof(*e)); + if (!e) + goto err; + + e->subid = tree->subid++; + e->elem = *elem; + + /* Make sure no overflow here */ + BUG_ON(!e->subid); + + rb_init_node(&e->node); + rb_init_node(&e->subtree_node); + e->subtree_root = RB_ROOT; + rb_link_and_balance(&e->subtree_root, &e->subtree_node, + NULL, &e->subtree_root.rb_node); +err: + return e; +} + +static uint32_t kid_generate_sub(struct kid_tree *tree, struct kid_entry *e, + struct kid_elem *elem, int *new_id) +{ + struct rb_node *node = e->subtree_root.rb_node; + struct kid_entry *sub = NULL; + + struct rb_node **new = &e->subtree_root.rb_node; + struct rb_node *parent = NULL; + + BUG_ON(!node); + + while (node) { + struct kid_entry *this = rb_entry(node, struct kid_entry, subtree_node); + int ret = syscall(SYS_kcmp, this->elem.pid, elem->pid, tree->kcmp_type, + this->elem.idx, elem->idx); + + parent = *new; + if (ret == 1) + node = node->rb_left, new = &((*new)->rb_left); + else if (ret == 2) + node = node->rb_right, new = &((*new)->rb_right); + else if (ret == 0) + return this->subid; + else { + 
pr_perror("kcmp failed: pid (%d %d) type %u idx (%u %u)", + this->elem.pid, elem->pid, tree->kcmp_type, + this->elem.idx, elem->idx); + return 0; + } + } + + sub = alloc_kid_entry(tree, elem); + if (!sub) + return 0; + + rb_link_and_balance(&e->subtree_root, &sub->subtree_node, parent, new); + *new_id = 1; + return sub->subid; +} + +uint32_t kid_generate_gen(struct kid_tree *tree, + struct kid_elem *elem, int *new_id) +{ + struct rb_node *node = tree->root.rb_node; + struct kid_entry *e = NULL; + + struct rb_node **new = &tree->root.rb_node; + struct rb_node *parent = NULL; + + while (node) { + struct kid_entry *this = rb_entry(node, struct kid_entry, node); + + parent = *new; + if (elem->genid < this->elem.genid) + node = node->rb_left, new = &((*new)->rb_left); + else if (elem->genid > this->elem.genid) + node = node->rb_right, new = &((*new)->rb_right); + else + return kid_generate_sub(tree, this, elem, new_id); + } + + e = alloc_kid_entry(tree, elem); + if (!e) + return 0; + + rb_link_and_balance(&tree->root, &e->node, parent, new); + *new_id = 1; + return e->subid; +} + +static struct kid_elem *kid_lookup_epoll_tfd_sub(struct kid_tree *tree, + struct kid_entry *e, + struct kid_elem *elem, + kcmp_epoll_slot_t *slot) +{ + struct rb_node *node = e->subtree_root.rb_node; + struct rb_node **new = &e->subtree_root.rb_node; + + BUG_ON(!node); + + while (node) { + struct kid_entry *this = rb_entry(node, struct kid_entry, subtree_node); + int ret = syscall(SYS_kcmp, this->elem.pid, elem->pid, KCMP_EPOLL_TFD, + this->elem.idx, slot); + + if (ret == 1) + node = node->rb_left, new = &((*new)->rb_left); + else if (ret == 2) + node = node->rb_right, new = &((*new)->rb_right); + else if (ret == 0) + return &this->elem; + else { + pr_perror("kcmp-epoll failed: pid (%d %d) type %u idx (%u %u)", + this->elem.pid, elem->pid, KCMP_EPOLL_TFD, + this->elem.idx, elem->idx); + return NULL; + } + } + + return NULL; +} + +struct kid_elem *kid_lookup_epoll_tfd(struct kid_tree *tree, + 
struct kid_elem *elem, + kcmp_epoll_slot_t *slot) +{ + struct rb_node *node = tree->root.rb_node; + struct rb_node **new = &tree->root.rb_node; + + while (node) { + struct kid_entry *this = rb_entry(node, struct kid_entry, node); + + if (elem->genid < this->elem.genid) + node = node->rb_left, new = &((*new)->rb_left); + else if (elem->genid > this->elem.genid) + node = node->rb_right, new = &((*new)->rb_right); + else + return kid_lookup_epoll_tfd_sub(tree, this, elem, slot); + } + + return NULL; +} diff --git a/CRIU_code/criu/kerndat.c b/CRIU_code/criu/kerndat.c new file mode 100644 index 0000000..b884f6d --- /dev/null +++ b/CRIU_code/criu/kerndat.c @@ -0,0 +1,1097 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for sockaddr_in and inet_ntoa() */ +#include +#include + + +#include "common/config.h" +#include "int.h" +#include "log.h" +#include "restorer.h" +#include "kerndat.h" +#include "fs-magic.h" +#include "mem.h" +#include "common/compiler.h" +#include "sysctl.h" +#include "cr_options.h" +#include "util.h" +#include "lsm.h" +#include "proc_parse.h" +#include "sk-inet.h" +#include "sockets.h" +#include "net.h" +#include "tun.h" +#include +#include +#include "netfilter.h" +#include "fsnotify.h" +#include "linux/userfaultfd.h" +#include "prctl.h" +#include "uffd.h" +#include "vdso.h" +#include "kcmp.h" + +struct kerndat_s kdat = { +}; + +static int check_pagemap(void) +{ + int ret, fd; + u64 pfn = 0; + + fd = __open_proc(PROC_SELF, EPERM, O_RDONLY, "pagemap"); + if (fd < 0) { + if (errno == EPERM) { + pr_info("Pagemap disabled"); + kdat.pmap = PM_DISABLED; + return 0; + } + + return -1; + } + + /* Get the PFN of some present page. 
Stack is here, so try it :) */ + ret = pread(fd, &pfn, sizeof(pfn), (((unsigned long)&ret) / page_size()) * sizeof(pfn)); + if (ret != sizeof(pfn)) { + pr_perror("Can't read pagemap"); + return -1; + } + + close(fd); + + if ((pfn & PME_PFRAME_MASK) == 0) { + pr_info("Pagemap provides flags only\n"); + kdat.pmap = PM_FLAGS_ONLY; + } else { + pr_info("Pagemap is fully functional\n"); + kdat.pmap = PM_FULL; + } + + return 0; +} + +/* + * Anonymous shared mappings are backed by hidden tmpfs + * mount. Find out its dev to distinguish such mappings + * from real tmpfs files maps. + */ + +static int parse_self_maps(unsigned long vm_start, dev_t *device) +{ + FILE *maps; + char buf[1024]; + + maps = fopen_proc(PROC_SELF, "maps"); + if (maps == NULL) + return -1; + + while (fgets(buf, sizeof(buf), maps) != NULL) { + char *end, *aux; + unsigned long start; + int maj, min; + + start = strtoul(buf, &end, 16); + if (vm_start > start) + continue; + if (vm_start < start) + break; + + /* It's ours */ + aux = strchr(end + 1, ' '); /* end prot */ + aux = strchr(aux + 1, ' '); /* prot pgoff */ + aux = strchr(aux + 1, ' '); /* pgoff dev */ + + maj = strtoul(aux + 1, &end, 16); + min = strtoul(end + 1, NULL, 16); + + *device = makedev(maj, min); + fclose(maps); + return 0; + } + + fclose(maps); + return -1; +} + +static void kerndat_mmap_min_addr(void) +{ + /* From kernel's default CONFIG_LSM_MMAP_MIN_ADDR */ + static const unsigned long default_mmap_min_addr = 65536; + uint64_t value; + + struct sysctl_req req[] = { + { + .name = "vm/mmap_min_addr", + .arg = &value, + .type = CTL_U64, + }, + }; + + if (sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0)) { + pr_warn("Can't fetch %s value, use default %#lx\n", + req[0].name, (unsigned long)default_mmap_min_addr); + kdat.mmap_min_addr = default_mmap_min_addr; + return; + } + + if (value < default_mmap_min_addr) { + pr_debug("Adjust mmap_min_addr %#lx -> %#lx\n", + (unsigned long)value, + (unsigned long)default_mmap_min_addr); + 
kdat.mmap_min_addr = default_mmap_min_addr; + } else + kdat.mmap_min_addr = value; + + pr_debug("Found mmap_min_addr %#lx\n", + (unsigned long)kdat.mmap_min_addr); +} + +int kerndat_files_stat(bool early) +{ + static const uint32_t NR_OPEN_DEFAULT = 1024 * 1024; + static const uint64_t MAX_FILES_DEFAULT = 8192; + uint64_t max_files; + uint32_t nr_open; + + struct sysctl_req req[] = { + { + .name = "fs/file-max", + .arg = &max_files, + .type = CTL_U64, + }, + { + .name = "fs/nr_open", + .arg = &nr_open, + .type = CTL_U32, + }, + }; + + if (!early) { + if (sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0)) { + pr_warn("Can't fetch file_stat, using kernel defaults\n"); + nr_open = NR_OPEN_DEFAULT; + max_files = MAX_FILES_DEFAULT; + } + } else { + char buf[64]; + int fd1, fd2; + ssize_t ret; + + fd1 = open("/proc/sys/fs/file-max", O_RDONLY); + fd2 = open("/proc/sys/fs/nr_open", O_RDONLY); + + nr_open = NR_OPEN_DEFAULT; + max_files = MAX_FILES_DEFAULT; + + if (fd1 < 0 || fd2 < 0) { + pr_warn("Can't fetch file_stat, using kernel defaults\n"); + } else { + ret = read(fd1, buf, sizeof(buf) - 1); + if (ret > 0) { + buf[ret] = '\0'; + max_files = atol(buf); + } + ret = read(fd2, buf, sizeof(buf) - 1); + if (ret > 0) { + buf[ret] = '\0'; + nr_open = atol(buf); + } + } + + if (fd1 >= 0) + close(fd1); + if (fd2 >= 0) + close(fd2); + } + + kdat.sysctl_nr_open = nr_open; + kdat.files_stat_max_files = max_files; + + pr_debug("files stat: %s %lu, %s %u\n", + req[0].name, kdat.files_stat_max_files, + req[1].name, kdat.sysctl_nr_open); + + return 0; +} + +static int kerndat_get_shmemdev(void) +{ + void *map; + char maps[128]; + struct stat buf; + dev_t dev; + + map = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, 0, 0); + if (map == MAP_FAILED) { + pr_perror("Can't mmap memory for shmemdev test"); + return -1; + } + + sprintf(maps, "/proc/self/map_files/%lx-%lx", + (unsigned long)map, (unsigned long)map + page_size()); + if (stat(maps, &buf) < 0) { + int e = 
errno; + if (errno == EPERM) { + /* + * Kernel disables messing with map_files. + * OK, let's go the slower route. + */ + + if (parse_self_maps((unsigned long)map, &dev) < 0) { + pr_err("Can't read self maps\n"); + goto err; + } + } else { + pr_perror("Can't stat self map_files %d", e); + goto err; + } + } else + dev = buf.st_dev; + + munmap(map, PAGE_SIZE); + kdat.shmem_dev = dev; + pr_info("Found anon-shmem device at %"PRIx64"\n", kdat.shmem_dev); + return 0; + +err: + munmap(map, PAGE_SIZE); + return -1; +} + +static dev_t get_host_dev(unsigned int which) +{ + static struct kst { + const char *name; + const char *path; + unsigned int magic; + dev_t fs_dev; + } kstat[KERNDAT_FS_STAT_MAX] = { + [KERNDAT_FS_STAT_DEVPTS] = { + .name = "devpts", + .path = "/dev/pts", + .magic = DEVPTS_SUPER_MAGIC, + }, + [KERNDAT_FS_STAT_DEVTMPFS] = { + .name = "devtmpfs", + .path = "/dev", + .magic = TMPFS_MAGIC, + }, + [KERNDAT_FS_STAT_BINFMT_MISC] = { + .name = "binfmt_misc", + .path = "/proc/sys/fs/binfmt_misc", + .magic = BINFMTFS_MAGIC, + }, + }; + + if (which >= KERNDAT_FS_STAT_MAX) { + pr_err("Wrong fs type %u passed\n", which); + return 0; + } + + if (kstat[which].fs_dev == 0) { + struct statfs fst; + struct stat st; + + if (statfs(kstat[which].path, &fst)) { + pr_perror("Unable to statefs %s", kstat[which].path); + return 0; + } + + /* + * XXX: If the fs we need is not there, it still + * may mean that it's virtualized, but just not + * mounted on the host. 
+ */ + + if (fst.f_type != kstat[which].magic) { + pr_err("%s isn't mount on the host\n", kstat[which].name); + return 0; + } + + if (stat(kstat[which].path, &st)) { + pr_perror("Unable to stat %s", kstat[which].path); + return 0; + } + + BUG_ON(st.st_dev == 0); + kstat[which].fs_dev = st.st_dev; + } + + return kstat[which].fs_dev; +} + +int kerndat_fs_virtualized(unsigned int which, u32 kdev) +{ + dev_t host_fs_dev; + + host_fs_dev = get_host_dev(which); + if (host_fs_dev == 0) + return -1; + + return (kdev_to_odev(kdev) == host_fs_dev) ? 0 : 1; +} + +/* + * Check whether pagemap reports soft dirty bit. Kernel has + * this functionality under CONFIG_MEM_SOFT_DIRTY option. + */ + +int kerndat_get_dirty_track(void) +{ + char *map; + int pm2; + u64 pmap = 0; + int ret = -1; + + map = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (map == MAP_FAILED) { + pr_perror("Can't mmap memory for pagemap test"); + return ret; + } + + /* + * Kernel shows soft-dirty bits only if this soft-dirty + * was at least once re-set. 
(this is to be removed in + * a couple of kernel releases) + */ + ret = do_task_reset_dirty_track(getpid()); + if (ret < 0) + return ret; + if (ret == 1) + goto no_dt; + + ret = -1; + pm2 = open_proc(PROC_SELF, "pagemap"); + if (pm2 < 0) { + munmap(map, PAGE_SIZE); + return ret; + } + + map[0] = '\0'; + + lseek(pm2, (unsigned long)map / PAGE_SIZE * sizeof(u64), SEEK_SET); + ret = read(pm2, &pmap, sizeof(pmap)); + if (ret < 0) + pr_perror("Read pmap err!"); + + close(pm2); + munmap(map, PAGE_SIZE); + + if (pmap & PME_SOFT_DIRTY) { + pr_info("Dirty track supported on kernel\n"); + kdat.has_dirty_track = true; + } else { +no_dt: + pr_info("Dirty tracking support is OFF\n"); + if (opts.track_mem) { + pr_err("Tracking memory is not available\n"); + return -1; + } + } + + return 0; +} + +/* The page frame number (PFN) is constant for the zero page */ +static int init_zero_page_pfn() +{ + void *addr; + int ret = 0; + + kdat.zero_page_pfn = -1; + if (kdat.pmap != PM_FULL) { + pr_info("Zero page detection failed, optimization turns off.\n"); + return 0; + } + + addr = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (addr == MAP_FAILED) { + pr_perror("Unable to map zero page"); + return 0; + } + + if (*((int *) addr) != 0) { + BUG(); + return -1; + } + + ret = vaddr_to_pfn(-1, (unsigned long)addr, &kdat.zero_page_pfn); + munmap(addr, PAGE_SIZE); + + if (kdat.zero_page_pfn == 0) + ret = -1; + + return ret; +} + +static int get_last_cap(void) +{ + struct sysctl_req req[] = { + { "kernel/cap_last_cap", &kdat.last_cap, CTL_U32 }, + }; + + return sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0); +} + +static int kerndat_has_memfd_create(void) +{ + int ret; + + ret = syscall(SYS_memfd_create, NULL, 0); + + if (ret == -1 && errno == ENOSYS) + kdat.has_memfd = false; + else if (ret == -1 && errno == EFAULT) + kdat.has_memfd = true; + else { + pr_err("Unexpected error from memfd_create(NULL, 0): %m\n"); + return -1; /* stays negative only because the return type is int, not bool */ + } + + return 0; +} + +static int 
get_task_size(void) +{ + kdat.task_size = compel_task_size(); + pr_debug("Found task size of %lx\n", kdat.task_size); + return 0; +} + +int kerndat_fdinfo_has_lock() +{ + int fd, pfd = -1, exit_code = -1, len; + char buf[PAGE_SIZE]; + + fd = open_proc(PROC_GEN, "locks"); + if (fd < 0) + return -1; + + if (flock(fd, LOCK_SH)) { + pr_perror("Can't take a lock"); + goto out; + } + + pfd = open_proc(PROC_SELF, "fdinfo/%d", fd); + if (pfd < 0) + goto out; + + len = read(pfd, buf, sizeof(buf) - 1); + if (len < 0) { + pr_perror("Unable to read"); + goto out; + } + buf[len] = 0; + + kdat.has_fdinfo_lock = (strstr(buf, "lock:") != NULL); + + exit_code = 0; +out: + close(pfd); + close(fd); + + return exit_code; +} + +static int get_ipv6() +{ + if (access("/proc/sys/net/ipv6", F_OK) < 0) { + if (errno == ENOENT) { + pr_debug("ipv6 is disabled\n"); + kdat.ipv6 = false; + return 0; + } + pr_perror("Unable to access /proc/sys/net/ipv6"); + return -1; + } + kdat.ipv6 = true; + return 0; +} + +int kerndat_loginuid(void) +{ + unsigned int saved_loginuid; + int ret; + + kdat.luid = LUID_NONE; + + /* No such file: CONFIG_AUDITSYSCALL disabled */ + saved_loginuid = parse_pid_loginuid(PROC_SELF, &ret, true); + if (ret < 0) + return 0; + + kdat.luid = LUID_READ; + + /* + * From kernel v3.13-rc2 it's possible to unset loginuid value, + * on that rely dump/restore code. 
+ * See also: marc.info/?l=git-commits-head&m=138509506407067 + */ + if (prepare_loginuid(INVALID_UID, LOG_WARN) < 0) + return 0; + /* Cleaning value back as it was */ + if (prepare_loginuid(saved_loginuid, LOG_WARN) < 0) + return 0; + + kdat.luid = LUID_FULL; + return 0; +} + +static int kerndat_iptables_has_xtlocks(void) +{ + int fd; + char *argv[4] = { "sh", "-c", "iptables -w -L", NULL }; + + fd = open("/dev/null", O_RDWR); + if (fd < 0) { + fd = -1; + pr_perror("failed to open /dev/null, using log fd for xtlocks check"); + } + + kdat.has_xtlocks = 1; + if (cr_system(fd, fd, fd, "sh", argv, CRS_CAN_FAIL) == -1) + kdat.has_xtlocks = 0; + + close_safe(&fd); + return 0; +} + +int kerndat_tcp_repair(void) +{ + int sock, clnt = -1, yes = 1, exit_code = -1; + struct sockaddr_in addr; + socklen_t aux; + + memset(&addr,0,sizeof(addr)); + addr.sin_family = AF_INET; + inet_pton(AF_INET, "127.0.0.1", &(addr.sin_addr)); + addr.sin_port = 0; + sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sock < 0) { + pr_perror("Unable to create a socket"); + return -1; + } + + if (bind(sock, (struct sockaddr *) &addr, sizeof(addr))) { + pr_perror("Unable to bind a socket"); + goto err; + } + + aux = sizeof(addr); + if (getsockname(sock, (struct sockaddr *) &addr, &aux)) { + pr_perror("Unable to get a socket name"); + goto err; + } + + if (listen(sock, 1)) { + pr_perror("Unable to listen a socket"); + goto err; + } + + clnt = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (clnt < 0) { + pr_perror("Unable to create a socket"); + goto err; + } + + if (connect(clnt, (struct sockaddr *) &addr, sizeof(addr))) { + pr_perror("Unable to connect a socket"); + goto err; + } + + if (shutdown(clnt, SHUT_WR)) { + pr_perror("Unable to shutdown a socket"); + goto err; + } + + if (setsockopt(clnt, SOL_TCP, TCP_REPAIR, &yes, sizeof(yes))) { + if (errno != EPERM) + goto err; + kdat.has_tcp_half_closed = false; + } else + kdat.has_tcp_half_closed = true; + + exit_code = 0; +err: + 
close_safe(&clnt); + close(sock); + + return exit_code; +} + +int kerndat_nsid(void) +{ + int nsid, sk; + + sk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sk < 0) { + pr_perror("Unable to create a netlink socket"); + return -1; + } + + if (net_get_nsid(sk, getpid(), &nsid) < 0) { + pr_err("NSID is not supported\n"); + close(sk); + return -1; + } + + kdat.has_nsid = true; + close(sk); + return 0; +} + +static int kerndat_compat_restore(void) +{ + int ret; + + ret = kdat_can_map_vdso(); + if (ret < 0) + return ret; + kdat.can_map_vdso = !!ret; + + /* depends on kdat.can_map_vdso result */ + kdat.compat_cr = kdat_compatible_cr(); + + return 0; +} + +static int kerndat_detect_stack_guard_gap(void) +{ + int num, ret = -1, detected = 0; + unsigned long start, end; + char r, w, x, s; + char buf[1024]; + FILE *maps; + void *mem; + + mem = mmap(NULL, (3ul << 20), PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_GROWSDOWN, -1, 0); + if (mem == MAP_FAILED) { + pr_perror("Can't mmap stack area"); + return -1; + } + munmap(mem, (3ul << 20)); + + mem = mmap(mem + (2ul << 20), (1ul << 20), PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED | MAP_GROWSDOWN, -1, 0); + if (mem == MAP_FAILED) { + pr_perror("Can't mmap stack area"); + return -1; + } + + maps = fopen("/proc/self/maps", "r"); + if (maps == NULL) { + munmap(mem, (1ul << 20)); /* release the whole 1M probe mapping, same as the exit path */ + return -1; + } + + while (fgets(buf, sizeof(buf), maps)) { + num = sscanf(buf, "%lx-%lx %c%c%c%c", + &start, &end, &r, &w, &x, &s); + if (num < 6) { + pr_err("Can't parse: %s\n", buf); + goto err; + } + + /* + * When reading /proc/$pid/[s]maps the + * start/end addresses might be cut off + * with PAGE_SIZE on kernels prior 4.12 + * (see kernel commit 1be7107fbe18ee). + * + * Same time there was semi-complete + * patch released which hit a number + * of repos (Ubuntu, Fedora) where instead + * of PAGE_SIZE the 1M gap is cut off. 
+ */ + if (start == (unsigned long)mem) { + kdat.stack_guard_gap_hidden = false; + detected = 1; + break; + } else if (start == ((unsigned long)mem + (1ul << 20))) { + pr_warn("Unsupported stack guard detected, confused but continue\n"); + kdat.stack_guard_gap_hidden = true; + detected = 1; + break; + } else if (start == ((unsigned long)mem + PAGE_SIZE)) { + kdat.stack_guard_gap_hidden = true; + detected = 1; + break; + } + } + + if (detected) + ret = 0; + +err: + munmap(mem, (1ul << 20)); + fclose(maps); + return ret; +} + +int kerndat_has_inotify_setnextwd(void) +{ + int ret = 0; + int fd; + + fd = inotify_init(); + if (fd < 0) { + pr_perror("Can't create inotify"); + return -1; + } + + if (ioctl(fd, INOTIFY_IOC_SETNEXTWD, 0x10)) { + if (errno != ENOTTY) { + pr_perror("Can't call ioctl"); + ret = -1; + } + } else + kdat.has_inotify_setnextwd = true; + + close(fd); + return ret; +} + +int has_kcmp_epoll_tfd(void) +{ + kcmp_epoll_slot_t slot = { }; + int ret = -1, efd, tfd; + pid_t pid = getpid(); + struct epoll_event ev; + int pipefd[2]; + + efd = epoll_create(1); + if (efd < 0) { + pr_perror("Can't create epoll"); + return -1; + } + + memset(&ev, 0xff, sizeof(ev)); + ev.events = EPOLLIN | EPOLLOUT; + + if (pipe(pipefd)) { + pr_perror("Can't create pipe"); + close(efd); + return -1; + } + + tfd = pipefd[0]; + if (epoll_ctl(efd, EPOLL_CTL_ADD, tfd, &ev)) { + pr_perror("Can't add event"); + goto out; + } + + slot.efd = efd; + slot.tfd = tfd; + + if (syscall(SYS_kcmp, pid, pid, KCMP_EPOLL_TFD, tfd, &slot) == 0) + kdat.has_kcmp_epoll_tfd = true; + else + kdat.has_kcmp_epoll_tfd = false; + ret = 0; + +out: + close(pipefd[0]); + close(pipefd[1]); + close(efd); + return ret; +} + +int __attribute__((weak)) kdat_x86_has_ptrace_fpu_xsave_bug(void) +{ + return 0; +} + +static int kerndat_x86_has_ptrace_fpu_xsave_bug(void) +{ + int ret = kdat_x86_has_ptrace_fpu_xsave_bug(); + + if (ret < 0) + return ret; + + kdat.x86_has_ptrace_fpu_xsave_bug = !!ret; + return 0; +} + 
+#define KERNDAT_CACHE_FILE KDAT_RUNDIR"/criu.kdat" +#define KERNDAT_CACHE_FILE_TMP KDAT_RUNDIR"/.criu.kdat" + +static int kerndat_try_load_cache(void) +{ + int fd, ret; + + fd = open(KERNDAT_CACHE_FILE, O_RDONLY); + if (fd < 0) { + if(ENOENT == errno) + pr_debug("File %s does not exist\n", KERNDAT_CACHE_FILE); + else + pr_warn("Can't load %s\n", KERNDAT_CACHE_FILE); + return 1; + } + + ret = read(fd, &kdat, sizeof(kdat)); + if (ret < 0) { + pr_perror("Can't read kdat cache"); + close(fd); + return -1; + } + + close(fd); + + if (ret != sizeof(kdat) || + kdat.magic1 != KDAT_MAGIC || + kdat.magic2 != KDAT_MAGIC_2) { + pr_warn("Stale %s file\n", KERNDAT_CACHE_FILE); + unlink(KERNDAT_CACHE_FILE); + return 1; + } + + pr_info("Loaded kdat cache from %s\n", KERNDAT_CACHE_FILE); + return 0; +} + +static void kerndat_save_cache(void) +{ + int fd, ret; + struct statfs s; + + fd = open(KERNDAT_CACHE_FILE_TMP, O_CREAT | O_EXCL | O_WRONLY, 0600); + if (fd < 0) + /* + * It can happen that we race with some other criu + * instance. That's OK, just ignore this error and + * proceed. + */ + return; + + if (fstatfs(fd, &s) < 0 || s.f_type != TMPFS_MAGIC) { + pr_warn("Can't keep kdat cache on non-tempfs\n"); + close(fd); + goto unl; + } + + /* + * One magic to make sure we're reading the kdat file. + * One more magic to make somehow sure we don't read kdat + * from some other criu + */ + kdat.magic1 = KDAT_MAGIC; + kdat.magic2 = KDAT_MAGIC_2; + ret = write(fd, &kdat, sizeof(kdat)); + close(fd); + + if (ret == sizeof(kdat)) + ret = rename(KERNDAT_CACHE_FILE_TMP, KERNDAT_CACHE_FILE); + else { + ret = -1; + errno = EIO; + } + + if (ret < 0) { + pr_perror("Couldn't save %s", KERNDAT_CACHE_FILE); +unl: + unlink(KERNDAT_CACHE_FILE_TMP); + } +} + +int kerndat_uffd(void) +{ + int uffd; + + kdat.uffd_features = 0; + uffd = uffd_open(0, &kdat.uffd_features); + + /* + * uffd == -ENOSYS means userfaultfd is not supported on this + * system and we just happily return with kdat.has_uffd = false. 
+ * Error other than -ENOSYS would mean "Houston, Houston, we + * have a problem!" + */ + if (uffd < 0) { + if (uffd == -ENOSYS) + return 0; + + pr_err("Lazy pages are not available\n"); + return -1; + } + + kdat.has_uffd = true; + + /* + * we have to close the uffd and reopen it later in restorer + * to enable non-cooperative features + */ + close(uffd); + + return 0; +} + +int kerndat_has_thp_disable(void) +{ + struct bfd f; + void *addr; + char *str; + int ret = -1; + bool vma_match = false; + + if (prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0)) { + if (errno != EINVAL) + return -1; + pr_info("PR_SET_THP_DISABLE is not available\n"); + return 0; + } + + addr = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (addr == MAP_FAILED) { + pr_perror("Can't mmap memory for THP disable test"); + return -1; + } + + if (prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0)) + goto out_unmap; /* ret is still -1; unmap the probe page instead of leaking it */ + + f.fd = open("/proc/self/smaps", O_RDONLY); + if (f.fd < 0) { + pr_perror("Can't open /proc/self/smaps"); + goto out_unmap; + } + if (bfdopenr(&f)) + goto out_unmap; + + while ((str = breadline(&f)) != NULL) { + if (IS_ERR(str)) + goto out_close; + + if (is_vma_range_fmt(str)) { + unsigned long vma_addr; + + if (sscanf(str, "%lx-", &vma_addr) != 1) { + pr_err("Can't parse: %s\n", str); + goto out_close; + } + + if (vma_addr == (unsigned long)addr) + vma_match = true; + } + + if (vma_match && !strncmp(str, "VmFlags: ", 9)) { + u32 flags = 0; + u64 madv = 0; + int io_pf = 0; + + parse_vmflags(str, &flags, &madv, &io_pf); + kdat.has_thp_disable = !(madv & (1 << MADV_NOHUGEPAGE)); + break; + } + } + + ret = 0; + +out_close: + bclose(&f); +out_unmap: + munmap(addr, PAGE_SIZE); + + return ret; +} + +static int kerndat_tun_netns(void) +{ + return check_tun_netns_cr(&kdat.tun_ns); +} + +int kerndat_init(void) +{ + int ret; + + ret = kerndat_try_load_cache(); + if (ret <= 0) + return ret; + + /* kerndat_try_load_cache can leave some trash in kdat */ + memset(&kdat, 0, 
sizeof(kdat)); + + preload_socket_modules(); + preload_netfilter_modules(); + + ret = check_pagemap(); + if (!ret) + ret = kerndat_get_shmemdev(); + if (!ret) + ret = kerndat_get_dirty_track(); + if (!ret) + ret = init_zero_page_pfn(); + if (!ret) + ret = get_last_cap(); + if (!ret) + ret = kerndat_fdinfo_has_lock(); + if (!ret) + ret = get_task_size(); + if (!ret) + ret = get_ipv6(); + if (!ret) + ret = kerndat_loginuid(); + if (!ret) + ret = kerndat_iptables_has_xtlocks(); + if (!ret) + ret = kerndat_tcp_repair(); + if (!ret) + ret = kerndat_compat_restore(); + if (!ret) + ret = kerndat_socket_netns(); + if (!ret) + ret = kerndat_tun_netns(); + if (!ret) + ret = kerndat_socket_unix_file(); + if (!ret) + ret = kerndat_nsid(); + if (!ret) + ret = kerndat_link_nsid(); + if (!ret) + ret = kerndat_has_memfd_create(); + if (!ret) + ret = kerndat_detect_stack_guard_gap(); + if (!ret) + ret = kerndat_uffd(); + if (!ret) + ret = kerndat_has_thp_disable(); + /* Needs kdat.compat_cr filled before */ + if (!ret) + ret = kerndat_vdso_fill_symtable(); + /* Depends on kerndat_vdso_fill_symtable() */ + if (!ret) + ret = kerndat_vdso_preserves_hint(); + if (!ret) + ret = kerndat_socket_netns(); + if (!ret) + ret = kerndat_nsid(); + if (!ret) + ret = kerndat_x86_has_ptrace_fpu_xsave_bug(); + if (!ret) + ret = kerndat_has_inotify_setnextwd(); + if (!ret) + ret = has_kcmp_epoll_tfd(); + + kerndat_lsm(); + kerndat_mmap_min_addr(); + kerndat_files_stat(false); + + if (!ret) + kerndat_save_cache(); + + return ret; +} diff --git a/CRIU_code/criu/libnetlink.c b/CRIU_code/criu/libnetlink.c new file mode 100644 index 0000000..18a323b --- /dev/null +++ b/CRIU_code/criu/libnetlink.c @@ -0,0 +1,226 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libnetlink.h" +#include "util.h" + +static int nlmsg_receive(char *buf, int len, + int (*cb)(struct nlmsghdr *, struct ns_id *ns, void *), + int (*err_cb)(int, struct ns_id *, void *), struct ns_id *ns, 
void *arg) +{ + struct nlmsghdr *hdr; + + for (hdr = (struct nlmsghdr *)buf; NLMSG_OK(hdr, len); hdr = NLMSG_NEXT(hdr, len)) { + if (hdr->nlmsg_seq != CR_NLMSG_SEQ) + continue; + if (hdr->nlmsg_type == NLMSG_DONE) { + int *len = (int *)NLMSG_DATA(hdr); + if (*len < 0) + return err_cb(*len, ns, arg); + return 0; + } + if (hdr->nlmsg_type == NLMSG_ERROR) { + struct nlmsgerr *err = (struct nlmsgerr *)NLMSG_DATA(hdr); + + if (hdr->nlmsg_len - sizeof(*hdr) < sizeof(struct nlmsgerr)) { + pr_err("ERROR truncated\n"); + return -1; + } + + if (err->error == 0) + return 0; + + return err_cb(err->error, ns, arg); + } + if (cb(hdr, ns, arg)) + return -1; + } + + return 1; +} + +/* + * Default errror handler: just point our an error + * and pass up to caller. + */ +static int rtnl_return_err(int err, struct ns_id *ns, void *arg) +{ + errno = -err; + pr_perror("%d reported by netlink", err); + return err; +} + +int do_rtnl_req(int nl, void *req, int size, + int (*receive_callback)(struct nlmsghdr *h, struct ns_id *ns, void *), + int (*error_callback)(int err, struct ns_id *ns, void *arg), struct ns_id *ns, void *arg) +{ + struct msghdr msg; + struct sockaddr_nl nladdr; + struct iovec iov; + static char buf[16384]; + int err; + + if (!error_callback) + error_callback = rtnl_return_err; + + memset(&msg, 0, sizeof(msg)); + msg.msg_name = &nladdr; + msg.msg_namelen = sizeof(nladdr); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + + iov.iov_base = req; + iov.iov_len = size; + + if (sendmsg(nl, &msg, 0) < 0) { + err = -errno; + pr_perror("Can't send request message"); + goto err; + } + + iov.iov_base = buf; + iov.iov_len = sizeof(buf); + + while (1) { + + memset(&msg, 0, sizeof(msg)); + msg.msg_name = &nladdr; + msg.msg_namelen = sizeof(nladdr); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + err = recvmsg(nl, &msg, 0); + if (err < 0) { + if (errno == EINTR) + continue; + else { + err = -errno; + pr_perror("Error 
/*
 * Append one attribute of the given @type carrying @alen bytes of @data
 * to the netlink message @n, whose backing buffer is @maxlen bytes long.
 *
 * Returns 0 on success, -1 if the attribute does not fit.
 */
int addattr_l(struct nlmsghdr *n, int maxlen, int type, const void *data,
	      int alen)
{
	int attr_len = nla_attr_size(alen);
	struct rtattr *attr;

	if (NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(attr_len) > maxlen) {
		pr_err("addattr_l ERROR: message exceeded bound of %d\n", maxlen);
		return -1;
	}

	/* The new attribute goes right after the current (aligned) tail. */
	attr = NLMSG_TAIL(n);
	attr->rta_len = attr_len;
	attr->rta_type = type;
	memcpy(RTA_DATA(attr), data, alen);

	n->nlmsg_len = NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(attr_len);
	return 0;
}
+ */ +int __wrap_nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len, + struct nla_policy *policy) +{ + struct nlattr *nla; + int rem; + + memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); + + nla_for_each_attr(nla, head, len, rem) { + int type = nla_type(nla); + + if (type > maxtype) + continue; + + if (tb[type]) + pr_warn("Attribute of type %#x found multiple times in message, " + "previous attribute is being ignored.\n", type); + + tb[type] = nla; + } + + if (rem > 0) + pr_warn("netlink: %d bytes leftover after parsing " + "attributes.\n", rem); + + return 0; +} + +/** + * parse attributes of a netlink message + * @arg nlh netlink message header + * @arg hdrlen length of family specific header + * @arg tb destination array with maxtype+1 elements + * @arg maxtype maximum attribute type to be expected + * @arg policy validation policy + * + * See nla_parse() + */ +int __wrap_nlmsg_parse(struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], + int maxtype, struct nla_policy *policy) +{ + if (!nlmsg_valid_hdr(nlh, hdrlen)) + return -NLE_MSG_TOOSHORT; + + return nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen), + nlmsg_attrlen(nlh, hdrlen), policy); +} + +int32_t nla_get_s32(const struct nlattr *nla) +{ + return *(const int32_t *) nla_data(nla); +} diff --git a/CRIU_code/criu/log.c b/CRIU_code/criu/log.c new file mode 100644 index 0000000..8bdf835 --- /dev/null +++ b/CRIU_code/criu/log.c @@ -0,0 +1,416 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include "page.h" +#include "common/compiler.h" +#include "util.h" +#include "cr_options.h" +#include "servicefd.h" +#include "rst-malloc.h" +#include "common/lock.h" +#include "string.h" +#include "version.h" + +#include "../soccr/soccr.h" +#include "compel/log.h" + + +#define DEFAULT_LOGFD STDERR_FILENO +/* Enable timestamps if verbosity is increased from default */ +#define LOG_TIMESTAMP (DEFAULT_LOGLEVEL + 
1) +#define LOG_BUF_LEN (8*1024) +#define EARLY_LOG_BUF_LEN 1024 + +static unsigned int current_loglevel = DEFAULT_LOGLEVEL; + +static char buffer[LOG_BUF_LEN]; +static char buf_off = 0; +/* + * The early_log_buffer is used to store log messages before + * logging is set up to make sure no logs are lost. + */ +static char early_log_buffer[EARLY_LOG_BUF_LEN]; +static unsigned int early_log_buf_off = 0; + +/* If this is 0 the logging has not been set up yet. */ +static int init_done = 0; + +static struct timeval start; +/* + * Manual buf len as sprintf will _always_ put '\0' at the end, + * but we want a "constant" pid to be there on restore + */ +#define TS_BUF_OFF 12 + +static void timediff(struct timeval *from, struct timeval *to) +{ + to->tv_sec -= from->tv_sec; + if (to->tv_usec >= from->tv_usec) + to->tv_usec -= from->tv_usec; + else { + to->tv_sec--; + to->tv_usec += USEC_PER_SEC - from->tv_usec; + } +} + +static void print_ts(void) +{ + struct timeval t; + + gettimeofday(&t, NULL); + timediff(&start, &t); + snprintf(buffer, TS_BUF_OFF, + "(%02u.%06u)", (unsigned)t.tv_sec, (unsigned)t.tv_usec); + buffer[TS_BUF_OFF - 1] = ' '; /* kill the '\0' produced by snprintf */ +} + +int log_get_fd(void) +{ + int fd = get_service_fd(LOG_FD_OFF); + + return fd < 0 ? DEFAULT_LOGFD : fd; +} + +void log_get_logstart(struct timeval *s) +{ + if (current_loglevel >= LOG_TIMESTAMP) + *s = start; + else { + s->tv_sec = 0; + s->tv_usec = 0; + } +} + +static void reset_buf_off(void) +{ + if (current_loglevel >= LOG_TIMESTAMP) + /* reserve space for a timestamp */ + buf_off = TS_BUF_OFF; + else + buf_off = 0; +} + +/* + * Keeping the very first error message for RPC to report back. 
+ */ +struct str_and_lock { + mutex_t l; + char s[1024]; +}; + +static struct str_and_lock *first_err; + +int log_keep_err(void) +{ + first_err = shmalloc(sizeof(struct str_and_lock)); + if (first_err == NULL) + return -1; + + mutex_init(&first_err->l); + first_err->s[0] = '\0'; + return 0; +} + +static void log_note_err(char *msg) +{ + if (first_err && first_err->s[0] == '\0') { + /* + * In any action other than restore this locking is + * actually not required, but ... it's error path + * anyway, so it doesn't make much sense to try hard + * and optimize this out. + */ + mutex_lock(&first_err->l); + if (first_err->s[0] == '\0') + strlcpy(first_err->s, msg, sizeof(first_err->s)); + mutex_unlock(&first_err->l); + } +} + +char *log_first_err(void) +{ + if (!first_err) + return NULL; + if (first_err->s[0] == '\0') + return NULL; + + return first_err->s; +} + +static void print_versions(void) +{ + struct utsname buf; + + pr_info("Version: %s (gitid %s)\n", CRIU_VERSION, CRIU_GITID); + + if (uname(&buf) < 0) { + pr_perror("Reading kernel version failed!"); + /* This pretty unlikely, just keep on running. */ + return; + } + + pr_info("Running on %s %s %s %s %s\n", buf.nodename, buf.sysname, + buf.release, buf.version, buf.machine); +} + +struct early_log_hdr { + uint16_t level; + uint16_t len; +}; + +void flush_early_log_buffer(int fd) +{ + unsigned int pos = 0; + int ret; + + while (pos < early_log_buf_off) { + /* + * The early_log_buffer contains all messages written + * before logging was set up. We only want to print + * out messages which correspond to the requested + * log_level. Therefore the early_log_buffer also contains + * the log_level and the size. This writes one messages, + * depending on the log_level, to the logging fd. Start + * with reading the log_level. 
+ */ + struct early_log_hdr *hdr = (void *)early_log_buffer + pos; + pos += sizeof(hdr); + if (hdr->level <= current_loglevel) { + size_t size = 0; + while (size < hdr->len) { + ret = write(fd, early_log_buffer + pos + size, + hdr->len - size); + if (ret <= 0) + break; + size += ret; + } + } + pos += hdr->len; + } + if (early_log_buf_off) + pr_warn("The early log isn't empty\n"); + early_log_buf_off = 0; +} + +int log_init(const char *output) +{ + int new_logfd, fd; + + gettimeofday(&start, NULL); + reset_buf_off(); + + if (output && !strncmp(output, "-", 2)) { + new_logfd = dup(STDOUT_FILENO); + if (new_logfd < 0) { + pr_perror("Can't dup stdout stream"); + return -1; + } + } else if (output) { + new_logfd = open(output, O_CREAT|O_TRUNC|O_WRONLY|O_APPEND, 0600); + if (new_logfd < 0) { + pr_perror("Can't create log file %s", output); + return -1; + } + } else { + new_logfd = dup(DEFAULT_LOGFD); + if (new_logfd < 0) { + pr_perror("Can't dup log file"); + return -1; + } + } + + fd = install_service_fd(LOG_FD_OFF, new_logfd); + if (fd < 0) + goto err; + + init_done = 1; + + /* + * Once logging is setup this write out all early log messages. + * Only those messages which have to correct log level are printed. + */ + flush_early_log_buffer(fd); + + print_versions(); + + return 0; + +err: + pr_perror("Log engine failure, can't duplicate descriptor"); + return -1; +} + +int log_init_by_pid(pid_t pid) +{ + char path[PATH_MAX]; + + /* + * reset buf_off as this fn is called on each fork while + * restoring process tree + */ + reset_buf_off(); + + if (!opts.log_file_per_pid) { + buf_off += snprintf(buffer + buf_off, sizeof buffer - buf_off, "%6d: ", pid); + return 0; + } + + if (!opts.output) + return 0; + + snprintf(path, PATH_MAX, "%s.%d", opts.output, pid); + + return log_init(path); +} + +void log_fini(void) +{ + close_service_fd(LOG_FD_OFF); +} + +static void soccr_print_on_level(unsigned int loglevel, const char *format, ...) 
+{ + va_list args; + int lv; + + switch (loglevel) { + case SOCCR_LOG_DBG: + lv = LOG_DEBUG; + break; + case SOCCR_LOG_ERR: + lv = LOG_ERROR; + break; + default: + lv = LOG_INFO; + break; + } + + va_start(args, format); + vprint_on_level(lv, format, args); + va_end(args); +} + +void log_set_loglevel(unsigned int level) +{ + current_loglevel = level; + + libsoccr_set_log(level, soccr_print_on_level); + compel_log_init(vprint_on_level, level); +} + +unsigned int log_get_loglevel(void) +{ + return current_loglevel; +} + +static void early_vprint(const char *format, unsigned int loglevel, va_list params) +{ + unsigned int log_size = 0; + struct early_log_hdr *hdr; + + if ((early_log_buf_off + sizeof(hdr)) >= EARLY_LOG_BUF_LEN) + return; + + /* Save loglevel */ + + hdr = (void *)early_log_buffer + early_log_buf_off; + hdr->level = loglevel; + /* Skip the log entry size */ + early_log_buf_off += sizeof(hdr); + if (loglevel >= LOG_TIMESTAMP) { + /* + * If logging is not yet setup we just write zeros + * instead of a real timestamp. This way we can + * keep the same format as the other messages on + * log levels with timestamps (>=LOG_TIMESTAMP). + */ + log_size = snprintf(early_log_buffer + early_log_buf_off, + sizeof(early_log_buffer) - early_log_buf_off, + "(00.000000) "); + } + + log_size += vsnprintf(early_log_buffer + early_log_buf_off + log_size, + sizeof(early_log_buffer) - early_log_buf_off - log_size, + format, params); + + /* Save log entry size */ + hdr->len = log_size; + early_log_buf_off += log_size; +} + +void vprint_on_level(unsigned int loglevel, const char *format, va_list params) +{ + int fd, size, ret, off = 0; + int _errno = errno; + + if (unlikely(loglevel == LOG_MSG)) { + fd = STDOUT_FILENO; + off = buf_off; /* skip dangling timestamp */ + } else { + /* + * If logging has not yet been initialized (init_done == 0) + * make sure all messages are written to the early_log_buffer. 
+ */ + if (!init_done) { + early_vprint(format, loglevel, params); + return; + } + if (loglevel > current_loglevel) + return; + fd = log_get_fd(); + if (current_loglevel >= LOG_TIMESTAMP) + print_ts(); + } + + size = vsnprintf(buffer + buf_off, sizeof buffer - buf_off, format, params); + size += buf_off; + + while (off < size) { + ret = write(fd, buffer + off, size - off); + if (ret <= 0) + break; + off += ret; + } + + /* This is missing for messages in the early_log_buffer. */ + if (loglevel == LOG_ERROR) + log_note_err(buffer + buf_off); + + errno = _errno; +} + +void print_on_level(unsigned int loglevel, const char *format, ...) +{ + va_list params; + + va_start(params, format); + vprint_on_level(loglevel, format, params); + va_end(params); +} + +int write_pidfile(int pid) +{ + int fd; + + fd = open(opts.pidfile, O_WRONLY | O_EXCL | O_CREAT, 0600); + if (fd == -1) { + pr_perror("Can't open %s", opts.pidfile); + return -1; + } + + dprintf(fd, "%d", pid); + close(fd); + return 0; +} diff --git a/CRIU_code/criu/lsm.c b/CRIU_code/criu/lsm.c new file mode 100644 index 0000000..9d7e55c --- /dev/null +++ b/CRIU_code/criu/lsm.c @@ -0,0 +1,351 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "common/config.h" +#include "kerndat.h" +#include "pstree.h" +#include "util.h" +#include "cr_options.h" +#include "lsm.h" +#include "fdstore.h" + +#include "protobuf.h" +#include "images/inventory.pb-c.h" +#include "images/creds.pb-c.h" +#include "images/fdinfo.pb-c.h" + +#ifdef CONFIG_HAS_SELINUX +#include +#endif + +static int apparmor_get_label(pid_t pid, char **profile_name) +{ + FILE *f; + char *space; + + f = fopen_proc(pid, "attr/current"); + if (!f) + return -1; + + if (fscanf(f, "%ms", profile_name) != 1) { + pr_perror("err scanfing"); + fclose(f); + return -1; + } + + fclose(f); + + /* + * A profile name can be followed by an enforcement mode, e.g. 
+ * lxc-default-with-nesting (enforced) + * but the profile name is just the part before the space. + */ + space = strstr(*profile_name, " "); + if (space) + *space = 0; + + /* + * An "unconfined" value means there is no profile, so we don't need to + * worry about trying to restore one. + */ + if (strcmp(*profile_name, "unconfined") == 0) { + free(*profile_name); + *profile_name = NULL; + } + + return 0; +} + +#ifdef CONFIG_HAS_SELINUX +static int selinux_get_label(pid_t pid, char **output) +{ + security_context_t ctx; + char *pos; + int i; + int ret = -1; + + if (getpidcon_raw(pid, &ctx) < 0) { + pr_perror("getting selinux profile failed"); + return -1; + } + + *output = xstrdup((char *)ctx); + if (!*output) + goto err; + + /* + * Make sure it is a valid SELinux label. It should look like this: + * + * unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 + */ + pos = (char*)ctx; + for (i = 0; i < 3; i++) { + pos = strstr(pos, ":"); + if (!pos) { + pr_err("Invalid selinux context %s\n", (char *)ctx); + xfree(*output); + *output = NULL; + goto err; + } + + *pos = 0; + pos++; + } + + ret = 0; +err: + freecon(ctx); + return ret; +} + +/* + * selinux_get_sockcreate_label reads /proc/PID/attr/sockcreate + * to see if the PID has a special label specified for sockets. + * Most of the time this will be empty and the process will use + * the process context also for sockets. + */ +static int selinux_get_sockcreate_label(pid_t pid, char **output) +{ + FILE *f; + int ret; + + f = fopen_proc(pid, "attr/sockcreate"); + if (!f) + return -1; + + ret = fscanf(f, "%ms", output); + if (ret == -1 && errno != 0) { + pr_perror("Unable to parse /proc/%d/attr/sockcreate", pid); + /* + * Only if the error indicator is set it is a real error. + * -1 could also be EOF, which would mean that sockcreate + * was just empty, which is the most common case. 
+ */ + fclose(f); + return -1; + } + fclose(f); + return 0; +} + +int reset_setsockcreatecon() +{ + /* Currently this only works for SELinux. */ + if (kdat.lsm != LSMTYPE__SELINUX) + return 0; + + if (setsockcreatecon_raw(NULL)) { + pr_perror("Unable to reset socket SELinux context"); + return -1; + } + return 0; +} + +int run_setsockcreatecon(FdinfoEntry *e) +{ + char *ctx = NULL; + + /* Currently this only works for SELinux. */ + if (kdat.lsm != LSMTYPE__SELINUX) + return 0; + + ctx = e->xattr_security_selinux; + /* Writing to the FD using fsetxattr() did not work for some reason. */ + if (setsockcreatecon_raw(ctx)) { + pr_perror("Unable to set the %s socket SELinux context", ctx); + return -1; + } + return 0; +} + +int dump_xattr_security_selinux(int fd, FdinfoEntry *e) +{ + char *ctx = NULL; + int len; + int ret; + + /* Currently this only works for SELinux. */ + if (kdat.lsm != LSMTYPE__SELINUX) + return 0; + + /* Get the size of the xattr. */ + len = fgetxattr(fd, "security.selinux", ctx, 0); + if (len == -1) { + pr_err("Reading xattr security.selinux from FD %d failed\n", fd); + return -1; + } + + ctx = xmalloc(len); + if (!ctx) { + pr_err("xmalloc to read xattr for FD %d failed\n", fd); + return -1; + } + + ret = fgetxattr(fd, "security.selinux", ctx, len); + if (len != ret) { + pr_err("Reading xattr %s to FD %d failed\n", ctx, fd); + return -1; + } + + e->xattr_security_selinux = ctx; + + return 0; +} + +#endif + +void kerndat_lsm(void) +{ + if (access(AA_SECURITYFS_PATH, F_OK) == 0) { + kdat.lsm = LSMTYPE__APPARMOR; + return; + } + +#ifdef CONFIG_HAS_SELINUX + /* + * This seems to be the canonical place to mount this fs if it is + * enabled, although we may (?) want to check /selinux for posterity as + * well. 
+ */ + if (access("/sys/fs/selinux", F_OK) == 0) { + kdat.lsm = LSMTYPE__SELINUX; + return; + } +#endif + + kdat.lsm = LSMTYPE__NO_LSM; +} + +Lsmtype host_lsm_type(void) +{ + return kdat.lsm; +} + +int collect_lsm_profile(pid_t pid, CredsEntry *ce) +{ + int ret; + + ce->lsm_profile = NULL; + ce->lsm_sockcreate = NULL; + + switch (kdat.lsm) { + case LSMTYPE__NO_LSM: + ret = 0; + break; + case LSMTYPE__APPARMOR: + ret = apparmor_get_label(pid, &ce->lsm_profile); + break; +#ifdef CONFIG_HAS_SELINUX + case LSMTYPE__SELINUX: + ret = selinux_get_label(pid, &ce->lsm_profile); + if (ret) + break; + ret = selinux_get_sockcreate_label(pid, &ce->lsm_sockcreate); + break; +#endif + default: + BUG(); + ret = -1; + break; + } + + if (ce->lsm_profile) + pr_info("%d has lsm profile %s\n", pid, ce->lsm_profile); + if (ce->lsm_sockcreate) + pr_info("%d has lsm sockcreate label %s\n", pid, ce->lsm_sockcreate); + + return ret; +} + +// in inventory.c +extern Lsmtype image_lsm; + +int validate_lsm(char *lsm_profile) +{ + if (image_lsm == LSMTYPE__NO_LSM || image_lsm == kdat.lsm) + return 0; + + /* + * This is really only a problem if the processes have actually + * specified an LSM profile. If not, we won't restore anything anyway, + * so it's fine. 
+ */ + if (lsm_profile) { + pr_err("mismatched lsm types and lsm profile specified\n"); + return -1; + } + + return 0; +} + +int render_lsm_profile(char *profile, char **val) +{ + *val = NULL; + + switch (kdat.lsm) { + case LSMTYPE__APPARMOR: + if (strcmp(profile, "unconfined") != 0 && asprintf(val, "changeprofile %s", profile) < 0) { + pr_err("allocating lsm profile failed\n"); + *val = NULL; + return -1; + } + break; + case LSMTYPE__SELINUX: + if (asprintf(val, "%s", profile) < 0) { + *val = NULL; + return -1; + } + break; + default: + pr_err("can't render profile %s for lsmtype %d\n", profile, LSMTYPE__NO_LSM); + return -1; + } + + return 0; +} + +int lsm_check_opts(void) +{ + char *aux; + + if (!opts.lsm_supplied) + return 0; + + aux = strchr(opts.lsm_profile, ':'); + if (aux == NULL) { + pr_err("invalid argument %s for --lsm-profile\n", opts.lsm_profile); + return -1; + } + + *aux = '\0'; + aux++; + + if (strcmp(opts.lsm_profile, "apparmor") == 0) { + if (kdat.lsm != LSMTYPE__APPARMOR) { + pr_err("apparmor LSM specified but apparmor not supported by kernel\n"); + return -1; + } + + SET_CHAR_OPTS(lsm_profile, aux); + } else if (strcmp(opts.lsm_profile, "selinux") == 0) { + if (kdat.lsm != LSMTYPE__SELINUX) { + pr_err("selinux LSM specified but selinux not supported by kernel\n"); + return -1; + } + + SET_CHAR_OPTS(lsm_profile, aux); + } else if (strcmp(opts.lsm_profile, "none") == 0) { + xfree(opts.lsm_profile); + opts.lsm_profile = NULL; + } else { + pr_err("unknown lsm %s\n", opts.lsm_profile); + return -1; + } + + return 0; +} diff --git a/CRIU_code/criu/mem.c b/CRIU_code/criu/mem.c new file mode 100644 index 0000000..6a1a87a --- /dev/null +++ b/CRIU_code/criu/mem.c @@ -0,0 +1,1346 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "types.h" +#include "cr_options.h" +#include "servicefd.h" +#include "mem.h" +#include "parasite-syscall.h" +#include "parasite.h" +#include "page-pipe.h" +#include "page-xfer.h" +#include "log.h" 
/*
 * Write "4" to the clear_refs proc file of @pid to reset its dirty
 * memory tracking.
 *
 * Returns 0 on success, 1 when the file cannot be opened because of
 * EACCES or the write is rejected with EINVAL (no clear-soft-dirty in
 * kernel), and -1 on any other error.
 */
int do_task_reset_dirty_track(int pid)
{
	char cmd[] = "4";
	int fd, rc;

	pr_info("Reset %d's dirty tracking\n", pid);

	fd = __open_proc(pid, EACCES, O_RDWR, "clear_refs");
	if (fd < 0) {
		if (errno == EACCES)
			return 1;
		return -1;
	}

	/* sizeof(cmd) covers the terminating '\0' as well. */
	if (write(fd, cmd, sizeof(cmd)) >= 0) {
		pr_info(" ... done\n");
		rc = 0;
	} else if (errno == EINVAL) {
		/* No clear-soft-dirty in kernel */
		rc = 1;
	} else {
		pr_perror("Can't reset %d's dirty memory tracker (%d)", pid, errno);
		rc = -1;
	}

	close(fd);
	return rc;
}
+ */ + if (vma_entry_is(vmae, VMA_AREA_VDSO)) + return true; + /* + * In turn VVAR area is special and referenced from + * vDSO area by IP addressing (at least on x86) thus + * never ever dump its content but always use one provided + * by the kernel on restore, ie runtime VVAR area must + * be remapped into proper place.. + */ + if (vma_entry_is(vmae, VMA_AREA_VVAR)) + return false; + + /* + * Optimisation for private mapping pages, that haven't + * yet being COW-ed + */ + if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE)) + return false; + if (vma_entry_is(vmae, VMA_AREA_AIORING)) + return true; + if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme)) + return true; + + return false; +} + +bool page_is_zero(u64 pme) +{ + return __page_is_zero(pme); +} + +bool page_in_parent(bool dirty) +{ + return __page_in_parent(dirty); +} + +static bool is_stack(struct pstree_item *item, unsigned long vaddr) +{ + int i; + + for (i = 0; i < item->nr_threads; i++) { + uint64_t sp = dmpi(item)->thread_sp[i]; + + if (!((sp ^ vaddr) & ~PAGE_MASK)) + return true; + } + + return false; +} + +/* + * This routine finds out what memory regions to grab from the + * dumpee. The iovs generated are then fed into vmsplice to + * put the memory into the page-pipe's pipe. + * + * "Holes" in page-pipe are regions, that should be dumped, but + * the memory contents is present in the pagent image set. 
+ */ + +static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct page_pipe *pp, u64 *map, u64 *off, bool has_parent) +{ + u64 *at = &map[PAGE_PFN(*off)]; + unsigned long pfn, nr_to_scan; + unsigned long pages[3] = {}; + int ret = 0; + + nr_to_scan = (vma_area_len(vma) - *off) / PAGE_SIZE; + + for (pfn = 0; pfn < nr_to_scan; pfn++) { + unsigned long vaddr; + unsigned int ppb_flags = 0; + int st; + + if (!should_dump_page(vma->e, at[pfn])) + continue; + + vaddr = vma->e->start + *off + pfn * PAGE_SIZE; + + if (vma_entry_can_be_lazy(vma->e) && !is_stack(item, vaddr)) + ppb_flags |= PPB_LAZY; + + /* + * If we're doing incremental dump (parent images + * specified) and page is not soft-dirty -- we dump + * hole and expect the parent images to contain this + * page. The latter would be checked in page-xfer. + */ + + if (has_parent && page_in_parent(at[pfn] & PME_SOFT_DIRTY)) { + ret = page_pipe_add_hole(pp, vaddr, PP_HOLE_PARENT); + st = 0; + } else { + ret = page_pipe_add_page(pp, vaddr, ppb_flags); + if (ppb_flags & PPB_LAZY && opts.lazy_pages) + st = 1; + else + st = 2; + } + + if (ret) { + /* Do not do pfn++, just bail out */ + pr_debug("Pagemap full\n"); + break; + } + + pages[st]++; + } + + *off += pfn * PAGE_SIZE; + + cnt_add(CNT_PAGES_SCANNED, nr_to_scan); + cnt_add(CNT_PAGES_SKIPPED_PARENT, pages[0]); + cnt_add(CNT_PAGES_LAZY, pages[1]); + cnt_add(CNT_PAGES_WRITTEN, pages[2]); + + pr_info("Pagemap generated: %lu pages (%lu lazy) %lu holes\n", + pages[2] + pages[1], pages[1], pages[0]); + return ret; +} + +static struct parasite_dump_pages_args *prep_dump_pages_args(struct parasite_ctl *ctl, + struct vm_area_list *vma_area_list, bool skip_non_trackable) +{ + struct parasite_dump_pages_args *args; + struct parasite_vma_entry *p_vma; + struct vma_area *vma; + + args = compel_parasite_args_s(ctl, dump_pages_args_size(vma_area_list)); + + p_vma = pargs_vmas(args); + args->nr_vmas = 0; + + list_for_each_entry(vma, &vma_area_list->h, list) { + if 
(!vma_area_is_private(vma, kdat.task_size)) + continue; + /* + * Kernel write to aio ring is not soft-dirty tracked, + * so we ignore them at pre-dump. + */ + if (vma_entry_is(vma->e, VMA_AREA_AIORING) && skip_non_trackable) + continue; + if (vma->e->prot & PROT_READ) + continue; + + p_vma->start = vma->e->start; + p_vma->len = vma_area_len(vma); + p_vma->prot = vma->e->prot; + + args->nr_vmas++; + p_vma++; + } + + return args; +} + +static int drain_pages(struct page_pipe *pp, struct parasite_ctl *ctl, + struct parasite_dump_pages_args *args) +{ + struct page_pipe_buf *ppb; + int ret = 0; + + debug_show_page_pipe(pp); + + /* Step 2 -- grab pages into page-pipe */ + list_for_each_entry(ppb, &pp->bufs, l) { + args->nr_segs = ppb->nr_segs; + args->nr_pages = ppb->pages_in; + pr_debug("PPB: %d pages %d segs %u pipe %d off\n", + args->nr_pages, args->nr_segs, ppb->pipe_size, args->off); + + ret = compel_rpc_call(PARASITE_CMD_DUMPPAGES, ctl); + if (ret < 0) + return -1; + ret = compel_util_send_fd(ctl, ppb->p[1]); + if (ret) + return -1; + + ret = compel_rpc_sync(PARASITE_CMD_DUMPPAGES, ctl); + if (ret < 0) + return -1; + + args->off += args->nr_segs; + } + + return 0; +} + +static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer) +{ + int ret; + + /* + * Step 3 -- write pages into image (or delay writing for + * pre-dump action (see pre_dump_one_task) + */ + timing_start(TIME_MEMWRITE); + ret = page_xfer_dump_pages(xfer, pp); + timing_stop(TIME_MEMWRITE); + + return ret; +} + +static int detect_pid_reuse(struct pstree_item *item, + struct proc_pid_stat* pps, + InventoryEntry *parent_ie) +{ + unsigned long long dump_ticks; + struct proc_pid_stat pps_buf; + unsigned long long tps; /* ticks per second */ + int ret; + + if (!parent_ie) { + pr_err("Pid-reuse detection failed: no parent inventory, " \ + "check warnings in get_parent_stats\n"); + return -1; + } + + tps = sysconf(_SC_CLK_TCK); + if (tps == -1) { + pr_perror("Failed to get clock ticks via sysconf"); 
+ return -1; + } + + if (!pps) { + pps = &pps_buf; + ret = parse_pid_stat(item->pid->real, pps); + if (ret < 0) + return -1; + } + + dump_ticks = parent_ie->dump_uptime/(USEC_PER_SEC/tps); + + if (pps->start_time >= dump_ticks) { + /* Print "*" if unsure */ + pr_warn("Pid reuse%s detected for pid %d\n", + pps->start_time == dump_ticks ? "*" : "", + item->pid->real); + return 1; + } + return 0; +} + +static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, + struct page_pipe *pp, struct page_xfer *xfer, + struct parasite_dump_pages_args *args, + struct parasite_ctl *ctl, pmc_t *pmc, + bool has_parent, bool pre_dump) +{ + u64 off = 0; + u64 *map; + int ret; + + if (!vma_area_is_private(vma, kdat.task_size) && + !vma_area_is(vma, VMA_ANON_SHARED)) + return 0; + + if (vma_entry_is(vma->e, VMA_AREA_AIORING)) { + if (pre_dump) + return 0; + has_parent = false; + } + + map = pmc_get_map(pmc, vma); + if (!map) + return -1; + + if (vma_area_is(vma, VMA_ANON_SHARED)) + return add_shmem_area(item->pid->real, vma->e, map); + +again: + ret = generate_iovs(item,vma, pp, map, &off, has_parent); + if (ret == -EAGAIN) { + BUG_ON(!(pp->flags & PP_CHUNK_MODE)); + + ret = drain_pages(pp, ctl, args); + if (!ret) + ret = xfer_pages(pp, xfer); + if (!ret) { + page_pipe_reinit(pp); + goto again; + } + } + + return ret; +} + +static int __parasite_dump_pages_seized(struct pstree_item *item, + struct parasite_dump_pages_args *args, + struct vm_area_list *vma_area_list, + struct mem_dump_ctl *mdc, + struct parasite_ctl *ctl) +{ + pmc_t pmc = PMC_INIT; + struct page_pipe *pp; + struct vma_area *vma_area; + struct page_xfer xfer = { .parent = NULL }; + int ret, exit_code = -1; + unsigned cpp_flags = 0; + unsigned long pmc_size; + int possible_pid_reuse = 0; + bool has_parent; + + pr_info("\n"); + pr_info("Dumping pages (type: %d pid: %d)\n", CR_FD_PAGES, item->pid->real); + pr_info("----------------------------------------\n"); + + timing_start(TIME_MEMDUMP); + + pr_debug(" 
Private vmas %lu/%lu pages\n", + vma_area_list->priv_longest, vma_area_list->priv_size); + + /* + * Step 0 -- prepare + */ + + pmc_size = max(vma_area_list->priv_longest, + vma_area_list->shared_longest); + if (pmc_init(&pmc, item->pid->real, &vma_area_list->h, + pmc_size * PAGE_SIZE)) + return -1; + + if (!(mdc->pre_dump || mdc->lazy)) + /* + * Chunk mode pushes pages portion by portion. This mode + * only works when we don't need to keep pp for later + * use, i.e. on non-lazy non-predump. + */ + cpp_flags |= PP_CHUNK_MODE; + pp = create_page_pipe(vma_area_list->priv_size, + mdc->lazy ? NULL : pargs_iovs(args), + cpp_flags); + if (!pp) + goto out; + + if (!mdc->pre_dump) { + /* + * Regular dump -- create xfer object and send pages to it + * right here. For pre-dumps the pp will be taken by the + * caller and handled later. + */ + ret = open_page_xfer(&xfer, CR_FD_PAGEMAP, vpid(item)); + if (ret < 0) + goto out_pp; + + xfer.transfer_lazy = !mdc->lazy; + } else { + ret = check_parent_page_xfer(CR_FD_PAGEMAP, vpid(item)); + if (ret < 0) + goto out_pp; + + if (ret) + xfer.parent = NULL + 1; + } + + if (xfer.parent) { + possible_pid_reuse = detect_pid_reuse(item, mdc->stat, + mdc->parent_ie); + if (possible_pid_reuse == -1) + goto out_xfer; + } + + + /* + * Step 1 -- generate the pagemap + */ + args->off = 0; + has_parent = !!xfer.parent && !possible_pid_reuse; + list_for_each_entry(vma_area, &vma_area_list->h, list) { + ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl, + &pmc, has_parent, mdc->pre_dump); + if (ret < 0) + goto out_xfer; + } + + if (mdc->lazy) + memcpy(pargs_iovs(args), pp->iovs, + sizeof(struct iovec) * pp->nr_iovs); + ret = drain_pages(pp, ctl, args); + if (!ret && !mdc->pre_dump) + ret = xfer_pages(pp, &xfer); + if (ret) + goto out_xfer; + + timing_stop(TIME_MEMDUMP); + + /* + * Step 4 -- clean up + */ + + ret = task_reset_dirty_track(item->pid->real); + if (ret) + goto out_xfer; + exit_code = 0; +out_xfer: + if (!mdc->pre_dump) + 
xfer.close(&xfer); +out_pp: + if (ret || !(mdc->pre_dump || mdc->lazy)) + destroy_page_pipe(pp); + else + dmpi(item)->mem_pp = pp; +out: + pmc_fini(&pmc); + pr_info("----------------------------------------\n"); + return exit_code; +} + +int parasite_dump_pages_seized(struct pstree_item *item, + struct vm_area_list *vma_area_list, + struct mem_dump_ctl *mdc, + struct parasite_ctl *ctl) +{ + int ret; + struct parasite_dump_pages_args *pargs; + + pargs = prep_dump_pages_args(ctl, vma_area_list, mdc->pre_dump); + + /* + * Add PROT_READ protection for all VMAs we're about to + * dump if they don't have one. Otherwise we'll not be + * able to read the memory contents. + * + * Afterwards -- reprotect memory back. + */ + + pargs->add_prot = PROT_READ; + ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl); + if (ret) { + pr_err("Can't dump unprotect vmas with parasite\n"); + return ret; + } + + if (fault_injected(FI_DUMP_PAGES)) { + pr_err("fault: Dump VMA pages failure!\n"); + return -1; + } + + ret = __parasite_dump_pages_seized(item, pargs, vma_area_list, mdc, ctl); + if (ret) { + pr_err("Can't dump page with parasite\n"); + /* Parasite will unprotect VMAs after fail in fini() */ + return ret; + } + + pargs->add_prot = 0; + if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) { + pr_err("Can't rollback unprotected vmas with parasite\n"); + ret = -1; + } + + return ret; +} + +int prepare_mm_pid(struct pstree_item *i) +{ + pid_t pid = vpid(i); + int ret = -1, vn = 0; + struct cr_img *img; + struct rst_info *ri = rsti(i); + + img = open_image(CR_FD_MM, O_RSTR, pid); + if (!img) + return -1; + + ret = pb_read_one_eof(img, &ri->mm, PB_MM); + close_image(img); + if (ret <= 0) + return ret; + + if (collect_special_file(ri->mm->exe_file_id) == NULL) + return -1; + + pr_debug("Found %zd VMAs in image\n", ri->mm->n_vmas); + img = NULL; + if (ri->mm->n_vmas == 0) { + /* + * Old image. 
Read VMAs from vma-.img + */ + img = open_image(CR_FD_VMAS, O_RSTR, pid); + if (!img) + return -1; + } + + + while (vn < ri->mm->n_vmas || img != NULL) { + struct vma_area *vma; + + ret = -1; + vma = alloc_vma_area(); + if (!vma) + break; + + ri->vmas.nr++; + if (!img) + vma->e = ri->mm->vmas[vn++]; + else { + ret = pb_read_one_eof(img, &vma->e, PB_VMA); + if (ret <= 0) { + xfree(vma); + close_image(img); + img = NULL; + break; + } + } + list_add_tail(&vma->list, &ri->vmas.h); + + if (vma_area_is_private(vma, kdat.task_size)) { + ri->vmas.priv_size += vma_area_len(vma); + if (vma_has_guard_gap_hidden(vma)) + ri->vmas.priv_size += PAGE_SIZE; + } + + pr_info("vma 0x%"PRIx64" 0x%"PRIx64"\n", vma->e->start, vma->e->end); + + if (vma_area_is(vma, VMA_ANON_SHARED)) + ret = collect_shmem(pid, vma); + else if (vma_area_is(vma, VMA_FILE_PRIVATE) || + vma_area_is(vma, VMA_FILE_SHARED)) + ret = collect_filemap(vma); + else if (vma_area_is(vma, VMA_AREA_SOCKET)) + ret = collect_socket_map(vma); + else + ret = 0; + if (ret) + break; + } + + if (img) + close_image(img); + return ret; +} + +static inline bool check_cow_vmas(struct vma_area *vma, struct vma_area *pvma) +{ + /* + * VMAs that _may_[1] have COW-ed pages should ... + * + * [1] I say "may" because whether or not particular pages are + * COW-ed is determined later in restore_priv_vma_content() by + * memcmp'aring the contents. + */ + + /* ... coincide by start/stop pair (start is checked by caller) */ + if (vma->e->end != pvma->e->end) + return false; + /* ... both be private (and thus have space in premmaped area) */ + if (!vma_area_is_private(vma, kdat.task_size)) + return false; + if (!vma_area_is_private(pvma, kdat.task_size)) + return false; + /* ... have growsdown and anon flags coincide */ + if ((vma->e->flags ^ pvma->e->flags) & (MAP_GROWSDOWN | MAP_ANONYMOUS)) + return false; + /* ... 
belong to the same file if being filemap */ + if (!(vma->e->flags & MAP_ANONYMOUS) && vma->e->shmid != pvma->e->shmid) + return false; + + pr_debug("Found two COW VMAs @0x%"PRIx64"-0x%"PRIx64"\n", vma->e->start, pvma->e->end); + return true; +} + +static inline bool vma_inherited(struct vma_area *vma) +{ + return (vma->pvma != NULL && vma->pvma != VMA_COW_ROOT); +} + +static void prepare_cow_vmas_for(struct vm_area_list *vmas, struct vm_area_list *pvmas) +{ + struct vma_area *vma, *pvma; + + vma = list_first_entry(&vmas->h, struct vma_area, list); + pvma = list_first_entry(&pvmas->h, struct vma_area, list); + + while (1) { + if ((vma->e->start == pvma->e->start) && check_cow_vmas(vma, pvma)) { + vma->pvma = pvma; + if (pvma->pvma == NULL) + pvma->pvma = VMA_COW_ROOT; + } + + /* <= here to shift from matching VMAs and ... */ + while (vma->e->start <= pvma->e->start) { + vma = vma_next(vma); + if (&vma->list == &vmas->h) + return; + } + + /* ... no == here since we must stop on matching pair */ + while (pvma->e->start < vma->e->start) { + pvma = vma_next(pvma); + if (&pvma->list == &pvmas->h) + return; + } + } +} + +void prepare_cow_vmas(void) +{ + struct pstree_item *pi; + + for_each_pstree_item(pi) { + struct pstree_item *ppi; + struct vm_area_list *vmas, *pvmas; + + ppi = pi->parent; + if (!ppi) + continue; + + vmas = &rsti(pi)->vmas; + if (vmas->nr == 0) /* Zombie */ + continue; + + pvmas = &rsti(ppi)->vmas; + if (pvmas->nr == 0) /* zombies cannot have kids, + * but helpers can (and do) */ + continue; + + if (rsti(pi)->mm->exe_file_id != rsti(ppi)->mm->exe_file_id) + /* + * Tasks running different executables have + * close to zero chance of having cow-ed areas + * and actually kernel never creates such. 
+ */ + continue; + + prepare_cow_vmas_for(vmas, pvmas); + } +} + +/* Map a private vma, if it is not mapped by a parent yet */ +static int premap_private_vma(struct pstree_item *t, struct vma_area *vma, void **tgt_addr) +{ + int ret; + void *addr; + unsigned long nr_pages, size; + + nr_pages = vma_entry_len(vma->e) / PAGE_SIZE; + vma->page_bitmap = xzalloc(BITS_TO_LONGS(nr_pages) * sizeof(long)); + if (vma->page_bitmap == NULL) + return -1; + + /* + * A grow-down VMA has a guard page, which protect a VMA below it. + * So one more page is mapped here to restore content of the first page + */ + if (vma_has_guard_gap_hidden(vma)) + vma->e->start -= PAGE_SIZE; + + size = vma_entry_len(vma->e); + if (!vma_inherited(vma)) { + int flag = 0; + /* + * The respective memory area was NOT found in the parent. + * Map a new one. + */ + + /* + * Restore AIO ring buffer content to temporary anonymous area. + * This will be placed in io_setup'ed AIO in restore_aio_ring(). + */ + if (vma_entry_is(vma->e, VMA_AREA_AIORING)) + flag |= MAP_ANONYMOUS; + else if (vma_area_is(vma, VMA_FILE_PRIVATE)) { + ret = vma->vm_open(vpid(t), vma); + if (ret < 0) { + pr_err("Can't fixup VMA's fd\n"); + return -1; + } + } + + /* + * All mappings here get PROT_WRITE regardless of whether we + * put any data into it or not, because this area will get + * mremap()-ed (branch below) so we MIGHT need to have WRITE + * bits there. Ideally we'd check for the whole COW-chain + * having any data in. + */ + addr = mmap(*tgt_addr, size, + vma->e->prot | PROT_WRITE, + vma->e->flags | MAP_FIXED | flag, + vma->e->fd, vma->e->pgoff); + + if (addr == MAP_FAILED) { + pr_perror("Unable to map ANON_VMA"); + return -1; + } + } else { + void *paddr; + + /* + * The area in question can be COWed with the parent. Remap the + * parent area. Note, that it has already being passed through + * the restore_priv_vma_content() call and thus may have some + * pages in it. 
+ */ + + paddr = decode_pointer(vma->pvma->premmaped_addr); + if (vma_has_guard_gap_hidden(vma)) + paddr -= PAGE_SIZE; + + addr = mremap(paddr, size, size, + MREMAP_FIXED | MREMAP_MAYMOVE, *tgt_addr); + if (addr != *tgt_addr) { + pr_perror("Unable to remap a private vma"); + return -1; + } + } + + vma->e->status |= VMA_PREMMAPED; + vma->premmaped_addr = (unsigned long) addr; + pr_debug("\tpremap %#016"PRIx64"-%#016"PRIx64" -> %016lx\n", + vma->e->start, vma->e->end, (unsigned long)addr); + + if (vma_has_guard_gap_hidden(vma)) { /* Skip guard page */ + vma->e->start += PAGE_SIZE; + vma->premmaped_addr += PAGE_SIZE; + } + + if (vma_area_is(vma, VMA_FILE_PRIVATE)) + vma->vm_open = NULL; /* prevent from 2nd open in prepare_vmas */ + + *tgt_addr += size; + return 0; +} + +static inline bool vma_force_premap(struct vma_area *vma, struct list_head *head) +{ + /* + * On kernels with 4K guard pages, growsdown VMAs + * always have one guard page at the + * beginning and sometimes this page contains data. + * In case the VMA is premmaped, we premmap one page + * larger VMA. In case of in place restore we can only + * do this if the VMA in question is not "guarded" by + * some other VMA. 
+ */ + if (vma->e->flags & MAP_GROWSDOWN) { + if (vma->list.prev != head) { + struct vma_area *prev; + + prev = list_entry(vma->list.prev, struct vma_area, list); + if (prev->e->end == vma->e->start) { + pr_debug("Force premmap for 0x%"PRIx64":0x%"PRIx64"\n", + vma->e->start, vma->e->end); + return true; + } + } + } + + return false; +} + +/* + * Ensure for s390x that vma is below task size on restore system + */ +static int task_size_check(pid_t pid, VmaEntry *entry) +{ +#ifdef __s390x__ + if (entry->end <= kdat.task_size) + return 0; + pr_err("Can't restore high memory region %lx-%lx because kernel does only support vmas up to %lx\n", entry->start, entry->end, kdat.task_size); + return -1; +#else + return 0; +#endif +} + +static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas, + void **at, struct page_read *pr) +{ + struct vma_area *vma; + unsigned long pstart = 0; + int ret = 0; + LIST_HEAD(empty); + + filemap_ctx_init(true); + + list_for_each_entry(vma, &vmas->h, list) { + if (task_size_check(vpid(t), vma->e)) { + ret = -1; + break; + } + if (pstart > vma->e->start) { + ret = -1; + pr_err("VMA-s are not sorted in the image file\n"); + break; + } + pstart = vma->e->start; + + if (!vma_area_is_private(vma, kdat.task_size)) + continue; + + if (vma->pvma == NULL && pr->pieok && !vma_force_premap(vma, &vmas->h)) { + /* + * VMA in question is not shared with anyone. We'll + * restore it with its contents in restorer. + * Now let's check whether we need to map it with + * PROT_WRITE or not. 
+ */ + do { + if (pr->pe->vaddr + pr->pe->nr_pages * PAGE_SIZE <= vma->e->start) + continue; + if (pr->pe->vaddr > vma->e->end) + vma->e->status |= VMA_NO_PROT_WRITE; + break; + } while (pr->advance(pr)); + + continue; + } + + ret = premap_private_vma(t, vma, at); + + if (ret < 0) + break; + } + + filemap_ctx_fini(); + + return ret; +} + +static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr) +{ + struct vma_area *vma; + int ret = 0; + struct list_head *vmas = &rsti(t)->vmas.h; + struct list_head *vma_io = &rsti(t)->vma_io; + + unsigned int nr_restored = 0; + unsigned int nr_shared = 0; + unsigned int nr_dropped = 0; + unsigned int nr_compared = 0; + unsigned int nr_lazy = 0; + unsigned long va; + + vma = list_first_entry(vmas, struct vma_area, list); + rsti(t)->pages_img_id = pr->pages_img_id; + + /* + * Read page contents. + */ + while (1) { + unsigned long off, i, nr_pages; + + ret = pr->advance(pr); + if (ret <= 0) + break; + + va = (unsigned long)decode_pointer(pr->pe->vaddr); + nr_pages = pr->pe->nr_pages; + + /* + * This means that userfaultfd is used to load the pages + * on demand. + */ + if (opts.lazy_pages && pagemap_lazy(pr->pe)) { + pr_debug("Lazy restore skips %ld pages at %lx\n", nr_pages, va); + pr->skip_pages(pr, nr_pages * PAGE_SIZE); + nr_lazy += nr_pages; + continue; + } + + for (i = 0; i < nr_pages; i++) { + unsigned char buf[PAGE_SIZE]; + void *p; + + /* + * The lookup is over *all* possible VMAs + * read from image file. + */ + while (va >= vma->e->end) { + if (vma->list.next == vmas) + goto err_addr; + vma = vma_next(vma); + } + + /* + * Make sure the page address is inside existing VMA + * and the VMA it refers to still private one, since + * there is no guarantee that the data from pagemap is + * valid. 
+ */ + if (va < vma->e->start) + goto err_addr; + else if (unlikely(!vma_area_is_private(vma, kdat.task_size))) { + pr_err("Trying to restore page for non-private VMA\n"); + goto err_addr; + } + + if (!vma_area_is(vma, VMA_PREMMAPED)) { + unsigned long len = min_t(unsigned long, + (nr_pages - i) * PAGE_SIZE, + vma->e->end - va); + + if (vma->e->status & VMA_NO_PROT_WRITE) { + pr_debug("VMA 0x%"PRIx64":0x%"PRIx64" RO %#lx:%lu IO\n", + vma->e->start, vma->e->end, va, nr_pages); + BUG(); + } + + if (pagemap_enqueue_iovec(pr, (void *)va, len, vma_io)) + return -1; + + pr->skip_pages(pr, len); + + va += len; + len >>= PAGE_SHIFT; + nr_restored += len; + i += len - 1; + pr_debug("Enqueue page-read\n"); + continue; + } + + /* + * Otherwise to the COW restore + */ + + off = (va - vma->e->start) / PAGE_SIZE; + p = decode_pointer((off) * PAGE_SIZE + + vma->premmaped_addr); + + set_bit(off, vma->page_bitmap); + if (vma_inherited(vma)) { + clear_bit(off, vma->pvma->page_bitmap); + + ret = pr->read_pages(pr, va, 1, buf, 0); + if (ret < 0) + goto err_read; + + va += PAGE_SIZE; + nr_compared++; + + if (memcmp(p, buf, PAGE_SIZE) == 0) { + nr_shared++; /* the page is cowed */ + continue; + } + + nr_restored++; + memcpy(p, buf, PAGE_SIZE); + } else { + int nr; + + /* + * Try to read as many pages as possible at once. + * + * Within the t pagemap we still have + * nr_pages - i pages (not all, as we might have + * switched VMA above), within the t VMA + * we have at most (vma->end - t_addr) bytes. 
+ */ + + nr = min_t(int, nr_pages - i, (vma->e->end - va) / PAGE_SIZE); + + ret = pr->read_pages(pr, va, nr, p, PR_ASYNC); + if (ret < 0) + goto err_read; + + va += nr * PAGE_SIZE; + nr_restored += nr; + i += nr - 1; + + bitmap_set(vma->page_bitmap, off + 1, nr - 1); + } + + } + } + +err_read: + if (pr->sync(pr)) + return -1; + + pr->close(pr); + if (ret < 0) + return ret; + + /* Remove pages, which were not shared with a child */ + list_for_each_entry(vma, vmas, list) { + unsigned long size, i = 0; + void *addr = decode_pointer(vma->premmaped_addr); + + if (!vma_inherited(vma)) + continue; + + size = vma_entry_len(vma->e) / PAGE_SIZE; + while (1) { + /* Find all pages, which are not shared with this child */ + i = find_next_bit(vma->pvma->page_bitmap, size, i); + + if ( i >= size) + break; + + ret = madvise(addr + PAGE_SIZE * i, + PAGE_SIZE, MADV_DONTNEED); + if (ret < 0) { + pr_perror("madvise failed"); + return -1; + } + i++; + nr_dropped++; + } + } + + cnt_add(CNT_PAGES_COMPARED, nr_compared); + cnt_add(CNT_PAGES_SKIPPED_COW, nr_shared); + cnt_add(CNT_PAGES_RESTORED, nr_restored); + + pr_info("nr_restored_pages: %d\n", nr_restored); + pr_info("nr_shared_pages: %d\n", nr_shared); + pr_info("nr_dropped_pages: %d\n", nr_dropped); + pr_info("nr_lazy: %d\n", nr_lazy); + + return 0; + +err_addr: + pr_err("Page entry address %lx outside of VMA %lx-%lx\n", + va, (long)vma->e->start, (long)vma->e->end); + return -1; +} + +static int maybe_disable_thp(struct pstree_item *t, struct page_read *pr) +{ + struct _MmEntry *mm = rsti(t)->mm; + + /* + * There is no need to disable it if the page read doesn't + * have parent. In this case VMA will be empty until + * userfaultfd_register, so there would be no pages to + * collapse. And, once we register the VMA with uffd, + * khugepaged will skip it. + */ + if (!(opts.lazy_pages && page_read_has_parent(pr))) + return 0; + + if (!kdat.has_thp_disable) + pr_warn("Disabling transparent huge pages. 
" + "It may affect performance!\n"); + + /* + * temporarily disable THP to avoid collapse of pages + * in the areas that will be monitored by uffd + */ + if (prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0)) { + pr_perror("Cannot disable THP"); + return -1; + } + if (!(mm->has_thp_disabled && mm->thp_disabled)) + rsti(t)->has_thp_enabled = true; + + return 0; +} + +int prepare_mappings(struct pstree_item *t) +{ + int ret = 0; + void *addr; + struct vm_area_list *vmas; + struct page_read pr; + + void *old_premmapped_addr = NULL; + unsigned long old_premmapped_len; + + vmas = &rsti(t)->vmas; + if (vmas->nr == 0) /* Zombie */ + goto out; + + /* Reserve a place for mapping private vma-s one by one */ + addr = mmap(NULL, vmas->priv_size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (addr == MAP_FAILED) { + ret = -1; + pr_perror("Unable to reserve memory (%lu bytes)", vmas->priv_size); + goto out; + } + + old_premmapped_addr = rsti(t)->premmapped_addr; + old_premmapped_len = rsti(t)->premmapped_len; + rsti(t)->premmapped_addr = addr; + rsti(t)->premmapped_len = vmas->priv_size; + + ret = open_page_read(vpid(t), &pr, PR_TASK); + if (ret <= 0) + return -1; + + if (maybe_disable_thp(t, &pr)) + return -1; + + pr.advance(&pr); /* shift to the 1st iovec */ + + ret = premap_priv_vmas(t, vmas, &addr, &pr); + if (ret < 0) + goto out; + + pr.reset(&pr); + + ret = restore_priv_vma_content(t, &pr); + if (ret < 0) + goto out; + + if (old_premmapped_addr) { + ret = munmap(old_premmapped_addr, old_premmapped_len); + if (ret < 0) + pr_perror("Unable to unmap %p(%lx)", + old_premmapped_addr, old_premmapped_len); + } + + /* + * Not all VMAs were premmaped. Find out the unused tail of the + * premapped area and unmap it. 
+ */ + old_premmapped_len = addr - rsti(t)->premmapped_addr; + if (old_premmapped_len < rsti(t)->premmapped_len) { + unsigned long tail; + + tail = rsti(t)->premmapped_len - old_premmapped_len; + ret = munmap(addr, tail); + if (ret < 0) + pr_perror("Unable to unmap %p(%lx)", addr, tail); + rsti(t)->premmapped_len = old_premmapped_len; + pr_info("Shrunk premap area to %p(%lx)\n", + rsti(t)->premmapped_addr, rsti(t)->premmapped_len); + } + +out: + return ret; +} + +bool vma_has_guard_gap_hidden(struct vma_area *vma) +{ + return kdat.stack_guard_gap_hidden && (vma->e->flags & MAP_GROWSDOWN); +} + +/* + * A guard page must be unmapped after restoring content and + * forking children to restore COW memory. + */ +int unmap_guard_pages(struct pstree_item *t) +{ + struct vma_area *vma; + struct list_head *vmas = &rsti(t)->vmas.h; + + if (!kdat.stack_guard_gap_hidden) + return 0; + + list_for_each_entry(vma, vmas, list) { + if (!vma_area_is(vma, VMA_PREMMAPED)) + continue; + + if (vma->e->flags & MAP_GROWSDOWN) { + void *addr = decode_pointer(vma->premmaped_addr); + + if (munmap(addr - PAGE_SIZE, PAGE_SIZE)) { + pr_perror("Can't unmap guard page"); + return -1; + } + } + } + + return 0; +} + +int open_vmas(struct pstree_item *t) +{ + int pid = vpid(t); + struct vma_area *vma; + struct vm_area_list *vmas = &rsti(t)->vmas; + + filemap_ctx_init(false); + + list_for_each_entry(vma, &vmas->h, list) { + if (!vma_area_is(vma, VMA_AREA_REGULAR) || !vma->vm_open) + continue; + + pr_info("Opening %#016"PRIx64"-%#016"PRIx64" %#016"PRIx64" (%x) vma\n", + vma->e->start, vma->e->end, + vma->e->pgoff, vma->e->status); + + if (vma->vm_open(pid, vma)) { + pr_err("`- Can't open vma\n"); + return -1; + } + + /* + * File mappings have vm_open set to open_filemap which, in + * turn, puts the VMA_CLOSE bit itself. 
For all the rest we + * need to put it by hands, so that the restorer closes the fd + */ + if (!(vma_area_is(vma, VMA_FILE_PRIVATE) || + vma_area_is(vma, VMA_FILE_SHARED))) + vma->e->status |= VMA_CLOSE; + } + + filemap_ctx_fini(); + + return 0; +} + +static int prepare_vma_ios(struct pstree_item *t, struct task_restore_args *ta) +{ + struct cr_img *pages; + + /* + * If auto-dedup is on we need RDWR mode to be able to punch holes in + * the input files (in restorer.c) + */ + pages = open_image(CR_FD_PAGES, opts.auto_dedup ? O_RDWR : O_RSTR, + rsti(t)->pages_img_id); + if (!pages) + return -1; + + ta->vma_ios_fd = img_raw_fd(pages); + return pagemap_render_iovec(&rsti(t)->vma_io, ta); +} + +int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta) +{ + struct vma_area *vma; + struct vm_area_list *vmas = &rsti(t)->vmas; + + ta->vmas = (VmaEntry *)rst_mem_align_cpos(RM_PRIVATE); + ta->vmas_n = vmas->nr; + + list_for_each_entry(vma, &vmas->h, list) { + VmaEntry *vme; + + vme = rst_mem_alloc(sizeof(*vme), RM_PRIVATE); + if (!vme) + return -1; + + /* + * Copy VMAs to private rst memory so that it's able to + * walk them and m(un|re)map. 
+ */ + *vme = *vma->e; + + if (vma_area_is(vma, VMA_PREMMAPED)) + vma_premmaped_start(vme) = vma->premmaped_addr; + } + + return prepare_vma_ios(t, ta); +} diff --git a/CRIU_code/criu/mount.c b/CRIU_code/criu/mount.c new file mode 100644 index 0000000..c03a435 --- /dev/null +++ b/CRIU_code/criu/mount.c @@ -0,0 +1,3822 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cr_options.h" +#include "util.h" +#include "util-pie.h" +#include "log.h" +#include "plugin.h" +#include "filesystems.h" +#include "mount.h" +#include "pstree.h" +#include "image.h" +#include "namespaces.h" +#include "protobuf.h" +#include "fs-magic.h" +#include "path.h" +#include "files-reg.h" +#include "external.h" +#include "clone-noasan.h" +#include "fdstore.h" + +#include "images/mnt.pb-c.h" + +/* + * Put a : in here since those are invalid on + * the cli, so we know it's autogenerated in + * debugging. + */ +#define AUTODETECTED_MOUNT "CRIU:AUTOGENERATED" +#define NO_ROOT_MOUNT "CRIU:NO_ROOT" +#define MS_PROPAGATE (MS_SHARED | MS_PRIVATE | MS_UNBINDABLE | MS_SLAVE) + +#undef LOG_PREFIX +#define LOG_PREFIX "mnt: " + +#define BINFMT_MISC_HOME "proc/sys/fs/binfmt_misc" +#define CRTIME_MNT_ID 0 + +/* A helper mount_info entry for the roots yard */ +static struct mount_info *root_yard_mp = NULL; + +int ext_mount_add(char *key, char *val) +{ + char *e_str; + + e_str = xmalloc(strlen(key) + strlen(val) + 8); + if (!e_str) + return -1; + + /* + * On dump the key is the mountpoint as seen from the mount + * namespace, the val is some name that will be put into image + * instead of the mount point's root path. + * + * On restore the key is the name from the image (the one + * mentioned above) and the val is the path in criu's mount + * namespace that will become the mount point's root, i.e. -- + * be bind mounted to the respective mountpoint. 
+ */ + + sprintf(e_str, "mnt[%s]:%s", key, val); + return add_external(e_str); +} + +int ext_mount_parse_auto(char *key) +{ + opts.autodetect_ext_mounts = true; + + if (*key == ':') { + key++; + if (*key == 'm') + opts.enable_external_masters = true; + else if (*key == 's') + opts.enable_external_sharing = true; + else if (*key != '\0') + return -1; + } + + return 0; +} + +/* Lookup ext_mount by key field */ +static char *ext_mount_lookup(char *key) +{ + char *v; + int len = strlen(key); + char mkey[len + 6]; + + sprintf(mkey, "mnt[%s]", key); + v = external_lookup_by_key(mkey); + if (IS_ERR(v)) + v = NULL; + + return v; +} + +/* + * Single linked list of mount points get from proc/images + */ +struct mount_info *mntinfo; + +static void mntinfo_add_list(struct mount_info *new) +{ + if (!mntinfo) + mntinfo = new; + else { + struct mount_info *pm; + + /* Add to the tail. (FIXME -- make O(1) ) */ + for (pm = mntinfo; pm->next != NULL; pm = pm->next) + ; + pm->next = new; + } +} + +static struct mount_info *__lookup_overlayfs(struct mount_info *list, char *rpath, + unsigned int st_dev, unsigned int st_ino, + unsigned int mnt_id) +{ + /* + * Goes through all entries in the mountinfo table + * looking for a mount point that contains the file specified + * in rpath. Uses the device number st_dev and the inode number st_ino + * to make sure the file is correct. 
+ */ + struct mount_info *mi_ret = NULL; + struct mount_info *m; + int mntns_root = -1; + + for (m = list; m != NULL; m = m->next) { + struct stat f_stat; + int ret_stat; + + if (m->fstype->code != FSTYPE__OVERLAYFS) + continue; + + /* + * We need the mntns root fd of the process to be dumped, + * to make sure we stat the correct file + */ + if (mntns_root == -1) { + mntns_root = __mntns_get_root_fd(root_item->pid->real); + if (mntns_root < 0) { + pr_err("Unable to get the root file descriptor of pid %d\n", root_item->pid->real); + return ERR_PTR(-ENOENT); + } + } + + /* Concatenates m->mountpoint with rpath and attempts to stat the resulting path */ + if (is_root_mount(m)) { + ret_stat = fstatat(mntns_root, rpath, &f_stat, 0); + } else { + char _full_path[PATH_MAX]; + int n = snprintf(_full_path, PATH_MAX, "%s/%s", m->mountpoint, rpath); + + if (n >= PATH_MAX) { + pr_err("Not enough space to concatenate %s and %s\n", m->mountpoint, rpath); + return ERR_PTR(-ENOSPC); + } + ret_stat = fstatat(mntns_root, _full_path, &f_stat, 0); + } + + if (ret_stat == 0 && st_dev == f_stat.st_dev && st_ino == f_stat.st_ino) + mi_ret = m; + } + + return mi_ret; +} + +/* + * Looks up the mnt_id and path of a file in an overlayFS directory. + * + * This is useful in order to fix the OverlayFS bug present in the + * Linux Kernel before version 4.2. See fixup_overlayfs for details. + * + * We first check to see if the mnt_id and st_dev numbers currently match + * some entry in the mountinfo table. If so, we already have the correct mnt_id + * and no fixup is needed. + * + * Then we proceed to see if there are any overlayFS mounted directories + * in the mountinfo table. If so, we concatenate the mountpoint with the + * name of the file, and stat the resulting path to check if we found the + * correct device id and node number. If that is the case, we update the + * mount id and link variables with the correct values. 
+ */ +struct mount_info *lookup_overlayfs(char *rpath, unsigned int st_dev, + unsigned int st_ino, unsigned int mnt_id) +{ + struct mount_info *m; + + /* If the mnt_id and device number match for some entry, no fixup is needed */ + for (m = mntinfo; m != NULL; m = m->next) + if (st_dev == kdev_to_odev(m->s_dev) && mnt_id == m->mnt_id) + return NULL; + + return __lookup_overlayfs(mntinfo, rpath, st_dev, st_ino, mnt_id); +} + +static struct mount_info *__lookup_mnt_id(struct mount_info *list, int id) +{ + struct mount_info *m; + + for (m = list; m != NULL; m = m->next) + if (m->mnt_id == id) + return m; + + return NULL; +} + +struct mount_info *lookup_mnt_id(unsigned int id) +{ + return __lookup_mnt_id(mntinfo, id); +} + +struct mount_info *lookup_mnt_sdev(unsigned int s_dev) +{ + struct mount_info *m; + + for (m = mntinfo; m != NULL; m = m->next) + /* + * We should not provide notdir bindmounts to open_mount as + * opening them can fail/hang for binds of unix sockets/fifos + */ + if (m->s_dev == s_dev && mnt_is_dir(m)) + return m; + + return NULL; +} + +static struct mount_info *mount_resolve_path(struct mount_info *mntinfo_tree, const char *path) +{ + size_t pathlen = strlen(path); + struct mount_info *m = mntinfo_tree, *c; + + while (1) { + list_for_each_entry(c, &m->children, siblings) { + size_t n; + + n = strlen(c->mountpoint + 1); + if (n > pathlen) + continue; + + if (strncmp(c->mountpoint + 1, path, min(n, pathlen))) + continue; + if (n < pathlen && path[n] != '/') + continue; + + m = c; + break; + } + if (&c->siblings == &m->children) + break; + } + + pr_debug("Path `%s' resolved to `%s' mountpoint\n", path, m->mountpoint); + return m; +} + +dev_t phys_stat_resolve_dev(struct ns_id *ns, dev_t st_dev, const char *path) +{ + struct mount_info *m; + + m = mount_resolve_path(ns->mnt.mntinfo_tree, path); + /* + * BTRFS returns subvolume dev-id instead of + * superblock dev-id, in such case return device + * obtained from mountinfo (ie subvolume0). 
+ */ + return strcmp(m->fstype->name, "btrfs") ? + MKKDEV(major(st_dev), minor(st_dev)) : m->s_dev; +} + +bool phys_stat_dev_match(dev_t st_dev, dev_t phys_dev, + struct ns_id *ns, const char *path) +{ + if (st_dev == kdev_to_odev(phys_dev)) + return true; + + return phys_dev == phys_stat_resolve_dev(ns, st_dev, path); +} + +/* + * Compare super-blocks mounted at two places + */ +static bool mounts_sb_equal(struct mount_info *a, struct mount_info *b) +{ + if (a->fstype != b->fstype) + return false; + + if (a->s_dev != b->s_dev) + return false; + + if (strcmp(a->source, b->source) != 0) + return false; + + if (a->fstype->sb_equal) /* :) */ + return b->fstype->sb_equal(a, b); + + if (strcmp(a->options, b->options)) + return false; + + return true; +} + +/* + * Compare superblocks AND the way they are mounted + */ +static bool mounts_equal(struct mount_info *a, struct mount_info *b) +{ + if (!mounts_sb_equal(a, b)) + return false; + if (strcmp(a->root, b->root)) + return false; + + return true; +} + +/* + * mnt_roots is a temporary directory for restoring sub-trees of + * non-root namespaces. + */ +static char *mnt_roots; + +static struct mount_info *mnt_build_ids_tree(struct mount_info *list, struct mount_info *yard_mount) +{ + struct mount_info *m, *root = NULL; + + /* + * Just resolve the mnt_id:parent_mnt_id relations + */ + + pr_debug("\tBuilding plain mount tree\n"); + for (m = list; m != NULL; m = m->next) { + struct mount_info *parent; + + pr_debug("\t\tWorking on %d->%d\n", m->mnt_id, m->parent_mnt_id); + + if (m->mnt_id != m->parent_mnt_id) + parent = __lookup_mnt_id(list, m->parent_mnt_id); + else /* a circular mount reference. It's rootfs or smth like it. 
*/
+		parent = NULL;
+
+		if (!parent) {
+			/* Only a root mount can be without parent */
+			if (root == NULL && m->is_ns_root) {
+				root = m;
+				if (!yard_mount)
+					continue;
+			}
+
+			if (!root) {
+				pr_err("No parent found for mountpoint %d (@%s)\n",
+					m->mnt_id, m->mountpoint);
+				return NULL;
+			}
+
+			pr_debug("Mountpoint %d (@%s) w/o parent %d\n",
+				 m->mnt_id, m->mountpoint, m->parent_mnt_id);
+
+			if (!mounts_sb_equal(root, m) ||
+			    strcmp(root->root, m->root)) {
+				pr_err("Nested mount namespaces with different "
+				       "roots %d (@%s %s) %d (@%s %s) are not supported yet\n",
+				       root->mnt_id, root->mountpoint, root->root,
+				       m->mnt_id, m->mountpoint, m->root);
+				return NULL;
+			}
+
+			/* Mount all namespace roots into the roots yard. */
+			parent = yard_mount;
+			if (unlikely(!yard_mount)) {
+				pr_err("Nested mount %d (@%s %s) w/o root insertion detected\n",
+				       m->mnt_id, m->mountpoint, m->root);
+				return NULL;
+			}
+
+			pr_debug("Mountpoint %d (@%s) get parent %d (@%s)\n",
+				 m->mnt_id, m->mountpoint,
+				 parent->mnt_id, parent->mountpoint);
+		}
+
+		m->parent = parent;
+		list_add_tail(&m->siblings, &parent->children);
+	}
+
+	if (!root) {
+		pr_err("No root found for tree\n");
+		return NULL;
+	}
+
+	if (yard_mount)
+		return yard_mount;
+
+	return root;
+}
+
+/*
+ * Count the '/' characters in m->mountpoint. mnt_resort_siblings()
+ * uses this number as the nesting depth of a mount.
+ */
+static unsigned int mnt_depth(struct mount_info *m)
+{
+	unsigned int depth = 0;
+	char *c;
+
+	for (c = m->mountpoint; *c != '\0'; c++)
+		if (*c == '/')
+			depth++;
+
+	return depth;
+}
+
+/*
+ * Re-order tree->children by descending mnt_depth() and do the same,
+ * recursively, for the children of every child.
+ */
+static void mnt_resort_siblings(struct mount_info *tree)
+{
+	struct mount_info *m, *p;
+	LIST_HEAD(list);
+
+	/*
+	 * Put siblings of each node in an order they can be (u)mounted
+	 * I.e. if we have mounts on foo/bar/, foo/bar/foobar/ and foo/
+	 * we should put them in the foo/bar/foobar/, foo/bar/, foo/ order.
+	 * Otherwise we will not be able to (u)mount them in a sequence.
+	 *
+	 * Funny, but all we need for this is to sort them in the descending
+	 * order of the amount of /-s in a path =)
+	 *
+	 * Use stupid insertion sort here, we're not expecting mount trees
+	 * to contain hundreds (or more) elements.
+	 */
+
+	pr_info("\tResorting siblings on %d\n", tree->mnt_id);
+	while (!list_empty(&tree->children)) {
+		unsigned int depth;
+
+		/* Detach the first remaining child ... */
+		m = list_first_entry(&tree->children, struct mount_info, siblings);
+		list_del(&m->siblings);
+
+		/*
+		 * ... and find the first already sorted entry that is
+		 * strictly shallower. If the loop runs to completion,
+		 * &p->siblings is the head of the temporary list.
+		 */
+		depth = mnt_depth(m);
+		list_for_each_entry(p, &list, siblings)
+			if (mnt_depth(p) < depth)
+				break;
+
+		/* Insert right before that entry (or at the tail) */
+		list_add_tail(&m->siblings, &p->siblings);
+		mnt_resort_siblings(m);
+	}
+
+	/* Hand the sorted entries back to the (now empty) children list */
+	list_splice(&list, &tree->children);
+}
+
+/*
+ * Log the subtree rooted at @tree via pr_info(): one "[mountpoint](id->parent)"
+ * line per mount, padded by @off, with children printed at @off + 1
+ * and a closing "<--" line after them.
+ */
+static void mnt_tree_show(struct mount_info *tree, int off)
+{
+	struct mount_info *m;
+
+	pr_info("%*s[%s](%d->%d)\n", off, "",
+			tree->mountpoint, tree->mnt_id, tree->parent_mnt_id);
+
+	list_for_each_entry(m, &tree->children, siblings)
+		mnt_tree_show(m, off + 1);
+
+	pr_info("%*s<--\n", off, "");
+}
+
+/* Returns -1 on error, 1 if external mount resolved, 0 otherwise */
+static int try_resolve_ext_mount(struct mount_info *info)
+{
+	char *ext;
+	char devstr[64];
+
+	/* First try: a mapping registered for the mountpoint itself */
+	ext = ext_mount_lookup(info->mountpoint + 1 /* trim the . */);
+	if (ext) {
+		pr_info("Found %s mapping for %s mountpoint\n",
+			ext, info->mountpoint);
+		info->external = ext;
+		return 1;
+	}
+
+	/* Key used for the by-device lookup below: "dev[major/minor]" */
+	snprintf(devstr, sizeof(devstr), "dev[%d/%d]",
+			kdev_major(info->s_dev), kdev_minor(info->s_dev));
+
+	/* Second try, only for file systems of an unsupported type */
+	if (info->fstype->code == FSTYPE__UNSUPPORTED) {
+		char *val;
+
+		val = external_lookup_by_key(devstr);
+		if (!IS_ERR_OR_NULL(val)) {
+			char *source;
+			int len;
+
+			/* sizeof("dev[]") also accounts for the trailing NUL */
+			len = strlen(val) + sizeof("dev[]");
+			source = xrealloc(info->source, len);
+			if (source == NULL)
+				return -1;
+
+			/* Replace the source with "dev[<val>]", switch fstype to auto */
+			snprintf(source, len, "dev[%s]", val);
+			info->fstype = fstype_auto();
+			BUG_ON(info->fstype->code != FSTYPE__AUTO);
+			info->source = source;
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Find the mount_info from which the respective bind-mount
+ * can be created. It can be either an FS-root mount, or the
+ * root of the tree (the latter only if its root path is the
+ * sub-path of the bind mount's root).
+ *
+ * Only the entries linked on bm->mnt_bind are inspected; NULL is
+ * returned when none of them qualifies.
+ */
+
+static struct mount_info *find_fsroot_mount_for(struct mount_info *bm)
+{
+	struct mount_info *sm;
+
+	list_for_each_entry(sm, &bm->mnt_bind, mnt_bind)
+		if (fsroot_mounted(sm) ||
+				(sm->parent == root_yard_mp &&
+				 strstartswith(bm->root, sm->root)))
+			return sm;
+
+	return NULL;
+}
+
+/*
+ * Tell whether @m has to be remapped: true if issubpath() matches any
+ * sibling's mountpoint against m's mountpoint, or if m shares its
+ * mountpoint with its parent and the parent itself needs a remap.
+ * A mount without a parent never needs it.
+ * NOTE(review): the argument order of issubpath() is not visible here --
+ * confirm which of the two paths is expected to be the nested one.
+ */
+static bool mnt_needs_remap(struct mount_info *m)
+{
+	struct mount_info *t;
+
+	if (!m->parent)
+		return false;
+
+	list_for_each_entry(t, &m->parent->children, siblings) {
+		if (m == t)
+			continue;
+		if (issubpath(t->mountpoint, m->mountpoint))
+			return true;
+	}
+
+	/*
+	 * If we are children-overmount and parent is remapped, we should be
+	 * remapped too, else fixup_remap_mounts() won't be able to move parent
+	 * to its real place, it will move child instead.
+	 */
+	if (!strcmp(m->parent->mountpoint, m->mountpoint))
+		return mnt_needs_remap(m->parent);
+
+	return false;
+}
+
+/*
+ * Say mount is external if it was explicitly specified as an
+ * external or it will be bind from such an explicit external
+ * mount, we set bind in propagate_mount and propagate_siblings
+ *
+ * The check is repeated for every master up the ->mnt_master chain.
+ */
+
+static bool mnt_is_external(struct mount_info *m)
+{
+	struct mount_info *t;
+
+	while (m) {
+		if (m->external)
+			return 1;
+
+		/* Any member of the same shared group is external */
+		if (!list_empty(&m->mnt_share))
+			list_for_each_entry(t, &m->mnt_share, mnt_share)
+				if (t->external)
+					return 1;
+
+		/* Bind sources count only when there is no master (master_id <= 0) */
+		if (m->master_id <= 0 && !list_empty(&m->mnt_bind))
+			list_for_each_entry(t, &m->mnt_bind, mnt_bind)
+				if (issubpath(m->root, t->root) && t->external)
+					return 1;
+
+		m = m->mnt_master;
+	}
+
+	return 0;
+}
+
+/*
+ * Having two children with same mountpoint is unsupported. That can happen in
+ * case of mount propagation inside of shared mounts, in that case it is hard
+ * to find out mount propagation siblings and which of these mounts is above
+ * (visible) and which is beneath (hidden). It would've broken mount restore
+ * order in can_mount_now and also visibility assumptions in open_mountpoint.
+ *
+ * Anyway after kernel v4.11 such mounts will be impossible.
+ */ +static int validate_children_collision(struct mount_info *mnt) +{ + struct mount_info *chi, *chj; + + list_for_each_entry(chi, &mnt->children, siblings) { + list_for_each_entry(chj, &mnt->children, siblings) { + if (chj == chi) + break; + if (!strcmp(chj->mountpoint, chi->mountpoint)) { + pr_err("Mount %d has two children with same " + "mountpoint: %d %d\n", + mnt->mnt_id, chj->mnt_id, chi->mnt_id); + return -1; + } + } + } + return 0; +} + +static int validate_mounts(struct mount_info *info, bool for_dump) +{ + struct mount_info *m, *t; + + for (m = info; m; m = m->next) { + if (m->parent == NULL || m->is_ns_root) + /* root mount can be any */ + continue; + + if (validate_children_collision(m)) + return -1; + + if (mnt_is_external(m)) + continue; + + /* + * Mountpoint can point to / of an FS. In that case this FS + * should be of some known type so that we can just mount one. + * + * Otherwise it's a bindmount mountpoint and we try to find + * what fsroot mountpoint it's bound to. If this point is the + * root mount, the path to bindmount root should be accessible + * form the rootmount path (the strstartswith check in the + * else branch below). + */ + + if (fsroot_mounted(m)) { + if (m->fstype->code == FSTYPE__UNSUPPORTED) { + pr_err("FS mnt %s dev %#x root %s unsupported id %d\n", + m->mountpoint, m->s_dev, m->root, m->mnt_id); + return -1; + } + } else { + t = find_fsroot_mount_for(m); + if (!t) { + int ret; + + /* + * No root-mount found for this bind and it's neither + * marked nor auto-resolved as external one. So last + * chance not to fail is to talk to plugins. + */ + + if (for_dump) { + ret = run_plugins(DUMP_EXT_MOUNT, m->mountpoint, m->mnt_id); + if (ret == 0) + m->need_plugin = true; + } else + /* + * Plugin should take care of this one + * in restore_ext_mount, or do_bind_mount + * will mount it as external + */ + ret = m->need_plugin ? 
0 : -ENOTSUP; + + if (ret < 0) { + if (ret == -ENOTSUP) + pr_err("%d:%s doesn't have a proper root mount\n", + m->mnt_id, m->mountpoint); + return -1; + } + } + } + } + + return 0; +} + +static struct mount_info *find_best_external_match(struct mount_info *list, struct mount_info *info) +{ + struct mount_info *it, *candidate = NULL; + + for (it = list; it; it = it->next) { + if (!mounts_sb_equal(info, it)) + continue; + + /* + * This means we have a situation like: + * + * root@criu:~# mount --bind bind1/subdir/ bind2 + * root@criu:~# mount --bind bind1/ bind3 + * + * outside the container, and bind1 is directly bind mounted + * inside the container. mounts_equal() considers these mounts + * equal for bind purposes, but their roots are different, and + * we want to match the one with the right root. + */ + if (!issubpath(info->root, it->root)) + continue; + + candidate = it; + + /* + * Consider the case of: + * + * mount /xxx + * mount --bind /xxx /yyy + * mount --make-shared /yyy + * mount --bind /xxx /zzz + * mount --make-shared /zzz + * bind mount a shared mount into the namespace + * + * Here, we want to return the /right/ mount, not just a mount + * that's equal. However, in the case: + * + * bind mount a shared mount into the namespace + * inside the namespace, remount MS_PRIVATE + * inside the namespace, remount MS_SHARED + * + * there will be no external mount with matching sharing + * because the sharing is only internal; we still want to bind + * mount from this mountinfo so we should return it, but we + * should make the sharing namespace private after that bind + * mount. + * + * Below are the cases where we found an exact match. 
+ */ + if (info->flags & MS_SHARED && info->shared_id == it->shared_id) + return candidate; + + if (info->flags & MS_SLAVE && info->master_id == it->shared_id) + return candidate; + } + + return candidate; +} + +static struct ns_id *find_ext_ns_id(void) +{ + struct ns_id *ns; + + for (ns = ns_ids; ns->next; ns = ns->next) + if (ns->type == NS_CRIU && ns->nd == &mnt_ns_desc) { + if (!ns->mnt.mntinfo_list && + !collect_mntinfo(ns, true)) + break; + return ns; + } + + pr_err("Failed to find criu pid's mount ns\n"); + return NULL; +} + +static int resolve_external_mounts(struct mount_info *info) +{ + struct ns_id *ext_ns = NULL; + struct mount_info *m; + + if (opts.autodetect_ext_mounts) { + ext_ns = find_ext_ns_id(); + if (!ext_ns) + return -1; + } + + for (m = info; m; m = m->next) { + int ret; + char *p, *cut_root; + struct mount_info *match; + + if (m->parent == NULL || m->is_ns_root) + continue; + + ret = try_resolve_ext_mount(m); + if (ret < 0) + return ret; + if (ret == 1 || !ext_ns) + continue; + + match = find_best_external_match(ext_ns->mnt.mntinfo_list, m); + if (!match) + continue; + + if (m->flags & MS_SHARED) { + if (!opts.enable_external_sharing) + continue; + + if (m->shared_id != match->shared_id) + m->internal_sharing = true; + } + + if (m->flags & MS_SLAVE) { + if (!opts.enable_external_masters) + continue; + + /* + * In order to support something like internal slavery, + * we need to teach can_mount_now and do_mount_one + * about slavery relationships in external mounts. This + * seems like an uncommon case, so we punt for not. + */ + if (m->master_id != match->shared_id && m->master_id != match->master_id) + continue; + } + + cut_root = cut_root_for_bind(m->root, match->root); + + p = xsprintf("%s/%s", match->mountpoint + 1, cut_root); + if (!p) + return -1; + + m->external = AUTODETECTED_MOUNT; + + /* + * Put the guessed name in source. It will be picked up + * as auto-root in get_mp_root() on restore. 
+ */ + xfree(m->source); + m->source = p; + + pr_info("autodetected external mount %s for %s\n", p, m->mountpoint); + } + + return 0; +} + +static int root_path_from_parent(struct mount_info *m, char *buf, int size) +{ + bool head_slash = false, tail_slash = false; + int p_len, m_len, len; + + if (!m->parent) + return -1; + + p_len = strlen(m->parent->mountpoint); + m_len = strlen(m->mountpoint); + + len = snprintf(buf, size, "%s", m->parent->root); + if (len >= size) + return -1; + + BUG_ON(len <= 0); + if (buf[len-1] == '/') + tail_slash = true; + + size -= len; + buf += len; + + len = m_len - p_len; + BUG_ON(len < 0); + if (len) { + if (m->mountpoint[p_len] == '/') + head_slash = true; + + len = snprintf(buf, size, "%s%s", + (!tail_slash && !head_slash) ? "/" : "", + m->mountpoint + p_len + (tail_slash && head_slash)); + if (len >= size) + return -1; + } + + return 0; +} + +static int same_propagation_group(struct mount_info *a, struct mount_info *b) { + char root_path_a[PATH_MAX], root_path_b[PATH_MAX]; + + /* + * If mounts are in same propagation group: + * 1) Their parents should be different + * 2) Their parents should be together in same shared group + */ + if (!a->parent || !b->parent || a->parent == b->parent || + a->parent->shared_id != b->parent->shared_id) + return 0; + + if (root_path_from_parent(a, root_path_a, PATH_MAX)) { + pr_err("Failed to get root path for mount %d\n", a->mnt_id); + return -1; + } + + if (root_path_from_parent(b, root_path_b, PATH_MAX)) { + pr_err("Failed to get root path for mount %d\n", b->mnt_id); + return -1; + } + + /* + * 3) Their mountpoints relative to the root of the superblock of their + * parent's share should be equal + */ + if (!strcmp(root_path_a, root_path_b)) + return 1; + return 0; +} + +static int resolve_shared_mounts(struct mount_info *info, int root_master_id) +{ + struct mount_info *m, *t; + + /* + * If we have a shared mounts, both master + * slave targets are to be present in mount + * list, otherwise we 
can't be sure if we can + * recreate the scheme later on restore. + */ + for (m = info; m; m = m->next) { + bool need_share, need_master; + + /* the root master_id can be ignored, because it's already created */ + if (root_master_id && root_master_id == m->master_id) + m->master_id = -1; + + need_share = m->shared_id && list_empty(&m->mnt_share); + need_master = m->master_id > 0; + + pr_debug("Inspecting sharing on %2d shared_id %d master_id %d (@%s)\n", + m->mnt_id, m->shared_id, m->master_id, m->mountpoint); + + for (t = info; t && (need_share || need_master); t = t->next) { + if (t == m) + continue; + if (need_master && t->shared_id == m->master_id) { + pr_debug("\tThe mount %3d is slave for %3d (@%s -> @%s)\n", + m->mnt_id, t->mnt_id, + m->mountpoint, t->mountpoint); + list_add(&m->mnt_slave, &t->mnt_slave_list); + m->mnt_master = t; + need_master = false; + } + + /* Collect all mounts from this group */ + if (need_share && t->shared_id == m->shared_id) { + pr_debug("\tMount %3d is shared with %3d group %3d (@%s -> @%s)\n", + m->mnt_id, t->mnt_id, m->shared_id, + t->mountpoint, m->mountpoint); + list_add(&t->mnt_share, &m->mnt_share); + } + } + + /* + * If we haven't already determined this mount is external, + * or bind of external, then we don't know where it came from. + */ + if (need_master && m->parent && !mnt_is_external(m)) { + pr_err("Mount %d %s (master_id: %d shared_id: %d) " + "has unreachable sharing. Try --enable-external-masters.\n", m->mnt_id, + m->mountpoint, m->master_id, m->shared_id); + return -1; + } + + /* Search bind-mounts */ + if (list_empty(&m->mnt_bind)) { + /* + * A first mounted point will be set up as a source point + * for others. 
Look at propagate_mount() + */ + for (t = m->next; t; t = t->next) { + if (mounts_sb_equal(m, t)) { + list_add(&t->mnt_bind, &m->mnt_bind); + pr_debug("\tThe mount %3d is bind for %3d (@%s -> @%s)\n", + t->mnt_id, m->mnt_id, + t->mountpoint, m->mountpoint); + } + } + } + } + + /* Search propagation groups */ + for (m = info; m; m = m->next) { + struct mount_info *sparent; + + if (!list_empty(&m->mnt_propagate)) + continue; + + if (!m->parent || !m->parent->shared_id) + continue; + + list_for_each_entry(sparent, &m->parent->mnt_share, mnt_share) { + struct mount_info *schild; + + list_for_each_entry(schild, &sparent->children, siblings) { + int ret; + + ret = same_propagation_group(m, schild); + if (ret < 0) + return -1; + else if (ret) { + BUG_ON(!mounts_equal(m, schild)); + pr_debug("\tMount %3d is in same propagation group with %3d (@%s ~ @%s)\n", + m->mnt_id, schild->mnt_id, m->mountpoint, schild->mountpoint); + list_add(&schild->mnt_propagate, &m->mnt_propagate); + } + } + } + } + + return 0; +} + +static struct mount_info *mnt_build_tree(struct mount_info *list, + struct mount_info *root_mp) +{ + struct mount_info *tree; + + /* + * Organize them in a sequence in which they can be mounted/umounted. + */ + + pr_info("Building mountpoints tree\n"); + tree = mnt_build_ids_tree(list, root_mp); + if (!tree) + return NULL; + + mnt_resort_siblings(tree); + pr_info("Done:\n"); + mnt_tree_show(tree, 0); + return tree; +} + +int mnt_is_dir(struct mount_info *pm) +{ + int mntns_root; + struct stat st; + + mntns_root = mntns_get_root_fd(pm->nsid); + if (mntns_root < 0) { + pr_perror("Can't get root fd of mntns for %d", pm->mnt_id); + return 0; + } + + if (fstatat(mntns_root, pm->ns_mountpoint, &st, 0)) { + pr_perror("Can't fstatat on %s", pm->ns_mountpoint); + return 0; + } + + if (S_ISDIR(st.st_mode)) + return 1; + return 0; +} + +/* + * mnt_fd is a file descriptor on the mountpoint, which is closed in an error case. 
+ * If mnt_fd is -1, the mountpoint will be opened by this function. + */ +int __open_mountpoint(struct mount_info *pm, int mnt_fd) +{ + struct stat st; + int dev; + int ret; + + if (mnt_fd == -1) { + int mntns_root; + + mntns_root = mntns_get_root_fd(pm->nsid); + if (mntns_root < 0) + return -1; + + mnt_fd = openat(mntns_root, pm->ns_mountpoint, O_RDONLY); + if (mnt_fd < 0) { + pr_perror("Can't open %s", pm->ns_mountpoint); + return -1; + } + } + + ret = fstat(mnt_fd, &st); + if (ret < 0) { + pr_perror("fstat(%s) failed", pm->ns_mountpoint); + goto err; + } + + if (pm->s_dev_rt == MOUNT_INVALID_DEV) { + pr_err("Resolving over invalid device for %#x %s %s\n", + pm->s_dev, pm->fstype->name, pm->ns_mountpoint); + goto err; + } + + dev = MKKDEV(major(st.st_dev), minor(st.st_dev)); + /* + * Always check for @s_dev_rt here, because the @s_dev + * from the image (in case of restore) has all rights + * to not match the device (say it's migrated and kernel + * allocates new device ID). + */ + if (dev != pm->s_dev_rt) { + pr_err("The file system %#x %#x (%#x) %s %s is inaccessible\n", + pm->s_dev, pm->s_dev_rt, dev, + pm->fstype->name, pm->ns_mountpoint); + goto err; + } + + return mnt_fd; +err: + close(mnt_fd); + return -1; +} + +int open_mount(unsigned int s_dev) +{ + struct mount_info *m; + + m = lookup_mnt_sdev(s_dev); + if (!m) + return -ENOENT; + + return __open_mountpoint(m, -1); +} + +/* Bind-mount a mount point in a temporary place without children */ +static char *get_clean_mnt(struct mount_info *mi, char *mnt_path_tmp, char *mnt_path_root) +{ + char *mnt_path; + + mnt_path = mkdtemp(mnt_path_tmp); + if (mnt_path == NULL && errno == ENOENT) + mnt_path = mkdtemp(mnt_path_root); + if (mnt_path == NULL) { + pr_perror("Can't create a temporary directory"); + return NULL; + } + + if (mount(mi->mountpoint, mnt_path, NULL, MS_BIND, NULL)) { + pr_perror("Can't bind-mount %d:%s to %s", + mi->mnt_id, mi->mountpoint, mnt_path); + rmdir(mnt_path); + return NULL; + } + + 
return mnt_path; +} + +static int get_clean_fd(struct mount_info *mi) +{ + char *mnt_path = NULL; + char mnt_path_tmp[] = "/tmp/cr-tmpfs.XXXXXX"; + char mnt_path_root[] = "/cr-tmpfs.XXXXXX"; + + mnt_path = get_clean_mnt(mi, mnt_path_tmp, mnt_path_root); + if (!mnt_path) + return -1; + + return open_detach_mount(mnt_path); +} + +/* + * Our children mount can have same mountpoint as it's parent, + * call these - children-overmount. + * Sibling mount's mountpoint can be a subpath of our mountpoint + * call these - sibling-overmount. + * In both above cases our mountpoint is not visible from the + * root of our mount namespace as it is covered by other mount. + * mnt_is_overmounted() checks if mount is not visible. + */ +bool mnt_is_overmounted(struct mount_info *mi) +{ + struct mount_info *t, *c, *m = mi; + + if (mi->is_overmounted != -1) + goto exit; + + mi->is_overmounted = 0; + + while (m->parent) { + if (mi->parent->is_overmounted == 1) { + mi->is_overmounted = 1; + goto exit; + } + + /* Check there is no sibling-overmount */ + list_for_each_entry(t, &m->parent->children, siblings) { + if (m == t) + continue; + if (issubpath(m->mountpoint, t->mountpoint)) { + mi->is_overmounted = 1; + goto exit; + } + } + + /* + * If parent has sibling-overmount we are not visible too, + * note that children-overmounts for parent are already + * checked as our sibling overmounts. + */ + m = m->parent; + } + + /* Check there is no children-overmount */ + list_for_each_entry(c, &mi->children, siblings) + if (!strcmp(c->mountpoint, mi->mountpoint)) { + mi->is_overmounted = 1; + goto exit; + } + +exit: + return mi->is_overmounted; +} + +static int set_is_overmounted(struct mount_info *mi) +{ + mnt_is_overmounted(mi); + return 0; +} + +/* + * __umount_children_overmounts() assumes that the mountpoint and + * it's ancestors have no sibling-overmounts, so we can see children + * of these mount. Unmount our children-overmounts now. 
+ */ +static int __umount_children_overmounts(struct mount_info *mi) +{ + struct mount_info *c, *m = mi; + + /* + * Our children-overmount can itself have children-overmount + * which covers it, so find deepest children-overmount which + * is visible for us now. + */ +again: + list_for_each_entry(c, &m->children, siblings) { + if (!strcmp(c->mountpoint, m->mountpoint)) { + m = c; + goto again; + } + } + + /* Unmout children-overmounts in the order of visibility */ + while (m != mi) { + if (umount2(m->mountpoint, MNT_DETACH)) { + pr_perror("Unable to umount child-overmount %s", m->mountpoint); + return -1; + } + BUG_ON(!m->parent); + m = m->parent; + } + + return 0; +} + +/* Makes the mountpoint visible except for children-overmounts. */ +static int __umount_overmounts(struct mount_info *m) +{ + struct mount_info *t, *ovm; + int ovm_len, ovm_len_min = 0; + + /* Root mount has no sibling-overmounts */ + if (!m->parent) + return 0; + + /* + * If parent is sibling-overmounted we are not visible + * too, so first try to unmount overmounts for parent. 
+ */ + if (__umount_overmounts(m->parent)) + return -1; + + /* Unmount sibling-overmounts in visibility order */ +next: + ovm = NULL; + ovm_len = strlen(m->mountpoint) + 1; + list_for_each_entry(t, &m->parent->children, siblings) { + if (m == t) + continue; + if (issubpath(m->mountpoint, t->mountpoint)) { + int t_len = strlen(t->mountpoint); + + if (t_len < ovm_len && t_len > ovm_len_min) { + ovm = t; + ovm_len = t_len; + } + } + } + + if (ovm) { + ovm_len_min = ovm_len; + + /* Our sibling-overmount can have children-overmount covering it */ + if (__umount_children_overmounts(ovm)) + return -1; + + if (umount2(ovm->mountpoint, MNT_DETACH)) { + pr_perror("Unable to umount %s", ovm->mountpoint); + return -1; + } + + goto next; + } + + return 0; +} + +/* Make our mountpoint fully visible */ +static int umount_overmounts(struct mount_info *m) +{ + if (__umount_overmounts(m)) + return -1; + + if (__umount_children_overmounts(m)) + return -1; + + return 0; +} + +struct clone_arg { + struct mount_info *mi; + int *fd; +}; + +/* + * Get access to the mountpoint covered by overmounts + * and open it's cleaned copy (without children mounts). + */ +int ns_open_mountpoint(void *arg) +{ + struct clone_arg *ca = arg; + struct mount_info *mi = ca->mi; + int *fd = ca->fd; + + /* + * We should enter user namespace owning mount namespace of our mount + * before creating helper mount namespace. Else all mounts in helper + * mount namespace will be locked (MNT_LOCKED) and we won't be able to + * unmount them (see CL_UNPRIVILEGED in sys_umount(), clone_mnt() and + * copy_mnt_ns() in linux kernel code). + */ + if ((root_ns_mask & CLONE_NEWUSER) && + switch_ns(root_item->pid->real, &user_ns_desc, NULL) < 0) + goto err; + + /* + * Create a helper mount namespace in which we can safely do unmounts + * without breaking dumping process' environment. 
+ */ + if (unshare(CLONE_NEWNS)) { + pr_perror("Unable to unshare a mount namespace"); + goto err; + } + + /* Remount all mounts as private to disable propagation */ + if (mount("none", "/", NULL, MS_REC|MS_PRIVATE, NULL)) + goto err; + + if (umount_overmounts(mi)) + goto err; + + /* + * Save fd which we opened for parent due to CLONE_FILES flag + * + * Mount can still have children in it, but we don't need to clean it + * explicitly as when last process exits mntns all mounts in it are + * cleaned from their children, and we are exactly the last process. + */ + *fd = open(mi->mountpoint, O_DIRECTORY|O_RDONLY); + if (*fd < 0) { + pr_perror("Unable to open %s", mi->mountpoint); + goto err; + } + + return 0; +err: + return 1; +} + +int open_mountpoint(struct mount_info *pm) +{ + int fd = -1, cwd_fd, ns_old = -1; + + /* No overmounts and children - the entire mount is visible */ + if (list_empty(&pm->children) && !mnt_is_overmounted(pm)) + return __open_mountpoint(pm, -1); + + pr_info("Mount is not fully visible %s\n", pm->mountpoint); + + /* + * We do two things below: + * a) If mount has children mounts in it which partially cover it's + * content, to get access to the content we create a "private" copy of + * such a mount, bind-mounting mount w/o MS_REC in a temporary place. + * b) If mount is overmounted we create a private copy of it's mount + * namespace so that we can safely get rid of overmounts and get an + * access to the mount. + * In both cases we can't do the thing from criu's mount namespace, so + * we need to switch to mount's mount namespace, and later switch back. 
+ */ + cwd_fd = open(".", O_DIRECTORY); + if (cwd_fd < 0) { + pr_perror("Unable to open cwd"); + return -1; + } + + if (switch_ns(pm->nsid->ns_pid, &mnt_ns_desc, &ns_old) < 0) + goto err; + + if (!mnt_is_overmounted(pm)) { + pr_info("\tmount has children %s\n", pm->mountpoint); + fd = get_clean_fd(pm); + } + + /* + * Mount is overmounted or probably we can't create a temporary + * directory for a cleaned mount + */ + if (fd < 0) { + int pid, status; + struct clone_arg ca = { + .mi = pm, + .fd = &fd + }; + + pr_info("\tmount is overmounted or has children %s\n", + pm->mountpoint); + + /* + * We are overmounted - not accessible in a regular way. We + * need to clone "private" copy of mount's monut namespace and + * unmount all covering overmounts in it. We also need to enter + * user namespace owning these mount namespace just before that + * (see explanation in ns_open_mountpoint). Thus we also have + * to create helper process here as entering user namespace is + * irreversible operation. 
+ */ + pid = clone_noasan(ns_open_mountpoint, CLONE_VFORK | CLONE_VM + | CLONE_FILES | CLONE_IO | CLONE_SIGHAND + | CLONE_SYSVSEM, &ca); + if (pid == -1) { + pr_perror("Can't clone helper process"); + goto err; + } + + errno = 0; + if (waitpid(pid, &status, __WALL) != pid || !WIFEXITED(status) + || WEXITSTATUS(status)) { + pr_err("Can't wait or bad status: errno=%d, status=%d\n", + errno, status); + goto err; + } + } + + if (restore_ns(ns_old, &mnt_ns_desc)) { + ns_old = -1; + goto err; + } + + if (fchdir(cwd_fd)) { + pr_perror("Unable to restore cwd"); + close(cwd_fd); + close(fd); + return -1; + } + close(cwd_fd); + + return __open_mountpoint(pm, fd); +err: + if (ns_old >= 0) + restore_ns(ns_old, &mnt_ns_desc); + close_safe(&fd); + if (fchdir(cwd_fd)) + pr_perror("Unable to restore cwd"); + close(cwd_fd); + return -1; +} + +static __maybe_unused int add_cr_time_mount(struct mount_info *root, char *fsname, const char *path, unsigned int s_dev) +{ + struct mount_info *mi, *t, *parent; + bool add_slash = false; + int len; + + if (!root->nsid) { + /* On restore we have fake top mount_info. 
Find real NS_ROOT */ + list_for_each_entry(t, &root->children, siblings) + if (t->nsid->type == NS_ROOT) { + root = t; + break; + } + if (!root->nsid) { + pr_err("Can't find NS_ROOT\n"); + return -1; + } + } + + mi = mnt_entry_alloc(); + if (!mi) + return -1; + + len = strlen(root->mountpoint); + /* It may be "./" or "./path/to/dir" */ + if (root->mountpoint[len - 1] != '/') { + add_slash = true; + len++; + } + + mi->mountpoint = xmalloc(len + strlen(path) + 1); + if (!mi->mountpoint) + return -1; + mi->ns_mountpoint = mi->mountpoint; + if (!add_slash) + sprintf(mi->mountpoint, "%s%s", root->mountpoint, path); + else + sprintf(mi->mountpoint, "%s/%s", root->mountpoint, path); + mi->mnt_id = CRTIME_MNT_ID; + mi->flags = mi->sb_flags = 0; + mi->root = xstrdup("/"); + mi->fsname = xstrdup(fsname); + mi->source = xstrdup(fsname); + mi->options = xstrdup(""); + if (!mi->root || !mi->fsname || !mi->source || !mi->options) + return -1; + mi->fstype = find_fstype_by_name(fsname); + + mi->s_dev = mi->s_dev_rt = s_dev; + + parent = root; + while (1) { + list_for_each_entry(t, &parent->children, siblings) { + if (strstartswith(mi->mountpoint, t->mountpoint)) { + parent = t; + break; + } + } + if (&t->siblings == &parent->children) + break; + } + + mi->nsid = parent->nsid; + mi->parent = parent; + mi->parent_mnt_id = parent->mnt_id; + mi->next = parent->next; + parent->next = mi; + list_add(&mi->siblings, &parent->children); + pr_info("Add cr-time mountpoint %s with parent %s(%u)\n", + mi->mountpoint, parent->mountpoint, parent->mnt_id); + return 0; +} + +/* Returns 1 in case of success, -errno in case of mount fail, and 0 on other errors */ +static __maybe_unused int mount_cr_time_mount(struct ns_id *ns, unsigned int *s_dev, const char *source, + const char *target, const char *type) +{ + int mnt_fd, ret, exit_code = 0; + struct stat st; + + ret = switch_ns(ns->ns_pid, &mnt_ns_desc, &mnt_fd); + if (ret < 0) { + pr_err("Can't switch mnt_ns\n"); + goto out; + } + + ret = 
mount(source, target, type, 0, NULL); + if (ret < 0) { + exit_code = -errno; + goto restore_ns; + } else { + if (stat(target, &st) < 0) { + pr_perror("Can't stat %s", target); + exit_code = 0; + } else { + *s_dev = MKKDEV(major(st.st_dev), minor(st.st_dev)); + exit_code = 1; + } + } + +restore_ns: + ret = restore_ns(mnt_fd, &mnt_ns_desc); +out: + return ret < 0 ? 0 : exit_code; +} + + + +static int dump_one_fs(struct mount_info *mi) +{ + struct mount_info *pm = mi; + struct mount_info *t; + bool first = true; + + if (mi->is_ns_root || mi->need_plugin || mnt_is_external(mi) || !mi->fstype->dump) + return 0; + + /* mnt_bind is a cycled list, so list_for_each can't be used here. */ + for (; &pm->mnt_bind != &mi->mnt_bind || first; + pm = list_entry(pm->mnt_bind.next, typeof(*pm), mnt_bind)) { + int ret; + + first = false; + + if (!fsroot_mounted(pm)) + continue; + + ret = pm->fstype->dump(pm); + if (ret == MNT_UNREACHABLE) + continue; + if (ret < 0) + return ret; + + list_for_each_entry(t, &pm->mnt_bind, mnt_bind) + t->dumped = true; + return 0; + } + + pr_err("Unable to dump a file system for %d:%s\n", + mi->mnt_id, mi->mountpoint); + return -1; +} + +static int dump_one_mountpoint(struct mount_info *pm, struct cr_img *img) +{ + MntEntry me = MNT_ENTRY__INIT; + + pr_info("\t%d: %x:%s @ %s\n", pm->mnt_id, pm->s_dev, + pm->root, pm->mountpoint); + + me.fstype = pm->fstype->code; + + if (me.fstype == FSTYPE__AUTO) + me.fsname = pm->fsname; + + if (!pm->external) { + if (!pm->dumped && dump_one_fs(pm)) + return -1; + + if (!fsroot_mounted(pm) && + pm->fstype->check_bindmount && pm->fstype->check_bindmount(pm)) + return -1; + } + + if (pm->mnt_id == CRTIME_MNT_ID) { + pr_info("Skip dumping cr-time mountpoint: %s\n", pm->mountpoint); + return 0; + } + + me.mnt_id = pm->mnt_id; + me.root_dev = pm->s_dev; + me.parent_mnt_id = pm->parent_mnt_id; + me.flags = pm->flags; + me.sb_flags = pm->sb_flags; + me.has_sb_flags = true; + me.mountpoint = pm->mountpoint + 1; + me.source = 
pm->source; + me.options = pm->options; + me.shared_id = pm->shared_id; + me.has_shared_id = true; + me.master_id = pm->master_id; + me.has_master_id = true; + if (pm->need_plugin) { + me.has_with_plugin = true; + me.with_plugin = true; + } + if (pm->deleted) { + me.has_deleted = true; + me.deleted = true; + } + + if (pm->internal_sharing) { + me.has_internal_sharing = true; + me.internal_sharing = true; + } + + if (pm->external) + /* + * For external mount points dump the mapping's + * value, see collect_mnt_from_image -> get_mp_root + * for reverse mapping details. + */ + me.ext_key = pm->external; + me.root = pm->root; + + if (pb_write_one(img, &me, PB_MNT)) + return -1; + + return 0; +} + +static void free_mntinfo(struct mount_info *pms) +{ + while (pms) { + struct mount_info *pm; + + pm = pms->next; + mnt_entry_free(pms); + pms = pm; + } +} + +struct mount_info *collect_mntinfo(struct ns_id *ns, bool for_dump) +{ + struct mount_info *pm; + + pm = parse_mountinfo(ns->ns_pid, ns, for_dump); + if (!pm) { + pr_err("Can't parse %d's mountinfo\n", ns->ns_pid); + return NULL; + } + + ns->mnt.mntinfo_tree = mnt_build_tree(pm, NULL); + if (ns->mnt.mntinfo_tree == NULL) + goto err; + + ns->mnt.mntinfo_list = pm; + return pm; +err: + free_mntinfo(pm); + return NULL; +} + +static int dump_mnt_ns(struct ns_id *ns, struct mount_info *pms) +{ + struct mount_info *pm; + int ret = -1; + struct cr_img *img; + unsigned int ns_id = ns->id; + + pr_info("Dumping mountpoints\n"); + img = open_image(CR_FD_MNTS, O_DUMP, ns_id); + if (!img) + goto err; + + for (pm = pms; pm && pm->nsid == ns; pm = pm->next) + if (dump_one_mountpoint(pm, img)) + goto err_i; + + ret = 0; +err_i: + close_image(img); +err: + return ret; +} + +/* + * _fn_f - pre-order traversal function + * _fn_r - post-order traversal function + * _plist - a postpone list. _el is added to this list, if _fn_f returns + * a positive value, and all lower elements are not enumerated. 
+ */ +#define MNT_TREE_WALK(_r, _el, _fn_f, _fn_r, _plist, _prgs) do { \ + struct mount_info *_mi = _r; \ + \ + while (1) { \ + int ret; \ + \ + list_del_init(&_mi->postpone); \ + \ + ret = _fn_f(_mi); \ + if (ret < 0) \ + return -1; \ + else if (ret > 0) { \ + list_add_tail(&_mi->postpone, _plist); \ + goto up; \ + } \ + \ + _prgs++; \ + \ + if (!list_empty(&_mi->children)) { \ + _mi = list_entry(_mi->children._el, \ + struct mount_info, siblings); \ + continue; \ + } \ + up: \ + if (_fn_r(_mi)) \ + return -1; \ + if (_mi == _r) \ + break; \ + if (_mi->siblings._el == &_mi->parent->children) { \ + _mi = _mi->parent; \ + goto up; \ + } \ + _mi = list_entry(_mi->siblings._el, \ + struct mount_info, siblings); \ + } \ + } while (0) + +#define MNT_WALK_NONE 0 && + + +static int mnt_tree_for_each(struct mount_info *start, + int (*fn)(struct mount_info *)) +{ + struct mount_info *tmp; + LIST_HEAD(postpone); + LIST_HEAD(postpone2); + int progress; + + pr_debug("Start with %d:%s\n", start->mnt_id, start->mountpoint); + list_add(&start->postpone, &postpone); + +again: + progress = 0; + + list_for_each_entry_safe(start, tmp, &postpone, postpone) + MNT_TREE_WALK(start, next, fn, MNT_WALK_NONE, &postpone2, progress); + + if (!progress) { + struct mount_info *m; + + pr_err("A few mount points can't be mounted\n"); + list_for_each_entry(m, &postpone2, postpone) { + pr_err("%d:%d %s %s %s\n", m->mnt_id, + m->parent_mnt_id, m->root, + m->mountpoint, m->source); + } + return -1; + } + + list_splice_init(&postpone2, &postpone); + + if (!list_empty(&postpone)) + goto again; + + return 0; + +} + +static int mnt_tree_for_each_reverse(struct mount_info *m, + int (*fn)(struct mount_info *)) +{ + int progress = 0; + + MNT_TREE_WALK(m, prev, MNT_WALK_NONE, fn, (struct list_head *) NULL, progress); + + return 0; +} + +static char *resolve_source(struct mount_info *mi) +{ + if (kdev_major(mi->s_dev) == 0) + /* + * Anonymous block device. Kernel creates them for + * diskless mounts. 
+ */ + return mi->source; + + if (mi->fstype->code == FSTYPE__AUTO) { + struct stat st; + char *val; + + val = external_lookup_by_key(mi->source); + if (!IS_ERR_OR_NULL(val)) + return val; + + if (!stat(mi->source, &st) && S_ISBLK(st.st_mode) && + major(st.st_rdev) == kdev_major(mi->s_dev) && + minor(st.st_rdev) == kdev_minor(mi->s_dev)) + return mi->source; + } + + pr_err("No device for %s mount\n", mi->mountpoint); + return NULL; +} + +static int restore_shared_options(struct mount_info *mi, bool private, bool shared, bool slave) +{ + pr_debug("%d:%s private %d shared %d slave %d\n", + mi->mnt_id, mi->mountpoint, private, shared, slave); + + if (mi->flags & MS_UNBINDABLE) { + if (shared || slave) + pr_warn("%s has both unbindable and sharing, ignoring unbindable\n", mi->mountpoint); + else + return mount(NULL, mi->mountpoint, NULL, MS_UNBINDABLE, NULL); + } + + if (private && mount(NULL, mi->mountpoint, NULL, MS_PRIVATE, NULL)) { + pr_perror("Unable to make %s private", mi->mountpoint); + return -1; + } + if (slave && mount(NULL, mi->mountpoint, NULL, MS_SLAVE, NULL)) { + pr_perror("Unable to make %s slave", mi->mountpoint); + return -1; + } + if (shared && mount(NULL, mi->mountpoint, NULL, MS_SHARED, NULL)) { + pr_perror("Unable to make %s shared", mi->mountpoint); + return -1; + } + + return 0; +} + +/* + * Umount points, which are propagated in slave parents, because + * we can't be sure, that they were inherited in a real life. 
+ */ +static int umount_from_slaves(struct mount_info *mi) +{ + struct mount_info *t; + char *mpath, buf[PATH_MAX]; + + list_for_each_entry(t, &mi->parent->mnt_slave_list, mnt_slave) { + if (!t->mounted) + continue; + + mpath = mnt_get_sibling_path(mi, t, buf, sizeof(buf)); + if (mpath == NULL) + continue; + + pr_debug("\t\tUmount slave %s\n", mpath); + if (umount(mpath) == -1) { + pr_perror("Can't umount slave %s", mpath); + return -1; + } + } + + return 0; +} + +/* + * If something is mounted in one shared point, it will be spread in + * all other points from this shared group. + * + * Look at Documentation/filesystems/sharedsubtree.txt for more details + */ +static int propagate_siblings(struct mount_info *mi) +{ + struct mount_info *t; + + /* + * Find all mounts, which must be bind-mounted from this one + * to inherit shared group or master id + */ + list_for_each_entry(t, &mi->mnt_share, mnt_share) { + if (t->mounted) + continue; + if (t->bind && t->bind->shared_id == t->shared_id) + continue; + pr_debug("\t\tBind share %s\n", t->mountpoint); + t->bind = mi; + t->s_dev_rt = mi->s_dev_rt; + } + + list_for_each_entry(t, &mi->mnt_slave_list, mnt_slave) { + if (t->mounted || t->bind) + continue; + pr_debug("\t\tBind slave %s\n", t->mountpoint); + t->bind = mi; + t->s_dev_rt = mi->s_dev_rt; + } + + return 0; +} + +static int propagate_mount(struct mount_info *mi) +{ + struct mount_info *p; + + propagate_siblings(mi); + + if (!mi->parent) + goto skip_parent; + + umount_from_slaves(mi); + + /* Mark mounts in propagation group mounted */ + list_for_each_entry(p, &mi->mnt_propagate, mnt_propagate) { + /* Should not propagate the same mount twice */ + BUG_ON(p->mounted); + pr_debug("\t\tPropagate %s\n", p->mountpoint); + + /* + * When a mount is propagated, the result mount + * is always shared. If we want to get a private + * mount, we need to convert it. 
+ */ + restore_shared_options(p, !p->shared_id, 0, 0); + p->mounted = true; + propagate_siblings(p); + umount_from_slaves(p); + } + +skip_parent: + /* + * FIXME Currently non-root mounts can be restored + * only if a proper root mount exists + */ + if (fsroot_mounted(mi) || mi->parent == root_yard_mp || mi->external) { + struct mount_info *t; + + list_for_each_entry(t, &mi->mnt_bind, mnt_bind) { + if (t->mounted) + continue; + if (t->bind) + continue; + if (t->master_id > 0) + continue; + if (!issubpath(t->root, mi->root)) + continue; + pr_debug("\t\tBind private %s\n", t->mountpoint); + t->bind = mi; + t->s_dev_rt = mi->s_dev_rt; + } + } + + return 0; +} + +static int fetch_rt_stat(struct mount_info *m, const char *where) +{ + struct stat st; + + if (stat(where, &st)) { + pr_perror("Can't stat on %s", where); + return -1; + } + + m->s_dev_rt = MKKDEV(major(st.st_dev), minor(st.st_dev)); + return 0; +} + +/* + * Here are a set of flags which we know how to handle for the one mount call. + * All of them except MS_RDONLY are set only as mnt flags. + * MS_RDONLY is set for both mnt ans sb flags, so we can restore it for one + * mount call only if it set for both masks. 
+ */ +#define MS_MNT_KNOWN_FLAGS (MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_NOATIME | \ + MS_NODIRATIME | MS_RELATIME | MS_RDONLY) + +static int do_simple_mount(struct mount_info *mi, const char *src, const + char *fstype, unsigned long mountflags) +{ + return mount(src, mi->mountpoint, fstype, mountflags, mi->options); +} + +static char *mnt_fsname(struct mount_info *mi) +{ + if (mi->fstype->code == FSTYPE__AUTO) + return mi->fsname; + return mi->fstype->name; +} + +static int apply_sb_flags(void *args, int fd, pid_t pid) +{ + unsigned long flags = *(unsigned long *) args; + int rst = -1, err = -1; + char path[PSFDS]; + + snprintf(path, sizeof(path), "/proc/self/fd/%d", fd); + + if (pid != getpid() && switch_ns(pid, &mnt_ns_desc, &rst)) + return -1; + + err = mount(NULL, path, NULL, MS_REMOUNT | flags, NULL); + if (err) + pr_perror("Unable to remount %s", path); + + if (rst >= 0 && restore_ns(rst, &mnt_ns_desc)) + return -1; + + return err; +} + +static int do_new_mount(struct mount_info *mi) +{ + unsigned long sflags = mi->sb_flags; + unsigned long mflags = mi->flags & (~MS_PROPAGATE); + char *src; + struct fstype *tp = mi->fstype; + bool remount_ro = (tp->restore && mi->sb_flags & MS_RDONLY); + mount_fn_t do_mount = (tp->mount) ? 
tp->mount : do_simple_mount; + + src = resolve_source(mi); + if (!src) + return -1; + + /* Merge superblock and mount flags if it's possible */ + if (!(mflags & ~MS_MNT_KNOWN_FLAGS) && !((sflags ^ mflags) & MS_RDONLY)) { + sflags |= mflags; + mflags = 0; + } + + if (remount_ro) + sflags &= ~MS_RDONLY; + + if (do_mount(mi, src, mnt_fsname(mi), sflags) < 0) { + pr_perror("Can't mount at %s", mi->mountpoint); + return -1; + } + + if (tp->restore && tp->restore(mi)) + return -1; + + if (mi->mnt_id == CRTIME_MNT_ID) { + /* C-r time mountpoint, umount it */ + if (umount(mi->mountpoint) < 0) { + pr_perror("Can't umount %s", mi->mountpoint); + return -1; + } + goto out; + } + + if (!mi->is_ns_root && remount_ro) { + int fd; + + fd = open(mi->mountpoint, O_PATH); + if (fd < 0) { + pr_perror("Unable to open %s", mi->mountpoint); + return -1; + } + sflags |= MS_RDONLY; + if (userns_call(apply_sb_flags, 0, + &sflags, sizeof(sflags), fd)) { + pr_perror("Unable to apply mount flags %d for %s", + mi->sb_flags, mi->mountpoint); + close(fd); + return -1; + } + close(fd); + } + + if (mflags && mount(NULL, mi->mountpoint, NULL, + MS_REMOUNT | MS_BIND | mflags, NULL)) { + pr_perror("Unable to apply bind-mount options"); + return -1; + } + + /* + * A slave should be mounted from do_bind_mount(). + * Look at can_mount_now() for details. + */ + BUG_ON(mi->master_id); + if (restore_shared_options(mi, !mi->shared_id, mi->shared_id, 0)) + return -1; +out: + mi->mounted = true; + + return 0; +} + +static int restore_ext_mount(struct mount_info *mi) +{ + int ret; + + pr_debug("Restoring external bind mount %s\n", mi->mountpoint); + ret = run_plugins(RESTORE_EXT_MOUNT, mi->mnt_id, mi->mountpoint, "/", NULL); + if (ret) + pr_err("Can't restore ext mount (%d)\n", ret); + return ret; +} + +static char mnt_clean_path[] = "/tmp/cr-tmpfs.XXXXXX"; + +static int mount_clean_path() +{ + /* + * To make a bind mount, we need to have access to a source directory, + * which can be over-mounted. 
The idea is to mount a source mount in + * an intermediate place without MS_REC and then create a target mounts. + * This intermediate place should be a private mount to not affect + * properties of the source mount. + */ + if (mkdtemp(mnt_clean_path) == NULL) { + pr_perror("Unable to create a temporary directory"); + return -1; + } + + if (mount(mnt_clean_path, mnt_clean_path, NULL, MS_BIND, NULL)) { + pr_perror("Unable to mount tmpfs into %s", mnt_clean_path); + return -1; + } + + if (mount(NULL, mnt_clean_path, NULL, MS_PRIVATE, NULL)) { + pr_perror("Unable to mark %s as private", mnt_clean_path); + return -1; + } + + return 0; +} + +static int umount_clean_path() +{ + if (umount2(mnt_clean_path, MNT_DETACH)) { + pr_perror("Unable to umount %s", mnt_clean_path); + return -1; + } + + if (rmdir(mnt_clean_path)) { + pr_perror("Unable to remove %s", mnt_clean_path); + } + + return 0; +} + +static int do_bind_mount(struct mount_info *mi) +{ + char mnt_fd_path[PSFDS]; + char *root, *cut_root, rpath[PATH_MAX]; + unsigned long mflags; + int exit_code = -1, mp_len; + bool shared = false; + bool master = false; + bool private = false; + char *mnt_path = NULL; + struct stat st; + bool umount_mnt_path = false; + struct mount_info *c; + + if (mi->need_plugin) { + if (restore_ext_mount(mi)) + return -1; + goto out; + } + + if (mi->external) { + /* + * We have / pointing to criu's ns root still, + * so just use the mapping's path. The mountpoint + * is tuned in collect_mnt_from_image to refer + * to proper location in the namespace we restore. 
+ */ + root = mi->external; + private = !mi->master_id && (mi->internal_sharing || !mi->shared_id); + goto do_bind; + } + + shared = mi->shared_id && mi->shared_id == mi->bind->shared_id; + master = mi->master_id && mi->master_id == mi->bind->master_id; + private = !mi->master_id && !shared; + cut_root = cut_root_for_bind(mi->root, mi->bind->root); + + /* Mount private can be initialized on mount() callback, which is + * called only once. + * It have to be copied to all it's sibling structures to provide users + * of it with actual data. + */ + mi->private = mi->bind->private; + + mnt_path = mi->bind->mountpoint; + + /* Access a mount by fd if mi->bind->mountpoint is overmounted */ + if (mi->bind->fd >= 0) { + snprintf(mnt_fd_path, sizeof(mnt_fd_path), + "/proc/self/fd/%d", mi->bind->fd); + mnt_path = mnt_fd_path; + } + + if (cut_root[0] == 0) /* This case is handled by mi->bind->fd */ + goto skip_overmount_check; + + /* + * The target path may be over-mounted by one of child mounts + * and we need to create a new bind-mount to get access to the path. 
+ */ + mp_len = strlen(mi->bind->mountpoint); + if (mp_len > 1) /* skip a joining / if mi->bind->mountpoint isn't "/" */ + mp_len++; + + list_for_each_entry(c, &mi->bind->children, siblings) { + if (!c->mounted) + continue; + if (issubpath(cut_root, c->mountpoint + mp_len)) + break; /* a source path is overmounted */ + } + + if (&c->siblings != &mi->bind->children) { + /* Get a copy of mi->bind without child mounts */ + if (mount(mnt_path, mnt_clean_path, NULL, MS_BIND, NULL)) { + pr_perror("Unable to bind-mount %s to %s", + mnt_path, mnt_clean_path); + return -1; + } + mnt_path = mnt_clean_path; + umount_mnt_path = true; + } + + if (mnt_path == NULL) + return -1; + +skip_overmount_check: + snprintf(rpath, sizeof(rpath), "%s/%s", + mnt_path, cut_root); + root = rpath; +do_bind: + pr_info("\tBind %s to %s\n", root, mi->mountpoint); + + if (unlikely(mi->deleted)) { + if (stat(mi->mountpoint, &st)) { + pr_perror("Can't fetch stat on %s", mi->mountpoint); + goto err; + } + + if (S_ISDIR(st.st_mode)) { + if (mkdir(root, (st.st_mode & ~S_IFMT))) { + pr_perror("Can't re-create deleted directory %s", root); + goto err; + } + } else if (S_ISREG(st.st_mode)) { + int fd = open(root, O_WRONLY | O_CREAT | O_EXCL, + st.st_mode & ~S_IFMT); + if (fd < 0) { + pr_perror("Can't re-create deleted file %s", root); + goto err; + } + close(fd); + } else { + pr_err("Unsupported st_mode 0%o deleted root %s\n", + (int)st.st_mode, root); + goto err; + } + } + + if (mount(root, mi->mountpoint, NULL, MS_BIND | (mi->flags & MS_REC), NULL) < 0) { + pr_perror("Can't mount at %s", mi->mountpoint); + goto err; + } + + mflags = mi->flags & (~MS_PROPAGATE); + if (!mi->bind || mflags != (mi->bind->flags & (~MS_PROPAGATE))) + if (mount(NULL, mi->mountpoint, NULL, MS_BIND | MS_REMOUNT | mflags, NULL)) { + pr_perror("Can't mount at %s", mi->mountpoint); + goto err; + } + + if (unlikely(mi->deleted)) { + if (S_ISDIR(st.st_mode)) { + if (rmdir(root)) { + pr_perror("Can't remove deleted directory %s", 
root); + goto err; + } + } else if (S_ISREG(st.st_mode)) { + if (unlink(root)) { + pr_perror("Can't unlink deleted file %s", root); + goto err; + } + } + } +out: + /* + * shared - the mount is in the same shared group with mi->bind + * mi->shared_id && !shared - create a new shared group + */ + if (restore_shared_options(mi, private, + mi->shared_id && !shared, + mi->master_id && !master)) + return -1; + + mi->mounted = true; + exit_code = 0; +err: + if (umount_mnt_path) { + /* + * If mnt_path was shared, a new mount may be propagated + * into it. + */ + if (mount(NULL, mnt_path, NULL, MS_PRIVATE, NULL)) { + pr_perror("Unable to make %s private", mnt_path); + return -1; + } + if (umount2(mnt_path, MNT_DETACH)) { + pr_perror("Unable to umount %s", mnt_path); + return -1; + } + } + return exit_code; +} + +static bool rst_mnt_is_root(struct mount_info *m) +{ + return (m->is_ns_root && m->nsid->id == root_item->ids->mnt_ns_id); +} + +static bool can_mount_now(struct mount_info *mi) +{ + if (rst_mnt_is_root(mi)) + return true; + + /* Parent should be mounted already, that's how mnt_tree_for_each works */ + BUG_ON(mi->parent && !mi->parent->mounted); + + if (mi->external) + goto shared; + + /* + * We're the slave peer: + * - Make sure the master peer is already mounted + * - Make sure all children of master's share are + * mounted as well to eliminate mounts duplications + */ + if (mi->mnt_master) { + struct mount_info *c, *s; + + if (mi->bind == NULL) + return false; + + list_for_each_entry(c, &mi->mnt_master->children, siblings) + if (!c->mounted) + return false; + + list_for_each_entry(s, &mi->mnt_master->mnt_share, mnt_share) + list_for_each_entry(c, &s->children, siblings) + if (!c->mounted) + return false; + } + + if (!fsroot_mounted(mi) && (mi->bind == NULL && !mi->need_plugin)) + return false; + +shared: + /* Mount only after all parents of our propagation group mounted */ + if (!list_empty(&mi->mnt_propagate)) { + struct mount_info *p; + + list_for_each_entry(p, 
&mi->mnt_propagate, mnt_propagate) { + BUG_ON(!p->parent); + if (!p->parent->mounted) + return false; + } + } + + /* + * Mount only after all children of share, which shouldn't + * (but can if wrong order) propagate to us, are mounted + */ + if (mi->shared_id) { + struct mount_info *s, *c, *p, *t; + LIST_HEAD(mi_notprop); + bool can = true; + + /* Add all children of the shared group */ + list_for_each_entry(s, &mi->mnt_share, mnt_share) { + list_for_each_entry(c, &s->children, siblings) { + char root_path[PATH_MAX]; + int ret; + + ret = root_path_from_parent(c, root_path, PATH_MAX); + BUG_ON(ret); + + /* Mount is out of our root */ + if (!issubpath(root_path, mi->root)) + continue; + + list_add(&c->mnt_notprop, &mi_notprop); + } + } + + /* Delete all members of our children's propagation groups */ + list_for_each_entry(c, &mi->children, siblings) { + list_for_each_entry(p, &c->mnt_propagate, mnt_propagate) { + list_del_init(&p->mnt_notprop); + } + } + + /* Delete all members of our propagation group */ + list_for_each_entry(p, &mi->mnt_propagate, mnt_propagate) { + list_del_init(&p->mnt_notprop); + } + + /* Delete self */ + list_del_init(&mi->mnt_notprop); + + /* Check not propagated mounts mounted and cleanup list */ + list_for_each_entry_safe(p, t, &mi_notprop, mnt_notprop) { + if (!p->mounted) + can = false; + list_del_init(&p->mnt_notprop); + } + + if (!can) + return false; + } + + return true; +} + +static int do_mount_root(struct mount_info *mi) +{ + if (restore_shared_options(mi, !mi->shared_id && !mi->master_id, + mi->shared_id, mi->master_id)) + return -1; + + return fetch_rt_stat(mi, mi->mountpoint); +} + +static int do_close_one(struct mount_info *mi) +{ + close_safe(&mi->fd); + return 0; +} + +static int do_mount_one(struct mount_info *mi) +{ + int ret; + + if (mi->mounted) + return 0; + + if (!can_mount_now(mi)) { + pr_debug("Postpone slave %s\n", mi->mountpoint); + return 1; + } + + if (!strcmp(mi->parent->mountpoint, mi->mountpoint)) { + 
mi->parent->fd = open(mi->parent->mountpoint, O_PATH); + if (mi->parent->fd < 0) { + pr_perror("Unable to open %s", mi->mountpoint); + return -1; + } + } + + pr_debug("\tMounting %s @%s (%d)\n", mi->fstype->name, mi->mountpoint, mi->need_plugin); + + if (rst_mnt_is_root(mi)) { + if (opts.root == NULL) { + pr_err("The --root option is required to restore a mount namespace\n"); + return -1; + } + + /* do_mount_root() is called from populate_mnt_ns() */ + if (mount(opts.root, mi->mountpoint, NULL, MS_BIND | MS_REC, NULL)) + return -1; + if (do_mount_root(mi)) + return -1; + mi->mounted = true; + ret = 0; + } else if (!mi->bind && !mi->need_plugin && !mi->external) + ret = do_new_mount(mi); + else + ret = do_bind_mount(mi); + + if (ret == 0 && fetch_rt_stat(mi, mi->mountpoint)) + return -1; + + if (ret == 0 && propagate_mount(mi)) + return -1; + + if (mi->fstype->code == FSTYPE__UNSUPPORTED) { + struct statfs st; + + if (statfs(mi->mountpoint, &st)) { + pr_perror("Unable to statfs %s", mi->mountpoint); + return -1; + } + if (st.f_type == BTRFS_SUPER_MAGIC) + mi->fstype = find_fstype_by_name("btrfs"); + } + + return ret; +} + +static int do_umount_one(struct mount_info *mi) +{ + if (!mi->parent) + return 0; + + if (mount("none", mi->parent->mountpoint, "none", MS_REC|MS_PRIVATE, NULL)) { + pr_perror("Can't mark %s as private", mi->parent->mountpoint); + return -1; + } + + if (umount(mi->mountpoint)) { + pr_perror("Can't umount at %s", mi->mountpoint); + return -1; + } + + pr_info("Umounted at %s\n", mi->mountpoint); + return 0; +} + +/* + * If a mount overmounts other mounts, it is restored separately in the roots + * yard and then moved to the right place. + * + * mnt_remap_entry is created for each such mount and it's added into + * mnt_remap_list. The origin mount point is replaced on a new one in + * roots_yard where it will be restored. The remapped mount will be + * moved to the right places after restoring all mounts. 
+ */ + +static inline int print_ns_root(struct ns_id *ns, int remap_id, char *buf, int bs); +static int get_mp_mountpoint(char *mountpoint, struct mount_info *mi, char *root, int root_len); + +static LIST_HEAD(mnt_remap_list); +static int remap_id; + +struct mnt_remap_entry { + struct mount_info *mi; /* child is remaped into the root yards */ + struct mount_info *parent; /* the origin parent for the child*/ + struct list_head node; +}; + +static int do_remap_mount(struct mount_info *m) +{ + int len; + + /* A path in root_yard has a fixed size, so it can be replaced. */ + len = print_ns_root(m->nsid, remap_id, m->mountpoint, PATH_MAX); + m->mountpoint[len] = '/'; + + return 0; +} + +static int try_remap_mount(struct mount_info *m) +{ + struct mnt_remap_entry *r; + + if (!mnt_needs_remap(m)) + return 0; + + BUG_ON(!m->parent); + + r = xmalloc(sizeof(struct mnt_remap_entry)); + if (!r) + return -1; + + r->mi = m; + list_add_tail(&r->node, &mnt_remap_list); + + return 0; +} + +static int find_remap_mounts(struct mount_info *root) +{ + struct mnt_remap_entry *r; + struct mount_info *m; + + /* + * It's impossible to change a tree without interrupting + * enumeration, so on the first step mounts are added + * into mnt_remap_list and then they are connected to root_yard_mp. 
/*
 * Make @root (or the current directory, if @root is NULL) the root of
 * the current mount namespace and detach the old root.
 *
 * pivot_root() needs a directory under the new root to park the old
 * root on. "tmp" is used if it exists and is a directory, otherwise a
 * temporary directory is created (and removed again at the end). That
 * directory is first bind-mounted onto itself and made private.
 *
 * Returns 0 on success, -1 on error.
 */
static int cr_pivot_root(char *root)
{
	char tmp_dir_tmpl[] = "crtools-put-root.XXXXXX";
	bool tmp_dir = false;
	char *put_root = "tmp";
	int exit_code = -1;
	struct stat st;

	pr_info("Move the root to %s\n", root ? : ".");

	if (root) {
		if (chdir(root)) {
			pr_perror("chdir(%s) failed", root);
			return -1;
		}
	}

	if (stat(put_root, &st) || !S_ISDIR(st.st_mode)) {
		put_root = mkdtemp(tmp_dir_tmpl);
		if (put_root == NULL) {
			pr_perror("Can't create a temporary directory");
			return -1;
		}
		tmp_dir = true;
	}

	if (mount(put_root, put_root, NULL, MS_BIND, NULL)) {
		pr_perror("Unable to bind-mount %s", put_root);
		goto err_root;
	}

	if (mount(NULL, put_root, NULL, MS_PRIVATE, NULL)) {
		pr_perror("Can't remount %s with MS_PRIVATE", put_root);
		goto err_tmpfs;
	}

	if (pivot_root(".", put_root)) {
		pr_perror("pivot_root(., %s) failed", put_root);
		goto err_tmpfs;
	}

	/*
	 * Turn the old root into a slave tree before detaching it. There
	 * is no way back from here, hence no cleanup on failure.
	 */
	if (mount("none", put_root, "none", MS_REC|MS_SLAVE, NULL)) {
		pr_perror("Can't remount %s with MS_SLAVE", put_root);
		return -1;
	}

	exit_code = 0;

	/*
	 * Two mounts are stacked on put_root at this point: the old root
	 * placed there by pivot_root() and the bind mount made above.
	 * This umount takes off the first of them, the one under
	 * err_tmpfs the second.
	 */
	if (umount2(put_root, MNT_DETACH)) {
		pr_perror("Can't umount %s", put_root);
		return -1;
	}

err_tmpfs:
	if (umount2(put_root, MNT_DETACH)) {
		pr_perror("Can't umount %s", put_root);
		return -1;
	}

err_root:
	if (tmp_dir && rmdir(put_root)) {
		pr_perror("Can't remove the directory %s", put_root);
		return -1;
	}

	return exit_code;
}
+ */ + BUILD_BUG_ON(MOUNT_INVALID_DEV); + + new = xzalloc(sizeof(struct mount_info)); + if (new) { + new->fd = -1; + new->is_overmounted = -1; + INIT_LIST_HEAD(&new->children); + INIT_LIST_HEAD(&new->siblings); + INIT_LIST_HEAD(&new->mnt_slave_list); + INIT_LIST_HEAD(&new->mnt_share); + INIT_LIST_HEAD(&new->mnt_bind); + INIT_LIST_HEAD(&new->mnt_propagate); + INIT_LIST_HEAD(&new->mnt_notprop); + INIT_LIST_HEAD(&new->postpone); + } + return new; +} + +void mnt_entry_free(struct mount_info *mi) +{ + if (mi) { + xfree(mi->root); + xfree(mi->mountpoint); + xfree(mi->source); + xfree(mi->options); + xfree(mi->fsname); + xfree(mi); + } +} + +/* + * Helper for getting a path to where the namespace's root + * is re-constructed. + */ +static inline int print_ns_root(struct ns_id *ns, int remap_id, char *buf, int bs) +{ + return snprintf(buf, bs, "%s/%d-%010d", mnt_roots, ns->id, remap_id); +} + +static int create_mnt_roots(void) +{ + int exit_code = -1; + + if (mnt_roots) + return 0; + + mnt_roots = xstrdup("/tmp/.criu.mntns.XXXXXX"); + if (mnt_roots == NULL) + goto out; + + if (mkdtemp(mnt_roots) == NULL) { + pr_perror("Unable to create a temporary directory"); + mnt_roots = NULL; + goto out; + } + chmod(mnt_roots, 0777); + + exit_code = 0; +out: + return exit_code; +} + +static int get_mp_root(MntEntry *me, struct mount_info *mi) +{ + char *ext = NULL; + + BUG_ON(me->ext_mount && me->ext_key); + + /* Forward compatibility fixup */ + if (me->ext_mount) { + me->ext_key = me->root; + /* + * Putting the id of external mount which is provided by user, + * to ->root can confuse mnt_is_external and other functions + * which expect to see the path in the file system to the root + * of these mount (mounts_equal, mnt_build_ids_tree, + * find_fsroot_mount_for, find_best_external_match, etc.) 
+ */ + me->root = NO_ROOT_MOUNT; + } + + mi->root = xstrdup(me->root); + if (!mi->root) + return -1; + + if (!me->ext_key) + goto out; + + /* + * External mount point -- get the reverse mapping + * from the command line and put into root's place + */ + + ext = ext_mount_lookup(me->ext_key); + if (!ext) { + if (!opts.autodetect_ext_mounts) { + pr_err("No mapping for %s mountpoint\n", me->mountpoint); + return -1; + } + + /* + * Make up an external mount entry for this + * mount point, since we couldn't find a user + * supplied one. + * + * The 'val' was put into mi->source during + * dump by resolve_external_mounts(). + */ + + ext = mi->source; + } + + mi->external = ext; +out: + pr_debug("\t\tWill mount %d from %s%s\n", + mi->mnt_id, ext ? : mi->root, ext ? " (E)" : ""); + return 0; +} + +static int get_mp_mountpoint(char *mountpoint, struct mount_info *mi, char *root, int root_len) +{ + int len; + + len = strlen(mountpoint) + root_len + 1; + mi->mountpoint = xmalloc(len); + if (!mi->mountpoint) + return -1; + + /* + * For bind-mounts we would also fix the root here + * too, but bind-mounts restore merges mountpoint + * and root paths together, so there's no need in + * that. 
+ */ + + strcpy(mi->mountpoint, root); + strcpy(mi->mountpoint + root_len, mountpoint); + + mi->ns_mountpoint = mi->mountpoint + root_len; + + pr_debug("\t\tWill mount %d @ %s\n", mi->mnt_id, mi->mountpoint); + return 0; +} + +static int collect_mnt_from_image(struct mount_info **pms, struct ns_id *nsid) +{ + MntEntry *me = NULL; + int ret, root_len = 1; + struct cr_img *img; + char root[PATH_MAX] = "."; + + img = open_image(CR_FD_MNTS, O_RSTR, nsid->id); + if (!img) + return -1; + + root_len = print_ns_root(nsid, 0, root, sizeof(root)); + + pr_debug("Reading mountpoint images (id %d pid %d)\n", + nsid->id, (int)nsid->ns_pid); + + while (1) { + struct mount_info *pm; + + ret = pb_read_one_eof(img, &me, PB_MNT); + if (ret <= 0) + break; + + pm = mnt_entry_alloc(); + if (!pm) + goto err; + + pm->nsid = nsid; + pm->next = *pms; + *pms = pm; + + pm->mnt_id = me->mnt_id; + pm->parent_mnt_id = me->parent_mnt_id; + pm->s_dev = me->root_dev; + pm->flags = me->flags; + pm->sb_flags = me->sb_flags; + if (!me->has_sb_flags) { + const unsigned int mflags = MS_SHARED | MS_PRIVATE | + MS_SLAVE | MS_UNBINDABLE | + MS_NOSUID | MS_NODEV | MS_NOEXEC | + MS_NOATIME | MS_NODIRATIME | MS_RELATIME; + + /* + * In old images mnt and sb flags are saved together. + * Here we separate them and save the old logic about MS_RDONLY. 
+ */ + + pm->sb_flags = pm->flags & ~mflags; + pm->flags = pm->flags & mflags; + } + pm->shared_id = me->shared_id; + pm->master_id = me->master_id; + pm->need_plugin = me->with_plugin; + pm->deleted = me->deleted; + pm->is_ns_root = is_root(me->mountpoint); + if (me->has_internal_sharing) + pm->internal_sharing = me->internal_sharing; + + pm->source = xstrdup(me->source); + if (!pm->source) + goto err; + + pm->options = xstrdup(me->options); + if (!pm->options) + goto err; + + if (me->fstype != FSTYPE__AUTO && me->fsname) { + pr_err("fsname can be set only for FSTYPE__AUTO mounts\n"); + goto err; + } + + /* FIXME: abort unsupported early */ + pm->fstype = decode_fstype(me->fstype); + if (pm->fstype->collect && (pm->fstype->collect(pm) < 0)) + goto err; + + + if (me->fsname) { + pm->fsname = xstrdup(me->fsname); + if (!pm->fsname) + goto err; + } + + if (get_mp_root(me, pm)) + goto err; + + if (get_mp_mountpoint(me->mountpoint, pm, root, root_len)) + goto err; + + pr_debug("\tRead %d mp @ %s\n", pm->mnt_id, pm->mountpoint); + } + + if (me) + mnt_entry__free_unpacked(me, NULL); + + close_image(img); + + return 0; +err: + close_image(img); + return -1; +} + +int read_mnt_ns_img(void) +{ + struct mount_info *pms = NULL; + struct ns_id *nsid; + + for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { + if (nsid->nd != &mnt_ns_desc) + continue; + + if (collect_mnt_from_image(&pms, nsid)) + return -1; + } + + mntinfo = pms; + return 0; +} + +int rst_get_mnt_root(int mnt_id, char *path, int plen) +{ + struct mount_info *m; + + if (!(root_ns_mask & CLONE_NEWNS) || mnt_id == -1) + goto rroot; + + m = lookup_mnt_id(mnt_id); + if (m == NULL) + return -1; + + return print_ns_root(m->nsid, 0, path, plen); + +rroot: + path[0] = '/'; + path[1] = '\0'; + return 1; +} + +int mntns_maybe_create_roots(void) +{ + if (!(root_ns_mask & CLONE_NEWNS)) + return 0; + + return create_mnt_roots(); +} + +static int do_restore_task_mnt_ns(struct ns_id *nsid) +{ + int fd; + + fd = 
fdstore_get(nsid->mnt.nsfd_id); + if (fd < 0) + return -1; + + if (setns(fd, CLONE_NEWNS)) { + pr_perror("Can't restore mntns"); + close(fd); + return -1; + } + close(fd); + + return 0; +} + +int restore_task_mnt_ns(struct pstree_item *current) +{ + if ((root_ns_mask & CLONE_NEWNS) == 0) + return 0; + + if (current->ids && current->ids->has_mnt_ns_id) { + unsigned int id = current->ids->mnt_ns_id; + struct ns_id *nsid; + + /* + * Regardless of the namespace a task wants to + * live in, by that point they all will live in + * root's one (see prepare_pstree_kobj_ids() + + * get_clone_mask()). So if the current task's + * target namespace is the root's one -- it's + * already there, otherwise it will have to do + * setns(). + */ + if (current->parent && id == current->parent->ids->mnt_ns_id) + return 0; + + nsid = lookup_ns_by_id(id, &mnt_ns_desc); + if (nsid == NULL) { + pr_err("Can't find mount namespace %d\n", id); + return -1; + } + + BUG_ON(nsid->type == NS_CRIU); + + if (do_restore_task_mnt_ns(nsid)) + return -1; + } + + return 0; +} + +void fini_restore_mntns(void) +{ + struct ns_id *nsid; + + if (!(root_ns_mask & CLONE_NEWNS)) + return; + + for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { + if (nsid->nd != &mnt_ns_desc) + continue; + nsid->ns_populated = true; + } +} + +/* + * All nested mount namespaces are restore as sub-trees of the root namespace. 
+ */ +static int populate_roots_yard(void) +{ + struct mnt_remap_entry *r; + char path[PATH_MAX]; + struct ns_id *nsid; + + if (make_yard(mnt_roots)) + return -1; + + for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { + if (nsid->nd != &mnt_ns_desc) + continue; + + print_ns_root(nsid, 0, path, sizeof(path)); + if (mkdir(path, 0600)) { + pr_perror("Unable to create %s", path); + return -1; + } + } + + /* + * mnt_remap_list is filled in find_remap_mounts() and + * contains mounts which has to be restored separately + */ + list_for_each_entry(r, &mnt_remap_list, node) { + if (mkdirpat(AT_FDCWD, r->mi->mountpoint, 0755)) { + pr_perror("Unable to create %s", r->mi->mountpoint); + return -1; + } + } + + return 0; +} + +static int populate_mnt_ns(void) +{ + struct mount_info *pms; + struct ns_id *nsid; + int ret; + + if (mnt_roots) { + /* mnt_roots is a tmpfs mount and it's private */ + root_yard_mp = mnt_entry_alloc(); + if (!root_yard_mp) + return -1; + + root_yard_mp->mountpoint = mnt_roots; + root_yard_mp->mounted = true; + } + + pms = mnt_build_tree(mntinfo, root_yard_mp); + if (!pms) + return -1; + +#ifdef CONFIG_BINFMT_MISC_VIRTUALIZED + if (!opts.has_binfmt_misc && !list_empty(&binfmt_misc_list)) { + /* Add to mount tree. Generic code will mount it later */ + ret = add_cr_time_mount(pms, "binfmt_misc", BINFMT_MISC_HOME, 0); + if (ret) + return -1; + } +#endif + + if (resolve_shared_mounts(mntinfo, pms->master_id)) + return -1; + + for (nsid = ns_ids; nsid; nsid = nsid->next) { + if (nsid->nd != &mnt_ns_desc) + continue; + + /* + * Make trees of all namespaces look the + * same, so that manual paths resolution + * works on them. 
+ */ + nsid->mnt.mntinfo_tree = pms; + } + + if (validate_mounts(mntinfo, false)) + return -1; + + mnt_tree_for_each(pms, set_is_overmounted); + + if (find_remap_mounts(pms)) + return -1; + + if (populate_roots_yard()) + return -1; + + if (mount_clean_path()) + return -1; + + ret = mnt_tree_for_each(pms, do_mount_one); + mnt_tree_for_each(pms, do_close_one); + + if (ret == 0 && fixup_remap_mounts()) + return -1; + + if (umount_clean_path()) + return -1; + return ret; +} + +static int __depopulate_roots_yard(void) +{ + int ret = 0; + + if (mnt_roots == NULL) + return 0; + + if (mount("none", mnt_roots, "none", MS_REC|MS_PRIVATE, NULL)) { + pr_perror("Can't remount root with MS_PRIVATE"); + ret = 1; + } + /* + * Don't exit after a first error, because this function + * can be used to rollback in a error case. + * Don't worry about MNT_DETACH, because files are restored after this + * and nobody will not be restored from a wrong mount namespace. + */ + if (umount2(mnt_roots, MNT_DETACH)) { + pr_perror("Can't unmount %s", mnt_roots); + ret = -1; + } + + if (rmdir(mnt_roots)) { + pr_perror("Can't remove the directory %s", mnt_roots); + ret = -1; + } + + return ret; +} + +int depopulate_roots_yard(int mntns_fd, bool only_ghosts) +{ + int ret = 0, old_cwd = -1, old_ns = -1; + + if (mntns_fd < 0) { + ret |= try_clean_remaps(only_ghosts); + cleanup_mnt_ns(); + return ret; + } + + pr_info("Switching to new ns to clean ghosts\n"); + + old_cwd = open(".", O_PATH); + if (old_cwd < 0) { + pr_perror("Unable to open cwd"); + return -1; + } + + old_ns = open_proc(PROC_SELF, "ns/mnt"); + if (old_ns < 0) { + pr_perror("`- Can't keep old ns"); + close(old_cwd); + return -1; + } + if (setns(mntns_fd, CLONE_NEWNS) < 0) { + pr_perror("`- Can't switch"); + close(old_ns); + close(old_cwd); + return -1; + } + + if (try_clean_remaps(only_ghosts)) + ret = -1; + + if (__depopulate_roots_yard()) + ret = -1; + + if (setns(old_ns, CLONE_NEWNS) < 0) { + pr_perror("Fail to switch back!"); + ret = 
-1; + } + close(old_ns); + + if (fchdir(old_cwd)) { + pr_perror("Unable to restore cwd"); + ret = -1; + } + close(old_cwd); + + return ret; +} + +void cleanup_mnt_ns(void) +{ + if (mnt_roots == NULL) + return; + + if (rmdir(mnt_roots)) + pr_perror("Can't remove the directory %s", mnt_roots); +} + +int prepare_mnt_ns(void) +{ + int ret = -1, rst = -1, fd; + struct ns_id ns = { .type = NS_CRIU, .ns_pid = PROC_SELF, .nd = &mnt_ns_desc }; + struct ns_id *nsid; + + if (!(root_ns_mask & CLONE_NEWNS)) + return 0; + + pr_info("Restoring mount namespace\n"); + + if (!opts.root) { + struct mount_info *old; + + if (chdir("/")) { + pr_perror("chdir(\"/\") failed"); + return -1; + } + + old = collect_mntinfo(&ns, false); + if (old == NULL) + return -1; + /* + * The new mount namespace is filled with the mountpoint + * clones from the original one. We have to umount them + * prior to recreating new ones. + */ + pr_info("Cleaning mount namespace\n"); + if (mnt_tree_for_each_reverse(ns.mnt.mntinfo_tree, do_umount_one)) { + free_mntinfo(old); + return -1; + } + + free_mntinfo(old); + } + + ret = populate_mnt_ns(); + if (ret) + return -1; + + rst = open_proc(PROC_SELF, "ns/mnt"); + if (rst < 0) + return -1; + + /* restore non-root namespaces */ + for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { + char path[PATH_MAX]; + + if (nsid->nd != &mnt_ns_desc) + continue; + /* Create the new mount namespace */ + if (unshare(CLONE_NEWNS)) { + pr_perror("Unable to create a new mntns"); + goto err; + } + + fd = open_proc(PROC_SELF, "ns/mnt"); + if (fd < 0) + goto err; + + if (nsid->type == NS_ROOT) { + /* + * We need to create a mount namespace which will be + * used to clean up remap files + * (depopulate_roots_yard). The namespace where mounts + * was restored has to be restored as a root mount + * namespace, because there are file descriptors + * linked with it (e.g. to bind-mount slave pty-s). 
+ */ + if (setns(rst, CLONE_NEWNS)) { + pr_perror("Can't restore mntns back"); + goto err; + } + SWAP(rst, fd); + } + + /* Pin one with a file descriptor */ + nsid->mnt.nsfd_id = fdstore_add(fd); + close(fd); + if (nsid->mnt.nsfd_id < 0) { + pr_err("Can't add ns fd\n"); + goto err; + } + + /* Set its root */ + print_ns_root(nsid, 0, path, sizeof(path) - 1); + if (cr_pivot_root(path)) + goto err; + + /* root fd is used to restore file mappings */ + fd = open_proc(PROC_SELF, "root"); + if (fd < 0) + goto err; + nsid->mnt.root_fd_id = fdstore_add(fd); + if (nsid->mnt.root_fd_id < 0) { + pr_err("Can't add root fd\n"); + close(fd); + goto err; + } + close(fd); + + /* And return back to regain the access to the roots yard */ + if (setns(rst, CLONE_NEWNS)) { + pr_perror("Can't restore mntns back"); + goto err; + } + } + close(rst); + + return ret; +err: + if (rst >= 0) + restore_ns(rst, &mnt_ns_desc); + return -1; +} + +static int mntns_root_pid = -1; +static int mntns_set_root_fd(pid_t pid, int fd) +{ + int ret; + + ret = install_service_fd(ROOT_FD_OFF, fd); + if (ret >= 0) + mntns_root_pid = pid; + + return ret; +} + +int __mntns_get_root_fd(pid_t pid) +{ + + int fd, pfd; + int ret; + char path[PATH_MAX + 1]; + + if (mntns_root_pid == pid) /* The required root is already opened */ + return get_service_fd(ROOT_FD_OFF); + + if (!(root_ns_mask & CLONE_NEWNS)) { + /* + * If criu and tasks we dump live in the same mount + * namespace, we can just open the root directory. + * All paths resolution would occur relative to criu's + * root. Even if it is not namespace's root, provided + * file paths are resolved, we'd get consistent dump. + */ + fd = open("/", O_RDONLY | O_DIRECTORY); + if (fd < 0) { + pr_perror("Can't open root"); + return -1; + } + + goto set_root; + } + + /* + * If /proc/pid/root links on '/', it signs that a root of the task + * and a root of mntns is the same. 
+ */ + + pfd = open_pid_proc(pid); + ret = readlinkat(pfd, "root", path, sizeof(path) - 1); + if (ret < 0) { + close_pid_proc(); + return ret; + } + + path[ret] = '\0'; + + if (ret != 1 || path[0] != '/') { + pr_err("The root task has another root than mntns: %s\n", path); + close_pid_proc(); + return -1; + } + + fd = openat(pfd, "root", O_RDONLY | O_DIRECTORY, 0); + if (fd < 0) { + pr_perror("Can't open the task root"); + return -1; + } + +set_root: + return mntns_set_root_fd(pid, fd); +} + +int mntns_get_root_fd(struct ns_id *mntns) +{ + if (!(root_ns_mask & CLONE_NEWNS)) + return __mntns_get_root_fd(0); + /* + * All namespaces are restored from the root task and during the + * CR_STATE_FORKING stage the root task has two file descriptors for + * each mntns. One is associated with a namespace and another one is a + * root of this mntns. + * + * When a non-root task is forked, it enters into a proper mount + * namespace, restores private mappings and forks children. Some of + * these mappings can be associated with files from other namespaces. + * + * After the CR_STATE_FORKING stage the root task has to close all + * mntns file descriptors to restore its descriptors and at this moment + * we know that all tasks live in their mount namespaces. + * + * If we find that a mount namespace isn't populated, we can get its + * root from the root task. + */ + + if (!mntns->ns_populated) { + int fd; + + fd = fdstore_get(mntns->mnt.root_fd_id); + if (fd < 0) + return -1; + + return mntns_set_root_fd(mntns->ns_pid, fd); + } + + return __mntns_get_root_fd(mntns->ns_pid); +} + +struct ns_id *lookup_nsid_by_mnt_id(int mnt_id) +{ + struct mount_info *mi; + + /* + * Kernel before 3.15 doesn't show mnt_id for file descriptors. + * mnt_id isn't saved for files, if mntns isn't dumped. + * In both these cases we have only one root, so here + * is not matter which mount will be restored. + */ + if (mnt_id == -1) + mi = mntinfo; + else + mi = lookup_mnt_id(mnt_id); + return mi ? 
mi->nsid : NULL; +} + +int mntns_get_root_by_mnt_id(int mnt_id) +{ + struct ns_id *mntns = NULL; + + if (root_ns_mask & CLONE_NEWNS) { + mntns = lookup_nsid_by_mnt_id(mnt_id); + BUG_ON(mntns == NULL); + } + + return mntns_get_root_fd(mntns); +} + +struct collect_mntns_arg { + bool need_to_validate; + bool for_dump; + int root_master_id; +}; + +static int collect_mntns(struct ns_id *ns, void *__arg) +{ + struct collect_mntns_arg *arg = __arg; + struct mount_info *pms; + + pms = collect_mntinfo(ns, arg->for_dump); + if (!pms) + return -1; + + if (arg->for_dump && ns->type != NS_CRIU) + arg->need_to_validate = true; + + mntinfo_add_list(pms); + + if (arg->need_to_validate && ns->id == root_item->ids->mnt_ns_id) + arg->root_master_id = ns->mnt.mntinfo_tree->master_id; + + return 0; +} + +int collect_mnt_namespaces(bool for_dump) +{ + struct collect_mntns_arg arg; + int ret; + + arg.for_dump = for_dump; + arg.need_to_validate = false; + + ret = walk_namespaces(&mnt_ns_desc, collect_mntns, &arg); + if (ret) + goto err; + +#ifdef CONFIG_BINFMT_MISC_VIRTUALIZED + if (for_dump && !opts.has_binfmt_misc) { + unsigned int s_dev = 0; + struct ns_id *ns; + + for (ns = ns_ids; ns != NULL; ns = ns->next) { + if (ns->type == NS_ROOT && ns->nd == &mnt_ns_desc) + break; + } + + if (ns) { + ret = mount_cr_time_mount(ns, &s_dev, "binfmt_misc", "/" BINFMT_MISC_HOME, + "binfmt_misc"); + if (ret == -EPERM) + pr_info("Can't mount binfmt_misc: EPERM. 
Running in user_ns?\n"); + else if (ret < 0 && ret != -EBUSY && ret != -ENODEV && ret != -ENOENT) { + pr_err("Can't mount binfmt_misc: %d %s\n", ret, strerror(-ret)); + goto err; + } else if (ret == 0) { + ret = -1; + goto err; + } else if (ret > 0 && add_cr_time_mount(ns->mnt.mntinfo_tree, "binfmt_misc", + BINFMT_MISC_HOME, s_dev) < 0) { + ret = -1; + goto err; + } + } + } +#endif + + ret = resolve_external_mounts(mntinfo); + if (ret) + goto err; + + if (arg.need_to_validate) { + ret = -1; + + if (resolve_shared_mounts(mntinfo, arg.root_master_id)) + goto err; + if (validate_mounts(mntinfo, true)) + goto err; + } + + ret = 0; +err: + return ret; +} + +int dump_mnt_namespaces(void) +{ + struct ns_id *nsid; + + if (!(root_ns_mask & CLONE_NEWNS)) + return 0; + + for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { + if (nsid->nd != &mnt_ns_desc || nsid->type == NS_CRIU) + continue; + + if ((nsid->type == NS_OTHER) && check_mnt_id()) { + pr_err("Nested mount namespaces are not supported " + "without mnt_id in fdinfo\n"); + return -1; + } + + if (dump_mnt_ns(nsid, nsid->mnt.mntinfo_list)) + return -1; + } + + return 0; +} + +void clean_cr_time_mounts(void) +{ + struct mount_info *mi; + int mnt_fd, ret; + + for (mi = mntinfo; mi; mi = mi->next) { + if (mi->mnt_id != CRTIME_MNT_ID) + continue; + ret = switch_ns(mi->nsid->ns_pid, &mnt_ns_desc, &mnt_fd); + if (ret) { + pr_err("Can't switch to pid's %u mnt_ns\n", mi->nsid->ns_pid); + continue; + } + + if (umount(mi->mountpoint) < 0) + pr_perror("Can't umount forced mount %s", mi->mountpoint); + + if (restore_ns(mnt_fd, &mnt_ns_desc)) { + pr_err("cleanup_forced_mounts exiting with wrong mnt_ns\n"); + return; + } + } +} + +struct ns_desc mnt_ns_desc = NS_DESC_ENTRY(CLONE_NEWNS, "mnt"); + +static int call_helper_process(int (*call)(void *), void *arg) +{ + int pid, status; + + pid = clone_noasan(call, CLONE_VFORK | CLONE_VM | CLONE_FILES | + CLONE_IO | CLONE_SIGHAND | CLONE_SYSVSEM, arg); + if (pid == -1) { + 
pr_perror("Can't clone helper process"); + return -1; + } + + errno = 0; + if (waitpid(pid, &status, __WALL) != pid) { + pr_perror("Unable to wait %d", pid); + return -1; + } + + if (status) { + pr_err("Bad child exit status: %d\n", status); + return -1; + } + + return 0; +} + +static int ns_remount_writable(void *arg) +{ + struct mount_info *mi = (struct mount_info *)arg; + struct ns_id *ns = mi->nsid; + + if (do_restore_task_mnt_ns(ns)) + return 1; + pr_debug("Switched to mntns %u:%u/n", ns->id, ns->kid); + + if (mount(NULL, mi->ns_mountpoint, NULL, MS_REMOUNT | MS_BIND | + (mi->flags & ~(MS_PROPAGATE | MS_RDONLY)), NULL) == -1) { + pr_perror("Failed to remount %d:%s writable", mi->mnt_id, mi->mountpoint); + return 1; + } + return 0; +} + +int try_remount_writable(struct mount_info *mi, bool ns) +{ + int remounted = REMOUNTED_RW; + + /* Don't remount if we are in host mntns to be on the safe side */ + if (!(root_ns_mask & CLONE_NEWNS)) + return 0; + + if (!ns) + remounted = REMOUNTED_RW_SERVICE; + + if (mi->flags & MS_RDONLY && !(mi->remounted_rw & remounted)) { + if (mnt_is_overmounted(mi)) { + pr_err("The mount %d is overmounted so paths are invisible\n", mi->mnt_id); + return -1; + } + + /* There should be no ghost files on mounts with ro sb */ + if (mi->sb_flags & MS_RDONLY) { + pr_err("The mount %d has readonly sb\n", mi->mnt_id); + return -1; + } + + pr_info("Remount %d:%s writable\n", mi->mnt_id, mi->mountpoint); + if (!ns) { + if (mount(NULL, mi->mountpoint, NULL, MS_REMOUNT | MS_BIND | + (mi->flags & ~(MS_PROPAGATE | MS_RDONLY)), NULL) == -1) { + pr_perror("Failed to remount %d:%s writable", mi->mnt_id, mi->mountpoint); + return -1; + } + } else { + if (call_helper_process(ns_remount_writable, mi)) + return -1; + } + mi->remounted_rw |= remounted; + } + + return 0; +} + +static int __remount_readonly_mounts(struct ns_id *ns) +{ + struct mount_info *mi; + bool mntns_set = false; + + for (mi = mntinfo; mi; mi = mi->next) { + if (ns && mi->nsid != ns) + 
continue; + + if (!(mi->remounted_rw && REMOUNTED_RW)) + continue; + + /* + * Lets enter the mount namespace lazily, only if we've found the + * mount which should be remounted readonly. These saves us + * from entering mntns if we have no mounts to remount in it. + */ + if (ns && !mntns_set) { + if (do_restore_task_mnt_ns(ns)) + return -1; + mntns_set = true; + pr_debug("Switched to mntns %u:%u/n", ns->id, ns->kid); + } + + pr_info("Remount %d:%s back to readonly\n", mi->mnt_id, mi->mountpoint); + if (mount(NULL, mi->ns_mountpoint, NULL, + MS_REMOUNT | MS_BIND | (mi->flags & ~MS_PROPAGATE), + NULL)) { + pr_perror("Failed to restore %d:%s mount flags %x", + mi->mnt_id, mi->mountpoint, mi->flags); + return -1; + } + } + + return 0; +} + +static int ns_remount_readonly_mounts(void *arg) +{ + struct ns_id *nsid; + + for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { + if (nsid->nd != &mnt_ns_desc) + continue; + + if (__remount_readonly_mounts(nsid)) + return 1; + } + + return 0; +} + +int remount_readonly_mounts(void) +{ + /* + * Need a helper process because the root task can share fs via + * CLONE_FS and we would not be able to enter mount namespaces + */ + return call_helper_process(ns_remount_readonly_mounts, NULL); +} diff --git a/CRIU_code/criu/namespaces.c b/CRIU_code/criu/namespaces.c new file mode 100644 index 0000000..a228737 --- /dev/null +++ b/CRIU_code/criu/namespaces.c @@ -0,0 +1,1754 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "page.h" +#include "rst-malloc.h" +#include "cr_options.h" +#include "imgset.h" +#include "uts_ns.h" +#include "ipc_ns.h" +#include "mount.h" +#include "pstree.h" +#include "namespaces.h" +#include "net.h" +#include "cgroup.h" +#include "fdstore.h" + +#include "protobuf.h" +#include "util.h" +#include "images/ns.pb-c.h" +#include "images/userns.pb-c.h" + +static struct ns_desc *ns_desc_array[] = { + 
/*
 * Check that @str is either absent or a decimal number in [0, 65535].
 *
 * Returns 0 when @str is NULL, empty, or a valid number (errno is reset
 * to 0 only in the last case). Returns -1 otherwise, with errno left as
 * EINVAL, or ERANGE if strtol() reported an overflow.
 */
static int check_int_str(char *str)
{
	char *endptr;
	long val;

	/* A missing or empty value is accepted as "not specified". */
	if (str == NULL || *str == '\0')
		return 0;

	/* strtol() only touches errno on failure, so preset the default. */
	errno = EINVAL;
	val = strtol(str, &endptr, 10);
	if (errno == ERANGE || endptr == str || *endptr != '\0' ||
	    val < 0 || val > 65535)
		return -1;

	errno = 0;
	return 0;
}
return -1; + + jn = xmalloc(sizeof(*jn)); + if (!jn) + return -1; + + jn->ns_file = ns_file; + if (!strncmp(type, "net", 4)) { + jn->nd = &net_ns_desc; + join_ns_flags |= CLONE_NEWNET; + } else if (!strncmp(type, "uts", 4)) { + jn->nd = &uts_ns_desc; + join_ns_flags |= CLONE_NEWUTS; + } else if (!strncmp(type, "ipc", 4)) { + jn->nd = &ipc_ns_desc; + join_ns_flags |= CLONE_NEWIPC; + } else if (!strncmp(type, "pid", 4)) { + pr_err("join-ns pid namespace not supported\n"); + goto err; + } else if (!strncmp(type, "user", 5)) { + jn->nd = &user_ns_desc; + if (set_user_extra_opts(jn, extra_opts)) { + pr_err("invalid user namespace extra_opts %s\n", extra_opts); + goto err; + } + join_ns_flags |= CLONE_NEWUSER; + } else if (!strncmp(type, "mnt", 4)) { + jn->nd = &mnt_ns_desc; + join_ns_flags |= CLONE_NEWNS; + } else { + pr_err("invalid namespace type %s\n", type); + goto err; + } + + list_add_tail(&jn->list, &opts.join_ns); + pr_info("Added %s:%s join namespace\n", type, ns_file); + return 0; +err: + xfree(jn); + return -1; +} + +static unsigned int parse_ns_link(char *link, size_t len, struct ns_desc *d) +{ + unsigned long kid = 0; + char *end; + + if (len >= d->len + 2) { + if (link[d->len] == ':' && !memcmp(link, d->str, d->len)) { + kid = strtoul(&link[d->len + 2], &end, 10); + if (end && *end == ']') + BUG_ON(kid > UINT_MAX); + else + kid = 0; + } + } + + return (unsigned int)kid; +} + +bool check_ns_proc(struct fd_link *link) +{ + unsigned int i, kid; + + for (i = 0; i < ARRAY_SIZE(ns_desc_array); i++) { + kid = parse_ns_link(link->name + 1, link->len - 1, ns_desc_array[i]); + if (!kid) + continue; + + link->ns_d = ns_desc_array[i]; + link->ns_kid = kid; + return true; + } + + return false; +} + +int switch_ns(int pid, struct ns_desc *nd, int *rst) +{ + int nsfd; + int ret; + + nsfd = open_proc(pid, "ns/%s", nd->str); + if (nsfd < 0) + return -1; + + ret = switch_ns_by_fd(nsfd, nd, rst); + + close(nsfd); + + return ret; +} + +int switch_ns_by_fd(int nsfd, struct 
ns_desc *nd, int *rst) +{ + int ret = -1; + + if (rst) { + *rst = open_proc(PROC_SELF, "ns/%s", nd->str); + if (*rst < 0) + goto err_ns; + } + + ret = setns(nsfd, nd->cflag); + if (ret < 0) { + pr_perror("Can't setns %d/%s", nsfd, nd->str); + goto err_set; + } + + return 0; + +err_set: + if (rst) + close(*rst); +err_ns: + return -1; +} + +int restore_ns(int rst, struct ns_desc *nd) +{ + int ret; + + ret = setns(rst, nd->cflag); + if (ret < 0) + pr_perror("Can't restore ns back"); + + close(rst); + + return ret; +} + +struct ns_id *ns_ids = NULL; +static unsigned int ns_next_id = 1; +unsigned long root_ns_mask = 0; + +static void nsid_add(struct ns_id *ns, struct ns_desc *nd, unsigned int id, pid_t pid) +{ + ns->nd = nd; + ns->id = id; + ns->ns_pid = pid; + ns->next = ns_ids; + ns_ids = ns; + + pr_info("Add %s ns %d pid %d\n", nd->str, ns->id, ns->ns_pid); +} + +struct ns_id *rst_new_ns_id(unsigned int id, pid_t pid, + struct ns_desc *nd, enum ns_type type) +{ + struct ns_id *nsid; + + nsid = shmalloc(sizeof(*nsid)); + if (nsid) { + nsid->type = type; + nsid_add(nsid, nd, id, pid); + nsid->ns_populated = false; + + if (nd == &net_ns_desc) { + INIT_LIST_HEAD(&nsid->net.ids); + INIT_LIST_HEAD(&nsid->net.links); + nsid->net.netns = NULL; + } + } + + return nsid; +} + +int rst_add_ns_id(unsigned int id, struct pstree_item *i, struct ns_desc *nd) +{ + pid_t pid = vpid(i); + struct ns_id *nsid; + + nsid = lookup_ns_by_id(id, nd); + if (nsid) { + if (pid_rst_prio(pid, nsid->ns_pid)) + nsid->ns_pid = pid; + return 0; + } + + nsid = rst_new_ns_id(id, pid, nd, + i == root_item ? 
NS_ROOT : NS_OTHER); + if (nsid == NULL) + return -1; + + return 0; +} + +struct ns_id *lookup_ns_by_kid(unsigned int kid, struct ns_desc *nd) +{ + struct ns_id *nsid; + + for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) + if (nsid->kid == kid && nsid->nd == nd) + return nsid; + + return NULL; +} + +struct ns_id *lookup_ns_by_id(unsigned int id, struct ns_desc *nd) +{ + struct ns_id *nsid; + + for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) + if (nsid->id == id && nsid->nd == nd) + return nsid; + + return NULL; +} + +/* + * For all namespaces we support, there are two supported + * tasks-to-namespaces layout. + * + * If root task lives in the same namespace as criu does + * all other tasks should live in it too and we do NOT dump + * this namespace. On restore tasks inherit the respective + * namespace from criu. + * + * If root task lives in its own namespace, then all other + * tasks may live in it. Sometimes (CLONE_SUBNS) there can + * be more than one namespace of that type. For this case + * we dump all namespace's info and recreate them on restore. 
+ */ + +int walk_namespaces(struct ns_desc *nd, int (*cb)(struct ns_id *, void *), void *oarg) +{ + int ret = 0; + struct ns_id *ns; + + for (ns = ns_ids; ns != NULL; ns = ns->next) { + if (ns->nd != nd) + continue; + + if (ns->type == NS_CRIU) { + if (root_ns_mask & nd->cflag) + continue; + + ret = cb(ns, oarg); + break; + } + + ret = cb(ns, oarg); + if (ret) + break; + } + + return ret; +} + +static unsigned int generate_ns_id(int pid, unsigned int kid, struct ns_desc *nd, + struct ns_id **ns_ret) +{ + struct ns_id *nsid; + enum ns_type type; + + nsid = lookup_ns_by_kid(kid, nd); + if (nsid) + goto found; + + if (pid != getpid()) { + type = NS_OTHER; + if (pid == root_item->pid->real) { + BUG_ON(root_ns_mask & nd->cflag); + pr_info("Will take %s namespace in the image\n", nd->str); + root_ns_mask |= nd->cflag; + type = NS_ROOT; + } else if (nd->cflag & ~CLONE_SUBNS) { + pr_err("Can't dump nested %s namespace for %d\n", + nd->str, pid); + return 0; + } + } else + type = NS_CRIU; + + nsid = xzalloc(sizeof(*nsid)); + if (!nsid) + return 0; + + nsid->type = type; + nsid->kid = kid; + nsid->ns_populated = true; + nsid_add(nsid, nd, ns_next_id++, pid); + + if (nd == &net_ns_desc) { + INIT_LIST_HEAD(&nsid->net.ids); + INIT_LIST_HEAD(&nsid->net.links); + } + +found: + if (ns_ret) + *ns_ret = nsid; + return nsid->id; +} + +static unsigned int __get_ns_id(int pid, struct ns_desc *nd, protobuf_c_boolean *supported, struct ns_id **ns) +{ + int proc_dir; + unsigned int kid; + char ns_path[10]; + struct stat st; + + proc_dir = open_pid_proc(pid); + if (proc_dir < 0) + return 0; + + snprintf(ns_path, sizeof(ns_path), "ns/%s", nd->str); + + if (fstatat(proc_dir, ns_path, &st, 0)) { + if (errno == ENOENT) { + /* The namespace is unsupported */ + kid = 0; + goto out; + } + pr_perror("Unable to stat %s", ns_path); + return 0; + } + kid = st.st_ino; + BUG_ON(!kid); + +out: + if (supported) + *supported = kid != 0; + return generate_ns_id(pid, kid, nd, ns); +} + +static unsigned int 
get_ns_id(int pid, struct ns_desc *nd, protobuf_c_boolean *supported) +{ + return __get_ns_id(pid, nd, supported, NULL); +} + +int dump_one_ns_file(int lfd, u32 id, const struct fd_parms *p) +{ + struct cr_img *img; + FileEntry fe = FILE_ENTRY__INIT; + NsFileEntry nfe = NS_FILE_ENTRY__INIT; + struct fd_link *link = p->link; + struct ns_id *nsid; + + nsid = lookup_ns_by_kid(link->ns_kid, link->ns_d); + if (!nsid) { + pr_err("No NS ID with kid %u\n", link->ns_kid); + return -1; + } + + nfe.id = id; + nfe.ns_id = nsid->id; + nfe.ns_cflag = link->ns_d->cflag; + nfe.flags = p->flags; + + fe.type = FD_TYPES__NS; + fe.id = nfe.id; + fe.nsf = &nfe; + + img = img_from_set(glob_imgset, CR_FD_FILES); + return pb_write_one(img, &fe, PB_FILE); +} + +const struct fdtype_ops nsfile_dump_ops = { + .type = FD_TYPES__NS, + .dump = dump_one_ns_file, +}; + +struct ns_file_info { + struct file_desc d; + NsFileEntry *nfe; +}; + +static int open_ns_fd(struct file_desc *d, int *new_fd) +{ + struct ns_file_info *nfi = container_of(d, struct ns_file_info, d); + struct pstree_item *item, *t; + struct ns_desc *nd = NULL; + struct ns_id *ns; + int nsfd_id, fd; + char path[64]; + + for (ns = ns_ids; ns != NULL; ns = ns->next) { + if (ns->id != nfi->nfe->ns_id) + continue; + /* Check for CLONE_XXX as we use fdstore only if flag is set */ + if (ns->nd == &net_ns_desc && (root_ns_mask & CLONE_NEWNET)) + nsfd_id = ns->net.nsfd_id; + else + break; + fd = fdstore_get(nsfd_id); + goto check_open; + } + + /* + * Find out who can open us. + * + * FIXME I need a hash or RBtree here. 
+ */ + for_each_pstree_item(t) { + TaskKobjIdsEntry *ids = t->ids; + + if (ids->pid_ns_id == nfi->nfe->ns_id) { + item = t; + nd = &pid_ns_desc; + break; + } else if (ids->net_ns_id == nfi->nfe->ns_id) { + item = t; + nd = &net_ns_desc; + break; + } else if (ids->user_ns_id == nfi->nfe->ns_id) { + item = t; + nd = &user_ns_desc; + break; + } else if (ids->ipc_ns_id == nfi->nfe->ns_id) { + item = t; + nd = &ipc_ns_desc; + break; + } else if (ids->uts_ns_id == nfi->nfe->ns_id) { + item = t; + nd = &uts_ns_desc; + break; + } else if (ids->mnt_ns_id == nfi->nfe->ns_id) { + item = t; + nd = &mnt_ns_desc; + break; + } else if (ids->cgroup_ns_id == nfi->nfe->ns_id) { + item = t; + nd = &cgroup_ns_desc; + break; + } + } + + if (!nd || !item) { + pr_err("Can't find suitable NS ID for %#x\n", nfi->nfe->ns_id); + return -1; + } + + if (nd->cflag != nfi->nfe->ns_cflag) { + pr_err("Clone flag mismatch for %#x\n", nfi->nfe->ns_id); + return -1; + } + + snprintf(path, sizeof(path) - 1, "/proc/%d/ns/%s", vpid(item), nd->str); + path[sizeof(path) - 1] = '\0'; + + fd = open(path, nfi->nfe->flags); +check_open: + if (fd < 0) { + pr_perror("Can't open file %s on restore", path); + return fd; + } + + *new_fd = fd; + return 0; +} + +static struct file_desc_ops ns_desc_ops = { + .type = FD_TYPES__NS, + .open = open_ns_fd, +}; + +static int collect_one_nsfile(void *o, ProtobufCMessage *base, struct cr_img *img) +{ + struct ns_file_info *nfi = o; + + nfi->nfe = pb_msg(base, NsFileEntry); + pr_info("Collected ns file ID %#x NS-ID %#x\n", nfi->nfe->id, nfi->nfe->ns_id); + return file_desc_add(&nfi->d, nfi->nfe->id, &ns_desc_ops); +} + +struct collect_image_info nsfile_cinfo = { + .fd_type = CR_FD_NS_FILES, + .pb_type = PB_NS_FILE, + .priv_size = sizeof(struct ns_file_info), + .collect = collect_one_nsfile, +}; + +/* + * Same as dump_task_ns_ids(), but + * a) doesn't keep IDs (don't need them) + * b) generates them for mount and netns only + * mnt ones are needed for open_mount() in + * 
inotify pred-dump + * net ones are needed for parasite socket + */ + +int predump_task_ns_ids(struct pstree_item *item) +{ + int pid = item->pid->real; + + if (!__get_ns_id(pid, &net_ns_desc, NULL, &dmpi(item)->netns)) + return -1; + + if (!get_ns_id(pid, &mnt_ns_desc, NULL)) + return -1; + + return 0; +} + +int dump_task_ns_ids(struct pstree_item *item) +{ + int pid = item->pid->real; + TaskKobjIdsEntry *ids = item->ids; + + ids->has_pid_ns_id = true; + ids->pid_ns_id = get_ns_id(pid, &pid_ns_desc, NULL); + if (!ids->pid_ns_id) { + pr_err("Can't make pidns id\n"); + return -1; + } + + ids->has_net_ns_id = true; + ids->net_ns_id = __get_ns_id(pid, &net_ns_desc, NULL, &dmpi(item)->netns); + if (!ids->net_ns_id) { + pr_err("Can't make netns id\n"); + return -1; + } + + ids->has_ipc_ns_id = true; + ids->ipc_ns_id = get_ns_id(pid, &ipc_ns_desc, NULL); + if (!ids->ipc_ns_id) { + pr_err("Can't make ipcns id\n"); + return -1; + } + + ids->has_uts_ns_id = true; + ids->uts_ns_id = get_ns_id(pid, &uts_ns_desc, NULL); + if (!ids->uts_ns_id) { + pr_err("Can't make utsns id\n"); + return -1; + } + + ids->has_mnt_ns_id = true; + ids->mnt_ns_id = get_ns_id(pid, &mnt_ns_desc, NULL); + if (!ids->mnt_ns_id) { + pr_err("Can't make mntns id\n"); + return -1; + } + + ids->has_user_ns_id = true; + ids->user_ns_id = get_ns_id(pid, &user_ns_desc, NULL); + if (!ids->user_ns_id) { + pr_err("Can't make userns id\n"); + return -1; + } + + ids->cgroup_ns_id = get_ns_id(pid, &cgroup_ns_desc, &ids->has_cgroup_ns_id); + if (!ids->cgroup_ns_id) { + pr_err("Can't make cgroup id\n"); + return -1; + } + + return 0; +} + +static UsernsEntry userns_entry = USERNS_ENTRY__INIT; +#define INVALID_ID (~0U) + +static unsigned int userns_id(unsigned int id, UidGidExtent **map, int n) +{ + int i; + + if (!(root_ns_mask & CLONE_NEWUSER)) + return id; + + for (i = 0; i < n; i++) { + if (map[i]->lower_first <= id && + map[i]->lower_first + map[i]->count > id) + return map[i]->first + (id - map[i]->lower_first); + 
} + + return INVALID_ID; +} + +static unsigned int host_id(unsigned int id, UidGidExtent **map, int n) +{ + int i; + + if (!(root_ns_mask & CLONE_NEWUSER)) + return id; + + for (i = 0; i < n; i++) { + if (map[i]->first <= id && + map[i]->first + map[i]->count > id) + return map[i]->lower_first + (id - map[i]->first); + } + + return INVALID_ID; +} + +static uid_t host_uid(uid_t uid) +{ + UsernsEntry *e = &userns_entry; + return host_id(uid, e->uid_map, e->n_uid_map); +} + +static gid_t host_gid(gid_t gid) +{ + UsernsEntry *e = &userns_entry; + return host_id(gid, e->gid_map, e->n_gid_map); +} + +uid_t userns_uid(uid_t uid) +{ + UsernsEntry *e = &userns_entry; + return userns_id(uid, e->uid_map, e->n_uid_map); +} + +gid_t userns_gid(gid_t gid) +{ + UsernsEntry *e = &userns_entry; + return userns_id(gid, e->gid_map, e->n_gid_map); +} + +static int parse_id_map(pid_t pid, char *name, UidGidExtent ***pb_exts) +{ + UidGidExtent *extents = NULL; + int len = 0, size = 0, ret, i; + FILE *f; + + f = fopen_proc(pid, "%s", name); + if (f == NULL) + return -1; + + ret = -1; + while (1) { + UidGidExtent *ext; + + if (len == size) { + UidGidExtent *t; + + size = size * 2 + 1; + t = xrealloc(extents, size * sizeof(UidGidExtent)); + if (t == NULL) + break; + extents = t; + } + + ext = &extents[len]; + + uid_gid_extent__init(ext); + ret = fscanf(f, "%d %d %d", &ext->first, + &ext->lower_first, &ext->count); + if (ret != 3) { + if (ferror(f)) { + pr_perror("Unable to parse extents: %d", ret); + ret = -1; + } else + ret = 0; + break; + } + pr_info("id_map: %d %d %d\n", ext->first, ext->lower_first, ext->count); + len++; + } + + fclose(f); + + if (ret) + goto err; + + if (len) { + *pb_exts = xmalloc(sizeof(UidGidExtent *) * len); + if (*pb_exts == NULL) + goto err; + + for (i = 0; i < len; i++) + (*pb_exts)[i] = &extents[i]; + } else { + xfree(extents); + *pb_exts = NULL; + } + + return len; +err: + xfree(extents); + return -1; +} + +int collect_user_ns(struct ns_id *ns, void *oarg) +{ 
+ /* + * User namespace is dumped before files to get uid and gid + * mappings, which are used for convirting local id-s to + * userns id-s (userns_uid(), userns_gid()) + */ + if (dump_user_ns(root_item->pid->real, root_item->ids->user_ns_id)) + return -1; + + return 0; +} + +int collect_user_namespaces(bool for_dump) +{ + if (!for_dump) + return 0; + + if (!(root_ns_mask & CLONE_NEWUSER)) + return 0; + + return walk_namespaces(&user_ns_desc, collect_user_ns, NULL); +} + +static int check_user_ns(int pid) +{ + int status; + pid_t chld; + + chld = fork(); + if (chld == -1) { + pr_perror("Unable to fork a process"); + return -1; + } + + if (chld == 0) { + struct __user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3]; + struct __user_cap_header_struct hdr; + uid_t uid; + gid_t gid; + + uid = host_uid(0); + gid = host_gid(0); + if (uid == INVALID_ID || gid == INVALID_ID) { + pr_err("Unable to convert uid or gid\n"); + exit(1); + } + + if (prctl(PR_SET_KEEPCAPS, 1)) { + pr_perror("Unable to set PR_SET_KEEPCAPS"); + exit(1); + } + + if (setresgid(gid, gid, gid)) { + pr_perror("Unable to set group ID"); + exit(1); + } + + if (setgroups(0, NULL) < 0) { + pr_perror("Unable to drop supplementary groups"); + exit(1); + } + + if (setresuid(uid, uid, uid)) { + pr_perror("Unable to set user ID"); + exit(1); + } + + hdr.version = _LINUX_CAPABILITY_VERSION_3; + hdr.pid = 0; + + if (capget(&hdr, data) < 0) { + pr_perror("capget"); + exit(1); + } + data[0].effective = data[0].permitted; + data[1].effective = data[1].permitted; + if (capset(&hdr, data) < 0) { + pr_perror("capset"); + exit(1); + } + + /* + * Check that we are able to enter into other namespaces + * from the target userns namespace. This signs that these + * namespaces were created from the target userns. 
+ */ + + if (switch_ns(pid, &user_ns_desc, NULL)) + exit(1); + + if ((root_ns_mask & CLONE_NEWNET) && + switch_ns(pid, &net_ns_desc, NULL)) + exit(1); + if ((root_ns_mask & CLONE_NEWUTS) && + switch_ns(pid, &uts_ns_desc, NULL)) + exit(1); + if ((root_ns_mask & CLONE_NEWIPC) && + switch_ns(pid, &ipc_ns_desc, NULL)) + exit(1); + if ((root_ns_mask & CLONE_NEWNS) && + switch_ns(pid, &mnt_ns_desc, NULL)) + exit(1); + exit(0); + } + + if (waitpid(chld, &status, 0) != chld) { + pr_perror("Unable to wait for PID %d", chld); + return -1; + } + + if (status) { + pr_err("One or more namespaces doesn't belong to the target user namespace\n"); + return -1; + } + + return 0; +} + +int dump_user_ns(pid_t pid, int ns_id) +{ + int ret, exit_code = -1; + UsernsEntry *e = &userns_entry; + struct cr_img *img; + + ret = parse_id_map(pid, "uid_map", &e->uid_map); + if (ret < 0) + goto err; + e->n_uid_map = ret; + + ret = parse_id_map(pid, "gid_map", &e->gid_map); + if (ret < 0) + goto err; + e->n_gid_map = ret; + + if (check_user_ns(pid)) + return -1; + + img = open_image(CR_FD_USERNS, O_DUMP, ns_id); + if (!img) + goto err; + ret = pb_write_one(img, e, PB_USERNS); + close_image(img); + if (ret < 0) + goto err; + + return 0; +err: + if (e->uid_map) { + xfree(e->uid_map[0]); + xfree(e->uid_map); + } + if (e->gid_map) { + xfree(e->gid_map[0]); + xfree(e->gid_map); + } + return exit_code; +} + +void free_userns_maps() +{ + if (userns_entry.n_uid_map > 0) { + xfree(userns_entry.uid_map[0]); + xfree(userns_entry.uid_map); + } + if (userns_entry.n_gid_map > 0) { + xfree(userns_entry.gid_map[0]); + xfree(userns_entry.gid_map); + } +} + +static int do_dump_namespaces(struct ns_id *ns) +{ + int ret; + + ret = switch_ns(ns->ns_pid, ns->nd, NULL); + if (ret) + return ret; + + switch (ns->nd->cflag) { + case CLONE_NEWUTS: + pr_info("Dump UTS namespace %d via %d\n", + ns->id, ns->ns_pid); + ret = dump_uts_ns(ns->id); + break; + case CLONE_NEWIPC: + pr_info("Dump IPC namespace %d via %d\n", + ns->id, 
ns->ns_pid); + ret = dump_ipc_ns(ns->id); + break; + case CLONE_NEWNET: + pr_info("Dump NET namespace info %d via %d\n", + ns->id, ns->ns_pid); + ret = dump_net_ns(ns); + break; + default: + pr_err("Unknown namespace flag %x\n", ns->nd->cflag); + break; + } + + return ret; + +} + +int dump_namespaces(struct pstree_item *item, unsigned int ns_flags) +{ + struct pid *ns_pid = item->pid; + struct ns_id *ns; + int pid, nr = 0; + int ret = 0; + + /* + * The setns syscall is cool, we can switch to the other + * namespace and then return back to our initial one, but + * for me it's much easier just to fork another task and + * let it do the job, all the more so it can be done in + * parallel with task dumping routine. + * + * However, the question how to dump sockets from the target + * net namespace with this is still open + */ + + pr_info("Dumping %d(%d)'s namespaces\n", ns_pid->ns[0].virt, ns_pid->real); + + if ((ns_flags & CLONE_NEWPID) && ns_pid->ns[0].virt != 1) { + pr_err("Can't dump a pid namespace without the process init\n"); + return -1; + } + + for (ns = ns_ids; ns; ns = ns->next) { + /* Skip current namespaces, which are in the list too */ + if (ns->type == NS_CRIU) + continue; + + switch (ns->nd->cflag) { + /* No data for pid namespaces to dump */ + case CLONE_NEWPID: + /* Dumped explicitly with dump_mnt_namespaces() */ + case CLONE_NEWNS: + /* Userns is dumped before dumping tasks */ + case CLONE_NEWUSER: + /* handled separately in cgroup dumping code */ + case CLONE_NEWCGROUP: + continue; + } + + pid = fork(); + if (pid < 0) { + pr_perror("Can't fork ns dumper"); + return -1; + } + + if (pid == 0) { + ret = do_dump_namespaces(ns); + exit(ret); + } + + nr++; + } + + while (nr > 0) { + int status; + + ret = waitpid(-1, &status, 0); + if (ret < 0) { + pr_perror("Can't wait ns dumper"); + return -1; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + pr_err("Namespaces dumping finished with error %d\n", status); + return -1; + } + + nr--; + } + + 
pr_info("Namespaces dump complete\n"); + return 0; +} + +/* + * Format all n extents into one buffer and push them into the proc file + * named by id_map (opened through open_proc_rw()) with a single write(). + * Returns 0 on success and -1 when the records do not fit the buffer, + * the file can't be opened or the write comes out short. + */ +static int write_id_map(pid_t pid, UidGidExtent **extents, int n, char *id_map) +{ + char buf[PAGE_SIZE]; + int off = 0, i, len; + int fd; + + /* + * We can perform only a single write (that may contain multiple + * newline-delimited records) to a uid_map and a gid_map files. + */ + for (i = 0; i < n; i++) { + len = snprintf(buf + off, sizeof(buf) - off, + "%u %u %u\n", extents[i]->first, + extents[i]->lower_first, + extents[i]->count); + /* + * snprintf() reports the length it wanted to emit, so + * blindly adding it would move off past the buffer once + * a record gets truncated. + */ + if (len < 0 || (size_t)len >= sizeof(buf) - off) { + pr_err("Too many extents to write into %s\n", id_map); + return -1; + } + off += len; + } + + fd = open_proc_rw(pid, "%s", id_map); + if (fd < 0) + return -1; + if (write(fd, buf, off) != off) { + pr_perror("Unable to write into %s", id_map); + close(fd); + return -1; + } + close(fd); + + return 0; +} + +struct unsc_msg { + struct msghdr h; + /* + * 0th is the call address + * 1st is the flags + * 2nd is the optional (NULL in response) arguments + */ + struct iovec iov[3]; + char c[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))]; +}; + +static int usernsd_pid; + +static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, + int *x, void *arg, size_t asize, int fd) +{ + struct cmsghdr *ch; + struct ucred *ucred; + + m->h.msg_iov = m->iov; + m->h.msg_iovlen = 2; + + m->iov[0].iov_base = c; + m->iov[0].iov_len = sizeof(*c); + m->iov[1].iov_base = x; + m->iov[1].iov_len = sizeof(*x); + + if (arg) { + m->iov[2].iov_base = arg; + m->iov[2].iov_len = asize; + m->h.msg_iovlen++; + } + + m->h.msg_name = NULL; + m->h.msg_namelen = 0; + m->h.msg_flags = 0; + + m->h.msg_control = &m->c; + + /* Need to memzero because of: + * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=514917 + */ + memzero(&m->c, sizeof(m->c)); + + m->h.msg_controllen = CMSG_SPACE(sizeof(struct ucred)); + + ch = CMSG_FIRSTHDR(&m->h); + ch->cmsg_len = CMSG_LEN(sizeof(struct ucred)); + ch->cmsg_level = SOL_SOCKET; + ch->cmsg_type = SCM_CREDENTIALS; + + ucred = (struct ucred *) CMSG_DATA(ch); + ucred->pid = getpid(); + ucred->uid = getuid(); + ucred->gid = getgid(); + + if (fd >= 0) { + 
m->h.msg_controllen += CMSG_SPACE(sizeof(int)); + ch = CMSG_NXTHDR(&m->h, ch); + BUG_ON(!ch); + ch->cmsg_len = CMSG_LEN(sizeof(int)); + ch->cmsg_level = SOL_SOCKET; + ch->cmsg_type = SCM_RIGHTS; + *((int *)CMSG_DATA(ch)) = fd; + } +} + +static void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd) +{ + struct cmsghdr *ch; + struct ucred *ucred; + + ch = CMSG_FIRSTHDR(&um->h); + BUG_ON(!ch); + BUG_ON(ch->cmsg_len != CMSG_LEN(sizeof(struct ucred))); + BUG_ON(ch->cmsg_level != SOL_SOCKET); + BUG_ON(ch->cmsg_type != SCM_CREDENTIALS); + + if (pid) { + ucred = (struct ucred *) CMSG_DATA(ch); + *pid = ucred->pid; + } + + ch = CMSG_NXTHDR(&um->h, ch); + + if (ch && ch->cmsg_len == CMSG_LEN(sizeof(int))) { + BUG_ON(ch->cmsg_level != SOL_SOCKET); + BUG_ON(ch->cmsg_type != SCM_RIGHTS); + *fd = *((int *)CMSG_DATA(ch)); + } else { + *fd = -1; + } +} + +static int usernsd(int sk) +{ + pr_info("uns: Daemon started\n"); + + while (1) { + struct unsc_msg um; + static char msg[MAX_UNSFD_MSG_SIZE]; + uns_call_t call; + int flags, fd, ret; + pid_t pid; + + unsc_msg_init(&um, &call, &flags, msg, sizeof(msg), 0); + if (recvmsg(sk, &um.h, 0) <= 0) { + pr_perror("uns: recv req error"); + return -1; + } + + unsc_msg_pid_fd(&um, &pid, &fd); + pr_debug("uns: daemon calls %p (%d, %d, %x)\n", call, pid, fd, flags); + + if (fd < 0 && flags & UNS_FDOUT) { + pr_err("uns: bad flags/fd %p %d %x\n", call, fd, flags); + BUG(); + } + + /* + * Caller has sent us bare address of the routine it + * wants to call. Since the caller is fork()-ed from the + * same process as the daemon is, the latter has exactly + * the same code at exactly the same address as the + * former guy has. So go ahead and just call one! + */ + + ret = call(msg, fd, pid); + + if (fd >= 0) + close(fd); + + if (flags & UNS_ASYNC) { + /* + * Async call failed and the called doesn't know + * about it. Exit now and let the stop_usernsd() + * check the exit code and abort the restoration. 
+ * + * We'd get there either by the end of restore or + * from the next userns_call() due to failed + * sendmsg() in there. + */ + if (ret < 0) { + pr_err("uns: Async call failed. Exiting\n"); + return -1; + } + + continue; + } + + if (flags & UNS_FDOUT) + fd = ret; + else + fd = -1; + + unsc_msg_init(&um, &call, &ret, NULL, 0, fd); + if (sendmsg(sk, &um.h, 0) <= 0) { + pr_perror("uns: send resp error"); + return -1; + } + + if (fd >= 0) + close(fd); + } +} + +int __userns_call(const char *func_name, uns_call_t call, int flags, + void *arg, size_t arg_size, int fd) +{ + int ret, res, sk; + bool async = flags & UNS_ASYNC; + struct unsc_msg um; + + if (unlikely(arg_size > MAX_UNSFD_MSG_SIZE)) { + pr_err("uns: message size exceeded\n"); + return -1; + } + + if (!usernsd_pid) + return call(arg, fd, getpid()); + + sk = get_service_fd(USERNSD_SK); + pr_debug("uns: calling %s (%d, %x)\n", func_name, fd, flags); + + if (!async) + /* + * Why don't we lock for async requests? Because + * they just put the request in the daemon's + * queue and do not wait for the response. Thus + * when daemon response there's only one client + * waiting for it in recvmsg below, so he + * responses to proper caller. + */ + mutex_lock(&task_entries->userns_sync_lock); + else + /* + * If we want the callback to give us and FD then + * we should NOT do the asynchronous call. 
+ */ + BUG_ON(flags & UNS_FDOUT); + + /* Send the request */ + + unsc_msg_init(&um, &call, &flags, arg, arg_size, fd); + ret = sendmsg(sk, &um.h, 0); + if (ret <= 0) { + pr_perror("uns: send req error"); + ret = -1; + goto out; + } + + if (async) { + ret = 0; + goto out; + } + + /* Get the response back */ + + unsc_msg_init(&um, &call, &res, NULL, 0, 0); + ret = recvmsg(sk, &um.h, 0); + if (ret <= 0) { + pr_perror("uns: recv resp error"); + ret = -1; + goto out; + } + + /* Decode the result and return */ + + if (flags & UNS_FDOUT) + unsc_msg_pid_fd(&um, NULL, &ret); + else + ret = res; +out: + if (!async) + mutex_unlock(&task_entries->userns_sync_lock); + + return ret; +} + +static int start_usernsd(void) +{ + int sk[2]; + int one = 1; + + if (!(root_ns_mask & CLONE_NEWUSER)) + return 0; + + /* + * Seqpacket to + * + * a) Help daemon distinguish individual requests from + * each other easily. Stream socket require manual + * messages boundaries. + * + * b) Make callers note the damon death by seeing the + * disconnected socket. In case of dgram socket + * callers would just get stuck in receiving the + * response. 
+ */ + + if (socketpair(PF_UNIX, SOCK_SEQPACKET, 0, sk)) { + pr_perror("Can't make usernsd socket"); + return -1; + } + + if (setsockopt(sk[0], SOL_SOCKET, SO_PASSCRED, &one, sizeof(one)) < 0) { + pr_perror("failed to setsockopt"); + return -1; + } + + if (setsockopt(sk[1], SOL_SOCKET, SO_PASSCRED, &one, sizeof(1)) < 0) { + pr_perror("failed to setsockopt"); + return -1; + } + + usernsd_pid = fork(); + if (usernsd_pid < 0) { + pr_perror("Can't fork usernsd"); + close(sk[0]); + close(sk[1]); + return -1; + } + + if (usernsd_pid == 0) { + int ret; + + close(sk[0]); + ret = usernsd(sk[1]); + exit(ret); + } + + close(sk[1]); + if (install_service_fd(USERNSD_SK, sk[0]) < 0) { + kill(usernsd_pid, SIGKILL); + waitpid(usernsd_pid, NULL, 0); + return -1; + } + + return 0; +} + +static int exit_usernsd(void *arg, int fd, pid_t pid) +{ + int code = *(int *)arg; + pr_info("uns: `- daemon exits w/ %d\n", code); + exit(code); +} + +int stop_usernsd(void) +{ + int ret = 0; + + if (usernsd_pid) { + int status = -1; + sigset_t blockmask, oldmask; + + /* + * Don't let the sigchld_handler() mess with us + * calling waitpid() on the exited daemon. The + * same is done in cr_system(). + */ + + sigemptyset(&blockmask); + sigaddset(&blockmask, SIGCHLD); + sigprocmask(SIG_BLOCK, &blockmask, &oldmask); + + /* + * Send a message to make sure the daemon _has_ + * proceeded all its queue of asynchronous requests. + * + * All the restoring processes might have already + * closed their USERNSD_SK descriptors, but daemon + * still has its in connected state -- this is us + * who hold the last reference on the peer. + * + * If daemon has exited "in advance" due to async + * call or socket error, the userns_call() and the + * waitpid() below would both fail and we'll see + * bad exit status. 
+ */ + + userns_call(exit_usernsd, UNS_ASYNC, &ret, sizeof(ret), -1); + waitpid(usernsd_pid, &status, 0); + + if (WIFEXITED(status)) + ret = WEXITSTATUS(status); + else + ret = -1; + + usernsd_pid = 0; + sigprocmask(SIG_SETMASK, &oldmask, NULL); + + if (ret != 0) + pr_err("uns: daemon exited abnormally\n"); + else + pr_info("uns: daemon stopped\n"); + } + + return ret; +} + +int prepare_userns(struct pstree_item *item) +{ + struct cr_img *img; + UsernsEntry *e; + int ret; + + img = open_image(CR_FD_USERNS, O_RSTR, item->ids->user_ns_id); + if (!img) + return -1; + ret = pb_read_one(img, &e, PB_USERNS); + close_image(img); + if (ret < 0) + return -1; + + if (write_id_map(item->pid->real, e->uid_map, e->n_uid_map, "uid_map")) + return -1; + + if (write_id_map(item->pid->real, e->gid_map, e->n_gid_map, "gid_map")) + return -1; + + return 0; +} + +int collect_namespaces(bool for_dump) +{ + int ret; + + ret = collect_user_namespaces(for_dump); + if (ret < 0) + return ret; + + ret = collect_mnt_namespaces(for_dump); + if (ret < 0) + return ret; + + ret = collect_net_namespaces(for_dump); + if (ret < 0) + return ret; + + return 0; +} + +int prepare_userns_creds(void) +{ + /* UID and GID must be set after restoring /proc/PID/{uid,gid}_maps */ + if (setuid(0) || setgid(0) || setgroups(0, NULL)) { + pr_perror("Unable to initialize id-s"); + return -1; + } + + /* + * This flag is dropped after entering userns, but is + * required to access files in /proc, so put one here + * temporarily. It will be set to proper value at the + * very end. 
+ */ + if (prctl(PR_SET_DUMPABLE, 1, 0)) { + pr_perror("Unable to set PR_SET_DUMPABLE"); + return -1; + } + + return 0; +} + +static int get_join_ns_fd(struct join_ns *jn) +{ + int pid, fd; + char nsf[32]; + char *pnsf; + + pid = atoi(jn->ns_file); + if (pid > 0) { + snprintf(nsf, sizeof(nsf), "/proc/%d/ns/%s", pid, jn->nd->str); + pnsf = nsf; + } else { + pnsf = jn->ns_file; + } + + fd = open(pnsf, O_RDONLY); + if (fd < 0) { + pr_perror("Can't open ns file %s", pnsf); + return -1; + } + jn->ns_fd = fd; + return 0; +} + +static int switch_join_ns(struct join_ns *jn) +{ + struct stat st, self_st; + char buf[32]; + + if (jn->nd == &user_ns_desc) { + /* It is not permitted to use setns() to reenter the caller's current + * user namespace. This prevents a caller that has dropped capabilities + * from regaining those capabilities via a call to setns() + */ + if (fstat(jn->ns_fd, &st) == -1) { + pr_perror("Can't get ns file %s stat", jn->ns_file); + return -1; + } + + snprintf(buf, sizeof(buf), "/proc/self/ns/%s", jn->nd->str); + if (stat(buf, &self_st) == -1) { + pr_perror("Can't get ns file %s stat", buf); + return -1; + } + + if (st.st_ino == self_st.st_ino) + return 0; + } + + if (setns(jn->ns_fd, jn->nd->cflag)) { + pr_perror("Failed to setns when join-ns %s:%s", jn->nd->str, jn->ns_file); + return -1; + } + + return 0; +} + +static int switch_user_join_ns(struct join_ns *jn) +{ + uid_t uid; + gid_t gid; + + if (jn == NULL) + return 0; + + if (switch_join_ns(jn)) + return -1; + + if (jn->extra_opts.user_extra.uid == NULL) + uid = getuid(); + else + uid = atoi(jn->extra_opts.user_extra.uid); + + if (jn->extra_opts.user_extra.gid == NULL) + gid = getgid(); + else + gid = atoi(jn->extra_opts.user_extra.gid); + + /* FIXME: + * if err occurs in setuid/setgid, should we just alert or + * return an error + */ + if (setuid(uid)) { + pr_perror("setuid failed while joining userns"); + return -1; + } + if (setgid(gid)) { + pr_perror("setgid failed while joining userns"); + 
return -1; + } + + return 0; +} + +int join_namespaces(void) +{ + struct join_ns *jn, *user_jn = NULL; + int ret = -1; + + list_for_each_entry(jn, &opts.join_ns, list) + if (get_join_ns_fd(jn)) + goto err_out; + + list_for_each_entry(jn, &opts.join_ns, list) + if (jn->nd == &user_ns_desc) { + user_jn = jn; + } else { + if (switch_join_ns(jn)) + goto err_out; + } + + if (switch_user_join_ns(user_jn)) + goto err_out; + + ret = 0; +err_out: + list_for_each_entry(jn, &opts.join_ns, list) + close_safe(&jn->ns_fd); + return ret; +} + +int prepare_namespace(struct pstree_item *item, unsigned long clone_flags) +{ + pid_t pid = vpid(item); + sigset_t sig_mask; + int id, ret = -1; + + pr_info("Restoring namespaces %d flags 0x%lx\n", + vpid(item), clone_flags); + + if (block_sigmask(&sig_mask, SIGCHLD) < 0) + return -1; + + /* + * SIGCHLD is blocked from here on, so every failure has to + * leave through "out" to get the saved mask restored. + */ + if ((clone_flags & CLONE_NEWUSER) && prepare_userns_creds()) + goto out; + + /* + * On netns restore we launch an IP tool, thus we + * have to restore it _before_ altering the mount + * tree (i.e. -- mnt_ns restoring) + */ + + id = ns_per_id ? item->ids->uts_ns_id : pid; + if ((clone_flags & CLONE_NEWUTS) && prepare_utsns(id)) + goto out; + id = ns_per_id ? item->ids->ipc_ns_id : pid; + if ((clone_flags & CLONE_NEWIPC) && prepare_ipc_ns(id)) + goto out; + + if (prepare_net_namespaces()) + goto out; + + /* + * This one is special -- there can be several mount + * namespaces and prepare_mnt_ns handles them itself. 
+ */ + if (prepare_mnt_ns()) + goto out; + + ret = 0; +out: + if (restore_sigmask(&sig_mask) < 0) + ret = -1; + + return ret; +} + +int prepare_namespace_before_tasks(void) +{ + if (start_usernsd()) + goto err_unds; + + if (netns_keep_nsfd()) + goto err_netns; + + if (mntns_maybe_create_roots()) + goto err_mnt; + + if (read_mnt_ns_img()) + goto err_img; + + if (read_net_ns_img()) + goto err_img; + + return 0; + +err_img: + cleanup_mnt_ns(); +err_mnt: + /* + * Nothing, netns' descriptor will be closed + * on criu exit + */ +err_netns: + stop_usernsd(); +err_unds: + return -1; +} + +struct ns_desc pid_ns_desc = NS_DESC_ENTRY(CLONE_NEWPID, "pid"); +struct ns_desc user_ns_desc = NS_DESC_ENTRY(CLONE_NEWUSER, "user"); diff --git a/CRIU_code/criu/net.c b/CRIU_code/criu/net.c new file mode 100644 index 0000000..fe9b51a --- /dev/null +++ b/CRIU_code/criu/net.c @@ -0,0 +1,3283 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_HAS_SELINUX +#include +#endif + +#include "../soccr/soccr.h" + +#include "imgset.h" +#include "namespaces.h" +#include "net.h" +#include "libnetlink.h" +#include "cr_options.h" +#include "sk-inet.h" +#include "tun.h" +#include "util-pie.h" +#include "plugin.h" +#include "action-scripts.h" +#include "sockets.h" +#include "pstree.h" +#include "string.h" +#include "sysctl.h" +#include "kerndat.h" +#include "util.h" +#include "external.h" +#include "fdstore.h" + +#include "protobuf.h" +#include "images/netdev.pb-c.h" +#include "images/inventory.pb-c.h" + +#ifndef IFLA_LINK_NETNSID +#define IFLA_LINK_NETNSID 37 +#undef IFLA_MAX +#define IFLA_MAX IFLA_LINK_NETNSID +#endif + +#ifndef RTM_NEWNSID +#define RTM_NEWNSID 88 +#endif + +#ifndef IFLA_MACVLAN_FLAGS +#define IFLA_MACVLAN_FLAGS 2 +#endif + +enum { + IFLA_IPTUN_UNSPEC, + IFLA_IPTUN_LINK, + IFLA_IPTUN_LOCAL, + IFLA_IPTUN_REMOTE, + IFLA_IPTUN_TTL, + 
IFLA_IPTUN_TOS, + IFLA_IPTUN_ENCAP_LIMIT, + IFLA_IPTUN_FLOWINFO, + IFLA_IPTUN_FLAGS, + IFLA_IPTUN_PROTO, + IFLA_IPTUN_PMTUDISC, + IFLA_IPTUN_6RD_PREFIX, + IFLA_IPTUN_6RD_RELAY_PREFIX, + IFLA_IPTUN_6RD_PREFIXLEN, + IFLA_IPTUN_6RD_RELAY_PREFIXLEN, + IFLA_IPTUN_ENCAP_TYPE, + IFLA_IPTUN_ENCAP_FLAGS, + IFLA_IPTUN_ENCAP_SPORT, + IFLA_IPTUN_ENCAP_DPORT, + __IFLA_IPTUN_MAX, +}; +#define IFLA_IPTUN_MAX (__IFLA_IPTUN_MAX - 1) + +static int ns_sysfs_fd = -1; + +int read_ns_sys_file(char *path, char *buf, int len) +{ + int fd, rlen; + + BUG_ON(ns_sysfs_fd == -1); + + fd = openat(ns_sysfs_fd, path, O_RDONLY, 0); + if (fd < 0) { + pr_perror("Can't open ns' %s", path); + return -1; + } + + rlen = read(fd, buf, len); + close(fd); + + if (rlen == len) { + pr_err("Too small buffer to read ns sys file %s\n", path); + return -1; + } + + if (rlen > 0) + buf[rlen - 1] = '\0'; + + return rlen; +} + +static bool sysctl_entries_equal(SysctlEntry *a, SysctlEntry *b) +{ + if (a->type != b->type) + return false; + + switch (a->type) { + case SYSCTL_TYPE__CTL_32: + return a->has_iarg && b->has_iarg && a->iarg == b->iarg; + case SYSCTL_TYPE__CTL_STR: + return a->sarg && b->sarg && !strcmp(a->sarg, b->sarg); + default:; + } + + return false; +} + +static char *devconfs4[] = { + "accept_local", + "accept_redirects", + "accept_source_route", + "arp_accept", + "arp_announce", + "arp_filter", + "arp_ignore", + "arp_notify", + "bootp_relay", + "disable_policy", + "disable_xfrm", + "force_igmp_version", + "forwarding", + "igmpv2_unsolicited_report_interval", + "igmpv3_unsolicited_report_interval", + "log_martians", + "medium_id", + "promote_secondaries", + "proxy_arp", + "proxy_arp_pvlan", + "route_localnet", + "rp_filter", + "secure_redirects", + "send_redirects", + "shared_media", + "src_valid_mark", + "tag", + "ignore_routes_with_linkdown", + "drop_gratuitous_arp", + "drop_unicast_in_l2_multicast", +}; + +char *devconfs6[] = { + "accept_dad", + "accept_ra", + "accept_ra_defrtr", + 
"accept_ra_from_local", + "accept_ra_min_hop_limit", + "accept_ra_mtu", + "accept_ra_pinfo", + "accept_ra_rt_info_max_plen", + "accept_ra_rtr_pref", + "accept_redirects", + "accept_source_route", + "autoconf", + "dad_transmits", + "disable_ipv6", + "drop_unicast_in_l2_multicast", + "drop_unsolicited_na", + "force_mld_version", + "force_tllao", + "forwarding", + "hop_limit", + "ignore_routes_with_linkdown", + "keep_addr_on_down", + "max_addresses", + "max_desync_factor", + "mldv1_unsolicited_report_interval", + "mldv2_unsolicited_report_interval", + "mtu", + "ndisc_notify", + "optimistic_dad", + "proxy_ndp", + "regen_max_retry", + "router_probe_interval", + "router_solicitation_delay", + "router_solicitation_interval", + "router_solicitations", + "stable_secret", + "suppress_frag_ndisc", + "temp_prefered_lft", + "temp_valid_lft", + "use_oif_addrs_only", + "use_optimistic", + "use_tempaddr", +}; + +#define CONF_OPT_PATH "net/%s/conf/%s/%s" +#define MAX_CONF_OPT_PATH IFNAMSIZ+60 +#define MAX_STR_CONF_LEN 200 + +static int net_conf_op(char *tgt, SysctlEntry **conf, int n, int op, char *proto, + struct sysctl_req *req, char (*path)[MAX_CONF_OPT_PATH], int size, + char **devconfs, SysctlEntry **def_conf) +{ + int i, ri, ar = -1; + int ret, flags = op == CTL_READ ? 
CTL_FLAGS_OPTIONAL : 0; + SysctlEntry **rconf; + + if (n > size) + pr_warn("The image contains unknown sysctl-s\n"); + + if (opts.weak_sysctls) + flags = CTL_FLAGS_OPTIONAL; + + rconf = xmalloc(sizeof(SysctlEntry *) * size); + if (!rconf) + return -1; + + for (i = 0, ri = 0; i < size; i++) { + if (i >= n) { + pr_warn("Skip %s/%s\n", tgt, devconfs[i]); + continue; + } + /* + * If dev conf value is the same as default skip restoring it, + * mtu may be changed by disable_ipv6 so we can not skip + * it's restore + */ + if (def_conf && sysctl_entries_equal(conf[i], def_conf[i]) + && strcmp(devconfs[i], "mtu")) { + pr_debug("Skip %s/%s, coincides with default\n", tgt, devconfs[i]); + continue; + } + + /* + * Make "accept_redirects" go last on write(it should + * restore after forwarding to be correct) + */ + if (op == CTL_WRITE && !strcmp(devconfs[i], "accept_redirects")) { + ar = i; + continue; + } + + snprintf(path[i], MAX_CONF_OPT_PATH, CONF_OPT_PATH, proto, tgt, devconfs[i]); + req[ri].name = path[i]; + req[ri].flags = flags; + switch (conf[i]->type) { + case SYSCTL_TYPE__CTL_32: + req[ri].type = CTL_32; + + /* skip non-existing sysctl */ + if (op == CTL_WRITE && !conf[i]->has_iarg) + continue; + + req[ri].arg = &conf[i]->iarg; + break; + case SYSCTL_TYPE__CTL_STR: + req[ri].type = CTL_STR(MAX_STR_CONF_LEN); + req[ri].flags |= op == CTL_READ && !strcmp(devconfs[i], "stable_secret") + ? 
CTL_FLAGS_READ_EIO_SKIP : 0; + + /* skip non-existing sysctl */ + if (op == CTL_WRITE && !conf[i]->sarg) + continue; + + req[ri].arg = conf[i]->sarg; + break; + default: + continue; + } + rconf[ri] = conf[i]; + ri++; + } + + if (ar != -1 + && conf[ar]->type == SYSCTL_TYPE__CTL_32 + && conf[ar]->has_iarg) { + snprintf(path[ar], MAX_CONF_OPT_PATH, CONF_OPT_PATH, proto, tgt, devconfs[ar]); + req[ri].name = path[ar]; + req[ri].type = CTL_32; + req[ri].arg = &conf[ar]->iarg; + req[ri].flags = flags; + rconf[ri] = conf[ar]; + ri++; + } + + ret = sysctl_op(req, ri, op, CLONE_NEWNET); + if (ret < 0) { + pr_err("Failed to %s %s/\n", (op == CTL_READ)?"read":"write", tgt); + goto err_free; + } + + if (op == CTL_READ) { + /* (un)mark (non-)existing sysctls in image */ + for (i = 0; i < ri; i++) + if (req[i].flags & CTL_FLAGS_HAS) { + if (rconf[i]->type == SYSCTL_TYPE__CTL_32) + rconf[i]->has_iarg = true; + } else { + if (rconf[i]->type == SYSCTL_TYPE__CTL_STR) + rconf[i]->sarg = NULL; + } + } + +err_free: + xfree(rconf); + return ret; +} + +static int ipv4_conf_op(char *tgt, SysctlEntry **conf, int n, int op, SysctlEntry **def_conf) +{ + struct sysctl_req req[ARRAY_SIZE(devconfs4)]; + char path[ARRAY_SIZE(devconfs4)][MAX_CONF_OPT_PATH]; + + return net_conf_op(tgt, conf, n, op, "ipv4", + req, path, ARRAY_SIZE(devconfs4), + devconfs4, def_conf); +} + +static int ipv6_conf_op(char *tgt, SysctlEntry **conf, int n, int op, SysctlEntry **def_conf) +{ + struct sysctl_req req[ARRAY_SIZE(devconfs6)]; + char path[ARRAY_SIZE(devconfs6)][MAX_CONF_OPT_PATH]; + + return net_conf_op(tgt, conf, n, op, "ipv6", + req, path, ARRAY_SIZE(devconfs6), + devconfs6, def_conf); +} + +/* + * I case if some entry is missing in + * the kernel, simply write DEVCONFS_UNUSED + * into the image so we would skip it. + */ +#define DEVCONFS_UNUSED (-1u) + +static int ipv4_conf_op_old(char *tgt, int *conf, int n, int op, int *def_conf) +{ + int i, ri; + int ret, flags = op == CTL_READ ? 
CTL_FLAGS_OPTIONAL : 0; + struct sysctl_req req[ARRAY_SIZE(devconfs4)]; + char path[ARRAY_SIZE(devconfs4)][MAX_CONF_OPT_PATH]; + + if (n > ARRAY_SIZE(devconfs4)) + pr_warn("The image contains unknown sysctl-s\n"); + + for (i = 0, ri = 0; i < ARRAY_SIZE(devconfs4); i++) { + if (i >= n) { + pr_warn("Skip %s/%s\n", tgt, devconfs4[i]); + continue; + } + /* + * If dev conf value is the same as default skip restoring it + */ + if (def_conf && conf[i] == def_conf[i]) { + pr_debug("DEBUG Skip %s/%s, val =%d\n", tgt, devconfs4[i], conf[i]); + continue; + } + + if (op == CTL_WRITE && conf[i] == DEVCONFS_UNUSED) + continue; + else if (op == CTL_READ) + conf[i] = DEVCONFS_UNUSED; + + snprintf(path[i], MAX_CONF_OPT_PATH, CONF_OPT_PATH, "ipv4", tgt, devconfs4[i]); + req[ri].name = path[i]; + req[ri].arg = &conf[i]; + req[ri].type = CTL_32; + req[ri].flags = flags; + ri++; + } + + ret = sysctl_op(req, ri, op, CLONE_NEWNET); + if (ret < 0) { + pr_err("Failed to %s %s/\n", (op == CTL_READ)?"read":"write", tgt); + return -1; + } + return 0; +} + +int write_netdev_img(NetDeviceEntry *nde, struct cr_imgset *fds, struct nlattr **info) +{ + return pb_write_one(img_from_set(fds, CR_FD_NETDEV), nde, PB_NETDEV); +} + +static int lookup_net_by_netid(struct ns_id *ns, int net_id) +{ + struct netns_id *p; + + list_for_each_entry(p, &ns->net.ids, node) + if (p->netnsid_value == net_id) + return p->target_ns_id; + + return -1; +} + +static int dump_one_netdev(int type, struct ifinfomsg *ifi, + struct nlattr **tb, struct ns_id *ns, struct cr_imgset *fds, + int (*dump)(NetDeviceEntry *, struct cr_imgset *, struct nlattr **info)) +{ + int ret = -1, i, peer_ifindex; + NetDeviceEntry netdev = NET_DEVICE_ENTRY__INIT; + SysctlEntry *confs4 = NULL; + int size4 = ARRAY_SIZE(devconfs4); + SysctlEntry *confs6 = NULL; + int size6 = ARRAY_SIZE(devconfs6); + char stable_secret[MAX_STR_CONF_LEN + 1] = {}; + struct nlattr *info[IFLA_INFO_MAX + 1], **arg = NULL; + + if (!tb[IFLA_IFNAME]) { + pr_err("No name for 
link %d\n", ifi->ifi_index); + return -1; + } + + netdev.type = type; + netdev.ifindex = ifi->ifi_index; + netdev.mtu = *(int *)RTA_DATA(tb[IFLA_MTU]); + netdev.flags = ifi->ifi_flags; + netdev.name = RTA_DATA(tb[IFLA_IFNAME]); + + if (kdat.has_nsid) { + s32 nsid = -1; + + peer_ifindex = ifi->ifi_index; + if (tb[IFLA_LINK]) + peer_ifindex = nla_get_u32(tb[IFLA_LINK]); + + netdev.has_peer_ifindex = true; + netdev.peer_ifindex = peer_ifindex; + + if (tb[IFLA_LINK_NETNSID]) + nsid = nla_get_s32(tb[IFLA_LINK_NETNSID]); + + pr_debug("The peer link is in the %d netns with the %u index\n", + nsid, netdev.peer_ifindex); + + if (nsid == -1) + nsid = ns->id; + else + nsid = lookup_net_by_netid(ns, nsid); + if (nsid < 0) { + pr_warn("The %s veth is in an external netns\n", + netdev.name); + } else { + netdev.has_peer_nsid = true; + netdev.peer_nsid = nsid; + } + } + /* + * If kdat.has_nsid is false, a multiple network namespaces are not dumped, + * so if we are here, this means only one netns is dumped. 
+ */ + + if (tb[IFLA_ADDRESS] && (type != ND_TYPE__LOOPBACK)) { + netdev.has_address = true; + netdev.address.data = nla_data(tb[IFLA_ADDRESS]); + netdev.address.len = nla_len(tb[IFLA_ADDRESS]); + pr_info("Found ll addr (%02x:../%d) for %s\n", + (int)netdev.address.data[0], + (int)netdev.address.len, netdev.name); + } + + if (tb[IFLA_MASTER]) { + netdev.has_master = true; + netdev.master = nla_get_u32(tb[IFLA_MASTER]); + } + + netdev.n_conf4 = size4; + netdev.conf4 = xmalloc(sizeof(SysctlEntry *) * size4); + if (!netdev.conf4) + goto err_free; + + confs4 = xmalloc(sizeof(SysctlEntry) * size4); + if (!confs4) + goto err_free; + + for (i = 0; i < size4; i++) { + sysctl_entry__init(&confs4[i]); + netdev.conf4[i] = &confs4[i]; + netdev.conf4[i]->type = CTL_32; + } + + netdev.n_conf6 = size6; + netdev.conf6 = xmalloc(sizeof(SysctlEntry *) * size6); + if (!netdev.conf6) + goto err_free; + + confs6 = xmalloc(sizeof(SysctlEntry) * size6); + if (!confs6) + goto err_free; + + for (i = 0; i < size6; i++) { + sysctl_entry__init(&confs6[i]); + netdev.conf6[i] = &confs6[i]; + if (strcmp(devconfs6[i], "stable_secret")) { + netdev.conf6[i]->type = SYSCTL_TYPE__CTL_32; + } else { + netdev.conf6[i]->type = SYSCTL_TYPE__CTL_STR; + netdev.conf6[i]->sarg = stable_secret; + } + } + + ret = ipv4_conf_op(netdev.name, netdev.conf4, size4, CTL_READ, NULL); + if (ret < 0) + goto err_free; + + ret = ipv6_conf_op(netdev.name, netdev.conf6, size6, CTL_READ, NULL); + if (ret < 0) + goto err_free; + + if (!dump) + dump = write_netdev_img; + + if (tb[IFLA_LINKINFO]) { + ret = nla_parse_nested(info, IFLA_INFO_MAX, tb[IFLA_LINKINFO], NULL); + if (ret < 0) { + pr_err("failed to parse nested linkinfo\n"); + /* The conf arrays are allocated by now, release them */ + ret = -1; + goto err_free; + } + arg = info; + } + + ret = dump(&netdev, fds, arg); +err_free: + xfree(netdev.conf4); + xfree(confs4); + xfree(netdev.conf6); + xfree(confs6); + return ret; +} + +static char *link_kind(struct ifinfomsg *ifi, struct nlattr **tb) +{ + struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; + + 
if (!tb[IFLA_LINKINFO]) { + pr_err("No linkinfo for eth link %d\n", ifi->ifi_index); + return NULL; + } + + nla_parse_nested(linkinfo, IFLA_INFO_MAX, tb[IFLA_LINKINFO], NULL); + if (!linkinfo[IFLA_INFO_KIND]) { + pr_err("No kind for eth link %d\n", ifi->ifi_index); + return NULL; + } + + return nla_data(linkinfo[IFLA_INFO_KIND]); +} + +static int dump_unknown_device(struct ifinfomsg *ifi, char *kind, + struct nlattr **tb, struct ns_id *ns, struct cr_imgset *fds) +{ + int ret; + + ret = run_plugins(DUMP_EXT_LINK, ifi->ifi_index, ifi->ifi_type, kind); + if (ret == 0) + return dump_one_netdev(ND_TYPE__EXTLINK, ifi, tb, ns, fds, NULL); + + if (ret == -ENOTSUP) + pr_err("Unsupported link %d (type %d kind %s)\n", + ifi->ifi_index, ifi->ifi_type, kind); + return -1; +} + +static int dump_bridge(NetDeviceEntry *nde, struct cr_imgset *imgset, struct nlattr **info) +{ + return write_netdev_img(nde, imgset, info); +} + +static int dump_macvlan(NetDeviceEntry *nde, struct cr_imgset *imgset, struct nlattr **info) +{ + MacvlanLinkEntry macvlan = MACVLAN_LINK_ENTRY__INIT; + int ret; + struct nlattr *data[IFLA_MACVLAN_FLAGS+1]; + + if (!info || !info[IFLA_INFO_DATA]) { + pr_err("no data for macvlan\n"); + return -1; + } + + ret = nla_parse_nested(data, IFLA_MACVLAN_FLAGS, info[IFLA_INFO_DATA], NULL); + if (ret < 0) { + pr_err("failed ot parse macvlan data\n"); + return -1; + } + + if (!data[IFLA_MACVLAN_MODE]) { + pr_err("macvlan mode required for %s\n", nde->name); + return -1; + } + + macvlan.mode = *((u32 *)RTA_DATA(data[IFLA_MACVLAN_MODE])); + + if (data[IFLA_MACVLAN_FLAGS]) + macvlan.flags = *((u16 *) RTA_DATA(data[IFLA_MACVLAN_FLAGS])); + + nde->macvlan = &macvlan; + return write_netdev_img(nde, imgset, info); +} + +static int dump_one_ethernet(struct ifinfomsg *ifi, char *kind, + struct nlattr **tb, struct ns_id *ns, struct cr_imgset *fds) +{ + if (!strcmp(kind, "veth")) + /* + * This is not correct. 
The peer of the veth device may + * be either outside or inside the netns we're working + * on, but there's currently no way of finding this out. + * + * Sigh... we have to assume, that the veth device is a + * connection to the outer world and just dump this end :( + */ + return dump_one_netdev(ND_TYPE__VETH, ifi, tb, ns, fds, NULL); + if (!strcmp(kind, "tun")) + return dump_one_netdev(ND_TYPE__TUN, ifi, tb, ns, fds, dump_tun_link); + if (!strcmp(kind, "bridge")) + return dump_one_netdev(ND_TYPE__BRIDGE, ifi, tb, ns, fds, dump_bridge); + if (!strcmp(kind, "gretap")) { + char *name = (char *)RTA_DATA(tb[IFLA_IFNAME]); + + if (!name) { + pr_err("gretap %d has no name\n", ifi->ifi_index); + return -1; + } + + if (!strcmp(name, "gretap0")) { + pr_info("found %s, ignoring\n", name); + return 0; + } + + pr_warn("GRE tap device %s not supported natively\n", name); + } + if (!strcmp(kind, "macvlan")) + return dump_one_netdev(ND_TYPE__MACVLAN, ifi, tb, ns, fds, dump_macvlan); + + return dump_unknown_device(ifi, kind, tb, ns, fds); +} + +static int dump_one_gendev(struct ifinfomsg *ifi, char *kind, + struct nlattr **tb, struct ns_id *ns, struct cr_imgset *fds) +{ + if (!strcmp(kind, "tun")) + return dump_one_netdev(ND_TYPE__TUN, ifi, tb, ns, fds, dump_tun_link); + + return dump_unknown_device(ifi, kind, tb, ns, fds); +} + +static int dump_one_voiddev(struct ifinfomsg *ifi, char *kind, + struct nlattr **tb, struct ns_id *ns, struct cr_imgset *fds) +{ + if (!strcmp(kind, "venet")) + return dump_one_netdev(ND_TYPE__VENET, ifi, tb, ns, fds, NULL); + + return dump_unknown_device(ifi, kind, tb, ns, fds); +} + +static int dump_one_gre(struct ifinfomsg *ifi, char *kind, + struct nlattr **tb, struct ns_id *ns, struct cr_imgset *fds) +{ + if (!strcmp(kind, "gre")) { + char *name = (char *)RTA_DATA(tb[IFLA_IFNAME]); + if (!name) { + pr_err("gre device %d has no name\n", ifi->ifi_index); + return -1; + } + + if (!strcmp(name, "gre0")) { + pr_info("found %s, ignoring\n", name); + return 
0; + } + + pr_warn("GRE tunnel device %s not supported natively\n", name); + } + + return dump_unknown_device(ifi, kind, tb, ns, fds); +} + +static int dump_sit(NetDeviceEntry *nde, struct cr_imgset *imgset, struct nlattr **info) +{ + int ret; + struct nlattr *data[__IFLA_IPTUN_MAX]; + SitEntry se = SIT_ENTRY__INIT; + /* There are for IP(v6) addresses kernel feeds to us */ + uint32_t a_local, a_remote, rd_prefix[4], rl_prefix; + + if (!info || !info[IFLA_INFO_DATA]) { + pr_err("no data for sit\n"); + return -1; + } + + pr_info("Some data for SIT provided\n"); + ret = nla_parse_nested(data, IFLA_IPTUN_MAX, info[IFLA_INFO_DATA], NULL); + if (ret < 0) { + pr_err("failed ot parse sit data\n"); + return -1; + } + +#define ENCODE_ENTRY(__type, __ifla, __proto) do { \ + if (data[__ifla]) { \ + se.__proto = *(__type *)nla_data(data[__ifla]); \ + se.has_##__proto = true; \ + } \ + } while (0) + + if (data[IFLA_IPTUN_LOCAL]) { + a_local = *(u32 *)nla_data(data[IFLA_IPTUN_LOCAL]); + if (a_local != 0) { + se.n_local = 1; + se.local = &a_local; + } + } + + if (data[IFLA_IPTUN_REMOTE]) { + a_remote = *(u32 *)nla_data(data[IFLA_IPTUN_REMOTE]); + if (a_remote != 0) { + se.n_remote = 1; + se.remote = &a_remote; + } + } + + ENCODE_ENTRY(u32, IFLA_IPTUN_LINK, link); + ENCODE_ENTRY(u8, IFLA_IPTUN_TTL, ttl); + ENCODE_ENTRY(u8, IFLA_IPTUN_TOS, tos); + ENCODE_ENTRY(u16, IFLA_IPTUN_FLAGS, flags); + ENCODE_ENTRY(u8, IFLA_IPTUN_PROTO, proto); + + if (data[IFLA_IPTUN_PMTUDISC]) { + u8 v; + + v = *(u8 *)nla_data(data[IFLA_IPTUN_PMTUDISC]); + if (v) + se.pmtudisc = se.has_pmtudisc = true; + } + + ENCODE_ENTRY(u16, IFLA_IPTUN_ENCAP_TYPE, encap_type); + ENCODE_ENTRY(u16, IFLA_IPTUN_ENCAP_FLAGS, encap_flags); + ENCODE_ENTRY(u16, IFLA_IPTUN_ENCAP_SPORT, encap_sport); + ENCODE_ENTRY(u16, IFLA_IPTUN_ENCAP_DPORT, encap_dport); + + if (data[IFLA_IPTUN_6RD_PREFIXLEN]) { + se.rd_prefixlen = *(u16 *)nla_data(data[IFLA_IPTUN_6RD_PREFIXLEN]); + if (!se.rd_prefixlen) + goto skip; + + if 
(!data[IFLA_IPTUN_6RD_PREFIX]) { + pr_err("No 6rd prefix for sit device\n"); + return -1; + } + + se.has_rd_prefixlen = true; + memcpy(&rd_prefix, nla_data(data[IFLA_IPTUN_6RD_PREFIX]), sizeof(rd_prefix)); + se.n_rd_prefix = 4; + se.rd_prefix = rd_prefix; + + se.relay_prefixlen = *(u16 *)nla_data(data[IFLA_IPTUN_6RD_RELAY_PREFIXLEN]); + if (!se.relay_prefixlen) + goto skip; + + if (!data[IFLA_IPTUN_6RD_RELAY_PREFIX]) { + pr_err("No 6rd relay prefix for sit device\n"); + return -1; + } + + se.has_relay_prefixlen = true; + memcpy(&rl_prefix, nla_data(data[IFLA_IPTUN_6RD_RELAY_PREFIX]), sizeof(rl_prefix)); + se.n_relay_prefix = 1; + se.relay_prefix = &rl_prefix; +skip:; + } + +#undef ENCODE_ENTRY + + nde->sit = &se; + return write_netdev_img(nde, imgset, info); +} + +static int dump_one_sit(struct ifinfomsg *ifi, char *kind, + struct nlattr **tb, struct ns_id *ns, struct cr_imgset *fds) +{ + char *name; + + if (strcmp(kind, "sit")) { + pr_err("SIT device with %s kind\n", kind); + return -1; + } + + name = (char *)RTA_DATA(tb[IFLA_IFNAME]); + if (!name) { + pr_err("sit device %d has no name\n", ifi->ifi_index); + return -1; + } + + if (!strcmp(name, "sit0")) { + pr_info("found %s, ignoring\n", name); + return 0; + } + + return dump_one_netdev(ND_TYPE__SIT, ifi, tb, ns, fds, dump_sit); +} + +static int list_one_link(struct nlmsghdr *hdr, struct ns_id *ns, void *arg) +{ + return 0; +} + +static int dump_one_link(struct nlmsghdr *hdr, struct ns_id *ns, void *arg) +{ + struct cr_imgset *fds = arg; + struct ifinfomsg *ifi; + int ret = 0, len = hdr->nlmsg_len - NLMSG_LENGTH(sizeof(*ifi)); + struct nlattr *tb[IFLA_MAX + 1]; + char *kind; + + ifi = NLMSG_DATA(hdr); + + if (len < 0) { + pr_err("No iflas for link %d\n", ifi->ifi_index); + return -1; + } + + nlmsg_parse(hdr, sizeof(struct ifinfomsg), tb, IFLA_MAX, NULL); + pr_info("\tLD: Got link %d, type %d\n", ifi->ifi_index, ifi->ifi_type); + + if (ifi->ifi_type == ARPHRD_LOOPBACK) + return dump_one_netdev(ND_TYPE__LOOPBACK, 
ifi, tb, ns, fds, NULL); + + kind = link_kind(ifi, tb); + if (!kind) + goto unk; + + switch (ifi->ifi_type) { + case ARPHRD_ETHER: + ret = dump_one_ethernet(ifi, kind, tb, ns, fds); + break; + case ARPHRD_NONE: + ret = dump_one_gendev(ifi, kind, tb, ns, fds); + break; + case ARPHRD_VOID: + ret = dump_one_voiddev(ifi, kind, tb, ns, fds); + break; + case ARPHRD_IPGRE: + ret = dump_one_gre(ifi, kind, tb, ns, fds); + break; + case ARPHRD_SIT: + ret = dump_one_sit(ifi, kind, tb, ns, fds); + break; + default: +unk: + ret = dump_unknown_device(ifi, kind, tb, ns, fds); + break; + } + + return ret; +} + +static int dump_one_nf(struct nlmsghdr *hdr, struct ns_id *ns, void *arg) +{ + struct cr_img *img = arg; + + if (lazy_image(img) && open_image_lazy(img)) + return -1; + + if (write_img_buf(img, hdr, hdr->nlmsg_len)) + return -1; + + return 0; +} + +static int ct_restore_callback(struct nlmsghdr *nlh) +{ + struct nfgenmsg *msg; + struct nlattr *tb[CTA_MAX+1], *tbp[CTA_PROTOINFO_MAX + 1], *tb_tcp[CTA_PROTOINFO_TCP_MAX+1]; + int err; + + msg = NLMSG_DATA(nlh); + + if (msg->nfgen_family != AF_INET && msg->nfgen_family != AF_INET6) + return 0; + + err = nlmsg_parse(nlh, sizeof(struct nfgenmsg), tb, CTA_MAX, NULL); + if (err < 0) + return -1; + + if (!tb[CTA_PROTOINFO]) + return 0; + + err = nla_parse_nested(tbp, CTA_PROTOINFO_MAX, tb[CTA_PROTOINFO], NULL); + if (err < 0) + return -1; + + if (!tbp[CTA_PROTOINFO_TCP]) + return 0; + + err = nla_parse_nested(tb_tcp, CTA_PROTOINFO_TCP_MAX, tbp[CTA_PROTOINFO_TCP], NULL); + if (err < 0) + return -1; + + if (tb_tcp[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]) { + struct nf_ct_tcp_flags *flags; + + flags = nla_data(tb_tcp[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]); + flags->flags |= IP_CT_TCP_FLAG_BE_LIBERAL; + flags->mask |= IP_CT_TCP_FLAG_BE_LIBERAL; + } + + if (tb_tcp[CTA_PROTOINFO_TCP_FLAGS_REPLY]) { + struct nf_ct_tcp_flags *flags; + + flags = nla_data(tb_tcp[CTA_PROTOINFO_TCP_FLAGS_REPLY]); + flags->flags |= IP_CT_TCP_FLAG_BE_LIBERAL; + flags->mask 
|= IP_CT_TCP_FLAG_BE_LIBERAL; + } + + return 0; +} + +static int restore_nf_ct(int pid, int type) +{ + struct nlmsghdr *nlh = NULL; + int exit_code = -1, sk; + struct cr_img *img; + + img = open_image(type, O_RSTR, pid); + if (img == NULL) + return -1; + if (empty_image(img)) { + close_image(img); + return 0; + } + + sk = socket(AF_NETLINK, SOCK_RAW, NETLINK_NETFILTER); + if (sk < 0) { + pr_perror("Can't open rtnl sock for net dump"); + goto out_img; + } + + nlh = xmalloc(sizeof(struct nlmsghdr)); + if (nlh == NULL) + goto out; + + while (1) { + struct nlmsghdr *p; + int ret; + + ret = read_img_buf_eof(img, nlh, sizeof(struct nlmsghdr)); + if (ret < 0) + goto out; + if (ret == 0) + break; + + p = xrealloc(nlh, nlh->nlmsg_len); + if (p == NULL) + goto out; + nlh = p; + + ret = read_img_buf_eof(img, nlh + 1, nlh->nlmsg_len - sizeof(struct nlmsghdr)); + if (ret < 0) + goto out; + if (ret == 0) { + pr_err("The image file was truncated\n"); + goto out; + } + + if (type == CR_FD_NETNF_CT) + if (ct_restore_callback(nlh)) + goto out; + + nlh->nlmsg_flags = NLM_F_REQUEST|NLM_F_ACK|NLM_F_CREATE; + ret = do_rtnl_req(sk, nlh, nlh->nlmsg_len, NULL, NULL, NULL, NULL); + if (ret) + goto out; + } + + exit_code = 0; +out: + xfree(nlh); + close(sk); +out_img: + close_image(img); + return exit_code; +} + +static int dump_nf_ct(struct cr_imgset *fds, int type) +{ + struct cr_img *img; + struct { + struct nlmsghdr nlh; + struct nfgenmsg g; + } req; + int sk, ret; + + pr_info("Dumping netns links\n"); + + ret = sk = socket(AF_NETLINK, SOCK_RAW, NETLINK_NETFILTER); + if (sk < 0) { + pr_perror("Can't open rtnl sock for net dump"); + goto out; + } + + memset(&req, 0, sizeof(req)); + req.nlh.nlmsg_len = sizeof(req); + req.nlh.nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8); + + if (type == CR_FD_NETNF_CT) + req.nlh.nlmsg_type |= IPCTNL_MSG_CT_GET; + else if (type == CR_FD_NETNF_EXP) + req.nlh.nlmsg_type |= IPCTNL_MSG_EXP_GET; + else + BUG(); + + req.nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST; + 
req.nlh.nlmsg_pid = 0; + req.nlh.nlmsg_seq = CR_NLMSG_SEQ; + req.g.nfgen_family = AF_UNSPEC; + + img = img_from_set(fds, type); + + ret = do_rtnl_req(sk, &req, sizeof(req), dump_one_nf, NULL, NULL, img); + close(sk); +out: + return ret; + +} + +/* + * When we request information about a link, the kernel shows + * information about the pair device (netns id and idx). + * If a pair device lives in another namespace and this namespace + * doesn't have a netns ID in the current namespace, the kernel + * will generate it. So we need to list all links, before dumping + * netns indexes. + */ +static int list_links(int rtsk, void *args) +{ + struct { + struct nlmsghdr nlh; + struct rtgenmsg g; + } req; + + pr_info("Dumping netns links\n"); + + memset(&req, 0, sizeof(req)); + req.nlh.nlmsg_len = sizeof(req); + req.nlh.nlmsg_type = RTM_GETLINK; + req.nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST; + req.nlh.nlmsg_pid = 0; + req.nlh.nlmsg_seq = CR_NLMSG_SEQ; + req.g.rtgen_family = AF_PACKET; + + return do_rtnl_req(rtsk, &req, sizeof(req), list_one_link, NULL, NULL, args); +} + +static int dump_links(int rtsk, struct ns_id *ns, struct cr_imgset *fds) +{ + struct { + struct nlmsghdr nlh; + struct rtgenmsg g; + } req; + + pr_info("Dumping netns links\n"); + + memset(&req, 0, sizeof(req)); + req.nlh.nlmsg_len = sizeof(req); + req.nlh.nlmsg_type = RTM_GETLINK; + req.nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST; + req.nlh.nlmsg_pid = 0; + req.nlh.nlmsg_seq = CR_NLMSG_SEQ; + req.g.rtgen_family = AF_PACKET; + + return do_rtnl_req(rtsk, &req, sizeof(req), dump_one_link, NULL, ns, fds); +} + +static int restore_link_cb(struct nlmsghdr *hdr, struct ns_id *ns, void *arg) +{ + pr_info("Got response on SETLINK =)\n"); + return 0; +} + +struct newlink_req { + struct nlmsghdr h; + struct ifinfomsg i; + char buf[1024]; +}; + +/* Optional extra things to be provided at the top level of the NEWLINK + * request. 
+ */ +struct newlink_extras { + int link; /* IFLA_LINK */ + int target_netns; /* IFLA_NET_NS_FD */ +}; + +typedef int (*link_info_t)(struct ns_id *ns, struct net_link *, struct newlink_req *); + +static int populate_newlink_req(struct ns_id *ns, struct newlink_req *req, + int msg_type, struct net_link * link, + link_info_t link_info, struct newlink_extras *extras) +{ + NetDeviceEntry *nde = link->nde; + + memset(req, 0, sizeof(*req)); + + req->h.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); + req->h.nlmsg_flags = NLM_F_REQUEST|NLM_F_ACK|NLM_F_CREATE; + req->h.nlmsg_type = msg_type; + req->h.nlmsg_seq = CR_NLMSG_SEQ; + req->i.ifi_family = AF_PACKET; + /* + * SETLINK is called for external devices which may + * have ifindex changed. Thus configure them by their + * name only. + */ + if (msg_type == RTM_NEWLINK) + req->i.ifi_index = nde->ifindex; + req->i.ifi_flags = nde->flags; + + if (extras) { + if (extras->link >= 0) + addattr_l(&req->h, sizeof(*req), IFLA_LINK, &extras->link, sizeof(extras->link)); + + if (extras->target_netns >= 0) + addattr_l(&req->h, sizeof(*req), IFLA_NET_NS_FD, &extras->target_netns, sizeof(extras->target_netns)); + + } + + addattr_l(&req->h, sizeof(*req), IFLA_IFNAME, nde->name, strlen(nde->name)); + addattr_l(&req->h, sizeof(*req), IFLA_MTU, &nde->mtu, sizeof(nde->mtu)); + + if (nde->has_address) { + pr_debug("Restore ll addr (%02x:../%d) for device\n", + (int)nde->address.data[0], (int)nde->address.len); + addattr_l(&req->h, sizeof(*req), IFLA_ADDRESS, + nde->address.data, nde->address.len); + } + + if (link_info) { + struct rtattr *linkinfo; + int ret; + + linkinfo = NLMSG_TAIL(&req->h); + addattr_l(&req->h, sizeof(*req), IFLA_LINKINFO, NULL, 0); + + ret = link_info(ns, link, req); + if (ret < 0) + return ret; + + linkinfo->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)linkinfo; + } + + return 0; +} + +static int do_rtm_link_req(int msg_type, + struct net_link *link, int nlsk, struct ns_id *ns, + link_info_t link_info, struct 
newlink_extras *extras) +{ + struct newlink_req req; + + if (populate_newlink_req(ns, &req, msg_type, link, link_info, extras) < 0) + return -1; + + return do_rtnl_req(nlsk, &req, req.h.nlmsg_len, restore_link_cb, NULL, NULL, NULL); +} + +int restore_link_parms(struct net_link *link, int nlsk) +{ + return do_rtm_link_req(RTM_SETLINK, link, nlsk, NULL, NULL, NULL); +} + +static int restore_one_link(struct ns_id *ns, struct net_link *link, int nlsk, + link_info_t link_info, struct newlink_extras *extras) +{ + pr_info("Restoring netdev %s idx %d\n", link->nde->name, link->nde->ifindex); + return do_rtm_link_req(RTM_NEWLINK, link, nlsk, ns, link_info, extras); +} + +#ifndef VETH_INFO_MAX +enum { + VETH_INFO_UNSPEC, + VETH_INFO_PEER, + + __VETH_INFO_MAX +#define VETH_INFO_MAX (__VETH_INFO_MAX - 1) +}; +#endif + +#if IFLA_MAX <= 28 +#define IFLA_NET_NS_FD 28 +#endif + +static int veth_peer_info(struct net_link *link, struct newlink_req *req, + struct ns_id *ns, int ns_fd) +{ + NetDeviceEntry *nde = link->nde; + char key[100], *val; + struct ns_id *peer_ns = NULL; + + snprintf(key, sizeof(key), "veth[%s]", nde->name); + val = external_lookup_by_key(key); + if (!IS_ERR_OR_NULL(val)) { + char *aux; + + aux = strchrnul(val, '@'); + addattr_l(&req->h, sizeof(*req), IFLA_IFNAME, val, aux - val); + addattr_l(&req->h, sizeof(*req), IFLA_NET_NS_FD, &ns_fd, sizeof(ns_fd)); + return 0; + } + + if (nde->has_peer_nsid) { + struct net_link *plink; + + peer_ns = lookup_ns_by_id(nde->peer_nsid, &net_ns_desc); + if (!peer_ns) + goto out; + list_for_each_entry(plink, &peer_ns->net.links, node) { + if (plink->nde->ifindex == nde->peer_ifindex && plink->created) { + req->h.nlmsg_type = RTM_SETLINK; + return 0; + } + } + } + + link->created = true; + if (peer_ns) { + addattr_l(&req->h, sizeof(*req), IFLA_NET_NS_FD, &peer_ns->net.ns_fd, sizeof(int)); + return 0; + } +out: + pr_err("Unknown peer net namespace\n"); + return -1; +} + +static int veth_link_info(struct ns_id *ns, struct net_link 
*link, struct newlink_req *req) +{ + int ns_fd = get_service_fd(NS_FD_OFF); + NetDeviceEntry *nde = link->nde; + struct rtattr *veth_data, *peer_data; + struct ifinfomsg ifm; + + addattr_l(&req->h, sizeof(*req), IFLA_INFO_KIND, "veth", 4); + + veth_data = NLMSG_TAIL(&req->h); + addattr_l(&req->h, sizeof(*req), IFLA_INFO_DATA, NULL, 0); + peer_data = NLMSG_TAIL(&req->h); + memset(&ifm, 0, sizeof(ifm)); + + /* + * Peer index might lay on the node root net namespace, + * where the device index may be already borrowed by + * some other device, so we should ignore it. + * + * Still if peer is laying in some other net-namespace, + * we should recreate the device index as well as the + * as we do for the master peer end. + */ + if (nde->has_peer_nsid) + ifm.ifi_index = nde->peer_ifindex; + addattr_l(&req->h, sizeof(*req), VETH_INFO_PEER, &ifm, sizeof(ifm)); + + veth_peer_info(link, req, ns, ns_fd); + peer_data->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)peer_data; + veth_data->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)veth_data; + + return 0; +} + +static int venet_link_info(struct ns_id *ns, struct net_link *link, struct newlink_req *req) +{ + int ns_fd = get_service_fd(NS_FD_OFF); + struct rtattr *venet_data; + + BUG_ON(ns_fd < 0); + + venet_data = NLMSG_TAIL(&req->h); + addattr_l(&req->h, sizeof(*req), IFLA_INFO_KIND, "venet", 5); + addattr_l(&req->h, sizeof(*req), IFLA_INFO_DATA, NULL, 0); + addattr_l(&req->h, sizeof(*req), IFLA_NET_NS_FD, &ns_fd, sizeof(ns_fd)); + venet_data->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)venet_data; + + return 0; +} + +static int bridge_link_info(struct ns_id *ns, struct net_link *link, struct newlink_req *req) +{ + struct rtattr *bridge_data; + + bridge_data = NLMSG_TAIL(&req->h); + addattr_l(&req->h, sizeof(*req), IFLA_INFO_KIND, "bridge", sizeof("bridge")); + bridge_data->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)bridge_data; + + return 0; +} + +static int changeflags(int s, char *name, short flags) +{ + struct 
ifreq ifr; + + strlcpy(ifr.ifr_name, name, IFNAMSIZ); + ifr.ifr_flags = flags; + + if (ioctl(s, SIOCSIFFLAGS, &ifr) < 0) { + pr_perror("couldn't set flags on %s", name); + return -1; + } + + return 0; +} + +static int macvlan_link_info(struct ns_id *ns, struct net_link *link, struct newlink_req *req) +{ + struct rtattr *macvlan_data; + NetDeviceEntry *nde = link->nde; + MacvlanLinkEntry *macvlan = nde->macvlan; + + if (!macvlan) { + pr_err("Missing macvlan link entry %d\n", nde->ifindex); + return -1; + } + + addattr_l(&req->h, sizeof(*req), IFLA_INFO_KIND, "macvlan", 7); + + macvlan_data = NLMSG_TAIL(&req->h); + addattr_l(&req->h, sizeof(*req), IFLA_INFO_DATA, NULL, 0); + + addattr_l(&req->h, sizeof(*req), IFLA_MACVLAN_MODE, &macvlan->mode, sizeof(macvlan->mode)); + + if (macvlan->has_flags) + addattr_l(&req->h, sizeof(*req), IFLA_MACVLAN_FLAGS, &macvlan->flags, sizeof(macvlan->flags)); + + macvlan_data->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)macvlan_data; + + return 0; +} + +static int userns_restore_one_link(void *arg, int fd, pid_t pid) +{ + int nlsk, ret; + struct newlink_req *req = arg; + int ns_fd = get_service_fd(NS_FD_OFF), rst = -1; + + if (!(root_ns_mask & CLONE_NEWUSER)) { + if (switch_ns_by_fd(ns_fd, &net_ns_desc, &rst)) + return -1; + } + + nlsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (nlsk < 0) { + pr_perror("Can't create nlk socket"); + ret = -1; + goto out; + } + + addattr_l(&req->h, sizeof(*req), IFLA_NET_NS_FD, &fd, sizeof(fd)); + + ret = do_rtnl_req(nlsk, req, req->h.nlmsg_len, restore_link_cb, NULL, NULL, NULL); + close(nlsk); + +out: + if (rst >= 0 && restore_ns(rst, &net_ns_desc) < 0) + ret = -1; + return ret; +} + +static int restore_one_macvlan(struct ns_id *ns, struct net_link *link, int nlsk) +{ + struct newlink_extras extras = { + .link = -1, + .target_netns = -1, + }; + char key[100], *val; + int my_netns = -1, ret = -1; + NetDeviceEntry *nde = link->nde; + + snprintf(key, sizeof(key), "macvlan[%s]", nde->name); + 
val = external_lookup_data(key); + if (IS_ERR_OR_NULL(val)) { + pr_err("a macvlan parent for %s is required\n", nde->name); + return -1; + } + + /* link and netns_id are used to identify the master device to plug our + * macvlan slave into. We identify the destination via setting + * IFLA_NET_NS_FD to my_netns, but we have to do that in two different + * ways: in the userns case, we send the fd across to usernsd and set + * it there, whereas in the non-userns case we can just set it here, + * since we can just use a socket from criu's net ns given to us by + * restore_links(). We need to do this two different ways because + * CAP_NET_ADMIN is required in both namespaces, which we don't have in + * the userns case, and usernsd doesn't exist in the non-userns case. + */ + extras.link = (int) (unsigned long) val; + + my_netns = open_proc(PROC_SELF, "ns/net"); + if (my_netns < 0) + return -1; + + { + struct newlink_req req; + + if (populate_newlink_req(ns, &req, RTM_NEWLINK, link, macvlan_link_info, &extras) < 0) + goto out; + + if (userns_call(userns_restore_one_link, 0, &req, sizeof(req), my_netns) < 0) { + pr_err("couldn't restore macvlan interface %s via usernsd\n", nde->name); + goto out; + } + } + + ret = 0; +out: + if (my_netns >= 0) + close(my_netns); + return ret; +} + +static int sit_link_info(struct ns_id *ns, struct net_link *link, struct newlink_req *req) +{ + NetDeviceEntry *nde = link->nde; + struct rtattr *sit_data; + SitEntry *se = nde->sit; + + if (!se) { + pr_err("Missing sit entry %d\n", nde->ifindex); + return -1; + } + + addattr_l(&req->h, sizeof(*req), IFLA_INFO_KIND, "sit", 3); + sit_data = NLMSG_TAIL(&req->h); + addattr_l(&req->h, sizeof(*req), IFLA_INFO_DATA, NULL, 0); + +#define DECODE_ENTRY(__type, __ifla, __proto) do { \ + __type aux; \ + if (se->has_##__proto) { \ + aux = se->__proto; \ + addattr_l(&req->h, sizeof(*req), __ifla, \ + &aux, sizeof(__type)); \ + } \ + } while (0) + + if (se->n_local) { + if (se->n_local != 1) { + pr_err("Too 
long local addr for sit\n"); + return -1; + } + addattr_l(&req->h, sizeof(*req), IFLA_IPTUN_LOCAL, se->local, sizeof(u32)); + } + + if (se->n_remote) { + if (se->n_remote != 1) { + pr_err("Too long remote addr for sit\n"); + return -1; + } + addattr_l(&req->h, sizeof(*req), IFLA_IPTUN_REMOTE, se->remote, sizeof(u32)); + } + + DECODE_ENTRY(u32, IFLA_IPTUN_LINK, link); + DECODE_ENTRY(u8, IFLA_IPTUN_TTL, ttl); + DECODE_ENTRY(u8, IFLA_IPTUN_TOS, tos); + DECODE_ENTRY(u16, IFLA_IPTUN_FLAGS, flags); + DECODE_ENTRY(u8, IFLA_IPTUN_PROTO, proto); + + if (se->has_pmtudisc && se->pmtudisc) { + u8 aux = 1; + addattr_l(&req->h, sizeof(*req), IFLA_IPTUN_PMTUDISC, &aux, sizeof(u8)); + } + + DECODE_ENTRY(u16, IFLA_IPTUN_ENCAP_TYPE, encap_type); + DECODE_ENTRY(u16, IFLA_IPTUN_ENCAP_FLAGS, encap_flags); + DECODE_ENTRY(u16, IFLA_IPTUN_ENCAP_SPORT, encap_sport); + DECODE_ENTRY(u16, IFLA_IPTUN_ENCAP_DPORT, encap_dport); + + if (se->has_rd_prefixlen) { + u16 aux; + + if (se->n_rd_prefix != 4) { + pr_err("Bad 6rd prefixlen for sit\n"); + return -1; + } + + aux = se->rd_prefixlen; + addattr_l(&req->h, sizeof(*req), IFLA_IPTUN_6RD_PREFIXLEN, &aux, sizeof(u16)); + addattr_l(&req->h, sizeof(*req), IFLA_IPTUN_6RD_PREFIX, se->rd_prefix, 4 * sizeof(u32)); + + if (!se->has_relay_prefixlen) + goto skip; + + if (se->n_relay_prefix != 1) { + pr_err("Bad 6rd relay prefixlen for sit\n"); + return -1; + } + + aux = se->relay_prefixlen; + addattr_l(&req->h, sizeof(*req), IFLA_IPTUN_6RD_RELAY_PREFIXLEN, &aux, sizeof(u16)); + addattr_l(&req->h, sizeof(*req), IFLA_IPTUN_6RD_RELAY_PREFIX, se->relay_prefix, sizeof(u32)); +skip:; + } + +#undef DECODE_ENTRY + + sit_data->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)sit_data; + + return 0; +} + +static int __restore_link(struct ns_id *ns, struct net_link *link, int nlsk) +{ + NetDeviceEntry *nde = link->nde; + + pr_info("Restoring link %s type %d\n", nde->name, nde->type); + + switch (nde->type) { + case ND_TYPE__LOOPBACK: /* fallthrough */ + case 
ND_TYPE__EXTLINK: /* see comment in images/netdev.proto */ + return restore_link_parms(link, nlsk); + case ND_TYPE__VENET: + return restore_one_link(ns, link, nlsk, venet_link_info, NULL); + case ND_TYPE__VETH: + return restore_one_link(ns, link, nlsk, veth_link_info, NULL); + case ND_TYPE__TUN: + return restore_one_tun(ns, link, nlsk); + case ND_TYPE__BRIDGE: + return restore_one_link(ns, link, nlsk, bridge_link_info, NULL); + case ND_TYPE__MACVLAN: + return restore_one_macvlan(ns, link, nlsk); + case ND_TYPE__SIT: + return restore_one_link(ns, link, nlsk, sit_link_info, NULL); + default: + pr_err("Unsupported link type %d\n", link->nde->type); + break; + } + + return -1; +} + +static int read_links(struct ns_id *ns) +{ + int ret = -1, id = ns->id; + struct cr_img *img; + NetDeviceEntry *nde; + + img = open_image(CR_FD_NETDEV, O_RSTR, id); + if (!img) + return -1; + + while (1) { + struct net_link *link; + + ret = pb_read_one_eof(img, &nde, PB_NETDEV); + if (ret <= 0) + break; + + link = xmalloc(sizeof(*link)); + if (link == NULL) { + ret = -1; + net_device_entry__free_unpacked(nde, NULL); + break; + } + + link->nde = nde; + link->created = 0; + list_add(&link->node, &ns->net.links); + } + close_image(img); + + return ret; +} + +static int restore_link(int nlsk, struct ns_id *ns, struct net_link *link) +{ + NetDeviceEntry *nde = link->nde; + NetnsEntry **def_netns = &ns->net.netns; + int ret; + + ret = __restore_link(ns, link, nlsk); + if (ret) { + pr_err("Can't restore link: %d\n", ret); + goto exit; + } + + /* + * optimize restore of devices configuration except lo + * lo is created with namespace and before default is set + * so we can't optimize its restore + */ + if (nde->type == ND_TYPE__LOOPBACK) + def_netns = NULL; + + if (nde->conf4) + ret = ipv4_conf_op(nde->name, nde->conf4, nde->n_conf4, CTL_WRITE, def_netns ? (*def_netns)->def_conf4 : NULL); + else if (nde->conf) + ret = ipv4_conf_op_old(nde->name, nde->conf, nde->n_conf, CTL_WRITE, def_netns ? 
(*def_netns)->def_conf : NULL); + if (ret) + goto exit; + + if (nde->conf6) + ret = ipv6_conf_op(nde->name, nde->conf6, nde->n_conf6, CTL_WRITE, def_netns ? (*def_netns)->def_conf6 : NULL); +exit: + return ret; +} + +static int restore_master_link(int nlsk, struct ns_id *ns, struct net_link *link) +{ + struct newlink_req req; + + memset(&req, 0, sizeof(req)); + + req.h.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); + req.h.nlmsg_flags = NLM_F_REQUEST|NLM_F_ACK|NLM_F_CREATE; + req.h.nlmsg_type = RTM_SETLINK; + req.h.nlmsg_seq = CR_NLMSG_SEQ; + req.i.ifi_family = AF_PACKET; + req.i.ifi_index = link->nde->ifindex; + req.i.ifi_flags = link->nde->flags; + + addattr_l(&req.h, sizeof(req), IFLA_MASTER, + &link->nde->master, sizeof(link->nde->master)); + + return do_rtnl_req(nlsk, &req, req.h.nlmsg_len, restore_link_cb, NULL, NULL, NULL); +} + +struct net_link *lookup_net_link(struct ns_id *ns, uint32_t ifindex) +{ + struct net_link *link; + + list_for_each_entry(link, &ns->net.links, node) + if (link->nde->ifindex == ifindex) + return link; + + return NULL; +} + +static int __restore_links(struct ns_id *nsid, int *nrlinks, int *nrcreated) +{ + struct net_link *link, *t; + int ret; + + list_for_each_entry_safe(link, t, &nsid->net.links, node) { + struct net_link *mlink = NULL; + + if (link->created) + continue; + + (*nrlinks)++; + + pr_debug("Try to restore a link %d:%d:%s", + nsid->id, link->nde->ifindex, link->nde->name); + if (link->nde->has_master) { + mlink = lookup_net_link(nsid, link->nde->master); + if (mlink == NULL) { + pr_err("Unable to find the %d master\n", link->nde->master); + return -1; + } + + if (!mlink->created) { + pr_debug("The master %d:%d:%s isn't created yet", + nsid->id, mlink->nde->ifindex, mlink->nde->name); + continue; + } + } + + ret = restore_link(nsid->net.nlsk, nsid, link); + if (ret < 0) + return -1; + + if (ret == 0) { + (*nrcreated)++; + link->created = true; + + if (mlink && restore_master_link(nsid->net.nlsk, nsid, link)) + return 
-1; + } + } + + return 0; +} + +static int restore_links() +{ + int nrcreated, nrlinks; + struct ns_id *nsid; + + while (true) { + nrcreated = 0; + nrlinks = 0; + for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { + if (nsid->nd != &net_ns_desc) + continue; + + if (switch_ns_by_fd(nsid->net.ns_fd, &net_ns_desc, NULL)) + return -1; + + if (__restore_links(nsid, &nrlinks, &nrcreated)) + return -1; + } + + if (nrcreated == nrlinks) + break; + if (nrcreated == 0) { + pr_err("Unable to restore network links\n"); + return -1; + } + } + + return 0; +} + + +static int run_ip_tool(char *arg1, char *arg2, char *arg3, char *arg4, int fdin, int fdout, unsigned flags) +{ + char *ip_tool_cmd; + int ret; + + pr_debug("\tRunning ip %s %s %s %s\n", arg1, arg2, arg3 ? : "", arg4 ? : ""); + + ip_tool_cmd = getenv("CR_IP_TOOL"); + if (!ip_tool_cmd) + ip_tool_cmd = "ip"; + + ret = cr_system(fdin, fdout, -1, ip_tool_cmd, + (char *[]) { "ip", arg1, arg2, arg3, arg4, NULL }, flags); + if (ret) { + if (!(flags & CRS_CAN_FAIL)) + pr_err("IP tool failed on %s %s %s %s\n", arg1, arg2, arg3 ? : "", arg4 ? 
: ""); + return -1; + } + + return 0; +} + +static int run_iptables_tool(char *def_cmd, int fdin, int fdout) +{ + int ret; + char *cmd; + + cmd = getenv("CR_IPTABLES"); + if (!cmd) + cmd = def_cmd; + pr_debug("\tRunning %s for %s\n", cmd, def_cmd); + ret = cr_system(fdin, fdout, -1, "sh", (char *[]) { "sh", "-c", cmd, NULL }, 0); + if (ret) + pr_err("%s failed\n", def_cmd); + + return ret; +} + +static inline int dump_ifaddr(struct cr_imgset *fds) +{ + struct cr_img *img = img_from_set(fds, CR_FD_IFADDR); + return run_ip_tool("addr", "save", NULL, NULL, -1, img_raw_fd(img), 0); +} + +static inline int dump_route(struct cr_imgset *fds) +{ + struct cr_img *img; + + img = img_from_set(fds, CR_FD_ROUTE); + if (run_ip_tool("route", "save", NULL, NULL, -1, img_raw_fd(img), 0)) + return -1; + + /* If ipv6 is disabled, "ip -6 route dump" dumps all routes */ + if (!kdat.ipv6) + return 0; + + img = img_from_set(fds, CR_FD_ROUTE6); + if (run_ip_tool("-6", "route", "save", NULL, -1, img_raw_fd(img), 0)) + return -1; + + return 0; +} + +static inline int dump_rule(struct cr_imgset *fds) +{ + struct cr_img *img; + char *path; + + img = img_from_set(fds, CR_FD_RULE); + path = xstrdup(img->path); + + if (!path) + return -1; + + if (run_ip_tool("rule", "save", NULL, NULL, -1, img_raw_fd(img), CRS_CAN_FAIL)) { + pr_warn("Check if \"ip rule save\" is supported!\n"); + unlinkat(get_service_fd(IMG_FD_OFF), path, 0); + } + + free(path); + + return 0; +} + +static inline int dump_iptables(struct cr_imgset *fds) +{ + struct cr_img *img; + + img = img_from_set(fds, CR_FD_IPTABLES); + if (run_iptables_tool("iptables-save", -1, img_raw_fd(img))) + return -1; + + if (kdat.ipv6) { + img = img_from_set(fds, CR_FD_IP6TABLES); + if (run_iptables_tool("ip6tables-save", -1, img_raw_fd(img))) + return -1; + } + + return 0; +} + +static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) +{ + void *buf, *o_buf; + int ret = -1; + int i; + NetnsEntry netns = NETNS_ENTRY__INIT; + SysctlEntry 
*def_confs4 = NULL, *all_confs4 = NULL; + int size4 = ARRAY_SIZE(devconfs4); + SysctlEntry *def_confs6 = NULL, *all_confs6 = NULL; + int size6 = ARRAY_SIZE(devconfs6); + char def_stable_secret[MAX_STR_CONF_LEN + 1] = {}; + char all_stable_secret[MAX_STR_CONF_LEN + 1] = {}; + NetnsId *ids; + struct netns_id *p; + + i = 0; + list_for_each_entry(p, &ns->net.ids, node) + i++; + + o_buf = buf = xmalloc( + i * (sizeof(NetnsId*) + sizeof(NetnsId)) + + size4 * (sizeof(SysctlEntry*) + sizeof(SysctlEntry)) * 2 + + size6 * (sizeof(SysctlEntry*) + sizeof(SysctlEntry)) * 2 + ); + if (!buf) + goto out; + + netns.nsids = xptr_pull_s(&buf, i * sizeof(NetnsId*)); + ids = xptr_pull_s(&buf, i * sizeof(NetnsId)); + i = 0; + list_for_each_entry(p, &ns->net.ids, node) { + netns_id__init(&ids[i]); + ids[i].target_ns_id = p->target_ns_id; + ids[i].netnsid_value = p->netnsid_value; + netns.nsids[i] = ids + i; + i++; + } + netns.n_nsids = i; + + netns.n_def_conf4 = size4; + netns.n_all_conf4 = size4; + netns.def_conf4 = xptr_pull_s(&buf, size4 * sizeof(SysctlEntry*)); + netns.all_conf4 = xptr_pull_s(&buf, size4 * sizeof(SysctlEntry*)); + def_confs4 = xptr_pull_s(&buf, size4 * sizeof(SysctlEntry)); + all_confs4 = xptr_pull_s(&buf, size4 * sizeof(SysctlEntry)); + + for (i = 0; i < size4; i++) { + sysctl_entry__init(&def_confs4[i]); + sysctl_entry__init(&all_confs4[i]); + netns.def_conf4[i] = &def_confs4[i]; + netns.all_conf4[i] = &all_confs4[i]; + netns.def_conf4[i]->type = CTL_32; + netns.all_conf4[i]->type = CTL_32; + } + + netns.n_def_conf6 = size6; + netns.n_all_conf6 = size6; + netns.def_conf6 = xptr_pull_s(&buf, size6 * sizeof(SysctlEntry*)); + netns.all_conf6 = xptr_pull_s(&buf, size6 * sizeof(SysctlEntry*)); + def_confs6 = xptr_pull_s(&buf, size6 * sizeof(SysctlEntry)); + all_confs6 = xptr_pull_s(&buf, size6 * sizeof(SysctlEntry)); + + for (i = 0; i < size6; i++) { + sysctl_entry__init(&def_confs6[i]); + sysctl_entry__init(&all_confs6[i]); + netns.def_conf6[i] = &def_confs6[i]; + 
netns.all_conf6[i] = &all_confs6[i]; + if (strcmp(devconfs6[i], "stable_secret")) { + netns.def_conf6[i]->type = SYSCTL_TYPE__CTL_32; + netns.all_conf6[i]->type = SYSCTL_TYPE__CTL_32; + } else { + netns.def_conf6[i]->type = SYSCTL_TYPE__CTL_STR; + netns.all_conf6[i]->type = SYSCTL_TYPE__CTL_STR; + netns.def_conf6[i]->sarg = def_stable_secret; + netns.all_conf6[i]->sarg = all_stable_secret; + } + } + + ret = ipv4_conf_op("default", netns.def_conf4, size4, CTL_READ, NULL); + if (ret < 0) + goto err_free; + ret = ipv4_conf_op("all", netns.all_conf4, size4, CTL_READ, NULL); + if (ret < 0) + goto err_free; + + ret = ipv6_conf_op("default", netns.def_conf6, size6, CTL_READ, NULL); + if (ret < 0) + goto err_free; + ret = ipv6_conf_op("all", netns.all_conf6, size6, CTL_READ, NULL); + if (ret < 0) + goto err_free; + + ret = pb_write_one(img_from_set(fds, CR_FD_NETNS), &netns, PB_NETNS); +err_free: + xfree(o_buf); +out: + return ret; +} + +static int restore_ip_dump(int type, int pid, char *cmd) +{ + int ret = -1, sockfd, n, written; + FILE *tmp_file; + struct cr_img *img; + char buf[1024]; + + img = open_image(type, O_RSTR, pid); + if (empty_image(img)) { + close_image(img); + return 0; + } + sockfd = img_raw_fd(img); + tmp_file = tmpfile(); + if (!tmp_file) { + pr_perror("Failed to open tmpfile"); + return -1; + } + + while ((n = read(sockfd, buf, 1024)) > 0) { + written = fwrite(buf, sizeof(char), n, tmp_file); + if (written < n) { + pr_perror("Failed to write to tmpfile " + "[written: %d; total: %d]", written, n); + goto close; + } + } + + if (fseek(tmp_file, 0, SEEK_SET)) { + pr_perror("Failed to set file position to beginning of tmpfile"); + goto close; + } + + if (img) { + ret = run_ip_tool(cmd, "restore", NULL, NULL, fileno(tmp_file), -1, 0); + close_image(img); + } + +close: + if(fclose(tmp_file)) { + pr_perror("Failed to close tmpfile"); + } + + return ret; +} + +static inline int restore_ifaddr(int pid) +{ + return restore_ip_dump(CR_FD_IFADDR, pid, "addr"); +} + 
+static inline int restore_route(int pid) +{ + if (restore_ip_dump(CR_FD_ROUTE, pid, "route")) + return -1; + + if (restore_ip_dump(CR_FD_ROUTE6, pid, "route")) + return -1; + + return 0; +} + +static inline int restore_rule(int pid) +{ + struct cr_img *img; + int ret = 0; + + img = open_image(CR_FD_RULE, O_RSTR, pid); + if (!img) { + ret = -1; + goto out; + } + + if (empty_image(img)) + goto close; + + /* + * Delete 3 default rules to prevent duplicates. See kernel's + * function fib_default_rules_init() for the details. + */ + run_ip_tool("rule", "flush", NULL, NULL, -1, -1, 0); + run_ip_tool("rule", "delete", "table", "local", -1, -1, 0); + + if (restore_ip_dump(CR_FD_RULE, pid, "rule")) + ret = -1; +close: + close_image(img); +out: + return ret; +} + +/* + * iptables-restore is executed from a target userns and it may have not enough + * rights to open /run/xtables.lock. Here we try to workaround this problem. + */ +static int prepare_xtable_lock() +{ + int fd; + + fd = open("/run/xtables.lock", O_RDONLY); + if (fd >= 0) { + close(fd); + return 0; + } + + /* + * __prepare_net_namespaces is executed in a separate process, + * so a mount namespace can be changed. + */ + if (unshare(CLONE_NEWNS)) { + pr_perror("Unable to create a mount namespace"); + return -1; + } + + if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL)) { + pr_perror("Unable to conver mounts to slave mounts"); + return -1; + } + /* + * /run/xtables.lock may not exist, so we can't just bind-mount a file + * over it. + * A new mount will not be propagated to the host mount namespace, + * because we are in another userns. 
+ */ + + if (mount("criu-xtable-lock", "/run", "tmpfs", 0, NULL)) { + pr_perror("Unable to mount tmpfs into /run"); + return -1; + } + + return 0; +} + +static inline int restore_iptables(int pid) +{ + int ret = -1; + struct cr_img *img; + + img = open_image(CR_FD_IPTABLES, O_RSTR, pid); + if (img == NULL) + return -1; + if (empty_image(img)) { + ret = 0; + close_image(img); + goto ipt6; + } + + ret = run_iptables_tool("iptables-restore -w", img_raw_fd(img), -1); + close_image(img); + if (ret) + return ret; +ipt6: + img = open_image(CR_FD_IP6TABLES, O_RSTR, pid); + if (img == NULL) + return -1; + if (empty_image(img)) + goto out; + + ret = run_iptables_tool("ip6tables-restore -w", img_raw_fd(img), -1); +out: + close_image(img); + + return ret; +} + +int read_net_ns_img(void) +{ + struct ns_id *ns; + + for (ns = ns_ids; ns != NULL; ns = ns->next) { + struct cr_img *img; + int ret; + + if (ns->nd != &net_ns_desc) + continue; + + img = open_image(CR_FD_NETNS, O_RSTR, ns->id); + if (!img) + return -1; + + if (empty_image(img)) { + /* Backward compatibility */ + close_image(img); + continue; + } + + ret = pb_read_one(img, &ns->net.netns, PB_NETNS); + close_image(img); + if (ret < 0) { + pr_err("Can not read netns object\n"); + return -1; + } + ns->ext_key = ns->net.netns->ext_key; + } + + return 0; +} + +static int restore_netns_conf(struct ns_id *ns) +{ + NetnsEntry *netns = ns->net.netns; + int ret = 0; + + if (ns->net.netns == NULL) + /* Backward compatibility */ + goto out; + + if ((netns)->def_conf4) { + ret = ipv4_conf_op("all", (netns)->all_conf4, (netns)->n_all_conf4, CTL_WRITE, NULL); + if (ret) + goto out; + ret = ipv4_conf_op("default", (netns)->def_conf4, (netns)->n_def_conf4, CTL_WRITE, NULL); + if (ret) + goto out; + } else if ((netns)->def_conf) { + /* Backward compatibility */ + ret = ipv4_conf_op_old("all", (netns)->all_conf, (netns)->n_all_conf, CTL_WRITE, NULL); + if (ret) + goto out; + ret = ipv4_conf_op_old("default", (netns)->def_conf, 
(netns)->n_def_conf, CTL_WRITE, NULL); + if (ret) + goto out; + } + + if ((netns)->def_conf6) { + ret = ipv6_conf_op("all", (netns)->all_conf6, (netns)->n_all_conf6, CTL_WRITE, NULL); + if (ret) + goto out; + ret = ipv6_conf_op("default", (netns)->def_conf6, (netns)->n_def_conf6, CTL_WRITE, NULL); + } + + ns->net.netns = netns; +out: + return ret; +} + +static int mount_ns_sysfs(void) +{ + char sys_mount[] = "crtools-sys.XXXXXX"; + + BUG_ON(ns_sysfs_fd != -1); + + /* + * A new mntns is required to avoid the race between + * open_detach_mount and creating mntns. + */ + if (unshare(CLONE_NEWNS)) { + pr_perror("Can't create new mount namespace"); + return -1; + } + + if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL)) { + pr_perror("Can't mark the root mount as private"); + return -1; + } + + if (mkdtemp(sys_mount) == NULL) { + pr_perror("mkdtemp failed %s", sys_mount); + return -1; + } + + /* + * The setns() is called, so we're in proper context, + * no need in pulling the mountpoint from parasite. + */ + pr_info("Mount ns' sysfs in %s\n", sys_mount); + if (mount("sysfs", sys_mount, "sysfs", MS_MGC_VAL, NULL)) { + pr_perror("mount failed"); + rmdir(sys_mount); + return -1; + } + + ns_sysfs_fd = open_detach_mount(sys_mount); + return ns_sysfs_fd >= 0 ? 
0 : -1; +} + +struct net_id_arg { + struct ns_id *ns; + int sk; +}; + +static int collect_netns_id(struct ns_id *ns, void *oarg) +{ + struct net_id_arg *arg = oarg; + struct netns_id *netns_id; + int nsid = -1; + + if (net_get_nsid(arg->sk, ns->ns_pid, &nsid)) + return -1; + + if (nsid == -1) + return 0; + + netns_id = xmalloc(sizeof(*netns_id)); + if (!netns_id) + return -1; + + pr_debug("Found the %d id for %d in %d\n", nsid, ns->id, arg->ns->id); + netns_id->target_ns_id = ns->id; + netns_id->netnsid_value = nsid; + + list_add(&netns_id->node, &arg->ns->net.ids); + + return 0; +} + +static int dump_netns_ids(int rtsk, struct ns_id *ns) +{ + struct net_id_arg arg = { + .ns = ns, + .sk = rtsk, + }; + return walk_namespaces(&net_ns_desc, collect_netns_id, + (void *)&arg); +} + +int net_set_ext(struct ns_id *ns) +{ + int fd, ret; + + fd = inherit_fd_lookup_id(ns->ext_key); + if (fd < 0) { + pr_err("Unable to find an external netns: %s\n", ns->ext_key); + return -1; + } + + ret = switch_ns_by_fd(fd, &net_ns_desc, NULL); + close(fd); + + return ret; +} + +int dump_net_ns(struct ns_id *ns) +{ + struct cr_imgset *fds; + int ret; + + fds = cr_imgset_open(ns->id, NETNS, O_DUMP); + if (fds == NULL) + return -1; + + ret = mount_ns_sysfs(); + if (ns->ext_key) { + NetnsEntry netns = NETNS_ENTRY__INIT; + + netns.ext_key = ns->ext_key; + ret = pb_write_one(img_from_set(fds, CR_FD_NETNS), &netns, PB_NETNS); + if (ret) + goto out; + } else if (!(opts.empty_ns & CLONE_NEWNET)) { + int sk; + + sk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sk < 0) { + pr_perror("Can't open rtnl sock for net dump"); + ret = -1; + } + + /* + * If a device has a pair in another netns, the kernel generates + * a netns ID for this netns when we request information about + * the link. + * So we need to get information about all links to be sure that + * all related net namespaces have got netns id-s in this netns. 
+ */ + if (!ret) + ret = list_links(sk, NULL); + if (!ret) + ret = dump_netns_ids(sk, ns); + if (!ret) + ret = dump_links(sk, ns, fds); + + close(sk); + + if (!ret) + ret = dump_ifaddr(fds); + if (!ret) + ret = dump_route(fds); + if (!ret) + ret = dump_rule(fds); + if (!ret) + ret = dump_iptables(fds); + if (!ret) + ret = dump_netns_conf(ns, fds); + } else if (ns->type != NS_ROOT) { + pr_err("Unable to dump more than one netns if the --emptyns is set\n"); + ret = -1; + } + if (!ret) + ret = dump_nf_ct(fds, CR_FD_NETNF_CT); + if (!ret) + ret = dump_nf_ct(fds, CR_FD_NETNF_EXP); + +out: + close(ns_sysfs_fd); + ns_sysfs_fd = -1; + + close_cr_imgset(&fds); + return ret; +} + +static int net_set_nsid(int rtsk, int fd, int nsid); +static int restore_netns_ids(struct ns_id *ns) +{ + int i, sk, exit_code = -1; + + if (!ns->net.netns) + return 0; + + sk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sk < 0) { + pr_perror("Can't open rtnl sock for net dump"); + return -1; + } + + for (i = 0; i < ns->net.netns->n_nsids; i++) { + struct ns_id *tg_ns; + struct netns_id *id; + + id = xmalloc(sizeof(*id)); + if (!id) + goto out; + id->target_ns_id = ns->net.netns->nsids[i]->target_ns_id; + id->netnsid_value = ns->net.netns->nsids[i]->netnsid_value; + list_add(&id->node, &ns->net.ids); + + tg_ns = lookup_ns_by_id(id->target_ns_id, &net_ns_desc); + if (tg_ns == NULL) { + pr_err("Unknown namespace: %d\n", id->target_ns_id); + goto out; + } + + if (net_set_nsid(sk, tg_ns->net.ns_fd, id->netnsid_value)) + goto out; + } + + exit_code = 0; +out: + close(sk); + + return exit_code; +} + +static int prepare_net_ns_first_stage(struct ns_id *ns) +{ + int ret = 0; + + if (ns->ext_key || (opts.empty_ns & CLONE_NEWNET)) + return 0; + + ret = restore_netns_conf(ns); + if (!ret) + ret = restore_netns_ids(ns); + if (!ret) + ret = read_links(ns); + + return ret; +} + +static int prepare_net_ns_second_stage(struct ns_id *ns) +{ + int ret = 0, nsid = ns->id; + + if (!(opts.empty_ns & 
CLONE_NEWNET) && !ns->ext_key) { + if (ns->net.netns) + netns_entry__free_unpacked(ns->net.netns, NULL); + + if (!ret) + ret = restore_ifaddr(nsid); + if (!ret) + ret = restore_route(nsid); + if (!ret) + ret = restore_rule(nsid); + if (!ret) + ret = restore_iptables(nsid); + } + + if (!ret) + ret = restore_nf_ct(nsid, CR_FD_NETNF_CT); + if (!ret) + ret = restore_nf_ct(nsid, CR_FD_NETNF_EXP); + + if (!ret) { + int fd = ns->net.ns_fd; + + ns->net.nsfd_id = fdstore_add(fd); + if (ns->net.nsfd_id < 0) + ret = -1; + close(fd); + } + + ns->ns_populated = true; + + return ret; +} + +static int open_net_ns(struct ns_id *nsid) +{ + int fd; + + /* Pin one with a file descriptor */ + fd = open_proc(PROC_SELF, "ns/net"); + if (fd < 0) + return -1; + nsid->net.ns_fd = fd; + + return 0; +} + +static int do_create_net_ns(struct ns_id *ns) +{ + int ret; + + if (ns->ext_key) + ret = net_set_ext(ns); + else + ret = unshare(CLONE_NEWNET); + + if (ret) { + pr_perror("Unable to create a new netns"); + return -1; + } + if (open_net_ns(ns)) + return -1; + return 0; +} + +static int __prepare_net_namespaces(void *unused) +{ + struct ns_id *nsid; + int root_ns; + + if (prepare_xtable_lock()) + return -1; + + root_ns = open_proc(PROC_SELF, "ns/net"); + if (root_ns < 0) + return -1; + + /* Pin one with a file descriptor */ + for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { + if (nsid->nd != &net_ns_desc) + continue; + + if (nsid->type == NS_ROOT) { + nsid->net.ns_fd = root_ns; + } else { + if (do_create_net_ns(nsid)) + goto err; + } + } + + for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { + if (nsid->nd != &net_ns_desc) + continue; + + if (switch_ns_by_fd(nsid->net.ns_fd, &net_ns_desc, NULL)) + goto err; + + if (prepare_net_ns_first_stage(nsid)) + goto err; + + nsid->net.nlsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (nsid->net.nlsk < 0) { + pr_perror("Can't create nlk socket"); + goto err; + } + + } + + if (restore_links()) + goto err; + + for (nsid = ns_ids; nsid != 
NULL; nsid = nsid->next) { + if (nsid->nd != &net_ns_desc) + continue; + + if (switch_ns_by_fd(nsid->net.ns_fd, &net_ns_desc, NULL)) + goto err; + + if (prepare_net_ns_second_stage(nsid)) + goto err; + + close_safe(&nsid->net.nlsk); + } + + close_service_fd(NS_FD_OFF); + + return 0; +err: + return -1; +} + + +int prepare_net_namespaces(void) +{ + if (!(root_ns_mask & CLONE_NEWNET)) + return 0; + + return call_in_child_process(__prepare_net_namespaces, NULL); +} + +static int do_restore_task_net_ns(struct ns_id *nsid, struct pstree_item *current) +{ + int fd; + + if (!(root_ns_mask & CLONE_NEWNET)) + return 0; + + fd = fdstore_get(nsid->net.nsfd_id); + if (fd < 0) + return -1; + + if (setns(fd, CLONE_NEWNET)) { + pr_perror("Can't restore netns"); + close(fd); + return -1; + } + close(fd); + + return 0; +} + +int restore_task_net_ns(struct pstree_item *current) +{ + if (current->ids && current->ids->has_net_ns_id) { + unsigned int id = current->ids->net_ns_id; + struct ns_id *nsid; + + nsid = lookup_ns_by_id(id, &net_ns_desc); + if (nsid == NULL) { + pr_err("Can't find mount namespace %d\n", id); + return -1; + } + + BUG_ON(nsid->type == NS_CRIU); + + if (do_restore_task_net_ns(nsid, current)) + return -1; + } + + return 0; +} + +int netns_keep_nsfd(void) +{ + int ns_fd, ret; + + if (!(root_ns_mask & CLONE_NEWNET)) + return 0; + + /* + * When restoring a net namespace we need to communicate + * with the original (i.e. -- init) one. Thus, prepare for + * that before we leave the existing namespaces. + */ + + ns_fd = __open_proc(PROC_SELF, 0, O_RDONLY | O_CLOEXEC, "ns/net"); + if (ns_fd < 0) + return -1; + + ret = install_service_fd(NS_FD_OFF, ns_fd); + if (ret < 0) + pr_err("Can't install ns net reference\n"); + else + pr_info("Saved netns fd for links restore\n"); + + return ret >= 0 ? 0 : -1; +} + +/* + * If we want to modify iptables, we need to received the current + * configuration, change it and load a new one into the kernel. 
+ * iptables can change or add only one rule. + * iptables-restore allows to make a few changes for one iteration, + * so it works faster. + */ +static int iptables_restore(bool ipv6, char *buf, int size) +{ + int pfd[2], ret = -1; + char *cmd4[] = {"iptables-restore", "-w", "--noflush", NULL}; + char *cmd6[] = {"ip6tables-restore", "-w", "--noflush", NULL}; + char **cmd = ipv6 ? cmd6 : cmd4; + + if (pipe(pfd) < 0) { + pr_perror("Unable to create pipe"); + return -1; + } + + if (write(pfd[1], buf, size) < size) { + pr_perror("Unable to write iptables configugration"); + goto err; + } + close_safe(&pfd[1]); + + ret = cr_system(pfd[0], -1, -1, cmd[0], cmd, 0); +err: + close_safe(&pfd[1]); + close_safe(&pfd[0]); + return ret; +} + +int network_lock_internal() +{ + char conf[] = "*filter\n" + ":CRIU - [0:0]\n" + "-I INPUT -j CRIU\n" + "-I OUTPUT -j CRIU\n" + "-A CRIU -m mark --mark " __stringify(SOCCR_MARK) " -j ACCEPT\n" + "-A CRIU -j DROP\n" + "COMMIT\n"; + int ret = 0, nsret; + + if (switch_ns(root_item->pid->real, &net_ns_desc, &nsret)) + return -1; + + + ret |= iptables_restore(false, conf, sizeof(conf) - 1); + if (kdat.ipv6) + ret |= iptables_restore(true, conf, sizeof(conf) - 1); + + if (ret) + pr_err("Locking network failed: iptables-restore returned %d. 
" + "This may be connected to disabled " + "CONFIG_NETFILTER_XT_MARK kernel build config " + "option.\n", ret); + + if (restore_ns(nsret, &net_ns_desc)) + ret = -1; + + return ret; +} + +static int network_unlock_internal() +{ + char conf[] = "*filter\n" + ":CRIU - [0:0]\n" + "-D INPUT -j CRIU\n" + "-D OUTPUT -j CRIU\n" + "-X CRIU\n" + "COMMIT\n"; + int ret = 0, nsret; + + if (switch_ns(root_item->pid->real, &net_ns_desc, &nsret)) + return -1; + + + ret |= iptables_restore(false, conf, sizeof(conf) - 1); + if (kdat.ipv6) + ret |= iptables_restore(true, conf, sizeof(conf) - 1); + + if (restore_ns(nsret, &net_ns_desc)) + ret = -1; + + return ret; +} + +int network_lock(void) +{ + pr_info("Lock network\n"); + + /* Each connection will be locked on dump */ + if (!(root_ns_mask & CLONE_NEWNET)) + return 0; + + if (run_scripts(ACT_NET_LOCK)) + return -1; + + return network_lock_internal(); +} + +void network_unlock(void) +{ + pr_info("Unlock network\n"); + + cpt_unlock_tcp_connections(); + rst_unlock_tcp_connections(); + + if (root_ns_mask & CLONE_NEWNET) { + run_scripts(ACT_NET_UNLOCK); + network_unlock_internal(); + } +} + +int veth_pair_add(char *in, char *out) +{ + char *e_str; + + e_str = xmalloc(200); /* For 3 IFNAMSIZ + 8 service characters */ + if (!e_str) + return -1; + snprintf(e_str, 200, "veth[%s]:%s", in, out); + return add_external(e_str); +} + +int macvlan_ext_add(struct external *ext) +{ + ext->data = (void *) (unsigned long) if_nametoindex(external_val(ext)); + if (ext->data == 0) { + pr_perror("can't get ifindex of %s", ext->id); + return -1; + } + + return 0; +} + +/* + * The setns() syscall (called by switch_ns()) can be extremely + * slow. If we call it two or more times from the same task the + * kernel will synchonously go on a very slow routine called + * synchronize_rcu() trying to put a reference on old namespaces. + * + * To avoid doing this more than once we pre-create all the + * needed other-ns sockets in advance. 
+ */ + +static int prep_ns_sockets(struct ns_id *ns, bool for_dump) +{ + int nsret = -1, ret; + + if (ns->type != NS_CRIU) { + pr_info("Switching to %d's net for collecting sockets\n", ns->ns_pid); + if (switch_ns(ns->ns_pid, &net_ns_desc, &nsret)) + return -1; + } + + if (for_dump) { + ret = ns->net.nlsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG); + if (ret < 0) { + pr_perror("Can't create sock diag socket"); + goto err_nl; + } + } else + ns->net.nlsk = -1; + +#ifdef CONFIG_HAS_SELINUX + /* + * If running on a system with SELinux enabled the socket for the + * communication between parasite daemon and the main + * CRIU process needs to be correctly labeled. + * Initially this was motivated by Podman's use case: The container + * is usually running as something like '...:...:container_t:...:....' + * and CRIU started from runc and Podman will run as + * '...:...:container_runtime_t:...:...'. As the parasite will be + * running with the same context as the container process: 'container_t'. + * Allowing a container process to connect via socket to the outside + * of the container ('container_runtime_t') is not desired and + * therefore CRIU needs to label the socket with the context of + * the container: 'container_t'. + * So this first gets the context of the root container process + * and tells SELinux to label the next created socket with + * the same label as the root container process. + * For this to work it is necessary to have the correct SELinux + * policies installed. For Fedora based systems this is part + * of the container-selinux package. + */ + security_context_t ctx; + + /* + * This assumes that all processes CRIU wants to dump are labeled + * with the same SELinux context. If some of the child processes + * have different labels this will not work and needs additional + * SELinux policies. But the whole SELinux socket labeling relies + * on the correct SELinux being available. 
+ */ + if (kdat.lsm == LSMTYPE__SELINUX) { + ret = getpidcon_raw(root_item->pid->real, &ctx); + if (ret < 0) { + pr_perror("Getting SELinux context for PID %d failed", + root_item->pid->real); + goto err_sq; + } + + ret = setsockcreatecon(ctx); + freecon(ctx); + if (ret < 0) { + pr_perror("Setting SELinux socket context for PID %d failed", + root_item->pid->real); + goto err_sq; + } + } +#endif + + ret = ns->net.seqsk = socket(PF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK, 0); + if (ret < 0) { + pr_perror("Can't create seqsk for parasite"); + goto err_sq; + } + + ret = 0; + +#ifdef CONFIG_HAS_SELINUX + /* + * Once the socket has been created, reset the SELinux socket labelling + * back to the default value of this process. + */ + if (kdat.lsm == LSMTYPE__SELINUX) { + ret = setsockcreatecon_raw(NULL); + if (ret < 0) { + pr_perror("Resetting SELinux socket context to " + "default for PID %d failed", + root_item->pid->real); + goto err_ret; + } + } +#endif + +out: + if (nsret >= 0 && restore_ns(nsret, &net_ns_desc) < 0) { + nsret = -1; + if (ret == 0) + goto err_ret; + } + + return ret; + +err_ret: + close(ns->net.seqsk); +err_sq: + if (ns->net.nlsk >= 0) + close(ns->net.nlsk); +err_nl: + goto out; +} + +static int netns_nr; +static int collect_net_ns(struct ns_id *ns, void *oarg) +{ + bool for_dump = (oarg == (void *)1); + char id[64], *val; + int ret; + + pr_info("Collecting netns %d/%d\n", ns->id, ns->ns_pid); + + snprintf(id, sizeof(id), "net[%u]", ns->kid); + val = external_lookup_by_key(id); + if (!IS_ERR_OR_NULL(val)) { + pr_debug("The %s netns is external\n", id); + ns->ext_key = val; + } + + ret = prep_ns_sockets(ns, for_dump); + if (ret) + return ret; + + netns_nr++; + + if (!for_dump) + return 0; + + return collect_sockets(ns); +} + +int collect_net_namespaces(bool for_dump) +{ + return walk_namespaces(&net_ns_desc, collect_net_ns, + (void *)(for_dump ? 
1UL : 0)); +} + +struct ns_desc net_ns_desc = NS_DESC_ENTRY(CLONE_NEWNET, "net"); + +struct ns_id *net_get_root_ns() +{ + static struct ns_id *root_netns = NULL; + + if (root_netns) + return root_netns; + + if (root_item->ids == NULL) + return NULL; + + root_netns = lookup_ns_by_id(root_item->ids->net_ns_id, &net_ns_desc); + + return root_netns; +} + +/* + * socket_diag doesn't report unbound and unconnected sockets, + * so we have to get their network namesapces explicitly + */ +struct ns_id *get_socket_ns(int lfd) +{ + struct ns_id *ns; + struct stat st; + int ns_fd; + + ns_fd = ioctl(lfd, SIOCGSKNS); + if (ns_fd < 0) { + /* backward compatibility with old kernels */ + if (netns_nr == 1) + return net_get_root_ns(); + + pr_perror("Unable to get a socket net namespace"); + return NULL; + } + if (fstat(ns_fd, &st)) { + pr_perror("Unable to stat a network namespace"); + close(ns_fd); + return NULL; + } + close(ns_fd); + + ns = lookup_ns_by_kid(st.st_ino, &net_ns_desc); + if (ns == NULL) { + pr_err("Unable to dump a socket from an external network namespace\n"); + return NULL; + } + + return ns; +} + +void check_has_netns_ioc(int fd, bool *kdat_val, const char *name) +{ + int ns_fd; + + ns_fd = ioctl(fd, SIOCGSKNS); + *kdat_val = (ns_fd >= 0); + + if (ns_fd < 0) + pr_warn("Unable to get %s network namespace\n", name); + else + close(ns_fd); +} + +int kerndat_socket_netns(void) +{ + int sk; + + sk = socket(AF_UNIX, SOCK_DGRAM, 0); + if (sk < 0) { + pr_perror("Unable to create socket"); + return -1; + } + check_has_netns_ioc(sk, &kdat.sk_ns, "socket"); + close(sk); + + return 0; +} + +static int move_to_bridge(struct external *ext, void *arg) +{ + int s = *(int *)arg; + int ret; + char *out, *br; + struct ifreq ifr; + + out = external_val(ext); + if (!out) + return -1; + + br = strchr(out, '@'); + if (!br) + return 0; + + *br = '\0'; + br++; + + { + pr_debug("\tMoving dev %s to bridge %s\n", out, br); + + if (s == -1) { + s = socket(AF_LOCAL, SOCK_STREAM|SOCK_CLOEXEC, 
0); + if (s < 0) { + pr_perror("Can't create control socket"); + return -1; + } + } + + /* + * Add the device to the bridge. This is equivalent to: + * $ brctl addif + */ + ifr.ifr_ifindex = if_nametoindex(out); + if (ifr.ifr_ifindex == 0) { + pr_perror("Can't get index of %s", out); + ret = -1; + goto out; + } + strlcpy(ifr.ifr_name, br, IFNAMSIZ); + ret = ioctl(s, SIOCBRADDIF, &ifr); + if (ret < 0) { + pr_perror("Can't add interface %s to bridge %s", out, br); + goto out; + } + + /* + * Make sure the device is up. This is equivalent to: + * $ ip link set dev up + */ + ifr.ifr_ifindex = 0; + strlcpy(ifr.ifr_name, out, IFNAMSIZ); + ret = ioctl(s, SIOCGIFFLAGS, &ifr); + if (ret < 0) { + pr_perror("Can't get flags of interface %s", out); + goto out; + } + + ret = 0; + if (ifr.ifr_flags & IFF_UP) + goto out; + + ifr.ifr_flags |= IFF_UP; + if (changeflags(s, out, ifr.ifr_flags) < 0) + goto out; + ret = 0; + } +out: + br--; + *br = '@'; + *(int *)arg = s; + return ret; +} + +int move_veth_to_bridge(void) +{ + int sk = -1, ret; + + ret = external_for_each_type("veth", move_to_bridge, &sk); + if (sk >= 0) + close(sk); + + return ret; +} + +#if NLA_TYPE_MAX < 14 +#define NLA_S32 14 +#endif + +#ifndef NETNSA_MAX +/* Attributes of RTM_NEWNSID/RTM_GETNSID messages */ +enum { + NETNSA_NONE, +#define NETNSA_NSID_NOT_ASSIGNED -1 + NETNSA_NSID, + NETNSA_PID, + NETNSA_FD, + __NETNSA_MAX, +}; + +#define NETNSA_MAX (__NETNSA_MAX - 1) +#endif + +static struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { + [NETNSA_NONE] = { .type = NLA_UNSPEC }, + [NETNSA_NSID] = { .type = NLA_S32 }, + [NETNSA_PID] = { .type = NLA_U32 }, + [NETNSA_FD] = { .type = NLA_U32 }, +}; + +static int nsid_cb(struct nlmsghdr *msg, struct ns_id *ns, void *arg) +{ + struct nlattr *tb[NETNSA_MAX + 1]; + int err; + + err = nlmsg_parse(msg, sizeof(struct rtgenmsg), tb, + NETNSA_MAX, rtnl_net_policy); + if (err < 0) + return NL_STOP; + + if (tb[NETNSA_NSID]) + *((int *)arg) = nla_get_s32(tb[NETNSA_NSID]); + + 
return 0; +} + +static int net_set_nsid(int rtsk, int fd, int nsid) +{ + struct { + struct nlmsghdr nlh; + struct rtgenmsg g; + char msg[128]; + } req; + + memset(&req, 0, sizeof(req)); + req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtgenmsg)); + req.nlh.nlmsg_type = RTM_NEWNSID; + req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + req.nlh.nlmsg_seq = CR_NLMSG_SEQ; + if (addattr_l(&req.nlh, sizeof(req), NETNSA_FD, &fd, sizeof(fd))) + return -1; + if (addattr_l(&req.nlh, sizeof(req), NETNSA_NSID, &nsid, sizeof(nsid))) + return -1; + + if (do_rtnl_req(rtsk, &req, req.nlh.nlmsg_len, NULL, NULL, NULL, NULL) < 0) + return -1; + + return 0; +} + +int net_get_nsid(int rtsk, int pid, int *nsid) +{ + struct { + struct nlmsghdr nlh; + struct rtgenmsg g; + char msg[128]; + } req; + int32_t id = INT_MIN; + + memset(&req, 0, sizeof(req)); + req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtgenmsg)); + req.nlh.nlmsg_type = RTM_GETNSID; + req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + req.nlh.nlmsg_seq = CR_NLMSG_SEQ; + if (addattr_l(&req.nlh, sizeof(req), NETNSA_PID, &pid, sizeof(pid))) + return -1; + + if (do_rtnl_req(rtsk, &req, req.nlh.nlmsg_len, nsid_cb, NULL, NULL, (void *) &id) < 0) + return -1; + + if (id == INT_MIN) + return -1; + + *nsid = id; + + return 0; +} + + +static int nsid_link_info(struct ns_id *ns, struct net_link *link, struct newlink_req *req) +{ + NetDeviceEntry *nde = link->nde; + struct rtattr *veth_data, *peer_data; + struct ifinfomsg ifm; + + addattr_l(&req->h, sizeof(*req), IFLA_INFO_KIND, "veth", 4); + + veth_data = NLMSG_TAIL(&req->h); + addattr_l(&req->h, sizeof(*req), IFLA_INFO_DATA, NULL, 0); + peer_data = NLMSG_TAIL(&req->h); + memset(&ifm, 0, sizeof(ifm)); + + ifm.ifi_index = nde->peer_ifindex; + addattr_l(&req->h, sizeof(*req), VETH_INFO_PEER, &ifm, sizeof(ifm)); + + addattr_l(&req->h, sizeof(*req), IFLA_NET_NS_FD, &nde->peer_nsid, sizeof(int)); + peer_data->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)peer_data; + veth_data->rta_len = 
(void *)NLMSG_TAIL(&req->h) - (void *)veth_data; + + return 0; +} + +static int check_one_link_nsid(struct nlmsghdr *hdr, struct ns_id *ns, void *arg) +{ + bool *has_link_nsid = arg; + struct ifinfomsg *ifi; + int len = hdr->nlmsg_len - NLMSG_LENGTH(sizeof(*ifi)); + struct nlattr *tb[IFLA_MAX + 1]; + + ifi = NLMSG_DATA(hdr); + + if (len < 0) { + pr_err("No iflas for link %d\n", ifi->ifi_index); + return -1; + } + + nlmsg_parse(hdr, sizeof(struct ifinfomsg), tb, IFLA_MAX, NULL); + pr_info("\tLD: Got link %d, type %d\n", ifi->ifi_index, ifi->ifi_type); + + if (tb[IFLA_LINK_NETNSID]) + *has_link_nsid = true; + + return 0; +} + +static int check_link_nsid(int rtsk, void *args) +{ + struct { + struct nlmsghdr nlh; + struct rtgenmsg g; + } req; + + pr_info("Dumping netns links\n"); + + memset(&req, 0, sizeof(req)); + req.nlh.nlmsg_len = sizeof(req); + req.nlh.nlmsg_type = RTM_GETLINK; + req.nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST; + req.nlh.nlmsg_pid = 0; + req.nlh.nlmsg_seq = CR_NLMSG_SEQ; + req.g.rtgen_family = AF_PACKET; + + return do_rtnl_req(rtsk, &req, sizeof(req), check_one_link_nsid, NULL, NULL, args); +} + +int kerndat_link_nsid() +{ + int status; + pid_t pid; + + pid = fork(); + if (pid < 0) { + pr_perror("Unable to fork a process"); + return -1; + } + + if (pid == 0) { + NetDeviceEntry nde = NET_DEVICE_ENTRY__INIT; + struct net_link link = { + .created = false, + .nde = &nde, + }; + int nsfd, sk, ret; + + if (unshare(CLONE_NEWNET)) { + pr_perror("Unable create a network namespace"); + exit(1); + } + + nsfd = open_proc(PROC_SELF, "ns/net"); + if (nsfd < 0) + exit(1); + + if (unshare(CLONE_NEWNET)) { + pr_perror("Unable create a network namespace"); + exit(1); + } + + sk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sk < 0) { + pr_perror("Unable to create a netlink socket"); + exit(1); + } + + nde.type = ND_TYPE__VETH; + nde.name = "veth"; + nde.ifindex = 10; + nde.mtu = 1500; + nde.peer_nsid = nsfd; + nde.peer_ifindex = 11; + 
nde.has_peer_ifindex = true; + nde.has_peer_nsid = true; + + ret = restore_one_link(NULL, &link, sk, nsid_link_info, NULL); + if (ret) { + pr_err("Unable to create a veth pair: %d\n", ret); + exit(1); + } + + bool has_link_nsid = false; + if (check_link_nsid(sk, &has_link_nsid)) + exit(1); + + if (!has_link_nsid) + exit(5); + + close(sk); + + exit(0); + } + + if (waitpid(pid, &status, 0) != pid) { + pr_perror("Unable to wait a process"); + return -1; + } + + if (status) { + pr_warn("NSID isn't reported for network links\n"); + return 0; + } + + kdat.has_link_nsid = true; + + return 0; +} diff --git a/CRIU_code/criu/netfilter.c b/CRIU_code/criu/netfilter.c new file mode 100644 index 0000000..368651c --- /dev/null +++ b/CRIU_code/criu/netfilter.c @@ -0,0 +1,158 @@ +#include +#include +#include +#include +#include +#include + +#include "../soccr/soccr.h" + +#include "util.h" +#include "common/list.h" +#include "files.h" +#include "netfilter.h" +#include "sockets.h" +#include "sk-inet.h" +#include "kerndat.h" + +static char buf[512]; + +/* + * Need to configure simple netfilter rules for blocking connections + * Any brave soul to write it using xtables-devel? + */ + +#define NF_CONN_CMD "%s %s -t filter %s %s --protocol tcp " \ + "-m mark ! --mark " __stringify(SOCCR_MARK) " --source %s --sport %d --destination %s --dport %d -j DROP" + +static char iptable_cmd_ipv4[] = "iptables"; +static char iptable_cmd_ipv6[] = "ip6tables"; + +void preload_netfilter_modules(void) +{ + int fd = -1; + + /* same as socket modules, ip_tables and ip6_tables will be loaded by + * CRIU, so we should try and preload these as well. 
+ */ + fd = open("/dev/null", O_RDWR); + if (fd < 0) { + fd = -1; + pr_perror("failed to open /dev/null, using log fd for net module preload"); + } + cr_system(fd, fd, fd, iptable_cmd_ipv4, + (char *[]) { iptable_cmd_ipv4, "-L", "-n", NULL}, 0); + cr_system(fd, fd, fd, iptable_cmd_ipv6, + (char *[]) { iptable_cmd_ipv6, "-L", "-n", NULL}, 0); + close_safe(&fd); +} + +/* IPv4-Mapped IPv6 Addresses */ +static int ipv6_addr_mapped(u32 *addr) +{ + return (addr[2] == htonl(0x0000ffff)); +} + +static int nf_connection_switch_raw(int family, u32 *src_addr, u16 src_port, + u32 *dst_addr, u16 dst_port, + bool input, bool lock) +{ + char sip[INET_ADDR_LEN], dip[INET_ADDR_LEN]; + char *cmd; + char *argv[4] = { "sh", "-c", buf, NULL }; + int ret; + + if (family == AF_INET6 && ipv6_addr_mapped(dst_addr)) { + family = AF_INET; + src_addr = &src_addr[3]; + dst_addr = &dst_addr[3]; + } + + switch (family) { + case AF_INET: + cmd = iptable_cmd_ipv4; + break; + case AF_INET6: + cmd = iptable_cmd_ipv6; + break; + default: + pr_err("Unknown socket family %d\n", family); + return -1; + }; + + if (!inet_ntop(family, (void *)src_addr, sip, INET_ADDR_LEN) || + !inet_ntop(family, (void *)dst_addr, dip, INET_ADDR_LEN)) { + pr_perror("nf: Can't translate ip addr"); + return -1; + } + + snprintf(buf, sizeof(buf), NF_CONN_CMD, cmd, + kdat.has_xtlocks ? "-w" : "", + lock ? "-I" : "-D", + input ? "INPUT" : "OUTPUT", + dip, (int)dst_port, sip, (int)src_port); + + pr_debug("\tRunning iptables [%s]\n", buf); + + /* + * cr_system is used here, because it blocks SIGCHLD before waiting + * a child and the child can't be waited from SIGCHLD handler. + */ + ret = cr_system(-1, -1, -1, "sh", argv, 0); + if (ret < 0 || !WIFEXITED(ret) || WEXITSTATUS(ret)) { + pr_err("Iptables configuration failed\n"); + return -1; + } + + pr_info("%s %s:%d - %s:%d connection\n", lock ? 
"Locked" : "Unlocked", + sip, (int)src_port, dip, (int)dst_port); + return 0; +} + +static int nf_connection_switch(struct inet_sk_desc *sk, bool lock) +{ + int ret = 0; + + ret = nf_connection_switch_raw(sk->sd.family, + sk->src_addr, sk->src_port, + sk->dst_addr, sk->dst_port, true, lock); + if (ret) + return -1; + + ret = nf_connection_switch_raw(sk->sd.family, + sk->dst_addr, sk->dst_port, + sk->src_addr, sk->src_port, false, lock); + if (ret) /* rollback */ + nf_connection_switch_raw(sk->sd.family, + sk->src_addr, sk->src_port, + sk->dst_addr, sk->dst_port, true, !lock); + return ret; +} + +int nf_lock_connection(struct inet_sk_desc *sk) +{ + return nf_connection_switch(sk, true); +} + +int nf_unlock_connection(struct inet_sk_desc *sk) +{ + return nf_connection_switch(sk, false); +} + +int nf_unlock_connection_info(struct inet_sk_info *si) +{ + int ret = 0; + + ret |= nf_connection_switch_raw(si->ie->family, + si->ie->src_addr, si->ie->src_port, + si->ie->dst_addr, si->ie->dst_port, true, false); + ret |= nf_connection_switch_raw(si->ie->family, + si->ie->dst_addr, si->ie->dst_port, + si->ie->src_addr, si->ie->src_port, false, false); + /* + * rollback nothing in case of any error, + * because nobody checks errors of this function + */ + + return ret; +} diff --git a/CRIU_code/criu/page-pipe.c b/CRIU_code/criu/page-pipe.c new file mode 100644 index 0000000..c32b893 --- /dev/null +++ b/CRIU_code/criu/page-pipe.c @@ -0,0 +1,476 @@ +#include + +#undef LOG_PREFIX +#define LOG_PREFIX "page-pipe: " + +#include "common/config.h" +#include "page.h" +#include "util.h" +#include "criu-log.h" +#include "page-pipe.h" +#include "fcntl.h" +#include "stats.h" +#include "cr_options.h" + +/* can existing iov accumulate the page? 
*/ +static inline bool iov_grow_page(struct iovec *iov, unsigned long addr) +{ + if ((unsigned long)iov->iov_base + iov->iov_len == addr) { + iov->iov_len += PAGE_SIZE; + return true; + } + + return false; +} + +static inline void iov_init(struct iovec *iov, unsigned long addr) +{ + iov->iov_base = (void *)addr; + iov->iov_len = PAGE_SIZE; +} + +static int __ppb_resize_pipe(struct page_pipe_buf *ppb, unsigned long new_size) +{ + int ret; + + ret = fcntl(ppb->p[0], F_SETPIPE_SZ, new_size * PAGE_SIZE); + if (ret < 0) + return -1; + + ret /= PAGE_SIZE; + BUG_ON(ret < ppb->pipe_size); + + pr_debug("Grow pipe %x -> %x\n", ppb->pipe_size, ret); + ppb->pipe_size = ret; + + return 0; +} + +static inline int ppb_resize_pipe(struct page_pipe_buf *ppb) +{ + unsigned long new_size = ppb->pipe_size << 1; + int ret; + + if (ppb->pages_in + ppb->pipe_off < ppb->pipe_size) + return 0; + + if (new_size > PIPE_MAX_SIZE) + return 1; + + ret = __ppb_resize_pipe(ppb, new_size); + if (ret < 0) + return 1; /* need to add another buf */ + + return 0; +} + +static struct page_pipe_buf *pp_prev_ppb(struct page_pipe *pp, + unsigned int ppb_flags) +{ + int type = 0; + + /* don't allow to reuse a pipe in the PP_CHUNK_MODE mode */ + if (pp->flags & PP_CHUNK_MODE) + return NULL; + + if (list_empty(&pp->bufs)) + return NULL; + + if (ppb_flags & PPB_LAZY && opts.lazy_pages) + type = 1; + + return pp->prev[type]; +} + +static void pp_update_prev_ppb(struct page_pipe *pp, struct page_pipe_buf *ppb, + unsigned int ppb_flags) +{ + int type = 0; + + if (ppb_flags & PPB_LAZY && opts.lazy_pages) + type = 1; + + pp->prev[type] = ppb; +} + +static struct page_pipe_buf *ppb_alloc(struct page_pipe *pp, + unsigned int ppb_flags) +{ + struct page_pipe_buf *prev = pp_prev_ppb(pp, ppb_flags); + struct page_pipe_buf *ppb; + + ppb = xmalloc(sizeof(*ppb)); + if (!ppb) + return NULL; + cnt_add(CNT_PAGE_PIPE_BUFS, 1); + + ppb->pipe_off = 0; + + if (prev && ppb_resize_pipe(prev) == 0) { + /* The previous pipe isn't 
full and we can continue to use it. */ + ppb->p[0] = prev->p[0]; + ppb->p[1] = prev->p[1]; + ppb->pipe_off = prev->pages_in + prev->pipe_off; + ppb->pipe_size = prev->pipe_size; + } else { + if (pipe(ppb->p)) { + xfree(ppb); + pr_perror("Can't make pipe for page-pipe"); + return NULL; + } + cnt_add(CNT_PAGE_PIPES, 1); + + ppb->pipe_size = fcntl(ppb->p[0], F_GETPIPE_SZ, 0) / PAGE_SIZE; + pp->nr_pipes++; + } + + list_add_tail(&ppb->l, &pp->bufs); + + pp_update_prev_ppb(pp, ppb, ppb_flags); + + return ppb; +} + +static void ppb_destroy(struct page_pipe_buf *ppb) +{ + /* Check whether a pipe is shared with another ppb */ + if (ppb->pipe_off == 0) { + close(ppb->p[0]); + close(ppb->p[1]); + } + xfree(ppb); +} + +static void ppb_init(struct page_pipe_buf *ppb, unsigned int pages_in, + unsigned int nr_segs, unsigned int flags, + struct iovec *iov) +{ + ppb->pages_in = pages_in; + ppb->nr_segs = nr_segs; + ppb->flags = flags; + ppb->iov = iov; +} + +static int page_pipe_grow(struct page_pipe *pp, unsigned int flags) +{ + struct page_pipe_buf *ppb; + struct iovec *free_iov; + + pr_debug("Will grow page pipe (iov off is %u)\n", pp->free_iov); + + if (!list_empty(&pp->free_bufs)) { + ppb = list_first_entry(&pp->free_bufs, struct page_pipe_buf, l); + list_move_tail(&ppb->l, &pp->bufs); + goto out; + } + + if ((pp->flags & PP_CHUNK_MODE) && (pp->nr_pipes == NR_PIPES_PER_CHUNK)) + return -EAGAIN; + + ppb = ppb_alloc(pp, flags); + if (!ppb) + return -1; + +out: + free_iov = &pp->iovs[pp->free_iov]; + ppb_init(ppb, 0, 0, flags, free_iov); + + return 0; +} + +struct page_pipe *create_page_pipe(unsigned int nr_segs, struct iovec *iovs, unsigned flags) +{ + struct page_pipe *pp; + + pr_debug("Create page pipe for %u segs\n", nr_segs); + + pp = xzalloc(sizeof(*pp)); + if (!pp) + return NULL; + + pp->flags = flags; + + if (!iovs) { + iovs = xmalloc(sizeof(*iovs) * nr_segs); + if (!iovs) + goto err_free_pp; + + pp->flags |= PP_OWN_IOVS; + } + + pp->nr_pipes = 0; + 
INIT_LIST_HEAD(&pp->bufs); + INIT_LIST_HEAD(&pp->free_bufs); + pp->nr_iovs = nr_segs; + pp->iovs = iovs; + pp->free_iov = 0; + + pp->nr_holes = 0; + pp->free_hole = 0; + pp->holes = NULL; + + if (page_pipe_grow(pp, 0)) + goto err_free_iovs; + + return pp; + +err_free_iovs: + if (pp->flags & PP_OWN_IOVS) + xfree(iovs); +err_free_pp: + xfree(pp); + return NULL; +} + +void destroy_page_pipe(struct page_pipe *pp) +{ + struct page_pipe_buf *ppb, *n; + + pr_debug("Killing page pipe\n"); + + list_splice(&pp->free_bufs, &pp->bufs); + list_for_each_entry_safe(ppb, n, &pp->bufs, l) + ppb_destroy(ppb); + + if (pp->flags & PP_OWN_IOVS) + xfree(pp->iovs); + xfree(pp); +} + +void page_pipe_reinit(struct page_pipe *pp) +{ + struct page_pipe_buf *ppb, *n; + + BUG_ON(!(pp->flags & PP_CHUNK_MODE)); + + pr_debug("Clean up page pipe\n"); + + list_for_each_entry_safe(ppb, n, &pp->bufs, l) + list_move(&ppb->l, &pp->free_bufs); + + pp->free_hole = 0; + + if (page_pipe_grow(pp, 0)) + BUG(); /* It can't fail, because ppb is in free_bufs */ +} + +static inline int try_add_page_to(struct page_pipe *pp, struct page_pipe_buf *ppb, + unsigned long addr, unsigned int flags) +{ + if (ppb->flags != flags) + return 1; + + if (ppb_resize_pipe(ppb) == 1) + return 1; + + if (ppb->nr_segs && iov_grow_page(&ppb->iov[ppb->nr_segs - 1], addr)) + goto out; + + pr_debug("Add iov to page pipe (%u iovs, %u/%u total)\n", + ppb->nr_segs, pp->free_iov, pp->nr_iovs); + iov_init(&ppb->iov[ppb->nr_segs++], addr); + pp->free_iov++; + BUG_ON(pp->free_iov > pp->nr_iovs); +out: + ppb->pages_in++; + return 0; +} + +static inline int try_add_page(struct page_pipe *pp, unsigned long addr, + unsigned int flags) +{ + BUG_ON(list_empty(&pp->bufs)); + return try_add_page_to(pp, list_entry(pp->bufs.prev, struct page_pipe_buf, l), addr, flags); +} + +int page_pipe_add_page(struct page_pipe *pp, unsigned long addr, + unsigned int flags) +{ + int ret; + + ret = try_add_page(pp, addr, flags); + if (ret <= 0) + return ret; + + ret 
= page_pipe_grow(pp, flags); + if (ret < 0) + return ret; + + ret = try_add_page(pp, addr, flags); + BUG_ON(ret > 0); + return ret; +} + +#define PP_HOLES_BATCH 32 + +int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr, + unsigned int flags) +{ + if (pp->free_hole >= pp->nr_holes) { + pp->holes = xrealloc(pp->holes, + (pp->nr_holes + PP_HOLES_BATCH) * sizeof(struct iovec)); + if (!pp->holes) + return -1; + + pp->hole_flags = xrealloc(pp->hole_flags, + (pp->nr_holes + PP_HOLES_BATCH) * sizeof(unsigned int)); + if(!pp->hole_flags) + return -1; + + pp->nr_holes += PP_HOLES_BATCH; + } + + if (pp->free_hole && + pp->hole_flags[pp->free_hole - 1] == flags && + iov_grow_page(&pp->holes[pp->free_hole - 1], addr)) + goto out; + + iov_init(&pp->holes[pp->free_hole++], addr); + + pp->hole_flags[pp->free_hole - 1] = flags; + +out: + return 0; +} + +/* + * Get ppb and iov that contain addr and count amount of data between + * beginning of the pipe belonging to the ppb and addr + */ +static struct page_pipe_buf *get_ppb(struct page_pipe *pp, unsigned long addr, + struct iovec **iov_ret, + unsigned long *len) +{ + struct page_pipe_buf *ppb; + int i; + + list_for_each_entry(ppb, &pp->bufs, l) { + for (i = 0, *len = 0; i < ppb->nr_segs; i++) { + struct iovec *iov = &ppb->iov[i]; + unsigned long base = (unsigned long)iov->iov_base; + + if (addr < base || addr >= base + iov->iov_len) { + *len += iov->iov_len; + continue; + } + + /* got iov that contains the addr */ + *len += (addr - base); + *iov_ret = iov; + + list_move(&ppb->l, &pp->bufs); + return ppb; + } + } + + return NULL; +} + +int pipe_read_dest_init(struct pipe_read_dest *prd) +{ + int ret; + + if (pipe(prd->p)) { + pr_perror("Cannot create pipe for reading from page-pipe"); + return -1; + } + + ret = fcntl(prd->p[0], F_SETPIPE_SZ, PIPE_MAX_SIZE * PAGE_SIZE); + if (ret < 0) + return -1; + + prd->sink_fd = open("/dev/null", O_WRONLY); + if (prd->sink_fd < 0) { + pr_perror("Cannot open sink for reading from 
page-pipe"); + return -1; + } + + ret = fcntl(prd->p[0], F_GETPIPE_SZ, 0); + pr_debug("Created tee pipe size %d\n", ret); + + return 0; +} + +int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, + unsigned long addr, unsigned int *nr_pages, + unsigned int ppb_flags) +{ + struct page_pipe_buf *ppb; + struct iovec *iov = NULL; + unsigned long skip = 0, len; + int ret; + + /* + * Get ppb that contains addr and count length of data between + * the beginning of the pipe and addr. If no ppb is found, the + * requested page is mapped to zero pfn + */ + ppb = get_ppb(pp, addr, &iov, &skip); + if (!ppb) { + *nr_pages = 0; + return 0; + } + + if (!(ppb->flags & ppb_flags)) { + pr_err("PPB flags mismatch: %x %x\n", ppb_flags, ppb->flags); + return false; + } + + /* clamp the request if it passes the end of iovec */ + len = min((unsigned long)iov->iov_base + iov->iov_len - addr, + (unsigned long)(*nr_pages) * PAGE_SIZE); + *nr_pages = len / PAGE_SIZE; + + skip += ppb->pipe_off * PAGE_SIZE; + /* we should tee() the requested length + the beginning of the pipe */ + len += skip; + + ret = tee(ppb->p[0], prd->p[1], len, 0); + if (ret != len) { + pr_perror("tee: %d", ret); + return -1; + } + + ret = splice(prd->p[0], NULL, prd->sink_fd, NULL, skip, 0); + if (ret != skip) { + pr_perror("splice: %d", ret); + return -1; + } + + return 0; +} + +void page_pipe_destroy_ppb(struct page_pipe_buf *ppb) +{ + list_del(&ppb->l); + ppb_destroy(ppb); +} + +void debug_show_page_pipe(struct page_pipe *pp) +{ + struct page_pipe_buf *ppb; + int i; + struct iovec *iov; + + if (pr_quelled(LOG_DEBUG)) + return; + + pr_debug("Page pipe:\n"); + pr_debug("* %u pipes %u/%u iovs:\n", + pp->nr_pipes, pp->free_iov, pp->nr_iovs); + list_for_each_entry(ppb, &pp->bufs, l) { + pr_debug("\tbuf %u pages, %u iovs, flags: %x pipe_off: %x :\n", + ppb->pages_in, ppb->nr_segs, ppb->flags, ppb->pipe_off); + for (i = 0; i < ppb->nr_segs; i++) { + iov = &ppb->iov[i]; + pr_debug("\t\t%p %lu\n", 
iov->iov_base, + iov->iov_len / PAGE_SIZE); + } + } + + pr_debug("* %u holes:\n", pp->free_hole); + for (i = 0; i < pp->free_hole; i++) { + iov = &pp->holes[i]; + pr_debug("\t%p %lu\n", iov->iov_base, iov->iov_len / PAGE_SIZE); + } +} diff --git a/CRIU_code/criu/page-xfer.c b/CRIU_code/criu/page-xfer.c new file mode 100644 index 0000000..9cdffd8 --- /dev/null +++ b/CRIU_code/criu/page-xfer.c @@ -0,0 +1,1331 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "types.h" +#include "cr_options.h" +#include "servicefd.h" +#include "image.h" +#include "page-xfer.h" +#include "page-pipe.h" +#include "util.h" +#include "protobuf.h" +#include "images/pagemap.pb-c.h" +#include "fcntl.h" +#include "pstree.h" +#include "parasite-syscall.h" +#include "rst_info.h" +#include "stats.h" +#include "img-remote.h" +#include "tls.h" + +static int page_server_sk = -1; + +struct page_server_iov { + u32 cmd; + u32 nr_pages; + u64 vaddr; + u64 dst_id; +}; + +static void psi2iovec(struct page_server_iov *ps, struct iovec *iov) +{ + iov->iov_base = decode_pointer(ps->vaddr); + iov->iov_len = ps->nr_pages * PAGE_SIZE; +} + +#define PS_IOV_ADD 1 +#define PS_IOV_HOLE 2 +#define PS_IOV_OPEN 3 +#define PS_IOV_OPEN2 4 +#define PS_IOV_PARENT 5 +#define PS_IOV_ADD_F 6 +#define PS_IOV_GET 7 + +#define PS_IOV_FLUSH 0x1023 +#define PS_IOV_FLUSH_N_CLOSE 0x1024 + +#define PS_CMD_BITS 16 +#define PS_CMD_MASK ((1 << PS_CMD_BITS) - 1) + +#define PS_TYPE_BITS 8 +#define PS_TYPE_MASK ((1 << PS_TYPE_BITS) - 1) + +#define PS_TYPE_PID (1) +#define PS_TYPE_SHMEM (2) +/* + * XXX: When adding new types here check decode_pm for legacy + * numbers that can be met from older CRIUs + */ + +static inline u64 encode_pm(int type, unsigned long id) +{ + if (type == CR_FD_PAGEMAP) + type = PS_TYPE_PID; + else if (type == CR_FD_SHMEM_PAGEMAP) + type = PS_TYPE_SHMEM; + else { + BUG(); + return 0; + } + + return ((u64)id) << PS_TYPE_BITS | type; +} + +static int decode_pm(u64 dst_id, 
unsigned long *id) +{ + int type; + + /* + * Magic numbers below came from the older CRIU versions that + * erroneously used the changing CR_FD_* constants. The + * changes were made when we merged images together and moved + * the CR_FD_-s at the tail of the enum + */ + type = dst_id & PS_TYPE_MASK; + switch (type) { + case 10: /* 3.1 3.2 */ + case 11: /* 1.3 1.4 1.5 1.6 1.7 1.8 2.* 3.0 */ + case 16: /* 1.2 */ + case 17: /* 1.0 1.1 */ + case PS_TYPE_PID: + *id = dst_id >> PS_TYPE_BITS; + type = CR_FD_PAGEMAP; + break; + case 27: /* 1.3 */ + case 28: /* 1.4 1.5 */ + case 29: /* 1.6 1.7 */ + case 32: /* 1.2 1.8 */ + case 33: /* 1.0 1.1 3.1 3.2 */ + case 34: /* 2.* 3.0 */ + case PS_TYPE_SHMEM: + *id = dst_id >> PS_TYPE_BITS; + type = CR_FD_SHMEM_PAGEMAP; + break; + default: + type = -1; + break; + } + + return type; +} + +static inline u32 encode_ps_cmd(u32 cmd, u32 flags) +{ + return flags << PS_CMD_BITS | cmd; +} + +static inline u32 decode_ps_cmd(u32 cmd) +{ + return cmd & PS_CMD_MASK; +} + +static inline u32 decode_ps_flags(u32 cmd) +{ + return cmd >> PS_CMD_BITS; +} + +static inline int __send(int sk, const void *buf, size_t sz, int fl) +{ + return opts.tls ? tls_send(buf, sz, fl) : send(sk, buf, sz, fl); +} + +static inline int __recv(int sk, void *buf, size_t sz, int fl) +{ + return opts.tls ? 
tls_recv(buf, sz, fl) : recv(sk, buf, sz, fl); +} + +static inline int send_psi_flags(int sk, struct page_server_iov *pi, int flags) +{ + if (__send(sk, pi, sizeof(*pi), flags) != sizeof(*pi)) { + pr_perror("Can't send PSI %d to server", pi->cmd); + return -1; + } + return 0; +} + +static inline int send_psi(int sk, struct page_server_iov *pi) +{ + return send_psi_flags(sk, pi, 0); +} + +/* page-server xfer */ +static int write_pages_to_server(struct page_xfer *xfer, + int p, unsigned long len) +{ + ssize_t ret, left = len; + + if (opts.tls) { + pr_debug("Sending %lu bytes / %lu pages\n", + len, len / PAGE_SIZE); + + if (tls_send_data_from_fd(p, len)) + return -1; + } else { + pr_debug("Splicing %lu bytes / %lu pages into socket\n", + len, len / PAGE_SIZE); + + while (left > 0) { + ret = splice(p, NULL, xfer->sk, NULL, left, + SPLICE_F_MOVE); + if (ret < 0) { + pr_perror("Can't write pages to socket"); + return -1; + } + + pr_debug("\tSpliced: %lu bytes sent\n", + (unsigned long)ret); + left -= ret; + } + } + + return 0; +} + +static int write_pagemap_to_server(struct page_xfer *xfer, struct iovec *iov, u32 flags) +{ + struct page_server_iov pi = { + .cmd = encode_ps_cmd(PS_IOV_ADD_F, flags), + .nr_pages = iov->iov_len / PAGE_SIZE, + .vaddr = encode_pointer(iov->iov_base), + .dst_id = xfer->dst_id, + }; + + return send_psi(xfer->sk, &pi); +} + +static void close_server_xfer(struct page_xfer *xfer) +{ + xfer->sk = -1; +} + +static int open_page_server_xfer(struct page_xfer *xfer, int fd_type, unsigned long img_id) +{ + char has_parent; + struct page_server_iov pi = { + .cmd = PS_IOV_OPEN2, + }; + + xfer->sk = page_server_sk; + xfer->write_pagemap = write_pagemap_to_server; + xfer->write_pages = write_pages_to_server; + xfer->close = close_server_xfer; + xfer->dst_id = encode_pm(fd_type, img_id); + xfer->parent = NULL; + + pi.dst_id = xfer->dst_id; + if (send_psi(xfer->sk, &pi)) { + pr_perror("Can't write to page server"); + return -1; + } + + /* Push the command NOW 
*/ + tcp_nodelay(xfer->sk, true); + + if (__recv(xfer->sk, &has_parent, 1, 0) != 1) { + pr_perror("The page server doesn't answer"); + return -1; + } + + if (has_parent) + xfer->parent = (void *) 1; /* This is required for generate_iovs() */ + + return 0; +} + +/* local xfer */ +/* + * Splice len bytes from pipe p into the raw fd of xfer->pi, looping over + * short transfers. Returns 0 once all len bytes have been moved and -1 if + * splice() fails or reports 0 bytes before that. + */ +static int write_pages_loc(struct page_xfer *xfer, + int p, unsigned long len) +{ + ssize_t ret; + ssize_t curr = 0; + + while (1) { + ret = splice(p, NULL, img_raw_fd(xfer->pi), NULL, len - curr, SPLICE_F_MOVE); + if (ret == -1) { + pr_perror("Unable to splice data"); + return -1; + } + if (ret == 0) { + pr_err("A pipe was closed unexpectedly\n"); + return -1; + } + curr += ret; + if (curr == len) + break; + } + + return 0; +} + +static int check_pagehole_in_parent(struct page_read *p, struct iovec *iov) +{ + int ret; + unsigned long off, end; + + /* + * Try to find pagemap entry in parent, from which + * the data will be read on restore. + * + * This is the optimized version of the page-by-page + * read_pagemap_page routine. + */ + + pr_debug("Checking %p/%zu hole\n", iov->iov_base, iov->iov_len); + off = (unsigned long)iov->iov_base; + end = off + iov->iov_len; + while (1) { + unsigned long pend; + + ret = p->seek_pagemap(p, off); + if (ret <= 0 || !p->pe) { + pr_err("Missing %lx in parent pagemap\n", off); + return -1; + } + + pr_debug("\tFound %"PRIx64"/%lu\n", p->pe->vaddr, pagemap_len(p->pe)); + + /* + * The pagemap entry in parent may happen to be + * shorter, than the hole we write. In this case + * we should go ahead and check the remainder. 
+ */ + + pend = p->pe->vaddr + pagemap_len(p->pe); + if (end <= pend) + return 0; + + pr_debug("\t\tcontinue on %lx\n", pend); + off = pend; + } +} + +static int write_pagemap_loc(struct page_xfer *xfer, struct iovec *iov, u32 flags) +{ + int ret; + PagemapEntry pe = PAGEMAP_ENTRY__INIT; + + pe.vaddr = encode_pointer(iov->iov_base); + pe.nr_pages = iov->iov_len / PAGE_SIZE; + pe.has_flags = true; + pe.flags = flags; + + if (flags & PE_PRESENT) { + if (opts.auto_dedup && xfer->parent != NULL) { + ret = dedup_one_iovec(xfer->parent, pe.vaddr, + pagemap_len(&pe)); + if (ret == -1) { + pr_perror("Auto-deduplication failed"); + return ret; + } + } + } else if (flags & PE_PARENT) { + if (xfer->parent != NULL) { + ret = check_pagehole_in_parent(xfer->parent, iov); + if (ret) { + pr_err("Hole %p/%zu not found in parent\n", + iov->iov_base, iov->iov_len); + return -1; + } + } + } + + if (pb_write_one(xfer->pmi, &pe, PB_PAGEMAP) < 0) + return -1; + + return 0; +} + +static void close_page_xfer(struct page_xfer *xfer) +{ + if (xfer->parent != NULL) { + xfer->parent->close(xfer->parent); + xfree(xfer->parent); + xfer->parent = NULL; + } + close_image(xfer->pi); + close_image(xfer->pmi); +} + +static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, unsigned long img_id) +{ + u32 pages_id; + + xfer->pmi = open_image(fd_type, O_DUMP, img_id); + if (!xfer->pmi) + return -1; + + xfer->pi = open_pages_image(O_DUMP, xfer->pmi, &pages_id); + if (!xfer->pi) { + close_image(xfer->pmi); + return -1; + } + + /* + * Open page-read for parent images (if it exists). It will + * be used for two things: + * 1) when writing a page, those from parent will be dedup-ed + * 2) when writing a hole, the respective place would be checked + * to exist in parent (either pagemap or hole) + */ + xfer->parent = NULL; + if (fd_type == CR_FD_PAGEMAP || fd_type == CR_FD_SHMEM_PAGEMAP) { + int ret; + int pfd; + int pr_flags = (fd_type == CR_FD_PAGEMAP) ? 
PR_TASK : PR_SHMEM; + + + if (opts.remote) { + /* Note: we are replacing a real directory FD for a snapshot_id + * index. Since we need the parent of the current snapshot_id, + * we want the current snapshot_id index minus one. It is + * possible that dfd is already a snapshot_id index. We test it + * by comparing it to the service FD. When opening an image (see + * do_open_image) we convert the snapshot_id index into a real + * snapshot_id. + */ + pfd = get_curr_snapshot_id_idx() - 1; + if (pfd < 0) + goto out; + } else { + pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY); + if (pfd < 0 && errno == ENOENT) + goto out; + } + + xfer->parent = xmalloc(sizeof(*xfer->parent)); + if (!xfer->parent) { + if (!opts.remote) + close(pfd); + return -1; + } + + ret = open_page_read_at(pfd, img_id, xfer->parent, pr_flags); + if (ret <= 0) { + pr_perror("No parent image found, though parent directory is set"); + xfree(xfer->parent); + xfer->parent = NULL; + if (!opts.remote) + close(pfd); + goto out; + } + if (!opts.remote) + close(pfd); + } + +out: + xfer->write_pagemap = write_pagemap_loc; + xfer->write_pages = write_pages_loc; + xfer->close = close_page_xfer; + return 0; +} + +int open_page_xfer(struct page_xfer *xfer, int fd_type, unsigned long img_id) +{ + xfer->offset = 0; + xfer->transfer_lazy = true; + + if (opts.use_page_server) + return open_page_server_xfer(xfer, fd_type, img_id); + else + return open_page_local_xfer(xfer, fd_type, img_id); +} + +static int page_xfer_dump_hole(struct page_xfer *xfer, + struct iovec *hole, u32 flags) +{ + BUG_ON(hole->iov_base < (void *)xfer->offset); + hole->iov_base -= xfer->offset; + pr_debug("\th %p [%u]\n", hole->iov_base, + (unsigned int)(hole->iov_len / PAGE_SIZE)); + + if (xfer->write_pagemap(xfer, hole, flags)) + return -1; + + return 0; +} + +static int get_hole_flags(struct page_pipe *pp, int n) +{ + unsigned int hole_flags = pp->hole_flags[n]; + + if (hole_flags == PP_HOLE_PARENT) + return PE_PARENT; + 
else + BUG(); + + return -1; +} + +static int dump_holes(struct page_xfer *xfer, struct page_pipe *pp, + unsigned int *cur_hole, void *limit) +{ + int ret; + + for (; *cur_hole < pp->free_hole ; (*cur_hole)++) { + struct iovec hole = pp->holes[*cur_hole]; + u32 hole_flags; + + if (limit && hole.iov_base >= limit) + break; + + hole_flags = get_hole_flags(pp, *cur_hole); + ret = page_xfer_dump_hole(xfer, &hole, hole_flags); + if (ret) + return ret; + } + + return 0; +} + +static inline u32 ppb_xfer_flags(struct page_xfer *xfer, struct page_pipe_buf *ppb) +{ + if (ppb->flags & PPB_LAZY) + /* + * Pages that can be lazily restored are always marked as such. + * In the case we actually transfer them into image mark them + * as present as well. + */ + return (xfer->transfer_lazy ? PE_PRESENT : 0) | PE_LAZY; + else + return PE_PRESENT; +} + +int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp) +{ + struct page_pipe_buf *ppb; + unsigned int cur_hole = 0; + int ret; + + pr_debug("Transferring pages:\n"); + + list_for_each_entry(ppb, &pp->bufs, l) { + unsigned int i; + + pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs); + + for (i = 0; i < ppb->nr_segs; i++) { + struct iovec iov = ppb->iov[i]; + u32 flags; + + ret = dump_holes(xfer, pp, &cur_hole, iov.iov_base); + if (ret) + return ret; + + BUG_ON(iov.iov_base < (void *)xfer->offset); + iov.iov_base -= xfer->offset; + pr_debug("\tp %p [%u]\n", iov.iov_base, + (unsigned int)(iov.iov_len / PAGE_SIZE)); + + flags = ppb_xfer_flags(xfer, ppb); + + if (xfer->write_pagemap(xfer, &iov, flags)) + return -1; + if ((flags & PE_PRESENT) && xfer->write_pages(xfer, + ppb->p[0], iov.iov_len)) + return -1; + } + } + + return dump_holes(xfer, pp, &cur_hole, NULL); +} + +/* + * Return: + * 1 - if a parent image exists + * 0 - if a parent image doesn't exist + * -1 - in error cases + */ +int check_parent_local_xfer(int fd_type, unsigned long img_id) +{ + char path[PATH_MAX]; + struct stat st; + int ret, pfd; + + if 
(opts.remote) + return get_curr_parent_snapshot_id_idx() == -1 ? 0 : 1; + + pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY); + if (pfd < 0 && errno == ENOENT) + return 0; + + snprintf(path, sizeof(path), imgset_template[fd_type].fmt, img_id); + ret = fstatat(pfd, path, &st, 0); + if (ret == -1 && errno != ENOENT) { + pr_perror("Unable to stat %s", path); + close(pfd); + return -1; + } + + close(pfd); + return (ret == 0); +} + +/* page server */ +static int page_server_check_parent(int sk, struct page_server_iov *pi) +{ + int type, ret; + unsigned long id; + + type = decode_pm(pi->dst_id, &id); + if (type == -1) { + pr_err("Unknown pagemap type received\n"); + return -1; + } + + ret = check_parent_local_xfer(type, id); + if (ret < 0) + return -1; + + if (__send(sk, &ret, sizeof(ret), 0) != sizeof(ret)) { + pr_perror("Unable to send response"); + return -1; + } + + return 0; +} + +static int check_parent_server_xfer(int fd_type, unsigned long img_id) +{ + struct page_server_iov pi = {}; + int has_parent; + + pi.cmd = PS_IOV_PARENT; + pi.dst_id = encode_pm(fd_type, img_id); + + if (send_psi(page_server_sk, &pi)) + return -1; + + tcp_nodelay(page_server_sk, true); + + if (__recv(page_server_sk, &has_parent, sizeof(int), 0) != sizeof(int)) { + pr_perror("The page server doesn't answer"); + return -1; + } + + return has_parent; +} + +int check_parent_page_xfer(int fd_type, unsigned long img_id) +{ + if (opts.use_page_server) + return check_parent_server_xfer(fd_type, img_id); + else + return check_parent_local_xfer(fd_type, img_id); +} + +struct page_xfer_job { + u64 dst_id; + int p[2]; + unsigned pipe_size; + struct page_xfer loc_xfer; +}; + +static struct page_xfer_job cxfer = { + .dst_id = ~0, +}; + +static struct pipe_read_dest pipe_read_dest = { + .sink_fd = -1, +}; + +static void page_server_close(void) +{ + if (cxfer.dst_id != ~0) + cxfer.loc_xfer.close(&cxfer.loc_xfer); + if (pipe_read_dest.sink_fd != -1) { + close(pipe_read_dest.sink_fd); + 
close(pipe_read_dest.p[0]); + close(pipe_read_dest.p[1]); + } +} + +static int page_server_open(int sk, struct page_server_iov *pi) +{ + int type; + unsigned long id; + + type = decode_pm(pi->dst_id, &id); + if (type == -1) { + pr_err("Unknown pagemap type received\n"); + return -1; + } + + pr_info("Opening %d/%lu\n", type, id); + + page_server_close(); + + if (open_page_local_xfer(&cxfer.loc_xfer, type, id)) + return -1; + + cxfer.dst_id = pi->dst_id; + + if (sk >= 0) { + char has_parent = !!cxfer.loc_xfer.parent; + if (__send(sk, &has_parent, 1, 0) != 1) { + pr_perror("Unable to send response"); + close_page_xfer(&cxfer.loc_xfer); + return -1; + } + } + + return 0; +} + +static int prep_loc_xfer(struct page_server_iov *pi) +{ + if (cxfer.dst_id != pi->dst_id) { + pr_warn("Deprecated IO w/o open\n"); + return page_server_open(-1, pi); + } else + return 0; +} + +static int page_server_add(int sk, struct page_server_iov *pi, u32 flags) +{ + size_t len; + struct page_xfer *lxfer = &cxfer.loc_xfer; + struct iovec iov; + + pr_debug("Adding %"PRIx64"/%u\n", pi->vaddr, pi->nr_pages); + + if (prep_loc_xfer(pi)) + return -1; + + psi2iovec(pi, &iov); + if (lxfer->write_pagemap(lxfer, &iov, flags)) + return -1; + + if (!(flags & PE_PRESENT)) + return 0; + + len = iov.iov_len; + while (len > 0) { + ssize_t chunk; + + chunk = len; + if (chunk > cxfer.pipe_size) + chunk = cxfer.pipe_size; + + /* + * Splicing into a pipe may end up blocking if pipe is "full", + * and we need the SPLICE_F_NONBLOCK flag here. At the same time + * splicing from UNIX socket with this flag aborts splice with + * the EAGAIN if there's no data in it (TCP looks at the socket + * O_NONBLOCK flag _only_ and waits for data), so before doing + * the non-blocking splice we need to explicitly wait. 
+ */ + + if (sk_wait_data(sk) < 0) { + pr_perror("Can't poll socket"); + return -1; + } + + if (opts.tls) { + if(tls_recv_data_to_fd(cxfer.p[1], chunk)) { + pr_err("Can't read from socket\n"); + return -1; + } + } else { + chunk = splice(sk, NULL, cxfer.p[1], NULL, chunk, + SPLICE_F_MOVE | SPLICE_F_NONBLOCK); + + if (chunk < 0) { + pr_perror("Can't read from socket"); + return -1; + } + if (chunk == 0) { + pr_err("A socket was closed unexpectedly\n"); + return -1; + } + } + + if (lxfer->write_pages(lxfer, cxfer.p[0], chunk)) + return -1; + + len -= chunk; + } + + return 0; +} + +/* + * Serve a PS_IOV_GET request: look up the task named by pi->dst_id, read the + * requested pages out of its page pipe and send them back on sk as a + * PS_IOV_ADD_F/PE_PRESENT header followed by the page data. + * + * Returns 0 on success and non-zero on failure, including the case when no + * pages were found for the request. + */ +static int page_server_get_pages(int sk, struct page_server_iov *pi) +{ + struct pstree_item *item; + struct page_pipe *pp; + unsigned long len; + int ret; + + item = pstree_item_by_virt(pi->dst_id); + if (!item) { + /* + * dst_id is supplied by the peer, so do not dereference a failed lookup. + * NOTE(review): assumes pstree_item_by_virt() returns NULL when nothing matches -- confirm. + */ + pr_err("No task with id %"PRIx64" to get pages from\n", pi->dst_id); + return -1; + } + pp = dmpi(item)->mem_pp; + + ret = page_pipe_read(pp, &pipe_read_dest, pi->vaddr, + &pi->nr_pages, PPB_LAZY); + if (ret) + return ret; + + /* + * The pi is reused for send_psi here, so .nr_pages, .vaddr and + * .dst_id all remain intact. + */ + + if (pi->nr_pages == 0) { + pr_debug("no iovs found, zero pages\n"); + return -1; + } + + pi->cmd = encode_ps_cmd(PS_IOV_ADD_F, PE_PRESENT); + if (send_psi(sk, pi)) + return -1; + + len = pi->nr_pages * PAGE_SIZE; + + if (opts.tls) { + if (tls_send_data_from_fd(pipe_read_dest.p[0], len)) + return -1; + } else { + ret = splice(pipe_read_dest.p[0], NULL, sk, NULL, len, + SPLICE_F_MOVE); + if (ret != len) + return -1; + } + + tcp_nodelay(sk, true); + + return 0; +} + +static int page_server_serve(int sk) +{ + int ret = -1; + bool flushed = false; + bool receiving_pages = !opts.lazy_pages; + + if (receiving_pages) { + /* + * This socket only accepts data except one thing -- it + * writes back the has_parent bit from time to time, so + * make it NODELAY all the time. 
+ */ + tcp_nodelay(sk, true); + + if (pipe(cxfer.p)) { + pr_perror("Can't make pipe for xfer"); + close(sk); + return -1; + } + + cxfer.pipe_size = fcntl(cxfer.p[0], F_GETPIPE_SZ, 0); + pr_debug("Created xfer pipe size %u\n", cxfer.pipe_size); + } else { + pipe_read_dest_init(&pipe_read_dest); + tcp_cork(sk, true); + } + + while (1) { + struct page_server_iov pi; + u32 cmd; + + ret = __recv(sk, &pi, sizeof(pi), MSG_WAITALL); + if (!ret) + break; + + if (ret != sizeof(pi)) { + pr_perror("Can't read pagemap from socket"); + ret = -1; + break; + } + + flushed = false; + cmd = decode_ps_cmd(pi.cmd); + + switch (cmd) { + case PS_IOV_OPEN: + ret = page_server_open(-1, &pi); + break; + case PS_IOV_OPEN2: + ret = page_server_open(sk, &pi); + break; + case PS_IOV_PARENT: + ret = page_server_check_parent(sk, &pi); + break; + case PS_IOV_ADD_F: + case PS_IOV_ADD: + case PS_IOV_HOLE: + { + u32 flags; + + if (likely(cmd == PS_IOV_ADD_F)) + flags = decode_ps_flags(pi.cmd); + else if (cmd == PS_IOV_ADD) + flags = PE_PRESENT; + else /* PS_IOV_HOLE */ + flags = PE_PARENT; + + ret = page_server_add(sk, &pi, flags); + break; + } + case PS_IOV_FLUSH: + case PS_IOV_FLUSH_N_CLOSE: + { + int32_t status = 0; + + ret = 0; + + /* + * An answer must be sent back to inform another side, + * that all data were received + */ + if (__send(sk, &status, sizeof(status), 0) != sizeof(status)) { + pr_perror("Can't send the final package"); + ret = -1; + } + + flushed = true; + break; + } + case PS_IOV_GET: + ret = page_server_get_pages(sk, &pi); + break; + default: + pr_err("Unknown command %u\n", pi.cmd); + ret = -1; + break; + } + + if (ret || (pi.cmd == PS_IOV_FLUSH_N_CLOSE)) + break; + } + + if (receiving_pages && !ret && !flushed) { + pr_err("The data were not flushed\n"); + ret = -1; + } + + if (ret == 0 && opts.ps_socket == -1) { + char c; + + /* + * Wait when a remote side closes the connection + * to avoid TIME_WAIT bucket + */ + if (read(sk, &c, sizeof(c)) != 0) { + pr_perror("Unexpected 
data"); + ret = -1; + } + } + + tls_terminate_session(); + page_server_close(); + + pr_info("Session over\n"); + + close(sk); + return ret; +} + +static int fill_page_pipe(struct page_read *pr, struct page_pipe *pp) +{ + struct page_pipe_buf *ppb; + int i, ret; + + pr->reset(pr); + + while (pr->advance(pr)) { + unsigned long vaddr = pr->pe->vaddr; + + for (i = 0; i < pr->pe->nr_pages; i++, vaddr += PAGE_SIZE) { + if (pagemap_in_parent(pr->pe)) + ret = page_pipe_add_hole(pp, vaddr, PP_HOLE_PARENT); + else + ret = page_pipe_add_page(pp, vaddr, pagemap_lazy(pr->pe) ? PPB_LAZY : 0); + if (ret) { + pr_err("Failed adding page at %lx\n", vaddr); + return -1; + } + } + } + + list_for_each_entry(ppb, &pp->bufs, l) { + for (i = 0; i < ppb->nr_segs; i++) { + struct iovec iov = ppb->iov[i]; + + if (splice(img_raw_fd(pr->pi), NULL, ppb->p[1], NULL, + iov.iov_len, SPLICE_F_MOVE) != iov.iov_len) { + pr_perror("Splice failed"); + return -1; + } + } + } + + debug_show_page_pipe(pp); + + return 0; +} + +static int page_pipe_from_pagemap(struct page_pipe **pp, int pid) +{ + struct page_read pr; + int nr_pages = 0; + + if (open_page_read(pid, &pr, PR_TASK) <= 0) { + pr_err("Failed to open page read for %d\n", pid); + return -1; + } + + while (pr.advance(&pr)) + if (pagemap_present(pr.pe)) + nr_pages += pr.pe->nr_pages; + + *pp = create_page_pipe(nr_pages, NULL, 0); + if (!*pp) { + pr_err("Cannot create page pipe for %d\n", pid); + return -1; + } + + if (fill_page_pipe(&pr, *pp)) + return -1; + + return 0; +} + +static int page_server_init_send(void) +{ + struct pstree_item *pi; + struct page_pipe *pp; + + BUILD_BUG_ON(sizeof(struct dmp_info) > sizeof(struct rst_info)); + + if (prepare_dummy_pstree()) + return -1; + + for_each_pstree_item(pi) { + if (prepare_dummy_task_state(pi)) + return -1; + + if (!task_alive(pi)) + continue; + + if (page_pipe_from_pagemap(&pp, vpid(pi))) { + pr_err("%d: failed to open page-read\n", vpid(pi)); + return -1; + } + + /* + * prepare_dummy_pstree 
presumes 'restore' behaviour, + * but page_server_get_pages uses dmpi() to get access + * to the page-pipe, so we are faking it here. + */ + memset(rsti(pi), 0, sizeof(struct rst_info)); + dmpi(pi)->mem_pp = pp; + } + + return 0; +} + +int cr_page_server(bool daemon_mode, bool lazy_dump, int cfd) +{ + int ask = -1; + int sk = -1; + int ret; + + if (init_stats(DUMP_STATS)) + return -1; + + if (!opts.lazy_pages) + up_page_ids_base(); + else if (!lazy_dump) + if (page_server_init_send()) + return -1; + + if (opts.ps_socket != -1) { + ask = opts.ps_socket; + pr_info("Re-using ps socket %d\n", ask); + goto no_server; + } + + sk = setup_tcp_server("page"); + if (sk == -1) + return -1; +no_server: + + if (!daemon_mode && cfd >= 0) { + struct ps_info info = {.pid = getpid(), .port = opts.port}; + int count; + + count = write(cfd, &info, sizeof(info)); + close_safe(&cfd); + if (count != sizeof(info)) { + pr_perror("Unable to write ps_info"); + exit(1); + } + } + + ret = run_tcp_server(daemon_mode, &ask, cfd, sk); + if (ret != 0) + return ret > 0 ? 0 : -1; + + if (tls_x509_init(ask, true)) { + close(sk); + return -1; + } + + if (ask >= 0) + ret = page_server_serve(ask); + + if (daemon_mode) + exit(ret); + + return ret; +} + +static int connect_to_page_server(void) +{ + if (!opts.use_page_server) + return 0; + + if (opts.ps_socket != -1) { + page_server_sk = opts.ps_socket; + pr_info("Re-using ps socket %d\n", page_server_sk); + goto out; + } + + page_server_sk = setup_tcp_client(); + if (page_server_sk == -1) + return -1; + + if (tls_x509_init(page_server_sk, false)) { + close(page_server_sk); + return -1; + } +out: + /* + * CORK the socket at the very beginning. As per ANK + * the corked by default socket with sporadic NODELAY-s + * on urgent data is the smartest mode ever. 
+ */ + tcp_cork(page_server_sk, true); + return 0; +} + +int connect_to_page_server_to_send(void) +{ + return connect_to_page_server(); +} + +int disconnect_from_page_server(void) +{ + struct page_server_iov pi = { }; + int32_t status = -1; + int ret = -1; + + if (!opts.use_page_server) + return 0; + + if (page_server_sk == -1) + return 0; + + pr_info("Disconnect from the page server\n"); + + if (opts.ps_socket != -1) + /* + * The socket might not get closed (held by + * the parent process) so we must order the + * page-server to terminate itself. + */ + pi.cmd = PS_IOV_FLUSH_N_CLOSE; + else + pi.cmd = PS_IOV_FLUSH; + + if (send_psi(page_server_sk, &pi)) + goto out; + + if (__recv(page_server_sk, &status, sizeof(status), 0) != sizeof(status)) { + pr_perror("The page server doesn't answer"); + goto out; + } + + ret = 0; +out: + tls_terminate_session(); + close_safe(&page_server_sk); + + return ret ? : status; +} + +struct ps_async_read { + unsigned long rb; /* read bytes */ + unsigned long goal; + unsigned long nr_pages; + + struct page_server_iov pi; + void *pages; + + ps_async_read_complete complete; + void *priv; + + struct list_head l; +}; + +static LIST_HEAD(async_reads); + +static inline void async_read_set_goal(struct ps_async_read *ar, int nr_pages) +{ + ar->goal = sizeof(ar->pi) + nr_pages * PAGE_SIZE; + ar->nr_pages = nr_pages; +} + +static void init_ps_async_read(struct ps_async_read *ar, void *buf, + int nr_pages, ps_async_read_complete complete, void *priv) +{ + ar->pages = buf; + ar->rb = 0; + ar->complete = complete; + ar->priv = priv; + async_read_set_goal(ar, nr_pages); +} + +static int page_server_start_async_read(void *buf, int nr_pages, + ps_async_read_complete complete, void *priv) +{ + struct ps_async_read *ar; + + ar = xmalloc(sizeof(*ar)); + if (ar == NULL) + return -1; + + init_ps_async_read(ar, buf, nr_pages, complete, priv); + list_add_tail(&ar->l, &async_reads); + return 0; +} + +/* + * There are two possible event types we need to 
handle: + * - page info is available as a reply to request_remote_page + * - page data is available, and it follows page info we've just received + * Since the on dump side communications are completely synchronous, + * we can return to epoll right after the reception of page info and + * for sure the next time socket event will occur we'll get page data + * related to info we've just received + */ +static int page_server_read(struct ps_async_read *ar, int flags) +{ + int ret, need; + void *buf; + + if (ar->rb < sizeof(ar->pi)) { + /* Header */ + buf = ((void *)&ar->pi) + ar->rb; + need = sizeof(ar->pi) - ar->rb; + } else { + /* page-serer may return less pages than we asked for */ + if (ar->pi.nr_pages < ar->nr_pages) + async_read_set_goal(ar, ar->pi.nr_pages); + /* Page(s) data itself */ + buf = ar->pages + (ar->rb - sizeof(ar->pi)); + need = ar->goal - ar->rb; + } + + ret = __recv(page_server_sk, buf, need, flags); + if (ret < 0) { + if (flags == MSG_DONTWAIT && (errno == EAGAIN || errno == EINTR)) { + ret = 0; + } else { + pr_perror("Error reading data from page server"); + return -1; + } + } + + ar->rb += ret; + if (ar->rb < ar->goal) + return 1; + + /* + * IO complete -- notify the caller and drop the request + */ + BUG_ON(ar->rb > ar->goal); + return ar->complete((int)ar->pi.dst_id, (unsigned long)ar->pi.vaddr, + (int)ar->pi.nr_pages, ar->priv); +} + +static int page_server_async_read(struct epoll_rfd *f) +{ + struct ps_async_read *ar; + int ret; + + BUG_ON(list_empty(&async_reads)); + ar = list_first_entry(&async_reads, struct ps_async_read, l); + ret = page_server_read(ar, MSG_DONTWAIT); + + if (ret > 0) + return 0; + if (!ret) { + list_del(&ar->l); + xfree(ar); + } + + return ret; +} + +static int page_server_hangup_event(struct epoll_rfd *rfd) +{ + pr_err("Remote side closed connection\n"); + return -1; +} + +static struct epoll_rfd ps_rfd; + +int connect_to_page_server_to_recv(int epfd) +{ + if (connect_to_page_server()) + return -1; + + ps_rfd.fd = 
page_server_sk; + ps_rfd.read_event = page_server_async_read; + ps_rfd.hangup_event = page_server_hangup_event; + + return epoll_add_rfd(epfd, &ps_rfd); +} + +int request_remote_pages(unsigned long img_id, unsigned long addr, int nr_pages) +{ + struct page_server_iov pi = { + .cmd = PS_IOV_GET, + .nr_pages = nr_pages, + .vaddr = addr, + .dst_id = img_id, + }; + + /* XXX: why MSG_DONTWAIT here? */ + if (send_psi_flags(page_server_sk, &pi, MSG_DONTWAIT)) + return -1; + + tcp_nodelay(page_server_sk, true); + return 0; +} + +static int page_server_start_sync_read(void *buf, int nr, + ps_async_read_complete complete, void *priv) +{ + struct ps_async_read ar; + int ret = 1; + + init_ps_async_read(&ar, buf, nr, complete, priv); + while (ret == 1) + ret = page_server_read(&ar, MSG_WAITALL); + return ret; +} + +int page_server_start_read(void *buf, int nr, + ps_async_read_complete complete, void *priv, unsigned flags) +{ + if (flags & PR_ASYNC) + return page_server_start_async_read(buf, nr, complete, priv); + else + return page_server_start_sync_read(buf, nr, complete, priv); +} diff --git a/CRIU_code/criu/pagemap-cache.c b/CRIU_code/criu/pagemap-cache.c new file mode 100644 index 0000000..a1c2d42 --- /dev/null +++ b/CRIU_code/criu/pagemap-cache.c @@ -0,0 +1,193 @@ +#include +#include + +#include "page.h" +#include "pagemap-cache.h" +#include "common/compiler.h" +#include "xmalloc.h" +#include "util.h" +#include "log.h" +#include "vma.h" +#include "mem.h" +#include "kerndat.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "pagemap-cache: " + +/* To carry up to 2M of physical memory */ +#define PMC_SHIFT (21) +#define PMC_SIZE (1ul << PMC_SHIFT) +#define PMC_MASK (~(PMC_SIZE - 1)) +#define PMC_SIZE_GAP (PMC_SIZE / 4) + +#define PAGEMAP_LEN(addr) (PAGE_PFN(addr) * sizeof(u64)) + +/* + * It's a workaround for a kernel bug. In the 3.19 kernel when pagemap are read + * for a few vma-s for one read call, it returns incorrect data. 
+ * https://github.com/xemul/criu/issues/207 +*/ +static bool pagemap_cache_disabled; + +static inline void pmc_reset(pmc_t *pmc) +{ + memzero(pmc, sizeof(*pmc)); + pmc->fd = -1; +} + +static inline void pmc_zap(pmc_t *pmc) +{ + pmc->start = pmc->end = 0; +} + +int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t size) +{ + size_t map_size = max(size, (size_t)PMC_SIZE); + pmc_reset(pmc); + + BUG_ON(!vma_head); + + pmc->pid = pid; + pmc->map_len = PAGEMAP_LEN(map_size); + pmc->vma_head = vma_head; + + pmc->map = xmalloc(pmc->map_len); + if (!pmc->map) + goto err; + + if (pagemap_cache_disabled) + pr_debug("The pagemap cache is disabled\n"); + + if (kdat.pmap == PM_DISABLED) { + /* + * FIXME We might need to implement greedy + * mode via reading all pages available inside + * parasite. + * + * Actually since linux-4.4 the pagemap file + * is available for usernamespace with hiding + * PFNs but providing page attributes, so other + * option simply require kernel 4.4 and above + * for usernamespace support. 
+ */ + pr_err("No pagemap for %d available\n", pid); + goto err; + } else { + pmc->fd = open_proc(pid, "pagemap"); + if (pmc->fd < 0) + goto err; + } + + pr_debug("created for pid %d (takes %zu bytes)\n", pid, pmc->map_len); + return 0; + +err: + pr_err("Failed to init pagemap for %d\n", pid); + pmc_fini(pmc); + return -1; +} + +static inline u64 *__pmc_get_map(pmc_t *pmc, unsigned long addr) +{ + return &pmc->map[PAGE_PFN(addr - pmc->start)]; +} + +static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma) +{ + unsigned long low = vma->e->start & PMC_MASK; + unsigned long high = low + PMC_SIZE; + size_t len = vma_area_len(vma); + size_t size_map; + + if (high > kdat.task_size) + high = kdat.task_size; + + pmc->start = vma->e->start; + pmc->end = vma->e->end; + + pr_debug("filling VMA %lx-%lx (%zuK) [l:%lx h:%lx]\n", + (long)vma->e->start, (long)vma->e->end, len >> 10, low, high); + + /* + * If we meet a small VMA, lets try to fit 2M cache + * window at least 75% full, otherwise left as a plain + * "one vma at a time" read. Note the VMAs in cache must + * fit in solid manner, iow -- either the whole vma fits + * the cache window, either plain read is used. + * + * The benefit (apart redusing the number of read() calls) + * is to walk page tables less. 
+ */ + if (!pagemap_cache_disabled && + len < PMC_SIZE && (vma->e->start - low) < PMC_SIZE_GAP) { + size_t size_cov = len; + size_t nr_vmas = 1; + + pr_debug("\t%16lx-%-16lx nr:%-5zu cov:%zu\n", + (long)vma->e->start, (long)vma->e->end, nr_vmas, size_cov); + + list_for_each_entry_continue(vma, pmc->vma_head, list) { + if (vma->e->start > high || vma->e->end > high) + break; + + BUG_ON(vma->e->start < low); + size_cov += vma_area_len(vma); + nr_vmas++; + + pr_debug("\t%16lx-%-16lx nr:%-5zu cov:%zu\n", + (long)vma->e->start, (long)vma->e->end, nr_vmas, size_cov); + } + + if (nr_vmas > 1) { + /* + * Note we don't touch low bound since it's set + * to first VMA start already and not updating it + * allows us to save a couple of code bytes. + */ + pmc->end = high; + pr_debug("\tcache mode [l:%lx h:%lx]\n", pmc->start, pmc->end); + } else + pr_debug("\tsimple mode [l:%lx h:%lx]\n", pmc->start, pmc->end); + } + + size_map = PAGEMAP_LEN(pmc->end - pmc->start); + BUG_ON(pmc->map_len < size_map); + BUG_ON(pmc->fd < 0); + + if (pread(pmc->fd, pmc->map, size_map, PAGEMAP_PFN_OFF(pmc->start)) != size_map) { + pmc_zap(pmc); + pr_perror("Can't read %d's pagemap file", pmc->pid); + return -1; + } + + return 0; +} + +u64 *pmc_get_map(pmc_t *pmc, const struct vma_area *vma) +{ + /* Hit */ + if (likely(pmc->start <= vma->e->start && pmc->end >= vma->e->end)) + return __pmc_get_map(pmc, vma->e->start); + + /* Miss, refill the cache */ + if (pmc_fill_cache(pmc, vma)) { + pr_err("Failed to fill cache for %d (%lx-%lx)\n", + pmc->pid, (long)vma->e->start, (long)vma->e->end); + return NULL; + } + + /* Hit for sure */ + return __pmc_get_map(pmc, vma->e->start); +} + +void pmc_fini(pmc_t *pmc) +{ + close_safe(&pmc->fd); + xfree(pmc->map); + pmc_reset(pmc); +} + +static void __attribute__((constructor)) pagemap_cache_init(void) +{ + pagemap_cache_disabled = (getenv("CRIU_PMC_OFF") != NULL); +} diff --git a/CRIU_code/criu/pagemap.c b/CRIU_code/criu/pagemap.c new file mode 100644 index 
0000000..a19969b --- /dev/null +++ b/CRIU_code/criu/pagemap.c @@ -0,0 +1,870 @@ +#include +#include +#include +#include +#include +#include + +#include "types.h" +#include "image.h" +#include "cr_options.h" +#include "servicefd.h" +#include "pagemap.h" +#include "restorer.h" +#include "rst-malloc.h" +#include "page-xfer.h" + +#include "fault-injection.h" +#include "xmalloc.h" +#include "protobuf.h" +#include "images/pagemap.pb-c.h" +#include "img-remote.h" + +#ifndef SEEK_DATA +#define SEEK_DATA 3 +#define SEEK_HOLE 4 +#endif + +#define MAX_BUNCH_SIZE 256 + +/* + * One "job" for the preadv() syscall in pagemap.c + */ +struct page_read_iov { + off_t from; /* offset in pi file where to start reading from */ + off_t end; /* the end of the read == sum to.iov_len -s */ + struct iovec *to; /* destination iovs */ + unsigned int nr; /* their number */ + + struct list_head l; +}; + +static inline bool can_extend_bunch(struct iovec *bunch, + unsigned long off, unsigned long len) +{ + return /* The next region is the continuation of the existing */ + ((unsigned long)bunch->iov_base + bunch->iov_len == off) && + /* The resulting region is non empty and is small enough */ + (bunch->iov_len == 0 || bunch->iov_len + len < MAX_BUNCH_SIZE * PAGE_SIZE); +} + +static int punch_hole(struct page_read *pr, unsigned long off, + unsigned long len, bool cleanup) +{ + int ret; + struct iovec * bunch = &pr->bunch; + + if (!cleanup && can_extend_bunch(bunch, off, len)) { + pr_debug("pr%lu-%u:Extend bunch len from %zu to %lu\n", pr->img_id, + pr->id, bunch->iov_len, bunch->iov_len + len); + bunch->iov_len += len; + } else { + if (bunch->iov_len > 0) { + pr_debug("Punch!/%p/%zu/\n", bunch->iov_base, bunch->iov_len); + ret = fallocate(img_raw_fd(pr->pi), FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + (unsigned long)bunch->iov_base, bunch->iov_len); + if (ret != 0) { + pr_perror("Error punching hole"); + return -1; + } + } + bunch->iov_base = (void *)off; + bunch->iov_len = len; + 
pr_debug("pr%lu-%u:New bunch/%p/%zu/\n", pr->img_id, pr->id, bunch->iov_base, bunch->iov_len); + } + return 0; +} + +int dedup_one_iovec(struct page_read *pr, unsigned long off, unsigned long len) +{ + unsigned long iov_end; + + iov_end = off + len; + while (1) { + int ret; + unsigned long piov_end; + struct page_read * prp; + + ret = pr->seek_pagemap(pr, off); + if (ret == 0) { + pr_warn("Missing %lx in parent pagemap\n", off); + if (off < pr->cvaddr && pr->cvaddr < iov_end) + off = pr->cvaddr; + else + return 0; + } + + if (!pr->pe) + return -1; + piov_end = pr->pe->vaddr + pagemap_len(pr->pe); + if (!pagemap_in_parent(pr->pe)) { + ret = punch_hole(pr, pr->pi_off, min(piov_end, iov_end) - off, false); + if (ret == -1) + return ret; + } + + prp = pr->parent; + if (prp) { + /* recursively */ + pr_debug("Go to next parent level\n"); + len = min(piov_end, iov_end) - off; + ret = dedup_one_iovec(prp, off, len); + if (ret != 0) + return -1; + } + + if (piov_end < iov_end) { + off = piov_end; + continue; + } else + return 0; + } + return 0; +} + +static int advance(struct page_read *pr) +{ + pr->curr_pme++; + if (pr->curr_pme >= pr->nr_pmes) + return 0; + + pr->pe = pr->pmes[pr->curr_pme]; + pr->cvaddr = pr->pe->vaddr; + + return 1; +} + +static void skip_pagemap_pages(struct page_read *pr, unsigned long len) +{ + if (!len) + return; + + if (pagemap_present(pr->pe)) { + if (opts.remote) + if (skip_remote_bytes(img_raw_fd(pr->pi), len)) + pr_perror("Error skipping remote bytes"); + pr->pi_off += len; + } + pr->cvaddr += len; +} + +static int seek_pagemap(struct page_read *pr, unsigned long vaddr) +{ + if (!pr->pe) + goto adv; + + do { + unsigned long start = pr->pe->vaddr; + unsigned long end = start + pagemap_len(pr->pe); + + if (vaddr < pr->cvaddr) + break; + + if (vaddr >= start && vaddr < end) { + skip_pagemap_pages(pr, vaddr > pr->cvaddr ? 
vaddr - pr->cvaddr : 0); + return 1; + } + + if (end <= vaddr) + skip_pagemap_pages(pr, end - pr->cvaddr); +adv: + ; /* otherwise "label at end of compound stmt" gcc error */ + } while (advance(pr)); + + return 0; +} + +static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, int nr) +{ + if (vaddr < pe->vaddr || (vaddr - pe->vaddr) / PAGE_SIZE + nr > pe->nr_pages) { + pr_err("Page read err %"PRIx64":%u vs %lx:%u\n", + pe->vaddr, pe->nr_pages, vaddr, nr); + BUG(); + } +} + +static int read_parent_page(struct page_read *pr, unsigned long vaddr, + int nr, void *buf, unsigned flags) +{ + struct page_read *ppr = pr->parent; + int ret; + + if (!ppr) { + pr_err("No parent for snapshot pagemap\n"); + return -1; + } + + /* + * Parent pagemap at this point entry may be shorter + * than the current vaddr:nr needs, so we have to + * carefully 'split' the vaddr:nr into pieces and go + * to parent page-read with the longest requests it + * can handle. + */ + + do { + int p_nr; + + pr_debug("\tpr%lu-%u Read from parent\n", pr->img_id, pr->id); + ret = ppr->seek_pagemap(ppr, vaddr); + if (ret <= 0) { + pr_err("Missing %lx in parent pagemap\n", vaddr); + return -1; + } + + /* + * This is how many pages we have in the parent + * page_read starting from vaddr. Go ahead and + * read as much as we can. + */ + p_nr = ppr->pe->nr_pages - (vaddr - ppr->pe->vaddr) / PAGE_SIZE; + pr_info("\tparent has %u pages in\n", p_nr); + if (p_nr > nr) + p_nr = nr; + + ret = ppr->read_pages(ppr, vaddr, p_nr, buf, flags); + if (ret == -1) + return ret; + + /* + * OK, let's see how much data we have left and go + * to parent page-read again for the next pagemap + * entry. 
+ */ + nr -= p_nr; + vaddr += p_nr * PAGE_SIZE; + buf += p_nr * PAGE_SIZE; + } while (nr); + + return 0; +} + +static int read_local_page(struct page_read *pr, unsigned long vaddr, + unsigned long len, void *buf) +{ + int fd = img_raw_fd(pr->pi); + ssize_t ret; + + /* + * Flush any pending async requests if any not to break the + * linear reading from the pages.img file. + */ + if (pr->sync(pr)) + return -1; + + pr_debug("\tpr%lu-%u Read page from self %lx/%"PRIx64"\n", pr->img_id, pr->id, pr->cvaddr, pr->pi_off); + ret = pread(fd, buf, len, pr->pi_off); + if (ret != len) { + pr_perror("Can't read mapping page %zd", ret); + return -1; + } + + if (opts.auto_dedup) { + ret = punch_hole(pr, pr->pi_off, len, false); + if (ret == -1) + return -1; + } + + return 0; +} + +static int enqueue_async_iov(struct page_read *pr, void *buf, + unsigned long len, struct list_head *to) +{ + struct page_read_iov *pr_iov; + struct iovec *iov; + + pr_iov = xzalloc(sizeof(*pr_iov)); + if (!pr_iov) + return -1; + + pr_iov->from = pr->pi_off; + pr_iov->end = pr->pi_off + len; + + iov = xzalloc(sizeof(*iov)); + if (!iov) { + xfree(pr_iov); + return -1; + } + + iov->iov_base = buf; + iov->iov_len = len; + + pr_iov->to = iov; + pr_iov->nr = 1; + + list_add_tail(&pr_iov->l, to); + + return 0; +} + +int pagemap_render_iovec(struct list_head *from, struct task_restore_args *ta) +{ + struct page_read_iov *piov; + + ta->vma_ios = (struct restore_vma_io *)rst_mem_align_cpos(RM_PRIVATE); + ta->vma_ios_n = 0; + + list_for_each_entry(piov, from, l) { + struct restore_vma_io *rio; + + pr_info("`- render %d iovs (%p:%zd...)\n", piov->nr, + piov->to[0].iov_base, piov->to[0].iov_len); + rio = rst_mem_alloc(RIO_SIZE(piov->nr), RM_PRIVATE); + if (!rio) + return -1; + + rio->nr_iovs = piov->nr; + rio->off = piov->from; + memcpy(rio->iovs, piov->to, piov->nr * sizeof(struct iovec)); + + ta->vma_ios_n++; + } + + return 0; +} + +int pagemap_enqueue_iovec(struct page_read *pr, void *buf, + unsigned long len, 
struct list_head *to) +{ + struct page_read_iov *cur_async = NULL; + struct iovec *iov; + + if (!list_empty(to)) + cur_async = list_entry(to->prev, struct page_read_iov, l); + + /* + * We don't have any async requests or we have new read + * request that should happen at pos _after_ some hole from + * the previous one. + * Start the new preadv request here. + */ + if (!cur_async || pr->pi_off != cur_async->end) + return enqueue_async_iov(pr, buf, len, to); + + /* + * This read is pure continuation of the previous one. Let's + * just add another IOV (or extend one of the existing). + */ + iov = &cur_async->to[cur_async->nr - 1]; + if (iov->iov_base + iov->iov_len == buf) { + /* Extendable */ + iov->iov_len += len; + } else { + /* Need one more target iovec */ + unsigned int n_iovs = cur_async->nr + 1; + + if (n_iovs >= IOV_MAX) + return enqueue_async_iov(pr, buf, len, to); + + iov = xrealloc(cur_async->to, n_iovs * sizeof(*iov)); + if (!iov) + return -1; + + cur_async->to = iov; + + iov += cur_async->nr; + iov->iov_base = buf; + iov->iov_len = len; + + cur_async->nr = n_iovs; + } + + cur_async->end += len; + + return 0; +} + +static int maybe_read_page_local(struct page_read *pr, unsigned long vaddr, + int nr, void *buf, unsigned flags) +{ + int ret; + unsigned long len = nr * PAGE_SIZE; + + /* + * There's no API in the kernel to start asynchronous + * cached read (or write), so in case someone is asking + * for us for urgent async read, just do the regular + * cached read. 
+ */ + if ((flags & (PR_ASYNC|PR_ASAP)) == PR_ASYNC) + ret = pagemap_enqueue_iovec(pr, buf, len, &pr->async); + else { + ret = read_local_page(pr, vaddr, len, buf); + if (ret == 0 && pr->io_complete) + ret = pr->io_complete(pr, vaddr, nr); + } + + pr->pi_off += len; + + return ret; +} + +static int maybe_read_page_img_cache(struct page_read *pr, unsigned long vaddr, + int nr, void *buf, unsigned flags) +{ + unsigned long len = nr * PAGE_SIZE; + int fd = img_raw_fd(pr->pi); + int ret; + size_t curr = 0; + + pr_debug("\tpr%lu-%u Read page from self %lx/%"PRIx64"\n", pr->img_id, pr->id, pr->cvaddr, pr->pi_off); + while (1) { + ret = read(fd, buf + curr, len - curr); + if (ret < 0) { + pr_perror("Can't read mapping page %d", ret); + return -1; + } + curr += ret; + if (curr == len) + break; + } + + if (opts.auto_dedup) + pr_warn_once("Can't dedup from image cache\n"); + + if (ret == 0 && pr->io_complete) + ret = pr->io_complete(pr, vaddr, nr); + + pr->pi_off += len; + + return ret; +} + +static int read_page_complete(unsigned long img_id, unsigned long vaddr, int nr_pages, void *priv) +{ + int ret = 0; + struct page_read *pr = priv; + + if (pr->img_id != img_id) { + pr_err("Out of order read completed (want %lu have %lu)\n", + pr->img_id, img_id); + return -1; + } + + if (pr->io_complete) + ret = pr->io_complete(pr, vaddr, nr_pages); + else + pr_warn_once("Remote page read w/o io_complete!\n"); + + return ret; +} + +static int maybe_read_page_remote(struct page_read *pr, unsigned long vaddr, + int nr, void *buf, unsigned flags) +{ + int ret; + + /* We always do PR_ASAP mode here (FIXME?) 
*/ + ret = request_remote_pages(pr->img_id, vaddr, nr); + if (!ret) + ret = page_server_start_read(buf, nr, + read_page_complete, pr, flags); + return ret; +} + +static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, int nr, + void *buf, unsigned flags) +{ + pr_info("pr%lu-%u Read %lx %u pages\n", pr->img_id, pr->id, vaddr, nr); + pagemap_bound_check(pr->pe, vaddr, nr); + + if (pagemap_in_parent(pr->pe)) { + if (read_parent_page(pr, vaddr, nr, buf, flags) < 0) + return -1; + } else { + if (pr->maybe_read_page(pr, vaddr, nr, buf, flags) < 0) + return -1; + } + + pr->cvaddr += nr * PAGE_SIZE; + + return 1; +} + +static void free_pagemaps(struct page_read *pr) +{ + int i; + + for (i = 0; i < pr->nr_pmes; i++) + pagemap_entry__free_unpacked(pr->pmes[i], NULL); + + xfree(pr->pmes); + pr->pmes = NULL; +} + +static void advance_piov(struct page_read_iov *piov, ssize_t len) +{ + ssize_t olen = len; + int onr = piov->nr; + piov->from += len; + + while (len) { + struct iovec *cur = piov->to; + + if (cur->iov_len <= len) { + piov->to++; + piov->nr--; + len -= cur->iov_len; + continue; + } + + cur->iov_base += len; + cur->iov_len -= len; + break; + } + + pr_debug("Advanced iov %zu bytes, %d->%d iovs, %zu tail\n", + olen, onr, piov->nr, len); +} + +static int process_async_reads(struct page_read *pr) +{ + int fd, ret = 0; + struct page_read_iov *piov, *n; + + fd = img_raw_fd(pr->pi); + list_for_each_entry_safe(piov, n, &pr->async, l) { + ssize_t ret; + off_t start = piov->from; + struct iovec *iovs = piov->to; + + pr_debug("Read piov iovs %d, from %ju, len %ju, first %p:%zu\n", + piov->nr, piov->from, piov->end - piov->from, + piov->to->iov_base, piov->to->iov_len); +more: + ret = preadv(fd, piov->to, piov->nr, piov->from); + if (fault_injected(FI_PARTIAL_PAGES)) { + /* + * We might have read everything, but for debug + * purposes let's try to force the advance_piov() + * and re-read tail. 
+ */ + if (ret > 0 && piov->nr >= 2) { + pr_debug("`- trim preadv %zu\n", ret); + ret /= 2; + } + } + + if (ret != piov->end - piov->from) { + if (ret < 0) { + pr_err("Can't read async pr bytes (%zd / %ju read, %ju off, %d iovs)\n", + ret, piov->end - piov->from, piov->from, piov->nr); + return -1; + } + + /* + * The preadv() can return less than requested. It's + * valid and doesn't mean error or EOF. We should advance + * the iovecs and continue + * + * Modify the piov in-place, we're going to drop this one + * anyway. + */ + + advance_piov(piov, ret); + goto more; + } + + if (opts.auto_dedup && punch_hole(pr, start, ret, false)) + return -1; + + BUG_ON(pr->io_complete); /* FIXME -- implement once needed */ + + list_del(&piov->l); + xfree(iovs); + xfree(piov); + } + + if (pr->parent) + ret = process_async_reads(pr->parent); + + return ret; +} + +static void close_page_read(struct page_read *pr) +{ + int ret; + + BUG_ON(!list_empty(&pr->async)); + + if (pr->bunch.iov_len > 0) { + ret = punch_hole(pr, 0, 0, true); + if (ret == -1) + return; + + pr->bunch.iov_len = 0; + } + + if (pr->parent) { + close_page_read(pr->parent); + xfree(pr->parent); + } + + if (pr->pmi) + close_image(pr->pmi); + if (pr->pi) + close_image(pr->pi); + + if (pr->pmes) + free_pagemaps(pr); +} + +static void reset_pagemap(struct page_read *pr) +{ + pr->cvaddr = 0; + pr->pi_off = 0; + pr->curr_pme = -1; + pr->pe = NULL; + + /* FIXME: take care of bunch */ + + if (pr->parent) + reset_pagemap(pr->parent); +} + +static int try_open_parent(int dfd, unsigned long id, struct page_read *pr, int pr_flags) +{ + int pfd, ret; + struct page_read *parent = NULL; + + if (opts.remote) { + /* Note: we are replacing a real directory FD for a snapshot_id + * index. Since we need the parent of the current snapshot_id, + * we want the current snapshot_id index minus one. It is + * possible that dfd is already a snapshot_id index. We test it + * by comparing it to the service FD. 
When opening an image (see + * do_open_image) we convert the snapshot_id index into a real + * snapshot_id. + */ + pfd = dfd == get_service_fd(IMG_FD_OFF) ? + get_curr_snapshot_id_idx() - 1 : dfd - 1; + if (pfd < 0) + goto out; + } else { + pfd = openat(dfd, CR_PARENT_LINK, O_RDONLY); + if (pfd < 0 && errno == ENOENT) + goto out; + } + + parent = xmalloc(sizeof(*parent)); + if (!parent) + goto err_cl; + + ret = open_page_read_at(pfd, id, parent, pr_flags); + if (ret < 0) + goto err_free; + + if (!ret) { + xfree(parent); + parent = NULL; + } + + if (!opts.remote) + close(pfd); +out: + pr->parent = parent; + return 0; + +err_free: + xfree(parent); +err_cl: + if (!opts.remote) + close(pfd); + return -1; +} + +static void init_compat_pagemap_entry(PagemapEntry *pe) +{ + /* + * pagemap image generated with older version will either + * contain a hole because the pages are in the parent + * snapshot or a pagemap that should be marked with + * PE_PRESENT + */ + if (pe->has_in_parent && pe->in_parent) + pe->flags |= PE_PARENT; + else if (!pe->has_flags) + pe->flags = PE_PRESENT; +} + +/* + * The pagemap entry size is at least 8 bytes for small mappings with + * low address and may get to 18 bytes or even more for large mappings + * with high address and in_parent flag set. 16 seems to be nice round + * number to minimize {over,under}-allocations + */ +#define PAGEMAP_ENTRY_SIZE_ESTIMATE 16 + +static int init_pagemaps(struct page_read *pr) +{ + off_t fsize; + int nr_pmes, nr_realloc; + + if (!opts.remote) + fsize = img_raw_size(pr->pmi); + else + /* + * FIXME - There is no easy way to estimate the size of the + * pagemap that is still to be read from the socket. Possible + * solution is to ask Image Proxy or Image Cache about the size + * of the image. 1024 is a wild guess (more space is allocated + * if needed). 
+ */ + fsize = 1024; + + if (fsize < 0) + return -1; + + nr_pmes = fsize / PAGEMAP_ENTRY_SIZE_ESTIMATE + 1; + nr_realloc = nr_pmes / 2; + + pr->pmes = xzalloc(nr_pmes * sizeof(*pr->pmes)); + if (!pr->pmes) + return -1; + + pr->nr_pmes = 0; + pr->curr_pme = -1; + + while (1) { + int ret = pb_read_one_eof(pr->pmi, &pr->pmes[pr->nr_pmes], + PB_PAGEMAP); + if (ret < 0) + goto free_pagemaps; + if (ret == 0) + break; + + init_compat_pagemap_entry(pr->pmes[pr->nr_pmes]); + + pr->nr_pmes++; + if (pr->nr_pmes >= nr_pmes) { + PagemapEntry **new; + nr_pmes += nr_realloc; + new = xrealloc(pr->pmes, + nr_pmes * sizeof(*pr->pmes)); + if (!new) + goto free_pagemaps; + pr->pmes = new; + } + } + + close_image(pr->pmi); + pr->pmi = NULL; + + return 0; + +free_pagemaps: + free_pagemaps(pr); + return -1; +} + +int open_page_read_at(int dfd, unsigned long img_id, struct page_read *pr, int pr_flags) +{ + int flags, i_typ; + static unsigned ids = 1; + bool remote = pr_flags & PR_REMOTE; + + /* + * Only the top-most page-read can be remote, all the + * others are always local. 
+ */ + pr_flags &= ~PR_REMOTE; + if (opts.auto_dedup) + pr_flags |= PR_MOD; + if (pr_flags & PR_MOD) + flags = O_RDWR; + else + flags = O_RSTR; + + switch (pr_flags & PR_TYPE_MASK) { + case PR_TASK: + i_typ = CR_FD_PAGEMAP; + break; + case PR_SHMEM: + i_typ = CR_FD_SHMEM_PAGEMAP; + break; + default: + BUG(); + return -1; + } + + INIT_LIST_HEAD(&pr->async); + pr->pe = NULL; + pr->parent = NULL; + pr->cvaddr = 0; + pr->pi_off = 0; + pr->bunch.iov_len = 0; + pr->bunch.iov_base = NULL; + pr->pmes = NULL; + pr->pieok = false; + + pr->pmi = open_image_at(dfd, i_typ, O_RSTR, img_id); + if (!pr->pmi) + return -1; + + if (empty_image(pr->pmi)) { + close_image(pr->pmi); + return 0; + } + + if (try_open_parent(dfd, img_id, pr, pr_flags)) { + close_image(pr->pmi); + return -1; + } + + pr->pi = open_pages_image_at(dfd, flags, pr->pmi, &pr->pages_img_id); + if (!pr->pi) { + close_page_read(pr); + return -1; + } + + if (init_pagemaps(pr)) { + close_page_read(pr); + return -1; + } + + pr->read_pages = read_pagemap_page; + pr->advance = advance; + pr->close = close_page_read; + pr->skip_pages = skip_pagemap_pages; + pr->sync = process_async_reads; + pr->seek_pagemap = seek_pagemap; + pr->reset = reset_pagemap; + pr->io_complete = NULL; /* set up by the client if needed */ + pr->id = ids++; + pr->img_id = img_id; + + if (opts.remote) + pr->maybe_read_page = maybe_read_page_img_cache; + else if (remote) + pr->maybe_read_page = maybe_read_page_remote; + else { + pr->maybe_read_page = maybe_read_page_local; + if (!pr->parent && !opts.lazy_pages) + pr->pieok = true; + } + + pr_debug("Opened %s page read %u (parent %u)\n", + remote ? "remote" : "local", pr->id, + pr->parent ? 
pr->parent->id : 0); + + return 1; +} + +int open_page_read(unsigned long img_id, struct page_read *pr, int pr_flags) +{ + return open_page_read_at(get_service_fd(IMG_FD_OFF), img_id, pr, pr_flags); +} + + +#define DUP_IDS_BASE 1000 + +void dup_page_read(struct page_read *src, struct page_read *dst) +{ + static int dup_ids = 1; + + memcpy(dst, src, sizeof(*dst)); + INIT_LIST_HEAD(&dst->async); + dst->id = src->id + DUP_IDS_BASE * dup_ids++; + dst->reset(dst); +} diff --git a/CRIU_code/criu/parasite-syscall.c b/CRIU_code/criu/parasite-syscall.c new file mode 100644 index 0000000..b9788a4 --- /dev/null +++ b/CRIU_code/criu/parasite-syscall.c @@ -0,0 +1,577 @@ +#include +#include + +#include +#include +#include + +#include "common/config.h" +#include "common/compiler.h" +#include "types.h" +#include "protobuf.h" +#include "images/sa.pb-c.h" +#include "images/timer.pb-c.h" +#include "images/creds.pb-c.h" +#include "images/core.pb-c.h" +#include "images/pagemap.pb-c.h" + +#include "imgset.h" +#include "parasite-syscall.h" +#include "parasite.h" +#include "crtools.h" +#include "namespaces.h" +#include "kerndat.h" +#include "pstree.h" +#include "posix-timer.h" +#include "mem.h" +#include "criu-log.h" +#include "vma.h" +#include "proc_parse.h" +#include "aio.h" +#include "fault-injection.h" +#include +#include "signal.h" +#include "sigframe.h" + +#include +#include +#include + +#include "dump.h" +#include "restorer.h" +#include "pie/pie-relocs.h" + +#include "infect.h" +#include "infect-rpc.h" +#include "pie/parasite-blob.h" + +#include + +unsigned long get_exec_start(struct vm_area_list *vmas) +{ + struct vma_area *vma_area; + + list_for_each_entry(vma_area, &vmas->h, list) { + unsigned long len; + + if (vma_area->e->start >= kdat.task_size) + continue; + if (!(vma_area->e->prot & PROT_EXEC)) + continue; + + len = vma_area_len(vma_area); + if (len < PARASITE_START_AREA_MIN) { + pr_warn("Suspiciously short VMA @%#lx\n", (unsigned long)vma_area->e->start); + continue; + } + 
+ return vma_area->e->start; + } + + return 0; +} + +/* + * We need to detect parasite crashes not to hang on socket operations. + * Since CRIU holds parasite with ptrace, it will receive SIGCHLD if the + * latter would crash. + * + * This puts a restriction on how to execute a sub-process on dump stage. + * One should use the cr_system helper, that blocks SIGCHLD and waits + * for the spawned program to finish. + */ +static void sigchld_handler(int signal, siginfo_t *siginfo, void *data) +{ + int pid, status; + + pid = waitpid(-1, &status, WNOHANG); + if (pid <= 0) + return; + + pr_err("si_code=%d si_pid=%d si_status=%d\n", + siginfo->si_code, siginfo->si_pid, siginfo->si_status); + + if (WIFEXITED(status)) + pr_err("%d exited with %d unexpectedly\n", pid, WEXITSTATUS(status)); + else if (WIFSIGNALED(status)) + pr_err("%d was killed by %d unexpectedly: %s\n", + pid, WTERMSIG(status), strsignal(WTERMSIG(status))); + else if (WIFSTOPPED(status)) + pr_err("%d was stopped by %d unexpectedly\n", pid, WSTOPSIG(status)); + + exit(1); +} + +static int alloc_groups_copy_creds(CredsEntry *ce, struct parasite_dump_creds *c) +{ + BUILD_BUG_ON(sizeof(ce->groups[0]) != sizeof(c->groups[0])); + BUILD_BUG_ON(sizeof(ce->cap_inh[0]) != sizeof(c->cap_inh[0])); + BUILD_BUG_ON(sizeof(ce->cap_prm[0]) != sizeof(c->cap_prm[0])); + BUILD_BUG_ON(sizeof(ce->cap_eff[0]) != sizeof(c->cap_eff[0])); + BUILD_BUG_ON(sizeof(ce->cap_bnd[0]) != sizeof(c->cap_bnd[0])); + + BUG_ON(ce->n_cap_inh != CR_CAP_SIZE); + BUG_ON(ce->n_cap_prm != CR_CAP_SIZE); + BUG_ON(ce->n_cap_eff != CR_CAP_SIZE); + BUG_ON(ce->n_cap_bnd != CR_CAP_SIZE); + + memcpy(ce->cap_inh, c->cap_inh, sizeof(c->cap_inh[0]) * CR_CAP_SIZE); + memcpy(ce->cap_prm, c->cap_prm, sizeof(c->cap_prm[0]) * CR_CAP_SIZE); + memcpy(ce->cap_eff, c->cap_eff, sizeof(c->cap_eff[0]) * CR_CAP_SIZE); + memcpy(ce->cap_bnd, c->cap_bnd, sizeof(c->cap_bnd[0]) * CR_CAP_SIZE); + + ce->secbits = c->secbits; + ce->n_groups = c->ngroups; + + ce->groups = 
xmemdup(c->groups, sizeof(c->groups[0]) * c->ngroups); + + ce->uid = c->uids[0]; + ce->gid = c->gids[0]; + ce->euid = c->uids[1]; + ce->egid = c->gids[1]; + ce->suid = c->uids[2]; + ce->sgid = c->gids[2]; + ce->fsuid = c->uids[3]; + ce->fsgid = c->gids[3]; + + return ce->groups ? 0 : -ENOMEM; +} + +int parasite_dump_thread_leader_seized(struct parasite_ctl *ctl, int pid, CoreEntry *core) +{ + ThreadCoreEntry *tc = core->thread_core; + struct parasite_dump_thread *args; + struct parasite_dump_creds *pc; + int ret; + + args = compel_parasite_args(ctl, struct parasite_dump_thread); + + pc = args->creds; + pc->cap_last_cap = kdat.last_cap; + + ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_THREAD, ctl); + if (ret < 0) + return ret; + + ret = alloc_groups_copy_creds(tc->creds, pc); + if (ret) { + pr_err("Can't copy creds for thread leader %d\n", pid); + return -1; + } + + return dump_thread_core(pid, core, args); +} + +int parasite_dump_thread_seized(struct parasite_thread_ctl *tctl, + struct parasite_ctl *ctl, int id, + struct pid *tid, CoreEntry *core) +{ + struct parasite_dump_thread *args; + pid_t pid = tid->real; + ThreadCoreEntry *tc = core->thread_core; + CredsEntry *creds = tc->creds; + struct parasite_dump_creds *pc; + int ret; + + BUG_ON(id == 0); /* Leader is dumped in dump_task_core_all */ + + args = compel_parasite_args(ctl, struct parasite_dump_thread); + + pc = args->creds; + pc->cap_last_cap = kdat.last_cap; + + tc->has_blk_sigset = true; + memcpy(&tc->blk_sigset, compel_thread_sigmask(tctl), sizeof(k_rtsigset_t)); + ret = compel_get_thread_regs(tctl, save_task_regs, core); + if (ret) { + pr_err("Can't obtain regs for thread %d\n", pid); + goto err_rth; + } + + ret = compel_run_in_thread(tctl, PARASITE_CMD_DUMP_THREAD); + if (ret) { + pr_err("Can't init thread in parasite %d\n", pid); + goto err_rth; + } + + ret = alloc_groups_copy_creds(creds, pc); + if (ret) { + pr_err("Can't copy creds for thread %d\n", pid); + goto err_rth; + } + + 
compel_release_thread(tctl); + + tid->ns[0].virt = args->tid; + return dump_thread_core(pid, core, args); + +err_rth: + compel_release_thread(tctl); + return -1; +} + +int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *item) +{ + TaskCoreEntry *tc = item->core[0]->tc; + struct parasite_dump_sa_args *args; + int ret, sig; + SaEntry *sa, **psa; + + args = compel_parasite_args(ctl, struct parasite_dump_sa_args); + + ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_SIGACTS, ctl); + if (ret < 0) + return ret; + + psa = xmalloc((SIGMAX - 2) * (sizeof(SaEntry *) + sizeof(SaEntry))); + if (!psa) + return -1; + + sa = (SaEntry *)(psa + SIGMAX - 2); + + tc->n_sigactions = SIGMAX - 2; + tc->sigactions = psa; + + for (sig = 1; sig <= SIGMAX; sig++) { + int i = sig - 1; + + if (sig == SIGSTOP || sig == SIGKILL) + continue; + + sa_entry__init(sa); + ASSIGN_TYPED(sa->sigaction, encode_pointer(args->sas[i].rt_sa_handler)); + ASSIGN_TYPED(sa->flags, args->sas[i].rt_sa_flags); + ASSIGN_TYPED(sa->restorer, encode_pointer(args->sas[i].rt_sa_restorer)); + BUILD_BUG_ON(sizeof(sa->mask) != sizeof(args->sas[0].rt_sa_mask.sig)); + memcpy(&sa->mask, args->sas[i].rt_sa_mask.sig, sizeof(sa->mask)); + sa->has_compat_sigaction = true; + sa->compat_sigaction = !compel_mode_native(ctl); + + *(psa++) = sa++; + } + + return 0; +} + +static void encode_itimer(struct itimerval *v, ItimerEntry *ie) +{ + ie->isec = v->it_interval.tv_sec; + ie->iusec = v->it_interval.tv_usec; + ie->vsec = v->it_value.tv_sec; + ie->vusec = v->it_value.tv_usec; +} + +int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *item) +{ + CoreEntry *core = item->core[0]; + struct parasite_dump_itimers_args *args; + int ret; + + args = compel_parasite_args(ctl, struct parasite_dump_itimers_args); + + ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_ITIMERS, ctl); + if (ret < 0) + return ret; + + encode_itimer((&args->real), (core->tc->timers->real)); \ + encode_itimer((&args->virt), 
(core->tc->timers->virt)); \ + encode_itimer((&args->prof), (core->tc->timers->prof)); \ + + return 0; +} + +static int core_alloc_posix_timers(TaskTimersEntry *tte, int n, + PosixTimerEntry **pte) +{ + int sz; + + /* + * Will be free()-ed in core_entry_free() + */ + + sz = n * (sizeof(PosixTimerEntry *) + sizeof(PosixTimerEntry)); + tte->posix = xmalloc(sz); + if (!tte->posix) + return -1; + + tte->n_posix = n; + *pte = (PosixTimerEntry *)(tte->posix + n); + return 0; +} + +static void encode_posix_timer(struct posix_timer *v, + struct proc_posix_timer *vp, PosixTimerEntry *pte) +{ + pte->it_id = vp->spt.it_id; + pte->clock_id = vp->spt.clock_id; + pte->si_signo = vp->spt.si_signo; + pte->it_sigev_notify = vp->spt.it_sigev_notify; + pte->sival_ptr = encode_pointer(vp->spt.sival_ptr); + + pte->overrun = v->overrun; + + pte->isec = v->val.it_interval.tv_sec; + pte->insec = v->val.it_interval.tv_nsec; + pte->vsec = v->val.it_value.tv_sec; + pte->vnsec = v->val.it_value.tv_nsec; +} + +int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args, + struct parasite_ctl *ctl, struct pstree_item *item) +{ + CoreEntry *core = item->core[0]; + TaskTimersEntry *tte = core->tc->timers; + PosixTimerEntry *pte; + struct proc_posix_timer *temp; + struct parasite_dump_posix_timers_args *args; + int args_size; + int ret = 0; + int i; + + if (core_alloc_posix_timers(tte, proc_args->timer_n, &pte)) + return -1; + + args_size = posix_timers_dump_size(proc_args->timer_n); + args = compel_parasite_args_s(ctl, args_size); + args->timer_n = proc_args->timer_n; + + i = 0; + list_for_each_entry(temp, &proc_args->timers, list) { + args->timer[i].it_id = temp->spt.it_id; + i++; + } + + ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_POSIX_TIMERS, ctl); + if (ret < 0) + goto end_posix; + + i = 0; + list_for_each_entry(temp, &proc_args->timers, list) { + posix_timer_entry__init(&pte[i]); + encode_posix_timer(&args->timer[i], temp, &pte[i]); + tte->posix[i] = &pte[i]; + i++; + } 
+ +end_posix: + free_posix_timers(proc_args); + return ret; +} + +int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_misc *misc) +{ + struct parasite_dump_misc *ma; + + ma = compel_parasite_args(ctl, struct parasite_dump_misc); + if (compel_rpc_call_sync(PARASITE_CMD_DUMP_MISC, ctl) < 0) + return -1; + + *misc = *ma; + return 0; +} + +struct parasite_tty_args *parasite_dump_tty(struct parasite_ctl *ctl, int fd, int type) +{ + struct parasite_tty_args *p; + + p = compel_parasite_args(ctl, struct parasite_tty_args); + p->fd = fd; + p->type = type; + + if (compel_rpc_call_sync(PARASITE_CMD_DUMP_TTY, ctl) < 0) + return NULL; + + return p; +} + +int parasite_drain_fds_seized(struct parasite_ctl *ctl, + struct parasite_drain_fd *dfds, int nr_fds, int off, + int *lfds, struct fd_opts *opts) +{ + int ret = -1, size, sk; + struct parasite_drain_fd *args; + + size = drain_fds_size(dfds); + args = compel_parasite_args_s(ctl, size); + args->nr_fds = nr_fds; + memcpy(&args->fds, dfds->fds + off, sizeof(int) * nr_fds); + + ret = compel_rpc_call(PARASITE_CMD_DRAIN_FDS, ctl); + if (ret) { + pr_err("Parasite failed to drain descriptors\n"); + goto err; + } + + sk = compel_rpc_sock(ctl); + ret = recv_fds(sk, lfds, nr_fds, opts, sizeof(struct fd_opts)); + if (ret) + pr_err("Can't retrieve FDs from socket\n"); + + ret |= compel_rpc_sync(PARASITE_CMD_DRAIN_FDS, ctl); +err: + return ret; +} + +int parasite_get_proc_fd_seized(struct parasite_ctl *ctl) +{ + int ret = -1, fd, sk; + + ret = compel_rpc_call(PARASITE_CMD_GET_PROC_FD, ctl); + if (ret) { + pr_err("Parasite failed to get proc fd\n"); + return ret; + } + + sk = compel_rpc_sock(ctl); + fd = recv_fd(sk); + if (fd < 0) + pr_err("Can't retrieve FD from socket\n"); + if (compel_rpc_sync(PARASITE_CMD_GET_PROC_FD, ctl)) { + close_safe(&fd); + return -1; + } + + return fd; +} + +/* This is officially the 50000'th line in the CRIU source code */ + +int parasite_dump_cgroup(struct parasite_ctl *ctl, struct 
parasite_dump_cgroup_args *cgroup) +{ + int ret; + struct parasite_dump_cgroup_args *ca; + + ca = compel_parasite_args(ctl, struct parasite_dump_cgroup_args); + ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_CGROUP, ctl); + if (ret) { + pr_err("Parasite failed to dump /proc/self/cgroup\n"); + return ret; + } + + *cgroup = *ca; + return 0; +} + +static unsigned long parasite_args_size = PARASITE_ARG_SIZE_MIN; +void parasite_ensure_args_size(unsigned long sz) +{ + if (parasite_args_size < sz) + parasite_args_size = sz; +} + +static int make_sigframe(void *arg, struct rt_sigframe *sf, struct rt_sigframe *rtsf, k_rtsigset_t *bs) +{ + return construct_sigframe(sf, rtsf, bs, (CoreEntry *)arg); +} + +static int parasite_prepare_threads(struct parasite_ctl *ctl, + struct pstree_item *item) +{ + struct parasite_thread_ctl **thread_ctls; + uint64_t *thread_sp; + int i; + + thread_ctls = xzalloc(sizeof(*thread_ctls) * item->nr_threads); + if (!thread_ctls) + return -1; + + thread_sp = xzalloc(sizeof(*thread_sp) * item->nr_threads); + if (!thread_sp) + goto free_ctls; + + for (i = 0; i < item->nr_threads; i++) { + struct pid *tid = &item->threads[i]; + + if (item->pid->real == tid->real) { + thread_sp[i] = compel_get_leader_sp(ctl); + continue; + } + + thread_ctls[i] = compel_prepare_thread(ctl, tid->real); + if (!thread_ctls[i]) + goto free_sp; + + thread_sp[i] = compel_get_thread_sp(thread_ctls[i]); + } + + dmpi(item)->thread_ctls = thread_ctls; + dmpi(item)->thread_sp = thread_sp; + + return 0; + +free_sp: + xfree(thread_sp); +free_ctls: + xfree(thread_ctls); + return -1; +} + +struct parasite_ctl *parasite_infect_seized(pid_t pid, struct pstree_item *item, + struct vm_area_list *vma_area_list) +{ + struct parasite_ctl *ctl; + struct infect_ctx *ictx; + unsigned long p; + int ret; + + BUG_ON(item->threads[0].real != pid); + + p = get_exec_start(vma_area_list); + if (!p) { + pr_err("No suitable VM found\n"); + return NULL; + } + + ctl = compel_prepare_noctx(pid); + if (!ctl) + 
return NULL; + + ret = parasite_prepare_threads(ctl, item); + if (ret) + return NULL; + + ictx = compel_infect_ctx(ctl); + + ictx->open_proc = do_open_proc; + ictx->child_handler = sigchld_handler; + ictx->orig_handler.sa_handler = SIG_DFL; + ictx->orig_handler.sa_flags = SA_SIGINFO | SA_RESTART; + sigemptyset(&ictx->orig_handler.sa_mask); + sigaddset(&ictx->orig_handler.sa_mask, SIGCHLD); + ictx->sock = dmpi(item)->netns->net.seqsk; + ictx->save_regs = save_task_regs; + ictx->make_sigframe = make_sigframe; + ictx->regs_arg = item->core[0]; + ictx->task_size = kdat.task_size; + ictx->syscall_ip = p; + pr_debug("Parasite syscall_ip at %#lx\n", p); + + if (fault_injected(FI_NO_MEMFD)) + ictx->flags |= INFECT_NO_MEMFD; + if (fault_injected(FI_PARASITE_CONNECT)) + ictx->flags |= INFECT_FAIL_CONNECT; + if (fault_injected(FI_NO_BREAKPOINTS)) + ictx->flags |= INFECT_NO_BREAKPOINTS; + if (kdat.compat_cr) + ictx->flags |= INFECT_COMPATIBLE; + if (kdat.x86_has_ptrace_fpu_xsave_bug) + ictx->flags |= INFECT_X86_PTRACE_MXCSR_BUG; + + ictx->log_fd = log_get_fd(); + + parasite_setup_c_header(ctl); + + parasite_ensure_args_size(dump_pages_args_size(vma_area_list)); + parasite_ensure_args_size(aio_rings_args_size(vma_area_list)); + + if (compel_infect(ctl, item->nr_threads, parasite_args_size) < 0) { + compel_cure(ctl); + return NULL; + } + + parasite_args_size = PARASITE_ARG_SIZE_MIN; /* reset for next task */ + memcpy(&item->core[0]->tc->blk_sigset, compel_task_sigmask(ctl), sizeof(k_rtsigset_t)); + dmpi(item)->parasite_ctl = ctl; + + return ctl; +} diff --git a/CRIU_code/criu/path.c b/CRIU_code/criu/path.c new file mode 100644 index 0000000..22a89a4 --- /dev/null +++ b/CRIU_code/criu/path.c @@ -0,0 +1,105 @@ +#include +#include +#include + +#include "int.h" +#include "mount.h" +#include "path.h" +#include "log.h" +#include "common/bug.h" + +char *cut_root_for_bind(char *target_root, char *source_root) +{ + int tok = 0; + char *path = NULL; + /* + * Cut common part of root. 
+ * For non-root binds the source is always "/" (checked) + * so this will result in this slash removal only. + */ + while (target_root[tok] == source_root[tok]) { + tok++; + if (source_root[tok] == '\0') { + path = target_root + tok; + goto out; + } + if (target_root[tok] == '\0') { + path = source_root + tok; + goto out; + } + } + + return NULL; +out: + BUG_ON(path == NULL); + if (path[0] == '/') + path++; + + return path; +} + +char *mnt_get_sibling_path(struct mount_info *m, + struct mount_info *p, char *buf, int len) +{ + struct mount_info *pa = m->parent; + char *rpath, *cut_root, *path = buf; + int off = 0; + + if (pa == NULL) + return NULL; + + rpath = m->mountpoint + strlen(pa->mountpoint); + if (rpath[0] == '/') + rpath++; + + /* + * Get a path to a sibling of "m" with parent "p", + * return NULL if p can't have a sibling of m. + * + * Here are two cases: + * When a parent of "m" has longer root than "p": + * / pm->root / rpath + * | cut_root | + * / p->root / + * In this case, a sibling path is a sum of p->mountpoint, + * cut_root and rpath. + * + * When a parent of m has shorter root than "p": + * / pm->root / rpath + * | cut_root | + * / p->root / rpath +strlen(cut_root) + * In this case, a sibling path is a sum of p->mountpoint and + * rpath - strlen(cut_root). 
+ */ + + cut_root = cut_root_for_bind(pa->root, p->root); + if (cut_root == NULL) + return NULL; + if (p->mountpoint[1] != 0) /* not "/" */ { + off = snprintf(path, len, "%s", p->mountpoint); + if (path[off - 1] == '/') /* p->mountpoint = "./" */ + off--; + } + len -= off; + path += off; + + if (strlen(pa->root) > strlen(p->root)) { + off = snprintf(path, len, "/%s", cut_root); + len -= off; + path += off; + } else { + int len = strlen(cut_root); + if (strncmp(rpath, cut_root, len)) + return NULL; + rpath += strlen(cut_root); + if (len > 0 && (rpath[0] && rpath[0] != '/')) + return NULL; + } + if (rpath[0] == '/') + rpath++; + + if (rpath[0] != '\0') + snprintf(path, len, "/%s", rpath); + + return buf; +} diff --git a/CRIU_code/criu/pie-util-vdso-elf32.c b/CRIU_code/criu/pie-util-vdso-elf32.c new file mode 100644 index 0000000..961bb1d --- /dev/null +++ b/CRIU_code/criu/pie-util-vdso-elf32.c @@ -0,0 +1 @@ +pie/util-vdso-elf32.c \ No newline at end of file diff --git a/CRIU_code/criu/pie-util-vdso.c b/CRIU_code/criu/pie-util-vdso.c new file mode 100644 index 0000000..6e56238 --- /dev/null +++ b/CRIU_code/criu/pie-util-vdso.c @@ -0,0 +1 @@ +pie/util-vdso.c \ No newline at end of file diff --git a/CRIU_code/criu/pie-util.c b/CRIU_code/criu/pie-util.c new file mode 100644 index 0000000..238f297 --- /dev/null +++ b/CRIU_code/criu/pie-util.c @@ -0,0 +1 @@ +pie/util.c \ No newline at end of file diff --git a/CRIU_code/criu/pie/Makefile b/CRIU_code/criu/pie/Makefile new file mode 100644 index 0000000..1ad456f --- /dev/null +++ b/CRIU_code/criu/pie/Makefile @@ -0,0 +1,53 @@ +# Recipes to compile PIEs: parasite and restorer +# Compel will deal with converting the result binaries +# to a C array to be used in CRIU. 
+ +target := parasite restorer + +CFLAGS := $(filter-out -pg $(CFLAGS-GCOV) $(CFLAGS-ASAN),$(CFLAGS)) +CFLAGS += $(CFLAGS_PIE) +ccflags-y += -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0 +ccflags-y += -Wp,-U_FORTIFY_SOURCE -Wp,-D_FORTIFY_SOURCE=0 + +ifneq ($(filter-out clean mrproper,$(MAKECMDGOALS)),) + LDFLAGS += $(shell $(COMPEL_BIN) ldflags) + compel_plugins := $(shell $(COMPEL_BIN) plugins) +endif + +LDS := compel/arch/$(SRCARCH)/scripts/compel-pack.lds.S + +restorer-obj-y += parasite-vdso.o ./$(ARCH_DIR)/vdso-pie.o +restorer-obj-y += ./$(ARCH_DIR)/restorer.o + +ifeq ($(ARCH),x86) + ifeq ($(CONFIG_COMPAT),y) + restorer-obj-y += ./$(ARCH_DIR)/restorer_unmap.o + restorer-obj-y += ./$(ARCH_DIR)/sigaction_compat_pie.o + endif +endif + +ifeq ($(SRCARCH),aarch64) + restorer-obj-y += ./$(ARCH_DIR)/intraprocedure.o +endif + +ifeq ($(SRCARCH),ppc64) + restorer-obj-y += ./$(ARCH_DIR)/vdso-trampoline.o +endif + +define gen-pie-rules +$(1)-obj-y += $(1).o +$(1)-obj-e += pie.lib.a +$(1)-obj-e += $$(compel_plugins) + +# Dependency on compel linker script, to relink if it has changed +$$(obj)/$(1).built-in.o: $$(LDS) $$(compel_plugins) + +$$(obj)/$(1)-blob.h: $$(obj)/$(1).built-in.o + $$(call msg-gen, $$@) + $$(Q) $$(COMPEL_BIN) hgen -f $$< -o $$@ + +all-y += $$(obj)/$(1)-blob.h +cleanup-y += $$(obj)/$(1)-blob.h +endef + +$(foreach t,$(target),$(eval $(call gen-pie-rules,$(t)))) diff --git a/CRIU_code/criu/pie/Makefile.library b/CRIU_code/criu/pie/Makefile.library new file mode 100644 index 0000000..658c8a4 --- /dev/null +++ b/CRIU_code/criu/pie/Makefile.library @@ -0,0 +1,25 @@ +# PIE library is a static library that's going to be linked into +# *both* CRIU binary and PIEs (parasite/restorer). +# Please, make sure that you're including here only objects +# those will be used in CRIU too. 
For objects files only for PIEs +# edit their separate recipes criu/pie/Makefile + +lib-name := pie.lib.a + +lib-y += util.o +lib-y += util-vdso.o + +ifeq ($(SRCARCH),x86) + ifeq ($(CONFIG_COMPAT),y) + lib-y += util-vdso-elf32.o + endif + CFLAGS_util-vdso-elf32.o += -DCONFIG_VDSO_32 +endif + +ifeq ($(SRCARCH),arm) + lib-y += ./$(ARCH_DIR)/aeabi-helpers.o + lib-y += ./$(ARCH_DIR)/pie-cacheflush.o +endif + +CFLAGS := $(filter-out -pg $(CFLAGS-GCOV) $(CFLAGS-ASAN),$(CFLAGS)) +CFLAGS += $(CFLAGS_PIE) diff --git a/CRIU_code/criu/pie/parasite-vdso.c b/CRIU_code/criu/pie/parasite-vdso.c new file mode 100644 index 0000000..00bc2bf --- /dev/null +++ b/CRIU_code/criu/pie/parasite-vdso.c @@ -0,0 +1,299 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "int.h" +#include "types.h" +#include "page.h" +#include +#include "image.h" +#include "parasite-vdso.h" +#include "vma.h" +#include "log.h" +#include "common/bug.h" + +#ifdef LOG_PREFIX +# undef LOG_PREFIX +#endif +#define LOG_PREFIX "vdso: " + + +static int vdso_remap(char *who, unsigned long from, unsigned long to, size_t size) +{ + unsigned long addr; + + pr_debug("Remap %s %lx -> %lx\n", who, from, to); + + addr = sys_mremap(from, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, to); + if (addr != to) { + pr_err("Unable to remap %lx -> %lx %lx\n", + from, to, addr); + return -1; + } + + return 0; +} + +/* + * Park runtime vDSO in some safe place where it can be accessible + * from the restorer + */ +int vdso_do_park(struct vdso_maps *rt, unsigned long park_at, + unsigned long park_size) +{ + unsigned long vvar_size = rt->sym.vvar_size; + unsigned long vdso_size = rt->sym.vdso_size; + unsigned long rt_vvar_park = park_at; + unsigned long rt_vdso_park = park_at; + int ret; + + + if (rt->vvar_start == VVAR_BAD_ADDR) { + BUG_ON(vdso_size < park_size); + return vdso_remap("rt-vdso", rt->vdso_start, + rt_vdso_park, vdso_size); + } + + BUG_ON((vdso_size + vvar_size) < park_size); 
+ + if (rt->sym.vdso_before_vvar) + rt_vvar_park = park_at + vdso_size; + else + rt_vdso_park = park_at + vvar_size; + + ret = vdso_remap("rt-vdso", rt->vdso_start, rt_vdso_park, vdso_size); + ret |= vdso_remap("rt-vvar", rt->vvar_start, rt_vvar_park, vvar_size); + + return ret; +} + +#ifndef CONFIG_COMPAT +static int __vdso_fill_symtable(uintptr_t mem, size_t size, + struct vdso_symtable *t, bool __always_unused compat_vdso) +{ + return vdso_fill_symtable(mem, size, t); +} +#endif + +/* + * Proxification strategy + * + * - There might be two vDSO zones: vdso code and optionally vvar data + * - To be able to use in-place remapping we need + * + * a) Size and order of vDSO zones are to match + * b) Symbols offsets must match + * c) Have same number of vDSO zones + */ +static bool blobs_matches(VmaEntry *vdso_img, VmaEntry *vvar_img, + struct vdso_symtable *sym_img, struct vdso_symtable *sym_rt) +{ + unsigned long vdso_size = vma_entry_len(vdso_img); + unsigned long rt_vdso_size = sym_rt->vdso_size; + size_t i; + + if (vdso_size != rt_vdso_size) { + pr_info("size differs: %lx != %lx (rt)\n", + vdso_size, rt_vdso_size); + return false; + } + + for (i = 0; i < ARRAY_SIZE(sym_img->symbols); i++) { + unsigned long sym_offset = sym_img->symbols[i].offset; + unsigned long rt_sym_offset = sym_rt->symbols[i].offset; + char *sym_name = sym_img->symbols[i].name; + + if (sym_offset != rt_sym_offset) { + pr_info("[%zu]`%s` offset differs: %lx != %lx (rt)\n", + i, sym_name, sym_offset, rt_sym_offset); + return false; + } + } + + if (vvar_img && sym_rt->vvar_size != VVAR_BAD_SIZE) { + bool vdso_firstly = (vvar_img->start > vdso_img->start); + unsigned long vvar_size = vma_entry_len(vvar_img); + unsigned long rt_vvar_size = sym_rt->vvar_size; + + if (vvar_size != rt_vvar_size) { + pr_info("vvar size differs: %lx != %lx (rt)\n", + vvar_size, rt_vvar_size); /* log the vvar sizes that were compared, not the vdso ones */ + return false; + } + + if (vdso_firstly != sym_rt->vdso_before_vvar) { + pr_info("[%s] pair has different order\n", + 
vdso_firstly ? "vdso/vvar" : "vvar/vdso"); + return false; + } + } + + return true; +} + +/* + * The easy case -- the vdso from an image has the same offsets, + * order and size as runtime vdso, so we simply remap runtime vdso + * to dumpee position without generating any proxy. + */ +static int remap_rt_vdso(VmaEntry *vma_vdso, VmaEntry *vma_vvar, + struct vdso_symtable *sym_rt, unsigned long vdso_rt_parked_at) +{ + unsigned long rt_vvar_addr = vdso_rt_parked_at; + unsigned long rt_vdso_addr = vdso_rt_parked_at; + void *remap_addr; + int ret; + + pr_info("Runtime vdso/vvar matches dumpee, remap inplace\n"); + + /* + * Ugly casts for 32bit platforms, which don't like uint64_t + * cast to (void *) + */ + remap_addr = (void *)(uintptr_t)vma_vdso->start; + if (sys_munmap(remap_addr, vma_entry_len(vma_vdso))) { + pr_err("Failed to unmap dumpee vdso\n"); + return -1; + } + + if (!vma_vvar) { + return vdso_remap("rt-vdso", rt_vdso_addr, + vma_vdso->start, sym_rt->vdso_size); + } + + remap_addr = (void *)(uintptr_t)vma_vvar->start; + if (sys_munmap(remap_addr, vma_entry_len(vma_vvar))) { + pr_err("Failed to unmap dumpee vvar\n"); + return -1; + } + + if (vma_vdso->start < vma_vvar->start) + rt_vvar_addr = vdso_rt_parked_at + sym_rt->vdso_size; + else + rt_vdso_addr = vdso_rt_parked_at + sym_rt->vvar_size; + + ret = vdso_remap("rt-vdso", rt_vdso_addr, + vma_vdso->start, sym_rt->vdso_size); + ret |= vdso_remap("rt-vvar", rt_vvar_addr, + vma_vvar->start, sym_rt->vvar_size); + + return ret; +} + +/* + * The complex case -- we need to proxify calls. We redirect + * calls from dumpee vdso to runtime vdso, making dumpee + * to operate as proxy vdso. 
+ */ +static int add_vdso_proxy(VmaEntry *vma_vdso, VmaEntry *vma_vvar, + struct vdso_symtable *sym_img, struct vdso_symtable *sym_rt, + unsigned long vdso_rt_parked_at, bool compat_vdso) +{ + unsigned long rt_vvar_addr = vdso_rt_parked_at; + unsigned long rt_vdso_addr = vdso_rt_parked_at; + unsigned long orig_vvar_addr = + vma_vvar ? vma_vvar->start : VVAR_BAD_ADDR; + + pr_info("Runtime vdso mismatches dumpee, generate proxy\n"); + + /* + * Don't forget to shift if vvar is before vdso. + */ + if (sym_rt->vvar_size == VVAR_BAD_SIZE) { + rt_vvar_addr = VVAR_BAD_ADDR; + } else { + if (sym_rt->vdso_before_vvar) + rt_vvar_addr += sym_rt->vdso_size; + else + rt_vdso_addr += sym_rt->vvar_size; + } + + /* + * Note: we assume that after first migration with inserted + * rt-vdso and trampolines on the following migrations + * number of vdso symbols will not decrease. + * We don't save the content of original vdso under inserted + * jumps, so we can't remove them if on the following migration + * found that number of symbols in vdso has decreased. + */ + if (vdso_redirect_calls(rt_vdso_addr, vma_vdso->start, + sym_rt, sym_img, compat_vdso)) { + pr_err("Failed to proxify dumpee contents\n"); + return -1; + } + + /* + * Put a special mark into runtime vdso, thus at next checkpoint + * routine we could detect this vdso and do not dump it, since + * it's auto-generated every new session if proxy required. 
+ */ + sys_mprotect((void *)rt_vdso_addr, sym_rt->vdso_size, PROT_WRITE); + vdso_put_mark((void *)rt_vdso_addr, rt_vvar_addr, + vma_vdso->start, orig_vvar_addr); + sys_mprotect((void *)rt_vdso_addr, sym_rt->vdso_size, VDSO_PROT); + + return 0; +} + +int vdso_proxify(struct vdso_symtable *sym_rt, unsigned long vdso_rt_parked_at, + VmaEntry *vmas, size_t nr_vmas, + bool compat_vdso, bool force_trampolines) +{ + VmaEntry *vma_vdso = NULL, *vma_vvar = NULL; + struct vdso_symtable s = VDSO_SYMTABLE_INIT; + unsigned int i; + + for (i = 0; i < nr_vmas; i++) { + if (vma_entry_is(&vmas[i], VMA_AREA_VDSO)) + vma_vdso = &vmas[i]; + else if (vma_entry_is(&vmas[i], VMA_AREA_VVAR)) + vma_vvar = &vmas[i]; + } + + if (!vma_vdso && !vma_vvar) { + pr_info("No VVAR, no vDSO in image\n"); + /* + * We don't have to unmap rt-vdso, rt-vvar as we didn't + * park them previously. + */ + return 0; + } + + if (!vma_vdso) { + pr_err("Can't find vDSO area in image\n"); + return -1; + } + + /* + * vDSO mark overwrites Elf program header of proxy vDSO thus + * it must never ever be greater in size. + */ + BUILD_BUG_ON(sizeof(struct vdso_mark) > sizeof(Elf64_Phdr)); + + /* + * Find symbols in vDSO zone read from image. + */ + if (__vdso_fill_symtable((uintptr_t)vma_vdso->start, + vma_entry_len(vma_vdso), &s, compat_vdso)) + return -1; + + pr_debug("image [vdso] %lx-%lx [vvar] %lx-%lx\n", + (unsigned long)vma_vdso->start, (unsigned long)vma_vdso->end, + vma_vvar ? (unsigned long)vma_vvar->start : VVAR_BAD_ADDR, + vma_vvar ? 
(unsigned long)vma_vvar->end : VVAR_BAD_ADDR); + + if (blobs_matches(vma_vdso, vma_vvar, &s, sym_rt) && !force_trampolines) { + return remap_rt_vdso(vma_vdso, vma_vvar, + sym_rt, vdso_rt_parked_at); + } + + return add_vdso_proxy(vma_vdso, vma_vvar, &s, sym_rt, + vdso_rt_parked_at, compat_vdso); +} diff --git a/CRIU_code/criu/pie/parasite.c b/CRIU_code/criu/pie/parasite.c new file mode 100644 index 0000000..01bacd3 --- /dev/null +++ b/CRIU_code/criu/pie/parasite.c @@ -0,0 +1,717 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common/config.h" +#include "int.h" +#include "types.h" +#include +#include "parasite.h" +#include "fcntl.h" +#include "prctl.h" +#include "common/lock.h" +#include "parasite-vdso.h" +#include "criu-log.h" +#include "tty.h" +#include "aio.h" + +#include "asm/parasite.h" +#include "restorer.h" +#include "infect-pie.h" + +/* + * PARASITE_CMD_DUMPPAGES is called many times and the parasite args contains + * an array of VMAs at this time, so VMAs can be unprotected in any moment + */ +static struct parasite_dump_pages_args *mprotect_args = NULL; + +#ifndef SPLICE_F_GIFT +#define SPLICE_F_GIFT 0x08 +#endif + +#ifndef PR_GET_PDEATHSIG +#define PR_GET_PDEATHSIG 2 +#endif + +static int mprotect_vmas(struct parasite_dump_pages_args *args) +{ + struct parasite_vma_entry *vmas, *vma; + int ret = 0, i; + + vmas = pargs_vmas(args); + for (i = 0; i < args->nr_vmas; i++) { + vma = vmas + i; + ret = sys_mprotect((void *)vma->start, vma->len, vma->prot | args->add_prot); + if (ret) { + pr_err("mprotect(%08lx, %lu) failed with code %d\n", + vma->start, vma->len, ret); + break; + } + } + + if (args->add_prot) + mprotect_args = args; + else + mprotect_args = NULL; + + return ret; +} + +static int dump_pages(struct parasite_dump_pages_args *args) +{ + int p, ret, tsock; + struct iovec *iovs; + int off, nr_segs; + unsigned long spliced_bytes = 0; + + tsock = parasite_get_rpc_sock(); + p = recv_fd(tsock); + if 
(p < 0) + return -1; + + iovs = pargs_iovs(args); + off = 0; + nr_segs = args->nr_segs; + if (nr_segs > UIO_MAXIOV) + nr_segs = UIO_MAXIOV; + while (1) { + ret = sys_vmsplice(p, &iovs[args->off + off], nr_segs, + SPLICE_F_GIFT | SPLICE_F_NONBLOCK); + if (ret < 0) { + sys_close(p); + pr_err("Can't splice pages to pipe (%d/%d/%d)\n", + ret, nr_segs, args->off + off); + return -1; + } + spliced_bytes += ret; + off += nr_segs; + if (off == args->nr_segs) + break; + if (off + nr_segs > args->nr_segs) + nr_segs = args->nr_segs - off; + } + if (spliced_bytes != args->nr_pages * PAGE_SIZE) { + sys_close(p); + pr_err("Can't splice all pages to pipe (%lu/%d)\n", spliced_bytes, args->nr_pages); + return -1; + } + + sys_close(p); + return 0; +} + +static int dump_sigact(struct parasite_dump_sa_args *da) +{ + int sig, ret = 0; + + for (sig = 1; sig <= SIGMAX; sig++) { + int i = sig - 1; + + if (sig == SIGKILL || sig == SIGSTOP) + continue; + + ret = sys_sigaction(sig, NULL, &da->sas[i], sizeof(k_rtsigset_t)); + if (ret < 0) { + pr_err("sys_sigaction failed (%d)\n", ret); + break; + } + } + + return ret; +} + +static int dump_itimers(struct parasite_dump_itimers_args *args) +{ + int ret; + + ret = sys_getitimer(ITIMER_REAL, &args->real); + if (!ret) + ret = sys_getitimer(ITIMER_VIRTUAL, &args->virt); + if (!ret) + ret = sys_getitimer(ITIMER_PROF, &args->prof); + + if (ret) + pr_err("getitimer failed (%d)\n", ret); + + return ret; +} + +static int dump_posix_timers(struct parasite_dump_posix_timers_args *args) +{ + int i; + int ret = 0; + + for(i = 0; i < args->timer_n; i++) { + ret = sys_timer_gettime(args->timer[i].it_id, &args->timer[i].val); + if (ret < 0) { + pr_err("sys_timer_gettime failed (%d)\n", ret); + return ret; + } + args->timer[i].overrun = sys_timer_getoverrun(args->timer[i].it_id); + ret = args->timer[i].overrun; + if (ret < 0) { + pr_err("sys_timer_getoverrun failed (%d)\n", ret); + return ret; + } + } + + return ret; +} + +static int dump_creds(struct 
parasite_dump_creds *args); + +static int dump_thread_common(struct parasite_dump_thread *ti) +{ + int ret; + + arch_get_tls(&ti->tls); + ret = sys_prctl(PR_GET_TID_ADDRESS, (unsigned long) &ti->tid_addr, 0, 0, 0); + if (ret) { + pr_err("Unable to get the clear_child_tid address: %d\n", ret); + goto out; + } + + ret = sys_sigaltstack(NULL, &ti->sas); + if (ret) { + pr_err("Unable to get signal stack context: %d\n", ret); + goto out; + } + + ret = sys_prctl(PR_GET_PDEATHSIG, (unsigned long)&ti->pdeath_sig, 0, 0, 0); + if (ret) { + pr_err("Unable to get the parent death signal: %d\n", ret); + goto out; + } + + ret = sys_prctl(PR_GET_NAME, (unsigned long) &ti->comm, 0, 0, 0); + if (ret) { + pr_err("Unable to get the thread name: %d\n", ret); + goto out; + } + + ret = dump_creds(ti->creds); +out: + return ret; +} + +static int dump_misc(struct parasite_dump_misc *args) +{ + args->brk = sys_brk(0); + + args->pid = sys_getpid(); + args->sid = sys_getsid(); + args->pgid = sys_getpgid(0); + args->umask = sys_umask(0); + sys_umask(args->umask); /* never fails */ + args->dumpable = sys_prctl(PR_GET_DUMPABLE, 0, 0, 0, 0); + args->thp_disabled = sys_prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0); + + return 0; +} + +static int dump_creds(struct parasite_dump_creds *args) +{ + int ret, i, j; + struct cap_data data[_LINUX_CAPABILITY_U32S_3]; + struct cap_header hdr = {_LINUX_CAPABILITY_VERSION_3, 0}; + + ret = sys_capget(&hdr, data); + if (ret < 0) { + pr_err("Unable to get capabilities: %d\n", ret); + return -1; + } + + /* + * Loop through the capability constants until we reach cap_last_cap. + * The cap_bnd set is stored as a bitmask comprised of CR_CAP_SIZE number of + * 32-bit uints, hence the inner loop from 0 to 32. 
+ */ + for (i = 0; i < CR_CAP_SIZE; i++) { + args->cap_eff[i] = data[i].eff; + args->cap_prm[i] = data[i].prm; + args->cap_inh[i] = data[i].inh; + args->cap_bnd[i] = 0; + + for (j = 0; j < 32; j++) { + if (j + i * 32 > args->cap_last_cap) + break; + ret = sys_prctl(PR_CAPBSET_READ, j + i * 32, 0, 0, 0); + if (ret < 0) { + pr_err("Unable to read capability %d: %d\n", + j + i * 32, ret); + return -1; + } + if (ret) + args->cap_bnd[i] |= (1 << j); + } + } + + args->secbits = sys_prctl(PR_GET_SECUREBITS, 0, 0, 0, 0); + + ret = sys_getgroups(0, NULL); + if (ret < 0) + goto grps_err; + + args->ngroups = ret; + if (args->ngroups >= PARASITE_MAX_GROUPS) { + pr_err("Too many groups in task %d\n", (int)args->ngroups); + return -1; + } + + ret = sys_getgroups(args->ngroups, args->groups); + if (ret < 0) + goto grps_err; + + if (ret != args->ngroups) { + pr_err("Groups changed on the fly %d -> %d\n", + args->ngroups, ret); + return -1; + } + + ret = sys_getresuid(&args->uids[0], &args->uids[1], &args->uids[2]); + if (ret) { + pr_err("Unable to get uids: %d\n", ret); + return -1; + } + + args->uids[3] = sys_setfsuid(-1L); + + /* + * FIXME In https://github.com/xemul/criu/issues/95 it is + * been reported that only low 16 bits are set upon syscall + * on ARMv7. + * + * We may rather need implement builtin-memset and clear the + * whole memory needed here. 
+ */ + args->gids[0] = args->gids[1] = args->gids[2] = args->gids[3] = 0; + + ret = sys_getresgid(&args->gids[0], &args->gids[1], &args->gids[2]); + if (ret) { + pr_err("Unable to get uids: %d\n", ret); + return -1; + } + + args->gids[3] = sys_setfsgid(-1L); + + return 0; + +grps_err: + pr_err("Error calling getgroups (%d)\n", ret); + return -1; +} + +static int fill_fds_opts(struct parasite_drain_fd *fds, struct fd_opts *opts) +{ + int i; + + for (i = 0; i < fds->nr_fds; i++) { + int flags, fd = fds->fds[i], ret; + struct fd_opts *p = opts + i; + struct f_owner_ex owner_ex; + uint32_t v[2]; + + flags = sys_fcntl(fd, F_GETFD, 0); + if (flags < 0) { + pr_err("fcntl(%d, F_GETFD) -> %d\n", fd, flags); + return -1; + } + + p->flags = (char)flags; + + ret = sys_fcntl(fd, F_GETOWN_EX, (long)&owner_ex); + if (ret) { + pr_err("fcntl(%d, F_GETOWN_EX) -> %d\n", fd, ret); + return -1; + } + + /* + * Simple case -- nothing is changed. + */ + if (owner_ex.pid == 0) { + p->fown.pid = 0; + continue; + } + + ret = sys_fcntl(fd, F_GETOWNER_UIDS, (long)&v); + if (ret) { + pr_err("fcntl(%d, F_GETOWNER_UIDS) -> %d\n", fd, ret); + return -1; + } + + p->fown.uid = v[0]; + p->fown.euid = v[1]; + p->fown.pid_type = owner_ex.type; + p->fown.pid = owner_ex.pid; + } + + return 0; +} + +static int drain_fds(struct parasite_drain_fd *args) +{ + int ret, tsock; + struct fd_opts *opts; + + /* + * See the drain_fds_size() in criu code, the memory + * for this args is ensured to be large enough to keep + * an array of fd_opts at the tail. 
/*
 * Minimal decimal string-to-int conversion for code that cannot use libc.
 *
 * Every character up to the terminating NUL is folded in as
 * "value * 10 + (c - '0')"; there is no sign handling and no validation,
 * so the caller must pass a string consisting of decimal digits only.
 * An empty string yields 0.
 */
static int pie_atoi(char *str)
{
	int value = 0;
	char *cursor;

	for (cursor = str; *cursor != '\0'; cursor++)
		value = value * 10 + (*cursor - '0');

	return value;
}
Stolen from kernel/fs/aio.c + * + * Is it valid to go to memory and check it? Should be, + * as libaio does the same. + */ + +#define AIO_RING_MAGIC 0xa10a10a1 +#define AIO_RING_COMPAT_FEATURES 1 +#define AIO_RING_INCOMPAT_FEATURES 0 + +static int sane_ring(struct parasite_aio *aio) +{ + struct aio_ring *ring = (struct aio_ring *)aio->ctx; + unsigned nr; + + nr = (aio->size - sizeof(struct aio_ring)) / sizeof(struct io_event); + + return ring->magic == AIO_RING_MAGIC && + ring->compat_features == AIO_RING_COMPAT_FEATURES && + ring->incompat_features == AIO_RING_INCOMPAT_FEATURES && + ring->header_length == sizeof(struct aio_ring) && + ring->nr == nr; +} + +static int parasite_check_aios(struct parasite_check_aios_args *args) +{ + int i; + + for (i = 0; i < args->nr_rings; i++) { + struct aio_ring *ring; + + ring = (struct aio_ring *)args->ring[i].ctx; + if (!sane_ring(&args->ring[i])) { + pr_err("Not valid ring #%d\n", i); + pr_info(" `- magic %x\n", ring->magic); + pr_info(" `- cf %d\n", ring->compat_features); + pr_info(" `- if %d\n", ring->incompat_features); + pr_info(" `- header size %d (%zd)\n", ring->header_length, sizeof(struct aio_ring)); + pr_info(" `- nr %d\n", ring->nr); + return -1; + } + + /* XXX: wait aio completion */ + } + + return 0; +} + +static int parasite_dump_tty(struct parasite_tty_args *args) +{ + int ret; + +#ifndef TIOCGPKT +# define TIOCGPKT _IOR('T', 0x38, int) +#endif + +#ifndef TIOCGPTLCK +# define TIOCGPTLCK _IOR('T', 0x39, int) +#endif + +#ifndef TIOCGEXCL +# define TIOCGEXCL _IOR('T', 0x40, int) +#endif + + args->sid = 0; + args->pgrp = 0; + args->st_pckt = 0; + args->st_lock = 0; + args->st_excl = 0; + +#define __tty_ioctl(cmd, arg) \ + do { \ + ret = tty_ioctl(args->fd, cmd, &arg); \ + if (ret < 0) { \ + if (ret == -ENOTTY) \ + arg = 0; \ + else if (ret == -EIO) \ + goto err_io; \ + else \ + goto err; \ + } \ + } while (0) + + __tty_ioctl(TIOCGSID, args->sid); + __tty_ioctl(TIOCGPGRP, args->pgrp); + __tty_ioctl(TIOCGEXCL, 
args->st_excl); + + if (args->type == TTY_TYPE__PTY) { + __tty_ioctl(TIOCGPKT, args->st_pckt); + __tty_ioctl(TIOCGPTLCK, args->st_lock); + } + + args->hangup = false; + return 0; + +err: + pr_err("tty: Can't fetch params: err = %d\n", ret); + return -1; +err_io: + + /* kernel reports EIO for get ioctls on pair-less ptys */ + pr_debug("tty: EIO on tty\n"); + args->hangup = true; + return 0; +#undef __tty_ioctl +} + +static int parasite_check_vdso_mark(struct parasite_vdso_vma_entry *args) +{ + struct vdso_mark *m = (void *)args->start; + + if (is_vdso_mark(m)) { + /* + * Make sure we don't meet some corrupted entry + * where signature matches but versions do not! + */ + if (m->version != VDSO_MARK_CUR_VERSION) { + pr_err("vdso: Mark version mismatch!\n"); + return -EINVAL; + } + args->is_marked = 1; + args->orig_vdso_addr = m->orig_vdso_addr; + args->orig_vvar_addr = m->orig_vvar_addr; + args->rt_vvar_addr = m->rt_vvar_addr; + } else { + args->is_marked = 0; + args->orig_vdso_addr = VDSO_BAD_ADDR; + args->orig_vvar_addr = VVAR_BAD_ADDR; + args->rt_vvar_addr = VVAR_BAD_ADDR; + + if (args->try_fill_symtable) { + struct vdso_symtable t; + + if (vdso_fill_symtable(args->start, args->len, &t)) + args->is_vdso = false; + else + args->is_vdso = true; + } + } + + return 0; +} + +static int parasite_dump_cgroup(struct parasite_dump_cgroup_args *args) +{ + int proc, cgroup, len; + + proc = get_proc_fd(); + if (proc < 0) { + pr_err("can't get /proc fd\n"); + return -1; + } + + cgroup = sys_openat(proc, "self/cgroup", O_RDONLY, 0); + sys_close(proc); + if (cgroup < 0) { + pr_err("can't get /proc/self/cgroup fd\n"); + sys_close(cgroup); + return -1; + } + + len = sys_read(cgroup, args->contents, sizeof(args->contents)); + sys_close(cgroup); + if (len < 0) { + pr_err("can't read /proc/self/cgroup %d\n", len); + return -1; + } + + if (len == sizeof(args->contents)) { + pr_warn("/proc/self/cgroup was bigger than the page size\n"); + return -1; + } + + /* null terminate */ + 
args->contents[len] = 0; + return 0; +} + +void parasite_cleanup(void) +{ + if (mprotect_args) { + mprotect_args->add_prot = 0; + mprotect_vmas(mprotect_args); + } +} + +int parasite_daemon_cmd(int cmd, void *args) +{ + int ret; + + switch (cmd) { + case PARASITE_CMD_DUMPPAGES: + ret = dump_pages(args); + break; + case PARASITE_CMD_MPROTECT_VMAS: + ret = mprotect_vmas(args); + break; + case PARASITE_CMD_DUMP_SIGACTS: + ret = dump_sigact(args); + break; + case PARASITE_CMD_DUMP_ITIMERS: + ret = dump_itimers(args); + break; + case PARASITE_CMD_DUMP_POSIX_TIMERS: + ret = dump_posix_timers(args); + break; + case PARASITE_CMD_DUMP_THREAD: + ret = dump_thread(args); + break; + case PARASITE_CMD_DUMP_MISC: + ret = dump_misc(args); + break; + case PARASITE_CMD_DRAIN_FDS: + ret = drain_fds(args); + break; + case PARASITE_CMD_GET_PROC_FD: + ret = parasite_get_proc_fd(); + break; + case PARASITE_CMD_DUMP_TTY: + ret = parasite_dump_tty(args); + break; + case PARASITE_CMD_CHECK_AIOS: + ret = parasite_check_aios(args); + break; + case PARASITE_CMD_CHECK_VDSO_MARK: + ret = parasite_check_vdso_mark(args); + break; + case PARASITE_CMD_DUMP_CGROUP: + ret = parasite_dump_cgroup(args); + break; + default: + pr_err("Unknown command in parasite daemon thread leader: %d\n", cmd); + ret = -1; + break; + } + + return ret; +} + +int parasite_trap_cmd(int cmd, void *args) +{ + switch (cmd) { + case PARASITE_CMD_DUMP_THREAD: + return dump_thread(args); + } + + pr_err("Unknown command to parasite: %d\n", cmd); + return -EINVAL; +} diff --git a/CRIU_code/criu/pie/pie-relocs.h b/CRIU_code/criu/pie/pie-relocs.h new file mode 100644 index 0000000..6797486 --- /dev/null +++ b/CRIU_code/criu/pie/pie-relocs.h @@ -0,0 +1,12 @@ +#ifndef __PIE_RELOCS_H__ +#define __PIE_RELOCS_H__ + +#include + +#include "common/config.h" +#include "common/compiler.h" + +#define pie_size(__pie_name) (round_up(sizeof(__pie_name##_blob) + \ + __pie_name ## _nr_gotpcrel * sizeof(long), page_size())) + +#endif /* 
/*
 * Forward @f together with the file-local fi_strategy to
 * __fault_injected() and return its result unchanged.
 */
bool fault_injected(enum faults f)
{
	return __fault_injected(f, fi_strategy);
}
+ */ +int parasite_daemon_cmd(int cmd, void *args) +{ + return 0; +} + +int parasite_trap_cmd(int cmd, void *args) +{ + return 0; +} + +void parasite_cleanup(void) +{ +} + +extern void cr_restore_rt (void) asm ("__cr_restore_rt") + __attribute__ ((visibility ("hidden"))); + +static void sigchld_handler(int signal, siginfo_t *siginfo, void *data) +{ + char *r; + int i; + + /* We can ignore helpers that die, we expect them to after + * CR_STATE_RESTORE is finished. */ + for (i = 0; i < n_helpers; i++) + if (siginfo->si_pid == helpers[i]) + return; + + for (i = 0; i < n_zombies; i++) + if (siginfo->si_pid == zombies[i]) + return; + + if (siginfo->si_code == CLD_EXITED) + r = "exited, status="; + else if (siginfo->si_code == CLD_KILLED) + r = "killed by signal"; + else if (siginfo->si_code == CLD_DUMPED) + r = "terminated abnormally with"; + else if (siginfo->si_code == CLD_TRAPPED) + r = "trapped with"; + else if (siginfo->si_code == CLD_STOPPED) + r = "stopped with"; + else + r = "disappeared with"; + + pr_info("Task %d %s %d\n", siginfo->si_pid, r, siginfo->si_status); + + futex_abort_and_wake(&task_entries_local->nr_in_progress); + /* sa_restorer may be unmaped, so we can't go back to userspace*/ + sys_kill(sys_getpid(), SIGSTOP); + sys_exit_group(1); +} + +static int lsm_set_label(char *label, char *type, int procfd) +{ + int ret = -1, len, lsmfd; + char path[STD_LOG_SIMPLE_CHUNK]; + + if (!label) + return 0; + + pr_info("restoring lsm profile (%s) %s\n", type, label); + + std_sprintf(path, "self/task/%ld/attr/%s", sys_gettid(), type); + + lsmfd = sys_openat(procfd, path, O_WRONLY, 0); + if (lsmfd < 0) { + pr_err("failed openat %d\n", lsmfd); + return -1; + } + + for (len = 0; label[len]; len++) + ; + + ret = sys_write(lsmfd, label, len); + sys_close(lsmfd); + if (ret < 0) { + pr_err("can't write lsm profile %d\n", ret); + return -1; + } + + return 0; +} + +static int restore_creds(struct thread_creds_args *args, int procfd, + int lsm_type) +{ + CredsEntry *ce = 
&args->creds; + int b, i, ret; + struct cap_header hdr; + struct cap_data data[_LINUX_CAPABILITY_U32S_3]; + + /* + * We're still root here and thus can do it without failures. + */ + + /* + * Setup supplementary group IDs early. + */ + if (args->groups) { + ret = sys_setgroups(ce->n_groups, args->groups); + if (ret) { + pr_err("Can't setup supplementary group IDs: %d\n", ret); + return -1; + } + } + + /* + * First -- set the SECURE_NO_SETUID_FIXUP bit not to + * lose caps bits when changing xids. + */ + + ret = sys_prctl(PR_SET_SECUREBITS, 1 << SECURE_NO_SETUID_FIXUP, 0, 0, 0); + if (ret) { + pr_err("Unable to set SECURE_NO_SETUID_FIXUP: %d\n", ret); + return -1; + } + + /* + * Second -- restore xids. Since we still have the CAP_SETUID + * capability nothing should fail. But call the setfsXid last + * to override the setresXid settings. + */ + + ret = sys_setresuid(ce->uid, ce->euid, ce->suid); + if (ret) { + pr_err("Unable to set real, effective and saved user ID: %d\n", ret); + return -1; + } + + sys_setfsuid(ce->fsuid); + if (sys_setfsuid(-1) != ce->fsuid) { + pr_err("Unable to set fsuid\n"); + return -1; + } + + ret = sys_setresgid(ce->gid, ce->egid, ce->sgid); + if (ret) { + pr_err("Unable to set real, effective and saved group ID: %d\n", ret); + return -1; + } + + sys_setfsgid(ce->fsgid); + if (sys_setfsgid(-1) != ce->fsgid) { + pr_err("Unable to set fsgid\n"); + return -1; + } + + /* + * Third -- restore securebits. We don't need them in any + * special state any longer. + */ + + ret = sys_prctl(PR_SET_SECUREBITS, ce->secbits, 0, 0, 0); + if (ret) { + pr_err("Unable to set PR_SET_SECUREBITS: %d\n", ret); + return -1; + } + + /* + * Fourth -- trim bset. This can only be done while + * having the CAP_SETPCAP capability. 
+ */ + + for (b = 0; b < CR_CAP_SIZE; b++) { + for (i = 0; i < 32; i++) { + if (b * 32 + i > args->cap_last_cap) + break; + if (args->cap_bnd[b] & (1 << i)) + /* already set */ + continue; + ret = sys_prctl(PR_CAPBSET_DROP, i + b * 32, 0, 0, 0); + if (ret) { + pr_err("Unable to drop capability %d: %d\n", + i + b * 32, ret); + return -1; + } + } + } + + /* + * Fifth -- restore caps. Nothing but cap bits are changed + * at this stage, so just do it. + */ + + hdr.version = _LINUX_CAPABILITY_VERSION_3; + hdr.pid = 0; + + BUILD_BUG_ON(_LINUX_CAPABILITY_U32S_3 != CR_CAP_SIZE); + + for (i = 0; i < CR_CAP_SIZE; i++) { + data[i].eff = args->cap_eff[i]; + data[i].prm = args->cap_prm[i]; + data[i].inh = args->cap_inh[i]; + } + + ret = sys_capset(&hdr, data); + if (ret) { + pr_err("Unable to restore capabilities: %d\n", ret); + return -1; + } + + if (lsm_type != LSMTYPE__SELINUX) { + /* + * SELinux does not support setting the process context for + * threaded processes. So this is skipped if running with + * SELinux and instead the process context is set before the + * threads are created. + */ + if (lsm_set_label(args->lsm_profile, "current", procfd) < 0) + return -1; + } + + /* Also set the sockcreate label for all threads */ + if (lsm_set_label(args->lsm_sockcreate, "sockcreate", procfd) < 0) + return -1; + + return 0; +} + +/* + * This should be done after creds restore, as + * some creds changes might drop the value back + * to zero. 
+ */ + +static inline int restore_pdeath_sig(struct thread_restore_args *ta) +{ + if (ta->pdeath_sig) + return sys_prctl(PR_SET_PDEATHSIG, ta->pdeath_sig, 0, 0, 0); + else + return 0; +} + +static int restore_dumpable_flag(MmEntry *mme) +{ + int current_dumpable; + int ret; + + if (!mme->has_dumpable) { + pr_warn("Dumpable flag not present in criu dump.\n"); + return 0; + } + + if (mme->dumpable == 0 || mme->dumpable == 1) { + ret = sys_prctl(PR_SET_DUMPABLE, mme->dumpable, 0, 0, 0); + if (ret) { + pr_err("Unable to set PR_SET_DUMPABLE: %d\n", ret); + return -1; + } + return 0; + } + + /* + * If dumpable flag is present but it is not 0 or 1, then we can not + * use prctl to set it back. Try to see if it is already correct + * (which is likely if sysctl fs.suid_dumpable is the same when dump + * and restore are run), in which case there is nothing to do. + * Otherwise, set dumpable to 0 which should be a secure fallback. + */ + current_dumpable = sys_prctl(PR_GET_DUMPABLE, 0, 0, 0, 0); + if (mme->dumpable != current_dumpable) { + pr_warn("Dumpable flag [%d] does not match current [%d]. 
" + "Will fallback to setting it to 0 to disable it.\n", + mme->dumpable, current_dumpable); + ret = sys_prctl(PR_SET_DUMPABLE, 0, 0, 0, 0); + if (ret) { + pr_err("Unable to set PR_SET_DUMPABLE: %d\n", ret); + return -1; + } + } + return 0; +} + +static void restore_sched_info(struct rst_sched_param *p) +{ + struct sched_param parm; + + pr_info("Restoring scheduler params %d.%d.%d\n", + p->policy, p->nice, p->prio); + + sys_setpriority(PRIO_PROCESS, 0, p->nice); + parm.sched_priority = p->prio; + sys_sched_setscheduler(0, p->policy, &parm); +} + +static void restore_rlims(struct task_restore_args *ta) +{ + int r; + + for (r = 0; r < ta->rlims_n; r++) { + struct krlimit krlim; + + krlim.rlim_cur = ta->rlims[r].rlim_cur; + krlim.rlim_max = ta->rlims[r].rlim_max; + sys_setrlimit(r, &krlim); + } +} + +static int restore_signals(siginfo_t *ptr, int nr, bool group) +{ + int ret, i; + + for (i = 0; i < nr; i++) { + siginfo_t *info = ptr + i; + + pr_info("Restore signal %d group %d\n", info->si_signo, group); + if (group) + ret = sys_rt_sigqueueinfo(sys_getpid(), info->si_signo, info); + else + ret = sys_rt_tgsigqueueinfo(sys_getpid(), + sys_gettid(), info->si_signo, info); + if (ret) { + pr_err("Unable to send siginfo %d %x with code %d\n", + info->si_signo, info->si_code, ret); + return -1; + } + } + + return 0; +} + +static int restore_seccomp_filter(pid_t tid, struct thread_restore_args *args) +{ + unsigned int flags = args->seccomp_force_tsync ? 
SECCOMP_FILTER_FLAG_TSYNC : 0; + size_t i; + int ret; + + for (i = 0; i < args->seccomp_filters_n; i++) { + struct thread_seccomp_filter *filter = &args->seccomp_filters[i]; + + pr_debug("seccomp: Restoring mode %d flags %x on tid %d filter %d\n", + SECCOMP_SET_MODE_FILTER, (filter->flags | flags), tid, (int)i); + + ret = sys_seccomp(SECCOMP_SET_MODE_FILTER, filter->flags | flags, (void *)&filter->sock_fprog); + if (ret < 0) { + if (ret == -ENOSYS) { + pr_debug("seccomp: sys_seccomp is not supported in kernel, " + "switching to prctl interface\n"); + ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, + (long)(void *)&filter->sock_fprog, 0, 0); + if (ret) { + pr_err("seccomp: PR_SET_SECCOMP returned %d on tid %d\n", + ret, tid); + return -1; + } + } else { + pr_err("seccomp: SECCOMP_SET_MODE_FILTER returned %d on tid %d\n", + ret, tid); + return -1; + } + } + } + + return 0; +} + +static int restore_seccomp(struct thread_restore_args *args) +{ + pid_t tid = sys_gettid(); + int ret; + + switch (args->seccomp_mode) { + case SECCOMP_MODE_DISABLED: + pr_debug("seccomp: mode %d on tid %d\n", SECCOMP_MODE_DISABLED, tid); + return 0; + break; + case SECCOMP_MODE_STRICT: + ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0); + if (ret < 0) { + pr_err("seccomp: SECCOMP_MODE_STRICT returned %d on tid %d\n", + ret, tid); + } + break; + case SECCOMP_MODE_FILTER: + ret = restore_seccomp_filter(tid, args); + break; + default: + pr_err("seccomp: Unknown seccomp mode %d on tid %d\n", + args->seccomp_mode, tid); + ret = -1; + break; + } + + if (!ret) { + pr_debug("seccomp: Restored mode %d on tid %d\n", + args->seccomp_mode, tid); + } + + return ret; +} + +static int restore_robust_futex(struct thread_restore_args *args) +{ + uint32_t futex_len = args->futex_rla_len; + int ret; + + if (!args->futex_rla_len) + return 0; + + /* + * XXX: We check here *task's* mode, not *thread's*. 
/*
 * Per-thread restore steps, run in this order:
 *   1. set the clear-child-tid address from args->clear_tid_addr,
 *   2. restore the robust futex list,
 *   3. restore scheduler parameters from args->sp,
 *   4. restore the general purpose registers that are not part of the
 *      sigframe,
 *   5. restore TLS from args->tls.
 *
 * Returns 0 on success, -1 when restore_robust_futex() or
 * restore_nonsigframe_gpregs() returns non-zero. The results of steps
 * 1, 3 and 5 are not checked here.
 */
static int restore_thread_common(struct thread_restore_args *args)
{
	/* Result is not checked. */
	sys_set_tid_address((int *)decode_pointer(args->clear_tid_addr));

	if (restore_robust_futex(args))
		return -1;

	/* restore_sched_info() returns void. */
	restore_sched_info(&args->sp);

	if (restore_nonsigframe_gpregs(&args->gpregs))
		return -1;

	restore_tls(&args->tls);

	return 0;
}
/*
 * Entry point for restoring one thread.
 *
 * Sequence: verify that the thread runs with the expected tid, block all
 * signals, restore the common per-thread state and the thread name, then
 * pass the CR_STATE_RESTORE, CR_STATE_RESTORE_SIGCHLD and
 * CR_STATE_RESTORE_CREDS stages, restoring queued signals, seccomp,
 * credentials, the dumpable flag and the parent-death signal in between.
 * Finally control is handed to rst_sigreturn() with the prepared sigframe.
 *
 * On any failure that jumps to core_restore_end the restore is aborted:
 * waiters on nr_in_progress are woken with an abort and the whole thread
 * group exits with status 1. A failure in seccomp or credentials
 * restoration hits BUG() instead.
 */
long __export_restore_thread(struct thread_restore_args *args)
{
	struct rt_sigframe *rt_sigframe;
	k_rtsigset_t to_block;
	unsigned long new_sp;
	int my_pid = sys_gettid();
	int ret;

	/* The thread must have been created with the tid recorded in args. */
	if (my_pid != args->pid) {
		pr_err("Thread pid mismatch %d/%d\n", my_pid, args->pid);
		goto core_restore_end;
	}

	/* All signals must be handled by thread leader */
	ksigfillset(&to_block);
	ret = sys_sigprocmask(SIG_SETMASK, &to_block, NULL, sizeof(k_rtsigset_t));
	if (ret) {
		pr_err("Unable to block signals %d\n", ret);
		goto core_restore_end;
	}

	rt_sigframe = (void *)&args->mz->rt_sigframe;

	if (restore_thread_common(args))
		goto core_restore_end;

	ret = sys_prctl(PR_SET_NAME, (unsigned long) &args->comm, 0, 0, 0);
	if (ret) {
		pr_err("Unable to set a thread name: %d\n", ret);
		goto core_restore_end;
	}

	pr_info("%ld: Restored\n", sys_gettid());

	restore_finish_stage(task_entries_local, CR_STATE_RESTORE);

	/* false: queue the signals to this thread, not to the thread group. */
	if (restore_signals(args->siginfo, args->siginfo_n, false))
		goto core_restore_end;

	restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD);

	/*
	 * Make sure it's before creds, since it's privileged
	 * operation bound to uid 0 in current user ns.
	 */
	if (restore_seccomp(args))
		BUG();

	/*
	 * "ret || f()" short-circuits: once one step fails the remaining
	 * ones are not executed, and ret collapses to 0/1.
	 */
	ret = restore_creds(args->creds_args, args->ta->proc_fd,
			    args->ta->lsm_type);
	ret = ret || restore_dumpable_flag(&args->ta->mm);
	ret = ret || restore_pdeath_sig(args);
	if (ret)
		BUG();

	restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS);

	futex_dec_and_wake(&thread_inprogress);

	/*
	 * NOTE(review): rst_sigreturn() presumably does not return on success;
	 * if it does, execution falls through to the failure path below.
	 */
	new_sp = (long)rt_sigframe + RT_SIGFRAME_OFFSET(rt_sigframe);
	rst_sigreturn(new_sp, rt_sigframe);

core_restore_end:
	pr_err("Restorer abnormal termination for %ld\n", sys_getpid());
	futex_abort_and_wake(&task_entries_local->nr_in_progress);
	sys_exit_group(1);
	return -1;
}
+ */ + if (vma_entry->fd == SYSV_SHMEM_SKIP_FD) + return vma_entry->start; + + if (vma_entry->prot & PROT_EXEC) { + att_flags = 0; + vma_entry->prot &= ~PROT_EXEC; + } else + att_flags = SHM_RDONLY; + + pr_info("Attach SYSV shmem %d at %"PRIx64"\n", (int)vma_entry->fd, vma_entry->start); + return arch_shmat(vma_entry->fd, shmaddr, att_flags, shmsize); + } + + /* + * Restore or shared mappings are tricky, since + * we open anonymous mapping via map_files/ + * MAP_ANONYMOUS should be eliminated so fd would + * be taken into account by a kernel. + */ + if (vma_entry_is(vma_entry, VMA_ANON_SHARED) && (vma_entry->fd != -1UL)) + flags &= ~MAP_ANONYMOUS; + + /* See comment in premap_private_vma() for this flag change */ + if (vma_entry_is(vma_entry, VMA_AREA_AIORING)) + flags |= MAP_ANONYMOUS; + + /* A mapping of file with MAP_SHARED is up to date */ + if ((vma_entry->fd == -1 || !(vma_entry->flags & MAP_SHARED)) && + !(vma_entry->status & VMA_NO_PROT_WRITE)) + prot |= PROT_WRITE; + + /* TODO: Drop MAP_LOCKED bit and restore it after reading memory. + * + * Code below tries to limit memory usage by running fallocate() + * after each preadv() to avoid doubling memory usage (once in + * image files, once in process). Unfortunately, MAP_LOCKED defeats + * that mechanism as it causes the process to be charged for memory + * immediately upon mmap, not later upon preadv(). + */ + pr_debug("\tmmap(%"PRIx64" -> %"PRIx64", %x %x %d)\n", + vma_entry->start, vma_entry->end, + prot, flags, (int)vma_entry->fd); + /* + * Should map memory here. Note we map them as + * writable since we're going to restore page + * contents. + */ + addr = sys_mmap(decode_pointer(vma_entry->start), + vma_entry_len(vma_entry), + prot, flags, + vma_entry->fd, + vma_entry->pgoff); + + if ((vma_entry->fd != -1) && + (vma_entry->status & VMA_CLOSE)) + sys_close(vma_entry->fd); + + return addr; +} + +/* + * This restores aio ring header, content, head and in-kernel position + * of tail. 
To set tail, we write to /dev/null and use the fact this + * operation is synchronious for the device. Also, we unmap temporary + * anonymous area, used to store content of ring buffer during restore + * and mapped in premap_private_vma(). + */ +static int restore_aio_ring(struct rst_aio_ring *raio) +{ + struct aio_ring *ring = (void *)raio->addr, *new; + int i, maxr, count, fd, ret; + unsigned head = ring->head; + unsigned tail = ring->tail; + struct iocb *iocb, **iocbp; + unsigned long ctx = 0; + unsigned size; + char buf[1]; + + ret = sys_io_setup(raio->nr_req, &ctx); + if (ret < 0) { + pr_err("Ring setup failed with %d\n", ret); + return -1; + } + + new = (struct aio_ring *)ctx; + i = (raio->len - sizeof(struct aio_ring)) / sizeof(struct io_event); + if (tail >= ring->nr || head >= ring->nr || ring->nr != i || + new->nr != ring->nr) { + pr_err("wrong aio: tail=%x head=%x req=%x old_nr=%x new_nr=%x expect=%x\n", + tail, head, raio->nr_req, ring->nr, new->nr, i); + + return -1; + } + + if (tail == 0 && head == 0) + goto populate; + + fd = sys_open("/dev/null", O_WRONLY, 0); + if (fd < 0) { + pr_err("Can't open /dev/null for aio\n"); + return -1; + } + + /* + * If tail < head, we have to do full turn and then submit + * tail more request, i.e. ring->nr + tail. + * If we do not do full turn, in-kernel completed_events + * will initialize wrong. + * + * Maximum number reqs to submit at once are ring->nr-1, + * so we won't allocate more. + */ + if (tail < head) + count = ring->nr + tail; + else + count = tail; + maxr = min_t(unsigned, count, ring->nr-1); + + /* + * Since we only interested in moving the tail, the requests + * may be any. We submit count identical requests. 
+ */ + size = sizeof(struct iocb) + maxr * sizeof(struct iocb *); + iocb = (void *)sys_mmap(NULL, size, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + iocbp = (void *)iocb + sizeof(struct iocb); + + if (IS_ERR(iocb)) { + pr_err("Can't mmap aio tmp buffer: %ld\n", PTR_ERR(iocb)); + return -1; + } + + iocb->aio_fildes = fd; + iocb->aio_buf = (unsigned long)buf; + iocb->aio_nbytes = 1; + iocb->aio_lio_opcode = IOCB_CMD_PWRITE; /* Write is nop, read populates buf */ + + for (i = 0; i < maxr; i++) + iocbp[i] = iocb; + + i = 0; + do { + ret = sys_io_submit(ctx, count - i, iocbp); + if (ret < 0) { + pr_err("Can't submit aio iocbs: ret=%d\n", ret); + return -1; + } + i += ret; + + /* + * We may submit less than requested, because of too big + * count OR behaviour of get_reqs_available(), which + * takes available requests only if their number is + * aliquot to kioctx::req_batch. Free part of buffer + * for next iteration. + * + * Direct set of head is equal to sys_io_getevents() call, + * and faster. See kernel for the details. + */ + ((struct aio_ring *)ctx)->head = i < head ? i : head; + } while (i < count); + + sys_munmap(iocb, size); + sys_close(fd); + +populate: + i = offsetof(struct aio_ring, io_events); + memcpy((void *)ctx + i, (void *)ring + i, raio->len - i); + + /* + * If we failed to get the proper nr_req right and + * created smaller or larger ring, then this remap + * will (should) fail, since AIO rings has immutable + * size. + * + * This is not great, but anyway better than putting + * a ring of wrong size into correct place. + * + * Also, this unmaps temporary anonymous area on raio->addr. 
+ */ + + ctx = sys_mremap(ctx, raio->len, raio->len, + MREMAP_FIXED | MREMAP_MAYMOVE, + raio->addr); + if (ctx != raio->addr) { + pr_err("Ring remap failed with %ld\n", ctx); + return -1; + } + return 0; +} + +static void rst_tcp_repair_off(struct rst_tcp_sock *rts) +{ + int aux, ret; + + aux = rts->reuseaddr; + pr_debug("pie: Turning repair off for %d (reuse %d)\n", rts->sk, aux); + tcp_repair_off(rts->sk); + + ret = sys_setsockopt(rts->sk, SOL_SOCKET, SO_REUSEADDR, &aux, sizeof(aux)); + if (ret < 0) + pr_err("Failed to restore of SO_REUSEADDR on socket (%d)\n", ret); +} + +static void rst_tcp_socks_all(struct task_restore_args *ta) +{ + int i; + + for (i = 0; i < ta->tcp_socks_n; i++) + rst_tcp_repair_off(&ta->tcp_socks[i]); +} + +static int enable_uffd(int uffd, unsigned long addr, unsigned long len) +{ + int rc; + struct uffdio_register uffdio_register; + unsigned long expected_ioctls; + + /* + * If uffd == -1, this means that userfaultfd is not enabled + * or it is not available. + */ + if (uffd == -1) + return 0; + + uffdio_register.range.start = addr; + uffdio_register.range.len = len; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + + pr_info("lazy-pages: register: %lx, len %lx\n", addr, len); + + rc = sys_ioctl(uffd, UFFDIO_REGISTER, (unsigned long) &uffdio_register); + if (rc != 0) { + pr_err("lazy-pages: register %lx failed: rc:%d, \n", addr, rc); + return -1; + } + + expected_ioctls = (1 << _UFFDIO_WAKE) | (1 << _UFFDIO_COPY) | (1 << _UFFDIO_ZEROPAGE); + + if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) { + pr_err("lazy-pages: unexpected missing uffd ioctl for anon memory\n"); + } + + return 0; +} + + +static int vma_remap(VmaEntry *vma_entry, int uffd) +{ + unsigned long src = vma_premmaped_start(vma_entry); + unsigned long dst = vma_entry->start; + unsigned long len = vma_entry_len(vma_entry); + unsigned long guard = 0, tmp; + + pr_info("Remap %lx->%lx len %lx\n", src, dst, len); + + if (src - dst < len) + guard = dst; + 
else if (dst - src < len) + guard = dst + len - PAGE_SIZE; + + if (src == dst) + return 0; + + if (guard != 0) { + /* + * mremap() returns an error if the target and source vma-s + * overlap. In this case the source vma is remapped to + * a temporary place and then remapped to the target address. + * Here is one hack to find a non-overlapping temporary place. + * + * 1. initial placement. We need to move src -> tgt. + * | |+++++src+++++| + * |-----tgt-----| | + * + * 2. map a guard page at the non-overlapped border of a target vma. + * | |+++++src+++++| + * |G|----tgt----| | + * + * 3. remap src to any other place. + * G prevents src from being remapped on tgt again + * | |-------------| -> |+++++src+++++| + * |G|---tgt-----| | + * + * 4. remap src to tgt, no overlapping any longer + * |+++++src+++++| <---- |-------------| + * |G|---tgt-----| | + */ + + unsigned long addr; + + /* Map guard page (step 2) */ + tmp = sys_mmap((void *) guard, PAGE_SIZE, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (tmp != guard) { + pr_err("Unable to map a guard page %lx (%lx)\n", guard, tmp); + return -1; + } + + /* Move src to non-overlapping place (step 3) */ + addr = sys_mmap(NULL, len, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (addr == (unsigned long) MAP_FAILED) { + pr_err("Unable to reserve memory (%lx)\n", addr); + return -1; + } + + tmp = sys_mremap(src, len, len, + MREMAP_MAYMOVE | MREMAP_FIXED, addr); + if (tmp != addr) { + pr_err("Unable to remap %lx -> %lx (%lx)\n", src, addr, tmp); + return -1; + } + + src = addr; + } + + tmp = sys_mremap(src, len, len, MREMAP_MAYMOVE | MREMAP_FIXED, dst); + if (tmp != dst) { + pr_err("Unable to remap %lx -> %lx\n", src, dst); + return -1; + } + + /* + * If running in userfaultfd/lazy-pages mode pages with + * MAP_ANONYMOUS and MAP_PRIVATE are remapped but without the + * real content. 
+ * The function enable_uffd() marks the page(s) as userfaultfd + * pages, so that the processes will hang until the memory is + * injected via userfaultfd. + */ + if (vma_entry_can_be_lazy(vma_entry)) + if (enable_uffd(uffd, dst, len) != 0) + return -1; + + return 0; +} + +static int timerfd_arm(struct task_restore_args *args) +{ + int i; + + for (i = 0; i < args->timerfd_n; i++) { + struct restore_timerfd *t = &args->timerfd[i]; + int ret; + + pr_debug("timerfd: arm for fd %d (%d)\n", t->fd, i); + + if (t->settime_flags & TFD_TIMER_ABSTIME) { + struct timespec ts; + + /* + * We might need to adjust value because the checkpoint + * and restore procedure takes some time itself. Note + * we don't adjust nanoseconds, since the result may + * overflow the limit NSEC_PER_SEC FIXME + */ + if (sys_clock_gettime(t->clockid, &ts)) { + pr_err("Can't get current time\n"); + return -1; + } + + t->val.it_value.tv_sec += (time_t)ts.tv_sec; + + pr_debug("Adjust id %#x it_value(%llu, %llu) -> it_value(%llu, %llu)\n", + t->id, (unsigned long long)ts.tv_sec, + (unsigned long long)ts.tv_nsec, + (unsigned long long)t->val.it_value.tv_sec, + (unsigned long long)t->val.it_value.tv_nsec); + } + + ret = sys_timerfd_settime(t->fd, t->settime_flags, &t->val, NULL); + if (t->ticks) + ret |= sys_ioctl(t->fd, TFD_IOC_SET_TICKS, (unsigned long)&t->ticks); + if (ret) { + pr_err("Can't restore ticks/time for timerfd - %d\n", i); + return ret; + } + } + return 0; +} + +static int create_posix_timers(struct task_restore_args *args) +{ + int ret, i; + kernel_timer_t next_id; + struct sigevent sev; + + for (i = 0; i < args->posix_timers_n; i++) { + sev.sigev_notify = args->posix_timers[i].spt.it_sigev_notify; + sev.sigev_signo = args->posix_timers[i].spt.si_signo; + sev.sigev_value.sival_ptr = args->posix_timers[i].spt.sival_ptr; + + while (1) { + ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &next_id); + if (ret < 0) { + pr_err("Can't create posix timer - %d\n", i); + return ret; 
+ } + + if (next_id == args->posix_timers[i].spt.it_id) + break; + + ret = sys_timer_delete(next_id); + if (ret < 0) { + pr_err("Can't remove temporaty posix timer 0x%x\n", next_id); + return ret; + } + + if ((long)next_id > args->posix_timers[i].spt.it_id) { + pr_err("Can't create timers, kernel don't give them consequently\n"); + return -1; + } + } + } + + return 0; +} + +static void restore_posix_timers(struct task_restore_args *args) +{ + int i; + struct restore_posix_timer *rt; + + for (i = 0; i < args->posix_timers_n; i++) { + rt = &args->posix_timers[i]; + sys_timer_settime((kernel_timer_t)rt->spt.it_id, 0, &rt->val, NULL); + } +} + +/* + * sys_munmap must not return here. The control process must + * trap us on the exit from sys_munmap. + */ +unsigned long vdso_rt_size = 0; + +void *bootstrap_start = NULL; +unsigned int bootstrap_len = 0; + +void __export_unmap(void) +{ + sys_munmap(bootstrap_start, bootstrap_len - vdso_rt_size); +} + +/* + * This function unmaps all VMAs, which don't belong to + * the restored process or the restorer. + * + * The restorer memory is two regions -- area with restorer, its stack + * and arguments and the one with private vmas of the tasks we restore + * (a.k.a. 
premmaped area): + * + * 0 task_size + * +----+====+----+====+---+ + * + * Thus to unmap old memory we have to do 3 unmaps: + * [ 0 -- 1st area start ] + * [ 1st end -- 2nd start ] + * [ 2nd end -- task_size ] + */ +static int unmap_old_vmas(void *premmapped_addr, unsigned long premmapped_len, + void *bootstrap_start, unsigned long bootstrap_len, + unsigned long task_size) +{ + unsigned long s1, s2; + void *p1, *p2; + int ret; + + if (premmapped_addr < bootstrap_start) { + p1 = premmapped_addr; + s1 = premmapped_len; + p2 = bootstrap_start; + s2 = bootstrap_len; + } else { + p2 = premmapped_addr; + s2 = premmapped_len; + p1 = bootstrap_start; + s1 = bootstrap_len; + } + + ret = sys_munmap(NULL, p1 - NULL); + if (ret) { + pr_err("Unable to unmap (%p-%p): %d\n", NULL, p1, ret); + return -1; + } + + ret = sys_munmap(p1 + s1, p2 - (p1 + s1)); + if (ret) { + pr_err("Unable to unmap (%p-%p): %d\n", p1 + s1, p2, ret); + return -1; + } + + ret = sys_munmap(p2 + s2, task_size - (unsigned long)(p2 + s2)); + if (ret) { + pr_err("Unable to unmap (%p-%p): %d\n", + p2 + s2, (void *)task_size, ret); + return -1; + } + + return 0; +} + +static int wait_helpers(struct task_restore_args *task_args) +{ + int i; + + for (i = 0; i < task_args->helpers_n; i++) { + int status; + pid_t pid = task_args->helpers[i]; + + /* Check that a helper completed. 
*/ + if (sys_wait4(pid, &status, 0, NULL) == -ECHILD) { + /* It has been waited in sigchld_handler */ + continue; + } + if (!WIFEXITED(status) || WEXITSTATUS(status)) { + pr_err("%d exited with non-zero code (%d,%d)\n", pid, + WEXITSTATUS(status), WTERMSIG(status)); + return -1; + } + } + + return 0; +} + +static int wait_zombies(struct task_restore_args *task_args) +{ + int i; + + for (i = 0; i < task_args->zombies_n; i++) { + int ret, nr_in_progress; + + nr_in_progress = futex_get(&task_entries_local->nr_in_progress); + + ret = sys_waitid(P_PID, task_args->zombies[i], NULL, WNOWAIT | WEXITED, NULL); + if (ret == -ECHILD) { + /* A process isn't reparented to this task yet. + * Let's wait when someone complete this stage + * and try again. + */ + futex_wait_while_eq(&task_entries_local->nr_in_progress, + nr_in_progress); + i--; + continue; + } + if (ret < 0) { + pr_err("Wait on %d zombie failed: %d\n", task_args->zombies[i], ret); + return -1; + } + pr_debug("%ld: Collect a zombie with pid %d\n", + sys_getpid(), task_args->zombies[i]); + } + + return 0; +} + +static bool vdso_unmapped(struct task_restore_args *args) +{ + unsigned int i; + + /* Don't park rt-vdso or rt-vvar if dumpee doesn't have them */ + for (i = 0; i < args->vmas_n; i++) { + VmaEntry *vma = &args->vmas[i]; + + if (vma_entry_is(vma, VMA_AREA_VDSO) || + vma_entry_is(vma, VMA_AREA_VVAR)) + return false; + } + + return true; +} + +static bool vdso_needs_parking(struct task_restore_args *args) +{ + /* Compatible vDSO will be mapped, not moved */ + if (args->compatible_mode) + return false; + + if (args->can_map_vdso) + return false; + + return !vdso_unmapped(args); +} + +/* + * The main routine to restore task via sigreturn. + * This one is very special, we never return there + * but use sigreturn facility to restore core registers + * and jump execution to some predefined ip read from + * core file. 
+ */ +long __export_restore_task(struct task_restore_args *args) +{ + long ret = -1; + int i; + VmaEntry *vma_entry; + unsigned long va; + struct restore_vma_io *rio; + struct rt_sigframe *rt_sigframe; + struct prctl_mm_map prctl_map; + unsigned long new_sp; + k_rtsigset_t to_block; + pid_t my_pid = sys_getpid(); + rt_sigaction_t act; + + bootstrap_start = args->bootstrap_start; + bootstrap_len = args->bootstrap_len; + + vdso_rt_size = args->vdso_rt_size; + + fi_strategy = args->fault_strategy; + + task_entries_local = args->task_entries; + helpers = args->helpers; + n_helpers = args->helpers_n; + zombies = args->zombies; + n_zombies = args->zombies_n; + *args->breakpoint = rst_sigreturn; +#ifdef ARCH_HAS_LONG_PAGES + __page_size = args->page_size; +#endif + + ksigfillset(&act.rt_sa_mask); + act.rt_sa_handler = sigchld_handler; + act.rt_sa_flags = SA_SIGINFO | SA_RESTORER | SA_RESTART; + act.rt_sa_restorer = cr_restore_rt; + ret = sys_sigaction(SIGCHLD, &act, NULL, sizeof(k_rtsigset_t)); + if (ret) { + pr_err("Failed to set SIGCHLD %ld\n", ret); + goto core_restore_end; + } + + ksigemptyset(&to_block); + ksigaddset(&to_block, SIGCHLD); + ret = sys_sigprocmask(SIG_UNBLOCK, &to_block, NULL, sizeof(k_rtsigset_t)); + if (ret) { + pr_err("Failed to unblock SIGCHLD %ld\n", ret); + goto core_restore_end; + } + + std_log_set_fd(args->logfd); + std_log_set_loglevel(args->loglevel); + std_log_set_start(&args->logstart); + + pr_info("Switched to the restorer %d\n", my_pid); + + if (args->uffd > -1) { + pr_debug("lazy-pages: uffd %d\n", args->uffd); + } + + if (vdso_needs_parking(args)) { + if (vdso_do_park(&args->vdso_maps_rt, + args->vdso_rt_parked_at, vdso_rt_size)) + goto core_restore_end; + } + + if (unmap_old_vmas((void *)args->premmapped_addr, args->premmapped_len, + bootstrap_start, bootstrap_len, args->task_size)) + goto core_restore_end; + + /* Map vdso that wasn't parked */ + if (!vdso_unmapped(args) && args->can_map_vdso) { + if 
(arch_map_vdso(args->vdso_rt_parked_at, + args->compatible_mode) < 0) { + goto core_restore_end; + } + } + + /* Shift private vma-s to the left */ + for (i = 0; i < args->vmas_n; i++) { + vma_entry = args->vmas + i; + + if (!vma_entry_is(vma_entry, VMA_PREMMAPED)) + continue; + + if (vma_entry->end >= args->task_size) + continue; + + if (vma_entry->start > vma_entry->shmid) + break; + + if (vma_remap(vma_entry, args->uffd)) + goto core_restore_end; + } + + /* Shift private vma-s to the right */ + for (i = args->vmas_n - 1; i >= 0; i--) { + vma_entry = args->vmas + i; + + if (!vma_entry_is(vma_entry, VMA_PREMMAPED)) + continue; + + if (vma_entry->start > args->task_size) + continue; + + if (vma_entry->start < vma_entry->shmid) + break; + + if (vma_remap(vma_entry, args->uffd)) + goto core_restore_end; + } + + if (args->uffd > -1) { + /* re-enable THP if we disabled it previously */ + if (args->has_thp_enabled) { + if (sys_prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0)) { + pr_err("Cannot re-enable THP\n"); + goto core_restore_end; + } + } + + pr_debug("lazy-pages: closing uffd %d\n", args->uffd); + /* + * All userfaultfd configuration has finished at this point. + * Let's close the UFFD file descriptor, so that the restored + * process does not have an opened UFFD FD for ever. + */ + sys_close(args->uffd); + } + + /* + * OK, lets try to map new one. 
+ */ + for (i = 0; i < args->vmas_n; i++) { + vma_entry = args->vmas + i; + + if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR) && + !vma_entry_is(vma_entry, VMA_AREA_AIORING)) + continue; + + if (vma_entry_is(vma_entry, VMA_PREMMAPED)) + continue; + + va = restore_mapping(vma_entry); + + if (va != vma_entry->start) { + pr_err("Can't restore %"PRIx64" mapping with %lx\n", vma_entry->start, va); + goto core_restore_end; + } + } + + /* + * Now read the contents (if any) + */ + + rio = args->vma_ios; + for (i = 0; i < args->vma_ios_n; i++) { + struct iovec *iovs = rio->iovs; + int nr = rio->nr_iovs; + ssize_t r; + + while (nr) { + pr_debug("Preadv %lx:%d... (%d iovs)\n", + (unsigned long)iovs->iov_base, + (int)iovs->iov_len, nr); + r = sys_preadv(args->vma_ios_fd, iovs, nr, rio->off); + if (r < 0) { + pr_err("Can't read pages data (%d)\n", (int)r); + goto core_restore_end; + } + + pr_debug("`- returned %ld\n", (long)r); + /* If the file is open for writing, then it means we should punch holes + * in it. */ + if (r > 0 && args->auto_dedup) { + int fr = sys_fallocate(args->vma_ios_fd, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE, + rio->off, r); + if (fr < 0) { + pr_debug("Failed to punch holes with fallocate: %d\n", fr); + } + } + rio->off += r; + /* Advance the iovecs */ + do { + if (iovs->iov_len <= r) { + pr_debug(" `- skip pagemap\n"); + r -= iovs->iov_len; + iovs++; + nr--; + continue; + } + + iovs->iov_base += r; + iovs->iov_len -= r; + break; + } while (nr > 0); + } + + rio = ((void *)rio) + RIO_SIZE(rio->nr_iovs); + } + + sys_close(args->vma_ios_fd); + + /* + * Proxify vDSO. + */ + if (vdso_proxify(&args->vdso_maps_rt.sym, args->vdso_rt_parked_at, + args->vmas, args->vmas_n, args->compatible_mode, + fault_injected(FI_VDSO_TRAMPOLINES))) + goto core_restore_end; + + /* + * Walk though all VMAs again to drop PROT_WRITE + * if it was not there. 
+ */ + for (i = 0; i < args->vmas_n; i++) { + vma_entry = args->vmas + i; + + if (!(vma_entry_is(vma_entry, VMA_AREA_REGULAR))) + continue; + + if ((vma_entry->prot & PROT_WRITE) || + (vma_entry->status & VMA_NO_PROT_WRITE)) + continue; + + sys_mprotect(decode_pointer(vma_entry->start), + vma_entry_len(vma_entry), + vma_entry->prot); + } + + /* + * Now that all VMAs are in their places it is time to set + * up AIO rings. + */ + + for (i = 0; i < args->rings_n; i++) + if (restore_aio_ring(&args->rings[i]) < 0) + goto core_restore_end; + + /* + * Finally restore madvise() bits: every bit m set in + * vma_entry->madv is replayed as the madvise() advice value m. + */ + for (i = 0; i < args->vmas_n; i++) { + unsigned long m; + + vma_entry = args->vmas + i; + if (!vma_entry->has_madv || !vma_entry->madv) + continue; + + for (m = 0; m < sizeof(vma_entry->madv) * 8; m++) { + if (vma_entry->madv & (1ul << m)) { + ret = sys_madvise(vma_entry->start, + vma_entry_len(vma_entry), + m); + if (ret) { + pr_err("madvise(%"PRIx64", %"PRIu64", %ld) " + "failed with %ld\n", + vma_entry->start, + vma_entry_len(vma_entry), + m, ret); + goto core_restore_end; + } + } + } + } + + /* + * Tune up the task fields. + */ + ret = sys_prctl_safe(PR_SET_NAME, (long)args->comm, 0, 0); + if (ret) + goto core_restore_end; + + /* + * New kernel interface with @PR_SET_MM_MAP will become + * more widespread once kernel get deployed over the world. + * Thus lets be opportunistic and use new interface as a try. 
+ */ + prctl_map = (struct prctl_mm_map) { + .start_code = args->mm.mm_start_code, + .end_code = args->mm.mm_end_code, + .start_data = args->mm.mm_start_data, + .end_data = args->mm.mm_end_data, + .start_stack = args->mm.mm_start_stack, + .start_brk = args->mm.mm_start_brk, + .brk = args->mm.mm_brk, + .arg_start = args->mm.mm_arg_start, + .arg_end = args->mm.mm_arg_end, + .env_start = args->mm.mm_env_start, + .env_end = args->mm.mm_env_end, + .auxv = (void *)args->mm_saved_auxv, + .auxv_size = args->mm_saved_auxv_size, + .exe_fd = args->fd_exe_link, + }; + ret = sys_prctl(PR_SET_MM, PR_SET_MM_MAP, (long)&prctl_map, sizeof(prctl_map), 0); + if (ret == -EINVAL) { + ret = sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_CODE, (long)args->mm.mm_start_code, 0); + ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_CODE, (long)args->mm.mm_end_code, 0); + ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_DATA, (long)args->mm.mm_start_data, 0); + ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_DATA, (long)args->mm.mm_end_data, 0); + ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_STACK, (long)args->mm.mm_start_stack, 0); + ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_BRK, (long)args->mm.mm_start_brk, 0); + ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_BRK, (long)args->mm.mm_brk, 0); + ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_START, (long)args->mm.mm_arg_start, 0); + ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_END, (long)args->mm.mm_arg_end, 0); + ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_START, (long)args->mm.mm_env_start, 0); + ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_END, (long)args->mm.mm_env_end, 0); + ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_AUXV, (long)args->mm_saved_auxv, args->mm_saved_auxv_size); + + /* + * Because of requirements applied from kernel side + * we need to restore /proc/pid/exe symlink late, + * after old existing VMAs are superseded with + * new ones from image file. 
+ */ + ret |= restore_self_exe_late(args); + } else { + if (ret) + pr_err("sys_prctl(PR_SET_MM, PR_SET_MM_MAP) failed with %d\n", (int)ret); + sys_close(args->fd_exe_link); + } + + if (ret) + goto core_restore_end; + + /* SELinux (1) process context needs to be set before creating threads. */ + if (args->lsm_type == LSMTYPE__SELINUX) { + /* Only for SELinux */ + if (lsm_set_label(args->t->creds_args->lsm_profile, + "current", args->proc_fd) < 0) + goto core_restore_end; + } + + /* + * We need to prepare a valid sigframe here, so + * after sigreturn the kernel will pick up the + * registers from the frame, set them up and + * finally pass execution to the new IP. + */ + rt_sigframe = (void *)&args->t->mz->rt_sigframe; + + if (restore_thread_common(args->t)) + goto core_restore_end; + + /* + * Threads restoration. This requires some more comments. This + * restorer routine and thread restorer routine has the following + * memory map, prepared by a caller code. + * + * | <-- low addresses high addresses --> | + * +-------------------------------------------------------+-----------------------+ + * | this proc body | own stack | rt_sigframe space | thread restore zone | + * +-------------------------------------------------------+-----------------------+ + * + * where each thread restore zone is the following + * + * | <-- low addresses high addresses --> | + * +--------------------------------------------------------------------------+ + * | thread restore proc | thread1 stack | thread1 rt_sigframe | + * +--------------------------------------------------------------------------+ + */ + + if (args->nr_threads > 1) { + struct thread_restore_args *thread_args = args->thread_args; + long clone_flags = CLONE_VM | CLONE_FILES | CLONE_SIGHAND | + CLONE_THREAD | CLONE_SYSVSEM | CLONE_FS; + long last_pid_len; + long parent_tid; + int i, fd = -1; + + /* One level pid ns hierarhy */ + fd = sys_openat(args->proc_fd, LAST_PID_PATH, O_RDWR, 0); + if (fd < 0) { + pr_err("can't open 
last pid fd %d\n", fd); + goto core_restore_end; + } + + mutex_lock(&task_entries_local->last_pid_mutex); + + for (i = 0; i < args->nr_threads; i++) { + char last_pid_buf[16], *s; + + /* skip self */ + if (thread_args[i].pid == args->t->pid) + continue; + + new_sp = restorer_stack(thread_args[i].mz); + last_pid_len = std_vprint_num(last_pid_buf, sizeof(last_pid_buf), thread_args[i].pid - 1, &s); + sys_lseek(fd, 0, SEEK_SET); + ret = sys_write(fd, s, last_pid_len); + if (ret < 0) { + pr_err("Can't set last_pid %ld/%s\n", ret, last_pid_buf); + sys_close(fd); + mutex_unlock(&task_entries_local->last_pid_mutex); + goto core_restore_end; + } + + /* + * To achieve functionality like libc's clone() + * we need a pure assembly here, because clone()'ed + * thread will run with own stack and we must not + * have any additional instructions... oh, dear... + */ + + RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, thread_args, args->clone_restore_fn); + if (ret != thread_args[i].pid) { + pr_err("Unable to create a thread: %ld\n", ret); + mutex_unlock(&task_entries_local->last_pid_mutex); + goto core_restore_end; + } + } + + mutex_unlock(&task_entries_local->last_pid_mutex); + if (fd >= 0) + sys_close(fd); + } + + restore_rlims(args); + + ret = create_posix_timers(args); + if (ret < 0) { + pr_err("Can't restore posix timers %ld\n", ret); + goto core_restore_end; + } + + ret = timerfd_arm(args); + if (ret < 0) { + pr_err("Can't restore timerfd %ld\n", ret); + goto core_restore_end; + } + + pr_info("%ld: Restored\n", sys_getpid()); + + restore_finish_stage(task_entries_local, CR_STATE_RESTORE); + + if (wait_helpers(args) < 0) + goto core_restore_end; + if (wait_zombies(args) < 0) + goto core_restore_end; + + ksigfillset(&to_block); + ret = sys_sigprocmask(SIG_SETMASK, &to_block, NULL, sizeof(k_rtsigset_t)); + if (ret) { + pr_err("Unable to block signals %ld\n", ret); + goto core_restore_end; + } + + if (!args->compatible_mode) { + ret = sys_sigaction(SIGCHLD, 
&args->sigchld_act, + NULL, sizeof(k_rtsigset_t)); + } else { + void *stack = alloc_compat_syscall_stack(); + + if (!stack) { + pr_err("Failed to allocate 32-bit stack for sigaction\n"); + goto core_restore_end; + } + ret = arch_compat_rt_sigaction(stack, SIGCHLD, + (void*)&args->sigchld_act); + free_compat_syscall_stack(stack); + } + if (ret) { + pr_err("Failed to restore SIGCHLD: %ld\n", ret); + goto core_restore_end; + } + + ret = restore_signals(args->siginfo, args->siginfo_n, true); + if (ret) + goto core_restore_end; + + ret = restore_signals(args->t->siginfo, args->t->siginfo_n, false); + if (ret) + goto core_restore_end; + + restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD); + + rst_tcp_socks_all(args); + + /* + * Make sure it's before creds, since it's a privileged + * operation bound to uid 0 in current user ns. + */ + if (restore_seccomp(args->t)) + goto core_restore_end; + + /* + * Writing to last-pid is CAP_SYS_ADMIN protected, + * turning off TCP repair is CAP_NET_ADMIN protected, + * thus restore creds _after_ all of the above. + */ + ret = restore_creds(args->t->creds_args, args->proc_fd, + args->lsm_type); + ret = ret || restore_dumpable_flag(&args->mm); + ret = ret || restore_pdeath_sig(args->t); + + futex_set_and_wake(&thread_inprogress, args->nr_threads); + + restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS); + + if (ret) + BUG(); + + /* Wait until children stop using args->task_entries */ + futex_wait_while_gt(&thread_inprogress, 1); + + sys_close(args->proc_fd); + std_log_set_fd(-1); + + /* + * The code that prepared the itimers makes sure that the + * code below doesn't fail due to bad timing values. 
+ */ + +#define itimer_armed(args, i) \ + (args->itimers[i].it_interval.tv_sec || \ + args->itimers[i].it_interval.tv_usec) + + if (itimer_armed(args, 0)) + sys_setitimer(ITIMER_REAL, &args->itimers[0], NULL); + if (itimer_armed(args, 1)) + sys_setitimer(ITIMER_VIRTUAL, &args->itimers[1], NULL); + if (itimer_armed(args, 2)) + sys_setitimer(ITIMER_PROF, &args->itimers[2], NULL); + + restore_posix_timers(args); + + sys_munmap(args->rst_mem, args->rst_mem_size); + + /* + * Sigframe stack. + */ + new_sp = (long)rt_sigframe + RT_SIGFRAME_OFFSET(rt_sigframe); + + /* + * Prepare the stack and call for sigreturn, + * pure assembly since we don't need any additional + * code insns from gcc. + */ + rst_sigreturn(new_sp, rt_sigframe); + +core_restore_end: + futex_abort_and_wake(&task_entries_local->nr_in_progress); + pr_err("Restorer fail %ld\n", sys_getpid()); + sys_exit_group(1); + return -1; +} + +/* + * For most of the restorer's objects -fstack-protector is disabled. + * But we share some of them with CRIU, which may have it enabled. 
+ */ +void __stack_chk_fail(void) +{ + pr_err("Restorer stack smash detected %ld\n", sys_getpid()); + sys_exit_group(1); + BUG(); +} diff --git a/CRIU_code/criu/pie/util-vdso-elf32.c b/CRIU_code/criu/pie/util-vdso-elf32.c new file mode 100644 index 0000000..97928c0 --- /dev/null +++ b/CRIU_code/criu/pie/util-vdso-elf32.c @@ -0,0 +1 @@ +util-vdso.c \ No newline at end of file diff --git a/CRIU_code/criu/pie/util-vdso.c b/CRIU_code/criu/pie/util-vdso.c new file mode 100644 index 0000000..104da06 --- /dev/null +++ b/CRIU_code/criu/pie/util-vdso.c @@ -0,0 +1,329 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "image.h" +#include "util-vdso.h" +#include "vma.h" +#include "log.h" +#include "common/bug.h" + +#ifdef CR_NOGLIBC +# include +#else +# include +# define std_strncmp strncmp +#endif + +#ifdef LOG_PREFIX +# undef LOG_PREFIX +#endif +#define LOG_PREFIX "vdso: " + +/* Check if pointer is out-of-bound */ +static bool __ptr_oob(uintptr_t ptr, uintptr_t start, size_t size) +{ + uintptr_t end = start + size; + + return ptr >= end || ptr < start; +} + +/* Check if pointed structure's end is out-of-bound */ +static bool __ptr_struct_end_oob(uintptr_t ptr, size_t struct_size, + uintptr_t start, size_t size) +{ + return __ptr_oob(ptr + struct_size - 1, start, size); +} + +/* Check if pointed structure is out-of-bound */ +static bool __ptr_struct_oob(uintptr_t ptr, size_t struct_size, + uintptr_t start, size_t size) +{ + return __ptr_oob(ptr, start, size) || + __ptr_struct_end_oob(ptr, struct_size, start, size); +} + +/* + * Elf hash, see format specification. 
+ */ +static unsigned long elf_hash(const unsigned char *name) +{ + unsigned long h = 0, g; + + while (*name) { + h = (h << 4) + *name++; + g = h & 0xf0000000ul; + if (g) + h ^= g >> 24; + h &= ~g; + } + return h; +} + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define BORD ELFDATA2MSB /* 0x02 */ +#else +#define BORD ELFDATA2LSB /* 0x01 */ +#endif + +static int has_elf_identity(Ehdr_t *ehdr) +{ + /* + * See Elf specification for this magic values. + */ +#if defined(CONFIG_VDSO_32) + static const char elf_ident[] = { + 0x7f, 0x45, 0x4c, 0x46, 0x01, BORD, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }; +#else + static const char elf_ident[] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, BORD, 0x01, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }; +#endif + + BUILD_BUG_ON(sizeof(elf_ident) != sizeof(ehdr->e_ident)); + + if (memcmp(ehdr->e_ident, elf_ident, sizeof(elf_ident))) { + pr_err("ELF header magic mismatch\n"); + return false; + } + + return true; +} + +static int parse_elf_phdr(uintptr_t mem, size_t size, + Phdr_t **dynamic, Phdr_t **load) +{ + Ehdr_t *ehdr = (void *)mem; + uintptr_t addr; + Phdr_t *phdr; + int i; + + if (__ptr_struct_end_oob(mem, sizeof(Ehdr_t), mem, size)) + goto err_oob; + /* + * Make sure it's a file we support. + */ + if (!has_elf_identity(ehdr)) + return -EINVAL; + + addr = mem + ehdr->e_phoff; + if (__ptr_oob(addr, mem, size)) + goto err_oob; + + for (i = 0; i < ehdr->e_phnum; i++, addr += sizeof(Phdr_t)) { + if (__ptr_struct_end_oob(addr, sizeof(Phdr_t), mem, size)) + goto err_oob; + + phdr = (void *)addr; + switch (phdr->p_type) { + case PT_DYNAMIC: + if (*dynamic) { + pr_err("Second PT_DYNAMIC header\n"); + return -EINVAL; + } + *dynamic = phdr; + break; + case PT_LOAD: + if (*load) { + pr_err("Second PT_LOAD header\n"); + return -EINVAL; + } + *load = phdr; + break; + } + } + return 0; + +err_oob: + pr_err("Corrupted Elf phdr\n"); + return -EFAULT; +} + +/* + * Parse dynamic program header. 
+ * Output parameters are: + * @dyn_strtab - dynamic entry of the string table (DT_STRTAB) + * @dyn_symtab - dynamic entry of the symbol table (DT_SYMTAB) + * @dyn_hash - dynamic entry of the symbol hash table (DT_HASH) + */ +static int parse_elf_dynamic(uintptr_t mem, size_t size, Phdr_t *dynamic, + Dyn_t **dyn_strtab, Dyn_t **dyn_symtab, Dyn_t **dyn_hash) +{ + Dyn_t *dyn_syment = NULL; + Dyn_t *dyn_strsz = NULL; + uintptr_t addr; + Dyn_t *d; + int i; + + addr = mem + dynamic->p_offset; + if (__ptr_oob(addr, mem, size)) + goto err_oob; + + for (i = 0; i < dynamic->p_filesz / sizeof(*d); + i++, addr += sizeof(Dyn_t)) { + if (__ptr_struct_end_oob(addr, sizeof(Dyn_t), mem, size)) + goto err_oob; + d = (void *)addr; + + if (d->d_tag == DT_NULL) { + break; + } else if (d->d_tag == DT_STRTAB) { + *dyn_strtab = d; + pr_debug("DT_STRTAB: %lx\n", (unsigned long)d->d_un.d_ptr); + } else if (d->d_tag == DT_SYMTAB) { + *dyn_symtab = d; + pr_debug("DT_SYMTAB: %lx\n", (unsigned long)d->d_un.d_ptr); + } else if (d->d_tag == DT_STRSZ) { + dyn_strsz = d; + pr_debug("DT_STRSZ: %lx\n", (unsigned long)d->d_un.d_val); + } else if (d->d_tag == DT_SYMENT) { + dyn_syment = d; + pr_debug("DT_SYMENT: %lx\n", (unsigned long)d->d_un.d_val); + } else if (d->d_tag == DT_HASH) { + *dyn_hash = d; + pr_debug("DT_HASH: %lx\n", (unsigned long)d->d_un.d_ptr); + } + } + + if (!*dyn_strtab || !*dyn_symtab || !dyn_strsz || !dyn_syment || !*dyn_hash) { + pr_err("Not all dynamic entries are present\n"); + return -EINVAL; + } + + return 0; + +err_oob: + pr_err("Corrupted Elf dynamic section\n"); + return -EFAULT; +} + +/* On s390x Hash_t is 64 bit */ +#ifdef __s390x__ +typedef unsigned long Hash_t; +#else +typedef Word_t Hash_t; +#endif + +static void parse_elf_symbols(uintptr_t mem, size_t size, Phdr_t *load, + struct vdso_symtable *t, uintptr_t dynsymbol_names, + Hash_t *hash, Dyn_t *dyn_symtab) +{ + const char *vdso_symbols[VDSO_SYMBOL_MAX] = { + ARCH_VDSO_SYMBOLS + }; + const size_t vdso_symbol_length = sizeof(t->symbols[0].name); + + Hash_t 
nbucket, nchain; + Hash_t *bucket, *chain; + + unsigned int i, j, k; + uintptr_t addr; + + nbucket = hash[0]; + nchain = hash[1]; + bucket = &hash[2]; + chain = &hash[nbucket + 2]; + + pr_debug("nbucket %lx nchain %lx bucket %lx chain %lx\n", + (long)nbucket, (long)nchain, (unsigned long)bucket, (unsigned long)chain); + + for (i = 0; i < VDSO_SYMBOL_MAX; i++) { + const char * symbol = vdso_symbols[i]; + k = elf_hash((const unsigned char *)symbol); + + for (j = bucket[k % nbucket]; j < nchain && j != STN_UNDEF; j = chain[j]) { + addr = mem + dyn_symtab->d_un.d_ptr - load->p_vaddr; + Sym_t *sym; + char *name; + + addr += sizeof(Sym_t)*j; + if (__ptr_struct_oob(addr, sizeof(Sym_t), mem, size)) + continue; + sym = (void *)addr; + + if (ELF_ST_TYPE(sym->st_info) != STT_FUNC && + ELF_ST_BIND(sym->st_info) != STB_GLOBAL) + continue; + + addr = dynsymbol_names + sym->st_name; + if (__ptr_struct_oob(addr, vdso_symbol_length, mem, size)) + continue; + name = (void *)addr; + + if (std_strncmp(name, symbol, vdso_symbol_length)) + continue; + + memcpy(t->symbols[i].name, name, vdso_symbol_length); + t->symbols[i].offset = (unsigned long)sym->st_value - load->p_vaddr; + break; + } + } +} + +int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) +{ + Phdr_t *dynamic = NULL, *load = NULL; + Dyn_t *dyn_strtab = NULL; + Dyn_t *dyn_symtab = NULL; + Dyn_t *dyn_hash = NULL; + Hash_t *hash = NULL; + + uintptr_t dynsymbol_names; + uintptr_t addr; + int ret; + + pr_debug("Parsing at %lx %lx\n", (long)mem, (long)mem + (long)size); + + /* + * We need PT_LOAD and PT_DYNAMIC here. Each once. + */ + ret = parse_elf_phdr(mem, size, &dynamic, &load); + if (ret < 0) + return ret; + if (!load || !dynamic) { + pr_err("One of obligated program headers is missed\n"); + return -EINVAL; + } + + pr_debug("PT_LOAD p_vaddr: %lx\n", (unsigned long)load->p_vaddr); + + /* + * Dynamic section tags should provide us the rest of information + * needed. 
Note that we're interested in a small set of tags. + */ + + ret = parse_elf_dynamic(mem, size, dynamic, + &dyn_strtab, &dyn_symtab, &dyn_hash); + if (ret < 0) + return ret; + + addr = mem + dyn_strtab->d_un.d_val - load->p_vaddr; + if (__ptr_oob(addr, mem, size)) + goto err_oob; + dynsymbol_names = addr; + + addr = mem + dyn_hash->d_un.d_ptr - load->p_vaddr; + if (__ptr_struct_oob(addr, sizeof(Word_t), mem, size)) + goto err_oob; + hash = (void *)addr; + + parse_elf_symbols(mem, size, load, t, dynsymbol_names, hash, dyn_symtab); + + return 0; + +err_oob: + pr_err("Corrupted Elf symbols/hash\n"); + return -EFAULT; +} + diff --git a/CRIU_code/criu/pie/util.c b/CRIU_code/criu/pie/util.c new file mode 100644 index 0000000..4945483 --- /dev/null +++ b/CRIU_code/criu/pie/util.c @@ -0,0 +1,54 @@ +#include +#include +#include +#include +#include +#include + +#include "int.h" +#include "types.h" +#include "common/compiler.h" +#include "fcntl.h" +#include "log.h" +#include "util-pie.h" + +#ifdef CR_NOGLIBC +# include +# define __sys(foo) sys_##foo +#else +# define __sys(foo) foo +#endif + +#ifdef CR_NOGLIBC +#define __pr_perror(fmt, ...) pr_err(fmt "\n", ##__VA_ARGS__) +#else +#define __pr_perror(fmt, ...) 
pr_perror(fmt, ##__VA_ARGS__) +#endif + +int open_detach_mount(char *dir) +{ + int fd, ret; + + fd = __sys(open)(dir, O_RDONLY | O_DIRECTORY, 0); + if (fd < 0) + __pr_perror("Can't open directory %s: %d", dir, fd); + + ret = __sys(umount2)(dir, MNT_DETACH); + if (ret) { + __pr_perror("Can't detach mount %s: %d", dir, ret); + goto err_close; + } + + ret = __sys(rmdir)(dir); + if (ret) { + __pr_perror("Can't remove tmp dir %s: %d", dir, ret); + goto err_close; + } + + return fd; + +err_close: + if (fd >= 0) + __sys(close)(fd); + return -1; +} diff --git a/CRIU_code/criu/pipes.c b/CRIU_code/criu/pipes.c new file mode 100644 index 0000000..fd1a7e6 --- /dev/null +++ b/CRIU_code/criu/pipes.c @@ -0,0 +1,526 @@ +#include +#include +#include +#include +#include + +#include "crtools.h" +#include "imgset.h" +#include "image.h" +#include "files.h" +#include "pipes.h" +#include "util-pie.h" +#include "autofs.h" + +#include "protobuf.h" +#include "util.h" +#include "images/pipe.pb-c.h" +#include "images/pipe-data.pb-c.h" +#include "fcntl.h" +#include "namespaces.h" + +static LIST_HEAD(pipes); + +static void show_saved_pipe_fds(struct pipe_info *pi) +{ + struct fdinfo_list_entry *fle; + + pr_info(" `- ID %p %#x\n", pi, pi->pe->id); + list_for_each_entry(fle, &pi->d.fd_info_head, desc_list) + pr_info(" `- FD %d pid %d\n", fle->fe->fd, fle->pid); +} + +static int pipe_data_read(struct cr_img *img, struct pipe_data_rst *r) +{ + unsigned long bytes = r->pde->bytes; + + if (!bytes) + return 0; + + /* + * We potentially allocate more memory than required for data, + * but this is OK. Look at restore_pipe_data -- it vmsplice-s + * this into the kernel with F_GIFT flag (since some time it + * works on non-aligned data), thus just giving this page to + * pipe buffer. 
And since kernel allocates pipe buffers in pages + * anyway we don't increase memory consumption :) + */ + + r->data = mmap(NULL, bytes, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, 0, 0); + if (r->data == MAP_FAILED) { + pr_perror("Can't map mem for pipe buffers"); + return -1; + } + + return read_img_buf(img, r->data, bytes); +} + +int do_collect_pipe_data(struct pipe_data_rst *r, ProtobufCMessage *msg, + struct cr_img *img, struct pipe_data_rst **hash) +{ + int aux; + + r->pde = pb_msg(msg, PipeDataEntry); + aux = pipe_data_read(img, r); + if (aux < 0) + return aux; + + aux = r->pde->pipe_id & PIPE_DATA_HASH_MASK; + r->next = hash[aux]; + hash[aux] = r; + pr_info("Collected pipe data for %#x (chain %u)\n", + r->pde->pipe_id, aux); + return 0; +} + +/* Choose who will restore a pipe. */ +static int mark_pipe_master_cb(struct pprep_head *ph) +{ + LIST_HEAD(head); + + pr_info("Pipes:\n"); + + while (1) { + struct fdinfo_list_entry *fle; + struct pipe_info *pi, *pic, *p; + struct pipe_info *pr = NULL, *pw = NULL; + + if (list_empty(&pipes)) + break; + + pi = list_first_entry(&pipes, struct pipe_info, list); + list_move(&pi->list, &head); + + pr_info(" `- PIPE ID %#x\n", pi->pe->pipe_id); + show_saved_pipe_fds(pi); + + fle = file_master(&pi->d); + p = pi; + if (!(pi->pe->flags & O_LARGEFILE)) { + if (pi->pe->flags & O_WRONLY) { + if (pw == NULL) + pw = pi; + } else { + if (pr == NULL) + pr = pi; + } + } + + list_for_each_entry(pic, &pi->pipe_list, pipe_list) { + struct fdinfo_list_entry *f; + + list_move(&pic->list, &head); + f = file_master(&pic->d); + if (fdinfo_rst_prio(f, fle)) { + p = pic; + fle = f; + } + + if (!(pic->pe->flags & O_LARGEFILE)) { + if (pic->pe->flags & O_WRONLY) { + if (pw == NULL) + pw = pic; + } else { + if (pr == NULL) + pr = pic; + } + } + + show_saved_pipe_fds(pic); + } + p->create = 1; + if (pr) + pr->reopen = 0; + if (pw) + pw->reopen = 0; + pr_info(" by %#x\n", p->pe->id); + } + + list_splice(&head, &pipes); + return 0; +} + 
+static MAKE_PPREP_HEAD(mark_pipe_master); + +static struct pipe_data_rst *pd_hash_pipes[PIPE_DATA_HASH_SIZE]; + +int restore_pipe_data(int img_type, int pfd, u32 id, struct pipe_data_rst **hash) +{ + int ret; + struct pipe_data_rst *pd; + struct iovec iov; + + for (pd = hash[id & PIPE_DATA_HASH_MASK]; pd != NULL; pd = pd->next) + if (pd->pde->pipe_id == id) + break; + + if (!pd) { /* no data for this pipe */ + pr_info("No data for pipe %#x\n", id); + return 0; + } + + if (!pd->pde->bytes) + goto out; + + if (!pd->data) { + pr_err("Double data restore occurred on %#x\n", id); + return -1; + } + + if (pd->pde->has_size) { + pr_info("Restoring size %#x for %#x\n", + pd->pde->size, pd->pde->pipe_id); + ret = fcntl(pfd, F_SETPIPE_SZ, pd->pde->size); + if (ret < 0) { + pr_perror("Can't restore pipe size"); + goto err; + } + } + + iov.iov_base = pd->data; + iov.iov_len = pd->pde->bytes; + + while (iov.iov_len > 0) { + ret = vmsplice(pfd, &iov, 1, SPLICE_F_GIFT | SPLICE_F_NONBLOCK); + if (ret < 0) { + pr_perror("%#x: Error splicing data", id); + goto err; + } + + if (ret == 0 || ret > iov.iov_len /* sanity */) { + pr_err("%#x: Wanted to restore %zu bytes, but got %d\n", id, + iov.iov_len, ret); + ret = -1; + goto err; + } + + iov.iov_base += ret; + iov.iov_len -= ret; + } + + /* + * 3 reasons for killing the buffer from our address space: + * + * 1. We gifted the pages to the kernel to optimize memory usage, thus + * accidental memory corruption can change the pipe buffer. + * 2. This will make the vmas restoration a bit faster due to less self + * mappings to be unmapped. + * 3. We can catch bugs with double pipe data restore. 
+ */ + + munmap(pd->data, pd->pde->bytes); + pd->data = NULL; +out: + ret = 0; +err: + return ret; +} + +static int userns_reopen(void *_arg, int fd, pid_t pid) +{ + char path[PSFDS]; + int ret, flags = *(int*)_arg; + + sprintf(path, "/proc/self/fd/%d", fd); + ret = open(path, flags); + if (ret < 0) + pr_perror("Unable to reopen the pipe %s", path); + close(fd); + + return ret; +} + +static int reopen_pipe(int fd, int flags) +{ + int ret; + char path[PSFDS]; + + sprintf(path, "/proc/self/fd/%d", fd); + ret = open(path, flags); + if (ret < 0) { + if (errno == EACCES) { + /* It may be an external pipe from an another userns */ + ret = userns_call(userns_reopen, UNS_FDOUT, + &flags, sizeof(flags), fd); + } else + pr_perror("Unable to reopen the pipe %s", path); + } + close(fd); + + return ret; +} + +static int recv_pipe_fd(struct pipe_info *pi, int *new_fd) +{ + int tmp, fd, ret; + + ret = recv_desc_from_peer(&pi->d, &tmp); + if (ret != 0) { + if (ret != 1) + pr_err("Can't get fd %d\n", tmp); + return ret; + } + + if (pi->reopen) + fd = reopen_pipe(tmp, pi->pe->flags); + else + fd = tmp; + if (fd >= 0) { + if (rst_file_params(fd, pi->pe->fown, pi->pe->flags)) { + close(fd); + return -1; + } + *new_fd = fd; + } + + return fd < 0 ? 
-1 : 0; +} + +static char *pipe_d_name(struct file_desc *d, char *buf, size_t s) +{ + struct pipe_info *pi; + + pi = container_of(d, struct pipe_info, d); + if (snprintf(buf, s, "pipe:[%d]", pi->pe->pipe_id) >= s) { + pr_err("Not enough room for pipe %d identifier string\n", + pi->pe->pipe_id); + return NULL; + } + + return buf; +} + +int open_pipe(struct file_desc *d, int *new_fd) +{ + struct pipe_info *pi, *p; + int ret, tmp; + int pfd[2]; + + pi = container_of(d, struct pipe_info, d); + pr_info("\t\tCreating pipe pipe_id=%#x id=%#x\n", pi->pe->pipe_id, pi->pe->id); + if (inherited_fd(d, &tmp)) { + if (tmp < 0) + return tmp; + + pi->reopen = 1; + goto reopen; + } + + if (!pi->create) + return recv_pipe_fd(pi, new_fd); + + if (pipe(pfd) < 0) { + pr_perror("Can't create pipe"); + return -1; + } + + ret = restore_pipe_data(CR_FD_PIPES_DATA, pfd[1], + pi->pe->pipe_id, pd_hash_pipes); + if (ret) + return -1; + + list_for_each_entry(p, &pi->pipe_list, pipe_list) { + int fd = pfd[p->pe->flags & O_WRONLY]; + + if (send_desc_to_peer(fd, &p->d)) { + pr_perror("Can't send file descriptor"); + return -1; + } + } + + close(pfd[!(pi->pe->flags & O_WRONLY)]); + tmp = pfd[pi->pe->flags & O_WRONLY]; + +reopen: + if (pi->reopen) + tmp = reopen_pipe(tmp, pi->pe->flags); + + if (tmp >= 0) + if (rst_file_params(tmp, pi->pe->fown, pi->pe->flags)) + return -1; + if (tmp < 0) + return -1; + *new_fd = tmp; + return 0; +} + +static struct file_desc_ops pipe_desc_ops = { + .type = FD_TYPES__PIPE, + .open = open_pipe, + .name = pipe_d_name, +}; + +int collect_one_pipe_ops(void *o, ProtobufCMessage *base, struct file_desc_ops *ops) +{ + struct pipe_info *pi = o, *tmp; + + pi->pe = pb_msg(base, PipeEntry); + + pi->create = 0; + pi->reopen = 1; + pr_info("Collected pipe entry ID %#x PIPE ID %#x\n", + pi->pe->id, pi->pe->pipe_id); + + if (file_desc_add(&pi->d, pi->pe->id, ops)) + return -1; + + INIT_LIST_HEAD(&pi->pipe_list); + if (!inherited_fd(&pi->d, NULL)) { + list_for_each_entry(tmp, 
&pipes, list) + if (pi->pe->pipe_id == tmp->pe->pipe_id) + break; + + if (&tmp->list != &pipes) + list_add(&pi->pipe_list, &tmp->pipe_list); + } + + add_post_prepare_cb_once(&mark_pipe_master); + list_add_tail(&pi->list, &pipes); + + return 0; +} + +static int collect_one_pipe(void *o, ProtobufCMessage *base, struct cr_img *i) +{ + return collect_one_pipe_ops(o, base, &pipe_desc_ops); +} + +struct collect_image_info pipe_cinfo = { + .fd_type = CR_FD_PIPES, + .pb_type = PB_PIPE, + .priv_size = sizeof(struct pipe_info), + .collect = collect_one_pipe, +}; + +static int collect_pipe_data(void *obj, ProtobufCMessage *msg, struct cr_img *img) +{ + return do_collect_pipe_data(obj, msg, img, pd_hash_pipes); +} + +struct collect_image_info pipe_data_cinfo = { + .fd_type = CR_FD_PIPES_DATA, + .pb_type = PB_PIPE_DATA, + .priv_size = sizeof(struct pipe_data_rst), + .collect = collect_pipe_data, +}; + +int dump_one_pipe_data(struct pipe_data_dump *pd, int lfd, const struct fd_parms *p) +{ + struct cr_img *img; + int pipe_size, i, bytes; + int steal_pipe[2]; + int ret = -1; + PipeDataEntry pde = PIPE_DATA_ENTRY__INIT; + + if (p->flags & O_WRONLY) + return 0; + + /* Maybe we've dumped it already */ + for (i = 0; i < pd->nr; i++) { + if (pd->ids[i] == pipe_id(p)) + return 0; + } + + pr_info("Dumping data from pipe %#x fd %d\n", pipe_id(p), lfd); + + if (pd->nr >= NR_PIPES_WITH_DATA) { + pr_err("OOM storing pipe\n"); + return -1; + } + + img = img_from_set(glob_imgset, pd->img_type); + pd->ids[pd->nr++] = pipe_id(p); + + pipe_size = fcntl(lfd, F_GETPIPE_SZ); + if (pipe_size < 0) { + pr_err("Can't obtain piped data size\n"); + goto err; + } + + if (pipe(steal_pipe) < 0) { + pr_perror("Can't create pipe for stealing data"); + goto err; + } + + /* steal_pipe has to be able to fit all data from a target pipe */ + if (fcntl(steal_pipe[1], F_SETPIPE_SZ, pipe_size) < 0) { + pr_perror("Unable to set a pipe size"); + goto err; + } + + bytes = tee(lfd, steal_pipe[1], pipe_size, 
SPLICE_F_NONBLOCK); + if (bytes < 0) { + if (errno != EAGAIN) { + pr_perror("Can't pick pipe data"); + goto err_close; + } + + bytes = 0; + } + + pde.pipe_id = pipe_id(p); + pde.bytes = bytes; + pde.has_size = true; + pde.size = pipe_size; + + if (pb_write_one(img, &pde, PB_PIPE_DATA)) + goto err_close; + + if (bytes) { + int wrote; + + wrote = splice(steal_pipe[0], NULL, img_raw_fd(img), NULL, bytes, 0); + if (wrote < 0) { + pr_perror("Can't push pipe data"); + goto err_close; + } else if (wrote != bytes) { + pr_err("%#x: Wanted to write %d bytes, but wrote %d\n", + pipe_id(p), bytes, wrote); + goto err_close; + } + } + + ret = 0; + +err_close: + close(steal_pipe[0]); + close(steal_pipe[1]); +err: + return ret; +} + +static struct pipe_data_dump pd_pipes = { .img_type = CR_FD_PIPES_DATA, }; + +static int dump_one_pipe(int lfd, u32 id, const struct fd_parms *p) +{ + FileEntry fe = FILE_ENTRY__INIT; + PipeEntry pe = PIPE_ENTRY__INIT; + + pr_info("Dumping pipe %d with id %#x pipe_id %#x\n", + lfd, id, pipe_id(p)); + + if ((p->flags & O_DIRECT) && !is_autofs_pipe(pipe_id(p))) { + pr_err("The packetized mode for pipes is not supported yet\n"); + return -1; + } + + pe.id = id; + pe.pipe_id = pipe_id(p); + pe.flags = p->flags & ~O_DIRECT; + pe.fown = (FownEntry *)&p->fown; + + fe.type = FD_TYPES__PIPE; + fe.id = pe.id; + fe.pipe = &pe; + + if (pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE)) + return -1; + + return dump_one_pipe_data(&pd_pipes, lfd, p); +} + +const struct fdtype_ops pipe_dump_ops = { + .type = FD_TYPES__PIPE, + .dump = dump_one_pipe, +}; diff --git a/CRIU_code/criu/plugin.c b/CRIU_code/criu/plugin.c new file mode 100644 index 0000000..b97d376 --- /dev/null +++ b/CRIU_code/criu/plugin.c @@ -0,0 +1,260 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "cr_options.h" +#include "common/compiler.h" +#include "xmalloc.h" +#include "plugin.h" +#include "servicefd.h" +#include "common/list.h" +#include 
"log.h" + +cr_plugin_ctl_t cr_plugin_ctl = { + .head.next = &cr_plugin_ctl.head, + .head.prev = &cr_plugin_ctl.head, +}; + +/* + * If we met old version of a plugin, selfgenerate a plugin descriptor for it. + */ +static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path) +{ + cr_plugin_desc_t *d; + + d = xzalloc(sizeof(*d)); + if (!d) + return NULL; + + d->name = xstrdup(path); + d->max_hooks = CR_PLUGIN_HOOK__MAX; + d->version = CRIU_PLUGIN_VERSION_OLD; + + pr_warn("Generating dynamic descriptor for plugin `%s'." + "Won't work in next version of the program." + "Please update your plugin.\n", path); + +#define __assign_hook(__hook, __name) \ + do { \ + void *name; \ + name = dlsym(h, __name); \ + if (name) \ + d->hooks[CR_PLUGIN_HOOK__ ##__hook] = name; \ + } while (0) + + __assign_hook(DUMP_UNIX_SK, "cr_plugin_dump_unix_sk"); + __assign_hook(RESTORE_UNIX_SK, "cr_plugin_restore_unix_sk"); + __assign_hook(DUMP_EXT_FILE, "cr_plugin_dump_file"); + __assign_hook(RESTORE_EXT_FILE, "cr_plugin_restore_file"); + __assign_hook(DUMP_EXT_MOUNT, "cr_plugin_dump_ext_mount"); + __assign_hook(RESTORE_EXT_MOUNT, "cr_plugin_restore_ext_mount"); + __assign_hook(DUMP_EXT_LINK, "cr_plugin_dump_ext_link"); + +#undef __assign_hook + + d->init = dlsym(h, "cr_plugin_init"); + d->exit = dlsym(h, "cr_plugin_fini"); + + return d; +} + +static void show_plugin_desc(cr_plugin_desc_t *d) +{ + size_t i; + + pr_debug("Plugin \"%s\" (version %u hooks %u)\n", + d->name, d->version, d->max_hooks); + for (i = 0; i < d->max_hooks; i++) { + if (d->hooks[i]) + pr_debug("\t%4zu -> %p\n", i, d->hooks[i]); + } +} + +static int verify_plugin(cr_plugin_desc_t *d) +{ + if (d->version > CRIU_PLUGIN_VERSION) { + pr_debug("Plugin %s has version %x while max %x supported\n", + d->name, d->version, CRIU_PLUGIN_VERSION); + return -1; + } + + if (d->max_hooks > CR_PLUGIN_HOOK__MAX) { + pr_debug("Plugin %s has %u assigned while max %u supported\n", + d->name, d->max_hooks, CR_PLUGIN_HOOK__MAX); + return -1; + 
} + + return 0; +} + +int criu_get_image_dir(void) +{ + return get_service_fd(IMG_FD_OFF); +} + +static int cr_lib_load(int stage, char *path) +{ + cr_plugin_desc_t *d; + plugin_desc_t *this; + size_t i; + void *h; + bool allocated = false; + + h = dlopen(path, RTLD_LAZY); + if (h == NULL) { + pr_err("Unable to load %s: %s\n", path, dlerror()); + return -1; + } + + /* + * Load plugin descriptor. If plugin is too old -- create + * dynamic plugin descriptor. In most cases this won't + * be a common operation and plugins are not supposed to + * be changing own format frequently. + */ + d = dlsym(h, "CR_PLUGIN_DESC"); + if (!d) { + d = cr_gen_plugin_desc(h, path); + if (!d) { + pr_err("Can't load plugin %s\n", path); + goto error_close; + } + allocated = true; + } + + this = xzalloc(sizeof(*this)); + if (!this) + goto error_close; + + if (verify_plugin(d)) { + pr_err("Corrupted plugin %s\n", path); + goto error_free; + } + + this->d = d; + this->dlhandle = h; + INIT_LIST_HEAD(&this->list); + + for (i = 0; i < d->max_hooks; i++) + INIT_LIST_HEAD(&this->link[i]); + + list_add_tail(&this->list, &cr_plugin_ctl.head); + show_plugin_desc(d); + + if (d->init && d->init(stage)) { + pr_err("Failed in init(%d) of \"%s\"\n", stage, d->name); + list_del(&this->list); + goto error_free; + } + + /* + * Chain hooks into appropriate places for + * fast handler access. 
+ */ + for (i = 0; i < d->max_hooks; i++) { + if (!d->hooks[i]) + continue; + list_add_tail(&this->link[i], &cr_plugin_ctl.hook_chain[i]); + } + + return 0; + +error_free: + xfree(this); +error_close: + dlclose(h); + if (allocated) + xfree(d); + return -1; +} + +void cr_plugin_fini(int stage, int ret) +{ + plugin_desc_t *this, *tmp; + + list_for_each_entry_safe(this, tmp, &cr_plugin_ctl.head, list) { + void *h = this->dlhandle; + size_t i; + + list_del(&this->list); + if (this->d->exit) + this->d->exit(stage, ret); + + for (i = 0; i < this->d->max_hooks; i++) { + if (!list_empty(&this->link[i])) + list_del(&this->link[i]); + } + + if (this->d->version == CRIU_PLUGIN_VERSION_OLD) + xfree(this->d); + dlclose(h); + } +} + +int cr_plugin_init(int stage) +{ + int exit_code = -1; + char *path; + size_t i; + DIR *d; + + INIT_LIST_HEAD(&cr_plugin_ctl.head); + for (i = 0; i < ARRAY_SIZE(cr_plugin_ctl.hook_chain); i++) + INIT_LIST_HEAD(&cr_plugin_ctl.hook_chain[i]); + + if (opts.libdir == NULL) { + path = getenv("CRIU_LIBS_DIR"); + if (path) + SET_CHAR_OPTS(libdir, path); + else { + if (access(CR_PLUGIN_DEFAULT, F_OK)) + return 0; + + SET_CHAR_OPTS(libdir, CR_PLUGIN_DEFAULT); + } + } + + d = opendir(opts.libdir); + if (d == NULL) { + pr_perror("Unable to open directory %s", opts.libdir); + return -1; + } + + while (1) { + char path[PATH_MAX]; + struct dirent *de; + int len; + + errno = 0; + de = readdir(d); + if (de == NULL) { + if (errno == 0) + break; + pr_perror("Unable to read the libraries directory"); + goto err; + } + + len = strlen(de->d_name); + + if (len < 3 || strncmp(de->d_name + len - 3, ".so", 3)) + continue; + + snprintf(path, sizeof(path), "%s/%s", opts.libdir, de->d_name); + + if (cr_lib_load(stage, path)) + goto err; + } + + exit_code = 0; +err: + closedir(d); + + if (exit_code) + cr_plugin_fini(stage, exit_code); + + return exit_code; +} diff --git a/CRIU_code/criu/proc_parse.c b/CRIU_code/criu/proc_parse.c new file mode 100644 index 0000000..f6ebb1f --- 
/dev/null +++ b/CRIU_code/criu/proc_parse.c @@ -0,0 +1,2657 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "types.h" +#include "common/list.h" +#include "util.h" +#include "mount.h" +#include "filesystems.h" +#include "mman.h" +#include "cpu.h" +#include "file-lock.h" +#include "pstree.h" +#include "fsnotify.h" +#include "posix-timer.h" +#include "kerndat.h" +#include "vdso.h" +#include "vma.h" +#include "mem.h" +#include "bfd.h" +#include "proc_parse.h" +#include "fdinfo.h" +#include "parasite.h" +#include "cr_options.h" +#include "sysfs_parse.h" +#include "seccomp.h" +#include "string.h" +#include "namespaces.h" +#include "files-reg.h" +#include "cgroup.h" +#include "cgroup-props.h" +#include "timerfd.h" +#include "path.h" +#include "fault-injection.h" + +#include "protobuf.h" +#include "images/fdinfo.pb-c.h" +#include "images/mnt.pb-c.h" + +#include + +#ifndef SIGEV_SIGNAL +#define SIGEV_SIGNAL 0 /* notify via signal */ +#endif +#ifndef SIGEV_NONE +#define SIGEV_NONE 1 /* other notification: meaningless */ +#endif +#ifndef SIGEV_THREAD +#define SIGEV_THREAD 2 /* deliver via thread creation */ +#endif +#ifndef SIGEV_THREAD_ID +#define SIGEV_THREAD_ID 4 /* deliver to thread */ +#endif + +#define BUF_SIZE 4096 /* Good enough value - can be changed */ + +struct buffer { + char buf[BUF_SIZE]; + char end; /* '\0' */ +}; + +static struct buffer __buf; +static char *buf = __buf.buf; + +/* + * This is how AIO ring buffers look like in proc + */ + +#define AIO_FNAME "/[aio]" + +/* check the @line starts with "%lx-%lx" format */ +static bool __is_vma_range_fmt(char *line) +{ +#define ____is_vma_addr_char(__c) \ + (((__c) <= '9' && (__c) >= '0') || \ + ((__c) <= 'f' && (__c) >= 'a')) + + while (*line && ____is_vma_addr_char(*line)) + line++; + + if (*line++ != '-') + return false; + + while (*line && ____is_vma_addr_char(*line)) + line++; + + if (*line++ != ' ') + return false; + + 
return true; +#undef ____is_vma_addr_char +} + +bool is_vma_range_fmt(char *line) +{ + return __is_vma_range_fmt(line); +} + +static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf) +{ + char *tok; + + if (!buf[0]) + return; + + tok = strtok(buf, " \n"); + if (!tok) + return; + +#define _vmflag_match(_t, _s) (_t[0] == _s[0] && _t[1] == _s[1]) + + do { + /* mmap() block */ + if (_vmflag_match(tok, "gd")) + *flags |= MAP_GROWSDOWN; + else if (_vmflag_match(tok, "lo")) + *flags |= MAP_LOCKED; + else if (_vmflag_match(tok, "nr")) + *flags |= MAP_NORESERVE; + else if (_vmflag_match(tok, "ht")) + *flags |= MAP_HUGETLB; + + /* madvise() block */ + if (_vmflag_match(tok, "sr")) + *madv |= (1ul << MADV_SEQUENTIAL); + else if (_vmflag_match(tok, "rr")) + *madv |= (1ul << MADV_RANDOM); + else if (_vmflag_match(tok, "dc")) + *madv |= (1ul << MADV_DONTFORK); + else if (_vmflag_match(tok, "dd")) + *madv |= (1ul << MADV_DONTDUMP); + else if (_vmflag_match(tok, "mg")) + *madv |= (1ul << MADV_MERGEABLE); + else if (_vmflag_match(tok, "hg")) + *madv |= (1ul << MADV_HUGEPAGE); + else if (_vmflag_match(tok, "nh")) + *madv |= (1ul << MADV_NOHUGEPAGE); + + /* vmsplice doesn't work for VM_IO and VM_PFNMAP mappings. */ + if (_vmflag_match(tok, "io") || _vmflag_match(tok, "pf")) + *io_pf = 1; + + /* + * Anything else is just ignored. 
+ */ + } while ((tok = strtok(NULL, " \n"))); + +#undef _vmflag_match +} + +void parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf) +{ + __parse_vmflags(buf, flags, madv, io_pf); +} + +static void parse_vma_vmflags(char *buf, struct vma_area *vma_area) +{ + int io_pf = 0; + + __parse_vmflags(buf, &vma_area->e->flags, &vma_area->e->madv, &io_pf); + + /* + * vmsplice doesn't work for VM_IO and VM_PFNMAP mappings, the + * only exception is VVAR area that mapped by the kernel as + * VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP + */ + if (io_pf && !vma_area_is(vma_area, VMA_AREA_VVAR)) + vma_area->e->status |= VMA_UNSUPP; + + if (vma_area->e->madv) + vma_area->e->has_madv = true; +} + +static inline int is_anon_shmem_map(dev_t dev) +{ + return kdat.shmem_dev == dev; +} + +struct vma_file_info { + int dev_maj; + int dev_min; + unsigned long ino; + struct vma_area *vma; +}; + +static inline int vfi_equal(struct vma_file_info *a, struct vma_file_info *b) +{ + return ((a->ino ^ b->ino) | + (a->dev_maj ^ b->dev_maj) | + (a->dev_min ^ b->dev_min)) == 0; +} + +static int vma_get_mapfile_flags(struct vma_area *vma, DIR *mfd, char *path) +{ + struct stat stat; + + if (fstatat(dirfd(mfd), path, &stat, AT_SYMLINK_NOFOLLOW) < 0) { + if (errno == ENOENT) { + /* Just mapping w/o map_files link */ + return 0; + } + pr_perror("Failed fstatat on map %"PRIx64"", vma->e->start); + return -1; + } + + switch(stat.st_mode & 0600) { + case 0200: + vma->e->fdflags = O_WRONLY; + break; + case 0400: + vma->e->fdflags = O_RDONLY; + break; + case 0600: + vma->e->fdflags = O_RDWR; + break; + } + vma->e->has_fdflags = true; + return 0; +} + +static int vma_stat(struct vma_area *vma, int fd) +{ + vma->vmst = xmalloc(sizeof(struct stat)); + if (!vma->vmst) + return -1; + + /* + * For AUFS support, we need to check if the symbolic link + * points to a branch. If it does, we cannot fstat() its file + * descriptor because it would return a different dev/ino than + * the real file. 
If fixup_aufs_vma_fd() returns positive, + * it means that it has stat()'ed using the full pathname. + * Zero return means that the symbolic link does not point to + * a branch and we can do fstat() below. + */ + if (opts.aufs) { + int ret; + + ret = fixup_aufs_vma_fd(vma, fd); + if (ret < 0) + return -1; + if (ret > 0) + return 0; + } + + if (fstat(fd, vma->vmst) < 0) { + pr_perror("Failed fstat on map %"PRIx64"", vma->e->start); + return -1; + } + + return 0; +} + +static int vma_get_mapfile_user(const char *fname, struct vma_area *vma, + struct vma_file_info *vfi, int *vm_file_fd, + const char *path) +{ + int fd; + dev_t vfi_dev; + + /* + * Kernel prohibits reading map_files for users. The + * best we can do here is fill stat using the information + * from smaps file and ... hope for the better :\ + * + * Here we'll miss AIO-s and sockets :( + */ + + if (fname[0] == '\0') { + /* + * Another bad thing is that kernel first checks + * for permission access to ANY map_files link, + * then checks for its existence. So we have to + * check for file path being empty to "emulate" + * the ENOENT case. + */ + + if (vfi->dev_maj != 0 || vfi->dev_min != 0 || vfi->ino != 0) { + pr_err("Strange file mapped at %lx [%s]:%d.%d.%ld\n", + (unsigned long)vma->e->start, fname, + vfi->dev_maj, vfi->dev_min, vfi->ino); + return -1; + } + + return 0; + } else if (fname[0] != '/') { + /* + * This should be some kind of + * special mapping like [heap], [vdso] + * and such, the caller should take care + * of the @fname and vma status. 
+ */ + return 0; + } + + vfi_dev = makedev(vfi->dev_maj, vfi->dev_min); + if (is_anon_shmem_map(vfi_dev)) { + if (!(vma->e->flags & MAP_SHARED)) + return -1; + + vma->e->flags |= MAP_ANONYMOUS; + vma->e->status |= VMA_ANON_SHARED; + vma->e->shmid = vfi->ino; + + if (!strncmp(fname, "/SYSV", 5)) { + vma->e->status |= VMA_AREA_SYSVIPC; + } else { + if (fault_injected(FI_HUGE_ANON_SHMEM_ID)) + vma->e->shmid += FI_HUGE_ANON_SHMEM_ID_BASE; + } + + return 0; + } + + pr_info("Failed to open map_files/%s, try to go via [%s] path\n", path, fname); + fd = open(fname, O_RDONLY); + if (fd < 0) { + pr_perror("Can't open mapped [%s]", fname); + return -1; + } + + if (vma_stat(vma, fd)) { + close(fd); + return -1; + } + + if (vma->vmst->st_dev != vfi_dev || + vma->vmst->st_ino != vfi->ino) { + pr_err("Failed to resolve mapping %lx filename\n", + (unsigned long)vma->e->start); + close(fd); + return -1; + } + + *vm_file_fd = fd; + return 0; +} + +static int vma_get_mapfile(const char *fname, struct vma_area *vma, DIR *mfd, + struct vma_file_info *vfi, + struct vma_file_info *prev_vfi, + int *vm_file_fd) +{ + char path[32]; + int flags; + + /* Figure out if it's file mapping */ + snprintf(path, sizeof(path), "%"PRIx64"-%"PRIx64, vma->e->start, vma->e->end); + + if (vma_get_mapfile_flags(vma, mfd, path)) + return -1; + + if (prev_vfi->vma && vfi_equal(vfi, prev_vfi)) { + struct vma_area *prev = prev_vfi->vma; + + /* + * If vfi is equal (!) and negative @vm_file_fd -- + * we have nothing to borrow for sure. + */ + if (*vm_file_fd < 0) + return 0; + + pr_debug("vma %"PRIx64" borrows vfi from previous %"PRIx64"\n", + vma->e->start, prev->e->start); + if (prev->e->status & VMA_AREA_SOCKET) + vma->e->status |= VMA_AREA_SOCKET | VMA_AREA_REGULAR; + + /* + * FIXME -- in theory there can be vmas that have + * dev:ino match, but live in different mount + * namespaces. However, we only borrow files for + * subsequent vmas. These are _very_ likely to + * have files from the same namespaces. 
+ */ + vma->file_borrowed = true; + + return 0; + } + close_safe(vm_file_fd); + + /* + * Note that we "open" it in dumper process space + * so later we might refer to it via /proc/self/fd/vm_file_fd + * if needed. + */ + flags = O_PATH; + if (vfi->dev_maj == 0) + /* + * Opening with O_PATH omits calling kernel ->open + * method, thus for some special files their type + * detection might be broken. Thus we open those with + * the O_RDONLY to potentially get ENXIO and check + * it below. + */ + flags = O_RDONLY; + + *vm_file_fd = openat(dirfd(mfd), path, flags); + if (*vm_file_fd < 0) { + if (errno == ENOENT) + /* Just mapping w/o map_files link */ + return 0; + + if (errno == ENXIO) { + struct stat buf; + + if (fstatat(dirfd(mfd), path, &buf, 0)) + return -1; + + if (S_ISSOCK(buf.st_mode)) { + pr_info("Found socket mapping @%"PRIx64"\n", vma->e->start); + vma->vm_socket_id = buf.st_ino; + vma->e->status |= VMA_AREA_SOCKET | VMA_AREA_REGULAR; + return 0; + } + + if ((buf.st_mode & S_IFMT) == 0 && !strncmp(fname, AIO_FNAME, sizeof(AIO_FNAME) - 1)) { + /* AIO ring, let's try */ + close_safe(vm_file_fd); + vma->e->status = VMA_AREA_AIORING; + return 0; + } + + pr_err("Unknown shit %o (%s)\n", buf.st_mode, fname); + return -1; + } + + if (errno == EPERM && !opts.aufs) + return vma_get_mapfile_user(fname, vma, vfi, vm_file_fd, path); + + pr_perror("Can't open map_files"); + return -1; + } + + return vma_stat(vma, *vm_file_fd); +} + +int parse_self_maps_lite(struct vm_area_list *vms) +{ + struct vma_area *prev = NULL; + struct bfd maps; + char *buf; + + vm_area_list_init(vms); + + maps.fd = open_proc(PROC_SELF, "maps"); + if (maps.fd < 0) + return -1; + + if (bfdopenr(&maps)) + return -1; + + while (1) { + struct vma_area *vma; + char *end; + unsigned long s, e; + + buf = breadline(&maps); + if (!buf) + break; + if (IS_ERR(buf)) + goto err; + + s = strtoul(buf, &end, 16); + e = strtoul(end + 1, NULL, 16); + + if (prev && prev->e->end == s) + /* + * This list is needed for 
one thing only -- to + * get the idea of what parts of current address + * space are busy. So merge them altogether. + */ + prev->e->end = e; + else { + vma = alloc_vma_area(); + if (!vma) + goto err; + + vma->e->start = s; + vma->e->end = e; + list_add_tail(&vma->list, &vms->h); + vms->nr++; + prev = vma; + } + + pr_debug("Parsed %"PRIx64"-%"PRIx64" vma\n", prev->e->start, prev->e->end); + } + + bclose(&maps); + return 0; + +err: + bclose(&maps); + return -1; +} + +static inline int handle_vdso_vma(struct vma_area *vma) +{ + vma->e->status |= VMA_AREA_REGULAR; + if ((vma->e->prot & VDSO_PROT) == VDSO_PROT) + vma->e->status |= VMA_AREA_VDSO; + return 0; +} + +static inline int handle_vvar_vma(struct vma_area *vma) +{ + vma->e->status |= VMA_AREA_REGULAR; + if ((vma->e->prot & VVAR_PROT) == VVAR_PROT) + vma->e->status |= VMA_AREA_VVAR; + return 0; +} + +static int handle_vma(pid_t pid, struct vma_area *vma_area, + const char *file_path, DIR *map_files_dir, + struct vma_file_info *vfi, + struct vma_file_info *prev_vfi, + int *vm_file_fd) +{ + if (vma_get_mapfile(file_path, vma_area, map_files_dir, + vfi, prev_vfi, vm_file_fd)) + goto err_bogus_mapfile; + + if (vma_area->e->status != 0) + return 0; + + if (!strcmp(file_path, "[vsyscall]") || + !strcmp(file_path, "[vectors]")) { + vma_area->e->status |= VMA_AREA_VSYSCALL; + } else if (!strcmp(file_path, "[vdso]")) { + if (handle_vdso_vma(vma_area)) + goto err; + } else if (!strcmp(file_path, "[vvar]")) { + if (handle_vvar_vma(vma_area)) + goto err; + } else if (!strcmp(file_path, "[heap]")) { + vma_area->e->status |= VMA_AREA_REGULAR | VMA_AREA_HEAP; + } else { + vma_area->e->status = VMA_AREA_REGULAR; + } + + /* + * Some mapping hints for restore, we save this on + * disk and restore might need to analyze it. + */ + if (vma_area->file_borrowed) { + struct vma_area *prev = prev_vfi->vma; + + /* + * Pick-up flags that might be set in the branch below. 
+ * Status is copied as-is as it should be zero here, + * and have full match with the previous. + */ + vma_area->e->flags |= (prev->e->flags & MAP_ANONYMOUS); + vma_area->e->status = prev->e->status; + vma_area->e->shmid = prev->e->shmid; + vma_area->vmst = prev->vmst; + vma_area->mnt_id = prev->mnt_id; + } else if (*vm_file_fd >= 0) { + struct stat *st_buf = vma_area->vmst; + + if (S_ISREG(st_buf->st_mode)) + /* regular file mapping -- supported */; + else if (S_ISCHR(st_buf->st_mode) && (st_buf->st_rdev == DEVZERO)) + /* devzero mapping -- also makes sense */; + else { + pr_err("Can't handle non-regular mapping on %d's map %"PRIx64"\n", pid, vma_area->e->start); + goto err; + } + + /* + * /dev/zero stands for anon-shared mapping + * otherwise it's some file mapping. + */ + if (is_anon_shmem_map(st_buf->st_dev)) { + if (!(vma_area->e->flags & MAP_SHARED)) + goto err_bogus_mapping; + vma_area->e->flags |= MAP_ANONYMOUS; + vma_area->e->status |= VMA_ANON_SHARED; + vma_area->e->shmid = st_buf->st_ino; + + if (!strncmp(file_path, "/SYSV", 5)) { + pr_info("path: %s\n", file_path); + vma_area->e->status |= VMA_AREA_SYSVIPC; + } else { + if (fault_injected(FI_HUGE_ANON_SHMEM_ID)) + vma_area->e->shmid += FI_HUGE_ANON_SHMEM_ID_BASE; + } + } else { + if (vma_area->e->flags & MAP_PRIVATE) + vma_area->e->status |= VMA_FILE_PRIVATE; + else + vma_area->e->status |= VMA_FILE_SHARED; + } + + /* + * We cannot use the mnt_id value provided by the kernel + * for vm_file_fd if it is an AUFS file (the value is + * wrong). In such a case, fixup_aufs_vma_fd() has set + * mnt_id to -1 to mimic pre-3.15 kernels that didn't + * have mnt_id. + */ + if (vma_area->mnt_id != -1 && + get_fd_mntid(*vm_file_fd, &vma_area->mnt_id)) + return -1; + } else { + /* + * No file but mapping -- anonymous one. 
+ */ + if (vma_area->e->flags & MAP_SHARED) { + vma_area->e->status |= VMA_ANON_SHARED; + vma_area->e->shmid = vfi->ino; + } else { + vma_area->e->status |= VMA_ANON_PRIVATE; + } + vma_area->e->flags |= MAP_ANONYMOUS; + } + + return 0; +err: + return -1; +err_bogus_mapping: + pr_err("Bogus mapping 0x%"PRIx64"-0x%"PRIx64" (flags: %#x vm_file_fd: %d)\n", + vma_area->e->start, vma_area->e->end, + vma_area->e->flags, *vm_file_fd); + goto err; + +err_bogus_mapfile: + pr_perror("Can't open %d's mapfile link %"PRIx64, pid, vma_area->e->start); + goto err; +} + +static int vma_list_add(struct vma_area *vma_area, + struct vm_area_list *vma_area_list, + unsigned long *prev_end, + struct vma_file_info *vfi, struct vma_file_info *prev_vfi) +{ + if (vma_area->e->status & VMA_UNSUPP) { + pr_err("Unsupported mapping found %016"PRIx64"-%016"PRIx64"\n", + vma_area->e->start, vma_area->e->end); + return -1; + } + + /* Add a guard page only if here is enough space for it */ + if (vma_has_guard_gap_hidden(vma_area) && + *prev_end < vma_area->e->start) + vma_area->e->start -= PAGE_SIZE; /* Guard page */ + *prev_end = vma_area->e->end; + + list_add_tail(&vma_area->list, &vma_area_list->h); + vma_area_list->nr++; + if (vma_area_is_private(vma_area, kdat.task_size)) { + unsigned long pages; + + pages = vma_area_len(vma_area) / PAGE_SIZE; + vma_area_list->priv_size += pages; + vma_area_list->priv_longest = max(vma_area_list->priv_longest, pages); + } else if (vma_area_is(vma_area, VMA_ANON_SHARED)) { + unsigned long pages; + + pages = vma_area_len(vma_area) / PAGE_SIZE; + vma_area_list->shared_longest = + max(vma_area_list->shared_longest, pages); + } + + *prev_vfi = *vfi; + prev_vfi->vma = vma_area; + + return 0; +} + +/* + * On s390 we have old kernels where the global task size assumption of + * criu does not work. See also compel_task_size() for s390. 
+ */ +static int task_size_check(pid_t pid, VmaEntry *entry) +{ +#ifdef __s390x__ + if (entry->end <= kdat.task_size) + return 0; + pr_err("Can't dump high memory region %lx-%lx of task %d because kernel commit ee71d16d22bb is missing\n", entry->start, entry->end, pid); + return -1; +#else + return 0; +#endif +} + +int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, + dump_filemap_t dump_filemap) +{ + struct vma_area *vma_area = NULL; + unsigned long start, end, pgoff, prev_end = 0; + char r, w, x, s; + int ret = -1, vm_file_fd = -1; + struct vma_file_info vfi; + struct vma_file_info prev_vfi = {}; + + DIR *map_files_dir = NULL; + struct bfd f; + + vma_area_list->nr = 0; + vma_area_list->nr_aios = 0; + vma_area_list->priv_longest = 0; + vma_area_list->priv_size = 0; + vma_area_list->shared_longest = 0; + INIT_LIST_HEAD(&vma_area_list->h); + + f.fd = open_proc(pid, "smaps"); + if (f.fd < 0) + goto err_n; + + if (bfdopenr(&f)) + goto err_n; + + map_files_dir = opendir_proc(pid, "map_files"); + if (!map_files_dir) /* old kernel? */ + goto err; + + while (1) { + int num, path_off; + bool eof; + char *str; + + str = breadline(&f); + if (IS_ERR(str)) + goto err; + eof = (str == NULL); + + if (!eof && !__is_vma_range_fmt(str)) { + if (!strncmp(str, "Nonlinear", 9)) { + BUG_ON(!vma_area); + pr_err("Nonlinear mapping found %016"PRIx64"-%016"PRIx64"\n", + vma_area->e->start, vma_area->e->end); + /* + * VMA is already on list and will be + * freed later as list get destroyed. 
+ */ + vma_area = NULL; + goto err; + } else if (!strncmp(str, "VmFlags: ", 9)) { + BUG_ON(!vma_area); + parse_vma_vmflags(&str[9], vma_area); + continue; + } else + continue; + } + + if (vma_area && vma_list_add(vma_area, vma_area_list, + &prev_end, &vfi, &prev_vfi)) + goto err; + + if (eof) + break; + + vma_area = alloc_vma_area(); + if (!vma_area) + goto err; + + num = sscanf(str, "%lx-%lx %c%c%c%c %lx %x:%x %lu %n", + &start, &end, &r, &w, &x, &s, &pgoff, + &vfi.dev_maj, &vfi.dev_min, &vfi.ino, &path_off); + if (num < 10) { + pr_err("Can't parse: %s\n", str); + goto err; + } + + vma_area->e->start = start; + vma_area->e->end = end; + vma_area->e->pgoff = pgoff; + vma_area->e->prot = PROT_NONE; + + if (task_size_check(pid, vma_area->e)) + goto err; + + if (r == 'r') + vma_area->e->prot |= PROT_READ; + if (w == 'w') + vma_area->e->prot |= PROT_WRITE; + if (x == 'x') + vma_area->e->prot |= PROT_EXEC; + + if (s == 's') + vma_area->e->flags = MAP_SHARED; + else if (s == 'p') + vma_area->e->flags = MAP_PRIVATE; + else { + pr_err("Unexpected VMA met (%c)\n", s); + goto err; + } + + if (handle_vma(pid, vma_area, str + path_off, map_files_dir, + &vfi, &prev_vfi, &vm_file_fd)) + goto err; + + if (vma_entry_is(vma_area->e, VMA_FILE_PRIVATE) || + vma_entry_is(vma_area->e, VMA_FILE_SHARED)) { + if (dump_filemap && dump_filemap(vma_area, vm_file_fd)) + goto err; + } else if (vma_entry_is(vma_area->e, VMA_AREA_AIORING)) + vma_area_list->nr_aios++; + } + + vma_area = NULL; + ret = 0; + +err: + bclose(&f); +err_n: + close_safe(&vm_file_fd); + if (map_files_dir) + closedir(map_files_dir); + + xfree(vma_area); + return ret; + +} + +int parse_pid_stat(pid_t pid, struct proc_pid_stat *s) +{ + char *tok, *p; + int fd; + int n; + + fd = open_proc(pid, "stat"); + if (fd < 0) + return -1; + + n = read(fd, buf, BUF_SIZE); + close(fd); + if (n < 1) { + pr_err("stat for %d is corrupted\n", pid); + return -1; + } + + memset(s, 0, sizeof(*s)); + + tok = strchr(buf, ' '); + if (!tok) + goto 
err; + *tok++ = '\0'; + if (*tok != '(') + goto err; + + s->pid = atoi(buf); + + p = strrchr(tok + 1, ')'); + if (!p) + goto err; + *tok = '\0'; + *p = '\0'; + + strlcpy(s->comm, tok + 1, sizeof(s->comm)); + + n = sscanf(p + 1, + " %c %d %d %d %d %d %u %lu %lu %lu %lu " + "%lu %lu %ld %ld %ld %ld %d %d %llu %lu %ld %lu %lu %lu %lu " + "%lu %lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld " + "%lu %lu %lu %lu %lu %lu %lu %d", + &s->state, + &s->ppid, + &s->pgid, + &s->sid, + &s->tty_nr, + &s->tty_pgrp, + &s->flags, + &s->min_flt, + &s->cmin_flt, + &s->maj_flt, + &s->cmaj_flt, + &s->utime, + &s->stime, + &s->cutime, + &s->cstime, + &s->priority, + &s->nice, + &s->num_threads, + &s->zero0, + &s->start_time, + &s->vsize, + &s->mm_rss, + &s->rsslim, + &s->start_code, + &s->end_code, + &s->start_stack, + &s->esp, + &s->eip, + &s->sig_pending, + &s->sig_blocked, + &s->sig_ignored, + &s->sig_handled, + &s->wchan, + &s->zero1, + &s->zero2, + &s->exit_signal, + &s->task_cpu, + &s->rt_priority, + &s->policy, + &s->delayacct_blkio_ticks, + &s->gtime, + &s->cgtime, + &s->start_data, + &s->end_data, + &s->start_brk, + &s->arg_start, + &s->arg_end, + &s->env_start, + &s->env_end, + &s->exit_code); + if (n < 50) + goto err; + + return 0; + +err: + pr_err("Parsing %d's stat failed (#fields do not match)\n", pid); + return -1; +} + +int prepare_loginuid(unsigned int value, unsigned int loglevel) +{ + int fd, ret = 0; + char buf[11]; /* 4294967295 is maximum for u32 */ + + fd = open_proc_rw(PROC_SELF, "loginuid"); + if (fd < 0) + return -1; + + snprintf(buf, 11, "%u", value); + + if (write(fd, buf, 11) < 0) { + print_on_level(loglevel, + "Write %s to /proc/self/loginuid failed: %s", + buf, strerror(errno)); + ret = -1; + } + close(fd); + return ret; +} + +unsigned int parse_pid_loginuid(pid_t pid, int *err, bool ignore_noent) +{ + int fd; + ssize_t num; + + *err = 0; + fd = __open_proc(pid, (ignore_noent) ? 
ENOENT : 0, + O_RDONLY, "loginuid"); + if (fd < 0) + goto out; + + num = read(fd, buf, 10); + close(fd); + if (num < 0) { + pr_perror("Unable to read /proc/%d/loginuid", pid); + goto out; + } + buf[num] = '\0'; + + return strtol(buf, NULL, 10); + +out: + *err = -1; + return INVALID_UID; /* unset value */ +} + +int parse_pid_oom_score_adj(pid_t pid, int *err) +{ + int fd; + ssize_t num; + + *err = 0; + fd = open_proc(pid, "oom_score_adj"); + if (fd < 0) + goto out; + + num = read(fd, buf, 10); + close(fd); + if (num < 0) { + pr_perror("Unable to read /proc/%d/oom_score_adj", pid); + goto out; + } + buf[num] = '\0'; + + return strtol(buf, NULL, 10); + +out: + *err = -1; + return 0; +} + +static int ids_parse(char *str, unsigned int *arr) +{ + char *end; + + arr[0] = strtol(str, &end, 10); + arr[1] = strtol(end + 1, &end, 10); + arr[2] = strtol(end + 1, &end, 10); + arr[3] = strtol(end + 1, &end, 10); + if (*end) + return -1; + else + return 0; +} + +static int cap_parse(char *str, unsigned int *res) +{ + int i, ret; + + for (i = 0; i < PROC_CAP_SIZE; i++) { + ret = sscanf(str, "%08x", &res[PROC_CAP_SIZE - 1 - i]); + if (ret != 1) + return -1; + str += 8; + } + + return 0; +} + +int parse_pid_status(pid_t pid, struct seize_task_status *ss, void *data) +{ + struct proc_status_creds *cr = container_of(ss, struct proc_status_creds, s); + struct bfd f; + int done = 0; + int ret = -1; + char *str; + bool parsed_seccomp = false; + + f.fd = open_proc(pid, "status"); + if (f.fd < 0) + return -1; + + cr->s.sigpnd = 0; + cr->s.shdpnd = 0; + + if (bfdopenr(&f)) + return -1; + + while (done < 12) { + str = breadline(&f); + if (str == NULL) + break; + if (IS_ERR(str)) + goto err_parse; + + if (!strncmp(str, "State:", 6)) { + cr->s.state = str[7]; + done++; + continue; + } + + if (!strncmp(str, "PPid:", 5)) { + if (sscanf(str, "PPid:\t%d", &cr->s.ppid) != 1) { + pr_err("Unable to parse: %s\n", str); + goto err_parse; + } + done++; + continue; + } + + if (!strncmp(str, "Uid:", 4)) { 
+ if (ids_parse(str + 5, cr->uids)) + goto err_parse; + + done++; + continue; + } + + if (!strncmp(str, "Gid:", 4)) { + if (ids_parse(str + 5, cr->gids)) + goto err_parse; + + done++; + continue; + } + + if (!strncmp(str, "CapInh:", 7)) { + if (cap_parse(str + 8, cr->cap_inh)) + goto err_parse; + + done++; + continue; + } + + if (!strncmp(str, "CapEff:", 7)) { + if (cap_parse(str + 8, cr->cap_eff)) + goto err_parse; + + done++; + continue; + } + + if (!strncmp(str, "CapPrm:", 7)) { + if (cap_parse(str + 8, cr->cap_prm)) + goto err_parse; + + done++; + continue; + } + + if (!strncmp(str, "CapBnd:", 7)) { + if (cap_parse(str + 8, cr->cap_bnd)) + goto err_parse; + + done++; + continue; + } + + if (!strncmp(str, "Seccomp:", 8)) { + if (sscanf(str + 9, "%d", &cr->s.seccomp_mode) != 1) { + goto err_parse; + } + + parsed_seccomp = true; + done++; + continue; + } + + if (!strncmp(str, "ShdPnd:", 7)) { + unsigned long long sigpnd; + + if (sscanf(str + 7, "%llx", &sigpnd) != 1) + goto err_parse; + cr->s.shdpnd |= sigpnd; + + done++; + continue; + } + if (!strncmp(str, "SigPnd:", 7)) { + unsigned long long sigpnd; + + if (sscanf(str + 7, "%llx", &sigpnd) != 1) + goto err_parse; + cr->s.sigpnd |= sigpnd; + + done++; + continue; + } + } + + /* seccomp is optional */ + if (done >= 11 || (done == 10 && !parsed_seccomp)) + ret = 0; + +err_parse: + if (ret) + pr_err("Error parsing proc status file\n"); + bclose(&f); + return ret; +} + +struct opt2flag { + char *opt; + unsigned flag; +}; + +static bool sb_opt_cb(char *opt, char *unknown, size_t *uoff) +{ + unsigned int id; + + if (sscanf(opt, "gid=%d", &id) == 1) { + *uoff += sprintf(unknown + *uoff, "gid=%d", userns_gid(id)); + unknown[*uoff] = ','; + (*uoff)++; + return true; + } else if (sscanf(opt, "uid=%d", &id) == 1) { + *uoff += sprintf(unknown + *uoff, "uid=%d", userns_uid(id)); + unknown[*uoff] = ','; + (*uoff)++; + return true; + } + return false; +} + +static int do_opt2flag(char *opt, unsigned *flags, + const struct 
opt2flag *opts, char *unknown, + bool (*cb)(char *opt, char *unknown, size_t *uoff)) +{ + int i; + char *end; + size_t uoff = 0; + + while (1) { + end = strchr(opt, ','); + if (end) + *end = '\0'; + + for (i = 0; opts[i].opt != NULL; i++) + if (!strcmp(opts[i].opt, opt)) { + (*flags) |= opts[i].flag; + break; + } + + if (opts[i].opt == NULL && cb && !cb(opt, unknown, &uoff)) { + if (!unknown) { + pr_err("Unknown option [%s]\n", opt); + return -1; + } + + strcpy(unknown + uoff, opt); + uoff += strlen(opt); + unknown[uoff] = ','; + uoff++; + } + + if (!end) { + if (uoff) + uoff--; + if (unknown) + unknown[uoff] = '\0'; + break; + } else + opt = end + 1; + } + + return 0; +} + +static int parse_mnt_flags(char *opt, unsigned *flags) +{ + static const struct opt2flag mnt_opt2flag[] = { + { "rw", 0, }, + { "ro", MS_RDONLY, }, + { "nosuid", MS_NOSUID, }, + { "nodev", MS_NODEV, }, + { "noexec", MS_NOEXEC, }, + { "noatime", MS_NOATIME, }, + { "nodiratime", MS_NODIRATIME, }, + { "relatime", MS_RELATIME, }, + { }, + }; + + if (do_opt2flag(opt, flags, mnt_opt2flag, NULL, NULL)) + return -1; + + /* Otherwise the kernel assumes RELATIME by default */ + if ((*flags & (MS_RELATIME | MS_NOATIME)) == 0) + *flags |= MS_STRICTATIME; + + return 0; +} + +static int parse_sb_opt(char *opt, unsigned *flags, char *uopt) +{ + static const struct opt2flag sb_opt2flag[] = { + { "rw", 0, }, + { "ro", MS_RDONLY, }, + { "sync", MS_SYNC, }, + { "dirsync", MS_DIRSYNC, }, + { "mad", MS_MANDLOCK, }, + { }, + }; + + return do_opt2flag(opt, flags, sb_opt2flag, uopt, sb_opt_cb); +} + +static int parse_mnt_opt(char *str, struct mount_info *mi, int *off) +{ + char *istr = str, *end; + + while (1) { + end = strchr(str, ' '); + if (!end) { + pr_err("Error parsing mount options\n"); + return -1; + } + + *end = '\0'; + if (!strncmp(str, "-", 1)) + break; + else if (!strncmp(str, "shared:", 7)) { + mi->flags |= MS_SHARED; + mi->shared_id = atoi(str + 7); + } else if (!strncmp(str, "master:", 7)) { + mi->flags 
|= MS_SLAVE; + mi->master_id = atoi(str + 7); + } else if (!strncmp(str, "propagate_from:", 15)) { + /* skip */; + } else if (!strncmp(str, "unbindable", 11)) + mi->flags |= MS_UNBINDABLE; + else { + pr_err("Unknown option [%s]\n", str); + return -1; + } + + str = end + 1; + } + + *off = end - istr + 1; + return 0; +} + +/* + * mountinfo contains mangled paths. space, tab and back slash were replaced + * with usual octal escape. This function replaces these symbols back. + */ +static void cure_path(char *path) +{ + int i, len, off = 0; + + if (strchr(path, '\\') == NULL) /* fast path */ + return; + + len = strlen(path); + for (i = 0; i < len; i++) { + if (!strncmp(path + i, "\\040", 4)) { + path[i - off] = ' '; + goto replace; + } else if (!strncmp(path + i, "\\011", 4)) { + path[i - off] = '\t'; + goto replace; + } else if (!strncmp(path + i, "\\134", 4)) { + path[i - off] = '\\'; + goto replace; + } + if (off) + path[i - off] = path[i]; + continue; +replace: + off += 3; + i += 3; + } + path[len - off] = 0; +} + +static int parse_mountinfo_ent(char *str, struct mount_info *new, char **fsname) +{ + struct fd_link root_link; + unsigned int kmaj, kmin; + int ret, n; + char *sub, *opt = NULL; + + new->mountpoint = xmalloc(PATH_MAX); + if (new->mountpoint == NULL) + goto err; + + new->mountpoint[0] = '.'; + ret = sscanf(str, "%i %i %u:%u %ms %s %ms %n", + &new->mnt_id, &new->parent_mnt_id, + &kmaj, &kmin, &new->root, new->mountpoint + 1, + &opt, &n); + if (ret != 7) + goto err; + + cure_path(new->mountpoint); + cure_path(new->root); + + root_link.len = strlen(new->root); + strcpy(root_link.name, new->root); + if (strip_deleted(&root_link)) { + strcpy(new->root, root_link.name); + new->deleted = true; + } + + new->mountpoint = xrealloc(new->mountpoint, strlen(new->mountpoint) + 1); + if (!new->mountpoint) + goto err; + new->ns_mountpoint = new->mountpoint; + new->is_ns_root = is_root(new->ns_mountpoint + 1); + + new->s_dev = new->s_dev_rt = MKKDEV(kmaj, kmin); + 
new->flags = 0; + if (parse_mnt_flags(opt, &new->flags)) + goto err; + + free(opt); /* we are going to reallocate/reuse this buffer */ + opt = NULL; + + str += n; + if (parse_mnt_opt(str, new, &n)) + goto err; + + str += n; + ret = sscanf(str, "%ms %ms %ms", fsname, &new->source, &opt); + if (ret == 2) { + /* src may be empty */ + opt = new->source; + new->source = xstrdup(""); + if (new->source == NULL) + goto err; + } else if (ret != 3) + goto err; + + cure_path(new->source); + + new->fsname = xstrdup(*fsname); + if (!new->fsname) + goto err; + + /* + * The kernel reports "subtypes" sometimes and the valid + * type-vs-subtype delimiter is the dot symbol. We disregard + * any subtypes for the purpose of finding the fstype. + */ + sub = strchr(*fsname, '.'); + if (sub) + *sub = 0; + + new->fstype = find_fstype_by_name(*fsname); + + new->options = xmalloc(strlen(opt) + 1); + if (!new->options) + goto err; + + if (parse_sb_opt(opt, &new->sb_flags, new->options)) + goto err; + + ret = 0; +ret: + xfree(opt); + return ret; +err: + ret = -1; + goto ret; +} + +static LIST_HEAD(skip_mount_list); + +struct str_node { + struct list_head node; + char string[]; +}; + +bool add_skip_mount(const char *mountpoint) +{ + struct str_node *skip = xmalloc(sizeof(struct str_node) + + strlen(mountpoint) + 1); + if (!skip) + return false; + + strcpy(skip->string, mountpoint); + list_add(&skip->node, &skip_mount_list); + return true; +} + +static bool should_skip_mount(const char *mountpoint) +{ + struct str_node *pos; + + list_for_each_entry(pos, &skip_mount_list, node) { + if (strcmp(mountpoint, pos->string) == 0) + return true; + } + + return false; +} + +struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool for_dump) +{ + struct mount_info *list = NULL; + FILE *f; + + f = fopen_proc(pid, "mountinfo"); + if (!f) + return NULL; + + while (fgets(buf, BUF_SIZE, f)) { + struct mount_info *new; + int ret = -1; + char *fsname = NULL; + + new = mnt_entry_alloc(); + if (!new) 
+ goto end; + + new->nsid = nsid; + + ret = parse_mountinfo_ent(buf, new, &fsname); + if (ret < 0) { + pr_err("Bad format in %d mountinfo: '%s'\n", pid, buf); + goto end; + } + + /* + * Drop this mountpoint early, so that lookup_mnt_id/etc will + * fail loudly at "dump" stage if an opened file or another mnt + * depends on this one. + */ + if (for_dump && should_skip_mount(new->mountpoint + 1)) { + pr_info("\tskip %s @ %s\n", fsname, new->mountpoint); + mnt_entry_free(new); + new = NULL; + goto end; + } + + pr_info("\ttype %s source %s mnt_id %d s_dev %#x %s @ %s flags %#x options %s\n", + fsname, new->source, + new->mnt_id, new->s_dev, new->root, new->mountpoint, + new->flags, new->options); + + if (new->fstype->parse) { + ret = new->fstype->parse(new); + if (ret < 0) { + pr_err("Failed to parse FS specific data on %s\n", + new->mountpoint); + mnt_entry_free(new); + new = NULL; + goto end; + } + + if (ret > 0) { + pr_info("\tskipping fs mounted at %s\n", new->mountpoint + 1); + mnt_entry_free(new); + new = NULL; + ret = 0; + goto end; + + } + } +end: + if (fsname) + free(fsname); + + if (new) { + new->next = list; + list = new; + } + + if (ret) + goto err; + } +out: + fclose(f); + return list; + +err: + while (list) { + struct mount_info *next = list->next; + mnt_entry_free(list); + list = next; + } + goto out; +} + +static char nybble(const char n) +{ + if (n >= '0' && n <= '9') + return n - '0'; + else if (n >= 'A' && n <= 'F') + return n - ('A' - 10); + else if (n >= 'a' && n <= 'f') + return n - ('a' - 10); + return 0; +} + +static void parse_fhandle_encoded(char *tok, FhEntry *fh) +{ + char *d = (char *)fh->handle; + int i = 0; + + memzero(d, pb_repeated_size(fh, handle)); + + while (*tok == ' ') + tok++; + + while (*tok) { + if (i >= pb_repeated_size(fh, handle)) + break; + d[i++] = (nybble(tok[0]) << 4) | nybble(tok[1]); + if (tok[1]) + tok += 2; + else + break; + } +} + +static int parse_timerfd(struct bfd *f, char *str, TimerfdEntry *tfy) +{ + /* + * 
Format is + * clockid: 0 + * ticks: 0 + * settime flags: 01 + * it_value: (0, 49406829) + * it_interval: (1, 0) + */ + if (sscanf(str, "clockid: %d", &tfy->clockid) != 1) + goto parse_err; + + if (verify_timerfd(tfy) < 0) + goto parse_err; + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto nodata; + if (sscanf(str, "ticks: %llu", (unsigned long long *)&tfy->ticks) != 1) + goto parse_err; + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto nodata; + if (sscanf(str, "settime flags: 0%o", &tfy->settime_flags) != 1) + goto parse_err; + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto nodata; + if (sscanf(str, "it_value: (%llu, %llu)", + (unsigned long long *)&tfy->vsec, + (unsigned long long *)&tfy->vnsec) != 2) + goto parse_err; + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto nodata; + if (sscanf(str, "it_interval: (%llu, %llu)", + (unsigned long long *)&tfy->isec, + (unsigned long long *)&tfy->insec) != 2) + goto parse_err; + return 0; + +parse_err: + return -1; +nodata: + pr_err("No data left in proc file while parsing timerfd\n"); + goto parse_err; +} + +#define fdinfo_field(str, field) !strncmp(str, field":", sizeof(field)) + +static int parse_file_lock_buf(char *buf, struct file_lock *fl, + bool is_blocked); +static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg) +{ + struct bfd f; + char *str; + bool entry_met = false; + int ret, exit_code = -1; + + f.fd = open_proc(pid, "fdinfo/%d", fd); + if (f.fd < 0) + return -1; + + if (bfdopenr(&f)) + return -1; + + while (1) { + str = breadline(&f); + if (!str) + break; + if (IS_ERR(str)) + goto out; + + if (fdinfo_field(str, "pos") || + fdinfo_field(str, "flags") || + fdinfo_field(str, "mnt_id")) { + unsigned long long val; + struct fdinfo_common *fdinfo = arg; + + if (type != FD_TYPES__UND) + continue; + ret = sscanf(str, "%*s %lli", &val); + if (ret != 1) + goto parse_err; + + if (fdinfo_field(str, "pos")) + fdinfo->pos = val; + else if (fdinfo_field(str, "flags")) + 
fdinfo->flags = val; + else if (fdinfo_field(str, "mnt_id")) + fdinfo->mnt_id = val; + + entry_met = true; + continue; + } + + if (fdinfo_field(str, "lock")) { + struct file_lock *fl; + struct fdinfo_common *fdinfo = arg; + + if (type != FD_TYPES__UND) + continue; + + fl = alloc_file_lock(); + if (!fl) { + pr_perror("Alloc file lock failed!"); + goto out; + } + + if (parse_file_lock_buf(str + 6, fl, 0)) { + xfree(fl); + goto parse_err; + } + + pr_info("lockinfo: %lld:%d %x %d %02x:%02x:%ld %lld %s\n", + fl->fl_id, fl->fl_kind, fl->fl_ltype, + fl->fl_owner, fl->maj, fl->min, fl->i_no, + fl->start, fl->end); + + + if (fl->fl_kind == FL_UNKNOWN) { + pr_err("Unknown file lock!\n"); + xfree(fl); + goto out; + } + + fl->real_owner = fdinfo->owner; + fl->fl_holder = pid; + fl->owners_fd = fd; + list_add_tail(&fl->list, &file_lock_list); + } + + if (type == FD_TYPES__UND) + continue; + + if (fdinfo_field(str, "eventfd-count")) { + EventfdFileEntry *efd = arg; + + if (type != FD_TYPES__EVENTFD) + goto parse_err; + ret = sscanf(str, "eventfd-count: %"PRIx64, + &efd->counter); + if (ret != 1) + goto parse_err; + + entry_met = true; + continue; + } + if (fdinfo_field(str, "clockid")) { + TimerfdEntry *tfe = arg; + + if (type != FD_TYPES__TIMERFD) + goto parse_err; + ret = parse_timerfd(&f, str, tfe); + if (ret) + goto parse_err; + + entry_met = true; + continue; + } + if (fdinfo_field(str, "tfd")) { + EventpollFileEntry *epfe = arg; + EventpollTfdEntry *e; + int i; + + if (type != FD_TYPES__EVENTPOLL) + goto parse_err; + + e = xmalloc(sizeof(EventpollTfdEntry)); + if (!e) + goto out; + + eventpoll_tfd_entry__init(e); + + ret = sscanf(str, "tfd: %d events: %x data: %llx" + " pos:%lli ino:%lx sdev:%x", + &e->tfd, &e->events, (long long *)&e->data, + (long long *)&e->pos, (long *)&e->inode, + &e->dev); + if (ret < 3 || ret > 6) { + eventpoll_tfd_entry__free_unpacked(e, NULL); + goto parse_err; + } else if (ret == 3) { + e->has_dev = false; + e->has_inode = false; + e->has_pos = 
false; + } else if (ret == 6) { + e->has_dev = true; + e->has_inode = true; + e->has_pos = true; + } else if (ret < 6) { + eventpoll_tfd_entry__free_unpacked(e, NULL); + goto parse_err; + } + + i = epfe->n_tfd++; + if (xrealloc_safe(&epfe->tfd, epfe->n_tfd * sizeof(EventpollTfdEntry *))) + goto out; + + epfe->tfd[i] = e; + entry_met = true; + continue; + } + if (fdinfo_field(str, "sigmask")) { + SignalfdEntry *sfd = arg; + + if (type != FD_TYPES__SIGNALFD) + goto parse_err; + ret = sscanf(str, "sigmask: %llx", + (unsigned long long *)&sfd->sigmask); + if (ret != 1) + goto parse_err; + + entry_met = true; + continue; + } + if (fdinfo_field(str, "fanotify flags")) { + FanotifyFileEntry *fe = arg; + + if (type != FD_TYPES__FANOTIFY) + goto parse_err; + + ret = sscanf(str, "fanotify flags:%x event-flags:%x", + &fe->faflags, &fe->evflags); + if (ret != 2) + goto parse_err; + entry_met = true; + continue; + } + if (fdinfo_field(str, "fanotify ino")) { + void *buf, *ob; + FanotifyFileEntry *fe = arg; + FanotifyMarkEntry *me; + int hoff = 0, i; + + if (type != FD_TYPES__FANOTIFY) + goto parse_err; + + ob = buf = xmalloc(sizeof(FanotifyMarkEntry) + + sizeof(FanotifyInodeMarkEntry) + + sizeof(FhEntry) + + FH_ENTRY_SIZES__min_entries * sizeof(uint64_t)); + if (!buf) + goto out; + + me = xptr_pull(&buf, FanotifyMarkEntry); + fanotify_mark_entry__init(me); + me->ie = xptr_pull(&buf, FanotifyInodeMarkEntry); + fanotify_inode_mark_entry__init(me->ie); + me->ie->f_handle = xptr_pull(&buf, FhEntry); + fh_entry__init(me->ie->f_handle); + me->ie->f_handle->n_handle = FH_ENTRY_SIZES__min_entries; + me->ie->f_handle->handle = xptr_pull_s(&buf, + FH_ENTRY_SIZES__min_entries * sizeof(uint64_t)); + + ret = sscanf(str, + "fanotify ino:%"PRIx64" sdev:%x mflags:%x mask:%x ignored_mask:%x " + "fhandle-bytes:%x fhandle-type:%x f_handle: %n", + &me->ie->i_ino, &me->s_dev, + &me->mflags, &me->mask, &me->ignored_mask, + &me->ie->f_handle->bytes, &me->ie->f_handle->type, + &hoff); + if (ret != 7 
|| hoff == 0) { + xfree(ob); + goto parse_err; + } + + parse_fhandle_encoded(str + hoff, me->ie->f_handle); + me->type = MARK_TYPE__INODE; + + i = fe->n_mark++; + if (xrealloc_safe(&fe->mark, fe->n_mark * sizeof(FanotifyMarkEntry *))) { + xfree(ob); + goto out; + } + + fe->mark[i] = me; + entry_met = true; + continue; + } + if (fdinfo_field(str, "fanotify mnt_id")) { + void *buf, *ob; + FanotifyFileEntry *fe = arg; + FanotifyMarkEntry *me; + int i; + + if (type != FD_TYPES__FANOTIFY) + goto parse_err; + + + ob = buf = xmalloc(sizeof(FanotifyMarkEntry) + + sizeof(FanotifyMountMarkEntry)); + if (!buf) + goto out; + + me = xptr_pull(&buf, FanotifyMarkEntry); + fanotify_mark_entry__init(me); + me->me = xptr_pull(&buf, FanotifyMountMarkEntry); + fanotify_mount_mark_entry__init(me->me); + + ret = sscanf(str, + "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x", + &me->me->mnt_id, &me->mflags, + &me->mask, &me->ignored_mask); + if (ret != 4) { + xfree(ob); + goto parse_err; + } + + me->type = MARK_TYPE__MOUNT; + + i = fe->n_mark++; + if (xrealloc_safe(&fe->mark, fe->n_mark * sizeof(FanotifyMarkEntry *))) { + xfree(ob); + goto out; + } + + fe->mark[i] = me; + entry_met = true; + continue; + } + if (fdinfo_field(str, "inotify wd")) { + void *buf, *ob; + InotifyFileEntry *ie = arg; + InotifyWdEntry *ify; + int hoff, i; + + if (type != FD_TYPES__INOTIFY) + goto parse_err; + + ob = buf = xmalloc(sizeof(InotifyWdEntry) + + sizeof(FhEntry) + + FH_ENTRY_SIZES__min_entries * sizeof(uint64_t)); + if (!buf) + goto out; + + ify = xptr_pull(&buf, InotifyWdEntry); + inotify_wd_entry__init(ify); + ify->f_handle = xptr_pull(&buf, FhEntry); + fh_entry__init(ify->f_handle); + ify->f_handle->n_handle = FH_ENTRY_SIZES__min_entries; + ify->f_handle->handle = xptr_pull_s(&buf, + FH_ENTRY_SIZES__min_entries * sizeof(uint64_t)); + + ret = sscanf(str, + "inotify wd:%x ino:%"PRIx64" sdev:%x " + "mask:%x ignored_mask:%x " + "fhandle-bytes:%x fhandle-type:%x " + "f_handle: %n", + &ify->wd, 
&ify->i_ino, &ify->s_dev, + &ify->mask, &ify->ignored_mask, + &ify->f_handle->bytes, &ify->f_handle->type, + &hoff); + if (ret != 7) { + xfree(ob); + goto parse_err; + } + + parse_fhandle_encoded(str + hoff, ify->f_handle); + + i = ie->n_wd++; + if (xrealloc_safe(&ie->wd, ie->n_wd * sizeof(InotifyWdEntry *))) { + xfree(ob); + goto out; + } + + ie->wd[i] = ify; + entry_met = true; + continue; + } + } + + exit_code = 0; + if (entry_met) + goto out; + /* + * An eventpoll/inotify file may have no target fds set thus + * resulting in no tfd: lines in proc. This is normal. + */ + if (type == FD_TYPES__EVENTPOLL || type == FD_TYPES__INOTIFY) + goto out; + + pr_err("No records of type %d found in fdinfo file\n", type); +parse_err: + exit_code = -1; + pr_perror("%s: error parsing [%s] for %d", __func__, str, type); +out: + bclose(&f); + return exit_code; +} + +int parse_fdinfo_pid(int pid, int fd, int type, void *arg) +{ + return parse_fdinfo_pid_s(pid, fd, type, arg); +} + +int parse_fdinfo(int fd, int type, void *arg) +{ + return parse_fdinfo_pid_s(PROC_SELF, fd, type, arg); +} + +int get_fd_mntid(int fd, int *mnt_id) +{ + struct fdinfo_common fdinfo = { .mnt_id = -1}; + + if (parse_fdinfo(fd, FD_TYPES__UND, &fdinfo)) + return -1; + + *mnt_id = fdinfo.mnt_id; + return 0; +} + +static int parse_file_lock_buf(char *buf, struct file_lock *fl, + bool is_blocked) +{ + int num; + char fl_flag[10], fl_type[15], fl_option[10]; + + if (is_blocked) { + num = sscanf(buf, "%lld: -> %s %s %s %d %x:%x:%ld %lld %s", + &fl->fl_id, fl_flag, fl_type, fl_option, + &fl->fl_owner, &fl->maj, &fl->min, &fl->i_no, + &fl->start, fl->end); + } else { + num = sscanf(buf, "%lld:%s %s %s %d %x:%x:%ld %lld %s", + &fl->fl_id, fl_flag, fl_type, fl_option, + &fl->fl_owner, &fl->maj, &fl->min, &fl->i_no, + &fl->start, fl->end); + } + + if (num < 10) { + pr_err("Invalid file lock info (%d): %s\n", num, buf); + return -1; + } + + if (!strcmp(fl_flag, "POSIX")) + fl->fl_kind = FL_POSIX; + else if 
(!strcmp(fl_flag, "FLOCK")) + fl->fl_kind = FL_FLOCK; + else if (!strcmp(fl_flag, "OFDLCK")) + fl->fl_kind = FL_OFD; + else if (!strcmp(fl_flag, "LEASE")) + fl->fl_kind = FL_LEASE; + else + fl->fl_kind = FL_UNKNOWN; + + if (fl->fl_kind == FL_LEASE && !strcmp(fl_type, "BREAKING")) { + fl->fl_ltype |= LEASE_BREAKING; + } + + if (!strcmp(fl_type, "MSNFS")) { + fl->fl_ltype |= LOCK_MAND; + + if (!strcmp(fl_option, "READ")) { + fl->fl_ltype |= LOCK_READ; + } else if (!strcmp(fl_option, "RW")) { + fl->fl_ltype |= LOCK_RW; + } else if (!strcmp(fl_option, "WRITE")) { + fl->fl_ltype |= LOCK_WRITE; + } else { + pr_err("Unknown lock option!\n"); + return -1; + } + } else { + if (!strcmp(fl_option, "UNLCK")) { + fl->fl_ltype |= F_UNLCK; + } else if (!strcmp(fl_option, "WRITE")) { + fl->fl_ltype |= F_WRLCK; + } else if (!strcmp(fl_option, "READ")) { + fl->fl_ltype |= F_RDLCK; + } else { + pr_err("Unknown lock option!\n"); + return -1; + } + } + + return 0; +} + +static bool pid_in_pstree(pid_t pid) +{ + return pstree_item_by_real(pid) != NULL; +} + +int parse_file_locks(void) +{ + struct file_lock *fl; + + FILE *fl_locks; + int exit_code = -1; + bool is_blocked; + + if (kdat.has_fdinfo_lock) + return 0; + + fl_locks = fopen_proc(PROC_GEN, "locks"); + if (!fl_locks) + return -1; + + while (fgets(buf, BUF_SIZE, fl_locks)) { + is_blocked = strstr(buf, "->") != NULL; + + fl = alloc_file_lock(); + if (!fl) { + pr_perror("Alloc file lock failed!"); + goto err; + } + + if (parse_file_lock_buf(buf, fl, is_blocked)) { + xfree(fl); + goto err; + } + + pr_info("lockinfo: %lld:%d %x %d %02x:%02x:%ld %lld %s\n", + fl->fl_id, fl->fl_kind, fl->fl_ltype, + fl->fl_owner, fl->maj, fl->min, fl->i_no, + fl->start, fl->end); + + + if (fl->fl_kind == FL_UNKNOWN) { + pr_err("Unknown file lock: %s!\n", buf); + xfree(fl); + goto err; + } + + if (is_blocked) { + /* + * All target processes are stopped in this moment and + * can't wait any locks. 
+ */ + pr_debug("Skip blocked processes\n"); + xfree(fl); + continue; + } + + if ((fl->fl_kind == FL_POSIX) && + !pid_in_pstree(fl->fl_owner)) { + /* + * We only care about tasks which are taken + * into dump, so we only collect file locks + * belong to these tasks. + */ + xfree(fl); + continue; + } + + list_add_tail(&fl->list, &file_lock_list); + } + + exit_code = 0; +err: + fclose(fl_locks); + return exit_code; +} + +void free_posix_timers(struct proc_posix_timers_stat *st) +{ + while (!list_empty(&st->timers)) { + struct proc_posix_timer *timer; + timer = list_first_entry(&st->timers, struct proc_posix_timer, list); + list_del(&timer->list); + xfree(timer); + } +} + +int parse_posix_timers(pid_t pid, struct proc_posix_timers_stat *args) +{ + int exit_code = -1; + int pid_t; + int i = 0; + + struct bfd f; + char *s; + char sigpid[7]; + char tidpid[4]; + + struct proc_posix_timer *timer = NULL; + + INIT_LIST_HEAD(&args->timers); + args->timer_n = 0; + + f.fd = open_proc(pid, "timers"); + if (f.fd < 0) + return -1; + + if (bfdopenr(&f)) + return -1; + + while (1) { + char pbuf[17]; /* 16 + eol */ + + s = breadline(&f); + if (!s) + break; + if (IS_ERR(s)) + goto err; + + switch (i % 4) { + case 0: + timer = xzalloc(sizeof(struct proc_posix_timer)); + if (timer == NULL) + goto err; + + if (sscanf(s, "ID: %ld", + &timer->spt.it_id) != 1) + goto err; + break; + case 1: + if (sscanf(s, "signal: %d/%16s", + &timer->spt.si_signo, pbuf) != 2) + goto err; + break; + case 2: + if (sscanf(s, "notify: %6[a-z]/%3[a-z].%d\n", + sigpid, tidpid, &pid_t) != 3) + goto err; + break; + case 3: + if (sscanf(s, "ClockID: %d\n", + &timer->spt.clock_id) != 1) + goto err; + + timer->spt.sival_ptr = NULL; + if (sscanf(pbuf, "%p", &timer->spt.sival_ptr) != 1 && + strcmp(pbuf, "(null)")) { + pr_err("Unable to parse '%s'\n", pbuf); + goto err; + } + + if ( tidpid[0] == 't') { + timer->spt.it_sigev_notify = SIGEV_THREAD_ID; + } else { + switch (sigpid[0]) { + case 's' : + 
timer->spt.it_sigev_notify = SIGEV_SIGNAL; + break; + case 't' : + timer->spt.it_sigev_notify = SIGEV_THREAD; + break; + default : + timer->spt.it_sigev_notify = SIGEV_NONE; + break; + } + } + + list_add(&timer->list, &args->timers); + timer = NULL; + args->timer_n++; + break; + } + i++; + } + + exit_code = 0; +out: + bclose(&f); + return exit_code; +err: + xfree(timer); + free_posix_timers(args); + pr_perror("Parse error in posix timers proc file!"); + goto out; +} + +int parse_threads(int pid, struct pid **_t, int *_n) +{ + struct dirent *de; + DIR *dir; + struct pid *t = NULL; + int nr = 1; + + if (*_t) + t = *_t; + + dir = opendir_proc(pid, "task"); + if (!dir) + return -1; + + while ((de = readdir(dir))) { + struct pid *tmp; + + /* We expect numbers only here */ + if (de->d_name[0] == '.') + continue; + + if (*_t == NULL) { + tmp = xrealloc(t, nr * sizeof(struct pid)); + if (!tmp) { + xfree(t); + closedir(dir); + return -1; + } + t = tmp; + t[nr - 1].ns[0].virt = -1; + } + t[nr - 1].real = atoi(de->d_name); + t[nr - 1].state = TASK_THREAD; + nr++; + } + + closedir(dir); + + if (*_t == NULL) { + *_t = t; + *_n = nr - 1; + } else + BUG_ON(nr - 1 != *_n); + + return 0; +} + +int parse_cgroup_file(FILE *f, struct list_head *retl, unsigned int *n) +{ + while (fgets(buf, BUF_SIZE, f)) { + struct cg_ctl *ncc, *cc; + char *name, *path = NULL, *e; + + ncc = xmalloc(sizeof(*cc)); + if (!ncc) + goto err; + + /* + * Typical output (':' is a separator here) + * + * 4:cpu,cpuacct:/ + * 3:cpuset:/ + * 2:name=systemd:/user.slice/user-1000.slice/session-1.scope + */ + name = strchr(buf, ':'); + if (name) { + path = strchr(++name, ':'); + if (*name == ':') { + /* + * It's unified hierarchy. On kernels with legacy + * tree this item is added automatically, so we + * can just skip one. For those with full unified + * support is on ... we need to write new code. 
+ */ + xfree(ncc); + continue; + } + } + if (!name || !path) { + pr_err("Failed parsing cgroup %s\n", buf); + xfree(ncc); + goto err; + } + e = strchr(name, '\n'); + *path++ = '\0'; + if (e) + *e = '\0'; + + /* + * Controllers and their props might be + * configured the way some of them are + * not taken into the image for migration + * sake or container specifics. + */ + if (cgp_should_skip_controller(name)) { + pr_debug("cg-prop: Skipping controller %s\n", name); + xfree(ncc); + continue; + } + + ncc->name = xstrdup(name); + ncc->path = xstrdup(path); + ncc->cgns_prefix = 0; + if (!ncc->name || !ncc->path) { + xfree(ncc->name); + xfree(ncc->path); + xfree(ncc); + goto err; + } + + list_for_each_entry(cc, retl, l) + if (strcmp(cc->name, name) >= 0) + break; + + list_add_tail(&ncc->l, &cc->l); + (*n)++; + } + + return 0; + +err: + put_ctls(retl); + return -1; +} + +int parse_task_cgroup(int pid, struct parasite_dump_cgroup_args *args, struct list_head *retl, unsigned int *n) +{ + FILE *f; + int ret; + LIST_HEAD(internal); + unsigned int n_internal = 0; + struct cg_ctl *intern, *ext; + + f = fopen_proc(pid, "cgroup"); + if (!f) + return -1; + + ret = parse_cgroup_file(f, retl, n); + fclose(f); + if (ret < 0) + return -1; + + /* No parasite args, we're dumping criu's cg set, so we don't need to + * try and parse the "internal" cgroup set to find namespace + * boundaries. + */ + if (!args) + return 0; + + f = fmemopen(args->contents, strlen(args->contents), "r"); + if (!f) { + pr_perror("couldn't fmemopen cgroup buffer %s", args->contents); + return -1; + } + + ret = parse_cgroup_file(f, &internal, &n_internal); + fclose(f); + if (ret < 0) { + pr_err("couldn't parse internal cgroup file\n"); + return -1; + } + + /* Here's where we actually compute the cgns prefix. Consider a task + * in /foo/bar which has unshared its namespace at /foo. The internal + * path is /bar, but the external path is /foo/bar, and the cgns + * prefix is /foo. 
The algorithm is: + * + * // no cg ns unshare in this case + * if (internal == external) + * continue; + * idx = find_suffix_pos(external, internal) + * cgns_prefix = external[:idx] + */ + list_for_each_entry(intern, &internal, l) { + list_for_each_entry(ext, retl, l) { + char *pos; + + if (strcmp(ext->name, intern->name)) + continue; + + /* If the cgroup namespace was unshared at / (or there + * is no cgroup namespace relative to criu), the paths + * are equal and we don't need to set a prefix. + */ + if (!strcmp(ext->path, intern->path)) + continue; + + /* +1 here to chop off the leading / */ + pos = ext->path + strlen(ext->path) - strlen(intern->path+1); + if (strcmp(pos, intern->path+1)) { + pr_err("invalid cgroup configuration, %s is not a suffix of %s\n", intern->path, ext->path); + ret = -1; + goto out; + } + + ext->cgns_prefix = pos - ext->path; + if (ext->path[ext->cgns_prefix-1] == '/') + ext->cgns_prefix--; + } + } + +out: + put_ctls(&internal); + return ret; +} + +void put_ctls(struct list_head *l) +{ + struct cg_ctl *c, *n; + + list_for_each_entry_safe(c, n, l, l) { + xfree(c->name); + xfree(c->path); + xfree(c); + } + INIT_LIST_HEAD(l); +} + +/* Parse and create all the real controllers. This does not include things with + * the "name=" prefix, e.g. systemd. + */ +int collect_controllers(struct list_head *cgroups, unsigned int *n_cgroups) +{ + int exit_code = -1; + FILE *f; + + f = fopen_proc(PROC_SELF, "cgroup"); + if (f == NULL) + return -1; + + while (fgets(buf, BUF_SIZE, f)) { + struct cg_controller *nc = NULL; + char *controllers, *off; + + controllers = strchr(buf, ':'); + if (!controllers) { + pr_err("Unable to parse \"%s\"\n", buf); + goto err; + } + controllers++; + + if (*controllers == ':') + /* + * Unified hier. See comment in parse_cgroup_file + * for more details. 
+ */ + continue; + + off = strchr(controllers, ':'); + if (!off) { + pr_err("Unable to parse \"%s\"\n", buf); + goto err; + } + *off = '\0'; + while (1) { + off = strchr(controllers, ','); + if (off) + *off = '\0'; + + if (!strncmp("name=", controllers, 5)) + goto skip; + + if (!nc) { + nc = new_controller(controllers); + if (!nc) + goto err; + list_add_tail(&nc->l, cgroups); + (*n_cgroups)++; + } else { + void *m; + char *n; + + nc->n_controllers++; + m = xrealloc(nc->controllers, sizeof(char *) * nc->n_controllers); + if (!m) + goto err; + + nc->controllers = m; + + n = xstrdup(controllers); + if (!n) + goto err; + + nc->controllers[nc->n_controllers-1] = n; + } + +skip: + if (!off) + break; + controllers = off + 1; + } + } + + exit_code = 0; +err: + fclose(f); + return exit_code; +} + +/* + * If an OverlayFS mountpoint is found in the mountinfo table, + * we enable opts.overlayfs, which is a workaround for the + * OverlayFS Kernel bug. + * + * See fixup_overlayfs for details. + */ +int overlayfs_parse(struct mount_info *new) +{ + opts.overlayfs = true; + return 0; +} + +/* + * AUFS callback function to "fix up" the root pathname. + * See sysfs_parse.c for details. 
+ */
+int aufs_parse(struct mount_info *new)
+{
+	int ret = 0;
+
+	/* Only the "./" mountpoint switches AUFS handling on */
+	if (!strcmp(new->mountpoint, "./")) {
+		opts.aufs = true;
+		ret = parse_aufs_branches(new);
+	}
+
+	return ret;
+}
+
+/*
+ * Collect the values listed in the "children" file of every thread of @pid.
+ *
+ * Each entry of the "task" directory opened for @pid is visited (the ones
+ * dir_dots() reports are skipped) and the tokens breadchr(&f, ' ') returns
+ * from its task/<name>/children file are converted with strtol() and
+ * appended to a single array grown with xrealloc().
+ *
+ * Returns 0 and stores the array in *@_c (NULL if nothing was read) and its
+ * length in *@_n. Returns -1 on any failure; the partially built array is
+ * freed and *@_c / *@_n are not written.
+ */
+int parse_children(pid_t pid, pid_t **_c, int *_n)
+{
+	pid_t *ch = NULL;
+	int nr = 0;
+	DIR *dir;
+	struct dirent *de;
+	struct bfd f;
+
+	dir = opendir_proc(pid, "task");
+	if (dir == NULL)
+		return -1;
+
+	while ((de = readdir(dir))) {
+		char *pos, *end;
+
+		if (dir_dots(de))
+			continue;
+
+		f.fd = open_proc(pid, "task/%s/children", de->d_name);
+		if (f.fd < 0)
+			goto err;
+
+		/*
+		 * NOTE(review): a bfdopenr() failure skips bclose();
+		 * presumably bfdopenr() releases f.fd itself -- confirm.
+		 */
+		if (bfdopenr(&f))
+			goto err;
+
+		while (1) {
+			pid_t val, *tmp;
+
+			pos = breadchr(&f, ' ');
+			if (IS_ERR(pos))
+				goto err_close;
+			if (pos == NULL)
+				break;
+
+			val = strtol(pos, &end, 0);
+
+			/* The number has to run up to the end of the token */
+			if (*end != 0 && *end != ' ') {
+				pr_err("Unable to parse %s\n", end);
+				goto err_close;
+			}
+
+			/* Grow via a temporary so @ch stays valid for the error path */
+			tmp = xrealloc(ch, (nr + 1) * sizeof(pid_t));
+			if (!tmp)
+				goto err_close;
+
+			ch = tmp;
+			ch[nr] = val;
+			nr++;
+		}
+		bclose(&f);
+	}
+
+	*_c = ch;
+	*_n = nr;
+
+	closedir(dir);
+	return 0;
+err_close:
+	bclose(&f);
+err:
+	closedir(dir);
+	xfree(ch);
+	return -1;
+}
+
+#define CSEC_PER_SEC 100
+
+/*
+ * Read the first value of /proc/uptime ("<sec>.<csec>") and store it in
+ * @upt scaled by USEC_PER_SEC.
+ *
+ * Returns 0 on success, -1 if the file can't be opened or doesn't start
+ * with two numbers separated by a dot; @upt is only written on success.
+ */
+int parse_uptime(uint64_t *upt)
+{
+	unsigned long sec, csec;
+	FILE *f;
+
+	f = fopen("/proc/uptime", "r");
+	if (!f) {
+		pr_perror("Failed to fopen /proc/uptime");
+		return -1;
+	}
+
+	/* At most two fraction digits are taken, i.e. hundredths of a second */
+	if (fscanf(f, "%lu.%2lu", &sec, &csec) != 2) {
+		pr_perror("Failed to parse /proc/uptime");
+		fclose(f);
+		return -1;
+	}
+
+	/*
+	 * Widen before scaling: where unsigned long is only 32 bits wide
+	 * the product sec * USEC_PER_SEC would wrap (unless USEC_PER_SEC
+	 * happens to be a 64-bit constant itself) and a truncated uptime
+	 * would be reported.
+	 */
+	*upt = (uint64_t)sec * USEC_PER_SEC +
+	       (uint64_t)csec * (USEC_PER_SEC / CSEC_PER_SEC);
+
+	fclose(f);
+	return 0;
+}
diff --git a/CRIU_code/criu/protobuf-desc.c b/CRIU_code/criu/protobuf-desc.c
new file mode 100644
index 0000000..bfe00c5
--- /dev/null
+++ b/CRIU_code/criu/protobuf-desc.c
@@ -0,0 +1,104 @@
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+
+#include "common/compiler.h"
+#include "log.h"
+
+#include "protobuf-desc.h"
+
+#include "images/inventory.pb-c.h"
+#include "images/stats.pb-c.h"
+#include "images/regfile.pb-c.h"
+#include "images/ext-file.pb-c.h"
+#include "images/ns.pb-c.h"
+#include "images/eventfd.pb-c.h"
+#include "images/eventpoll.pb-c.h"
+#include "images/signalfd.pb-c.h"
+#include "images/fsnotify.pb-c.h"
+#include "images/core.pb-c.h"
+#include "images/mm.pb-c.h"
+#include "images/pipe.pb-c.h"
+#include "images/fifo.pb-c.h"
+#include "images/fdinfo.pb-c.h"
+#include "images/pipe-data.pb-c.h"
+#include "images/pstree.pb-c.h"
+#include "images/sa.pb-c.h"
+#include "images/sk-unix.pb-c.h"
+#include "images/sk-inet.pb-c.h"
+#include "images/packet-sock.pb-c.h"
+#include "images/sk-packet.pb-c.h"
+#include "images/creds.pb-c.h"
+#include "images/timer.pb-c.h"
+#include "images/utsns.pb-c.h"
+#include "images/ipc-var.pb-c.h"
+#include "images/ipc-shm.pb-c.h"
+#include "images/ipc-msg.pb-c.h"
+#include "images/ipc-sem.pb-c.h"
+#include "images/fs.pb-c.h"
+#include "images/remap-file-path.pb-c.h"
+#include "images/ghost-file.pb-c.h"
+#include "images/mnt.pb-c.h"
+#include "images/netdev.pb-c.h"
+#include "images/tcp-stream.pb-c.h"
+#include "images/tty.pb-c.h"
+#include "images/file-lock.pb-c.h"
+#include "images/rlimit.pb-c.h"
+#include "images/pagemap.pb-c.h"
+#include "images/siginfo.pb-c.h"
+#include "images/sk-netlink.pb-c.h"
+#include "images/vma.pb-c.h"
+#include "images/tun.pb-c.h"
+#include "images/cgroup.pb-c.h"
+#include "images/timerfd.pb-c.h"
+#include "images/cpuinfo.pb-c.h"
+#include "images/userns.pb-c.h"
+#include "images/seccomp.pb-c.h"
+#include "images/binfmt-misc.pb-c.h"
+#include "images/autofs.pb-c.h"
+#include "images/remote-image.pb-c.h"
+
+/* Table of per-message protobuf callbacks, indexed by the PB_* type constants */
+struct cr_pb_message_desc cr_pb_descs[PB_MAX];
+
+/*
+ * Fill the cr_pb_descs[] slot PB_<__type> for the message whose C type is
+ * <__vtype>Entry and whose generated symbols are named <__ftype>_entry__*.
+ */
+#define CR_PB_DESC(__type, __vtype, __ftype)		\
+	CR_PB_MDESC_INIT(cr_pb_descs[PB_##__type],	\
+			 __vtype##Entry,		\
+			 __ftype##_entry)
+
+/*
+ * Each helper below is a statement expression that evaluates to the
+ * generated <__fn>__* function cast to the generic callback type. The call
+ * under "if (0)" is never executed: it only makes the compiler check the
+ * function against an argument of the message's own type (@__o) before the
+ * cast hides the real signature.
+ */
+#define PB_PACK_TYPECHECK(__o, __fn)	({ if (0) __fn##__pack(__o, NULL); (pb_pack_t)&__fn##__pack; })
+#define PB_GPS_TYPECHECK(__o, __fn)	({ if (0) __fn##__get_packed_size(__o); (pb_getpksize_t)&__fn##__get_packed_size; })
+#define PB_UNPACK_TYPECHECK(__op, __fn)	({ if (0) *__op = __fn##__unpack(NULL, 0, NULL); (pb_unpack_t)&__fn##__unpack; })
+#define PB_FREE_TYPECHECK(__o, __fn)	({ if (0) __fn##__free_unpacked(__o, NULL); (pb_free_t)&__fn##__free_unpacked; })
+
+/*
+ * Initialize the descriptor @__var with the get-packed-size, pack, unpack
+ * and free callbacks and the protobuf descriptor generated for @__name;
+ * @__type is the message's C type the callbacks are checked against.
+ *
+ * This should be explicitly "called" to do type-checking
+ */
+
+#define CR_PB_MDESC_INIT(__var, __type, __name)					\
+	do {									\
+		__var.getpksize	= PB_GPS_TYPECHECK((__type *)NULL, __name);	\
+		__var.pack	= PB_PACK_TYPECHECK((__type *)NULL, __name);	\
+		__var.unpack	= PB_UNPACK_TYPECHECK((__type **)NULL, __name);	\
+		__var.free	= PB_FREE_TYPECHECK((__type *)NULL, __name);	\
+		__var.pb_desc	= &__name##__descriptor;			\
+	} while (0)
+
+/*
+ * Populate cr_pb_descs[]. The entries listed here are set up by hand;
+ * PB_IPCNS_MSG and PB_PAGEMAP_HEAD go through CR_PB_MDESC_INIT directly
+ * because their type and symbol names don't carry the Entry/_entry suffix
+ * that CR_PB_DESC appends. Presumably the included protobuf-desc-gen.h
+ * supplies the initializers for the remaining types -- confirm against
+ * the generator.
+ */
+void cr_pb_init(void)
+{
+	CR_PB_DESC(IDS, TaskKobjIds, task_kobj_ids);
+	CR_PB_DESC(SIGACT, Sa, sa);
+	CR_PB_DESC(SK_QUEUES, SkPacket, sk_packet);
+	CR_PB_MDESC_INIT(cr_pb_descs[PB_IPCNS_MSG], IpcMsg, ipc_msg);
+	CR_PB_DESC(IPCNS_MSG_ENT, IpcMsg, ipc_msg);
+	CR_PB_DESC(REMAP_FPATH, RemapFilePath, remap_file_path);
+	CR_PB_DESC(NETDEV, NetDevice, net_device);
+	CR_PB_MDESC_INIT(cr_pb_descs[PB_PAGEMAP_HEAD], PagemapHead, pagemap_head);
+
+#include "protobuf-desc-gen.h"
+}
diff --git a/CRIU_code/criu/protobuf.c b/CRIU_code/criu/protobuf.c
new file mode 100644
index 0000000..8eb73e0
--- /dev/null
+++ b/CRIU_code/criu/protobuf.c
@@ -0,0 +1,258 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "image.h"
+#include "servicefd.h"
+#include "common/compiler.h"
+#include "log.h"
+#include "rst-malloc.h"
+#include "string.h"
+#include "sockets.h"
+#include "cr_options.h"
+#include "bfd.h"
+#include "protobuf.h"
+#include "util.h"
+
+/*
+ * To speed up reading of packed objects
+ * by providing space on stack, this should
+ * be more than enough for most objects.
+ */
+#define PB_PKOBJ_LOCAL_SIZE	1024
+
+/*
+ * Resolve the path behind the image's file descriptor for log messages.
+ *
+ * The result points either into a static buffer that the next call
+ * overwrites or to a constant placeholder used when the link can't be
+ * read. It is never NULL, so it can be handed to a "%s" conversion
+ * unconditionally.
+ */
+static char *image_name(struct cr_img *img)
+{
+	static char image_path[PATH_MAX];
+	int fd = img->_x.fd;
+
+	if (read_fd_link(fd, image_path, sizeof(image_path)) > 0)
+		return image_path;
+	return "(unknown image)";
+}
+
+/*
+ * Reads PB record (header + packed object) from image @img and unpacks
+ * it with the unpack callback registered for @type to the pointer @pobj.
+ * The header is the u32 size of the packed object that follows it.
+ *
+ *  1 on success
+ * -1 on error (or EOF met and @eof set to false)
+ *  0 on EOF and @eof set to true
+ *
+ * *@pobj is reset to NULL once @type has been validated and only points
+ * to an object when 1 is returned.
+ *
+ * Don't forget to free memory granted to unpacked object in calling code if needed
+ */
+
+int do_pb_read_one(struct cr_img *img, void **pobj, int type, bool eof)
+{
+	u8 local[PB_PKOBJ_LOCAL_SIZE];
+	void *buf = (void *)&local;
+	u32 size;
+	int ret;
+
+	if (!cr_pb_descs[type].pb_desc) {
+		pr_err("Wrong object requested %d on %s\n",
+			type, image_name(img));
+		return -1;
+	}
+
+	*pobj = NULL;
+
+	/* An empty image is handled as an immediate EOF, nothing is read */
+	if (unlikely(empty_image(img)))
+		ret = 0;
+	else
+		ret = bread(&img->_x, &size, sizeof(size));
+	if (ret == 0) {
+		if (eof) {
+			return 0;
+		} else {
+			pr_err("Unexpected EOF on %s\n",
+					image_name(img));
+			return -1;
+		}
+	} else if (ret < (int)sizeof(size)) {
+		/*
+		 * The cast keeps the comparison signed. Against the unsigned
+		 * sizeof() a negative bread() result would be converted to a
+		 * huge value, pass this check and let the code below run with
+		 * an uninitialized @size.
+		 */
+		pr_perror("Read %d bytes while %d expected on %s",
+			  ret, (int)sizeof(size),
+			  image_name(img));
+		return -1;
+	}
+
+	/* Objects that don't fit the on-stack buffer are read into the heap */
+	if (size > sizeof(local)) {
+		ret = -1;
+		buf = xmalloc(size);
+		if (!buf)
+			goto err;
+	}
+
+	ret = bread(&img->_x, buf, size);
+	if (ret < 0) {
+		pr_perror("Can't read %d bytes from file %s",
+			  size, image_name(img));
+		goto err;
+	} else if (ret != size) {
+		pr_perror("Read %d bytes while %d expected from %s",
+			  ret, size, image_name(img));
+		ret = -1;
+		goto err;
+	}
+
+	*pobj = cr_pb_descs[type].unpack(NULL, size, buf);
+	if (!*pobj) {
+		ret = -1;
+		pr_err("Failed unpacking object %p from %s\n",
+				pobj, image_name(img));
+		goto err;
+	}
+
+	ret = 1;
+err:
+	/* Only a heap buffer needs releasing; the packed bytes are no longer used */
+	if (buf != (void *)&local)
+		xfree(buf);
+
+	return ret;
+}
+
+/*
+ * Writes PB record (header + packed object pointed by @obj)
+ * to file @fd, using @getpksize to
get packed size and @pack + * to implement packing + * + * 0 on success + * -1 on error + */ +int pb_write_one(struct cr_img *img, void *obj, int type) +{ + u8 local[PB_PKOBJ_LOCAL_SIZE]; + void *buf = (void *)&local; + u32 size, packed; + int ret = -1; + struct iovec iov[2]; + + if (!cr_pb_descs[type].pb_desc) { + pr_err("Wrong object requested %d\n", type); + return -1; + } + + if (lazy_image(img) && open_image_lazy(img)) + return -1; + + size = cr_pb_descs[type].getpksize(obj); + if (size > (u32)sizeof(local)) { + buf = xmalloc(size); + if (!buf) + goto err; + } + + packed = cr_pb_descs[type].pack(obj, buf); + if (packed != size) { + pr_err("Failed packing PB object %p\n", obj); + goto err; + } + + iov[0].iov_base = &size; + iov[0].iov_len = sizeof(size); + iov[1].iov_base = buf; + iov[1].iov_len = size; + + ret = bwritev(&img->_x, iov, 2); + if (ret != size + sizeof(size)) { + pr_perror("Can't write %d bytes", (int)(size + sizeof(size))); + goto err; + } + + ret = 0; +err: + if (buf != (void *)&local) + xfree(buf); + return ret; +} + +int collect_entry(ProtobufCMessage *msg, struct collect_image_info *cinfo) +{ + void *obj; + void *(*o_alloc)(size_t size) = malloc; + void (*o_free)(void *ptr) = free; + + if (cinfo->flags & COLLECT_SHARED) { + o_alloc = shmalloc; + o_free = shfree_last; + } + + if (cinfo->priv_size) { + obj = o_alloc(cinfo->priv_size); + if (!obj) + return -1; + } else + obj = NULL; + + cinfo->flags |= COLLECT_HAPPENED; + if (cinfo->collect(obj, msg, NULL) < 0) { + o_free(obj); + cr_pb_descs[cinfo->pb_type].free(msg, NULL); + return -1; + } + + if (!cinfo->priv_size && !(cinfo->flags & COLLECT_NOFREE)) + cr_pb_descs[cinfo->pb_type].free(msg, NULL); + + return 0; +} + +int collect_image(struct collect_image_info *cinfo) +{ + int ret; + struct cr_img *img; + void *(*o_alloc)(size_t size) = malloc; + void (*o_free)(void *ptr) = free; + + pr_info("Collecting %d/%d (flags %x)\n", + cinfo->fd_type, cinfo->pb_type, cinfo->flags); + + img = 
open_image(cinfo->fd_type, O_RSTR); + if (!img) + return -1; + + if (cinfo->flags & COLLECT_SHARED) { + o_alloc = shmalloc; + o_free = shfree_last; + } + + while (1) { + void *obj; + ProtobufCMessage *msg; + + if (cinfo->priv_size) { + ret = -1; + obj = o_alloc(cinfo->priv_size); + if (!obj) + break; + } else + obj = NULL; + + ret = pb_read_one_eof(img, &msg, cinfo->pb_type); + if (ret <= 0) { + o_free(obj); + break; + } + + cinfo->flags |= COLLECT_HAPPENED; + ret = cinfo->collect(obj, msg, img); + if (ret < 0) { + o_free(obj); + cr_pb_descs[cinfo->pb_type].free(msg, NULL); + break; + } + + if (!cinfo->priv_size && !(cinfo->flags & COLLECT_NOFREE)) + cr_pb_descs[cinfo->pb_type].free(msg, NULL); + } + + close_image(img); + pr_debug(" `- ... done\n"); + return ret; +} diff --git a/CRIU_code/criu/pstree.c b/CRIU_code/criu/pstree.c new file mode 100644 index 0000000..92b4167 --- /dev/null +++ b/CRIU_code/criu/pstree.c @@ -0,0 +1,1038 @@ +#include +#include +#include +#include + +#include "types.h" +#include "cr_options.h" +#include "pstree.h" +#include "rst-malloc.h" +#include "common/lock.h" +#include "namespaces.h" +#include "files.h" +#include "tty.h" +#include "mount.h" +#include "dump.h" +#include "util.h" +#include "net.h" + +#include "protobuf.h" +#include "images/pstree.pb-c.h" +#include "crtools.h" + +struct pstree_item *root_item; +static struct rb_root pid_root_rb; + +void core_entry_free(CoreEntry *core) +{ + if (core->tc && core->tc->timers) + xfree(core->tc->timers->posix); + if (core->thread_core) + xfree(core->thread_core->creds->groups); + arch_free_thread_info(core); + xfree(core); +} + +#ifndef RLIM_NLIMITS +# define RLIM_NLIMITS 16 +#endif + +CoreEntry *core_entry_alloc(int th, int tsk) +{ + size_t sz; + CoreEntry *core = NULL; + void *m; + + sz = sizeof(CoreEntry); + if (tsk) { + sz += sizeof(TaskCoreEntry) + TASK_COMM_LEN; + if (th) { + sz += sizeof(TaskRlimitsEntry); + sz += RLIM_NLIMITS * sizeof(RlimitEntry *); + sz += RLIM_NLIMITS * 
sizeof(RlimitEntry); + sz += sizeof(TaskTimersEntry); + sz += 3 * sizeof(ItimerEntry); /* 3 for real, virt and prof */ + } + } + if (th) { + CredsEntry *ce = NULL; + + sz += sizeof(ThreadCoreEntry) + sizeof(ThreadSasEntry) + sizeof(CredsEntry); + + sz += CR_CAP_SIZE * sizeof(ce->cap_inh[0]); + sz += CR_CAP_SIZE * sizeof(ce->cap_prm[0]); + sz += CR_CAP_SIZE * sizeof(ce->cap_eff[0]); + sz += CR_CAP_SIZE * sizeof(ce->cap_bnd[0]); + /* + * @groups are dynamic and allocated + * on demand. + */ + } + + m = xmalloc(sz); + if (m) { + core = xptr_pull(&m, CoreEntry); + core_entry__init(core); + core->mtype = CORE_ENTRY__MARCH; + + if (tsk) { + core->tc = xptr_pull(&m, TaskCoreEntry); + task_core_entry__init(core->tc); + core->tc->comm = xptr_pull_s(&m, TASK_COMM_LEN); + memzero(core->tc->comm, TASK_COMM_LEN); + + if (th) { + TaskRlimitsEntry *rls; + TaskTimersEntry *tte; + int i; + + rls = core->tc->rlimits = xptr_pull(&m, TaskRlimitsEntry); + task_rlimits_entry__init(rls); + + rls->n_rlimits = RLIM_NLIMITS; + rls->rlimits = xptr_pull_s(&m, sizeof(RlimitEntry *) * RLIM_NLIMITS); + + for (i = 0; i < RLIM_NLIMITS; i++) { + rls->rlimits[i] = xptr_pull(&m, RlimitEntry); + rlimit_entry__init(rls->rlimits[i]); + } + + tte = core->tc->timers = xptr_pull(&m, TaskTimersEntry); + task_timers_entry__init(tte); + tte->real = xptr_pull(&m, ItimerEntry); + itimer_entry__init(tte->real); + tte->virt = xptr_pull(&m, ItimerEntry); + itimer_entry__init(tte->virt); + tte->prof = xptr_pull(&m, ItimerEntry); + itimer_entry__init(tte->prof); + } + } + + if (th) { + CredsEntry *ce; + + core->thread_core = xptr_pull(&m, ThreadCoreEntry); + thread_core_entry__init(core->thread_core); + core->thread_core->sas = xptr_pull(&m, ThreadSasEntry); + thread_sas_entry__init(core->thread_core->sas); + ce = core->thread_core->creds = xptr_pull(&m, CredsEntry); + creds_entry__init(ce); + + ce->n_cap_inh = CR_CAP_SIZE; + ce->n_cap_prm = CR_CAP_SIZE; + ce->n_cap_eff = CR_CAP_SIZE; + ce->n_cap_bnd = CR_CAP_SIZE; 
+ ce->cap_inh = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_inh[0])); + ce->cap_prm = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_prm[0])); + ce->cap_eff = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_eff[0])); + ce->cap_bnd = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_bnd[0])); + + if (arch_alloc_thread_info(core)) { + xfree(core); + core = NULL; + } + } + } + + return core; +} + +int pstree_alloc_cores(struct pstree_item *item) +{ + unsigned int i; + + item->core = xzalloc(sizeof(*item->core) * item->nr_threads); + if (!item->core) + return -1; + + for (i = 0; i < item->nr_threads; i++) { + if (item->threads[i].real == item->pid->real) + item->core[i] = core_entry_alloc(1, 1); + else + item->core[i] = core_entry_alloc(1, 0); + + if (!item->core[i]) + goto err; + } + + return 0; +err: + pstree_free_cores(item); + return -1; +} + +void pstree_free_cores(struct pstree_item *item) +{ + unsigned int i; + + if (item->core) { + for (i = 1; i < item->nr_threads; i++) + if (item->core[i]) + core_entry_free(item->core[i]); + xfree(item->core); + item->core = NULL; + } +} + +void free_pstree(struct pstree_item *root_item) +{ + struct pstree_item *item = root_item, *parent; + + while (item) { + if (!list_empty(&item->children)) { + item = list_first_entry(&item->children, struct pstree_item, sibling); + continue; + } + + parent = item->parent; + list_del(&item->sibling); + pstree_free_cores(item); + xfree(item->threads); + xfree(item); + item = parent; + } +} + +struct pstree_item *__alloc_pstree_item(bool rst) +{ + struct pstree_item *item; + int sz; + + if (!rst) { + sz = sizeof(*item) + sizeof(struct dmp_info) + sizeof(struct pid); + item = xzalloc(sz); + if (!item) + return NULL; + item->pid = (void *)item + sizeof(*item) + sizeof(struct dmp_info); + } else { + sz = sizeof(*item) + sizeof(struct rst_info) + sizeof(struct pid); + item = shmalloc(sz); + if (!item) + return NULL; + + memset(item, 0, sz); + vm_area_list_init(&rsti(item)->vmas); + 
INIT_LIST_HEAD(&rsti(item)->vma_io); + item->pid = (void *)item + sizeof(*item) + sizeof(struct rst_info); + } + + INIT_LIST_HEAD(&item->children); + INIT_LIST_HEAD(&item->sibling); + + item->pid->ns[0].virt = -1; + item->pid->real = -1; + item->pid->state = TASK_UNDEF; + item->born_sid = -1; + item->pid->item = item; + futex_init(&item->task_st); + + return item; +} + +int init_pstree_helper(struct pstree_item *ret) +{ + BUG_ON(!ret->parent); + ret->pid->state = TASK_HELPER; + rsti(ret)->clone_flags = CLONE_FILES | CLONE_FS; + if (shared_fdt_prepare(ret) < 0) + return -1; + task_entries->nr_helpers++; + return 0; +} + +/* Deep first search on children */ +struct pstree_item *pstree_item_next(struct pstree_item *item) +{ + if (!list_empty(&item->children)) + return list_first_entry(&item->children, struct pstree_item, sibling); + + while (item->parent) { + if (item->sibling.next != &item->parent->children) + return list_entry(item->sibling.next, struct pstree_item, sibling); + item = item->parent; + } + + return NULL; +} + +/* Preorder traversal of pstree item */ +int preorder_pstree_traversal(struct pstree_item *item, int (*f)(struct pstree_item *)) +{ + struct pstree_item *cursor; + + if (f(item) < 0) + return -1; + + list_for_each_entry(cursor, &item->children, sibling) { + if (preorder_pstree_traversal(cursor, f) < 0) + return -1; + } + + return 0; +} + +int dump_pstree(struct pstree_item *root_item) +{ + struct pstree_item *item = root_item; + PstreeEntry e = PSTREE_ENTRY__INIT; + int ret = -1, i; + struct cr_img *img; + + pr_info("\n"); + pr_info("Dumping pstree (pid: %d)\n", root_item->pid->real); + pr_info("----------------------------------------\n"); + + /* + * Make sure we're dumping session leader, if not an + * appropriate option must be passed. + * + * Also note that if we're not a session leader we + * can't get the situation where the leader sits somewhere + * deeper in process tree, thus top-level checking for + * leader is enough. 
+ */ + if (vpid(root_item) != root_item->sid) { + if (!opts.shell_job) { + pr_err("The root process %d is not a session leader. " + "Consider using --" OPT_SHELL_JOB " option\n", vpid(item)); + return -1; + } + } + + img = open_image(CR_FD_PSTREE, O_DUMP); + if (!img) + return -1; + + for_each_pstree_item(item) { + pr_info("Process: %d(%d)\n", vpid(item), item->pid->real); + + e.pid = vpid(item); + e.ppid = item->parent ? vpid(item->parent) : 0; + e.pgid = item->pgid; + e.sid = item->sid; + e.n_threads = item->nr_threads; + + e.threads = xmalloc(sizeof(e.threads[0]) * e.n_threads); + if (!e.threads) + goto err; + + for (i = 0; i < item->nr_threads; i++) + e.threads[i] = item->threads[i].ns[0].virt; + + ret = pb_write_one(img, &e, PB_PSTREE); + xfree(e.threads); + + if (ret) + goto err; + } + ret = 0; + +err: + pr_info("----------------------------------------\n"); + close_image(img); + return ret; +} + +static int prepare_pstree_for_shell_job(pid_t pid) +{ + pid_t current_sid = getsid(pid); + pid_t current_gid = getpgid(pid); + + struct pstree_item *pi; + + pid_t old_sid; + pid_t old_gid; + + if (!opts.shell_job) + return 0; + + if (root_item->sid == vpid(root_item)) + return 0; + + /* + * Migration of a root task group leader is a bit tricky. + * When a task yields SIGSTOP, the kernel notifies the parent + * with SIGCHLD. This means when task is running in a + * shell, the shell obtains SIGCHLD and sends a task to + * the background. + * + * The situation gets changed once we restore the + * program -- our tool become an additional stub between + * the restored program and the shell. So to be able to + * notify the shell with SIGCHLD from our restored + * program -- we make the root task to inherit the + * process group from us. + * + * Not that clever solution but at least it works. 
+ */ + + old_sid = root_item->sid; + + pr_info("Migrating process tree (SID %d->%d)\n", + old_sid, current_sid); + + for_each_pstree_item(pi) { + if (pi->sid == old_sid) + pi->sid = current_sid; + } + + old_gid = root_item->pgid; + if (old_gid != vpid(root_item)) { + if (lookup_create_item(current_sid) == NULL) + return -1; + + pr_info("Migrating process tree (GID %d->%d)\n", + old_gid, current_gid); + + for_each_pstree_item(pi) { + if (pi->pgid == old_gid) + pi->pgid = current_gid; + } + + if (lookup_create_item(current_gid) == NULL) + return -1; + } + + return 0; +} + +/* + * Try to find a pid node in the tree and insert a new one, + * it is not there yet. If pid_node isn't set, pstree_item + * is inserted. + */ +static struct pid *lookup_create_pid(pid_t pid, struct pid *pid_node) +{ + struct rb_node *node = pid_root_rb.rb_node; + struct rb_node **new = &pid_root_rb.rb_node; + struct rb_node *parent = NULL; + + while (node) { + struct pid *this = rb_entry(node, struct pid, ns[0].node); + + parent = *new; + if (pid < this->ns[0].virt) + node = node->rb_left, new = &((*new)->rb_left); + else if (pid > this->ns[0].virt) + node = node->rb_right, new = &((*new)->rb_right); + else + return this; + } + + if (!pid_node) { + struct pstree_item *item; + + item = __alloc_pstree_item(true); + if (item == NULL) + return NULL; + + item->pid->ns[0].virt = pid; + pid_node = item->pid; + } + rb_link_and_balance(&pid_root_rb, &pid_node->ns[0].node, parent, new); + return pid_node; +} + +void pstree_insert_pid(struct pid *pid_node) +{ + struct pid* n; + + n = lookup_create_pid(pid_node->ns[0].virt, pid_node); + + BUG_ON(n != pid_node); +} + +struct pstree_item *lookup_create_item(pid_t pid) +{ + struct pid *node; + + node = lookup_create_pid(pid, NULL); + if (!node) + return NULL; + BUG_ON(node->state == TASK_THREAD); + + return node->item; +} + +struct pid *pstree_pid_by_virt(pid_t pid) +{ + struct rb_node *node = pid_root_rb.rb_node; + + while (node) { + struct pid *this = 
rb_entry(node, struct pid, ns[0].node); + + if (pid < this->ns[0].virt) + node = node->rb_left; + else if (pid > this->ns[0].virt) + node = node->rb_right; + else + return this; + } + return NULL; +} + +static int read_pstree_ids(struct pstree_item *pi) +{ + int ret; + struct cr_img *img; + + img = open_image(CR_FD_IDS, O_RSTR, vpid(pi)); + if (!img) + return -1; + + ret = pb_read_one_eof(img, &pi->ids, PB_IDS); + close_image(img); + + if (ret <= 0) + return ret; + + if (pi->ids->has_mnt_ns_id) { + if (rst_add_ns_id(pi->ids->mnt_ns_id, pi, &mnt_ns_desc)) + return -1; + } + if (pi->ids->has_net_ns_id) { + if (rst_add_ns_id(pi->ids->net_ns_id, pi, &net_ns_desc)) + return -1; + } + + return 0; +} + +static int read_pstree_image(pid_t *pid_max) +{ + int ret = 0, i; + struct cr_img *img; + struct pstree_item *pi; + + pr_info("Reading image tree\n"); + + img = open_image(CR_FD_PSTREE, O_RSTR); + if (!img) + return -1; + + while (1) { + PstreeEntry *e; + + ret = pb_read_one_eof(img, &e, PB_PSTREE); + if (ret <= 0) + break; + + ret = -1; + pi = lookup_create_item(e->pid); + if (pi == NULL) + break; + BUG_ON(pi->pid->state != TASK_UNDEF); + + /* + * All pids should be added in the tree to be able to find + * free pid-s for helpers. pstree_item for these pid-s will + * be initialized when we meet PstreeEntry with this pid or + * we will create helpers for them. 
+ */ + if (lookup_create_item(e->pgid) == NULL) + break; + if (lookup_create_item(e->sid) == NULL) + break; + + pi->pid->ns[0].virt = e->pid; + if (e->pid > *pid_max) + *pid_max = e->pid; + pi->pgid = e->pgid; + if (e->pgid > *pid_max) + *pid_max = e->pgid; + pi->sid = e->sid; + if (e->sid > *pid_max) + *pid_max = e->sid; + pi->pid->state = TASK_ALIVE; + + if (e->ppid == 0) { + if (root_item) { + pr_err("Parent missed on non-root task " + "with pid %d, image corruption!\n", e->pid); + goto err; + } + root_item = pi; + pi->parent = NULL; + } else { + struct pid *pid; + struct pstree_item *parent; + + pid = pstree_pid_by_virt(e->ppid); + if (!pid || pid->state == TASK_UNDEF || pid->state == TASK_THREAD) { + pr_err("Can't find a parent for %d\n", vpid(pi)); + pstree_entry__free_unpacked(e, NULL); + xfree(pi); + goto err; + } + + parent = pid->item; + pi->parent = parent; + list_add(&pi->sibling, &parent->children); + } + + pi->nr_threads = e->n_threads; + pi->threads = xmalloc(e->n_threads * sizeof(struct pid)); + if (!pi->threads) + break; + + for (i = 0; i < e->n_threads; i++) { + struct pid *node; + pi->threads[i].real = -1; + pi->threads[i].ns[0].virt = e->threads[i]; + pi->threads[i].state = TASK_THREAD; + pi->threads[i].item = NULL; + if (i == 0) + continue; /* A thread leader is in a tree already */ + node = lookup_create_pid(pi->threads[i].ns[0].virt, &pi->threads[i]); + + BUG_ON(node == NULL); + if (node != &pi->threads[i]) { + pr_err("Unexpected task %d in a tree %d\n", e->threads[i], i); + return -1; + } + } + + task_entries->nr_threads += e->n_threads; + task_entries->nr_tasks++; + + pstree_entry__free_unpacked(e, NULL); + + ret = read_pstree_ids(pi); + if (ret < 0) + goto err; + } + +err: + close_image(img); + return ret; +} + +#define RESERVED_PIDS 300 +static int get_free_pid() +{ + static struct pid *prev, *next; + + if (prev == NULL) + prev = rb_entry(rb_first(&pid_root_rb), struct pid, ns[0].node); + + while (1) { + struct rb_node *node; + pid_t pid; 
+ + pid = prev->ns[0].virt + 1; + pid = pid < RESERVED_PIDS ? RESERVED_PIDS + 1 : pid; + + node = rb_next(&prev->ns[0].node); + if (node == NULL) + return pid; + next = rb_entry(node, struct pid, ns[0].node); + if (next->ns[0].virt > pid) + return pid; + prev = next; + } + + return -1; +} + +static int prepare_pstree_ids(pid_t pid) +{ + struct pstree_item *item, *child, *helper, *tmp; + LIST_HEAD(helpers); + + pid_t current_pgid = getpgid(pid); + + /* + * Some task can be reparented to init. A helper task should be added + * for restoring sid of such tasks. The helper tasks will be exited + * immediately after forking children and all children will be + * reparented to init. + */ + list_for_each_entry(item, &root_item->children, sibling) { + struct pstree_item *leader; + + /* + * If a child belongs to the root task's session or it's + * a session leader himself -- this is a simple case, we + * just proceed in a normal way. + */ + if (item->sid == root_item->sid || item->sid == vpid(item)) + continue; + + leader = pstree_item_by_virt(item->sid); + BUG_ON(leader == NULL); + if (leader->pid->state != TASK_UNDEF) { + pid_t pid; + + pid = get_free_pid(); + if (pid < 0) + break; + helper = lookup_create_item(pid); + if (helper == NULL) + return -1; + + pr_info("Session leader %d\n", item->sid); + + helper->sid = item->sid; + helper->pgid = leader->pgid; + helper->ids = leader->ids; + helper->parent = leader; + list_add(&helper->sibling, &leader->children); + + pr_info("Attach %d to the task %d\n", + vpid(helper), vpid(leader)); + } else { + helper = leader; + helper->sid = item->sid; + helper->pgid = item->sid; + helper->parent = root_item; + helper->ids = root_item->ids; + list_add_tail(&helper->sibling, &helpers); + } + if (init_pstree_helper(helper)) { + pr_err("Can't init helper\n"); + return -1; + } + + pr_info("Add a helper %d for restoring SID %d\n", + vpid(helper), helper->sid); + + child = list_entry(item->sibling.prev, struct pstree_item, sibling); + item = 
child; + + /* + * Stack on helper task all children with target sid. + */ + list_for_each_entry_safe_continue(child, tmp, &root_item->children, sibling) { + if (child->sid != helper->sid) + continue; + if (child->sid == vpid(child)) + continue; + + pr_info("Attach %d to the temporary task %d\n", + vpid(child), vpid(helper)); + + child->parent = helper; + list_move(&child->sibling, &helper->children); + } + } + + /* Try to connect helpers to session leaders */ + for_each_pstree_item(item) { + if (!item->parent) /* skip the root task */ + continue; + + if (item->pid->state == TASK_HELPER) + continue; + + if (item->sid != vpid(item)) { + struct pstree_item *parent; + + if (item->parent->sid == item->sid) + continue; + + /* the task could fork a child before and after setsid() */ + parent = item->parent; + while (parent && vpid(parent) != item->sid) { + if (parent->born_sid != -1 && parent->born_sid != item->sid) { + pr_err("Can't figure out which sid (%d or %d)" + "the process %d was born with\n", + parent->born_sid, item->sid, vpid(parent)); + return -1; + } + parent->born_sid = item->sid; + pr_info("%d was born with sid %d\n", vpid(parent), item->sid); + parent = parent->parent; + } + + if (parent == NULL) { + pr_err("Can't find a session leader for %d\n", item->sid); + return -1; + } + + continue; + } + } + + /* All other helpers are session leaders for own sessions */ + list_splice(&helpers, &root_item->children); + + /* Add a process group leader if it is absent */ + for_each_pstree_item(item) { + struct pid *pid; + + if (!item->pgid || vpid(item) == item->pgid) + continue; + + pid = pstree_pid_by_virt(item->pgid); + if (pid->state != TASK_UNDEF) { + BUG_ON(pid->state == TASK_THREAD); + rsti(item)->pgrp_leader = pid->item; + continue; + } + + /* + * If the PGID is eq to current one -- this + * means we're inheriting group from the current + * task so we need to escape creating a helper here. 
+ */ + if (current_pgid == item->pgid) + continue; + + helper = pid->item; + + helper->sid = item->sid; + helper->pgid = item->pgid; + helper->pid->ns[0].virt = item->pgid; + helper->parent = item; + helper->ids = item->ids; + if (init_pstree_helper(helper)) { + pr_err("Can't init helper\n"); + return -1; + } + list_add(&helper->sibling, &item->children); + rsti(item)->pgrp_leader = helper; + + pr_info("Add a helper %d for restoring PGID %d\n", + vpid(helper), helper->pgid); + } + + return 0; +} + +static unsigned long get_clone_mask(TaskKobjIdsEntry *i, + TaskKobjIdsEntry *p) +{ + unsigned long mask = 0; + + if (i->files_id == p->files_id) + mask |= CLONE_FILES; + if (i->pid_ns_id != p->pid_ns_id) + mask |= CLONE_NEWPID; + if (i->net_ns_id != p->net_ns_id) + mask |= CLONE_NEWNET; + if (i->ipc_ns_id != p->ipc_ns_id) + mask |= CLONE_NEWIPC; + if (i->uts_ns_id != p->uts_ns_id) + mask |= CLONE_NEWUTS; + if (i->mnt_ns_id != p->mnt_ns_id) + mask |= CLONE_NEWNS; + if (i->user_ns_id != p->user_ns_id) + mask |= CLONE_NEWUSER; + + return mask; +} + +static int prepare_pstree_kobj_ids(void) +{ + struct pstree_item *item; + + /* Find a process with minimal pid for shared fd tables */ + for_each_pstree_item(item) { + struct pstree_item *parent = item->parent; + TaskKobjIdsEntry *ids; + unsigned long cflags; + + if (!item->ids) { + if (item == root_item) { + pr_err("No IDS for root task.\n"); + pr_err("Images currupted or too old criu was used for dump.\n"); + return -1; + } + + continue; + } + + if (parent) + ids = parent->ids; + else + ids = root_ids; + + /* + * Add some sanity check on image data. + */ + if (unlikely(!ids)) { + pr_err("No kIDs provided, image corruption\n"); + return -1; + } + + cflags = get_clone_mask(item->ids, ids); + + if (cflags & CLONE_FILES) { + int ret; + + /* + * There might be a case when kIDs for + * root task are the same as in root_ids, + * thus it's image corruption and we should + * exit out. 
+ */ + if (unlikely(!item->parent)) { + pr_err("Image corruption on kIDs data\n"); + return -1; + } + + ret = shared_fdt_prepare(item); + if (ret) + return ret; + } + + rsti(item)->clone_flags = cflags; + if (parent) + /* + * Mount namespaces are setns()-ed at + * restore_task_mnt_ns() explicitly, + * no need in creating it with its own + * temporary namespace. + * + * Root task is exceptional -- it will + * be born in a fresh new mount namespace + * which will be populated with all other + * namespaces' entries. + */ + rsti(item)->clone_flags &= ~CLONE_NEWNS; + + cflags &= CLONE_ALLNS; + + if (item == root_item) { + pr_info("Will restore in %lx namespaces\n", cflags); + root_ns_mask = cflags; + } else if (cflags & ~(root_ns_mask & CLONE_SUBNS)) { + /* + * Namespaces from CLONE_SUBNS can be nested, but in + * this case nobody can't share external namespaces of + * these types. + * + * Workaround for all other namespaces -- + * all tasks should be in one namespace. And + * this namespace is either inherited from the + * criu or is created for the init task (only) + */ + pr_err("Can't restore sub-task in NS\n"); + return -1; + } + } + + pr_debug("NS mask to use %lx\n", root_ns_mask); + return 0; +} + +int prepare_pstree(void) +{ + int ret; + pid_t pid_max = 0, kpid_max = 0, pid; + int fd; + char buf[21]; + + fd = open_proc(PROC_GEN, PID_MAX_PATH); + if (fd >= 0) { + ret = read(fd, buf, sizeof(buf) - 1); + close(fd); + if (ret > 0) { + buf[ret] = 0; + kpid_max = strtoul(buf, NULL, 10); + pr_debug("kernel pid_max=%d\n", kpid_max); + } + } + + ret = read_pstree_image(&pid_max); + pr_debug("pstree pid_max=%d\n", pid_max); + + if (!ret && kpid_max && pid_max > kpid_max) { + /* Try to set kernel pid_max */ + fd = open_proc_rw(PROC_GEN, PID_MAX_PATH); + if (fd == -1) + ret = -1; + else { + snprintf(buf, sizeof(buf), "%u", pid_max+1); + if (write(fd, buf, strlen(buf)) < 0) { + pr_perror("Can't set kernel pid_max=%s", buf); + ret = -1; + } + else + pr_info("kernel pid_max 
pushed to %s\n", buf); + close(fd); + } + } + + pid = getpid(); + + if (!ret) + /* + * Shell job may inherit sid/pgid from the current + * shell, not from image. Set things up for this. + */ + ret = prepare_pstree_for_shell_job(pid); + if (!ret) + /* + * Walk the collected tree and prepare for restoring + * of shared objects at clone time + */ + ret = prepare_pstree_kobj_ids(); + if (!ret) + /* + * Session/Group leaders might be dead. Need to fix + * pstree with properly injected helper tasks. + */ + ret = prepare_pstree_ids(pid); + + return ret; +} + +int prepare_dummy_pstree(void) +{ + pid_t dummy = 0; + + if (check_img_inventory() == -1) + return -1; + + if (prepare_task_entries() == -1) + return -1; + + if (read_pstree_image(&dummy) == -1) + return -1; + + return 0; +} + +bool restore_before_setsid(struct pstree_item *child) +{ + int csid = child->born_sid == -1 ? child->sid : child->born_sid; + + if (child->parent->born_sid == csid) + return true; + + return false; +} + +struct pstree_item *pstree_item_by_virt(pid_t virt) +{ + struct pid *pid; + + pid = pstree_pid_by_virt(virt); + if (pid == NULL) + return NULL; + BUG_ON(pid->state == TASK_THREAD); + + return pid->item; +} + +struct pstree_item *pstree_item_by_real(pid_t real) +{ + struct pstree_item *item; + + for_each_pstree_item(item) { + if (item->pid->real == real) + return item; + } + return NULL; +} + +int pid_to_virt(pid_t real) +{ + struct pstree_item *item; + + item = pstree_item_by_real(real); + if (item) + return vpid(item); + return 0; +} diff --git a/CRIU_code/criu/rbtree.c b/CRIU_code/criu/rbtree.c new file mode 100644 index 0000000..64a38ea --- /dev/null +++ b/CRIU_code/criu/rbtree.c @@ -0,0 +1,357 @@ +/* + * RBtree implementation adopted from the Linux kernel sources. 
+ */ + +#include +#include "rbtree.h" + +static void __rb_rotate_left(struct rb_node *node, struct rb_root *root) +{ + struct rb_node *right = node->rb_right; + struct rb_node *parent = rb_parent(node); + + node->rb_right = right->rb_left; + if (node->rb_right) + rb_set_parent(right->rb_left, node); + right->rb_left = node; + + rb_set_parent(right, parent); + + if (parent) { + if (node == parent->rb_left) + parent->rb_left = right; + else + parent->rb_right = right; + } else + root->rb_node = right; + rb_set_parent(node, right); +} + +static void __rb_rotate_right(struct rb_node *node, struct rb_root *root) +{ + struct rb_node *left = node->rb_left; + struct rb_node *parent = rb_parent(node); + + node->rb_left = left->rb_right; + if (node->rb_left) + rb_set_parent(left->rb_right, node); + left->rb_right = node; + + rb_set_parent(left, parent); + + if (parent) { + if (node == parent->rb_right) + parent->rb_right = left; + else + parent->rb_left = left; + } else + root->rb_node = left; + rb_set_parent(node, left); +} + +void rb_insert_color(struct rb_node *node, struct rb_root *root) +{ + struct rb_node *parent, *gparent; + + while ((parent = rb_parent(node)) && rb_is_red(parent)) { + gparent = rb_parent(parent); + + if (parent == gparent->rb_left) { + { + register struct rb_node *uncle = gparent->rb_right; + if (uncle && rb_is_red(uncle)) { + rb_set_black(uncle); + rb_set_black(parent); + rb_set_red(gparent); + node = gparent; + continue; + } + } + + if (parent->rb_right == node) { + register struct rb_node *tmp; + __rb_rotate_left(parent, root); + tmp = parent; + parent = node; + node = tmp; + } + + rb_set_black(parent); + rb_set_red(gparent); + __rb_rotate_right(gparent, root); + } else { + { + register struct rb_node *uncle = gparent->rb_left; + if (uncle && rb_is_red(uncle)) { + rb_set_black(uncle); + rb_set_black(parent); + rb_set_red(gparent); + node = gparent; + continue; + } + } + + if (parent->rb_left == node) { + register struct rb_node *tmp; + 
__rb_rotate_right(parent, root); + tmp = parent; + parent = node; + node = tmp; + } + + rb_set_black(parent); + rb_set_red(gparent); + __rb_rotate_left(gparent, root); + } + } + + rb_set_black(root->rb_node); +} + +static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, + struct rb_root *root) +{ + struct rb_node *other; + + while ((!node || rb_is_black(node)) && node != root->rb_node) { + if (parent->rb_left == node) { + other = parent->rb_right; + if (rb_is_red(other)) { + rb_set_black(other); + rb_set_red(parent); + __rb_rotate_left(parent, root); + other = parent->rb_right; + } + if ((!other->rb_left || rb_is_black(other->rb_left)) && + (!other->rb_right || rb_is_black(other->rb_right))) { + rb_set_red(other); + node = parent; + parent = rb_parent(node); + } else { + if (!other->rb_right || rb_is_black(other->rb_right)) { + rb_set_black(other->rb_left); + rb_set_red(other); + __rb_rotate_right(other, root); + other = parent->rb_right; + } + rb_set_color(other, rb_color(parent)); + rb_set_black(parent); + rb_set_black(other->rb_right); + __rb_rotate_left(parent, root); + node = root->rb_node; + break; + } + } else { + other = parent->rb_left; + if (rb_is_red(other)) { + rb_set_black(other); + rb_set_red(parent); + __rb_rotate_right(parent, root); + other = parent->rb_left; + } + if ((!other->rb_left || rb_is_black(other->rb_left)) && + (!other->rb_right || rb_is_black(other->rb_right))) { + rb_set_red(other); + node = parent; + parent = rb_parent(node); + } else { + if (!other->rb_left || rb_is_black(other->rb_left)) { + rb_set_black(other->rb_right); + rb_set_red(other); + __rb_rotate_left(other, root); + other = parent->rb_left; + } + rb_set_color(other, rb_color(parent)); + rb_set_black(parent); + rb_set_black(other->rb_left); + __rb_rotate_right(parent, root); + node = root->rb_node; + break; + } + } + } + + if (node) + rb_set_black(node); +} + +void rb_erase(struct rb_node *node, struct rb_root *root) +{ + struct rb_node *child, *parent; 
+ int color; + + if (!node->rb_left) + child = node->rb_right; + else if (!node->rb_right) + child = node->rb_left; + else { + struct rb_node *old = node, *left; + + node = node->rb_right; + while ((left = node->rb_left)) + node = left; + + if (rb_parent(old)) { + if (rb_parent(old)->rb_left == old) + rb_parent(old)->rb_left = node; + else + rb_parent(old)->rb_right = node; + } else + root->rb_node = node; + + child = node->rb_right; + parent = rb_parent(node); + color = rb_color(node); + + if (parent == old) { + parent = node; + } else { + if (child) + rb_set_parent(child, parent); + parent->rb_left = child; + + node->rb_right = old->rb_right; + rb_set_parent(old->rb_right, node); + } + + node->rb_parent_color = old->rb_parent_color; + node->rb_left = old->rb_left; + rb_set_parent(old->rb_left, node); + + goto color; + } + + parent = rb_parent(node); + color = rb_color(node); + + if (child) + rb_set_parent(child, parent); + + if (parent) { + if (parent->rb_left == node) + parent->rb_left = child; + else + parent->rb_right = child; + } else + root->rb_node = child; + +color: + if (color == RB_BLACK) + __rb_erase_color(child, parent, root); +} + +/* + * This function returns the first node (in sort order) of the tree. + */ +struct rb_node *rb_first(const struct rb_root *root) +{ + struct rb_node *n; + + n = root->rb_node; + if (!n) + return NULL; + + while (n->rb_left) + n = n->rb_left; + + return n; +} + +struct rb_node *rb_last(const struct rb_root *root) +{ + struct rb_node *n; + + n = root->rb_node; + if (!n) + return NULL; + + while (n->rb_right) + n = n->rb_right; + + return n; +} + +struct rb_node *rb_next(const struct rb_node *node) +{ + struct rb_node *parent; + + if (rb_parent(node) == node) + return NULL; + + /* + * If we have a right-hand child, go down and + * then left as far as we can. 
+ */ + if (node->rb_right) { + node = node->rb_right; + while (node->rb_left) + node=node->rb_left; + return (struct rb_node *)node; + } + + /* + * No right-hand children. Everything down and left is + * smaller than us, so any 'next' node must be in the general + * direction of our parent. Go up the tree; any time the + * ancestor is a right-hand child of its parent, keep going + * up. First time it's a left-hand child of its parent, said + * parent is our 'next' node. + */ + while ((parent = rb_parent(node)) && node == parent->rb_right) + node = parent; + + return parent; +} + +struct rb_node *rb_prev(const struct rb_node *node) +{ + struct rb_node *parent; + + if (rb_parent(node) == node) + return NULL; + + /* + * If we have a left-hand child, go down and + * then right as far as we can. + */ + if (node->rb_left) { + node = node->rb_left; + while (node->rb_right) + node = node->rb_right; + return (struct rb_node *)node; + } + + /* + * No left-hand children. Go up till we find + * an ancestor which is a right-hand child of its parent. 
+ */ + while ((parent = rb_parent(node)) && node == parent->rb_left) + node = parent; + + return parent; +} + +void rb_replace_node(struct rb_node *victim, + struct rb_node *new, + struct rb_root *root) +{ + struct rb_node *parent = rb_parent(victim); + + /* Set the surrounding nodes to point to the replacement */ + if (parent) { + if (victim == parent->rb_left) + parent->rb_left = new; + else + parent->rb_right = new; + } else + root->rb_node = new; + + if (victim->rb_left) + rb_set_parent(victim->rb_left, new); + + if (victim->rb_right) + rb_set_parent(victim->rb_right, new); + + /* Copy the pointers/colour from the victim to the replacement */ + *new = *victim; +} diff --git a/CRIU_code/criu/rst-malloc.c b/CRIU_code/criu/rst-malloc.c new file mode 100644 index 0000000..ff96797 --- /dev/null +++ b/CRIU_code/criu/rst-malloc.c @@ -0,0 +1,259 @@ +#include +#include +#include + +#include "page.h" +#include "rst-malloc.h" +#include "log.h" +#include "common/bug.h" + +struct rst_mem_type_s { + bool remapable; + bool enabled; + unsigned long free_bytes; + void *free_mem; + int (*grow)(struct rst_mem_type_s *, unsigned long size); + unsigned long last; + + void *buf; + unsigned long size; +}; + +static inline unsigned long rst_mem_grow(unsigned long need_size) +{ + int rst_mem_batch = 2 * page_size(); + + need_size = round_up(need_size, page_size()); + if (likely(need_size < rst_mem_batch)) + need_size = rst_mem_batch; + else + pr_debug("Growing rst memory %lu pages\n", need_size / page_size()); + return need_size; +} + +static int grow_shared(struct rst_mem_type_s *t, unsigned long size) +{ + void *aux; + + size = rst_mem_grow(size); + + /* + * This buffer will not get remapped into + * restorer, thus we can just forget the + * previous chunk location and allocate a + * new one + */ + aux = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, 0, 0); + if (aux == MAP_FAILED) + return -1; + + t->free_mem = aux; + t->free_bytes = size; + t->last = 0; + + 
return 0; +} + +static int grow_remap(struct rst_mem_type_s *t, int flag, unsigned long size) +{ + void *aux; + + size = rst_mem_grow(size); + + if (!t->buf) + /* + * Can't call mremap with NULL address :( + */ + aux = mmap(NULL, size, PROT_READ | PROT_WRITE, + flag | MAP_ANONYMOUS, 0, 0); + else { + if (flag & MAP_SHARED) { + /* + * Anon shared memory cannot grow with + * mremap, anon-shmem file size doesn't + * change and memory access generates + * SIGBUS. We should truncate the guy, + * but for now we don't need it. + */ + pr_err("Can't grow RM_SHREMAP memory\n"); + return -1; + } + /* + * We'll have to remap all objects into restorer + * address space and get their new addresses. Since + * we allocate many objects as one linear array, it's + * simpler just to grow the buffer and let callers + * find out new array addresses, rather than allocate + * a completely new one and force callers use objects' + * cpos-s. + */ + aux = mremap(t->buf, t->size, + t->size + size, MREMAP_MAYMOVE); + } + if (aux == MAP_FAILED) + return -1; + + t->free_mem += (aux - t->buf); + t->free_bytes += size; + t->size += size; + t->buf = aux; + + return 0; +} + +static int grow_shremap(struct rst_mem_type_s *t, unsigned long size) +{ + return grow_remap(t, MAP_SHARED, size); +} + +static int grow_private(struct rst_mem_type_s *t, unsigned long size) +{ + return grow_remap(t, MAP_PRIVATE, size); +} + +static struct rst_mem_type_s rst_mems[RST_MEM_TYPES] = { + [RM_SHARED] = { + .grow = grow_shared, + .remapable = false, + .enabled = true, + }, + [RM_SHREMAP] = { + .grow = grow_shremap, + .remapable = true, + .enabled = true, + }, + [RM_PRIVATE] = { + .grow = grow_private, + .remapable = true, + .enabled = false, + }, +}; + +void rst_mem_switch_to_private(void) +{ + rst_mems[RM_SHARED].enabled = false; + rst_mems[RM_SHREMAP].enabled = false; + rst_mems[RM_PRIVATE].enabled = true; +} + +void rst_mem_align(int type) +{ + struct rst_mem_type_s *t = &rst_mems[type]; + void *ptr; + + ptr = 
(void *) round_up((unsigned long)t->free_mem, sizeof(void *)); + t->free_bytes -= (ptr - t->free_mem); + t->free_mem = ptr; +} + +unsigned long rst_mem_align_cpos(int type) +{ + struct rst_mem_type_s *t = &rst_mems[type]; + BUG_ON(!t->remapable || !t->enabled); + + rst_mem_align(type); + + return t->free_mem - t->buf; +} + +void *rst_mem_remap_ptr(unsigned long pos, int type) +{ + struct rst_mem_type_s *t = &rst_mems[type]; + BUG_ON(!t->remapable); + return t->buf + pos; +} + +void *rst_mem_alloc(unsigned long size, int type) +{ + struct rst_mem_type_s *t = &rst_mems[type]; + void *ret; + + BUG_ON(!t->enabled); + + if ((t->free_bytes < size) && t->grow(t, size)) { + pr_perror("Can't grow rst mem"); + return NULL; + } + + ret = t->free_mem; + t->free_mem += size; + t->free_bytes -= size; + t->last = size; + + return ret; +} + +void rst_mem_free_last(int type) +{ + struct rst_mem_type_s *t = &rst_mems[type]; + + BUG_ON(!t->enabled); + + t->free_mem -= t->last; + t->free_bytes += t->last; + t->last = 0; /* next free_last would be no-op */ +} + +unsigned long rst_mem_lock(void) +{ + /* + * Don't allow further allocations from rst_mem since we're + * going to get the bootstrap area and remap all the stuff + * into it. The SHREMAP and SHARED should be already locked + * in the rst_mem_switch_to_private(). + */ + rst_mems[RM_PRIVATE].enabled = false; + return rst_mems[RM_PRIVATE].size + rst_mems[RM_SHREMAP].size; +} + +static int rst_mem_remap_one(struct rst_mem_type_s *t, void *to) +{ + void *aux; + + BUG_ON(!t->remapable || t->enabled); + + if (!t->buf) + /* + * No allocations happened from this buffer. + * It's safe just to do nothing. 
+ */ + return 0; + + pr_debug("\tcall mremap(%p, %lu, %lu, MAYMOVE | FIXED, %p)\n", + t->buf, t->size, t->size, to); + aux = mremap(t->buf, t->size, t->size, MREMAP_MAYMOVE | MREMAP_FIXED, to); + if (aux == MAP_FAILED) { + pr_perror("Can't mremap rst mem"); + return -1; + } + + t->buf = aux; + return 0; +} + +int rst_mem_remap(void *to) +{ + int ret; + + ret = rst_mem_remap_one(&rst_mems[RM_PRIVATE], to); + if (!ret) { + to += rst_mems[RM_PRIVATE].size; + ret = rst_mem_remap_one(&rst_mems[RM_SHREMAP], to); + } + + return ret; +} + +void *shmalloc(size_t bytes) +{ + rst_mem_align(RM_SHARED); + return rst_mem_alloc(bytes, RM_SHARED); +} + +/* Only last chunk can be released */ +void shfree_last(void *ptr) +{ + rst_mem_free_last(RM_SHARED); +} + diff --git a/CRIU_code/criu/seccomp.c b/CRIU_code/criu/seccomp.c new file mode 100644 index 0000000..94e663d --- /dev/null +++ b/CRIU_code/criu/seccomp.c @@ -0,0 +1,509 @@ +#include +#include +#include +#include +#include + +#include "common/config.h" +#include "imgset.h" +#include "kcmp.h" +#include "pstree.h" +#include +#include "proc_parse.h" +#include "restorer.h" +#include "seccomp.h" +#include "servicefd.h" +#include "util.h" +#include "rst-malloc.h" + +#include "protobuf.h" +#include "images/seccomp.pb-c.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "seccomp: " + +static struct rb_root seccomp_tid_rb_root = RB_ROOT; +static struct seccomp_entry *seccomp_tid_entry_root; + +static SeccompEntry *seccomp_img_entry; + +struct seccomp_entry *seccomp_lookup(pid_t tid_real, bool create, bool mandatory) +{ + struct seccomp_entry *entry = NULL; + + struct rb_node *node = seccomp_tid_rb_root.rb_node; + struct rb_node **new = &seccomp_tid_rb_root.rb_node; + struct rb_node *parent = NULL; + + while (node) { + struct seccomp_entry *this = rb_entry(node, struct seccomp_entry, node); + + parent = *new; + if (tid_real < this->tid_real) + node = node->rb_left, new = &((*new)->rb_left); + else if (tid_real > this->tid_real) + node = 
node->rb_right, new = &((*new)->rb_right); + else + return this; + } + + if (create) { + entry = xzalloc(sizeof(*entry)); + if (!entry) + return NULL; + rb_init_node(&entry->node); + entry->tid_real = tid_real; + + entry->next = seccomp_tid_entry_root, seccomp_tid_entry_root = entry; + rb_link_and_balance(&seccomp_tid_rb_root, &entry->node, parent, new); + } else { + if (mandatory) + pr_err("Can't find entry on tid_real %d\n", tid_real); + } + + return entry; +} + +int seccomp_collect_entry(pid_t tid_real, unsigned int mode) +{ + struct seccomp_entry *entry; + + entry = seccomp_lookup(tid_real, true, false); + if (!entry) { + pr_err("Can't create entry on tid_real %d\n", tid_real); + return -1; + } + entry->mode = mode; + + pr_debug("Collected tid_real %d mode %#x\n", tid_real, mode); + return 0; +} + +static void seccomp_free_chain(struct seccomp_entry *entry) +{ + struct seccomp_filter_chain *chain, *prev; + + for (chain = entry->chain; chain; chain = prev) { + prev = chain->prev; + + xfree(chain->filter.filter.data); + xfree(chain); + } + + entry->nr_chains = 0; + entry->chain = NULL; +} + +void seccomp_free_entries(void) +{ + struct seccomp_entry *entry, *next; + + for (entry = seccomp_tid_entry_root; entry; entry = next) { + next = entry->next; + seccomp_free_chain(entry); + xfree(entry); + } + + seccomp_tid_rb_root = RB_ROOT; + seccomp_tid_entry_root = NULL; +} + +int seccomp_dump_thread(pid_t tid_real, ThreadCoreEntry *thread_core) +{ + struct seccomp_entry *entry = seccomp_find_entry(tid_real); + if (!entry) { + pr_err("Can't dump thread core on tid_real %d\n", tid_real); + return -1; + } + + if (entry->mode != SECCOMP_MODE_DISABLED) { + thread_core->has_seccomp_mode = true; + thread_core->seccomp_mode = entry->mode; + + if (entry->mode == SECCOMP_MODE_FILTER) { + thread_core->has_seccomp_filter = true; + thread_core->seccomp_filter = entry->img_filter_pos; + } + } + + return 0; +} + +static int collect_filter(struct seccomp_entry *entry) +{ + 
seccomp_metadata_t meta_buf, *meta = &meta_buf; + struct seccomp_filter_chain *chain, *prev; + struct sock_filter buf[BPF_MAXINSNS]; + size_t i; + int len; + + if (entry->mode != SECCOMP_MODE_FILTER) + return 0; + + for (i = 0; true; i++) { + len = ptrace(PTRACE_SECCOMP_GET_FILTER, entry->tid_real, i, buf); + if (len < 0) { + if (errno == ENOENT) { + break; + } else { + pr_perror("Can't fetch filter on tid_real %d i %zu", + entry->tid_real, i); + return -1; + } + } + + if (meta) { + meta->filter_off = i; + + if (ptrace(PTRACE_SECCOMP_GET_METADATA, entry->tid_real, sizeof(*meta), meta) < 0) { + if (errno == EIO) { + /* Old kernel, no METADATA support */ + meta = NULL; + } else { + pr_perror("Can't fetch seccomp metadata on tid_real %d pos %zu", + entry->tid_real, i); + return -1; + } + } + } + + chain = xzalloc(sizeof(*chain)); + if (!chain) + return -1; + + seccomp_filter__init(&chain->filter); + + chain->filter.has_flags = true; + chain->filter.flags = 0; + + chain->filter.filter.len = len * sizeof(struct sock_filter); + chain->filter.filter.data = xmalloc(chain->filter.filter.len); + if (!chain->filter.filter.data) { + xfree(chain); + return -1; + } + + memcpy(chain->filter.filter.data, buf, chain->filter.filter.len); + + if (meta) + chain->filter.flags |= meta->flags; + + prev = entry->chain, entry->chain = chain, chain->prev = prev; + entry->nr_chains++; + } + + return 0; +} + +/* + * When filter is being set up with SECCOMP_FILTER_FLAG_TSYNC then all + * threads share same filters chain. Still without kernel support we + * don't know if the chains are indeed were propagated by the flag above + * or application installed identical chains manually. + * + * Thus we do a trick: if all threads are sharing chains we just drop + * all ones except on a leader and assign SECCOMP_FILTER_FLAG_TSYNC there. 
+ * The rationale is simple: if application is using tsync it always can + * assign new not-tsync filters after, but in reverse if we don't provide + * tsync on restore the further calls with tsync will fail later. + * + * Proper fix needs some support from kernel side (presumably kcmp mode). + */ +static void try_use_tsync(struct seccomp_entry *leader, struct pstree_item *item) +{ + struct seccomp_filter_chain *chain_a, *chain_b; + struct seccomp_entry *entry; + size_t i, j; + + if (leader->mode != SECCOMP_MODE_FILTER) + return; + + for (i = 0; i < item->nr_threads; i++) { + entry = seccomp_find_entry(item->threads[i].real); + BUG_ON(!entry); + + if (entry == leader) + continue; + + if (entry->mode != leader->mode || + entry->nr_chains != leader->nr_chains) + return; + + chain_a = leader->chain; + chain_b = entry->chain; + + for (j = 0; j < leader->nr_chains; j++) { + BUG_ON((!chain_a || !chain_b)); + + if (chain_a->filter.filter.len != + chain_b->filter.filter.len) + return; + + if (memcmp(chain_a->filter.filter.data, + chain_b->filter.filter.data, + chain_a->filter.filter.len)) + return; + + chain_a = chain_a->prev; + chain_b = chain_b->prev; + } + } + + /* OK, so threads can be restored with tsync */ + pr_debug("Use SECCOMP_FILTER_FLAG_TSYNC for tid_real %d\n", + leader->tid_real); + + for (chain_a = leader->chain; chain_a; chain_a = chain_a->prev) + chain_a->filter.flags |= SECCOMP_FILTER_FLAG_TSYNC; + + for (i = 0; i < item->nr_threads; i++) { + entry = seccomp_find_entry(item->threads[i].real); + BUG_ON(!entry); + + if (entry == leader) + continue; + + pr_debug("\t Disable filter on tid_rea %d, will be propagated\n", + entry->tid_real); + + entry->mode = SECCOMP_MODE_DISABLED; + seccomp_free_chain(entry); + } +} + +static int collect_filters(struct pstree_item *item) +{ + struct seccomp_entry *leader, *entry; + size_t i; + + if (item->pid->state == TASK_DEAD) + return 0; + + leader = seccomp_find_entry(item->pid->real); + if (!leader) { + pr_err("Can't 
collect filter on leader tid_real %d\n", + item->pid->real); + return -1; + } + + for (i = 0; i < item->nr_threads; i++) { + entry = seccomp_find_entry(item->threads[i].real); + if (!entry) { + pr_err("Can't collect filter on tid_real %d\n", + item->pid->real); + return -1; + } + + if (collect_filter(entry)) + return -1; + } + + try_use_tsync(leader, item); + return 0; +} + +static int dump_seccomp_filters(void) +{ + SeccompEntry se = SECCOMP_ENTRY__INIT; + struct seccomp_filter_chain *chain; + struct seccomp_entry *entry; + size_t img_filter_pos = 0, nr_chains = 0; + struct rb_node *node; + int ret; + + for (node = rb_first(&seccomp_tid_rb_root); node; node = rb_next(node)) { + entry = rb_entry(node, struct seccomp_entry, node); + nr_chains += entry->nr_chains; + } + + se.n_seccomp_filters = nr_chains; + if (nr_chains) { + se.seccomp_filters = xmalloc(sizeof(*se.seccomp_filters) * nr_chains); + if (!se.seccomp_filters) + return -1; + } + + for (node = rb_first(&seccomp_tid_rb_root); node; node = rb_next(node)) { + entry = rb_entry(node, struct seccomp_entry, node); + + if (!entry->nr_chains) + continue; + + for (chain = entry->chain; chain; chain = chain->prev) { + if (img_filter_pos >= nr_chains) { + pr_err("Unexpected position %zu > %zu\n", + img_filter_pos, nr_chains); + xfree(se.seccomp_filters); + return -1; + } + + se.seccomp_filters[img_filter_pos] = &chain->filter; + if (chain != entry->chain) { + chain->filter.has_prev = true; + chain->filter.prev = img_filter_pos - 1; + } + img_filter_pos++; + } + + entry->img_filter_pos = img_filter_pos - 1; + } + + ret = pb_write_one(img_from_set(glob_imgset, CR_FD_SECCOMP), &se, PB_SECCOMP); + + xfree(se.seccomp_filters); + + for (node = rb_first(&seccomp_tid_rb_root); node; node = rb_next(node)) { + entry = rb_entry(node, struct seccomp_entry, node); + seccomp_free_chain(entry); + } + + return ret; +} + +int seccomp_collect_dump_filters(void) +{ + if (preorder_pstree_traversal(root_item, collect_filters) < 0) + 
return -1; + + if (dump_seccomp_filters()) + return -1; + + return 0; +} + +/* The seccomp_img_entry will be shared between all children */ +int seccomp_read_image(void) +{ + struct cr_img *img; + int ret; + + img = open_image(CR_FD_SECCOMP, O_RSTR); + if (!img) + return -1; + + ret = pb_read_one_eof(img, &seccomp_img_entry, PB_SECCOMP); + close_image(img); + if (ret <= 0) + return 0; /* there were no filters */ + + BUG_ON(!seccomp_img_entry); + + return 0; +} + +/* seccomp_img_entry will be freed per-children after forking */ +static void free_seccomp_filters(void) +{ + if (seccomp_img_entry) { + seccomp_entry__free_unpacked(seccomp_img_entry, NULL); + seccomp_img_entry = NULL; + } +} + +void seccomp_rst_reloc(struct thread_restore_args *args) +{ + size_t j, off; + + if (!args->seccomp_filters_n) + return; + + args->seccomp_filters = rst_mem_remap_ptr(args->seccomp_filters_pos, RM_PRIVATE); + args->seccomp_filters_data = (void *)args->seccomp_filters + + args->seccomp_filters_n * sizeof(struct thread_seccomp_filter); + + for (j = off = 0; j < args->seccomp_filters_n; j++) { + struct thread_seccomp_filter *f = &args->seccomp_filters[j]; + + f->sock_fprog.filter = args->seccomp_filters_data + off; + off += f->sock_fprog.len * sizeof(struct sock_filter); + } +} + +int seccomp_prepare_threads(struct pstree_item *item, struct task_restore_args *ta) +{ + struct thread_restore_args *args_array = (struct thread_restore_args *)(&ta[1]); + size_t i, j, nr_filters, filters_size, rst_size, off; + + for (i = 0; i < item->nr_threads; i++) { + ThreadCoreEntry *thread_core = item->core[i]->thread_core; + struct thread_restore_args *args = &args_array[i]; + SeccompFilter *sf; + + args->seccomp_mode = SECCOMP_MODE_DISABLED; + args->seccomp_filters_pos = 0; + args->seccomp_filters_n = 0; + args->seccomp_filters = NULL; + args->seccomp_filters_data = NULL; + + if (thread_core->has_seccomp_mode) + args->seccomp_mode = thread_core->seccomp_mode; + + if (args->seccomp_mode != 
SECCOMP_MODE_FILTER) + continue; + + if (thread_core->seccomp_filter >= seccomp_img_entry->n_seccomp_filters) { + pr_err("Corrupted filter index on tid %d (%u > %zu)\n", + item->threads[i].ns[0].virt, thread_core->seccomp_filter, + seccomp_img_entry->n_seccomp_filters); + return -1; + } + + sf = seccomp_img_entry->seccomp_filters[thread_core->seccomp_filter]; + if (sf->filter.len % (sizeof(struct sock_filter))) { + pr_err("Corrupted filter len on tid %d (index %u)\n", + item->threads[i].ns[0].virt, + thread_core->seccomp_filter); + return -1; + } + filters_size = sf->filter.len; + nr_filters = 1; + + while (sf->has_prev) { + if (sf->prev >= seccomp_img_entry->n_seccomp_filters) { + pr_err("Corrupted filter index on tid %d (%u > %zu)\n", + item->threads[i].ns[0].virt, sf->prev, + seccomp_img_entry->n_seccomp_filters); + return -1; + } + + sf = seccomp_img_entry->seccomp_filters[sf->prev]; + if (sf->filter.len % (sizeof(struct sock_filter))) { + pr_err("Corrupted filter len on tid %d (index %u)\n", + item->threads[i].ns[0].virt, sf->prev); + return -1; + } + filters_size += sf->filter.len; + nr_filters++; + } + + args->seccomp_filters_n = nr_filters; + + rst_size = filters_size + nr_filters * sizeof(struct thread_seccomp_filter); + args->seccomp_filters_pos = rst_mem_align_cpos(RM_PRIVATE); + args->seccomp_filters = rst_mem_alloc(rst_size, RM_PRIVATE); + if (!args->seccomp_filters) { + pr_err("Can't allocate %zu bytes for filters on tid %d\n", + rst_size, item->threads[i].ns[0].virt); + return -ENOMEM; + } + args->seccomp_filters_data = (void *)args->seccomp_filters + + nr_filters * sizeof(struct thread_seccomp_filter); + + sf = seccomp_img_entry->seccomp_filters[thread_core->seccomp_filter]; + for (j = off = 0; j < nr_filters; j++) { + struct thread_seccomp_filter *f = &args->seccomp_filters[j]; + + f->sock_fprog.len = sf->filter.len / sizeof(struct sock_filter); + f->sock_fprog.filter = args->seccomp_filters_data + off; + f->flags = sf->flags; + + 
memcpy(f->sock_fprog.filter, sf->filter.data, sf->filter.len); + + off += sf->filter.len; + sf = seccomp_img_entry->seccomp_filters[sf->prev]; + } + } + + free_seccomp_filters(); + return 0; +} diff --git a/CRIU_code/criu/seize.c b/CRIU_code/criu/seize.c new file mode 100644 index 0000000..b958d4b --- /dev/null +++ b/CRIU_code/criu/seize.c @@ -0,0 +1,841 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "int.h" +#include "common/compiler.h" +#include "cr_options.h" +#include "cr-errno.h" +#include "pstree.h" +#include "criu-log.h" +#include +#include "proc_parse.h" +#include "seccomp.h" +#include "seize.h" +#include "stats.h" +#include "xmalloc.h" +#include "util.h" +#include + +#define NR_ATTEMPTS 5 + +static const char frozen[] = "FROZEN"; +static const char freezing[] = "FREEZING"; +static const char thawed[] = "THAWED"; + +static const char *get_freezer_state(int fd) +{ + char state[32]; + int ret; + + BUILD_BUG_ON((sizeof(state) < sizeof(frozen)) || + (sizeof(state) < sizeof(freezing)) || + (sizeof(state) < sizeof(thawed))); + + lseek(fd, 0, SEEK_SET); + ret = read(fd, state, sizeof(state) - 1); + if (ret <= 0) { + pr_perror("Unable to get a current state"); + goto err; + } + if (state[ret - 1] == '\n') + state[ret - 1] = 0; + else + state[ret] = 0; + + pr_debug("freezer.state=%s\n", state); + if (strcmp(state, frozen) == 0) + return frozen; + else if (strcmp(state, freezing) == 0) + return freezing; + else if (strcmp(state, thawed) == 0) + return thawed; + + pr_err("Unknown freezer state: %s\n", state); +err: + return NULL; +} + +static bool freezer_thawed; + +const char *get_real_freezer_state(void) +{ + return freezer_thawed ? 
thawed : frozen; +} + +static int freezer_restore_state(void) +{ + int fd; + char path[PATH_MAX]; + + if (!opts.freeze_cgroup || freezer_thawed) + return 0; + + snprintf(path, sizeof(path), "%s/freezer.state", opts.freeze_cgroup); + fd = open(path, O_RDWR); + if (fd < 0) { + pr_perror("Unable to open %s", path); + return -1; + } + + if (write(fd, frozen, sizeof(frozen)) != sizeof(frozen)) { + pr_perror("Unable to freeze tasks"); + close(fd); + return -1; + } + close(fd); + return 0; +} + +/* A number of tasks in a freezer cgroup which are not going to be dumped */ +static int processes_to_wait; +static pid_t *processes_to_wait_pids; + +static int seize_cgroup_tree(char *root_path, const char *state) +{ + DIR *dir; + struct dirent *de; + char path[PATH_MAX]; + FILE *f; + + /* + * New tasks can appear while a freezer state isn't + * frozen, so we need to catch all new tasks. + */ + snprintf(path, sizeof(path), "%s/tasks", root_path); + f = fopen(path, "r"); + if (f == NULL) { + pr_perror("Unable to open %s", path); + return -1; + } + while (fgets(path, sizeof(path), f)) { + pid_t pid; + int ret; + + pid = atoi(path); + + /* Here we are going to skip tasks which are already traced. */ + ret = ptrace(PTRACE_INTERRUPT, pid, NULL, NULL); + if (ret == 0) + continue; + if (errno != ESRCH) { + pr_perror("Unexpected error"); + fclose(f); + return -1; + } + + if (!compel_interrupt_task(pid)) { + pr_debug("SEIZE %d: success\n", pid); + processes_to_wait++; + } else if (state == frozen) { + char buf[] = "/proc/XXXXXXXXXX/exe"; + struct stat st; + + /* skip kernel threads */ + snprintf(buf, sizeof(buf), "/proc/%d/exe", pid); + if (stat(buf, &st) == -1 && errno == ENOENT) + continue; + /* + * fails when meets a zombie, or exiting process: + * there is a small race in a kernel -- the process + * may start exiting and we are trying to freeze it + * before it compete exit procedure. The caller simply + * should wait a bit and try freezing again. 
+ */ + pr_err("zombie found while seizing\n"); + fclose(f); + return -EAGAIN; + } + } + fclose(f); + + dir = opendir(root_path); + if (!dir) { + pr_perror("Unable to open %s", root_path); + return -1; + } + + while ((de = readdir(dir))) { + struct stat st; + int ret; + + if (dir_dots(de)) + continue; + + sprintf(path, "%s/%s", root_path, de->d_name); + + if (fstatat(dirfd(dir), de->d_name, &st, 0) < 0) { + pr_perror("stat of %s failed", path); + closedir(dir); + return -1; + } + + if (!S_ISDIR(st.st_mode)) + continue; + ret = seize_cgroup_tree(path, state); + if (ret < 0) { + closedir(dir); + return ret; + } + } + closedir(dir); + + return 0; +} + +/* + * A freezer cgroup can contain tasks which will not be dumped + * and we need to wait them, because the are interrupted them by ptrace. + */ +static int freezer_wait_processes() +{ + int i; + + processes_to_wait_pids = xmalloc(sizeof(pid_t) * processes_to_wait); + if (processes_to_wait_pids == NULL) + return -1; + + for (i = 0; i < processes_to_wait; i++) { + int status; + pid_t pid; + + /* + * Here we are going to skip tasks which are already traced. + * Ptraced tasks looks like children for us, so if + * a task isn't ptraced yet, waitpid() will return a error. 
+ */ + pid = waitpid(-1, &status, 0); + if (pid < 0) { + pr_perror("Unable to wait processes"); + xfree(processes_to_wait_pids); + processes_to_wait_pids = NULL; + return -1; + } + pr_warn("Unexpected process %d in the freezer cgroup (status 0x%x)\n", pid, status); + + processes_to_wait_pids[i] = pid; + } + + return 0; +} + +static int freezer_detach(void) +{ + int i; + + if (!opts.freeze_cgroup) + return 0; + + for (i = 0; i < processes_to_wait && processes_to_wait_pids; i++) { + pid_t pid = processes_to_wait_pids[i]; + int status, save_errno; + + if (ptrace(PTRACE_DETACH, pid, NULL, NULL) == 0) + continue; + + save_errno = errno; + + /* A process may be killed by SIGKILL */ + if (wait4(pid, &status, __WALL, NULL) == pid) { + pr_warn("The %d process returned 0x %x\n", pid, status); + continue; + } + errno = save_errno; + pr_perror("Unable to detach from %d", pid); + } + + return 0; +} + +static int log_unfrozen_stacks(char *root) +{ + DIR *dir; + struct dirent *de; + char path[PATH_MAX]; + FILE *f; + + snprintf(path, sizeof(path), "%s/tasks", root); + f = fopen(path, "r"); + if (f == NULL) { + pr_perror("Unable to open %s", path); + return -1; + } + while (fgets(path, sizeof(path), f)) { + pid_t pid; + int ret, stack; + char stackbuf[2048]; + + pid = atoi(path); + + stack = open_proc(pid, "stack"); + if (stack < 0) { + pr_err("`- couldn't log %d's stack\n", pid); + fclose(f); + return -1; + } + + ret = read(stack, stackbuf, sizeof(stackbuf) - 1); + close(stack); + if (ret < 0) { + pr_perror("couldn't read %d's stack", pid); + fclose(f); + return -1; + } + stackbuf[ret] = '\0'; + + pr_debug("Task %d has stack:\n%s", pid, stackbuf); + + } + fclose(f); + + dir = opendir(root); + if (!dir) { + pr_perror("Unable to open %s", root); + return -1; + } + + while ((de = readdir(dir))) { + struct stat st; + + if (dir_dots(de)) + continue; + + sprintf(path, "%s/%s", root, de->d_name); + + if (fstatat(dirfd(dir), de->d_name, &st, 0) < 0) { + pr_perror("stat of %s failed", 
path); + closedir(dir); + return -1; + } + + if (!S_ISDIR(st.st_mode)) + continue; + + if (log_unfrozen_stacks(path) < 0) { + closedir(dir); + return -1; + } + } + closedir(dir); + + return 0; +} + +static int freeze_processes(void) +{ + int fd, exit_code = -1; + char path[PATH_MAX]; + const char *state = thawed; + + static const unsigned long step_ms = 100; + unsigned long nr_attempts = (opts.timeout * 1000000) / step_ms; + unsigned long i = 0; + + const struct timespec req = { + .tv_nsec = step_ms * 1000000, + .tv_sec = 0, + }; + + if (unlikely(!nr_attempts)) { + /* + * If timeout is turned off, lets + * wait for at least 10 seconds. + */ + nr_attempts = (10 * 1000000) / step_ms; + } + + pr_debug("freezing processes: %lu attempts with %lu ms steps\n", + nr_attempts, step_ms); + + snprintf(path, sizeof(path), "%s/freezer.state", opts.freeze_cgroup); + fd = open(path, O_RDWR); + if (fd < 0) { + pr_perror("Unable to open %s", path); + return -1; + } + state = get_freezer_state(fd); + if (!state) { + close(fd); + return -1; + } + if (state == thawed) { + freezer_thawed = true; + + lseek(fd, 0, SEEK_SET); + if (write(fd, frozen, sizeof(frozen)) != sizeof(frozen)) { + pr_perror("Unable to freeze tasks"); + close(fd); + return -1; + } + + /* + * Wait the freezer to complete before + * processing tasks. They might be exiting + * before freezing complete so we should + * not read @tasks pids while freezer in + * transition stage. + */ + for (; i <= nr_attempts; i++) { + state = get_freezer_state(fd); + if (!state) { + close(fd); + return -1; + } + + if (state == frozen) + break; + if (alarm_timeouted()) + goto err; + nanosleep(&req, NULL); + } + + if (i > nr_attempts) { + pr_err("Unable to freeze cgroup %s\n", opts.freeze_cgroup); + if (!pr_quelled(LOG_DEBUG)) + log_unfrozen_stacks(opts.freeze_cgroup); + goto err; + } + + pr_debug("freezing processes: %lu attempts done\n", i); + } + + /* + * Pay attention on @i variable -- it's continuation. 
+ */ + for (; i <= nr_attempts; i++) { + exit_code = seize_cgroup_tree(opts.freeze_cgroup, state); + if (exit_code == -EAGAIN) { + if (alarm_timeouted()) + goto err; + nanosleep(&req, NULL); + } else + break; + } + +err: + if (exit_code == 0 || freezer_thawed) { + lseek(fd, 0, SEEK_SET); + if (write(fd, thawed, sizeof(thawed)) != sizeof(thawed)) { + pr_perror("Unable to thaw tasks"); + exit_code = -1; + } + } + if (close(fd)) { + pr_perror("Unable to thaw tasks"); + return -1; + } + + return exit_code; +} + +static inline bool child_collected(struct pstree_item *i, pid_t pid) +{ + struct pstree_item *c; + + list_for_each_entry(c, &i->children, sibling) + if (c->pid->real == pid) + return true; + + return false; +} + +static int collect_task(struct pstree_item *item); +static int collect_children(struct pstree_item *item) +{ + pid_t *ch; + int ret, i, nr_children, nr_inprogress; + + ret = parse_children(item->pid->real, &ch, &nr_children); + if (ret < 0) + return ret; + + nr_inprogress = 0; + for (i = 0; i < nr_children; i++) { + struct pstree_item *c; + struct proc_status_creds creds; + pid_t pid = ch[i]; + + /* Is it already frozen? */ + if (child_collected(item, pid)) + continue; + + nr_inprogress++; + + if (alarm_timeouted()) { + ret = -1; + goto free; + } + + pr_info("Seized task %d, state %d\n", pid, ret); + + c = alloc_pstree_item(); + if (c == NULL) { + ret = -1; + goto free; + } + + if (!opts.freeze_cgroup) + /* fails when meets a zombie */ + compel_interrupt_task(pid); + + ret = compel_wait_task(pid, item->pid->real, parse_pid_status, NULL, &creds.s, NULL); + if (ret < 0) { + /* + * Here is a race window between parse_children() and seize(), + * so the task could die for these time. + * Don't worry, will try again on the next attempt. The number + * of attempts is restricted, so it will exit if something + * really wrong. 
+ */ + ret = 0; + xfree(c); + continue; + } + + if (ret == TASK_ZOMBIE) + ret = TASK_DEAD; + else + processes_to_wait--; + + c->pid->real = pid; + c->parent = item; + c->pid->state = ret; + list_add_tail(&c->sibling, &item->children); + + ret = seccomp_collect_entry(pid, creds.s.seccomp_mode); + if (ret < 0) + goto free; + + /* Here is a recursive call (Depth-first search) */ + ret = collect_task(c); + if (ret < 0) + goto free; + } +free: + xfree(ch); + return ret < 0 ? ret : nr_inprogress; +} + +static void unseize_task_and_threads(const struct pstree_item *item, int st) +{ + int i; + + if (item->pid->state == TASK_DEAD) + return; + + /* + * The st is the state we want to switch tasks into, + * the item->state is the state task was in when we seized one. + */ + + compel_resume_task(item->pid->real, item->pid->state, st); + + if (st == TASK_DEAD) + return; + + for (i = 1; i < item->nr_threads; i++) + if (ptrace(PTRACE_DETACH, item->threads[i].real, NULL, NULL)) + pr_perror("Unable to detach from %d", item->threads[i].real); +} + +static void pstree_wait(struct pstree_item *root_item) +{ + struct pstree_item *item = root_item; + int pid, status, i; + + for_each_pstree_item(item) { + + if (item->pid->state == TASK_DEAD) + continue; + + for (i = 0; i < item->nr_threads; i++) { + pid = wait4(-1, &status, __WALL, NULL); + if (pid < 0) { + pr_perror("wait4 failed"); + break; + } else { + if (!WIFSIGNALED(status) || WTERMSIG(status) != SIGKILL) { + pr_err("Unexpected exit code %d of %d: %s\n", + status, pid, strsignal(status)); + BUG(); + } + } + } + } + + pid = wait4(-1, &status, __WALL, NULL); + if (pid > 0) { + pr_err("Unexpected child %d\n", pid); + BUG(); + } +} + +void pstree_switch_state(struct pstree_item *root_item, int st) +{ + struct pstree_item *item = root_item; + + if (!root_item) + return; + + if (st != TASK_DEAD) + freezer_restore_state(); + + /* + * We need to detach from all processes before waiting the init + * process, because one of these processes 
may collect processes from a + * target pid namespace. The pid namespace is destroyed only when all + * processes have been killed and collected. + */ + freezer_detach(); + + pr_info("Unfreezing tasks into %d\n", st); + for_each_pstree_item(item) + unseize_task_and_threads(item, st); + + if (st == TASK_DEAD) + pstree_wait(root_item); +} + +static pid_t item_ppid(const struct pstree_item *item) +{ + item = item->parent; + return item ? item->pid->real : -1; +} + +static inline bool thread_collected(struct pstree_item *i, pid_t tid) +{ + int t; + + if (i->pid->real == tid) /* thread leader is collected as task */ + return true; + + for (t = 0; t < i->nr_threads; t++) + if (tid == i->threads[t].real) + return true; + + return false; +} + +static int collect_threads(struct pstree_item *item) +{ + struct seccomp_entry *task_seccomp_entry; + struct pid *threads = NULL; + int nr_threads = 0, i = 0, ret, nr_inprogress, nr_stopped = 0; + + task_seccomp_entry = seccomp_find_entry(item->pid->real); + if (!task_seccomp_entry) + goto err; + + ret = parse_threads(item->pid->real, &threads, &nr_threads); + if (ret < 0) + goto err; + + if ((item->pid->state == TASK_DEAD) && (nr_threads > 1)) { + pr_err("Zombies with threads are not supported\n"); + goto err; + } + + /* The number of threads can't be less than already frozen */ + item->threads = xrealloc(item->threads, nr_threads * sizeof(struct pid)); + if (item->threads == NULL) + return -1; + + if (item->nr_threads == 0) { + item->threads[0].real = item->pid->real; + item->nr_threads = 1; + item->threads[0].item = NULL; + } + + nr_inprogress = 0; + for (i = 0; i < nr_threads; i++) { + pid_t pid = threads[i].real; + struct proc_status_creds t_creds = {}; + + if (thread_collected(item, pid)) + continue; + + nr_inprogress++; + + pr_info("\tSeizing %d's %d thread\n", + item->pid->real, pid); + + if (!opts.freeze_cgroup && compel_interrupt_task(pid)) + continue; + + ret = compel_wait_task(pid, item_ppid(item), parse_pid_status, NULL, 
&t_creds.s, NULL); + if (ret < 0) { + /* + * Here is a race window between parse_threads() and seize(), + * so the task could die for these time. + * Don't worry, will try again on the next attempt. The number + * of attempts is restricted, so it will exit if something + * really wrong. + */ + continue; + } + + if (ret == TASK_ZOMBIE) + ret = TASK_DEAD; + else + processes_to_wait--; + + BUG_ON(item->nr_threads + 1 > nr_threads); + item->threads[item->nr_threads].real = pid; + item->threads[item->nr_threads].item = NULL; + item->nr_threads++; + + if (ret == TASK_DEAD) { + pr_err("Zombie thread not supported\n"); + goto err; + } + + if (seccomp_collect_entry(pid, t_creds.s.seccomp_mode)) + goto err; + + if (ret == TASK_STOPPED) { + nr_stopped++; + } + } + + if (nr_stopped && nr_stopped != nr_inprogress) { + pr_err("Individually stopped threads not supported\n"); + goto err; + } + + xfree(threads); + return nr_inprogress; + +err: + xfree(threads); + return -1; +} + +static int collect_loop(struct pstree_item *item, + int (*collect)(struct pstree_item *)) +{ + int attempts = NR_ATTEMPTS, nr_inprogress = 1; + + if (opts.freeze_cgroup) + attempts = 1; + + /* + * While we scan the proc and seize the children/threads + * new ones can appear (with clone(CLONE_PARENT) or with + * pthread_create). Thus, after one go, we need to repeat + * the scan-and-freeze again collecting new arrivals. As + * new guys may appear again we do NR_ATTEMPTS passes and + * fail to seize the item if new tasks/threads still + * appear. + */ + + while (nr_inprogress > 0 && attempts >= 0) { + attempts--; + nr_inprogress = collect(item); + } + + pr_info("Collected (%d attempts, %d in_progress)\n", attempts, nr_inprogress); + + /* + * We may fail to collect items or run out of attempts. + * In the former case nr_inprogress will be negative, in + * the latter -- positive. Thus it's enough just to check + * for "no more new stuff" and say "we're OK" if so. + */ + + return (nr_inprogress == 0) ? 
0 : -1; +} + +static int collect_task(struct pstree_item *item) +{ + int ret; + + ret = collect_loop(item, collect_threads); + if (ret < 0) + goto err_close; + + /* Depth-first search (DFS) is used for traversing a process tree. */ + ret = collect_loop(item, collect_children); + if (ret < 0) + goto err_close; + + if ((item->pid->state == TASK_DEAD) && !list_empty(&item->children)) { + pr_err("Zombie with children?! O_o Run, run, run!\n"); + goto err_close; + } + + if (pstree_alloc_cores(item)) + goto err_close; + + pr_info("Collected %d in %d state\n", item->pid->real, item->pid->state); + return 0; + +err_close: + close_pid_proc(); + return -1; +} + +int collect_pstree(void) +{ + pid_t pid = root_item->pid->real; + int ret = -1; + struct proc_status_creds creds; + + timing_start(TIME_FREEZING); + + /* + * wait4() may hang for some reason. Enable timer and fire SIGALRM + * if timeout reached. SIGALRM handler will do the necessary + * cleanups and terminate current process. + */ + alarm(opts.timeout); + + if (opts.freeze_cgroup && freeze_processes()) + goto err; + + if (!opts.freeze_cgroup && compel_interrupt_task(pid)) { + set_cr_errno(ESRCH); + goto err; + } + + ret = compel_wait_task(pid, -1, parse_pid_status, NULL, &creds.s, NULL); + if (ret < 0) + goto err; + + if (ret == TASK_ZOMBIE) + ret = TASK_DEAD; + else + processes_to_wait--; + + pr_info("Seized task %d, state %d\n", pid, ret); + root_item->pid->state = ret; + + ret = seccomp_collect_entry(pid, creds.s.seccomp_mode); + if (ret < 0) + goto err; + + ret = collect_task(root_item); + if (ret < 0) + goto err; + + if (opts.freeze_cgroup && freezer_wait_processes()) { + ret = -1; + goto err; + } + + ret = 0; + timing_stop(TIME_FREEZING); + timing_start(TIME_FROZEN); + +err: + /* Freezing stage finished in time - disable timer. 
*/ + alarm(0); + return ret; +} + diff --git a/CRIU_code/criu/servicefd.c b/CRIU_code/criu/servicefd.c new file mode 100644 index 0000000..dc42389 --- /dev/null +++ b/CRIU_code/criu/servicefd.c @@ -0,0 +1,308 @@ +#include +#include +#include +#include + +#include +#include +#include + +#include "common/compiler.h" +#include "common/list.h" + +#include "util.h" +#include "bitops.h" +#include "pstree.h" +#include "files.h" +#include "rst_info.h" +#include "servicefd.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "sfd: " + +/* Max potentially possible fd to be open by criu process */ +int service_fd_rlim_cur; + +/* Base of current process service fds set */ +static int service_fd_base; + +/* Id of current process in shared fdt */ +static int service_fd_id = 0; + +static DECLARE_BITMAP(sfd_map, SERVICE_FD_MAX); +static int sfd_arr[SERVICE_FD_MAX]; +/* + * Variable for marking areas of code, where service fds modifications + * are prohibited. It's used to safe them from reusing their numbers + * by ordinary files. See install_service_fd() and close_service_fd(). 
+ */ +bool sfds_protected = false; + +const char *sfd_type_name(enum sfd_type type) +{ + static const char *names[] = { + [SERVICE_FD_MIN] = __stringify_1(SERVICE_FD_MIN), + [LOG_FD_OFF] = __stringify_1(LOG_FD_OFF), + [IMG_FD_OFF] = __stringify_1(IMG_FD_OFF), + [PROC_FD_OFF] = __stringify_1(PROC_FD_OFF), + [PROC_PID_FD_OFF] = __stringify_1(PROC_PID_FD_OFF), + [CR_PROC_FD_OFF] = __stringify_1(CR_PROC_FD_OFF), + [ROOT_FD_OFF] = __stringify_1(ROOT_FD_OFF), + [CGROUP_YARD] = __stringify_1(CGROUP_YARD), + [USERNSD_SK] = __stringify_1(USERNSD_SK), + [NS_FD_OFF] = __stringify_1(NS_FD_OFF), + [TRANSPORT_FD_OFF] = __stringify_1(TRANSPORT_FD_OFF), + [RPC_SK_OFF] = __stringify_1(RPC_SK_OFF), + [FDSTORE_SK_OFF] = __stringify_1(FDSTORE_SK_OFF), + [SERVICE_FD_MAX] = __stringify_1(SERVICE_FD_MAX), + }; + + if (type < ARRAY_SIZE(names)) + return names[type]; + + return "UNKNOWN"; +} + +int init_service_fd(void) +{ + struct rlimit64 rlimit; + + /* + * Service fd engine implies that file descriptors used won't be + * borrowed by the rest of the code and default 1024 limit is not + * enough for high loaded test/containers. Thus use kdat engine to + * fetch current system level limit for numbers of files allowed to + * open up and lift up own limits. + * + * Note we have to do it before the service fd get initialized and we + * don't exit with errors here because in worst scenario where clash of + * fd happen we simply exit with explicit error during real action + * stage. 
+ */ + rlimit_unlimit_nofile(); + + /* + * Service FDs are those that most likely won't + * conflict with any 'real-life' ones + */ + + if (syscall(__NR_prlimit64, getpid(), RLIMIT_NOFILE, NULL, &rlimit)) { + pr_perror("Can't get rlimit"); + return -1; + } + + service_fd_rlim_cur = (int)rlimit.rlim_cur; + return 0; +} + +static int __get_service_fd(enum sfd_type type, int service_fd_id) +{ + return service_fd_base - type - SERVICE_FD_MAX * service_fd_id; +} + +int get_service_fd(enum sfd_type type) +{ + BUG_ON((int)type <= SERVICE_FD_MIN || (int)type >= SERVICE_FD_MAX); + + if (!test_bit(type, sfd_map)) + return -1; + + if (service_fd_base == 0) + return sfd_arr[type]; + + return __get_service_fd(type, service_fd_id); +} + +bool is_any_service_fd(int fd) +{ + int sfd_min_fd = __get_service_fd(SERVICE_FD_MAX, service_fd_id); + int sfd_max_fd = __get_service_fd(SERVICE_FD_MIN, service_fd_id); + + if (fd > sfd_min_fd && fd < sfd_max_fd) { + int type = SERVICE_FD_MAX - (fd - sfd_min_fd); + if (type > SERVICE_FD_MIN && type < SERVICE_FD_MAX) + return !!test_bit(type, sfd_map); + } + + return false; +} + +bool is_service_fd(int fd, enum sfd_type type) +{ + return fd == get_service_fd(type); +} + +int service_fd_min_fd(struct pstree_item *item) +{ + struct fdt *fdt = rsti(item)->fdt; + int id = 0; + + if (fdt) + id = fdt->nr - 1; + return service_fd_rlim_cur - (SERVICE_FD_MAX - 1) - SERVICE_FD_MAX * id; +} + +static void sfds_protection_bug(enum sfd_type type) +{ + pr_err("Service fd %s is being modified in protected context\n", + sfd_type_name(type)); + print_stack_trace(current ? 
vpid(current) : 0); + BUG(); +} + +int install_service_fd(enum sfd_type type, int fd) +{ + int sfd = __get_service_fd(type, service_fd_id); + int tmp; + + BUG_ON((int)type <= SERVICE_FD_MIN || (int)type >= SERVICE_FD_MAX); + if (sfds_protected && !test_bit(type, sfd_map)) + sfds_protection_bug(type); + + if (service_fd_base == 0) { + if (test_bit(type, sfd_map)) + close(sfd_arr[type]); + sfd_arr[type] = fd; + set_bit(type, sfd_map); + return fd; + } + + if (!test_bit(type, sfd_map)) + tmp = fcntl(fd, F_DUPFD, sfd); + else + tmp = dup3(fd, sfd, O_CLOEXEC); + if (tmp < 0) { + pr_perror("%s dup %d -> %d failed", + sfd_type_name(type), fd, sfd); + close(fd); + return -1; + } else if (tmp != sfd) { + pr_err("%s busy target %d -> %d\n", sfd_type_name(type), fd, sfd); + close(fd); + return -1; + } + + set_bit(type, sfd_map); + close(fd); + return sfd; +} + +int close_service_fd(enum sfd_type type) +{ + int fd; + + if (sfds_protected) + sfds_protection_bug(type); + + fd = get_service_fd(type); + if (fd < 0) + return 0; + + if (close_safe(&fd)) + return -1; + + clear_bit(type, sfd_map); + return 0; +} + +static int move_service_fd(struct pstree_item *me, int type, int new_id, int new_base) +{ + int old = get_service_fd(type); + int new = new_base - type - SERVICE_FD_MAX * new_id; + int ret; + + if (old < 0) + return 0; + + if (!test_bit(type, sfd_map)) + ret = fcntl(old, F_DUPFD, new); + else + ret = dup2(old, new); + if (ret == -1) { + pr_perror("%s unable to clone %d->%d", + sfd_type_name(type), old, new); + return -1; + } else if (ret != new) { + pr_err("%s busy target %d -> %d\n", sfd_type_name(type), old, new); + return -1; + } else if (!(rsti(me)->clone_flags & CLONE_FILES)) + close(old); + + return 0; +} + +static int choose_service_fd_base(struct pstree_item *me) +{ + int nr, real_nr, fdt_nr = 1, id = rsti(me)->service_fd_id; + + if (rsti(me)->fdt) { + /* The base is set by owner of fdt (id 0) */ + if (id != 0) + return service_fd_base; + fdt_nr = rsti(me)->fdt->nr; 
+ } + /* Now find process's max used fd number */ + if (!list_empty(&rsti(me)->fds)) + nr = list_entry(rsti(me)->fds.prev, + struct fdinfo_list_entry, ps_list)->fe->fd; + else + nr = -1; + + nr = max(nr, inh_fd_max); + /* + * Service fds go after max fd near right border of alignment: + * + * ...|max_fd|max_fd+1|...|sfd first|...|sfd last (aligned)| + * + * So, they take maximum numbers of area allocated by kernel. + * See linux alloc_fdtable() for details. + */ + nr += (SERVICE_FD_MAX - SERVICE_FD_MIN) * fdt_nr; + nr += 16; /* Safety pad */ + real_nr = nr; + + nr /= (1024 / sizeof(void *)); + if (nr) + nr = 1 << (32 - __builtin_clz(nr)); + else + nr = 1; + nr *= (1024 / sizeof(void *)); + + if (nr > service_fd_rlim_cur) { + /* Right border is bigger, than rlim. OK, then just aligned value is enough */ + nr = round_down(service_fd_rlim_cur, (1024 / sizeof(void *))); + if (nr < real_nr) { + pr_err("Can't chose service_fd_base: %d %d\n", nr, real_nr); + return -1; + } + } + + return nr; +} + +int clone_service_fd(struct pstree_item *me) +{ + int id, new_base, i, ret = -1; + + new_base = choose_service_fd_base(me); + id = rsti(me)->service_fd_id; + + if (new_base == -1) + return -1; + if (service_fd_base == new_base && service_fd_id == id) + return 0; + + /* Dup sfds in memmove() style: they may overlap */ + if (get_service_fd(LOG_FD_OFF) < new_base - LOG_FD_OFF - SERVICE_FD_MAX * id) + for (i = SERVICE_FD_MIN + 1; i < SERVICE_FD_MAX; i++) + move_service_fd(me, i, id, new_base); + else + for (i = SERVICE_FD_MAX - 1; i > SERVICE_FD_MIN; i--) + move_service_fd(me, i, id, new_base); + + service_fd_base = new_base; + service_fd_id = id; + ret = 0; + + return ret; +} diff --git a/CRIU_code/criu/shmem.c b/CRIU_code/criu/shmem.c new file mode 100644 index 0000000..03b088f --- /dev/null +++ b/CRIU_code/criu/shmem.c @@ -0,0 +1,821 @@ +#include +#include +#include +#include +#include + +#include "common/config.h" +#include "common/list.h" +#include "pid.h" +#include "shmem.h" 
/*
 * Iterate over every shmem_info stored in shmems_hash.
 *
 * @_i:  integer lvalue used as the bucket index
 * @_si: struct shmem_info pointer used as the cursor
 *
 * The macro expands to two nested loops, so a "break" in the body only
 * leaves the walk of the current bucket.
 */
#define for_each_shmem(_i, _si)					\
	for ((_i) = 0; (_i) < SHMEM_HASH_SIZE; (_i)++)		\
		hlist_for_each_entry(_si, &shmems_hash[(_i)], h)
/*
 * Pagemap based tracking of shmem changes is off by default because its
 * implementation has a flaw: a process can map a shmem page, change it
 * and unmap it again, and such a change is not visible in the pagemaps
 * at dump time.
 *
 * Tracking is turned on by the presence of CRIU_TRACK_SHMEM in the
 * environment. The environment is consulted on the first call only; the
 * answer is cached afterwards.
 */
static bool is_shmem_tracking_en(void)
{
	static bool probed = false;
	static bool tracking = false;

	if (probed)
		return tracking;

	tracking = getenv("CRIU_TRACK_SHMEM") != NULL;
	probed = true;

	if (tracking)
		pr_msg("Turn anon shmem tracking on via env\n");

	return tracking;
}
1 : 0; + unsigned int bit1 = test_bit(PST_BIT1_IX(pfn), pstate_map) ? 1 : 0; + return (bit1 << 1) | bit0; +} + +static void set_pstate(unsigned long *pstate_map, unsigned long pfn, + unsigned int pstate) +{ + if (pstate & 1) + set_bit(PST_BIT0_IX(pfn), pstate_map); + if (pstate & 2) + set_bit(PST_BIT1_IX(pfn), pstate_map); +} + +static int expand_shmem(struct shmem_info *si, unsigned long new_size) +{ + unsigned long nr_pages, nr_map_items, map_size, + nr_new_map_items, new_map_size, old_size; + + old_size = si->size; + si->size = new_size; + if (!is_shmem_tracking_en()) + return 0; + + nr_pages = DIV_ROUND_UP(old_size, PAGE_SIZE); + nr_map_items = BITS_TO_LONGS(nr_pages * PST_BITS); + map_size = nr_map_items * sizeof(*si->pstate_map); + + nr_pages = DIV_ROUND_UP(new_size, PAGE_SIZE); + nr_new_map_items = BITS_TO_LONGS(nr_pages * PST_BITS); + new_map_size = nr_new_map_items * sizeof(*si->pstate_map); + + BUG_ON(new_map_size < map_size); + + si->pstate_map = xrealloc(si->pstate_map, new_map_size); + if (!si->pstate_map) + return -1; + memzero(si->pstate_map + nr_map_items, new_map_size - map_size); + return 0; +} + +static void update_shmem_pmaps(struct shmem_info *si, u64 *map, VmaEntry *vma) +{ + unsigned long shmem_pfn, vma_pfn, vma_pgcnt; + + if (!is_shmem_tracking_en()) + return; + + vma_pgcnt = DIV_ROUND_UP(si->size - vma->pgoff, PAGE_SIZE); + for (vma_pfn = 0; vma_pfn < vma_pgcnt; ++vma_pfn) { + if (!should_dump_page(vma, map[vma_pfn])) + continue; + + shmem_pfn = vma_pfn + DIV_ROUND_UP(vma->pgoff, PAGE_SIZE); + if (map[vma_pfn] & PME_SOFT_DIRTY) + set_pstate(si->pstate_map, shmem_pfn, PST_DIRTY); + else if (page_is_zero(map[vma_pfn])) + set_pstate(si->pstate_map, shmem_pfn, PST_ZERO); + else + set_pstate(si->pstate_map, shmem_pfn, PST_DUMP); + } +} + +int collect_sysv_shmem(unsigned long shmid, unsigned long size) +{ + struct shmem_info *si; + + /* + * Tasks will not modify this object, so don't + * shmalloc() as we do it for anon shared mem + */ + si = 
malloc(sizeof(*si)); + if (!si) + return -1; + + si->shmid = shmid; + si->pid = SYSVIPC_SHMEM_PID; + si->size = size; + si->want_write = 0; + INIT_LIST_HEAD(&si->att); + + shmem_hash_add(si); + + pr_info("Collected SysV shmem %lx, size %ld\n", si->shmid, si->size); + + return 0; +} + +int fixup_sysv_shmems(void) +{ + int i; + struct shmem_info *si; + struct shmem_sysv_att *att; + + for_each_shmem(i, si) { + /* It can be anon shmem */ + if (si->pid != SYSVIPC_SHMEM_PID) + continue; + + list_for_each_entry(att, &si->att, l) { + /* + * Same thing is checked in open_shmem_sysv() for + * intermediate holes. + */ + if (att->first->start + round_up(si->size, page_size()) != att->prev_end) { + pr_err("Sysv shmem %lx with tail hole not supported\n", si->shmid); + return -1; + } + + /* + * See comment in open_shmem_sysv() about this PROT_EXEC + */ + if (si->want_write) + att->first->prot |= PROT_EXEC; + } + } + + return 0; +} + +static int open_shmem_sysv(int pid, struct vma_area *vma) +{ + VmaEntry *vme = vma->e; + struct shmem_info *si; + struct shmem_sysv_att *att; + int64_t ret_fd; + + si = shmem_find(vme->shmid); + if (!si) { + pr_err("Can't find sysv shmem for %"PRIx64"\n", vme->shmid); + return -1; + } + + if (si->pid != SYSVIPC_SHMEM_PID) { + pr_err("SysV shmem vma 0x%"PRIx64" points to anon vma %lx\n", + vme->start, si->shmid); + return -1; + } + + /* + * We can have a chain of VMAs belonging to the same + * sysv shmem segment all with different access rights + * (ro and rw). But single shmat() system call attaches + * the whole segment regardless of the actual mapping + * size. This can be achieved by attaching a segment + * and then write-protecting its parts. + * + * So, to restore this thing we note the very first + * area of the segment and make it restore the whole + * thing. All the subsequent ones will carry the sign + * telling the restorer to omit shmat and only do the + * ro protection. 
Yes, it may happen that some sysv + * shmem vma-s sit in the list (and restorer's array) + * for no use. + * + * Holes in between are not handled now, as well as + * the hole at the end (see fixup_sysv_shmems). + * + * One corner case. At shmat() time we need to know + * whether to create the segment rw or ro, but the + * first vma can have different protection. So the + * segment ro-ness is marked with PROT_EXEC bit in + * the first vma. Unfortunately, we only know this + * after we scan all the vmas, so this bit is set + * at the end in fixup_sysv_shmems(). + */ + + if (vme->pgoff == 0) { + att = xmalloc(sizeof(*att)); + if (!att) + return -1; + + att->first = vme; + list_add(&att->l, &si->att); + + ret_fd = si->shmid; + } else { + att = list_first_entry(&si->att, struct shmem_sysv_att, l); + if (att->prev_end != vme->start) { + pr_err("Sysv shmem %lx with a hole not supported\n", si->shmid); + return -1; + } + if (vme->pgoff != att->prev_end - att->first->start) { + pr_err("Sysv shmem %lx with misordered attach chunks\n", si->shmid); + return -1; + } + + /* + * Value that doesn't (shouldn't) match with any real + * sysv shmem ID (thus it cannot be 0, as shmem id can) + * and still is not negative to prevent prepare_vmas() from + * treating it as error. + */ + ret_fd = SYSV_SHMEM_SKIP_FD; + } + + pr_info("Note 0x%"PRIx64"-0x%"PRIx64" as %lx sysvshmem\n", vme->start, vme->end, si->shmid); + + att->prev_end = vme->end; + if (!vme->has_fdflags || vme->fdflags == O_RDWR) + /* + * We can't look at vma->prot & PROT_WRITE as all this stuff + * can be read-protected. If !has_fdflags these are old images + * and ... 
we have no other choice other than make it with + * maximum access :( + */ + si->want_write = 1; + + vme->fd = ret_fd; + return 0; +} + +static int open_shmem(int pid, struct vma_area *vma); + +int collect_shmem(int pid, struct vma_area *vma) +{ + VmaEntry *vi = vma->e; + unsigned long size = vi->pgoff + vi->end - vi->start; + struct shmem_info *si; + + if (vma_entry_is(vi, VMA_AREA_SYSVIPC)) { + vma->vm_open = open_shmem_sysv; + return 0; + } + + vma->vm_open = open_shmem; + + si = shmem_find(vi->shmid); + if (si) { + if (si->pid == SYSVIPC_SHMEM_PID) { + pr_err("Shmem %"PRIx64" already collected as SYSVIPC\n", vi->shmid); + return -1; + } + + if (si->size < size) + si->size = size; + si->count++; + + /* + * Only the shared mapping with a lowest + * pid will be created in real, other processes + * will wait until the kernel propagate this mapping + * into /proc + */ + if (!pid_rst_prio(pid, si->pid)) { + if (si->pid == pid) + si->self_count++; + + return 0; + } + + si->pid = pid; + si->self_count = 1; + + return 0; + } + + si = shmalloc(sizeof(struct shmem_info)); + if (!si) + return -1; + + pr_info("Add new shmem 0x%"PRIx64" (%#016"PRIx64"-%#016"PRIx64")\n", + vi->shmid, vi->start, vi->end); + + si->shmid = vi->shmid; + si->pid = pid; + si->size = size; + si->fd = -1; + si->count = 1; + si->self_count = 1; + futex_init(&si->lock); + shmem_hash_add(si); + + return 0; +} + +static int shmem_wait_and_open(struct shmem_info *si, VmaEntry *vi) +{ + char path[128]; + int ret; + + pr_info("Waiting for the %lx shmem to appear\n", si->shmid); + futex_wait_while(&si->lock, 0); + + snprintf(path, sizeof(path), "/proc/%d/fd/%d", + si->pid, si->fd); + + pr_info("Opening shmem [%s] \n", path); + ret = open_proc_rw(si->pid, "fd/%d", si->fd); + futex_inc_and_wake(&si->lock); + if (ret < 0) + return -1; + + vi->fd = ret; + return 0; +} + +static int do_restore_shmem_content(void *addr, unsigned long size, unsigned long shmid) +{ + int ret = 0; + struct page_read pr; + + ret = 
open_page_read(shmid, &pr, PR_SHMEM); + if (ret <= 0) + return -1; + + while (1) { + unsigned long vaddr; + unsigned nr_pages; + + ret = pr.advance(&pr); + if (ret <= 0) + break; + + vaddr = (unsigned long)decode_pointer(pr.pe->vaddr); + nr_pages = pr.pe->nr_pages; + + if (vaddr + nr_pages * PAGE_SIZE > size) + break; + + pr.read_pages(&pr, vaddr, nr_pages, addr + vaddr, 0); + } + + pr.close(&pr); + return ret; +} + +static int restore_shmem_content(void *addr, struct shmem_info *si) +{ + return do_restore_shmem_content(addr, si->size, si->shmid); +} + +int restore_sysv_shmem_content(void *addr, unsigned long size, unsigned long shmid) +{ + return do_restore_shmem_content(addr, round_up(size, PAGE_SIZE), shmid); +} + +static int open_shmem(int pid, struct vma_area *vma) +{ + VmaEntry *vi = vma->e; + struct shmem_info *si; + void *addr = MAP_FAILED; + int f = -1; + int flags; + + si = shmem_find(vi->shmid); + pr_info("Search for %#016"PRIx64" shmem 0x%"PRIx64" %p/%d\n", vi->start, vi->shmid, si, si ? si->pid : -1); + if (!si) { + pr_err("Can't find my shmem %#016"PRIx64"\n", vi->start); + return -1; + } + + BUG_ON(si->pid == SYSVIPC_SHMEM_PID); + + if (si->pid != pid) + return shmem_wait_and_open(si, vi); + + if (si->fd != -1) { + f = dup(si->fd); + if (f < 0) { + pr_perror("Can't dup shmem fd"); + return -1; + } + + goto out; + } + + flags = MAP_SHARED; + if (kdat.has_memfd) { + f = syscall(SYS_memfd_create, "", 0); + if (f < 0) { + pr_perror("Unable to create memfd"); + goto err; + } + + if (ftruncate(f, si->size)) { + pr_perror("Unable to truncate memfd"); + goto err; + } + flags |= MAP_FILE; + } else + flags |= MAP_ANONYMOUS; + + /* + * The following hack solves problems: + * vi->pgoff may be not zero in a target process. + * This mapping may be mapped more then once. + * The restorer doesn't have snprintf. 
+ * Here is a good place to restore content + */ + addr = mmap(NULL, si->size, PROT_WRITE | PROT_READ, flags, f, 0); + if (addr == MAP_FAILED) { + pr_err("Can't mmap shmid=0x%"PRIx64" size=%ld\n", + vi->shmid, si->size); + goto err; + } + + if (restore_shmem_content(addr, si) < 0) { + pr_err("Can't restore shmem content\n"); + goto err; + } + + if (f == -1) { + f = open_proc_rw(getpid(), "map_files/%lx-%lx", + (unsigned long) addr, + (unsigned long) addr + si->size); + if (f < 0) + goto err; + } + munmap(addr, si->size); + + si->fd = f; + + /* Send signal to slaves, that they can open fd for this shmem */ + futex_inc_and_wake(&si->lock); + /* + * All other regions in this process will duplicate + * the file descriptor, so we don't wait them. + */ + futex_wait_until(&si->lock, si->count - si->self_count + 1); +out: + vi->fd = f; + return 0; +err: + if (addr != MAP_FAILED) + munmap(addr, si->size); + close_safe(&f); + return -1; +} + +int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map) +{ + struct shmem_info *si; + unsigned long size = vma->pgoff + (vma->end - vma->start); + + if (vma_entry_is(vma, VMA_AREA_SYSVIPC)) + pid = SYSVIPC_SHMEM_PID; + + si = shmem_find(vma->shmid); + if (si) { + if (si->size < size) { + if (expand_shmem(si, size)) + return -1; + } + update_shmem_pmaps(si, map, vma); + + return 0; + } + + si = xzalloc(sizeof(*si)); + if (!si) + return -1; + + si->pid = pid; + si->start = vma->start; + si->end = vma->end; + si->shmid = vma->shmid; + shmem_hash_add(si); + + if (expand_shmem(si, size)) + return -1; + update_shmem_pmaps(si, map, vma); + + return 0; +} + +static int dump_pages(struct page_pipe *pp, struct page_xfer *xfer) +{ + struct page_pipe_buf *ppb; + + list_for_each_entry(ppb, &pp->bufs, l) + if (vmsplice(ppb->p[1], ppb->iov, ppb->nr_segs, + SPLICE_F_GIFT | SPLICE_F_NONBLOCK) != + ppb->pages_in * PAGE_SIZE) { + pr_perror("Can't get shmem into page-pipe"); + return -1; + } + + return page_xfer_dump_pages(xfer, pp); +} + +static int 
next_data_segment(int fd, unsigned long pfn, + unsigned long *next_data_pfn, unsigned long *next_hole_pfn) +{ + off_t off; + + off = lseek(fd, pfn * PAGE_SIZE, SEEK_DATA); + if (off == (off_t) -1) { + if (errno == ENXIO) { + *next_data_pfn = ~0UL; + *next_hole_pfn = ~0UL; + return 0; + } + pr_perror("Unable to lseek(SEEK_DATA)"); + return -1; + } + *next_data_pfn = off / PAGE_SIZE; + + off = lseek(fd, off, SEEK_HOLE); + if (off == (off_t) -1) { + pr_perror("Unable to lseek(SEEK_HOLE)"); + return -1; + } + *next_hole_pfn = off / PAGE_SIZE; + + return 0; +} + +static int do_dump_one_shmem(int fd, void *addr, struct shmem_info *si) +{ + struct page_pipe *pp; + struct page_xfer xfer; + int err, ret = -1; + unsigned long pfn, nrpages, next_data_pnf = 0, next_hole_pfn = 0; + unsigned long pages[2] = {}; + + nrpages = (si->size + PAGE_SIZE - 1) / PAGE_SIZE; + + pp = create_page_pipe((nrpages + 1) / 2, NULL, PP_CHUNK_MODE); + if (!pp) + goto err; + + err = open_page_xfer(&xfer, CR_FD_SHMEM_PAGEMAP, si->shmid); + if (err) + goto err_pp; + + xfer.offset = (unsigned long)addr; + + for (pfn = 0; pfn < nrpages; pfn++) { + unsigned int pgstate = PST_DIRTY; + bool use_mc = true; + unsigned long pgaddr; + int st = -1; + + if (pfn >= next_hole_pfn && + next_data_segment(fd, pfn, &next_data_pnf, &next_hole_pfn)) + goto err_xfer; + + if (si->pstate_map && is_shmem_tracking_en()) { + pgstate = get_pstate(si->pstate_map, pfn); + use_mc = pgstate == PST_DONT_DUMP; + } + + if (use_mc) { + if (pfn < next_data_pnf) + pgstate = PST_ZERO; + else + pgstate = PST_DIRTY; + } + + pgaddr = (unsigned long)addr + pfn * PAGE_SIZE; +again: + if (pgstate == PST_ZERO) + ret = 0; + else if (xfer.parent && page_in_parent(pgstate == PST_DIRTY)) { + ret = page_pipe_add_hole(pp, pgaddr, PP_HOLE_PARENT); + st = 0; + } else { + ret = page_pipe_add_page(pp, pgaddr, 0); + st = 1; + } + + if (ret == -EAGAIN) { + ret = dump_pages(pp, &xfer); + if (ret) + goto err_xfer; + page_pipe_reinit(pp); + goto again; + } 
else if (ret) + goto err_xfer; + + if (st >= 0) + pages[st]++; + } + + cnt_add(CNT_SHPAGES_SCANNED, nrpages); + cnt_add(CNT_SHPAGES_SKIPPED_PARENT, pages[0]); + cnt_add(CNT_SHPAGES_WRITTEN, pages[1]); + + ret = dump_pages(pp, &xfer); + +err_xfer: + xfer.close(&xfer); +err_pp: + destroy_page_pipe(pp); +err: + return ret; +} + +static int dump_one_shmem(struct shmem_info *si) +{ + int fd, ret = -1; + void *addr; + + pr_info("Dumping shared memory %ld\n", si->shmid); + + fd = open_proc(si->pid, "map_files/%lx-%lx", si->start, si->end); + if (fd < 0) + goto err; + + addr = mmap(NULL, si->size, PROT_READ, MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) { + pr_err("Can't map shmem 0x%lx (0x%lx-0x%lx)\n", + si->shmid, si->start, si->end); + goto errc; + } + + ret = do_dump_one_shmem(fd, addr, si); + + munmap(addr, si->size); +errc: + close(fd); +err: + return ret; +} + +int dump_one_sysv_shmem(void *addr, unsigned long size, unsigned long shmid) +{ + int fd, ret; + struct shmem_info *si, det; + + si = shmem_find(shmid); + if (!si) { + pr_info("Detached shmem...\n"); + det.pid = SYSVIPC_SHMEM_PID; + det.shmid = shmid; + det.size = round_up(size, PAGE_SIZE); + det.pstate_map = NULL; + si = &det; + } + + fd = open_proc(PROC_SELF, "map_files/%lx-%lx", + (unsigned long)addr, (unsigned long)addr + si->size); + if (fd < 0) + return -1; + + ret = do_dump_one_shmem(fd, addr, si); + close(fd); + return ret; +} + +int cr_dump_shmem(void) +{ + int ret = 0, i; + struct shmem_info *si; + + for_each_shmem(i, si) { + if (si->pid == SYSVIPC_SHMEM_PID) + continue; + ret = dump_one_shmem(si); + if (ret) + goto out; + } +out: + return ret; +} diff --git a/CRIU_code/criu/sigframe.c b/CRIU_code/criu/sigframe.c new file mode 100644 index 0000000..b8798ef --- /dev/null +++ b/CRIU_code/criu/sigframe.c @@ -0,0 +1,48 @@ +#include +#include +#include "log.h" +#include "restore.h" +#include "images/core.pb-c.h" + +#ifndef setup_sas +static inline void setup_sas(struct rt_sigframe* sigframe, 
ThreadSasEntry *sas) +{ + if (sas) { +#define UC RT_SIGFRAME_UC(sigframe) + + UC->uc_stack.ss_sp = (void *)decode_pointer((sas)->ss_sp); + UC->uc_stack.ss_flags = (int)(sas)->ss_flags; + UC->uc_stack.ss_size = (size_t)(sas)->ss_size; +#undef UC + } +} +#endif + +int construct_sigframe(struct rt_sigframe *sigframe, + struct rt_sigframe *rsigframe, + k_rtsigset_t *blkset, + CoreEntry *core) +{ + /* + * Copy basic register set in the first place: this will set + * rt_sigframe type: native/compat. + */ + if (restore_gpregs(sigframe, CORE_THREAD_ARCH_INFO(core)->gpregs)) + return -1; + + if (blkset) + rt_sigframe_copy_sigset(sigframe, blkset); + else + rt_sigframe_erase_sigset(sigframe); + + if (restore_fpu(sigframe, core)) + return -1; + + if (RT_SIGFRAME_HAS_FPU(sigframe)) + if (sigreturn_prep_fpu_frame(sigframe, rsigframe)) + return -1; + + setup_sas(sigframe, core->thread_core->sas); + + return 0; +} diff --git a/CRIU_code/criu/signalfd.c b/CRIU_code/criu/signalfd.c new file mode 100644 index 0000000..1bb87d0 --- /dev/null +++ b/CRIU_code/criu/signalfd.c @@ -0,0 +1,112 @@ +#include +#include +#include + +#include "common/compiler.h" +#include "signalfd.h" +#include "fdinfo.h" +#include "imgset.h" +#include "image.h" +#include "util.h" +#include "log.h" +#include "files.h" + +#include "protobuf.h" +#include "images/signalfd.pb-c.h" + +struct signalfd_info { + SignalfdEntry *sfe; + struct file_desc d; +}; + +int is_signalfd_link(char *link) +{ + return is_anon_link_type(link, "[signalfd]"); +} + +static int dump_one_signalfd(int lfd, u32 id, const struct fd_parms *p) +{ + SignalfdEntry sfd = SIGNALFD_ENTRY__INIT; + FileEntry fe = FILE_ENTRY__INIT; + + if (parse_fdinfo(lfd, FD_TYPES__SIGNALFD, &sfd)) + return -1; + + sfd.id = id; + sfd.flags = p->flags; + sfd.fown = (FownEntry *)&p->fown; + + fe.type = FD_TYPES__SIGNALFD; + fe.id = sfd.id; + fe.sgfd = &sfd; + + return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); +} + +const struct fdtype_ops 
signalfd_dump_ops = { + .type = FD_TYPES__SIGNALFD, + .dump = dump_one_signalfd, +}; + +static void sigset_fill(sigset_t *to, unsigned long long from) +{ + int sig; + + pr_info("\tCalculating sigmask for %llx\n", from); + sigemptyset(to); + for (sig = 1; sig < NSIG; sig++) + if (from & (1ULL << (sig - 1))) { + pr_debug("\t\tAdd %d signal to mask\n", sig); + sigaddset(to, sig); + } +} + +static int signalfd_open(struct file_desc *d, int *new_fd) +{ + struct signalfd_info *info; + int tmp; + sigset_t mask; + + info = container_of(d, struct signalfd_info, d); + pr_info("Restoring signalfd %#x\n", info->sfe->id); + + sigset_fill(&mask, info->sfe->sigmask); + tmp = signalfd(-1, &mask, 0); + if (tmp < 0) { + pr_perror("Can't create signalfd %#08x", info->sfe->id); + return -1; + } + + if (rst_file_params(tmp, info->sfe->fown, info->sfe->flags)) { + pr_perror("Can't restore params on signalfd %#08x", + info->sfe->id); + goto err_close; + } + + *new_fd = tmp; + return 0; + +err_close: + close(tmp); + return -1; +} + +static struct file_desc_ops signalfd_desc_ops = { + .type = FD_TYPES__SIGNALFD, + .open = signalfd_open, +}; + +static int collect_one_sigfd(void *o, ProtobufCMessage *msg, struct cr_img *i) +{ + struct signalfd_info *info = o; + + info->sfe = pb_msg(msg, SignalfdEntry); + return file_desc_add(&info->d, info->sfe->id, &signalfd_desc_ops); +} + +struct collect_image_info signalfd_cinfo = { + .fd_type = CR_FD_SIGNALFD, + .pb_type = PB_SIGNALFD, + .priv_size = sizeof(struct signalfd_info), + .collect = collect_one_sigfd, +}; diff --git a/CRIU_code/criu/sk-inet.c b/CRIU_code/criu/sk-inet.c new file mode 100644 index 0000000..90ab492 --- /dev/null +++ b/CRIU_code/criu/sk-inet.c @@ -0,0 +1,1024 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../soccr/soccr.h" + +#include "libnetlink.h" +#include "cr_options.h" +#include "imgset.h" +#include 
"inet_diag.h" +#include "files.h" +#include "image.h" +#include "log.h" +#include "lsm.h" +#include "kerndat.h" +#include "pstree.h" +#include "rst-malloc.h" +#include "sockets.h" +#include "sk-inet.h" +#include "protobuf.h" +#include "util.h" +#include "namespaces.h" + +#include "images/inventory.pb-c.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "inet: " + +#define PB_ALEN_INET 1 +#define PB_ALEN_INET6 4 + +static LIST_HEAD(inet_ports); + +struct inet_port { + int port; + int type; + struct list_head type_list; + atomic_t users; + mutex_t reuseaddr_lock; + struct list_head list; +}; + +static struct inet_port *port_add(struct inet_sk_info *ii, int port) +{ + int type = ii->ie->type; + struct inet_port *e; + + list_for_each_entry(e, &inet_ports, list) + if (e->type == type && e->port == port) { + atomic_inc(&e->users); + goto out_link; + } + + e = shmalloc(sizeof(*e)); + if (e == NULL) { + pr_err("Not enough memory\n"); + return NULL; + } + + e->port = port; + e->type = type; + atomic_set(&e->users, 1); + mutex_init(&e->reuseaddr_lock); + INIT_LIST_HEAD(&e->type_list); + + list_add(&e->list, &inet_ports); +out_link: + list_add(&ii->port_list, &e->type_list); + + return e; +} + +static void show_one_inet(const char *act, const struct inet_sk_desc *sk) +{ + char src_addr[INET_ADDR_LEN] = ""; + + if (inet_ntop(sk->sd.family, (void *)sk->src_addr, src_addr, + INET_ADDR_LEN) == NULL) { + pr_perror("Failed to translate address"); + } + + pr_debug("\t%s: ino %#8x family %-10s type %-14s port %8d " + "state %-16s src_addr %s\n", act, sk->sd.ino, + ___socket_family_name(sk->sd.family), + ___socket_type_name(sk->type), sk->src_port, + ___tcp_state_name(sk->state), src_addr); +} + +static void show_one_inet_img(const char *act, const InetSkEntry *e) +{ + char src_addr[INET_ADDR_LEN] = ""; + + if (inet_ntop(e->family, (void *)e->src_addr, src_addr, + INET_ADDR_LEN) == NULL) { + pr_perror("Failed to translate address"); + } + + pr_debug("\t%s: family %-10s type %-14s proto %-16s 
port %d " + "state %-16s src_addr %s\n", act, + ___socket_family_name(e->family), + ___socket_type_name(e->type), + ___socket_proto_name(e->proto), + e->src_port, ___tcp_state_name(e->state), + src_addr); +} + +static int can_dump_ipproto(int ino, int proto, int type) +{ + /* Raw sockets may have any protocol inside */ + if (type == SOCK_RAW) + return 1; + + /* Make sure it's a proto we support */ + switch (proto) { + case IPPROTO_IP: + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + break; + default: + pr_err("Unsupported proto %d for socket %x\n", proto, ino); + return 0; + } + + return 1; +} + +static int can_dump_inet_sk(const struct inet_sk_desc *sk) +{ + BUG_ON((sk->sd.family != AF_INET) && (sk->sd.family != AF_INET6)); + + if (sk->type == SOCK_DGRAM) { + if (sk->wqlen != 0) { + if (sk->cork) { + pr_err("Can't dump corked dgram socket %x\n", + sk->sd.ino); + return 0; + } else { + pr_warn("Write queue of the %x socket isn't empty\n", + sk->sd.ino); + } + } + + if (sk->rqlen) + pr_warn("Read queue is dropped for socket %x\n", + sk->sd.ino); + + return 1; + } + + if (sk->type != SOCK_STREAM && sk->type != SOCK_RAW) { + pr_err("Can't dump %d inet socket %x. " + "Only stream, dgram and raw are supported.\n", + sk->type, sk->sd.ino); + return 0; + } + + switch (sk->state) { + case TCP_LISTEN: + if (sk->rqlen != 0) { + if (opts.tcp_skip_in_flight) { + pr_info("Skipping in-flight connection (l) for %x\n", + sk->sd.ino); + break; + } + /* + * Currently the ICONS nla reports the conn + * requests for listen sockets. 
Need to pick + * those up and fix the connect job respectively + */ + pr_err("In-flight connection (l) for %x\n", + sk->sd.ino); + pr_err("In-flight connections can be ignored with the " + "--%s option.\n", SK_INFLIGHT_PARAM); + return 0; + } + break; + case TCP_ESTABLISHED: + case TCP_FIN_WAIT2: + case TCP_FIN_WAIT1: + case TCP_CLOSE_WAIT: + case TCP_LAST_ACK: + case TCP_CLOSING: + case TCP_SYN_SENT: + if (!opts.tcp_established_ok) { + pr_err("Connected TCP socket, consider using --%s option.\n", + SK_EST_PARAM); + return 0; + } + break; + case TCP_CLOSE: + /* Trivial case, we just need to create a socket on restore */ + break; + default: + pr_err("Unknown inet socket %x state %d\n", sk->sd.ino, sk->state); + return 0; + } + + return 1; +} + +static int dump_sockaddr(union libsoccr_addr *sa, u32 *pb_port, u32 *pb_addr) +{ + if (sa->sa.sa_family == AF_INET) { + memcpy(pb_addr, &sa->v4.sin_addr, sizeof(sa->v4.sin_addr)); + *pb_port = ntohs(sa->v4.sin_port); + return 0; + } if (sa->sa.sa_family == AF_INET6) { + *pb_port = ntohs(sa->v6.sin6_port); + memcpy(pb_addr, &sa->v6.sin6_addr, sizeof(sa->v6.sin6_addr)); + return 0; + } + return -1; +} + +static struct inet_sk_desc *gen_uncon_sk(int lfd, const struct fd_parms *p, + int proto, int family, int type) +{ + struct inet_sk_desc *sk; + union libsoccr_addr address; + struct ns_id *ns = NULL; + socklen_t aux; + int ret; + + if (root_ns_mask & CLONE_NEWNET) { + ns = get_socket_ns(lfd); + if (ns == NULL) + return NULL; + } + + sk = xzalloc(sizeof(*sk)); + if (!sk) + goto err; + + sk->sd.family = family; + sk->type = type; + + if (sk->sd.family == AF_INET) + aux = sizeof(struct sockaddr_in); + else if (sk->sd.family == AF_INET6) + aux = sizeof(struct sockaddr_in6); + else { + pr_err("Unsupported socket family: %d\n", sk->sd.family); + goto err; + } + + ret = getsockopt(lfd, SOL_SOCKET, SO_PEERNAME, &address, &aux); + if (ret < 0) { + if (errno != ENOTCONN) { + pr_perror("Unexpected error returned from unconnected socket"); 
+ goto err; + } + } else if (dump_sockaddr(&address, &sk->dst_port, sk->dst_addr)) + goto err; + + ret = getsockname(lfd, &address.sa, &aux); + if (ret < 0) { + if (errno != ENOTCONN) { + pr_perror("Unexpected error returned from unconnected socket"); + goto err; + } + } else if (dump_sockaddr(&address, &sk->src_port, sk->src_addr)) + goto err; + + sk->sd.ino = p->stat.st_ino; + + if (type != SOCK_RAW && proto == IPPROTO_TCP) { + struct { + __u8 tcpi_state; + __u8 tcpi_ca_state; + __u8 tcpi_retransmits; + __u8 tcpi_probes; + __u8 tcpi_backoff; + __u8 tcpi_options; + } info; + + aux = sizeof(info); + ret = getsockopt(lfd, SOL_TCP, TCP_INFO, &info, &aux); + if (ret) { + pr_perror("Failed to obtain TCP_INFO"); + goto err; + } + + if (info.tcpi_state != TCP_CLOSE) { + pr_err("Socket state %d obtained but expected %d\n", + info.tcpi_state, TCP_CLOSE); + goto err; + } + + sk->wqlen = info.tcpi_backoff; + } + + sk->state = TCP_CLOSE; + + sk_collect_one(sk->sd.ino, sk->sd.family, &sk->sd, ns); + + return sk; +err: + xfree(sk); + return NULL; +} + +static int ip_raw_opts_alloc(int family, int proto, IpOptsRawEntry *r) +{ + if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6) { + if (family == AF_INET6) + r->n_icmpv_filter = NELEMS_AS_ARRAY(struct icmp6_filter, + r->icmpv_filter); + else + r->n_icmpv_filter = NELEMS_AS_ARRAY(struct icmp_filter, + r->icmpv_filter); + r->icmpv_filter = xmalloc(pb_repeated_size(r, icmpv_filter)); + pr_debug("r->n_icmpv_filter %d size %d\n", + (int)r->n_icmpv_filter, + (int)pb_repeated_size(r, icmpv_filter)); + if (!r->icmpv_filter) + return -ENOMEM; + } + return 0; +} + +static void ip_raw_opts_free(IpOptsRawEntry *r) +{ + r->n_icmpv_filter = 0; + xfree(r->icmpv_filter); + r->icmpv_filter = NULL; +} + +static int dump_ip_raw_opts(int sk, int family, int proto, IpOptsRawEntry *r) +{ + int ret = 0; + + ret = ip_raw_opts_alloc(family, proto, r); + if (ret) + return ret; + + /* + * Either fill icmpv_filter if match or free + * so it won't fetch 
zeros to image. + */ + + if (family == AF_INET6) { + ret |= dump_opt(sk, SOL_IPV6, IPV6_HDRINCL, &r->hdrincl); + + if (proto == IPPROTO_ICMPV6) + ret |= do_dump_opt(sk, SOL_ICMPV6, ICMPV6_FILTER, + r->icmpv_filter, + pb_repeated_size(r, icmpv_filter)); + else + ip_raw_opts_free(r); + } else { + ret |= dump_opt(sk, SOL_IP, IP_HDRINCL, &r->hdrincl); + ret |= dump_opt(sk, SOL_IP, IP_NODEFRAG, &r->nodefrag); + r->has_nodefrag = !!r->nodefrag; + + if (proto == IPPROTO_ICMP) + ret |= do_dump_opt(sk, SOL_RAW, ICMP_FILTER, + r->icmpv_filter, + pb_repeated_size(r, icmpv_filter)); + else + ip_raw_opts_free(r); + } + r->has_hdrincl = !!r->hdrincl; + + return ret; +} + +static int dump_ip_opts(int sk, int family, int type, int proto, IpOptsEntry *ioe) +{ + int ret = 0; + + if (type == SOCK_RAW) { + /* + * Raw sockets might need allocate more space + * and fetch additional options. + */ + ret |= dump_ip_raw_opts(sk, family, proto, ioe->raw); + } else { + /* Due to kernel code we can use SOL_IP instead of SOL_IPV6 */ + ret |= dump_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); + ioe->has_freebind = ioe->freebind; + } + + return ret; +} + +/* Stolen from the kernel's __ipv6_addr_type/__ipv6_addr_needs_scopeid; + * link local and (multicast + loopback + linklocal) addrs require a + * scope id. 
+ */
+#define IPV6_ADDR_SCOPE_NODELOCAL	0x01
+#define IPV6_ADDR_SCOPE_LINKLOCAL	0x02
+/*
+ * Tell whether the IPv6 address @src_addr (four 32-bit words in
+ * network byte order) needs a scope id to be bound.
+ *
+ * True for link-local unicast (fe80::/10) and for multicast (ff00::/8)
+ * whose scope is node-local or link-local. The multicast scope is the
+ * low nibble of the second address byte.
+ */
+static bool needs_scope_id(uint32_t *src_addr)
+{
+	/* Byte view of the address: byte 1 holds multicast flags | scope */
+	const unsigned char *bytes = (const unsigned char *)src_addr;
+
+	if ((src_addr[0] & htonl(0xFF000000)) == htonl(0xFF000000)) {
+		unsigned int scope = bytes[1] & 0x0F;
+
+		if (scope == IPV6_ADDR_SCOPE_LINKLOCAL ||
+		    scope == IPV6_ADDR_SCOPE_NODELOCAL)
+			return true;
+	}
+
+	if ((src_addr[0] & htonl(0xFFC00000)) == htonl(0xFE800000))
+		return true;
+
+	return false;
+}
+
+/*
+ * Dump the inet socket behind @lfd as a FileEntry of type INETSK into
+ * the CR_FD_FILES image. @family is the address family the caller
+ * resolved for this fd. Returns 0 on success, non-zero on failure.
+ */
+static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int family)
+{
+	struct inet_sk_desc *sk;
+	FileEntry fe = FILE_ENTRY__INIT;
+	InetSkEntry ie = INET_SK_ENTRY__INIT;
+	IpOptsEntry ipopts = IP_OPTS_ENTRY__INIT;
+	IpOptsRawEntry ipopts_raw = IP_OPTS_RAW_ENTRY__INIT;
+	SkOptsEntry skopts = SK_OPTS_ENTRY__INIT;
+	int ret = -1, err = -1, proto, aux, type;
+
+	ret = do_dump_opt(lfd, SOL_SOCKET, SO_PROTOCOL,
+					&proto, sizeof(proto));
+	if (ret)
+		goto err;
+
+	if (do_dump_opt(lfd, SOL_SOCKET, SO_TYPE, &type, sizeof(type)))
+		goto err;
+
+	if (!can_dump_ipproto(p->stat.st_ino, proto, type))
+		goto err;
+
+	if (type == SOCK_RAW)
+		sk = (struct inet_sk_desc *)lookup_socket_ino(p->stat.st_ino, family);
+	else
+		sk = (struct inet_sk_desc *)lookup_socket(p->stat.st_ino, family, proto);
+	if (IS_ERR(sk))
+		goto err;
+	if (!sk) {
+		sk = gen_uncon_sk(lfd, p, proto, family, type);
+		if (!sk)
+			goto err;
+	}
+
+	sk->cork = false;
+	if (type != SOCK_RAW) {
+		switch (proto) {
+		case IPPROTO_UDP:
+		case IPPROTO_UDPLITE:
+			/* Leave through the common exit like every other failure */
+			if (dump_opt(lfd, SOL_UDP, UDP_CORK, &aux))
+				goto err;
+			if (aux) {
+				sk->cork = true;
+				/*
+				 * FIXME: it is possible to dump a corked socket with
+				 * the empty send queue. 
+ */ + pr_err("Can't dump corked dgram socket %x\n", sk->sd.ino); + goto err; + } + break; + } + } + + if (!can_dump_inet_sk(sk)) + goto err; + + BUG_ON(sk->sd.already_dumped); + + ie.id = id; + ie.ino = sk->sd.ino; + if (sk->sd.sk_ns) { + ie.ns_id = sk->sd.sk_ns->id; + ie.has_ns_id = true; + } + ie.family = family; + ie.proto = proto; + ie.type = sk->type; + ie.src_port = sk->src_port; + ie.dst_port = sk->dst_port; + ie.backlog = sk->wqlen; + ie.flags = p->flags; + + ie.fown = (FownEntry *)&p->fown; + ie.opts = &skopts; + ie.ip_opts = &ipopts; + ie.ip_opts->raw = &ipopts_raw; + + ie.n_src_addr = PB_ALEN_INET; + ie.n_dst_addr = PB_ALEN_INET; + if (ie.family == AF_INET6) { + int val; + char device[IFNAMSIZ]; + socklen_t len = sizeof(device); + + ie.n_src_addr = PB_ALEN_INET6; + ie.n_dst_addr = PB_ALEN_INET6; + + ret = dump_opt(lfd, SOL_IPV6, IPV6_V6ONLY, &val); + if (ret < 0) + goto err; + + ie.v6only = val ? true : false; + ie.has_v6only = true; + + /* ifindex only matters on source ports for bind, so let's + * find only that ifindex. 
*/ + if (sk->src_port && needs_scope_id(sk->src_addr)) { + if (getsockopt(lfd, SOL_SOCKET, SO_BINDTODEVICE, device, &len) < 0) { + pr_perror("can't get ifname"); + goto err; + } + + if (len > 0) { + ie.ifname = xstrdup(device); + if (!ie.ifname) + goto err; + } else { + pr_err("couldn't find ifname for %d, can't bind\n", id); + goto err; + } + } + } + + ie.src_addr = xmalloc(pb_repeated_size(&ie, src_addr)); + ie.dst_addr = xmalloc(pb_repeated_size(&ie, dst_addr)); + + if (!ie.src_addr || !ie.dst_addr) + goto err; + + memcpy(ie.src_addr, sk->src_addr, pb_repeated_size(&ie, src_addr)); + memcpy(ie.dst_addr, sk->dst_addr, pb_repeated_size(&ie, dst_addr)); + + if (dump_ip_opts(lfd, family, type, proto, &ipopts)) + goto err; + + if (dump_socket_opts(lfd, &skopts)) + goto err; + + pr_info("Dumping inet socket at %d\n", p->fd); + show_one_inet("Dumping", sk); + show_one_inet_img("Dumped", &ie); + sk->sd.already_dumped = 1; + sk->cpt_reuseaddr = skopts.reuseaddr; + + switch (proto) { + case IPPROTO_TCP: + err = (type != SOCK_RAW) ? dump_one_tcp(lfd, sk) : 0; + break; + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + sk_encode_shutdown(&ie, sk->shutdown); + /* Fallthrough! 
+		 */
+	default:
+		err = 0;
+		break;
+	}
+
+	ie.state = sk->state;
+
+	fe.type = FD_TYPES__INETSK;
+	fe.id = ie.id;
+	fe.isk = &ie;
+
+	/* Unchain not need field back */
+	if (type != SOCK_RAW)
+		ie.ip_opts->raw = NULL;
+
+	/*
+	 * err may already be 0 here, so a failed image write has to be
+	 * recorded explicitly or it would be reported as success.
+	 */
+	if (pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE))
+		err = -1;
+err:
+	ip_raw_opts_free(&ipopts_raw);
+	release_skopts(&skopts);
+	xfree(ie.src_addr);
+	xfree(ie.dst_addr);
+	xfree(ie.ifname);
+	return err;
+}
+
+/* fdtype_ops .dump hook for IPv4 sockets */
+static int dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p)
+{
+	return do_dump_one_inet_fd(lfd, id, p, PF_INET);
+}
+
+const struct fdtype_ops inet_dump_ops = {
+	.type		= FD_TYPES__INETSK,
+	.dump		= dump_one_inet_fd,
+};
+
+/* fdtype_ops .dump hook for IPv6 sockets */
+static int dump_one_inet6_fd(int lfd, u32 id, const struct fd_parms *p)
+{
+	return do_dump_one_inet_fd(lfd, id, p, PF_INET6);
+}
+
+const struct fdtype_ops inet6_dump_ops = {
+	.type		= FD_TYPES__INETSK,
+	.dump		= dump_one_inet6_fd,
+};
+
+/*
+ * Build an inet_sk_desc from the inet_diag message carried by @h and
+ * hand it to sk_collect_one(). Returns -1 when the descriptor cannot
+ * be allocated, otherwise whatever sk_collect_one() returns.
+ */
+int inet_collect_one(struct nlmsghdr *h, int family, int type, struct ns_id *ns)
+{
+	struct inet_sk_desc *d;
+	struct inet_diag_msg *m = NLMSG_DATA(h);
+	struct nlattr *tb[INET_DIAG_MAX+1];
+	int ret;
+
+	nlmsg_parse(h, sizeof(struct inet_diag_msg), tb, INET_DIAG_MAX, NULL);
+
+	d = xzalloc(sizeof(*d));
+	if (!d)
+		return -1;
+
+	d->type = type;
+	d->src_port = ntohs(m->id.idiag_sport);
+	d->dst_port = ntohs(m->id.idiag_dport);
+	d->state = m->idiag_state;
+	d->rqlen = m->idiag_rqueue;
+	d->wqlen = m->idiag_wqueue;
+	memcpy(d->src_addr, m->id.idiag_src, sizeof(u32) * 4);
+	memcpy(d->dst_addr, m->id.idiag_dst, sizeof(u32) * 4);
+
+	if (tb[INET_DIAG_SHUTDOWN])
+		d->shutdown = nla_get_u8(tb[INET_DIAG_SHUTDOWN]);
+	else
+		pr_err_once("Can't check shutdown state of inet socket\n");
+
+	ret = sk_collect_one(m->idiag_inode, family, &d->sd, ns);
+
+	show_one_inet("Collected", d);
+
+	return ret;
+}
+
+static int open_inet_sk(struct file_desc *d, int *new_fd);
+static int post_open_inet_sk(struct file_desc *d, int sk);
+
+static struct file_desc_ops 
inet_desc_ops = { + .type = FD_TYPES__INETSK, + .open = open_inet_sk, +}; + +static inline int tcp_connection(InetSkEntry *ie) +{ + return (ie->proto == IPPROTO_TCP && ie->dst_port); +} + +static int collect_one_inetsk(void *o, ProtobufCMessage *base, struct cr_img *i) +{ + struct inet_sk_info *ii = o; + + ii->ie = pb_msg(base, InetSkEntry); + if (tcp_connection(ii->ie)) + tcp_locked_conn_add(ii); + + /* + * A socket can reuse addr only if all previous sockets allow that, + * so a value of SO_REUSEADDR can be restored after restoring all + * sockets. + */ + ii->port = port_add(ii, ii->ie->src_port); + if (ii->port == NULL) + return -1; + + return file_desc_add(&ii->d, ii->ie->id, &inet_desc_ops); +} + +struct collect_image_info inet_sk_cinfo = { + .fd_type = CR_FD_INETSK, + .pb_type = PB_INET_SK, + .priv_size = sizeof(struct inet_sk_info), + .collect = collect_one_inetsk, +}; + +static int inet_validate_address(InetSkEntry *ie) +{ + if ((ie->family == AF_INET) && + /* v0.1 had 4 in ipv4 addr len */ + (ie->n_src_addr >= PB_ALEN_INET) && + (ie->n_dst_addr >= PB_ALEN_INET)) + return 0; + + if ((ie->family == AF_INET6) && + (ie->n_src_addr == PB_ALEN_INET6) && + (ie->n_dst_addr == PB_ALEN_INET6)) + return 0; + + pr_err("Addr len mismatch f %d ss %zu ds %zu\n", ie->family, + pb_repeated_size(ie, src_addr), + pb_repeated_size(ie, dst_addr)); + + return -1; +} + +static void dec_users_and_wake(struct inet_port *port) +{ + struct fdinfo_list_entry *fle; + struct inet_sk_info *ii; + + if (atomic_dec_return(&port->users)) + return; + list_for_each_entry(ii, &port->type_list, port_list) { + fle = file_master(&ii->d); + set_fds_event(fle->pid); + } +} + +static int post_open_inet_sk(struct file_desc *d, int sk) +{ + struct inet_sk_info *ii; + int val; + + ii = container_of(d, struct inet_sk_info, d); + + /* + * TCP sockets are handled at the last moment + * after unlocking connections. 
+ */ + if (tcp_connection(ii->ie)) { + pr_debug("Schedule %d socket for repair off\n", sk); + BUG_ON(ii->sk_fd != -1); + ii->sk_fd = sk; + return 0; + } + + /* SO_REUSEADDR is set for all sockets */ + if (ii->ie->opts->reuseaddr && ii->ie->opts->so_reuseport) + return 0; + + if (atomic_read(&ii->port->users)) + return 1; + + val = ii->ie->opts->reuseaddr; + if (!val && restore_opt(sk, SOL_SOCKET, SO_REUSEADDR, &val)) + return -1; + + val = ii->ie->opts->so_reuseport; + if (!val && restore_opt(sk, SOL_SOCKET, SO_REUSEPORT, &val)) + return -1; + + val = ii->ie->opts->so_broadcast; + if (!val && restore_opt(sk, SOL_SOCKET, SO_BROADCAST, &val)) + return -1; + + return 0; +} + +static int restore_ip_raw_opts(int sk, int family, int proto, IpOptsRawEntry *r) +{ + int ret = 0; + + if (r->icmpv_filter) { + if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6) { + ret |= do_restore_opt(sk, family == AF_INET6 ? SOL_ICMPV6 : SOL_RAW, + family == AF_INET6 ? ICMPV6_FILTER : ICMP_FILTER, + r->icmpv_filter, pb_repeated_size(r, icmpv_filter)); + } + } + + if (r->has_nodefrag) + ret |= restore_opt(sk, SOL_IP, IP_NODEFRAG, &r->nodefrag); + if (r->has_hdrincl) + ret |= restore_opt(sk, family == AF_INET6 ? SOL_IPV6 : SOL_IP, + family == AF_INET6 ? 
IPV6_HDRINCL : IP_HDRINCL, + &r->hdrincl); + + return ret; +} + +int restore_ip_opts(int sk, int family, int proto, IpOptsEntry *ioe) +{ + int ret = 0; + + if (ioe->has_freebind) + ret |= restore_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); + + if (ioe->raw) + ret |= restore_ip_raw_opts(sk, family, proto, ioe->raw); + return ret; +} +static int open_inet_sk(struct file_desc *d, int *new_fd) +{ + struct fdinfo_list_entry *fle = file_master(d); + struct inet_sk_info *ii; + InetSkEntry *ie; + int sk, yes = 1; + + if (fle->stage >= FLE_OPEN) + return post_open_inet_sk(d, fle->fe->fd); + + ii = container_of(d, struct inet_sk_info, d); + ie = ii->ie; + + show_one_inet_img("Restore", ie); + + if (ie->family != AF_INET && ie->family != AF_INET6) { + pr_err("Unsupported socket family: %d\n", ie->family); + return -1; + } + + if ((ie->type != SOCK_STREAM) && (ie->type != SOCK_DGRAM) && (ie->type != SOCK_RAW)) { + pr_err("Unsupported socket type: %d\n", ie->type); + return -1; + } + + if (inet_validate_address(ie)) + return -1; + + if (set_netns(ie->ns_id)) + return -1; + + if (run_setsockcreatecon(fle->fe)) + return -1; + + sk = socket(ie->family, ie->type, ie->proto); + if (sk < 0) { + pr_perror("Can't create inet socket"); + return -1; + } + + if (reset_setsockcreatecon()) + goto err; + + if (ie->v6only) { + if (restore_opt(sk, SOL_IPV6, IPV6_V6ONLY, &yes) == -1) + goto err; + } + + /* + * Set SO_REUSEADDR, because some sockets can be bound to one addr. + * The origin value of SO_REUSEADDR will be restored in post_open. 
+ */ + if (restore_opt(sk, SOL_SOCKET, SO_REUSEADDR, &yes)) + goto err; + if (restore_opt(sk, SOL_SOCKET, SO_REUSEPORT, &yes)) + goto err; + + if (tcp_connection(ie)) { + if (!opts.tcp_established_ok && !opts.tcp_close) { + pr_err("Connected TCP socket in image\n"); + goto err; + } + + mutex_lock(&ii->port->reuseaddr_lock); + if (restore_one_tcp(sk, ii)) { + mutex_unlock(&ii->port->reuseaddr_lock); + goto err; + } + mutex_unlock(&ii->port->reuseaddr_lock); + + goto done; + } + + if (ie->src_port) { + if (inet_bind(sk, ii)) + goto err; + } + + /* + * Listen sockets are easiest ones -- simply + * bind() and listen(), and that's all. + */ + if (ie->state == TCP_LISTEN) { + if (ie->proto != IPPROTO_TCP) { + pr_err("Wrong socket in listen state %d\n", ie->proto); + goto err; + } + + mutex_lock(&ii->port->reuseaddr_lock); + if (listen(sk, ie->backlog) == -1) { + pr_perror("Can't listen on a socket"); + mutex_unlock(&ii->port->reuseaddr_lock); + goto err; + } + mutex_unlock(&ii->port->reuseaddr_lock); + } + + if (ie->dst_port && + inet_connect(sk, ii)) + goto err; +done: + dec_users_and_wake(ii->port); + + if (rst_file_params(sk, ie->fown, ie->flags)) + goto err; + + if (ie->ip_opts && restore_ip_opts(sk, ie->family, ie->proto, ie->ip_opts)) + goto err; + + if (restore_socket_opts(sk, ie->opts)) + goto err; + + if (ie->has_shutdown && + (ie->proto == IPPROTO_UDP || + ie->proto == IPPROTO_UDPLITE)) { + if (shutdown(sk, sk_decode_shutdown(ie->shutdown))) { + if (ie->state != TCP_CLOSE && errno != ENOTCONN) { + pr_perror("Can't shutdown socket into %d", + sk_decode_shutdown(ie->shutdown)); + goto err; + } else { + pr_debug("Called shutdown on closed socket, " + "proto %d ino %x", ie->proto, ie->ino); + } + } + } + + *new_fd = sk; + + return 1; +err: + close(sk); + return -1; +} + +int restore_sockaddr(union libsoccr_addr *sa, + int family, u32 pb_port, u32 *pb_addr, u32 ifindex) +{ + BUILD_BUG_ON(sizeof(sa->v4.sin_addr.s_addr) > PB_ALEN_INET * sizeof(u32)); + 
BUILD_BUG_ON(sizeof(sa->v6.sin6_addr.s6_addr) > PB_ALEN_INET6 * sizeof(u32)); + + memzero(sa, sizeof(*sa)); + + if (family == AF_INET) { + sa->v4.sin_family = AF_INET; + sa->v4.sin_port = htons(pb_port); + memcpy(&sa->v4.sin_addr.s_addr, pb_addr, sizeof(sa->v4.sin_addr.s_addr)); + return sizeof(sa->v4); + } + + if (family == AF_INET6) { + sa->v6.sin6_family = AF_INET6; + sa->v6.sin6_port = htons(pb_port); + memcpy(sa->v6.sin6_addr.s6_addr, pb_addr, sizeof(sa->v6.sin6_addr.s6_addr)); + + /* Here although the struct member is called scope_id, the + * kernel really wants ifindex. See + * /net/ipv6/af_inet6.c:inet6_bind for details. + */ + sa->v6.sin6_scope_id = ifindex; + return sizeof(sa->v6); + } + + BUG(); + return -1; +} + +int inet_bind(int sk, struct inet_sk_info *ii) +{ + bool rst_freebind = false; + union libsoccr_addr addr; + int addr_size, ifindex = 0; + + if (ii->ie->ifname) { + ifindex = if_nametoindex(ii->ie->ifname); + if (!ifindex) { + pr_err("couldn't find ifindex for %s\n", ii->ie->ifname); + return -1; + } + } + + addr_size = restore_sockaddr(&addr, ii->ie->family, + ii->ie->src_port, ii->ie->src_addr, ifindex); + + /* + * ipv6 addresses go through a “tentative” phase and + * sockets could not be bound to them in this moment + * without setting IP_FREEBIND. 
+ */ + if (ii->ie->family == AF_INET6 && ii->ie->type != SOCK_RAW) { + int yes = 1; + + if (restore_opt(sk, SOL_IP, IP_FREEBIND, &yes)) + return -1; + + if (ii->ie->ip_opts && ii->ie->ip_opts->freebind) + /* + * The right value is already set, so + * don't need to restore it in restore_ip_opts() + */ + ii->ie->ip_opts->has_freebind = false; + else + rst_freebind = true; + } + + if (bind(sk, (struct sockaddr *)&addr, addr_size) == -1) { + pr_perror("Can't bind inet socket (id %d)", ii->ie->id); + return -1; + } + + if (rst_freebind) { + int no = 0; + + /* + * The "no" value is default, so it will not be + * restore in restore_ip_opts() + */ + if (restore_opt(sk, SOL_IP, IP_FREEBIND, &no)) + return -1; + } + + return 0; +} + +int inet_connect(int sk, struct inet_sk_info *ii) +{ + union libsoccr_addr addr; + int addr_size; + + addr_size = restore_sockaddr(&addr, ii->ie->family, + ii->ie->dst_port, ii->ie->dst_addr, 0); + + if (connect(sk, (struct sockaddr *)&addr, addr_size) == -1) { + pr_perror("Can't connect inet socket back"); + return -1; + } + + return 0; +} diff --git a/CRIU_code/criu/sk-netlink.c b/CRIU_code/criu/sk-netlink.c new file mode 100644 index 0000000..e4163f5 --- /dev/null +++ b/CRIU_code/criu/sk-netlink.c @@ -0,0 +1,273 @@ +#include +#include +#include +#include + +#include "imgset.h" +#include "files.h" +#include "sockets.h" +#include "util.h" + +#include "protobuf.h" +#include "images/sk-netlink.pb-c.h" +#include "netlink_diag.h" +#include "libnetlink.h" +#include "namespaces.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "netlink: " + +struct netlink_sk_desc { + struct socket_desc sd; + u32 portid; + u32 *groups; + u32 gsize; + u32 dst_portid; + u32 dst_group; + u8 state; + u8 protocol; +}; + +int netlink_receive_one(struct nlmsghdr *hdr, struct ns_id *ns, void *arg) +{ + struct nlattr *tb[NETLINK_DIAG_MAX+1]; + struct netlink_diag_msg *m; + struct netlink_sk_desc *sd; + unsigned long *groups; + + m = NLMSG_DATA(hdr); + pr_debug("Collect netlink sock 
0x%x\n", m->ndiag_ino); + + sd = xmalloc(sizeof(*sd)); + if (!sd) + return -1; + + sd->protocol = m->ndiag_protocol; + sd->portid = m->ndiag_portid; + sd->dst_portid = m->ndiag_dst_portid; + sd->dst_group = m->ndiag_dst_group; + sd->state = m->ndiag_state; + + nlmsg_parse(hdr, sizeof(struct netlink_diag_msg), tb, NETLINK_DIAG_MAX, NULL); + + if (tb[NETLINK_DIAG_GROUPS]) { + sd->gsize = nla_len(tb[NETLINK_DIAG_GROUPS]); + groups = nla_data(tb[NETLINK_DIAG_GROUPS]); + + sd->groups = xmalloc(sd->gsize); + if (!sd->groups) { + xfree(sd); + return -1; + } + memcpy(sd->groups, groups, sd->gsize); + } else { + sd->groups = NULL; + sd->gsize = 0; + } + + return sk_collect_one(m->ndiag_ino, PF_NETLINK, &sd->sd, ns); +} + +static bool can_dump_netlink_sk(int lfd) +{ + int ret; + + ret = fd_has_data(lfd); + if (ret == 1) + pr_err("The socket has data to read\n"); + + return ret == 0; +} + +static int dump_one_netlink_fd(int lfd, u32 id, const struct fd_parms *p) +{ + struct netlink_sk_desc *sk; + FileEntry fe = FILE_ENTRY__INIT; + NetlinkSkEntry ne = NETLINK_SK_ENTRY__INIT; + SkOptsEntry skopts = SK_OPTS_ENTRY__INIT; + + sk = (struct netlink_sk_desc *)lookup_socket(p->stat.st_ino, PF_NETLINK, 0); + if (IS_ERR(sk)) + goto err; + + ne.id = id; + ne.ino = p->stat.st_ino; + + if (!can_dump_netlink_sk(lfd)) + goto err; + + if (sk) { + BUG_ON(sk->sd.already_dumped); + + ne.ns_id = sk->sd.sk_ns->id; + ne.has_ns_id = true; + ne.protocol = sk->protocol; + ne.portid = sk->portid; + ne.groups = sk->groups; + + + ne.n_groups = sk->gsize / sizeof(ne.groups[0]); + /* + * On 64-bit sk->gsize is multiple to 8 bytes (sizeof(long)), + * so remove the last 4 bytes if they are empty. 
+ */ +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + /* + * Big endian swap: Ugly hack for zdtm/static/sk-netlink + * + * For big endian systems: + * + * - sk->groups[0] are bits 32-64 + * - sk->groups[1] are bits 0-32 + */ + if (ne.n_groups == 2) { + uint32_t tmp = sk->groups[1]; + + sk->groups[1] = sk->groups[0]; + sk->groups[0] = tmp; + } +#endif + if (ne.n_groups && sk->groups[ne.n_groups - 1] == 0) + ne.n_groups -= 1; + + if (ne.n_groups > 1) { + pr_err("%d %x\n", sk->gsize, sk->groups[1]); + pr_err("The netlink socket 0x%x has more than 32 groups\n", ne.ino); + return -1; + } + if (sk->groups && !sk->portid) { + pr_err("The netlink socket 0x%x is bound to groups but not to portid\n", ne.ino); + return -1; + } + ne.state = sk->state; + ne.dst_portid = sk->dst_portid; + ne.dst_group = sk->dst_group; + } else { /* unconnected and unbound socket */ + struct ns_id *nsid; + int val; + socklen_t aux = sizeof(val); + + if (root_ns_mask & CLONE_NEWNET) { + nsid = get_socket_ns(lfd); + if (nsid == NULL) + return -1; + ne.ns_id = nsid->id; + ne.has_ns_id = true; + } + + if (getsockopt(lfd, SOL_SOCKET, SO_PROTOCOL, &val, &aux) < 0) { + pr_perror("Unable to get protocol for netlink socket"); + goto err; + } + + ne.protocol = val; + } + + ne.fown = (FownEntry *)&p->fown; + ne.opts = &skopts; + + if (dump_socket_opts(lfd, &skopts)) + goto err; + + fe.type = FD_TYPES__NETLINKSK; + fe.id = ne.id; + fe.nlsk = ≠ + + if (pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE)) + goto err; + + return 0; +err: + return -1; +} + +const struct fdtype_ops netlink_dump_ops = { + .type = FD_TYPES__NETLINKSK, + .dump = dump_one_netlink_fd, +}; + +struct netlink_sock_info { + NetlinkSkEntry *nse; + struct file_desc d; +}; + +static int open_netlink_sk(struct file_desc *d, int *new_fd) +{ + struct netlink_sock_info *nsi; + NetlinkSkEntry *nse; + struct sockaddr_nl addr; + int sk = -1; + + nsi = container_of(d, struct netlink_sock_info, d); + nse = nsi->nse; + + pr_info("Opening 
netlink socket id %#x\n", nse->id); + + if (set_netns(nse->ns_id)) + return -1; + + sk = socket(PF_NETLINK, SOCK_RAW, nse->protocol); + if (sk < 0) { + pr_perror("Can't create netlink sock"); + goto err; + } + + if (nse->portid) { + memset(&addr, 0, sizeof(addr)); + addr.nl_family = AF_NETLINK; + if (nse->n_groups > 1) { + pr_err("Groups above 32 are not supported yet\n"); + goto err; + } + if (nse->n_groups) + addr.nl_groups = nse->groups[0]; + addr.nl_pid = nse->portid; + + if (bind(sk, (struct sockaddr *)&addr, sizeof(addr)) < 0) { + pr_perror("Can't bind netlink socket"); + goto err; + } + } + + if (nse->state == NETLINK_CONNECTED) { + addr.nl_family = AF_NETLINK; + addr.nl_groups = 1 << (nse->dst_group - 1); + addr.nl_pid = nse->dst_portid; + if (connect(sk, (struct sockaddr *)&addr, sizeof(addr)) < 0) { + pr_perror("Can't connect netlink socket"); + goto err; + } + } + + if (rst_file_params(sk, nse->fown, nse->flags)) + goto err; + + if (restore_socket_opts(sk, nse->opts)) + goto err; + + *new_fd = sk; + return 0; +err: + close(sk); + return -1; +} + +static struct file_desc_ops netlink_sock_desc_ops = { + .type = FD_TYPES__NETLINKSK, + .open = open_netlink_sk, +}; + +static int collect_one_netlink_sk(void *o, ProtobufCMessage *base, struct cr_img *i) +{ + struct netlink_sock_info *si = o; + + si->nse = pb_msg(base, NetlinkSkEntry); + return file_desc_add(&si->d, si->nse->id, &netlink_sock_desc_ops); +} + +struct collect_image_info netlink_sk_cinfo = { + .fd_type = CR_FD_NETLINK_SK, + .pb_type = PB_NETLINK_SK, + .priv_size = sizeof(struct netlink_sock_info), + .collect = collect_one_netlink_sk, +}; diff --git a/CRIU_code/criu/sk-packet.c b/CRIU_code/criu/sk-packet.c new file mode 100644 index 0000000..0abe840 --- /dev/null +++ b/CRIU_code/criu/sk-packet.c @@ -0,0 +1,583 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "imgset.h" +#include "files.h" +#include "sockets.h" +#include "libnetlink.h" +#include 
"sk-packet.h" +#include "packet_diag.h" +#include "vma.h" +#include + +#include "protobuf.h" +#include "xmalloc.h" +#include "images/packet-sock.pb-c.h" +#include "images/fdinfo.pb-c.h" +#include "namespaces.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "packet: " + +struct packet_sock_info { + PacketSockEntry *pse; + struct file_desc d; +}; + +struct packet_mreq_max { + int mr_ifindex; + unsigned short mr_type; + unsigned short mr_alen; + unsigned char mr_address[MAX_ADDR_LEN]; +}; + +struct packet_sock_desc { + struct socket_desc sd; + unsigned int file_id; + unsigned int type; + unsigned short proto; + struct packet_diag_info nli; + int mreq_n; + struct packet_diag_mclist *mreqs; + unsigned int fanout; + struct packet_diag_ring *rx, *tx; +}; + +#define NO_FANOUT ((unsigned int)-1) + +static int dump_mreqs(PacketSockEntry *psk, struct packet_sock_desc *sd) +{ + int i; + + if (!sd->mreq_n) + return 0; + + pr_debug("\tdumping %d mreqs\n", sd->mreq_n); + psk->mclist = xmalloc(sd->mreq_n * sizeof(psk->mclist[0])); + if (!psk->mclist) + return -1; + + for (i = 0; i < sd->mreq_n; i++) { + struct packet_diag_mclist *m = &sd->mreqs[i]; + PacketMclist *im; + + if (m->pdmc_count != 1) { + pr_err("Multiple MC membership not supported (but can be)\n"); + goto err; + } + + pr_debug("\tmr%d: idx %d type %d\n", i, + m->pdmc_index, m->pdmc_type); + + im = xmalloc(sizeof(*im)); + if (!im) + goto err; + + packet_mclist__init(im); + psk->mclist[i] = im; + psk->n_mclist++; + + im->index = m->pdmc_index; + im->type = m->pdmc_type; + + switch (m->pdmc_type) { + case PACKET_MR_MULTICAST: + case PACKET_MR_UNICAST: + im->addr.len = m->pdmc_alen; + im->addr.data = xmalloc(m->pdmc_alen); + if (!im->addr.data) + goto err; + + memcpy(im->addr.data, m->pdmc_addr, m->pdmc_alen); + break; + case PACKET_MR_PROMISC: + case PACKET_MR_ALLMULTI: + break; + default: + pr_err("Unknown mc membership type %d\n", m->pdmc_type); + goto err; + } + } + + return 0; +err: + return -1; +} + +static PacketRing 
*dump_ring(struct packet_diag_ring *dr) +{ + PacketRing *ring; + + ring = xmalloc(sizeof(*ring)); + if (!ring) + return NULL; + + packet_ring__init(ring); + + ring->block_size = dr->pdr_block_size; + ring->block_nr = dr->pdr_block_nr; + ring->frame_size = dr->pdr_frame_size; + ring->frame_nr = dr->pdr_frame_nr; + ring->retire_tmo = dr->pdr_retire_tmo; + ring->sizeof_priv = dr->pdr_sizeof_priv; + ring->features = dr->pdr_features; + + return ring; +} + +static int dump_rings(PacketSockEntry *psk, struct packet_sock_desc *sd) +{ + if (sd->rx) { + psk->rx_ring = dump_ring(sd->rx); + if (!psk->rx_ring) + return -1; + } + + if (sd->tx) { + psk->tx_ring = dump_ring(sd->tx); + if (!psk->tx_ring) + return -1; + } + + return 0; +} + +static int dump_one_packet_fd(int lfd, u32 id, const struct fd_parms *p) +{ + FileEntry fe = FILE_ENTRY__INIT; + PacketSockEntry psk = PACKET_SOCK_ENTRY__INIT; + SkOptsEntry skopts = SK_OPTS_ENTRY__INIT; + struct packet_sock_desc *sd; + int i, ret; + + sd = (struct packet_sock_desc *)lookup_socket(p->stat.st_ino, PF_PACKET, 0); + if (IS_ERR_OR_NULL(sd)) { + pr_err("Can't find packet socket %"PRIu64"\n", p->stat.st_ino); + return -1; + } + + pr_info("Dumping packet socket fd %d id %#x\n", lfd, id); + BUG_ON(sd->sd.already_dumped); + sd->sd.already_dumped = 1; + + psk.id = sd->file_id = id; + psk.ns_id = sd->sd.sk_ns->id; + psk.has_ns_id = true; + psk.type = sd->type; + psk.flags = p->flags; + psk.fown = (FownEntry *)&p->fown; + psk.opts = &skopts; + + if (dump_socket_opts(lfd, &skopts)) + return -1; + + psk.protocol = sd->proto; + psk.ifindex = sd->nli.pdi_index; + psk.version = sd->nli.pdi_version; + psk.reserve = sd->nli.pdi_reserve; + psk.timestamp = sd->nli.pdi_tstamp; + psk.copy_thresh = sd->nli.pdi_copy_thresh; + psk.aux_data = (sd->nli.pdi_flags & PDI_AUXDATA ? true : false); + psk.orig_dev = (sd->nli.pdi_flags & PDI_ORIGDEV ? true : false); + psk.vnet_hdr = (sd->nli.pdi_flags & PDI_VNETHDR ? 
true : false); + psk.loss = (sd->nli.pdi_flags & PDI_LOSS ? true : false); + + ret = dump_mreqs(&psk, sd); + if (ret) + goto out; + + if (sd->fanout != NO_FANOUT) { + psk.has_fanout = true; + psk.fanout = sd->fanout; + } + + ret = dump_rings(&psk, sd); + if (ret) + goto out; + + fe.type = FD_TYPES__PACKETSK; + fe.id = psk.id; + fe.psk = &psk; + + ret = pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); +out: + release_skopts(&skopts); + xfree(psk.rx_ring); + xfree(psk.tx_ring); + for (i = 0; i < psk.n_mclist; i++) + xfree(psk.mclist[i]->addr.data); + xfree(psk.mclist); + return ret; +} + +const struct fdtype_ops packet_dump_ops = { + .type = FD_TYPES__PACKETSK, + .dump = dump_one_packet_fd, +}; + +int dump_socket_map(struct vma_area *vma) +{ + struct packet_sock_desc *sd; + + sd = (struct packet_sock_desc *)lookup_socket(vma->vm_socket_id, PF_PACKET, 0); + if (IS_ERR_OR_NULL(sd)) { + pr_err("Can't find packet socket %u to mmap\n", vma->vm_socket_id); + return -1; + } + + if (!sd->file_id) { + pr_err("Mmap-ed socket %u not open\n", vma->vm_socket_id); + return -1; + } + + pr_info("Dumping socket map %x -> %"PRIx64"\n", sd->file_id, vma->e->start); + vma->e->shmid = sd->file_id; + return 0; +} + +static int packet_save_mreqs(struct packet_sock_desc *sd, struct nlattr *mc) +{ + sd->mreq_n = nla_len(mc) / sizeof(struct packet_diag_mclist); + pr_debug("\tGot %d mreqs\n", sd->mreq_n); + sd->mreqs = xmalloc(nla_len(mc)); + if (!sd->mreqs) + return -1; + + memcpy(sd->mreqs, nla_data(mc), nla_len(mc)); + return 0; +} + +int packet_receive_one(struct nlmsghdr *hdr, struct ns_id *ns, void *arg) +{ + struct packet_diag_msg *m; + struct nlattr *tb[PACKET_DIAG_MAX + 1]; + struct packet_sock_desc *sd; + + m = NLMSG_DATA(hdr); + nlmsg_parse(hdr, sizeof(struct packet_diag_msg), + tb, PACKET_DIAG_MAX, NULL); + pr_info("Collect packet sock %u %u\n", m->pdiag_ino, (unsigned int)m->pdiag_num); + + if (!tb[PACKET_DIAG_INFO]) { + pr_err("No packet sock info in nlm\n"); + 
return -1; + } + + if (!tb[PACKET_DIAG_MCLIST]) { + pr_err("No packet sock mclist in nlm\n"); + return -1; + } + + sd = xmalloc(sizeof(*sd)); + if (!sd) + return -1; + + sd->file_id = 0; + sd->type = m->pdiag_type; + sd->proto = htons(m->pdiag_num); + sd->rx = NULL; + sd->tx = NULL; + memcpy(&sd->nli, nla_data(tb[PACKET_DIAG_INFO]), sizeof(sd->nli)); + + if (packet_save_mreqs(sd, tb[PACKET_DIAG_MCLIST])) + goto err; + + if (tb[PACKET_DIAG_FANOUT]) + sd->fanout = *(__u32 *)RTA_DATA(tb[PACKET_DIAG_FANOUT]); + else + sd->fanout = NO_FANOUT; + + if (tb[PACKET_DIAG_RX_RING]) { + sd->rx = xmalloc(sizeof(*sd->rx)); + if (sd->rx == NULL) + goto err; + memcpy(sd->rx, RTA_DATA(tb[PACKET_DIAG_RX_RING]), sizeof(*sd->rx)); + } + + if (tb[PACKET_DIAG_TX_RING]) { + sd->tx = xmalloc(sizeof(*sd->tx)); + if (sd->tx == NULL) + goto err; + memcpy(sd->tx, RTA_DATA(tb[PACKET_DIAG_TX_RING]), sizeof(*sd->tx)); + } + + return sk_collect_one(m->pdiag_ino, PF_PACKET, &sd->sd, ns); +err: + xfree(sd->tx); + xfree(sd->rx); + xfree(sd); + return -1; +} + +static int open_socket_map(int pid, struct vma_area *vm) +{ + VmaEntry *vma = vm->e; + struct file_desc *fd; + struct fdinfo_list_entry *le; + + pr_info("Getting packet socket fd for %d:%x\n", + pid, (int)vma->shmid); + fd = find_file_desc_raw(FD_TYPES__PACKETSK, vma->shmid); + if (!fd) { + pr_err("No packet socket %x\n", (int)vma->shmid); + return -1; + } + + list_for_each_entry(le, &fd->fd_info_head, desc_list) + if (le->pid == pid) { + int fd; + + /* + * Restorer will close the mmap-ed fd + */ + + fd = dup(le->fe->fd); + if (fd < 0) { + pr_perror("Can't dup packet sk"); + return -1; + } + + vma->fd = fd; + return 0; + } + + pr_err("No open packet socket %x by %d\n", (int)vma->shmid, pid); + return -1; +} + +int collect_socket_map(struct vma_area *vma) +{ + vma->vm_open = open_socket_map; + return 0; +} + +static int restore_mreqs(int sk, PacketSockEntry *pse) +{ + int i; + + for (i = 0; i < pse->n_mclist; i++) { + PacketMclist *ml; + struct 
packet_mreq_max mreq; + + ml = pse->mclist[i]; + pr_info("Restoring mreq type %d\n", ml->type); + + if (ml->addr.len > sizeof(mreq.mr_address)) { + pr_err("To big mcaddr %zu\n", ml->addr.len); + return -1; + } + + mreq.mr_ifindex = ml->index; + mreq.mr_type = ml->type; + mreq.mr_alen = ml->addr.len; + memcpy(mreq.mr_address, ml->addr.data, ml->addr.len); + + if (restore_opt(sk, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq)) + return -1; + } + + return 0; +} + +static int restore_ring(int sk, int type, PacketRing *ring) +{ + struct tpacket_req3 req; + + if (!ring) + return 0; + + pr_debug("\tRestoring %d ring\n", type); + + req.tp_block_size = ring->block_size; + req.tp_block_nr = ring->block_nr; + req.tp_frame_size = ring->frame_size; + req.tp_frame_nr = ring->frame_nr; + req.tp_retire_blk_tov = ring->retire_tmo; + req.tp_sizeof_priv = ring->sizeof_priv; + req.tp_feature_req_word = ring->features; + + return restore_opt(sk, SOL_PACKET, type, &req); +} + +static int restore_rings(int sk, PacketSockEntry *psk) +{ + if (restore_ring(sk, PACKET_RX_RING, psk->rx_ring)) + return -1; + + if (restore_ring(sk, PACKET_TX_RING, psk->tx_ring)) + return -1; + + return 0; +} + +static int open_packet_sk_spkt(PacketSockEntry *pse, int *new_fd) +{ + struct sockaddr addr_spkt; + int sk; + + sk = socket(PF_PACKET, pse->type, pse->protocol); + if (sk < 0) { + pr_perror("Can't create packet socket"); + return -1; + } + + memset(&addr_spkt, 0, sizeof(addr_spkt)); + addr_spkt.sa_family = AF_PACKET; + + // if the socket was bound to any device + if (pse->ifindex > 0) { + const size_t sa_data_size = sizeof(addr_spkt.sa_data); + struct ifreq req; + + memset(&req, 0, sizeof(req)); + req.ifr_ifindex = pse->ifindex; + + if (ioctl(sk, SIOCGIFNAME, &req) < 0) { + pr_perror("Can't get interface name (ifindex %d)", pse->ifindex); + goto err; + } + + memcpy(addr_spkt.sa_data, req.ifr_name, sa_data_size); + addr_spkt.sa_data[sa_data_size - 1] = 0; + + if (bind(sk, &addr_spkt, sizeof(addr_spkt)) < 0) { 
+ pr_perror("Can't bind packet socket to %s", req.ifr_name); + goto err; + } + } + + if (rst_file_params(sk, pse->fown, pse->flags)) + goto err; + + if (restore_socket_opts(sk, pse->opts)) + goto err; + + *new_fd = sk; + return 0; + +err: + close(sk); + return -1; +} + +static int open_packet_sk(struct file_desc *d, int *new_fd) +{ + struct packet_sock_info *psi; + PacketSockEntry *pse; + struct sockaddr_ll addr; + int sk, yes; + + psi = container_of(d, struct packet_sock_info, d); + pse = psi->pse; + + pr_info("Opening packet socket id %#x\n", pse->id); + + if (set_netns(pse->ns_id)) + return -1; + + if (pse->type == SOCK_PACKET) + return open_packet_sk_spkt(pse, new_fd); + + sk = socket(PF_PACKET, pse->type, pse->protocol); + if (sk < 0) { + pr_perror("Can't create packet sock"); + goto err; + } + + memset(&addr, 0, sizeof(addr)); + addr.sll_family = AF_PACKET; + addr.sll_ifindex = pse->ifindex; + + if (bind(sk, (struct sockaddr *)&addr, sizeof(addr)) < 0) { + pr_perror("Can't bind packet socket"); + goto err_cl; + } + + if (restore_opt(sk, SOL_PACKET, PACKET_VERSION, &pse->version)) + goto err_cl; + + if (restore_opt(sk, SOL_PACKET, PACKET_RESERVE, &pse->reserve)) + goto err_cl; + + if (restore_opt(sk, SOL_PACKET, PACKET_TIMESTAMP, &pse->timestamp)) + goto err_cl; + + if (restore_opt(sk, SOL_PACKET, PACKET_COPY_THRESH, &pse->copy_thresh)) + goto err_cl; + + if (pse->aux_data) { + yes = 1; + if (restore_opt(sk, SOL_PACKET, PACKET_AUXDATA, &yes)) + goto err_cl; + } + + if (pse->orig_dev) { + yes = 1; + if (restore_opt(sk, SOL_PACKET, PACKET_ORIGDEV, &yes)) + goto err_cl; + } + + if (pse->vnet_hdr) { + yes = 1; + if (restore_opt(sk, SOL_PACKET, PACKET_VNET_HDR, &yes)) + goto err_cl; + } + + if (pse->loss) { + yes = 1; + if (restore_opt(sk, SOL_PACKET, PACKET_LOSS, &yes)) + goto err_cl; + } + + if (restore_mreqs(sk, pse)) + goto err_cl; + + if (restore_rings(sk, pse)) + goto err_cl; + + if (pse->has_fanout) { + pr_info("Restoring fanout %x\n", pse->fanout); + if 
(restore_opt(sk, SOL_PACKET, PACKET_FANOUT, &pse->fanout)) + goto err_cl; + } + + if (rst_file_params(sk, pse->fown, pse->flags)) + goto err_cl; + + if (restore_socket_opts(sk, pse->opts)) + goto err_cl; + + *new_fd = sk; + return 0; + +err_cl: + close(sk); +err: + return -1; +} + +static struct file_desc_ops packet_sock_desc_ops = { + .type = FD_TYPES__PACKETSK, + .open = open_packet_sk, +}; + +static int collect_one_packet_sk(void *o, ProtobufCMessage *base, struct cr_img *i) +{ + struct packet_sock_info *si = o; + + si->pse = pb_msg(base, PacketSockEntry); + return file_desc_add(&si->d, si->pse->id, &packet_sock_desc_ops); +} + +struct collect_image_info packet_sk_cinfo = { + .fd_type = CR_FD_PACKETSK, + .pb_type = PB_PACKET_SOCK, + .priv_size = sizeof(struct packet_sock_info), + .collect = collect_one_packet_sk, +}; diff --git a/CRIU_code/criu/sk-queue.c b/CRIU_code/criu/sk-queue.c new file mode 100644 index 0000000..776eb5a --- /dev/null +++ b/CRIU_code/criu/sk-queue.c @@ -0,0 +1,394 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "common/list.h" +#include "imgset.h" +#include "image.h" +#include "servicefd.h" +#include "cr_options.h" +#include "util.h" +#include "util-pie.h" +#include "sockets.h" +#include "xmalloc.h" +#include "sk-queue.h" +#include "files.h" +#include "protobuf.h" +#include "images/sk-packet.pb-c.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "skqueue: " + +struct sk_packet { + struct list_head list; + SkPacketEntry *entry; + char *data; + unsigned scm_len; + int *scm; +}; + +static LIST_HEAD(packets_list); + +static int collect_one_packet(void *obj, ProtobufCMessage *msg, struct cr_img *img) +{ + struct sk_packet *pkt = obj; + + pkt->entry = pb_msg(msg, SkPacketEntry); + pkt->scm = NULL; + pkt->data = xmalloc(pkt->entry->length); + if (pkt->data ==NULL) + return -1; + + /* + * See dump_packet_cmsg() -- only SCM_RIGHTS are supported and + * only 1 of that kind is possible, thus not 
more than 1 SCMs + * on a packet. + */ + if (pkt->entry->n_scm > 1) { + pr_err("More than 1 SCM is not possible\n"); + return -1; + } + + /* + * NOTE: packet must be added to the tail. Otherwise sequence + * will be broken. + */ + list_add_tail(&pkt->list, &packets_list); + + if (read_img_buf(img, pkt->data, pkt->entry->length) != 1) { + xfree(pkt->data); + pr_perror("Unable to read packet data"); + return -1; + } + + return 0; +} + +struct collect_image_info sk_queues_cinfo = { + .fd_type = CR_FD_SK_QUEUES, + .pb_type = PB_SK_QUEUES, + .priv_size = sizeof(struct sk_packet), + .collect = collect_one_packet, +}; + +static int dump_scm_rights(struct cmsghdr *ch, SkPacketEntry *pe) +{ + int nr_fds, *fds, i; + void *buf; + ScmEntry *scme; + + nr_fds = (ch->cmsg_len - sizeof(*ch)) / sizeof(int); + fds = (int *)CMSG_DATA(ch); + + buf = xmalloc(sizeof(ScmEntry) + nr_fds * sizeof(uint32_t)); + if (!buf) + return -1; + + scme = xptr_pull(&buf, ScmEntry); + scm_entry__init(scme); + scme->type = SCM_RIGHTS; + scme->n_rights = nr_fds; + scme->rights = xptr_pull_s(&buf, nr_fds * sizeof(uint32_t)); + + for (i = 0; i < nr_fds; i++) { + int ftyp; + + if (dump_my_file(fds[i], &scme->rights[i], &ftyp)) + return -1; + } + + i = pe->n_scm++; + if (xrealloc_safe(&pe->scm, pe->n_scm * sizeof(ScmEntry*))) + return -1; + + pe->scm[i] = scme; + return 0; +} + +/* + * Maximum size of the control messages. XXX -- is there any + * way to get this value out of the kernel? + * */ +#define CMSG_MAX_SIZE 1024 + +static int dump_packet_cmsg(struct msghdr *mh, SkPacketEntry *pe) +{ + struct cmsghdr *ch; + int n_rights = 0; + + for (ch = CMSG_FIRSTHDR(mh); ch; ch = CMSG_NXTHDR(mh, ch)) { + if (ch->cmsg_type == SCM_RIGHTS) { + if (n_rights) { + /* + * Even if user is sending more than one cmsg with + * rights, kernel merges them altogether on recv. 
+ */ + pr_err("Unexpected 2nd SCM_RIGHTS from the kernel\n"); + return -1; + } + + if (dump_scm_rights(ch, pe)) + return -1; + + n_rights++; + continue; + } + + pr_err("Control messages in queue, not supported\n"); + return -1; + } + + return 0; +} + +static void release_cmsg(SkPacketEntry *pe) +{ + int i; + + for (i = 0; i < pe->n_scm; i++) + xfree(pe->scm[i]); + xfree(pe->scm); + + pe->n_scm = 0; + pe->scm = NULL; +} + +int dump_sk_queue(int sock_fd, int sock_id) +{ + SkPacketEntry pe = SK_PACKET_ENTRY__INIT; + int ret, size, orig_peek_off; + void *data; + socklen_t tmp; + + /* + * Save original peek offset. + */ + tmp = sizeof(orig_peek_off); + orig_peek_off = 0; + ret = getsockopt(sock_fd, SOL_SOCKET, SO_PEEK_OFF, &orig_peek_off, &tmp); + if (ret < 0) { + pr_perror("getsockopt failed"); + return ret; + } + /* + * Discover max DGRAM size + */ + tmp = sizeof(size); + size = 0; + ret = getsockopt(sock_fd, SOL_SOCKET, SO_SNDBUF, &size, &tmp); + if (ret < 0) { + pr_perror("getsockopt failed"); + return ret; + } + + /* Note: 32 bytes will be used by kernel for protocol header. */ + size -= 32; + + /* + * Allocate data for a stream. + */ + data = xmalloc(size); + if (!data) + return -1; + + /* + * Enable peek offset incrementation. + */ + ret = setsockopt(sock_fd, SOL_SOCKET, SO_PEEK_OFF, &ret, sizeof(int)); + if (ret < 0) { + pr_perror("setsockopt fail"); + goto err_brk; + } + + pe.id_for = sock_id; + + while (1) { + char cmsg[CMSG_MAX_SIZE]; + struct iovec iov = { + .iov_base = data, + .iov_len = size, + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = &cmsg, + .msg_controllen = sizeof(cmsg), + }; + + ret = pe.length = recvmsg(sock_fd, &msg, MSG_DONTWAIT | MSG_PEEK); + if (!ret) + /* + * It means, that peer has performed an + * orderly shutdown, so we're done. 
+ */ + break; + else if (ret < 0) { + if (errno == EAGAIN) + break; /* we're done */ + pr_perror("recvmsg fail: error"); + goto err_set_sock; + } + if (msg.msg_flags & MSG_TRUNC) { + /* + * DGRAM truncated. This should not happen. But we have + * to check... + */ + pr_err("sys_recvmsg failed: truncated\n"); + ret = -E2BIG; + goto err_set_sock; + } + + if (dump_packet_cmsg(&msg, &pe)) + goto err_set_sock; + + ret = pb_write_one(img_from_set(glob_imgset, CR_FD_SK_QUEUES), &pe, PB_SK_QUEUES); + if (ret < 0) { + ret = -EIO; + goto err_set_sock; + } + + ret = write_img_buf(img_from_set(glob_imgset, CR_FD_SK_QUEUES), data, pe.length); + if (ret < 0) { + ret = -EIO; + goto err_set_sock; + } + + if (pe.scm) + release_cmsg(&pe); + } + ret = 0; + +err_set_sock: + /* + * Restore original peek offset. + */ + if (setsockopt(sock_fd, SOL_SOCKET, SO_PEEK_OFF, &orig_peek_off, sizeof(int))) { + pr_perror("setsockopt failed on restore"); + ret = -1; + } + if (pe.scm) + release_cmsg(&pe); +err_brk: + xfree(data); + return ret; +} + +static int send_one_pkt(int fd, struct sk_packet *pkt) +{ + int ret; + SkPacketEntry *entry = pkt->entry; + struct msghdr mh = {}; + struct iovec iov; + + mh.msg_iov = &iov; + mh.msg_iovlen = 1; + iov.iov_base = pkt->data; + iov.iov_len = entry->length; + + if (pkt->scm != NULL) { + mh.msg_controllen = pkt->scm_len; + mh.msg_control = pkt->scm; + } + + /* + * Don't try to use sendfile here, because it use sendpage() and + * all data are split on pages and a new skb is allocated for + * each page. It creates a big overhead on SNDBUF. + * sendfile() isn't suitable for DGRAM sockets, because message + * boundaries messages should be saved. 
+ */ + + ret = sendmsg(fd, &mh, 0); + xfree(pkt->data); + if (ret < 0) { + pr_perror("Failed to send packet"); + return -1; + } + if (ret != entry->length) { + pr_err("Restored skb trimmed to %d/%d\n", + ret, (unsigned int)entry->length); + return -1; + } + + return 0; +} + +int restore_sk_queue(int fd, unsigned int peer_id) +{ + struct sk_packet *pkt, *tmp; + int ret = -1; + + pr_info("Trying to restore recv queue for %u\n", peer_id); + + if (restore_prepare_socket(fd)) + goto out; + + list_for_each_entry_safe(pkt, tmp, &packets_list, list) { + SkPacketEntry *entry = pkt->entry; + + if (entry->id_for != peer_id) + continue; + + pr_info("\tRestoring %d-bytes skb for %u\n", + (unsigned int)entry->length, peer_id); + + ret = send_one_pkt(fd, pkt); + if (ret) + goto out; + + list_del(&pkt->list); + sk_packet_entry__free_unpacked(entry, NULL); + xfree(pkt); + } + + ret = 0; +out: + return ret; +} + +int prepare_scms(void) +{ + struct sk_packet *pkt; + + pr_info("Preparing SCMs\n"); + list_for_each_entry(pkt, &packets_list, list) { + SkPacketEntry *pe = pkt->entry; + ScmEntry *se; + struct cmsghdr *ch; + + if (!pe->n_scm) + continue; + + se = pe->scm[0]; /* Only 1 SCM is possible */ + + if (se->type == SCM_RIGHTS) { + pkt->scm_len = CMSG_SPACE(se->n_rights * sizeof(int)); + pkt->scm = xmalloc(pkt->scm_len); + if (!pkt->scm) + return -1; + + ch = (struct cmsghdr *)pkt->scm; /* FIXME -- via msghdr */ + ch->cmsg_level = SOL_SOCKET; + ch->cmsg_type = SCM_RIGHTS; + ch->cmsg_len = CMSG_LEN(se->n_rights * sizeof(int)); + + if (unix_note_scm_rights(pe->id_for, se->rights, + (int *)CMSG_DATA(ch), se->n_rights)) + return -1; + + continue; + } + + pr_err("Unsupported scm %d in image\n", se->type); + return -1; + } + + return 0; +} diff --git a/CRIU_code/criu/sk-tcp.c b/CRIU_code/criu/sk-tcp.c new file mode 100644 index 0000000..4fd2eb8 --- /dev/null +++ b/CRIU_code/criu/sk-tcp.c @@ -0,0 +1,448 @@ +#include +#include +#include +#include +#include +#include +#include + +#include 
"../soccr/soccr.h" + +#include "common/config.h" +#include "cr_options.h" +#include "util.h" +#include "common/list.h" +#include "log.h" +#include "files.h" +#include "sockets.h" +#include "sk-inet.h" +#include "netfilter.h" +#include "image.h" +#include "namespaces.h" +#include "xmalloc.h" +#include "kerndat.h" +#include "restorer.h" +#include "rst-malloc.h" + +#include "protobuf.h" +#include "images/tcp-stream.pb-c.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "tcp: " + +static LIST_HEAD(cpt_tcp_repair_sockets); +static LIST_HEAD(rst_tcp_repair_sockets); + +static int tcp_repair_established(int fd, struct inet_sk_desc *sk) +{ + int ret; + struct libsoccr_sk *socr; + + pr_info("\tTurning repair on for socket %x\n", sk->sd.ino); + /* + * Keep the socket open in criu till the very end. In + * case we close this fd after one task fd dumping and + * fail we'll have to turn repair mode off + */ + sk->rfd = dup(fd); + if (sk->rfd < 0) { + pr_perror("Can't save socket fd for repair"); + goto err1; + } + + if (!(root_ns_mask & CLONE_NEWNET)) { + ret = nf_lock_connection(sk); + if (ret < 0) + goto err2; + } + + socr = libsoccr_pause(sk->rfd); + if (!socr) + goto err3; + + sk->priv = socr; + list_add_tail(&sk->rlist, &cpt_tcp_repair_sockets); + return 0; + +err3: + if (!(root_ns_mask & CLONE_NEWNET)) + nf_unlock_connection(sk); +err2: + close(sk->rfd); +err1: + return -1; +} + +static void tcp_unlock_one(struct inet_sk_desc *sk) +{ + int ret; + + list_del(&sk->rlist); + + if (!(root_ns_mask & CLONE_NEWNET)) { + ret = nf_unlock_connection(sk); + if (ret < 0) + pr_perror("Failed to unlock TCP connection"); + } + + libsoccr_resume(sk->priv); + sk->priv = NULL; + + /* + * tcp_repair_off modifies SO_REUSEADDR so + * don't forget to restore original value. 
+ */ + restore_opt(sk->rfd, SOL_SOCKET, SO_REUSEADDR, &sk->cpt_reuseaddr); + + close(sk->rfd); +} + +void cpt_unlock_tcp_connections(void) +{ + struct inet_sk_desc *sk, *n; + + list_for_each_entry_safe(sk, n, &cpt_tcp_repair_sockets, rlist) + tcp_unlock_one(sk); +} + +static int dump_tcp_conn_state(struct inet_sk_desc *sk) +{ + struct libsoccr_sk *socr = sk->priv; + int ret, aux; + struct cr_img *img; + TcpStreamEntry tse = TCP_STREAM_ENTRY__INIT; + char *buf; + struct libsoccr_sk_data data; + + ret = libsoccr_save(socr, &data, sizeof(data)); + if (ret < 0) { + pr_err("libsoccr_save() failed with %d\n", ret); + goto err_r; + } + if (ret != sizeof(data)) { + pr_err("This libsocr is not supported (%d vs %d)\n", + ret, (int)sizeof(data)); + goto err_r; + } + + sk->state = data.state; + + tse.inq_len = data.inq_len; + tse.inq_seq = data.inq_seq; + tse.outq_len = data.outq_len; + tse.outq_seq = data.outq_seq; + tse.unsq_len = data.unsq_len; + tse.has_unsq_len = true; + tse.mss_clamp = data.mss_clamp; + tse.opt_mask = data.opt_mask; + + if (tse.opt_mask & TCPI_OPT_WSCALE) { + tse.snd_wscale = data.snd_wscale; + tse.rcv_wscale = data.rcv_wscale; + tse.has_rcv_wscale = true; + } + if (tse.opt_mask & TCPI_OPT_TIMESTAMPS) { + tse.timestamp = data.timestamp; + tse.has_timestamp = true; + } + + if (data.flags & SOCCR_FLAGS_WINDOW) { + tse.has_snd_wl1 = true; + tse.has_snd_wnd = true; + tse.has_max_window = true; + tse.has_rcv_wnd = true; + tse.has_rcv_wup = true; + tse.snd_wl1 = data.snd_wl1; + tse.snd_wnd = data.snd_wnd; + tse.max_window = data.max_window; + tse.rcv_wnd = data.rcv_wnd; + tse.rcv_wup = data.rcv_wup; + } + + /* + * TCP socket options + */ + + if (dump_opt(sk->rfd, SOL_TCP, TCP_NODELAY, &aux)) + goto err_opt; + + if (aux) { + tse.has_nodelay = true; + tse.nodelay = true; + } + + if (dump_opt(sk->rfd, SOL_TCP, TCP_CORK, &aux)) + goto err_opt; + + if (aux) { + tse.has_cork = true; + tse.cork = true; + } + + /* + * Push the stuff to image + */ + + img = 
open_image(CR_FD_TCP_STREAM, O_DUMP, sk->sd.ino); + if (!img) + goto err_img; + + ret = pb_write_one(img, &tse, PB_TCP_STREAM); + if (ret < 0) + goto err_iw; + + buf = libsoccr_get_queue_bytes(socr, TCP_RECV_QUEUE, SOCCR_MEM_EXCL); + if (buf) { + ret = write_img_buf(img, buf, tse.inq_len); + if (ret < 0) + goto err_iw; + + xfree(buf); + } + + buf = libsoccr_get_queue_bytes(socr, TCP_SEND_QUEUE, SOCCR_MEM_EXCL); + if (buf) { + ret = write_img_buf(img, buf, tse.outq_len); + if (ret < 0) + goto err_iw; + + xfree(buf); + } + + pr_info("Done\n"); +err_iw: + close_image(img); +err_img: +err_opt: +err_r: + return ret; +} + +int dump_one_tcp(int fd, struct inet_sk_desc *sk) +{ + if (sk->dst_port == 0) + return 0; + + pr_info("Dumping TCP connection\n"); + + if (tcp_repair_established(fd, sk)) + return -1; + + if (dump_tcp_conn_state(sk)) + return -1; + + /* + * Socket is left in repair mode, so that at the end it's just + * closed and the connection is silently terminated + */ + return 0; +} + +static int read_tcp_queue(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, + int queue, u32 len, struct cr_img *img) +{ + char *buf; + + buf = xmalloc(len); + if (!buf) + return -1; + + if (read_img_buf(img, buf, len) < 0) + goto err; + + return libsoccr_set_queue_bytes(sk, queue, buf, SOCCR_MEM_EXCL); + +err: + xfree(buf); + return -1; +} + +static int read_tcp_queues(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, struct cr_img *img) +{ + u32 len; + + len = data->inq_len; + if (len && read_tcp_queue(sk, data, TCP_RECV_QUEUE, len, img)) + return -1; + + len = data->outq_len; + if (len && read_tcp_queue(sk, data, TCP_SEND_QUEUE, len, img)) + return -1; + + return 0; +} + +static int restore_tcp_conn_state(int sk, struct libsoccr_sk *socr, struct inet_sk_info *ii) +{ + int aux; + struct cr_img *img; + TcpStreamEntry *tse; + struct libsoccr_sk_data data = {}; + union libsoccr_addr sa_src, sa_dst; + + pr_info("Restoring TCP connection id %x ino %x\n", ii->ie->id, 
ii->ie->ino); + + img = open_image(CR_FD_TCP_STREAM, O_RSTR, ii->ie->ino); + if (!img) + goto err; + + if (pb_read_one(img, &tse, PB_TCP_STREAM) < 0) + goto err_c; + + if (!tse->has_unsq_len) { + pr_err("No unsq len in the image\n"); + goto err_c; + } + + data.state = ii->ie->state; + data.inq_len = tse->inq_len; + data.inq_seq = tse->inq_seq; + data.outq_len = tse->outq_len; + data.outq_seq = tse->outq_seq; + data.unsq_len = tse->unsq_len; + data.mss_clamp = tse->mss_clamp; + data.opt_mask = tse->opt_mask; + if (tse->opt_mask & TCPI_OPT_WSCALE) { + if (!tse->has_rcv_wscale) { + pr_err("No rcv wscale in the image\n"); + goto err_c; + } + + data.snd_wscale = tse->snd_wscale; + data.rcv_wscale = tse->rcv_wscale; + } + if (tse->opt_mask & TCPI_OPT_TIMESTAMPS) { + if (!tse->has_timestamp) { + pr_err("No timestamp in the image\n"); + goto err_c; + } + + data.timestamp = tse->timestamp; + } + + if (tse->has_snd_wnd) { + data.flags |= SOCCR_FLAGS_WINDOW; + data.snd_wl1 = tse->snd_wl1; + data.snd_wnd = tse->snd_wnd; + data.max_window = tse->max_window; + data.rcv_wnd = tse->rcv_wnd; + data.rcv_wup = tse->rcv_wup; + } + + if (restore_sockaddr(&sa_src, + ii->ie->family, ii->ie->src_port, + ii->ie->src_addr, 0) < 0) + goto err_c; + if (restore_sockaddr(&sa_dst, + ii->ie->family, ii->ie->dst_port, + ii->ie->dst_addr, 0) < 0) + goto err_c; + + libsoccr_set_addr(socr, 1, &sa_src, 0); + libsoccr_set_addr(socr, 0, &sa_dst, 0); + + /* + * O_NONBLOCK has to be set before libsoccr_restore(), + * it is required to restore syn-sent sockets. 
+ */ + if (restore_prepare_socket(sk)) + goto err_c; + + if (read_tcp_queues(socr, &data, img)) + goto err_c; + + if (libsoccr_restore(socr, &data, sizeof(data))) + goto err_c; + + if (tse->has_nodelay && tse->nodelay) { + aux = 1; + if (restore_opt(sk, SOL_TCP, TCP_NODELAY, &aux)) + goto err_c; + } + + if (tse->has_cork && tse->cork) { + aux = 1; + if (restore_opt(sk, SOL_TCP, TCP_CORK, &aux)) + goto err_c; + } + + tcp_stream_entry__free_unpacked(tse, NULL); + close_image(img); + return 0; + +err_c: + tcp_stream_entry__free_unpacked(tse, NULL); + close_image(img); +err: + return -1; +} + +int prepare_tcp_socks(struct task_restore_args *ta) +{ + struct inet_sk_info *ii; + + ta->tcp_socks = (struct rst_tcp_sock *) rst_mem_align_cpos(RM_PRIVATE); + ta->tcp_socks_n = 0; + + list_for_each_entry(ii, &rst_tcp_repair_sockets, rlist) { + struct rst_tcp_sock *rs; + + /* + * rst_tcp_repair_sockets contains all sockets, so we need to + * select sockets which restored in a current process. + */ + if (ii->sk_fd == -1) + continue; + + rs = rst_mem_alloc(sizeof(*rs), RM_PRIVATE); + if (!rs) + return -1; + + rs->sk = ii->sk_fd; + rs->reuseaddr = ii->ie->opts->reuseaddr; + ta->tcp_socks_n++; + } + + return 0; +} + +int restore_one_tcp(int fd, struct inet_sk_info *ii) +{ + struct libsoccr_sk *sk; + + pr_info("Restoring TCP connection\n"); + + if (opts.tcp_close && + ii->ie->state != TCP_LISTEN && ii->ie->state != TCP_CLOSE) { + return 0; + } + + sk = libsoccr_pause(fd); + if (!sk) + return -1; + + if (restore_tcp_conn_state(fd, sk, ii)) { + libsoccr_release(sk); + return -1; + } + + return 0; +} + +void tcp_locked_conn_add(struct inet_sk_info *ii) +{ + list_add_tail(&ii->rlist, &rst_tcp_repair_sockets); + ii->sk_fd = -1; +} + +void rst_unlock_tcp_connections(void) +{ + struct inet_sk_info *ii; + + /* Network will be unlocked by network-unlock scripts */ + if (root_ns_mask & CLONE_NEWNET) + return; + + list_for_each_entry(ii, &rst_tcp_repair_sockets, rlist) + 
nf_unlock_connection_info(ii); +} diff --git a/CRIU_code/criu/sk-unix.c b/CRIU_code/criu/sk-unix.c new file mode 100644 index 0000000..f0620e6 --- /dev/null +++ b/CRIU_code/criu/sk-unix.c @@ -0,0 +1,2331 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libnetlink.h" +#include "cr_options.h" +#include "imgset.h" +#include "unix_diag.h" +#include "files.h" +#include "file-ids.h" +#include "log.h" +#include "util.h" +#include "util-pie.h" +#include "sockets.h" +#include "sk-queue.h" +#include "mount.h" +#include "cr-service.h" +#include "plugin.h" +#include "namespaces.h" +#include "pstree.h" +#include "external.h" +#include "crtools.h" +#include "fdstore.h" +#include "fdinfo.h" +#include "kerndat.h" +#include "rst-malloc.h" + +#include "protobuf.h" +#include "images/sk-unix.pb-c.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "unix: " + +/* + * By-default, when dumping a unix socket, we should dump its peer + * as well. Which in turn means, we should dump the task(s) that have + * this peer opened. + * + * Sometimes, we can break this rule and dump only one end of the + * unix sockets pair, and on restore time connect() this end back to + * its peer. + * + * So, to resolve this situation we mark the peers we don't dump + * as "external" and require the --ext-unix-sk option. 
+ */ + +#define USK_EXTERN (1 << 0) +#define USK_SERVICE (1 << 1) +#define USK_CALLBACK (1 << 2) +#define USK_INHERIT (1 << 3) + +#define FAKE_INO 0 + +struct unix_sk_desc { + struct socket_desc sd; + unsigned int type; + unsigned int state; + unsigned int peer_ino; + unsigned int rqlen; + unsigned int wqlen; + unsigned int namelen; + char *name; + unsigned int nr_icons; + unsigned int *icons; + + unsigned int vfs_dev; + unsigned int vfs_ino; + + unsigned char shutdown; + bool deleted; + + mode_t mode; + uid_t uid; + gid_t gid; + + struct list_head list; + + int fd; + struct list_head peer_list; + struct list_head peer_node; + + UnixSkEntry *ue; +}; + +/* + * The mutex_ghost is accessed from different tasks, + * so make sure it is in shared memory. + */ +static mutex_t *mutex_ghost; + +static LIST_HEAD(unix_sockets); +static LIST_HEAD(unix_ghost_addr); + +static int unix_resolve_name(int lfd, uint32_t id, struct unix_sk_desc *d, + UnixSkEntry *ue, const struct fd_parms *p); + +struct unix_sk_info; +static int unlink_sk(struct unix_sk_info *ui); + +struct unix_sk_listen_icon { + unsigned int peer_ino; + struct unix_sk_desc *sk_desc; + struct unix_sk_listen_icon *next; +}; + +#define SK_HASH_SIZE 32 + +static struct unix_sk_listen_icon *unix_listen_icons[SK_HASH_SIZE]; + +static struct unix_sk_listen_icon *lookup_unix_listen_icons(unsigned int peer_ino) +{ + struct unix_sk_listen_icon *ic; + + for (ic = unix_listen_icons[peer_ino % SK_HASH_SIZE]; + ic; ic = ic->next) + if (ic->peer_ino == peer_ino) + return ic; + return NULL; +} + +static void show_one_unix(char *act, const struct unix_sk_desc *sk) +{ + pr_debug("\t%s: ino %d peer_ino %d family %4d type %4d state %2d name %s\n", + act, sk->sd.ino, sk->peer_ino, sk->sd.family, sk->type, sk->state, sk->name); + + if (sk->nr_icons) { + int i; + + for (i = 0; i < sk->nr_icons; i++) + pr_debug("\t\ticon: %d\n", sk->icons[i]); + } +} + +static void show_one_unix_img(const char *act, const UnixSkEntry *e) +{ + 
pr_info("\t%s: id %#x ino %d peer %d type %d state %d name %d bytes\n", + act, e->id, e->ino, e->peer, e->type, e->state, (int)e->name.len); +} + +static int can_dump_unix_sk(const struct unix_sk_desc *sk) +{ + /* + * The last case in this "if" is seqpacket socket, + * that is connected to cr_service. We will dump + * it properly below. + */ + if (sk->type != SOCK_STREAM && + sk->type != SOCK_DGRAM && + sk->type != SOCK_SEQPACKET) { + pr_err("Unsupported type (%d) on socket %d.\n" + "Only stream/dgram/seqpacket are supported.\n", + sk->type, sk->sd.ino); + return 0; + } + + switch (sk->state) { + case TCP_LISTEN: + case TCP_ESTABLISHED: + case TCP_CLOSE: + break; + default: + pr_err("Unknown state %d for unix socket %d\n", + sk->state, sk->sd.ino); + return 0; + } + + return 1; +} + +static bool unix_sk_exception_lookup_id(unsigned int ino) +{ + char id[20]; + + snprintf(id, sizeof(id), "unix[%u]", ino); + if (external_lookup_id(id)) { + pr_debug("Found ino %u in exception unix sk list\n", (unsigned int)ino); + return true; + } + + return false; +} + +static int write_unix_entry(struct unix_sk_desc *sk) +{ + int ret; + FileEntry fe = FILE_ENTRY__INIT; + + fe.type = FD_TYPES__UNIXSK; + fe.id = sk->ue->id; + fe.usk = sk->ue; + + ret = pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); + + show_one_unix_img("Dumped", sk->ue); + + release_skopts(sk->ue->opts); + xfree(sk->ue); + + sk->ue = NULL; + + return ret; +} + +#ifndef SIOCUNIXFILE +#define SIOCUNIXFILE (SIOCPROTOPRIVATE + 0) /* open a socket file with O_PATH */ +#endif + +int kerndat_socket_unix_file(void) +{ + int sk, fd; + + sk = socket(AF_UNIX, SOCK_DGRAM, 0); + if (sk < 0) { + pr_perror("Unable to create socket"); + return -1; + } + fd = ioctl(sk, SIOCUNIXFILE); + if (fd < 0 && errno != ENOENT) { + pr_warn("Unable to open a socket file: %m\n"); + kdat.sk_unix_file = false; + close(sk); + return 0; + } + close(sk); + close_safe(&fd); + + kdat.sk_unix_file = true; + + return 0; +} + +static 
int get_mnt_id(int lfd, int *mnt_id) +{ + struct fdinfo_common fdinfo = { .mnt_id = -1 }; + int ret, fd; + + fd = ioctl(lfd, SIOCUNIXFILE); + if (fd < 0) { + pr_perror("Unable to get a socker file descriptor"); + return -1; + } + + ret = parse_fdinfo(fd, FD_TYPES__UND, &fdinfo); + close(fd); + if (ret < 0) + return -1; + + *mnt_id = fdinfo.mnt_id; + + return 0; +} + +static int resolve_rel_name(uint32_t id, struct unix_sk_desc *sk, const struct fd_parms *p, char **pdir) +{ + const char *dirs[] = { "cwd", "root" }; + struct pstree_item *task; + int mntns_root, i; + struct ns_id *ns; + + task = pstree_item_by_real(p->pid); + if (!task) { + pr_err("Can't find task with pid %d\n", p->pid); + return -ENOENT; + } + + ns = lookup_ns_by_id(task->ids->mnt_ns_id, &mnt_ns_desc); + if (!ns) { + pr_err("Can't resolve mount namespace for pid %d\n", p->pid); + return -ENOENT; + } + + mntns_root = mntns_get_root_fd(ns); + if (mntns_root < 0) { + pr_err("Can't resolve fs root for pid %d\n", p->pid); + return -ENOENT; + } + + pr_debug("Resolving relative name %s for socket %d\n", + sk->name, sk->sd.ino); + + for (i = 0; i < ARRAY_SIZE(dirs); i++) { + char dir[PATH_MAX], path[PATH_MAX]; + struct stat st; + int ret; + + snprintf(path, sizeof(path), "/proc/%d/%s", p->pid, dirs[i]); + ret = readlink(path, dir, sizeof(dir)); + if (ret < 0 || (size_t)ret == sizeof(dir)) { + pr_err("Can't readlink for %s\n", dirs[i]); + return -1; + } + dir[ret] = 0; + + if (snprintf(path, sizeof(path), ".%s/%s", dir, sk->name) >= sizeof(path)) { + pr_err("The path .%s/%s is too long\n", dir, sk->name); + goto err; + } + if (fstatat(mntns_root, path, &st, 0)) { + if (errno == ENOENT) + continue; + goto err; + } + + if ((st.st_ino == sk->vfs_ino) && + phys_stat_dev_match(st.st_dev, sk->vfs_dev, ns, &path[1])) { + *pdir = xstrdup(dir); + if (!*pdir) + return -ENOMEM; + + pr_debug("Resolved relative socket name to dir %s\n", *pdir); + sk->mode = st.st_mode; + sk->uid = st.st_uid; + sk->gid = st.st_gid; + 
return 0; + } + } + +err: + pr_err("Can't resolve name for socket %#x\n", id); + return -ENOENT; +} + +static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) +{ + struct unix_sk_desc *sk, *peer; + UnixSkEntry *ue; + SkOptsEntry *skopts; + FilePermsEntry *perms; + FownEntry *fown; + void *m; + + m = xmalloc(sizeof(UnixSkEntry) + + sizeof(SkOptsEntry) + + sizeof(FilePermsEntry) + + sizeof(FownEntry)); + if (!m) + return -ENOMEM; + ue = xptr_pull(&m, UnixSkEntry); + skopts = xptr_pull(&m, SkOptsEntry); + perms = xptr_pull(&m, FilePermsEntry); + fown = xptr_pull(&m, FownEntry); + + unix_sk_entry__init(ue); + sk_opts_entry__init(skopts); + file_perms_entry__init(perms); + + *fown = p->fown; + + sk = (struct unix_sk_desc *)lookup_socket(p->stat.st_ino, PF_UNIX, 0); + if (IS_ERR_OR_NULL(sk)) { + pr_err("Unix socket %d not found\n", (int)p->stat.st_ino); + goto err; + } + + if (!can_dump_unix_sk(sk)) + goto err; + + BUG_ON(sk->sd.already_dumped); + + ue->name.len = (size_t)sk->namelen; + ue->name.data = (void *)sk->name; + + ue->id = id; + ue->ino = sk->sd.ino; + ue->ns_id = sk->sd.sk_ns->id; + ue->has_ns_id = true; + ue->type = sk->type; + ue->state = sk->state; + ue->flags = p->flags; + ue->backlog = sk->wqlen; + ue->peer = sk->peer_ino; + ue->fown = fown; + ue->opts = skopts; + ue->uflags = 0; + + if (unix_resolve_name(lfd, id, sk, ue, p)) + goto err; + + /* + * Check if this socket is connected to criu service. + * Dump it like closed one and mark it for restore. 
+ */ + if (unlikely(ue->peer == service_sk_ino)) { + ue->state = TCP_CLOSE; + ue->peer = 0; + ue->uflags |= USK_SERVICE; + } + + if (sk->namelen && *sk->name) { + ue->file_perms = perms; + + perms->mode = sk->mode; + perms->uid = userns_uid(sk->uid); + perms->gid = userns_gid(sk->gid); + } + + if (sk->deleted) { + ue->has_deleted = true; + ue->deleted = sk->deleted; + } + + sk_encode_shutdown(ue, sk->shutdown); + + /* + * If a stream listening socket has non-zero rqueue, this + * means there are in-flight connections waiting to get + * accept()-ed. We handle them separately with the "icons" + * (i stands for in-flight, cons -- for connections) things. + */ + if (sk->rqlen != 0 && !(sk->type == SOCK_STREAM && + sk->state == TCP_LISTEN)) { + if (dump_sk_queue(lfd, id)) + goto err; + } + + if (ue->peer) { + peer = (struct unix_sk_desc *)lookup_socket(ue->peer, PF_UNIX, 0); + if (IS_ERR_OR_NULL(peer)) { + pr_err("Unix socket %d without peer %d\n", + ue->ino, ue->peer); + goto err; + } + + /* + * Peer should have us as peer or have a name by which + * we can access one. + */ + if (peer->peer_ino != ue->ino) { + if (!peer->name) { + pr_err("Unix socket %d with unreachable peer %d (%d)\n", + ue->ino, ue->peer, peer->peer_ino); + goto err; + } + } + + /* + * It can be external socket, so we defer dumping + * until all sockets the program owns are processed. 
+ */ + if (!peer->sd.already_dumped) { + show_one_unix("Add a peer", peer); + list_add(&sk->peer_node, &peer->peer_list); + sk->fd = dup(lfd); + if (sk->fd < 0) { + pr_perror("Unable to dup(%d)", lfd); + goto err; + } + } + + if ((ue->type != SOCK_DGRAM) && ( + ((ue->shutdown == SK_SHUTDOWN__READ) && + (peer->shutdown != SK_SHUTDOWN__WRITE)) || + ((ue->shutdown == SK_SHUTDOWN__WRITE) && + (peer->shutdown != SK_SHUTDOWN__READ)) || + ((ue->shutdown == SK_SHUTDOWN__BOTH) && + (peer->shutdown != SK_SHUTDOWN__BOTH)) )) { + /* + * Usually this doesn't happen, however it's possible if + * socket was shut down before connect() (see sockets03.c test). + * On restore we will shutdown both end (iow sockets will be in + * matched state). This shouldn't be a problem, since kernel seems + * to check both ends on read()/write(). Thus mismatched sockets behave + * the same way as matched. + */ + pr_warn("Shutdown mismatch %d:%d -> %d:%d\n", + ue->ino, ue->shutdown, peer->sd.ino, peer->shutdown); + } + } else if (ue->state == TCP_ESTABLISHED) { + const struct unix_sk_listen_icon *e; + + e = lookup_unix_listen_icons(ue->ino); + if (!e) { + /* + * ESTABLISHED socket without peer and without + * anyone waiting for it should be semi-closed + * connection. + */ + + if (ue->shutdown == SK_SHUTDOWN__BOTH) { + pr_info("Dumping semi-closed connection\n"); + goto dump; + } + + pr_err("Dangling connection %d\n", ue->ino); + goto err; + } + + /* + * If this is in-flight connection we need to figure + * out where to connect it on restore. Thus, tune up peer + * id by searching an existing listening socket. + * + * Note the socket name will be found at restore stage, + * not now, just to reduce size of dump files. 
+ */ + + /* e->sk_desc is _never_ NULL */ + if (e->sk_desc->state != TCP_LISTEN) { + pr_err("In-flight connection on " + "non-listening socket %d\n", ue->ino); + goto err; + } + + ue->peer = e->sk_desc->sd.ino; + + pr_debug("\t\tFixed inflight socket %d peer %d)\n", + ue->ino, ue->peer); + } +dump: + if (dump_socket_opts(lfd, skopts)) + goto err; + + pr_info("Dumping unix socket at %d\n", p->fd); + show_one_unix("Dumping", sk); + + sk->ue = ue; + /* + * Postpone writing the entry if a peer isn't found yet. + * It's required, because we may need to modify the entry. + * For example, if a socket is external and is dumped by + * a callback, the USK_CALLBACK flag must be set. + */ + if (list_empty(&sk->peer_node) && write_unix_entry(sk)) + return -1; + + sk->sd.already_dumped = 1; + + while (!list_empty(&sk->peer_list)) { + struct unix_sk_desc *psk; + psk = list_first_entry(&sk->peer_list, struct unix_sk_desc, peer_node); + close_safe(&psk->fd); + list_del_init(&psk->peer_node); + + if (write_unix_entry(psk)) + return -1; + psk->sd.already_dumped = 1; + } + + return 0; + +err: + release_skopts(skopts); + xfree(ue); + return -1; +} + +const struct fdtype_ops unix_dump_ops = { + .type = FD_TYPES__UNIXSK, + .dump = dump_one_unix_fd, +}; + +static int unix_resolve_name(int lfd, uint32_t id, struct unix_sk_desc *d, + UnixSkEntry *ue, const struct fd_parms *p) +{ + char *name = d->name; + bool deleted = false; + char rpath[PATH_MAX]; + struct ns_id *ns; + struct stat st; + int mntns_root; + int ret, mnt_id; + + if (d->namelen == 0 || name[0] == '\0') + return 0; + + if (kdat.sk_unix_file && (root_ns_mask & CLONE_NEWNS)) { + if (get_mnt_id(lfd, &mnt_id)) + return -1; + ue->mnt_id = mnt_id; + ue->has_mnt_id = mnt_id; + } + + if (ue->mnt_id >= 0) + ns = lookup_nsid_by_mnt_id(ue->mnt_id); + else + ns = lookup_ns_by_id(root_item->ids->mnt_ns_id, &mnt_ns_desc); + if (!ns) { + ret = -ENOENT; + goto out; + } + + mntns_root = mntns_get_root_fd(ns); + if (mntns_root < 0) { + ret = 
-ENOENT; + goto out; + } + + if (name[0] != '/') { + /* + * Relative names are be resolved later at first + * dump attempt. + */ + + ret = resolve_rel_name(id, d, p, &ue->name_dir); + if (ret < 0) + goto out; + goto postprone; + } + + snprintf(rpath, sizeof(rpath), ".%s", name); + if (fstatat(mntns_root, rpath, &st, 0)) { + if (errno != ENOENT) { + pr_warn("Can't stat socket %#x(%s), skipping: %m (err %d)\n", + id, rpath, errno); + goto skip; + } + + pr_info("unix: Dropping path %s for unlinked sk %#x\n", + name, id); + deleted = true; + } else if ((st.st_ino != d->vfs_ino) || + !phys_stat_dev_match(st.st_dev, d->vfs_dev, ns, name)) { + pr_info("unix: Dropping path %s for unlinked bound " + "sk %#x.%d real %#x.%d\n", + name, (int)st.st_dev, (int)st.st_ino, + (int)d->vfs_dev, (int)d->vfs_ino); + deleted = true; + } + + d->mode = st.st_mode; + d->uid = st.st_uid; + d->gid = st.st_gid; + + d->deleted = deleted; + +postprone: + return 0; + +out: + xfree(name); + return ret; +skip: + ret = 1; + goto out; +} + +/* + * Returns: < 0 on error, 0 if OK, 1 to skip the socket + */ +static int unix_process_name(struct unix_sk_desc *d, const struct unix_diag_msg *m, struct nlattr **tb) +{ + int len; + char *name; + + len = nla_len(tb[UNIX_DIAG_NAME]); + name = xmalloc(len + 1); + if (!name) + return -ENOMEM; + + memcpy(name, nla_data(tb[UNIX_DIAG_NAME]), len); + name[len] = '\0'; + + if (name[0]) { + struct unix_diag_vfs *uv; + + if (!tb[UNIX_DIAG_VFS]) { + pr_err("Bound socket w/o inode %d\n", m->udiag_ino); + goto skip; + } + + uv = RTA_DATA(tb[UNIX_DIAG_VFS]); + d->vfs_dev = uv->udiag_vfs_dev; + d->vfs_ino = uv->udiag_vfs_ino; + } + + d->namelen = len; + d->name = name; + return 0; +skip: + xfree(name); + return 1; +} + +static int unix_collect_one(const struct unix_diag_msg *m, + struct nlattr **tb, struct ns_id *ns) +{ + struct unix_sk_desc *d; + int ret = 0; + + d = xzalloc(sizeof(*d)); + if (!d) + return -1; + + d->type = m->udiag_type; + d->state = m->udiag_state; + 
/*
 * Netlink callback: handle one unix_diag message.
 *
 * The attributes following the unix_diag_msg header are parsed into a
 * table indexed by UNIX_DIAG_* and passed, together with the header and
 * @ns, to unix_collect_one(). @arg is unused.
 *
 * Returns the result of unix_collect_one(), or -1 when the message
 * cannot be parsed.
 */
int unix_receive_one(struct nlmsghdr *h, struct ns_id *ns, void *arg)
{
	struct unix_diag_msg *m = NLMSG_DATA(h);
	/* Zeroed so that an attribute is never read from stack garbage */
	struct nlattr *tb[UNIX_DIAG_MAX + 1] = {};

	/*
	 * NOTE(review): assumes nlmsg_parse() reports failure with a
	 * negative return value -- confirm against libnetlink.
	 */
	if (nlmsg_parse(h, sizeof(struct unix_diag_msg), tb, UNIX_DIAG_MAX, NULL) < 0) {
		pr_err("Can't parse unix diag message\n");
		return -1;
	}

	return unix_collect_one(m, tb, ns);
}
run_plugins(DUMP_UNIX_SK, sk->fd, sk->sd.ino); + if (ret < 0 && ret != -ENOTSUP) + return -1; + + if (ret == 0) { + sk->ue->uflags |= USK_CALLBACK; + return 0; + } + + if (unix_sk_exception_lookup_id(sk->sd.ino)) { + pr_debug("found exception for unix name-less external socket.\n"); + return 0; + } + + /* Legacy -x|--ext-unix-sk option handling */ + if (!opts.ext_unix_sk) { + show_one_unix("Runaway socket", peer); + pr_err("External socket is used. " + "Consider using --" USK_EXT_PARAM " option.\n"); + return -1; + } + + if (peer->type != SOCK_DGRAM) { + show_one_unix("Ext stream not supported", peer); + pr_err("Can't dump half of stream unix connection.\n"); + return -1; + } + + if (!peer->name) { + show_one_unix("Ext dgram w/o name", peer); + pr_err("Can't dump name-less external socket.\n"); + pr_err("%d\n", sk->fd); + return -1; + } + + return 0; +} + +static int dump_external_sockets(struct unix_sk_desc *peer) +{ + struct unix_sk_desc *sk; + + while (!list_empty(&peer->peer_list)) { + sk = list_first_entry(&peer->peer_list, struct unix_sk_desc, peer_node); + + if (__dump_external_socket(sk, peer)) + return -1; + + if (write_unix_entry(sk)) + return -1; + close_safe(&sk->fd); + + list_del_init(&sk->peer_node); + } + + return 0; +} + +int fix_external_unix_sockets(void) +{ + struct unix_sk_desc *sk; + + pr_debug("Dumping external sockets\n"); + + list_for_each_entry(sk, &unix_sockets, list) { + FileEntry fe = FILE_ENTRY__INIT; + UnixSkEntry e = UNIX_SK_ENTRY__INIT; + FownEntry fown = FOWN_ENTRY__INIT; + SkOptsEntry skopts = SK_OPTS_ENTRY__INIT; + + if (sk->sd.already_dumped || + list_empty(&sk->peer_list)) + continue; + + show_one_unix("Dumping extern", sk); + + fd_id_generate_special(NULL, &e.id); + e.ino = sk->sd.ino; + e.type = SOCK_DGRAM; + e.state = TCP_LISTEN; + e.name.data = (void *)sk->name; + e.name.len = (size_t)sk->namelen; + e.uflags = USK_EXTERN; + e.peer = 0; + e.fown = &fown; + e.opts = &skopts; + + fe.type = FD_TYPES__UNIXSK; + fe.id = e.id; + 
/*
 * Restore-side description of one unix socket read from the image.
 * Instances are initialized by init_unix_sk_info() and linked into
 * the unix_sockets list.
 */
struct unix_sk_info {
	UnixSkEntry *ue;		/* image entry this socket is restored from */
	struct list_head list;		/* link in the unix_sockets list */
	char *name;			/* points at ue->name.data, NULL when ue->name.len is 0 */
	char *name_dir;			/* ue->name_dir; directory to chdir() into before bind/connect */
	unsigned flags;			/* USK_PAIR_MASTER / USK_PAIR_SLAVE / USK_GHOST_FDSTORE */
	int fdstore_id;			/* id returned by fdstore_add() for a ghost socket, -1 initially */
	struct unix_sk_info *peer;	/* resolved peer, NULL until set_peer() */
	struct pprep_head peer_resolve; /* XXX : union with the above? */
	struct file_desc d;		/* file descriptor object registered with file_desc_add() */
	struct list_head connected; /* List of sockets, connected to me */
	struct list_head node; /* To link in peer's connected list */
	struct list_head scm_fles;	/* list of struct scm_fle that must be open before restore */
	struct list_head ghost_node;	/* link in the unix_ghost_addr list (deleted-address sockets) */
	size_t ghost_dir_pos;		/* offset in name of the deepest directory that already existed */

	/*
	 * For DGRAM sockets with queues, we should only restore the queue
	 * once although it may be open by more than one tid. This is the peer
	 * that should do the queueing.
	 */
	struct unix_sk_info *queuer;
	/*
	 * These bits are set by the task that owns this unix_sk_info.
	 * Other tasks may only read them.
	 */
	uint8_t bound:1;
	uint8_t listen:1;
	uint8_t is_connected:1;
	uint8_t peer_queue_restored:1; /* Set to 1 after we restore peer's queue */
};
+ */ + return collect_fd_to(vpid(owner), e, rsti(owner), tgt, true, force_master); +} + +int unix_note_scm_rights(int id_for, uint32_t *file_ids, int *fds, int n_ids) +{ + struct unix_sk_info *ui; + struct pstree_item *owner; + int i; + + ui = find_queuer_for(id_for); + if (!ui) { + pr_err("Can't find sender for %#x\n", id_for); + return -1; + } + + pr_info("Found queuer for %#x -> %#x\n", id_for, ui->ue->id); + /* + * This is the task that will restore this socket + */ + owner = file_master(&ui->d)->task; + + pr_info("-> will set up deps\n"); + /* + * The ui will send data to the rights receiver. Add a fake fle + * for the file and a dependency. + */ + for (i = 0; i < n_ids; i++) { + struct file_desc *tgt; + struct scm_fle *sfle; + + tgt = find_file_desc_raw(FD_TYPES__UND, file_ids[i]); + if (!tgt) { + pr_err("Can't find fdesc to send\n"); + return -1; + } + + pr_info("scm: add file %#x -> %d\n", tgt->id, vpid(owner)); + sfle = xmalloc(sizeof(*sfle)); + if (!sfle) + return -1; + + sfle->fle = get_fle_for_task(tgt, owner, false); + if (!sfle->fle) { + pr_err("Can't request new fle for scm\n"); + xfree(sfle); + return -1; + } + + list_add_tail(&sfle->l, &ui->scm_fles); + fds[i] = sfle->fle->fe->fd; + } + + return 0; +} + +static int chk_restored_scms(struct unix_sk_info *ui) +{ + struct scm_fle *sf, *n; + + list_for_each_entry_safe(sf, n, &ui->scm_fles, l) { + if (sf->fle->stage < FLE_OPEN) + return 1; + + /* Optimization for the next pass */ + list_del(&sf->l); + xfree(sf); + } + + return 0; +} + +static int wake_connected_sockets(struct unix_sk_info *ui) +{ + struct fdinfo_list_entry *fle; + struct unix_sk_info *tmp; + + list_for_each_entry(tmp, &ui->connected, node) { + fle = file_master(&tmp->d); + set_fds_event(fle->pid); + } + return 0; +} + +static bool peer_is_not_prepared(struct unix_sk_info *peer) +{ + if (peer->ue->state != TCP_LISTEN) + return (!peer->bound); + else + return (!peer->listen); +} + +static int restore_unix_queue(int fd, struct unix_sk_info 
*peer) +{ + struct pstree_item *task; + + if (restore_sk_queue(fd, peer->ue->id)) + return -1; + if (peer->queuer) + peer->queuer->peer_queue_restored = true; + + task = file_master(&peer->d)->task; + set_fds_event(vpid(task)); + return 0; +} + +static int shutdown_unix_sk(int sk, struct unix_sk_info *ui) +{ + int how; + UnixSkEntry *ue = ui->ue; + + if (!ue->has_shutdown || ue->shutdown == SK_SHUTDOWN__NONE) + return 0; + + how = sk_decode_shutdown(ue->shutdown); + if (shutdown(sk, how)) { + pr_perror("Can't shutdown unix socket"); + return -1; + } + + pr_debug("Socket %d is shut down %d\n", ue->ino, how); + return 0; +} + +static int restore_sk_common(int fd, struct unix_sk_info *ui) +{ + if (rst_file_params(fd, ui->ue->fown, ui->ue->flags)) + return -1; + + if (restore_socket_opts(fd, ui->ue->opts)) + return -1; + + if (shutdown_unix_sk(fd, ui)) + return -1; + + return 0; +} + +static int revert_unix_sk_cwd(struct unix_sk_info *ui, int *prev_cwd_fd, int *root_fd, int *ns_fd) +{ + int ret = 0; + + if (*ns_fd >= 0 && restore_ns(*ns_fd, &mnt_ns_desc)) + ret = -1; + if (*root_fd >= 0) { + if (fchdir(*root_fd) || chroot(".")) + pr_perror("Can't revert root directory"); + close_safe(root_fd); + ret = -1; + } + if (prev_cwd_fd && *prev_cwd_fd >= 0) { + if (fchdir(*prev_cwd_fd)) + pr_perror("Can't revert working dir"); + else if (ui->name_dir) + pr_debug("Reverted working dir\n"); + close(*prev_cwd_fd); + *prev_cwd_fd = -1; + ret = -1; + } + + return ret; +} + +static int prep_unix_sk_cwd(struct unix_sk_info *ui, int *prev_cwd_fd, + int *prev_root_fd, int *prev_mntns_fd) +{ + static struct ns_id *root = NULL, *ns; + int fd; + + if (prev_mntns_fd && ui->name[0] && ui->ue->mnt_id >= 0) { + struct ns_id *mntns = lookup_nsid_by_mnt_id(ui->ue->mnt_id); + int ns_fd; + + if (mntns == NULL) { + pr_err("Unable to find the %d mount\n", ui->ue->mnt_id); + return -1; + } + + ns_fd = fdstore_get(mntns->mnt.nsfd_id); + if (ns_fd < 0) + return -1; + + if (switch_ns_by_fd(ns_fd, 
&mnt_ns_desc, prev_mntns_fd)) + return -1; + + set_proc_self_fd(-1); + close(ns_fd); + } + + *prev_cwd_fd = open(".", O_RDONLY); + if (*prev_cwd_fd < 0) { + pr_perror("Can't open current dir"); + return -1; + } + + if (prev_root_fd && (root_ns_mask & CLONE_NEWNS)) { + if (ui->ue->mnt_id >= 0) { + ns = lookup_nsid_by_mnt_id(ui->ue->mnt_id); + } else { + if (root == NULL) + root = lookup_ns_by_id(root_item->ids->mnt_ns_id, + &mnt_ns_desc); + ns = root; + } + if (ns == NULL) + goto err; + *prev_root_fd = open("/", O_RDONLY); + if (*prev_root_fd < 0) { + pr_perror("Can't open current root"); + goto err; + } + + fd = fdstore_get(ns->mnt.root_fd_id); + if (fd < 0) { + pr_err("Can't get root fd\n"); + goto err; + } + if (fchdir(fd)) { + pr_perror("Unable to change current working dir"); + close(fd); + goto err; + } + close(fd); + if (chroot(".")) { + pr_perror("Unable to change root directory"); + goto err; + } + } + + if (ui->name_dir) { + if (chdir(ui->name_dir)) { + pr_perror("Can't change working dir %s", + ui->name_dir); + goto err; + } + pr_debug("Change working dir to %s\n", ui->name_dir); + } + + return 0; +err: + close_safe(prev_cwd_fd); + if (prev_root_fd) + close_safe(prev_root_fd); + return -1; +} + +static int post_open_standalone(struct file_desc *d, int fd) +{ + int fdstore_fd = -1, procfs_self_dir = -1, len; + struct unix_sk_info *ui; + struct unix_sk_info *peer; + struct sockaddr_un addr; + int cwd_fd = -1, root_fd = -1, ns_fd = -1; + + ui = container_of(d, struct unix_sk_info, d); + BUG_ON((ui->flags & (USK_PAIR_MASTER | USK_PAIR_SLAVE)) || + (ui->ue->uflags & (USK_CALLBACK | USK_INHERIT))); + + if (chk_restored_scms(ui)) + return 1; + + peer = ui->peer; + if (!peer || ui->is_connected) + goto restore_sk_common; + + if (ui->ue->ino == FAKE_INO) { + BUG_ON(ui->queuer); + goto restore_queue; + } + + /* Skip external sockets */ + if (!list_empty(&peer->d.fd_info_head)) + if (peer_is_not_prepared(peer)) + return 1; + + memset(&addr, 0, sizeof(addr)); + 
addr.sun_family = AF_UNIX; + + pr_info("\tConnect %d to %d\n", ui->ue->ino, peer->ue->ino); + + if (prep_unix_sk_cwd(peer, &cwd_fd, &root_fd, &ns_fd)) + return -1; + + if (peer->flags & USK_GHOST_FDSTORE) { + procfs_self_dir = open_proc(getpid(), "fd"); + fdstore_fd = fdstore_get(peer->fdstore_id); + + if (fdstore_fd < 0 || procfs_self_dir < 0) + goto err_revert_and_exit; + + /* + * WARNING: After this call we rely on revert_unix_sk_cwd + * to restore the former directories so that connect + * will operate inside proc/$pid/fd/X. + */ + if (fchdir(procfs_self_dir)) { + pr_perror("Can't change to procfs"); + goto err_revert_and_exit; + } + len = snprintf(addr.sun_path, UNIX_PATH_MAX, "%d", fdstore_fd); + } else { + memcpy(&addr.sun_path, peer->name, peer->ue->name.len); + len = peer->ue->name.len; + } + + /* + * Make sure the target is not being renamed at the moment + * while we're connecting in sake of ghost sockets. + */ + mutex_lock(mutex_ghost); + if (connect(fd, (struct sockaddr *)&addr, sizeof(addr.sun_family) + len) < 0) { + pr_perror("Can't connect %d socket", ui->ue->ino); + goto err_revert_and_exit; + } + mutex_unlock(mutex_ghost); + + ui->is_connected = true; + + close_safe(&procfs_self_dir); + close_safe(&fdstore_fd); + revert_unix_sk_cwd(peer, &cwd_fd, &root_fd, &ns_fd); + +restore_queue: + if (peer->queuer == ui && + !(peer->ue->uflags & USK_EXTERN) && + restore_unix_queue(fd, peer)) + return -1; +restore_sk_common: + if (ui->queuer && !ui->queuer->peer_queue_restored) + return 1; + return restore_sk_common(fd, ui); + +err_revert_and_exit: + close_safe(&procfs_self_dir); + close_safe(&fdstore_fd); + revert_unix_sk_cwd(peer, &cwd_fd, &root_fd, &ns_fd); + return -1; +} + +static int restore_file_perms(struct unix_sk_info *ui) +{ + if (ui->ue->file_perms) { + FilePermsEntry *perms = ui->ue->file_perms; + char fname[PATH_MAX]; + + if (ui->ue->name.len >= sizeof(fname)) { + pr_err("The file name is too long\n"); + return -E2BIG; + } + + memcpy(fname, 
ui->name, ui->ue->name.len); + fname[ui->ue->name.len] = '\0'; + + if (fchownat(AT_FDCWD, fname, perms->uid, perms->gid, 0) < 0) { + int errno_cpy = errno; + pr_perror("Unable to change file owner and group"); + return -errno_cpy; + } + + if (fchmodat(AT_FDCWD, fname, perms->mode, 0) < 0) { + int errno_cpy = errno; + pr_perror("Unable to change file mode bits"); + return -errno_cpy; + } + } + + return 0; +} + +static int keep_deleted(struct unix_sk_info *ui) +{ + int fd = open(ui->name, O_PATH); + if (fd < 0) { + pr_perror("ghost: Can't open id %#x ino %d addr %s", + ui->ue->id, ui->ue->ino, ui->name); + return -1; + } + ui->fdstore_id = fdstore_add(fd); + pr_debug("ghost: id %#x %d fdstore_id %d %s\n", + ui->ue->id, ui->ue->ino, ui->fdstore_id, ui->name); + close(fd); + return ui->fdstore_id; +} + + +#define UNIX_GHOST_FMT "%s.criu-sk-ghost" + +/* + * When path where socket lives is deleted, we need to reconstruct + * it back up but allow caller to remove it after. + */ +static int bind_on_deleted(int sk, struct unix_sk_info *ui) +{ + char path[PATH_MAX], path_parked[PATH_MAX], *pos; + struct sockaddr_un addr; + bool renamed = false; + int ret; + + if (ui->ue->name.len >= UNIX_PATH_MAX) { + pr_err("ghost: Too long name for socket id %#x ino %d name %s\n", + ui->ue->id, ui->ue->ino, ui->name); + return -ENOSPC; + } + + memcpy(path, ui->name, ui->ue->name.len); + path[ui->ue->name.len] = '\0'; + + for (pos = strrchr(path, '/'); pos; + pos = strrchr(path, '/')) { + *pos = '\0'; + + ret = access(path, R_OK | W_OK | X_OK); + if (ret == 0) { + ui->ghost_dir_pos = pos - path; + pr_debug("ghost: socket id %#x ino %d name %s detected F_OK %s\n", + ui->ue->id, ui->ue->ino, ui->name, path); + break; + } + + if (errno != ENOENT) { + ret = -errno; + pr_perror("ghost: Can't access %s for socket id %#x ino %d name %s", + path, ui->ue->id, ui->ue->ino, ui->name); + return ret; + } + } + + memcpy(path, ui->name, ui->ue->name.len); + path[ui->ue->name.len] = '\0'; + + pos = 
dirname(path); + pr_debug("ghost: socket id %#x ino %d name %s creating %s\n", + ui->ue->id, ui->ue->ino, ui->name, pos); + ret = mkdirpat(AT_FDCWD, pos, 0755); + if (ret) { + errno = -ret; + pr_perror("ghost: Can't create %s", pos); + return ret; + } + + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + memcpy(&addr.sun_path, ui->name, ui->ue->name.len); + + ret = bind(sk, (struct sockaddr *)&addr, + sizeof(addr.sun_family) + ui->ue->name.len); + if (ret < 0) { + /* + * In case if there some real living socket + * with same name just move it aside for a + * while, we will move it back once ghost + * socket is processed. + */ + if (errno == EADDRINUSE) { + snprintf(path_parked, sizeof(path_parked), UNIX_GHOST_FMT, ui->name); + /* + * Say previous restore get killed in a middle due to + * any reason, be ready the file might already exist, + * clean it up. + */ + if (unlinkat(AT_FDCWD, path_parked, 0) == 0) + pr_debug("ghost: Unlinked stale socket id %#x ino %d name %s\n", + ui->ue->id, ui->ue->ino, path_parked); + if (rename(ui->name, path_parked)) { + ret = -errno; + pr_perror("ghost: Can't rename id %#x ino %d addr %s -> %s", + ui->ue->id, ui->ue->ino, ui->name, path_parked); + return ret; + } + pr_debug("ghost: id %#x ino %d renamed %s -> %s\n", + ui->ue->id, ui->ue->ino, ui->name, path_parked); + renamed = true; + ret = bind(sk, (struct sockaddr *)&addr, + sizeof(addr.sun_family) + ui->ue->name.len); + } + if (ret < 0) { + ret = -errno; + pr_perror("ghost: Can't bind on socket id %#x ino %d addr %s", + ui->ue->id, ui->ue->ino, ui->name); + return ret; + } + } + + ret = restore_file_perms(ui); + if (ret < 0) + return ret; + + ret = keep_deleted(ui); + if (ret < 0) { + pr_err("ghost: Can't save socket %#x ino %d addr %s into fdstore\n", + ui->ue->id, ui->ue->ino, ui->name); + return -EIO; + } + + /* + * Once everything is ready, just remove the socket from the + * filesystem and rename back the original one if it were here. 
+ */ + ret = unlinkat(AT_FDCWD, ui->name, 0); + if (ret < 0) { + ret = -errno; + pr_perror("ghost: Can't unlink socket %#x ino %d addr %s", + ui->ue->id, ui->ue->ino, ui->name); + return ret; + } + + if (renamed) { + if (rename(path_parked, ui->name)) { + ret = -errno; + pr_perror("ghost: Can't rename id %#x ino %d addr %s -> %s", + ui->ue->id, ui->ue->ino, path_parked, ui->name); + return ret; + } + + pr_debug("ghost: id %#x ino %d renamed %s -> %s\n", + ui->ue->id, ui->ue->ino, path_parked, ui->name); + } + + /* + * Finally remove directories we've created. + */ + if (ui->ghost_dir_pos) { + char *pos; + + memcpy(path, ui->name, ui->ue->name.len); + path[ui->ue->name.len] = '\0'; + + for (pos = strrchr(path, '/'); + pos && (pos - path) > ui->ghost_dir_pos; + pos = strrchr(path, '/')) { + *pos = '\0'; + if (rmdir(path)) { + pr_perror("ghost: Can't remove directory %s on id %#x ino %d", + path, ui->ue->id, ui->ue->ino); + return -1; + } + pr_debug("ghost: Removed %s on id %#x ino %d\n", + path, ui->ue->id, ui->ue->ino); + } + } + + return 0; +} + +static int bind_unix_sk(int sk, struct unix_sk_info *ui) +{ + struct sockaddr_un addr; + int cwd_fd = -1, root_fd = -1, ns_fd = -1; + int ret, exit_code = -1; + + if (ui->ue->name.len == 0) + return 0; + + if ((ui->ue->type == SOCK_STREAM) && (ui->ue->state == TCP_ESTABLISHED)) { + /* + * FIXME this can be done, but for doing this properly we + * need to bind socket to its name, then rename one to + * some temporary unique one and after all the sockets are + * restored we should walk those temp names and rename + * some of them back to real ones. + */ + return 0; + } + + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + memcpy(&addr.sun_path, ui->name, ui->ue->name.len); + + if (ui->name[0] && prep_unix_sk_cwd(ui, &cwd_fd, &root_fd, &ns_fd)) + return -1; + + /* + * Order binding for sake of ghost sockets. 
We might rename + * existing socket to some temp name, bind ghost, delete it, + * and finally move the former back, thus while we're doing + * this stuff we should not be interrupted by connection + * from another sockets. + * + * FIXME: Probably wort make it per address rather for + * optimization sake. + */ + mutex_lock(mutex_ghost); + + if (ui->flags & USK_GHOST_FDSTORE) { + pr_debug("ghost: bind id %#x ino %d addr %s\n", + ui->ue->id, ui->ue->ino, ui->name); + ret = bind_on_deleted(sk, ui); + if (ret) + errno = -ret; + } else { + pr_debug("bind id %#x ino %d addr %s\n", + ui->ue->id, ui->ue->ino, ui->name); + ret = bind(sk, (struct sockaddr *)&addr, + sizeof(addr.sun_family) + ui->ue->name.len); + if (ret == 0 && restore_file_perms(ui)) + goto done; + } + if (ret < 0) { + pr_perror("Can't bind id %#x ino %d addr %s", + ui->ue->id, ui->ue->ino, ui->name); + goto done; + } + + if (ui->ue->state != TCP_LISTEN) { + ui->bound = 1; + wake_connected_sockets(ui); + } + + exit_code = 0; +done: + revert_unix_sk_cwd(ui, &cwd_fd, &root_fd, &ns_fd); + mutex_unlock(mutex_ghost); + return exit_code; +} + +static int post_open_interconnected_master(struct unix_sk_info *ui) +{ + struct fdinfo_list_entry *fle, *fle_peer; + struct unix_sk_info *peer = ui->peer; + + fle = file_master(&ui->d); + fle_peer = file_master(&peer->d); + BUG_ON(fle->task != fle_peer->task); /* See interconnected_pair() */ + + if (chk_restored_scms(ui) || chk_restored_scms(peer)) + return 0; + + if (restore_unix_queue(fle->fe->fd, peer)) + return -1; + + if (restore_unix_queue(fle_peer->fe->fd, ui)) + return -1; + + if (restore_sk_common(fle->fe->fd, ui)) + return -1; + + if (restore_sk_common(fle_peer->fe->fd, peer)) + return -1; + + return 0; +} + +static void pr_info_opening(const char *prefix, struct unix_sk_info *ui, struct fdinfo_list_entry *fle) +{ + pr_info("Opening %s (stage %d id %#x ino %d peer %d)\n", + prefix, fle->stage, ui->ue->id, ui->ue->ino, ui->ue->peer); +} + +static int 
open_unixsk_pair_master(struct unix_sk_info *ui, int *new_fd) +{ + struct fdinfo_list_entry *fle, *fle_peer; + struct unix_sk_info *peer = ui->peer; + int sk[2], tmp; + + fle = file_master(&ui->d); + pr_info_opening("master", ui, fle); + if (fle->stage == FLE_OPEN) + return post_open_interconnected_master(ui); + + fle_peer = file_master(&peer->d); + + BUG_ON(fle->task != fle_peer->task); /* See interconnected_pair() */ + + if (set_netns(ui->ue->ns_id)) + return -1; + + if (socketpair(PF_UNIX, ui->ue->type, 0, sk) < 0) { + pr_perror("Can't make socketpair"); + return -1; + } + + if (sk[0] == fle_peer->fe->fd) { + /* + * Below setup_and_serve_out() will reuse this fd, + * so this dups it in something else. + */ + tmp = dup(sk[0]); + if (tmp < 0) { + pr_perror("Can't dup()"); + return -1; + } + close(sk[0]); + sk[0] = tmp; + } + + if (setup_and_serve_out(fle_peer, sk[1])) { + pr_err("Can't send pair slave\n"); + return -1; + } + sk[1] = fle_peer->fe->fd; + + if (bind_unix_sk(sk[0], ui)) + return -1; + + if (bind_unix_sk(sk[1], peer)) + return -1; + + *new_fd = sk[0]; + return 1; +} + +static int open_unixsk_pair_slave(struct unix_sk_info *ui, int *new_fd) +{ + struct fdinfo_list_entry *fle_peer; + + fle_peer = file_master(&ui->peer->d); + pr_info_opening("slave", ui, fle_peer); + /* + * All the work is made in master. Slave just says it's restored + * after it sees the master is restored. + */ + return (fle_peer->stage != FLE_RESTORED); +} + +/* + * When sks[0]'s fle requires to create socketpair, and sks[1] is also + * somebody's fle, this makes file engine to make note the second_end + * is also open. + */ +static int setup_second_end(int *sks, struct fdinfo_list_entry *second_end) +{ + int ret; + + if (sks[0] == second_end->fe->fd) { + /* + * Below setup_and_serve_out() will reuse this fd, + * so this dups it in something else. 
+ */ + ret = dup(sks[0]); + if (ret < 0) { + pr_perror("Can't dup()"); + return -1; + } + close(sks[0]); + sks[0] = ret; + } + + if (setup_and_serve_out(second_end, sks[1])) { + pr_err("Can't send pair slave\n"); + return -1; + } + return 0; +} + +static int open_unixsk_standalone(struct unix_sk_info *ui, int *new_fd) +{ + struct unix_sk_info *queuer = ui->queuer; + struct unix_sk_info *peer = ui->peer; + struct fdinfo_list_entry *fle, *fle_peer; + int sk; + + fle = file_master(&ui->d); + pr_info_opening("standalone", ui, fle); + + /* + * If we're about to connect to the peer which + * has been bound to removed address we should + * wait until it is processed and put into fdstore + * engine, later we will use the engine to connect + * into it in a special way. + */ + if (peer && (peer->flags & USK_GHOST_FDSTORE)) { + fle_peer = file_master(&peer->d); + if (fle_peer->stage < FLE_OPEN) { + return 1; + } + } + + if (fle->stage == FLE_OPEN) + return post_open_standalone(&ui->d, fle->fe->fd); + + /* Fake socket will be restored by its peer */ + if (!(ui->ue->uflags & USK_EXTERN) && ui->ue->ino == FAKE_INO) + return 1; + + if (set_netns(ui->ue->ns_id)) + return -1; + + /* + * Check if this socket was connected to criu service. + * If so, put response, that dumping and restoring + * was successful. 
+ */ + if (ui->ue->uflags & USK_SERVICE) { + int sks[2]; + + if (socketpair(PF_UNIX, ui->ue->type, 0, sks)) { + pr_perror("Can't create socketpair"); + return -1; + } + + if (send_criu_dump_resp(sks[1], true, true) == -1) + return -1; + + close(sks[1]); + sk = sks[0]; + } else if (ui->ue->state == TCP_ESTABLISHED && queuer && queuer->ue->ino == FAKE_INO) { + int ret, sks[2]; + + if (ui->ue->type != SOCK_STREAM) { + pr_err("Non-stream socket %d in established state\n", + ui->ue->ino); + return -1; + } + + if (ui->ue->shutdown != SK_SHUTDOWN__BOTH) { + pr_err("Wrong shutdown/peer state for %d\n", + ui->ue->ino); + return -1; + } + + ret = socketpair(PF_UNIX, ui->ue->type, 0, sks); + if (ret < 0) { + pr_perror("Can't create socketpair"); + return -1; + } + + if (setup_second_end(sks, file_master(&queuer->d))) + return -1; + + sk = sks[0]; + } else if (ui->ue->type == SOCK_DGRAM && queuer && queuer->ue->ino == FAKE_INO) { + struct sockaddr_un addr; + int sks[2]; + + if (socketpair(PF_UNIX, ui->ue->type, 0, sks) < 0) { + pr_perror("Can't create socketpair"); + return -1; + } + + sk = sks[0]; + addr.sun_family = AF_UNSPEC; + + /* + * socketpair() assigns sks[1] as a peer of sks[0] + * (and vice versa). But in this case (not zero peer) + * it's impossible for other sockets to connect + * to sks[0] (see unix_dgram_connect()->unix_may_send()). + * The below is hack: we use that connect with AF_UNSPEC + * clears socket's peer. + * Note, that connect hack flushes receive queue, + * so restore_unix_queue() must be after it. + */ + if (connect(sk, (struct sockaddr *)&addr, sizeof(addr.sun_family))) { + pr_perror("Can't clear socket's peer"); + return -1; + } + + if (setup_second_end(sks, file_master(&queuer->d))) + return -1; + + sk = sks[0]; + } else { + if (ui->ue->uflags & USK_CALLBACK) { + sk = run_plugins(RESTORE_UNIX_SK, ui->ue->ino); + if (sk >= 0) + goto out; + } + + /* + * Connect to external sockets requires + * special option to be passed. 
+ */ + if (ui->peer && (ui->peer->ue->uflags & USK_EXTERN) && + !(opts.ext_unix_sk)) { + pr_err("External socket found in image. " + "Consider using the --" USK_EXT_PARAM + "option to allow restoring it.\n"); + return -1; + } + + sk = socket(PF_UNIX, ui->ue->type, 0); + if (sk < 0) { + pr_perror("Can't make unix socket"); + return -1; + } + } + + if (bind_unix_sk(sk, ui)) { + close(sk); + return -1; + } + + if (ui->ue->state == TCP_LISTEN) { + pr_info("\tPutting %d into listen state\n", ui->ue->ino); + if (listen(sk, ui->ue->backlog) < 0) { + pr_perror("Can't make usk listen"); + close(sk); + return -1; + } + ui->listen = 1; + wake_connected_sockets(ui); + } + + if (ui->peer || ui->queuer) { + /* + * 1)We need to connect() to the peer, but the + * guy might have not bind()-ed himself, so + * let's postpone this. + * 2)Queuer won't be able to connect, if we do + * shutdown, so postpone it. + */ + *new_fd = sk; + return 1; + } + +out: + if (restore_sk_common(sk, ui)) + return -1; + + *new_fd = sk; + return 0; +} + +static int open_unix_sk(struct file_desc *d, int *new_fd) +{ + struct unix_sk_info *ui; + int ret; + + ui = container_of(d, struct unix_sk_info, d); + + if (inherited_fd(d, new_fd)) { + ui->ue->uflags |= USK_INHERIT; + ret = *new_fd >= 0 ? 
0 : -1; + } else if (ui->flags & USK_PAIR_MASTER) + ret = open_unixsk_pair_master(ui, new_fd); + else if (ui->flags & USK_PAIR_SLAVE) + ret = open_unixsk_pair_slave(ui, new_fd); + else + ret = open_unixsk_standalone(ui, new_fd); + + return ret; +} + +static char *socket_d_name(struct file_desc *d, char *buf, size_t s) +{ + struct unix_sk_info *ui; + + ui = container_of(d, struct unix_sk_info, d); + + if (snprintf(buf, s, "socket:[%d]", ui->ue->ino) >= s) { + pr_err("Not enough room for unixsk %d identifier string\n", + ui->ue->ino); + return NULL; + } + + return buf; +} + +static struct file_desc_ops unix_desc_ops = { + .type = FD_TYPES__UNIXSK, + .open = open_unix_sk, + .name = socket_d_name, +}; + +/* + * Make FS clean from sockets we're about to + * restore. See for how we bind them for details + */ +static int unlink_sk(struct unix_sk_info *ui) +{ + int ret = 0, cwd_fd = -1, root_fd = -1, ns_fd = -1; + + if (!ui->name || ui->name[0] == '\0' || (ui->ue->uflags & USK_EXTERN)) + return 0; + + if (prep_unix_sk_cwd(ui, &cwd_fd, &root_fd, NULL)) + return -1; + + ret = unlinkat(AT_FDCWD, ui->name, 0) ? -1 : 0; + if (ret < 0 && errno != ENOENT) { + pr_warn("Can't unlink socket %d peer %d (name %s dir %s)\n", + ui->ue->ino, ui->ue->peer, + ui->name ? (ui->name[0] ? ui->name : &ui->name[1]) : "-", + ui->name_dir ? ui->name_dir : "-"); + ret = -errno; + goto out; + } else if (ret == 0) { + pr_debug("Unlinked socket %d peer %d (name %s dir %s)\n", + ui->ue->ino, ui->ue->peer, + ui->name ? (ui->name[0] ? ui->name : &ui->name[1]) : "-", + ui->name_dir ? 
ui->name_dir : "-"); + } +out: + revert_unix_sk_cwd(ui, &cwd_fd, &root_fd, &ns_fd); + return ret; +} + +static void try_resolve_unix_peer(struct unix_sk_info *ui); +static int fixup_unix_peer(struct unix_sk_info *ui); + +static int post_prepare_unix_sk(struct pprep_head *ph) +{ + struct unix_sk_info *ui; + + ui = container_of(ph, struct unix_sk_info, peer_resolve); + if (ui->ue->peer && fixup_unix_peer(ui)) + return -1; + unlink_sk(ui); + return 0; +} + +static int init_unix_sk_info(struct unix_sk_info *ui, UnixSkEntry *ue) +{ + ui->ue = ue; + if (ue->name.len) { + if (ue->name.len > UNIX_PATH_MAX) { + pr_err("Bad unix name len %d\n", (int)ue->name.len); + return -1; + } + + ui->name = (void *)ue->name.data; + } else + ui->name = NULL; + ui->name_dir = (void *)ue->name_dir; + + ui->flags = 0; + ui->fdstore_id = -1; + ui->ghost_dir_pos = 0; + ui->peer = NULL; + ui->queuer = NULL; + ui->bound = 0; + ui->listen = 0; + ui->is_connected = 0; + ui->peer_queue_restored = 0; + + memzero(&ui->peer_resolve, sizeof(ui->peer_resolve)); + memzero(&ui->d, sizeof(ui->d)); + + INIT_LIST_HEAD(&ui->list); + INIT_LIST_HEAD(&ui->connected); + INIT_LIST_HEAD(&ui->node); + INIT_LIST_HEAD(&ui->scm_fles); + INIT_LIST_HEAD(&ui->ghost_node); + + return 0; +} + +int unix_prepare_root_shared(void) +{ + struct unix_sk_info *ui; + + mutex_ghost = shmalloc(sizeof(*mutex_ghost)); + if (!mutex_ghost) { + pr_err("ghost: Can't allocate mutex\n"); + return -ENOMEM; + } + mutex_init(mutex_ghost); + + pr_debug("ghost: Resolving addresses\n"); + + list_for_each_entry(ui, &unix_ghost_addr, ghost_node) { + char tp_name[32]; + char st_name[32]; + + pr_debug("ghost: id %#x type %s state %s ino %d peer %d address %s\n", + ui->ue->id, __socket_type_name(ui->ue->type, tp_name), + __tcp_state_name(ui->ue->state, st_name), + ui->ue->ino, ui->peer ? 
ui->peer->ue->ino : 0, + ui->name); + + /* + * Drop any existing trash on the FS and mark the + * peer as a ghost one, so we will put it into + * fdstore to be able to connect into it even + * when the address is removed from the FS. + */ + unlink_sk(ui); + ui->flags |= USK_GHOST_FDSTORE; + } + + return 0; +} + +static int collect_one_unixsk(void *o, ProtobufCMessage *base, struct cr_img *i) +{ + struct unix_sk_info *ui = o; + char *uname, *prefix = ""; + int ulen; + + if (init_unix_sk_info(ui, pb_msg(base, UnixSkEntry))) + return -1; + + uname = ui->name; + ulen = ui->ue->name.len; + if (ulen > 0 && uname[0] == 0) { + prefix = "@"; + uname++; + ulen--; + if (memrchr(uname, 0, ulen)) { + /* replace zero characters */ + char *s = alloca(ulen + 1); + int i; + + for (i = 0; i < ulen; i++) + s[i] = uname[i] ? : '@'; + uname = s; + } + } else if (ulen == 0) { + ulen = 1; + uname = "-"; + } + + pr_info(" `- Got id %#x ino %d type %s state %s peer %d (name %s%.*s dir %s)\n", + ui->ue->id, ui->ue->ino, ___socket_type_name(ui->ue->type), + ___tcp_state_name(ui->ue->state), ui->ue->peer, prefix, ulen, + uname, ui->name_dir ? 
ui->name_dir : "-"); + + if (ui->ue->peer || ui->name) { + if (ui->ue->peer) + try_resolve_unix_peer(ui); + + ui->peer_resolve.actor = post_prepare_unix_sk; + add_post_prepare_cb(&ui->peer_resolve); + } + + if (ui->ue->deleted) { + if (!ui->name || !ui->ue->name.len || !ui->name[0]) { + pr_err("No name present, ino %d\n", ui->ue->ino); + return -1; + } + + list_add_tail(&ui->ghost_node, &unix_ghost_addr); + } + + list_add_tail(&ui->list, &unix_sockets); + return file_desc_add(&ui->d, ui->ue->id, &unix_desc_ops); +} + +struct collect_image_info unix_sk_cinfo = { + .fd_type = CR_FD_UNIXSK, + .pb_type = PB_UNIX_SK, + .priv_size = sizeof(struct unix_sk_info), + .collect = collect_one_unixsk, + .flags = COLLECT_SHARED, +}; + +static void set_peer(struct unix_sk_info *ui, struct unix_sk_info *peer) +{ + ui->peer = peer; + list_add(&ui->node, &peer->connected); + if (!peer->queuer) + peer->queuer = ui; +} + +static int add_fake_queuer(struct unix_sk_info *ui) +{ + struct unix_sk_info *peer; + struct pstree_item *task; + UnixSkEntry *peer_ue; + SkOptsEntry *skopts; + FownEntry *fown; + + if (ui->ue->ino == FAKE_INO) + return 0; + + peer = xzalloc(sizeof(struct unix_sk_info) + + sizeof(UnixSkEntry) + + sizeof(SkOptsEntry) + + sizeof(FownEntry)); + if (peer == NULL) + return -1; + + peer_ue = (void *) peer + sizeof(struct unix_sk_info); + skopts = (void *) peer_ue + sizeof(UnixSkEntry); + fown = (void *) skopts + sizeof(SkOptsEntry); + memcpy(skopts, ui->ue->opts, sizeof(SkOptsEntry)); + memcpy(fown, ui->ue->fown, sizeof(FownEntry)); + memcpy(peer_ue, ui->ue, sizeof(UnixSkEntry)); + peer_ue->opts = skopts; + peer_ue->file_perms = NULL; + peer_ue->fown = fown; + peer_ue->name.len = 0; + peer_ue->name_dir = NULL; + + if (init_unix_sk_info(peer, peer_ue)) + return -1; + + peer_ue->id = find_unused_file_desc_id(); + set_peer(peer, ui); + + /* Note, that this fake fdesc has no ino */ + peer->ue->ino = FAKE_INO; + file_desc_add(&peer->d, peer_ue->id, &unix_desc_ops); + 
list_del_init(&peer->d.fake_master_list); + list_add(&peer->list, &unix_sockets); + task = file_master(&ui->d)->task; + + return (get_fle_for_task(&peer->d, task, true) == NULL); +} + +int add_fake_unix_queuers(void) +{ + struct unix_sk_info *ui; + + list_for_each_entry(ui, &unix_sockets, list) { + if ((ui->ue->uflags & (USK_EXTERN | USK_CALLBACK)) || ui->queuer) + continue; + if (!(ui->ue->state == TCP_ESTABLISHED && !ui->peer) && + ui->ue->type != SOCK_DGRAM) + continue; + if (add_fake_queuer(ui)) + return -1; + } + return 0; +} + +/* This function is called from post prepare only */ +static int interconnected_pair(struct unix_sk_info *ui, struct unix_sk_info *peer) +{ + struct fdinfo_list_entry *fle, *fle_peer; + + ui->flags |= USK_PAIR_MASTER; + peer->flags |= USK_PAIR_SLAVE; + + fle = file_master(&ui->d); + fle_peer = file_master(&peer->d); + + /* + * Since queue restore is delayed, every socket of the pair + * should have another end to send the queue packets. + * To fit that, we make the both file_master's to be owned + * by the only task. + * This function is called from run_post_prepare() and + * after add_fake_fds_masters(), so we must not add masters, + * which fle->task has no permissions to restore. But + * it has permissions on ui, so it has permissions on peer. 
+ */ + if (fle->task != fle_peer->task && + !get_fle_for_task(&peer->d, fle->task, true)) + return -1; + + return 0; +} + +static int fixup_unix_peer(struct unix_sk_info *ui) +{ + struct unix_sk_info *peer = ui->peer; + + if (!peer) { + pr_err("FATAL: Peer %d unresolved for %d\n", + ui->ue->peer, ui->ue->ino); + return -1; + } + + if (peer != ui && peer->peer == ui && + !(ui->flags & (USK_PAIR_MASTER | USK_PAIR_SLAVE))) { + pr_info("Connected %d -> %d (%d) flags %#x\n", + ui->ue->ino, ui->ue->peer, peer->ue->ino, ui->flags); + /* socketpair or interconnected sockets */ + if (interconnected_pair(ui, peer)) + return -1; + } + + return 0; +} + +static void try_resolve_unix_peer(struct unix_sk_info *ui) +{ + struct unix_sk_info *peer; + + if (ui->peer) + return; + + BUG_ON(!ui->ue->peer); + + if (ui->ue->peer == ui->ue->ino) { + /* socket connected to self %) */ + set_peer(ui, ui); + return; + } + + peer = find_unix_sk_by_ino(ui->ue->peer); + if (peer) { + set_peer(ui, peer); + if (peer->ue->peer == ui->ue->ino) + set_peer(peer, ui); + } /* else -- maybe later */ +} + +int unix_sk_id_add(unsigned int ino) +{ + char *e_str; + + e_str = xmalloc(20); + if (!e_str) + return -1; + snprintf(e_str, 20, "unix[%u]", ino); + return add_external(e_str); +} + +int unix_sk_ids_parse(char *optarg) +{ + /* + * parsing option of the following form: --ext-unix-sk=,... or short form -x,... 
+ */ + + char *iter = optarg; + + while (*iter != '\0') { + if (*iter == ',') + iter++; + else { + unsigned int ino = strtoul(iter, &iter, 10); + + if (0 == ino) { + pr_err("Can't parse unix socket inode from optarg: %s\n", optarg); + return -1; + } + if (unix_sk_id_add(ino) < 0) { + pr_err("Can't add unix socket inode in list: %s\n", optarg); + return -1; + } + } + } + + return 0; +} diff --git a/CRIU_code/criu/sockets.c b/CRIU_code/criu/sockets.c new file mode 100644 index 0000000..312b55c --- /dev/null +++ b/CRIU_code/criu/sockets.c @@ -0,0 +1,969 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "int.h" +#include "bitops.h" +#include "libnetlink.h" +#include "sockets.h" +#include "unix_diag.h" +#include "inet_diag.h" +#include "packet_diag.h" +#include "netlink_diag.h" +#include "files.h" +#include "util-pie.h" +#include "sk-packet.h" +#include "namespaces.h" +#include "lsm.h" +#include "net.h" +#include "xmalloc.h" +#include "fs-magic.h" +#include "pstree.h" +#include "util.h" +#include "fdstore.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "sockets: " + +#ifndef SOCK_DIAG_BY_FAMILY +#define SOCK_DIAG_BY_FAMILY 20 +#endif + +#define SK_HASH_SIZE 32 + +#ifndef SO_GET_FILTER +#define SO_GET_FILTER SO_ATTACH_FILTER +#endif + +static const char *__socket_const_name(char *dst, size_t len, const char **a, size_t n, unsigned int v) +{ + if (v < n) { + const char *name = a[v]; + if (name) + return name; + } + snprintf(dst, len, "%u", v); + return dst; +} + +const char *socket_proto_name(unsigned int proto, char *nm, size_t size) +{ + static const char *protos[] = { + [IPPROTO_IP] = __stringify_1(IPPROTO_IP), + [IPPROTO_ICMP] = __stringify_1(IPPROTO_ICMP), + [IPPROTO_IGMP] = __stringify_1(IPPROTO_IGMP), + [IPPROTO_IPIP] = __stringify_1(IPPROTO_IPIP), + [IPPROTO_TCP] = __stringify_1(IPPROTO_TCP), + [IPPROTO_EGP] = __stringify_1(IPPROTO_EGP), + [IPPROTO_UDP] = __stringify_1(IPPROTO_UDP), + 
[IPPROTO_DCCP] = __stringify_1(IPPROTO_DCCP), + [IPPROTO_IPV6] = __stringify_1(IPPROTO_IPV6), + [IPPROTO_RSVP] = __stringify_1(IPPROTO_RSVP), + [IPPROTO_GRE] = __stringify_1(IPPROTO_GRE), + [IPPROTO_ESP] = __stringify_1(IPPROTO_ESP), + [IPPROTO_AH] = __stringify_1(IPPROTO_AH), + [IPPROTO_UDPLITE] = __stringify_1(IPPROTO_UDPLITE), + [IPPROTO_RAW] = __stringify_1(IPPROTO_RAW), + }; + return __socket_const_name(nm, size, protos, ARRAY_SIZE(protos), proto); +} + +const char *socket_family_name(unsigned int family, char *nm, size_t size) +{ + static const char *families[] = { + [AF_UNIX] = __stringify_1(AF_UNIX), + [AF_INET] = __stringify_1(AF_INET), + [AF_BRIDGE] = __stringify_1(AF_BRIDGE), + [AF_INET6] = __stringify_1(AF_INET6), + [AF_KEY] = __stringify_1(AF_KEY), + [AF_NETLINK] = __stringify_1(AF_NETLINK), + [AF_PACKET] = __stringify_1(AF_PACKET), + }; + return __socket_const_name(nm, size, families, ARRAY_SIZE(families), family); +} + +const char *socket_type_name(unsigned int type, char *nm, size_t size) +{ + static const char *types[] = { + [SOCK_STREAM] = __stringify_1(SOCK_STREAM), + [SOCK_DGRAM] = __stringify_1(SOCK_DGRAM), + [SOCK_RAW] = __stringify_1(SOCK_RAW), + [SOCK_SEQPACKET] = __stringify_1(SOCK_SEQPACKET), + [SOCK_PACKET] = __stringify_1(SOCK_PACKET), + }; + return __socket_const_name(nm, size, types, ARRAY_SIZE(types), type); +} + +const char *tcp_state_name(unsigned int state, char *nm, size_t size) +{ + static const char *states[] = { + [TCP_ESTABLISHED] = __stringify_1(TCP_ESTABLISHED), + [TCP_SYN_SENT] = __stringify_1(TCP_SYN_SENT), + [TCP_SYN_RECV] = __stringify_1(TCP_SYN_RECV), + [TCP_FIN_WAIT1] = __stringify_1(TCP_FIN_WAIT1), + [TCP_FIN_WAIT2] = __stringify_1(TCP_FIN_WAIT2), + [TCP_TIME_WAIT] = __stringify_1(TCP_TIME_WAIT), + [TCP_CLOSE] = __stringify_1(TCP_CLOSE), + [TCP_CLOSE_WAIT] = __stringify_1(TCP_CLOSE_WAIT), + [TCP_LAST_ACK] = __stringify_1(TCP_LAST_ACK), + [TCP_LISTEN] = __stringify_1(TCP_LISTEN), + [TCP_CLOSING] = 
__stringify_1(TCP_CLOSING), + }; + return __socket_const_name(nm, size, states, ARRAY_SIZE(states), state); +} + +struct sock_diag_greq { + u8 family; + u8 protocol; +}; + +struct sock_diag_req { + struct nlmsghdr hdr; + union { + struct unix_diag_req u; + struct inet_diag_req_v2 i; + struct packet_diag_req p; + struct netlink_diag_req n; + struct sock_diag_greq g; + } r; +}; + +enum socket_cl_bits +{ + NETLINK_CL_BIT, + INET_TCP_CL_BIT, + INET_UDP_CL_BIT, + INET_UDPLITE_CL_BIT, + INET_RAW_CL_BIT, + INET6_TCP_CL_BIT, + INET6_UDP_CL_BIT, + INET6_UDPLITE_CL_BIT, + INET6_RAW_CL_BIT, + UNIX_CL_BIT, + PACKET_CL_BIT, + _MAX_CL_BIT, +}; + +#define MAX_CL_BIT (_MAX_CL_BIT - 1) + +static DECLARE_BITMAP(socket_cl_bits, MAX_CL_BIT); + +static inline +enum socket_cl_bits get_collect_bit_nr(unsigned int family, unsigned int proto) +{ + if (family == AF_NETLINK) + return NETLINK_CL_BIT; + if (family == AF_UNIX) + return UNIX_CL_BIT; + if (family == AF_PACKET) + return PACKET_CL_BIT; + if (family == AF_INET) { + if (proto == IPPROTO_TCP) + return INET_TCP_CL_BIT; + if (proto == IPPROTO_UDP) + return INET_UDP_CL_BIT; + if (proto == IPPROTO_UDPLITE) + return INET_UDPLITE_CL_BIT; + if (proto == IPPROTO_RAW) + return INET_RAW_CL_BIT; + } + if (family == AF_INET6) { + if (proto == IPPROTO_TCP) + return INET6_TCP_CL_BIT; + if (proto == IPPROTO_UDP) + return INET6_UDP_CL_BIT; + if (proto == IPPROTO_UDPLITE) + return INET6_UDPLITE_CL_BIT; + if (proto == IPPROTO_RAW) + return INET6_RAW_CL_BIT; + } + + pr_err("Unknown pair family %d proto %d\n", family, proto); + BUG(); + return -1; +} + +static void set_collect_bit(unsigned int family, unsigned int proto) +{ + enum socket_cl_bits nr; + + nr = get_collect_bit_nr(family, proto); + set_bit(nr, socket_cl_bits); +} + +bool socket_test_collect_bit(unsigned int family, unsigned int proto) +{ + enum socket_cl_bits nr; + + nr = get_collect_bit_nr(family, proto); + return test_bit(nr, socket_cl_bits) != 0; +} + +static int probe_recv_one(struct 
nlmsghdr *h, struct ns_id *ns, void *arg) +{ + pr_err("PROBE RECEIVED\n"); + return -1; +} + +static int probe_err(int err, struct ns_id *ns, void *arg) +{ + int expected_err = *(int *)arg; + + if (err == expected_err) + return 0; + + pr_err("Diag module missing (%d)\n", err); + return err; +} + +static inline void probe_diag(int nl, struct sock_diag_req *req, int expected_err) +{ + do_rtnl_req(nl, req, req->hdr.nlmsg_len, probe_recv_one, probe_err, NULL, &expected_err); +} + +void preload_socket_modules(void) +{ + int nl; + struct sock_diag_req req; + + /* + * If the task to dump (e.g. an LXC container) has any netlink + * KOBJECT_UEVENT socket open and the _diag modules aren't + * loaded, criu will freeze the task and then the + * kernel will send it messages on the socket, and then we will + * fail to dump because the socket has pending data. The Real + * Solution is to dump this pending data, but we just make sure + * modules are there beforehand for now so that the first dump + * doesn't fail. + */ + + nl = socket(PF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG); + if (nl < 0) + return; + + pr_info("Probing sock diag modules\n"); + + memset(&req, 0, sizeof(req)); + req.hdr.nlmsg_type = SOCK_DIAG_BY_FAMILY; + req.hdr.nlmsg_seq = CR_NLMSG_SEQ; + + /* + * Probe UNIX, netlink and packet diag-s by feeding + * to the kernel request that is shorter than they + * expect, but still containing the family to make + * sure the family handler is there. The family-level + * diag module would report EINVAL in this case. + */ + + req.hdr.nlmsg_len = sizeof(req.hdr) + sizeof(req.r.g); + req.hdr.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST; + + req.r.g.family = AF_UNIX; + probe_diag(nl, &req, -EINVAL); + + req.r.g.family = AF_PACKET; + probe_diag(nl, &req, -EINVAL); + + req.r.g.family = AF_NETLINK; + probe_diag(nl, &req, -EINVAL); + + /* + * TCP and UDP(LITE) diags do not support such trick, only + * inet_diag module can be probed like that.
For the protocol + * level ones it's OK to request for exact non-existing socket + * and check for ENOENT being reported back as error. + */ + + req.hdr.nlmsg_len = sizeof(req.hdr) + sizeof(req.r.i); + req.hdr.nlmsg_flags = NLM_F_REQUEST; + req.r.i.sdiag_family = AF_INET; + + req.r.i.sdiag_protocol = IPPROTO_TCP; + probe_diag(nl, &req, -ENOENT); + + req.r.i.sdiag_protocol = IPPROTO_UDP; /* UDPLITE is merged with UDP */ + probe_diag(nl, &req, -ENOENT); + + req.r.i.sdiag_protocol = IPPROTO_RAW; + probe_diag(nl, &req, -ENOENT); + + close(nl); + pr_info("Done probing\n"); +} + +static int dump_bound_dev(int sk, SkOptsEntry *soe) +{ + int ret; + char dev[IFNAMSIZ]; + socklen_t len = sizeof(dev); + + ret = getsockopt(sk, SOL_SOCKET, SO_BINDTODEVICE, &dev, &len); + if (ret) { + pr_perror("Can't get bound dev"); + return ret; + } + + if (len == 0) + return 0; + + pr_debug("\tDumping %s bound dev for sk\n", dev); + soe->so_bound_dev = xmalloc(len); + if (soe->so_bound_dev == NULL) + return -1; + strcpy(soe->so_bound_dev, dev); + return 0; +} + +static int restore_bound_dev(int sk, SkOptsEntry *soe) +{ + char *n = soe->so_bound_dev; + + if (!n) + return 0; + + pr_debug("\tBinding socket to %s dev\n", n); + return do_restore_opt(sk, SOL_SOCKET, SO_BINDTODEVICE, n, strlen(n)); +} + +/* + * Protobuf handles le/be itself, but the sock_filter is not just u64, + * it's a structure and we have to preserve the fields order to be able + * to move socket image across architectures.
+ */ + +static void encode_filter(struct sock_filter *f, u64 *img, int n) +{ + int i; + + BUILD_BUG_ON(sizeof(*f) != sizeof(*img)); + + for (i = 0; i < n; i++) + img[i] = ((u64)f[i].code << 48) | + ((u64)f[i].jt << 40) | + ((u64)f[i].jf << 32) | + ((u64)f[i].k << 0); +} + +static void decode_filter(u64 *img, struct sock_filter *f, int n) +{ + int i; + + for (i = 0; i < n; i++) { + f[i].code = img[i] >> 48; + f[i].jt = img[i] >> 40; + f[i].jf = img[i] >> 32; + f[i].k = img[i] >> 0; + } +} + +static int dump_socket_filter(int sk, SkOptsEntry *soe) +{ + socklen_t len = 0; + int ret; + struct sock_filter *flt; + + ret = getsockopt(sk, SOL_SOCKET, SO_GET_FILTER, NULL, &len); + if (ret) { + pr_perror("Can't get socket filter len"); + return ret; + } + + if (!len) { + pr_info("No filter for socket\n"); + return 0; + } + + flt = xmalloc(len * sizeof(*flt)); + if (!flt) + return -1; + + ret = getsockopt(sk, SOL_SOCKET, SO_GET_FILTER, flt, &len); + if (ret) { + pr_perror("Can't get socket filter"); + xfree(flt); + return ret; + } + + soe->so_filter = xmalloc(len * sizeof(*soe->so_filter)); + if (!soe->so_filter) { + xfree(flt); + return -1; + } + + encode_filter(flt, soe->so_filter, len); + soe->n_so_filter = len; + xfree(flt); + return 0; +} + +static int restore_socket_filter(int sk, SkOptsEntry *soe) +{ + int ret; + struct sock_fprog sfp; + + if (!soe->n_so_filter) + return 0; + + pr_info("Restoring socket filter\n"); + sfp.len = soe->n_so_filter; + sfp.filter = xmalloc(sfp.len * sizeof(*sfp.filter)); /* one sock_filter per instruction; len * len bytes under-allocated short filters */ + if (!sfp.filter) + return -1; + + decode_filter(soe->so_filter, sfp.filter, sfp.len); + ret = restore_opt(sk, SOL_SOCKET, SO_ATTACH_FILTER, &sfp); + xfree(sfp.filter); + + return ret; +} + +static struct socket_desc *sockets[SK_HASH_SIZE]; + +struct socket_desc *lookup_socket_ino(unsigned int ino, int family) +{ + struct socket_desc *sd; + + pr_debug("Searching for socket %#x family %d\n", ino, family); + + for (sd = sockets[ino % SK_HASH_SIZE]; sd; sd = sd->next) { + if
(sd->ino == ino) { + BUG_ON(sd->family != family); + return sd; + } + } + + return NULL; +} + + +struct socket_desc *lookup_socket(unsigned int ino, int family, int proto) +{ + if (!socket_test_collect_bit(family, proto)) { + pr_err("Sockets (family %d proto %d) are not collected\n", + family, proto); + return ERR_PTR(-EINVAL); + } + + return lookup_socket_ino(ino, family); +} + +int sk_collect_one(unsigned ino, int family, struct socket_desc *d, struct ns_id *ns) +{ + struct socket_desc **chain; + + d->ino = ino; + d->family = family; + d->already_dumped = 0; + d->sk_ns = ns; + + chain = &sockets[ino % SK_HASH_SIZE]; + d->next = *chain; + *chain = d; + + return 0; +} + +int do_restore_opt(int sk, int level, int name, void *val, int len) +{ + if (setsockopt(sk, level, name, val, len) < 0) { + pr_perror("Can't set %d:%d (len %d)", level, name, len); + return -1; + } + + return 0; +} + +static int sk_setbufs(void *arg, int fd, pid_t pid) +{ + u32 *buf = (u32 *)arg; + + if (restore_opt(fd, SOL_SOCKET, SO_SNDBUFFORCE, &buf[0])) + return -1; + if (restore_opt(fd, SOL_SOCKET, SO_RCVBUFFORCE, &buf[1])) + return -1; + + return 0; +} + +/* + * Set sizes of buffers to maximum and prevent blocking + * Caller of this fn should call other socket restoring + * routines to drop the non-blocking and set proper send + * and receive buffers. + */ +int restore_prepare_socket(int sk) +{ + int flags; + /* In kernel a bufsize has type int and a value is doubled. 
*/ + u32 maxbuf[2] = { INT_MAX / 2, INT_MAX / 2 }; + + if (userns_call(sk_setbufs, 0, maxbuf, sizeof(maxbuf), sk)) + return -1; + + /* Prevent blocking on restore */ + flags = fcntl(sk, F_GETFL, 0); + if (flags == -1) { + pr_perror("Unable to get flags for %d", sk); + return -1; + } + if (fcntl(sk, F_SETFL, flags | O_NONBLOCK) ) { + pr_perror("Unable to set O_NONBLOCK for %d", sk); + return -1; + } + + return 0; +} + +int restore_socket_opts(int sk, SkOptsEntry *soe) +{ + int ret = 0, val; + struct timeval tv; + /* In kernel a bufsize value is doubled. */ + u32 bufs[2] = { soe->so_sndbuf / 2, soe->so_rcvbuf / 2}; + + pr_info("%d restore sndbuf %d rcv buf %d\n", sk, soe->so_sndbuf, soe->so_rcvbuf); + + /* setsockopt() multiplies the input values by 2 */ + ret |= userns_call(sk_setbufs, UNS_ASYNC, bufs, sizeof(bufs), sk); + + if (soe->has_so_priority) { + pr_debug("\trestore priority %d for socket\n", soe->so_priority); + ret |= restore_opt(sk, SOL_SOCKET, SO_PRIORITY, &soe->so_priority); + } + if (soe->has_so_rcvlowat) { + pr_debug("\trestore rcvlowat %d for socket\n", soe->so_rcvlowat); + ret |= restore_opt(sk, SOL_SOCKET, SO_RCVLOWAT, &soe->so_rcvlowat); + } + if (soe->has_so_mark) { + pr_debug("\trestore mark %d for socket\n", soe->so_mark); + ret |= restore_opt(sk, SOL_SOCKET, SO_MARK, &soe->so_mark); + } + if (soe->has_so_passcred && soe->so_passcred) { + val = 1; + pr_debug("\tset passcred for socket\n"); + ret |= restore_opt(sk, SOL_SOCKET, SO_PASSCRED, &val); + } + if (soe->has_so_passsec && soe->so_passsec) { + val = 1; + pr_debug("\tset passsec for socket\n"); + ret |= restore_opt(sk, SOL_SOCKET, SO_PASSSEC, &val); + } + if (soe->has_so_dontroute && soe->so_dontroute) { + val = 1; + pr_debug("\tset dontroute for socket\n"); + ret |= restore_opt(sk, SOL_SOCKET, SO_DONTROUTE, &val); + } + if (soe->has_so_no_check && soe->so_no_check) { + val = 1; + pr_debug("\tset no_check for socket\n"); + ret |= restore_opt(sk, SOL_SOCKET, SO_NO_CHECK, &val); + } + if 
(soe->has_so_broadcast && soe->so_broadcast) { + val = 1; + pr_debug("\tset broadcast for socket\n"); + ret |= restore_opt(sk, SOL_SOCKET, SO_BROADCAST, &val); + } + + tv.tv_sec = soe->so_snd_tmo_sec; + tv.tv_usec = soe->so_snd_tmo_usec; + ret |= restore_opt(sk, SOL_SOCKET, SO_SNDTIMEO, &tv); + + tv.tv_sec = soe->so_rcv_tmo_sec; + tv.tv_usec = soe->so_rcv_tmo_usec; + ret |= restore_opt(sk, SOL_SOCKET, SO_RCVTIMEO, &tv); + + ret |= restore_bound_dev(sk, soe); + ret |= restore_socket_filter(sk, soe); + + /* The restore of SO_REUSEADDR depends on type of socket */ + + return ret; +} + +int do_dump_opt(int sk, int level, int name, void *val, int len) +{ + socklen_t aux = len; + + if (getsockopt(sk, level, name, val, &aux) < 0) { + pr_perror("Can't get %d:%d opt", level, name); + return -1; + } + + if (aux != len) { + pr_err("Len mismatch on %d:%d : %d, want %d\n", + level, name, aux, len); + return -1; + } + + return 0; +} + +int dump_socket_opts(int sk, SkOptsEntry *soe) +{ + int ret = 0, val; + struct timeval tv; + + ret |= dump_opt(sk, SOL_SOCKET, SO_SNDBUF, &soe->so_sndbuf); + ret |= dump_opt(sk, SOL_SOCKET, SO_RCVBUF, &soe->so_rcvbuf); + soe->has_so_priority = true; + ret |= dump_opt(sk, SOL_SOCKET, SO_PRIORITY, &soe->so_priority); + soe->has_so_rcvlowat = true; + ret |= dump_opt(sk, SOL_SOCKET, SO_RCVLOWAT, &soe->so_rcvlowat); + soe->has_so_mark = true; + ret |= dump_opt(sk, SOL_SOCKET, SO_MARK, &soe->so_mark); + + ret |= dump_opt(sk, SOL_SOCKET, SO_SNDTIMEO, &tv); + soe->so_snd_tmo_sec = tv.tv_sec; + soe->so_snd_tmo_usec = tv.tv_usec; + + ret |= dump_opt(sk, SOL_SOCKET, SO_RCVTIMEO, &tv); + soe->so_rcv_tmo_sec = tv.tv_sec; + soe->so_rcv_tmo_usec = tv.tv_usec; + + ret |= dump_opt(sk, SOL_SOCKET, SO_REUSEADDR, &val); + soe->reuseaddr = val ? true : false; + soe->has_reuseaddr = true; + + ret |= dump_opt(sk, SOL_SOCKET, SO_REUSEPORT, &val); + soe->so_reuseport = val ? 
true : false; + soe->has_so_reuseport = true; + + ret |= dump_opt(sk, SOL_SOCKET, SO_PASSCRED, &val); + soe->has_so_passcred = true; + soe->so_passcred = val ? true : false; + + ret |= dump_opt(sk, SOL_SOCKET, SO_PASSSEC, &val); + soe->has_so_passsec = true; + soe->so_passsec = val ? true : false; + + ret |= dump_opt(sk, SOL_SOCKET, SO_DONTROUTE, &val); + soe->has_so_dontroute = true; + soe->so_dontroute = val ? true : false; + + ret |= dump_opt(sk, SOL_SOCKET, SO_NO_CHECK, &val); + soe->has_so_no_check = true; + soe->so_no_check = val ? true : false; + + ret |= dump_opt(sk, SOL_SOCKET, SO_BROADCAST, &val); + soe->has_so_broadcast = true; + soe->so_broadcast = val ? true : false; + + ret |= dump_bound_dev(sk, soe); + ret |= dump_socket_filter(sk, soe); + + return ret; +} + +void release_skopts(SkOptsEntry *soe) +{ + xfree(soe->so_filter); + xfree(soe->so_bound_dev); +} + +int dump_socket(struct fd_parms *p, int lfd, FdinfoEntry *e) +{ + int family; + const struct fdtype_ops *ops; + + if (dump_xattr_security_selinux(lfd, e)) + return -1; + + if (dump_opt(lfd, SOL_SOCKET, SO_DOMAIN, &family)) + return -1; + + switch (family) { + case AF_UNIX: + ops = &unix_dump_ops; + break; + case AF_INET: + ops = &inet_dump_ops; + break; + case AF_INET6: + ops = &inet6_dump_ops; + break; + case AF_PACKET: + ops = &packet_dump_ops; + break; + case AF_NETLINK: + ops = &netlink_dump_ops; + break; + default: + pr_err("BUG! 
Unknown socket collected (family %d)\n", family); + return -1; + } + + return do_dump_gen_file(p, lfd, ops, e); +} + +static int inet_receive_one(struct nlmsghdr *h, struct ns_id *ns, void *arg) +{ + struct inet_diag_req_v2 *i = arg; + int type; + + switch (i->sdiag_protocol) { + case IPPROTO_TCP: + type = SOCK_STREAM; + break; + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + type = SOCK_DGRAM; + break; + case IPPROTO_RAW: + type = SOCK_RAW; + break; + default: + BUG_ON(1); + return -1; + } + + return inet_collect_one(h, i->sdiag_family, type, ns); +} + +static int do_collect_req(int nl, struct sock_diag_req *req, int size, + int (*receive_callback)(struct nlmsghdr *h, struct ns_id *ns, void *), + int (*error_callback)(int err, struct ns_id *ns, void *), + struct ns_id *ns, void *arg) +{ + int tmp = do_rtnl_req(nl, req, size, receive_callback, error_callback, ns, arg); + if (tmp == 0) + set_collect_bit(req->r.n.sdiag_family, req->r.n.sdiag_protocol); + return tmp; +} + +static int collect_err(int err, struct ns_id *ns, void *arg) +{ + struct sock_diag_greq *gr = arg; + char family[32], proto[32]; + char msg[256]; + + snprintf(msg, sizeof(msg), + "Sockects collect procedure family %s proto %s", + socket_family_name(gr->family, family, sizeof(family)), + socket_proto_name(gr->protocol, proto, sizeof(proto))); + + /* + * If module is not compiled or unloaded, + * we should simply pass error up to a caller + * which then warn a user. + */ + if (err == -ENOENT) { + pr_debug("%s: %d\n", msg, err); + /* + * Unlike other modules RAW sockets are + * always optional and not commonly used. + * Currently we warn user about lack of + * a particular module support in "check" + * procedure. Thus don't fail on lack of + * RAW diags in a regular dump. If we meet + * a raw socket we will simply fail on dump + * procedure because it won't be resolved. 
+ */ + if (gr->protocol == IPPROTO_RAW) + return 0; + return -ENOENT; + } + + /* + * Diag modules such as unix, packet, netlink + * may return EINVAL on older kernels. + */ + if (err == -EINVAL) { + if (gr->family == AF_UNIX || + gr->family == AF_PACKET || + gr->family == AF_NETLINK) { + pr_debug("%s: %d\n", msg, err); + return -EINVAL; + } + } + + /* + * Rest is more serious, just print enough information. + * In case if everything is OK -- point as well. + */ + if (!err) + pr_info("%s: OK\n", msg); + else + pr_err("%s: %d: %s\n", msg, err, strerror(-err)); + + return err; +} + +int collect_sockets(struct ns_id *ns) +{ + int err = 0, tmp; + int nl = ns->net.nlsk; + struct sock_diag_req req; + + memset(&req, 0, sizeof(req)); + req.hdr.nlmsg_len = sizeof(req); + req.hdr.nlmsg_type = SOCK_DIAG_BY_FAMILY; + req.hdr.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST; + req.hdr.nlmsg_seq = CR_NLMSG_SEQ; + + /* Collect UNIX sockets */ + req.r.u.sdiag_family = AF_UNIX; + req.r.u.udiag_states = -1; /* All */ + req.r.u.udiag_show = UDIAG_SHOW_NAME | UDIAG_SHOW_VFS | + UDIAG_SHOW_PEER | UDIAG_SHOW_ICONS | + UDIAG_SHOW_RQLEN; + tmp = do_collect_req(nl, &req, sizeof(req), unix_receive_one, collect_err, ns, &req.r.u); + if (tmp) + err = tmp; + + /* Collect IPv4 TCP sockets */ + req.r.i.sdiag_family = AF_INET; + req.r.i.sdiag_protocol = IPPROTO_TCP; + req.r.i.idiag_ext = 0; + /* Only listening and established sockets supported yet */ + req.r.i.idiag_states = (1 << TCP_LISTEN) | (1 << TCP_ESTABLISHED) | + (1 << TCP_FIN_WAIT1) | (1 << TCP_FIN_WAIT2) | + (1 << TCP_CLOSE_WAIT) | (1 << TCP_LAST_ACK) | + (1 << TCP_CLOSING) | (1 << TCP_SYN_SENT); + tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, collect_err, ns, &req.r.i); + if (tmp) + err = tmp; + + /* Collect IPv4 UDP sockets */ + req.r.i.sdiag_family = AF_INET; + req.r.i.sdiag_protocol = IPPROTO_UDP; + req.r.i.idiag_ext = 0; + req.r.i.idiag_states = -1; /* All */ + tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, 
collect_err, ns, &req.r.i); + if (tmp) + err = tmp; + + /* Collect IPv4 UDP-lite sockets */ + req.r.i.sdiag_family = AF_INET; + req.r.i.sdiag_protocol = IPPROTO_UDPLITE; + req.r.i.idiag_ext = 0; + req.r.i.idiag_states = -1; /* All */ + tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, collect_err, ns, &req.r.i); + if (tmp) + err = tmp; + + /* Collect IPv4 RAW sockets */ + req.r.i.sdiag_family = AF_INET; + req.r.i.sdiag_protocol = IPPROTO_RAW; + req.r.i.idiag_ext = 0; + req.r.i.idiag_states = -1; /* All */ + tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, collect_err, ns, &req.r.i); + if (tmp) + err = tmp; + + /* Collect IPv6 TCP sockets */ + req.r.i.sdiag_family = AF_INET6; + req.r.i.sdiag_protocol = IPPROTO_TCP; + req.r.i.idiag_ext = 0; + /* Only listening sockets supported yet */ + req.r.i.idiag_states = (1 << TCP_LISTEN) | (1 << TCP_ESTABLISHED) | + (1 << TCP_FIN_WAIT1) | (1 << TCP_FIN_WAIT2) | + (1 << TCP_CLOSE_WAIT) | (1 << TCP_LAST_ACK) | + (1 << TCP_CLOSING) | (1 << TCP_SYN_SENT); + tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, collect_err, ns, &req.r.i); + if (tmp) + err = tmp; + + /* Collect IPv6 UDP sockets */ + req.r.i.sdiag_family = AF_INET6; + req.r.i.sdiag_protocol = IPPROTO_UDP; + req.r.i.idiag_ext = 0; + req.r.i.idiag_states = -1; /* All */ + tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, collect_err, ns, &req.r.i); + if (tmp) + err = tmp; + + /* Collect IPv6 UDP-lite sockets */ + req.r.i.sdiag_family = AF_INET6; + req.r.i.sdiag_protocol = IPPROTO_UDPLITE; + req.r.i.idiag_ext = 0; + req.r.i.idiag_states = -1; /* All */ + tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, collect_err, ns, &req.r.i); + if (tmp) + err = tmp; + + /* Collect IPv6 RAW sockets */ + req.r.i.sdiag_family = AF_INET6; + req.r.i.sdiag_protocol = IPPROTO_RAW; + req.r.i.idiag_ext = 0; + req.r.i.idiag_states = -1; /* All */ + tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, collect_err, ns, &req.r.i); 
+ if (tmp) + err = tmp; + + req.r.p.sdiag_family = AF_PACKET; + req.r.p.sdiag_protocol = 0; + req.r.p.pdiag_show = PACKET_SHOW_INFO | PACKET_SHOW_MCLIST | + PACKET_SHOW_FANOUT | PACKET_SHOW_RING_CFG; + tmp = do_collect_req(nl, &req, sizeof(req), packet_receive_one, collect_err, ns, &req.r.p); + if (tmp) + err = tmp; + + req.r.n.sdiag_family = AF_NETLINK; + req.r.n.sdiag_protocol = NDIAG_PROTO_ALL; + req.r.n.ndiag_show = NDIAG_SHOW_GROUPS; + tmp = do_collect_req(nl, &req, sizeof(req), netlink_receive_one, collect_err, ns, &req.r.n); + if (tmp) + err = tmp; + + /* don't need anymore */ + close(nl); + ns->net.nlsk = -1; + + if (err && (ns->type == NS_CRIU)) { + /* + * If netns isn't dumped, criu will fail only + * if an unsupported socket will be really dumped. + */ + pr_info("Uncollected sockets! Will probably fail later.\n"); + err = 0; + } + + return err; +} + +static uint32_t last_ns_id = 0; + +int set_netns(uint32_t ns_id) +{ + struct ns_id *ns; + int nsfd; + + if (!(root_ns_mask & CLONE_NEWNET)) + return 0; + + if (ns_id == last_ns_id) + return 0; + + /* + * The 0 ns_id means that it was not set. We need + * this to be compatible with old images. 
+ */ + if (ns_id == 0) + ns = net_get_root_ns(); + else + ns = lookup_ns_by_id(ns_id, &net_ns_desc); + if (ns == NULL) { + pr_err("Unable to find a network namespace\n"); + return -1; + } + nsfd = fdstore_get(ns->net.nsfd_id); + if (nsfd < 0) + return -1; + if (setns(nsfd, CLONE_NEWNET)) { + pr_perror("Unable to switch a network namespace"); + close(nsfd); + return -1; + } + last_ns_id = ns_id; + close(nsfd); + + return 0; +} diff --git a/CRIU_code/criu/stats.c b/CRIU_code/criu/stats.c new file mode 100644 index 0000000..7410b5c --- /dev/null +++ b/CRIU_code/criu/stats.c @@ -0,0 +1,218 @@ +#include +#include +#include +#include "int.h" +#include "atomic.h" +#include "cr_options.h" +#include "rst-malloc.h" +#include "protobuf.h" +#include "stats.h" +#include "util.h" +#include "image.h" +#include "images/stats.pb-c.h" + +struct timing { + struct timeval start; + struct timeval total; +}; + +struct dump_stats { + struct timing timings[DUMP_TIME_NR_STATS]; + unsigned long counts[DUMP_CNT_NR_STATS]; +}; + +struct restore_stats { + struct timing timings[RESTORE_TIME_NS_STATS]; + atomic_t counts[RESTORE_CNT_NR_STATS]; +}; + +struct dump_stats *dstats; +struct restore_stats *rstats; + +void cnt_add(int c, unsigned long val) +{ + if (dstats != NULL) { + BUG_ON(c >= DUMP_CNT_NR_STATS); + dstats->counts[c] += val; + } else if (rstats != NULL) { + BUG_ON(c >= RESTORE_CNT_NR_STATS); + atomic_add(val, &rstats->counts[c]); + } else + BUG(); +} + +static void timeval_accumulate(const struct timeval *from, const struct timeval *to, + struct timeval *res) +{ + suseconds_t usec; + + res->tv_sec += to->tv_sec - from->tv_sec; + usec = to->tv_usec; + if (usec < from->tv_usec) { + usec += USEC_PER_SEC; + res->tv_sec -= 1; + } + res->tv_usec += usec - from->tv_usec; + if (res->tv_usec >= USEC_PER_SEC) { /* a full second of microseconds must carry too, keeping tv_usec below USEC_PER_SEC */ + res->tv_usec -= USEC_PER_SEC; + res->tv_sec += 1; + } +} + +static struct timing *get_timing(int t) +{ + if (dstats != NULL) { + BUG_ON(t >= DUMP_TIME_NR_STATS); + return
&dstats->timings[t]; + } else if (rstats != NULL) { + /* + * FIXME -- this does _NOT_ work when called + * from different tasks. + */ + BUG_ON(t >= RESTORE_TIME_NS_STATS); + return &rstats->timings[t]; + } + + BUG(); + return NULL; +} + +void timing_start(int t) +{ + struct timing *tm; + + tm = get_timing(t); + gettimeofday(&tm->start, NULL); +} + +void timing_stop(int t) +{ + struct timing *tm; + struct timeval now; + + tm = get_timing(t); + gettimeofday(&now, NULL); + timeval_accumulate(&tm->start, &now, &tm->total); +} + +static void encode_time(int t, u_int32_t *to) +{ + struct timing *tm; + + tm = get_timing(t); + *to = tm->total.tv_sec * USEC_PER_SEC + tm->total.tv_usec; +} + +static void display_stats(int what, StatsEntry *stats) +{ + if (what == DUMP_STATS) { + pr_msg("Displaying dump stats:\n"); + pr_msg("Freezing time: %d us\n", stats->dump->freezing_time); + pr_msg("Frozen time: %d us\n", stats->dump->frozen_time); + pr_msg("Memory dump time: %d us\n", stats->dump->memdump_time); + pr_msg("Memory write time: %d us\n", stats->dump->memwrite_time); + if (stats->dump->has_irmap_resolve) + pr_msg("IRMAP resolve time: %d us\n", stats->dump->irmap_resolve); + pr_msg("Memory pages scanned: %" PRIu64 " (0x%" PRIx64 ")\n", stats->dump->pages_scanned, + stats->dump->pages_scanned); + pr_msg("Memory pages skipped from parent: %" PRIu64 " (0x%" PRIx64 ")\n", + stats->dump->pages_skipped_parent, + stats->dump->pages_skipped_parent); + pr_msg("Memory pages written: %" PRIu64 " (0x%" PRIx64 ")\n", stats->dump->pages_written, + stats->dump->pages_written); + pr_msg("Lazy memory pages: %" PRIu64 " (0x%" PRIx64 ")\n", stats->dump->pages_lazy, + stats->dump->pages_lazy); + } else if (what == RESTORE_STATS) { + pr_msg("Displaying restore stats:\n"); + pr_msg("Pages compared: %" PRIu64 " (0x%" PRIx64 ")\n", stats->restore->pages_compared, + stats->restore->pages_compared); + pr_msg("Pages skipped COW: %" PRIu64 " (0x%" PRIx64 ")\n", stats->restore->pages_skipped_cow, + 
stats->restore->pages_skipped_cow); + if (stats->restore->has_pages_restored) + pr_msg("Pages restored: %" PRIu64 " (0x%" PRIx64 ")\n", stats->restore->pages_restored, + stats->restore->pages_restored); + pr_msg("Restore time: %d us\n", stats->restore->restore_time); + pr_msg("Forking time: %d us\n", stats->restore->forking_time); + } else + return; +} + +void write_stats(int what) +{ + StatsEntry stats = STATS_ENTRY__INIT; + DumpStatsEntry ds_entry = DUMP_STATS_ENTRY__INIT; + RestoreStatsEntry rs_entry = RESTORE_STATS_ENTRY__INIT; + char *name; + struct cr_img *img; + + pr_info("Writing stats\n"); + if (what == DUMP_STATS) { + stats.dump = &ds_entry; + + encode_time(TIME_FREEZING, &ds_entry.freezing_time); + encode_time(TIME_FROZEN, &ds_entry.frozen_time); + encode_time(TIME_MEMDUMP, &ds_entry.memdump_time); + encode_time(TIME_MEMWRITE, &ds_entry.memwrite_time); + ds_entry.has_irmap_resolve = true; + encode_time(TIME_IRMAP_RESOLVE, &ds_entry.irmap_resolve); + + ds_entry.pages_scanned = dstats->counts[CNT_PAGES_SCANNED]; + ds_entry.pages_skipped_parent = dstats->counts[CNT_PAGES_SKIPPED_PARENT]; + ds_entry.pages_written = dstats->counts[CNT_PAGES_WRITTEN]; + ds_entry.pages_lazy = dstats->counts[CNT_PAGES_LAZY]; + ds_entry.page_pipes = dstats->counts[CNT_PAGE_PIPES]; + ds_entry.has_page_pipes = true; + ds_entry.page_pipe_bufs = dstats->counts[CNT_PAGE_PIPE_BUFS]; + ds_entry.has_page_pipe_bufs = true; + + ds_entry.shpages_scanned = dstats->counts[CNT_SHPAGES_SCANNED]; + ds_entry.has_shpages_scanned = true; + ds_entry.shpages_skipped_parent = dstats->counts[CNT_SHPAGES_SKIPPED_PARENT]; + ds_entry.has_shpages_skipped_parent = true; + ds_entry.shpages_written = dstats->counts[CNT_SHPAGES_WRITTEN]; + ds_entry.has_shpages_written = true; + + name = "dump"; + } else if (what == RESTORE_STATS) { + stats.restore = &rs_entry; + + rs_entry.pages_compared = atomic_read(&rstats->counts[CNT_PAGES_COMPARED]); + rs_entry.pages_skipped_cow = 
atomic_read(&rstats->counts[CNT_PAGES_SKIPPED_COW]); + rs_entry.has_pages_restored = true; + rs_entry.pages_restored = atomic_read(&rstats->counts[CNT_PAGES_RESTORED]); + + encode_time(TIME_FORK, &rs_entry.forking_time); + encode_time(TIME_RESTORE, &rs_entry.restore_time); + + name = "restore"; + } else + return; + + img = open_image_at(AT_FDCWD, CR_FD_STATS, O_DUMP, name); + if (img) { + pb_write_one(img, &stats, PB_STATS); + close_image(img); + } + + if (opts.display_stats) + display_stats(what, &stats); +} + +int init_stats(int what) +{ + if (what == DUMP_STATS) { + /* + * Dumping happens via one process most of the time, + * so we are typically OK with the plain malloc, but + * when dumping namespaces we fork() a separate process + * for it and when it goes and dumps shmem segments + * it will alter the CNT_SHPAGES_ counters, so we need + * to have them in shmem. + */ + dstats = shmalloc(sizeof(*dstats)); + return dstats ? 0 : -1; + } + + rstats = shmalloc(sizeof(struct restore_stats)); + return rstats ? 0 : -1; +} diff --git a/CRIU_code/criu/string.c b/CRIU_code/criu/string.c new file mode 100644 index 0000000..543c642 --- /dev/null +++ b/CRIU_code/criu/string.c @@ -0,0 +1,60 @@ +/* + * Adopted from linux kernel + */ +#include +#include + +#include "string.h" + +#ifndef CONFIG_HAS_STRLCPY +/** + * strlcpy - Copy a %NUL terminated string into a sized buffer + * @dest: Where to copy the string to + * @src: Where to copy the string from + * @size: size of destination buffer + * + * Compatible with *BSD: the result is always a valid + * NUL-terminated string that fits in the buffer (unless, + * of course, the buffer size is zero). It does not pad + * out the result like strncpy() does. + */ +size_t strlcpy(char *dest, const char *src, size_t size) +{ + size_t ret = strlen(src); + + if (size) { + size_t len = (ret >= size) ? 
#ifndef CONFIG_HAS_STRLCAT
/**
 * strlcat - Append a length-limited, %NUL-terminated string to another
 * @dest: The string to be appended to
 * @src: The string to append to it
 * @count: The size of the destination buffer.
 *
 * Returns the length of the string it tried to create: the length of
 * @dest plus the length of @src.  A result >= @count means the output
 * was truncated.
 *
 * If @dest holds no %NUL within its first @count bytes (including the
 * case @count == 0) nothing is written and @count + strlen(@src) is
 * returned, as the *BSD implementation does.  Previously this case made
 * "count - dsize" wrap around and the copy ran past the buffer.
 */
size_t strlcat(char *dest, const char *src, size_t count)
{
	const char *nul = memchr(dest, '\0', count);
	size_t len = strlen(src);
	size_t dsize, room;

	if (!nul)
		return count + len;

	dsize = (size_t)(nul - dest);
	room = count - dsize;	/* at least 1: the byte holding the NUL */

	if (len >= room) {
		memcpy(dest + dsize, src, room - 1);
		dest[count - 1] = '\0';
	} else {
		memcpy(dest + dsize, src, len);
		dest[dsize + len] = '\0';
	}

	return dsize + len;
}
#endif
/*
 * GEN_SYSCTL_READ_FUNC(__type, __conv) - generate sysctl_read_<__type>().
 *
 * The generated function reads the sysctl file open at @fd into a local
 * buffer and converts @nr base-10 numbers from it with @__conv (a
 * strtoul()-style function), storing them in @arg.
 *
 * Returns 0 on success and -1 on a read error or when the buffer was
 * exhausted before @nr values were converted.
 *
 * One byte of the zero-initialised buffer is kept back from read() so the
 * text handed to @__conv is always NUL-terminated, and the short-input
 * path now returns -1 rather than the positive byte count left in ret,
 * which callers testing "< 0" took for success.
 */
#define GEN_SYSCTL_READ_FUNC(__type, __conv)				\
static int sysctl_read_##__type(int fd,					\
				struct sysctl_req *req,			\
				__type *arg,				\
				int nr)					\
{									\
	char buf[1024] = {0};						\
	int i, ret = -1;						\
	char *p = buf;							\
									\
	/* leave the last byte untouched: it terminates the string */	\
	ret = read(fd, buf, sizeof(buf) - 1);				\
	if (ret < 0) {							\
		pr_perror("Can't read %s", req->name);			\
		ret = -1;						\
		goto err;						\
	}								\
									\
	for (i = 0; i < nr && p < buf + sizeof(buf); p++, i++)		\
		((__type *)arg)[i] = __conv(p, &p, 10);			\
									\
	if (i != nr) {							\
		pr_err("Not enough params for %s (%d != %d)\n",		\
			req->name, i, nr);				\
		ret = -1;						\
		goto err;						\
	}								\
									\
	ret = 0;							\
									\
err:									\
	return ret;							\
}
+GEN_SYSCTL_READ_FUNC(u64, strtoull); +GEN_SYSCTL_READ_FUNC(s32, strtol); + +GEN_SYSCTL_WRITE_FUNC(u32, "%u "); +GEN_SYSCTL_WRITE_FUNC(u64, "%"PRIu64" "); +GEN_SYSCTL_WRITE_FUNC(s32, "%d "); + +static int +sysctl_write_char(int fd, struct sysctl_req *req, char *arg, int nr) +{ + pr_debug("%s nr %d\n", req->name, nr); + if (dprintf(fd, "%s\n", arg) < 0) + return -1; + + return 0; +} + +static int +sysctl_read_char(int fd, struct sysctl_req *req, char *arg, int nr) +{ + int ret = -1; + + pr_debug("%s nr %d\n", req->name, nr); + ret = read(fd, arg, nr - 1); + if (ret < 0) { + if (errno != EIO || !(req->flags & CTL_FLAGS_READ_EIO_SKIP)) + pr_perror("Can't read %s", req->name); + goto err; + } + arg[ret]='\0'; + ret = 0; + +err: + return ret; +} + +static int sysctl_userns_arg_size(int type) +{ + switch(CTL_TYPE(type)) { + case __CTL_U32A: + return sizeof(u32) * CTL_LEN(type); + case CTL_U32: + return sizeof(u32); + case CTL_32: + return sizeof(s32); + case __CTL_U64A: + return sizeof(u64) * CTL_LEN(type); + case CTL_U64: + return sizeof(u64); + case __CTL_STR: + return sizeof(char) * CTL_LEN(type) + 1; + default: + pr_err("unknown arg type %d\n", type); + + /* Ensure overflow to cause an error */ + return MAX_UNSFD_MSG_SIZE; + } +} + +static int do_sysctl_op(int fd, struct sysctl_req *req, int op) +{ + int ret = -1, nr = 1; + + switch (CTL_TYPE(req->type)) { + case __CTL_U32A: + nr = CTL_LEN(req->type); + /* fallthrough */ + case CTL_U32: + __SYSCTL_OP(ret, fd, req, u32, nr, op); + break; + case CTL_32: + __SYSCTL_OP(ret, fd, req, s32, nr, op); + break; + case __CTL_U64A: + nr = CTL_LEN(req->type); + /* fallthrough */ + case CTL_U64: + __SYSCTL_OP(ret, fd, req, u64, nr, op); + break; + case __CTL_STR: + nr = CTL_LEN(req->type); + __SYSCTL_OP(ret, fd, req, char, nr, op); + break; + } + + return ret; +} + +static int __userns_sysctl_op(void *arg, int proc_fd, pid_t pid) +{ + int fd, ret = -1, dir, i, status, *fds = NULL; + struct sysctl_userns_req *userns_req = arg; + 
int op = userns_req->op; + struct sysctl_req *req, **reqs = NULL; + sigset_t blockmask, oldmask; + pid_t worker; + + // fix up the pointer + req = userns_req->reqs = (struct sysctl_req *) &userns_req[1]; + + /* For files in the IPC/UTS namespaces, restoring is more complicated + * than for net. Unprivileged users cannot even open these files, so + * they must be opened by usernsd. However, the value in the kernel is + * changed for the IPC/UTS namespace that write()s to the open sysctl + * file (not who opened it). So, we must set the value from inside the + * usernsd caller's namespace. We: + * + * 1. unsd opens the sysctl files + * 2. forks a task + * 3. setns()es to the UTS/IPC namespace of the caller + * 4. write()s to the files and exits + */ + dir = open("/proc/sys", O_RDONLY, O_DIRECTORY); + if (dir < 0) { + pr_perror("Can't open sysctl dir"); + return -1; + } + + fds = xmalloc(sizeof(int) * userns_req->nr_req); + if (!fds) + goto out; + + reqs = xmalloc(sizeof(struct sysctl_req *) * userns_req->nr_req); + if (!reqs) + goto out; + + memset(fds, -1, sizeof(int) * userns_req->nr_req); + + for (i = 0; i < userns_req->nr_req; i++) { + int arg_len = sysctl_userns_arg_size(req->type); + int name_len = strlen((char *) &req[1]) + 1; + int total_len = sizeof(*req) + arg_len + name_len; + int flags; + + /* fix up the pointers */ + req->name = (char *) &req[1]; + req->arg = req->name + name_len; + + if (((char *) req) + total_len >= ((char *) userns_req) + MAX_UNSFD_MSG_SIZE) { + pr_err("bad sysctl req %s, too big: %d\n", req->name, total_len); + goto out; + } + + if (op == CTL_READ) + flags = O_RDONLY; + else + flags = O_WRONLY; + + fd = openat(dir, req->name, flags); + if (fd < 0) { + if (errno == ENOENT && (req->flags & CTL_FLAGS_OPTIONAL)) + continue; + pr_perror("Can't open sysctl %s", req->name); + goto out; + } + + /* save a pointer to the req, so we don't need to recompute its + * location + */ + reqs[i] = req; + fds[i] = fd; + + req = (struct sysctl_req *) 
(((char *) req) + total_len); + } + + /* + * Don't let the sigchld_handler() mess with us + * calling waitpid() on the exited worker. The + * same is done in cr_system(). + */ + + sigemptyset(&blockmask); + sigaddset(&blockmask, SIGCHLD); + sigprocmask(SIG_BLOCK, &blockmask, &oldmask); + + worker = fork(); + if (worker < 0) + goto out; + + if (!worker) { + int nsfd; + const char *nsname = ns_to_string(userns_req->ns); + + BUG_ON(!nsname); + nsfd = openat(proc_fd, nsname, O_RDONLY); + if (nsfd < 0) { + pr_perror("failed to open pid %d's ns %s", pid, nsname); + exit(1); + } + + if (setns(nsfd, 0) < 0) { + pr_perror("failed to setns to %d's ns %s", pid, nsname); + exit(1); + } + + close(nsfd); + + for (i = 0; i < userns_req->nr_req; i++) { + if (do_sysctl_op(fds[i], reqs[i], op) < 0) { + if (op != CTL_READ || errno != EIO || !(req->flags & CTL_FLAGS_READ_EIO_SKIP)) + exit(1); + } else { + /* mark sysctl in question exists */ + req->flags |= CTL_FLAGS_HAS; + } + } + + exit(0); + } + + if (waitpid(worker, &status, 0) != worker) { + pr_perror("worker didn't die?"); + kill(worker, SIGKILL); + goto out; + } + sigprocmask(SIG_SETMASK, &oldmask, NULL); + + if (!WIFEXITED(status) || WEXITSTATUS(status)) { + pr_err("worker failed: %d\n", status); + goto out; + } + + ret = 0; + +out: + if (fds) { + for (i = 0; i < userns_req->nr_req; i++) { + if (fds[i] < 0) + break; + close_safe(&fds[i]); + } + + xfree(fds); + } + + if (reqs) + xfree(reqs); + + close_safe(&dir); + + return ret; +} + +static int __nonuserns_sysctl_op(struct sysctl_req *req, size_t nr_req, int op) +{ + int ret, exit_code = -1; + + while (nr_req--) { + int fd; + + if (op == CTL_READ) + fd = do_open_proc(PROC_GEN, O_RDONLY, "sys/%s", req->name); + else + fd = do_open_proc(PROC_GEN, O_RDWR, "sys/%s", req->name); + if (fd < 0) { + if (errno == ENOENT && (req->flags & CTL_FLAGS_OPTIONAL)) { + req++; + continue; + } + pr_perror("Can't open sysctl %s", req->name); + goto out; + } + + ret = do_sysctl_op(fd, req, op); + 
if (ret) { + if (op != CTL_READ || errno != EIO || !(req->flags & CTL_FLAGS_READ_EIO_SKIP)) { + close(fd); + goto out; + } + } else { + /* mark sysctl in question exists */ + req->flags |= CTL_FLAGS_HAS; + } + + close(fd); + req++; + } + + exit_code = 0; +out: + return exit_code; +} + +int sysctl_op(struct sysctl_req *req, size_t nr_req, int op, unsigned int ns) +{ + int i, fd, ret; + struct sysctl_userns_req *userns_req; + struct sysctl_req *cur; + + if (nr_req == 0) + return 0; + + if (ns & ~KNOWN_NS_MASK) { + pr_err("don't know how to restore some namespaces in %u\n", ns); + return -1; + } + + /* The way sysctl files behave on open/write depends on the namespace + * they correspond to. If we don't want to interact with something in a + * namespace (e.g. kernel/cap_last_cap is global), we can do this from + * the current process. Similarly, if we're accessing net namespaces, + * we can just do the operation from our current process, since + * anything with CAP_NET_ADMIN can write to the net/ sysctls, and we + * still have that even when restoring in a user ns. + * + * For IPC/UTS, we restore them as described above. + * + * For read operations, we need to copy the values back to return. + * Fortunately, we only do read on dump (or global reads on restore), + * so we can do those in process as well. + */ + if (!ns || ns & CLONE_NEWNET || op == CTL_READ) + return __nonuserns_sysctl_op(req, nr_req, op); + + /* + * In order to avoid lots of opening of /proc/sys for each struct sysctl_req, + * we encode each array of sysctl_reqs into one contiguous region of memory so + * it can be passed via userns_call if necessary. It looks like this: + * + * struct sysctl_userns_req struct sysctl_req name arg + * --------------------------------------------------------------------------- + * | op | nr_req | reqs | | name | arg | "the name" | "the arg" ... 
+ * --------------------------------------------------------------------------- + * |____^ |______|__^ ^ + * |_______________| + */ + userns_req = alloca(MAX_UNSFD_MSG_SIZE); + userns_req->op = op; + userns_req->nr_req = nr_req; + userns_req->ns = ns; + userns_req->reqs = (struct sysctl_req *) (&userns_req[1]); + + cur = userns_req->reqs; + for (i = 0; i < nr_req; i++) { + int arg_len = sysctl_userns_arg_size(req[i].type); + int name_len = strlen(req[i].name) + 1; + int total_len = sizeof(*cur) + arg_len + name_len; + + if (((char *) cur) + total_len >= ((char *) userns_req) + MAX_UNSFD_MSG_SIZE) { + pr_err("sysctl msg %s too big: %d\n", req[i].name, total_len); + return -1; + } + + /* copy over the non-pointer fields */ + cur->type = req[i].type; + cur->flags = req[i].flags; + + cur->name = (char *) &cur[1]; + strcpy(cur->name, req[i].name); + + cur->arg = cur->name + name_len; + memcpy(cur->arg, req[i].arg, arg_len); + + cur = (struct sysctl_req *) (((char *) cur) + total_len); + } + + fd = open_proc(PROC_SELF, "ns"); + if (fd < 0) + return -1; + + ret = userns_call(__userns_sysctl_op, 0, userns_req, MAX_UNSFD_MSG_SIZE, fd); + close(fd); + return ret; +} diff --git a/CRIU_code/criu/sysfs_parse.c b/CRIU_code/criu/sysfs_parse.c new file mode 100644 index 0000000..922e5d4 --- /dev/null +++ b/CRIU_code/criu/sysfs_parse.c @@ -0,0 +1,326 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cr_options.h" +#include "log.h" +#include "xmalloc.h" +#include "files.h" +#include "proc_parse.h" +#include "util.h" +#include "sysfs_parse.h" +#include "namespaces.h" +#include "mount.h" + +/* + * Currently, there are two kernel problems dealing with AUFS + * filesystems. Until these problems are fixed in the kernel, + * we have AUFS support in CRIU to handle the following issues: + * + * 1) /proc//mountinfo: The problem is that for AUFS the root field + * of the root entry is missing the pathname (it's only /). 
For example: + * + * 90 61 0:33 / / rw,relatime - aufs none rw,si=4476a910a24617e6 + * + * To handle this issue, the user has to specify the root of the AUFS + * filesystem with the --root command line option. + * + * 2) /proc//map_files: The symlinks are absolute pathnames of the + * corresponding *physical* files in the branch they exist. For example, + * for a Docker container using AUFS, a symlink would look like: + * 400000-489000 -> /var/lib/docker/aufs/diff//bin/ + * + * Therefore, when we use the link file descriptor vm_file_fd in + * dump_one_reg_file() to read the link, we get the file's physical + * absolute pathname which does not exist relative to the root of the + * mount namespace and even if we used its relative pathname, the dev:ino + * values would be different from the physical file's dev:ino causing the + * dump to fail. + * + * To handle this issue, we figure out the "correct" paths when parsing + * map_files and save it for later use. See fixup_aufs_vma_fd() for + * details. + */ + +struct ns_id *aufs_nsid; +static char **aufs_branches; + +/* + * Parse out and save the AUFS superblock info in the + * given buffer. + */ +static int parse_aufs_sbinfo(struct mount_info *mi, char *sbinfo, int len) +{ + char *cp; + int n; + + cp = strstr(mi->options, "si="); + if (!cp) { + pr_err("Cannot find sbinfo in option string %s\n", mi->options); + return -1; + } + + /* all ok, copy */ + if (len < 4) { /* 4 for "si_" */ + pr_err("Buffer of %d bytes too small for sbinfo\n", len); + return -1; + } + strcpy(sbinfo, "si_"); + n = 3; + sbinfo += n; + cp += n; + while (isxdigit(*cp) && n < len) { + *sbinfo++ = *cp++; + n++; + } + if (n >= len) { + pr_err("Sbinfo in options string %s too long\n", mi->options); + return -1; + } + *sbinfo = '\0'; + return 0; +} + +/* + * If the specified path is in a branch, replace it + * with pathname from root. 
+ */ +static int fixup_aufs_path(char *path, int size) +{ + char rpath[PATH_MAX]; + int n; + int blen; + + if (aufs_branches == NULL) { + pr_err("No aufs branches to search for %s\n", path); + return -1; + } + + for (n = 0; aufs_branches[n] != NULL; n++) { + blen = strlen(aufs_branches[n]); + if (!strncmp(path, aufs_branches[n], blen)) + break; + } + + if (aufs_branches[n] == NULL) + return 0; /* not in a branch */ + + n = snprintf(rpath, PATH_MAX, "%s", &path[blen]); + if (n >= min(PATH_MAX, size)) { + pr_err("Not enough space to replace %s\n", path); + return -1; + } + + pr_debug("Replacing %s with %s\n", path, rpath); + strcpy(path, rpath); + return n; +} + +/* + * Kernel stores patchnames to AUFS branches in the br files in + * the /sys/fs/aufs/si_ directory where denotes a branch + * number and is a hexadecimal number in %lx format. For + * example: + * + * $ cat /sys/fs/aufs/si_f598876b087ed883/br0 + * /path/to/branch0/directory=rw + * + * This function sets up an array of pointers to branch pathnames. + */ +int parse_aufs_branches(struct mount_info *mi) +{ + char path[AUFSBR_PATH_LEN]; + char *cp; + int n; + int ret; + unsigned int br_num; + unsigned int br_max; + DIR *dp; + FILE *fp; + struct dirent *de; + + pr_info("Collecting AUFS branch pathnames ...\n"); + + if (mi->nsid == 0) { + pr_err("No nsid to parse its aufs branches\n"); + return -1; + } + + if (mi->nsid == aufs_nsid) { + pr_debug("Using cached aufs branch paths for nsid %p\n", aufs_nsid); + return 0; + } + + if (aufs_nsid) + free_aufs_branches(); + + strcpy(path, SYSFS_AUFS); /* /sys/fs/aufs/ */ + if (parse_aufs_sbinfo(mi, &path[sizeof SYSFS_AUFS - 1], SBINFO_LEN) < 0) + return -1; + if ((dp = opendir(path)) == NULL) { + pr_perror("Cannot opendir %s", path); + return -1; + } + + /* + * Find out how many branches we have. 
+ */ + br_max = 0; + ret = 0; + while (1) { + errno = 0; + if ((de = readdir(dp)) == NULL) { + if (errno) { + pr_perror("Cannot readdir %s", path); + ret = -1; + } + break; + } + + ret = sscanf(de->d_name, "br%d", &br_num); + if (ret == 1 && br_num > br_max) + br_max = br_num; + } + closedir(dp); + if (ret == -1) + return -1; + + /* + * Default AUFS maximum is 127, so 1000 should be plenty. + * If you increase the maximum to more than 3 digits, + * make sure to change AUFSBR_PATH_LEN accordingly. + */ + if (br_max > 999) { + pr_err("Too many branches %d\n", br_max); + return -1; + } + + /* + * Allocate an array of pointers to branch pathnames to be read. + * Branches are indexed from 0 and we need a NULL pointer at the end. + */ + aufs_branches = xzalloc((br_max + 2) * sizeof (char *)); + if (!aufs_branches) + return -1; + + /* + * Now read branch pathnames from the branch files. + */ + n = strlen(path); + for (br_num = 0; br_num <= br_max; br_num++) { + fp = NULL; + + ret = snprintf(&path[n], sizeof path - n, "/br%d", br_num); + if (ret >= sizeof path - n) { + pr_err("Buffer overrun creating path for branch %d\n", br_num); + goto err; + } + + if ((fp = fopen(path, "r")) == NULL) { + pr_perror("Cannot fopen %s", path); + goto err; + } + + if (fscanf(fp, "%ms=", &aufs_branches[br_num]) != 1 || + aufs_branches[br_num] == NULL) { + pr_perror("Parse error reading %s", path); + goto err; + } + + /* chop off the trailing "=..." stuff */ + if ((cp = strchr(aufs_branches[br_num], '=')) == NULL) { + pr_err("Bad format in branch pathname %s\n", aufs_branches[br_num]); + goto err; + } + *cp = '\0'; + + fclose(fp); + /* + * Log branch information for external utitilies that + * want to recreate the process's AUFS filesystem + * before calling criu restore. + * + * DO NOT CHANGE this format! 
+ */ + pr_info("%s : %s\n", path, aufs_branches[br_num]); + } + + aufs_nsid = mi->nsid; + return 0; + +err: + if (fp) + fclose(fp); + free_aufs_branches(); + return -1; +} + +/* + * AUFS support to compensate for the kernel bug + * exposing branch pathnames in map_files and providing + * a wrong mnt_id value in /proc//fdinfo/. + * + * If the link points inside a branch, save the + * relative pathname from the root of the mount + * namespace as well as the full pathname from + * globl root (/) for later use in dump_filemap() + * and parse_smaps(). + */ +int fixup_aufs_vma_fd(struct vma_area *vma, int vm_file_fd) +{ + char path[PATH_MAX]; + int len; + + path[0] = '.'; + len = read_fd_link(vm_file_fd, &path[0], sizeof path - 1); + if (len < 0) + return -1; + + len = fixup_aufs_path(&path[1], sizeof path - 1); + if (len <= 0) + return len; + + vma->aufs_rpath = xmalloc(len + 2); + if (!vma->aufs_rpath) + return -1; + + strcpy(vma->aufs_rpath, path); + if (opts.root) { + /* skip ./ in path */ + vma->aufs_fpath = xsprintf("%s/%s", opts.root, &path[2]); + if (!vma->aufs_fpath) + return -1; + } + pr_debug("Saved AUFS paths %s and %s\n", vma->aufs_rpath, vma->aufs_fpath); + + if (stat(vma->aufs_fpath, vma->vmst) < 0) { + pr_perror("Failed stat on map %"PRIx64" (%s)", + vma->e->start, vma->aufs_fpath); + return -1; + } + + /* tell parse_smap() not to call get_fd_mntid() */ + vma->mnt_id = -1; + return len; +} + +void free_aufs_branches(void) +{ + int n; + + if (aufs_branches) { + for (n = 0; aufs_branches[n] != NULL; n++) + xfree(aufs_branches[n]); + + xfree(aufs_branches); + aufs_branches = NULL; + } + + aufs_nsid = NULL; +} diff --git a/CRIU_code/criu/timerfd.c b/CRIU_code/criu/timerfd.c new file mode 100644 index 0000000..b5ee5d8 --- /dev/null +++ b/CRIU_code/criu/timerfd.c @@ -0,0 +1,186 @@ +#include +#include +#include + +#include +#include + +#include "protobuf.h" +#include "images/timerfd.pb-c.h" + +#include "fdinfo.h" +#include "rst-malloc.h" +#include "cr_options.h" 
+#include "restorer.h" +#include "timerfd.h" +#include "pstree.h" +#include "files.h" +#include "imgset.h" +#include "util.h" +#include "log.h" +#include "common/bug.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "timerfd: " + +struct timerfd_dump_arg { + u32 id; + const struct fd_parms *p; +}; + +struct timerfd_info { + TimerfdEntry *tfe; + struct file_desc d; + int t_fd; + struct list_head rlist; +}; + +static LIST_HEAD(rst_timerfds); + +int check_timerfd(void) +{ + int fd, ret = -1; + + fd = timerfd_create(CLOCK_MONOTONIC, 0); + if (fd < 0) { + pr_perror("timerfd_create failed"); + return -1; + } else { + ret = ioctl(fd, TFD_IOC_SET_TICKS, NULL); + if (ret < 0) { + if (errno != EFAULT) + pr_perror("No timerfd support for c/r"); + else + ret = 0; + } + } + + close(fd); + return ret; +} + +int is_timerfd_link(char *link) +{ + return is_anon_link_type(link, "[timerfd]"); +} + +static int dump_one_timerfd(int lfd, u32 id, const struct fd_parms *p) +{ + TimerfdEntry tfe = TIMERFD_ENTRY__INIT; + FileEntry fe = FILE_ENTRY__INIT; + + if (parse_fdinfo(lfd, FD_TYPES__TIMERFD, &tfe)) + return -1; + + tfe.id = id; + tfe.flags = p->flags; + tfe.fown = (FownEntry *)&p->fown; + pr_info("Dumping id %#x clockid %d it_value(%llu, %llu) it_interval(%llu, %llu)\n", + tfe.id, tfe.clockid, (unsigned long long)tfe.vsec, (unsigned long long)tfe.vnsec, + (unsigned long long)tfe.isec, (unsigned long long)tfe.insec); + + fe.type = FD_TYPES__TIMERFD; + fe.id = tfe.id; + fe.tfd = &tfe; + + return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); +} + +const struct fdtype_ops timerfd_dump_ops = { + .type = FD_TYPES__TIMERFD, + .dump = dump_one_timerfd, +}; + +int prepare_timerfds(struct task_restore_args *ta) +{ + struct timerfd_info *ti; + struct restore_timerfd *t; + + ta->timerfd = (struct restore_timerfd *)rst_mem_align_cpos(RM_PRIVATE); + ta->timerfd_n = 0; + + list_for_each_entry(ti, &rst_timerfds, rlist) { + TimerfdEntry *tfe = ti->tfe; + + t = rst_mem_alloc(sizeof(*t), 
RM_PRIVATE); + if (!t) + return -1; + + t->id = tfe->id; + t->fd = ti->t_fd; + t->clockid = tfe->clockid; + t->ticks = (unsigned long)tfe->ticks; + t->settime_flags = tfe->settime_flags; + t->val.it_interval.tv_sec = (time_t)tfe->isec; + t->val.it_interval.tv_nsec = (long)tfe->insec; + t->val.it_value.tv_sec = (time_t)tfe->vsec; + t->val.it_value.tv_nsec = (long)tfe->vnsec; + + ta->timerfd_n++; + } + + return 0; +} + +static int timerfd_open(struct file_desc *d, int *new_fd) +{ + struct timerfd_info *info; + TimerfdEntry *tfe; + int tmp = -1; + + info = container_of(d, struct timerfd_info, d); + tfe = info->tfe; + pr_info("Creating timerfd id %#x clockid %d settime_flags %x ticks %llu " + "it_value(%llu, %llu) it_interval(%llu, %llu)\n", + tfe->id, tfe->clockid, tfe->settime_flags, (unsigned long long)tfe->ticks, + (unsigned long long)tfe->vsec, (unsigned long long)tfe->vnsec, + (unsigned long long)tfe->isec, (unsigned long long)tfe->insec); + + tmp = timerfd_create(tfe->clockid, 0); + if (tmp < 0) { + pr_perror("Can't create for %#x", tfe->id); + return -1; + } + + if (rst_file_params(tmp, tfe->fown, tfe->flags)) { + pr_perror("Can't restore params for %#x", tfe->id); + goto err_close; + } + + info->t_fd = file_master(d)->fe->fd; + list_add_tail(&info->rlist, &rst_timerfds); + + *new_fd = tmp; + return 0; + +err_close: + close_safe(&tmp); + return -1; +} + +static struct file_desc_ops timerfd_desc_ops = { + .type = FD_TYPES__TIMERFD, + .open = timerfd_open, +}; + +static int collect_one_timerfd(void *o, ProtobufCMessage *msg, struct cr_img *i) +{ + struct timerfd_info *info = o; + + info->tfe = pb_msg(msg, TimerfdEntry); + if (verify_timerfd(info->tfe)) { + pr_err("Verification failed for %#x\n", info->tfe->id); + return -1; + } + + info->t_fd = -1; + + return file_desc_add(&info->d, info->tfe->id, &timerfd_desc_ops); +} + +struct collect_image_info timerfd_cinfo = { + .fd_type = CR_FD_TIMERFD, + .pb_type = PB_TIMERFD, + .priv_size = sizeof(struct timerfd_info), + 
.collect = collect_one_timerfd, +}; diff --git a/CRIU_code/criu/tls.c b/CRIU_code/criu/tls.c new file mode 100644 index 0000000..db9cc4f --- /dev/null +++ b/CRIU_code/criu/tls.c @@ -0,0 +1,370 @@ +#include +#include +#include +#include + +#include + +#include "cr_options.h" +#include "xmalloc.h" + +/* Compatability with GnuTLS verson <3.5 */ +#ifndef GNUTLS_E_CERTIFICATE_VERIFICATION_ERROR +# define GNUTLS_E_CERTIFICATE_VERIFICATION_ERROR GNUTLS_E_CERTIFICATE_ERROR +#endif + +#undef LOG_PREFIX +#define LOG_PREFIX "tls: " + +#define CRIU_PKI_DIR SYSCONFDIR "/pki" +#define CRIU_CACERT CRIU_PKI_DIR "/CA/cacert.pem" +#define CRIU_CACRL CRIU_PKI_DIR "/CA/cacrl.pem" +#define CRIU_CERT CRIU_PKI_DIR "/criu/cert.pem" +#define CRIU_KEY CRIU_PKI_DIR "/criu/private/key.pem" + +#define SPLICE_BUF_SZ_MAX (PIPE_BUF * 100) + +#define tls_perror(msg, ret) pr_err("%s: %s\n", msg, gnutls_strerror(ret)) + +static gnutls_session_t session; +static gnutls_certificate_credentials_t x509_cred; +static int tls_sk = -1; +static int tls_sk_flags = 0; + +void tls_terminate_session() +{ + int ret; + + if (!opts.tls) + return; + + if (session) { + do { + /* don't wait for peer to close connection */ + ret = gnutls_bye(session, GNUTLS_SHUT_WR); + } while(ret == GNUTLS_E_AGAIN || ret == GNUTLS_E_INTERRUPTED); + gnutls_deinit(session); + } + + tls_sk = -1; + if (x509_cred) + gnutls_certificate_free_credentials(x509_cred); +} + +ssize_t tls_send(const void *buf, size_t len, int flags) +{ + int ret; + + tls_sk_flags = flags; + ret = gnutls_record_send(session, buf, len); + tls_sk_flags = 0; + + if (ret < 0) { + switch(ret) { + case GNUTLS_E_AGAIN: + errno = EAGAIN; + break; + case GNUTLS_E_INTERRUPTED: + errno = EINTR; + break; + case GNUTLS_E_UNEXPECTED_PACKET_LENGTH: + errno = ENOMSG; + break; + default: + tls_perror("Failed to send data", ret); + errno = EIO; + break; + } + } + + return ret; +} + +/* + * Read data from a file descriptor, then encrypt and send it with GnuTLS. 
+ * This function is used for cases when we would otherwise use splice() + * to transfer data from PIPE to TCP socket. + */ +int tls_send_data_from_fd(int fd, unsigned long len) +{ + ssize_t copied; + unsigned long buf_size = min(len, (unsigned long)SPLICE_BUF_SZ_MAX); + void *buf = xmalloc(buf_size); + + if (!buf) + return -1; + + while (len > 0) { + int ret, sent; + + copied = read(fd, buf, min(len, buf_size)); + if (copied <= 0) { + pr_perror("Can't read from pipe"); + goto err; + } + + for(sent = 0; sent < copied; sent += ret) { + ret = tls_send((buf + sent), (copied - sent), 0); + if (ret < 0) { + tls_perror("Failed sending data", ret); + goto err; + } + } + len -= copied; + } +err: + xfree(buf); + return (len > 0); +} + +ssize_t tls_recv(void *buf, size_t len, int flags) +{ + int ret; + + tls_sk_flags = flags; + ret = gnutls_record_recv(session, buf, len); + tls_sk_flags = 0; + + /* Check if there are any data to receive in the gnutls buffers. */ + if (flags == MSG_DONTWAIT + && (ret == GNUTLS_E_AGAIN || ret == GNUTLS_E_INTERRUPTED)) { + size_t pending = gnutls_record_check_pending(session); + if (pending > 0) { + pr_debug("Receiving pending data (%zu bytes)\n", pending); + ret = gnutls_record_recv(session, buf, len); + } + } + + if (ret < 0) { + switch (ret) { + case GNUTLS_E_AGAIN: + errno = EAGAIN; + break; + case GNUTLS_E_INTERRUPTED: + errno = EINTR; + break; + default: + tls_perror("Failed receiving data", ret); + errno = EIO; + break; + } + ret = -1; + } + + return ret; +} + +/* + * Read and decrypt data with GnuTLS, then write it to a file descriptor. + * This function is used for cases when we would otherwise use splice() + * to transfer data from a TCP socket to a PIPE. 
+ */ +int tls_recv_data_to_fd(int fd, unsigned long len) +{ + gnutls_packet_t packet; + + while (len > 0) { + int ret, w; + gnutls_datum_t pdata; + + ret = gnutls_record_recv_packet(session, &packet); + if (ret == 0) { + pr_info("Connection closed by peer\n"); + break; + } else if (ret < 0) { + tls_perror("Received corrupted data", ret); + break; + } + + gnutls_packet_get(packet, &pdata, NULL); + for(w = 0; w < pdata.size; w += ret) { + ret = write(fd, (pdata.data + w), (pdata.size - w)); + if (ret < 0) { + pr_perror("Failed writing to fd"); + goto err; + } + } + len -= pdata.size; + } +err: + gnutls_packet_deinit(packet); + return (len > 0); +} + +static inline void tls_handshake_verification_status_print(int ret, unsigned status) +{ + gnutls_datum_t out; + int type = gnutls_certificate_type_get(session); + + if (!gnutls_certificate_verification_status_print(status, type, &out, 0)) + pr_err("%s\n", out.data); + + gnutls_free(out.data); +} + +static int tls_x509_verify_peer_cert(void) +{ + int ret; + unsigned status; + const char *hostname = NULL; + + if (!opts.tls_no_cn_verify) + hostname = opts.addr; + + ret = gnutls_certificate_verify_peers3(session, hostname, &status); + if (ret != GNUTLS_E_SUCCESS) { + tls_perror("Unable to verify TLS peer", ret); + return -1; + } + + if (status != 0) { + pr_err("Invalid certificate\n"); + tls_handshake_verification_status_print( + GNUTLS_E_CERTIFICATE_VERIFICATION_ERROR, status); + return -1; + } + + return 0; +} + +static int tls_handshake() +{ + int ret = -1; + while (ret != GNUTLS_E_SUCCESS) { + ret = gnutls_handshake(session); + if (gnutls_error_is_fatal(ret)) { + tls_perror("TLS handshake failed", ret); + return -1; + } + } + pr_info("TLS handshake completed\n"); + return 0; +} + +static int tls_x509_setup_creds() +{ + int ret; + char *cacert = CRIU_CACERT; + char *cacrl = CRIU_CACRL; + char *cert = CRIU_CERT; + char *key = CRIU_KEY; + gnutls_x509_crt_fmt_t pem = GNUTLS_X509_FMT_PEM; + + if (opts.tls_cacert) + cacert = 
opts.tls_cacert; + if (opts.tls_cacrl) + cacrl = opts.tls_cacrl; + if (opts.tls_cert) + cert = opts.tls_cert; + if (opts.tls_key) + key = opts.tls_key; + + ret = gnutls_certificate_allocate_credentials(&x509_cred); + if (ret != GNUTLS_E_SUCCESS) { + tls_perror("Failed to allocate x509 credentials", ret); + return -1; + } + + if (!opts.tls_cacert) { + ret = gnutls_certificate_set_x509_system_trust(x509_cred); + if (ret < 0) { + tls_perror("Failed to load default trusted CAs", ret); + return -1; + } + } + + ret = gnutls_certificate_set_x509_trust_file(x509_cred, cacert, pem); + if (ret == 0) { + pr_info("No trusted CA certificates added (%s)\n", cacert); + if (opts.tls_cacert) + return -1; + } + + if (!access(cacrl, R_OK)) { + ret = gnutls_certificate_set_x509_crl_file(x509_cred, cacrl, pem); + if (ret < 0) { + tls_perror("Can't set certificate revocation list", ret); + return -1; + } + } else if (opts.tls_cacrl) { + pr_perror("Can't read certificate revocation list %s", cacrl); + return -1; + } + + ret = gnutls_certificate_set_x509_key_file(x509_cred, cert, key, pem); + if (ret != GNUTLS_E_SUCCESS) { + tls_perror("Failed to set certificate/private key pair", ret); + return -1; + } + + return 0; +} + +static ssize_t _tls_push_cb(void *p, const void* data, size_t sz) +{ + int fd = *(int *)(p); + return send(fd, data, sz, tls_sk_flags); +} + +static ssize_t _tls_pull_cb(void *p, void* data, size_t sz) +{ + int fd = *(int *)(p); + return recv(fd, data, sz, tls_sk_flags); +} + +static int tls_x509_setup_session(unsigned int flags) +{ + int ret; + + ret = gnutls_init(&session, flags); + if (ret != GNUTLS_E_SUCCESS) { + tls_perror("Failed to initialize session", ret); + return -1; + } + + ret = gnutls_credentials_set(session, GNUTLS_CRD_CERTIFICATE, x509_cred); + if (ret != GNUTLS_E_SUCCESS) { + tls_perror("Failed to set session credentials", ret); + return -1; + } + + ret = gnutls_set_default_priority(session); + if (ret != GNUTLS_E_SUCCESS) { + tls_perror("Failed to set 
priority", ret); + return -1; + } + + gnutls_transport_set_ptr(session, &tls_sk); + gnutls_transport_set_push_function(session, _tls_push_cb); + gnutls_transport_set_pull_function(session, _tls_pull_cb); + + if (flags == GNUTLS_SERVER) { + /* Require client certificate */ + gnutls_certificate_server_set_request(session, GNUTLS_CERT_REQUIRE); + /* Do not advertise trusted CAs to the client */ + gnutls_certificate_send_x509_rdn_sequence(session, 1); + } + + return 0; +} + +int tls_x509_init(int sockfd, bool is_server) +{ + if (!opts.tls) + return 0; + + tls_sk = sockfd; + if (tls_x509_setup_creds()) + goto err; + if (tls_x509_setup_session(is_server ? GNUTLS_SERVER : GNUTLS_CLIENT)) + goto err; + if (tls_handshake()) + goto err; + if (tls_x509_verify_peer_cert()) + goto err; + + return 0; +err: + tls_terminate_session(); + return -1; +} diff --git a/CRIU_code/criu/tty.c b/CRIU_code/criu/tty.c new file mode 100644 index 0000000..6fe1153 --- /dev/null +++ b/CRIU_code/criu/tty.c @@ -0,0 +1,2495 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "types.h" +#include "common/compiler.h" +#include "crtools.h" +#include "files.h" +#include "cr_options.h" +#include "imgset.h" +#include "servicefd.h" +#include "rst-malloc.h" +#include "log.h" +#include "common/list.h" +#include "util-pie.h" +#include "proc_parse.h" +#include "file-ids.h" +#include "files-reg.h" +#include "namespaces.h" +#include "external.h" +#include "action-scripts.h" +#include "mount.h" + +#include "protobuf.h" +#include "util.h" +#include "images/tty.pb-c.h" + +#include "parasite-syscall.h" +#include "parasite.h" + +#include "pstree.h" +#include "fdstore.h" +#include "tty.h" + +/* + * Here are some notes about overall TTY c/r design. At moment + * we support unix98 ptys only. 
Supporting legacy BSD terminals + * is impossible without help from the kernel side -- the indices + * of such terminals are not reported anywhere by the kernel, so + * we can't figure out active pairs. + * + * Usually a PTY is a pair of links -- a master peer and a slave + * peer. The master peer must be opened before the slave. Internally, when the + * kernel creates a master peer it also generates a slave interface in the form of + * /dev/pts/N, where N is the so-called pty "index". A master/slave connection + * is unambiguously identified by this index. + * + * Still, one master can carry multiple slaves -- for example a user opens + * one master via /dev/ptmx and the corresponding /dev/pts/N in sequence. + * The result will be the following + * + * master + * `- slave 1 + * `- slave 2 + * + * Both slaves will have the same master index but different file descriptors. + * Still, inside the kernel the pty parameters are the same for both slaves. Thus + * only one slave's parameters should be restored; there is no need to carry + * all parameters for every slave peer we've found. + * + * Note that /dev/pts/ is merely a convenient convention and internally the + * kernel doesn't care where exactly the inodes of ptys are located -- + * it depends on the "devpts" mount point path. 
+ */ + +#undef LOG_PREFIX +#define LOG_PREFIX "tty: " + +struct tty_data_entry { + struct list_head list; + TtyDataEntry *tde; +}; + +struct tty_info { + struct list_head list; + struct file_desc d; + + struct file_desc *reg_d; + + TtyFileEntry *tfe; + TtyInfoEntry *tie; + + struct list_head sibling; + struct tty_driver *driver; + + bool create; + bool inherit; + + struct tty_info *ctl_tty; + struct tty_info *link; + struct tty_data_entry *tty_data; + + int fdstore_id; +}; + +struct tty_dump_info { + struct list_head list; + + u32 id; + pid_t sid; + pid_t pgrp; + pid_t pid_real; + int fd; + int mnt_id; + struct tty_driver *driver; + + int index; + int lfd; + int flags; + struct tty_dump_info *link; + void *tty_data; + size_t tty_data_size; +}; + +static bool stdin_isatty = false; +static LIST_HEAD(collected_ttys); +static LIST_HEAD(all_ttys); +static int self_stdin_fdid = -1; + +/* + * Usually an application does not have that many ttys open. + * If this turns out not to be enough in the future we simply need + * to switch the tracking mechanism to something more extensible. + * + * A bitmap of (MAX_TTYS << 1) bits takes about 272 bytes of memory, + * a pretty acceptable trade-off for the sake of simplicity. + */ + +#define MAX_TTYS 1088 + +/* + * Custom indices should be even numbers just in case if we + * need odds for pair numbering someday. 
+ */ + +#define MAX_PTY_INDEX 1000 +#define CONSOLE_INDEX 1002 +#define VT_INDEX 1004 +#define CTTY_INDEX 1006 +#define STTY_INDEX 1010 +#define ETTY_INDEX 1012 +#define ETTY_INDEX_MAX 1076 +#define INDEX_ERR (MAX_TTYS + 1) + +static DECLARE_BITMAP(tty_bitmap, (MAX_TTYS << 1)); +static DECLARE_BITMAP(tty_active_pairs, (MAX_TTYS << 1)); + +struct tty_driver { + short type; + short subtype; + char *name; + int index; + int (*fd_get_index)(int fd, const struct fd_parms *p); + int (*img_get_index)(struct tty_info *ti); + int (*open)(struct tty_info *ti); +}; + +#define TTY_SUBTYPE_MASTER 0x0001 +#define TTY_SUBTYPE_SLAVE 0x0002 + +static int ptm_fd_get_index(int fd, const struct fd_parms *p) +{ + int index; + + if (ioctl(fd, TIOCGPTN, &index)) { + pr_perror("Can't obtain ptmx index"); + return INDEX_ERR; + } + + if (index > MAX_PTY_INDEX) { + pr_err("Index %d on ptmx is too big\n", index); + return INDEX_ERR; + } + + return index; +} + +static int pty_get_index(struct tty_info *ti) +{ + return ti->tie->pty->index; +} + +static int ext_fd_get_index(int fd, const struct fd_parms *p) +{ + static int index; + + index++; + + if (index + ETTY_INDEX > ETTY_INDEX_MAX) { + pr_err("Too many external terminals\n"); + return INDEX_ERR; + } + + return index + ETTY_INDEX; +} + +static int pty_open_ptmx(struct tty_info *info); + +static struct tty_driver ptm_driver = { + .type = TTY_TYPE__PTY, + .subtype = TTY_SUBTYPE_MASTER, + .name = "ptmx", + .fd_get_index = ptm_fd_get_index, + .img_get_index = pty_get_index, + .open = pty_open_ptmx, +}; + +static int open_simple_tty(struct tty_info *info); + +static struct tty_driver console_driver = { + .type = TTY_TYPE__CONSOLE, + .name = "console", + .index = CONSOLE_INDEX, + .open = open_simple_tty, +}; + +static struct tty_driver ctty_driver = { + .type = TTY_TYPE__CTTY, + .name = "ctty", + .index = CTTY_INDEX, + .open = open_simple_tty, +}; + +static struct tty_driver vt_driver = { + .type = TTY_TYPE__VT, + .name = "vt", + .index = 
VT_INDEX, + .open = open_simple_tty, +}; + +static int open_ext_tty(struct tty_info *info); +static struct tty_driver ext_driver = { + .type = TTY_TYPE__EXT_TTY, + .name = "ext", + .index = ETTY_INDEX, + .open = open_ext_tty, + .fd_get_index = ext_fd_get_index, +}; + +static struct tty_driver serial_driver = { + .type = TTY_TYPE__SERIAL, + .name = "serial", + .index = STTY_INDEX, + .open = open_simple_tty, +}; + +static int pts_fd_get_index(int fd, const struct fd_parms *p) +{ + int index; + const struct fd_link *link = p->link; + char *pos = strrchr(link->name, '/'); + + if (!pos || pos == (link->name + link->len - 1)) { + pr_err("Unexpected format on path %s\n", link->name + 1); + return INDEX_ERR; + } + + index = atoi(pos + 1); + if (index > MAX_PTY_INDEX) { + pr_err("Index %d on pts is too big\n", index); + return INDEX_ERR; + } + + return index; +} + +static struct tty_driver pts_driver = { + .type = TTY_TYPE__PTY, + .subtype = TTY_SUBTYPE_SLAVE, + .name = "pts", + .fd_get_index = pts_fd_get_index, + .img_get_index = pty_get_index, + .open = pty_open_ptmx, +}; + +struct tty_driver *get_tty_driver(dev_t rdev, dev_t dev) +{ + int major, minor; + char id[42]; + + snprintf(id, sizeof(id), "tty[%"PRIx64":%"PRIx64"]", rdev, dev); + if (external_lookup_id(id) || inherit_fd_lookup_id(id) >= 0) + return &ext_driver; + + major = major(rdev); + minor = minor(rdev); + + switch (major) { + case TTYAUX_MAJOR: + if (minor == 2) + return &ptm_driver; + else if (minor == 1) + return &console_driver; + else if (minor == 0) + return &ctty_driver; + break; + case TTY_MAJOR: + if (minor >= MIN_NR_CONSOLES && minor <= MAX_NR_CONSOLES) + /* + * Minors [MIN_NR_CONSOLES; MAX_NR_CONSOLES] stand + * for consoles (virtual terminals, VT in terms + * of kernel). 
+ */ + return &vt_driver; +#ifdef __s390x__ + /* + * On s390 we have the following consoles: + * - tty3215 : ttyS0 , minor = 64, linemode console + * - sclp_line : ttyS0 , minor = 64, linemode console + * - sclp_vt220 : ttysclp0, minor = 65, vt220 console + * See also "drivers/s390/char" + */ + else if (minor == 64 || minor == 65) + return &vt_driver; +#endif + /* Other minors points to UART serial ports */ + break; + case USB_SERIAL_MAJOR: + case LOW_DENSE_SERIAL_MAJOR: + return &serial_driver; + case UNIX98_PTY_MASTER_MAJOR ... (UNIX98_PTY_MASTER_MAJOR + UNIX98_PTY_MAJOR_COUNT - 1): + return &ptm_driver; + case UNIX98_PTY_SLAVE_MAJOR: + return &pts_driver; + } + return NULL; +} + +static inline int is_pty(struct tty_driver *driver) +{ + return driver->type == TTY_TYPE__PTY; +} + +/* + * /dev/ptmx is a shared resource between all tasks + * so we need to serialize access to it. + */ +static mutex_t *tty_mutex; + +static bool tty_is_master(struct tty_info *info); + +static int init_tty_mutex(void) +{ + if (tty_mutex) + return 0; + + tty_mutex = shmalloc(sizeof(*tty_mutex)); + if (!tty_mutex) { + pr_err("Can't create ptmx index mutex\n"); + return -1; + } + + mutex_init(tty_mutex); + + return 0; +} + +#define winsize_copy(d, s) \ + do { \ + ASSIGN_MEMBER((d), (s), ws_row); \ + ASSIGN_MEMBER((d), (s), ws_col); \ + ASSIGN_MEMBER((d), (s), ws_xpixel); \ + ASSIGN_MEMBER((d), (s), ws_ypixel); \ + } while (0) + +#define termios_copy(d, s) \ + do { \ + struct termios __t; \ + \ + memcpy((d)->c_cc, (s)->c_cc, \ + sizeof(__t.c_cc)); \ + \ + ASSIGN_MEMBER((d),(s), c_iflag); \ + ASSIGN_MEMBER((d),(s), c_oflag); \ + ASSIGN_MEMBER((d),(s), c_cflag); \ + ASSIGN_MEMBER((d),(s), c_lflag); \ + ASSIGN_MEMBER((d),(s), c_line); \ + } while (0) + +static int tty_gen_id(struct tty_driver *driver, int index) +{ + return (index << 1) + (driver->subtype == TTY_SUBTYPE_MASTER); +} + +static int tty_get_index(u32 id) +{ + return id >> 1; +} + +/* Make sure the active pairs do exist */ +static 
int tty_verify_active_pairs(void) +{ + unsigned long i, unpaired_slaves = 0; + + for_each_bit(i, tty_active_pairs) { + if ((i % 2) == 0) { + if (test_bit(i + 1, tty_active_pairs)) { + i++; + continue; + } + + if (!opts.shell_job && !opts.orphan_pts_master) { + pr_err("Found slave peer index %d without " + "correspond master peer\n", + tty_get_index(i)); + return -1; + } + + pr_debug("Unpaired slave %d\n", tty_get_index(i)); + + if (++unpaired_slaves > 1) { + pr_err("Only one slave external peer " + "is allowed (index %d)\n", + tty_get_index(i)); + return -1; + } + } + } + + return 0; +} + +static int tty_test_and_set(int bit, unsigned long *bitmap) +{ + int ret; + + ret = test_bit(bit, bitmap); + if (!ret) + set_bit(bit, bitmap); + return ret; +} + +/* + * Generate a regular file object in case if such is missed + * in the image file, ie obsolete interface has been used on + * checkpoint. + */ +static struct file_desc *pty_alloc_reg(struct tty_info *info, bool add) +{ + TtyFileEntry *tfe = info->tfe; + const size_t namelen = 64; + struct reg_file_info *r; + static struct file_desc_ops noops = {}; + + r = xzalloc(sizeof(*r) + sizeof(*r->rfe) + namelen); + if (!r) + return NULL; + + r->rfe = (void *)r + sizeof(*r); + reg_file_entry__init(r->rfe); + + r->rfe->name = (void *)r + sizeof(*r) + sizeof(*r->rfe); + if (tty_is_master(info)) + strcpy(r->rfe->name, "/dev/ptmx"); + else + snprintf(r->rfe->name, namelen, "/dev/pts/%u", + info->tie->pty->index); + + if (add) + file_desc_add(&r->d, tfe->id, &noops); + else + file_desc_init(&r->d, tfe->id, &noops); + + r->rfe->id = tfe->id; + r->rfe->flags = tfe->flags; + r->rfe->fown = tfe->fown; + r->path = &r->rfe->name[1]; + + return &r->d; +} + +/* + * In case if we need to open a fake pty (for example + * a master peer which were deleted at checkpoint moment, + * or open a slave peer when restoring control terminal) + * we need to create a new reg-file object taking @info + * as a template. 
Here is a trick though: the @info might + * represent master peer while we need to allocate a slave + * one and the reverse. For such case taking path from the + * @info as a template we generate that named 'inverted-path'. + * + * For example if the master peer was /dev/pts/ptmx with index 1, + * the inverted path is /dev/pts/1, for inverted slaves it's simpler + * we just add 'ptmx' postfix. + */ +static struct reg_file_info *pty_alloc_fake_reg(struct tty_info *info, int subtype) +{ + struct reg_file_info *new, *orig; + struct file_desc *fake_desc; + + pr_debug("Allocating fake descriptor for %#x (reg_d %p)\n", + info->tfe->id, info->reg_d); + + BUG_ON(!info->reg_d); + BUG_ON(!is_pty(info->driver)); + + fake_desc = pty_alloc_reg(info, false); + if (!fake_desc) + return NULL; + + orig = container_of(info->reg_d, struct reg_file_info, d); + new = container_of(fake_desc, struct reg_file_info, d); + + if ((subtype == TTY_SUBTYPE_MASTER && tty_is_master(info)) || + (subtype == TTY_SUBTYPE_SLAVE && !tty_is_master(info))) { + new->path = xstrdup(orig->path); + new->rfe->name = &new->path[1]; + } else { + char *pos = strrchr(orig->rfe->name, '/'); + size_t len = strlen(orig->rfe->name) + 1; + size_t slash_at = pos - orig->rfe->name; + char *inverted_path = xmalloc(len + 32); + + BUG_ON(!pos || !inverted_path); + + memcpy(inverted_path, orig->rfe->name, slash_at + 1); + if (subtype == TTY_SUBTYPE_MASTER) { + inverted_path[slash_at + 1] = '\0'; + strcat(inverted_path, "ptmx"); + } else { + if (slash_at >= 3 && strncmp(&inverted_path[slash_at - 3], "pts", 3)) + snprintf(&inverted_path[slash_at + 1], 10, "pts/%u", + info->tie->pty->index); + else + snprintf(&inverted_path[slash_at + 1], 10, "%u", + info->tie->pty->index); + } + + new->rfe->name = inverted_path; + new->path = &inverted_path[1]; + } + + return new; +} + +#define pty_alloc_fake_master(info) pty_alloc_fake_reg(info, TTY_SUBTYPE_MASTER) +#define pty_alloc_fake_slave(info) pty_alloc_fake_reg(info, 
TTY_SUBTYPE_SLAVE) + +static void pty_free_fake_reg(struct reg_file_info **r) +{ + if (*r) { + xfree((*r)->rfe->name); + xfree((*r)); + *r = NULL; + } +} + +static int do_open_tty_reg(int ns_root_fd, struct reg_file_info *rfi, void *arg) +{ + int fd; + + fd = do_open_reg_noseek_flags(ns_root_fd, rfi, arg); + if (fd >= 0) { + /* + * Peers might have had different modes set + * between their creation and the moment + * we dumped them, so simply set the mode from + * the image. The regular file engine will check + * for this, so if we fail here it is + * going to be caught anyway. + */ + if (rfi->rfe->has_mode) + fchmod(fd, rfi->rfe->mode); + } + + return fd; +} + +static int open_tty_reg(void *arg, int flags) +{ + struct file_desc *reg_d = arg; + /* + * Never set as a control terminal automatically, all + * ctty magic happens only in tty_set_sid(). + */ + flags |= O_NOCTTY; + return open_path(reg_d, do_open_tty_reg, &flags); +} + +static char *path_from_reg(struct file_desc *d) +{ + struct reg_file_info *rfi = container_of(d, struct reg_file_info, d); + return rfi->path; +} + +static int __pty_open_ptmx_index(int index, int flags, + int (*cb)(void *arg, int flags), void *arg, char *path) +{ + int fds[32], i, ret = -1, cur_idx; + + memset(fds, 0xff, sizeof(fds)); + + mutex_lock(tty_mutex); + + for (i = 0; i < ARRAY_SIZE(fds); i++) { + fds[i] = cb(arg, flags); + if (fds[i] < 0) { + pr_err("Can't open %s\n", path); + break; + } + + if (ioctl(fds[i], TIOCGPTN, &cur_idx)) { + pr_perror("Can't obtain current index on %s", + path); + break; + } + + pr_debug("\t\tptmx opened with index %d\n", cur_idx); + + if (cur_idx == index) { + pr_info("ptmx opened with index %d\n", cur_idx); + ret = fds[i]; + fds[i] = -1; + break; + } + + /* + * Maybe indices are already borrowed by + * someone else, so no need to continue. 
+ */ + if (cur_idx < index && (index - cur_idx) < ARRAY_SIZE(fds)) + continue; + + pr_err("Unable to open %s with specified index %d\n", + path, index); + break; + } + + for (i = 0; i < ARRAY_SIZE(fds); i++) { + if (fds[i] >= 0) + close(fds[i]); + } + + mutex_unlock(tty_mutex); + + return ret; +} + +static int pty_open_ptmx_index(struct file_desc *d, struct tty_info *info, int flags) +{ + if (info->fdstore_id >= 0) + return fdstore_get(info->fdstore_id); + + return __pty_open_ptmx_index(info->tie->pty->index, flags, + open_tty_reg, d, path_from_reg(d)); +} + +static int unlock_pty(int fd) +{ + const int lock = 0; + + /* + * Usually when ptmx is opened it gets locked + * by the kernel and we need to unlock it to be + * able to connect a slave peer. + */ + if (ioctl(fd, TIOCSPTLCK, &lock)) { + pr_perror("Unable to unlock pty device via %d", fd); /* report errno like the other ioctl helpers */ + return -1; + } + + return 0; +} + +static int lock_pty(int fd) +{ + const int lock = 1; /* counterpart of the 0 passed by unlock_pty() */ + + if (ioctl(fd, TIOCSPTLCK, &lock)) { + pr_perror("Unable to lock pty device via %d", fd); + return -1; + } + + return 0; +} + +static int tty_set_sid(int fd) +{ + if (ioctl(fd, TIOCSCTTY, 1)) { + pr_perror("Can't set sid on terminal fd %d", fd); + return -1; + } + + return 0; +} + +static int tty_set_prgp(int fd, int group) +{ + if (ioctl(fd, TIOCSPGRP, &group)) { + pr_perror("Failed to set group %d on %d", group, fd); + return -1; + } + return 0; +} + +static int tty_restore_ctl_terminal(struct file_desc *d) +{ + struct tty_info *info = container_of(d, struct tty_info, d); + struct tty_driver *driver = info->driver; + struct reg_file_info *fake = NULL; + struct file_desc *slave_d; + int slave = -1, ret = -1, index = -1; + + if (driver->type == TTY_TYPE__EXT_TTY) { + slave = -1; + if (!inherited_fd(&info->d, &slave) && slave < 0) + return -1; + goto out; + } + if (driver->img_get_index) + index = driver->img_get_index(info); + else + index = driver->index; + + if (is_pty(info->driver) && tty_is_master(info)) { + fake = 
pty_alloc_fake_slave(info); + if (!fake) + goto err; + + slave_d = &fake->d; + } else + slave_d = info->reg_d; + + slave = open_tty_reg(slave_d, O_RDONLY); + if (slave < 0) { + pr_err("Can't open slave tty %s\n", path_from_reg(slave_d)); + goto err; + } + +out: + pr_info("Restore session %d by %d tty (index %d)\n", + info->tie->sid, (int)getpid(), index); + + ret = tty_set_sid(slave); + if (!ret) + ret = tty_set_prgp(slave, info->tie->pgrp); + + close(slave); +err: + pty_free_fake_reg(&fake); + return ret ? -1 : 0; +} + +static bool __tty_is_master(struct tty_driver *driver) +{ + if (driver->subtype == TTY_SUBTYPE_MASTER) + return true; + + switch (driver->type) { + case TTY_TYPE__CONSOLE: + case TTY_TYPE__CTTY: + return true; + case TTY_TYPE__SERIAL: + case TTY_TYPE__VT: + if (!opts.shell_job) + return true; + break; + case TTY_TYPE__EXT_TTY: + return true; + } + + return false; +} + +static bool tty_is_master(struct tty_info *info) +{ + return __tty_is_master(info->driver); +} + +static bool tty_is_hung(struct tty_info *info) +{ + return info->tie->termios == NULL; +} + +static bool tty_has_active_pair(struct tty_info *info) +{ + int d = tty_is_master(info) ? 
-1 : + 1; + + return test_bit(info->tfe->tty_info_id + d, + tty_active_pairs); +} + +static void tty_show_pty_info(char *prefix, struct tty_info *info) +{ + int index = -1; + struct tty_driver *driver = info->driver; + + if (driver->img_get_index) + index = driver->img_get_index(info); + else + index = driver->index; + + pr_info("%s driver %s id %#x index %d (master %d sid %d pgrp %d inherit %d)\n", + prefix, info->driver->name, info->tfe->id, index, + tty_is_master(info), info->tie->sid, info->tie->pgrp, info->inherit); +} + +struct tty_parms { + int tty_id; + unsigned has; +#define HAS_TERMIOS_L 0x1 +#define HAS_TERMIOS 0x2 +#define HAS_WINS 0x4 + struct termios tl; + struct termios t; + struct winsize w; +}; + +static int do_restore_tty_parms(void *arg, int fd, pid_t pid) +{ + struct tty_parms *p = arg; + + /* + * Only locked termios need CAP_SYS_ADMIN, but we + * restore them all here, since the regular termios + * restore is affected by the locked ones and thus we would + * have to do a synchronous usernsd call, which is not + * nice. + * + * Window size is restored here as it might depend + * on termios too. Just to be on the safe side. + */ + + if ((p->has & HAS_TERMIOS_L) && + ioctl(fd, TIOCSLCKTRMIOS, &p->tl) < 0) + goto err; + + if ((p->has & HAS_TERMIOS) && + ioctl(fd, TCSETS, &p->t) < 0) + goto err; + + if ((p->has & HAS_WINS) && + ioctl(fd, TIOCSWINSZ, &p->w) < 0) + goto err; + + return 0; + +err: + pr_perror("Can't set tty params on %#x", p->tty_id); + return -1; +} + +static int restore_tty_params(int fd, struct tty_info *info) +{ + struct tty_parms p; + + /* + * It's important to zeroify termios + * because it contain @c_cc array which + * is bigger than TERMIOS_NCC. Same applies + * to winsize usage, we can't guarantee the + * structure taken from the system headers will + * never be extended. 
+ */ + + p.has = 0; + p.tty_id = info->tfe->id; + + if (info->tie->termios_locked) { + memzero(&p.tl, sizeof(p.tl)); + p.has |= HAS_TERMIOS_L; + termios_copy(&p.tl, info->tie->termios_locked); + } + + if (info->tie->termios) { + memzero(&p.t, sizeof(p.t)); + p.has |= HAS_TERMIOS; + termios_copy(&p.t, info->tie->termios); + } + + if (info->tie->winsize) { + memzero(&p.w, sizeof(p.w)); + p.has |= HAS_WINS; + winsize_copy(&p.w, info->tie->winsize); + } + + if (info->tie->has_uid && info->tie->has_gid) { + if (fchown(fd, info->tie->uid, info->tie->gid)) { + pr_perror("Can't setup uid %d gid %d on %#x", + (int)info->tie->uid, + (int)info->tie->gid, + info->tfe->id); + return -1; + } + } + + return userns_call(do_restore_tty_parms, 0, &p, sizeof(p), fd); +} + +/* + * When we restore queued data we don't exit if an error happens: + * terminals never were a transport with guaranteed delivery, + * it's up to the application which uses them to guarantee the data + * integrity. + */ +static void pty_restore_queued_data(struct tty_info *info, int fd) +{ + if (info && info->tty_data) { + ProtobufCBinaryData bd = info->tty_data->tde->data; + int retval; + + pr_debug("restore queued data on %#x (%zu bytes)\n", + info->tfe->id, (size_t)bd.len); + + retval = write(fd, bd.data, bd.len); + if (retval != bd.len) + pr_err("Restored %d bytes while %zu expected\n", + retval, (size_t)bd.len); + } +} + +static int pty_open_slaves(struct tty_info *info) +{ + int fd = -1, ret = -1; + struct tty_info *slave; + + list_for_each_entry(slave, &info->sibling, sibling) { + BUG_ON(tty_is_master(slave)); + + fd = open_tty_reg(slave->reg_d, slave->tfe->flags); + if (fd < 0) { + pr_err("Can't open slave tty %s\n", path_from_reg(slave->reg_d)); + goto err; + } + + if (restore_tty_params(fd, slave)) + goto err; + + pr_debug("send slave %#x fd %d connected on %s\n", + slave->tfe->id, fd, path_from_reg(slave->reg_d)); + + if (send_desc_to_peer(fd, &slave->d)) { + pr_err("Can't send file descriptor\n"); + goto 
err; + } + + pty_restore_queued_data(slave->link, fd); + close(fd); + fd = -1; + } + ret = 0; + +err: + close_safe(&fd); + return ret; +} + +static int receive_tty(struct tty_info *info, int *new_fd) +{ + int fd, ret; + + ret = recv_desc_from_peer(&info->d, &fd); + if (ret != 0) { + if (ret != 1) + pr_err("Can't get fd %d\n", fd); + return ret; + } + + if (rst_file_params(fd, info->tfe->fown, info->tfe->flags) < 0) { + close_safe(&fd); + return -1; + } + + *new_fd = fd; + return 0; +} + +static int pty_open_unpaired_slave(struct file_desc *d, struct tty_info *slave) +{ + struct reg_file_info *fake = NULL; + int master = -1, ret = -1, fd = -1; + + /* + * We may have 2 cases here: the slave either need to + * be inherited, either it requires a fake master. + */ + + if (likely(slave->inherit)) { + if (opts.orphan_pts_master) { + fake = pty_alloc_fake_master(slave); + if (!fake) + goto err; + master = pty_open_ptmx_index(&fake->d, slave, O_RDWR); + if (master < 0) { + pr_err("Can't open master pty %x (index %d)\n", + slave->tfe->id, slave->tie->pty->index); + goto err; + } + + if (unlock_pty(master)) + goto err; + + if (opts.orphan_pts_master && + rpc_send_fd(ACT_ORPHAN_PTS_MASTER, master) == 0) { + + fd = open_tty_reg(slave->reg_d, slave->tfe->flags); + if (fd < 0) { + pr_err("Can't open slave pty %s\n", path_from_reg(slave->reg_d)); + goto err; + } + + goto out; + } + } + + if (!stdin_isatty) { + pr_err("Don't have tty to inherit session from, aborting\n"); + return -1; + } + + fd = fdstore_get(self_stdin_fdid); + if (fd < 0) { + pr_err("Can't get self_stdin_fdid\n"); + return -1; + } + + pr_info("Migrated slave peer %#x -> to fd %d\n", + slave->tfe->id, fd); + } else { + fake = pty_alloc_fake_master(slave); + if (!fake) + goto err; + master = pty_open_ptmx_index(&fake->d, slave, O_RDONLY); + if (master < 0) { + pr_err("Can't open master pty %#x (index %d)\n", + slave->tfe->id, slave->tie->pty->index); + goto err; + } + + if (unlock_pty(master)) + goto err; + + fd = 
open_tty_reg(slave->reg_d, slave->tfe->flags); + if (fd < 0) { + pr_err("Can't open slave pty %s\n", path_from_reg(slave->reg_d)); + goto err; + } + + } + +out: + if (restore_tty_params(fd, slave)) + goto err; + + /* + * If tty is migrated we need to set its group + * to the parent group, because signals on key + * presses are delivered to a group of terminal. + * + * Note, at this point the group/session should + * be already restored properly thus we can simply + * use syscalls instead of lookup via process tree. + */ + if (slave->inherit && opts.shell_job) { + /* + * The restoration procedure only works if we're + * migrating not a session leader, otherwise it's + * not allowed to restore a group and one better to + * checkpoint complete process tree together with + * the process which keeps the master peer. + */ + if (root_item->sid != vpid(root_item)) { + if (root_item->pgid == vpid(root_item)) { + if (tty_set_prgp(fd, root_item->pgid)) + goto err; + } else { + pr_debug("Restore inherited group %d\n", + getpgid(getppid())); + if (tty_set_prgp(fd, getpgid(getppid()))) + goto err; + } + } + } + + if (pty_open_slaves(slave)) + goto err; + + ret = fd; + fd = -1; +err: + close_safe(&master); + close_safe(&fd); + pty_free_fake_reg(&fake); + return ret; +} + +static int pty_open_ptmx(struct tty_info *info) +{ + int master = -1; + + master = pty_open_ptmx_index(info->reg_d, info, info->tfe->flags); + if (master < 0) { + pr_err("Can't open master pty %#x (index %d)\n", + info->tfe->id, info->tie->pty->index); + return -1; + } + + if (unlock_pty(master)) + goto err; + + if (restore_tty_params(master, info)) + goto err; + + if (info->tie->packet_mode) { + int packet_mode = 1; + + if (ioctl(master, TIOCPKT, &packet_mode) < 0) { + pr_perror("Can't set packed mode on %#x", + info->tfe->id); + goto err; + } + } + + if (pty_open_slaves(info)) + goto err; + + pty_restore_queued_data(info->link, master); + + if (info->tie->locked) + lock_pty(master); + + return master; +err: + 
close_safe(&master); + return -1; +} + +static int open_simple_tty(struct tty_info *info) +{ + int fd = -1; + + fd = open_tty_reg(info->reg_d, info->tfe->flags); + if (fd < 0) { + pr_err("Can't open tty %s %#x\n", + info->driver->name, info->tfe->id); + return -1; + } + + if (restore_tty_params(fd, info)) + goto err; + + return fd; +err: + close_safe(&fd); + return -1; +} + +static int open_ext_tty(struct tty_info *info) +{ + int fd = -1; + + if (!inherited_fd(&info->d, &fd) && fd < 0) + return -1; + + if (restore_tty_params(fd, info)) { + close(fd); + return -1; + } + + return fd; +} + +static bool tty_deps_restored(struct tty_info *info) +{ + struct list_head *list = &rsti(current)->fds; + struct fdinfo_list_entry *fle; + struct tty_info *tmp; + + if (info->driver->type == TTY_TYPE__CTTY) { + list_for_each_entry(fle, list, ps_list) { + if (fle->desc->ops->type != FD_TYPES__TTY || fle->desc == &info->d) + continue; + + /* ctty needs all others are restored */ + if (fle->stage != FLE_RESTORED) + return false; + } + } else if (!tty_is_master(info)) { + list_for_each_entry(fle, list, ps_list) { + if (fle->desc->ops->type != FD_TYPES__TTY || fle->desc == &info->d) + continue; + tmp = container_of(fle->desc, struct tty_info, d); + + /* slaves wait for masters except ctty */ + if (tmp->driver->type == TTY_TYPE__CTTY || + !tty_is_master(tmp)) + continue; + if (fle->stage != FLE_RESTORED) + return false; + } + } + return true; +} + +static int tty_open(struct file_desc *d, int *new_fd) +{ + struct tty_info *info = container_of(d, struct tty_info, d); + int ret; + + tty_show_pty_info("open", info); + + if (!info->create) + return receive_tty(info, new_fd); + + if (!tty_deps_restored(info)) + return 1; + + if (is_pty(info->driver) && !tty_is_master(info)) + ret = pty_open_unpaired_slave(d, info); + else + ret = info->driver->open(info); + if (ret < 0) + return -1; + *new_fd = ret; + return 0; +} + +static char *tty_d_name(struct file_desc *d, char *buf, size_t s) +{ + 
struct tty_info *info = container_of(d, struct tty_info, d); + + snprintf(buf, s, "tty[%x:%x]", info->tie->rdev, info->tie->dev); + + return buf; +} + +static struct file_desc_ops tty_desc_ops = { + .type = FD_TYPES__TTY, + .open = tty_open, + .name = tty_d_name, +}; + +static struct pstree_item *find_first_sid(int sid) +{ + struct pstree_item *item; + + for_each_pstree_item(item) { + if (item->sid == sid) + return item; + } + + return NULL; +} + +static int add_fake_fle(struct pstree_item *item, u32 desc_id) +{ + FdinfoEntry *e; + + e = xmalloc(sizeof(*e)); + if (!e) + return -1; + + fdinfo_entry__init(e); + + e->id = desc_id; + e->fd = find_unused_fd(item, -1); + e->type = FD_TYPES__TTY; + + if (collect_fd(vpid(item), e, rsti(item), true)) { + xfree(e); + return -1; + } + + return e->fd; +} + +struct ctl_tty { + struct file_desc desc; + struct fdinfo_list_entry *real_tty; +}; + +static int ctl_tty_open(struct file_desc *d, int *new_fd) +{ + struct fdinfo_list_entry *fle; + int ret; + + fle = container_of(d, struct ctl_tty, desc)->real_tty; + if (fle->stage != FLE_RESTORED) + return 1; + + ret = tty_restore_ctl_terminal(fle->desc); + if (!ret) { + /* + * Generic engine expects we return a new_fd. + * Return this one just to return something. + */ + *new_fd = dup(fle->fe->fd); + if (*new_fd < 0) { + pr_perror("dup() failed"); + ret = -1; + } else + ret = 0; + } + return ret; +} + +/* + * This is a fake type to handle ctl tty. The problem + * is sometimes we need to do tty_set_sid() from slave + * fle, while generic file engine allows to call open + * method for file masters only. So, this type allows + * to add fake masters, which will call open for slave + * fles of type FD_TYPES__TTY indirectly. 
+ */ +static struct file_desc_ops ctl_tty_desc_ops = { + .type = FD_TYPES__CTL_TTY, + .open = ctl_tty_open, +}; + +static int prepare_ctl_tty(struct pstree_item *item, u32 ctl_tty_id) +{ + struct fdinfo_list_entry *fle; + struct ctl_tty *ctl_tty; + FdinfoEntry *e; + int fd; + + if (!ctl_tty_id) + return 0; + + pr_info("Requesting for ctl tty %#x into service fd\n", ctl_tty_id); + + /* Add a fake fle to make generic engine deliver real tty desc to task */ + fd = add_fake_fle(item, ctl_tty_id); + if (fd < 0) + return -1; + + fle = find_used_fd(item, fd); + BUG_ON(!fle); + /* + * Add a fake ctl_tty depending on the above fake fle, which will + * actually restore the session. + */ + ctl_tty = xmalloc(sizeof(*ctl_tty)); + e = xmalloc(sizeof(*e)); + + if (!ctl_tty || !e) + goto err; + + ctl_tty->real_tty = fle; + + /* + * Use the same ctl_tty_id id for ctl_tty as it's unique among + * FD_TYPES__CTL_TTY (as it's unique for FD_TYPES__TTY type). + */ + file_desc_add(&ctl_tty->desc, ctl_tty_id, &ctl_tty_desc_ops); + + fdinfo_entry__init(e); + + e->id = ctl_tty_id; + e->fd = find_unused_fd(item, -1); + e->type = FD_TYPES__CTL_TTY; + + if (collect_fd(vpid(item), e, rsti(item), true)) + goto err; + + return 0; +err: + xfree(ctl_tty); + xfree(e); + return -1; +} + +static int tty_find_restoring_task(struct tty_info *info) +{ + struct pstree_item *item; + + /* + * The overall scenario is the following (note + * we might have corrupted image so don't believe + * anything). 
+ * + * SID is present on a peer + * ------------------------ + * + * - if it's master peer and we have as well a slave + * peer then prefer restore controlling terminal + * via slave peer + * + * - if it's master peer without slave, there must be + * a SID leader who will be restoring the peer + * + * - if it's a slave peer and no session leader found + * than we need an option to inherit terminal + * + * No SID present on a peer + * ------------------------ + * + * - if it's a master peer than we are in good shape + * and continue in a normal way, we're the peer keepers + * + * - if it's a slave peer and no appropriate master peer + * found we need an option to inherit terminal + * + * In any case if it's hungup peer, then we jump out + * early since it will require fake master peer and + * rather non-usable anyway. + */ + + if (tty_is_hung(info)) { + pr_debug("Hungup terminal found id %#x\n", info->tfe->id); + return 0; + } + + /* + * Current tty should be skipped here: the + * underlied _real_ pty (or anything else + * driver in future) should restore the + * session. + */ + if (info->driver->type == TTY_TYPE__CTTY) + return 0; + + if (info->tie->sid) { + if (!tty_is_master(info)) { + if (tty_has_active_pair(info)) + return 0; + else if (!opts.orphan_pts_master) + goto shell_job; + else + info->inherit = true; + } + + /* + * Restoring via leader only. All files + * opened over same real tty get propagated + * automatically by kernel itself. + */ + if (info->ctl_tty != info) + return 0; + + /* + * Find out the task which is session leader + * and it can restore the controlling terminal + * for us. 
+ */ + item = find_first_sid(info->tie->sid); + if (item && vpid(item) == item->sid) { + pr_info("Set a control terminal %#x to %d\n", + info->tfe->id, info->tie->sid); + return prepare_ctl_tty(item, info->tfe->id); + } + + goto notask; + } else { + if (tty_is_master(info)) + return 0; + if (tty_has_active_pair(info)) + return 0; + } + +shell_job: + if (opts.shell_job) { + pr_info("Inherit terminal for id %#x\n", info->tfe->id); + info->inherit = true; + return 0; + } + +notask: + pr_err("No task found with sid %d\n", info->tie->sid); + return -1; +} + +static int tty_setup_orphan_slavery(void) +{ + struct tty_info *info, *peer, *m; + + list_for_each_entry(info, &all_ttys, list) { + struct fdinfo_list_entry *a, *b; + bool has_leader = false; + + if (tty_is_master(info)) + continue; + + a = file_master(&info->d); + m = info; + + list_for_each_entry(peer, &info->sibling, sibling) { + if (tty_is_master(peer)) { + has_leader = true; + break; + } + + /* + * Same check as in pipes and files -- need to + * order slave ends so that they do not dead lock + * waiting for each other. + */ + b = file_master(&peer->d); + if (fdinfo_rst_prio(b, a)) { + a = b; + m = peer; + } + } + + if (!has_leader) { + m->create = true; + pr_debug("Found orphan slave fake leader (%#x)\n", + m->tfe->id); + } + } + + return 0; +} + +static int tty_setup_slavery(void) +{ + struct tty_info *info, *peer, *m; + + /* + * Setup links for PTY terminal pairs by + * their indices, queued data already bound + * to them by data ids. 
+ */ + list_for_each_entry(info, &all_ttys, list) { + if (!is_pty(info->driver) || info->link) + continue; + peer = info; + list_for_each_entry_continue(peer, &all_ttys, list) { + if (!is_pty(peer->driver) || peer->link) + continue; + if (peer->tie->pty->index == info->tie->pty->index) { + info->link = peer; + peer->link = info; + + pr_debug("Link PTYs (%#x)\n", info->tfe->id); + break; + } + } + } + + /* + * The image may carry several terminals opened + * belonging to the same session, so choose the + * leader which gonna be setting up the controlling + * terminal. + */ + list_for_each_entry(info, &all_ttys, list) { + if (!info->tie->sid || info->ctl_tty || + info->driver->type == TTY_TYPE__CTTY) + continue; + + if (!tty_is_master(info) && info->link) + continue; + + info->ctl_tty = info; + pr_debug("ctl tty leader %#x\n", info->tfe->id); + peer = info; + list_for_each_entry_safe_continue(peer, m, &all_ttys, list) { + if (!peer->tie->sid || peer->ctl_tty || + peer->driver->type == TTY_TYPE__CTTY) + continue; + if (peer->tie->sid == info->tie->sid) { + pr_debug(" `- slave %#x\n", peer->tfe->id); + peer->ctl_tty = info; + } + } + } + + list_for_each_entry(info, &all_ttys, list) { + if (tty_find_restoring_task(info)) + return -1; + if (!is_pty(info->driver)) + continue; + + peer = info; + list_for_each_entry_safe_continue(peer, m, &all_ttys, list) { + if (!is_pty(peer->driver)) + continue; + if (peer->tie->pty->index != info->tie->pty->index) + continue; + + if (tty_find_restoring_task(peer)) + return -1; + + list_add(&peer->sibling, &info->sibling); + list_del(&peer->list); + } + } + + /* + * Print out information about peers. 
+ */ + list_for_each_entry(info, &all_ttys, list) { + tty_show_pty_info("head", info); + list_for_each_entry(peer, &info->sibling, sibling) + tty_show_pty_info(" `- sibling", peer); + } + + return tty_setup_orphan_slavery(); +} + +static int verify_termios(u32 id, TermiosEntry *e) +{ + if (e && e->n_c_cc < TERMIOS_NCC) { + pr_err("pty ID %#x n_c_cc (%d) has wrong value\n", + id, (int)e->n_c_cc); + return -1; + } + return 0; +} + +#define term_opts_missing_cmp(tie, op) \ + (!(tie)->termios op \ + !(tie)->termios_locked op \ + !(tie)->winsize) + +#define term_opts_missing_any(p) \ + term_opts_missing_cmp(p, ||) + +#define term_opts_missing_all(p) \ + term_opts_missing_cmp(p, &&) + +static int verify_info(TtyInfoEntry *tie, struct tty_driver *driver) +{ + /* + * Master peer must have all parameters present, + * while slave peer must have either all parameters present + * or don't have them at all. + */ + if (term_opts_missing_any(tie)) { + if (__tty_is_master(driver)) { + pr_err("Corrupted master peer %#x\n", tie->id); + return -1; + } else if (!term_opts_missing_all(tie)) { + pr_err("Corrupted slave peer %#x\n", tie->id); + return -1; + } + } + + if (verify_termios(tie->id, tie->termios_locked) || + verify_termios(tie->id, tie->termios)) + return -1; + + if (tie->termios && tie->id > (MAX_TTYS << 1)) + return -1; + + return 0; +} + +static int tty_info_setup(struct tty_info *info); + +static int collect_one_tty_info_entry(void *obj, ProtobufCMessage *msg, struct cr_img *i) +{ + struct tty_info *info, *n; + TtyInfoEntry *tie; + struct tty_driver *driver; + + tie = pb_msg(msg, TtyInfoEntry); + + switch (tie->type) { + case TTY_TYPE__PTY: + if (!tie->pty) { + pr_err("No PTY data found (id %#x), corrupted image?\n", tie->id); + return -1; + } + break; + case TTY_TYPE__CTTY: + case TTY_TYPE__CONSOLE: + case TTY_TYPE__SERIAL: + case TTY_TYPE__VT: + case TTY_TYPE__EXT_TTY: + if (tie->pty) { + pr_err("PTY data found (id %#x), corrupted image?\n", tie->id); + return -1; + } + 
break; + default: + pr_err("Unexpected TTY type %d (id %#x)\n", tie->type, tie->id); + return -1; + } + + driver = get_tty_driver(tie->rdev, tie->dev); + if (driver == NULL) { + pr_err("Unable to find a tty driver (rdev %#x dev %#x)\n", + tie->rdev, tie->dev); + return -1; + } + + if (verify_info(tie, driver)) + return -1; + + list_for_each_entry_safe(info, n, &collected_ttys, list) { + if (info->tfe->tty_info_id != tie->id) + continue; + + info->tie = tie; + info->driver = driver; + list_move_tail(&info->list, &all_ttys); + + if (tty_info_setup(info)) + return -1; + } + + /* + * The tty peers which have no @termios are hung up, + * so don't mark them as active, we create them with + * faked master and they are rather a rudiment which + * can't be used. Most likely they appear if a user has + * dumped program when it was closing a peer. + */ + if (is_pty(driver) && tie->termios) + tty_test_and_set(tie->id, tty_active_pairs); + + return 0; +} + +struct collect_image_info tty_info_cinfo = { + .fd_type = CR_FD_TTY_INFO, + .pb_type = PB_TTY_INFO, + .collect = collect_one_tty_info_entry, + .flags = COLLECT_NOFREE, +}; + +static int prep_tty_restore_cb(struct pprep_head *ph) +{ + if (!list_empty(&collected_ttys)) { + pr_err("Not all TTYs got its infos\n"); + return -1; + } + if (tty_verify_active_pairs()) + return -1; + if (tty_setup_slavery()) + return -1; + return 0; +} + +static MAKE_PPREP_HEAD(prep_tty_restore); + +static int collect_one_tty(void *obj, ProtobufCMessage *msg, struct cr_img *i) +{ + struct tty_info *info = obj; + + info->tfe = pb_msg(msg, TtyFileEntry); + list_add_tail(&info->list, &collected_ttys); + + return 0; +} + +static int tty_info_setup(struct tty_info *info) +{ + INIT_LIST_HEAD(&info->sibling); + info->create = tty_is_master(info); + info->inherit = false; + info->ctl_tty = NULL; + info->tty_data = NULL; + info->link = NULL; + + /* + * The image might have no reg file record in old CRIU, so + * lets don't fail for a while. 
After a couple of releases + * simply require the record to present. + * + * Note for external ttys it's fine to not have any + * reg file rectord because they are inherited from + * command line on restore. + */ + info->reg_d = try_collect_special_file( info->tfe->has_regf_id ? + info->tfe->regf_id : info->tfe->id, 1); + if (!info->reg_d) { + if (info->driver->type != TTY_TYPE__EXT_TTY) { + if (!deprecated_ok("TTY w/o regfile")) + return -1; + + if (is_pty(info->driver)) { + info->reg_d = pty_alloc_reg(info, true); + if (!info->reg_d) { + pr_err("Can't generate new reg descriptor for id %#x\n", + info->tfe->id); + return -1; + } + } else { + pr_err("No reg_d descriptor for id %#x\n", info->tfe->id); + return -1; + } + } + } + + pr_info("Collected tty ID %#x (%s)\n", info->tfe->id, info->driver->name); + + add_post_prepare_cb_once(&prep_tty_restore); + + /* + * Call it explicitly. Post-callbacks will be called after + * namespaces preparation, while the latter needs this mutex. + */ + if (init_tty_mutex()) + return -1; + + info->fdstore_id = -1; + return file_desc_add(&info->d, info->tfe->id, &tty_desc_ops); +} + +struct collect_image_info tty_cinfo = { + .fd_type = CR_FD_TTY_FILES, + .pb_type = PB_TTY_FILE, + .priv_size = sizeof(struct tty_info), + .collect = collect_one_tty, +}; + +static int collect_one_tty_data(void *obj, ProtobufCMessage *msg, struct cr_img *i) +{ + struct tty_data_entry *tdo = obj; + struct tty_info *info; + + tdo->tde = pb_msg(msg, TtyDataEntry); + pr_debug("Collected data for id %#x (size %zu bytes)\n", + tdo->tde->tty_id, (size_t)tdo->tde->data.len); + + list_for_each_entry(info, &all_ttys, list) { + if (tdo->tde->tty_id == info->tie->id) { + info->tty_data = tdo; + return 0; + } + } + + pr_err("No tty found to queued data on id %#x\n", tdo->tde->tty_id); + return -ENOENT; +} + +struct collect_image_info tty_cdata = { + .fd_type = CR_FD_TTY_DATA, + .pb_type = PB_TTY_DATA, + .priv_size = sizeof(struct tty_data_entry), + .collect = 
collect_one_tty_data, +}; + +/* Make sure the ttys we're dumping do belong our process tree */ +int dump_verify_tty_sids(void) +{ + struct tty_dump_info *dinfo, *n; + int ret = 0; + + /* + * There might be a cases where we get sid/pgid on + * slave peer. For example the application is running + * with redirection and we're migrating shell job. + * + * # ./app < /dev/zero > /dev/zero &2>1 + * + * Which produce a tree like + * PID PPID PGID SID + * root 23786 23784 23786 23786 pts/0 \_ -bash + * root 24246 23786 24246 23786 pts/0 \_ ./app + * + * And the application goes background, then we dump + * it from the same shell. + * + * In this case we simply zap sid/pgid and inherit + * the peer from the current terminal on restore. + */ + list_for_each_entry_safe(dinfo, n, &all_ttys, list) { + if (!ret && dinfo->sid) { + struct pstree_item *item = find_first_sid(dinfo->sid); + + if (!item || vpid(item) != dinfo->sid) { + if (!opts.shell_job) { + pr_err("Found dangling tty with sid %d pgid %d (%s) on peer fd %d.\n", + dinfo->sid, dinfo->pgrp, + dinfo->driver->name, dinfo->fd); + /* + * First thing people do with criu is dump smth + * run from shell. This is typical pitfall, warn + * user about it explicitly. + */ + pr_msg("Task attached to shell terminal. " + "Consider using --" OPT_SHELL_JOB " option. 
" + "More details on http://criu.org/Simple_loop\n"); + ret = -1; + } + } + } + } + + return ret; +} + +static int dump_tty_info(int lfd, u32 id, const struct fd_parms *p, struct tty_driver *driver, int index) +{ + TtyInfoEntry info = TTY_INFO_ENTRY__INIT; + TermiosEntry termios = TERMIOS_ENTRY__INIT; + TermiosEntry termios_locked = TERMIOS_ENTRY__INIT; + WinsizeEntry winsize = WINSIZE_ENTRY__INIT; + TtyPtyEntry pty = TTY_PTY_ENTRY__INIT; + struct parasite_tty_args *pti; + struct tty_dump_info *dinfo; + + struct termios t; + struct winsize w; + + int ret = -1; + + if (!p->fd_ctl) { + pr_err("No CTL for TTY dump, likely SCM case\n"); + return -1; + } + + /* + * Make sure the structures the system provides us + * correlates well with protobuf templates. + */ + BUILD_BUG_ON(ARRAY_SIZE(t.c_cc) < TERMIOS_NCC); + BUILD_BUG_ON(sizeof(termios.c_cc) != sizeof(void *)); + BUILD_BUG_ON((sizeof(termios.c_cc) * TERMIOS_NCC) < sizeof(t.c_cc)); + + pti = parasite_dump_tty(p->fd_ctl, p->fd, driver->type); + if (!pti) + return -1; + + dinfo = xzalloc(sizeof(*dinfo)); + if (!dinfo) + return -1; + + dinfo->id = id; + dinfo->sid = pti->sid; + dinfo->pgrp = pti->pgrp; + dinfo->pid_real = p->pid; + dinfo->fd = p->fd; + dinfo->mnt_id = p->mnt_id; + dinfo->driver = driver; + dinfo->flags = p->flags; + + if (is_pty(driver)) { + dinfo->lfd = dup(lfd); + if (dinfo->lfd < 0) { + pr_perror("Can't dup local fd on %#x", id); + xfree(dinfo); + return -1; + } + dinfo->index = index; + } else { + dinfo->index = -1; + dinfo->lfd = -1; + } + + list_add_tail(&dinfo->list, &all_ttys); + + info.id = id; + info.sid = pti->sid; + info.pgrp = pti->pgrp; + info.rdev = p->stat.st_rdev; + info.dev = p->stat.st_dev; + info.has_dev = true; + info.locked = pti->st_lock; + info.exclusive = pti->st_excl; + info.packet_mode = pti->st_pckt; + + info.has_uid = true; + info.uid = userns_uid(p->stat.st_uid); + info.has_gid = true; + info.gid = userns_gid(p->stat.st_gid); + + info.type = driver->type; + if (info.type == 
TTY_TYPE__PTY) { + info.pty = &pty; + pty.index = index; + } + + /* + * Nothing we can do on hanging up terminal, + * just write out minimum information we can + * gather. + */ + if (pti->hangup) + return pb_write_one(img_from_set(glob_imgset, CR_FD_TTY_INFO), &info, PB_TTY_INFO); + + /* + * Now trace the paired/unpaired ttys. For example + * the task might have slave peer assigned but no + * master peer. Such "detached" master peers are + * not yet supported by our tool and better to + * inform a user about such situation. + */ + if (is_pty(driver)) + tty_test_and_set(id, tty_active_pairs); + + info.termios = &termios; + info.termios_locked = &termios_locked; + info.winsize = &winsize; + + termios.n_c_cc = TERMIOS_NCC; + termios.c_cc = xmalloc(pb_repeated_size(&termios, c_cc)); + + termios_locked.n_c_cc = TERMIOS_NCC; + termios_locked.c_cc = xmalloc(pb_repeated_size(&termios_locked, c_cc)); + + if (!termios.c_cc || !termios_locked.c_cc) + goto out; + + memzero(&t, sizeof(t)); + if (ioctl(lfd, TCGETS, &t) < 0) { + pr_perror("Can't get tty params on %#x", id); + goto out; + } + termios_copy(&termios, &t); + + memzero(&t, sizeof(t)); + if (ioctl(lfd, TIOCGLCKTRMIOS, &t) < 0) { + pr_perror("Can't get tty locked params on %#x", id); + goto out; + } + termios_copy(&termios_locked, &t); + + memzero(&w, sizeof(w)); + if (ioctl(lfd, TIOCGWINSZ, &w) < 0) { + pr_perror("Can't get tty window params on %#x", id); + goto out; + } + winsize_copy(&winsize, &w); + + ret = pb_write_one(img_from_set(glob_imgset, CR_FD_TTY_INFO), &info, PB_TTY_INFO); +out: + xfree(termios.c_cc); + xfree(termios_locked.c_cc); + return ret; +} + +static int dump_one_tty(int lfd, u32 id, const struct fd_parms *p) +{ + TtyFileEntry e = TTY_FILE_ENTRY__INIT; + int ret = 0, index = -1; + struct tty_driver *driver; + + pr_info("Dumping tty %d with id %#x\n", lfd, id); + + driver = get_tty_driver(p->stat.st_rdev, p->stat.st_dev); + if (driver->fd_get_index) + index = driver->fd_get_index(lfd, p); + else + 
index = driver->index; + + if (index == INDEX_ERR) { + pr_info("Can't obtain index on tty %d id %#x\n", lfd, id); + return -1; + } + + e.id = id; + e.tty_info_id = tty_gen_id(driver, index); + e.flags = p->flags; + e.fown = (FownEntry *)&p->fown; + + if (driver->type != TTY_TYPE__EXT_TTY) { + u32 rf_id; + + fd_id_generate_special(NULL, &rf_id); + if (dump_one_reg_file(lfd, rf_id, p)) + return -1; + + e.has_regf_id = true; + e.regf_id = rf_id; + } + + + /* + * FIXME + * + * Figure out how to fetch data buffered in terminal. + * For a while simply flush before dumping. Note + * we don't check for errors here since it makes + * no sense anyway, the buffered data is not handled + * properly yet. + * + * Note as well that if we have only one peer here + * the external end might be sending the data to us + * again and again while kernel buffer is not full, + * this might lead to endless SIGTTOU signal delivery + * to the dumpee, ruining checkpoint procedure. + * + * So simply do not flush the line while we dump + * parameters tty never was being a guaranteed delivery + * transport anyway. 
+ */ + + if (!tty_test_and_set(e.tty_info_id, tty_bitmap)) + ret = dump_tty_info(lfd, e.tty_info_id, p, driver, index); + + if (!ret) { + FileEntry fe = FILE_ENTRY__INIT; + + fe.type = FD_TYPES__TTY; + fe.id = e.id; + fe.tty = &e; + ret = pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); + } + + return ret; +} + +const struct fdtype_ops tty_dump_ops = { + .type = FD_TYPES__TTY, + .dump = dump_one_tty, +}; + +static int tty_reblock(int id, int lfd, int flags) +{ + static const int fmask = O_RDWR | O_NONBLOCK; + int ret; + + if ((flags & fmask) != fmask) { + if (fcntl(lfd, F_SETFL, flags)) { + ret = -errno; + pr_perror("Can't revert mode back to %o on (%#x)", fmask, id); + return ret; + } + } + + return 0; +} + +static int tty_unblock(int id, int lfd, int flags) +{ + static const int fmask = O_RDWR | O_NONBLOCK; + int ret; + + if ((flags & fmask) != fmask) { + if (fcntl(lfd, F_SETFL, fmask)) { + ret = -errno; + pr_perror("Can't change mode to %o on (%#x)", fmask, id); + return ret; + } + } + + return 0; +} + +static int tty_do_dump_queued_data(struct tty_dump_info *dinfo) +{ + TtyDataEntry e = TTY_DATA_ENTRY__INIT; + size_t off = 0, size = 16384; + char *buf; + int ret; + + buf = xmalloc(size); + if (!buf) + return -ENOMEM; + + ret = tty_unblock(dinfo->id, dinfo->lfd, dinfo->flags); + if (ret) { + xfree(buf); + return ret; + } + + while (1) { + ret = read(dinfo->lfd, &buf[off], size - off); + if (ret == 0) { + pr_debug("No more data on tty (%s %#x)\n", + dinfo->driver->name, dinfo->id); + break; + } else if (ret < 0) { + if (errno == EAGAIN) { + pr_debug("Not waiting data tty (%s %#x)\n", + dinfo->driver->name, dinfo->id); + break; + } else { + ret = -errno; + pr_perror("Can't read data from tty (%s %#x)", + dinfo->driver->name, dinfo->id); + xfree(buf); + return ret; + } + } + + off += ret; + pr_debug("Read %d bytes (%d) from tty (%s %#x)\n", + ret, (int)off, dinfo->driver->name, dinfo->id); + + if (off >= size) { + pr_err("The tty (%s %#x) queued 
data overflow %zu bytes limit\n", + dinfo->driver->name, dinfo->id, size); + off = size; + break; + } + } + + if (off) { + dinfo->tty_data = buf; + dinfo->tty_data_size = off; + + e.tty_id = dinfo->id; + e.data.data = (void *)buf; + e.data.len = off; + + ret = pb_write_one(img_from_set(glob_imgset, CR_FD_TTY_DATA), + &e, PB_TTY_DATA); + } else { + xfree(buf); + ret = 0; + } + + return ret; +} + +/* + * If error happens here, so be it, ttys are not delivering + * data with guaranteed results. + */ +static void __tty_do_writeback_queued_data(struct tty_dump_info *dinfo) +{ + if (dinfo->tty_data) { + if (write(dinfo->link->lfd, dinfo->tty_data, + dinfo->tty_data_size) != dinfo->tty_data_size) + pr_perror("Can't writeback to tty (%#x)", dinfo->id); + } + tty_reblock(dinfo->link->id, dinfo->link->lfd, dinfo->link->flags); +} + +static void tty_do_writeback_queued_data(struct tty_dump_info *dinfo) +{ + __tty_do_writeback_queued_data(dinfo); + __tty_do_writeback_queued_data(dinfo->link); +} + +static void tty_dinfo_free(struct tty_dump_info *dinfo) +{ + list_del(&dinfo->list); + close_safe(&dinfo->lfd); + xfree(dinfo->tty_data); + xfree(dinfo); +} + +/* + * Dumping queued data must be done at the very end of the + * checkpoint procedure -- it's tail optimization, we trying + * to defer this procedure until everything else passed + * successfully because in real it is time consuming on + * its own which might require writing data back to the + * former peers if case something go wrong. + * + * Moreover when we gather PTYs peers into own list we + * do it in destructive way -- the former @all_ttys + * list get modified (one of the peer get moved from + * @all_ttys to @all_ptys list) because otherwise we + * will have to add one more entry into tty_dump_info, + * thus we simply reuse the @list entry for own needs. 
+ */ +static int tty_dump_queued_data(void) +{ + struct tty_dump_info *dinfo, *peer, *n; + LIST_HEAD(all_ptys); + int ret = 0; + + /* + * Link PTY peers, and move one of linked + * into separate list. + */ + list_for_each_entry_safe(dinfo, n, &all_ttys, list) { + if (!is_pty(dinfo->driver) || dinfo->link) + continue; + + peer = dinfo; + list_for_each_entry_continue(peer, &all_ttys, list) { + if (!is_pty(peer->driver) || peer->link) + continue; + + if (peer->index == dinfo->index) { + dinfo->link = peer; + peer->link = dinfo; + pr_debug("Link PTYs (%#x)\n", dinfo->id); + + list_move(&dinfo->list, &all_ptys); + } + } + } + + /* + * Once linked fetch the queued data if present. + */ + list_for_each_entry(dinfo, &all_ptys, list) { + ret = tty_do_dump_queued_data(dinfo); + if (ret) + break; + ret = tty_do_dump_queued_data(dinfo->link); + if (ret) + break; + } + + if (ret || opts.final_state != TASK_DEAD) { + list_for_each_entry(dinfo, &all_ptys, list) + tty_do_writeback_queued_data(dinfo); + } + + list_for_each_entry_safe(dinfo, n, &all_ptys, list) { + tty_dinfo_free(dinfo->link); + tty_dinfo_free(dinfo); + } + + list_for_each_entry_safe(dinfo, n, &all_ttys, list) + tty_dinfo_free(dinfo); + + return ret; +} + +static int tty_verify_ctty(void) +{ + struct tty_dump_info *d, *p; + + list_for_each_entry(d, &all_ttys, list) { + struct tty_dump_info *n = NULL; + + if (d->driver->type != TTY_TYPE__CTTY) + continue; + + list_for_each_entry(p, &all_ttys, list) { + if (!is_pty(p->driver) || + p->sid != d->sid || + p->pgrp != d->sid) + continue; + n = p; + break; + } + + if (!n) { + pr_err("ctty inheritance detected sid/pgrp %d, " + "no PTY peer with sid/pgrp needed\n", + d->sid); + return -ENOENT; + } else if (n->pid_real != d->pid_real) { + pr_err("ctty inheritance detected sid/pgrp %d " + "(ctty pid_real %d pty pid_real %d)\n", + d->sid, d->pid_real, n->pid_real); + return -ENOENT; + } + } + + return 0; +} + +int tty_post_actions(void) +{ + if (tty_verify_ctty()) + return -1; + 
if (tty_verify_active_pairs()) + return -1; + else if (tty_dump_queued_data()) + return -1; + return 0; +} + +int tty_prep_fds(void) +{ + if (!opts.shell_job) + return 0; + + if (!isatty(STDIN_FILENO)) + pr_info("Standard stream is not a terminal, may fail later\n"); + else + stdin_isatty = true; + + self_stdin_fdid = fdstore_add(STDIN_FILENO); + if (self_stdin_fdid < 0) { + pr_err("Can't place stdin fd to fdstore\n"); + return -1; + } + + return 0; +} + +static int open_pty(void *arg, int flags) +{ + int dfd = (unsigned long) arg; + /* + * Never set as a control terminal automatically, all + * ctty magic happens only in tty_set_sid(). + */ + flags |= O_NOCTTY; + return openat(dfd, "ptmx", flags); +} + +/* Create a pty pair and save a master descriptor in fdstore */ +static int pty_create_ptmx_index(int dfd, int index, int flags) +{ + struct tty_info *info; + int fd, id; + + fd = __pty_open_ptmx_index(index, flags, open_pty, (void *)(unsigned long) dfd, "ptmx"); + if (fd < 0) + return -1; + + id = fdstore_add(fd); + if (id < 0) + return -1; + close(fd); + + list_for_each_entry(info, &all_ttys, list) { + if (!is_pty(info->driver)) + continue; + + if (info->tie->pty->index == index) { + info->fdstore_id = id; + } + } + + return 0; +} + +/* + * Here we check that a master of a bind-mounted slave was opened in the root + * mount namespace. The problem is that we restore all mounts in the root mount + * namespace. Only when all mounts are restored, we create other mount + * namespaces. So when we are restoring mounts, we can open files only in the + * root mount namespace. 
+ */ +int devpts_check_bindmount(struct mount_info *m) +{ + struct tty_dump_info *dinfo = NULL; + struct mount_info *master_mp; + int index; + + if (strcmp(m->root, "/") == 0 || strcmp(m->root, "/ptmx") == 0) + return 0; + + if (sscanf(m->root, "/%d", &index) != 1) { + pr_err("Unable to parse %s\n", m->root); + return -1; + } + + list_for_each_entry(dinfo, &all_ttys, list) { + if (!is_pty(dinfo->driver)) + continue; + + if (dinfo->driver->subtype != TTY_SUBTYPE_MASTER) + continue; + + if (dinfo->index == index) + goto found; + } + + if (opts.orphan_pts_master) /* external master */ + return 0; + + pr_err("Unable to find a master for %s\n", m->root); + return -1; + +found: + /* mnt_id isn't reported in fdinfo, so here is only one mntns */ + if (dinfo->mnt_id == -1) + return 0; + + master_mp = lookup_mnt_id(dinfo->mnt_id); + if (!master_mp) { + pr_err("Unable to find a mount %d\n", dinfo->mnt_id); + return -1; + } + + if (master_mp->nsid->type != NS_ROOT) { + pr_err("The master for %s isn't from the root mntns\n", + m->root); + return -1; + } + + return 0; +} + +/* Restore slave pty-s which have to be bind-mounted to somewhere */ +int devpts_restore(struct mount_info *pm) +{ + struct mount_info *bm; + int dfd, exit_code = -1; + + dfd = open(pm->mountpoint, O_RDONLY); + if (dfd < 0) { + pr_perror("Unable to open %s", pm->mountpoint); + return -1; + } + + + list_for_each_entry(bm, &pm->mnt_bind, mnt_bind) { + int idx; + struct stat st; + + if (sscanf(bm->root, "/%d", &idx) < 1) + continue; + + if (fstatat(dfd, bm->root + 1, &st, 0) == 0) + continue; + + pr_debug("Create a slave tty %d\n", idx); + if (pty_create_ptmx_index(dfd, idx, O_RDWR)) + goto err; + } + + exit_code = 0; +err: + close(dfd); + return exit_code; +} diff --git a/CRIU_code/criu/tun.c b/CRIU_code/criu/tun.c new file mode 100644 index 0000000..b13148b --- /dev/null +++ b/CRIU_code/criu/tun.c @@ -0,0 +1,547 @@ +#include +#include +#include +#include +#include +#include +#include + +// MAO required on 
Centos 6 (linux-3.18.1 kernel) +#include + +#include "cr_options.h" +#include "imgset.h" +#include "protobuf.h" +#include "string.h" +#include "files.h" +#include "files-reg.h" +#include "tun.h" +#include "net.h" +#include "namespaces.h" +#include "xmalloc.h" +#include "kerndat.h" +#include "sockets.h" + +#include "images/tun.pb-c.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "tun: " + +#ifndef IFF_PERSIST +#define IFF_PERSIST 0x0800 +#endif + +#ifndef IFF_NOFILTER +#define IFF_NOFILTER 0x1000 +#endif + +#ifndef TUNSETQUEUE +#define TUNSETQUEUE _IOW('T', 217, int) +#define IFF_ATTACH_QUEUE 0x0200 +#define IFF_DETACH_QUEUE 0x0400 +#endif + +/* + * Absence of the 1st ioctl means we cannot restore tun link. But + * since the 2nd one appeared at the same time, we'll "check" this + * by trying to dump filter and abort dump if it's not there. + */ + +#ifndef TUNSETIFINDEX +#define TUNSETIFINDEX _IOW('T', 218, unsigned int) +#endif + +#ifndef TUNGETFILTER +#define TUNGETFILTER _IOR('T', 219, struct sock_fprog) +#endif + +#define TUN_DEV_GEN_PATH "/dev/net/tun" + +int check_tun_cr(int no_tun_err) +{ + int fd, idx = 13, ret; + + fd = open(TUN_DEV_GEN_PATH, O_RDWR); + if (fd < 0) { + pr_perror("Can't check tun support"); + return no_tun_err; + } + + ret = ioctl(fd, TUNSETIFINDEX, &idx); + if (ret < 0) + pr_perror("No proper support for tun dump/restore"); + + close(fd); + return ret; +} + +int check_tun_netns_cr(bool *result) +{ + bool val = false; + int tun; + + tun = open(TUN_DEV_GEN_PATH, O_RDONLY); + if (tun < 0) { + pr_perror("Unable to create tun"); + goto out; + } + check_has_netns_ioc(tun, &val, "tun"); + close(tun); + +out: + if (result) + *result = val; + + return 0; +} + +static LIST_HEAD(tun_links); + +struct tun_link { + char name[IFNAMSIZ]; + struct list_head l; + unsigned ns_id; + union { + struct { + unsigned flags; + } rst; + + struct { + unsigned sndbuf; + unsigned vnethdr; + } dmp; + }; +}; + +static int list_tun_link(NetDeviceEntry *nde, unsigned ns_id) +{ 
+ struct tun_link *tl; + + tl = xmalloc(sizeof(*tl)); + if (!tl) + return -1; + + strlcpy(tl->name, nde->name, sizeof(tl->name)); + /* + * Keep tun-flags not only for persistency fixup (see + * comment below), but also for TUNSETIFF -- we must + * open the device with the same flags it should live + * with (i.e. -- with which it was created. + */ + tl->rst.flags = nde->tun->flags; + tl->ns_id = ns_id; + list_add_tail(&tl->l, &tun_links); + return 0; +} + +static struct tun_link *find_tun_link(char *name, unsigned int ns_id) +{ + struct tun_link *tl; + + list_for_each_entry(tl, &tun_links, l) { + if (!strcmp(tl->name, name) && + tl->ns_id == ns_id) + return tl; + } + return NULL; +} + +static struct tun_link *__dump_tun_link_fd(int fd, char *name, unsigned ns_id, unsigned flags) +{ + struct tun_link *tl; + struct sock_fprog flt; + + tl = xmalloc(sizeof(*tl)); + if (!tl) + goto err; + strlcpy(tl->name, name, sizeof(tl->name)); + tl->ns_id = ns_id; + + if (ioctl(fd, TUNGETVNETHDRSZ, &tl->dmp.vnethdr) < 0) { + pr_perror("Can't dump vnethdr size for %s", name); + goto err; + } + + if (ioctl(fd, TUNGETSNDBUF, &tl->dmp.sndbuf) < 0) { + pr_perror("Can't dump sndbuf for %s", name); + goto err; + } + + if (flags & IFF_TAP) { + pr_debug("Checking filter for tap %s\n", name); + if (ioctl(fd, TUNGETFILTER, &flt) < 0) { + pr_perror("Can't get tun filter for %s", name); + goto err; + } + + /* + * TUN filters are tricky -- the program itself is 'somewhere' + * in the task's memory, so we can't get one for unattached + * persistent device. 
The only way for doing it is opening the + * device with IFF_NOFILTER and attaching some fake one :( + */ + + if (flt.len != 0) { + pr_err("Can't dump %s with filter on-board\n", name); + goto err; + } + } else if (!(flags & IFF_NOFILTER)) { + pr_err("No info about %s filter, kernel is too old\n", name); + goto err; + } + + return tl; + +err: + xfree(tl); + return NULL; +} + +static struct tun_link *dump_tun_link_fd(int fd, char *name, unsigned ns_id, unsigned flags) +{ + struct tun_link *tl; + + tl = find_tun_link(name, ns_id); + if (tl) + return tl; + + tl = __dump_tun_link_fd(fd, name, ns_id, flags); + if (tl) { + /* + * Keep this in list till links dumping code starts. + * We can't let it dump all this stuff itself, since + * multiple attaches to one tun device is limited and + * we may not be able to it that late. + * + * For persistent detached devices the get_tun_link_fd + * will attach to the device and get the needed stuff. + */ + list_add(&tl->l, &tun_links); + } + return tl; +} + +static int open_tun_dev(char *name, unsigned int idx, unsigned flags) +{ + int fd; + struct ifreq ifr; + + fd = open(TUN_DEV_GEN_PATH, O_RDWR); + if (fd < 0) { + pr_perror("Can't open tun device"); + return -1; + } + + if (idx) { + pr_debug(" restoring %u for %s tun\n", idx, name); + if (ioctl(fd, TUNSETIFINDEX, &idx) < 0) { + pr_perror("Can't restore tun's index"); + goto err; + } + } + + memset(&ifr, 0, sizeof(ifr)); + strlcpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); + ifr.ifr_flags = flags; + + if (ioctl(fd, TUNSETIFF, &ifr)) { + pr_perror("Can't create tun device"); + goto err; + } + + return fd; + +err: + close(fd); + return -1; +} + +static struct tun_link *get_tun_link_fd(char *name, unsigned ns_id, unsigned flags) +{ + struct tun_link *tl; + int fd; + + tl = find_tun_link(name, ns_id); + if (tl) + return tl; + + /* + * If we haven't found this thing, then the + * device we see via netlink exists w/o any fds + * attached, i.e. 
-- it's persistent + */ + + if (!(flags & IFF_PERSIST)) { + pr_err("No fd infor for non persistent tun device %s\n", name); + return NULL; + } + + /* + * Kernel will try to attach filter (if it exists) to our memory, + * avoid this. + */ + + flags |= IFF_NOFILTER; + + fd = open_tun_dev(name, 0, flags); + if (fd < 0) + return NULL; + + tl = __dump_tun_link_fd(fd, name, ns_id, flags); + close(fd); + + return tl; +} + +static int dump_tunfile(int lfd, u32 id, const struct fd_parms *p) +{ + int ret; + struct cr_img *img; + FileEntry fe = FILE_ENTRY__INIT; + TunfileEntry tfe = TUNFILE_ENTRY__INIT; + struct ns_id *ns; + struct ifreq ifr; + + if (!(root_ns_mask & CLONE_NEWNET)) { + pr_err("Net namespace is required to dump tun link\n"); + return -1; + } + + if (kdat.tun_ns) { + ns = get_socket_ns(lfd); + if (!ns) { + pr_err("No net_ns for tun device\n"); + return -1; + } + tfe.has_ns_id = true; + tfe.ns_id = ns->id; + } + + if (dump_one_reg_file(lfd, id, p)) + return -1; + + pr_info("Dumping tun-file %d with id %#x\n", lfd, id); + + tfe.id = id; + ret = ioctl(lfd, TUNGETIFF, &ifr); + if (ret < 0) { + if (errno != EBADFD) { + pr_perror("Can't dump tun-file device"); + return -1; + } + + /* + * Otherwise this is just opened file with not yet attached + * tun device. Go ahead an write the respective entry. 
+ */ + } else { + tfe.netdev = ifr.ifr_name; + pr_info("`- attached to device %s (flags %x)\n", tfe.netdev, ifr.ifr_flags); + + if (ifr.ifr_flags & IFF_DETACH_QUEUE) { + tfe.has_detached = true; + tfe.detached = true; + } + + if (dump_tun_link_fd(lfd, tfe.netdev, tfe.ns_id, ifr.ifr_flags) == NULL) + return -1; + } + + fe.type = FD_TYPES__TUNF; + fe.id = tfe.id; + fe.tunf = &tfe; + + img = img_from_set(glob_imgset, CR_FD_FILES); + return pb_write_one(img, &fe, PB_FILE); +} + +const struct fdtype_ops tunfile_dump_ops = { + .type = FD_TYPES__TUNF, + .dump = dump_tunfile, +}; + +struct tunfile_info { + struct file_desc d; + TunfileEntry *tfe; +}; + +static int tunfile_open(struct file_desc *d, int *new_fd) +{ + int fd, ns_id; + struct tunfile_info *ti; + struct ifreq ifr; + struct tun_link *tl; + + ti = container_of(d, struct tunfile_info, d); + + ns_id = ti->tfe->ns_id; + if (set_netns(ns_id)) + return -1; + + fd = open_reg_by_id(ti->tfe->id); + if (fd < 0) + return -1; + + if (!ti->tfe->netdev) + /* just-opened tun file */ + goto ok; + + tl = find_tun_link(ti->tfe->netdev, ns_id); + if (!tl) { + pr_err("No tun device for file %s\n", ti->tfe->netdev); + goto err; + } + + memset(&ifr, 0, sizeof(ifr)); + strlcpy(ifr.ifr_name, tl->name, sizeof(ifr.ifr_name)); + ifr.ifr_flags = tl->rst.flags; + + if (ioctl(fd, TUNSETIFF, &ifr) < 0) { + pr_perror("Can't attach tunfile to device"); + goto err; + } + + if (ti->tfe->has_detached && ti->tfe->detached) { + pr_info("Detaching from %s queue\n", ti->tfe->netdev); + ifr.ifr_flags = IFF_DETACH_QUEUE; + if (ioctl(fd, TUNSETQUEUE, &ifr) < 0) { + pr_perror("Can't detach queue"); + goto err; + } + } + + if (!(tl->rst.flags & IFF_PERSIST)) { + pr_info("Dropping persistency for %s\n", tl->name); + if (ioctl(fd, TUNSETPERSIST, 0) < 0) { + pr_perror("Error dropping persistency"); + goto err; + } + } +ok: + *new_fd = fd; + return 0; + +err: + close(fd); + return -1; +} + +static struct file_desc_ops tunfile_desc_ops = { + .type = 
FD_TYPES__TUNF, + .open = tunfile_open, +}; + +static int collect_one_tunfile(void *o, ProtobufCMessage *base, struct cr_img *i) +{ + struct tunfile_info *ti = o; + + ti->tfe = pb_msg(base, TunfileEntry); + file_desc_add(&ti->d, ti->tfe->id, &tunfile_desc_ops); + + pr_info("Collected %s tunfile\n", ti->tfe->netdev); + + return 0; +} + +struct collect_image_info tunfile_cinfo = { + .fd_type = CR_FD_TUNFILE, + .pb_type = PB_TUNFILE, + .priv_size = sizeof(struct tunfile_info), + .collect = collect_one_tunfile, +}; + +int dump_tun_link(NetDeviceEntry *nde, struct cr_imgset *fds, struct nlattr **info) +{ + TunLinkEntry tle = TUN_LINK_ENTRY__INIT; + char spath[64]; + char buf[64]; + int ret = 0; + struct tun_link *tl; + + sprintf(spath, "class/net/%s/tun_flags", nde->name); + ret |= read_ns_sys_file(spath, buf, sizeof(buf)); + tle.flags = strtol(buf, NULL, 0); + + sprintf(spath, "class/net/%s/owner", nde->name); + ret |= read_ns_sys_file(spath, buf, sizeof(buf)); + tle.owner = strtol(buf, NULL, 10); + + sprintf(spath, "class/net/%s/group", nde->name); + ret |= read_ns_sys_file(spath, buf, sizeof(buf)); + tle.group = strtol(buf, NULL, 10); + + if (ret < 0) + return ret; + + tl = get_tun_link_fd(nde->name, nde->peer_nsid, tle.flags); + if (!tl) + return ret; + + tle.vnethdr = tl->dmp.vnethdr; + tle.sndbuf = tl->dmp.sndbuf; + + nde->tun = &tle; + return write_netdev_img(nde, fds, info); +} + +int restore_one_tun(struct ns_id *ns, struct net_link *link, int nlsk) +{ + NetDeviceEntry *nde = link->nde; + int fd, ret = -1, aux; + + if (!nde->tun) { + pr_err("Corrupted TUN link entry %x\n", nde->ifindex); + return -1; + } + + pr_info("Restoring tun device %s\n", nde->name); + + fd = open_tun_dev(nde->name, nde->ifindex, nde->tun->flags); + if (fd < 0) + return -1; + + aux = nde->tun->owner; + if ((aux != -1) && ioctl(fd, TUNSETOWNER, aux) < 0) { + pr_perror("Can't set owner"); + goto out; + } + + aux = nde->tun->group; + if ((aux != -1) && ioctl(fd, TUNSETGROUP, aux) < 0) { + 
pr_perror("Can't set group"); + goto out; + } + + aux = nde->tun->sndbuf; + if (ioctl(fd, TUNSETSNDBUF, &aux) < 0) { + pr_perror("Can't set sndbuf"); + goto out; + } + + aux = nde->tun->vnethdr; + if (ioctl(fd, TUNSETVNETHDRSZ, &aux) < 0) { + pr_perror("Can't set vnethdr"); + goto out; + } + + /* + * Set this device persistent anyway and schedule + * the persistence drop if it should not be such. + * The first _real_ opener will do it. + */ + + if (ioctl(fd, TUNSETPERSIST, 1)) { + pr_perror("Can't make tun device persistent"); + goto out; + } + + if (restore_link_parms(link, nlsk)) { + pr_err("Error restoring %s link params\n", nde->name); + goto out; + } + + ret = list_tun_link(nde, ns->id); +out: + close(fd); + return ret; +} diff --git a/CRIU_code/criu/uffd.c b/CRIU_code/criu/uffd.c new file mode 100644 index 0000000..5c1e321 --- /dev/null +++ b/CRIU_code/criu/uffd.c @@ -0,0 +1,1476 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "linux/userfaultfd.h" + +#include "int.h" +#include "page.h" +#include "criu-log.h" +#include "criu-plugin.h" +#include "pagemap.h" +#include "files-reg.h" +#include "kerndat.h" +#include "mem.h" +#include "uffd.h" +#include "util-pie.h" +#include "protobuf.h" +#include "pstree.h" +#include "crtools.h" +#include "cr_options.h" +#include "xmalloc.h" +#include +#include "restorer.h" +#include "page-xfer.h" +#include "common/lock.h" +#include "rst-malloc.h" +#include "tls.h" +#include "fdstore.h" +#include "util.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "uffd: " + +#define lp_debug(lpi, fmt, arg...) pr_debug("%d-%d: " fmt, lpi->pid, lpi->lpfd.fd, ##arg) +#define lp_info(lpi, fmt, arg...) pr_info("%d-%d: " fmt, lpi->pid, lpi->lpfd.fd, ##arg) +#define lp_warn(lpi, fmt, arg...) pr_warn("%d-%d: " fmt, lpi->pid, lpi->lpfd.fd, ##arg) +#define lp_err(lpi, fmt, arg...) 
pr_err("%d-%d: " fmt, lpi->pid, lpi->lpfd.fd, ##arg) +#define lp_perror(lpi, fmt, arg...) pr_perror("%d-%d: " fmt, lpi->pid, lpi->lpfd.fd, ##arg) + +#define NEED_UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ + UFFD_FEATURE_EVENT_REMAP | \ + UFFD_FEATURE_EVENT_UNMAP | \ + UFFD_FEATURE_EVENT_REMOVE) + +#define LAZY_PAGES_SOCK_NAME "lazy-pages.socket" + +#define LAZY_PAGES_RESTORE_FINISHED 0x52535446 /* ReSTore Finished */ + +/* + * Backround transfer parameters. + * The default xfer length is arbitrary set to 64Kbytes + * The limit of 4Mbytes matches the maximal chunk size we can have in + * a pipe in the page-server + */ +#define DEFAULT_XFER_LEN (64 << 10) +#define MAX_XFER_LEN (4 << 20) + +static mutex_t *lazy_sock_mutex; + +struct lazy_iov { + struct list_head l; + unsigned long start; /* run-time start address, tracks remaps */ + unsigned long end; /* run-time end address, tracks remaps */ + unsigned long img_start; /* start address at the dump time */ +}; + +struct lazy_pages_info { + int pid; + bool exited; + + struct list_head iovs; + struct list_head reqs; + + struct lazy_pages_info *parent; + unsigned ref_cnt; + + struct page_read pr; + + unsigned long xfer_len; /* in pages */ + unsigned long total_pages; + unsigned long copied_pages; + + struct epoll_rfd lpfd; + + struct list_head l; + + unsigned long buf_size; + void *buf; +}; + +/* global lazy-pages daemon state */ +static LIST_HEAD(lpis); +static LIST_HEAD(exiting_lpis); +static LIST_HEAD(pending_lpis); +static int epollfd; +static bool restore_finished; +static struct epoll_rfd lazy_sk_rfd; +/* socket for communication with lazy-pages daemon */ +static int lazy_pages_sk_id = -1; + +static int handle_uffd_event(struct epoll_rfd *lpfd); + +static struct lazy_pages_info *lpi_init(void) +{ + struct lazy_pages_info *lpi = NULL; + + lpi = xmalloc(sizeof(*lpi)); + if (!lpi) + return NULL; + + memset(lpi, 0, sizeof(*lpi)); + INIT_LIST_HEAD(&lpi->iovs); + INIT_LIST_HEAD(&lpi->reqs); + INIT_LIST_HEAD(&lpi->l); 
+ lpi->lpfd.read_event = handle_uffd_event; + lpi->xfer_len = DEFAULT_XFER_LEN; + lpi->ref_cnt = 1; + + return lpi; +} + +static void free_iovs(struct lazy_pages_info *lpi) +{ + struct lazy_iov *p, *n; + + list_for_each_entry_safe(p, n, &lpi->iovs, l) { + list_del(&p->l); + xfree(p); + } + + list_for_each_entry_safe(p, n, &lpi->reqs, l) { + list_del(&p->l); + xfree(p); + } +} + +static void lpi_fini(struct lazy_pages_info *lpi); + +static inline void lpi_put(struct lazy_pages_info *lpi) +{ + lpi->ref_cnt--; + if (!lpi->ref_cnt) + lpi_fini(lpi); +} + +static inline void lpi_get(struct lazy_pages_info *lpi) +{ + lpi->ref_cnt++; +} + +static void lpi_fini(struct lazy_pages_info *lpi) +{ + if (!lpi) + return; + xfree(lpi->buf); + free_iovs(lpi); + if (lpi->lpfd.fd > 0) + close(lpi->lpfd.fd); + if (lpi->parent) + lpi_put(lpi->parent); + if (!lpi->parent && lpi->pr.close) + lpi->pr.close(&lpi->pr); + xfree(lpi); +} + + +static int prepare_sock_addr(struct sockaddr_un *saddr) +{ + int len; + + memset(saddr, 0, sizeof(struct sockaddr_un)); + + saddr->sun_family = AF_UNIX; + len = snprintf(saddr->sun_path, sizeof(saddr->sun_path), + "%s", LAZY_PAGES_SOCK_NAME); + if (len >= sizeof(saddr->sun_path)) { + pr_err("Wrong UNIX socket name: %s\n", LAZY_PAGES_SOCK_NAME); + return -1; + } + + return 0; +} + +static int send_uffd(int sendfd, int pid) +{ + int fd; + int ret = -1; + + if (sendfd < 0) + return -1; + + fd = fdstore_get(lazy_pages_sk_id); + if (fd < 0) { + pr_err("%s: get_service_fd\n", __func__); + return -1; + } + + mutex_lock(lazy_sock_mutex); + + /* The "transfer protocol" is first the pid as int and then + * the FD for UFFD */ + pr_debug("Sending PID %d\n", pid); + if (send(fd, &pid, sizeof(pid), 0) < 0) { + pr_perror("PID sending error"); + goto out; + } + + /* for a zombie process pid will be negative */ + if (pid < 0) { + ret = 0; + goto out; + } + + if (send_fd(fd, NULL, 0, sendfd) < 0) { + pr_err("send_fd error\n"); + goto out; + } + + ret = 0; +out: + 
mutex_unlock(lazy_sock_mutex); + close(fd); + return ret; +} + +int lazy_pages_setup_zombie(int pid) +{ + if (!opts.lazy_pages) + return 0; + + if (send_uffd(0, -pid)) + return -1; + + return 0; +} + +bool uffd_noncooperative(void) +{ + unsigned long features = NEED_UFFD_API_FEATURES; + + return (kdat.uffd_features & features) == features; +} + +int uffd_open(int flags, unsigned long *features) +{ + struct uffdio_api uffdio_api = { 0 }; + int uffd; + + uffd = syscall(SYS_userfaultfd, flags); + if (uffd == -1) { + pr_perror("Lazy pages are not available"); + return -errno; + } + + uffdio_api.api = UFFD_API; + if (features) + uffdio_api.features = *features; + + if (ioctl(uffd, UFFDIO_API, &uffdio_api)) { + pr_perror("Failed to get uffd API"); + goto err; + } + + if (uffdio_api.api != UFFD_API) { + pr_err("Incompatible uffd API: expected %Lu, got %Lu\n", + UFFD_API, uffdio_api.api); + goto err; + } + + if (features) + *features = uffdio_api.features; + + return uffd; + +err: + close(uffd); + return -1; +} + +/* This function is used by 'criu restore --lazy-pages' */ +int setup_uffd(int pid, struct task_restore_args *task_args) +{ + unsigned long features = kdat.uffd_features & NEED_UFFD_API_FEATURES; + + if (!opts.lazy_pages) { + task_args->uffd = -1; + return 0; + } + + /* + * Open userfaulfd FD which is passed to the restorer blob and + * to a second process handling the userfaultfd page faults. 
+ */ + task_args->uffd = uffd_open(O_CLOEXEC | O_NONBLOCK, &features); + if (task_args->uffd < 0) { + pr_perror("Unable to open an userfaultfd descriptor"); + return -1; + } + + if (send_uffd(task_args->uffd, pid) < 0) + goto err; + + return 0; +err: + close(task_args->uffd); + return -1; +} + +int prepare_lazy_pages_socket(void) +{ + int fd, len, ret = -1; + struct sockaddr_un sun; + + if (!opts.lazy_pages) + return 0; + + if (prepare_sock_addr(&sun)) + return -1; + + lazy_sock_mutex = shmalloc(sizeof(*lazy_sock_mutex)); + if (!lazy_sock_mutex) + return -1; + + mutex_init(lazy_sock_mutex); + + if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) + return -1; + + len = offsetof(struct sockaddr_un, sun_path) + strlen(sun.sun_path); + if (connect(fd, (struct sockaddr *) &sun, len) < 0) { + pr_perror("connect to %s failed", sun.sun_path); + goto out; + } + + lazy_pages_sk_id = fdstore_add(fd); + if (lazy_pages_sk_id < 0) { + pr_perror("Can't add fd to fdstore"); + goto out; + } + + ret = 0; +out: + close(fd); + return ret; +} + +static int server_listen(struct sockaddr_un *saddr) +{ + int fd; + int len; + + if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) + return -1; + + unlink(saddr->sun_path); + + len = offsetof(struct sockaddr_un, sun_path) + strlen(saddr->sun_path); + + if (bind(fd, (struct sockaddr *) saddr, len) < 0) { + goto out; + } + + if (listen(fd, 10) < 0) { + goto out; + } + + return fd; + +out: + close(fd); + return -1; +} + +static MmEntry *init_mm_entry(struct lazy_pages_info *lpi) +{ + struct cr_img *img; + MmEntry *mm; + int ret; + + img = open_image(CR_FD_MM, O_RSTR, lpi->pid); + if (!img) + return NULL; + + ret = pb_read_one_eof(img, &mm, PB_MM); + close_image(img); + if (ret == -1) + return NULL; + lp_debug(lpi, "Found %zd VMAs in image\n", mm->n_vmas); + + return mm; +} + +static struct lazy_iov *find_iov(struct lazy_pages_info *lpi, + unsigned long addr) +{ + struct lazy_iov *iov; + + list_for_each_entry(iov, &lpi->iovs, l) + if (addr >= iov->start 
&& addr < iov->end) + return iov; + + return NULL; +} + +static int split_iov(struct lazy_iov *iov, unsigned long addr) +{ + struct lazy_iov *new; + + new = xzalloc(sizeof(*new)); + if (!new) + return -1; + + new->start = addr; + new->img_start = iov->img_start + addr - iov->start; + new->end = iov->end; + iov->end = addr; + list_add(&new->l, &iov->l); + + return 0; +} + +static void iov_list_insert(struct lazy_iov *new, struct list_head *dst) +{ + struct lazy_iov *iov; + + if (list_empty(dst)) { + list_move(&new->l, dst); + return; + } + + list_for_each_entry(iov, dst, l) { + if (new->start < iov->start) { + list_move_tail(&new->l, &iov->l); + break; + } + if (list_is_last(&iov->l, dst) && + new->start > iov->start) { + list_move(&new->l, &iov->l); + break; + } + } +} + +static void merge_iov_lists(struct list_head *src, struct list_head *dst) +{ + struct lazy_iov *iov, *n; + + if (list_empty(src)) + return; + + list_for_each_entry_safe(iov, n, src, l) + iov_list_insert(iov, dst); +} + +static int __copy_iov_list(struct list_head *src, struct list_head *dst) +{ + struct lazy_iov *iov, *new; + + list_for_each_entry(iov, src, l) { + new = xzalloc(sizeof(*new)); + if (!new) + return -1; + + new->start = iov->start; + new->img_start = iov->img_start; + new->end = iov->end; + + list_add_tail(&new->l, dst); + } + + + return 0; +} + +static int copy_iovs(struct lazy_pages_info *src, struct lazy_pages_info *dst) +{ + if (__copy_iov_list(&src->iovs, &dst->iovs)) + goto free_iovs; + + if (__copy_iov_list(&src->reqs, &dst->reqs)) + goto free_iovs; + + /* + * The IOVs already in flight for the parent process need to be + * transferred again for the child process + */ + merge_iov_lists(&dst->reqs, &dst->iovs); + + dst->buf_size = src->buf_size; + if (posix_memalign(&dst->buf, PAGE_SIZE, dst->buf_size)) + goto free_iovs; + + return 0; + +free_iovs: + free_iovs(dst); + return -1; +} + +/* + * Purge range (addr, addr + len) from lazy_iovs. 
The range may + * cover several continuous IOVs. + */ +static int __drop_iovs(struct list_head *iovs, unsigned long addr, int len) +{ + struct lazy_iov *iov, *n; + + list_for_each_entry_safe(iov, n, iovs, l) { + unsigned long start = iov->start; + unsigned long end = iov->end; + + if (len <= 0 || addr + len < start) + break; + + if (addr >= end) + continue; + + if (addr < start) { + len -= (start - addr); + addr = start; + } + + /* + * The range completely fits into the current IOV. + * If addr equals iov_start we just "drop" the + * beginning of the IOV. Otherwise, we make the IOV to + * end at addr, and add a new IOV start starts at + * addr + len. + */ + if (addr + len < end) { + if (addr == start) { + iov->start += len; + iov->img_start += len; + } else { + if (split_iov(iov, addr + len)) + return -1; + iov->end = addr; + } + break; + } + + /* + * The range spawns beyond the end of the current IOV. + * If addr equals iov_start we just "drop" the entire + * IOV. Otherwise, we cut the beginning of the IOV + * and continue to the next one with the updated range + */ + if (addr == start) { + list_del(&iov->l); + xfree(iov); + } else { + iov->end = addr; + } + + len -= (end - addr); + addr = end; + } + + return 0; +} + +static int drop_iovs(struct lazy_pages_info *lpi, unsigned long addr, int len) +{ + if (__drop_iovs(&lpi->iovs, addr, len)) + return -1; + + if (__drop_iovs(&lpi->reqs, addr, len)) + return -1; + + return 0; +} + + +static struct lazy_iov *extract_range(struct lazy_iov *iov, + unsigned long start, + unsigned long end) +{ + /* move the IOV tail into a new IOV */ + if (end < iov->end) + if (split_iov(iov, end)) + return NULL; + + if (start == iov->start) + return iov; + + /* after splitting the IOV head we'll need the ->next IOV */ + if (split_iov(iov, start)) + return NULL; + + return list_entry(iov->l.next, struct lazy_iov, l); +} + +static int __remap_iovs(struct list_head *iovs, unsigned long from, + unsigned long to, unsigned long len) +{ + 
LIST_HEAD(remaps); + + unsigned long off = to - from; + struct lazy_iov *iov, *n; + + list_for_each_entry_safe(iov, n, iovs, l) { + if (from >= iov->end) + continue; + + if (len <= 0 || from + len <= iov->start) + break; + + if (from < iov->start) { + len -= (iov->start - from); + from = iov->start; + } + + if (from > iov->start) { + if (split_iov(iov, from)) + return -1; + list_safe_reset_next(iov, n, l); + continue; + } + + if (from + len < iov->end) { + if (split_iov(iov, from + len)) + return -1; + list_safe_reset_next(iov, n, l); + } + + /* here we have iov->start = from, iov->end <= from + len */ + from = iov->end; + len -= iov->end - iov->start; + iov->start += off; + iov->end += off; + list_move_tail(&iov->l, &remaps); + } + + merge_iov_lists(&remaps, iovs); + + return 0; +} + +static int remap_iovs(struct lazy_pages_info *lpi, unsigned long from, + unsigned long to, unsigned long len) +{ + if (__remap_iovs(&lpi->iovs, from, to, len)) + return -1; + + if (__remap_iovs(&lpi->reqs, from, to, len)) + return -1; + + return 0; +} + +/* + * Create a list of IOVs that can be handled using userfaultfd. The + * IOVs generally correspond to lazy pagemap entries, except the cases + * when a single pagemap entry covers several VMAs. In those cases + * IOVs are split at VMA boundaries because UFFDIO_COPY may be done + * only inside a single VMA. + * We assume here that pagemaps and VMAs are sorted. 
+ */ +static int collect_iovs(struct lazy_pages_info *lpi) +{ + struct page_read *pr = &lpi->pr; + struct lazy_iov *iov; + MmEntry *mm; + int nr_pages = 0, n_vma = 0, max_iov_len = 0; + int ret = -1; + unsigned long start, end, len; + + mm = init_mm_entry(lpi); + if (!mm) + return -1; + + while (pr->advance(pr)) { + if (!pagemap_lazy(pr->pe)) + continue; + + start = pr->pe->vaddr; + end = start + pr->pe->nr_pages * page_size(); + nr_pages += pr->pe->nr_pages; + + for (; n_vma < mm->n_vmas; n_vma++) { + VmaEntry *vma = mm->vmas[n_vma]; + + if (start >= vma->end) + continue; + + iov = xzalloc(sizeof(*iov)); + if (!iov) + goto free_iovs; + + len = min_t(uint64_t, end, vma->end) - start; + iov->start = start; + iov->img_start = start; + iov->end = iov->start + len; + list_add_tail(&iov->l, &lpi->iovs); + + if (len > max_iov_len) + max_iov_len = len; + + if (end <= vma->end) + break; + + start = vma->end; + } + } + + lpi->buf_size = max_iov_len; + if (posix_memalign(&lpi->buf, PAGE_SIZE, lpi->buf_size)) + goto free_iovs; + + ret = nr_pages; + goto free_mm; + +free_iovs: + free_iovs(lpi); +free_mm: + mm_entry__free_unpacked(mm, NULL); + + return ret; +} + +static int uffd_io_complete(struct page_read *pr, unsigned long vaddr, int nr); + +static int ud_open(int client, struct lazy_pages_info **_lpi) +{ + struct lazy_pages_info *lpi; + int ret = -1; + int pr_flags = PR_TASK; + + lpi = lpi_init(); + if (!lpi) + goto out; + + /* The "transfer protocol" is first the pid as int and then + * the FD for UFFD */ + ret = recv(client, &lpi->pid, sizeof(lpi->pid), 0); + if (ret != sizeof(lpi->pid)) { + if (ret < 0) + pr_perror("PID recv error"); + else + pr_err("PID recv: short read\n"); + goto out; + } + + if (lpi->pid < 0) { + pr_debug("Zombie PID: %d\n", lpi->pid); + lpi_fini(lpi); + return 0; + } + + lpi->lpfd.fd = recv_fd(client); + if (lpi->lpfd.fd < 0) { + pr_err("recv_fd error\n"); + goto out; + } + pr_debug("Received PID: %d, uffd: %d\n", lpi->pid, lpi->lpfd.fd); + + if 
(opts.use_page_server) + pr_flags |= PR_REMOTE; + ret = open_page_read(lpi->pid, &lpi->pr, pr_flags); + if (ret <= 0) { + lp_err(lpi, "Failed to open pagemap\n"); + goto out; + } + + lpi->pr.io_complete = uffd_io_complete; + + /* + * Find the memory pages belonging to the restored process + * so that it is trackable when all pages have been transferred. + */ + ret = collect_iovs(lpi); + if (ret < 0) + goto out; + lpi->total_pages = ret; + + lp_debug(lpi, "Found %ld pages to be handled by UFFD\n", lpi->total_pages); + + list_add_tail(&lpi->l, &lpis); + *_lpi = lpi; + + return 0; + +out: + lpi_fini(lpi); + return -1; +} + +static int handle_exit(struct lazy_pages_info *lpi) +{ + lp_debug(lpi, "EXIT\n"); + if (epoll_del_rfd(epollfd, &lpi->lpfd)) + return -1; + free_iovs(lpi); + close(lpi->lpfd.fd); + lpi->lpfd.fd = -lpi->lpfd.fd; + lpi->exited = true; + + /* keep it for tracking in-flight requests and for the summary */ + list_move_tail(&lpi->l, &lpis); + + return 0; +} + +static bool uffd_recoverable_error(int mcopy_rc) +{ + if (errno == EAGAIN || errno == ENOENT || errno == EEXIST) + return true; + + if (mcopy_rc == -ENOENT || mcopy_rc == -EEXIST) + return true; + + return false; +} + +static int uffd_check_op_error(struct lazy_pages_info *lpi, const char *op, + int *nr_pages, long mcopy_rc) +{ + if (errno == ENOSPC || errno == ESRCH) { + handle_exit(lpi); + return 0; + } + + if (!uffd_recoverable_error(mcopy_rc)) { + lp_perror(lpi, "%s: mcopy_rc:%ld", op, mcopy_rc); + return -1; + } + + lp_debug(lpi, "%s: mcopy_rc:%ld, errno:%d\n", op, mcopy_rc, errno); + + if (mcopy_rc <= 0) + *nr_pages = 0; + else + *nr_pages = mcopy_rc / PAGE_SIZE; + + return 0; +} + +static int uffd_copy(struct lazy_pages_info *lpi, __u64 address, int *nr_pages) +{ + struct uffdio_copy uffdio_copy; + unsigned long len = *nr_pages * page_size(); + + uffdio_copy.dst = address; + uffdio_copy.src = (unsigned long)lpi->buf; + uffdio_copy.len = len; + uffdio_copy.mode = 0; + uffdio_copy.copy = 0; + + 
lp_debug(lpi, "uffd_copy: 0x%llx/%ld\n", uffdio_copy.dst, len); + if (ioctl(lpi->lpfd.fd, UFFDIO_COPY, &uffdio_copy) && + uffd_check_op_error(lpi, "copy", nr_pages, uffdio_copy.copy)) + return -1; + + lpi->copied_pages += *nr_pages; + + return 0; +} + +static int uffd_io_complete(struct page_read *pr, unsigned long img_addr, int nr) +{ + struct lazy_pages_info *lpi; + unsigned long addr = 0; + int req_pages, ret; + struct lazy_iov *req; + + lpi = container_of(pr, struct lazy_pages_info, pr); + + /* + * The process may exit while we still have requests in + * flight. We just drop the request and the received data in + * this case to avoid making uffd unhappy + */ + if (lpi->exited) + return 0; + + list_for_each_entry(req, &lpi->reqs, l) { + if (req->img_start == img_addr) { + addr = req->start; + break; + } + } + + /* the request may be already gone because if unmap/remove */ + if (!addr) + return 0; + + /* + * By the time we get the pages from the remote source, parts + * of the request may already be gone because of unmap/remove + * OTOH, the remote side may send less pages than we requested. + * Make sure we are not trying to uffd_copy more memory than + * we should. + */ + req_pages = (req->end - req->start) / PAGE_SIZE; + nr = min(nr, req_pages); + + ret = uffd_copy(lpi, addr, &nr); + if (ret < 0) + return ret; + + /* recheck if the process exited, it may be detected in uffd_copy */ + if (lpi->exited) + return 0; + + /* + * Since the completed request length may differ from the + * actual data we've received we re-insert the request to IOVs + * list and let drop_iovs do the range math, free memory etc. 
+ */ + iov_list_insert(req, &lpi->iovs); + return drop_iovs(lpi, addr, nr * PAGE_SIZE); +} + +static int uffd_zero(struct lazy_pages_info *lpi, __u64 address, int nr_pages) +{ + struct uffdio_zeropage uffdio_zeropage; + unsigned long len = page_size() * nr_pages; + + uffdio_zeropage.range.start = address; + uffdio_zeropage.range.len = len; + uffdio_zeropage.mode = 0; + + lp_debug(lpi, "zero page at 0x%llx\n", address); + if (ioctl(lpi->lpfd.fd, UFFDIO_ZEROPAGE, &uffdio_zeropage) && + uffd_check_op_error(lpi, "zero", &nr_pages, + uffdio_zeropage.zeropage)) + return -1; + + return 0; +} + +/* + * Seek for the requested address in the pagemap. If it is found, the + * subsequent call to pr->page_read will bring us the data. If the + * address is not found in the pagemap, but no error occurred, the + * address should be mapped to zero pfn. + * + * Returns 0 for zero pages, 1 for "real" pages and negative value on + * error + */ +static int uffd_seek_pages(struct lazy_pages_info *lpi, __u64 address, int nr) +{ + int ret; + + lpi->pr.reset(&lpi->pr); + + ret = lpi->pr.seek_pagemap(&lpi->pr, address); + if (!ret) { + lp_err(lpi, "no pagemap covers %llx\n", address); + return -1; + } + + return 0; +} + +static int uffd_handle_pages(struct lazy_pages_info *lpi, __u64 address, int nr, unsigned flags) +{ + int ret; + + ret = uffd_seek_pages(lpi, address, nr); + if (ret) + return ret; + + ret = lpi->pr.read_pages(&lpi->pr, address, nr, lpi->buf, flags); + if (ret <= 0) { + lp_err(lpi, "failed reading pages at %llx\n", address); + return ret; + } + + return 0; +} + +static struct lazy_iov *pick_next_range(struct lazy_pages_info *lpi) +{ + return list_first_entry(&lpi->iovs, struct lazy_iov, l); +} + +/* + * This is very simple heurstics for background transfer control. + * The idea is to transfer larger chunks when there is no page faults + * and drop the background transfer size each time #PF occurs to some + * default value. 
The default is empirically set to 64Kbytes + */ +static void update_xfer_len(struct lazy_pages_info *lpi, bool pf) +{ + if (pf) + lpi->xfer_len = DEFAULT_XFER_LEN; + else + lpi->xfer_len += DEFAULT_XFER_LEN; + + if (lpi->xfer_len > MAX_XFER_LEN) + lpi->xfer_len = MAX_XFER_LEN; +} + +static int xfer_pages(struct lazy_pages_info *lpi) +{ + struct lazy_iov *iov; + unsigned int nr_pages; + unsigned long len; + int err; + + iov = pick_next_range(lpi); + if (!iov) + return 0; + + len = min(iov->end - iov->start, lpi->xfer_len); + + iov = extract_range(iov, iov->start, iov->start + len); + if (!iov) + return -1; + list_move(&iov->l, &lpi->reqs); + + nr_pages = (iov->end - iov->start) / PAGE_SIZE; + + update_xfer_len(lpi, false); + + err = uffd_handle_pages(lpi, iov->img_start, nr_pages, PR_ASYNC | PR_ASAP); + if (err < 0) { + lp_err(lpi, "Error during UFFD copy\n"); + return -1; + } + + return 0; +} + +static int handle_remove(struct lazy_pages_info *lpi, struct uffd_msg *msg) +{ + struct uffdio_range unreg; + + unreg.start = msg->arg.remove.start; + unreg.len = msg->arg.remove.end - msg->arg.remove.start; + + lp_debug(lpi, "%s: %llx(%llx)\n", + msg->event == UFFD_EVENT_REMOVE ? "REMOVE" : "UNMAP", + unreg.start, unreg.len); + + /* + * The REMOVE event does not change the VMA, so we need to + * make sure that we won't handle #PFs in the removed + * range. 
With UNMAP, there's no VMA to worry about + */ + if (msg->event == UFFD_EVENT_REMOVE && + ioctl(lpi->lpfd.fd, UFFDIO_UNREGISTER, &unreg)) { + /* + * The kernel returns -ENOMEM when unregister is + * called after the process has gone + */ + if (errno == ENOMEM) { + handle_exit(lpi); + return 0; + } + + pr_perror("Failed to unregister (%llx - %llx)", unreg.start, + unreg.start + unreg.len); + return -1; + } + + return drop_iovs(lpi, unreg.start, unreg.len); +} + +static int handle_remap(struct lazy_pages_info *lpi, struct uffd_msg *msg) +{ + unsigned long from = msg->arg.remap.from; + unsigned long to = msg->arg.remap.to; + unsigned long len = msg->arg.remap.len; + + lp_debug(lpi, "REMAP: %lx -> %lx (%ld)\n", from , to, len); + + return remap_iovs(lpi, from, to, len); +} + +static int handle_fork(struct lazy_pages_info *parent_lpi, struct uffd_msg *msg) +{ + struct lazy_pages_info *lpi; + int uffd = msg->arg.fork.ufd; + + lp_debug(parent_lpi, "FORK: child with ufd=%d\n", uffd); + + lpi = lpi_init(); + if (!lpi) + return -1; + + if (copy_iovs(parent_lpi, lpi)) + goto out; + + lpi->pid = parent_lpi->pid; + lpi->lpfd.fd = uffd; + lpi->parent = parent_lpi->parent ? parent_lpi->parent : parent_lpi; + lpi->copied_pages = lpi->parent->copied_pages; + lpi->total_pages = lpi->parent->total_pages; + list_add_tail(&lpi->l, &pending_lpis); + + dup_page_read(&lpi->parent->pr, &lpi->pr); + + lpi_get(lpi->parent); + + return 1; + +out: + lpi_fini(lpi); + return -1; +} + +/* + * We may exit epoll_run_rfds() loop because of non-fork() event. 
In + * such case we return 1 rather than 0 to let the caller know that no + * fork() events were pending + */ +static int complete_forks(int epollfd, struct epoll_event **events, int *nr_fds) +{ + struct lazy_pages_info *lpi, *n; + + if (list_empty(&pending_lpis)) + return 1; + + list_for_each_entry(lpi, &pending_lpis, l) + (*nr_fds)++; + + *events = xrealloc(*events, sizeof(struct epoll_event) * (*nr_fds)); + if (!*events) + return -1; + + list_for_each_entry_safe(lpi, n, &pending_lpis, l) { + if (epoll_add_rfd(epollfd, &lpi->lpfd)) + return -1; + + list_del_init(&lpi->l); + list_add_tail(&lpi->l, &lpis); + } + + return 0; +} + +static bool is_page_queued(struct lazy_pages_info *lpi, unsigned long addr) +{ + struct lazy_iov *req; + + list_for_each_entry(req, &lpi->reqs, l) + if (addr >= req->start && addr < req->end) + return true; + + return false; +} + +static int handle_page_fault(struct lazy_pages_info *lpi, struct uffd_msg *msg) +{ + struct lazy_iov *iov; + __u64 address; + int ret; + + /* Align requested address to the next page boundary */ + address = msg->arg.pagefault.address & ~(page_size() - 1); + lp_debug(lpi, "#PF at 0x%llx\n", address); + + if (is_page_queued(lpi, address)) + return 0; + + iov = find_iov(lpi, address); + if (!iov) + return uffd_zero(lpi, address, 1); + + iov = extract_range(iov, address, address + PAGE_SIZE); + if (!iov) + return -1; + + list_move(&iov->l, &lpi->reqs); + + update_xfer_len(lpi, true); + + ret = uffd_handle_pages(lpi, iov->img_start, 1, PR_ASYNC | PR_ASAP); + if (ret < 0) { + lp_err(lpi, "Error during regular page copy\n"); + return -1; + } + + return 0; +} + +static int handle_uffd_event(struct epoll_rfd *lpfd) +{ + struct lazy_pages_info *lpi; + struct uffd_msg msg; + int ret; + + lpi = container_of(lpfd, struct lazy_pages_info, lpfd); + + ret = read(lpfd->fd, &msg, sizeof(msg)); + if (!ret) + return 1; + + if (ret != sizeof(msg)) { + /* we've already handled the page fault for another thread */ + if (errno == 
EAGAIN) + return 0; + if (ret < 0) + lp_perror(lpi, "Can't read uffd message"); + else + lp_err(lpi, "Can't read uffd message: short read"); + return -1; + } + + switch (msg.event) { + case UFFD_EVENT_PAGEFAULT: + return handle_page_fault(lpi, &msg); + case UFFD_EVENT_REMOVE: + case UFFD_EVENT_UNMAP: + return handle_remove(lpi, &msg); + case UFFD_EVENT_REMAP: + return handle_remap(lpi, &msg); + case UFFD_EVENT_FORK: + return handle_fork(lpi, &msg); + default: + lp_err(lpi, "unexpected uffd event %u\n", msg.event); + return -1; + } + + return 0; +} + +static void lazy_pages_summary(struct lazy_pages_info *lpi) +{ + lp_debug(lpi, "UFFD transferred pages: (%ld/%ld)\n", + lpi->copied_pages, lpi->total_pages); + +#if 0 + if ((lpi->copied_pages != lpi->total_pages) && (lpi->total_pages > 0)) { + lp_warn(lpi, "Only %ld of %ld pages transferred via UFFD\n" + "Something probably went wrong.\n", + lpi->copied_pages, lpi->total_pages); + return 1; + } +#endif +} + +static int handle_requests(int epollfd, struct epoll_event *events, int nr_fds) +{ + struct lazy_pages_info *lpi, *n; + int poll_timeout = -1; + int ret; + + for (;;) { + ret = epoll_run_rfds(epollfd, events, nr_fds, poll_timeout); + if (ret < 0) + goto out; + if (ret > 0) { + ret = complete_forks(epollfd, &events, &nr_fds); + if (ret < 0) + goto out; + if (restore_finished) + poll_timeout = 0; + if (!restore_finished || !ret) + continue; + } + + /* make sure we return success if there is nothing to xfer */ + ret = 0; + + list_for_each_entry_safe(lpi, n, &lpis, l) { + if (!list_empty(&lpi->iovs) && list_empty(&lpi->reqs)) { + ret = xfer_pages(lpi); + if (ret < 0) + goto out; + break; + } + + if (list_empty(&lpi->reqs)) { + lazy_pages_summary(lpi); + list_del(&lpi->l); + lpi_put(lpi); + } + } + + if (list_empty(&lpis)) + break; + } + +out: + return ret; + +} + +int lazy_pages_finish_restore(void) +{ + uint32_t fin = LAZY_PAGES_RESTORE_FINISHED; + int fd, ret; + + if (!opts.lazy_pages) + return 0; + + fd = 
fdstore_get(lazy_pages_sk_id); + if (fd < 0) { + pr_err("No lazy-pages socket\n"); + return -1; + } + + ret = send(fd, &fin, sizeof(fin), 0); + if (ret != sizeof(fin)) + pr_perror("Failed sending restore finished indication"); + + close(fd); + + return ret < 0 ? ret : 0; +} + +static int prepare_lazy_socket(void) +{ + int listen; + struct sockaddr_un saddr; + + if (prepare_sock_addr(&saddr)) + return -1; + + pr_debug("Waiting for incoming connections on %s\n", saddr.sun_path); + if ((listen = server_listen(&saddr)) < 0) { + pr_perror("server_listen error"); + return -1; + } + + return listen; +} + +static int lazy_sk_read_event(struct epoll_rfd *rfd) +{ + uint32_t fin; + int ret; + + ret = recv(rfd->fd, &fin, sizeof(fin), 0); + /* + * epoll sets POLLIN | POLLHUP for the EOF case, so we get short + * read just before hangup_event + */ + if (!ret) + return 0; + + if (ret != sizeof(fin)) { + pr_perror("Failed getting restore finished indication"); + return -1; + } + + if (fin != LAZY_PAGES_RESTORE_FINISHED) { + pr_err("Unexpected response: %x\n", fin); + return -1; + } + + restore_finished = true; + + return 1; +} + +static int lazy_sk_hangup_event(struct epoll_rfd *rfd) +{ + if (!restore_finished) { + pr_err("Restorer unexpectedly closed the connection\n"); + return -1; + } + + return 0; +} + +static int prepare_uffds(int listen, int epollfd) +{ + int i; + int client; + socklen_t len; + struct sockaddr_un saddr; + + /* accept new client request */ + len = sizeof(struct sockaddr_un); + if ((client = accept(listen, (struct sockaddr *) &saddr, &len)) < 0) { + pr_perror("server_accept error"); + close(listen); + return -1; + } + + for (i = 0; i < task_entries->nr_tasks; i++) { + struct lazy_pages_info *lpi = NULL; + if (ud_open(client, &lpi)) + goto close_uffd; + if (lpi == NULL) + continue; + if (epoll_add_rfd(epollfd, &lpi->lpfd)) + goto close_uffd; + } + + lazy_sk_rfd.fd = client; + lazy_sk_rfd.read_event = lazy_sk_read_event; + lazy_sk_rfd.hangup_event = 
lazy_sk_hangup_event; + if (epoll_add_rfd(epollfd, &lazy_sk_rfd)) + goto close_uffd; + + close(listen); + return 0; + +close_uffd: + close_safe(&client); + close(listen); + return -1; +} + +int cr_lazy_pages(bool daemon) +{ + struct epoll_event *events; + int nr_fds; + int lazy_sk; + int ret; + + if (kerndat_uffd() || !kdat.has_uffd) + return -1; + + if (prepare_dummy_pstree()) + return -1; + + lazy_sk = prepare_lazy_socket(); + if (lazy_sk < 0) + return -1; + + if (daemon) { + ret = cr_daemon(1, 0, -1); + if (ret == -1) { + pr_err("Can't run in the background\n"); + return -1; + } + if (ret > 0) { /* parent task, daemon started */ + if (opts.pidfile) { + if (write_pidfile(ret) == -1) { + pr_perror("Can't write pidfile"); + kill(ret, SIGKILL); + waitpid(ret, NULL, 0); + return -1; + } + } + + return 0; + } + } + + if (close_status_fd()) + return -1; + + /* + * we poll nr_tasks userfault fds, UNIX socket between lazy-pages + * daemon and the cr-restore, and, optionally TCP socket for + * remote pages + */ + nr_fds = task_entries->nr_tasks + (opts.use_page_server ? 
2 : 1); + epollfd = epoll_prepare(nr_fds, &events); + if (epollfd < 0) + return -1; + + if (prepare_uffds(lazy_sk, epollfd)) + return -1; + + if (opts.use_page_server) { + if (connect_to_page_server_to_recv(epollfd)) + return -1; + } + + ret = handle_requests(epollfd, events, nr_fds); + + tls_terminate_session(); + + return ret; +} diff --git a/CRIU_code/criu/util.c b/CRIU_code/criu/util.c new file mode 100644 index 0000000..31cdee1 --- /dev/null +++ b/CRIU_code/criu/util.c @@ -0,0 +1,1361 @@ +#define _XOPEN_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kerndat.h" +#include "page.h" +#include "util.h" +#include "image.h" +#include "vma.h" +#include "mem.h" +#include "namespaces.h" +#include "criu-log.h" + +#include "clone-noasan.h" +#include "cr_options.h" +#include "cr-service.h" +#include "files.h" +#include "pstree.h" + +#include "cr-errno.h" + +#define VMA_OPT_LEN 128 + +static int xatol_base(const char *string, long *number, int base) +{ + char *endptr; + long nr; + + errno = 0; + nr = strtol(string, &endptr, base); + if ((errno == ERANGE && (nr == LONG_MAX || nr == LONG_MIN)) + || (errno != 0 && nr == 0)) { + pr_perror("failed to convert string '%s'", string); + return -EINVAL; + } + + if ((endptr == string) || (*endptr != '\0')) { + pr_err("String is not a number: '%s'\n", string); + return -EINVAL; + } + *number = nr; + return 0; +} + +int xatol(const char *string, long *number) +{ + return xatol_base(string, number, 10); +} + + +int xatoi(const char *string, int *number) +{ + long tmp; + int err; + + err = xatol(string, &tmp); + if (err) + return err; + + if (tmp > INT_MAX || tmp < INT_MIN) { + pr_err("value %#lx (%ld) is out of int range\n", tmp, tmp); + return -ERANGE; + } + + *number = (int)tmp; + return 0; +} + +/* + * 
This function reallocates passed str pointer. + * It means: + * 1) passed pointer can be either NULL, or previously allocated by malloc. + * 2) Passed pointer can' be reused. It's either freed in case of error or can + * be changed. + */ +static char *xvstrcat(char *str, const char *fmt, va_list args) +{ + size_t offset = 0, delta; + int ret; + char *new; + va_list tmp; + + if (str) + offset = strlen(str); + delta = strlen(fmt) * 2; + + do { + new = xrealloc(str, offset + delta); + if (!new) { + /* realloc failed. We must release former string */ + xfree(str); + pr_err("Failed to allocate string\n"); + return new; + } + + va_copy(tmp, args); + ret = vsnprintf(new + offset, delta, fmt, tmp); + va_end(tmp); + if (ret < delta) /* an error, or all was written */ + break; + + /* NOTE: vsnprintf returns the amount of bytes + * to allocate. */ + delta = ret + 1; + str = new; + } while (1); + + if (ret < 0) { + /* vsnprintf failed */ + pr_err("Failed to print string\n"); + xfree(new); + new = NULL; + } + return new; +} + +char *xstrcat(char *str, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + str = xvstrcat(str, fmt, args); + va_end(args); + + return str; +} + +char *xsprintf(const char *fmt, ...) 
+{ + va_list args; + char *str; + + va_start(args, fmt); + str = xvstrcat(NULL, fmt, args); + va_end(args); + + return str; +} + +static void vma_opt_str(const struct vma_area *v, char *opt) +{ + int p = 0; + +#define opt2s(_o, _s) do { \ + if (v->e->status & _o) \ + p += sprintf(opt + p, _s " "); \ + } while (0) + + opt[p] = '\0'; + opt2s(VMA_AREA_REGULAR, "reg"); + opt2s(VMA_AREA_STACK, "stk"); + opt2s(VMA_AREA_VSYSCALL, "vsys"); + opt2s(VMA_AREA_VDSO, "vdso"); + opt2s(VMA_AREA_VVAR, "vvar"); + opt2s(VMA_AREA_HEAP, "heap"); + opt2s(VMA_FILE_PRIVATE, "fp"); + opt2s(VMA_FILE_SHARED, "fs"); + opt2s(VMA_ANON_SHARED, "as"); + opt2s(VMA_ANON_PRIVATE, "ap"); + opt2s(VMA_AREA_SYSVIPC, "sysv"); + opt2s(VMA_AREA_SOCKET, "sk"); + +#undef opt2s +} + +void pr_vma(unsigned int loglevel, const struct vma_area *vma_area) +{ + char opt[VMA_OPT_LEN]; + memset(opt, 0, VMA_OPT_LEN); + + if (!vma_area) + return; + + vma_opt_str(vma_area, opt); + print_on_level(loglevel, "%#"PRIx64"-%#"PRIx64" (%"PRIi64"K) prot %#x flags %#x fdflags %#o st %#x off %#"PRIx64" " + "%s shmid: %#"PRIx64"\n", + vma_area->e->start, vma_area->e->end, + KBYTES(vma_area_len(vma_area)), + vma_area->e->prot, + vma_area->e->flags, + vma_area->e->fdflags, + vma_area->e->status, + vma_area->e->pgoff, + opt, vma_area->e->shmid); +} + +int close_safe(int *fd) +{ + int ret = 0; + + if (*fd > -1) { + ret = close(*fd); + if (!ret) + *fd = -1; + else + pr_perror("Unable to close fd %d", *fd); + } + + return ret; +} + +int reopen_fd_as_safe(char *file, int line, int new_fd, int old_fd, bool allow_reuse_fd) +{ + int tmp; + + if (old_fd != new_fd) { + if (!allow_reuse_fd) + tmp = fcntl(old_fd, F_DUPFD, new_fd); + else + tmp = dup2(old_fd, new_fd); + if (tmp < 0) { + pr_perror("Dup %d -> %d failed (called at %s:%d)", + old_fd, new_fd, file, line); + return tmp; + } else if (tmp != new_fd) { + close(tmp); + pr_err("fd %d already in use (called at %s:%d)\n", + new_fd, file, line); + return -1; + } + + /* Just to have error 
message if failed */ + close_safe(&old_fd); + } + + return 0; +} + +int move_fd_from(int *img_fd, int want_fd) +{ + if (*img_fd == want_fd) { + int tmp; + + tmp = dup(*img_fd); + if (tmp < 0) { + pr_perror("Can't dup file"); + return -1; + } + + close(*img_fd); + + *img_fd = tmp; + } + + return 0; +} + +/* + * Cached opened /proc/$pid and /proc/self files. + * Used for faster access to /proc/.../foo files + * by using openat()-s + */ + +static pid_t open_proc_pid = PROC_NONE; +static pid_t open_proc_self_pid; +static int open_proc_self_fd = -1; + +void set_proc_self_fd(int fd) +{ + if (open_proc_self_fd >= 0) + close(open_proc_self_fd); + + open_proc_self_fd = fd; + open_proc_self_pid = getpid(); +} + +static inline int set_proc_pid_fd(int pid, int fd) +{ + int ret; + + if (fd < 0) + return close_service_fd(PROC_PID_FD_OFF); + + open_proc_pid = pid; + ret = install_service_fd(PROC_PID_FD_OFF, fd); + + return ret; +} + +static inline int get_proc_fd(int pid) +{ + if (pid == PROC_SELF) { + if (open_proc_self_fd != -1 && open_proc_self_pid != getpid()) { + close(open_proc_self_fd); + open_proc_self_fd = -1; + } + return open_proc_self_fd; + } else if (pid == open_proc_pid) + return get_service_fd(PROC_PID_FD_OFF); + else + return -1; +} + +int close_pid_proc(void) +{ + set_proc_self_fd(-1); + set_proc_pid_fd(PROC_NONE, -1); + return 0; +} + +void close_proc() +{ + close_pid_proc(); + close_service_fd(PROC_FD_OFF); +} + +int set_proc_fd(int fd) +{ + if (install_service_fd(PROC_FD_OFF, dup(fd)) < 0) + return -1; + return 0; +} + +static int open_proc_sfd(char *path) +{ + int fd, ret; + + close_proc(); + fd = open(path, O_DIRECTORY | O_PATH); + if (fd == -1) { + pr_perror("Can't open %s", path); + return -1; + } + + ret = install_service_fd(PROC_FD_OFF, fd); + if (ret < 0) + return -1; + + return 0; +} + +inline int open_pid_proc(pid_t pid) +{ + char path[18]; + int fd; + int dfd; + + fd = get_proc_fd(pid); + if (fd >= 0) + return fd; + + dfd = 
get_service_fd(PROC_FD_OFF); + if (dfd < 0) { + if (open_proc_sfd("/proc") < 0) + return -1; + + dfd = get_service_fd(PROC_FD_OFF); + } + + if (pid == PROC_GEN) + /* + * Don't cache it, close_pid_proc() would + * close service descriptor otherwise. + */ + return dfd; + + if (pid == PROC_SELF) + snprintf(path, sizeof(path), "self"); + else + snprintf(path, sizeof(path), "%d", pid); + + fd = openat(dfd, path, O_PATH); + if (fd < 0) { + pr_perror("Can't open %s", path); + set_cr_errno(ESRCH); + return -1; + } + + if (pid == PROC_SELF) + set_proc_self_fd(fd); + else + fd = set_proc_pid_fd(pid, fd); + + return fd; +} + +int do_open_proc(pid_t pid, int flags, const char *fmt, ...) +{ + char path[128]; + va_list args; + int dirfd; + + dirfd = open_pid_proc(pid); + if (dirfd < 0) + return -1; + + va_start(args, fmt); + vsnprintf(path, sizeof(path), fmt, args); + va_end(args); + + return openat(dirfd, path, flags); +} + +int copy_file(int fd_in, int fd_out, size_t bytes) +{ + ssize_t written = 0; + size_t chunk = bytes ? 
bytes : 4096; + char *buffer; + ssize_t ret; + + buffer = xmalloc(chunk); + if (buffer == NULL) { + pr_perror("failed to allocate buffer to copy file"); + return -1; + } + + while (1) { + if (opts.remote) { + ret = read(fd_in, buffer, chunk); + if (ret < 0) { + pr_perror("Can't read from fd_in\n"); + ret = -1; + goto err; + } + if (write(fd_out, buffer, ret) != ret) { + pr_perror("Couldn't write all read bytes\n"); + ret = -1; + goto err; + } + } else + ret = sendfile(fd_out, fd_in, NULL, chunk); + + if (ret < 0) { + pr_perror("Can't send data to ghost file"); + ret = -1; + goto err; + } + + if (ret == 0) { + if (bytes && (written != bytes)) { + pr_err("Ghost file size mismatch %zu/%zu\n", + written, bytes); + ret = -1; + goto err; + } + break; + } + + written += ret; + } +err: + xfree(buffer); + return ret; +} + +int read_fd_link(int lfd, char *buf, size_t size) +{ + char t[32]; + ssize_t ret; + + snprintf(t, sizeof(t), "/proc/self/fd/%d", lfd); + ret = readlink(t, buf, size); + if (ret < 0) { + pr_perror("Can't read link of fd %d", lfd); + return -1; + } else if ((size_t)ret >= size) { + pr_err("Buffer for read link of fd %d is too small\n", lfd); + return -1; + } + buf[ret] = 0; + + return ret; +} + +int is_anon_link_type(char *link, char *type) +{ + char aux[32]; + + snprintf(aux, sizeof(aux), "anon_inode:%s", type); + return !strcmp(link, aux); +} + +#define DUP_SAFE(fd, out) \ + ({ \ + int ret__; \ + ret__ = dup(fd); \ + if (ret__ == -1) { \ + pr_perror("dup(%d) failed", fd); \ + goto out; \ + } \ + ret__; \ + }) + +/* + * If "in" is negative, stdin will be closed. + * If "out" or "err" are negative, a log file descriptor will be used. 
+ */ +int cr_system(int in, int out, int err, char *cmd, char *const argv[], unsigned flags) +{ + return cr_system_userns(in, out, err, cmd, argv, flags, -1); +} + +static int close_fds(int minfd) +{ + DIR *dir; + struct dirent *de; + int fd, ret, dfd; + + dir = opendir("/proc/self/fd"); + if (dir == NULL) { + pr_perror("Can't open /proc/self/fd"); + return -1; + } + dfd = dirfd(dir); + + while ((de = readdir(dir))) { + if (dir_dots(de)) + continue; + + ret = sscanf(de->d_name, "%d", &fd); + if (ret != 1) { + pr_err("Can't parse %s\n", de->d_name); + return -1; + } + if (dfd == fd) + continue; + if (fd < minfd) + continue; + close(fd); + } + closedir(dir); + + return 0; +} + +int cr_system_userns(int in, int out, int err, char *cmd, + char *const argv[], unsigned flags, int userns_pid) +{ + sigset_t blockmask, oldmask; + int ret = -1, status; + pid_t pid; + + sigemptyset(&blockmask); + sigaddset(&blockmask, SIGCHLD); + if (sigprocmask(SIG_BLOCK, &blockmask, &oldmask) == -1) { + pr_perror("Can not set mask of blocked signals"); + return -1; + } + + pid = fork(); + if (pid == -1) { + pr_perror("fork() failed"); + goto out; + } else if (pid == 0) { + if (userns_pid > 0) { + if (switch_ns(userns_pid, &user_ns_desc, NULL)) + goto out_chld; + if (setuid(0) || setgid(0)) { + pr_perror("Unable to set uid or gid"); + goto out_chld; + } + } + + if (out < 0) + out = DUP_SAFE(log_get_fd(), out_chld); + if (err < 0) + err = DUP_SAFE(log_get_fd(), out_chld); + + /* + * out, err, in should be a separate fds, + * because reopen_fd_as() closes an old fd + */ + if (err == out || err == in) + err = DUP_SAFE(err, out_chld); + + if (out == in) + out = DUP_SAFE(out, out_chld); + + if (move_fd_from(&out, STDIN_FILENO) || + move_fd_from(&err, STDIN_FILENO)) + goto out_chld; + + if (in < 0) { + close(STDIN_FILENO); + } else { + if (reopen_fd_as_nocheck(STDIN_FILENO, in)) + goto out_chld; + } + + if (move_fd_from(&err, STDOUT_FILENO)) + goto out_chld; + + if 
(reopen_fd_as_nocheck(STDOUT_FILENO, out)) + goto out_chld; + + if (reopen_fd_as_nocheck(STDERR_FILENO, err)) + goto out_chld; + + close_fds(STDERR_FILENO + 1); + + execvp(cmd, argv); + + pr_perror("exec(%s, ...) failed", cmd); +out_chld: + _exit(1); + } + + while (1) { + ret = waitpid(pid, &status, 0); + if (ret == -1) { + pr_perror("waitpid() failed"); + goto out; + } + + if (WIFEXITED(status)) { + if (!(flags & CRS_CAN_FAIL) && WEXITSTATUS(status)) + pr_err("exited, status=%d\n", WEXITSTATUS(status)); + break; + } else if (WIFSIGNALED(status)) { + pr_err("killed by signal %d: %s\n", WTERMSIG(status), + strsignal(WTERMSIG(status))); + break; + } else if (WIFSTOPPED(status)) { + pr_err("stopped by signal %d\n", WSTOPSIG(status)); + } else if (WIFCONTINUED(status)) { + pr_err("continued\n"); + } + } + + ret = status ? -1 : 0; +out: + if (sigprocmask(SIG_SETMASK, &oldmask, NULL) == -1) { + pr_perror("Can not unset mask of blocked signals"); + BUG(); + } + + return ret; +} + +int close_status_fd(void) +{ + char c = 0; + + if (opts.status_fd < 0) + return 0; + + if (write(opts.status_fd, &c, 1) != 1) { + pr_perror("Unable to write into the status fd"); + return -1; + } + + return close_safe(&opts.status_fd); +} + +int cr_daemon(int nochdir, int noclose, int close_fd) +{ + int pid; + + pid = fork(); + if (pid < 0) { + pr_perror("Can't fork"); + return -1; + } + + if (pid > 0) + return pid; + + setsid(); + if (!nochdir) + if (chdir("/") == -1) + pr_perror("Can't change directory"); + if (!noclose) { + int fd; + + if (close_fd != -1) + close(close_fd); + + fd = open("/dev/null", O_RDWR); + if (fd < 0) { + pr_perror("Can't open /dev/null"); + return -1; + } + dup2(fd, 0); + dup2(fd, 1); + dup2(fd, 2); + close(fd); + } + + return 0; +} + +int is_root_user() +{ + if (geteuid() != 0) { + pr_err("You need to be root to run this command\n"); + return 0; + } + + return 1; +} + +int is_empty_dir(int dirfd) +{ + int ret = 0; + DIR *fdir = NULL; + struct dirent *de; + + fdir = 
fdopendir(dirfd); + if (!fdir) + return -1; + + while ((de = readdir(fdir))) { + if (dir_dots(de)) + continue; + + goto out; + } + + ret = 1; +out: + closedir(fdir); + return ret; +} + +/* + * Get PFN from pagemap file for virtual address vaddr. + * Optionally if fd >= 0, it's used as pagemap file descriptor + * (may be other task's pagemap) + */ +int vaddr_to_pfn(int fd, unsigned long vaddr, u64 *pfn) +{ + int ret = -1; + off_t off; + bool close_fd = false; + + if (fd < 0) { + fd = open_proc(PROC_SELF, "pagemap"); + if (fd < 0) + return -1; + close_fd = true; + } + + off = (vaddr / page_size()) * sizeof(u64); + ret = pread(fd, pfn, sizeof(*pfn), off); + if (ret != sizeof(*pfn)) { + pr_perror("Can't read pme for pid %d", getpid()); + ret = -1; + } else { + *pfn &= PME_PFRAME_MASK; + ret = 0; + } + + if (close_fd) + close(fd); + + return ret; +} + +/* + * Note since VMA_AREA_NONE = 0 we can skip assignment + * here and simply rely on xzalloc + */ +struct vma_area *alloc_vma_area(void) +{ + struct vma_area *p; + + p = xzalloc(sizeof(*p) + sizeof(VmaEntry)); + if (p) { + p->e = (VmaEntry *)(p + 1); + vma_entry__init(p->e); + p->e->fd = -1; + } + + return p; +} + +int mkdirpat(int fd, const char *path, int mode) +{ + size_t i; + char made_path[PATH_MAX], *pos; + + if (strlen(path) >= PATH_MAX) { + pr_err("path %s is longer than PATH_MAX\n", path); + return -ENOSPC; + } + + strcpy(made_path, path); + + i = 0; + if (made_path[0] == '/') + i++; + + for (; i < strlen(made_path); i++) { + pos = strchr(made_path + i, '/'); + if (pos) + *pos = '\0'; + if (mkdirat(fd, made_path, mode) < 0 && errno != EEXIST) { + int ret = -errno; + pr_perror("couldn't mkdirpat directory %s", made_path); + return ret; + } + if (pos) { + *pos = '/'; + i = pos - made_path; + } else + break; + } + + return 0; +} + +bool is_path_prefix(const char *path, const char *prefix) +{ + if (strstartswith(path, prefix)) { + size_t len = strlen(prefix); + switch (path[len]) { + case '\0': + case '/': + return 
true; + } + } + + return false; +} + +FILE *fopenat(int dirfd, char *path, char *cflags) +{ + int tmp, flags = 0; + char *iter; + + for (iter = cflags; *iter; iter++) { + switch (*iter) { + case 'r': + flags |= O_RDONLY; + break; + case 'a': + flags |= O_APPEND; + break; + case 'w': + flags |= O_WRONLY | O_CREAT; + break; + case '+': + flags = O_RDWR | O_CREAT; + break; + } + } + + tmp = openat(dirfd, path, flags, S_IRUSR | S_IWUSR); + if (tmp < 0) + return NULL; + + return fdopen(tmp, cflags); +} + +void split(char *str, char token, char ***out, int *n) +{ + int i; + char *cur; + + *n = 0; + for (cur = str; cur != NULL; cur = strchr(cur, token)) { + (*n)++; + cur++; + } + + + *out = xmalloc((*n) * sizeof(char *)); + if (!*out) { + *n = -1; + return; + + } + + cur = str; + i = 0; + do { + char *prev = cur; + cur = strchr(cur, token); + + if (cur) + *cur = '\0'; + (*out)[i] = xstrdup(prev); + if (cur) { + *cur = token; + cur++; + } + + if (!(*out)[i]) { + int j; + for (j = 0; j < i; j++) + xfree((*out)[j]); + xfree(*out); + *out = NULL; + *n = -1; + return; + } + + i++; + } while(cur); +} + +int fd_has_data(int lfd) +{ + struct pollfd pfd = {lfd, POLLIN, 0}; + int ret; + + ret = poll(&pfd, 1, 0); + if (ret < 0) { + pr_perror("poll() failed"); + } + + return ret; +} + +void fd_set_nonblocking(int fd, bool on) +{ + int flags = fcntl(fd, F_GETFL, NULL); + + if (flags < 0) { + pr_perror("Failed to obtain flags from fd %d", fd); + return; + } + + if (on) + flags |= O_NONBLOCK; + else + flags &= (~O_NONBLOCK); + + if (fcntl(fd, F_SETFL, flags) < 0) + pr_perror("Failed to set flags for fd %d", fd); +} + +int make_yard(char *path) +{ + if (mount("none", path, "tmpfs", 0, NULL)) { + pr_perror("Unable to mount tmpfs in %s", path); + return -1; + } + + if (mount("none", path, NULL, MS_PRIVATE, NULL)) { + pr_perror("Unable to mark yard as private"); + return -1; + } + + return 0; +} + +const char *ns_to_string(unsigned int ns) +{ + switch (ns) { + case CLONE_NEWIPC: + return 
"ipc"; + case CLONE_NEWNS: + return "mnt"; + case CLONE_NEWNET: + return "net"; + case CLONE_NEWPID: + return "pid"; + case CLONE_NEWUSER: + return "user"; + case CLONE_NEWUTS: + return "uts"; + default: + return NULL; + } +} + +void tcp_cork(int sk, bool on) +{ + int val = on ? 1 : 0; + if (setsockopt(sk, SOL_TCP, TCP_CORK, &val, sizeof(val))) + pr_perror("Unable to restore TCP_CORK (%d)", val); +} + +void tcp_nodelay(int sk, bool on) +{ + int val = on ? 1 : 0; + if (setsockopt(sk, SOL_TCP, TCP_NODELAY, &val, sizeof(val))) + pr_perror("Unable to restore TCP_NODELAY (%d)", val); +} + +static int get_sockaddr_in(struct sockaddr_storage *addr, char *host) +{ + memset(addr, 0, sizeof(*addr)); + + if (!host) { + ((struct sockaddr_in *)addr)->sin_addr.s_addr = INADDR_ANY; + addr->ss_family = AF_INET; + } else if (inet_pton(AF_INET, host, &((struct sockaddr_in *)addr)->sin_addr)) { + addr->ss_family = AF_INET; + } else if (inet_pton(AF_INET6, host, &((struct sockaddr_in6 *)addr)->sin6_addr)) { + addr->ss_family = AF_INET6; + } else { + pr_err("Invalid server address \"%s\". 
" + "The address must be in IPv4 or IPv6 format.\n", host); + return -1; + } + + if (addr->ss_family == AF_INET6) { + ((struct sockaddr_in6 *)addr)->sin6_port = htons(opts.port); + } else if (addr->ss_family == AF_INET) { + ((struct sockaddr_in *)addr)->sin_port = htons(opts.port); + } + + return 0; +} + +int setup_tcp_server(char *type) +{ + int sk = -1; + int sockopt = 1; + struct sockaddr_storage saddr; + socklen_t slen = sizeof(saddr); + + if (get_sockaddr_in(&saddr, opts.addr)) { + return -1; + } + + pr_info("Starting %s server on port %u\n", type, opts.port); + + sk = socket(saddr.ss_family, SOCK_STREAM, IPPROTO_TCP); + + if (sk < 0) { + pr_perror("Can't init %s server", type); + return -1; + } + + if (setsockopt( + sk, SOL_SOCKET, SO_REUSEADDR, &sockopt, sizeof(sockopt)) == -1) { + pr_perror("Unable to set SO_REUSEADDR"); + goto out; + } + + if (bind(sk, (struct sockaddr *)&saddr, slen)) { + pr_perror("Can't bind %s server", type); + goto out; + } + + if (listen(sk, 1)) { + pr_perror("Can't listen on %s server socket", type); + goto out; + } + + /* Get socket port in case of autobind */ + if (opts.port == 0) { + if (getsockname(sk, (struct sockaddr *)&saddr, &slen)) { + pr_perror("Can't get %s server name", type); + goto out; + } + + if (saddr.ss_family == AF_INET6) { + opts.port = ntohs(((struct sockaddr_in6 *)&saddr)->sin6_port); + } else if (saddr.ss_family == AF_INET) { + opts.port = ntohs(((struct sockaddr_in *)&saddr)->sin_port); + } + + pr_info("Using %u port\n", opts.port); + } + + return sk; +out: + close(sk); + return -1; +} + +int run_tcp_server(bool daemon_mode, int *ask, int cfd, int sk) +{ + int ret; + struct sockaddr_in caddr; + socklen_t clen = sizeof(caddr); + + if (daemon_mode) { + ret = cr_daemon(1, 0, cfd); + if (ret == -1) { + pr_err("Can't run in the background\n"); + goto out; + } + if (ret > 0) { /* parent task, daemon started */ + close_safe(&sk); + if (opts.pidfile) { + if (write_pidfile(ret) == -1) { + pr_perror("Can't write 
pidfile"); + kill(ret, SIGKILL); + waitpid(ret, NULL, 0); + return -1; + } + } + + return ret; + } + } + + if (close_status_fd()) + return -1; + + if (sk >= 0) { + ret = *ask = accept(sk, (struct sockaddr *)&caddr, &clen); + if (*ask < 0) + pr_perror("Can't accept connection to server"); + else + pr_info("Accepted connection from %s:%u\n", + inet_ntoa(caddr.sin_addr), + (int)ntohs(caddr.sin_port)); + close(sk); + } + + return 0; +out: + close(sk); + return -1; +} + +int setup_tcp_client(void) +{ + struct sockaddr_storage saddr; + struct addrinfo addr_criteria, *addr_list, *p; + char ipstr[INET6_ADDRSTRLEN]; + int sk = -1; + void *ip; + + memset(&addr_criteria, 0, sizeof(addr_criteria)); + addr_criteria.ai_family = AF_UNSPEC; + addr_criteria.ai_socktype = SOCK_STREAM; + addr_criteria.ai_protocol = IPPROTO_TCP; + + /* + * addr_list contains a list of addrinfo structures that corresponding + * to the criteria specified in opts.addr and addr_criteria. + */ + if (getaddrinfo(opts.addr, NULL, &addr_criteria, &addr_list)) { + pr_perror("Failed to resolve hostname: %s", opts.addr); + goto out; + } + + /* + * Iterate through addr_list and try to connect. The loop stops if the + * connection is successful or we reach the end of the list. 
+ */ + for(p = addr_list; p != NULL; p = p->ai_next) { + + if (p->ai_family == AF_INET) { + struct sockaddr_in *ipv4 = (struct sockaddr_in *)p->ai_addr; + ip = &(ipv4->sin_addr); + } else { + struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)p->ai_addr; + ip = &(ipv6->sin6_addr); + } + + inet_ntop(p->ai_family, ip, ipstr, sizeof(ipstr)); + pr_info("Connecting to server %s:%u\n", ipstr, opts.port); + + if (get_sockaddr_in(&saddr, ipstr)) + goto out; + + sk = socket(saddr.ss_family, SOCK_STREAM, IPPROTO_TCP); + if (sk < 0) { + pr_perror("Can't create socket"); + goto out; + } + + if (connect(sk, (struct sockaddr *)&saddr, sizeof(saddr)) < 0) { + pr_info("Can't connect to server %s:%u\n", ipstr, opts.port); + close(sk); + sk = -1; + } else { + /* Connected successfully */ + break; + } + } + +out: + freeaddrinfo(addr_list); + return sk; +} + +int epoll_add_rfd(int epfd, struct epoll_rfd *rfd) +{ + struct epoll_event ev; + + ev.events = EPOLLIN | EPOLLRDHUP; + ev.data.ptr = rfd; + if (epoll_ctl(epfd, EPOLL_CTL_ADD, rfd->fd, &ev) == -1) { + pr_perror("epoll_ctl failed"); + return -1; + } + + return 0; +} + +int epoll_del_rfd(int epfd, struct epoll_rfd *rfd) +{ + if (epoll_ctl(epfd, EPOLL_CTL_DEL, rfd->fd, NULL) == -1) { + pr_perror("epoll_ctl failed"); + return -1; + } + + return 0; +} + +static int epoll_hangup_event(int epollfd, struct epoll_rfd *rfd) +{ + int ret = 0; + + if (rfd->hangup_event) { + ret = rfd->hangup_event(rfd); + if (ret < 0) + return ret; + } + + if (epoll_del_rfd(epollfd, rfd)) + return -1; + + close_safe(&rfd->fd); + + return ret; +} + +int epoll_run_rfds(int epollfd, struct epoll_event *evs, int nr_fds, int timeout) +{ + int ret, i, nr_events; + bool have_a_break = false; + + while (1) { + ret = epoll_wait(epollfd, evs, nr_fds, timeout); + if (ret <= 0) { + if (ret < 0) + pr_perror("polling failed"); + break; + } + + nr_events = ret; + for (i = 0; i < nr_events; i++) { + struct epoll_rfd *rfd; + uint32_t events; + + rfd = (struct epoll_rfd 
*)evs[i].data.ptr; + events = evs[i].events; + + if (events & EPOLLIN) { + ret = rfd->read_event(rfd); + if (ret < 0) + goto out; + if (ret > 0) + have_a_break = true; + } + + if (events & (EPOLLHUP | EPOLLRDHUP)) { + ret = epoll_hangup_event(epollfd, rfd); + if (ret < 0) + goto out; + if (ret > 0) + have_a_break = true; + } + } + + if (have_a_break) + return 1; + } +out: + return ret; +} + +int epoll_prepare(int nr_fds, struct epoll_event **events) +{ + int epollfd; + + *events = xmalloc(sizeof(struct epoll_event) * nr_fds); + if (!*events) + return -1; + + epollfd = epoll_create(nr_fds); + if (epollfd == -1) { + pr_perror("epoll_create failed"); + goto free_events; + } + + return epollfd; + +free_events: + xfree(*events); + return -1; +} + +int call_in_child_process(int (*fn)(void *), void *arg) +{ + int status, ret = -1; + pid_t pid; + /* + * Parent freezes till child exit, so child may use the same stack. + * No SIGCHLD flag, so it's not need to block signal. + */ + pid = clone_noasan(fn, CLONE_VFORK | CLONE_VM | CLONE_FILES | + CLONE_IO | CLONE_SIGHAND | CLONE_SYSVSEM, arg); + if (pid == -1) { + pr_perror("Can't clone"); + return -1; + } + errno = 0; + if (waitpid(pid, &status, __WALL) != pid || !WIFEXITED(status) || WEXITSTATUS(status)) { + pr_err("Can't wait or bad status: errno=%d, status=%d\n", errno, status); + goto out; + } + ret = 0; + /* + * Child opened PROC_SELF for pid. If we create one more child + * with the same pid later, it will try to reuse this /proc/self. 
/*
 * Log the current call stack (at most 10 frames) through pr_err(), tagging
 * every line with @pid.
 */
void print_stack_trace(pid_t pid)
{
	void *array[10];
	char **strings;
	size_t size, i;

	size = backtrace(array, sizeof(array) / sizeof(array[0]));
	strings = backtrace_symbols(array, size);
	if (!strings) {
		/* backtrace_symbols() allocates its result and may fail */
		pr_err("stack %d: can't resolve symbols for %zu frames\n", pid, size);
		return;
	}

	for (i = 0; i < size; i++)
		pr_err("stack %d#%zu: %s\n", pid, i, strings[i]);

	free(strings);
}
-1 : 0; +} + +int prepare_utsns(int pid) +{ + int ret; + struct cr_img *img; + UtsnsEntry *ue; + struct sysctl_req req[] = { + { "kernel/hostname" }, + { "kernel/domainname" }, + }; + + img = open_image(CR_FD_UTSNS, O_RSTR, pid); + if (!img) + return -1; + + ret = pb_read_one(img, &ue, PB_UTSNS); + if (ret < 0) + goto out; + + req[0].arg = ue->nodename; + req[0].type = CTL_STR(strlen(ue->nodename)); + req[1].arg = ue->domainname; + req[1].type = CTL_STR(strlen(ue->domainname)); + + ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWUTS); + utsns_entry__free_unpacked(ue, NULL); +out: + close_image(img); + return ret; +} + +struct ns_desc uts_ns_desc = NS_DESC_ENTRY(CLONE_NEWUTS, "uts"); diff --git a/CRIU_code/criu/vdso-compat.c b/CRIU_code/criu/vdso-compat.c new file mode 100644 index 0000000..a68c0ba --- /dev/null +++ b/CRIU_code/criu/vdso-compat.c @@ -0,0 +1,74 @@ +#include +#include +#include +#include + +#include "types.h" +#include "parasite-syscall.h" +#include "parasite.h" +#include "vdso.h" + +static void exit_on(int ret, int err_fd, char *reason) +{ + if (ret) { + syscall(__NR_write, err_fd, reason, strlen(reason)); + syscall(__NR_exit, ret); + } +} +/* + * Because of restrictions of ARCH_MAP_VDSO_* API, new vDSO blob + * can be mapped only if there is no vDSO blob present for a process. + * This is a helper process, it unmaps 64-bit vDSO and maps 32-bit vDSO. + * Then it copies vDSO blob to shared with CRIU mapping. + * + * The purpose is to fill compat vdso's symtable (vdso_compat_rt). + * It's an optimization to fill symtable only once at CRIU restore + * for all restored tasks. + * + * @native - 64-bit vDSO blob (for easy unmap) + * @pipe_fd - to get size of compat blob from /proc/.../maps + * @err_fd - to print error messages + * @vdso_buf, buf_size - shared with CRIU buffer + * + * WARN: This helper shouldn't call pr_err() or any syscall with + * Glibc's wrapper function - it may very likely blow up. 
+ */ +void compat_vdso_helper(struct vdso_maps *native, int pipe_fd, + int err_fd, void *vdso_buf, size_t buf_size) +{ + void *vdso_addr; + long vdso_size; + long ret; + + if (native->vdso_start != VDSO_BAD_ADDR) { + ret = syscall(__NR_munmap, + native->vdso_start, native->sym.vdso_size); + exit_on(ret, err_fd, "Error: Failed to unmap native vdso\n"); + } + + if (native->vvar_start != VVAR_BAD_ADDR) { + ret = syscall(__NR_munmap, + native->vvar_start, native->sym.vvar_size); + exit_on(ret, err_fd, "Error: Failed to unmap native vvar\n"); + } + + ret = syscall(__NR_arch_prctl, ARCH_MAP_VDSO_32, native->vdso_start); + if (ret < 0) + exit_on(ret, err_fd, "Error: ARCH_MAP_VDSO failed\n"); + + vdso_size = ret; + if (vdso_size > buf_size) + exit_on(-1, err_fd, "Error: Compatible vdso's size is bigger than reserved buf\n"); + + /* Stop so CRIU could parse smaps to find 32-bit vdso's size */ + ret = syscall(__NR_kill, syscall(__NR_getpid), SIGSTOP); + exit_on(ret, err_fd, "Error: Can't stop myself with SIGSTOP (having a good time)\n"); + + ret = syscall(__NR_read, pipe_fd, &vdso_addr, sizeof(void *)); + if (ret != sizeof(void *)) + exit_on(-1, err_fd, "Error: Can't read size of mapped vdso from pipe\n"); + + memcpy(vdso_buf, vdso_addr, vdso_size); + + syscall(__NR_exit, 0); +} diff --git a/CRIU_code/criu/vdso.c b/CRIU_code/criu/vdso.c new file mode 100644 index 0000000..257cbcd --- /dev/null +++ b/CRIU_code/criu/vdso.c @@ -0,0 +1,703 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "types.h" +#include "parasite-syscall.h" +#include "parasite.h" +#include "common/compiler.h" +#include "kerndat.h" +#include "vdso.h" +#include "util.h" +#include "criu-log.h" +#include "mem.h" +#include "vma.h" +#include +#include + +#ifdef LOG_PREFIX +# undef LOG_PREFIX +#endif +#define LOG_PREFIX "vdso: " + +u64 vdso_pfn = VDSO_BAD_PFN; +struct vdso_maps vdso_maps = VDSO_MAPS_INIT; +struct vdso_maps vdso_maps_compat = 
VDSO_MAPS_INIT; + +/* + * Starting with 3.16 the [vdso]/[vvar] marks are reported correctly + * even when they are remapped into a new place, but only since that + * particular version of the kernel! + * On previous kernels we need to check if vma is vdso by some means: + * - if pagemap is present, by pfn + * - by parsing ELF and filling vdso symtable otherwise + */ +enum vdso_check_t { + /* from slowest to fastest */ + VDSO_CHECK_SYMS = 0, + VDSO_CHECK_PFN, + VDSO_NO_CHECK, +}; + +static enum vdso_check_t get_vdso_check_type(struct parasite_ctl *ctl) +{ + /* + * ia32 C/R depends on mremap() for vdso patches (v4.8), + * so we can omit any check and be sure that "[vdso]" + * hint stays in /proc/../maps file and is correct. + */ + if (!compel_mode_native(ctl)) { + pr_info("Don't check vdso for compat task\n"); + return VDSO_NO_CHECK; + } + + if (kdat.vdso_hint_reliable) { + pr_info("vDSO hint is reliable - omit checking\n"); + return VDSO_NO_CHECK; + } + + if (kdat.pmap == PM_FULL) { + pr_info("Check vdso by pfn from pagemap\n"); + return VDSO_CHECK_PFN; + } + + pr_info("Pagemap is unavailable, check vdso by filling symtable\n"); + return VDSO_CHECK_SYMS; +} + +static int check_vdso_by_pfn(int pagemap_fd, struct vma_area *vma, + bool *has_vdso_pfn) +{ + u64 pfn = VDSO_BAD_PFN; + + if (vaddr_to_pfn(pagemap_fd, vma->e->start, &pfn)) + return -1; + + if (!pfn) { + pr_err("Unexpected page frame number 0\n"); + return -1; + } + + if ((pfn == vdso_pfn && pfn != VDSO_BAD_PFN)) + *has_vdso_pfn = true; + else + *has_vdso_pfn = false; + + return 0; +} + +static bool not_vvar_or_vdso(struct vma_area *vma) +{ + if (!vma_area_is(vma, VMA_AREA_REGULAR)) + return true; + + if (vma_area_is(vma, VMA_FILE_SHARED)) + return true; + + if (vma_area_is(vma, VMA_FILE_PRIVATE)) + return true; + + if (vma->e->start > kdat.task_size) + return true; + + if (vma->e->flags & MAP_GROWSDOWN) + return true; + + BUILD_BUG_ON(!(VDSO_PROT & VVAR_PROT)); + if ((vma->e->prot & VVAR_PROT) != VVAR_PROT) + 
return true; + + return false; +} + +/* Contains addresses from vdso mark */ +struct vdso_quarter { + unsigned long orig_vdso; + unsigned long orig_vvar; + unsigned long rt_vdso; + unsigned long rt_vvar; +}; + +static void drop_rt_vdso(struct vm_area_list *vma_area_list, + struct vdso_quarter *addr, struct vma_area *rt_vdso_marked) +{ + struct vma_area *rt_vvar_marked = NULL; + struct vma_area *vma; + + if (!rt_vdso_marked) + return; + + /* + * There is marked vdso, it means such vdso is autogenerated + * and must be dropped from vma list. + */ + pr_debug("vdso: Found marked at %lx (orig vDSO at %lx VVAR at %lx)\n", + (long)rt_vdso_marked->e->start, addr->orig_vdso, addr->orig_vvar); + + /* + * Don't forget to restore the proxy vdso/vvar status, since + * they're unknown to the kernel. + * Also BTW search for rt-vvar to remove it later. + */ + list_for_each_entry(vma, &vma_area_list->h, list) { + if (vma->e->start == addr->orig_vdso) { + vma->e->status |= VMA_AREA_REGULAR | VMA_AREA_VDSO; + pr_debug("vdso: Restore orig vDSO status at %lx\n", + (long)vma->e->start); + } else if (vma->e->start == addr->orig_vvar) { + vma->e->status |= VMA_AREA_REGULAR | VMA_AREA_VVAR; + pr_debug("vdso: Restore orig VVAR status at %lx\n", + (long)vma->e->start); + } else if (addr->rt_vvar != VVAR_BAD_ADDR && + addr->rt_vvar == vma->e->start) { + BUG_ON(rt_vvar_marked); + if (not_vvar_or_vdso(vma)) { + pr_warn("Mark in rt-vdso points to vma, that doesn't look like vvar - skipping unmap\n"); + continue; + } + rt_vvar_marked = vma; + } + } + + pr_debug("vdso: Dropping marked vdso at %lx\n", + (long)rt_vdso_marked->e->start); + list_del(&rt_vdso_marked->list); + xfree(rt_vdso_marked); + vma_area_list->nr--; + + if (rt_vvar_marked) { + pr_debug("vdso: Dropping marked vvar at %lx\n", + (long)rt_vvar_marked->e->start); + list_del(&rt_vvar_marked->list); + xfree(rt_vvar_marked); + vma_area_list->nr--; + } +} + +/* + * I need to poke every potentially marked vma, + * otherwise if task never 
called for vdso functions + * page frame number won't be reported. + * + * Moreover, if page frame numbers are not accessible + * we have to scan the vma zone for vDSO elf structure + * which gonna be a slow way. + */ +static int check_if_vma_is_vdso(enum vdso_check_t vcheck, int pagemap_fd, + struct parasite_ctl *ctl, struct vma_area *vma, + struct vma_area **rt_vdso_marked, struct vdso_quarter *addr) +{ + struct parasite_vdso_vma_entry *args; + bool has_vdso_pfn = false; + + args = compel_parasite_args(ctl, struct parasite_vdso_vma_entry); + + if (not_vvar_or_vdso(vma)) + return 0; + + if ((vma->e->prot & VDSO_PROT) != VDSO_PROT) + return 0; + + args->start = vma->e->start; + args->len = vma_area_len(vma); + args->try_fill_symtable = (vcheck == VDSO_CHECK_SYMS); + args->is_vdso = false; + + if (compel_rpc_call_sync(PARASITE_CMD_CHECK_VDSO_MARK, ctl)) { + pr_err("Parasite failed to poke for mark\n"); + return -1; + } + + if (unlikely(args->is_marked)) { + if (*rt_vdso_marked) { + pr_err("Ow! 
Second vdso mark detected!\n"); + return -1; + } + *rt_vdso_marked = vma; + addr->orig_vdso = args->orig_vdso_addr; + addr->orig_vvar = args->orig_vvar_addr; + addr->rt_vvar = args->rt_vvar_addr; + return 0; + } + + if (vcheck == VDSO_NO_CHECK) + return 0; + + if (vcheck == VDSO_CHECK_PFN) { + if (check_vdso_by_pfn(pagemap_fd, vma, &has_vdso_pfn) < 0) { + pr_err("Failed checking vdso by pfn\n"); + return -1; + } + } + + if (has_vdso_pfn || args->is_vdso) { + if (!vma_area_is(vma, VMA_AREA_VDSO)) { + pr_debug("Restore vDSO status by pfn/symtable at %lx\n", + (long)vma->e->start); + vma->e->status |= VMA_AREA_VDSO; + } + } else { + if (unlikely(vma_area_is(vma, VMA_AREA_VDSO))) { + pr_debug("Drop mishinted vDSO status at %lx\n", + (long)vma->e->start); + vma->e->status &= ~VMA_AREA_VDSO; + } + } + + return 0; +} + +/* + * The VMAs list might have proxy vdso/vvar areas left + * from previous dump/restore cycle so we need to detect + * them and eliminated from the VMAs list, they will be + * generated again on restore if needed. + */ +int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid, + struct vm_area_list *vma_area_list) +{ + struct vma_area *rt_vdso_marked = NULL; + struct vdso_quarter addr = { + .orig_vdso = VDSO_BAD_ADDR, + .orig_vvar = VVAR_BAD_ADDR, + .rt_vdso = VDSO_BAD_ADDR, + .rt_vvar = VVAR_BAD_ADDR, + }; + enum vdso_check_t vcheck; + struct vma_area *vma; + int fd = -1; + + vcheck = get_vdso_check_type(ctl); + if (vcheck == VDSO_CHECK_PFN) { + BUG_ON(vdso_pfn == VDSO_BAD_PFN); + fd = open_proc(pid, "pagemap"); + if (fd < 0) + return -1; + } + + list_for_each_entry(vma, &vma_area_list->h, list) { + /* + * Defer handling marked vdso until we walked over + * all vmas and restore potentially remapped vDSO + * area status. 
+ */ + if (check_if_vma_is_vdso(vcheck, fd, ctl, vma, + &rt_vdso_marked, &addr)) { + close_safe(&fd); + return -1; + } + } + + drop_rt_vdso(vma_area_list, &addr, rt_vdso_marked); + + close_safe(&fd); + return 0; +} + +static int vdso_parse_maps(pid_t pid, struct vdso_maps *s) +{ + int exit_code = -1; + char *buf; + struct bfd f; + + *s = (struct vdso_maps)VDSO_MAPS_INIT; + + f.fd = open_proc(pid, "maps"); + if (f.fd < 0) + return -1; + + if (bfdopenr(&f)) + goto err; + + while (1) { + unsigned long start, end; + char *has_vdso, *has_vvar; + + buf = breadline(&f); + if (buf == NULL) + break; + if (IS_ERR(buf)) + goto err; + + has_vdso = strstr(buf, "[vdso]"); + if (!has_vdso) + has_vvar = strstr(buf, "[vvar]"); + else + has_vvar = NULL; + + if (!has_vdso && !has_vvar) + continue; + + if (sscanf(buf, "%lx-%lx", &start, &end) != 2) { + pr_err("Can't find vDSO/VVAR bounds\n"); + goto err; + } + + if (has_vdso) { + if (s->vdso_start != VDSO_BAD_ADDR) { + pr_err("Got second vDSO entry\n"); + goto err; + } + s->vdso_start = start; + s->sym.vdso_size = end - start; + } else { + if (s->vvar_start != VVAR_BAD_ADDR) { + pr_err("Got second VVAR entry\n"); + goto err; + } + s->vvar_start = start; + s->sym.vvar_size = end - start; + } + } + + if (s->vdso_start != VDSO_BAD_ADDR && s->vvar_start != VVAR_BAD_ADDR) + s->sym.vdso_before_vvar = (s->vdso_start < s->vvar_start); + + exit_code = 0; +err: + bclose(&f); + return exit_code; +} + +static int validate_vdso_addr(struct vdso_maps *s) +{ + unsigned long vdso_end = s->vdso_start + s->sym.vdso_size; + unsigned long vvar_end = s->vvar_start + s->sym.vvar_size; + /* + * Validate its structure -- for new vDSO format the + * structure must be like + * + * 7fff1f5fd000-7fff1f5fe000 r-xp 00000000 00:00 0 [vdso] + * 7fff1f5fe000-7fff1f600000 r--p 00000000 00:00 0 [vvar] + * + * The areas may be in reverse order. 
+ * + * 7fffc3502000-7fffc3504000 r--p 00000000 00:00 0 [vvar] + * 7fffc3504000-7fffc3506000 r-xp 00000000 00:00 0 [vdso] + * + */ + if (s->vdso_start != VDSO_BAD_ADDR) { + if (s->vvar_start != VVAR_BAD_ADDR) { + if (vdso_end != s->vvar_start && + vvar_end != s->vdso_start) { + pr_err("Unexpected rt vDSO area bounds\n"); + return -1; + } + } + } else { + pr_err("Can't find rt vDSO\n"); + return -1; + } + + return 0; +} + +static int vdso_fill_self_symtable(struct vdso_maps *s) +{ + if (s->vdso_start == VDSO_BAD_ADDR || s->sym.vdso_size == VDSO_BAD_SIZE) + return -1; + + if (vdso_fill_symtable(s->vdso_start, s->sym.vdso_size, &s->sym)) + return -1; + + if (validate_vdso_addr(s)) + return -1; + + pr_debug("rt [vdso] %lx-%lx [vvar] %lx-%lx\n", + s->vdso_start, s->vdso_start + s->sym.vdso_size, + s->vvar_start, s->vvar_start + s->sym.vvar_size); + + return 0; +} + +#ifdef CONFIG_COMPAT +static int vdso_mmap_compat(struct vdso_maps *native, + struct vdso_maps *compat, void *vdso_buf, size_t buf_size) +{ + pid_t pid; + int status, ret = -1; + int fds[2]; + + if (pipe(fds)) { + pr_perror("Failed to open pipe"); + return -1; + } + + pid = fork(); + if (pid == 0) { + if (close(fds[1])) { + pr_perror("Failed to close pipe"); + syscall(__NR_exit, 1); + } + + compat_vdso_helper(native, fds[0], log_get_fd(), + vdso_buf, buf_size); + + BUG(); + } + + if (close(fds[0])) { + pr_perror("Failed to close pipe"); + goto out_kill; + } + waitpid(pid, &status, WUNTRACED); + + if (WIFEXITED(status)) { + pr_err("Compat vdso helper exited with %d\n", + WEXITSTATUS(status)); + goto out_kill; + } + + if (!WIFSTOPPED(status)) { + pr_err("Compat vdso helper isn't stopped\n"); + goto out_kill; + } + + if (vdso_parse_maps(pid, compat)) + goto out_kill; + + if (validate_vdso_addr(compat)) + goto out_kill; + + if (kill(pid, SIGCONT)) { + pr_perror("Failed to kill(SIGCONT) for compat vdso helper\n"); + goto out_kill; + } + if (write(fds[1], &compat->vdso_start, sizeof(void *)) != + 
sizeof(compat->vdso_start)) { + pr_perror("Failed write to pipe\n"); + goto out_kill; + } + waitpid(pid, &status, WUNTRACED); + + if (WIFEXITED(status)) { + ret = WEXITSTATUS(status); + if (ret) + pr_err("Helper for mmaping compat vdso failed with %d\n", ret); + goto out_close; + } + pr_err("Compat vDSO helper didn't exit, status: %d\n", status); + +out_kill: + kill(pid, SIGKILL); +out_close: + if (close(fds[1])) + pr_perror("Failed to close pipe"); + return ret; +} + +#define COMPAT_VDSO_BUF_SZ (PAGE_SIZE*2) +static int vdso_fill_compat_symtable(struct vdso_maps *native, + struct vdso_maps *compat) +{ + void *vdso_mmap; + int ret = -1; + + if (!kdat.compat_cr) + return 0; + + vdso_mmap = mmap(NULL, COMPAT_VDSO_BUF_SZ, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (vdso_mmap == MAP_FAILED) { + pr_perror("Failed to mmap buf for compat vdso"); + return -1; + } + + if (vdso_mmap_compat(native, compat, vdso_mmap, COMPAT_VDSO_BUF_SZ)) { + pr_err("Failed to mmap compatible vdso with helper process\n"); + goto out_unmap; + } + + if (vdso_fill_symtable_compat((uintptr_t)vdso_mmap, + compat->sym.vdso_size, &compat->sym)) { + pr_err("Failed to parse mmaped compatible vdso blob\n"); + goto out_unmap; + } + + pr_debug("compat [vdso] %lx-%lx [vvar] %lx-%lx\n", + compat->vdso_start, compat->vdso_start + compat->sym.vdso_size, + compat->vvar_start, compat->vvar_start + compat->sym.vvar_size); + ret = 0; + +out_unmap: + if (munmap(vdso_mmap, COMPAT_VDSO_BUF_SZ)) + pr_perror("Failed to unmap buf for compat vdso"); + return ret; +} +#endif /* CONFIG_COMPAT */ + +int vdso_init_dump(void) +{ + if (vdso_parse_maps(PROC_SELF, &vdso_maps)) { + pr_err("Failed reading self/maps for filling vdso/vvar bounds\n"); + return -1; + } + + if (kdat.pmap != PM_FULL) + pr_info("VDSO detection turned off\n"); + else if (vaddr_to_pfn(-1, vdso_maps.vdso_start, &vdso_pfn)) + return -1; + + return 0; +} + +/* + * Check vdso/vvar sized read from maps to kdat values. 
+ * We do not read /proc/self/maps for compatible vdso as it's + * not parked as run-time vdso in restorer, but mapped with + * arch_prlctl(MAP_VDSO_32) API. + * By that reason we verify only native sizes. + */ +static int is_kdat_vdso_sym_valid(void) +{ + if (vdso_maps.sym.vdso_size != kdat.vdso_sym.vdso_size) + return false; + if (vdso_maps.sym.vvar_size != kdat.vdso_sym.vvar_size) + return false; + + return true; +} + +int vdso_init_restore(void) +{ + if (kdat.vdso_sym.vdso_size == VDSO_BAD_SIZE) { + pr_err("Kdat has empty vdso symtable\n"); + return -1; + } + + /* Already filled vdso_maps during kdat test */ + if (vdso_maps.vdso_start != VDSO_BAD_ADDR) + return 0; + + /* + * Parsing self-maps here only to find vvar/vdso vmas in + * criu's address space, for further remapping to restorer's + * parking zone. Don't need to do this if map-vdso API + * is present. + */ + if (!kdat.can_map_vdso) { + if (vdso_parse_maps(PROC_SELF, &vdso_maps)) { + pr_err("Failed reading self/maps for filling vdso/vvar bounds\n"); + return -1; + } + + if (!is_kdat_vdso_sym_valid()) { + pr_err("Kdat sizes of vdso/vvar differ to maps file \n"); + return -1; + } + } + + vdso_maps.sym = kdat.vdso_sym; +#ifdef CONFIG_COMPAT + vdso_maps_compat.sym = kdat.vdso_sym_compat; +#endif + + return 0; +} + +int kerndat_vdso_fill_symtable(void) +{ + if (vdso_parse_maps(PROC_SELF, &vdso_maps)) { + pr_err("Failed reading self/maps for filling vdso/vvar bounds\n"); + return -1; + } + + if (vdso_fill_self_symtable(&vdso_maps)) { + pr_err("Failed to fill self vdso symtable\n"); + return -1; + } + kdat.vdso_sym = vdso_maps.sym; + +#ifdef CONFIG_COMPAT + if (vdso_fill_compat_symtable(&vdso_maps, &vdso_maps_compat)) { + pr_err("Failed to fill compat vdso symtable\n"); + return -1; + } + kdat.vdso_sym_compat = vdso_maps_compat.sym; +#endif + + return 0; +} + +/* + * On x86 pre-v3.16 kernels can lose "[vdso]" hint + * in /proc/.../maps file after mremap()'ing vdso vma. 
+ * Depends on kerndat_vdso_fill_symtable() - assuming that + * vdso_maps and vdso_maps_compat are filled. + */ +int kerndat_vdso_preserves_hint(void) +{ + struct vdso_maps vdso_maps_after; + int status, ret = -1; + pid_t child; + + kdat.vdso_hint_reliable = 0; + + if (vdso_maps.vdso_start == VDSO_BAD_ADDR) + return 0; + + child = fork(); + if (child < 0) { + pr_perror("fork() failed"); + return -1; + } + + if (child == 0) { + unsigned long vdso_addr = vdso_maps.vdso_start; + unsigned long vdso_size = vdso_maps.sym.vdso_size; + void *new_addr; + + new_addr = mmap(0, vdso_size, PROT_NONE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (new_addr == MAP_FAILED) + exit(1); + + child = getpid(); + new_addr = (void *)syscall(SYS_mremap, vdso_addr, vdso_size, + vdso_size, MREMAP_MAYMOVE | MREMAP_FIXED, new_addr); + if (new_addr == MAP_FAILED) + syscall(SYS_exit, 2); + syscall(SYS_kill, child, SIGSTOP); + syscall(SYS_exit, 3); + } + + waitpid(child, &status, WUNTRACED); + if (WIFEXITED(status)) { + int ret = WEXITSTATUS(status); + + pr_err("Child unexpectedly exited with %d\n", ret); + goto out; + } else if (WIFSIGNALED(status)) { + int sig = WTERMSIG(status); + + pr_err("Child unexpectedly signaled with %d: %s\n", + sig, strsignal(sig)); + goto out; + } else if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGSTOP) { + pr_err("Child is unstoppable or was stopped by other means\n"); + goto out_kill; + } + + if (vdso_parse_maps(child, &vdso_maps_after)) { + pr_err("Failed parsing maps for child helper\n"); + goto out_kill; + } + + if (vdso_maps_after.vdso_start != VDSO_BAD_ADDR) + kdat.vdso_hint_reliable = 1; + + ret = 0; +out_kill: + kill(child, SIGKILL); + waitpid(child, &status, 0); +out: + return ret; +} diff --git a/CRIU_code/images/Makefile b/CRIU_code/images/Makefile new file mode 100644 index 0000000..863c583 --- /dev/null +++ b/CRIU_code/images/Makefile @@ -0,0 +1,123 @@ +proto-obj-y += stats.o +proto-obj-y += core.o +proto-obj-y += core-x86.o +proto-obj-y += core-arm.o 
+proto-obj-y += core-aarch64.o +proto-obj-y += core-ppc64.o +proto-obj-y += core-s390.o +proto-obj-y += core-riscv.o +proto-obj-y += cpuinfo.o +proto-obj-y += inventory.o +proto-obj-y += fdinfo.o +proto-obj-y += fown.o +proto-obj-y += ns.o +proto-obj-y += regfile.o +proto-obj-y += ghost-file.o +proto-obj-y += fifo.o +proto-obj-y += remap-file-path.o +proto-obj-y += eventfd.o +proto-obj-y += eventpoll.o +proto-obj-y += fh.o +proto-obj-y += fsnotify.o +proto-obj-y += signalfd.o +proto-obj-y += fs.o +proto-obj-y += pstree.o +proto-obj-y += pipe.o +proto-obj-y += tcp-stream.o +proto-obj-y += sk-packet.o +proto-obj-y += mnt.o +proto-obj-y += pipe-data.o +proto-obj-y += sa.o +proto-obj-y += timer.o +proto-obj-y += timerfd.o +proto-obj-y += mm.o +proto-obj-y += sk-opts.o +proto-obj-y += sk-unix.o +proto-obj-y += sk-inet.o +proto-obj-y += tun.o +proto-obj-y += sk-netlink.o +proto-obj-y += packet-sock.o +proto-obj-y += ipc-var.o +proto-obj-y += ipc-desc.o +proto-obj-y += ipc-shm.o +proto-obj-y += ipc-msg.o +proto-obj-y += ipc-sem.o +proto-obj-y += utsns.o +proto-obj-y += creds.o +proto-obj-y += vma.o +proto-obj-y += netdev.o +proto-obj-y += tty.o +proto-obj-y += file-lock.o +proto-obj-y += rlimit.o +proto-obj-y += pagemap.o +proto-obj-y += siginfo.o +proto-obj-y += rpc.o +proto-obj-y += ext-file.o +proto-obj-y += cgroup.o +proto-obj-y += userns.o +proto-obj-y += google/protobuf/descriptor.o # To make protoc-c happy and compile opts.proto +proto-obj-y += opts.o +proto-obj-y += seccomp.o +proto-obj-y += binfmt-misc.o +proto-obj-y += time.o +proto-obj-y += sysctl.o +proto-obj-y += autofs.o +proto-obj-y += macvlan.o +proto-obj-y += sit.o +proto-obj-y += remote-image.o + +CFLAGS += -iquote $(obj)/ + +# +# Generates a set of names from protobuf "import" directive. +# The names are bare, ie no suffixes. 
+define gen-proto-dep-names +$(shell grep "^[[:blank:]]*import[[:blank:]]" $(1) | \ + sed -e 's/[[:blank:]]*import[[:blank:]]*//' \ + -e 's/[\";]//g' \ + -e 's/\.proto//g' | \ + sort | uniq) +endef + +makefile-deps := Makefile $(obj)/Makefile + +# +# Generates rules needed to compile protobuf files. +define gen-proto-rules +$(obj)/$(1).pb-c.c $(obj)/$(1).pb-c.h: $(obj)/$(1).proto $(addsuffix .pb-c.c,$(addprefix $(obj)/,$(2))) $(makefile-deps) + $$(E) " PBCC " $$@ + $$(Q) protoc-c --proto_path=$(obj)/ --c_out=$(obj)/ $$< +ifeq ($(PROTOUFIX),y) + $$(Q) sed -i -e 's/4294967295/0xFFFFFFFF/g' $$@ + $$(Q) sed -i -e 's/4294967295/0xFFFFFFFF/g' $$(patsubst %.c,%.h,$$@) + $$(Q) sed -i -e 's/4294967295/0xFFFFFFFF/g' $$(patsubst %.h,%.c,$$@) +endif +$(obj)/$(1).pb-c.d: $(obj)/$(1).pb-c.c $(addsuffix .pb-c.d,$(addprefix $(obj)/,$(2))) $(makefile-deps) + $$(E) " DEP " $$@ + $$(Q) $$(CC) -M -MT $$@ -MT $$(patsubst %.d,%.o,$$@) $$(CFLAGS) $$< -o $$@ +endef + +$(foreach file, $(proto-obj-y), \ + $(eval $(call gen-proto-rules,$(file:.o=), \ + $(call gen-proto-dep-names, \ + $(addprefix $(obj)/,$(file:.o=.proto)))))) + +$(obj)/%.o: $(obj)/%.pb-c.c $(obj)/%.pb-c.h + $(E) " CC " $@ + $(Q) $(CC) -c $(CFLAGS) $< -o $@ + +$(obj)/built-in.o: $(addprefix $(obj)/,$(proto-obj-y)) + $(E) " LINK " $@ + $(Q) $(LD) $(ldflags-y) -r -o $@ $^ +cleanup-y += $(obj)/built-in.o + +ifneq ($(MAKECMDGOALS),clean) +ifneq ($(MAKECMDGOALS),mrproper) +-include $(addprefix $(obj)/,$(proto-obj-y:.o=.pb-c.d)) +endif +endif + +cleanup-y += $(call cleanify,$(addprefix $(obj)/,$(proto-obj-y))) +cleanup-y += $(call cleanify,$(addprefix $(obj)/,$(proto-obj-y:.o=.pb-c.o))) +mrproper-y += $(addprefix $(obj)/,$(proto-obj-y:.o=.pb-c.c)) +mrproper-y += $(addprefix $(obj)/,$(proto-obj-y:.o=.pb-c.h)) diff --git a/CRIU_code/images/autofs.proto b/CRIU_code/images/autofs.proto new file mode 100644 index 0000000..2146ca8 --- /dev/null +++ b/CRIU_code/images/autofs.proto @@ -0,0 +1,15 @@ +syntax = "proto2"; + +message 
autofs_entry { + required int32 fd = 1; + required int32 pgrp = 2; + required int32 timeout = 3; + required int32 minproto = 4; + required int32 maxproto = 5; + required int32 mode = 6; + + optional int32 uid = 7; + optional int32 gid = 8; + + optional int32 read_fd = 9; +} diff --git a/CRIU_code/images/binfmt-misc.proto b/CRIU_code/images/binfmt-misc.proto new file mode 100644 index 0000000..82a86c8 --- /dev/null +++ b/CRIU_code/images/binfmt-misc.proto @@ -0,0 +1,12 @@ +syntax = "proto2"; + +message binfmt_misc_entry { + required string name = 1; + required bool enabled = 2; + required string interpreter = 3; + optional string flags = 4; + optional string extension = 5; + optional string magic = 6; + optional string mask = 7; + optional int32 offset = 8; +} diff --git a/CRIU_code/images/cgroup.proto b/CRIU_code/images/cgroup.proto new file mode 100644 index 0000000..b8a545e --- /dev/null +++ b/CRIU_code/images/cgroup.proto @@ -0,0 +1,41 @@ +syntax = "proto2"; + +message cgroup_perms { + required uint32 mode = 1; + required uint32 uid = 2; + required uint32 gid = 3; +} + +message cgroup_prop_entry { + required string name = 1; + required string value = 2; + optional cgroup_perms perms = 3; +} + +message cgroup_dir_entry { + required string dir_name = 1; + repeated cgroup_dir_entry children = 2; + repeated cgroup_prop_entry properties = 3; + optional cgroup_perms dir_perms = 4; +} + +message cg_controller_entry { + repeated string cnames = 1; + repeated cgroup_dir_entry dirs = 2; +} + +message cg_member_entry { + required string name = 1; + required string path = 2; + optional uint32 cgns_prefix = 3; +} + +message cg_set_entry { + required uint32 id = 1; + repeated cg_member_entry ctls = 2; +} + +message cgroup_entry { + repeated cg_set_entry sets = 1; + repeated cg_controller_entry controllers = 2; +} diff --git a/CRIU_code/images/core-aarch64.proto b/CRIU_code/images/core-aarch64.proto new file mode 100644 index 0000000..83fdd64 --- /dev/null +++ 
b/CRIU_code/images/core-aarch64.proto @@ -0,0 +1,23 @@ +syntax = "proto2"; + +import "opts.proto"; + +message user_aarch64_regs_entry { + repeated uint64 regs = 1; + required uint64 sp = 2; + required uint64 pc = 3; + required uint64 pstate = 4; +} + +message user_aarch64_fpsimd_context_entry { + repeated uint64 vregs = 1; + required uint32 fpsr = 2; + required uint32 fpcr = 3; +} + +message thread_info_aarch64 { + required uint64 clear_tid_addr = 1[(criu).hex = true]; + required uint64 tls = 2; + required user_aarch64_regs_entry gpregs = 3[(criu).hex = true]; + required user_aarch64_fpsimd_context_entry fpsimd = 4; +} diff --git a/CRIU_code/images/core-arm.proto b/CRIU_code/images/core-arm.proto new file mode 100644 index 0000000..3004346 --- /dev/null +++ b/CRIU_code/images/core-arm.proto @@ -0,0 +1,39 @@ +syntax = "proto2"; + +import "opts.proto"; + +message user_arm_regs_entry { + required uint32 r0 = 1; + required uint32 r1 = 2; + required uint32 r2 = 3; + required uint32 r3 = 4; + required uint32 r4 = 5; + required uint32 r5 = 6; + required uint32 r6 = 7; + required uint32 r7 = 8; + required uint32 r8 = 9; + required uint32 r9 = 10; + required uint32 r10 = 11; + required uint32 fp = 12; + required uint32 ip = 13; + required uint32 sp = 14; + required uint32 lr = 15; + required uint32 pc = 16; + required uint32 cpsr = 17; + required uint32 orig_r0 = 18; +} + +message user_arm_vfpstate_entry { + repeated uint64 vfp_regs = 1; + required uint32 fpscr = 2; + required uint32 fpexc = 3; + required uint32 fpinst = 4; + required uint32 fpinst2 = 5; +} + +message thread_info_arm { + required uint64 clear_tid_addr = 1[(criu).hex = true]; + required uint32 tls = 2; + required user_arm_regs_entry gpregs = 3[(criu).hex = true]; + required user_arm_vfpstate_entry fpstate = 4; +} diff --git a/CRIU_code/images/core-ppc64.proto b/CRIU_code/images/core-ppc64.proto new file mode 100644 index 0000000..aca1c2f --- /dev/null +++ b/CRIU_code/images/core-ppc64.proto @@ -0,0 +1,71 @@ 
+syntax = "proto2"; + +import "opts.proto"; + +message user_ppc64_regs_entry { + /* Following is the list of registers starting at r0. */ + repeated uint64 gpr = 1; + required uint64 nip = 2; + required uint64 msr = 3; + required uint64 orig_gpr3 = 4; + required uint64 ctr = 5; + required uint64 link = 6; + required uint64 xer = 7; + required uint64 ccr = 8; + required uint64 trap = 9; + /* For Transactional memory support since P8 */ + optional uint64 texasr = 10; + optional uint64 tfhar = 11; + optional uint64 tfiar = 12; +} + +message user_ppc64_fpstate_entry { + /* Following is the list of registers starting at fpr0 */ + repeated uint64 fpregs = 1; +} + +message user_ppc64_vrstate_entry { + /* + * Altivec registers + * The vector registers are 128bit registers (VSR[32..63]). + * The following vregs entry will store first the high part then the + * low one: + * VR0 = vrregs[0] << 64 | vrregs[1]; + * VR1 = vrregs[2] << 64 | vrregs[3]; + * .. + * The last entry stores in a 128bit field the VSCR which is a 32bit + * value returned by the kernel in a 128 field. + */ + repeated uint64 vrregs = 1; + required uint32 vrsave = 2; +} + +message user_ppc64_vsxstate_entry { + /* + * VSX registers + * The vector-scale registers are 128bit registers (VSR[0..64]). + * Since there is an overlapping over the VSX registers by the FPR and + * the Altivec registers, only the lower part of the first 32 VSX + * registers have to be saved.
+ */ + repeated uint64 vsxregs = 1; +} + +/* + * Transactional memory operation's state + */ +message user_ppc64_tm_regs_entry { + required user_ppc64_regs_entry gpregs = 1; + optional user_ppc64_fpstate_entry fpstate = 2; + optional user_ppc64_vrstate_entry vrstate = 3; + optional user_ppc64_vsxstate_entry vsxstate = 4; +} + +message thread_info_ppc64 { + required uint64 clear_tid_addr = 1[(criu).hex = true]; + required user_ppc64_regs_entry gpregs = 2[(criu).hex = true]; + optional user_ppc64_fpstate_entry fpstate = 3; + optional user_ppc64_vrstate_entry vrstate = 4; + optional user_ppc64_vsxstate_entry vsxstate = 5; + optional user_ppc64_tm_regs_entry tmstate = 6; +} diff --git a/CRIU_code/images/core-riscv.proto b/CRIU_code/images/core-riscv.proto new file mode 100644 index 0000000..2c0d645 --- /dev/null +++ b/CRIU_code/images/core-riscv.proto @@ -0,0 +1,53 @@ +syntax = "proto2"; + +import "opts.proto"; + +message user_riscv_regs_entry { + required uint64 pc = 1; + required uint64 ra = 2; + required uint64 sp = 3; + required uint64 gp = 4; + required uint64 tp = 5; + required uint64 t0 = 6; + required uint64 t1 = 7; + required uint64 t2 = 8; + required uint64 s0 = 9; + required uint64 s1 = 10; + required uint64 a0 = 11; + required uint64 a1 = 12; + required uint64 a2 = 13; + required uint64 a3 = 14; + required uint64 a4 = 15; + required uint64 a5 = 16; + required uint64 a6 = 17; + required uint64 a7 = 18; + required uint64 s2 = 19; + required uint64 s3 = 20; + required uint64 s4 = 21; + required uint64 s5 = 22; + required uint64 s6 = 23; + required uint64 s7 = 24; + required uint64 s8 = 25; + required uint64 s9 = 26; + required uint64 s10 = 27; + required uint64 s11 = 28; + required uint64 t3 = 29; + required uint64 t4 = 30; + required uint64 t5 = 31; + required uint64 t6 = 32; +} + +message user_riscv_fpregs_entry { + //The F extension adds 32 floating-point registers + repeated uint64 fregs = 1; + // and a floating-point control and status register fcsr + 
//which contains the operating mode and exception status of the floating-point unit. + required uint32 fcsr = 2; +} + +message thread_info_riscv { /* all four fields are required */ + required uint64 clear_tid_addr = 1[(criu).hex = true]; + required uint64 tls = 2; + required user_riscv_regs_entry gpregs = 3[(criu).hex = true]; + required user_riscv_fpregs_entry fpregs = 4; +} diff --git a/CRIU_code/images/core-s390.proto b/CRIU_code/images/core-s390.proto new file mode 100644 index 0000000..497c73b --- /dev/null +++ b/CRIU_code/images/core-s390.proto @@ -0,0 +1,51 @@ +syntax = "proto2"; + +import "opts.proto"; + +message user_s390_regs_entry { + required uint64 psw_mask = 1; + required uint64 psw_addr = 2; + repeated uint64 gprs = 3; + repeated uint32 acrs = 4; + required uint64 orig_gpr2 = 5; + required uint32 system_call = 6; +} + +message user_s390_vxrs_low_entry { + repeated uint64 regs = 1; +} + +/* + * The vxrs_high registers are 128 bits wide: + * + * vxrs_high_0 = regs[0] << 64 | regs[1]; + * vxrs_high_1 = regs[2] << 64 | regs[3]; + */ +message user_s390_vxrs_high_entry { + repeated uint64 regs = 1; +} + +message user_s390_fpregs_entry { + required uint32 fpc = 1; + repeated uint64 fprs = 2; +} + +message user_s390_gs_cb_entry { /* used for both the gs_cb and gs_bc fields of thread_info_s390 */ + repeated uint64 regs = 1; +} + +message user_s390_ri_entry { + required uint32 ri_on = 1; + repeated uint64 regs = 2; +} + +message thread_info_s390 { /* only clear_tid_addr, gpregs and fpregs are required */ + required uint64 clear_tid_addr = 1[(criu).hex = true]; + required user_s390_regs_entry gpregs = 2[(criu).hex = true]; + required user_s390_fpregs_entry fpregs = 3[(criu).hex = true]; + optional user_s390_vxrs_low_entry vxrs_low = 4[(criu).hex = true]; + optional user_s390_vxrs_high_entry vxrs_high = 5[(criu).hex = true]; + optional user_s390_gs_cb_entry gs_cb = 6[(criu).hex = true]; + optional user_s390_gs_cb_entry gs_bc = 7[(criu).hex = true]; + 
optional user_s390_ri_entry ri_cb = 8[(criu).hex = true]; +} diff --git a/CRIU_code/images/core-x86.proto b/CRIU_code/images/core-x86.proto new file mode 100644 index 
0000000..2ed2ad3 --- /dev/null +++ b/CRIU_code/images/core-x86.proto @@ -0,0 +1,108 @@ +syntax = "proto2"; + +import "opts.proto"; + +enum user_x86_regs_mode { + NATIVE = 1; + COMPAT = 2; +} + +/* Reusing entry for both 64-bit and 32-bit register sets */ +message user_x86_regs_entry { + required uint64 r15 = 1; + required uint64 r14 = 2; + required uint64 r13 = 3; + required uint64 r12 = 4; + required uint64 bp = 5; + required uint64 bx = 6; + required uint64 r11 = 7; + required uint64 r10 = 8; + required uint64 r9 = 9; + required uint64 r8 = 10; + required uint64 ax = 11; + required uint64 cx = 12; + required uint64 dx = 13; + required uint64 si = 14; + required uint64 di = 15; + required uint64 orig_ax = 16; + required uint64 ip = 17; + required uint64 cs = 18; + required uint64 flags = 19; + required uint64 sp = 20; + required uint64 ss = 21; + required uint64 fs_base = 22; + required uint64 gs_base = 23; + required uint64 ds = 24; + required uint64 es = 25; + required uint64 fs = 26; + required uint64 gs = 27; + optional user_x86_regs_mode mode = 28 [default = NATIVE]; /* reads as NATIVE when the field is absent */ +} + +message user_x86_xsave_entry { + /* standard xsave features */ + required uint64 xstate_bv = 1; + + /* AVX components: 16x 256-bit ymm registers, hi 128 bits */ + repeated uint32 ymmh_space = 2; + + /* MPX components */ + repeated uint64 bndreg_state = 3; + repeated uint64 bndcsr_state = 4; + + /* AVX512 components: k0-k7, ZMM_Hi256, Hi16_ZMM */ + repeated uint64 opmask_reg = 5; + repeated uint64 zmm_upper = 6; + repeated uint64 hi16_zmm = 7; + + /* Protection keys */ + repeated uint32 pkru = 8; + + /* + * Processor trace (PT) and hardware duty cycling (HDC) + * are supervisor state components and only managed by + * xsaves/xrstors on cpl=0, so ignore them. 
+ */ +} + +message user_x86_fpregs_entry { + + /* fxsave data */ + required uint32 cwd = 1; + required uint32 swd = 2; + required uint32 twd = 3; + required uint32 fop = 4; + required uint64 rip = 5; + required uint64 rdp = 6; + required uint32 mxcsr = 7; + required uint32 mxcsr_mask = 8; + repeated uint32 st_space = 9; + repeated uint32 xmm_space = 10; + + /* Unused, but present for backward compatibility */ + repeated uint32 padding = 11; + + /* xsave extension (tag 12 is not used in this message) */ + optional user_x86_xsave_entry xsave = 13; +} + +message user_desc_t { + required uint32 entry_number = 1; + /* this is for GDT, not for MSRs - 32-bit base */ + required uint32 base_addr = 2; + required uint32 limit = 3; + required bool seg_32bit = 4; + required bool contents_h = 5; + required bool contents_l = 6; + required bool read_exec_only = 7 [default = true]; + required bool limit_in_pages = 8; + required bool seg_not_present = 9 [default = true]; + required bool useable = 10; +} + +message thread_info_x86 { + required uint64 clear_tid_addr = 1[(criu).hex = true]; + required user_x86_regs_entry gpregs = 2[(criu).hex = true]; + required user_x86_fpregs_entry fpregs = 3; + repeated user_desc_t tls = 4; +} diff --git a/CRIU_code/images/core.proto b/CRIU_code/images/core.proto new file mode 100644 index 0000000..e9e94f9 --- /dev/null +++ b/CRIU_code/images/core.proto @@ -0,0 +1,123 @@ +syntax = "proto2"; + +import "core-x86.proto"; +import "core-arm.proto"; +import "core-aarch64.proto"; +import "core-ppc64.proto"; +import "core-s390.proto"; +import "core-riscv.proto"; /* declares thread_info_riscv, which core_entry.ti_riscv refers to */ + +import "rlimit.proto"; +import "timer.proto"; +import "creds.proto"; +import "sa.proto"; +import "siginfo.proto"; + +import "opts.proto"; + +/* + * These match the SECCOMP_MODE_* flags from <linux/seccomp.h>. 
+ */ +enum seccomp_mode { + disabled = 0; + strict = 1; + filter = 2; +}; + +message task_core_entry { + required uint32 task_state = 1 [(criu).dict = "gen"]; + required uint32 exit_code = 2; + + required uint32 personality = 3; + required uint32 flags = 4; + required uint64 blk_sigset = 5[(criu).hex = true]; + + required string comm = 6; + + optional task_timers_entry timers = 7; + optional task_rlimits_entry rlimits = 8; + + optional uint32 cg_set = 9; + + optional signal_queue_entry signals_s = 10; + + /* These two are deprecated, should be per-thread (see seccomp_mode and seccomp_filter in thread_core_entry) */ + optional seccomp_mode old_seccomp_mode = 11; + optional uint32 old_seccomp_filter = 12; + + optional uint32 loginuid = 13; + + optional int32 oom_score_adj = 14; + repeated sa_entry sigactions = 15; + // Tags 16 and 17 are reserved for tty inheritance + //optional int32 tty_nr = 16; + //optional int32 tty_pgrp = 17; +} + +message task_kobj_ids_entry { + required uint32 vm_id = 1; + required uint32 files_id = 2; + required uint32 fs_id = 3; + required uint32 sighand_id = 4; + + optional uint32 pid_ns_id = 5; + optional uint32 net_ns_id = 6; + optional uint32 ipc_ns_id = 7; + optional uint32 uts_ns_id = 8; + optional uint32 mnt_ns_id = 9; + optional uint32 user_ns_id = 10; + optional uint32 cgroup_ns_id = 11; +} + +message thread_sas_entry { + required uint64 ss_sp = 1; + required uint64 ss_size = 2; + required uint32 ss_flags = 3; +} + +message thread_core_entry { + required uint64 futex_rla = 1; + required uint32 futex_rla_len = 2; + optional sint32 sched_nice = 3; + optional uint32 sched_policy = 4; + optional uint32 sched_prio = 5; + optional uint64 blk_sigset = 6; + optional thread_sas_entry sas = 7; + optional uint32 pdeath_sig = 8; + + optional signal_queue_entry signals_p = 9; + optional creds_entry creds = 10; + + optional seccomp_mode seccomp_mode = 11; + optional uint32 seccomp_filter = 12; + + optional string comm = 13; +} + +message 
task_rlimits_entry { + repeated rlimit_entry rlimits = 1; +}; + +message core_entry { 
+ enum march { + UNKNOWN = 0; + X86_64 = 1; + ARM = 2; + AARCH64 = 3; + PPC64 = 4; + S390 = 5; + RISCV = 6; + } + + required march mtype = 1; + optional thread_info_x86 thread_info = 2; /* the x86 thread info; the other architectures use the ti_* fields below */ + optional thread_info_arm ti_arm = 6; + optional thread_info_aarch64 ti_aarch64 = 8; /* tag 7 is not used in this message */ + optional thread_info_ppc64 ti_ppc64 = 9; + optional thread_info_s390 ti_s390 = 10; + optional thread_info_riscv ti_riscv = 11; + + optional task_core_entry tc = 3; + optional task_kobj_ids_entry ids = 4; + optional thread_core_entry thread_core = 5; +} diff --git a/CRIU_code/images/cpuinfo.proto b/CRIU_code/images/cpuinfo.proto new file mode 100644 index 0000000..784da5b --- /dev/null +++ b/CRIU_code/images/cpuinfo.proto @@ -0,0 +1,63 @@ +syntax = "proto2"; + +message cpuinfo_x86_entry { + enum vendor { + UNKNOWN = 0; + INTEL = 1; + AMD = 2; + } + + required vendor vendor_id = 1; + required uint32 cpu_family = 2; + required uint32 model = 3; + required uint32 stepping = 4; + required uint32 capability_ver = 5; + repeated uint32 capability = 6; + + optional string model_id = 7; + + optional uint64 xfeatures_mask = 8; + optional uint32 xsave_size = 9; + optional uint32 xsave_size_max = 10; +} + +message cpuinfo_ppc64_entry { + enum endianness { + BIGENDIAN = 0; + LITTLEENDIAN = 1; + } + + required endianness endian = 1; + repeated uint64 hwcap = 2; +} + +message cpuinfo_s390_entry { + repeated uint64 hwcap = 2; /* tag 1 is not used in this message */ +} + +message cpuinfo_riscv_entry +{ + repeated uint64 hwcap = 2; /* tag 1 is not used in this message */ +} + +message cpuinfo_aarch64_entry +{ + required uint32 reg_midr = 1; + repeated uint32 reg_ctr = 2; + repeated uint32 reg_cntfrq = 3; + repeated uint32 reg_dczid =4; + repeated uint32 reg_revidr = 5; +} + +message cpuinfo_entry { + /* + * Usually on SMP system there should be same CPUs + * installed, but it might happen that 
system carries + * various CPUs so @repeated used. 
+ */ + repeated cpuinfo_x86_entry x86_entry = 1; + repeated cpuinfo_ppc64_entry ppc64_entry = 2; + repeated cpuinfo_s390_entry s390_entry = 3; + repeated cpuinfo_aarch64_entry aarch64_entry = 4; + repeated cpuinfo_riscv_entry riscv_entry = 5; +} diff --git a/CRIU_code/images/creds.proto b/CRIU_code/images/creds.proto new file mode 100644 index 0000000..23b84c7 --- /dev/null +++ b/CRIU_code/images/creds.proto @@ -0,0 +1,24 @@ +syntax = "proto2"; + +message creds_entry { /* referenced as the optional creds field of thread_core_entry in core.proto */ + required uint32 uid = 1; + required uint32 gid = 2; + required uint32 euid = 3; + required uint32 egid = 4; + required uint32 suid = 5; + required uint32 sgid = 6; + required uint32 fsuid = 7; + required uint32 fsgid = 8; + + repeated uint32 cap_inh = 9; + repeated uint32 cap_prm = 10; + repeated uint32 cap_eff = 11; + repeated uint32 cap_bnd = 12; + + required uint32 secbits = 13; + + repeated uint32 groups = 14; + + optional string lsm_profile = 15; + optional string lsm_sockcreate = 16; +} diff --git a/CRIU_code/images/eventfd.proto b/CRIU_code/images/eventfd.proto new file mode 100644 index 0000000..ff9ced3 --- /dev/null +++ b/CRIU_code/images/eventfd.proto @@ -0,0 +1,10 @@ +syntax = "proto2"; + +import "fown.proto"; + +message eventfd_file_entry { /* referenced as file_entry.efd in fdinfo.proto */ + required uint32 id = 1; + required uint32 flags = 2; + required fown_entry fown = 3; + required uint64 counter = 4; +} diff --git a/CRIU_code/images/eventpoll.proto b/CRIU_code/images/eventpoll.proto new file mode 100644 index 0000000..4a8d1b8 --- /dev/null +++ b/CRIU_code/images/eventpoll.proto @@ -0,0 +1,22 @@ +syntax = "proto2"; + +import "fown.proto"; + +message eventpoll_tfd_entry { + required uint32 id = 1; + required uint32 tfd = 2; + required uint32 events = 3; + required uint64 data = 4; + + /* to find dup'ed target files */ + optional uint32 dev = 5; + optional uint64 inode = 6; + optional 
uint64 pos = 7; +} + +message eventpoll_file_entry { /* referenced as file_entry.epfd in fdinfo.proto */ + required uint32 id = 1; + required uint32 flags = 2; + required fown_entry fown = 3; + repeated 
eventpoll_tfd_entry tfd = 4; +} diff --git a/CRIU_code/images/ext-file.proto b/CRIU_code/images/ext-file.proto new file mode 100644 index 0000000..f820ffb --- /dev/null +++ b/CRIU_code/images/ext-file.proto @@ -0,0 +1,8 @@ +syntax = "proto2"; + +import "fown.proto"; + +message ext_file_entry { + required uint32 id = 1; + required fown_entry fown = 5; /* tags 2-4 are not used in this message */ +} diff --git a/CRIU_code/images/fdinfo.proto b/CRIU_code/images/fdinfo.proto new file mode 100644 index 0000000..77e375a --- /dev/null +++ b/CRIU_code/images/fdinfo.proto @@ -0,0 +1,73 @@ +syntax = "proto2"; + +import "regfile.proto"; +import "sk-inet.proto"; +import "ns.proto"; +import "packet-sock.proto"; +import "sk-netlink.proto"; +import "eventfd.proto"; +import "eventpoll.proto"; +import "signalfd.proto"; +import "tun.proto"; +import "timerfd.proto"; +import "fsnotify.proto"; +import "ext-file.proto"; +import "sk-unix.proto"; +import "fifo.proto"; +import "pipe.proto"; +import "tty.proto"; + +enum fd_types { /* type of the "type" field in both fdinfo_entry and file_entry */ + UND = 0; + REG = 1; + PIPE = 2; + FIFO = 3; + INETSK = 4; + UNIXSK = 5; + EVENTFD = 6; + EVENTPOLL = 7; + INOTIFY = 8; + SIGNALFD = 9; + PACKETSK = 10; + TTY = 11; + FANOTIFY = 12; + NETLINKSK = 13; + NS = 14; + TUNF = 15; + EXT = 16; + TIMERFD = 17; + + /* Any number above the real used. 
Not stored to image */ + CTL_TTY = 65534; + AUTOFS_PIPE = 65535; +} + +message fdinfo_entry { + required uint32 id = 1; + required uint32 flags = 2; + required fd_types type = 3; + required uint32 fd = 4; + optional string xattr_security_selinux = 5; +} + +message file_entry { /* type and id are required, every payload field is optional; presumably type tells which payload is filled in -- TODO confirm */ + required fd_types type = 1; + required uint32 id = 2; + optional reg_file_entry reg = 3; + optional inet_sk_entry isk = 4; + optional ns_file_entry nsf = 5; + optional packet_sock_entry psk = 6; + optional netlink_sk_entry nlsk = 7; + optional eventfd_file_entry efd = 8; + optional eventpoll_file_entry epfd = 9; + optional signalfd_entry sgfd = 10; + optional tunfile_entry tunf = 11; + optional timerfd_entry tfd = 12; + optional inotify_file_entry ify = 13; + optional fanotify_file_entry ffy = 14; + optional ext_file_entry ext = 15; + optional unix_sk_entry usk = 16; + optional fifo_entry fifo = 17; + optional pipe_entry pipe = 18; + optional tty_file_entry tty = 19; +} diff --git a/CRIU_code/images/fh.proto b/CRIU_code/images/fh.proto new file mode 100644 index 0000000..2da7e9d --- /dev/null +++ b/CRIU_code/images/fh.proto @@ -0,0 +1,23 @@ +syntax = "proto2"; + +import "opts.proto"; + +enum fh_entry_sizes { + min_entries = 16; +} + +message fh_entry { + required uint32 bytes = 1; + required uint32 type = 2; + + /* The minimum is fh_n_handle repetitions. NOTE(review): fh_n_handle is not declared in this file; presumably it corresponds to fh_entry_sizes.min_entries above -- confirm */ + repeated uint64 handle = 3; + optional string path = 4; + optional uint32 mnt_id = 5; +} + +message irmap_cache_entry { + required uint32 dev = 1 [(criu).dev = true, (criu).odev = true]; + required uint64 inode = 2; + required string path = 3; +} diff --git a/CRIU_code/images/fifo.proto b/CRIU_code/images/fifo.proto new file mode 100644 index 0000000..f5b3283 --- 
/dev/null +++ b/CRIU_code/images/fifo.proto @@ -0,0 +1,7 @@ +syntax = "proto2"; + +message fifo_entry { + required uint32 id = 1; + required uint32 pipe_id = 2; + optional uint32 regf_id = 3; +} diff --git a/CRIU_code/images/file-lock.proto b/CRIU_code/images/file-lock.proto 
new file mode 100644 index 0000000..5dd8847 --- /dev/null +++ b/CRIU_code/images/file-lock.proto @@ -0,0 +1,10 @@ +syntax = "proto2"; + +message file_lock_entry { + required uint32 flag = 1; + required uint32 type = 2; + required int32 pid = 3; + required int32 fd = 4; + required int64 start = 5; + required int64 len = 6; +} diff --git a/CRIU_code/images/fown.proto b/CRIU_code/images/fown.proto new file mode 100644 index 0000000..9956b98 --- /dev/null +++ b/CRIU_code/images/fown.proto @@ -0,0 +1,9 @@ +syntax = "proto2"; + +message fown_entry { /* embedded as the "fown" field of the file messages that import fown.proto (eventfd, eventpoll, ext-file, fsnotify, packet-sock, pipe, regfile) */ + required uint32 uid = 1; + required uint32 euid = 2; + required uint32 signum = 3; + required uint32 pid_type = 4; + required uint32 pid = 5; +} diff --git a/CRIU_code/images/fs.proto b/CRIU_code/images/fs.proto new file mode 100644 index 0000000..5b940a1 --- /dev/null +++ b/CRIU_code/images/fs.proto @@ -0,0 +1,7 @@ +syntax = "proto2"; + +message fs_entry { + required uint32 cwd_id = 1; + required uint32 root_id = 2; + optional uint32 umask = 3; +} diff --git a/CRIU_code/images/fsnotify.proto b/CRIU_code/images/fsnotify.proto new file mode 100644 index 0000000..399a449 --- /dev/null +++ b/CRIU_code/images/fsnotify.proto @@ -0,0 +1,60 @@ +syntax = "proto2"; + +import "opts.proto"; +import "fh.proto"; +import "fown.proto"; + +message inotify_wd_entry { + required uint32 id = 1; + required uint64 i_ino = 2; + required uint32 mask = 3 [(criu).hex = true]; + required uint32 ignored_mask = 4 [(criu).hex = true]; + required uint32 s_dev = 5 [(criu).dev = true]; + required uint32 wd = 6; + required fh_entry f_handle = 7; +} + +message inotify_file_entry { + required uint32 id = 1; + required uint32 flags = 2 [(criu).hex = true]; + required fown_entry fown = 4; /* tag 3 is not used in this message */ + repeated inotify_wd_entry wd = 5; +} + +enum mark_type { /* type of 
fanotify_mark_entry.type */ + INODE = 1; + MOUNT = 2; +} + +message fanotify_inode_mark_entry { + required uint64 i_ino = 1; + required fh_entry f_handle = 2; +} + +message fanotify_mount_mark_entry { + required uint32 mnt_id = 1; + optional 
string path = 2; +} + +message fanotify_mark_entry { + required uint32 id = 1; + required mark_type type = 2; + + required uint32 mflags = 3 [(criu).hex = true]; + required uint32 mask = 4 [(criu).hex = true]; + required uint32 ignored_mask = 5 [(criu).hex = true]; + required uint32 s_dev = 6 [(criu).dev = true]; + + optional fanotify_inode_mark_entry ie = 7; /* presumably set for INODE marks -- TODO confirm */ + optional fanotify_mount_mark_entry me = 8; /* presumably set for MOUNT marks -- TODO confirm */ +} + +message fanotify_file_entry { + required uint32 id = 1; + required uint32 flags = 2 [(criu).hex = true]; + required fown_entry fown = 3; + + required uint32 faflags = 4 [(criu).hex = true]; + required uint32 evflags = 5 [(criu).hex = true]; + repeated fanotify_mark_entry mark = 6; +} diff --git a/CRIU_code/images/ghost-file.proto b/CRIU_code/images/ghost-file.proto new file mode 100644 index 0000000..eda4664 --- /dev/null +++ b/CRIU_code/images/ghost-file.proto @@ -0,0 +1,23 @@ +syntax = "proto2"; + +import "opts.proto"; +import "time.proto"; + +message ghost_file_entry { /* uid, gid and mode are required; everything else is optional */ + required uint32 uid = 1; + required uint32 gid = 2; + required uint32 mode = 3; + + optional uint32 dev = 4 [(criu).dev = true]; + optional uint64 ino = 5; + optional uint32 rdev = 6 [(criu).dev = true, (criu).odev = true]; + optional timeval atim = 7; + optional timeval mtim = 8; + optional bool chunks = 9; + optional uint64 size = 10; +} + +message ghost_chunk_entry { + required uint64 len = 1; + required uint64 off = 2; +} diff --git a/CRIU_code/images/google/protobuf/descriptor.proto b/CRIU_code/images/google/protobuf/descriptor.proto new file mode 120000 index 0000000..07a4c9a --- /dev/null +++ b/CRIU_code/images/google/protobuf/descriptor.proto @@ -0,0 +1 @@ +/usr/include/google/protobuf/descriptor.proto \ No newline at end of file diff --git a/CRIU_code/images/inventory.proto 
b/CRIU_code/images/inventory.proto new file mode 100644 index 0000000..7bc2b0c --- /dev/null +++ b/CRIU_code/images/inventory.proto @@ -0,0 +1,19 @@ +syntax = "proto2"; + +import "core.proto"; + +enum lsmtype { 
+ NO_LSM = 0; + SELINUX = 1; + APPARMOR = 2; +} + +message inventory_entry { + required uint32 img_version = 1; + optional bool fdinfo_per_id = 2; + optional task_kobj_ids_entry root_ids = 3; /* message type comes from the imported core.proto */ + optional bool ns_per_id = 4; + optional uint32 root_cg_set = 5; + optional lsmtype lsmtype = 6; + optional uint64 dump_uptime = 8; /* tag 7 is not used in this message */ +} diff --git a/CRIU_code/images/ipc-desc.proto b/CRIU_code/images/ipc-desc.proto new file mode 100644 index 0000000..b400bd7 --- /dev/null +++ b/CRIU_code/images/ipc-desc.proto @@ -0,0 +1,11 @@ +syntax = "proto2"; + +message ipc_desc_entry { /* embedded as the required "desc" field of ipc_msg_entry, ipc_sem_entry and ipc_shm_entry */ + required uint32 key = 1; + required uint32 uid = 2; + required uint32 gid = 3; + required uint32 cuid = 4; + required uint32 cgid = 5; + required uint32 mode = 6; + required uint32 id = 7; +} diff --git a/CRIU_code/images/ipc-msg.proto b/CRIU_code/images/ipc-msg.proto new file mode 100644 index 0000000..5260ea8 --- /dev/null +++ b/CRIU_code/images/ipc-msg.proto @@ -0,0 +1,14 @@ +syntax = "proto2"; + +import "ipc-desc.proto"; + +message ipc_msg { + required uint64 mtype = 1; + required uint32 msize = 2; +} + +message ipc_msg_entry { + required ipc_desc_entry desc = 1; + required uint32 qbytes = 2; + required uint32 qnum = 3; +} diff --git a/CRIU_code/images/ipc-sem.proto b/CRIU_code/images/ipc-sem.proto new file mode 100644 index 0000000..bffb581 --- /dev/null +++ b/CRIU_code/images/ipc-sem.proto @@ -0,0 +1,8 @@ +syntax = "proto2"; + +import "ipc-desc.proto"; + +message ipc_sem_entry { + required ipc_desc_entry desc = 1; + required uint32 nsems = 2; +} diff --git a/CRIU_code/images/ipc-shm.proto b/CRIU_code/images/ipc-shm.proto new file mode 100644 index 0000000..31e172e --- /dev/null +++ b/CRIU_code/images/ipc-shm.proto @@ -0,0 +1,9 @@ +syntax = "proto2"; + +import "ipc-desc.proto"; + +message ipc_shm_entry { + 
required ipc_desc_entry desc = 1; + required uint64 size = 2; + optional bool in_pagemaps = 3; +} diff --git a/CRIU_code/images/ipc-var.proto b/CRIU_code/images/ipc-var.proto new file mode 100644 
index 0000000..f46fcde --- /dev/null +++ b/CRIU_code/images/ipc-var.proto @@ -0,0 +1,21 @@ +syntax = "proto2"; + +message ipc_var_entry { /* fields 2-12 are required; sem_ctls is a list and fields 13-17 are optional */ + repeated uint32 sem_ctls = 1; + required uint32 msg_ctlmax = 2; + required uint32 msg_ctlmnb = 3; + required uint32 msg_ctlmni = 4; + required uint32 auto_msgmni = 5; + required uint64 shm_ctlmax = 6; + required uint64 shm_ctlall = 7; + required uint32 shm_ctlmni = 8; + required uint32 shm_rmid_forced = 9; + required uint32 mq_queues_max = 10; + required uint32 mq_msg_max = 11; + required uint32 mq_msgsize_max = 12; + optional uint32 mq_msg_default = 13; + optional uint32 mq_msgsize_default = 14; + optional uint32 msg_next_id = 15; + optional uint32 sem_next_id = 16; + optional uint32 shm_next_id = 17; +} diff --git a/CRIU_code/images/macvlan.proto b/CRIU_code/images/macvlan.proto new file mode 100644 index 0000000..0ca2652 --- /dev/null +++ b/CRIU_code/images/macvlan.proto @@ -0,0 +1,6 @@ +syntax = "proto2"; + +message macvlan_link_entry { /* referenced as the optional macvlan field of net_device_entry in netdev.proto */ + required uint32 mode = 1; + optional uint32 flags = 2; +} diff --git a/CRIU_code/images/mm.proto b/CRIU_code/images/mm.proto new file mode 100644 index 0000000..e0f14c6 --- /dev/null +++ b/CRIU_code/images/mm.proto @@ -0,0 +1,33 @@ +syntax = "proto2"; + +import "opts.proto"; +import "vma.proto"; + +message aio_ring_entry { /* element type of mm_entry.aios */ + required uint64 id = 1; + required uint32 nr_req = 2; + required uint32 ring_len = 3; +} + +message mm_entry { + required uint64 mm_start_code = 1 [(criu).hex = true]; + required uint64 mm_end_code = 2 [(criu).hex = true]; + required uint64 mm_start_data = 3 [(criu).hex = true]; + required uint64 mm_end_data = 4 [(criu).hex = true]; + required uint64 mm_start_stack = 5 [(criu).hex = true]; + required uint64 mm_start_brk = 6 [(criu).hex = true]; + required 
uint64 mm_brk = 7 [(criu).hex = true]; + required uint64 mm_arg_start = 8 [(criu).hex = true]; + required uint64 mm_arg_end = 9 [(criu).hex = true]; + required uint64 mm_env_start = 10 [(criu).hex = true]; + required 
uint64 mm_env_end = 11 [(criu).hex = true]; + required uint32 exe_file_id = 12; + + repeated uint64 mm_saved_auxv = 13; + + repeated vma_entry vmas = 14; + + optional int32 dumpable = 15; + repeated aio_ring_entry aios = 16; + optional bool thp_disabled = 17; +} diff --git a/CRIU_code/images/mnt.proto b/CRIU_code/images/mnt.proto new file mode 100644 index 0000000..4160acb --- /dev/null +++ b/CRIU_code/images/mnt.proto @@ -0,0 +1,58 @@ +syntax = "proto2"; + +import "opts.proto"; + +enum fstype { + UNSUPPORTED = 0; + PROC = 1; + SYSFS = 2; + DEVTMPFS = 3; + BINFMT_MISC = 4; + TMPFS = 5; + DEVPTS = 6; + SIMFS = 7; + PSTORE = 8; + SECURITYFS = 9; + FUSECTL = 10; + DEBUGFS = 11; + CGROUP = 12; + AUFS = 13; + MQUEUE = 14; + FUSE = 15; + AUTO = 16; + OVERLAYFS = 17; + AUTOFS = 18; + TRACEFS = 19; + + /* These three values (20-22) are reserved for NFS support */ + // RPC_PIPEFS = 20; + // NFS = 21; + // NFS4 = 22; +}; + +message mnt_entry { + required uint32 fstype = 1; /* declared as plain uint32 rather than the fstype enum above; presumably carries one of its values -- TODO confirm */ + required uint32 mnt_id = 2; + required uint32 root_dev = 3 [(criu).dev = true]; + required uint32 parent_mnt_id = 4; + required uint32 flags = 5 [(criu).hex = true]; + + required string root = 6; + required string mountpoint = 7; + required string source = 8; + required string options = 9; + + optional uint32 shared_id = 10; + optional uint32 master_id = 11; + + optional bool with_plugin = 12; + optional bool ext_mount = 13; + + optional string fsname = 14; + optional bool internal_sharing = 15; + + optional bool deleted = 16; + optional uint32 sb_flags = 17 [(criu).hex = true]; + /* user defined mapping for external mount */ + optional string ext_key = 18; +} diff --git a/CRIU_code/images/netdev.proto b/CRIU_code/images/netdev.proto new file mode 100644 index 0000000..476a92c --- /dev/null +++ b/CRIU_code/images/netdev.proto @@ -0,0 +1,74 @@ +syntax = "proto2"; + +import 
"macvlan.proto"; +import "opts.proto"; +import "tun.proto"; +import "sysctl.proto"; +import "sit.proto"; + +enum nd_type { + LOOPBACK = 1; + VETH = 
2; + TUN = 3; + /* + * External link -- for these, CRIU only dumps and restores + * link parameters such as flags, address, MTU, etc. The + * existence of the link on restore should be provided + * by the setup-namespaces script. + */ + EXTLINK = 4; + VENET = 5; /* OpenVZ device */ + BRIDGE = 6; + MACVLAN = 7; + SIT = 8; +} + +message net_device_entry { /* type, ifindex, mtu, flags and name are required; the remaining fields are optional or repeated */ + required nd_type type = 1; + required uint32 ifindex = 2; + required uint32 mtu = 3; + required uint32 flags = 4 [(criu).hex = true]; + required string name = 5; + + optional tun_link_entry tun = 6; + + optional bytes address = 7; + + repeated int32 conf = 8; + + repeated sysctl_entry conf4 = 9; + + repeated sysctl_entry conf6 = 10; + + optional macvlan_link_entry macvlan = 11; + + optional uint32 peer_ifindex = 12; + optional uint32 peer_nsid = 13; + optional uint32 master = 14; + optional sit_entry sit = 15; +} + +message netns_id { + /* This is CRIU's id which is allocated for each namespace */ + required uint32 target_ns_id = 1; + /* + * This is an id which can be used to address this namespace + * from another network namespace. Each network namespace has + * one set of id-s for other namespaces. 
+ */ + required int32 netnsid_value = 2; +} + +message netns_entry { + repeated int32 def_conf = 1; + repeated int32 all_conf = 2; + + repeated sysctl_entry def_conf4 = 3; + repeated sysctl_entry all_conf4 = 4; + + repeated sysctl_entry def_conf6 = 5; + repeated sysctl_entry all_conf6 = 6; + + repeated netns_id nsids = 7; + optional string ext_key = 8; +} diff --git a/CRIU_code/images/ns.proto b/CRIU_code/images/ns.proto new file mode 100644 index 0000000..5ff0001 --- /dev/null +++ b/CRIU_code/images/ns.proto @@ -0,0 +1,8 @@ +syntax = "proto2"; + +message ns_file_entry { + required uint32 id = 1; + required uint32 ns_id = 2; + required uint32 ns_cflag = 3; + required uint32 flags = 4; +} diff --git a/CRIU_code/images/opts.proto b/CRIU_code/images/opts.proto new file mode 100644 index 0000000..70c7fd4 --- /dev/null +++ b/CRIU_code/images/opts.proto @@ -0,0 +1,18 @@ +syntax = "proto2"; + +import "google/protobuf/descriptor.proto"; + +message CRIU_Opts { + optional bool hex = 1; // Indicate that CRIT should treat this field as hex. + optional bool ipadd = 2; // The field is an IPv4/v6 address + optional string flags = 3; + optional bool dev = 4; // Device major:minor packed + optional bool odev = 5; // ... in old format + optional string dict = 6; + optional string conv = 7; +} + +extend google.protobuf.FieldOptions { + // Registered unique number to use for all kinds of custom options. 
+ optional CRIU_Opts criu = 1018; /* this is the option the [(criu).xxx = ...] annotations in the other .proto files set */ +} diff --git a/CRIU_code/images/packet-sock.proto b/CRIU_code/images/packet-sock.proto new file mode 100644 index 0000000..25875b4 --- /dev/null +++ b/CRIU_code/images/packet-sock.proto @@ -0,0 +1,47 @@ +syntax = "proto2"; + +import "opts.proto"; +import "fown.proto"; +import "sk-opts.proto"; + +message packet_mclist { /* element type of packet_sock_entry.mclist */ + required uint32 index = 1; + required uint32 type = 2; + required bytes addr = 3; +} + +message packet_ring { /* used for both rx_ring and tx_ring of packet_sock_entry */ + required uint32 block_size = 1; + required uint32 block_nr = 2; + required uint32 frame_size = 3; + required uint32 frame_nr = 4; + + required uint32 retire_tmo = 5; + required uint32 sizeof_priv = 6; + required uint32 features = 7; +} + +message packet_sock_entry { + required uint32 id = 1; + required uint32 type = 2; + required uint32 protocol = 3; + required uint32 flags = 4 [(criu).hex = true]; + required uint32 ifindex = 5; + + required fown_entry fown = 6; + required sk_opts_entry opts = 7; + + required uint32 version = 8; + required uint32 reserve = 9; + required bool aux_data = 10; + required bool orig_dev = 11; + required bool vnet_hdr = 12; + required bool loss = 13; + required uint32 timestamp = 14; + required uint32 copy_thresh = 15; + repeated packet_mclist mclist = 16; + optional uint32 fanout = 17 [ default = 0xffffffff ]; /* reads as 0xffffffff when the field is absent */ + optional packet_ring rx_ring = 18; + optional packet_ring tx_ring = 19; + optional uint32 ns_id = 20; +} diff --git a/CRIU_code/images/pagemap.proto b/CRIU_code/images/pagemap.proto new file mode 100644 index 0000000..42ed5eb --- /dev/null +++ b/CRIU_code/images/pagemap.proto @@ -0,0 +1,14 @@ +syntax = "proto2"; + +import "opts.proto"; + +message pagemap_head { + required uint32 pages_id = 1; +} + +message 
pagemap_entry { + required uint64 vaddr = 1 [(criu).hex = true]; + required uint32 nr_pages = 2; + optional bool in_parent = 3; + optional uint32 flags = 4 [(criu).flags = "pmap.flags" ]; +} diff --git a/CRIU_code/images/pipe-data.proto 
b/CRIU_code/images/pipe-data.proto new file mode 100644 index 0000000..78d53a8 --- /dev/null +++ b/CRIU_code/images/pipe-data.proto @@ -0,0 +1,7 @@ +syntax = "proto2"; + +message pipe_data_entry { + required uint32 pipe_id = 1; + required uint32 bytes = 2; + optional uint32 size = 3; +} diff --git a/CRIU_code/images/pipe.proto b/CRIU_code/images/pipe.proto new file mode 100644 index 0000000..a9a213b --- /dev/null +++ b/CRIU_code/images/pipe.proto @@ -0,0 +1,11 @@ +syntax = "proto2"; + +import "opts.proto"; +import "fown.proto"; + +message pipe_entry { /* referenced as file_entry.pipe in fdinfo.proto */ + required uint32 id = 1; + required uint32 pipe_id = 2; + required uint32 flags = 3 [(criu).hex = true]; + required fown_entry fown = 4; +} diff --git a/CRIU_code/images/pstree.proto b/CRIU_code/images/pstree.proto new file mode 100644 index 0000000..23e88aa --- /dev/null +++ b/CRIU_code/images/pstree.proto @@ -0,0 +1,9 @@ +syntax = "proto2"; + +message pstree_entry { + required uint32 pid = 1; + required uint32 ppid = 2; + required uint32 pgid = 3; + required uint32 sid = 4; + repeated uint32 threads = 5; +} diff --git a/CRIU_code/images/regfile.proto b/CRIU_code/images/regfile.proto new file mode 100644 index 0000000..bc4c14d --- /dev/null +++ b/CRIU_code/images/regfile.proto @@ -0,0 +1,16 @@ +syntax = "proto2"; + +import "opts.proto"; +import "fown.proto"; + +message reg_file_entry { /* referenced as file_entry.reg in fdinfo.proto */ + required uint32 id = 1; + required uint32 flags = 2 [(criu).flags = "rfile.flags"]; + required uint64 pos = 3; + required fown_entry fown = 5; /* tag 4 is not used in this message */ + required string name = 6; + optional sint32 mnt_id = 7 [default = -1]; /* reads as -1 when the field is absent */ + optional uint64 size = 8; + optional bool ext = 9; + optional uint32 mode = 10; +} diff --git a/CRIU_code/images/remap-file-path.proto b/CRIU_code/images/remap-file-path.proto new file mode 100644 index 0000000..3cc78a2 --- 
/dev/null +++ b/CRIU_code/images/remap-file-path.proto @@ -0,0 +1,16 @@ +syntax = "proto2"; + +enum remap_type { + LINKED = 0; + GHOST = 1; + PROCFS = 2; + // Values 3 and 4 are reserved for the spfs manager + // SPFS = 
3; + // SPFS_LINKED = 4; +}; + +message remap_file_path_entry { + required uint32 orig_id = 1; + required uint32 remap_id = 2; + optional remap_type remap_type = 3; +} diff --git a/CRIU_code/images/remote-image.proto b/CRIU_code/images/remote-image.proto new file mode 100644 index 0000000..f6b8150 --- /dev/null +++ b/CRIU_code/images/remote-image.proto @@ -0,0 +1,22 @@ +syntax = "proto2"; + +message local_image_entry { + required string name = 1; + required string snapshot_id = 2; + required uint32 open_mode = 3; +} + +message remote_image_entry { /* same three fields as local_image_entry plus size */ + required string name = 1; + required string snapshot_id = 2; + required uint32 open_mode = 3; + required uint64 size = 4; +} + +message local_image_reply_entry { + required uint32 error = 1; +} + +message snapshot_id_entry { + required string snapshot_id = 1; +} diff --git a/CRIU_code/images/rlimit.proto b/CRIU_code/images/rlimit.proto new file mode 100644 index 0000000..773a8df --- /dev/null +++ b/CRIU_code/images/rlimit.proto @@ -0,0 +1,6 @@ +syntax = "proto2"; + +message rlimit_entry { /* element type of task_rlimits_entry.rlimits in core.proto */ + required uint64 cur = 1; + required uint64 max = 2; +} diff --git a/CRIU_code/images/rpc.proto b/CRIU_code/images/rpc.proto new file mode 100644 index 0000000..15e677a --- /dev/null +++ b/CRIU_code/images/rpc.proto @@ -0,0 +1,227 @@ +syntax = "proto2"; + +message criu_page_server_info { /* referenced as criu_opts.ps */ + optional string address = 1; + optional int32 port = 2; + optional int32 pid = 3; + optional int32 fd = 4; +} + +message criu_veth_pair { /* element type of criu_opts.veths */ + required string if_in = 1; + required string if_out = 2; +}; + +message ext_mount_map { /* element type of criu_opts.ext_mnt */ + required string key = 1; + required string val = 2; +}; + +message join_namespace { /* element type of criu_opts.join_ns */ + required string ns = 1; + required string ns_file = 2; + 
optional string extra_opt = 3; +} + +message inherit_fd { /* element type of criu_opts.inherit_fd */ + required string key = 1; + required int32 fd = 2; +}; + +message cgroup_root { /* element type of criu_opts.cg_root */ + optional string ctrl = 1; + required string path = 2; +}; + +message unix_sk { /* element type of criu_opts.unix_sk_ino */ + required uint32 inode = 1; +}; + +enum criu_cg_mode { + IGNORE 
= 0; + CG_NONE = 1; + PROPS = 2; + SOFT = 3; + FULL = 4; + STRICT = 5; + DEFAULT = 6; +}; + +message criu_opts { + required int32 images_dir_fd = 1; + optional int32 pid = 2; /* if not set on dump, will dump requesting process */ + + optional bool leave_running = 3; + optional bool ext_unix_sk = 4; + optional bool tcp_established = 5; + optional bool evasive_devices = 6; + optional bool shell_job = 7; + optional bool file_locks = 8; + optional int32 log_level = 9 [default = 2]; + optional string log_file = 10; /* No subdirs are allowed. Consider using work-dir */ + + optional criu_page_server_info ps = 11; + + optional bool notify_scripts = 12; + + optional string root = 13; + optional string parent_img = 14; + optional bool track_mem = 15; + optional bool auto_dedup = 16; + + optional int32 work_dir_fd = 17; + optional bool link_remap = 18; + repeated criu_veth_pair veths = 19; /* DEPRECATED, use external instead */ + + optional uint32 cpu_cap = 20 [default = 0xffffffff]; + optional bool force_irmap = 21; + repeated string exec_cmd = 22; + + repeated ext_mount_map ext_mnt = 23; /* DEPRECATED, use external instead */ + optional bool manage_cgroups = 24; /* backward compatibility */ + repeated cgroup_root cg_root = 25; + + optional bool rst_sibling = 26; /* swrk only */ + repeated inherit_fd inherit_fd = 27; /* swrk only */ + + optional bool auto_ext_mnt = 28; + optional bool ext_sharing = 29; + optional bool ext_masters = 30; + + repeated string skip_mnt = 31; + repeated string enable_fs = 32; + + repeated unix_sk unix_sk_ino = 33; /* DEPRECATED, use external instead */ + + optional criu_cg_mode manage_cgroups_mode = 34; + optional uint32 ghost_limit = 35 [default = 0x100000]; + repeated string irmap_scan_paths = 36; + repeated string external = 37; + optional uint32 empty_ns = 38; + repeated join_namespace join_ns = 39; + + optional string cgroup_props = 41; + optional string cgroup_props_file = 42; + repeated string cgroup_dump_controller = 43; + + optional 
string freeze_cgroup = 44; + optional uint32 timeout = 45; + optional bool tcp_skip_in_flight = 46; + optional bool weak_sysctls = 47; + optional bool lazy_pages = 48; + optional int32 status_fd = 49; + optional bool orphan_pts_master = 50; + optional string config_file = 51; + optional bool tcp_close = 52; + optional string lsm_profile = 53; + optional string tls_cacert = 54; + optional string tls_cacrl = 55; + optional string tls_cert = 56; + optional string tls_key = 57; + optional bool tls = 58; + optional bool tls_no_cn_verify = 59; +/* optional bool check_mounts = 128; */ +} + +message criu_dump_resp { + optional bool restored = 1; +} + +message criu_restore_resp { + required int32 pid = 1; +} + +message criu_notify { + optional string script = 1; + optional int32 pid = 2; +} + +enum criu_req_type { + EMPTY = 0; + DUMP = 1; + RESTORE = 2; + CHECK = 3; + PRE_DUMP = 4; + PAGE_SERVER = 5; + + NOTIFY = 6; + + CPUINFO_DUMP = 7; + CPUINFO_CHECK = 8; + + FEATURE_CHECK = 9; + + VERSION = 10; + + WAIT_PID = 11; + PAGE_SERVER_CHLD = 12; +} + +/* + * List of features which can be queried via + * CRIU_REQ_TYPE__FEATURE_CHECK + */ +message criu_features { + optional bool mem_track = 1; + optional bool lazy_pages = 2; +} + +/* + * Request -- each type corresponds to must-be-there + * request arguments of respective type + */ + +message criu_req { + required criu_req_type type = 1; + + optional criu_opts opts = 2; + optional bool notify_success = 3; + + /* + * When set, the service won't close the connection but + * will wait for more req-s to appear. Does not + * work for all request types. + */ + optional bool keep_open = 4; + /* + * 'features' can be used to query which features + * are supported by the installed criu/kernel + * via RPC.
+ */ + optional criu_features features = 5; + + /* 'pid' is used for WAIT_PID */ + optional uint32 pid = 6; +} + +/* + * Response -- it states whether the request was served + * and additional request-specific information + */ + +message criu_resp { + required criu_req_type type = 1; + required bool success = 2; + + optional criu_dump_resp dump = 3; + optional criu_restore_resp restore = 4; + optional criu_notify notify = 5; + optional criu_page_server_info ps = 6; + + optional int32 cr_errno = 7; + optional criu_features features = 8; + optional string cr_errmsg = 9; + optional criu_version version = 10; + + optional int32 status = 11; +} + +/* Answer for criu_req_type.VERSION requests */ +message criu_version { + required int32 major_number = 1; + required int32 minor_number = 2; + optional string gitid = 3; + optional int32 sublevel = 4; + optional int32 extra = 5; + optional string name = 6; +} diff --git a/CRIU_code/images/sa.proto b/CRIU_code/images/sa.proto new file mode 100644 index 0000000..3bce0c4 --- /dev/null +++ b/CRIU_code/images/sa.proto @@ -0,0 +1,11 @@ +syntax = "proto2"; + +import "opts.proto"; + +message sa_entry { + required uint64 sigaction = 1 [(criu).hex = true]; + required uint64 flags = 2 [(criu).hex = true]; + required uint64 restorer = 3 [(criu).hex = true]; + required uint64 mask = 4 [(criu).hex = true]; + optional bool compat_sigaction = 5; +} diff --git a/CRIU_code/images/seccomp.proto b/CRIU_code/images/seccomp.proto new file mode 100644 index 0000000..177e5fd --- /dev/null +++ b/CRIU_code/images/seccomp.proto @@ -0,0 +1,11 @@ +syntax = "proto2"; + +message seccomp_filter { + required bytes filter = 1; + optional uint32 prev = 2; + optional uint32 flags = 3; +} + +message seccomp_entry { + repeated seccomp_filter seccomp_filters = 1; +} diff --git a/CRIU_code/images/siginfo.proto b/CRIU_code/images/siginfo.proto new file mode 100644 index 0000000..e0d141e --- /dev/null +++ b/CRIU_code/images/siginfo.proto @@ -0,0 +1,9 @@ +syntax = 
"proto2"; + +message siginfo_entry { + required bytes siginfo = 1; +} + +message signal_queue_entry { + repeated siginfo_entry signals = 1; +} diff --git a/CRIU_code/images/signalfd.proto b/CRIU_code/images/signalfd.proto new file mode 100644 index 0000000..31d0d9f --- /dev/null +++ b/CRIU_code/images/signalfd.proto @@ -0,0 +1,11 @@ +syntax = "proto2"; + +import "opts.proto"; +import "fown.proto"; + +message signalfd_entry { + required uint32 id = 1; + required uint32 flags = 2 [(criu).hex = true]; + required fown_entry fown = 3; + required uint64 sigmask = 4 [(criu).hex = true]; +}; diff --git a/CRIU_code/images/sit.proto b/CRIU_code/images/sit.proto new file mode 100644 index 0000000..7ca91cc --- /dev/null +++ b/CRIU_code/images/sit.proto @@ -0,0 +1,22 @@ +syntax = "proto2"; + +import "opts.proto"; + +message sit_entry { + optional uint32 link = 1; + repeated uint32 local = 2 [(criu).ipadd = true]; + repeated uint32 remote = 3 [(criu).ipadd = true]; + optional uint32 ttl = 4; + optional uint32 tos = 5; + optional bool pmtudisc = 6; + optional uint32 proto = 7; + optional uint32 flags = 8; + optional uint32 encap_type = 9; + optional uint32 encap_flags = 10; + optional uint32 encap_sport = 11; + optional uint32 encap_dport = 12; + optional uint32 rd_prefixlen = 13; + repeated uint32 rd_prefix = 14 [(criu).ipadd = true]; + optional uint32 relay_prefixlen = 15; + repeated uint32 relay_prefix = 16 [(criu).ipadd = true]; +}; diff --git a/CRIU_code/images/sk-inet.proto b/CRIU_code/images/sk-inet.proto new file mode 100644 index 0000000..75d565d --- /dev/null +++ b/CRIU_code/images/sk-inet.proto @@ -0,0 +1,53 @@ +syntax = "proto2"; + +import "opts.proto"; +import "fown.proto"; +import "sk-opts.proto"; + +message ip_opts_raw_entry { + optional bool hdrincl = 1; + optional bool nodefrag = 2; + optional bool checksum = 3; + repeated uint32 icmpv_filter = 4; +} + +message ip_opts_entry { + optional bool freebind = 1; + // Fields 2 and 3 are reserved for vz7 use + optional 
ip_opts_raw_entry raw = 4; +} + +message inet_sk_entry { + /* + * We have two IDs here -- id and ino. The first one + * is used when restoring a socket behind a file descriptor. + * It is the id from the fdinfo image. The second one is used + * in sk-inet.c internally, in particular we identify + * a TCP stream to restore into this socket using the + * ino value. + */ + required uint32 id = 1; + required uint32 ino = 2; + required uint32 family = 3 [(criu).dict = "sk"]; + required uint32 type = 4 [(criu).dict = "sk"]; + required uint32 proto = 5 [(criu).dict = "sk"]; + required uint32 state = 6 [(criu).dict = "sk"]; + required uint32 src_port = 7; + required uint32 dst_port = 8; + required uint32 flags = 9 [(criu).hex = true]; + required uint32 backlog = 10; + + repeated uint32 src_addr = 11 [(criu).ipadd = true]; + repeated uint32 dst_addr = 12 [(criu).ipadd = true]; + + required fown_entry fown = 13; + required sk_opts_entry opts = 14; + optional bool v6only = 15; + optional ip_opts_entry ip_opts = 16; + + /* for ipv6, we need to send the ifindex to bind(); we keep the ifname + * here and convert it on restore */ + optional string ifname = 17; + optional uint32 ns_id = 18; + optional sk_shutdown shutdown = 19; +} diff --git a/CRIU_code/images/sk-netlink.proto b/CRIU_code/images/sk-netlink.proto new file mode 100644 index 0000000..97fa445 --- /dev/null +++ b/CRIU_code/images/sk-netlink.proto @@ -0,0 +1,22 @@ +syntax = "proto2"; + +import "opts.proto"; +import "fown.proto"; +import "sk-opts.proto"; + +message netlink_sk_entry { + required uint32 id = 1; + required uint32 ino = 2; + required uint32 protocol = 3; + required uint32 state = 4; + required uint32 flags = 6 [(criu).hex = true]; + required uint32 portid = 7; + repeated uint32 groups = 8; + required uint32 dst_portid = 9; + required uint32 dst_group = 10; + required fown_entry fown = 11; + required sk_opts_entry opts = 12; + optional uint32 ns_id = 13; + // For netlink queued messages + // optional nl_sk_opts_entry
nl_opts = 14; +} diff --git a/CRIU_code/images/sk-opts.proto b/CRIU_code/images/sk-opts.proto new file mode 100644 index 0000000..c93ec5f --- /dev/null +++ b/CRIU_code/images/sk-opts.proto @@ -0,0 +1,33 @@ +syntax = "proto2"; + +message sk_opts_entry { + required uint32 so_sndbuf = 1; + required uint32 so_rcvbuf = 2; + + required uint64 so_snd_tmo_sec = 3; + required uint64 so_snd_tmo_usec = 4; + required uint64 so_rcv_tmo_sec = 5; + required uint64 so_rcv_tmo_usec = 6; + optional bool reuseaddr = 7; + + optional uint32 so_priority = 8; + optional uint32 so_rcvlowat = 9; + optional uint32 so_mark = 10; + optional bool so_passcred = 11; + optional bool so_passsec = 12; + optional bool so_dontroute = 13; + optional bool so_no_check = 14; + + optional string so_bound_dev = 15; + + repeated fixed64 so_filter = 16; + optional bool so_reuseport = 17; + optional bool so_broadcast = 18; +} + +enum sk_shutdown { + NONE = 0; + READ = 1; + WRITE = 2; + BOTH = 3; +} diff --git a/CRIU_code/images/sk-packet.proto b/CRIU_code/images/sk-packet.proto new file mode 100644 index 0000000..e15dd38 --- /dev/null +++ b/CRIU_code/images/sk-packet.proto @@ -0,0 +1,16 @@ +syntax = "proto2"; + +message scm_entry { + required uint32 type = 1; + repeated uint32 rights = 2; +} + +message sk_packet_entry { + required uint32 id_for = 1; + required uint32 length = 2; + // Reserved for message address + // optional bytes addr = 3; + repeated scm_entry scm = 4; + // Reserved for ucred restore + // optional sk_ucred_entry ucred = 128; +} diff --git a/CRIU_code/images/sk-unix.proto b/CRIU_code/images/sk-unix.proto new file mode 100644 index 0000000..c59644f --- /dev/null +++ b/CRIU_code/images/sk-unix.proto @@ -0,0 +1,54 @@ +syntax = "proto2"; + +import "opts.proto"; +import "fown.proto"; +import "sk-opts.proto"; + +message file_perms_entry { + required uint32 mode = 1; + required uint32 uid = 2; + required uint32 gid = 3; +} + +message unix_sk_entry { + /* + * Few words about why we need both -- id 
and ino. + * + * The former one is used to link the file descriptor from + * the fdinfo image with the unix_sk_entry that should be + * opened under it. + * + * The latter one ties together unix peers -- the peer + * member on this structure is the ino of its peer + * and sometimes vice versa. + */ + required uint32 id = 1; + required uint32 ino = 2; + required uint32 type = 3 [(criu).dict = "sk"]; + required uint32 state = 4 [(criu).dict = "sk"]; + required uint32 flags = 5 [(criu).hex = true]; + required uint32 uflags = 6 [(criu).hex = true]; + required uint32 backlog = 7; + required uint32 peer = 8; + required fown_entry fown = 9; + required sk_opts_entry opts = 10; + + /* + * Abstract name may contain \0 at any point, + * so we need to carry it as byte sequence... + */ + required bytes name = 11 [(criu).conv = "unix_name"]; + + optional sk_shutdown shutdown = 12; + + optional file_perms_entry file_perms = 13; + + /* + * Relative socket name may have prefix. + */ + optional string name_dir = 14; + optional bool deleted = 15; + + optional uint32 ns_id = 16; + optional sint32 mnt_id = 17 [default = -1]; +} diff --git a/CRIU_code/images/stats.proto b/CRIU_code/images/stats.proto new file mode 100644 index 0000000..68d2f1b --- /dev/null +++ b/CRIU_code/images/stats.proto @@ -0,0 +1,38 @@ +syntax = "proto2"; + +// This one contains statistics about dump/restore process +message dump_stats_entry { + required uint32 freezing_time = 1; + required uint32 frozen_time = 2; + required uint32 memdump_time = 3; + required uint32 memwrite_time = 4; + + required uint64 pages_scanned = 5; + required uint64 pages_skipped_parent = 6; + required uint64 pages_written = 7; + + optional uint32 irmap_resolve = 8; + + required uint64 pages_lazy = 9; + optional uint64 page_pipes = 10; + optional uint64 page_pipe_bufs = 11; + + optional uint64 shpages_scanned = 12; + optional uint64 shpages_skipped_parent = 13; + optional uint64 shpages_written = 14; +} + +message restore_stats_entry { +
required uint64 pages_compared = 1; + required uint64 pages_skipped_cow = 2; + + required uint32 forking_time = 3; + required uint32 restore_time = 4; + + optional uint64 pages_restored = 5; +} + +message stats_entry { + optional dump_stats_entry dump = 1; + optional restore_stats_entry restore = 2; +} diff --git a/CRIU_code/images/sysctl.proto b/CRIU_code/images/sysctl.proto new file mode 100644 index 0000000..4ecdf27 --- /dev/null +++ b/CRIU_code/images/sysctl.proto @@ -0,0 +1,13 @@ +syntax = "proto2"; + +enum SysctlType { + CTL_STR = 5; + CTL_32 = 6; +} + +message sysctl_entry { + required SysctlType type = 1; + + optional int32 iarg = 2; + optional string sarg = 3; +} diff --git a/CRIU_code/images/tcp-stream.proto b/CRIU_code/images/tcp-stream.proto new file mode 100644 index 0000000..1740783 --- /dev/null +++ b/CRIU_code/images/tcp-stream.proto @@ -0,0 +1,27 @@ +syntax = "proto2"; + +import "opts.proto"; + +message tcp_stream_entry { + required uint32 inq_len = 1; + required uint32 inq_seq = 2; + required uint32 outq_len = 3; /* unsent and sent data in the send queue*/ + required uint32 outq_seq = 4; + + required uint32 opt_mask = 5 [(criu).hex = true]; /* TCPI_OPT_ bits */ + required uint32 snd_wscale = 6; + required uint32 mss_clamp = 7; + optional uint32 rcv_wscale = 8; + optional uint32 timestamp = 9; + + optional bool cork = 10; + optional bool nodelay = 11; + + optional uint32 unsq_len = 12; /* unsent data in the send queue */ + + optional uint32 snd_wl1 = 13; + optional uint32 snd_wnd = 14; + optional uint32 max_window = 15; + optional uint32 rcv_wnd = 16; + optional uint32 rcv_wup = 17; +} diff --git a/CRIU_code/images/time.proto b/CRIU_code/images/time.proto new file mode 100644 index 0000000..4bb2b94 --- /dev/null +++ b/CRIU_code/images/time.proto @@ -0,0 +1,6 @@ +syntax = "proto2"; + +message timeval { + required uint64 tv_sec = 1; + required uint64 tv_usec = 2; +} diff --git a/CRIU_code/images/timer.proto b/CRIU_code/images/timer.proto new file 
mode 100644 index 0000000..a254a6f --- /dev/null +++ b/CRIU_code/images/timer.proto @@ -0,0 +1,29 @@ +syntax = "proto2"; + +message itimer_entry { + required uint64 isec = 1; + required uint64 iusec = 2; + required uint64 vsec = 3; + required uint64 vusec = 4; +} + +message posix_timer_entry { + required uint32 it_id = 1; + required uint32 clock_id = 2; + required uint32 si_signo = 3; + required uint32 it_sigev_notify = 4; + required uint64 sival_ptr = 5; + required uint32 overrun = 6; + + required uint64 isec = 7; + required uint64 insec = 8; + required uint64 vsec = 9; + required uint64 vnsec = 10; +} + +message task_timers_entry { + required itimer_entry real = 1; + required itimer_entry virt = 2; + required itimer_entry prof = 3; + repeated posix_timer_entry posix = 4; +} diff --git a/CRIU_code/images/timerfd.proto b/CRIU_code/images/timerfd.proto new file mode 100644 index 0000000..2432815 --- /dev/null +++ b/CRIU_code/images/timerfd.proto @@ -0,0 +1,19 @@ +syntax = "proto2"; + +import "opts.proto"; +import "fown.proto"; + +message timerfd_entry { + required uint32 id = 1; + required uint32 flags = 2 [(criu).hex = true]; + required fown_entry fown = 3; + + required uint32 clockid = 4; + required uint64 ticks = 5; + required uint32 settime_flags = 6 [(criu).hex = true]; + + required uint64 vsec = 7; + required uint64 vnsec = 8; + required uint64 isec = 9; + required uint64 insec = 10; +} diff --git a/CRIU_code/images/tty.proto b/CRIU_code/images/tty.proto new file mode 100644 index 0000000..ed664ef --- /dev/null +++ b/CRIU_code/images/tty.proto @@ -0,0 +1,90 @@ +syntax = "proto2"; + +import "opts.proto"; +import "fown.proto"; + +message winsize_entry { + required uint32 ws_row = 1; + required uint32 ws_col = 2; + required uint32 ws_xpixel = 3; + required uint32 ws_ypixel = 4; +}; + +message termios_entry { + required uint32 c_iflag = 1; + required uint32 c_oflag = 2; + required uint32 c_cflag = 3; + required uint32 c_lflag = 4; + required uint32 c_line = 5; + 
required uint32 c_ispeed = 6; + required uint32 c_ospeed = 7; + + repeated uint32 c_cc = 8; +} + +message tty_pty_entry { + required uint32 index = 1; +} + +enum TtyType { + UNKNOWN = 0; + PTY = 1; + CONSOLE = 2; + VT = 3; + CTTY = 4; + EXT_TTY = 5; + SERIAL = 6; +} + +message tty_data_entry { + required uint32 tty_id = 1; + required bytes data = 2; + + // optional sint32 mnt_id = 3 [default = 0]; +} + +message tty_info_entry { + required uint32 id = 1; + + required TtyType type = 2; + + required bool locked = 3; /* Unix98 PTY only */ + required bool exclusive = 4; + required bool packet_mode = 5; /* Unix98 PTY only */ + + required uint32 sid = 6; + required uint32 pgrp = 7; + + /* + * Convenient for printing errors and such, with this + * device encoded we can figure out major and minor + * numbers. + */ + required uint32 rdev = 8; + + optional termios_entry termios = 9; + optional termios_entry termios_locked = 10; + optional winsize_entry winsize = 11; + + /* + * These are optional fields which presence depends on + * TTY type. 
+ */ + optional tty_pty_entry pty = 12; + optional uint32 dev = 13; + + optional uint32 uid = 14; + optional uint32 gid = 15; + + // optional sint32 mnt_id = 16 [default = 0]; +}; + +message tty_file_entry { + required uint32 id = 1; + required uint32 tty_info_id = 2; + + required uint32 flags = 3 [(criu).hex = true]; + required fown_entry fown = 4; + // optional sint32 mnt_id = 5 [default = 0]; + optional uint32 regf_id = 6; +} diff --git a/CRIU_code/images/tun.proto b/CRIU_code/images/tun.proto new file mode 100644 index 0000000..b70c9ed --- /dev/null +++ b/CRIU_code/images/tun.proto @@ -0,0 +1,18 @@ +syntax = "proto2"; + +import "opts.proto"; + +message tunfile_entry { + required uint32 id = 1; + optional string netdev = 2; + optional bool detached = 3; + optional uint32 ns_id = 4; +}; + +message tun_link_entry { + required uint32 flags = 1 [(criu).hex = true]; + required int32 owner = 2; + required int32 group = 3; + required uint32 vnethdr = 4; + required uint32 sndbuf = 5; +}; diff --git a/CRIU_code/images/userns.proto b/CRIU_code/images/userns.proto new file mode 100644 index 0000000..16be6b1 --- /dev/null +++ b/CRIU_code/images/userns.proto @@ -0,0 +1,12 @@ +syntax = "proto2"; + +message uid_gid_extent { + required uint32 first = 1; + required uint32 lower_first = 2; + required uint32 count = 3; +} + +message userns_entry { + repeated uid_gid_extent uid_map = 1; + repeated uid_gid_extent gid_map = 2; +} diff --git a/CRIU_code/images/utsns.proto b/CRIU_code/images/utsns.proto new file mode 100644 index 0000000..a29aea1 --- /dev/null +++ b/CRIU_code/images/utsns.proto @@ -0,0 +1,6 @@ +syntax = "proto2"; + +message utsns_entry { + required string nodename = 1; + required string domainname = 2; +} diff --git a/CRIU_code/images/vma.proto b/CRIU_code/images/vma.proto new file mode 100644 index 0000000..7085f42 --- /dev/null +++ b/CRIU_code/images/vma.proto @@ -0,0 +1,25 @@ +syntax = "proto2"; + +import "opts.proto"; + +message vma_entry { + required uint64 start 
= 1 [(criu).hex = true]; + required uint64 end = 2 [(criu).hex = true]; + required uint64 pgoff = 3; + required uint64 shmid = 4; + required uint32 prot = 5 [(criu).flags = "mmap.prot" ]; + required uint32 flags = 6 [(criu).flags = "mmap.flags" ]; + required uint32 status = 7 [(criu).flags = "mmap.status" ]; + /* + * This fd thing is unused in the image, it was lost + * while switching from execve restore model. It is + * -1 by default. + */ + required sint64 fd = 8; + + /* madvise flags bitmap */ + optional uint64 madv = 9 [(criu).hex = true]; + + /* file status flags */ + optional uint32 fdflags = 10 [(criu).hex = true]; +} diff --git a/CRIU_code/include/common/arch/aarch64/asm/atomic.h b/CRIU_code/include/common/arch/aarch64/asm/atomic.h new file mode 100644 index 0000000..11785c3 --- /dev/null +++ b/CRIU_code/include/common/arch/aarch64/asm/atomic.h @@ -0,0 +1,99 @@ +#ifndef __CR_ATOMIC_H__ +#define __CR_ATOMIC_H__ + +typedef struct { + int counter; +} atomic_t; + + +/* Copied from the Linux header arch/arm/include/asm/barrier.h */ + +#define smp_mb() asm volatile("dmb ish" : : : "memory") + + +/* Copied from the Linux kernel header arch/arm64/include/asm/atomic.h */ + +static inline int atomic_read(const atomic_t *v) +{ + return (*(volatile int *)&(v)->counter); +} + +static inline void atomic_set(atomic_t *v, int i) +{ + v->counter = i; +} + +#define atomic_get atomic_read + + +static inline int atomic_add_return(int i, atomic_t *v) +{ + unsigned long tmp; + int result; + + asm volatile( +"1: ldxr %w0, %2\n" +" add %w0, %w0, %w3\n" +" stlxr %w1, %w0, %2\n" +" cbnz %w1, 1b" + : "=&r" (result), "=&r" (tmp), "+Q" (v->counter) + : "Ir" (i) + : "cc", "memory"); + + smp_mb(); + return result; +} + +static inline int atomic_sub_return(int i, atomic_t *v) +{ + unsigned long tmp; + int result; + + asm volatile( +"1: ldxr %w0, %2\n" +" sub %w0, %w0, %w3\n" +" stlxr %w1, %w0, %2\n" +" cbnz %w1, 1b" + : "=&r" (result), "=&r" (tmp), "+Q" (v->counter) + : "Ir" (i) + : 
"cc", "memory"); + + smp_mb(); + return result; +} + +static inline int atomic_inc(atomic_t *v) { return atomic_add_return(1, v) - 1; } + +static inline int atomic_add(int val, atomic_t *v) { return atomic_add_return(val, v) - val; } + +static inline int atomic_dec(atomic_t *v) { return atomic_sub_return(1, v) + 1; } + +/* true if the result is 0, or false for all other cases. */ +#define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0) +#define atomic_dec_return(v) (atomic_sub_return(1, v)) + +#define atomic_inc_return(v) (atomic_add_return(1, v)) + +static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) +{ + unsigned long tmp; + int oldval; + + smp_mb(); + + asm volatile("// atomic_cmpxchg\n" +"1: ldxr %w1, %2\n" +" cmp %w1, %w3\n" +" b.ne 2f\n" +" stxr %w0, %w4, %2\n" +" cbnz %w0, 1b\n" +"2:" + : "=&r" (tmp), "=&r" (oldval), "+Q" (ptr->counter) + : "Ir" (old), "r" (new) + : "cc"); + + smp_mb(); + return oldval; +} + +#endif /* __CR_ATOMIC_H__ */ diff --git a/CRIU_code/include/common/arch/aarch64/asm/bitops.h b/CRIU_code/include/common/arch/aarch64/asm/bitops.h new file mode 100644 index 0000000..eb9aa62 --- /dev/null +++ b/CRIU_code/include/common/arch/aarch64/asm/bitops.h @@ -0,0 +1,9 @@ +#ifndef __CR_ASM_BITOPS_H__ +#define __CR_ASM_BITOPS_H__ + +#include "common/compiler.h" +#include "common/asm-generic/bitops.h" + +extern int test_and_set_bit(int nr, volatile unsigned long *p); + +#endif /* __CR_ASM_BITOPS_H__ */ diff --git a/CRIU_code/include/common/arch/aarch64/asm/bitsperlong.h b/CRIU_code/include/common/arch/aarch64/asm/bitsperlong.h new file mode 100644 index 0000000..d95727d --- /dev/null +++ b/CRIU_code/include/common/arch/aarch64/asm/bitsperlong.h @@ -0,0 +1,6 @@ +#ifndef __CR_BITSPERLONG_H__ +#define __CR_BITSPERLONG_H__ + +#define BITS_PER_LONG 64 + +#endif /* __CR_BITSPERLONG_H__ */ diff --git a/CRIU_code/include/common/arch/aarch64/asm/linkage.h b/CRIU_code/include/common/arch/aarch64/asm/linkage.h new file mode 100644 index 
0000000..7380642 --- /dev/null +++ b/CRIU_code/include/common/arch/aarch64/asm/linkage.h @@ -0,0 +1,24 @@ +#ifndef __CR_LINKAGE_H__ +#define __CR_LINKAGE_H__ + +#ifdef __ASSEMBLY__ + +#define __ALIGN .align 4, 0x00 +#define __ALIGN_STR ".align 4, 0x00" + +#define GLOBAL(name) \ + .globl name; \ + name: + +#define ENTRY(name) \ + .globl name; \ + .type name, #function; \ + __ALIGN; \ + name: + +#define END(sym) \ + .size sym, . - sym + +#endif /* __ASSEMBLY__ */ + +#endif /* __CR_LINKAGE_H__ */ diff --git a/CRIU_code/include/common/arch/aarch64/asm/page.h b/CRIU_code/include/common/arch/aarch64/asm/page.h new file mode 100644 index 0000000..bd8fe8f --- /dev/null +++ b/CRIU_code/include/common/arch/aarch64/asm/page.h @@ -0,0 +1,44 @@ +#ifndef __CR_ASM_PAGE_H__ +#define __CR_ASM_PAGE_H__ + +#define ARCH_HAS_LONG_PAGES + +#ifndef CR_NOGLIBC +#include /* ffsl() */ +#include /* _SC_PAGESIZE */ + +extern unsigned __page_size; +extern unsigned __page_shift; + +static inline unsigned page_size(void) +{ + if (!__page_size) + __page_size = sysconf(_SC_PAGESIZE); + return __page_size; +} + +static inline unsigned page_shift(void) +{ + if (!__page_shift) + __page_shift = (ffsl(page_size()) - 1); + return __page_shift; +} + +/* + * Don't add ifdefs for PAGE_SIZE: if any header defines it as a constant + * on aarch64, then we need to refrain from using PAGE_SIZE in criu and use + * page_size() across sources (as it may differ on aarch64).
+ */ +#define PAGE_SIZE page_size() +#define PAGE_MASK (~(PAGE_SIZE - 1)) +#define PAGE_SHIFT page_shift() + +#define PAGE_PFN(addr) ((addr) / PAGE_SIZE) + +#else /* CR_NOGLIBC */ + +extern unsigned page_size(void); +#define PAGE_SIZE page_size() + +#endif /* CR_NOGLIBC */ +#endif /* __CR_ASM_PAGE_H__ */ diff --git a/CRIU_code/include/common/arch/arm/asm/atomic.h b/CRIU_code/include/common/arch/arm/asm/atomic.h new file mode 100644 index 0000000..7998a20 --- /dev/null +++ b/CRIU_code/include/common/arch/arm/asm/atomic.h @@ -0,0 +1,133 @@ +#ifndef __CR_ATOMIC_H__ +#define __CR_ATOMIC_H__ + +#include "common/arch/arm/asm/processor.h" + +typedef struct { + int counter; +} atomic_t; + + +/* Copied from the Linux kernel header arch/arm/include/asm/atomic.h */ + +#if defined(CONFIG_ARMV7) + +#define smp_mb() __asm__ __volatile__ ("dmb" : : : "memory") + +static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) +{ + int oldval; + unsigned long res; + + smp_mb(); + prefetchw(&ptr->counter); + + do { + __asm__ __volatile__("@ atomic_cmpxchg\n" + "ldrex %1, [%3]\n" + "mov %0, #0\n" + "teq %1, %4\n" + "it eq\n" + "strexeq %0, %5, [%3]\n" + : "=&r" (res), "=&r" (oldval), "+Qo" (ptr->counter) + : "r" (&ptr->counter), "Ir" (old), "r" (new) + : "cc"); + } while (res); + + smp_mb(); + + return oldval; +} + +#elif defined(CONFIG_ARMV6) + +/* SMP isn't supported for ARMv6 */ + +#define smp_mb() __asm__ __volatile__ ("mcr p15, 0, %0, c7, c10, 5" : : "r" (0) : "memory") + +static inline int atomic_cmpxchg(atomic_t *v, int old, int new) +{ + int ret; + + ret = v->counter; + if (ret == old) + v->counter = new; + + return ret; +} + +#else + +#error ARM architecture version (CONFIG_ARMV*) not set or unsupported. 
+ +#endif + +static inline int atomic_read(const atomic_t *v) +{ + return (*(volatile int *)&(v)->counter); +} + +static inline void atomic_set(atomic_t *v, int i) +{ + v->counter = i; +} + +#define atomic_get atomic_read + +static inline int atomic_add_return(int i, atomic_t *v) +{ + unsigned long tmp; + int result; + + smp_mb(); + + __asm__ __volatile__("@ atomic_add_return\n" +"1: ldrex %0, [%3]\n" +" add %0, %0, %4\n" +" strex %1, %0, [%3]\n" +" teq %1, #0\n" +" bne 1b\n" + : "=&r" (result), "=&r" (tmp), "+Qo" (v->counter) + : "r" (&v->counter), "Ir" (i) + : "cc"); + + smp_mb(); + + return result; +} + +static inline int atomic_sub_return(int i, atomic_t *v) +{ + unsigned long tmp; + int result; + + smp_mb(); + + __asm__ __volatile__("@ atomic_sub_return\n" +"1: ldrex %0, [%3]\n" +" sub %0, %0, %4\n" +" strex %1, %0, [%3]\n" +" teq %1, #0\n" +" bne 1b\n" + : "=&r" (result), "=&r" (tmp), "+Qo" (v->counter) + : "r" (&v->counter), "Ir" (i) + : "cc"); + + smp_mb(); + + return result; +} + +static inline int atomic_inc(atomic_t *v) { return atomic_add_return(1, v) - 1; } + +static inline int atomic_add(int val, atomic_t *v) { return atomic_add_return(val, v) - val; } + +static inline int atomic_dec(atomic_t *v) { return atomic_sub_return(1, v) + 1; } + +/* true if the result is 0, or false for all other cases. 
*/ +#define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0) +#define atomic_dec_return(v) (atomic_sub_return(1, v)) + +#define atomic_inc_return(v) (atomic_add_return(1, v)) + +#endif /* __CR_ATOMIC_H__ */ diff --git a/CRIU_code/include/common/arch/arm/asm/bitops.h b/CRIU_code/include/common/arch/arm/asm/bitops.h new file mode 100644 index 0000000..eb9aa62 --- /dev/null +++ b/CRIU_code/include/common/arch/arm/asm/bitops.h @@ -0,0 +1,9 @@ +#ifndef __CR_ASM_BITOPS_H__ +#define __CR_ASM_BITOPS_H__ + +#include "common/compiler.h" +#include "common/asm-generic/bitops.h" + +extern int test_and_set_bit(int nr, volatile unsigned long *p); + +#endif /* __CR_ASM_BITOPS_H__ */ diff --git a/CRIU_code/include/common/arch/arm/asm/bitsperlong.h b/CRIU_code/include/common/arch/arm/asm/bitsperlong.h new file mode 100644 index 0000000..43858b7 --- /dev/null +++ b/CRIU_code/include/common/arch/arm/asm/bitsperlong.h @@ -0,0 +1,6 @@ +#ifndef __CR_BITSPERLONG_H__ +#define __CR_BITSPERLONG_H__ + +#define BITS_PER_LONG 32 + +#endif /* __CR_BITSPERLONG_H__ */ diff --git a/CRIU_code/include/common/arch/arm/asm/linkage.h b/CRIU_code/include/common/arch/arm/asm/linkage.h new file mode 100644 index 0000000..a93898b --- /dev/null +++ b/CRIU_code/include/common/arch/arm/asm/linkage.h @@ -0,0 +1,28 @@ +#ifndef __CR_LINKAGE_H__ +#define __CR_LINKAGE_H__ + +#ifdef __ASSEMBLY__ + +#define __ALIGN .align 4, 0x00 +#define __ALIGN_STR ".align 4, 0x00" + +#define GLOBAL(name) \ + .globl name; \ + name: + +#define ENTRY(name) \ + .globl name; \ + .type name, #function; \ + __ALIGN; \ + name: + +#define END(sym) \ + .size sym, . 
- sym + +#define ALIAS(sym_new, sym_old) \ + .globl sym_new; \ + .set sym_new, sym_old + +#endif /* __ASSEMBLY__ */ + +#endif /* __CR_LINKAGE_H__ */ diff --git a/CRIU_code/include/common/arch/arm/asm/page.h b/CRIU_code/include/common/arch/arm/asm/page.h new file mode 100644 index 0000000..1348355 --- /dev/null +++ b/CRIU_code/include/common/arch/arm/asm/page.h @@ -0,0 +1,19 @@ +#ifndef __CR_ASM_PAGE_H__ +#define __CR_ASM_PAGE_H__ + +#ifndef PAGE_SHIFT +# define PAGE_SHIFT 12 +#endif + +#ifndef PAGE_SIZE +# define PAGE_SIZE (1UL << PAGE_SHIFT) +#endif + +#ifndef PAGE_MASK +# define PAGE_MASK (~(PAGE_SIZE - 1)) +#endif + +#define PAGE_PFN(addr) ((addr) / PAGE_SIZE) +#define page_size() PAGE_SIZE + +#endif /* __CR_ASM_PAGE_H__ */ diff --git a/CRIU_code/include/common/arch/arm/asm/processor.h b/CRIU_code/include/common/arch/arm/asm/processor.h new file mode 100644 index 0000000..a390cfd --- /dev/null +++ b/CRIU_code/include/common/arch/arm/asm/processor.h @@ -0,0 +1,28 @@ +#ifndef __CR_PROCESSOR_H__ +#define __CR_PROCESSOR_H__ + +/* Copied from linux kernel arch/arm/include/asm/unified.h */ + +#define WASM(instr) #instr + +/* Copied from linux kernel arch/arm/include/asm/processor.h */ + +#define __ALT_SMP_ASM(smp, up) \ + "9998: " smp "\n" \ + " .pushsection \".alt.smp.init\", \"a\"\n" \ + " .long 9998b\n" \ + " " up "\n" \ + " .popsection\n" + +static inline void prefetchw(const void *ptr) +{ + __asm__ __volatile__( + ".arch_extension mp\n" + __ALT_SMP_ASM( + WASM(pldw) "\t%a0", + WASM(pld) "\t%a0" + ) + :: "p" (ptr)); +} + +#endif /* __CR_PROCESSOR_H__ */ diff --git a/CRIU_code/include/common/arch/ppc64/asm/atomic.h b/CRIU_code/include/common/arch/ppc64/asm/atomic.h new file mode 100644 index 0000000..4c64774 --- /dev/null +++ b/CRIU_code/include/common/arch/ppc64/asm/atomic.h @@ -0,0 +1,134 @@ +#ifndef __CR_ATOMIC_H__ +#define __CR_ATOMIC_H__ + +/* + * PowerPC atomic operations + * + * Copied from kernel header file arch/powerpc/include/asm/atomic.h + */ + +typedef 
struct { + int counter; +} atomic_t; + +#include "common/arch/ppc64/asm/cmpxchg.h" + +#define PPC_ATOMIC_ENTRY_BARRIER "lwsync \n" +#define PPC_ATOMIC_EXIT_BARRIER "sync \n" + +#define ATOMIC_INIT(i) { (i) } + +static __inline__ int atomic_read(const atomic_t *v) +{ + int t; + + __asm__ __volatile__("lwz%U1%X1 %0,%1" : "=r"(t) : "m"(v->counter)); + + return t; +} + +static __inline__ void atomic_set(atomic_t *v, int i) +{ + __asm__ __volatile__("stw%U0%X0 %1,%0" : "=m"(v->counter) : "r"(i)); +} + +#define ATOMIC_OP(op, asm_op) \ +static __inline__ void atomic_##op(int a, atomic_t *v) \ +{ \ + int t; \ + \ + __asm__ __volatile__( \ +"1: lwarx %0,0,%3 # atomic_" #op "\n" \ + #asm_op " %0,%2,%0\n" \ +" stwcx. %0,0,%3 \n" \ +" bne- 1b\n" \ + : "=&r" (t), "+m" (v->counter) \ + : "r" (a), "r" (&v->counter) \ + : "cc"); \ +} \ + +ATOMIC_OP(add, add) +ATOMIC_OP(sub, subf) + +#undef ATOMIC_OP + +static __inline__ void atomic_inc(atomic_t *v) +{ + int t; + + __asm__ __volatile__( +"1: lwarx %0,0,%2 # atomic_inc\n\ + addic %0,%0,1\n" +" stwcx. %0,0,%2 \n\ + bne- 1b" + : "=&r" (t), "+m" (v->counter) + : "r" (&v->counter) + : "cc", "xer"); +} + +static __inline__ int atomic_inc_return(atomic_t *v) +{ + int t; + + __asm__ __volatile__( + PPC_ATOMIC_ENTRY_BARRIER \ +"1: lwarx %0,0,%1 # atomic_inc_return\n\ + addic %0,%0,1\n" +" stwcx. %0,0,%1 \n\ + bne- 1b \n" \ + PPC_ATOMIC_EXIT_BARRIER + : "=&r" (t) + : "r" (&v->counter) + : "cc", "xer", "memory"); + + return t; +} + +/* + * atomic_inc_and_test - increment and test + * @v: pointer of type atomic_t + * + * Atomically increments @v by 1 + * and returns true if the result is zero, or false for all + * other cases. + */ + +static __inline__ void atomic_dec(atomic_t *v) +{ + int t; + + __asm__ __volatile__( +"1: lwarx %0,0,%2 # atomic_dec\n\ + addic %0,%0,-1\n" +" stwcx. 
%0,0,%2\n\ + bne- 1b" + : "=&r" (t), "+m" (v->counter) + : "r" (&v->counter) + : "cc", "xer"); +} + +static __inline__ int atomic_sub_return(int a, atomic_t *v) +{ + int t; + + __asm__ __volatile__( +" \nLWSYNC\n" +"1: lwarx %0,0,%2 # atomic_sub_return\n\ + subf %0,%1,%0\n" +" stwcx. %0,0,%2 \n\ + bne- 1b" +" \nsync\n" + : "=&r" (t) + : "r" (a), "r" (&v->counter) + : "cc", "memory"); + + return t; +} + +/* true if the result is 0, or false for all other cases. */ +#define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0) +#define atomic_dec_return(v) (atomic_sub_return(1, v)) + +#define atomic_cmpxchg(v, o, n) (cmpxchg(&((v)->counter), (o), (n))) + +#endif /* __CR_ATOMIC_H__ */ diff --git a/CRIU_code/include/common/arch/ppc64/asm/bitops.h b/CRIU_code/include/common/arch/ppc64/asm/bitops.h new file mode 100644 index 0000000..f9a327c --- /dev/null +++ b/CRIU_code/include/common/arch/ppc64/asm/bitops.h @@ -0,0 +1,215 @@ +#ifndef __CR_BITOPS_H__ +#define __CR_BITOPS_H__ +/* + * PowerPC atomic bit operations. + * + * Merged version by David Gibson <david@gibson.dropbear.id.au>. + * Based on ppc64 versions by: Dave Engebretsen, Todd Inglett, Don + * Reed, Pat McCarthy, Peter Bergner, Anton Blanchard. They + * originally took it from the ppc32 code. + * + * Within a word, bits are numbered LSB first. Lots of places make + * this assumption by directly testing bits with (val & (1<<nr)). + * This can cause confusion for large (> 1 word) bitmaps on a + * big-endian system because, unlike little endian, the number of each + * bit depends on the word size.
+ * + * The bitop functions are defined to work on unsigned longs, so for a + * ppc64 system the bits end up numbered: + * |63..............0|127............64|191...........128|255...........192| + * and on ppc32: + * |31.....0|63....32|95....64|127...96|159..128|191..160|223..192|255..224| + * + * There are a few little-endian macros used mostly for filesystem + * bitmaps, these work on similar bit arrays layouts, but + * byte-oriented: + * |7...0|15...8|23...16|31...24|39...32|47...40|55...48|63...56| + * + * The main difference is that bit 3-5 (64b) or 3-4 (32b) in the bit + * number field needs to be reversed compared to the big-endian bit + * fields. This can be achieved by XOR with 0x38 (64b) or 0x18 (32b). + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * -- + * Copied from the kernel file arch/powerpc/include/asm/bitops.h + */ + +#include "common/compiler.h" + +#include "common/asm/bitsperlong.h" + +#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) +#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG) + +#define DECLARE_BITMAP(name,bits) \ + unsigned long name[BITS_TO_LONGS(bits)] + +#define __stringify_in_c(...) #__VA_ARGS__ +#define stringify_in_c(...) 
__stringify_in_c(__VA_ARGS__) " " + +#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) +#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) + +/* PPC bit number conversion */ +#define PPC_BITLSHIFT(be) (BITS_PER_LONG - 1 - (be)) +#define PPC_BIT(bit) (1UL << PPC_BITLSHIFT(bit)) +#define PPC_BITMASK(bs, be) ((PPC_BIT(bs) - PPC_BIT(be)) | PPC_BIT(bs)) + +#define PPC_INST_LDARX 0x7c0000a8 +#define ___PPC_RA(a) (((a) & 0x1f) << 16) +#define ___PPC_RB(b) (((b) & 0x1f) << 11) +#define ___PPC_RS(s) (((s) & 0x1f) << 21) +#define __PPC_EH(eh) (((eh) & 0x1) << 0) +#define ___PPC_RT(t) ___PPC_RS(t) + +#define PPC_LDARX(t, a, b, eh) stringify_in_c(.long PPC_INST_LDARX | \ + ___PPC_RT(t) | ___PPC_RA(a) | \ + ___PPC_RB(b) | __PPC_EH(eh)) +#define PPC_LLARX(t, a, b, eh) PPC_LDARX(t, a, b, eh) + +/* Macro for generating the ***_bits() functions */ +#define DEFINE_BITOP(fn, op) \ +static __inline__ void fn(unsigned long mask, \ + volatile unsigned long *_p) \ +{ \ + unsigned long old; \ + unsigned long *p = (unsigned long *)_p; \ + __asm__ __volatile__ ( \ +"1: ldarx %0,0,%3\n" \ + stringify_in_c(op) "%0,%0,%2\n" \ + "stdcx. %0,0,%3\n" \ + "bne- 1b\n" \ + : "=&r" (old), "+m" (*p) \ + : "r" (mask), "r" (p) \ + : "cc", "memory"); \ +} + +DEFINE_BITOP(set_bits, or) +DEFINE_BITOP(clear_bits, andc) +DEFINE_BITOP(change_bits, xor) + +static __inline__ void set_bit(int nr, volatile unsigned long *addr) +{ + set_bits(BIT_MASK(nr), addr + BIT_WORD(nr)); +} + +static __inline__ void clear_bit(int nr, volatile unsigned long *addr) +{ + clear_bits(BIT_MASK(nr), addr + BIT_WORD(nr)); +} + +static __inline__ void change_bit(int nr, volatile unsigned long *addr) +{ + change_bits(BIT_MASK(nr), addr + BIT_WORD(nr)); +} + +static inline int test_bit(int nr, const volatile unsigned long *addr) +{ + return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); +} + +/* Like DEFINE_BITOP(), with changes to the arguments to 'op' and the output + * operands. 
*/ +#define DEFINE_TESTOP(fn, op, prefix, postfix, eh) \ +static __inline__ unsigned long fn( \ + unsigned long mask, \ + volatile unsigned long *_p) \ +{ \ + unsigned long old, t; \ + unsigned long *p = (unsigned long *)_p; \ + __asm__ __volatile__ ( \ + prefix \ +"1:" PPC_LLARX(%0,0,%3,eh) "\n" \ + stringify_in_c(op) "%1,%0,%2\n" \ + "stdcx. %1,0,%3\n" \ + "bne- 1b\n" \ + postfix \ + : "=&r" (old), "=&r" (t) \ + : "r" (mask), "r" (p) \ + : "cc", "memory"); \ + return (old & mask); \ +} + +DEFINE_TESTOP(test_and_set_bits, or, "\nLWSYNC\n", "\nsync\n", 0) + +static __inline__ int test_and_set_bit(unsigned long nr, + volatile unsigned long *addr) +{ + return test_and_set_bits(BIT_MASK(nr), addr + BIT_WORD(nr)) != 0; +} + +/* + * Return the zero-based bit position (LE, not IBM bit numbering) of + * the most significant 1-bit in a double word. + */ +static __inline__ __attribute__((const)) +int __ilog2(unsigned long x) +{ + int lz; + + asm ("cntlzd %0,%1" : "=r" (lz) : "r" (x)); + return BITS_PER_LONG - 1 - lz; +} + + +static __inline__ unsigned long __ffs(unsigned long x) +{ + return __ilog2(x & -x); +} + + +#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) +/* + * Find the next set bit in a memory region. 
+ */ +static inline +unsigned long find_next_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) +{ + const unsigned long *p = addr + BITOP_WORD(offset); + unsigned long result = offset & ~(BITS_PER_LONG-1); + unsigned long tmp; + + if (offset >= size) + return size; + size -= result; + offset %= BITS_PER_LONG; + if (offset) { + tmp = *(p++); + tmp &= (~0UL << offset); + if (size < BITS_PER_LONG) + goto found_first; + if (tmp) + goto found_middle; + size -= BITS_PER_LONG; + result += BITS_PER_LONG; + } + while (size & ~(BITS_PER_LONG-1)) { + if ((tmp = *(p++))) + goto found_middle; + result += BITS_PER_LONG; + size -= BITS_PER_LONG; + } + if (!size) + return result; + tmp = *p; + +found_first: + tmp &= (~0UL >> (BITS_PER_LONG - size)); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. */ +found_middle: + return result + __ffs(tmp); +} + +#define for_each_bit(i, bitmask) \ + for (i = find_next_bit(bitmask, sizeof(bitmask), 0); \ + i < sizeof(bitmask); \ + i = find_next_bit(bitmask, sizeof(bitmask), i + 1)) + + +#endif /* __CR_BITOPS_H__ */ diff --git a/CRIU_code/include/common/arch/ppc64/asm/bitsperlong.h b/CRIU_code/include/common/arch/ppc64/asm/bitsperlong.h new file mode 100644 index 0000000..d95727d --- /dev/null +++ b/CRIU_code/include/common/arch/ppc64/asm/bitsperlong.h @@ -0,0 +1,6 @@ +#ifndef __CR_BITSPERLONG_H__ +#define __CR_BITSPERLONG_H__ + +#define BITS_PER_LONG 64 + +#endif /* __CR_BITSPERLONG_H__ */ diff --git a/CRIU_code/include/common/arch/ppc64/asm/cmpxchg.h b/CRIU_code/include/common/arch/ppc64/asm/cmpxchg.h new file mode 100644 index 0000000..b93fbde --- /dev/null +++ b/CRIU_code/include/common/arch/ppc64/asm/cmpxchg.h @@ -0,0 +1,96 @@ +#ifndef __CR_CMPXCHG_H__ +#define __CR_CMPXCHG_H__ + +/* + * Copied from kernel header file arch/powerpc/include/asm/cmpxchg.h + */ + +#define PPC_ACQUIRE_BARRIER "isync \n" +#define PPC_RELEASE_BARRIER "lwsync \n" + +/* + * Compare and exchange - if *p == old, set 
it to new, + * and return the old value of *p. + */ + +static __always_inline unsigned long +__cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new) +{ + unsigned int prev; + + __asm__ __volatile__ ( + PPC_RELEASE_BARRIER \ +"1: lwarx %0,0,%2 # __cmpxchg_u32\n\ + cmpw 0,%0,%3\n\ + bne- 2f\n" +" stwcx. %4,0,%2\n\ + bne- 1b \n" \ + PPC_ACQUIRE_BARRIER + "\n\ +2:" + : "=&r" (prev), "+m" (*p) + : "r" (p), "r" (old), "r" (new) + : "cc", "memory"); + + return prev; +} + +static __always_inline unsigned long +__cmpxchg_u64(volatile unsigned long *p, unsigned long old, unsigned long new) +{ + unsigned long prev; + + __asm__ __volatile__ ( + PPC_RELEASE_BARRIER \ +"1: ldarx %0,0,%2 # __cmpxchg_u64\n\ + cmpd 0,%0,%3\n\ + bne- 2f\n\ + stdcx. %4,0,%2\n\ + bne- 1b \n" \ + PPC_ACQUIRE_BARRIER + "\n\ +2:" + : "=&r" (prev), "+m" (*p) + : "r" (p), "r" (old), "r" (new) + : "cc", "memory"); + + return prev; +} + +/* This function doesn't exist, so you'll get a linker error + if something tries to do an invalid cmpxchg(). 
*/ +#ifdef CR_DEBUG +static inline void __cmpxchg_called_with_bad_pointer(void) +{ + __asm__ __volatile__ ( + "1: twi 31,0,0 # trap\n" + " b 1b" + : : : "memory"); +} +#else +extern void __cmpxchg_called_with_bad_pointer(void); +#endif + +static __always_inline unsigned long +__cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, + unsigned int size) +{ + switch (size) { + case 4: + return __cmpxchg_u32(ptr, old, new); + case 8: + return __cmpxchg_u64(ptr, old, new); + } + __cmpxchg_called_with_bad_pointer(); + return old; +} + +#define cmpxchg(ptr, o, n) \ + ({ \ + __typeof__(*(ptr)) _o_ = (o); \ + __typeof__(*(ptr)) _n_ = (n); \ + (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_, \ + (unsigned long)_n_, sizeof(*(ptr))); \ + }) + +#endif /* __CR_CMPXCHG_H__ */ diff --git a/CRIU_code/include/common/arch/ppc64/asm/linkage.h b/CRIU_code/include/common/arch/ppc64/asm/linkage.h new file mode 100644 index 0000000..01a47ab --- /dev/null +++ b/CRIU_code/include/common/arch/ppc64/asm/linkage.h @@ -0,0 +1,301 @@ +/* + * Various PowerPc assembly definitions + * + * Copied from the kernel file arch/powerpc/include/asm/ppc_asm.h + * + * Copyright (C) 1995-1999 Gary Thomas, Paul Mackerras, Cort Dougan. + */ +#ifndef __CR_LINKAGE_H__ +#define __CR_LINKAGE_H__ + +#ifdef __ASSEMBLY__ + +#define GLOBAL(name) \ + .globl name; \ + name: + +#define ENTRY(name) \ + .globl name; \ + .type name, @function; \ + name: + +#define END(sym) \ + .size sym, . - sym + + +#define STACKFRAMESIZE 256 +#define __STK_REG(i) (112 + ((i)-14)*8) +#define STK_REG(i) __STK_REG(__REG_##i) + +/* The boring bits... */ + +/* Condition Register Bit Fields */ + +#define cr0 0 +#define cr1 1 +#define cr2 2 +#define cr3 3 +#define cr4 4 +#define cr5 5 +#define cr6 6 +#define cr7 7 + + +/* + * General Purpose Registers (GPRs) + * + * The lower case r0-r31 should be used in preference to the upper + * case R0-R31 as they provide more error checking in the assembler. 
+ * Use R0-31 only when really nessesary. + */ + +#define r0 %r0 +#define r1 %r1 +#define r2 %r2 +#define r3 %r3 +#define r4 %r4 +#define r5 %r5 +#define r6 %r6 +#define r7 %r7 +#define r8 %r8 +#define r9 %r9 +#define r10 %r10 +#define r11 %r11 +#define r12 %r12 +#define r13 %r13 +#define r14 %r14 +#define r15 %r15 +#define r16 %r16 +#define r17 %r17 +#define r18 %r18 +#define r19 %r19 +#define r20 %r20 +#define r21 %r21 +#define r22 %r22 +#define r23 %r23 +#define r24 %r24 +#define r25 %r25 +#define r26 %r26 +#define r27 %r27 +#define r28 %r28 +#define r29 %r29 +#define r30 %r30 +#define r31 %r31 + + +/* Floating Point Registers (FPRs) */ + +#define fr0 0 +#define fr1 1 +#define fr2 2 +#define fr3 3 +#define fr4 4 +#define fr5 5 +#define fr6 6 +#define fr7 7 +#define fr8 8 +#define fr9 9 +#define fr10 10 +#define fr11 11 +#define fr12 12 +#define fr13 13 +#define fr14 14 +#define fr15 15 +#define fr16 16 +#define fr17 17 +#define fr18 18 +#define fr19 19 +#define fr20 20 +#define fr21 21 +#define fr22 22 +#define fr23 23 +#define fr24 24 +#define fr25 25 +#define fr26 26 +#define fr27 27 +#define fr28 28 +#define fr29 29 +#define fr30 30 +#define fr31 31 + +/* AltiVec Registers (VPRs) */ + +#define vr0 0 +#define vr1 1 +#define vr2 2 +#define vr3 3 +#define vr4 4 +#define vr5 5 +#define vr6 6 +#define vr7 7 +#define vr8 8 +#define vr9 9 +#define vr10 10 +#define vr11 11 +#define vr12 12 +#define vr13 13 +#define vr14 14 +#define vr15 15 +#define vr16 16 +#define vr17 17 +#define vr18 18 +#define vr19 19 +#define vr20 20 +#define vr21 21 +#define vr22 22 +#define vr23 23 +#define vr24 24 +#define vr25 25 +#define vr26 26 +#define vr27 27 +#define vr28 28 +#define vr29 29 +#define vr30 30 +#define vr31 31 + +/* VSX Registers (VSRs) */ + +#define vsr0 0 +#define vsr1 1 +#define vsr2 2 +#define vsr3 3 +#define vsr4 4 +#define vsr5 5 +#define vsr6 6 +#define vsr7 7 +#define vsr8 8 +#define vsr9 9 +#define vsr10 10 +#define vsr11 11 +#define vsr12 12 +#define vsr13 13 
+#define vsr14 14 +#define vsr15 15 +#define vsr16 16 +#define vsr17 17 +#define vsr18 18 +#define vsr19 19 +#define vsr20 20 +#define vsr21 21 +#define vsr22 22 +#define vsr23 23 +#define vsr24 24 +#define vsr25 25 +#define vsr26 26 +#define vsr27 27 +#define vsr28 28 +#define vsr29 29 +#define vsr30 30 +#define vsr31 31 +#define vsr32 32 +#define vsr33 33 +#define vsr34 34 +#define vsr35 35 +#define vsr36 36 +#define vsr37 37 +#define vsr38 38 +#define vsr39 39 +#define vsr40 40 +#define vsr41 41 +#define vsr42 42 +#define vsr43 43 +#define vsr44 44 +#define vsr45 45 +#define vsr46 46 +#define vsr47 47 +#define vsr48 48 +#define vsr49 49 +#define vsr50 50 +#define vsr51 51 +#define vsr52 52 +#define vsr53 53 +#define vsr54 54 +#define vsr55 55 +#define vsr56 56 +#define vsr57 57 +#define vsr58 58 +#define vsr59 59 +#define vsr60 60 +#define vsr61 61 +#define vsr62 62 +#define vsr63 63 + +/* SPE Registers (EVPRs) */ + +#define evr0 0 +#define evr1 1 +#define evr2 2 +#define evr3 3 +#define evr4 4 +#define evr5 5 +#define evr6 6 +#define evr7 7 +#define evr8 8 +#define evr9 9 +#define evr10 10 +#define evr11 11 +#define evr12 12 +#define evr13 13 +#define evr14 14 +#define evr15 15 +#define evr16 16 +#define evr17 17 +#define evr18 18 +#define evr19 19 +#define evr20 20 +#define evr21 21 +#define evr22 22 +#define evr23 23 +#define evr24 24 +#define evr25 25 +#define evr26 26 +#define evr27 27 +#define evr28 28 +#define evr29 29 +#define evr30 30 +#define evr31 31 + +/* some stab codes */ +#define N_FUN 36 +#define N_RSYM 64 +#define N_SLINE 68 +#define N_SO 100 + +#define __REG_R0 0 +#define __REG_R1 1 +#define __REG_R2 2 +#define __REG_R3 3 +#define __REG_R4 4 +#define __REG_R5 5 +#define __REG_R6 6 +#define __REG_R7 7 +#define __REG_R8 8 +#define __REG_R9 9 +#define __REG_R10 10 +#define __REG_R11 11 +#define __REG_R12 12 +#define __REG_R13 13 +#define __REG_R14 14 +#define __REG_R15 15 +#define __REG_R16 16 +#define __REG_R17 17 +#define __REG_R18 18 +#define 
__REG_R19 19 +#define __REG_R20 20 +#define __REG_R21 21 +#define __REG_R22 22 +#define __REG_R23 23 +#define __REG_R24 24 +#define __REG_R25 25 +#define __REG_R26 26 +#define __REG_R27 27 +#define __REG_R28 28 +#define __REG_R29 29 +#define __REG_R30 30 +#define __REG_R31 31 + + + +#endif /* __ASSEMBLY__ */ + +#endif /* __CR_LINKAGE_H__ */ diff --git a/CRIU_code/include/common/arch/ppc64/asm/page.h b/CRIU_code/include/common/arch/ppc64/asm/page.h new file mode 100644 index 0000000..5107cb8 --- /dev/null +++ b/CRIU_code/include/common/arch/ppc64/asm/page.h @@ -0,0 +1,44 @@ +#ifndef __CR_ASM_PAGE_H__ +#define __CR_ASM_PAGE_H__ + +#define ARCH_HAS_LONG_PAGES + +#ifndef CR_NOGLIBC +#include <string.h> /* ffsl() */ +#include <unistd.h> /* _SC_PAGESIZE */ + +extern unsigned __page_size; +extern unsigned __page_shift; + +static inline unsigned page_size(void) +{ + if (!__page_size) + __page_size = sysconf(_SC_PAGESIZE); + return __page_size; +} + +static inline unsigned page_shift(void) +{ + if (!__page_shift) + __page_shift = (ffsl(page_size()) - 1); + return __page_shift; +} + +/* + * Don't add ifdefs for PAGE_SIZE: if any header defines it as a constant + * on ppc64, then we need to refrain from using PAGE_SIZE in criu and use + * page_size() across sources (as it may differ on ppc64).
+ */ +#define PAGE_SIZE page_size() +#define PAGE_MASK (~(PAGE_SIZE - 1)) +#define PAGE_SHIFT page_shift() + +#define PAGE_PFN(addr) ((addr) / PAGE_SIZE) + +#else /* CR_NOGLIBC */ + +extern unsigned page_size(void); +#define PAGE_SIZE page_size() + +#endif /* CR_NOGLIBC */ +#endif /* __CR_ASM_PAGE_H__ */ diff --git a/CRIU_code/include/common/arch/riscv/.keep b/CRIU_code/include/common/arch/riscv/.keep new file mode 100644 index 0000000..e69de29 diff --git a/CRIU_code/include/common/arch/riscv/asm/.keep b/CRIU_code/include/common/arch/riscv/asm/.keep new file mode 100644 index 0000000..e69de29 diff --git a/CRIU_code/include/common/arch/riscv/asm/atomic.h b/CRIU_code/include/common/arch/riscv/asm/atomic.h new file mode 100644 index 0000000..778e437 --- /dev/null +++ b/CRIU_code/include/common/arch/riscv/asm/atomic.h @@ -0,0 +1,338 @@ +#ifndef __CR_ATOMIC_H +#define __CR_ATOMIC_H + +#define __atomic_acquire_fence() \ + __asm__ __volatile__(RISCV_ACQUIRE_BARRIER "" ::: "memory") + +#define __atomic_release_fence() \ + __asm__ __volatile__(RISCV_RELEASE_BARRIER "" ::: "memory") + +static __always_inline int atomic_read(const atomic_t *v) +{ + return READ_ONCE(v->counter); +} +static __always_inline void atomic_set(atomic_t *v, int i) +{ + WRITE_ONCE(v->counter, i); +} + +#ifndef CONFIG_GENERIC_ATOMIC64 +#define ATOMIC64_INIT(i) { (i) } +static __always_inline s64 atomic64_read(const atomic64_t *v) +{ + return READ_ONCE(v->counter); +} +static __always_inline void atomic64_set(atomic64_t *v, s64 i) +{ + WRITE_ONCE(v->counter, i); +} +#endif + +/* + * First, the atomic ops that have no ordering constraints and therefore don't + * have the AQ or RL bits set. These don't return anything, so there's only + * one version to worry about. + */ +#define ATOMIC_OP(op, asm_op, I, asm_type, c_type, prefix) \ +static __always_inline \ +void atomic##prefix##_##op(c_type i, atomic##prefix##_t *v) \ +{ \ + __asm__ __volatile__ ( \ + " amo" #asm_op "."
#asm_type " zero, %1, %0" \ + : "+A" (v->counter) \ + : "r" (I) \ + : "memory"); \ +} \ + +#ifdef CONFIG_GENERIC_ATOMIC64 +#define ATOMIC_OPS(op, asm_op, I) \ + ATOMIC_OP (op, asm_op, I, w, int, ) +#else +#define ATOMIC_OPS(op, asm_op, I) \ + ATOMIC_OP (op, asm_op, I, w, int, ) \ + ATOMIC_OP (op, asm_op, I, d, s64, 64) +#endif + +ATOMIC_OPS(add, add, i) +ATOMIC_OPS(sub, add, -i) +ATOMIC_OPS(and, and, i) +ATOMIC_OPS( or, or, i) +ATOMIC_OPS(xor, xor, i) + +#undef ATOMIC_OP +#undef ATOMIC_OPS + +/* + * Atomic ops that have ordered, relaxed, acquire, and release variants. + * There are two flavors of these: the arithmetic ops have both fetch and return + * versions, while the logical ops only have fetch versions. + */ +#define ATOMIC_FETCH_OP(op, asm_op, I, asm_type, c_type, prefix) \ +static __always_inline \ +c_type atomic##prefix##_fetch_##op##_relaxed(c_type i, \ + atomic##prefix##_t *v) \ +{ \ + register c_type ret; \ + __asm__ __volatile__ ( \ + " amo" #asm_op "." #asm_type " %1, %2, %0" \ + : "+A" (v->counter), "=r" (ret) \ + : "r" (I) \ + : "memory"); \ + return ret; \ +} \ +static __always_inline \ +c_type atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v) \ +{ \ + register c_type ret; \ + __asm__ __volatile__ ( \ + " amo" #asm_op "."
#asm_type ".aqrl %1, %2, %0" \ + : "+A" (v->counter), "=r" (ret) \ + : "r" (I) \ + : "memory"); \ + return ret; \ +} + +#define ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_type, c_type, prefix) \ +static __always_inline \ +c_type atomic##prefix##_##op##_return_relaxed(c_type i, \ + atomic##prefix##_t *v) \ +{ \ + return atomic##prefix##_fetch_##op##_relaxed(i, v) c_op I; \ +} \ +static __always_inline \ +c_type atomic##prefix##_##op##_return(c_type i, atomic##prefix##_t *v) \ +{ \ + return atomic##prefix##_fetch_##op(i, v) c_op I; \ +} + +#ifdef CONFIG_GENERIC_ATOMIC64 +#define ATOMIC_OPS(op, asm_op, c_op, I) \ + ATOMIC_FETCH_OP( op, asm_op, I, w, int, ) \ + ATOMIC_OP_RETURN(op, asm_op, c_op, I, w, int, ) +#else +#define ATOMIC_OPS(op, asm_op, c_op, I) \ + ATOMIC_FETCH_OP( op, asm_op, I, w, int, ) \ + ATOMIC_OP_RETURN(op, asm_op, c_op, I, w, int, ) \ + ATOMIC_FETCH_OP( op, asm_op, I, d, s64, 64) \ + ATOMIC_OP_RETURN(op, asm_op, c_op, I, d, s64, 64) +#endif + +ATOMIC_OPS(add, add, +, i) +ATOMIC_OPS(sub, add, +, -i) + +#define atomic_add_return_relaxed atomic_add_return_relaxed +#define atomic_sub_return_relaxed atomic_sub_return_relaxed +#define atomic_add_return atomic_add_return +#define atomic_sub_return atomic_sub_return + +#define atomic_fetch_add_relaxed atomic_fetch_add_relaxed +#define atomic_fetch_sub_relaxed atomic_fetch_sub_relaxed +#define atomic_fetch_add atomic_fetch_add +#define atomic_fetch_sub atomic_fetch_sub + +#ifndef CONFIG_GENERIC_ATOMIC64 +#define atomic64_add_return_relaxed atomic64_add_return_relaxed +#define atomic64_sub_return_relaxed atomic64_sub_return_relaxed +#define atomic64_add_return atomic64_add_return +#define atomic64_sub_return atomic64_sub_return + +#define atomic64_fetch_add_relaxed atomic64_fetch_add_relaxed +#define atomic64_fetch_sub_relaxed atomic64_fetch_sub_relaxed +#define atomic64_fetch_add atomic64_fetch_add +#define atomic64_fetch_sub atomic64_fetch_sub +#endif + +#undef ATOMIC_OPS + +#ifdef CONFIG_GENERIC_ATOMIC64 
+#define ATOMIC_OPS(op, asm_op, I) \ + ATOMIC_FETCH_OP(op, asm_op, I, w, int, ) +#else +#define ATOMIC_OPS(op, asm_op, I) \ + ATOMIC_FETCH_OP(op, asm_op, I, w, int, ) \ + ATOMIC_FETCH_OP(op, asm_op, I, d, s64, 64) +#endif + +ATOMIC_OPS(and, and, i) +ATOMIC_OPS( or, or, i) +ATOMIC_OPS(xor, xor, i) + +#define atomic_fetch_and_relaxed atomic_fetch_and_relaxed +#define atomic_fetch_or_relaxed atomic_fetch_or_relaxed +#define atomic_fetch_xor_relaxed atomic_fetch_xor_relaxed +#define atomic_fetch_and atomic_fetch_and +#define atomic_fetch_or atomic_fetch_or +#define atomic_fetch_xor atomic_fetch_xor + +#ifndef CONFIG_GENERIC_ATOMIC64 +#define atomic64_fetch_and_relaxed atomic64_fetch_and_relaxed +#define atomic64_fetch_or_relaxed atomic64_fetch_or_relaxed +#define atomic64_fetch_xor_relaxed atomic64_fetch_xor_relaxed +#define atomic64_fetch_and atomic64_fetch_and +#define atomic64_fetch_or atomic64_fetch_or +#define atomic64_fetch_xor atomic64_fetch_xor +#endif + +#undef ATOMIC_OPS + +#undef ATOMIC_FETCH_OP +#undef ATOMIC_OP_RETURN + +/* This is required to provide a full barrier on success. 
*/ +static __always_inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) +{ + int prev, rc; + + __asm__ __volatile__ ( + "0: lr.w %[p], %[c]\n" + " beq %[p], %[u], 1f\n" + " add %[rc], %[p], %[a]\n" + " sc.w.rl %[rc], %[rc], %[c]\n" + " bnez %[rc], 0b\n" + " fence rw, rw\n" + "1:\n" + : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) + : [a]"r" (a), [u]"r" (u) + : "memory"); + return prev; +} +#define atomic_fetch_add_unless atomic_fetch_add_unless + +#ifndef CONFIG_GENERIC_ATOMIC64 +static __always_inline s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) +{ + s64 prev; + long rc; + + __asm__ __volatile__ ( + "0: lr.d %[p], %[c]\n" + " beq %[p], %[u], 1f\n" + " add %[rc], %[p], %[a]\n" + " sc.d.rl %[rc], %[rc], %[c]\n" + " bnez %[rc], 0b\n" + " fence rw, rw\n" + "1:\n" + : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) + : [a]"r" (a), [u]"r" (u) + : "memory"); + return prev; +} +#define atomic64_fetch_add_unless atomic64_fetch_add_unless +#endif + +/* + * atomic_{cmp,}xchg is required to have exactly the same ordering semantics as + * {cmp,}xchg and the operations that return, so they need a full barrier. 
+ */ +#define ATOMIC_OP(c_t, prefix, size) \ +static __always_inline \ +c_t atomic##prefix##_xchg_relaxed(atomic##prefix##_t *v, c_t n) \ +{ \ + return __xchg_relaxed(&(v->counter), n, size); \ +} \ +static __always_inline \ +c_t atomic##prefix##_xchg_acquire(atomic##prefix##_t *v, c_t n) \ +{ \ + return __xchg_acquire(&(v->counter), n, size); \ +} \ +static __always_inline \ +c_t atomic##prefix##_xchg_release(atomic##prefix##_t *v, c_t n) \ +{ \ + return __xchg_release(&(v->counter), n, size); \ +} \ +static __always_inline \ +c_t atomic##prefix##_xchg(atomic##prefix##_t *v, c_t n) \ +{ \ + return __xchg(&(v->counter), n, size); \ +} \ +static __always_inline \ +c_t atomic##prefix##_cmpxchg_relaxed(atomic##prefix##_t *v, \ + c_t o, c_t n) \ +{ \ + return __cmpxchg_relaxed(&(v->counter), o, n, size); \ +} \ +static __always_inline \ +c_t atomic##prefix##_cmpxchg_acquire(atomic##prefix##_t *v, \ + c_t o, c_t n) \ +{ \ + return __cmpxchg_acquire(&(v->counter), o, n, size); \ +} \ +static __always_inline \ +c_t atomic##prefix##_cmpxchg_release(atomic##prefix##_t *v, \ + c_t o, c_t n) \ +{ \ + return __cmpxchg_release(&(v->counter), o, n, size); \ +} \ +static __always_inline \ +c_t atomic##prefix##_cmpxchg(atomic##prefix##_t *v, c_t o, c_t n) \ +{ \ + return __cmpxchg(&(v->counter), o, n, size); \ +} + +#ifdef CONFIG_GENERIC_ATOMIC64 +#define ATOMIC_OPS() \ + ATOMIC_OP(int, , 4) +#else +#define ATOMIC_OPS() \ + ATOMIC_OP(int, , 4) \ + ATOMIC_OP(s64, 64, 8) +#endif + +ATOMIC_OPS() + +#define atomic_xchg_relaxed atomic_xchg_relaxed +#define atomic_xchg_acquire atomic_xchg_acquire +#define atomic_xchg_release atomic_xchg_release +#define atomic_xchg atomic_xchg +#define atomic_cmpxchg_relaxed atomic_cmpxchg_relaxed +#define atomic_cmpxchg_acquire atomic_cmpxchg_acquire +#define atomic_cmpxchg_release atomic_cmpxchg_release +#define atomic_cmpxchg atomic_cmpxchg + +#undef ATOMIC_OPS +#undef ATOMIC_OP + +static __always_inline int atomic_sub_if_positive(atomic_t *v, int 
offset) +{ + int prev, rc; + + __asm__ __volatile__ ( + "0: lr.w %[p], %[c]\n" + " sub %[rc], %[p], %[o]\n" + " bltz %[rc], 1f\n" + " sc.w.rl %[rc], %[rc], %[c]\n" + " bnez %[rc], 0b\n" + " fence rw, rw\n" + "1:\n" + : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) + : [o]"r" (offset) + : "memory"); + return prev - offset; +} + +#define atomic_dec_if_positive(v) atomic_sub_if_positive(v, 1) + +#ifndef CONFIG_GENERIC_ATOMIC64 +static __always_inline s64 atomic64_sub_if_positive(atomic64_t *v, s64 offset) +{ + s64 prev; + long rc; + + __asm__ __volatile__ ( + "0: lr.d %[p], %[c]\n" + " sub %[rc], %[p], %[o]\n" + " bltz %[rc], 1f\n" + " sc.d.rl %[rc], %[rc], %[c]\n" + " bnez %[rc], 0b\n" + " fence rw, rw\n" + "1:\n" + : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) + : [o]"r" (offset) + : "memory"); + return prev - offset; +} + +#define atomic64_dec_if_positive(v) atomic64_sub_if_positive(v, 1) +#endif + +#endif \ No newline at end of file diff --git a/CRIU_code/include/common/arch/riscv/asm/bitops.h b/CRIU_code/include/common/arch/riscv/asm/bitops.h new file mode 100644 index 0000000..c2a8c91 --- /dev/null +++ b/CRIU_code/include/common/arch/riscv/asm/bitops.h @@ -0,0 +1,69 @@ +#ifndef _LINUX_BITOPS_H +#define _LINUX_BITOPS_H + +#if (BITS_PER_LONG == 64) +#define __AMO(op) "amo" #op ".d" +#elif (BITS_PER_LONG == 32) +#define __AMO(op) "amo" #op ".w" +#else +#error "Unexpected BITS_PER_LONG" +#endif + +#define __test_and_op_bit_ord(op, mod, nr, addr, ord) \ +({ \ + unsigned long __res, __mask; \ + __mask = BIT_MASK(nr); \ + __asm__ __volatile__ ( \ + __AMO(op) #ord " %0, %2, %1" \ + : "=r" (__res), "+A" (addr[BIT_WORD(nr)]) \ + : "r" (mod(__mask)) \ + : "memory"); \ + ((__res & __mask) != 0); \ +}) + +#define __op_bit_ord(op, mod, nr, addr, ord) \ + __asm__ __volatile__ ( \ + __AMO(op) #ord " zero, %1, %0" \ + : "+A" (addr[BIT_WORD(nr)]) \ + : "r" (mod(BIT_MASK(nr))) \ + : "memory"); + +#define __test_and_op_bit(op, mod, nr, addr) \ + 
__test_and_op_bit_ord(op, mod, nr, addr, .aqrl) +#define __op_bit(op, mod, nr, addr) \ + __op_bit_ord(op, mod, nr, addr, ) + +/* Bitmask modifiers: must be defined before the inline functions below, which expand them */ +#define __NOP(x) (x) +#define __NOT(x) (~(x)) + +static inline int test_and_set_bit(int nr, volatile unsigned long *addr) +{ + return __test_and_op_bit(or, __NOP, nr, addr); +} + +static inline int test_and_clear_bit(int nr, volatile unsigned long *addr) +{ + return __test_and_op_bit(and, __NOT, nr, addr); +} + +static inline int test_and_change_bit(int nr, volatile unsigned long *addr) +{ + return __test_and_op_bit(xor, __NOP, nr, addr); +} + +static inline void set_bit(int nr, volatile unsigned long *addr) +{ + __op_bit(or, __NOP, nr, addr); +} + +static inline void clear_bit(int nr, volatile unsigned long *addr) +{ + __op_bit(and, __NOT, nr, addr); +} + +static inline void change_bit(int nr, volatile unsigned long *addr) +{ + __op_bit(xor, __NOP, nr, addr); +} +#endif \ No newline at end of file diff --git a/CRIU_code/include/common/arch/riscv/asm/bitsperlong.h b/CRIU_code/include/common/arch/riscv/asm/bitsperlong.h new file mode 100644 index 0000000..b00838e --- /dev/null +++ b/CRIU_code/include/common/arch/riscv/asm/bitsperlong.h @@ -0,0 +1,6 @@ +#ifndef __CR_BITSPERLONG_H__ +#define __CR_BITSPERLONG_H__ + +#define BITS_PER_LONG 64 + +#endif /* __CR_BITSPERLONG_H__ */ \ No newline at end of file diff --git a/CRIU_code/include/common/arch/riscv/asm/linkage.h b/CRIU_code/include/common/arch/riscv/asm/linkage.h new file mode 100644 index 0000000..d88a991 --- /dev/null +++ b/CRIU_code/include/common/arch/riscv/asm/linkage.h @@ -0,0 +1,55 @@ +#ifndef __CR_LINKAGE_H__ +#define __CR_LINKAGE_H__ + +#define zero $0 +#define ra $1 +#define sp $2 +#define gp $3 +#define tp $4 +#define t0 $5 +#define t1 $6 +#define t2 $7 +#define s0 $8 +#define fp $8 +#define s1 $9 +#define a0 $10 +#define a1 $11 +#define a2 $12 +#define a3 $13 +#define a4 $14 +#define a5 $15 +#define a6 $16 +#define a7 $17 +#define s2 $18 +#define s3 $19
+#define s4 $20 +#define s5 $21 +#define s6 $22 +#define s7 $23 +#define s8 $24 +#define s9 $25 +#define s10 $26 +#define s11 $27 +#define t3 $28 +#define t4 $29 +#define t5 $30 +#define t6 $31 + +#define __ALIGN .align 8 +#define __ALIGN_STR ".align 8" + +#define GLOBAL(name) \ + .globl name; \ + name: + +#define ENTRY(name) \ + .globl name; \ + __ALIGN; \ + .type name, @function; \ + name: + +#define END(sym) \ + .size sym, . - sym + + +#endif /* __CR_LINKAGE_H__ */ \ No newline at end of file diff --git a/CRIU_code/include/common/arch/riscv/asm/page.h b/CRIU_code/include/common/arch/riscv/asm/page.h new file mode 100644 index 0000000..30816c7 --- /dev/null +++ b/CRIU_code/include/common/arch/riscv/asm/page.h @@ -0,0 +1,39 @@ +#ifndef __CR_ASM_PAGE_H__ +#define __CR_ASM_PAGE_H__ + +#define ARCH_HAS_LONG_PAGES + +#ifndef CR_NOGLIBC +#include <string.h> /* ffsl() */ +#include <unistd.h> /* _SC_PAGESIZE */ + +static unsigned __page_size; +static unsigned __page_shift; + +static inline unsigned page_size(void) +{ + if (!__page_size) + __page_size = sysconf(_SC_PAGESIZE); + return __page_size; +} + +static inline unsigned page_shift(void) +{ + if (!__page_shift) + __page_shift = (ffsl(page_size()) - 1); + return __page_shift; +} + +#define PAGE_SIZE page_size() +#define PAGE_SHIFT page_shift() +#define PAGE_MASK (~(PAGE_SIZE - 1)) + +#define PAGE_PFN(addr) ((addr) / PAGE_SIZE) +#else /* CR_NOGLIBC */ + +extern unsigned page_size(void); +#define PAGE_SIZE page_size() + +#endif /* CR_NOGLIBC */ + +#endif /* __CR_ASM_PAGE_H__ */ \ No newline at end of file diff --git a/CRIU_code/include/common/arch/s390/asm/atomic.h b/CRIU_code/include/common/arch/s390/asm/atomic.h new file mode 100644 index 0000000..dfdba12 --- /dev/null +++ b/CRIU_code/include/common/arch/s390/asm/atomic.h @@ -0,0 +1,67 @@ +#ifndef __ARCH_S390_ATOMIC__ +#define __ARCH_S390_ATOMIC__ + +#include "common/arch/s390/asm/atomic_ops.h" +#include "common/compiler.h" + +#define ATOMIC_INIT(i) { (i) } + +typedef struct { + int
counter; +} atomic_t; + +static inline int atomic_read(const atomic_t *v) +{ + int c; + + asm volatile( + " l %0,%1\n" + : "=d" (c) : "Q" (v->counter)); + return c; +} + +static inline void atomic_set(atomic_t *v, int i) +{ + asm volatile( + " st %1,%0\n" + : "=Q" (v->counter) : "d" (i)); +} + +static inline int atomic_add_return(int i, atomic_t *v) +{ + return __atomic_add_barrier(i, &v->counter) + i; +} + + +static inline void atomic_add(int i, atomic_t *v) +{ + __atomic_add(i, &v->counter); +} + +#define atomic_inc(_v) atomic_add(1, _v) +#define atomic_inc_return(_v) atomic_add_return(1, _v) +#define atomic_sub(_i, _v) atomic_add(-(int)(_i), _v) +#define atomic_sub_return(_i, _v) atomic_add_return(-(int)(_i), _v) +#define atomic_dec(_v) atomic_sub(1, _v) +#define atomic_dec_return(_v) atomic_sub_return(1, _v) +#define atomic_dec_and_test(_v) (atomic_sub_return(1, _v) == 0) + +#define ATOMIC_OPS(op) \ +static inline void atomic_##op(int i, atomic_t *v) \ +{ \ + __atomic_##op(i, &v->counter); \ +} \ + +ATOMIC_OPS(and) +ATOMIC_OPS(or) +ATOMIC_OPS(xor) + +#undef ATOMIC_OPS + +static inline int atomic_cmpxchg(atomic_t *v, int old, int new) +{ + return __atomic_cmpxchg(&v->counter, old, new); +} + +#endif /* __ARCH_S390_ATOMIC__ */ + diff --git a/CRIU_code/include/common/arch/s390/asm/atomic_ops.h b/CRIU_code/include/common/arch/s390/asm/atomic_ops.h new file mode 100644 index 0000000..ff0e1e3 --- /dev/null +++ b/CRIU_code/include/common/arch/s390/asm/atomic_ops.h @@ -0,0 +1,74 @@ +#ifndef __ARCH_S390_ATOMIC_OPS__ +#define __ARCH_S390_ATOMIC_OPS__ + +#define __ATOMIC_OP(op_name, op_string) \ +static inline int op_name(int val, int *ptr) \ +{ \ + int old, new; \ + \ + asm volatile( \ + "0: lr %[new],%[old]\n" \ + op_string " %[new],%[val]\n" \ + " cs %[old],%[new],%[ptr]\n" \ + " jl 0b" \ + : [old] "=d" (old), [new] "=&d" (new), [ptr] "+Q" (*ptr)\ + : [val] "d" (val), "0" (*ptr) : "cc", "memory"); \ + return old; \ +} + +#define __ATOMIC_OPS(op_name, op_string) \ + 
__ATOMIC_OP(op_name, op_string) \ + __ATOMIC_OP(op_name##_barrier, op_string) + +__ATOMIC_OPS(__atomic_add, "ar") +__ATOMIC_OPS(__atomic_and, "nr") +__ATOMIC_OPS(__atomic_or, "or") +__ATOMIC_OPS(__atomic_xor, "xr") + +#undef __ATOMIC_OPS + +#define __ATOMIC64_OP(op_name, op_string) \ +static inline long op_name(long val, long *ptr) \ +{ \ + long old, new; \ + \ + asm volatile( \ + "0: lgr %[new],%[old]\n" \ + op_string " %[new],%[val]\n" \ + " csg %[old],%[new],%[ptr]\n" \ + " jl 0b" \ + : [old] "=d" (old), [new] "=&d" (new), [ptr] "+Q" (*ptr)\ + : [val] "d" (val), "0" (*ptr) : "cc", "memory"); \ + return old; \ +} + +#define __ATOMIC64_OPS(op_name, op_string) \ + __ATOMIC64_OP(op_name, op_string) \ + __ATOMIC64_OP(op_name##_barrier, op_string) + +__ATOMIC64_OPS(__atomic64_add, "agr") +__ATOMIC64_OPS(__atomic64_and, "ngr") +__ATOMIC64_OPS(__atomic64_or, "ogr") +__ATOMIC64_OPS(__atomic64_xor, "xgr") + +#undef __ATOMIC64_OPS + +static inline int __atomic_cmpxchg(int *ptr, int old, int new) +{ + asm volatile( + " cs %[old],%[new],%[ptr]" + : [old] "+d" (old), [ptr] "+Q" (*ptr) + : [new] "d" (new) : "cc", "memory"); + return old; +} + +static inline long __atomic64_cmpxchg(long *ptr, long old, long new) +{ + asm volatile( + " csg %[old],%[new],%[ptr]" + : [old] "+d" (old), [ptr] "+Q" (*ptr) + : [new] "d" (new) : "cc", "memory"); + return old; +} + +#endif /* __ARCH_S390_ATOMIC_OPS__ */ diff --git a/CRIU_code/include/common/arch/s390/asm/bitops.h b/CRIU_code/include/common/arch/s390/asm/bitops.h new file mode 100644 index 0000000..648d898 --- /dev/null +++ b/CRIU_code/include/common/arch/s390/asm/bitops.h @@ -0,0 +1,158 @@ +#ifndef _S390_BITOPS_H +#define _S390_BITOPS_H + +#include "common/asm/bitsperlong.h" +#include "common/compiler.h" +#include "common/arch/s390/asm/atomic_ops.h" + +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG) +#define __BITOPS_WORDS(bits) (((bits) + BITS_PER_LONG - 1) / BITS_PER_LONG) 
+ +#define DECLARE_BITMAP(name,bits) \ + unsigned long name[BITS_TO_LONGS(bits)] +/* Address of the unsigned long holding bit @nr: @ptr plus (nr with its low BITS_PER_LONG-1 mask bits cleared) / 8 bytes. */ +static inline unsigned long * +__bitops_word(unsigned long nr, volatile unsigned long *ptr) +{ + unsigned long addr; + + addr = (unsigned long)ptr + ((nr ^ (nr & (BITS_PER_LONG - 1))) >> 3); + return (unsigned long *)addr; +} +/* Address of the byte holding bit @nr; the XOR with (BITS_PER_LONG - 8) reverses the byte index inside each long (NOTE(review): presumably for big-endian layout - confirm). */ +static inline unsigned char * +__bitops_byte(unsigned long nr, volatile unsigned long *ptr) +{ + return ((unsigned char *)ptr) + ((nr ^ (BITS_PER_LONG - 8)) >> 3); +} +/* Set bit @nr: OR its single-bit mask into the containing word via __atomic64_or(). */ +static inline void set_bit(unsigned long nr, volatile unsigned long *ptr) +{ + unsigned long *addr = __bitops_word(nr, ptr); + unsigned long mask; + + mask = 1UL << (nr & (BITS_PER_LONG - 1)); + __atomic64_or((long) mask, (long *) addr); +} +/* Clear bit @nr: AND the inverted single-bit mask into the containing word via __atomic64_and(). */ +static inline void clear_bit(unsigned long nr, volatile unsigned long *ptr) +{ + unsigned long *addr = __bitops_word(nr, ptr); + unsigned long mask; + + mask = ~(1UL << (nr & (BITS_PER_LONG - 1))); + __atomic64_and((long) mask, (long *) addr); +} +/* Toggle bit @nr: XOR its single-bit mask into the containing word via __atomic64_xor(). */ +static inline void change_bit(unsigned long nr, volatile unsigned long *ptr) +{ + unsigned long *addr = __bitops_word(nr, ptr); + unsigned long mask; + + mask = 1UL << (nr & (BITS_PER_LONG - 1)); + __atomic64_xor((long) mask, (long *) addr); +} +/* Set bit @nr via __atomic64_or_barrier() and return 1 if it was already set in the old word, 0 otherwise. */ +static inline int +test_and_set_bit(unsigned long nr, volatile unsigned long *ptr) +{ + unsigned long *addr = __bitops_word(nr, ptr); + unsigned long old, mask; + + mask = 1UL << (nr & (BITS_PER_LONG - 1)); + old = __atomic64_or_barrier((long) mask, (long *) addr); + return (old & mask) != 0; +} +/* Return 1 if bit @nr is set, 0 otherwise; reads only the byte that holds the bit (same indexing as __bitops_byte()). */ +static inline int test_bit(unsigned long nr, const volatile unsigned long *ptr) +{ + const volatile unsigned char *addr; + + addr = ((const volatile unsigned char *)ptr); + addr += (nr ^ (BITS_PER_LONG - 8)) >> 3; + return (*addr >> (nr & 7)) & 1; +} + +static inline unsigned char __flogr(unsigned long word) +{ + if (__builtin_constant_p(word)) { + unsigned long bit = 0; + + if (!word) + return 64; + if (!(word & 0xffffffff00000000UL)) { + word <<= 32; + bit += 32; + } 
+ if (!(word & 0xffff000000000000UL)) { + word <<= 16; + bit += 16; + } + if (!(word & 0xff00000000000000UL)) { + word <<= 8; + bit += 8; + } + if (!(word & 0xf000000000000000UL)) { + word <<= 4; + bit += 4; + } + if (!(word & 0xc000000000000000UL)) { + word <<= 2; + bit += 2; + } + if (!(word & 0x8000000000000000UL)) { + word <<= 1; + bit += 1; + } + return bit; + } else { + return __builtin_clzl(word); + } +} + +static inline unsigned long __ffs(unsigned long word) +{ + return __flogr(-word & word) ^ (BITS_PER_LONG - 1); +} + +#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) + +static inline unsigned long _find_next_bit(const unsigned long *addr, + unsigned long nbits, unsigned long start, + unsigned long invert) +{ + unsigned long tmp; + + if (!nbits || start >= nbits) + return nbits; + + tmp = addr[start / BITS_PER_LONG] ^ invert; + + tmp &= BITMAP_FIRST_WORD_MASK(start); + start = round_down(start, BITS_PER_LONG); + + while (!tmp) { + start += BITS_PER_LONG; + if (start >= nbits) + return nbits; + + tmp = addr[start / BITS_PER_LONG] ^ invert; + } + + return min(start + __ffs(tmp), nbits); +} + +static inline unsigned long find_next_bit(const unsigned long *addr, + unsigned long size, + unsigned long offset) +{ + return _find_next_bit(addr, size, offset, 0UL); +} + +#define for_each_bit(i, bitmask) \ + for (i = find_next_bit(bitmask, sizeof(bitmask), 0); \ + i < sizeof(bitmask); \ + i = find_next_bit(bitmask, sizeof(bitmask), i + 1)) + +#endif /* _S390_BITOPS_H */ diff --git a/CRIU_code/include/common/arch/s390/asm/bitsperlong.h b/CRIU_code/include/common/arch/s390/asm/bitsperlong.h new file mode 100644 index 0000000..d95727d --- /dev/null +++ b/CRIU_code/include/common/arch/s390/asm/bitsperlong.h @@ -0,0 +1,6 @@ +#ifndef __CR_BITSPERLONG_H__ +#define __CR_BITSPERLONG_H__ + +#define BITS_PER_LONG 64 + +#endif /* __CR_BITSPERLONG_H__ */ diff --git a/CRIU_code/include/common/arch/s390/asm/linkage.h 
b/CRIU_code/include/common/arch/s390/asm/linkage.h new file mode 100644 index 0000000..99895ce --- /dev/null +++ b/CRIU_code/include/common/arch/s390/asm/linkage.h @@ -0,0 +1,22 @@ +#ifndef __ASM_LINKAGE_H +#define __ASM_LINKAGE_H + +#ifdef __ASSEMBLY__ + +#define __ALIGN .align 4, 0x07 + +#define GLOBAL(name) \ + .globl name; \ + name: + +#define ENTRY(name) \ + .globl name; \ + .type name, @function; \ + __ALIGN; \ + name: + +#define END(name) \ + .size name, . - name + +#endif /* __ASSEMBLY__ */ +#endif diff --git a/CRIU_code/include/common/arch/s390/asm/page.h b/CRIU_code/include/common/arch/s390/asm/page.h new file mode 100644 index 0000000..8e8c649 --- /dev/null +++ b/CRIU_code/include/common/arch/s390/asm/page.h @@ -0,0 +1,19 @@ +#ifndef __CR_ASM_PAGE_H__ +#define __CR_ASM_PAGE_H__ + +#ifndef PAGE_SHIFT +#define PAGE_SHIFT 12 +#endif + +#ifndef PAGE_SIZE +#define PAGE_SIZE (1UL << PAGE_SHIFT) +#endif + +#ifndef PAGE_MASK +#define PAGE_MASK (~(PAGE_SIZE - 1)) +#endif + +#define PAGE_PFN(addr) ((addr) / PAGE_SIZE) +#define page_size() PAGE_SIZE + +#endif /* __CR_ASM_PAGE_H__ */ diff --git a/CRIU_code/include/common/arch/x86/asm/atomic.h b/CRIU_code/include/common/arch/x86/asm/atomic.h new file mode 100644 index 0000000..ec178e7 --- /dev/null +++ b/CRIU_code/include/common/arch/x86/asm/atomic.h @@ -0,0 +1,76 @@ +#ifndef __CR_ATOMIC_H__ +#define __CR_ATOMIC_H__ + +#include "common/arch/x86/asm/cmpxchg.h" + +typedef struct { + int counter; +} atomic_t; + +#define ATOMIC_INIT(i) { (i) } + +static inline int atomic_read(const atomic_t *v) +{ + return (*(volatile int *)&(v)->counter); +} + +static inline void atomic_set(atomic_t *v, int i) +{ + v->counter = i; +} + +static inline void atomic_add(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "addl %1,%0" + : "+m" (v->counter) + : "ir" (i)); +} + +static inline void atomic_sub(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "subl %1,%0" + : "+m" (v->counter) + : "ir" (i)); +} + +static inline void 
atomic_inc(atomic_t *v) +{ + asm volatile(LOCK_PREFIX "incl %0" + : "+m" (v->counter)); +} + +static inline void atomic_dec(atomic_t *v) +{ + asm volatile(LOCK_PREFIX "decl %0" + : "+m" (v->counter)); +} + +static inline int atomic_dec_and_test(atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "decl %0; sete %1" + : "+m" (v->counter), "=qm" (c) + : : "memory"); + return c != 0; +} + +static inline int atomic_add_return(int i, atomic_t *v) +{ + return i + xadd(&v->counter, i); +} + +static inline int atomic_sub_return(int i, atomic_t *v) +{ + return atomic_add_return(-i, v); +} + +#define atomic_inc_return(v) (atomic_add_return(1, v)) +#define atomic_dec_return(v) (atomic_sub_return(1, v)) + +static inline int atomic_cmpxchg(atomic_t *v, int old, int new) +{ + return cmpxchg(&v->counter, old, new); +} + +#endif /* __CR_ATOMIC_H__ */ diff --git a/CRIU_code/include/common/arch/x86/asm/bitops.h b/CRIU_code/include/common/arch/x86/asm/bitops.h new file mode 100644 index 0000000..b60ead7 --- /dev/null +++ b/CRIU_code/include/common/arch/x86/asm/bitops.h @@ -0,0 +1,132 @@ +#ifndef __CR_BITOPS_H__ +#define __CR_BITOPS_H__ + +#include "common/arch/x86/asm/cmpxchg.h" +#include "common/asm/bitsperlong.h" + +#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) +#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG) + +#define DECLARE_BITMAP(name, bits) \ + unsigned long name[BITS_TO_LONGS(bits)] + +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1) +/* Technically wrong, but this avoids compilation errors on some gcc + versions. 
*/ +#define BITOP_ADDR(x) "=m" (*(volatile long *) (x)) +#else +#define BITOP_ADDR(x) "+m" (*(volatile long *) (x)) +#endif + +#define ADDR BITOP_ADDR(addr) + +static inline void set_bit(int nr, volatile unsigned long *addr) +{ + asm volatile("btsl %1,%0" : ADDR : "Ir" (nr) : "memory"); +} + +static inline void change_bit(int nr, volatile unsigned long *addr) +{ + asm volatile("btcl %1,%0" : ADDR : "Ir" (nr)); +} + +static inline int test_bit(int nr, volatile const unsigned long *addr) +{ + int oldbit; + + asm volatile("bt %2,%1\n\t" + "sbb %0,%0" + : "=r" (oldbit) + : "m" (*(unsigned long *)addr), "Ir" (nr)); + + return oldbit; +} + +static inline void clear_bit(int nr, volatile unsigned long *addr) +{ + asm volatile("btrl %1,%0" : ADDR : "Ir" (nr)); +} + +/** + * test_and_set_bit - Set a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It also implies a memory barrier. + */ +static inline int test_and_set_bit(int nr, volatile unsigned long *addr) +{ + int oldbit; + + asm volatile(LOCK_PREFIX "bts %2,%1\n\t" + "sbb %0,%0" : "=r" (oldbit), ADDR : "Ir" (nr) : "memory"); + + return oldbit; +} + +/** + * __ffs - find first set bit in word + * @word: The word to search + * + * Undefined if no bit exists, so code should check against 0 first. + */ +static inline unsigned long __ffs(unsigned long word) +{ + asm("bsf %1,%0" + : "=r" (word) + : "rm" (word)); + return word; +} + +#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) + +/* + * Find the next set bit in a memory region. 
+ */ +static inline +unsigned long find_next_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) +{ + const unsigned long *p = addr + BITOP_WORD(offset); + unsigned long result = offset & ~(BITS_PER_LONG-1); + unsigned long tmp; + + if (offset >= size) + return size; + size -= result; + offset %= BITS_PER_LONG; + if (offset) { + tmp = *(p++); + tmp &= (~0UL << offset); + if (size < BITS_PER_LONG) + goto found_first; + if (tmp) + goto found_middle; + size -= BITS_PER_LONG; + result += BITS_PER_LONG; + } + while (size & ~(BITS_PER_LONG-1)) { + if ((tmp = *(p++))) + goto found_middle; + result += BITS_PER_LONG; + size -= BITS_PER_LONG; + } + if (!size) + return result; + tmp = *p; + +found_first: + tmp &= (~0UL >> (BITS_PER_LONG - size)); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. */ +found_middle: + return result + __ffs(tmp); +} + +#define for_each_bit(i, bitmask) \ + for (i = find_next_bit(bitmask, sizeof(bitmask), 0); \ + i < sizeof(bitmask); \ + i = find_next_bit(bitmask, sizeof(bitmask), i + 1)) + +#endif /* __CR_BITOPS_H__ */ diff --git a/CRIU_code/include/common/arch/x86/asm/bitsperlong.h b/CRIU_code/include/common/arch/x86/asm/bitsperlong.h new file mode 100644 index 0000000..7e0a71e --- /dev/null +++ b/CRIU_code/include/common/arch/x86/asm/bitsperlong.h @@ -0,0 +1,10 @@ +#ifndef __CR_BITSPERLONG_H__ +#define __CR_BITSPERLONG_H__ + +#ifdef CONFIG_X86_64 +# define BITS_PER_LONG 64 +#else +# define BITS_PER_LONG 32 +#endif + +#endif /* __CR_BITSPERLONG_H__ */ diff --git a/CRIU_code/include/common/arch/x86/asm/cmpxchg.h b/CRIU_code/include/common/arch/x86/asm/cmpxchg.h new file mode 100644 index 0000000..fa5eccf --- /dev/null +++ b/CRIU_code/include/common/arch/x86/asm/cmpxchg.h @@ -0,0 +1,107 @@ +#ifndef __CR_CMPXCHG_H__ +#define __CR_CMPXCHG_H__ + +#include <stdint.h> /* uint8_t .. uint64_t, used by __raw_cmpxchg() */ + +#define LOCK_PREFIX "\n\tlock; " + +#define __X86_CASE_B 1 +#define __X86_CASE_W 2 +#define __X86_CASE_L 4 +#define __X86_CASE_Q 8 + +/* + * An 
exchange-type operation, which takes a value and a pointer, and + * returns the old value. Make sure you never reach non-case statement + * here, otherwise behaviour is undefined. + */ +#define __xchg_op(ptr, arg, op, lock) \ + ({ \ + __typeof__ (*(ptr)) __ret = (arg); \ + switch (sizeof(*(ptr))) { \ + case __X86_CASE_B: \ + asm volatile (lock #op "b %b0, %1\n" \ + : "+q" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ + break; \ + case __X86_CASE_W: \ + asm volatile (lock #op "w %w0, %1\n" \ + : "+r" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ + break; \ + case __X86_CASE_L: \ + asm volatile (lock #op "l %0, %1\n" \ + : "+r" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ + break; \ + case __X86_CASE_Q: \ + asm volatile (lock #op "q %q0, %1\n" \ + : "+r" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ + break; \ + } \ + __ret; \ + }) + +#define __xadd(ptr, inc, lock) __xchg_op((ptr), (inc), xadd, lock) +#define xadd(ptr, inc) __xadd((ptr), (inc), "lock ;") + +/* Borrowed from linux kernel arch/x86/include/asm/cmpxchg.h */ + +/* + * Atomic compare and exchange. Compare OLD with MEM, if identical, + * store NEW in MEM. Return the initial value in MEM. Success is + * indicated by comparing RETURN with OLD. 
+ */ +#define __raw_cmpxchg(ptr, old, new, size, lock) \ +({ \ + __typeof__(*(ptr)) __ret; \ + __typeof__(*(ptr)) __old = (old); \ + __typeof__(*(ptr)) __new = (new); \ + switch (size) { \ + case __X86_CASE_B: \ + { \ + volatile uint8_t *__ptr = (volatile uint8_t *)(ptr); \ + asm volatile(lock "cmpxchgb %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ + : "q" (__new), "0" (__old) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_W: \ + { \ + volatile uint16_t *__ptr = (volatile uint16_t *)(ptr); \ + asm volatile(lock "cmpxchgw %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ + : "r" (__new), "0" (__old) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_L: \ + { \ + volatile uint32_t *__ptr = (volatile uint32_t *)(ptr); \ + asm volatile(lock "cmpxchgl %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ + : "r" (__new), "0" (__old) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_Q: \ + { \ + volatile uint64_t *__ptr = (volatile uint64_t *)(ptr); \ + asm volatile(lock "cmpxchgq %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ + : "r" (__new), "0" (__old) \ + : "memory"); \ + break; \ + } \ + } \ + __ret; \ +}) + +#define __cmpxchg(ptr, old, new, size) \ + __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX) +#define cmpxchg(ptr, old, new) \ + __cmpxchg(ptr, old, new, sizeof(*(ptr))) + +#endif /* __CR_CMPXCHG_H__ */ diff --git a/CRIU_code/include/common/arch/x86/asm/linkage.h b/CRIU_code/include/common/arch/x86/asm/linkage.h new file mode 100644 index 0000000..5eaf450 --- /dev/null +++ b/CRIU_code/include/common/arch/x86/asm/linkage.h @@ -0,0 +1,27 @@ +#ifndef __CR_LINKAGE_H__ +#define __CR_LINKAGE_H__ + +#ifdef __ASSEMBLY__ + +#define __ALIGN .align 4, 0x90 +#define __ALIGN_STR ".align 4, 0x90" + +#define GLOBAL(name) \ + .globl name; \ + name: + +#define ENTRY(name) \ + .globl name; \ + .type name, @function; \ + __ALIGN; \ + name: + +#define END(sym) \ + .size sym, . 
- sym + +#endif /* __ASSEMBLY__ */ + +#define __USER32_CS 0x23 +#define __USER_CS 0x33 + +#endif /* __CR_LINKAGE_H__ */ diff --git a/CRIU_code/include/common/arch/x86/asm/page.h b/CRIU_code/include/common/arch/x86/asm/page.h new file mode 100644 index 0000000..1348355 --- /dev/null +++ b/CRIU_code/include/common/arch/x86/asm/page.h @@ -0,0 +1,19 @@ +#ifndef __CR_ASM_PAGE_H__ +#define __CR_ASM_PAGE_H__ + +#ifndef PAGE_SHIFT +# define PAGE_SHIFT 12 +#endif + +#ifndef PAGE_SIZE +# define PAGE_SIZE (1UL << PAGE_SHIFT) +#endif + +#ifndef PAGE_MASK +# define PAGE_MASK (~(PAGE_SIZE - 1)) +#endif + +#define PAGE_PFN(addr) ((addr) / PAGE_SIZE) +#define page_size() PAGE_SIZE + +#endif /* __CR_ASM_PAGE_H__ */ diff --git a/CRIU_code/include/common/asm-generic/bitops.h b/CRIU_code/include/common/asm-generic/bitops.h new file mode 100644 index 0000000..0d861bd --- /dev/null +++ b/CRIU_code/include/common/asm-generic/bitops.h @@ -0,0 +1,113 @@ +/* + * Generic bits operations. + * + * Architectures that don't want their own implementation of those, + * should include this file into the arch/$ARCH/include/asm/bitops.h + */ + +#ifndef __CR_GENERIC_BITOPS_H__ +#define __CR_GENERIC_BITOPS_H__ + +#include "common/asm/bitsperlong.h" + +#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) +#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG) + +#define DECLARE_BITMAP(name, bits) \ + unsigned long name[BITS_TO_LONGS(bits)] + +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1) +/* Technically wrong, but this avoids compilation errors on some gcc + versions. 
*/ +#define BITOP_ADDR(x) "=m" (*(volatile long *) (x)) +#else +#define BITOP_ADDR(x) "+m" (*(volatile long *) (x)) +#endif + +#define ADDR BITOP_ADDR(addr) + +static inline void set_bit(int nr, volatile unsigned long *addr) { + addr += nr / BITS_PER_LONG; + *addr |= (1UL << (nr % BITS_PER_LONG)); +} + +static inline void change_bit(int nr, volatile unsigned long *addr) +{ + addr += nr / BITS_PER_LONG; + *addr ^= (1UL << (nr % BITS_PER_LONG)); +} + +static inline int test_bit(int nr, volatile const unsigned long *addr) +{ + addr += nr / BITS_PER_LONG; + return (*addr & (1UL << (nr % BITS_PER_LONG))) ? -1 : 0; +} + +static inline void clear_bit(int nr, volatile unsigned long *addr) +{ + addr += nr / BITS_PER_LONG; + *addr &= ~(1UL << (nr % BITS_PER_LONG)); +} + +/** + * __ffs - find first set bit in word + * @word: The word to search + * + * Undefined if no bit exists, so code should check against 0 first. + */ +static inline unsigned long __ffs(unsigned long word) +{ + return __builtin_ffsl(word) - 1; +} + +#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) + +/* + * Find the next set bit in a memory region. + */ +static inline +unsigned long find_next_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) +{ + const unsigned long *p = addr + BITOP_WORD(offset); + unsigned long result = offset & ~(BITS_PER_LONG-1); + unsigned long tmp; + + if (offset >= size) + return size; + size -= result; + offset %= BITS_PER_LONG; + if (offset) { + tmp = *(p++); + tmp &= (~0UL << offset); + if (size < BITS_PER_LONG) + goto found_first; + if (tmp) + goto found_middle; + size -= BITS_PER_LONG; + result += BITS_PER_LONG; + } + while (size & ~(BITS_PER_LONG-1)) { + if ((tmp = *(p++))) + goto found_middle; + result += BITS_PER_LONG; + size -= BITS_PER_LONG; + } + if (!size) + return result; + tmp = *p; + +found_first: + tmp &= (~0UL >> (BITS_PER_LONG - size)); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. 
*/ +found_middle: + return result + __ffs(tmp); +} + +#define for_each_bit(i, bitmask) \ + for (i = find_next_bit(bitmask, sizeof(bitmask), 0); \ + i < sizeof(bitmask); \ + i = find_next_bit(bitmask, sizeof(bitmask), i + 1)) + +#endif /* __CR_GENERIC_BITOPS_H__ */ diff --git a/CRIU_code/include/common/bitops.h b/CRIU_code/include/common/bitops.h new file mode 100644 index 0000000..1e64112 --- /dev/null +++ b/CRIU_code/include/common/bitops.h @@ -0,0 +1,23 @@ +#ifndef __CR_COMMON_BITOPS_H__ +#define __CR_COMMON_BITOPS_H__ +#include "common/asm/bitops.h" + +#include "common/bitsperlong.h" +#include <endian.h> /* __BYTE_ORDER, __BIG_ENDIAN: if both were undefined the #if below would compare 0 == 0 and pick the big-endian swizzle */ + +#if __BYTE_ORDER == __BIG_ENDIAN +#define BITOP_LE_SWIZZLE ((BITS_PER_LONG-1) & ~0x7) +#else +#define BITOP_LE_SWIZZLE 0 +#endif +/* Little-endian bit numbering: XOR @nr with BITOP_LE_SWIZZLE, then defer to test_and_set_bit(). */ +static inline int test_and_set_bit_le(int nr, void *addr) +{ + return test_and_set_bit(nr ^ BITOP_LE_SWIZZLE, addr); +} +/* Little-endian bit numbering: XOR @nr with BITOP_LE_SWIZZLE, then defer to clear_bit(). */ +static inline void clear_bit_le(int nr, void *addr) +{ + clear_bit(nr ^ BITOP_LE_SWIZZLE, addr); +} +#endif diff --git a/CRIU_code/include/common/bitsperlong.h b/CRIU_code/include/common/bitsperlong.h new file mode 100644 index 0000000..b074936 --- /dev/null +++ b/CRIU_code/include/common/bitsperlong.h @@ -0,0 +1,4 @@ +#ifndef __CR_COMMON_BITSPERLONG_H__ +#define __CR_COMMON_BITSPERLONG_H__ +#include "common/asm/bitsperlong.h" +#endif diff --git a/CRIU_code/include/common/bug.h b/CRIU_code/include/common/bug.h new file mode 100644 index 0000000..4622911 --- /dev/null +++ b/CRIU_code/include/common/bug.h @@ -0,0 +1,41 @@ +#ifndef __CR_BUG_H__ +#define __CR_BUG_H__ + +#include <signal.h> /* raise(), SIGABRT */ +#include <stdbool.h> /* true, used by BUG() */ + +#include "common/compiler.h" + +#ifndef BUG_ON_HANDLER + +#ifdef CR_NOGLIBC +# define __raise() +#else +# define __raise() raise(SIGABRT) +#endif + +#ifndef __clang_analyzer__ +# ifndef pr_err +# error pr_err macro must be defined +# endif +# define BUG_ON_HANDLER(condition) \ + do { \ + if ((condition)) { \ + pr_err("BUG at %s:%d\n", __FILE__, __LINE__); \ + __raise(); \ + *(volatile unsigned long *)NULL = 0xdead0000 + __LINE__; \ + } 
\ + } while (0) +#else +# define BUG_ON_HANDLER(condition) \ + do { \ + assert(!(condition)); \ + } while (0) +#endif + +#endif /* BUG_ON_HANDLER */ + +#define BUG_ON(condition) BUG_ON_HANDLER((condition)) +#define BUG() BUG_ON(true) + +#endif /* __CR_BUG_H__ */ diff --git a/CRIU_code/include/common/compiler.h b/CRIU_code/include/common/compiler.h new file mode 100644 index 0000000..fc8abcf --- /dev/null +++ b/CRIU_code/include/common/compiler.h @@ -0,0 +1,102 @@ +#ifndef __CR_COMPILER_H__ +#define __CR_COMPILER_H__ + +/* + * Various definitions needed for a successful build, + * picked from various places, mostly from + * the linux kernel. + */ + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#define NELEMS_AS_ARRAY(x,y) (sizeof(x) / sizeof((y)[0])) +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) + +#define ASSIGN_TYPED(a, b) do { (a) = (typeof(a))(b); } while (0) +#define ASSIGN_MEMBER(a, b, m) do { ASSIGN_TYPED((a)->m, (b)->m); } while (0) + +#define __stringify_1(x...) #x +#define __stringify(x...) __stringify_1(x) + +#define NORETURN __attribute__((__noreturn__)) +#define __packed __attribute__((__packed__)) +#define __used __attribute__((__used__)) +#define __maybe_unused __attribute__((unused)) +#define __always_unused __attribute__((unused)) + +#define __section(S) __attribute__ ((__section__(#S))) + +#ifndef __always_inline +# define __always_inline inline __attribute__((always_inline)) +#endif + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#ifndef always_inline +# define always_inline __always_inline +#endif + +#ifndef noinline +# define noinline __attribute__((noinline)) +#endif + +#define __aligned(x) __attribute__((aligned(x))) + +/* + * Macro to define stack alignment. + * aarch64 requires stack to be aligned to 16 bytes. 
+ */ +#define __stack_aligned__ __attribute__((aligned(16))) + +#ifndef offsetof +# define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) +#endif + +#define barrier() asm volatile("" ::: "memory") + +#define container_of(ptr, type, member) ({ \ + const typeof( ((type *)0)->member ) *__mptr = (ptr); \ + (type *)( (char *)__mptr - offsetof(type,member) );}) + +#ifndef FIELD_SIZEOF +# define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f)) +#endif + +#define __round_mask(x, y) ((__typeof__(x))((y) - 1)) +#define round_up(x, y) ((((x) - 1) | __round_mask(x, y)) + 1) +#define round_down(x, y) ((x) & ~__round_mask(x, y)) +#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) +#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1)) + +#define min(x, y) ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void) (&_min1 == &_min2); \ + _min1 < _min2 ? _min1 : _min2; }) + +#define max(x, y) ({ \ + typeof(x) _max1 = (x); \ + typeof(y) _max2 = (y); \ + (void) (&_max1 == &_max2); \ + _max1 > _max2 ? _max1 : _max2; }) + +#define min_t(type, x, y) ({ \ + type __min1 = (x); \ + type __min2 = (y); \ + __min1 < __min2 ? __min1: __min2; }) + +#define max_t(type, x, y) ({ \ + type __max1 = (x); \ + type __max2 = (y); \ + __max1 > __max2 ? __max1: __max2; }) + +#define SWAP(x, y) \ + do { \ + typeof(x) ____val = x; \ + x = y; \ + y = ____val; \ + } while (0) + +#define is_log2(v) (((v) & ((v) - 1)) == 0) + +#endif /* __CR_COMPILER_H__ */ diff --git a/CRIU_code/include/common/err.h b/CRIU_code/include/common/err.h new file mode 100644 index 0000000..a370daf --- /dev/null +++ b/CRIU_code/include/common/err.h @@ -0,0 +1,53 @@ +/* + * Adopted from linux kernel + */ +#ifndef __CR_COMMON_ERR_H__ +#define __CR_COMMON_ERR_H__ + +#include "common/compiler.h" + +/* + * The address of a block returned by malloc or realloc in GNU + * systems is always a multiple of eight (or sixteen on 64-bit systems). + * + * Thus we may encode error number in low bits. 
+ */ +#define MAX_ERRNO 4095 + +#define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO) +/* Encode a (negative) error number as a pointer value. */ +static inline void *ERR_PTR(long error) +{ + return (void *)error; +} +/* Recover the integer stored in @ptr by ERR_PTR(). */ +static inline long PTR_ERR(const void *ptr) +{ + return (long)ptr; +} +/* Non-zero when @ptr lies in the top MAX_ERRNO values of the address range, i.e. encodes an error. */ +static inline long IS_ERR(const void *ptr) +{ + return IS_ERR_VALUE((unsigned long)ptr); +} +/* Non-zero when @ptr is NULL or encodes an error. */ +static inline long IS_ERR_OR_NULL(const void *ptr) +{ + return !ptr || IS_ERR_VALUE((unsigned long)ptr); +} +/* Pass an error-carrying pointer through as void * so it can be returned under another pointer type. */ +static inline void *ERR_CAST(const void *ptr) +{ + /* cast away the const */ + return (void *)ptr; +} +/* Error number carried by @ptr if IS_ERR(ptr), otherwise 0. */ +static inline int PTR_RET(const void *ptr) +{ + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + else + return 0; +} + +#endif /* __CR_COMMON_ERR_H__ */ diff --git a/CRIU_code/include/common/list.h b/CRIU_code/include/common/list.h new file mode 100644 index 0000000..b8b57c7 --- /dev/null +++ b/CRIU_code/include/common/list.h @@ -0,0 +1,421 @@ +#ifndef __CR_LIST_H__ +#define __CR_LIST_H__ + +/* + * Double linked lists. + */ + +#include <stddef.h> /* size_t, offsetof(), NULL */ +#include "common/compiler.h" + +#define POISON_POINTER_DELTA 0 +#define LIST_POISON1 ((void *) 0x00100100 + POISON_POINTER_DELTA) +#define LIST_POISON2 ((void *) 0x00200200 + POISON_POINTER_DELTA) + +struct list_head { + struct list_head *prev, *next; +}; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } +#define LIST_HEAD(name) struct list_head name = LIST_HEAD_INIT(name) +/* Make @list an empty list: both links point back at @list itself. */ +static inline void INIT_LIST_HEAD(struct list_head *list) +{ + list->next = list; + list->prev = list; +} +/* Link @new between the two adjacent nodes @prev and @next. */ +static inline void __list_add(struct list_head *new, + struct list_head *prev, + struct list_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} +/* Insert @new right after @head. */ +static inline void list_add(struct list_head *new, struct list_head *head) +{ + __list_add(new, head, head->next); +} +/* Insert @new right before @head, i.e. at the tail of the list. */ +static inline void list_add_tail(struct list_head *new, struct list_head *head) +{ + __list_add(new, head->prev, head); +} + +static inline void __list_del(struct list_head * prev, struct list_head * next) 
+{ + next->prev = prev; + prev->next = next; +} +/* Unlink @entry from its neighbours; @entry's own prev/next are left untouched. */ +static inline void __list_del_entry(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); +} +/* Unlink @entry and overwrite its links with LIST_POISON1/LIST_POISON2. */ +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->next = LIST_POISON1; + entry->prev = LIST_POISON2; +} +/* Splice @new into the position held by @old; @old's own links are not modified. */ +static inline void list_replace(struct list_head *old, + struct list_head *new) +{ + new->next = old->next; + new->next->prev = new; + new->prev = old->prev; + new->prev->next = new; +} +/* list_replace(), then reinitialise @old as an empty list. */ +static inline void list_replace_init(struct list_head *old, + struct list_head *new) +{ + list_replace(old, new); + INIT_LIST_HEAD(old); +} +/* Unlink @entry and reinitialise it as an empty list. */ +static inline void list_del_init(struct list_head *entry) +{ + __list_del_entry(entry); + INIT_LIST_HEAD(entry); +} +/* Unlink @list and re-insert it right after @head. */ +static inline void list_move(struct list_head *list, struct list_head *head) +{ + __list_del_entry(list); + list_add(list, head); +} +/* Unlink @list and re-insert it right before @head (the tail position). */ +static inline void list_move_tail(struct list_head *list, + struct list_head *head) +{ + __list_del_entry(list); + list_add_tail(list, head); +} +/* Non-zero when @list is the entry immediately before @head. */ +static inline int list_is_last(const struct list_head *list, + const struct list_head *head) +{ + return list->next == head; +} +/* Non-zero when @list is the entry immediately after @head. */ +static inline int list_is_first(const struct list_head *list, + const struct list_head *head) +{ + return list->prev == head; +} +/* Non-zero when @head's next link points back at @head, i.e. the list has no entries. */ +static inline int list_empty(const struct list_head *head) +{ + return head->next == head; +} +/* Like list_empty(), but additionally requires head->prev to point back at @head. */ +static inline int list_empty_careful(const struct list_head *head) +{ + struct list_head *next = head->next; + return (next == head) && (next == head->prev); +} +static inline void list_rotate_left(struct list_head *head) +{ + struct list_head *first; + /* if the list is non-empty, move its first entry to the tail */ + if (!list_empty(head)) { + first = head->next; + list_move_tail(first, head); + } +} +/* Non-zero when the list holds exactly one entry. */ +static inline int list_is_singular(const struct list_head *head) +{ + return !list_empty(head) && (head->next == head->prev); +} + +static inline void __list_cut_position(struct list_head *list, + struct list_head *head, struct list_head 
*entry) +{ + struct list_head *new_first = entry->next; + list->next = head->next; + list->next->prev = list; + list->prev = entry; + entry->next = list; + head->next = new_first; + new_first->prev = head; +} + +static inline void list_cut_position(struct list_head *list, + struct list_head *head, struct list_head *entry) +{ + if (list_empty(head)) + return; + if (list_is_singular(head) && + (head->next != entry && head != entry)) + return; + if (entry == head) + INIT_LIST_HEAD(list); + else + __list_cut_position(list, head, entry); +} + +static inline void __list_splice(const struct list_head *list, + struct list_head *prev, + struct list_head *next) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + + first->prev = prev; + prev->next = first; + + last->next = next; + next->prev = last; +} + +static inline void list_splice(const struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head, head->next); +} + +static inline void list_splice_tail(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head->prev, head); +} + +static inline void list_splice_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head, head->next); + INIT_LIST_HEAD(list); + } +} + +static inline void list_splice_tail_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head->prev, head); + INIT_LIST_HEAD(list); + } +} + +#define list_entry(ptr, type, member) \ + container_of(ptr, type, member) + +#define list_first_entry(ptr, type, member) \ + list_entry((ptr)->next, type, member) + +#define list_for_each(pos, head) \ + for (pos = (head)->next; pos != (head); pos = pos->next) + +#define list_for_each_prev(pos, head) \ + for (pos = (head)->prev; pos != (head); pos = pos->prev) + +#define list_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; 
pos != (head); \ + pos = n, n = pos->next) + +#define list_for_each_prev_safe(pos, n, head) \ + for (pos = (head)->prev, n = pos->prev; \ + pos != (head); \ + pos = n, n = pos->prev) + +#define list_for_each_entry(pos, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +#define list_for_each_entry_reverse(pos, head, member) \ + for (pos = list_entry((head)->prev, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry(pos->member.prev, typeof(*pos), member)) + +#define list_prepare_entry(pos, head, member) \ + ((pos) ? : list_entry(head, typeof(*pos), member)) + +#define list_for_each_entry_continue(pos, head, member) \ + for (pos = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +#define list_for_each_entry_continue_reverse(pos, head, member) \ + for (pos = list_entry(pos->member.prev, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry(pos->member.prev, typeof(*pos), member)) + +#define list_for_each_entry_from(pos, head, member) \ + for (; &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +#define list_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +#define list_for_each_entry_safe_continue(pos, n, head, member) \ + for (pos = list_entry(pos->member.next, typeof(*pos), member), \ + n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +#define list_for_each_entry_safe_from(pos, n, head, member) \ + for (n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member 
!= (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +#define list_for_each_entry_safe_reverse(pos, n, head, member) \ + for (pos = list_entry((head)->prev, typeof(*pos), member), \ + n = list_entry(pos->member.prev, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.prev, typeof(*n), member)) + +#define list_safe_reset_next(pos, n, member) \ + n = list_entry(pos->member.next, typeof(*pos), member) + +/* + * Double linked lists with a single pointer list head. + */ + +struct hlist_head { + struct hlist_node *first; +}; + +struct hlist_node { + struct hlist_node *next, **pprev; +}; + +#define HLIST_HEAD_INIT { .first = NULL } +#define HLIST_HEAD(name) struct hlist_head name = { .first = NULL } +#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) + +static inline void INIT_HLIST_NODE(struct hlist_node *h) +{ + h->next = NULL; + h->pprev = NULL; +} + +static inline int hlist_unhashed(const struct hlist_node *h) +{ + return !h->pprev; +} + +static inline int hlist_empty(const struct hlist_head *h) +{ + return !h->first; +} + +static inline void __hlist_del(struct hlist_node *n) +{ + struct hlist_node *next = n->next; + struct hlist_node **pprev = n->pprev; + *pprev = next; + if (next) + next->pprev = pprev; +} + +static inline void hlist_del(struct hlist_node *n) +{ + __hlist_del(n); + n->next = LIST_POISON1; + n->pprev = LIST_POISON2; +} + +static inline void hlist_del_init(struct hlist_node *n) +{ + if (!hlist_unhashed(n)) { + __hlist_del(n); + INIT_HLIST_NODE(n); + } +} + +static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) +{ + struct hlist_node *first = h->first; + n->next = first; + if (first) + first->pprev = &n->next; + h->first = n; + n->pprev = &h->first; +} + +/* next must be != NULL */ +static inline void hlist_add_before(struct hlist_node *n, + struct hlist_node *next) +{ + n->pprev = next->pprev; + n->next = next; + next->pprev = &n->next; + *(n->pprev) = n; +} + 
+static inline void hlist_add_after(struct hlist_node *n, + struct hlist_node *next) +{ + next->next = n->next; + n->next = next; + next->pprev = &n->next; + + if (next->next) + next->next->pprev = &next->next; +} + +/* after that we'll appear to be on some hlist and hlist_del will work */ +static inline void hlist_add_fake(struct hlist_node *n) +{ + n->pprev = &n->next; +} + +/* + * Move a list from one list head to another. Fixup the pprev + * reference of the first entry if it exists. + */ +static inline void hlist_move_list(struct hlist_head *old, + struct hlist_head *new) +{ + new->first = old->first; + if (new->first) + new->first->pprev = &new->first; + old->first = NULL; +} + +#define hlist_entry(ptr, type, member) container_of(ptr,type,member) + +#define hlist_for_each(pos, head) \ + for (pos = (head)->first; pos ; pos = pos->next) + +#define hlist_for_each_safe(pos, n, head) \ + for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \ + pos = n) + +#define hlist_entry_safe(ptr, type, member) \ + (ptr) ? 
hlist_entry(ptr, type, member) : NULL + +#define hlist_for_each_entry(pos, head, member) \ + for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member); \ + pos; \ + pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) + +#define hlist_for_each_entry_continue(pos, member) \ + for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member);\ + pos; \ + pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) + +#define hlist_for_each_entry_from(pos, member) \ + for (; pos; \ + pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) + +#define hlist_for_each_entry_safe(pos, n, head, member) \ + for (pos = hlist_entry_safe((head)->first, typeof(*pos), member); \ + pos && ({ n = pos->member.next; 1; }); \ + pos = hlist_entry_safe(n, typeof(*pos), member)) + +#endif /* __CR_LIST_H__ */ diff --git a/CRIU_code/include/common/lock.h b/CRIU_code/include/common/lock.h new file mode 100644 index 0000000..4782b63 --- /dev/null +++ b/CRIU_code/include/common/lock.h @@ -0,0 +1,164 @@ +#ifndef __CR_COMMON_LOCK_H__ +#define __CR_COMMON_LOCK_H__ + +#include +#include +#include +#include +#include +#include "common/asm/atomic.h" +#include "common/compiler.h" + +#define LOCK_BUG_ON(condition) \ + if ((condition)) \ + *(volatile unsigned long *)NULL = 0xdead0000 + __LINE__ +#define LOCK_BUG() LOCK_BUG_ON(1) + +#ifdef CR_NOGLIBC +# include +#else +# include +# include +static inline long sys_futex (uint32_t *addr1, int op, uint32_t val1, + struct timespec *timeout, uint32_t *addr2, uint32_t val3) +{ + int rc = syscall(SYS_futex, addr1, op, val1, timeout, addr2, val3); + if (rc == -1) rc = -errno; + return rc; +} +#endif + +typedef struct { + atomic_t raw; +} __aligned(sizeof(int)) futex_t; + +#define FUTEX_ABORT_FLAG (0x80000000) +#define FUTEX_ABORT_RAW (-1U) + +/* Get current futex @f value */ +static inline uint32_t futex_get(futex_t *f) +{ + return atomic_read(&f->raw); +} + +/* Set futex @f value to @v */ +static inline void 
futex_set(futex_t *f, uint32_t v) +{ + atomic_set(&f->raw, (int)v); +} + +#define futex_init(f) futex_set(f, 0) + +/* Wait on futex @__f value @__v become in condition @__c */ +#define futex_wait_if_cond(__f, __v, __cond) \ + do { \ + int ret; \ + uint32_t tmp; \ + \ + while (1) { \ + struct timespec to = {.tv_sec = 120}; \ + tmp = futex_get(__f); \ + if ((tmp & FUTEX_ABORT_FLAG) || \ + (tmp __cond (__v))) \ + break; \ + ret = sys_futex((uint32_t *)&(__f)->raw.counter, FUTEX_WAIT,\ + tmp, &to, NULL, 0); \ + if (ret == -ETIMEDOUT) \ + continue; \ + if (ret == -EINTR || ret == -EWOULDBLOCK) \ + continue; \ + if (ret < 0) \ + LOCK_BUG(); \ + } \ + } while (0) + +/* Set futex @f to @v and wake up all waiters */ +static inline void futex_set_and_wake(futex_t *f, uint32_t v) +{ + atomic_set(&f->raw, (int)v); + LOCK_BUG_ON(sys_futex((uint32_t *)&f->raw.counter, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) < 0); +} + +/* Wake up all futex @f waiters */ +static inline void futex_wake(futex_t *f) +{ + LOCK_BUG_ON(sys_futex((uint32_t *)&f->raw.counter, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) < 0); +} + +/* Mark futex @f as wait abort needed and wake up all waiters */ +static inline void futex_abort_and_wake(futex_t *f) +{ + BUILD_BUG_ON(!(FUTEX_ABORT_RAW & FUTEX_ABORT_FLAG)); + futex_set_and_wake(f, FUTEX_ABORT_RAW); +} + +/* Decrement futex @f value and wake up all waiters */ +static inline void futex_dec_and_wake(futex_t *f) +{ + atomic_dec(&f->raw); + LOCK_BUG_ON(sys_futex((uint32_t *)&f->raw.counter, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) < 0); +} + +/* Increment futex @f value and wake up all waiters */ +static inline void futex_inc_and_wake(futex_t *f) +{ + atomic_inc(&f->raw); + LOCK_BUG_ON(sys_futex((uint32_t *)&f->raw.counter, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) < 0); +} + +/* Plain increment futex @f value */ +static inline void futex_inc(futex_t *f) { atomic_inc(&f->raw); } + +/* Plain decrement futex @f value */ +static inline void futex_dec(futex_t *f) { atomic_dec(&f->raw); } + 
+/* Wait until futex @f value become @v */ +#define futex_wait_until(f, v) futex_wait_if_cond(f, v, ==) + +/* Wait while futex @f value is greater than @v */ +#define futex_wait_while_gt(f, v) futex_wait_if_cond(f, v, <=) + +/* Wait while futex @f value is less than @v */ +#define futex_wait_while_lt(f, v) futex_wait_if_cond(f, v, >=) + +/* Wait while futex @f value is equal to @v */ +#define futex_wait_while_eq(f, v) futex_wait_if_cond(f, v, !=) + +/* Wait while futex @f value is @v */ +static inline void futex_wait_while(futex_t *f, uint32_t v) +{ + while ((uint32_t)atomic_read(&f->raw) == v) { + int ret = sys_futex((uint32_t *)&f->raw.counter, FUTEX_WAIT, v, NULL, NULL, 0); + LOCK_BUG_ON(ret < 0 && ret != -EWOULDBLOCK); + } +} + +typedef struct { + atomic_t raw; +} mutex_t; + +static inline void mutex_init(mutex_t *m) +{ + uint32_t c = 0; + atomic_set(&m->raw, (int)c); +} + +static inline void mutex_lock(mutex_t *m) +{ + uint32_t c; + int ret; + + while ((c = (uint32_t)atomic_inc_return(&m->raw)) != 1) { + ret = sys_futex((uint32_t *)&m->raw.counter, FUTEX_WAIT, c, NULL, NULL, 0); + LOCK_BUG_ON(ret < 0 && ret != -EWOULDBLOCK); + } +} + +static inline void mutex_unlock(mutex_t *m) +{ + uint32_t c = 0; + atomic_set(&m->raw, (int)c); + LOCK_BUG_ON(sys_futex((uint32_t *)&m->raw.counter, FUTEX_WAKE, 1, NULL, NULL, 0) < 0); +} + +#endif /* __CR_COMMON_LOCK_H__ */ diff --git a/CRIU_code/include/common/page.h b/CRIU_code/include/common/page.h new file mode 100644 index 0000000..4b6b8a6 --- /dev/null +++ b/CRIU_code/include/common/page.h @@ -0,0 +1,4 @@ +#ifndef __CR_COMMON_PAGE_H__ +#define __CR_COMMON_PAGE_H__ +#include "common/asm/page.h" +#endif diff --git a/CRIU_code/include/common/scm-code.c b/CRIU_code/include/common/scm-code.c new file mode 100644 index 0000000..351c405 --- /dev/null +++ b/CRIU_code/include/common/scm-code.c @@ -0,0 +1,121 @@ +#ifndef __sys +#error "The __sys macro is required" +#endif + +static void scm_fdset_init_chunk(struct scm_fdset *fdset, 
int nr_fds, + void *data, unsigned ch_size) +{ + struct cmsghdr *cmsg; + static char dummy; + + fdset->hdr.msg_controllen = CMSG_LEN(sizeof(int) * nr_fds); + + cmsg = CMSG_FIRSTHDR(&fdset->hdr); + cmsg->cmsg_len = fdset->hdr.msg_controllen; + + if (data) { + fdset->iov.iov_base = data; + fdset->iov.iov_len = nr_fds * ch_size; + } else { + fdset->iov.iov_base = &dummy; + fdset->iov.iov_len = 1; + } +} + +static int *scm_fdset_init(struct scm_fdset *fdset, struct sockaddr_un *saddr, + int saddr_len) +{ + struct cmsghdr *cmsg; + + BUILD_BUG_ON(sizeof(fdset->msg_buf) < (CMSG_SPACE(sizeof(int) * CR_SCM_MAX_FD))); + + fdset->iov.iov_base = (void *)0xdeadbeef; + + fdset->hdr.msg_iov = &fdset->iov; + fdset->hdr.msg_iovlen = 1; + fdset->hdr.msg_name = (struct sockaddr *)saddr; + fdset->hdr.msg_namelen = saddr_len; + + fdset->hdr.msg_control = &fdset->msg_buf; + fdset->hdr.msg_controllen = CMSG_LEN(sizeof(int) * CR_SCM_MAX_FD); + + cmsg = CMSG_FIRSTHDR(&fdset->hdr); + cmsg->cmsg_len = fdset->hdr.msg_controllen; + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + + return (int *)CMSG_DATA(cmsg); +} + +int send_fds(int sock, struct sockaddr_un *saddr, int len, + int *fds, int nr_fds, void *data, unsigned ch_size) +{ + /* In musl_libc the msghdr structure has pads which has to be zeroed */ + struct scm_fdset fdset = {}; + int *cmsg_data; + int i, min_fd, ret; + + cmsg_data = scm_fdset_init(&fdset, saddr, len); + for (i = 0; i < nr_fds; i += min_fd) { + min_fd = min(CR_SCM_MAX_FD, nr_fds - i); + scm_fdset_init_chunk(&fdset, min_fd, data, ch_size); + memcpy(cmsg_data, &fds[i], sizeof(int) * min_fd); + + ret = __sys(sendmsg)(sock, &fdset.hdr, 0); + if (ret <= 0) + return ret ? 
: -1; + + if (data) + data += min_fd * ch_size; + } + + return 0; +} + +int __recv_fds(int sock, int *fds, int nr_fds, void *data, unsigned ch_size, int flags) +{ + /* In musl_libc the msghdr structure has pads which has to be zeroed */ + struct scm_fdset fdset = {}; + struct cmsghdr *cmsg; + int *cmsg_data; + int ret; + int i, min_fd; + + cmsg_data = scm_fdset_init(&fdset, NULL, 0); + for (i = 0; i < nr_fds; i += min_fd) { + min_fd = min(CR_SCM_MAX_FD, nr_fds - i); + scm_fdset_init_chunk(&fdset, min_fd, data, ch_size); + + ret = __sys(recvmsg)(sock, &fdset.hdr, flags); + if (ret <= 0) + return ret ? __sys_err(ret) : -ENOMSG; + + cmsg = CMSG_FIRSTHDR(&fdset.hdr); + if (!cmsg || cmsg->cmsg_type != SCM_RIGHTS) + return -EINVAL; + if (fdset.hdr.msg_flags & MSG_CTRUNC) + return -ENFILE; + + min_fd = (cmsg->cmsg_len - sizeof(struct cmsghdr)) / sizeof(int); + /* + * In case if kernel screwed the recipient, most probably + * the caller stack frame will be overwriten, just scream + * and exit. + * + * FIXME Need to sanitize util.h to be able to include it + * into files which do not have glibc and a couple of + * sys_write_ helpers. Meawhile opencoded BUG_ON here. + */ + BUG_ON(min_fd > CR_SCM_MAX_FD); + + if (unlikely(min_fd <= 0)) + return -EBADFD; + + memcpy(&fds[i], cmsg_data, sizeof(int) * min_fd); + if (data) + data += ch_size * min_fd; + } + + return 0; +} + diff --git a/CRIU_code/include/common/scm.h b/CRIU_code/include/common/scm.h new file mode 100644 index 0000000..ab27137 --- /dev/null +++ b/CRIU_code/include/common/scm.h @@ -0,0 +1,54 @@ +#ifndef __COMMON_SCM_H__ +#define __COMMON_SCM_H__ + +#include +#include +#include + +/* + * Because of kernel doing kmalloc for user data passed + * in SCM messages, and there is kernel's SCM_MAX_FD as a limit + * for descriptors passed at once we're trying to reduce + * the pressue on kernel memory manager and use predefined + * known to work well size of the message buffer. 
+ */ +#define CR_SCM_MSG_SIZE (1024) +#define CR_SCM_MAX_FD (252) + +struct scm_fdset { + struct msghdr hdr; + struct iovec iov; + char msg_buf[CR_SCM_MSG_SIZE]; +}; + +#ifndef F_GETOWNER_UIDS +#define F_GETOWNER_UIDS 17 +#endif + +extern int send_fds(int sock, struct sockaddr_un *saddr, int len, + int *fds, int nr_fds, void *data, unsigned ch_size); +extern int __recv_fds(int sock, int *fds, int nr_fds, + void *data, unsigned ch_size, int flags); +static inline int recv_fds(int sock, int *fds, int nr_fds, + void *data, unsigned ch_size) +{ + return __recv_fds(sock, fds, nr_fds, data, ch_size, 0); +} + +static inline int send_fd(int sock, struct sockaddr_un *saddr, int saddr_len, int fd) +{ + return send_fds(sock, saddr, saddr_len, &fd, 1, NULL, 0); +} + +static inline int recv_fd(int sock) +{ + int fd, ret; + + ret = recv_fds(sock, &fd, 1, NULL, 0); + if (ret) + return -1; + + return fd; +} + +#endif diff --git a/CRIU_code/include/common/xmalloc.h b/CRIU_code/include/common/xmalloc.h new file mode 100644 index 0000000..d377c83 --- /dev/null +++ b/CRIU_code/include/common/xmalloc.h @@ -0,0 +1,69 @@ +#ifndef __COMMON_XMALLOC_H__ +#define __COMMON_XMALLOC_H__ + +#include +#include + +#ifndef pr_err +#error "Macro pr_err is needed." +#endif + +#define __xalloc(op, size, ...) 
\ + ({ \ + void *___p = op( __VA_ARGS__ ); \ + if (!___p) \ + pr_err("%s: Can't allocate %li bytes\n", \ + __func__, (long)(size)); \ + ___p; \ + }) + +#define xstrdup(str) __xalloc(strdup, strlen(str) + 1, str) +#define xmalloc(size) __xalloc(malloc, size, size) +#define xzalloc(size) __xalloc(calloc, size, 1, size) +#define xrealloc(p, size) __xalloc(realloc, size, p, size) + +#define xfree(p) free(p) + +#define xrealloc_safe(pptr, size) \ + ({ \ + int __ret = -1; \ + void *new = xrealloc(*pptr, size); \ + if (new) { \ + *pptr = new; \ + __ret = 0; \ + } \ + __ret; \ + }) + +#define xmemdup(ptr, size) \ + ({ \ + void *new = xmalloc(size); \ + if (new) \ + memcpy(new, ptr, size); \ + new; \ + }) + +#define memzero_p(p) memset(p, 0, sizeof(*p)) +#define memzero(p, size) memset(p, 0, size) + +/* + * Helper for allocating trees with single xmalloc. + * This one advances the void *pointer on s bytes and + * returns the previous value. Use like this + * + * m = xmalloc(total_size); + * a = xptr_pull(&m, tree_root_t); + * a->b = xptr_pull(&m, leaf_a_t); + * a->c = xptr_pull(&m, leaf_c_t); + * ... + */ +static inline void *xptr_pull_s(void **m, size_t s) +{ + void *ret = (*m); + (*m) += s; + return ret; +} + +#define xptr_pull(m, type) xptr_pull_s(m, sizeof(type)) + +#endif /* __CR_XMALLOC_H__ */ diff --git a/CRIU_code/lib/Makefile b/CRIU_code/lib/Makefile new file mode 100644 index 0000000..67c50b9 --- /dev/null +++ b/CRIU_code/lib/Makefile @@ -0,0 +1,76 @@ +CRIU_SO := libcriu.so +CRIU_A := libcriu.a +UAPI_HEADERS := lib/c/criu.h images/rpc.proto + +# +# File to keep track of files installed by setup.py +CRIT_SETUP_FILES := lib/.crit-setup.files + +all-y += lib-c lib-a lib-py + +# +# C language bindings. 
+lib/c/Makefile: ; +lib/c/%: .FORCE + $(Q) $(MAKE) $(build)=lib/c $@ + +cflags-so += $(CFLAGS) -rdynamic -Wl,-soname,$(CRIU_SO).$(CRIU_SO_VERSION_MAJOR) +ldflags-so += -lprotobuf-c + +lib/c/$(CRIU_SO): lib/c/built-in.o + $(call msg-link, $@) + $(Q) $(CC) -shared $(cflags-so) -o $@ $^ $(ldflags-so) $(LDFLAGS) +lib/c/$(CRIU_A): lib/c/built-in.o + $(call msg-link, $@) + $(Q) $(AR) rcs $@ $^ +lib-c: lib/c/$(CRIU_SO) +lib-a: lib/c/$(CRIU_A) +.PHONY: lib-c lib-a + +# +# Python bindings. +lib/py/Makefile: ; +lib/py/%: .FORCE + $(call msg-gen, $@) + $(Q) $(MAKE) $(build)=lib/py $@ +lib-py: + $(Q) $(MAKE) $(build)=lib/py all +.PHONY: lib-py + +clean-lib: + $(Q) $(MAKE) $(build)=lib/c clean + $(Q) $(MAKE) $(build)=lib/py clean +.PHONY: clean-lib +clean: clean-lib +cleanup-y += lib/c/$(CRIU_SO) lib/c/$(CRIU_A) lib/c/criu.pc +mrproper: clean + +install: lib-c lib-a lib-py crit/crit lib/c/criu.pc.in + $(E) " INSTALL " lib + $(Q) mkdir -p $(DESTDIR)$(LIBDIR) + $(Q) install -m 755 lib/c/$(CRIU_SO) $(DESTDIR)$(LIBDIR)/$(CRIU_SO).$(CRIU_SO_VERSION_MAJOR).$(CRIU_SO_VERSION_MINOR) + $(Q) ln -fns $(CRIU_SO).$(CRIU_SO_VERSION_MAJOR).$(CRIU_SO_VERSION_MINOR) $(DESTDIR)$(LIBDIR)/$(CRIU_SO).$(CRIU_SO_VERSION_MAJOR) + $(Q) ln -fns $(CRIU_SO).$(CRIU_SO_VERSION_MAJOR).$(CRIU_SO_VERSION_MINOR) $(DESTDIR)$(LIBDIR)/$(CRIU_SO) + $(Q) install -m 755 lib/c/$(CRIU_A) $(DESTDIR)$(LIBDIR)/$(CRIU_A) + $(Q) mkdir -p $(DESTDIR)$(INCLUDEDIR)/criu/ + $(Q) install -m 644 $(UAPI_HEADERS) $(DESTDIR)$(INCLUDEDIR)/criu/ + $(E) " INSTALL " pkgconfig/criu.pc + $(Q) mkdir -p $(DESTDIR)$(LIBDIR)/pkgconfig + $(Q) sed -e 's,@version@,$(CRIU_VERSION),' -e 's,@libdir@,$(LIBDIR),' -e 's,@includedir@,$(dir $(INCLUDEDIR)/criu/),' lib/c/criu.pc.in > lib/c/criu.pc + $(Q) install -m 644 lib/c/criu.pc $(DESTDIR)$(LIBDIR)/pkgconfig + $(E) " INSTALL " crit + $(Q) $(PYTHON) scripts/crit-setup.py install --prefix=$(DESTDIR)$(PREFIX) --record $(CRIT_SETUP_FILES) +.PHONY: install + +uninstall: + $(E) " UNINSTALL" $(CRIU_SO) + $(Q) 
$(RM) $(addprefix $(DESTDIR)$(LIBDIR)/,$(CRIU_SO).$(CRIU_SO_VERSION_MAJOR)) + $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBDIR)/,$(CRIU_SO)) + $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBDIR)/,$(CRIU_A)) + $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBDIR)/,$(CRIU_SO).$(CRIU_SO_VERSION_MAJOR).$(CRIU_SO_VERSION_MINOR)) + $(Q) $(RM) $(addprefix $(DESTDIR)$(INCLUDEDIR)/criu/,$(notdir $(UAPI_HEADERS))) + $(E) " UNINSTALL" pkgconfig/criu.pc + $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBDIR)/pkgconfig/,criu.pc) + $(E) " UNINSTALL" crit + $(Q) while read -r file; do $(RM) "$$file"; done < $(CRIT_SETUP_FILES) +.PHONY: uninstall diff --git a/CRIU_code/lib/c/Makefile b/CRIU_code/lib/c/Makefile new file mode 100644 index 0000000..af01467 --- /dev/null +++ b/CRIU_code/lib/c/Makefile @@ -0,0 +1,8 @@ +obj-y += criu.o +obj-y += ./images/rpc.pb-c.o + +ccflags-y += -iquote criu/$(ARCH_DIR)/include +ccflags-y += -iquote criu/include +ccflags-y += -iquote images +ccflags-y += -fPIC -fno-stack-protector +ldflags-y += -r -z noexecstack diff --git a/CRIU_code/lib/c/criu.c b/CRIU_code/lib/c/criu.c new file mode 100644 index 0000000..bdf8f93 --- /dev/null +++ b/CRIU_code/lib/c/criu.c @@ -0,0 +1,1631 @@ +#include "version.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "criu.h" +#include "rpc.pb-c.h" +#include "cr-service-const.h" + +#define CR_DEFAULT_SERVICE_BIN "criu" + +const char *criu_lib_version = CRIU_VERSION; + +struct criu_opts { + CriuOpts *rpc; + int (*notify)(char *action, criu_notify_arg_t na); + enum criu_service_comm service_comm; + union { + const char *service_address; + int service_fd; + const char *service_binary; + }; + int swrk_pid; +}; + +static criu_opts *global_opts; +static int saved_errno; + +void criu_free_service(criu_opts *opts) +{ + switch(opts->service_comm) { + case CRIU_COMM_SK: + free((void*)(opts->service_address)); + break; + case CRIU_COMM_BIN: + free((void*)(opts->service_binary)); + break; + 
default: + break; + } +} + +int criu_local_set_service_address(criu_opts *opts, const char *path) +{ + criu_free_service(opts); + opts->service_comm = CRIU_COMM_SK; + if (path) { + opts->service_address = strdup(path); + } else { + opts->service_address = strdup(CR_DEFAULT_SERVICE_ADDRESS); + } + if(opts->service_address == NULL) { + return -ENOMEM; + } + return 0; +} + +int criu_set_service_address(const char *path) +{ + return criu_local_set_service_address(global_opts, path); +} + +void criu_local_set_service_fd(criu_opts *opts, int fd) +{ + criu_free_service(opts); + opts->service_comm = CRIU_COMM_FD; + opts->service_fd = fd; +} + +void criu_set_service_fd(int fd) +{ + criu_local_set_service_fd(global_opts, fd); +} + +int criu_local_set_service_binary(criu_opts *opts, const char *path) +{ + criu_free_service(opts); + opts->service_comm = CRIU_COMM_BIN; + if (path) { + opts->service_binary = strdup(path); + } else { + opts->service_binary = strdup(CR_DEFAULT_SERVICE_BIN); + } + if(opts->service_binary == NULL) { + return -ENOMEM; + } + return 0; +} + +int criu_set_service_binary(const char *path) +{ + return criu_local_set_service_binary(global_opts, path); +} + +void criu_local_free_opts(criu_opts *opts) +{ + int i; + + if (!opts) + return; + if (!opts->rpc) + return; + + if (opts->rpc->exec_cmd) { + for (i = 0; i < opts->rpc->n_exec_cmd; i++) { + free(opts->rpc->exec_cmd[i]); + } + free(opts->rpc->exec_cmd); + } + opts->rpc->n_exec_cmd = 0; + + if(opts->rpc->unix_sk_ino) { + for (i = 0; i < opts->rpc->n_unix_sk_ino; i++) { + free(opts->rpc->unix_sk_ino[i]); + } + free(opts->rpc->unix_sk_ino); + } + opts->rpc->n_unix_sk_ino = 0; + + if(opts->rpc->ext_mnt) { + for (i = 0; i < opts->rpc->n_ext_mnt; i++) { + if (opts->rpc->ext_mnt[i]) { + free(opts->rpc->ext_mnt[i]->val); + free(opts->rpc->ext_mnt[i]->key); + free(opts->rpc->ext_mnt[i]); + } + } + free(opts->rpc->ext_mnt); + } + opts->rpc->n_ext_mnt = 0; + + if(opts->rpc->cg_root) { + for (i = 0; i < 
opts->rpc->n_cg_root; i++) { + if (opts->rpc->cg_root[i]) { + free(opts->rpc->cg_root[i]->ctrl); + free(opts->rpc->cg_root[i]->path); + free(opts->rpc->cg_root[i]); + } + } + free(opts->rpc->cg_root); + } + opts->rpc->n_cg_root = 0; + + if(opts->rpc->veths) { + for (i = 0; i < opts->rpc->n_veths; i++) { + if (opts->rpc->veths[i]) { + free(opts->rpc->veths[i]->if_in); + free(opts->rpc->veths[i]->if_out); + free(opts->rpc->veths[i]); + } + } + free(opts->rpc->veths); + } + opts->rpc->n_veths = 0; + + if(opts->rpc->enable_fs) { + for (i = 0; i < opts->rpc->n_enable_fs; i++) { + free(opts->rpc->enable_fs[i]); + } + free(opts->rpc->enable_fs); + } + opts->rpc->n_enable_fs = 0; + + if(opts->rpc->skip_mnt) { + for (i = 0; i < opts->rpc->n_skip_mnt; i++) { + free(opts->rpc->skip_mnt[i]); + } + free(opts->rpc->skip_mnt); + } + opts->rpc->n_skip_mnt = 0; + + if(opts->rpc->irmap_scan_paths) { + for (i = 0; i < opts->rpc->n_irmap_scan_paths; i++) { + free(opts->rpc->irmap_scan_paths[i]); + } + free(opts->rpc->irmap_scan_paths); + } + opts->rpc->n_irmap_scan_paths = 0; + + if(opts->rpc->cgroup_dump_controller) { + for (i = 0; i < opts->rpc->n_cgroup_dump_controller; i++) { + free(opts->rpc->cgroup_dump_controller[i]); + } + free(opts->rpc->cgroup_dump_controller); + } + opts->rpc->n_cgroup_dump_controller = 0; + + if(opts->rpc->inherit_fd) { + for (i = 0; i < opts->rpc->n_inherit_fd; i++) { + if (opts->rpc->inherit_fd[i]) { + free(opts->rpc->inherit_fd[i]->key); + free(opts->rpc->inherit_fd[i]); + } + } + free(opts->rpc->inherit_fd); + } + opts->rpc->n_inherit_fd = 0; + + if(opts->rpc->external) { + for (i = 0; i < opts->rpc->n_external; i++) { + free(opts->rpc->external[i]); + } + free(opts->rpc->external); + } + opts->rpc->n_external = 0; + + if(opts->rpc->ps) { + free(opts->rpc->ps->address); + free(opts->rpc->ps); + } + + free(opts->rpc->cgroup_props_file); + free(opts->rpc->cgroup_props); + free(opts->rpc->parent_img); + free(opts->rpc->root); + 
free(opts->rpc->freeze_cgroup); + free(opts->rpc->log_file); + free(opts->rpc->lsm_profile); + free(opts->rpc); + criu_free_service(opts); + free(opts); +} + +int criu_local_init_opts(criu_opts **o) +{ + criu_opts *opts = NULL; + CriuOpts *rpc = NULL; + + opts = *o; + + criu_local_free_opts(opts); + *o = NULL; + + rpc = malloc(sizeof(CriuOpts)); + if (rpc == NULL) { + perror("Can't allocate memory for criu RPC opts"); + return -1; + } + + criu_opts__init(rpc); + + opts = malloc(sizeof(criu_opts)); + if (opts == NULL) { + perror("Can't allocate memory for criu opts"); + criu_local_free_opts(opts); + free(rpc); + return -1; + } + + opts->rpc = rpc; + opts->notify = NULL; + + opts->service_comm = CRIU_COMM_BIN; + opts->service_binary = strdup(CR_DEFAULT_SERVICE_BIN); + + if(opts->service_binary == NULL) { + perror("Can't allocate memory for criu service setting"); + criu_local_free_opts(opts); + return -1; + } + + *o = opts; + + return 0; +} + +int criu_init_opts(void) +{ + return criu_local_init_opts(&global_opts); +} + +void criu_free_opts(void) +{ + criu_local_free_opts(global_opts); + global_opts = NULL; +} + +void criu_local_set_notify_cb(criu_opts *opts, int (*cb)(char *action, criu_notify_arg_t na)) +{ + opts->notify = cb; + opts->rpc->has_notify_scripts = true; + opts->rpc->notify_scripts = true; +} + +void criu_set_notify_cb(int (*cb)(char *action, criu_notify_arg_t na)) +{ + criu_local_set_notify_cb(global_opts, cb); +} + +int criu_notify_pid(criu_notify_arg_t na) +{ + return na->has_pid ? 
na->pid : 0; +} + +void criu_local_set_pid(criu_opts *opts, int pid) +{ + opts->rpc->has_pid = true; + opts->rpc->pid = pid; +} + +void criu_set_pid(int pid) +{ + criu_local_set_pid(global_opts, pid); +} + +void criu_local_set_images_dir_fd(criu_opts *opts, int fd) +{ + opts->rpc->images_dir_fd = fd; +} + +void criu_set_images_dir_fd(int fd) +{ + criu_local_set_images_dir_fd(global_opts, fd); +} + +int criu_local_set_parent_images(criu_opts *opts, const char *path) +{ + opts->rpc->parent_img = strdup(path); + if(opts->rpc->parent_img == NULL) { + return -ENOMEM; + } + return 0; +} + +int criu_set_parent_images(const char *path) +{ + return criu_local_set_parent_images(global_opts, path); +} + +void criu_local_set_track_mem(criu_opts *opts, bool track_mem) +{ + opts->rpc->has_track_mem = true; + opts->rpc->track_mem = track_mem; +} + +void criu_set_track_mem(bool track_mem) +{ + criu_local_set_track_mem(global_opts, track_mem); +} + +void criu_local_set_auto_dedup(criu_opts *opts, bool auto_dedup) +{ + opts->rpc->has_auto_dedup = true; + opts->rpc->auto_dedup = auto_dedup; +} + +void criu_set_auto_dedup(bool auto_dedup) +{ + criu_local_set_auto_dedup(global_opts, auto_dedup); +} + +void criu_local_set_force_irmap(criu_opts *opts, bool force_irmap) +{ + opts->rpc->has_force_irmap = true; + opts->rpc->force_irmap = force_irmap; +} + +void criu_set_force_irmap(bool force_irmap) +{ + criu_local_set_force_irmap(global_opts, force_irmap); +} + +void criu_local_set_link_remap(criu_opts *opts, bool link_remap) +{ + opts->rpc->has_link_remap = true; + opts->rpc->link_remap = link_remap; +} + +void criu_set_link_remap(bool link_remap) +{ + criu_local_set_link_remap(global_opts, link_remap); +} + +void criu_local_set_work_dir_fd(criu_opts *opts, int fd) +{ + opts->rpc->has_work_dir_fd = true; + opts->rpc->work_dir_fd = fd; +} + +void criu_set_work_dir_fd(int fd) +{ + criu_local_set_work_dir_fd(global_opts, fd); +} + +void criu_local_set_leave_running(criu_opts *opts, bool 
leave_running) +{ + opts->rpc->has_leave_running = true; + opts->rpc->leave_running = leave_running; +} + +void criu_set_leave_running(bool leave_running) +{ + criu_local_set_leave_running(global_opts, leave_running); +} + +void criu_local_set_ext_unix_sk(criu_opts *opts, bool ext_unix_sk) +{ + opts->rpc->has_ext_unix_sk = true; + opts->rpc->ext_unix_sk = ext_unix_sk; +} + +void criu_set_ext_unix_sk(bool ext_unix_sk) +{ + criu_local_set_ext_unix_sk(global_opts, ext_unix_sk); +} + +int criu_local_add_unix_sk(criu_opts *opts, unsigned int inode) +{ + int nr; + UnixSk **a, *u; + + /*if caller forgot enable ext_unix_sk option we do it*/ + if (!opts->rpc->has_ext_unix_sk) { + criu_local_set_ext_unix_sk(opts, true); + } + + /*if user disabled ext_unix_sk and try to add unixsk inode after that*/ + if (opts->rpc->has_ext_unix_sk && !opts->rpc->ext_unix_sk) { + if (opts->rpc->n_unix_sk_ino > 0) { + free(opts->rpc->unix_sk_ino); + opts->rpc->n_unix_sk_ino = 0; + } + return -1; + } + + u = malloc(sizeof(*u)); + if (!u) + goto er; + unix_sk__init(u); + + u->inode = inode; + + nr = opts->rpc->n_unix_sk_ino + 1; + a = realloc(opts->rpc->unix_sk_ino, nr * sizeof(u)); + if (!a) + goto er_u; + + a[nr - 1] = u; + opts->rpc->unix_sk_ino = a; + opts->rpc->n_unix_sk_ino = nr; + return 0; + +er_u: + free(u); +er: + return -ENOMEM; +} + +int criu_add_unix_sk(unsigned int inode) +{ + return criu_local_add_unix_sk(global_opts, inode); +} + +void criu_local_set_tcp_established(criu_opts *opts, bool tcp_established) +{ + opts->rpc->has_tcp_established = true; + opts->rpc->tcp_established = tcp_established; +} + +void criu_set_tcp_established(bool tcp_established) +{ + criu_local_set_tcp_established(global_opts, tcp_established); +} + +void criu_local_set_tcp_skip_in_flight(criu_opts *opts, bool tcp_skip_in_flight) +{ + opts->rpc->has_tcp_skip_in_flight = true; + opts->rpc->tcp_skip_in_flight = tcp_skip_in_flight; +} + +void criu_set_tcp_skip_in_flight(bool tcp_skip_in_flight) +{ + 
criu_local_set_tcp_skip_in_flight(global_opts, tcp_skip_in_flight); +} + +void criu_local_set_tcp_close(criu_opts *opts, bool tcp_close) +{ + opts->rpc->has_tcp_close = true; + opts->rpc->tcp_close = tcp_close; +} + +void criu_set_tcp_close(bool tcp_close) +{ + criu_local_set_tcp_close(global_opts, tcp_close); +} + +void criu_local_set_weak_sysctls(criu_opts *opts, bool val) +{ + opts->rpc->has_weak_sysctls = true; + opts->rpc->weak_sysctls = val; +} + +void criu_set_weak_sysctls(bool val) +{ + criu_local_set_weak_sysctls(global_opts, val); +} + +void criu_local_set_evasive_devices(criu_opts *opts, bool evasive_devices) +{ + opts->rpc->has_evasive_devices = true; + opts->rpc->evasive_devices = evasive_devices; +} + +void criu_set_evasive_devices(bool evasive_devices) +{ + criu_local_set_evasive_devices(global_opts, evasive_devices); +} + +void criu_local_set_shell_job(criu_opts *opts, bool shell_job) +{ + opts->rpc->has_shell_job = true; + opts->rpc->shell_job = shell_job; +} + +void criu_set_shell_job(bool shell_job) +{ + criu_local_set_shell_job(global_opts, shell_job); +} + +void criu_local_set_file_locks(criu_opts *opts, bool file_locks) +{ + opts->rpc->has_file_locks = true; + opts->rpc->file_locks = file_locks; +} + +void criu_set_file_locks(bool file_locks) +{ + criu_local_set_file_locks(global_opts, file_locks); +} + +void criu_local_set_log_level(criu_opts *opts, int log_level) +{ + opts->rpc->has_log_level = true; + opts->rpc->log_level = log_level; +} + +void criu_set_log_level(int log_level) +{ + criu_local_set_log_level(global_opts, log_level); +} + +int criu_local_set_root(criu_opts *opts, const char *root) +{ + opts->rpc->root = strdup(root); + if(opts->rpc->root == NULL) { + return -ENOMEM; + } + return 0; +} + +int criu_set_root(const char *root) +{ + return criu_local_set_root(global_opts, root); +} + +void criu_local_set_manage_cgroups(criu_opts *opts, bool manage) +{ + opts->rpc->has_manage_cgroups = true; + opts->rpc->manage_cgroups = manage; 
+} + +void criu_set_manage_cgroups(bool manage) +{ + criu_local_set_manage_cgroups(global_opts, manage); +} + +void criu_local_set_manage_cgroups_mode(criu_opts *opts, enum criu_cg_mode mode) +{ + opts->rpc->has_manage_cgroups_mode = true; + opts->rpc->manage_cgroups_mode = (CriuCgMode)mode; +} + +void criu_set_manage_cgroups_mode(enum criu_cg_mode mode) +{ + criu_local_set_manage_cgroups_mode(global_opts, mode); +} + +int criu_local_set_freeze_cgroup(criu_opts *opts, const char *name) +{ + opts->rpc->freeze_cgroup = strdup(name); + if(opts->rpc->freeze_cgroup == NULL) { + return -ENOMEM; + } + return 0; +} + +int criu_set_freeze_cgroup(const char *name) +{ + return criu_local_set_freeze_cgroup(global_opts, name); +} + +int criu_local_set_lsm_profile(criu_opts *opts, const char *name) +{ + opts->rpc->lsm_profile = strdup(name); + if(opts->rpc->lsm_profile == NULL) { + return -ENOMEM; + } + return 0; +} + +int criu_set_lsm_profile(const char *name) +{ + return criu_local_set_lsm_profile(global_opts, name); +} + +void criu_local_set_timeout(criu_opts *opts, unsigned int timeout) +{ + opts->rpc->timeout = timeout; +} + +void criu_set_timeout(unsigned int timeout) +{ + criu_local_set_timeout(global_opts, timeout); +} + +void criu_local_set_auto_ext_mnt(criu_opts *opts, bool val) +{ + opts->rpc->has_auto_ext_mnt = true; + opts->rpc->auto_ext_mnt = val; +} + +void criu_set_auto_ext_mnt(bool val) +{ + criu_local_set_auto_ext_mnt(global_opts, val); +} + +void criu_local_set_ext_sharing(criu_opts *opts, bool val) +{ + opts->rpc->has_ext_sharing = true; + opts->rpc->ext_sharing = val; +} + +void criu_set_ext_sharing(bool val) +{ + criu_local_set_ext_sharing(global_opts, val); +} + +void criu_local_set_ext_masters(criu_opts *opts, bool val) +{ + opts->rpc->has_ext_masters = true; + opts->rpc->ext_masters = val; +} + +void criu_set_ext_masters(bool val) +{ + criu_local_set_ext_masters(global_opts, val); +} + +int criu_local_set_log_file(criu_opts *opts, const char *log_file) 
+{ + opts->rpc->log_file = strdup(log_file); + if(opts->rpc->log_file == NULL) { + return -ENOMEM; + } + return 0; +} + +int criu_set_log_file(const char *log_file) +{ + return criu_local_set_log_file(global_opts, log_file); +} + +void criu_local_set_cpu_cap(criu_opts *opts, unsigned int cap) +{ + opts->rpc->has_cpu_cap = true; + opts->rpc->cpu_cap = cap; +} + +void criu_set_cpu_cap(unsigned int cap) +{ + criu_local_set_cpu_cap(global_opts, cap); +} + +int criu_local_set_exec_cmd(criu_opts *opts, int argc, char *argv[]) +{ + int i; + + opts->rpc->n_exec_cmd = argc; + opts->rpc->exec_cmd = malloc((argc) * sizeof(char *)); + + if (opts->rpc->exec_cmd) { + for (i = 0; i < argc; i++) { + opts->rpc->exec_cmd[i] = strdup(argv[i]); + if (!opts->rpc->exec_cmd[i]) { + while (i > 0) + free(opts->rpc->exec_cmd[i--]); + free(opts->rpc->exec_cmd); + opts->rpc->n_exec_cmd = 0; + opts->rpc->exec_cmd = NULL; + goto out; + } + } + return 0; + } + +out: + return -ENOMEM; +} + +int criu_set_exec_cmd(int argc, char *argv[]) +{ + return criu_local_set_exec_cmd(global_opts, argc, argv); +} + +int criu_local_add_ext_mount(criu_opts *opts, const char *key, const char *val) +{ + int nr; + ExtMountMap **a, *m; + + m = malloc(sizeof(*m)); + if (!m) + goto er; + ext_mount_map__init(m); + + m->key = strdup(key); + if (!m->key) + goto er_n; + m->val = strdup(val); + if (!m->val) + goto er_k; + + nr = opts->rpc->n_ext_mnt + 1; + a = realloc(opts->rpc->ext_mnt, nr * sizeof(m)); + if (!a) + goto er_v; + + a[nr - 1] = m; + opts->rpc->ext_mnt = a; + opts->rpc->n_ext_mnt = nr; + return 0; + +er_v: + free(m->val); +er_k: + free(m->key); +er_n: + free(m); +er: + return -ENOMEM; +} + +int criu_add_ext_mount(const char *key, const char *val) +{ + return criu_local_add_ext_mount(global_opts, key, val); +} + +int criu_local_add_cg_root(criu_opts *opts, const char *ctrl, const char *path) +{ + int nr; + CgroupRoot **a, *root; + + root = malloc(sizeof(*root)); + if (!root) + goto er; + 
cgroup_root__init(root); + + if (ctrl) { + root->ctrl = strdup(ctrl); + if (!root->ctrl) + goto er_r; + } + + root->path = strdup(path); + if (!root->path) + goto er_c; + + nr = opts->rpc->n_cg_root + 1; + a = realloc(opts->rpc->cg_root, nr * sizeof(root)); + if (!a) + goto er_p; + + a[nr - 1] = root; + opts->rpc->cg_root = a; + opts->rpc->n_cg_root = nr; + return 0; + +er_p: + free(root->path); +er_c: + if (root->ctrl) + free(root->ctrl); +er_r: + free(root); +er: + return -ENOMEM; +} + +int criu_add_cg_root(const char *ctrl, const char *path) +{ + return criu_local_add_cg_root(global_opts, ctrl, path); +} + +int criu_local_add_veth_pair(criu_opts *opts, const char *in, const char *out) +{ + int nr; + CriuVethPair **a, *p; + + p = malloc(sizeof(*p)); + if (!p) + goto er; + criu_veth_pair__init(p); + + p->if_in = strdup(in); + if (!p->if_in) + goto er_p; + p->if_out = strdup(out); + if (!p->if_out) + goto er_i; + + nr = opts->rpc->n_veths + 1; + a = realloc(opts->rpc->veths, nr * sizeof(p)); + if (!a) + goto er_o; + + a[nr - 1] = p; + opts->rpc->veths = a; + opts->rpc->n_veths = nr; + return 0; + +er_o: + free(p->if_out); +er_i: + free(p->if_in); +er_p: + free(p); +er: + return -ENOMEM; +} + +int criu_add_veth_pair(const char *in, const char *out) +{ + return criu_local_add_veth_pair(global_opts, in, out); +} + +int criu_local_add_enable_fs(criu_opts *opts, const char *fs) +{ + int nr; + char *str = NULL; + char **ptr = NULL; + + str = strdup(fs); + if (!str) + goto err; + + nr = opts->rpc->n_enable_fs + 1; + ptr = realloc(opts->rpc->enable_fs, nr * sizeof(*ptr)); + if (!ptr) + goto err; + + ptr[nr - 1] = str; + + opts->rpc->n_enable_fs = nr; + opts->rpc->enable_fs = ptr; + + return 0; + +err: + if (str) + free(str); + + return -ENOMEM; +} + +int criu_add_enable_fs(const char *fs) +{ + return criu_local_add_enable_fs(global_opts, fs); +} + + +int criu_local_add_skip_mnt(criu_opts *opts, const char *mnt) +{ + int nr; + char *str = NULL; + char **ptr = NULL; + + str 
= strdup(mnt); + if (!str) + goto err; + + nr = opts->rpc->n_skip_mnt + 1; + ptr = realloc(opts->rpc->skip_mnt, nr * sizeof(*ptr)); + if (!ptr) + goto err; + + ptr[nr - 1] = str; + + opts->rpc->n_skip_mnt = nr; + opts->rpc->skip_mnt = ptr; + + return 0; + +err: + if (str) + free(str); + + return -ENOMEM; +} + +int criu_local_add_irmap_path(criu_opts *opts, const char *path) +{ + int nr; + char *my_path; + char **m; + + if (!opts) + return -1; + + my_path = strdup(path); + if (!my_path) + goto err; + + nr = opts->rpc->n_irmap_scan_paths + 1; + m = realloc(opts->rpc->irmap_scan_paths, nr * sizeof(*m)); + if (!m) + goto err; + + m[nr - 1] = my_path; + + opts->rpc->n_irmap_scan_paths = nr; + opts->rpc->irmap_scan_paths = m; + + return 0; + +err: + if (my_path) + free(my_path); + + return -ENOMEM; +} + +int criu_local_add_cg_props(criu_opts *opts, const char *stream) +{ + char *new; + + new = strdup(stream); + if (!new) + return -ENOMEM; + + free(opts->rpc->cgroup_props); + opts->rpc->cgroup_props = new; + return 0; +} + +int criu_local_add_cg_props_file(criu_opts *opts, const char *path) +{ + char *new; + + new = strdup(path); + if (!new) + return -ENOMEM; + + free(opts->rpc->cgroup_props_file); + opts->rpc->cgroup_props_file = new; + return 0; +} + +int criu_local_add_cg_dump_controller(criu_opts *opts, const char *name) +{ + char **new, *ctrl_name; + size_t nr; + + ctrl_name = strdup(name); + if (!ctrl_name) + return -ENOMEM; + + nr = opts->rpc->n_cgroup_dump_controller + 1; + new = realloc(opts->rpc->cgroup_dump_controller, nr * sizeof(char *)); + if (!new) { + free(ctrl_name); + return -ENOMEM; + } + + new[opts->rpc->n_cgroup_dump_controller] = ctrl_name; + + opts->rpc->n_cgroup_dump_controller = nr; + opts->rpc->cgroup_dump_controller = new; + + return 0; +} + +int criu_add_skip_mnt(const char *mnt) +{ + return criu_local_add_skip_mnt(global_opts, mnt); +} + +void criu_local_set_ghost_limit(criu_opts *opts, unsigned int limit) +{ + opts->rpc->has_ghost_limit = 
true; + opts->rpc->ghost_limit = limit; +} + +void criu_set_ghost_limit(unsigned int limit) +{ + criu_local_set_ghost_limit(global_opts, limit); +} + +int criu_add_irmap_path(const char *path) +{ + return criu_local_add_irmap_path(global_opts, path); +} + +int criu_local_add_inherit_fd(criu_opts *opts, int fd, const char *key) +{ + int nr; + InheritFd **a, *f; + + /* Inheriting is only supported with swrk mode */ + if (opts->service_comm != CRIU_COMM_BIN) + return -1; + + f = malloc(sizeof(*f)); + if (!f) + goto er; + inherit_fd__init(f); + + f->fd = fd; + f->key = strdup(key); + if (!f->key) + goto er_f; + + nr = opts->rpc->n_inherit_fd + 1; + a = realloc(opts->rpc->inherit_fd, nr * sizeof(f)); + if (!a) + goto err_k; + + a[nr - 1] = f; + opts->rpc->inherit_fd = a; + opts->rpc->n_inherit_fd = nr; + return 0; +err_k: + free(f->key); +er_f: + free(f); +er: + return -ENOMEM; +} + +int criu_add_inherit_fd(int fd, const char *key) +{ + return criu_local_add_inherit_fd(global_opts, fd, key); +} + +int criu_local_add_external(criu_opts *opts, const char *key) +{ + int nr; + char **a, *e = NULL; + + e = strdup(key); + if (!e) + goto err; + + nr = opts->rpc->n_external + 1; + a = realloc(opts->rpc->external, nr * sizeof(*a)); + if (!a) + goto err; + + a[nr - 1] = e; + opts->rpc->external = a; + opts->rpc->n_external = nr; + return 0; +err: + if (e) + free(e); + return -ENOMEM; +} + +int criu_add_external(const char *key) +{ + return criu_local_add_external(global_opts, key); +} + +int criu_local_set_page_server_address_port(criu_opts *opts, const char *address, int port) +{ + opts->rpc->ps = malloc(sizeof(CriuPageServerInfo)); + if (opts->rpc->ps) { + criu_page_server_info__init(opts->rpc->ps); + + opts->rpc->ps->address = strdup(address); + if (!opts->rpc->ps->address) { + free(opts->rpc->ps); + opts->rpc->ps = NULL; + goto out; + } + + opts->rpc->ps->has_port = true; + opts->rpc->ps->port = port; + } + +out: + return -ENOMEM; +} + +int 
criu_set_page_server_address_port(const char *address, int port) +{ + return criu_local_set_page_server_address_port(global_opts, address, port); +} + +static CriuResp *recv_resp(int socket_fd) +{ + unsigned char *buf = NULL; + int len; + CriuResp *msg = 0; + + len = recv(socket_fd, NULL, 0, MSG_TRUNC | MSG_PEEK); + if (len == -1) { + perror("Can't read request"); + goto err; + } + + buf = malloc(len); + if (!buf) { + errno = ENOMEM; + perror("Can't receive response"); + goto err; + } + + len = recv(socket_fd, buf, len, MSG_TRUNC); + if (len == -1) { + perror("Can't read request"); + goto err; + } + + msg = criu_resp__unpack(NULL, len, buf); + if (!msg) { + perror("Failed unpacking response"); + goto err; + } + + free(buf); + return msg; +err: + free(buf); + saved_errno = errno; + return NULL; +} + +static int send_req(int socket_fd, CriuReq *req) +{ + unsigned char *buf; + int len; + + len = criu_req__get_packed_size(req); + + buf = malloc(len); + if (!buf) { + errno = ENOMEM; + perror("Can't send request"); + goto err; + } + + if (criu_req__pack(req, buf) != len) { + perror("Failed packing request"); + goto err; + } + + if (write(socket_fd, buf, len) == -1) { + perror("Can't send request"); + goto err; + } + + free(buf); + return 0; +err: + free(buf); + saved_errno = errno; + return -1; +} + +static int send_notify_ack(int socket_fd, int ret) +{ + int send_ret; + CriuReq req = CRIU_REQ__INIT; + + req.type = CRIU_REQ_TYPE__NOTIFY; + req.has_notify_success = true; + req.notify_success = (ret == 0); + + send_ret = send_req(socket_fd, &req); + + /* + * If we're failing the notification then report + * back the original error code (and it will be + * propagated back to user). + * + * If the notification was OK, then report the + * result of acking it. + */ + + return ret ? 
: send_ret; +} + +static void swrk_wait(criu_opts *opts) +{ + if (opts->service_comm == CRIU_COMM_BIN) + waitpid(opts->swrk_pid, NULL, 0); +} + +static int swrk_connect(criu_opts *opts, bool d) +{ + int sks[2], pid, ret = -1; + + if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sks)) + goto out; + + pid = fork(); + if (pid < 0) + goto err; + + if (pid == 0) {//子进程 + sigset_t mask; + char fds[11]; + + /* + * Unblock SIGCHLD. + * + * The caller of this function is supposed to have + * this signal blocked. Otherwise it risks to get + * into situation, when this routine is not yet + * returned, but the restore subtree exits and + * emits the SIGCHLD. + * + * In turn, unblocked SIGCHLD is required to make + * criu restoration process work -- it catches + * subtasks restore errors in this handler. + */ + + sigemptyset(&mask); + sigaddset(&mask, SIGCHLD); + sigprocmask(SIG_UNBLOCK, &mask, NULL); + + close(sks[0]); + sprintf(fds, "%d", sks[1]); + + if (d) + if (daemon(0, 1)) { + perror("Can't detach for a self-dump"); + goto child_err; + } + + pid = getpid(); + if (write(sks[1], &pid, sizeof(pid)) != sizeof(pid)) { + perror("Can't write swrk pid"); + goto child_err; + } + + execlp(opts->service_binary, opts->service_binary, "swrk", fds, NULL); + perror("Can't exec criu swrk"); +child_err: + close(sks[1]); + exit(1); + } + + close(sks[1]); + + if (read(sks[0], &pid, sizeof(pid)) != sizeof(pid)) { + perror("Can't read swrk pid"); + goto err; + } + + opts->swrk_pid = pid; + ret = sks[0]; + +out: + return ret; + +err: + close(sks[0]); + close(sks[1]); + goto out; +} + +static int criu_connect(criu_opts *opts, bool d) +{ + int fd, ret; + struct sockaddr_un addr; + socklen_t addr_len; + + if (opts->service_comm == CRIU_COMM_FD) + return opts->service_fd; + else if (opts->service_comm == CRIU_COMM_BIN) + return swrk_connect(opts, d); + + fd = socket(AF_LOCAL, SOCK_SEQPACKET, 0); + if (fd < 0) { + saved_errno = errno; + perror("Can't create socket"); + return -1; + } + + memset(&addr, 
0, sizeof(addr)); + addr.sun_family = AF_LOCAL; + + addr_len = strlen(opts->service_address); + if (addr_len >= sizeof(addr.sun_path)) { + fprintf(stderr, "The service address %s is too long", + opts->service_address); + close(fd); + return -1; + } + memcpy(addr.sun_path, opts->service_address, addr_len); + + addr_len += sizeof(addr.sun_family); + + ret = connect(fd, (struct sockaddr *) &addr, addr_len); + if (ret < 0) { + saved_errno = errno; + perror("Can't connect to socket"); + close(fd); + return -1; + } + + return fd; +} + +static int send_req_and_recv_resp_sk(int fd, criu_opts *opts, CriuReq *req, CriuResp **resp) +{ + int ret = 0; + + if (send_req(fd, req) < 0) { + ret = -ECOMM; + goto exit; + } + +again: + *resp = recv_resp(fd); + if (!*resp) { + perror("Can't receive response"); + ret = -ECOMM; + goto exit; + } + + if ((*resp)->type == CRIU_REQ_TYPE__NOTIFY) { + if (opts->notify) + ret = opts->notify((*resp)->notify->script, (*resp)->notify); + + ret = send_notify_ack(fd, ret); + if (!ret) { + criu_resp__free_unpacked(*resp, NULL); + goto again; + } + else + goto exit; + } + + if ((*resp)->type != req->type) { + if ((*resp)->type == CRIU_REQ_TYPE__EMPTY && + (*resp)->success == false) + ret = -EINVAL; + else { + perror("Unexpected response type"); + ret = -EBADMSG; + } + } + + if ((*resp)->has_cr_errno) + saved_errno = (*resp)->cr_errno; + +exit: + return ret; +} + +static int send_req_and_recv_resp(criu_opts *opts, CriuReq *req, CriuResp **resp) +{ + int fd; + int ret = 0; + bool d = false; + + if (req->type == CRIU_REQ_TYPE__DUMP && req->opts->has_pid == false) + d = true; + + fd = criu_connect(opts, d); + if (fd < 0) { + perror("Can't connect to criu"); + ret = -ECONNREFUSED; + } else { + ret = send_req_and_recv_resp_sk(fd, opts, req, resp); + close(fd); + } + + return ret; +} + +int criu_local_check(criu_opts *opts) +{ + int ret = -1; + CriuReq req = CRIU_REQ__INIT; + CriuResp *resp = NULL; + + saved_errno = 0; + + req.type = CRIU_REQ_TYPE__CHECK; + + 
ret = send_req_and_recv_resp(opts, &req, &resp); + if (ret) + goto exit; + + ret = resp->success ? 0 : -EBADE; + +exit: + if (resp) + criu_resp__free_unpacked(resp, NULL); + + swrk_wait(opts); + + errno = saved_errno; + + return ret; +} + +int criu_check(void) +{ + return criu_local_check(global_opts); +} + +int criu_local_dump(criu_opts *opts) +{ + int ret = -1; + CriuReq req = CRIU_REQ__INIT; + CriuResp *resp = NULL; + + saved_errno = 0; + + req.type = CRIU_REQ_TYPE__DUMP; + req.opts = opts->rpc; + + ret = send_req_and_recv_resp(opts, &req, &resp); + if (ret) + goto exit; + + if (resp->success) { + if (resp->dump->has_restored && resp->dump->restored) + ret = 1; + else + ret = 0; + } else + ret = -EBADE; + +exit: + if (resp) + criu_resp__free_unpacked(resp, NULL); + + swrk_wait(opts); + + errno = saved_errno; + + return ret; +} + +int criu_dump(void) +{ + return criu_local_dump(global_opts); +} + +int criu_local_dump_iters(criu_opts *opts, int (*more)(criu_predump_info pi)) +{ + int ret = -1, fd = -1, uret; + CriuReq req = CRIU_REQ__INIT; + CriuResp *resp = NULL; + + saved_errno = 0; + + req.type = CRIU_REQ_TYPE__PRE_DUMP; + req.opts = opts->rpc; + + ret = -EINVAL; + /* + * Self-dump in iterable manner is tricky and + * not supported for the moment. + * + * Calls w/o iteration callback is, well, not + * allowed either. + */ + if (!opts->rpc->has_pid || !more) + goto exit; + + ret = -ECONNREFUSED; + fd = criu_connect(opts, false); + if (fd < 0) + goto exit; + + while (1) { + ret = send_req_and_recv_resp_sk(fd, opts, &req, &resp); + if (ret) + goto exit; + + if (!resp->success) { + ret = -EBADE; + goto exit; + } + + uret = more(NULL); + if (uret < 0) { + ret = uret; + goto exit; + } + + criu_resp__free_unpacked(resp, NULL); + + if (uret == 0) + break; + } + + req.type = CRIU_REQ_TYPE__DUMP; + ret = send_req_and_recv_resp_sk(fd, opts, &req, &resp); + if (!ret) + ret = (resp->success ? 
0 : -EBADE); +exit: + if (fd >= 0) + close(fd); + if (resp) + criu_resp__free_unpacked(resp, NULL); + + swrk_wait(opts); + + errno = saved_errno; + + return ret; +} + +int criu_dump_iters(int (*more)(criu_predump_info pi)) +{ + return criu_local_dump_iters((void *)global_opts, more); +} + +int criu_local_restore(criu_opts *opts) +{ + int ret = -1; + CriuReq req = CRIU_REQ__INIT; + CriuResp *resp = NULL; + + saved_errno = 0; + + req.type = CRIU_REQ_TYPE__RESTORE; + req.opts = opts->rpc; + + ret = send_req_and_recv_resp(opts, &req, &resp); + if (ret) + goto exit; + + if (resp->success) + ret = resp->restore->pid; + else + ret = -EBADE; + +exit: + if (resp) + criu_resp__free_unpacked(resp, NULL); + + swrk_wait(opts); + + errno = saved_errno; + + return ret; +} + +int criu_restore(void) +{ + return criu_local_restore(global_opts); +} + +int criu_local_restore_child(criu_opts *opts) +{ + int sk, ret = -1; + enum criu_service_comm saved_comm; + const char *saved_comm_data; + bool save_comm; + CriuReq req = CRIU_REQ__INIT; + CriuResp *resp = NULL; + + /* + * restore_child is not possible with criu running as a system + * service, so we need to switch comm method to CRIU_COMM_BIN. + * We're doing so because of the backward compatibility, and we + * should probably consider requiring CRIU_COMM_BIN to be set by + * user at some point. 
+ */ + save_comm = (opts->service_comm != CRIU_COMM_BIN); + if (save_comm) { + /* Save comm */ + saved_comm = opts->service_comm; + saved_comm_data = opts->service_address; + + opts->service_comm = CRIU_COMM_BIN; + opts->service_binary = CR_DEFAULT_SERVICE_BIN; + } + + sk = swrk_connect(opts, false); + if (save_comm) { + /* Restore comm */ + opts->service_comm = saved_comm; + opts->service_address = saved_comm_data; + } + + if (sk < 0) + return -1; + + saved_errno = 0; + + req.type = CRIU_REQ_TYPE__RESTORE; + req.opts = opts->rpc; + + req.opts->has_rst_sibling = true; + req.opts->rst_sibling = true; + + ret = send_req_and_recv_resp_sk(sk, opts, &req, &resp); + + swrk_wait(opts); + + if (!ret) { + ret = resp->success ? resp->restore->pid : -EBADE; + criu_resp__free_unpacked(resp, NULL); + } + + close(sk); + errno = saved_errno; + return ret; +} + +int criu_restore_child(void) +{ + return criu_local_restore_child(global_opts); +} diff --git a/CRIU_code/lib/c/criu.h b/CRIU_code/lib/c/criu.h new file mode 100644 index 0000000..4462ce0 --- /dev/null +++ b/CRIU_code/lib/c/criu.h @@ -0,0 +1,224 @@ +/* + * (C) Copyright 2013 Parallels, Inc. (www.parallels.com). + * + * All rights reserved. This program and the accompanying materials + * are made available under the terms of the GNU Lesser General Public License + * (LGPL) version 2.1 which accompanies this distribution, and is available at + * http://www.gnu.org/licenses/lgpl-2.1.html + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
/*
 * How the library reaches the CRIU service when a request is made.
 */
enum criu_service_comm {
	CRIU_COMM_SK,	/* connect to the UNIX socket at the service address */
	CRIU_COMM_FD,	/* use the service fd supplied by the caller as is */
	CRIU_COMM_BIN	/* fork and exec the service binary in "swrk" mode */
};
criu_set_freeze_cgroup(const char *name); +int criu_set_lsm_profile(const char *name); +void criu_set_timeout(unsigned int timeout); +void criu_set_auto_ext_mnt(bool val); +void criu_set_ext_sharing(bool val); +void criu_set_ext_masters(bool val); +int criu_set_exec_cmd(int argc, char *argv[]); +int criu_add_ext_mount(const char *key, const char *val); +int criu_add_veth_pair(const char *in, const char *out); +int criu_add_cg_root(const char *ctrl, const char *path); +int criu_add_enable_fs(const char *fs); +int criu_add_skip_mnt(const char *mnt); +void criu_set_ghost_limit(unsigned int limit); +int criu_add_irmap_path(const char *path); +int criu_add_inherit_fd(int fd, const char *key); +int criu_add_external(const char *key); +int criu_set_page_server_address_port(const char *address, int port); + +/* + * The criu_notify_arg_t na argument is an opaque + * value that callbacks (cb-s) should pass into + * criu_notify_xxx() calls to fetch arbitrary values + * from notification. If the value is not available + * some non-existing one is reported. + */ + +typedef struct _CriuNotify *criu_notify_arg_t; +void criu_set_notify_cb(int (*cb)(char *action, criu_notify_arg_t na)); + +/* Get pid of root task. 0 if not available */ +int criu_notify_pid(criu_notify_arg_t na); + +/* Here is a table of return values and errno's of functions + * from the list down below. + * + * Return value errno Description + * ---------------------------------------------------------------------------- + * 0 undefined Success. + * + * >0 undefined Success(criu_restore() only). + * + * -BADE rpc err (0 for now) RPC has returned fail. + * + * -ECONNREFUSED errno Unable to connect to CRIU. + * + * -ECOMM errno Unable to send/recv msg to/from CRIU. + * + * -EINVAL undefined CRIU doesn't support this type of request. + * You should probably update CRIU. + * + * -EBADMSG undefined Unexpected response from CRIU. + * You should probably update CRIU. 
+ */ +int criu_check(void); +int criu_dump(void); +int criu_restore(void); +int criu_restore_child(void); + +/* + * Perform dumping but with preliminary iterations. Each + * time an iteration ends the ->more callback is called. + * The callback's return value is + * - positive -- one more iteration starts + * - zero -- final dump is performed and call exits + * - negative -- dump is aborted, the value is returned + * back from criu_dump_iters + * + * The @pi argument is an opaque value that caller may + * use to request pre-dump statistics (not yet implemented). + */ +typedef void *criu_predump_info; +int criu_dump_iters(int (*more)(criu_predump_info pi)); + +/* + * Same as the list above, but lets you have your very own options + * structure and lets you set individual options in it. + */ +typedef struct criu_opts criu_opts; + +int criu_local_init_opts(criu_opts **opts); +void criu_local_free_opts(criu_opts *opts); + +int criu_local_set_service_address(criu_opts *opts, const char *path); +void criu_local_set_service_fd(criu_opts *opts, int fd); + +void criu_local_set_service_fd(criu_opts *opts, int fd); + +void criu_local_set_pid(criu_opts *opts, int pid); +void criu_local_set_images_dir_fd(criu_opts *opts, int fd); /* must be set for dump/restore */ +int criu_local_set_parent_images(criu_opts *opts, const char *path); +int criu_local_set_service_binary(criu_opts *opts, const char *path); +void criu_local_set_work_dir_fd(criu_opts *opts, int fd); +void criu_local_set_leave_running(criu_opts *opts, bool leave_running); +void criu_local_set_ext_unix_sk(criu_opts *opts, bool ext_unix_sk); +int criu_local_add_unix_sk(criu_opts *opts, unsigned int inode); +void criu_local_set_tcp_established(criu_opts *opts, bool tcp_established); +void criu_local_set_tcp_skip_in_flight(criu_opts *opts, bool tcp_skip_in_flight); +void criu_local_set_tcp_close(criu_opts *opts, bool tcp_close); +void criu_local_set_weak_sysctls(criu_opts *opts, bool val); +void 
criu_local_set_evasive_devices(criu_opts *opts, bool evasive_devices); +void criu_local_set_shell_job(criu_opts *opts, bool shell_job); +void criu_local_set_file_locks(criu_opts *opts, bool file_locks); +void criu_local_set_track_mem(criu_opts *opts, bool track_mem); +void criu_local_set_auto_dedup(criu_opts *opts, bool auto_dedup); +void criu_local_set_force_irmap(criu_opts *opts, bool force_irmap); +void criu_local_set_link_remap(criu_opts *opts, bool link_remap); +void criu_local_set_log_level(criu_opts *opts, int log_level); +int criu_local_set_log_file(criu_opts *opts, const char *log_file); +void criu_local_set_cpu_cap(criu_opts *opts, unsigned int cap); +int criu_local_set_root(criu_opts *opts, const char *root); +void criu_local_set_manage_cgroups(criu_opts *opts, bool manage); +void criu_local_set_manage_cgroups_mode(criu_opts *opts, enum criu_cg_mode mode); +int criu_local_set_freeze_cgroup(criu_opts *opts, const char *name); +int criu_local_set_lsm_profile(criu_opts *opts, const char *name); +void criu_local_set_timeout(criu_opts *opts, unsigned int timeout); +void criu_local_set_auto_ext_mnt(criu_opts *opts, bool val); +void criu_local_set_ext_sharing(criu_opts *opts, bool val); +void criu_local_set_ext_masters(criu_opts *opts, bool val); +int criu_local_set_exec_cmd(criu_opts *opts, int argc, char *argv[]); +int criu_local_add_ext_mount(criu_opts *opts, const char *key, const char *val); +int criu_local_add_veth_pair(criu_opts *opts, const char *in, const char *out); +int criu_local_add_cg_root(criu_opts *opts, const char *ctrl, const char *path); +int criu_local_add_enable_fs(criu_opts *opts, const char *fs); +int criu_local_add_skip_mnt(criu_opts *opts, const char *mnt); +void criu_local_set_ghost_limit(criu_opts *opts, unsigned int limit); +int criu_local_add_irmap_path(criu_opts *opts, const char *path); +int criu_local_add_cg_props(criu_opts *opts, const char *stream); +int criu_local_add_cg_props_file(criu_opts *opts, const char *path); +int 
criu_local_add_cg_dump_controller(criu_opts *opts, const char *name); +int criu_local_add_inherit_fd(criu_opts *opts, int fd, const char *key); +int criu_local_add_external(criu_opts *opts, const char *key); +int criu_local_set_page_server_address_port(criu_opts *opts, const char *address, int port); + +void criu_local_set_notify_cb(criu_opts *opts, int (*cb)(char *action, criu_notify_arg_t na)); + +int criu_local_check(criu_opts *opts); +int criu_local_dump(criu_opts *opts); +int criu_local_restore(criu_opts *opts); +int criu_local_restore_child(criu_opts *opts); +int criu_local_dump_iters(criu_opts *opts, int (*more)(criu_predump_info pi)); + +#ifdef __GNUG__ +} +#endif + +#endif /* __CRIU_LIB_H__ */ diff --git a/CRIU_code/lib/c/criu.pc.in b/CRIU_code/lib/c/criu.pc.in new file mode 100644 index 0000000..33986d1 --- /dev/null +++ b/CRIU_code/lib/c/criu.pc.in @@ -0,0 +1,8 @@ +libdir=@libdir@ +includedir=@includedir@ + +Name: CRIU +Description: RPC library for userspace checkpoint and restore +Version: @version@ +Libs: -L${libdir} -lcriu +Cflags: -I${includedir} diff --git a/CRIU_code/lib/py/.gitignore b/CRIU_code/lib/py/.gitignore new file mode 100644 index 0000000..d3090fc --- /dev/null +++ b/CRIU_code/lib/py/.gitignore @@ -0,0 +1,2 @@ +*_pb2.py +*.pyc diff --git a/CRIU_code/lib/py/Makefile b/CRIU_code/lib/py/Makefile new file mode 100644 index 0000000..691b6bd --- /dev/null +++ b/CRIU_code/lib/py/Makefile @@ -0,0 +1,19 @@ +all-y += libpy-images rpc_pb2.py + +$(obj)/images/Makefile: ; +$(obj)/images/%: .FORCE + $(Q) $(MAKE) $(build)=$(obj)/images $@ + +libpy-images: + $(Q) $(MAKE) $(build)=$(obj)/images all +.PHONY: libpy-images + +rpc_pb2.py: + $(Q) protoc -I=images/ --python_out=$(obj) images/$(@:_pb2.py=.proto) + +cleanup-y += $(addprefix $(obj)/,rpc_pb2.py *.pyc) + +clean-lib-py: + $(Q) $(MAKE) $(build)=$(obj)/images clean +.PHONY: clean-lib-py +clean: clean-lib-py diff --git a/CRIU_code/lib/py/__init__.py b/CRIU_code/lib/py/__init__.py new file mode 100644 
def dinf(opts, name):
    """Open the image file `name` inside the directory opts['dir'].

    The file is opened for binary reading, like the input opened by
    inf(): the handle is passed to pycriu.images.load(), and text mode
    would decode the raw bytes on Python 3.
    """
    return open(os.path.join(opts['dir'], name), 'rb')
def ftype_find_in_files(opts, ft, fid):
    """
    Find the entry with id == fid in files.img of the image directory.

    The entries of files.img are loaded on first use and cached in the
    module-level files_img. If the image cannot be loaded the cache
    becomes an empty list, so every later lookup returns None without
    touching the disk again.

    ft is accepted for signature parity with the other ftype_* helpers
    and is not used here.

    Returns the matching entry dict or None.
    """
    global files_img

    if files_img is None:
        try:
            files_img = pycriu.images.load(dinf(opts, "files.img"))['entries']
        except Exception:
            # Best effort: a missing or unreadable files.img just means
            # "nothing to find". Unlike a bare except this lets
            # KeyboardInterrupt and SystemExit through.
            files_img = []

    for f in files_img:
        if f['id'] == fid:
            return f

    return None
class vma_id:
    """
    Hands out small sequential numbers (1, 2, 3, ...) for arbitrary ids.
    Asking twice for the same id gives the same number.
    """
    def __init__(self):
        self.__ids = {}
        self.__last = 1

    def get(self, iid):
        """
        Return the number assigned to iid, assigning the next free one
        if iid has not been seen before.
        """
        known = self.__ids.get(iid)
        if known:
            return known

        assigned = self.__last
        self.__ids[iid] = assigned
        self.__last = assigned + 1
        return assigned
def explore_rss(opts):
    """
    For every task in pstree.img print its pagemap entries and, next to
    each one, the VMAs from mm-<pid>.img that the entry overlaps.

    A VMA that was already printed for the previous pagemap entry is
    shown as '~' instead of being repeated.
    """
    ps_img = pycriu.images.load(dinf(opts, 'pstree.img'))
    for p in ps_img['entries']:
        pid = get_task_id(p, 'pid')
        vmas = pycriu.images.load(dinf(opts, 'mm-%d.img' % pid))['entries'][0]['vmas']
        pms = pycriu.images.load(dinf(opts, 'pagemap-%d.img' % pid))['entries']
        nr_vmas = len(vmas)

        print("%d" % pid)
        vmi = 0
        pvmi = -1
        # The first pagemap entry is skipped (it is the image header,
        # not a page range).
        for pm in pms[1:]:
            pstr = '\t%lx / %-8d' % (pm['vaddr'], pm['nr_pages'])
            # Skip VMAs that end before this page range starts. The bound
            # check keeps us from indexing past the last VMA.
            while vmi < nr_vmas and vmas[vmi]['end'] <= pm['vaddr']:
                vmi += 1

            pme = pm['vaddr'] + (pm['nr_pages'] << 12)
            vstr = ''
            # Collect every VMA that starts before the end of the range.
            # Without the bound check this ran off the end of the list
            # whenever the last VMA had pages in the pagemap.
            while vmi < nr_vmas and vmas[vmi]['start'] < pme:
                vma = vmas[vmi]
                if vmi == pvmi:
                    vstr += ' ~'
                else:
                    vstr += ' %08lx / %-8d' % (vma['start'], (vma['end'] - vma['start'])>>12)
                    # Bits 6 and 7 of status mark file-backed mappings
                    # (the same test explore_mems uses); show the path.
                    if vma['status'] & ((1 << 6) | (1 << 7)):
                        vstr += ' ' + get_file_str(opts, {'type': 'REG', 'id': vma['shmid']})
                    pvmi = vmi
                vstr += '\n\t%23s' % ''
                vmi += 1

            # Step back so the next pagemap entry re-examines the last
            # VMA visited: it may span several entries.
            vmi -= 1

            print('%-24s%s' % (pstr, vstr))
+ # Encode + encode_parser = subparsers.add_parser('encode', + help = 'convert criu image from json type to binary') + encode_parser.add_argument('-i', + '--in', + help = 'criu image in json format to be encoded (stdin by default)') + encode_parser.add_argument('-o', + '--out', + help = 'where to put criu image in binary format (stdout by default)') + encode_parser.set_defaults(func=encode) + + # Info + info_parser = subparsers.add_parser('info', + help = 'show info about image') + info_parser.add_argument("in") + info_parser.set_defaults(func=info) + + # Explore + x_parser = subparsers.add_parser('x', help = 'explore image dir') + x_parser.add_argument('dir') + x_parser.add_argument('what', choices = [ 'ps', 'fds', 'mems', 'rss']) + x_parser.set_defaults(func=explore) + + # Show + show_parser = subparsers.add_parser('show', + help = "convert criu image from binary to human-readable json") + show_parser.add_argument("in") + show_parser.add_argument('--nopl', help = 'do not show entry payload (if exists)', action = 'store_true') + show_parser.set_defaults(func=decode, pretty=True, out=None) + + opts = vars(parser.parse_args()) + + if not opts: + sys.stderr.write(parser.format_usage()) + sys.stderr.write("crit: error: too few arguments\n") + sys.exit(1) + + opts["func"](opts) + +if __name__ == '__main__': + main() diff --git a/CRIU_code/lib/py/criu.py b/CRIU_code/lib/py/criu.py new file mode 100644 index 0000000..de1a214 --- /dev/null +++ b/CRIU_code/lib/py/criu.py @@ -0,0 +1,332 @@ +# Same as libcriu for C. + +import socket +import errno +import fcntl +import os +import struct + +import pycriu.rpc_pb2 as rpc + +class _criu_comm: + """ + Base class for communication classes. + """ + COMM_SK = 0 + COMM_FD = 1 + COMM_BIN = 2 + comm_type = None + comm = None + sk = None + + def connect(self, daemon): + """ + Connect to criu and return socket object. + daemon -- is for whether or not criu should daemonize if executing criu from binary(comm_bin). 
class _criu_comm_sk(_criu_comm):
    """
    Communication class for unix socket.
    """
    def __init__(self, sk_path):
        self.comm_type = self.COMM_SK
        self.comm = sk_path

    def connect(self, daemon):
        """
        Open a SOCK_SEQPACKET unix socket, connect it to the path given
        at construction time and return it. daemon is not used here.
        """
        sk = socket.socket(socket.AF_UNIX, socket.SOCK_SEQPACKET)
        # Remember the socket before connecting so that it is reachable
        # through self.sk even if connect() raises.
        self.sk = sk
        sk.connect(self.comm)
        return sk

    def disconnect(self):
        """
        Close the socket opened by connect().
        """
        self.sk.close()
class CRIUExceptionExternal(CRIUException):
    """
    Exception class for handling and storing criu RPC errors.

    typ      -- type of the request that failed
    resp_typ -- type of the response criu sent back
    errno    -- error code reported in the response
    """

    def __init__(self, req_typ, resp_typ, errno):
        self.typ = req_typ
        self.resp_typ = resp_typ
        self.errno = errno
        self._str = self._gen_error_str()

    def _gen_error_str(self):
        """
        Build the human-readable message returned by __str__().
        """
        s = "%s failed: " % (rpc.criu_req_type.Name(self.typ), )

        if self.typ != self.resp_typ:
            s += "Unexpected response type %d: " % (self.resp_typ, )

        s += "Error(%d): " % (self.errno, )

        # Append exactly one description. "Unknown" is only a fallback;
        # it used to be appended after a specific description as well.
        # Note: `errno` here is the module, self.errno is the code.
        if self.errno == errno.EBADRQC:
            s += "Bad options"
        elif self.typ == rpc.DUMP and self.errno == errno.ESRCH:
            s += "No process with such pid"
        elif self.typ == rpc.RESTORE and self.errno == errno.EEXIST:
            s += "Process with requested pid already exists"
        else:
            s += "Unknown"

        return s
+ """ + self._comm = _criu_comm_sk(sk_name) + + def use_fd(self, fd): + """ + Access criu using provided fd. + """ + self._comm = _criu_comm_fd(fd) + + def use_binary(self, bin_name): + """ + Access criu by execing it using provided path to criu binary. + """ + self._comm = _criu_comm_bin(bin_name) + + def _send_req_and_recv_resp(self, req): + """ + As simple as send request and receive response. + """ + # In case of self-dump we need to spawn criu swrk detached + # from our current process, as criu has a hard time separating + # process resources from its own if criu is located in a same + # process tree it is trying to dump. + daemon = False + if req.type == rpc.DUMP and not req.opts.HasField('pid'): + daemon = True + + try: + if not self.sk: + s = self._comm.connect(daemon) + else: + s = self.sk + + if req.keep_open: + self.sk = s + + s.send(req.SerializeToString()) + + buf = s.recv(len(s.recv(1, socket.MSG_TRUNC | socket.MSG_PEEK))) + + if not req.keep_open: + self._comm.disconnect() + + resp = rpc.criu_resp() + resp.ParseFromString(buf) + except Exception as e: + raise CRIUExceptionInternal(req.type, str(e)) + + return resp + + def check(self): + """ + Checks whether the kernel support is up-to-date. + """ + req = rpc.criu_req() + req.type = rpc.CHECK + + resp = self._send_req_and_recv_resp(req) + + if not resp.success: + raise CRIUExceptionExternal(req.type, resp.type, resp.cr_errno) + + def dump(self): + """ + Checkpoint a process/tree identified by opts.pid. + """ + req = rpc.criu_req() + req.type = rpc.DUMP + req.opts.MergeFrom(self.opts) + + resp = self._send_req_and_recv_resp(req) + + if not resp.success: + raise CRIUExceptionExternal(req.type, resp.type, resp.cr_errno) + + return resp.dump + + def pre_dump(self): + """ + Checkpoint a process/tree identified by opts.pid. 
+ """ + req = rpc.criu_req() + req.type = rpc.PRE_DUMP + req.opts.MergeFrom(self.opts) + + resp = self._send_req_and_recv_resp(req) + + if not resp.success: + raise CRIUExceptionExternal(req.type, resp.type, resp.cr_errno) + + return resp.dump + + def restore(self): + """ + Restore a process/tree. + """ + req = rpc.criu_req() + req.type = rpc.RESTORE + req.opts.MergeFrom(self.opts) + + resp = self._send_req_and_recv_resp(req) + + if not resp.success: + raise CRIUExceptionExternal(req.type, resp.type, resp.cr_errno) + + return resp.restore + + def page_server_chld(self): + req = rpc.criu_req() + req.type = rpc.PAGE_SERVER_CHLD + req.opts.MergeFrom(self.opts) + req.keep_open = True + + resp = self._send_req_and_recv_resp(req) + + if not resp.success: + raise CRIUExceptionExternal(req.type, resp.type, resp.cr_errno) + + return resp.ps + + def wait_pid(self, pid): + req = rpc.criu_req() + req.type = rpc.WAIT_PID + req.pid = pid + + resp = self._send_req_and_recv_resp(req) + + if not resp.success: + raise CRIUExceptionExternal(req.type, resp.type, resp.cr_errno) + + return resp.status diff --git a/CRIU_code/lib/py/images/.gitignore b/CRIU_code/lib/py/images/.gitignore new file mode 100644 index 0000000..234bfe9 --- /dev/null +++ b/CRIU_code/lib/py/images/.gitignore @@ -0,0 +1,4 @@ +*.pyc +*_pb2.py +magic.py +pb.py diff --git a/CRIU_code/lib/py/images/Makefile b/CRIU_code/lib/py/images/Makefile new file mode 100644 index 0000000..f7df20f --- /dev/null +++ b/CRIU_code/lib/py/images/Makefile @@ -0,0 +1,25 @@ +all-y += images magic.py pb.py + +proto := $(filter-out images/rpc.proto, $(sort $(wildcard images/*.proto))) +proto-py-modules := $(foreach m,$(proto),$(subst -,_,$(notdir $(m:.proto=_pb2)))) + +# We don't need rpc_pb2.py here, as it is not related to the images. +# Unfortunately, we can't drop ugly _pb2 suffixes here, because +# some _pb2 files depend on others _pb2 files. 
+images: + $(Q) protoc -I=images/ -I=/usr/include/ --python_out=$(obj) $(proto) +.PHONY: images + +magic.py: scripts/magic-gen.py criu/include/magic.h + $(call msg-gen, $@) + $(Q) $(PYTHON) $^ $(obj)/$@ + +pb.py: images + $(Q) echo "# Autogenerated. Do not edit!" > $(obj)/$@ + $(Q) for m in $(filter-out opts_pb2, $(proto-py-modules)); do \ + echo "from .$$m import *" >> $(obj)/$@ ;\ + done +.PHONY: pb.py + +cleanup-y += $(addprefix $(obj)/,magic.py pb.py *.pyc) +cleanup-y += $(call cleanify,$(addprefix $(obj)/,$(addsuffix .py,$(proto-py-modules)))) diff --git a/CRIU_code/lib/py/images/__init__.py b/CRIU_code/lib/py/images/__init__.py new file mode 100644 index 0000000..ea87e4e --- /dev/null +++ b/CRIU_code/lib/py/images/__init__.py @@ -0,0 +1,5 @@ +import sys, os +sys.path.append(os.path.dirname(os.path.realpath(__file__))) +from .magic import * +from .images import * +from .pb import * diff --git a/CRIU_code/lib/py/images/images.py b/CRIU_code/lib/py/images/images.py new file mode 100644 index 0000000..7a9b9da --- /dev/null +++ b/CRIU_code/lib/py/images/images.py @@ -0,0 +1,596 @@ +# This file contains methods to deal with criu images. +# +# According to http://criu.org/Images, criu images can be described +# with such IOW: +# +# IMAGE_FILE ::= MAGIC { ENTRY } +# ENTRY ::= SIZE PAYLOAD [ EXTRA ] +# PAYLOAD ::= "message encoded in ProtocolBuffer format" +# EXTRA ::= "arbitrary blob, depends on the PAYLOAD contents" +# +# MAGIC ::= "32 bit integer" +# SIZE ::= "32 bit integer, equals the PAYLOAD length" +# +# Images v1.1 NOTE: MAGIC now consist of 2 32 bit integers, first one is +# MAGIC_COMMON or MAGIC_SERVICE and the second one is same as MAGIC +# in images V1.0. We don't keep "first" magic in json images. +# +# In order to convert images to human-readable format, we use dict(json). +# Using json not only allows us to easily read\write images, but also +# to use a great variety of tools out there to manipulate them. 
class entry_handler:
    """
    Generic class to handle loading/dumping criu images
    entries from/to bin format to/from dict(json).
    """
    def __init__(self, payload, extra_handler=None):
        """
        Sets payload class and extra handler class.
        """
        self.payload = payload
        self.extra_handler = extra_handler

    @staticmethod
    def _human_readable(num):
        """
        Format a byte count with 1024-based units, e.g. 1536 -> '1.5KB'.
        """
        for unit in ['','K','M','G','T','P','E','Z']:
            if num < 1024.0:
                if int(num) == num:
                    return "%d%sB" % (num, unit)
                else:
                    return "%.1f%sB" % (num, unit)
            num /= 1024.0
        return "%.1fYB" % num

    def load(self, f, pretty = False, no_payload = False):
        """
        Convert criu image entries from binary format to dict(json).
        Takes a file-like object and returns a list with entries in
        dict(json) format.

        With no_payload set, the extra data of each entry is skipped
        and replaced by a '... <size>' placeholder.
        """
        entries = []

        while True:
            # Read payload: a size prefix followed by the pb message.
            # An empty read means we have consumed the whole image.
            pbuff = self.payload()
            buf = f.read(4)
            if buf == b'':
                break
            size, = struct.unpack('i', buf)
            pbuff.ParseFromString(f.read(size))
            entry = pb2dict.pb2dict(pbuff, pretty)

            # Read extra
            if self.extra_handler:
                if no_payload:
                    pl_size = self.extra_handler.skip(f, pbuff)
                    entry['extra'] = '... <%s>' % self._human_readable(pl_size)
                else:
                    entry['extra'] = self.extra_handler.load(f, pbuff)

            entries.append(entry)

        return entries

    def loads(self, s, pretty = False):
        """
        Same as load(), but takes a string as an argument.
        """
        f = io.BytesIO(s)
        return self.load(f, pretty)

    def dump(self, entries, f):
        """
        Convert criu image entries from dict(json) format to binary.
        Takes a list of entries and a file-like object to write entries
        in binary format to.

        Note: the 'extra' key is popped from each entry dict in place.
        """
        for entry in entries:
            extra = entry.pop('extra', None)

            # Write payload
            pbuff = self.payload()
            pb2dict.dict2pb(entry, pbuff)
            pb_str = pbuff.SerializeToString()
            size = len(pb_str)
            f.write(struct.pack('i', size))
            f.write(pb_str)

            # Write extra
            if self.extra_handler and extra:
                self.extra_handler.dump(extra, f, pbuff)

    def dumps(self, entries):
        """
        Same as dump(), but doesn't take file-like object and just
        returns a string.
        """
        f = io.BytesIO()
        self.dump(entries, f)
        # getvalue() returns everything written; read() would start at
        # the current (end) position and always return nothing.
        return f.getvalue()

    def count(self, f):
        """
        Counts the number of top-level object in the image file
        """
        entries = 0

        while True:
            buf = f.read(4)
            # f is a binary stream, so end of file is an empty bytes object
            if buf == b'':
                break
            size, = struct.unpack('i', buf)
            # Jump over the payload without parsing it
            f.seek(size, os.SEEK_CUR)
            entries += 1

        return entries
base64.encodebytes(f.read(gc.len)) + entries.append(entry) + else: + if no_payload: + f.seek(0, os.SEEK_END) + else: + g_entry['extra'] = base64.encodebytes(f.read()) + entries.append(g_entry) + + return entries + + def loads(self, s, pretty = False): + f = io.BytesIO(s) + return self.load(f, pretty) + + def dump(self, entries, f): + pbuff = pb.ghost_file_entry() + item = entries.pop(0) + pb2dict.dict2pb(item, pbuff) + pb_str = pbuff.SerializeToString() + size = len(pb_str) + f.write(struct.pack('i', size)) + f.write(pb_str) + + if pbuff.chunks: + for item in entries: + pbuff = pb.ghost_chunk_entry() + pb2dict.dict2pb(item, pbuff) + pb_str = pbuff.SerializeToString() + size = len(pb_str) + f.write(struct.pack('i', size)) + f.write(pb_str) + f.write(base64.decodebytes(item['extra'])) + else: + f.write(base64.decodebytes(item['extra'])) + + def dumps(self, entries): + f = io.BytesIO('') + self.dump(entries, f) + return f.read() + + +# In following extra handlers we use base64 encoding +# to store binary data. Even though, the nature +# of base64 is that it increases the total size, +# it doesn't really matter, because our images +# do not store big amounts of binary data. They +# are negligible comparing to pages size. 
class pipes_data_extra_handler:
    """
    Extra handler for entries whose payload carries a `bytes` field
    giving the length of the raw data that follows the entry.
    The raw data is represented as base64 in the dict(json) form.
    """
    def load(self, f, pload):
        """
        Read pload.bytes bytes from f and return them base64-encoded.
        """
        return base64.encodebytes(f.read(pload.bytes))

    def dump(self, extra, f, pload):
        """
        Decode base64 `extra` and write the raw bytes to f.
        """
        f.write(base64.decodebytes(extra))

    def skip(self, f, pload):
        """
        Seek f past the raw data and return the number of bytes skipped.
        """
        nbytes = pload.bytes
        f.seek(nbytes, os.SEEK_CUR)
        return nbytes
class ipc_msg_queue_handler:
    """
    Extra handler for IPC message queue entries.

    The extra data is a sequence of up to `qnum` messages. Each one is
    a size-prefixed ipc_msg followed by msize bytes of message data,
    padded up to a multiple of sizeof_u64.

    In dict(json) form the extra is a flat list that alternates a
    message dict and its base64-encoded data.
    """
    def load(self, f, pbuff):
        entry = pb2dict.pb2dict(pbuff)
        messages = []
        for x in range (0, entry['qnum']):
            buf = f.read(4)
            # f is a binary stream: end of file reads as empty bytes
            if buf == b'':
                break
            size, = struct.unpack('i', buf)
            msg = pb.ipc_msg()
            msg.ParseFromString(f.read(size))
            rounded = round_up(msg.msize, sizeof_u64)
            data = f.read(msg.msize)
            # Step over the alignment padding after the data
            f.seek(rounded - msg.msize, os.SEEK_CUR)
            messages.append(pb2dict.pb2dict(msg))
            messages.append(base64.encodebytes(data))
        return messages

    def dump(self, extra, f, pbuff):
        # extra holds (message dict, base64 data) pairs laid out flat
        for i in range (0, len(extra), 2):
            msg = pb.ipc_msg()
            pb2dict.dict2pb(extra[i], msg)
            msg_str = msg.SerializeToString()
            size = len(msg_str)
            f.write(struct.pack('i', size))
            f.write(msg_str)
            rounded = round_up(msg.msize, sizeof_u64)
            data = base64.decodebytes(extra[i + 1])
            f.write(data[:msg.msize])
            # Padding must be bytes: f is a binary stream
            f.write(b'\0' * (rounded - msg.msize))

    def skip(self, f, pbuff):
        """
        Seek past all messages; returns the summed size of the message
        headers and data (padding not counted).
        """
        entry = pb2dict.pb2dict(pbuff)
        pl_len = 0
        for x in range (0, entry['qnum']):
            buf = f.read(4)
            if buf == b'':
                break
            size, = struct.unpack('i', buf)
            msg = pb.ipc_msg()
            msg.ParseFromString(f.read(size))
            rounded = round_up(msg.msize, sizeof_u64)
            f.seek(rounded, os.SEEK_CUR)
            pl_len += size + msg.msize

        return pl_len
entry_handler(pb.inventory_entry), + 'CORE' : entry_handler(pb.core_entry), + 'IDS' : entry_handler(pb.task_kobj_ids_entry), + 'CREDS' : entry_handler(pb.creds_entry), + 'UTSNS' : entry_handler(pb.utsns_entry), + 'IPC_VAR' : entry_handler(pb.ipc_var_entry), + 'FS' : entry_handler(pb.fs_entry), + 'GHOST_FILE' : ghost_file_handler(), + 'MM' : entry_handler(pb.mm_entry), + 'CGROUP' : entry_handler(pb.cgroup_entry), + 'TCP_STREAM' : entry_handler(pb.tcp_stream_entry, tcp_stream_extra_handler()), + 'STATS' : entry_handler(pb.stats_entry), + 'PAGEMAP' : pagemap_handler(), # Special one + 'PSTREE' : entry_handler(pb.pstree_entry), + 'REG_FILES' : entry_handler(pb.reg_file_entry), + 'NS_FILES' : entry_handler(pb.ns_file_entry), + 'EVENTFD_FILE' : entry_handler(pb.eventfd_file_entry), + 'EVENTPOLL_FILE' : entry_handler(pb.eventpoll_file_entry), + 'EVENTPOLL_TFD' : entry_handler(pb.eventpoll_tfd_entry), + 'SIGNALFD' : entry_handler(pb.signalfd_entry), + 'TIMERFD' : entry_handler(pb.timerfd_entry), + 'INOTIFY_FILE' : entry_handler(pb.inotify_file_entry), + 'INOTIFY_WD' : entry_handler(pb.inotify_wd_entry), + 'FANOTIFY_FILE' : entry_handler(pb.fanotify_file_entry), + 'FANOTIFY_MARK' : entry_handler(pb.fanotify_mark_entry), + 'VMAS' : entry_handler(pb.vma_entry), + 'PIPES' : entry_handler(pb.pipe_entry), + 'FIFO' : entry_handler(pb.fifo_entry), + 'SIGACT' : entry_handler(pb.sa_entry), + 'NETLINK_SK' : entry_handler(pb.netlink_sk_entry), + 'REMAP_FPATH' : entry_handler(pb.remap_file_path_entry), + 'MNTS' : entry_handler(pb.mnt_entry), + 'TTY_FILES' : entry_handler(pb.tty_file_entry), + 'TTY_INFO' : entry_handler(pb.tty_info_entry), + 'TTY_DATA' : entry_handler(pb.tty_data_entry), + 'RLIMIT' : entry_handler(pb.rlimit_entry), + 'TUNFILE' : entry_handler(pb.tunfile_entry), + 'EXT_FILES' : entry_handler(pb.ext_file_entry), + 'IRMAP_CACHE' : entry_handler(pb.irmap_cache_entry), + 'FILE_LOCKS' : entry_handler(pb.file_lock_entry), + 'FDINFO' : entry_handler(pb.fdinfo_entry), + 'UNIXSK' 
def __rhandler(f):
    """
    Read the magic from the start of image f and pick its handler.

    Returns a (magic name, handler) tuple and leaves f positioned just
    after the magic(s).

    Raises MagicException when the magic value is not known, and a
    plain Exception when the magic is known but has no handler.
    """
    # Images v1.1 NOTE: First read "first" magic.
    img_magic, = struct.unpack('i', f.read(4))
    if img_magic in (magic.by_name['IMG_COMMON'], magic.by_name['IMG_SERVICE']):
        img_magic, = struct.unpack('i', f.read(4))

    # Only a failed lookup means "unknown magic"; anything else
    # (including KeyboardInterrupt) must not be swallowed here.
    try:
        m = magic.by_val[img_magic]
    except KeyError:
        raise MagicException(img_magic)

    try:
        handler = handlers[m]
    except KeyError:
        raise Exception("No handler found for image with magic " + m)

    return m, handler
def dump(img, f):
    """
    Convert criu image from dict(json) format to binary.
    Takes an image in dict(json) format and file-like
    object to write to.

    Raises KeyError when img['magic'] is not a known magic name and
    Exception when there is no handler for it. In both cases nothing
    has been written to f yet.
    """
    m = img['magic']
    magic_val = magic.by_name[img['magic']]

    # Resolve the handler before writing anything, so that a failure
    # does not leave a truncated image (magic only) behind.
    try:
        handler = handlers[m]
    except KeyError:
        raise Exception("No handler found for image with such magic")

    # Images v1.1 NOTE: use "second" magic to identify what "first"
    # should be written.
    if m != 'INVENTORY':
        if m in ('STATS', 'IRMAP_CACHE'):
            f.write(struct.pack('i', magic.by_name['IMG_SERVICE']))
        else:
            f.write(struct.pack('i', magic.by_name['IMG_COMMON']))

    f.write(struct.pack('i', magic_val))

    handler.dump(img['entries'], f)
+# Inspired by: +# protobuf-to-dict - https://github.com/benhodgson/protobuf-to-dict +# protobuf-json - https://code.google.com/p/protobuf-json/ +# protobuf source - https://code.google.com/p/protobuf/ +# Both protobuf-to-dict/json do not fit here because of several reasons, +# here are some of them: +# - both have a common bug in treating optional field with empty +# repeated inside. +# - protobuf-to-json is not available in pip or in any other python +# repo, so it is hard to distribute and we can't rely on it. +# - both do not treat enums in a way we would like to. They convert +# protobuf enum to int, but we need a string here, because it is +# much more informative. BTW, protobuf text_format converts pb +# enums to string value too. (i.e. "march : x86_64" is better than +# "march : 1"). + + +_basic_cast = { + FD.TYPE_FIXED64 : int, + FD.TYPE_FIXED32 : int, + FD.TYPE_SFIXED64 : int, + FD.TYPE_SFIXED32 : int, + + FD.TYPE_INT64 : int, + FD.TYPE_UINT64 : int, + FD.TYPE_SINT64 : int, + + FD.TYPE_INT32 : int, + FD.TYPE_UINT32 : int, + FD.TYPE_SINT32 : int, + + FD.TYPE_BOOL : bool, + + FD.TYPE_STRING : str +} + +def _marked_as_hex(field): + return field.GetOptions().Extensions[opts_pb2.criu].hex + +def _marked_as_ip(field): + return field.GetOptions().Extensions[opts_pb2.criu].ipadd + +def _marked_as_flags(field): + return field.GetOptions().Extensions[opts_pb2.criu].flags + +def _marked_as_dev(field): + return field.GetOptions().Extensions[opts_pb2.criu].dev + +def _marked_as_odev(field): + return field.GetOptions().Extensions[opts_pb2.criu].odev + +def _marked_as_dict(field): + return field.GetOptions().Extensions[opts_pb2.criu].dict + +def _custom_conv(field): + return field.GetOptions().Extensions[opts_pb2.criu].conv + +mmap_prot_map = [ + ('PROT_READ', 0x1), + ('PROT_WRITE', 0x2), + ('PROT_EXEC', 0x4), +] + +mmap_flags_map = [ + ('MAP_SHARED', 0x1), + ('MAP_PRIVATE', 0x2), + ('MAP_ANON', 0x20), + ('MAP_GROWSDOWN', 0x0100), +] + +mmap_status_map = [ +
('VMA_AREA_NONE', 0 << 0), + ('VMA_AREA_REGULAR', 1 << 0), + ('VMA_AREA_STACK', 1 << 1), + ('VMA_AREA_VSYSCALL', 1 << 2), + ('VMA_AREA_VDSO', 1 << 3), + ('VMA_AREA_HEAP', 1 << 5), + + ('VMA_FILE_PRIVATE', 1 << 6), + ('VMA_FILE_SHARED', 1 << 7), + ('VMA_ANON_SHARED', 1 << 8), + ('VMA_ANON_PRIVATE', 1 << 9), + + ('VMA_AREA_SYSVIPC', 1 << 10), + ('VMA_AREA_SOCKET', 1 << 11), + ('VMA_AREA_VVAR', 1 << 12), + ('VMA_AREA_AIORING', 1 << 13), + + ('VMA_UNSUPP', 1 << 31), +] + +rfile_flags_map = [ + ('O_WRONLY', 0o1), + ('O_RDWR', 0o2), + ('O_APPEND', 0o2000), + ('O_DIRECT', 0o40000), + ('O_LARGEFILE', 0o100000), +] + +pmap_flags_map = [ + ('PE_PARENT', 1 << 0), + ('PE_LAZY', 1 << 1), + ('PE_PRESENT', 1 << 2), +] + +flags_maps = { + 'mmap.prot' : mmap_prot_map, + 'mmap.flags' : mmap_flags_map, + 'mmap.status' : mmap_status_map, + 'rfile.flags' : rfile_flags_map, + 'pmap.flags' : pmap_flags_map, +} + +gen_maps = { + 'task_state' : { 1: 'Alive', 3: 'Zombie', 6: 'Stopped' }, +} + +sk_maps = { + 'family' : { 1: 'UNIX', + 2: 'INET', + 10: 'INET6', + 16: 'NETLINK', + 17: 'PACKET' }, + 'type' : { 1: 'STREAM', + 2: 'DGRAM', + 3: 'RAW', + 5: 'SEQPACKET', + 10: 'PACKET' }, + 'state' : { 1: 'ESTABLISHED', + 2: 'SYN_SENT', + 3: 'SYN_RECV', + 4: 'FIN_WAIT1', + 5: 'FIN_WAIT2', + 6: 'TIME_WAIT', + 7: 'CLOSE', + 8: 'CLOSE_WAIT', + 9: 'LAST_ACK', + 10: 'LISTEN' }, + 'proto' : { 0: 'IP', + 6: 'TCP', + 17: 'UDP', + 136: 'UDPLITE' }, +} + +gen_rmaps = { k: {v2:k2 for k2,v2 in list(v.items())} for k,v in list(gen_maps.items()) } +sk_rmaps = { k: {v2:k2 for k2,v2 in list(v.items())} for k,v in list(sk_maps.items()) } + +dict_maps = { + 'gen' : ( gen_maps, gen_rmaps ), + 'sk' : ( sk_maps, sk_rmaps ), +} + +def map_flags(value, flags_map): + bs = [x[0] for x in [x for x in flags_map if value & x[1]]] + value &= ~sum([x[1] for x in flags_map]) + if value: + bs.append("0x%x" % value) + return " | ".join(bs) + +def unmap_flags(value, flags_map): + if value == '': + return 0 + + bd = dict(flags_map) + 
return sum([int(str(bd.get(x, x)), 0) for x in [x.strip() for x in value.split('|')]]) + +kern_minorbits = 20 # This is how kernel encodes dev_t in new format + +def decode_dev(field, value): + if _marked_as_odev(field): + return "%d:%d" % (os.major(value), os.minor(value)) + else: + return "%d:%d" % (value >> kern_minorbits, value & ((1 << kern_minorbits) - 1)) + +def encode_dev(field, value): + dev = [int(x) for x in value.split(':')] + if _marked_as_odev(field): + return os.makedev(dev[0], dev[1]) + else: + return dev[0] << kern_minorbits | dev[1] + +def encode_base64(value): + return base64.encodebytes(value) +def decode_base64(value): + return base64.decodebytes(value) + +def encode_unix(value): + return quopri.encodestring(value) +def decode_unix(value): + return quopri.decodestring(value) + +encode = { 'unix_name': encode_unix } +decode = { 'unix_name': decode_unix } + +def get_bytes_enc(field): + c = _custom_conv(field) + if c: + return encode[c] + else: + return encode_base64 + +def get_bytes_dec(field): + c = _custom_conv(field) + if c: + return decode[c] + else: + return decode_base64 + +def is_string(value): + # Python 3 compatibility + if "basestring" in __builtins__: + string_types = basestring + else: + string_types = (str, bytes) + return isinstance(value, string_types) + +def _pb2dict_cast(field, value, pretty = False, is_hex = False): + if not is_hex: + is_hex = _marked_as_hex(field) + + if field.type == FD.TYPE_MESSAGE: + return pb2dict(value, pretty, is_hex) + elif field.type == FD.TYPE_BYTES: + return get_bytes_enc(field)(value) + elif field.type == FD.TYPE_ENUM: + return field.enum_type.values_by_number.get(value, None).name + elif field.type in _basic_cast: + cast = _basic_cast[field.type] + if pretty and (cast == int): + if is_hex: + # Fields that have (criu).hex = true option set + # should be stored in hex string format. 
+ return "0x%x" % value + + if _marked_as_dev(field): + return decode_dev(field, value) + + flags = _marked_as_flags(field) + if flags: + try: + flags_map = flags_maps[flags] + except: + return "0x%x" % value # flags are better seen as hex anyway + else: + return map_flags(value, flags_map) + + dct = _marked_as_dict(field) + if dct: + return dict_maps[dct][0][field.name].get(value, cast(value)) + + return cast(value) + else: + raise Exception("Field(%s) has unsupported type %d" % (field.name, field.type)) + +def pb2dict(pb, pretty = False, is_hex = False): + """ + Convert protobuf msg to dictionary. + Takes a protobuf message and returns a dict. + """ + d = collections.OrderedDict() if pretty else {} + for field, value in pb.ListFields(): + if field.label == FD.LABEL_REPEATED: + d_val = [] + if pretty and _marked_as_ip(field): + if len(value) == 1: + v = socket.ntohl(value[0]) + addr = IPv4Address(v) + else: + v = 0 + (socket.ntohl(value[0]) << (32 * 3)) + \ + (socket.ntohl(value[1]) << (32 * 2)) + \ + (socket.ntohl(value[2]) << (32 * 1)) + \ + (socket.ntohl(value[3])) + addr = IPv6Address(v) + + d_val.append(addr.compressed) + else: + for v in value: + d_val.append(_pb2dict_cast(field, v, pretty, is_hex)) + else: + d_val = _pb2dict_cast(field, value, pretty, is_hex) + + d[field.name] = d_val + return d + +def _dict2pb_cast(field, value): + # Not considering TYPE_MESSAGE here, as repeated + # and non-repeated messages need special treatment + # in this case, and are handled separately.
+ if field.type == FD.TYPE_BYTES: + return get_bytes_dec(field)(value) + elif field.type == FD.TYPE_ENUM: + return field.enum_type.values_by_name.get(value, None).number + elif field.type in _basic_cast: + cast = _basic_cast[field.type] + if (cast == int) and is_string(value): + if _marked_as_dev(field): + return encode_dev(field, value) + + flags = _marked_as_flags(field) + if flags: + try: + flags_map = flags_maps[flags] + except: + pass # Try to use plain string cast + else: + return unmap_flags(value, flags_map) + + dct = _marked_as_dict(field) + if dct: + ret = dict_maps[dct][1][field.name].get(value, None) + if ret == None: + ret = cast(value, 0) + return ret + + # Some int or long fields might be stored as hex + # strings. See _pb2dict_cast. + return cast(value, 0) + else: + return cast(value) + else: + raise Exception("Field(%s) has unsupported type %d" % (field.name, field.type)) + +def dict2pb(d, pb): + """ + Convert dictionary to protobuf msg. + Takes dict and protobuf message to be merged into. 
+ """ + for field in pb.DESCRIPTOR.fields: + if field.name not in d: + continue + value = d[field.name] + if field.label == FD.LABEL_REPEATED: + pb_val = getattr(pb, field.name, None) + if is_string(value[0]) and _marked_as_ip(field): + val = ip_address(value[0]) + if val.version == 4: + pb_val.append(socket.htonl(int(val))) + elif val.version == 6: + ival = int(val) + pb_val.append(socket.htonl((ival >> (32 * 3)) & 0xFFFFFFFF)) + pb_val.append(socket.htonl((ival >> (32 * 2)) & 0xFFFFFFFF)) + pb_val.append(socket.htonl((ival >> (32 * 1)) & 0xFFFFFFFF)) + pb_val.append(socket.htonl((ival >> (32 * 0)) & 0xFFFFFFFF)) + else: + raise Exception("Unknown IP address version %d" % val.version) + continue + + for v in value: + if field.type == FD.TYPE_MESSAGE: + dict2pb(v, pb_val.add()) + else: + pb_val.append(_dict2pb_cast(field, v)) + else: + if field.type == FD.TYPE_MESSAGE: + # SetInParent method acts just like has_* = true in C, + # and helps to properly treat cases when we have optional + # field with empty repeated inside. 
+ getattr(pb, field.name).SetInParent() + + dict2pb(value, getattr(pb, field.name, None)) + else: + setattr(pb, field.name, _dict2pb_cast(field, value)) + return pb diff --git a/CRIU_code/scripts/build/Dockerfile.aarch64.hdr b/CRIU_code/scripts/build/Dockerfile.aarch64.hdr new file mode 100644 index 0000000..c90c980 --- /dev/null +++ b/CRIU_code/scripts/build/Dockerfile.aarch64.hdr @@ -0,0 +1,3 @@ +FROM arm64v8/ubuntu:xenial + +COPY scripts/build/qemu-user-static/usr/bin/qemu-aarch64-static /usr/bin/qemu-aarch64-static diff --git a/CRIU_code/scripts/build/Dockerfile.aarch64.tmpl b/CRIU_code/scripts/build/Dockerfile.aarch64.tmpl new file mode 100644 index 0000000..cb80479 --- /dev/null +++ b/CRIU_code/scripts/build/Dockerfile.aarch64.tmpl @@ -0,0 +1 @@ +Dockerfile.tmpl \ No newline at end of file diff --git a/CRIU_code/scripts/build/Dockerfile.alpine b/CRIU_code/scripts/build/Dockerfile.alpine new file mode 100644 index 0000000..c71a390 --- /dev/null +++ b/CRIU_code/scripts/build/Dockerfile.alpine @@ -0,0 +1,46 @@ +FROM alpine +ARG CC=gcc +ARG ENV1=FOOBAR + +RUN apk update && apk add \ + $CC \ + bash \ + build-base \ + ccache \ + coreutils \ + git \ + gnutls-dev \ + libaio-dev \ + libcap-dev \ + libnet-dev \ + libnl3-dev \ + pkgconfig \ + protobuf-c-dev \ + protobuf-dev \ + python \ + sudo + +COPY . 
/criu +WORKDIR /criu +ENV CC="ccache $CC" CCACHE_DIR=/tmp/.ccache CCACHE_NOCOMPRESS=1 $ENV1=yes +RUN mv .ccache /tmp && make mrproper && ccache -sz && \ + date && make -j $(nproc) CC="$CC" && date && ccache -s + +RUN apk add \ + py-yaml \ + py-pip \ + py2-future \ + ip6tables \ + iptables \ + iproute2 \ + tar \ + bash \ + go \ + e2fsprogs \ + asciidoctor + +# The rpc test cases are running as user #1000, let's add the user +RUN adduser -u 1000 -D test + +RUN pip install protobuf ipaddress junit_xml +RUN make -C test/zdtm diff --git a/CRIU_code/scripts/build/Dockerfile.armv7hf.hdr b/CRIU_code/scripts/build/Dockerfile.armv7hf.hdr new file mode 100644 index 0000000..d453d6d --- /dev/null +++ b/CRIU_code/scripts/build/Dockerfile.armv7hf.hdr @@ -0,0 +1,3 @@ +FROM arm32v7/ubuntu:xenial + +COPY scripts/build/qemu-user-static/usr/bin/qemu-arm-static /usr/bin/qemu-arm-static diff --git a/CRIU_code/scripts/build/Dockerfile.armv7hf.tmpl b/CRIU_code/scripts/build/Dockerfile.armv7hf.tmpl new file mode 100644 index 0000000..cb80479 --- /dev/null +++ b/CRIU_code/scripts/build/Dockerfile.armv7hf.tmpl @@ -0,0 +1 @@ +Dockerfile.tmpl \ No newline at end of file diff --git a/CRIU_code/scripts/build/Dockerfile.centos b/CRIU_code/scripts/build/Dockerfile.centos new file mode 100644 index 0000000..2ce40b1 --- /dev/null +++ b/CRIU_code/scripts/build/Dockerfile.centos @@ -0,0 +1,48 @@ +FROM centos:7 + +ARG CC=gcc +ARG ENV1=FOOBAR + +RUN yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm +RUN yum install -y \ + ccache \ + findutils \ + gcc \ + git \ + gnutls-devel \ + iproute \ + iptables \ + libaio-devel \ + libasan \ + libcap-devel \ + libnet-devel \ + libnl3-devel \ + make \ + procps-ng \ + protobuf-c-devel \ + protobuf-devel \ + protobuf-python \ + python \ + python-ipaddress \ + python2-future \ + python2-junit_xml \ + python-yaml \ + python-six \ + sudo \ + tar \ + which \ + e2fsprogs \ + python2-pip \ + rubygem-asciidoctor + +COPY . 
/criu +WORKDIR /criu + +ENV CCACHE_DIR=/tmp/.ccache CCACHE_NOCOMPRESS=1 $ENV1=yes +RUN mv .ccache /tmp && make mrproper && ccache -sz && \ + date && make -j $(nproc) CC="$CC" && date && ccache -s + +# The rpc test cases are running as user #1000, let's add the user +RUN adduser -u 1000 test + +RUN make -C test/zdtm -j $(nproc) diff --git a/CRIU_code/scripts/build/Dockerfile.fedora-asan.hdr b/CRIU_code/scripts/build/Dockerfile.fedora-asan.hdr new file mode 100644 index 0000000..3ec09c1 --- /dev/null +++ b/CRIU_code/scripts/build/Dockerfile.fedora-asan.hdr @@ -0,0 +1,2 @@ +FROM fedora:29 +ENV ASAN=1 diff --git a/CRIU_code/scripts/build/Dockerfile.fedora-asan.tmpl b/CRIU_code/scripts/build/Dockerfile.fedora-asan.tmpl new file mode 100644 index 0000000..e4c4030 --- /dev/null +++ b/CRIU_code/scripts/build/Dockerfile.fedora-asan.tmpl @@ -0,0 +1 @@ +Dockerfile.fedora.tmpl \ No newline at end of file diff --git a/CRIU_code/scripts/build/Dockerfile.fedora-rawhide-aarch64.hdr b/CRIU_code/scripts/build/Dockerfile.fedora-rawhide-aarch64.hdr new file mode 100644 index 0000000..82f29e3 --- /dev/null +++ b/CRIU_code/scripts/build/Dockerfile.fedora-rawhide-aarch64.hdr @@ -0,0 +1,3 @@ +FROM arm64v8/fedora:rawhide + +COPY scripts/build/qemu-user-static/usr/bin/qemu-aarch64-static /usr/bin/qemu-aarch64-static diff --git a/CRIU_code/scripts/build/Dockerfile.fedora-rawhide-aarch64.tmpl b/CRIU_code/scripts/build/Dockerfile.fedora-rawhide-aarch64.tmpl new file mode 100644 index 0000000..e4c4030 --- /dev/null +++ b/CRIU_code/scripts/build/Dockerfile.fedora-rawhide-aarch64.tmpl @@ -0,0 +1 @@ +Dockerfile.fedora.tmpl \ No newline at end of file diff --git a/CRIU_code/scripts/build/Dockerfile.fedora-rawhide.hdr b/CRIU_code/scripts/build/Dockerfile.fedora-rawhide.hdr new file mode 100644 index 0000000..d6d9ab7 --- /dev/null +++ b/CRIU_code/scripts/build/Dockerfile.fedora-rawhide.hdr @@ -0,0 +1 @@ +FROM fedora:rawhide diff --git a/CRIU_code/scripts/build/Dockerfile.fedora-rawhide.tmpl 
b/CRIU_code/scripts/build/Dockerfile.fedora-rawhide.tmpl new file mode 100644 index 0000000..e4c4030 --- /dev/null +++ b/CRIU_code/scripts/build/Dockerfile.fedora-rawhide.tmpl @@ -0,0 +1 @@ +Dockerfile.fedora.tmpl \ No newline at end of file diff --git a/CRIU_code/scripts/build/Dockerfile.fedora.tmpl b/CRIU_code/scripts/build/Dockerfile.fedora.tmpl new file mode 100644 index 0000000..9653096 --- /dev/null +++ b/CRIU_code/scripts/build/Dockerfile.fedora.tmpl @@ -0,0 +1,59 @@ +ARG CC=gcc +ARG ENV1=FOOBAR + +RUN dnf install -y \ + ccache \ + findutils \ + gcc \ + git \ + gnutls-devel \ + iproute \ + iptables \ + libaio-devel \ + libasan \ + libcap-devel \ + libnet-devel \ + libnl3-devel \ + make \ + procps-ng \ + protobuf-c-devel \ + protobuf-devel \ + python2-protobuf \ + python2 \ + # Starting with Fedora 28 this is python2-ipaddress + python-ipaddress \ + # Starting with Fedora 28 this is python2-pyyaml + python-yaml \ + python3-pip \ + python2-future \ + python3-PyYAML \ + python3-future \ + python3-protobuf \ + python3-junit_xml \ + sudo \ + tar \ + which \ + e2fsprogs \ + rubygem-asciidoctor \ + kmod + +# Replace coreutils-single with "traditional" coreutils +# to fix the following error on Fedora 28/rawhide while +# running under QEMU: +# > sh: /usr/bin/sort: /usr/bin/coreutils: bad interpreter: No such file or directory +RUN dnf install -y --allowerasing coreutils + +RUN ln -sf python3 /usr/bin/python + +COPY . 
/criu +WORKDIR /criu + +ENV CCACHE_DIR=/tmp/.ccache CCACHE_NOCOMPRESS=1 $ENV1=yes +RUN mv .ccache /tmp && make mrproper && ccache -sz && \ + date && make -j $(nproc) CC="$CC" && date && ccache -s + +# The rpc test cases are running as user #1000, let's add the user +RUN adduser -u 1000 test + +RUN make -C test/zdtm -j $(nproc) + diff --git a/CRIU_code/scripts/build/Dockerfile.ppc64le.hdr b/CRIU_code/scripts/build/Dockerfile.ppc64le.hdr new file mode 100644 index 0000000..ba65901 --- /dev/null +++ b/CRIU_code/scripts/build/Dockerfile.ppc64le.hdr @@ -0,0 +1,5 @@ +FROM ppc64le/ubuntu:xenial + +ENV QEMU_CPU POWER8 +COPY scripts/build/qemu-user-static/usr/bin/qemu-ppc64le-static /usr/bin/qemu-ppc64le-static +RUN sed -i '/security/ d' /etc/apt/sources.list diff --git a/CRIU_code/scripts/build/Dockerfile.ppc64le.tmpl b/CRIU_code/scripts/build/Dockerfile.ppc64le.tmpl new file mode 100644 index 0000000..cb80479 --- /dev/null +++ b/CRIU_code/scripts/build/Dockerfile.ppc64le.tmpl @@ -0,0 +1 @@ +Dockerfile.tmpl \ No newline at end of file diff --git a/CRIU_code/scripts/build/Dockerfile.s390x.hdr b/CRIU_code/scripts/build/Dockerfile.s390x.hdr new file mode 100644 index 0000000..e02097f --- /dev/null +++ b/CRIU_code/scripts/build/Dockerfile.s390x.hdr @@ -0,0 +1,6 @@ +FROM s390x/debian:latest + +ENV QEMU_CPU z900 +COPY scripts/build/qemu-user-static/usr/bin/qemu-s390x-static /usr/bin/qemu-s390x-static +# The security repository does not seem to exist anymore +RUN sed -i '/security/ d' /etc/apt/sources.list diff --git a/CRIU_code/scripts/build/Dockerfile.s390x.tmpl b/CRIU_code/scripts/build/Dockerfile.s390x.tmpl new file mode 100644 index 0000000..cb80479 --- /dev/null +++ b/CRIU_code/scripts/build/Dockerfile.s390x.tmpl @@ -0,0 +1 @@ +Dockerfile.tmpl \ No newline at end of file diff --git a/CRIU_code/scripts/build/Dockerfile.tmpl b/CRIU_code/scripts/build/Dockerfile.tmpl new file mode 100644 index 0000000..4378ba1 --- /dev/null +++ b/CRIU_code/scripts/build/Dockerfile.tmpl @@ -0,0 
+1,45 @@ +ARG CC=gcc +ARG ENV1=FOOBAR + +RUN apt-get update && apt-get install -y \ + ccache \ + libnet-dev \ + libnl-route-3-dev \ + $CC \ + bsdmainutils \ + build-essential \ + git-core \ + iptables \ + libaio-dev \ + libcap-dev \ + libgnutls28-dev \ + libgnutls30 \ + libnl-3-dev \ + libprotobuf-c0-dev \ + libprotobuf-dev \ + libselinux-dev \ + pkg-config \ + protobuf-c-compiler \ + protobuf-compiler \ + python-minimal \ + python-future + +COPY . /criu +WORKDIR /criu +ENV CC="ccache $CC" CCACHE_DIR=/tmp/.ccache CCACHE_NOCOMPRESS=1 $ENV1=yes + +RUN mv .ccache /tmp && make mrproper && ccache -s && \ + date && \ +# Check single object build + make -j $(nproc) CC="$CC" criu/parasite-syscall.o && \ +# Compile criu + make -j $(nproc) CC="$CC" && \ + date && \ +# Check that "make mrproper" works + make mrproper && ! git clean -ndx --exclude=scripts/build \ + --exclude=.config --exclude=test | grep . + +# Compile tests +RUN date && make -j $(nproc) CC="$CC" -C test/zdtm && date + +#RUN make test/compel/handle_binary && ./test/compel/handle_binary diff --git a/CRIU_code/scripts/build/Dockerfile.x86_64.hdr b/CRIU_code/scripts/build/Dockerfile.x86_64.hdr new file mode 100644 index 0000000..b020d9c --- /dev/null +++ b/CRIU_code/scripts/build/Dockerfile.x86_64.hdr @@ -0,0 +1,4 @@ +FROM ubuntu:xenial + +RUN apt-get update -qq && apt-get install -qq \ + gcc-multilib diff --git a/CRIU_code/scripts/build/Dockerfile.x86_64.tmpl b/CRIU_code/scripts/build/Dockerfile.x86_64.tmpl new file mode 100644 index 0000000..cb80479 --- /dev/null +++ b/CRIU_code/scripts/build/Dockerfile.x86_64.tmpl @@ -0,0 +1 @@ +Dockerfile.tmpl \ No newline at end of file diff --git a/CRIU_code/scripts/build/Makefile b/CRIU_code/scripts/build/Makefile new file mode 100644 index 0000000..f333b21 --- /dev/null +++ b/CRIU_code/scripts/build/Makefile @@ -0,0 +1,52 @@ +QEMU_ARCHES := armv7hf aarch64 ppc64le s390x fedora-rawhide-aarch64 # require qemu +ARCHES := $(QEMU_ARCHES) x86_64 fedora-asan fedora-rawhide 
centos +TARGETS := $(ARCHES) alpine +TARGETS_CLANG := $(addsuffix -clang,$(TARGETS)) + +all: $(TARGETS) $(TARGETS_CLANG) +.PHONY: all + +# A build for each architecture requires appropriate Dockerfile +define ARCH_DEP +$(1): Dockerfile.$(1) +endef +$(foreach arch,$(ARCHES),$(eval $(call ARCH_DEP,$(arch)))) + +Dockerfile.%: Dockerfile.%.hdr Dockerfile.%.tmpl + cat $^ > $@ + +qemu-user-static: + ./extract-deb-pkg qemu-user-static + +binfmt_misc: + ./binfmt_misc +.PHONY: binfmt_misc + +$(QEMU_ARCHES): qemu-user-static binfmt_misc + +$(TARGETS): + mkdir -p $(HOME)/.ccache + mv $(HOME)/.ccache ../../ + docker build -t criu-$@ -f Dockerfile.$@ $(DB_CC) $(DB_ENV) ../.. + docker run criu-$@ tar c -C /tmp .ccache | tar x -C $(HOME) +.PHONY: $(TARGETS) + +# Clang builds add some Docker build env +define CLANG_DEP +$(1)-clang: $(1) +endef +$(foreach t,$(TARGETS),$(eval $(call CLANG_DEP,$(t)))) + +%-clang: DB_CC=--build-arg CC=clang +%-clang: DB_ENV=--build-arg ENV1=CCACHE_CPP2 +s390x-clang: DB_CC=--build-arg CC=clang-3.8 +.PHONY: $(TARGETS_CLANG) + +clean: + rm -rf qemu-user-static + for ARCH in $(ARCHES); do \ + FILE=/proc/sys/fs/binfmt_misc/$$ARCH; \ + test -f $$FILE && echo -1 > $$FILE; \ + rm -f Dockerfile.$$ARCH; \ + done +.PHONY: clean diff --git a/CRIU_code/scripts/build/binfmt_misc b/CRIU_code/scripts/build/binfmt_misc new file mode 100644 index 0000000..bf2a2ec --- /dev/null +++ b/CRIU_code/scripts/build/binfmt_misc @@ -0,0 +1,13 @@ +set -e -x + +test -f /proc/sys/fs/binfmt_misc/armv7hf || + echo ':armv7hf:M::\x7fELF\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x28\x00:\xff\xff\xff\xff\xff\xff\xff\x00\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff\xff:/usr/bin/qemu-arm-static:' > /proc/sys/fs/binfmt_misc/register; + +test -f /proc/sys/fs/binfmt_misc/aarch64 || + echo ':aarch64:M::\x7fELF\x02\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\xb7:\xff\xff\xff\xff\xff\xff\xff\x00\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff:/usr/bin/qemu-aarch64-static:' >
/proc/sys/fs/binfmt_misc/register + +test -f /proc/sys/fs/binfmt_misc/ppc64le || + echo ':ppc64le:M::\x7fELF\x02\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x15\x00:\xff\xff\xff\xff\xff\xff\xff\xfc\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff\x00:/usr/bin/qemu-ppc64le-static:' > /proc/sys/fs/binfmt_misc/register + +test -f /proc/sys/fs/binfmt_misc/s390x || + echo ':s390x:M::\x7fELF\x02\x02\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x16:\xff\xff\xff\xff\xff\xff\xff\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff:/usr/bin/qemu-s390x-static:' > /proc/sys/fs/binfmt_misc/register diff --git a/CRIU_code/scripts/build/extract-deb-pkg b/CRIU_code/scripts/build/extract-deb-pkg new file mode 100644 index 0000000..44457bc --- /dev/null +++ b/CRIU_code/scripts/build/extract-deb-pkg @@ -0,0 +1,36 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail +MIRROR="https://mirrors.kernel.org/ubuntu" +PKGS="$MIRROR/dists/bionic/universe/binary-amd64/Packages.gz" + +if [ $# -ne 1 ]; then + echo "Usage: $0 package-name" 1>&2 + exit 1 +fi + +if [ -d "$1" ]; then + echo "Directory $1 already exists -- exiting" + exit 0 +fi + +if ! 
pkg=$(curl -sSL "$PKGS" | zgrep "Filename.*$1" | awk '{ print $2 }'); then + echo "ERROR: no packages matching $1" 1>&2 + exit 1 +fi + +if [ "$(wc -w <<< "$pkg")" -gt 1 ]; then + echo "$pkg" 1>&2 + echo "ERROR: more than one match for $1" 1>&2 + exit 1 +fi + +mkdir "$1" +cd "$1" + +wget "$MIRROR/$pkg" +pkg=$(basename "$pkg") +ar vx "$pkg" +tar xJvf data.tar.xz diff --git a/CRIU_code/scripts/crit-setup.py b/CRIU_code/scripts/crit-setup.py new file mode 100644 index 0000000..60fef6a --- /dev/null +++ b/CRIU_code/scripts/crit-setup.py @@ -0,0 +1,12 @@ +from distutils.core import setup + +setup(name = "crit", + version = "0.0.1", + description = "CRiu Image Tool", + author = "CRIU team", + author_email = "criu@openvz.org", + url = "https://github.com/xemul/criu", + package_dir = {'pycriu': 'lib/py'}, + packages = ["pycriu", "pycriu.images"], + scripts = ["crit/crit"] + ) diff --git a/CRIU_code/scripts/criu-ns b/CRIU_code/scripts/criu-ns new file mode 100644 index 0000000..e065c59 --- /dev/null +++ b/CRIU_code/scripts/criu-ns @@ -0,0 +1,252 @@ +#!/usr/bin/env python +import ctypes +import ctypes.util +import errno +import sys +import os + +# constants for unshare +CLONE_NEWNS = 0x00020000 +CLONE_NEWPID = 0x20000000 + +# - constants for mount +MS_REC = 16384 +MS_PRIVATE = 1 << 18 +MS_SLAVE = 1 << 19 + +# Load libc bindings +_libc = ctypes.CDLL(ctypes.util.find_library("c"), use_errno=True) + +try: + _unshare = _libc.unshare +except AttributeError: + raise OSError(errno.EINVAL, "unshare is not supported on this platform") +else: + _unshare.argtypes = [ ctypes.c_int ] + _unshare.restype = ctypes.c_int + +try: + _setns = _libc.setns +except AttributeError: + raise OSError(errno.EINVAL, "setns is not supported on this platform") +else: + _setns.argtypes = [ ctypes.c_int, ctypes.c_int ] + _setns.restype = ctypes.c_int + +try: + _mount = _libc.mount +except AttributeError: + raise OSError(errno.EINVAL, "mount is not supported on this platform") +else: + _mount.argtypes = [ + 
ctypes.c_char_p, + ctypes.c_char_p, + ctypes.c_char_p, + ctypes.c_ulong, + ctypes.c_void_p + ] + _mount.restype = ctypes.c_int + +try: + _umount = _libc.umount +except AttributeError: + raise OSError(errno.EINVAL, "umount is not supported on this platform") +else: + _umount.argtypes = [ctypes.c_char] + _umount.restype = ctypes.c_int + + +def run_criu(): + print(sys.argv) + os.execlp('criu', *['criu'] + sys.argv[1:]) + + +def wrap_restore(): + # Unshare pid and mount namespaces + if _unshare(CLONE_NEWNS | CLONE_NEWPID) != 0: + _errno = ctypes.get_errno() + raise OSError(_errno, errno.errorcode[_errno]) + + (r_pipe, w_pipe) = os.pipe() + + # Spawn the init + if os.fork() == 0: + os.close(r_pipe) + + # Mount new /proc + if _mount(None, b"/", None, MS_SLAVE|MS_REC, None) != 0: + _errno = ctypes.get_errno() + raise OSError(_errno, errno.errorcode[_errno]) + + if _mount(b'proc', b'/proc', b'proc', 0, None) != 0: + _errno = ctypes.get_errno() + raise OSError(_errno, errno.errorcode[_errno]) + + # Spawn CRIU binary + criu_pid = os.fork() + if criu_pid == 0: + run_criu() + raise OSError(errno.ENOENT, "No such command") + + while True: + try: + (pid, status) = os.wait() + if pid == criu_pid: + status = os.WEXITSTATUS(status) + break + except OSError: + status = -251 + break + + os.write(w_pipe, b"%d" % status) + os.close(w_pipe) + + if status != 0: + sys.exit(status) + + while True: + try: + os.wait() + except OSError: + break + + sys.exit(0) + + # Wait for CRIU to exit and report the status back + os.close(w_pipe) + status = os.read(r_pipe, 1024) + if not status.isdigit(): + status_i = -252 + else: + status_i = int(status) + + return status_i + + +def get_varg(args): + for i in range(1, len(sys.argv)): + if not sys.argv[i] in args: + continue + + if i + 1 >= len(sys.argv): + break + + return (sys.argv[i + 1], i + 1) + + return (None, None) + + + +def set_pidns(tpid, pid_idx): + # Joind pid namespace. 
Note, that the given pid should + # be changed in -t option, as task lives in different + # pid namespace. + + myns = os.stat('/proc/self/ns/pid').st_ino + + ns_fd = os.open('/proc/%s/ns/pid' % tpid, os.O_RDONLY) + if myns != os.fstat(ns_fd).st_ino: + + for l in open('/proc/%s/status' % tpid): + if not l.startswith('NSpid:'): + continue + + ls = l.split() + if ls[1] != tpid: + raise OSError(errno.ESRCH, 'No such pid') + + print('Replace pid {} with {}'.format(tpid, ls[2])) + sys.argv[pid_idx] = ls[2] + break + else: + raise OSError(errno.ENOENT, 'Cannot find NSpid field in proc') + + if _setns(ns_fd, 0) != 0: + _errno = ctypes.get_errno() + raise OSError(_errno, errno.errorcode[_errno]) + + os.close(ns_fd) + + +def set_mntns(tpid): + # Join mount namespace. Trick here too -- check / and . + # will be the same in target mntns. + + myns = os.stat('/proc/self/ns/mnt').st_ino + ns_fd = os.open('/proc/%s/ns/mnt' % tpid, os.O_RDONLY) + if myns != os.fstat(ns_fd).st_ino: + root_st = os.stat('/') + cwd_st = os.stat('.') + cwd_path = os.path.realpath('.') + + if _setns(ns_fd, 0) != 0: + _errno = ctypes.get_errno() + raise OSError(_errno, errno.errorcode[_errno]) + + os.chdir(cwd_path) + root_nst = os.stat('/') + cwd_nst = os.stat('.') + + def steq(st, nst): + return (st.st_dev, st.st_ino) == (nst.st_dev, nst.st_ino) + + if not steq(root_st, root_nst): + raise OSError(errno.EXDEV, 'Target ns / is not as current') + if not steq(cwd_st, cwd_nst): + raise OSError(errno.EXDEV, 'Target ns . 
is not as current') + + + os.close(ns_fd) + + +def wrap_dump(): + (pid, pid_idx) = get_varg(('-t', '--tree')) + if pid is None: + raise OSError(errno.EINVAL, 'No --tree option given') + + set_pidns(pid, pid_idx) + set_mntns(pid) + + # Spawn CRIU binary + criu_pid = os.fork() + if criu_pid == 0: + run_criu() + raise OSError(errno.ENOENT, "No such command") + + # Wait for CRIU to exit and report the status back + while True: + try: + (pid, status) = os.wait() + if pid == criu_pid: + status = os.WEXITSTATUS(status) + break + except OSError: + status = -251 + break + + return status + + +if len(sys.argv) == 1: + print(""" +Usage: + {0} dump|pre-dump -t PID [] + {0} restore [] +\nCommands: + dump checkpoint a process/tree identified by pid + pre-dump pre-dump task(s) minimizing their frozen time + restore restore a process/tree +""".format(sys.argv[0])) + exit(1) + +action = sys.argv[1] + +if action == 'restore': + res = wrap_restore() +elif action == 'dump' or action == 'pre-dump': + res = wrap_dump() +else: + print('Unsupported action {} for nswrap'.format(action)) + res = -1 + +sys.exit(res) diff --git a/CRIU_code/scripts/fake-restore.sh b/CRIU_code/scripts/fake-restore.sh new file mode 100644 index 0000000..2728fb3 --- /dev/null +++ b/CRIU_code/scripts/fake-restore.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# +# A stupid script to abort restore at the very end. Useful to test +# restore w/o letting the restored processes continue running. E.g. +# can be used to measure the restore time. 
+# +# Usage: +# criu restore --action-script $(pwd)/scripts/fake-restore.sh +# +if [ "$CRTOOLS_SCRIPT_ACTION" == "post-restore" ]; then + touch restore-succeeded + exit 1 +else + exit 0 +fi diff --git a/CRIU_code/scripts/feature-tests.mak b/CRIU_code/scripts/feature-tests.mak new file mode 100644 index 0000000..e39d97b --- /dev/null +++ b/CRIU_code/scripts/feature-tests.mak @@ -0,0 +1,138 @@ +define FEATURE_TEST_TCP_REPAIR + +#include + +int main(void) +{ + struct tcp_repair_opt opts; + opts.opt_code = TCP_NO_QUEUE; + opts.opt_val = 0; + + return opts.opt_val; +} +endef + +define FEATURE_TEST_TCP_REPAIR_WINDOW + +#include + +int main(void) +{ + struct tcp_repair_window opts; + + opts.snd_wl1 = 0; + + return opts.snd_wl1; +} +endef + +define FEATURE_TEST_LIBBSD_DEV +#include + +int main(void) +{ + return 0; +} +endef + +define FEATURE_TEST_STRLCPY + +#include + +#ifdef CONFIG_HAS_LIBBSD +# include +#endif + +int main(void) +{ + return strlcpy(NULL, NULL, 0); +} +endef + +define FEATURE_TEST_STRLCAT + +#include + +#ifdef CONFIG_HAS_LIBBSD +# include +#endif + +int main(void) +{ + return strlcat(NULL, NULL, 0); +} +endef + +define FEATURE_TEST_PTRACE_PEEKSIGINFO + +#include + +int main(void) +{ + struct ptrace_peeksiginfo_args args = {}; + + return 0; +} + +endef + +define FEATURE_TEST_SETPROCTITLE_INIT + +#include + +int main(int argc, char *argv[], char *envp[]) +{ + setproctitle_init(argc, argv, envp); + + return 0; +} + +endef + +define FEATURE_TEST_X86_COMPAT +#define __ALIGN .align 4, 0x90 +#define ENTRY(name) \ + .globl name; \ + .type name, @function; \ + __ALIGN; \ + name: + +#define END(sym) \ + .size sym, . 
- sym + +#define __USER32_CS 0x23 +#define __USER_CS 0x33 + + .text + +ENTRY(call32_from_64) + /* Push return address and 64-bit segment descriptor */ + sub \$$4, %rsp + movl \$$__USER_CS,(%rsp) + sub \$$4, %rsp + /* Using rip-relative addressing to get rid of R_X86_64_32S relocs */ + leaq 2f(%rip),%r12 + movl %r12d,(%rsp) + + /* Switch into compatibility mode */ + pushq \$$__USER32_CS + /* Using rip-relative addressing to get rid of R_X86_64_32S relocs */ + leaq 1f(%rip), %r12 + pushq %r12 + lretq + +1: .code32 + /* Run function and switch back */ + call *%esi + lret + +2: .code64 + /* Restore the stack */ + mov (%rsp),%rsp + add \$$8, %rdi +END(call32_from_64) + +ENTRY(main) + nop +END(main) +endef diff --git a/CRIU_code/scripts/flake8.cfg b/CRIU_code/scripts/flake8.cfg new file mode 100644 index 0000000..4231e84 --- /dev/null +++ b/CRIU_code/scripts/flake8.cfg @@ -0,0 +1,10 @@ +[flake8] +# W191 indentation contains tabs +# E128 continuation line under-indented for visual indent +# E501 line too long +# E251 unexpected spaces around keyword / parameter equals +# E101 indentation contains mixed spaces and tabs +# E126 continuation line over-indented for hanging indent +# W504 line break after binary operator +# E117 over-indented +ignore = W191,E128,E501,E251,E101,E126,W504,E117 diff --git a/CRIU_code/scripts/install-debian-pkgs.sh b/CRIU_code/scripts/install-debian-pkgs.sh new file mode 100644 index 0000000..c8af34f --- /dev/null +++ b/CRIU_code/scripts/install-debian-pkgs.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Install required packages for development environment in Debian Distro + +REQ_PKGS=${REQ_PKGS:=contrib/debian/dev-packages.lst} + +help_msg="Install required packages for development environment in Debian Distro +Usage: + scripts/install-debian-pkgs.sh" + +function print_help() +{ + exec echo -e "$help_msg" +} + +function process() +{ + sudo apt-get update + sudo apt-get install -yq $( sed 's/\#.*$//' ${REQ_PKGS} ) +} + +if [ "$1" = "--help" ] || [ "$1" = 
"-h" ]; then + print_help +else + process +fi diff --git a/CRIU_code/scripts/magic-gen.py b/CRIU_code/scripts/magic-gen.py new file mode 100644 index 0000000..7088f63 --- /dev/null +++ b/CRIU_code/scripts/magic-gen.py @@ -0,0 +1,61 @@ +#!/bin/env python2 +import sys + +# This program parses criu magic.h file and produces +# magic.py with all *_MAGIC constants except RAW and V1. +def main(argv): + if len(argv) != 3: + print("Usage: magic-gen.py path/to/image.h path/to/magic.py") + exit(1) + + magic_c_header = argv[1] + magic_py = argv[2] + + out = open(magic_py, 'w+') + + # all_magic is used to parse constructions like: + # #define PAGEMAP_MAGIC 0x56084025 + # #define SHMEM_PAGEMAP_MAGIC PAGEMAP_MAGIC + all_magic = {} + # and magic is used to store only unique magic. + magic = {} + + f = open(magic_c_header, 'r') + for line in f: + split = line.split() + + if len(split) < 3: + continue + + if not '#define' in split[0]: + continue + + key = split[1] + value = split[2] + + if value in all_magic: + value = all_magic[value] + else: + magic[key] = value + + all_magic[key] = value + + out.write('#Autogenerated. Do not edit!\n') + out.write('by_name = {}\n') + out.write('by_val = {}\n') + for k,v in list(magic.items()): + # We don't need RAW or V1 magic, because + # they can't be used to identify images. 
+		if v in ('0x0', '1'):	# value-only check; the old test repeated v == '1' and compared the key to '0x0'
+			continue
+		if k.endswith("_MAGIC"):
+			# Just cutting _MAGIC suffix
+			k = k[:-6]
+		v = int(v, 16)
+		out.write("by_name['"+ k +"'] = "+ str(v) +"\n")
+		out.write("by_val["+ str(v) +"] = '"+ k +"'\n")
+	f.close()
+	out.close()
+
+if __name__ == "__main__":
+	main(sys.argv)
diff --git a/CRIU_code/scripts/nmk/.gitignore b/CRIU_code/scripts/nmk/.gitignore
new file mode 100644
index 0000000..9166105
--- /dev/null
+++ b/CRIU_code/scripts/nmk/.gitignore
@@ -0,0 +1,3 @@
+*.swp
+*.swo
+.git-ignore
diff --git a/CRIU_code/scripts/nmk/Documentation/Makefile b/CRIU_code/scripts/nmk/Documentation/Makefile
new file mode 100644
index 0000000..4c037de
--- /dev/null
+++ b/CRIU_code/scripts/nmk/Documentation/Makefile
@@ -0,0 +1,50 @@
+ifneq ($(USE_ASCIIDOCTOR),)
+ASCIIDOC	:= asciidoctor
+XMLTO		:=
+else
+ASCIIDOC	:= asciidoc
+XMLTO		:= xmlto
+endif
+PS2PDF		:= ps2pdf
+
+SRC		+= nmk.txt
+XMLS		:= $(patsubst %.txt,%.xml,$(SRC))
+MANS		:= $(patsubst %.txt,%.8,$(SRC))
+
+GROFF		:= groff
+PAPER		:= $(shell paperconf 2>/dev/null || echo letter)
+GROFF_OPTS	:= -Tps -t -dpaper=$(PAPER) -P-p$(PAPER) -man -msafer -rC1 -rD1 -rS11
+PSS		:= $(MANS:%.8=%.ps)
+PDFS		:= $(MANS:%.8=%.pdf)
+
+ps: $(PSS)
+pdf: $(PDFS)
+all: check $(MANS)
+
+.PHONY: all ps pdf check clean
+
+check:
+	$(Q) for B in $(ASCIIDOC) $(XMLTO); do	\
+		$$B --version > /dev/null || exit 1;	\
+	done
+
+%.8: %.txt
+	$(call msg-gen, $@)
+ifneq ($(USE_ASCIIDOCTOR),)
+	$(Q) $(ASCIIDOC) -b manpage -d manpage -o $@ $<
+else
+	$(Q) $(ASCIIDOC) -b docbook -d manpage -o $(patsubst %.8,%.xml,$@) $<
+	$(Q) $(XMLTO) man --skip-validation $(patsubst %.8,%.xml,$@) 2>/dev/null
+endif
+
+%.ps: %.8
+	$(call msg-gen, $@)
+	$(Q) $(GROFF) $(GROFF_OPTS) $^ > $@
+
+%.pdf: %.ps
+	$(call msg-gen, $@)
+	$(Q) $(PS2PDF) $< $@
+
+clean:
+	$(call msg-clean, "docs")
+	$(Q) $(RM) $(XMLS) $(MANS) $(PSS) $(PDFS)
diff --git a/CRIU_code/scripts/nmk/Documentation/nmk.txt b/CRIU_code/scripts/nmk/Documentation/nmk.txt new
file mode 100644 index 0000000..8e97864 --- /dev/null +++ b/CRIU_code/scripts/nmk/Documentation/nmk.txt @@ -0,0 +1,70 @@ +nmk(8) +====== + +NAME +---- +nmk - a framework to minimize Makefile code needed for simple projects + + +SYNOPSIS +-------- +*make* -f main.mk makefile=Makefile obj= + + +OVERVIEW +-------- +Most of projects have similar source code structure: + + * Toplevel 'Makefile' + * Source code itself in directory '' + * Headers are gathered into directory '' + +so that building procedure is invoking *make* to read toplevel 'Makefile', +compile sources and link a final executable program. Taking this into account +*nmk* is trying to minimize efforts needed to write 'Makefile'. + + +USAGE +----- +First of all the *nmk* scripts are to be placed into some known place so the +*make* would be able to read them from a command line. Internally *nmk* uses +*__nmk_dir* variable to find own sources. Thus one can export + +---------- + export __nmk_dir=/ +---------- + +in a makefile or do it via environment variables. Note the ending slash is mandatory. + +As been mentioned earlier source code tree should include toplevel 'Makefile' +and source code in '' directory. Source code '' should provide own +'Makefile' (secondlevel) where files to be compiled are enumerated. + +A typical source code tree will look like + +---------- + Makefile # toplevel Makefile + # directory with nmk scripts + # source code directory + Makefile # secondlevel Makefile + src1.c # source code + src2.c + ... +---------- + +In toplevel 'Makefile' we should plug in *nmk* itself + +---------- + export __nmk_dir=scripts/ + include $(__nmk_dir)include.mk +---------- + +In secondlevel 'Makefile' we should enumerate files to be compiled. + +---------- + obj-y += src1.o + obj-y += src2.o + ... +---------- + +That is basically all one need to build a program. 
diff --git a/CRIU_code/scripts/nmk/Makefile b/CRIU_code/scripts/nmk/Makefile new file mode 100644 index 0000000..ba4aca8 --- /dev/null +++ b/CRIU_code/scripts/nmk/Makefile @@ -0,0 +1,35 @@ +__nmk_dir=scripts/ +export __nmk_dir + +include $(__nmk_dir)include.mk + +help: + @echo ' Targets:' + @echo ' install dir= - Install scripts into directory ' + @echo ' docs - Build documentation' + @echo ' clean - Clean everything' + +test: + $(Q) $(MAKE) -C tests all + +docs: + $(Q) $(MAKE) -C Documentation all + +install: + @echo 'Copying scripts into $(dir)' + @cp scripts/build.mk $(dir) + @cp scripts/include.mk $(dir) + @cp scripts/macro.mk $(dir) + @cp scripts/main.mk $(dir) + @cp scripts/rules.mk $(dir) + @cp scripts/tools.mk $(dir) + @cp scripts/utils.mk $(dir) + +all: ; + +clean: + $(call msg-clean, "nmk") + $(Q) $(MAKE) -C Documentation clean + $(Q) $(MAKE) -C tests clean + +.DEFAULT_GOAL ?= all diff --git a/CRIU_code/scripts/nmk/README.md b/CRIU_code/scripts/nmk/README.md new file mode 100644 index 0000000..3c0b570 --- /dev/null +++ b/CRIU_code/scripts/nmk/README.md @@ -0,0 +1,5 @@ +NMK +=== + +NMK stands for NetMaKe -- is a very simple framework for make build system. +Most ideas are taken from the Linux kernel kbuild system. diff --git a/CRIU_code/scripts/nmk/scripts/build.mk b/CRIU_code/scripts/nmk/scripts/build.mk new file mode 100644 index 0000000..d01d2b7 --- /dev/null +++ b/CRIU_code/scripts/nmk/scripts/build.mk @@ -0,0 +1,330 @@ +ifndef ____nmk_defined__build + +# +# General helpers for simplified Makefiles. 
+# +src := $(obj) +src-makefile := $(call objectify,$(makefile)) +obj-y := +obj-e := +builtin-name := +builtin-target := +lib-y := +lib-e := +lib-name := +lib-target := +hostprogs-y := +libso-y := +ld_flags := +ldflags-so := +arflags-y := +target := +deps-y := +all-y := +cleanup-y := +mrproper-y := +target := +objdirs := + +MAKECMDGOALS := $(call uniq,$(MAKECMDGOALS)) + +ifndef obj + $(error obj is undefined) +endif + +ifndef __nmk-makefile-deps + # Add top-make - it isn't included into this build.mk + __nmk-makefile-deps := Makefile +endif +__nmk-makefile-deps += $(src-makefile) +export __nmk-makefile-deps + +# +# Filter out any -Wl,XXX option: some of build farms +# assumes that we're using $(CC) for building built-in +# targets (and they have all rights to). But we're +# using $(LD) directly instead so filter out -Wl +# flags to make maintainer's life easier. +LDFLAGS-MASK := -Wl,% +LDFLAGS := $(filter-out $(LDFLAGS-MASK),$(LDFLAGS)) + +# +# Accumulate common flags. +define nmk-ccflags + $(filter-out $(CFLAGS_REMOVE_$(@F)), $(CFLAGS) $(ccflags-y) $(CFLAGS_$(@F))) +endef + +define nmk-asflags + $(CFLAGS) $(AFLAGS) $(asflags-y) $(AFLAGS_$(@F)) +endef + +define nmk-host-ccflags + $(HOSTCFLAGS) $(host-ccflags-y) $(HOSTCFLAGS_$(@F)) +endef + +# +# General rules. 
+define gen-cc-rules +$(1).o: $(2).c $(__nmk-makefile-deps) + $$(call msg-cc, $$@) + $$(Q) $$(CC) -c $$(strip $$(nmk-ccflags)) $$< -o $$@ +$(1).i: $(2).c $(__nmk-makefile-deps) + $$(call msg-cc, $$@) + $$(Q) $$(CC) -E $$(strip $$(nmk-ccflags)) $$< -o $$@ +$(1).s: $(2).c $(__nmk-makefile-deps) + $$(call msg-cc, $$@) + $$(Q) $$(CC) -S -fverbose-asm $$(strip $$(nmk-ccflags)) $$< -o $$@ +$(1).d: $(2).c $(__nmk-makefile-deps) + $$(call msg-dep, $$@) + $$(Q) $$(CC) -M -MT $$@ -MT $$(patsubst %.d,%.o,$$@) $$(strip $$(nmk-ccflags)) $$< -o $$@ +$(1).o: $(2).S $(__nmk-makefile-deps) + $$(call msg-cc, $$@) + $$(Q) $$(CC) -c $$(strip $$(nmk-asflags)) $$< -o $$@ +$(1).i: $(2).S $(__nmk-makefile-deps) + $$(call msg-cc, $$@) + $$(Q) $$(CC) -E $$(strip $$(nmk-asflags)) $$< -o $$@ +$(1).d: $(2).S $(__nmk-makefile-deps) + $$(call msg-dep, $$@) + $$(Q) $$(CC) -M -MT $$@ -MT $$(patsubst %.d,%.o,$$@) $$(strip $$(nmk-asflags)) $$< -o $$@ +endef + +include $(src-makefile) + +ifneq ($(strip $(target)),) + target := $(sort $(call uniq,$(target))) +endif + +# +# Prepare the unique entries. +obj-y := $(sort $(call uniq,$(obj-y))) +lib-y := $(filter-out $(obj-y),$(lib-y)) + +# +# Add subdir path +obj-y := $(call objectify,$(obj-y)) +lib-y := $(call objectify,$(lib-y)) + +# +# Strip custom names. +lib-name := $(strip $(lib-name)) +builtin-name := $(strip $(builtin-name)) + +# +# Link flags. +ldflags-y := $(strip $(LDFLAGS) $(ldflags-y)) + +# +# $(obj) related rules. +$(eval $(call gen-cc-rules,$(obj)/%,$(obj)/%)) + +# +# Prepare targets. 
+ifneq ($(lib-y),) + ifneq ($(lib-name),) + lib-target := $(obj)/$(lib-name) + else + lib-target := $(obj)/lib.a + endif + cleanup-y += $(call cleanify,$(lib-y)) + cleanup-y += $(lib-target) + all-y += $(lib-target) + objdirs += $(dir $(lib-y)) +endif + +ifneq ($(obj-y),) + ifneq ($(builtin-name),) + builtin-target := $(obj)/$(builtin-name) + else + builtin-target := $(obj)/built-in.o + endif + cleanup-y += $(call cleanify,$(obj-y)) + cleanup-y += $(builtin-target) + all-y += $(builtin-target) + objdirs += $(dir $(obj-y)) +endif + +# +# Helpers for targets. +define gen-ld-target-rule +$(1): $(3) + $$(call msg-link, $$@) + $$(Q) $$(LD) $(2) -o $$@ $(4) +endef + +define gen-ar-target-rule +$(1): $(3) + $$(call msg-ar, $$@) + $$(Q) $$(AR) -rcs$(2) $$@ $(4) +endef + +# +# Predefined (builtins) targets rules. +ifdef builtin-target + $(eval $(call gen-ld-target-rule, \ + $(builtin-target), \ + $(ldflags-y), \ + $(obj-y) $(__nmk-makefile-deps), \ + $(obj-y) $(call objectify,$(obj-e)))) +endif + +ifdef lib-target + $(eval $(call gen-ar-target-rule, \ + $(lib-target), \ + $(ARFLAGS) $(arflags-y), \ + $(lib-y) $(__nmk-makefile-deps), \ + $(lib-y) $(call objectify,$(lib-e)))) +endif + +# +# Custom targets rules. 
+define gen-custom-target-rule
+  ifneq ($($(1)-obj-y),)
+    $(eval $(call gen-ld-target-rule,	\
+        $(obj)/$(1).built-in.o,	\
+        $(ldflags-y) $(LDFLAGS_$(1)),	\
+        $(call objectify,$($(1)-obj-y))	\
+        $(__nmk-makefile-deps),	\
+        $(call objectify,$($(1)-obj-y))	\
+        $(call objectify,$($(1)-obj-e))))
+    all-y += $(obj)/$(1).built-in.o
+    cleanup-y += $(call cleanify,$(call objectify,$($(1)-obj-y)))
+    cleanup-y += $(obj)/$(1).built-in.o
+    objdirs += $(dir $(call objectify,$($(1)-obj-y)))
+  endif
+  ifneq ($($(1)-lib-y),)
+    $(eval $(call gen-ar-target-rule,	\
+        $(obj)/$(1).lib.a,	\
+        $(ARFLAGS) $($(1)-arflags-y),	\
+        $(call objectify,$($(1)-lib-y))	\
+        $(__nmk-makefile-deps),	\
+        $(call objectify,$($(1)-lib-y))	\
+        $(call objectify,$($(1)-lib-e))))
+    all-y += $(obj)/$(1).lib.a
+    cleanup-y += $(call cleanify,$(call objectify,$($(1)-lib-y)))
+    cleanup-y += $(obj)/$(1).lib.a
+    objdirs += $(dir $(call objectify,$($(1)-lib-y)))
+  endif
+endef
+# For each name in target: emit name.built-in.o (if name-obj-y is set) and name.lib.a (if name-lib-y is set).
+$(foreach t,$(target),$(eval $(call gen-custom-target-rule,$(t))))
+
+#
+# Prepare rules for dirs other than (obj)/.
+objdirs := $(patsubst %/,%,$(filter-out $(obj)/,$(call uniq,$(objdirs))))
+$(foreach t,$(objdirs),$(eval $(call gen-cc-rules,$(t)/%,$(t)/%)))
+
+#
+# Host programs.
+define gen-host-cc-rules +$(addprefix $(obj)/,$(1)): $(obj)/%.o: $(obj)/%.c $(__nmk-makefile-deps) + $$(call msg-host-cc, $$@) + $$(Q) $$(HOSTCC) -c $$(strip $$(nmk-host-ccflags)) $$< -o $$@ +$(patsubst %.o,%.i,$(addprefix $(obj)/,$(1))): $(obj)/%.i: $(obj)/%.c $(__nmk-makefile-deps) + $$(call msg-host-cc, $$@) + $$(Q) $$(HOSTCC) -E $$(strip $$(nmk-host-ccflags)) $$< -o $$@ +$(patsubst %.o,%.s,$(addprefix $(obj)/,$(1))): $(obj)/%.s: $(obj)/%.c $(__nmk-makefile-deps) + $$(call msg-host-cc, $$@) + $$(Q) $$(HOSTCC) -S -fverbose-asm $$(strip $$(nmk-host-ccflags)) $$< -o $$@ +$(patsubst %.o,%.d,$(addprefix $(obj)/,$(1))): $(obj)/%.d: $(obj)/%.c $(__nmk-makefile-deps) + $$(call msg-host-dep, $$@) + $$(Q) $$(HOSTCC) -M -MT $$@ -MT $$(patsubst %.d,%.o,$$@) $$(strip $$(nmk-host-ccflags)) $$< -o $$@ +endef + +define gen-host-rules + $(eval $(call gen-host-cc-rules,$($(1)-objs))) + all-y += $(addprefix $(obj)/,$($(1)-objs)) + cleanup-y += $(call cleanify,$(addprefix $(obj)/,$($(1)-objs))) +$(obj)/$(1): $(addprefix $(obj)/,$($(1)-objs)) $(__nmk-makefile-deps) + $$(call msg-host-link, $$@) + $$(Q) $$(HOSTCC) $$(HOSTCFLAGS) $(addprefix $(obj)/,$($(1)-objs)) $$(HOSTLDFLAGS) $$(HOSTLDFLAGS_$$(@F)) -o $$@ +all-y += $(obj)/$(1) +cleanup-y += $(obj)/$(1) +endef +$(foreach t,$(hostprogs-y),$(eval $(call gen-host-rules,$(t)))) + +# +# Dynamic library linking. +define gen-so-link-rules +$(call objectify,$(1)).so: $(call objectify,$($(1)-objs)) $(__nmk-makefile-deps) + $$(call msg-link, $$@) + $$(Q) $$(CC) -shared $$(ldflags-so) $$(LDFLAGS) $$(LDFLAGS_$$(@F)) -o $$@ $(call objectify,$($(1)-objs)) +all-y += $(call objectify,$(1)).so +cleanup-y += $(call objectify,$(1)).so +endef +$(foreach t,$(libso-y),$(eval $(call gen-so-link-rules,$(t)))) + +# +# Figure out if the target we're building needs deps to include. 
+define collect-builtin-deps
+  ifeq ($(1),$(2))
+    deps-y += $(obj-y:.o=.d)
+  endif
+endef
+define collect-lib-deps
+  ifeq ($(1),$(2))
+    deps-y += $(lib-y:.o=.d)
+  endif
+endef
+define collect-hostprogs-deps
+  ifeq ($(1),$(2))
+    deps-y += $(addprefix $(obj)/,$($(1)-objs:.o=.d))
+  endif
+endef
+define collect-target-deps
+  ifeq ($(1),$(2))
+    deps-y += $(call objectify,$($(1)-lib-y:.o=.d))
+    deps-y += $(call objectify,$($(1)-obj-y:.o=.d))
+  endif
+endef
+define collect-deps
+  ifneq ($(filter all,$(1)),)
+    $(eval $(call collect-builtin-deps,$(builtin-target),$(builtin-target)))
+    $(eval $(call collect-lib-deps,$(lib-target),$(lib-target)))
+    $(foreach t,$(hostprogs-y),$(eval $(call collect-hostprogs-deps,$(t),$(t))))
+    $(foreach t,$(target),$(eval $(call collect-target-deps,$(t),$(t))))
+  else
+    ifneq ($(filter-out %.d $(builtin-target) $(lib-target) $(hostprogs-y) $(target),$(1)),)
+      ifneq ($(filter %.o %.i %.s,$(1)),)
+        deps-y += $(addsuffix .d,$(basename $(1)))
+      endif
+    else
+      $(eval $(call collect-builtin-deps,$(builtin-target),$(1)))
+      $(eval $(call collect-lib-deps,$(lib-target),$(1)))
+      $(foreach t,$(hostprogs-y),$(eval $(call collect-hostprogs-deps,$(t),$(1))))
+      $(foreach t,$(target),$(eval $(call collect-target-deps,$(t),$(1))))
+    endif
+  endif
+endef
+# Collect and -include dependency files unless only clean/mrproper were requested.
+ifneq ($(MAKECMDGOALS),)
+  ifneq ($(filter-out clean mrproper,$(MAKECMDGOALS)),)
+    $(foreach goal,$(MAKECMDGOALS),$(eval $(call collect-deps,$(goal))))
+    deps-y := $(call uniq,$(deps-y))
+    ifneq ($(deps-y),)
+      $(eval -include $(deps-y))
+    endif
+  endif
+endif
+
+#
+# Main phony rule.
+all: $(all-y) ;
+.PHONY: all
+
+#
+# Clean most files, but leave enough to navigate with tags (generated files)
+clean:
+	$(call msg-clean, $(obj))
+	$(Q) $(RM) $(cleanup-y)
+.PHONY: clean
+
+#
+# Delete all generated files
+mrproper: clean
+	$(Q) $(RM) $(mrproper-y)
+.PHONY: mrproper
+
+#
+# Footer.
+____nmk_defined__build = y +endif diff --git a/CRIU_code/scripts/nmk/scripts/include.mk b/CRIU_code/scripts/nmk/scripts/include.mk new file mode 100644 index 0000000..e170110 --- /dev/null +++ b/CRIU_code/scripts/nmk/scripts/include.mk @@ -0,0 +1,58 @@ +ifndef ____nmk_defined__include + +ifndef ____nmk_defined__msg + include $(__nmk_dir)msg.mk +endif + +.PHONY: all help test docs clean install .FORCE + +# +# Common vars. +SUBARCH := $(shell uname -m | sed \ + -e s/i.86/x86/ \ + -e s/x86_64/x86/ \ + -e s/sun4u/sparc64/ \ + -e s/arm.*/arm/ \ + -e s/sa110/arm/ \ + -e s/s390x/s390/ \ + -e s/parisc64/parisc/ \ + -e s/ppc64.*/ppc64/ \ + -e s/mips.*/mips/ \ + -e s/sh[234].*/sh/ \ + -e s/aarch64.*/aarch64/) + +ARCH ?= $(SUBARCH) +SRCARCH := $(ARCH) + +export SUBARCH ARCH SRCARCH + +ifndef ____nmk_defined__tools + include $(__nmk_dir)tools.mk +endif + +# Do not use make's built-in rules and variables +# (this increases performance and avoids hard-to-debug behaviour). +MAKEFLAGS += -rR --no-print-directory +export MAKEFLAGS + +# Avoid funny character set dependencies. +unexport LC_ALL +LC_COLLATE=C +LC_NUMERIC=C +export LC_COLLATE LC_NUMERIC + +# Avoid interference with shell env settings. +unexport GREP_OPTIONS + +# Shorthand for build. +build := -r -R -f $(__nmk_dir)main.mk makefile=Makefile obj +export build + +# With specified Makefile +build-as = -r -R -f $(__nmk_dir)main.mk makefile=$(1) obj=$(2) +export build-as + +# +# Footer. +____nmk_defined__include = y +endif diff --git a/CRIU_code/scripts/nmk/scripts/macro.mk b/CRIU_code/scripts/nmk/scripts/macro.mk new file mode 100644 index 0000000..b36d5b2 --- /dev/null +++ b/CRIU_code/scripts/nmk/scripts/macro.mk @@ -0,0 +1,33 @@ +ifndef ____nmk_defined__macro + +# +# Helper to include makefile only once. +# +define include-once + ifndef $(join ____nmk_defined__,$(1:.mk=)) + include $(__nmk_dir)$(1) + endif +endef + +# Helper to build built-in target in directory. 
+# $(eval $(call gen-built-in,,,)) +define gen-built-in +$(1)/%: $(2) + $$(Q) $$(MAKE) $$(build)=$(1) $$@ +ifneq ($(3),) +$(3): $(2) + $$(Q) $$(MAKE) $$(build)=$(1) all +.PHONY: $(3) +$(1)/built-in.o: $(3) +else +$(1): $(2) + $$(Q) $$(MAKE) $$(build)=$(1) all +.PHONY: $(1) +$(1)/built-in.o: $(1) +endif +endef + +# +# Footer. +____nmk_defined__macro = y +endif diff --git a/CRIU_code/scripts/nmk/scripts/main.mk b/CRIU_code/scripts/nmk/scripts/main.mk new file mode 100644 index 0000000..493a164 --- /dev/null +++ b/CRIU_code/scripts/nmk/scripts/main.mk @@ -0,0 +1,28 @@ +ifndef ____nmk_defined__main + +# +# Genaral inclusion statement + +ifndef ____nmk_defined__include + include $(__nmk_dir)include.mk +endif + +ifndef ____nmk_defined__macro + include $(__nmk_dir)macro.mk +endif + +# +# Anything else might be included with +# +# $(eval $(call include-once,)) +# +# Note the order does matter! + +$(eval $(call include-once,tools.mk)) +$(eval $(call include-once,utils.mk)) +$(eval $(call include-once,build.mk)) + +# +# Footer +____nmk_defined__main = y +endif diff --git a/CRIU_code/scripts/nmk/scripts/msg.mk b/CRIU_code/scripts/nmk/scripts/msg.mk new file mode 100644 index 0000000..d07f216 --- /dev/null +++ b/CRIU_code/scripts/nmk/scripts/msg.mk @@ -0,0 +1,71 @@ +ifndef ____nmk_defined__msg + +# +# Silent make rules. +ifeq ($(strip $(V)),) + E := @echo + Q := @ +else + E := @\# + Q := +endif + +export E Q + +# +# Message helpers. 
+define msg-gen + $(E) " GEN " $(1) +endef + +define msg-clean + $(E) " CLEAN " $(1) +endef + +define msg-cc + $(E) " CC " $(1) +endef + +define msg-dep + $(E) " DEP " $(1) +endef + +define msg-link + $(E) " LINK " $(1) +endef + +define msg-ar + $(E) " AR " $(1) +endef + +define msg-build + $(E) " BUILD " $(1) +endef + +define msg-host-cc + $(E) " HOSTCC " $(1) +endef + +define msg-host-dep + $(E) " HOSTDEP " $(1) +endef + +define msg-host-link + $(E) " HOSTLINK" $(1) +endef + +define newline + + +endef + +# map funciton: +# $1 - func to call +# $2 - list over which map the $1 func +# result is divided with newlines +map = $(foreach x,$2,$(call $1,$x)$(newline)) + +# +# Footer. +____nmk_defined__msg = y +endif #____nmk_defined__msg diff --git a/CRIU_code/scripts/nmk/scripts/tools.mk b/CRIU_code/scripts/nmk/scripts/tools.mk new file mode 100644 index 0000000..ce3d85d --- /dev/null +++ b/CRIU_code/scripts/nmk/scripts/tools.mk @@ -0,0 +1,43 @@ +ifndef ____nmk_defined__tools + +# +# System tools shorthands +RM := rm -f +HOSTLD ?= ld +ifeq ($(origin LD), default) +LD := $(CROSS_COMPILE)$(HOSTLD) +endif +HOSTCC ?= gcc +ifeq ($(origin CC), default) +CC := $(CROSS_COMPILE)$(HOSTCC) +endif +CPP := $(CC) -E +AS := $(CROSS_COMPILE)as +AR := $(CROSS_COMPILE)ar +STRIP := $(CROSS_COMPILE)strip +OBJCOPY := $(CROSS_COMPILE)objcopy +OBJDUMP := $(CROSS_COMPILE)objdump +NM := $(CROSS_COMPILE)nm +MAKE := make +MKDIR := mkdir -p +AWK := awk +PERL := perl +FULL_PYTHON := $(shell which python2 2>/dev/null || which python3 2>/dev/null) +PYTHON ?= $(shell basename $(FULL_PYTHON)) +FIND := find +SH := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \ + else if [ -x /bin/bash ]; then echo /bin/bash; \ + else echo sh; fi ; fi) +CSCOPE := cscope +ETAGS := etags +CTAGS := ctags + +export RM HOSTLD LD HOSTCC CC CPP AS AR STRIP OBJCOPY OBJDUMP +export NM SH MAKE MKDIR AWK PERL PYTHON SH CSCOPE + +export USE_ASCIIDOCTOR ?= $(shell which asciidoctor 2>/dev/null) + +# +# Footer. 
+____nmk_defined__tools = y +endif diff --git a/CRIU_code/scripts/nmk/scripts/utils.mk b/CRIU_code/scripts/nmk/scripts/utils.mk new file mode 100644 index 0000000..0cf216b --- /dev/null +++ b/CRIU_code/scripts/nmk/scripts/utils.mk @@ -0,0 +1,35 @@ +ifndef ____nmk_defined__utils + +# +# Usage: option := $(call try-compile,language,source-to-build,cc-options,cc-defines) +try-compile = $(shell sh -c 'echo "$(2)" | \ + $(CC) $(4) -x $(1) - $(3) -o /dev/null > /dev/null 2>&1 && \ + echo true || echo false') + +# +# Usage: option := $(call try-cc,source-to-build,cc-options,cc-defines) +try-cc = $(call try-compile,c,$(1),$(2),$(3)) + +# +# Usage: option := $(call try-cc,source-to-build,cc-options,cc-defines) +try-asm = $(call try-compile,assembler-with-cpp,$(1),$(2),$(3)) + +# pkg-config-check +# Usage: ifeq ($(call pkg-config-check, library),y) +pkg-config-check = $(shell sh -c 'pkg-config $(1) && echo y') + +# +# Remove duplicates. +uniq = $(strip $(if $1,$(firstword $1) $(call uniq,$(filter-out $(firstword $1),$1)))) + +# +# Add $(obj)/ for paths that are not relative +objectify = $(foreach o,$(1),$(if $(filter /% ./% ../%,$(o)),$(o),$(obj)/$(o))) + +# To cleanup entries. +cleanify = $(foreach o,$(sort $(call uniq,$(1))),$(o) $(o:.o=.d) $(o:.o=.i) $(o:.o=.s) $(o:.o=.gcda) $(o:.o=.gcno)) + +# +# Footer. 
+____nmk_defined__utils = y
+endif
diff --git a/CRIU_code/scripts/protobuf-gen.sh b/CRIU_code/scripts/protobuf-gen.sh
new file mode 100644
index 0000000..29c52cf
--- /dev/null
+++ b/CRIU_code/scripts/protobuf-gen.sh
@@ -0,0 +1,19 @@
+TR="y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/"
+
+for x in $(sed -n '/PB_AUTOGEN_START/,/PB_AUTOGEN_STOP/ {
+		/PB_AUTOGEN_ST/d;
+		s/,.*$//;
+		s/\tPB_//;
+		p;
+	   }' criu/include/protobuf-desc.h); do
+	x_la=$(echo $x | sed $TR)
+	x_uf=$(echo $x | sed -nr 's/^./&#\\\
+/;
+		s/_(.)/\\\
+\1#\\\
+/g;
+		p;' | \
+		sed -r "/^[A-Z]#\\\\\$/!{ $TR; }" | \
+		sed -r ':loop; N; s/#?\\\n//; t loop')
+	echo "CR_PB_DESC($x, $x_uf, $x_la);"
+done
diff --git a/CRIU_code/scripts/systemd-autofs-restart.sh b/CRIU_code/scripts/systemd-autofs-restart.sh
new file mode 100644
index 0000000..b22078c
--- /dev/null
+++ b/CRIU_code/scripts/systemd-autofs-restart.sh
@@ -0,0 +1,179 @@
+#!/bin/bash
+#
+# This script can be used as a workaround for systemd autofs mount migration.
+# The problem is that systemd is a clever guy: before mounting of actual file
+# system on top of autofs mount, it first checks that device number of autofs
+# mount is equal to the one, stored in systemd internals. If they do not match,
+# systemd ignores kernel request.
+# The problem happens each time autofs is restored (new device number for
+# autofs superblock) and can't be properly solved without some kind of "device
+# namespaces", where device number can be preserved.
+# But some of systemd services can be painlessly restarted. Like
+# proc-sys-fs-binfmt_misc.
+#
+# Usage:
+# criu restore <options> --action-script $(pwd)/scripts/systemd-autofs-restart.sh
+#
+[ "$CRTOOLS_SCRIPT_ACTION" == "post-resume" ] || exit 0
+
+if [ ! -n "$CRTOOLS_INIT_PID" ]; then
+	echo "CRTOOLS_INIT_PID environment variable is not set"
+	exit 1
+fi
+
+if [ ! -d "/proc/$CRTOOLS_INIT_PID" ]; then
+	echo "Process with CRTOOLS_INIT_PID=$CRTOOLS_INIT_PID doesn't exist"
+	exit 1
+fi
+
+# Prefer /bin/nsenter; fall back to /usr/bin/nsenter only when it is not executable.
+NS_ENTER=/bin/nsenter
+[ -x "$NS_ENTER" ] || NS_ENTER=/usr/bin/nsenter
+
+if [ ! -x $NS_ENTER ]; then
+	echo "$NS_ENTER binary not found"
+	exit 2
+fi
+
+JOIN_CT="$NS_ENTER -t $CRTOOLS_INIT_PID -m -u -p"
+
+# Skip container, if it's not systemd based
+[ "$($JOIN_CT basename -- $($JOIN_CT readlink /proc/1/exe))" == "systemd" ] || exit 0
+
+AUTOFS_SERVICES="proc-sys-fs-binfmt_misc.automount"
+
+bindmount=""
+
+# Unmount and delete the temporary bind mount directory, if one is recorded.
+function remove_bindmount {
+	if [ -n "$bindmount" ]; then
+		$JOIN_CT umount $bindmount
+		$JOIN_CT rm -rf $bindmount
+		bindmount=""
+	fi
+}
+trap remove_bindmount EXIT
+
+# Print the fs type of the topmost mount stacked on mountpoint $1; return 1 if none is found.
+function get_fs_type {
+	local mountpoint=$1
+
+	local top_mount_id=""
+	local top_mount_fs_type=""
+
+	while IFS='' read -r line; do
+		# Skip those entries which do not match the mountpoint
+		[ "$(echo $line | awk '{print $5;}')" = "$mountpoint" ] || continue
+
+		local mnt_id=$(echo $line | awk '{print $1;}')
+		local mnt_parent_id=$(echo $line | awk '{print $2;}')
+		local mnt_fs_type=$(echo $line | sed 's/.* - //g' | awk '{print $1;}')
+
+		# Skip mount entry, if not the first one and not a child
+		[ -n "$top_mount_id" ] && [ "$mnt_parent_id" != "$top_mount_id" ] && continue
+
+		top_mount_id=$mnt_id
+		top_mount_fs_type=$mnt_fs_type
+	done < "/proc/$CRTOOLS_INIT_PID/mountinfo"
+
+	if [ -z "$top_mount_fs_type" ]; then
+		echo "Failed to find $mountpoint mountpoint"
+		return 1
+	fi
+
+	echo $top_mount_fs_type
+	return 0
+}
+
+# Bind mount $1 onto $2 via $JOIN_CT; print an error and return 1 on failure.
+function bind_mount {
+	local from=$1
+	local to=$2
+
+	$JOIN_CT mount --bind $from $to && return 0
+
+	echo "Failed to bind mount $from to $to"
+	return 1
+}
+
+function save_mountpoint {
+	local mountpoint=$1
+	local top_mount_fs_type=""
+
+	top_mount_fs_type=$(get_fs_type $mountpoint)
+	if [ $?
-ne 0 ]; then + echo "$top_mount_fs_type" + return + fi + + # Nothing to do, if no file system is on top of autofs + [ "$top_mount_fs_type" = "autofs" ] && return + + bindmount=$($JOIN_CT mktemp -d) + if [ -z "$bindmount" ]; then + echo "Failed to create temporary directory" + return 1 + fi + + # No need to unmount fs on top of autofs: + # systemd will does it for us on service restart + bind_mount $mountpoint $bindmount || $JOIN_CT rm -rf $bindmount +} + +function restore_mountpoint { + local mountpoint=$1 + + [ -n "$bindmount" ] || return + + # Umount file system, remounted by systemd, if any + top_mount_fs_type=$(get_fs_type $mountpoint) + if [ $? -ne 0 ]; then + echo "$top_mount_fs_type" + return + fi + + # Nothing to do, if no file system is on top of autofs + if [ "$top_mount_fs_type" != "autofs" ]; then + $JOIN_CT umount $mountpoint || echo "Failed to umount $mountpoint" + fi + + # Restore origin file system even if we failed to unmount the new one + bind_mount $bindmount $mountpoint + remove_bindmount +} + +function restart_service { + local service=$1 + local mountpoint=$($JOIN_CT systemctl show $service -p Where | sed 's/.*=//g') + + if [ -z "$mountpoint" ]; then + echo "Failed to discover $service mountpoint" + return + fi + + # Try to move restored bind-mount aside and exit if Failed + # Nothing to do, if we Failed + save_mountpoint $mountpoint || return + + $JOIN_CT systemctl restart $service + if [ $? 
-ne 0 ]; then + echo "Failed to restart $service service" + return + fi + echo "$service restarted" + + # Try to move saved monutpoint back on top of autofs + restore_mountpoint $mountpoint +} + +for service in $AUTOFS_SERVICES; do + status=$($JOIN_CT systemctl is-active $service) + + if [ $status == "active" ]; then + restart_service $service + else + echo "$service skipped ($status)" + fi +done + +exit 0 diff --git a/CRIU_code/scripts/tmp-files.sh b/CRIU_code/scripts/tmp-files.sh new file mode 100644 index 0000000..d6b93a0 --- /dev/null +++ b/CRIU_code/scripts/tmp-files.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# +# Script allows to save arbitrary files in CRIU dump dir and properly restore +# these files on CRIU restore cmd. +# +# Normally you need to call this script for files that can be lost between +# CRIU checkpoint and restore cmds. For example for files stored on non-tmpfs +# mount points. That's why this script is called tmp-files. +# +# You should call this script on both CRIU dump and restore cmds. +# +# Usage: +# criu dump --action-script \ +# '$CRIU_SCRIPTS_PATH/tmp-files.sh /tmp/ycm_temp /home/user/.tmpfile.txt.swp' +# criu restore --action-script $CRIU_SCRIPTS_PATH/tmp-files.sh +# +# Note: absolute path to tmp-files.sh should be supplied in --action-script with '' +# + +POSTDUMP="post-dump" +PRERESTORE="pre-restore" +DUMPARGS="--create --absolute-names --gzip --no-unquote --no-wildcards --file" +RESTOREARGS="--extract --gzip --no-unquote --no-wildcards --absolute-names --directory / --file" +IMGFILE=$CRTOOLS_IMAGE_DIR"/tmpfiles.tar.gz" + +MY_NAME=`basename "$0"` + +case "$CRTOOLS_SCRIPT_ACTION" in + $POSTDUMP ) + if [ "$#" -lt 1 ]; then + echo "$MY_NAME: ERROR! No files are given." + exit 1 + fi + tar $DUMPARGS $IMGFILE -- "$@" + exit $? + ;; + $PRERESTORE ) + if [ "$#" -ne 0 ]; then + echo "$MY_NAME: ERROR! Not expected script args." + exit 1 + fi + tar $RESTOREARGS $IMGFILE + exit $? 
+ ;; +esac + +exit 0 diff --git a/CRIU_code/scripts/travis/Makefile b/CRIU_code/scripts/travis/Makefile new file mode 100644 index 0000000..77c9374 --- /dev/null +++ b/CRIU_code/scripts/travis/Makefile @@ -0,0 +1,42 @@ +local: + ./travis-tests +.PHONY: local + +after_success: + ./travis-after_success +.PHONY: after_success + +target-suffix = +ifdef CLANG + target-suffix = -clang +endif + +TARGETS := alpine fedora-rawhide centos +ZDTM_OPTIONS := + +alpine: ZDTM_OPTIONS=-x zdtm/static/binfmt_misc -x zdtm/static/netns-nf -x zdtm/static/sched_policy00 -x zdtm/static/seccomp_strict -x zdtm/static/sigaltstack -x zdtm/static/signalfd00 -x zdtm/static/config_inotify_irmap + +define DOCKER_JSON +{ + "storage-driver": "devicemapper" +} +endef + +export DOCKER_JSON +$(TARGETS): + echo "$$DOCKER_JSON" > /etc/docker/daemon.json + systemctl restart docker + $(MAKE) -C ../build $@$(target-suffix) + docker run --env-file docker.env --rm -it --privileged -v /lib/modules:/lib/modules --tmpfs /run criu-$@ scripts/travis/travis-tests + +fedora-asan: + echo "$$DOCKER_JSON" > /etc/docker/daemon.json + systemctl restart docker + $(MAKE) -C ../build $@$(target-suffix) + docker run --rm -it --privileged -v /lib/modules:/lib/modules --tmpfs /run criu-$@ ./scripts/travis/asan.sh $(ZDTM_OPTIONS) + +docker-test: + ./docker-test.sh + +%: + $(MAKE) -C ../build $@$(target-suffix) diff --git a/CRIU_code/scripts/travis/asan.sh b/CRIU_code/scripts/travis/asan.sh new file mode 100644 index 0000000..15bfe53 --- /dev/null +++ b/CRIU_code/scripts/travis/asan.sh @@ -0,0 +1,21 @@ +#!/bin/sh + +set -x + +cat /proc/self/mountinfo + +chmod 0777 test +chmod 0777 test/zdtm/transition/ +chmod 0777 test/zdtm/static + +./test/zdtm.py run -a --keep-going -k always --parallel 4 -x zdtm/static/rtc "$@" +ret=$? 
+ +for i in `find / -name 'asan.log*'`; do + echo $i; + echo ======================================== + cat $i; + echo ======================================== + ret=1; +done; +exit $ret diff --git a/CRIU_code/scripts/travis/docker-test.sh b/CRIU_code/scripts/travis/docker-test.sh new file mode 100644 index 0000000..ee96fef --- /dev/null +++ b/CRIU_code/scripts/travis/docker-test.sh @@ -0,0 +1,66 @@ +#!/bin/bash +set -x -e -o pipefail + +apt-get install -qq \ + apt-transport-https \ + ca-certificates \ + curl \ + software-properties-common + +curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - + +add-apt-repository \ + "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ + $(lsb_release -cs) \ + stable test" + + +apt-get update -qq + +apt-get install -qq docker-ce + +cat > /etc/docker/daemon.json <&1 | tee log || { + cat "`cat log | grep 'log file:' | sed 's/log file:\s*//'`" || true + docker logs cr || true + cat /tmp/zdtm-core-* || true + dmesg + docker ps + exit 1 + } + docker ps + sleep 1 +done + diff --git a/CRIU_code/scripts/travis/docker.env b/CRIU_code/scripts/travis/docker.env new file mode 100644 index 0000000..36154df --- /dev/null +++ b/CRIU_code/scripts/travis/docker.env @@ -0,0 +1,4 @@ +SKIP_TRAVIS_PREP=1 +ZDTM_OPTS=-x zdtm/static/binfmt_misc -x zdtm/static/sched_policy00 +CC=gcc +SKIP_EXT_DEV_TEST=1 diff --git a/CRIU_code/scripts/travis/travis-after_success b/CRIU_code/scripts/travis/travis-after_success new file mode 100644 index 0000000..daab76e --- /dev/null +++ b/CRIU_code/scripts/travis/travis-after_success @@ -0,0 +1,10 @@ +#!/bin/sh +set -x -e + +# We only need to run the below for gcov-enabled builds +test -z "$GCOV" && exit 0 + +sudo apt-get install -qq -y lcov +gem install coveralls-lcov +sudo lcov --directory ../.. 
--capture --output-file coverage.info --ignore-errors graph +coveralls-lcov coverage.info diff --git a/CRIU_code/scripts/travis/travis-tests b/CRIU_code/scripts/travis/travis-tests new file mode 100644 index 0000000..664f723 --- /dev/null +++ b/CRIU_code/scripts/travis/travis-tests @@ -0,0 +1,174 @@ +#!/bin/sh +set -x -e + +TRAVIS_PKGS="protobuf-c-compiler libprotobuf-c0-dev libaio-dev + libgnutls28-dev libgnutls30 libprotobuf-dev protobuf-compiler + libcap-dev libnl-3-dev gcc-multilib gdb bash python-protobuf + libnet-dev util-linux asciidoctor libnl-route-3-dev" + +travis_prep () { + [ -n "$SKIP_TRAVIS_PREP" ] && return + + cd ../../ + + service apport stop + + CC=gcc + # clang support + if [ "$CLANG" = "1" ]; then + TRAVIS_PKGS="$TRAVIS_PKGS clang" + CC=clang + fi + + [ -n "$GCOV" ] && { + apt-add-repository -y "ppa:ubuntu-toolchain-r/test" + apt-get update -yq + apt-get -yq --no-install-suggests --no-install-recommends --force-yes install g++-7 + CC=gcc-7 + } + + # ccache support, only enable for non-GCOV case + if [ "$CCACHE" = "1" -a -z "$GCOV" ]; then + # ccache is installed by default, need to set it up + export CCACHE_DIR=$HOME/.ccache + [ "$CC" = "clang" ] && export CCACHE_CPP2=yes + # uncomment the following to get detailed ccache logs + #export CCACHE_LOGFILE=$HOME/ccache.log + CC="ccache $CC" + fi + + # The /etc/apt/sources.list in the current trusty image for ppc64le is + # broken and needs to be fixed + if [ "$TR_ARCH" = "ppc64le" ] ; then + sed -i '/security/ d' /etc/apt/sources.list + fi + + apt-get update -qq + apt-get install -qq --no-install-recommends $TRAVIS_PKGS + # travis is based on 14.04 and that does not have python + # packages for future and ipaddress (16.04 has those packages) + pip install junit-xml future ipaddress + chmod a+x $HOME +} + +travis_prep + +ulimit -c unlimited +echo "|`pwd`/test/abrt.sh %P %p %s %e" > /proc/sys/kernel/core_pattern + +export GCOV +time make CC="$CC" -j4 + +[ -n "$SKIP_TRAVIS_TEST" ] && return + +if [ 
"${COMPAT_TEST}x" = "yx" ] ; then + # Dirty hack to keep both ia32 & x86_64 shared libs on a machine: + # headers are probably not compatible, so apt-get doesn't allow + # installing both versions, while we need one for CRIU and one + # for 32-bit tests. A better way would involve launching docker.. + # But it would require making zdtm.py aware of docker and launching + # tests inside the CT. + INCOMPATIBLE_LIBS="libaio-dev libcap-dev libnl-3-dev libnl-route-3-dev" + IA32_PKGS="" + REFUGE=64-refuge + + mkdir "$REFUGE" + for i in $INCOMPATIBLE_LIBS ; do + for j in $(dpkg --listfiles $i | grep '\.so$') ; do + cp "$j" "$REFUGE/" + done + IA32_PKGS="$IA32_PKGS $i:i386" + done + apt-get remove $INCOMPATIBLE_LIBS + apt-get install --no-install-recommends $IA32_PKGS + mkdir -p /usr/lib/x86_64-linux-gnu/ + mv "$REFUGE"/* /usr/lib/x86_64-linux-gnu/ +fi + +time make CC="$CC" -j4 -C test/zdtm + +[ -f "$CCACHE_LOGFILE" ] && cat $CCACHE_LOGFILE + +# umask has to be called before a first criu run, so that .gcda (coverage data) +# files are created with read-write permissions for all. +umask 0000 +./criu/criu check +./criu/criu check --all || echo $? +./criu/criu cpuinfo dump +./criu/criu cpuinfo check + +export SKIP_PREP=1 +# The 3.19 kernel (from Ubuntu 14.04) has a bug. When /proc/PID/pagemap +# is read for a few VMAs in one read call, incorrect data is returned. +# See https://github.com/xemul/criu/issues/207 +# Kernel 4.4 (from Ubuntu 14.04.5 update) fixes this. +uname -r | grep -q ^3\.19 && export CRIU_PMC_OFF=1 + +chmod 0777 test/ +chmod 0777 test/zdtm/static +chmod 0777 test/zdtm/transition + +./test/zdtm.py run -a -p 2 --keep-going $ZDTM_OPTS + +KERN_MAJ=`uname -r | cut -d. -f1` +KERN_MIN=`uname -r | cut -d. 
-f2` +if [ $KERN_MAJ -gt "4" ] || { [ $KERN_MAJ -eq "4" ] && [ $KERN_MIN -ge "18" ]; }; then # kernel 4.18 or newer + LAZY_EXCLUDE="-x cmdlinenv00 -x maps007" +else + LAZY_EXCLUDE="-x maps007 -x fork -x fork2 -x uffd-events -x cgroupns + -x socket_listen -x socket_listen6 -x cmdlinenv00 + -x socket_close_data01 -x file_read -x lazy-thp -x futex" +fi +LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps04" + +LAZY_TESTS=.*\(maps0\|uffd-events\|lazy-thp\|futex\|fork\).* + +./test/zdtm.py run -p 2 -T $LAZY_TESTS --lazy-pages $LAZY_EXCLUDE $ZDTM_OPTS +./test/zdtm.py run -p 2 -T $LAZY_TESTS --remote-lazy-pages $LAZY_EXCLUDE $ZDTM_OPTS +./test/zdtm.py run -p 2 -T $LAZY_TESTS --remote-lazy-pages --tls $LAZY_EXCLUDE $ZDTM_OPTS + +bash ./test/jenkins/criu-fault.sh +bash ./test/jenkins/criu-fcg.sh +bash ./test/jenkins/criu-inhfd.sh + +if [ -z "$SKIP_EXT_DEV_TEST" ]; then + make -C test/others/mnt-ext-dev/ run +fi +#make -C test/others/exec/ run +make -C test/others/make/ run CC="$CC" +make -C test/others/shell-job/ run +make -C test/others/rpc/ run + +./test/zdtm.py run -t zdtm/static/env00 --sibling + +./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --dedup +./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --noauto-dedup +./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --page-server +./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --page-server --dedup + +./test/zdtm.py run -t zdtm/static/socket-tcp-local --norst + +ip net add test +./test/zdtm.py run -t zdtm/static/env00 -f h --join-ns + +# RPC testing +./test/zdtm.py run -t zdtm/static/env00 --rpc # Basic +./test/zdtm.py run -t zdtm/static/env00 --rpc --pre 2 --page-server +./test/zdtm.py run -t zdtm/static/ptrace_sig -f h --rpc # Error handling (crfail test) + +./test/zdtm.py run --empty-ns -T zdtm/static/socket-tcp*-local --iter 2 + +./test/zdtm.py run -t zdtm/static/env00 -k always +./test/crit-recode.py + +make -C test/others/shell-job + +pip install flake8 +make lint + +# Check that help output fits into 80 columns +WIDTH=$(./criu/criu --help | wc 
--max-line-length) +if [ "$WIDTH" -gt 80 ]; then + echo "criu --help output does not obey 80 characters line width!" + exit 1 +fi diff --git a/CRIU_code/soccr/Makefile b/CRIU_code/soccr/Makefile new file mode 100644 index 0000000..dd37eb1 --- /dev/null +++ b/CRIU_code/soccr/Makefile @@ -0,0 +1,2 @@ +lib-name := libsoccr.a +lib-y += soccr.o diff --git a/CRIU_code/soccr/soccr.c b/CRIU_code/soccr/soccr.c new file mode 100644 index 0000000..20eabfb --- /dev/null +++ b/CRIU_code/soccr/soccr.c @@ -0,0 +1,942 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "soccr.h" + +#ifndef SIOCOUTQNSD +/* MAO - Define SIOCOUTQNSD ioctl if we don't have it */ +#define SIOCOUTQNSD 0x894B +#endif + +enum { + TCPF_ESTABLISHED = (1 << 1), + TCPF_SYN_SENT = (1 << 2), + TCPF_SYN_RECV = (1 << 3), + TCPF_FIN_WAIT1 = (1 << 4), + TCPF_FIN_WAIT2 = (1 << 5), + TCPF_TIME_WAIT = (1 << 6), + TCPF_CLOSE = (1 << 7), + TCPF_CLOSE_WAIT = (1 << 8), + TCPF_LAST_ACK = (1 << 9), + TCPF_LISTEN = (1 << 10), + TCPF_CLOSING = (1 << 11), +}; + +/* + * The TCP transition diagram for half closed connections + * + * ------------ + * FIN_WAIT1 \ FIN + * --------- + * / ACK CLOSE_WAIT + * ----------- + * FIN_WAIT2 + * ---------- + * / FIN LAST_ACK + * ----------- + * TIME_WAIT \ ACK + * ---------- + * CLOSED + * + * How to get the TCP_CLOSING state + * + * ----------- ---------- + * FIN_WAIT1 \/ FIN FIN_WAIT1 + * ----------- ---------- + * CLOSING CLOSING + * \/ ACK + * ----------- ---------- + * TIME_WAIT TIME_WAIT + */ + +/* Restore a fin packet in a send queue first */ +#define SNDQ_FIRST_FIN (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING) +/* Restore fin in a send queue after restoring fin in the receive queue. 
*/ +#define SNDQ_SECOND_FIN (TCPF_LAST_ACK | TCPF_CLOSE) +#define SNDQ_FIN_ACKED (TCPF_FIN_WAIT2 | TCPF_CLOSE) + +#define RCVQ_FIRST_FIN (TCPF_CLOSE_WAIT | TCPF_LAST_ACK | TCPF_CLOSE) +#define RCVQ_SECOND_FIN (TCPF_CLOSING) +#define RCVQ_FIN_ACKED (TCPF_CLOSE) + +static void (*log)(unsigned int loglevel, const char *format, ...) + __attribute__ ((__format__ (__printf__, 2, 3))); +static unsigned int log_level = 0; + +void libsoccr_set_log(unsigned int level, void (*fn)(unsigned int level, const char *fmt, ...)) +{ + log_level = level; + log = fn; +} + +#define loge(msg, ...) do { if (log && (log_level >= SOCCR_LOG_ERR)) log(SOCCR_LOG_ERR, "Error (%s:%d): " msg, __FILE__, __LINE__, ##__VA_ARGS__); } while (0) +#define logerr(msg, ...) loge(msg ": %s\n", ##__VA_ARGS__, strerror(errno)) +#define logd(msg, ...) do { if (log && (log_level >= SOCCR_LOG_DBG)) log(SOCCR_LOG_DBG, "Debug: " msg, ##__VA_ARGS__); } while (0) + +static int tcp_repair_on(int fd) +{ + int ret, aux = 1; + + ret = setsockopt(fd, SOL_TCP, TCP_REPAIR, &aux, sizeof(aux)); + if (ret < 0) + logerr("Can't turn TCP repair mode ON"); + + return ret; +} + +static int tcp_repair_off(int fd) +{ + int aux = 0, ret; + + ret = setsockopt(fd, SOL_TCP, TCP_REPAIR, &aux, sizeof(aux)); + if (ret < 0) + logerr("Failed to turn off repair mode on socket"); + + return ret; +} + +struct libsoccr_sk { + int fd; + unsigned flags; + char *recv_queue; + char *send_queue; + union libsoccr_addr *src_addr; + union libsoccr_addr *dst_addr; +}; + +#define SK_FLAG_FREE_RQ 0x1 +#define SK_FLAG_FREE_SQ 0x2 +#define SK_FLAG_FREE_SA 0x4 +#define SK_FLAG_FREE_DA 0x8 + +struct libsoccr_sk *libsoccr_pause(int fd) +{ + struct libsoccr_sk *ret; + + ret = malloc(sizeof(*ret)); + if (!ret) { + loge("Unable to allocate memory\n"); + return NULL; + } + + if (tcp_repair_on(fd) < 0) { + free(ret); + return NULL; + } + + ret->flags = 0; + ret->recv_queue = NULL; + ret->send_queue = NULL; + ret->src_addr = NULL; + ret->dst_addr = NULL; + ret->fd = 
fd; + return ret; +} + +void libsoccr_resume(struct libsoccr_sk *sk) +{ + tcp_repair_off(sk->fd); + libsoccr_release(sk); +} + +void libsoccr_release(struct libsoccr_sk *sk) +{ + if (sk->flags & SK_FLAG_FREE_RQ) + free(sk->recv_queue); + if (sk->flags & SK_FLAG_FREE_SQ) + free(sk->send_queue); + if (sk->flags & SK_FLAG_FREE_SA) + free(sk->src_addr); + if (sk->flags & SK_FLAG_FREE_DA) + free(sk->dst_addr); + free(sk); +} + +struct soccr_tcp_info { + __u8 tcpi_state; + __u8 tcpi_ca_state; + __u8 tcpi_retransmits; + __u8 tcpi_probes; + __u8 tcpi_backoff; + __u8 tcpi_options; + __u8 tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4; +}; + +static int refresh_sk(struct libsoccr_sk *sk, + struct libsoccr_sk_data *data, struct soccr_tcp_info *ti) +{ + int size; + socklen_t olen = sizeof(*ti); + + if (getsockopt(sk->fd, SOL_TCP, TCP_INFO, ti, &olen) || olen != sizeof(*ti)) { + logerr("Failed to obtain TCP_INFO"); + return -1; + } + + switch (ti->tcpi_state) { + case TCP_ESTABLISHED: + case TCP_FIN_WAIT1: + case TCP_FIN_WAIT2: + case TCP_LAST_ACK: + case TCP_CLOSE_WAIT: + case TCP_CLOSING: + case TCP_CLOSE: + case TCP_SYN_SENT: + break; + default: + loge("Unknown state %d\n", ti->tcpi_state); + return -1; + } + + data->state = ti->tcpi_state; + + if (ioctl(sk->fd, SIOCOUTQ, &size) == -1) { + logerr("Unable to get size of snd queue"); + return -1; + } + + data->outq_len = size; + + if (ioctl(sk->fd, SIOCOUTQNSD, &size) == -1) { + logerr("Unable to get size of unsent data"); + return -1; + } + + data->unsq_len = size; + + if (data->state == TCP_CLOSE) { + /* A connection could have been reset. In this case the send queue + * may contain some data. A user can't read this data, so let's + * ignore them. Otherwise we will need to add a logic whether + * the send queue contains a fin packet or not and decide whether + * a fin or reset packet has to be sent to restore a state + */ + + data->unsq_len = 0; + data->outq_len = 0; + } + + /* Don't account for the fin packet. 
It doesn't contain real data. */ + if ((1 << data->state) & (SNDQ_FIRST_FIN | SNDQ_SECOND_FIN)) { + if (data->outq_len) + data->outq_len--; + data->unsq_len = data->unsq_len ? data->unsq_len - 1 : 0; + } + + if (ioctl(sk->fd, SIOCINQ, &size) == -1) { + logerr("Unable to get size of recv queue"); + return -1; + } + + data->inq_len = size; + + return 0; +} + +static int get_stream_options(struct libsoccr_sk *sk, + struct libsoccr_sk_data *data, struct soccr_tcp_info *ti) +{ + int ret; + socklen_t auxl; + int val; + + auxl = sizeof(data->mss_clamp); + ret = getsockopt(sk->fd, SOL_TCP, TCP_MAXSEG, &data->mss_clamp, &auxl); + if (ret < 0) + goto err_sopt; + + data->opt_mask = ti->tcpi_options; + if (ti->tcpi_options & TCPI_OPT_WSCALE) { + data->snd_wscale = ti->tcpi_snd_wscale; + data->rcv_wscale = ti->tcpi_rcv_wscale; + } + + if (ti->tcpi_options & TCPI_OPT_TIMESTAMPS) { + auxl = sizeof(val); + ret = getsockopt(sk->fd, SOL_TCP, TCP_TIMESTAMP, &val, &auxl); + if (ret < 0) + goto err_sopt; + + data->timestamp = val; + } + + return 0; + +err_sopt: + logerr("\tsockopt failed"); + return -1; +} + +static int get_window(struct libsoccr_sk *sk, struct libsoccr_sk_data *data) +{ + struct tcp_repair_window opt; + socklen_t optlen = sizeof(opt); + + if (getsockopt(sk->fd, SOL_TCP, + TCP_REPAIR_WINDOW, &opt, &optlen)) { + /* Appeared since 4.8, but TCP_repair itself is since 3.11 */ + if (errno == ENOPROTOOPT) + return 0; + + logerr("Unable to get window properties"); + return -1; + } + + data->flags |= SOCCR_FLAGS_WINDOW; + data->snd_wl1 = opt.snd_wl1; + data->snd_wnd = opt.snd_wnd; + data->max_window = opt.max_window; + data->rcv_wnd = opt.rcv_wnd; + data->rcv_wup = opt.rcv_wup; + + return 0; +} + +/* + * TCP queues sequences and their relations to the code below + * + * output queue + * net <----------------------------- sk + * ^ ^ ^ seq >> + * snd_una snd_nxt write_seq + * + * input queue + * net -----------------------------> sk + * << seq ^ ^ + * rcv_nxt copied_seq + * + * 
+ * inq_len = rcv_nxt - copied_seq = SIOCINQ + * outq_len = write_seq - snd_una = SIOCOUTQ + * inq_seq = rcv_nxt + * outq_seq = write_seq + * + * On restore kernel moves the option we configure with setsockopt, + * thus we should advance them on the _len value in restore_tcp_seqs. + * + */ + +static int get_queue(int sk, int queue_id, + __u32 *seq, __u32 len, char **bufp) +{ + int ret, aux; + socklen_t auxl; + char *buf; + + aux = queue_id; + auxl = sizeof(aux); + ret = setsockopt(sk, SOL_TCP, TCP_REPAIR_QUEUE, &aux, auxl); + if (ret < 0) + goto err_sopt; + + auxl = sizeof(*seq); + ret = getsockopt(sk, SOL_TCP, TCP_QUEUE_SEQ, seq, &auxl); + if (ret < 0) + goto err_sopt; + + if (len) { + /* + * Try to grab one byte more from the queue to + * make sure there are len bytes for real + */ + buf = malloc(len + 1); + if (!buf) { + loge("Unable to allocate memory\n"); + goto err_buf; + } + + ret = recv(sk, buf, len + 1, MSG_PEEK | MSG_DONTWAIT); + if (ret != len) + goto err_recv; + } else + buf = NULL; + + *bufp = buf; + return 0; + +err_sopt: + logerr("\tsockopt failed"); +err_buf: + return -1; + +err_recv: + logerr("\trecv failed (%d, want %d)", ret, len); + free(buf); + goto err_buf; +} + +/* + * This is how much data we've had in the initial libsoccr + */ +#define SOCR_DATA_MIN_SIZE (17 * sizeof(__u32)) + +int libsoccr_save(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsigned data_size) +{ + struct soccr_tcp_info ti; + + if (!data || data_size < SOCR_DATA_MIN_SIZE) { + loge("Invalid input parameters\n"); + return -1; + } + + memset(data, 0, data_size); + + if (refresh_sk(sk, data, &ti)) + return -2; + + if (get_stream_options(sk, data, &ti)) + return -3; + + if (get_window(sk, data)) + return -4; + + sk->flags |= SK_FLAG_FREE_SQ | SK_FLAG_FREE_RQ; + + if (get_queue(sk->fd, TCP_RECV_QUEUE, &data->inq_seq, data->inq_len, &sk->recv_queue)) + return -5; + + if (get_queue(sk->fd, TCP_SEND_QUEUE, &data->outq_seq, data->outq_len, &sk->send_queue)) + return -6; + + 
return sizeof(struct libsoccr_sk_data); +} + +#define GET_Q_FLAGS (SOCCR_MEM_EXCL) +char *libsoccr_get_queue_bytes(struct libsoccr_sk *sk, int queue_id, unsigned flags) +{ + char **p, *ret; + + if (flags & ~GET_Q_FLAGS) + return NULL; + + switch (queue_id) { + case TCP_RECV_QUEUE: + p = &sk->recv_queue; + break; + case TCP_SEND_QUEUE: + p = &sk->send_queue; + break; + default: + return NULL; + } + + ret = *p; + if (flags & SOCCR_MEM_EXCL) + *p = NULL; + + return ret; +} + +#define GET_SA_FLAGS (SOCCR_MEM_EXCL) +union libsoccr_addr *libsoccr_get_addr(struct libsoccr_sk *sk, int self, unsigned flags) +{ + if (flags & ~GET_SA_FLAGS) + return NULL; + + /* FIXME -- implemented in CRIU, makes sense to have it here too */ + return NULL; +} + +static int set_queue_seq(struct libsoccr_sk *sk, int queue, __u32 seq) +{ + logd("\tSetting %d queue seq to %u\n", queue, seq); + + if (setsockopt(sk->fd, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue)) < 0) { + logerr("Can't set repair queue"); + return -1; + } + + if (setsockopt(sk->fd, SOL_TCP, TCP_QUEUE_SEQ, &seq, sizeof(seq)) < 0) { + logerr("Can't set queue seq"); + return -1; + } + + return 0; +} + +#ifndef TCPOPT_SACK_PERM +#define TCPOPT_SACK_PERM TCPOPT_SACK_PERMITTED +#endif + +static int libsoccr_set_sk_data_noq(struct libsoccr_sk *sk, + struct libsoccr_sk_data *data, unsigned data_size) +{ + struct tcp_repair_opt opts[4]; + int addr_size, mstate; + int onr = 0; + __u32 seq; + + if (!data || data_size < SOCR_DATA_MIN_SIZE) { + loge("Invalid input parameters\n"); + return -1; + } + + if (!sk->dst_addr || !sk->src_addr) { + loge("Destination or/and source addresses aren't set\n"); + return -1; + } + + mstate = 1 << data->state; + + if (data->state == TCP_LISTEN) { + loge("Unable to handle listen sockets\n"); + return -1; + } + + if (sk->src_addr->sa.sa_family == AF_INET) + addr_size = sizeof(sk->src_addr->v4); + else + addr_size = sizeof(sk->src_addr->v6); + + if (bind(sk->fd, &sk->src_addr->sa, addr_size)) { + 
logerr("Can't bind inet socket back"); + return -1; + } + + if (mstate & (RCVQ_FIRST_FIN | RCVQ_SECOND_FIN)) + data->inq_seq--; + + /* outq_seq is adjusted due to not accounting the fin packet */ + if (mstate & (SNDQ_FIRST_FIN | SNDQ_SECOND_FIN)) + data->outq_seq--; + + if (set_queue_seq(sk, TCP_RECV_QUEUE, + data->inq_seq - data->inq_len)) + return -2; + + seq = data->outq_seq - data->outq_len; + if (data->state == TCP_SYN_SENT) + seq--; + + if (set_queue_seq(sk, TCP_SEND_QUEUE, seq)) + return -3; + + if (sk->dst_addr->sa.sa_family == AF_INET) + addr_size = sizeof(sk->dst_addr->v4); + else + addr_size = sizeof(sk->dst_addr->v6); + + if (data->state == TCP_SYN_SENT && tcp_repair_off(sk->fd)) + return -1; + + if (connect(sk->fd, &sk->dst_addr->sa, addr_size) == -1 && + errno != EINPROGRESS) { + logerr("Can't connect inet socket back"); + return -1; + } + + if (data->state == TCP_SYN_SENT && tcp_repair_on(sk->fd)) + return -1; + + logd("\tRestoring TCP options\n"); + + if (data->opt_mask & TCPI_OPT_SACK) { + logd("\t\tWill turn SAK on\n"); + opts[onr].opt_code = TCPOPT_SACK_PERM; + opts[onr].opt_val = 0; + onr++; + } + + if (data->opt_mask & TCPI_OPT_WSCALE) { + logd("\t\tWill set snd_wscale to %u\n", data->snd_wscale); + logd("\t\tWill set rcv_wscale to %u\n", data->rcv_wscale); + opts[onr].opt_code = TCPOPT_WINDOW; + opts[onr].opt_val = data->snd_wscale + (data->rcv_wscale << 16); + onr++; + } + + if (data->opt_mask & TCPI_OPT_TIMESTAMPS) { + logd("\t\tWill turn timestamps on\n"); + opts[onr].opt_code = TCPOPT_TIMESTAMP; + opts[onr].opt_val = 0; + onr++; + } + + logd("Will set mss clamp to %u\n", data->mss_clamp); + opts[onr].opt_code = TCPOPT_MAXSEG; + opts[onr].opt_val = data->mss_clamp; + onr++; + + if (data->state != TCP_SYN_SENT && + setsockopt(sk->fd, SOL_TCP, TCP_REPAIR_OPTIONS, + opts, onr * sizeof(struct tcp_repair_opt)) < 0) { + logerr("Can't repair options"); + return -2; + } + + if (data->opt_mask & TCPI_OPT_TIMESTAMPS) { + if (setsockopt(sk->fd, 
SOL_TCP, TCP_TIMESTAMP, + &data->timestamp, sizeof(data->timestamp)) < 0) { + logerr("Can't set timestamp"); + return -3; + } + } + + return 0; +} + +/* IPv4-Mapped IPv6 Addresses */ +static int ipv6_addr_mapped(union libsoccr_addr *addr) +{ + return (addr->v6.sin6_addr.s6_addr32[2] == htonl(0x0000ffff)); +} + +static int send_fin(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, + unsigned data_size, uint8_t flags) +{ + uint32_t src_v4 = sk->src_addr->v4.sin_addr.s_addr; + uint32_t dst_v4 = sk->dst_addr->v4.sin_addr.s_addr; + int ret, exit_code = -1, family; + char errbuf[LIBNET_ERRBUF_SIZE]; + int mark = SOCCR_MARK; + int libnet_type; + libnet_t *l; + + family = sk->dst_addr->sa.sa_family; + + if (family == AF_INET6 && ipv6_addr_mapped(sk->dst_addr)) { + /* TCP over IPv4 */ + family = AF_INET; + dst_v4 = sk->dst_addr->v6.sin6_addr.s6_addr32[3]; + src_v4 = sk->src_addr->v6.sin6_addr.s6_addr32[3]; + } + + if (family == AF_INET6) + libnet_type = LIBNET_RAW6; + else + libnet_type = LIBNET_RAW4; + + l = libnet_init( + libnet_type, /* injection type */ + NULL, /* network interface */ + errbuf); /* errbuf */ + if (l == NULL) { + loge("libnet_init failed (%s)\n", errbuf); + return -1; + } + + if (setsockopt(l->fd, SOL_SOCKET, SO_MARK, &mark, sizeof(mark))) { + logerr("Can't set SO_MARK (%d) for socket\n", mark); + goto err; + } + + ret = libnet_build_tcp( + ntohs(sk->dst_addr->v4.sin_port), /* source port */ + ntohs(sk->src_addr->v4.sin_port), /* destination port */ + data->inq_seq, /* sequence number */ + data->outq_seq - data->outq_len, /* acknowledgement num */ + flags, /* control flags */ + data->rcv_wnd, /* window size */ + 0, /* checksum */ + 10, /* urgent pointer */ + LIBNET_TCP_H + 20, /* TCP packet size */ + NULL, /* payload */ + 0, /* payload size */ + l, /* libnet handle */ + 0); /* libnet id */ + if (ret == -1) { + loge("Can't build TCP header: %s\n", libnet_geterror(l)); + goto err; + } + + if (family == AF_INET6) { + struct libnet_in6_addr src, dst; + 
+ memcpy(&dst, &sk->dst_addr->v6.sin6_addr, sizeof(dst)); + memcpy(&src, &sk->src_addr->v6.sin6_addr, sizeof(src)); + + ret = libnet_build_ipv6( + 0, 0, + LIBNET_TCP_H, /* length */ + IPPROTO_TCP, /* protocol */ + 64, /* hop limit */ + dst, /* source IP */ + src, /* destination IP */ + NULL, /* payload */ + 0, /* payload size */ + l, /* libnet handle */ + 0); /* libnet id */ + } else if (family == AF_INET) + ret = libnet_build_ipv4( + LIBNET_IPV4_H + LIBNET_TCP_H + 20, /* length */ + 0, /* TOS */ + 242, /* IP ID */ + 0, /* IP Frag */ + 64, /* TTL */ + IPPROTO_TCP, /* protocol */ + 0, /* checksum */ + dst_v4, /* source IP */ + src_v4, /* destination IP */ + NULL, /* payload */ + 0, /* payload size */ + l, /* libnet handle */ + 0); /* libnet id */ + else { + loge("Unknown socket family\n"); + goto err; + } + if (ret == -1) { + loge("Can't build IP header: %s\n", libnet_geterror(l)); + goto err; + } + + ret = libnet_write(l); + if (ret == -1) { + loge("Unable to send a fin packet: %s\n", libnet_geterror(l)); + goto err; + } + + exit_code = 0; +err: + libnet_destroy(l); + return exit_code; +} + +static int restore_fin_in_snd_queue(int sk, int acked) +{ + int queue = TCP_SEND_QUEUE; + int ret; + + /* + * If TCP_SEND_QUEUE is set, a fin packet will be + * restored as a sent packet. 
+ */ + if (acked && + setsockopt(sk, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue)) < 0) { + logerr("Can't set repair queue"); + return -1; + } + + ret = shutdown(sk, SHUT_WR); + if (ret < 0) + logerr("Unable to shut down a socket"); + + queue = TCP_NO_QUEUE; + if (acked && + setsockopt(sk, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue)) < 0) { + logerr("Can't set repair queue"); + return -1; + } + + return ret; +} + +static int libsoccr_restore_queue(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsigned data_size, + int queue, char *buf); + +int libsoccr_restore(struct libsoccr_sk *sk, + struct libsoccr_sk_data *data, unsigned data_size) +{ + int mstate = 1 << data->state; + + if (libsoccr_set_sk_data_noq(sk, data, data_size)) + return -1; + + if (libsoccr_restore_queue(sk, data, sizeof(*data), TCP_RECV_QUEUE, sk->recv_queue)) + return -1; + + if (libsoccr_restore_queue(sk, data, sizeof(*data), TCP_SEND_QUEUE, sk->send_queue)) + return -1; + + if (data->flags & SOCCR_FLAGS_WINDOW) { + struct tcp_repair_window wopt = { + .snd_wl1 = data->snd_wl1, + .snd_wnd = data->snd_wnd, + .max_window = data->max_window, + .rcv_wnd = data->rcv_wnd, + .rcv_wup = data->rcv_wup, + }; + + if (mstate & (RCVQ_FIRST_FIN | RCVQ_SECOND_FIN)) { + wopt.rcv_wup--; + wopt.rcv_wnd++; + } + + if (setsockopt(sk->fd, SOL_TCP, TCP_REPAIR_WINDOW, &wopt, sizeof(wopt))) { + logerr("Unable to set window parameters"); + return -1; + } + } + + /* + * To restore a half closed sockets, fin packets has to be restored in + * recv and send queues. Here shutdown() is used to restore a fin + * packet in the send queue and a fake fin packet is send to restore it + * in the recv queue. + */ + if (mstate & SNDQ_FIRST_FIN) + restore_fin_in_snd_queue(sk->fd, mstate & SNDQ_FIN_ACKED); + + /* Send a fin packet to the socket to restore it in a receive queue. 
*/ + if (mstate & (RCVQ_FIRST_FIN | RCVQ_SECOND_FIN)) + if (send_fin(sk, data, data_size, TH_ACK | TH_FIN) < 0) + return -1; + + if (mstate & SNDQ_SECOND_FIN) + restore_fin_in_snd_queue(sk->fd, mstate & SNDQ_FIN_ACKED); + + if (mstate & RCVQ_FIN_ACKED) + data->inq_seq++; + + if (mstate & SNDQ_FIN_ACKED) { + data->outq_seq++; + if (send_fin(sk, data, data_size, TH_ACK) < 0) + return -1; + } + + return 0; +} + +static int __send_queue(struct libsoccr_sk *sk, int queue, char *buf, __u32 len) +{ + int ret, err = -1, max_chunk; + int off; + + max_chunk = len; + off = 0; + + do { + int chunk = len; + + if (chunk > max_chunk) + chunk = max_chunk; + + ret = send(sk->fd, buf + off, chunk, 0); + if (ret <= 0) { + if (max_chunk > 1024) { + /* + * Kernel not only refuses the whole chunk, + * but refuses to split it into pieces too. + * + * When restoring recv queue in repair mode + * kernel doesn't try hard and just allocates + * a linear skb with the size we pass to the + * system call. Thus, if the size is too big + * for slab allocator, the send just fails + * with ENOMEM. + * + * In any case -- try smaller chunk, hopefully + * there's still enough memory in the system. 
+ */ + max_chunk >>= 1; + continue; + } + + logerr("Can't restore %d queue data (%d), want (%d:%d:%d)", + queue, ret, chunk, len, max_chunk); + goto err; + } + off += ret; + len -= ret; + } while (len); + + err = 0; +err: + return err; +} + +static int send_queue(struct libsoccr_sk *sk, int queue, char *buf, __u32 len) +{ + logd("\tRestoring TCP %d queue data %u bytes\n", queue, len); + + if (setsockopt(sk->fd, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue)) < 0) { + logerr("Can't set repair queue"); + return -1; + } + + return __send_queue(sk, queue, buf, len); +} + +static int libsoccr_restore_queue(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsigned data_size, + int queue, char *buf) +{ + if (!buf) + return 0; + + if (!data || data_size < SOCR_DATA_MIN_SIZE) + return -1; + + if (queue == TCP_RECV_QUEUE) { + if (!data->inq_len) + return 0; + return send_queue(sk, TCP_RECV_QUEUE, buf, data->inq_len); + } + + if (queue == TCP_SEND_QUEUE) { + __u32 len, ulen; + + /* + * All data in a write buffer can be divided on two parts sent + * but not yet acknowledged data and unsent data. + * The TCP stack must know which data have been sent, because + * acknowledgment can be received for them. These data must be + * restored in repair mode. + */ + ulen = data->unsq_len; + len = data->outq_len - ulen; + if (len && send_queue(sk, TCP_SEND_QUEUE, buf, len)) + return -2; + + if (ulen) { + /* + * The second part of data have never been sent to outside, so + * they can be restored without any tricks. 
+ */ + tcp_repair_off(sk->fd); + if (__send_queue(sk, TCP_SEND_QUEUE, buf + len, ulen)) + return -3; + if (tcp_repair_on(sk->fd)) + return -4; + } + + return 0; + } + + return -5; +} + +#define SET_Q_FLAGS (SOCCR_MEM_EXCL) +int libsoccr_set_queue_bytes(struct libsoccr_sk *sk, int queue_id, char *bytes, unsigned flags) +{ + if (flags & ~SET_Q_FLAGS) + return -1; + + switch (queue_id) { + case TCP_RECV_QUEUE: + sk->recv_queue = bytes; + if (flags & SOCCR_MEM_EXCL) + sk->flags |= SK_FLAG_FREE_RQ; + return 0; + case TCP_SEND_QUEUE: + sk->send_queue = bytes; + if (flags & SOCCR_MEM_EXCL) + sk->flags |= SK_FLAG_FREE_SQ; + return 0; + } + + return -1; +} + +#define SET_SA_FLAGS (SOCCR_MEM_EXCL) +int libsoccr_set_addr(struct libsoccr_sk *sk, int self, union libsoccr_addr *addr, unsigned flags) +{ + if (flags & ~SET_SA_FLAGS) + return -1; + + if (self) { + sk->src_addr = addr; + if (flags & SOCCR_MEM_EXCL) + sk->flags |= SK_FLAG_FREE_SA; + } else { + sk->dst_addr = addr; + if (flags & SOCCR_MEM_EXCL) + sk->flags |= SK_FLAG_FREE_DA; + } + + return 0; +} diff --git a/CRIU_code/soccr/soccr.h b/CRIU_code/soccr/soccr.h new file mode 100644 index 0000000..2c34e16 --- /dev/null +++ b/CRIU_code/soccr/soccr.h @@ -0,0 +1,233 @@ +#ifndef __LIBSOCCR_H__ +#define __LIBSOCCR_H__ +#include /* sockaddr_in, sockaddr_in6 */ +#include /* TCP_REPAIR_WINDOW, TCP_TIMESTAMP */ +#include /* uint32_t */ +#include /* sockaddr */ + +#include "common/config.h" + +/* All packets with this mark have not to be blocked. */ +#define SOCCR_MARK 0xC114 + +#ifndef CONFIG_HAS_TCP_REPAIR_WINDOW +struct tcp_repair_window { + uint32_t snd_wl1; + uint32_t snd_wnd; + uint32_t max_window; + + uint32_t rcv_wnd; + uint32_t rcv_wup; +}; +#endif + +#ifndef CONFIG_HAS_TCP_REPAIR +/* + * It's been reported that both tcp_repair_opt + * and TCP_ enum already shipped in netinet/tcp.h + * system header by some distros thus we need a + * test if we can use predefined ones or provide + * our own. 
+ */ +struct tcp_repair_opt { + uint32_t opt_code; + uint32_t opt_val; +}; + +enum { + TCP_NO_QUEUE, + TCP_RECV_QUEUE, + TCP_SEND_QUEUE, + TCP_QUEUES_NR, +}; +#endif + +#ifndef TCP_TIMESTAMP +#define TCP_TIMESTAMP 24 +#endif + +#ifndef TCP_REPAIR_WINDOW +#define TCP_REPAIR_WINDOW 29 +#endif + +void libsoccr_set_log(unsigned int level, void (*fn)(unsigned int level, const char *fmt, ...)); + +#define SOCCR_LOG_ERR 1 +#define SOCCR_LOG_DBG 2 + +/* + * An opaque handler for C/R-ing a TCP socket. + */ +struct libsoccr_sk; + +union libsoccr_addr { + struct sockaddr sa; + struct sockaddr_in v4; + struct sockaddr_in6 v6; +}; + +/* + * Connection info that should be saved after fetching from the + * socket and given back into the library in two steps (see below). + */ +struct libsoccr_sk_data { + uint32_t state; + uint32_t inq_len; + uint32_t inq_seq; + uint32_t outq_len; + uint32_t outq_seq; + uint32_t unsq_len; + uint32_t opt_mask; + uint32_t mss_clamp; + uint32_t snd_wscale; + uint32_t rcv_wscale; + uint32_t timestamp; + + uint32_t flags; /* SOCCR_FLAGS_... below */ + uint32_t snd_wl1; + uint32_t snd_wnd; + uint32_t max_window; + uint32_t rcv_wnd; + uint32_t rcv_wup; +}; + +/* + * The flags below denote which data on libsoccr_sk_data was get + * from the kernel and is required for restore. Not present data + * is zeroified by the library. + * + * Ideally the caller should carry the whole _data structure between + * calls, but for optimization purposes it may analyze the flags + * field and drop the unneeded bits. + */ + +/* + * Window parameters. Mark snd_wl1, snd_wnd, max_window, rcv_wnd + * and rcv_wup fields. + */ +#define SOCCR_FLAGS_WINDOW 0x1 + +/* + * These two calls pause and resume the socket for and after C/R + * The first one returns an opaque handle that is to be used by all + * the subsequent calls. + * + * For now the library only supports ESTABLISHED sockets. The caller + * should check the socket is supported before calling the library. 
+ * + * Before doing socket C/R make sure no packets can reach the socket + * you're working with, nor any packet can leave the node from this + * socket. This can be done by using netfilter DROP target (or by + * DOWN-ing an interface in case of containers). + */ +struct libsoccr_sk *libsoccr_pause(int fd); +void libsoccr_resume(struct libsoccr_sk *sk); + +/* This one is like _resume, but doesn't turn repair off on socket. */ +void libsoccr_release(struct libsoccr_sk *sk); + +/* + * Flags for calls below + */ + +/* + * Memory given to or taken from library is in exclusive ownership + * of the resulting owner. I.e. -- when taken by caller from library, + * the former will free() one, when given to the library, the latter + * is to free() it. + */ +#define SOCCR_MEM_EXCL 0x1 + +/* + * CHECKPOINTING calls + * + * Roughly the checkpoint steps for sockets in supported states are + * + * h = libsoccr_pause(sk); + * libsoccr_save(h, &data, sizeof(data)) + * inq = libsoccr_get_queue_bytes(h, TCP_RECV_QUEUE, 0) + * outq = libsoccr_get_queue_bytes(h, TCP_SEND_QUEUE, 0) + * getsockname(sk, &name, ...) + * getpeername(sk, &peer, ...) + * + * save_all_bytes(h, inq, outq, name, peer) + * + * Resuming the socket afterwards effectively obsoletes the saved + * info, as the connection resumes and old saved bytes become + * outdated. + * + * Please note, that getsockname() and getpeername() are standard glibc + * calls, not the libsoccr's ones. + */ + +/* + * Fills in the libsoccr_sk_data structure with connection info. The + * data_size shows the size of a buffer. The returned value is the + * amount of bytes put into data (the rest is zeroed with memset). + */ +int libsoccr_save(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsigned data_size); + +/* + * Get a pointer on the contents of queues. The amount of bytes is + * determined from the filled libsoccr_sk_data by queue_id. 
+ * + * For TCP_RECV_QUEUE the length is .inq_len + * For TCP_SEND_QUEUE the length is .outq_len + * + * For any other queues returns NULL. + * + * The SOCCR_MEM_EXCL flag means that the caller grabs the buffer from + * library and should free() it himself. Otherwise the buffer can + * be claimed again and will be freed by the library upon _resume call. + */ +char *libsoccr_get_queue_bytes(struct libsoccr_sk *sk, int queue_id, unsigned flags); + +/* + * Returns filled libsoccr_addr for a socket. This value is also required + * on restore, but addresses may be obtained from somewhere else, these + * are just common sockaddr-s. + */ +union libsoccr_addr *libsoccr_get_addr(struct libsoccr_sk *sk, int self, unsigned flags); + +/* + * RESTORING calls + * + * The restoring of a socket is like below + * + * get_all_bytes(h, inq, outq, name, peer) + * + * sk = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); + * + * h = libsoccr_pause(sk) + * libsoccr_set_queue_bytes(h, TCP_SEND_QUEUE, outq, 0); + * libsoccr_set_queue_bytes(h, TCP_RECV_QUEUE, inq, 0); + * libsoccr_set_addr(h, 1, src_addr, 0); + * libsoccr_set_addr(h, 0, dst_addr, 0); + * libsoccr_restore(h, &data, sizeof(data)) + * + * libsoccr_resume(h) + * + * Only after this can the packet path from and to the socket be + * re-enabled. + */ + +/* + * Set a pointer to the send/recv queue data. + * If flags have SOCCR_MEM_EXCL, the buffer is stolen by the library and is + * free()-ed after libsoccr_resume(). + */ +int libsoccr_set_queue_bytes(struct libsoccr_sk *sk, int queue_id, char *bytes, unsigned flags); + +/* + * Set a pointer to the libsoccr_addr for src/dst. + * If flags have SOCCR_MEM_EXCL, the buffer is stolen by the library and is + * free()-ed after libsoccr_resume().
+ */ +int libsoccr_set_addr(struct libsoccr_sk *sk, int self, union libsoccr_addr *, unsigned flags); + +/* + * Performs restore actions on a socket + */ +int libsoccr_restore(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsigned data_size); + +#endif diff --git a/CRIU_code/soccr/test/Makefile b/CRIU_code/soccr/test/Makefile new file mode 100644 index 0000000..4585400 --- /dev/null +++ b/CRIU_code/soccr/test/Makefile @@ -0,0 +1,27 @@ +CFLAGS += -Wall -g -I../../ +LDFLAGS += -L../ -lsoccr ../libsoccr.a -lnet + +RUN ?= tcp-constructor + +run: + ./local.sh + +tcp-constructor: tcp-constructor.c ../libsoccr.a + $(CC) $(CFLAGS) tcp-constructor.c -o tcp-constructor $(LDFLAGS) + +clean: + rm -f tcp-constructor + +tcp-conn: tcp-conn.c + $(CC) $(CFLAGS) tcp-conn.c -o tcp-conn $(LDFLAGS) + +tcp-conn-v6: tcp-conn-v6.c + $(CC) $(CFLAGS) -DTEST_IPV6 tcp-conn-v6.c -o tcp-conn-v6 $(LDFLAGS) + +test: tcp-constructor tcp-conn tcp-conn-v6 + unshare -n sh -c "ip link set up dev lo; ./tcp-conn" + unshare -n sh -c "ip link set up dev lo; ./tcp-conn-v6" + python run.py ./$(RUN) + +.PHONY: test + diff --git a/CRIU_code/soccr/test/local.sh b/CRIU_code/soccr/test/local.sh new file mode 100644 index 0000000..aac3a58 --- /dev/null +++ b/CRIU_code/soccr/test/local.sh @@ -0,0 +1 @@ +unshare -Urn sh -c 'ip link set up dev lo && make test' diff --git a/CRIU_code/soccr/test/run.py b/CRIU_code/soccr/test/run.py new file mode 100644 index 0000000..a25c292 --- /dev/null +++ b/CRIU_code/soccr/test/run.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python2 + +import sys, os +import hashlib +from subprocess import Popen, PIPE + +str2 = "test_test" * (1 << 20) +str1 = "Test_Test!" 
+ +src = os.getenv("TCP_SRC", "127.0.0.1") +dst = os.getenv("TCP_DST", "127.0.0.1") +sport = os.getenv("TCP_SPORT", "12345") +dport = os.getenv("TCP_DPORT", "54321") + +print(sys.argv[1]) +args = [sys.argv[1], + "--addr", src, "--port", sport, "--seq", "555", + "--next", + "--addr", dst, "--port", dport, "--seq", "666", + "--reverse", "--", "./tcp-test.py"] + +p1 = Popen(args + ["dst"], stdout = PIPE, stdin = PIPE) + +args.remove("--reverse"); + +p2 = Popen(args + ["src"], stdout = PIPE, stdin = PIPE) + +p1.stdout.read(5) +p2.stdout.read(5) +p1.stdin.write("start") +p2.stdin.write("start") + +p1.stdin.write(str1) +p1.stdin.close() +p2.stdin.write(str2) +p2.stdin.close() + +s = p1.stdout.read() +m = hashlib.md5() +m.update(str2) +str2 = m.hexdigest() + +if str2 != eval(s): + print("FAIL", repr(str2), repr(s)) + sys.exit(5); + +s = p1.stdout.read() +m = hashlib.md5() +m.update(str1) +str1 = m.hexdigest() + +s = p2.stdout.read() +if str1 != eval(s): + print("FAIL", repr(str1), s) + sys.exit(5); + +if p1.wait(): + sys.exit(1) +if p2.wait(): + sys.exit(1) + +print("PASS") diff --git a/CRIU_code/soccr/test/tcp-conn-v6.c b/CRIU_code/soccr/test/tcp-conn-v6.c new file mode 100644 index 0000000..81da796 --- /dev/null +++ b/CRIU_code/soccr/test/tcp-conn-v6.c @@ -0,0 +1 @@ +tcp-conn.c \ No newline at end of file diff --git a/CRIU_code/soccr/test/tcp-conn.c b/CRIU_code/soccr/test/tcp-conn.c new file mode 100644 index 0000000..1a1a5bb --- /dev/null +++ b/CRIU_code/soccr/test/tcp-conn.c @@ -0,0 +1,168 @@ +#include +#include /* for srvaddr_in and inet_ntoa() */ +#include +#include +#include +#include "../soccr.h" +#include + +#define pr_perror(fmt, ...) printf(fmt ": %m\n", ##__VA_ARGS__) + +enum { + TCP_NO_QUEUE, + TCP_RECV_QUEUE, + TCP_SEND_QUEUE, + TCP_QUEUES_NR, +}; +static void pr_printf(unsigned int level, const char *fmt, ...) 
+{ + va_list args; + + va_start(args, fmt); + vprintf(fmt, args); + va_end(args); +} + +int main() +{ + union libsoccr_addr addr, dst; + int srv, sock, clnt, rst; + int ret, dsize; + socklen_t dst_let; + struct libsoccr_sk_data data = {}; + struct libsoccr_sk *so, *so_rst; + char buf[11] = "0123456789", *queue; + + libsoccr_set_log(10, pr_printf); + + memset(&addr,0,sizeof(addr)); + +#ifndef TEST_IPV6 + addr.v4.sin_family = AF_INET; + inet_pton(AF_INET, "0.0.0.0", &(addr.v4.sin_addr)); +#else + addr.v6.sin6_family = AF_INET6; + inet_pton(AF_INET6, "::0", &(addr.v6.sin6_addr)); +#endif + +#ifndef TEST_IPV6 + srv = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); +#else + srv = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP); +#endif + if (srv == -1) { + pr_perror("socket() failed"); + return -1; + } + +#ifndef TEST_IPV6 + addr.v4.sin_port = htons(8765); +#else + addr.v6.sin6_port = htons(8765); +#endif + ret = bind(srv, (struct sockaddr *) &addr, sizeof(addr)); + if (ret == -1) { + pr_perror("bind() failed"); + return -1; + } + + if (listen(srv, 1) == -1) { + pr_perror("listen() failed"); + return -1; + } + +#ifndef TEST_IPV6 + clnt = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); +#else + clnt = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP); +#endif + if (clnt == -1) { + pr_perror("socket() failed"); + return -1; + } + + if (connect(clnt, (struct sockaddr *) &addr, sizeof(addr))) { + pr_perror("connect"); + return 1; + } + + dst_let = sizeof(dst); + sock = accept(srv, (struct sockaddr *) &dst, &dst_let); + if (sock < 0) { + pr_perror("accept"); + return 1; + } + + if (write(clnt, &buf, sizeof(buf)) != sizeof(buf)) { + pr_perror("write"); + return 1; + } + + /* Start testing */ + dst_let = sizeof(addr); + if (getsockname(sock, (struct sockaddr *) &addr, &dst_let)) { + pr_perror("connect"); + return 1; + } + dst_let = sizeof(addr); + if (getpeername(sock, (struct sockaddr *) &dst, &dst_let)) { + pr_perror("connect"); + return 1; + } + + + so = libsoccr_pause(sock); + + dsize = 
libsoccr_save(so, &data, sizeof(data)); + if (dsize < 0) { + pr_perror("libsoccr_save"); + return 1; + } + +#ifndef TEST_IPV6 + rst = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); +#else + rst = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP); +#endif + if (rst == -1) { + pr_perror("socket() failed"); + return -1; + } + close(sock); + + so_rst = libsoccr_pause(rst); + libsoccr_set_addr(so_rst, 1, &addr, 0); + libsoccr_set_addr(so_rst, 0, &dst, 0); + + queue = libsoccr_get_queue_bytes(so, TCP_RECV_QUEUE, SOCCR_MEM_EXCL); + libsoccr_set_queue_bytes(so_rst, TCP_RECV_QUEUE, queue, SOCCR_MEM_EXCL); + queue = libsoccr_get_queue_bytes(so, TCP_SEND_QUEUE, SOCCR_MEM_EXCL); + libsoccr_set_queue_bytes(so_rst, TCP_SEND_QUEUE, queue, SOCCR_MEM_EXCL); + + ret = libsoccr_restore(so_rst, &data, dsize); + if (ret) { + pr_perror("libsoccr_restore: %d", ret); + return 1; + } + + libsoccr_resume(so_rst); + libsoccr_resume(so); + + if (read(rst, &buf, sizeof(buf)) != sizeof(buf)) { + pr_perror("read"); + return 1; + } + + if (write(rst, &buf, sizeof(buf)) != sizeof(buf)) { + pr_perror("write"); + return 1; + } + shutdown(rst, SHUT_WR); + + if (read(clnt, &buf, sizeof(buf)) != sizeof(buf)) { + pr_perror("read"); + return 1; + } + + return 0; +} diff --git a/CRIU_code/soccr/test/tcp-constructor.c b/CRIU_code/soccr/test/tcp-constructor.c new file mode 100644 index 0000000..89f2010 --- /dev/null +++ b/CRIU_code/soccr/test/tcp-constructor.c @@ -0,0 +1,151 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "soccr/soccr.h" + +#define pr_perror(fmt, ...) 
({ fprintf(stderr, "%s:%d: " fmt " : %m\n", __func__, __LINE__, ##__VA_ARGS__); 1; }) + +struct tcp { + char *addr; + uint32_t port; + uint32_t seq; + uint16_t mss_clamp; + uint16_t wscale; +}; + +static void usage() +{ + printf( + "Usage: --addr ADDR -port PORT --seq SEQ --next --addr ADDR -port PORT --seq SEQ -- CMD ...\n" + "\t Describe a source side of a connection, then set the --next option\n" + "\t and describe a destination side.\n" + "\t --reverse - swap source and destination sides\n" + "\t The idea is that the same command line is execute on both sides,\n" + "\t but the --reverse is added to one of them.\n" + "\n" + "\t CMD ... - a user command to handle a socket, which is the descriptor 3.\n" + "\n" + "\t It prints the \"start\" on stdout when a socket is created and\n" + "\t resumes it when you write \"start\" to stdin.\n" + ); +} + +int main(int argc, char **argv) +{ + static const char short_opts[] = ""; + static struct option long_opts[] = { + { "addr", required_argument, 0, 'a' }, + { "port", required_argument, 0, 'p' }, + { "seq", required_argument, 0, 's' }, + { "next", no_argument, 0, 'n'}, + { "reverse", no_argument, 0, 'r'}, + {}, + }; + struct tcp tcp[2] = { + {"127.0.0.1", 12345, 5000000, 1460, 7}, + {"127.0.0.1", 54321, 6000000, 1460, 7} + }; + + int sk, yes = 1, val, idx, opt, i, src = 0, dst = 1; + union libsoccr_addr src_addr, dst_addr; + struct libsoccr_sk_data data = {}; + struct libsoccr_sk *so; + char buf[1024]; + + i = 0; + while (1) { + idx = -1; + opt = getopt_long(argc, argv, short_opts, long_opts, &idx); + if (opt == -1) + break; + + switch (opt) { + case 'a': + tcp[i].addr = optarg; + break; + case 'p': + tcp[i].port = atol(optarg); + break; + case 's': + tcp[i].seq = atol(optarg); + break; + case 'n': + i++; + if (i > 1) + return pr_perror("--next is used twice or more"); + break; + case 'r': + src = 1; dst = 0; + break; + default: + usage(); + return 3; + } + } + if (i != 1) + return pr_perror("--next is required"); + + if 
(optind == argc) { + usage(); + return 1; + } + + for (i = 0; i < 2; i++) + fprintf(stderr, "%s:%d:%d\n", tcp[i].addr, tcp[i].port, tcp[i].seq); + + data.state = TCP_ESTABLISHED; + data.inq_seq = tcp[dst].seq; + data.outq_seq = tcp[src].seq; + + sk = socket(AF_INET, SOCK_STREAM, 0); + if (sk < 0) + return pr_perror("socket"); + + so = libsoccr_pause(sk); + + if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) == -1) + return pr_perror("setsockopt"); + + src_addr.v4.sin_family = AF_INET; + src_addr.v4.sin_port = htons(tcp[src].port); + if (inet_pton(AF_INET, tcp[src].addr, &src_addr.v4.sin_addr) != 1) + return pr_perror("inet_pton"); + + dst_addr.v4.sin_family = AF_INET; + dst_addr.v4.sin_port = htons(tcp[dst].port); + if (inet_pton(AF_INET, tcp[dst].addr, &(dst_addr.v4.sin_addr)) != 1) + return pr_perror("inet_pton"); + + libsoccr_set_addr(so, 1, &src_addr, 0); + libsoccr_set_addr(so, 0, &dst_addr, 0); + + data.snd_wscale = tcp[src].wscale; + data.rcv_wscale = tcp[dst].wscale; + data.mss_clamp = tcp[src].mss_clamp; + + data.opt_mask = TCPI_OPT_WSCALE | TCPOPT_MAXSEG; + + if (libsoccr_restore(so, &data, sizeof(data))) + return 1; + + /* Let's go */ + if (write(STDOUT_FILENO, "start", 5) != 5) + return pr_perror("write"); + if (read(STDIN_FILENO, buf, 5) != 5) + return pr_perror("read"); + + val = 0; + if (setsockopt(sk, SOL_TCP, TCP_REPAIR, &val, sizeof(val))) + return pr_perror("TCP_REPAIR"); + + execv(argv[optind], argv + optind); + + return pr_perror("Unable to exec %s", argv[optind]); +} diff --git a/CRIU_code/soccr/test/tcp-test.py b/CRIU_code/soccr/test/tcp-test.py new file mode 100644 index 0000000..ff3fe29 --- /dev/null +++ b/CRIU_code/soccr/test/tcp-test.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python2 + +from __future__ import print_function +import sys, socket +import hashlib + +sk = socket.fromfd(3, socket.AF_INET, socket.SOCK_STREAM) + +s = sys.stdin.read() +ret = sk.send(s) +print("%s: send() -> %d" % (sys.argv[1], ret), file=sys.stderr) 
+sk.shutdown(socket.SHUT_WR) +m = hashlib.md5() +while True: + s = sk.recv((1 << 20) * 10) + if not s: + break + print("%s: recv() -> %d" % (sys.argv[1], len(s)), file=sys.stderr) + m.update(s) +print(repr(m.hexdigest())) diff --git a/CRIU_code/test/.gitignore b/CRIU_code/test/.gitignore new file mode 100644 index 0000000..6a735ba --- /dev/null +++ b/CRIU_code/test/.gitignore @@ -0,0 +1,16 @@ +/lib +/lib64 +/bin +/sbin +/dev +/dump +/tmp +/usr +/.constructed +/*.log +/zdtm_ct +/zdtm-tst-list +/stats-restore +/zdtm_mount_cgroups.lock +/compel/handle_binary +/umount2 diff --git a/CRIU_code/test/Makefile b/CRIU_code/test/Makefile new file mode 100644 index 0000000..cf7daca --- /dev/null +++ b/CRIU_code/test/Makefile @@ -0,0 +1,61 @@ +RM := rm -f --one-file-system + +ZDTM_ARGS ?= -C +export ZDTM_ARGS + +all: + $(MAKE) zdtm + $(MAKE) zdtm-pre-dump + $(MAKE) zdtm-snapshot + $(MAKE) zdtm-iter + $(MAKE) zdtm-freezer +.PHONY: all + +TESTS = unix-callback mem-snap rpc libcriu mounts/ext security pipes crit socketpairs overlayfs mnt-ext-dev shell-job + +other: + for t in $(TESTS); do \ + setsid $(MAKE) -C others/$$t run || exit 1; \ + done +.PHONY: other + +zdtm: + ./zdtm.py run -a --parallel 2 +.PHONY: zdtm + +zdtm-pre-dump: + ./zdtm.py run --pre 2:1 -t zdtm/transition/fork -f uns +.PHONY: zdtm-pre-dump + +zdtm-snapshot: + ./zdtm.py run --pre 2:1 --snap -t zdtm/transition/fork -f uns +.PHONY: zdtm-snapshot + +zdtm-iter: + ./zdtm.py run --iters 3:1 -t zdtm/transition/fork -f uns +.PHONY: zdtm-iter + +zdtm-freezer: + ./zdtm.py run --test zdtm/transition/thread-bomb --pre 3 --freezecg zdtm:t + ./zdtm.py run --test zdtm/transition/thread-bomb --pre 3 --freezecg zdtm:f +.PHONY: zdtm-freezer + +fault-injection: + $(MAKE) -C fault-injection +.PHONY: fault-injection + +override CFLAGS += -D_GNU_SOURCE + +clean_root: + $(Q) ./zdtm.py clean nsroot +.PHONY: clean_root + +clean: clean_root + $(RM) zdtm_ct zdtm-tst-list umount2 zdtm_test_config.conf + $(Q) $(RM) *.log + $(Q) $(RM) -r 
./dump/ + $(Q) $(MAKE) -C zdtm cleandep clean cleanout + $(Q) $(MAKE) -C libcriu clean + $(Q) $(MAKE) -C rpc clean + $(Q) $(MAKE) -C crit clean +.PHONY: clean diff --git a/CRIU_code/test/abrt.sh b/CRIU_code/test/abrt.sh new file mode 100644 index 0000000..94aca21 --- /dev/null +++ b/CRIU_code/test/abrt.sh @@ -0,0 +1,35 @@ +#!/bin/bash -x + +pid=$1 +vpid=$2 +sig=$3 +comm=$4 + +exec &>> /tmp/zdtm-core.log + +expr match "$comm" zombie00 && { + cat > /dev/null + exit 0 +} + +expr match "$comm" seccomp_filter && { + cat > /dev/null + exit 0 +} + +report="/tmp/zdtm-core-$pid-$comm" +exec &> ${report}.txt + +ps axf +ps -p $pid + +cat /proc/$pid/status +ls -l /proc/$pid/fd +cat /proc/$pid/maps +exec 33< /proc/$pid/exe +cat > $report.core + +echo 'bt +i r +disassemble $rip-0x10,$rip + 0x10 +' | gdb -c $report.core /proc/self/fd/33 diff --git a/CRIU_code/test/check_actions.py b/CRIU_code/test/check_actions.py new file mode 100644 index 0000000..0e3daf1 --- /dev/null +++ b/CRIU_code/test/check_actions.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python2 + +import sys +import os + +actions = set(['pre-dump', 'pre-restore', 'post-dump', 'setup-namespaces', \ + 'post-setup-namespaces', 'post-restore', 'post-resume', \ + 'network-lock', 'network-unlock' ]) +errors = [] +af = os.path.dirname(os.path.abspath(__file__)) + '/actions_called.txt' + +for act in open(af): + act = act.strip().split() + act.append('EMPTY') + act.append('EMPTY') + + if act[0] == 'EMPTY': + raise Exception("Error in test, bogus actions line") + + if act[1] == 'EMPTY': + errors.append('Action %s misses CRTOOLS_IMAGE_DIR' % act[0]) + + if act[0] in ('post-dump', 'setup-namespaces', 'post-setup-namespaces', \ + 'post-restore', 'post-resume', 'network-lock', 'network-unlock'): + if act[2] == 'EMPTY': + errors.append('Action %s misses CRTOOLS_INIT_PID' % act[0]) + elif not act[2].isdigit() or int(act[2]) == 0: + errors.append('Action %s PID is not number (%s)' % (act[0], act[2])) + + actions -= set([act[0]]) + +if 
actions: + errors.append('Not all actions called: %r' % actions) + +if errors: + for x in errors: + print(x) + sys.exit(1) + +print('PASS') diff --git a/CRIU_code/test/compel/Makefile b/CRIU_code/test/compel/Makefile new file mode 100644 index 0000000..a23097f --- /dev/null +++ b/CRIU_code/test/compel/Makefile @@ -0,0 +1,18 @@ +# Relative path to original objects +define compel_obj_path + $(addprefix ../../compel/,$(1)) +endef + +host-ccflags-y += -iquote test/compel/arch/$(ARCH)/include +test_objs := $(filter-out main.o,$(compel-objs)) + +hostprogs-y += handle_binary +handle_binary-objs += $(call compel_obj_path,$(test_objs)) +handle_binary-objs += main.o +handle_binary-objs += handle_binary.o + +ifeq ($(ARCH),x86) + handle_binary-objs += handle_binary_32.o + HOSTCFLAGS_handle_binary.o += -DCONFIG_X86_64 + HOSTCFLAGS_handle_binary_32.o += -DCONFIG_X86_32 +endif diff --git a/CRIU_code/test/compel/arch/aarch64/include/arch_test_handle_binary.h b/CRIU_code/test/compel/arch/aarch64/include/arch_test_handle_binary.h new file mode 100644 index 0000000..dbaa1d6 --- /dev/null +++ b/CRIU_code/test/compel/arch/aarch64/include/arch_test_handle_binary.h @@ -0,0 +1,24 @@ +#ifndef __ARCH_TEST_HANDLE_BINARY__ +#define __ARCH_TEST_HANDLE_BINARY__ + +#include + +#include "uapi/elf64-types.h" +#define arch_run_tests(mem) __run_tests(mem, "") +extern int __run_tests(void *mem, const char *msg); + +static __maybe_unused void arch_test_set_elf_hdr_ident(void *mem) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + memcpy(mem, elf_ident_64_le, sizeof(elf_ident_64_le)); +#else + memcpy(mem, elf_ident_64_be, sizeof(elf_ident_64_be)); +#endif +} + +static __maybe_unused void arch_test_set_elf_hdr_machine(Ehdr_t *hdr) +{ + hdr->e_machine = EM_AARCH64; +} + +#endif /* __ARCH_TEST_HANDLE_BINARY__ */ diff --git a/CRIU_code/test/compel/arch/arm/include/arch_test_handle_binary.h b/CRIU_code/test/compel/arch/arm/include/arch_test_handle_binary.h new file mode 100644 index 0000000..234bd38 --- 
/dev/null +++ b/CRIU_code/test/compel/arch/arm/include/arch_test_handle_binary.h @@ -0,0 +1,21 @@ +#ifndef __ARCH_TEST_HANDLE_BINARY__ +#define __ARCH_TEST_HANDLE_BINARY__ + +#include + +#include "uapi/elf32-types.h" +#define arch_run_tests(mem) __run_tests(mem, "") +extern int __run_tests(void *mem, const char *msg); + +static __maybe_unused void arch_test_set_elf_hdr_ident(void *mem) +{ + memcpy(mem, elf_ident_32, sizeof(elf_ident_32)); +} + +static __maybe_unused void arch_test_set_elf_hdr_machine(Ehdr_t *hdr) +{ + hdr->e_machine = EM_ARM; +} + + +#endif /* __ARCH_TEST_HANDLE_BINARY__ */ diff --git a/CRIU_code/test/compel/arch/ppc64/include/arch_test_handle_binary.h b/CRIU_code/test/compel/arch/ppc64/include/arch_test_handle_binary.h new file mode 100644 index 0000000..5f826fe --- /dev/null +++ b/CRIU_code/test/compel/arch/ppc64/include/arch_test_handle_binary.h @@ -0,0 +1,24 @@ +#ifndef __ARCH_TEST_HANDLE_BINARY__ +#define __ARCH_TEST_HANDLE_BINARY__ + +#include + +#include "uapi/elf64-types.h" +#define arch_run_tests(mem) __run_tests(mem, "") +extern int __run_tests(void *mem, const char *msg); + +static __maybe_unused void arch_test_set_elf_hdr_ident(void *mem) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + memcpy(mem, elf_ident_64_le, sizeof(elf_ident_64_le)); +#else + memcpy(mem, elf_ident_64_be, sizeof(elf_ident_64_be)); +#endif +} + +static __maybe_unused void arch_test_set_elf_hdr_machine(Ehdr_t *hdr) +{ + hdr->e_machine = EM_PPC64; +} + +#endif /* __ARCH_TEST_HANDLE_BINARY__ */ diff --git a/CRIU_code/test/compel/arch/x86/include/arch_test_handle_binary.h b/CRIU_code/test/compel/arch/x86/include/arch_test_handle_binary.h new file mode 100644 index 0000000..ae16ac5 --- /dev/null +++ b/CRIU_code/test/compel/arch/x86/include/arch_test_handle_binary.h @@ -0,0 +1,49 @@ +#ifndef __ARCH_TEST_HANDLE_BINARY__ +#define __ARCH_TEST_HANDLE_BINARY__ + +#include + +#ifdef CONFIG_X86_64 +#include "uapi/elf64-types.h" +#define __run_tests run_tests_64 + +static 
__maybe_unused void arch_test_set_elf_hdr_ident(void *mem) +{ + memcpy(mem, elf_ident_64_le, sizeof(elf_ident_64_le)); +} + +static __maybe_unused void arch_test_set_elf_hdr_machine(Ehdr_t *hdr) +{ + hdr->e_machine = EM_X86_64; +} + +#else /* !CONFIG_X86_64 */ + +#include "uapi/elf32-types.h" +#define __run_tests run_tests_32 + +static __maybe_unused void arch_test_set_elf_hdr_ident(void *mem) +{ + memcpy(mem, elf_ident_32, sizeof(elf_ident_32)); +} + +static __maybe_unused void arch_test_set_elf_hdr_machine(Ehdr_t *hdr) +{ + hdr->e_machine = EM_386; +} + +#endif /* CONFIG_X86_32 */ + +extern int run_tests_64(void *mem, const char *msg); +extern int run_tests_32(void *mem, const char *msg); + +static __maybe_unused int arch_run_tests(void *mem) +{ + int ret; + + ret = run_tests_64(mem, "(64-bit ELF)"); + ret += run_tests_32(mem, "(32-bit ELF)"); + + return ret; +} +#endif /* __ARCH_TEST_HANDLE_BINARY__ */ diff --git a/CRIU_code/test/compel/handle_binary.c b/CRIU_code/test/compel/handle_binary.c new file mode 100644 index 0000000..4ef42ae --- /dev/null +++ b/CRIU_code/test/compel/handle_binary.c @@ -0,0 +1,99 @@ +#include + +#include "uapi/piegen-err.h" +#include "piegen.h" + +#include "arch_test_handle_binary.h" + +extern int launch_test(void *mem, int expected_ret, const char *test_fmt, ...); +extern const size_t test_elf_buf_size; + +static uintptr_t elf_addr; +static const char *test_bitness; +#define ASSERT(expected, fmt, ...) 
\ + launch_test((void *)elf_addr, expected, \ + fmt " %s", ##__VA_ARGS__, test_bitness) + +static const unsigned int sections_nr = 1; + +static void set_elf_hdr_relocatable(Ehdr_t *hdr) +{ + hdr->e_type = ET_REL; + hdr->e_version = EV_CURRENT; +} + +static int test_add_strings_section(Ehdr_t *hdr) +{ + Shdr_t *sec_strings_hdr; + uintptr_t sections_table = elf_addr + hdr->e_shoff; + size_t sections_table_size = sections_nr*sizeof(hdr->e_shentsize); + + hdr->e_shnum = sections_nr; + hdr->e_shstrndx = sections_nr; /* off-by-one */ + if (ASSERT(-E_NO_STR_SEC, + "strings section's header oob of section table")) + return -1; + + hdr->e_shstrndx = 0; + sec_strings_hdr = (void *)sections_table; + + sec_strings_hdr->sh_offset = (Off_t)-1; + if (ASSERT(-E_NO_STR_SEC, "strings section oob")) + return -1; + + /* Put strings just right after sections table. */ + sec_strings_hdr->sh_offset = sections_table - elf_addr + + sections_table_size; + return 0; +} + +static int test_prepare_section_table(Ehdr_t *hdr) +{ + hdr->e_shoff = (Off_t)test_elf_buf_size; + if (ASSERT(-E_NO_STR_SEC, "section table start oob")) + return -1; + + /* Lets put sections table right after ELF header. 
*/ + hdr->e_shoff = (Off_t) sizeof(Ehdr_t); + hdr->e_shentsize = (Half_t) sizeof(Shdr_t); + + hdr->e_shnum = (Half_t)-1; + if (ASSERT(-E_NO_STR_SEC, "too many sections in table")) + return -1; + + if (test_add_strings_section(hdr)) + return -1; + return 0; +} + +static int test_prepare_elf_header(void *elf) +{ + memset(elf, 0, sizeof(Ehdr_t)); + if (ASSERT(-E_NOT_ELF, "zero ELF header")) + return -1; + + arch_test_set_elf_hdr_ident(elf); + if (ASSERT(-E_NOT_ELF, "unsupported ELF header")) + return -1; + + arch_test_set_elf_hdr_machine(elf); + if (ASSERT(-E_NOT_ELF, "non-relocatable ELF header")) + return -1; + + set_elf_hdr_relocatable(elf); + + if (test_prepare_section_table(elf)) + return -1; + + return 0; +} + +int __run_tests(void *mem, const char *msg) +{ + elf_addr = (uintptr_t)mem; + test_bitness = msg; + + if (test_prepare_elf_header(mem)) + return 1; + return 0; +} diff --git a/CRIU_code/test/compel/handle_binary_32.c b/CRIU_code/test/compel/handle_binary_32.c new file mode 100644 index 0000000..5364be3 --- /dev/null +++ b/CRIU_code/test/compel/handle_binary_32.c @@ -0,0 +1 @@ +handle_binary.c \ No newline at end of file diff --git a/CRIU_code/test/compel/main.c b/CRIU_code/test/compel/main.c new file mode 100644 index 0000000..a6a0149 --- /dev/null +++ b/CRIU_code/test/compel/main.c @@ -0,0 +1,57 @@ +/* + * Test for handle_binary(). + * In this test ELF binary file is constructed from + * header up to sections and relocations. + * On each stage it tests non-valid ELF binaries to be parsed. + * For passing test, handle_binary should return errors for all + * non-valid binaries and handle all relocations. 
+ * + * Test author: Dmitry Safonov + */ + +#include +#include +#include +#include + +#include "piegen.h" +#include "arch_test_handle_binary.h" + +/* size of buffer with formed ELF file */ +const size_t test_elf_buf_size = 4096; + +extern int handle_binary(void *mem, size_t size); +extern void run_tests(void *mem); + +int launch_test(void *mem, int expected_ret, const char *test_fmt, ...) +{ + static unsigned test_nr = 1; + int ret = handle_binary(mem, test_elf_buf_size); + va_list params; + + va_start(params, test_fmt); + if (ret != expected_ret) { + printf("not ok %u - ", test_nr); + vprintf(test_fmt, params); + printf(", expected %d but ret is %d\n", expected_ret, ret); + } else { + printf("ok %u - ", test_nr); + vprintf(test_fmt, params); + putchar('\n'); + } + va_end(params); + test_nr++; + fflush(stdout); + + return ret != expected_ret; +} + +int main(int argc, char **argv) +{ + void *elf_buf = malloc(test_elf_buf_size); + int ret; + + ret = arch_run_tests(elf_buf); + free(elf_buf); + return ret; +} diff --git a/CRIU_code/test/crit-recode.py b/CRIU_code/test/crit-recode.py new file mode 100644 index 0000000..441f775 --- /dev/null +++ b/CRIU_code/test/crit-recode.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python +# vim: noet ts=8 sw=8 sts=8 + +import pycriu +import sys +import os +import subprocess + +find = subprocess.Popen(['find', 'test/dump/', '-size', '+0', '-name', '*.img'], + stdout = subprocess.PIPE) + +test_pass = True + +def recode_and_check(imgf, o_img, pretty): + try: + pb = pycriu.images.loads(o_img, pretty) + except pycriu.images.MagicException as me: + print("%s magic %x error" % (imgf, me.magic)) + return False + except Exception as e: + print("%s %sdecode fails: %s" % (imgf, pretty and 'pretty ' or '', e)) + return False + + try: + r_img = pycriu.images.dumps(pb) + except Exception as e: + r_img = pycriu.images.dumps(pb) + print("%s %s encode fails: %s" % (imgf, pretty and 'pretty ' or '', e)) + return False + + if o_img != r_img: + print("%s %s 
recode mismatch" % (imgf, pretty and 'pretty ' or '')) + return False + + return True + + +for imgf in find.stdout.readlines(): + imgf = imgf.strip() + imgf_b = os.path.basename(imgf) + + if imgf_b.startswith(b'pages-'): + continue + if imgf_b.startswith(b'iptables-'): + continue + if imgf_b.startswith(b'ip6tables-'): + continue + if imgf_b.startswith(b'route-'): + continue + if imgf_b.startswith(b'route6-'): + continue + if imgf_b.startswith(b'ifaddr-'): + continue + if imgf_b.startswith(b'tmpfs-'): + continue + if imgf_b.startswith(b'netns-ct-'): + continue + if imgf_b.startswith(b'netns-exp-'): + continue + if imgf_b.startswith(b'rule-'): + continue + + o_img = open(imgf.decode(), "rb").read() + if not recode_and_check(imgf, o_img, False): + test_pass = False + if not recode_and_check(imgf, o_img, True): + test_pass = False + +find.wait() + +if not test_pass: + print("FAIL") + sys.exit(1) + +print("PASS") diff --git a/CRIU_code/test/empty-netns-prep.sh b/CRIU_code/test/empty-netns-prep.sh new file mode 100644 index 0000000..07e7e8f --- /dev/null +++ b/CRIU_code/test/empty-netns-prep.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +set -ex + +if [ "$CRTOOLS_SCRIPT_ACTION" == "setup-namespaces" ]; then + echo "Will up lo at $CRTOOLS_INIT_PID netns" + mkdir -p /var/run/netns + mount -t tmpfs xxx /var/run/netns + touch /var/run/netns/emptyns + mount --bind /proc/$CRTOOLS_INIT_PID/ns/net /var/run/netns/emptyns + ip netns exec emptyns ip link set up dev lo || exit 1 + ip netns exec emptyns ip a + umount -l /var/run/netns +fi + +exit 0 diff --git a/CRIU_code/test/exhaustive/pipe.py b/CRIU_code/test/exhaustive/pipe.py new file mode 100644 index 0000000..17e0658 --- /dev/null +++ b/CRIU_code/test/exhaustive/pipe.py @@ -0,0 +1,270 @@ +#!/usr/bin/env python + +import argparse +import os +import signal +import socket +import time +import sys +import subprocess + +criu_bin='../../criu/criu' + +def mix(nr_tasks, nr_pipes): + # Returned is the list of combinations. 
+ # Each combination is the lists of pipe descriptors. + # Each pipe descriptor is a 2-elemtn tuple, that contains values + # for R and W ends of pipes, each being a bit-field denoting in + # which tasks the respective end should be opened or not. + + # First -- make a full set of combinations for a single pipe. + max_idx = 1 << nr_tasks + pipe_mix = [[(r, w)] for r in range(0, max_idx) for w in range(0, max_idx)] + + # Now, for every pipe throw another one into the game making + # all possible combinations of what was seen before with the + # newbie. + pipes_mix = pipe_mix + for t in range(1, nr_pipes): + pipes_mix = [ o + n for o in pipes_mix for n in pipe_mix ] + + return pipes_mix + + +# Called by a test sub-process. It just closes the not needed ends +# of pipes and sleeps waiting for death. +def make_pipes(task_nr, nr_pipes, pipes, comb, status_pipe): + print('\t\tMake pipes for %d' % task_nr) + # We need to make sure that pipes have their + # ends according to comb for task_nr + + for i in range(0, nr_pipes): + # Read end + if not (comb[i][0] & (1 << task_nr)): + os.close(pipes[i][0]) + # Write end + if not (comb[i][1] & (1 << task_nr)): + os.close(pipes[i][1]) + + os.write(status_pipe, '0') + os.close(status_pipe) + while True: + time.sleep(100) + + +def get_pipe_ino(pid, fd): + try: + return os.stat('/proc/%d/fd/%d' % (pid, fd)).st_ino + except: + return None + + +def get_pipe_rw(pid, fd): + for l in open('/proc/%d/fdinfo/%d' % (pid, fd)): + if l.startswith('flags:'): + f = l.split(None, 1)[1][-2] + if f == '0': + return 0 # Read + elif f == '1': + return 1 # Write + break + + raise Exception('Unexpected fdinfo contents') + + +def check_pipe_y(pid, fd, rw, inos): + ino = get_pipe_ino(pid, fd) + if ino == None: + return 'missing ' + if not inos.has_key(fd): + inos[fd] = ino + elif inos[fd] != ino: + return 'wrong ' + mod = get_pipe_rw(pid, fd) + if mod != rw: + return 'badmode ' + return None + + +def check_pipe_n(pid, fd): + ino = get_pipe_ino(pid, fd) + 
if ino == None: + return None + else: + return 'present ' + + +def check_pipe_end(kids, fd, comb, rw, inos): + t_nr = 0 + for t_pid in kids: + if comb & (1 << t_nr): + res = check_pipe_y(t_pid, fd, rw, inos) + else: + res = check_pipe_n(t_pid, fd) + if res != None: + return res + 'kid(%d)' % t_nr + t_nr += 1 + return None + + +def check_pipe(kids, fds, comb, inos): + for e in (0, 1): # 0 == R, 1 == W, see get_pipe_rw() + res = check_pipe_end(kids, fds[e], comb[e], e, inos) + if res != None: + return res + 'end(%d)' % e + return None + +def check_pipes(kids, pipes, comb): + # Kids contain pids + # Pipes contain pipe FDs + # Comb contain list of pairs of bits for RW ends + p_nr = 0 + p_inos = {} + for p_fds in pipes: + res = check_pipe(kids, p_fds, comb[p_nr], p_inos) + if res != None: + return res + 'pipe(%d)' % p_nr + p_nr += 1 + + return None + + +# Run by test main process. It opens pipes, then forks kids that +# will contain needed pipe ends, then report back that it's ready +# and waits for a signal (unix socket message) to start checking +# the kids' FD tables. +def make_comb(comb, opts, status_pipe): + print('\tMake pipes') + # 1st -- make needed pipes + pipes = [] + for p in range(0, opts.pipes): + pipes.append(os.pipe()) + + # Fork the kids that'll make pipes + kc_pipe = os.pipe() + kids = [] + for t in range(0, opts.tasks): + pid = os.fork() + if pid == 0: + os.close(status_pipe) + os.close(kc_pipe[0]) + make_pipes(t, opts.pipes, pipes, comb, kc_pipe[1]) + sys.exit(1) + kids.append(pid) + + os.close(kc_pipe[1]) + for p in pipes: + os.close(p[0]) + os.close(p[1]) + + # Wait for kids to get ready + k_res = '' + while True: + v = os.read(kc_pipe[0], 16) + if v == '': + break + k_res += v + os.close(kc_pipe[0]) + + ex_code = 1 + if k_res == '0' * opts.tasks: + print('\tWait for C/R') + cmd_sk = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM, 0) + cmd_sk.bind('\0CRIUPCSK') + + # Kids are ready, so is socket for kicking us. 
Notify the + # parent task that we are good to go. + os.write(status_pipe, '0') + os.close(status_pipe) + v = cmd_sk.recv(16) + if v == '0': + print('\tCheck pipes') + res = check_pipes(kids, pipes, comb) + if res == None: + ex_code = 0 + else: + print('\tFAIL %s' % res) + + # Just kill kids, all checks are done by us, we don't need'em any more + for t in kids: + os.kill(t, signal.SIGKILL) + os.waitpid(t, 0) + + return ex_code + + +def cr_test(pid): + print('C/R test') + img_dir = 'pimg_%d' % pid + try: + os.mkdir(img_dir) + subprocess.check_call([criu_bin, 'dump', '-t', '%d' % pid, '-D', img_dir, '-o', 'dump.log', '-v4', '-j']) + except: + print('`- dump fail') + return False + + try: + os.waitpid(pid, 0) + subprocess.check_call([criu_bin, 'restore', '-D', img_dir, '-o', 'rst.log', '-v4', '-j', '-d', '-S']) + except: + print('`- restore fail') + return False + + return True + + +def run(comb, opts): + print('Checking %r' % comb) + cpipe = os.pipe() + pid = os.fork() + if pid == 0: + os.close(cpipe[0]) + ret = make_comb(comb, opts, cpipe[1]) + sys.exit(ret) + + # Wait for the main process to get ready + os.close(cpipe[1]) + res = os.read(cpipe[0], 16) + os.close(cpipe[0]) + + if res == '0': + res = cr_test(pid) + + print('Wake up test') + s = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM, 0) + if res: + res = '0' + else: + res = 'X' + try: + # Kick the test to check its state + s.sendto(res, '\0CRIUPCSK') + except: + # Restore might have failed or smth else happened + os.kill(pid, signal.SIGKILL) + s.close() + + # Wait for the guy to exit and get the result (PASS/FAIL) + p, st = os.waitpid(pid, 0) + if os.WIFEXITED(st): + st = os.WEXITSTATUS(st) + + print('Done (%d, pid == %d)' % (st, pid)) + return st == 0 + + +p = argparse.ArgumentParser("CRIU test suite") +p.add_argument("--tasks", help = "Number of tasks", default = '2') +p.add_argument("--pipes", help = "Number of pipes", default = '2') +opts = p.parse_args() +opts.tasks = int(opts.tasks) +opts.pipes = 
int(opts.pipes) + +pipe_combs = mix(opts.tasks, opts.pipes) + +for comb in pipe_combs: + if not run(comb, opts): + print('FAIL') + break +else: + print('PASS') diff --git a/CRIU_code/test/exhaustive/unix.py b/CRIU_code/test/exhaustive/unix.py new file mode 100644 index 0000000..41053bd --- /dev/null +++ b/CRIU_code/test/exhaustive/unix.py @@ -0,0 +1,754 @@ +#!/usr/bin/env python + +import sys +import os +import socket +import argparse +import subprocess +import signal +import fcntl +import stat + +criu_bin='../../criu/criu' + +sk_type_s = { + socket.SOCK_STREAM: "S", + socket.SOCK_DGRAM: "D", +} + +# Actions that can be done by test. Actions are not only syscall +# names to call, but also arguments with which to do it +# +# Each action consists of +# - arguments, e.g. type of socket, or socket id to work on +# - act() method which just generates an record +# - do() method, that actually does what's required +# - show() method to return the string description of what's done + +def mk_socket(st, typ): + st.sk_id += 1 + sk = sock(st.sk_id, typ) + st.add_socket(sk) + return sk + +class act_socket: + def __init__(self, typ): + self.typ = typ + + def act(self, st): + sk = mk_socket(st, self.typ) + self.sk_id = sk.sk_id + + def do(self, st): + sk = socket.socket(socket.AF_UNIX, self.typ, 0) + st.real_sockets[self.sk_id] = sk + + def show(self): + return 'socket(%s) = %d' % (sk_type_s[self.typ], self.sk_id) + + +class act_close: + def __init__(self, sk_id): + self.sk_id = sk_id + + def act(self, st): + sk = st.get_socket(self.sk_id) + st.del_socket(sk) + for ic in sk.icons: + sk = st.get_socket(ic) + st.del_socket(sk) + + def do(self, st): + sk = st.real_sockets.pop(self.sk_id) + sk.close() + + def show(self): + return 'close(%d)' % self.sk_id + + +class act_listen: + def __init__(self, sk_id): + self.sk_id = sk_id + + def act(self, st): + sk = st.get_socket(self.sk_id) + sk.listen = True + + def do(self, st): + sk = st.real_sockets[self.sk_id] + sk.listen(10) + + def 
show(self): + return 'listen(%d)' % self.sk_id + + +class act_bind: + def __init__(self, sk_id, name_id): + self.sk_id = sk_id + self.name_id = name_id + + def act(self, st): + sk = st.get_socket(self.sk_id) + sk.name = self.name_id + + def do(self, st): + sk = st.real_sockets[self.sk_id] + sk.bind(sock.real_name_for(self.name_id)) + + def show(self): + return 'bind(%d, $name-%d)' % (self.sk_id, self.name_id) + + +class act_connect: + def __init__(self, sk_id, listen_sk_id): + self.sk_id = sk_id + self.lsk_id = listen_sk_id + + def act(self, st): + sk = st.get_socket(self.sk_id) + if st.sk_type == socket.SOCK_STREAM: + lsk = st.get_socket(self.lsk_id) + psk = mk_socket(st, socket.SOCK_STREAM) + psk.visible = False + sk.peer = psk.sk_id + psk.peer = sk.sk_id + psk.name = lsk.name + lsk.icons.append(psk.sk_id) + lsk.icons_seq += 1 + else: + sk.peer = self.lsk_id + psk = st.get_socket(self.lsk_id) + psk.icons_seq += 1 + + def do(self, st): + sk = st.real_sockets[self.sk_id] + sk.connect(sock.real_name_for(self.lsk_id)) + + def show(self): + return 'connect(%d, $name-%d)' % (self.sk_id, self.lsk_id) + + +class act_accept: + def __init__(self, sk_id): + self.sk_id = sk_id + + def act(self, st): + lsk = st.get_socket(self.sk_id) + iid = lsk.icons.pop(0) + nsk = st.get_socket(iid) + nsk.visible = True + self.nsk_id = nsk.sk_id + + def do(self, st): + sk = st.real_sockets[self.sk_id] + nsk, ai = sk.accept() + if self.nsk_id in st.real_sockets: + raise Exception("SK ID conflict") + st.real_sockets[self.nsk_id] = nsk + + def show(self): + return 'accept(%d) = %d' % (self.sk_id, self.nsk_id) + + +class act_sendmsg: + def __init__(self, sk_id, to_id): + self.sk_id = sk_id + self.to_id = to_id + self.direct_send = None + + def act(self, st): + sk = st.get_socket(self.sk_id) + msg = (sk.sk_id, sk.outseq) + self.msg_id = sk.outseq + sk.outseq += 1 + psk = st.get_socket(self.to_id) + psk.inqueue.append(msg) + self.direct_send = (sk.peer == psk.sk_id) + + def do(self, st): + sk = 
st.real_sockets[self.sk_id] + msgv = act_sendmsg.msgval(self.msg_id) + if self.direct_send: + sk.send(msgv) + else: + sk.sendto(msgv, sock.real_name_for(self.to_id)) + + def show(self): + return 'send(%d, %d, $message-%d)' % (self.sk_id, self.to_id, self.msg_id) + + @staticmethod + def msgval(msgid, pref = ''): + return '%sMSG%d' % (pref, msgid) + +# +# Description of a socket +# +class sock: + def __init__(self, sk_id, sock_type): + # ID of a socket. Since states and sockets are cloned + # while we scan the tree of states the only valid way + # to address a socket is to find one by ID. + self.sk_id = sk_id + # The socket.SOCK_FOO value + self.sk_type = sock_type + # Sockets that haven't yet been accept()-ed are in the + # state, but user cannot operate on them. Also this + # invisibility contributes to state description since + # connection to not accepted socket is not the same + # as connection to accepted one. + self.visible = True + # The listen() was called. + self.listen = False + # The bind() was called. Also set by accept(), the name + # inherits from listener. + self.name = None + # The connect() was called. Set on two sockets when the + # connect() is called. + self.peer = None + # Progress on accepting connections. Used to check when + # it's OK to close the socket (see comment below). + self.icons_seq = 0 + # List of IDs of sockets that can be accept()-ed + self.icons = [] + # Number to generate message contents. + self.outseq = 0 + # Incoming queue of messages. 
+ self.inqueue = [] + + def clone(self): + sk = sock(self.sk_id, self.sk_type) + sk.visible = self.visible + sk.listen = self.listen + sk.name = self.name + sk.peer = self.peer + sk.icons_seq = self.icons_seq + sk.icons = list(self.icons) + sk.outseq = self.outseq + sk.inqueue = list(self.inqueue) + return sk + + def get_actions(self, st): + if not self.visible: + return [] + + if st.sk_type == socket.SOCK_STREAM: + return self.get_stream_actions(st) + else: + return self.get_dgram_actions(st) + + def get_send_action(self, to, st): + # However, if peer has a message from us at + # the queue tail, sending a new one doesn't + # really make sense + want_msg = True + if len(to.inqueue) != 0: + lmsg = to.inqueue[-1] + if lmsg[0] == self.sk_id: + want_msg = False + if want_msg: + return [ act_sendmsg(self.sk_id, to.sk_id) ] + else: + return [ ] + + def get_stream_actions(self, st): + act_list = [] + + # Any socket can be closed, but closing a socket + # that hasn't contributed to some new states is + # just waste of time, so we close only connected + # sockets or listeners that has at least one + # incoming connection pendig or served + + if self.listen: + if self.icons: + act_list.append(act_accept(self.sk_id)) + if self.icons_seq: + act_list.append(act_close(self.sk_id)) + elif self.peer: + act_list.append(act_close(self.sk_id)) + # Connected sockets can send and receive messages + # But receiving seem not to produce any new states, + # so only sending + # Also sending to a closed socket doesn't work + psk = st.get_socket(self.peer, True) + if psk: + act_list += self.get_send_action(psk, st) + else: + for psk in st.sockets: + if psk.listen and psk.name: + act_list.append(act_connect(self.sk_id, psk.sk_id)) + + # Listen on not-bound socket is prohibited as + # well as binding a listening socket + if not self.name: + # TODO: support for file paths (see real_name_for) + # TODO: these names can overlap each other + act_list.append(act_bind(self.sk_id, self.sk_id)) + else: 
+ act_list.append(act_listen(self.sk_id)) + + return act_list + + def get_dgram_actions(self, st): + act_list = [] + + # Dgram socket can bind at any time + if not self.name: + act_list.append(act_bind(self.sk_id, self.sk_id)) + + # Can connect to peer-less sockets + for psk in st.sockets: + if psk == self: + continue + if psk.peer != None and psk.peer != self.sk_id: + # Peer by someone else, can do nothing + continue + + # Peer-less psk or having us as peer + # We can connect to or send messages + if psk.name and self.peer != psk.sk_id: + act_list.append(act_connect(self.sk_id, psk.sk_id)) + + if psk.name or self.peer == psk.sk_id: + act_list += self.get_send_action(psk, st) + + if self.outseq != 0 or self.icons_seq != 0: + act_list.append(act_close(self.sk_id)) + + return act_list + + @staticmethod + def name_of(sk): + if not sk: + return 'X' + elif not sk.visible: + return 'H' + elif sk.name: + return 'B' + else: + return 'A' + + @staticmethod + def real_name_for(sk_id): + return "\0" + "CRSK%d" % sk_id + + # The describe() generates a string that represents + # a state of a socket. Called by state.describe(), see + # comment there about what description is. 
+ def describe(self, st): + dsc = '%s' % sk_type_s[self.sk_type] + dsc += sock.name_of(self) + + if self.listen: + dsc += 'L' + if self.peer: + psk = st.get_socket(self.peer, True) + dsc += '-C%s' % sock.name_of(psk) + if self.icons: + i_dsc = '' + for c in self.icons: + psk = st.get_socket(c) + psk = st.get_socket(psk.peer, True) + i_dsc += sock.name_of(psk) + dsc += '-I%s' % i_dsc + if self.inqueue: + froms = set() + for m in self.inqueue: + froms.add(m[0]) + q_dsc = '' + for f in froms: + fsk = st.get_socket(f, True) + q_dsc += sock.name_of(fsk) + dsc += '-M%s' % q_dsc + return dsc + + +class state: + def __init__(self, max_sockets, sk_type): + self.sockets = [] + self.sk_id = 0 + self.steps = [] + self.real_sockets = {} + self.sockets_left = max_sockets + self.sk_type = sk_type + + def add_socket(self, sk): + self.sockets.append(sk) + + def del_socket(self, sk): + self.sockets.remove(sk) + + def get_socket(self, sk_id, can_be_null = False): + for sk in self.sockets: + if sk.sk_id == sk_id: + return sk + + if not can_be_null: + raise Exception("%d socket not in list" % sk_id) + + return None + + def get_actions(self): + act_list = [] + + # Any socket in the state we can change it + for sk in self.sockets: + act_list += sk.get_actions(self) + + if self.sockets_left > 0: + act_list.append(act_socket(self.sk_type)) + self.sockets_left -= 1 + + return act_list + + def clone(self): + nst = state(self.sockets_left, self.sk_type) + for sk in self.sockets: + nst.sockets.append(sk.clone()) + nst.sk_id = self.sk_id + nst.steps = list(self.steps) + return nst + + # Generates textual description of a state. Different states + # may have same descriptions, e.g. if we have two sockets and + # only one of them is in listen state, we don't care which + # one in which. At the same time really different states + # shouldn't map to the same string. 
+ def describe(self): + sks = [x.describe(self) for x in self.sockets] + sks = sorted(sks) + return '_'.join(sks) + + +def set_nonblock(sk): + fd = sk.fileno() + flags = fcntl.fcntl(fd, fcntl.F_GETFL) + fcntl.fcntl(fd, fcntl.F_SETFL, flags | os.O_NONBLOCK) + +CHK_FAIL_UNKNOWN = 10 +CHK_FAIL_SOCKET = 11 +CHK_FAIL_STAT = 12 +CHK_FAIL_LISTEN = 13 +CHK_FAIL_NAME = 14 +CHK_FAIL_ACCEPT = 15 +CHK_FAIL_RECV_0 = 16 +CHK_FAIL_RECV_MIX = 17 +CHK_FAIL_CONNECT = 18 +CHK_FAIL_CONNECT2 = 19 +CHK_FAIL_KILLED = 20 +CHK_FAIL_DUMP = 21 +CHK_FAIL_RESTORE = 22 + +CHK_PASS = 42 + +fail_desc = { + CHK_FAIL_UNKNOWN: 'Aliens invaded the test', + CHK_FAIL_LISTEN: 'Listen state lost on restore', + CHK_FAIL_NAME: 'Name lost on restore', + CHK_FAIL_ACCEPT: 'Incoming connection lost on restore', + CHK_FAIL_RECV_0: 'Message lost on restore', + CHK_FAIL_RECV_MIX: 'Message misorder on restore', + CHK_FAIL_CONNECT: 'Connectivity broken on restore', + CHK_FAIL_CONNECT2: 'Connectivity broken the hard way on restore', + CHK_FAIL_KILLED: 'Test process died unexpectedly', + CHK_FAIL_DUMP: 'Cannot dump', + CHK_FAIL_RESTORE: 'Cannot restore', +} + +def chk_real_state(st): + # Before enything else -- check that we still have + # all the sockets at hands + for sk in st.sockets: + if not sk.visible: + continue + + # In theory we can have key-not-found exception here, + # but this has nothing to do with sockets restore, + # since it's just bytes in memory, so ... 
we assume + # that we have object here and just check for it in + # the fdtable + rsk = st.real_sockets[sk.sk_id] + try: + s_st = os.fstat(rsk.fileno()) + except: + print('FAIL: Socket %d lost' % sk.sk_id) + return CHK_FAIL_SOCKET + if not stat.S_ISSOCK(s_st.st_mode): + print('FAIL: Not a socket %d at %d' % (sk.sk_id, rsk.fileno())) + return CHK_FAIL_STAT + + # First -- check the listen states and names + for sk in st.sockets: + if not sk.visible: + continue + + rsk = st.real_sockets[sk.sk_id] + r_listen = rsk.getsockopt(socket.SOL_SOCKET, socket.SO_ACCEPTCONN) + if (sk.listen and r_listen == 0) or (not sk.listen and r_listen == 1): + print("FAIL: Socket %d listen %d, expected %d" + % (sk.sk_id, r_listen, sk.listen and 1 or 0)) + return CHK_FAIL_LISTEN + + if sk.name: + r_name = rsk.getsockname() + w_name = sock.real_name_for(sk.name) + if r_name != w_name: + print('FAIL: Socket %d name mismatch [%s], want [%s]' + % (sk.sk_id, r_name, w_name)) + return CHK_FAIL_NAME + + # Second -- check (accept) pending connections + for sk in st.sockets: + if not sk.listen: + continue + + rsk = st.real_sockets[sk.sk_id] + set_nonblock(rsk) + + while sk.icons: + # Do act_accept to change the state properly + # and not write the code twice + acc = act_accept(sk.sk_id) + acc.act(st) + try: + acc.do(st) + except: + print('FAIL: Cannot accept pending connection for %d' % sk.sk_id) + return CHK_FAIL_ACCEPT + + print(' `- did %s' % acc.show()) + + # Third -- check inqueues + for sk in st.sockets: + if not sk.inqueue: + continue + + rsk = st.real_sockets[sk.sk_id] + set_nonblock(rsk) + + while sk.inqueue: + msg = sk.inqueue.pop(0) + try: + r_msg, m_from = rsk.recvfrom(128) + except: + print('FAIL: No message in queue for %d' % sk.sk_id) + return CHK_FAIL_RECV_0 + + w_msg = act_sendmsg.msgval(msg[1]) + if r_msg != w_msg: + print('FAIL: Message misorder: %s want %s (from %d)' + %(r_msg, w_msg, msg[0])) + return CHK_FAIL_RECV_MIX + + # TODO -- check sender + print(' `- recvd %d.%d msg %s -> 
%d' + % (msg[0], msg[1], m_from, sk.sk_id)) + + # Finally, after all sockets are visible and all inqueues are + # drained -- check the sockets connectivity + for sk in st.sockets: + if not sk.peer: + continue + + # Closed connection with one peer alive. Cannot check. + if not sk.peer in st.real_sockets: + continue + + rsk = st.real_sockets[sk.sk_id] + psk = st.real_sockets[sk.peer] + set_nonblock(psk) + msgv = act_sendmsg.msgval(3 * sk.sk_id + 5 * sk.peer, 'C') # just random + + try: + rsk.send(msgv) + rmsg = psk.recv(128) + except: + print('FAIL: Connectivity %d -> %d lost' % (sk.sk_id, sk.peer)) + return CHK_FAIL_CONNECT + + # If sockets are not connected the recv above + # would generate exception and the check would + # fail. But just in case we've screwed the queues + # the hard way -- also check for the message being + # delivered for real + if rmsg != msgv: + print('FAIL: Connectivity %d -> %d not verified' + % (sk.sk_id, sk.peer)) + return CHK_FAIL_CONNECT2 + + print(' `- checked %d -> %d with %s' % (sk.sk_id, sk.peer, msgv)) + + return CHK_PASS + + +def chk_state(st, opts): + print("Will check state") + + sigsk_name = "\0" + "CRSIGSKC" + signal_sk = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM, 0) + signal_sk.bind(sigsk_name) + + # FIXME Ideally call to criu should be performed by the run_state's + # pid!=0 branch, but for simplicity we fork the kid which has the + # same set of sockets we do, then dump it. Then restore and notify + # via dgram socket to check its state. 
Current task still has all + # the same sockets :) so we close them not to produce bind() name + # conflicts on restore + + pid = os.fork() + if pid == 0: + msg = signal_sk.recv(64) + ret = chk_real_state(st) + sys.exit(ret) + + signal_sk.close() + for rsk in st.real_sockets.values(): + rsk.close() + + print("`- dump") + img_path = "sti_" + st.describe() + try: + os.mkdir(img_path) + subprocess.check_call([criu_bin, "dump", "-t", "%d" % pid, "-D", img_path, "-v4", "-o", "dump.log", "-j"]) + except: + print("Dump failed") + os.kill(pid, signal.SIGKILL) + return CHK_FAIL_DUMP + + print("`- restore") + try: + os.waitpid(pid, 0) + subprocess.check_call([criu_bin, "restore", "-D", img_path, "-v4", "-o", "rst.log", "-j", "-d", "-S"]) + except: + print("Restore failed") + return CHK_FAIL_RESTORE + + print("`- check") + signal_sk = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM, 0) + try: + signal_sk.sendto('check', sigsk_name) + except: + # Probably the peer has died before us or smth else went wrong + os.kill(pid, signal.SIGKILL) + + wp, status = os.waitpid(pid, 0) + if os.WIFEXITED(status): + status = os.WEXITSTATUS(status) + if status != CHK_PASS: + print("`- exited with %d" % status) + return status + elif os.WIFSIGNALED(status): + status = os.WTERMSIG(status) + print("`- killed with %d" % status) + return CHK_FAIL_KILLED + else: + return CHK_FAIL_UNKNOWN + + return CHK_PASS + + +def run_state(st, opts): + print("Will run state") + pid = os.fork() + if pid != 0: + wpid, status = os.wait() + if os.WIFEXITED(status): + status = os.WEXITSTATUS(status) + elif os.WIFSIGNALED(status): + status = CHK_FAIL_KILLED + else: + status = CHK_FAIL_UNKNOWN + return status + + # Try the states in subprocess so that once + # it exits the created sockets are removed + for step in st.steps: + step.do(st) + + if not opts.run: + ret = chk_state(st, opts) + else: + ret = chk_real_state(st) + + sys.exit(ret) + + +def proceed(st, seen, failed, opts, depth = 0): + desc = st.describe() + if 
not desc: + pass + elif not desc in seen: + # When scanning the tree we run and try only states that + # differ, but don't stop tree traversal on them. This is + # because sometimes we can get into the already seen state + # using less steps and it's better to proceed as we have + # depth to move forward and generate more states. + seen[desc] = len(st.steps) + print('%s' % desc) + for s in st.steps: + print('\t%s' % s.show()) + + if not opts.gen: + ret = run_state(st, opts) + if ret != CHK_PASS: + failed.add((desc, ret)) + if not opts.keep: + return False + else: + # Don't even proceed with this state if we've already + # seen one but get there with less steps + seen_score = seen[desc] + if len(st.steps) > seen_score: + return True + else: + seen[desc] = len(st.steps) + + if depth >= opts.depth: + return True + + actions = st.get_actions() + for act in actions: + nst = st.clone() + act.act(nst) + nst.steps.append(act) + if not proceed(nst, seen, failed, opts, depth + 1): + return False + + return True + + +p = argparse.ArgumentParser("CRIU test suite") +p.add_argument("--depth", help = "Depth of generated tree", default = '8') +p.add_argument("--sockets", help = "Maximum number of sockets", default = '1') +p.add_argument("--dgram", help = "Use SOCK_DGRAM sockets", action = 'store_true') +p.add_argument("--stream", help = "Use SOCK_STREAM sockets", action = 'store_true') +p.add_argument("--gen", help = "Only generate and show states", action = 'store_true') +p.add_argument("--run", help = "Run the states, but don't C/R", action = 'store_true') +p.add_argument("--keep", help = "Don't stop on error", action = 'store_true') +opts = p.parse_args() +opts.depth = int(opts.depth) + +# XXX: does it make any sense to mix two types in one go? 
+if opts.stream and opts.dgram: + print('Choose only one type') + sys.exit(1) + +if opts.stream: + sk_type = socket.SOCK_STREAM +elif opts.dgram: + sk_type = socket.SOCK_DGRAM +else: + print('Choose some type') + sys.exit(1) + +st = state(int(opts.sockets), sk_type) +seen = {} +failed = set() +proceed(st, seen, failed, opts) + +if len(failed) == 0: + print('PASS (%d states)' % len(seen)) +else: + print('FAIL %d/%d' % (len(failed), len(seen))) + for f in failed: + print("\t%-50s: %s" % (f[0], fail_desc.get(f[1], 'unknown reason %d' % f[1]))) diff --git a/CRIU_code/test/groups.desc b/CRIU_code/test/groups.desc new file mode 100644 index 0000000..95c0fe7 --- /dev/null +++ b/CRIU_code/test/groups.desc @@ -0,0 +1 @@ +{ 'dir': 'groups/', 'exclude': [ ] } diff --git a/CRIU_code/test/inhfd.desc b/CRIU_code/test/inhfd.desc new file mode 100644 index 0000000..e44278a --- /dev/null +++ b/CRIU_code/test/inhfd.desc @@ -0,0 +1 @@ +{ 'dir': 'inhfd/', 'exclude': [ ] } diff --git a/CRIU_code/test/inhfd/fifo.py b/CRIU_code/test/inhfd/fifo.py new file mode 100644 index 0000000..64e5f8f --- /dev/null +++ b/CRIU_code/test/inhfd/fifo.py @@ -0,0 +1,39 @@ +import os +import tempfile + +id_str = "" + + +def create_fds(): + tdir = tempfile.mkdtemp("zdtm.inhfd.XXXXXX") + if os.system("mount -t tmpfs zdtm.inhfd %s" % tdir) != 0: + raise Exception("Unable to mount tmpfs") + tfifo = os.path.join(tdir, "test_fifo") + os.mkfifo(tfifo) + fd2 = open(tfifo, "w+b", buffering=0) + fd1 = open(tfifo, "rb") + os.system("umount -l %s" % tdir) + os.rmdir(tdir) + + mnt_id = -1 + with open("/proc/self/fdinfo/%d" % fd1.fileno()) as f: + for line in f: + line = line.split() + if line[0] == "mnt_id:": + mnt_id = int(line[1]) + break + else: + raise Exception("Unable to find mnt_id") + + global id_str + id_str = "file[%x:%x]" % (mnt_id, os.fstat(fd1.fileno()).st_ino) + + return [(fd2, fd1)] + + +def filename(pipef): + return id_str + + +def dump_opts(sockf): + return ["--external", id_str] diff --git 
a/CRIU_code/test/inhfd/fifo.py.desc b/CRIU_code/test/inhfd/fifo.py.desc new file mode 100644 index 0000000..10666c8 --- /dev/null +++ b/CRIU_code/test/inhfd/fifo.py.desc @@ -0,0 +1 @@ +{ 'flavor': 'h' } diff --git a/CRIU_code/test/inhfd/pipe.py b/CRIU_code/test/inhfd/pipe.py new file mode 100644 index 0000000..318dc86 --- /dev/null +++ b/CRIU_code/test/inhfd/pipe.py @@ -0,0 +1,17 @@ +import os + + +def create_fds(): + pipes = [] + for i in range(10): + (fd1, fd2) = os.pipe() + pipes.append((os.fdopen(fd2, "wb"), os.fdopen(fd1, "rb"))) + return pipes + + +def filename(pipef): + return 'pipe:[%d]' % os.fstat(pipef.fileno()).st_ino + + +def dump_opts(sockf): + return [] diff --git a/CRIU_code/test/inhfd/pipe.py.desc b/CRIU_code/test/inhfd/pipe.py.desc new file mode 100644 index 0000000..10666c8 --- /dev/null +++ b/CRIU_code/test/inhfd/pipe.py.desc @@ -0,0 +1 @@ +{ 'flavor': 'h' } diff --git a/CRIU_code/test/inhfd/socket.py b/CRIU_code/test/inhfd/socket.py new file mode 100644 index 0000000..feba0e0 --- /dev/null +++ b/CRIU_code/test/inhfd/socket.py @@ -0,0 +1,21 @@ +import socket +import os + + +def create_fds(): + (sk1, sk2) = socket.socketpair(socket.AF_UNIX, socket.SOCK_STREAM) + (sk3, sk4) = socket.socketpair(socket.AF_UNIX, socket.SOCK_STREAM) + return [(sk1.makefile("wb"), sk2.makefile("rb")), + (sk3.makefile("wb"), sk4.makefile("rb"))] + + +def __sock_ino(sockf): + return os.fstat(sockf.fileno()).st_ino + + +def filename(sockf): + return 'socket:[%d]' % __sock_ino(sockf) + + +def dump_opts(sockf): + return ['--external', 'unix[%d]' % __sock_ino(sockf)] diff --git a/CRIU_code/test/inhfd/socket.py.desc b/CRIU_code/test/inhfd/socket.py.desc new file mode 100644 index 0000000..10666c8 --- /dev/null +++ b/CRIU_code/test/inhfd/socket.py.desc @@ -0,0 +1 @@ +{ 'flavor': 'h' } diff --git a/CRIU_code/test/inhfd/tty.py b/CRIU_code/test/inhfd/tty.py new file mode 100644 index 0000000..ae76a96 --- /dev/null +++ b/CRIU_code/test/inhfd/tty.py @@ -0,0 +1,37 @@ +# vim: noet 
ts=8 sw=8 sts=8 +import fcntl +import os +import pty +import termios + + +ctl = False + + +def child_prep(fd): + global ctl + if ctl: + return + ctl = True + fcntl.ioctl(fd.fileno(), termios.TIOCSCTTY, 1) + + +def create_fds(): + ttys = [] + for i in range(10): + (fd1, fd2) = pty.openpty() + newattr = termios.tcgetattr(fd1) + newattr[3] &= ~termios.ICANON & ~termios.ECHO + termios.tcsetattr(fd1, termios.TCSADRAIN, newattr) + ttys.append((os.fdopen(fd1, "wb"), os.fdopen(fd2, "rb"))) + return ttys + + +def filename(pipef): + st = os.fstat(pipef.fileno()) + return 'tty[%x:%x]' % (st.st_rdev, st.st_dev) + + +def dump_opts(sockf): + st = os.fstat(sockf.fileno()) + return "--external", 'tty[%x:%x]' % (st.st_rdev, st.st_dev) diff --git a/CRIU_code/test/inhfd/tty.py.desc b/CRIU_code/test/inhfd/tty.py.desc new file mode 100644 index 0000000..10666c8 --- /dev/null +++ b/CRIU_code/test/inhfd/tty.py.desc @@ -0,0 +1 @@ +{ 'flavor': 'h' } diff --git a/CRIU_code/test/jenkins/_run_ct b/CRIU_code/test/jenkins/_run_ct new file mode 100644 index 0000000..88b5b6c --- /dev/null +++ b/CRIU_code/test/jenkins/_run_ct @@ -0,0 +1,8 @@ +#!/bin/sh + +set -e +mount --make-rslave / +umount -l /proc +mount -t proc proc /proc/ +mount -t binfmt_misc none /proc/sys/fs/binfmt_misc/ +exec "$@" diff --git a/CRIU_code/test/jenkins/actions.sh b/CRIU_code/test/jenkins/actions.sh new file mode 100644 index 0000000..8019045 --- /dev/null +++ b/CRIU_code/test/jenkins/actions.sh @@ -0,0 +1,8 @@ +# Check how crit de/encodes images +set -e +source `dirname $0`/criu-lib.sh +# prep +rm -f actions_called.txt +./test/zdtm.py run -t zdtm/static/env00 --script "$(pwd)/test/show_action.sh" || fail +./test/check_actions.py || fail +exit 0 diff --git a/CRIU_code/test/jenkins/crit.sh b/CRIU_code/test/jenkins/crit.sh new file mode 100644 index 0000000..fcf1c58 --- /dev/null +++ b/CRIU_code/test/jenkins/crit.sh @@ -0,0 +1,7 @@ +# Check how crit de/encodes images +set -e +source `dirname $0`/criu-lib.sh +prep 
+./test/zdtm.py run --all -f best -x maps04 -x cgroup02 --norst --keep-img always || fail +PYTHONPATH="$(pwd)/lib/" ./test/crit-recode.py || fail +exit 0 diff --git a/CRIU_code/test/jenkins/criu-btrfs.sh b/CRIU_code/test/jenkins/criu-btrfs.sh new file mode 100644 index 0000000..e749ad9 --- /dev/null +++ b/CRIU_code/test/jenkins/criu-btrfs.sh @@ -0,0 +1,6 @@ +# This is a job which is executed on btrfs + +source `dirname $0`/criu-lib.sh && +prep && +make -C test -j 4 ZDTM_ARGS="-C -x '\(maps04\|mountpoints\|inotify_irmap\)'" zdtm && +true || fail diff --git a/CRIU_code/test/jenkins/criu-by-id.sh b/CRIU_code/test/jenkins/criu-by-id.sh new file mode 100644 index 0000000..2381e73 --- /dev/null +++ b/CRIU_code/test/jenkins/criu-by-id.sh @@ -0,0 +1,12 @@ +echo 950000 > /sys/fs/cgroup/cpu,cpuacct/system/cpu.rt_runtime_us +echo 950000 > /sys/fs/cgroup/cpu,cpuacct/system/jenkins.service/cpu.rt_runtime_us +git checkout -f ${TEST_COMMIT} +git clean -dfx && +make -j 4 && make -j 4 -C test/zdtm && +mkdir -p test/dump && +mount -t tmpfs zdtm test/dump && +make -C test -j 4 zdtm_ns && +true || { + tar -czf /home/criu-by-id-${TEST_COMMIT}-$(date +%m%d%H%M).tar.gz . 
+ exit 1 +} diff --git a/CRIU_code/test/jenkins/criu-dedup.sh b/CRIU_code/test/jenkins/criu-dedup.sh new file mode 100644 index 0000000..e75ef5f --- /dev/null +++ b/CRIU_code/test/jenkins/criu-dedup.sh @@ -0,0 +1,14 @@ +# Check auto-deduplication of pagemaps +set -e +source `dirname $0`/criu-lib.sh +prep +./test/zdtm.py run --all --keep-going --report report --parallel 4 -f h --pre 2 --dedup -x maps04 -x maps007 || fail + +# Additionally run these tests as they touch a lot of +# memory and it makes sense to additionally check it +# with delays between iterations +./test/zdtm.py run -t zdtm/transition/maps007 --keep-going --report report -f h --pre 8:.1 --dedup || fail +./test/zdtm.py run -t zdtm/static/mem-touch --keep-going --report report -f h --pre 8:.1 --dedup || fail +./test/zdtm.py run -t zdtm/transition/maps008 --keep-going --report report -f h --pre 8:.1 --dedup || fail +./test/zdtm.py run -t zdtm/transition/maps007 --keep-going --report report -f h --pre 8:.1 --noauto-dedup || fail +./test/zdtm.py run -t zdtm/static/mem-touch --keep-going --report report -f h --pre 8:.1 --noauto-dedup || fail diff --git a/CRIU_code/test/jenkins/criu-dump.sh b/CRIU_code/test/jenkins/criu-dump.sh new file mode 100644 index 0000000..381cf7a --- /dev/null +++ b/CRIU_code/test/jenkins/criu-dump.sh @@ -0,0 +1,6 @@ +# Check that dump is not destructive +set -e +source `dirname $0`/criu-lib.sh +prep +mount_tmpfs_to_dump +./test/zdtm.py run --all --keep-going --report report --parallel 4 --norst -x 'maps04' -x 'cgroup02' || fail diff --git a/CRIU_code/test/jenkins/criu-fault.sh b/CRIU_code/test/jenkins/criu-fault.sh new file mode 100644 index 0000000..ec6d26f --- /dev/null +++ b/CRIU_code/test/jenkins/criu-fault.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Check known fault injections +set -e +source `dirname $0`/criu-lib.sh +prep +./test/zdtm.py run -t zdtm/static/env00 --fault 1 --keep-going --report report -f h || fail +./test/zdtm.py run -t zdtm/static/unlink_fstat00 --fault 2 
--keep-going --report report -f h || fail +./test/zdtm.py run -t zdtm/static/maps00 --fault 3 --keep-going --report report -f h || fail +./test/zdtm.py run -t zdtm/static/inotify_irmap --fault 128 --keep-going --pre 2 -f uns || fail +./test/zdtm.py run -t zdtm/static/env00 --fault 129 -f uns || fail +./test/zdtm.py run -t zdtm/transition/fork --fault 130 -f h || fail +./test/zdtm.py run -t zdtm/static/vdso01 --fault 127 || fail +./test/zdtm.py run -t zdtm/static/vdso-proxy --fault 127 --iters 3 || fail + +./test/zdtm.py run -t zdtm/static/mntns_ghost --fault 2 --keep-going --report report || fail +./test/zdtm.py run -t zdtm/static/mntns_ghost --fault 4 --keep-going --report report || fail + +./test/zdtm.py run -t zdtm/static/mntns_ghost --fault 6 --report report || fail +./test/zdtm.py run -t zdtm/static/mntns_link_remap --fault 6 --report report || fail +./test/zdtm.py run -t zdtm/static/unlink_fstat03 --fault 6 --report report || fail + +./test/zdtm.py run -t zdtm/static/env00 --fault 5 --keep-going --report report || fail +./test/zdtm.py run -t zdtm/static/maps04 --fault 131 --keep-going --report report --pre 2:1 || fail +./test/zdtm.py run -t zdtm/transition/maps008 --fault 131 --keep-going --report report --pre 2:1 || fail +./test/zdtm.py run -t zdtm/static/maps01 --fault 132 -f h || fail diff --git a/CRIU_code/test/jenkins/criu-fcg.sh b/CRIU_code/test/jenkins/criu-fcg.sh new file mode 100644 index 0000000..938a72f --- /dev/null +++ b/CRIU_code/test/jenkins/criu-fcg.sh @@ -0,0 +1,13 @@ +# Test how freeze cgroup works +set -e +source `dirname $0`/criu-lib.sh +prep +mount_tmpfs_to_dump + +./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --keep-going --report report --freezecg zdtm:f || fail +./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --keep-going --report report --freezecg zdtm:f --pre 3 || fail +./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --keep-going --report report --freezecg zdtm:f --norst || fail + +./test/zdtm.py run -t 
zdtm/transition/thread-bomb -f h --keep-going --report report --freezecg zdtm:t || fail +./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --keep-going --report report --freezecg zdtm:t --pre 3 || fail +./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --keep-going --report report --freezecg zdtm:t --norst || fail diff --git a/CRIU_code/test/jenkins/criu-groups.sh b/CRIU_code/test/jenkins/criu-groups.sh new file mode 100644 index 0000000..508d20a --- /dev/null +++ b/CRIU_code/test/jenkins/criu-groups.sh @@ -0,0 +1,7 @@ +# Make one regular C/R cycle over randomly-generated groups +set -e +source `dirname $0`/criu-lib.sh +prep +mount_tmpfs_to_dump +./test/zdtm.py group --max 32 -x maps04 -x cgroup || fail +./test/zdtm.py --set groups run --all --keep-going --report report -f best || fail diff --git a/CRIU_code/test/jenkins/criu-inhfd.sh b/CRIU_code/test/jenkins/criu-inhfd.sh new file mode 100644 index 0000000..a59dcda --- /dev/null +++ b/CRIU_code/test/jenkins/criu-inhfd.sh @@ -0,0 +1,5 @@ +# Check known fault injections +set -e +source `dirname $0`/criu-lib.sh +prep +./test//zdtm.py --set inhfd run --all --keep-going --report report -f h || fail diff --git a/CRIU_code/test/jenkins/criu-iter.sh b/CRIU_code/test/jenkins/criu-iter.sh new file mode 100644 index 0000000..d414b05 --- /dev/null +++ b/CRIU_code/test/jenkins/criu-iter.sh @@ -0,0 +1,6 @@ +# Make 3 iteration of dump/restore for each test +set -e +source `dirname $0`/criu-lib.sh +prep +mount_tmpfs_to_dump +./test/zdtm.py run --all --keep-going --report report --parallel 4 --iter 3 -x 'maps04' || fail diff --git a/CRIU_code/test/jenkins/criu-join-ns.sh b/CRIU_code/test/jenkins/criu-join-ns.sh new file mode 100644 index 0000000..39ef182 --- /dev/null +++ b/CRIU_code/test/jenkins/criu-join-ns.sh @@ -0,0 +1,7 @@ +# Make one regular C/R cycle +set -e +source `dirname $0`/criu-lib.sh +prep +mkdir -p /var/run/netns +mount -t tmpfs zdtm_run /var/run/netns +./test/zdtm.py run --all --keep-going --report report 
--join-ns || fail diff --git a/CRIU_code/test/jenkins/criu-lazy-common.sh b/CRIU_code/test/jenkins/criu-lazy-common.sh new file mode 100644 index 0000000..7fdab40 --- /dev/null +++ b/CRIU_code/test/jenkins/criu-lazy-common.sh @@ -0,0 +1,11 @@ +KERN_MAJ=`uname -r | cut -d. -f1` +KERN_MIN=`uname -r | cut -d. -f2` +if [ "$KERN_MAJ" -gt 4 ] || { [ "$KERN_MAJ" -eq 4 ] && [ "$KERN_MIN" -ge 11 ]; }; then # kernel >= 4.11, compared as a (major, minor) pair so that e.g. 5.4 takes this branch + LAZY_EXCLUDE="-x cmdlinenv00 -x maps007" +else + LAZY_EXCLUDE="-x maps007 -x fork -x fork2 -x uffd-events -x cgroupns + -x socket_listen -x socket_listen6 -x cmdlinenv00 + -x socket_close_data01 -x file_read" +fi + +LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps04" diff --git a/CRIU_code/test/jenkins/criu-lazy-migration.pipeline b/CRIU_code/test/jenkins/criu-lazy-migration.pipeline new file mode 100644 index 0000000..2c863f1 --- /dev/null +++ b/CRIU_code/test/jenkins/criu-lazy-migration.pipeline @@ -0,0 +1,35 @@ +pipeline { + options { + buildDiscarder(logRotator(numToKeepStr: '30', artifactNumToKeepStr: '30')) + } + agent { + node { + label 'x86_64' + } + } + triggers { + cron('H H/4 * * *') + } + stages { + stage('Build') { + steps { + sh 'git clean -dxf' + sh 'make' + sh 'make -C test/zdtm' + } + } + stage('Test'){ + steps { + sh './test/jenkins/run_ct sh -c "mount --make-rprivate / && mount --rbind . 
/mnt && cd /mnt && ./test/jenkins/criu-lazy-migration.sh"' + junit 'test/report/criu-testreport*.xml' + } + } + } + post { + failure { + emailext attachLog: true, body: '''$DEFAULT_CONTENT + +${FILE,path="test/report/output"}''', compressLog: true, subject: '$DEFAULT_SUBJECT', to: "${env.CRIU_RECIPIENTS}" + } + } +} diff --git a/CRIU_code/test/jenkins/criu-lazy-migration.sh b/CRIU_code/test/jenkins/criu-lazy-migration.sh new file mode 100644 index 0000000..30e3c03 --- /dev/null +++ b/CRIU_code/test/jenkins/criu-lazy-migration.sh @@ -0,0 +1,20 @@ +# Check lazy-pages +set -e +source `dirname $0`/criu-lib.sh +prep + +source `dirname $0`/criu-lazy-common.sh + +# These tests seem to require complete separation of dump and restore namespaces +LAZY_MIGRATE_EXCLUDE="-x fifo_loop -x file_locks -x ptrace_sig -x overmount_file -x file_lease -x cr_veth -x fifo -x overmount_sock -x unlink_largefile -x socket_udp-corked -x netns_sub_veth" + +# lazy restore from images +./test/zdtm.py run --all --keep-going --report report --parallel 4 -f uns \ + --lazy-migrate $LAZY_EXCLUDE $LAZY_MIGRATE_EXCLUDE || fail + +# During pre-dump + lazy-pages we leave VM_NOHUGEPAGE set +LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps02" + +# lazy restore from images with pre-dumps +./test/zdtm.py run --all --keep-going --report report --parallel 4 -f uns \ + --lazy-migrate --pre 2 $LAZY_EXCLUDE $LAZY_MIGRATE_EXCLUDE || fail diff --git a/CRIU_code/test/jenkins/criu-lazy-pages.sh b/CRIU_code/test/jenkins/criu-lazy-pages.sh new file mode 100644 index 0000000..a3ee9a4 --- /dev/null +++ b/CRIU_code/test/jenkins/criu-lazy-pages.sh @@ -0,0 +1,17 @@ +# Check lazy-pages +set -e +source `dirname $0`/criu-lib.sh +prep + +source `dirname $0`/criu-lazy-common.sh + +# lazy restore from images +./test/zdtm.py run --all --keep-going --report report --parallel 4 \ + --lazy-pages $LAZY_EXCLUDE || fail + +# During pre-dump + lazy-pages we leave VM_NOHUGEPAGE set +LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps02" + +# lazy restore from images 
with pre-dumps +./test/zdtm.py run --all --keep-going --report report --parallel 4 \ + --lazy-pages --pre 2 $LAZY_EXCLUDE || fail diff --git a/CRIU_code/test/jenkins/criu-lib.sh b/CRIU_code/test/jenkins/criu-lib.sh new file mode 100644 index 0000000..72d41b5 --- /dev/null +++ b/CRIU_code/test/jenkins/criu-lib.sh @@ -0,0 +1,42 @@ +function exit_hook() +{ + test -z "$GCOV" && return + make gcov +} + +function prep() +{ + test -n "$SKIP_PREP" && return + # systemd executes jenkins in a separate sched cgroup. + echo 950000 > /sys/fs/cgroup/cpu,cpuacct/system/cpu.rt_runtime_us || true + echo 950000 > /sys/fs/cgroup/cpu,cpuacct/system/jenkins.service/cpu.rt_runtime_us || true + + test -n "$GCOV" && umask 0000 + + ulimit -c unlimited && + export CFLAGS=-g + git clean -dfx && + make -j 4 && + make -j 4 -C test/zdtm/ && + make -C test zdtm_ct && + mkdir -p test/report && + trap exit_hook EXIT +} + +function mount_tmpfs_to_dump() +{ + test -n "$SKIP_PREP" && return + mkdir -p test/dump && + mount -t tmpfs criu_dump test/dump && + true +} + +function fail() +{ + set +e + uname -a + ps axf --width 256 > ps.log + tar -czf /home/`basename $0`-${BUILD_NUMBER}-${GIT_COMMIT}-$(date +%m%d%H%M).tar.gz . 
+ tar -czf report.tar.gz -C test/ report + exit 1 +} diff --git a/CRIU_code/test/jenkins/criu-other.sh b/CRIU_code/test/jenkins/criu-other.sh new file mode 100644 index 0000000..c6c231c --- /dev/null +++ b/CRIU_code/test/jenkins/criu-other.sh @@ -0,0 +1,4 @@ +source `dirname $0`/criu-lib.sh && +prep && +make -C test other && +true || fail diff --git a/CRIU_code/test/jenkins/criu-overlay.sh b/CRIU_code/test/jenkins/criu-overlay.sh new file mode 100644 index 0000000..5ef7682 --- /dev/null +++ b/CRIU_code/test/jenkins/criu-overlay.sh @@ -0,0 +1,7 @@ +# Make one regular C/R cycle +set -e +source `dirname $0`/criu-lib.sh +prep +mkdir -p test.up test.work +mount -t overlay overlay -olowerdir=test,upperdir=test.up,workdir=test.work test +./test/zdtm.py run --all --keep-going --report report --parallel 4 -x inotify -x mntns_open -x socket -x sk-unix -x unlink -x fsnotify -x fanotify -x ghost || fail diff --git a/CRIU_code/test/jenkins/criu-pre-dump.sh b/CRIU_code/test/jenkins/criu-pre-dump.sh new file mode 100644 index 0000000..95f4d85 --- /dev/null +++ b/CRIU_code/test/jenkins/criu-pre-dump.sh @@ -0,0 +1,7 @@ +# Check 3 pre-dump-s before dump (with and w/o page server) +set -e +source `dirname $0`/criu-lib.sh +prep +mount_tmpfs_to_dump +./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 -x 'maps04' || fail +./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 --page-server -x 'maps04' || fail diff --git a/CRIU_code/test/jenkins/criu-remote-lazy-pages.sh b/CRIU_code/test/jenkins/criu-remote-lazy-pages.sh new file mode 100644 index 0000000..ea0d17f --- /dev/null +++ b/CRIU_code/test/jenkins/criu-remote-lazy-pages.sh @@ -0,0 +1,17 @@ +# Check remote-lazy-pages +set -e +source `dirname $0`/criu-lib.sh +prep + +source `dirname $0`/criu-lazy-common.sh + +# lazy restore from "remote" dump +./test/zdtm.py run --all --keep-going --report report --parallel 4 \ + --remote-lazy-pages $LAZY_EXCLUDE -x maps04 || fail + +# During pre-dump + 
lazy-pages we leave VM_NOHUGEPAGE set +LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps02" + +# lazy restore from "remote" dump with pre-dumps +./test/zdtm.py run --all --keep-going --report report --parallel 4 \ + --remote-lazy-pages --pre 2 $LAZY_EXCLUDE || fail diff --git a/CRIU_code/test/jenkins/criu-sibling.sh b/CRIU_code/test/jenkins/criu-sibling.sh new file mode 100644 index 0000000..93f0703 --- /dev/null +++ b/CRIU_code/test/jenkins/criu-sibling.sh @@ -0,0 +1,6 @@ +# Make 3 iteration of dump/restore for each test +set -e +source `dirname $0`/criu-lib.sh +prep +mount_tmpfs_to_dump +./test/zdtm.py run --all --keep-going --report report --sibling --parallel 4 -x 'maps04' || fail diff --git a/CRIU_code/test/jenkins/criu-snap.sh b/CRIU_code/test/jenkins/criu-snap.sh new file mode 100644 index 0000000..d28ba45 --- /dev/null +++ b/CRIU_code/test/jenkins/criu-snap.sh @@ -0,0 +1,7 @@ +# Check snapshots +set -e +source `dirname $0`/criu-lib.sh +prep +mount_tmpfs_to_dump +./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 --snaps -x 'maps04' || fail +./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 --snaps --page-server -x 'maps04' || fail diff --git a/CRIU_code/test/jenkins/criu-stop.sh b/CRIU_code/test/jenkins/criu-stop.sh new file mode 100644 index 0000000..d92519d --- /dev/null +++ b/CRIU_code/test/jenkins/criu-stop.sh @@ -0,0 +1,5 @@ +# Check --leave-stopped option +set -e +source `dirname $0`/criu-lib.sh +prep +./test/zdtm.py run -t zdtm/transition/fork --stop --iter 3 || fail diff --git a/CRIU_code/test/jenkins/criu-user.sh b/CRIU_code/test/jenkins/criu-user.sh new file mode 100644 index 0000000..d89ede2 --- /dev/null +++ b/CRIU_code/test/jenkins/criu-user.sh @@ -0,0 +1,6 @@ +# Make 3 iteration of dump/restore for each test +set -e +source `dirname $0`/criu-lib.sh +prep +mount_tmpfs_to_dump +./test/zdtm.py run --all --keep-going --report report --parallel 4 --user -x 'maps04' || fail diff --git 
a/CRIU_code/test/jenkins/criu.sh b/CRIU_code/test/jenkins/criu.sh new file mode 100644 index 0000000..19d545c --- /dev/null +++ b/CRIU_code/test/jenkins/criu.sh @@ -0,0 +1,5 @@ +# Make one regular C/R cycle +set -e +source `dirname $0`/criu-lib.sh +prep +./test/zdtm.py run --all --keep-going --report report --parallel 4 || fail diff --git a/CRIU_code/test/jenkins/run_ct b/CRIU_code/test/jenkins/run_ct new file mode 100644 index 0000000..5992b2e --- /dev/null +++ b/CRIU_code/test/jenkins/run_ct @@ -0,0 +1,3 @@ +#!/bin/sh + +unshare --mount --pid --fork -- $(readlink -f `dirname $0`/_run_ct) "$@" diff --git a/CRIU_code/test/others/app-emu.sh b/CRIU_code/test/others/app-emu.sh new file mode 100644 index 0000000..4432b90 --- /dev/null +++ b/CRIU_code/test/others/app-emu.sh @@ -0,0 +1,29 @@ +#!/bin/sh + +TEST_LIST=" +vnc +java/HelloWorld +screen +tarbz +make +" + +[ -n "$1" ] && TEST_LIST="$1" + +BASE_DIR=`pwd`/`dirname $0` + +for t in $TEST_LIST; do + dir=$BASE_DIR/app-emu/$t + log=$dir/run.log + ( + cd $dir + bash ./run.sh + ) 2>&1 | tee $log + grep PASS $log || { + echo "Test: $t" + echo "====================== ERROR ======================" + echo "Run log : $log" + echo "$t " + exit 1 + } +done diff --git a/CRIU_code/test/others/app-emu/java/HelloWorld/HelloWorld.java b/CRIU_code/test/others/app-emu/java/HelloWorld/HelloWorld.java new file mode 100644 index 0000000..93a63dd --- /dev/null +++ b/CRIU_code/test/others/app-emu/java/HelloWorld/HelloWorld.java @@ -0,0 +1,20 @@ +/* + * Trivial program which requires no + * additional imports + */ +public class HelloWorld { + public static void main(String[] args) { + int nr_sleeps = 5; + for (;;) { + System.out.println("Hello World"); + if (nr_sleeps == 0) + System.exit(0); + try { + Thread.sleep(1000); + nr_sleeps--; + } catch(InterruptedException ex) { + Thread.currentThread().interrupt(); + } + } + } +} diff --git a/CRIU_code/test/others/app-emu/java/HelloWorld/run.sh 
b/CRIU_code/test/others/app-emu/java/HelloWorld/run.sh new file mode 100644 index 0000000..0ed6afd --- /dev/null +++ b/CRIU_code/test/others/app-emu/java/HelloWorld/run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +source ../../../functions.sh || exit 1 +source ../../../env.sh || exit 1 + +cleanup_class() { + rm -f ./*.class +} + +javac HelloWorld.java || exit 1 + +set -x + +rm -rf dump +mkdir dump + +setsid java HelloWorld & + +pid=${!} + +echo Lanuched java application with pid $pid in background + +${criu} dump -D dump -o dump.log -v4 --shell-job -t ${pid} || { + echo "Dump failed" + exit 1 +} + +wait_tasks dump + +echo "Dumped, restoring and waiting for completion" + +${criu} restore -D dump -o restore.log -v4 --shell-job || { + echo "Restore failed" + exit 1 +} + +echo PASS diff --git a/CRIU_code/test/others/app-emu/job/Makefile b/CRIU_code/test/others/app-emu/job/Makefile new file mode 100644 index 0000000..2fde2f6 --- /dev/null +++ b/CRIU_code/test/others/app-emu/job/Makefile @@ -0,0 +1,12 @@ +all: job +.PHONY: all + +%.o: %.c + gcc -c $< -o $@ + +job: job.o + gcc -o $@ job.o + +clean: + rm -f *.o job +.PHONY: clean diff --git a/CRIU_code/test/others/app-emu/job/job.c b/CRIU_code/test/others/app-emu/job/job.c new file mode 100644 index 0000000..44858a7 --- /dev/null +++ b/CRIU_code/test/others/app-emu/job/job.c @@ -0,0 +1,100 @@ +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include + +#include + +static volatile sig_atomic_t stop = 0; /* written by the SIGTERM handler, polled by the wait loop in main() */ + +void sighandler(int sig) +{ + stop = 1; +} + +int main(int argc, char *argv[]) +{ + int pid, gid, sid; + int tty_sid, tty_gid; + int fd = fileno(stdout); + char buf[32]; + struct dirent *de; + DIR *fd_dir; + sigset_t bmask, cmask; + + if (signal(SIGTERM, sighandler) == SIG_ERR) { /* signal() returns the previous handler; only SIG_ERR means failure */ + printf("Unable to set a signal handler: %m\n"); + return 1; + } + + if (!isatty(fd)) { + printf("stdout is not tty\n"); + return -1; + } + + pid = getpid(); + gid = getgid(); + sid = getsid(pid); + + printf("pid %d gid %d sid %d\n", 
pid, gid, sid); + + snprintf(buf, sizeof(buf), "/proc/%d/fd", pid); + fd_dir = opendir(buf); + if (!fd_dir) { + printf("cant open %s\n", buf); + return -1; + } + + while ((de = readdir(fd_dir))) { + int _fd; + if (!strcmp(de->d_name, ".")) + continue; + if (!strcmp(de->d_name, "..")) + continue; + + _fd = atoi(de->d_name); + if (_fd > 2 && _fd != fd && isatty(_fd)) { + close(_fd); + printf("Closed %d\n", _fd); + } + } + closedir(fd_dir); + + if (ioctl(fd, TIOCGSID, &tty_sid) < 0) { + printf("cant obtain sid on stdout\n"); + return -1; + } + printf("stdout sid = %d\n", tty_sid); + + if (ioctl(fd, TIOCGPGRP, &tty_gid) < 0) { + printf("cant obtain gid on stdout\n"); + return -1; + } + printf("stdout gid = %d\n", tty_gid); + + sigemptyset(&cmask); + sigemptyset(&bmask); + sigaddset(&bmask, SIGTERM); + + sigprocmask(SIG_SETMASK, &bmask, NULL); + + printf("READY\n"); + + while (!stop) + sigsuspend(&cmask); + + if (getsid(pid) == sid) + printf("ALIVE\n"); + + return 0; +} diff --git a/CRIU_code/test/others/app-emu/job/job.exp b/CRIU_code/test/others/app-emu/job/job.exp new file mode 100644 index 0000000..1409103 --- /dev/null +++ b/CRIU_code/test/others/app-emu/job/job.exp @@ -0,0 +1,59 @@ +#!/usr/bin/expect + +source ../../env.sh || exit 1 + +exec rm -rf ./dump +exec mkdir ./dump + +system echo "-1" > ./dump/pid.pid + +set current [fork] +switch $current { + -1 { + puts "Fork failed." + exit -1 + } + 0 { + set timeout 5 + spawn ./job + set pid [exp_pid] + expect "READY" { + puts "READY" + } timeout { + puts "FAIL: Timed out on ready" + exit -1 + } + system $criu dump -v4 -D ./dump -o dump.log -j -t $pid + system echo "$pid" > ./dump/pid.pid + exit 0 + } + default { + sleep 2 + set timeout 5 + + set ::pidfile [open ./dump/pid.pid r] + set pid [gets $::pidfile] + + if {$pid == -1} { + puts "FAIL: Invalid pid read" + exit -1 + } + + spawn $criu restore -v4 -D ./dump -o restore.log -j + # + # spawn doesn't wait for restore to complete, so + # add some sleep here. 
Still better would be to + # rewrite this test completely. + sleep 2 + system kill -15 $pid + + expect "ALIVE" { + puts "PASS" + } timeout { + puts "FAIL: Timed out" + exit -1 + } + + exit 0 + } +} diff --git a/CRIU_code/test/others/app-emu/job/run.sh b/CRIU_code/test/others/app-emu/job/run.sh new file mode 100644 index 0000000..1d1e0ea --- /dev/null +++ b/CRIU_code/test/others/app-emu/job/run.sh @@ -0,0 +1,3 @@ +#!/bin/sh + +exec expect ./job.exp diff --git a/CRIU_code/test/others/app-emu/lxc/network-script.sh b/CRIU_code/test/others/app-emu/lxc/network-script.sh new file mode 100644 index 0000000..ce0542f --- /dev/null +++ b/CRIU_code/test/others/app-emu/lxc/network-script.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +[ -z "$CR_IP_TOOL" ] && CR_IP_TOOL=ip + +action=$1 +shift + +[[ "network-unlock" == "$CRTOOLS_SCRIPT_ACTION" || + "network-lock" == "$CRTOOLS_SCRIPT_ACTION" ]] || exit 0 + +set -o pipefail + +[ "$action" == dump ] && { + pid=$1 + name=$2 + + # Find a pair of CT's eth0 + ifindex=`$CR_IP_TOOL netns exec $name ethtool -S eth0 | awk '/index/ { print $2}'` + [ $? -eq 0 ] || exit 1 + + for i in /sys/devices/virtual/net/*; do + [ "`cat $i/ifindex`" == $ifindex ] && { + dst=`basename $i` + break; + } + done + + [ -z "$dst" ] && exit 1 + + echo "$dst<=>eth0" + + [ "network-unlock" == "$CRTOOLS_SCRIPT_ACTION" ] && { + echo Attach $dst to the bridge br0 + brctl addif br0 $dst + exit $? + } + + [ "network-lock" == "$CRTOOLS_SCRIPT_ACTION" ] && { + echo Detach $dst to the bridge br0 + brctl delif br0 $dst + exit $? + } + + exit 0 +} + +[ "$action" == restore ] && { + [ "network-unlock" == "$CRTOOLS_SCRIPT_ACTION" ] && { + ethname=$1 + echo Attach $ethname to the bridge br0 + ip link set up dev $ethname + brctl addif br0 $ethname + exit $? 
+ } +} + +exit 0 diff --git a/CRIU_code/test/others/app-emu/lxc/run.sh b/CRIU_code/test/others/app-emu/lxc/run.sh new file mode 100644 index 0000000..b6c700a --- /dev/null +++ b/CRIU_code/test/others/app-emu/lxc/run.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +source ../../env.sh || exit 1 + +[ -z "$CR_IP_TOOL" ] && CR_IP_TOOL=ip + +cd `dirname $0` + +name=$1 +[ -z "$name" ] && { cat <(b))?A0((a)-(b), (b)):A0((b)-(a), (a)) +#define A2(a, b) ((a)>(b))?A1((a)-(b), (b)):A1((b)-(a), (a)) +#define A3(a, b) ((a)>(b))?A2((a)-(b), (b)):A2((b)-(a), (a)) +#define A4(a, b) ((a)>(b))?A3((a)-(b), (b)):A3((b)-(a), (a)) +#define A5(a, b) ((a)>(b))?A4((a)-(b), (b)):A4((b)-(a), (a)) +#define A6(a, b) ((a)>(b))?A5((a)-(b), (b)):A5((b)-(a), (a)) +#define A7(a, b) ((a)>(b))?A6((a)-(b), (b)):A6((b)-(a), (a)) +#define A8(a, b) ((a)>(b))?A7((a)-(b), (b)):A7((b)-(a), (a)) +#define A9(a, b) ((a)>(b))?A8((a)-(b), (b)):A8((b)-(a), (a)) +#define A10(a, b) ((a)>(b))?A9((a)-(b), (b)):A9((b)-(a), (a)) +#define A11(a, b) ((a)>(b))?A10((a)-(b), (b)):A10((b)-(a), (a)) + return A10(a, b); +} diff --git a/CRIU_code/test/others/app-emu/screen/run.sh b/CRIU_code/test/others/app-emu/screen/run.sh new file mode 100644 index 0000000..3227491 --- /dev/null +++ b/CRIU_code/test/others/app-emu/screen/run.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +source ../../functions.sh || exit 1 +source ../../env.sh || exit 1 + +set -x + +echo "Creating reference objects" + +screen -d -m -S criu-zdtm +pid=$(screen -list | grep '\.*Detached' | sed 's/\s*\([0-9]*\).*/\1/'); +echo PID=$pid + +mkdir dump + +${criu} dump -D dump -o dump.log -v4 -t ${pid} || { + echo "Dump failed" + exit 1 +} + +wait_tasks dump + +echo "Dumped, restoring and waiting for completion" + +${criu} restore -d -D dump -o restore.log -v4 || { + echo "Restore failed" + exit 1 +} + +echo PASS diff --git a/CRIU_code/test/others/app-emu/tarbz/run.sh b/CRIU_code/test/others/app-emu/tarbz/run.sh new file mode 100644 index 0000000..677c6de --- /dev/null +++ 
b/CRIU_code/test/others/app-emu/tarbz/run.sh @@ -0,0 +1,73 @@ +#!/bin/bash + +source ../../functions.sh || exit 1 +source ../../env.sh || exit 1 + +DEPTH=3 +SPAN=5 +archref="arch-ref.tar.bz2" +archcr="arch.tar.bz2" + +rm -f ${archref} +rm -f ${archcr} +rm -rf tree/ +rm -rf dump/ +mkdir dump +mkdir tree + +echo "Generating tree, depth ${DEPTH} span ${SPAN}" + +function gen_sub { + local dir="${1}" + local dep="${2}" + + for i in $(seq 1 $SPAN); do + subdir="$dir/dir_$((RANDOM % 32))_$i" + subfl="$dir/file_$((RANDOM % 32))_$i" + + mkdir "$subdir" + dd if=/dev/urandom of=$subfl bs=4096 count=$((RANDOM % 32 + 16)) > /dev/null 2>&1 + + if [ $dep -gt 0 ]; then + gen_sub "$subdir" $((dep - 1)) + fi + done +} + +gen_sub "./tree/" "$DEPTH" + +set -x + +time tar cjf ${archref} tree || exit 1 + +setsid tar cjf ${archcr} tree & + +pid=${!} + +echo "Started tar in $pid background" +sleep 3 + +${criu} dump --shell-job -D dump -o dump.log -v4 -t ${pid} || { + echo "Dump failed" + exit 1 +} + +wait_tasks dump +echo "Dump OK, restoring" + +${criu} restore --shell-job -D dump -o restore.log -v4 || { + echo "Restore failed" + exit 1 +} + +echo "Finished, comparing tarballs" + +if ! cmp ${archref} ${archcr} ; then + echo "Archives differ" + echo "FAIL" +else + echo "PASS" + rm -f ${archref} + rm -f ${archcr} + rm -rf tree/ +fi diff --git a/CRIU_code/test/others/app-emu/vnc/run.sh b/CRIU_code/test/others/app-emu/vnc/run.sh new file mode 100644 index 0000000..b3a9822 --- /dev/null +++ b/CRIU_code/test/others/app-emu/vnc/run.sh @@ -0,0 +1,31 @@ +set -m + +source ../../functions.sh || exit 1 +source ../../env.sh || exit 1 + +mkdir data + +./vnc-server.sh 25 &> data/vnc.log +pid=`jobs -p %1` +bg + +$criu dump -j --tcp-established -D data/ -o dump.log -v4 -t $pid || { + echo "Dump failed" + exit 1 +} + +wait_tasks dump + +$criu restore -j --tcp-established -D data/ -d -o restore.log -v4 || { + echo "Restore failed" + exit 1 +} + +nc -w 1 localhost 5925 | grep -am 1 RFB +ret=$? 
+ +kill $pid + +[ "$ret" -eq 0 ] && echo PASS || echo FAIL; + +exit $ret diff --git a/CRIU_code/test/others/app-emu/vnc/vnc-server.sh b/CRIU_code/test/others/app-emu/vnc/vnc-server.sh new file mode 100644 index 0000000..7c8168a --- /dev/null +++ b/CRIU_code/test/others/app-emu/vnc/vnc-server.sh @@ -0,0 +1,13 @@ +#!/bin/bash +#set -x +set -m +Xvnc :25 -v -geometry 500x500 -i 0.0.0.0 -SecurityTypes none & +pid=$! +trap "kill $pid; wait" EXIT +for i in `seq 10`; do + nc -w 1 localhost 5925 | grep -am 1 RFB && break || echo Waiting + kill -0 $pid || exit 1 + sleep 1 +done +kill -STOP $$ +DISPLAY=:25 glxgears diff --git a/CRIU_code/test/others/bers/Makefile b/CRIU_code/test/others/bers/Makefile new file mode 100644 index 0000000..3034d46 --- /dev/null +++ b/CRIU_code/test/others/bers/Makefile @@ -0,0 +1,55 @@ +ifeq ($(strip $(V)),) + E = @echo + Q = @ +else + E = @\# + Q = +endif + +export E Q + +ifneq ($(USE_ASCIIDOCTOR),) +ASCIIDOC := asciidoctor +XMLTO := +else +ASCIIDOC := asciidoc +XMLTO := xmlto +endif + +SRC += bers.txt +XMLS := $(patsubst %.txt,%.xml,$(SRC)) +MANS := $(patsubst %.txt,%.8,$(SRC)) + +%.8: %.txt + $(E) " GEN " $@ +ifneq ($(USE_ASCIIDOCTOR),) + $(Q) $(ASCIIDOC) -b manpage -d manpage -o $@ $< +else + $(Q) $(ASCIIDOC) -b docbook -d manpage -o $(patsubst %.8,%.xml,$@) $< + $(Q) $(XMLTO) man --skip-validation $(patsubst %.8,%.xml,$@) 2>/dev/null +endif + +docs: $(MANS) + @true + +CFLAGS := -O0 -ggdb3 +LIBS := -lpthread + +%.o: %.c + $(E) " CC " $@ + $(Q) $(CC) -c -o $@ $(CFLAGS) $^ + +bers: bers.o # libraries are listed after the objects that reference them + $(E) " LINK " $@ + $(Q) $(CC) -o $@ $(CFLAGS) $^ $(LIBS) + +all: bers + @true + +clean: + $(E) " CLEAN " + $(Q) rm -f $(XMLS) $(MANS) + $(Q) rm -f bers.o + $(Q) rm -f bers + +.PHONY: all docs clean diff --git a/CRIU_code/test/others/bers/bers.c b/CRIU_code/test/others/bers/bers.c new file mode 100644 index 0000000..90b70c3 --- /dev/null +++ b/CRIU_code/test/others/bers/bers.c @@ -0,0 +1,418 @@ +#include +#include +#include +#include +#include +#include 
+#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#define min(x, y) ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void) (&_min1 == &_min2); \ + _min1 < _min2 ? _min1 : _min2; }) + +#define max(x, y) ({ \ + typeof(x) _max1 = (x); \ + typeof(y) _max2 = (y); \ + (void) (&_max1 == &_max2); \ + _max1 > _max2 ? _max1 : _max2; }) + +#define MAX_CHUNK 4096 +#define PAGE_SIZE 4096 + +#define pr_info(fmt, ...) \ + printf("%8d: " fmt, sys_gettid(), ##__VA_ARGS__) + +#define pr_err(fmt, ...) \ + printf("%8d: Error (%s:%d): " fmt, sys_gettid(),\ + __FILE__, __LINE__, ##__VA_ARGS__) + +#define pr_perror(fmt, ...) \ + pr_err(fmt ": %m\n", ##__VA_ARGS__) + +#define pr_msg(fmt, ...) \ + printf(fmt, ##__VA_ARGS__) + + +#define pr_trace(fmt, ...) \ + printf("%8d: %s: " fmt, sys_gettid(), __func__, \ + ##__VA_ARGS__) + +enum { + MEM_FILL_MODE_NONE = 0, + MEM_FILL_MODE_ALL = 1, + MEM_FILL_MODE_LIGHT = 2, + MEM_FILL_MODE_DIRTIFY = 3, +}; + +typedef struct { + pthread_mutex_t mutex; + pthread_mutexattr_t mutex_attr; + + size_t opt_tasks; + + size_t opt_files; + size_t opt_file_size; + int prev_fd[MAX_CHUNK]; + + size_t opt_mem; + size_t opt_mem_chunks; + size_t opt_mem_chunk_size; + int opt_mem_fill_mode; + int opt_mem_cycle_mode; + unsigned int opt_refresh_time; + + char *opt_work_dir; + int work_dir_fd; + DIR *work_dir; + + pid_t err_pid; + int err_no; + + unsigned long prev_map[MAX_CHUNK]; +} shared_data_t; + +static shared_data_t *shared; + +static int sys_gettid(void) +{ + return syscall(__NR_gettid); +} + +static void dirtify_memory(unsigned long *chunks, size_t nr_chunks, + size_t chunk_size, int mode, const size_t nr_pages) +{ + size_t i; + + pr_trace("filling memory\n"); + switch (mode) { + case MEM_FILL_MODE_LIGHT: + *((unsigned long *)chunks[0]) = -1ul; + break; + case MEM_FILL_MODE_ALL: + for (i = 0; i < nr_chunks; i++) + memset((void *)chunks[i], (char)i, chunk_size); + break; + case 
MEM_FILL_MODE_DIRTIFY: + for (i = 0; i < nr_chunks; i++) + *((unsigned long *)chunks[i]) = -1ul; + break; + } +} + +static void dirtify_files(int *fd, size_t nr_files, size_t size) +{ + size_t buf[8192]; + size_t i; + + /* + * Note we don't write any _sane_ data here, the only + * important thing is I/O activity by self. + */ + + for (i = 0; i < nr_files; i++) { + size_t c = min(size, sizeof(buf)); + size_t left = size; + + while (left > 0) { + write(fd[i], buf, c); + left -= c; + c = min(left, sizeof(buf)); + } + } +} + +static int create_files(shared_data_t *shared, int *fd, size_t nr_files) +{ + char path[PATH_MAX]; + size_t i; + + memset(fd, 0xff, sizeof(*fd) * MAX_CHUNK); + + pr_info("\tCreating %lu files\n", shared->opt_files); + + for (i = 0; i < shared->opt_files; i++) { + if (shared->prev_fd[i] != -1) { + close(shared->prev_fd[i]); + shared->prev_fd[i] = -1; + } + snprintf(path, sizeof(path), "%08d-%04d-temp", sys_gettid(), i); + fd[i] = openat(shared->work_dir_fd, path, O_RDWR | O_CREAT | O_TRUNC, 0666); + if (fd[i] < 0) { + pr_perror("Can't open %s/%s", shared->opt_work_dir, path); + shared->err_pid = sys_gettid(); + shared->err_no = -errno; + return -1; + } + shared->prev_fd[i] = fd[i]; + } + + return 0; +} + +static void work_on_fork(shared_data_t *shared) +{ + const size_t nr_pages = shared->opt_mem_chunk_size / PAGE_SIZE; + unsigned long chunks[MAX_CHUNK] = { }; + int fd[MAX_CHUNK]; + size_t i; + void *mem; + + pr_trace("locking\n"); + pthread_mutex_lock(&shared->mutex); + pr_trace("init\n"); + + pr_info("\tCreating %lu mmaps each %lu K\n", + shared->opt_mem_chunks, shared->opt_mem_chunk_size >> 10); + + for (i = 0; i < shared->opt_mem_chunks; i++) { + if (shared->prev_map[i]) { + munmap((void *)shared->prev_map[i], shared->opt_mem_chunk_size); + shared->prev_map[i] = 0; + } + + /* If we won't change proto here, the kernel might merge close areas */ + mem = mmap(NULL, shared->opt_mem_chunk_size, + PROT_READ | PROT_WRITE | ((i % 2) ? 
PROT_EXEC : 0), + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + + if (mem != (void *)MAP_FAILED) { + shared->prev_map[i] = (unsigned long)mem; + chunks[i] = (unsigned long)mem; + + pr_info("\t\tMap at %lx\n",(unsigned long)mem); + } else { + pr_info("\t\tCan't map\n"); + + shared->err_pid = sys_gettid(); + shared->err_no = -errno; + exit(1); + } + } + + if (shared->opt_mem_fill_mode) + dirtify_memory(chunks, shared->opt_mem_chunks, + shared->opt_mem_chunk_size, + shared->opt_mem_fill_mode, + nr_pages); + + if (create_files(shared, fd, shared->opt_files)) + exit(1); + + if (shared->opt_file_size) + dirtify_files(fd, shared->opt_files, shared->opt_file_size); + + pr_trace("releasing\n"); + pthread_mutex_unlock(&shared->mutex); + + while (1) { + sleep(shared->opt_refresh_time); + if (shared->opt_mem_cycle_mode) + dirtify_memory(chunks, shared->opt_mem_chunks, + shared->opt_mem_chunk_size, + shared->opt_mem_cycle_mode, + nr_pages); + if (shared->opt_file_size) + dirtify_files(fd, shared->opt_files, shared->opt_file_size); + } +} + +static int parse_mem_mode(int *mode, char *opt) +{ + if (!strcmp(opt, "all")) { + *mode = MEM_FILL_MODE_ALL; + } else if (!strcmp(opt, "light")) { + *mode = MEM_FILL_MODE_LIGHT; + } else if (!strcmp(opt, "dirtify")) { + *mode = MEM_FILL_MODE_DIRTIFY; + } else { + pr_err("Unrecognized option %s\n", opt); + return -1; + } + return 0; +} + +int main(int argc, char *argv[]) +{ + /* a - 97, z - 122, A - 65, 90 */ + static const char short_opts[] = "t:d:f:m:c:h"; + static struct option long_opts[] = { + {"tasks", required_argument, 0, 't'}, + {"dir", required_argument, 0, 'd'}, + {"files", required_argument, 0, 'f'}, + {"memory", required_argument, 0, 'm'}, + {"mem-chunks", required_argument, 0, 'c'}, + {"help", no_argument, 0, 'h'}, + {"mem-fill", required_argument, 0, 10}, + {"mem-cycle", required_argument, 0, 11}, + {"refresh", required_argument, 0, 12}, + {"file-size", required_argument, 0, 13}, + { }, + }; + + char workdir[PATH_MAX]; + int opt, idx, 
pidfd; + char pidbuf[32]; + pid_t pid; + size_t i; + + shared = (void *)mmap(NULL, sizeof(*shared), PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_SHARED, -1, 0); + if ((void *)shared == MAP_FAILED) { + pr_err("Failed to setup shared data\n"); + exit(1); + } + + pthread_mutexattr_init(&shared->mutex_attr); + pthread_mutexattr_setpshared(&shared->mutex_attr, PTHREAD_PROCESS_SHARED); + pthread_mutex_init(&shared->mutex, &shared->mutex_attr); + + /* + * Default options. + */ + shared->opt_mem_chunks = 1; + shared->opt_refresh_time = 1; + shared->opt_tasks = 1; + shared->opt_mem = 1 << 20ul; + memset(shared->prev_fd, 0xff, sizeof(shared->prev_fd)); + + /* Long-only options come back from getopt_long() as the codes 10..13 from long_opts. */ + while (1) { + idx = -1; + opt = getopt_long(argc, argv, short_opts, long_opts, &idx); + if (opt == -1) + break; + + switch(opt) { + case 't': + shared->opt_tasks = (size_t)atol(optarg); + break; + case 'f': + shared->opt_files = (size_t)atol(optarg); + break; + case 'm': + /* In megabytes */ + shared->opt_mem = (size_t)atol(optarg) << 20ul; + break; + case 'c': + shared->opt_mem_chunks = (size_t)atol(optarg); + break; + case 'd': + shared->opt_work_dir = optarg; + break; + case 'h': + goto usage; + break; + case 10: + if (parse_mem_mode(&shared->opt_mem_fill_mode, optarg)) + goto usage; + /* --mem-fill must not fall through and overwrite the --mem-cycle mode */ + break; + case 11: + if (parse_mem_mode(&shared->opt_mem_cycle_mode, optarg)) + goto usage; + break; + case 12: + shared->opt_refresh_time = (unsigned int)atoi(optarg); + break; + case 13: + shared->opt_file_size = (size_t)atol(optarg); + break; + } + } + + if (!shared->opt_work_dir) { + shared->opt_work_dir = getcwd(workdir, sizeof(workdir)); + if (!shared->opt_work_dir) { + pr_perror("Can't fetch current working dir"); + exit(1); + } + shared->opt_work_dir = workdir; + } + + /* A zero chunk count would divide by zero when the chunk size is computed below. */ + if (!shared->opt_mem_chunks) + shared->opt_mem_chunks = 1; + + if (shared->opt_mem_chunks > MAX_CHUNK) + shared->opt_mem_chunks = MAX_CHUNK; + + if (shared->opt_files > MAX_CHUNK) + shared->opt_files = MAX_CHUNK; + + shared->work_dir = opendir(shared->opt_work_dir); + if (!shared->work_dir) { + pr_perror("Can't open working dir `%s'", 
shared->opt_work_dir); + exit(1); + } + shared->work_dir_fd = dirfd(shared->work_dir); + + shared->opt_mem_chunk_size = shared->opt_mem / shared->opt_mem_chunks; + + if (shared->opt_mem_chunk_size && + shared->opt_mem_chunk_size < PAGE_SIZE) { + pr_err("Memory chunk size is too small, provide at least %lu M of memory\n", + (shared->opt_mem_chunks * PAGE_SIZE) >> 20ul); + exit(1); + } + + for (i = 0; i < shared->opt_tasks; i++) { + if (shared->err_no) + goto err_child; + + pid = fork(); + if (pid < 0) { + printf("Can't create fork: %m\n"); + exit(1); + } else if (pid == 0) { + work_on_fork(shared); + } + } + + /* + * Once everything is done and we're in cycle, + * create pidfile and go to sleep... + */ + pid = sys_gettid(); + pidfd = openat(shared->work_dir_fd, "bers.pid", O_RDWR | O_CREAT | O_TRUNC, 0666); + if (pidfd < 0) { + pr_perror("Can't open pidfile"); + exit(1); + } + snprintf(pidbuf, sizeof(pidbuf), "%d", sys_gettid()); + write(pidfd, pidbuf, strlen(pidbuf)); + close(pidfd); + pidfd = -1; + + /* + * Endless! 
+ */ + while (!shared->err_no) + sleep(1); + +err_child: + pr_err("Child %d exited with %d\n", + shared->err_pid, shared->err_no); + return shared->err_no; + +usage: + pr_msg("bers [options]\n"); + pr_msg(" -t|--tasks create of tasks\n"); + pr_msg(" -d|--dir use directory for temporary files\n"); + pr_msg(" -f|--files create files for each task\n"); + pr_msg(" -m|--memory allocate megabytes for each task\n"); + pr_msg(" --memory-chunks split memory to equal parts\n"); + pr_msg(" --mem-fill fill memory with data dependin on :\n"); + pr_msg(" all fill every byte of memory\n"); + pr_msg(" light fill first bytes of every page\n"); + pr_msg(" dirtify fill every page\n"); + pr_msg(" --mem-cycle same as --mem-fill but for cycling\n"); + pr_msg(" --refresh refresh loading of every task each \n"); + pr_msg(" --file-size write of data into each file on every refresh cycle\n"); + + return 1; +} diff --git a/CRIU_code/test/others/bers/bers.txt b/CRIU_code/test/others/bers/bers.txt new file mode 100644 index 0000000..17c0c08 --- /dev/null +++ b/CRIU_code/test/others/bers/bers.txt @@ -0,0 +1,74 @@ +bers(8) +======= +:doctype: manpage +:man source: bers +:man version: 0.0.1 +:man manual: bers manual + +NAME +---- +bers - go berserk and eat computer resources + +SYNOPSIS +-------- +*bers* ['options'] + +DESCRIPTION +----------- +*bers* is a command line utility aimed to eat resources of the computer it runs on. +Idea behind is to create a number of tasks which would trash computer resources +eating cpu and i/o time. + +OPTIONS +------- +*-t*, *--tasks* 'num':: + Create 'num' number of forks. + +*-d*, *--dir* 'dir':: + Path to 'dir' directory where temporary files will be created to load + I/O subsystem. + +*-f*, *--files* 'num':: + Create 'num' files in each task. + +*-m*, *--memory* 'num':: + Allocate 'num' megabytes of memory for every task. + +*--mem-chunks* 'num':: + Allocate memory for each task not as one slab but split + it into 'num' equal parts. 
+ +*--mem-fill* 'mode':: + Touch (write) into allocated memory once task is created. The + 'mode' might be one of the following: 'all' -- write every + single byte of the memory, 'light' -- write into first bytes + of first page of the allocated memory chunk, 'dirtify' -- write + into every page of every allocated chunk. + +*--mem-cycle* 'mode':: + Same as *--mem-fill*, but 'mode' is taken into account while the + task is cycling. By default a cycle is initiated once per second. + +*--refresh* 'second':: + Refresh load state of every task each 'second'. Refresh + here means to dirtify memory and file contents. + +*--file-size* 'bytes':: + Write 'bytes' of data into each file on every refresh cycle. + +EXAMPLE +------- + +bers -d test/bers/dump -t 256 -m 54 -c 4 -f 200 --mem-fill dirtify --mem-cycle dirtify + +We generate 256 tasks, each allocating 54 megabytes of memory split +equally into 4 memory areas. Each task opens 200 files. On creation and +cycling we touch every page of every memory area. + +AUTHOR +------ +OpenVZ team. + +COPYRIGHT +--------- +Copyright \(C) 2014, Parallels Inc. 
diff --git a/CRIU_code/test/others/crit/.gitignore b/CRIU_code/test/others/crit/.gitignore new file mode 100644 index 0000000..9614eb6 --- /dev/null +++ b/CRIU_code/test/others/crit/.gitignore @@ -0,0 +1,5 @@ +*.img +*.log +*.txt +stats-* +*.json diff --git a/CRIU_code/test/others/crit/Makefile b/CRIU_code/test/others/crit/Makefile new file mode 100644 index 0000000..75d09b6 --- /dev/null +++ b/CRIU_code/test/others/crit/Makefile @@ -0,0 +1,5 @@ +run: clean + ./test.sh + +clean: + rm -f *.img *.log *.txt stats-* *.json diff --git a/CRIU_code/test/others/crit/loop.sh b/CRIU_code/test/others/crit/loop.sh new file mode 100644 index 0000000..0ab34ce --- /dev/null +++ b/CRIU_code/test/others/crit/loop.sh @@ -0,0 +1,4 @@ +#!/bin/bash +while :; do + sleep 1 +done diff --git a/CRIU_code/test/others/crit/test.sh b/CRIU_code/test/others/crit/test.sh new file mode 100644 index 0000000..a85b4c3 --- /dev/null +++ b/CRIU_code/test/others/crit/test.sh @@ -0,0 +1,49 @@ +source ../env.sh + +images_list="" + +function _exit { + if [ "${1:-$?}" -ne 0 ]; then # use the status passed by the caller; bare $? is the status of the preceding kill/echo + echo "FAIL" + exit 1 + fi +} + +function gen_imgs { + setsid ./loop.sh < /dev/null &> /dev/null & + PID=$! + $CRIU dump -v4 -o dump.log -D ./ -t $PID + if [ $? -ne 0 ]; then + kill -9 $PID + _exit 1 + fi + + images_list=$(ls -1 *.img) + if [ -z "$images_list" ]; then + echo "Failed to generate images" + _exit 1 + fi +} + +function run_test { + for x in $images_list + do + echo "=== $x" + if [[ $x == pages* ]]; then + echo "skip" + continue + fi + + echo " -- to json" + $CRIT decode -o "$x"".json" --pretty < $x || _exit $? + echo " -- to img" + $CRIT encode -i "$x"".json" > "$x"".json.img" || _exit $? + echo " -- cmp" + cmp $x "$x"".json.img" || _exit $? 
+ + echo "=== done" + done +} + +gen_imgs +run_test diff --git a/CRIU_code/test/others/criu-coredump/.gitignore b/CRIU_code/test/others/criu-coredump/.gitignore new file mode 100644 index 0000000..a2b70d8 --- /dev/null +++ b/CRIU_code/test/others/criu-coredump/.gitignore @@ -0,0 +1,6 @@ +*.img +*.log +*.txt +stats-* +*.json +core.* diff --git a/CRIU_code/test/others/criu-coredump/Makefile b/CRIU_code/test/others/criu-coredump/Makefile new file mode 100644 index 0000000..aa684a9 --- /dev/null +++ b/CRIU_code/test/others/criu-coredump/Makefile @@ -0,0 +1,5 @@ +run: clean + ./test.sh + +clean: + rm -f *.img stats-* core.* diff --git a/CRIU_code/test/others/criu-coredump/loop.sh b/CRIU_code/test/others/criu-coredump/loop.sh new file mode 100644 index 0000000..0ab34ce --- /dev/null +++ b/CRIU_code/test/others/criu-coredump/loop.sh @@ -0,0 +1,4 @@ +#!/bin/bash +while :; do + sleep 1 +done diff --git a/CRIU_code/test/others/criu-coredump/test.sh b/CRIU_code/test/others/criu-coredump/test.sh new file mode 100644 index 0000000..1830b9d --- /dev/null +++ b/CRIU_code/test/others/criu-coredump/test.sh @@ -0,0 +1,50 @@ +source ../env.sh + +function _exit { + if [ "${1:-$?}" -ne 0 ]; then # use the status passed by the caller; bare $? is the status of the preceding kill/echo + echo "FAIL" + exit 1 + fi +} + +function gen_imgs { + setsid ./loop.sh < /dev/null &> /dev/null & + PID=$! + $CRIU dump -v4 -o dump.log -D ./ -t $PID + if [ $? -ne 0 ]; then + kill -9 $PID + _exit 1 + fi + + images_list=$(ls -1 *.img) + if [ -z "$images_list" ]; then + echo "Failed to generate images" + _exit 1 + fi +} + +function run_test { + echo "= Test core dump" + + echo "=== img to core dump" + $CRIU_COREDUMP -i ./ -o ./ || _exit $? + echo "=== done" + + cores=$(ls -1 core.*) + if [ -z "$cores" ]; then + echo "Failed to generate coredumps" + _exit 1 + fi + + for x in $cores + do + echo "=== try readelf $x" + readelf -a $x || _exit $? 
+ echo "=== done" + done + + echo "= done" +} + +gen_imgs +run_test diff --git a/CRIU_code/test/others/env.sh b/CRIU_code/test/others/env.sh new file mode 100644 index 0000000..73369ad --- /dev/null +++ b/CRIU_code/test/others/env.sh @@ -0,0 +1,8 @@ +#!/bin/sh + +CRIU=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../criu/criu) +criu=$CRIU +CRIT=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../crit/crit) +crit=$CRIT +CRIU_COREDUMP=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../criu-coredump/criu-coredump) +criu_coredump=$CRIU_COREDUMP diff --git a/CRIU_code/test/others/exec/Makefile b/CRIU_code/test/others/exec/Makefile new file mode 100644 index 0000000..7779a99 --- /dev/null +++ b/CRIU_code/test/others/exec/Makefile @@ -0,0 +1,2 @@ +run: + ./run.sh diff --git a/CRIU_code/test/others/exec/run.sh b/CRIU_code/test/others/exec/run.sh new file mode 100644 index 0000000..c3f497e --- /dev/null +++ b/CRIU_code/test/others/exec/run.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +CRIU=../../../criu/criu + +set -e -m -x + +cat < /dev/zero > /dev/null & +pid=$! 
+sleep 1 +lsof -p $pid + +$CRIU exec -t $pid fake_syscall && exit 1 || true +fd=`$CRIU exec -t $pid open '&/dev/null' 0 | sed 's/.*(\(.*\))/\1/'` +$CRIU exec -t $pid dup2 $fd 0 +wait $pid +echo PASS diff --git a/CRIU_code/test/others/ext-links/Makefile b/CRIU_code/test/others/ext-links/Makefile new file mode 100644 index 0000000..236a97d --- /dev/null +++ b/CRIU_code/test/others/ext-links/Makefile @@ -0,0 +1,4 @@ +all: mvlink.so + +mvlink.so: mvlink.c + gcc -g -Werror -Wall -shared -nostartfiles mvlink.c -o mvlink.so -iquote ../../../criu/include -fPIC diff --git a/CRIU_code/test/others/ext-links/addmv.sh b/CRIU_code/test/others/ext-links/addmv.sh new file mode 100644 index 0000000..0ccc971 --- /dev/null +++ b/CRIU_code/test/others/ext-links/addmv.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# $1 -- link name +# $2 -- file with namespace pid +if [ "$CRTOOLS_SCRIPT_ACTION" == "setup-namespaces" ]; then + $(dirname $0)/addmv_raw.sh $1 $(cat $2) +else + exit 0 +fi diff --git a/CRIU_code/test/others/ext-links/addmv_raw.sh b/CRIU_code/test/others/ext-links/addmv_raw.sh new file mode 100644 index 0000000..224f243 --- /dev/null +++ b/CRIU_code/test/others/ext-links/addmv_raw.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# $1 -- link name +# $2 -- pid of task in namespace +set -x +$ip link add link eth0 name $1 type macvlan || exit 1 +$ip link set $1 netns $2 diff --git a/CRIU_code/test/others/ext-links/mvlink.c b/CRIU_code/test/others/ext-links/mvlink.c new file mode 100644 index 0000000..a1c764d --- /dev/null +++ b/CRIU_code/test/others/ext-links/mvlink.c @@ -0,0 +1,28 @@ +#include +#include +#include +#include +#include +#include + +#include "criu-plugin.h" +#include "criu-log.h" + +extern cr_plugin_init_t cr_plugin_init; +extern cr_plugin_dump_ext_link_t cr_plugin_dump_ext_link; + +int cr_plugin_init(void) +{ + pr_info("Initialized macvlan dumper\n"); + return 0; +} + +int cr_plugin_dump_ext_link(int index, int type, char *kind) +{ + if (strcmp(kind, "macvlan")) + return -ENOTSUP; + else { + 
pr_info("Dump %d macvlan\n", index); + return 0; + } +} diff --git a/CRIU_code/test/others/ext-links/run.sh b/CRIU_code/test/others/ext-links/run.sh new file mode 100644 index 0000000..82fa393 --- /dev/null +++ b/CRIU_code/test/others/ext-links/run.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +ip=${CR_IP_TOOL:-ip} +mvln="mv0" +finf="finish" +outf="ns_output" +pidf="ns_pid" +criu="../../../criu/criu" + +export ip +export mvln +export finf +export outf +export pidf + +function fail { + $ip link del $mvln + touch $finf + echo $@ + exit 1 +} + +# Build the mvlink plugin +make + +set -x + +rm -f "$finf" "$outf" "$pidf" +rm -rf "dump" + +# Unshare netns. The run_ns will exit once ns is spawned. +unshare --net ./run_ns.sh +nspid=$(cat $pidf) +ps $nspid + +# Create and push macvlan device into it. CRIU doesn't support +# macvlans treating them as external devices. +./addmv_raw.sh $mvln $nspid || fail "Can't setup namespace" + +# Dump +sleep 1 +mkdir dump +$criu dump -t $nspid -D dump/ -o dump.log -v4 --lib $(pwd) || fail "Can't dump namespace" + +# Restore +# Ask for the pid (shouldn't change, so just as an example), ask to call +# script that will put macvlan device back into namespace +sleep 1 +rm -f $pidf +$criu restore -D dump/ -o restore.log -v4 --pidfile $(pwd)/$pidf --action-script "$(pwd)/addmv.sh $mvln $(pwd)/$pidf" -d || fail "Can't restore namespaces" + +# Finish and check results +touch $finf +set +x +while ! egrep 'PASS|FAIL' $outf; do + echo "Waiting" + sleep 1 +done diff --git a/CRIU_code/test/others/ext-links/run_ns.sh b/CRIU_code/test/others/ext-links/run_ns.sh new file mode 100644 index 0000000..878f038 --- /dev/null +++ b/CRIU_code/test/others/ext-links/run_ns.sh @@ -0,0 +1,10 @@ +#!/bin/bash +set -x +echo "NS: $$" >> $outf +echo "Links before:" >> $outf +$ip link list >> $outf 2>&1 +# Detach from session, terminal and parent +setsid ./run_wait.sh < /dev/null >> $outf 2>&1 & +# Keep pid for future reference :) +echo "$!" 
> $pidf +exit 0 diff --git a/CRIU_code/test/others/ext-links/run_wait.sh b/CRIU_code/test/others/ext-links/run_wait.sh new file mode 100644 index 0000000..d600e37 --- /dev/null +++ b/CRIU_code/test/others/ext-links/run_wait.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +echo "Wait: $$" +while [ ! -e "$finf" ]; do + echo "WAIT ($$)" + sleep 1; +done + +echo "Links after:" +$ip link list + +# The mvln device (exported from run.sh) should exits in +# namespace after we get restored +echo "Check for $mvln:" +$ip link list $mvln && echo "PASS" || echo "FAIL" diff --git a/CRIU_code/test/others/ext-tty/run.py b/CRIU_code/test/others/ext-tty/run.py new file mode 100644 index 0000000..f44b1d9 --- /dev/null +++ b/CRIU_code/test/others/ext-tty/run.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python2 +import subprocess +import os, sys, time, signal, pty + +master, slave = pty.openpty() + +p = subprocess.Popen(["setsid", "--ctty", "sleep", "10000"], + stdin = slave, stdout = slave, stderr = slave, close_fds = True) +st = os.stat("/proc/self/fd/%d" % slave) +ttyid = "tty[%x:%x]" % (st.st_rdev, st.st_dev) +os.close(slave) +time.sleep(1) + +ret = subprocess.Popen(["../../../criu/criu", "dump", "-t", str(p.pid), "-v4", "--external", ttyid]).wait() +if ret: + sys.exit(ret) +p.wait() + +new_master, slave = pty.openpty() # get another pty pair +os.close(master) + +ttyid = "fd[%d]:tty[%x:%x]" % (slave, st.st_rdev, st.st_dev) + +ret = subprocess.Popen(["../../../criu/criu", "restore", "-v4", "--inherit-fd", ttyid, "--restore-sibling", "--restore-detach"]).wait() +if ret: + sys.exit(ret) +os.close(slave) +os.waitpid(-1, os.WNOHANG) # is the process alive + +os.close(new_master) +_, status = os.wait() +if not os.WIFSIGNALED(status) or os.WTERMSIG(status) != signal.SIGHUP: + print(status) + sys.exit(1) + +print("PASS") diff --git a/CRIU_code/test/others/functions.sh b/CRIU_code/test/others/functions.sh new file mode 100644 index 0000000..2ec66cb --- /dev/null +++ b/CRIU_code/test/others/functions.sh @@ -0,0 
+1,16 @@ +# Wait while tasks are dying, otherwise PIDs would be busy. + +function wait_tasks() +{ + local dump=$1 + local pid + + for i in $dump/core-*.img; do + pid=`expr "$i" : '.*/core-\([0-9]*\).img'` + while :; do + kill -0 $pid > /dev/null 2>&1 || break; + echo Waiting the process $pid + sleep 0.1 + done + done +} diff --git a/CRIU_code/test/others/libcriu/.gitignore b/CRIU_code/test/others/libcriu/.gitignore new file mode 100644 index 0000000..6424681 --- /dev/null +++ b/CRIU_code/test/others/libcriu/.gitignore @@ -0,0 +1,6 @@ +test_errno +test_iters +test_notify +test_self +test_sub +wdir diff --git a/CRIU_code/test/others/libcriu/Makefile b/CRIU_code/test/others/libcriu/Makefile new file mode 100644 index 0000000..5289ed1 --- /dev/null +++ b/CRIU_code/test/others/libcriu/Makefile @@ -0,0 +1,26 @@ +TESTS += test_sub +TESTS += test_self +TESTS += test_notify +TESTS += test_iters +TESTS += test_errno + +all: $(TESTS) +.PHONY: all + +run: all + ./run.sh +.PHONY: run + +define genb +$(1): $(1).o lib.o + gcc $$^ -L ../../../../criu/lib/c/ -L ../../../../criu/images/ -lcriu -o $$@ +endef + +$(foreach t, $(TESTS), $(eval $(call genb, $(t)))) + +%.o: %.c + gcc -c $^ -I../../../../criu/lib/c/ -I../../../../criu/images/ -o $@ -Werror + +clean: + rm -rf $(TESTS) $(TESTS:%=%.o) lib.o +.PHONY: clean diff --git a/CRIU_code/test/others/libcriu/lib.c b/CRIU_code/test/others/libcriu/lib.c new file mode 100644 index 0000000..33aa409 --- /dev/null +++ b/CRIU_code/test/others/libcriu/lib.c @@ -0,0 +1,47 @@ +#include +#include +#include + +void what_err_ret_mean(int ret) +{ + /* NOTE: errno is set by libcriu */ + switch (ret) { + case -EBADE: + perror("RPC has returned fail"); + break; + case -ECONNREFUSED: + perror("Unable to connect to CRIU"); + break; + case -ECOMM: + perror("Unable to send/recv msg to/from CRIU"); + break; + case -EINVAL: + perror("CRIU doesn't support this type of request." 
+ "You should probably update CRIU"); + break; + case -EBADMSG: + perror("Unexpected response from CRIU." + "You should probably update CRIU"); + break; + default: + perror("Unknown error type code." + "You should probably update CRIU"); + } +} + +int chk_exit(int status, int want) +{ + if (WIFEXITED(status)) { + if (WEXITSTATUS(status) == want) + return 0; + + printf(" `- FAIL (exit %d)\n", WEXITSTATUS(status)); + } else if (WIFSIGNALED(status)) + printf(" `- FAIL (die %d)\n", WTERMSIG(status)); + else + printf(" `- FAIL (%#x)\n", status); + + return 1; +} + + diff --git a/CRIU_code/test/others/libcriu/lib.h b/CRIU_code/test/others/libcriu/lib.h new file mode 100644 index 0000000..67b784b --- /dev/null +++ b/CRIU_code/test/others/libcriu/lib.h @@ -0,0 +1,2 @@ +void what_err_ret_mean(int ret); +int chk_exit(int status, int want); diff --git a/CRIU_code/test/others/libcriu/run.sh b/CRIU_code/test/others/libcriu/run.sh new file mode 100644 index 0000000..a99b91e --- /dev/null +++ b/CRIU_code/test/others/libcriu/run.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +set -x +source ../env.sh || exit 1 + +echo "== Clean" +make clean +rm -rf wdir +rm -f ./libcriu.so.1 + +echo "== Prepare" +mkdir -p wdir/i/ + +echo "== Run tests" +ln -s ../../../../criu/lib/c/libcriu.so libcriu.so.1 +export LD_LIBRARY_PATH=. +export PATH="`dirname ${BASH_SOURCE[0]}`/../../:$PATH" + +RESULT=0 + +function run_test { + echo "== Build $1" + if ! make $1; then + echo "FAIL build $1" + RESULT=1; + else + echo "== Test $1" + mkdir wdir/i/$1/ + if ! 
setsid ./$1 ${CRIU} wdir/i/$1/ < /dev/null &>> wdir/i/$1/test.log; then + echo "$1: FAIL" + RESULT=1 + fi + fi +} + +run_test test_sub +run_test test_self +run_test test_notify +run_test test_iters +run_test test_errno + +echo "== Tests done" +unlink libcriu.so.1 +[ $RESULT -eq 0 ] && echo "Success" || echo "FAIL" +exit $RESULT diff --git a/CRIU_code/test/others/libcriu/test_errno.c b/CRIU_code/test/others/libcriu/test_errno.c new file mode 100644 index 0000000..8bd19fe --- /dev/null +++ b/CRIU_code/test/others/libcriu/test_errno.c @@ -0,0 +1,154 @@ +#include "criu.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#define PID_MAX "/proc/sys/kernel/pid_max" + +static int dir_fd; +static char *service; + +static int init(char *argv[]) +{ + service = argv[1]; + + dir_fd = open(argv[2], O_DIRECTORY); + if (dir_fd < 0) { + perror("Can't open images dir"); + return -1; + } + + return 0; +} + +static void get_base_req(void) +{ + criu_init_opts(); + criu_set_service_binary(service); + criu_set_images_dir_fd(dir_fd); + criu_set_log_level(4); +} + +static int check_resp(int ret, int expected_ret, int err, int expected_err) +{ + if (ret != expected_ret) { + fprintf(stderr, "Unexpected ret %d (%d expected)\n", ret, expected_ret); + return -1; + } + + if (err != expected_err) { + fprintf(stderr, "Unexpected errno %d (%d expected)\n", err, expected_err); + return -1; + } + + return 0; +} + +static int no_process(void) +{ + FILE *f = NULL; + size_t len; + ssize_t count; + char *buf = NULL; + int pid, ret; + + printf("--- Try to dump unexisting process\n"); + + f = fopen(PID_MAX, "r"); + if (!f) { + perror("Can't open " PID_MAX); + goto err; + } + + count = getline(&buf, &len, f); + if (count == -1) { + perror("Can't read " PID_MAX); + goto err; + } + pid = atoi(buf); + + if (!kill(pid, 0)) { + fprintf(stderr, "max pid is taken\n"); + goto err; + } + + get_base_req(); + criu_set_pid(pid); + ret = criu_dump(); + if (check_resp(ret, -EBADE, 
errno, ESRCH)) + goto err; + + printf(" `- Success\n"); + return 0; +err: + if (f) + fclose(f); + return -1; + +} + +static int process_exists(void) +{ + int ret; + + printf("--- Try to restore process which pid is already taken by other process\n"); + + get_base_req(); + criu_set_leave_running(true); + if (criu_dump()) { + fprintf(stderr, "Self-dump failed"); + goto err; + } + + get_base_req(); + ret = criu_restore(); + if (check_resp(ret, -EBADE, errno, EEXIST)) + goto err; + + printf(" `- Success\n"); + return 0; +err: + return -1; +} + +static int bad_options(void) +{ + int ret; + + printf("--- Try to send criu invalid opts\n"); + + get_base_req(); + criu_set_log_file("../file.log"); + ret = criu_dump(); + if (check_resp(ret, -EBADE, errno, EBADRQC)) + goto err; + + printf(" `- Success\n"); + return 0; +err: + return -1; +} + +int main(int argc, char *argv[]) +{ + int ret = 1; + + if (init(argv)) + goto out; + + if (no_process() || process_exists() || bad_options()) + goto out; + + ret = 0; +out: + if (dir_fd) + close(dir_fd); + + return ret; +} diff --git a/CRIU_code/test/others/libcriu/test_iters.c b/CRIU_code/test/others/libcriu/test_iters.c new file mode 100644 index 0000000..660da92 --- /dev/null +++ b/CRIU_code/test/others/libcriu/test_iters.c @@ -0,0 +1,143 @@ +#include "criu.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "lib.h" + +static int wdir_fd, cur_iter = 1, cur_imgdir = -1; + +static int stop = 0; +static void sh(int sig) +{ + stop = 1; +} + +static int open_imgdir(void) +{ + char p[10]; + + sprintf(p, "%d", cur_iter); + mkdirat(wdir_fd, p, 0700); + cur_imgdir = openat(wdir_fd, p, O_DIRECTORY); + criu_set_images_dir_fd(cur_imgdir); +} + +#define MAX_ITERS 2 + +static int next_iter(criu_predump_info pi) +{ + char p[10]; + + printf(" `- %d iter over\n", cur_iter); + + close(cur_imgdir); + sprintf(p, "../%d", cur_iter); + criu_set_parent_images(p); + + cur_iter++; + open_imgdir(); + + return 
cur_iter < MAX_ITERS; +} + +#define SUCC_ECODE 42 + +int main(int argc, char **argv) +{ + int pid, ret, p[2]; + + wdir_fd = open(argv[2], O_DIRECTORY); + if (wdir_fd < 0) { + perror("Can't open wdir"); + return 1; + } + + printf("--- Start loop ---\n"); + pipe(p); + pid = fork(); + if (pid < 0) { + perror("Can't"); + return -1; + } + + if (!pid) { + printf(" `- loop: initializing\n"); + if (setsid() < 0) + exit(1); + if (signal(SIGUSR1, sh) == SIG_ERR) + exit(1); + + close(0); + close(1); + close(2); + close(p[0]); + + ret = SUCC_ECODE; + write(p[1], &ret, sizeof(ret)); + close(p[1]); + + while (!stop) + sleep(1); + exit(SUCC_ECODE); + } + + close(p[1]); + + /* Wait for kid to start */ + ret = -1; + read(p[0], &ret, sizeof(ret)); + if (ret != SUCC_ECODE) { + printf("Error starting loop\n"); + goto err; + } + + /* Wait for pipe to get closed, then dump */ + read(p[0], &ret, 1); + close(p[0]); + + printf("--- Dump loop ---\n"); + criu_init_opts(); + criu_set_service_binary(argv[1]); + criu_set_pid(pid); + criu_set_log_file("dump.log"); + criu_set_log_level(4); + + open_imgdir(); + ret = criu_dump_iters(next_iter); + if (ret < 0) { + what_err_ret_mean(ret); + kill(pid, SIGKILL); + goto err; + } + + printf(" `- Dump succeeded\n"); + waitpid(pid, NULL, 0); + + printf("--- Restore loop ---\n"); + criu_init_opts(); + criu_set_log_level(4); + criu_set_log_file("restore.log"); + criu_set_images_dir_fd(cur_imgdir); + + pid = criu_restore_child(); + if (pid <= 0) { + what_err_ret_mean(pid); + return -1; + } + + printf(" `- Restore returned pid %d\n", pid); + kill(pid, SIGUSR1); +err: + if (waitpid(pid, &ret, 0) < 0) { + perror(" Can't wait kid"); + return -1; + } + + return chk_exit(ret, SUCC_ECODE); +} diff --git a/CRIU_code/test/others/libcriu/test_notify.c b/CRIU_code/test/others/libcriu/test_notify.c new file mode 100644 index 0000000..54fb255 --- /dev/null +++ b/CRIU_code/test/others/libcriu/test_notify.c @@ -0,0 +1,97 @@ +#include "criu.h" +#include +#include +#include 
+#include +#include +#include +#include +#include + +#include "lib.h" + +#define SUCC_ECODE 42 + +static int actions_called = 0; +static int notify(char *action, criu_notify_arg_t na) +{ + printf("ACTION: %s\n", action); + actions_called++; + return 0; +} + +int main(int argc, char **argv) +{ + int pid, ret, fd, p[2]; + + printf("--- Start loop ---\n"); + pipe(p); + pid = fork(); + if (pid < 0) { + perror("Can't"); + return -1; + } + + if (!pid) { + printf(" `- loop: initializing\n"); + if (setsid() < 0) + exit(1); + + close(0); + close(1); + close(2); + close(p[0]); + + ret = SUCC_ECODE; + write(p[1], &ret, sizeof(ret)); + close(p[1]); + + while (1) + sleep(1); + + exit(SUCC_ECODE); + } + + close(p[1]); + + /* Wait for kid to start */ + ret = -1; + read(p[0], &ret, sizeof(ret)); + if (ret != SUCC_ECODE) { + printf("Error starting loop\n"); + goto err; + } + + /* Wait for pipe to get closed, then dump */ + read(p[0], &ret, 1); + close(p[0]); + + printf("--- Dump loop ---\n"); + criu_init_opts(); + criu_set_service_binary(argv[1]); + criu_set_pid(pid); + criu_set_log_file("dump.log"); + criu_set_log_level(4); + criu_set_notify_cb(notify); + fd = open(argv[2], O_DIRECTORY); + criu_set_images_dir_fd(fd); + + ret = criu_dump(); + if (ret < 0) { + what_err_ret_mean(ret); + kill(pid, SIGKILL); + goto err; + } + + printf(" `- Dump succeeded\n"); + ret = 0; +err: + waitpid(pid, NULL, 0); + if (ret || !actions_called) { + printf("FAIL (%d/%d)\n", ret, actions_called); + return 1; + } + + printf(" `- Success (%d actions)\n", actions_called); + return 0; +} diff --git a/CRIU_code/test/others/libcriu/test_self.c b/CRIU_code/test/others/libcriu/test_self.c new file mode 100644 index 0000000..c9d2a2e --- /dev/null +++ b/CRIU_code/test/others/libcriu/test_self.c @@ -0,0 +1,96 @@ +#include "criu.h" +#include +#include +#include +#include +#include +#include +#include +#include "lib.h" + +#define SUCC_DUMP_ECODE 41 +#define SUCC_RSTR_ECODE 43 + +int main(int argc, char *argv[]) +{ 
+ int ret, fd, pid; + + fd = open(argv[2], O_DIRECTORY); + if (fd < 0) { + perror("Can't open images dir"); + return 1; + } + + criu_init_opts(); + criu_set_service_binary(argv[1]); + criu_set_images_dir_fd(fd); + criu_set_log_level(4); + + printf("--- Start child ---\n"); + pid = fork(); + if (pid < 0) { + perror("Can't"); + return 1; + } + + if (!pid) { + /* + * Child process -- dump itself, then + * parent would restore us. + */ + + close(0); + close(1); + close(2); + if (setsid() < 0) + exit(1); + + criu_set_log_file("dump.log"); + criu_set_leave_running(true); + ret = criu_dump(); + if (ret < 0) { + what_err_ret_mean(ret); + exit(1); + } + + if (ret == 0) + ret = SUCC_DUMP_ECODE; /* dumped OK */ + else if (ret == 1) + ret = SUCC_RSTR_ECODE; /* restored OK */ + else + ret = 1; + + exit(ret); + } + + printf("--- Wait for self-dump ---\n"); + if (waitpid(pid, &ret, 0) < 0) { + perror("Can't wait child"); + goto errk; + } + + if (chk_exit(ret, SUCC_DUMP_ECODE)) + goto errk; + + printf("--- Restore ---\n"); + criu_set_log_file("restore.log"); + + pid = criu_restore_child(); + if (pid <= 0) { + what_err_ret_mean(pid); + goto err; + } + + if (waitpid(pid, &ret, 0) < 0) { + perror("Can't wait rchild"); + goto errk; + } + + return chk_exit(ret, SUCC_RSTR_ECODE); + +errk: + kill(pid, SIGKILL); +err: + return 1; + +} diff --git a/CRIU_code/test/others/libcriu/test_sub.c b/CRIU_code/test/others/libcriu/test_sub.c new file mode 100644 index 0000000..b435f67 --- /dev/null +++ b/CRIU_code/test/others/libcriu/test_sub.c @@ -0,0 +1,107 @@ +#include "criu.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include "lib.h" + +static int stop = 0; +static void sh(int sig) +{ + stop = 1; +} + +#define SUCC_ECODE 42 + +int main(int argc, char **argv) +{ + int pid, ret, fd, p[2]; + + printf("--- Start loop ---\n"); + pipe(p); + pid = fork(); + if (pid < 0) { + perror("Can't"); + return -1; + } + + if (!pid) { + printf(" `- loop: initializing\n"); + 
if (setsid() < 0) + exit(1); + if (signal(SIGUSR1, sh) == SIG_ERR) + exit(1); + + close(0); + close(1); + close(2); + close(p[0]); + + ret = SUCC_ECODE; + write(p[1], &ret, sizeof(ret)); + close(p[1]); + + while (!stop) + sleep(1); + exit(SUCC_ECODE); + } + + close(p[1]); + + /* Wait for kid to start */ + ret = -1; + read(p[0], &ret, sizeof(ret)); + if (ret != SUCC_ECODE) { + printf("Error starting loop\n"); + goto err; + } + + /* Wait for pipe to get closed, then dump */ + read(p[0], &ret, 1); + close(p[0]); + + printf("--- Dump loop ---\n"); + criu_init_opts(); + criu_set_service_binary(argv[1]); + criu_set_pid(pid); + criu_set_log_file("dump.log"); + criu_set_log_level(4); + fd = open(argv[2], O_DIRECTORY); + criu_set_images_dir_fd(fd); + + ret = criu_dump(); + if (ret < 0) { + what_err_ret_mean(ret); + kill(pid, SIGKILL); + goto err; + } + + printf(" `- Dump succeeded\n"); + waitpid(pid, NULL, 0); + + printf("--- Restore loop ---\n"); + criu_init_opts(); + criu_set_log_level(4); + criu_set_log_file("restore.log"); + criu_set_images_dir_fd(fd); + + pid = criu_restore_child(); + if (pid <= 0) { + what_err_ret_mean(pid); + return -1; + } + + printf(" `- Restore returned pid %d\n", pid); + kill(pid, SIGUSR1); +err: + if (waitpid(pid, &ret, 0) < 0) { + perror(" Can't wait kid"); + return -1; + } + + return chk_exit(ret, SUCC_ECODE); +} diff --git a/CRIU_code/test/others/make/Makefile b/CRIU_code/test/others/make/Makefile new file mode 100644 index 0000000..b77b825 --- /dev/null +++ b/CRIU_code/test/others/make/Makefile @@ -0,0 +1,5 @@ +# Tests for the build system + +run: + ./uninstall.sh +.PHONY: run diff --git a/CRIU_code/test/others/make/uninstall.sh b/CRIU_code/test/others/make/uninstall.sh new file mode 100644 index 0000000..ec5a74d --- /dev/null +++ b/CRIU_code/test/others/make/uninstall.sh @@ -0,0 +1,22 @@ +#!/bin/sh +# A test to make sure "make uninstall" works as intended. 
+ +set -e +SELFDIR=$(dirname $(readlink -f $0)) +DESTDIR=$SELFDIR/test.install-$$ +cd $SELFDIR/../../.. + +set -x +make install DESTDIR=$DESTDIR +make uninstall DESTDIR=$DESTDIR +set +x + +# There should be no files left (directories are OK for now) +if [ $(find $DESTDIR -type f | wc -l) -gt 0 ]; then + echo "Files left after uninstall:" + find $DESTDIR -type f + echo "FAIL" + exit 1 +fi + +echo PASS diff --git a/CRIU_code/test/others/mem-snap/Makefile b/CRIU_code/test/others/mem-snap/Makefile new file mode 100644 index 0000000..7779a99 --- /dev/null +++ b/CRIU_code/test/others/mem-snap/Makefile @@ -0,0 +1,2 @@ +run: + ./run.sh diff --git a/CRIU_code/test/others/mem-snap/run-predump-2.sh b/CRIU_code/test/others/mem-snap/run-predump-2.sh new file mode 100644 index 0000000..46af806 --- /dev/null +++ b/CRIU_code/test/others/mem-snap/run-predump-2.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +source ../env.sh || exit 1 + +function fail { + echo "$@" + exit 1 +} +set -x + +IMGDIR="dump/" + +rm -rf "$IMGDIR" +mkdir "$IMGDIR" + +function launch_test { + echo "Launching test" + cd ../../zdtm/static/ + make cleanout + make maps04 + make maps04.pid || fail "Can't start test" + PID=$(cat maps04.pid) + kill -0 $PID || fail "Test didn't start" + cd - +} + +function stop_test { + wtime=1 + cd ../../zdtm/static/ + make maps04.stop + cat maps04.out | fgrep PASS || fail "Test failed" + echo "OK" +} + +launch_test + +echo "Taking plain dump" + +mkdir "$IMGDIR/dump-1/" +${CRIU} dump -D "$IMGDIR/dump-1/" -o dump.log -t ${PID} -v4 || fail "Fail to dump" + +sleep 1 +echo "Restore to check it works" +${CRIU} restore -D "${IMGDIR}/dump-1/" -o restore.log -d -v4 || fail "Fail to restore server" + +stop_test + + +launch_test + +echo "Taking pre and plain dumps" + +echo "Pre-dump" +mkdir "$IMGDIR/dump-2/" +mkdir "$IMGDIR/dump-2/pre/" +${CRIU} pre-dump -D "$IMGDIR/dump-2/pre/" -o dump.log -t ${PID} -v4 || fail "Fail to pre-dump" + +echo "Plain dump" +mkdir "$IMGDIR/dump-2/plain/" +${CRIU} dump -D 
"$IMGDIR/dump-2/plain/" -o dump.log -t ${PID} -v4 --prev-images-dir=../pre/ --track-mem || fail "Fail to dump" + +sleep 1 +echo "Restore" +${CRIU} restore -D "${IMGDIR}/dump-2/plain/" -o restore.log -d -v4 || fail "Fail to restore server" + +stop_test diff --git a/CRIU_code/test/others/mem-snap/run-predump.sh b/CRIU_code/test/others/mem-snap/run-predump.sh new file mode 100644 index 0000000..d06d2d8 --- /dev/null +++ b/CRIU_code/test/others/mem-snap/run-predump.sh @@ -0,0 +1,77 @@ +#!/bin/bash + +source ../env.sh || exit 1 + +USEPS=0 + +if [ "$1" = "-s" ]; then + echo "Will test via page-server" + USEPS=1 + shift +fi + +NRSNAP=${1:-3} +SPAUSE=${2:-4} +PORT=12345 + +function fail { + echo "$@" + exit 1 +} +set -x + +IMGDIR="dump/" + +rm -rf "$IMGDIR" +mkdir "$IMGDIR" + +echo "Launching test" +cd ../../zdtm/static/ +make cleanout +make mem-touch +make mem-touch.pid || fail "Can't start test" +PID=$(cat mem-touch.pid) +kill -0 $PID || fail "Test didn't start" +cd - + +echo "Making $NRSNAP pre-dumps" + +for SNAP in $(seq 1 $NRSNAP); do + sleep $SPAUSE + mkdir "$IMGDIR/$SNAP/" + if [ $SNAP -eq 1 ] ; then + # First pre-dump + cmd="pre-dump" + args="--track-mem -R" + elif [ $SNAP -eq $NRSNAP ]; then + # Last dump + cmd="dump" + args="--prev-images-dir=../$((SNAP - 1))/ --track-mem" + else + # Other pre-dumps + cmd="pre-dump" + args="--prev-images-dir=../$((SNAP - 1))/ --track-mem -R" + fi + + if [ $USEPS -eq 1 ]; then + ${CRIU} page-server -D "${IMGDIR}/$SNAP/" -o ps.log --port ${PORT} -v4 & + PS_PID=$! 
+ ps_args="--page-server --address 127.0.0.1 --port=${PORT}" + else + ps_args="" + fi + + ${CRIU} $cmd -D "${IMGDIR}/$SNAP/" -o dump.log -t ${PID} -v4 $args $ps_args || fail "Fail to dump" + if [ $USEPS -eq 1 ]; then + wait $PS_PID + fi +done + +echo "Restoring" +${CRIU} restore -D "${IMGDIR}/$NRSNAP/" -o restore.log -d -v4 || fail "Fail to restore server" + +cd ../../zdtm/static/ +make mem-touch.stop +cat mem-touch.out | fgrep PASS || fail "Test failed" + +echo "Test PASSED" diff --git a/CRIU_code/test/others/mem-snap/run-snap-auto-dedup.sh b/CRIU_code/test/others/mem-snap/run-snap-auto-dedup.sh new file mode 100644 index 0000000..f77aa1f --- /dev/null +++ b/CRIU_code/test/others/mem-snap/run-snap-auto-dedup.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +source ../env.sh || exit 1 + +USEPS=0 + +if [ "$1" = "-s" ]; then + echo "Will test via page-server" + USEPS=1 + shift +fi + +NRSNAP=${1:-3} +SPAUSE=${2:-4} +PORT=12345 + +function fail { + echo "$@" + exit 1 +} +set -x + +IMGDIR="dump/" + +rm -rf "$IMGDIR" +mkdir "$IMGDIR" + +echo "Launching test" +cd ../../zdtm/static/ +make cleanout +make mem-touch +make mem-touch.pid || fail "Can't start test" +PID=$(cat mem-touch.pid) +kill -0 $PID || fail "Test didn't start" +cd - + +echo "Making $NRSNAP snapshots" + +for SNAP in $(seq 1 $NRSNAP); do + sleep $SPAUSE + mkdir "$IMGDIR/$SNAP/" + if [ $SNAP -eq 1 ] ; then + # First snapshot -- no parent, keep running + args="--track-mem -R" + elif [ $SNAP -eq $NRSNAP ]; then + # Last snapshot -- has parent, kill afterwards + size_first_2=$(du -sh -BK dump/2/pages-*.img | grep -Eo '[0-9]+' | head -1) + size_first_1=$(du -sh -BK dump/1/pages-*.img | grep -Eo '[0-9]+' | head -1) + args="--prev-images-dir=../$((SNAP - 1))/ --track-mem --auto-dedup" + else + # Other snapshots -- have parent, keep running + args="--prev-images-dir=../$((SNAP - 1))/ --track-mem -R" + fi + + if [ $USEPS -eq 1 ]; then + ${CRIU} page-server -D "${IMGDIR}/$SNAP/" -o ps.log --auto-dedup --port ${PORT} -v4 & + 
PS_PID=$! + ps_args="--page-server --address 127.0.0.1 --port=${PORT}" + else + ps_args="" + fi + + ${CRIU} dump -D "${IMGDIR}/$SNAP/" -o dump.log -t ${PID} -v4 $args $ps_args || fail "Fail to dump" + if [ $USEPS -eq 1 ]; then + wait $PS_PID + fi +done + +size_last_2=$(du -sh -BK dump/2/pages-*.img | grep -Eo '[0-9]+' | head -1) +size_last_1=$(du -sh -BK dump/1/pages-*.img | grep -Eo '[0-9]+' | head -1) + +dedup_ok_2=1 +if [ $size_first_2 -gt $size_last_2 ]; then + dedup_ok_2=0 +fi + +dedup_ok_1=1 +if [ $size_first_1 -gt $size_last_1 ]; then + dedup_ok_1=0 +fi + +echo "Restoring" +${CRIU} restore -D "${IMGDIR}/$NRSNAP/" -o restore.log -d -v4 || fail "Fail to restore server" + +cd ../../zdtm/static/ +make mem-touch.stop +cat mem-touch.out | fgrep PASS || fail "Test failed" + +if [[ $dedup_ok_2 -ne 0 || $dedup_ok_1 -ne 0 ]]; then + fail "Dedup test failed" +fi + +echo "Test PASSED" diff --git a/CRIU_code/test/others/mem-snap/run-snap-dedup-on-restore.sh b/CRIU_code/test/others/mem-snap/run-snap-dedup-on-restore.sh new file mode 100644 index 0000000..6ae050b --- /dev/null +++ b/CRIU_code/test/others/mem-snap/run-snap-dedup-on-restore.sh @@ -0,0 +1,87 @@ +#!/bin/bash + +source ../env.sh || exit 1 + +USEPS=0 + +if [ "$1" = "-s" ]; then + echo "Will test via page-server" + USEPS=1 + shift +fi + +NRSNAP=${1:-3} +SPAUSE=${2:-4} +PORT=12345 + +function fail { + echo "$@" + exit 1 +} +set -x + +IMGDIR="dump/" + +rm -rf "$IMGDIR" +mkdir "$IMGDIR" + +echo "Launching test" +cd ../../zdtm/static/ +make cleanout +make mem-touch +make mem-touch.pid || fail "Can't start test" +PID=$(cat mem-touch.pid) +kill -0 $PID || fail "Test didn't start" +cd - + +echo "Making $NRSNAP snapshots" + +for SNAP in $(seq 1 $NRSNAP); do + sleep $SPAUSE + mkdir "$IMGDIR/$SNAP/" + if [ $SNAP -eq 1 ] ; then + # First snapshot -- no parent, keep running + args="--track-mem -R" + elif [ $SNAP -eq $NRSNAP ]; then + # Last snapshot -- has parent, kill afterwards + args="--prev-images-dir=../$((SNAP - 1))/ 
--track-mem --auto-dedup" + else + # Other snapshots -- have parent, keep running + args="--prev-images-dir=../$((SNAP - 1))/ --track-mem -R --auto-dedup" + fi + + if [ $USEPS -eq 1 ]; then + ${CRIU} page-server -D "${IMGDIR}/$SNAP/" -o ps.log --auto-dedup --port ${PORT} -v4 & + PS_PID=$! + ps_args="--page-server --address 127.0.0.1 --port=${PORT}" + else + ps_args="" + fi + + ${CRIU} dump -D "${IMGDIR}/$SNAP/" -o dump.log -t ${PID} -v4 $args $ps_args || fail "Fail to dump" + if [ $USEPS -eq 1 ]; then + wait $PS_PID + fi +done + +echo "Restoring" +${CRIU} restore -D "${IMGDIR}/$NRSNAP/" -o restore.log --auto-dedup -d -v4 || fail "Fail to restore server" + +size_last3=$(du -sh -BK dump/3/pages-*.img | grep -Eo '[0-9]+' | head -1) +size_last2=$(du -sh -BK dump/2/pages-*.img | grep -Eo '[0-9]+' | head -1) +size_last1=$(du -sh -BK dump/1/pages-*.img | grep -Eo '[0-9]+' | head -1) + +restore_dedup_ok=0 +if [[ $size_last1 -ne 0 || $size_last2 -ne 0 || $size_last3 -ne 0 ]]; then + restore_dedup_ok=1 +fi + +cd ../../zdtm/static/ +make mem-touch.stop +cat mem-touch.out | fgrep PASS || fail "Test failed" + +if [ $restore_dedup_ok -ne 0 ]; then + fail "Dedup test failed" +fi + +echo "Test PASSED" diff --git a/CRIU_code/test/others/mem-snap/run-snap-dedup.sh b/CRIU_code/test/others/mem-snap/run-snap-dedup.sh new file mode 100644 index 0000000..27fcd55 --- /dev/null +++ b/CRIU_code/test/others/mem-snap/run-snap-dedup.sh @@ -0,0 +1,99 @@ +#!/bin/bash + +source ../env.sh || exit 1 + +USEPS=0 + +if [ "$1" = "-s" ]; then + echo "Will test via page-server" + USEPS=1 + shift +fi + +NRSNAP=${1:-3} +SPAUSE=${2:-4} +PORT=12345 + +function fail { + echo "$@" + exit 1 +} +set -x + +IMGDIR="dump/" + +rm -rf "$IMGDIR" +mkdir "$IMGDIR" + +echo "Launching test" +cd ../../zdtm/static/ +make cleanout +make mem-touch +make mem-touch.pid || fail "Can't start test" +PID=$(cat mem-touch.pid) +kill -0 $PID || fail "Test didn't start" +cd - + +echo "Making $NRSNAP snapshots" + +for SNAP in $(seq 1 
$NRSNAP); do + sleep $SPAUSE + mkdir "$IMGDIR/$SNAP/" + if [ $SNAP -eq 1 ] ; then + # First snapshot -- no parent, keep running + args="--track-mem -R" + elif [ $SNAP -eq $NRSNAP ]; then + # Last snapshot -- has parent, kill afterwards + args="--prev-images-dir=../$((SNAP - 1))/ --track-mem" + else + # Other snapshots -- have parent, keep running + args="--prev-images-dir=../$((SNAP - 1))/ --track-mem -R" + fi + + if [ $USEPS -eq 1 ]; then + ${CRIU} page-server -D "${IMGDIR}/$SNAP/" -o ps.log --port ${PORT} -v4 & + PS_PID=$! + ps_args="--page-server --address 127.0.0.1 --port=${PORT}" + else + ps_args="" + fi + + ${CRIU} dump -D "${IMGDIR}/$SNAP/" -o dump.log -t ${PID} -v4 $args $ps_args || fail "Fail to dump" + if [ $USEPS -eq 1 ]; then + wait $PS_PID + fi +done + +echo "Dedup test" + +size_first_2=$(du -sh -BK dump/2/pages-*.img | grep -Eo '[0-9]+' | head -1) +size_first_1=$(du -sh -BK dump/1/pages-*.img | grep -Eo '[0-9]+' | head -1) + +${CRIU} dedup -D "${IMGDIR}/$NRSNAP/" + +size_last_2=$(du -sh -BK dump/2/pages-*.img | grep -Eo '[0-9]+' | head -1) +size_last_1=$(du -sh -BK dump/1/pages-*.img | grep -Eo '[0-9]+' | head -1) + +dedup_ok_2=1 +dedup_ok_1=1 + +if [ $size_first_2 -gt $size_last_2 ]; then + dedup_ok_2=0 +fi + +if [ $size_first_1 -gt $size_last_1 ]; then + dedup_ok_1=0 +fi + +echo "Restoring" +${CRIU} restore -D "${IMGDIR}/$NRSNAP/" -o restore.log -d -v4 || fail "Fail to restore server" + +cd ../../zdtm/static/ +make mem-touch.stop +cat mem-touch.out | fgrep PASS || fail "Test failed" + +if [[ $dedup_ok_2 -ne 0 || $dedup_ok_1 -ne 0 ]]; then + fail "Dedup test failed" +fi + +echo "Test PASSED" diff --git a/CRIU_code/test/others/mem-snap/run-snap-maps04.sh b/CRIU_code/test/others/mem-snap/run-snap-maps04.sh new file mode 100644 index 0000000..2def909 --- /dev/null +++ b/CRIU_code/test/others/mem-snap/run-snap-maps04.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +source ../env.sh || exit 1 + +USEPS=0 + +if [ "$1" = "-s" ]; then + echo "Will test via page-server" + 
USEPS=1 + shift +fi + +NRSNAP=1 +SPAUSE=${2:-4} +PORT=12345 + +function fail { + echo "$@" + exit 1 +} +set -x + +IMGDIR="dump" +CURDIR=$(pwd) # absolute working directory, used to match the tmpfs entry in the mount table +if ! mount | fgrep "$CURDIR/$IMGDIR" ; then + rm -rf "$IMGDIR" + mkdir "$IMGDIR" + + mount -t tmpfs -o size=1500M,nr_inodes=10k,mode=700 tmpfs $IMGDIR +fi +rm -rf "$IMGDIR"/* # glob stays outside the quotes so it actually expands + +echo "Launching test" +make -C ../../zdtm/static/ cleanout +make -C ../../zdtm/static/ maps04 +make -C ../../zdtm/static/ maps04.pid || fail "Can't start test" +PID=$(cat ../../zdtm/static/maps04.pid) +kill -0 $PID || fail "Test haven't started" + +mkdir "$IMGDIR/$NRSNAP/" + +if [ $USEPS -eq 1 ] ; then + ${CRIU} page-server -D "${IMGDIR}/$NRSNAP/" -o ps.log --port ${PORT} -d -v4 #& + PS_PID=$! + ps_args="--page-server --address 127.0.0.1 --port=${PORT}" +else + ps_args="" +fi + +${CRIU} dump -D "${IMGDIR}/$NRSNAP/" -o dump.log -t ${PID} -v4 $ps_args || fail "Fail to dump" +if [ $USEPS -eq 1 ] ; then + wait $PS_PID +fi + +echo "Restoring" +${CRIU} restore -D "${IMGDIR}/$NRSNAP/" -o restore.log --auto-dedup -d -v4 || fail "Fail to restore" + +make -C ../../zdtm/static/ maps04.stop +sleep 1 + +cat "../../zdtm/static/maps04.out" | fgrep PASS || fail "Test failed" + +size=$(du -sh -BK dump/1/pages-*.img | grep -Eo '[0-9]+' | head -1) +if [ $size -ne 0 ] ; then + fail "Size not null" +fi + +echo "Test PASSED" diff --git a/CRIU_code/test/others/mem-snap/run-snap.sh b/CRIU_code/test/others/mem-snap/run-snap.sh new file mode 100644 index 0000000..b97bd29 --- /dev/null +++ b/CRIU_code/test/others/mem-snap/run-snap.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +source ../env.sh || exit 1 + +USEPS=0 + +if [ "$1" = "-s" ]; then + echo "Will test via page-server" + USEPS=1 + shift +fi + +NRSNAP=${1:-3} +SPAUSE=${2:-4} +PORT=12345 + +function fail { + echo "$@" + exit 1 +} +set -x + +IMGDIR="dump/" + +rm -rf "$IMGDIR" +mkdir "$IMGDIR" + +echo "Launching test" +cd ../../zdtm/static/ +make cleanout +make mem-touch +make mem-touch.pid || fail "Can't start test" +PID=$(cat 
mem-touch.pid) +kill -0 $PID || fail "Test didn't start" +cd - + +echo "Making $NRSNAP snapshots" + +for SNAP in $(seq 1 $NRSNAP); do + sleep $SPAUSE + mkdir "$IMGDIR/$SNAP/" + if [ $SNAP -eq 1 ] ; then + # First snapshot -- no parent, keep running + args="--track-mem -R" + elif [ $SNAP -eq $NRSNAP ]; then + # Last snapshot -- has parent, kill afterwards + args="--prev-images-dir=../$((SNAP - 1))/ --track-mem" + else + # Other snapshots -- have parent, keep running + args="--prev-images-dir=../$((SNAP - 1))/ --track-mem -R" + fi + + if [ $USEPS -eq 1 ]; then + ${CRIU} page-server -D "${IMGDIR}/$SNAP/" -o ps.log --port ${PORT} -v4 & + PS_PID=$! + ps_args="--page-server --address 127.0.0.1 --port=${PORT}" + else + ps_args="" + fi + + ${CRIU} dump -D "${IMGDIR}/$SNAP/" -o dump.log -t ${PID} -v4 $args $ps_args || fail "Fail to dump" + if [ $USEPS -eq 1 ]; then + wait $PS_PID + fi +done + +echo "Restoring" +${CRIU} restore -D "${IMGDIR}/$NRSNAP/" -o restore.log -d -v4 || fail "Fail to restore server" + +cd ../../zdtm/static/ +make mem-touch.stop +cat mem-touch.out | fgrep PASS || fail "Test failed" + +echo "Test PASSED" diff --git a/CRIU_code/test/others/mem-snap/run.sh b/CRIU_code/test/others/mem-snap/run.sh new file mode 100644 index 0000000..b3995ec --- /dev/null +++ b/CRIU_code/test/others/mem-snap/run.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +# Don't execute tests, which use maps04, they are executed by zdtm + +set -e + +#./run-predump-2.sh +./run-predump.sh +./run-snap-auto-dedup.sh +./run-snap-dedup-on-restore.sh +./run-snap-dedup.sh +#./run-snap-maps04.sh +./run-snap.sh diff --git a/CRIU_code/test/others/mnt-ext-dev/Makefile b/CRIU_code/test/others/mnt-ext-dev/Makefile new file mode 100644 index 0000000..7779a99 --- /dev/null +++ b/CRIU_code/test/others/mnt-ext-dev/Makefile @@ -0,0 +1,2 @@ +run: + ./run.sh diff --git a/CRIU_code/test/others/mnt-ext-dev/run.sh b/CRIU_code/test/others/mnt-ext-dev/run.sh new file mode 100644 index 0000000..9803a8f --- /dev/null +++ 
b/CRIU_code/test/others/mnt-ext-dev/run.sh @@ -0,0 +1,17 @@ +#!/bin/sh +set -e -x + +# construct root +python ../../zdtm.py run -t zdtm/static/env00 --iter 0 -f ns + +truncate -s 0 zdtm.loop +truncate -s 50M zdtm.loop +mkfs.ext4 -F zdtm.loop +dev=`losetup --find --show zdtm.loop` +mkdir -p ../../dev +cp -ap $dev ../../dev +export ZDTM_MNT_EXT_DEV=$dev +python ../../zdtm.py run -t zdtm/static/mnt_ext_dev || ret=$? +losetup -d $dev +unlink zdtm.loop +exit $ret diff --git a/CRIU_code/test/others/mounts/ext/Makefile b/CRIU_code/test/others/mounts/ext/Makefile new file mode 100644 index 0000000..282fba0 --- /dev/null +++ b/CRIU_code/test/others/mounts/ext/Makefile @@ -0,0 +1,13 @@ +all: ext-mount.so ns_init + +ext-mount.so: ext-mount.c + gcc -g -Werror -Wall -shared -nostartfiles ext-mount.c -o ext-mount.so -iquote ../../../include -fPIC + +ns_init: ns_init.o + gcc -static $< -o $@ + +ns_init.o: ns_init.c + gcc -c $< -o $@ + +run: all + ./run.sh diff --git a/CRIU_code/test/others/mounts/ext/ext-mount.c b/CRIU_code/test/others/mounts/ext/ext-mount.c new file mode 100644 index 0000000..e5e974b --- /dev/null +++ b/CRIU_code/test/others/mounts/ext/ext-mount.c @@ -0,0 +1,101 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "criu-plugin.h" +#include "criu-log.h" + +#define IMG_NAME "ext-mount-test-%d.img" + +extern cr_plugin_init_t cr_plugin_init; +extern cr_plugin_dump_ext_mount_t cr_plugin_dump_ext_mount; +extern cr_plugin_restore_ext_mount_t cr_plugin_restore_ext_mount; + +int cr_plugin_init(void) +{ + pr_info("Initialized ext mount c/r\n"); + return 0; +} + +int cr_plugin_dump_ext_mount(char *mountpoint, int id) +{ + char *aux, *dst; + int fd; + char img[64]; + + pr_info("Check for ext mount %s being mine\n", mountpoint); + aux = strrchr(mountpoint, '/'); + if (!aux) { + pr_err("Bad path provided\n"); + return -ENOTSUP; + } + + dst = getenv("EMP_MOUNTPOINT"); + if (!dst) { + pr_err("No EMP_MOUNTPOINT env\n"); + return -1; + } + + if 
(strcmp(aux + 1, dst)) { + pr_info("Not mine\n"); + return -ENOTSUP; + } + + pr_info("Dumping my mount %d\n", id); + snprintf(img, sizeof(img), IMG_NAME, id); + fd = openat(criu_get_image_dir(), img, + O_RDWR | O_CREAT | O_TRUNC, 0600); + if (fd < 0) { + pr_perror("Can't open image"); + return -1; + } + + close(fd); + return 0; +} + +int cr_plugin_restore_ext_mount(int id, char *mountpoint, char *old_root, int *is_file) +{ + int fd; + char img[64], src[256], *src_file; + + pr_info("Restoring my mount %d?\n", id); + snprintf(img, sizeof(img), IMG_NAME, id); + fd = openat(criu_get_image_dir(), img, O_RDONLY); + if (fd < 0) { + if (errno == ENOENT) + return -ENOTSUP; + pr_perror("Can't open my image"); + return -1; + } + close(fd); + + src_file = getenv("EMP_ROOT_P"); + if (!src_file) { + pr_err("Can't get EMP_ROOT_P env\n"); + return -1; + } + + if (creat(mountpoint, 0600) < 0) { + if (errno != EEXIST) { + pr_perror("Can't make mountpoint"); + return -1; + } + } + + if (is_file) + *is_file = 1; + + snprintf(src, sizeof(src), "/%s/%s", old_root, src_file); /* bounded: EMP_ROOT_P comes from the environment and may exceed src[] */ + pr_info("Mount %s -> %s\n", src, mountpoint); + if (mount(src, mountpoint, NULL, MS_BIND, NULL) < 0) { + pr_perror("Can't bind mount"); + return -1; + } + + return 0; +} diff --git a/CRIU_code/test/others/mounts/ext/ns_init.c b/CRIU_code/test/others/mounts/ext/ns_init.c new file mode 100644 index 0000000..e85bf9d --- /dev/null +++ b/CRIU_code/test/others/mounts/ext/ns_init.c @@ -0,0 +1,143 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void sigh(int sig) +{ +} + +int main(int argc, char **argv) +{ + int start[2]; + char res; + pid_t pid; + + /* + * Usage: + * run + */ + + if (getpid() == 1) { + int fd; + struct sigaction sa = {}; + sigset_t mask; + + if (setsid() == -1) { + fprintf(stderr, "setsid: %m\n"); + return 1; + } + + sa.sa_handler = sigh; + sigaction(SIGTERM, &sa, NULL); + + if (chdir(argv[2])) + return 1; + + fd = open(argv[3], 
O_WRONLY|O_CREAT|O_TRUNC|O_APPEND, 0600); + if (fd < 0) + return 1; + + dup2(fd, 1); + dup2(fd, 2); + close(fd); + close(0); + + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL)) { + fprintf(stderr, "mount(/, MS_REC | MS_PRIVATE): %m\n"); + return 1; + } + + mkdir("oldm", 0755); /* mkdir(2) requires a mode; omitting it passes an indeterminate value */ + if (pivot_root(".", "./oldm") < 0) + return 1; + + umount2("/oldm", MNT_DETACH); + + mkdir("/proc", 0755); /* explicit mode, same reason as above */ + if (mount("zdtm_proc", "/proc", "proc", 0, NULL)) { + fprintf(stderr, "mount(/proc): %m"); + return 1; + } + + sigemptyset(&mask); + sigaddset(&mask, SIGTERM); + sigprocmask(SIG_BLOCK, &mask, NULL); + + fd = atoi(argv[1]); + write(fd, "!", 1); + close(fd); + + sigemptyset(&mask); + sigsuspend(&mask); + + printf("Woken UP\n"); + printf("Reading %s for [%s]\n", argv[4], argv[5]); + { + FILE *f; + char buf[128]; + + f = fopen(argv[4], "r"); + if (!f) + perror("No file with message"); + else { + memset(buf, 0, sizeof(buf)); + fgets(buf, sizeof(buf), f); + fclose(f); + printf("Got [%s]\n", buf); + + if (!strcmp(buf, argv[5])) + printf("PASS\n"); + } + } + + exit(0); + } + + if (unshare(CLONE_NEWNS | CLONE_NEWPID)) + return 1; + + pipe(start); + pid = fork(); + if (pid == 0) { + char *nargv[7], aux[10]; + + close(start[0]); + sprintf(aux, "%d", start[1]); + nargv[0] = argv[0]; + nargv[1] = aux; + nargv[2] = argv[2]; + nargv[3] = argv[3]; + nargv[4] = argv[4]; + nargv[5] = argv[5]; + nargv[6] = NULL; + + execv(argv[0], nargv); + exit(0); + } + + close(start[1]); + res = 'F'; + read(start[0], &res, 1); + if (res != '!') { + printf("Failed to start\n"); + return 1; + } + + printf("Container w/ tests started\n"); + { + FILE *pidf; + pidf = fopen(argv[1], "w"); + fprintf(pidf, "%d", pid); + fclose(pidf); + } + + return 0; +} diff --git a/CRIU_code/test/others/mounts/ext/run.sh b/CRIU_code/test/others/mounts/ext/run.sh new file mode 100644 index 0000000..ff2a2d8 --- /dev/null +++ b/CRIU_code/test/others/mounts/ext/run.sh @@ -0,0 +1,125 @@ +#!/bin/bash + +set -x + +function fail { + echo "$@" + exit 1 +} 
+ +make || fail "Can't compile library or ns init" + +criu="../../../criu/criu" + +# New root for namespace +NSROOT="nsroot" +# External file with contents (exported for plugin.restore) +EMP_ROOT="external_file" +export EMP_ROOT_P="$(pwd)/$EMP_ROOT" +# Internal file as seen from namespace (exported for plugin.dump) +export EMP_MOUNTPOINT="file" +# Message in a file to check visibility +FMESSAGE="tram-pam-pam" +# Binary of namespace's init +NS_INIT="ns_init" +# File with namespace init pid +PIDF="pidf" + +start_ns() +{ + # + # Prepare the namespace's FS layout + # + mkdir $NSROOT + echo -n "$FMESSAGE" > "$EMP_ROOT" + mount --bind "$NSROOT" "$NSROOT" + mount --make-private "$NSROOT" + touch "$NSROOT/$EMP_MOUNTPOINT" + mount --bind "$EMP_ROOT" "$NSROOT/$EMP_MOUNTPOINT" || fail "Can't prepare fs for ns" + + # + # Start the namespace's init + # + cp $NS_INIT "$NSROOT/" + "./$NSROOT/$NS_INIT" "$PIDF" "$NSROOT" "log" "$EMP_MOUNTPOINT" "$FMESSAGE" || fail "Can't start namespace" + umount "$NSROOT/$EMP_MOUNTPOINT" + + echo "Namespace started, pid $(cat $PIDF)" +} + +stop_ns() +{ + # + # Kill the init + # + + kill -TERM $(cat $PIDF) + sleep 2 # Shitty, but... + umount $NSROOT + + if [ -z "$1" ]; then + rm -f "$NSROOT/log" + else + mv "$NSROOT/log" "$1" + fi + + rm -f "$PIDF" "$EMP_ROOT" "$NSROOT/$NS_INIT" "$NSROOT/log" "$NSROOT/$EMP_MOUNTPOINT" + rmdir "$NSROOT/oldm" + rmdir "$NSROOT/proc" + rmdir "$NSROOT" +} + +DDIR="dump" +rm -rf $DDIR +mkdir $DDIR + +chk_pass() +{ + tail -n1 $1 | fgrep -q "PASS" +} + +# +# Test 1: handle external mount with plugin +# + +test_plugin() +{ + echo "=== Testing how plugin works" + mkdir "$DDIR/plugin/" + start_ns + + $criu dump -D "$DDIR/plugin/" -v4 -o "dump.log" --lib=$(pwd) \ + -t $(cat pidf) || { stop_ns; return 1; } + + $criu restore -D "$DDIR/plugin/" -v4 -o "rstr.log" --lib=$(pwd) \ + -d --root="$(pwd)/$NSROOT" --pidfile=$PIDF || { stop_ns; return 1; } + + echo "Restored, checking results" + mv "$DDIR/plugin/$PIDF" . 
+ stop_ns "$DDIR/plugin/ns.log" + chk_pass "$DDIR/plugin/ns.log" +} + +test_ext_mount_map() +{ + echo "=== Testing how --ext-mount-map works" + mkdir "$DDIR/ext_mount_map/" + start_ns + + $criu dump -D "$DDIR/ext_mount_map/" -v4 -o "dump.log" \ + -t $(cat pidf) --ext-mount-map "/$EMP_MOUNTPOINT:TM" || { stop_ns; return 1; } + + $criu restore -D "$DDIR/ext_mount_map/" -v4 -o "rstr.log" \ + -d --root="$(pwd)/$NSROOT" --pidfile=$PIDF --ext-mount-map "TM:$EMP_ROOT_P" || { stop_ns; return 1; } + + echo "Restored, checking results" + mv "$DDIR/ext_mount_map/$PIDF" . + stop_ns "$DDIR/ext_mount_map/ns.log" + chk_pass "$DDIR/ext_mount_map/ns.log" +} + +test_plugin || exit 1 +test_ext_mount_map || exit 1 + +echo "All tests passed" +exit 0 diff --git a/CRIU_code/test/others/mounts/mounts.py b/CRIU_code/test/others/mounts/mounts.py new file mode 100644 index 0000000..dc65ba4 --- /dev/null +++ b/CRIU_code/test/others/mounts/mounts.py @@ -0,0 +1,31 @@ +import os +import tempfile, random + +def mount(src, dst, shared, private, slave): + cmd = "mount" + if shared: + cmd += " --make-shared" + if private: + cmd += " --make-private" + if slave: + cmd += " --make-slave" + if src: + cmd += " --bind '%s' '%s'" % (src, dst) + else: + cmd += " -t tmpfs none '%s'" % (dst) + + print(cmd) + ret = os.system(cmd) + if ret: + print("failed") + +root = tempfile.mkdtemp(prefix = "root.mount", dir = "/tmp") +mount(None, root, 1, 0, 0) +mounts = [root] + +for i in range(10): + dstdir = random.choice(mounts) + dst = tempfile.mkdtemp(prefix = "mount", dir = dstdir) + src = random.choice(mounts + [None]) + mount(src, dst, random.randint(0,100) > 50, random.randint(0,100) > 90, random.randint(0,100) > 50) + mounts.append(dst) diff --git a/CRIU_code/test/others/mounts/mounts.sh b/CRIU_code/test/others/mounts/mounts.sh new file mode 100644 index 0000000..a9a1cc8 --- /dev/null +++ b/CRIU_code/test/others/mounts/mounts.sh @@ -0,0 +1,27 @@ +[ -z "$INMNTNS" ] && { + export INMNTNS=`pwd` + export 
INMNTNS_PID=$$ + unshare -m -- setsid bash -x "$0" "$@" < /dev/null &> mounts.log & + echo $! > mounts.pid + while :; do + sleep 1 + done +} + +cd $INMNTNS + +mount --make-rprivate / + +for i in `cat /proc/self/mounts | awk '{ print $2 }'`; do + [ '/' = "$i" ] && continue + [ '/proc' = "$i" ] && continue + [ '/dev' = "$i" ] && continue + echo $i + umount -l $i +done + +python2 mounts.py +kill $INMNTNS_PID +while :; do + sleep 10 +done diff --git a/CRIU_code/test/others/mounts/run.sh b/CRIU_code/test/others/mounts/run.sh new file mode 100644 index 0000000..35927fb --- /dev/null +++ b/CRIU_code/test/others/mounts/run.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +CRIU=../../../criu/criu +set -x + +mkdir -p dump + +./mounts.sh +pid=`cat mounts.pid` +kill -0 $pid || exit + +cat /proc/$pid/mountinfo | sort -k 4 +echo "Suspend server" +${CRIU} dump -D dump -o dump.log -t $pid -v4 || { + cat dump/dump.log | grep Error + exit 1 +} +echo "Resume server" +${CRIU} restore -d -D dump -o restore.log -v4 || { + cat dump/dump.log | grep Error + exit 1 +} +cat /proc/$pid/mountinfo | sort -k 4 +kill $pid diff --git a/CRIU_code/test/others/netns_ext/Makefile b/CRIU_code/test/others/netns_ext/Makefile new file mode 100644 index 0000000..7779a99 --- /dev/null +++ b/CRIU_code/test/others/netns_ext/Makefile @@ -0,0 +1,2 @@ +run: + ./run.sh diff --git a/CRIU_code/test/others/netns_ext/_run.sh b/CRIU_code/test/others/netns_ext/_run.sh new file mode 100644 index 0000000..59070fb --- /dev/null +++ b/CRIU_code/test/others/netns_ext/_run.sh @@ -0,0 +1,4 @@ +echo $$ > $1 +while :; do + sleep 1 +done diff --git a/CRIU_code/test/others/netns_ext/run.sh b/CRIU_code/test/others/netns_ext/run.sh new file mode 100644 index 0000000..6de6c5a --- /dev/null +++ b/CRIU_code/test/others/netns_ext/run.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +set -e + +CRIU=../../../criu/criu +setsid unshare -n bash -c 'unshare -n sh _run.sh pidfile2 & unshare -n sh _run.sh pidfile3 & ip link add xxx type veth && ip link add mymacvlan1 
link xxx type macvlan mode bridge && . _run.sh pidfile' < /dev/zero &> output & +sleep 1 +while :; do + test -f pidfile && test -f pidfile2 && break; + sleep 0.1 +done + +pid=$(cat pidfile) +pid2=$(cat pidfile2) + +touch test_netns +mount --bind /proc/$pid/ns/net test_netns +touch test_netns2 +mount --bind /proc/$pid2/ns/net test_netns2 +mkdir -p images +ino=$(ls -iL test_netns | awk '{ print $1 }') +ino2=$(ls -iL test_netns2 | awk '{ print $1 }') +exec 33< test_netns +exec 34< test_netns2 +$CRIU dump -t $pid -o dump.log -D images --external net[$ino]:test_netns --external net[$ino2]:test_netns2 +cat images/dump.log | grep -B 5 Error || echo ok +$CRIU restore -o restore.log -D images --inherit-fd fd[33]:test_netns --inherit-fd fd[34]:test_netns2 -d +cat images/restore.log | grep -B 5 Error || echo ok +new_ino=$(ls -iL /proc/$pid/ns/net | awk '{ print $1 }') +new_ino2=$(ls -iL /proc/$pid2/ns/net | awk '{ print $1 }') +[ "$ino" -ne "$new_ino" ] && { + echo FAIL + exit 1 +} +[ "$ino2" -ne "$new_ino2" ] && { + echo FAIL + exit 1 +} +echo PASS +exit 0 diff --git a/CRIU_code/test/others/overlayfs/Makefile b/CRIU_code/test/others/overlayfs/Makefile new file mode 100644 index 0000000..78c246b --- /dev/null +++ b/CRIU_code/test/others/overlayfs/Makefile @@ -0,0 +1,6 @@ +run: + ./run.sh + +clean: + umount -f overlay_test/z + rm -rf overlay_test output checkpoint diff --git a/CRIU_code/test/others/overlayfs/run.sh b/CRIU_code/test/others/overlayfs/run.sh new file mode 100644 index 0000000..26e6ec2 --- /dev/null +++ b/CRIU_code/test/others/overlayfs/run.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +set -eu + +CRIU=../../../criu/criu + +setup() { + setup_mount + setsid sleep 10 3>z/file < /dev/null &> output & + PROC_PID=$! + echo "PROC_PID=$PROC_PID" + sleep 1 +} + +setup_mount() { + mkdir -p overlay_test + cd overlay_test + mkdir -p a b c z checkpoint + mount -t overlay -o lowerdir=a,upperdir=b,workdir=c overlayfs z +} + +check_criu() { + echo "Dumping $PROC_PID..." + if ! 
$CRIU dump -D checkpoint -t "${PROC_PID}"; then + echo "ERROR! dump failed" + return 1 + fi + + echo "Restoring..." + if ! $CRIU restore -d -D checkpoint; then + echo "ERROR! restore failed" + return 1 + fi + return 0 +} + +cleanup() { + kill -INT "${PROC_PID}" > /dev/null 2>&1 + umount z + cd "${ORIG_WD}" + rm -rf overlay_test +} + +main() { + ORIG_WD=$(pwd) + setup + + check_criu || { + cleanup + exit 1 + } + + cleanup + echo "OverlayFS C/R successful." + exit 0 +} + +main diff --git a/CRIU_code/test/others/pipes/Makefile b/CRIU_code/test/others/pipes/Makefile new file mode 100644 index 0000000..71b16be --- /dev/null +++ b/CRIU_code/test/others/pipes/Makefile @@ -0,0 +1,14 @@ +CFLAGS += -Wall +pipe: pipe.c +clean: + rm -f pipe +run: pipe + ./pipe - && \ + ./pipe -c && \ + ./pipe -cl && \ + ./pipe -d && \ + ./pipe -o && \ + ./pipe -r && \ + ./pipe -dc && \ + ./pipe -dcl && \ + true diff --git a/CRIU_code/test/others/pipes/pipe.c b/CRIU_code/test/others/pipes/pipe.c new file mode 100644 index 0000000..33ec60e --- /dev/null +++ b/CRIU_code/test/others/pipes/pipe.c @@ -0,0 +1,693 @@ +/* + * A simple demo/test program using criu's --inherit-fd command line + * option to restore a process with (1) an external pipe and (2) a + * new log file. + * + * Note that it's possible to restore the process without --inherit-fd, + * but when it reads from or writes to the pipe, it will get a broken + * pipe signal. + * + * Also note that changing the log file during restore has nothing to do + * with the pipe. It's just a nice feature for cases where it's desirable + * to have a restored process use a different file then the original one. + * + * The parent process spawns a child that will write messages to its + * parent through a pipe. After a couple of messages, parent invokes + * criu to checkpoint the child. Since the child exits after checkpoint, + * its pipe will be broken. 
Parent sets up a new pipe and invokes criu + * to restore the child using the new pipe (instead of the old one). + * The restored child exits after writing a couple more messages. + * + * To make sure that fd clashes are correctly handled during restore, + * child can optionally open a regular file and move it to a clashing fd. + * + * Make sure CRIU_BINARY defined below points to the right criu. + * + * $ cc -Wall -o pipe pipe.c + * $ sudo ./pipe -v + * + * The following should all succeed: + * + * $ sudo ./pipe -q && echo OK + * $ sudo ./pipe -qc && echo OK + * $ sudo ./pipe -qcl && echo OK + * $ sudo ./pipe -qd && echo OK + * $ sudo ./pipe -qdc && echo OK + * $ sudo ./pipe -qdcl && echo OK + * + * The following should all fail: + * + * $ sudo ./pipe -qn || echo $? + * $ sudo ./pipe -qo || echo $? + * $ sudo ./pipe -qr || echo $? + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef void (*sighandler_t)(int); +typedef unsigned long ulong; + +/* colors */ +#define CS_PARENT "\033[00;32m" +#define CS_CHILD "\033[00;33m" +#define CS_DUMP "\033[00;34m" +#define CS_RESTORE "\033[00;35m" +#define CE "\033[0m" + +#define die(fmt, ...) do { \ + if (!qflag) \ + fprintf(stderr, fmt ": %m\n", __VA_ARGS__); \ + if (getpid() == parent_pid) { \ + (void)kill(0, 9); \ + exit(1); \ + } \ + _exit(1); \ +} while (0) + +#define READ_FD 0 /* pipe read fd */ +#define WRITE_FD 1 /* pipe write fd */ +#define CLASH_FD 3 /* force inherit fd clash */ + +#define MAX_FORKS 3 /* child, checkpoint, restore */ + +#define CRIU_BINARY "../../../criu/criu" +#define IMG_DIR "images" +#define DUMP_LOG_FILE "dump.log" +#define RESTORE_LOG_FILE "restore.log" +#define RESTORE_PID_FILE "restore.pid" +#define INHERIT_FD_OPTION "--inherit-fd" +#define OLD_LOG_FILE "/tmp/oldlog" +#define NEW_LOG_FILE "/tmp/newlog" + +/* + * Command line options (see usage()). 
+ */ +char *cli_flags = "cdhlnoqrv"; + +int cflag; +int dflag; +int lflag; +int nflag; +int oflag; +int qflag; +int rflag; +int vflag; + +char pid_number[8]; +char inh_pipe_opt[16]; +char inh_pipe_arg[64]; +char inh_file_opt[16]; +char inh_file_arg[64]; + +char *dump_argv[] = { + "criu", "dump", + "-D", IMG_DIR, "-o", DUMP_LOG_FILE, + "-v4", + "-t", pid_number, + NULL +}; + +char *restore_argv[] = { + "criu", "restore", "-d", + "-D", IMG_DIR, "-o", RESTORE_LOG_FILE, + "--pidfile", RESTORE_PID_FILE, + "-v4", + inh_pipe_opt, inh_pipe_arg, + inh_file_opt, inh_file_arg, + NULL +}; + +int max_msgs; +int max_forks; +int parent_pid; +int child_pid; +int criu_dump_pid; +int criu_restore_pid; + +/* prototypes */ +void chld_handler(int signum); +int parent(int *pipefd); +int child(int *pipefd, int dupfd, int newfd); +void checkpoint_child(int child_pid, int *pipefd); +void restore_child(int *new_pipefd, char *old_pipe_name); +void write_to_fd(int fd, char *name, int i, int newline); +void ls_proc_fd(int fd); +char *pipe_name(int fd); +char *who(pid_t pid); +void pipe_safe(int pipefd[2]); +pid_t fork_safe(void); +void signal_safe(int signum, sighandler_t handler); +int open_safe(char *pathname, int flags); +void close_safe(int fd); +void write_safe(int fd, char *buf, int count); +int read_safe(int fd, char *buf, int count); +int dup_safe(int oldfd); +void move_fd(int oldfd, int newfd); +void mkdir_safe(char *dirname, int mode); +void unlink_safe(char *pathname); +void execv_safe(char *path, char *argv[], int ls); +pid_t waitpid_safe(pid_t pid, int *status, int options, int id); +void prctl_safe(int option, ulong arg2, ulong arg3, ulong arg4, ulong arg5); +int dup2_safe(int oldfd, int newfd); + +void usage(char *cmd) +{ + printf("Usage: %s [%s]\n", cmd, cli_flags); + printf("-c\tcause a clash during restore by opening %s as fd %d\n", + OLD_LOG_FILE, CLASH_FD); + printf("-d\tdup the pipe and write to it\n"); + printf("-l\tchange log file from %s to %s during restore\n", + 
OLD_LOG_FILE, NEW_LOG_FILE); + + printf("\n"); + printf("The following flags should cause restore failure\n"); + printf("-n\tdo not use the %s option\n", INHERIT_FD_OPTION); + printf("-o\topen the pipe via /proc//fd and write to it\n"); + printf("-r\tspecify read end of pipe during restore\n"); + + printf("\n"); + printf("Miscellaneous flags\n"); + printf("-h\tprint this help and exit\n"); + printf("-q\tquiet mode, don't print anything\n"); + printf("-v\tverbose mode (list contents of /proc//fd)\n"); + +} + +int main(int argc, char *argv[]) +{ + int ret; + int opt; + int pipefd[2]; + + max_msgs = 4; + while ((opt = getopt(argc, argv, cli_flags)) != -1) { + switch (opt) { + case 'c': cflag++; break; + case 'd': dflag++; max_msgs += 4; break; + case 'h': usage(argv[0]); return 0; + case 'l': lflag++; break; + case 'n': nflag++; break; + case 'o': oflag++; max_msgs += 4; break; + case 'q': qflag++; vflag = 0;break; + case 'r': rflag++; break; + case 'v': vflag++; qflag = 0; break; + default: usage(argv[0]); return 1; + } + } + + setbuf(stdout, NULL); + setbuf(stderr, NULL); + mkdir_safe(IMG_DIR, 0700); + + pipe_safe(pipefd); + child_pid = fork_safe(); + if (child_pid > 0) { + parent_pid = getpid(); + + signal_safe(SIGCHLD, chld_handler); + prctl_safe(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0); + + close_safe(pipefd[WRITE_FD]); + + ret = parent(pipefd); + } else { + /* child */ + int dupfd = -1; + int openfd = -1; + int logfd, flags; + + child_pid = getpid(); + + close_safe(pipefd[READ_FD]); + setsid(); + logfd = open_safe(OLD_LOG_FILE, O_WRONLY | O_APPEND | O_CREAT); + dup2_safe(logfd, 1); + dup2_safe(logfd, 2); + close(logfd); + close(0); + + /* open a regular file and move it to CLASH_FD */ + if (cflag) + move_fd(open_safe(OLD_LOG_FILE, O_WRONLY | O_APPEND | O_CREAT), CLASH_FD); + + fcntl(pipefd[WRITE_FD], F_SETFL, O_NONBLOCK | O_WRONLY); + /* open additional descriptors on the pipe and use them all */ + if (dflag) + dupfd = dup_safe(pipefd[WRITE_FD]); + if (oflag) { + 
char buf[128]; + snprintf(buf, sizeof buf, "/proc/self/fd/%d", pipefd[WRITE_FD]); + openfd = open_safe(buf, O_WRONLY); + } + + ret = child(pipefd, dupfd, openfd); + + flags = fcntl(pipefd[WRITE_FD], F_GETFL, 0); + if ((flags & O_NONBLOCK) == 0) { + printf("Unexpected flags %x\n", flags); + ret = -1; + } + } + + return ret; +} + +/* + * Parent reads message from its pipe with the child. + * After a couple of messages, it checkpoints the child + * which causes the child to exit. Parent then creates + * a new pipe and restores the child. + */ +int parent(int *pipefd) +{ + char buf[32]; + char old_pipe[32]; + int nread; + + nread = 0; + while (max_forks <= MAX_FORKS) { + if (read_safe(pipefd[READ_FD], buf, sizeof buf) == 0) + continue; + nread++; + if (vflag && nread == 1) + ls_proc_fd(-1); + + if (!qflag) { + printf("%s read %s from %s\n", who(0), buf, + pipe_name(pipefd[READ_FD])); + } + + if (nread == (max_msgs / 2)) { + checkpoint_child(child_pid, pipefd); + + if (!nflag) { + /* save the old pipe's name before closing it */ + snprintf(old_pipe, sizeof old_pipe, "%s", + pipe_name(pipefd[READ_FD])); + close_safe(pipefd[READ_FD]); + + /* create a new one */ + if (!qflag) + printf("%s creating a new pipe\n", who(0)); + pipe_safe(pipefd); + } + restore_child(pipefd, old_pipe); + } + } + + return 0; +} + +/* + * Child sends a total of max_messages messages to its + * parent, half before checkpoint and half after restore. + */ +int child(int *pipefd, int dupfd, int openfd) +{ + int i; + int fd; + int num_wfds; + struct timespec req = { 1, 0 }; + + /* + * Count the number of pipe descriptors we'll be + * writing to. At least 1 (for pipefd[WRITE_FD]) + * and at most 3. 
+ */ + num_wfds = 1; + if (dupfd >= 0) + num_wfds++; + if (openfd >= 0) + num_wfds++; + + for (i = 0; i < max_msgs; i++) { + /* print first time and after checkpoint */ + if (vflag && (i == 0 || i == (max_msgs / 2))) + ls_proc_fd(-1); + + switch (i % num_wfds) { + case 0: fd = pipefd[WRITE_FD]; break; + case 1: fd = dflag ? dupfd : openfd; break; + case 2: fd = openfd; break; + } + + write_to_fd(fd, pipe_name(pipefd[WRITE_FD]), i+1, 0); + if (cflag) + write_to_fd(CLASH_FD, "log file", i+1, 1); + + /* + * Since sleep will be interrupted by C/R, make sure + * to sleep an entire second to minimize the chance of + * writing before criu restore has exited. If criu is + * still around and we write to a broken pipe, we'll be + * killed but SIGCHLD will be delivered to criu instead + * of parent. + */ + while (nanosleep(&req, NULL)) + ; + if (!qflag) + printf("\n"); + } + + return 0; +} + +void chld_handler(int signum) +{ + int status; + pid_t pid; + + pid = waitpid_safe(-1, &status, WNOHANG, 1); + if (WIFEXITED(status)) + status = WEXITSTATUS(status); + if (pid == child_pid) { + if (!qflag) { + printf("%s %s exited with status %d\n", who(0), + who(pid), status); + } + /* if child exited successfully, we're done */ + if (status == 0) + exit(0); + /* checkpoint kills the child */ + if (status != 9) + exit(status); + } +} + +void checkpoint_child(int child_pid, int *pipefd) +{ + /* prepare -t */ + snprintf(pid_number, sizeof pid_number, "%d", child_pid); + + criu_dump_pid = fork_safe(); + if (criu_dump_pid > 0) { + int status; + pid_t pid; + + pid = waitpid_safe(criu_dump_pid, &status, 0, 2); + if (WIFEXITED(status)) + status = WEXITSTATUS(status); + if (!qflag) { + printf("%s %s exited with status %d\n", who(0), + who(pid), status); + } + if (status) + exit(status); + } else { + close(pipefd[READ_FD]); + criu_dump_pid = getpid(); + execv_safe(CRIU_BINARY, dump_argv, 0); + } +} + +void restore_child(int *new_pipefd, char *old_pipe_name) +{ + char buf[64]; + + 
criu_restore_pid = fork_safe(); + if (criu_restore_pid > 0) { + int status; + pid_t pid; + + if (!nflag) + close_safe(new_pipefd[WRITE_FD]); + + pid = waitpid_safe(criu_restore_pid, &status, 0, 3); + if (WIFEXITED(status)) + status = WEXITSTATUS(status); + if (!qflag) { + printf("%s %s exited with status %d\n", who(0), + who(pid), status); + } + if (status) + exit(status); + } else { + criu_restore_pid = getpid(); + + if (!nflag) { + /* + * We should close the read descriptor of the new pipe + * and use its write descriptor to call criu restore. + * But if rflag was set (for testing purposes), use the + * read descriptor which should cause the application to + * fail. + * + * Regardless of read or write descriptor, move it to a + * clashing fd to test inherit fd clash resolve code. + */ + if (rflag) + move_fd(new_pipefd[READ_FD], CLASH_FD); + else { + close_safe(new_pipefd[READ_FD]); + move_fd(new_pipefd[WRITE_FD], CLASH_FD); + } + + /* --inherit-fd fd[CLASH_FD]:pipe[xxxxxx] */ + snprintf(inh_pipe_opt, sizeof inh_pipe_opt, + "%s", INHERIT_FD_OPTION); + snprintf(inh_pipe_arg, sizeof inh_pipe_arg, "fd[%d]:%s", + CLASH_FD, old_pipe_name); + + if (lflag) { + /* create a new log file to replace the old one */ + int filefd = open_safe(NEW_LOG_FILE, O_WRONLY | O_APPEND | O_CREAT); + + /* --inherit-fd fd[x]:tmp/oldlog */ + snprintf(inh_file_opt, sizeof inh_file_opt, + "%s", INHERIT_FD_OPTION); + snprintf(inh_file_arg, sizeof inh_file_arg, + "fd[%d]:%s", filefd, OLD_LOG_FILE + 1); + + restore_argv[12] = inh_file_opt; + } else + restore_argv[12] = NULL; + restore_argv[10] = inh_pipe_opt; + } else + restore_argv[10] = NULL; + + snprintf(buf, sizeof buf, "%s/%s", IMG_DIR, RESTORE_PID_FILE); + unlink_safe(buf); + execv_safe(CRIU_BINARY, restore_argv, 1); + } +} + +void write_to_fd(int fd, char *name, int i, int newline) +{ + int n; + char buf[16]; /* fit "hello d\n" for small d */ + + n = snprintf(buf, sizeof buf, "hello %d", i); + if (!qflag) + printf("%s writing %s to %s via 
fd %d\n", who(0), buf, name, fd); + + if (newline) { + buf[n++] = '\n'; + buf[n] = '\0'; + } + write_safe(fd, buf, strlen(buf)); +} + +void ls_proc_fd(int fd) +{ + char cmd[128]; + + if (qflag) + return; + + if (fd == -1) + snprintf(cmd, sizeof cmd, "ls -l /proc/%d/fd", getpid()); + else + snprintf(cmd, sizeof cmd, "ls -l /proc/%d/fd/%d", getpid(), fd); + printf("%s %s\n", who(0), cmd); + system(cmd); +} + +char *pipe_name(int fd) +{ + static char pipe_name[64]; + char path[64]; + + snprintf(path, sizeof path, "/proc/self/fd/%d", fd); + if (readlink(path, pipe_name, sizeof pipe_name) == -1) + die("readlink: path=%s", path); + return pipe_name; +} + +/* + * Use two buffers to support two calls to + * this function in a printf argument list. + */ +char *who(pid_t pid) +{ + static char pidstr1[64]; + static char pidstr2[64]; + static char *cp; + char *np; + char *ep; + int p; + + p = pid ? pid : getpid(); + if (p == parent_pid) { + np = "parent"; + ep = CS_PARENT; + } else if (p == child_pid) { + np = "child"; + ep = CS_CHILD; + } else if (p == criu_dump_pid) { + np = "dump"; + ep = CS_DUMP; + } else if (p == criu_restore_pid) { + np = "restore"; + ep = CS_RESTORE; + } else + np = "???"; + + cp = (cp == pidstr1) ? pidstr2 : pidstr1; + snprintf(cp, sizeof pidstr1, "%s[%s %d]", pid ? 
"" : ep, np, p); + return cp; +} + +void pipe_safe(int pipefd[2]) +{ + if (pipe(pipefd) == -1) + die("pipe: %p", pipefd); +} + +pid_t fork_safe(void) +{ + pid_t pid; + + if ((pid = fork()) == -1) + die("fork: pid=%d", pid); + max_forks++; + return pid; +} + +void signal_safe(int signum, sighandler_t handler) +{ + if (signal(signum, handler) == SIG_ERR) + die("signal: signum=%d", signum); +} + +int open_safe(char *pathname, int flags) +{ + int fd; + + if ((fd = open(pathname, flags, 0777)) == -1) + die("open: pathname=%s", pathname); + return fd; +} + +void close_safe(int fd) +{ + if (close(fd) == -1) + die("close: fd=%d", fd); +} + +void write_safe(int fd, char *buf, int count) +{ + if (write(fd, buf, count) != count) { + die("write: fd=%d buf=\"%s\" count=%d errno=%d", + fd, buf, count, errno); + } +} + +int read_safe(int fd, char *buf, int count) +{ + int n; + + if ((n = read(fd, buf, count)) < 0) + die("read: fd=%d count=%d", fd, count); + buf[n] = '\0'; + return n; +} + +int dup_safe(int oldfd) +{ + int newfd; + + if ((newfd = dup(oldfd)) == -1) + die("dup: oldfd=%d", oldfd); + return newfd; +} + +int dup2_safe(int oldfd, int newfd) +{ + if (dup2(oldfd, newfd) != newfd) + die("dup2: oldfd=%d newfd=%d", oldfd, newfd); + return newfd; +} + +void move_fd(int oldfd, int newfd) +{ + if (oldfd != newfd) { + dup2_safe(oldfd, newfd); + close_safe(oldfd); + } +} + +void mkdir_safe(char *dirname, int mode) +{ + if (mkdir(dirname, mode) == -1 && errno != EEXIST) + die("mkdir dirname=%s mode=0x%x\n", dirname, mode); +} + +void unlink_safe(char *pathname) +{ + if (unlink(pathname) == -1 && errno != ENOENT) { + die("unlink: pathname=%s\n", pathname); + } +} + +void execv_safe(char *path, char *argv[], int ls) +{ + int i; + struct timespec req = { 0, 1000000 }; + + if (!qflag) { + printf("\n%s ", who(0)); + for (i = 0; argv[i] != NULL; i++) + printf("%s ", argv[i]); + printf("\n"); + } + + /* give parent a chance to wait for us */ + while (nanosleep(&req, NULL)) + ; + + if 
(vflag && ls) + ls_proc_fd(-1); + + execv(path, argv); + die("execv: path=%s", path); +} + +pid_t waitpid_safe(pid_t pid, int *status, int options, int id) +{ + pid_t p; + + p = waitpid(pid, status, options); + if (p == -1) + fprintf(stderr, "waitpid pid=%d id=%d %m\n", pid, id); + return p; +} + +void prctl_safe(int option, ulong arg2, ulong arg3, ulong arg4, ulong arg5) +{ + if (prctl(option, arg2, arg3, arg4, arg5) == -1) + die("prctl: option=0x%x", option); +} diff --git a/CRIU_code/test/others/rpc/.gitignore b/CRIU_code/test/others/rpc/.gitignore new file mode 100644 index 0000000..75fca39 --- /dev/null +++ b/CRIU_code/test/others/rpc/.gitignore @@ -0,0 +1,3 @@ +rpc.pb-c.* +*_pb2.py +test-c diff --git a/CRIU_code/test/others/rpc/Makefile b/CRIU_code/test/others/rpc/Makefile new file mode 100644 index 0000000..08caa0c --- /dev/null +++ b/CRIU_code/test/others/rpc/Makefile @@ -0,0 +1,45 @@ +all: test-c rpc_pb2.py criu +.PHONY: all + +CFLAGS += -g -Werror -Wall -I. +LDLIBS += -lprotobuf-c + +PYTHON ?= python + +run: all + mkdir -p build + chmod a+rwx build + rm -f build/status + sudo -g '#1000' -u '#1000' mkfifo build/status + @# Need to start the criu daemon here to access the pidfile. + @# The script read.py is used to wait until 'criu service' + @# is ready. As 'read -n 1' in some releases has a bug and does + @# not read correctly a \0, using python is a workaround. 
+ sudo -g '#1000' -u '#1000' -- bash -c "exec 200<>build/status; \ + ./criu service -v4 -W build --address criu_service.socket \ + -d --pidfile pidfile -o service.log --status-fd 200; \ + $(PYTHON) read.py build/status" + rm -f build/status + chmod a+rw build/pidfile + sudo -g '#1000' -u '#1000' ./run.sh + sudo -g '#1000' -u '#1000' ./version.py + # run the configuration file via RPC test cases + ./config_file.py build + +criu: ../../../criu/criu + cp ../../../criu/criu $@ + chmod u+s $@ + +test-c: rpc.pb-c.o test-c.o + +test-c.o: test-c.c rpc.pb-c.c + +rpc_pb2.py: rpc.proto + protoc --proto_path=. --python_out=. rpc.proto + +rpc.pb-c.c: rpc.proto + protoc-c --proto_path=. --c_out=. rpc.proto + +clean: + rm -rf build rpc.pb-c.o test-c.o test-c rpc.pb-c.c rpc.pb-c.h rpc_pb2.py rpc_pb2.pyc criu +.PHONY: clean diff --git a/CRIU_code/test/others/rpc/config_file.py b/CRIU_code/test/others/rpc/config_file.py new file mode 100644 index 0000000..23a0661 --- /dev/null +++ b/CRIU_code/test/others/rpc/config_file.py @@ -0,0 +1,192 @@ +#!/usr/bin/python2 + +import os +import socket +import sys +import rpc_pb2 as rpc +import argparse +import subprocess +from tempfile import mkstemp +import time + +log_file = 'config_file_test.log' +does_not_exist = 'does-not.exist' + + +def setup_swrk(): + print('Connecting to CRIU in swrk mode.') + css = socket.socketpair(socket.AF_UNIX, socket.SOCK_SEQPACKET) + swrk = subprocess.Popen(['./criu', "swrk", "%d" % css[0].fileno()]) + css[0].close() + return swrk, css[1] + + +def setup_config_file(content): + # Creating a temporary file which will be used as configuration file. 
+ fd, path = mkstemp() + + with os.fdopen(fd, 'w') as f: + f.write(content) + + os.environ['CRIU_CONFIG_FILE'] = path + + return path + + +def cleanup_config_file(path): + if os.environ.get('CRIU_CONFIG_FILE', None) is not None: + del os.environ['CRIU_CONFIG_FILE'] + os.unlink(path) + + +def cleanup_output(path): + for f in (does_not_exist, log_file): + f = os.path.join(path, f) + if os.access(f, os.F_OK): + os.unlink(f) + + +def setup_criu_dump_request(): + # Create criu msg, set it's type to dump request + # and set dump options. Checkout more options in protobuf/rpc.proto + req = rpc.criu_req() + req.type = rpc.DUMP + req.opts.leave_running = True + req.opts.log_level = 4 + req.opts.log_file = log_file + req.opts.images_dir_fd = os.open(args['dir'], os.O_DIRECTORY) + # Not necessary, just for testing + req.opts.tcp_established = True + req.opts.shell_job = True + return req + + +def do_rpc(s, req): + # Send request + s.send(req.SerializeToString()) + + # Recv response + resp = rpc.criu_resp() + MAX_MSG_SIZE = 1024 + resp.ParseFromString(s.recv(MAX_MSG_SIZE)) + + s.close() + return resp + + +def test_broken_configuration_file(): + # Testing RPC configuration file mode with a broken configuration file. + # This should fail + content = 'hopefully-this-option-will-never=exist' + path = setup_config_file(content) + swrk, s = setup_swrk() + s.close() + # This test is only about detecting wrong configuration files. + # If we do not sleep it might happen that we kill CRIU before + # it parses the configuration file. A short sleep makes sure + # that the configuration file has been parsed. Hopefully. 
+ # (I am sure this will fail horribly at some point) + time.sleep(0.3) + swrk.kill() + return_code = swrk.wait() + # delete temporary file again + cleanup_config_file(path) + if return_code != 1: + print('FAIL: CRIU should have returned 1 instead of %d' % return_code) + sys.exit(-1) + + +def search_in_log_file(log, message): + with open(os.path.join(args['dir'], log)) as f: + if message not in f.read(): + print('FAIL: Missing the expected error message (%s) in the log file' % message) + sys.exit(-1) + + +def check_results(resp, log): + # Check if the specified log file exists + if not os.path.isfile(os.path.join(args['dir'], log)): + print('FAIL: Expected log file %s does not exist' % log) + sys.exit(-1) + # Dump should have failed with: 'The criu itself is within dumped tree' + if resp.type != rpc.DUMP: + print('FAIL: Unexpected msg type %r' % resp.type) + sys.exit(-1) + if 'The criu itself is within dumped tree' not in resp.cr_errmsg: + print('FAIL: Missing the expected error message in RPC response') + sys.exit(-1) + # Look into the log file for the same message + search_in_log_file(log, 'The criu itself is within dumped tree') + + +def test_rpc_without_configuration_file(): + # Testing without configuration file + # Just doing a dump and checking for the logfile + req = setup_criu_dump_request() + _, s = setup_swrk() + resp = do_rpc(s, req) + s.close() + check_results(resp, log_file) + + +def test_rpc_with_configuration_file(): + # Testing with configuration file + # Just doing a dump and checking for the logfile + + # Setting a different log file via configuration file + # This should not work as RPC settings overwrite configuration + # file settings in the default configuration. 
+ log = does_not_exist + content = 'log-file ' + log + '\n' + content += 'no-tcp-established\nno-shell-job' + path = setup_config_file(content) + req = setup_criu_dump_request() + _, s = setup_swrk() + do_rpc(s, req) + s.close() + cleanup_config_file(path) + # Check if the specified log file exists + # It should not as configuration files do not overwrite RPC values. + if os.path.isfile(os.path.join(args['dir'], log)): + print('FAIL: log file %s should not exist' % log) + sys.exit(-1) + + +def test_rpc_with_configuration_file_overwriting_rpc(): + # Testing with configuration file + # Just doing a dump and checking for the logfile + + # Setting a different log file via configuration file + # This should not work as RPC settings overwrite configuration + # file settings in the default configuration. + log = does_not_exist + content = 'log-file ' + log + '\n' + content += 'no-tcp-established\nno-shell-job' + path = setup_config_file(content) + # Only set the configuration file via RPC; + # not via environment variable + del os.environ['CRIU_CONFIG_FILE'] + req = setup_criu_dump_request() + req.opts.config_file = path + _, s = setup_swrk() + resp = do_rpc(s, req) + s.close() + cleanup_config_file(path) + check_results(resp, log) + + +parser = argparse.ArgumentParser(description="Test config files using CRIU RPC") +parser.add_argument('dir', type = str, help = "Directory where CRIU images should be placed") + +args = vars(parser.parse_args()) + +cleanup_output(args['dir']) + +test_broken_configuration_file() +cleanup_output(args['dir']) +test_rpc_without_configuration_file() +cleanup_output(args['dir']) +test_rpc_with_configuration_file() +cleanup_output(args['dir']) +test_rpc_with_configuration_file_overwriting_rpc() +cleanup_output(args['dir']) diff --git a/CRIU_code/test/others/rpc/errno.py b/CRIU_code/test/others/rpc/errno.py new file mode 100644 index 0000000..ee9e90d --- /dev/null +++ b/CRIU_code/test/others/rpc/errno.py @@ -0,0 +1,135 @@ +#!/usr/bin/python2 +# 
Test criu errno + +import socket, os, errno +import rpc_pb2 as rpc +import argparse + +parser = argparse.ArgumentParser(description="Test errno reported by CRIU RPC") +parser.add_argument('socket', type = str, help = "CRIU service socket") +parser.add_argument('dir', type = str, help = "Directory where CRIU images should be placed") + +args = vars(parser.parse_args()) + +# Prepare dir for images +class test: + def __init__(self): + self.imgs_fd = os.open(args['dir'], os.O_DIRECTORY) + self.s = -1 + self._MAX_MSG_SIZE = 1024 + + def connect(self): + self.s = socket.socket(socket.AF_UNIX, socket.SOCK_SEQPACKET) + self.s.connect(args['socket']) + + def get_base_req(self): + req = rpc.criu_req() + req.opts.log_level = 4 + req.opts.images_dir_fd = self.imgs_fd + return req + + def send_req(self, req): + self.connect() + self.s.send(req.SerializeToString()) + + def recv_resp(self): + resp = rpc.criu_resp() + resp.ParseFromString(self.s.recv(self._MAX_MSG_SIZE)) + return resp + + def check_resp(self, resp, typ, err): + if resp.type != typ: + raise Exception('Unexpected responce type ' + str(resp.type)) + + if resp.success: + raise Exception('Unexpected success = True') + + if err and resp.cr_errno != err: + raise Exception('Unexpected cr_errno ' + str(resp.cr_errno)) + + def no_process(self): + print('Try to dump unexisting process') + # Get pid of non-existing process. + # Suppose max_pid is not taken by any process. + with open("/proc/sys/kernel/pid_max", "r") as f: + pid = int(f.readline()) + try: + os.kill(pid, 0) + except OSError: + pass + else: + raise Exception('max pid is taken') + + # Ask criu to dump non-existing process. 
+ req = self.get_base_req() + req.type = rpc.DUMP + req.opts.pid = pid + + self.send_req(req) + resp = self.recv_resp() + + self.check_resp(resp, rpc.DUMP, errno.ESRCH) + + print('Success') + + def process_exists(self): + print('Try to restore process which pid is already taken by other process') + + # Perform self-dump + req = self.get_base_req() + req.type = rpc.DUMP + req.opts.leave_running = True + + self.send_req(req) + resp = self.recv_resp() + + if resp.success != True: + raise Exception('Self-dump failed') + + # Ask to restore process from images of ourselves + req = self.get_base_req() + req.type = rpc.RESTORE + + self.send_req(req) + resp = self.recv_resp() + + self.check_resp(resp, rpc.RESTORE, errno.EEXIST) + + print('Success') + + def bad_options(self): + print('Try to send criu invalid opts') + + # Subdirs are not allowed in log_file + req = self.get_base_req() + req.type = rpc.DUMP + req.opts.log_file = "../file.log" + + self.send_req(req) + resp = self.recv_resp() + + self.check_resp(resp, rpc.DUMP, errno.EBADRQC) + + print('Success') + + def bad_request(self): + print('Try to send criu invalid request type') + + req = self.get_base_req() + req.type = rpc.NOTIFY + + self.send_req(req) + resp = self.recv_resp() + + self.check_resp(resp, rpc.EMPTY, None) + + print('Success') + + def run(self): + self.no_process() + self.process_exists() + self.bad_options() + self.bad_request() + +t = test() +t.run() diff --git a/CRIU_code/test/others/rpc/loop.sh b/CRIU_code/test/others/rpc/loop.sh new file mode 100644 index 0000000..0ab34ce --- /dev/null +++ b/CRIU_code/test/others/rpc/loop.sh @@ -0,0 +1,4 @@ +#!/bin/bash +while :; do + sleep 1 +done diff --git a/CRIU_code/test/others/rpc/ps_test.py b/CRIU_code/test/others/rpc/ps_test.py new file mode 100644 index 0000000..1872120 --- /dev/null +++ b/CRIU_code/test/others/rpc/ps_test.py @@ -0,0 +1,74 @@ +#!/usr/bin/python2 + +import socket, os, sys, errno +import rpc_pb2 as rpc +import argparse + +parser = 
argparse.ArgumentParser(description="Test page-server using CRIU RPC") +parser.add_argument('socket', type = str, help = "CRIU service socket") +parser.add_argument('dir', type = str, help = "Directory where CRIU images should be placed") + +args = vars(parser.parse_args()) + +# Connect to service socket +s = socket.socket(socket.AF_UNIX, socket.SOCK_SEQPACKET) +s.connect(args['socket']) + +# Start page-server +print('Starting page-server') +req = rpc.criu_req() +req.type = rpc.PAGE_SERVER +req.opts.log_file = 'page-server.log' +req.opts.log_level = 4 +req.opts.images_dir_fd = os.open(args['dir'], os.O_DIRECTORY) + +s.send(req.SerializeToString()) + +resp = rpc.criu_resp() +MAX_MSG_SIZE = 1024 +resp.ParseFromString(s.recv(MAX_MSG_SIZE)) + +if resp.type != rpc.PAGE_SERVER: + print('Unexpected msg type') + sys.exit(1) +else: + if resp.success: + # check if pid even exists + try: + os.kill(resp.ps.pid, 0) + except OSError as err: + if err.errno == errno.ESRCH: + print('No process with page-server pid %d' %(resp.ps.pid)) + else: + print('Can\'t check that process %d exists' %(resp.ps.pid)) + sys.exit(1) + print('Success, page-server pid %d started on port %u' %(resp.ps.pid, resp.ps.port)) + else: + print('Failed to start page-server') + sys.exit(1) + + +# Perform self-dump +print('Dumping myself using page-server') +req.type = rpc.DUMP +req.opts.ps.port = resp.ps.port +req.opts.ps.address = "127.0.0.1" +req.opts.log_file = 'dump.log' +req.opts.leave_running = True + +s.close() +s = socket.socket(socket.AF_UNIX, socket.SOCK_SEQPACKET) +s.connect(args['socket']) +s.send(req.SerializeToString()) + +resp.ParseFromString(s.recv(MAX_MSG_SIZE)) + +if resp.type != rpc.DUMP: + print('Unexpected msg type') + sys.exit(1) +else: + if resp.success: + print('Success') + else: + print('Fail') + sys.exit(1) diff --git a/CRIU_code/test/others/rpc/read.py b/CRIU_code/test/others/rpc/read.py new file mode 100644 index 0000000..bbf69b6 --- /dev/null +++ b/CRIU_code/test/others/rpc/read.py 
@@ -0,0 +1,17 @@ +# This script is used to read a single character from CRIU's status FD. +# That way we know when the CRIU service is ready. CRIU writes a \0 to +# the status FD. +# In theory this could be easily done using 'read -n 1' from bash, but +# but the bash version on Ubuntu has probably the following bug: +# https://lists.gnu.org/archive/html/bug-bash/2017-07/msg00039.html + +import sys + +f = open(sys.argv[1]) +r = f.read(1) +f.close() + +if r == '\0': + sys.exit(0) + +sys.exit(-1) diff --git a/CRIU_code/test/others/rpc/restore-loop.py b/CRIU_code/test/others/rpc/restore-loop.py new file mode 100644 index 0000000..ce5786a --- /dev/null +++ b/CRIU_code/test/others/rpc/restore-loop.py @@ -0,0 +1,45 @@ +#!/usr/bin/python2 + +import socket, os, sys +import rpc_pb2 as rpc +import argparse + +parser = argparse.ArgumentParser(description="Test ability to restore a process from images using CRIU RPC") +parser.add_argument('socket', type = str, help = "CRIU service socket") +parser.add_argument('dir', type = str, help = "Directory where CRIU images could be found") + +args = vars(parser.parse_args()) + +# Connect to service socket +s = socket.socket(socket.AF_UNIX, socket.SOCK_SEQPACKET) +s.connect(args['socket']) + +# Create criu msg, set it's type to dump request +# and set dump options. Checkout more options in protobuf/rpc.proto +req = rpc.criu_req() +req.type = rpc.RESTORE +req.opts.images_dir_fd = os.open(args['dir'], os.O_DIRECTORY) +# As the dumped process is running with setsid this should not +# be necessary. There seems to be a problem for this testcase +# in combination with alpine's setsid. +# The dump is now done with -j and the restore also. 
+req.opts.shell_job = True + +# Send request +s.send(req.SerializeToString()) + +# Recv response +resp = rpc.criu_resp() +MAX_MSG_SIZE = 1024 +resp.ParseFromString(s.recv(MAX_MSG_SIZE)) + +if resp.type != rpc.RESTORE: + print('Unexpected msg type') + sys.exit(-1) +else: + if resp.success: + print('Restore success') + else: + print('Restore fail') + sys.exit(-1) + print("PID of the restored program is %d\n" %(resp.restore.pid)) diff --git a/CRIU_code/test/others/rpc/rpc.proto b/CRIU_code/test/others/rpc/rpc.proto new file mode 100644 index 0000000..f65c529 --- /dev/null +++ b/CRIU_code/test/others/rpc/rpc.proto @@ -0,0 +1 @@ +../../../images/rpc.proto \ No newline at end of file diff --git a/CRIU_code/test/others/rpc/run.sh b/CRIU_code/test/others/rpc/run.sh new file mode 100644 index 0000000..d1facd8 --- /dev/null +++ b/CRIU_code/test/others/rpc/run.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +set -e + +CRIU=./criu + +export PROTODIR=`readlink -f "${PWD}/../../protobuf"` + +echo $PROTODIR + +function title_print { + echo -e "\n**************************************************" + echo -e "\t\t"$1 + echo -e "**************************************************\n" + +} + +function stop_server { + title_print "Shutdown service server" + kill -SIGTERM $(cat build/pidfile) + unlink build/pidfile +} + +function test_c { + mkdir -p build/imgs_c + + title_print "Run test-c" + setsid ./test-c build/criu_service.socket build/imgs_c < /dev/null &>> build/output_c + + title_print "Restore test-c" + ${CRIU} restore -v4 -o restore-c.log -D build/imgs_c +} + +function test_py { + mkdir -p build/imgs_py + + title_print "Run test-py" + setsid ./test.py build/criu_service.socket build/imgs_py < /dev/null &>> build/output_py + + title_print "Restore test-py" + ${CRIU} restore -v4 -o restore-py.log -D build/imgs_py +} + +function test_restore_loop { + mkdir -p build/imgs_loop + + title_print "Run loop.sh" + setsid ./loop.sh < /dev/null &> build/loop.log & + P=${!} + echo "pid ${P}" + + 
title_print "Dump loop.sh" + # So theoretically '-j' (--shell-job) should not be necessary, but on alpine + # this test fails without it. + ${CRIU} dump -j -v4 -o dump-loop.log -D build/imgs_loop -t ${P} + + title_print "Run restore-loop" + ./restore-loop.py build/criu_service.socket build/imgs_loop + kill -SIGTERM ${P} +} + +function test_ps { + mkdir -p build/imgs_ps + + title_print "Run ps_test" + setsid ./ps_test.py build/criu_service.socket build/imgs_ps < /dev/null &>> build/output_ps +} + +function test_errno { + mkdir -p build/imgs_errno + + title_print "Run cr_errno test" + setsid ./errno.py build/criu_service.socket build/imgs_errno < /dev/null &>> build/output_errno +} + +trap 'echo "FAIL"; stop_server' EXIT + +test_c +test_py +test_restore_loop +test_ps +test_errno + +stop_server + +trap 'echo "Success"' EXIT diff --git a/CRIU_code/test/others/rpc/test-c.c b/CRIU_code/test/others/rpc/test-c.c new file mode 100644 index 0000000..751f9ba --- /dev/null +++ b/CRIU_code/test/others/rpc/test-c.c @@ -0,0 +1,170 @@ +#include "rpc.pb-c.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_MSG_SIZE 1024 + +static CriuResp *recv_resp(int socket_fd) +{ + unsigned char buf[MAX_MSG_SIZE]; + int len; + CriuResp *msg = 0; + + len = read(socket_fd, buf, MAX_MSG_SIZE); + if (len == -1) { + perror("Can't read response"); + return NULL; + } + + msg = criu_resp__unpack(NULL, len, buf); + if (!msg) { + perror("Failed unpacking response"); + return NULL; + } + + return msg; +} + +static int send_req(int socket_fd, CriuReq *req) +{ + unsigned char buf[MAX_MSG_SIZE]; + int len; + + len = criu_req__get_packed_size(req); + + if (criu_req__pack(req, buf) != len) { + perror("Failed packing request"); + return -1; + } + + if (write(socket_fd, buf, len) == -1) { + perror("Can't send request"); + return -1; + } + + return 0; +} + +int main(int argc, char *argv[]) +{ + CriuReq req = CRIU_REQ__INIT; + CriuResp *resp = NULL; + int fd, dir_fd; + 
int ret = 0; + struct sockaddr_un addr; + socklen_t addr_len; + + if (argc != 3) { + fprintf(stderr, "Usage: test-c criu-service.socket imgs_dir"); + return -1; + } + + /* + * Open a directory, in which criu will + * put images + */ + + puts(argv[2]); + dir_fd = open(argv[2], O_DIRECTORY); + if (dir_fd == -1) { + perror("Can't open imgs dir"); + return -1; + } + + /* + * Set "DUMP" type of request. + * Allocate CriuDumpReq. + */ + req.type = CRIU_REQ_TYPE__DUMP; + req.opts = malloc(sizeof(CriuOpts)); + if (!req.opts) { + perror("Can't allocate memory for dump request"); + return -1; + } + + criu_opts__init(req.opts); + + /* + * Set dump options. + * Checkout more in protobuf/rpc.proto. + */ + req.opts->has_leave_running = true; + req.opts->leave_running = true; + req.opts->images_dir_fd = dir_fd; + req.opts->has_log_level = true; + req.opts->log_level = 4; + + /* + * Connect to service socket + */ + fd = socket(AF_LOCAL, SOCK_SEQPACKET, 0); + if (fd == -1) { + perror("Can't create socket"); + return -1; + } + + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_LOCAL; + + strcpy(addr.sun_path, argv[1]); + + addr_len = strlen(addr.sun_path) + sizeof(addr.sun_family); + + ret = connect(fd, (struct sockaddr *) &addr, addr_len); + if (ret == -1) { + perror("Cant connect to socket"); + goto exit; + } + + /* + * Send request + */ + ret = send_req(fd, &req); + if (ret == -1) { + perror("Can't send request"); + goto exit; + } + + /* + * Recv response + */ + resp = recv_resp(fd); + if (!resp) { + perror("Can't recv response"); + ret = -1; + goto exit; + } + + if (resp->type != CRIU_REQ_TYPE__DUMP) { + perror("Unexpected response type"); + ret = -1; + goto exit; + } + + /* + * Check response. 
+ */ + if (resp->success) + puts("Success"); + else { + puts("Fail"); + ret = -1; + goto exit; + } + + if (resp->dump->has_restored && resp->dump->restored) + puts("Restored"); + +exit: + close(fd); + close(dir_fd); + if (resp) + criu_resp__free_unpacked(resp, NULL); + return ret; +} diff --git a/CRIU_code/test/others/rpc/test.py b/CRIU_code/test/others/rpc/test.py new file mode 100644 index 0000000..0addbae --- /dev/null +++ b/CRIU_code/test/others/rpc/test.py @@ -0,0 +1,81 @@ +#!/usr/bin/python2 + +import socket, os, sys +import rpc_pb2 as rpc +import argparse + +parser = argparse.ArgumentParser(description="Test dump/restore using CRIU RPC") +parser.add_argument('socket', type = str, help = "CRIU service socket") +parser.add_argument('dir', type = str, help = "Directory where CRIU images should be placed") + +args = vars(parser.parse_args()) + +# Connect to service socket +s = socket.socket(socket.AF_UNIX, socket.SOCK_SEQPACKET) +s.connect(args['socket']) + +# Create criu msg, set it's type to dump request +# and set dump options. Checkout more options in protobuf/rpc.proto +req = rpc.criu_req() +req.type = rpc.DUMP +req.opts.leave_running = True +req.opts.log_level = 4 +req.opts.images_dir_fd = os.open(args['dir'], os.O_DIRECTORY) + +# Send request +s.send(req.SerializeToString()) + +# Recv response +resp = rpc.criu_resp() +MAX_MSG_SIZE = 1024 +resp.ParseFromString(s.recv(MAX_MSG_SIZE)) + +if resp.type != rpc.DUMP: + print('Unexpected msg type') + sys.exit(-1) +else: + if resp.success: + print('Success') + else: + print('Fail') + sys.exit(-1) + + if resp.dump.restored: + print('Restored') + +# Connect to service socket +s = socket.socket(socket.AF_UNIX, socket.SOCK_SEQPACKET) +s.connect(args['socket']) + +# Create criu msg, set it's type to dump request +# and set dump options. 
print('Connecting to CRIU in swrk mode to check the version:')

# One end of the socket pair is handed to "criu swrk" by descriptor number;
# this script keeps the other end and closes its copy of the first.
css = socket.socketpair(socket.AF_UNIX, socket.SOCK_SEQPACKET)
swrk = subprocess.Popen(['./criu', "swrk", "%d" % css[0].fileno()])
css[0].close()

s = css[1]

# Build a criu message whose type is a version request.
# More request types and options are listed in protobuf/rpc.proto
req = rpc.criu_req()
req.type = rpc.VERSION

# Send request
s.send(req.SerializeToString())

# Recv response
MAX_MSG_SIZE = 1024
resp = rpc.criu_resp()
resp.ParseFromString(s.recv(MAX_MSG_SIZE))

if resp.type != rpc.VERSION:
    print('RPC: Unexpected msg type')
    sys.exit(-1)

if not resp.success:
    print('Fail')
    sys.exit(-1)

print('RPC: Success')
print('CRIU major %d' % resp.version.major_number)
print('CRIU minor %d' % resp.version.minor_number)

# The remaining version fields are optional: print only those that are set.
for field in ('gitid', 'sublevel', 'extra', 'name'):
    if resp.version.HasField(field):
        print('CRIU %s %s' % (field, getattr(resp.version, field)))
]; then + grps=( $(groups) ) + newgrp ${grps[2]} +fi + +while :; do + sleep 1 +done diff --git a/CRIU_code/test/others/security/run.sh b/CRIU_code/test/others/security/run.sh new file mode 100644 index 0000000..e75ab44 --- /dev/null +++ b/CRIU_code/test/others/security/run.sh @@ -0,0 +1,89 @@ +#!/bin/bash + +set -x + +PID= + +function run_as { + echo "== Run ${LOOP} as $1" + echo ${PIDFILE} + rm -f ${PIDFILE} + su $1 -c "setsid ${LOOP} ${PIDFILE} $2 < /dev/null &> /dev/null &" + for i in `seq 100`; do + test -f ${PIDFILE} && break + sleep 1 + done + PID=`cat ${PIDFILE}` + echo ${PID} +} + +function dump_as { + test -d ${IMGS} && rm -rf ${IMGS} + mkdir -p ${IMGS} + echo "== Dump ${PID} as $@" + su $@ -c "${CRIU} dump --tree ${PID} --images-dir ${IMGS}" + return $? +} + +function rstr_as { + echo "== Restore ${IMGS} as $@" + su $@ -c "${CRIU} restore --images-dir ${IMGS} --restore-detached" + return $? +} + +function result { + local BGRED='\033[41m' + local BGGREEN='\033[42m' + local NORMAL=$(tput sgr0) + + if [ $1 -ne 0 ]; then + echo -e "${BGRED}FAIL${NORMAL}" + exit 1 + else + echo -e "${BGGREEN}PASS${NORMAL}" + fi +} + +function test_root { + echo "==== Check that non-root can't dump/restore process owned by root" + + run_as ${ROOT} + + dump_as ${USR1} ; result $((!$?)) + dump_as ${ROOT} ; result $(($?)) + + rstr_as ${USR1} ; result $((!$?)) + rstr_as ${ROOT} ; result $(($?)) + + kill -SIGKILL ${PID} +} + +function test_other { + echo "==== Check that user2 can't dump/restore process owned by user1" + + run_as ${USR1} + + dump_as ${USR2} ; result $((!$?)) + dump_as ${USR1} ; result $(($?)) + + rstr_as ${USR2} ; result $((!$?)) + rstr_as ${USR1} ; result $(($?)) + + kill -SIGKILL ${PID} +} + +function test_own { + echo "==== Check that user1 can dump/restore his own process that changes it's gid to one from groups" + + run_as ${USR1} "--chgrp" + + dump_as ${USR1} ; result $(($?)) + + rstr_as ${USR1} ; result $(($?)) + + kill -SIGKILL ${PID} +} + +test_root 
+test_other +test_own diff --git a/CRIU_code/test/others/shell-job/Makefile b/CRIU_code/test/others/shell-job/Makefile new file mode 100644 index 0000000..d81733e --- /dev/null +++ b/CRIU_code/test/others/shell-job/Makefile @@ -0,0 +1,2 @@ +run: + ../../zdtm_ct run.py diff --git a/CRIU_code/test/others/shell-job/run.py b/CRIU_code/test/others/shell-job/run.py new file mode 100644 index 0000000..4f4dfad --- /dev/null +++ b/CRIU_code/test/others/shell-job/run.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python2 +import os, pty, sys, subprocess +import termios, fcntl, time + +cr_bin = "../../../criu/criu" + +os.chdir(os.getcwd()) + +def create_pty(): + (fd1, fd2) = pty.openpty() + return (os.fdopen(fd1, "w+"), os.fdopen(fd2, "w+")) + +if not os.access("work", os.X_OK): + os.mkdir("work", 0755) + +open("running", "w").close() +m,s = create_pty() +p = os.pipe() +pr = os.fdopen(p[0], "r") +pw = os.fdopen(p[1], "w") + +pid = os.fork() +if pid == 0: + m.close() + os.setsid() + os.dup2(s.fileno(), 0) + os.dup2(s.fileno(), 1) + os.dup2(s.fileno(), 2) + fcntl.ioctl(s.fileno(), termios.TIOCSCTTY, 1) + pr.close() + pw.close() + while True: + if not os.access("running", os.F_OK): + sys.exit(0) + time.sleep(1) + sys.exit(1) + +pw.close() +pr.read(1) + +cmd = [cr_bin, "dump", "-j", "-t", str(pid), "-D", "work", "-v"] +print("Run: %s" % " ".join(cmd)) +ret = subprocess.Popen(cmd).wait() +if ret != 0: + sys.exit(1) +os.wait() + +os.unlink("running") +m,s = create_pty() +cpid = os.fork() +if cpid == 0: + os.setsid() + fcntl.ioctl(m.fileno(), termios.TIOCSCTTY, 1) + cmd = [cr_bin, "restore", "-j", "-D", "work", "-v"] + print("Run: %s" % " ".join(cmd)) + ret = subprocess.Popen([cr_bin, "restore", "-j", "-D", "work", "-v"]).wait() + if ret != 0: + sys.exit(1) + sys.exit(0) + +pid, status = os.wait() +if status != 0: + print("A child process exited with %d" % status) + sys.exit(1) diff --git a/CRIU_code/test/others/socketpairs/Makefile b/CRIU_code/test/others/socketpairs/Makefile new file mode 
100644 index 0000000..dbb152c --- /dev/null +++ b/CRIU_code/test/others/socketpairs/Makefile @@ -0,0 +1,9 @@ +CFLAGS += -Wall +socketpair: socketpair.c +clean: + rm -f socketpair +run: socketpair + ./socketpair && \ + ./socketpair -v && \ + ./socketpair -m4 && \ + true diff --git a/CRIU_code/test/others/socketpairs/socketpair.c b/CRIU_code/test/others/socketpairs/socketpair.c new file mode 100644 index 0000000..d0c1bbd --- /dev/null +++ b/CRIU_code/test/others/socketpairs/socketpair.c @@ -0,0 +1,600 @@ +/* + * A simple demo/test program using criu's --inherit-fd command line + * option to restore a process with an external unix socket. + * Extending inherit's logic to unix sockets created by socketpair(..) syscall. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef void (*sighandler_t)(int); +typedef unsigned long ulong; + +/* colors */ +#define CS_PARENT "\033[00;32m" +#define CS_CHILD "\033[00;33m" +#define CS_DUMP "\033[00;34m" +#define CS_RESTORE "\033[00;35m" +#define CE "\033[0m" + +#define die(fmt, ...) do { \ + fprintf(stderr, fmt ": %m\n", __VA_ARGS__); \ + if (getpid() == parent_pid) { \ + (void)kill(0, 9); \ + exit(1); \ + } \ + _exit(1); \ +} while (0) + +#define READ_FD 0 /* pipe read fd */ +#define WRITE_FD 1 /* pipe write fd */ +#define CLASH_FD 3 /* force inherit fd clash */ + +#define MAX_FORKS 3 /* child, checkpoint, restore */ + +#define CRIU_BINARY "../../../criu/criu" +#define IMG_DIR "images" +#define DUMP_LOG_FILE "dump.log" +#define RESTORE_LOG_FILE "restore.log" +#define RESTORE_PID_FILE "restore.pid" +#define INHERIT_FD_OPTION "--inherit-fd" +#define OLD_LOG_FILE "/tmp/oldlog" +#define NEW_LOG_FILE "/tmp/newlog" + +/* + * Command line options (see usage()). 
+ */ + +char *cli_flags = "hm:nv"; +int max_msgs = 10; +int vflag; +int nflag; + +char pid_number[8]; +char inh_unixsk_opt[16]; +char inh_unixsk_arg[64]; +char external_sk_ino[32]; + +char *dump_argv[] = { + "criu", "dump", + "-D", IMG_DIR, "-o", DUMP_LOG_FILE, + "-v4", + external_sk_ino, + "-t", pid_number, + NULL +}; + +char *restore_argv[] = { + "criu", "restore", "-d", + "-D", IMG_DIR, "-o", RESTORE_LOG_FILE, + "--pidfile", RESTORE_PID_FILE, + "-v4", "-x", + inh_unixsk_opt, inh_unixsk_arg, + NULL +}; + +int max_forks; +int parent_pid; +int child_pid; +int criu_dump_pid; +int criu_restore_pid; + +/* prototypes */ +void chld_handler(int signum); +int parent(int *socketfd, const char *ino_child_sk); +int child(int *socketfd, int dupfd, int newfd); +void checkpoint_child(int child_pid, int *old_socket_namefd); +void restore_child(int *new_socketfd, const char *old_socket_name); +void write_to_fd(int fd, char *name, int i, int newline); +void ls_proc_fd(int fd); +char *socket_name(int fd); +ino_t socket_inode(int fd); +char *who(pid_t pid); +void socketpair_safe(int socketfd[2]); +pid_t fork_safe(void); +void signal_safe(int signum, sighandler_t handler); +int open_safe(char *pathname, int flags); +void close_safe(int fd); +void write_safe(int fd, char *buf, int count); +int read_safe(int fd, char *buf, int count); +int dup_safe(int oldfd); +void move_fd(int oldfd, int newfd); +void mkdir_safe(char *dirname, int mode); +void unlink_safe(char *pathname); +void execv_safe(char *path, char *argv[], int ls); +pid_t waitpid_safe(pid_t pid, int *status, int options, int id); +void prctl_safe(int option, ulong arg2, ulong arg3, ulong arg4, ulong arg5); +int dup2_safe(int oldfd, int newfd); + +void usage(char *cmd) +{ + printf("Usage: %s [%s]\n", cmd, cli_flags); + printf("-h\tprint this help and exit\n"); + printf("-m\tcount of send messages (by default 10 will send from child) \n"); + printf("-n\tdo not use the %s option\n", INHERIT_FD_OPTION); + printf("-v\tverbose mode 
(list contents of /proc//fd)\n"); +} + +int main(int argc, char *argv[]) +{ + int ret; + int opt; + int socketfd[2]; + + while ((opt = getopt(argc, argv, cli_flags)) != -1) { + switch (opt) { + case 'h': + usage(argv[0]); + return 0; + case 'm': + max_msgs = atoi(optarg); + break; + case 'n': + nflag++; + break; + case 'v': + vflag++; + break; + case '?': + if ('m' == optopt) + fprintf (stderr, "Option -%c requires an argument.\n", optopt); + else + fprintf ( + stderr, + "Unknown option character `\\x%x'.\n", + optopt); + return 1; + default: + usage(argv[0]); + return 1; + } + } + + setbuf(stdout, NULL); + setbuf(stderr, NULL); + mkdir_safe(IMG_DIR, 0700); + + socketpair_safe(socketfd); + child_pid = fork_safe(); + if (child_pid > 0) { + parent_pid = getpid(); + + signal_safe(SIGCHLD, chld_handler); + prctl_safe(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0); + + snprintf(external_sk_ino, sizeof(external_sk_ino), "--ext-unix-sk=%u", + (unsigned int)socket_inode(socketfd[WRITE_FD])); + + char unix_sk_ino[32] = {0}; + strcpy(unix_sk_ino, socket_name(socketfd[WRITE_FD])); + close_safe(socketfd[WRITE_FD]); + ret = parent(socketfd, unix_sk_ino); + } else { + /* child */ + int dupfd = -1; + int openfd = -1; + int logfd; + + child_pid = getpid(); + + close_safe(socketfd[READ_FD]); + setsid(); + logfd = open_safe(OLD_LOG_FILE, O_WRONLY | O_APPEND | O_CREAT); + dup2_safe(logfd, 1); + dup2_safe(logfd, 2); + close(logfd); + close(0); + + ret = child(socketfd, dupfd, openfd); + } + + return ret; +} + +/* + * Parent reads message from its pipe with the child. + * After a couple of messages, it checkpoints the child + * which causes the child to exit. Parent then creates + * a new pipe and restores the child. 
+ */ +int parent(int *socketfd, const char *ino_child_sk) +{ + char buf[32]; + int nread; + + nread = 0; + while (max_forks <= MAX_FORKS) { + if (read_safe(socketfd[READ_FD], buf, sizeof buf) == 0) + continue; + nread++; + if (vflag && nread == 1) + ls_proc_fd(-1); + + printf( + "%s read %s from %s\n", + who(0), buf, + socket_name(socketfd[READ_FD])); + + + if (nread == (max_msgs / 2)) { + checkpoint_child(child_pid, socketfd); + + if (!nflag) { + close_safe(socketfd[READ_FD]); + + /* create a new one */ + printf("%s creating a new socket\n", who(0)); + socketpair_safe(socketfd); + } + + restore_child(socketfd, ino_child_sk); + } + } + + return 0; +} + +/* + * Child sends a total of max_messages messages to its + * parent, half before checkpoint and half after restore. + */ +int child(int *socketfd, int dupfd, int openfd) +{ + int i; + int fd; + int num_wfds; + struct timespec req = { 1, 0 }; + + /* + * Count the number of pipe descriptors we'll be + * writing to. At least 1 (for socketfd[WRITE_FD]) + * and at most 3. + */ + num_wfds = 1; + if (dupfd >= 0) + num_wfds++; + if (openfd >= 0) + num_wfds++; + + for (i = 0; i < max_msgs; i++) { + /* print first time and after checkpoint */ + if (vflag && (i == 0 || i == (max_msgs / 2))) + ls_proc_fd(-1); + + switch (i % num_wfds) { + case 0: fd = socketfd[WRITE_FD]; break; + case 1: fd = openfd; break; + case 2: fd = openfd; break; + } + + write_to_fd(fd, socket_name(socketfd[WRITE_FD]), i+1, 0); + /* + * Since sleep will be interrupted by C/R, make sure + * to sleep an entire second to minimize the chance of + * writing before criu restore has exited. If criu is + * still around and we write to a broken pipe, we'll be + * killed but SIGCHLD will be delivered to criu instead + * of parent. 
+ */ + while (nanosleep(&req, NULL)) + ; + printf("\n"); + } + + return 0; +} + +void chld_handler(int signum) +{ + int status; + pid_t pid; + + pid = waitpid_safe(-1, &status, WNOHANG, 1); + if (WIFEXITED(status)) + status = WEXITSTATUS(status); + if (pid == child_pid) { + printf("%s %s exited with status %d\n", who(0), + who(pid), status); + /* if child exited successfully, we're done */ + if (status == 0) + exit(0); + /* checkpoint kills the child */ + if (status != 9) + exit(status); + } +} + +void checkpoint_child(int child_pid, int *socketfd) +{ + /* prepare -t */ + snprintf(pid_number, sizeof pid_number, "%d", child_pid); + + criu_dump_pid = fork_safe(); + if (criu_dump_pid > 0) { + int status; + pid_t pid; + + pid = waitpid_safe(criu_dump_pid, &status, 0, 2); + if (WIFEXITED(status)) + status = WEXITSTATUS(status); + printf("%s %s exited with status %d\n", who(0), + who(pid), status); + if (status) + exit(status); + } else { + close(socketfd[READ_FD]); + criu_dump_pid = getpid(); + execv_safe(CRIU_BINARY, dump_argv, 0); + } +} + +void restore_child(int *new_socketfd, const char *old_sock_name) +{ + char buf[64]; + + criu_restore_pid = fork_safe(); + if (criu_restore_pid > 0) { + int status; + pid_t pid; + + if (!nflag) + close_safe(new_socketfd[WRITE_FD]); + + pid = waitpid_safe(criu_restore_pid, &status, 0, 3); + if (WIFEXITED(status)) + status = WEXITSTATUS(status); + + printf("%s %s exited with status %d\n", who(0), + who(pid), status); + + if (status) + exit(status); + } else { + criu_restore_pid = getpid(); + + if (!nflag) { + close_safe(new_socketfd[READ_FD]); + move_fd(new_socketfd[WRITE_FD], CLASH_FD); + + /* --inherit-fd fd[CLASH_FD]:socket[xxxxxx] */ + snprintf(inh_unixsk_opt, sizeof inh_unixsk_opt, + "%s", INHERIT_FD_OPTION); + snprintf(inh_unixsk_arg, sizeof inh_unixsk_arg, "fd[%d]:%s", + CLASH_FD, old_sock_name); + + restore_argv[11] = inh_unixsk_opt; + restore_argv[13] = NULL; + } else + restore_argv[11] = NULL; + + snprintf(buf, sizeof buf, 
"%s/%s", IMG_DIR, RESTORE_PID_FILE); + unlink_safe(buf); + execv_safe(CRIU_BINARY, restore_argv, 1); + } +} + +void write_to_fd(int fd, char *name, int i, int newline) +{ + int n; + char buf[16]; /* fit "hello d\n" for small d */ + + n = snprintf(buf, sizeof buf, "hello %d", i); + + printf("%s writing %s to %s via fd %d\n", who(0), buf, name, fd); + + if (newline) { + buf[n++] = '\n'; + buf[n] = '\0'; + } + write_safe(fd, buf, strlen(buf)); +} + +void ls_proc_fd(int fd) +{ + char cmd[128]; + + if (fd == -1) + snprintf(cmd, sizeof cmd, "ls -l /proc/%d/fd", getpid()); + else + snprintf(cmd, sizeof cmd, "ls -l /proc/%d/fd/%d", getpid(), fd); + printf("%s %s\n", who(0), cmd); + system(cmd); +} + +char *socket_name(int fd) +{ + static char sock_name[64]; + char path[64]; + + snprintf(path, sizeof path, "/proc/self/fd/%d", fd); + if (readlink(path, sock_name, sizeof sock_name) == -1) + die("readlink: path=%s", path); + return sock_name; +} + +ino_t socket_inode(int fd) +{ + struct stat sbuf; + + if (fstat(fd, &sbuf) == -1) + die("fstat: fd=%i", fd); + + return sbuf.st_ino; +} + +/* + * Use two buffers to support two calls to + * this function in a printf argument list. + */ +char *who(pid_t pid) +{ + static char pidstr1[64]; + static char pidstr2[64]; + static char *cp; + char *np; + char *ep; + int p; + + p = pid ? pid : getpid(); + if (p == parent_pid) { + np = "parent"; + ep = CS_PARENT; + } else if (p == child_pid) { + np = "child"; + ep = CS_CHILD; + } else if (p == criu_dump_pid) { + np = "dump"; + ep = CS_DUMP; + } else if (p == criu_restore_pid) { + np = "restore"; + ep = CS_RESTORE; + } else + np = "???"; + + cp = (cp == pidstr1) ? pidstr2 : pidstr1; + snprintf(cp, sizeof pidstr1, "%s[%s %d]", pid ? 
"" : ep, np, p); + return cp; +} + +void socketpair_safe(int socketfd[2]) +{ + if (socketpair(AF_UNIX, SOCK_STREAM, 0, socketfd) == -1) + die("socketpair %p", socketfd); +} + +pid_t fork_safe(void) +{ + pid_t pid; + + if ((pid = fork()) == -1) + die("fork: pid=%d", pid); + max_forks++; + return pid; +} + +void signal_safe(int signum, sighandler_t handler) +{ + if (signal(signum, handler) == SIG_ERR) + die("signal: signum=%d", signum); +} + +int open_safe(char *pathname, int flags) +{ + int fd; + + if ((fd = open(pathname, flags, 0777)) == -1) + die("open: pathname=%s", pathname); + return fd; +} + +void close_safe(int fd) +{ + if (close(fd) == -1) + die("close: fd=%d", fd); +} + +void write_safe(int fd, char *buf, int count) +{ + if (write(fd, buf, count) != count) { + die("write: fd=%d buf=\"%s\" count=%d errno=%d", + fd, buf, count, errno); + } +} + +int read_safe(int fd, char *buf, int count) +{ + int n; + + if ((n = read(fd, buf, count)) < 0) + die("read: fd=%d count=%d", fd, count); + buf[n] = '\0'; + return n; +} + +int dup_safe(int oldfd) +{ + int newfd; + + if ((newfd = dup(oldfd)) == -1) + die("dup: oldfd=%d", oldfd); + return newfd; +} + +int dup2_safe(int oldfd, int newfd) +{ + if (dup2(oldfd, newfd) != newfd) + die("dup2: oldfd=%d newfd=%d", oldfd, newfd); + return newfd; +} + +void move_fd(int oldfd, int newfd) +{ + if (oldfd != newfd) { + dup2_safe(oldfd, newfd); + close_safe(oldfd); + } +} + +void mkdir_safe(char *dirname, int mode) +{ + if (mkdir(dirname, mode) == -1 && errno != EEXIST) + die("mkdir dirname=%s mode=0x%x\n", dirname, mode); +} + +void unlink_safe(char *pathname) +{ + if (unlink(pathname) == -1 && errno != ENOENT) { + die("unlink: pathname=%s\n", pathname); + } +} + +void execv_safe(char *path, char *argv[], int ls) +{ + int i; + struct timespec req = { 0, 1000000 }; + + printf("\n%s ", who(0)); + for (i = 0; argv[i] != NULL; i++) + printf("%s ", argv[i]); + printf("\n"); + + /* give parent a chance to wait for us */ + while 
(nanosleep(&req, NULL)) + ; + + if (vflag && ls) + ls_proc_fd(-1); + + execv(path, argv); + die("execv: path=%s", path); +} + +pid_t waitpid_safe(pid_t pid, int *status, int options, int id) +{ + pid_t p; + + p = waitpid(pid, status, options); + if (p == -1) + fprintf(stderr, "waitpid pid=%d id=%d %m\n", pid, id); + return p; +} + +void prctl_safe(int option, ulong arg2, ulong arg3, ulong arg4, ulong arg5) +{ + if (prctl(option, arg2, arg3, arg4, arg5) == -1) + die("prctl: option=0x%x", option); +} diff --git a/CRIU_code/test/others/tcp/Makefile b/CRIU_code/test/others/tcp/Makefile new file mode 100644 index 0000000..311d68b --- /dev/null +++ b/CRIU_code/test/others/tcp/Makefile @@ -0,0 +1,11 @@ +OBJS=cln srv + +all: $(OBJS) +.PHONY: all + +run: all + ./run.sh + +clean: + rm -f $(OBJS) +.PHONY: clean diff --git a/CRIU_code/test/others/tcp/cln.c b/CRIU_code/test/others/tcp/cln.c new file mode 100644 index 0000000..6275d37 --- /dev/null +++ b/CRIU_code/test/others/tcp/cln.c @@ -0,0 +1,122 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define BUF_SIZE (1024) + +static char rbuf[BUF_SIZE]; +static char buf[BUF_SIZE]; + +static int check_buf(int sk, char *buf, int count) +{ + int rd, i; + + printf("Checking for %d bytes\n", count); + + rd = 0; + while (rd < count) { + int r; + + r = read(sk, rbuf + rd, count - rd); + if (r == 0) { + printf("Unexpected EOF\n"); + return 1; + } + + if (r < 0) { + perror("Can't read buf"); + return 1; + } + + rd += r; + } + + for (i = 0; i < count; i++) + if (buf[i] != rbuf[i]) { + printf("Mismatch on %d byte %d != %d\n", + i, (int)buf[i], (int)rbuf[i]); + return 1; + } + + return 0; +} + +static int serve_new_conn(int in_fd, int sk) +{ + printf("New connection\n"); + + while (1) { + int rd, wr; + + rd = read(in_fd, buf, sizeof(buf)); + if (rd == 0) + break; + if (rd < 0) { + perror("Can't read from infd"); + return 1; + } + + printf("Read %d bytes, sending to sock\n", rd); + + wr = 0; + 
while (wr < rd) { + int w; + + w = write(sk, buf + wr, rd - wr); + if (w <= 0) { + perror("Can't write to socket"); + return 1; + } + + if (check_buf(sk, buf + wr, w)) + return 1; + + wr += w; + } + } + + printf("Done\n"); + return 0; +} + +int main(int argc, char **argv) +{ + int sk, port, ret; + struct sockaddr_in addr; + + if (argc < 3) { + printf("Need addr, port and iters\n"); + return -1; + } + + sk = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sk < 0) { + perror("Can't create socket"); + return -1; + } + + port = atoi(argv[2]); + printf("Connecting to %s:%d\n", argv[1], port); + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + ret = inet_aton(argv[1], &addr.sin_addr); + if (ret < 0) { + perror("Can't convert addr"); + return -1; + } + addr.sin_port = htons(port); + + ret = connect(sk, (struct sockaddr *)&addr, sizeof(addr)); + if (ret < 0) { + perror("Can't connect"); + return -1; + } + + return serve_new_conn(0, sk); +} diff --git a/CRIU_code/test/others/tcp/run.sh b/CRIU_code/test/others/tcp/run.sh new file mode 100644 index 0000000..702879d --- /dev/null +++ b/CRIU_code/test/others/tcp/run.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +source ../env.sh || exit 1 + +set -x + +PORT=12345 +CLN_PIPE="./clnt_pipe" +SRV_LOG="./srv.log" +CLN_LOG="./cln.log" +DDIR="dump" + +TEXT=$(hexdump -C /dev/urandom | head -n 1) + +echo "Building services" + +make clean && make || { echo "Failed to build"; exit 1; } +rm -rf ${DDIR} ${SRV_LOG} ${CLN_LOG} ${CLN_PIPE} +mkdir ${DDIR} + +echo "Starting server" + +setsid ./srv ${PORT} > ${SRV_LOG} 2>&1 & +SRV_PID=${!} + +echo "Starting pipe" +mkfifo ${CLN_PIPE} + +echo "Starting client" +./cln "127.0.0.1" ${PORT} < ${CLN_PIPE} > ${CLN_LOG} & +CLN_PID=${!} + +exec 3>${CLN_PIPE} +echo "Make it run" +echo "${TEXT}" >&3 + +function fail { + echo FAIL + +( exec >&2 + + echo "$@" + kill -9 ${CLN_PID} + kill -9 ${SRV_PID} + echo ${CLN_LOG}: + cat ${CLN_LOG} +) + exit 1 +} + +kill -s 0 ${CLN_PID} || fail "Client is dead" + +echo 
"Suspend server" +${CRIU} dump -D ${DDIR} -o dump.log -t ${SRV_PID} --tcp-established -vvvv || fail "Fail to dump server" +sleep 1 +echo "Resume server" +${CRIU} restore -D ${DDIR} -o restore.log -d --tcp-established -vvvv --close 3 || fail "Fail to restore server" + +echo "Make client run again" +echo "${TEXT}" >&3 + +echo "Collect results" +exec 3>&- +wait ${CLN_PID} || fail "Client exits abruptly" +kill -9 ${SRV_PID} + +echo PASS diff --git a/CRIU_code/test/others/tcp/srv.c b/CRIU_code/test/others/tcp/srv.c new file mode 100644 index 0000000..656cc01 --- /dev/null +++ b/CRIU_code/test/others/tcp/srv.c @@ -0,0 +1,112 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int serve_new_conn(int sk) +{ + int rd, wr; + char buf[1024]; + + printf("New connection\n"); + + while (1) { + rd = read(sk, buf, sizeof(buf)); + if (!rd) + break; + + if (rd < 0) { + perror("Can't read socket"); + return 1; + } + + wr = 0; + while (wr < rd) { + int w; + + w = write(sk, buf + wr, rd - wr); + if (w <= 0) { + perror("Can't write socket"); + return 1; + } + + wr += w; + } + } + + printf("Done\n"); + return 0; +} + +int main(int argc, char **argv) +{ + int sk, port, ret; + struct sockaddr_in addr; + + if (argc < 2) { + printf("Need port\n"); + return -1; + } + + /* + * Let kids die themselves + */ + + signal(SIGCHLD, SIG_IGN); + + sk = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sk < 0) { + perror("Can't create socket"); + return -1; + } + + port = atoi(argv[1]); + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = htonl(INADDR_ANY); + addr.sin_port = htons(port); + + printf("Binding to port %d\n", port); + + ret = bind(sk, (struct sockaddr *)&addr, sizeof(addr)); + if (ret < 0) { + perror("Can't bind socket"); + return -1; + } + + ret = listen(sk, 16); + if (ret < 0) { + perror("Can't put sock to listen"); + return -1; + } + + printf("Waiting for connections\n"); + while (1) { + int ask, pid; + 
+ ask = accept(sk, NULL, NULL); + if (ask < 0) { + perror("Can't accept new conn"); + return -1; + } + + pid = fork(); + if (pid < 0) { + perror("Can't fork"); + return -1; + } + + if (pid > 0) + close(ask); + else { + close(sk); + ret = serve_new_conn(ask); + exit(ret); + } + } +} diff --git a/CRIU_code/test/others/unix-callback/Makefile b/CRIU_code/test/others/unix-callback/Makefile new file mode 100644 index 0000000..25bcf22 --- /dev/null +++ b/CRIU_code/test/others/unix-callback/Makefile @@ -0,0 +1,22 @@ +all: unix-lib.so unix-server unix-client syslog-lib.so + +run: all + ./run.sh + +unix.pb-c.c: unix.proto + protoc-c --proto_path=. --c_out=. unix.proto + +unix-lib.so: unix-lib.c unix.pb-c.c + gcc -g -Werror -Wall -shared -nostartfiles unix-lib.c unix.pb-c.c -o unix-lib.so -iquote ../../../criu/include -fPIC + +syslog-lib.so: syslog-lib.c + gcc -g -Werror -Wall -shared -nostartfiles syslog-lib.c -o syslog-lib.so -iquote ../../../criu/include -fPIC + +unix-server: unix-server.c + gcc -Werror -Wall -o unix-server unix-server.c + +unix-client: unix-client.c + gcc -Werror -Wall -o unix-client unix-client.c + +clean: + rm -rf data unix-lib.so unix-server unix-client syslog-lib.so output pid unix.pb-c.* diff --git a/CRIU_code/test/others/unix-callback/run.sh b/CRIU_code/test/others/unix-callback/run.sh new file mode 100644 index 0000000..ec5b7f5 --- /dev/null +++ b/CRIU_code/test/others/unix-callback/run.sh @@ -0,0 +1,48 @@ +#!/bin/bash -x + +cd `dirname $0` + +source ../env.sh || exit 1 + +rm -rf /tmp/criu.unix.callback.test* +test -f pid && unlink pid +test -f output && unlink output +rm -rf data +mkdir -p data + +./unix-server & +srv_pid=$! 
+ +for i in `seq 20`; do + test -f /tmp/criu.unix.callback.test && break + sleep 0.1 +done + +( setsid ./unix-client < /dev/null &> output ) & + +while :; do + test -f pid && break + sleep 1 +done + +pid=`cat pid` + +${CRIU} dump -D data -o dump.log -v4 --lib `pwd`/lib -t $pid || exit 1 +kill $srv_pid +wait $srv_pid +unlink /tmp/criu.unix.callback.test +./unix-server & +srv_pid=$! +for i in `seq 20`; do + test -f /tmp/criu.unix.callback.test && break + sleep 0.1 +done +${CRIU} restore -D data -o restore.log -v4 --lib `pwd`/lib -d || exit 1 +kill $pid +while :; do + cat output | grep PASS && break + sleep 1 +done + +cat output +kill $srv_pid diff --git a/CRIU_code/test/others/unix-callback/syslog-lib.c b/CRIU_code/test/others/unix-callback/syslog-lib.c new file mode 100644 index 0000000..c7950ca --- /dev/null +++ b/CRIU_code/test/others/unix-callback/syslog-lib.c @@ -0,0 +1,66 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "criu-plugin.h" +#include "criu-log.h" + +extern cr_plugin_dump_unix_sk_t cr_plugin_dump_unix_sk; +extern cr_plugin_restore_unix_sk_t cr_plugin_restore_unix_sk; + +int cr_plugin_dump_unix_sk(int sk, int id) +{ + struct sockaddr_un addr; + socklen_t addr_len = sizeof(addr); + char buf[4096]; + int fd; + + if (getsockname(sk, (struct sockaddr *) &addr, &addr_len) < 0) + return -1; + + if (strncmp(addr.sun_path, "/dev/log", addr_len - sizeof(addr.sun_family))) + return -ENOTSUP; + + snprintf(buf, sizeof(buf), "syslog-%x.img", id); + fd = open(buf, O_WRONLY | O_CREAT); + if (fd < 0) + return -1; + close(fd); + + return 0; +} + +int cr_plugin_restore_unix_sk(int id) +{ + struct sockaddr_un addr; + socklen_t addr_len; + char buf[4096]; + int sk, fd; + + snprintf(buf, sizeof(buf), "syslog-%x.img", id); + fd = open(buf, O_RDONLY); + if (fd < 0) + return -ENOTSUP; + close(fd); + + sk = socket(AF_FILE, SOCK_DGRAM|SOCK_CLOEXEC, 0); + if (sk == -1) + return sk; + + addr.sun_family = AF_FILE; + addr_len = 
strlen("/dev/log"); + strncpy(addr.sun_path, "/dev/log", addr_len); + addr_len += sizeof(addr.sun_family); + if (connect(sk, (struct sockaddr *) &addr, addr_len) == -1) { + close(sk); + return -1; + } + + return sk; +} diff --git a/CRIU_code/test/others/unix-callback/unix-client.c b/CRIU_code/test/others/unix-callback/unix-client.c new file mode 100644 index 0000000..69808b5 --- /dev/null +++ b/CRIU_code/test/others/unix-callback/unix-client.c @@ -0,0 +1,121 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#define SK_NAME "/tmp/criu.unix.callback.test" + +#define SK_NR 2 +struct { + int id; + int sk; + int val; +} sks[SK_NR]; + +static int create_sock(int i) +{ + int ret, id, sk, val = time(NULL) + i * 314; + char buf[4096]; + struct sockaddr_un addr; + socklen_t addr_len; + + id = getpid() * 10 + i; + sk = socket(AF_UNIX, SOCK_DGRAM, 0); + if (sk < 0) + return -1; + + addr.sun_family = AF_UNIX; + addr_len = snprintf(addr.sun_path, UNIX_PATH_MAX, "%s%d", SK_NAME, id); + addr_len += sizeof(addr.sun_family); + + if (bind(sk, (struct sockaddr *) &addr, addr_len) < 0) { + perror("bind"); + return 1; + } + + addr.sun_family = AF_UNIX; + addr_len = snprintf(addr.sun_path, UNIX_PATH_MAX, SK_NAME); + addr_len += sizeof(addr.sun_family); + + if (connect(sk, (struct sockaddr *) &addr, addr_len) < 0) { + perror("connect"); + return 1; + } + + printf("init %d\n", val); + ret = sprintf(buf, "t%d", val); + if (send(sk, buf, ret, 0) < 0) { + perror("send"); + return -1; + } + + sks[i].sk = sk; + sks[i].val = val; + + return 0; +} + +static int check_sock(int i) +{ + int sk = sks[i].sk, val = sks[i].val; + char buf[4096]; + + if (send(sk, "r", 1, 0) < 0) { + perror("send(\"r\")"); + return -1; + } + + if (recv(sk, buf, sizeof(buf), 0) <= 0) { + perror("recv"); + return -1; + } + + printf("%s - %d\n", buf, val); + if (atoi(buf) != val) + return -1; + + return 0; +} + +int main() +{ + int i, fd; + sigset_t set; + 
int sig; + + for (i = 0; i < SK_NR; i++) + if (create_sock(i)) + return -1; + + fd = open("pid", O_WRONLY | O_CREAT, 0666); + if (fd < 0) + return 1; + dprintf(fd, "%d\n", getpid()); + close(fd); + + openlog("test", LOG_NDELAY, LOG_USER ); + + sigemptyset(&set); + sigaddset(&set, SIGTERM); + sigprocmask(SIG_BLOCK, &set, NULL); + sigwait(&set, &sig); + + syslog(LOG_CRIT, "test message"); + + for (i = 0; i < SK_NR; i++) + if (check_sock(i)) + return -1; + + printf("PASS\n"); + return 0; +} + diff --git a/CRIU_code/test/others/unix-callback/unix-lib.c b/CRIU_code/test/others/unix-callback/unix-lib.c new file mode 100644 index 0000000..44b1498 --- /dev/null +++ b/CRIU_code/test/others/unix-callback/unix-lib.c @@ -0,0 +1,187 @@ +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include "criu-plugin.h" +#include "criu-log.h" + +#include "unix.pb-c.h" + +extern cr_plugin_init_t cr_plugin_init; +extern cr_plugin_dump_unix_sk_t cr_plugin_dump_unix_sk; +extern cr_plugin_restore_unix_sk_t cr_plugin_restore_unix_sk; + +#define SK_NAME "/tmp/criu.unix.callback.test" +static int get_srv_socket(void) +{ + struct sockaddr_un addr; + socklen_t addr_len; + int skd; + + skd = socket(AF_UNIX, SOCK_DGRAM, 0); + if (skd < 0) { + pr_perror("socket"); + return -1; + } + + addr.sun_family = AF_UNIX; + addr_len = snprintf(addr.sun_path, UNIX_PATH_MAX, "%s.dump.%d", SK_NAME, getpid()); + addr_len += sizeof(addr.sun_family); + + unlink(addr.sun_path); + if (bind(skd, (struct sockaddr *) &addr, addr_len) < 0) { + pr_perror("bind"); + return 1; + } + + addr.sun_family = AF_UNIX; + addr_len = snprintf(addr.sun_path, UNIX_PATH_MAX, SK_NAME); + addr_len += sizeof(addr.sun_family); + + if (connect(skd, (struct sockaddr *) &addr, addr_len) < 0) { + pr_perror("connect"); + return -1; + } + + return skd; +} + +int cr_plugin_init(void) +{ + return 0; +} + +int cr_plugin_dump_unix_sk(int sk, int sk_id) +{ + struct sockaddr_un addr; + 
socklen_t addr_len = sizeof(addr); + char buf[4096]; + int skd, id, ret, fd, len; + UnixTest e = UNIX_TEST__INIT; + + if (getpeername(sk, (struct sockaddr *) &addr, &addr_len)) { + pr_perror("getpeername"); + return -1; + } + + len = addr_len - sizeof(addr.sun_family); + if (addr.sun_path[len - 1] == 0) + len--; + + if (len != strlen(SK_NAME) || + strncmp(addr.sun_path, SK_NAME, strlen(SK_NAME))) + return -ENOTSUP; + + pr_info("Dump the socket %x\n", sk_id); + skd = get_srv_socket(); + if (skd < 0) + return -1; + + addr_len = sizeof(struct sockaddr_un); + + if (getsockname(sk, (struct sockaddr *) &addr, &addr_len) < 0) + return -1; + + id = atoi(addr.sun_path + strlen(SK_NAME)); + + ret = sprintf(buf, "d%d", id) + 1; + if (send(skd, buf, ret, 0) < 0) { + pr_perror("send"); + return -1; + } + + if (recv(skd, buf, sizeof(buf), 0) <= 0) + return -1; + + close(skd); + + e.val = atoi(buf); + pr_err("%x: val %d\n", sk_id, e.val); + e.name.data = (void *)addr.sun_path; + e.name.len = addr_len - sizeof(addr.sun_family); + + snprintf(buf, sizeof(buf), "unix-test-%x.img", sk_id); + fd = openat(criu_get_image_dir(), buf, O_WRONLY | O_CREAT, 0600); + if (fd < 0) + return -1; + + if (unix_test__get_packed_size(&e) > sizeof(buf)) { + pr_err("%ld\n", unix_test__get_packed_size(&e)); + return -1; + } + + ret = unix_test__pack(&e, (uint8_t *) buf); + if (write(fd, buf, ret) != ret) + return -1; + close(fd); + + return 0; +} + +int cr_plugin_restore_unix_sk(int sk_id) +{ + struct sockaddr_un addr; + socklen_t addr_len; + int fd, sk, ret; + char buf[4096]; + UnixTest *e; + + snprintf(buf, sizeof(buf), "unix-test-%x.img", sk_id); + fd = openat(criu_get_image_dir(), buf, O_RDONLY, 0600); + if (fd < 0) + return -ENOTSUP; + + ret = read(fd, buf, sizeof(buf)); + if (ret < 0) { + pr_perror("read"); + return -1; + } + close(fd); + + e = unix_test__unpack(NULL, ret, (uint8_t *) buf); + if (e == NULL) + return -1; + + sk = socket(AF_UNIX, SOCK_DGRAM, 0); + if (sk < 0) { + pr_perror("socket"); 
+ return -1; + } + + addr.sun_family = AF_UNIX; + memcpy(addr.sun_path, e->name.data, e->name.len); + addr_len = sizeof(addr.sun_family) + e->name.len; + + if (bind(sk, (struct sockaddr *) &addr, addr_len) < 0) { + pr_perror("bind"); + return -1; + } + + addr.sun_family = AF_UNIX; + addr_len = snprintf(addr.sun_path, UNIX_PATH_MAX, SK_NAME); + addr_len += sizeof(addr.sun_family); + + if (connect(sk, (struct sockaddr *) &addr, addr_len) < 0) { + pr_perror("connect"); + return -1; + } + + pr_err("id %d val %d\n", sk_id, e->val); + ret = sprintf(buf, "t%d", e->val); + if (send(sk, buf, ret, 0) < 0) { + pr_perror("send"); + return -1; + } + + return sk; +} diff --git a/CRIU_code/test/others/unix-callback/unix-server.c b/CRIU_code/test/others/unix-callback/unix-server.c new file mode 100644 index 0000000..8f32f53 --- /dev/null +++ b/CRIU_code/test/others/unix-callback/unix-server.c @@ -0,0 +1,104 @@ +#include +#include +#include +#include + +#include +#include +#include +#include + +struct ticket +{ + struct ticket *next; + int val; + int id; +}; + +struct ticket *tickets; + +#define SK_NAME "/tmp/criu.unix.callback.test" + +int main() +{ + int sk, ret, id; + char buf[4096]; + struct ticket *t; + struct sockaddr_un addr; + socklen_t addr_len; + struct stat st; + + unlink(SK_NAME); + + sk = socket(AF_UNIX, SOCK_DGRAM, 0); + if (sk < 0) { + perror("socket"); + return -1; + } + + addr.sun_family = AF_UNIX; + addr_len = snprintf(addr.sun_path, UNIX_PATH_MAX, SK_NAME); + addr_len += sizeof(addr.sun_family); + + if (bind(sk, (struct sockaddr *) &addr, addr_len) < 0) { + perror("bind"); + return 1; + } + + fstat(sk, &st); + + while (1) { + addr_len = sizeof(struct sockaddr_un); + ret = recvfrom(sk, buf, sizeof(buf), 0, (struct sockaddr *) &addr, &addr_len); + if (ret == 0) + return 0; + if (ret < 0) { + perror("recvfrom"); + return 1; + } + id = 0; + switch (buf[0]) { + case 'l': + ret = sprintf(buf, "%ld", st.st_ino); + if (sendto(sk, buf, ret + 1, 0, (struct sockaddr *) 
&addr, addr_len) < 0) { + perror("sendto"); + return -1; + } + break; + case 't': /* ticket */ + t = malloc(sizeof(struct ticket)); + if (t == 0) { + perror("Can't allocate memory"); + return 1; + } + + t->val = atoi(buf + 1); + t->next = tickets; + t->id = atoi(addr.sun_path +strlen(SK_NAME)); + printf("t: id %d val %d\n", t->id, t->val); + tickets = t; + break; + case 'd': /* dump */ + id = atoi(buf + 1); + case 'r': /* request */ + if (!id) + id = atoi(addr.sun_path + strlen(SK_NAME)); + for (t = tickets; t; t = t->next) + if (t->id == id) + break; + if (t == NULL) + return 1; + printf("r: id %d val %d\n", id, t->val); + ret = sprintf(buf, "%d", t->val); + if (sendto(sk, buf, ret + 1, 0, (struct sockaddr *) &addr, addr_len) < 0) { + perror("sendto"); + return 1; + } + break; + default: + return -1; + } + } + + return 0; +} diff --git a/CRIU_code/test/others/unix-callback/unix.proto b/CRIU_code/test/others/unix-callback/unix.proto new file mode 100644 index 0000000..7112786 --- /dev/null +++ b/CRIU_code/test/others/unix-callback/unix.proto @@ -0,0 +1,6 @@ +syntax = "proto2"; + +message unix_test { + required uint32 val = 1; + required bytes name = 2; +} diff --git a/CRIU_code/test/pki/cacert.pem b/CRIU_code/test/pki/cacert.pem new file mode 100644 index 0000000..2f87066 --- /dev/null +++ b/CRIU_code/test/pki/cacert.pem @@ -0,0 +1,23 @@ +-----BEGIN CERTIFICATE----- +MIID0TCCAjmgAwIBAgIUWzgmx9p7y7mkrNptGX9+0acjpa4wDQYJKoZIhvcNAQEL +BQAwADAeFw0xOTA1MDYxMjAzMDJaFw0yMDA1MDUxMjAzMDJaMAAwggGiMA0GCSqG +SIb3DQEBAQUAA4IBjwAwggGKAoIBgQD0p0lJUlq917GmJuCBeP2eLNd1/MUg1ojy +s7rrpinPYtLZqqquUhp32lfQtt3uJLjkhTrseZd86zWi3SMZlGs8zGGmKfqg0vaG +BXIgpEIr5C0wU9995kL9A6LS+eFZR6vJQETO5T22tjponoqEPOXeU8VaiC9jNipC +uFJT0wyC0bKIo+TUn573kxsGMt8jMOv0tc/okUlH16UAsYrmN7kWzgkWTJPddB7S +v5a9ibpPkbh+wrIGK5A6V5hTZ8U1wz2bE6/Xp+qjsD2R3jeU6f1tDvc8FZilabQy +Rmbxggucl1G3Ulo6Nvor1lhog72eZlHZujzf/5e/aMiZ7Br6plZ1/WTwtNgoCw6A +rgpLDraasQohiK6opYs2rr7uuiQxPLLVWE/RryXwUEoPXzxaf782XtXxkB0UhGvz 
+y2JBxCVPn7uUGuyEYywjTjI2UFvsMcXnMiQ4WaAfMbAmrBWM7EQ4b7VpD2c+OZkQ +J/AJeg85/ovTAtHPjhPP+0a9hnirktkCAwEAAaNDMEEwDwYDVR0TAQH/BAUwAwEB +/zAPBgNVHQ8BAf8EBQMDBwQAMB0GA1UdDgQWBBQOg6AA8Qu/m/O/II5spzYsTnsn +pjANBgkqhkiG9w0BAQsFAAOCAYEA1KKtw+ILKOg1AhGwgPsJXAWZoSIt7hdLaJ3P +WGyTWHLKKoJiGlLj3bSsJcMmMO+UwHBH9lmXrOWe/xcOvob2H+7dXbddQ0pX1wzK +KJKzSG35FZ2BfuSn5oEDtRsHnN2Ikc5MYz1a+F4w2tVL/Hcxld+oSAodDlCbGoe+ +0MkI5f1KhdAw00l/5IO7xPOcThjHw+nB5/cZTQ+l4zLWCWaXkor4IAEq/plPcdX1 +uoLSj3JruLz7/ts/EgG+ARAzXQrJ+LM2hdPB1NiaVxFq7MSWM6FybUdmMYgbP5s4 +RMNqI/M+bU9K5LRySDaiPhDXUoVULuqG1a23GQwXLOjF0JbrUQewfAaTO7TaPFh1 +lr25j9Fc9/gcXZjvLl+CEIv6P/haGOwySCTCks0F5bDehbLjZStPmugcnJflXdBn +lzoejlw2rePojQMlffQsaRGmmhj0beU4WQBfGACcZQB8GFNxQB8aynf0CK7Dvvb0 +9c9y4k0gHL7RxeLoQfq+smzKm+Eo +-----END CERTIFICATE----- diff --git a/CRIU_code/test/pki/cert.pem b/CRIU_code/test/pki/cert.pem new file mode 100644 index 0000000..a0946ee --- /dev/null +++ b/CRIU_code/test/pki/cert.pem @@ -0,0 +1,24 @@ +-----BEGIN CERTIFICATE----- +MIIEAzCCAmugAwIBAgIUKV6zLC//OJDnmOYBuIG1Gvmv+V4wDQYJKoZIhvcNAQEL +BQAwADAeFw0xOTA1MDYxMjAzMDJaFw0yMDA1MDUxMjAzMDJaMBQxEjAQBgNVBAMT +CWxvY2FsaG9zdDCCAaIwDQYJKoZIhvcNAQEBBQADggGPADCCAYoCggGBANX1nv4J +U8+TEb2bWej5O2nOowpw2zSYTDAQ1oyAvV3P99Y6GZCuVZ1uT/7DWat0uRpcdmNi +HvownkO4VmDZdVqgiK1eHzY5YBJ7hBVDs3tpWNuN7eJPjnskNmJqKQ6l9rxYl/au +781T+tdtHp1ATtToMgVJxWaUx5lrpEJdmYc8Y6GpAA42D+rI3o4Sll3mI5rPCk16 +QY5dT2lnL2HuCKzM2bjWat6b3lMpfNz3A/blU9E/462Zxr/yKK/0yy3SBZhYzrrQ +1/erjIpm4I0sakHIOexM1AQliFiowFzVvr/paiXApWGOcuBJVIbmPI/bEGuTh0nr +3pmiF0YrkDCRhargElYcz64KQ9IxPFCKcKjkMnFPjTStZ7rcMyqKvGczqFaM5a6c +9gIn2ieUrVZ38yvtI5Lo/uxZ5IjXqB1Fdg4xi2tyf9WMHKy2tydBr9bTjfQRXfNT +/Zm3woDXOYsHzj+Sf6ntLVCkO1fnczw03fPRV03/uVRa5mPGyyj9xdPBqwIDAQAB +o2EwXzAMBgNVHRMBAf8EAjAAMA8GA1UdDwEB/wQFAwMHoAAwHQYDVR0OBBYEFEtF +ELehnIjLzoh/W51TGm2B00QAMB8GA1UdIwQYMBaAFA6DoADxC7+b878gjmynNixO +eyemMA0GCSqGSIb3DQEBCwUAA4IBgQA17NZCaiCgD9P4GPWgdVNkWUrmc8itJuIB +z3c9RdJGduxb9W/D7IW//D3hIblOPorNi3+08kO/IRMGah874MDCprMNppk94WGj 
+Kgqi/rsxq+rT6bcZXxMrcOIg0j2EvTPIgPh7trd8nHVWxNT/hvFClDtBJ2ssL2Tz +76EA7smDCUsfdzFJ2Xvk95fSTL49nfT2j9N/YoLaBQtCIxWAVZHKiCF2K+yXufHz +B/9UlXwsPJfqxM75dYWXFEqvhNf08YRHT1e1GRrybNGrNKF864KbLsnASdK4N5wu +sK9vZJ7VkLDQz+YpZkbm+UgOYK/BY3M8IX+F+WngV+43fr6Wh89TSgD7acEBvQTm +q1y9FipRvz0my7fwBh6UlYDja6/3yw6/YfN7uMFGsOOSgpNDCrMLqesf8l1HdQUF +VaVJyDjgFswV9KykAeJK2KU8QI7TGHv9soW60sr97DgUtCh4a6OPXLt79Ji3RSNw +MbU54JnpnfmMAj/0suDymdrJWv8EJKc= +-----END CERTIFICATE----- diff --git a/CRIU_code/test/pki/key.pem b/CRIU_code/test/pki/key.pem new file mode 100644 index 0000000..eda1aa7 --- /dev/null +++ b/CRIU_code/test/pki/key.pem @@ -0,0 +1,182 @@ +Public Key Info: + Public Key Algorithm: RSA + Key Security Level: High (3072 bits) + +modulus: + 00:d5:f5:9e:fe:09:53:cf:93:11:bd:9b:59:e8:f9:3b + 69:ce:a3:0a:70:db:34:98:4c:30:10:d6:8c:80:bd:5d + cf:f7:d6:3a:19:90:ae:55:9d:6e:4f:fe:c3:59:ab:74 + b9:1a:5c:76:63:62:1e:fa:30:9e:43:b8:56:60:d9:75 + 5a:a0:88:ad:5e:1f:36:39:60:12:7b:84:15:43:b3:7b + 69:58:db:8d:ed:e2:4f:8e:7b:24:36:62:6a:29:0e:a5 + f6:bc:58:97:f6:ae:ef:cd:53:fa:d7:6d:1e:9d:40:4e + d4:e8:32:05:49:c5:66:94:c7:99:6b:a4:42:5d:99:87 + 3c:63:a1:a9:00:0e:36:0f:ea:c8:de:8e:12:96:5d:e6 + 23:9a:cf:0a:4d:7a:41:8e:5d:4f:69:67:2f:61:ee:08 + ac:cc:d9:b8:d6:6a:de:9b:de:53:29:7c:dc:f7:03:f6 + e5:53:d1:3f:e3:ad:99:c6:bf:f2:28:af:f4:cb:2d:d2 + 05:98:58:ce:ba:d0:d7:f7:ab:8c:8a:66:e0:8d:2c:6a + 41:c8:39:ec:4c:d4:04:25:88:58:a8:c0:5c:d5:be:bf + e9:6a:25:c0:a5:61:8e:72:e0:49:54:86:e6:3c:8f:db + 10:6b:93:87:49:eb:de:99:a2:17:46:2b:90:30:91:85 + aa:e0:12:56:1c:cf:ae:0a:43:d2:31:3c:50:8a:70:a8 + e4:32:71:4f:8d:34:ad:67:ba:dc:33:2a:8a:bc:67:33 + a8:56:8c:e5:ae:9c:f6:02:27:da:27:94:ad:56:77:f3 + 2b:ed:23:92:e8:fe:ec:59:e4:88:d7:a8:1d:45:76:0e + 31:8b:6b:72:7f:d5:8c:1c:ac:b6:b7:27:41:af:d6:d3 + 8d:f4:11:5d:f3:53:fd:99:b7:c2:80:d7:39:8b:07:ce + 3f:92:7f:a9:ed:2d:50:a4:3b:57:e7:73:3c:34:dd:f3 + d1:57:4d:ff:b9:54:5a:e6:63:c6:cb:28:fd:c5:d3:c1 + ab: + +public exponent: + 01:00:01: + +private exponent: + 
1e:38:b0:79:7f:85:c8:17:24:f5:5c:41:29:e8:32:5d + 32:a3:d2:f0:b7:f5:c8:e1:52:14:be:c9:5f:d1:df:b3 + 65:75:6c:05:7a:6b:35:8a:a4:2f:46:73:ff:71:79:6e + 3f:eb:f9:88:f6:2e:1b:f6:cc:14:12:b0:98:c3:7e:91 + 0b:85:e2:bf:1d:b7:82:09:30:f3:23:68:01:85:13:94 + 80:c9:9a:55:94:96:da:30:48:a0:29:ec:86:da:1b:d5 + 2b:2b:74:63:92:b8:2a:8f:87:29:f0:ae:d7:55:63:0d + 2d:b3:0b:0e:2d:84:dc:d5:08:b5:ac:a0:f7:29:9d:71 + 89:3d:27:6a:eb:96:f5:4e:9b:8a:dc:14:82:0a:c7:5c + 16:1c:d2:7e:b9:1b:13:69:d8:b2:b1:b1:7e:aa:a9:ad + 06:ce:66:0e:5b:50:10:42:2a:0a:fd:29:14:f7:09:63 + c1:20:18:5f:27:81:46:12:8c:b8:f4:89:a6:3d:55:a1 + d4:64:fc:f2:db:d7:9c:f5:be:f7:9d:88:5c:6d:36:a4 + 4b:ea:c5:e3:ea:32:81:6b:f3:47:b5:35:d5:c4:1a:b2 + ae:12:9d:19:a3:ec:a4:af:41:7e:5e:34:9d:f5:bc:b9 + 1f:a3:c2:32:b4:fc:95:a7:7a:54:04:e2:d6:4e:10:2f + 66:68:8b:3b:20:ea:05:db:2e:72:01:11:e7:7c:f8:72 + 0f:60:be:f1:27:19:ad:3a:6f:e9:70:56:3a:86:6e:46 + 0d:e3:55:31:66:77:09:84:48:b9:25:4b:c3:26:70:12 + ca:a4:5f:c6:3d:6a:e5:db:4d:63:04:b8:09:07:c9:30 + 85:08:9d:77:40:26:60:da:10:c2:53:d2:00:0d:9e:d9 + d5:71:06:30:eb:fb:f7:3f:82:1f:b3:9a:f3:4d:24:86 + 2e:94:fd:06:9e:dc:26:68:fa:64:c3:f9:fa:08:c4:b2 + ec:7a:f5:55:c5:10:b5:e2:2d:de:ba:04:30:10:5b:99 + + +prime1: + 00:fb:d1:47:9d:9e:73:f8:1e:09:21:fd:89:16:05:56 + af:a5:cf:52:d5:cd:f7:26:18:d1:84:3a:36:65:0b:a2 + cd:f9:b8:99:c0:c7:ef:00:c9:2f:c9:92:1a:1d:3d:86 + 58:3b:b1:be:d4:8c:c6:1b:df:ba:ee:87:aa:d1:22:47 + 18:bd:de:01:0f:0d:cb:ac:d0:48:a4:f4:93:e2:a6:cb + b5:b7:f5:f5:72:dd:ec:ac:13:e8:3d:62:23:54:ac:52 + ff:ee:9a:e1:7f:b0:ae:3b:41:38:d8:39:2b:40:ef:25 + 81:50:b0:98:db:f8:40:16:6e:1a:41:79:22:90:58:99 + 80:c2:0d:ba:b5:d3:54:ec:28:33:e4:b0:58:ea:de:61 + a1:b7:30:0b:9d:dc:73:62:c2:07:d3:75:91:48:49:dd + be:cf:b2:90:95:8f:29:6c:6f:f6:68:cb:cf:d5:24:a3 + d7:37:81:1b:34:3b:af:9a:48:52:af:53:7c:f7:32:a2 + 3f: + +prime2: + 00:d9:83:5e:be:0a:ea:0b:d9:66:63:56:3b:9e:44:aa + 46:6d:8d:6c:10:81:4b:de:19:5d:2c:16:7e:30:7c:ad + 23:9a:89:53:cc:18:e8:e8:51:2b:79:35:d0:67:7d:9e + 
8f:be:ea:63:5e:14:c0:6b:ba:02:6c:4a:da:07:70:9d + 14:fa:be:1e:40:47:50:6f:f2:5a:87:9e:b6:b1:b8:55 + 2c:b6:a2:e3:b0:24:ba:ea:9b:55:87:8b:4b:cf:40:4a + 25:b4:89:cf:9e:76:ca:79:4a:f4:74:b7:ee:cf:6c:8f + cb:e3:3d:9e:86:3b:44:b7:70:ec:05:0c:68:ce:d6:c3 + a2:ec:e6:11:d6:2f:f7:80:26:a9:5c:aa:b9:a6:33:84 + a9:00:43:cf:72:07:8a:91:59:a2:b1:de:79:07:6b:81 + 67:a5:c2:4b:fd:29:8a:1a:96:66:57:66:d4:37:9a:98 + 69:d1:19:24:53:b1:a4:54:68:1e:8c:2b:b4:93:19:ed + 95: + +coefficient: + 00:90:9a:7f:6f:14:a8:bc:79:3f:25:e5:62:f9:5d:29 + 78:a4:78:8e:7a:e4:8a:62:8a:7f:9c:ae:75:95:fe:ee + 1a:99:53:40:01:76:29:7d:48:85:28:a2:2a:9f:0f:10 + 8c:19:6a:36:6b:e1:ac:a2:07:b9:72:5c:b9:a6:20:bb + 8f:cb:f5:ea:dd:3f:0e:ab:9d:c1:57:7e:7b:96:f9:da + b0:52:3c:3f:62:94:e7:5c:04:9e:ac:60:cd:4d:ec:7e + 68:d3:fb:2a:b4:02:f0:0e:be:37:bc:2a:f8:6e:8d:31 + b5:38:67:00:9e:67:9f:71:d0:88:36:32:69:4b:20:73 + eb:a1:d9:bc:72:c2:7e:39:1a:36:cc:c1:45:a2:14:37 + e6:ca:db:4d:0b:5b:68:a4:ff:b7:7b:b1:db:2f:70:27 + a1:6c:31:3f:c0:c3:23:04:b0:7a:e2:0d:21:ba:5a:80 + 52:c1:a1:2b:57:72:20:b6:ed:b1:e8:3b:95:88:81:90 + 5d: + +exp1: + 00:ef:ce:66:20:01:44:b9:35:89:46:f8:56:33:45:54 + 3f:23:6d:23:9a:7e:71:6d:b3:56:db:50:40:7a:cb:b0 + f7:ec:67:52:ec:96:b9:d1:8a:c6:5a:74:2b:30:4b:66 + 03:e2:9d:2b:78:e8:b2:c4:da:b3:fe:f1:ed:c7:09:98 + a1:44:37:05:d5:1b:33:2a:58:93:c5:9b:30:b6:38:57 + 68:af:4e:a8:b7:02:06:9f:fc:b9:3e:b3:95:a7:ce:0f + a0:b0:ce:88:0e:7c:e7:ff:7f:e6:2d:6b:8b:f8:63:85 + d8:f7:49:a5:d8:5d:3a:52:e1:f9:58:fe:8d:de:de:b1 + 18:40:34:a8:e8:fc:df:33:a2:39:81:00:3b:3d:38:17 + cb:d4:53:09:cd:04:a2:51:9b:2b:ae:c1:98:60:3a:0f + d4:e5:a0:4c:36:51:46:86:80:bd:2d:21:62:c3:bd:07 + d6:2d:82:62:b0:c4:62:3f:4f:be:86:3e:c0:93:fc:81 + 2b: + +exp2: + 11:e4:73:93:b0:74:26:3b:60:e7:c4:fd:2c:7c:bb:81 + 05:9b:ff:8a:b0:08:1c:a1:fb:7f:17:ee:93:70:7e:11 + 92:b1:bf:39:e7:c6:a8:ed:9c:64:e1:1f:5e:93:ff:ca + 15:4b:54:97:35:9f:ca:7c:c7:9c:3e:e0:06:82:a5:f9 + 46:d3:02:cc:08:d1:be:13:b2:8c:bb:6a:8d:dd:fa:eb + ad:ae:62:8a:67:cb:14:67:68:b6:b8:a7:a8:c9:c2:0f 
+ ad:f5:34:25:f5:e1:9b:ee:a5:83:40:6a:1d:97:f1:90 + 35:06:29:97:23:22:f8:f0:0a:0a:34:46:1e:d5:9d:cc + 36:2e:8a:c3:12:b9:0a:4a:a3:dd:e2:91:58:f1:9d:f5 + 04:f7:8f:05:f3:46:db:c4:02:d5:1c:d6:d9:dc:67:0d + ae:9d:f8:00:40:3d:83:08:62:2c:c8:61:a6:9d:49:f2 + 52:67:fe:0c:00:6d:e3:1f:99:7b:b0:50:af:55:0f:ad + + + +Public Key PIN: + pin-sha256:EiqPFBPoLKkCzVlK8KoKYGQT/LSo7/0iLg/I7nKt1/0= +Public Key ID: + sha256:122a8f1413e82ca902cd594af0aa0a606413fcb4a8effd222e0fc8ee72add7fd + sha1:4b4510b7a19c88cbce887f5b9d531a6d81d34400 + +-----BEGIN RSA PRIVATE KEY----- +MIIG5AIBAAKCAYEA1fWe/glTz5MRvZtZ6Pk7ac6jCnDbNJhMMBDWjIC9Xc/31joZ +kK5VnW5P/sNZq3S5Glx2Y2Ie+jCeQ7hWYNl1WqCIrV4fNjlgEnuEFUOze2lY243t +4k+OeyQ2YmopDqX2vFiX9q7vzVP6120enUBO1OgyBUnFZpTHmWukQl2ZhzxjoakA +DjYP6sjejhKWXeYjms8KTXpBjl1PaWcvYe4IrMzZuNZq3pveUyl83PcD9uVT0T/j +rZnGv/Ior/TLLdIFmFjOutDX96uMimbgjSxqQcg57EzUBCWIWKjAXNW+v+lqJcCl +YY5y4ElUhuY8j9sQa5OHSevemaIXRiuQMJGFquASVhzPrgpD0jE8UIpwqOQycU+N +NK1nutwzKoq8ZzOoVozlrpz2AifaJ5StVnfzK+0jkuj+7FnkiNeoHUV2DjGLa3J/ +1YwcrLa3J0Gv1tON9BFd81P9mbfCgNc5iwfOP5J/qe0tUKQ7V+dzPDTd89FXTf+5 +VFrmY8bLKP3F08GrAgMBAAECggGAHjiweX+FyBck9VxBKegyXTKj0vC39cjhUhS+ +yV/R37NldWwFems1iqQvRnP/cXluP+v5iPYuG/bMFBKwmMN+kQuF4r8dt4IJMPMj +aAGFE5SAyZpVlJbaMEigKeyG2hvVKyt0Y5K4Ko+HKfCu11VjDS2zCw4thNzVCLWs +oPcpnXGJPSdq65b1TpuK3BSCCsdcFhzSfrkbE2nYsrGxfqqprQbOZg5bUBBCKgr9 +KRT3CWPBIBhfJ4FGEoy49ImmPVWh1GT88tvXnPW+952IXG02pEvqxePqMoFr80e1 +NdXEGrKuEp0Zo+ykr0F+XjSd9by5H6PCMrT8lad6VATi1k4QL2Zoizsg6gXbLnIB +Eed8+HIPYL7xJxmtOm/pcFY6hm5GDeNVMWZ3CYRIuSVLwyZwEsqkX8Y9auXbTWME +uAkHyTCFCJ13QCZg2hDCU9IADZ7Z1XEGMOv79z+CH7Oa800khi6U/Qae3CZo+mTD ++foIxLLsevVVxRC14i3eugQwEFuZAoHBAPvRR52ec/geCSH9iRYFVq+lz1LVzfcm +GNGEOjZlC6LN+biZwMfvAMkvyZIaHT2GWDuxvtSMxhvfuu6HqtEiRxi93gEPDcus +0Eik9JPipsu1t/X1ct3srBPoPWIjVKxS/+6a4X+wrjtBONg5K0DvJYFQsJjb+EAW +bhpBeSKQWJmAwg26tdNU7Cgz5LBY6t5hobcwC53cc2LCB9N1kUhJ3b7PspCVjyls +b/Zoy8/VJKPXN4EbNDuvmkhSr1N89zKiPwKBwQDZg16+CuoL2WZjVjueRKpGbY1s 
+EIFL3hldLBZ+MHytI5qJU8wY6OhRK3k10Gd9no++6mNeFMBrugJsStoHcJ0U+r4e +QEdQb/Jah562sbhVLLai47AkuuqbVYeLS89ASiW0ic+edsp5SvR0t+7PbI/L4z2e +hjtEt3DsBQxoztbDouzmEdYv94AmqVyquaYzhKkAQ89yB4qRWaKx3nkHa4FnpcJL +/SmKGpZmV2bUN5qYadEZJFOxpFRoHowrtJMZ7ZUCgcEA785mIAFEuTWJRvhWM0VU +PyNtI5p+cW2zVttQQHrLsPfsZ1LslrnRisZadCswS2YD4p0reOiyxNqz/vHtxwmY +oUQ3BdUbMypYk8WbMLY4V2ivTqi3Agaf/Lk+s5Wnzg+gsM6IDnzn/3/mLWuL+GOF +2PdJpdhdOlLh+Vj+jd7esRhANKjo/N8zojmBADs9OBfL1FMJzQSiUZsrrsGYYDoP +1OWgTDZRRoaAvS0hYsO9B9YtgmKwxGI/T76GPsCT/IErAoHAEeRzk7B0Jjtg58T9 +LHy7gQWb/4qwCByh+38X7pNwfhGSsb8558ao7Zxk4R9ek//KFUtUlzWfynzHnD7g +BoKl+UbTAswI0b4Tsoy7ao3d+uutrmKKZ8sUZ2i2uKeoycIPrfU0JfXhm+6lg0Bq +HZfxkDUGKZcjIvjwCgo0Rh7Vncw2LorDErkKSqPd4pFY8Z31BPePBfNG28QC1RzW +2dxnDa6d+ABAPYMIYizIYaadSfJSZ/4MAG3jH5l7sFCvVQ+tAoHBAJCaf28UqLx5 +PyXlYvldKXikeI565Ipiin+crnWV/u4amVNAAXYpfUiFKKIqnw8QjBlqNmvhrKIH +uXJcuaYgu4/L9erdPw6rncFXfnuW+dqwUjw/YpTnXASerGDNTex+aNP7KrQC8A6+ +N7wq+G6NMbU4ZwCeZ59x0Ig2MmlLIHProdm8csJ+ORo2zMFFohQ35srbTQtbaKT/ +t3ux2y9wJ6FsMT/AwyMEsHriDSG6WoBSwaErV3Igtu2x6DuViIGQXQ== +-----END RSA PRIVATE KEY----- diff --git a/CRIU_code/test/pycriu b/CRIU_code/test/pycriu new file mode 100644 index 0000000..d13a879 --- /dev/null +++ b/CRIU_code/test/pycriu @@ -0,0 +1 @@ +../lib/py/ \ No newline at end of file diff --git a/CRIU_code/test/show_action.sh b/CRIU_code/test/show_action.sh new file mode 100644 index 0000000..86468b6 --- /dev/null +++ b/CRIU_code/test/show_action.sh @@ -0,0 +1,3 @@ +#!/bin/bash +echo "${CRTOOLS_SCRIPT_ACTION} ${CRTOOLS_IMAGE_DIR} ${CRTOOLS_INIT_PID}" \ + >> "$(dirname $0)/actions_called.txt" diff --git a/CRIU_code/test/umount2.c b/CRIU_code/test/umount2.c new file mode 100644 index 0000000..a150f34 --- /dev/null +++ b/CRIU_code/test/umount2.c @@ -0,0 +1,16 @@ +#include +#include + +int main(int argc, char *argv[]) +{ + if (argc < 2) { + fprintf(stderr, "umount PATH\n"); + return 1; + } + if (umount2(argv[1], MNT_DETACH)) { + fprintf(stderr, "umount %s: %m\n", argv[1]); + return 1; + } + + 
return 0; +} diff --git a/CRIU_code/test/zdtm.desc b/CRIU_code/test/zdtm.desc new file mode 100644 index 0000000..0671d49 --- /dev/null +++ b/CRIU_code/test/zdtm.desc @@ -0,0 +1 @@ +{ 'dir': 'zdtm/', 'exclude': [ 'static/route_rules', 'static/criu-rtc.so', 'lib/parseargs.sh', 'lib/stop_and_chk.sh' ] } diff --git a/CRIU_code/test/zdtm.py b/CRIU_code/test/zdtm.py new file mode 100644 index 0000000..57eb68a --- /dev/null +++ b/CRIU_code/test/zdtm.py @@ -0,0 +1,2354 @@ +#!/usr/bin/env python +# vim: noet ts=8 sw=8 sts=8 +from __future__ import absolute_import, division, print_function, unicode_literals +from builtins import (str, open, range, zip, int, input) + +import argparse +import glob +import os +import subprocess +import time +import tempfile +import shutil +import re +import stat +import signal +import atexit +import sys +import linecache +import random +import string +import fcntl +import errno +import datetime +import yaml +import struct +import pycriu as crpc + +os.chdir(os.path.dirname(os.path.abspath(__file__))) + +prev_line = None + + +def alarm(*args): + print("==== ALARM ====") + + +signal.signal(signal.SIGALRM, alarm) + + +def traceit(f, e, a): + if e == "line": + lineno = f.f_lineno + fil = f.f_globals["__file__"] + if fil.endswith("zdtm.py"): + global prev_line + line = linecache.getline(fil, lineno) + if line == prev_line: + print(" ...") + else: + prev_line = line + print("+%4d: %s" % (lineno, line.rstrip())) + + return traceit + + +# Root dir for ns and uns flavors. 
All tests +# sit in the same dir +tests_root = None + + +def clean_tests_root(): + global tests_root + if tests_root and tests_root[0] == os.getpid(): + os.rmdir(tests_root[1]) + + +def make_tests_root(): + global tests_root + if not tests_root: + tests_root = (os.getpid(), tempfile.mkdtemp("", "criu-root-", "/tmp")) + atexit.register(clean_tests_root) + return tests_root[1] + + +# Report generation + +report_dir = None + + +def init_report(path): + global report_dir + report_dir = path + if not os.access(report_dir, os.F_OK): + os.makedirs(report_dir) + + +def add_to_report(path, tgt_name): + global report_dir + if report_dir: + tgt_path = os.path.join(report_dir, tgt_name) + att = 0 + while os.access(tgt_path, os.F_OK): + tgt_path = os.path.join(report_dir, tgt_name + ".%d" % att) + att += 1 + + ignore = shutil.ignore_patterns('*.socket') + if os.path.isdir(path): + shutil.copytree(path, tgt_path, ignore = ignore) + else: + if not os.path.exists(os.path.dirname(tgt_path)): + os.mkdir(os.path.dirname(tgt_path)) + shutil.copy2(path, tgt_path) + + +def add_to_output(path): + global report_dir + if not report_dir: + return + + output_path = os.path.join(report_dir, "output") + with open(path, "r") as fdi, open(output_path, "a") as fdo: + for line in fdi: + fdo.write(line) + + +prev_crash_reports = set(glob.glob("/tmp/zdtm-core-*.txt")) + + +def check_core_files(): + reports = set(glob.glob("/tmp/zdtm-core-*.txt")) - prev_crash_reports + if not reports: + return False + + while subprocess.Popen(r"ps axf | grep 'abrt\.sh'", shell = True).wait() == 0: + time.sleep(1) + + for i in reports: + add_to_report(i, os.path.basename(i)) + print_sep(i) + with open(i, "r") as report: + print(report.read()) + print_sep(i) + + return True + + +# Arch we run on +arch = os.uname()[4] + +# +# Flavors +# h -- host, test is run in the same set of namespaces as criu +# ns -- namespaces, test is run in itw own set of namespaces +# uns -- user namespace, the same as above plus user 
namespace +# + + +class host_flavor: + def __init__(self, opts): + self.name = "host" + self.ns = False + self.root = None + + def init(self, l_bins, x_bins): + pass + + def fini(self): + pass + + @staticmethod + def clean(): + pass + + +class ns_flavor: + __root_dirs = ["/bin", "/sbin", "/etc", "/lib", "/lib64", "/dev", "/dev/pts", "/dev/net", "/tmp", "/usr", "/proc", "/run"] + + def __init__(self, opts): + self.name = "ns" + self.ns = True + self.uns = False + self.root = make_tests_root() + self.root_mounted = False + + def __copy_one(self, fname): + tfname = self.root + fname + if not os.access(tfname, os.F_OK): + # Copying should be atomic as tests can be + # run in parallel + try: + os.makedirs(self.root + os.path.dirname(fname)) + except OSError as e: + if e.errno != errno.EEXIST: + raise + dst = tempfile.mktemp(".tso", "", self.root + os.path.dirname(fname)) + shutil.copy2(fname, dst) + os.rename(dst, tfname) + + def __copy_libs(self, binary): + ldd = subprocess.Popen(["ldd", binary], stdout = subprocess.PIPE) + xl = re.compile(r'^(linux-gate.so|linux-vdso(64)?.so|not a dynamic|.*\s*ldd\s)') + + # This Mayakovsky-style code gets list of libraries a binary + # needs minus vdso and gate .so-s + libs = map(lambda x: x[1] == '=>' and x[2] or x[0], + map(lambda x: str(x).split(), + filter(lambda x: not xl.match(x), + map(lambda x: str(x).strip(), + filter(lambda x: str(x).startswith('\t'), ldd.stdout.read().decode('ascii').splitlines()))))) + + ldd.wait() + + for lib in libs: + if not os.access(lib, os.F_OK): + raise test_fail_exc("Can't find lib %s required by %s" % (lib, binary)) + self.__copy_one(lib) + + def __mknod(self, name, rdev = None): + name = "/dev/" + name + if not rdev: + if not os.access(name, os.F_OK): + print("Skipping %s at root" % name) + return + else: + rdev = os.stat(name).st_rdev + + name = self.root + name + os.mknod(name, stat.S_IFCHR, rdev) + os.chmod(name, 0o666) + + def __construct_root(self): + for dir in self.__root_dirs: + 
os.mkdir(self.root + dir) + os.chmod(self.root + dir, 0o777) + + for ldir in ["/bin", "/sbin", "/lib", "/lib64"]: + os.symlink(".." + ldir, self.root + "/usr" + ldir) + + self.__mknod("tty", os.makedev(5, 0)) + self.__mknod("null", os.makedev(1, 3)) + self.__mknod("net/tun") + self.__mknod("rtc") + self.__mknod("autofs", os.makedev(10, 235)) + + def __copy_deps(self, deps): + for d in deps.split('|'): + if os.access(d, os.F_OK): + self.__copy_one(d) + self.__copy_libs(d) + return + raise test_fail_exc("Deps check %s failed" % deps) + + def init(self, l_bins, x_bins): + subprocess.check_call(["mount", "--make-slave", "--bind", ".", self.root]) + self.root_mounted = True + + if not os.access(self.root + "/.constructed", os.F_OK): + with open(os.path.abspath(__file__)) as o: + fcntl.flock(o, fcntl.LOCK_EX) + if not os.access(self.root + "/.constructed", os.F_OK): + print("Construct root for %s" % l_bins[0]) + self.__construct_root() + os.mknod(self.root + "/.constructed", stat.S_IFREG | 0o600) + + for b in l_bins: + self.__copy_libs(b) + for b in x_bins: + self.__copy_deps(b) + + def fini(self): + if self.root_mounted: + subprocess.check_call(["./umount2", self.root]) + self.root_mounted = False + + @staticmethod + def clean(): + for d in ns_flavor.__root_dirs: + p = './' + d + print('Remove %s' % p) + if os.access(p, os.F_OK): + shutil.rmtree('./' + d) + + if os.access('./.constructed', os.F_OK): + os.unlink('./.constructed') + + +class userns_flavor(ns_flavor): + def __init__(self, opts): + ns_flavor.__init__(self, opts) + self.name = "userns" + self.uns = True + + def init(self, l_bins, x_bins): + # To be able to create roots_yard in CRIU + os.chmod(".", os.stat(".").st_mode | 0o077) + ns_flavor.init(self, l_bins, x_bins) + + @staticmethod + def clean(): + pass + + +flavors = {'h': host_flavor, 'ns': ns_flavor, 'uns': userns_flavor} +flavors_codes = dict(zip(range(len(flavors)), sorted(flavors.keys()))) + +# +# Helpers +# + + +def encode_flav(f): + return 
sorted(flavors.keys()).index(f) + 128 + + +def decode_flav(i): + return flavors_codes.get(i - 128, "unknown") + + +def tail(path): + p = subprocess.Popen(['tail', '-n1', path], + stdout = subprocess.PIPE) + out = p.stdout.readline() + p.wait() + return out.decode() + + +def rpidfile(path): + with open(path) as fd: + return fd.readline().strip() + + +def wait_pid_die(pid, who, tmo = 30): + stime = 0.1 + while stime < tmo: + try: + os.kill(int(pid), 0) + except OSError as e: + if e.errno != errno.ESRCH: + print(e) + break + + print("Wait for %s(%d) to die for %f" % (who, pid, stime)) + time.sleep(stime) + stime *= 2 + else: + subprocess.Popen(["ps", "-p", str(pid)]).wait() + subprocess.Popen(["ps", "axf", str(pid)]).wait() + raise test_fail_exc("%s die" % who) + + +def test_flag(tdesc, flag): + return flag in tdesc.get('flags', '').split() + +# +# Exception thrown when something inside the test goes wrong, +# e.g. test doesn't start, criu returns with non zero code or +# test checks fail +# + + +class test_fail_exc(Exception): + def __init__(self, step): + self.step = step + + def __str__(self): + return str(self.step) + + +class test_fail_expected_exc(Exception): + def __init__(self, cr_action): + self.cr_action = cr_action + +# +# A test from zdtm/ directory. +# + + +class zdtm_test: + def __init__(self, name, desc, flavor, freezer): + self.__name = name + self.__desc = desc + self.__freezer = None + self.__make_action('cleanout') + self.__pid = 0 + self.__flavor = flavor + self.__freezer = freezer + self._bins = [name] + self._env = {} + self._deps = desc.get('deps', []) + self.auto_reap = True + self.__timeout = int(self.__desc.get('timeout') or 30) + + def __make_action(self, act, env = None, root = None): + sys.stdout.flush() # Not to let make's messages appear before ours + tpath = self.__name + '.' 
+ act + s_args = ['make', '--no-print-directory', + '-C', os.path.dirname(tpath), + os.path.basename(tpath)] + + if env: + env = dict(os.environ, **env) + + s = subprocess.Popen(s_args, env = env, cwd = root, close_fds = True, + preexec_fn = self.__freezer and self.__freezer.attach or None) + if act == "pid": + try_run_hook(self, ["--post-start"]) + if s.wait(): + raise test_fail_exc(str(s_args)) + + if self.__freezer: + self.__freezer.freeze() + + def __pidfile(self): + return self.__name + '.pid' + + def __wait_task_die(self): + wait_pid_die(int(self.__pid), self.__name, self.__timeout) + + def __add_wperms(self): + # Add write perms for .out and .pid files + for b in self._bins: + p = os.path.dirname(b) + os.chmod(p, os.stat(p).st_mode | 0o222) + + def start(self): + self.__flavor.init(self._bins, self._deps) + + print("Start test") + + env = self._env + if not self.__freezer.kernel: + env['ZDTM_THREAD_BOMB'] = "5" + + if test_flag(self.__desc, 'pre-dump-notify'): + env['ZDTM_NOTIFY_FDIN'] = "100" + env['ZDTM_NOTIFY_FDOUT'] = "101" + + if not test_flag(self.__desc, 'suid'): + # Numbers should match those in criu + env['ZDTM_UID'] = "18943" + env['ZDTM_GID'] = "58467" + env['ZDTM_GROUPS'] = "27495 48244" + self.__add_wperms() + else: + print("Test is SUID") + + if self.__flavor.ns: + env['ZDTM_NEWNS'] = "1" + env['ZDTM_ROOT'] = self.__flavor.root + env['PATH'] = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" + + if self.__flavor.uns: + env['ZDTM_USERNS'] = "1" + self.__add_wperms() + if os.getenv("GCOV"): + criu_dir = os.path.dirname(os.getcwd()) + criu_dir_r = "%s%s" % (self.__flavor.root, criu_dir) + + env['ZDTM_CRIU'] = os.path.dirname(os.getcwd()) + subprocess.check_call(["mkdir", "-p", criu_dir_r]) + + self.__make_action('pid', env, self.__flavor.root) + + try: + os.kill(int(self.getpid()), 0) + except Exception as e: + raise test_fail_exc("start: %s" % e) + + if not self.static(): + # Wait less than a second to give the test chance to + # 
move into some semi-random state + time.sleep(random.random()) + + def kill(self, sig = signal.SIGKILL): + self.__freezer.thaw() + if self.__pid: + print("Send the %d signal to %s" % (sig, self.__pid)) + os.kill(int(self.__pid), sig) + self.gone(sig == signal.SIGKILL) + + self.__flavor.fini() + + def pre_dump_notify(self): + env = self._env + + if 'ZDTM_NOTIFY_FDIN' not in env: + return + + if self.__pid == 0: + self.getpid() + + notify_fdout_path = "/proc/%s/fd/%s" % (self.__pid, env['ZDTM_NOTIFY_FDOUT']) + notify_fdin_path = "/proc/%s/fd/%s" % (self.__pid, env['ZDTM_NOTIFY_FDIN']) + + print("Send pre-dump notify to %s" % (self.__pid)) + with open(notify_fdout_path, "rb") as fdout: + with open(notify_fdin_path, "wb") as fdin: + fdin.write(struct.pack("i", 0)) + fdin.flush() + print("Wait pre-dump notify reply") + ret = struct.unpack('i', fdout.read(4)) + print("Completed pre-dump notify with %d" % (ret)) + + def stop(self): + self.__freezer.thaw() + self.getpid() # Read the pid from pidfile back + self.kill(signal.SIGTERM) + + res = tail(self.__name + '.out') + if 'PASS' not in list(map(lambda s: s.strip(), res.split())): + if os.access(self.__name + '.out.inprogress', os.F_OK): + print_sep(self.__name + '.out.inprogress') + with open(self.__name + '.out.inprogress') as fd: + print(fd.read()) + print_sep(self.__name + '.out.inprogress') + raise test_fail_exc("result check") + + def getpid(self): + if self.__pid == 0: + self.__pid = rpidfile(self.__pidfile()) + + return self.__pid + + def getname(self): + return self.__name + + def __getcropts(self): + opts = self.__desc.get('opts', '').split() + ["--pidfile", os.path.realpath(self.__pidfile())] + if self.__flavor.ns: + opts += ["--root", self.__flavor.root] + if test_flag(self.__desc, 'crlib'): + opts += ["-L", os.path.dirname(os.path.realpath(self.__name)) + '/lib'] + return opts + + def getdopts(self): + return self.__getcropts() + self.__freezer.getdopts() + self.__desc.get('dopts', '').split() + + def 
def load_module_from_file(name, path):
    """Execute the Python source file at `path` and return it as a module.

    name -- module name given to the loaded module
    path -- path of the source file; it does not need a ".py" suffix

    The module is executed on every call and is not registered in
    sys.modules by the importlib branch.
    """
    if sys.version_info >= (3, 5):
        import importlib.machinery
        import importlib.util
        # Pass an explicit source loader: without it spec_from_file_location()
        # returns None for paths whose suffix is not a known module suffix,
        # while the imp.load_source() fallback below accepts any path.
        loader = importlib.machinery.SourceFileLoader(name, path)
        spec = importlib.util.spec_from_file_location(name, path, loader = loader)
        mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mod)
    else:
        import imp
        mod = imp.load_source(name, path)
    return mod
def start(self):
    """Create the fd pairs, check them and fork the peer process.

    The pairs come from create_fds() of the loaded fd-type module. The
    parent keeps writing to the "my" end of each pair; the forked child
    reads one message from every "peer" end and exits.

    Raises test_fail_exc when a freshly created pair does not deliver the
    message written to its "my" end.
    """
    self.__files = self.__fdtyp.create_fds()

    # Check FDs returned for inter-connection
    i = 0
    for my_file, peer_file in self.__files:
        msg = self.__get_message(i)
        my_file.write(msg)
        my_file.flush()
        data = peer_file.read(len(msg))
        if data != msg:
            raise test_fail_exc("FDs screwup: %r %r" % (msg, data))
        i += 1

    start_pipe = os.pipe()
    self.__peer_pid = os.fork()
    if self.__peer_pid == 0:
        # Child: becomes the peer process that is dumped and restored.
        os.setsid()

        # child_prep is an optional hook of the fd-type module
        for _, peer_file in self.__files:
            getattr(self.__fdtyp, "child_prep", lambda fd: None)(peer_file)

        # Start from a fresh <name>.out; a failed unlink is only reported
        try:
            os.unlink(self.__name + ".out")
        except Exception as e:
            print(e)
        # stdout and stderr go to <name>.out, stdin comes from /dev/null
        fd = os.open(self.__name + ".out", os.O_WRONLY | os.O_APPEND | os.O_CREAT)
        os.dup2(fd, 1)
        os.dup2(fd, 2)
        os.close(fd)
        fd = os.open("/dev/null", os.O_RDONLY)
        os.dup2(fd, 0)
        # The child only needs the peer ends
        for my_file, _ in self.__files:
            my_file.close()
        # Closing both pipe ends here is what lets the parent's read return
        os.close(start_pipe[0])
        os.close(start_pipe[1])
        i = 0
        for _, peer_file in self.__files:
            msg = self.__get_message(i)
            # my_file is left over from the loop above and is already closed
            my_file.close()
            try:
                data = peer_file.read(16)
            except Exception as e:
                print("Unable to read a peer file: %s" % e)
                sys.exit(1)

            if data != msg:
                print("%r %r" % (data, msg))
            i += 1
        # The exit status reflects the last pair compared: 42 on match, 2 otherwise
        sys.exit(data == msg and 42 or 2)

    # Parent: wait until the child has closed its copies of the pipe
    os.close(start_pipe[1])
    os.read(start_pipe[0], 12)
    os.close(start_pipe[0])

    # Remember what is needed to build the dump and restore options later
    for _, peer_file in self.__files:
        self.__peer_file_names.append(self.__fdtyp.filename(peer_file))
        self.__dump_opts += self.__fdtyp.dump_opts(peer_file)

    # Snapshot of the child's open descriptors, compared again in stop()
    self.__fds = set(os.listdir("/proc/%s/fd" % self.__peer_pid))
class groups_test(zdtm_test):
    """Runs several zdtm sub-tests together through the zdtm/lib/groups binary.

    `name` is a file listing one sub-test per line. The list is only read
    when the flavor uses namespaces; otherwise the group is empty.
    """

    def __init__(self, name, desc, flavor, freezer):
        zdtm_test.__init__(self, 'zdtm/lib/groups', desc, flavor, freezer)
        if flavor.ns:
            self.__real_name = name
            with open(name) as fd:
                # Must be a real list: it is printed, appended to _bins and
                # iterated again in start()/stop(). A map() object would be
                # exhausted after the first use on Python 3.
                self.__subs = [line.strip() for line in fd.readlines()]
            print("Subs:\n%s" % '\n'.join(self.__subs))
        else:
            self.__real_name = ''
            self.__subs = []

        self._bins += self.__subs
        self._deps += get_test_desc('zdtm/lib/groups')['deps']
        self._env = {'ZDTM_TESTS': self.__real_name}

    def __get_start_cmd(self, name):
        """Return the shell command that make would run to start sub-test `name`."""
        tdir = os.path.dirname(name)
        tname = os.path.basename(name)

        s_args = ['make', '--no-print-directory', '-C', tdir]
        subprocess.check_call(s_args + [tname + '.cleanout'])
        # universal_newlines makes the output text on both Python 2 and 3, so
        # it can be concatenated with the str pieces below.
        s = subprocess.Popen(s_args + ['--dry-run', tname + '.pid'],
                             stdout = subprocess.PIPE, universal_newlines = True)
        out, _ = s.communicate()
        # The command of interest is the last line of the dry run
        cmd = out.splitlines().pop().strip()

        return 'cd /' + tdir + ' && ' + cmd

    def start(self):
        """Write the <group>.start and <group>.stop scripts, then start the group."""
        if (self.__subs):
            with open(self.__real_name + '.start', 'w') as f:
                for test in self.__subs:
                    cmd = self.__get_start_cmd(test)
                    f.write(cmd + '\n')

            with open(self.__real_name + '.stop', 'w') as f:
                for test in self.__subs:
                    f.write('kill -TERM `cat /%s.pid`\n' % test)

        zdtm_test.start(self)

    def stop(self):
        """Stop the group and require PASS in the output of every sub-test.

        Raises test_fail_exc naming the first sub-test without PASS.
        """
        zdtm_test.stop(self)

        for test in self.__subs:
            res = tail(test + '.out')
            if 'PASS' not in res.split():
                raise test_fail_exc("sub %s result check" % test)
@staticmethod
def run(action, args, criu_bin, fault = None, strace = [], preexec = None, nowait = False):
    """Run a CRIU action through the RPC library instead of the CLI.

    action   -- 'dump', 'pre-dump', 'restore' or 'page-server'
    args     -- CLI-style option list, translated by __set_opts
    criu_bin -- path of the criu binary the RPC library should use
    fault, strace, preexec -- not supported over RPC; any truthy value
                raises test_fail_exc
    nowait   -- when true and the action succeeded, return the
                criu_rpc_process object (None unless action is 'page-server')

    Returns 0 on success and -1 when CRIU reported an error.
    Raises test_fail_exc for unsupported options or actions.
    """
    if fault:
        raise test_fail_exc('RPC and FAULT not supported')
    if strace:
        raise test_fail_exc('RPC and SAT not supported')
    if preexec:
        raise test_fail_exc('RPC and PREEXEC not supported')

    ctx = {}  # Object used to keep info until action is done
    criu = crpc.criu()
    criu.use_binary(criu_bin)
    p = None

    try:
        # __set_opts may open the images directory and store its fd in ctx,
        # so it has to run under the finally that closes it.
        criu_rpc.__set_opts(criu, args, ctx)

        try:
            if action == 'dump':
                criu.dump()
            elif action == 'pre-dump':
                criu.pre_dump()
            elif action == 'restore':
                if 'rd' not in ctx:
                    raise test_fail_exc('RPC Non-detached restore is impossible')

                res = criu.restore()
                pidf = ctx.get('pidf')
                if pidf:
                    with open(pidf, 'w') as fd:
                        fd.write('%d\n' % res.pid)
            elif action == "page-server":
                res = criu.page_server_chld()
                p = criu_rpc_process()
                p.pid = res.pid
                p.criu = criu
            else:
                raise test_fail_exc('RPC for %s required' % action)
        except crpc.CRIUExceptionExternal as e:
            print("Fail", e)
            ret = -1
        else:
            ret = 0
    finally:
        # Close the images directory on every path, including exceptions
        # that propagate to the caller. Compare with None so that a
        # descriptor numbered 0 is closed too.
        imgd = ctx.get('imgd')
        if imgd is not None:
            os.close(imgd)

    if nowait and ret == 0:
        return p

    return ret
def set_test(self, test):
    """Bind the test to this object and prepare an empty dump directory.

    The directory is dump/<test name>/<test pid>. A leftover directory of
    that name is renamed to the first free "<path>.<n>" with n below 100.

    Raises test_fail_exc when all hundred backup names are taken.
    """
    self.__test = test
    self.__dump_path = "/".join(["dump", test.getname(), test.getpid()])

    if os.path.exists(self.__dump_path):
        backup_names = (self.__dump_path + "." + str(n) for n in range(100))
        free_name = next((b for b in backup_names if not os.path.exists(b)), None)
        if free_name is None:
            raise test_fail_exc("couldn't find dump dir %s" % self.__dump_path)
        os.rename(self.__dump_path, free_name)

    os.makedirs(self.__dump_path)
self.__script] + + if action == "restore": + preexec = None + else: + preexec = self.__user and self.set_user_id or None + + __ddir = self.__ddir() + + status_fds = None + if nowait: + status_fds = os.pipe() + fd = status_fds[1] + fdflags = fcntl.fcntl(fd, fcntl.F_GETFD) + fcntl.fcntl(fd, fcntl.F_SETFD, fdflags & ~fcntl.FD_CLOEXEC) + s_args += ["--status-fd", str(fd)] + + with open("/proc/sys/kernel/ns_last_pid") as ns_last_pid_fd: + ns_last_pid = ns_last_pid_fd.read() + + ret = self.__criu.run(action, s_args, self.__criu_bin, self.__fault, strace, preexec, nowait) + + if nowait: + os.close(status_fds[1]) + if os.read(status_fds[0], 1) != b'\0': + ret = ret.wait() + if self.__test.blocking(): + raise test_fail_expected_exc(action) + else: + raise test_fail_exc("criu %s exited with %s" % (action, ret)) + os.close(status_fds[0]) + return ret + + grep_errors(os.path.join(__ddir, log)) + if ret != 0: + if self.__fault and int(self.__fault) < 128: + try_run_hook(self.__test, ["--fault", action]) + if action == "dump": + # create a clean directory for images + os.rename(__ddir, __ddir + ".fail") + os.mkdir(__ddir) + os.chmod(__ddir, 0o777) + else: + # on restore we move only a log file, because we need images + os.rename(os.path.join(__ddir, log), os.path.join(__ddir, log + ".fail")) + # restore ns_last_pid to avoid a case when criu gets + # PID of one of restored processes. 
def check_pages_counts(self):
    """Compare the page count in the dump statistics with the pages-* images.

    Does nothing when the dump stats file is not readable. Otherwise the
    sum of 'shpages_written' and 'pages_written' must equal the total size
    of the pages-* files in the current dump directory divided by 4096,
    and that size must be a multiple of 4096.

    Raises test_fail_exc on a mismatch.
    """
    if not os.access(self.__stats_file("dump"), os.R_OK):
        return

    stats_written = -1
    with open(self.__stats_file("dump"), 'rb') as stfile:
        stats = crpc.images.load(stfile)
        stent = stats['entries'][0]['dump']
        stats_written = int(stent['shpages_written']) + int(stent['pages_written'])

    real_written = 0
    for f in os.listdir(self.__ddir()):
        if f.startswith('pages-'):
            real_written += os.path.getsize(os.path.join(self.__ddir(), f))

    # Integer arithmetic: "/" would give a float page count on Python 3
    r_pages, r_off = divmod(real_written, 4096)
    if (stats_written != r_pages) or (r_off != 0):
        print("ERROR: bad page counts, stats = %d real = %d(%d)" % (stats_written, r_pages, r_off))
        raise test_fail_exc("page counts mismatch")
def restore(self):
    """Assemble the restore options from the configured modes and run "criu restore".

    With lazy pages enabled, the lazy-pages daemon (and, for remote lazy
    pages, a page server) is started first without waiting for it. With
    --leave-stopped the restored tree is checked to be stopped and then
    resumed with SIGCONT.
    """
    restore_args = []
    if self.__restore_sibling:
        restore_args.append("--restore-sibling")
        self.__test.auto_reap = False
    restore_args.extend(self.__test.getropts())

    if self.__join_ns:
        restore_args.extend(["--join-ns", "net:%s" % join_ns_file])
    if self.__empty_ns:
        restore_args.extend(['--empty-ns', 'net',
                             '--action-script', os.getcwd() + '/empty-netns-prep.sh'])
    if self.__remote:
        restore_args.append("--remote")
    if self.__dedup:
        restore_args.append("--auto-dedup")

    # A restore ends the chain of incremental dumps
    self.__prev_dump_iter = None

    parent_of_cwd = os.path.dirname(os.getcwd())
    if os.getenv("GCOV"):
        restore_args.extend(['--external', 'mnt[zdtm]:%s' % parent_of_cwd])

    if self.__lazy_pages or self.__lazy_migrate:
        daemon_args = []
        if self.__remote_lazy_pages or self.__lazy_migrate:
            daemon_args.extend(["--page-server", "--port", "12345",
                                "--address", "127.0.0.1"] + self.__tls)

        if self.__remote_lazy_pages:
            server_args = ["--pidfile", "ps.pid",
                           "--port", "12345", "--lazy-pages"] + self.__tls
            self.__page_server_p = self.__criu_act("page-server", opts = server_args, nowait = True)
        self.__lazy_pages_p = self.__criu_act("lazy-pages", opts = daemon_args, nowait = True)
        restore_args.append("--lazy-pages")

    if self.__leave_stopped:
        restore_args.append('--leave-stopped')

    self.__criu_act("restore", opts = restore_args + ["--restore-detached"])
    self.show_stats("restore")

    if self.__leave_stopped:
        pstree_check_stopped(self.__test.getpid())
        pstree_signal(self.__test.getpid(), signal.SIGCONT)
def iter_parm(opt, dflt):
    """Parse an "N[:delay]" option into (range(0, N), delay).

    opt  -- option string, or a false value to fall back to `dflt`
    dflt -- default count, used through str(dflt)

    The delay defaults to 0 when the option has no ":delay" part.
    """
    spec = opt if opt else str(dflt)
    # Appending ":0" guarantees a second field even without an explicit delay
    fields = (spec + ":0").split(':')
    count = int(fields[0])
    delay = float(fields[1])
    return (range(0, count), delay)
def get_visible_state(test):
    """Collect the externally visible state of every process of a test.

    Returns a tuple (files, maps, mounts) of dicts keyed by pid string:
      files  -- set of fd names listed in the process' fd directory
      maps   -- set of "start-end perms [mode]" strings, with adjacent
                mappings of equal permissions merged
      mounts -- list of field tuples parsed from the process' mountinfo

    Everything is read through /proc/<test pid>/root/proc. Three empty
    dicts are returned unless both test.static() and test.ns() are true
    (a missing method counts as false).
    """
    maps = {}
    files = {}
    mounts = {}

    if not getattr(test, "static", lambda: False)() or \
       not getattr(test, "ns", lambda: False)():
        return ({}, {}, {})

    # Two separately named patterns and an eagerly built pid list: a lazy
    # filter() whose pattern variable is rebound inside the loop would stop
    # matching pids after the first iteration on Python 3.
    pid_re = re.compile('^[0-9]+$')
    mount_re = re.compile(r"^\S+\s\S+\s\S+\s(\S+)\s(\S+)\s(\S+)\s[^-]*?(shared)?[^-]*?(master)?[^-]*?-")
    pids = [p for p in os.listdir("/proc/%s/root/proc/" % test.getpid()) if pid_re.match(p)]
    for pid in pids:
        files[pid] = set(os.listdir("/proc/%s/root/proc/%s/fd" % (test.getpid(), pid)))

        # Each entry is [start, end, perms(, mode)]; the first one is a sentinel
        cmaps = [[0, 0, ""]]
        last = 0
        with open("/proc/%s/root/proc/%s/maps" % (test.getpid(), pid)) as mapsfd:
            for mp in mapsfd:
                fields = mp.split()
                m = [int('0x' + x, 0) for x in fields[0].split('-')]

                m.append(fields[1])

                f = "/proc/%s/root/proc/%s/map_files/%s" % (test.getpid(), pid, fields[0])
                if os.access(f, os.F_OK):
                    st = os.lstat(f)
                    m.append(oct(st.st_mode))

                # Extend the previous entry when this mapping starts where it
                # ends and has the same permissions
                if cmaps[last][1] == m[0] and cmaps[last][2] == m[2]:
                    cmaps[last][1] = m[1]
                else:
                    cmaps.append(m)
                    last += 1

        maps[pid] = set('%x-%x %s' % (x[0], x[1], " ".join(x[2:])) for x in cmaps)

        cmounts = []
        try:
            with open("/proc/%s/root/proc/%s/mountinfo" % (test.getpid(), pid)) as mountinfo:
                for m in mountinfo:
                    cmounts.append(mount_re.match(m).groups())
        except IOError as e:
            # EINVAL is tolerated and leaves the mount list as collected so far
            if e.errno != errno.EINVAL:
                raise e
        mounts[pid] = cmounts
    return files, maps, mounts
class cg_freezer:
    """Freezer backed by a cgroup under /sys/fs/cgroup/freezer.

    path  -- cgroup path relative to the freezer hierarchy
    state -- requested state; freeze() and thaw() only act when it
             starts with 'f'
    """

    def __init__(self, path, state):
        self.__path = '/sys/fs/cgroup/freezer/' + path
        self.__state = state
        self.kernel = True

    def attach(self):
        """Create the cgroup if needed and move the calling process into it."""
        # Create-and-tolerate-EEXIST instead of check-then-create: another
        # process may create the directory between the two steps.
        try:
            os.makedirs(self.__path)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
        with open(self.__path + '/tasks', 'w') as f:
            # pid 0 stands for the writing process itself
            f.write('0')

    def __set_state(self, state):
        with open(self.__path + '/freezer.state', 'w') as f:
            f.write(state)

    def freeze(self):
        if self.__state.startswith('f'):
            self.__set_state('FROZEN')

    def thaw(self):
        if self.__state.startswith('f'):
            self.__set_state('THAWED')

    def getdopts(self):
        """Extra options for criu dump."""
        return ['--freeze-cgroup', self.__path, '--manage-cgroups']

    def getropts(self):
        """Extra options for criu restore."""
        return ['--manage-cgroups']
def cmp_ns(ns1, match, ns2, msg):
    """Fail the test when the inodes of two namespace files satisfy `match`.

    ns1, ns2 -- paths of the files to stat
    match    -- "==" or "!="; the comparison that counts as a failure
    msg      -- label used in the printed message and in the exception

    Raises test_fail_exc when `ns1_ino match ns2_ino` holds, and
    ValueError for any other operator string.
    """
    import operator

    # Explicit table instead of eval() on a formatted string
    comparators = {'==': operator.eq, '!=': operator.ne}

    ns1_ino = os.stat(ns1).st_ino
    ns2_ino = os.stat(ns2).st_ino

    compare = comparators.get(match.strip())
    if compare is None:
        raise ValueError("unsupported comparison %r" % match)

    if compare(ns1_ino, ns2_ino):
        print("%s match (%r %s %r) fail" % (msg, ns1_ino, match, ns2_ino))
        raise test_fail_exc("%s compare" % msg)
def pstree_signal(root_pid, signal):
    """Send `signal` to every process yielded by pstree_each_pid(root_pid).

    Delivery is best effort: a failure for one pid is printed and the
    remaining pids are still signalled.
    """
    for pid in pstree_each_pid(root_pid):
        try:
            os.kill(int(pid), signal)
        except Exception as e:
            # %s, not %d: pid may be a string here, and %d would raise
            # TypeError from inside this handler
            print("Unable to kill %s: %s" % (pid, e))
            pass  # process is dead
self.__junit_test_cases = None + self.__failed = [] + self.__nr_skip = 0 + if self.__max > 1 and self.__total > 1: + self.__use_log = True + elif opts['report']: + self.__use_log = True + else: + self.__use_log = False + + if opts['report'] and (opts['keep_going'] or self.__total == 1): + global TestSuite, TestCase + from junit_xml import TestSuite, TestCase + now = datetime.datetime.now() + att = 0 + reportname = os.path.join(report_dir, "criu-testreport.tap") + junitreport = os.path.join(report_dir, "criu-testreport.xml") + while os.access(reportname, os.F_OK) or os.access(junitreport, os.F_OK): + reportname = os.path.join(report_dir, "criu-testreport" + ".%d.tap" % att) + junitreport = os.path.join(report_dir, "criu-testreport" + ".%d.xml" % att) + att += 1 + + self.__junit_file = open(junitreport, 'a') + self.__junit_test_cases = [] + + self.__file_report = open(reportname, 'a') + print(u"TAP version 13", file=self.__file_report) + print(u"# Hardware architecture: " + arch, file=self.__file_report) + print(u"# Timestamp: " + now.strftime("%Y-%m-%d %H:%M") + " (GMT+1)", file=self.__file_report) + print(u"# ", file=self.__file_report) + print(u"1.." 
+ str(nr_tests), file=self.__file_report) + with open("/proc/sys/kernel/tainted") as taintfd: + self.__taint = taintfd.read() + if int(self.__taint, 0) != 0: + print("The kernel is tainted: %r" % self.__taint) + if not opts["ignore_taint"]: + raise Exception("The kernel is tainted: %r" % self.__taint) + + def __show_progress(self, msg): + perc = int(self.__nr * 16 / self.__total) + print("=== Run %d/%d %s %s" % (self.__nr, self.__total, '=' * perc + '-' * (16 - perc), msg)) + + def skip(self, name, reason): + print("Skipping %s (%s)" % (name, reason)) + self.__nr += 1 + self.__runtest += 1 + self.__nr_skip += 1 + + if self.__junit_test_cases is not None: + tc = TestCase(name) + tc.add_skipped_info(reason) + self.__junit_test_cases.append(tc) + if self.__file_report: + testline = u"ok %d - %s # SKIP %s" % (self.__runtest, name, reason) + print(testline, file=self.__file_report) + + def run_test(self, name, desc, flavor): + + if len(self.__subs) >= self.__max: + self.wait() + + with open("/proc/sys/kernel/tainted") as taintfd: + taint = taintfd.read() + if self.__taint != taint: + raise Exception("The kernel is tainted: %r (%r)" % (taint, self.__taint)) + + if test_flag(desc, 'excl'): + self.wait_all() + + self.__nr += 1 + self.__show_progress(name) + + nd = ('nocr', 'norst', 'pre', 'iters', 'page_server', 'sibling', 'stop', 'empty_ns', + 'fault', 'keep_img', 'report', 'snaps', 'sat', 'script', 'rpc', 'lazy_pages', + 'join_ns', 'dedup', 'sbs', 'freezecg', 'user', 'dry_run', 'noauto_dedup', + 'remote_lazy_pages', 'show_stats', 'lazy_migrate', 'remote', 'tls', + 'criu_bin', 'crit_bin') + arg = repr((name, desc, flavor, {d: self.__opts[d] for d in nd})) + + if self.__use_log: + logf = name.replace('/', '_') + ".log" + log = open(logf, "w") + else: + logf = None + log = None + + sub = subprocess.Popen(["./zdtm_ct", "zdtm.py"], + env = dict(os.environ, CR_CT_TEST_INFO = arg), + stdout = log, stderr = subprocess.STDOUT, close_fds = True) + self.__subs[sub.pid] = {'sub': 
sub, 'log': logf, 'name': name, "start": time.time()} + + if test_flag(desc, 'excl'): + self.wait() + + def __wait_one(self, flags): + pid = -1 + status = -1 + signal.alarm(10) + while True: + try: + pid, status = os.waitpid(0, flags) + except OSError as e: + if e.errno == errno.EINTR: + subprocess.Popen(["ps", "axf"]).wait() + continue + signal.alarm(0) + raise e + else: + break + signal.alarm(0) + + self.__runtest += 1 + if pid != 0: + sub = self.__subs.pop(pid) + tc = None + if self.__junit_test_cases is not None: + tc = TestCase(sub['name'], elapsed_sec=time.time() - sub['start']) + self.__junit_test_cases.append(tc) + if status != 0: + self.__fail = True + failed_flavor = decode_flav(os.WEXITSTATUS(status)) + self.__failed.append([sub['name'], failed_flavor]) + if self.__file_report: + testline = u"not ok %d - %s # flavor %s" % (self.__runtest, sub['name'], failed_flavor) + with open(sub['log']) as sublog: + output = sublog.read() + details = {'output': output} + tc.add_error_info(output = output) + print(testline, file=self.__file_report) + print("%s" % yaml.safe_dump(details, explicit_start=True, + explicit_end=True, default_style='|'), file=self.__file_report) + if sub['log']: + add_to_output(sub['log']) + else: + if self.__file_report: + testline = u"ok %d - %s" % (self.__runtest, sub['name']) + print(testline, file=self.__file_report) + + if sub['log']: + with open(sub['log']) as sublog: + print("%s" % sublog.read().encode('ascii', 'ignore').decode('utf-8')) + os.unlink(sub['log']) + + return True + + return False + + def __wait_all(self): + while self.__subs: + self.__wait_one(0) + + def wait(self): + self.__wait_one(0) + while self.__subs: + if not self.__wait_one(os.WNOHANG): + break + if self.__fail and not opts['keep_going']: + raise test_fail_exc('') + + def wait_all(self): + self.__wait_all() + if self.__fail and not opts['keep_going']: + raise test_fail_exc('') + + def finish(self): + self.__wait_all() + if not opts['fault'] and 
check_core_files(): + self.__fail = True + if self.__file_report: + ts = TestSuite(opts['title'], self.__junit_test_cases, os.getenv("NODE_NAME")) + self.__junit_file.write(TestSuite.to_xml_string([ts])) + self.__junit_file.close() + self.__file_report.close() + + if opts['keep_going']: + if self.__fail: + print_sep("%d TEST(S) FAILED (TOTAL %d/SKIPPED %d)" + % (len(self.__failed), self.__total, self.__nr_skip), "#") + for failed in self.__failed: + print(" * %s(%s)" % (failed[0], failed[1])) + else: + print_sep("ALL TEST(S) PASSED (TOTAL %d/SKIPPED %d)" + % (self.__total, self.__nr_skip), "#") + + if self.__fail: + print_sep("FAIL", "#") + sys.exit(1) + + +def all_tests(opts): + with open(opts['set'] + '.desc') as fd: + desc = eval(fd.read()) + + files = [] + mask = stat.S_IFREG | stat.S_IXUSR + for d in os.walk(desc['dir']): + for f in d[2]: + fp = os.path.join(d[0], f) + st = os.lstat(fp) + if (st.st_mode & mask) != mask: + continue + if stat.S_IFMT(st.st_mode) in [stat.S_IFLNK, stat.S_IFSOCK]: + continue + files.append(fp) + excl = list(map(lambda x: os.path.join(desc['dir'], x), desc['exclude'])) + tlist = filter(lambda x: + not x.endswith('.checkskip') and + not x.endswith('.hook') and + x not in excl, + map(lambda x: x.strip(), files) + ) + return tlist + + +# Descriptor for abstract test not in list +default_test = {} + + +def get_test_desc(tname): + d_path = tname + '.desc' + if os.access(d_path, os.F_OK) and os.path.getsize(d_path) > 0: + with open(d_path) as fd: + return eval(fd.read()) + + return default_test + + +def self_checkskip(tname): + chs = tname + '.checkskip' + if os.access(chs, os.X_OK): + ch = subprocess.Popen([chs]) + return not ch.wait() == 0 + + return False + + +def print_fname(fname, typ): + print("=[%s]=> %s" % (typ, fname)) + + +def print_sep(title, sep = "=", width = 80): + print((" " + title + " ").center(width, sep)) + + +def print_error(line): + line = line.rstrip() + print(line) + if line.endswith('>'): # combine pie output + 
return True + return False + + +def grep_errors(fname): + first = True + print_next = False + before = [] + with open(fname) as fd: + for l in fd: + before.append(l) + if len(before) > 5: + before.pop(0) + if "Error" in l: + if first: + print_fname(fname, 'log') + print_sep("grep Error", "-", 60) + first = False + for i in before: + print_next = print_error(i) + before = [] + else: + if print_next: + print_next = print_error(l) + before = [] + if not first: + print_sep("ERROR OVER", "-", 60) + + +def run_tests(opts): + excl = None + features = {} + + if opts['pre'] or opts['snaps']: + if not criu.check("mem_dirty_track"): + print("Tracking memory is not available") + return + + if opts['all']: + torun = all_tests(opts) + run_all = True + elif opts['tests']: + r = re.compile(opts['tests']) + torun = filter(lambda x: r.match(x), all_tests(opts)) + run_all = True + elif opts['test']: + torun = opts['test'] + run_all = False + elif opts['from']: + if not os.access(opts['from'], os.R_OK): + print("No such file") + return + + with open(opts['from']) as fd: + torun = map(lambda x: x.strip(), fd) + opts['keep_going'] = False + run_all = True + else: + print("Specify test with -t or -a") + return + + torun = list(torun) + if opts['keep_going'] and len(torun) < 2: + print("[WARNING] Option --keep-going is more useful when running multiple tests") + opts['keep_going'] = False + + if opts['exclude']: + excl = re.compile(".*(" + "|".join(opts['exclude']) + ")") + print("Compiled exclusion list") + + if opts['report']: + init_report(opts['report']) + + if opts['parallel'] and opts['freezecg']: + print("Parallel launch with freezer not supported") + opts['parallel'] = None + + if opts['join_ns']: + if subprocess.Popen(["ip", "netns", "add", "zdtm_netns"]).wait(): + raise Exception("Unable to create a network namespace") + if subprocess.Popen(["ip", "netns", "exec", "zdtm_netns", "ip", "link", "set", "up", "dev", "lo"]).wait(): + raise Exception("ip link set up dev lo") + + if 
opts['lazy_pages'] or opts['remote_lazy_pages'] or opts['lazy_migrate']: + uffd = criu.check("uffd") + uffd_noncoop = criu.check("uffd-noncoop") + if not uffd: + raise Exception("UFFD is not supported, cannot run with --lazy-pages") + if not uffd_noncoop: + # Most tests will work with 4.3 - 4.11 + print("[WARNING] Non-cooperative UFFD is missing, some tests might spuriously fail") + + launcher = Launcher(opts, len(torun)) + try: + for t in torun: + global arch + + if excl and excl.match(t): + launcher.skip(t, "exclude") + continue + + tdesc = get_test_desc(t) + if tdesc.get('arch', arch) != arch: + launcher.skip(t, "arch %s" % tdesc['arch']) + continue + + if test_flag(tdesc, 'reqrst') and opts['norst']: + launcher.skip(t, "restore stage is required") + continue + + if run_all and test_flag(tdesc, 'noauto'): + launcher.skip(t, "manual run only") + continue + + feat_list = tdesc.get('feature', "") + for feat in feat_list.split(): + if feat not in features: + print("Checking feature %s" % feat) + features[feat] = criu.check(feat) + + if not features[feat]: + launcher.skip(t, "no %s feature" % feat) + feat_list = None + break + if feat_list is None: + continue + + if self_checkskip(t): + launcher.skip(t, "checkskip failed") + continue + + if opts['user']: + if test_flag(tdesc, 'suid'): + launcher.skip(t, "suid test in user mode") + continue + if test_flag(tdesc, 'nouser'): + launcher.skip(t, "criu root prio needed") + continue + + if opts['join_ns']: + if test_flag(tdesc, 'samens'): + launcher.skip(t, "samens test in the same namespace") + continue + + if opts['lazy_pages'] or opts['remote_lazy_pages'] or opts['lazy_migrate']: + if test_flag(tdesc, 'nolazy'): + launcher.skip(t, "lazy pages are not supported") + continue + + if opts['remote_lazy_pages']: + if test_flag(tdesc, 'noremotelazy'): + launcher.skip(t, "remote lazy pages are not supported") + continue + + test_flavs = tdesc.get('flavor', 'h ns uns').split() + opts_flavs = (opts['flavor'] or 
'h,ns,uns').split(',') + if opts_flavs != ['best']: + run_flavs = set(test_flavs) & set(opts_flavs) + else: + run_flavs = set([test_flavs.pop()]) + if not criu.check("userns"): + run_flavs -= set(['uns']) + if opts['user']: + # FIXME -- probably uns will make sense + run_flavs -= set(['ns', 'uns']) + + # remove ns and uns flavor in join_ns + if opts['join_ns']: + run_flavs -= set(['ns', 'uns']) + if opts['empty_ns']: + run_flavs -= set(['h']) + + if run_flavs: + launcher.run_test(t, tdesc, run_flavs) + else: + launcher.skip(t, "no flavors") + finally: + launcher.finish() + if opts['join_ns']: + subprocess.Popen(["ip", "netns", "delete", "zdtm_netns"]).wait() + + +sti_fmt = "%-40s%-10s%s" + + +def show_test_info(t): + tdesc = get_test_desc(t) + flavs = tdesc.get('flavor', '') + return sti_fmt % (t, flavs, tdesc.get('flags', '')) + + +def list_tests(opts): + tlist = all_tests(opts) + if opts['info']: + print(sti_fmt % ('Name', 'Flavors', 'Flags')) + tlist = map(lambda x: show_test_info(x), tlist) + print('\n'.join(tlist)) + + +class group: + def __init__(self, tname, tdesc): + self.__tests = [tname] + self.__desc = tdesc + self.__deps = set() + + def __is_mergeable_desc(self, desc): + # For now make it full match + if self.__desc.get('flags') != desc.get('flags'): + return False + if self.__desc.get('flavor') != desc.get('flavor'): + return False + if self.__desc.get('arch') != desc.get('arch'): + return False + if self.__desc.get('opts') != desc.get('opts'): + return False + if self.__desc.get('feature') != desc.get('feature'): + return False + return True + + def merge(self, tname, tdesc): + if not self.__is_mergeable_desc(tdesc): + return False + + self.__deps |= set(tdesc.get('deps', [])) + self.__tests.append(tname) + return True + + def size(self): + return len(self.__tests) + + # common method to write a "meta" auxiliary script (hook/checkskip) + # which will call all tests' scripts in turn + def __dump_meta(self, fname, ext): + scripts = filter(lambda names: 
os.access(names[1], os.X_OK), + map(lambda test: (test, test + ext), + self.__tests)) + if scripts: + f = open(fname + ext, "w") + f.write("#!/bin/sh -e\n") + + for test, script in scripts: + f.write("echo 'Running %s for %s'\n" % (ext, test)) + f.write('%s "$@"\n' % script) + + f.write("echo 'All %s scripts OK'\n" % ext) + f.close() + os.chmod(fname + ext, 0o700) + + def dump(self, fname): + f = open(fname, "w") + for t in self.__tests: + f.write(t + '\n') + f.close() + os.chmod(fname, 0o700) + + if len(self.__desc) or len(self.__deps): + f = open(fname + '.desc', "w") + if len(self.__deps): + self.__desc['deps'] = list(self.__deps) + f.write(repr(self.__desc)) + f.close() + + # write "meta" .checkskip and .hook scripts + self.__dump_meta(fname, '.checkskip') + self.__dump_meta(fname, '.hook') + + +def group_tests(opts): + excl = None + groups = [] + pend_groups = [] + maxs = int(opts['max_size']) + + if not os.access("groups", os.F_OK): + os.mkdir("groups") + + tlist = all_tests(opts) + random.shuffle(tlist) + if opts['exclude']: + excl = re.compile(".*(" + "|".join(opts['exclude']) + ")") + print("Compiled exclusion list") + + for t in tlist: + if excl and excl.match(t): + continue + + td = get_test_desc(t) + + for g in pend_groups: + if g.merge(t, td): + if g.size() == maxs: + pend_groups.remove(g) + groups.append(g) + break + else: + g = group(t, td) + pend_groups.append(g) + + groups += pend_groups + + nr = 0 + suf = opts['name'] or 'group' + + for g in groups: + if maxs > 1 and g.size() == 1: # Not much point in group test for this + continue + + fn = os.path.join("groups", "%s.%d" % (suf, nr)) + g.dump(fn) + nr += 1 + + print("Generated %d group(s)" % nr) + + +def clean_stuff(opts): + print("Cleaning %s" % opts['what']) + if opts['what'] == 'nsroot': + for f in flavors: + f = flavors[f] + f.clean() + + +# +# main() starts here +# + +if 'CR_CT_TEST_INFO' in os.environ: + # Fork here, since we're new pidns init and are supposed to + # collect this namespace's 
zombies + status = 0 + pid = os.fork() + if pid == 0: + tinfo = eval(os.environ['CR_CT_TEST_INFO']) + do_run_test(tinfo[0], tinfo[1], tinfo[2], tinfo[3]) + else: + while True: + wpid, status = os.wait() + if wpid == pid: + if os.WIFEXITED(status): + status = os.WEXITSTATUS(status) + else: + status = 1 + break + + sys.exit(status) + +p = argparse.ArgumentParser("CRIU test suite") +p.add_argument("--debug", help = "Print what's being executed", action = 'store_true') +p.add_argument("--set", help = "Which set of tests to use", default = 'zdtm') + +sp = p.add_subparsers(help = "Use --help for list of actions") + +rp = sp.add_parser("run", help = "Run test(s)") +rp.set_defaults(action = run_tests) +rp.add_argument("-a", "--all", action = 'store_true') +rp.add_argument("-t", "--test", help = "Test name", action = 'append') +rp.add_argument("-T", "--tests", help = "Regexp") +rp.add_argument("-F", "--from", help = "From file") +rp.add_argument("-f", "--flavor", help = "Flavor to run") +rp.add_argument("-x", "--exclude", help = "Exclude tests from --all run", action = 'append') + +rp.add_argument("--sibling", help = "Restore tests as siblings", action = 'store_true') +rp.add_argument("--join-ns", help = "Restore tests and join existing namespace", action = 'store_true') +rp.add_argument("--empty-ns", help = "Restore tests in empty net namespace", action = 'store_true') +rp.add_argument("--pre", help = "Do some pre-dumps before dump (n[:pause])") +rp.add_argument("--snaps", help = "Instead of pre-dumps do full dumps", action = 'store_true') +rp.add_argument("--dedup", help = "Auto-deduplicate images on iterations", action = 'store_true') +rp.add_argument("--noauto-dedup", help = "Manual deduplicate images on iterations", action = 'store_true') +rp.add_argument("--nocr", help = "Do not CR anything, just check test works", action = 'store_true') +rp.add_argument("--norst", help = "Don't restore tasks, leave them running after dump", action = 'store_true') 
# More options for the "run" sub-command parser (rp). Arguments registered
# with action='store_true' are boolean switches; the remaining ones take a
# value. Registration order is kept as-is because argparse lists options in
# --help output in the order they were added.
rp.add_argument("--stop", action='store_true',
                help="Check that --leave-stopped option stops ps tree.")
rp.add_argument("--iters",
                help="Do CR cycle several times before check (n[:pause])")
rp.add_argument("--fault",
                help="Test fault injection")
rp.add_argument("--sat", action='store_true',
                help="Generate criu strace-s for sat tool (restore is fake, images are kept)")
rp.add_argument("--sbs", action='store_true',
                help="Do step-by-step execution, asking user for keypress to continue")
rp.add_argument("--freezecg",
                help="Use freeze cgroup (path:state)")
rp.add_argument("--user", action='store_true',
                help="Run CRIU as regular user")
rp.add_argument("--rpc", action='store_true',
                help="Run CRIU via RPC rather than CLI")

rp.add_argument("--page-server", action='store_true',
                help="Use page server dump")
rp.add_argument("--remote", action='store_true',
                help="Use remote option for diskless C/R")
rp.add_argument("-p", "--parallel",
                help="Run test in parallel")
rp.add_argument("--dry-run", action='store_true',
                help="Don't run tests, just pretend to")
rp.add_argument("--script",
                help="Add script to get notified by criu")
# --keep-img is restricted to three values and falls back to 'failed'.
rp.add_argument("-k", "--keep-img",
                choices=['always', 'never', 'failed'],
                default='failed',
                help="Whether or not to keep images after test")
rp.add_argument("--report",
                help="Generate summary report in directory")
rp.add_argument("--keep-going", action='store_true',
                help="Keep running tests in spite of failures")
rp.add_argument("--ignore-taint", action='store_true',
                help="Don't care about a non-zero kernel taint flag")
rp.add_argument("--lazy-pages", action='store_true',
                help="restore pages on demand")
rp.add_argument("--lazy-migrate", action='store_true',
                help="restore pages on demand")
rp.add_argument("--remote-lazy-pages", action='store_true',
                help="simulate lazy migration")
rp.add_argument("--tls", action='store_true',
                help="use TLS for migration")
+rp.add_argument("--title", help = "A test suite title", default = "criu") +rp.add_argument("--show-stats", help = "Show criu statistics", action = 'store_true') +rp.add_argument("--criu-bin", help = "Path to criu binary", default = '../criu/criu') +rp.add_argument("--crit-bin", help = "Path to crit binary", default = '../crit/crit') + +lp = sp.add_parser("list", help = "List tests") +lp.set_defaults(action = list_tests) +lp.add_argument('-i', '--info', help = "Show more info about tests", action = 'store_true') + +gp = sp.add_parser("group", help = "Generate groups") +gp.set_defaults(action = group_tests) +gp.add_argument("-m", "--max-size", help = "Maximum number of tests in group") +gp.add_argument("-n", "--name", help = "Common name for group tests") +gp.add_argument("-x", "--exclude", help = "Exclude tests from --all run", action = 'append') + +cp = sp.add_parser("clean", help = "Clean something") +cp.set_defaults(action = clean_stuff) +cp.add_argument("what", choices = ['nsroot']) + +opts = vars(p.parse_args()) +if opts.get('sat', False): + opts['keep_img'] = 'always' + +if opts['debug']: + sys.settrace(traceit) + +if opts['action'] == 'run': + criu.available() +for tst in test_classes.values(): + tst.available() + +opts['action'](opts) + +for tst in test_classes.values(): + tst.cleanup() diff --git a/CRIU_code/test/zdtm/.gitignore b/CRIU_code/test/zdtm/.gitignore new file mode 100644 index 0000000..6471943 --- /dev/null +++ b/CRIU_code/test/zdtm/.gitignore @@ -0,0 +1,14 @@ +/lib/libzdtmtst.a +/lib/.gitignore +/static/.gitignore +/transition/.gitignore + +*.pid +*.pidns +*.out +*.outns +*.out.external +*.inprogress +*.test +*.test.* +*.state diff --git a/CRIU_code/test/zdtm/Makefile b/CRIU_code/test/zdtm/Makefile new file mode 100644 index 0000000..24a33f2 --- /dev/null +++ b/CRIU_code/test/zdtm/Makefile @@ -0,0 +1,13 @@ +SUBDIRS := lib static transition + +all: $(SUBDIRS) +.PHONY: all $(SUBDIRS) + +$(SUBDIRS): + $(MAKE) -C $@ all + +static: lib +transition: 
lib + +%: + set -e; for d in $(SUBDIRS); do $(MAKE) -C $$d $@; done diff --git a/CRIU_code/test/zdtm/Makefile.inc b/CRIU_code/test/zdtm/Makefile.inc new file mode 100644 index 0000000..170f316 --- /dev/null +++ b/CRIU_code/test/zdtm/Makefile.inc @@ -0,0 +1,108 @@ +.SUFFIXES: +MAKEFLAGS += -r + +ARCH ?= $(shell uname -m | sed \ + -e s/i.86/x86/ \ + -e s/x86_64/x86/ \ + -e s/sun4u/sparc64/ \ + -e s/arm.*/arm/ \ + -e s/sa110/arm/ \ + -e s/s390x/s390/ \ + -e s/parisc64/parisc/ \ + -e s/ppc64.*/ppc64/ \ + -e s/mips.*/mips/ \ + -e s/sh[234].*/sh/ \ + -e s/aarch64.*/arm64/) + +ifeq ($(ARCH),arm64) + ARCH ?= aarch64 + SRCARCH ?= aarch64 +endif + +SRCARCH ?= $(ARCH) + +ifeq ($(ARCH),arm) + ARMV := $(shell echo $(UNAME-M) | sed -nr 's/armv([[:digit:]]).*/\1/p; t; i7') + + ifeq ($(ARMV),6) + USERCFLAGS += -march=armv6 + else ifeq ($(ARMV),7) + USERCFLAGS += -march=armv7-a + endif +endif + +CC := gcc +CFLAGS += -g -O2 -Wall -Werror -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0 +CFLAGS += $(USERCFLAGS) +CFLAGS += -D_GNU_SOURCE +CPPFLAGS += -iquote $(LIBDIR)/arch/$(SRCARCH)/include + +ifeq ($(strip $(V)),) + E = @echo + Q = @ +else + E = @\# + Q = +endif + +RM := rm -f --one-file-system + +ifeq ($(COMPAT_TEST),y) +ifeq ($(ARCH),x86) + export CFLAGS += -m32 + export LDFLAGS += -m32 +endif +endif + +%.d: %.c + $(E) " DEP " $@ + $(Q)$(CC) $(CFLAGS) $(CPPFLAGS) -MM -MP -c $< -o $@ + +%.o: %.c | %.d + $(E) " CC " $@ + $(Q)$(CC) $(CFLAGS) $(CPPFLAGS) -c $< -o $@ + +%: %.o $(LDLIBS) + @echo $@ >> .gitignore + $(E) " LINK " $@ + $(Q)$(CC) $(LDFLAGS) $^ $(LDLIBS) -o $@ + +default: all + @true +.PHONY: default + +gitignore-clean: + $(RM) .gitignore +.PHONY: gitignore-clean + +clean: gitignore-clean + $(RM) $(OBJ) $(TST) *~ +.PHONY: clean + +cleandep: clean + $(RM) $(DEP) +.PHONY: cleandep + +cleanout: + $(RM) -r *.pid *.out* *.test* *.state +.PHONY: cleanout + +%.cleanout: % + $(Q) $(RM) -r $<.pid* $<.out* *$<.test* $<.*.test $<.*.state $<.state chew_$<.test* + +realclean: cleandep cleanout 
+.PHONY: realclean + +dep: $(DEP) +.PHONY: dep + +no-deps-targets := clean cleandep cleanout realclean groups.cleanout + +ifeq ($(strip $(DEP)),) +$(error No DEP defined in sub-make) +endif +ifeq ($(filter $(no-deps-targets), $(MAKECMDGOALS)),) +-include $(wildcard $(DEP)) +endif + +.SECONDARY: diff --git a/CRIU_code/test/zdtm/lib/Makefile b/CRIU_code/test/zdtm/lib/Makefile new file mode 100644 index 0000000..d2d9f1c --- /dev/null +++ b/CRIU_code/test/zdtm/lib/Makefile @@ -0,0 +1,30 @@ +LIBDIR := . + +CFLAGS += $(USERCFLAGS) + +LIB := libzdtmtst.a + +LIBSRC := datagen.c msg.c parseargs.c test.c streamutil.c lock.c ns.c tcp.c fs.c +LIBOBJ := $(LIBSRC:%.c=%.o) + +BIN := groups +SRC := $(LIBSRC) groups.c +DEP := $(SRC:%.c=%.d) +OBJ := $(SRC:%.c=%.o) +LDLIBS := $(LIB) + +TARGETS := $(LIB) $(BIN) + +include ../Makefile.inc + +all: $(TARGETS) +.PHONY: all + +clean-more: + $(RM) $(TARGETS) +.PHONY: clean-more +clean: clean-more + +$(LIB): $(LIBOBJ) + $(E) " AR " $@ + $(Q)ar rcs $@ $^ diff --git a/CRIU_code/test/zdtm/lib/arch/aarch64/include/asm/atomic.h b/CRIU_code/test/zdtm/lib/arch/aarch64/include/asm/atomic.h new file mode 100644 index 0000000..ddf4ad9 --- /dev/null +++ b/CRIU_code/test/zdtm/lib/arch/aarch64/include/asm/atomic.h @@ -0,0 +1,73 @@ +#ifndef __CR_ATOMIC_H__ +#define __CR_ATOMIC_H__ + +typedef uint32_t atomic_t; + + +/* Copied from the Linux header arch/arm/include/asm/barrier.h */ + +#define smp_mb() asm volatile("dmb ish" : : : "memory") + + +/* Copied from the Linux kernel header arch/arm64/include/asm/atomic.h */ + +static inline int atomic_read(const atomic_t *v) +{ + return (*(volatile int *)v); +} + +static inline void atomic_set(atomic_t *v, int i) +{ + *v = i; +} + +#define atomic_get atomic_read + +static inline int atomic_add_return(int i, atomic_t *v) +{ + unsigned long tmp; + int result; + + asm volatile( +"1: ldxr %w0, %2\n" +" add %w0, %w0, %w3\n" +" stlxr %w1, %w0, %2\n" +" cbnz %w1, 1b" + : "=&r" (result), "=&r" (tmp), "+Q" (*v) + : "Ir" 
(i) + : "cc", "memory"); + + smp_mb(); + return result; +} + +static inline int atomic_sub_return(int i, atomic_t *v) +{ + unsigned long tmp; + int result; + + asm volatile( +"1: ldxr %w0, %2\n" +" sub %w0, %w0, %w3\n" +" stlxr %w1, %w0, %2\n" +" cbnz %w1, 1b" + : "=&r" (result), "=&r" (tmp), "+Q" (*v) + : "Ir" (i) + : "cc", "memory"); + + smp_mb(); + return result; +} + +static inline int atomic_inc(atomic_t *v) { return atomic_add_return(1, v) - 1; } + +static inline int atomic_add(int val, atomic_t *v) { return atomic_add_return(val, v) - val; } + +static inline int atomic_dec(atomic_t *v) { return atomic_sub_return(1, v) + 1; } + +/* true if the result is 0, or false for all other cases. */ +#define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0) + +#define atomic_inc_return(v) (atomic_add_return(1, v)) + +#endif /* __CR_ATOMIC_H__ */ diff --git a/CRIU_code/test/zdtm/lib/arch/arm/include/asm/atomic.h b/CRIU_code/test/zdtm/lib/arch/arm/include/asm/atomic.h new file mode 100644 index 0000000..0ff7640 --- /dev/null +++ b/CRIU_code/test/zdtm/lib/arch/arm/include/asm/atomic.h @@ -0,0 +1,68 @@ +#ifndef __CR_ATOMIC_H__ +#define __CR_ATOMIC_H__ + + +typedef uint32_t atomic_t; + + +/* Copied from the Linux kernel header arch/arm/include/asm/atomic.h */ + +#define smp_mb() __asm__ __volatile__ ("dmb" : : : "memory") + +#define atomic_set(mem,v) (*(mem) = (v)) +#define atomic_get(v) (*(volatile uint32_t *)v) + +static inline unsigned int atomic_add_return(int i, atomic_t *v) +{ + unsigned long tmp; + unsigned int result; + + smp_mb(); + + __asm__ __volatile__("@ atomic_add_return\n" +"1: ldrex %0, [%3]\n" +" add %0, %0, %4\n" +" strex %1, %0, [%3]\n" +" teq %1, #0\n" +" bne 1b\n" + : "=&r" (result), "=&r" (tmp), "+Qo" (*v) + : "r" (v), "Ir" (i) + : "cc"); + + smp_mb(); + + return result; +} + +static inline unsigned int atomic_sub_return(int i, atomic_t *v) +{ + unsigned long tmp; + int result; + + smp_mb(); + + __asm__ __volatile__("@ atomic_sub_return\n" +"1: 
ldrex %0, [%3]\n" +" sub %0, %0, %4\n" +" strex %1, %0, [%3]\n" +" teq %1, #0\n" +" bne 1b\n" + : "=&r" (result), "=&r" (tmp), "+Qo" (*v) + : "r" (v), "Ir" (i) + : "cc"); + + smp_mb(); + + return result; +} + +static inline unsigned int atomic_inc(atomic_t *v) { return atomic_add_return(1, v) - 1; } + +static inline unsigned int atomic_add(int val, atomic_t *v) { return atomic_add_return(val, v) - val; } + +static inline unsigned int atomic_dec(atomic_t *v) { return atomic_sub_return(1, v) + 1; } + +/* true if the result is 0, or false for all other cases. */ +#define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0) + +#endif /* __CR_ATOMIC_H__ */ diff --git a/CRIU_code/test/zdtm/lib/arch/ppc64/include/asm/atomic.h b/CRIU_code/test/zdtm/lib/arch/ppc64/include/asm/atomic.h new file mode 100644 index 0000000..bd14cc0 --- /dev/null +++ b/CRIU_code/test/zdtm/lib/arch/ppc64/include/asm/atomic.h @@ -0,0 +1,87 @@ +#ifndef __CR_ATOMIC_H__ +#define __CR_ATOMIC_H__ + +/* + * PowerPC atomic operations + * + * Copied from kernel header file arch/powerpc/include/asm/atomic.h + */ +typedef uint32_t atomic_t; + +#define PPC_ATOMIC_ENTRY_BARRIER "lwsync \n" +#define PPC_ATOMIC_EXIT_BARRIER "sync \n" + +#define ATOMIC_INIT(i) { (i) } + +static __inline__ int atomic_get(const atomic_t *v) +{ + int t; + + __asm__ __volatile__("lwz%U1%X1 %0,%1" : "=r"(t) : "m"(*v)); + + return t; +} + +static __inline__ void atomic_set(atomic_t *v, int i) +{ + __asm__ __volatile__("stw%U0%X0 %1,%0" : "=m"(*v) : "r"(i)); +} + +#define ATOMIC_OP(op, asm_op) \ +static __inline__ void atomic_##op(int a, atomic_t *v) \ +{ \ + int t; \ + \ + __asm__ __volatile__( \ +"1: lwarx %0,0,%3 # atomic_" #op "\n" \ + #asm_op " %0,%2,%0\n" \ +" stwcx. 
%0,0,%3 \n" \ +" bne- 1b\n" \ + : "=&r" (t), "+m" (*v) \ + : "r" (a), "r" (v) \ + : "cc"); \ +} \ + +ATOMIC_OP(add, add) +ATOMIC_OP(sub, subf) + +#undef ATOMIC_OP + +static __inline__ int atomic_inc_return(atomic_t *v) +{ + int t; + + __asm__ __volatile__( + PPC_ATOMIC_ENTRY_BARRIER \ +"1: lwarx %0,0,%1 # atomic_inc_return\n\ + addic %0,%0,1\n" +" stwcx. %0,0,%1 \n\ + bne- 1b \n" \ + PPC_ATOMIC_EXIT_BARRIER + : "=&r" (t) + : "r" (v) + : "cc", "xer", "memory"); + + return t; +} + +static __inline__ int atomic_inc(atomic_t *v) +{ + return atomic_inc_return(v) - 1; +} + +static __inline__ void atomic_dec(atomic_t *v) +{ + int t; + + __asm__ __volatile__( +"1: lwarx %0,0,%2 # atomic_dec\n\ + addic %0,%0,-1\n" +" stwcx. %0,0,%2\n\ + bne- 1b" + : "=&r" (t), "+m" (*v) + : "r" (v) + : "cc", "xer"); +} + +#endif /* __CR_ATOMIC_H__ */ diff --git a/CRIU_code/test/zdtm/lib/arch/s390/include/asm/atomic.h b/CRIU_code/test/zdtm/lib/arch/s390/include/asm/atomic.h new file mode 100644 index 0000000..b7c4b2c --- /dev/null +++ b/CRIU_code/test/zdtm/lib/arch/s390/include/asm/atomic.h @@ -0,0 +1,68 @@ +#ifndef __ARCH_S390_ATOMIC__ +#define __ARCH_S390_ATOMIC__ + +#include + +typedef uint32_t atomic_t; + +#define __ATOMIC_OP(op_name, op_type, op_string) \ +static inline op_type op_name(op_type val, op_type *ptr) \ +{ \ + op_type old, new; \ + \ + asm volatile( \ + "0: lr %[new],%[old]\n" \ + op_string " %[new],%[val]\n" \ + " cs %[old],%[new],%[ptr]\n" \ + " jl 0b" \ + : [old] "=d" (old), [new] "=&d" (new), [ptr] "+Q" (*ptr)\ + : [val] "d" (val), "0" (*ptr) : "cc", "memory"); \ + return old; \ +} + +#define __ATOMIC_OPS(op_name, op_type, op_string) \ + __ATOMIC_OP(op_name, op_type, op_string) \ + __ATOMIC_OP(op_name##_barrier, op_type, op_string) + +__ATOMIC_OPS(__atomic_add, uint32_t, "ar") + +#undef __ATOMIC_OPS +#undef __ATOMIC_OP + +static inline int atomic_get(const atomic_t *v) +{ + int c; + + asm volatile( + " l %0,%1\n" + : "=d" (c) : "Q" (*v)); + return c; +} + +static inline 
void atomic_set(atomic_t *v, int i) +{ + asm volatile( + " st %1,%0\n" + : "=Q" (*v) : "d" (i)); +} + +static inline int atomic_add_return(int i, atomic_t *v) +{ + return __atomic_add_barrier(i, v) + i; +} + +static inline void atomic_add(int i, atomic_t *v) +{ + __atomic_add(i, v); +} + +#define atomic_sub(_i, _v) atomic_add(-(int)(_i), _v) + +static inline int atomic_inc(atomic_t *v) +{ + return atomic_add_return(1, v) - 1; +} + +#define atomic_dec(_v) atomic_sub(1, _v) + +#endif /* __ARCH_S390_ATOMIC__ */ diff --git a/CRIU_code/test/zdtm/lib/arch/x86/include/asm/atomic.h b/CRIU_code/test/zdtm/lib/arch/x86/include/asm/atomic.h new file mode 100644 index 0000000..7621df0 --- /dev/null +++ b/CRIU_code/test/zdtm/lib/arch/x86/include/asm/atomic.h @@ -0,0 +1,49 @@ +#ifndef ATOMIC_H__ +#define ATOMIC_H__ + +#define atomic_set(mem, v) \ + ({ \ + asm volatile ("lock xchg %0, %1\n" \ + : "+r" (v), "+m" (*mem) \ + : \ + : "cc", "memory"); \ + }) + +#define atomic_get(mem) \ + ({ \ + uint32_t ret__ = 0; \ + asm volatile ("lock xadd %0, %1\n" \ + : "+r" (ret__), "+m" (*mem) \ + : \ + : "cc", "memory"); \ + ret__; \ + }) + +#define atomic_inc(mem) \ + ({ \ + uint32_t ret__ = 1; \ + asm volatile ("lock xadd %0, %1\n" \ + : "+r" (ret__), "+m" (*mem) \ + : \ + : "cc", "memory"); \ + ret__; \ + }) + +#define atomic_dec(mem) \ + ({ \ + uint32_t ret__ = -1; \ + asm volatile ("lock xadd %0, %1\n" \ + : "+r" (ret__), "+m" (*mem) \ + : \ + : "cc", "memory"); \ + ret__; \ + }) + +#define atomic_add(i, mem) \ +({ \ + asm volatile("lock addl %1,%0" \ + : "+m" (*mem) \ + : "ir" (i)); \ +}) + +#endif /* ATOMIC_H__ */ diff --git a/CRIU_code/test/zdtm/lib/cpuid.h b/CRIU_code/test/zdtm/lib/cpuid.h new file mode 100644 index 0000000..f87d1ac --- /dev/null +++ b/CRIU_code/test/zdtm/lib/cpuid.h @@ -0,0 +1,39 @@ +#ifndef ZDTM_CPUID_H__ +#define ZDTM_CPUID_H__ + +/* + * Adopted from linux kernel code. 
+ */ + +static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* ecx is often an input as well as an output. */ + asm volatile("cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx) + : "memory"); +} + +static inline void cpuid(unsigned int op, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = 0; + native_cpuid(eax, ebx, ecx, edx); +} + +static inline void cpuid_count(unsigned int op, unsigned int count, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = count; + native_cpuid(eax, ebx, ecx, edx); +} + +#endif /* ZDTM_CPUID_H__ */ diff --git a/CRIU_code/test/zdtm/lib/datagen.c b/CRIU_code/test/zdtm/lib/datagen.c new file mode 100644 index 0000000..83fbea2 --- /dev/null +++ b/CRIU_code/test/zdtm/lib/datagen.c @@ -0,0 +1,140 @@ +#include +#include + +#include "zdtmtst.h" + +/* + * Generate random data only for buffers with sizes less than FAST_SIZE + * If a size of buffer is more than FAST_SIZE, the first FAST_SIZE bytes + * are filled by random generator and then this chunk is used as pattern + * for all other chunks. 
+ */ + +#define FAST_SIZE 99971 /* Prime number */ + +static void datagen_fast(uint8_t *buffer, unsigned length, uint32_t *crc) +{ + size_t off; + + datagen(buffer, FAST_SIZE, crc); + off = FAST_SIZE; + + while (off < length) { + unsigned long size = FAST_SIZE; + + if (off + FAST_SIZE > length) + size = length - off; + memcpy(buffer + off, buffer, size); + + off += size; + } +} + +static int datachk_fast(const uint8_t *buffer, unsigned length, uint32_t *crc) +{ + size_t off; + + if (datachk(buffer, FAST_SIZE, crc)) + return 1; + + off = FAST_SIZE; + + while (off < length) { + unsigned long size = FAST_SIZE; + + if (off + FAST_SIZE > length) + size = length - off; + + if (memcmp(buffer + off, buffer, size)) { + test_msg("Memory corruption [%p, %p]\n", buffer, buffer + size); + return 1; + } + off += size; + } + + return 0; +} + +/* update CRC-32 */ +#define CRCPOLY 0xedb88320 +static inline uint32_t crc32_le8(uint32_t crc, uint8_t datum) +{ + int i; + crc ^= datum; + for (i = 0; i < 8; i++) + crc = (crc >> 1) ^ ((crc & 1) ? 
CRCPOLY : 0); + return crc; +} + +void datagen(uint8_t *buffer, unsigned length, uint32_t *crc) +{ + uint32_t rnd = 0; + unsigned shift; + + if (length > FAST_SIZE) + return datagen_fast(buffer, length, crc); + + for (shift = 0; length-- > 4; buffer++, shift--, rnd >>= 8) { + if (!shift) { + shift = 4; + rnd = mrand48(); + } + + *buffer = rnd; + if (crc) + *crc = crc32_le8(*crc, *buffer); + } + + if (crc) { + *buffer++ = *crc; + *buffer++ = *crc >> 8; + *buffer++ = *crc >> 16; + *buffer++ = *crc >> 24; + } +} + +void datagen2(uint8_t *buffer, unsigned length, uint32_t *crc) +{ + uint32_t rnd = 0; + unsigned shift; + + for (shift = 0; length-- > 0; buffer++, shift--, rnd >>= 8) { + if (!shift) { + shift = 4; + rnd = mrand48(); + } + + *buffer = rnd; + if (crc) + *crc = crc32_le8(*crc, *buffer); + } +} + +int datachk(const uint8_t *buffer, unsigned length, uint32_t *crc) +{ + uint32_t read_crc; + + if (length > FAST_SIZE) + return datachk_fast(buffer, length, crc); + + for (; length-- > 4; buffer++) + *crc = crc32_le8(*crc, *buffer); + + read_crc = (uint32_t)buffer[0] | /* trailing 4 bytes hold the little-endian CRC written by datagen() */ + (uint32_t)buffer[1] << 8 | + (uint32_t)buffer[2] << 16 | + (uint32_t)buffer[3] << 24; /* widen first: a uint8_t promotes to signed int and << 24 could hit the sign bit */ + if (read_crc != *crc) { + test_msg("Read: %x, Expected: %x\n", read_crc, *crc); + return 1; + } + return 0; +} + +int datasum(const uint8_t *buffer, unsigned length, uint32_t *crc) +{ + for (; length-- > 0; buffer++) + *crc = crc32_le8(*crc, *buffer); + + return 0; +} diff --git a/CRIU_code/test/zdtm/lib/fs.c b/CRIU_code/test/zdtm/lib/fs.c new file mode 100644 index 0000000..0decfc3 --- /dev/null +++ b/CRIU_code/test/zdtm/lib/fs.c @@ -0,0 +1,96 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" +#include "fs.h" + +mnt_info_t *mnt_info_alloc(void) +{ + mnt_info_t *m = malloc(sizeof(*m)); + if (m) + memset(m, 0, sizeof(*m)); + return m; +} + +void mnt_info_free(mnt_info_t **m) +{ + if (m && *m) { + free(*m); + *m = NULL; + } +} + +mnt_info_t *get_cwd_mnt_info(void) +{ + int mnt_id, parent_mnt_id; + unsigned int kmaj,
kmin; + char str[1024], *cwd; + int ret; + FILE *f; + + mnt_info_t *m = NULL; + + char mountpoint[PATH_MAX]; + char root[PATH_MAX]; + + char *fsname = NULL; + size_t len = 0, best_len = 0; + + f = fopen("/proc/self/mountinfo", "r"); + if (!f) + return NULL; + + cwd = get_current_dir_name(); + if (!cwd) + goto err; + + m = mnt_info_alloc(); + if (!m) + goto err; + + while (fgets(str, sizeof(str), f)) { + char *hyphen = strstr(str, " - "); /* the separator is a lone hyphen field; root and mount point paths may contain '-' themselves */ + ret = sscanf(str, "%i %i %u:%u %s %s", + &mnt_id, &parent_mnt_id, + &kmaj, &kmin, + root, mountpoint); + if (ret != 6 || !hyphen) + goto err; + ret = sscanf(hyphen + 2, " %ms", &fsname); /* skip " -"; the next field is the filesystem type */ + if (ret != 1) + goto err; + + len = strlen(mountpoint); + if (!strncmp(mountpoint, cwd, len) && (mountpoint[len - 1] == '/' || cwd[len] == '/' || cwd[len] == '\0')) { /* whole path components only: "/mnt/a" must not match cwd "/mnt/ab" */ + if (len > best_len) { /* keep the longest, i.e. innermost, mount point containing cwd */ + best_len = len; + + m->mnt_id = mnt_id; + m->parent_mnt_id = parent_mnt_id; + m->s_dev = MKKDEV(kmaj, kmin); + + strncpy(m->root, root, sizeof(m->root)); + strncpy(m->mountpoint, mountpoint, sizeof(m->mountpoint)); + strncpy(m->fsname, fsname, sizeof(m->fsname) - 1); + m->fsname[sizeof(m->fsname) - 1] = 0; + } + } + + free(fsname); + fsname = NULL; + } + +out: + free(cwd); + fclose(f); + + return m; + +err: + mnt_info_free(&m); + goto out; +} diff --git a/CRIU_code/test/zdtm/lib/fs.h b/CRIU_code/test/zdtm/lib/fs.h new file mode 100644 index 0000000..972b15a --- /dev/null +++ b/CRIU_code/test/zdtm/lib/fs.h @@ -0,0 +1,53 @@ +#ifndef ZDTM_FS_H_ +#define ZDTM_FS_H_ + +#ifndef _BSD_SOURCE +# define _BSD_SOURCE +#endif + +#include +#include + +#include + +#define KDEV_MINORBITS 20 +#define KDEV_MINORMASK ((1UL << KDEV_MINORBITS) - 1) +#define MKKDEV(ma, mi) (((ma) << KDEV_MINORBITS) | (mi)) + +static inline unsigned int kdev_major(unsigned int kdev) +{ + return kdev >> KDEV_MINORBITS; +} + +static inline unsigned int kdev_minor(unsigned int kdev) +{ + return kdev & KDEV_MINORMASK; +} + +static inline dev_t kdev_to_odev(unsigned int kdev) +{ + /* + * New kernels encode devices in a new form.
+ * See kernel's fs/stat.c for details, there + * choose_32_64 helpers which are the key. + */ + unsigned major = kdev_major(kdev); + unsigned minor = kdev_minor(kdev); + + return makedev(major, minor); +} + +typedef struct { + int mnt_id; + int parent_mnt_id; + unsigned int s_dev; + char root[PATH_MAX]; + char mountpoint[PATH_MAX]; + char fsname[64]; +} mnt_info_t; + +extern mnt_info_t *mnt_info_alloc(void); +extern void mnt_info_free(mnt_info_t **m); +extern mnt_info_t *get_cwd_mnt_info(void); + +#endif /* ZDTM_FS_H_ */ diff --git a/CRIU_code/test/zdtm/lib/groups.c b/CRIU_code/test/zdtm/lib/groups.c new file mode 100644 index 0000000..83b0938 --- /dev/null +++ b/CRIU_code/test/zdtm/lib/groups.c @@ -0,0 +1,45 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Group starter"; +const char *test_author = "Pavel Emelianov "; + +int main(int argc, char **argv) +{ + int sret = 0; + char *env; + char sh[1024]; + + test_init(argc, argv); + + env = getenv("ZDTM_TESTS"); /* NULL when the variable is unset */ + if (env && env[0] != '\0') { + unsetenv("ZDTM_NEWNS"); + unsetenv("ZDTM_GROUPS"); + unsetenv("ZDTM_UID"); + unsetenv("ZDTM_GID"); + unsetenv("ZDTM_ROOT"); + + test_msg("List: [%s]\n", env); + snprintf(sh, sizeof(sh), "sh /%s.start", env); /* bounded: env comes from the environment and may exceed sh[] */ + system(sh); + } + + test_daemon(); + test_waitsig(); + + if (env && env[0] != '\0') { + snprintf(sh, sizeof(sh), "sh /%s.stop", env); + sret = system(sh); /* the exit status of the stop script decides pass/fail below */ + } + + if (sret == 0) + pass(); + else + fail("Some subs failed"); + + return 0; +} diff --git a/CRIU_code/test/zdtm/lib/groups.desc b/CRIU_code/test/zdtm/lib/groups.desc new file mode 100644 index 0000000..c44b3f2 --- /dev/null +++ b/CRIU_code/test/zdtm/lib/groups.desc @@ -0,0 +1 @@ +{'flags': 'noauto', 'deps': [ '/bin/sh', '/bin/kill', '/bin/cat' ]} diff --git a/CRIU_code/test/zdtm/lib/lock.c b/CRIU_code/test/zdtm/lib/lock.c new file mode 100644 index 0000000..3c3691d --- /dev/null +++ b/CRIU_code/test/zdtm/lib/lock.c @@ -0,0 +1,85 @@ +#include +#include +#include +#include +#include +#include +#include + +#include
"zdtmtst.h" + +#define TASK_WAITER_INITIAL 0x0fffff + +static long sys_gettid(void) +{ + return syscall(__NR_gettid); +} + +void task_waiter_init(task_waiter_t *t) +{ + datagen((void *)&t->seed, sizeof(t->seed), NULL); + t->seed = t->seed % TASK_WAITER_INITIAL; + + if (pipe(t->pipes)) { + pr_perror("task_waiter_init failed"); + exit(1); + } +} + +void task_waiter_fini(task_waiter_t *t) +{ + close(t->pipes[0]); + close(t->pipes[1]); +} + +void task_waiter_wait4(task_waiter_t *t, unsigned int lockid) +{ + struct timespec req = { .tv_nsec = TASK_WAITER_INITIAL, }; + struct timespec rem = { }; + unsigned int v; + + for (;;) { + if (read(t->pipes[0], &v, sizeof(v)) != sizeof(v)) + goto err; + + /* + * If we read a value not intended for us, say parent + * waits for specified child to complete among set of + * children, or we just have completed and wait for + * another lockid from a parent -- we need to write + * the value back and wait for some time before + * next attempt. + */ + if (v != lockid) { + if (write(t->pipes[1], &v, sizeof(v)) != sizeof(v)) + goto err; + /* + * If we get a collision in access, lets sleep + * semi-random time magnitude to decrease probability + * of a new collision. 
+ */ + nanosleep(&req, &rem); + req.tv_nsec += t->seed; + } else + break; + } + + return; + +err: + pr_perror("task_waiter_wait4 failed"); + exit(errno); +} + +void task_waiter_complete(task_waiter_t *t, unsigned int lockid) +{ + if (write(t->pipes[1], &lockid, sizeof(lockid)) != sizeof(lockid)) { + pr_perror("task_waiter_complete failed"); + exit(1); + } +} + +void task_waiter_complete_current(task_waiter_t *t) +{ + return task_waiter_complete(t, (int)sys_gettid()); +} diff --git a/CRIU_code/test/zdtm/lib/lock.h b/CRIU_code/test/zdtm/lib/lock.h new file mode 100644 index 0000000..9c0831d --- /dev/null +++ b/CRIU_code/test/zdtm/lib/lock.h @@ -0,0 +1,160 @@ +#ifndef CR_LOCK_H_ +#define CR_LOCK_H_ + +#include +#include +#include +#include +#include +#include +#include "asm/atomic.h" + +#define BUG_ON(condition) \ + do { \ + if ((condition)) { \ + raise(SIGABRT); \ + *(volatile unsigned long *)NULL = 0xdead0000 + __LINE__; \ + } \ + } while (0) +typedef struct { + uint32_t raw; +} futex_t; + +#define FUTEX_ABORT_FLAG (0x80000000) +#define FUTEX_ABORT_RAW (-1U) + +static inline int sys_futex(uint32_t *uaddr, int op, uint32_t val, const struct timespec *timeout, + uint32_t *uaddr2, uint32_t val3) +{ + return syscall(__NR_futex, uaddr, op, val, timeout, uaddr2, val3); +} + +/* Get current futex @f value */ +static inline uint32_t futex_get(futex_t *f) +{ + return atomic_get(&f->raw); +} + +/* Set futex @f value to @v */ +static inline void futex_set(futex_t *f, uint32_t v) +{ + atomic_set(&f->raw, v); +} + +/* Set futex @f to @v and wake up all waiters */ +static inline void futex_add_and_wake(futex_t *f, uint32_t v) +{ + atomic_add(v, &f->raw); + BUG_ON(sys_futex(&f->raw, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) < 0); +} + + +#define futex_init(f) futex_set(f, 0) + +/* Wait on futex @__f value @__v become in condition @__c */ +#define futex_wait_if_cond(__f, __v, __cond) \ + do { \ + int ret; \ + uint32_t tmp; \ + \ + while (1) { \ + tmp = (__f)->raw; \ + if ((tmp & 
FUTEX_ABORT_FLAG) || \ + (tmp __cond (__v))) \ + break; \ + ret = sys_futex(&(__f)->raw, FUTEX_WAIT,\ + tmp, NULL, NULL, 0); \ + if (ret < 0 && (errno == EAGAIN || errno == EINTR)) \ + continue; \ + BUG_ON(ret < 0 && errno != EWOULDBLOCK); \ + } \ + } while (0) + +/* Set futex @f to @v and wake up all waiters */ +static inline void futex_set_and_wake(futex_t *f, uint32_t v) +{ + atomic_set(&f->raw, v); + BUG_ON(sys_futex(&f->raw, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) < 0); +} + +/* Mark futex @f as wait abort needed and wake up all waiters */ +static inline void futex_abort_and_wake(futex_t *f) +{ + futex_set_and_wake(f, FUTEX_ABORT_RAW); +} + +/* Decrement futex @f value and wake up all waiters */ +static inline void futex_dec_and_wake(futex_t *f) +{ + atomic_dec(&f->raw); + BUG_ON(sys_futex(&f->raw, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) < 0); +} + +/* Increment futex @f value and wake up all waiters */ +static inline void futex_inc_and_wake(futex_t *f) +{ + atomic_inc(&f->raw); + BUG_ON(sys_futex(&f->raw, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) < 0); +} + +/* Plain increment futex @f value */ +static inline void futex_inc(futex_t *f) { atomic_inc(&f->raw); } + +/* Plain decrement futex @f value */ +static inline void futex_dec(futex_t *f) { atomic_dec(&f->raw); } + +/* Wait until futex @f value become @v */ +static inline void futex_wait_until(futex_t *f, uint32_t v) +{ futex_wait_if_cond(f, v, ==); } + +/* Wait while futex @f value is greater than @v */ +static inline void futex_wait_while_gt(futex_t *f, uint32_t v) +{ futex_wait_if_cond(f, v, <=); } + +/* Wait while futex @f value is less than @v */ +static inline void futex_wait_while_lt(futex_t *f, uint32_t v) +{ futex_wait_if_cond(f, v, >=); } +/* Wait while futex @f value is @v */ +static inline uint32_t futex_wait_while(futex_t *f, uint32_t v) +{ + while (f->raw == v) { + int ret = sys_futex(&f->raw, FUTEX_WAIT, v, NULL, NULL, 0); + if (ret < 0 && (errno == EAGAIN || errno == EINTR)) + continue; + BUG_ON(ret < 0 
&& errno != EWOULDBLOCK); + } + + return f->raw; +} + +typedef struct { + uint32_t raw; +} mutex_t; + +static void inline mutex_init(mutex_t *m) +{ + uint32_t c = 0; + atomic_set(&m->raw, c); +} + +static void inline mutex_lock(mutex_t *m) +{ + uint32_t c; + int ret; + + while ((c = atomic_inc(&m->raw)) != 0) { + ret = sys_futex(&m->raw, FUTEX_WAIT, c + 1, NULL, NULL, 0); + if (ret < 0) + pr_perror("futex"); + BUG_ON(ret < 0 && errno != EWOULDBLOCK); + } +} + +static void inline mutex_unlock(mutex_t *m) +{ + uint32_t c = 0; + atomic_set(&m->raw, c); + BUG_ON(sys_futex(&m->raw, FUTEX_WAKE, 1, NULL, NULL, 0) < 0); +} + +#endif /* CR_LOCK_H_ */ diff --git a/CRIU_code/test/zdtm/lib/msg.c b/CRIU_code/test/zdtm/lib/msg.c new file mode 100644 index 0000000..fe09940 --- /dev/null +++ b/CRIU_code/test/zdtm/lib/msg.c @@ -0,0 +1,69 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +int test_log_init(const char *fname, const char *suffix) +{ + char path[PATH_MAX]; + int logfd; + + snprintf(path, sizeof(path), "%s%s", fname, suffix); + logfd = open(path, O_WRONLY | O_EXCL | O_CREAT | O_APPEND, 0644); + if (logfd < 0) { + pr_perror("Can't open file %s", path); + return -1; + } + + dup2(logfd, STDERR_FILENO); + dup2(logfd, STDOUT_FILENO); + + close(logfd); + + setbuf(stdout, NULL); + setbuf(stderr, NULL); + + return 0; +} + +int zdtm_seccomp; +void test_msg(const char *format, ...) 
+{ + va_list arg; + int off = 0; + char buf[TEST_MSG_BUFFER_SIZE]; + int _errno = errno; + struct timeval tv; + struct tm *tm; + + if (zdtm_seccomp) /* seccomp allows restricted set of syscall-s */ + goto skip; + + gettimeofday(&tv, NULL); + tm = localtime(&tv.tv_sec); + if (tm == NULL) { + fprintf(stderr, "ERROR in %s: localtime() failed: %m\n", + __func__); + } else { + off += strftime(buf, sizeof(buf), "%H:%M:%S", tm); + } + + off += sprintf(buf + off, ".%.3ld: ", tv.tv_usec / 1000); + off += sprintf(buf + off, "%5d: ", getpid()); + +skip: + va_start(arg, format); + off += vsnprintf(buf + off, sizeof(buf) - off, format, arg); + va_end(arg); + + write(2, buf, off); + errno = _errno; +} diff --git a/CRIU_code/test/zdtm/lib/ns.c b/CRIU_code/test/zdtm/lib/ns.c new file mode 100644 index 0000000..3099f74 --- /dev/null +++ b/CRIU_code/test/zdtm/lib/ns.c @@ -0,0 +1,456 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" +#include "ns.h" + +int criu_status_in = -1, criu_status_in_peer = -1, criu_status_out = -1; + +extern int pivot_root(const char *new_root, const char *put_old); +static int prepare_mntns(void) +{ + int dfd, ret; + char *root, *criu_path; + char path[PATH_MAX]; + + root = getenv("ZDTM_ROOT"); + if (!root) { + fprintf(stderr, "ZDTM_ROOT isn't set\n"); + return -1; + } + + /* + * In a new userns all mounts are locked to protect what is + * under them. So we need to create another mount for the + * new root. 
+ */ + if (mount(root, root, NULL, MS_SLAVE , NULL)) { + fprintf(stderr, "Can't bind-mount root: %m\n"); + return -1; + } + + if (mount(root, root, NULL, MS_BIND | MS_REC, NULL)) { + fprintf(stderr, "Can't bind-mount root: %m\n"); + return -1; + } + + criu_path = getenv("ZDTM_CRIU"); + if (criu_path) { + snprintf(path, sizeof(path), "%s%s", root, criu_path); + if (mount(criu_path, path, NULL, MS_BIND, NULL) || + mount(NULL, path, NULL, MS_PRIVATE, NULL)) { + pr_perror("Unable to mount %s", path); + return -1; + } + } + + /* Move current working directory to the new root */ + ret = readlink("/proc/self/cwd", path, sizeof(path) - 1); + if (ret < 0) + return -1; + path[ret] = 0; + + dfd = open(path, O_RDONLY | O_DIRECTORY); + if (dfd == -1) { + fprintf(stderr, "open(.) failed: %m\n"); + return -1; + } + + if (chdir(root)) { + fprintf(stderr, "chdir(%s) failed: %m\n", root); + return -1; + } + if (mkdir("old", 0777) && errno != EEXIST) { + fprintf(stderr, "mkdir(old) failed: %m\n"); + return -1; + } + + if (pivot_root(".", "./old")) { + fprintf(stderr, "pivot_root(., ./old) failed: %m\n"); + return -1; + } + + if (mount("./old", "./old", NULL, MS_SLAVE | MS_REC , NULL)) { + fprintf(stderr, "Can't bind-mount root: %m\n"); + return -1; + } + + /* + * proc and sysfs can be mounted in an unprivileged namespace, + * if they are already mounted when the user namespace is created. + * So ./old must be umounted after mounting /proc and /sys. 
+ */ + if (mount("proc", "/proc", "proc", MS_MGC_VAL | MS_NOSUID | MS_NOEXEC | MS_NODEV, NULL)) { + fprintf(stderr, "mount(/proc) failed: %m\n"); + return -1; + } + + if (mount("zdtm_run", "/run", "tmpfs", 0, NULL)) { + fprintf(stderr, "Unable to mount /run: %m\n"); + return -1; + } + + if (umount2("./old", MNT_DETACH)) { + fprintf(stderr, "umount(./old) failed: %m\n"); + return -1; + } + + if (mount("pts", "/dev/pts", "devpts", MS_MGC_VAL, "mode=666,ptmxmode=666,newinstance")) { + fprintf(stderr, "mount(/dev/pts) failed: %m\n"); + return -1; + } + /* + * If CONFIG_DEVPTS_MULTIPLE_INSTANCES=n, then /dev/pts/ptmx + * does not exist. Fall back to creating the device with + * mknod() in that case. + */ + if (access("/dev/pts/ptmx", F_OK) == 0) { + if (symlink("pts/ptmx", "/dev/ptmx") && errno != EEXIST) { + fprintf(stderr, "symlink(/dev/ptmx) failed: %m\n"); + return -1; + } + } else { + if (mknod("/dev/ptmx", 0666 | S_IFCHR, makedev(5, 2)) == 0) { + chmod("/dev/ptmx", 0666); + } else if (errno != EEXIST) { + fprintf(stderr, "mknod(/dev/ptmx) failed: %m\n"); + return -1; + } + } + + if (fchdir(dfd)) { + fprintf(stderr, "fchdir() failed: %m\n"); + return -1; + } + close(dfd); + + return 0; +} + +static int prepare_namespaces(void) +{ + if (setuid(0) || setgid(0) || setgroups(0, NULL)) { + fprintf(stderr, "set*id failed: %m\n"); + return -1; + } + + system("ip link set up dev lo"); + + if (prepare_mntns()) + return -1; + + return 0; +} + +#define NS_STACK_SIZE 4096 + +/* All arguments should be above stack, because it grows down */ +struct ns_exec_args { + char stack[NS_STACK_SIZE] __stack_aligned__; + char stack_ptr[0]; + int argc; + char **argv; + int status_pipe[2]; +}; + +static void ns_sig_hand(int signo) +{ + int status, len = 0; + pid_t pid; + char buf[128] = ""; + + if (signo == SIGTERM) { + futex_set_and_wake(&sig_received, signo); + len = snprintf(buf, sizeof(buf), "Time to stop and check\n"); + goto write_out; + } + + while (1) { + pid = waitpid(-1, &status, 
WNOHANG); + if (pid == 0) + return; + if (pid == -1) { + if (errno == ECHILD) { + if (futex_get(&sig_received)) + return; + futex_set_and_wake(&sig_received, signo); + len = snprintf(buf, sizeof(buf), + "All test processes exited\n"); + } else { + len = snprintf(buf, sizeof(buf), + "wait() failed: %m\n"); + } + goto write_out; + } + if (status) + fprintf(stderr, "%d return %d\n", pid, status); + } + + return; +write_out: + /* fprintf can't be used in a sighandler due to glibc locks */ + write(STDERR_FILENO, buf, MIN(len, sizeof(buf))); +} + +#define STATUS_FD 255 +static int ns_exec(void *_arg) +{ + struct ns_exec_args *args = (struct ns_exec_args *) _arg; + char buf[4096]; + int ret; + + close(args->status_pipe[0]); + + setsid(); + + ret = dup2(args->status_pipe[1], STATUS_FD); + if (ret < 0) { + fprintf(stderr, "dup2() failed: %m\n"); + return -1; + } + close(args->status_pipe[1]); + read(STATUS_FD, buf, sizeof(buf)); + shutdown(STATUS_FD, SHUT_RD); + + if (prepare_namespaces()) + return -1; + + setenv("ZDTM_NEWNS", "2", 1); + execvp(args->argv[0], args->argv); + fprintf(stderr, "exec(%s) failed: %m\n", args->argv[0]); + return -1; +} + +int ns_init(int argc, char **argv) +{ + struct sigaction sa = { + .sa_handler = ns_sig_hand, + .sa_flags = SA_RESTART, + }; + int ret, fd, status_pipe = STATUS_FD; + char buf[128], *x; + pid_t pid; + bool reap; + + ret = fcntl(status_pipe, F_SETFD, FD_CLOEXEC); + if (ret == -1) { + fprintf(stderr, "fcntl failed %m\n"); + exit(1); + } + + if (init_notify()) { + fprintf(stderr, "Can't init pre-dump notification: %m"); + exit(1); + } + + reap = getenv("ZDTM_NOREAP") == NULL; + + sigemptyset(&sa.sa_mask); + sigaddset(&sa.sa_mask, SIGTERM); + if (reap) + sigaddset(&sa.sa_mask, SIGCHLD); + + if (sigaction(SIGTERM, &sa, NULL)) { + fprintf(stderr, "Can't set SIGTERM handler: %m\n"); + exit(1); + } + + x = malloc(strlen(pidfile) + 3); + sprintf(x, "%sns", pidfile); + pidfile = x; + + /* Start test */ + pid = fork(); + if (pid < 0) { + 
fprintf(stderr, "fork() failed: %m\n"); + exit(1); + } else if (pid == 0) { + close(status_pipe); + unsetenv("ZDTM_NEWNS"); + return 0; /* Continue normal test startup */ + } + + ret = -1; + if (waitpid(pid, &ret, 0) < 0) + fprintf(stderr, "waitpid() failed: %m\n"); + else if (ret) + fprintf(stderr, "The test returned non-zero code %d\n", ret); + + if (reap && sigaction(SIGCHLD, &sa, NULL)) { + fprintf(stderr, "Can't set SIGCHLD handler: %m\n"); + exit(1); + } + + while (reap && 1) { + int status; + + pid = waitpid(-1, &status, WNOHANG); + if (pid == 0) + break; + if (pid < 0) { + fprintf(stderr, "waitpid() failed: %m\n"); + exit (1); + } + if (status) + fprintf(stderr, "%d return %d\n", pid, status); + } + + /* Daemonize */ + write(status_pipe, &ret, sizeof(ret)); + close(status_pipe); + if (ret) + exit(ret); + + /* suspend/resume */ + test_waitsig(); + + fd = open(pidfile, O_RDONLY); + if (fd == -1) { + fprintf(stderr, "open(%s) failed: %m\n", pidfile); + exit(1); + } + ret = read(fd, buf, sizeof(buf) - 1); + if (ret == -1) { + fprintf(stderr, "read() failed: %m\n"); + exit(1); + } + buf[ret] = '\0'; + + pid = atoi(buf); + fprintf(stderr, "kill(%d, SIGTERM)\n", pid); + if (pid > 0) + kill(pid, SIGTERM); + + ret = 0; + if (reap) { + while (true) { + pid_t child; + ret = -1; + + child = waitpid(-1, &ret, 0); + if (child < 0) { + fprintf(stderr, "Unable to wait a test process: %m"); + exit(1); + } + if (child == pid) { + fprintf(stderr, "The test returned 0x%x", ret); + exit(!(ret == 0)); + } + if (ret) + fprintf(stderr, "The %d process exited with 0x%x", child, ret); + } + } else { + waitpid(pid, NULL, 0); + } + + + exit(1); +} + +#define UID_MAP "0 20000 20000\n100000 200000 50000" +#define GID_MAP "0 400000 50000\n50000 500000 100000" +void ns_create(int argc, char **argv) +{ + pid_t pid; + int ret, status; + struct ns_exec_args args; + int flags; + char *pidf; + + args.argc = argc; + args.argv = argv; + + ret = socketpair(AF_UNIX, SOCK_SEQPACKET, 0, 
args.status_pipe); + if (ret) { + fprintf(stderr, "Pipe() failed %m\n"); + exit(1); + } + + flags = CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWUTS | + CLONE_NEWNET | CLONE_NEWIPC | SIGCHLD; + + if (getenv("ZDTM_USERNS")) + flags |= CLONE_NEWUSER; + + pid = clone(ns_exec, args.stack_ptr, flags, &args); + if (pid < 0) { + fprintf(stderr, "clone() failed: %m\n"); + exit(1); + } + + close(args.status_pipe[1]); + + if (flags & CLONE_NEWUSER) { + char pname[PATH_MAX]; + int fd; + + snprintf(pname, sizeof(pname), "/proc/%d/uid_map", pid); + fd = open(pname, O_WRONLY); + if (fd < 0) { + fprintf(stderr, "open(%s): %m\n", pname); + exit(1); + } + if (write(fd, UID_MAP, sizeof(UID_MAP)) < 0) { + fprintf(stderr, "write(" UID_MAP "): %m\n"); + exit(1); + } + close(fd); + + snprintf(pname, sizeof(pname), "/proc/%d/gid_map", pid); + fd = open(pname, O_WRONLY); + if (fd < 0) { + fprintf(stderr, "open(%s): %m\n", pname); + exit(1); + } + if (write(fd, GID_MAP, sizeof(GID_MAP)) < 0) { + fprintf(stderr, "write(" GID_MAP "): %m\n"); + exit(1); + } + close(fd); + } + shutdown(args.status_pipe[0], SHUT_WR); + + pidf = pidfile; + pidfile = malloc(strlen(pidfile) + 13); + sprintf(pidfile, "%s%s", pidf, INPROGRESS); + if (write_pidfile(pid)) { + fprintf(stderr, "Preparations fail\n"); + exit(1); + } + + status = 1; + ret = read(args.status_pipe[0], &status, sizeof(status)); + if (ret != sizeof(status) || status) { + fprintf(stderr, "The test failed (%d, %d)\n", ret, status); + exit(1); + } + ret = read(args.status_pipe[0], &status, sizeof(status)); + if (ret != 0) { + fprintf(stderr, "Unexpected message from test\n"); + exit(1); + } + + unlink(pidfile); + pidfile = pidf; + + if (write_pidfile(pid)) + exit(1); + + exit(0); +} + diff --git a/CRIU_code/test/zdtm/lib/ns.h b/CRIU_code/test/zdtm/lib/ns.h new file mode 100644 index 0000000..40cc1e0 --- /dev/null +++ b/CRIU_code/test/zdtm/lib/ns.h @@ -0,0 +1,17 @@ +#ifndef __ZDTM_NS__ +#define __ZDTM_NS__ + +#include "lock.h" + +extern futex_t 
sig_received; +extern char *pidfile; + +extern void ns_create(int argc, char **argv); +extern int ns_init(int argc, char **argv); + +extern void test_waitsig(void); +extern void parseargs(int, char **); + +extern int init_notify(void); + +#endif diff --git a/CRIU_code/test/zdtm/lib/parseargs.c b/CRIU_code/test/zdtm/lib/parseargs.c new file mode 100644 index 0000000..7e411f6 --- /dev/null +++ b/CRIU_code/test/zdtm/lib/parseargs.c @@ -0,0 +1,175 @@ +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +static struct long_opt *opt_head; + +static int help; +TEST_OPTION(help, bool, "print help message and exit", 0); + +void __push_opt(struct long_opt *opt) +{ + opt->next = opt_head; + /* FIXME: barrier ? */ + opt_head = opt; +} + +int parse_opt_bool(char *param, void *arg) +{ + if (param == NULL || + !strcmp(param, "on") || + !strcmp(param, "yes") || + !strcmp(param, "true")) { + * (int *) arg = 1; + return 0; + } + if (!strcmp(param, "off") || + !strcmp(param, "no") || + !strcmp(param, "false")) { + * (int *) arg = 0; + return 0; + } + return -EINVAL; +} + +int parse_opt_int(char *param, void *arg) +{ + char *tail; + if (param == NULL || param[0] == '\0') + return -EINVAL; + * (int *) arg = strtol(param, &tail, 0); + if (tail[0] != '\0') + return -EINVAL; + return 0; +} + +int parse_opt_uint(char *param, void *arg) +{ + char *tail; + if (param == NULL || param[0] == '\0') + return -EINVAL; + * (unsigned int *) arg = strtoul(param, &tail, 0); + if (tail[0] != '\0') + return -EINVAL; + return 0; +} + +int parse_opt_long(char *param, void *arg) +{ + char *tail; + if (param == NULL || param[0] == '\0') + return -EINVAL; + * (long *) arg = strtol(param, &tail, 0); + if (tail[0] != '\0') + return -EINVAL; + return 0; +} + +int parse_opt_ulong(char *param, void *arg) +{ + char *tail; + if (param == NULL || param[0] == '\0') + return -EINVAL; + * (unsigned long *) arg = strtoul(param, &tail, 0); + if (tail[0] != '\0') + return -EINVAL; + return 0; +} + 
+int parse_opt_string(char *param, void *arg) +{ + if (param == NULL || param[0] == '\0') + return -EINVAL; + * (char **) arg = param; + return 0; +} + +static void printopt(const struct long_opt *opt) +{ + const char *obracket = "", *cbracket = ""; + + if (!opt->is_required) { + obracket = "["; + cbracket = "]"; + } + + fprintf(stderr, " %s--%s=%s%s\t%s\n", + obracket, opt->name, opt->type, cbracket, opt->doc); +} + +static void helpexit(void) +{ + struct long_opt *opt; + + fputs("Usage:\n", stderr); + + for (opt = opt_head; opt; opt = opt->next) + printopt(opt); + + exit(1); +} + +const char *test_doc; +const char *test_author; + +static void prdoc(void) +{ + if (test_doc) + fprintf(stderr, "%s\n", test_doc); + if (test_author) + fprintf(stderr, "Author: %s\n", test_author); +} + +void parseargs(int argc, char ** argv) +{ + int i; + struct long_opt *opt; + + for (i = 1; i < argc; i++) { + char *name, *value; size_t nlen; /* nlen: length of the option name without "--" and "=value" */ + + if (strlen(argv[i]) < 2 || strncmp(argv[i], "--", 2)) { + fprintf(stderr, "%s: options should start with --\n", argv[i]); + helpexit(); + } + + name = argv[i] + 2; + + value = strchr(name, '='); + if (value) + value++; + nlen = value ? (size_t)(value - name - 1) : strlen(name); /* no pointer arithmetic on a NULL value */ + for (opt = opt_head; opt; opt = opt->next) + if (strlen(opt->name) == nlen && !strncmp(name, opt->name, nlen)) { /* exact name match; a bare prefix or an empty name must not select an option */ + if (opt->parse_opt(value, opt->value)) { + fprintf(stderr, "%s: failed to parse\n", argv[i]); + helpexit(); + } + else + /* -1 marks fulfilled requirement */ + opt->is_required = opt->is_required > 0 ? -opt->is_required : opt->is_required; /* repeating an option must not flip it back to unfulfilled */ + + break; + } + + if (!opt) { + fprintf(stderr, "%s: unknown option\n", argv[i]); + helpexit(); + } + } + + if (help) { + prdoc(); + helpexit(); + } + + for (opt = opt_head; opt; opt = opt->next) + if (opt->is_required > 0) { + fprintf(stderr, "mandatory flag --%s not given\n", opt->name); + helpexit(); + } +} diff --git a/CRIU_code/test/zdtm/lib/parseargs.sh b/CRIU_code/test/zdtm/lib/parseargs.sh new file mode 100644 index 0000000..19412b8 --- /dev/null +++ b/CRIU_code/test/zdtm/lib/parseargs.sh @@ -0,0 +1,92 @@ +#!/bin/bash +# +# parse
command line flags of the form --foo=bar and print out an eval-able line + +name=$0 + +function die() { + echo "$name: $*" >&2 + exit 1 +} + +# eat our flags first +while : ; do + flag=$1 + shift || break + case $flag in + --flags-req=*) # req'd flags + oIFS="$IFS" IFS="," + vars_req=(${flag#*=}) + IFS="$oIFS" + ;; + --flags-opt=*) # optional flags + oIFS="$IFS" IFS="," + vars_opt=(${flag#*=}) + IFS="$oIFS" + ;; + --name=*) # name to report errors as + name=${flag#*=} + ;; + --flags-only) # report only flags + show_flags=true show_args=false + ;; + --no-flags) # report only remaining args + show_flags=false show_args=true + ;; + --) # end of our flags; external flags follow + break + ;; + esac +done + +# consume external flags +while : ; do + flag=$1 + shift || break + case $flag in + --*=*) + ;; + --) # end of external flags; uninterpreted arguments follow + break + ;; + *) # pass unrecognized arguments through + args="$args '$flag'" + continue + ;; + esac + + flagname=${flag%%=*} + flagname=${flagname#--} + flagval=${flag#*=} + + # check if this flag is declared + case " ${vars_req[*]} ${vars_opt[*]} " in + *" $flagname "*) + ;; + *) # pass unrecognized flags through + args="$args '$flag'" + continue + ;; + esac + + eval $flagname=\"$flagval\" +done + +# check that we have all required flags +for var in ${vars_req[@]}; do + ${!var+true} die "--$var is required" +done + +# now print 'em out +if ${show_flags:-true}; then + for var in ${vars_req[@]} ${vars_opt[@]}; do + # only print those that are set (even to an empty string) + ${!var+echo $var="'${!var}'"} + done +fi +if ${show_args:-true}; then + for arg in "$@"; do # get quotes right + args="$args '$arg'" + done + echo "set -- $args" +fi diff --git a/CRIU_code/test/zdtm/lib/stop_and_chk.sh b/CRIU_code/test/zdtm/lib/stop_and_chk.sh new file mode 100644 index 0000000..25fef56 --- /dev/null +++ b/CRIU_code/test/zdtm/lib/stop_and_chk.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +export PATH=$PATH:${0%/*} + +function die() { + 
/*
 * Pump data from infd to outfd through the caller-supplied buffer.
 *
 * Reads up to `length` bytes at a time and writes each chunk out completely
 * before reading again.  Returns 0 when read() reports end-of-file, or the
 * negative value returned by the read()/write() call that failed.
 */
int pipe_in2out(int infd, int outfd, uint8_t *buffer, int length)
{
	for (;;) {
		int pending = read(infd, buffer, length);
		uint8_t *cursor = buffer;

		if (pending <= 0)
			return pending;

		/* drain the whole chunk before going back to read() */
		while (pending > 0) {
			int written = write(outfd, cursor, pending);

			if (written < 0)
				return written;

			cursor += written;
			pending -= written;
		}
	}
}
ret = read(fd, buf + cur, size - cur); + if (ret <= 0) { + pr_perror("read(%d) = %d", size - cur, ret); + return -1; + } + cur += ret; + } + + return 0; +} + +int write_data(int fd, const unsigned char *buf, int size) +{ + int cur = 0; + int ret; + + while (cur != size) { + ret = write(fd, buf + cur, size - cur); + if (ret <= 0) { + pr_perror("write(%d) = %d", size - cur, ret); + return -1; + } + cur += ret; + } + + return 0; +} diff --git a/CRIU_code/test/zdtm/lib/tcp.c b/CRIU_code/test/zdtm/lib/tcp.c new file mode 100644 index 0000000..e753e3c --- /dev/null +++ b/CRIU_code/test/zdtm/lib/tcp.c @@ -0,0 +1,132 @@ +#include +#include +#include /* for sockaddr_in and inet_ntoa() */ + +#include "zdtmtst.h" + +union sockaddr_inet { + struct sockaddr_in v4; + struct sockaddr_in6 v6; +}; + +int tcp_init_server(int family, int *port) +{ + struct zdtm_tcp_opts opts = { + .reuseaddr = true, + .reuseport = false, + }; + + return tcp_init_server_with_opts(family, port, &opts); +} + +int tcp_init_server_with_opts(int family, int *port, struct zdtm_tcp_opts *opts) +{ + union sockaddr_inet addr; + int sock; + int yes = 1, ret; + + memset(&addr,0,sizeof(addr)); + if (family == AF_INET) { + addr.v4.sin_family = family; + inet_pton(family, "0.0.0.0", &(addr.v4.sin_addr)); + } else if (family == AF_INET6){ + addr.v6.sin6_family = family; + inet_pton(family, "::0", &(addr.v6.sin6_addr)); + } else + return -1; + + sock = socket(family, SOCK_STREAM | opts->flags, IPPROTO_TCP); + if (sock == -1) { + pr_perror("socket() failed"); + return -1; + } + + if (opts->reuseport && + setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, &yes, sizeof(int)) == -1) { + pr_perror(""); + return -1; + } + + if (opts->reuseaddr && + setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(int)) == -1 ) { + pr_perror("setsockopt() error"); + return -1; + } + + while (1) { + if (family == AF_INET) + addr.v4.sin_port = htons(*port); + else if (family == AF_INET6) + addr.v6.sin6_port = htons(*port); + + ret = bind(sock, 
(struct sockaddr *) &addr, sizeof(addr)); + + /* criu doesn't restore sock opts, so we need this hack */ + if (ret == -1 && errno == EADDRINUSE) { + test_msg("The port %d is already in use.\n", *port); + (*port)++; + continue; + } + break; + } + + if (ret == -1) { + pr_perror("bind() failed"); + return -1; + } + + if (listen(sock, 1) == -1) { + pr_perror("listen() failed"); + return -1; + } + return sock; +} + +int tcp_accept_server(int sock) +{ + struct sockaddr_in maddr; + int sock2; + socklen_t addrlen; +#ifdef DEBUG + test_msg ("Waiting for connection..........\n"); +#endif + addrlen = sizeof(maddr); + sock2 = accept(sock,(struct sockaddr *) &maddr, &addrlen); + + if (sock2 == -1) { + pr_perror("accept() failed"); + return -1; + } + +#ifdef DEBUG + test_msg ("Connection!!\n"); +#endif + return sock2; +} + +int tcp_init_client(int family, char *servIP, unsigned short servPort) +{ + int sock; + union sockaddr_inet servAddr; + + if ((sock = socket(family, SOCK_STREAM, IPPROTO_TCP)) < 0) { + pr_perror("can't create socket"); + return -1; + } + /* Construct the server address structure */ + memset(&servAddr, 0, sizeof(servAddr)); + if (family == AF_INET) { + servAddr.v4.sin_family = AF_INET; + servAddr.v4.sin_port = htons(servPort); + inet_pton(AF_INET, servIP, &servAddr.v4.sin_addr); + } else { + servAddr.v6.sin6_family = AF_INET6; + servAddr.v6.sin6_port = htons(servPort); + inet_pton(AF_INET6, servIP, &servAddr.v6.sin6_addr); + } + if (connect(sock, (struct sockaddr *) &servAddr, sizeof(servAddr)) < 0) { + pr_perror("can't connect to server"); + return -1; + } + return sock; +} diff --git a/CRIU_code/test/zdtm/lib/test.c b/CRIU_code/test/zdtm/lib/test.c new file mode 100644 index 0000000..a1bdfc1 --- /dev/null +++ b/CRIU_code/test/zdtm/lib/test.c @@ -0,0 +1,413 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" +#include "lock.h" 
+#include "ns.h" + +futex_t sig_received; +static struct { + futex_t stage; +} *test_shared_state; + +enum { + TEST_INIT_STAGE = 0, + TEST_RUNNING_STAGE, + TEST_FINI_STAGE, + TEST_FAIL_STAGE, +}; + +static int parent; + +extern int criu_status_in, criu_status_in_peer, criu_status_out; + +static void sig_hand(int signo) +{ + if (parent) + futex_set_and_wake(&test_shared_state->stage, TEST_FAIL_STAGE); + futex_set_and_wake(&sig_received, signo); + if (criu_status_in >= 0) + close(criu_status_in); +} + +static char *outfile; +TEST_OPTION(outfile, string, "output file", 1); +char *pidfile; +TEST_OPTION(pidfile, string, "file to store pid", 1); + +static pid_t master_pid = 0; + +int test_fork_id(int id) +{ + return fork(); +} + +static int cwd = -1; + +static void test_fini(void) +{ + char path[PATH_MAX]; + + if (getpid() != master_pid) + return; + + snprintf(path, sizeof(path), "%s%s", outfile, INPROGRESS); + renameat(cwd, path, cwd, outfile); + + unlinkat(cwd, pidfile, 0); +} + +static void setup_outfile() +{ + if (!access(outfile, F_OK) || errno != ENOENT) { + fprintf(stderr, "Output file %s appears to exist, aborting\n", + outfile); + exit(1); + } + + cwd = open(".", O_RDONLY); + if (cwd < 0) { + fprintf(stderr, "Unable to open\n"); + exit(1); + } + + if (atexit(test_fini)) { + fprintf(stderr, "Can't register exit function\n"); + exit(1); + } + if (test_log_init(outfile, INPROGRESS)) + exit(1); +} + +static void redir_stdfds() +{ + int nullfd; + + nullfd = open("/dev/null", O_RDWR); + if (nullfd < 0) { + pr_perror("Can't open /dev/null"); + exit(1); + } + + dup2(nullfd, STDIN_FILENO); + if (nullfd > 2) + close(nullfd); +} + +void test_ext_init(int argc, char **argv) +{ + parseargs(argc, argv); + if (test_log_init(outfile, ".external")) + exit(1); +} + +#define PIPE_RD 0 +#define PIPE_WR 1 + +int init_notify(void) +{ + char *val; + int ret; + int p[2]; + + val = getenv("ZDTM_NOTIFY_FDIN"); + if (!val) + return 0; + criu_status_in = atoi(val); + + val = 
getenv("ZDTM_NOTIFY_FDOUT"); + if (!val) + return -1; + criu_status_out = atoi(val); + + if (pipe(p)) { + fprintf(stderr, "Unable to create a pipe: %m\n"); + return -1; + } + criu_status_in_peer = p[PIPE_WR]; + + ret = dup2(p[PIPE_RD], criu_status_in); + if (ret < 0) { + fprintf(stderr, "dup2() failed: %m\n"); + close(p[PIPE_RD]); + close(p[PIPE_WR]); + return -1; + } + close(p[PIPE_RD]); + + if (pipe(p)) { + fprintf(stderr, "Unable to create a pipe: %m\n"); + goto err_pipe_in; + } + close(p[PIPE_RD]); + + ret = dup2(p[PIPE_WR], criu_status_out); + if (ret < 0) { + fprintf(stderr, "dup2() failed: %m\n"); + goto err_pipe_out; + } + + close(p[PIPE_WR]); + return 0; +err_pipe_out: + close(p[PIPE_RD]); + close(p[PIPE_WR]); +err_pipe_in: + close(criu_status_in); + close(criu_status_in_peer); + return -1; +} + +int write_pidfile(int pid) +{ + int fd = -1; + char tmp[] = ".zdtm.pidfile.XXXXXX"; + + fd = mkstemp(tmp); + if (fd == -1) { + fprintf(stderr, "Can't create the file %s: %m\n", tmp); + return -1; + } + + if (fchmod(fd, 0666) < 0) { + fprintf(stderr, "Can't fchmod %s: %m\n", tmp); + goto err_c; + } + + if (dprintf(fd, "%d", pid) == -1) { + fprintf(stderr, "Can't write in the file %s: %m\n", tmp); + goto err_c; + } + + close(fd); + + if (rename(tmp, pidfile) < 0) { + fprintf(stderr, "Can't rename %s to %s: %m\n", tmp, pidfile); + goto err_u; + } + + return 0; + +err_c: + close(fd); +err_u: + unlink(tmp); + return -1; +} + +void test_init(int argc, char **argv) +{ + pid_t pid; + char *val; + struct sigaction sa = { + .sa_handler = sig_hand, + .sa_flags = SA_RESTART, + }; + sigemptyset(&sa.sa_mask); + + parseargs(argc, argv); + + val = getenv("ZDTM_NEWNS"); + if (val) { + if (!strcmp(val, "1")) { + ns_create(argc, argv); + exit(1); + } + + if (!strcmp(val, "2")) { + test_log_init(outfile, "ns"); + redir_stdfds(); + ns_init(argc, argv); + } + } else if (init_notify()) { + fprintf(stderr, "Can't init pre-dump notification: %m"); + exit(1); + } + + val = 
getenv("ZDTM_GROUPS"); + if (val) { + char *tok = NULL; + unsigned int size = 0, groups[NGROUPS_MAX]; + + tok = strtok(val, " "); + while (tok) { + size++; + groups[size - 1] = atoi(tok); + tok = strtok(NULL, " "); + } + + if (setgroups(size, groups)) { + fprintf(stderr, "Can't set groups: %m"); + exit(1); + } + } + + val = getenv("ZDTM_GID"); + if (val && (setgid(atoi(val)) == -1)) { + fprintf(stderr, "Can't set gid: %m"); + exit(1); + } + + val = getenv("ZDTM_UID"); + if (val && (setuid(atoi(val)) == -1)) { + fprintf(stderr, "Can't set gid: %m"); + exit(1); + } + + if (prctl(PR_SET_DUMPABLE, 1)) { + fprintf(stderr, "Can't set the dumpable flag"); + exit(1); + } + + if (sigaction(SIGTERM, &sa, NULL)) { + fprintf(stderr, "Can't set SIGTERM handler: %m\n"); + exit(1); + } + + if (sigaction(SIGCHLD, &sa, NULL)) { + fprintf(stderr, "Can't set SIGCHLD handler: %m\n"); + exit(1); + } + + setup_outfile(); + redir_stdfds(); + + test_shared_state = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, 0, 0); + if (test_shared_state == MAP_FAILED) { + pr_perror("Unable to map a shared memory"); + exit(1); + } + + futex_init(&test_shared_state->stage); + futex_set(&test_shared_state->stage, TEST_INIT_STAGE); + + pid = fork(); + if (pid < 0) { + pr_perror("Daemonizing failed"); + exit(1); + } + + parent = 1; + if (pid) { /* parent will exit when the child is ready */ + futex_wait_while(&test_shared_state->stage, TEST_INIT_STAGE); + + if (futex_get(&test_shared_state->stage) == TEST_FAIL_STAGE) { + int ret; + if (waitpid(pid, &ret, 0) != pid) { + pr_perror("Unable to wait %d", pid); + exit(1); + } + + if (WIFEXITED(ret)) { + pr_err("Test exited unexpectedly with code %d\n", WEXITSTATUS(ret)); + exit(1); + } + if (WIFSIGNALED(ret)) { + pr_err("Test exited on unexpected signal %d\n", WTERMSIG(ret)); + exit(1); + } + } + + if (write_pidfile(pid)) + exit(1); + + _exit(0); + } + parent = 0; + + if (setsid() < 0) { + pr_perror("Can't become session group 
leader"); + exit(1); + } + + /* record the test pid to remember the ownership of the pidfile */ + master_pid = getpid(); + + sa.sa_handler = SIG_DFL; + if (sigaction(SIGCHLD, &sa, NULL)) { + pr_perror("Can't reset SIGCHLD handler"); + exit(1); + } + + srand48(time(NULL)); /* just in case we need it */ +} + +void test_daemon() +{ + futex_set_and_wake(&test_shared_state->stage, TEST_RUNNING_STAGE); +} + +int test_go(void) +{ + return !futex_get(&sig_received); +} + +void test_waitsig(void) +{ + futex_wait_while(&sig_received, 0); +} + +int test_wait_pre_dump(void) +{ + int ret; + + if (criu_status_in < 0) { + pr_err("Fd criu_status_in is not initialized\n"); + return -1; + } + + if (read(criu_status_in, &ret, sizeof(ret)) != sizeof(ret)) { + if (errno != EBADF || !futex_get(&sig_received)) + pr_perror("Can't wait pre-dump\n"); + return -1; + } + pr_err("pre-dump\n"); + + return 0; +} + +int test_wait_pre_dump_ack(void) +{ + int ret = 0; + + if (criu_status_out < 0) { + pr_err("Fd criu_status_out is not initialized\n"); + return -1; + } + + pr_err("pre-dump-ack\n"); + if (write(criu_status_out, &ret, sizeof(ret)) != sizeof(ret)) { + pr_perror("Can't reply to pre-dump notify"); + return -1; + } + + return 0; +} + +pid_t sys_clone_unified(unsigned long flags, void *child_stack, void *parent_tid, + void *child_tid, unsigned long newtls) +{ +#ifdef __x86_64__ + return (pid_t)syscall(__NR_clone, flags, child_stack, parent_tid, child_tid, newtls); +#elif (__i386__ || __arm__ || __aarch64__ ||__powerpc64__) + return (pid_t)syscall(__NR_clone, flags, child_stack, parent_tid, newtls, child_tid); +#elif __s390x__ + return (pid_t)syscall(__NR_clone, child_stack, flags, parent_tid, child_tid, newtls); +#else +#error "Unsupported architecture" +#endif +} diff --git a/CRIU_code/test/zdtm/lib/zdtmtst.h b/CRIU_code/test/zdtm/lib/zdtmtst.h new file mode 100644 index 0000000..1fbf795 --- /dev/null +++ b/CRIU_code/test/zdtm/lib/zdtmtst.h @@ -0,0 +1,171 @@ +#ifndef _VIMITESU_H_ +#define 
_VIMITESU_H_ + +#include +#include +#include +#include + +#define INPROGRESS ".inprogress" + +#ifndef PAGE_SIZE +# define PAGE_SIZE (unsigned int)(sysconf(_SC_PAGESIZE)) +#endif + +#ifndef PR_SET_CHILD_SUBREAPER +# define PR_SET_CHILD_SUBREAPER 36 +#endif + +/* set up test */ +extern void test_ext_init(int argc, char **argv); +extern void test_init(int argc, char **argv); + +#ifndef CLONE_NEWUTS +#define CLONE_NEWUTS 0x04000000 +#endif + +#ifndef CLONE_NEWIPC +#define CLONE_NEWIPC 0x08000000 +#endif + +#define TEST_MSG_BUFFER_SIZE 2048 +/*wrapper for fork: init log offset*/ +#define test_fork() test_fork_id(-1) +extern int test_fork_id(int id); +/* finish setting up the test, write out pid file, and go to background */ +extern void test_daemon(void); +/* store a message to a static buffer */ +extern void test_msg(const char *format, ...) + __attribute__ ((__format__ (__printf__, 1, 2))); +/* tell if SIGTERM hasn't been received yet */ +extern int test_go(void); +/* sleep until SIGTERM is delivered */ +extern void test_waitsig(void); +/* sleep until zdtm notifies about predump */ +extern int test_wait_pre_dump(void); +/* notify zdtm that we finished action after predump */ +extern int test_wait_pre_dump_ack(void); + +#include + +/* generate data with crc32 at the end of the buffer */ +extern void datagen(uint8_t *buffer, unsigned length, uint32_t *crc); +/* generate data without crc32 at the end of the buffer */ +extern void datagen2(uint8_t *buffer, unsigned length, uint32_t *crc); +/* check the data buffer against its crc32 */ +extern int datachk(const uint8_t *buffer, unsigned length, uint32_t *crc); +/* calculate crc for the data buffer*/ +extern int datasum(const uint8_t *buffer, unsigned length, uint32_t *crc); + +/* streaming helpers */ +extern int set_nonblock(int fd, int on); +extern int pipe_in2out(int infd, int outfd, uint8_t *buffer, int length); +extern int read_data(int fd, unsigned char *buf, int len); +extern int write_data(int fd, const unsigned 
/* command line args */
/*
 * Descriptor of one test option.  TEST_OPTION() fills the first six fields
 * positionally (#name, #type, doc, is_required, parse_opt_<type>, &name) and
 * hands the descriptor to __push_opt().
 */
struct long_opt {
	const char *name;	/* option name: the stringified variable name */
	const char *type;	/* stringified type tag (bool, int, string, ...) */
	const char *doc;	/* human-readable description */
	int is_required;	/* non-zero when the option is declared as required */

	/* converter for this option's type (one of the parse_opt_<type> functions) */
	int (*parse_opt)(char *arg, void *value);
	void *value;		/* address of the variable backing the option */
	struct long_opt *next;	/* link to another descriptor; presumably maintained by __push_opt() - TODO confirm */
};
/*
 * snprintf() into the array `s` and abort() unless the whole result fit.
 * The size comes from sizeof(s), so `s` has to be a real array, not a
 * pointer.  Evaluates to the number of characters written.
 */
#define ssprintf(s, fmt, ...) ({					\
	const int ___len = snprintf(s, sizeof(s), fmt, ##__VA_ARGS__);	\
									\
	/* a negative result turns into a huge size_t and aborts too */	\
	if ((size_t)___len >= sizeof(s))				\
		abort();						\
	___len;								\
})
socket-tcp6-last-ack \ + socket-tcp4v6-last-ack \ + socket-tcp-closing \ + socket-tcp6-closing \ + socket-tcp4v6-closing \ + socket-tcp-closed \ + socket-tcp-closed-last-ack \ + socket-tcp6-closed \ + socket-tcp4v6-closed \ + socket-tcp-close0 \ + socket-tcp-close1 \ + socket-tcp-unconn \ + socket-tcp6-unconn \ + socket-tcp-syn-sent \ + socket-tcp-skip-in-flight \ + sock_opts00 \ + sock_opts01 \ + sk-unix-unconn \ + ipc_namespace \ + selfexe00 \ + sem \ + maps01 \ + maps02 \ + maps04 \ + maps05 \ + mlock_setuid \ + xids00 \ + groups \ + pdeath_sig \ + file_fown \ + proc-self \ + eventfs00 \ + epoll \ + signalfd00 \ + inotify_irmap \ + fanotify00 \ + uptime_grow \ + session00 \ + rlimits00 \ + pty00 \ + pty01 \ + pty-console \ + pty02 \ + pty03 \ + pty04 \ + tty00 \ + tty02 \ + tty03 \ + poll \ + mountpoints \ + netns \ + netns-dev \ + session01 \ + session02 \ + session03 \ + socket-ext \ + unhashed_proc \ + cow00 \ + child_opened_proc \ + posix_timers \ + sigpending \ + sigaltstack \ + sk-netlink \ + mem-touch \ + grow_map \ + grow_map02 \ + grow_map03 \ + tun \ + tun_ns \ + stopped \ + stopped01 \ + stopped02 \ + stopped12 \ + rtc \ + clean_mntns \ + mntns_rw_ro_rw \ + dumpable01 \ + dumpable02 \ + remap_dead_pid \ + remap_dead_pid_root \ + scm00 \ + scm01 \ + scm02 \ + scm03 \ + scm04 \ + scm05 \ + scm06 \ + aio00 \ + aio01 \ + fd \ + fd01 \ + apparmor \ + seccomp_strict \ + seccomp_filter \ + seccomp_filter_tsync \ + seccomp_filter_threads \ + seccomp_filter_inheritance \ + different_creds \ + vsx \ + bridge \ + vfork00 \ + oom_score_adj \ + loginuid \ + cgroupns \ + helper_zombie_child \ + clone_fs \ + macvlan \ + sit \ + cr_veth \ + sock_peercred \ + s390x_mmap_high \ + uffd-events \ + thread_different_uid_gid \ + pipe03 \ + netns_sub \ + netns_sub_veth \ + unlink_multiple_largefiles \ + config_inotify_irmap \ + thp_disable \ + pid_file \ + selinux00 \ + selinux01 \ + selinux02 \ +# jobctl00 \ + +ifneq ($(SRCARCH),arm) +ifneq ($(COMPAT_TEST),y) + TST_NOFILE 
+= maps03 +endif +endif + +ifeq ($(SRCARCH),s390) + TST_NOFILE += s390x_regs_check \ + s390x_gs_threads \ + s390x_runtime_instr +endif + +TST_FILE = \ + maps06 \ + write_read00 \ + write_read01 \ + write_read02 \ + write_read10 \ + maps00 \ + link10 \ + file_attr \ + deleted_unix_sock \ + sk-unix-rel \ + deleted_dev \ + unlink_fstat00 \ + unlink_fstat01 \ + unlink_fstat01+ \ + unlink_fstat02 \ + unlink_fstat03 \ + ghost_holes00 \ + ghost_holes01 \ + ghost_holes02 \ + unlink_largefile \ + mtime_mmap \ + fifo \ + fifo-ghost \ + fifo_ro \ + fifo_wronly \ + console \ + vt \ + unlink_fifo \ + unlink_fifo_wronly \ + unlink_mmap00 \ + unlink_mmap01 \ + unlink_mmap02 \ + file_shared \ + file_append \ + cow01 \ + fdt_shared \ + sockets00 \ + sockets03 \ + sockets_dgram \ + file_lease00 \ + file_lease01 \ + file_lease02 \ + file_lease03 \ + file_lease04 \ + file_locks00 \ + file_locks01 \ + file_locks02 \ + file_locks03 \ + file_locks04 \ + file_locks05 \ + file_locks06 \ + file_locks07 \ + file_locks08 \ + netns-nf \ + maps_file_prot \ + socket_close_data01 \ + +TST_DIR = \ + cwd00 \ + cwd01 \ + cwd02 \ + overmount_dev \ + overmount_file \ + overmount_fifo \ + overmount_sock \ + tempfs \ + tempfs_overmounted \ + tempfs_overmounted01 \ + tempfs_ro \ + tempfs_ro02 \ + tempfs_subns \ + mnt_ro_bind \ + mount_paths \ + bind-mount \ + shared_mount_propagation \ + overmount_with_shared_parent \ + inotify00 \ + inotify01 \ + inotify02 \ + cgroup00 \ + rmdir_open \ + cgroup01 \ + cgroup02 \ + cgroup03 \ + cgroup04 \ + cgroup_ifpriomap \ + cgroup_stray \ + unlink_fstat04 \ + unlink_fstat041 \ + mntns_remap \ + mntns_open \ + mntns_link_remap \ + mntns_ghost \ + mntns_ghost01 \ + mntns_ro_root \ + mntns_link_ghost \ + mntns_shared_bind \ + mntns_shared_bind02 \ + mntns_shared_bind03 \ + mntns_root_bind \ + mntns_root_bind02 \ + mntns_overmount \ + mntns_shared_vs_private \ + mnt_ext_manual \ + mnt_ext_auto \ + mnt_ext_master \ + mnt_ext_dev \ + mnt_tracefs \ + mntns_deleted \ + 
unlink_regular00 \ + mnt_enablefs \ + autofs \ + del_standalone_un \ + sk-unix-mntns \ + sk-unix01 \ + unsupported_children_collision \ + shared_slave_mount_children \ + non_uniform_share_propagation \ + private_bind_propagation \ + ghost_on_rofs \ + overmounted_file \ + +TST_DIR_FILE = \ + chroot \ + chroot-file \ + binfmt_misc \ + +TST = \ + $(TST_NOFILE) \ + $(TST_FILE) \ + $(TST_DIR) \ + $(TST_DIR_FILE) \ + env00 \ + fifo-rowo-pair \ + umask00 \ + cmdlinenv00 \ + shm-unaligned \ + +TST_STATE = \ + conntracks \ + route_rules \ + +AUX_SRC = get_smaps_bits.c ofd_file_locks.c + +SRC = $(TST:%=%.c) $(AUX_SRC) +OBJ = $(SRC:%.c=%.o) +DEP = $(SRC:%.c=%.d) +PID = $(TST:%=%.pid) +OUT = $(TST:%=%.out) +STATE = $(TST_STATE:%=%.state) +STATE_OUT = $(TST_STATE:%=%.out) + +include ../Makefile.inc + +all: $(TST) criu-rtc.so +install: all +.PHONY: all install + +$(TST_NOFILE:%=%.pid): %.pid: % + $(/dev/null` 2>/dev/null || break; \ + sleep 1; \ + echo -n .; \ + i=`expr $$i + 1`; \ + done; \ + echo; \ + [ $$i -lt $(WAIT_TIME) ] + +wait_stop: + i=0; \ + while [ $$i -lt $(WAIT_TIME) ] ; do \ + kill -0 `awk '{print}' *.pid 2>/dev/null` 2>/dev/null || break; \ + sleep 1; \ + i=`expr $$i + 1`; \ + done + +$(TST): | $(LIB) + +aio00: LDLIBS += -laio +different_creds: LDLIBS += -lcap +file_locks06 file_locks07 file_locks08: ofd_file_locks.o +futex: CFLAGS += -pthread +futex: LDFLAGS += -pthread +futex-rl: CFLAGS += -pthread +futex-rl: LDFLAGS += -pthread +jobctl00: LDLIBS += -lutil +socket_listen: LDLIBS += -lrt -pthread +socket_aio: LDLIBS += -lrt -pthread +uptime_grow: LDLIBS += -lrt -pthread +unlink_largefile: CFLAGS += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE +inotify_system_nodel: CFLAGS += -DNODEL +pthread00: LDLIBS += -pthread +pthread01: LDLIBS += -pthread +pthread02: LDLIBS += -pthread +different_creds: LDLIBS += -pthread +sigpending: LDLIBS += -pthread +sigaltstack: LDLIBS += -pthread +seccomp_filter_tsync: LDLIBS += -pthread +seccomp_filter_threads: LDLIBS += -pthread 
socket-tcp6-closed:	CFLAGS += -D ZDTM_IPV6
# The v4-mapped variant needs its own define; it used to be attached to
# socket-tcp6-closed a second time, leaving socket-tcp4v6-closed built
# without any address-family flag.
socket-tcp4v6-closed:	CFLAGS += -D ZDTM_IPV4V6
explicitly specify both .o and .d for this case: +netns_sub_veth.o netns_sub_veth.d: CPPFLAGS += -I/usr/include/libnl3 +netns_sub_veth: LDLIBS += -lnl-3 -l nl-route-3 + +socket-tcp-fin-wait1: CFLAGS += -D ZDTM_TCP_FIN_WAIT1 +socket-tcp-fin-wait2: CFLAGS += -D ZDTM_TCP_FIN_WAIT2 +socket-tcp6-fin-wait1: CFLAGS += -D ZDTM_TCP_FIN_WAIT1 -D ZDTM_IPV6 +socket-tcp6-fin-wait2: CFLAGS += -D ZDTM_TCP_FIN_WAIT2 -D ZDTM_IPV6 +socket-tcp4v6-fin-wait1: CFLAGS += -D ZDTM_TCP_FIN_WAIT1 -D ZDTM_IPV4V6 +socket-tcp4v6-fin-wait2: CFLAGS += -D ZDTM_TCP_FIN_WAIT2 -D ZDTM_IPV4V6 +socket-tcp-close-wait: CFLAGS += -D ZDTM_TCP_CLOSE_WAIT +socket-tcp6-close-wait: CFLAGS += -D ZDTM_TCP_CLOSE_WAIT -D ZDTM_IPV6 +socket-tcp4v6-close-wait: CFLAGS += -D ZDTM_TCP_CLOSE_WAIT -D ZDTM_IPV4V6 +socket-tcp-last-ack: CFLAGS += -D ZDTM_TCP_LAST_ACK +socket-tcp6-last-ack: CFLAGS += -D ZDTM_TCP_LAST_ACK -D ZDTM_IPV6 +socket-tcp6-closing: CFLAGS += -D ZDTM_IPV6 +socket-tcp6-unconn: CFLAGS += -D ZDTM_IPV6 +socket-tcp4v6-last-ack: CFLAGS += -D ZDTM_TCP_LAST_ACK -D ZDTM_IPV4V6 +socket-tcp4v6-closing: CFLAGS += -D ZDTM_IPV4V6 + +pty-console: CFLAGS += -D ZDTM_DEV_CONSOLE + +shm-unaligned: CFLAGS += -DZDTM_SHM_UNALIGNED + +s390x_regs_check: LDFLAGS += -pthread +s390x_gs_threads: LDFLAGS += -pthread + +thread_different_uid_gid: LDLIBS += -pthread -lcap + +$(LIB): force + $(Q) $(MAKE) -C $(LIBDIR) + +clean-more: + $(RM) criu-rtc.so criu-rtc.pb-c.c criu-rtc.pb-c.h +.PHONY: clean-more +clean: clean-more + +rtc.c: criu-rtc.so + +criu-rtc.pb-c.c: criu-rtc.proto + $(Q)echo $@ >> .gitignore + $(Q)echo $(@:%.c=%.h) >> .gitignore + $(E) " PBCC " $@ + $(Q)protoc-c --proto_path=. --c_out=. 
criu-rtc.proto + +criu-rtc.so: criu-rtc.c criu-rtc.pb-c.c + $(E) " LD " $@ + $(Q)$(CC) -g -Wall -shared -nostartfiles criu-rtc.c criu-rtc.pb-c.c -o criu-rtc.so -iquote ../../../criu/include -fPIC $(filter-out -m32,$(USERCFLAGS)) + +.PHONY: force start check_start stop wait_stop diff --git a/CRIU_code/test/zdtm/static/aio00.c b/CRIU_code/test/zdtm/static/aio00.c new file mode 100644 index 0000000..93ca7aa --- /dev/null +++ b/CRIU_code/test/zdtm/static/aio00.c @@ -0,0 +1,36 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that plain io_setup works"; +const char *test_author = "Pavel Emelianov "; + +int main(int argc, char **argv) +{ + int ret; + io_context_t ctx = 0; + + test_init(argc, argv); + + if (io_setup(1, &ctx) < 0) { + pr_perror("Can't setup io ctx"); + return 1; + } + + test_daemon(); + test_waitsig(); + + ret = io_getevents(ctx, 0, 1, NULL, NULL); + if (ret != 0) { + if (ret < 0) + fail("IO ctx lost (%d)", ret); + else + fail("IO ctx screwed up (%d)", ret); + } else + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/aio00.desc b/CRIU_code/test/zdtm/static/aio00.desc new file mode 100644 index 0000000..fcaf84a --- /dev/null +++ b/CRIU_code/test/zdtm/static/aio00.desc @@ -0,0 +1 @@ +{'feature': 'aio_remap', 'flags': 'nouser'} diff --git a/CRIU_code/test/zdtm/static/aio01.c b/CRIU_code/test/zdtm/static/aio01.c new file mode 100644 index 0000000..fda7572 --- /dev/null +++ b/CRIU_code/test/zdtm/static/aio01.c @@ -0,0 +1,114 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check head and tail restore correct"; +const char *test_author = "Kirill Tkhai "; + +struct aio_ring { + unsigned id; /* kernel internal index number */ + unsigned nr; /* number of io_events */ + unsigned head; /* Written to by userland or under ring_lock + * mutex by aio_read_events_ring(). 
*/ + unsigned tail; + unsigned magic; + unsigned compat_features; + unsigned incompat_features; + unsigned header_length; /* size of aio_ring */ + struct io_event io_events[0]; +}; /* 128 bytes + ring size */ + +int main(int argc, char **argv) +{ + struct iocb iocb, *iocbp = &iocb; + volatile struct aio_ring *ring; + aio_context_t ctx = 0; + struct io_event event; + unsigned tail[2], head[2]; + unsigned nr[2]; + int i, fd, ret; + char buf[1]; + + test_init(argc, argv); + + memset(&iocb, 0, sizeof(iocb)); + + if (syscall(__NR_io_setup, 64, &ctx) < 0) { + pr_perror("Can't setup io ctx"); + return 1; + } + + fd = open("/dev/null", O_WRONLY); + if (fd < 0) { + pr_perror("Can't open /dev/null"); + return 1; + } + + iocb.aio_fildes = fd; + iocb.aio_buf = (unsigned long)buf; + iocb.aio_nbytes = 1; + iocb.aio_lio_opcode = IOCB_CMD_PWRITE; + + ring = (struct aio_ring *)ctx; + nr[0] = ring->nr; + + for (i = 0; i < nr[0] + 2; i++) { + if (syscall(__NR_io_submit, ctx, 1, &iocbp) != 1) { + fail("Can't submit aio"); + return 1; + } + + if (!(i % 2)) + continue; + + ret = syscall(__NR_io_getevents, ctx, 0, 1, &event, NULL); + if (ret != 1) { + fail("Can't get event"); + return 1; + } + } + + tail[0] = *((volatile unsigned *)&ring->tail); + head[0] = *((volatile unsigned *)&ring->head); + + test_msg("tail=%u, head=%u, nr=%u\n", tail[0], head[0], nr[0]); + + test_daemon(); + test_waitsig(); + + tail[1] = *((volatile unsigned *)&ring->tail); + head[1] = *((volatile unsigned *)&ring->head); + nr[1] = *((volatile unsigned *)&ring->nr); + + test_msg("tail=%u, head=%u, nr=%u\n", tail[1], head[1], nr[1]); + + if (tail[0] != tail[1] || head[0] != head[1] || nr[0] != nr[1]) { + fail("mismatch"); + return 1; + } + + if (syscall(__NR_io_submit, ctx, 1, &iocbp) != 1) { + fail("Can't submit aio"); + return 1; + } + + tail[1] = *((volatile unsigned *)&ring->tail); + head[1] = *((volatile unsigned *)&ring->head); + nr[1] = *((volatile unsigned *)&ring->nr); + + test_msg("tail=%u, head=%u, 
nr=%u\n", tail[1], head[1], nr[1]); + + if (tail[1] == tail[0] + 1 && head[1] == head[0] && nr[1] == nr[0]) + pass(); + else + fail("mismatch"); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/aio01.desc b/CRIU_code/test/zdtm/static/aio01.desc new file mode 100644 index 0000000..fcaf84a --- /dev/null +++ b/CRIU_code/test/zdtm/static/aio01.desc @@ -0,0 +1 @@ +{'feature': 'aio_remap', 'flags': 'nouser'} diff --git a/CRIU_code/test/zdtm/static/apparmor.c b/CRIU_code/test/zdtm/static/apparmor.c new file mode 100644 index 0000000..15930c7 --- /dev/null +++ b/CRIU_code/test/zdtm/static/apparmor.c @@ -0,0 +1,90 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "Check that an apparmor profile is restored"; +const char *test_author = "Tycho Andersen "; + +#define PROFILE "criu_test" + +int setprofile() +{ + char profile[1024]; + int fd, len; + + len = snprintf(profile, sizeof(profile), "changeprofile " PROFILE); + if (len < 0 || len >= sizeof(profile)) { + fail("bad sprintf\n"); + return -1; + } + + fd = open("/proc/self/attr/current", O_WRONLY); + if (fd < 0) { + fail("couldn't open fd\n"); + return -1; + } + + /* apparmor wants this in exactly one write, so we use write() here + * vs. fprintf Just To Be Sure */ + len = write(fd, profile, len); + close(fd); + + if (len < 0) { + fail("couldn't write profile\n"); + return -1; + } + + return 0; +} + +int checkprofile() +{ + FILE *f; + char path[PATH_MAX], profile[1024]; + int len; + + sprintf(path, "/proc/self/attr/current"); + + f = fopen(path, "r"); + if (!f) { + fail("couldn't open lsm current\n"); + return -1; + } + + len = fscanf(f, "%[^ \n]s", profile); + fclose(f); + if (len != 1) { + fail("wrong number of items scanned %d\n", len); + return -1; + } + + if (strcmp(profile, PROFILE) != 0) { + fail("bad profile .%s. 
expected .%s.\n", profile, PROFILE); + return -1; + } + + return 0; +} + +int main(int argc, char **argv) +{ + test_init(argc, argv); + + setprofile(); + + test_daemon(); + test_waitsig(); + + if (checkprofile() == 0) + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/apparmor.checkskip b/CRIU_code/test/zdtm/static/apparmor.checkskip new file mode 100644 index 0000000..99fa727 --- /dev/null +++ b/CRIU_code/test/zdtm/static/apparmor.checkskip @@ -0,0 +1,4 @@ +#!/bin/bash + +test -d /sys/kernel/security/apparmor || exit 1 +apparmor_parser -r `dirname $0`/apparmor.profile diff --git a/CRIU_code/test/zdtm/static/apparmor.desc b/CRIU_code/test/zdtm/static/apparmor.desc new file mode 100644 index 0000000..d969725 --- /dev/null +++ b/CRIU_code/test/zdtm/static/apparmor.desc @@ -0,0 +1 @@ +{'flavor': 'h ns', 'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/apparmor.profile b/CRIU_code/test/zdtm/static/apparmor.profile new file mode 100644 index 0000000..69b1b25 --- /dev/null +++ b/CRIU_code/test/zdtm/static/apparmor.profile @@ -0,0 +1,8 @@ +# vim:syntax=apparmor + +profile criu_test { + /** rwmlkix, + capability, + unix, + signal, +} diff --git a/CRIU_code/test/zdtm/static/arm-neon00.c b/CRIU_code/test/zdtm/static/arm-neon00.c new file mode 100644 index 0000000..96da16c --- /dev/null +++ b/CRIU_code/test/zdtm/static/arm-neon00.c @@ -0,0 +1,67 @@ +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Initialize VFP registers before a migration,\n" + "check the VFP state is the same after a restore."; +const char *test_author = "Alexander Karatshov "; + + +#ifdef __arm__ + +int main(int argc, char ** argv) +{ + srand(time(0)); + + int a = rand() % 100; + int b = rand() % 100; + int c = rand() % 100; + int y1 = a + b*c; + int y2; + + test_init(argc, argv); + + asm ( + ".fpu neon \n" + "vmov.32 d0[0], %0 \n" + "vmov.32 d1[0], %1 \n" + "vmov.32 d2[0], %2 \n" + ".fpu softvfp \n" + : : "r"(a), "r"(b), "r"(c) + ); + + test_msg("Preparing to 
wait...\n"); + + test_daemon(); + test_waitsig(); + + test_msg("Restored.\n"); + + asm ( + ".fpu neon \n" + "vmul.I32 d3, d1, d2 \n" + "vadd.I32 d4, d0, d3 \n" + "vmov.32 %0, d4[0] \n" + ".fpu softvfp \n" + : "=r"(y2) + ); + + if (y1 != y2) + fail("VFP restoration failed: result = %d, expected = %d (a = %d, b = %d, c = %d)\n", y2, y1, a, b, c); + else + pass(); + + return 0; +} + +#else + +int main(int argc, char *argv[]) +{ + test_init(argc, argv); + skip("This test is supposed to run on an ARM machine!"); + return 0; +} + +#endif diff --git a/CRIU_code/test/zdtm/static/arm-neon00.desc b/CRIU_code/test/zdtm/static/arm-neon00.desc new file mode 100644 index 0000000..95c58b4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/arm-neon00.desc @@ -0,0 +1 @@ +{'flags': 'noauto'} diff --git a/CRIU_code/test/zdtm/static/auto_dev-ioctl.h b/CRIU_code/test/zdtm/static/auto_dev-ioctl.h new file mode 100644 index 0000000..aeaeb3e --- /dev/null +++ b/CRIU_code/test/zdtm/static/auto_dev-ioctl.h @@ -0,0 +1,228 @@ +/* + * Copyright 2008 Red Hat, Inc. All rights reserved. + * Copyright 2008 Ian Kent + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + */ + +#ifndef _LINUX_AUTO_DEV_IOCTL_H +#define _LINUX_AUTO_DEV_IOCTL_H + +#include + +#ifdef __KERNEL__ +#include +#else +#include +#endif /* __KERNEL__ */ + +#define AUTOFS_DEVICE_NAME "autofs" + +#define AUTOFS_DEV_IOCTL_VERSION_MAJOR 1 +#define AUTOFS_DEV_IOCTL_VERSION_MINOR 0 + +#define AUTOFS_DEVID_LEN 16 + +#define AUTOFS_DEV_IOCTL_SIZE sizeof(struct autofs_dev_ioctl) + +/* + * An ioctl interface for autofs mount point control. 
+ */ + +struct args_protover { + __u32 version; +}; + +struct args_protosubver { + __u32 sub_version; +}; + +struct args_openmount { + __u32 devid; +}; + +struct args_ready { + __u32 token; +}; + +struct args_fail { + __u32 token; + __s32 status; +}; + +struct args_setpipefd { + __s32 pipefd; +}; + +struct args_timeout { + __u64 timeout; +}; + +struct args_requester { + __u32 uid; + __u32 gid; +}; + +struct args_expire { + __u32 how; +}; + +struct args_askumount { + __u32 may_umount; +}; + +struct args_ismountpoint { + union { + struct args_in { + __u32 type; + } in; + struct args_out { + __u32 devid; + __u32 magic; + } out; + }; +}; + +/* + * All the ioctls use this structure. + * When sending a path size must account for the total length + * of the chunk of memory otherwise is is the size of the + * structure. + */ + +struct autofs_dev_ioctl { + __u32 ver_major; + __u32 ver_minor; + __u32 size; /* total size of data passed in + * including this struct */ + __s32 ioctlfd; /* automount command fd */ + + /* Command parameters */ + + union { + struct args_protover protover; + struct args_protosubver protosubver; + struct args_openmount openmount; + struct args_ready ready; + struct args_fail fail; + struct args_setpipefd setpipefd; + struct args_timeout timeout; + struct args_requester requester; + struct args_expire expire; + struct args_askumount askumount; + struct args_ismountpoint ismountpoint; + }; + + char path[0]; +}; + +static inline void init_autofs_dev_ioctl(struct autofs_dev_ioctl *in) { + memset(in, 0, sizeof(struct autofs_dev_ioctl)); + in->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; + in->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; + in->size = sizeof(struct autofs_dev_ioctl); + in->ioctlfd = -1; + return; +} + +/* + * If you change this make sure you make the corresponding change + * to autofs-dev-ioctl.c:lookup_ioctl() + */ +enum { + /* Get various version info */ + AUTOFS_DEV_IOCTL_VERSION_CMD = 0x71, + AUTOFS_DEV_IOCTL_PROTOVER_CMD, + 
AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD, + + /* Open mount ioctl fd */ + AUTOFS_DEV_IOCTL_OPENMOUNT_CMD, + + /* Close mount ioctl fd */ + AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD, + + /* Mount/expire status returns */ + AUTOFS_DEV_IOCTL_READY_CMD, + AUTOFS_DEV_IOCTL_FAIL_CMD, + + /* Activate/deactivate autofs mount */ + AUTOFS_DEV_IOCTL_SETPIPEFD_CMD, + AUTOFS_DEV_IOCTL_CATATONIC_CMD, + + /* Expiry timeout */ + AUTOFS_DEV_IOCTL_TIMEOUT_CMD, + + /* Get mount last requesting uid and gid */ + AUTOFS_DEV_IOCTL_REQUESTER_CMD, + + /* Check for eligible expire candidates */ + AUTOFS_DEV_IOCTL_EXPIRE_CMD, + + /* Request busy status */ + AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD, + + /* Check if path is a mountpoint */ + AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD, +}; + +#define AUTOFS_IOCTL 0x93 + +#define AUTOFS_DEV_IOCTL_VERSION \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_VERSION_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_PROTOVER \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_PROTOVER_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_PROTOSUBVER \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_OPENMOUNT \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_OPENMOUNT_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_CLOSEMOUNT \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_READY \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_READY_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_FAIL \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_FAIL_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_SETPIPEFD \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_SETPIPEFD_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_CATATONIC \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_CATATONIC_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_TIMEOUT \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_TIMEOUT_CMD, struct autofs_dev_ioctl) + +#define 
AUTOFS_DEV_IOCTL_REQUESTER \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_REQUESTER_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_EXPIRE \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_EXPIRE_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_ASKUMOUNT \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD, struct autofs_dev_ioctl) + +#define AUTOFS_DEV_IOCTL_ISMOUNTPOINT \ + _IOWR(AUTOFS_IOCTL, \ + AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD, struct autofs_dev_ioctl) + +#endif /* _LINUX_AUTO_DEV_IOCTL_H */ diff --git a/CRIU_code/test/zdtm/static/autofs.c b/CRIU_code/test/zdtm/static/autofs.c new file mode 100644 index 0000000..4360f90 --- /dev/null +++ b/CRIU_code/test/zdtm/static/autofs.c @@ -0,0 +1,939 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "zdtmtst.h" +#include "auto_dev-ioctl.h" + +const char *test_doc = "Autofs (v5) migration test"; +const char *test_author = "Stanislav Kinsburskii "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +#define AUTOFS_DEV "/dev/autofs" + +#define INDIRECT_MNT_DIR "mnt" + +int autofs_dev; +task_waiter_t t; + +static char *xvstrcat(char *str, const char *fmt, va_list args) +{ + size_t offset = 0, delta; + int ret; + char *new; + va_list tmp; + + if (str) + offset = strlen(str); + delta = strlen(fmt) * 2; + + do { + ret = -ENOMEM; + new = realloc(str, offset + delta); + if (new) { + va_copy(tmp, args); + ret = vsnprintf(new + offset, delta, fmt, tmp); + va_end(tmp); + if (ret >= delta) { + /* NOTE: vsnprintf returns the amount of bytes + * * to allocate. */ + delta = ret +1; + str = new; + ret = 0; + } + } + } while (ret == 0); + + if (ret == -ENOMEM) { + /* realloc failed. 
We must release former string */ + pr_err("Failed to allocate string\n"); + free(str); + } else if (ret < 0) { + /* vsnprintf failed */ + pr_err("Failed to print string\n"); + free(new); + new = NULL; + } + return new; +} + +char *xstrcat(char *str, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + str = xvstrcat(str, fmt, args); + va_end(args); + + return str; +} + +char *xsprintf(const char *fmt, ...) +{ + va_list args; + char *str; + + va_start(args, fmt); + str = xvstrcat(NULL, fmt, args); + va_end(args); + + return str; +} + +struct autofs_params { + const char *mountpoint; + int (*create)(struct autofs_params *p); + int (*setup)(struct autofs_params *p); + int (*check)(struct autofs_params *p); + int (*reap)(struct autofs_params *p); + const unsigned type; + int fd; + struct stat fd_stat; + void (*onexit)(void); + const int close_pipe; + pid_t pid; +}; + +struct autofs_params *my_type; + +static int stop; + +static int setup_direct(struct autofs_params *p) +{ + char *path; + + path = xsprintf("%s/%s/direct_file", dirname, p->mountpoint); + if (!path) { + pr_err("failed to allocate path\n"); + return -ENOMEM; + } + p->fd = open(path, O_CREAT | O_EXCL, 0600); + if (p->fd < 0) { + pr_perror("%d: failed to open file %s", getpid(), path); + return -errno; + } + if (fstat(p->fd, &p->fd_stat)) { + pr_perror("%d: failed to stat %s", getpid(), path); + return -errno; + } + free(path); + return 0; +} + +static int setup_indirect(struct autofs_params *p) +{ + char *path; + + path = xsprintf("%s/%s/%s/indirect_file", dirname, p->mountpoint, INDIRECT_MNT_DIR); + if (!path) { + pr_err("failed to allocate path\n"); + return -ENOMEM; + } + p->fd = open(path, O_CREAT | O_EXCL, 0600); + if (p->fd < 0) { + pr_perror("%d: failed to open file %s", getpid(), path); + return -errno; + } + if (fstat(p->fd, &p->fd_stat)) { + pr_perror("%d: failed to stat %s", getpid(), path); + return -errno; + } + free(path); + return 0; +} + +static int umount_fs(const char 
*mountpoint, int magic) +{ + struct statfs buf; + + if (statfs(mountpoint, &buf)) { + pr_perror("%s: failed to statfs", mountpoint); + return -errno; + } + if (buf.f_type == magic) { + if (umount(mountpoint) < 0) { + pr_perror("failed to umount %s tmpfs", mountpoint); + return -errno; + } + } + return 0; +} + +static int check_fd(struct autofs_params *p) +{ + struct stat st; + int ret = 0; + + if (fstat(p->fd, &st)) { + pr_perror("failed to stat fd %d", p->fd); + return -errno; + } + + if (st.st_dev != p->fd_stat.st_dev) { + skip("%s: st_dev differs: %llu != %llu " + "(waiting for \"device namespaces\")", p->mountpoint, + (long long unsigned)st.st_dev, + (long long unsigned)p->fd_stat.st_dev); +// ret++; + } + if (st.st_mode != p->fd_stat.st_mode) { + pr_err("%s: st_mode differs: 0%o != 0%o\n", p->mountpoint, + st.st_mode, p->fd_stat.st_mode); + ret++; + } + if (st.st_nlink != p->fd_stat.st_nlink) { + pr_err("%s: st_nlink differs: %ld != %ld\n", p->mountpoint, + (long)st.st_nlink, (long)p->fd_stat.st_nlink); + ret++; + } + if (st.st_uid != p->fd_stat.st_uid) { + pr_err("%s: st_uid differs: %u != %u\n", p->mountpoint, + st.st_uid, p->fd_stat.st_uid); + ret++; + } + if (st.st_gid != p->fd_stat.st_gid) { + pr_err("%s: st_gid differs: %u != %u\n", p->mountpoint, + st.st_gid, p->fd_stat.st_gid); + ret++; + } + if (st.st_rdev != p->fd_stat.st_rdev) { + pr_err("%s: st_rdev differs: %lld != %lld\n", p->mountpoint, + (long long)st.st_rdev, + (long long)p->fd_stat.st_rdev); + ret++; + } + if (st.st_size != p->fd_stat.st_size) { + pr_err("%s: st_size differs: %lld != %lld\n", p->mountpoint, + (long long)st.st_size, + (long long)p->fd_stat.st_size); + ret++; + } + if (st.st_blksize != p->fd_stat.st_blksize) { + pr_err("%s: st_blksize differs %lld != %lld:\n", p->mountpoint, + (long long)st.st_blksize, + (long long)p->fd_stat.st_blksize); + ret++; + } + if (st.st_blocks != p->fd_stat.st_blocks) { + pr_err("%s: st_blocks differs: %lld != %lld\n", p->mountpoint, + (long 
long)st.st_blocks, + (long long)p->fd_stat.st_blocks); + ret++; + } + + return ret; +} + +static int check_automount(struct autofs_params *p) +{ + int err; + char *mountpoint; + + err = check_fd(p); + if (err) { + pr_err("%s: opened file descriptor wasn't migrated properly\n", + p->mountpoint); + return err; + } + + if (p->type == AUTOFS_TYPE_DIRECT) + mountpoint = xsprintf("%s/%s", dirname, p->mountpoint); + else if (p->type == AUTOFS_TYPE_INDIRECT) + mountpoint = xsprintf("%s/%s/%s", dirname, p->mountpoint, INDIRECT_MNT_DIR); + else { + pr_err("Unknown autofs type: %d\n", p->type); + return -EINVAL; + } + if (!mountpoint) { + pr_err("failed to allocate string\n"); + return -ENOMEM; + } + + if (close(p->fd)) { + pr_err("%s: failed to close fd %d\n", p->mountpoint, p->fd); + return -errno; + } + + err = umount_fs(mountpoint, TMPFS_MAGIC); + if (err) + return err; + + free(mountpoint); /* must not be referenced past this point */ + + err = p->setup(p); + if (err) { + pr_err("autofs doesn't workafter restore\n"); + return err; + } + + if (close(p->fd)) { + pr_perror("%s: failed to close fd %d", p->mountpoint, + p->fd); + return -errno; + } + + return 0; +} + +static int autofs_dev_open(void) +{ + int fd; + + if (access(AUTOFS_DEV, R_OK | W_OK)) { + pr_perror("Device /dev/autofs is not accessible"); + return -1; + } + + fd = open(AUTOFS_DEV, O_RDONLY); + if (fd == -1) { + pr_perror("failed to open /dev/autofs"); + return -errno; + } + return fd; +} + +static int autofs_open_mount(int devid, const char *mountpoint) +{ + struct autofs_dev_ioctl *param; + size_t size; + int ret; + + size = sizeof(struct autofs_dev_ioctl) + strlen(mountpoint) + 1; + param = malloc(size); + + init_autofs_dev_ioctl(param); + param->size = size; + param->ioctlfd = -1; + param->openmount.devid = devid; + strcpy(param->path, mountpoint); + + if (ioctl(autofs_dev, AUTOFS_DEV_IOCTL_OPENMOUNT, param) < 0) { + pr_perror("failed to open autofs mount %s", mountpoint); + ret = -errno; + goto out; + } + + ret = param->ioctlfd; +out: + free(param);
+ return ret; +} + +static int autofs_report_result(int token, int devid, const char *mountpoint, + int result) +{ + int ioctl_fd; + struct autofs_dev_ioctl param; + int err; + + ioctl_fd = autofs_open_mount(devid, mountpoint); + if (ioctl_fd < 0) { + pr_err("failed to open autofs mountpoint %s\n", mountpoint); + return ioctl_fd; + } + + init_autofs_dev_ioctl(&param); + param.ioctlfd = ioctl_fd; + + if (result) { + param.fail.token = token; + param.fail.status = result; + } else + param.ready.token = token; + + err = ioctl(autofs_dev, result ? AUTOFS_DEV_IOCTL_FAIL : AUTOFS_DEV_IOCTL_READY, &param); + if (err) { + pr_perror("failed to report result to autofs mountpoint %s", mountpoint); + err = -errno; + } + close(ioctl_fd); + return err; +} + +static int mount_tmpfs(const char *mountpoint) +{ + struct statfs buf; + + if (statfs(mountpoint, &buf)) { + pr_perror("failed to statfs %s", mountpoint); + return -errno; + } + if (buf.f_type == TMPFS_MAGIC) + return 0; + + if (mount("autofs_test", mountpoint, "tmpfs", 0, "size=1M") < 0) { + pr_perror("failed to mount tmpfs to %s", + mountpoint); + return -errno; + } + return 0; +} + +static int autofs_mount_direct(const char *mountpoint, + const struct autofs_v5_packet *packet) +{ + int err; + const char *direct_mnt = mountpoint; + + err = mount_tmpfs(direct_mnt); + if (err) + pr_err("%d: failed to mount direct autofs mountpoint\n", + getpid()); + return err; +} + +static int autofs_mount_indirect(const char *mountpoint, + const struct autofs_v5_packet *packet) +{ + char *indirect_mnt; + int err; + + indirect_mnt = xsprintf("%s/%s", mountpoint, packet->name); + if (!indirect_mnt) { + pr_err("failed to allocate indirect mount path\n"); + return -ENOMEM; + } + + if ((mkdir(indirect_mnt, 0755) < 0) && (errno != EEXIST)) { + pr_perror("failed to create %s directory", indirect_mnt); + return -errno; + } + + err = mount_tmpfs(indirect_mnt); + if (err) + pr_err("%d: failed to mount indirect autofs mountpoint\n", + getpid()); + return
err; + +} + +static int automountd_serve(const char *mountpoint, struct autofs_params *p, + const union autofs_v5_packet_union *packet) +{ + const struct autofs_v5_packet *v5_packet = &packet->v5_packet; + int err, res; + + switch (packet->hdr.type) { + case autofs_ptype_missing_indirect: + res = autofs_mount_indirect(mountpoint, v5_packet); + break; + case autofs_ptype_missing_direct: + res = autofs_mount_direct(mountpoint, v5_packet); + break; + case autofs_ptype_expire_indirect: + pr_err("%d: expire request for indirect mount %s?", + getpid(), v5_packet->name); + return -EINVAL; + case autofs_ptype_expire_direct: + pr_err("%d: expire request for direct mount?", + getpid()); + return -EINVAL; + default: + pr_err("unknown request type: %d\n", packet->hdr.type); + return -EINVAL; + } + + err = autofs_report_result(v5_packet->wait_queue_token, v5_packet->dev, + mountpoint, res); + if (err) + return err; + return res; +} + +static int automountd_loop(int pipe, const char *mountpoint, struct autofs_params *param) +{ + union autofs_v5_packet_union *packet; + ssize_t bytes; + size_t psize = sizeof(*packet); + int err = 0; + + packet = malloc(psize); + if (!packet) { + pr_err("failed to allocate autofs packet\n"); + return -ENOMEM; + } + + /* Allow SIGUSR2 to interrupt system call */ + siginterrupt(SIGUSR2, 1); + + while (!stop && !err) { + memset(packet, 0, psize); + + bytes = read(pipe, packet, psize); + if (bytes < 0) { + if (errno != EINTR) { + pr_perror("failed to read packet"); + return -errno; + } + continue; + } + if (bytes != psize) { + pr_err("read less than expected: %zd < %zd\n", + bytes, psize); + return -EINVAL; + } + err = automountd_serve(mountpoint, param, packet); + if (err) + pr_err("request to autofs failed: %d\n", err); + } + return err; +} + +static int automountd(struct autofs_params *p, int control_fd) +{ + int pipes[2]; + char *autofs_path; + char *options; + int ret = -1; + char *type; + + my_type = p; + + if (p->onexit) + atexit(p->onexit); + + 
autofs_path = xsprintf("%s/%s", dirname, p->mountpoint); + if (!autofs_path) { + pr_err("failed to allocate autofs path"); + goto err; + } + + if (pipe(pipes) < 0) { + pr_perror("%d: failed to create pipe", getpid()); + goto err; + } + + if (setpgrp() < 0) { + pr_perror("failed to become a process group leader"); + goto err; + } + + switch (p->type) { + case AUTOFS_TYPE_DIRECT: + type = "direct"; + break; + case AUTOFS_TYPE_INDIRECT: + type = "indirect"; + break; + case AUTOFS_TYPE_OFFSET: + type = "offset"; + break; + default: + pr_err("unknown autofs type: %d\n", p->type); + return -EINVAL; + } + + options = xsprintf("fd=%d,pgrp=%d,minproto=5,maxproto=5,%s", + pipes[1], getpgrp(), type); + if (!options) { + pr_err("failed to allocate autofs options\n"); + goto err; + } + + if (mkdir(autofs_path, 0600) < 0) { + pr_perror("failed to create %s", autofs_path); + test_msg("cwd: %s\n", get_current_dir_name()); + goto err; + } + + if (mount("autofs_test", autofs_path, "autofs", 0, options) < 0) { + pr_perror("failed to mount autofs with options \"%s\"", + options); + goto err; + } + + if (p->close_pipe) + close(pipes[1]); + + ret = 0; + if (write(control_fd, &ret, sizeof(ret)) != sizeof(ret)) { + pr_perror("failed to send result"); + goto err; + } + close(control_fd); + task_waiter_complete(&t, getpid()); + return automountd_loop(pipes[0], autofs_path, p); + +err: /* send the status code to the parent reading the control pipe */ + if (write(control_fd, &ret, sizeof(ret)) != sizeof(ret)) { + pr_perror("failed to send result"); + return -errno; + } + return ret; +} + +static int start_automounter(struct autofs_params *p) +{ + int pid; + int control_fd[2]; + ssize_t bytes; + int ret; + + if (pipe(control_fd) < 0) { + pr_perror("failed to create control_fd pipe"); + return -errno; + } + + pid = test_fork(); + switch (pid) { + case -1: + pr_perror("failed to fork"); + return -1; + case 0: + close(control_fd[0]); + exit(automountd(p, control_fd[1])); + } + task_waiter_wait4(&t, pid); + p->pid = pid; + + close(control_fd[1]); + bytes =
read(control_fd[0], &ret, sizeof(ret)); + close(control_fd[0]); + + if (bytes < 0) { + pr_perror("failed to get start result"); + return -errno; + } + if (bytes != sizeof(ret)) { + pr_err("received less than expected: %zu. Child %d died?\n", + bytes, p->pid); + return -EINVAL; + } + return ret; +} + +static void do_stop(int sig) +{ + stop = 1; +} + +static int reap_child(struct autofs_params *p) +{ + int status; + int pid = p->pid; + + if (kill(pid, SIGUSR2)) { + pr_perror("failed to kill child %d", pid); + return -errno; + } + + if (waitpid(pid, &status, 0) == -1) { + pr_perror("failed to collect child %d", pid); + return -errno; + } + + if (WIFSIGNALED(status)) { + pr_err("Child was killed by %d\n", WTERMSIG(status)); + return -1; + } + + return WEXITSTATUS(status); +} + +static int reap_catatonic(struct autofs_params *p) +{ + char *mountpoint; + int err; + + mountpoint = xsprintf("%s/%s", dirname, p->mountpoint); + if (!mountpoint) { + pr_err("failed to allocate string\n"); + return -ENOMEM; + } + + err = umount_fs(mountpoint, AUTOFS_SUPER_MAGIC); + if (!err) { + if (rmdir(mountpoint) < 0) { + skip("failed to remove %s directory: %s\n", mountpoint, + strerror(errno)); + err = -errno; + } + } + return err; +} + +static int setup_catatonic(struct autofs_params *p) +{ + char *path; + + path = xsprintf("%s/%s/file", dirname, p->mountpoint); + if (!path) { + pr_err("failed to allocate path\n"); + return -ENOMEM; + } + + p->fd = open(path, O_CREAT | O_EXCL, 0600); + if (p->fd >= 0) { + pr_perror("%d: was able to open file %s on catatonic mount", getpid(), path); + return -EINVAL; + } + free(path); + return 0; +} + +static int check_catatonic(struct autofs_params *p) +{ + char *mountpoint; + struct statfs buf; + + mountpoint = xsprintf("%s/%s", dirname, p->mountpoint); + if (!mountpoint) { + pr_err("failed to allocate path\n"); + return -ENOMEM; + } + + if (statfs(mountpoint, &buf)) { + pr_perror("%s: failed to statfs", mountpoint); + return -errno; + } + if 
(buf.f_type != AUTOFS_SUPER_MAGIC) { + pr_err("Non-autofs mount on path %s\n", mountpoint); + return -EINVAL; + } + + return setup_catatonic(p); +} + +static int create_catatonic(struct autofs_params *p) +{ + int err; + int status; + + err = start_automounter(p); + if (err) + return err; + + if (kill(p->pid, SIGKILL)) { + pr_perror("failed to kill child %d", p->pid); + return -errno; + } + + if (waitpid(p->pid, &status, 0) == -1) { + pr_perror("failed to collect child %d", p->pid); + return -errno; + } + + return 0; +} + +static void test_exit(void) +{ + if (rmdir(dirname) < 0) + skip("failed to remove %s directory: %s\n", dirname, + strerror(errno)); +} + +typedef enum { + AUTOFS_START, + AUTOFS_SETUP, + AUTOFS_CHECK, + AUTOFS_STOP +} autfs_test_action; + +static int test_action(autfs_test_action act, struct autofs_params *p) +{ + int ret = 0; + + while(p->mountpoint) { + int (*action)(struct autofs_params *p); + + switch (act) { + case AUTOFS_START: + action = p->create; + break; + case AUTOFS_SETUP: + action = p->setup; + break; + case AUTOFS_CHECK: + action = p->check; + break; + case AUTOFS_STOP: + action = p->reap; + break; + default: + pr_err("unknown action: %d\n", act); + return -1; + } + + if (action && action(p)) + ret++; + + p++; + } + return ret; +} + +static void direct_exit(void) +{ + struct autofs_params *p = my_type; + char *mountpoint; + + mountpoint = xsprintf("%s/%s", dirname, p->mountpoint); + if (!mountpoint) { + pr_err("failed to allocate string\n"); + return; + } + + if (umount_fs(mountpoint, TMPFS_MAGIC)) + return; + if (umount_fs(mountpoint, AUTOFS_SUPER_MAGIC)) + return; + + if (rmdir(mountpoint) < 0) + skip("failed to remove %s directory: %s\n", mountpoint, + strerror(errno)); +} + +static void indirect_exit(void) +{ + struct autofs_params *p = my_type; + char *mountpoint, *tmpfs; + + mountpoint = xsprintf("%s/%s", dirname, p->mountpoint); + if (!mountpoint) { + pr_err("failed to allocate string\n"); + return; + } + + tmpfs = 
xsprintf("%s/%s/%s", dirname, p->mountpoint, INDIRECT_MNT_DIR); + if (!tmpfs) { + pr_err("failed to allocate string\n"); + return; + } + + if (!access(tmpfs, F_OK)) { + if (umount_fs(tmpfs, TMPFS_MAGIC)) + return; + } + if (umount_fs(mountpoint, AUTOFS_SUPER_MAGIC)) + return; + + if (rmdir(mountpoint) < 0) + skip("failed to remove %s directory: %s\n", mountpoint, + strerror(errno)); +} + +enum autofs_tests { + AUTOFS_DIRECT, + AUTOFS_INDIRECT, + AUTOFS_CATATONIC, +}; + +struct autofs_params autofs_types[] = { + [AUTOFS_DIRECT] = { + .mountpoint = "direct", + .create = start_automounter, + .setup = setup_direct, + .check = check_automount, + .reap = reap_child, + .type = AUTOFS_TYPE_DIRECT, + .fd = -1, + .onexit = direct_exit, + .close_pipe = 1, + }, + [AUTOFS_INDIRECT] = { + .mountpoint = "indirect", + .create = start_automounter, + .setup = setup_indirect, + .check = check_automount, + .reap = reap_child, + .type = AUTOFS_TYPE_INDIRECT, + .fd = -1, + .onexit = indirect_exit, + .close_pipe = 0, + }, + [AUTOFS_CATATONIC] = { + .mountpoint = "catatonic", + .create = create_catatonic, + .setup = setup_catatonic, + .check = check_catatonic, + .reap = reap_catatonic, + .type = AUTOFS_TYPE_DIRECT, + .onexit = NULL, + .fd = -1, + .close_pipe = 1, + }, + { NULL, NULL, NULL, NULL } +}; + +int main(int argc, char **argv) +{ + int ret = 0; + + test_init(argc, argv); + + task_waiter_init(&t); + + if (mkdir(dirname, 0777) < 0) { + pr_perror("failed to create %s directory", dirname); + return -1; + } + + autofs_dev = autofs_dev_open(); + if (autofs_dev < 0) + return -1; + + if (signal(SIGUSR2, do_stop) == SIG_ERR) { + pr_perror("Failed to set SIGUSR2 handler"); + return -1; + } + + if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) { + pr_perror("Failed to set SIGPIPE handler"); + return -1; + } + + if (test_action(AUTOFS_START, autofs_types)) { + pr_err("AUTOFS_START action failed\n"); + ret++; + goto err; + } + + close(autofs_dev); + + atexit(test_exit); + + if 
(test_action(AUTOFS_SETUP, autofs_types)) { + pr_err("AUTOFS_SETUP action failed\n"); + ret++; + goto err; + } + + test_daemon(); + test_waitsig(); + + if (test_action(AUTOFS_CHECK, autofs_types)) { + pr_err("AUTOFS_CHECK action failed\n"); + ret++; + } +err: + if (test_action(AUTOFS_STOP, autofs_types)) { + pr_err("AUTOFS_STOP action failed\n"); + ret++; + } + + if (ret) { + fail(); + return ret; + } + + pass(); + return 0; +} + diff --git a/CRIU_code/test/zdtm/static/autofs.desc b/CRIU_code/test/zdtm/static/autofs.desc new file mode 100644 index 0000000..20859d3 --- /dev/null +++ b/CRIU_code/test/zdtm/static/autofs.desc @@ -0,0 +1 @@ +{'feature': 'autofs', 'flavor': 'ns', 'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/bind-mount.c b/CRIU_code/test/zdtm/static/bind-mount.c new file mode 100644 index 0000000..06c5cf1 --- /dev/null +++ b/CRIU_code/test/zdtm/static/bind-mount.c @@ -0,0 +1,62 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check bind-mounts"; +const char *test_author = "Pavel Emelianov "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + char test_dir[PATH_MAX], test_bind[PATH_MAX]; + char test_file[PATH_MAX], test_bind_file[PATH_MAX]; + int fd; + + test_init(argc, argv); + + mkdir(dirname, 0700); + + snprintf(test_dir, sizeof(test_dir), "%s/test", dirname); + snprintf(test_bind, sizeof(test_bind), "%s/bind", dirname); + snprintf(test_file, sizeof(test_file), "%s/test/test.file", dirname); + snprintf(test_bind_file, sizeof(test_bind_file), "%s/bind/test.file", dirname); + + mkdir(test_dir, 0700); + mkdir(test_bind, 0700); + + if (mount(test_dir, test_bind, NULL, MS_BIND, NULL)) { + pr_perror("Unable to mount %s to %s", test_dir, test_bind); + return 1; + } + + test_daemon(); + test_waitsig(); + + fd = open(test_file, O_CREAT | O_WRONLY | O_EXCL, 0600); + if (fd < 0) { + pr_perror("Unable to open %s", 
test_file); + return 1; + } + close(fd); + + if (access(test_bind_file, F_OK)) { + pr_perror("%s doesn't exist", test_bind_file); + return 1; + } + + if (umount(test_bind)) { + pr_perror("Unable to umount %s", test_bind); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/bind-mount.desc b/CRIU_code/test/zdtm/static/bind-mount.desc new file mode 100644 index 0000000..7657ba4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/bind-mount.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/binfmt_misc.c b/CRIU_code/test/zdtm/static/binfmt_misc.c new file mode 100644 index 0000000..72ab8c1 --- /dev/null +++ b/CRIU_code/test/zdtm/static/binfmt_misc.c @@ -0,0 +1,199 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that binfmt_misc entries remain registered"; +const char *test_author = "Kirill Tkhai $tname/${name}_magic +fi + +if [ -e $tname/${name}_extension ]; then + echo -1 > $tname/${name}_extension +fi +set -e + +umount "$tname" +rmdir "$tname" diff --git a/CRIU_code/test/zdtm/static/bridge.c b/CRIU_code/test/zdtm/static/bridge.c new file mode 100644 index 0000000..983c262 --- /dev/null +++ b/CRIU_code/test/zdtm/static/bridge.c @@ -0,0 +1,113 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "check that empty bridges are c/r'd correctly"; +const char *test_author = "Tycho Andersen "; + +#define BRIDGE_NAME "zdtmbr0" + +int add_bridge(void) +{ + if (system("ip link add " BRIDGE_NAME " type bridge")) + return -1; + + if (system("ip addr add 10.0.55.55/32 dev " BRIDGE_NAME)) + return -1; + + /* use a link local address so we can test scope_id change */ + if (system("ip addr add fe80:4567::1/64 nodad dev " BRIDGE_NAME)) + return -1; + + if (system("ip link set " BRIDGE_NAME " up")) + return 
-1; + + return 0; +} + +int del_bridge(void) +{ + /* don't check for errors, let's try to make sure it's deleted */ + system("ip link set " BRIDGE_NAME " down"); + + if (system("ip link del " BRIDGE_NAME)) + return -1; + + return 0; +} + +int main(int argc, char **argv) +{ + int ret = 1; + struct sockaddr_in6 addr; + int sk; + + test_init(argc, argv); + + if (add_bridge() < 0) + return 1; + + sk = socket(AF_INET6, SOCK_DGRAM, 0); + if (sk < 0) { + fail("can't get socket"); + goto out; + } + + memset(&addr, 0, sizeof(addr)); + addr.sin6_port = htons(0); + addr.sin6_family = AF_INET6; + if (inet_pton(AF_INET6, "fe80:4567::1", &addr.sin6_addr) < 0) { + fail("can't convert inet6 addr"); + goto out; + } + addr.sin6_scope_id = if_nametoindex(BRIDGE_NAME); + + if (bind(sk, (struct sockaddr*)&addr, sizeof(addr)) < 0) { + fail("can't bind"); + goto out; + } + + /* Here, we grep for inet because some of the IPV6 DAD stuff can be + * racy, and all we really care about is that the bridge got restored + * with the right MAC, since we know DAD will succeed eventually. + * + * (I got this race with zdtm.py, but not with zdtm.sh; not quite sure + * what the environment difference is/was.) 
+ */ + if (system("ip addr list dev " BRIDGE_NAME " | grep inet | sort > bridge.dump.test")) { + pr_perror("can't save net config"); + fail("Can't save net config"); + goto out; + } + + test_daemon(); + test_waitsig(); + + if (system("ip addr list dev " BRIDGE_NAME " | grep inet | sort > bridge.rst.test")) { + fail("Can't get net config"); + goto out; + } + + if (system("diff bridge.rst.test bridge.dump.test")) { + fail("Net config differs after restore"); + goto out; + } + + pass(); + + ret = 0; + +out: + del_bridge(); + return ret; +} diff --git a/CRIU_code/test/zdtm/static/bridge.desc b/CRIU_code/test/zdtm/static/bridge.desc new file mode 100644 index 0000000..a6110d6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/bridge.desc @@ -0,0 +1,7 @@ +{ 'deps': [ '/bin/sh', + '/usr/bin/sort', + '/bin/grep', + '/sbin/ip|/bin/ip', + '/usr/bin/diff'], + 'flags': 'suid', + 'flavor': 'ns uns'} diff --git a/CRIU_code/test/zdtm/static/busyloop00.c b/CRIU_code/test/zdtm/static/busyloop00.c new file mode 100644 index 0000000..e9a065c --- /dev/null +++ b/CRIU_code/test/zdtm/static/busyloop00.c @@ -0,0 +1,18 @@ +#include "zdtmtst.h" + +const char *test_doc = "Run busy loop while migrating"; +const char *test_author = "Roman Kagan "; + +int main(int argc, char ** argv) +{ + test_init(argc, argv); + + test_daemon(); + + while (test_go()) + ; + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/caps00.c b/CRIU_code/test/zdtm/static/caps00.c new file mode 100644 index 0000000..7a256c0 --- /dev/null +++ b/CRIU_code/test/zdtm/static/caps00.c @@ -0,0 +1,178 @@ +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that aps are preserved"; +const char *test_author = "Pavel Emelianov "; + +struct cap_hdr { + unsigned int version; + int pid; +}; + +struct cap_data { + unsigned int eff; + unsigned int prm; + unsigned int inh; +}; + +#define _LINUX_CAPABILITY_VERSION_3 0x20080522 +#define _LINUX_CAPABILITY_U32S_3 2 +#define 
CAP_CHOWN 0 +#define CAP_DAC_OVERRIDE 1 + +int capget(struct cap_hdr *hdrp, struct cap_data *datap); +int capset(struct cap_hdr *hdrp, const struct cap_data *datap); + +static int cap_last_cap = 63; +#define NORM_CAPS(v, cap) v[1].cap &= (1LL << (cap_last_cap + 1 - 32)) - 1; + +int main(int argc, char **argv) +{ + task_waiter_t t; + int pid, result_pipe[2]; + struct cap_data data[_LINUX_CAPABILITY_U32S_3]; + struct cap_data data_2[_LINUX_CAPABILITY_U32S_3]; + char res = 'x'; + FILE *f; + + test_init(argc, argv); + task_waiter_init(&t); + + f = fopen("/proc/sys/kernel/cap_last_cap", "r"); + if (f) { + if (fscanf(f, "%d", &cap_last_cap) != 1) { + pr_perror("Unable to read cal_last_cap"); + fclose(f); + return 1; + } + fclose(f); + } else + test_msg("/proc/sys/kernel/cap_last_cap is not available\n"); + + if (pipe(result_pipe)) { + pr_perror("Can't create pipe"); + return 1; + } + + pid = test_fork(); + if (pid == 0) { + struct cap_hdr hdr; + if (prctl(PR_CAPBSET_DROP, CAP_SETPCAP, 0, 0, 0)) { + res = 'x'; + task_waiter_complete_current(&t); + goto bad; + } + + hdr.version = _LINUX_CAPABILITY_VERSION_3; + hdr.pid = 0; + + if (capget(&hdr, data) < 0) { + pr_perror("capget"); + return -1; + } + + hdr.version = _LINUX_CAPABILITY_VERSION_3; + hdr.pid = 0; + + data[0].eff &= ~((1 << CAP_CHOWN) | (1 << CAP_DAC_OVERRIDE)); + data[0].prm &= ~(1 << CAP_DAC_OVERRIDE); + + if (capset(&hdr, data) < 0) { + pr_perror("capset"); + return -1; + } + + task_waiter_complete_current(&t); + task_waiter_wait4(&t, getppid()); + + hdr.version = _LINUX_CAPABILITY_VERSION_3; + hdr.pid = 0; + + if (capget(&hdr, data_2) < 0) { + pr_perror("second capget"); + return -1; + } + + NORM_CAPS(data, eff); + NORM_CAPS(data, prm); + NORM_CAPS(data, inh); + NORM_CAPS(data_2, eff); + NORM_CAPS(data_2, prm); + NORM_CAPS(data_2, inh); + + if (data[0].eff != data_2[0].eff) { + res = '1'; + goto bad; + } + if (data[1].eff != data_2[1].eff) { + res = '2'; + goto bad; + } + if (data[0].prm != data_2[0].prm) { + 
res = '3'; + goto bad; + } + if (data[1].prm != data_2[1].prm) { + res = '4'; + goto bad; + } + if (data[0].inh != data_2[0].inh) { + res = '3'; + goto bad; + } + if (data[1].inh != data_2[1].inh) { + res = '4'; + goto bad; + } + + if (prctl(PR_CAPBSET_READ, CAP_SETPCAP, 0, 0, 0) != 0) { + res='5'; + goto bad; + } + + res = '0'; +bad: + write(result_pipe[1], &res, 1); + + if (res != '0') { + write(result_pipe[1], data, sizeof(data)); + write(result_pipe[1], data_2, sizeof(data_2)); + } + + close(result_pipe[0]); + close(result_pipe[1]); + _exit(0); + } + + task_waiter_wait4(&t, pid); + + test_daemon(); + test_waitsig(); + + task_waiter_complete_current(&t); + + read(result_pipe[0], &res, 1); + + if (res == '0') + pass(); + else { + read(result_pipe[0], data, sizeof(data)); + read(result_pipe[0], data_2, sizeof(data_2)); + test_msg("{eff,prm,inh}[]={%08x,%08x,%08x}, {%08x,%08x,%08x}\n", + data[0].eff, data[0].prm, data[0].inh, + data[1].eff, data[1].prm, data[1].inh); + test_msg("{eff,prm,inh}[]={%08x,%08x,%08x}, {%08x,%08x,%08x}\n", + data_2[0].eff, data_2[0].prm, data_2[0].inh, + data_2[1].eff, data_2[1].prm, data_2[1].inh); + fail("Fail: %c", res); + } + close(result_pipe[0]); + close(result_pipe[1]); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/caps00.desc b/CRIU_code/test/zdtm/static/caps00.desc new file mode 100644 index 0000000..2eac7e6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/caps00.desc @@ -0,0 +1 @@ +{'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/cgroup00.c b/CRIU_code/test/zdtm/static/cgroup00.c new file mode 100644 index 0000000..ee14d1f --- /dev/null +++ b/CRIU_code/test/zdtm/static/cgroup00.c @@ -0,0 +1,205 @@ +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "Check that cgroups layout is preserved"; +const char *test_author = "Pavel Emelianov "; + +char *dirname; +TEST_OPTION(dirname, string, "cgroup directory name", 1); +static const char *cgname = 
"zdtmtst"; +#define SUBNAME "subcg00" +#define SUBNAME2 SUBNAME"/subsubcg" + +static int cg_move(char *name) +{ + int cgfd, l; + char paux[256]; + + sprintf(paux, "%s/%s", dirname, name); + mkdir(paux, 0600); + + sprintf(paux, "%s/%s/tasks", dirname, name); + + cgfd = open(paux, O_WRONLY); + if (cgfd < 0) { + pr_perror("Can't open tasks"); + return -1; + } + + l = write(cgfd, "0", 2); + close(cgfd); + + if (l < 0) { + pr_perror("Can't move self to subcg"); + return -1; + } + + return 0; +} + +static int cg_check(char *name) +{ + int found = 0; + FILE *cgf; + char paux[256], aux[128]; + + cgf = fopen("/proc/self/cgroup", "r"); + if (cgf == NULL) + return -1; + + sprintf(aux, "name=%s:/%s\n", cgname, name); + while (fgets(paux, sizeof(paux), cgf)) { + char *s; + + s = strchr(paux, ':') + 1; + test_msg("CMP [%s] vs [%s]\n", s, aux); + if (!strcmp(s, aux)) { + found = 1; + break; + } + } + + fclose(cgf); + + return found ? 0 : -1; +} + +int main(int argc, char **argv) +{ + char aux[64]; + int p1[2], p2[2], pr[2], status; + + test_init(argc, argv); + + /* + * Pipes to talk to two kids. + * First, they report that they are ready (int), + * then they report the restore status (int). 
+ */ + + pipe(p1); + pipe(p2); + + /* "Restore happened" pipe */ + pipe(pr); + + if (mkdir(dirname, 0700) < 0) { + pr_perror("Can't make dir"); + goto out; + } + + sprintf(aux, "none,name=%s", cgname); + if (mount("none", dirname, "cgroup", 0, aux)) { + pr_perror("Can't mount cgroups"); + goto out_rd; + } + + if (cg_move(SUBNAME)) + goto out_rs; + + if (fork() == 0) { + if (fork() == 0) { + /* + * 2nd level kid -- moves into its own + * cgroup and triggers slow-path cg_set + * restore in criu + */ + + close(p1[0]); + close(p1[1]); + close(p2[0]); + close(pr[1]); + + status = cg_move(SUBNAME2); + write(p2[1], &status, sizeof(status)); + + if (status == 0) { + read(pr[0], &status, sizeof(status)); + + status = cg_check(SUBNAME2); + write(p2[1], &status, sizeof(status)); + } + + exit(0); + } + + /* + * 1st level kid -- inherits cgroup from + * parent and triggers fast-path cg_set + * restore in criu + */ + + close(p1[0]); + close(p2[0]); + close(p2[1]); + close(pr[1]); + + status = 0; + write(p1[1], &status, sizeof(status)); + + read(pr[0], &status, sizeof(status)); + + status = cg_check(SUBNAME); + write(p1[1], &status, sizeof(status)); + + exit(0); + } + + close(p1[1]); + close(p2[1]); + close(pr[0]); + + status = -1; + read(p1[0], &status, sizeof(status)); + if (status != 0) + goto out_ks; + + status = -1; + read(p2[0], &status, sizeof(status)); + if (status != 0) + goto out_ks; + + test_daemon(); + test_waitsig(); + + close(pr[1]); + + if (cg_check(SUBNAME)) { + fail("Top level task cg changed"); + goto out_rs; + } + + status = -1; + read(p1[0], &status, sizeof(status)); + if (status != 0) { + fail("1st level task cg changed"); + goto out_rs; + } + + status = -1; + read(p2[0], &status, sizeof(status)); + if (status != 0) { + fail("2nd level task cg changed"); + goto out_rs; + } + + pass(); + +out_rs: + umount(dirname); +out_rd: + rmdir(dirname); +out: + return 0; + +out_ks: + pr_perror("Error moving into cgroups"); + close(pr[0]); + goto out_rs; +} diff --git 
a/CRIU_code/test/zdtm/static/cgroup00.desc b/CRIU_code/test/zdtm/static/cgroup00.desc new file mode 100644 index 0000000..3c6c4a7 --- /dev/null +++ b/CRIU_code/test/zdtm/static/cgroup00.desc @@ -0,0 +1 @@ +{'flavor': 'h', 'flags': 'suid', 'opts': '--manage-cgroups'} diff --git a/CRIU_code/test/zdtm/static/cgroup00.hook b/CRIU_code/test/zdtm/static/cgroup00.hook new file mode 100644 index 0000000..a8af992 --- /dev/null +++ b/CRIU_code/test/zdtm/static/cgroup00.hook @@ -0,0 +1,20 @@ +#!/bin/bash + +[ "$1" == "--clean" -o "$1" == "--pre-restore" ] || exit 0 + +set -e + +tname=$(mktemp -d cgclean.XXXXXX) +mount -t cgroup none $tname -o "none,name=zdtmtst" + +echo "Cleaning $tname" + +set +e +rmdir "$tname/subcg00/subsubcg/" +rmdir "$tname/subcg00/" +set -e + +echo "Left there is:" +ls "$tname" +umount "$tname" +rmdir "$tname" diff --git a/CRIU_code/test/zdtm/static/cgroup01.c b/CRIU_code/test/zdtm/static/cgroup01.c new file mode 100644 index 0000000..cf54c1d --- /dev/null +++ b/CRIU_code/test/zdtm/static/cgroup01.c @@ -0,0 +1,116 @@ +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "Check that empty cgroups are preserved"; +const char *test_author = "Tycho Andersen "; + +char *dirname; +TEST_OPTION(dirname, string, "cgroup directory name", 1); +static const char *cgname = "zdtmtst"; +static const char *subname = "subcg01"; +static const char *empty = "empty"; + +int main(int argc, char **argv) +{ + int cgfd, l, ret = 1, i; + char aux[1024], paux[1024]; + FILE *cgf; + struct stat st; + + test_init(argc, argv); + + if (mkdir(dirname, 0700) < 0) { + pr_perror("Can't make dir"); + goto out; + } + + sprintf(aux, "none,name=%s", cgname); + if (mount("none", dirname, "cgroup", 0, aux)) { + pr_perror("Can't mount cgroups"); + goto out_rd; + } + + sprintf(paux, "%s/%s", dirname, subname); + mkdir(paux, 0600); + + l = sprintf(aux, "%d", getpid()); + sprintf(paux, "%s/%s/tasks", dirname, subname); + + cgfd = open(paux, 
O_WRONLY); + if (cgfd < 0) { + pr_perror("Can't open tasks"); + goto out_rs; + } + + l = write(cgfd, aux, l); + close(cgfd); + + if (l < 0) { + pr_perror("Can't move self to subcg"); + goto out_rs; + } + + for (i = 0; i < 2; i++) { + sprintf(paux, "%s/%s/%s.%d", dirname, subname, empty, i); + if (mkdir(paux, 0600)) { + pr_perror("mkdir %s", paux); + goto out_rs; + } + } + + test_daemon(); + test_waitsig(); + + cgf = fopen("/proc/self/mountinfo", "r"); + if (cgf == NULL) { + fail("No mountinfo file"); + goto out_rs; + } + + while (fgets(paux, sizeof(paux), cgf)) { + char *s; + + s = strstr(paux, cgname); + if (!s) + continue; + + sscanf(paux, "%*d %*d %*d:%*d %*s %s", aux); + test_msg("found cgroup at %s\n", aux); + + for (i = 0; i < 2; i++) { + ssprintf(paux, "%s/%s/%s.%d", aux, subname, empty, i); + + if (stat(paux, &st)) { + fail("couldn't stat %s\n", paux); + ret = -1; + goto out_close; + } + + if (!S_ISDIR(st.st_mode)) { + fail("%s is not a directory\n", paux); + ret = -1; + goto out_close; + } + } + + pass(); + ret = 0; + goto out_close; + } + + fail("empty cgroup not found!\n"); + +out_close: + fclose(cgf); +out_rs: + umount(dirname); +out_rd: + rmdir(dirname); +out: + return ret; +} diff --git a/CRIU_code/test/zdtm/static/cgroup01.desc b/CRIU_code/test/zdtm/static/cgroup01.desc new file mode 100644 index 0000000..3c6c4a7 --- /dev/null +++ b/CRIU_code/test/zdtm/static/cgroup01.desc @@ -0,0 +1 @@ +{'flavor': 'h', 'flags': 'suid', 'opts': '--manage-cgroups'} diff --git a/CRIU_code/test/zdtm/static/cgroup01.hook b/CRIU_code/test/zdtm/static/cgroup01.hook new file mode 100644 index 0000000..d2eacbb --- /dev/null +++ b/CRIU_code/test/zdtm/static/cgroup01.hook @@ -0,0 +1,21 @@ +#!/bin/bash + +[ "$1" == "--clean" -o "$1" == "--pre-restore" ] || exit 0 + +set -e + +tname=$(mktemp -d cgclean.XXXXXX) +mount -t cgroup none $tname -o "none,name=zdtmtst" + +echo "Cleaning $tname" + +set +e +rmdir "$tname/subcg01/empty.0/" +rmdir "$tname/subcg01/empty.1/" +rmdir 
"$tname/subcg01/" +set -e + +echo "Left there is:" +ls "$tname" +umount "$tname" +rmdir "$tname" diff --git a/CRIU_code/test/zdtm/static/cgroup02.c b/CRIU_code/test/zdtm/static/cgroup02.c new file mode 100644 index 0000000..6de2222 --- /dev/null +++ b/CRIU_code/test/zdtm/static/cgroup02.c @@ -0,0 +1,176 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "Check that empty cgroups are preserved"; +const char *test_author = "Tycho Andersen "; + +char *dirname; +TEST_OPTION(dirname, string, "cgroup directory name", 1); +static const char *cgname = "zdtmtst"; +static const char *subname = "oldroot"; +static const char *cgname2 = "zdtmtst.defaultroot"; + +int mount_and_add(const char *controller, const char *prefix, const char *path) +{ + char aux[1024], paux[1024], subdir[1024]; + int cgfd, l; + + if (mkdir(dirname, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir"); + return -1; + } + + sprintf(subdir, "%s/%s", dirname, controller); + if (mkdir(subdir, 0700) < 0) { + pr_perror("Can't make dir"); + return -1; + } + + sprintf(aux, "none,name=%s", controller); + if (mount("none", subdir, "cgroup", 0, aux)) { + pr_perror("Can't mount cgroups"); + goto err_rd; + } + + ssprintf(paux, "%s/%s", subdir, prefix); + mkdir(paux, 0600); + + ssprintf(paux, "%s/%s/%s", subdir, prefix, path); + mkdir(paux, 0600); + + l = sprintf(aux, "%d", getpid()); + ssprintf(paux, "%s/%s/%s/tasks", subdir, prefix, path); + + cgfd = open(paux, O_WRONLY); + if (cgfd < 0) { + pr_perror("Can't open tasks"); + goto err_rs; + } + + l = write(cgfd, aux, l); + close(cgfd); + + if (l < 0) { + pr_perror("Can't move self to subcg"); + goto err_rs; + } + + return 0; +err_rs: + umount(dirname); +err_rd: + rmdir(dirname); + return -1; +} + +bool test_exists(char *mountinfo_line, char *path) +{ + char aux[1024], paux[1024]; + struct stat st; + + sscanf(mountinfo_line, "%*d %*d %*d:%*d %*s %s", aux); + test_msg("found cgroup 
at %s\n", aux); + + ssprintf(paux, "%s/%s", aux, path); + if (stat(paux, &st)) { + return false; + } + + if (!S_ISDIR(st.st_mode)) { + return false; + } + + return true; +} + +int main(int argc, char **argv) +{ + FILE *cgf; + bool found_zdtmtstroot = false, found_newroot = false; + char paux[1024]; + int ret = -1; + int fd; + + test_init(argc, argv); + + if (mount_and_add(cgname, "prefix", subname)) + goto out; + if (mount_and_add(cgname2, "prefix", subname)) { + sprintf(paux, "%s/%s", dirname, cgname); + umount(paux); + rmdir(paux); + goto out; + } + + sprintf(paux, "%s/%s/prefix", dirname, cgname); + fd = open(paux, O_DIRECTORY); + if (fd < 0) + goto out_umount; + + if (fchmod(fd, 0777) < 0) { + fail("fchmod"); + goto out_umount; + } + + test_daemon(); + test_waitsig(); + + if (close(fd) < 0) { + fail("fd didn't survive"); + goto out_umount; + } + + cgf = fopen("/proc/self/mountinfo", "r"); + if (cgf == NULL) { + fail("No mountinfo file"); + goto out_umount; + } + + while (fgets(paux, sizeof(paux), cgf)) { + char *s; + + s = strstr(paux, cgname); + if (s && test_exists(paux, "prefix")) { + found_zdtmtstroot = true; + } + + s = strstr(paux, cgname2); + if (s && test_exists(paux, "newroot")) { + found_newroot = true; + } + } + + if (!found_zdtmtstroot) { + fail("oldroot not rewritten to zdtmtstroot!\n"); + goto out_close; + } + + if (!found_newroot) { + fail("oldroot not rewritten to newroot!\n"); + goto out_close; + } + + pass(); + ret = 0; + + +out_close: + fclose(cgf); +out_umount: + sprintf(paux, "%s/%s", dirname, cgname); + umount(paux); + rmdir(paux); + + sprintf(paux, "%s/%s", dirname, cgname2); + umount(paux); + rmdir(paux); +out: + return ret; +} diff --git a/CRIU_code/test/zdtm/static/cgroup02.desc b/CRIU_code/test/zdtm/static/cgroup02.desc new file mode 100644 index 0000000..df17a57 --- /dev/null +++ b/CRIU_code/test/zdtm/static/cgroup02.desc @@ -0,0 +1,4 @@ +{ 'dopts': '--manage-cgroups --cgroup-root name=zdtmtst:/prefix', + 'flags': 'suid', + 'flavor': 
'h', + 'ropts': '--manage-cgroups --cgroup-root /newroot --cgroup-root name=zdtmtst:/prefix'} diff --git a/CRIU_code/test/zdtm/static/cgroup02.hook b/CRIU_code/test/zdtm/static/cgroup02.hook new file mode 100644 index 0000000..e4f1ee9 --- /dev/null +++ b/CRIU_code/test/zdtm/static/cgroup02.hook @@ -0,0 +1,32 @@ +#!/bin/bash + +[ "$1" == "--clean" -o "$1" == "--pre-restore" ] || exit 0 + +set -e + +rmroots() { + echo "Cleaning $tname ($1)" + + mount -t cgroup none $tname -o "$1" + + for d in "$tname/prefix" "$tname/newroot"; do + test -d "$d" || continue + # sort by line length + for i in `find $d -type d | awk '{print length, $0}' | sort -rn | cut -d " " -f2-`; do + echo $i + rmdir $i + done + done + + echo "Left there is:" + ls "$tname" + umount "$tname" +} + +tname=$(mktemp -d cgclean.XXXXXX) + +for ctl in $(cat /proc/self/cgroup | cut -d: -f2); do + rmroots "$ctl" +done + +rmdir $tname diff --git a/CRIU_code/test/zdtm/static/cgroup03.c b/CRIU_code/test/zdtm/static/cgroup03.c new file mode 100644 index 0000000..0b5db23 --- /dev/null +++ b/CRIU_code/test/zdtm/static/cgroup03.c @@ -0,0 +1,171 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "Check that global cgroup settings (+perms) are restored"; +const char *test_author = "Tycho Andersen "; + +char *dirname; +TEST_OPTION(dirname, string, "cgroup directory name", 1); +static const char *cgname = "zdtmtst"; + +int mount_and_add(const char *controller, const char *path) +{ + char aux[1024], paux[1024], subdir[1024]; + int cgfd, l; + + if (mkdir(dirname, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir"); + return -1; + } + + sprintf(subdir, "%s/%s", dirname, controller); + if (mkdir(subdir, 0700) < 0) { + pr_perror("Can't make dir"); + return -1; + } + + sprintf(aux, "none,name=%s", controller); + if (mount("none", subdir, "cgroup", 0, aux)) { + pr_perror("Can't mount cgroups"); + goto err_rd; + } + + 
ssprintf(paux, "%s/%s", subdir, path); + mkdir(paux, 0600); + + l = sprintf(aux, "%d", getpid()); + ssprintf(paux, "%s/%s/tasks", subdir, path); + + cgfd = open(paux, O_WRONLY); + if (cgfd < 0) { + pr_perror("Can't open tasks"); + goto err_rs; + } + + l = write(cgfd, aux, l); + close(cgfd); + + if (l < 0) { + pr_perror("Can't move self to subcg"); + goto err_rs; + } + + return 0; +err_rs: + umount(dirname); +err_rd: + rmdir(dirname); + return -1; +} + +int chownmod(char *path, int flags) +{ + int fd, ret = -1; + + fd = open(path, flags); + if (fd < 0) { + pr_perror("can't open %s", path); + return -1; + } + + if (fchown(fd, 1000, 1000) < 0) { + pr_perror("can't chown %s", path); + goto out; + } + + if (fchmod(fd, 0777) < 0) { + pr_perror("can't chmod %s", path); + goto out; + } + + ret = 0; +out: + close(fd); + return ret; +} + +int checkperms(char *path) +{ + struct stat sb; + + if (stat(path, &sb) < 0) { + pr_perror("can't stat %s", path); + return -1; + } + + if ((sb.st_mode & 0777) != 0777) { + fail("mode for %s doesn't match (%o)\n", path, sb.st_mode); + return -1; + } + + if (sb.st_uid != 1000) { + fail("uid for %s doesn't match (%d)\n", path, sb.st_uid); + return -1; + } + + if (sb.st_gid != 1000) { + fail("gid for %s doesn't match (%d)\n", path, sb.st_gid); + return -1; + } + + return 0; +} + +int main(int argc, char **argv) +{ + int ret = -1; + char path[PATH_MAX]; + + test_init(argc, argv); + + if (mount_and_add(cgname, "test") < 0) + return -1; + + sprintf(path, "%s/%s/test", dirname, cgname); + if (chownmod(path, O_DIRECTORY) < 0) + goto out_umount; + + sprintf(path, "%s/%s/test/notify_on_release", dirname, cgname); + if (chownmod(path, O_RDWR) < 0) + goto out_umount; + + + sprintf(path, "%s/%s/test/cgroup.procs", dirname, cgname); + if (chownmod(path, O_RDWR) < 0) + goto out_umount; + + test_daemon(); + test_waitsig(); + + sprintf(path, "%s/%s/test", dirname, cgname); + if (checkperms(path) < 0) + goto out_umount; + + sprintf(path, 
"%s/%s/test/notify_on_release", dirname, cgname); + if (checkperms(path) < 0) + goto out_umount; + + sprintf(path, "%s/%s/test/cgroup.procs", dirname, cgname); + if (checkperms(path) < 0) + goto out_umount; + + pass(); + ret = 0; + +out_umount: + sprintf(path, "%s/%s/test", dirname, cgname); + rmdir(path); + sprintf(path, "%s/%s", dirname, cgname); + umount(path); + rmdir(path); + rmdir(dirname); + return ret; +} diff --git a/CRIU_code/test/zdtm/static/cgroup03.desc b/CRIU_code/test/zdtm/static/cgroup03.desc new file mode 100644 index 0000000..42a3f2b --- /dev/null +++ b/CRIU_code/test/zdtm/static/cgroup03.desc @@ -0,0 +1 @@ +{'flavor': 'h', 'flags': 'suid excl', 'opts': '--manage-cgroups'} diff --git a/CRIU_code/test/zdtm/static/cgroup03.hook b/CRIU_code/test/zdtm/static/cgroup03.hook new file mode 100644 index 0000000..6f11022 --- /dev/null +++ b/CRIU_code/test/zdtm/static/cgroup03.hook @@ -0,0 +1,14 @@ +#!/bin/bash + +[ "$1" == "--clean" -o "$1" == "--pre-restore" ] || exit 0 + +tname=$(mktemp -d cgclean.XXXXXX) +mount -t cgroup none $tname -o "none,name=zdtmtst" + +echo "Cleaning $tname" +set +e +rmdir "$tname/test" +set -e + +umount "$tname" +rmdir "$tname" diff --git a/CRIU_code/test/zdtm/static/cgroup04.c b/CRIU_code/test/zdtm/static/cgroup04.c new file mode 100644 index 0000000..8ec0cff --- /dev/null +++ b/CRIU_code/test/zdtm/static/cgroup04.c @@ -0,0 +1,196 @@ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + +const char *test_doc = "Check that some cgroups properties in kernel controllers are preserved"; +const char *test_author = "Tycho Andersen "; + +char *dirname; +TEST_OPTION(dirname, string, "cgroup directory name", 1); +static const char *cgname = "zdtmtst"; + +int write_value(const char *path, const char *value) +{ + int fd, l; + + fd = open(path, O_WRONLY); + if (fd < 0) { + pr_perror("open %s", path); + return -1; + } + + l = 
write(fd, value, strlen(value)); + close(fd); + if (l < 0) { + pr_perror("failed to write %s to %s", value, path); + return -1; + } + + return 0; +} + +int mount_and_add(const char *controller, const char *path, const char *prop, const char *value) +{ + char aux[1024], paux[1024], subdir[1024]; + + if (mkdir(dirname, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir"); + return -1; + } + + sprintf(subdir, "%s/%s", dirname, controller); + if (mkdir(subdir, 0700) < 0) { + pr_perror("Can't make dir"); + return -1; + } + + if (mount("none", subdir, "cgroup", 0, controller)) { + pr_perror("Can't mount cgroups"); + goto err_rd; + } + + ssprintf(paux, "%s/%s", subdir, path); + mkdir(paux, 0600); + + ssprintf(paux, "%s/%s/%s", subdir, path, prop); + if (write_value(paux, value) < 0) + goto err_rs; + + sprintf(aux, "%d", getpid()); + ssprintf(paux, "%s/%s/tasks", subdir, path); + if (write_value(paux, aux) < 0) + goto err_rs; + + ssprintf(paux, "%s/%s/special_prop_check", subdir, path); + mkdir(paux, 0600); + + return 0; +err_rs: + umount(dirname); +err_rd: + rmdir(dirname); + return -1; +} + +bool checkval(char *path, char *val) +{ + char buf[1024]; + int fd, n; + + fd = open(path, O_RDONLY); + if (fd < 0) { + pr_perror("open %s", path); + return false; + } + + n = read(fd, buf, sizeof(buf) - 1); + close(fd); + if (n < 0) { + pr_perror("read"); + return false; + } + buf[n] = 0; + + if (strcmp(val, buf)) { + pr_err("got %s expected %s\n", buf, val); + return false; + } + + return true; +} + +int main(int argc, char **argv) +{ + int ret = -1, i; + char buf[1024], path[PATH_MAX]; + struct stat sb; + + char *dev_allow[] = { + "c *:* m", + "b *:* m", + "c 1:3 rwm", + "c 1:5 rwm", + "c 1:7 rwm", + "c 5:0 rwm", + "c 5:2 rwm", + "c 1:8 rwm", + "c 1:9 rwm", + "c 136:* rwm", + "c 10:229 rwm", + }; + + test_init(argc, argv); + + if (mount_and_add("devices", cgname, "devices.deny", "a") < 0) + goto out; + + /* need to allow /dev/null for restore */ + sprintf(path, 
"%s/devices/%s/devices.allow", dirname, cgname); + for (i = 0; i < ARRAY_SIZE(dev_allow); i++) { + if (write_value(path, dev_allow[i]) < 0) + goto out; + } + + if (mount_and_add("memory", cgname, "memory.limit_in_bytes", "268435456") < 0) + goto out; + + test_daemon(); + test_waitsig(); + + buf[0] = 0; + for (i = 0; i < ARRAY_SIZE(dev_allow); i++) { + strcat(buf, dev_allow[i]); + strcat(buf, "\n"); + } + + sprintf(path, "%s/devices/%s/devices.list", dirname, cgname); + if (!checkval(path, buf)) { + fail(); + goto out; + } + + sprintf(path, "%s/memory/%s/memory.limit_in_bytes", dirname, cgname); + if (!checkval(path, "268435456\n")) { + fail(); + goto out; + } + + sprintf(path, "%s/devices/%s/special_prop_check", dirname, cgname); + if (stat(path, &sb) < 0) { + fail("special_prop_check doesn't exist?"); + goto out; + } + + if (!S_ISDIR(sb.st_mode)) { + fail("special_prop_check not a directory?"); + goto out; + } + + pass(); + ret = 0; +out: + sprintf(path, "%s/devices/%s/special_prop_check", dirname, cgname); + rmdir(path); + + sprintf(path, "%s/devices/%s", dirname, cgname); + rmdir(path); + sprintf(path, "%s/devices", dirname); + umount(path); + + sprintf(path, "%s/memory/%s", dirname, cgname); + rmdir(path); + sprintf(path, "%s/memory", dirname); + umount(path); + + return ret; +} diff --git a/CRIU_code/test/zdtm/static/cgroup04.desc b/CRIU_code/test/zdtm/static/cgroup04.desc new file mode 100644 index 0000000..5fa9d50 --- /dev/null +++ b/CRIU_code/test/zdtm/static/cgroup04.desc @@ -0,0 +1 @@ +{'flavor': 'h', 'flags': 'suid excl', 'opts': '--manage-cgroups=full'} diff --git a/CRIU_code/test/zdtm/static/cgroup04.hook b/CRIU_code/test/zdtm/static/cgroup04.hook new file mode 100644 index 0000000..f3eb616 --- /dev/null +++ b/CRIU_code/test/zdtm/static/cgroup04.hook @@ -0,0 +1,15 @@ +#!/bin/bash + +[ "$1" == "--clean" -o "$1" == "--pre-restore" ] || exit 0 + +tname=$(mktemp -d cgclean.XXXXXX) +mount -t cgroup none $tname -o "devices" + +echo "Cleaning $tname" +set +e 
+rmdir "$tname/zdtmtst/special_prop_check" +rmdir "$tname/zdtmtst" +set -e + +umount "$tname" +rmdir "$tname" diff --git a/CRIU_code/test/zdtm/static/cgroup_ifpriomap.c b/CRIU_code/test/zdtm/static/cgroup_ifpriomap.c new file mode 100644 index 0000000..f043b36 --- /dev/null +++ b/CRIU_code/test/zdtm/static/cgroup_ifpriomap.c @@ -0,0 +1,364 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check preserving multiline cgroup controller's property net_prio/net_prio.ifpriomap"; +const char *test_author = "Dmitry Safonov "; + +char *dirname; +TEST_OPTION(dirname, string, "cgroup directory name", 1); + +static const char *cgname = "zdtmtst"; + +#define BUF_SZ 1024 +#define PRIOMAPS_SZ 40 + +struct ifpriomap_t { + char *ifname; + uint32_t prio; +}; +struct ifpriomap_t maps[PRIOMAPS_SZ], new_maps[PRIOMAPS_SZ]; + +static int mount_cg(const char *controller) +{ + char mnt_point[BUF_SZ], subdir[BUF_SZ]; + char tasks_path[BUF_SZ], pid_str[BUF_SZ]; + int fd; + + sprintf(mnt_point, "%s/%s", dirname, controller); + sprintf(subdir, "%s/%s/%s", dirname, controller, cgname); + sprintf(pid_str, "%d", getpid()); + sprintf(tasks_path, "%s/%s/%s/tasks", dirname, controller, cgname); + + if (mkdir(dirname, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir"); + return -1; + } + if (mkdir(mnt_point, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir `%s'", mnt_point); + return -1; + } + if (mount("none", mnt_point, "cgroup", 0, controller)) { + pr_perror("Can't mount `%s' cgroup", controller); + goto err_rm; + } + if (mkdir(subdir, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir `%s'", subdir); + goto err_umount; + } + + /* Add self to newly created cgroup */ + fd = open(tasks_path, O_WRONLY); + if (fd < 0) { + pr_perror("Failed to open `%s'", tasks_path); + goto err_controller; + } + if (write(fd, pid_str, strlen(pid_str)) != strlen(pid_str)) { + pr_perror("failed to 
write `%s' to `%s'", pid_str, tasks_path); + close(fd); + goto err_controller; + } + + close(fd); + return 0; + +err_controller: + rmdir(subdir); +err_umount: + umount(mnt_point); +err_rm: + rmdir(mnt_point); + return -1; +} + +static int umount_cg(const char *controller) +{ + char mnt_point[BUF_SZ], subdir[BUF_SZ]; + + sprintf(mnt_point, "%s/%s", dirname, controller); + sprintf(subdir, "%s/%s/%s", dirname, controller, cgname); + + rmdir(subdir); + + return umount(mnt_point); +} + +static int read_one_priomap(char *prop_line, struct ifpriomap_t *out) +{ + char *space; + size_t len; + + space = strchr(prop_line, ' '); + if (!space) { + pr_err("Broken ifpriomap file line: `%s'\n", prop_line); + return -1; + } + len = space - prop_line; + + out->ifname = malloc(len + 1); + if (!out->ifname) { + pr_perror("malloc() failed\n"); + return -1; + } + + strncpy(out->ifname, prop_line, len); + out->ifname[len] = '\0'; /* poor man's strlcpy() */ + out->prio = (uint32_t)strtol(space + 1, NULL, 10); + + return 0; +} + +static int read_map(const char *path, struct ifpriomap_t *out, size_t out_sz) +{ + char buf[BUF_SZ]; + FILE *fpriomap; + size_t i; + + fpriomap = fopen(path, "r"); + if (!fpriomap) { + pr_perror("Failed to open `%s'", path); + return -1; + } + + for (i = 0; i < out_sz; i++) { + if (!fgets(buf, BUF_SZ, fpriomap)) + break; + + if (read_one_priomap(buf, &out[i])) { + fclose(fpriomap); + return -1; + } + } + + if (fclose(fpriomap)) { + pr_perror("Failed to close `%s'", path); + return -1; + } + + return 0; +} + +static int write_map(const char *path, struct ifpriomap_t *out, size_t out_sz) +{ + char buf[BUF_SZ]; + ssize_t written; + size_t i; + int fd; + + fd = open(path, O_WRONLY); + if (fd < 0) { + pr_perror("Failed to open `%s'", path); + return -1; + } + + for (i = 0; i < out_sz; i++) { + struct ifpriomap_t *p = &out[i]; + + if (!p->ifname) + break; + + snprintf(buf, BUF_SZ, "%s %lu", + p->ifname, (unsigned long)p->prio); + + written = write(fd, buf, strlen(buf)); 
+ if (written < 0) { + pr_perror("Failed to write `%s' to `%s'", buf, path); + close(fd); + return -1; + } + } + + if (close(fd)) { + pr_perror("Failed to close `%s'", path); + return -1; + } + + return 0; +} + +static void randomize_map(struct ifpriomap_t *out, size_t out_sz) +{ + size_t i; + + for (i = 0; i < out_sz; i++) { + struct ifpriomap_t *p = &out[i]; + + if (!p->ifname) + return; + + p->prio += rand(); + } +} + +static int compare_maps(void) +{ + size_t i, j; + + for (i = 0; i < PRIOMAPS_SZ; i++) { + struct ifpriomap_t *a = &maps[i]; + + if (!a->ifname) + return 0; + + for (j = 0; j < PRIOMAPS_SZ; j++) { + struct ifpriomap_t *b = &new_maps[j]; + + if (!b->ifname) + break; + + if (strcmp(a->ifname, b->ifname) == 0) { + if (a->prio != b->prio) { + pr_err("`%s' prio: %lu != %lu\n", + a->ifname, + (unsigned long)a->prio, + (unsigned long)b->prio); + return -1; + } + } + } + } + + return 0; +} + +static ssize_t parse_cgroup_line(FILE *fcgroup, size_t *buf_sz, char **buf) +{ + ssize_t line_sz; + + /* Reading cgroup mount nr */ + errno = 0; + line_sz = getdelim(buf, buf_sz, ':', fcgroup); + if (errno) { + pr_perror("failed to read from file"); + return -1; + } + + if (line_sz == -1) /* EOF */ + return 0; + + /* Reading mounted controller name */ + errno = 0; + line_sz = getdelim(buf, buf_sz, ':', fcgroup); + if (line_sz == -1) { /* no EOF here */ + pr_perror("failed to read from file"); + return -1; + } + + /* + * Reading the rest of the line. 
/*
 * Controller's name may differ depending on the kernel's config:
 *	`net_prio' if only CONFIG_CGROUP_NET_PRIO is set
 *	`net_cls,net_prio' if also CONFIG_CGROUP_NET_CLASSID is set
 *
 * Scans /proc/self/cgroup for a controller field containing "net_prio".
 *
 * On success returns 0 and stores a heap-allocated string in @name; the
 * caller owns it and must free() it. When no such controller is listed the
 * plain name "net_prio" is returned (also heap-allocated). On failure
 * returns -1 and leaves @name set to NULL.
 */
static int get_controller_name(char **name)
{
	FILE *self_cgroup = fopen("/proc/self/cgroup", "r");
	size_t buf_sz = 0;
	int ret = -1;

	*name = NULL;
	if (!self_cgroup) {
		pr_perror("failed to open self/cgroup");
		return -1;
	}

	for (;;) {
		ssize_t len = parse_cgroup_line(self_cgroup, &buf_sz, name);

		if (len < 0)
			goto out_free;

		if (len == 0) /* EOF */
			break;

		if (strstr(*name, "net_prio")) {
			/* erasing ':' delimiter */
			(*name)[len - 1] = '\0';
			ret = 0;
			goto out_close;
		}
	}

	/*
	 * self/cgroup has no mount for net_prio - try to map it.
	 * Drop the scratch buffer getdelim() may have allocated and hand out
	 * a heap copy, so the caller's free() is always valid.
	 */
	free(*name);
	*name = strdup("net_prio");
	if (!*name) {
		pr_perror("failed to allocate controller name");
		goto out_close;
	}
	ret = 0;
	goto out_close;

out_free:
	free(*name);
	*name = NULL;
out_close:
	fclose(self_cgroup);
	return ret;
}
free(controller_name); + + return ret; +} diff --git a/CRIU_code/test/zdtm/static/cgroup_ifpriomap.desc b/CRIU_code/test/zdtm/static/cgroup_ifpriomap.desc new file mode 100644 index 0000000..5fa9d50 --- /dev/null +++ b/CRIU_code/test/zdtm/static/cgroup_ifpriomap.desc @@ -0,0 +1 @@ +{'flavor': 'h', 'flags': 'suid excl', 'opts': '--manage-cgroups=full'} diff --git a/CRIU_code/test/zdtm/static/cgroup_ifpriomap.hook b/CRIU_code/test/zdtm/static/cgroup_ifpriomap.hook new file mode 100644 index 0000000..926ffee --- /dev/null +++ b/CRIU_code/test/zdtm/static/cgroup_ifpriomap.hook @@ -0,0 +1,22 @@ +#!/bin/bash + +[ "$1" == "--clean" -o "$1" == "--pre-restore" ] || exit 0 + +# Controller's name may differ depending on the kernel's config: +# `net_prio' if only CONFIG_CGROUP_NET_PRIO is set +# `net_cls,net_prio' if also CONFIG_CGROUP_NET_CLASSID is set +controller=$(sed -n '/net_prio/s/.*:\([^:]*net_prio[^:]*\):.*/\1/p' /proc/self/cgroup) + +echo "Controller's name '$controller'" + +tname=$(mktemp -d cgclean.XXXXXX) +mount -t cgroup foobar $tname -o "$controller" + +echo "Cleaning $tname" + +set +e +rmdir "$tname/zdtmtst" +set -e + +umount "$tname" +rmdir "$tname" diff --git a/CRIU_code/test/zdtm/static/cgroup_stray.c b/CRIU_code/test/zdtm/static/cgroup_stray.c new file mode 100644 index 0000000..8532f84 --- /dev/null +++ b/CRIU_code/test/zdtm/static/cgroup_stray.c @@ -0,0 +1,230 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "Check that stray cgroups are c/r'd correctly"; +const char *test_author = "Tycho Andersen "; + +char *dirname; +TEST_OPTION(dirname, string, "cgroup directory name", 1); +static const char *cgname = "zdtmtst"; + +static int mount_ctrl(const char *controller) +{ + char aux[1024], subdir[1024]; + + if (mkdir(dirname, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir"); + return -1; + } + + sprintf(subdir, 
"%s/%s", dirname, controller); + if (mkdir(subdir, 0700) < 0) { + pr_perror("Can't make dir"); + return -1; + } + + sprintf(aux, "none,name=%s", controller); + if (mount("none", subdir, "cgroup", 0, aux)) { + pr_perror("Can't mount cgroups"); + goto err_rd; + } + + return 0; +err_rd: + rmdir(dirname); + return -1; +} + +static int add_to_cg(const char *controller, const char *path) +{ + char aux[1024], paux[1024], subdir[1024]; + int cgfd, l; + + sprintf(subdir, "%s/%s", dirname, controller); + ssprintf(paux, "%s/%s", subdir, path); + mkdir(paux, 0600); + + l = sprintf(aux, "%d", getpid()); + ssprintf(paux, "%s/%s/tasks", subdir, path); + + cgfd = open(paux, O_WRONLY); + if (cgfd < 0) { + pr_perror("Can't open tasks %s", paux); + return -1; + } + + l = write(cgfd, aux, l); + close(cgfd); + + if (l < 0) { + pr_perror("Can't move self to subcg %s", path); + return -1; + } + + return 0; +} + +static bool pid_in_cgroup(pid_t pid, const char *controller, const char *path) { + char buf[2048]; + FILE *f; + bool ret = false; + + sprintf(buf, "/proc/%d/cgroup", pid); + f = fopen(buf, "r"); + if (!f) { + pr_perror("fopen"); + return false; + } + + while (NULL != fgets(buf, sizeof(buf), f)) { + char *pos, *pid_controller, *pid_path; + + /* chop off trailing \n */ + buf[strlen(buf)-1] = '\0'; + + /* skip hierarchy no. 
*/ + pos = strstr(buf, ":"); + if (!pos) { + pr_err("invalid /proc/pid/cgroups file"); + goto out; + } + pos++; + pid_controller = pos; + + pos = strstr(pos, ":"); + if (!pos) { + pr_err("invalid /proc/pid/cgroups file"); + goto out; + } + + *pos = '\0'; + pos++; + pid_path = pos; + +test_msg("comparing %s and %s\n", controller, pid_controller); + if (strcmp(controller, pid_controller)) + continue; + + if (strcmp(path, pid_path)) + pr_err("task not in right cg for controller %s expected %s, got %s\n", controller, path, pid_path); + else + ret = true; + + goto out; + } + +out: + fclose(f); + return ret; +} + +int main(int argc, char **argv) +{ + int ret = -1, sk_pair[2], sk, status; + char path[PATH_MAX], c; + pid_t pid = 0; + + test_init(argc, argv); + + if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) { + pr_perror("socketpair"); + return -1; + } + + if (mount_ctrl(cgname) < 0) + return -1; + + pid = fork(); + if (pid < 0) { + pr_perror("fork"); + goto out_umount; + } + + if (pid == 0) { + close(sk_pair[0]); + sk = sk_pair[1]; + + if (add_to_cg(cgname, "foo")) + exit(1); + + if (write(sk, &c, 1) != 1) { + pr_perror("write"); + exit(1); + } + + if (read(sk, &c, 1) != 1) { + pr_perror("read %d", ret); + exit(1); + } + + sprintf(path, "name=%s", cgname); + if (!pid_in_cgroup(getpid(), path, "/foo")) + exit(1); + exit(0); + } + + close(sk_pair[1]); + sk = sk_pair[0]; + + if (add_to_cg(cgname, "bar")) + goto out_kill; + + if ((ret = read(sk, &c, 1)) != 1) { + pr_perror("read %d", ret); + goto out_kill; + } + + test_daemon(); + test_waitsig(); + + if (write(sk, &c, 1) != 1) { + pr_perror("write"); + goto out_kill; + } + + sprintf(path, "name=%s", cgname); + if (!pid_in_cgroup(getpid(), path, "/bar")) { + fail("parent not in cgroup /bar"); + goto out_kill; + } + + if (pid != waitpid(pid, &status, 0)) { + pr_perror("waitpid"); + goto out_umount; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status)) { + fail("exit status %d\n", status); + goto out_umount; + } + + 
pass(); + ret = 0; + +out_kill: + if (pid > 0) + kill(pid, SIGKILL); + +out_umount: + sprintf(path, "%s/%s/foo", dirname, cgname); + rmdir(path); + sprintf(path, "%s/%s/test", dirname, cgname); + rmdir(path); + sprintf(path, "%s/%s", dirname, cgname); + umount(path); + rmdir(path); + rmdir(dirname); + return ret; +} diff --git a/CRIU_code/test/zdtm/static/cgroup_stray.desc b/CRIU_code/test/zdtm/static/cgroup_stray.desc new file mode 100644 index 0000000..32235b9 --- /dev/null +++ b/CRIU_code/test/zdtm/static/cgroup_stray.desc @@ -0,0 +1,4 @@ +{ 'feature': 'cgroupns', + 'flags': 'suid', + 'flavor': 'h ns', + 'opts': '--manage-cgroups'} diff --git a/CRIU_code/test/zdtm/static/cgroupns.c b/CRIU_code/test/zdtm/static/cgroupns.c new file mode 100644 index 0000000..7459af1 --- /dev/null +++ b/CRIU_code/test/zdtm/static/cgroupns.c @@ -0,0 +1,213 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +#ifndef CLONE_NEWCGROUP +#define CLONE_NEWCGROUP 0x02000000 +#endif + +const char *test_doc = "Check that cgroup NS is correctly handled."; +const char *test_author = "Tycho Andersen "; + +/* we need dirname before test_init() here */ +char *dirname = "cgroupns.test"; +static const char *cgname = "zdtmtst"; + +int mount_and_add(const char *controller, const char *path) +{ + char aux[1024], paux[1024], subdir[1024]; + int cgfd, l; + + if (mkdir(dirname, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir"); + return -1; + } + + ssprintf(subdir, "%s/%s", dirname, controller); + if (mkdir(subdir, 0700) < 0) { + pr_perror("Can't make dir"); + return -1; + } + + ssprintf(aux, "none,name=%s", controller); + if (mount("none", subdir, "cgroup", 0, aux)) { + pr_perror("Can't mount cgroups"); + goto err_rd; + } + + ssprintf(paux, "%s/%s", subdir, path); + mkdir(paux, 0600); + + l = ssprintf(aux, "%d", getpid()); + ssprintf(paux, "%s/%s/tasks", subdir, path); + + cgfd = 
/*
 * Check whether @pid is a member of cgroup @path in hierarchy @controller.
 *
 * Parses /proc/<pid>/cgroup, whose lines have the form
 * "<hierarchy-no>:<controller>:<path>", and looks for the first line whose
 * controller field equals @controller. Returns true only if that line's
 * path equals @path; returns false if the file cannot be read, is
 * malformed, has no such controller, or the path differs (logged).
 */
static bool pid_in_cgroup(pid_t pid, const char *controller, const char *path) {
	char buf[2048];
	FILE *f;
	bool ret = false;

	snprintf(buf, sizeof(buf), "/proc/%d/cgroup", pid);
	f = fopen(buf, "r");
	if (!f) {
		pr_perror("fopen");
		return false;
	}

	while (fgets(buf, sizeof(buf), f)) {
		char *pos, *pid_controller, *pid_path;

		/*
		 * Chop off the trailing \n, if there is one. Cutting the last
		 * byte unconditionally would eat a real character on a line
		 * that was truncated or not newline-terminated.
		 */
		buf[strcspn(buf, "\n")] = '\0';

		/* skip hierarchy no. */
		pos = strchr(buf, ':');
		if (!pos) {
			pr_err("invalid /proc/pid/cgroups file\n");
			goto out;
		}
		pid_controller = pos + 1;

		pos = strchr(pid_controller, ':');
		if (!pos) {
			pr_err("invalid /proc/pid/cgroups file\n");
			goto out;
		}

		/* Split "<controller>:<path>" in place. */
		*pos = '\0';
		pid_path = pos + 1;

		if (strcmp(controller, pid_controller))
			continue;

		if (strcmp(path, pid_path))
			pr_err("task not in right cg for controller %s expected %s, got %s\n", controller, path, pid_path);
		else
			ret = true;

		/* Only the first matching controller line is considered. */
		break;
	}

out:
	fclose(f);
	return ret;
}
#define MSG "out-file-contents"

/*
 * Create (or open) @name read-write and store MSG, including its
 * terminating NUL, at the current file offset.
 *
 * Returns the open descriptor on success. Returns -1 if the file cannot be
 * opened or the write is short or fails; no descriptor is left open then.
 */
static int make_file(char *name)
{
	int fd;

	fd = open(name, O_RDWR | O_CREAT, 0666);
	if (fd < 0)
		return -1;

	if (write(fd, MSG, sizeof(MSG)) != (ssize_t)sizeof(MSG)) {
		close(fd);
		return -1;
	}

	return fd;
}

/*
 * Verify that the file behind @fd starts with MSG (NUL included).
 *
 * Rewinds @fd to offset 0 first. Returns 0 if the contents match, -1 if
 * the seek or read fails, the read is short, or the bytes differ.
 */
static int check_file(int fd)
{
	char r[sizeof(MSG)];

	if (lseek(fd, 0, SEEK_SET) < 0)
		return -1;

	if (read(fd, r, sizeof(r)) != (ssize_t)sizeof(MSG))
		return -1;

	if (memcmp(r, MSG, sizeof(MSG)))
		return -1;

	return 0;
}
#define MSG "chroot-file-contents"

/*
 * Create (or open) @name read-write and store MSG, including its
 * terminating NUL, at the current file offset.
 *
 * Returns the open descriptor on success. Returns -1 if the file cannot be
 * opened or the write is short or fails; no descriptor is left open then.
 */
static int make_file(char *name)
{
	int fd;

	fd = open(name, O_RDWR | O_CREAT, 0666);
	if (fd < 0)
		return -1;

	if (write(fd, MSG, sizeof(MSG)) != (ssize_t)sizeof(MSG)) {
		close(fd);
		return -1;
	}

	return fd;
}

/*
 * Verify that the file behind @fd starts with MSG (NUL included).
 *
 * Rewinds @fd to offset 0 first. Returns 0 if the contents match, -1 if
 * the seek or read fails, the read is short, or the bytes differ.
 */
static int check_file(int fd)
{
	char r[sizeof(MSG)];

	if (lseek(fd, 0, SEEK_SET) < 0)
		return -1;

	if (read(fd, r, sizeof(r)) != (ssize_t)sizeof(MSG))
		return -1;

	if (memcmp(r, MSG, sizeof(MSG)))
		return -1;

	return 0;
}
+{ + int pid, pipe_prep[2], pipe_goon[2], pipe_res[2]; + char res; + int fd, fd2; + + test_init(argc, argv); + + filepath = malloc(strlen(filename) + 2); + sprintf(filepath, "/%s", filename); + + pipe(pipe_prep); + pipe(pipe_goon); + pipe(pipe_res); + pid = test_fork(); + if (pid != 0) { + close(pipe_prep[1]); + close(pipe_goon[0]); + close(pipe_res[1]); + + res = ERR_PIPES; + read(pipe_prep[0], &res, 1); + read(pipe_prep[0], &res, 1); /* wait when pipe_prep[] will be closed */ + if (res != SUCCESS) { + if (res == ERR_PIPES) + pr_perror("broken pipes"); + else { + if (res & ERR_IN_FILE) + pr_perror("inside-root file fail"); + if (res & ERR_ROOT) + pr_perror("chroot fail"); + if (res & ERR_DIR) + pr_perror("mkdir fail"); + } + return 0; + } + + test_daemon(); + test_waitsig(); + close(pipe_goon[1]); + + res = ERR_PIPES; + read(pipe_res[0], &res, 1); + + if (res == SUCCESS) + pass(); + else if (res == ERR_PIPES) + fail("broken pipes"); + else { + if (res & ERR_IN_FILE) + fail("opened file broken"); + if (res & ERR_OPEN) + fail("open in chroot fail"); + if (res & ERR_FILE2) + fail("wrong file opened"); + } + + wait(NULL); + return 0; + } + + close(pipe_prep[0]); + close(pipe_goon[1]); + close(pipe_res[0]); + + if (mkdir(dirname, 0700)) { + res = ERR_DIR; + goto err_nodir; + } + + if (chroot(dirname)) { + res = ERR_ROOT; + goto err_noroot; + } + + fd = make_file(filepath); + if (fd < 0) { + res = ERR_IN_FILE; + goto err_nofile2; + } + + res = SUCCESS; + write(pipe_prep[1], &res, 1); + close(pipe_prep[1]); + read(pipe_goon[0], &res, 1); + + res = SUCCESS; + + if (check_file(fd)) + res |= ERR_IN_FILE; + + fd2 = open(filepath, O_RDWR); + if (fd2 < 0) + res |= ERR_OPEN; + else if (check_file(fd2)) + res |= ERR_FILE2; + + write(pipe_res[1], &res, 1); + exit(0); + +err_nofile2: +err_noroot: +err_nodir: + write(pipe_prep[1], &res, 1); + exit(0); +} diff --git a/CRIU_code/test/zdtm/static/chroot.desc b/CRIU_code/test/zdtm/static/chroot.desc new file mode 100644 index 
/*
 * Try to unmount /proc and /dev/pts before the checkpoint, then wait for
 * the restore signal. Failed unmounts are only logged; the test passes
 * unconditionally once it is signalled.
 */
int main(int argc, char **argv)
{
	int proc_rc, pts_rc;

	test_init(argc, argv);

	proc_rc = umount("/proc");
	if (proc_rc < 0)
		pr_perror("Can't umount proc");

	pts_rc = umount("/dev/pts");
	if (pts_rc < 0)
		pr_perror("Can't umount devpts");

	test_daemon();
	test_waitsig();

	pass();
	return 0;
}
/* Thread id of the caller via the raw syscall; the macro argument is unused. */
#define gettid(code)	\
	syscall(__NR_gettid)

/* Both mutexes are used as one-shot hand-off signals between main() and the thread. */
static pthread_mutex_t init_lock;
static pthread_mutex_t exit_lock;

/*
 * Thread body: publish this thread's tid through @tid2, release init_lock
 * so main() can proceed, then block on exit_lock until main() releases it.
 */
static void *thread_func(void *tid2)
{
	*(int *)tid2 = gettid();

	/*
	 * NOTE(review): init_lock was locked by main(), not by this thread;
	 * POSIX leaves unlocking a default mutex from a non-owner undefined.
	 * Confirm this is acceptable here or switch to a semaphore.
	 */
	pthread_mutex_unlock(&init_lock);
	pthread_mutex_lock(&exit_lock);

	return NULL;
}

/*
 * Start one thread, check with kcmp(KCMP_FS) that it and the main thread
 * compare equal both before and after checkpoint/restore, then let the
 * thread exit.
 */
int main(int argc, char **argv)
{
	pid_t tid;
	int ret;
	pthread_t th;

	test_init(argc, argv);

	/* Take both locks up front so each later lock attempt blocks. */
	pthread_mutex_init(&init_lock, NULL);
	pthread_mutex_lock(&init_lock);
	pthread_mutex_init(&exit_lock, NULL);
	pthread_mutex_lock(&exit_lock);

	if (pthread_create(&th, NULL, thread_func, &tid)) {
		fail("Can't pthread_create");
		return 1;
	}

	/* Blocks until thread_func() has stored its tid and unlocked init_lock. */
	pthread_mutex_lock(&init_lock);

	/* Sanity check before C/R: any non-zero kcmp result aborts the test. */
	ret = kcmp(KCMP_FS, gettid(), tid, 0, 0);
	if (ret)
		exit(1);

	test_daemon();
	test_waitsig();

	/* The same comparison must still yield 0 after restore. */
	ret = kcmp(KCMP_FS, gettid(), tid, 0, 0);
	if (ret) {
		fail();
		exit(1);
	}

	/* Let the thread return from thread_func() and reap it. */
	pthread_mutex_unlock(&exit_lock);
	pthread_join(th, NULL);

	pass();

	return 0;
}
memset(auxv_orig, 0, sizeof(auxv_orig)); + memset(auxv, 0, sizeof(auxv)); + + test_init(argc, argv); + + read_from_proc("/proc/self/cmdline", cmdline_orig, sizeof(cmdline_orig)); + read_from_proc("/proc/self/environ", env_orig, sizeof(env_orig)); + read_from_proc("/proc/self/auxv", auxv_orig, sizeof(auxv_orig)); + + test_msg("old cmdline: %s\n", cmdline_orig); + test_msg("old environ: %s\n", env_orig); + + test_daemon(); + test_waitsig(); + + read_from_proc("/proc/self/cmdline", cmdline, sizeof(cmdline)); + read_from_proc("/proc/self/environ", env, sizeof(env)); + read_from_proc("/proc/self/auxv", auxv, sizeof(auxv)); + + test_msg("new cmdline: %s\n", cmdline); + test_msg("new environ: %s\n", env); + + if (strncmp(cmdline_orig, cmdline, sizeof(cmdline_orig))) { + fail("cmdline corrupted on restore"); + exit(1); + } + + if (strncmp(env_orig, env, sizeof(env_orig))) { + fail("envirion corrupted on restore"); + exit(1); + } + + if (cmp_auxv(auxv_orig, auxv, sizeof(auxv_orig))) { + fail("auxv corrupted on restore"); + exit(1); + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/cmdlinenv00.desc b/CRIU_code/test/zdtm/static/cmdlinenv00.desc new file mode 100644 index 0000000..2eac7e6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/cmdlinenv00.desc @@ -0,0 +1 @@ +{'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/config_inotify_irmap.c b/CRIU_code/test/zdtm/static/config_inotify_irmap.c new file mode 100644 index 0000000..831dc19 --- /dev/null +++ b/CRIU_code/test/zdtm/static/config_inotify_irmap.c @@ -0,0 +1,91 @@ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +/* + * This test reuses inotify_irmap test for testing configuration files + * functionality. For parts not related to configuration files, please + * refer to the original test case and it's author. 
+ */ + +const char *test_doc = "Default configuration files usage"; +const char *test_author = "Veronika Kabatova "; + +#define TDIR "/etc" +char test_files[2][128] = {TDIR"/zdtm-test", TDIR"/zdtm-test1",}; +#define CONFIG_PATH "../../zdtm_test_config.conf" + +#define BUFF_SIZE ((sizeof(struct inotify_event) + PATH_MAX)) + +int main (int argc, char *argv[]) +{ + char buf[BUFF_SIZE]; + int fd, wd, i; + + test_init(argc, argv); + + for (i = 0; i < 2; i++) { + unlink(test_files[i]); + if (creat(test_files[i], 0600) < 0) { + pr_perror("Can't make test file"); + exit(1); + } + } + fd = inotify_init1(IN_NONBLOCK); + if (fd < 0) { + pr_perror("inotify_init failed"); + goto err; + } + for (i = 0; i < 2; i++) { + wd = inotify_add_watch(fd, test_files[i], IN_OPEN); + if (wd < 0) { + pr_perror("inotify_add_watch failed"); + goto err; + } + } + + FILE *configfile = fopen(CONFIG_PATH, "w"); + if (configfile == NULL) { + pr_perror("Unable to create configuration file %s", CONFIG_PATH); + goto err; + } + fprintf(configfile, "force-irmap\t\nirmap-scan-path /zdtm/static\n"); + fclose(configfile); + + test_daemon(); + test_waitsig(); + + for (i = 0; i < 2; i++) { + memset(buf, 0, sizeof(buf)); + wd = open(test_files[i], O_RDONLY); + if (read(fd, buf, sizeof(buf)) <= 0) { + fail("No events in queue"); + unlink(CONFIG_PATH); + goto err; + } + } + + close(wd); + close(fd); + for (i = 0; i < 2; i++) + unlink(test_files[i]); + unlink(CONFIG_PATH); + pass(); + return 0; +err: + for (i = 0; i < 2; i++) + unlink(test_files[i]); + return 1; +} diff --git a/CRIU_code/test/zdtm/static/config_inotify_irmap.desc b/CRIU_code/test/zdtm/static/config_inotify_irmap.desc new file mode 100644 index 0000000..591ae71 --- /dev/null +++ b/CRIU_code/test/zdtm/static/config_inotify_irmap.desc @@ -0,0 +1,3 @@ +(lambda confpath: +{'flags': 'suid', 'opts': '--config %s' % (confpath) +}) (os.path.abspath('./zdtm_test_config.conf')) diff --git a/CRIU_code/test/zdtm/static/conntracks 
b/CRIU_code/test/zdtm/static/conntracks new file mode 100644 index 0000000..a30e0e2 --- /dev/null +++ b/CRIU_code/test/zdtm/static/conntracks @@ -0,0 +1,57 @@ +#!/bin/bash + + +export PATH=$PATH:${0%/*}/../../lib + +die() +{ + echo "$0:${BASH_LINENO[0]}: $*" >&2 + exit 1 +} + +fail() +{ + echo "FAIL: $0:${BASH_LINENO[0]}: $*" > "$outfile" + exit 1 +} + +do_or_fail() +{ + local failmsg="$1" output + shift + output="$(eval $@ 2>&1)" || + fail "$failmsg: $output" +} + +do_start() +{ + [ -f "$statefile" ] && die "state file $statefile aleady exists" + + do_or_fail "can't install a state match" \ + iptables -A INPUT \ + -m conntrack --ctstate RELATED,ESTABLISHED -j ACCEPT + + do_or_fail "can't list the loaded iptables" \ + iptables -L \> "$statefile" +} + +do_stop() +{ + do_or_fail "can't compare the iptables" \ + iptables -L \| diff -u "$statefile" - + + rm -f "$statefile" + + echo "PASS" > $outfile +} + +tmpargs="$(../lib/parseargs.sh --name=$0 \ + --flags-req=statefile,outfile \ + --flags-opt="start,stop" -- "$@")" || + die "can't parse command line" +eval "$tmpargs" + +[ -f "$outfile" ] && die "out file $outfile aleady exists" + +# expect "start" or "stop" +do_$1 diff --git a/CRIU_code/test/zdtm/static/conntracks.desc b/CRIU_code/test/zdtm/static/conntracks.desc new file mode 100644 index 0000000..95c58b4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/conntracks.desc @@ -0,0 +1 @@ +{'flags': 'noauto'} diff --git a/CRIU_code/test/zdtm/static/console.c b/CRIU_code/test/zdtm/static/console.c new file mode 100644 index 0000000..026eacb --- /dev/null +++ b/CRIU_code/test/zdtm/static/console.c @@ -0,0 +1,59 @@ +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check c/r for console device"; +const char *test_author = "Cyrill Gorcunov "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +int main(int argc, char ** argv) +{ + struct stat st1, st2; + int fd; + + 
test_init(argc, argv); + + if (mknod(filename, S_IFCHR | S_IRUSR | S_IWUSR, makedev(5,1))) { + pr_perror("Can't create console %s", filename); + return 1; + } + + fd = open(filename, O_RDONLY); + if (fd < 0) { + pr_perror("Open console %s failed", filename); + return 1; + } + + if (fstat(fd, &st1)) { + pr_perror("Can't stat %s console", filename); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (fstat(fd, &st2)) { + pr_perror("Can't stat %s console", filename); + return 1; + } + + if (st1.st_rdev != st2.st_rdev) { + fail("Console rdev mismatch %x != %x on %s", + (int)st1.st_rdev, (int)st2.st_rdev, + filename); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/console.desc b/CRIU_code/test/zdtm/static/console.desc new file mode 100644 index 0000000..d969725 --- /dev/null +++ b/CRIU_code/test/zdtm/static/console.desc @@ -0,0 +1 @@ +{'flavor': 'h ns', 'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/cow00.c b/CRIU_code/test/zdtm/static/cow00.c new file mode 100644 index 0000000..92446a1 --- /dev/null +++ b/CRIU_code/test/zdtm/static/cow00.c @@ -0,0 +1,113 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that cow memory are restored"; +const char *test_author = "Andrey Vagin +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that cow memory are restored"; +const char *test_author = "Andrey Vagin 5) + break; + + p = (void **)(addr + i * PAGE_SIZE); + test_msg("Read *%p = %p\n", p, p[0]); + if (write(fd, &p, sizeof(p)) != sizeof(p)) { + pr_perror("write"); + return -1; + } + if (read(fd, &p, sizeof(p)) != sizeof(p)) { + pr_perror("read"); + return -1; + } + test_msg("Child %p\n", p); + } + + close(fd_child); + close(fd_parent); + + if (map_child_ret) + *map_child_ret = map_child; + if (map_parent_ret) + 
*map_parent_ret = map_parent; + + // Return 0 for success, 1 if the pages differ. + return map_child != map_parent; +} + +static int child_prep(struct test_cases *test_cases, int fd) +{ + int i; + uint8_t *addr = test_cases->addr; + + for (i = 0; i < TEST_CASES; i++) { + struct test_case *tc = test_cases->tc + i; + if (tc->a_f_write_child) { + tc->crc_child = ~1; + datagen2(addr + i * PAGE_SIZE, PAGE_SIZE, &tc->crc_child); + } + if (tc->a_f_read_child) { + uint32_t crc = ~1; + + datasum(addr + i * PAGE_SIZE, PAGE_SIZE, &crc); + } + } + + return 0; +} + +static int child_check(struct test_cases *test_cases, int fd) +{ + int i, ret = 0; + uint8_t *addr = test_cases->addr; + + for (i = 0; i < TEST_CASES; i++) { + uint32_t crc = ~1; + struct test_case *tc = test_cases->tc + i; + + datasum(addr + i * PAGE_SIZE, PAGE_SIZE, &crc); + if (crc != tc->crc_child) { + errno = 0; + fail("%s[%#x]: %p child data mismatch (expected [%04x] got [%04x])", + test_cases->tname, i, addr + i * PAGE_SIZE, tc->crc_child, crc); + ret |= 1; + } + } + + return ret; +} + +static int parent_before_fork(struct test_cases *test_cases, int fd) +{ + uint8_t *addr; + int i; + + if (test_cases->init(test_cases)) + return -1; + + addr = test_cases->addr; + + for (i = 0; i < TEST_CASES; i++) { + struct test_case *tc = test_cases->tc + i; + tc->num = i; + + if (tc->b_f_write) { + tc->crc_parent = ~1; + datagen2(addr + i * PAGE_SIZE, PAGE_SIZE, &tc->crc_parent); + if (test_cases != &sep_tcs) + tc->crc_child = tc->crc_parent; + } + if (tc->b_f_read) { + uint32_t crc = ~1; + + datasum(addr + i * PAGE_SIZE, PAGE_SIZE, &crc); + } + } + + return 0; +} + +static int parent_post_fork(struct test_cases *test_cases, int fd) +{ + uint8_t *addr = test_cases->addr; + int i; + + for (i = 0; i < TEST_CASES; i++) { + struct test_case *tc = test_cases->tc + i; + + if (tc->a_f_write_parent) { + tc->crc_parent = ~1; + datagen2(addr + i * PAGE_SIZE, PAGE_SIZE, &tc->crc_parent); + } + + if (tc->a_f_read_parent) { + uint32_t 
crc = ~1; + + datasum(addr + i * PAGE_SIZE, PAGE_SIZE, &crc); + } + } + + return 0; +} + +static int parent_check(struct test_cases *test_cases, int fd) +{ + uint8_t *addr = test_cases->addr; + int i, ret = 0; + + for (i = 0; i < TEST_CASES; i++) { + struct test_case *tc = test_cases->tc + i; + uint32_t crc = ~1; + + datasum(addr + i * PAGE_SIZE, PAGE_SIZE, &crc); + if (crc != tc->crc_parent) { + errno = 0; + fail("%s[%#x]: %p parent data mismatch (expected [%04x] got [%04x])", + test_cases->tname, i, addr + i * PAGE_SIZE, tc->crc_parent, crc); + ret |= 1; + } + + if (test_cases == &sep_tcs) + continue; + + if (!tc->a_f_write_child && + !tc->a_f_write_parent && + tc->b_f_write) { + uint64_t map_child, map_parent; + int is_cow_ret; + + is_cow_ret = is_cow(addr + i * PAGE_SIZE, child_pid, getpid(), + &map_child, &map_parent, fd); + ret |= is_cow_ret; + if (is_cow_ret == 1) { + errno = 0; + fail("%s[%#x]: %p is not COW-ed (pagemap of " + "child=[%"PRIx64"], parent=[%"PRIx64"])", + test_cases->tname, i, addr + i * PAGE_SIZE, + map_child, map_parent); + } + } + } + + return ret; +} + +static int __init_cow(struct test_cases *tcs, int flags) +{ + int i; + void *addr; + + addr = mmap(NULL, PAGE_SIZE * (TEST_CASES + 2), + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (addr == MAP_FAILED) { + pr_perror("Can't allocate memory"); + return -1; + } + + /* + * Guard pages are used for preventing merging with other vma-s. + * In parent cow-ed and coinciding regions can be merged, but + * in child they cannot be, so COW will not be restored. 
FIXME + */ + mmap(addr, PAGE_SIZE, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + addr += PAGE_SIZE; + tcs->addr = addr; + mmap(addr + PAGE_SIZE * TEST_CASES, PAGE_SIZE, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED | flags, -1, 0); + + test_msg("addr[%s]=%p\n", tcs->tname, tcs->addr); + for (i = 0; i < TEST_CASES; i++) { + struct test_case *tc = tcs->tc + i; + tc->crc_parent = zero_crc; + tc->crc_child = zero_crc; + } + + return 0; +} + +static int init_cow(struct test_cases *tcs) +{ + return __init_cow(tcs, 0); +} + +static int init_cow_gd(struct test_cases *tcs) +{ + return __init_cow(tcs, MAP_GROWSDOWN); +} + +static int init_sep(struct test_cases *tcs) +{ + int i; + + tcs->addr = mmap(NULL, PAGE_SIZE * TEST_CASES, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (tcs->addr == MAP_FAILED) { + pr_perror("Can't allocate memory"); + return -1; + } + + test_msg("addr[%s]=%p\n", tcs->tname, tcs->addr); + for (i = 0; i < TEST_CASES; i++) { + struct test_case *tc = tcs->tc + i; + tc->crc_parent = zero_crc; + tc->crc_child = zero_crc; + } + + return 0; +} + +static int init_file(struct test_cases *tcs) +{ + int i, ret, fd; + uint8_t buf[PAGE_SIZE]; + uint32_t crc; + + fd = open(filename, O_TRUNC | O_CREAT | O_RDWR, 0600); + if (fd < 0) { + pr_perror("Unable to create a test file"); + return -1; + } + + for (i = 0; i < TEST_CASES; i++) { + struct test_case *tc = tcs->tc + i; + crc = ~1; + datagen2(buf, sizeof(buf), &crc); + ret = write(fd, buf, sizeof(buf)); + if (ret != sizeof(buf)) { + pr_perror("Unable to write data in test file %s", filename); + return -1; + } + + tc->crc_parent = crc; + tc->crc_child = crc; + } + + tcs->addr = mmap(NULL, PAGE_SIZE * TEST_CASES, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_FILE, fd, 0); + if (tcs->addr == MAP_FAILED) { + pr_perror("Can't allocate memory"); + return -1; + } + + test_msg("addr[%s]=%p\n", tcs->tname, tcs->addr); + close(fd); + + return 0; +} + +static int 
child(task_waiter_t *child_waiter, int fd) +{ + int ret = 0; + + sep_tcs.addr = mmap(sep_tcs.addr, PAGE_SIZE * TEST_CASES, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + if (sep_tcs.addr == MAP_FAILED) { + pr_perror("Can't allocate memory"); + return -1; + } + + EXECUTE_ACTION(child_prep, fd); + + task_waiter_complete_current(child_waiter); + + while (1) { + void **p; + ret = read(fd, &p, sizeof(p)); + if (ret == 0) + break; + if (ret != sizeof(p)) { + pr_perror("read"); + return -1; + } + test_msg("Read *%p = %p\n", p, p[0]); + p = ((void **)p)[0]; + if (write(fd, &p, sizeof(p)) != sizeof(p)) { + pr_perror("write"); + return -1; + } + ret = 0; + } + + ret = EXECUTE_ACTION(child_check, fd); + + // Exit code of child process, so return 2 for a test error, 1 for a + // test failure (child_check got mismatched checksums) and 0 for + // success. + return (ret < 0) ? 2 : (ret != 0); +} + +int main(int argc, char ** argv) +{ + uint8_t zero_page[PAGE_SIZE]; + int status = -1, ret = 0; + task_waiter_t child_waiter; + int pfd[2], fd; + + test_init(argc, argv); + + task_waiter_init(&child_waiter); + + memset(zero_page, 0, sizeof(zero_page)); + + datasum(zero_page, sizeof(zero_page), &zero_crc); + + if (socketpair(AF_UNIX, SOCK_SEQPACKET, 0, pfd)) { + pr_perror("pipe"); + return 1; + } + + if (EXECUTE_ACTION(parent_before_fork, -1)) + return 2; + + child_pid = test_fork(); + if (child_pid < 0) { + pr_perror("Can't fork"); + return 2; + } + + if (child_pid == 0) { + close(pfd[0]); + return child(&child_waiter, pfd[1]); + } + close(pfd[1]); + fd = pfd[0]; + + task_waiter_wait4(&child_waiter, child_pid); + + EXECUTE_ACTION(parent_post_fork, -1); + + test_daemon(); + + test_waitsig(); + + ret |= EXECUTE_ACTION(parent_check, fd); + + close(fd); + wait(&status); + + unlink(filename); + + if (WIFEXITED(status) && WEXITSTATUS(status) != 2) + ret |= WEXITSTATUS(status); + else + ret |= -1; + + if (ret == 0) + pass(); + + // Exit code, so return 2 for a 
test error, 1 for a test failure and 0 + // for success. + return (ret < 0) ? 2 : (ret != 0); +} diff --git a/CRIU_code/test/zdtm/static/cow01.desc b/CRIU_code/test/zdtm/static/cow01.desc new file mode 100644 index 0000000..24a8142 --- /dev/null +++ b/CRIU_code/test/zdtm/static/cow01.desc @@ -0,0 +1 @@ +{'flavor': 'h ns', 'flags': 'suid nolazy'} diff --git a/CRIU_code/test/zdtm/static/cr_veth.c b/CRIU_code/test/zdtm/static/cr_veth.c new file mode 100644 index 0000000..deef735 --- /dev/null +++ b/CRIU_code/test/zdtm/static/cr_veth.c @@ -0,0 +1,69 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "check that veth C/R-s right"; +const char *test_author = "Pavel Emelyanov "; + +#define IF_NAME "zdtmvthc0" + +static bool wait_for_veth(void) +{ + int i; + + for (i = 0; i < 10; i++) { + if (system("ip addr list dev " IF_NAME) == 0) + return true; + + sleep(1); + } + + return false; +} + +int main(int argc, char **argv) +{ + int ret = 1; + + test_init(argc, argv); + + if (!wait_for_veth()) { + fail("failed to inject veth device\n"); + return 1; + } + + if (system("ip addr list dev " IF_NAME " | sed -e 's/@.*://' > cr_veth.dump.state")) { + fail("can't save net config"); + goto out; + } + + test_daemon(); + test_waitsig(); + + if (system("ip addr list dev " IF_NAME " | sed -e 's/@.*://' > cr_veth.rst.state")) { + fail("can't get net config"); + goto out; + } + + if (system("diff cr_veth.rst.state cr_veth.dump.state")) { + fail("Net config differs after restore"); + goto out; + } + + pass(); + ret = 0; + +out: + return ret; +} diff --git a/CRIU_code/test/zdtm/static/cr_veth.checkskip b/CRIU_code/test/zdtm/static/cr_veth.checkskip new file mode 100644 index 0000000..2995e0c --- /dev/null +++ b/CRIU_code/test/zdtm/static/cr_veth.checkskip @@ -0,0 +1,2 @@ +#!/bin/bash +unshare --net ip link add type veth diff --git 
a/CRIU_code/test/zdtm/static/cr_veth.desc b/CRIU_code/test/zdtm/static/cr_veth.desc new file mode 100644 index 0000000..1273122 --- /dev/null +++ b/CRIU_code/test/zdtm/static/cr_veth.desc @@ -0,0 +1,4 @@ +{ 'deps': ['/bin/sh', '/bin/sed', '/bin/grep', '/sbin/ip|/bin/ip', '/usr/bin/diff'], + 'flags': 'suid', + 'flavor': 'ns uns', + 'ropts': '--external veth[zdtmvthc0]:zdtmvthh0@zdtmbr0'} diff --git a/CRIU_code/test/zdtm/static/cr_veth.hook b/CRIU_code/test/zdtm/static/cr_veth.hook new file mode 100644 index 0000000..a697827 --- /dev/null +++ b/CRIU_code/test/zdtm/static/cr_veth.hook @@ -0,0 +1,40 @@ +#!/bin/bash + +if [ "$1" == "--post-start" ]; then + set -e + + PIDF="zdtm/static/cr_veth.pid.inprogress" + while [ ! -f "$PIDF" ]; do + sleep ".1" + done + + TPID=$(cat $PIDF) + ps xaf + echo "-> $TPID" + + set -x + + ip l l + ip link add zdtmvthc0 type veth peer name zdtmvthh0 + ip link set zdtmvthc0 netns $TPID + + ip link del zdtmbr0 || true # Ignore the failure + ip link add zdtmbr0 type bridge + ip link set zdtmbr0 up + ip link set zdtmvthh0 master zdtmbr0 +elif [ "$1" == "--post-restore" ]; then + ip link list zdtmvthh0 + + if ! 
ip link list zdtmvthh0 | fgrep -q 'master zdtmbr0'; then + echo "Device missing or not in bridge" + exit 1 + fi + + echo "Device OK" +elif [ "$1" == "--pre-restore" -o "$1" == "--clean" ]; then + # Wait for the link to die + ip l l + while ip l l zdtmvthh0 ; do + sleep ".5" + done +fi diff --git a/CRIU_code/test/zdtm/static/criu-rtc.c b/CRIU_code/test/zdtm/static/criu-rtc.c new file mode 100644 index 0000000..8588f9f --- /dev/null +++ b/CRIU_code/test/zdtm/static/criu-rtc.c @@ -0,0 +1,124 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "criu-plugin.h" +#include "criu-log.h" + +#include "criu-rtc.pb-c.h" + +extern cr_plugin_dump_file_t cr_plugin_dump_file; +extern cr_plugin_restore_file_t cr_plugin_restore_file; + +int cr_plugin_dump_file(int fd, int id) +{ + CriuRtc e = CRIU_RTC__INIT; + char img_path[PATH_MAX]; + unsigned char buf[4096]; + int img_fd, ret, len; + unsigned long irqp; + struct stat st, st_rtc; + + if (fstat(fd, &st) == -1) { + pr_perror("fstat"); + return -1; + } + + ret = stat("/dev/rtc", &st_rtc); + if (ret == -1) { + pr_perror("fstat"); + return -1; + } + + if (major(st.st_rdev) != major(st_rtc.st_rdev) || + minor(st.st_rdev) != 0) + return -ENOTSUP; + + if (ioctl(fd, RTC_IRQP_READ, &irqp) == -1) { + pr_perror("RTC_IRQP_READ"); + return -1; + } + + e.irqp = irqp; + + snprintf(img_path, sizeof(img_path), "rtc.%x", id); + img_fd = openat(criu_get_image_dir(), img_path, O_WRONLY | O_CREAT); + if (img_fd < 0) { + pr_perror("Can't open %s", img_path); + return -1; + } + + len = criu_rtc__get_packed_size(&e); + if (len > sizeof(buf)) + return -1; + + criu_rtc__pack(&e, buf); + + ret = write(img_fd, buf, len); + if (ret != len) { + pr_perror("Unable to write in %s", img_path); + close(img_fd); + return -1; + } + + close(img_fd); + return 0; +} + +int cr_plugin_restore_file(int id) +{ + unsigned char buf[4096]; + char img_path[PATH_MAX]; + int img_fd, len, fd; + CriuRtc *e; + + 
snprintf(img_path, sizeof(img_path), "rtc.%x", id); + img_fd = openat(criu_get_image_dir(), img_path, O_RDONLY); + if (img_fd < 0) { + pr_perror("open(%s)", img_path); + return -ENOTSUP; + } + + len = read(img_fd, &buf, sizeof(buf)); + if (len <= 0) { + pr_perror("Unable to read from %s", img_path); + close(img_fd); + return -1; + } + close(img_fd); + + e = criu_rtc__unpack(NULL, len, buf); + if (e == NULL) { + pr_err("Unable to parse the RTC message %#x", id); + return -1; + } + + fd = open("/dev/rtc", O_RDWR); + if (fd < 0) { + pr_perror("open"); + return -1; + } + + if (ioctl(fd, RTC_IRQP_SET, e->irqp) == -1) { + pr_perror("RTC_IRQP_SET"); + close(fd); + return -1; + } + + criu_rtc__free_unpacked(e, NULL); + + if (ioctl(fd, RTC_PIE_ON, 0) == -1) { + pr_perror("RTC_PIE_ON"); + close(fd); + return -1; + } + + return fd; +} diff --git a/CRIU_code/test/zdtm/static/criu-rtc.proto b/CRIU_code/test/zdtm/static/criu-rtc.proto new file mode 100644 index 0000000..fcf8815 --- /dev/null +++ b/CRIU_code/test/zdtm/static/criu-rtc.proto @@ -0,0 +1,5 @@ +syntax = "proto2"; + +message criu_rtc { + required uint64 IRQP = 1; +} diff --git a/CRIU_code/test/zdtm/static/cwd00.c b/CRIU_code/test/zdtm/static/cwd00.c new file mode 100644 index 0000000..b0736f5 --- /dev/null +++ b/CRIU_code/test/zdtm/static/cwd00.c @@ -0,0 +1,68 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that cwd didn't change"; +const char *test_author = "Pavel Emelianov "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + char cwd1[256], cwd2[256]; + int fd; + + test_init(argc, argv); + + fd = open(".", O_DIRECTORY | O_RDONLY); + if (fd == -1) { + pr_perror("Unable to open the current dir"); + exit(1); + } + + if (mkdir(dirname, 0700)) { + pr_perror("can't make directory %s", dirname); + exit(1); + } + + if (chdir(dirname)) { + pr_perror("can't change directory to %s", 
dirname); + goto cleanup; + } + + if (!getcwd(cwd1, sizeof(cwd1))) { + pr_perror("can't get cwd"); + goto cleanup; + } + + test_daemon(); + test_waitsig(); + + if (!getcwd(cwd2, sizeof(cwd2))) { + fail("can't get cwd: %m\n"); + goto cleanup; + } + + if (strcmp(cwd1, cwd2)) + fail("%s != %s\n", cwd1, cwd2); + else + pass(); +cleanup: + /* return to the initial dir before writing out results */ + if (fchdir(fd)) { + pr_perror("can't restore cwd"); + exit(1); + } + if (rmdir(dirname)) { + pr_perror("can't remove directory %s", dirname); + exit(1); + } + return 0; +} diff --git a/CRIU_code/test/zdtm/static/cwd01.c b/CRIU_code/test/zdtm/static/cwd01.c new file mode 100644 index 0000000..4e10a6b --- /dev/null +++ b/CRIU_code/test/zdtm/static/cwd01.c @@ -0,0 +1,101 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "zdtmtst.h" + +const char *test_doc = "Check that removed cwd works"; +const char *test_author = "Pavel Emelianov "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + char cwd1[PATH_MAX], cwd2[PATH_MAX]; + int pid, p[2], aux, aux2, fd; + + test_init(argc, argv); + + pipe(p); + pid = fork(); + if (pid == 0) { + close(p[1]); + read(p[0], &aux, sizeof(aux)); + aux = rmdir(dirname); + exit(aux ? 
1 : 0); + } + + fd = open(".", O_DIRECTORY | O_RDONLY); + if (fd == -1) { + pr_perror("Unable to open the current dir"); + exit(1); + } + + if (mkdir(dirname, 0700)) { + pr_perror("can't make directory %s", dirname); + exit(1); + } + + if (chdir(dirname)) { + pr_perror("can't change directory to %s", dirname); + goto cleanup; + } + + close(p[1]); + close(p[0]); + waitpid(pid, &aux, 0); + if (!WIFEXITED(aux) || WEXITSTATUS(aux) != 0) { + pr_perror("can't remove dir"); + goto cleanup; + } + + aux = readlink("/proc/self/cwd", cwd1, sizeof(cwd1)); + if (aux < 0) { + pr_perror("can't get cwd"); + goto cleanup; + } + if (aux == sizeof(cwd1)) { + pr_perror("A buffer is too small"); + goto cleanup; + } + + cwd1[aux] = '\0'; + + test_daemon(); + test_waitsig(); + + aux2 = readlink("/proc/self/cwd", cwd2, sizeof(cwd2)); + if (aux2 < 0) { + fail("can't get cwd: %m\n"); + goto cleanup; + } + if (aux2 == sizeof(cwd2)) { + pr_perror("A buffer is too small"); + goto cleanup; + } + + cwd2[aux2] = '\0'; + + /* FIXME -- criu adds a suffix to removed cwd */ + if (strncmp(cwd1, cwd2, aux)) + fail("%s != %s\n", cwd1, cwd2); + else + pass(); +cleanup: + /* return to the initial dir before writing out results */ + if (fchdir(fd)) { + pr_perror("can't restore cwd"); + exit(1); + } + + rmdir(dirname); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/cwd02.c b/CRIU_code/test/zdtm/static/cwd02.c new file mode 100644 index 0000000..82f2586 --- /dev/null +++ b/CRIU_code/test/zdtm/static/cwd02.c @@ -0,0 +1,92 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that removed and opened cwd are kept"; +const char *test_author = "Pavel Emelianov "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + int cwd, fd, pid, p[2], aux; + struct stat std, stf; + + test_init(argc, argv); + + pipe(p); + pid = fork(); + if (pid == 0) { + close(p[1]); + read(p[0], &aux, 
sizeof(aux)); + aux = rmdir(dirname); + exit(aux ? 1 : 0); + } + + cwd = open(".", O_DIRECTORY | O_RDONLY); + if (cwd == -1) { + pr_perror("Unable to open the current dir"); + exit(1); + } + + if (mkdir(dirname, 0700)) { + pr_perror("can't make directory %s", dirname); + exit(1); + } + + if ((fd = open(dirname, O_DIRECTORY)) < 0) { + pr_perror("can't open dir %s", dirname); + goto cleanup; + } + + if (chdir(dirname)) { + pr_perror("can't change directory to %s", dirname); + goto cleanup; + } + + close(p[1]); + close(p[0]); + waitpid(pid, &aux, 0); + if (!WIFEXITED(aux) || WEXITSTATUS(aux) != 0) { + pr_perror("can't remove dir"); + goto cleanup; + } + + test_daemon(); + test_waitsig(); + + if (fstat(fd, &stf) < 0) { + fail("dir fd closed\n"); + goto cleanup; + } + + if (stat("/proc/self/cwd", &std) < 0) { + fail("cwd is not OK\n"); + goto cleanup; + } + + if (stf.st_ino != std.st_ino || + stf.st_dev != std.st_dev) { + fail("cwd and opened fd are not the same\n"); + goto cleanup; + } + + pass(); + +cleanup: + /* return to the initial dir before writing out results */ + if (fchdir(cwd)) { + pr_perror("can't restore cwd"); + exit(1); + } + + rmdir(dirname); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/del_standalone_un.c b/CRIU_code/test/zdtm/static/del_standalone_un.c new file mode 100644 index 0000000..d820006 --- /dev/null +++ b/CRIU_code/test/zdtm/static/del_standalone_un.c @@ -0,0 +1,124 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that deleted unix sockets are restored correctly"; +const char *test_author = "Tycho Andersen "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +static int fill_sock_name(struct sockaddr_un *name, const char *filename) +{ + char *cwd; + + cwd = get_current_dir_name(); + if (strlen(filename) + strlen(cwd) + 1 >= sizeof(name->sun_path)) + return -1; + + name->sun_family = AF_LOCAL; + 
ssprintf(name->sun_path, "%s/%s", cwd, filename); + return 0; +} + +static int bind_and_listen(struct sockaddr_un *addr) +{ + int sk; + + sk = socket(PF_UNIX, SOCK_STREAM, 0); + if (sk < 0) { + fail("socket"); + return -1; + } + + if (bind(sk, (struct sockaddr *) addr, sizeof(*addr))) { + fail("bind %s", addr->sun_path); + close(sk); + return -1; + } + + if (listen(sk, 1)) { + fail("listen"); + close(sk); + return -1; + } + + return sk; +} + +int main(int argc, char **argv) +{ + struct sockaddr_un addr; + int sk1 = -1, sk2 = -1, ret = 1; + struct stat sb; + char filename[PATH_MAX], temp[PATH_MAX]; + + test_init(argc, argv); + + sprintf(filename, "%s/sock", dirname); + sprintf(temp, "%s/temp", dirname); + + if (mkdir(dirname, 0755) < 0) { + fail("mkdir"); + goto out; + } + + if (fill_sock_name(&addr, filename) < 0) { + pr_err("filename \"%s\" is too long\n", filename); + goto out; + } + + sk1 = bind_and_listen(&addr); + if (sk1 < 0) + goto out; + + if (rename(filename, temp) < 0) { + fail("rename"); + goto out; + } + + sk2 = bind_and_listen(&addr); + if (sk2 < 0) + goto out; + + if (rename(temp, filename) < 0) { + fail("rename2"); + goto out; + } + + test_daemon(); + test_waitsig(); + + if (getsockopt(sk1, 0, 0, NULL, 0) && errno != EOPNOTSUPP) { + fail("socket 1 didn't survive restore"); + goto out; + } + + if (getsockopt(sk2, 0, 0, NULL, 0) && errno != EOPNOTSUPP) { + fail("socket 2 didn't survive restore"); + goto out; + } + + if (stat(addr.sun_path, &sb) != 0) { + fail("%s doesn't exist after restore\n", addr.sun_path); + goto out; + } + + pass(); + ret = 0; +out: + if (sk1 > 0) + close(sk1); + if (sk2 > 0) + close(sk2); + rmdir(dirname); + return ret; +} diff --git a/CRIU_code/test/zdtm/static/del_standalone_un.desc b/CRIU_code/test/zdtm/static/del_standalone_un.desc new file mode 100644 index 0000000..289362e --- /dev/null +++ b/CRIU_code/test/zdtm/static/del_standalone_un.desc @@ -0,0 +1 @@ +{'flavor': 'h ns uns'} diff --git 
a/CRIU_code/test/zdtm/static/deleted_dev.c b/CRIU_code/test/zdtm/static/deleted_dev.c new file mode 100644 index 0000000..41319b7 --- /dev/null +++ b/CRIU_code/test/zdtm/static/deleted_dev.c @@ -0,0 +1,87 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that we can migrate with a device special file " + "open and unlinked before migration"; +const char *test_author = "Roman Kagan "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +/* + * Create a character device node (the /dev/null numbers), open it and + * unlink it, then wait in test_daemon()/test_waitsig(). Afterwards check + * that the still-open descriptor reports the same mode and rdev and that + * the name is still gone. + */ +int main(int argc, char **argv) +{ + int fd, rc; + struct stat st; + /* /dev/null params - sure to exist in a VPS */ + mode_t mode = S_IFCHR | 0700; + dev_t dev = makedev(1, 3); + + test_init(argc, argv); + + if (mknod(filename, mode, dev)) { + pr_perror("can't make device file \"%s\"", filename); + exit(1); + } + + fd = open(filename, O_RDWR); + if (fd < 0) { + pr_perror("can't open %s", filename); + goto out; + } + + if (unlink(filename) < 0) { + pr_perror("can't unlink %s", filename); + goto out; + } + + test_daemon(); + test_waitsig(); + + if (fstat(fd, &st) < 0) { + fail("can't stat %s: %m", filename); + goto out; + } + + if (st.st_mode != mode || st.st_rdev != dev) { + fail("%s is no longer the device file we had", filename); + test_msg("mode %x want %x, dev %llx want %llx\n", + st.st_mode, mode, + (long long unsigned)st.st_rdev, + (long long unsigned)dev); + goto out; + } + + rc = close(fd); + /* Whatever close() returned, the descriptor must not be closed again at out: */ + fd = -1; + if (rc < 0) { + fail("can't close %s: %m", filename); + goto out; + } + + if (unlink(filename) != -1 || errno != ENOENT) { + fail("file %s should have been deleted before migration: unlink: %m\n", filename); + goto out; + } + + pass(); +out: + /* fd is negative if open() failed or the descriptor was already closed above */ + if (fd >= 0) + close(fd); + unlink(filename); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/deleted_dev.desc b/CRIU_code/test/zdtm/static/deleted_dev.desc new file mode 100644 index 0000000..d969725 --- /dev/null +++ b/CRIU_code/test/zdtm/static/deleted_dev.desc @@ -0,0 +1 @@ +{'flavor': 'h ns', 'flags': 'suid'} diff --git 
a/CRIU_code/test/zdtm/static/deleted_unix_sock.c b/CRIU_code/test/zdtm/static/deleted_unix_sock.c new file mode 100644 index 0000000..bcc33f3 --- /dev/null +++ b/CRIU_code/test/zdtm/static/deleted_unix_sock.c @@ -0,0 +1,228 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Create a unix socket, and destroy it before " + "migration; check that the child can write to it " + "and the parent can read from it after migration"; +const char *test_author = "Roman Kagan "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +/* + * Fill name with the AF_LOCAL address of the absolute path cwd/filename. + * Returns 0 on success, -1 if the current directory cannot be obtained or + * the path does not fit into sun_path. + */ +static int fill_sock_name(struct sockaddr_un *name, const char *filename) +{ + char *cwd; + int ret = -1; + + /* get_current_dir_name() hands back a malloc()ed string or NULL */ + cwd = get_current_dir_name(); + if (!cwd) + return -1; + + if (strlen(filename) + strlen(cwd) + 1 >= sizeof(name->sun_path)) + goto out; + + name->sun_family = AF_LOCAL; + sprintf(name->sun_path, "%s/%s", cwd, filename); + ret = 0; +out: + free(cwd); + return ret; +} + +/* + * Create a unix stream socket bound to filename (resolved against the cwd) + * and put it into the listening state. Returns the socket, or -1 on error. + */ +static int setup_srv_sock(void) +{ + struct sockaddr_un name; + int sock; + + if (fill_sock_name(&name, filename) < 0) { + pr_perror("filename \"%s\" is too long", filename); + return -1; + } + + sock = socket(PF_LOCAL, SOCK_STREAM, 0); + if (sock < 0) { + pr_perror("can't create socket"); + return -1; + } + + if (bind(sock, (struct sockaddr *) &name, SUN_LEN(&name)) < 0) { + pr_perror("can't bind to socket \"%s\"", filename); + goto err; + } + + if (listen(sock, 1) < 0) { + pr_perror("can't listen on a socket \"%s\"", filename); + goto err; + } + + return sock; +err: + close(sock); + return -1; +} + +/* + * Create a unix stream socket and connect it to filename (resolved against + * the cwd). Returns the socket, or -1 on error. + */ +static int setup_clnt_sock(void) +{ + struct sockaddr_un name; + int sock; + + if (fill_sock_name(&name, filename) < 0) + return -1; + + sock = socket(PF_LOCAL, SOCK_STREAM, 0); + if (sock < 0) + return -1; + + if (connect(sock, (struct sockaddr *) &name, SUN_LEN(&name)) < 0) + goto err; + + return sock; +err: + close(sock); + return -1; +} + +/* + * The parent listens on the socket and forks a child that connects to it; + * the parent accepts the connection and unlinks the socket file before + * test_daemon(). After test_waitsig() the parent sends SIGTERM to the + * child; the child, once its own test_waitsig() returns, writes a generated + * buffer into the connection. The parent reaps the child, reads the buffer + * back and checks its CRC. + */ +int main(int argc, char ** argv) +{ + int sock, acc_sock, ret; + pid_t pid; + uint32_t crc; + uint8_t buf[1000]; +
+ test_init(argc, argv); + + sock = setup_srv_sock(); + if (sock < 0) + exit(1); + + pid = test_fork(); + if (pid < 0) { + pr_perror("can't fork"); + exit(1); + } + + if (pid == 0) { /* child writes to the unlinked socket and returns */ + close(sock); + + sock = setup_clnt_sock(); + if (sock < 0) + _exit(1); + + test_waitsig(); + + crc = ~0; + datagen(buf, sizeof(buf), &crc); + if (write(sock, buf, sizeof(buf)) != sizeof(buf)) { + pr_perror("can't write to socket"); + exit(errno); + } + + close(sock); + exit(0); + } + + acc_sock = accept(sock, NULL, NULL); + if (acc_sock < 0) { + pr_perror("can't accept() the connection on \"%s\"", filename); + goto out_kill; + } + + close(sock); + sock = acc_sock; + + if (unlink(filename)) { + pr_perror("can't unlink %s", filename); + goto out_kill; + } + + test_daemon(); + test_waitsig(); + + if (kill(pid, SIGTERM)) { + fail("terminating the child failed: %m\n"); + goto out; + } + + if (wait(&ret) != pid) { + fail("wait() returned wrong pid %d: %m\n", pid); + goto out; + } + + /* Decode the raw wait status before ret is reused for the exit code */ + if (WIFSIGNALED(ret)) { + fail("child exited on unexpected signal %d\n", WTERMSIG(ret)); + goto out; + } + if (WIFEXITED(ret)) { + ret = WEXITSTATUS(ret); + if (ret) { + fail("child exited with nonzero code %d (%s)\n", ret, strerror(ret)); + goto out; + } + } + + if (read(sock, buf, sizeof(buf)) != sizeof(buf)) { + fail("can't read %s: %m\n", filename); + goto out; + } + + crc = ~0; + if (datachk(buf, sizeof(buf), &crc)) { + fail("CRC mismatch\n"); + goto out; + } + + + ret = close(sock); + sock = -1; /* never close it a second time at out: */ + if (ret) { + fail("close failed: %m\n"); + goto out; + } + + if (unlink(filename) != -1 || errno != ENOENT) { + fail("file %s should have been deleted before migration: unlink: %m\n", filename); + goto out; + } + + pass(); + /* The child is reaped and the socket closed: skip the cleanup labels */ + return 0; + +out_kill: + kill(pid, SIGTERM); +out: + if (sock >= 0) + close(sock); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/different_creds.c b/CRIU_code/test/zdtm/static/different_creds.c new file mode 100644 index 0000000..44a87c4 --- /dev/null +++ 
b/CRIU_code/test/zdtm/static/different_creds.c @@ -0,0 +1,148 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that threads with different creds aren't checkpointed"; +const char *test_author = "Tycho Andersen "; + +void *drop_caps_and_wait(void *arg) +{ + int fd = *((int *) arg), i; + void *retcode = (void *)0xdeadbeaf; + cap_t caps; + char c; + + typedef struct cap_set { + cap_flag_value_t val; + cap_flag_value_t new; + cap_flag_t flag; + cap_value_t bit; + } cap_set_t; + + cap_set_t src[] = { + { + .val = CAP_CLEAR, + .flag = CAP_EFFECTIVE, + .bit = CAP_CHOWN, + }, + { + .val = CAP_SET, + .flag = CAP_EFFECTIVE, + .bit = CAP_DAC_OVERRIDE, + }, + { + .val = CAP_CLEAR, + .flag = CAP_INHERITABLE, + .bit = CAP_SETPCAP, + }, + { + .val = CAP_SET, + .flag = CAP_INHERITABLE, + .bit = CAP_NET_BIND_SERVICE, + }, + }; + + caps = cap_get_proc(); + if (!caps) { + pr_perror("cap_get_proc"); + return NULL; + } + + for (i = 0; i < ARRAY_SIZE(src); i++) { + if (cap_set_flag(caps, src[i].flag, 1, &src[i].bit, src[i].val) < 0) { + pr_perror("Can't setup CAP %s", cap_to_name(src[i].bit)); + goto die; + } + } + + if (cap_set_proc(caps) < 0) { + pr_perror("cap_set_proc"); + goto die; + } + + if (write(fd, "a", 1) != 1) { + pr_perror("Unable to send a status"); + goto die; + } + + if (read(fd, &c, 1) != 1) { + pr_perror("Unable to read a status"); + goto die; + } + + for (i = 0; i < ARRAY_SIZE(src); i++) { + if (cap_get_flag(caps, src[i].bit, src[i].flag, &src[i].new) < 0) { + pr_perror("Can't get CAP %s", cap_to_name(src[i].bit)); + goto die; + } + + if (src[i].val != src[i].new) { + pr_err("Val mismatch on CAP %s\n", cap_to_name(src[i].bit)); + goto die; + } + } + + retcode = NULL; +die: + cap_free(caps); + return retcode; +} + +int main(int argc, char ** argv) +{ + int pipefd[2]; + pthread_t thr; + char c; + void *retcode; + + test_init(argc, argv); + + if 
(socketpair(AF_FILE, SOCK_SEQPACKET, 0, pipefd)) { + pr_perror("pipe"); + return -1; + } + + if (pthread_create(&thr, NULL, drop_caps_and_wait, &pipefd[0])) { + pr_perror("Unable to create thread"); + return -1; + } + + /* + * Wait for child to signal us that it has dropped caps. + */ + if (read(pipefd[1], &c, 1) != 1) { + pr_perror("read"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (write(pipefd[1], &c, 1) != 1) { + pr_perror("write"); + return 1; + } + + if (pthread_join(thr, &retcode)) { + pr_perror("Unable to jount a thread"); + return 1; + } + + if (retcode != NULL) { + fail("retcode returned %p", retcode); + return 1; + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/different_creds.desc b/CRIU_code/test/zdtm/static/different_creds.desc new file mode 100644 index 0000000..fa2c82d --- /dev/null +++ b/CRIU_code/test/zdtm/static/different_creds.desc @@ -0,0 +1 @@ +{'flavor': 'h', 'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/dumpable01.c b/CRIU_code/test/zdtm/static/dumpable01.c new file mode 100644 index 0000000..e5dfc9a --- /dev/null +++ b/CRIU_code/test/zdtm/static/dumpable01.c @@ -0,0 +1,48 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check dumpable flag handling (dumpable case)"; +const char *test_author = "Filipe Brandenburger "; + +int main(int argc, char **argv) +{ + int save_dumpable; + int dumpable; + + test_init(argc, argv); + + save_dumpable = prctl(PR_GET_DUMPABLE); + if (save_dumpable < 0) { + pr_perror("error getting prctl(PR_GET_DUMPABLE) before dump"); + return 1; + } +#ifdef DEBUG + test_msg("DEBUG: before dump: dumpable=%d\n", save_dumpable); +#endif + + /* Wait for criu dump and restore. 
*/ + test_daemon(); + test_waitsig(); + + dumpable = prctl(PR_GET_DUMPABLE); + if (dumpable < 0) { + pr_perror("error getting prctl(PR_GET_DUMPABLE) after restore"); + return 1; + } +#ifdef DEBUG + test_msg("DEBUG: after dump: dumpable=%d\n", dumpable); +#endif + + if (dumpable != save_dumpable) { + errno = 0; + fail("dumpable flag was not preserved over migration"); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/dumpable02.c b/CRIU_code/test/zdtm/static/dumpable02.c new file mode 100644 index 0000000..024371b --- /dev/null +++ b/CRIU_code/test/zdtm/static/dumpable02.c @@ -0,0 +1,208 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check dumpable flag handling (non-dumpable case)"; +const char *test_author = "Filipe Brandenburger "; + +int dumpable_server() { + char buf[256]; + int ret; + + for (;;) { + ret = read(0, buf, sizeof(buf)); + if (ret == 0) + break; + ret = snprintf(buf, sizeof(buf), "DUMPABLE:%d\n", prctl(PR_GET_DUMPABLE)); + write(1, buf, ret); + } + return 0; +} + +int get_dumpable_from_pipes(int pipe_input, int pipe_output) { + char buf[256]; + int len; + long value; + char *endptr = NULL; + + /* input and output are from the child's point of view. 
*/ + + write(pipe_input, "GET\n", 4); + len = read(pipe_output, buf, sizeof(buf) - 1); + if (len < 0) { + pr_perror("error in parent reading from pipe"); + return -1; + } + buf[len] = 0; + + if (memcmp(buf, "DUMPABLE:", 9) != 0) { + pr_perror("child returned [%s]", buf); + return -1; + } + + value = strtol(&buf[9], &endptr, 10); + if (!endptr || *endptr != '\n' || endptr != buf + len - 1) { + pr_perror("child returned [%s]", buf); + return -1; + } + + return (int)value; +} + + +int main(int argc, char **argv) +{ + int pipe_input[2]; + int pipe_output[2]; + int save_dumpable; + int dumpable; + int ret; + pid_t pid; + pid_t waited; + int status; + + /* + * Check if we are being re-executed to spawn the dumpable server. This + * re-execution is what essentially causes the dumpable flag to be + * cleared since we have execute but not read permissions to the + * binary. + */ + if (getenv("DUMPABLE_SERVER")) + return dumpable_server(); + + /* + * Otherwise, do normal startup and spawn a dumpable server. While we + * are still running as root, chmod() the binary to give it execute but + * not read permissions, that way when we execv() it as a non-root user + * the kernel will drop our dumpable flag and reset it to the value in + * /proc/sys/fs/suid_dumpable. + */ + ret = chmod(argv[0], 0111); + if (ret < 0) { + pr_perror("error chmodding %s", argv[0]); + return 1; + } + + test_init(argc, argv); + + ret = pipe(pipe_input); + if (ret < 0) { + pr_perror("error creating input pipe"); + return 1; + } + + ret = pipe(pipe_output); + if (ret < 0) { + pr_perror("error creating output pipe"); + return 1; + } + + pid = fork(); + if (pid < 0) { + pr_perror("error forking the dumpable server"); + return 1; + } + + if (pid == 0) { + /* + * Child process will execv() the dumpable server. Start by + * reopening stdin and stdout to use the pipes, then set the + * environment variable and execv() the same binary. 
+ */ + close(0); + close(1); + + ret = dup2(pipe_input[0], 0); + if (ret < 0) { + pr_perror("could not dup2 pipe into child's stdin"); + return 1; + } + + ret = dup2(pipe_output[1], 1); + if (ret < 0) { + pr_perror("could not dup2 pipe into child's stdout"); + return 1; + } + + close(pipe_output[0]); + close(pipe_output[1]); + close(pipe_input[0]); + close(pipe_input[1]); + + ret = setenv("DUMPABLE_SERVER", "yes", 1); + if (ret < 0) { + pr_perror("could not set the DUMPABLE_SERVER env variable"); + return 1; + } + + execl(argv[0], "dumpable_server", NULL); + pr_perror("could not execv %s as a dumpable_server\nError No: %d", argv[0], errno); + return 1; + } + + /* + * Parent process, write to the pipe_input socket to ask the server + * child to tell us what its dumpable flag value is on its side. + */ + close(pipe_input[0]); + close(pipe_output[1]); + + save_dumpable = get_dumpable_from_pipes(pipe_input[1], pipe_output[0]); + if (save_dumpable < 0) return 1; +#ifdef DEBUG + test_msg("DEBUG: before dump: dumpable=%d\n", save_dumpable); +#endif + + /* Wait for dump and restore. */ + test_daemon(); + test_waitsig(); + + dumpable = get_dumpable_from_pipes(pipe_input[1], pipe_output[0]); + if (dumpable < 0) return 1; +#ifdef DEBUG + test_msg("DEBUG: after restore: dumpable=%d\n", dumpable); +#endif + + if (dumpable != save_dumpable) { + errno = 0; + fail("dumpable flag was not preserved over migration"); + return 1; + } + + /* Closing the pipes will terminate the child server. 
*/ + close(pipe_input[1]); + close(pipe_output[0]); + + waited = wait(&status); + if (waited < 0) { + pr_perror("error calling wait on the child"); + return 1; + } + errno = 0; + if (waited != pid) { + pr_perror("waited pid %d did not match child pid %d", + waited, pid); + return 1; + } + if (!WIFEXITED(status)) { + pr_perror("child dumpable server returned abnormally with status=%d", + status); + return 1; + } + if (WEXITSTATUS(status) != 0) { + pr_perror("child dumpable server returned rc=%d", + WEXITSTATUS(status)); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/dumpable02.desc b/CRIU_code/test/zdtm/static/dumpable02.desc new file mode 100644 index 0000000..f8c8979 --- /dev/null +++ b/CRIU_code/test/zdtm/static/dumpable02.desc @@ -0,0 +1 @@ +{'flavor': 'h ns', 'flags': 'nouser'} diff --git a/CRIU_code/test/zdtm/static/env00.c b/CRIU_code/test/zdtm/static/env00.c new file mode 100644 index 0000000..1feabfa --- /dev/null +++ b/CRIU_code/test/zdtm/static/env00.c @@ -0,0 +1,39 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that environment didn't change"; +const char *test_author = "Pavel Emelianov "; + +char *envname; +TEST_OPTION(envname, string, "environment variable name", 1); + +int main(int argc, char **argv) +{ + char *env; + + test_init(argc, argv); + + if (setenv(envname, test_author, 1)) { + pr_perror("Can't set env var \"%s\" to \"%s\"", envname, test_author); + exit(1); + } + + test_daemon(); + test_waitsig(); + + env = getenv(envname); + if (!env) { + fail("can't get env var \"%s\": %m\n", envname); + goto out; + } + + if (strcmp(env, test_author)) + fail("%s != %s\n", env, test_author); + else + pass(); +out: + return 0; +} diff --git a/CRIU_code/test/zdtm/static/epoll.c b/CRIU_code/test/zdtm/static/epoll.c new file mode 100644 index 0000000..ce3b176 --- /dev/null +++ b/CRIU_code/test/zdtm/static/epoll.c @@ -0,0 +1,137 @@ +#include + +#include +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check for epoll"; +const char *test_author = "Andrei Vagin "; + +#define DUPFDNO 999 + +int main(int argc, char *argv[]) +{ + int epollfd1, epollfd2, fd; + struct epoll_event ev; + int i, ret; + + struct { + int pipefd[2]; + } pipes[250]; + + test_init(argc, argv); + + epollfd1 = epoll_create(1); + if (epollfd1 < 0) { + pr_perror("epoll_create failed"); + exit(1); + } + epollfd2 = epoll_create(1); + if (epollfd2 < 0) { + pr_perror("epoll_create failed"); + exit(1); + } + + memset(&ev, 0, sizeof(ev)); + ev.events = EPOLLIN | EPOLLOUT; + + for (i = 0; i < ARRAY_SIZE(pipes); i++) { + if (pipe(pipes[i].pipefd)) { + pr_err("Can't create pipe %d\n", i); + exit(1); + } + + if (i % 2) { + int nfd; + + nfd = dup2(pipes[i].pipefd[0], i + 700); + if (nfd < 0) { + pr_err("dup2"); + exit(1); + } + close(pipes[i].pipefd[0]); + pipes[i].pipefd[0] = nfd; + } + + ev.data.u64 = i; + fd = dup2(pipes[i].pipefd[0], DUPFDNO); + if (fd < 0) { + pr_perror("Can't dup %d to %d", pipes[i].pipefd[0], DUPFDNO); + exit(1); + } + + test_msg("epoll %d add %d native\n", epollfd1, pipes[i].pipefd[0]); + if (epoll_ctl(epollfd1, EPOLL_CTL_ADD, pipes[i].pipefd[0], &ev)) { + pr_perror("Can't add pipe %d", pipes[i].pipefd[0]); + exit(1); + } + + test_msg("epoll %d add %d dup'ed from %d\n", epollfd1, fd, pipes[i].pipefd[0]); + if (epoll_ctl(epollfd2, EPOLL_CTL_ADD, fd, &ev)) { + pr_perror("Can't add pipe %d", fd); + exit(1); + } + + close(fd); + test_msg("epoll source %d closed\n", fd); + } + + test_daemon(); + test_waitsig(); + + ret = 0; + for (i = 0; i < ARRAY_SIZE(pipes); i++) { + uint8_t cw = 1, cr; + + if (write(pipes[i].pipefd[1], &cw, sizeof(cw)) != sizeof(cw)) { + pr_perror("Unable to write into a pipe\n"); + return 1; + } + + if (epoll_wait(epollfd1, &ev, 1, -1) != 1) { + pr_perror("Unable to wain events"); + return 1; + } + 
if (ev.data.u64 != i) { + pr_err("ev.fd=%d ev.data.u64=%#llx (%d expected)\n", + ev.data.fd, (long long)ev.data.u64, i); + ret |= 1; + } + + if (epoll_wait(epollfd2, &ev, 1, -1) != 1) { + pr_perror("Unable to wain events"); + return 1; + } + if (ev.data.u64 != i) { + pr_err("ev.fd=%d ev.data.u64=%#llx (%d expected)\n", + ev.data.fd, (long long)ev.data.u64, i); + ret |= 1; + } + + if (read(pipes[i].pipefd[0], &cr, sizeof(cr)) != sizeof(cr)) { + pr_perror("read"); + return 1; + } + } + + if (ret) + return 1; + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/epoll.desc b/CRIU_code/test/zdtm/static/epoll.desc new file mode 100644 index 0000000..72cc898 --- /dev/null +++ b/CRIU_code/test/zdtm/static/epoll.desc @@ -0,0 +1 @@ +{ 'feature' : 'kcmp_epoll' } diff --git a/CRIU_code/test/zdtm/static/eventfs00.c b/CRIU_code/test/zdtm/static/eventfs00.c new file mode 100644 index 0000000..72fd38a --- /dev/null +++ b/CRIU_code/test/zdtm/static/eventfs00.c @@ -0,0 +1,98 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +#ifndef F_SETSIG +#define F_SETSIG 10 /* for sockets. */ +#define F_GETSIG 11 /* for sockets. 
*/ +#endif + +const char *test_doc = "Check for eventfs"; +const char *test_author = "Cyrill Gorcunov "; + +#define EVENTFD_INITIAL 30 +#define EVENTFD_FINAL 90 /* EVENTFD_INITIAL plus the two writes of EVENTFD_INITIAL done in main() */ + +int main(int argc, char *argv[]) /* builds an eventfd counter, waits in test_waitsig(), then checks the counter value */ +{ + int efd, ret, epollfd; + int pipefd[2]; + uint64_t v = EVENTFD_INITIAL; + struct epoll_event ev; + + test_init(argc, argv); + + epollfd = epoll_create(1); + if (epollfd < 0) { + fail("epoll_create"); + exit(1); + } + + efd = eventfd((unsigned int)v, EFD_NONBLOCK); /* counter starts at EVENTFD_INITIAL */ + if (efd < 0) { + fail("eventfd"); + exit(1); + } + + memset(&ev, 0xff, sizeof(ev)); + ev.events = EPOLLIN | EPOLLOUT; + + if (pipe(pipefd)) { + fail("pipe"); + exit(1); + } + + if (epoll_ctl(epollfd, EPOLL_CTL_ADD, pipefd[0], &ev)) { + fail("epoll_ctl"); + exit(1); + } + + test_msg("created eventfd with %"PRIu64"\n", v); + + ret = write(efd, &v, sizeof(v)); /* first increment by EVENTFD_INITIAL */ + if (ret != sizeof(v)) { + fail("write"); + exit(1); + } + + ret = write(efd, &v, sizeof(v)); /* second increment by EVENTFD_INITIAL */ + if (ret != sizeof(v)) { + fail("write"); + exit(1); + } + + test_daemon(); + test_waitsig(); + + ret = read(efd, &v, sizeof(v)); /* fetch the accumulated counter once test_waitsig() has returned */ + if (ret != sizeof(v)) { + fail("read"); /* this branch reports a failed read(), not a write() */ + exit(1); + } + + if (v != EVENTFD_FINAL) { + fail("EVENTFD_FINAL mismatch\n"); + exit(1); + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/fanotify00.c b/CRIU_code/test/zdtm/static/fanotify00.c new file mode 100644 index 0000000..e948a63 --- /dev/null +++ b/CRIU_code/test/zdtm/static/fanotify00.c @@ -0,0 +1,319 @@ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +#ifdef __x86_64__ +# define __NR_fanotify_init 300 +# define __NR_fanotify_mark 301 +#elif defined(__PPC64__) +# define __NR_fanotify_init 323 +# define __NR_fanotify_mark 324 +#elif __aarch64__ +# define __NR_fanotify_init 262 +# define __NR_fanotify_mark 263 +#elif __s390x__ +# define __NR_fanotify_init 332 +# define __NR_fanotify_mark 333 +#else +# define __NR_fanotify_init 338 +#
define __NR_fanotify_mark 339 +#endif + +const char *test_doc = "Check for fanotify delivery"; +const char *test_author = "Cyrill Gorcunov "; + +const char fanotify_path[] = "fanotify-del-after-cr"; + +#define BUFF_SIZE (8192) + +struct fanotify_mark_inode { + unsigned long i_ino; + unsigned int s_dev; + unsigned int mflags; + unsigned int mask; + unsigned int ignored_mask; + unsigned int fhandle_bytes; + unsigned int fhandle_type; + unsigned char fhandle[512]; /* NUL-terminated text copy of the handle, filled by copy_fhandle() */ +}; + +struct fanotify_mark_mount { + unsigned int mnt_id; + unsigned int mflags; + unsigned int mask; + unsigned int ignored_mask; +}; + +struct fanotify_glob { + unsigned int faflags; + unsigned int evflags; +}; + +struct fanotify_obj { + struct fanotify_glob glob; + struct fanotify_mark_inode inode; + struct fanotify_mark_mount mount; +}; + +static int fanotify_init(unsigned int flags, unsigned int event_f_flags) +{ + return syscall(__NR_fanotify_init, flags, event_f_flags); +} + +static int fanotify_mark(int fanotify_fd, unsigned int flags, unsigned long mask, + int dfd, const char *pathname) +{ +#ifdef __i386__ + return syscall(__NR_fanotify_mark, fanotify_fd, flags, mask, 0, dfd, pathname); +#else + return syscall(__NR_fanotify_mark, fanotify_fd, flags, mask, dfd, pathname); +#endif +} + +#define fdinfo_field(str, field) !strncmp(str, field":", sizeof(field)) + +static void show_fanotify_obj(struct fanotify_obj *obj) /* dumps every parsed field of obj through test_msg() */ +{ + test_msg("fanotify obj at %p\n", obj); + + test_msg(" glob\n"); + test_msg(" faflags: %x evflags: %x\n", + obj->glob.faflags, obj->glob.evflags); + + test_msg(" inode\n"); + test_msg(" i_ino: %lx s_dev: %x mflags: %x " + "mask: %x ignored_mask: %x " + "fhandle_bytes: %x fhandle_type: %x " + "fhandle: %s\n", /* explicit newline: copy_fhandle() stores the handle without the line terminator */ + obj->inode.i_ino, obj->inode.s_dev, + obj->inode.mflags, obj->inode.mask, + obj->inode.ignored_mask, obj->inode.fhandle_bytes, + obj->inode.fhandle_type, obj->inode.fhandle); + + test_msg(" mount\n"); + test_msg(" mnt_id: %x mflags: %x mask: %x ignored_mask: %x\n", +
obj->mount.mnt_id, obj->mount.mflags, + obj->mount.mask, obj->mount.ignored_mask); +} + +static void copy_fhandle(char *tok, struct fanotify_mark_inode *inode) /* copies the leading run of hex digits at tok into inode->fhandle as a NUL-terminated string */ +{ + int off = 0; + + while ((*tok >= '0' && *tok <= '9') || (*tok >= 'a' && *tok <= 'f')) { /* stop at the first non-hex character (NUL or the newline kept by fgets); assumes the handle is printed in lowercase hex */ + inode->fhandle[off++] = *tok++; + if (off >= sizeof(inode->fhandle) - 1) /* keep one byte for the terminator */ + break; + } + inode->fhandle[off] = '\0'; +} + +static int cmp_fanotify_obj(struct fanotify_obj *old, struct fanotify_obj *new) +{ + /* + * mnt_id and s_dev may change during container migration, + * moreover the backend (say PLOOP) may be re-mounted during + * c/r, so exclude them. + */ + if ((old->glob.faflags != new->glob.faflags) || + (old->glob.evflags != new->glob.evflags) || + (old->inode.i_ino != new->inode.i_ino) || + (old->inode.mflags != new->inode.mflags) || + (old->inode.mask != new->inode.mask) || + (old->inode.ignored_mask != new->inode.ignored_mask)) + return -1; + + if (memcmp(old->inode.fhandle, new->inode.fhandle, + sizeof(new->inode.fhandle))) + return -2; + + if ((old->mount.mflags != new->mount.mflags) || + (old->mount.mask != new->mount.mask) || + (old->mount.ignored_mask != new->mount.ignored_mask)) + return -3; + + return 0; +} + +int parse_fanotify_fdinfo(int fd, struct fanotify_obj *obj, unsigned int expected_to_meet) +{ + unsigned int met = 0; + char str[512]; + FILE *f; + int ret; + + sprintf(str, "/proc/self/fdinfo/%d", fd); + f = fopen(str, "r"); + if (!f) { + pr_perror("Can't open fdinfo to parse"); + return -1; + } + + while (fgets(str, sizeof(str), f)) { + if (fdinfo_field(str, "fanotify flags")) { + ret = sscanf(str, "fanotify flags:%x event-flags:%x", + &obj->glob.faflags, &obj->glob.evflags); + if (ret != 2) + goto parse_err; + met++; + continue; + } + if (fdinfo_field(str, "fanotify mnt_id")) { + ret = sscanf(str, + "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x", + &obj->mount.mnt_id, &obj->mount.mflags, + &obj->mount.mask, &obj->mount.ignored_mask); + if (ret != 4) + goto parse_err; + met++; + continue; + } + if
(fdinfo_field(str, "fanotify ino")) { + int hoff; + ret = sscanf(str, + "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x " + "fhandle-bytes:%x fhandle-type:%x f_handle: %n", + &obj->inode.i_ino, &obj->inode.s_dev, + &obj->inode.mflags, &obj->inode.mask, &obj->inode.ignored_mask, + &obj->inode.fhandle_bytes, &obj->inode.fhandle_type, + &hoff); + if (ret != 7) + goto parse_err; + copy_fhandle(&str[hoff], &obj->inode); + met++; + continue; + } + } + + if (expected_to_meet != met) { + pr_perror("Expected to meet %d entries but got %d", + expected_to_meet, met); + return -1; + } + + return 0; + +parse_err: + pr_perror("Can't parse '%s'", str); + return -1; +} + +int main (int argc, char *argv[]) /* installs three fanotify marks, records their fdinfo, and compares it again after test_waitsig() */ +{ + struct fanotify_obj old = { }, new = { }; + int fa_fd, fd, del_after; + char buf[BUFF_SIZE]; + ssize_t length; + int ns = getenv("ZDTM_NEWNS") != NULL; + + test_init(argc, argv); + + if (ns) { + if (mkdir("/tmp", 0777) && errno != EEXIST) { /* mode must be octal; the former decimal 666 equals 01232 */ + pr_perror("Unable to create the /tmp directory"); + return -1; + } + if (mount("zdtm", "/tmp", "tmpfs", 0, NULL)) { + pr_perror("Unable to mount tmpfs into %s", "/tmp"); + } + } + + fa_fd = fanotify_init(FAN_NONBLOCK | FAN_CLASS_NOTIF | FAN_UNLIMITED_QUEUE, + O_RDONLY | O_LARGEFILE); + if (fa_fd < 0) { + pr_perror("fanotify_init failed"); + exit(1); + } + + del_after = open(fanotify_path, O_CREAT | O_TRUNC, 0644); /* O_CREAT requires an explicit mode argument */ + if (del_after < 0) { + pr_perror("open failed"); + exit(1); + } + + if (fanotify_mark(fa_fd, FAN_MARK_ADD, + FAN_MODIFY | FAN_ACCESS | FAN_OPEN | FAN_CLOSE, + AT_FDCWD, fanotify_path)) { + pr_perror("fanotify_mark failed"); + exit(1); + } + + if (fanotify_mark(fa_fd, FAN_MARK_ADD | FAN_MARK_MOUNT, + FAN_ONDIR | FAN_OPEN | FAN_CLOSE, + AT_FDCWD, "/tmp")) { + pr_perror("fanotify_mark failed"); + exit(1); + } + + if (fanotify_mark(fa_fd, FAN_MARK_ADD | FAN_MARK_MOUNT | + FAN_MARK_IGNORED_MASK | FAN_MARK_IGNORED_SURV_MODIFY, + FAN_MODIFY | FAN_ACCESS, + AT_FDCWD, "/tmp")) { + pr_perror("fanotify_mark failed"); + exit(1); +
} + + if (parse_fanotify_fdinfo(fa_fd, &old, 3)) { + pr_perror("parsing fanotify fdinfo failed"); + exit(1); + } + + show_fanotify_obj(&old); + + test_daemon(); + test_waitsig(); + + fd = open("/", O_RDONLY); + close(fd); + + fd = open(fanotify_path, O_RDWR); + close(fd); + + if (unlink(fanotify_path)) { + fail("can't unlink %s\n", fanotify_path); + exit(1); + } + + if (parse_fanotify_fdinfo(fa_fd, &new, 3)) { + fail("parsing fanotify fdinfo failed\n"); + exit(1); + } + + show_fanotify_obj(&new); + + if (cmp_fanotify_obj(&old, &new)) { + fail("fanotify mismatch on fdinfo level\n"); + exit(1); + } + + length = read(fa_fd, buf, sizeof(buf)); + if (length <= 0) { + fail("No events in fanotify queue\n"); + exit(1); + } + + if (fanotify_mark(fa_fd, FAN_MARK_REMOVE | FAN_MARK_MOUNT, + FAN_ONDIR | FAN_OPEN | FAN_CLOSE, + AT_FDCWD, "/tmp")) { + pr_perror("fanotify_mark failed"); + exit(1); + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/fanotify00.desc b/CRIU_code/test/zdtm/static/fanotify00.desc new file mode 100644 index 0000000..d969725 --- /dev/null +++ b/CRIU_code/test/zdtm/static/fanotify00.desc @@ -0,0 +1 @@ +{'flavor': 'h ns', 'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/fd.c b/CRIU_code/test/zdtm/static/fd.c new file mode 100644 index 0000000..a2e89d9 --- /dev/null +++ b/CRIU_code/test/zdtm/static/fd.c @@ -0,0 +1,109 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" +#include "lock.h" + +const char *test_doc = "Check that criu closes up all its descriptors"; +const char *test_author = "Andrew Vagin "; + +int main(int argc, char **argv) +{ + struct dirent *de; + char pfd[PATH_MAX]; + mutex_t *lock; + int status; + pid_t pid; + DIR *d; + + test_init(argc, argv); + + lock = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (lock == MAP_FAILED) + return 1; + + mutex_init(lock); + mutex_lock(lock); + + pid = fork(); + if (pid < 0) { + 
pr_perror("fork()"); + return 1; + } + + if (pid == 0) { + + d = opendir("/proc/self/fd"); + if (d == NULL) + return 1; + + while ((de = readdir(d))) { + int fd; + + if (de->d_name[0] == '.') + continue; + + fd = atoi(de->d_name); + if (dirfd(d) == fd) + continue; + close(fd); + } + + closedir(d); + mutex_unlock(lock); + + test_waitsig(); + + return 0; + } + + mutex_lock(lock); + + test_daemon(); + test_waitsig(); + + snprintf(pfd, sizeof(pfd), "/proc/%d/fd", pid); + d = opendir(pfd); + if (d == NULL) + return 2; + + while ((de = readdir(d))) { + int ret; + + if (de->d_name[0] == '.') + continue; + + ret = readlinkat(dirfd(d), de->d_name, pfd, sizeof(pfd) - 1); + if (ret < 0) { + pr_perror("readlink"); + ret = 0; + } + pfd[ret] = '\0'; + fail("Unexpected fd: %s -> %s\n", de->d_name, pfd); + return 1; + } + + closedir(d); + kill(pid, SIGTERM); + + if (waitpid(pid, &status, 0) != pid) { + pr_perror("waitpid()"); + return 1; + } + + if (status != 0) { + fail("%d:%d:%d:%d", WIFEXITED(status), WEXITSTATUS(status), + WIFSIGNALED(status), WTERMSIG(status)); + return 1; + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/fd01.c b/CRIU_code/test/zdtm/static/fd01.c new file mode 100644 index 0000000..4e78751 --- /dev/null +++ b/CRIU_code/test/zdtm/static/fd01.c @@ -0,0 +1,117 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" +#include "lock.h" + +const char *test_doc = "Create file descriptors with different numbers. 
Check that they do not intersect with service fds"; +const char *test_author = "Kirill Tkhai "; + +int main(int argc, char **argv) +{ + unsigned int i, max_nr, flags; + int fd, status, ret; + struct rlimit rlim; + futex_t *futex; + char buf[16]; + pid_t pid; + + test_init(argc, argv); + + futex = mmap(NULL, sizeof(*futex), PROT_WRITE | PROT_READ, MAP_ANONYMOUS|MAP_SHARED, -1, 0); + if (futex == MAP_FAILED) { + fail("mmap"); + exit(1); + } + futex_init(futex); + + fd = open("/proc/sys/fs/nr_open", O_RDONLY); + if (fd < 0) { + fail("Can't open /proc/sys/fs/nr_open"); + exit(1); + } + + ret = read(fd, buf, sizeof(buf)); + if (ret <= 0) { + fail("Can't read"); + exit(1); + } + buf[ret] = '\0'; + + max_nr = (unsigned int)atol(buf); + if (max_nr == 0) { + fail("max_nr"); + exit(1); + } + + if (getrlimit(RLIMIT_NOFILE, &rlim)) { + fail("getrlimit"); + exit(1); + } + + rlim.rlim_cur = rlim.rlim_max; + if (max_nr < rlim.rlim_cur) + rlim.rlim_cur = max_nr; + + if (prlimit(getpid(), RLIMIT_NOFILE, &rlim, NULL)) { + fail("rlimir: Can't setup RLIMIT_NOFILE for self"); + exit(1); + } + + + for (i = 1; (fd = (1 << i)) < (rlim.rlim_cur >> 1); i++) { + FILE *fp = tmpfile(); + if (!fp) { + fail("tmpfile"); + exit(1); + } + + /* This fd really exists, skip it */ + if (fcntl(fd, F_GETFL) >= 0) + continue; + + if (dup2(fileno(fp), fd) < 0) { + fail("dup2"); + exit(1); + } + + flags = SIGCHLD; + if (i % 2 == 0) + flags |= CLONE_FILES; + + pid = sys_clone_unified(flags, NULL, NULL, NULL, 0); + if (pid < 0) { + fail("fork"); + exit(1); + } else if (!pid) { + futex_wait_while(futex, 0); + exit(0); + } + } + + test_daemon(); + test_waitsig(); + + /* Cleanup */ + futex_set_and_wake(futex, 1); + while (wait(&status) > 0) { + if (!WIFEXITED(status) || WEXITSTATUS(status)) { + fail("Wrong exit status: %d", status); + exit(1); + } + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/fd01.desc b/CRIU_code/test/zdtm/static/fd01.desc new file mode 100644 index 0000000..2eac7e6 
--- /dev/null +++ b/CRIU_code/test/zdtm/static/fd01.desc @@ -0,0 +1 @@ +{'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/fdt_shared.c b/CRIU_code/test/zdtm/static/fdt_shared.c new file mode 100644 index 0000000..2111356 --- /dev/null +++ b/CRIU_code/test/zdtm/static/fdt_shared.c @@ -0,0 +1,206 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check a shared file descriptor table."; +const char *test_author = "Andrew Vagin "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +#define STACK_SIZE 4096 +#define TEST_FD 128 +#define TEST_STRING "Hello World!" + +#define CHILDREN 4 +static int fork_pfd[2]; + +static void forked() +{ + char c = 0; + + if (write(fork_pfd[1], &c, 1) != 1) { + pr_perror("Unable to send a signal to the parent"); + exit(5); + } +} + +static void wait_children() +{ + int i; + char c; + + for (i = 0; i < CHILDREN; i++) { + if (read(fork_pfd[0], &c, 1) != 1) { + pr_perror("Unable to read a signal from a child"); + exit(5); + } + } +} + +static pid_t clone_child(int (*fn)(void *), int flags) +{ + char stack[STACK_SIZE] __stack_aligned__; + pid_t pid; + + pid = clone(fn, stack + STACK_SIZE, + flags | SIGCHLD, NULL); + if (pid == -1) { + pr_perror("Unable to clone a new process"); + return -1; + } + + return pid; +} + +static int child2(void *_arg) +{ + char buf[sizeof(TEST_STRING)]; + + forked(); + test_waitsig(); + + if (read(TEST_FD, buf, sizeof(TEST_STRING)) != sizeof(TEST_STRING)) { + pr_perror("Unable to read from %d", TEST_FD); + return 1; + } + + return 0; +} + +static int child3(void *_arg) +{ + forked(); + test_waitsig(); + + if (close(TEST_FD) != -1) { + fail("%d is exist\n", TEST_FD); + return 1; + } + + return 0; +} + +static int child(void *_arg) +{ + char buf[sizeof(TEST_STRING)]; + pid_t pid, pid2; + int status; + + pid = clone_child(child2, CLONE_FILES); + if (pid < 0) + return 1; + + pid2 = clone_child(child3, 0); + if 
(pid2 < 0) /* test the clone made just above; an unnoticed failure would reach kill(pid2, SIGTERM) with pid2 == -1 */ + return 1; + + forked(); + test_waitsig(); + + kill(pid2, SIGTERM); + kill(pid, SIGTERM); + waitpid(pid2, &status, 0); + + if (status) { + fail("The child3 returned %d\n", status); + return 1; + } + + waitpid(pid, &status, 0); + + if (status) { + fail("The child2 returned %d\n", status); + return 1; + } + + if (read(TEST_FD, buf, sizeof(TEST_STRING)) != sizeof(TEST_STRING)) { + pr_perror("Unable to read from %d", TEST_FD); + return 1; + } + + if (close(TEST_FD) == -1) { + pr_perror("Unable to close(%d)", TEST_FD); + return 1; + } + + return 0; +} + +int main(int argc, char ** argv) /* clones child and child2 with CLONE_FILES, then opens TEST_FD only after test_waitsig() so the clones must see it through the shared table */ +{ + int status; + pid_t pid, pid2; + int fd, i; + + test_init(argc, argv); + + if (pipe(fork_pfd)) { + pr_perror("pipe"); + return 1; + } + + pid = clone_child(child, CLONE_FILES); + if (pid < 0) + return 1; + + pid2 = clone_child(child2, CLONE_FILES); + if (pid2 < 0) + return 1; + + wait_children(); + + test_daemon(); + test_waitsig(); + + fd = open(filename, O_RDWR | O_CREAT, 0666); + if (fd == -1) { + pr_perror("Can't open %s", filename); /* name the file that was actually opened */ + return -1; + } + + for (i = 0; i < 3; i++) /* one copy of TEST_STRING per reader: child2 of main, child2 of child, and child itself */ + if (write(fd, TEST_STRING, sizeof(TEST_STRING)) != sizeof(TEST_STRING)) { + pr_perror("Unable to write a test string"); + return -1; + } + + fd = dup2(fd, TEST_FD); + if (fd == -1) { + pr_perror("Can't dup fd %d to %d", fd, TEST_FD); + return -1; + } + + lseek(fd, 0, SEEK_SET); + + kill(pid2, SIGTERM); + waitpid(pid2, &status, 0); + kill(pid, SIGTERM); + + if (status) { + fail("The child returned %d\n", status); + return 1; + } + + waitpid(pid, &status, 0); + if (status) { + fail("The child returned %d\n", status); + return 1; + } + + if (close(TEST_FD) == 0) { /* child already closed TEST_FD in the shared table, so this close is expected to fail */ + fail("%d was not closed\n", TEST_FD); + return 1; + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/fifo-ghost.c b/CRIU_code/test/zdtm/static/fifo-ghost.c new file mode 100644 index 0000000..f5e11cf --- /dev/null +++ b/CRIU_code/test/zdtm/static/fifo-ghost.c @@ -0,0 +1,78 @@ +#include +#include +#include +#include
+#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that a ghost fifo with data restored"; +const char *test_author = "Cyrill Gorcunov "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +int main(int argc, char **argv) +{ + int fd; + int fd_ro; + mode_t mode = S_IFIFO | 0700; + uint8_t buf[256]; + uint32_t crc; + + test_init(argc, argv); + + if (mknod(filename, mode, 0)) { + pr_perror("can't make fifo \"%s\"", filename); + exit(1); + } + + fd = open(filename, O_RDWR); + if (fd < 0) { + pr_perror("can't open %s", filename); + return 1; + } + + fd_ro = open(filename, O_RDONLY); + if (fd_ro < 0) { + pr_perror("can't open %s", filename); + return 1; + } + + crc = ~0; + datagen(buf, sizeof(buf), &crc); + if (write_data(fd, buf, sizeof(buf))) { + pr_perror("write() failed"); + return 1; + } + + if (unlink(filename) < 0) { + fail("can't unlink %s", filename); + return 1; + } + + close(fd); + + test_daemon(); + test_waitsig(); + + if (read_data(fd_ro, buf, sizeof(buf))) { + pr_perror("read() failed"); + return 1; + } + + crc = ~0; + if (datachk(buf, sizeof(buf), &crc)) { + fail("data corrupted"); + return 1; + } + + if (close(fd_ro) < 0) { + fail("can't close %s", filename); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/fifo-rowo-pair.c b/CRIU_code/test/zdtm/static/fifo-rowo-pair.c new file mode 100644 index 0000000..85aad02 --- /dev/null +++ b/CRIU_code/test/zdtm/static/fifo-rowo-pair.c @@ -0,0 +1,158 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test for fifo ro/wo with " + "fake fifo needed on criu side"; +const char *test_author = "Cyrill Gorcunov "; + +char *name_master; +TEST_OPTION(name_master, string, "master fifo name", 1); + +char *name_slave; +TEST_OPTION(name_slave, string, "slave fifo name", 1); + +#define TEST_VALUE (00100) + +#define 
exit_shot(pid, code) \ + do { kill(pid, SIGKILL); exit(code); } while (0) + +#define exit_shot_parent(code) \ + exit_shot(getppid(), code) /* kill the parent and exit with the caller-supplied code (previously hard-coded to 1) */ + +int main(int argc, char **argv) +{ + task_waiter_t t; + pid_t pid; + int fd_master, fd_slave; + int v, status; + + test_init(argc, argv); + + if (mknod(name_master, S_IFIFO | 0700, 0)) { + pr_perror("can't make fifo \"%s\"", name_master); + exit(1); + } + + if (mknod(name_slave, S_IFIFO | 0700, 0)) { + pr_perror("can't make fifo \"%s\"", name_slave); + exit(1); + } + + fd_slave = open(name_slave, O_RDWR); + if (fd_slave < 0) { + pr_perror("can't open %s", name_slave); + exit(1); + } + + task_waiter_init(&t); + + pid = test_fork(); + if (pid == 0) { + int new_slave; + + fd_master = open(name_master, O_WRONLY); + if (fd_master < 0) { + pr_perror("can't open %s", name_master); + exit_shot_parent(1); + } + + new_slave = dup2(fd_slave, 64); + if (new_slave < 0) { + pr_perror("can't dup %s", name_slave); + exit_shot_parent(1); + } + + close(fd_slave); + + task_waiter_complete_current(&t); + + v = TEST_VALUE; + if (write(new_slave, &v, sizeof(v)) != sizeof(v)) { + pr_perror("write failed"); + exit_shot_parent(1); + } + + v = TEST_VALUE; + if (write(fd_master, &v, sizeof(v)) != sizeof(v)) { + pr_perror("write failed"); + exit_shot_parent(1); + } + + /* Don't exit until explicitly asked */ + task_waiter_wait4(&t, getppid()); + + exit(0); + } else if (pid < 0) { + pr_perror("test_fork failed"); + exit(1); + } + + fd_master = open(name_master, O_RDONLY); + if (fd_master < 0) { + pr_perror("can't open %s", name_master); + exit_shot(pid, 1); + } + + /* Wait until data appear in kernel fifo buffer */ + task_waiter_wait4(&t, pid); + + test_daemon(); + test_waitsig(); + + if (read(fd_master, &v, sizeof(v)) != sizeof(v)) { + pr_perror("read failed"); + exit_shot(pid, 1); + } + + task_waiter_complete_current(&t); + + if (v != TEST_VALUE) { + fail("read data mismatch\n"); + exit_shot(pid, 1); + } + + if (read(fd_slave, &v, sizeof(v)) != sizeof(v))
{ + pr_perror("read failed"); + exit_shot(pid, 1); + } + if (v != TEST_VALUE) { + fail("read data mismatch\n"); + exit_shot(pid, 1); + } + + waitpid(pid, &status, P_ALL); + + if (unlink(name_master) < 0) + pr_perror("can't unlink %s", name_master); + + if (unlink(name_slave) < 0) + pr_perror("can't unlink %s", name_slave); + + if (!WIFEXITED(status)) { + pr_perror("child %d is still running", pid); + exit_shot(pid, 1); + } + + errno = WEXITSTATUS(status); + if (errno) { + fail("Child exited with error %m"); + exit(errno); + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/fifo.c b/CRIU_code/test/zdtm/static/fifo.c new file mode 100644 index 0000000..ab5674a --- /dev/null +++ b/CRIU_code/test/zdtm/static/fifo.c @@ -0,0 +1,83 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that we can migrate with a named pipe " + "open"; +const char *test_author = "Roman Kagan "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +#define BUF_SIZE (16 * 4096) /* A fifo buffer has 16 slots by default */ + +int main(int argc, char **argv) +{ + int fd; + struct stat st; + mode_t mode = S_IFIFO | 0700; + uint8_t buf[BUF_SIZE]; + uint32_t crc; + + test_init(argc, argv); + + if (mknod(filename, mode, 0)) { + pr_perror("can't make fifo \"%s\"", filename); + exit(1); + } + + fd = open(filename, O_RDWR); + if (fd < 0) { + pr_perror("can't open %s", filename); + return 1; + } + + crc = ~0; + datagen(buf, BUF_SIZE, &crc); + if (write_data(fd, buf, BUF_SIZE)) { + pr_perror("write() failed"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (read_data(fd, buf, BUF_SIZE)) { + pr_perror("read() failed"); + return 1; + } + + crc = ~0; + if (datachk(buf, BUF_SIZE, &crc)) { + fail("data corrupted\n"); + return 1; + } + + if (close(fd) < 0) { + fail("can't close %s: %m", filename); + return 1; + } + + if (stat(filename, &st) < 0) { + fail("can't stat %s: %m", filename); + return 1; 
+ } + + if (st.st_mode != mode) { + fail("%s is no longer the fifo we had", filename); + return 1; + } + + if (unlink(filename) < 0) { + fail("can't unlink %s: %m", filename); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/fifo_ro.c b/CRIU_code/test/zdtm/static/fifo_ro.c new file mode 100644 index 0000000..ea32329 --- /dev/null +++ b/CRIU_code/test/zdtm/static/fifo_ro.c @@ -0,0 +1,91 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that a fifo read-only descriptor is restored with data"; +const char *test_author = "Andrew Vagin "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +#define BUF_SIZE (16 * 4096) /* A fifo buffer has 16 slots by default */ + +int main(int argc, char **argv) +{ + int fd; + int fd_ro; + struct stat st; + mode_t mode = S_IFIFO | 0700; + uint8_t buf[BUF_SIZE]; + uint32_t crc; + + test_init(argc, argv); + + if (mknod(filename, mode, 0)) { + pr_perror("can't make fifo \"%s\"", filename); + exit(1); + } + + fd = open(filename, O_RDWR); + if (fd < 0) { + pr_perror("can't open %s", filename); + return 1; + } + + fd_ro = open(filename, O_RDONLY); + if (fd_ro < 0) { + pr_perror("can't open %s", filename); + return 1; + } + + crc = ~0; + datagen(buf, BUF_SIZE, &crc); + if (write_data(fd, buf, BUF_SIZE)) { + pr_perror("write() failed"); + return 1; + } + + close(fd); + + test_daemon(); + test_waitsig(); + + if (read_data(fd_ro, buf, BUF_SIZE)) { + pr_perror("read() failed"); + return 1; + } + + crc = ~0; + if (datachk(buf, BUF_SIZE, &crc)) { + fail("data corrupted\n"); + return 1; + } + + if (close(fd_ro) < 0) { + fail("can't close %s: %m", filename); + return 1; + } + + if (stat(filename, &st) < 0) { + fail("can't stat %s: %m", filename); + return 1; + } + + if (st.st_mode != mode) { + fail("%s is no longer the fifo we had", filename); + return 1; + } + + if (unlink(filename) < 0) { + fail("can't unlink %s: %m", filename); 
+ return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/fifo_wronly.c b/CRIU_code/test/zdtm/static/fifo_wronly.c new file mode 100644 index 0000000..2fbd69e --- /dev/null +++ b/CRIU_code/test/zdtm/static/fifo_wronly.c @@ -0,0 +1,119 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that we can migrate with a named pipe, " + "opened in WRONLY mode"; +#define BUF_SIZE 256 +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +int main(int argc, char **argv) +{ + task_waiter_t t; + int fd, fd1; + struct stat st; + mode_t mode = S_IFIFO | 0600; + int pid; + int chret; + + test_init(argc, argv); + + task_waiter_init(&t); + + if (mknod(filename, mode, 0)) { + pr_perror("can't make fifo \"%s\"", filename); + exit(1); + } + + pid = test_fork(); + if (pid < 0) { + pr_perror("Can't fork"); + exit(1); + } + + if (pid == 0) { + char rbuf[BUF_SIZE]; + int res; + fd1 = open(filename, O_RDONLY); + if (fd1 < 0) { + pr_perror("open(%s, O_RDONLY) Failed", filename); + chret = errno; + return chret; + } + task_waiter_complete(&t, 1); + res = read(fd1, rbuf, 7); + if (res < 0) { + pr_perror("read error %s", filename); + chret = errno; + return chret; + } + else if (res == 0) { + pr_perror("read(%d, rbuf, 7) return 0", fd1); + return 1; + } + if (close(fd1) < 0) { + fail("can't close %d, %s: %m", fd1, filename); + chret = errno; + return chret; + } + + } else { + + fd = open(filename, O_WRONLY); + if (fd < 0) { + pr_perror("open(%s, O_WRONLY) Failed", filename); + kill(pid, SIGKILL); + wait(NULL); + return 1; + } + task_waiter_wait4(&t, 1); + + test_daemon(); + test_waitsig(); + + if (write(fd, "string", 7) == -1) { + pr_perror("write(%d, 'string', 7) Failed", fd); + return 1; + } + + wait(&chret); + chret = WEXITSTATUS(chret); + if (chret) { + fail("child exited with non-zero code %d (%s)\n", + chret, strerror(chret)); + return 1; + } + + if 
(close(fd) < 0) { + fail("can't close %d, %s: %m", fd, filename); + return 1; + } + + if (stat(filename, &st) < 0) { + fail("can't stat %s: %m", filename); + return 1; + } + + if (st.st_mode != mode) { + fail("%s is no longer the fifo we had", filename); + return 1; + } + + if (unlink(filename) < 0) { + fail("can't unlink %s: %m", filename); + return 1; + } + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/file_append.c b/CRIU_code/test/zdtm/static/file_append.c new file mode 100644 index 0000000..14f1877 --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_append.c @@ -0,0 +1,61 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check O_APPEND preserved"; +const char *test_author = "Pavel Emelyanov "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +int main(int argc, char **argv) +{ + int fd, fd2, ret; + char tmp[3]; + + test_init(argc, argv); + + fd = open(filename, O_RDWR | O_CREAT | O_APPEND, 0644); + if (fd == -1) + return 1; + + fd2 = open(filename, O_RDWR, 0644); + if (fd2 == -1) + return 1; + + test_daemon(); + test_waitsig(); + + if (write(fd2, "x", 1) != 1) { + pr_perror("Can't write x"); + return 1; + } + + if (write(fd, "y", 1) != 1) { + pr_perror("Can't write y"); + return 1; + } + + lseek(fd2, 0, SEEK_SET); + ret = read(fd2, tmp, 3); + if (ret != 2) { + fail("Smth's wrong with file size"); + return 1; + } + tmp[2] = '\0'; + if (strcmp(tmp, "xy")) { + fail("Smth's wron with file contents (%s)", tmp); + return 1; + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/file_attr.c b/CRIU_code/test/zdtm/static/file_attr.c new file mode 100644 index 0000000..eb1c2a2 --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_attr.c @@ -0,0 +1,121 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that attributes and content of an open, " 
+ "written to, and then unlinked file migrate " + "correctly"; +const char *test_author = "Roman Kagan "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); +#define DEF_PERMS 06604 /* -rwS--Sr--, really esoteric one */ +unsigned int perms = DEF_PERMS; +TEST_OPTION(perms, uint, "permissions to set on file " + "(default " __stringify(DEF_PERMS) ")", 0); +#define DEF_MTIME 123456 /* another really esoteric one */ +unsigned int mtime = DEF_MTIME; +TEST_OPTION(mtime, uint, "mtime to set on file " + "(default " __stringify(DEF_MTIME) ")", 0); + + +int main(int argc, char ** argv) +{ + int fd; + struct utimbuf ut; + uint32_t crc; + struct stat st; + uint8_t buf[1000000]; + + test_init(argc, argv); + + fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644); + if (fd < 0) { + pr_perror("can't open %s", filename); + exit(1); + } + + crc = ~0; + datagen(buf, sizeof(buf), &crc); + if (write(fd, buf, sizeof(buf)) != sizeof(buf)) { + pr_perror("can't write to %s", filename); + exit(1); + } + + ut = (struct utimbuf) { + .actime = 0, + .modtime = mtime, + }; + if (utime(filename, &ut)) { + pr_perror("can't set modtime %d on %s", mtime, filename); + exit(1); + } + + if (fchmod(fd, perms)) { + pr_perror("can't set perms %o on %s", perms, filename); + exit(1); + } + + if (unlink(filename)) { + pr_perror("can't unlink %s", filename); + exit(1); + } + + test_daemon(); + test_waitsig(); + + if (lseek(fd, 0, SEEK_SET) < 0) { + fail("lseeking to the beginning of file failed: %m\n"); + goto out; + } + + if (read(fd, buf, sizeof(buf)) != sizeof(buf)) { + fail("can't read %s: %m\n", filename); + goto out; + } + + crc = ~0; + if (datachk(buf, sizeof(buf), &crc)) { + fail("CRC mismatch\n"); + goto out; + } + + if (fstat(fd, &st) < 0) { + fail("can't fstat %s: %m", filename); + goto out; + } + + if ((st.st_mode & 07777) != perms) { + fail("permissions have changed"); + goto out; + } + + if (st.st_mtime != mtime) { + fail("modification time has changed"); + goto out; + } + + if 
(close(fd)) { + fail("close failed: %m\n"); + goto out_noclose; + } + + if (unlink(filename) != -1 || errno != ENOENT) { + fail("file %s should have been deleted before migration: unlink: %m\n", filename); + goto out_noclose; + } + + pass(); + +out: + close(fd); +out_noclose: + return 0; +} diff --git a/CRIU_code/test/zdtm/static/file_fown.c b/CRIU_code/test/zdtm/static/file_fown.c new file mode 100644 index 0000000..f29ba45 --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_fown.c @@ -0,0 +1,182 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +#ifndef F_SETSIG +#define F_SETSIG 10 /* for sockets. */ +#define F_GETSIG 11 /* for sockets. */ +#endif + +const char *test_doc = "Check for signal delivery on file owners"; +const char *test_author = "Cyrill Gorcunov "; + +struct params { + int sigio; + int pipe_flags[2]; + int pipe_pid[2]; + int pipe_sig[2]; +} *shared; + +static void signal_handler_io(int status) +{ + shared->sigio++; +} + +static void fill_pipe_params(struct params *p, int *pipes) +{ + p->pipe_flags[0] = fcntl(pipes[0], F_GETFL); + p->pipe_flags[1] = fcntl(pipes[1], F_GETFL); + + test_msg("pipe_flags0 %08o\n", p->pipe_flags[0]); + test_msg("pipe_flags1 %08o\n", p->pipe_flags[1]); + + p->pipe_pid[0] = fcntl(pipes[0], F_GETOWN); + p->pipe_pid[1] = fcntl(pipes[1], F_GETOWN); + + p->pipe_sig[0] = fcntl(pipes[0], F_GETSIG); + p->pipe_sig[1] = fcntl(pipes[1], F_GETSIG); +} + +static int cmp_pipe_params(struct params *p1, struct params *p2) +{ + int i; + + for (i = 0; i < 2; i++) { + if (p1->pipe_flags[i] != p2->pipe_flags[i]) { + fail("pipe flags failed [%d] expected %08o got %08o\n", + i, p1->pipe_flags[i], p2->pipe_flags[i]); + return -1; + } + if (p1->pipe_pid[i] != p2->pipe_pid[i]) { + fail("pipe pid failed [%d] expected %d got %d\n", + i, p1->pipe_pid[i], p2->pipe_pid[i]); + return -1; + } + if (p1->pipe_sig[i] != p2->pipe_sig[i]) { + fail("pipe sig 
failed [%d] expected %d got %d\n", + i, p1->pipe_sig[i], p2->pipe_sig[i]); + return -1; + } + } + + return 0; +} + +int main(int argc, char *argv[]) +{ + struct sigaction saio = { }; + struct params obtained = { }; + uid_t ruid, euid, suid; + int status, pipes[2]; + pid_t pid; + + test_init(argc, argv); + + shared = (void *)mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if ((void *)shared == MAP_FAILED) { + fail("mmap failed"); + exit(1); + } + + if (getresuid(&ruid, &euid, &suid)) { + fail("getresuid failed\n"); + exit(1); + } + + if (pipe(pipes)) { + pr_perror("Can't create pipe"); + exit(1); + } + + saio.sa_handler = (sig_t)signal_handler_io; + saio.sa_flags = SA_RESTART; + if (sigaction(SIGIO, &saio, 0)) { + fail("sigaction failed\n"); + exit(1); + } + + if (!getuid() && setresuid(-1, 1, -1)) { + fail("setresuid failed\n"); + exit(1); + } + + if (fcntl(pipes[0], F_SETOWN, getpid()) || + fcntl(pipes[1], F_SETOWN, getpid()) || + fcntl(pipes[0], F_SETSIG, SIGIO) || + fcntl(pipes[1], F_SETSIG, SIGIO) || + fcntl(pipes[0], F_SETFL, fcntl(pipes[0], F_GETFL) | O_ASYNC) || + fcntl(pipes[1], F_SETFL, fcntl(pipes[1], F_GETFL) | O_ASYNC)) { + fail("fcntl failed\n"); + exit(1); + } + + fill_pipe_params(shared, pipes); + + if (setresuid(-1, euid, -1)) { + fail("setresuid failed\n"); + exit(1); + } + + pid = test_fork(); + if (pid < 0) { + pr_perror("can't fork"); + exit(1); + } + + if (pid == 0) { + struct params p = { }; + + test_waitsig(); + + fcntl(pipes[1], F_SETOWN, getpid()); + fill_pipe_params(&p, pipes); + + if (write(pipes[1], &p, sizeof(p)) != sizeof(p)) { + fail("write failed\n"); + exit(1); + } + + exit(0); + } + + test_daemon(); + test_waitsig(); + kill(pid, SIGTERM); + + if (waitpid(pid, &status, P_ALL) == -1) { + fail("waitpid failed\n"); + exit(1); + } + + if (read(pipes[0], &obtained, sizeof(obtained)) != sizeof(obtained)) { + fail("read failed\n"); + exit(1); + } + + if (shared->sigio < 1) { + fail("shared->sigio = %d (> 0 
expected)\n", shared->sigio); + exit(1); + } + + shared->pipe_pid[1] = pid; + + if (cmp_pipe_params(shared, &obtained)) { + fail("params comparison failed\n"); + exit(1); + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/file_fown.desc b/CRIU_code/test/zdtm/static/file_fown.desc new file mode 100644 index 0000000..63df42a --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_fown.desc @@ -0,0 +1 @@ +{'flavor': 'h'} diff --git a/CRIU_code/test/zdtm/static/file_lease00.c b/CRIU_code/test/zdtm/static/file_lease00.c new file mode 100644 index 0000000..8413f83 --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_lease00.c @@ -0,0 +1,84 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check c/r of non-breaking leases"; +const char *test_author = "Pavel Begunkov "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); +char filename_rd[PATH_MAX]; +char filename_wr[PATH_MAX]; + +static void close_files(int fd1, int fd2) +{ + if (fd1 >= 0) + close(fd1); + if (fd2 >= 0) + close(fd2); + + unlink(filename_rd); + unlink(filename_wr); +} + +static int open_files(int *fd_rd, int *fd_wr) +{ + *fd_rd = open(filename_rd, O_RDONLY | O_CREAT, 0666); + *fd_wr = open(filename_wr, O_WRONLY | O_CREAT, 0666); + + if (*fd_rd < 0 || *fd_wr < 0) { + close_files(*fd_rd, *fd_wr); + return -1; + } + return 0; +} + +static int check_lease_type(int fd, int expected_type) +{ + int lease_type = fcntl(fd, F_GETLEASE); + + if (lease_type != expected_type) { + if (lease_type < 0) + pr_perror("Can't acquire lease type\n"); + else + pr_err("Mismatched lease type: %i\n", lease_type); + return -1; + } + return 0; +} + +int main(int argc, char **argv) +{ + int fd_rd = -1, fd_wr = -1; + + test_init(argc, argv); + + snprintf(filename_rd, sizeof(filename_rd), "%s.0", filename); + snprintf(filename_wr, sizeof(filename_wr), "%s.1", filename); + + if (open_files(&fd_rd, &fd_wr)) { + pr_err("Can't open files\n"); + return -1; + } + if (fcntl(fd_rd, 
F_SETLEASE, F_RDLCK) < 0 || + fcntl(fd_wr, F_SETLEASE, F_WRLCK) < 0) { + pr_perror("Can't set leases\n"); + close_files(fd_rd, fd_wr); + return -1; + } + + test_daemon(); + test_waitsig(); + + if (check_lease_type(fd_rd, F_RDLCK)) + fail("Read lease check failed\n"); + else if (check_lease_type(fd_wr, F_WRLCK)) + fail("Write lease check failed\n"); + else + pass(); + + close_files(fd_rd, fd_wr); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/file_lease00.desc b/CRIU_code/test/zdtm/static/file_lease00.desc new file mode 100644 index 0000000..f394d03 --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_lease00.desc @@ -0,0 +1 @@ +{'feature': 'fdinfo_lock', 'opts': '--file-locks'} diff --git a/CRIU_code/test/zdtm/static/file_lease01.c b/CRIU_code/test/zdtm/static/file_lease01.c new file mode 100644 index 0000000..90fa74a --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_lease01.c @@ -0,0 +1,88 @@ +#include +#include + +#include "zdtmtst.h" + +#define FD_COUNT 3 +#define FD_LEASED1 0 +#define FD_LEASED2 2 +#define FD_LEASE_FREE 1 + +const char *test_doc = "Check that extra leases are not set after c/r"; +const char *test_author = "Pavel Begunkov "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +static void close_files(int fds[FD_COUNT]) +{ + int i; + + for (i = 0; i < FD_COUNT; ++i) + if (fds[i] >= 0) + close(fds[i]); + unlink(filename); +} + +static int open_files(int fds[FD_COUNT]) +{ + int i; + + for (i = 0; i < FD_COUNT; ++i) { + fds[i] = open(filename, O_RDONLY | O_CREAT, 0666); + if (fds[i] < 0) { + close_files(fds); + return -1; + } + } + return 0; +} + +static int check_lease_type(int fd, int expected_type) +{ + int lease_type = fcntl(fd, F_GETLEASE); + + if (lease_type != expected_type) { + if (lease_type < 0) + pr_perror("Can't acquire lease type\n"); + else + pr_err("Mismatched lease type: %i\n", lease_type); + return -1; + } + return 0; +} + +int main(int argc, char **argv) +{ + int fds[FD_COUNT]; + + test_init(argc, argv); + 
+ if (open_files(fds)) { + pr_err("Can't open files\n"); + return -1; + } + + if (fcntl(fds[FD_LEASED1], F_SETLEASE, F_RDLCK) < 0 || + fcntl(fds[FD_LEASED2], F_SETLEASE, F_RDLCK) < 0) { + pr_err("Can't set leases\n"); + close_files(fds); + return -1; + } + + test_daemon(); + test_waitsig(); + + if (check_lease_type(fds[FD_LEASE_FREE], F_UNLCK)) + fail("Unexpected lease was found (%i)\n", fds[FD_LEASE_FREE]); + else if (check_lease_type(fds[FD_LEASED1], F_RDLCK)) + fail("Lease isn't set (%i)\n", fds[FD_LEASED1]); + else if (check_lease_type(fds[FD_LEASED2], F_RDLCK)) + fail("Lease isn't set (%i)\n", fds[FD_LEASED2]); + else + pass(); + + close_files(fds); + return 0; +} + diff --git a/CRIU_code/test/zdtm/static/file_lease01.desc b/CRIU_code/test/zdtm/static/file_lease01.desc new file mode 100644 index 0000000..fba66d3 --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_lease01.desc @@ -0,0 +1 @@ +file_lease00.desc \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/file_lease02.c b/CRIU_code/test/zdtm/static/file_lease02.c new file mode 100644 index 0000000..cd7945d --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_lease02.c @@ -0,0 +1,145 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +#define FD_COUNT 3 +#define BREAK_SIGNUM SIGIO + +const char *test_doc = "Check c/r of breaking leases"; +const char *test_author = "Pavel Begunkov "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +char filename1[PATH_MAX]; +char filename2[PATH_MAX]; +char filename3[PATH_MAX]; + +int expected_fd; +int sigaction_error; + +static void break_sigaction(int signo, siginfo_t *info, void *ctx) +{ + if (signo != BREAK_SIGNUM) { + pr_err("Unexpected signal(%i)\n", signo); + sigaction_error = -1; + } else if (info->si_fd != expected_fd) { + pr_err("Unexpected fd(%i)\n", info->si_fd); + sigaction_error = -1; + } + expected_fd = -1; +} + +static int check_lease_type(int fd, int expected_type) +{ + int lease_type = fcntl(fd, 
F_GETLEASE); + + if (lease_type != expected_type) { + if (lease_type < 0) + pr_perror("Can't acquire lease type\n"); + else + pr_err("Mismatched lease type: %i\n", lease_type); + return -1; + } + return 0; +} + +static int prepare_file(char *file, int file_type, int break_type) +{ + int fd, fd_break; + int lease_type = (file_type == O_RDONLY) ? F_RDLCK : F_WRLCK; + + fd = open(file, file_type | O_CREAT, 0666); + if (fd < 0) { + pr_perror("Can't open file (type %i)\n", file_type); + return fd; + } + if (fcntl(fd, F_SETLEASE, lease_type) < 0) { + pr_perror("Can't set exclusive lease\n"); + goto err; + } + if (fcntl(fd, F_SETSIG, BREAK_SIGNUM) < 0) { + pr_perror("Can't set signum for file i/o\n"); + goto err; + } + + expected_fd = fd; + fd_break = open(file, break_type | O_NONBLOCK); + + if (fd_break >= 0) { + close(fd_break); + pr_err("Conflicting lease not found\n"); + goto err; + } else if (errno != EWOULDBLOCK) { + pr_perror("Can't break lease\n"); + goto err; + } + return fd; +err: + close(fd); + return -1; +} + +static void close_files(int fds[FD_COUNT]) +{ + int i; + + for (i = 0; i < FD_COUNT; ++i) + if (fds[i] >= 0) + close(fds[i]); + + unlink(filename1); + unlink(filename2); + unlink(filename3); +} + +int main(int argc, char **argv) +{ + int fds[FD_COUNT] = {}; + int ret = -1; + struct sigaction act = {}; + + test_init(argc, argv); + + snprintf(filename1, sizeof(filename1), "%s.0", filename); + snprintf(filename2, sizeof(filename2), "%s.1", filename); + snprintf(filename3, sizeof(filename3), "%s.2", filename); + + act.sa_sigaction = break_sigaction; + act.sa_flags = SA_SIGINFO; + + if (sigemptyset(&act.sa_mask) || + sigaddset(&act.sa_mask, BREAK_SIGNUM) || + sigaction(BREAK_SIGNUM, &act, NULL)) { + pr_perror("Can't set signal action\n"); + fail(); + return -1; + } + + sigaction_error = 0; + fds[0] = prepare_file(filename1, O_RDONLY, O_WRONLY); + fds[1] = prepare_file(filename2, O_WRONLY, O_RDONLY); + fds[2] = prepare_file(filename3, O_WRONLY, O_WRONLY); + if 
(fds[0] < 0 || fds[1] < 0 || fds[2] < 0 || sigaction_error) + goto done; + + test_daemon(); + test_waitsig(); + + ret = 0; + if (sigaction_error) + fail("Ghost signal\n"); + else if (check_lease_type(fds[0], F_UNLCK) || + check_lease_type(fds[1], F_RDLCK) || + check_lease_type(fds[2], F_UNLCK)) + fail("Lease type doesn't match\n"); + else + pass(); +done: + close_files(fds); + return ret; +} + diff --git a/CRIU_code/test/zdtm/static/file_lease02.desc b/CRIU_code/test/zdtm/static/file_lease02.desc new file mode 100644 index 0000000..fba66d3 --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_lease02.desc @@ -0,0 +1 @@ +file_lease00.desc \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/file_lease03.c b/CRIU_code/test/zdtm/static/file_lease03.c new file mode 100644 index 0000000..4cde2b6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_lease03.c @@ -0,0 +1,146 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +#define BREAK_SIGNUM SIGIO + +const char *test_doc = "Check multiple fds on OFD with breaking leases"; +const char *test_author = "Pavel Begunkov "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +int expected_fd; +int sigaction_error; + +static void break_sigaction(int signo, siginfo_t *info, void *ctx) +{ + if (signo != BREAK_SIGNUM) { + pr_err("Unexpected signal(%i)\n", signo); + sigaction_error = -1; + } else if (info->si_fd != expected_fd) { + pr_err("Unexpected fd(%i)\n", info->si_fd); + sigaction_error = -1; + } + expected_fd = -1; +} + +static int check_lease_type(int fd, int expected_type) +{ + int lease_type = fcntl(fd, F_GETLEASE); + + if (lease_type != expected_type) { + if (lease_type < 0) + pr_perror("Can't acquire lease type\n"); + else + pr_err("Mismatched lease type: %i\n", lease_type); + return -1; + } + return 0; +} + +static int prepare_file(char *file, int file_type, int break_type) +{ + int fd, fd_break; + int lease_type = (file_type == O_RDONLY) ? 
F_RDLCK : F_WRLCK; + + fd = open(file, file_type | O_CREAT, 0666); + if (fd < 0) { + pr_perror("Can't open file (type %i)\n", file_type); + return fd; + } + if (fcntl(fd, F_SETLEASE, lease_type) < 0) { + pr_perror("Can't set exclusive lease\n"); + goto err; + } + if (fcntl(fd, F_SETSIG, BREAK_SIGNUM) < 0) { + pr_perror("Can't set signum for file i/o\n"); + goto err; + } + + expected_fd = fd; + fd_break = open(file, break_type | O_NONBLOCK); + + if (fd_break >= 0) { + close(fd_break); + pr_err("Conflicting lease not found\n"); + goto err; + } else if (errno != EWOULDBLOCK) { + pr_perror("Can't break lease\n"); + goto err; + } + return fd; +err: + close(fd); + return -1; +} + +int main(int argc, char **argv) +{ + int fd = -1, fd_dup = -1; + int status, ret = -1; + struct sigaction act = {}; + pid_t pid; + + test_init(argc, argv); + + act.sa_sigaction = break_sigaction; + act.sa_flags = SA_SIGINFO; + if (sigemptyset(&act.sa_mask) || + sigaddset(&act.sa_mask, BREAK_SIGNUM) || + sigaction(BREAK_SIGNUM, &act, NULL)) { + pr_perror("Can't set signal action\n"); + return -1; + } + + sigaction_error = 0; + fd = prepare_file(filename, O_RDWR, O_WRONLY); + if (fd < 0 || sigaction_error) + goto done; + + pid = fork(); + if (pid < 0) + return 1; + if (pid == 0) { + test_waitsig(); + if (check_lease_type(fd, F_UNLCK)) + return 1; + close(fd); + return 0; + } + + ret = fd_dup = dup(fd); + if (fd_dup < 0) { + pr_perror("Can't dup fd\n"); + goto done; + } + + ret = 0; + + test_daemon(); + test_waitsig(); + + kill(pid, SIGTERM); + ret = waitpid(pid, &status, 0); + + if (ret < 0 || !WIFEXITED(status) || WEXITSTATUS(status)) + fail(); + if (sigaction_error) + fail("Ghost signal\n"); + else if (check_lease_type(fd, F_UNLCK)) + fail("Lease type doesn't match\n"); + else + pass(); + +done: + if (fd >= 0) + close(fd); + if (fd_dup >= 0) + close(fd_dup); + unlink(filename); + return ret; +} + diff --git a/CRIU_code/test/zdtm/static/file_lease03.desc 
b/CRIU_code/test/zdtm/static/file_lease03.desc new file mode 100644 index 0000000..fba66d3 --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_lease03.desc @@ -0,0 +1 @@ +file_lease00.desc \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/file_lease04.c b/CRIU_code/test/zdtm/static/file_lease04.c new file mode 100644 index 0000000..c924f6c --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_lease04.c @@ -0,0 +1,132 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + /* Signal that prepare_file() selects with F_SETSIG on the leased fd and that main() installs break_sigaction() for. */ +#define BREAK_SIGNUM SIGIO + +const char *test_doc = "Check leases with no fds in owner process"; +const char *test_author = "Pavel Begunkov "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + /* State shared with the signal handler: expected_fd is the fd the handler should see in si_fd (the handler resets it to -1); sigaction_error is set to -1 by the handler on an unexpected signal or fd. */ +int expected_fd; +int sigaction_error; + /* SA_SIGINFO handler for BREAK_SIGNUM: records an error when signo is not BREAK_SIGNUM or info->si_fd differs from expected_fd, then always clears expected_fd. ctx is unused. */ +static void break_sigaction(int signo, siginfo_t *info, void *ctx) +{ + if (signo != BREAK_SIGNUM) { + pr_err("Unexpected signal(%i)\n", signo); + sigaction_error = -1; + } else if (info->si_fd != expected_fd) { + pr_err("Unexpected fd(%i)\n", info->si_fd); + sigaction_error = -1; + } + expected_fd = -1; +} + /* Query fd's lease with F_GETLEASE and compare it to expected_type. Returns 0 on a match; otherwise logs (fcntl failure vs. mismatch) and returns -1. */ +static int check_lease_type(int fd, int expected_type) +{ + int lease_type = fcntl(fd, F_GETLEASE); + + if (lease_type != expected_type) { + if (lease_type < 0) + pr_perror("Can't acquire lease type\n"); + else + pr_err("Mismatched lease type: %i\n", lease_type); + return -1; + } + return 0; +} + /* Open file (creating it if needed) with file_type, set a lease on it (F_RDLCK when file_type is O_RDONLY, F_WRLCK otherwise), select BREAK_SIGNUM via F_SETSIG, then attempt a second non-blocking open with break_type that must fail with EWOULDBLOCK. Returns the leased fd on success, a negative value on failure (the fd is closed on every failure after the first open). */ +static int prepare_file(char *file, int file_type, int break_type) +{ + int fd, fd_break; + int lease_type = (file_type == O_RDONLY) ?
F_RDLCK : F_WRLCK; + + fd = open(file, file_type | O_CREAT, 0666); + if (fd < 0) { + pr_perror("Can't open file (type %i)\n", file_type); + return fd; + } + if (fcntl(fd, F_SETLEASE, lease_type) < 0) { + pr_perror("Can't set exclusive lease\n"); + goto err; + } + if (fcntl(fd, F_SETSIG, BREAK_SIGNUM) < 0) { + pr_perror("Can't set signum for file i/o\n"); + goto err; + } + /* Tell the handler which fd to expect before making the conflicting open. */ + expected_fd = fd; + fd_break = open(file, break_type | O_NONBLOCK); + /* A successful open means no lease conflicted; the only accepted failure of the non-blocking open is EWOULDBLOCK. */ + if (fd_break >= 0) { + close(fd_break); + pr_err("Conflicting lease not found\n"); + goto err; + } else if (errno != EWOULDBLOCK) { + pr_perror("Can't break lease\n"); + goto err; + } + return fd; +err: + close(fd); + return -1; +} + /* Test flow: install break_sigaction() for BREAK_SIGNUM, build a leased file via prepare_file(filename, O_RDWR, O_WRONLY), fork a child that keeps the inherited fd, close the parent's copy, and call test_daemon()/test_waitsig(). Once its own test_waitsig() returns, the child checks that the lease on its fd reads as F_UNLCK and reports through its exit status; the parent then reaps the child and calls pass() or fail(). */ +int main(int argc, char **argv) +{ + int fd = -1; + int status, ret = -1; + struct sigaction act = {}; + pid_t pid; + + test_init(argc, argv); + + act.sa_sigaction = break_sigaction; + act.sa_flags = SA_SIGINFO; + if (sigemptyset(&act.sa_mask) || + sigaddset(&act.sa_mask, BREAK_SIGNUM) || + sigaction(BREAK_SIGNUM, &act, NULL)) { + pr_perror("Can't set signal action\n"); + return -1; + } + + sigaction_error = 0; + fd = prepare_file(filename, O_RDWR, O_WRONLY); + if (fd < 0 || sigaction_error) + goto done; + + pid = fork(); + if (pid < 0) + return 1; /* NOTE(review): this return bypasses the done: label, so fd stays open and filename is not unlinked on fork failure. */ + if (pid == 0) { + test_waitsig(); + if (check_lease_type(fd, F_UNLCK)) + return 1; + close(fd); + return 0; + } + close(fd); /* From here on only the child holds a descriptor for the leased file. */ + + test_daemon(); + test_waitsig(); + + kill(pid, SIGTERM); + ret = waitpid(pid, &status, 0); + /* NOTE(review): on success ret now holds waitpid()'s return value, i.e. the child's pid, so main returns a non-zero value even after pass() -- confirm whether the harness relies on the exit code. */ + if (ret < 0 || !WIFEXITED(status) || WEXITSTATUS(status)) + fail(); + else if (sigaction_error) + fail("Ghost signal\n"); + else + pass(); +done: + unlink(filename); + return ret; +} + diff --git a/CRIU_code/test/zdtm/static/file_lease04.desc b/CRIU_code/test/zdtm/static/file_lease04.desc new file mode 100644 index 0000000..fba66d3 --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_lease04.desc @@ -0,0 +1 @@ +file_lease00.desc \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/file_locks00.c
b/CRIU_code/test/zdtm/static/file_locks00.c new file mode 100644 index 0000000..59e19cf --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_locks00.c @@ -0,0 +1,197 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that posix flocks are restored"; +const char *test_author = "Qiang Huang "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +char file0[PATH_MAX]; +char file1[PATH_MAX]; + +static int lock_reg(int fd, int cmd, int type, int whence, + off_t offset, off_t len) +{ + struct flock lock; + + lock.l_type = type; /* F_RDLCK, F_WRLCK, F_UNLCK */ + lock.l_whence = whence; /* SEEK_SET, SEEK_CUR, SEEK_END */ + lock.l_start = offset; /* byte offset, relative to l_whence */ + lock.l_len = len; /* #bytes (0 means to EOF) */ + + errno = 0; + return fcntl(fd, cmd, &lock); +} + +#define set_read_lock(fd, whence, offset, len) \ + lock_reg(fd, F_SETLK, F_RDLCK, whence, offset, len) +#define set_write_lock(fd, whence, offset, len) \ + lock_reg(fd, F_SETLK, F_WRLCK, whence, offset, len) + +static int check_read_lock(int fd, int whence, off_t offset, off_t len) +{ + struct flock lock; + int ret; + + lock.l_type = F_RDLCK; /* F_RDLCK, F_WRLCK, F_UNLCK */ + lock.l_whence = whence; /* SEEK_SET, SEEK_CUR, SEEK_END */ + lock.l_start = offset; /* byte offset, relative to l_whence */ + lock.l_len = len; /* #bytes (0 means to EOF) */ + lock.l_pid = -1; + + errno = 0; + ret = fcntl(fd, F_GETLK, &lock); + if (ret == -1) { + pr_perror("F_GETLK failed."); + return -1; + } + + if (lock.l_pid == -1) { + /* Share lock should succeed. 
*/ + return 0; + } + + fail("Read lock check failed."); + return -1; +} + +static int check_write_lock(int fd, int whence, off_t offset, off_t len) +{ + struct flock lock; + + int ret; + pid_t ppid = getppid(); + + lock.l_type = F_WRLCK; /* F_RDLCK, F_WRLCK, F_UNLCK */ + lock.l_whence = whence; /* SEEK_SET, SEEK_CUR, SEEK_END */ + lock.l_start = offset; /* byte offset, relative to l_whence */ + lock.l_len = len; /* #bytes (0 means to EOF) */ + lock.l_pid = -1; + + errno = 0; + ret = fcntl(fd, F_GETLK, &lock); + if (ret == -1) { + pr_perror("F_GETLK failed."); + return -1; + } + + if (lock.l_pid == -1) { + fail("Write lock check failed."); + return -1; + } + + /* + * It only succeed when the file lock's owner is exactly + * the same as the file lock was dumped. + */ + if (lock.l_pid == ppid) + return 0; + + fail("Write lock check failed."); + return -1; +} + +static int check_file_locks() +{ + int fd_0, fd_1; + int ret0, ret1; + + fd_0 = open(file0, O_RDWR | O_CREAT, 0644); + if (fd_0 < 0) { + pr_perror("Unable to open file %s", file0); + return -1; + } + ret0 = check_read_lock(fd_0, SEEK_SET, 0, 0); + + fd_1 = open(file1, O_RDWR | O_CREAT, 0644); + if (fd_1 < 0) { + close(fd_0); + unlink(file0); + pr_perror("Unable to open file %s", file1); + return -1; + } + ret1 = check_write_lock(fd_1, SEEK_SET, 0, 0); + + close(fd_0); + close(fd_1); + + return ret0 | ret1; +} + +int main(int argc, char **argv) +{ + int fd_0, fd_1, ret; + pid_t pid; + + test_init(argc, argv); + + snprintf(file0, sizeof(file0), "%s.0", filename); + snprintf(file1, sizeof(file0), "%s.1", filename); + fd_0 = open(file0, O_RDWR | O_CREAT | O_EXCL, 0666); + if (fd_0 < 0) { + pr_perror("Unable to open file %s", file0); + return -1; + } + + fd_1 = open(file1, O_RDWR | O_CREAT | O_EXCL, 0666); + if (fd_1 < 0) { + close(fd_0); + unlink(file0); + pr_perror("Unable to open file %s", file1); + return -1; + } + + pid = fork(); + if (pid < 0) { + pr_perror("Can't fork"); + return -1; + } + + if (pid == 0) { 
/* child will check father's file locks */ + test_waitsig(); + + if (check_file_locks()) { + fail("Posix file lock check failed"); + exit(1); + } + + pass(); + exit(0); + } + + ret = set_read_lock(fd_0, SEEK_SET, 0, 0); + if (ret == -1) { + pr_perror("Failed to set read lock"); + kill(pid, SIGTERM); + return -1; + } + + ret = set_write_lock(fd_1, SEEK_SET, 0, 0); + if (ret == -1) { + pr_perror("Failed to set write lock"); + kill(pid, SIGTERM); + return -1; + } + + test_daemon(); + test_waitsig(); + + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd_0); + close(fd_1); + unlink(file0); + unlink(file1); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/file_locks00.desc b/CRIU_code/test/zdtm/static/file_locks00.desc new file mode 100644 index 0000000..80cd04e --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_locks00.desc @@ -0,0 +1 @@ +{'flags': 'excl', 'opts': '--file-locks'} diff --git a/CRIU_code/test/zdtm/static/file_locks01.c b/CRIU_code/test/zdtm/static/file_locks01.c new file mode 100644 index 0000000..c08c40b --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_locks01.c @@ -0,0 +1,194 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" +#include "fs.h" + +#ifndef LOCK_MAND +#define LOCK_MAND 32 +#endif + +#ifndef LOCK_READ +#define LOCK_READ 64 +#endif + +const char *test_doc = "Check that flock locks are restored"; +const char *test_author = "Qiang Huang "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +char file0[PATH_MAX]; +char file1[PATH_MAX]; +char file2[PATH_MAX]; +unsigned long inodes[3]; +static mnt_info_t *m; +dev_t dev; + +static int open_all_files(int *fd_0, int *fd_1, int *fd_2) +{ + struct stat buf; + + snprintf(file0, sizeof(file0), "%s.0", filename); + snprintf(file1, sizeof(file0), "%s.1", filename); + snprintf(file2, sizeof(file0), "%s.2", filename); + *fd_0 = open(file0, O_RDWR | O_CREAT | O_EXCL, 0666); + if (*fd_0 < 0) { + pr_perror("Unable to 
open file %s", file0); + return -1; + } + + fstat(*fd_0, &buf); + inodes[0] = buf.st_ino; + + if (!strcmp(m->fsname, "btrfs")) + dev = m->s_dev; + else + dev = buf.st_dev; + + *fd_1 = open(file1, O_RDWR | O_CREAT | O_EXCL, 0666); + if (*fd_1 < 0) { + close(*fd_0); + unlink(file0); + pr_perror("Unable to open file %s", file1); + return -1; + } + + fstat(*fd_1, &buf); + inodes[1] = buf.st_ino; + + *fd_2 = open(file2, O_RDWR | O_CREAT | O_EXCL, 0666); + if (*fd_2 < 0) { + close(*fd_0); + close(*fd_1); + unlink(file0); + unlink(file1); + pr_perror("Unable to open file %s", file2); /* report the file that actually failed */ + return -1; + } + fstat(*fd_2, &buf); + inodes[2] = buf.st_ino; + + return 0; +} + +static int check_file_lock(int fd, char *expected_type, + char *expected_option, + unsigned int expected_dev, + unsigned long expected_ino) +{ + char buf[100], fl_flag[16], fl_type[16], fl_option[16]; + int found = 0, num, fl_owner; + FILE *fp_locks = NULL; + char path[PATH_MAX]; + unsigned long i_no; + int maj, min; + + test_msg("check_file_lock: (fsname %s) expecting fd %d type %s option %s dev %u ino %lu\n", + m->fsname, fd, expected_type, expected_option, expected_dev, expected_ino); + + snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd); + fp_locks = fopen(path, "r"); + if (!fp_locks) { + pr_err("Can't open %s\n", path); + return -1; + } + + while (fgets(buf, sizeof(buf), fp_locks)) { + if (strncmp(buf, "lock:\t", 6) != 0) + continue; + test_msg("c: %s", buf); + + memset(fl_flag, 0, sizeof(fl_flag)); + memset(fl_type, 0, sizeof(fl_type)); + memset(fl_option, 0, sizeof(fl_option)); + + num = sscanf(buf, "%*s %*d:%s %s %s %d %x:%x:%ld %*d %*s", + fl_flag, fl_type, fl_option, &fl_owner, + &maj, &min, &i_no); + if (num < 7) { + pr_perror("Invalid lock info."); + break; + } + + if (!strcmp(m->fsname, "btrfs")) { + if (MKKDEV(major(maj), minor(min)) != expected_dev) + continue; + } else { + if (makedev(maj, min) != expected_dev) + continue; + } + + if (fl_owner != getpid()) + continue; + if (i_no != 
expected_ino) + continue; + if (strcmp(fl_flag, "FLOCK")) + continue; + if (strcmp(fl_type, expected_type)) + continue; + if (strcmp(fl_option, expected_option)) + continue; + found++; + } + + fclose(fp_locks); + + return found == 1 ? 0 : -1; +} + +int main(int argc, char **argv) +{ + int fd_0, fd_1, fd_2, ret = 0; + + test_init(argc, argv); + + m = get_cwd_mnt_info(); + if (!m) { + pr_perror("Can't fetch mountinfo"); + return -1; + } + if (!strcmp(m->fsname, "btrfs")) + m->s_dev = kdev_to_odev(m->s_dev); + + if (open_all_files(&fd_0, &fd_1, &fd_2)) + return -1; + + flock(fd_0, LOCK_SH); + flock(fd_1, LOCK_EX); + flock(fd_2, LOCK_MAND | LOCK_READ); + + test_daemon(); + test_waitsig(); + + if (check_file_lock(fd_0, "ADVISORY", "READ", dev, inodes[0])) { + fail("Failed on fd %d", fd_0); + ret |= 1; + } + if (check_file_lock(fd_1, "ADVISORY", "WRITE", dev, inodes[1])) { + fail("Failed on fd %d", fd_1); + ret |= 1; + } + if (check_file_lock(fd_2, "MSNFS", "READ", dev, inodes[2])) { + fail("Failed on fd %d", fd_2); + ret |= 1; + } + + if (!ret) + pass(); + + close(fd_0); + close(fd_1); + close(fd_2); + unlink(file0); + unlink(file1); + unlink(file2); + + return ret; +} diff --git a/CRIU_code/test/zdtm/static/file_locks01.desc b/CRIU_code/test/zdtm/static/file_locks01.desc new file mode 100644 index 0000000..80cd04e --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_locks01.desc @@ -0,0 +1 @@ +{'flags': 'excl', 'opts': '--file-locks'} diff --git a/CRIU_code/test/zdtm/static/file_locks02.c b/CRIU_code/test/zdtm/static/file_locks02.c new file mode 100644 index 0000000..91d1385 --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_locks02.c @@ -0,0 +1,105 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that 'shared' flocks work"; +const char *test_author = "Pavel Emelyanov "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +static int check_file_lock(pid_t pid, 
pid_t child, int fd, char *expected_type, + char *expected_option) +{ + char buf[100], fl_flag[16], fl_type[16], fl_option[16]; + int found = 0, num, fl_owner; + FILE *fp_locks = NULL; + char path[PATH_MAX]; + + test_msg("check_file_lock: (pid %d child %d) expecting fd %d type %s option %s\n", + pid, child, fd, expected_type, expected_option); + + snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd); + fp_locks = fopen(path, "r"); + if (!fp_locks) { + pr_err("Can't open %s\n", path); + return -1; + } + + while (fgets(buf, sizeof(buf), fp_locks)) { + if (strncmp(buf, "lock:\t", 6) != 0) + continue; + test_msg("c: %s", buf); + + memset(fl_flag, 0, sizeof(fl_flag)); + memset(fl_type, 0, sizeof(fl_type)); + memset(fl_option, 0, sizeof(fl_option)); + + num = sscanf(buf, "%*s %*d:%s %s %s %d", + fl_flag, fl_type, fl_option, &fl_owner); + if (num < 4) { + pr_perror("Invalid lock info."); + break; + } + + if (fl_owner != pid && fl_owner != child) + continue; + if (strcmp(fl_flag, "FLOCK")) + continue; + if (strcmp(fl_type, expected_type)) + continue; + if (strcmp(fl_option, expected_option)) + continue; + found++; + } + + fclose(fp_locks); + + return found == 1 ? 
0 : -1; +} + +int main(int argc, char **argv) +{ + int fd, pid, ret = 0; + + test_init(argc, argv); + + fd = open(filename, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + pr_perror("No file"); + return -1; + } + + flock(fd, LOCK_EX); + + pid = fork(); + if (pid == 0) { + test_waitsig(); + exit(0); + } + + test_daemon(); + test_waitsig(); + + if (check_file_lock(getpid(), pid, fd, "ADVISORY", "WRITE")) { + fail("Flock file locks check failed"); + ret |= 1; + } + + if (!ret) + pass(); + + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd); + unlink(filename); + + return ret; +} diff --git a/CRIU_code/test/zdtm/static/file_locks02.desc b/CRIU_code/test/zdtm/static/file_locks02.desc new file mode 100644 index 0000000..80cd04e --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_locks02.desc @@ -0,0 +1 @@ +{'flags': 'excl', 'opts': '--file-locks'} diff --git a/CRIU_code/test/zdtm/static/file_locks03.c b/CRIU_code/test/zdtm/static/file_locks03.c new file mode 100644 index 0000000..19c11db --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_locks03.c @@ -0,0 +1,111 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that 'inherited' flocks work"; +const char *test_author = "Pavel Emelyanov "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +static int check_file_lock(pid_t pid, pid_t child, int fd, char *expected_type, + char *expected_option) +{ + char buf[100], fl_flag[16], fl_type[16], fl_option[16]; + int found = 0, num, fl_owner; + FILE *fp_locks = NULL; + char path[PATH_MAX]; + + test_msg("check_file_lock: (pid %d child %d) expecting fd %d type %s option %s\n", + pid, child, fd, expected_type, expected_option); + + snprintf(path, sizeof(path), "/proc/%d/fdinfo/%d", child, fd); + fp_locks = fopen(path, "r"); + if (!fp_locks) { + pr_err("Can't open %s\n", path); + return -1; + } + + while (fgets(buf, sizeof(buf), fp_locks)) { + if (strncmp(buf, "lock:\t", 6) 
!= 0) + continue; + test_msg("c: %s", buf); + + memset(fl_flag, 0, sizeof(fl_flag)); + memset(fl_type, 0, sizeof(fl_type)); + memset(fl_option, 0, sizeof(fl_option)); + + num = sscanf(buf, "%*s %*d:%s %s %s %d", + fl_flag, fl_type, fl_option, &fl_owner); + if (num < 4) { + pr_perror("Invalid lock info."); + break; + } + + if (fl_owner != pid && fl_owner != child) + continue; + if (strcmp(fl_flag, "FLOCK")) + continue; + if (strcmp(fl_type, expected_type)) + continue; + if (strcmp(fl_option, expected_option)) + continue; + found++; + } + + fclose(fp_locks); + + return found == 1 ? 0 : -1; +} + +int main(int argc, char **argv) +{ + int fd, pid, ret = 0; + task_waiter_t t; + + test_init(argc, argv); + task_waiter_init(&t); + + fd = open(filename, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + pr_perror("No file"); + return -1; + } + + flock(fd, LOCK_EX); + + pid = fork(); + if (pid == 0) { + test_waitsig(); + task_waiter_wait4(&t, 1); + exit(0); + } + + close(fd); + + test_daemon(); + test_waitsig(); + + if (check_file_lock(getpid(), pid, fd, "ADVISORY", "WRITE")) { + fail("Flock file locks check failed"); + ret |= 1; + } + + task_waiter_complete(&t, 1); + + if (!ret) + pass(); + + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + unlink(filename); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/file_locks03.desc b/CRIU_code/test/zdtm/static/file_locks03.desc new file mode 100644 index 0000000..80cd04e --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_locks03.desc @@ -0,0 +1 @@ +{'flags': 'excl', 'opts': '--file-locks'} diff --git a/CRIU_code/test/zdtm/static/file_locks04.c b/CRIU_code/test/zdtm/static/file_locks04.c new file mode 100644 index 0000000..96170d5 --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_locks04.c @@ -0,0 +1,120 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that 'overlapping' flocks work"; +const char *test_author = "Pavel Emelyanov "; + +char 
*filename; +TEST_OPTION(filename, string, "file name", 1); + +static int check_file_locks(pid_t child_pid, int fd, int child_fd) +{ + char path[PATH_MAX]; + FILE *fp_locks = NULL; + char buf[100], fl_flag[16], fl_type[16], fl_option[16]; + int found = 0, num, fl_owner; + + sprintf(path, "/proc/%d/fdinfo/%d", child_pid, child_fd); + fp_locks = fopen(path, "r"); + if (!fp_locks) { + pr_err("Can't open %s\n", path); + return -1; + } + + while (fgets(buf, sizeof(buf), fp_locks)) { + if (strncmp(buf, "lock:\t", 6) != 0) + continue; + test_msg("c: %s", buf); + + num = sscanf(buf, + "%*s %*d:%s %s %s %d %*02x:%*02x:%*d %*d %*s", + fl_flag, fl_type, fl_option, &fl_owner); + + if (num < 4) { + pr_perror("Invalid lock info."); + break; + } + + if (fl_owner != child_pid && fl_owner != getpid()) { + pr_err("Wrong owner\n"); + continue; + } + + if (!strcmp(fl_flag, "FLOCK") && + !strcmp(fl_type, "ADVISORY") && + !strcmp(fl_option, "WRITE")) + found++; + + memset(fl_flag, 0, sizeof(fl_flag)); + memset(fl_type, 0, sizeof(fl_type)); + memset(fl_option, 0, sizeof(fl_option)); + } + + fclose(fp_locks); + + if (flock(fd, LOCK_EX | LOCK_NB) == 0) + return 0; + + return found == 1; +} + +int main(int argc, char **argv) +{ + int fd, child_fd, pid; + + test_init(argc, argv); + + fd = child_fd = open(filename, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + pr_perror("No file"); + return -1; + } + + flock(fd, LOCK_EX); + + pid = fork(); + if (pid == 0) { + test_waitsig(); + exit(0); + } + + close(fd); + + fd = open(filename, O_RDONLY); + if (fd < 0) { + pr_perror("No file 2"); + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + return -1; + } + + if (flock(fd, LOCK_EX | LOCK_NB) == 0) { + pr_perror("Bogus locks"); + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + return -1; + } + + test_daemon(); + test_waitsig(); + + if (check_file_locks(pid, fd, child_fd) > 0) + pass(); + else + fail("Flock file locks check failed"); + + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + close(fd); + 
unlink(filename); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/file_locks04.desc b/CRIU_code/test/zdtm/static/file_locks04.desc new file mode 100644 index 0000000..41aad3f --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_locks04.desc @@ -0,0 +1 @@ +{'flags': 'excl', 'opts': '--file-locks', 'feature': 'fdinfo_lock'} diff --git a/CRIU_code/test/zdtm/static/file_locks05.c b/CRIU_code/test/zdtm/static/file_locks05.c new file mode 100644 index 0000000..33877f8 --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_locks05.c @@ -0,0 +1,50 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Sanity check for criu lock-test quirk"; +const char *test_author = "Pavel Emelyanov "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +int main(int argc, char **argv) +{ + int fd, fd2; + + test_init(argc, argv); + + fd = open(filename, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + pr_perror("No file"); + return -1; + } + + fd2 = open(filename, O_RDWR); + if (fd2 < 0) { + pr_perror("No file2"); + return -1; + } + + flock(fd, LOCK_SH); + + test_daemon(); + test_waitsig(); + + if (flock(fd2, LOCK_SH) == 0) + pass(); + else + fail("Flock file locks check failed (%d)", errno); + + close(fd); + close(fd2); + unlink(filename); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/file_locks05.desc b/CRIU_code/test/zdtm/static/file_locks05.desc new file mode 100644 index 0000000..80cd04e --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_locks05.desc @@ -0,0 +1 @@ +{'flags': 'excl', 'opts': '--file-locks'} diff --git a/CRIU_code/test/zdtm/static/file_locks06.c b/CRIU_code/test/zdtm/static/file_locks06.c new file mode 100644 index 0000000..780fb07 --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_locks06.c @@ -0,0 +1,65 @@ +#include +#include +#include + +#include "ofd_file_locks.h" +#include "zdtmtst.h" + +const char *test_doc = "Check that OFD lock for the whole file is restored"; 
+const char *test_author = "Begunkov Pavel "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + + +int init_lock(int *fd, struct flock *lck) +{ + *fd = open(filename, O_RDWR | O_CREAT, 0666); + if (*fd < 0) { + pr_perror("Can't open file"); + return -1; + } + + lck->l_type = F_WRLCK; + lck->l_whence = SEEK_SET; + lck->l_start = 0; + lck->l_len = 0; + lck->l_pid = 0; + + if (zdtm_fcntl(*fd, F_OFD_SETLK, lck) < 0) { + pr_perror("Can't set ofd lock"); + return -1; + } + return 0; +} + +void cleanup(int *fd) +{ + if (close(*fd)) + pr_perror("Can't close fd\n"); + + if (unlink(filename)) + pr_perror("Can't unlink file\n"); +} + +int main(int argc, char **argv) +{ + int fd; + struct flock lck; + + test_init(argc, argv); + if (init_lock(&fd, &lck)) + return 1; + + test_daemon(); + test_waitsig(); + + if (check_file_lock_restored(getpid(), fd, &lck) || + check_lock_exists(filename, &lck) < 0) + fail("OFD file locks check failed\n"); + else + pass(); + + cleanup(&fd); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/file_locks06.checkskip b/CRIU_code/test/zdtm/static/file_locks06.checkskip new file mode 100644 index 0000000..06ab585 --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_locks06.checkskip @@ -0,0 +1,19 @@ +#!/usr/bin/env python +import fcntl +import tempfile +import struct +import errno + +F_OFD_SETLK = 37 + +try: + with tempfile.TemporaryFile() as fd: + flock = struct.pack('hhllhh', fcntl.F_RDLCK, 0, 0, 0, 0, 0) + fcntl.fcntl(fd.fileno(), F_OFD_SETLK, flock) +except IOError as e: + if e.errno == errno.EINVAL: + print("I/O error({0}): {1}".format(e.errno, e.strerror)) + print("OFD locks are not supported.") + exit(1) + +exit(0) diff --git a/CRIU_code/test/zdtm/static/file_locks06.desc b/CRIU_code/test/zdtm/static/file_locks06.desc new file mode 100644 index 0000000..80cd04e --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_locks06.desc @@ -0,0 +1 @@ +{'flags': 'excl', 'opts': '--file-locks'} diff --git 
a/CRIU_code/test/zdtm/static/file_locks07.c b/CRIU_code/test/zdtm/static/file_locks07.c new file mode 100644 index 0000000..2fe169f --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_locks07.c @@ -0,0 +1,99 @@ +#include +#include +#include + +#include "ofd_file_locks.h" +#include "zdtmtst.h" + +const char *test_doc = "Check that 'overlapping' OFD read locks work"; +const char *test_author = "Begunkov Pavel "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + + +#define FILE_NUM 4 +static int fds[FILE_NUM]; +static struct flock lcks[FILE_NUM]; +static short types[] = {F_RDLCK, F_RDLCK, F_RDLCK, F_RDLCK}; +static off_t starts[] = {0, 10, 0, 70}; +static off_t lens[] = {20, 30, 100, 200}; + +void fill_lock(struct flock *lock, off_t start, off_t len, short int type) +{ + lock->l_start = start; + lock->l_len = len; + lock->l_type = type; + lock->l_whence = SEEK_SET; + lock->l_pid = 0; +} + +int init_file_locks(void) +{ + size_t i; + + for (i = 0; i < FILE_NUM; ++i) + fill_lock(&lcks[i], starts[i], lens[i], types[i]); + + for (i = 0; i < FILE_NUM; ++i) { + fds[i] = open(filename, O_RDWR | O_CREAT, 0666); + + if (fds[i] < 0) { + pr_perror("Can't open file"); + return -1; + } + } + + for (i = 0; i < FILE_NUM; ++i) + if (zdtm_fcntl(fds[i], F_OFD_SETLKW, &lcks[i]) < 0) { + pr_perror("Can't set ofd lock"); + return -1; + } + + return 0; +} + +void cleanup(void) +{ + size_t i; + + for (i = 0; i < FILE_NUM; ++i) + if (close(fds[i])) + pr_perror("Can't close fd\n"); + + if (unlink(filename)) + pr_perror("Can't unlink file failed\n"); +} + +int check_file_locks_restored(void) +{ + size_t i; + int pid = getpid(); + + for (i = 0; i < FILE_NUM; ++i) { + if (check_file_lock_restored(pid, fds[i], &lcks[i])) + return -1; + if (check_lock_exists(filename, &lcks[i]) < 0) + return -1; + } + + return 0; +} + +int main(int argc, char **argv) +{ + test_init(argc, argv); + if (init_file_locks()) + return -1; + + test_daemon(); + test_waitsig(); + + if 
(check_file_locks_restored()) + fail("OFD file locks check failed\n"); + else + pass(); + + cleanup(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/file_locks07.checkskip b/CRIU_code/test/zdtm/static/file_locks07.checkskip new file mode 100644 index 0000000..d3acb70 --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_locks07.checkskip @@ -0,0 +1 @@ +file_locks06.checkskip \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/file_locks07.desc b/CRIU_code/test/zdtm/static/file_locks07.desc new file mode 100644 index 0000000..80cd04e --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_locks07.desc @@ -0,0 +1 @@ +{'flags': 'excl', 'opts': '--file-locks'} diff --git a/CRIU_code/test/zdtm/static/file_locks08.c b/CRIU_code/test/zdtm/static/file_locks08.c new file mode 100644 index 0000000..fea8d9e --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_locks08.c @@ -0,0 +1,90 @@ +#include +#include +#include +#include +#include + +#include "ofd_file_locks.h" +#include "zdtmtst.h" + +const char *test_doc = "Check that inherited OFD locks work"; +const char *test_author = "Begunkov Pavel "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + + +int init_file_lock(int *fd, struct flock *lck) +{ + *fd = open(filename, O_RDWR | O_CREAT, 0666); + if (*fd < 0) { + pr_perror("Can't open file"); + return -1; + } + + lck->l_type = F_WRLCK; + lck->l_whence = SEEK_SET; + lck->l_start = 0; + lck->l_len = 0; /* lock whole file */ + lck->l_pid = 0; /* should be 0 for ofd lock */ + + if (zdtm_fcntl(*fd, F_OFD_SETLKW, lck) < 0) { + pr_perror("Can't set ofd lock"); + return -1; + } + return 0; +} + +void cleanup(int *fd) +{ + if (close(*fd)) + pr_perror("Can't close fd\n"); + + if (unlink(filename)) + pr_perror("Can't unlink file\n"); +} + +int main(int argc, char **argv) +{ + int fd; + int pid; + int status; + int ret = 0; + task_waiter_t tw; + struct flock lck; + + test_init(argc, argv); + if (init_file_lock(&fd, &lck)) + return -1; + + 
task_waiter_init(&tw); + + pid = fork(); + if (pid == 0) { + task_waiter_wait4(&tw, getppid()); + if (check_file_lock_restored(getpid(), fd, &lck) || + check_lock_exists(filename, &lck) < 0) + ret = -1; + exit(ret); + } + + test_daemon(); + test_waitsig(); + + task_waiter_complete_current(&tw); + + if (check_file_lock_restored(getpid(), fd, &lck) || + check_lock_exists(filename, &lck) < 0) + fail("OFD file locks check failed\n"); + + kill(pid, SIGTERM); + ret = waitpid(pid, &status, 0); + + if (ret < 0 || !WIFEXITED(status) || WEXITSTATUS(status)) + fail("OFD file locks check failed\n"); + else + pass(); + + cleanup(&fd); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/file_locks08.checkskip b/CRIU_code/test/zdtm/static/file_locks08.checkskip new file mode 100644 index 0000000..d3acb70 --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_locks08.checkskip @@ -0,0 +1 @@ +file_locks06.checkskip \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/file_locks08.desc b/CRIU_code/test/zdtm/static/file_locks08.desc new file mode 100644 index 0000000..80cd04e --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_locks08.desc @@ -0,0 +1 @@ +{'flags': 'excl', 'opts': '--file-locks'} diff --git a/CRIU_code/test/zdtm/static/file_shared.c b/CRIU_code/test/zdtm/static/file_shared.c new file mode 100644 index 0000000..8f1acb1 --- /dev/null +++ b/CRIU_code/test/zdtm/static/file_shared.c @@ -0,0 +1,117 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" +#define OFFSET 1000 +#define OFFSET2 500 + +const char *test_doc = "Check shared struct file-s"; +const char *test_author = "Andrey Vagin "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +int main(int argc, char **argv) +{ + pid_t pid; + int fd, fd2, fd3, ret, status; + off_t off; + + test_init(argc, argv); + + fd = open(filename, O_RDWR | O_CREAT, 0644); + if (fd == -1) + return 1; + + fd2 = dup(fd); + if (fd2 < 0) /* check the dup() result, not the already-validated fd */ + return 1; + + fd3 = 
open(filename, O_RDWR | O_CREAT, 0644); + if (fd3 == -1) + return 1; + + pid = test_fork(); + + if (pid == -1) + return 1; + else if (pid) { + fcntl(fd2, F_SETFD, 1); + + test_daemon(); + test_waitsig(); + off = lseek(fd, OFFSET, SEEK_SET); + if (off == (off_t) -1) + return 1; + + off = lseek(fd3, OFFSET2, SEEK_SET); + if (off == (off_t) -1) + return 1; + + ret = kill(pid, SIGTERM); + if (ret == -1) { + pr_perror("kill() failed"); + } + + ret = wait(&status); + if (ret == -1) { + pr_perror("wait() failed"); + return 1; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status)) { + fail("Child exited with non-zero status"); + return 1; + } + off = lseek(fd2, 0, SEEK_CUR); + if (off != OFFSET) { + fail("offset1 fail\n"); + return 1; + } + off = lseek(fd3, 0, SEEK_CUR); + if (off != OFFSET2) { + fail("offset2 fail\n"); + return 1; + } + + ret = fcntl(fd, F_GETFD, 0); + if (ret != 0) { + fail("fd cloexec broken\n"); + return 1; + } + + ret = fcntl(fd2, F_GETFD, 0); + if (ret != 1) { + fail("fd2 cloexec broken\n"); + return 1; + } + + } else { + test_waitsig(); + off = lseek(fd, 0, SEEK_CUR); + if (off != OFFSET) { + fail("offset3 fail\n"); + return 1; + } + off = lseek(fd2, 0, SEEK_CUR); + if (off != OFFSET) { + fail("offset4 fail\n"); + return 1; + } + off = lseek(fd3, 0, SEEK_CUR); + if (off != OFFSET2) { + fail("offset5 fail\n"); + return 1; + } + return 0; + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/fpu00.c b/CRIU_code/test/zdtm/static/fpu00.c new file mode 100644 index 0000000..04aa738 --- /dev/null +++ b/CRIU_code/test/zdtm/static/fpu00.c @@ -0,0 +1,87 @@ +#include + +#include "zdtmtst.h" + +const char *test_doc = "Start a calculation, leaving FPU in a certain state,\n" +"before migration, continue after"; +const char *test_author = "Pavel Emelianov "; + +#if defined(__i386__) || defined(__x86_64__) + +#include "cpuid.h" + +void start(float a, float b, float c, float d) +{ + __asm__ volatile ( + "flds %0\n" + "fadds %1\n" + "flds %2\n" + 
"fadds %3\n" + "fmulp %%st(1)\n" + : + : "m" (a), "m" (b), "m" (c), "m" (d) + ); +} + +float finish(void) +{ + float res; + + __asm__ volatile ( + "fstps %0\n" + : "=m" (res) + ); + return res; +} + +#define CPUID_FEAT_EDX_FPU (1 << 0) + +int chk_proc_fpu(void) +{ + uint32_t eax, ebx, ecx, edx; + + cpuid(1, &eax, &ebx, &ecx, &edx); + + return edx & CPUID_FEAT_EDX_FPU; +} +#endif + +int main(int argc, char ** argv) +{ +#if defined(__i386__) || defined(__x86_64__) + float a, b, c, d; + float res1, res2; +#endif + + test_init(argc, argv); +#if defined(__i386__) || defined(__x86_64__) + if (!chk_proc_fpu()) { + skip("FPU not supported"); + return 1; + } + + a = drand48(); + b = drand48(); + c = drand48(); + d = drand48(); + + + start(a, b, c, d); + res1 = finish(); + + start(a, b, c, d); + + test_daemon(); + test_waitsig(); + + res2 = finish(); + + if (res1 != res2) + fail("%f != %f\n", res1, res2); + else + pass(); +#else + skip("Unsupported arch"); +#endif + return 0; +} diff --git a/CRIU_code/test/zdtm/static/fpu00.desc b/CRIU_code/test/zdtm/static/fpu00.desc new file mode 100644 index 0000000..d2f501d --- /dev/null +++ b/CRIU_code/test/zdtm/static/fpu00.desc @@ -0,0 +1 @@ +{'arch': 'x86_64'} diff --git a/CRIU_code/test/zdtm/static/fpu01.c b/CRIU_code/test/zdtm/static/fpu01.c new file mode 100644 index 0000000..11c4805 --- /dev/null +++ b/CRIU_code/test/zdtm/static/fpu01.c @@ -0,0 +1,119 @@ +#include +#include +#include + +#include + +#include "zdtmtst.h" + +#if defined(__i386__) || defined(__x86_64__) + +#include "cpuid.h" + +const char *test_doc = "Test if FPU data in YMM registers do survive the c/r"; +const char *test_author = "Cyrill Gorcunov "; + +static int verify_cpu(void) +{ + unsigned int eax, ebx, ecx, edx; + + /* Do we have xsave? */ + cpuid(1, &eax, &ebx, &ecx, &edx); + if (!(ecx & (1u << 27))) + return -1; + + /* Is YMM here? 
*/ + cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); + if ((eax & (0x4)) != 0x4) + return -1; + + return 0; +} + +#define __aligned __attribute__((aligned(64))) + +/* + * These are random strings generated by pwgen. + */ +static __aligned unsigned char ymm1[32 + 1] = "code9Ee5sohphie1ae1kaeMahngoh5oe"; +static __aligned unsigned char ymm2[32 + 1] = "Tacuthahhien9Fi7aGhaa5toGh6vi7Ch"; + +static __aligned unsigned char ymm3[32 + 1]; +static __aligned unsigned char ymm4[32 + 1]; + +static int fpu_test(void) +{ + int ret = 0; + + asm volatile("vmovapd %0, %%ymm0 \n" + : + : "m" (*ymm1) + : "memory"); + + asm volatile("vmovapd %0, %%ymm7 \n" + : + : "m" (*ymm2) + : "memory"); + + test_daemon(); + test_waitsig(); + + asm volatile("vmovapd %%ymm0, %0 \n" + : "=m" (*ymm3) + : + : "memory"); + + asm volatile("vmovapd %%ymm7, %0 \n" + : "=m" (*ymm4) + : + : "memory"); + + if (memcmp(ymm1, ymm3, 32) || memcmp(ymm2, ymm4, 32)) { + test_msg("Data mismatch ('%s' '%s' '%s' '%s')\n", + ymm1, ymm2, ymm3, ymm4); + ret = -1; + } else { + test_msg("Data match ('%s' '%s' '%s' '%s')\n", + ymm1, ymm2, ymm3, ymm4); + ret = 0; + } + + return ret; +} + +static int bare_run(void) +{ + test_msg("Your cpu doesn't support ymm registers, skipping\n"); + + test_daemon(); + test_waitsig(); + + return 0; +} + +int main(int argc, char *argv[]) +{ + int ret = 0; + + test_init(argc, argv); + + ret = verify_cpu() ? 
bare_run() : fpu_test(); + + if (!ret) + pass(); + else + fail(); + + return 0; +} + +#else + +int main(int argc, char *argv[]) +{ + test_init(argc, argv); + skip("Unsupported arch"); + return 0; +} + +#endif diff --git a/CRIU_code/test/zdtm/static/fpu01.desc b/CRIU_code/test/zdtm/static/fpu01.desc new file mode 100644 index 0000000..d2f501d --- /dev/null +++ b/CRIU_code/test/zdtm/static/fpu01.desc @@ -0,0 +1 @@ +{'arch': 'x86_64'} diff --git a/CRIU_code/test/zdtm/static/fpu02.c b/CRIU_code/test/zdtm/static/fpu02.c new file mode 100644 index 0000000..660fc7d --- /dev/null +++ b/CRIU_code/test/zdtm/static/fpu02.c @@ -0,0 +1,88 @@ +#include +#include +#include + +#include + +#include "zdtmtst.h" + +#if defined(__i386__) || defined(__x86_64__) + +#include "cpuid.h" + +const char *test_doc = "Test preserve of mxcsr in FPU"; +const char *test_author = "Dmitry Safonov <0x7f454c46@gmail.com>"; + +static int verify_cpu(void) +{ + unsigned int eax, ebx, ecx, edx; + + /* Do we have xsave? */ + cpuid(1, &eax, &ebx, &ecx, &edx); + if (!(ecx & (1u << 27))) + return -1; + + /* Is YMM here? */ + cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); + if ((eax & (0x4)) != 0x4) + return -1; + + return 0; +} + +#define __aligned __attribute__((aligned(64))) + +static int fpu_test(void) +{ + uint32_t before, after; + + asm volatile("stmxcsr %0\n" + : "+m"(before)); + + test_daemon(); + test_waitsig(); + + asm volatile("stmxcsr %0\n" + : "+m"(after)); + + test_msg("before: %x, after: %x\n", before, after); + + return (before != after); +} + +static int bare_run(void) +{ + test_msg("Your cpu doesn't support ymm registers, skipping\n"); + + test_daemon(); + test_waitsig(); + + return 0; +} + +int main(int argc, char *argv[]) +{ + int ret = 0; + + test_init(argc, argv); + + ret = verify_cpu() ? 
bare_run() : fpu_test(); + + if (!ret) + pass(); + else + fail(); + + return 0; +} + +#else + +int main(int argc, char *argv[]) +{ + test_init(argc, argv); + skip("Unsupported arch"); + return 0; +} + +#endif diff --git a/CRIU_code/test/zdtm/static/fpu02.desc b/CRIU_code/test/zdtm/static/fpu02.desc new file mode 100644 index 0000000..d2f501d --- /dev/null +++ b/CRIU_code/test/zdtm/static/fpu02.desc @@ -0,0 +1 @@ +{'arch': 'x86_64'} diff --git a/CRIU_code/test/zdtm/static/futex-rl.c b/CRIU_code/test/zdtm/static/futex-rl.c new file mode 100644 index 0000000..678e175 --- /dev/null +++ b/CRIU_code/test/zdtm/static/futex-rl.c @@ -0,0 +1,126 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check the futex robust list c/r"; +const char *test_author = "Cyrill Gorcunov "; + +struct args { + task_waiter_t waiter; + int result; +}; + +static pid_t __gettid(void) +{ + return syscall(__NR_gettid); +} + +void *thread_fn(void *arg) +{ + struct robust_list_head *head_orig = NULL, *head_new = NULL; + size_t len_orig = 0, len_new = 0; + struct args *args = arg; + + test_msg("Obtaining old RL\n"); + if (syscall(__NR_get_robust_list, __gettid(), &head_orig, &len_orig)) { + args->result = -1; + fail("__NR_get_robust_list failed"); + } + + test_msg("Complete\n"); + task_waiter_complete(&args->waiter, 1); + if (args->result == -1) + goto out; + + task_waiter_wait4(&args->waiter, 2); + + test_msg("Obtaining new RL\n"); + if (syscall(__NR_get_robust_list, __gettid(), &head_new, &len_new)) { + args->result = -1; + fail("__NR_get_robust_list failed"); + } + if (args->result == -1) + goto out; + + if (head_orig != head_new || len_orig != len_new) { + args->result = -1; + fail("comparison failed"); + } else { + args->result = 0; /* success only when the robust list head and length are unchanged */ + } +out: + return NULL; +} + +int main(int argc, char **argv) +{ + struct robust_list_head *head_orig = NULL, *head_new = NULL; + size_t len_orig = 0, len_new = 0; + pthread_t thread; + 
struct args *args; + + test_init(argc, argv); + + args = (struct args *)mmap(NULL, sizeof(*args), PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); + if ((void *)args == MAP_FAILED) { + fail("mmap failed\n"); + exit(1); + } + + test_msg("Obtaining old RL for thread-leader\n"); + if (syscall(__NR_get_robust_list, __gettid(), &head_orig, &len_orig)) { + fail("__NR_get_robust_list failed"); + exit(1); + } + + task_waiter_init(&args->waiter); + args->result = 0; + + test_msg("Creating thread\n"); + if (pthread_create(&thread, NULL, thread_fn, (void *)args)) { + fail("Can't create thread\n"); + exit(1); + } + + test_msg("Wait for thread work\n"); + task_waiter_wait4(&args->waiter, 1); + if (args->result == -1) { + fail("thread failed\n"); + exit(1); + } + + test_msg("C/R cycle\n"); + test_daemon(); + test_waitsig(); + + task_waiter_complete(&args->waiter, 2); + + test_msg("Obtaining new RL for thread-leader\n"); + if (syscall(__NR_get_robust_list, __gettid(), &head_new, &len_new)) { + fail("__NR_get_robust_list failed"); + exit(1); + } + + if (head_orig != head_new || len_orig != len_new) { + fail("comparison failed"); + exit(1); + } + + pthread_join(thread, NULL); + if (args->result) + fail(); + else + pass(); + + munmap((void *)args, sizeof(*args)); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/futex.c b/CRIU_code/test/zdtm/static/futex.c new file mode 100644 index 0000000..2ad82d2 --- /dev/null +++ b/CRIU_code/test/zdtm/static/futex.c @@ -0,0 +1,88 @@ +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check (via pthread/NPTL) that futeces behave through migration"; +const char *test_author = "Pavel Emelianov "; + +volatile int kid_passed; + +void *thread_fn(void *lock) +{ + pthread_mutex_t *mutex; + + mutex = (pthread_mutex_t *)lock; + pthread_mutex_lock(mutex); + kid_passed++; + pthread_mutex_unlock(mutex); + return NULL; +} + +#define DEF_NUM_THREADS 10 +#define MAX_NUM_THREADS 50 +int 
num_threads = DEF_NUM_THREADS; +TEST_OPTION(num_threads, int, "number of threads " + "(default " __stringify(DEF_NUM_THREADS) + " maximum " __stringify(MAX_NUM_THREADS) ")", 0); + +int main(int argc, char **argv) +{ + int i; + pthread_t thr[num_threads]; + pthread_mutex_t m; + + test_init(argc, argv); + + if (num_threads > MAX_NUM_THREADS) { + pr_perror("%d threads it too much. max is %d", + num_threads, MAX_NUM_THREADS); + goto out; + } + + pthread_mutex_init(&m, NULL); + pthread_mutex_lock(&m); + + for (i = 0; i < num_threads; i++) + if (pthread_create(&thr[i], NULL, thread_fn, &m)) { + pr_perror("Can't create %d'th thread", i + 1); + goto out_kill; + } + + kid_passed = 0; + + test_daemon(); + test_waitsig(); + + sleep(1); + if (kid_passed != 0) + fail("some kids broke through\n"); + + pthread_mutex_unlock(&m); + for (i = 0; i < num_threads; i++) + pthread_join(thr[i], NULL); + + if (pthread_mutex_trylock(&m)) { + if (errno == EBUSY) + fail("kids left my mutex locked\n"); + else + pr_perror("kids spoiled my mutex"); + } + + if (kid_passed != num_threads) + fail("some kids died during migration\n"); + + pass(); +out: + return 0; + +out_kill: + for (i--; i >= 0; i--) { + pthread_kill(thr[i], SIGKILL); + pthread_join(thr[i], NULL); + } + goto out; +} diff --git a/CRIU_code/test/zdtm/static/get_smaps_bits.c b/CRIU_code/test/zdtm/static/get_smaps_bits.c new file mode 100644 index 0000000..9253f4d --- /dev/null +++ b/CRIU_code/test/zdtm/static/get_smaps_bits.c @@ -0,0 +1,127 @@ +#include +#include +#include "zdtmtst.h" + +#ifndef MAP_HUGETLB +# define MAP_HUGETLB 0x40000 +#endif + +#ifndef MADV_HUGEPAGE +# define MADV_HUGEPAGE 14 +#endif + +#ifndef MADV_NOHUGEPAGE +# define MADV_NOHUGEPAGE 15 +#endif + +#ifndef MADV_DONTDUMP +# define MADV_DONTDUMP 16 +#endif + +static void parse_vmflags(char *buf, unsigned long *flags, unsigned long *madv) +{ + char *tok; + + if (!buf[0]) + return; + + tok = strtok(buf, " \n"); + if (!tok) + return; + +#define _vmflag_match(_t, _s) 
(_t[0] == _s[0] && _t[1] == _s[1]) + + do { + /* mmap() block */ + if (_vmflag_match(tok, "gd")) + *flags |= MAP_GROWSDOWN; + else if (_vmflag_match(tok, "lo")) + *flags |= MAP_LOCKED; + else if (_vmflag_match(tok, "nr")) + *flags |= MAP_NORESERVE; + else if (_vmflag_match(tok, "ht")) + *flags |= MAP_HUGETLB; + + /* madvise() block */ + if (_vmflag_match(tok, "sr")) + *madv |= (1ul << MADV_SEQUENTIAL); + else if (_vmflag_match(tok, "rr")) + *madv |= (1ul << MADV_RANDOM); + else if (_vmflag_match(tok, "dc")) + *madv |= (1ul << MADV_DONTFORK); + else if (_vmflag_match(tok, "dd")) + *madv |= (1ul << MADV_DONTDUMP); + else if (_vmflag_match(tok, "mg")) + *madv |= (1ul << MADV_MERGEABLE); + else if (_vmflag_match(tok, "hg")) + *madv |= (1ul << MADV_HUGEPAGE); + else if (_vmflag_match(tok, "nh")) + *madv |= (1ul << MADV_NOHUGEPAGE); + + /* + * Anything else is just ignored. + */ + } while ((tok = strtok(NULL, " \n"))); + +#undef _vmflag_match +} + +#define is_hex_digit(c) \ + (((c) >= '0' && (c) <= '9') || \ + ((c) >= 'a' && (c) <= 'f') || \ + ((c) >= 'A' && (c) <= 'F')) + +static int is_vma_range_fmt(char *line, unsigned long *start, unsigned long *end) +{ + char *p = line; + while (*line && is_hex_digit(*line)) + line++; + + if (*line++ != '-') + return 0; + + while (*line && is_hex_digit(*line)) + line++; + + if (*line++ != ' ') + return 0; + + sscanf(p, "%lx-%lx", start, end); + return 1; +} + +int get_smaps_bits(unsigned long where, unsigned long *flags, unsigned long *madv) +{ + unsigned long start = 0, end = 0; + FILE *smaps = NULL; + char buf[1024]; + int found = 0; + + if (!where) + return 0; + + smaps = fopen("/proc/self/smaps", "r"); + if (!smaps) { + pr_perror("Can't open smaps"); + return -1; + } + + while (fgets(buf, sizeof(buf), smaps)) { + is_vma_range_fmt(buf, &start, &end); + + if (!strncmp(buf, "VmFlags: ", 9) && start == where) { + found = 1; + parse_vmflags(buf, flags, madv); + break; + } + } + + fclose(smaps); + + if (!found) { + pr_perror("VmFlags 
not found for %lx", where); + return -1; + } + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/get_smaps_bits.h b/CRIU_code/test/zdtm/static/get_smaps_bits.h new file mode 100644 index 0000000..ce1070d --- /dev/null +++ b/CRIU_code/test/zdtm/static/get_smaps_bits.h @@ -0,0 +1,6 @@ +#ifndef ZDTM_GET_SMAPS_BITS_H_ +#define ZDTM_GET_SMAPS_BITS_H_ + +extern int get_smaps_bits(unsigned long where, unsigned long *flags, unsigned long *madv); + +#endif /* ZDTM_GET_SMAPS_BITS_H_ */ diff --git a/CRIU_code/test/zdtm/static/ghost_holes00.c b/CRIU_code/test/zdtm/static/ghost_holes00.c new file mode 100644 index 0000000..214a391 --- /dev/null +++ b/CRIU_code/test/zdtm/static/ghost_holes00.c @@ -0,0 +1,168 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test ghost with one hole in the middle"; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +/* Buffer that is suitable for hole size */ +#define BUFSIZE 4096 +static unsigned char buf4k[BUFSIZE]; + +#ifndef SEEK_DATA +#define SEEK_DATA 3 +#define SEEK_HOLE 4 +#endif + +#ifdef HEAD_HOLE +#define HH 1 +#else +#define HH 0 +#endif + +#ifdef TAIL_HOLE +#define TH 1 +#else +#define TH 0 +#endif + +#define DATA1_BLK (HH) +#define DATA1_OFF (DATA1_BLK * BUFSIZE) +#define DATA2_BLK (HH + 2) +#define DATA2_OFF (DATA2_BLK * BUFSIZE) +#define FILE_BLOCKS (TH + HH + 1 /* mid hole */ + 2 /* data */) +#define FILE_SIZE (FILE_BLOCKS * BUFSIZE) + +int main(int argc, char ** argv) +{ + int fd; + struct stat st; + uint32_t crc; + bool chk_hole = true; + + test_init(argc, argv); + + fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644); + if (fd < 0) { + pr_perror("can't open %s", filename); + exit(1); + } + + if (unlink(filename) < 0) { + pr_perror("can't unlink %s", filename); + goto failed; + } + + crc = ~0; + datagen(buf4k, BUFSIZE, &crc); + if (pwrite(fd, buf4k, BUFSIZE, DATA1_OFF) != BUFSIZE) { + pr_perror("can't write 
data1"); + goto failed; + } + + crc = ~0; + datagen(buf4k, BUFSIZE, &crc); + if (pwrite(fd, buf4k, BUFSIZE, DATA2_OFF) != BUFSIZE) { + pr_perror("can't write data2"); + goto failed; + } + + if (ftruncate(fd, FILE_SIZE)) { + pr_perror("Can't fixup file size"); + goto failed; + } + + if (lseek(fd, DATA1_OFF, SEEK_HOLE) != DATA1_OFF + BUFSIZE) { + test_msg("Won't check for hole\n"); + chk_hole = false; + } + + test_daemon(); + test_waitsig(); + + if (fstat(fd, &st) < 0) { + fail("can't stat after"); + goto failed; + } + + if (st.st_size != FILE_SIZE) { + fail("file size changed to %ld", (long)st.st_size); + goto failed; + } + + test_msg("Blocks %u OK\n", FILE_BLOCKS); + + /* Data 1 */ + if (pread(fd, buf4k, BUFSIZE, DATA1_OFF) != BUFSIZE) { + fail("pread1 fail"); + goto failed; + } + + crc = ~0; + if (datachk(buf4k, BUFSIZE, &crc)) { + fail("datachk1 fail"); + goto failed; + } + + test_msg("Data @%u OK\n", DATA1_BLK); + + /* Data 2 */ + if (pread(fd, buf4k, BUFSIZE, DATA2_OFF) != BUFSIZE) { + fail("pread2 fail"); + goto failed; + } + + crc = ~0; + if (datachk(buf4k, BUFSIZE, &crc)) { + fail("datachk2 fail"); + goto failed; + } + + test_msg("Data @%u OK\n", DATA2_BLK); + + /* Hole */ + if (chk_hole) { +#ifdef HEAD_HOLE + if (lseek(fd, 0, SEEK_HOLE) != 0) { + fail("hh not found"); + goto failed; + } + + test_msg("Head hole OK\n"); +#endif + + if (lseek(fd, DATA1_OFF, SEEK_HOLE) != DATA1_OFF + BUFSIZE) { + fail("mh not found"); + goto failed; + } + + test_msg("Mid hole OK\n"); + +#ifdef TAIL_HOLE + if (lseek(fd, DATA2_OFF, SEEK_HOLE) != DATA2_OFF + BUFSIZE) { + fail("tail hole not found"); + goto failed; + } + + test_msg("Tail hole OK\n"); +#endif + } + + close(fd); + pass(); + return 0; + +failed: + close(fd); + return 1; +} diff --git a/CRIU_code/test/zdtm/static/ghost_holes01.c b/CRIU_code/test/zdtm/static/ghost_holes01.c new file mode 100644 index 0000000..82bf9de --- /dev/null +++ b/CRIU_code/test/zdtm/static/ghost_holes01.c @@ -0,0 +1 @@ +ghost_holes00.c \ No 
newline at end of file diff --git a/CRIU_code/test/zdtm/static/ghost_holes02.c b/CRIU_code/test/zdtm/static/ghost_holes02.c new file mode 100644 index 0000000..82bf9de --- /dev/null +++ b/CRIU_code/test/zdtm/static/ghost_holes02.c @@ -0,0 +1 @@ +ghost_holes00.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/ghost_on_rofs.c b/CRIU_code/test/zdtm/static/ghost_on_rofs.c new file mode 100644 index 0000000..1a79223 --- /dev/null +++ b/CRIU_code/test/zdtm/static/ghost_on_rofs.c @@ -0,0 +1,179 @@ +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check ghost file on readonly fs mount restores fine"; +const char *test_author = "Pavel Tikhomirov "; + +#define GHOST_DATA "Ghost Data" + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + char ro_mount[PATH_MAX], ro_bind_mount[PATH_MAX]; + char ghost_file[PATH_MAX], ghost_file_bind[PATH_MAX]; + char buf[sizeof(GHOST_DATA)]; + int fd, fd_bind; + + test_init(argc, argv); + + if (mkdir(dirname, 0700)) { + pr_perror("mkdir"); + return 1; + } + + if (mount("zdtm_fs", dirname, "tmpfs", 0, NULL)) { + pr_perror("mount"); + return 1; + } + + if (mount(NULL, dirname, NULL, MS_PRIVATE, NULL)) { + pr_perror("mount"); + return 1; + } + + ssprintf(ro_mount, "%s/ro_mount", dirname); + if (mkdir(ro_mount, 0700)) { + pr_perror("mkdir"); + return 1; + } + + if (mount("ro_mount", ro_mount, "tmpfs", 0, NULL)) { + pr_perror("mount"); + return 1; + } + + /* + * Need shared mount to check the hunk in do_bind_mount, we + * would have ro_bind_mount binded from ro_mount or vice versa. 
+ */ + if (mount(NULL, ro_mount, NULL, MS_SHARED, NULL)) { + pr_perror("mount"); + return 1; + } + + ssprintf(ro_bind_mount, "%s/ro_bind_mount", dirname); + if (mkdir(ro_bind_mount, 0700)) { + pr_perror("mkdir"); + return 1; + } + + if (mount(ro_mount, ro_bind_mount, NULL, MS_BIND, NULL)) { + pr_perror("mount"); + return 1; + } + + ssprintf(ghost_file, "%s/ghost_file", ro_mount); + fd = open(ghost_file, O_CREAT|O_WRONLY, 0600); + if (fd < 0) { + pr_perror("open"); + return 1; + } + + if (write(fd, GHOST_DATA, sizeof(GHOST_DATA)) != sizeof(GHOST_DATA)) { + pr_perror("write"); + return 1; + } + + ssprintf(ghost_file_bind, "%s/ghost_file_bind", ro_bind_mount); + fd_bind = open(ghost_file_bind, O_CREAT|O_WRONLY, 0600); + if (fd_bind < 0) { + pr_perror("open"); + return 1; + } + + if (write(fd_bind, GHOST_DATA, sizeof(GHOST_DATA)) != sizeof(GHOST_DATA)) { + pr_perror("write"); + return 1; + } + + close(fd); + close(fd_bind); + + fd = open(ghost_file, O_RDONLY); + if (fd < 0) { + pr_perror("open"); + return 1; + } + + fd_bind = open(ghost_file_bind, O_RDONLY); + if (fd_bind < 0) { + pr_perror("open"); + return 1; + } + + if (unlink(ghost_file)) { + pr_perror("unlink"); + return 1; + } + + if (unlink(ghost_file_bind)) { + pr_perror("unlink"); + return 1; + } + + if (mount(NULL, ro_mount, NULL, MS_RDONLY|MS_REMOUNT|MS_BIND, NULL)) { + pr_perror("mount"); + return 1; + } + + /* + * Need MS_NOSUID flag to check the hunk in do_bind_mount, case of + * different flags for mount and it's ->bind + */ + if (mount(NULL, ro_bind_mount, NULL, MS_NOSUID|MS_RDONLY|MS_REMOUNT|MS_BIND, NULL)) { + pr_perror("mount"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (read(fd, buf, sizeof(GHOST_DATA)) != sizeof(GHOST_DATA)) { + fail("Can't read from ghost file"); + return 1; + } + + if (strcmp(buf, GHOST_DATA)) { + fail("Wrong data in a ghost file"); + return 1; + } + + if (read(fd_bind, buf, sizeof(GHOST_DATA)) != sizeof(GHOST_DATA)) { + fail("Can't read from ghost file on 
bind"); + return 1; + } + + if (strcmp(buf, GHOST_DATA)) { + fail("Wrong data in a ghost file on bind"); + return 1; + } + + close(fd); + close(fd_bind); + + if (umount(ro_bind_mount)) { + pr_perror("umount"); + return 1; + } + + if (umount(ro_mount)) { + pr_perror("umount"); + return 1; + } + + if (umount(dirname)) { + pr_perror("umount"); + return 1; + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/ghost_on_rofs.desc b/CRIU_code/test/zdtm/static/ghost_on_rofs.desc new file mode 100644 index 0000000..7657ba4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/ghost_on_rofs.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/groups.c b/CRIU_code/test/zdtm/static/groups.c new file mode 100644 index 0000000..01ec3a1 --- /dev/null +++ b/CRIU_code/test/zdtm/static/groups.c @@ -0,0 +1,64 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that supplementary groups are supported"; +const char *test_author = "Pavel Emelianov "; + +int main(int argc, char **argv) +{ + int ng; + unsigned int *grp, *grp2, i, max; + + test_init(argc, argv); + + ng = getgroups(0, NULL); + if (ng < 0) { + pr_perror("Can't get groups"); + return -1; + } + + grp = malloc((ng + 1) * sizeof(*grp)); + ng = getgroups(ng, grp); + if (ng < 0) { + pr_perror("Can't get groups2"); + return -1; + } + + max = 0; + for (i = 0; i < ng; i++) + if (max < grp[i]) + max = grp[i]; + + grp[ng++] = max + 1; + + if (setgroups(ng, grp) < 0) { + pr_perror("Can't set groups"); + return -1; + } + + test_daemon(); + test_waitsig(); + + grp2 = malloc(ng * sizeof(*grp2)); + + if (getgroups(ng, grp2) != ng) { + fail("Nr groups changed"); + return -1; + } + + if (memcmp(grp, grp2, ng * sizeof(*grp))) { + fail("Groups have changed"); + return -1; + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/groups.desc b/CRIU_code/test/zdtm/static/groups.desc new file mode 
100644 index 0000000..2eac7e6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/groups.desc @@ -0,0 +1 @@ +{'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/grow_map.c b/CRIU_code/test/zdtm/static/grow_map.c new file mode 100644 index 0000000..dd1dffd --- /dev/null +++ b/CRIU_code/test/zdtm/static/grow_map.c @@ -0,0 +1,69 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that VMA-s with MAP_GROWSDOWN are restored correctly"; +const char *test_author = "Andrew Vagin "; + +int main(int argc, char **argv) +{ + char *start_addr, *fake_grow_down, *test_addr, *grow_down; + volatile char *p; + test_init(argc, argv); + + start_addr = mmap(NULL, PAGE_SIZE * 10, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (start_addr == MAP_FAILED) { + pr_perror("Can't mal a new region"); + return 1; + } + munmap(start_addr, PAGE_SIZE * 10); + + fake_grow_down = mmap(start_addr + PAGE_SIZE * 5, PAGE_SIZE, + PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED | MAP_GROWSDOWN, -1, 0); + if (fake_grow_down == MAP_FAILED) { + pr_perror("Can't mal a new region"); + return 1; + } + + p = fake_grow_down; + *p-- = 'c'; + *p = 'b'; + + /* overlap the guard page of fake_grow_down */ + test_addr = mmap(start_addr + PAGE_SIZE * 3, PAGE_SIZE, + PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + if (test_addr == MAP_FAILED) { + pr_perror("Can't mal a new region"); + return 1; + } + + grow_down = mmap(start_addr + PAGE_SIZE * 2, PAGE_SIZE, + PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED | MAP_GROWSDOWN, -1, 0); + if (grow_down == MAP_FAILED) { + pr_perror("Can't mal a new region"); + return 1; + } + + test_daemon(); + test_waitsig(); + + munmap(test_addr, PAGE_SIZE); + if (fake_grow_down[0] != 'c' || *(fake_grow_down - 1) != 'b') { + fail("%c %c\n", fake_grow_down[0], *(fake_grow_down - 1)); + return 1; + } + + p = grow_down; + *p-- = 'z'; + *p = 'x'; + + pass(); + + 
return 0; +} diff --git a/CRIU_code/test/zdtm/static/grow_map.desc b/CRIU_code/test/zdtm/static/grow_map.desc new file mode 100644 index 0000000..95c58b4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/grow_map.desc @@ -0,0 +1 @@ +{'flags': 'noauto'} diff --git a/CRIU_code/test/zdtm/static/grow_map02.c b/CRIU_code/test/zdtm/static/grow_map02.c new file mode 100644 index 0000000..0b93714 --- /dev/null +++ b/CRIU_code/test/zdtm/static/grow_map02.c @@ -0,0 +1,63 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that a few grow-down VMA-s are restored correctly"; +const char *test_author = "Andrew Vagin "; + +int main(int argc, char **argv) +{ + char *start_addr, *grow_down; + test_init(argc, argv); + + start_addr = mmap(NULL, PAGE_SIZE * 10, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (start_addr == MAP_FAILED) { + pr_perror("Can't mal a new region"); + return 1; + } + munmap(start_addr, PAGE_SIZE * 10); + + grow_down = mmap(start_addr + PAGE_SIZE * 3, PAGE_SIZE * 3, + PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED | MAP_GROWSDOWN, -1, 0); + if (grow_down == MAP_FAILED) { + pr_perror("Can't mal a new region"); + return 1; + } + + grow_down[0 * PAGE_SIZE] = 'x'; + grow_down[1 * PAGE_SIZE] = 'y'; + grow_down[2 * PAGE_SIZE] = 'z'; + + /* + * Split the grow-down vma on three parts. 
+ * Only the irst one will have a guard page + */ + if (mprotect(grow_down + PAGE_SIZE, PAGE_SIZE, PROT_READ)) { + pr_perror("Can't change set protection on a region of memory"); + return 1; + } + + test_daemon(); + test_waitsig(); + + test_msg("%c %c %c\n", grow_down[0 * PAGE_SIZE], + grow_down[1 * PAGE_SIZE], grow_down[2 * PAGE_SIZE]); + + if (grow_down[0 * PAGE_SIZE] != 'x') + return 1; + if (grow_down[1 * PAGE_SIZE] != 'y') + return 1; + if (grow_down[2 * PAGE_SIZE] != 'z') + return 1; + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/grow_map02.desc b/CRIU_code/test/zdtm/static/grow_map02.desc new file mode 100644 index 0000000..95c58b4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/grow_map02.desc @@ -0,0 +1 @@ +{'flags': 'noauto'} diff --git a/CRIU_code/test/zdtm/static/grow_map03.c b/CRIU_code/test/zdtm/static/grow_map03.c new file mode 100644 index 0000000..6310386 --- /dev/null +++ b/CRIU_code/test/zdtm/static/grow_map03.c @@ -0,0 +1,40 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that VMA-s with MAP_GROWSDOWN are restored correctly"; +const char *test_author = "Andrew Vagin "; + +/* +* This test case creates two consecutive grows down vmas with a hole +* between them. 
+*/ + +int main(int argc, char **argv) +{ + char *start_addr, *addr1, *addr2; + + test_init(argc, argv); + + start_addr = mmap(NULL, PAGE_SIZE * 10, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (start_addr == MAP_FAILED) { + pr_perror("Can't mal a new region"); + return 1; + } + munmap(start_addr, PAGE_SIZE * 10); + + addr1 = mmap(start_addr + PAGE_SIZE * 5, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_GROWSDOWN, -1, 0); + addr2 = mmap(start_addr + PAGE_SIZE * 3, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_GROWSDOWN, -1, 0); + + test_msg("%p %p\n", addr1, addr2); + + test_daemon(); + test_waitsig(); + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/grow_map03.desc b/CRIU_code/test/zdtm/static/grow_map03.desc new file mode 100644 index 0000000..95c58b4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/grow_map03.desc @@ -0,0 +1 @@ +{'flags': 'noauto'} diff --git a/CRIU_code/test/zdtm/static/helper_zombie_child.c b/CRIU_code/test/zdtm/static/helper_zombie_child.c new file mode 100644 index 0000000..657d56f --- /dev/null +++ b/CRIU_code/test/zdtm/static/helper_zombie_child.c @@ -0,0 +1,109 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that a zombie with a helper parent is restored"; +const char *test_author = "Tycho Andersen "; + +void setsid_and_fork(int sk) +{ + siginfo_t infop; + pid_t zombie; + + setsid(); + + zombie = fork(); + if (zombie < 0) { + fail("fork"); + exit(1); + } + + if (zombie == 0) + exit(0); + + if (waitid(P_PID, zombie, &infop, WNOWAIT | WEXITED) < 0) { + fail("waitid"); + exit(1); + } + + if (write(sk, &zombie, sizeof(zombie)) != sizeof(zombie)) { + fail("write"); + exit(1); + } + + close(sk); + + exit(0); +} + +int main(int argc, char **argv) +{ + pid_t pid, zombie; + int status, sk_pair[2]; + + if (setenv("ZDTM_NOREAP", "1", 1) < 0) { + fail("setenv"); + 
return 1; + } + + test_init(argc, argv); + + if (socketpair(PF_LOCAL, SOCK_STREAM, 0, sk_pair)) { + pr_perror("socketpair"); + return 1; + } + + pid = fork(); + if (pid < 0) { + fail("fork"); + return 1; + } + + if (pid == 0) { + close(sk_pair[0]); + setsid_and_fork(sk_pair[1]); + } + + close(sk_pair[1]); + + if (read(sk_pair[0], &zombie, sizeof(zombie)) != sizeof(zombie)) { + fail("read"); + kill(pid, SIGKILL); + return 1; + } + + if (waitpid(pid, &status, 0) < 0) { + fail("waitpid"); + return 1; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status)) { + fail("setsid_and_fork"); + return 1; + } + + if (kill(zombie, 0) < 0) { + fail("zombie already dead?"); + return 1; + } + + test_daemon(); + test_waitsig(); + + /* XXX: we don't restore zombies with the right uid right now; they're all root */ + if (kill(zombie, 0) < 0 && errno != EPERM) { + fail("zombie didn't survive restore"); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/helper_zombie_child.desc b/CRIU_code/test/zdtm/static/helper_zombie_child.desc new file mode 100644 index 0000000..6c4afe5 --- /dev/null +++ b/CRIU_code/test/zdtm/static/helper_zombie_child.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns'} diff --git a/CRIU_code/test/zdtm/static/inotify00.c b/CRIU_code/test/zdtm/static/inotify00.c new file mode 100644 index 0000000..67088ed --- /dev/null +++ b/CRIU_code/test/zdtm/static/inotify00.c @@ -0,0 +1,255 @@ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check for inotify delivery"; +const char *test_author = "Cyrill Gorcunov "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +#define TEST_FILE "inotify-removed" +#define TEST_LINK "inotify-hardlink" + +#define BUFF_SIZE ((sizeof(struct inotify_event) + PATH_MAX)) + +static void decode_event_mask(char *buf, size_t size, unsigned int mask) 
+{ + static const char *names[32] = { + [ 0] = "IN_ACCESS", + [ 1] = "IN_MODIFY", + [ 2] = "IN_ATTRIB", + [ 3] = "IN_CLOSE_WRITE", + [ 4] = "IN_CLOSE_NOWRITE", + [ 5] = "IN_OPEN", + [ 6] = "IN_MOVED_FROM", + [ 7] = "IN_MOVED_TO", + [ 8] = "IN_CREATE", + [ 9] = "IN_DELETE", + [10] = "IN_DELETE_SELF", + [11] = "IN_MOVE_SELF", + + [13] = "IN_UNMOUNT", + [14] = "IN_Q_OVERFLOW", + [15] = "IN_IGNORED", + + [24] = "IN_ONLYDIR", + [25] = "IN_DONT_FOLLOW", + [26] = "IN_EXCL_UNLINK", + + [29] = "IN_MASK_ADD", + [30] = "IN_ISDIR", + [31] = "IN_ONESHOT", + }; + + size_t i, j; + + memset(buf, 0, size); + for (i = 0, j = 0; i < 32 && j < size; i++) { + if (!(mask & (1u << i))) + continue; + if (j) + j += snprintf(&buf[j], size - j, " | %s", names[i]); + else + j += snprintf(&buf[j], size - j, "%s", names[i]); + } +} + +static int inotify_read_events(char *prefix, int inotify_fd, unsigned int *expected) +{ + struct inotify_event *event; + char buf[BUFF_SIZE * 8]; + int ret, off, n = 0; + + while (1) { + ret = read(inotify_fd, buf, sizeof(buf)); + if (ret < 0) { + if (errno != EAGAIN) { + pr_perror("Can't read inotify queue"); + return -1; + } else { + ret = 0; + goto out; + } + } else if (ret == 0) + break; + + for (off = 0; off < ret; n++, off += sizeof(*event) + event->len) { + char emask[128]; + + event = (void *)(buf + off); + decode_event_mask(emask, sizeof(emask), event->mask); + test_msg("\t%-16s: event %#10x -> %s\n", + prefix, event->mask, emask); + if (expected) + *expected &= ~event->mask; + } + } + +out: + test_msg("\t%-16s: read %2d events\n", prefix, n); + return ret; +} + +int main (int argc, char *argv[]) +{ + unsigned int mask = IN_DELETE | IN_CLOSE_WRITE | IN_DELETE_SELF | IN_CREATE; + char test_file_path[PATH_MAX]; + int fd, real_fd; + unsigned int emask; + + test_init(argc, argv); + + if (mkdir(dirname, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH)) { + pr_perror("Can't create directory %s", dirname); + exit(1); + } + +#ifdef INOTIFY01 +{ + pid_t pid; + task_waiter_t 
t; + task_waiter_init(&t); + static char buf[PATH_MAX]; + + if (mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL)) { + pr_perror("Unable to remount /"); + return 1; + } + + pid = fork(); + if (pid < 0) { + pr_perror("Can't fork a test process"); + exit(1); + } + if (pid == 0) { + int fd; + + prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0); + if (unshare(CLONE_NEWNS)) { + pr_perror("Unable to unshare mount namespace"); + exit(1); + } + + if (mount("zdtm", dirname, "tmpfs", 0, NULL)) { + pr_perror("Unable to mount tmpfs"); + exit(1); + } + fd = open(dirname, O_RDONLY); + if (fd < 0) { + pr_perror("Unable to open %s", dirname); + exit(1); + } + dup2(fd, 100); + task_waiter_complete_current(&t); + while (1) + sleep(1000); + exit(1); + } + task_waiter_wait4(&t, pid); + snprintf(buf, sizeof(buf), "/proc/%d/fd/100", pid); + dirname = buf; +} +#endif + + fd = inotify_init1(IN_NONBLOCK); + if (fd < 0) { + pr_perror("inotify_init failed"); + exit(1); + } + + snprintf(test_file_path, sizeof(test_file_path), "%s/%s", dirname, TEST_FILE); + + real_fd = open(test_file_path, O_CREAT | O_TRUNC | O_RDWR, 0644); + if (real_fd < 0) { + pr_perror("Can't create %s", test_file_path); + exit(1); + } + + if (inotify_add_watch(fd, dirname, mask) < 0) { + pr_perror("inotify_add_watch failed"); + exit(1); + } + + if (inotify_add_watch(fd, test_file_path, mask) < 0) { + pr_perror("inotify_add_watch failed"); + exit(1); + } + + /* + * At this moment we have a file inside testing + * directory and a hardlink to it. The file and + * hardlink are opened. 
+ */ + +#ifndef INOTIFY01 + if (unlink(test_file_path)) { + pr_perror("can't unlink %s", test_file_path); + exit(1); + } + + emask = IN_DELETE; + inotify_read_events("unlink 02", fd, &emask); + if (emask) { + char emask_bits[128]; + decode_event_mask(emask_bits, sizeof(emask_bits), emask); + pr_perror("Unhandled events in emask %#x -> %s", + emask, emask_bits); + exit(1); + } +#endif + + test_daemon(); + test_waitsig(); + + close(real_fd); + + emask = IN_CLOSE_WRITE; + inotify_read_events("after", fd, &emask); + if (emask) { + char emask_bits[128]; + decode_event_mask(emask_bits, sizeof(emask_bits), emask); + fail("Unhandled events in emask %#x -> %s", + emask, emask_bits); + return 1; + } + +#ifndef INOTIFY01 + real_fd = open(test_file_path, O_CREAT | O_TRUNC | O_RDWR, 0644); + if (real_fd < 0) { + pr_perror("Can't create %s", test_file_path); + exit(1); + } + close(real_fd); + + emask = IN_CREATE | IN_CLOSE_WRITE; + inotify_read_events("after2", fd, &emask); + if (emask) { + char emask_bits[128]; + decode_event_mask(emask_bits, sizeof(emask_bits), emask); + fail("Unhandled events in emask %#x -> %s", + emask, emask_bits); + return 1; + } +#endif + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/inotify00.desc b/CRIU_code/test/zdtm/static/inotify00.desc new file mode 100644 index 0000000..083b583 --- /dev/null +++ b/CRIU_code/test/zdtm/static/inotify00.desc @@ -0,0 +1 @@ +{'opts': '--link-remap', 'flags': 'nouser'} diff --git a/CRIU_code/test/zdtm/static/inotify01.c b/CRIU_code/test/zdtm/static/inotify01.c new file mode 100644 index 0000000..a7937cf --- /dev/null +++ b/CRIU_code/test/zdtm/static/inotify01.c @@ -0,0 +1 @@ +inotify00.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/inotify01.desc b/CRIU_code/test/zdtm/static/inotify01.desc new file mode 100644 index 0000000..a8849e0 --- /dev/null +++ b/CRIU_code/test/zdtm/static/inotify01.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid', 'feature': 'mnt_id'} diff --git 
a/CRIU_code/test/zdtm/static/inotify02.c b/CRIU_code/test/zdtm/static/inotify02.c new file mode 100644 index 0000000..b1b75d2 --- /dev/null +++ b/CRIU_code/test/zdtm/static/inotify02.c @@ -0,0 +1,98 @@ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check for inotify file-handles storm"; +const char *test_author = "Cyrill Gorcunov "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +static int num_of_handles(int fd) +{ + char path[64]; + char buf[512]; + int ret = 0; + FILE *f; + + snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd); + f = fopen(path, "r"); + if (!f) { + pr_err("Can't open %s", path); + return -1; + } + + while (fgets(buf, sizeof(buf), f)) { + if (memcmp(buf, "inotify ", 8)) + continue; + ret++; + } + + fclose(f); + return ret; +} + +int main (int argc, char *argv[]) +{ + const unsigned int mask = IN_DELETE | IN_CLOSE_WRITE | IN_DELETE_SELF | IN_CREATE; + const int nr_dirs = 64; + char temp[nr_dirs][16]; + char path[PATH_MAX]; + int fd, i; + + test_init(argc, argv); + + if (mkdir(dirname, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH)) { + pr_err("Can't create directory %s", dirname); + exit(1); + } + + fd = inotify_init1(IN_NONBLOCK); + if (fd < 0) { + pr_err("inotify_init failed"); + exit(1); + } + + for (i = 0; i < nr_dirs; i++) { + snprintf(temp[i], sizeof(temp[0]), "d.%03d", i); + snprintf(path, sizeof(path), "%s/%s", dirname, temp[i]); + if (mkdir(path, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH)) { + pr_err("Can't create %s", path); + exit(1); + } + + if (inotify_add_watch(fd, path, mask) < 0) { + pr_err("inotify_add_watch failed on %s", path); + exit(1); + } + } + + test_daemon(); + test_waitsig(); + + i = num_of_handles(fd); + close(fd); + + if (i < nr_dirs) + fail("Expected %d handles but got %d", nr_dirs, i); + else + pass(); + + return 0; +} diff --git 
a/CRIU_code/test/zdtm/static/inotify02.desc b/CRIU_code/test/zdtm/static/inotify02.desc new file mode 100644 index 0000000..95c58b4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/inotify02.desc @@ -0,0 +1 @@ +{'flags': 'noauto'} diff --git a/CRIU_code/test/zdtm/static/inotify_irmap.c b/CRIU_code/test/zdtm/static/inotify_irmap.c new file mode 100644 index 0000000..cf35886 --- /dev/null +++ b/CRIU_code/test/zdtm/static/inotify_irmap.c @@ -0,0 +1,76 @@ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check for irmap"; +const char *test_author = "Pavel Emelyanov "; + +#define TDIR "/etc" +char test_files[2][128] = {TDIR"/zdtm-test", TDIR"/zdtm-test1"}; + +#define BUFF_SIZE ((sizeof(struct inotify_event) + PATH_MAX)) + +int main (int argc, char *argv[]) +{ + char buf[BUFF_SIZE]; + int fd, wd, i; + + test_init(argc, argv); + + for (i = 0; i < 2; i++) { + unlink(test_files[i]); + if (creat(test_files[i], 0600) < 0) { + pr_perror("Can't make test file"); + exit(1); + } + } + + fd = inotify_init1(IN_NONBLOCK); + if (fd < 0) { + pr_perror("inotify_init failed"); + goto err; + } + + for (i = 0; i < 2; i++) { + wd = inotify_add_watch(fd, test_files[i], IN_OPEN); + if (wd < 0) { + pr_perror("inotify_add_watch failed"); + goto err; + } + } + + test_daemon(); + test_waitsig(); + + for (i = 0; i < 2; i++) { + memset(buf, 0, sizeof(buf)); + wd = open(test_files[i], O_RDONLY); + if (read(fd, buf, sizeof(buf)) <= 0) { + fail("No events in queue"); + goto err; + } + } + + close(wd); + close(fd); + for (i = 0; i < 2; i++) + unlink(test_files[i]); + pass(); + return 0; +err: + for (i = 0; i < 2; i++) + unlink(test_files[i]); + return 1; +} diff --git a/CRIU_code/test/zdtm/static/inotify_irmap.desc b/CRIU_code/test/zdtm/static/inotify_irmap.desc new file mode 100644 index 0000000..525a4a6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/inotify_irmap.desc @@ -0,0 +1 
@@ +{'flags': 'suid', 'opts' : '--force-irmap --irmap-scan-path /zdtm/static'} diff --git a/CRIU_code/test/zdtm/static/inotify_irmap.hook b/CRIU_code/test/zdtm/static/inotify_irmap.hook new file mode 100644 index 0000000..ed3691e --- /dev/null +++ b/CRIU_code/test/zdtm/static/inotify_irmap.hook @@ -0,0 +1,19 @@ +#!/bin/sh + +umask 0000 + +[ "$1" = "--pre-restore" ] && { + exit + # emulate rsync + rm -rf etc/zdtm-test + touch etc/zdtm-test +} + +[ "$1" = "--post-pre-dump" ] && { + echo 'invalidate the irmap cache' + mv etc/zdtm-test etc/zdtm-test2 + mv etc/zdtm-test1 etc/zdtm-test + mv etc/zdtm-test2 etc/zdtm-test1 +} + +exit 0 diff --git a/CRIU_code/test/zdtm/static/inotify_system.c b/CRIU_code/test/zdtm/static/inotify_system.c new file mode 100644 index 0000000..59f47c4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/inotify_system.c @@ -0,0 +1,391 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Inotify on symlink should be checked"; +#ifndef NODEL +char filename[] = "file"; +char linkname[] = "file.lnk"; +const char *inot_dir = "./inotify"; +#else +char filename[] = "file.nodel"; +char linkname[] = "file.nodel.lnk"; +const char *inot_dir = "./inotify.nodel"; +#endif + +#ifdef __NR_inotify_init +#include + +#ifndef IN_DONT_FOLLOW +/* Missed in SLES 10 header */ +#define IN_DONT_FOLLOW 0x02000000 +#endif + +#define EVENT_MAX 1024 +/* size of the event structure, not counting name */ +#define EVENT_SIZE (sizeof (struct inotify_event)) +/* reasonable guess as to size of 1024 events */ +#define EVENT_BUF_LEN (EVENT_MAX * (EVENT_SIZE + 16)) +#define BUF_SIZE 256 + +#define min_value(a,b) (a +#include + +typedef struct { + int inot; + int file; + int link; + int dir; +} desc; + +void do_wait() { + test_daemon(); + test_waitsig(); +} + +int createFiles(char *path, char *target, char *link) { + int fd; + fd = open(path,O_CREAT, 0644); + if (fd < 0) 
{ + pr_perror("can't open %s", path); + return -1; + } + close(fd); + if (symlink(target, link) < 0) { + pr_perror("can't symlink %s to %s", path, link); + return -1; + } + return 0; +} + +int addWatcher(int fd, const char *path) { + int wd; + wd = inotify_add_watch(fd, path, IN_ALL_EVENTS | IN_DONT_FOLLOW); + if (wd < 0) { + pr_perror("inotify_add_watch(%d, %s, IN_ALL_EVENTS) Failed, %s", + fd, path, strerror(errno)); + return -1; + } + return wd; +} + +int fChmod(char *path) { + if (chmod(path, 0755) < 0) { + pr_perror("chmod(%s, 0755) Failed, %s", + path, strerror(errno)); + return -1; + } + return 0; +} + +int fWriteClose(char *path) { + int fd = open(path, O_RDWR | O_CREAT, 0700); + if (fd == -1) { + pr_perror("open(%s, O_RDWR|O_CREAT,0700) Failed, %s", + path, strerror(errno)); + return -1; + } + if (write(fd, "string", 7) == -1) { + pr_perror("write(%d, %s, 1) Failed, %s", fd, path, strerror(errno)); + return -1; + } + if (close(fd) == -1) { + pr_perror("close(%s) Failed, %s", path, strerror(errno)); + return -1; + } + return 0; +} + +int fNoWriteClose(char *path) { + char buf[BUF_SIZE]; + int fd = open(path, O_RDONLY); + if ( fd < 0 ) { + pr_perror("open(%s, O_RDONLY) Failed, %s", + path, strerror(errno)); + return -1; + } + if (read(fd, buf, BUF_SIZE) == -1) { + pr_perror("read error: %s", strerror(errno)); + close(fd); + return -1; + } + if (close(fd) == -1) { + pr_perror("close(%s) Failed, %s", path, strerror(errno)); + return -1; + } + return 0; +} + +int fMove(char *from, char *to) { + if (rename(from, to) == -1) { + pr_perror("rename error (from: %s to: %s) : %s", + from, to, strerror(errno)); + return -1; + } + return 0; +} + +desc init_env(const char *dir, char *file_path, char *link_path) { + desc in_desc = {-1, -1, -1, -1}; + if (mkdir(dir, 0777) < 0) { + pr_perror("error in creating directory: %s, %s", + dir, strerror(errno)); + return in_desc; + } + in_desc.inot = inotify_init(); + if (in_desc.inot < 0) { + pr_perror("inotify_init () Failed, 
%s", strerror(errno)); + rmdir(dir); + return in_desc; + } + + if (snprintf(file_path, BUF_SIZE, "%s/%s", dir, filename) >= BUF_SIZE) { + pr_perror("filename %s is too long", filename); + rmdir(dir); + return in_desc; + } + + if (snprintf(link_path, BUF_SIZE, "%s/%s", dir, linkname) >= BUF_SIZE) { + pr_perror("filename %s is too long", linkname); + rmdir(dir); + return in_desc; + } + + in_desc.dir = addWatcher(in_desc.inot, dir); + if (createFiles(file_path, filename, link_path)) { + return in_desc; + } + in_desc.link = addWatcher(in_desc.inot, link_path); + in_desc.file = addWatcher(in_desc.inot, file_path); + + return in_desc; +} + +int fDelete(char *path) { + if (unlink(path) != 0) { + pr_perror("unlink: (%s)", strerror(errno)); + return -1; + } + return 0; +} + +int fRemDir(const char *target) { + if(rmdir(target)) { + pr_perror("rmdir: (%s)", strerror(errno)); + return -1; + } + return 0; +} + +int test_actions(const char *dir, char *file_path, char *link_path) { + + if ( + fChmod(link_path) == 0 && + fWriteClose(link_path) == 0 && + fNoWriteClose(link_path) == 0 && + fMove(file_path, filename) == 0 && + fMove(filename, file_path) == 0 +#ifndef NODEL + && fDelete(file_path) == 0 && + fDelete(link_path) == 0 && + fRemDir(dir) == 0 +#endif + ) + { + return 0; + } + return -1; +} + +void dump_events(char *buf, int len) { + int marker = 0; + struct inotify_event *event; + while (marker < len) { + event = (struct inotify_event *) &buf[marker]; + test_msg("\t%s (%x mask, %d len", handle_event(event->mask), event->mask, event->len); + if (event->len) + test_msg(", '%s' name", event->name); + test_msg(")\n"); + marker += EVENT_SIZE + event->len; + } +} + +int harmless(int mask) +{ + switch (mask) { + case IN_CLOSE_NOWRITE: + case IN_ATTRIB: + return 1; + } + return 0; +} + +int errors(int exp_len, int len, char *etalon_buf, char *buf) { + int marker=0; + int error=0; + while (marker < len){ + struct inotify_event *event; + struct inotify_event *exp_event; + event = 
(struct inotify_event *) &buf[marker]; + /* It's OK if some additional events are recevived */ + if (marker < exp_len) + exp_event = (struct inotify_event *) &etalon_buf[marker]; + else { + if (!harmless(event->mask)) { + fail("got unexpected event %s (%x mask)\n", + handle_event(event->mask), event->mask); + error++; + } + goto next_event; + } + + if (event->mask != exp_event->mask) { + fail("Handled %s (%x mask), expected %s (%x mask)", + handle_event(event->mask), event->mask, + handle_event(exp_event->mask), + exp_event->mask); + error++; + } + if (event->len != exp_event->len) { + fail("Incorrect length of field name."); + error++; + break; + } + else if (event->len && strncmp(event->name, exp_event->name, event->len)) { + fail("Handled file name %s, expected %s", + event->name, + exp_event->name); + error++; + } +next_event: + marker += EVENT_SIZE + event->len; + } + return error; +} + +int read_set(int inot_fd, char *event_set) { + int len; + if ((len = read(inot_fd, event_set, EVENT_BUF_LEN)) < 0) { + pr_perror("read(%d, buf, %lu) Failed, errno=%d", + inot_fd, (unsigned long)EVENT_BUF_LEN, errno); + return -1; + } + return len; +} + +void common_close(desc *descr) { + if (descr->inot > 0) { + close(descr->inot); + descr->inot=-1; + descr->file=-1; + descr->dir=-1; + descr->link=-1; + } +} + +int get_event_set(char *event_set, int wait) { + int len; + char link_path[BUF_SIZE]; + char file_path[BUF_SIZE]; + desc common_desc; + + common_desc = init_env(inot_dir, file_path, link_path); + if ((common_desc.inot < 0) || (common_desc.file < 0) || \ + (common_desc.dir < 0) || (common_desc.link < 0)) { + common_close(&common_desc); + return -1; + } + if(test_actions(inot_dir, file_path, link_path) < 0) { + common_close(&common_desc); + return -1; + } + if (wait) { + do_wait(); + } + len = read_set(common_desc.inot, event_set); + common_close(&common_desc); +#ifdef NODEL + if (! 
(fDelete(file_path) == 0 && + fDelete(link_path) == 0 && + fRemDir(inot_dir) == 0)) + return -1; +#endif + return len; +} + +int check(int len, char *event_set, int exp_len, char *etalon_event_set) { + + if ((exp_len < 0) || (len < 0)){ + fail("Error in preparing event sets."); + return -1; + } + if (len < exp_len) { + fail("Events are lost. Read: %d, Expected: %d", len, exp_len); + test_msg("expected events\n"); + dump_events(etalon_event_set, exp_len); + test_msg("real events\n"); + dump_events(event_set, len); + return -1; + } + if (errors(exp_len, len, etalon_event_set, event_set) == 0) { + pass(); + return 0; + } + return -1; +} + +int main(int argc, char ** argv) +{ + int exp_len=-1, len=-1; + char etalon_event_set[EVENT_BUF_LEN]; + char event_set[EVENT_BUF_LEN]; + + test_init(argc, argv); + + exp_len = get_event_set(etalon_event_set, 0); + len = get_event_set(event_set, 1); + + if (check(len, event_set, exp_len, etalon_event_set)) { + return 1; + } + return 0; +} +#else + +int main(int argc, char ** argv) +{ + test_init(argc, argv); + skip("Inotify not supported."); + return 0; +} +#endif //__NR_inotify_init diff --git a/CRIU_code/test/zdtm/static/inotify_system.desc b/CRIU_code/test/zdtm/static/inotify_system.desc new file mode 100644 index 0000000..95c58b4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/inotify_system.desc @@ -0,0 +1 @@ +{'flags': 'noauto'} diff --git a/CRIU_code/test/zdtm/static/inotify_system_nodel.c b/CRIU_code/test/zdtm/static/inotify_system_nodel.c new file mode 100644 index 0000000..36049d9 --- /dev/null +++ b/CRIU_code/test/zdtm/static/inotify_system_nodel.c @@ -0,0 +1 @@ +inotify_system.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/inotify_system_nodel.desc b/CRIU_code/test/zdtm/static/inotify_system_nodel.desc new file mode 100644 index 0000000..95c58b4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/inotify_system_nodel.desc @@ -0,0 +1 @@ +{'flags': 'noauto'} diff --git 
a/CRIU_code/test/zdtm/static/ipc_namespace.c b/CRIU_code/test/zdtm/static/ipc_namespace.c new file mode 100644 index 0000000..d01d654 --- /dev/null +++ b/CRIU_code/test/zdtm/static/ipc_namespace.c @@ -0,0 +1,414 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +#define CLONE_NEWIPC 0x08000000 + +extern int msgctl (int __msqid, int __cmd, struct msqid_ds *__buf); +extern int semctl (int __semid, int __semnum, int __cmd, ...); +extern int shmctl (int __shmid, int __cmd, struct shmid_ds *__buf); + +struct ipc_ids { + int in_use; /* TODO: Check for 0 */ +// unsigned short seq; +// unsigned short seq_max; +// struct rw_semaphore rw_mutex; +// struct idr ipcs_idr; /* TODO */ +}; + +struct ipc_ns { + struct ipc_ids ids[3]; + + int sem_ctls[4]; // + + int used_sems; // + + + int msg_ctlmax; // + + int msg_ctlmnb; // + + int msg_ctlmni; // + + int msg_bytes; // + + int msg_hdrs; // + + int auto_msgmni; // + + int msg_next_id; // + + int sem_next_id; // + + int shm_next_id; // + + + size_t shm_ctlmax; + size_t shm_ctlall; + int shm_ctlmni; + int shm_tot; + int shm_rmid_forced; + +// struct vfsmount *mq_mnt; + +// unsigned int mq_queues_count; + + unsigned int mq_queues_max; /* initialized to DFLT_QUEUESMAX */ + unsigned int mq_msg_max; /* initialized to DFLT_MSGMAX */ + unsigned int mq_msgsize_max; /* initialized to DFLT_MSGSIZEMAX */ + unsigned int mq_msg_default; /* initialized to DFLT_MSG */ + unsigned int mq_msgsize_default; /* initialized to DFLT_MSGSIZE */ + + struct user_ns *user_ns; +}; + +#define IPC_SEM_IDS 0 +#define IPC_MSG_IDS 1 +#define IPC_SHM_IDS 2 + +const char *test_doc = "Check that ipc ns context migrated successfully"; +const char *test_author = "Stanislav Kinsbursky "; + +struct ipc_ns ipc_before, ipc_after; + +static int read_ipc_sysctl(char *name, int *data, size_t size) +{ + int fd; + int ret; + char buf[32]; + + fd = open(name, O_RDONLY); + if (fd < 0) { + 
pr_perror("Can't open %s", name); + return fd; + } + ret = read(fd, buf, 32); + if (ret < 0) { + pr_perror("Can't read %s", name); + ret = -errno; + goto err; + } + *data = (int)strtoul(buf, NULL, 10); + ret = 0; +err: + close(fd); + return ret; +} + +static int get_messages_info(struct ipc_ns *ipc) +{ + struct msginfo info; + int ret; + + ret = msgctl(0, MSG_INFO, (struct msqid_ds *)&info); + if (ret < 0) { + pr_perror("msgctl failed with %d", errno); + return ret; + } + + ipc->msg_ctlmax = info.msgmax; + ipc->msg_ctlmnb = info.msgmnb; + ipc->msg_ctlmni = info.msgmni; + ipc->msg_bytes = info.msgtql; + ipc->msg_hdrs = info.msgmap; + ipc->ids[IPC_MSG_IDS].in_use = info.msgpool; + + if (read_ipc_sysctl("/proc/sys/kernel/auto_msgmni", + &ipc->auto_msgmni, sizeof(ipc->auto_msgmni))) + return -1; + if (read_ipc_sysctl("/proc/sys/kernel/msg_next_id", + &ipc->msg_next_id, sizeof(ipc->msg_next_id))) + return -1; + if (read_ipc_sysctl("/proc/sys/kernel/sem_next_id", + &ipc->sem_next_id, sizeof(ipc->sem_next_id))) + return -1; + if (read_ipc_sysctl("/proc/sys/kernel/shm_next_id", + &ipc->shm_next_id, sizeof(ipc->shm_next_id))) + return -1; + if (read_ipc_sysctl("/proc/sys/fs/mqueue/queues_max", + (int *)&ipc->mq_queues_max, sizeof(ipc->mq_queues_max))) + return -1; + if (read_ipc_sysctl("/proc/sys/fs/mqueue/msg_max", + (int *)&ipc->mq_msg_max, sizeof(ipc->mq_msg_max))) + return -1; + if (read_ipc_sysctl("/proc/sys/fs/mqueue/msgsize_max", + (int *)&ipc->mq_msgsize_max, sizeof(ipc->mq_msgsize_max))) + return -1; + if (read_ipc_sysctl("/proc/sys/fs/mqueue/msg_default", + (int *)&ipc->mq_msg_default, sizeof(ipc->mq_msg_default))) + return -1; + if (read_ipc_sysctl("/proc/sys/fs/mqueue/msgsize_default", + (int *)&ipc->mq_msgsize_default, sizeof(ipc->mq_msgsize_default))) + return -1; + + return 0; +} + +static int get_semaphores_info(struct ipc_ns *ipc) +{ + int err; + struct seminfo info; + + err = semctl(0, 0, SEM_INFO, &info); + if (err < 0) + pr_perror("semctl failed with 
%d", errno); + + ipc->sem_ctls[0] = info.semmsl; + ipc->sem_ctls[1] = info.semmns; + ipc->sem_ctls[2] = info.semopm; + ipc->sem_ctls[3] = info.semmni; + ipc->used_sems = info.semaem; + ipc->ids[IPC_SEM_IDS].in_use = info.semusz; + + return 0; +} + +static int get_shared_memory_info(struct ipc_ns *ipc) +{ + int ret; + union { + struct shminfo64 shminfo64; + struct shm_info shminfo; + struct shmid_ds shmid; + } u; + + ret = shmctl(0, IPC_INFO, &u.shmid); + if (ret < 0) + pr_perror("semctl failed with %d", errno); + + ipc->shm_ctlmax = u.shminfo64.shmmax; + ipc->shm_ctlall = u.shminfo64.shmall; + ipc->shm_ctlmni = u.shminfo64.shmmni; + + ret = shmctl(0, SHM_INFO, &u.shmid); + if (ret < 0) + pr_perror("semctl failed with %d", errno); + + ipc->shm_tot = u.shminfo.shm_tot; + ipc->ids[IPC_SHM_IDS].in_use = u.shminfo.used_ids; + + if (read_ipc_sysctl("/proc/sys/kernel/shm_rmid_forced", + &ipc->shm_rmid_forced, sizeof(ipc->shm_rmid_forced))) + return -1; + + return 0; +} + + +int fill_ipc_ns(struct ipc_ns *ipc) +{ + int ret; + + ret = get_messages_info(ipc); + if (ret < 0) { + pr_perror("Failed to collect messages"); + return ret; + } + + ret = get_semaphores_info(ipc); + if (ret < 0) { + pr_perror("Failed to collect semaphores"); + return ret; + } + + ret = get_shared_memory_info(ipc); + if (ret < 0) { + pr_perror("Failed to collect shared memory"); + return ret; + } + return 0; +} + +static int rand_ipc_sysctl(char *name, unsigned int val) +{ + int fd; + int ret; + char buf[32]; + + fd = open(name, O_WRONLY); + if (fd < 0) { + pr_perror("Can't open %s", name); + return fd; + } + sprintf(buf, "%d\n", val); + ret = write(fd, buf, strlen(buf)); + if (ret < 0) { + pr_perror("Can't write %u into %s", val, name); + return -errno; + } + close(fd); + return 0; +} + +#define MAX_MNI (1<<15) + +static int rand_ipc_sem(void) +{ + int fd; + int ret; + char buf[128]; + char *name = "/proc/sys/kernel/sem"; + + fd = open(name, O_WRONLY); + if (fd < 0) { + pr_perror("Can't open %s", 
name); + return fd; + } + sprintf(buf, "%d %d %d %d\n", (unsigned) lrand48(), (unsigned) lrand48(), + (unsigned) lrand48(), (unsigned) lrand48() % MAX_MNI); + ret = write(fd, buf, 128); + if (ret < 0) { + pr_perror("Can't write %s: %d", name, errno); + return -errno; + } + close(fd); + return 0; +} + +static int rand_ipc_ns(void) +{ + int ret; + + ret = rand_ipc_sem(); + if (!ret) + ret = rand_ipc_sysctl("/proc/sys/kernel/msgmax", (unsigned)lrand48()); + if (!ret) + ret = rand_ipc_sysctl("/proc/sys/kernel/msgmnb", (unsigned)lrand48()); + if (!ret) + ret = rand_ipc_sysctl("/proc/sys/kernel/msgmni", (unsigned)lrand48() % MAX_MNI); + if (!ret) + ret = rand_ipc_sysctl("/proc/sys/kernel/auto_msgmni", 0); + if (!ret && (unsigned)lrand48() % 2) + ret = rand_ipc_sysctl("/proc/sys/kernel/msg_next_id", (unsigned)lrand48() % ((unsigned)INT_MAX + 1)); + if (!ret && (unsigned)lrand48() % 2) + ret = rand_ipc_sysctl("/proc/sys/kernel/sem_next_id", (unsigned)lrand48() % ((unsigned)INT_MAX + 1)); + if (!ret && (unsigned)lrand48() % 2) + ret = rand_ipc_sysctl("/proc/sys/kernel/shm_next_id", (unsigned)lrand48() % ((unsigned)INT_MAX + 1)); + if (!ret) + ret = rand_ipc_sysctl("/proc/sys/kernel/shmmax", (unsigned)lrand48()); + if (!ret) + ret = rand_ipc_sysctl("/proc/sys/kernel/shmall", (unsigned)lrand48()); + if (!ret) + ret = rand_ipc_sysctl("/proc/sys/kernel/shmmni", (unsigned)lrand48() % MAX_MNI); + if (!ret) + ret = rand_ipc_sysctl("/proc/sys/kernel/shm_rmid_forced", (unsigned)lrand48() & 1); + + + if (!ret) + ret = rand_ipc_sysctl("/proc/sys/fs/mqueue/queues_max", (((unsigned)lrand48()) % 1023) + 1); + if (!ret) + ret = rand_ipc_sysctl("/proc/sys/fs/mqueue/msg_max", ((unsigned)lrand48() % 65536) + 1); + if (!ret) + ret = rand_ipc_sysctl("/proc/sys/fs/mqueue/msgsize_max", ((unsigned)lrand48() & (8192 * 128 - 1)) | 128); + if (!ret) + ret = rand_ipc_sysctl("/proc/sys/fs/mqueue/msg_default", ((unsigned)lrand48() % 65536) + 1); + if (!ret) + ret = 
rand_ipc_sysctl("/proc/sys/fs/mqueue/msgsize_default", ((unsigned)lrand48() & (8192 * 128 - 1)) | 128); + + if (ret < 0) + pr_perror("Failed to randomize ipc namespace tunables"); + + return ret; +} + +static void show_ipc_entry(struct ipc_ns *old, struct ipc_ns *new) +{ + int i; + + for (i = 0; i < 3; i++) { + if (old->ids[i].in_use != new->ids[i].in_use) + pr_perror("ids[%d].in_use differs: %d ---> %d", i, + old->ids[i].in_use, new->ids[i].in_use); + + } + for (i = 0; i < 4; i++) { + if (old->sem_ctls[i] != new->sem_ctls[i]) + pr_perror("sem_ctls[%d] differs: %d ---> %d", i, + old->sem_ctls[i], new->sem_ctls[i]); + + } + + if (old->msg_ctlmax != new->msg_ctlmax) + pr_perror("msg_ctlmax differs: %d ---> %d", + old->msg_ctlmax, new->msg_ctlmax); + if (old->msg_ctlmnb != new->msg_ctlmnb) + pr_perror("msg_ctlmnb differs: %d ---> %d", + old->msg_ctlmnb, new->msg_ctlmnb); + if (old->msg_ctlmni != new->msg_ctlmni) + pr_perror("msg_ctlmni differs: %d ---> %d", + old->msg_ctlmni, new->msg_ctlmni); + if (old->auto_msgmni != new->auto_msgmni) + pr_perror("auto_msgmni differs: %d ---> %d", + old->auto_msgmni, new->auto_msgmni); + if (old->msg_next_id != new->msg_next_id) + pr_perror("msg_next_id differs: %d ---> %d", + old->msg_next_id, new->msg_next_id); + if (old->sem_next_id != new->sem_next_id) + pr_perror("sem_next_id differs: %d ---> %d", + old->sem_next_id, new->sem_next_id); + if (old->shm_next_id != new->shm_next_id) + pr_perror("shm_next_id differs: %d ---> %d", + old->shm_next_id, new->shm_next_id); + if (old->shm_ctlmax != new->shm_ctlmax) + pr_perror("shm_ctlmax differs: %zu ---> %zu", + old->shm_ctlmax, new->shm_ctlmax); + if (old->shm_ctlall != new->shm_ctlall) + pr_perror("shm_ctlall differs: %zu ---> %zu", + old->shm_ctlall, new->shm_ctlall); + if (old->shm_ctlmni != new->shm_ctlmni) + pr_perror("shm_ctlmni differs: %d ---> %d", + old->shm_ctlmni, new->shm_ctlmni); + if (old->shm_rmid_forced != new->shm_rmid_forced) + pr_perror("shm_rmid_forced differs: %d 
---> %d", + old->shm_rmid_forced, new->shm_rmid_forced); + if (old->mq_queues_max != new->mq_queues_max) + pr_perror("mq_queues_max differs: %d ---> %d", + old->mq_queues_max, new->mq_queues_max); + if (old->mq_msg_max != new->mq_msg_max) + pr_perror("mq_msg_max differs: %d ---> %d", + old->mq_msg_max, new->mq_msg_max); + if (old->mq_msgsize_max != new->mq_msgsize_max) + pr_perror("mq_msgsize_max differs: %d ---> %d", + old->mq_msgsize_max, new->mq_msgsize_max); + if (old->mq_msg_default != new->mq_msg_default) + pr_perror("mq_msg_default differs: %d ---> %d", + old->mq_msg_default, new->mq_msg_default); + if (old->mq_msgsize_default != new->mq_msgsize_default) + pr_perror("mq_msgsize_default differs: %d ---> %d", + old->mq_msgsize_default, new->mq_msgsize_default); +} + +int main(int argc, char **argv) +{ + int ret; + + test_init(argc, argv); + + ret = rand_ipc_ns(); + if (ret) { + pr_perror("Failed to randomize ipc ns before migration"); + return -1; + } + + ret = fill_ipc_ns(&ipc_before); + if (ret) { + pr_perror("Failed to collect ipc ns before migration"); + return ret; + } + + test_daemon(); + test_waitsig(); + + ret = fill_ipc_ns(&ipc_after); + if (ret) { + pr_perror("Failed to collect ipc ns after migration"); + return ret; + } + + if (memcmp(&ipc_before, &ipc_after, sizeof(ipc_after))) { + pr_perror("IPC's differ"); + show_ipc_entry(&ipc_before, &ipc_after); + return -EINVAL; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/ipc_namespace.desc b/CRIU_code/test/zdtm/static/ipc_namespace.desc new file mode 100644 index 0000000..7b16528 --- /dev/null +++ b/CRIU_code/test/zdtm/static/ipc_namespace.desc @@ -0,0 +1 @@ +{'flavor': 'ns', 'flags' : 'suid'} diff --git a/CRIU_code/test/zdtm/static/jobctl00.c b/CRIU_code/test/zdtm/static/jobctl00.c new file mode 100644 index 0000000..397747b --- /dev/null +++ b/CRIU_code/test/zdtm/static/jobctl00.c @@ -0,0 +1,301 @@ +#include +#include +#include +#include +#include +#include +#include +#include + 
+#include "zdtmtst.h" + +const char *test_doc = "Check that job control migrates correctly"; +const char *test_author = "Roman Kagan "; + +#define JOBS_DEF 8 +#define JOBS_MAX 64 +unsigned int num_jobs = JOBS_DEF; +TEST_OPTION(num_jobs, uint, "# \"jobs\" in a \"shell\" " + "(default " __stringify(JOBS_DEF) + ", max " __stringify(JOBS_MAX) ")", 0); + +#define PROCS_DEF 4 +unsigned int num_procs = PROCS_DEF; +TEST_OPTION(num_procs, uint, "# processes in a \"job\" " + "(default " __stringify(PROCS_DEF) ")", 0); + +static const char wr_string[] = "All you need is love!\n"; +static const char rd_string[] = "We all live in a yellow submarine\n"; +static const char susp_char = '\032'; /* ^Z */ + +static volatile sig_atomic_t signo = 0; + +static void record_sig(int sig) +{ + signo = sig; +} + +static void record_and_raise_sig(int sig) +{ + signo = sig; + signal(sig, SIG_DFL); + raise(sig); +} + +static int wait4sig(int sig) +{ + sigset_t mask, oldmask; + sigemptyset(&mask); + sigaddset(&mask, sig); + sigaddset(&mask, SIGCHLD); /* to see our children die */ + + sigprocmask(SIG_BLOCK, &mask, &oldmask); + while (!signo) + sigsuspend (&oldmask); + sigprocmask (SIG_UNBLOCK, &mask, NULL); + + return signo != sig; +} + +static int is_fg(void) +{ + pid_t pgid = getpgrp(); + pid_t tcpgid = tcgetpgrp(1); + + return (pgid != -1) && (pgid == tcpgid); +} + +static int reader(int sig) +{ + char str[sizeof(rd_string) + 1]; + return read(0, str, sizeof(str)) < 0 || + strcmp(str, rd_string); +} + +static int post_reader(int fd) +{ + if (write(fd, rd_string, sizeof(rd_string) - 1) < 0) { + fail("write failed: %m"); + return -1; + } + return 0; +} + +static int writer(int sig) +{ + return write(1, wr_string, sizeof(wr_string) - 1) < 0; +} + +static int post_writer(int fd) +{ + char str[sizeof(wr_string) + 1]; + if (read(0, str, sizeof(str)) < 0) { + fail("read failed: %m"); + return -1; + } + /* + if (strcmp(str, wr_string)) { + fail("read string mismatch"); + return -1; + } + */ + return 
0; +} + +static struct job_type { + int sig; + int (*action)(int sig); + int (*post)(int fd); +} job_types[] = { + { SIGTTOU, writer, post_writer }, + { SIGTTIN, reader, post_reader }, + { SIGCONT, wait4sig, NULL }, +}; + +static int process(int (*action)(int), int sig) +{ + int ret; + if (is_fg()) /* we must be in background on entry */ + return 1; + + if (signal(sig, record_and_raise_sig) == SIG_ERR) + return 2; + + kill(getppid(), SIGUSR2); /* tell the parent we're ready */ + + ret = action(sig); /* will be busy doing nothing for the duration of migration */ + if (ret) + return 3; + + if (!is_fg()) /* we must be in foreground now */ + return 4; + + ret = signo != sig; /* have we got the desired signal? */ + + test_waitsig(); + return ret; +} + +static int job(int (*action)(int), int sig) +{ + int i; + + if (setpgrp() < 0) + return 1; + + for (i = num_procs; i; i--) { + pid_t pid = fork(); + if (pid < 0) + kill(0, SIGKILL); /* kill the whole job */ + + if (pid == 0) + /* the last is worker, others are sleepers */ + exit(process(i == 1 ? 
action : wait4sig, sig)); + + /* wait for the child to grow up before going to next one + * ignore return code as the child may get stopped and SIGCHILD + * us */ + wait4sig(SIGUSR2); + signo = 0; /* rearm sighandler */ + } + + kill(getppid(), SIGUSR2); /* tell the parent we're ready */ + + /* we (or our children) will get suspended somehow here, so the rest + * will hopefully happen after migration */ + for (i = num_procs; i; i--) { + int ret; + wait(&ret); + if (!WIFEXITED(ret) || WEXITSTATUS(ret)) + kill(0, SIGKILL); + } + + return 0; +} + +static int make_pty_pair(int *fdmaster, int *fdslave) +{ + struct termios tio; + + if (openpty(fdmaster, fdslave, NULL, &tio, NULL) < 0) + return -1; + + if (ioctl(*fdslave, TIOCSCTTY, NULL) < 0) + return -1; + + tio.c_lflag |= (ICANON | ISIG | TOSTOP); + if (tcsetattr(*fdslave, TCSANOW, &tio) < 0) + return -1; + return 0; +} + +int start_jobs(pid_t *jobs, int njobs, int fdmaster, int fdslave) +{ + int i; + + /* the children will signal readiness via SIGUSR2 or get stopped (or + * exit :) and signal that via SIGCHLD */ + if (signal(SIGUSR2, record_sig) == SIG_ERR || + signal(SIGCHLD, record_sig) == SIG_ERR) { + pr_perror("can't install signal handler"); + return -1; + } + + for (i = 0; i < njobs; i++) { + int jtno = i % (sizeof(job_types) / sizeof(job_types[0])); + + jobs[i] = fork(); + if (jobs[i] < 0) { /* we're busted - bail out */ + pr_perror("fork failed"); + goto killout; + } + + if (jobs[i] == 0) { + close(fdmaster); + dup2(fdslave, 0); + dup2(fdslave, 1); + dup2(fdslave, 2); + close(fdslave); + + exit(job(job_types[jtno].action, job_types[jtno].sig)); + } + + /* wait for the child to grow up before proceeding */ + wait4sig(SIGUSR2); + signo = 0; /* rearm sighandler */ + } + + return 0; +killout: + for (; i >= 0; i--) + kill(-jobs[i], SIGKILL); + return -1; +} + +int finish_jobs(pid_t *jobs, int njobs, int fdmaster, int fdslave) +{ + int i; + + for (i = num_jobs; i--; ) { + int ret; + int jtno = i % (sizeof(job_types) 
/ sizeof(job_types[0])); + + if (tcsetpgrp(fdslave, jobs[i]) < 0) { + fail("can't bring a job into foreground: %m"); + goto killout; + } + + kill(-jobs[i], SIGCONT); + + if (job_types[jtno].post && job_types[jtno].post(fdmaster)) + goto killout; + + kill(-jobs[i], SIGTERM); + + waitpid(jobs[i], &ret, 0); + if (!WIFEXITED(ret) || WEXITSTATUS(ret)) { + fail("job didn't exit cleanly: %d", ret); + goto killout; + } + } + return 0; +killout: + for (; i >= 0; i--) + kill(-jobs[i], SIGKILL); + return -1; +} + +int main(int argc, char ** argv) +{ + int fdmaster, fdslave; + pid_t jobs[JOBS_MAX] = {}; + + test_init(argc, argv); + + if (num_jobs > JOBS_MAX) { + pr_perror("%d jobs is too many", num_jobs); + exit(1); + } + + if (make_pty_pair(&fdmaster, &fdslave) < 0) { + pr_perror("can't make pty pair"); + exit(1); + } + + sleep(30); + + if (start_jobs(jobs, num_jobs, fdmaster, fdslave)) { + pr_perror("failed to start jobs"); + exit(1); + } + + test_daemon(); + test_waitsig(); + + if (finish_jobs(jobs, num_jobs, fdmaster, fdslave)) + fail("failed to finish jobs"); + else + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/link10.c b/CRIU_code/test/zdtm/static/link10.c new file mode 100644 index 0000000..4bc2b5f --- /dev/null +++ b/CRIU_code/test/zdtm/static/link10.c @@ -0,0 +1,80 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Migrate two hardlinked, open, and unlinked files"; +const char *test_author = "Roman Kagan "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +int main(int argc, char ** argv) +{ + int fd, fd2 = 0; + struct stat stat, stat2; + char filename2[256]; + + test_init(argc, argv); + + if (snprintf(filename2, sizeof(filename2), "%s.lnk", filename) >= + sizeof(filename2)) { + pr_perror("filename %s is too long", filename); + exit(1); + } + + fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0644); + if (fd < 0) { + pr_perror("can't 
open %s", filename); + exit(1); + } + + if (link(filename, filename2) < 0) { + pr_perror("can't link %s to %s", filename, filename2); + goto unlink; + } + + fd2 = open(filename2, O_RDONLY); + if (fd < 0) { + pr_perror("can't open %s", filename2); + goto unlink; + } + + unlink(filename2); + unlink(filename); + + test_daemon(); + test_waitsig(); + + if (fstat(fd, &stat) < 0 || fstat(fd2, &stat2) < 0) { + fail("fstat failed: %m"); + goto out; + } + + if (stat.st_ino != stat2.st_ino || + stat.st_dev != stat2.st_dev) { + fail("files are different: st_ino %lu != %lu or st_dev %lu != %lu", + (long unsigned)stat.st_ino, (long unsigned)stat2.st_ino, + (long unsigned)stat.st_dev, (long unsigned)stat2.st_dev); + } + + pass(); + +out: + close(fd); + close(fd2); + return 0; + +unlink: + close(fd); + close(fd2); + unlink(filename2); + unlink(filename); + return 1; +} diff --git a/CRIU_code/test/zdtm/static/loginuid.c b/CRIU_code/test/zdtm/static/loginuid.c new file mode 100644 index 0000000..628fe64 --- /dev/null +++ b/CRIU_code/test/zdtm/static/loginuid.c @@ -0,0 +1,99 @@ +#include +#include +#include +#include +#include + +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check for /proc/self/loginuid restore"; +const char *test_author = "Dmitry Safonov "; + +const char loginuid_self[] = "/proc/self/loginuid"; +const uid_t test_value = 3; +const uid_t INVALID_UID = (uid_t)-1; + +uid_t get_loginuid(const char *path, int *err) +{ + int fd; + ssize_t num; + char buf[11]; + + *err = 0; + fd = open(path, O_RDONLY); + if (fd < 0) { + pr_perror("Failed to open %s", path); + goto out; + } + + num = read(fd, buf, 10); + close(fd); + if (num < 0) { + pr_perror("Unable to read %s", path); + goto out; + } + buf[num] = '\0'; + + return strtol(buf, NULL, 10); + +out: + *err = -1; + return 0; +} + +int set_loginuid(const char *path, uid_t value) +{ + int fd, ret = 0; + char buf[11]; + + fd = open(path, O_RDWR); + if (fd < 0) { + pr_perror("Failed to open %s", path); + 
return -1; + } + + snprintf(buf, 11, "%u", value); + + if (write(fd, buf, 11) < 0) { + pr_perror("Write %s to %s failed", buf, path); + ret = -1; + } + + close(fd); + return ret; +} + + +int main(int argc, char *argv[]) +{ + int ret; + uid_t new_loginuid; + + /* unset before test */ + if (set_loginuid(loginuid_self, INVALID_UID) < 0) + return -1; + + test_init(argc, argv); + + if (set_loginuid(loginuid_self, test_value) < 0) + return -1; + + test_daemon(); + test_waitsig(); + + new_loginuid = get_loginuid(loginuid_self, &ret); + if (ret < 0) + return -1; + + if (new_loginuid != test_value) { + fail("loginuid value %d is different after restore: %d\n", + test_value, new_loginuid); + return -1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/loginuid.desc b/CRIU_code/test/zdtm/static/loginuid.desc new file mode 100644 index 0000000..e27f6cc --- /dev/null +++ b/CRIU_code/test/zdtm/static/loginuid.desc @@ -0,0 +1 @@ +{'feature': 'loginuid'} diff --git a/CRIU_code/test/zdtm/static/macvlan.c b/CRIU_code/test/zdtm/static/macvlan.c new file mode 100644 index 0000000..018dffd --- /dev/null +++ b/CRIU_code/test/zdtm/static/macvlan.c @@ -0,0 +1,70 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "check that macvlan interfaces are c/r'd correctly"; +const char *test_author = "Tycho Andersen "; + +#define BRIDGE_NAME "zdtmbr0" +#define IF_NAME "zdtmmvlan0" + +static bool wait_for_macvlan(void) +{ + int i; + + for (i = 0; i < 10; i++) { + if (system("ip addr list dev " IF_NAME) == 0) + return true; + + sleep(1); + } + + return false; +} + +int main(int argc, char **argv) +{ + int ret = 1; + + test_init(argc, argv); + + if (!wait_for_macvlan()) { + fail("failed to inject macvlan device\n"); + return 1; + } + + if (system("ip addr list dev " IF_NAME " > macvlan.dump.test")) { + fail("can't save net config"); + goto out; + } + + 
test_daemon(); + test_waitsig(); + + if (system("ip addr list dev " IF_NAME " > macvlan.rst.test")) { + fail("can't get net config"); + goto out; + } + + if (system("diff macvlan.rst.test macvlan.dump.test")) { + fail("Net config differs after restore"); + goto out; + } + + pass(); + ret = 0; + +out: + return ret; +} diff --git a/CRIU_code/test/zdtm/static/macvlan.desc b/CRIU_code/test/zdtm/static/macvlan.desc new file mode 100644 index 0000000..cf0fee4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/macvlan.desc @@ -0,0 +1,8 @@ +{ 'deps': [ '/bin/sh', + '/usr/bin/sort', + '/bin/grep', + '/sbin/ip|/bin/ip', + '/usr/bin/diff'], + 'flags': 'suid', + 'flavor': 'ns uns', + 'ropts': '--external macvlan[zdtmmvlan0]:zdtmbr0'} diff --git a/CRIU_code/test/zdtm/static/macvlan.hook b/CRIU_code/test/zdtm/static/macvlan.hook new file mode 100644 index 0000000..d0a06b6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/macvlan.hook @@ -0,0 +1,33 @@ +#!/bin/bash + +[ "$1" == "--clean" -o "$1" == "--pre-restore" -o "$1" == "--post-start" ] || exit 0 + +if [ "$1" == "--post-start" ]; then + set -e + + i=0 + PIDF="zdtm/static/macvlan.pid.inprogress" + while [ ! 
-f "$PIDF" ]; do + i=$(($i+1)) + if [ "$i" -eq "10" ]; then + echo "failed to create macvlan test" + exit 1 + fi + sleep 1 + done + + TPID=$(cat $PIDF) + + ip link add zdtmbr0 type bridge + ip addr add 10.0.55.55/32 dev zdtmbr0 + ip link set zdtmbr0 up + ip link add zdtmmvlan0 link zdtmbr0 type macvlan mode bridge + ip addr add 10.0.55.56/32 dev zdtmmvlan0 + ip link set zdtmmvlan0 netns $TPID +else + ip link del zdtmmvlan0 || true + + [ "$1" == "--clean" ] || exit 0 + + ip link del zdtmbr0 || true +fi diff --git a/CRIU_code/test/zdtm/static/maps00.c b/CRIU_code/test/zdtm/static/maps00.c new file mode 100644 index 0000000..a6c68cd --- /dev/null +++ b/CRIU_code/test/zdtm/static/maps00.c @@ -0,0 +1,268 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "Create all sorts of maps and compare /proc/pid/maps\n" + "before and after migration\n"; +const char *test_author = "Pavel Emelianov "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +const static int map_prots[] = { + PROT_NONE, + PROT_READ, + PROT_READ | PROT_WRITE, + PROT_READ | PROT_WRITE | PROT_EXEC, +}; +#define NUM_MPROTS sizeof(map_prots) / sizeof(int) +#define RW_PROT(x) ((x) & (PROT_READ | PROT_WRITE)) +#define X_PROT(x) ((x) & PROT_EXEC) + +int check_prot(int src_prot, int dst_prot) +{ + if (RW_PROT(src_prot) != RW_PROT(dst_prot)) + return 0; + /* If exec bit will be enabled may depend on NX capability of CPUs of + * source and destination nodes. 
In any case, migrated mapping should + * not have less permissions than newly created one + ** + * A is a subset of B iff (A & B) == A + */ + return (X_PROT(dst_prot) & X_PROT(src_prot)) == X_PROT(dst_prot); +} + +const static int map_flags[] = { + MAP_PRIVATE, + MAP_SHARED, + MAP_PRIVATE | MAP_ANONYMOUS, + MAP_SHARED | MAP_ANONYMOUS +}; +#define NUM_MFLAGS sizeof(map_flags) / sizeof(int) +#define NUM_MAPS NUM_MPROTS * NUM_MFLAGS +#define ONE_MAP_SIZE 0x2000 + +struct map +{ + int prot; + int prot_real; + int flag; + char filename[256]; + int fd; + void *ptr; +}; + +static void init_map(struct map *map, int prot_no, int flag_no) +{ + map->fd = -1; + map->prot = map_prots[prot_no]; + map->flag = map_flags[flag_no]; +} + +static int make_map(struct map *map) +{ + uint32_t crc; + uint8_t buf[ONE_MAP_SIZE]; + static int i = 0; + + if (!(map->flag & MAP_ANONYMOUS)) { + /* need file */ + if (snprintf(map->filename, sizeof(map->filename), + "%s-%02d", filename, i++) >= sizeof(map->filename)) { + pr_perror("filename %s is too long", filename); + return -1; + } + + map->fd = open(map->filename, O_RDWR | O_CREAT, 0600); + if (map->fd < 0) { + pr_perror("can't open %s", map->filename); + return -1; + } + + crc = ~0; + datagen(buf, sizeof(buf), &crc); + if (write(map->fd, buf, sizeof(buf)) != sizeof(buf)) { + pr_perror("failed to write %s", map->filename); + return -1; + } + } + + map->ptr = mmap(NULL, ONE_MAP_SIZE, map->prot, map->flag, map->fd, 0); + if (map->ptr == MAP_FAILED) { + pr_perror("can't create mapping"); + return -1; + } + + if ((map->flag & MAP_ANONYMOUS) && (map->prot & PROT_WRITE)) { + /* can't fill it with data otherwise */ + crc = ~0; + datagen(map->ptr, ONE_MAP_SIZE, &crc); + } + + test_msg("map: ptr %p flag %8x prot %8x\n", + map->ptr, map->flag, map->prot); + + return 0; +} + +static sigjmp_buf segv_ret; /* we need sig*jmp stuff, otherwise SIGSEGV will reset our handler */ +static void segfault(int signo) +{ + siglongjmp(segv_ret, 1); +} + +/* + * after 
test func should be placed check map, because size of test_func + * is calculated as (check_map-test_func) + */ +int test_func() +{ + return 1; +} +static int check_map(struct map *map) +{ + int prot = PROT_WRITE | PROT_READ | PROT_EXEC; + + if (signal(SIGSEGV, segfault) == SIG_ERR) + { + fail("setting SIGSEGV handler failed: %m\n"); + return -1; + } + if (!sigsetjmp(segv_ret, 1)) + { + uint32_t crc = ~0; + if (datachk(map->ptr, ONE_MAP_SIZE, &crc)) /* perform read access */ + if (!(map->flag & MAP_ANONYMOUS) || + (map->prot & PROT_WRITE)) { /* anon maps could only be filled when r/w */ + fail("CRC mismatch: ptr %p flag %8x prot %8x\n", + map->ptr, map->flag, map->prot); + return -1; + } + /* prot |= PROT_READ// need barrier before this line, + because compiler change order commands. + I finded one method: look at next lines*/ + } else + prot &= PROT_WRITE | !PROT_READ | PROT_EXEC; + + if (signal(SIGSEGV, segfault) == SIG_ERR) + { + fail("setting SIGSEGV handler failed: %m\n"); + return -1; + } + + if (!sigsetjmp(segv_ret, 1)) + { + * (int *) (map->ptr) = 1234; /* perform write access */ + } else + prot &= !PROT_WRITE | PROT_READ | PROT_EXEC; + + if (signal(SIGSEGV, segfault) == SIG_ERR) + { + fail("restoring SIGSEGV handler failed: %m\n"); + return -1; + } + + if (!sigsetjmp(segv_ret, 1)) + { + if (map->prot & PROT_WRITE) { + memcpy(map->ptr,test_func, getpagesize()); + } else { + if (!(map->flag & MAP_ANONYMOUS)) { + lseek(map->fd,0,SEEK_SET); + if (write(map->fd,test_func,check_map - test_func)filename); + return -1; + } + } + } + if (!(map->flag & MAP_ANONYMOUS) || map->prot & PROT_WRITE) + /* Function body has been copied into the mapping */ + ((int (*)())map->ptr)(); /* perform exec access */ + else + /* No way to copy function body into mapping, + * clear exec bit from effective protection + */ + prot &= PROT_WRITE | PROT_READ | !PROT_EXEC; + } else + prot &= PROT_WRITE | PROT_READ | !PROT_EXEC; + + if (signal(SIGSEGV, SIG_DFL) == SIG_ERR) + { + 
fail("restoring SIGSEGV handler failed: %m\n"); + return -1; + } + + return prot; +} + +static void destroy_map(struct map *map) +{ + munmap(map->ptr, ONE_MAP_SIZE); + + if (map->fd >= 0) + { + close(map->fd); + unlink(map->filename); + } +} + + +#define MAPS_LEN 0x10000 + +int main(int argc, char ** argv) +{ + struct map maps[NUM_MAPS] = {}, maps_compare[NUM_MAPS] = {}; + int i, j, k; + test_init(argc, argv); + + k = 0; + for (i = 0; i < NUM_MPROTS; i++) + for (j = 0; j < NUM_MFLAGS; j++) + init_map(maps + k++, i, j); + + for (i = 0; i < NUM_MAPS; i++) + if (make_map(maps + i)) + goto err; + + test_daemon(); + test_waitsig(); + + for (i = 0; i < NUM_MAPS; i++) + if ((maps[i].prot_real=check_map(maps + i))<0) + goto err; + k=0; + for (i = 0; i < NUM_MPROTS; i++) + for (j = 0; j < NUM_MFLAGS; j++) + init_map(maps_compare + k++, i, j); + for (i = 0; i < NUM_MAPS; i++) + if (make_map(maps_compare+ i)) + goto err; + for (i = 0; i < NUM_MAPS; i++) + if ((maps_compare[i].prot_real=check_map(maps_compare + i))<0) + goto err; + for (i = 0; i< NUM_MAPS; i++) + if (!check_prot(maps[i].prot_real, maps_compare[i].prot_real)){ + fail("protection on %i (flag=%d prot=%d) maps has changed (prot=%d(expected %d))", + i, maps[i].flag, maps[i].prot, maps[i].prot_real, maps_compare[i].prot_real); + goto err; + } + + pass(); + + for (i = 0; i < NUM_MAPS; i++) { + destroy_map(maps + i); + destroy_map(maps_compare + i); + } + return 0; + +err: + return 1; +} diff --git a/CRIU_code/test/zdtm/static/maps01.c b/CRIU_code/test/zdtm/static/maps01.c new file mode 100644 index 0000000..119d7a6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/maps01.c @@ -0,0 +1,183 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +#define MEM_SIZE (1LU << 30) +#define MEM_OFFSET (1LU << 29) +#define MEM_OFFSET2 (MEM_SIZE - PAGE_SIZE) +#define MEM_OFFSET3 (20LU * PAGE_SIZE) + +const char *test_doc = "Test shared memory"; +const 
char *test_author = "Andrew Vagin > 20); + goto err; + } + + p = mmap(NULL, MEM_SIZE, PROT_WRITE | PROT_READ, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + + if (p == MAP_FAILED) { + pr_err("Failed to mmap %ld Mb shared anonymous R/W memory\n", + MEM_SIZE >> 20); + goto err; + } + + p2 = mmap(NULL, MEM_OFFSET, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p2 == MAP_FAILED) { + pr_err("Failed to mmap %lu Mb anonymous memory\n", + MEM_OFFSET >> 20); + goto err; + } + + pid = test_fork(); + if (pid < 0) { + pr_err("Fork failed with %d\n", pid); + goto err; + } else if (pid == 0) { + void *p3; + + p3 = mmap(NULL, MEM_OFFSET3, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p3 == MAP_FAILED) { + pr_err("Failed to mmap %lu Mb anonymous R/W memory\n", + MEM_OFFSET3 >> 20); + goto err; + } + + crc = ~0; + datagen(m + MEM_OFFSET, PAGE_SIZE, &crc); + crc = ~0; + datagen(m + MEM_OFFSET2, PAGE_SIZE, &crc); + crc = ~0; + datagen(p + MEM_OFFSET + MEM_OFFSET3, PAGE_SIZE, &crc); + crc = ~0; + datagen(p + MEM_OFFSET + 2 * MEM_OFFSET3, PAGE_SIZE, &crc); + crc = ~0; + datagen(p + MEM_OFFSET3, PAGE_SIZE, &crc); + crc = ~0; + datagen(p3, PAGE_SIZE, &crc); + + task_waiter_complete(&t, 1); + + test_waitsig(); + + crc = ~0; + status = datachk(m + MEM_OFFSET, PAGE_SIZE, &crc); + if (status) + return 1; + crc = ~0; + status = datachk(m + MEM_OFFSET2, PAGE_SIZE, &crc); + if (status) + return 1; + crc = ~0; + status = datachk(m + PAGE_SIZE, PAGE_SIZE, &crc); + if (status) + return 1; + crc = ~0; + status = datachk(p + MEM_OFFSET + 2 * MEM_OFFSET3, PAGE_SIZE, &crc); + if (status) + return 1; + crc = ~0; + status = datachk(p + MEM_OFFSET3, PAGE_SIZE, &crc); + if (status) + return 1; + crc = ~0; + status = datachk(p3, PAGE_SIZE, &crc); + if (status) + return 1; + return 0; + } + task_waiter_wait4(&t, 1); + + munmap(p, MEM_OFFSET); + p2 = mremap(p + MEM_OFFSET, MEM_OFFSET, MEM_OFFSET, MREMAP_FIXED | MREMAP_MAYMOVE, p2); + if (p2 == MAP_FAILED) + goto err; + + snprintf(path, 
PATH_MAX, "/proc/self/map_files/%lx-%lx", + (unsigned long) m, + (unsigned long) m + MEM_SIZE); + fd = open(path, O_RDWR); + if (fd == -1) { + pr_perror("Can't open file %s", path); + goto err; + } + + m2 = mmap(NULL, PAGE_SIZE, PROT_WRITE | PROT_READ, MAP_SHARED, fd, MEM_OFFSET3); + if (m2 == MAP_FAILED) { + pr_perror("Can't map file %s", path); + goto err; + } + close(fd); + + munmap(m, PAGE_SIZE); + munmap(m + PAGE_SIZE * 10, PAGE_SIZE); + munmap(m + MEM_OFFSET2, PAGE_SIZE); + + crc = ~0; + datagen(m + PAGE_SIZE, PAGE_SIZE, &crc); + + crc = ~0; + datagen(m2, PAGE_SIZE, &crc); + + test_daemon(); + test_waitsig(); + + kill(pid, SIGTERM); + wait(&status); + if (WIFEXITED(status)) { + if (WEXITSTATUS(status)) + goto err; + } else + goto err; + + crc = ~0; + if (datachk(m + MEM_OFFSET, PAGE_SIZE, &crc)) + goto err; + + crc = ~0; + if (datachk(m2, PAGE_SIZE, &crc)) + goto err; + + crc = ~0; + if (datachk(p2 + MEM_OFFSET3, PAGE_SIZE, &crc)) + goto err; + + pass(); + + return 0; +err: + if (waitpid(-1, NULL, WNOHANG) == 0) { + kill(pid, SIGTERM); + wait(NULL); + } + return 1; +} diff --git a/CRIU_code/test/zdtm/static/maps01.desc b/CRIU_code/test/zdtm/static/maps01.desc new file mode 100644 index 0000000..d969725 --- /dev/null +++ b/CRIU_code/test/zdtm/static/maps01.desc @@ -0,0 +1 @@ +{'flavor': 'h ns', 'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/maps02.c b/CRIU_code/test/zdtm/static/maps02.c new file mode 100644 index 0000000..eb7c09b --- /dev/null +++ b/CRIU_code/test/zdtm/static/maps02.c @@ -0,0 +1,111 @@ +#include +#include "zdtmtst.h" +#include "get_smaps_bits.h" + +#ifndef MADV_DONTDUMP +#define MADV_DONTDUMP 16 +#endif + +const char *test_doc = "Test shared memory with advises"; +const char *test_author = "Cyrill Gorcunov "; + +struct mmap_data { + void *start; + unsigned long orig_flags; + unsigned long orig_madv; + unsigned long new_flags; + unsigned long new_madv; +}; + +#define MEM_SIZE (8192) + +static int alloc_anon_mmap(struct mmap_data *m, 
int flags, int adv) +{ + m->start = mmap(NULL, MEM_SIZE, PROT_READ | PROT_WRITE, + flags, -1, 0); + if (m->start == MAP_FAILED) { + pr_perror("mmap failed"); + return -1; + } + + if (madvise(m->start, MEM_SIZE, adv)) { + if (errno == EINVAL) { + test_msg("madvise failed, no kernel support\n"); + munmap(m->start, MEM_SIZE); + *m = (struct mmap_data){ }; + } else { + pr_perror("madvise failed"); + return -1; + } + } + + return 0; +} + +int main(int argc, char **argv) +{ + struct mmap_data m[5] = { }; + size_t i; + + test_init(argc, argv); + + test_msg("Alloc growsdown\n"); + if (alloc_anon_mmap(&m[0], MAP_PRIVATE | MAP_ANONYMOUS, MADV_DONTFORK)) + return -1; + + test_msg("Alloc locked/sequential\n"); + if (alloc_anon_mmap(&m[1], MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, MADV_SEQUENTIAL)) + return -1; + + test_msg("Alloc noreserve/dontdump\n"); + if (alloc_anon_mmap(&m[2], MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, MADV_DONTDUMP)) + return -1; + + test_msg("Alloc hugetlb/hugepage\n"); + if (alloc_anon_mmap(&m[3], MAP_PRIVATE | MAP_ANONYMOUS, MADV_HUGEPAGE)) + return -1; + + test_msg("Alloc dontfork/random|mergeable\n"); + if (alloc_anon_mmap(&m[4], MAP_PRIVATE | MAP_ANONYMOUS, MADV_MERGEABLE)) + return -1; + + test_msg("Fetch existing flags/adv\n"); + for (i = 0; i < sizeof(m)/sizeof(m[0]); i++) { + if (get_smaps_bits((unsigned long)m[i].start, + &m[i].orig_flags, + &m[i].orig_madv)) + return -1; + } + + test_daemon(); + test_waitsig(); + + test_msg("Fetch restored flags/adv\n"); + for (i = 0; i < sizeof(m)/sizeof(m[0]); i++) { + if (get_smaps_bits((unsigned long)m[i].start, + &m[i].new_flags, + &m[i].new_madv)) + return -1; + + if (m[i].orig_flags != m[i].new_flags) { + pr_perror("Flags are changed %lx %lx -> %lx (%zu)", + (unsigned long)m[i].start, + m[i].orig_flags, m[i].new_flags, i); + fail(); + return -1; + } + + if (m[i].orig_madv != m[i].new_madv) { + pr_perror("Madvs are changed %lx %lx -> %lx (%zu)", + (unsigned long)m[i].start, + m[i].orig_madv, 
m[i].new_madv, i); + fail(); + return -1; + } + + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/maps03.c b/CRIU_code/test/zdtm/static/maps03.c new file mode 100644 index 0000000..f2bf795 --- /dev/null +++ b/CRIU_code/test/zdtm/static/maps03.c @@ -0,0 +1,47 @@ +#include +#include +#include +#include +#include "zdtmtst.h" + +#if (LONG_MAX == 2147483647L) /* 32 bit */ + +#define TEST_SKIP_REASON "64-bit arch required" +#include "skip-me.c" + +#else + +const char *test_doc = "Test for huge VMA area"; +const char *test_author = "Cyrill Gorcunov "; + +int main(int argc, char **argv) +{ + test_init(argc, argv); + unsigned char *mem; + + test_msg("Alloc huge VMA\n"); + mem = (void *)mmap(NULL, (10L << 30), PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if ((void *)mem == MAP_FAILED) { + pr_perror("mmap failed"); + return -1; + } + + mem[4L << 30] = 1; + mem[8L << 30] = 2; + + test_daemon(); + test_waitsig(); + + test_msg("Testing restored data\n"); + + if (mem[4L << 30] != 1 || mem[8L << 30] != 2) { + fail("Data corrupted!\n"); + exit(1); + } + + pass(); + + return 0; +} +#endif diff --git a/CRIU_code/test/zdtm/static/maps03.desc b/CRIU_code/test/zdtm/static/maps03.desc new file mode 100644 index 0000000..95c58b4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/maps03.desc @@ -0,0 +1 @@ +{'flags': 'noauto'} diff --git a/CRIU_code/test/zdtm/static/maps04.c b/CRIU_code/test/zdtm/static/maps04.c new file mode 100644 index 0000000..780c566 --- /dev/null +++ b/CRIU_code/test/zdtm/static/maps04.c @@ -0,0 +1,57 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +#define MEM_SIZE (1L << 29) + +const char *test_doc = "Test big mappings"; +const char *test_author = "Andrew Vagin +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "Create a bunch of small VMAs and test they 
survive transferring\n"; +const char *test_author = "Cyrill Gorcunov "; + +#define NR_MAPS 4096 + +#define NR_MAPS_1 (NR_MAPS + 0) +#define NR_MAPS_2 (NR_MAPS + 1) + +#define MAPS_SIZE_1 (140 << 10) +#define MAPS_SIZE_2 (8192) + +int main(int argc, char *argv[]) +{ + void *map[NR_MAPS + 2] = { }, *addr; + size_t i, summary; + + test_init(argc, argv); + + summary = NR_MAPS * 2 * 4096 + MAPS_SIZE_1 + MAPS_SIZE_2 + (1 << 20); + + addr = mmap(NULL, summary, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (addr == MAP_FAILED) { + pr_perror("Can't mmap"); + return 1; + } + munmap(addr, summary); + + for (i = 0; i < NR_MAPS; i++) { + map[i] = mmap(i > 0 ? map[i - 1] + 8192 : addr, 4096, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (map[i] == MAP_FAILED) { + pr_perror("Can't mmap"); + return 1; + } else { + /* Dirtify it */ + int *v = (void *)map[i]; + *v = i; + } + } + + map[NR_MAPS_1] = mmap(map[NR_MAPS_1 - 1] + 8192, MAPS_SIZE_1, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_GROWSDOWN, -1, 0); + if (map[NR_MAPS_1] == MAP_FAILED) { + pr_perror("Can't mmap"); + return 1; + } else { + /* Dirtify it */ + int *v = (void *)map[NR_MAPS_1]; + *v = i; + test_msg("map-1: %p %p\n", map[NR_MAPS_1], map[NR_MAPS_1] + MAPS_SIZE_1); + } + + map[NR_MAPS_2] = mmap(map[NR_MAPS_1] + MAPS_SIZE_1, MAPS_SIZE_2, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_GROWSDOWN, -1, 0); + if (map[NR_MAPS_2] == MAP_FAILED) { + pr_perror("Can't mmap"); + return 1; + } else { + /* Dirtify it */ + int *v = (void *)map[NR_MAPS_2]; + *v = i; + test_msg("map-2: %p %p\n", map[NR_MAPS_2], map[NR_MAPS_2] + MAPS_SIZE_2); + } + + test_daemon(); + test_waitsig(); + + for (i = 0; i < NR_MAPS; i++) { + int *v = (void *)map[i]; + + if (*v != i) { + fail("Data corrupted at page %lu", (unsigned long)i); + return 1; + } + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/maps06.c b/CRIU_code/test/zdtm/static/maps06.c new file mode 100644 
index 0000000..7480d6b --- /dev/null +++ b/CRIU_code/test/zdtm/static/maps06.c @@ -0,0 +1,70 @@ +#include "zdtmtst.h" +#include +#include +#include +#include + +const char *test_doc = "Create a lot of file vma-s"; +const char *test_author = "Andrei Vagin "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +int main(int argc, char ** argv) +{ + void *start; + int fd, i; + int ps = sysconf(_SC_PAGESIZE); + int test_size; + + test_init(argc, argv); + + fd = open(filename, O_RDWR | O_CREAT, 0666); + if (fd < 0) + return 1; + + ftruncate(fd, ps); + + if (ps == 0x1000) + test_size = 10240; + else + test_size = 512; + + start = mmap(0, ps * test_size * 4, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (start == MAP_FAILED) + return 1; + + for (i = 0; i < test_size; i++) { + int *addr; + addr = mmap(start + i * 3 * ps, ps, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_FILE | MAP_FIXED, fd, 0); + if (addr == MAP_FAILED) + return 1; + addr[0] = i * 2; + addr = mmap(start + (i * 3 + 1) * ps, ps, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + if (addr == MAP_FAILED) + return 1; + addr[0] = i; + } + + test_daemon(); + + test_waitsig(); + + for (i = 0; i < test_size; i++) { + int *addr; + addr = start + i * 3 * ps; + if (addr[0] != i * 2) + fail(); + addr = start + (i * 3 + 1) * ps; + if (addr[0] != i) + fail(); + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/maps_file_prot.c b/CRIU_code/test/zdtm/static/maps_file_prot.c new file mode 100644 index 0000000..3b28c1f --- /dev/null +++ b/CRIU_code/test/zdtm/static/maps_file_prot.c @@ -0,0 +1,53 @@ +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "Test mappings of same file with different prot"; +const char *test_author = "Jamie Liu "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +#define die(fmt, arg...) 
do { pr_perror(fmt, ## arg); return 1; } while (0) + +int main(int argc, char ** argv) +{ + void *ro_map, *rw_map; + int fd; + + test_init(argc, argv); + + fd = open(filename, O_RDWR | O_CREAT, 0644); + if (fd < 0) + die("open failed"); + if (ftruncate(fd, 2 * PAGE_SIZE)) + die("ftruncate failed"); + + ro_map = mmap(NULL, 2 * PAGE_SIZE, PROT_READ, MAP_SHARED, fd, 0); + if (ro_map == MAP_FAILED) + die("mmap failed"); + rw_map = ro_map + PAGE_SIZE; + if (mprotect(rw_map, PAGE_SIZE, PROT_READ | PROT_WRITE)) + die("mprotect failed"); + + close(fd); + + test_daemon(); + test_waitsig(); + + /* Check that rw_map is still writeable */ + *(volatile char *)rw_map = 1; + + if (mprotect(ro_map, PAGE_SIZE, PROT_READ | PROT_WRITE)) { + fail("mprotect after restore failed"); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/mem-touch.c b/CRIU_code/test/zdtm/static/mem-touch.c new file mode 100644 index 0000000..5c8c339 --- /dev/null +++ b/CRIU_code/test/zdtm/static/mem-touch.c @@ -0,0 +1,62 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check changing memory"; +const char *test_author = "Pavel Emelyanov "; + +#define MEM_PAGES 16 + +int main(int argc, char **argv) +{ + void *mem; + int i, fail = 0; + unsigned rover = 1; + unsigned backup[MEM_PAGES] = {}; + + srand(time(NULL)); + + test_init(argc, argv); + + mem = mmap(NULL, MEM_PAGES * PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (mem == MAP_FAILED) + return 1; + + test_msg("mem %p backup %p\n", mem, backup); + + test_daemon(); + while (test_go()) { + unsigned pfn; + struct timespec req = { .tv_sec = 0, .tv_nsec = 100000, }; + + pfn = random() % MEM_PAGES; + *(unsigned *)(mem + pfn * PAGE_SIZE) = rover; + backup[pfn] = rover; + test_msg("t %u %u\n", pfn, rover); + rover++; + nanosleep(&req, NULL); + } + test_waitsig(); + + test_msg("final rover %u\n", rover); + for (i = 0; i < MEM_PAGES; i++) + if 
(backup[i] != *(unsigned *)(mem + i * PAGE_SIZE)) { + test_msg("Page %u differs want %u has %u\n", i, + backup[i], *(unsigned *)(mem + i * PAGE_SIZE)); + fail = 1; + } else + test_msg("Page %u matches %u\n", i, backup[i]); + + if (fail) + fail("Memory corruption\n"); + else + pass(); + + return 0; +} + diff --git a/CRIU_code/test/zdtm/static/mem-touch.desc b/CRIU_code/test/zdtm/static/mem-touch.desc new file mode 100644 index 0000000..95c58b4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/mem-touch.desc @@ -0,0 +1 @@ +{'flags': 'noauto'} diff --git a/CRIU_code/test/zdtm/static/mlock_setuid.c b/CRIU_code/test/zdtm/static/mlock_setuid.c new file mode 100644 index 0000000..a737bdd --- /dev/null +++ b/CRIU_code/test/zdtm/static/mlock_setuid.c @@ -0,0 +1,57 @@ +#include +#include +#include +#include "zdtmtst.h" +#include "get_smaps_bits.h" + +#define MEM_SIZE (69632) + +int main(int argc, char **argv) +{ + int ret; + void *start; + unsigned long new_flags = 0; + unsigned long new_madv = 0; + test_init(argc, argv); + + test_msg("Alloc vma of size %d\n", MEM_SIZE); + start = mmap(NULL, MEM_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (start == MAP_FAILED) { + pr_perror("mmap failed"); + return -1; + } + + test_msg("Lock vma from %p to %lx\n", + start, (unsigned long)start + MEM_SIZE); + ret = mlock(start, MEM_SIZE); + if (ret < 0) { + pr_perror("mlock"); + return -1; + } + + test_daemon(); + + test_msg("Setuid to 18943\n"); + ret = setuid(18943); + if (ret < 0) { + pr_perror("setuid"); + return -1; + } + + test_waitsig(); + + ret = get_smaps_bits((unsigned long)start, &new_flags, &new_madv); + if (ret < 0) + return -1; + + test_msg("Check smaps flags for MAP_LOCKED\n"); + if (new_flags & MAP_LOCKED) { + pass(); + } else { + fail("Vma is not locked after c/r\n"); + return -1; + } + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/mlock_setuid.desc b/CRIU_code/test/zdtm/static/mlock_setuid.desc new file mode 100644 index 0000000..d969725 
--- /dev/null +++ b/CRIU_code/test/zdtm/static/mlock_setuid.desc @@ -0,0 +1 @@ +{'flavor': 'h ns', 'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/mmx00.c b/CRIU_code/test/zdtm/static/mmx00.c new file mode 100644 index 0000000..f0f7c3c --- /dev/null +++ b/CRIU_code/test/zdtm/static/mmx00.c @@ -0,0 +1,99 @@ +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Start a calculation, leaving MMX in a certain state,\n" +"before migration, continue after"; +const char *test_author = "Pavel Emelianov "; + +#if defined(__i386__) || defined(__x86_64__) +void start(uint8_t *bytes, uint16_t *words) +{ + __asm__ volatile ( + "movq %0, %%mm0\n" + "movq %1, %%mm1\n" + "movq %2, %%mm2\n" + "movq %3, %%mm3\n" + "paddb %%mm0, %%mm1\n" + "psubw %%mm2, %%mm3\n" + : + : "m" (bytes[0]), "m" (bytes[8]), + "m" (words[0]), "m" (words[4]) + ); +} + +void finish(uint8_t *bytes, uint16_t *words) +{ + __asm__ volatile ( + "movq %%mm1, %0\n" + "movq %%mm3, %1\n" + : "=m" (bytes[0]), "=m" (words[0]) + ); +} + +static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) +{ + __asm__("cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (op), "c"(0)); +} + +int chk_proc_mmx(void) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(1, &eax, &ebx, &ecx, &edx); + return edx & (1 << 23); +} +#endif + +int main(int argc, char **argv) +{ +#if defined(__i386__) || defined(__x86_64__) + uint8_t bytes[16]; + uint16_t words[8]; + uint32_t rnd[8]; + int i; + + uint8_t resbytes1[8], resbytes2[8]; + uint16_t reswords1[4], reswords2[4]; +#endif + + test_init(argc, argv); +#if defined(__i386__) || defined(__x86_64__) + if (!chk_proc_mmx()) { + skip("MMX not supported"); + return 1; + } + + for (i = 0; i < (sizeof(bytes) + sizeof(words)) / 4; i++) + rnd[i] = mrand48(); + + memcpy((uint8_t *) bytes, (uint8_t *) rnd, sizeof(bytes)); + memcpy((uint8_t *) words, (uint8_t *) rnd + sizeof(bytes), sizeof(words)); + + 
start(bytes, words); + finish(resbytes1, reswords1); + + start(bytes, words); + + test_daemon(); + test_waitsig(); + + finish(resbytes2, reswords2); + + if (memcmp((uint8_t *) resbytes1, (uint8_t *) resbytes2, sizeof(resbytes1))) + fail("byte op mismatch\n"); + else if (memcmp((uint8_t *) reswords1, (uint8_t *) reswords2, sizeof(reswords2))) + fail("word op mismatch\n"); + else + pass(); +#else + skip("Unsupported arch"); +#endif + return 0; +} diff --git a/CRIU_code/test/zdtm/static/mmx00.desc b/CRIU_code/test/zdtm/static/mmx00.desc new file mode 100644 index 0000000..d2f501d --- /dev/null +++ b/CRIU_code/test/zdtm/static/mmx00.desc @@ -0,0 +1 @@ +{'arch': 'x86_64'} diff --git a/CRIU_code/test/zdtm/static/mnt_enablefs.c b/CRIU_code/test/zdtm/static/mnt_enablefs.c new file mode 100644 index 0000000..3de4c57 --- /dev/null +++ b/CRIU_code/test/zdtm/static/mnt_enablefs.c @@ -0,0 +1,43 @@ +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check enabled file systems (--enable-fs)"; +const char *test_author = "Andrei Vagin "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + char fname[PATH_MAX]; + + test_init(argc, argv); + + mkdir(dirname, 0777); + + if (mount("zdtm_nfsd", dirname, "nfsd", 0, NULL) == -1) { + pr_perror("mount"); + return -1; + } + + snprintf(fname, sizeof(fname), "%s/exports", dirname); + + test_daemon(); + test_waitsig(); + + if (access(fname, F_OK)) + fail(); + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/mnt_enablefs.checkskip b/CRIU_code/test/zdtm/static/mnt_enablefs.checkskip new file mode 100644 index 0000000..121f057 --- /dev/null +++ b/CRIU_code/test/zdtm/static/mnt_enablefs.checkskip @@ -0,0 +1,3 @@ +#!/bin/sh + +unshare -m --propagation private mount -t nfsd nfsd /mnt diff --git a/CRIU_code/test/zdtm/static/mnt_enablefs.desc b/CRIU_code/test/zdtm/static/mnt_enablefs.desc new file 
mode 100644
index 0000000..33a78ed
--- /dev/null
+++ b/CRIU_code/test/zdtm/static/mnt_enablefs.desc
@@ -0,0 +1,4 @@
+{   'feature': 'mnt_id',
+    'flags': 'suid',
+    'flavor': 'ns',
+    'opts': '--enable-fs nfsd'}
diff --git a/CRIU_code/test/zdtm/static/mnt_ext_auto.c b/CRIU_code/test/zdtm/static/mnt_ext_auto.c
new file mode 100644
index 0000000..9181b37
--- /dev/null
+++ b/CRIU_code/test/zdtm/static/mnt_ext_auto.c
@@ -0,0 +1,208 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "zdtmtst.h"
+
+const char *test_doc = "Check --mnt-ext-map";
+const char *test_author = "Andrew Vagin ";
+
+#ifdef ZDTM_EXTMAP_MANUAL
+char *dirname = "mnt_ext_manual.test";
+char *dirname_private_shared_bind = "mnt_ext_manual_private_shared_bind.test";
+char *dirname_bind = "mnt_ext_manual_bind.test";
+char *dirname_slave_shared_bind = "mnt_ext_manual_slave_shared_bind.test";
+char *dirname_slave_bind = "mnt_ext_manual_slave_bind.test";
+#define DDIR "mtest"
+#else
+char *dirname = "mnt_ext_auto.test";
+char *dirname_private_shared_bind = "mnt_ext_auto_private_shared_bind.test";
+char *dirname_bind = "mnt_ext_auto_bind.test";
+char *dirname_slave_shared_bind = "mnt_ext_auto_slave_shared_bind.test";
+char *dirname_slave_bind = "mnt_ext_auto_slave_bind.test";
+#define DDIR "atest"
+#endif
+TEST_OPTION(dirname, string, "directory name", 1);
+
+/*
+ * When ZDTM_NEWNS is "1", mount a tmpfs before unshare(CLONE_NEWNS) and
+ * build a chain of bind mounts from it (plain, private+shared, bind of
+ * that, slave+shared, slave).  Otherwise skip straight to the checks.
+ *
+ * The checks record st_dev of every mount point before the checkpoint
+ * and fail if any of them differs after restore.
+ */
+int main(int argc, char ** argv)
+{
+	char src[PATH_MAX], dst[PATH_MAX], *root;
+	char dst_bind[PATH_MAX], dst_private_shared_bind[PATH_MAX],
+	     dst_slave_shared_bind[PATH_MAX], dst_slave_bind[PATH_MAX];
+	char *dname = "/tmp/zdtm_ext_auto.XXXXXX";
+	struct stat sta, stb, bsta, bstb, ssbsta, sbsta, ssbstb, sbstb, psbsta, psbstb;
+	char* zdtm_newns = getenv("ZDTM_NEWNS");
+
+	root = getenv("ZDTM_ROOT");
+	if (root == NULL) {
+		pr_perror("root");
+		return 1;
+	}
+
+	sprintf(dst, "%s/%s", get_current_dir_name(), dirname);
+	sprintf(dst_private_shared_bind, "%s/%s", get_current_dir_name(), dirname_private_shared_bind);
+	sprintf(dst_bind, "%s/%s", get_current_dir_name(), dirname_bind);
+	sprintf(dst_slave_shared_bind, "%s/%s", get_current_dir_name(), dirname_slave_shared_bind);
+	sprintf(dst_slave_bind, "%s/%s", get_current_dir_name(), dirname_slave_bind);
+
+	if (!zdtm_newns) {
+		pr_perror("ZDTM_NEWNS is not set");
+		return 1;
+	} else if (strcmp(zdtm_newns, "1")) {
+		goto test;
+	}
+
+	mkdir(dname, 0755);
+	sprintf(src, "%s/%s", dname, DDIR);
+	if (mount("zdtm_auto_ext_mnt", dname, "tmpfs", 0, NULL)) {
+		pr_perror("mount");
+		return 1;
+	}
+	mkdir(src, 0755);
+
+	if (unshare(CLONE_NEWNS)) {
+		pr_perror("unshare");
+		return 1;
+	}
+	mkdir(dst, 0755);
+	if (mount(src, dst, NULL, MS_BIND, NULL)) {
+		pr_perror("bind");
+		return 1;
+	}
+	mkdir(dst_private_shared_bind, 0755);
+	if (mount(dst, dst_private_shared_bind, NULL, MS_BIND, NULL)) {
+		pr_perror("bind");
+		return 1;
+	}
+	if (mount("none", dst_private_shared_bind, NULL, MS_PRIVATE, NULL)) {
+		pr_perror("bind");
+		return 1;
+	}
+	if (mount("none", dst_private_shared_bind, NULL, MS_SHARED, NULL)) {
+		pr_perror("bind");
+		return 1;
+	}
+	mkdir(dst_bind, 0755);
+	if (mount(dst_private_shared_bind, dst_bind, NULL, MS_BIND, NULL)) {
+		pr_perror("bind");
+		return 1;
+	}
+	mkdir(dst_slave_shared_bind, 0755);
+	if (mount(dst_bind, dst_slave_shared_bind, NULL, MS_BIND, NULL)) {
+		pr_perror("bind");
+		return 1;
+	}
+	if (mount("none", dst_slave_shared_bind, NULL, MS_SLAVE, NULL)) {
+		pr_perror("bind");
+		return 1;
+	}
+	if (mount("none", dst_slave_shared_bind, NULL, MS_SHARED, NULL)) {
+		pr_perror("bind");
+		return 1;
+	}
+	mkdir(dst_slave_bind, 0755);
+	if (mount(dst_slave_shared_bind, dst_slave_bind, NULL, MS_BIND, NULL)) {
+		pr_perror("bind");
+		return 1;
+	}
+	if (mount("none", dst_slave_bind, NULL, MS_SLAVE, NULL)) {
+		pr_perror("bind");
+		return 1;
+	}
+test:
+	test_init(argc, argv);
+
+	if (stat(dirname, &stb)) {
+		pr_perror("stat");
+		sleep(100);
+		return 1;
+	}
+	if (stat(dirname_private_shared_bind, &psbstb)) {
+		pr_perror("stat");
+		sleep(100);
+		return 1;
+	}
+	if (stat(dirname_bind, &bstb)) {
+		pr_perror("stat");
+		sleep(100);
+		return 1;
+	}
+	if (stat(dirname_slave_shared_bind, &ssbstb)) {
+		pr_perror("stat");
+		sleep(100);
+		return 1;
+	}
+	if (stat(dirname_slave_bind, &sbstb)) {
+		pr_perror("stat");
+		sleep(100);
+		return 1;
+	}
+
+	test_daemon();
+	test_waitsig();
+
+	if (stat(dirname, &sta)) {
+		pr_perror("stat");
+		sleep(100);
+		return 1;
+	}
+	if (stat(dirname_private_shared_bind, &psbsta)) {
+		pr_perror("stat");
+		sleep(100);
+		return 1;
+	}
+	if (stat(dirname_bind, &bsta)) {
+		pr_perror("stat");
+		sleep(100);
+		return 1;
+	}
+	if (stat(dirname_slave_shared_bind, &ssbsta)) {
+		pr_perror("stat");
+		sleep(100);
+		return 1;
+	}
+	if (stat(dirname_slave_bind, &sbsta)) {
+		pr_perror("stat");
+		sleep(100);
+		return 1;
+	}
+
+	if (sta.st_dev != stb.st_dev) {
+		fail();
+		return 1;
+	}
+	if (psbsta.st_dev != psbstb.st_dev) {
+		fail();
+		return 1;
+	}
+	if (bsta.st_dev != bstb.st_dev) {
+		fail();
+		return 1;
+	}
+	if (ssbsta.st_dev != ssbstb.st_dev) {
+		fail();
+		return 1;
+	}
+	if (sbsta.st_dev != sbstb.st_dev) {
+		fail();
+		return 1;
+	}
+
+	pass();
+
+	return 0;
+}
diff --git a/CRIU_code/test/zdtm/static/mnt_ext_auto.desc b/CRIU_code/test/zdtm/static/mnt_ext_auto.desc
new file mode 100644
index 0000000..f899fc8
--- /dev/null
+++ b/CRIU_code/test/zdtm/static/mnt_ext_auto.desc
@@ -0,0 +1 @@
+{'flavor': 'ns uns', 'feature': 'mnt_id', 'opts': '--external mnt[]:s'}
diff --git a/CRIU_code/test/zdtm/static/mnt_ext_dev.c b/CRIU_code/test/zdtm/static/mnt_ext_dev.c
new file mode 100644
index 0000000..a9ac013
--- /dev/null
+++ b/CRIU_code/test/zdtm/static/mnt_ext_dev.c
@@ -0,0 +1,108 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "zdtmtst.h"
+
+const char *test_doc = "Check mounts of external devices";
+const char *test_author = "Andrei Vagin
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "zdtmtst.h"
+
+const char *test_doc = "Check that mounts with external master peers are c/r'd";
+const char *test_author = "Tycho Andersen ";
+
+char *dirname = "mnt_ext_auto.test";
+TEST_OPTION(dirname, string, "directory name", 1);
+
+/*
+ * When ZDTM_NEWNS is "1": mount a tmpfs before unshare(CLONE_NEWNS), bind
+ * its "test" directory onto $ZDTM_ROOT/ext_mounts inside the new mount
+ * namespace and turn that bind into a slave.  Then wait for the
+ * checkpoint/restore cycle; the test passes if it gets that far.
+ */
+int main(int argc, char ** argv)
+{
+	char src[PATH_MAX], dst[PATH_MAX], *root;
+	char *dname = "/tmp/zdtm_ext_auto.XXXXXX";
+	char *zdtm_newns = getenv("ZDTM_NEWNS");
+
+	root = getenv("ZDTM_ROOT");
+	if (root == NULL) {
+		pr_perror("root");
+		return 1;
+	}
+
+	snprintf(dst, sizeof(dst), "%s/ext_mounts", root);
+
+	/* strcmp() on a NULL getenv() result would crash; report it instead */
+	if (!zdtm_newns) {
+		pr_perror("ZDTM_NEWNS is not set");
+		return 1;
+	} else if (strcmp(zdtm_newns, "1")) {
+		goto test;
+	}
+
+	mkdir(dname, 0755);
+	snprintf(src, sizeof(src), "%s/test", dname);
+	if (mount("zdtm_auto_ext_mnt", dname, "tmpfs", 0, NULL)) {
+		pr_perror("mount");
+		return 1;
+	}
+
+	mkdir(src, 0755);
+	mkdir(dst, 0755);
+
+	if (unshare(CLONE_NEWNS)) {
+		pr_perror("unshare");
+		return 1;
+	}
+
+	if (mount(src, dst, NULL, MS_BIND, NULL)) {
+		pr_perror("bind");
+		return 1;
+	}
+
+	if (mount(src, dst, NULL, MS_SLAVE, NULL)) {
+		pr_perror("slave");
+		return 1;
+	}
+
+test:
+	test_init(argc, argv);
+
+	test_daemon();
+	test_waitsig();
+
+
+	pass();
+
+	return 0;
+}
diff --git a/CRIU_code/test/zdtm/static/mnt_ext_master.desc b/CRIU_code/test/zdtm/static/mnt_ext_master.desc
new file mode 100644
index 0000000..3f00dcf
--- /dev/null
+++ b/CRIU_code/test/zdtm/static/mnt_ext_master.desc
@@ -0,0 +1,3 @@
+{   'feature': 'mnt_id',
+    'flavor': 'ns uns',
+    'opts': '--ext-mount-map auto --enable-external-masters'}
diff --git a/CRIU_code/test/zdtm/static/mnt_ro_bind.c b/CRIU_code/test/zdtm/static/mnt_ro_bind.c
new file mode 100644
index 0000000..1d98814
--- /dev/null
+++ b/CRIU_code/test/zdtm/static/mnt_ro_bind.c
@@ -0,0 +1,90 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "zdtmtst.h"
+
+const char *test_doc = "Check read-only bind-mounts";
+const char *test_author = "Andrew Vagin ";
+
+char *dirname;
+TEST_OPTION(dirname, string, "directory name", 1);
+
+#define TEST_WORD "testtest"
+#define TEST_WORD2 "TESTTEST"
+
+/*
+ * Mount a tmpfs on <dirname>/rw, bind it to <dirname>/ro and remount the
+ * bind read-only.  After restore, creating a file through the ro path must
+ * fail with EROFS while creating it through the rw path must succeed.
+ */
+int main(int argc, char **argv)
+{
+	int fd, ret = 1;
+	char rw_path[PATH_MAX], ro_path[PATH_MAX], rw_f[PATH_MAX], ro_f[PATH_MAX];
+
+	test_init(argc, argv);
+
+	snprintf(rw_path, sizeof(rw_path), "%s/rw", dirname);
+	snprintf(ro_path, sizeof(ro_path), "%s/ro", dirname);
+	snprintf(rw_f, sizeof(rw_f), "%s/rw/test", dirname);
+	snprintf(ro_f, sizeof(ro_f), "%s/ro/test", dirname);
+
+	mkdir(dirname, 0700);
+	if (mount("none", dirname, "tmpfs", 0, "") < 0) {
+		fail("Can't mount tmpfs");
+		return 1;
+	}
+	mkdir(rw_path, 0700);
+	mkdir(ro_path, 0700);
+
+	if (mount("zdtm_rw", rw_path, "tmpfs", 0, "") < 0) {
+		fail("Can't mount tmpfs");
+		return 1;
+	}
+
+	if (mount(rw_path, ro_path, NULL, MS_BIND, NULL) < 0) {
+		fail("Can't bind-mount %s to %s", rw_path, ro_path);
+		return 1;
+	}
+
+	if (mount(NULL, ro_path, NULL, MS_BIND | MS_REMOUNT | MS_RDONLY, NULL) < 0) {
+		fail("Can't remount %s read-only", ro_path);
+		return 1;
+	}
+
+	test_daemon();
+	test_waitsig();
+
+	fd = open(ro_f, O_CREAT | O_WRONLY, 0666);
+	if (fd != -1 || errno != EROFS) {
+		fail("%s is created", ro_f);
+		goto err;
+	}
+
+	fd = open(rw_f, O_CREAT | O_WRONLY, 0666);
+	if (fd < 0) {
+		fail("Unable to create %s", rw_f);
+		goto err;
+	}
+	close(fd);
+
+	/* the file created through the rw path must be visible through ro */
+	fd = open(ro_f, O_RDONLY);
+	if (fd < 0) {
+		fail("Unable to open %s", ro_f);
+		goto err;
+	}
+	close(fd);
+
+	pass();
+	ret = 0;
+err:
+	umount2(dirname, MNT_DETACH);
+	rmdir(dirname);
+	return ret;
+}
diff --git a/CRIU_code/test/zdtm/static/mnt_ro_bind.desc b/CRIU_code/test/zdtm/static/mnt_ro_bind.desc
new file mode 100644
index 0000000..7657ba4
--- /dev/null
+++ b/CRIU_code/test/zdtm/static/mnt_ro_bind.desc
@@ -0,0 +1 @@
+{'flavor': 'ns uns', 'flags': 'suid'}
diff --git a/CRIU_code/test/zdtm/static/mnt_tracefs.c b/CRIU_code/test/zdtm/static/mnt_tracefs.c
new file mode 100644
index 0000000..99d6087
--- /dev/null
+++ b/CRIU_code/test/zdtm/static/mnt_tracefs.c
@@ -0,0 +1,72 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "zdtmtst.h"
+
+const char *test_doc = "Test c/r of tracefs";
+const char *test_author = "Tycho Andersen ";
+
+char *dirname = "mnt_tracefs.test";
+TEST_OPTION(dirname, string, "directory name", 1);
+/* Bind debugfs, trigger the tracefs mount, check it is reachable after c/r. */
+int main(int argc, char ** argv)
+{
+	char dst[PATH_MAX];
+
+	if (!getenv("ZDTM_NEWNS") || strcmp(getenv("ZDTM_NEWNS"), "1")) /* unset: skip setup */
+		goto test;
+
+	if (unshare(CLONE_NEWNS)) {
+		pr_perror("unshare");
+		return 1;
+	}
+
+	sprintf(dst, "%s/%s", get_current_dir_name(), dirname);
+	if (mkdir(dst, 0755) < 0) {
+		pr_perror("mkdir");
+		return 1;
+	}
+
+	if (mount("/sys/kernel/debug", dst, NULL, MS_BIND | MS_REC, NULL)) {
+		pr_perror("mount"); /* report first: rmdir() may overwrite errno */
+		rmdir(dst);
+		return 1;
+	}
+
+	/* trigger the tracefs mount */
+	strcat(dst, "/tracing/README");
+	if (access(dst, F_OK) < 0) {
+		pr_perror("access");
+		umount2(dirname, MNT_DETACH); /* dst now names the README, not the mount point */
+		rmdir(dirname);
+		return 1;
+	}
+
+test:
+	test_init(argc, argv);
+
+	test_daemon();
+	test_waitsig();
+
+	sprintf(dst, "%s/%s/tracing/README", get_current_dir_name(), dirname);
+
+	/* EACCES is what we expect, since users can't actually /see/ this
+	 * filesystem, but CRIU needs to know how to remount it, so the restore
+	 * should succeed
+	 */
+	if (access(dst, F_OK) < 0 && errno != EACCES) {
+		fail("couldn't access tracefs at %s", dst);
+		return 1;
+	}
+
+	pass();
+	return 0;
+}
+
diff --git a/CRIU_code/test/zdtm/static/mnt_tracefs.checkskip b/CRIU_code/test/zdtm/static/mnt_tracefs.checkskip
new file mode 100644
index 0000000..0c20ec6
--- /dev/null
+++ b/CRIU_code/test/zdtm/static/mnt_tracefs.checkskip
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+# tracefs is automatically mounted under debugfs if the kernel has it, so we
+# just need to check for a file in the tracing directory.
+test -f /sys/kernel/debug/tracing/README || exit 1
diff --git a/CRIU_code/test/zdtm/static/mnt_tracefs.desc b/CRIU_code/test/zdtm/static/mnt_tracefs.desc
new file mode 100644
index 0000000..e90ea94
--- /dev/null
+++ b/CRIU_code/test/zdtm/static/mnt_tracefs.desc
@@ -0,0 +1,3 @@
+{   'feature': 'mnt_id',
+    'flavor': 'uns',
+    'opts': '--ext-mount-map auto --enable-external-masters'}
diff --git a/CRIU_code/test/zdtm/static/mnt_tracefs.hook b/CRIU_code/test/zdtm/static/mnt_tracefs.hook
new file mode 100644
index 0000000..d57df0a
--- /dev/null
+++ b/CRIU_code/test/zdtm/static/mnt_tracefs.hook
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+[ "$1" == "--clean" ] || exit 0
+
+rmdir zdtm/static/mnt_tracefs.test
diff --git a/CRIU_code/test/zdtm/static/mntns-deleted-dst b/CRIU_code/test/zdtm/static/mntns-deleted-dst
new file mode 100644
index 0000000..e69de29
diff --git a/CRIU_code/test/zdtm/static/mntns_deleted.c b/CRIU_code/test/zdtm/static/mntns_deleted.c
new file mode 100644
index 0000000..39f9d0b
--- /dev/null
+++ b/CRIU_code/test/zdtm/static/mntns_deleted.c
@@ -0,0 +1,111 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "zdtmtst.h"
+
+#ifndef CLONE_NEWNS
+#define CLONE_NEWNS 0x00020000
+#endif
+
+const char *test_doc = "Check the restore of deleted bindmounts";
+const char *test_author = "Cyrill Gorcunov ";
+
+char *dirname;
+TEST_OPTION(dirname, string, "directory name", 1);
+
+#define TEST_DIR_SRC "test-src"
+#define TEST_DIR_DST "test-dst"
+
+#define TEST_FILE_SRC "mntns-deleted-src"
+#define TEST_FILE_DST "mntns-deleted-dst"
+
+/*
+ * Bind-mount a directory and a regular file, delete both bind sources and
+ * then wait for the checkpoint/restore cycle with the sources gone.
+ */
+int main(int argc, char *argv[])
+{
+	char path_src[PATH_MAX], path_dst[PATH_MAX];
+	int fd1, fd2;
+
+	test_init(argc, argv);
+
+	if (mkdir(dirname, 0700)) {
+		pr_perror("mkdir %s", dirname);
+		exit(1);
+	}
+
+	if (mount("none", dirname, "tmpfs", MS_MGC_VAL, NULL)) {
+		pr_perror("mount %s", dirname);
+		return 1;
+	}
+
+	snprintf(path_src, sizeof(path_src), "%s/%s", dirname, TEST_DIR_SRC);
+	snprintf(path_dst, sizeof(path_dst), "%s/%s", dirname, TEST_DIR_DST);
+
+	rmdir(path_src);
+	rmdir(path_dst);
+
+	unlink(TEST_FILE_SRC);
+	unlink(TEST_FILE_DST);
+
+	if (mkdir(path_src, 0700) ||
+	    mkdir(path_dst, 0700)) {
+		pr_perror("mkdir");
+		return 1;
+	}
+
+	/*
+	 * The comparison has to sit outside the assignment.  Written as
+	 * "fd = open(...) < 0" the variable receives the comparison result,
+	 * so close() would hit descriptor 0 and the real one would stay open.
+	 */
+	if ((fd1 = open(TEST_FILE_SRC, O_WRONLY | O_CREAT | O_TRUNC, 0600)) < 0) {
+		pr_perror("touching %s", TEST_FILE_SRC);
+		return 1;
+	}
+	close(fd1);
+
+	if ((fd2 = open(TEST_FILE_DST, O_WRONLY | O_CREAT | O_TRUNC, 0600)) < 0) {
+		pr_perror("touching %s", TEST_FILE_DST);
+		return 1;
+	}
+	close(fd2);
+
+	if (mount(path_src, path_dst, NULL, MS_BIND | MS_MGC_VAL, NULL)) {
+		pr_perror("mount %s -> %s", path_src, path_dst);
+		return 1;
+	}
+
+	if (mount(TEST_FILE_SRC, TEST_FILE_DST, NULL, MS_BIND | MS_MGC_VAL, NULL)) {
+		pr_perror("mount %s -> %s", TEST_FILE_SRC, TEST_FILE_DST);
+		return 1;
+	}
+
+	if (rmdir(path_src)) {
+		pr_perror("rmdir %s", path_src);
+		return 1;
+	}
+
+	if (unlink(TEST_FILE_SRC)) {
+		pr_perror("unlink %s", TEST_FILE_SRC);
+		return 1;
+	}
+
+	test_daemon();
+	test_waitsig();
+
+	pass();
+	return 0;
+}
diff --git a/CRIU_code/test/zdtm/static/mntns_deleted.desc b/CRIU_code/test/zdtm/static/mntns_deleted.desc
new file mode 100644
index 0000000..a8849e0
--- /dev/null
+++ b/CRIU_code/test/zdtm/static/mntns_deleted.desc
@@ -0,0 +1 @@
+{'flavor': 'ns uns', 'flags': 'suid', 'feature': 'mnt_id'}
diff --git a/CRIU_code/test/zdtm/static/mntns_ghost.c b/CRIU_code/test/zdtm/static/mntns_ghost.c
new file mode 100644
index 0000000..e0d3157
--- /dev/null
+++ b/CRIU_code/test/zdtm/static/mntns_ghost.c
@@ -0,0 +1,115 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "zdtmtst.h"
+
+const char *test_doc = "Check ghost and link-remap files in a few mntns";
+const char *test_author = "Andrew Vagin ";
+
+char *dirname;
+TEST_OPTION(dirname, string, "directory name", 1);
+
+
+int
main(int argc, char **argv) +{ + task_waiter_t lock; + pid_t pid = -1; + int status = 1; + + test_init(argc, argv); + task_waiter_init(&lock); + + pid = fork(); + if (pid < 0) { + pr_perror("fork"); + return 1; + } + + if (pid == 0) { + int fd; + DIR *d; + struct dirent *de; + + if (unshare(CLONE_NEWNS)) { + pr_perror("unshare"); + return 1; + } + if (mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL)) { + pr_perror("mount"); + return 1; + } + + if (mkdir(dirname, 0600) < 0) { + pr_perror("mkdir"); + return 1; + } + + if (mount(dirname, dirname, NULL, MS_BIND, NULL)) { + pr_perror("mount"); + return 1; + } + if (chdir(dirname)) + return 1; + + fd = open("test.ghost", O_CREAT | O_WRONLY, 0600); + if (fd < 0) { + pr_perror("open"); + return 1; + } + + if (unlink("test.ghost")) { + pr_perror("unlink"); + return 1; + } + + task_waiter_complete(&lock, 1); + test_waitsig(); + + if (close(fd)) { + pr_perror("close"); + return 1; + } + d = opendir("."); + if (d == NULL) { + pr_perror("opendir"); + return 1; + } + while ((de = readdir(d)) != NULL) { + if (!strcmp(de->d_name, ".")) + continue; + if (!strcmp(de->d_name, "..")) + continue; + pr_err("%s\n", de->d_name); + } + closedir(d); + + return 0; + } + + task_waiter_wait4(&lock, 1); + test_daemon(); + test_waitsig(); + + + kill(pid, SIGTERM); + wait(&status); + if (status) { + fail("Test died"); + return 1; + } + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/mntns_ghost.desc b/CRIU_code/test/zdtm/static/mntns_ghost.desc new file mode 100644 index 0000000..8a1ef8a --- /dev/null +++ b/CRIU_code/test/zdtm/static/mntns_ghost.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid', 'feature': 'mnt_id', 'opts': '--link-remap'} diff --git a/CRIU_code/test/zdtm/static/mntns_ghost01.c b/CRIU_code/test/zdtm/static/mntns_ghost01.c new file mode 100644 index 0000000..665e1e1 --- /dev/null +++ b/CRIU_code/test/zdtm/static/mntns_ghost01.c @@ -0,0 +1,120 @@ +#include +#include +#include +#include +#include +#include 
+#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check ghost file is restored on readonly fs if it was ghost-remaped on writable bind"; +const char *test_author = "Pavel Tikhomirov "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + + +int main(int argc, char **argv) +{ + char ghost_path[PATH_MAX]; + task_waiter_t lock; + pid_t pid = -1; + int status = 1; + int pfd; + + test_init(argc, argv); + task_waiter_init(&lock); + + if (mkdir(dirname, 0600) < 0) { + pr_perror("mkdir"); + return 1; + } + + snprintf(ghost_path, PATH_MAX, "%s/test.ghost", dirname); + + pfd = open(ghost_path, O_CREAT | O_WRONLY, 0600); + if (pfd < 0) { + pr_perror("open"); + return 1; + } + close(pfd); + + pfd = open(ghost_path, O_RDONLY); + if (pfd < 0) { + pr_perror("open"); + return 1; + } + + pid = fork(); + if (pid < 0) { + pr_perror("fork"); + return 1; + } + + if (pid == 0) { + int fd; + + if (unshare(CLONE_NEWNS)) { + pr_perror("unshare"); + return 1; + } + + if (mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL)) { + pr_perror("mount"); + return 1; + } + + if (mount(dirname, dirname, NULL, MS_BIND, NULL)) { + pr_perror("mount"); + return 1; + } + + if (mount(NULL, dirname, NULL, MS_RDONLY|MS_REMOUNT|MS_BIND, NULL)) { + pr_perror("remount"); + return 1; + } + + fd = open(ghost_path, O_RDONLY); + if (fd < 0) { + pr_perror("open"); + return 1; + } + + task_waiter_complete(&lock, 1); + test_waitsig(); + + if (close(fd)) { + pr_perror("close"); + return 1; + } + + return 0; + } + + task_waiter_wait4(&lock, 1); + + if (unlink(ghost_path)) { + pr_perror("unlink"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (close(pfd)) { + pr_perror("close"); + return 1; + } + + kill(pid, SIGTERM); + wait(&status); + if (status) { + fail("Test died"); + return 1; + } + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/mntns_ghost01.desc b/CRIU_code/test/zdtm/static/mntns_ghost01.desc new file mode 100644 index 0000000..8a1ef8a --- 
/dev/null +++ b/CRIU_code/test/zdtm/static/mntns_ghost01.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid', 'feature': 'mnt_id', 'opts': '--link-remap'} diff --git a/CRIU_code/test/zdtm/static/mntns_link_ghost.c b/CRIU_code/test/zdtm/static/mntns_link_ghost.c new file mode 100644 index 0000000..0314c62 --- /dev/null +++ b/CRIU_code/test/zdtm/static/mntns_link_ghost.c @@ -0,0 +1 @@ +mntns_link_remap.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/mntns_link_ghost.desc b/CRIU_code/test/zdtm/static/mntns_link_ghost.desc new file mode 100644 index 0000000..a8849e0 --- /dev/null +++ b/CRIU_code/test/zdtm/static/mntns_link_ghost.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid', 'feature': 'mnt_id'} diff --git a/CRIU_code/test/zdtm/static/mntns_link_remap.c b/CRIU_code/test/zdtm/static/mntns_link_remap.c new file mode 100644 index 0000000..642641b --- /dev/null +++ b/CRIU_code/test/zdtm/static/mntns_link_remap.c @@ -0,0 +1,250 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +#ifndef CLONE_NEWNS +#define CLONE_NEWNS 0x00020000 +#endif + +const char *test_doc = "Check ghost and link-remap files in a few mntns"; +const char *test_author = "Andrew Vagin "; + +#define MPTS_FILE "F" +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +#define NS_STACK_SIZE 4096 +/* All arguments should be above stack, because it grows down */ +struct ns_exec_args { + char stack[NS_STACK_SIZE] __stack_aligned__; + char stack_ptr[0]; + int fd; + int sync; +}; + +#define AWK_OK 13 +#define AWK_FAIL 42 + +static int get_mntid(int fd) +{ + char str[256]; + int mnt_id = -1; + FILE *f; + + snprintf(str, sizeof(str), "/proc/self/fdinfo/%d", fd); + f = fopen(str, "r"); + if (!f) { + pr_perror("Can't open %s to parse", str); + return -1; + } + while (fgets(str, sizeof(str), f)) { + if (sscanf(str, "mnt_id: %d", &mnt_id) == 1) + break; + } + 
+ fclose(f); + return mnt_id; +} + +int ns_child(void *_arg) +{ + struct ns_exec_args *args = _arg; + int fd2; + int id1, id2; + struct stat st1, st2; + char lpath[PATH_MAX], fpath[PATH_MAX]; + + snprintf(fpath, sizeof(fpath), "%s/1", dirname); + if (umount(fpath)) { + pr_perror("umount"); + return 1; + } + + snprintf(lpath, sizeof(lpath), "%s/0/2", dirname); + snprintf(fpath, sizeof(fpath), "%s/2", dirname); + + if (mkdir(fpath, 0600) < 0) { + fail("Can't make zdtm_sys"); + return 1; + } + + if (mount(lpath, fpath, NULL, MS_BIND, NULL)) { + pr_perror("mount"); + return 1; + } + + snprintf(fpath, sizeof(fpath), "%s/0", dirname); + if (umount(fpath)) { + pr_perror("umount"); + return 1; + } + + snprintf(fpath, sizeof(fpath), "%s/2/%s", dirname, MPTS_FILE); + fd2 = open(fpath, O_RDWR); + if (fd2 < 0) { + pr_perror("open"); + return -1; + } + close(args->sync); + test_waitsig(); + + id1 = get_mntid(args->fd); + id2 = get_mntid(fd2); + if (id1 <0 || id2 < 0) + exit(1); + + if (fstat(args->fd, &st1) || fstat(fd2, &st2)) { + pr_perror("stat"); + exit(1); + } + + test_msg("%d %d", id1, id2); + +#ifdef ZDTM_LINK_REMAP + if (st1.st_nlink != 1) { +#else + if (st1.st_nlink != 0) { +#endif + pr_perror("Wrong number of links: %lu", + (long unsigned)st1.st_nlink); + exit(1); + } + + if (id1 > 0 && id1 != id2 && st1.st_ino == st2.st_ino) + exit(AWK_OK); + else + exit(AWK_FAIL); +} + +int main(int argc, char **argv) +{ + struct ns_exec_args args; + pid_t pid = -1; + char lpath[PATH_MAX], fpath[PATH_MAX]; + char buf[256]; + int p[2]; + + test_init(argc, argv); + + if (mkdir(dirname, 0600) < 0) { + fail("Can't make zdtm_sys"); + return 1; + } + + if (mount("test", dirname, "tmpfs", 0, NULL)) { + pr_perror("mount"); + return 1; + } + + snprintf(fpath, sizeof(fpath), "%s/0", dirname); + if (mkdir(fpath, 0600) < 0) { + fail("Can't make zdtm_sys"); + return 1; + } + if (mount("test", fpath, "tmpfs", 0, NULL)) { + pr_perror("mount"); + return 1; + } + + snprintf(lpath, sizeof(lpath), 
"%s/0/1", dirname); + if (mkdir(lpath, 0600) < 0) { + fail("Can't make zdtm_sys"); + return 1; + } + snprintf(fpath, sizeof(fpath), "%s/1", dirname); + if (mkdir(fpath, 0600) < 0) { + fail("Can't make zdtm_sys"); + return 1; + } + if (mount(lpath, fpath, NULL, MS_BIND, NULL)) { + pr_perror("mount"); + return 1; + } + snprintf(lpath, sizeof(lpath), "%s/0/2", dirname); + if (mkdir(lpath, 0600) < 0) { + fail("Can't make zdtm_sys"); + return 1; + } + + if (pipe(p) == -1) { + pr_perror("pipe"); + return 1; + } + + if (getenv("ZDTM_NOSUBNS") == NULL) { + snprintf(fpath, sizeof(fpath), "%s/1/%s", dirname, MPTS_FILE); + + args.fd = open(fpath, O_CREAT | O_RDWR, 0600); + if (args.fd < 0) { + fail("Can't open file"); + return 1; + } + snprintf(fpath, sizeof(fpath), "%s/0/1/%s", dirname, MPTS_FILE); + snprintf(lpath, sizeof(fpath), "%s/0/2/%s", dirname, MPTS_FILE); + if (link(fpath, lpath) == -1) { + pr_perror("link"); + return -1; + } +#ifdef ZDTM_LINK_REMAP + snprintf(lpath, sizeof(fpath), "%s/0/%s", dirname, MPTS_FILE); + if (link(fpath, lpath) == -1) { + pr_perror("link"); + return -1; + } +#endif + args.sync = p[1]; + + pid = clone(ns_child, args.stack_ptr, CLONE_NEWNS | SIGCHLD, &args); + if (pid < 0) { + pr_perror("Unable to fork child"); + return 1; + } + + close(args.fd); + } + + close(p[1]); + read(p[0], buf, sizeof(buf)); + + snprintf(fpath, sizeof(fpath), "%s/0/1/%s", dirname, MPTS_FILE); + if (unlink(fpath)) + return 1; + snprintf(fpath, sizeof(fpath), "%s/0/2/%s", dirname, MPTS_FILE); + if (unlink(fpath)) + return 1; + + test_daemon(); + test_waitsig(); + + + if (pid > 0) { + kill(pid, SIGTERM); + int status = 1; + wait(&status); + if (WIFEXITED(status)) { + if (WEXITSTATUS(status) == AWK_OK) + pass(); + else if (WEXITSTATUS(status) == AWK_FAIL) + fail("Mount ID not restored"); + else + fail("Failed to check mount IDs (%d)", WEXITSTATUS(status)); + } else + fail("Test died"); + } + + umount2(dirname, MNT_DETACH); + rmdir(dirname); + return 0; +} diff --git 
a/CRIU_code/test/zdtm/static/mntns_link_remap.desc b/CRIU_code/test/zdtm/static/mntns_link_remap.desc new file mode 100644 index 0000000..be1fbbd --- /dev/null +++ b/CRIU_code/test/zdtm/static/mntns_link_remap.desc @@ -0,0 +1 @@ +{'flavor': 'uns ns', 'flags': 'suid', 'feature': 'mnt_id', 'opts': '--link-remap'} diff --git a/CRIU_code/test/zdtm/static/mntns_open.c b/CRIU_code/test/zdtm/static/mntns_open.c new file mode 100644 index 0000000..e19c4ea --- /dev/null +++ b/CRIU_code/test/zdtm/static/mntns_open.c @@ -0,0 +1,139 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +#ifndef CLONE_NEWNS +#define CLONE_NEWNS 0x00020000 +#endif + +const char *test_doc = "Check that mnt_id is repsected"; +const char *test_author = "Pavel Emelianov "; + +#define MPTS_FILE "F" +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); +char fpath[PATH_MAX]; + +#define NS_STACK_SIZE 4096 +/* All arguments should be above stack, because it grows down */ +struct ns_exec_args { + char stack[NS_STACK_SIZE] __stack_aligned__; + char stack_ptr[0]; + int fd; +}; + +#define AWK_OK 13 +#define AWK_FAIL 42 + +static int get_mntid(int fd) +{ + char str[256]; + int mnt_id = -1; + FILE *f; + + snprintf(str, sizeof(str), "/proc/self/fdinfo/%d", fd); + f = fopen(str, "r"); + if (!f) { + pr_perror("Can't open %s to parse", str); + return -1; + } + while (fgets(str, sizeof(str), f)) { + if (sscanf(str, "mnt_id: %d", &mnt_id) == 1) + break; + } + + fclose(f); + return mnt_id; +} + +task_waiter_t t; + +int ns_child(void *_arg) +{ + struct ns_exec_args *args = _arg; + int fd2; + int id1, id2; + + fd2 = open(fpath, O_RDWR); + task_waiter_complete(&t, 1); + test_waitsig(); + + id1 = get_mntid(args->fd); + id2 = get_mntid(fd2); + + test_msg("%d %d", id1, id2); + + if (id1 <0 || id2 < 0) + exit(1); + if (id1 > 0 && id1 != id2) + exit(AWK_OK); + else + exit(AWK_FAIL); +} + +int main(int argc, char 
**argv) +{ + struct ns_exec_args args; + pid_t pid = -1; + + test_init(argc, argv); + + task_waiter_init(&t); + + snprintf(fpath, sizeof(fpath), "%s/%s", dirname, MPTS_FILE); + if (mkdir(dirname, 0600) < 0) { + fail("Can't make zdtm_sys"); + return 1; + } + + if (getenv("ZDTM_NOSUBNS") == NULL) { + args.fd = open(fpath, O_CREAT | O_RDWR, 0600); + if (args.fd < 0) { + fail("Can't open file"); + return 1; + } + + pid = clone(ns_child, args.stack_ptr, CLONE_NEWNS | SIGCHLD, &args); + if (pid < 0) { + pr_perror("Unable to fork child"); + return 1; + } + + close(args.fd); + } + + task_waiter_wait4(&t, 1); + + test_daemon(); + test_waitsig(); + + if (pid > 0) { + kill(pid, SIGTERM); + int status = 1; + wait(&status); + if (WIFEXITED(status)) { + if (WEXITSTATUS(status) == AWK_OK) + pass(); + else if (WEXITSTATUS(status) == AWK_FAIL) + fail("Mount ID not restored"); + else + fail("Failed to check mount IDs (%d)", WEXITSTATUS(status)); + } else + fail("Test died"); + } + + unlink(fpath); + rmdir(dirname); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/mntns_open.desc b/CRIU_code/test/zdtm/static/mntns_open.desc new file mode 100644 index 0000000..a8849e0 --- /dev/null +++ b/CRIU_code/test/zdtm/static/mntns_open.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid', 'feature': 'mnt_id'} diff --git a/CRIU_code/test/zdtm/static/mntns_overmount.c b/CRIU_code/test/zdtm/static/mntns_overmount.c new file mode 100644 index 0000000..ba23afc --- /dev/null +++ b/CRIU_code/test/zdtm/static/mntns_overmount.c @@ -0,0 +1,69 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check two mounts in the same directory"; +const char *test_author = "Andrew Vagin "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + + +int main(int argc, char **argv) +{ + char d1[PATH_MAX], d2[PATH_MAX], f1[PATH_MAX], f2[PATH_MAX]; + struct stat st; + + test_init(argc, argv); + + 
snprintf(d1, sizeof(d1), "%s/1/", dirname); + snprintf(d2, sizeof(d2), "%s/2/", dirname); + + if (mkdir(dirname, 0700) || + mkdir(d1, 0777) || + mkdir(d2, 0700)) { + pr_perror("mkdir"); + return 1; + } + + if (mount("zdtm_d1", d1, "sysfs", 0, NULL) || + mount(NULL, d1, NULL, MS_SHARED, NULL) || + mount(d1, d2, NULL, MS_BIND, NULL) || + mount(NULL, d2, NULL, MS_SLAVE, NULL)) { + pr_perror("mount"); + return 1; + } + + ssprintf(f1, "%s/devices", d1); + ssprintf(f2, "%s/devices", d2); + + if (mount("zdtm_d1", d1, "tmpfs", 0, NULL)) { + pr_perror("mount"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (umount(d1)) { + pr_perror("umount"); + return 1; + } + + if (stat(f1, &st) || stat(f2, &st)) { + pr_perror("stat"); + return 1; + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/mntns_overmount.desc b/CRIU_code/test/zdtm/static/mntns_overmount.desc new file mode 100644 index 0000000..3fd8e03 --- /dev/null +++ b/CRIU_code/test/zdtm/static/mntns_overmount.desc @@ -0,0 +1 @@ +{'flavor': 'ns', 'flags': 'suid', 'feature': 'mnt_id'} diff --git a/CRIU_code/test/zdtm/static/mntns_remap.c b/CRIU_code/test/zdtm/static/mntns_remap.c new file mode 100644 index 0000000..fd62cbf --- /dev/null +++ b/CRIU_code/test/zdtm/static/mntns_remap.c @@ -0,0 +1,100 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check a case when one mount overmount another one"; +const char *test_author = "Andrew Vagin "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + task_waiter_t t; + pid_t pid; + + test_init(argc, argv); + + mkdir(dirname, 0755); + if (mount("zdtm", dirname, "tmpfs", 0, NULL)) { + pr_perror("mount"); + return 1; + } + if (chdir(dirname)) { + pr_perror("chdir"); + return 1; + } + mkdir("1", 0755); + mkdir("2", 0755); + if (mount("1", "1", NULL, MS_BIND, NULL)) { + 
pr_perror("mount"); + return 1; + } + if (mount(NULL, "1", NULL, MS_PRIVATE, NULL)) { + pr_perror("mount"); + return 1; + } + if (mount("zdtm", "2", "tmpfs", 0, NULL)) { + pr_perror("mount"); + return 1; + } + mkdir("1/a", 0755); + mkdir("2/a", 0755); + if (mount("1/a", "1/a", NULL, MS_BIND, NULL)) { + pr_perror("mount"); + return 1; + } + if (mount(NULL, "1/a", NULL, MS_SHARED, NULL)) { + pr_perror("mount"); + return 1; + } + if (mount("1/a", "2/a", NULL, MS_BIND, NULL)) { + pr_perror("mount"); + return 1; + } + mkdir("1/a/c", 0755); + if (mount("zdtm", "1/a/c", "tmpfs", 0, NULL)) { + pr_perror("mount"); + return 1; + } + if (mount("2", "1", NULL, MS_MOVE, NULL)) { + pr_perror("mount"); + return 1; + } + + task_waiter_init(&t); + + pid = fork(); + if (pid < 0) + return -1; + + if (pid == 0) { + if (unshare(CLONE_NEWNS)) + return 1; + task_waiter_complete_current(&t); + test_waitsig(); + return 0; + } + + task_waiter_wait4(&t, pid); + test_daemon(); + test_waitsig(); + + kill(pid, SIGTERM); + wait(NULL); + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/mntns_remap.desc b/CRIU_code/test/zdtm/static/mntns_remap.desc new file mode 100644 index 0000000..a8849e0 --- /dev/null +++ b/CRIU_code/test/zdtm/static/mntns_remap.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid', 'feature': 'mnt_id'} diff --git a/CRIU_code/test/zdtm/static/mntns_ro_root.c b/CRIU_code/test/zdtm/static/mntns_ro_root.c new file mode 100644 index 0000000..8a79710 --- /dev/null +++ b/CRIU_code/test/zdtm/static/mntns_ro_root.c @@ -0,0 +1,69 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check a case when a root is read-only for a sub-mntns"; +const char *test_author = "Andrew Vagin "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + + +int main(int argc, char **argv) +{ + task_waiter_t lock; + pid_t pid = -1; + int 
status = 1; + + test_init(argc, argv); + + task_waiter_init(&lock); + + pid = fork(); + if (pid < 0) { + pr_perror("fork"); + return 1; + } + + if (pid == 0) { + if (unshare(CLONE_NEWNS)) { + pr_perror("unshare"); + return 1; + } + if (mount(NULL, "/", NULL, MS_REMOUNT | MS_RDONLY | MS_BIND, NULL)) { + pr_perror("mount"); + return 1; + } + + task_waiter_complete(&lock, 1); + test_waitsig(); + + return 0; + } + + task_waiter_wait4(&lock, 1); + test_daemon(); + test_waitsig(); + + kill(pid, SIGTERM); + wait(&status); + if (status) { + fail("Test died"); + return 1; + } + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/mntns_ro_root.desc b/CRIU_code/test/zdtm/static/mntns_ro_root.desc new file mode 100644 index 0000000..a8849e0 --- /dev/null +++ b/CRIU_code/test/zdtm/static/mntns_ro_root.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid', 'feature': 'mnt_id'} diff --git a/CRIU_code/test/zdtm/static/mntns_root_bind.c b/CRIU_code/test/zdtm/static/mntns_root_bind.c new file mode 100644 index 0000000..86c44e1 --- /dev/null +++ b/CRIU_code/test/zdtm/static/mntns_root_bind.c @@ -0,0 +1,125 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +#ifndef CLONE_NEWNS +#define CLONE_NEWNS 0x00020000 +#endif + +const char *test_doc = "Check bind-mouns of the root mount"; +const char *test_author = "Andrew Vagin "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + + +int main(int argc, char **argv) +{ + char subdir1[PATH_MAX], path[PATH_MAX], bpath[PATH_MAX], spath[PATH_MAX], bspath[PATH_MAX]; + char subdir2[PATH_MAX], bsubdir2[PATH_MAX]; + pid_t pid; + int status; + task_waiter_t t; + + test_init(argc, argv); + + task_waiter_init(&t); + + mount(NULL, "/", NULL, MS_SHARED, NULL); + + ssprintf(subdir1, "%s/subdir1", dirname); + ssprintf(path, "%s/test", subdir1); + ssprintf(bpath, "%s/test.bind", subdir1); + ssprintf(spath, "%s/test/sub", 
subdir1); + ssprintf(bspath, "%s/test.bind/sub", subdir1); + + ssprintf(subdir2, "%s/subdir2", dirname); + ssprintf(bsubdir2, "%s/bsubdir2", dirname); + + if (mkdir(dirname, 0700) || + mkdir(subdir1, 0777) || + mkdir(subdir2, 0777) || + mkdir(bsubdir2, 0777) || + mkdir(path, 0700) || + mkdir(spath, 0700) || + mkdir(bpath, 0700)) { + pr_perror("mkdir"); + return 1; + } + + pid = fork(); + if (pid < 0) { + pr_perror("fork"); + return 1; + } + if (pid == 0) { + if (unshare(CLONE_NEWNS)) { + pr_perror("unshare"); + return 1; + } + if (mount(path, bpath, NULL, MS_BIND, NULL)) { + pr_perror("mount"); + return 1; + } + + task_waiter_complete(&t, 1); + task_waiter_wait4(&t, 2); + + if (access(bspath, F_OK)) { + fail("%s isn't accessiable", bspath); + return 1; + } + + + if (umount2(bpath, MNT_DETACH)) { + fail("umount"); + return 1; + } + + return 0; + } + + task_waiter_wait4(&t, 1); + + if (mount("test", spath, "tmpfs", 0, NULL)) { + pr_perror("mount"); + return 1; + } + +#ifdef ROOT_BIND02 + if (mount(subdir2, bsubdir2, NULL, MS_BIND, NULL)) { + pr_perror("Unable to mount %s to %s", subdir2, bsubdir2); + return 1; + } +#endif + + test_daemon(); + test_waitsig(); + + task_waiter_complete(&t, 2); + + if (waitpid(pid, &status, 0) != pid) { + pr_perror("waitpid %d", pid); + return 1; + } + + if (status) { + pr_perror("%d/%d/%d/%d", WIFEXITED(status), WEXITSTATUS(status), WIFSIGNALED(status), WTERMSIG(status)); + return 1; + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/mntns_root_bind.desc b/CRIU_code/test/zdtm/static/mntns_root_bind.desc new file mode 100644 index 0000000..a8849e0 --- /dev/null +++ b/CRIU_code/test/zdtm/static/mntns_root_bind.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid', 'feature': 'mnt_id'} diff --git a/CRIU_code/test/zdtm/static/mntns_root_bind02.c b/CRIU_code/test/zdtm/static/mntns_root_bind02.c new file mode 100644 index 0000000..4957c1f --- /dev/null +++ b/CRIU_code/test/zdtm/static/mntns_root_bind02.c @@ -0,0 +1 @@ 
/*
 * Test read-only bind mounts.
 *
 * /proc/sys and /proc/sys/net are each bind-mounted onto themselves, then
 * only the /proc/sys bind mount is remounted read-only.  After
 * checkpoint/restore the test expects a file below /proc/sys/net to still
 * be writable, and a file elsewhere below /proc/sys to be rejected with
 * EROFS.
 *
 * Returns 0 on success, 1 on any failure.
 */
int main(int argc, char **argv)
{
	test_init(argc, argv);

	if (mount("/proc/sys/", "/proc/sys", NULL, MS_BIND, NULL)) {
		pr_perror("Unable to bind-mount /proc/sys");
		return 1;
	}
	if (mount("/proc/sys/net", "/proc/sys/net", NULL, MS_BIND, NULL)) {
		pr_perror("Unable to bind-mount /proc/sys/net");
		return 1;
	}
	/* Only the outer bind mount is flipped to read-only. */
	if (mount("/proc/sys/", "/proc/sys", NULL, MS_RDONLY | MS_BIND | MS_REMOUNT, NULL)) {
		pr_perror("Unable to remount /proc/sys");
		return 1;
	}

	test_daemon();
	test_waitsig();

	/* The message now names the file that is actually probed. */
	if (access("/proc/sys/net/ipv4/ip_forward", W_OK)) {
		fail("/proc/sys/net/ipv4/ip_forward is not writable");
		return 1;
	}

	/* Write access must be refused, and specifically with EROFS. */
	if (access("/proc/sys/kernel/ns_last_pid", W_OK) != -1 || errno != EROFS) {
		fail("/proc/sys/kernel/ns_last_pid is not read-only (EROFS expected)");
		return 1;
	}

	pass();

	return 0;
}
/dev/null +++ b/CRIU_code/test/zdtm/static/mntns_shared_bind.c @@ -0,0 +1,130 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +#ifndef CLONE_NEWNS +#define CLONE_NEWNS 0x00020000 +#endif + +const char *test_doc = "Check shared non-root bind-mounts"; +const char *test_author = "Andrew Vagin "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + + +int main(int argc, char **argv) +{ + char path[PATH_MAX], bpath[PATH_MAX], spath[PATH_MAX]; + pid_t pid; + int status; + task_waiter_t t; + + test_init(argc, argv); + + task_waiter_init(&t); + + snprintf(path, sizeof(path), "%s/test", dirname); + snprintf(bpath, sizeof(bpath), "%s/test.bind", dirname); + snprintf(spath, sizeof(spath), "%s/test/sub", dirname); + if (mkdir(dirname, 0700)) { + pr_perror("mkdir"); + return 1; + } + + if (mount(NULL, "/", NULL, MS_SHARED, NULL)) { + pr_perror("mount"); + return 1; + } + +#ifdef SHARED_BIND02 + /* */ + if (mount(dirname, dirname, "tmpfs", 0, NULL) || + mount(NULL, dirname, NULL, MS_SHARED, NULL)) { + pr_perror("mount"); + return 1; + } +#endif + + if (mkdir(path, 0700) || + mkdir(spath, 0700) || + mkdir(bpath, 0700)) { + pr_perror("mkdir"); + return 1; + } + + pid = fork(); + if (pid < 0) { + pr_perror("fork"); + return 1; + } + if (pid == 0) { + if (unshare(CLONE_NEWNS)) { + pr_perror("unshare"); + return 1; + } + if (mount(path, bpath, NULL, MS_BIND, NULL)) { + pr_perror("mount"); + return 1; + } + + task_waiter_complete(&t, 1); + task_waiter_wait4(&t, 2); + if (umount(spath)) { + task_waiter_complete(&t, 2); + fail("umount"); + return 1; + } + task_waiter_complete(&t, 3); + task_waiter_wait4(&t, 4); + + return 0; + } + + task_waiter_wait4(&t, 1); + + if (mount("test", spath, "tmpfs", 0, NULL)) { + pr_perror("mount"); + return 1; + } + + + test_daemon(); + test_waitsig(); + + task_waiter_complete(&t, 2); + task_waiter_wait4(&t, 3); + + if (umount(bpath)) { + 
task_waiter_complete(&t, 2); + fail("umount"); + return 1; + } + + task_waiter_complete(&t, 4); + + if (waitpid(pid, &status, 0) != pid) { + pr_perror("waitpid %d", pid); + return 1; + } + + if (status) { + pr_perror("%d/%d/%d/%d", WIFEXITED(status), WEXITSTATUS(status), WIFSIGNALED(status), WTERMSIG(status)); + return 1; + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/mntns_shared_bind.desc b/CRIU_code/test/zdtm/static/mntns_shared_bind.desc new file mode 100644 index 0000000..a8849e0 --- /dev/null +++ b/CRIU_code/test/zdtm/static/mntns_shared_bind.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid', 'feature': 'mnt_id'} diff --git a/CRIU_code/test/zdtm/static/mntns_shared_bind02.c b/CRIU_code/test/zdtm/static/mntns_shared_bind02.c new file mode 100644 index 0000000..5efca67 --- /dev/null +++ b/CRIU_code/test/zdtm/static/mntns_shared_bind02.c @@ -0,0 +1 @@ +mntns_shared_bind.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/mntns_shared_bind02.desc b/CRIU_code/test/zdtm/static/mntns_shared_bind02.desc new file mode 100644 index 0000000..a8849e0 --- /dev/null +++ b/CRIU_code/test/zdtm/static/mntns_shared_bind02.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid', 'feature': 'mnt_id'} diff --git a/CRIU_code/test/zdtm/static/mntns_shared_bind03.c b/CRIU_code/test/zdtm/static/mntns_shared_bind03.c new file mode 100644 index 0000000..e75fe62 --- /dev/null +++ b/CRIU_code/test/zdtm/static/mntns_shared_bind03.c @@ -0,0 +1,123 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +#ifndef CLONE_NEWNS +#define CLONE_NEWNS 0x00020000 +#endif + +const char *test_doc = "Check shared non-root bind-mounts with different shared groups"; +const char *test_author = "Andrew Vagin "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + + +int main(int argc, char **argv) +{ + test_init(argc, argv); + + if 
(mkdir(dirname, 0700)) { + pr_perror("mkdir"); + return 1; + } + + if (chdir(dirname)) + return 1; + + if (mkdir("1", 0700) || mkdir("2", 0700) || mkdir("3", 0700)) { + pr_perror("mkdir"); + return 1; + } + + if (mkdir("A", 0700)) { + pr_perror("mkdir"); + return 1; + } + + if (mkdir("B", 0700)) { + pr_perror("mkdir"); + return 1; + } + + if (mount("1", "1", NULL, MS_BIND, NULL) || + mount(NULL, "1", NULL, MS_PRIVATE, NULL) || + mount(NULL, "1", NULL, MS_SHARED, NULL)) { + pr_perror("mount"); + return 1; + } + + if (mount("1", "A", NULL, MS_BIND, NULL) || + mount(NULL, "A", NULL, MS_PRIVATE, NULL) || + mount(NULL, "A", NULL, MS_SHARED, NULL)) { + pr_perror("mount"); + return 1; + } + + if (mount("1", "B", NULL, MS_BIND, NULL) || + mount(NULL, "B", NULL, MS_SLAVE, NULL)) { + pr_perror("mount"); + return 1; + } + + if (mkdir("1/D", 0700)) { + pr_perror("mkdir"); + return 1; + } + + if (mount("1/D", "2", NULL, MS_BIND, NULL)) { + pr_perror("mount"); + return 1; + } + + if (mount("1", "3", NULL, MS_BIND, NULL)) { + pr_perror("mount"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (mkdir("1/D/test", 0700)) { + pr_perror("mkdir"); + return 1; + } + + if (mount("zdtm_shared", "1/D/test", "tmpfs", 0, NULL)) { + pr_perror("mount"); + return 1; + } + + if (mount(NULL, "3", NULL, MS_PRIVATE, NULL)) { + pr_perror("mount"); + return 1; + } + + if (umount("B/D/test")) { + pr_perror("umount"); + return 1; + } + if (umount("2/test")) { + pr_perror("umount"); + return 1; + } + if (umount("3/D/test")) { + pr_perror("umount"); + return 1; + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/mntns_shared_bind03.desc b/CRIU_code/test/zdtm/static/mntns_shared_bind03.desc new file mode 100644 index 0000000..a8849e0 --- /dev/null +++ b/CRIU_code/test/zdtm/static/mntns_shared_bind03.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid', 'feature': 'mnt_id'} diff --git a/CRIU_code/test/zdtm/static/mntns_shared_vs_private.c 
b/CRIU_code/test/zdtm/static/mntns_shared_vs_private.c new file mode 100644 index 0000000..5849bbb --- /dev/null +++ b/CRIU_code/test/zdtm/static/mntns_shared_vs_private.c @@ -0,0 +1,117 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +#ifndef CLONE_NEWNS +#define CLONE_NEWNS 0x00020000 +#endif + +const char *test_doc = "Check a private mount in a shared mount"; +const char *test_author = "Andrew Vagin "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + + +int main(int argc, char **argv) +{ + char path[PATH_MAX]; + pid_t pid; + int status, i; + task_waiter_t t; + + test_init(argc, argv); + + task_waiter_init(&t); + + snprintf(path, sizeof(path), "%s/fs", dirname); + if (mkdir(dirname, 0700)) { + pr_perror("mkdir"); + return 1; + } + + if (mount(NULL, "/", NULL, MS_SHARED, NULL)) { + pr_perror("mount"); + return 1; + } + + if (mount("zdtm_fs", dirname, "tmpfs", 0, NULL)) { + pr_perror("mount"); + return 1; + } + + if (mount(NULL, dirname, NULL, MS_PRIVATE, NULL)) { + pr_perror("mount"); + return 1; + } + + if (mkdir(path, 0700)) { + pr_perror("mkdir"); + return 1; + } + + if (mount("zdtm_fs", path, "tmpfs", 0, NULL)) { + pr_perror("mount"); + return 1; + } + + for (i = 0; i < 2; i++) { + pid = fork(); + if (pid < 0) { + pr_perror("fork"); + return 1; + } + if (pid == 0) { + unshare(CLONE_NEWNS); + + task_waiter_complete(&t, 1); + task_waiter_wait4(&t, 2); + + return 0; + } + } + + for (i = 0; i < 2; i++) + task_waiter_wait4(&t, 1); + + test_daemon(); + test_waitsig(); + + if (umount(path)) { + pr_perror("Unable to umount %s", path); + return 1; + } + if (umount(dirname)) { + pr_perror("Unable to umount %s", dirname); + return 1; + } + + for (i = 0; i < 2; i++) { + task_waiter_complete(&t, 2); + + if (waitpid(-1, &status, 0) < 0) { + pr_perror("waitpid %d", pid); + return 1; + } + + if (status) { + pr_err("%d/%d/%d/%d\n", WIFEXITED(status), 
WEXITSTATUS(status), WIFSIGNALED(status), WTERMSIG(status)); + return 1; + } + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/mntns_shared_vs_private.desc b/CRIU_code/test/zdtm/static/mntns_shared_vs_private.desc new file mode 100644 index 0000000..a8849e0 --- /dev/null +++ b/CRIU_code/test/zdtm/static/mntns_shared_vs_private.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid', 'feature': 'mnt_id'} diff --git a/CRIU_code/test/zdtm/static/mount_paths.c b/CRIU_code/test/zdtm/static/mount_paths.c new file mode 100644 index 0000000..2377e9c --- /dev/null +++ b/CRIU_code/test/zdtm/static/mount_paths.c @@ -0,0 +1,57 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that special characters in paths are handled correctly"; +const char *test_author = "Andrew Vagin "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +#define TEST_DIR "tmpfs \t \t\\\\\t test \t\t\\\\ \t\\" + +int main(int argc, char **argv) +{ + int ret = 1; + char test_dir[PATH_MAX], fname[PATH_MAX]; + + test_init(argc, argv); + + mkdir(dirname, 0700); + + ssprintf(test_dir, "%s/%s", dirname, TEST_DIR); + mkdir(test_dir, 0700); + + if (mount("", test_dir, "tmpfs", 0, NULL)) { + pr_perror("mount"); + return 1; + } + + ssprintf(fname, "%s/\\\t \\\\ \\tt", test_dir); + if (mkdir(fname, 0700)) { + pr_perror("mkdir"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (access(fname, F_OK)) { + fail(); + goto err; + } + + pass(); + ret = 0; +err: + umount2(test_dir, MNT_DETACH); + rmdir(test_dir); + rmdir(dirname); + return ret; +} diff --git a/CRIU_code/test/zdtm/static/mount_paths.desc b/CRIU_code/test/zdtm/static/mount_paths.desc new file mode 100644 index 0000000..7657ba4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/mount_paths.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/mountpoints.c 
b/CRIU_code/test/zdtm/static/mountpoints.c new file mode 100644 index 0000000..00475cd --- /dev/null +++ b/CRIU_code/test/zdtm/static/mountpoints.c @@ -0,0 +1,304 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that mountpoints (in mount namespace) are supported"; +const char *test_author = "Pavel Emelianov "; + +#define MPTS_ROOT "/zdtm_mpts/" + +#define NS_STACK_SIZE 4096 +/* All arguments should be above stack, because it grows down */ +struct ns_exec_args { + char stack[NS_STACK_SIZE] __stack_aligned__; + char stack_ptr[0]; + int status_pipe[2]; +}; + +task_waiter_t t; + +int ns_child(void *_arg) +{ + struct stat st; + pid_t pid; + int fd, ufd; + + mkdir(MPTS_ROOT"/dev/mntns2", 0600); + if (mount("none", MPTS_ROOT"/dev/mntns2", "tmpfs", 0, "") < 0) { + fail("Can't mount tmpfs"); + return 1; + } + + mkdir(MPTS_ROOT"/dev/mntns2/test", 0600); + + fd = open(MPTS_ROOT"/dev/mntns2/test/test.file", O_WRONLY | O_CREAT, 0666); + if (fd < 0) + return 1; + + ufd = open(MPTS_ROOT"/dev/mntns2/test/test.file.unlinked", O_WRONLY | O_CREAT, 0666); + if (ufd < 0) + return 1; + unlink(MPTS_ROOT"/dev/mntns2/test/test.file.unlinked"); + + pid = fork(); + + task_waiter_complete(&t, 1); + + test_waitsig(); + + if (pid) { + int status = 1; + kill(pid, SIGTERM); + wait(&status); + if (status) + return 1; + } + + if (stat(MPTS_ROOT"/dev/mntns2/test", &st)) { + pr_perror("Can't stat /dev/share-1/test.share/test.share"); + return 1; + } + + return 0; +} + +int main(int argc, char **argv) +{ + int fd, tmpfs_fd, have_bfmtm = 0; + struct ns_exec_args args; + pid_t pid = -1; + + test_init(argc, argv); + + task_waiter_init(&t); + + rmdir(MPTS_ROOT); + if (mkdir(MPTS_ROOT, 0600) < 0) { + fail("Can't make zdtm_sys"); + return 1; + } + + if (mount("none", MPTS_ROOT, "sysfs", 0, "") < 0) { + fail("Can't mount sysfs"); + return 1; + } + + if (mount("none", MPTS_ROOT"/dev", "tmpfs", 0, 
"") < 0) { + fail("Can't mount tmpfs"); + return 1; + } + tmpfs_fd = open(MPTS_ROOT"/dev/test", O_WRONLY | O_CREAT); + if (write(tmpfs_fd, "hello", 5) <= 0) { + pr_perror("write() failed"); + return 1; + } + + /* Check that over-mounted files are restored on tmpfs */ + mkdir(MPTS_ROOT"/dev/overmount", 0600); + fd = open(MPTS_ROOT"/dev/overmount/test.over", O_WRONLY | O_CREAT); + if (fd == -1) { + pr_perror("Unable to open "MPTS_ROOT"/dev/overmount"); + return -1; + } + close(fd); + if (mount("none", MPTS_ROOT"/dev/overmount", "tmpfs", 0, "") < 0) { + pr_perror("Can't mount "MPTS_ROOT"/dev/overmount"); + return 1; + } + + mkdir(MPTS_ROOT"/dev/non-root", 0600); + if (mount(MPTS_ROOT"/dev/non-root", MPTS_ROOT"/module", NULL, MS_BIND, NULL) < 0) { + pr_perror("Can't bind-mount %s -> %s", MPTS_ROOT"/dev/tdir", MPTS_ROOT"/module"); + } + mkdir(MPTS_ROOT"/dev/non-root/test", 0600); + + mkdir(MPTS_ROOT"/dev/share-1", 0600); + if (mount("none", MPTS_ROOT"/dev/share-1/", "tmpfs", 0, "") < 0) { + fail("Can't mount tmpfs"); + return 1; + } + if (mount("none", MPTS_ROOT"/dev/share-1/", NULL, MS_SHARED, NULL) < 0) { + fail("Can't mount tmpfs"); + return 1; + } + +//#define CR_NEXT +#ifdef CR_NEXT + mkdir(MPTS_ROOT"/dev/share-1/alone", 0600); + if (mount("none", MPTS_ROOT"/dev/share-1/alone", "tmpfs", 0, "") < 0) { + fail("Can't mount tmpfs"); + return 1; + } +#endif + + mkdir(MPTS_ROOT"/dev/share-2", 0600); + if (mount(MPTS_ROOT"/dev/share-1", MPTS_ROOT"/dev/share-2", NULL, MS_BIND, NULL) < 0) { + fail("Can't bind mount a tmpfs directory"); + return 1; + } + + mkdir(MPTS_ROOT"/dev/share-3", 0600); + if (mount(MPTS_ROOT"/dev/share-1", MPTS_ROOT"/dev/share-3", NULL, MS_BIND, NULL) < 0) { + fail("Can't bind mount a tmpfs directory"); + return 1; + } + mkdir(MPTS_ROOT"/dev/slave", 0600); + if (mount(MPTS_ROOT"/dev/share-1", MPTS_ROOT"/dev/slave", NULL, MS_BIND, NULL) < 0) { + fail("Can't bind mount a tmpfs directory"); + return 1; + } + if (mount("none", MPTS_ROOT"/dev/slave", NULL, 
MS_SLAVE, NULL) < 0) { + fail("Can't mount tmpfs"); + return 1; + } + + mkdir(MPTS_ROOT"/dev/slave2", 0600); + if (mount(MPTS_ROOT"/dev/share-3", MPTS_ROOT"/dev/slave2", NULL, MS_BIND, NULL) < 0) { + fail("Can't bind mount a tmpfs directory"); + return 1; + } + if (mount("none", MPTS_ROOT"/dev/slave2", NULL, MS_SLAVE, NULL) < 0) { + fail("Can't mount tmpfs"); + return 1; + } + + mkdir(MPTS_ROOT"/dev/share-1/test.mnt.share", 0600); + if (mount("none", MPTS_ROOT"/dev/share-1/test.mnt.share", "tmpfs", 0, "size=1G") < 0) { + fail("Can't mount tmpfs"); + return 1; + } + + mkdir(MPTS_ROOT"/dev/share-1/test.mnt.share/test.share", 0600); + if (umount(MPTS_ROOT"/dev/slave2/test.mnt.share")) { + pr_perror("Can't umount "MPTS_ROOT"/dev/slave2/test.mnt.share"); + return 1; + } + + mkdir(MPTS_ROOT"/dev/slave/test.mnt.slave", 0600); + if (mount("none", MPTS_ROOT"/dev/slave/test.mnt.slave", "tmpfs", 0, "") < 0) { + fail("Can't mount tmpfs"); + return 1; + } + mkdir(MPTS_ROOT"/dev/slave/test.mnt.slave/test.slave", 0600); + + fd = open(MPTS_ROOT"/dev/bmfile", O_CREAT | O_WRONLY); + if (fd < 0) { + pr_perror("Can't create " MPTS_ROOT "/dev/share-1/bmfile"); + return 1; + } + close(fd); + + fd = open(MPTS_ROOT"/dev/bmfile-mount", O_CREAT | O_WRONLY); + if (fd < 0) { + pr_perror("Can't create " MPTS_ROOT "/dev/share-1/bmfile"); + return 1; + } + close(fd); + + if (mount(MPTS_ROOT"/dev/bmfile", MPTS_ROOT"/dev/bmfile-mount", NULL, MS_BIND, NULL) < 0) { + fail("Can't mount tmpfs"); + return 1; + } + + if (mount("none", MPTS_ROOT"/kernel", "proc", 0, "") < 0) { + fail("Can't mount proc"); + return 1; + } + + if (mount("none", MPTS_ROOT"/kernel/sys/fs/binfmt_misc", + "binfmt_misc", 0, "") == 0) + have_bfmtm = 1; + + fd = open(MPTS_ROOT"/kernel/meminfo", O_RDONLY); + if (fd == -1) + return 1; + + if (getenv("ZDTM_NOSUBNS") == NULL) { + pid = clone(ns_child, args.stack_ptr, CLONE_NEWNS | SIGCHLD, &args); + if (pid < 0) { + pr_perror("Unable to fork child"); + return 1; + } + } + + 
task_waiter_wait4(&t, 1); + + test_daemon(); + test_waitsig(); + + /* this checks both -- sys and proc presence */ + if (access(MPTS_ROOT"/kernel/meminfo", F_OK)) { + fail("No proc after restore"); + return 1; + } + + if (have_bfmtm && access(MPTS_ROOT"/kernel/sys/fs/binfmt_misc/register", F_OK)) { + fail("No binfmt_misc after restore"); + return 1; + } + + if (umount(MPTS_ROOT"/dev/overmount") == -1) { + pr_perror("Can't umount "MPTS_ROOT"/dev/overmount"); + return -1; + } + if (access(MPTS_ROOT"/dev/overmount/test.over", F_OK)) { + fail(MPTS_ROOT"/dev/overmount/test.over"); + return -1; + } + + { + struct stat st1, st2; + if (stat(MPTS_ROOT"/dev/share-1/test.mnt.share/test.share", &st1)) { + pr_perror("Can't stat /dev/share-1/test.share/test.share"); + return 1; + } + if (stat(MPTS_ROOT"/dev/share-2/test.mnt.share/test.share", &st2)) { + pr_perror("Can't stat /dev/share-2/test.mnt.share/test.share"); + return 1; + } + if (st1.st_ino != st2.st_ino) { + fail("/dev/share-1 and /dev/share-1 is not shared"); + return 1; + } + if (stat(MPTS_ROOT"/dev/slave/test.mnt.share/test.share", &st2)) { + pr_perror("Can't stat /dev/slave/test.mnt.share/test.share"); + return 1; + } + if (st1.st_ino != st2.st_ino) { + fail("/dev/slave is not slave of /dev/share-1"); + return 1; + } + if (stat(MPTS_ROOT"/dev/share-1/test.mnt.slave/test.slave", &st1) != -1 || errno != ENOENT) { + pr_perror("/dev/share-1/test.mnt.slave/test.slave exists"); + return 1; + } + if (stat(MPTS_ROOT"/dev/slave/test.mnt.slave/test.slave", &st2)) { + pr_perror("Can't stat /dev/slave/test.mnt.slave/test.slave"); + return 1; + } + if (stat(MPTS_ROOT"/dev/non-root/test", &st1)) { + pr_perror("Can't stat /dev/non-root/test"); + return 1; + } + } + + if (pid > 0) { + kill(pid, SIGTERM); + int status = 1; + wait(&status); + if (status) + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/mountpoints.desc b/CRIU_code/test/zdtm/static/mountpoints.desc new file mode 100644 index 
0000000..aba60c9 --- /dev/null +++ b/CRIU_code/test/zdtm/static/mountpoints.desc @@ -0,0 +1 @@ +{'flavor': 'ns', 'flags': 'suid excl', 'feature': 'mnt_id'} diff --git a/CRIU_code/test/zdtm/static/mprotect00.c b/CRIU_code/test/zdtm/static/mprotect00.c new file mode 100644 index 0000000..463091a --- /dev/null +++ b/CRIU_code/test/zdtm/static/mprotect00.c @@ -0,0 +1,116 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that memory protection migrates correctly\n"; +const char *test_author = "Roman Kagan "; + +const static int prots[] = { + PROT_NONE, + PROT_READ, + /* PROT_WRITE, */ /* doesn't work w/o READ */ + PROT_READ | PROT_WRITE, + PROT_READ | PROT_WRITE | PROT_EXEC, +}; +#define NUM_MPROTS sizeof(prots) / sizeof(int) + +static sigjmp_buf segv_ret; /* we need sig*jmp stuff, otherwise SIGSEGV will reset our handler */ +static void segfault(int signo) +{ + siglongjmp(segv_ret, 1); +} + +static int check_prot(char *ptr, int prot) +{ + if (signal(SIGSEGV, segfault) == SIG_ERR) { + fail("setting SIGSEGV handler failed: %m\n"); + return -1; + } + + if (!sigsetjmp(segv_ret, 1)) { + if (ptr[10] != 0) { + fail("read value doesn't match what I wrote"); + return -1; + } + if (!(prot & PROT_READ)) { + fail("PROT_READ bypassed\n"); + return -1; + } + } + else /* we come here on return from SIGSEGV handler */ + if (prot & PROT_READ) { + fail("PROT_READ rejected\n"); + return -1; + } + + if (!sigsetjmp(segv_ret, 1)) { + ptr[20] = 67; + if (!(prot & PROT_WRITE)) { + fail("PROT_WRITE bypassed\n"); + return -1; + } + } + else /* we come here on return from SIGSEGV handler */ + if (prot & PROT_WRITE) { + fail("PROT_WRITE rejected\n"); + return -1; + } + + + if (signal(SIGSEGV, SIG_DFL) == SIG_ERR) { + fail("restoring SIGSEGV handler failed: %m\n"); + return -1; + } + + return 0; +} + +int main(int argc, char ** argv) +{ + char *ptr, *ptr_aligned; + int pagesize; + int i; + + 
test_init(argc, argv); + + pagesize = sysconf(_SC_PAGESIZE); + if (pagesize < 0) { + pr_perror("can't get PAGE_SIZE"); + exit(1); + } + + ptr = mmap(NULL, pagesize * (NUM_MPROTS + 1), PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (ptr == MAP_FAILED) { + pr_perror("calloc failed"); + return -1; + } + + ptr_aligned = (char *)(((unsigned long) ptr + pagesize - 1) & + ~(pagesize - 1)); + + for (i = 0; i < NUM_MPROTS; i++) + if (mprotect(ptr_aligned + pagesize * i, + pagesize / 2, prots[i]) < 0) { + pr_perror("mprotect failed"); + exit(1); + } + + test_daemon(); + test_waitsig(); + + for (i = 0; i < NUM_MPROTS; i++) + if (check_prot(ptr_aligned + pagesize * i, prots[i])) + goto out; + + pass(); +out: + return 0; +} diff --git a/CRIU_code/test/zdtm/static/msgque.c b/CRIU_code/test/zdtm/static/msgque.c new file mode 100644 index 0000000..6bbb103 --- /dev/null +++ b/CRIU_code/test/zdtm/static/msgque.c @@ -0,0 +1,137 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc="Tests sysv5 msg queues supporting by checkpointing"; +const char *test_author="Stanislav Kinsbursky "; + +struct msg1 { + long mtype; + char mtext[30]; +}; +#define TEST_STRING "Test sysv5 msg" +#define MSG_TYPE 1 + +#define ANOTHER_TEST_STRING "Yet another test sysv5 msg" +#define ANOTHER_MSG_TYPE 26538 + +int main(int argc, char **argv) +{ + key_t key; + int msg, pid; + struct msg1 msgbuf; + int chret; + + test_init(argc, argv); + + key = ftok(argv[0], 822155650); + if (key == -1) { + pr_perror("Can't make key"); + exit(1); + } + + pid = test_fork(); + if (pid < 0) { + pr_perror("Can't fork"); + exit(1); + } + + msg = msgget(key, IPC_CREAT | IPC_EXCL | 0666); + if (msg == -1) { + msg = msgget(key, 0666); + if (msg == -1) { + pr_perror("Can't get queue"); + goto err_kill; + } + } + + if (pid == 0) { + test_waitsig(); + + if (msgrcv(msg, &msgbuf, sizeof(TEST_STRING), MSG_TYPE, 
IPC_NOWAIT) == -1) { + fail("Child: msgrcv failed (%m)"); + return -errno; + } + + if (strncmp(TEST_STRING, msgbuf.mtext, sizeof(TEST_STRING))) { + fail("Child: the source and received strings aren't equal"); + return -errno; + } + test_msg("Child: received %s\n", msgbuf.mtext); + + msgbuf.mtype = ANOTHER_MSG_TYPE; + memcpy(msgbuf.mtext, ANOTHER_TEST_STRING, sizeof(ANOTHER_TEST_STRING)); + if (msgsnd(msg, &msgbuf, sizeof(ANOTHER_TEST_STRING), IPC_NOWAIT) != 0) { + fail("Child: msgsnd failed (%m)"); + return -errno; + }; + pass(); + return 0; + } else { + msgbuf.mtype = MSG_TYPE; + memcpy(msgbuf.mtext, TEST_STRING, sizeof(TEST_STRING)); + if (msgsnd(msg, &msgbuf, sizeof(TEST_STRING), IPC_NOWAIT) != 0) { + fail("Parent: msgsnd failed (%m)"); + goto err_kill; + }; + + msgbuf.mtype = ANOTHER_MSG_TYPE; + memcpy(msgbuf.mtext, ANOTHER_TEST_STRING, sizeof(ANOTHER_TEST_STRING)); + if (msgsnd(msg, &msgbuf, sizeof(ANOTHER_TEST_STRING), IPC_NOWAIT) != 0) { + fail("child: msgsnd (2) failed (%m)"); + return -errno; + }; + + test_daemon(); + test_waitsig(); + + kill(pid, SIGTERM); + + wait(&chret); + chret = WEXITSTATUS(chret); + if (chret) { + fail("Parent: child exited with non-zero code %d (%s)\n", + chret, strerror(chret)); + goto out; + } + + if (msgrcv(msg, &msgbuf, sizeof(ANOTHER_TEST_STRING), ANOTHER_MSG_TYPE, IPC_NOWAIT) == -1) { + fail("Parent: msgrcv failed (%m)"); + goto err; + } + + if (strncmp(ANOTHER_TEST_STRING, msgbuf.mtext, sizeof(ANOTHER_TEST_STRING))) { + fail("Parent: the source and received strings aren't equal"); + goto err; + } + test_msg("Parent: received %s\n", msgbuf.mtext); + + pass(); + } + +out: + if (msgctl(msg, IPC_RMID, 0)) { + fail("Failed to destroy message queue: %d\n", -errno); + return -errno; + } + return chret; + +err_kill: + kill(pid, SIGKILL); + wait(NULL); +err: + chret = -errno; + goto out; +} diff --git a/CRIU_code/test/zdtm/static/msgque.desc b/CRIU_code/test/zdtm/static/msgque.desc new file mode 100644 index 0000000..6c4afe5 --- 
/dev/null +++ b/CRIU_code/test/zdtm/static/msgque.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns'} diff --git a/CRIU_code/test/zdtm/static/mtime_mmap.c b/CRIU_code/test/zdtm/static/mtime_mmap.c new file mode 100644 index 0000000..64d92ca --- /dev/null +++ b/CRIU_code/test/zdtm/static/mtime_mmap.c @@ -0,0 +1,115 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "file mmaped for write and being written should change mtime\n" + "and be migrated with correct new data"; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +#define FILE_SIZE (16 * 1024) + +int main(int argc, char **argv) +{ + int fd; + char buf[FILE_SIZE]; + size_t count; + int i; + char *ptr; + struct stat fst; + time_t mtime_old, mtime_new; + time_t ctime_old, ctime_new; + + test_init(argc, argv); + + fd = open(filename, O_RDWR | O_CREAT, 0666); + if (fd < 0) { + pr_perror("can't open %s", filename); + exit(1); + } + + /* initialization */ + count = sizeof(buf); + memset(buf, 1, count); + if (write(fd, buf, sizeof(buf)) != sizeof(buf)) { + pr_perror("failed to write %s", filename); + exit(1); + } + + if (fstat(fd, &fst) < 0) { + pr_perror("can't get %s file info", filename); + goto failed; + } + + ptr = (char *)mmap(NULL, count, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if (ptr == MAP_FAILED) { + pr_perror("mmap() Failed, errno=%d : %s", errno, strerror(errno)); + goto failed; + } + + mtime_old = fst.st_mtime; + ctime_old = fst.st_ctime; + sleep(2); + + for (i = 0; i < count; i++) + ptr[i]++; + + if (munmap(ptr, count)) { + pr_perror("munmap Failed, errno=%d : %s", errno, strerror(errno)); + goto failed; + } + + if (fstat(fd, &fst) < 0) { + pr_perror("can't get %s file info", filename); + goto failed; + } + + + mtime_new = fst.st_mtime; + /* time of last modification */ + if (mtime_new <= mtime_old) { + fail("mtime %ld wasn't updated on mmapped %s file", + mtime_new, filename); + goto 
failed; + } + + ctime_new = fst.st_ctime; + /* time of last status change */ + if (ctime_new <= ctime_old) { + fail("time of last status change of %s file wasn't changed\n", + filename); + goto failed; + } + + test_daemon(); + test_waitsig(); + + if (fstat(fd, &fst) < 0) { + pr_perror("can't get %s file info", filename); + goto failed; + } + + /* time of last modification */ + if (fst.st_mtime != mtime_new) { + fail("After migration, mtime changed to %ld", fst.st_mtime); + goto failed; + } + + pass(); + unlink(filename); + close(fd); + return 0; +failed: + return 1; +} diff --git a/CRIU_code/test/zdtm/static/netns-dev.c b/CRIU_code/test/zdtm/static/netns-dev.c new file mode 100644 index 0000000..46bb906 --- /dev/null +++ b/CRIU_code/test/zdtm/static/netns-dev.c @@ -0,0 +1,503 @@ +#include +#include +#include + +#include "zdtmtst.h" + +#define LO_CONF_DIR_PATH "/proc/sys/net/ipv4/conf/lo" +#define DEF_CONF_DIR_PATH "/proc/sys/net/ipv4/conf/default" +#define ALL_CONF_DIR_PATH "/proc/sys/net/ipv4/conf/all" +#define LO_CONF6_DIR_PATH "/proc/sys/net/ipv6/conf/lo" +#define DEF_CONF6_DIR_PATH "/proc/sys/net/ipv6/conf/default" +#define ALL_CONF6_DIR_PATH "/proc/sys/net/ipv6/conf/all" + +#define INT_MAX ((int)(~0U>>1)) +#define INT_MIN (-INT_MAX - 1) + +char *devconfs4[] = { + "accept_local", + "accept_source_route", + "arp_accept", + "arp_announce", + "arp_filter", + "arp_ignore", + "arp_notify", + "bootp_relay", + "disable_policy", + "disable_xfrm", + "drop_gratuitous_arp", + "drop_unicast_in_l2_multicast", + "force_igmp_version", + "forwarding", + "accept_redirects", + "igmpv2_unsolicited_report_interval", + "igmpv3_unsolicited_report_interval", + "ignore_routes_with_linkdown", + "log_martians", + "mc_forwarding", + "medium_id", + "promote_secondaries", + "proxy_arp", + "proxy_arp_pvlan", + "route_localnet", + "rp_filter", + "secure_redirects", + "send_redirects", + "shared_media", + "src_valid_mark", + "tag", + NULL, +}; + +struct range { + int min; + int max; +}; + 
+struct range rand_range4[] = { + {0, 1}, /* accept_local */ + {-1, 0}, /* accept_source_route */ + {0, 1}, /* arp_accept */ + {0, 2}, /* arp_announce */ + {0, 1}, /* arp_filter */ + {0, 8}, /* arp_ignore */ + {0, 1}, /* arp_notify */ + {0, 1}, /* bootp_relay */ + {0, 1}, /* disable_policy */ + {0, 1}, /* disable_xfrm */ + {0, 1}, /* drop_gratuitous_arp */ + {0, 1}, /* drop_unicast_in_l2_multicast */ + {0, INT_MAX}, /* force_igmp_version */ + {0, 1}, /* forwarding */ + {0, 1}, /* accept_redirects */ + {0, INT_MAX}, /* igmpv2_unsolicited_report_interval */ + {0, INT_MAX}, /* igmpv3_unsolicited_report_interval */ + {0, 1}, /* ignore_routes_with_linkdown */ + {0, 1}, /* log_martians */ + {0, 1}, /* mc_forwarding */ + {-1, INT_MAX}, /* medium_id */ + {0, 1}, /* promote_secondaries */ + {0, 1}, /* proxy_arp */ + {0, 1}, /* proxy_arp_pvlan */ + {0, 1}, /* route_localnet */ + {0, 2}, /* rp_filter */ + {0, 1}, /* secure_redirects */ + {0, 1}, /* send_redirects */ + {0, 1}, /* shared_media */ + {0, 1}, /* src_valid_mark */ + {INT_MIN, INT_MAX}, /* tag */ +}; + +char *devconfs6[] = { + "accept_dad", + "accept_ra", + "accept_ra_defrtr", + "accept_ra_from_local", + "accept_ra_min_hop_limit", + "accept_ra_mtu", + "accept_ra_pinfo", + "accept_ra_rt_info_max_plen", + "accept_ra_rtr_pref", + "accept_source_route", + "autoconf", + "dad_transmits", + "disable_ipv6", + "drop_unicast_in_l2_multicast", + "drop_unsolicited_na", + "force_mld_version", + "force_tllao", + "forwarding", + "accept_redirects", + "hop_limit", + "ignore_routes_with_linkdown", + "keep_addr_on_down", + "max_addresses", + "max_desync_factor", + "mldv1_unsolicited_report_interval", + "mldv2_unsolicited_report_interval", + "mtu", + "ndisc_notify", + "optimistic_dad", + "proxy_ndp", + "regen_max_retry", + "router_probe_interval", + "router_solicitation_delay", + "router_solicitation_interval", + "router_solicitations", + "suppress_frag_ndisc", + "temp_prefered_lft", + "temp_valid_lft", + "use_oif_addrs_only", + 
"use_optimistic", + "use_tempaddr", + NULL, +}; + +#define IPV6_MIN_MTU 1280 +#define ROUTER_MAX 60 +/* According to kernel docs do not make max_addresses too large */ +#define MAX_ADDRESSES 128 + +struct range rand_range6[] = { + {0, 2}, /* accept_dad */ + {0, 2}, /* accept_ra */ + {0, 1}, /* accept_ra_defrtr */ + {0, 1}, /* accept_ra_from_local */ + {0, INT_MAX}, /* accept_ra_min_hop_limit */ + {0, 1}, /* accept_ra_mtu */ + {0, 1}, /* accept_ra_pinfo */ + {0, INT_MAX}, /* accept_ra_rt_info_max_plen */ + {0, 1}, /* accept_ra_rtr_pref */ + {-1, 0}, /* accept_source_route */ + {0, 1}, /* autoconf */ + {0, INT_MAX}, /* dad_transmits */ + {0, 1}, /* disable_ipv6 */ + {0, 1}, /* drop_unicast_in_l2_multicast */ + {0, 1}, /* drop_unsolicited_na */ + {0, 2}, /* force_mld_version */ + {0, 1}, /* force_tllao */ + {0, 1}, /* forwarding */ + {0, 1}, /* accept_redirects */ + {1, 255}, /* hop_limit */ + {0, 1}, /* ignore_routes_with_linkdown */ + {-1, 1}, /* keep_addr_on_down */ + {0, MAX_ADDRESSES}, /* max_addresses */ + {0, INT_MAX}, /* max_desync_factor */ + {0, INT_MAX}, /* mldv1_unsolicited_report_interval */ + {0, INT_MAX}, /* mldv2_unsolicited_report_interval */ + {IPV6_MIN_MTU, IPV6_MIN_MTU}, /* mtu */ + {0, 1}, /* ndisc_notify */ + {0, 1}, /* optimistic_dad */ + {0, 1}, /* proxy_ndp */ + {0, INT_MAX}, /* regen_max_retry */ + {0, ROUTER_MAX}, /* router_probe_interval */ + {0, ROUTER_MAX}, /* router_solicitation_delay */ + {0, ROUTER_MAX}, /* router_solicitation_interval */ + {0, ROUTER_MAX}, /* router_solicitations */ + {0, 1}, /* suppress_frag_ndisc */ + {0, INT_MAX}, /* temp_prefered_lft */ + {0, INT_MAX}, /* temp_valid_lft */ + {0, 1}, /* use_oif_addrs_only */ + {0, 1}, /* use_optimistic */ + {0, 2}, /* use_tempaddr */ +}; + +struct test_conf { + int ipv4_conf[ARRAY_SIZE(devconfs4)]; + int ipv4_conf_rand[ARRAY_SIZE(devconfs4)]; + int ipv6_conf[ARRAY_SIZE(devconfs6)]; + int ipv6_conf_rand[ARRAY_SIZE(devconfs6)]; + char *dir4; + char *dir6; +} lo, def, all; + +static 
int save_conf(FILE *fp, int *conf, int *conf_rand, + struct range *range, char *path) { + int ret; + + /* + * Save + */ + ret = fscanf(fp, "%d", conf); + if (ret != 1) { + pr_perror("fscanf"); + return -1; + } + + return 0; +} + +static int rand_in_small_range(struct range *r) { + return lrand48() % (r->max - r->min + 1) + r->min; +} + +static int rand_in_range(struct range *r) { + struct range small; + int mid = r->max / 2 + r->min / 2; + int half = r->max / 2 - r->min / 2; + + if (half < INT_MAX / 2) + return rand_in_small_range(r); + + if (lrand48() % 2) { + small.min = r->min; + small.max = mid; + } else { + small.min = mid + 1; + small.max = r->max; + } + + return rand_in_small_range(&small); +} + +static int gen_conf(FILE *fp, int *conf, int *conf_rand, + struct range *range, char *path) { + int ret; + /* + * Set random value + */ + *conf_rand = rand_in_range(range); + + ret = fprintf(fp, "%d", *conf_rand); + if (ret < 0) { + pr_perror("fprintf"); + return -1; + } + + return 0; +} + +#define MAX_MSEC_GRANULARITY 10 + +static int check_conf(FILE *fp, int *conf, int *conf_rand, + struct range *range, char *path) { + int ret; + int val; + + /* + * Check opt + */ + ret = fscanf(fp, "%d", &val); + if (ret != 1) { + pr_perror("fscanf"); + return -1; + } + + if (val != *conf_rand) { + fail("Option \"%s\" changed from %d to %d", + path, *conf_rand, val); + if ((strstr(path, "mldv1_unsolicited_report_interval") + || strstr(path, "mldv2_unsolicited_report_interval")) + && val - *conf_rand < MAX_MSEC_GRANULARITY) + return 0; + return -1; + } + + return 0; +} + +static int restore_conf(FILE *fp, int *conf, int *conf_rand, + struct range *range, char *path) { + int ret; + /* + * Restore opt + */ + ret = fprintf(fp, "%d", *conf); + if (ret < 0) { + pr_perror("fprintf"); + return -1; + } + + return 0; +} + +static int for_each_option_do(int (*f)(FILE *fp, int *conf, int *conf_rand, + struct range *range, char *path), struct test_conf *tc) { + int ret; + int i; + + for (i = 
0; devconfs4[i]; i++) { + FILE *fp; + char path[PATH_MAX]; + + ret = snprintf(path, sizeof(path), "%s/%s", tc->dir4, devconfs4[i]); + if (ret < 0) { + pr_perror("snprintf"); + return -1; + } + + ret = access(path, W_OK); + if (ret < 0) + continue; + + fp = fopen(path, "r+"); + if (fp == NULL) { + pr_perror("fopen"); + return -1; + } + + ret = (*f)(fp, &tc->ipv4_conf[i], &tc->ipv4_conf_rand[i], &rand_range4[i], path); + + fclose(fp); /* close before the error check so a failing callback cannot leak the stream */ + if (ret < 0) + return -1; + } + + for (i = 0; devconfs6[i]; i++) { + FILE *fp; + char path[PATH_MAX]; + + ret = snprintf(path, sizeof(path), "%s/%s", tc->dir6, devconfs6[i]); + if (ret < 0) { + pr_perror("snprintf"); + return -1; + } + + ret = access(path, W_OK); + if (ret < 0) + continue; + + fp = fopen(path, "r+"); + if (fp == NULL) { + pr_perror("fopen"); + return -1; + } + + ret = (*f)(fp, &tc->ipv6_conf[i], &tc->ipv6_conf_rand[i], &rand_range6[i], path); + + fclose(fp); /* close before the error check so a failing callback cannot leak the stream */ + if (ret < 0) + return -1; + } + + return 0; +} + +#define IPV6ADDR_EXAMPLE "2607:f0d0:1002:0051:0000:0000:0000:0004" +#define MAX_STR_CONF_LEN 200 + +static int set_stable_secret(struct test_conf *tc) { + int ret; + FILE *fp; + char path[PATH_MAX]; + + ret = snprintf(path, sizeof(path), "%s/%s", tc->dir6, "stable_secret"); + if (ret < 0) { + pr_perror("snprintf"); + return -1; + } + + ret = access(path, W_OK); + if (ret < 0) + return 0; + + fp = fopen(path, "r+"); + if (fp == NULL) { + pr_perror("fopen"); + return -1; + } + + ret = fprintf(fp, IPV6ADDR_EXAMPLE); + if (ret < 0) { + pr_perror("fprintf"); + fclose(fp); + return -1; + } + + fclose(fp); + return 0; +} + +static int check_stable_secret(struct test_conf *tc) { + int ret; + FILE *fp; + char path[PATH_MAX]; + char val[MAX_STR_CONF_LEN+1]; + + ret = snprintf(path, sizeof(path), "%s/%s", tc->dir6, "stable_secret"); + if (ret < 0) { + pr_perror("snprintf"); + return -1; + } + + ret = access(path, W_OK); + if (ret < 0) + return 0; + + fp = fopen(path, "r+"); + if (fp == NULL) { + pr_perror("fopen"); + return 
-1; + } + + ret = fscanf(fp, "%s", val); + if (ret != 1) { + pr_perror("fscanf"); + fclose(fp); + return -1; + } + + if (strcmp(val, IPV6ADDR_EXAMPLE)) { + fail("Option \"%s\" changed from %s to %s", + path, IPV6ADDR_EXAMPLE, val); + fclose(fp); + return -1; + } + + fclose(fp); + return 0; +} + +int main(int argc, char **argv) +{ + int ret; + + lo.dir4 = LO_CONF_DIR_PATH; + def.dir4 = DEF_CONF_DIR_PATH; + all.dir4 = ALL_CONF_DIR_PATH; + lo.dir6 = LO_CONF6_DIR_PATH; + def.dir6 = DEF_CONF6_DIR_PATH; + all.dir6 = ALL_CONF6_DIR_PATH; + + test_init(argc, argv); + + ret = for_each_option_do(save_conf, &all); + if (ret < 0) + return -1; + ret = for_each_option_do(save_conf, &def); + if (ret < 0) + return -1; + ret = for_each_option_do(save_conf, &lo); + if (ret < 0) + return -1; + + ret = for_each_option_do(gen_conf, &all); + if (ret < 0) + return -1; + ret = for_each_option_do(gen_conf, &def); + if (ret < 0) + return -1; + ret = for_each_option_do(gen_conf, &lo); + if (ret < 0) + return -1; + + ret = set_stable_secret(&def); + if (ret < 0) + return -1; + ret = set_stable_secret(&lo); + if (ret < 0) + return -1; + + test_daemon(); + test_waitsig(); + + ret = for_each_option_do(check_conf, &all); + if (ret < 0) + return -1; + ret = for_each_option_do(check_conf, &def); + if (ret < 0) + return -1; + ret = for_each_option_do(check_conf, &lo); + if (ret < 0) + return -1; + + ret = for_each_option_do(restore_conf, &all); + if (ret < 0) + return -1; + ret = for_each_option_do(restore_conf, &def); + if (ret < 0) + return -1; + ret = for_each_option_do(restore_conf, &lo); + if (ret < 0) + return -1; + + ret = check_stable_secret(&def); + if (ret < 0) + return -1; + ret = check_stable_secret(&lo); + if (ret < 0) + return -1; + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/netns-dev.desc b/CRIU_code/test/zdtm/static/netns-dev.desc new file mode 100644 index 0000000..7657ba4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/netns-dev.desc @@ -0,0 +1 @@ +{'flavor': 
'ns uns', 'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/netns-nf.c b/CRIU_code/test/zdtm/static/netns-nf.c new file mode 100644 index 0000000..393f0e7 --- /dev/null +++ b/CRIU_code/test/zdtm/static/netns-nf.c @@ -0,0 +1,48 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that netfilter rules (some) are kept"; +const char *test_author = "Pavel Emelianov "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +int main(int argc, char **argv) +{ + char cmd[128]; /* every command formatted below is bounded by sizeof(cmd) */ + + test_init(argc, argv); + + if (system("iptables -A INPUT -t filter --protocol icmp -j DROP")) { + pr_perror("Can't set input rule"); + return -1; + } + + snprintf(cmd, sizeof(cmd), "iptables -L > pre-%s", filename); + if (system(cmd)) { + pr_perror("Can't save iptables"); + return -1; + } + + test_daemon(); + test_waitsig(); + + snprintf(cmd, sizeof(cmd), "iptables -L > post-%s", filename); + if (system(cmd)) { + fail("Can't get iptables"); + return -1; + } + + snprintf(cmd, sizeof(cmd), "diff pre-%s post-%s", filename, filename); + if (system(cmd)) { + fail("Iptables differ"); + return -1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/netns-nf.desc b/CRIU_code/test/zdtm/static/netns-nf.desc new file mode 100644 index 0000000..496477a --- /dev/null +++ b/CRIU_code/test/zdtm/static/netns-nf.desc @@ -0,0 +1,6 @@ +{ 'deps': [ '/bin/sh', + '/sbin/iptables', + '/usr/lib64/xtables/libxt_standard.so|/usr/lib/iptables/libxt_standard.so|/lib/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so', + '/usr/bin/diff'], + 'flags': 'suid', + 'flavor': 'ns uns'} diff --git a/CRIU_code/test/zdtm/static/netns.c b/CRIU_code/test/zdtm/static/netns.c new file mode 100644 index 0000000..b7a75b6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/netns.c @@ -0,0 +1,55 @@ +#include +#include +#include + 
+#include "zdtmtst.h" + +const char *test_doc = "Check that network environment (links, addresses and routes) are preserved"; +const char *test_author = "Pavel Emelianov "; + +int main(int argc, char **argv) +{ + test_init(argc, argv); + + if (system("ip link set lo up")) { + fail("Can't set lo up"); + return -1; + } + + if (system("ip addr add 1.2.3.4 dev lo")) { + fail("Can't add addr on lo"); + return -1; + } + + if (system("ip route add 1.2.3.5 dev lo")) { + fail("Can't add route via lo"); + return -1; + } + + if (system("ip route add 1.2.3.6 via 1.2.3.5")) { + fail("Can't add route via lo (2)"); + return -1; + } + + if (system("ip link > netns.dump.test && ip addr >> netns.dump.test && ip route >> netns.dump.test")) { + sleep(1000); + fail("Can't save net config"); + return -1; + } + + test_daemon(); + test_waitsig(); + + if (system("ip link > netns.rst.test && ip addr >> netns.rst.test && ip route >> netns.rst.test")) { + fail("Can't get net config"); + return -1; + } + + if (system("diff netns.rst.test netns.dump.test")) { + fail("Net config differs after restore"); + return -1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/netns.desc b/CRIU_code/test/zdtm/static/netns.desc new file mode 100644 index 0000000..b9f9d28 --- /dev/null +++ b/CRIU_code/test/zdtm/static/netns.desc @@ -0,0 +1,3 @@ +{ 'deps': ['/bin/sh', '/sbin/ip|/bin/ip', '/usr/bin/diff'], + 'flags': 'suid', + 'flavor': 'ns uns'} diff --git a/CRIU_code/test/zdtm/static/netns_sub.c b/CRIU_code/test/zdtm/static/netns_sub.c new file mode 100644 index 0000000..6515057 --- /dev/null +++ b/CRIU_code/test/zdtm/static/netns_sub.c @@ -0,0 +1,208 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check dump and restore a few network namespaces"; + +static int fill_name(int nsid, struct sockaddr_un *name) +{ + int len; + + name->sun_family = AF_LOCAL; + 
snprintf(name->sun_path, 108, "X/zdtm/static/netns_sub-%d", nsid); + len = SUN_LEN(name); + name->sun_path[0] = 0; + + return len; +} + +static int create_socket(int nsid) +{ + struct sockaddr_un name; + int len, sk; + + len = fill_name(nsid, &name); + + sk = socket(AF_LOCAL, SOCK_DGRAM, 0); + if (sk < 0) { + pr_perror("socket"); + return -1; + } + + if (bind(sk, (struct sockaddr *) &name, len) < 0) { + pr_perror("bind"); + close(sk); + return -1; + } + + return sk; +} + +static int check_socket(int nsid, bool success) +{ + struct sockaddr_un name; + int len, sk; + + len = fill_name(nsid, &name); + + sk = socket(AF_LOCAL, SOCK_DGRAM, 0); + if (sk < 0) { + pr_perror("socket"); + return -1; + } + + if (connect(sk, (struct sockaddr *) &name, len) < 0) { + if (!success && errno == ECONNREFUSED) + return 0; + pr_perror("connect to %d", nsid); + close(sk); + return -1; + } + close(sk); + + if (!success) { + pr_err("A sokcet is able to connect to %d\n", nsid); + return -1; + } + + return 0; +} + +int main(int argc, char **argv) +{ + task_waiter_t lock; + pid_t pid1, pid2, pid3, pid0 = getpid(); + int status = -1, sk; + + test_init(argc, argv); + task_waiter_init(&lock); + + sk = create_socket(0); + if (sk < 0) + return 1; + + pid1 = fork(); + if (pid1 < 0) { + pr_perror("fork"); + return -1; + } + if (pid1 == 0) { + close(sk); + if (unshare(CLONE_NEWNET)) { + pr_perror("unshare"); + return 1; + } + sk = create_socket(1); + if (sk < 0) + return 1; + + pid3 = fork(); + if (pid3 < 0) { + pr_perror("fork"); + return 1; + } + if (pid3 == 0) { + char ns[] = "/proc/0123456789/ns/net"; + int fd; + + snprintf(ns, sizeof(ns), "/proc/%d/ns/net", pid0); + fd = open(ns, O_RDONLY); + if (fd < 0) + return 1; + + if (setns(fd, 0)) + return 1; + close(fd); + + task_waiter_complete(&lock, 3); + test_waitsig(); + + if (check_socket(0, true)) + return 1; + if (check_socket(2, false)) + return 1; + if (check_socket(1, false)) + return 1; + + return 0; + } + /* This socket will be alive in the 
3 process */ + close(sk); + + task_waiter_complete(&lock, 1); + test_waitsig(); + + if (check_socket(1, true)) + return 1; + + kill(pid3, SIGTERM); + waitpid(pid3, &status, 0); + if (status) { + fail(); + return 1; + } + + return 0; + } + pid2 = fork(); + if (pid2 < 0) { + pr_perror("fork"); + return -1; + } + if (pid2 == 0) { + if (unshare(CLONE_NEWNET)) { + pr_perror("unshare"); + return 1; + } + sk = create_socket(2); + if (sk < 0) + return 1; + task_waiter_complete(&lock, 2); + + test_waitsig(); + + if (check_socket(0, false)) + return 1; + if (check_socket(1, false)) + return 1; + if (check_socket(2, true)) + return 1; + + return 0; + } + close(sk); + task_waiter_wait4(&lock, 1); + task_waiter_wait4(&lock, 2); + task_waiter_wait4(&lock, 3); + + test_daemon(); + test_waitsig(); + + kill(pid1, SIGTERM); + waitpid(pid1, &status, 0); + if (status) { + fail(); + return 1; + } + kill(pid2, SIGTERM); + status = -1; + waitpid(pid2, &status, 0); + if (status) { + fail(); + return 1; + } + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/netns_sub.desc b/CRIU_code/test/zdtm/static/netns_sub.desc new file mode 100644 index 0000000..7657ba4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/netns_sub.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/netns_sub_veth.c b/CRIU_code/test/zdtm/static/netns_sub_veth.c new file mode 100644 index 0000000..9278271 --- /dev/null +++ b/CRIU_code/test/zdtm/static/netns_sub_veth.c @@ -0,0 +1,124 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check dump and restore a few network namespaces"; + +#ifndef NSIO +#define NSIO 0xb7 +#define NS_GET_USERNS _IO(NSIO, 0x1) +#define NS_GET_PARENT _IO(NSIO, 0x2) +#endif + +int main(int argc, char **argv) +{ + task_waiter_t lock; + pid_t pid[2]; + int 
status = -1, ret, i; + struct rtnl_link *link = NULL, *new; + struct nl_sock *sk; + int has_index = 1; + + test_init(argc, argv); + task_waiter_init(&lock); + + for (i = 0; i < 2; i++) { + pid[i] = fork(); + if (pid[i] < 0) { + pr_perror("fork"); + return -1; + } + if (pid[i] == 0) { + if (unshare(CLONE_NEWNET)) + return 1; + + task_waiter_complete(&lock, i); + test_waitsig(); + + return 0; + } + task_waiter_wait4(&lock, i); + } + + sk = nl_socket_alloc(); + if (sk == NULL) + return -1; + + ret = nl_connect(sk, NETLINK_ROUTE); + if (ret < 0) { + nl_socket_free(sk); + pr_err("Unable to connect socket: %s", nl_geterror(ret)); + return -1; + } + + if (system("ip link add name zdtmbr type bridge")) + return -1; + + for (i = 0; i < 2; i++) { + char cmd[4096]; + + snprintf(cmd, sizeof(cmd), "ip link add name zdtm%d index %d netns %d type veth peer name zdtm%d index %d", + i, i * 10 + 12, pid[i], i, i * 10 + 12); + if (system(cmd)) { + has_index = 0; + snprintf(cmd, sizeof(cmd), "ip link add name zdtm%d netns %d type veth peer name zdtm%d", i, pid[i], i); + if (system(cmd)) + return 1; + } + snprintf(cmd, sizeof(cmd), "ip link set dev zdtm%d master zdtmbr", i); + if (system(cmd)) + return 1; + } + + test_daemon(); + test_waitsig(); + + for (i = 0; i < 2; i++) { + link = rtnl_link_alloc(); + new = rtnl_link_alloc(); + if (has_index) + rtnl_link_set_ifindex(link, i * 10 + 12); + else { + char name[43]; + snprintf(name, sizeof(name), "zdtm%d", i); + rtnl_link_set_name(link, name); + rtnl_link_set_name(new, name); + } + rtnl_link_set_flags(new, IFF_UP); + ret = rtnl_link_change(sk, link, new, 0); + if (ret) { + fail("Unable to up the link: %s", nl_geterror(ret)); + return 1; + } + } + + for (i = 0; i < 2; i++) { + kill(pid[i], SIGTERM); + waitpid(pid[i], &status, 0); + if (status) { + fail(); + return 1; + } + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/netns_sub_veth.desc b/CRIU_code/test/zdtm/static/netns_sub_veth.desc new file mode 100644 index 
0000000..ea9e15c --- /dev/null +++ b/CRIU_code/test/zdtm/static/netns_sub_veth.desc @@ -0,0 +1,6 @@ +{ + 'deps': ['/sbin/ip', '/bin/sh'], + 'flags': 'suid', + 'flavor': 'ns uns', + 'feature': 'link_nsid', +} diff --git a/CRIU_code/test/zdtm/static/non_uniform_share_propagation.c b/CRIU_code/test/zdtm/static/non_uniform_share_propagation.c new file mode 100644 index 0000000..7a18247 --- /dev/null +++ b/CRIU_code/test/zdtm/static/non_uniform_share_propagation.c @@ -0,0 +1,131 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check non-uniform shares restore fine"; +const char *test_author = "Pavel Tikhomirov "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + char share1[PATH_MAX], share2[PATH_MAX]; + char child1[PATH_MAX], child2[PATH_MAX], child3[PATH_MAX]; + + test_init(argc, argv); + + if (mkdir(dirname, 0700)) { + pr_perror("mkdir"); + return 1; + } + + if (mount("zdtm_fs", dirname, "tmpfs", 0, NULL)) { + pr_perror("mount"); + return 1; + } + + if (mount(NULL, dirname, NULL, MS_PRIVATE, NULL)) { + pr_perror("mount"); + return 1; + } + + snprintf(share1, sizeof(share1), "%s/share1", dirname); + if (mkdir(share1, 0700)) { + pr_perror("mkdir"); + return 1; + } + + if (mount("share", share1, "tmpfs", 0, NULL)) { + pr_perror("mount"); + return 1; + } + + if (mount(NULL, share1, NULL, MS_SHARED, NULL)) { + pr_perror("mount"); + return 1; + } + + snprintf(child1, sizeof(child1), "%s/share1/child1", dirname); + if (mkdir(child1, 0700)) { + pr_perror("mkdir"); + return 1; + } + + if (mount("child1", child1, "tmpfs", 0, NULL)) { + pr_perror("mount"); + return 1; + } + + snprintf(share2, sizeof(share2), "%s/share2", dirname); + if (mkdir(share2, 0700)) { + pr_perror("mkdir"); + return 1; + } + + if (mount(share1, share2, NULL, MS_BIND, NULL)) { + pr_perror("mount"); + return 1; + } + + snprintf(child2, sizeof(child2), "%s/share1/child2", dirname); + if (mkdir(child2, 0700)) { + 
pr_perror("mkdir"); + return 1; + } + + if (mount(share1, child2, NULL, MS_BIND, NULL)) { + pr_perror("mount"); + return 1; + } + + snprintf(child3, sizeof(child3), "%s/share1/child3", dirname); + if (mkdir(child3, 0700)) { + pr_perror("mkdir"); + return 1; + } + + if (mount("child3", child3, "tmpfs", 0, NULL)) { + pr_perror("mount"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (umount(child3)) { + pr_perror("Unable to umount %s", child3); + return 1; + } + + if (umount(child2)) { + pr_perror("Unable to umount %s", child2); + return 1; + } + + if (umount(share2)) { + pr_perror("Unable to umount %s", share2); + return 1; + } + + if (umount(child1)) { + pr_perror("Unable to umount %s", child1); + return 1; + } + + if (umount(share1)) { + pr_perror("Unable to umount %s", share1); + return 1; + } + + if (umount(dirname)) { + pr_perror("Unable to umount %s", dirname); + return 1; + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/non_uniform_share_propagation.desc b/CRIU_code/test/zdtm/static/non_uniform_share_propagation.desc new file mode 100644 index 0000000..7657ba4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/non_uniform_share_propagation.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/ofd_file_locks.c b/CRIU_code/test/zdtm/static/ofd_file_locks.c new file mode 100644 index 0000000..5b19532 --- /dev/null +++ b/CRIU_code/test/zdtm/static/ofd_file_locks.c @@ -0,0 +1,194 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" +#include "fs.h" +#include "ofd_file_locks.h" + +static int parse_ofd_lock(char *buf, struct flock *lck) +{ + char fl_flag[10], fl_type[15], fl_option[10], fl_end[32]; + long long start; + int num; + + if (strncmp(buf, "lock:\t", 6) != 0) + return 1; /* isn't lock, skip record */ + + num = sscanf(buf, + "%*s %*d: %9s %14s %9s %*d %*x:%*x:%*d %lld %31s", + fl_flag, fl_type, fl_option, &start, fl_end); + + if (num < 5) { /* all five conversions must match, otherwise fl_end is left unset */ + pr_err("Invalid lock info %s\n", 
buf); + return -1; + } + if (strcmp(fl_flag, "OFDLCK")) + return 1; + + lck->l_start = start; + + if (strcmp(fl_end, "EOF")) { + unsigned long end; + + if (sscanf(fl_end, "%lu", &end) <= 0) { + pr_err("Invalid lock entry\n"); + return -1; + } + lck->l_len = end - lck->l_start + 1; + } else { + lck->l_len = 0; + } + if (strcmp(fl_option, "WRITE") == 0) + lck->l_type = F_WRLCK; + else + lck->l_type = F_RDLCK; + + return 0; +} + +static int read_fd_ofd_lock(int pid, int fd, struct flock *lck) +{ + char path[PATH_MAX]; + char buf[100]; + int num; + FILE *proc_file = NULL; + + sprintf(path, "/proc/%i/fdinfo/%i", pid, fd); + proc_file = fopen(path, "r"); + + if (!proc_file) { + pr_err("Can't open %s\n", path); + return -1; + } + + num = -1; + while (fgets(buf, sizeof(buf), proc_file)) { + num = parse_ofd_lock(buf, lck); + if (num <= 0) + break; + } + + if (fclose(proc_file)) { + pr_err("Can't close %s\n", path); + return -1; + } + return num; +} + +int check_lock_exists(const char *filename, struct flock *lck) +{ + int ret = -1; + int fd; + + fd = open(filename, O_RDWR, 0666); + + if (lck->l_type == F_RDLCK) { + /* check, that there is no write lock */ + ret = zdtm_fcntl(fd, F_OFD_GETLK, lck); + if (ret) { + pr_err("fcntl failed (%i)\n", ret); + goto out; + } + if (lck->l_type != F_UNLCK) { + pr_err("OFD lock type do not match\n"); + goto out; + } + } + + /* check, that lock is set */ + lck->l_type = F_WRLCK; + ret = zdtm_fcntl(fd, F_OFD_GETLK, lck); + if (ret) { + pr_err("fcntl failed (%i)\n", ret); + goto out; + } + if (lck->l_type == F_UNLCK) { + pr_err("Lock not found\n"); + goto out; + } + + ret = 0; +out: + if (close(fd)) + return -1; + return ret; +} + +static int check_file_locks_match(struct flock *orig_lck, struct flock *lck) +{ + return orig_lck->l_start == lck->l_start && + orig_lck->l_len == lck->l_len && + orig_lck->l_type == lck->l_type; +} + +int check_file_lock_restored(int pid, int fd, struct flock *lck) +{ + struct flock lck_restored; + + if 
(read_fd_ofd_lock(pid, fd, &lck_restored)) + return -1; + + if (!check_file_locks_match(lck, &lck_restored)) { + pr_err("Can't restore file lock (fd: %i)\n", fd); + return -1; + } + return 0; +} + +/* + * fcntl() wrapper for ofd locks. + * + * Kernel requires ia32 processes to use fcntl64() syscall for ofd: + * COMPAT_SYSCALL_DEFINE3(fcntl, [..]) + * { + * switch (cmd) { + * case F_GETLK64: + * case F_SETLK64: + * case F_SETLKW64: + * case F_OFD_GETLK: + * case F_OFD_SETLK: + * case F_OFD_SETLKW: + * return -EINVAL; + * } + * + * Glibc does all the needed wraps for fcntl(), but only from v2.28. + * To make ofd tests run on the older glibc's - provide zdtm wrap. + * + * Note: we don't need the wraps in CRIU itself as parasite/restorer + * run in 64-bit mode as long as possible, including the time to play + * with ofd (and they are dumped from CRIU). + */ +int zdtm_fcntl(int fd, int cmd, struct flock *f) +{ +#if defined(__i386__) +#ifndef __NR_fcntl64 +# define __NR_fcntl64 221 +#endif + struct flock64 f64 = {}; + int ret; + + switch (cmd) { + case F_OFD_SETLK: + case F_OFD_SETLKW: + f64.l_type = f->l_type; + f64.l_whence = f->l_whence; + f64.l_start = f->l_start; + f64.l_len = f->l_len; + f64.l_pid = f->l_pid; + return syscall(__NR_fcntl64, fd, cmd, &f64); + case F_OFD_GETLK: + ret = syscall(__NR_fcntl64, fd, cmd, &f64); + f->l_type = f64.l_type; + f->l_whence = f64.l_whence; + f->l_start = f64.l_start; + f->l_len = f64.l_len; + f->l_pid = f64.l_pid; + return ret; + default: + break; + } +#endif + return fcntl(fd, cmd, f); +} diff --git a/CRIU_code/test/zdtm/static/ofd_file_locks.h b/CRIU_code/test/zdtm/static/ofd_file_locks.h new file mode 100644 index 0000000..1b206a2 --- /dev/null +++ b/CRIU_code/test/zdtm/static/ofd_file_locks.h @@ -0,0 +1,21 @@ +#ifndef ZDTM_OFD_FILE_LOCKS_H_ +#define ZDTM_OFD_FILE_LOCKS_H_ + +#include + +#ifndef F_OFD_GETLK +#define F_OFD_GETLK 36 +#define F_OFD_SETLK 37 +#define F_OFD_SETLKW 38 +#endif + +/* + * Functions for parsing of OFD 
locks + * from procfs and checking them after restoring. + */ + +extern int check_lock_exists(const char *filename, struct flock *lck); +extern int check_file_lock_restored(int pid, int fd, struct flock *lck); +extern int zdtm_fcntl(int fd, int cmd, struct flock *f); + +#endif /* ZDTM_OFD_FILE_LOCKS_H_ */ diff --git a/CRIU_code/test/zdtm/static/oom_score_adj.c b/CRIU_code/test/zdtm/static/oom_score_adj.c new file mode 100644 index 0000000..b427530 --- /dev/null +++ b/CRIU_code/test/zdtm/static/oom_score_adj.c @@ -0,0 +1,94 @@ +#include +#include +#include +#include +#include + +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check for /proc/self/oom_score_adj restore"; +const char *test_author = "Dmitry Safonov "; + +const char oom_score_adj_self[] = "/proc/self/oom_score_adj"; +const int test_value = 400; + +int get_oom_score_adj(const char *path, int *err) +{ + int fd; + ssize_t num; + char buf[11]; + + *err = 0; + fd = open(path, O_RDONLY); + if (fd < 0) { + pr_perror("Failed to open %s", path); + goto out; + } + + num = read(fd, buf, 10); + close(fd); + if (num < 0) { + pr_perror("Unable to read %s", path); + goto out; + } + buf[num] = '\0'; + + return strtol(buf, NULL, 10); + +out: + *err = -1; + return 0; +} + +int set_oom_score_adj(const char *path, int value) +{ + int fd, len, ret = 0; + char buf[11]; + + fd = open(path, O_RDWR); + if (fd < 0) { + pr_perror("Failed to open %s", path); + return -1; + } + + len = snprintf(buf, sizeof(buf), "%d", value); /* an int needs at most 11 characters, so len <= sizeof(buf) */ + + if (write(fd, buf, len) < 0) { /* send only the formatted digits, not the unset tail of buf */ + pr_perror("Write %s to %s failed", buf, path); + ret = -1; + } + + close(fd); + return ret; +} + + +int main(int argc, char *argv[]) +{ + int ret; + int new_oom_score_adj; + + test_init(argc, argv); + + if (set_oom_score_adj(oom_score_adj_self, test_value) < 0) + return -1; + + test_daemon(); + test_waitsig(); + + new_oom_score_adj = get_oom_score_adj(oom_score_adj_self, &ret); + if (ret < 0) + return -1; + + if (new_oom_score_adj != test_value) { + fail("OOM score value 
%d is different after restore: %d\n", + test_value, new_oom_score_adj); + return -1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/overmount_dev.c b/CRIU_code/test/zdtm/static/overmount_dev.c new file mode 100644 index 0000000..a8dc8b8 --- /dev/null +++ b/CRIU_code/test/zdtm/static/overmount_dev.c @@ -0,0 +1,93 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that we can migrate with a device special file " + "open in a directory which has been mounted over by " + "another filesystem"; +const char *test_author = "Roman Kagan "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + int fd; + char path[256]; + struct stat st; + /* /dev/null params - sure to exist in a VPS */ + mode_t mode = S_IFCHR | 0700; + dev_t dev = makedev(1, 3); + + test_init(argc, argv); + + if (snprintf(path, sizeof(path), "%s/foo", dirname) >= sizeof(path)) { + pr_perror("directory name \"%s\"is too long", dirname); + exit(1); + } + + if (mkdir(dirname, 0700)) { + pr_perror("can't make directory %s", dirname); + exit(1); + } + + if (mknod(path, mode, dev)) { + pr_perror("can't make device file \"%s\"", path); + exit(1); + } + + fd = open(path, O_RDWR); + if (fd < 0) { + pr_perror("can't open %s", path); + goto rmdir; + } + + if (mount("rien", dirname, "tmpfs", 0, 0) < 0) { + pr_perror("can't mount tmpfs over %s", dirname); + goto cleanup; + } + + test_daemon(); + test_waitsig(); + + if (umount(dirname) < 0) { + fail("can't umount %s: %m", dirname); + goto cleanup; + } + + if (close(fd) < 0) { + fail("can't close %s: %m", path); + goto unlink; + } + + if (stat(path, &st) < 0) { + fail("can't stat %s: %m", path); + goto unlink; + } + + if (st.st_mode != mode || st.st_rdev != dev) { + fail("%s is no longer the device file we had", path); + goto unlink; + } + + if (unlink(path) < 0) { + fail("can't unlink %s: %m", path); + goto rmdir; + 
} + + pass(); + goto rmdir; +cleanup: + close(fd); +unlink: + unlink(path); +rmdir: + rmdir(dirname); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/overmount_dev.desc b/CRIU_code/test/zdtm/static/overmount_dev.desc new file mode 100644 index 0000000..c739fc9 --- /dev/null +++ b/CRIU_code/test/zdtm/static/overmount_dev.desc @@ -0,0 +1 @@ +{'flavor' : "ns", 'flags': 'suid crfail'} diff --git a/CRIU_code/test/zdtm/static/overmount_fifo.c b/CRIU_code/test/zdtm/static/overmount_fifo.c new file mode 100644 index 0000000..e452f1b --- /dev/null +++ b/CRIU_code/test/zdtm/static/overmount_fifo.c @@ -0,0 +1,90 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that we can migrate with a named pipe " + "open in a directory which has been mounted over by " + "another filesystem"; +const char *test_author = "Roman Kagan "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + int fd; + char path[256]; + struct stat st; + mode_t mode = S_IFIFO | 0700; + + test_init(argc, argv); + + if (snprintf(path, sizeof(path), "%s/foo", dirname) >= sizeof(path)) { + pr_perror("directory name \"%s\"is too long", dirname); + exit(1); + } + + if (mkdir(dirname, 0700)) { + pr_perror("can't make directory %s", dirname); + exit(1); + } + + if (mknod(path, mode, 0)) { + pr_perror("can't make fifo \"%s\"", path); + exit(1); + } + + fd = open(path, O_RDWR); + if (fd < 0) { + pr_perror("can't open %s", path); + goto rmdir; + } + + if (mount("rien", dirname, "tmpfs", 0, 0) < 0) { + pr_perror("can't mount tmpfs over %s", dirname); + goto cleanup; + } + + test_daemon(); + test_waitsig(); + + if (umount(dirname) < 0) { + fail("can't umount %s: %m", dirname); + goto cleanup; + } + + if (close(fd) < 0) { + fail("can't close %s: %m", path); + goto unlink; + } + + if (stat(path, &st) < 0) { + fail("can't stat %s: %m", path); + goto unlink; + } + + if (st.st_mode != mode) { + 
fail("%s is no longer the fifo we had", path); + goto unlink; + } + + if (unlink(path) < 0) { + fail("can't unlink %s: %m", path); + goto rmdir; + } + + pass(); + goto rmdir; +cleanup: + close(fd); +unlink: + unlink(path); +rmdir: + rmdir(dirname); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/overmount_fifo.desc b/CRIU_code/test/zdtm/static/overmount_fifo.desc new file mode 100644 index 0000000..1ba68c3 --- /dev/null +++ b/CRIU_code/test/zdtm/static/overmount_fifo.desc @@ -0,0 +1 @@ +{'flavor' : 'ns uns', 'flags': 'suid crfail'} diff --git a/CRIU_code/test/zdtm/static/overmount_file.c b/CRIU_code/test/zdtm/static/overmount_file.c new file mode 100644 index 0000000..5c370e0 --- /dev/null +++ b/CRIU_code/test/zdtm/static/overmount_file.c @@ -0,0 +1,73 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that we can't migrate with a file open in a " + "directory which has been mounted over by another " + "filesystem"; +const char *test_author = "Roman Kagan "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + int fd; + char path[256]; + + test_init(argc, argv); + + if (snprintf(path, sizeof(path), "%s/foo", dirname) >= sizeof(path)) { + pr_perror("directory name \"%s\"is too long", dirname); + exit(1); + } + + if (mkdir(dirname, 0700)) { + pr_perror("can't make directory %s", dirname); + exit(1); + } + + fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0644); + if (fd < 0) { + pr_perror("can't open %s", path); + goto rmdir; + } + + if (mount("rien", dirname, "tmpfs", 0, 0) < 0) { + pr_perror("can't mount tmpfs over %s", dirname); + goto cleanup; + } + + test_daemon(); + test_waitsig(); + + if (umount(dirname) < 0) { + fail("can't umount %s: %m", dirname); + goto cleanup; + } + + if (close(fd) < 0) { + fail("can't close %s: %m", path); + goto unlink; + } + + if (unlink(path) < 0) { + fail("can't unlink %s: %m", path); + goto rmdir; + } + + 
pass(); + goto rmdir; +cleanup: + close(fd); +unlink: + unlink(path); +rmdir: + rmdir(dirname); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/overmount_file.desc b/CRIU_code/test/zdtm/static/overmount_file.desc new file mode 100644 index 0000000..1ba68c3 --- /dev/null +++ b/CRIU_code/test/zdtm/static/overmount_file.desc @@ -0,0 +1 @@ +{'flavor' : 'ns uns', 'flags': 'suid crfail'} diff --git a/CRIU_code/test/zdtm/static/overmount_sock.c b/CRIU_code/test/zdtm/static/overmount_sock.c new file mode 100644 index 0000000..94e4c7e --- /dev/null +++ b/CRIU_code/test/zdtm/static/overmount_sock.c @@ -0,0 +1,207 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that we can migrate with a unix socket " + "bound in a directory which has been mounted over by" + " another filesystem"; +const char *test_author = "Roman Kagan "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +static int fill_sock_name(struct sockaddr_un *name, const char *filename) +{ + if (strlen(filename) >= sizeof(name->sun_path)) + return -1; + + name->sun_family = AF_LOCAL; + strcpy(name->sun_path, filename); + return 0; +} + +static int setup_srv_sock(const char *filename) +{ + struct sockaddr_un name; + int sock; + + if (fill_sock_name(&name, filename) < 0) { + pr_perror("filename \"%s\" is too long", filename); + return -1; + } + + sock = socket(PF_LOCAL, SOCK_STREAM, 0); + if (sock < 0) { + pr_perror("can't create socket"); + return -1; + } + + if (bind(sock, (struct sockaddr *) &name, SUN_LEN(&name)) < 0) { + pr_perror("can't bind to socket \"%s\"", filename); + goto err; + } + + if (listen(sock, 1) < 0) { + pr_perror("can't listen on a socket \"%s\"", filename); + goto err; + } + + return sock; +err: + close(sock); + return -1; +} + +static int setup_clnt_sock(const char *filename) +{ + struct sockaddr_un name; + int sock; + + if (fill_sock_name(&name, filename) < 0) + 
return -1; + + sock = socket(PF_LOCAL, SOCK_STREAM, 0); + if (sock < 0) + return -1; + + if (connect(sock, (struct sockaddr *) &name, SUN_LEN(&name)) < 0) + goto err; + + return sock; +err: + close(sock); + return -1; +} + +int main(int argc, char ** argv) +{ + int sock, acc_sock, ret; + char path[256]; + pid_t pid; + uint32_t crc; + uint8_t buf[1000]; + + test_init(argc, argv); + + if (snprintf(path, sizeof(path), "%s/foo", dirname) >= sizeof(path)) { + pr_perror("directory name \"%s\"is too long", dirname); + exit(1); + } + + if (mkdir(dirname, 0700)) { + pr_perror("can't make directory %s", dirname); + exit(1); + } + + sock = setup_srv_sock(path); + if (sock < 0) + goto out; + + pid = fork(); + if (pid < 0) { + pr_perror("can't fork"); + goto out; + } + + if (pid == 0) { /* child writes to the overmounted socket and returns */ + close(sock); + + sock = setup_clnt_sock(path); + if (sock < 0) + _exit(1); + + test_waitsig(); + + crc = ~0; + datagen(buf, sizeof(buf), &crc); + if (write(sock, buf, sizeof(buf)) != sizeof(buf)) + _exit(errno); + + close(sock); + _exit(0); + } + + acc_sock = accept(sock, NULL, NULL); + if (acc_sock < 0) { + pr_perror("can't accept() the connection on \"%s\"", path); + goto out_kill; + } + + close(sock); + sock = acc_sock; + + if (mount("rien", dirname, "tmpfs", 0, 0) < 0) { + pr_perror("can't mount tmpfs over %s", dirname); + goto out_kill; + } + + test_daemon(); + test_waitsig(); + + if (kill(pid, SIGTERM)) { + fail("terminating the child failed: %m\n"); + goto out; + } + + if (wait(&ret) != pid) { + fail("wait() returned wrong pid %d: %m\n", pid); + goto out; + } + + if (WIFEXITED(ret)) { + ret = WEXITSTATUS(ret); + if (ret) { + fail("child exited with nonzero code %d (%s)\n", ret, + strerror(ret)); + goto out; + } + } + if (WIFSIGNALED(ret)) { + fail("child exited on unexpected signal %d\n", WTERMSIG(ret)); + goto out; + } + + if (read(sock, buf, sizeof(buf)) != sizeof(buf)) { + fail("can't read %s: %m\n", path); + goto out; + } + + 
crc = ~0; + if (datachk(buf, sizeof(buf), &crc)) { + fail("CRC mismatch\n"); + goto out; + } + + if (umount(dirname) < 0) { + fail("can't umount %s: %m", dirname); + goto out; + } + + if (close(sock) < 0) { + fail("can't close %s: %m", path); + goto out; + } + + if (unlink(path) < 0) { + fail("can't unlink %s: %m", path); + goto out; + } + + pass(); + +out_kill: + kill(pid, SIGKILL); +out: + close(sock); + unlink(path); + rmdir(dirname); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/overmount_sock.desc b/CRIU_code/test/zdtm/static/overmount_sock.desc new file mode 100644 index 0000000..1ba68c3 --- /dev/null +++ b/CRIU_code/test/zdtm/static/overmount_sock.desc @@ -0,0 +1 @@ +{'flavor' : 'ns uns', 'flags': 'suid crfail'} diff --git a/CRIU_code/test/zdtm/static/overmount_with_shared_parent.c b/CRIU_code/test/zdtm/static/overmount_with_shared_parent.c new file mode 100644 index 0000000..1fcb5a5 --- /dev/null +++ b/CRIU_code/test/zdtm/static/overmount_with_shared_parent.c @@ -0,0 +1,69 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check overmount on shared parent works"; +const char *test_author = "Pavel Tikhomirov "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + char dir_a[PATH_MAX], dir_b[PATH_MAX], dir_c[PATH_MAX]; + char dir_d[PATH_MAX], dir_a_c[PATH_MAX]; + + test_init(argc, argv); + + mkdir(dirname, 0700); + + if (mount(dirname, dirname, NULL, MS_BIND, NULL)) { + pr_perror("Unable to self bind mount %s", dirname); + return 1; + } + + if (mount(NULL, dirname, NULL, MS_SHARED, NULL)) { + pr_perror("Unable to make shared mount %s", dirname); + return 1; + } + + ssprintf(dir_a, "%s/a", dirname); + ssprintf(dir_d, "%s/d", dirname); + mkdir(dir_a, 0700); + mkdir(dir_d, 0700); + + ssprintf(dir_b, "%s/b", dir_a); + ssprintf(dir_c, "%s/c", dir_b); + mkdir(dir_b, 0700); + mkdir(dir_c, 0700); + + if (mount(dir_b, dir_a, NULL, MS_BIND, NULL)) { + pr_perror("Unable to 
bind mount %s to %s", dir_b, dir_a); + return 1; + } + + ssprintf(dir_a_c, "%s/c", dir_a); + + if (mount(dir_d, dir_a_c, NULL, MS_BIND, NULL)) { + pr_perror("Unable to bind mount %s to %s", dir_d, dir_a_c); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (umount(dir_a_c)) { + pr_perror("Unable to umount %s", dir_a_c); + return 1; + } + + if (umount(dir_a)) { + pr_perror("Unable to umount %s", dir_a); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/overmount_with_shared_parent.desc b/CRIU_code/test/zdtm/static/overmount_with_shared_parent.desc new file mode 100644 index 0000000..7657ba4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/overmount_with_shared_parent.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/overmounted_file.c b/CRIU_code/test/zdtm/static/overmounted_file.c new file mode 100644 index 0000000..ee1d1c5 --- /dev/null +++ b/CRIU_code/test/zdtm/static/overmounted_file.c @@ -0,0 +1,109 @@ +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check open file on overmounted mounts doesn't dump"; +const char *test_author = "Pavel Tikhomirov "; + +#define DATA "Data" + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + char overmounted[PATH_MAX]; + char buf[sizeof(DATA)]; + char file[PATH_MAX]; + int fd; + + test_init(argc, argv); + + if (mkdir(dirname, 0700)) { + pr_perror("mkdir"); + return 1; + } + + if (mount("zdtm_fs", dirname, "tmpfs", 0, NULL)) { + pr_perror("mount"); + return 1; + } + + if (mount(NULL, dirname, NULL, MS_PRIVATE, NULL)) { + pr_perror("mount"); + return 1; + } + + ssprintf(overmounted, "%s/overmounted", dirname); + if (mkdir(overmounted, 0700)) { + pr_perror("mkdir"); + return 1; + } + + if (mount("overmounted", overmounted, "tmpfs", 0, NULL)) { + pr_perror("mount"); + return 1; + } + + ssprintf(file, "%s/file", overmounted); + fd = 
open(file, O_CREAT|O_WRONLY, 0600); + if (fd < 0) { + pr_perror("open"); + return 1; + } + + if (write(fd, DATA, sizeof(DATA)) != sizeof(DATA)) { + pr_perror("write"); + return 1; + } + close(fd); + + fd = open(file, O_RDONLY); + if (fd < 0) { + pr_perror("open"); + return 1; + } + + if (mount(overmounted, overmounted, NULL, MS_BIND, NULL)) { + pr_perror("mount"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (read(fd, buf, sizeof(DATA)) != sizeof(DATA)) { + fail("Can't read from file"); + return 1; + } + + if (strcmp(buf, DATA)) { + fail("Wrong data in a file"); + return 1; + } + + close(fd); + + if (umount(overmounted)) { + pr_perror("umount"); + return 1; + } + + if (umount(overmounted)) { + pr_perror("umount"); + return 1; + } + + if (umount(dirname)) { + pr_perror("umount"); + return 1; + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/overmounted_file.desc b/CRIU_code/test/zdtm/static/overmounted_file.desc new file mode 100644 index 0000000..0d8b7f2 --- /dev/null +++ b/CRIU_code/test/zdtm/static/overmounted_file.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid crfail'} diff --git a/CRIU_code/test/zdtm/static/packet_sock.c b/CRIU_code/test/zdtm/static/packet_sock.c new file mode 100644 index 0000000..66175d4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/packet_sock.c @@ -0,0 +1,301 @@ +#include "zdtmtst.h" + +const char *test_doc = "static test for packet sockets"; +const char *test_author = "Pavel Emelyanov "; + +/* + * Description: + * Create and bind several packet sockets, check thet getname + * reports same result before and after c/r cycle. This is enough + * for _basic_ packet functionality only, but still. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define SK_RESERVE 8 +#define DEF_FANOUT 13 + +#ifndef PACKET_FANOUT +#define PACKET_FANOUT 18 +#endif + +static int test_sockaddr(int n, struct sockaddr_ll *have, struct sockaddr_ll *want) +{ + if (have->sll_family != want->sll_family) { + fail("%d Family mismatch %d/%d", n, + (int)have->sll_family, (int)want->sll_family); + return 1; + } + + if (have->sll_protocol != want->sll_protocol) { + fail("%d Proto mismatch %d/%d", n, + (int)have->sll_protocol, (int)want->sll_protocol); + return 1; + } + + if (have->sll_ifindex != want->sll_ifindex) { + fail("%d Index mismatch %d/%d", n, + have->sll_ifindex, want->sll_ifindex); + return 1; + } + + /* all the others are derivatives from dev */ + return 0; +} + +#ifndef MAX_ADDR_LEN +#define MAX_ADDR_LEN 32 +#endif + +struct packet_mreq_max { + int mr_ifindex; + unsigned short mr_type; + unsigned short mr_alen; + unsigned char mr_address[MAX_ADDR_LEN]; +}; + +#define LO_ADDR_LEN 6 + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,2,0) + +struct tpacket_req3 { + unsigned int tp_block_size; + unsigned int tp_block_nr; + unsigned int tp_frame_size; + unsigned int tp_frame_nr; + unsigned int tp_retire_blk_tov; + unsigned int tp_sizeof_priv; + unsigned int tp_feature_req_word; +}; + +#endif + +int main(int argc, char **argv) +{ + int sk1, sk2; + struct sockaddr_ll addr, addr1, addr2; + socklen_t alen; + int ver, rsv, yes; + struct packet_mreq_max mreq; + struct tpacket_req3 ring; + + test_init(argc, argv); + + sk1 = socket(PF_PACKET, SOCK_RAW, 0); + if (sk1 < 0) { + pr_perror("Can't create socket 1"); + return 1; + } + + sk2 = socket(PF_PACKET, SOCK_DGRAM, htons(ETH_P_IP)); + if (sk2 < 0) { + pr_perror("Can't create socket 2"); + return 1; + } + + memset(&addr, 0, sizeof(addr)); + addr.sll_family = AF_PACKET; + addr.sll_ifindex = 1; /* loopback should be 1 in all namespaces */ + if (bind(sk2, 
(struct sockaddr *)&addr, sizeof(addr)) < 0) { + pr_perror("Can't bind socket"); + return 1; + } + + alen = sizeof(addr1); + if (getsockname(sk1, (struct sockaddr *)&addr1, &alen) < 0) { + pr_perror("Can't get sockname 1"); + return 1; + } + + alen = sizeof(addr2); + if (getsockname(sk2, (struct sockaddr *)&addr2, &alen) < 0) { + pr_perror("Can't get sockname 2"); + return 1; + } + + ver = TPACKET_V2; + if (setsockopt(sk1, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver)) < 0) { + pr_perror("Can't set version"); + return 1; + } + + yes = 1; + if (setsockopt(sk1, SOL_PACKET, PACKET_AUXDATA, &yes, sizeof(yes)) < 0) { + pr_perror("Can't set auxdata"); + return 1; + } + + memset(&ring, 0, sizeof(ring)); + ring.tp_block_size = PAGE_SIZE; + ring.tp_block_nr = 1; + ring.tp_frame_size = 1024; + ring.tp_frame_nr = (ring.tp_block_size / ring.tp_frame_size) * ring.tp_block_nr; + if (setsockopt(sk1, SOL_PACKET, PACKET_RX_RING, &ring, sizeof(ring)) < 0) { + pr_perror("Can't set rx ring"); + return 1; + } + + rsv = SK_RESERVE; + if (setsockopt(sk2, SOL_PACKET, PACKET_RESERVE, &rsv, sizeof(rsv)) < 0) { + pr_perror("Can't set reserve"); + return 1; + } + + yes = 1; + if (setsockopt(sk2, SOL_PACKET, PACKET_ORIGDEV, &yes, sizeof(yes)) < 0) { + pr_perror("Can't set origdev"); + return 1; + } + + yes = DEF_FANOUT; + if (setsockopt(sk2, SOL_PACKET, PACKET_FANOUT, &yes, sizeof(yes)) < 0) { + pr_perror("Can't configure fanout"); + return 1; + } + + memset(&mreq, 0, sizeof(mreq)); + mreq.mr_ifindex = 1; + mreq.mr_type = PACKET_MR_PROMISC; + if (setsockopt(sk1, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0) { + pr_perror("Can't add promisc member"); + return 1; + } + + memset(&mreq, 0, sizeof(mreq)); + mreq.mr_ifindex = 1; + mreq.mr_type = PACKET_MR_UNICAST; + mreq.mr_alen = LO_ADDR_LEN; + if (setsockopt(sk2, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0) { + pr_perror("Can't add ucast member"); + return 1; + } + + memset(&ring, 0, sizeof(ring)); + 
ring.tp_block_size = PAGE_SIZE; + ring.tp_block_nr = 1; + ring.tp_frame_size = 1024; + ring.tp_frame_nr = (ring.tp_block_size / ring.tp_frame_size) * ring.tp_block_nr; + if (setsockopt(sk2, SOL_PACKET, PACKET_TX_RING, &ring, sizeof(ring)) < 0) { + pr_perror("Can't set tx ring"); + return 1; + } + + test_daemon(); + test_waitsig(); + + alen = sizeof(addr); + if (getsockname(sk1, (struct sockaddr *)&addr, &alen) < 0) { + fail("Can't get sockname 1 rst"); + return 1; + } + + if (test_sockaddr(1, &addr, &addr1)) + return 1; + + alen = sizeof(ver); + if (getsockopt(sk1, SOL_PACKET, PACKET_VERSION, &ver, &alen) < 0) { + fail("Can't get sockopt ver %m"); + return 1; + } + + if (ver != TPACKET_V2) { + fail("Version mismatch have %d, want %d\n", ver, TPACKET_V2); + return 1; + } + + alen = sizeof(yes); + if (getsockopt(sk1, SOL_PACKET, PACKET_AUXDATA, &yes, &alen) < 0) { + fail("Can't get sockopt auxdata %m"); + return 1; + } + + if (yes != 1) { + fail("Auxdata not ON"); + return 1; + } + + memset(&mreq, 0, sizeof(mreq)); + mreq.mr_ifindex = 1; + mreq.mr_type = PACKET_MR_PROMISC; + if (setsockopt(sk1, SOL_PACKET, PACKET_DROP_MEMBERSHIP, &mreq, sizeof(mreq)) < 0) { + fail("Promisc member not kept"); + return 1; + } + + alen = sizeof(yes); + if (getsockopt(sk1, SOL_PACKET, PACKET_FANOUT, &yes, &alen) < 0) { + fail("Can't read fanout back %m"); + return 1; + } + + if (yes != 0) { + fail("Fanout screwed up to %x", yes); + return 1; + } + + alen = sizeof(addr); + if (getsockname(sk2, (struct sockaddr *)&addr, &alen) < 0) { + fail("Can't get sockname 2 rst"); + return 1; + } + + if (test_sockaddr(2, &addr, &addr2)) + return 1; + + alen = sizeof(rsv); + if (getsockopt(sk2, SOL_PACKET, PACKET_RESERVE, &rsv, &alen) < 0) { + fail("Can't get sockopt rsv %m"); + return 1; + } + + alen = sizeof(yes); + if (getsockopt(sk2, SOL_PACKET, PACKET_ORIGDEV, &yes, &alen) < 0) { + fail("Can't get sockopt origdev %m"); + return 1; + } + + if (yes != 1) { + fail("OrigDev not ON"); + return 1; + } + 
+ if (rsv != SK_RESERVE) { + fail("Reserve mismatch have %d, want %d\n", rsv, SK_RESERVE); + return 1; + } + + memset(&mreq, 0, sizeof(mreq)); + mreq.mr_ifindex = 1; + mreq.mr_type = PACKET_MR_UNICAST; + mreq.mr_alen = LO_ADDR_LEN; + if (setsockopt(sk2, SOL_PACKET, PACKET_DROP_MEMBERSHIP, &mreq, sizeof(mreq)) < 0) { + fail("Ucast member not kept"); + return 1; + } + + alen = sizeof(yes); + if (getsockopt(sk2, SOL_PACKET, PACKET_FANOUT, &yes, &alen) < 0) { + fail("Can't read fanout2 back %m"); + return 1; + } + + if (yes != DEF_FANOUT) { + fail("Fanout2 screwed up to %x", yes); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/packet_sock.desc b/CRIU_code/test/zdtm/static/packet_sock.desc new file mode 100644 index 0000000..2eac7e6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/packet_sock.desc @@ -0,0 +1 @@ +{'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/packet_sock_mmap.c b/CRIU_code/test/zdtm/static/packet_sock_mmap.c new file mode 100644 index 0000000..2a82950 --- /dev/null +++ b/CRIU_code/test/zdtm/static/packet_sock_mmap.c @@ -0,0 +1,104 @@ +#include "zdtmtst.h" + +const char *test_doc = "static test for packet sockets mmaps"; +const char *test_author = "Pavel Emelyanov "; + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3,2,0) + +struct tpacket_req3 { + unsigned int tp_block_size; + unsigned int tp_block_nr; + unsigned int tp_frame_size; + unsigned int tp_frame_nr; + unsigned int tp_retire_blk_tov; + unsigned int tp_sizeof_priv; + unsigned int tp_feature_req_word; +}; + +#endif + +static void check_map_is_there(unsigned long addr, int sk) +{ + FILE *f; + char line[64]; + struct stat ss; + + fstat(sk, &ss); + f = fopen("/proc/self/maps", "r"); + while (fgets(line, sizeof(line), f) != NULL) { + unsigned long start; + int maj, min, ino; + + sscanf(line, "%lx-%*x %*s %*s %x:%x %d %*s", 
&start, &maj, &min, &ino); + if ((start == addr) && ss.st_dev == makedev(maj, min) && ss.st_ino == ino) { + pass(); + fclose(f); + return; + } + } + + fail("No socket mapping found"); +} + +int main(int argc, char **argv) +{ + int sk; + struct tpacket_req3 ring; + void *mem; + + test_init(argc, argv); + + sk = socket(PF_PACKET, SOCK_RAW, 0); + if (sk < 0) { + pr_perror("Can't create socket 1"); + return 1; + } + + memset(&ring, 0, sizeof(ring)); + ring.tp_block_size = PAGE_SIZE; + ring.tp_block_nr = 1; + ring.tp_frame_size = 1024; + ring.tp_frame_nr = (ring.tp_block_size / ring.tp_frame_size) * ring.tp_block_nr; + if (setsockopt(sk, SOL_PACKET, PACKET_RX_RING, &ring, sizeof(ring)) < 0) { + pr_perror("Can't set rx ring"); + return 1; + } + + memset(&ring, 0, sizeof(ring)); + ring.tp_block_size = PAGE_SIZE; + ring.tp_block_nr = 1; + ring.tp_frame_size = 1024; + ring.tp_frame_nr = (ring.tp_block_size / ring.tp_frame_size) * ring.tp_block_nr; + if (setsockopt(sk, SOL_PACKET, PACKET_TX_RING, &ring, sizeof(ring)) < 0) { + pr_perror("Can't set tx ring"); + return 1; + } + + mem = mmap(NULL, 2 * PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FILE, sk, 0); + if (mem == MAP_FAILED) { + pr_perror("Can't mmap socket"); + return 1; + } + + test_daemon(); + test_waitsig(); + + check_map_is_there((unsigned long)mem, sk); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/packet_sock_mmap.desc b/CRIU_code/test/zdtm/static/packet_sock_mmap.desc new file mode 100644 index 0000000..2eac7e6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/packet_sock_mmap.desc @@ -0,0 +1 @@ +{'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/packet_sock_spkt.c b/CRIU_code/test/zdtm/static/packet_sock_spkt.c new file mode 100644 index 0000000..55cd165 --- /dev/null +++ b/CRIU_code/test/zdtm/static/packet_sock_spkt.c @@ -0,0 +1,88 @@ +#include "zdtmtst.h" + +#include +#include +#include +#include +#include +#include +#include + +const char *test_doc = "Check bound and not bound 
SOCK_PACKET sockets"; +const char *test_author = "Gleb Valin "; + +struct ethframe { + struct ethhdr header; + char data[ETH_DATA_LEN]; +}; + +static int do_bind(int sk) +{ + struct sockaddr addr = {}; + + addr.sa_family = AF_PACKET; + strcpy(addr.sa_data, "lo"); + + return bind(sk, (struct sockaddr *) &addr, sizeof(addr)); +} + +static int check_socket_binding(int sk, char *dev) +{ + struct sockaddr addr = {}; + + socklen_t l = sizeof(addr); + + if (getsockname(sk, &addr, &l) < 0) + return -1; + + if (addr.sa_family != AF_PACKET) + return -1; + + if (strcmp(addr.sa_data, dev) != 0) + return -1; + + return 0; +} + +int main(int argc, char **argv) +{ + int sk1; + int sk2; + + test_init(argc, argv); + + sk1 = socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL)); + + if (sk1 < 0) { + pr_perror("Can't create socket 1"); + return 1; + } + + if (do_bind(sk1) < 0) { + pr_perror("Can't bind sosket 1"); + return 1; + } + + sk2 = socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL)); + + if (sk2 < 0) { + pr_perror("Can't create socket 2"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (check_socket_binding(sk1, "lo") < 0) { + fail("Socket 1 has wrong binding"); + return 1; + } + + if (check_socket_binding(sk2, "") < 0) { + fail("Socket 2 has wrong binding"); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/packet_sock_spkt.desc b/CRIU_code/test/zdtm/static/packet_sock_spkt.desc new file mode 100644 index 0000000..5adab69 --- /dev/null +++ b/CRIU_code/test/zdtm/static/packet_sock_spkt.desc @@ -0,0 +1 @@ +{'flavor':'h uns ns', 'flags' : 'suid'} diff --git a/CRIU_code/test/zdtm/static/pdeath_sig.c b/CRIU_code/test/zdtm/static/pdeath_sig.c new file mode 100644 index 0000000..0f7436f --- /dev/null +++ b/CRIU_code/test/zdtm/static/pdeath_sig.c @@ -0,0 +1,109 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that pdeath sig is preserved"; +const char *test_author = 
"Pavel Emelianov "; + +static int sigrecvd = 0; +static void sigh(int s, siginfo_t *i, void *d) +{ + sigrecvd = 1; +} + +#ifndef PR_SET_PDEATH_SIGNAL +#define PR_SET_PDEATH_SIGNAL 1 +#endif + +int main(int argc, char **argv) +{ + int pid, ret, pw[2], pr[2]; + + test_init(argc, argv); + + /* + * Here's what will happen here: + * + * me -(fork)-> P -(fork)-> C + * | | + * +-------------->-(pw)->-+ + * +-<-(pr)-<--------------+ + * + * We wait for C to prepare himself via pr. + * After C/R we kill P and close pw to wake up + * C. The we wait for it to report back via pr + * which signals has he received. + */ + + pipe(pw); + pipe(pr); + + pid = fork(); + if (pid == 0) { + pid = fork(); + if (pid == 0) { + struct sigaction sa = {}; + /* C */ + close(pw[1]); + close(pr[0]); + sa.sa_sigaction = sigh; + ret = sigaction(SIGUSR1, &sa, NULL); + if (ret == 0) + ret = prctl(PR_SET_PDEATH_SIGNAL, SIGUSR1, 0, 0, 0); + write(pr[1], &ret, sizeof(ret)); + read(pw[0], &ret, sizeof(ret)); + write(pr[1], &sigrecvd, sizeof(sigrecvd)); + } else { + /* P, pid == C */ + close(pw[0]); + close(pw[1]); + close(pr[0]); + close(pr[1]); + + /* Just hang */ + waitpid(pid, NULL, 0); + } + + exit(0); + } + + /* me, pid == P */ + close(pw[0]); + close(pr[1]); + + ret = -1; + read(pr[0], &ret, sizeof(ret)); + if (ret != 0) { + pr_perror("C start error"); + goto out; + } + + /* + * P didn't have time to close his pipes? + * That's OK, CRIU should C/R these knots. 
+ */ + + test_daemon(); + test_waitsig(); + +out: + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + close(pw[1]); + + if (ret == 0) { + read(pr[0], &ret, sizeof(ret)); + if (ret != 1) + fail("USR1 isn't delivered"); + else + pass(); + } + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/pid00.c b/CRIU_code/test/zdtm/static/pid00.c new file mode 100644 index 0000000..4fd3e09 --- /dev/null +++ b/CRIU_code/test/zdtm/static/pid00.c @@ -0,0 +1,93 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that p?pid and e?[ug]id didn't change"; +const char *test_author = "Pavel Emelianov "; + +int setfsuid(uid_t fsuid); +int setfsgid(uid_t fsgid); + +int main(int argc, char **argv) +{ + int pid, s_p[2], f_p[2], r_p[3]; + const uid_t w_ruid = 1, w_euid = 2, w_suid = 3, w_fsuid = w_euid; + const gid_t w_rgid = 5, w_egid = 6, w_sgid = 7, w_fsgid = 8; + uid_t rid, eid, sid, fsid; + char res = 'x'; + + test_init(argc, argv); + + pipe(s_p); + pipe(f_p); + pipe(r_p); + + pid = fork(); + if (pid == 0) { + close(s_p[0]); + close(f_p[1]); + close(r_p[0]); + + setresgid(w_rgid, w_egid, w_sgid); + setfsgid(w_fsgid); + setresuid(w_ruid, w_euid, w_suid); + /* fsuid change is impossible after above */ + + close(s_p[1]); + + read(f_p[0], &res, 1); + close(f_p[0]); + +#define CHECK_ID(__t, __w, __e) do { \ + if (__t##id != w_##__t##__w##id) { \ + res = __e; \ + goto bad; \ + } \ + } while (0) + + rid = eid = sid = fsid = 0; + getresuid(&rid, &eid, &sid); + fsid = setfsuid(w_euid); + CHECK_ID(r, u, '1'); + CHECK_ID(e, u, '2'); + CHECK_ID(s, u, '3'); + CHECK_ID(s, u, '3'); + CHECK_ID(fs, u, '4'); + + rid = eid = sid = fsid = 0; + getresgid(&rid, &eid, &sid); + fsid = setfsgid(w_fsgid); + CHECK_ID(r, g, '5'); + CHECK_ID(e, g, '6'); + CHECK_ID(s, g, '7'); + CHECK_ID(fs, g, '8'); + + res = '0'; +bad: + write(r_p[1], &res, 1); + close(r_p[1]); + _exit(0); + } + + close(f_p[0]); + close(s_p[1]); + close(r_p[1]); + + read(s_p[0], &res, 1); + close(s_p[0]); + + 
test_daemon(); + test_waitsig(); + + close(f_p[1]); + + read(r_p[0], &res, 1); + if (res == '0') + pass(); + else + fail("Fail: %c", res); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/pid00.desc b/CRIU_code/test/zdtm/static/pid00.desc new file mode 100644 index 0000000..2eac7e6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/pid00.desc @@ -0,0 +1 @@ +{'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/pid_file.c b/CRIU_code/test/zdtm/static/pid_file.c new file mode 100644 index 0000000..3ee6a39 --- /dev/null +++ b/CRIU_code/test/zdtm/static/pid_file.c @@ -0,0 +1,52 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that environment didn't change"; +const char *test_author = "Andrei Vagin "; + +int main(int argc, char **argv) +{ + int fd, fd2; + struct stat st, st2; + + test_init(argc, argv); + + fd = open("/proc/1/status", O_RDONLY); + if (fd < 0) { + pr_perror("Unable to open /proc/1/status"); + return 1; + } + + test_daemon(); + test_waitsig(); + + fd2 = open("/proc/1/status", O_RDONLY); + if (fd2 < 0) { + pr_perror("Unable to open /proc/1/status"); + return 1; + } + if (fstat(fd, &st)) { + pr_perror("fstat"); + return 1; + } + if (fstat(fd2, &st2)) { + pr_perror("fstat"); + return 1; + } + close(fd); + close(fd2); + + if (st.st_ino != st2.st_ino) { + fail("inode numbers mismatch"); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/pipe00.c b/CRIU_code/test/zdtm/static/pipe00.c new file mode 100644 index 0000000..dd487d0 --- /dev/null +++ b/CRIU_code/test/zdtm/static/pipe00.c @@ -0,0 +1,121 @@ +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Lock inversion"; +const char *test_author = "Andrey Vagin "; + +#define TEST_STRING "Hello world" + +int main(int argc, char ** argv) +{ + int pipe1[2]; + int pipe2[2]; + int ret; + pid_t pid; + char buf[sizeof(TEST_STRING)]; + task_waiter_t t; + + 
test_init(argc, argv); + + task_waiter_init(&t); + + ret = pipe(pipe1); + if (ret) + return 1; + + ret = pipe(pipe2); + if (ret) + return 1; + + pid = test_fork(); + if (pid < 0) { + pr_perror("Can't fork"); + exit(1); + } else if (pid == 0) { + if (dup2(pipe1[1], 11) == -1 || dup2(pipe2[0], 12) == -1) { + pr_perror("dup2 failed"); + return 1; + } + } else { + if (dup2(pipe1[0], 12) == -1 || dup2(pipe2[1], 11) == -1) { + pr_perror("dup2 failed"); + goto err; + } + } + + close(pipe2[0]); + close(pipe2[1]); + close(pipe1[0]); + close(pipe1[1]); + + if (pid > 0) { + int status; + + task_waiter_wait4(&t, 1); + + test_daemon(); + + test_waitsig(); + + ret = read(12, buf, sizeof(TEST_STRING)); + if (ret != sizeof(TEST_STRING)) { + pr_perror("read failed: %d", ret); + goto err; + } + ret = write(11, TEST_STRING, sizeof(TEST_STRING)); + if (ret != sizeof(TEST_STRING)) { + pr_perror("write failed: %d", ret); + goto err; + } + close(11); + ret = read(12, buf, sizeof(TEST_STRING)); + if (ret != sizeof(TEST_STRING)) { + pr_perror("read failed: %d", ret); + goto err; + } + if (strcmp(TEST_STRING, buf)) { + pr_perror("data curruption"); + goto err; + } + + ret = wait(&status); + if (ret == -1 || !WIFEXITED(status) || WEXITSTATUS(status)) { + kill(pid, SIGKILL); + goto err; + } + + pass(); + } else { + task_waiter_complete(&t, 1); + ret = write(11, TEST_STRING, sizeof(TEST_STRING)); + if (ret != sizeof(TEST_STRING)) { + pr_perror("write failed: %d", ret); + return 1; + } + ret = read(12, buf, sizeof(TEST_STRING)); + if (ret != sizeof(TEST_STRING)) { + pr_perror("read failed: %d", ret); + return 1; + } + ret = write(11, TEST_STRING, sizeof(TEST_STRING)); + if (ret != sizeof(TEST_STRING)) { + pr_perror("write failed: %d", ret); + return 1; + } + close(11); + if (strcmp(TEST_STRING, buf)) { + pr_perror("data curruption"); + return 1; + } + } + + return 0; +err: + pr_perror("FAIL"); + return 1; +} diff --git a/CRIU_code/test/zdtm/static/pipe01.c b/CRIU_code/test/zdtm/static/pipe01.c 
new file mode 100644 index 0000000..ec8d1fb --- /dev/null +++ b/CRIU_code/test/zdtm/static/pipe01.c @@ -0,0 +1,132 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test that all data can be restored"; +const char *test_author = "Andrey Vagin "; + +#define TEST_STRING "Hello world" + +int main(int argc, char ** argv) +{ + int pfd[2], pfd_dup[2], pfd_rop[2]; + char path[PATH_MAX]; + int ret; + uint8_t buf[4096]; + uint32_t crc; + int flags, size = 0; + + test_init(argc, argv); + + crc = ~0; + datagen(buf, sizeof(buf), &crc); + + ret = pipe(pfd); + if (ret) { + pr_perror("pipe() failed"); + return 1; + } + + pfd_dup[0] = dup(pfd[0]); + pfd_dup[1] = dup(pfd[1]); + + snprintf(path, PATH_MAX, "/proc/self/fd/%d", pfd[0]); + pfd_rop[0] = open(path, O_RDONLY); + snprintf(path, PATH_MAX, "/proc/self/fd/%d", pfd[1]); + pfd_rop[1] = open(path, O_WRONLY); + + if (pfd_rop[0] == -1 || pfd_rop[1] == -1 || + pfd_dup[0] == -1 || pfd_dup[1] == -1) { + pr_perror("dup() failed"); + return 1; + } + + flags = fcntl(pfd[1], F_GETFL, 0); + if (flags == -1) { + pr_perror("fcntl() failed"); + return 1; + } + + ret = fcntl(pfd[1], F_SETFL, flags | O_NONBLOCK); + if (ret == -1) { + pr_perror("fcntl() failed"); + return 1; + } + + while (1) { + ret = write(pfd[1], buf, sizeof(buf)); + if (ret == -1) { + if (errno == EAGAIN) + break; + pr_perror("write() failed"); + goto err; + } + + size += ret; + } + + test_daemon(); + + test_waitsig(); + + flags = fcntl(pfd[1], F_GETFL, 0); + if (!(flags & O_NONBLOCK)) { + pr_perror("O_NONBLOCK is absent"); + goto err; + } + + flags = fcntl(pfd_dup[1], F_GETFL, 0); + if (!(flags & O_NONBLOCK)) { + pr_perror("O_NONBLOCK is absent"); + goto err; + } + + flags = fcntl(pfd_rop[1], F_GETFL, 0); + if (flags & O_NONBLOCK) { + pr_perror("O_NONBLOCK appeared"); + goto err; + } + + if (close(pfd[1]) == -1) { + pr_perror("close() failed"); + goto err; + } + + 
close(pfd_dup[1]); + close(pfd_rop[1]); + + while (1) { + ret = read(pfd[0], buf, sizeof(buf)); + if (ret == 0) + break; + if (ret == -1) { + pr_perror("read() failed"); + goto err; + } + size -= ret; + + crc = ~0; + ret = datachk(buf, sizeof(buf), &crc); + if (ret) { + fail("CRC mismatch\n"); + goto err; + } + } + + if (size) + goto err; + + pass(); + return 0; +err: + fail(); + return 1; +} diff --git a/CRIU_code/test/zdtm/static/pipe02.c b/CRIU_code/test/zdtm/static/pipe02.c new file mode 100644 index 0000000..2a8cca7 --- /dev/null +++ b/CRIU_code/test/zdtm/static/pipe02.c @@ -0,0 +1,61 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Create two unshared descriptor for the one end of a pipe"; +const char *test_author = "Andrey Vagin "; + +int main(int argc, char ** argv) +{ + int p[2], fd; + int ret; + char path[PATH_MAX]; + int flags; + + test_init(argc, argv); + + ret = pipe(p); + if (ret) + return 1; + + snprintf(path, sizeof(path), "/proc/self/fd/%d", p[0]); + + fd = open(path, O_RDONLY); + if (fd == -1) { + pr_perror("open"); + return 1; + }; + + if (fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK) == -1) { + pr_perror("fcntl"); + return 1; + } + + test_daemon(); + + test_waitsig(); + + flags = fcntl(fd, F_GETFL, 0); + if ((flags & O_NONBLOCK) == 0) { + fail("O_NONBLOCK are not restored for %d", fd); + return 1; + } + + flags = fcntl(p[0], F_GETFL, 0); + if ((flags & O_NONBLOCK) != 0) { + fail("Unexpected O_NONBLOCK on %d", p[0]); + return 1; + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/pipe03.c b/CRIU_code/test/zdtm/static/pipe03.c new file mode 100644 index 0000000..a8721e9 --- /dev/null +++ b/CRIU_code/test/zdtm/static/pipe03.c @@ -0,0 +1,54 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that pipes with a non-default size can be c/r-ed"; +const char *test_author = "Andrei 
Vagin "; + +#define DATA_SIZE (1 << 20) +#define BUF_SIZE (4096) + +int main(int argc, char **argv) +{ + int p[2], i; + uint8_t buf[BUF_SIZE]; + uint32_t crc; + + test_init(argc, argv); + + if (pipe2(p, O_NONBLOCK)) { + pr_perror("pipe"); + return 1; + } + + if (fcntl(p[1], F_SETPIPE_SZ, DATA_SIZE) == -1) { + pr_perror("Unable to change a pipe size"); + return 1; + } + + crc = ~0; + datagen(buf, BUF_SIZE, &crc); + + for (i = 0; i < DATA_SIZE / BUF_SIZE; i++) { + if (write(p[1], buf, BUF_SIZE) != BUF_SIZE) { + pr_perror("write"); + return 1; + } + } + + test_daemon(); + test_waitsig(); + + for (i = 0; i < DATA_SIZE / BUF_SIZE; i++) { + if (read(p[0], buf, BUF_SIZE) != BUF_SIZE) { + pr_perror("read"); + return 1; + } + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/poll.c b/CRIU_code/test/zdtm/static/poll.c new file mode 100644 index 0000000..53d801a --- /dev/null +++ b/CRIU_code/test/zdtm/static/poll.c @@ -0,0 +1,138 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check poll() timeouts"; +const char *test_author = "Cyrill Gorcunov "; + +static void show_timestamp(char *prefix, unsigned long tv_sec, unsigned long tv_usec) +{ + test_msg("%8s: sec %20lu nsec %20lu\n", prefix, tv_sec, tv_usec); +} + +static void show_pollfd(struct pollfd *fds, size_t nfds) +{ + size_t i; + + for (i = 0; i < nfds; i++) { + test_msg("%2zu) fd: %2d events %2x revents %2x\n", + i, fds[i].fd, fds[i].events, fds[i].revents); + } +} + +int main(int argc, char *argv[]) +{ + struct timeval time1, time2; + struct timespec delay; + struct pollfd ufds[2]; + int pipes[2], ret; + int delta, status; + task_waiter_t t; + pid_t pid; + char *deltaenv; + + test_init(argc, argv); + task_waiter_init(&t); + + if (pipe(pipes)) { + pr_perror("Can't create pipes"); + exit(1); + } + + memset(ufds, 0, sizeof(ufds)); + ufds[0].fd = pipes[0]; + 
ufds[0].events = POLLIN; + + ufds[1].fd = pipes[1]; + ufds[1].events = POLLIN; + + show_pollfd(ufds, 2); + + if (gettimeofday(&time1, NULL)) { + pr_perror("Can't get first delta"); + exit(1); + } + show_timestamp("Init", time1.tv_sec, time1.tv_usec); + + pid = test_fork(); + if (pid < 0) { + pr_perror("Fork failed"); + exit(1); + } else if (pid == 0) { + if (gettimeofday(&time1, NULL)) { + pr_perror("Can't get from times"); + exit(1); + } + + show_timestamp("Start", time1.tv_sec, time1.tv_usec); + + task_waiter_complete(&t, 1); + deltaenv = getenv("ZDTM_DELTA"); + if (deltaenv) + delta = atoi(deltaenv); + else + delta = 5; + while (test_go()) { + ret = poll(ufds, 2, delta * 1000); + show_pollfd(ufds, 2); + if (ret && errno != EINTR) { + pr_perror("Poll-2 returned %d (events?!)", ret); + exit(1); + } + + if (gettimeofday(&time2, NULL)) { + pr_perror("Can't get from times"); + exit(1); + } + + show_timestamp("Stop", time2.tv_sec, time2.tv_usec); + show_timestamp("Diff", time2.tv_sec - time1.tv_sec, + time2.tv_usec - time1.tv_usec); + if ((time2.tv_sec - time1.tv_sec) > delta) { + fail("Delta is too big %lu", + (unsigned long)(time2.tv_sec - time1.tv_sec)); + exit(1); + } + } + exit(0); + } + + task_waiter_wait4(&t, 1); + + /* Wait to make sure we're in poll internals */ + delay.tv_sec = 1; + delay.tv_nsec = 0; + nanosleep(&delay, NULL); + + test_daemon(); + test_waitsig(); + kill(pid, SIGTERM); + + /* Return immediately if child run or stopped(by SIGSTOP) */ + if (waitpid(pid, &status, 0) == -1) { + pr_perror("Unable to wait child"); + exit(1); + } + + if (!WIFEXITED(status) || WEXITSTATUS(status)) { + fail("Child exited with error"); + exit(1); + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/poll.desc b/CRIU_code/test/zdtm/static/poll.desc new file mode 100644 index 0000000..63df42a --- /dev/null +++ b/CRIU_code/test/zdtm/static/poll.desc @@ -0,0 +1 @@ +{'flavor': 'h'} diff --git a/CRIU_code/test/zdtm/static/posix_timers.c 
b/CRIU_code/test/zdtm/static/posix_timers.c new file mode 100644 index 0000000..97fd3c0 --- /dev/null +++ b/CRIU_code/test/zdtm/static/posix_timers.c @@ -0,0 +1,443 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc ="Posix timers migration check"; +const char *test_author = "Kinsbursky Stanislav "; + +sigset_t mask; + +#define WRONG_SIGNAL 1 +#define WRONG_SI_PTR 2 +#define FAIL_OVERRUN 4 + +#define MAX_TIMER_DISPLACEMENT 10 +#define NO_PERIODIC + +#ifndef CLOCK_MONOTONIC_COARSE +# define CLOCK_MONOTONIC_COARSE 6 +#endif + +#ifndef CLOCK_BOOTTIME +# define CLOCK_BOOTTIME 7 +#endif + +#ifndef NO_PERIODIC +static void realtime_periodic_handler(int sig, siginfo_t *si, void *uc); +static void monotonic_periodic_handler(int sig, siginfo_t *si, void *uc); +static void boottime_periodic_handler(int sig, siginfo_t *si, void *uc); +#endif +static void realtime_oneshot_handler(int sig, siginfo_t *si, void *uc); +static void monotonic_oneshot_handler(int sig, siginfo_t *si, void *uc); +static void boottime_oneshot_handler(int sig, siginfo_t *si, void *uc); + +enum { +#ifndef NO_PERIODIC + REALTIME_PERIODIC_INFO, + MONOTONIC_PERIODIC_INFO, + BOOTTIME_PERIODIC_INFO, +#endif + REALTIME_ONESHOT_INFO, + MONOTONIC_ONESHOT_INFO, + BOOTTIME_ONESHOT_INFO, +}; + +static struct posix_timers_info { + char clock; + char *name; + void (*handler)(int sig, siginfo_t *si, void *uc); + int sig; + int oneshot; + int ms_int; + struct sigaction sa; + int handler_status; + int handler_cnt; + timer_t timerid; + int overrun; + struct timespec start, end; +} posix_timers[] = { +#ifndef NO_PERIODIC + [REALTIME_PERIODIC_INFO] = { + .clock = CLOCK_REALTIME, + .name = "REALTIME (periodic)", + .handler = realtime_periodic_handler, + .sig = SIGALRM, + .oneshot = 0, + .ms_int = 1, + }, + [MONOTONIC_PERIODIC_INFO] = { + .clock = CLOCK_MONOTONIC, + .name = "MONOTONIC (periodic)", + .handler = monotonic_periodic_handler, + .sig = SIGINT, + 
.oneshot = 0, + .ms_int = 3, + }, + [BOOTTIME_PERIODIC_INFO] = { + .clock = CLOCK_BOOTTIME, + .name = "BOOTTIME (periodic)", + .handler = boottime_periodic_handler, + .sig = SIGWINCH, + .oneshot = 0, + .ms_int = 3, + }, +#endif + [REALTIME_ONESHOT_INFO] = { + .clock = CLOCK_REALTIME, + .name = "REALTIME (oneshot)", + .handler = realtime_oneshot_handler, + .sig = SIGUSR1, + .oneshot = 1, + .ms_int = INT_MAX, + }, + [MONOTONIC_ONESHOT_INFO] = { + .clock = CLOCK_MONOTONIC, + .name = "MONOTONIC (oneshot)", + .handler = monotonic_oneshot_handler, + .sig = SIGUSR2, + .oneshot = 1, + .ms_int = INT_MAX, + }, + [BOOTTIME_ONESHOT_INFO] = { + .clock = CLOCK_BOOTTIME, + .name = "BOOTTIME (oneshot)", + .handler = boottime_oneshot_handler, + .sig = SIGPROF, + .oneshot = 1, + .ms_int = INT_MAX, + }, + { } +}; + +static int check_handler_status(struct posix_timers_info *info, + struct itimerspec *its, int ms_passed, int delta) +{ + int displacement; + int timer_ms; + + if (!info->handler_cnt && !info->oneshot) { + fail("%s: Signal handler wasn't called\n", info->name); + return -EINVAL; + } + + if (info->handler_status) { + if (info->handler_status & WRONG_SIGNAL) + fail("%s: Handler: wrong signal received\n", info->name); + if (info->handler_status & WRONG_SI_PTR) + fail("%s: Handler: wrong timer address\n", info->name); + if (info->handler_status & FAIL_OVERRUN) + fail("%s: Handler: failed to get overrun count\n", info->name); + return -1; + } + + if (!info->oneshot && !its->it_value.tv_sec && !its->it_value.tv_nsec) { + fail("%s: timer became unset\n", info->name); + return -EFAULT; + } + + if (info->oneshot && (its->it_interval.tv_sec || its->it_interval.tv_nsec)) { + fail("%s: timer became periodic\n", info->name); + return -EFAULT; + } + + if (!info->oneshot && !its->it_interval.tv_sec && !its->it_interval.tv_nsec) { + fail("%s: timer became oneshot\n", info->name); + return -EFAULT; + } + + if (info->oneshot) { + int val = its->it_value.tv_sec * 1000 + its->it_value.tv_nsec 
/ 1000 / 1000; + if (info->handler_cnt) { + if (val != 0) { + fail("%s: timer continues ticking after expiration\n", info->name); + return -EFAULT; + } + if (info->handler_cnt > 1) { + fail("%s: timer expired %d times\n", info->name, info->handler_cnt); + return -EFAULT; + } + if (info->ms_int > ms_passed) { + fail("%s: timer expired too early\n", info->name); + return -EFAULT; + } + return 0; + } + timer_ms = info->ms_int - val; + } else + timer_ms = (info->overrun + info->handler_cnt) * info->ms_int; + displacement = (abs(ms_passed - timer_ms) - delta) * 100 / ms_passed; + + test_msg("%20s: cpt/rst : %-8d msec\n", info->name, delta); + test_msg("%20s: Time passed (ms) : %-8d msec\n", info->name, ms_passed); + test_msg("%20s: Timer results : %-8d msec\n", info->name, timer_ms); + test_msg("%20s: Handler count : %d\n", info->name, info->handler_cnt); + + if (displacement > MAX_TIMER_DISPLACEMENT) { + fail("%32s: Time displacement: %d%% (max alloved: %d%%)\n", info->name, displacement, MAX_TIMER_DISPLACEMENT); + return -EFAULT; + } + return 0; +} + +static int check_timers(int delta, struct timespec *sleep_start, struct timespec *sleep_end) +{ + struct posix_timers_info *info = posix_timers; + int ms_passed; + int status = 0; + struct itimerspec val, oldval; + + if (sigprocmask(SIG_UNBLOCK, &mask, NULL) == -1) { + fail("Failed to unlock signal\n"); + return -errno; + } + + while (info->handler) { + memset(&val, 0, sizeof(val)); + if (timer_settime(info->timerid, 0, &val, &oldval) == -1) { + fail("%s: failed to reset timer\n", info->name); + return -errno; + } + + if (clock_gettime(info->clock, &info->end) == -1) { + fail("Can't get %s end time\n", info->name); + return -errno; + } + + /* + * Adjust with @total_sleep_time if needed. 
+ */ + if (info->clock == CLOCK_BOOTTIME) { + info->start.tv_sec -= sleep_start->tv_sec; + info->start.tv_nsec -= sleep_start->tv_nsec; + info->end.tv_sec -= sleep_end->tv_sec; + info->end.tv_nsec -= sleep_end->tv_nsec; + } + + ms_passed = (info->end.tv_sec - info->start.tv_sec) * 1000 + + (info->end.tv_nsec - info->start.tv_nsec) / (1000 * 1000); + + if (check_handler_status(info, &oldval, ms_passed, delta)) + status--; + info++; + } + return status; +} + +static void generic_handler(struct posix_timers_info *info, + struct posix_timers_info *real, int sig) +{ + int overrun; + + if (info == NULL) + info = &posix_timers[MONOTONIC_ONESHOT_INFO]; + + if (info != real) { + real->handler_status |= WRONG_SI_PTR; + return; + } + + if (sig != info->sig) + info->handler_status |= WRONG_SIGNAL; + + overrun = timer_getoverrun(info->timerid); + if (overrun == -1) + info->handler_status |= FAIL_OVERRUN; + else + info->overrun += overrun; + info->handler_cnt++; +} + +#ifndef NO_PERIODIC +static void monotonic_periodic_handler(int sig, siginfo_t *si, void *uc) +{ + generic_handler(si->si_value.sival_ptr, + &posix_timers[MONOTONIC_PERIODIC_INFO], sig); +} + +static void boottime_periodic_handler(int sig, siginfo_t *si, void *uc) +{ + generic_handler(si->si_value.sival_ptr, + &posix_timers[BOOTTIME_PERIODIC_INFO], sig); +} +#endif + +static void monotonic_oneshot_handler(int sig, siginfo_t *si, void *uc) +{ + generic_handler(si->si_value.sival_ptr, + &posix_timers[MONOTONIC_ONESHOT_INFO], sig); +} + +static void boottime_oneshot_handler(int sig, siginfo_t *si, void *uc) +{ + generic_handler(si->si_value.sival_ptr, + &posix_timers[BOOTTIME_ONESHOT_INFO], sig); +} + +#ifndef NO_PERIODIC +static void realtime_periodic_handler(int sig, siginfo_t *si, void *uc) +{ + generic_handler(si->si_value.sival_ptr, + &posix_timers[REALTIME_PERIODIC_INFO], sig); +} +#endif + +static void realtime_oneshot_handler(int sig, siginfo_t *si, void *uc) +{ + generic_handler(si->si_value.sival_ptr, + 
&posix_timers[REALTIME_ONESHOT_INFO], sig); +} + +static int setup_timers(void) +{ + int i; + int ret; + struct posix_timers_info *info = posix_timers; + struct sigevent sev; + struct itimerspec its; + + sigemptyset(&mask); + while(info->handler) { + sigaddset(&mask, info->sig); + info++; + } + + if (sigprocmask(SIG_SETMASK, &mask, NULL) == -1) { + pr_perror("Failed to unlock signal"); + return -errno; + } + + info = posix_timers; + while(info->handler) { + /* Add and delete fake timers to test restoring 'with holes' */ + timer_t timeridt; + for (i = 0; i < 10; i++) { + ret = timer_create(CLOCK_REALTIME, NULL, &timeridt); + if (ret < 0) { + pr_perror("Can't create temporary posix timer %lx", (long) timeridt); + return -errno; + } + ret = timer_delete(timeridt); + if (ret < 0) { + pr_perror("Can't remove temporaty posix timer %lx", (long) timeridt); + return -errno; + } + } + + info->sa.sa_flags = SA_SIGINFO; + info->sa.sa_sigaction = info->handler; + sigemptyset(&info->sa.sa_mask); + + if (sigaction(info->sig, &info->sa, NULL) == -1) { + pr_perror("Failed to set SIGALRM handler"); + return -errno; + } + + sev.sigev_notify = SIGEV_SIGNAL; + sev.sigev_signo = info->sig; + if (&posix_timers[MONOTONIC_ONESHOT_INFO] == info) + sev.sigev_value.sival_ptr = NULL; + else + sev.sigev_value.sival_ptr = info; + + if (timer_create(info->clock, &sev, &info->timerid) == -1) { + pr_perror("Can't create timer"); + return -errno; + } + + its.it_value.tv_sec = info->ms_int / 1000; + its.it_value.tv_nsec = info->ms_int % 1000 * 1000 * 1000; + if (!info->oneshot) { + its.it_interval.tv_sec = its.it_value.tv_sec; + its.it_interval.tv_nsec = its.it_value.tv_nsec; + } else + its.it_interval.tv_sec = its.it_interval.tv_nsec = 0; + + if (clock_gettime(info->clock, &info->start) == -1) { + pr_perror("Can't get %s start time", info->name); + return -errno; + } + + if (timer_settime(info->timerid, 0, &its, NULL) == -1) { + pr_perror("Can't set timer"); + return -errno; + } + info++; + } + 
return 0; +} + +/* + * Figure out @total_sleep_time, ie time the system was in hardware + * suspend mode, will need this value to exclude from boottime clock + * testing. + */ +static int get_total_sleep_time(struct timespec *tv, char *type) +{ + struct timespec boottime_coarse; + struct timespec boottime; + + if (clock_gettime(CLOCK_BOOTTIME, &boottime) == -1) { + pr_perror("Can't get CLOCK_BOOTTIME %s time", type); + return -errno; + } + + if (clock_gettime(CLOCK_MONOTONIC_COARSE, &boottime_coarse) == -1) { + pr_perror("Can't get CLOCK_MONOTONIC_COARSE %s time", type); + return -errno; + } + + tv->tv_sec = boottime.tv_sec - boottime_coarse.tv_sec; + tv->tv_nsec = boottime.tv_nsec - boottime_coarse.tv_nsec; + + test_msg("(%6s) boottime %lu " + "boottime-coarse %lu " + "total_sleep_time %lu\n", + type, + (long)boottime.tv_sec, + (long)boottime_coarse.tv_sec, + (long)tv->tv_sec); + + return 0; +} + +int main(int argc, char **argv) +{ + struct timespec sleep_start, sleep_end; + struct timespec start, end; + int err; + + test_init(argc, argv); + + err = setup_timers(); + if (err) + return err; + + usleep(500 * 1000); + + clock_gettime(CLOCK_REALTIME, &start); + err = get_total_sleep_time(&sleep_start, "start"); + if (err) + return err; + + test_daemon(); + test_waitsig(); + + clock_gettime(CLOCK_REALTIME, &end); + err = get_total_sleep_time(&sleep_end, "end"); + if (err) + return err; + err = check_timers((end.tv_sec - start.tv_sec) * 1000 + + (end.tv_nsec - start.tv_nsec) / 1000000, + &sleep_start, &sleep_end); + if (err) + return err; + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/private_bind_propagation.c b/CRIU_code/test/zdtm/static/private_bind_propagation.c new file mode 100644 index 0000000..ee4adbd --- /dev/null +++ b/CRIU_code/test/zdtm/static/private_bind_propagation.c @@ -0,0 +1,116 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check sharing options are restored for bindmounted shared group 
children"; +const char *test_author = "Pavel Tikhomirov "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + char share1[PATH_MAX], share2[PATH_MAX], source[PATH_MAX]; + char child1[PATH_MAX], child2[PATH_MAX]; + + test_init(argc, argv); + + if (mkdir(dirname, 0700)) { + pr_perror("mkdir"); + return 1; + } + + if (mount("zdtm_fs", dirname, "tmpfs", 0, NULL)) { + pr_perror("mount"); + return 1; + } + + if (mount(NULL, dirname, NULL, MS_SHARED, NULL)) { + pr_perror("mount"); + return 1; + } + + snprintf(share1, sizeof(share1), "%s/share1", dirname); + if (mkdir(share1, 0700)) { + pr_perror("mkdir"); + return 1; + } + + if (mount("share", share1, "tmpfs", 0, NULL)) { + pr_perror("mount"); + return 1; + } + + if (mount(NULL, share1, NULL, MS_SHARED, NULL)) { + pr_perror("mount"); + return 1; + } + + snprintf(share2, sizeof(share2), "%s/share2", dirname); + if (mkdir(share2, 0700)) { + pr_perror("mkdir"); + return 1; + } + + if (mount(share1, share2, NULL, MS_BIND, NULL)) { + pr_perror("mount"); + return 1; + } + + snprintf(source, sizeof(source), "%s/source", dirname); + if (mkdir(source, 0700)) { + pr_perror("mkdir"); + return 1; + } + + snprintf(child1, sizeof(child1), "%s/share1/child", dirname); + snprintf(child2, sizeof(child2), "%s/share1/child", dirname); + if (mkdir(child1, 0700)) { + pr_perror("mkdir"); + return 1; + } + + if (mount(source, child1, NULL, MS_BIND, NULL)) { + pr_perror("mount"); + return 1; + } + + if (mount(NULL, child1, NULL, MS_PRIVATE, NULL)) { + pr_perror("mount"); + return 1; + } + + if (mount(NULL, child2, NULL, MS_PRIVATE, NULL)) { + pr_perror("mount"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (umount(child1)) { + pr_perror("Unable to umount %s", child1); + return 1; + } + + if (umount(share1)) { + pr_perror("Unable to umount %s", share1); + return 1; + } + + if (umount(share2)) { + pr_perror("Unable to umount %s", share2); + return 1; + } + + if 
(umount(dirname)) { + pr_perror("Unable to umount %s", dirname); + return 1; + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/private_bind_propagation.desc b/CRIU_code/test/zdtm/static/private_bind_propagation.desc new file mode 100644 index 0000000..7657ba4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/private_bind_propagation.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/proc-self.c b/CRIU_code/test/zdtm/static/proc-self.c new file mode 100644 index 0000000..8292c08 --- /dev/null +++ b/CRIU_code/test/zdtm/static/proc-self.c @@ -0,0 +1,78 @@ +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check for /proc/self/ns path restore"; +const char *test_author = "Cyrill Gorcunov "; + +const char nspath[] = "/proc/self/ns/net"; + +int read_fd_link(int lfd, char *buf, size_t size) +{ + ssize_t ret; + char t[32]; + + snprintf(t, sizeof(t), "/proc/self/fd/%d", lfd); + ret = readlink(t, buf, size - 1); + if (ret < 0) { + pr_perror("Can't read link of fd %d", lfd); + return -1; + } + buf[ret] = 0; + + return 0; +} + +int main(int argc, char *argv[]) +{ + char path_orig[64], path_new[64]; + int fd_self, fd_new; + + test_init(argc, argv); + + memset(path_orig, 0, sizeof(path_orig)); + memset(path_new, 0, sizeof(path_new)); + + fd_self = open(nspath, O_RDONLY); + if (fd_self < 0) { + pr_perror("Can't open %s", nspath); + return -1; + } + + test_daemon(); + test_waitsig(); + + if (read_fd_link(fd_self, path_orig, sizeof(path_orig))) { + pr_perror("Can't fill original path"); + return -1; + } + + fd_new = open(nspath, O_RDONLY); + if (fd_new < 0) { + pr_perror("Can't open %s", nspath); + return -1; + } + + if (read_fd_link(fd_new, path_new, sizeof(path_new))) { + pr_perror("Can't fill new path"); + return -1; + } + + if (memcmp(path_orig, path_new, sizeof(path_orig))) { + fail("Paths mismatch %s %s\n", path_orig, 
path_new); + return -1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/pstree.c b/CRIU_code/test/zdtm/static/pstree.c new file mode 100644 index 0000000..ba94a27 --- /dev/null +++ b/CRIU_code/test/zdtm/static/pstree.c @@ -0,0 +1,87 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that environment didn't change"; +const char *test_author = "Pavel Emelianov "; + +int main(int argc, char **argv) +{ + char x; + int pid, ppid; + int sp[2], fp[2], rp[2]; + + test_init(argc, argv); + + if (pipe(sp) || pipe(fp) || pipe(rp)) { + pr_perror("pipe"); + return 1; + } + + pid = fork(); + if (pid == 0) { + close(sp[0]); + close(fp[1]); + close(rp[0]); + + pid = getpid(); + ppid = getppid(); + + close(sp[1]); + if (read(fp[0], &x, 1)) { + pr_perror("read"); + return 1; + } + close(fp[0]); + + if (pid != getpid()) + x = 'p'; + else if (ppid != getppid()) + x = 'P'; + else + x = '0'; + + if (write(rp[1], &x, 1) != 1) { + pr_perror("write"); + return 1; + } + close(rp[1]); + _exit(0); + } + + x = 'X'; + close(sp[1]); + close(fp[0]); + close(rp[1]); + + if (read(sp[0], &x, 1)) { + pr_perror("read"); + return 1; + } + + test_daemon(); + test_waitsig(); + + close(fp[1]); + if (read(rp[0], &x, 1) != 1) { + pr_perror("read"); + return 1; + } + close(rp[0]); + + if (x == 'X') + fail("Sync failed"); + else if (x == 'p') + fail("Pid failed"); + else if (x == 'P') + fail("PPid failed"); + else if (x != '0') + fail("Shit happened"); + else + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/pthread00.c b/CRIU_code/test/zdtm/static/pthread00.c new file mode 100644 index 0000000..2b248b2 --- /dev/null +++ b/CRIU_code/test/zdtm/static/pthread00.c @@ -0,0 +1,185 @@ +/* + * A simple testee program with threads + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +#define exit_group(code) \ + 
syscall(__NR_exit_group, code) + +const char *test_doc = "Create a few pthreads/forks and compare TLS and mmap data on restore\n"; +const char *test_author = "Cyrill Gorcunov = 0; i--) + task_waiter_wait4(&waiter[i], 1); + + test_daemon(); + test_waitsig(); + + for (i = 0; i < NR_WAITERS; i++) + task_waiter_complete(&waiter[i], 2); + + test_msg("Waiting while all threads are joined\n"); + pthread_join(th1, NULL); + pthread_join(th2, NULL); + + if (IS_PASSED(map, 0) && + IS_PASSED(map, 1) && + IS_PASSED(map, 2) && + IS_PASSED(map, 3) && + IS_PASSED(map, 4) && + IS_PASSED(map, 5)) + pass(); + else + fail(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/pthread01.c b/CRIU_code/test/zdtm/static/pthread01.c new file mode 100644 index 0000000..1e84463 --- /dev/null +++ b/CRIU_code/test/zdtm/static/pthread01.c @@ -0,0 +1,209 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +#define gettid() pthread_self() + +const char *test_doc = "Create a few pthreads and test TLS + blocked signals\n"; +const char *test_author = "Cyrill Gorcunov +#include +#include + +#include + +#include "zdtmtst.h" + +const char *test_doc = "Create a thread with a dead leader\n"; +const char *test_author = "Andrew Vagin +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check ptrace, if the child process's stopped by signal"; +const char *test_author = "Andrey Vagin "; + +typedef void (*sighandler_t)(int); + +int child_fd; +int child_exit = 0; +void sig_handler(int signo, siginfo_t *siginfo, void *data) +{ + int pid, ret; + test_msg("receive signal sig=%d from pid=%d\n", signo, siginfo->si_pid); + pid = siginfo->si_pid; + ret = write(child_fd, &pid, sizeof(pid)); + if (ret != sizeof(pid)) + pr_perror("write"); + child_exit = 1; +} + +int child(int fd) +{ + int ret = 0; + struct sigaction act = { + .sa_sigaction = sig_handler, + .sa_flags = 
SA_SIGINFO, + }, old_act; + + sigemptyset(&act.sa_mask); + + child_fd = fd; + + ret = sigaction(SIGUSR2, &act, &old_act); + if (ret < 0) { + pr_perror("signal failed"); + return 1; + } + + ret = ptrace(PTRACE_TRACEME, 0, 0, 0); + if (ret < 0) { + pr_perror("ptrace failed"); + return 1; + } + ret = write(child_fd, &ret, sizeof(ret)); + while (!child_exit) + ret = sleep(1); + close(child_fd); + return 0; +} + +int main(int argc, char ** argv) +{ + int ret, status = 0; + pid_t pid, spid, cpid; + int child_pipe[2]; + siginfo_t siginfo; + + test_init(argc, argv); + + ret = pipe(child_pipe); + if (ret < 0) { + pr_perror("pipe failed"); + return 1; + } + + cpid = test_fork(); + if (cpid < 0) { + pr_perror("fork failed"); + return 1; + } + else if (cpid == 0) { + close(child_pipe[0]); + return child(child_pipe[1]); + } + + close(child_pipe[1]); + test_msg("wait while child initialized"); + ret = read(child_pipe[0], &status, sizeof(status)); + if (ret != sizeof(status)) { + pr_perror("read from child process failed"); + return 1; + } + + spid = test_fork(); + if (spid < 0) { + pr_perror("Can't fork signal process"); + return 1; + } else if (spid == 0) { + test_msg("send signal to %d\n", cpid); + ret = kill(cpid, SIGUSR2); + if (ret < 0) { + pr_perror("kill failed"); + } + return 0; + } + + if (waitid(P_PID, spid, &siginfo, WEXITED | WNOWAIT)) { + pr_perror("Unable to wait spid"); + return 1; + } + if (waitid(P_PID, cpid, &siginfo, WSTOPPED | WNOWAIT)) { + pr_perror("Unable to wait cpid"); + return 1; + } + + test_daemon(); + test_waitsig(); + + while (1) { + test_msg("waiting...\n"); + pid = wait(&status); + if (pid < 0) { + if (errno != ECHILD) + pr_perror("wait"); + break; + } + + if (WIFSTOPPED(status)) { + + test_msg("pid=%d stopsig=%d\n", pid, WSTOPSIG(status)); + + ret = ptrace(PTRACE_GETSIGINFO, pid, 0, &siginfo); + if (ret < 0) { + pr_perror("ptrace failed"); + return 1; + } else + test_msg("pid=%d sends signal\n", siginfo.si_pid); + + ret = ptrace(PTRACE_CONT, pid, 
0, WSTOPSIG(status)); + if (ret < 0) + pr_perror("ptrace failed"); + + ret = read(child_pipe[0], &status, sizeof(status)); + if (ret != sizeof(status)) { + pr_perror("read"); + return 1; + } + + if (spid != siginfo.si_pid) { + fail("%d!=%d", cpid, siginfo.si_pid); + return 1; + } else if (status != siginfo.si_pid) { + fail("%d!=%d", status, siginfo.si_pid); + return 1; + } + } else if (WIFEXITED(status)) { + test_msg("pid = %d status = %d\n", pid, WEXITSTATUS(status)); + if (WEXITSTATUS(status)) + return 1; + } else if (WIFSIGNALED(status)) { + test_msg("pid = %d signal = %d\n", pid, WTERMSIG(status)); + return 1; + } + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/ptrace_sig.desc b/CRIU_code/test/zdtm/static/ptrace_sig.desc new file mode 100644 index 0000000..ded8987 --- /dev/null +++ b/CRIU_code/test/zdtm/static/ptrace_sig.desc @@ -0,0 +1 @@ +{'flags': 'crfail'} diff --git a/CRIU_code/test/zdtm/static/pty-console.c b/CRIU_code/test/zdtm/static/pty-console.c new file mode 100644 index 0000000..de7990f --- /dev/null +++ b/CRIU_code/test/zdtm/static/pty-console.c @@ -0,0 +1 @@ +pty01.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/pty-console.desc b/CRIU_code/test/zdtm/static/pty-console.desc new file mode 100644 index 0000000..fba5e87 --- /dev/null +++ b/CRIU_code/test/zdtm/static/pty-console.desc @@ -0,0 +1 @@ +{'flags': 'suid', 'flavor' : 'ns uns'} diff --git a/CRIU_code/test/zdtm/static/pty00.c b/CRIU_code/test/zdtm/static/pty00.c new file mode 100644 index 0000000..9e4f452 --- /dev/null +++ b/CRIU_code/test/zdtm/static/pty00.c @@ -0,0 +1,138 @@ +#define _XOPEN_SOURCE 500 +#include +#include "zdtmtst.h" +#include +#include +#include +#include +#include +#include +#include + +const char *test_doc = "Check, that pseudoterminals are restored"; +const char *test_author = "Andrey Vagin "; + +static unsigned int nr_sighups; + +static void signal_handler_sighup(int signum) +{ + nr_sighups++; +} + +int main(int argc, char ** 
argv) +{ + int fdm, fds, ret, tty, i; + char *slavename; + char buf[4096]; + const char teststr[] = "hello\n"; + + struct sigaction sa = { + .sa_handler = signal_handler_sighup, + .sa_flags = 0, + }; + + test_init(argc, argv); + + /* + * On closing control terminal we're expecting to + * receive SIGHUP, so make sure it's delivered. + */ + if (sigaction(SIGHUP, &sa, 0)) { + fail("sigaction failed\n"); + return 1; + } + + fdm = open("/dev/ptmx", O_RDWR); + if (fdm == -1) { + pr_perror("open(%s) failed", "/dev/ptmx"); + return 1; + } + grantpt(fdm); + unlockpt(fdm); + slavename = ptsname(fdm); + fds = open(slavename, O_RDWR); + if (fds == -1) { + pr_perror("open(%s) failed", slavename); + return 1; + } + + tty = open("/dev/tty", O_RDWR); + if (tty < 0) { + pr_perror("open(%s) failed", "/dev/tty"); + return 1; + } + + /* Try to reproduce a deadlock */ + if (dup2(fdm, 101) != 101) { + pr_perror("dup( , 101) failed"); + return 1; + } + close(fdm); + fdm = 101; + + if (dup2(fds, 100) != 100) { + pr_perror("dup( , 100) failed"); + return 1; + } + close(fds); + fds = 100; + + for (i = 0; i < 10; i++) { + /* Check connectivity */ + ret = write(fdm, teststr, sizeof(teststr) - 1); + if (ret != sizeof(teststr) - 1) { + pr_perror("write(fdm) failed"); + return 1; + } + } + + test_daemon(); + test_waitsig(); + + for (i = 0; i < 10; i++) { + ret = read(fds, buf, sizeof(teststr) - 1); + if (ret != sizeof(teststr) - 1) { + pr_perror("read(fds) failed"); + return 1; + } + } + + if (strncmp(teststr, buf, sizeof(teststr) - 1)) { + fail("data mismatch"); + return 1; + } + + ret = write(fdm, teststr, sizeof(teststr) - 1); + if (ret != sizeof(teststr) - 1) { + pr_perror("write(fdm) failed"); + return 1; + } + + ret = read(tty, buf, sizeof(teststr) - 1); + if (ret != sizeof(teststr) - 1) { + pr_perror("read(tty) failed"); + return 1; + } + + if (strncmp(teststr, buf, sizeof(teststr) - 1)) { + fail("data mismatch"); + return 1; + } + + if (nr_sighups != 0) { + fail("Expected 0 SIGHUP before 
closing control terminal but got %d", nr_sighups); + return 1; + } + + close(fdm); + close(fds); + close(tty); + + if (nr_sighups != 1) { + fail("Expected 1 SIGHUP after closing control terminal but got %d", nr_sighups); + return 1; + } else + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/pty01.c b/CRIU_code/test/zdtm/static/pty01.c new file mode 100644 index 0000000..cb7fff9 --- /dev/null +++ b/CRIU_code/test/zdtm/static/pty01.c @@ -0,0 +1,131 @@ +#define _XOPEN_SOURCE 500 +#define _DEFAULT_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check two pts on ptmx"; +const char *test_author = "Cyrill Gorcunov "; + +static const char teststr[] = "ping\n"; + +int main(int argc, char *argv[]) +{ + char buf[sizeof(teststr)]; + int master, slave1, slave2, ret; + char *slavename; + struct stat st; + + uid_t new_uid = 13333; + gid_t new_gid = 44444; + + test_init(argc, argv); + + master = open("/dev/ptmx", O_RDWR); + if (master == -1) { + pr_perror("open(%s) failed", "/dev/ptmx"); + return 1; + } + + grantpt(master); + unlockpt(master); + + slavename = ptsname(master); + slave1 = open(slavename, O_RDWR); + if (slave1 == -1) { + pr_perror("open(%s) failed", slavename); + return 1; + } + + slave2 = open(slavename, O_RDWR); + if (slave2 == -1) { + pr_perror("open(%s) failed", slavename); + return 1; + } + +#ifdef ZDTM_DEV_CONSOLE + { + int fd; + fd = open("/dev/console", O_CREAT | O_RDONLY, 0755); + if (fd < 0) + return -1; + close(fd); + + if (mount(slavename, "/dev/console", NULL, MS_BIND, NULL)) + return -1; + } +#endif + + if (fchown(slave1, new_uid, new_gid)) { + pr_perror("Can't set uid/gid on %s", slavename); + return 1; + } + + test_daemon(); + test_waitsig(); + + signal(SIGHUP, SIG_IGN); + + if (fstat(slave1, &st)) { + pr_perror("Can't fetch stat on %s", slavename); + return 1; + } + + if (st.st_uid != new_uid || st.st_gid != new_gid) { + 
fail("UID/GID mismatch (got %d/%d but %d/%d expected)", + (int)st.st_uid, (int)st.st_gid, + (int)new_uid, (int)new_gid); + return 1; + } + + ret = write(master, teststr, sizeof(teststr) - 1); + if (ret != sizeof(teststr) - 1) { + pr_perror("write(master) failed"); + return 1; + } + + ret = read(slave1, buf, sizeof(teststr) - 1); + if (ret != sizeof(teststr) - 1) { + pr_perror("read(slave1) failed"); + return 1; + } + + if (strncmp(teststr, buf, sizeof(teststr) - 1)) { + fail("data mismatch"); + return 1; + } + + ret = write(master, teststr, sizeof(teststr) - 1); + if (ret != sizeof(teststr) - 1) { + pr_perror("write(master) failed"); + return 1; + } + + ret = read(slave2, buf, sizeof(teststr) - 1); + if (ret != sizeof(teststr) - 1) { + pr_perror("read(slave1) failed"); + return 1; + } + + if (strncmp(teststr, buf, sizeof(teststr) - 1)) { + fail("data mismatch"); + return 1; + } + + close(master); + close(slave1); + close(slave2); + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/pty01.desc b/CRIU_code/test/zdtm/static/pty01.desc new file mode 100644 index 0000000..2eac7e6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/pty01.desc @@ -0,0 +1 @@ +{'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/pty02.c b/CRIU_code/test/zdtm/static/pty02.c new file mode 100644 index 0000000..041a4e4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/pty02.c @@ -0,0 +1,103 @@ +#define _XOPEN_SOURCE 500 +#include +#include "zdtmtst.h" +#include +#include +#include +#include +#include +#include +#include + +const char *test_doc = "Check forked master ptmx"; +const char *test_author = "Cyrill Gorcunov "; + +static const char teststr[] = "ping\n"; + +#define exit_shot(pid, code) \ + do { kill(pid, SIGKILL); exit(code); } while (0) + +#define exit_shot_parent(code) \ + exit_shot(getppid(), 1) + +int main(int argc, char *argv[]) +{ + char buf[sizeof(teststr)]; + int master, slave, ret; + char *slavename; + task_waiter_t t; + pid_t pid; + + test_init(argc, argv); + + 
master = open("/dev/ptmx", O_RDWR); + if (master == -1) { + pr_perror("open(%s) failed", "/dev/ptmx"); + return 1; + } + + grantpt(master); + unlockpt(master); + + slavename = ptsname(master); + slave = open(slavename, O_RDWR); + if (slave == -1) { + pr_perror("open(%s) failed", slavename); + return 1; + } + + task_waiter_init(&t); + + pid = test_fork(); + if (pid == 0) { + int new_master, ret; + + new_master = dup(master); + if (new_master < 0) { + pr_perror("can't dup master"); + exit_shot_parent(1); + } + + task_waiter_complete_current(&t); + + ret = write(new_master, teststr, sizeof(teststr) - 1); + if (ret != sizeof(teststr) - 1) { + pr_perror("write(new_master) failed (ret = %d)", ret); + exit_shot_parent(1); + } + + task_waiter_wait4(&t, 1); + + close(new_master); + exit(0); + } else if (pid < 0) { + pr_perror("test_fork failed"); + exit(1); + } + + task_waiter_wait4(&t, pid); + close(master); + + test_daemon(); + test_waitsig(); + + signal(SIGHUP, SIG_IGN); + + ret = read(slave, buf, sizeof(teststr) - 1); + if (ret != sizeof(teststr) - 1) { + pr_perror("read(slave) failed (ret = %d)", ret); + return 1; + } + + if (strncmp(teststr, buf, sizeof(teststr) - 1)) { + fail("data mismatch"); + return 1; + } + + task_waiter_complete(&t, 1); + close(slave); + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/pty03.c b/CRIU_code/test/zdtm/static/pty03.c new file mode 100644 index 0000000..3c2b363 --- /dev/null +++ b/CRIU_code/test/zdtm/static/pty03.c @@ -0,0 +1,83 @@ +#define _XOPEN_SOURCE 500 +#include +#include "zdtmtst.h" +#include +#include +#include +#include +#include +#include +#include +#include + +const char *test_doc = "Check a non-opened control terminal"; +const char *test_author = "Andrey Vagin "; + +static const char teststr[] = "ping\n"; + +int main(int argc, char *argv[]) +{ + char buf[sizeof(teststr)]; + int master, slave, ret; + char *slavename; + + test_init(argc, argv); + + master = open("/dev/ptmx", O_RDWR); + if (master == -1) { 
+ pr_perror("open(%s) failed", "/dev/ptmx"); + return 1; + } + + grantpt(master); + unlockpt(master); + + slavename = ptsname(master); + slave = open(slavename, O_RDWR); + if (slave == -1) { + pr_perror("open(%s) failed", slavename); + return 1; + } + + if (ioctl(slave, TIOCSCTTY, 1)) { + pr_perror("Can't set a controll terminal"); + return 1; + } + + close(slave); + + test_daemon(); + test_waitsig(); + + slave = open("/dev/tty", O_RDWR); + if (slave == -1) { + pr_perror("Can't open the controll terminal"); + return -1; + } + + signal(SIGHUP, SIG_IGN); + + ret = write(master, teststr, sizeof(teststr) - 1); + if (ret != sizeof(teststr) - 1) { + pr_perror("write(master) failed"); + return 1; + } + + ret = read(slave, buf, sizeof(teststr) - 1); + if (ret != sizeof(teststr) - 1) { + pr_perror("read(slave1) failed"); + return 1; + } + + if (strncmp(teststr, buf, sizeof(teststr) - 1)) { + fail("data mismatch"); + return 1; + } + + close(master); + close(slave); + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/pty03.desc b/CRIU_code/test/zdtm/static/pty03.desc new file mode 100644 index 0000000..c504739 --- /dev/null +++ b/CRIU_code/test/zdtm/static/pty03.desc @@ -0,0 +1 @@ +{'flavor': 'h uns'} diff --git a/CRIU_code/test/zdtm/static/pty04.c b/CRIU_code/test/zdtm/static/pty04.c new file mode 100644 index 0000000..74d8ab4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/pty04.c @@ -0,0 +1,64 @@ +#define _XOPEN_SOURCE 500 +#include +#include "zdtmtst.h" +#include +#include +#include +#include +#include +#include +#include +#include + +const char *test_doc = "Check two pts with a fake ptmx"; +const char *test_author = "Cyrill Gorcunov "; + +int main(int argc, char *argv[]) +{ + int master, slave1, slave2; + char *slavename; + + test_init(argc, argv); + + master = open("/dev/ptmx", O_RDWR); + if (master == -1) { + pr_perror("open(%s) failed", "/dev/ptmx"); + return 1; + } + + grantpt(master); + unlockpt(master); + + slavename = ptsname(master); + + slave1 = 
open(slavename, O_RDWR); + if (slave1 == -1) { + pr_perror("open(%s) failed", slavename); + return 1; + } + + slave2 = open(slavename, O_RDWR); + if (slave2 == -1) { + pr_perror("open(%s) failed", slavename); + return 1; + } + + if (ioctl(slave1, TIOCSCTTY, 1)) { + pr_perror("Can't set a controll terminal"); + return 1; + } + + test_msg("Closing master\n"); + signal(SIGHUP, SIG_IGN); + close(master); + + test_daemon(); + test_waitsig(); + + close(slave1); + close(slave2); + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/remap_dead_pid.c b/CRIU_code/test/zdtm/static/remap_dead_pid.c new file mode 100644 index 0000000..261c591 --- /dev/null +++ b/CRIU_code/test/zdtm/static/remap_dead_pid.c @@ -0,0 +1,78 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +#ifndef CLONE_NEWNS +#define CLONE_NEWNS 0x00020000 +#endif + +#ifdef REMAP_PID_ROOT +const char *proc_path = "/proc/%d"; +#else +const char *proc_path = "/proc/%d/mountinfo"; +#endif + +const char *test_doc = "Check that dead pid's /proc entries are remapped correctly"; +const char *test_author = "Tycho Andersen "; + +int main(int argc, char **argv) +{ + pid_t pid; + + test_init(argc, argv); + + pid = fork(); + if (pid < 0) { + fail("fork() failed"); + return -1; + } + + if (pid == 0) { + /* Child process just sleeps until it is killed. All we need + * here is a process to open the mountinfo of. 
*/ + while(1) + sleep(10); + } else { + test_msg("child is %d\n", pid); + + int fd, ret; + char path[PATH_MAX]; + pid_t result; + + sprintf(path, proc_path, pid); + fd = open(path, O_RDONLY); + if (fd < 0) { + fail("failed to open fd"); + return -1; + } + + /* no matter what, we should kill the child */ + kill(pid, SIGKILL); + result = waitpid(pid, NULL, 0); + if (result < 0) { + fail("failed waitpid()"); + return -1; + } + + test_daemon(); + test_waitsig(); + + ret = fcntl(fd, F_GETFD); + close(fd); + + if (ret) { + fail("bad fd after restore"); + return -1; + } + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/remap_dead_pid.desc b/CRIU_code/test/zdtm/static/remap_dead_pid.desc new file mode 100644 index 0000000..6c4afe5 --- /dev/null +++ b/CRIU_code/test/zdtm/static/remap_dead_pid.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns'} diff --git a/CRIU_code/test/zdtm/static/remap_dead_pid_root.c b/CRIU_code/test/zdtm/static/remap_dead_pid_root.c new file mode 100644 index 0000000..3739fc7 --- /dev/null +++ b/CRIU_code/test/zdtm/static/remap_dead_pid_root.c @@ -0,0 +1 @@ +remap_dead_pid.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/remap_dead_pid_root.desc b/CRIU_code/test/zdtm/static/remap_dead_pid_root.desc new file mode 100644 index 0000000..6c4afe5 --- /dev/null +++ b/CRIU_code/test/zdtm/static/remap_dead_pid_root.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns'} diff --git a/CRIU_code/test/zdtm/static/rlimits00.c b/CRIU_code/test/zdtm/static/rlimits00.c new file mode 100644 index 0000000..17ea8da --- /dev/null +++ b/CRIU_code/test/zdtm/static/rlimits00.c @@ -0,0 +1,66 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that rlimits are saved"; +const char *test_author = "Pavel Emelianov "; + +int main(int argc, char **argv) +{ + int r, changed = 0; + struct rlimit rlims[RLIM_NLIMITS], trlim; + + test_init(argc, argv); + + for (r = 0; r < RLIM_NLIMITS; r++) { + if (getrlimit(r, &rlims[r])) 
{ + pr_perror("Can't get rlimit"); + goto out; + } + + if (rlims[r].rlim_cur > 1 && + rlims[r].rlim_cur != RLIM_INFINITY) { + rlims[r].rlim_cur--; + + if (setrlimit(r, &rlims[r])) { + pr_perror("Can't set rlimit"); + goto out; + } + + changed = 1; + } + } + + if (!changed) { + pr_perror("Can't change any rlimir"); + goto out; + } + + test_daemon(); + test_waitsig(); + + for (r = 0; r < RLIM_NLIMITS; r++) { + if (getrlimit(r, &trlim)) { + fail("Can't get rlimit after rst"); + goto out; + } + + if (rlims[r].rlim_cur != trlim.rlim_cur) { + fail("Cur changed"); + goto out; + } + + if (rlims[r].rlim_max != trlim.rlim_max) { + fail("Max changed"); + goto out; + } + } + + pass(); +out: + return 0; +} + diff --git a/CRIU_code/test/zdtm/static/rmdir_open.c b/CRIU_code/test/zdtm/static/rmdir_open.c new file mode 100644 index 0000000..279aa78 --- /dev/null +++ b/CRIU_code/test/zdtm/static/rmdir_open.c @@ -0,0 +1,73 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that opened removed dir works"; +const char *test_author = "Pavel Emelianov "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + char subdir[PATH_MAX]; + int fd; + struct stat st; + + test_init(argc, argv); + + sprintf(subdir, "%s/subdir", dirname); + if (mkdir(dirname, 0700) || mkdir(subdir, 0700)) { + pr_perror("Can't make dir"); + goto out; + } + + fd = open(subdir, O_DIRECTORY); + if (fd < 0) { + pr_perror("Can't open dir"); + goto outr; + } + + if (rmdir(subdir) || rmdir(dirname)) { + pr_perror("Can't remove dir"); + goto outr; + } + + test_daemon(); + test_waitsig(); + + /* + * We can't compare anything with previous, since + * inode _will_ change, so can the device. The only + * reasonable thing we can do is check that the fd + * still points to some removed directory. 
+ */ + if (fstat(fd, &st)) { + fail("Can't stat fd\n"); + goto out; + } + + if (!S_ISDIR(st.st_mode)) { + fail("Fd is no longer directory\n"); + goto out; + } + + if (st.st_nlink != 0) { + fail("Directory is not removed\n"); + goto out; + } + + pass(); + return 0; + +outr: + rmdir(dirname); +out: + return 1; +} diff --git a/CRIU_code/test/zdtm/static/route_rules b/CRIU_code/test/zdtm/static/route_rules new file mode 100644 index 0000000..9a735c6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/route_rules @@ -0,0 +1,73 @@ +#!/bin/bash +# $Id: route_rules,v 1.1 2007/06/04 12:11:30 agladkov Exp $ +# +# Copyright (c) 2007 by SWsoft. +# All rights reserved. +# +# Description: +# check that routes saved after migration + +export PATH=$PATH:${0%/*}/../../lib + +die() +{ + echo "$0:${BASH_LINENO[0]}: $*" >&2 + exit 1 +} + +fail() +{ + echo "FAIL: $0:${BASH_LINENO[0]}: $*" > "$outfile" + exit 1 +} + +do_or_fail() +{ + local failmsg="$1" output + shift + output="$(eval $@ 2>&1)" || + fail "$failmsg: $output" +} + +do_start() +{ + [ -f "$statefile" ] && die "state file $statefile aleady exists" + + # Get default route + dev_name=`ip route list match 0.0.0.0/0 | sed 's/.*dev \([^ ]*\).*/\1/'` + [ -n "$dev_name" ] || fail "dev_name is zero: " \ + "\$dev_name=\`ip route list match 0.0.0.0/0 | " \ + "sed 's/.*dev \([^ ]*\).*/\1/'" + do_or_fail "can't add routes" \ + ip r a 1.2.3.4/32 dev $dev_name && ip r a 1.2.0.0/16 via 1.2.3.4 + + do_or_fail "can't list created routes" \ + ip r \| grep "1.2.3.4" \> "$statefile" +} + +do_stop() +{ + do_or_fail "can't compare the routes" \ + ip r \| grep "1.2.3.4" \| diff -u "$statefile" - + + rm -f "$statefile" + IFS=" + "; + for line in `ip r | grep "1.2.3.4"`; do + eval ip r del $line + done + + echo "PASS" > $outfile +} + +tmpargs="$(../lib/parseargs.sh --name=$0 \ + --flags-req=statefile,outfile \ + -- "$@")" || + die "can't parse command line" +eval "$tmpargs" + +[ -f "$outfile" ] && die "out file $outfile aleady exists" + +# expect "start" 
or "stop" +action=${1:?Specify action$(die 'Specify action')} +do_$action diff --git a/CRIU_code/test/zdtm/static/rtc.c b/CRIU_code/test/zdtm/static/rtc.c new file mode 100644 index 0000000..28a79b1 --- /dev/null +++ b/CRIU_code/test/zdtm/static/rtc.c @@ -0,0 +1,62 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +#define TEST_HZ 4 +#define NR_FAILS 10 + +int main(int argc, char **argv) +{ + unsigned long data; + long delta; + int fd, fail = NR_FAILS, to_pass = NR_FAILS; + struct timeval start, end; + + test_init(argc, argv); + + fd = open("/dev/rtc", O_RDWR); + if (fd < 0) { + pr_perror("open"); + return 1; + } + + if (ioctl(fd, RTC_IRQP_SET, TEST_HZ) == -1) { + pr_perror("RTC_IRQP_SET"); + return 1; + } + + if (ioctl(fd, RTC_PIE_ON, 0) == -1) { + pr_perror("RTC_PIE_ON"); + return 1; + } + + test_daemon(); + + gettimeofday(&start, NULL); + start.tv_usec += start.tv_sec * 1000000; + while (test_go() || to_pass--) { + if (read(fd, &data, sizeof(unsigned long)) == -1) + return 1; + gettimeofday(&end, NULL); + end.tv_usec += end.tv_sec * 1000000; + delta = end.tv_usec - start.tv_usec; + if (labs(delta - 1000000 / TEST_HZ ) > 100000) { + pr_perror("delta = %ld", delta); + fail--; + if (fail == 0) + return 1; + } + start = end; + } + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/rtc.desc b/CRIU_code/test/zdtm/static/rtc.desc new file mode 100644 index 0000000..2c9c7e5 --- /dev/null +++ b/CRIU_code/test/zdtm/static/rtc.desc @@ -0,0 +1 @@ +{'flavor': 'h', 'flags': 'suid crlib','arch': 'x86_64 aarch64 arm ppc64'} diff --git a/CRIU_code/test/zdtm/static/s390x_gs_threads.c b/CRIU_code/test/zdtm/static/s390x_gs_threads.c new file mode 100644 index 0000000..8d14421 --- /dev/null +++ b/CRIU_code/test/zdtm/static/s390x_gs_threads.c @@ -0,0 +1,187 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" +#include "lock.h" + +#define NR_THREADS 
4 +#define GS_ENABLE 0 +#define GS_SET_BC_CB 2 +#define GS_BROADCAST 4 + +#ifndef __NR_guarded_storage +#define __NR_guarded_storage 378 +#endif + +const char *test_doc = "Check the guarded storage broadcast"; +/* Original test provided by Martin Schwidefsky */ +const char *test_author = "Alice Frosi "; + +static unsigned long main_thread_tid; + +/* + * This test case executes the following procedure: + * + * 1) The parent thread creates NR_THREADS child threads + * + * 2) For each thread (including the parent thread): + * - Enable guarded-storage + * - Set the guarded-storage broadcast control block and + * specify gs_handler as Guarded-Storage-Event Parameter-List + * address + * + * 3) Dump and restore + * + * 4) Guarded-storage broadcast event + * - Child threads: Wait until main thread does GS broadcast + * - Parent thread: Trigger GS broadcast + * + * 5) Verify that all GS works as expected and all threads have been + * executed the gs_handler + */ + +struct gs_cb { + __u64 reserved; + __u64 gsd; + __u64 gssm; + __u64 gs_epl_a; +}; + +static futex_t futex; +static futex_t futex2; + +/* + * Load guarded-storage + */ +void load_guarded(unsigned long *mem); +asm( + ".global load_guarded\n" + "load_guarded:\n" + " .insn rxy,0xe3000000004c,%r2,0(%r2)\n" + " br %r14\n" + " .size load_guarded,.-load_guarded\n"); + +/* + * Inline assembly to deal with interrupted context to the call of + * the GS handler. Load guarded can be turned into a branch to this + * function. 
+ */ +void gs_handler_asm(void); +asm( + ".globl gs_handler_asm\n" + "gs_handler_asm:\n" + " lgr %r14,%r15\n" + " aghi %r15,-320\n" + " stmg %r0,%r14,192(%r15)\n" + " stg %r14,312(%r14)\n" + " la %r2,160(%r15)\n" + " .insn rxy,0xe30000000049,0,160(%r15)\n" + " lg %r14,24(%r2)\n" + " lg %r14,40(%r14)\n" + " la %r14,6(%r14)\n" + " stg %r14,304(%r15)\n" + " brasl %r14,gs_handler\n" + " lmg %r0,%r15,192(%r15)\n" + " br %r14\n" + " .size gs_handler_asm,.-gs_handler_asm\n"); + +/* + * GS handler called when GS event occurs + */ +void gs_handler(struct gs_cb *this_cb) +{ + unsigned long tid = syscall(SYS_gettid); + test_msg("gs_handler for thread %016lx\n", tid); + futex_dec_and_wake(&futex2); +} + +/* + * Entry point for threads + */ +static void *thread_run(void *param) +{ + unsigned long test = 0x1234000000; + unsigned long *gs_epl; + struct gs_cb *gs_cb; + + /* Enable guarded-storage */ + if (syscall(__NR_guarded_storage, GS_ENABLE) != 0) { + fail("Unable to enable guarded storage"); + exit(1); + } + gs_epl = malloc(sizeof(unsigned long) * 6); + gs_cb = malloc(sizeof(*gs_cb)); + if (gs_epl == NULL || gs_cb == NULL) { + fail("Error allocating memory\n"); + exit(1); + } + gs_cb->gsd = 0x1234000000UL | 26; + gs_cb->gssm = -1UL; + gs_cb->gs_epl_a = (unsigned long) gs_epl; + gs_epl[1] = (unsigned long) gs_handler_asm; + /* Set the GS broadcast control block */ + syscall(__NR_guarded_storage, GS_SET_BC_CB, gs_cb); + futex_dec_and_wake(&futex); + /* Wait for all threads to set the GS broadcast control block */ + futex_wait_until(&futex, 0); + test_msg("Thread %016lx staring loop\n", syscall(SYS_gettid)); + /* + * Designate a guarded-storage section until the main task + * performs the GS_BROADCAST action and the following load_guarded + * will provoke the switch to the gs handler + */ + while (1) + load_guarded(&test); +} + +int main(int argc, char *argv[]) +{ + pthread_t tids[NR_THREADS]; + int i; + + main_thread_tid = syscall(SYS_gettid); + test_init(argc, argv); + /* 
Enable guarded-storage */ + if (syscall(__NR_guarded_storage, GS_ENABLE) != 0) { + if (errno == ENOSYS || errno == EOPNOTSUPP) { + test_daemon(); + test_waitsig(); + skip("No guarded storage support"); + pass(); + return 0; + } + fail("Unable to enable guarded storage"); + return 1; + } + + futex_set(&futex, NR_THREADS); + + for (i = 0; i < NR_THREADS; i++) + pthread_create(tids + i, NULL, thread_run, NULL); + + test_msg("Waiting for thread startup\n"); + /* Wait for all threads to set the GS broadcast control block */ + futex_wait_until(&futex, 0); + + test_daemon(); + test_waitsig(); + + test_msg("Doing broadcast\n"); + futex_set(&futex2, NR_THREADS); + /* + * Triggers a GS event and force all the threads to execute + * the gs handler + */ + syscall(__NR_guarded_storage, GS_BROADCAST); + + test_msg("Waiting for thread completion\n"); + futex_wait_until(&futex2, 0); + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/s390x_mmap_high.c b/CRIU_code/test/zdtm/static/s390x_mmap_high.c new file mode 100644 index 0000000..5eb06e6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/s390x_mmap_high.c @@ -0,0 +1,64 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +#define TASK_SIZE_LEVEL_4 0x20000000000000UL /* 8 PB */ +#define MAP_SIZE 0x1000 +#define VAL 0x77 + +const char *test_doc = "Verify that tasks > 4TB can be checkpointed"; +const char *test_author = "Michael Holzheu "; + +/* + * Map memory at the very end of the 8 PB address space + */ +int main(int argc, char **argv) +{ + void *addr = (void *) TASK_SIZE_LEVEL_4 - MAP_SIZE; + char *buf; + int i; + + + test_init(argc, argv); + + /* + * Skip test if kernel does not have the following fix: + * + * ee71d16d22 ("s390/mm: make TASK_SIZE independent from the number + * of page table levels") + */ + if (munmap(addr, MAP_SIZE) == -1) { + test_daemon(); + test_waitsig(); + skip("Detected kernel without 4 level TASK_SIZE fix"); + pass(); + return 0; + } + + /* Map memory at the very end of the 8 
PB address space */ + buf = mmap(addr, MAP_SIZE, PROT_WRITE | PROT_READ, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); + if (buf == MAP_FAILED) { + pr_perror("Could not create mapping"); + exit(1); + } + /* Initialize buffer with data */ + memset(buf, VAL, MAP_SIZE); + + test_daemon(); + test_waitsig(); + + /* Verify that we restored the data correctly */ + for (i = 0; i < MAP_SIZE; i++) { + if (buf[i] == VAL) + continue; + fail("%d: %d != %d\n", i, buf[i], VAL); + goto out; + } + pass(); +out: + return 0; +} diff --git a/CRIU_code/test/zdtm/static/s390x_mmap_high.desc b/CRIU_code/test/zdtm/static/s390x_mmap_high.desc new file mode 100644 index 0000000..8621263 --- /dev/null +++ b/CRIU_code/test/zdtm/static/s390x_mmap_high.desc @@ -0,0 +1 @@ +{'arch': 's390x'} diff --git a/CRIU_code/test/zdtm/static/s390x_regs_check.c b/CRIU_code/test/zdtm/static/s390x_regs_check.c new file mode 100644 index 0000000..9aaf419 --- /dev/null +++ b/CRIU_code/test/zdtm/static/s390x_regs_check.c @@ -0,0 +1,575 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that FP and VX registers do not change"; +const char *test_author = "Michael Holzheu "; + +/* + * This test case executes the following procedure: + * + * 1) Set registers to defined values + * The main process creates one child process and within that process + * NR_THREADS threads. Then the main process uses ptrace(SETREGS) to + * set the registers in the child process and in all threads. + * + * 2) Detach from child and threads + * Do this in order to allow criu to use ptrace for dumping. + * + * 3) Issue criu commands + * Useful tests are: dump, dump --check-only, dump --leave-running + * + * 4) Check registers + * Use ptrace(GETREGS) and compare with original values from step 1. 
+ * + * This test can be used for two purposes: + * + * - Verify that "criu restore" sets the correct register sets + * from "criu dump": + * $ zdtmp.py run -t zdtm/static/s390x_regs_check + * + * - Verify that dumpee continues running with correct registers after + * parasite injection: + * $ zdtmp.py run --norst -t zdtm/static/s390x_regs_check + * $ zdtmp.py run --norst --pre 2 -t zdtm/static/s390x_regs_check + * $ zdtmp.py run --check-only -t zdtm/static/s390x_regs_check + */ +#define NR_THREADS 2 +#define NR_THREADS_ALL (NR_THREADS + 1) + +static pid_t thread_pids[NR_THREADS_ALL]; +static int pipefd[2]; + +/* + * Generic structure to define a register set and test data + */ +struct reg_set { + const char *name; /* Name of regset */ + int nr; /* Number of regset */ + void *data; /* Test data */ + int len; /* Number of bytes of test data */ + bool optional; /* Not all kernels/machines have this reg set */ + bool available; /* Current kernel/machine has this reg set */ +}; + +/* + * s390 floating point registers + */ +struct prfpreg { + uint32_t fpc; + uint64_t fprs[16]; +}; + +struct prfpreg prfpreg_data = { + .fpc = 0, + .fprs = { + 0x0000000000000000, + 0x1111111111111110, + 0x2222222222222220, + 0x3333333333333330, + 0x4444444444444440, + 0x5555555555555550, + 0x6666666666666660, + 0x7777777777777770, + 0x8888888888888880, + 0x9999999999999990, + 0xaaaaaaaaaaaaaaa0, + 0xbbbbbbbbbbbbbbb0, + 0xccccccccccccccc0, + 0xddddddddddddddd0, + 0xeeeeeeeeeeeeeee0, + 0xfffffffffffffff0, + } +}; + +struct reg_set reg_set_prfpreg = { + .name = "PRFPREG", + .nr = NT_PRFPREG, + .data = &prfpreg_data, + .len = sizeof(prfpreg_data), + .optional = false, +}; + +/* + * s390 vector VXRS_LOW registers + */ + +#define NT_S390_VXRS_LOW 0x309 + +struct vxrs_low { + uint64_t regs[16]; +}; + +struct vxrs_low vxrs_low_data = { + .regs = { + 0x0000000000000001, + 0x1111111111111111, + 0x2222222222222221, + 0x3333333333333331, + 0x4444444444444441, + 0x5555555555555551, + 
0x6666666666666661, + 0x7777777777777771, + 0x8888888888888881, + 0x9999999999999991, + 0xaaaaaaaaaaaaaaa1, + 0xbbbbbbbbbbbbbbb1, + 0xccccccccccccccc1, + 0xddddddddddddddd1, + 0xeeeeeeeeeeeeeee1, + 0xfffffffffffffff1, + } +}; + +struct reg_set reg_set_vxrs_low = { + .name = "VXRS_LOW", + .nr = NT_S390_VXRS_LOW, + .data = &vxrs_low_data, + .len = sizeof(vxrs_low_data), + .optional = true, +}; + +/* + * s390 vector VXRS_HIGH registers + */ + +#define NT_S390_VXRS_HIGH 0x30a + +struct vxrs_high { + uint64_t regs[32]; +}; + +struct vxrs_high vxrs_high_data = { + .regs = { + 0x0000000000000002, 0x0000000000000002, + 0x1111111111111112, 0x1111111111111112, + 0x2222222222222222, 0x2222222222222222, + 0x3333333333333332, 0x3333333333333332, + 0x4444444444444442, 0x4444444444444442, + 0x5555555555555552, 0x5555555555555552, + 0x6666666666666662, 0x6666666666666662, + 0x7777777777777772, 0x7777777777777772, + 0x8888888888888882, 0x8888888888888882, + 0x9999999999999992, 0x9999999999999992, + 0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2, + 0xbbbbbbbbbbbbbbb2, 0xbbbbbbbbbbbbbbb2, + 0xccccccccccccccc2, 0xccccccccccccccc2, + 0xddddddddddddddd2, 0xddddddddddddddd2, + 0xeeeeeeeeeeeeeee2, 0xeeeeeeeeeeeeeee2, + 0xfffffffffffffff2, 0xfffffffffffffff2, + } +}; + +struct reg_set reg_set_vxrs_high = { + .name = "VXRS_HIGH", + .nr = NT_S390_VXRS_HIGH, + .data = &vxrs_high_data, + .len = sizeof(vxrs_high_data), + .optional = true, +}; + +/* + * s390 guarded-storage registers + */ +#define NT_S390_GS_CB 0x30b +#define NT_S390_GS_BC 0x30c + +struct gs_cb { + uint64_t regs[4]; +}; + +struct gs_cb gs_cb_data = { + .regs = { + 0x0000000000000000, + 0x000000123400001a, + 0x5555555555555555, + 0x000000014b58a010, + } +}; + +struct reg_set reg_set_gs_cb = { + .name = "GS_CB", + .nr = NT_S390_GS_CB, + .data = &gs_cb_data, + .len = sizeof(gs_cb_data), + .optional = true, +}; + +struct gs_cb gs_bc_data = { + .regs = { + 0x0000000000000000, + 0x000000123400001a, + 0xffffffffffffffff, + 0x0000000aaaaaaaaa, 
+ } +}; + +struct reg_set reg_set_gs_bc = { + .name = "GS_BC_CB", + .nr = NT_S390_GS_BC, + .data = &gs_bc_data, + .len = sizeof(gs_bc_data), + .optional = true, +}; + +/* + * s390 runtime-instrumentation control block + */ +#define NT_S390_RI_CB 0x30d + +struct ri_cb { + uint64_t regs[8]; +}; + +struct ri_cb ri_cb_data = { + .regs = { + 0x000002aa13aae000, + 0x000002aa13aad000, + 0x000002aa13aadfff, + 0xe0a1000400000000, + 0x0000000000000000, + 0x0000000000004e20, + 0x0000000000003479, + 0x0000000000000000, + } +}; + +struct reg_set reg_set_ri_cb = { + .name = "RI_CB", + .nr = NT_S390_RI_CB, + .data = &ri_cb_data, + .len = sizeof(ri_cb_data), + .optional = true, +}; + +/* + * Vector with all regsets + */ +struct reg_set *reg_set_vec[] = { + &reg_set_prfpreg, + &reg_set_vxrs_low, + &reg_set_vxrs_high, + &reg_set_gs_cb, + &reg_set_gs_bc, + &reg_set_ri_cb, + NULL, +}; + +/* + * Print hexdump for buffer with variable group parameter + */ +void util_hexdump_grp(const char *tag, const void *data, int grp, + int count, int indent) +{ + char str[1024], *ptr = str; + const char *buf = data; + int i, first = 1; + + for (i = 0; i < count; i++) { + if (first) { + ptr = str; + ptr += sprintf(ptr, "%*s", indent, " "); + if (tag) + ptr += sprintf(ptr, "%s: ", tag); + ptr += sprintf(ptr, "%08x: ", i); + first = 0; + } + ptr += sprintf(ptr, "%02x", buf[i]); + if (i % 16 == 15 || i + 1 == count) { + test_msg("%s\n", str); + first = 1; + } else if (i % grp == grp - 1) { + ptr += sprintf(ptr, " "); + } + } +} + +/* + * Print hexdump for buffer with fix grp parameter + */ +void util_hexdump(const char *tag, const void *data, int count) +{ + util_hexdump_grp(tag, data, sizeof(long), count, 0); +} + +/* + * Set regset for pid + */ +static int set_regset(pid_t pid, struct reg_set *reg_set) +{ + struct iovec iov; + + iov.iov_base = reg_set->data; + iov.iov_len = reg_set->len; + + if (ptrace(PTRACE_SETREGSET, pid, reg_set->nr, iov) == 0) { + test_msg(" REGSET: %12s -> DONE\n", reg_set->name); + 
reg_set->available = true; + return 0; + } + if (reg_set->optional) { + switch (errno) { + case EOPNOTSUPP: + case ENODEV: + test_msg(" REGSET: %12s -> not supported by machine\n", + reg_set->name); + return 0; + case EINVAL: + test_msg(" REGSET: %12s -> not supported by kernel\n", + reg_set->name); + return 0; + default: + break; + } + } + pr_perror("PTRACE_SETREGSET for %s failed for pid %d", + reg_set->name, pid); + return -1; +} + +/* + * Apply all regsets + */ +static int set_regset_all(pid_t pid) +{ + int i; + + for (i = 0; reg_set_vec[i] != NULL; i++) { + if (set_regset(pid, reg_set_vec[i])) + return -1; + } + return 0; +} + +/* + * Check if regset for pid has changed + */ +static int check_regset(pid_t pid, struct reg_set *reg_set) +{ + struct iovec iov; + char *data; + + if (!reg_set->available) + return 0; + data = calloc(reg_set->len, 1); + if (!data) + return -1; + + iov.iov_base = data; + iov.iov_len = reg_set->len; + + if (ptrace(PTRACE_GETREGSET, pid, reg_set->nr, iov) != 0) { + pr_perror("PTRACE_SETREGSET for %s failed for pid %d", + reg_set->name, pid); + free(data); + return -1; + } + if (memcmp(data, reg_set->data, reg_set->len) != 0) { + test_msg("RegSet %s changed for pid=%d\n", reg_set->name, pid); + test_msg("Original values:\n"); + util_hexdump(reg_set->name, reg_set->data, reg_set->len); + test_msg("New values:\n"); + util_hexdump(reg_set->name, data, reg_set->len); + free(data); + return -1; + } + free(data); + return 0; +} + +/* + * Check all regsets + */ +static int check_regset_all(pid_t pid) +{ + int i; + + for (i = 0; reg_set_vec[i] != NULL; i++) { + if (check_regset(pid, reg_set_vec[i])) + return -1; + } + return 0; +} + +/* + * Send error to father + */ +static void send_error(void) +{ + int val = 0; + + if (write(pipefd[1], &val, sizeof(val)) == -1) + pr_perror("write failed"); +} + +/* + * Write tid to pipe and then loop without changing registers + */ +static inline void send_tid_and_loop(int fd) +{ + int tid = 
syscall(__NR_gettid); + + asm volatile( + "lgr 2,%0\n" /* Arg 1: fd */ + "la 3,%1\n" /* Arg 2: &tid */ + "lghi 4,4\n" /* Arg 3: sizeof(int) */ + "svc 4\n" /* __NR_write SVC: */ + /* After SVC no more registers are changed */ + "0: j 0b\n" /* Loop here */ + : : "d" (fd), "Q" (tid) : "2", "3", "4"); +} + +/* + * Function for threads + */ +static void *thread_func(void *fd) +{ + send_tid_and_loop(pipefd[1]); + return NULL; +} + +/* + * Function executed by the child + */ +static void child_func(void) +{ + pthread_t thread; + int i; + + /* Close read end of pipe */ + close(pipefd[0]); + /* Create threads and send TID */ + for (i = 0; i < NR_THREADS; i++) { + if (pthread_create(&thread, NULL, thread_func, NULL) != 0) { + pr_perror("Error create thread: %d", i); + send_error(); + } + } + /* Send tid and wait until get killed */ + send_tid_and_loop(pipefd[1]); +} + +/* + * Attach to a thread + */ +static int ptrace_attach(pid_t pid) +{ + if (ptrace(PTRACE_ATTACH, pid, 0, 0) == 0) { + if (waitpid(pid, NULL, __WALL) < 0) { + pr_perror("Waiting for thread %d failed", pid); + return -1; + } + return 0; + } + pr_perror("Attach to thread %d failed", pid); + return -1; +} + +/* + * Detach from a thread + */ +static int ptrace_detach(pid_t pid) +{ + if (ptrace(PTRACE_DETACH, pid, 0, 0) == 0) + return 0; + pr_perror("Detach from thread %d failed", pid); + return -1; +} + +/* + * Create child with threads and verify that registers are not corrupted + */ +int main(int argc, char *argv[]) +{ + bool failed = false; + pid_t child, pid; + int i; + + test_init(argc, argv); + + test_msg("------------- START 1 PROCESS + %d THREADS ---------------\n", + NR_THREADS); + if (pipe(pipefd) == -1) { + perror("pipe failed"); + exit(EXIT_FAILURE); + } + child = fork(); + + if (child == 0) + child_func(); + + /* Parent */ + for (i = 0; i < NR_THREADS_ALL; i++) { + if (read(pipefd[0], &pid, sizeof(pid_t)) == -1) { + perror("Read from pipe failed"); + failed = true; + goto kill_all_threads; + } + if 
(pid == 0) { + pr_err("Not all threads are started\n"); + failed = true; + goto kill_all_threads; + } + test_msg("STARTED: pid = %d\n", pid); + thread_pids[i] = pid; + } + + /* Close write end */ + close(pipefd[1]); + test_msg("---------------------- SET REGISTERS --------------------\n"); + for (i = 0; i < NR_THREADS_ALL; i++) { + pid = thread_pids[i]; + test_msg("SET: pid = %d\n", pid); + ptrace_attach(pid); + set_regset_all(pid); + ptrace_detach(pid); + } + + test_daemon(); + test_waitsig(); + + test_msg("-------------------- CHECK REGISTERS --------------------\n"); + for (i = 0; i < NR_THREADS_ALL; i++) { + pid = thread_pids[i]; + test_msg("CHECK: pid = %d:\n", pid); + ptrace_attach(pid); + if (check_regset_all(pid) == 0) { + test_msg(" -> OK\n"); + } else { + test_msg(" -> FAIL\n"); + failed = true; + } + ptrace_detach(pid); + } + test_msg("----------------------- CLEANUP ------------------------\n"); + +kill_all_threads: + for (i = 0; i < NR_THREADS_ALL; i++) { + pid = thread_pids[i]; + if (pid == 0) + continue; + test_msg("KILL: pid = %d\n", pid); + kill(pid, SIGTERM); + } + + if (failed) { + fail("Registers changed"); + return 1; + } + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/s390x_regs_check.desc b/CRIU_code/test/zdtm/static/s390x_regs_check.desc new file mode 100644 index 0000000..8621263 --- /dev/null +++ b/CRIU_code/test/zdtm/static/s390x_regs_check.desc @@ -0,0 +1 @@ +{'arch': 's390x'} diff --git a/CRIU_code/test/zdtm/static/s390x_runtime_instr.c b/CRIU_code/test/zdtm/static/s390x_runtime_instr.c new file mode 100644 index 0000000..e0a5742 --- /dev/null +++ b/CRIU_code/test/zdtm/static/s390x_runtime_instr.c @@ -0,0 +1,214 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +#ifndef __NR_s390_runtime_instr +#define __NR_s390_runtime_instr 342 +#endif +#define NT_S390_RI_CB 0x30d + 
+#define BUF_SIZE (1024*1024) + +const char *test_doc = "Check runtime-instrumentation"; +/* Original test provided by Martin Schwidefsky */ +const char *test_author = "Alice Frosi "; + +struct runtime_instr_cb { + unsigned long rca; + unsigned long roa; + unsigned long rla; + + unsigned int v : 1; + unsigned int s : 1; + unsigned int k : 1; + unsigned int h : 1; + unsigned int a : 1; + unsigned int reserved1 : 3; + unsigned int ps : 1; + unsigned int qs : 1; + unsigned int pc : 1; + unsigned int qc : 1; + unsigned int reserved2 : 1; + unsigned int g : 1; + unsigned int u : 1; + unsigned int l : 1; + unsigned int key : 4; + unsigned int reserved3 : 8; + unsigned int t : 1; + unsigned int rgs : 3; + + unsigned int m : 4; + unsigned int n : 1; + unsigned int mae : 1; + unsigned int reserved4 : 2; + unsigned int c : 1; + unsigned int r : 1; + unsigned int b : 1; + unsigned int j : 1; + unsigned int e : 1; + unsigned int x : 1; + unsigned int reserved5 : 2; + unsigned int bpxn : 1; + unsigned int bpxt : 1; + unsigned int bpti : 1; + unsigned int bpni : 1; + unsigned int reserved6 : 2; + + unsigned int d : 1; + unsigned int f : 1; + unsigned int ic : 4; + unsigned int dc : 4; + + unsigned long reserved7; + unsigned long sf; + unsigned long rsic; + unsigned long reserved8; +}; + +/* + * Return PSW mask + */ +static inline unsigned long extract_psw(void) +{ + unsigned int reg1, reg2; + + asm volatile("epsw %0,%1" : "=d" (reg1), "=a" (reg2)); + return (((unsigned long) reg1) << 32) | ((unsigned long) reg2); +} + +/* + * Enable runtime-instrumentation + */ +static inline void rion(void) +{ + asm volatile (".word 0xaa01, 0x0000"); +} + +/* + * Disable runtime-instrumentation + */ +static inline void rioff(void) +{ + asm volatile (".word 0xaa03, 0x0000"); +} + +/* + * Modify the current runtime-instrumentation control block + */ +static inline void mric(struct runtime_instr_cb *cb) +{ + asm volatile(".insn rsy,0xeb0000000062,0,0,%0" : : "Q" (*cb)); +} + +/* + * Store the 
current runtime-instrumentation control block + */ +static inline void stric(struct runtime_instr_cb *cb) +{ + asm volatile(".insn rsy,0xeb0000000061,0,0,%0" : "=Q" (*cb) : : "cc"); +} + +/* + * Ensure that runtime-intstrumentation is still working after C/R + */ +int main(int argc, char **argv) +{ + struct runtime_instr_cb ricb, ricb_check; + unsigned long *ricb_check_ptr = (unsigned long *) &ricb_check; + unsigned long *ricb_ptr = (unsigned long *) &ricb; + unsigned long psw_mask; + void *buf; + int i; + + test_init(argc, argv); + buf = malloc(BUF_SIZE); + memset(buf, 0, BUF_SIZE); + memset(&ricb, 0, sizeof(ricb)); + /* Initialize the default RI control block in the kernel */ + if (syscall(__NR_s390_runtime_instr, 1, NULL) < 0) { + if (errno == EOPNOTSUPP) { + test_daemon(); + test_waitsig(); + skip("RI not supported"); + pass(); + free(buf); + return 0; + } + fail("Fail with error %d", errno); + free(buf); + return -1; + } + /* Set buffer for RI */ + ricb.rca = ricb.roa = (unsigned long) buf; + ricb.rla = (unsigned long) buf + BUF_SIZE; + mric(&ricb); + /* Enable RI - afterwards the PSW will have RI bit set */ + rion(); + psw_mask = extract_psw(); + /* Verify that the RI bit is set in the PSW */ + if (!(psw_mask & PSW_MASK_RI)) { + fail("Failed to enable RI"); + return -1; + } + /* Collect RI records until we hit buffer-full condition */ + while (ricb.rca < ricb.rla + 1) { + for (i = 0; i < 10000; i++) + asm volatile("" : : : "memory"); + rioff(); + stric(&ricb); + rion(); + } + /* Disable RI */ + rioff(); + /* Save the current RI control block */ + stric(&ricb); + ricb_check = ricb; + /* Re-enable RI for checkpoint */ + rion(); + + /* Do C/R now */ + test_daemon(); + test_waitsig(); + + /* Verify that the RI bit is set in the PSW */ + psw_mask = extract_psw(); + if (!(psw_mask & PSW_MASK_RI)) { + fail("RI bit in PSW not set"); + return -1; + } + /* + * Verify that the RI block has been restored correctly + * and the buffer is unchanged + */ + rioff(); + 
stric(&ricb); + for (i = 0; i < 8; i++) { + if (ricb_ptr[i] == ricb_check_ptr[i]) + continue; + /* Skip sf field because its value may change */ + if (i == 6) + continue; + fail("%d:Got %016lx expected %016lx", + i, ricb_ptr[i], ricb_check_ptr[i]); + return -1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/sched_policy00.c b/CRIU_code/test/zdtm/static/sched_policy00.c new file mode 100644 index 0000000..7312abd --- /dev/null +++ b/CRIU_code/test/zdtm/static/sched_policy00.c @@ -0,0 +1,88 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check sched policy to be preserved"; +const char *test_author = "Pavel Emelyanov "; + +static const int parm = 3; + +static int do_nothing(void) +{ + while (1) + sleep(10); + + return -1; +} + +int main(int argc, char ** argv) +{ + int pid, ret, err = 0; + struct sched_param p; + int old_rt_runtime_us = -1; + FILE *file; + + test_init(argc, argv); + + pid = fork(); + if (!pid) + return do_nothing(); + + file = fopen("/sys/fs/cgroup/cpu/user.slice/cpu.rt_runtime_us", "r"); + if (file) { + ret = fscanf(file, "%d", &old_rt_runtime_us); + fclose(file); + + if ((ret > 0) && (old_rt_runtime_us == 0)) { + file = fopen("/sys/fs/cgroup/cpu/user.slice/cpu.rt_runtime_us", "w"); + if (file) { + fprintf(file, "100\n"); + fclose(file); + } + } + } + + p.sched_priority = parm; + if (sched_setscheduler(pid, SCHED_RR, &p)) { + pr_perror("Can't set policy"); + kill(pid, SIGKILL); + return -1; + } + + test_daemon(); + test_waitsig(); + + ret = sched_getscheduler(pid); + if (ret != SCHED_RR) { + fail("Broken/No policy"); + err++; + } + + ret = sched_getparam(pid, &p); + if (ret < 0 || p.sched_priority != parm) { + fail("Broken prio"); + err++; + } + + if (!err) + pass(); + + kill(pid, SIGKILL); + if (old_rt_runtime_us != -1) { + file = fopen("/sys/fs/cgroup/cpu/user.slice/cpu.rt_runtime_us", "w"); + if (file) { + fprintf(file, "%d\n", 
old_rt_runtime_us); + fclose(file); + } + + } + return err; +} diff --git a/CRIU_code/test/zdtm/static/sched_policy00.desc b/CRIU_code/test/zdtm/static/sched_policy00.desc new file mode 100644 index 0000000..d969725 --- /dev/null +++ b/CRIU_code/test/zdtm/static/sched_policy00.desc @@ -0,0 +1 @@ +{'flavor': 'h ns', 'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/sched_prio00.c b/CRIU_code/test/zdtm/static/sched_prio00.c new file mode 100644 index 0000000..ea4a2ee --- /dev/null +++ b/CRIU_code/test/zdtm/static/sched_prio00.c @@ -0,0 +1,79 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check sched prios to be preserved"; +const char *test_author = "Pavel Emelyanov "; + +#define NRTASKS 3 + +static int do_nothing(void) +{ + while (1) + sleep(10); + + return -1; +} + +static void kill_all(int *pid, int n) +{ + int i; + + for (i = 0; i < n; i++) + kill(pid[i], SIGKILL); +} + +int main(int argc, char ** argv) +{ + int pid[NRTASKS], i, parm[NRTASKS], ret; + + test_init(argc, argv); + + parm[0] = -20; + parm[1] = 19; + parm[2] = 1; + + for (i = 0; i < NRTASKS; i++) { + pid[i] = fork(); + if (!pid[i]) + return do_nothing(); + + if (setpriority(PRIO_PROCESS, pid[i], parm[i])) { + pr_perror("Can't set prio %d", i); + kill_all(pid, i); + return -1; + } + } + + test_daemon(); + test_waitsig(); + + for (i = 0; i < NRTASKS; i++) { + errno = 0; + ret = getpriority(PRIO_PROCESS, pid[i]); + if (errno) { + fail("No prio for task %d", i); + break; + } + + if (ret != parm[i]) { + fail("Broken nice for %d", i); + break; + } + } + + if (i == NRTASKS) + pass(); + + kill_all(pid, NRTASKS); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/sched_prio00.desc b/CRIU_code/test/zdtm/static/sched_prio00.desc new file mode 100644 index 0000000..d969725 --- /dev/null +++ b/CRIU_code/test/zdtm/static/sched_prio00.desc @@ -0,0 +1 @@ +{'flavor': 'h ns', 'flags': 'suid'} diff --git 
a/CRIU_code/test/zdtm/static/scm00.c b/CRIU_code/test/zdtm/static/scm00.c new file mode 100644 index 0000000..de626d9 --- /dev/null +++ b/CRIU_code/test/zdtm/static/scm00.c @@ -0,0 +1,162 @@ +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that SCM_RIGHTS are preserved"; +const char *test_author = "Pavel Emelyanov "; + +static int send_fd(int via, int fd) +{ + struct msghdr h = {}; + struct cmsghdr *ch; + struct iovec iov; + char buf[CMSG_SPACE(sizeof(int))], c = '\0'; + int *fdp; + + h.msg_control = buf; + h.msg_controllen = sizeof(buf); + ch = CMSG_FIRSTHDR(&h); + ch->cmsg_level = SOL_SOCKET; + ch->cmsg_type = SCM_RIGHTS; + ch->cmsg_len = CMSG_LEN(sizeof(int)); + fdp = (int *)CMSG_DATA(ch); + *fdp = fd; + h.msg_iov = &iov; + h.msg_iovlen = 1; + iov.iov_base = &c; + iov.iov_len = sizeof(c); + + if (sendmsg(via, &h, 0) <= 0) + return -1; + + return 0; +} + +static int recv_fd(int via) +{ + struct msghdr h = {}; + struct cmsghdr *ch; + struct iovec iov; + char buf[CMSG_SPACE(sizeof(int))], c; + int *fdp; + + h.msg_control = buf; + h.msg_controllen = sizeof(buf); + h.msg_iov = &iov; + h.msg_iovlen = 1; + iov.iov_base = &c; + iov.iov_len = sizeof(c); + + if (recvmsg(via, &h, 0) <= 0) + return -1; + + ch = CMSG_FIRSTHDR(&h); + if (h.msg_flags & MSG_TRUNC) + return -2; + if (ch == NULL) + return -3; + if (ch->cmsg_type != SCM_RIGHTS) + return -4; + + fdp = (int *)CMSG_DATA(ch); + return *fdp; +} + +int main(int argc, char **argv) +{ + int sk[2], p[2], rfd; +#define MSG "HELLO" + char buf[8]; /* bigger than the MSG to check boundaries */ + + test_init(argc, argv); + + if (socketpair(PF_UNIX, SOCK_DGRAM, 0, sk) < 0) { + pr_perror("Can't make unix pair"); + exit(1); + } + + if (pipe(p) < 0) { + pr_perror("Can't make pipe"); + exit(1); + } + + if (send_fd(sk[0], p[0]) < 0) { + pr_perror("Can't send descriptor"); + exit(1); + } + +#ifndef KEEP_SENT_FD + close(p[0]); +#ifdef SEND_BOTH + if (send_fd(sk[0], p[1]) < 0) { 
+ pr_perror("Can't send 2nd descriptor"); + exit(1); + } + close(p[1]); + p[0] = p[1] = -1; +#else + /* Swap pipe ends to make scm recv put pipe into different place */ + dup2(p[1], p[0]); + close(p[1]); + p[1] = p[0]; + p[0] = -1; +#endif +#endif + + test_daemon(); + test_waitsig(); + + rfd = recv_fd(sk[1]); + if (rfd < 0) { + fail("Can't recv pipe back (%d)", p[0]); + goto out; + } + +#ifdef SEND_BOTH + test_msg("Recv 2nd end\n"); + p[1] = recv_fd(sk[1]); + if (p[1] < 0) { + fail("Can't recv 2nd pipe back (%d)", p[1]); + goto out; + } +#endif + +#ifdef KEEP_SENT_FD + if (rfd == p[0]) { + fail("Original descriptor not kept"); + goto out; + } +again: +#endif + if (write(p[1], MSG, sizeof(MSG)) != sizeof(MSG)) { + fail("Pipe write-broken"); + goto out; + } + + if (read(rfd, buf, sizeof(buf)) != sizeof(MSG)) { + fail("Pipe read-broken"); + goto out; + } + + if (strcmp(buf, MSG)) { + buf[sizeof(buf) - 1] = '\0'; + fail("Pipe read-broken (%s)", buf); + goto out; + } + +#ifdef KEEP_SENT_FD + if (rfd != p[0]) { + test_msg("Check kept\n"); + rfd = p[0]; + goto again; + } +#endif + + pass(); +out: + return 0; +} diff --git a/CRIU_code/test/zdtm/static/scm01.c b/CRIU_code/test/zdtm/static/scm01.c new file mode 100644 index 0000000..4cab0ed --- /dev/null +++ b/CRIU_code/test/zdtm/static/scm01.c @@ -0,0 +1 @@ +scm00.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/scm02.c b/CRIU_code/test/zdtm/static/scm02.c new file mode 100644 index 0000000..4cab0ed --- /dev/null +++ b/CRIU_code/test/zdtm/static/scm02.c @@ -0,0 +1 @@ +scm00.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/scm03.c b/CRIU_code/test/zdtm/static/scm03.c new file mode 100644 index 0000000..cf60497 --- /dev/null +++ b/CRIU_code/test/zdtm/static/scm03.c @@ -0,0 +1,154 @@ +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that SCM_RIGHTS are preserved"; +const char *test_author = "Pavel Emelyanov "; + +static int 
send_fd(int via, int fd1, int fd2) +{ + struct msghdr h = {}; + struct cmsghdr *ch; + struct iovec iov; +#ifdef SEPARATE + char buf[2 * CMSG_SPACE(sizeof(int))]; +#else + char buf[CMSG_SPACE(2 * sizeof(int))]; +#endif + char c = '\0'; + int *fdp; + + memset(buf, 0, sizeof(buf)); + h.msg_control = buf; + h.msg_controllen = sizeof(buf); +#ifdef SEPARATE + ch = CMSG_FIRSTHDR(&h); + ch->cmsg_level = SOL_SOCKET; + ch->cmsg_type = SCM_RIGHTS; + ch->cmsg_len = CMSG_LEN(sizeof(int)); + fdp = (int *)CMSG_DATA(ch); + fdp[0] = fd1; + ch = CMSG_NXTHDR(&h, ch); + ch->cmsg_level = SOL_SOCKET; + ch->cmsg_type = SCM_RIGHTS; + ch->cmsg_len = CMSG_LEN(sizeof(int)); + fdp = (int *)CMSG_DATA(ch); + fdp[0] = fd2; +#else + ch = CMSG_FIRSTHDR(&h); + ch->cmsg_level = SOL_SOCKET; + ch->cmsg_type = SCM_RIGHTS; + ch->cmsg_len = CMSG_LEN(2 * sizeof(int)); + fdp = (int *)CMSG_DATA(ch); + fdp[0] = fd1; + fdp[1] = fd2; +#endif + h.msg_iov = &iov; + h.msg_iovlen = 1; + iov.iov_base = &c; + iov.iov_len = sizeof(c); + + if (sendmsg(via, &h, 0) <= 0) + return -1; + + return 0; +} + +static int recv_fd(int via, int *fd1, int *fd2) +{ + struct msghdr h = {}; + struct cmsghdr *ch; + struct iovec iov; + char buf[CMSG_SPACE(2 * sizeof(int))]; + char c; + int *fdp; + + h.msg_control = buf; + h.msg_controllen = sizeof(buf); + h.msg_iov = &iov; + h.msg_iovlen = 1; + iov.iov_base = &c; + iov.iov_len = sizeof(c); + + if (recvmsg(via, &h, 0) <= 0) + return -1; + + if (h.msg_flags & MSG_CTRUNC) { + test_msg("CTR\n"); + return -2; + } + + /* No 2 SCM-s here, kernel merges them upon send */ + ch = CMSG_FIRSTHDR(&h); + if (h.msg_flags & MSG_TRUNC) + return -2; + if (ch == NULL) + return -3; + if (ch->cmsg_type != SCM_RIGHTS) + return -4; + + fdp = (int *)CMSG_DATA(ch); + *fd1 = fdp[0]; + *fd2 = fdp[1]; + return 0; +} + +int main(int argc, char **argv) +{ + int sk[2], p[2]; +#define MSG "HELLO" + char buf[8]; /* bigger than the MSG to check boundaries */ + + test_init(argc, argv); + + if (socketpair(PF_UNIX, 
SOCK_DGRAM, 0, sk) < 0) { + pr_perror("Can't make unix pair"); + exit(1); + } + + if (pipe(p) < 0) { + pr_perror("Can't make pipe"); + exit(1); + } + + if (send_fd(sk[0], p[0], p[1]) < 0) { + pr_perror("Can't send descriptor"); + exit(1); + } + + close(p[0]); + close(p[1]); + p[0] = p[1] = -1; + + test_daemon(); + test_waitsig(); + + if (recv_fd(sk[1], &p[0], &p[1]) < 0) { + fail("Can't recv pipes back"); + goto out; + } + + if (write(p[1], MSG, sizeof(MSG)) != sizeof(MSG)) { + fail("Pipe write-broken"); + goto out; + } + + if (read(p[0], buf, sizeof(buf)) != sizeof(MSG)) { + fail("Pipe read-broken"); + goto out; + } + + if (strcmp(buf, MSG)) { + buf[sizeof(buf) - 1] = '\0'; + fail("Pipe read-broken (%s)", buf); + goto out; + } + + pass(); +out: + return 0; +} diff --git a/CRIU_code/test/zdtm/static/scm04.c b/CRIU_code/test/zdtm/static/scm04.c new file mode 100644 index 0000000..f1f86dd --- /dev/null +++ b/CRIU_code/test/zdtm/static/scm04.c @@ -0,0 +1 @@ +scm03.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/scm05.c b/CRIU_code/test/zdtm/static/scm05.c new file mode 100644 index 0000000..c17bddd --- /dev/null +++ b/CRIU_code/test/zdtm/static/scm05.c @@ -0,0 +1,139 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that SCM_RIGHTS are preserved"; +const char *test_author = "Kirill Tkhai "; + +static int send_fd(int via, int fd) +{ + struct msghdr h = {}; + struct cmsghdr *ch; + struct iovec iov; + char buf[CMSG_SPACE(sizeof(int))]; + char c = '\0'; + int *fdp; + + memset(buf, 0, sizeof(buf)); + h.msg_control = buf; + h.msg_controllen = sizeof(buf); + ch = CMSG_FIRSTHDR(&h); + ch->cmsg_level = SOL_SOCKET; + ch->cmsg_type = SCM_RIGHTS; + ch->cmsg_len = CMSG_LEN(sizeof(int)); + fdp = (int *)CMSG_DATA(ch); + fdp[0] = fd; + + h.msg_iov = &iov; + h.msg_iovlen = 1; + iov.iov_base = &c; + iov.iov_len = sizeof(c); + + if (sendmsg(via, &h, 0) <= 0) + return -1; + + return 0; +} + 
+static int recv_fd(int via, int *fd) +{ + struct msghdr h = {}; + struct cmsghdr *ch; + struct iovec iov; + char buf[CMSG_SPACE(sizeof(int))]; + char c; + int *fdp; + + h.msg_control = buf; + h.msg_controllen = sizeof(buf); + h.msg_iov = &iov; + h.msg_iovlen = 1; + iov.iov_base = &c; + iov.iov_len = sizeof(c); + + if (recvmsg(via, &h, 0) <= 0) + return -1; + + if (h.msg_flags & MSG_CTRUNC) { + test_msg("CTR\n"); + return -2; + } + + /* No 2 SCM-s here, kernel merges them upon send */ + ch = CMSG_FIRSTHDR(&h); + if (h.msg_flags & MSG_TRUNC) + return -2; + if (ch == NULL) + return -3; + if (ch->cmsg_type != SCM_RIGHTS) + return -4; + + fdp = (int *)CMSG_DATA(ch); + *fd = fdp[0]; + return 0; +} + +int main(int argc, char **argv) +{ + struct epoll_event event = { + .events = EPOLLIN, + }; + int sk[2], ep, ret; + + test_init(argc, argv); + + if (socketpair(PF_UNIX, SOCK_DGRAM, 0, sk) < 0) { + pr_perror("Can't make unix pair"); + exit(1); + } + + ep = epoll_create(1); + if (ep < 0) { + perror("Can't create epoll"); + exit(1); + } + + event.data.fd = sk[1]; + if (epoll_ctl(ep, EPOLL_CTL_ADD, sk[1], &event) < 0) { + perror("Can't add fd"); + exit(1); + } + + if (send_fd(sk[0], ep) < 0) { + pr_perror("Can't send epoll"); + exit(1); + } + if (send_fd(sk[0], ep) < 0) { + pr_perror("Can't send epoll"); + exit(1); + } + + close(ep); + memset(&event, 0, sizeof(event)); + + test_daemon(); + test_waitsig(); + + if (recv_fd(sk[1], &ep) < 0) { + fail("Can't recv epoll back"); + ret = -1; + goto out; + } + + ret = epoll_wait(ep, &event, 1, 0); + if (ret != 1) { + fail("Can't get epoll event"); + ret = -1; + goto out; + } + + pass(); + ret = 0; +out: + return ret; +} diff --git a/CRIU_code/test/zdtm/static/scm06.c b/CRIU_code/test/zdtm/static/scm06.c new file mode 100644 index 0000000..6c9a78b --- /dev/null +++ b/CRIU_code/test/zdtm/static/scm06.c @@ -0,0 +1,147 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include 
"zdtmtst.h" + +const char *test_doc = "Check a send of looped unix sockets"; +const char *test_author = "Kirill Tkhai "; + +static int send_fd(int via, int fd) +{ + struct msghdr h = {}; + struct cmsghdr *ch; + struct iovec iov; + char buf[CMSG_SPACE(sizeof(int))]; + char c = '\0'; + int *fdp; + + memset(buf, 0, sizeof(buf)); + h.msg_control = buf; + h.msg_controllen = sizeof(buf); + ch = CMSG_FIRSTHDR(&h); + ch->cmsg_level = SOL_SOCKET; + ch->cmsg_type = SCM_RIGHTS; + ch->cmsg_len = CMSG_LEN(sizeof(int)); + fdp = (int *)CMSG_DATA(ch); + fdp[0] = fd; + + h.msg_iov = &iov; + h.msg_iovlen = 1; + iov.iov_base = &c; + iov.iov_len = sizeof(c); + + if (sendmsg(via, &h, 0) <= 0) + return -1; + + return 0; +} + +static int recv_fd(int via, int *fd) +{ + struct msghdr h = {}; + struct cmsghdr *ch; + struct iovec iov; + char buf[CMSG_SPACE(sizeof(int))]; + char c; + int *fdp; + + h.msg_control = buf; + h.msg_controllen = sizeof(buf); + h.msg_iov = &iov; + h.msg_iovlen = 1; + iov.iov_base = &c; + iov.iov_len = sizeof(c); + + if (recvmsg(via, &h, 0) <= 0) + return -1; + + if (h.msg_flags & MSG_CTRUNC) { + test_msg("CTR\n"); + return -2; + } + + /* No 2 SCM-s here, kernel merges them upon send */ + ch = CMSG_FIRSTHDR(&h); + if (h.msg_flags & MSG_TRUNC) + return -2; + if (ch == NULL) + return -3; + if (ch->cmsg_type != SCM_RIGHTS) + return -4; + + fdp = (int *)CMSG_DATA(ch); + *fd = fdp[0]; + return 0; +} + +int main(int argc, char **argv) +{ + int ska[2], skc, i, j, ret; + struct sockaddr_un addr; + socklen_t len; + + test_init(argc, argv); + + if (socketpair(PF_UNIX, SOCK_DGRAM, 0, ska) < 0) { + fail("Can't make unix pair"); + exit(1); + } + + addr.sun_family = AF_UNIX; + for (i = 0; i < 2; i++) { + addr.sun_path[0] = '\0'; + addr.sun_path[1] = i; + if (bind(ska[i], (struct sockaddr *)&addr, + sizeof(addr.sun_family) + 2)) { + fail("Can't bind"); + exit(1); + } + } + + /* Make the vinaigrette */ + for (i = 0; i < 2; i++) { + for (j = 0; j < 2; j++) { + if (send_fd(ska[i], 
ska[j]) < 0) { + fail("Can't send sk"); + exit(1); + } + } + } + + test_daemon(); + test_waitsig(); + + ret = -1; + skc = ska[0]; + for (i = 0; i < 3; i++) { + if (recv_fd(skc, &skc) < 0) { + fail("Can't recv"); + goto out; + } + + len = sizeof(addr.sun_family) + 2; + + if (getsockname(skc, (struct sockaddr *)&addr, &len)) { + fail("Can't getsockname()"); + goto out; + } + + if (addr.sun_path[1] != (i % 2)) { + fail("Wrong socket or path"); + goto out; + } + } + + pass(); + ret = 0; +out: + return ret; +} diff --git a/CRIU_code/test/zdtm/static/scm06.desc b/CRIU_code/test/zdtm/static/scm06.desc new file mode 100644 index 0000000..2eac7e6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/scm06.desc @@ -0,0 +1 @@ +{'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/seccomp_filter.c b/CRIU_code/test/zdtm/static/seccomp_filter.c new file mode 100644 index 0000000..81d9851 --- /dev/null +++ b/CRIU_code/test/zdtm/static/seccomp_filter.c @@ -0,0 +1,200 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __NR_seccomp +#include +#include +#endif + +#include "zdtmtst.h" + +const char *test_doc = "Check that SECCOMP_MODE_FILTER is restored"; +const char *test_author = "Tycho Andersen "; + +#ifdef __NR_seccomp + +int get_seccomp_mode(pid_t pid) +{ + FILE *f; + char buf[PATH_MAX]; + + sprintf(buf, "/proc/%d/status", pid); + f = fopen(buf, "r+"); + if (!f) { + pr_perror("fopen failed"); + return -1; + } + + while (NULL != fgets(buf, sizeof(buf), f)) { + int mode; + + if (sscanf(buf, "Seccomp:\t%d", &mode) != 1) + continue; + + fclose(f); + return mode; + } + fclose(f); + + return -1; +} + +int filter_syscall(int syscall_nr) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, syscall_nr, 0, 1), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), + }; + + struct sock_fprog 
bpf_prog = { + .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), + .filter = filter, + }; + + if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bpf_prog) < 0) { + pr_perror("prctl failed"); + return -1; + } + + return 0; +} + +int main(int argc, char ** argv) +{ + pid_t pid; + int mode, status; + int sk_pair[2], sk, ret; + char c = 'K'; + + test_init(argc, argv); + + if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) { + pr_perror("socketpair"); + return -1; + } + + pid = fork(); + if (pid < 0) { + pr_perror("fork"); + return -1; + } + + if (pid == 0) { + + sk = sk_pair[1]; + close(sk_pair[0]); + + /* + * Let's install a few filters separately to make sure the + * chaining actually works. + */ + if (filter_syscall(__NR_ptrace) < 0) + _exit(1); + + /* + * The idea is to have a syscall that is used in restore_creds, + * so we can make sure seccomp is actually suspended when that + * is called. + */ + if (filter_syscall(__NR_setresuid) < 0) + _exit(1); + + setuid(1000); + + zdtm_seccomp = 1; + test_msg("SECCOMP_MODE_FILTER is enabled\n"); + + if (write(sk, &c, 1) != 1) { + pr_perror("write"); + _exit(1); + } + + if (read(sk, &c, 1) != 1) { + pr_perror("read"); + _exit(1); + } + + prctl(PR_SET_DUMPABLE, 1); + + if (write(sk, &c, 1) != 1) { + pr_perror("write"); + _exit(1); + } + + if (read(sk, &c, 1) != 1) { + pr_perror("read"); + _exit(1); + } + + /* We expect to be killed by our policy above. 
*/ + ptrace(PTRACE_TRACEME); + + syscall(__NR_exit, 0); + } + + sk = sk_pair[0]; + close(sk_pair[1]); + + if ((ret = read(sk, &c, 1)) != 1) { + pr_perror("read %d", ret); + goto err; + } + + test_daemon(); + test_waitsig(); + + if (write(sk, &c, 1) != 1) { + pr_perror("write"); + goto err; + } + if ((ret = read(sk, &c, 1)) != 1) { + pr_perror("read %d", ret); + goto err; + } + + mode = get_seccomp_mode(pid); + if (write(sk, &c, 1) != 1) { + pr_perror("write"); + goto err; + } + if (waitpid(pid, &status, 0) != pid) { + pr_perror("waitpid"); + exit(1); + } + + if (WTERMSIG(status) != SIGSYS) { + pr_perror("expected SIGSYS, got %d", WTERMSIG(status)); + exit(1); + } + + if (mode != SECCOMP_MODE_FILTER) { + fail("seccomp mode mismatch %d\n", mode); + return 1; + } + + pass(); + + return 0; +err: + kill(pid, SIGKILL); + return 1; +} + +#else /* __NR_seccomp */ + +#define TEST_SKIP_REASON "incompatible kernel (no seccomp)" +#include "skip-me.c" + +#endif /* __NR_seccomp */ diff --git a/CRIU_code/test/zdtm/static/seccomp_filter.desc b/CRIU_code/test/zdtm/static/seccomp_filter.desc new file mode 100644 index 0000000..14dd961 --- /dev/null +++ b/CRIU_code/test/zdtm/static/seccomp_filter.desc @@ -0,0 +1 @@ +{'flags': 'suid', 'feature': 'seccomp_filters'} diff --git a/CRIU_code/test/zdtm/static/seccomp_filter_inheritance.c b/CRIU_code/test/zdtm/static/seccomp_filter_inheritance.c new file mode 100644 index 0000000..840136c --- /dev/null +++ b/CRIU_code/test/zdtm/static/seccomp_filter_inheritance.c @@ -0,0 +1,190 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __NR_seccomp +# include +# include +# include +#endif + +#include "zdtmtst.h" + +const char *test_doc = "Check that SECCOMP_MODE_FILTER is restored"; +const char *test_author = "Tycho Andersen "; + +#ifdef __NR_seccomp + +int get_seccomp_mode(pid_t pid) +{ + FILE *f; + char buf[PATH_MAX]; + + sprintf(buf, "/proc/%d/status", pid); + f = fopen(buf, 
"r+"); + if (!f) { + pr_perror("fopen failed"); + return -1; + } + + while (NULL != fgets(buf, sizeof(buf), f)) { + int mode; + + if (sscanf(buf, "Seccomp:\t%d", &mode) != 1) + continue; + + fclose(f); + return mode; + } + fclose(f); + + return -1; +} + +int filter_syscall(int syscall_nr) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, syscall_nr, 0, 1), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), + }; + + struct sock_fprog bpf_prog = { + .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), + .filter = filter, + }; + + if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bpf_prog) < 0) { + pr_perror("prctl failed"); + return -1; + } + + return 0; +} + +int main(int argc, char ** argv) +{ + pid_t pid; + int mode, status; + int sk_pair[2], sk, ret; + char c = 'K'; + + test_init(argc, argv); + + if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) { + pr_perror("socketpair"); + return -1; + } + + pid = fork(); + if (pid < 0) { + pr_perror("fork"); + return -1; + } + + if (pid == 0) { + + pid_t pid2; + + sk = sk_pair[1]; + close(sk_pair[0]); + + if (filter_syscall(__NR_ptrace) < 0) + _exit(1); + + if (filter_syscall(__NR_fstat) < 0) + _exit(1); + + zdtm_seccomp = 1; + test_msg("SECCOMP_MODE_FILTER is enabled\n"); + + pid2 = fork(); + if (pid2 < 0) + _exit(1); + + if (!pid2) { + + if (write(sk, &c, 1) != 1) { + pr_perror("write"); + _exit(1); + } + + if (read(sk, &c, 1) != 1) { + pr_perror("read"); + _exit(1); + } + + /* We expect to be killed by our policy above. 
*/ + ptrace(PTRACE_TRACEME); + _exit(1); + } + + if (waitpid(pid2, &status, 0) != pid2) { + pr_perror("waitpid"); + _exit(1); + } + + if (WTERMSIG(status) != SIGSYS) { + pr_err("expected SIGSYS, got %d\n", WTERMSIG(status)); + _exit(1); + } + + _exit(0); + } + + sk = sk_pair[0]; + close(sk_pair[1]); + + if ((ret = read(sk, &c, 1)) != 1) { + pr_perror("read %d", ret); + goto err; + } + + test_daemon(); + test_waitsig(); + + mode = get_seccomp_mode(pid); + if (write(sk, &c, 1) != 1) { + pr_perror("write"); + goto err; + } + + if (mode != SECCOMP_MODE_FILTER) { + fail("seccomp mode mismatch %d\n", mode); + return 1; + } + + if (waitpid(pid, &status, 0) != pid) { + pr_perror("waitpid"); + _exit(1); + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + fail("bad exit status"); + return 1; + } + + pass(); + + return 0; +err: + kill(pid, SIGKILL); + return 1; +} + + +#else /* __NR_seccomp */ + +#define TEST_SKIP_REASON "incompatible kernel (no seccomp)" +#include "skip-me.c" + +#endif /* __NR_seccomp */ diff --git a/CRIU_code/test/zdtm/static/seccomp_filter_inheritance.desc b/CRIU_code/test/zdtm/static/seccomp_filter_inheritance.desc new file mode 100644 index 0000000..14dd961 --- /dev/null +++ b/CRIU_code/test/zdtm/static/seccomp_filter_inheritance.desc @@ -0,0 +1 @@ +{'flags': 'suid', 'feature': 'seccomp_filters'} diff --git a/CRIU_code/test/zdtm/static/seccomp_filter_threads.c b/CRIU_code/test/zdtm/static/seccomp_filter_threads.c new file mode 100644 index 0000000..b3fa608 --- /dev/null +++ b/CRIU_code/test/zdtm/static/seccomp_filter_threads.c @@ -0,0 +1,225 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __NR_seccomp +# include +# include +# include +# include +#endif + +#include "zdtmtst.h" +#include "lock.h" + +#ifndef SECCOMP_SET_MODE_FILTER +#define SECCOMP_SET_MODE_FILTER 1 +#endif + +#ifndef SECCOMP_FILTER_FLAG_TSYNC +#define SECCOMP_FILTER_FLAG_TSYNC 1 +#endif + 
+const char *test_doc = "Check threads to carry different seccomps"; +const char *test_author = "Cyrill Gorcunov "; + +#ifdef __NR_seccomp + +static long sys_gettid(void) { return syscall(__NR_gettid); } + +static futex_t *wait_rdy; +static futex_t *wait_run; + +int get_seccomp_mode(pid_t pid) +{ + FILE *f; + char buf[PATH_MAX]; + + sprintf(buf, "/proc/%d/status", pid); + f = fopen(buf, "r"); + if (!f) { + pr_perror("fopen failed"); + return -1; + } + + while (NULL != fgets(buf, sizeof(buf), f)) { + int mode; + + if (sscanf(buf, "Seccomp:\t%d", &mode) != 1) + continue; + + fclose(f); + return mode; + } + fclose(f); + + return -1; +} + +int filter_syscall(int syscall_nr, unsigned int flags) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, syscall_nr, 0, 1), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), + }; + + struct sock_fprog bpf_prog = { + .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), + .filter = filter, + }; + + if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, flags, &bpf_prog) < 0) { + pr_perror("seccomp failed"); + return -1; + } + + return 0; +} + +void tigger_ptrace(void) { ptrace(PTRACE_TRACEME); } +void trigger_prctl(void) { prctl(PR_SET_PDEATHSIG, 9, 0, 0, 0); } +void trigger_mincore(void) { mincore(NULL, 0, NULL); } + +#define gen_param(__syscall_nr, __trigger) \ +{ \ + .syscall_name = # __syscall_nr, \ + .syscall_nr = __syscall_nr, \ + .trigger = __trigger, \ +} + +struct { + char *syscall_name; + unsigned int syscall_nr; + void (*trigger)(void); +} pthread_seccomp_params[] = { + gen_param(__NR_ptrace, tigger_ptrace), + gen_param(__NR_prctl, trigger_prctl), + gen_param(__NR_mincore, trigger_mincore), +}; + +#define WAITER_VALS_OFFSET (ARRAY_SIZE(pthread_seccomp_params) * 2) + +void *thread_main(void *arg) +{ + size_t nr = (long) arg; + + if (filter_syscall(pthread_seccomp_params[nr].syscall_nr, 0) < 0) + 
pthread_exit((void *)1); + + test_msg("%s filtered inside a sole thread %lu\n", + pthread_seccomp_params[nr].syscall_name, + sys_gettid()); + + futex_inc_and_wake(wait_rdy); + futex_wait_while_lt(wait_run, 1); + + test_msg("Triggering %zu %s thread %lu\n", + nr, pthread_seccomp_params[nr].syscall_name, + sys_gettid()); + + pthread_seccomp_params[nr].trigger(); + + test_msg("Abnormal exit %zu thread %lu\n", nr, sys_gettid()); + pthread_exit((void *)1); +} + +int main(int argc, char ** argv) +{ + int ret, mode, status; + size_t i; + pid_t pid; + + test_init(argc, argv); + + wait_rdy = mmap(NULL, sizeof(*wait_rdy), PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_SHARED, -1, 0); + wait_run = mmap(NULL, sizeof(*wait_rdy), PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_SHARED, -1, 0); + + if (wait_rdy == MAP_FAILED || wait_run == MAP_FAILED) { + pr_perror("mmap failed\n"); + exit(1); + } + + futex_init(wait_rdy); + futex_init(wait_run); + + futex_set(wait_rdy, 0); + futex_set(wait_run, 0); + + pid = fork(); + if (pid < 0) { + pr_perror("fork"); + return -1; + } + + + if (pid == 0) { + pthread_t thread[ARRAY_SIZE(pthread_seccomp_params)]; + void *p = NULL; + + zdtm_seccomp = 1; + + for (i = 0; i < ARRAY_SIZE(pthread_seccomp_params); i++) { + if (pthread_create(&thread[i], NULL, thread_main, (void *)i)) { + pr_perror("pthread_create"); + exit(1); + } + } + + for (i = 0; i < ARRAY_SIZE(pthread_seccomp_params); i++) { + test_msg("Waiting thread %zu\n", i); + if (pthread_join(thread[i], &p) != 0) { + pr_perror("pthread_join"); + exit(1); + } + } + + syscall(__NR_exit, 0); + } + + futex_wait_until(wait_rdy, ARRAY_SIZE(pthread_seccomp_params)); + + test_daemon(); + test_waitsig(); + + futex_inc_and_wake(wait_run); + mode = get_seccomp_mode(pid); + + if (mode != SECCOMP_MODE_DISABLED) { + fail("seccomp mode mismatch %d\n", mode); + return 1; + } + + ret = waitpid(pid, &status, 0); + if (ret != pid) { + fail("waitpid: %d != %d", ret, pid); + exit(1); + } + + if (!WIFEXITED(status) 
|| WEXITSTATUS(status) != 0) { + fail("expected 0 exit, got %d\n", WEXITSTATUS(status)); + exit(1); + } + + pass(); + return 0; +} + +#else /* __NR_seccomp */ + +#define TEST_SKIP_REASON "incompatible kernel (no seccomp)" +#include "skip-me.c" + +#endif /* __NR_seccomp */ diff --git a/CRIU_code/test/zdtm/static/seccomp_filter_threads.desc b/CRIU_code/test/zdtm/static/seccomp_filter_threads.desc new file mode 100644 index 0000000..14dd961 --- /dev/null +++ b/CRIU_code/test/zdtm/static/seccomp_filter_threads.desc @@ -0,0 +1 @@ +{'flags': 'suid', 'feature': 'seccomp_filters'} diff --git a/CRIU_code/test/zdtm/static/seccomp_filter_tsync.c b/CRIU_code/test/zdtm/static/seccomp_filter_tsync.c new file mode 100644 index 0000000..9b4742b --- /dev/null +++ b/CRIU_code/test/zdtm/static/seccomp_filter_tsync.c @@ -0,0 +1,215 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __NR_seccomp +# include +# include +# include +# include +#endif + +#include "zdtmtst.h" + +#ifndef SECCOMP_SET_MODE_FILTER +#define SECCOMP_SET_MODE_FILTER 1 +#endif + +#ifndef SECCOMP_FILTER_FLAG_TSYNC +#define SECCOMP_FILTER_FLAG_TSYNC 1 +#endif + +const char *test_doc = "Check that SECCOMP_FILTER_FLAG_TSYNC works correctly after restore"; +const char *test_author = "Tycho Andersen "; + +#ifdef __NR_seccomp + +pthread_mutex_t getpid_wait; + +int get_seccomp_mode(pid_t pid) +{ + FILE *f; + char buf[PATH_MAX]; + + sprintf(buf, "/proc/%d/status", pid); + f = fopen(buf, "r+"); + if (!f) { + pr_perror("fopen failed"); + return -1; + } + + while (NULL != fgets(buf, sizeof(buf), f)) { + int mode; + + if (sscanf(buf, "Seccomp:\t%d", &mode) != 1) + continue; + + fclose(f); + return mode; + } + fclose(f); + + return -1; +} + +int filter_syscall(int syscall_nr, unsigned int flags) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, syscall_nr, 0, 1), + 
BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), + BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), + }; + + struct sock_fprog bpf_prog = { + .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])), + .filter = filter, + }; + + if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, flags, &bpf_prog) < 0) { + pr_perror("seccomp failed"); + return -1; + } + + return 0; +} + +void *wait_and_getpid(void *arg) +{ + pthread_mutex_lock(&getpid_wait); + pthread_mutex_unlock(&getpid_wait); + pthread_mutex_destroy(&getpid_wait); + + /* we expect the tg to get killed by the seccomp filter that was + * installed via TSYNC */ + ptrace(PTRACE_TRACEME); + pthread_exit((void *)1); +} + +int main(int argc, char ** argv) +{ + pid_t pid; + int mode, status; + int sk_pair[2], sk, ret; + char c = 'K'; + + test_init(argc, argv); + + if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) { + pr_perror("socketpair"); + return -1; + } + + pid = fork(); + if (pid < 0) { + pr_perror("fork"); + return -1; + } + + if (pid == 0) { + pthread_t th; + void *p = NULL; + + if (pthread_mutex_init(&getpid_wait, NULL)) { + pr_perror("pthread_mutex_init"); + _exit(1); + } + + sk = sk_pair[1]; + close(sk_pair[0]); + + if (filter_syscall(__NR_getpid, 0) < 0) + _exit(1); + + zdtm_seccomp = 1; + + pthread_mutex_lock(&getpid_wait); + pthread_create(&th, NULL, wait_and_getpid, NULL); + + test_msg("SECCOMP_MODE_FILTER is enabled\n"); + + if (write(sk, &c, 1) != 1) { + pr_perror("write"); + _exit(1); + } + + if (read(sk, &c, 1) != 1) { + pr_perror("read"); + _exit(1); + } + + /* Now we have c/r'd with a shared filter, so let's install + * another filter with TSYNC and make sure that it is + * inherited. 
+ */ + if (filter_syscall(__NR_ptrace, SECCOMP_FILTER_FLAG_TSYNC) < 0) + _exit(1); + + pthread_mutex_unlock(&getpid_wait); + if (pthread_join(th, &p) != 0) { + pr_perror("pthread_join"); + exit(1); + } + + /* Here we're abusing pthread exit slightly: if the thread gets + * to call pthread_exit, the value of p is one, but if it gets + * killed pthread_join doesn't set a value since the thread + * didn't, so the value is null; we exit 0 to indicate success + * as usual. + */ + syscall(__NR_exit, p); + } + + sk = sk_pair[0]; + close(sk_pair[1]); + + if ((ret = read(sk, &c, 1)) != 1) { + pr_perror("read %d", ret); + goto err; + } + + test_daemon(); + test_waitsig(); + + mode = get_seccomp_mode(pid); + if (write(sk, &c, 1) != 1) { + pr_perror("write"); + goto err; + } + if (waitpid(pid, &status, 0) != pid) { + pr_perror("waitpid"); + exit(1); + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + pr_err("expected 0 exit, got %d\n", WEXITSTATUS(status)); + exit(1); + } + + if (mode != SECCOMP_MODE_FILTER) { + fail("seccomp mode mismatch %d\n", mode); + return 1; + } + + pass(); + + return 0; +err: + kill(pid, SIGKILL); + return 1; +} + +#else /* __NR_seccomp */ + +#define TEST_SKIP_REASON "incompatible kernel (no seccomp)" +#include "skip-me.c" + +#endif /* __NR_seccomp */ diff --git a/CRIU_code/test/zdtm/static/seccomp_filter_tsync.desc b/CRIU_code/test/zdtm/static/seccomp_filter_tsync.desc new file mode 100644 index 0000000..14dd961 --- /dev/null +++ b/CRIU_code/test/zdtm/static/seccomp_filter_tsync.desc @@ -0,0 +1 @@ +{'flags': 'suid', 'feature': 'seccomp_filters'} diff --git a/CRIU_code/test/zdtm/static/seccomp_strict.c b/CRIU_code/test/zdtm/static/seccomp_strict.c new file mode 100644 index 0000000..ac95ac8 --- /dev/null +++ b/CRIU_code/test/zdtm/static/seccomp_strict.c @@ -0,0 +1,135 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __NR_seccomp +# include +# include +#endif + +#include "zdtmtst.h" + 
+const char *test_doc = "Check that SECCOMP_MODE_STRICT is restored"; +const char *test_author = "Tycho Andersen "; + +#ifdef __NR_seccomp + +int get_seccomp_mode(pid_t pid) +{ + FILE *f; + char buf[PATH_MAX]; + + sprintf(buf, "/proc/%d/status", pid); + f = fopen(buf, "r"); /* the status file is only parsed, never written */ + if (!f) { + pr_perror("fopen failed"); + return -1; + } + + while (NULL != fgets(buf, sizeof(buf), f)) { + int mode; + + if (sscanf(buf, "Seccomp:\t%d", &mode) != 1) + continue; + + fclose(f); + return mode; + } + fclose(f); + + return -1; +} + +int main(int argc, char ** argv) +{ + pid_t pid; + int mode, status; + int sk_pair[2], sk; + char c = 'K'; + + test_init(argc, argv); + + if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) { + pr_perror("socketpair"); + return -1; + } + + pid = fork(); + if (pid < 0) { + pr_perror("fork"); + return -1; + } + + if (pid == 0) { + sk = sk_pair[1]; + close(sk_pair[0]); + zdtm_seccomp = 1; + + if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT) < 0) { + pr_perror("prctl failed"); + return -1; + } + test_msg("SECCOMP_MODE_STRICT is enabled\n"); + + if (write(sk, &c, 1) != 1) { + pr_perror("write"); + return -1; + } + if (read(sk, &c, 1) != 1) { + pr_perror("read"); /* report first: nothing placed after _exit() runs */ + _exit(1); + return -1; + } + + syscall(__NR_exit, 0); + } + + sk = sk_pair[0]; + close(sk_pair[1]); + + if (read(sk, &c, 1) != 1) { + pr_perror("read"); + goto err; + } + + test_daemon(); + test_waitsig(); + + mode = get_seccomp_mode(pid); + if (write(sk, &c, 1) != 1) { + pr_perror("write"); + goto err; + } + if (waitpid(pid, &status, 0) != pid) { + pr_perror("waitpid"); + exit(1); + } + if (status != 0) { + pr_err("The child exited with an unexpected code %d\n", status); /* a wait status, no errno involved */ + exit(1); + } + if (mode != SECCOMP_MODE_STRICT) { + fail("seccomp mode mismatch %d\n", mode); + return 1; + } + + pass(); + + return 0; +err: + kill(pid, SIGKILL); + return 1; +} + +#else /* __NR_seccomp */ + +#define TEST_SKIP_REASON "incompatible kernel (no seccomp)" +#include "skip-me.c" + +#endif /* __NR_seccomp */ diff
--git a/CRIU_code/test/zdtm/static/seccomp_strict.desc b/CRIU_code/test/zdtm/static/seccomp_strict.desc new file mode 100644 index 0000000..9c58186 --- /dev/null +++ b/CRIU_code/test/zdtm/static/seccomp_strict.desc @@ -0,0 +1 @@ +{'flags': 'suid', 'feature': 'seccomp_suspend'} diff --git a/CRIU_code/test/zdtm/static/selfexe00.c b/CRIU_code/test/zdtm/static/selfexe00.c new file mode 100644 index 0000000..bc61172 --- /dev/null +++ b/CRIU_code/test/zdtm/static/selfexe00.c @@ -0,0 +1,60 @@ +/* + * A simple testee program with threads + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "zdtmtst.h" + +#define gettid() pthread_self() + +const char *test_doc = "Check if /proc/self/exe points to same location after restore\n"; +const char *test_author = "Cyrill Gorcunov +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +/* Enabling the right policy happens in selinux00.hook and selinux00.checkskip */ + +const char *test_doc = "Check that a SELinux profile is restored"; +const char *test_author = "Adrian Reber "; + +/* This is all based on Tycho's apparmor code */ + +#define CONTEXT "unconfined_u:unconfined_r:unconfined_dbusd_t:s0" + +/* + * This is used to store the state of SELinux. For this test + * SELinux is switched to permissive mode and later the previous + * SELinux state is restored.
+ */ +char state; + +int check_for_selinux() +{ + if (access("/sys/fs/selinux", F_OK) == 0) + return 0; + return 1; +} + +int setprofile() +{ + int fd, len; + + fd = open("/proc/self/attr/current", O_WRONLY); + if (fd < 0) { + fail("Could not open /proc/self/attr/current\n"); + return -1; + } + + len = write(fd, CONTEXT, strlen(CONTEXT)); + close(fd); + + if (len < 0) { + fail("Could not write context\n"); + return -1; + } + + return 0; +} + +int checkprofile() +{ + int fd; + char context[1024] = {0}; /* zeroed so the %s in the mismatch message sees a terminated string */ + int len; + + + fd = open("/proc/self/attr/current", O_RDONLY); + if (fd < 0) { + fail("Could not open /proc/self/attr/current\n"); + return -1; + } + + len = read(fd, context, strlen(CONTEXT)); + close(fd); + if (len != strlen(CONTEXT)) { + fail("SELinux context has unexpected length %d, expected %zu\n", + len, strlen(CONTEXT)); + return -1; + } + + if (strncmp(context, CONTEXT, strlen(CONTEXT)) != 0) { + fail("Wrong SELinux context %s expected %s\n", context, CONTEXT); + return -1; + } + + return 0; +} + +int check_sockcreate() +{ + char *output = NULL; + FILE *f = fopen("/proc/self/attr/sockcreate", "r"); + int ret = fscanf(f, "%ms", &output); + fclose(f); + + if (ret >= 1) { + free(output); + /* sockcreate should be empty, if fscanf found something + * it is wrong.*/ + fail("sockcreate should be empty\n"); + return -1; + } + + if (output) { + free(output); + /* Same here, output should still be NULL.
*/ + fail("sockcreate should be empty\n"); + return -1; + } + + return 0; +} + +int main(int argc, char **argv) +{ + test_init(argc, argv); + + if (check_for_selinux()) { + skip("SELinux not found on this system."); + test_daemon(); + test_waitsig(); + pass(); + return 0; + } + + if (check_sockcreate()) + return -1; + + if (setprofile()) + return -1; + + if (check_sockcreate()) + return -1; + + test_daemon(); + test_waitsig(); + + if (check_sockcreate()) + return -1; + + if (checkprofile() == 0) + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/selinux00.checkskip b/CRIU_code/test/zdtm/static/selinux00.checkskip new file mode 100644 index 0000000..8d946a7 --- /dev/null +++ b/CRIU_code/test/zdtm/static/selinux00.checkskip @@ -0,0 +1,25 @@ +#!/bin/bash + +test -d /sys/fs/selinux || exit 1 + +# See selinux00.hook for details + +getsebool unconfined_dyntrans_all > /dev/null 2>&1 +RESULT=$? +BOOLEAN=0 + +if [ "$RESULT" = "0" ]; then + BOOLEAN=1 +fi + +if [ "$BOOLEAN" = "1" ]; then + getsebool unconfined_dyntrans_all | grep off -q + RESULT=$? + echo $RESULT > /tmp/zdtm.selinux.state + if [ "$RESULT" = "0" ]; then + setsebool -P unconfined_dyntrans_all 1 + fi +else + cat /sys/fs/selinux/enforce > /tmp/zdtm.selinux.state + setenforce 0 +fi diff --git a/CRIU_code/test/zdtm/static/selinux00.desc b/CRIU_code/test/zdtm/static/selinux00.desc new file mode 100644 index 0000000..63df42a --- /dev/null +++ b/CRIU_code/test/zdtm/static/selinux00.desc @@ -0,0 +1 @@ +{'flavor': 'h'} diff --git a/CRIU_code/test/zdtm/static/selinux00.hook b/CRIU_code/test/zdtm/static/selinux00.hook new file mode 100644 index 0000000..300766e --- /dev/null +++ b/CRIU_code/test/zdtm/static/selinux00.hook @@ -0,0 +1,32 @@ +#!/bin/sh + +# This script configures SELinux in such a way to enable the +# test 'selinux00' to be able to dyntransition from one +# SELinux context to another, as well as CRIU to change the +# context of a restored process. 
+# If a new enough selinux-policy is installed which includes +# https://github.com/fedora-selinux/selinux-policy/commit/2d537cabbb2df614ea598ac20873c653cbf271a8 +# then the boolean 'unconfined_dyntrans_all' will be changed +# to enable this test. If that boolean is not available, +# this just does 'setenforce 0'. + +# also see selinux00.checkskip + +getsebool unconfined_dyntrans_all > /dev/null 2>&1 +RESULT=$? +BOOLEAN=0 + +if [ "$RESULT" = "0" ]; then + BOOLEAN=1 +fi + +[ "$1" = "--post-restore" ] && { + if [ "$BOOLEAN" = "1" ]; then + setsebool -P unconfined_dyntrans_all `cat /tmp/zdtm.selinux.state` + else + setenforce `cat /tmp/zdtm.selinux.state` + rm -f /tmp/zdtm.selinux.state + fi +} + +exit 0 diff --git a/CRIU_code/test/zdtm/static/selinux01.c b/CRIU_code/test/zdtm/static/selinux01.c new file mode 100644 index 0000000..9966455 --- /dev/null +++ b/CRIU_code/test/zdtm/static/selinux01.c @@ -0,0 +1,200 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +/* Enabling the right policy happens in selinux00.hook and selinux00.checkskip */ + +const char *test_doc = "Check that a SELinux socket context is restored"; +const char *test_author = "Adrian Reber "; + +/* This is all based on Tycho's apparmor code */ + +#define CONTEXT "unconfined_u:unconfined_r:unconfined_dbusd_t:s0" + +/* + * This is used to store the state of SELinux. For this test + * SELinux is switched to permissive mode and later the previous + * SELinux state is restored.
+ */ +char state; + +int check_for_selinux() +{ + if (access("/sys/fs/selinux", F_OK) == 0) + return 0; + return 1; +} + +int setprofile() +{ + int fd, len; + + fd = open("/proc/self/attr/current", O_WRONLY); + if (fd < 0) { + fail("Could not open /proc/self/attr/current\n"); + return -1; + } + + len = write(fd, CONTEXT, strlen(CONTEXT)); + close(fd); + + if (len < 0) { + fail("Could not write context\n"); + return -1; + } + + return 0; +} + +int set_sockcreate() +{ + int fd, len; + + fd = open("/proc/self/attr/sockcreate", O_WRONLY); + if (fd < 0) { + fail("Could not open /proc/self/attr/sockcreate\n"); + return -1; + } + + len = write(fd, CONTEXT, strlen(CONTEXT)); + close(fd); + + if (len < 0) { + fail("Could not write context\n"); + return -1; + } + + return 0; +} + +int check_sockcreate() +{ + int fd; + char context[1024] = {0}; /* zeroed so the %s in the mismatch message sees a terminated string */ + int len; + + + fd = open("/proc/self/attr/sockcreate", O_RDONLY); + if (fd < 0) { + fail("Could not open /proc/self/attr/sockcreate\n"); + return -1; + } + + len = read(fd, context, strlen(CONTEXT)); + close(fd); + if (len != strlen(CONTEXT)) { + fail("SELinux context has unexpected length %d, expected %zu\n", + len, strlen(CONTEXT)); + return -1; + } + + if (strncmp(context, CONTEXT, strlen(CONTEXT)) != 0) { + fail("Wrong SELinux context %s expected %s\n", context, CONTEXT); + return -1; + } + + return 0; +} + +int check_sockcreate_empty() +{ + char *output = NULL; + FILE *f = fopen("/proc/self/attr/sockcreate", "r"); + int ret = fscanf(f, "%ms", &output); + fclose(f); + + if (ret >= 1) { + free(output); + /* sockcreate should be empty, if fscanf found something + * it is wrong.*/ + fail("sockcreate should be empty\n"); + return -1; + } + + if (output) { + free(output); + /* Same here, output should still be NULL.
*/ + fail("sockcreate should be empty\n"); + return -1; + } + + return 0; +} + +int main(int argc, char **argv) +{ + char ctx[1024]; + test_init(argc, argv); + + if (check_for_selinux()) { + skip("SELinux not found on this system."); + test_daemon(); + test_waitsig(); + pass(); + return 0; + } + +#ifdef USING_SOCKCREATE + if (set_sockcreate()) + return -1; +#else + if (check_sockcreate_empty()) + return -1; + + if (setprofile()) + return -1; + + if (check_sockcreate_empty()) + return -1; +#endif + + /* Open our test socket */ + int sk = socket(AF_INET, SOCK_STREAM, 0); + memset(ctx, 0, 1024); + /* Read out the socket label */ + if (fgetxattr(sk, "security.selinux", ctx, 1024) == -1) { + fail("Reading xattr 'security.selinux' failed.\n"); + return -1; + } + if (strncmp(ctx, CONTEXT, strlen(CONTEXT)) != 0) { + fail("Wrong SELinux context %s expected %s\n", ctx, CONTEXT); + return -1; + } + memset(ctx, 0, 1024); + + test_daemon(); + test_waitsig(); + + /* Read out the socket label again */ + + if (fgetxattr(sk, "security.selinux", ctx, 1024) == -1) { + fail("Reading xattr 'security.selinux' failed.\n"); + return -1; + } + if (strncmp(ctx, CONTEXT, strlen(CONTEXT)) != 0) { + fail("Wrong SELinux context %s expected %s\n", ctx, CONTEXT); + return -1; + } + +#ifdef USING_SOCKCREATE + if (check_sockcreate()) + return -1; +#else + if (check_sockcreate_empty()) + return -1; +#endif + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/selinux01.checkskip b/CRIU_code/test/zdtm/static/selinux01.checkskip new file mode 100644 index 0000000..e8a1724 --- /dev/null +++ b/CRIU_code/test/zdtm/static/selinux01.checkskip @@ -0,0 +1 @@ +selinux00.checkskip \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/selinux01.desc b/CRIU_code/test/zdtm/static/selinux01.desc new file mode 100644 index 0000000..2d2961a --- /dev/null +++ b/CRIU_code/test/zdtm/static/selinux01.desc @@ -0,0 +1 @@ +selinux00.desc \ No newline at end of file diff --git 
a/CRIU_code/test/zdtm/static/selinux01.hook b/CRIU_code/test/zdtm/static/selinux01.hook new file mode 100644 index 0000000..dd7ed6b --- /dev/null +++ b/CRIU_code/test/zdtm/static/selinux01.hook @@ -0,0 +1 @@ +selinux00.hook \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/selinux02.c b/CRIU_code/test/zdtm/static/selinux02.c new file mode 100644 index 0000000..5702677 --- /dev/null +++ b/CRIU_code/test/zdtm/static/selinux02.c @@ -0,0 +1 @@ +selinux01.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/selinux02.checkskip b/CRIU_code/test/zdtm/static/selinux02.checkskip new file mode 100644 index 0000000..2696e6e --- /dev/null +++ b/CRIU_code/test/zdtm/static/selinux02.checkskip @@ -0,0 +1 @@ +selinux01.checkskip \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/selinux02.desc b/CRIU_code/test/zdtm/static/selinux02.desc new file mode 100644 index 0000000..9c6802c --- /dev/null +++ b/CRIU_code/test/zdtm/static/selinux02.desc @@ -0,0 +1 @@ +selinux01.desc \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/selinux02.hook b/CRIU_code/test/zdtm/static/selinux02.hook new file mode 100644 index 0000000..e3ea0a6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/selinux02.hook @@ -0,0 +1 @@ +selinux01.hook \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/sem.c b/CRIU_code/test/zdtm/static/sem.c new file mode 100644 index 0000000..1db8f00 --- /dev/null +++ b/CRIU_code/test/zdtm/static/sem.c @@ -0,0 +1,186 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc="Tests IPC semaphores migrates fine"; +const char *test_author="Stanislav Kinsbursky "; + +static int sem_test(int id, + struct sembuf *lock, struct sembuf *unlock, + int lock_ops, int unlock_ops) +{ + if (semop(id, lock, lock_ops) == -1) { + fail("Failed to lock semaphore"); + return -errno; + } + if (semop(id, unlock, 
unlock_ops) == -1) { + fail("Failed to unlock semaphore"); + return -errno; + } + return 0; +} + +#define NSEMS 10 + +static int check_sem_by_key(int key, int num) +{ + int id; + struct sembuf lock[2] = { + { + .sem_num = num, + .sem_op = 0, + .sem_flg = 0, + }, + { + .sem_num = num, + .sem_op = 1, + .sem_flg = 0, + }, + }; + struct sembuf unlock[1] = { + { + .sem_num = num, + .sem_op = -1, + .sem_flg = 0, + } + }; + int val; + + id = semget(key, NSEMS, 0777); + if (id == -1) { + fail("Can't get sem"); + return -errno; + } + + val = semctl(id, num, GETVAL); + if (val < 0) { + fail("Failed to get sem value"); + return -errno; + } + + return sem_test(id, lock, unlock, + sizeof(lock)/sizeof(struct sembuf), + sizeof(unlock)/sizeof(struct sembuf)); +} + +static int check_sem_by_id(int id, int num, int val) +{ + int curr; + struct sembuf lock[] = { + { + .sem_num = num, + .sem_op = val, + .sem_flg = 0, + }, + }; + struct sembuf unlock[] = { + { + .sem_num = num, + .sem_op = - val * 2, + .sem_flg = 0, + } + }; + + curr = semctl(id, num, GETVAL); + if (curr < 0) { + fail("Failed to get sem value"); + return -errno; + } + if (curr != val) { + fail("Sem has wrong value: %d instead of %d\n", curr, val); + return -EFAULT; + } + return sem_test(id, lock, unlock, + sizeof(lock)/sizeof(struct sembuf), + sizeof(unlock)/sizeof(struct sembuf)); +} + +int main(int argc, char **argv) +{ + int id, key; + int i; + /* See man semctl */ + union semun { + int val; + struct semid_ds *buf; + unsigned short *array; + struct seminfo *__buf; + } val[NSEMS]; + int ret, fail_count = 0; + + test_init(argc, argv); + + key = ftok(argv[0], 89063453); + if (key == -1) { + pr_perror("Can't make key"); + return -1; + } + + id = semget(key, NSEMS, 0777 | IPC_CREAT | IPC_EXCL); + if (id == -1) { + fail_count++; + pr_perror("Can't get sem array"); + goto out; + } + + for (i = 0; i < NSEMS; i++) { + val[i].val = lrand48() & 0x7; + + if (semctl(id, i, SETVAL, val[i]) == -1) { + fail_count++; + 
pr_perror("Can't init sem %d", i); + goto out_destroy; + } + } + + test_daemon(); + test_waitsig(); + + for (i = 0; i < NSEMS; i++) { + ret = check_sem_by_id(id, i, val[i].val); + if (ret < 0) { + fail_count++; + fail("Check sem %d by id failed", i); + goto out_destroy; + } + + if (check_sem_by_key(key, i) < 0) { + fail("Check sem %d by key failed", i); + fail_count++; + goto out_destroy; + } + + val[i].val = semctl(id, i, GETVAL); /* read back semaphore i itself, not always semaphore 0 */ + if (val[i].val < 0) { + fail("Failed to get sem %d value", i); + fail_count++; + goto out_destroy; + } + if (val[i].val != 0) { + fail("Non-zero sem %d value: %d", i, val[i].val); + fail_count++; + } + } + +out_destroy: + ret = semctl(id, 0, IPC_RMID); + if (ret < 0) { + fail("Destroy sem array failed"); + fail_count++; + } +out: + if (fail_count == 0) + pass(); + return fail_count; +} diff --git a/CRIU_code/test/zdtm/static/sem.desc b/CRIU_code/test/zdtm/static/sem.desc new file mode 100644 index 0000000..6c4afe5 --- /dev/null +++ b/CRIU_code/test/zdtm/static/sem.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns'} diff --git a/CRIU_code/test/zdtm/static/session00.c b/CRIU_code/test/zdtm/static/session00.c new file mode 100644 index 0000000..4a239e1 --- /dev/null +++ b/CRIU_code/test/zdtm/static/session00.c @@ -0,0 +1,236 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test that sid, pgid are restored"; +const char *test_author = "Andrey Vagin "; + +#define DETACH 1 +#define NEWSID 2 +#define CHANGESID 4 +#define DOUBLE_CHANGESID 8 + +struct testcase { + int flags; + pid_t pid; + pid_t sid; +}; + +static struct testcase testcases[] = { + {DETACH, }, + {NEWSID, }, + {0, }, + {DETACH|NEWSID, }, + {CHANGESID, }, + {DOUBLE_CHANGESID | CHANGESID, } + }; +/* + 2 2 session00 + 4 4 \_ session00 # {NEWSID, }, + 2 5 \_ session00 # {0, }, + 8 8 \_ session00 + 2 9 | \_ session00 # {CHANGESID, } + 10 10 \_ session00 + 11 11 \_ session00 + 2 12 \_ session00 # {DOUBLE_CHANGESID | CHANGESID, } + 2 3 session00 # {DETACH, }, + 6
7 session00 # {DETACH|NEWSID, }, +*/ + +#define NUM_CASES (sizeof(testcases) / sizeof(struct testcase)) + +static int fork_child(int i) +{ + int p[2]; + int status, ret; + pid_t pid, sid; + + ret = pipe(p); + if (ret) { + pr_perror("pipe() failed"); + return 1; + } + + pid = test_fork(); + if (pid < 0) { + pr_perror("Can't fork"); + return 1; + } + + if (pid == 0) { + if (testcases[i].flags & NEWSID) { + sid = setsid(); + if (sid == -1) { + pr_perror("setsid failed"); + write(p[1], &sid, sizeof(sid)); + exit(1); + } + } + + if (testcases[i].flags & (DETACH | CHANGESID)) { + pid = test_fork(); + if (pid < 0) { + write(p[1], &pid, sizeof(pid)); + exit(1); + } + } + + if (pid != 0) { + if (!(testcases[i].flags & CHANGESID)) + exit(0); + + sid = setsid(); + if (sid == -1) { + pr_perror("setsid failed"); + write(p[1], &sid, sizeof(sid)); + exit(1); + } + + close(p[1]); + wait(NULL); + if (getsid(0) != sid) { + fail("The process %d (%x) has SID=%d (expected %d)", + getpid(), testcases[i].flags, getsid(0), sid); /* actual SID first, then the one setsid() returned */ + exit(1); + } + exit(0); + } + + if (testcases[i].flags & DOUBLE_CHANGESID) { + pid = fork(); + if (pid < 0) { + write(p[1], &pid, sizeof(pid)); + exit(1); + } + + if (pid == 0) + goto child; + + sid = setsid(); + if (sid == -1) { + pr_perror("setsid failed"); + write(p[1], &sid, sizeof(sid)); + exit(1); + } + + close(p[1]); + wait(NULL); + if (getsid(0) != sid) { + fail("The process %d (%x) has SID=%d (expected %d)", + getpid(), testcases[i].flags, getsid(0), sid); /* actual SID first, then the one setsid() returned */ + exit(1); + } + exit(0); + } + +child: + pid = getpid(); + write(p[1], &pid, sizeof(pid)); + close(p[1]); + + test_waitsig(); + pass(); + exit(0); + } + + close(p[1]); + + if (testcases[i].flags & DETACH) { + pid_t ret; + ret = wait(&status); + if (ret != pid) { + pr_perror("wait return %d instead of %d", ret, pid); + kill(pid, SIGKILL); + return 1; + } + } + + ret = read(p[0], &testcases[i].pid, sizeof(pid)); + if (ret != sizeof(pid)) { /* compare with the size that was requested, not sizeof(ret) */ + pr_perror("read failed"); + return 1; + } + /* wait when
a child closes fd */ + ret = read(p[0], &testcases[i].pid, sizeof(pid)); + if (ret != 0) { + pr_perror("read failed"); + return 1; + } + + close(p[0]); + + if (testcases[i].pid < 0) { + pr_perror("child failed"); + return 1; + } + + testcases[i].sid = getsid(testcases[i].pid); + + return 0; +} + +int main(int argc, char ** argv) +{ + int i, ret, err = 0, status; + pid_t pid; + + test_init(argc, argv); + + for (i = 0; i < NUM_CASES; i++) + if (fork_child(i)) + break; + + if (i != NUM_CASES) { + int j; + for (j = 0; j < i; j++) + kill(testcases[j].pid, SIGTERM); + return 1; + } + + test_daemon(); + + test_waitsig(); + + for (i = 0; i < NUM_CASES; i++) { + pid_t pid = testcases[i].pid; + pid_t sid = getsid(pid); + + if (sid != testcases[i].sid) { + fail("The process %d (%x) has SID=%d (expected %d)", + pid, testcases[i].flags, sid, testcases[i].sid); + err++; + } + + ret = kill(pid, SIGKILL); + if (ret == -1) { + pr_perror("kill failed"); + err++; + } + waitpid(pid, NULL, 0); + + if (testcases[i].flags & CHANGESID) { + pid = wait(&status); + if (pid == -1) { + pr_perror("wait() failed"); + err++; + } + if (!WIFEXITED(status) || WEXITSTATUS(status)) { + fail("The process with pid %d returns %d\n", pid, status); + err++; + } + } + } + + pid = wait(&status); + if (pid != -1 || errno != ECHILD) { + pr_perror("%d isn't waited", pid); + err++; + } + + if (!err) + pass(); + + return err > 0; +} diff --git a/CRIU_code/test/zdtm/static/session00.desc b/CRIU_code/test/zdtm/static/session00.desc new file mode 100644 index 0000000..6c4afe5 --- /dev/null +++ b/CRIU_code/test/zdtm/static/session00.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns'} diff --git a/CRIU_code/test/zdtm/static/session01.c b/CRIU_code/test/zdtm/static/session01.c new file mode 100644 index 0000000..48cfb26 --- /dev/null +++ b/CRIU_code/test/zdtm/static/session01.c @@ -0,0 +1,337 @@ +#include +#include +#include +#include +#include + +#include "zdtmtst.h" +#include "lock.h" + +const char *test_doc = "Test that sid, 
pgid are restored"; +const char *test_author = "Andrey Vagin "; + +struct master { + pid_t pid; + pid_t ppid; + pid_t sid; + pid_t pgid; +}; + +struct testcase { + pid_t pid; + pid_t ppid; + pid_t sid; + pid_t born_sid; + pid_t pgid; + int alive; + struct master master; + futex_t futex; +}; + +enum { + TEST_FORK, + TEST_PGID, + TEST_WAIT, + TEST_MASTER, + TEST_CHECK, + TEST_EXIT, +}; + +static struct testcase *testcases; +static futex_t *fstate; +static struct testcase __testcases[] = { + { 2, 1, 2, 1, 2, 1 }, /* session00 */ + { 4, 2, 4, 2, 4, 1 }, /* |\_session00 */ + {15, 4, 4, 4, 15, 1 }, /* | |\_session00 */ + {16, 4, 4, 4, 15, 1 }, /* | \_session00 */ + {17, 4, 4, 4, 17, 0 }, /* | |\_session00 */ + {18, 4, 4, 4, 17, 1 }, /* | \_session00 */ + { 5, 2, 2, 2, 2, 1 }, /* |\_session00 */ + { 8, 2, 8, 2, 8, 1 }, /* |\_session00 */ + { 9, 8, 2, 2, 2, 1 }, /* | \_session00 */ + {10, 2, 10, 2, 10, 1 }, /* |\_session00 */ + {11, 10, 11, 2, 11, 1 }, /* | \_session00 */ + {12, 11, 2, 2, 2, 1 }, /* | \_session00 */ + {13, 2, 2, 2, 2, 0 }, /* \_session00 */ + { 3, 13, 2, 2, 2, 1 }, /* session00 */ + { 6, 2, 6, 2, 6, 0 }, /* \_session00 */ + {14, 6, 6, 6, 6, 1 }, /* session00 */ +}; + +#define TESTS (sizeof(__testcases) / sizeof(struct testcase)) + +#define check(n, a, b) do { if ((a) != (b)) { pr_perror("%s mismatch %d != %d", n, a, b); goto err; } } while (0) + +static int child(const int c); +static int fork_children(struct testcase *t, int leader) +{ + int i; + pid_t cid; + + for (i = 0; i < TESTS; i++) { + if (t->pid != testcases[i].ppid) + continue; + + if (leader ^ (t->pid == testcases[i].born_sid)) + continue; + + cid = test_fork_id(i); + if (cid < 0) + goto err; + if (cid == 0) { + test_msg("I'm %d with pid %d\n", i, getpid()); + child(i); + exit(0); + } + + testcases[i].master.pid = cid; + } + return 0; +err: + return -1; +} + +static int child(const int c) +{ + int i; + struct testcase *t = &testcases[c]; + + t->master.pid = getpid(); + + if (fork_children(t, 0)) 
+ goto err; + + if (t->pid == t->sid) { + if (getpid() != getsid(0)) + if (setsid() < 0) + goto err; + if (fork_children(t, 1)) + goto err; + } + if (t->pid == t->pgid) { + if (getpid() != getpgid(0)) + if (setpgid(getpid(), getpid()) < 0) { + pr_perror("setpgid() failed"); + goto err; + } + t->master.pgid = t->master.pid; + } + + futex_set_and_wake(&t->futex, c); + + if (c == 0) + goto out; + + futex_wait_until(fstate, TEST_PGID); + + for (i = 0; i < TESTS; i++) { + if (c == 0) + break; + if (t->pgid != testcases[i].pid) + continue; + if (getpgid(0) != testcases[i].master.pid) + if (setpgid(getpid(), testcases[i].master.pid) < 0) { + pr_perror("setpgid() failed (%d) (%d)", c, i); + goto err; + } + + t->master.pgid = testcases[i].master.pid; + break; + } + + futex_set_and_wake(&t->futex, c); + + futex_wait_until(fstate, TEST_WAIT); + + for (i = 0; i < TESTS; i++) { + if (t->pid != testcases[i].ppid) + continue; + if (testcases[i].alive) + continue; + test_msg("Wait porcess %d (pid %d)\n", i, testcases[i].master.pid); + waitpid(testcases[i].master.pid, NULL, 0); + } + + if (!t->alive) + goto out; + + futex_set_and_wake(&t->futex, c); + + futex_wait_until(fstate, TEST_MASTER); + + /* Save the master copy */ + t->master.ppid = getppid(); + t->master.sid = getsid(0); + + futex_set_and_wake(&t->futex, c); + + futex_wait_until(fstate, TEST_CHECK); + + check("pid", t->master.pid, getpid()); + check("ppid", t->master.ppid, getppid()); + check("sid", t->master.sid, getsid(0)); + check("pgid", t->master.pgid, getpgid(0)); + + futex_set_and_wake(&t->futex, c); + + /* Wait while all test cases check results */ + futex_wait_until(fstate, TEST_EXIT); +out: + return 0; +err: + futex_set_and_wake(&t->futex, -1); + return 1; +} + +int main(int argc, char ** argv) +{ + int i, err, ret; + void *ptr; + + BUG_ON(sizeof(*fstate) + sizeof(__testcases) > 4096); + + ptr = mmap(NULL, 4096, PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (ptr == MAP_FAILED) + return 1; + + 
fstate = ptr; + futex_set(fstate, TEST_FORK); + testcases = ptr + sizeof(*fstate); + + memcpy(testcases, &__testcases, sizeof(__testcases)); + + test_init(argc, argv); + + testcases[0].master.pid = getpid(); + if (child(0)) + goto err; + + for (i = 1; i < TESTS; i++) { + ret = futex_wait_while(&testcases[i].futex, 0); + if (ret < 0) + return 1; + futex_set(&testcases[i].futex, 0); + } + + test_msg("TEST_PGID\n"); + futex_set_and_wake(fstate, TEST_PGID); + for (i = 1; i < TESTS; i++) { + ret = futex_wait_while(&testcases[i].futex, 0); + if (ret < 0) + goto err; + futex_set(&testcases[i].futex, 0); + } + + test_msg("TEST_WAIT\n"); + futex_set_and_wake(fstate, TEST_WAIT); + for (i = 1; i < TESTS; i++) { + if (!testcases[i].alive) + continue; + ret = futex_wait_while(&testcases[i].futex, 0); + if (ret < 0) + goto err; + futex_set(&testcases[i].futex, 0); + } + + for (i = 0; i < TESTS; i++) { + if (testcases[0].pid != testcases[i].ppid) + continue; + if (testcases[i].alive) + continue; + test_msg("Wait porcess %d (pid %d)\n", + i, testcases[i].master.pid); + waitpid(testcases[i].master.pid, NULL, 0); + } + + test_msg("TEST_MASTER\n"); + futex_set_and_wake(fstate, TEST_MASTER); + for (i = 1; i < TESTS; i++) { + if (!testcases[i].alive) + continue; + ret = futex_wait_while(&testcases[i].futex, 0); + if (ret < 0) + goto err; + futex_set(&testcases[i].futex, 0); + test_msg("The process %d initialized\n", ret); + } + + test_daemon(); + + test_waitsig(); + + err = 0; + for (i = 1; i < TESTS; i++) { + int j; + struct testcase *t = testcases + i; + pid_t sid, pgid; + + if (!t->alive) + continue; + + for (j = 0; j < TESTS; j++) { + struct testcase *p = testcases + j; + /* sanity check */ + if (p->pid == t->sid && t->master.sid != p->master.pid) { + pr_perror("session mismatch (%d) %d != (%d) %d", + i, t->master.sid, j, p->master.pid); + err++; + } + if (p->pid == t->pgid && t->master.pgid != p->master.pid) { + pr_perror("pgid mismatch (%d) %d != (%d) %d", + i, t->master.pgid, j, 
p->master.pid); + err++; + } + } + + sid = getsid(t->master.pid); + if (t->master.sid != sid) { + pr_perror("%d: session mismatch %d (expected %d)", + i, sid, t->master.sid); + err++; + } + + pgid = getpgid(t->master.pid); + if (t->master.pgid != pgid) { + pr_perror("%d: pgid mismatch %d (expected %d)", + i, pgid, t->master.pgid); /* actual first, expected second, as in the session message */ + err++; + } + } + + test_msg("TEST_CHECK\n"); + futex_set_and_wake(fstate, TEST_CHECK); + + for (i = 1; i < TESTS; i++) { + if (!testcases[i].alive) + continue; + + ret = futex_wait_while(&testcases[i].futex, 0); + if (ret < 0) + goto err; + futex_set(&testcases[i].futex, 0); + + if (ret < 0) { + fail("Someone failed"); + err++; + continue; + } + test_msg("The process %u is restored correctly\n", (unsigned)ret); + } + + test_msg("TEST_EXIT\n"); + futex_set_and_wake(fstate, TEST_EXIT); + + if (!err) + pass(); + + return 0; +err: + for (i = 1; i < TESTS; i++) { + pid_t pid = testcases[i].master.pid; + if (pid > 0) { + ret = kill(pid, SIGKILL); + test_msg("kill %d %s\n", pid, strerror(ret == -1 ?
errno : 0)); + } + } + return 1; +} diff --git a/CRIU_code/test/zdtm/static/session01.desc b/CRIU_code/test/zdtm/static/session01.desc new file mode 100644 index 0000000..6c4afe5 --- /dev/null +++ b/CRIU_code/test/zdtm/static/session01.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns'} diff --git a/CRIU_code/test/zdtm/static/session02.c b/CRIU_code/test/zdtm/static/session02.c new file mode 100644 index 0000000..37f245d --- /dev/null +++ b/CRIU_code/test/zdtm/static/session02.c @@ -0,0 +1,327 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Create a crazy process tree"; +const char *test_author = "Andrew Vagin "; + +struct process +{ + pid_t pid; + pid_t sid; + int sks[2]; + int dead; +}; + +struct process *processes; +int nr_processes = 20; +int current = 0; + +static void cleanup() +{ + int i; + + for (i = 0; i < nr_processes; i++) { + if (processes[i].dead) + continue; + if (processes[i].pid <= 0) + continue; + + kill(processes[i].pid, SIGKILL); + } +} + +enum commands +{ + TEST_FORK, + TEST_WAIT, + TEST_SUBREAPER, + TEST_SETSID, + TEST_DIE +}; + +struct command +{ + enum commands cmd; + int arg1; + int arg2; +}; + +static void handle_command(); + +static void mainloop() +{ + while (1) + handle_command(); +} + +#define CLONE_STACK_SIZE 4096 +/* All arguments should be above stack, because it grows down */ +struct clone_args { + char stack[CLONE_STACK_SIZE] __stack_aligned__; + char stack_ptr[0]; + int id; +}; + +static int clone_func(void *_arg) +{ + struct clone_args *args = (struct clone_args *) _arg; + + current = args->id; + + test_msg("%3d: Hello. 
My pid is %d\n", args->id, getpid()); + mainloop(); + exit(0); +} + +static int make_child(int id, int flags) +{ + struct clone_args args; + pid_t cid; + + args.id = id; + + cid = clone(clone_func, args.stack_ptr, + flags | SIGCHLD, &args); + + if (cid < 0) + pr_perror("clone(%d, %d)", id, flags); + + processes[id].pid = cid; + + return cid; +} + +static void handle_command() +{ + int sk = processes[current].sks[0], ret, status = 0; + struct command cmd; + + ret = read(sk, &cmd, sizeof(cmd)); + if (ret != sizeof(cmd)) { + pr_perror("Unable to get command"); + goto err; + } + + switch (cmd.cmd) { + case TEST_FORK: + { + pid_t pid; + + pid = make_child(cmd.arg1, cmd.arg2); + if (pid == -1) { + status = -1; + goto err; + } + + test_msg("%3d: fork(%d, %x) = %d\n", + current, cmd.arg1, cmd.arg2, pid); + processes[cmd.arg1].pid = pid; + } + break; + case TEST_WAIT: + test_msg("%3d: wait(%d) = %d\n", current, + cmd.arg1, processes[cmd.arg1].pid); + + if (waitpid(processes[cmd.arg1].pid, NULL, 0) == -1) { + pr_perror("waitpid(%d)", processes[cmd.arg1].pid); + status = -1; + } + break; + case TEST_SUBREAPER: + test_msg("%3d: subreaper(%d)\n", current, cmd.arg1); + if (prctl(PR_SET_CHILD_SUBREAPER, cmd.arg1, 0, 0, 0) == -1) { + pr_perror("PR_SET_CHILD_SUBREAPER"); + status = -1; + } + break; + case TEST_SETSID: + test_msg("%3d: setsid()\n", current); + if(setsid() == -1) { + pr_perror("setsid"); + status = -1; + } + break; + case TEST_DIE: + test_msg("%3d: die()\n", current); + processes[current].dead = 1; + shutdown(sk, SHUT_RDWR); + exit(0); + } + + ret = write(sk, &status, sizeof(status)); + if (ret != sizeof(status)) { + pr_perror("Unable to answer"); + goto err; + } + + if (status < 0) + goto err; + + return; +err: + shutdown(sk, SHUT_RDWR); + exit(1); +} + +static int send_command(int id, enum commands op, int arg1, int arg2) +{ + int sk = processes[id].sks[1], ret, status; + struct command cmd = {op, arg1, arg2}; + + if (op == TEST_FORK) { + if (processes[arg1].pid) { 
+ pr_perror("%d is busy", arg1); + return -1; + } + } + + ret = write(sk, &cmd, sizeof(cmd)); + if (ret != sizeof(cmd)) { + pr_perror("Unable to send command"); + goto err; + } + + status = 0; + ret = read(sk, &status, sizeof(status)); + if (ret != sizeof(status) && !(status == 0 && op == TEST_DIE)) { + pr_perror("Unable to get answer"); + goto err; + } + + if (status) { + pr_perror("The command(%d, %d, %d) failed", op, arg1, arg2); + goto err; + } + + return 0; +err: + cleanup(); + exit(1); +} + +int main(int argc, char ** argv) +{ + int pid, i; + int fail_cnt = 0; + + test_init(argc, argv); + + processes = mmap(NULL, PAGE_SIZE, PROT_WRITE | PROT_READ, + MAP_SHARED | MAP_ANONYMOUS, 0, 0); + if (processes == NULL) { + pr_perror("Unable to map share memory"); + return 1; + } + + for (i = 0; i < nr_processes; i++) { + if (socketpair(PF_UNIX, SOCK_STREAM, 0, processes[i].sks) == -1) { + pr_perror("socketpair"); + return 1; + } + } + + pid = make_child(0, 0); + if (pid < 0) + return -1; + + /* + * 5 5 \_ session02 ( 0) + * 6 6 \_ session02 ( 1) + * 8 7 | \_ session02 ( 3) + * 15 12 | \_ session02 (10) + * 10 10 \_ session02 ( 5) + * 11 7 \_ session02 ( 6) + * 13 12 \_ session02 ( 8) + */ + + send_command(0, TEST_SUBREAPER, 1, 0); + send_command(0, TEST_SETSID, 0, 0); + + send_command(0, TEST_FORK, 1, 0); + send_command(1, TEST_FORK, 2, 0); + + send_command(2, TEST_SETSID, 0, 0); + send_command(2, TEST_FORK, 3, CLONE_PARENT); + send_command(2, TEST_DIE, 0, 0); + send_command(1, TEST_WAIT, 2, 0); + + send_command(3, TEST_FORK, 4, 0); + send_command(4, TEST_FORK, 5, 0); + send_command(5, TEST_FORK, 6, 0); + + send_command(5, TEST_FORK, 7, 0); + send_command(7, TEST_SETSID, 0, 0); + send_command(7, TEST_FORK, 8, CLONE_PARENT); + send_command(7, TEST_FORK, 9, CLONE_PARENT); + send_command(7, TEST_DIE, 0, 0); + send_command(5, TEST_WAIT, 7, 0); + + send_command(9, TEST_FORK, 10, 0); + send_command(1, TEST_SUBREAPER, 1, 0); + send_command(9, TEST_DIE, 0, 0); + send_command(5, 
TEST_WAIT, 9, 0); + send_command(1, TEST_SUBREAPER, 0, 0); + + send_command(4, TEST_DIE, 0, 0); + send_command(3, TEST_WAIT, 4, 0); + + send_command(1, TEST_SETSID, 0, 0); + send_command(5, TEST_SETSID, 0, 0); + + for (i = 0; i < nr_processes; i++) { + if (processes[i].dead) + continue; + if (processes[i].pid == 0) + continue; + + processes[i].sid = getsid(processes[i].pid); + if (processes[i].sid == -1) { + pr_perror("getsid(%d)", i); + goto err; + } + } + + test_daemon(); + + test_waitsig(); + + for (i = 0; i < nr_processes; i++) { + pid_t sid; + + if (processes[i].dead) + continue; + if (processes[i].pid == 0) + continue; + + sid = getsid(processes[i].pid); + if (sid == -1) { + pr_perror("getsid(%d)", i); + goto err; + } + + if (sid != processes[i].sid) { + fail("%d, %d: wrong sid %d (expected %d)", + i, processes[i].pid, sid, processes[i].sid); + fail_cnt++; + } + } + + if (fail_cnt) + goto err; + + pass(); + + return 0; +err: + cleanup(); + return 1; +} diff --git a/CRIU_code/test/zdtm/static/session02.desc b/CRIU_code/test/zdtm/static/session02.desc new file mode 100644 index 0000000..95c58b4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/session02.desc @@ -0,0 +1 @@ +{'flags': 'noauto'} diff --git a/CRIU_code/test/zdtm/static/session03.c b/CRIU_code/test/zdtm/static/session03.c new file mode 100644 index 0000000..2b3c46c --- /dev/null +++ b/CRIU_code/test/zdtm/static/session03.c @@ -0,0 +1,376 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Create a crazy process tree"; +const char *test_author = "Andrew Vagin "; + +struct process +{ + pid_t pid; + pid_t sid; + int sks[2]; + int dead; + int wait; +}; + +#define MEM_SIZE (2 * PAGE_SIZE) +#define PR_MAX (MEM_SIZE / sizeof(struct process)) + +struct process *processes; +int nr_processes = 0; +int current = 0; + +static void sigchld_handler(int signal, siginfo_t *siginfo, void *data) +{ + pid_t pid = siginfo->si_pid; 
+ if (siginfo->si_status == 2) + waitpid(pid, NULL, WNOHANG); +} + +static void cleanup() +{ + int i, ret; + + for (i = 0; i < nr_processes; i++) { + if (processes[i].dead) + continue; + if (processes[i].pid <= 0) + continue; + + kill(processes[i].pid, SIGKILL); + } + + while (1) { + ret = wait(NULL); + if (ret == -1) { + if (errno == ECHILD) + break; + pr_perror("wait"); + exit(1); + } + } +} + +enum commands +{ + TEST_FORK, + TEST_DIE_WAIT, + TEST_DIE, + TEST_SUBREAPER, + TEST_SETSID, + TEST_MAX +}; + +int cmd_weght[TEST_MAX] = {10, 3, 1, 10, 7}; +int sum_weight = 0; +static int get_rnd_op() +{ + int i, m; + if (sum_weight == 0) { + for (i = 0; i < TEST_MAX; i++) + sum_weight += cmd_weght[i]; + } + m = lrand48() % sum_weight; + for (i = 0; i < TEST_MAX; i++) { + if (m > cmd_weght[i]) { + m -= cmd_weght[i]; + continue; + } + return i; + } + return -1; +} + +struct command +{ + enum commands cmd; + int arg1; + int arg2; +}; + +static void handle_command(); + +static void mainloop() +{ + while (1) + handle_command(); +} + +#define CLONE_STACK_SIZE 4096 +/* All arguments should be above stack, because it grows down */ +struct clone_args { + char stack[CLONE_STACK_SIZE] __stack_aligned__; + char stack_ptr[0]; + int id; +}; + +static int clone_func(void *_arg) +{ + struct clone_args *args = (struct clone_args *) _arg; + + current = args->id; + + test_msg("%3d: Hello. 
My pid is %d\n", args->id, getpid()); + mainloop(); + exit(0); +} + +static int make_child(int id, int flags) +{ + struct clone_args args; + pid_t cid; + + args.id = id; + + cid = clone(clone_func, args.stack_ptr, + flags | SIGCHLD, &args); + + if (cid < 0) + pr_perror("clone(%d, %d)", id, flags); + + processes[id].pid = cid; + + return cid; +} + +static void handle_command() +{ + int sk = processes[current].sks[0], ret, status = 0; + struct command cmd; + + ret = read(sk, &cmd, sizeof(cmd)); + if (ret != sizeof(cmd)) { + pr_perror("Unable to get command"); + goto err; + } + + switch (cmd.cmd) { + case TEST_FORK: + { + pid_t pid; + + pid = make_child(cmd.arg1, cmd.arg2 ? CLONE_PARENT : 0); + if (pid < 0) { + status = -1; + goto err; + } + + test_msg("%3d: fork(%d, %x) = %d\n", + current, cmd.arg1, cmd.arg2, pid); + processes[cmd.arg1].pid = pid; + } + break; + case TEST_SUBREAPER: + test_msg("%3d: subreaper(%d)\n", current, cmd.arg1); + if (prctl(PR_SET_CHILD_SUBREAPER, cmd.arg1, 0, 0, 0) == -1) { + pr_perror("PR_SET_CHILD_SUBREAPER"); + status = -1; + } + break; + case TEST_SETSID: + if (getsid(0) == getpid()) + break; + test_msg("%3d: setsid()\n", current); + if(setsid() == -1) { + pr_perror("setsid"); + status = -1; + } + break; + case TEST_DIE_WAIT: + test_msg("%3d: wait()\n", current); + case TEST_DIE: + test_msg("%3d: die()\n", current); + processes[current].dead = 1; + shutdown(sk, SHUT_RDWR); + if (cmd.cmd == TEST_DIE_WAIT) + exit(2); + exit(0); + default: + pr_perror("Unknown operation %d", cmd.cmd); + status = -1; + break; + } + + ret = write(sk, &status, sizeof(status)); + if (ret != sizeof(status)) { + pr_perror("Unable to answer"); + goto err; + } + + if (status < 0) + goto err; + + return; +err: + shutdown(sk, SHUT_RDWR); + exit(1); +} + +static int send_command(int id, enum commands op, int arg) +{ + int sk = processes[id].sks[1], ret, status; + struct command cmd = {op, arg}; + + if (op == TEST_FORK) { + cmd.arg1 = nr_processes; + nr_processes++; + 
if (nr_processes > PR_MAX) + return -1; + cmd.arg2 = arg; + } + + ret = write(sk, &cmd, sizeof(cmd)); + if (ret != sizeof(cmd)) { + pr_perror("Unable to send command"); + goto err; + } + + status = 0; + ret = read(sk, &status, sizeof(status)); + if (ret != sizeof(status) && + !(status == 0 && (op == TEST_DIE || op == TEST_DIE_WAIT))) { + pr_perror("Unable to get answer"); + goto err; + } + + if (status) { + pr_perror("The command(%d, %d) failed", op, arg); + goto err; + } + + return 0; +err: + cleanup(); + exit(1); +} + +int main(int argc, char ** argv) +{ + struct sigaction act; + int pid, i, ret; + int fail_cnt = 0; + + test_init(argc, argv); + + if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) == -1) { + pr_perror("PR_SET_CHILD_SUBREAPER"); + return -1; + } + + ret = sigaction(SIGCHLD, NULL, &act); + if (ret < 0) { + pr_perror("sigaction() failed"); + return -1; + } + + act.sa_flags |= SA_NOCLDSTOP | SA_SIGINFO | SA_RESTART; + act.sa_sigaction = sigchld_handler; + sigemptyset(&act.sa_mask); + sigaddset(&act.sa_mask, SIGCHLD); + + ret = sigaction(SIGCHLD, &act, NULL); + if (ret < 0) { + pr_perror("sigaction() failed"); + return -1; + } + + processes = mmap(NULL, MEM_SIZE, PROT_WRITE | PROT_READ, + MAP_SHARED | MAP_ANONYMOUS, 0, 0); + if (processes == NULL) { + pr_perror("Unable to map share memory"); + return 1; + } + + for (i = 0; i < PR_MAX; i++) { + if (socketpair(PF_UNIX, SOCK_STREAM, 0, processes[i].sks) == -1) { + pr_perror("socketpair"); + return 1; + } + } + + nr_processes++; + pid = make_child(0, 0); + if (pid < 0) + return -1; + + while(nr_processes < PR_MAX) { + int op, id; + int flags = lrand48() % 2; + + op = get_rnd_op(); + if (op == TEST_DIE || op == TEST_DIE_WAIT || op == TEST_SUBREAPER) { + if (nr_processes == 1) + continue; + else + id = lrand48() % (nr_processes - 1) + 1; + } else if (op == TEST_FORK) { + id = nr_processes * 9 / 10 + lrand48() % nr_processes / 10; + while (processes[id].dead != 0) + id--; + } else + id = lrand48() % nr_processes; 
+ + if (processes[id].dead) + continue; + + send_command(id, op, flags); + } + + for (i = 0; i < nr_processes; i++) { + if (processes[i].dead) + continue; + if (processes[i].pid == 0) + continue; + + processes[i].sid = getsid(processes[i].pid); + if (processes[i].sid == -1) { + pr_perror("getsid(%d)", i); + goto err; + } + } + + test_daemon(); + + test_waitsig(); + + for (i = 0; i < nr_processes; i++) { + pid_t sid; + + if (processes[i].dead) + continue; + if (processes[i].pid == 0) + continue; + + sid = getsid(processes[i].pid); + if (sid == -1) { + pr_perror("getsid(%d)", i); + goto err; + } + + if (sid != processes[i].sid) { + fail("%d, %d: wrong sid %d (expected %d)", + i, processes[i].pid, sid, processes[i].sid); + fail_cnt++; + } + } + + if (fail_cnt) + goto err; + + pass(); + + cleanup(); + return 0; +err: + cleanup(); + return 1; +} diff --git a/CRIU_code/test/zdtm/static/session03.desc b/CRIU_code/test/zdtm/static/session03.desc new file mode 100644 index 0000000..95c58b4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/session03.desc @@ -0,0 +1 @@ +{'flags': 'noauto'} diff --git a/CRIU_code/test/zdtm/static/shared_mount_propagation.c b/CRIU_code/test/zdtm/static/shared_mount_propagation.c new file mode 100644 index 0000000..4e81b9e --- /dev/null +++ b/CRIU_code/test/zdtm/static/shared_mount_propagation.c @@ -0,0 +1,119 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check mounts are propagated to shared mounts"; +const char *test_author = "Pavel Tikhomirov "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + char dir_a[PATH_MAX], dir_b[PATH_MAX], dir_c[PATH_MAX]; + char dir_d[PATH_MAX], dir_e[PATH_MAX], dir_f[PATH_MAX]; + char test_file[PATH_MAX]; + char test_bind_file1[PATH_MAX]; + char test_bind_file2[PATH_MAX]; + char test_bind_file3[PATH_MAX]; + int fd; + + test_init(argc, argv); + + mkdir(dirname, 0700); + + if (mount(dirname, dirname, NULL, 
MS_BIND, NULL)) { + pr_perror("Unable to self bind mount %s", dirname); + return 1; + } + + if (mount(NULL, dirname, NULL, MS_SHARED, NULL)) { + pr_perror("Unable to make shared mount %s", dirname); + return 1; + } + + ssprintf(dir_a, "%s/a", dirname); + ssprintf(dir_d, "%s/d", dirname); + ssprintf(dir_e, "%s/e", dirname); + ssprintf(dir_f, "%s/f", dirname); + mkdir(dir_a, 0700); + mkdir(dir_d, 0700); + mkdir(dir_e, 0700); + mkdir(dir_f, 0700); + + ssprintf(dir_b, "%s/b", dir_a); + ssprintf(dir_c, "%s/c", dir_b); + mkdir(dir_b, 0700); + mkdir(dir_c, 0700); + + if (mount(dir_a, dir_d, NULL, MS_BIND, NULL)) { + pr_perror("Unable to bind mount %s to %s", dir_a, dir_d); + return 1; + } + + if (mount(dir_b, dir_e, NULL, MS_BIND, NULL)) { + pr_perror("Unable to bind mount %s to %s", dir_b, dir_e); + return 1; + } + + if (mount(dir_f, dir_c, NULL, MS_BIND, NULL)) { + pr_perror("Unable to bind mount %s to %s", dir_f, dir_c); + return 1; + } + + ssprintf(test_file, "%s/file", dir_f); + fd = open(test_file, O_CREAT | O_WRONLY | O_EXCL, 0600); + if (fd < 0) { + pr_perror("Unable to open %s", test_file); + return 1; + } + close(fd); + + test_daemon(); + test_waitsig(); + + ssprintf(test_bind_file1, "%s/file", dir_c); + ssprintf(test_bind_file2, "%s/b/c/file", dir_d); + ssprintf(test_bind_file3, "%s/c/file", dir_e); + + if (access(test_file, F_OK)) { + pr_perror("%s doesn't exist", test_file); + return 1; + } + + if (access(test_bind_file1, F_OK)) { + pr_perror("%s doesn't exist", test_bind_file1); + return 1; + } + + if (access(test_bind_file2, F_OK)) { + pr_perror("%s doesn't exist", test_bind_file2); + return 1; + } + + if (access(test_bind_file3, F_OK)) { + pr_perror("%s doesn't exist", test_bind_file3); + return 1; + } + + if (umount(dir_c)) { + pr_perror("Unable to umount %s", dir_c); + return 1; + } + + if (umount(dir_e)) { + pr_perror("Unable to umount %s", dir_e); + return 1; + } + + if (umount(dir_d)) { + pr_perror("Unable to umount %s", dir_d); + return 1; + } + + 
pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/shared_mount_propagation.desc b/CRIU_code/test/zdtm/static/shared_mount_propagation.desc new file mode 100644 index 0000000..7657ba4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/shared_mount_propagation.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/shared_slave_mount_children.c b/CRIU_code/test/zdtm/static/shared_slave_mount_children.c new file mode 100644 index 0000000..75c2513 --- /dev/null +++ b/CRIU_code/test/zdtm/static/shared_slave_mount_children.c @@ -0,0 +1,125 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check non-uniform shares restore fine"; +const char *test_author = "Pavel Tikhomirov "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + char share[PATH_MAX], slave1[PATH_MAX], slave2[PATH_MAX]; + char child[PATH_MAX]; + + test_init(argc, argv); + + if (mkdir(dirname, 0700)) { + pr_perror("mkdir"); + return 1; + } + + if (mount("zdtm_fs", dirname, "tmpfs", 0, NULL)) { + pr_perror("mount"); + return 1; + } + + if (mount(NULL, dirname, NULL, MS_PRIVATE, NULL)) { + pr_perror("mount"); + return 1; + } + + snprintf(share, sizeof(share), "%s/share", dirname); + if (mkdir(share, 0700)) { + pr_perror("mkdir"); + return 1; + } + + if (mount("share", share, "tmpfs", 0, NULL)) { + pr_perror("mount"); + return 1; + } + + if (mount(NULL, share, NULL, MS_SHARED, NULL)) { + pr_perror("mount"); + return 1; + } + + snprintf(slave1, sizeof(slave1), "%s/slave1", dirname); + if (mkdir(slave1, 0700)) { + pr_perror("mkdir"); + return 1; + } + + if (mount(share, slave1, NULL, MS_BIND, NULL)) { + pr_perror("mount"); + return 1; + } + + if (mount(NULL, slave1, NULL, MS_SLAVE, NULL)) { + pr_perror("mount"); + return 1; + } + + if (mount(NULL, slave1, NULL, MS_SHARED, NULL)) { + pr_perror("mount"); + return 1; + } + + snprintf(slave2, sizeof(slave2), "%s/slave2", 
dirname); + if (mkdir(slave2, 0700)) { + pr_perror("mkdir"); + return 1; + } + + if (mount(slave1, slave2, NULL, MS_BIND, NULL)) { + pr_perror("mount"); + return 1; + } + + snprintf(child, sizeof(child), "%s/slave1/child", dirname); + if (mkdir(child, 0700)) { + pr_perror("mkdir"); + return 1; + } + + if (mount("child", child, "tmpfs", 0, NULL)) { + pr_perror("mount"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (umount(child)) { + pr_perror("Unable to umount %s", child); + return 1; + } + + if (umount(slave2)) { + pr_perror("Unable to umount %s", slave2); + return 1; + } + + if (umount(slave1)) { + pr_perror("Unable to umount %s", slave1); + return 1; + } + + if (umount(share)) { + pr_perror("Unable to umount %s", share); + return 1; + } + + if (umount(dirname)) { + pr_perror("Unable to umount %s", dirname); + return 1; + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/shared_slave_mount_children.desc b/CRIU_code/test/zdtm/static/shared_slave_mount_children.desc new file mode 100644 index 0000000..7657ba4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/shared_slave_mount_children.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/shm-mp.c b/CRIU_code/test/zdtm/static/shm-mp.c new file mode 100644 index 0000000..084796a --- /dev/null +++ b/CRIU_code/test/zdtm/static/shm-mp.c @@ -0,0 +1,115 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc="Tests mprotected SYSVIPC shmems"; +const char *test_author="Pavel Emelyanov "; + +static sigjmp_buf segv_ret; /* we need sig*jmp stuff, otherwise SIGSEGV will reset our handler */ +static void segfault(int signo) +{ + siglongjmp(segv_ret, 1); +} + +static int check_prot(char *ptr, char val, int prot) +{ + if (signal(SIGSEGV, segfault) == SIG_ERR) { + fail("setting SIGSEGV handler failed: %m\n"); + return -1; + } + + if (!sigsetjmp(segv_ret, 1)) { + if (*ptr != val) { + 
fail("read value doesn't match what I wrote"); + return -1; + } + if (!(prot & PROT_READ)) { + fail("PROT_READ bypassed\n"); + return -1; + } + } else /* we come here on return from SIGSEGV handler */ + if (prot & PROT_READ) { + fail("PROT_READ rejected\n"); + return -1; + } + + if (!sigsetjmp(segv_ret, 1)) { + *ptr = val; + if (!(prot & PROT_WRITE)) { + fail("PROT_WRITE bypassed\n"); + return -1; + } + } else /* we come here on return from SIGSEGV handler */ + if (prot & PROT_WRITE) { + fail("PROT_WRITE rejected\n"); + return -1; + } + + if (signal(SIGSEGV, SIG_DFL) == SIG_ERR) { + fail("restoring SIGSEGV handler failed: %m\n"); + return -1; + } + + return 0; +} +int main(int argc, char **argv) +{ + key_t key; + int id, f = 0; + char *mem; + + test_init(argc, argv); + + key = ftok(argv[0], 812135646); + if (key == -1) { + pr_perror("Can't make key"); + goto out; + } + + id = shmget(key, 2 * PAGE_SIZE, 0777 | IPC_CREAT | IPC_EXCL); + if (id == -1) { + pr_perror("Can't make seg"); + goto out; + } + + mem = shmat(id, NULL, 0); + if (mem == (void *)-1) { + pr_perror("Can't shmat"); + goto out_rm; + } + + mem[0] = 'R'; + mem[PAGE_SIZE] = 'W'; + + if (mprotect(mem, PAGE_SIZE, PROT_READ)) { + pr_perror("Can't mprotect shmem"); + goto out_dt; + } + + test_daemon(); + test_waitsig(); + + if (check_prot(mem, 'R', PROT_READ)) + f++; + if (check_prot(mem + PAGE_SIZE, 'W', PROT_READ | PROT_WRITE)) + f++; + + + if (!f) + pass(); + else + fail("Some checks failed"); + +out_dt: + shmdt(mem); +out_rm: + shmctl(id, IPC_RMID, NULL); +out: + return 0; +} diff --git a/CRIU_code/test/zdtm/static/shm-mp.desc b/CRIU_code/test/zdtm/static/shm-mp.desc new file mode 100644 index 0000000..6c4afe5 --- /dev/null +++ b/CRIU_code/test/zdtm/static/shm-mp.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns'} diff --git a/CRIU_code/test/zdtm/static/shm-unaligned.c b/CRIU_code/test/zdtm/static/shm-unaligned.c new file mode 100644 index 0000000..7e1c916 --- /dev/null +++ 
b/CRIU_code/test/zdtm/static/shm-unaligned.c @@ -0,0 +1 @@ +shm.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/shm-unaligned.desc b/CRIU_code/test/zdtm/static/shm-unaligned.desc new file mode 100644 index 0000000..be44f8a --- /dev/null +++ b/CRIU_code/test/zdtm/static/shm-unaligned.desc @@ -0,0 +1 @@ +shm.desc \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/shm.c b/CRIU_code/test/zdtm/static/shm.c new file mode 100644 index 0000000..a177e35 --- /dev/null +++ b/CRIU_code/test/zdtm/static/shm.c @@ -0,0 +1,197 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc="Tests detached shmems migrate fine"; +const char *test_author="Stanislav Kinsbursky "; + +#define DEF_MEM_SIZE (40960) +unsigned int shmem_size = DEF_MEM_SIZE; +TEST_OPTION(shmem_size, uint, "Size of shared memory segment", 0); + +#define INIT_CRC (~0) + +static int fill_shm_seg(int id, size_t size) +{ + uint8_t *mem; + uint32_t crc = INIT_CRC; + + mem = shmat(id, NULL, 0); + if (mem == (void *)-1) { + pr_perror("Can't attach shm: %d", -errno); + return -1; + } + + datagen(mem, size, &crc); + + if (shmdt(mem) < 0) { + pr_perror("Can't detach shm: %d", -errno); + return -1; + } + return 0; +} + +static int get_shm_seg(int key, size_t size, unsigned int flags) +{ + int id; + + id = shmget(key, size, 0777 | flags); + if (id == -1) { + pr_perror("Can't get shm: %d", -errno); + return -1; + } + return id; +} + +static int prepare_shm(int key, size_t size) +{ + int id; + + id = get_shm_seg(key, shmem_size, IPC_CREAT | IPC_EXCL); + if (id == -1) { + return -1; + } + if (fill_shm_seg(id, shmem_size) < 0) + return -1; + return id; +} + +static int check_shm_id(int id, size_t size) +{ + uint8_t *mem; + uint32_t crc = INIT_CRC; + + mem = shmat(id, NULL, 0); + if (mem == (void *)-1) { + pr_perror("Can't attach shm: %d", -errno); + return -1; + } + crc = INIT_CRC; + if 
(datachk(mem, size, &crc)) { + fail("shmem data are corrupted"); + return -1; + } + if (shmdt(mem) < 0) { + pr_perror("Can't detach shm: %d", -errno); + return -1; + } + return 0; +} + +static int check_shm_key(int key, size_t size) +{ + int id; + + id = get_shm_seg(key, size, 0); + if (id < 0) + return -1; + return check_shm_id(id, size); +} + +int main(int argc, char **argv) +{ + key_t key; + int shm; + int fail_count = 0; + int ret = -1; + + void *mem; + uint32_t crc = INIT_CRC; + + test_init(argc, argv); + +#ifdef ZDTM_SHM_UNALIGNED + key = ftok(argv[0], 822155666); +#else + key = ftok(argv[0], 822155667); +#endif + if (key == -1) { + pr_perror("Can't make key"); + goto out; + } + + shm = prepare_shm(key, shmem_size); + if (shm == -1) { + pr_perror("Can't prepare shm (1)"); + goto out; + } + + mem = shmat(shm, NULL, 0); + if (mem == (void *)-1) { + pr_perror("Can't shmat"); + goto out; + } + + test_daemon(); + test_waitsig(); + + ret = check_shm_id(shm, shmem_size); + if (ret < 0) { + fail("ID check (1) failed\n"); + fail_count++; + goto out_shm; + } + + ret = check_shm_key(key, shmem_size); + if (ret < 0) { + fail("KEY check failed\n"); + fail_count++; + goto out_shm; + } + + if (datachk(mem, shmem_size, &crc)) { + fail("shmem data is corrupted"); + return -1; + } + + if (shmdt(mem) < 0) { + pr_perror("Can't detach shm"); + return -1; + } + + ret = shmctl(shm, IPC_RMID, NULL); + if (ret < 0) { + fail("Failed (1) to destroy segment: %d\n", -errno); + fail_count++; + goto out_shm; + } + /* + * Code below checks that it's still possible to create new IPC SHM + * segments + */ + shm = prepare_shm(key, shmem_size); + if (shm == -1) { + fail("Can't prepare shm (2)"); + fail_count++; + goto out; + } + + ret = check_shm_id(shm, shmem_size); + if (ret < 0) { + fail("ID check (2) failed\n"); + fail_count++; + goto out_shm; + } + +out_shm: + ret = shmctl(shm, IPC_RMID, NULL); + if (ret < 0) { + fail("Failed (2) to destroy segment: %d\n", -errno); + fail_count++; + } + if 
(fail_count == 0) + pass(); +out: + return ret; +} diff --git a/CRIU_code/test/zdtm/static/shm.desc b/CRIU_code/test/zdtm/static/shm.desc new file mode 100644 index 0000000..6c4afe5 --- /dev/null +++ b/CRIU_code/test/zdtm/static/shm.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns'} diff --git a/CRIU_code/test/zdtm/static/sigaltstack.c b/CRIU_code/test/zdtm/static/sigaltstack.c new file mode 100644 index 0000000..d324b0d --- /dev/null +++ b/CRIU_code/test/zdtm/static/sigaltstack.c @@ -0,0 +1,169 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check for alternate signal stack"; +const char *test_author = "Cyrill Gorcunov "; + +static char stack_thread[SIGSTKSZ + TEST_MSG_BUFFER_SIZE] __stack_aligned__; +static char stack_main[SIGSTKSZ + TEST_MSG_BUFFER_SIZE] __stack_aligned__; + +enum { + SAS_MAIN_OLD, + SAS_MAIN_NEW, + SAS_THRD_OLD, + SAS_THRD_NEW, + + SAS_MAX +}; +static stack_t sas_state[SAS_MAX]; + +static task_waiter_t t; + +#define exit_group(code) syscall(__NR_exit_group, code) +#define gettid() syscall(__NR_gettid) + +static int sascmp(stack_t *old, stack_t *new) +{ + return old->ss_size != new->ss_size || + old->ss_sp != new->ss_sp || + old->ss_flags != new->ss_flags; +} + +static void show_ss(char *prefix, stack_t *s) +{ + test_msg("%20s: at %p (size %8zu flags %#2x)\n", + prefix, s->ss_sp, s->ss_size, s->ss_flags); +} + +void thread_sigaction(int signo, siginfo_t *info, void *context) +{ + if (sigaltstack(NULL, &sas_state[SAS_THRD_NEW])) + pr_perror("thread sigaltstack"); + + show_ss("thread in sas", &sas_state[SAS_THRD_NEW]); + + task_waiter_complete(&t, 2); + + test_msg("Waiting in thread SAS\n"); + task_waiter_wait4(&t, 3); + test_msg("Leaving thread SAS\n"); +} + +static void *thread_func(void *arg) +{ + sas_state[SAS_THRD_OLD] = (stack_t) { + .ss_size = sizeof(stack_thread) - 8, + .ss_sp = stack_thread, + .ss_flags = 0, + }; + + struct sigaction sa = { + 
.sa_sigaction = thread_sigaction, + .sa_flags = SA_RESTART | SA_ONSTACK, + }; + + sigemptyset(&sa.sa_mask); + + if (sigaction(SIGUSR2, &sa, NULL)) { + pr_perror("Can't set SIGUSR2 handler"); + exit_group(-1); + } + + task_waiter_wait4(&t, 1); + + if (sigaltstack(&sas_state[SAS_THRD_OLD], NULL)) { + pr_perror("thread sigaltstack"); + exit_group(-1); + } + + syscall(__NR_tkill, gettid(), SIGUSR2); + + return NULL; +} + +void leader_sigaction(int signo, siginfo_t *info, void *context) +{ + if (sigaltstack(NULL, &sas_state[SAS_MAIN_NEW])) + pr_perror("leader sigaltstack"); + + show_ss("leader in sas", &sas_state[SAS_MAIN_NEW]); +} + +int main(int argc, char *argv[]) +{ + pthread_t thread; + + sas_state[SAS_MAIN_OLD] = (stack_t) { + .ss_size = sizeof(stack_main) - 8, + .ss_sp = stack_main, + .ss_flags = 0, + }; + + struct sigaction sa = { + .sa_sigaction = leader_sigaction, + .sa_flags = SA_RESTART | SA_ONSTACK, + }; + + sigemptyset(&sa.sa_mask); + + test_init(argc, argv); + task_waiter_init(&t); + + if (sigaction(SIGUSR1, &sa, NULL)) { + pr_perror("Can't set SIGUSR1 handler"); + exit(-1); + } + + if (pthread_create(&thread, NULL, &thread_func, NULL)) { + pr_perror("Can't create thread"); + exit(-1); + } + + if (sigaltstack(&sas_state[SAS_MAIN_OLD], NULL)) { + pr_perror("sigaltstack"); + exit(-1); + } + + task_waiter_complete(&t, 1); + task_waiter_wait4(&t, 2); + + test_daemon(); + test_waitsig(); + + test_msg("Thread may leave SAS\n"); + task_waiter_complete(&t, 3); + + syscall(__NR_tkill, gettid(), SIGUSR1); + + if (pthread_join(thread, NULL)) { + fail("Error joining thread"); + exit(-1); + } + task_waiter_fini(&t); + + sas_state[SAS_THRD_OLD].ss_flags = SS_ONSTACK; + sas_state[SAS_MAIN_OLD].ss_flags = SS_ONSTACK; + + show_ss("main old", &sas_state[SAS_MAIN_OLD]); + show_ss("main new", &sas_state[SAS_MAIN_NEW]); + show_ss("thrd old", &sas_state[SAS_THRD_OLD]); + show_ss("thrd new", &sas_state[SAS_THRD_NEW]); + + if (sascmp(&sas_state[SAS_MAIN_OLD], 
&sas_state[SAS_MAIN_NEW]) || + sascmp(&sas_state[SAS_THRD_OLD], &sas_state[SAS_THRD_NEW])) { + fail("sas not restored"); + } else + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/signalfd00.c b/CRIU_code/test/zdtm/static/signalfd00.c new file mode 100644 index 0000000..61b1f03 --- /dev/null +++ b/CRIU_code/test/zdtm/static/signalfd00.c @@ -0,0 +1,71 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check for signalfd without signals"; +const char *test_author = "Pavel Emelyanov "; + +int main(int argc, char *argv[]) +{ + int fd, ret; + sigset_t mask; + siginfo_t info; + + test_init(argc, argv); + + sigemptyset(&mask); + sigaddset(&mask, SIGUSR1); + fd = signalfd(-1, &mask, SFD_NONBLOCK); + if (fd < 0) { + fail("Can't create signalfd"); + exit(1); + } + + sigemptyset(&mask); + sigaddset(&mask, SIGUSR1); + sigaddset(&mask, SIGUSR2); + sigprocmask(SIG_BLOCK, &mask, NULL); + + test_daemon(); + test_waitsig(); + + kill(getpid(), SIGUSR2); + + ret = read(fd, &info, sizeof(info)); + if (ret >= 0) { + fail("ghost signal"); + exit(1); + } + + kill(getpid(), SIGUSR1); + + ret = read(fd, &info, sizeof(info)); + if (ret != sizeof(info)) { + fail("no signal"); + exit(1); + } + + if (info.si_signo != SIGUSR1) { + fail("wrong signal"); + exit(1); + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/sigpending.c b/CRIU_code/test/zdtm/static/sigpending.c new file mode 100644 index 0000000..ac2fdcf --- /dev/null +++ b/CRIU_code/test/zdtm/static/sigpending.c @@ -0,0 +1,306 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check pending signals"; +const char *test_author = "Andrew Vagin "; + +static pid_t child; +static int numsig; + +#define TESTSIG (SIGRTMAX) +#define THREADSIG (SIGRTMIN) +static 
siginfo_t share_infos[2]; +static siginfo_t self_infos[64]; /* self */ +static siginfo_t thread_infos[3]; /* thread */ +static int share_nr; +static int self_nr; +static int thread_nr; + +#ifndef offsetof +# define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) +#endif + +/* cr_siginfo is declared to get an offset of _sifields */ +union cr_siginfo { + struct { + int si_signo; + int si_errno; + int si_code; + + union { + int _pad[10]; + /* ... */ + } _sifields; + } _info; + siginfo_t info; +}; +typedef union cr_siginfo cr_siginfo_t; + +#define siginf_body(s) (&((cr_siginfo_t *)(s))->_info._sifields) + +/* + * The kernel puts only relevant union member when signal arrives, + * leaving _si_fields to be filled with junk from stack. Check only + * first 12 bytes: + * // POSIX.1b signals. + * struct + * { + * __pid_t si_pid; // Sending process ID. + * __uid_t si_uid; // Real user ID of sending process. + * sigval_t si_sigval; // Signal value. + * } _rt; + * Look at __copy_siginfo_to_user32() for more information. 
+ */ +# define _si_fields_sz 12 +#define siginfo_filled (offsetof(cr_siginfo_t, _info._sifields) + _si_fields_sz) + +static pthread_mutex_t exit_lock; +static pthread_mutex_t init_lock; + +static void sig_handler(int signal, siginfo_t *info, void *data) +{ + uint32_t crc; + + test_msg("signo=%d si_code=%x\n", signal, info->si_code); + + if (test_go()) { + pr_perror("The signal is received before unlocking"); + return; + } + + switch (signal) { + case SIGCHLD: + if ((info->si_code & CLD_EXITED) && + (info->si_pid == child) && + (info->si_status == 5)) + numsig++; + else { + fail("Wrong siginfo"); + exit(1); + } + return; + } + + if (TESTSIG == signal || THREADSIG == signal) { + siginfo_t *src; + + if (signal == TESTSIG) { + src = &share_infos[share_nr]; + share_nr++; + } else if (getpid() == syscall(SYS_gettid)) { + src = &self_infos[self_nr]; + self_nr++; + } else { + src = &thread_infos[thread_nr]; + thread_nr++; + } + + crc = ~0; + if (datachk((uint8_t *) siginf_body(info), _si_fields_sz, &crc)) { + fail("CRC mismatch\n"); + return; + } + + if (memcmp(info, src, siginfo_filled)) { + fail("Source and received info are differ\n"); + return; + } + + numsig++; + return; + } + + pr_perror("Unexpected signal"); + exit(1); +} + +static int thread_id; + +static void *thread_fn(void *args) +{ + sigset_t blockmask, oldset, newset; + struct sigaction act; + + memset(&oldset, 0, sizeof(oldset)); + memset(&newset, 0, sizeof(oldset)); + + sigfillset(&blockmask); + sigdelset(&blockmask, SIGTERM); + + if (sigprocmask(SIG_BLOCK, &blockmask, NULL) == -1) { + pr_perror("sigprocmask"); + return NULL; + } + + if (sigprocmask(SIG_SETMASK, NULL, &oldset) == -1) { + pr_perror("sigprocmask"); + return NULL; + } + + thread_id = syscall(SYS_gettid); + + act.sa_flags = SA_SIGINFO | SA_RESTART; + act.sa_sigaction = sig_handler; + sigemptyset(&act.sa_mask); + + sigaddset(&act.sa_mask, TESTSIG); + sigaddset(&act.sa_mask, THREADSIG); + if (sigaction(TESTSIG, &act, NULL)) { + 
pr_perror("sigaction() failed"); + return NULL; + } + + pthread_mutex_unlock(&init_lock); + pthread_mutex_lock(&exit_lock); + + if (sigprocmask(SIG_UNBLOCK, &blockmask, &newset) == -1) { + pr_perror("sigprocmask"); + return NULL; + } + + sigdelset(&oldset, SIGTRAP); + sigdelset(&newset, SIGTRAP); + if (memcmp(&newset, &oldset, sizeof(newset))) { + fail("The signal blocking mask was changed"); + numsig = INT_MAX; + } + + return NULL; +} + +static int sent_sigs; + +int send_siginfo(int signo, pid_t pid, pid_t tid, int group, siginfo_t *info) +{ + static int si_code = -10; + uint32_t crc = ~0; + + info->si_code = si_code; + si_code--; + info->si_signo = signo; + datagen((uint8_t *) siginf_body(info), _si_fields_sz, &crc); + + sent_sigs++; + + if (group) + return syscall(SYS_rt_sigqueueinfo, pid, signo, info); + else + return syscall(SYS_rt_tgsigqueueinfo, pid, tid, signo, info); +} + +int main(int argc, char ** argv) +{ + sigset_t blockmask, oldset, newset; + struct sigaction act; + pthread_t pthrd; + siginfo_t infop; + int i; + + memset(&oldset, 0, sizeof(oldset)); + memset(&newset, 0, sizeof(oldset)); + + test_init(argc, argv); + pthread_mutex_init(&exit_lock, NULL); + pthread_mutex_lock(&exit_lock); + pthread_mutex_init(&init_lock, NULL); + pthread_mutex_lock(&init_lock); + + if (pthread_create(&pthrd, NULL, thread_fn, NULL)) { + pr_perror("Can't create a thread"); + return 1; + } + + pthread_mutex_lock(&init_lock); + + sigfillset(&blockmask); + sigdelset(&blockmask, SIGTERM); + + if (sigprocmask(SIG_BLOCK, &blockmask, NULL) == -1) { + pr_perror("sigprocmask"); + return -1; + } + + if (sigprocmask(SIG_BLOCK, NULL, &oldset) == -1) { + pr_perror("sigprocmask"); + return -1; + } + + child = fork(); + if (child == -1) { + pr_perror("fork"); + return -1; + } + + if(child == 0) + return 5; /* SIGCHLD */ + if (waitid(P_PID, child, &infop, WNOWAIT | WEXITED)) { + pr_perror("waitid"); + return 1; + } + + sent_sigs++; + + for (i = 0; i < sizeof(share_infos) / 
sizeof(siginfo_t); i++) { + send_siginfo(TESTSIG, getpid(), -1, 1, share_infos + i); + } + + for (i = 0; i < sizeof(self_infos) / sizeof(siginfo_t); i++) { + send_siginfo(THREADSIG, getpid(), getpid(), 0, self_infos + i); + } + + for (i = 0; i < sizeof(thread_infos) / sizeof(siginfo_t); i++) { + send_siginfo(THREADSIG, getpid(), thread_id, 0, thread_infos + i); + } + + act.sa_flags = SA_SIGINFO | SA_RESTART; + act.sa_sigaction = sig_handler; + sigemptyset(&act.sa_mask); + + if (sigaction(SIGCHLD, &act, NULL)) { + pr_perror("sigaction() failed"); + return -1; + } + + sigaddset(&act.sa_mask, TESTSIG); + sigaddset(&act.sa_mask, THREADSIG); + if (sigaction(TESTSIG, &act, NULL)) { + pr_perror("sigaction() failed"); + return -1; + } + + if (sigaction(THREADSIG, &act, NULL)) { + pr_perror("sigaction() failed"); + return -1; + } + + test_daemon(); + + test_waitsig(); + + if (sigprocmask(SIG_UNBLOCK, &blockmask, &newset) == -1) { + pr_perror("sigprocmask"); + return -1; + } + pthread_mutex_unlock(&exit_lock); + pthread_join(pthrd, NULL); + + sigdelset(&oldset, SIGTRAP); + sigdelset(&newset, SIGTRAP); + if (memcmp(&newset, &oldset, sizeof(newset))) { + fail("The signal blocking mask was changed"); + return 1; + } + + if (numsig == sent_sigs) + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/sit.c b/CRIU_code/test/zdtm/static/sit.c new file mode 100644 index 0000000..07c36ab --- /dev/null +++ b/CRIU_code/test/zdtm/static/sit.c @@ -0,0 +1,58 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "check sit devices"; +const char *test_author = "Pavel Emelyanov "; + +#define IF_NAME "zdtmsit0" +#define LOCAL_ADDR "1.1.1.2" +#define REMOT_ADDR "2.2.2.1" + +int main(int argc, char **argv) +{ + int ret = 1; + + test_init(argc, argv); + + if (system("ip link add " IF_NAME " type sit ttl 13 local " LOCAL_ADDR " remote " REMOT_ADDR)) { + 
pr_perror("Can't make sit device"); + return 1; + } + + if (system("ip -details addr list dev " IF_NAME " > sit.dump.test")) { + fail("can't save net config"); + goto out; + } + + test_daemon(); + test_waitsig(); + + if (system("ip -details addr list dev " IF_NAME " > sit.rst.test")) { + fail("can't get net config"); + goto out; + } + + if (system("diff sit.rst.test sit.dump.test")) { + fail("Net config differs after restore"); + goto out; + } + + pass(); + ret = 0; + +out: + return ret; +} + diff --git a/CRIU_code/test/zdtm/static/sit.desc b/CRIU_code/test/zdtm/static/sit.desc new file mode 100644 index 0000000..01c4149 --- /dev/null +++ b/CRIU_code/test/zdtm/static/sit.desc @@ -0,0 +1,3 @@ +{ 'deps': [ '/bin/sh', '/sbin/ip|/bin/ip', '/usr/bin/diff' ], + 'flags': 'suid', + 'flavor': 'ns uns' } diff --git a/CRIU_code/test/zdtm/static/sk-freebind-false.c b/CRIU_code/test/zdtm/static/sk-freebind-false.c new file mode 100644 index 0000000..de243ce --- /dev/null +++ b/CRIU_code/test/zdtm/static/sk-freebind-false.c @@ -0,0 +1 @@ +sk-freebind.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/sk-freebind.c b/CRIU_code/test/zdtm/static/sk-freebind.c new file mode 100644 index 0000000..de30329 --- /dev/null +++ b/CRIU_code/test/zdtm/static/sk-freebind.c @@ -0,0 +1,77 @@ +#include +#include +#include +#include +#include /* for sockaddr_in and inet_ntoa() */ + +#include "zdtmtst.h" + +const char *test_doc = "Check that IP_FREEBIND is restored"; +const char *test_author = "Andrew Vagin "; + +union sockaddr_inet { + struct sockaddr_in v4; + struct sockaddr_in6 v6; +}; + +#ifdef ZDTM_FREEBIND_FALSE +static const int fb_keep = 0; +static const int port = 56789; +#else +static const int fb_keep = 1; +static const int port = 56787; +#endif + +int main(int argc, char **argv) +{ + union sockaddr_inet addr; + socklen_t len; + int val, sock; + + test_init(argc, argv); + + addr.v6.sin6_family = AF_INET6; + inet_pton(AF_INET6, "2001:db8::ff00:42:8329", 
&(addr.v6.sin6_addr)); + addr.v6.sin6_port = htons(port); + + sock = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP); + if (sock == -1) { + pr_perror("socket() failed"); + return -1; + } + val = 1; + if (setsockopt(sock, SOL_IP, IP_FREEBIND, &val, sizeof(int)) == -1 ) { + pr_perror("setsockopt() error"); + return -1; + } + if (bind(sock, (struct sockaddr *) &addr, sizeof(addr))) { + pr_perror("bind()"); + return -1; + } + + if (!fb_keep) { + val = 0; + if (setsockopt(sock, SOL_IP, IP_FREEBIND, &val, sizeof(int)) == -1 ) { + pr_perror("setsockopt() error"); + return -1; + } + } + + test_daemon(); + test_waitsig(); + + len = sizeof(int); + if (getsockopt(sock, SOL_IP, IP_FREEBIND, &val, &len) == -1 ) { + pr_perror("setsockopt() error"); + return -1; + } + + if (val != fb_keep) { + fail("Unexpected value: %d", val); + return -1; + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/sk-netlink.c b/CRIU_code/test/zdtm/static/sk-netlink.c new file mode 100644 index 0000000..7f06f02 --- /dev/null +++ b/CRIU_code/test/zdtm/static/sk-netlink.c @@ -0,0 +1,160 @@ +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +#ifndef SOL_NETLINK +#define SOL_NETLINK 270 +#endif + +#define UDEV_MONITOR_TEST 32 + +const char *test_doc = "Support of netlink sockets"; +const char *test_author = "Andrew Vagin "; + +int main(int argc, char ** argv) +{ + int ssk, bsk, csk, dsk; + struct sockaddr_nl addr; + struct msghdr msg; + struct { + struct nlmsghdr hdr; + } req; + struct iovec iov; + char buf[4096]; + + test_init(argc, argv); + + ssk = socket(PF_NETLINK, SOCK_RAW, NETLINK_KOBJECT_UEVENT); + if (ssk < 0) { + pr_perror("Can't create sock diag socket"); + return -1; + } + bsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_KOBJECT_UEVENT); + if (bsk < 0) { + pr_perror("Can't create sock diag socket"); + return -1; + } +#if 0 + int on, bbsk; + + bbsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_KOBJECT_UEVENT); + if (bbsk < 0) { + pr_perror("Can't create sock diag 
socket"); + return -1; + } + + on = UDEV_MONITOR_TEST; + setsockopt(bbsk, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &on, sizeof(on)); +#endif + csk = socket(PF_NETLINK, SOCK_RAW, NETLINK_KOBJECT_UEVENT); + if (csk < 0) { + pr_perror("Can't create sock diag socket"); + return -1; + } + dsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_KOBJECT_UEVENT); + if (dsk < 0) { + pr_perror("Can't create sock diag socket"); + return -1; + } + + addr.nl_family = AF_NETLINK; + addr.nl_groups = 0; + addr.nl_pid = getpid(); + if (bind(ssk, (struct sockaddr *) &addr, sizeof(struct sockaddr_nl))) { + pr_perror("bind"); + return 1; + } + + addr.nl_groups = 1U << (UDEV_MONITOR_TEST - 1); /* unsigned: bit 31 is not representable in a signed int */ + addr.nl_pid = 0; + if (bind(bsk, (struct sockaddr *) &addr, sizeof(struct sockaddr_nl))) { + pr_perror("bind"); + return 1; + } + + addr.nl_pid = getpid(); + addr.nl_groups = 1U << (UDEV_MONITOR_TEST - 1); /* same group mask as the bsk listener above */ + if (connect(csk, (struct sockaddr *) &addr, sizeof(struct sockaddr_nl))) { + pr_perror("connect"); + return 1; + } + + test_daemon(); + + test_waitsig(); + + req.hdr.nlmsg_len = sizeof(req); + req.hdr.nlmsg_type = 0x1234; + req.hdr.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST; + req.hdr.nlmsg_seq = 0xabcd; + + memset(&msg, 0, sizeof(msg)); + msg.msg_namelen = 0; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + iov.iov_base = (void *) &req; + iov.iov_len = sizeof(req); + + if (sendmsg(csk, &msg, 0) < 0) { + pr_perror("Can't send request message"); + return 1; + } + + memset(&msg, 0, sizeof(msg)); + msg.msg_namelen = 0; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + iov.iov_base = buf; + iov.iov_len = sizeof(buf); + + if (recvmsg(ssk, &msg, 0) < 0) { + pr_perror("Can't recv request message"); + return 1; + } + + if (recvmsg(bsk, &msg, 0) < 0) { + pr_perror("Can't recv request message"); + return 1; + } + + addr.nl_family = AF_NETLINK; + addr.nl_groups = 0; + addr.nl_pid = getpid(); + + memset(&msg, 0, sizeof(msg)); + msg.msg_namelen = sizeof(addr); + msg.msg_name = &addr; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; 
+ + iov.iov_base = (void *) &req; + iov.iov_len = sizeof(req); + + if (sendmsg(dsk, &msg, 0) < 0) { + pr_perror("Can't send request message"); + return 1; + } + + memset(&msg, 0, sizeof(msg)); + msg.msg_namelen = 0; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + iov.iov_base = buf; + iov.iov_len = sizeof(buf); + + if (recvmsg(ssk, &msg, 0) < 0) { + pr_perror("Can't recv request message"); + return 1; + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/sk-netlink.desc b/CRIU_code/test/zdtm/static/sk-netlink.desc new file mode 100644 index 0000000..2eac7e6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/sk-netlink.desc @@ -0,0 +1 @@ +{'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/sk-unix-mntns.c b/CRIU_code/test/zdtm/static/sk-unix-mntns.c new file mode 100644 index 0000000..58f6379 --- /dev/null +++ b/CRIU_code/test/zdtm/static/sk-unix-mntns.c @@ -0,0 +1,162 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test that unix sockets are restored in proper mount namespaces\n"; +const char *test_author = "Andrei Vagin "; + +char *dirname; +TEST_OPTION(dirname, string, "socket file name", 1); + +#define TEST_MODE 0640 + +int main(int argc, char *argv[]) +{ + struct sockaddr_un addr; + unsigned int addrlen; + int sk, csk; + pid_t pid; + char path[PATH_MAX]; + char sbuf[256], rbuf[256]; + char *cwd; + int ret, status, i; + task_waiter_t t; + + test_init(argc, argv); + + task_waiter_init(&t); + cwd = get_current_dir_name(); + if (!cwd) { + fail("getcwd\n"); + exit(1); + } + + mkdir(dirname, 0777); + + pid = fork(); + if (pid < 0) { + pr_perror("fork"); + return 1; + } + + if (pid == 0) { + if (unshare(CLONE_NEWNS)) { + pr_perror("unshare"); + return 1; + } + if (mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL)) { + pr_perror("mount"); + return 1; + } + if (mount("test", dirname, "tmpfs", 0, NULL)) { + 
pr_perror("mount"); + return 1; + } + } + + addrlen = snprintf(path, sizeof(path), "%s/%s/%s", cwd, dirname, "test.socket"); + unlink(path); + + addr.sun_family = AF_UNIX; + if (addrlen > sizeof(addr.sun_path)) + return 1; + memcpy(addr.sun_path, path, addrlen); + addrlen += sizeof(addr.sun_family); + + sk = socket(AF_UNIX, SOCK_DGRAM | SOCK_NONBLOCK, 0); + if (sk < 0) { + pr_perror("socket\n"); + exit(1); + } + csk = socket(AF_UNIX, SOCK_DGRAM | SOCK_NONBLOCK, 0); + if (csk < 0) { + pr_perror("socket\n"); + exit(1); + } + + ret = bind(sk, (struct sockaddr *) &addr, addrlen); + if (ret) { + fail("bind\n"); + exit(1); + } + + if (connect(csk, (struct sockaddr *) &addr, addrlen)) { + fail("connect\n"); + exit(1); + } + + if (pid) { + task_waiter_wait4(&t, pid); + test_daemon(); + } else { + task_waiter_complete(&t, getpid()); + } + + test_waitsig(); + + if (pid) + kill(pid, SIGTERM); + + for (i = 0; i < 2; i++) { + int len; + + memset(sbuf, 0, sizeof(sbuf)); + len = ssprintf(sbuf, "%d-%d test test test", getpid(), i); + if (write(csk, sbuf, len) != len) { + pr_perror("write"); + return 1; + } + memset(rbuf, 0, sizeof(rbuf)); + if (read(sk, &rbuf, sizeof(rbuf)) != len) { + pr_perror("read"); + return 1; + } + + if (strncmp(rbuf, sbuf, len)) { + fail("data corrupted\n"); + exit(1); + } + + close(csk); + csk = socket(AF_UNIX, SOCK_DGRAM | SOCK_NONBLOCK, 0); + if (csk < 0) { + pr_perror("socket\n"); + exit(1); + } + if (connect(csk, (struct sockaddr *) &addr, addrlen)) { + pr_perror("connect"); + exit(1); + } + } + close(sk); + close(csk); + unlink(path); + + if (pid) { + if (waitpid(pid, &status, 0) != pid) { + pr_perror("waitpid"); + return 1; + } + + if (status) { + fail("A child process returned %d", status); + return 1; + } + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/sk-unix-mntns.desc b/CRIU_code/test/zdtm/static/sk-unix-mntns.desc new file mode 100644 index 0000000..71e5eae --- /dev/null +++ 
b/CRIU_code/test/zdtm/static/sk-unix-mntns.desc @@ -0,0 +1,5 @@ +{ + 'feature': 'mnt_id sk_unix_file', + 'flags': 'suid', + 'flavor': 'ns uns', +} diff --git a/CRIU_code/test/zdtm/static/sk-unix-rel.c b/CRIU_code/test/zdtm/static/sk-unix-rel.c new file mode 100644 index 0000000..5b3e852 --- /dev/null +++ b/CRIU_code/test/zdtm/static/sk-unix-rel.c @@ -0,0 +1,111 @@ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test unix stream sockets with relative name\n"; +const char *test_author = "Cyrill Gorcunov sizeof(addr.sun_path)) + return 1; + memcpy(addr.sun_path, filename, addrlen); + addrlen += sizeof(addr.sun_family); + + sock[0] = socket(AF_UNIX, SOCK_STREAM, 0); + sock[1] = socket(AF_UNIX, SOCK_STREAM, 0); + if (sock[0] < 0 || sock[1] < 0) { + fail("socket\n"); + exit(1); + } + + if (setsockopt(sock[0], SOL_SOCKET, SO_REUSEADDR, &(int){ 1 }, sizeof(int)) < 0 || + setsockopt(sock[1], SOL_SOCKET, SO_REUSEADDR, &(int){ 1 }, sizeof(int)) < 0) { + fail("setsockopt\n"); + exit(1); + } + + ret = bind(sock[0], (struct sockaddr *) &addr, addrlen); + if (ret) { + fail("bind\n"); + exit(1); + } + + ret = listen(sock[0], 16); + if (ret) { + fail("bind\n"); + exit(1); + } + + test_daemon(); + test_waitsig(); + + if (connect(sock[1], (struct sockaddr *) &addr, addrlen)) { + fail("connect\n"); + exit(1); + } + + ret = accept(sock[0], NULL, NULL); + if (ret < 0) { + fail("accept"); + exit(1); + } + + memset(buf, 0, sizeof(buf)); + write(sock[1], SK_DATA, sizeof(SK_DATA)); + read(ret, &buf, sizeof(buf)); + + if (strcmp(buf, SK_DATA)) { + fail("data corrupted\n"); + exit(1); + } + test_msg("stream : '%s'\n", buf); + close(sock[0]); + close(sock[1]); + unlink(path); + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/sk-unix-unconn.c b/CRIU_code/test/zdtm/static/sk-unix-unconn.c new file mode 100644 index 0000000..72d1348 --- /dev/null 
+++ b/CRIU_code/test/zdtm/static/sk-unix-unconn.c @@ -0,0 +1,69 @@ +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check unconnected unix sockets"; +const char *test_author = "Vagin Andrew "; + +int main(int argc, char ** argv) +{ + int sk, skc; + int ret, len; + char path[PATH_MAX]; + struct sockaddr_un addr; + socklen_t addrlen; + + test_init(argc, argv); + + sk = socket(AF_UNIX, SOCK_STREAM, 0); + if (sk == -1) { + pr_perror("socket"); + return 1; + } + + skc = socket(AF_UNIX, SOCK_STREAM, 0); + if (skc == -1) { + pr_perror("socket"); + return 1; + } + + len = snprintf(path, sizeof(path), "X/zdtm-%s-%d/X", argv[0], getpid()); + + if (len >= sizeof(addr.sun_path)) { + pr_err("%s\n", path); + return 1; + } + addr.sun_family = AF_UNIX; + memcpy(addr.sun_path, path, len); + addrlen = sizeof(addr.sun_family) + len; + addr.sun_path[0] = 0; + addr.sun_path[len - 1] = 0; + + ret = bind(sk, (struct sockaddr *) &addr, addrlen); + if (ret) { + fail("bind\n"); + return 1; + } + + test_daemon(); + + test_waitsig(); + + if (listen(sk, 1) == -1) { + pr_perror("listen"); + return 1; + } + + if (connect(skc, (struct sockaddr *) &addr, addrlen) == -1) { + fail("Unable to connect"); + return 1; + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/sk-unix01.c b/CRIU_code/test/zdtm/static/sk-unix01.c new file mode 100644 index 0000000..2bceef7 --- /dev/null +++ b/CRIU_code/test/zdtm/static/sk-unix01.c @@ -0,0 +1,375 @@ +#ifndef _GNU_SOURCE +# define _GNU_SOURCE +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that deleted unix sockets with dirs are restored correctly"; +const char *test_author = "Cyrill Gorcunov "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +static int fill_sock_name(struct sockaddr_un *name, const char *filename) +{ + char 
*cwd; + + cwd = get_current_dir_name(); + if (strlen(filename) + strlen(cwd) + 1 >= sizeof(name->sun_path)) { + pr_err("Name %s/%s is too long for socket\n", + cwd, filename); + return -1; + } + + name->sun_family = AF_LOCAL; + ssprintf(name->sun_path, "%s/%s", cwd, filename); + return 0; +} + +static int sk_alloc_bind(int type, struct sockaddr_un *addr) +{ + int sk; + + sk = socket(PF_UNIX, type, 0); + if (sk < 0) { + pr_perror("socket"); + return -1; + } + + if (addr && bind(sk, (const struct sockaddr *)addr, sizeof(*addr))) { + pr_perror("bind %s", addr->sun_path); + close(sk); + return -1; + } + + return sk; +} + +static int sk_alloc_connect(int type, struct sockaddr_un *addr) +{ + int sk; + + sk = socket(PF_UNIX, type, 0); + if (sk < 0) { + pr_perror("socket"); + return -1; + } + + if (connect(sk, (const struct sockaddr *)addr, sizeof(*addr))) { + pr_perror("connect %s", addr->sun_path); + close(sk); + return -1; + } + + return sk; +} + +#define write_int(__f, __p) \ + ({ \ + ssize_t __r = write(__f, __p, sizeof(*(__p))); \ + (__r == sizeof(*(__p))) ? 0 : -1; \ + }) + +#define read_int(__f, __p) \ + ({ \ + ssize_t __r = read(__f, __p, sizeof(*(__p))); \ + (__r == sizeof(*(__p))) ? 0 : -1; \ + }) + +int main(int argc, char **argv) +{ + int c1 = 1, c2 = 0, c3 = 3, c4 = 0; + int c5 = 5, c6 = 0, c7 = 7, c8 = 0; + int c9 = 8, c10 = 0; + char filename[PATH_MAX]; + char subdir_dg[PATH_MAX]; + char subdir_st[PATH_MAX]; + struct sockaddr_un addr_from; + struct sockaddr_un addr; + int sk_dgram_pair[2]; + int sk_dgram[9]; + socklen_t len; + int sk_st[5]; + DIR *dir; + + test_init(argc, argv); + + /* + * All sockets are under dir to not clash + * with other tests. + */ + if (mkdir(dirname, 0755) < 0) { + if (errno != EEXIST) { + pr_perror("Can't create %s", dirname); + return 1; + } + } + + /* + * Subdir for dgram sockets. 
+ */ + ssprintf(subdir_dg, "%s/%s", dirname, "dg"); + if (mkdir(subdir_dg, 0755) < 0) { + if (errno != EEXIST) { + pr_perror("Can't create %s", subdir_dg); + return 1; + } + } + + /* + * Subdir for stream sockets. + */ + ssprintf(subdir_st, "%s/%s", dirname, "st"); + if (mkdir(subdir_st, 0755) < 0) { + if (errno != EEXIST) { + pr_perror("Can't create %s", subdir_st); + return 1; + } + } + + /* + * DGRAM sockets + * + * - create 2 sockets + * - bind first to subdired + * - connect second to it + * - delete socket on fs + * - do the same for second pair with same name + * - delete socket on fs + * - create 3 more sockets + * - bind /connect to same name, where one is deleted + */ + + ssprintf(filename, "%s/%s", subdir_dg, "sk-dt"); + if (fill_sock_name(&addr, filename) < 0) { + pr_err("%s is too long for socket\n", filename); + return 1; + } + unlink(addr.sun_path); + + sk_dgram[0] = sk_alloc_bind(SOCK_DGRAM, &addr); + if (sk_dgram[0] < 0) + return 1; + test_msg("sk-dt: alloc/bind %d %s\n", sk_dgram[0], addr.sun_path); + + sk_dgram[1] = sk_alloc_connect(SOCK_DGRAM, &addr); + if (sk_dgram[1] < 0) + return 1; + unlink(addr.sun_path); + test_msg("sk-dt: alloc/connect/unlink %d %s\n", sk_dgram[1], addr.sun_path); + + sk_dgram[2] = sk_alloc_bind(SOCK_DGRAM, &addr); + if (sk_dgram[2] < 0) + return 1; + test_msg("sk-dt: alloc/bind %d %s\n", sk_dgram[2], addr.sun_path); + + sk_dgram[3] = sk_alloc_connect(SOCK_DGRAM, &addr); + if (sk_dgram[3] < 0) + return 1; + unlink(addr.sun_path); + test_msg("sk-dt: alloc/connect/unlink %d %s\n", sk_dgram[3], addr.sun_path); + + ssprintf(filename, "%s/%s", dirname, "sole"); + if (fill_sock_name(&addr, filename) < 0) { + pr_err("%s is too long for socket\n", filename); + return 1; + } + unlink(addr.sun_path); + + sk_dgram[4] = sk_alloc_bind(SOCK_DGRAM, &addr); + if (sk_dgram[4] < 0) + return 1; + test_msg("sk-dt: alloc/bind %d %s\n", sk_dgram[4], addr.sun_path); + + sk_dgram[5] = sk_alloc_connect(SOCK_DGRAM, &addr); + if (sk_dgram[5] < 0) + 
return 1; + unlink(addr.sun_path); + test_msg("sk-dt: alloc/connect/unlink %d %s\n", sk_dgram[5], addr.sun_path); + + sk_dgram[6] = sk_alloc_bind(SOCK_DGRAM, &addr); + if (sk_dgram[6] < 0) + return 1; + test_msg("sk-dt: alloc/bind %d %s\n", sk_dgram[6], addr.sun_path); + + sk_dgram[7] = sk_alloc_connect(SOCK_DGRAM, &addr); + if (sk_dgram[7] < 0) + return 1; + unlink(addr.sun_path); + test_msg("sk-dt: alloc/connect/unlink %d %s\n", sk_dgram[7], addr.sun_path); + + sk_dgram[8] = sk_alloc_bind(SOCK_DGRAM, &addr); + if (sk_dgram[8] < 0) + return 1; + test_msg("sk-dt: alloc/bind %d %s\n", sk_dgram[8], addr.sun_path); + + if (dup2(sk_dgram[4], 110) < 0 || dup2(sk_dgram[6], 100) < 0) { + pr_perror("Can't move socket"); + return 1; + } + close(sk_dgram[4]); + sk_dgram[4] = 110; + close(sk_dgram[6]); + sk_dgram[6] = 100; + + /* + * DGRAM paired sockets. Just bind both to the same name. + */ + if (socketpair(PF_UNIX, SOCK_DGRAM, 0, sk_dgram_pair)) { + pr_perror("Can't create dgram pair"); + return 1; + } + test_msg("sk-dgp: sockpair %d %d\n", + sk_dgram_pair[0], sk_dgram_pair[1]); + ssprintf(filename, "%s/%s", subdir_dg, "sk-dtp"); + + if (fill_sock_name(&addr, filename) < 0) { + pr_err("%s is too long for socket\n", filename); + return 1; + } + + if (bind(sk_dgram_pair[0], (const struct sockaddr *)&addr, sizeof(addr))) { + pr_perror("bind %d to %s", sk_dgram_pair[0], addr.sun_path); + return -1; + } + unlink(addr.sun_path); + if (bind(sk_dgram_pair[1], (const struct sockaddr *)&addr, sizeof(addr))) { + pr_perror("bind %d to %s", sk_dgram_pair[1], addr.sun_path); + return -1; + } + unlink(addr.sun_path); + + /* + * Drop subdirectory. 
+ */ + rmdir(subdir_dg); + + /* + * STREAM sockets + * + * - create server, bind to subdired + * - create client + * - connect to server + * - delete socket on fs + * - bind again to subdired + * - connect to server + * - delete socket on fs + */ + ssprintf(filename, "%s/%s", subdir_st, "sk-st"); + if (fill_sock_name(&addr, filename) < 0) { + pr_err("%s is too long for socket\n", filename); + return 1; + } + unlink(addr.sun_path); + + sk_st[0] = sk_alloc_bind(SOCK_STREAM, &addr); + if (sk_st[0] < 0) + return 1; + test_msg("sk-st: alloc/bind/listen %d\n", sk_st[0]); + + if (listen(sk_st[0], 16)) { + pr_perror("Can't listen on socket"); + return 1; + } + + sk_st[1] = sk_alloc_connect(SOCK_STREAM, &addr); + if (sk_st[1] < 0) + return 1; + test_msg("sk-st: alloc/connect %d\n", sk_st[1]); + + len = sizeof(addr_from); + sk_st[2] = accept(sk_st[0], (struct sockaddr *)&addr_from, &len); + if (sk_st[2] < 0) { + pr_perror("Can't accept on socket"); + return 1; + } + test_msg("sk-st: accept %d\n", sk_st[2]); + + sk_st[3] = sk_alloc_connect(SOCK_STREAM, &addr); + if (sk_st[3] < 0) + return 1; + test_msg("sk-st: alloc/connect %d\n", sk_st[3]); + + len = sizeof(addr_from); + sk_st[4] = accept(sk_st[0], (struct sockaddr *)&addr_from, &len); + if (sk_st[4] < 0) { + pr_perror("Can't accept on socket"); + return 1; + } + test_msg("sk-st: accept %d\n", sk_st[4]); + + unlink(addr.sun_path); + + /* + * Drop subdirectory. 
+ */ + rmdir(subdir_st); + + test_daemon(); + test_waitsig(); + + if (write_int(sk_dgram[1], &c1) || read_int(sk_dgram[0], &c2) || + write_int(sk_dgram[3], &c3) || read_int(sk_dgram[2], &c4)) { + fail("Unable to send/receive a message on dgram"); + return 1; + } + + if (c1 != c2 || c3 != c4) { + fail("Vals mismatch on dgram: c1 %d c2 %d c3 %d c4 %d", + c1, c2, c3, c4); + return 1; + } + + if (write_int(sk_dgram_pair[1], &c9) || + read_int(sk_dgram_pair[0], &c10)) { + fail("Unable to send/receive a message on paired dgram"); + return 1; + } + + if (c9 != c10) { + fail("Vals mismatch on dgram: c9 %d c10 %d", + c9, c10); + return 1; + } + + if (write_int(sk_st[2], &c5) || read_int(sk_st[1], &c6) || + write_int(sk_st[4], &c7) || read_int(sk_st[3], &c8)) { + fail("Unable to send/receive a message on stream"); + return 1; + } + + if (c5 != c6 || c7 != c8) { + fail("Vals mismatch on stream: c5 %d c6 %d c7 %d c8 %d", + c5, c6, c7, c8); + return 1; + } + + dir = opendir(subdir_dg); + if (dir != NULL || errno != ENOENT) { + fail("Directory %s is not deteled", subdir_dg); + return 1; + } + + dir = opendir(subdir_st); + if (dir != NULL || errno != ENOENT) { + fail("Directory %s is not deteled", subdir_st); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/sk-unix01.desc b/CRIU_code/test/zdtm/static/sk-unix01.desc new file mode 100644 index 0000000..2651c4d --- /dev/null +++ b/CRIU_code/test/zdtm/static/sk-unix01.desc @@ -0,0 +1 @@ +{'flavor': 'h ns uns', 'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/skip-me.c b/CRIU_code/test/zdtm/static/skip-me.c new file mode 100644 index 0000000..9a55276 --- /dev/null +++ b/CRIU_code/test/zdtm/static/skip-me.c @@ -0,0 +1,12 @@ +int main(int argc, char ** argv) +{ + test_init(argc, argv); + + test_msg("Skipping test:" TEST_SKIP_REASON); + + test_daemon(); + test_waitsig(); + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/sleeping00.c b/CRIU_code/test/zdtm/static/sleeping00.c new 
file mode 100644 index 0000000..f59ffaf --- /dev/null +++ b/CRIU_code/test/zdtm/static/sleeping00.c @@ -0,0 +1,18 @@ +#include + +#include "zdtmtst.h" + +const char *test_doc = "Suspend while migrating"; +const char *test_author = "Roman Kagan "; + +int main(int argc, char ** argv) +{ + test_init(argc, argv); + + test_daemon(); + test_waitsig(); + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/sock_filter.c b/CRIU_code/test/zdtm/static/sock_filter.c new file mode 100644 index 0000000..e2475cd --- /dev/null +++ b/CRIU_code/test/zdtm/static/sock_filter.c @@ -0,0 +1,115 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check socket filter"; +const char *test_author = "Pavel Emelyanov "; + +#ifndef SO_GET_FILTER +#define SO_GET_FILTER SO_ATTACH_FILTER +#endif + +#define SFLEN 14 + +int main(int argc, char **argv) +{ + int sk; + struct sock_fprog p; + struct sock_filter f[SFLEN] = { + { 0x28, 0, 0, 0x0000000c }, + { 0x15, 0, 4, 0x00000800 }, + { 0x20, 0, 0, 0x0000001a }, + { 0x15, 8, 0, 0x7f000001 }, + { 0x20, 0, 0, 0x0000001e }, + { 0x15, 6, 7, 0x7f000001 }, + { 0x15, 1, 0, 0x00000806 }, + { 0x15, 0, 5, 0x00008035 }, + { 0x20, 0, 0, 0x0000001c }, + { 0x15, 2, 0, 0x7f000001 }, + { 0x20, 0, 0, 0x00000026 }, + { 0x15, 0, 1, 0x7f000001 }, + { 0x6, 0, 0, 0x0000ffff }, + { 0x6, 0, 0, 0x00000000 }, + }; + struct sock_filter f2[SFLEN], f3[SFLEN]; + socklen_t len; + + test_init(argc, argv); + + sk = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); + if (sk < 0) { + pr_perror("No socket"); + return 1; + } + + p.len = SFLEN; + p.filter = f; + + if (setsockopt(sk, SOL_SOCKET, SO_ATTACH_FILTER, &p, sizeof(p))) { + pr_perror("No filter"); + return 1; + } + + len = 0; + if (getsockopt(sk, SOL_SOCKET, SO_GET_FILTER, NULL, &len)) { + pr_perror("No len"); + return 1; + } + + if (len != SFLEN) { + pr_perror("Len mismatch"); + return 1; + } + + memset(f2, 0, sizeof(f2)); + if (getsockopt(sk, SOL_SOCKET, 
SO_GET_FILTER, f2, &len)) { + perror("No filter"); + return 1; + } + + if (len != SFLEN) { + pr_perror("Len mismatch2"); + return 1; + } + + test_daemon(); + test_waitsig(); + + len = 0; + if (getsockopt(sk, SOL_SOCKET, SO_GET_FILTER, NULL, &len)) { + fail("No len"); + return 1; + } + + if (len != SFLEN) { + fail("Len mismatch"); + return 1; + } + + memset(f3, 0, sizeof(f3)); + if (getsockopt(sk, SOL_SOCKET, SO_GET_FILTER, f3, &len)) { + fail("No filter"); + return 1; + } + + if (len != SFLEN) { + fail("Len mismatch2"); + return 1; + } + + if (memcmp(f2, f3, sizeof(f2))) { + fail("Filters mismatch"); + return 1; + } + + pass(); + + return 0; +} + diff --git a/CRIU_code/test/zdtm/static/sock_opts00.c b/CRIU_code/test/zdtm/static/sock_opts00.c new file mode 100644 index 0000000..08fc1d3 --- /dev/null +++ b/CRIU_code/test/zdtm/static/sock_opts00.c @@ -0,0 +1,91 @@ +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check various socket options to work"; +const char *test_author = "Pavel Emelyanov "; + +#define TEST_PORT 59687 +#define TEST_ADDR INADDR_ANY + +#define NOPTS 7 + +int main(int argc, char ** argv) +{ + int sock, ret = 0, vname[NOPTS], val[NOPTS], rval, i; + socklen_t len = sizeof(int); + + vname[0] = SO_PRIORITY; + vname[1] = SO_RCVLOWAT; + vname[2] = SO_MARK; + vname[3] = SO_PASSCRED; + vname[4] = SO_PASSSEC; + vname[5] = SO_DONTROUTE; + vname[6] = SO_NO_CHECK; + + test_init(argc, argv); + + sock = socket(PF_INET, SOCK_STREAM, 0); + if (sock < 0) { + pr_perror("can't create socket"); + return 1; + } + + for (i = 0; i < NOPTS; i++) { + ret = getsockopt(sock, SOL_SOCKET, vname[i], &val[i], &len); + if (ret) { + pr_perror("can't get option %d", i); + return 1; + } + + val[i]++; + + ret = setsockopt(sock, SOL_SOCKET, vname[i], &val[i], len); + if (ret) { + pr_perror("can't set option %d", i); + return 1; + } + + ret = getsockopt(sock, SOL_SOCKET, vname[i], &rval, &len); + if (ret) { + pr_perror("can't get 
option %d 2", i); + return 1; + } + + if (rval != val[i]) { + if (rval + 1 == val[i]) { + pr_perror("can't reset option %d want %d have %d", i, + val[i], rval); + return 1; + } + + /* kernel tuned things up on set */ + val[i] = rval; + } + } + + test_daemon(); + test_waitsig(); + + for (i = 0; i < NOPTS; i++) { + ret = getsockopt(sock, SOL_SOCKET, vname[i], &rval, &len); + if (ret) { + pr_perror("can't get option %d again", i); + return 1; + } + + if (val[i] != rval) { + fail("option %d changed", i); + return 1; + } + } + + pass(); + close(sock); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/sock_opts00.desc b/CRIU_code/test/zdtm/static/sock_opts00.desc new file mode 100644 index 0000000..2eac7e6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/sock_opts00.desc @@ -0,0 +1 @@ +{'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/sock_opts01.c b/CRIU_code/test/zdtm/static/sock_opts01.c new file mode 100644 index 0000000..7dfce22 --- /dev/null +++ b/CRIU_code/test/zdtm/static/sock_opts01.c @@ -0,0 +1,61 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that SO_BINDTODEVICE option works"; +const char *test_author = "Pavel Emelyanov "; + +int main(int argc, char ** argv) +{ + int sock, ret; + char dev[IFNAMSIZ], dev2[IFNAMSIZ]; + socklen_t len, len2; + + test_init(argc, argv); + + sock = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sock < 0) { + pr_perror("can't create socket"); + return 1; + } + + ret = setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE, "eth0", 5); + if (ret < 0) + ret = setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE, "lo", 3); + if (ret < 0) { + pr_perror("can't bind to eth0"); + return 1; + } + + len = sizeof(dev); + ret = getsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE, &dev, &len); + if (ret < 0) { + pr_perror("can't get dev binding"); + return 1; + } + + test_daemon(); + test_waitsig(); + + len2 = sizeof(dev); + ret = getsockopt(sock, SOL_SOCKET, 
SO_BINDTODEVICE, &dev2, &len2); + if (ret < 0) { + fail("can't get dev binding2"); + return 1; + } + + if ((len != len2) || strncmp(dev, dev2, len)) + fail("wrong bound device"); + else + pass(); + + close(sock); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/sock_opts01.desc b/CRIU_code/test/zdtm/static/sock_opts01.desc new file mode 100644 index 0000000..2eac7e6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/sock_opts01.desc @@ -0,0 +1 @@ +{'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/sock_peercred.c b/CRIU_code/test/zdtm/static/sock_peercred.c new file mode 100644 index 0000000..069cc52 --- /dev/null +++ b/CRIU_code/test/zdtm/static/sock_peercred.c @@ -0,0 +1,127 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +#define STACK_SIZE (1024 * 1024) +#define GID_INC 1 +#define UID_INC 1 + +const char *test_doc = "Check peercred of a unix socket remains the same"; +const char *test_author = "Kirill Tkhai "; + +static int child_func(void *fd_p) +{ + int fd = (int)(unsigned long)fd_p; + struct ucred ucred; + socklen_t len; + int sks[2]; + + if (setgid(getgid() + GID_INC) != 0) { + pr_perror("Can't setgid()"); + return 1; + } + + if (setuid(getuid() + UID_INC) != 0) { + pr_perror("Can't setuid()"); + return 1; + } + + if (socketpair(PF_UNIX, SOCK_DGRAM | SOCK_NONBLOCK, 0, sks) < 0) { + pr_perror("Can't create socketpair"); + return 1; + } + + len = sizeof(ucred); + if (getsockopt(sks[0], SOL_SOCKET, SO_PEERCRED, &ucred, &len) < 0) { + pr_perror("Can't getsockopt()"); + return 1; + } + + if (ucred.pid != getpid() || ucred.uid != getuid() || ucred.gid != getgid()) { + pr_perror("Wrong sockopts"); + return 1; + } + + /* If sks[1] == fd, the below closes it, but we don't care */ + if (dup2(sks[0], fd) == -1) { + pr_perror("Can't dup fd\n"); + return 1; + } + + return 0; +} + +int main(int argc, char **argv) +{ + struct ucred ucred; + int fd, status; + socklen_t len; + char 
*stack; + pid_t pid; + int exit_code = 1; + + test_init(argc, argv); + + /* + * We do not know, which direction stack grows. + * So just allocate 2 * STACK_SIZE for stack and + * give clone() pointer to middle of this memory. + */ + stack = malloc(2 * STACK_SIZE); + if (!stack) { + pr_err("malloc\n"); + goto out; + } + + /* Find unused fd */ + for (fd = 0; fd < INT_MAX; fd++) { + if (fcntl(fd, F_GETFD) == -1 && errno == EBADF) + break; + } + + if (fd == INT_MAX) { + pr_err("INT_MAX happens...\n"); + goto out; + } + + pid = clone(child_func, stack + STACK_SIZE, CLONE_FILES|SIGCHLD, (void *)(unsigned long)fd); + if (pid == -1) { + pr_perror("clone"); + goto out; + } + + if (wait(&status) == -1 || status) { + pr_perror("wait error: status=%d\n", status); + goto out; + } + + test_daemon(); + test_waitsig(); + + len = sizeof(ucred); + if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &ucred, &len) < 0) { + fail("Can't getsockopt()"); + goto out; + } + + if (ucred.pid != pid || ucred.uid != getuid() + UID_INC || + ucred.gid != getgid() + GID_INC) { + fail("Wrong pid, uid or gid\n"); + goto out; + } + + pass(); + exit_code = 0; + out: + free(stack); + return exit_code; +} diff --git a/CRIU_code/test/zdtm/static/sock_peercred.desc b/CRIU_code/test/zdtm/static/sock_peercred.desc new file mode 100644 index 0000000..3a7cc18 --- /dev/null +++ b/CRIU_code/test/zdtm/static/sock_peercred.desc @@ -0,0 +1 @@ +{ 'flags': 'suid noauto' } diff --git a/CRIU_code/test/zdtm/static/socket-ext.c b/CRIU_code/test/zdtm/static/socket-ext.c new file mode 100644 index 0000000..333fb8c --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-ext.c @@ -0,0 +1,128 @@ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test external sockets\n"; +const char *test_author = "Andrey Vagin +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +
+#include +#include +#include +#include + +#include +#include + +#include +#include + +#include "zdtmtst.h" + +/* + * Some code snippets are taken from + * http://www.binarytides.com/raw-udp-sockets-c-linux/ + */ + +const char *test_doc = "Test RAW sockets (IPv4,6)\n"; +const char *test_author = "Cyrill Gorcunov "; + +#ifndef SO_IP_SET +# define SO_IP_SET 83 +#endif + +#ifndef IP_SET_OP_VERSION +# define IP_SET_OP_VERSION 0x00000100 /* Ask kernel version */ +#endif + +#define pr_debug(format, arg...) test_msg("DBG: %s:%d: " format, __FILE__, __LINE__, ## arg) + +struct ip_set_req_version { + unsigned int op; + unsigned int version; +}; + +struct pseudo_header { + uint32_t source_address; + uint32_t dest_address; + uint8_t placeholder; + uint8_t protocol; + uint16_t udp_length; +}; + +static int stop_icmp(int sk_icmp, int sk_icmpv6) +{ + struct icmp6_filter filter6 = { }; + struct icmp_filter filter = { }; + socklen_t aux; + int ret = 0; + + aux = sizeof(filter); + ret = getsockopt(sk_icmp, SOL_RAW, ICMP_FILTER, &filter, &aux); + if (ret < 0) { + pr_perror("stop_icmp: Can't fetch icmp filter"); + return ret; + } + + if (filter.data != (1 << ICMP_TIMESTAMP)) { + pr_err("data mismatch on icmp filter %d != %d\n", + filter.data, (1 << ICMP_TIMESTAMP)); + return -1; + } + + aux = sizeof(filter6); + ret = getsockopt(sk_icmpv6, SOL_ICMPV6, ICMPV6_FILTER, &filter6, &aux); + if (ret < 0) { + pr_perror("stop_icmp: Can't fetch icmpv6 filter"); + return ret; + } + + if (filter6.data[0] != (1 << ICMP_TIMESTAMP)) { + pr_err("data mismatch on icmp filter %d != %d\n", + filter6.data[0], (1 << ICMP_TIMESTAMP)); + return -1; + } + + return ret; +} + +static int start_icmp(int sk_icmp, int sk_icmpv6, + const char *a4, const char *a6, int port) +{ + struct sockaddr_in addr_client; + struct icmp6_filter filter6 = { }; + struct icmp_filter filter = { }; + int ret = 0; + + memset(&addr_client, 0, sizeof(addr_client)); + + addr_client.sin_family = AF_INET; + addr_client.sin_port = 
htons(port); + addr_client.sin_addr.s_addr = inet_addr(a4); + + ret = bind(sk_icmp, (struct sockaddr *)&addr_client, sizeof(addr_client)); + if (ret < 0) { + pr_perror("start_icmp: Can't bind RAW client socket"); + return ret; + } + pr_debug("start_icmp: Bound sk_icmp\n"); + + filter.data = (1 << ICMP_TIMESTAMP); + ret = setsockopt(sk_icmp, SOL_RAW, ICMP_FILTER, &filter, sizeof(filter)); + if (ret < 0) { + pr_perror("start_icmp: Can't setup icmp filter"); + return ret; + } + + filter6.data[0] = (1 << ICMP_TIMESTAMP); + ret = setsockopt(sk_icmpv6, SOL_ICMPV6, ICMPV6_FILTER, &filter6, sizeof(filter6)); + if (ret < 0) { + pr_perror("start_icmp: Can't setup icmpv6 filter"); + return ret; + } + + return ret; +} + +static unsigned short csum(unsigned short *ptr, int nbytes) +{ + unsigned short oddbyte; + register short answer; + register long sum; + + sum = 0; + while (nbytes > 1) { + sum += *ptr++; + nbytes -= 2; + } + + if (nbytes == 1) { + oddbyte = 0; + *((unsigned char *)&oddbyte) = *(unsigned char *)ptr; + sum += oddbyte; + } + + sum = (sum >> 16) + (sum & 0xffff); + sum = sum + (sum >> 16); + answer = (short)~sum; + + return answer; +} + +/* + * Just create IPv4/IPv6 sockets with any protos + * to make sure criu won't BUG on unknown proto.
+ */ +static void raw_socks_storm(void) +{ + int sk4[IPPROTO_MAX]; + int sk6[IPPROTO_MAX]; + size_t i; + + + for (i = 1; i < ARRAY_SIZE(sk4); i++) { + sk4[i] = socket(PF_INET, SOCK_RAW | SOCK_NONBLOCK, i); + if (sk4[i] >= 0) + test_msg("Created IPv4 proto %zd: %d\n", i, sk4[i]); + } + + for (i = 1; i < ARRAY_SIZE(sk6); i++) { + sk6[i] = socket(PF_INET6, SOCK_RAW | SOCK_NONBLOCK, i); + if (sk6[i] >= 0) + test_msg("Created IPv6 proto %zd: %d\n", i, sk6[i]); + } +} + +int main(int argc, char *argv[]) +{ + const char string_data[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + const char string_client_ip[] = "127.0.0.12"; + const char string_serv_ip[] = "127.0.0.10"; + const char string_client_icmp_ip[] = "127.0.0.14"; + const char string_client_icmpv6_ip[] = "::14"; + char datagram[512], *data, *pseudogram; + char receiver[512]; + + struct ip_set_req_version req_version; + socklen_t size = sizeof(req_version); + + int sk_udp, sk_udp_serv; + int sk_raw, sk6_raw; + int sk_icmp, sk_icmpv6; + + struct udphdr *udph = (struct udphdr *)(datagram + sizeof(struct ip)); + struct iphdr *iph = (struct iphdr *)datagram; + struct sockaddr_in addr_serv, addr_client; + struct pseudo_header psh; + + int port_client = 8080; + int port_serv = 8081; + + int psize, one = 1; + const int *val = &one; + + socklen_t len = sizeof(struct sockaddr_in); + int ret, status; + + pid_t pid; + + task_waiter_t waiter; + + test_init(argc, argv); + + task_waiter_init(&waiter); + + sk_raw = socket(PF_INET, SOCK_RAW | SOCK_NONBLOCK, IPPROTO_RAW); + if (sk_raw < 0) { + pr_perror("Can't create IPv4 raw socket"); + exit(1); + } + pr_debug("sk_raw %d\n", sk_raw); + + /* Simply to make sure it can be recreated on restore */ + sk6_raw = socket(PF_INET6, SOCK_RAW | SOCK_NONBLOCK, IPPROTO_RAW); + if (sk6_raw < 0) { + pr_perror("Can't create IPv6 raw socket"); + exit(1); + } + pr_debug("sk6_raw %d\n", sk6_raw); + + sk_udp = socket(PF_INET, SOCK_RAW | SOCK_NONBLOCK, IPPROTO_UDP); + if (sk_udp < 0) { + pr_perror("Can't create 
IPv4 raw-udp socket"); + exit(1); + } + pr_debug("sk_udp %d\n", sk_udp); + + sk_icmp = socket(PF_INET, SOCK_RAW | SOCK_NONBLOCK, IPPROTO_ICMP); + if (sk_icmp < 0) { + pr_perror("Can't create IPv4 raw icmp socket"); + exit(1); + } + pr_debug("sk_icmp %d\n", sk_icmp); + + sk_icmpv6 = socket(PF_INET6, SOCK_RAW | SOCK_NONBLOCK, IPPROTO_ICMPV6); + if (sk_icmpv6 < 0) { + pr_perror("Can't create IPv6 raw icmpv6 socket"); + exit(1); + } + pr_debug("sk_icmpv6 %d\n", sk_icmpv6); + + sk_udp_serv = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); + if (sk_udp_serv < 0) { + pr_perror("Can't create DGRAM server socket"); + exit(1); + } + pr_debug("sk_udp_serv %d\n", sk_udp_serv); + + memset(datagram, 0, sizeof(datagram)); + memset(receiver, 0, sizeof(receiver)); + memset(&addr_serv, 0, sizeof(addr_serv)); + memset(&addr_client, 0, sizeof(addr_client)); + + addr_client.sin_family = AF_INET; + addr_client.sin_port = htons(port_client); + addr_client.sin_addr.s_addr = inet_addr(string_client_ip); + + addr_serv.sin_family = AF_INET; + addr_serv.sin_port = htons(port_serv); + addr_serv.sin_addr.s_addr = inet_addr(string_serv_ip); + + ret = bind(sk_udp_serv, (struct sockaddr *)&addr_serv, sizeof(addr_serv)); + if (ret < 0) { + pr_perror("Can't bind DGRAM server socket"); + return 1; + } + pr_debug("Bound sk_udp_serv\n"); + + ret = bind(sk_udp, (struct sockaddr *)&addr_client, sizeof(addr_client)); + if (ret < 0) { + pr_perror("Can't bind DGRAM client socket"); + return 1; + } + pr_debug("Bound sk_udp\n"); + + if (start_icmp(sk_icmp, sk_icmpv6, string_client_icmp_ip, + string_client_icmpv6_ip, port_client)) + return 1; + + data = datagram + sizeof(struct iphdr) + sizeof(struct udphdr); + strcpy(data, string_data); + + iph->ihl = 5; + iph->version = 4; + iph->tos = 0; + iph->tot_len = sizeof(struct iphdr) + sizeof(struct udphdr) + strlen(string_data); + iph->id = htons(54321); + iph->frag_off = 0; + iph->ttl = 255; + iph->protocol = IPPROTO_UDP; + iph->check = 0; + iph->saddr =
inet_addr(string_client_ip); + iph->daddr = addr_serv.sin_addr.s_addr; + iph->check = csum((unsigned short *)datagram, sizeof(struct iphdr)); + + udph->source = htons(port_client); + udph->dest = htons(port_serv); + udph->len = htons(8 + strlen(data)); + udph->check = 0; + + psh.source_address = inet_addr(string_client_ip); + psh.dest_address = addr_serv.sin_addr.s_addr; + psh.placeholder = 0; + psh.protocol = IPPROTO_UDP; + psh.udp_length = htons(sizeof(struct udphdr) + strlen(string_data)); + + psize = sizeof(psh) + sizeof(struct udphdr) + strlen(string_data); + pseudogram = malloc(psize); + if (!pseudogram) { + pr_err("No free memory\n"); + exit(1); + } + + memcpy(pseudogram, (char *)&psh , sizeof(psh)); + memcpy(pseudogram + sizeof(psh), udph, sizeof(*udph) + strlen(string_data)); + + udph->check = csum((unsigned short *)pseudogram, psize); + free(pseudogram); + + if (setsockopt(sk_udp, IPPROTO_IP, IP_HDRINCL, val, sizeof(one)) < 0) { + pr_perror("Error setting IP_HDRINCL"); + exit(1); + } + + pid = test_fork(); + if (pid == 0) { + task_waiter_wait4(&waiter, 2); + pr_debug("Gonna read data\n"); + ret = recvfrom(sk_udp_serv, receiver, sizeof(receiver), 0, + (struct sockaddr *)&addr_client, &len); + if (ret < 0) { + task_waiter_complete(&waiter, 2); + fail("Can't read data"); + exit(1); + } + receiver[ret] = '\0'; + pr_debug("Read %d bytes\n", ret); + + task_waiter_complete(&waiter, 3); + + if (strcmp(receiver, string_data)) { + pr_err("Data mismatch (got %s but expected %s)\n", + receiver, string_data); + exit(1); + } else + pr_debug("Data match\n"); + exit(0); + } else if (pid < 0) { + pr_err("Can't fork\n"); + exit(1); + } + + raw_socks_storm(); + + test_daemon(); + test_waitsig(); + + if (sendto(sk_udp, datagram, iph->tot_len, 0, + (struct sockaddr *)&addr_serv, sizeof(addr_serv)) < 0) { + kill(pid, SIGKILL); + fail("Can't send RAW data"); + exit(1); + } + + task_waiter_complete(&waiter, 2); + pr_debug("Sent %d bytes\n", (int)iph->tot_len); + 
task_waiter_wait4(&waiter, 3); + + ret = wait(&status); + if (ret == -1 || !WIFEXITED(status) || WEXITSTATUS(status)) { + kill(pid, SIGKILL); + fail("Failed waiting server\n"); + exit(1); + } + + req_version.op = IP_SET_OP_VERSION; + ret = getsockopt(sk_raw, SOL_IP, SO_IP_SET, &req_version, &size); + if (ret) { + pr_perror("xt_set getsockopt"); + if (errno != ENOPROTOOPT) { + fail("Can't fetch SO_IP_SET"); + exit(1); + } + } else + test_msg("SO_IP_SET version = %d\n", req_version.version); + + if (stop_icmp(sk_icmp, sk_icmpv6)) { + fail("Failed on ICMP sockets"); + exit(1); + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/socket-raw.desc b/CRIU_code/test/zdtm/static/socket-raw.desc new file mode 100644 index 0000000..359b344 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-raw.desc @@ -0,0 +1 @@ +{'flags': 'suid', 'feature': 'net_diag_raw'} diff --git a/CRIU_code/test/zdtm/static/socket-tcp-close-wait.c b/CRIU_code/test/zdtm/static/socket-tcp-close-wait.c new file mode 100644 index 0000000..3e3462b --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-close-wait.c @@ -0,0 +1,287 @@ +#include "zdtmtst.h" + +#ifdef ZDTM_IPV4V6 +#define ZDTM_FAMILY AF_INET +#define ZDTM_SRV_FAMILY AF_INET6 +#elif defined(ZDTM_IPV6) +#define ZDTM_FAMILY AF_INET6 +#define ZDTM_SRV_FAMILY AF_INET6 +#else +#define ZDTM_FAMILY AF_INET +#define ZDTM_SRV_FAMILY AF_INET +#endif + +const char *test_doc = "Check sockets in TCP_WAIT_STOP and TCP_LAST_ACK states\n"; +const char *test_author = "Andrey Vagin +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int port = 8880; + +#define BUF_SIZE 4096 + +int fill_sock_buf(int fd) +{ + int flags; + int size; + int ret; + + flags = fcntl(fd, F_GETFL, 0); + if (flags == -1) { + pr_err("Can't get flags"); + return -1; + } + if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1) { + pr_err("Can't set flags"); + return -1; + } + + size = 0; + while (1) { + char zdtm[] = "zdtm test 
packet"; + ret = write(fd, zdtm, sizeof(zdtm)); + if (ret == -1) { + if (errno == EAGAIN) + break; + pr_err("write"); + return -1; + } + size += ret; + } + + if (fcntl(fd, F_SETFL, flags) == -1) { + pr_err("Can't set flags"); + return -1; + } + + test_msg("snd_size = %d\n", size); + + return size; +} + +static int clean_sk_buf(int fd) +{ + int size, ret; + char buf[BUF_SIZE]; + + size = 0; + while (1) { + ret = read(fd, buf, sizeof(buf)); + if (ret == -1) { + pr_err("read"); + return -11; + } + + if (ret == 0) + break; + + size += ret; + } + + test_msg("rcv_size = %d\n", size); + + return size; +} + +#define TEST_MSG "Hello World!" + +int main(int argc, char **argv) +{ + char *newns = getenv("ZDTM_NEWNS"); + int fd, fd_s, ctl_fd; + pid_t extpid; + int pfd[2]; + int ret = 0, snd_size = 0, rcv_size = 0; +#ifndef ZDTM_TCP_LAST_ACK + char buf[BUF_SIZE]; +#endif + + if (newns) + test_init(argc, argv); + + if (pipe(pfd)) { + pr_perror("pipe() failed"); + return 1; + } + + extpid = fork(); + if (extpid < 0) { + pr_perror("fork() failed"); + return 1; + } else if (extpid == 0) { + int size = 0; + char c; + + if (!newns) + test_ext_init(argc, argv); + + close(pfd[1]); + if (read(pfd[0], &port, sizeof(port)) != sizeof(port)) { + pr_perror("Can't read port\n"); + return 1; + } + close(pfd[0]); + + fd = tcp_init_client(ZDTM_FAMILY, "127.0.0.1", port); + if (fd < 0) + return 1; + + ctl_fd = tcp_init_client(ZDTM_FAMILY, "127.0.0.1", port); + if (ctl_fd < 0) + return 1; + + /* == The preparation stage == */ + if (read(ctl_fd, &size, sizeof(size)) != sizeof(size)) { + pr_perror("read"); + return 1; + } + + if (shutdown(fd, SHUT_WR) == -1) { + pr_perror("shutdown"); + return 1; + } + + if (write(ctl_fd, &size, sizeof(size)) != sizeof(size)) { + pr_perror("write"); + return 1; + } + /* == End of the preparation stage == */ + + /* Checkpoint/restore */ + + /* == The final stage == */ + if (read(ctl_fd, &c, 1) != 0) { + pr_perror("read"); + return 1; + } + +#ifdef ZDTM_TCP_LAST_ACK + 
size = clean_sk_buf(fd); + if (size < 0) + return 1; +#else + if (read(fd, buf, sizeof(buf)) != sizeof(TEST_MSG) || + strncmp(buf, TEST_MSG, sizeof(TEST_MSG))) { + pr_perror("read"); + return 1; + } +#endif + + if (write(ctl_fd, &size, sizeof(size)) != sizeof(size)) { + pr_perror("write"); + return 1; + } + /* == End of the final stage == */ + + close(ctl_fd); + close(fd); + + return 0; + } + + if (!newns) + test_init(argc, argv); + + if ((fd_s = tcp_init_server(ZDTM_SRV_FAMILY, &port)) < 0) { + pr_err("initializing server failed"); + return 1; + } + + close(pfd[0]); + if (write(pfd[1], &port, sizeof(port)) != sizeof(port)) { + pr_perror("Can't send port"); + return 1; + } + close(pfd[1]); + + /* + * parent is server of TCP connection + */ + fd = tcp_accept_server(fd_s); + if (fd < 0) { + pr_err("can't accept client connection"); + return 1; + } + + ctl_fd = tcp_accept_server(fd_s); + if (ctl_fd < 0) { + pr_err("can't accept client connection"); + return 1; + } + + /* == The preparation stage == */ +#ifdef ZDTM_TCP_LAST_ACK + snd_size = fill_sock_buf(fd); + if (snd_size <= 0) + return 1; +#endif + + if (write(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) { + pr_perror("write"); + return 1; + } + + if (read(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) { + pr_perror("read"); + return 1; + } + /* == End of the preparation stage */ + +#ifdef ZDTM_TCP_LAST_ACK + if (shutdown(fd, SHUT_WR) == -1) { + pr_perror("shutdown"); + return 1; + } +#endif + + test_daemon(); + test_waitsig(); + + /* == The final stage == */ + if (shutdown(ctl_fd, SHUT_WR) == -1) { + pr_perror("shutdown"); + return 1; + } + +#ifndef ZDTM_TCP_LAST_ACK + if (write(fd, TEST_MSG, sizeof(TEST_MSG)) != sizeof(TEST_MSG)) { + pr_perror("write"); + return 1; + } + + if (shutdown(fd, SHUT_WR) == -1) { + pr_perror("shutdown"); + return 1; + } +#endif + + rcv_size = clean_sk_buf(fd); + + if (ret != rcv_size) { + fail("The child sent %d bytes, but the parent received %d bytes\n", ret, rcv_size); + return 1; + } + + if
(read(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) { + pr_perror("read"); + return 1; + } + /* == End of the final stage == */ + + if (ret != snd_size) { + fail("The parent sent %d bytes, but the child received %d bytes\n", snd_size, ret); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/socket-tcp-close-wait.desc b/CRIU_code/test/zdtm/static/socket-tcp-close-wait.desc new file mode 100644 index 0000000..155e103 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-close-wait.desc @@ -0,0 +1 @@ +{'opts': '--tcp-established', 'flags': 'nouser samens', 'feature' : 'tcp_half_closed'} diff --git a/CRIU_code/test/zdtm/static/socket-tcp-close-wait.hook b/CRIU_code/test/zdtm/static/socket-tcp-close-wait.hook new file mode 100644 index 0000000..73d7da1 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-close-wait.hook @@ -0,0 +1 @@ +socket-tcp-fin-wait1.hook \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp-close0.c b/CRIU_code/test/zdtm/static/socket-tcp-close0.c new file mode 100644 index 0000000..6043ceb --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-close0.c @@ -0,0 +1,75 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that tcp-close option closes connected tcp socket"; +const char *test_author = "Pavel Begunkov "; + +static int port = 8880; + +static int check_socket_closed(int sk) +{ + int err, buffer = 0; + struct { + __u8 tcpi_state; + } info; + socklen_t len = sizeof(info); + + err = getsockopt(sk, IPPROTO_TCP, TCP_INFO, (void *)&info, &len); + if (err != 0) { + pr_perror("Can't get socket state\n"); + return -1; + } else if (info.tcpi_state != TCP_CLOSE) { + pr_err("Invalid socket state (%i)\n", (int)info.tcpi_state); + return -1; + } + + err = recv(sk, &buffer, sizeof(buffer), 0); + if (!err || errno != ENOTCONN) { + pr_perror("Invalid recv response\n"); + return -1; + } + return 0; +} + +int main(int argc, 
char **argv) +{ + int fd, fd_s, clt; + + test_init(argc, argv); + + fd_s = tcp_init_server(AF_INET, &port); + if (fd_s < 0) { + pr_err("Server initializations failed\n"); + return 1; + } + clt = tcp_init_client(AF_INET, "localhost", port); + if (clt < 0) + return 1; + + fd = tcp_accept_server(fd_s); + if (fd < 0) { + pr_err("Can't accept client connection\n"); + return 1; + } + close(fd_s); + + test_daemon(); + test_waitsig(); + + if (check_socket_closed(fd)) { + fail("Server socket isn't closed\n"); + return 1; + } + if (check_socket_closed(clt)) { + fail("Client socket isn't closed\n"); + return 1; + } + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/socket-tcp-close0.desc b/CRIU_code/test/zdtm/static/socket-tcp-close0.desc new file mode 100644 index 0000000..75ce8a5 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-close0.desc @@ -0,0 +1 @@ +{'dopts': '--tcp-established', 'ropts': '--tcp-close', 'flags': 'reqrst '} diff --git a/CRIU_code/test/zdtm/static/socket-tcp-close1.c b/CRIU_code/test/zdtm/static/socket-tcp-close1.c new file mode 100644 index 0000000..3dba4e5 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-close1.c @@ -0,0 +1,53 @@ +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that tcp-close option doesn't close listening tcp socket"; +const char *test_author = "Pavel Begunkov "; + +static int port = 8880; + +static int check_socket_state(int sk, int state) +{ + int err; + struct { + __u8 tcpi_state; + } info; + socklen_t len = sizeof(info); + + err = getsockopt(sk, IPPROTO_TCP, TCP_INFO, (void *)&info, &len); + if (err != 0) { + pr_perror("Can't get socket state\n"); + return -1; + } + return info.tcpi_state == state ? 
0 : -1; +} + +int main(int argc, char **argv) +{ + int fd_s; + + test_init(argc, argv); + + fd_s = tcp_init_server(AF_INET, &port); + if (fd_s < 0) { + pr_err("Server initializations failed\n"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (check_socket_state(fd_s, TCP_LISTEN)) { + fail("Listen socket state is changed\n"); + close(fd_s); + return 1; + } + close(fd_s); + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/socket-tcp-close1.desc b/CRIU_code/test/zdtm/static/socket-tcp-close1.desc new file mode 100644 index 0000000..836b8fa --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-close1.desc @@ -0,0 +1 @@ +socket-tcp-close0.desc \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp-closed-last-ack.c b/CRIU_code/test/zdtm/static/socket-tcp-closed-last-ack.c new file mode 100644 index 0000000..b0c363c --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-closed-last-ack.c @@ -0,0 +1 @@ +socket-tcp-closed.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp-closed-last-ack.desc b/CRIU_code/test/zdtm/static/socket-tcp-closed-last-ack.desc new file mode 100644 index 0000000..8a9df7f --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-closed-last-ack.desc @@ -0,0 +1,10 @@ +{ 'deps': [ '/bin/sh', + '/sbin/iptables', + '/usr/lib64/xtables/libxt_tcp.so|/lib/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so', + '/usr/lib64/xtables/libxt_standard.so|/lib/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so', + ], + 'opts': '--tcp-established', + 'flags': 'suid nouser samens', + 'feature' : 'tcp_half_closed', + 'flavor': 'ns uns', +} diff --git 
a/CRIU_code/test/zdtm/static/socket-tcp-closed-last-ack.hook b/CRIU_code/test/zdtm/static/socket-tcp-closed-last-ack.hook new file mode 100644 index 0000000..73d7da1 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-closed-last-ack.hook @@ -0,0 +1 @@ +socket-tcp-fin-wait1.hook \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp-closed.c b/CRIU_code/test/zdtm/static/socket-tcp-closed.c new file mode 100644 index 0000000..a51f9b9 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-closed.c @@ -0,0 +1,146 @@ +#include "zdtmtst.h" + +#ifdef ZDTM_IPV4V6 +#define ZDTM_FAMILY AF_INET +#define ZDTM_SRV_FAMILY AF_INET6 +#elif defined(ZDTM_IPV6) +#define ZDTM_FAMILY AF_INET6 +#define ZDTM_SRV_FAMILY AF_INET6 +#else +#define ZDTM_FAMILY AF_INET +#define ZDTM_SRV_FAMILY AF_INET +#endif + +const char *test_doc = "Check closed tcp sockets\n"; +const char *test_author = "Andrey Vagin +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int port = 8880; + +union sockaddr_inet { + struct sockaddr addr; + struct sockaddr_in v4; + struct sockaddr_in6 v6; +}; + +int main(int argc, char **argv) +{ + int fd, fd_s, clt, sk; + union sockaddr_inet src_addr, dst_addr, addr; + socklen_t aux; + char c = 5; +#ifdef ZDTM_TCP_LAST_ACK + char cmd[4096]; +#endif + + test_init(argc, argv); + signal(SIGPIPE, SIG_IGN); + + sk = socket(ZDTM_FAMILY, SOCK_STREAM, 0); + if (sk < 0) { + pr_perror("socket"); + return 1; + } + + if ((fd_s = tcp_init_server(ZDTM_SRV_FAMILY, &port)) < 0) { + pr_err("initializing server failed\n"); + return 1; + } + + clt = tcp_init_client(ZDTM_FAMILY, "localhost", port); + if (clt < 0) + return 1; + + /* + * parent is server of TCP connection + */ + fd = tcp_accept_server(fd_s); + if (fd < 0) { + pr_err("can't accept client connection\n"); + return 1; + } + close(fd_s); + + shutdown(clt, SHUT_WR); + +#ifdef ZDTM_TCP_LAST_ACK + snprintf(cmd, sizeof(cmd), "iptables -w -t filter --protocol 
tcp -A INPUT --dport %d -j DROP", port); + if (system(cmd)) + return -1; +#endif + + shutdown(fd, SHUT_WR); + + if (ZDTM_FAMILY == AF_INET) + aux = sizeof(struct sockaddr_in); + else if (ZDTM_FAMILY == AF_INET6) + aux = sizeof(struct sockaddr_in6); + else + return 1; + + if (getsockopt(clt, SOL_SOCKET, SO_PEERNAME, &dst_addr, &aux)) { + pr_perror("SO_PEERNAME"); + return 1; + } + if (getsockname(clt, &src_addr.addr, &aux)) { + pr_perror("getsockname"); + return 1; + } + + test_daemon(); + test_waitsig(); + +#ifdef ZDTM_TCP_LAST_ACK + snprintf(cmd, sizeof(cmd), "iptables -w -t filter --protocol tcp -D INPUT --dport %d -j DROP", port); + if (system(cmd)) + return -1; +#endif + + if (read(fd, &c, 1) != 0) { + fail("read"); + return 1; + } + if (read(clt, &c, 1) != 0) { + fail("read"); + return 1; + } + if (write(clt, &c, 1) != -1) { + fail("write"); + return 1; + } + if (write(fd, &c, 1) != -1) { + fail("write"); + return 1; + } + + if (getsockopt(clt, SOL_SOCKET, SO_PEERNAME, &addr, &aux)) { + pr_perror("SO_PEERNAME"); + return 1; + } + if (memcmp(&addr, &dst_addr, aux)) { + pr_err("A destination address mismatch"); + return 1; + } + + if (getsockname(clt, &addr.addr, &aux)) { + pr_perror("getsockname"); + return 1; + } + if (memcmp(&addr, &src_addr, aux)) { + pr_err("A source address mismatch"); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/socket-tcp-closed.desc b/CRIU_code/test/zdtm/static/socket-tcp-closed.desc new file mode 100644 index 0000000..21cbeff --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-closed.desc @@ -0,0 +1 @@ +{'opts': '--tcp-established', 'flags': 'nouser samens', 'feature' : 'tcp_half_closed', 'flavor' : 'ns uns'} diff --git a/CRIU_code/test/zdtm/static/socket-tcp-closed.hook b/CRIU_code/test/zdtm/static/socket-tcp-closed.hook new file mode 100644 index 0000000..73d7da1 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-closed.hook @@ -0,0 +1 @@ +socket-tcp-fin-wait1.hook \ No newline at 
end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp-closing.c b/CRIU_code/test/zdtm/static/socket-tcp-closing.c new file mode 100644 index 0000000..23d49cc --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-closing.c @@ -0,0 +1,243 @@ +#include "zdtmtst.h" + +#ifdef ZDTM_IPV4V6 +#define ZDTM_FAMILY AF_INET +#define ZDTM_SRV_FAMILY AF_INET6 +#elif defined(ZDTM_IPV6) +#define ZDTM_FAMILY AF_INET6 +#define ZDTM_SRV_FAMILY AF_INET6 +#else +#define ZDTM_FAMILY AF_INET +#define ZDTM_SRV_FAMILY AF_INET +#endif + +const char *test_doc = "Check sockets in the TCP_CLOSING state\n"; +const char *test_author = "Andrey Vagin +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int port = 8880; + +#define BUF_SIZE 4096 + +int fill_sock_buf(int fd) +{ + int flags; + int size; + int ret; + + flags = fcntl(fd, F_GETFL, 0); + if (flags == -1) { + pr_err("Can't get flags"); + return -1; + } + if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1) { + pr_err("Can't set flags"); + return -1; + } + + size = 0; + while (1) { + char zdtm[] = "zdtm test packet"; + ret = write(fd, zdtm, sizeof(zdtm)); + if (ret == -1) { + if (errno == EAGAIN) + break; + pr_err("write"); + return -1; + } + size += ret; + } + + if (fcntl(fd, F_SETFL, flags) == -1) { + pr_err("Can't set flags"); + return -1; + } + + test_msg("snd_size = %d\n", size); + + return size; +} + +static int clean_sk_buf(int fd) +{ + int size, ret; + char buf[BUF_SIZE]; + + size = 0; + while (1) { + ret = read(fd, buf, sizeof(buf)); + if (ret == -1) { + pr_err("read"); + return -11; + } + + if (ret == 0) + break; + + size += ret; + } + + test_msg("rcv_size = %d\n", size); + + return size; +} + +int main(int argc, char **argv) +{ + char *newns = getenv("ZDTM_NEWNS"); + int fd, fd_s, ctl_fd; + pid_t extpid; + int pfd[2]; + int ret, snd_size = 0, rcv_size = 0; + + if (newns) + test_init(argc, argv); + + if (pipe(pfd)) { + pr_err("pipe() failed"); + return 1; + } + + extpid = 
fork(); + if (extpid < 0) { + pr_err("fork() failed"); + return 1; + } else if (extpid == 0) { /* child: client end of both connections */ + int size = 0; + char c; + + if (!newns) + test_ext_init(argc, argv); + /* the parent sends the server port through the pipe */ + close(pfd[1]); + if (read(pfd[0], &port, sizeof(port)) != sizeof(port)) { + pr_err("Can't read port\n"); + return 1; + } + + fd = tcp_init_client(ZDTM_FAMILY, "127.0.0.1", port); + if (fd < 0) + return 1; + + ctl_fd = tcp_init_client(ZDTM_FAMILY, "127.0.0.1", port); + if (ctl_fd < 0) + return 1; + + size = fill_sock_buf(fd); + if (size <= 0) + return 1; + + if (shutdown(fd, SHUT_WR) == -1) { + pr_err("shutdown"); + return 1; + } + /* tell the parent how many bytes were queued on fd */ + if (write(ctl_fd, &size, sizeof(size)) != sizeof(size)) { + pr_err("write"); + return 1; + } + /* block until the parent shuts down its side of the control connection (read() returns 0) */ + if (read(ctl_fd, &c, 1) != 0) { + pr_err("read"); + return 1; + } + + size = clean_sk_buf(fd); + if (size < 0) + return 1; + + write(ctl_fd, &size, sizeof(size)); /* reports the drained byte count; the result of write() is not checked */ + close(fd); + + return 0; + } + + if (!newns) + test_init(argc, argv); + + if ((fd_s = tcp_init_server(ZDTM_SRV_FAMILY, &port)) < 0) { + pr_err("initializing server failed"); + return 1; + } + + close(pfd[0]); + if (write(pfd[1], &port, sizeof(port)) != sizeof(port)) { + pr_err("Can't send port"); + return 1; + } + close(pfd[1]); + + /* + * parent is server of TCP connection + */ + fd = tcp_accept_server(fd_s); + if (fd < 0) { + pr_err("can't accept client connection %m"); + return 1; + } + + ctl_fd = tcp_accept_server(fd_s); + if (ctl_fd < 0) { + pr_err("can't accept client connection %m"); + return 1; + } + /* both ends fill their send queue and then shut down the write side */ + snd_size = fill_sock_buf(fd); + if (snd_size <= 0) + return 1; + + if (shutdown(fd, SHUT_WR) == -1) { + pr_err("shutdown"); + return 1; + } + /* ret = number of bytes the child queued on fd */ + if (read(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) { + pr_err("read"); + return 1; + } + + rcv_size = clean_sk_buf(fd); + + if (ret != rcv_size) { + fail("The child sent %d bytes, but the parent received %d bytes\n", ret, rcv_size); + return 1; + } + + sleep(1); + + test_daemon(); + test_waitsig(); + /* fd was already drained to end of stream above, so read() has to return 0 here */ + if (read(fd, &ret, sizeof(ret))) { + pr_perror("read"); +
return 1; + } + /* shutting down the control connection is what lets the child's read(ctl_fd, &c, 1) return 0 */ + if (shutdown(ctl_fd, SHUT_WR) == -1) { + pr_err("shutdown"); + return 1; + } + /* the child answers with the number of bytes it drained from fd */ + if (read(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) { + pr_err("read"); + return 1; + } + + if (ret != snd_size) { + fail("The parent sent %d bytes, but the child received %d bytes\n", snd_size, ret); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/socket-tcp-closing.desc b/CRIU_code/test/zdtm/static/socket-tcp-closing.desc new file mode 100644 index 0000000..155e103 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-closing.desc @@ -0,0 +1 @@ +{'opts': '--tcp-established', 'flags': 'nouser samens', 'feature' : 'tcp_half_closed'} diff --git a/CRIU_code/test/zdtm/static/socket-tcp-closing.hook b/CRIU_code/test/zdtm/static/socket-tcp-closing.hook new file mode 100644 index 0000000..73d7da1 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-closing.hook @@ -0,0 +1 @@ +socket-tcp-fin-wait1.hook \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp-fin-wait1.c b/CRIU_code/test/zdtm/static/socket-tcp-fin-wait1.c new file mode 100644 index 0000000..6c7cc93 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-fin-wait1.c @@ -0,0 +1,232 @@ +#include "zdtmtst.h" + +#ifdef ZDTM_IPV4V6 +#define ZDTM_FAMILY AF_INET +#define ZDTM_SRV_FAMILY AF_INET6 +#elif defined(ZDTM_IPV6) +#define ZDTM_FAMILY AF_INET6 +#define ZDTM_SRV_FAMILY AF_INET6 +#else +#define ZDTM_FAMILY AF_INET +#define ZDTM_SRV_FAMILY AF_INET +#endif + +const char *test_doc = "Check sockets in TCP_FIN_WAIT* states\n"; +const char *test_author = "Andrey Vagin +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int port = 8880; + +#define TEST_MSG "Hello World!"
+#define BUF_SIZE 4096 + +int fill_sock_buf(int fd) +{ + int flags; + int size; + int ret; + + flags = fcntl(fd, F_GETFL, 0); + if (flags == -1) { + pr_err("Can't get flags"); + return -1; + } + if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1) { + pr_err("Can't set flags"); + return -1; + } + + size = 0; + while (1) { + char zdtm[] = "zdtm test packet"; + ret = write(fd, zdtm, sizeof(zdtm)); + if (ret == -1) { + if (errno == EAGAIN) + break; + pr_err("write"); + return -1; + } + size += ret; + } + + if (fcntl(fd, F_SETFL, flags) == -1) { + pr_err("Can't set flags"); + return -1; + } + + test_msg("snd_size = %d\n", size); + + return size; +} + +static int clean_sk_buf(int fd) +{ + int size, ret; + char buf[BUF_SIZE]; + + size = 0; + while (1) { + ret = read(fd, buf, sizeof(buf)); + if (ret == -1) { + pr_err("read"); + return -11; + } + + if (ret == 0) + break; + + size += ret; + } + + test_msg("rcv_size = %d\n", size); + + return size; +} + +int main(int argc, char **argv) +{ + char *newns = getenv("ZDTM_NEWNS"); + int fd, fd_s, ctl_fd; + pid_t extpid; + int pfd[2]; + int ret, snd_size = 0; + char buf[BUF_SIZE]; + + if (newns) + test_init(argc, argv); + + if (pipe(pfd)) { + pr_err("pipe() failed"); + return 1; + } + + extpid = fork(); + if (extpid < 0) { + pr_err("fork() failed"); + return 1; + } else if (extpid == 0) { + int size = 0; + char c; + + if (!newns) + test_ext_init(argc, argv); + + close(pfd[1]); + if (read(pfd[0], &port, sizeof(port)) != sizeof(port)) { + pr_err("Can't read port\n"); + return 1; + } + + fd = tcp_init_client(ZDTM_FAMILY, "127.0.0.1", port); + if (fd < 0) + return 1; + write(fd, TEST_MSG, 2); + + ctl_fd = tcp_init_client(ZDTM_FAMILY, "127.0.0.1", port); + if (ctl_fd < 0) + return 1; + + if (read(ctl_fd, &c, 1) != 0) { + pr_err("read"); + return 1; + } + + if (write(fd, TEST_MSG + 2, sizeof(TEST_MSG) - 2) != sizeof(TEST_MSG) - 2) { + pr_err("write"); + return 1; + } + + if (shutdown(fd, SHUT_WR) == -1) { + pr_err("shutdown"); + return 1; 
+ } + + size = clean_sk_buf(fd); + if (size < 0) + return 1; + + write(ctl_fd, &size, sizeof(size)); /* reports the drained byte count; the result of write() is not checked */ + close(fd); + + return 0; + } + + if (!newns) + test_init(argc, argv); + + if ((fd_s = tcp_init_server(ZDTM_SRV_FAMILY, &port)) < 0) { + pr_err("initializing server failed"); + return 1; + } + + close(pfd[0]); + if (write(pfd[1], &port, sizeof(port)) != sizeof(port)) { + pr_err("Can't send port"); + return 1; + } + close(pfd[1]); + + /* + * parent is server of TCP connection + */ + fd = tcp_accept_server(fd_s); + if (fd < 0) { + pr_err("can't accept client connection %m"); + return 1; + } + + ctl_fd = tcp_accept_server(fd_s); + if (ctl_fd < 0) { + pr_err("can't accept client connection %m"); + return 1; + } + /* when ZDTM_TCP_FIN_WAIT2 is defined nothing is queued and snd_size keeps its initial value 0 */ +#if !defined(ZDTM_TCP_FIN_WAIT2) + snd_size = fill_sock_buf(fd); + if (snd_size <= 0) + return 1; +#endif + + if (shutdown(fd, SHUT_WR) == -1) { + pr_err("shutdown"); + return 1; + } + + + test_daemon(); + test_waitsig(); + /* end of stream on the control connection releases the child, which then sends the rest of TEST_MSG */ + if (shutdown(ctl_fd, SHUT_WR) == -1) { + pr_err("shutdown"); + return 1; + } + /* the child sends TEST_MSG in two pieces, sizeof(TEST_MSG) bytes in total (terminating NUL included) */ + if (recv(fd, buf, sizeof(buf), MSG_WAITALL) != sizeof(TEST_MSG) || + strncmp(buf, TEST_MSG, sizeof(TEST_MSG))) { + pr_err("read"); + return 1; + } + /* ret = number of bytes the child drained from fd */ + if (read(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) { + pr_err("read"); + return 1; + } + + if (ret != snd_size) { + fail("The parent sent %d bytes, but the child received %d bytes\n", snd_size, ret); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/socket-tcp-fin-wait1.desc b/CRIU_code/test/zdtm/static/socket-tcp-fin-wait1.desc new file mode 100644 index 0000000..155e103 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-fin-wait1.desc @@ -0,0 +1 @@ +{'opts': '--tcp-established', 'flags': 'nouser samens', 'feature' : 'tcp_half_closed'} diff --git a/CRIU_code/test/zdtm/static/socket-tcp-fin-wait1.hook b/CRIU_code/test/zdtm/static/socket-tcp-fin-wait1.hook new file mode 100644 index 0000000..9504557 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-fin-wait1.hook
@@ -0,0 +1,70 @@ +#!/usr/bin/env python +import sys + +sys.path.append("../crit") + +import pycriu +import os, os.path +import json +import difflib +import subprocess + +if sys.argv[1] in ["--pre-dump", "--post-restore"]: + pid = os.getenv("ZDTM_TEST_PID") + try: + subprocess.Popen(["nsenter", "-t", pid, "-n", "ss", "-t", "-a", "-n"]).wait() + except OSError as e: + pass + +if sys.argv[1] != "--post-restore": + sys.exit(0) + +print("Check TCP images") + +def get_sockets(image_dir): + fname = os.path.join(image_dir, "inetsk.img") + if not os.access(fname, os.F_OK): + return None + + f = open(fname) + sockets = pycriu.images.load(f) + sockets = sockets["entries"] + + for s in sockets: + f = open(os.path.join(image_dir, "inetsk.img")) + ids = pycriu.images.load(f) + tcp_img = os.path.join(image_dir, "tcp-stream-%x.img" % int(s["ino"])) + print(tcp_img) + if os.access(tcp_img, os.F_OK): + f = open(tcp_img) + tcp = pycriu.images.load(f) + s['tcp'] = tcp["entries"][0] + s["tcp"].pop("extra", None) + s["tcp"].pop("timestamp", None) + s["tcp"].pop("snd_wl1", None) + s["tcp"].pop("rcv_wnd", None) + s["tcp"].pop("snd_wnd", None) + s["tcp"].pop("max_window", None) + s.pop("id", None) + s.pop("ino") + sockets.sort(lambda a, b: cmp(a["src_port"] + a["dst_port"], b["src_port"] + b["dst_port"])) + return sockets + +path = os.getenv("ZDTM_IMG_DIR") +prev = None +exit_code = 0 +for d in os.listdir(path): + sockets = get_sockets(os.path.join(path, d)) + if not prev: + prev = sockets + continue + + if prev == sockets: + continue + + sockets_str = json.dumps(sockets, sys.stdout, indent=8, sort_keys=True) + prev_str = json.dumps(prev, sys.stdout, indent=8, sort_keys=True) + + print("\n".join(difflib.unified_diff(prev_str.split("\n"), sockets_str.split("\n")))) + +sys.exit(exit_code) diff --git a/CRIU_code/test/zdtm/static/socket-tcp-fin-wait2.c b/CRIU_code/test/zdtm/static/socket-tcp-fin-wait2.c new file mode 100644 index 0000000..2dede5f --- /dev/null +++ 
b/CRIU_code/test/zdtm/static/socket-tcp-fin-wait2.c @@ -0,0 +1 @@ +socket-tcp-fin-wait1.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp-fin-wait2.desc b/CRIU_code/test/zdtm/static/socket-tcp-fin-wait2.desc new file mode 100644 index 0000000..fba0e07 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-fin-wait2.desc @@ -0,0 +1 @@ +socket-tcp-fin-wait1.desc \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp-fin-wait2.hook b/CRIU_code/test/zdtm/static/socket-tcp-fin-wait2.hook new file mode 100644 index 0000000..73d7da1 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-fin-wait2.hook @@ -0,0 +1 @@ +socket-tcp-fin-wait1.hook \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp-last-ack.c b/CRIU_code/test/zdtm/static/socket-tcp-last-ack.c new file mode 100644 index 0000000..20d7e78 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-last-ack.c @@ -0,0 +1 @@ +socket-tcp-close-wait.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp-last-ack.desc b/CRIU_code/test/zdtm/static/socket-tcp-last-ack.desc new file mode 100644 index 0000000..8cf8416 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-last-ack.desc @@ -0,0 +1 @@ +{'flavor': 'h ns uns', 'opts': '--tcp-established', 'flags': 'nouser samens', 'feature' : 'tcp_half_closed'} diff --git a/CRIU_code/test/zdtm/static/socket-tcp-last-ack.hook b/CRIU_code/test/zdtm/static/socket-tcp-last-ack.hook new file mode 100644 index 0000000..73d7da1 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-last-ack.hook @@ -0,0 +1 @@ +socket-tcp-fin-wait1.hook \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp-local.c b/CRIU_code/test/zdtm/static/socket-tcp-local.c new file mode 100644 index 0000000..8cb60dd --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-local.c @@ -0,0 +1 @@ +socket-tcp.c \ No newline at end of file diff --git 
a/CRIU_code/test/zdtm/static/socket-tcp-local.desc b/CRIU_code/test/zdtm/static/socket-tcp-local.desc new file mode 100644 index 0000000..6a406f6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-local.desc @@ -0,0 +1 @@ +{'flavor': 'h ns uns', 'opts': '--tcp-established', 'flags': 'nouser samens'} diff --git a/CRIU_code/test/zdtm/static/socket-tcp-local.hook b/CRIU_code/test/zdtm/static/socket-tcp-local.hook new file mode 100644 index 0000000..4ffe289 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-local.hook @@ -0,0 +1 @@ +socket-tcp-fin-wait2.hook \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp-nfconntrack.c b/CRIU_code/test/zdtm/static/socket-tcp-nfconntrack.c new file mode 100644 index 0000000..8cb60dd --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-nfconntrack.c @@ -0,0 +1 @@ +socket-tcp.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp-nfconntrack.desc b/CRIU_code/test/zdtm/static/socket-tcp-nfconntrack.desc new file mode 100644 index 0000000..add2513 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-nfconntrack.desc @@ -0,0 +1 @@ +{'flavor': 'h', 'opts': '--tcp-established', 'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/socket-tcp-reseted.c b/CRIU_code/test/zdtm/static/socket-tcp-reseted.c new file mode 100644 index 0000000..07ad360 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-reseted.c @@ -0,0 +1,96 @@ + +#include "zdtmtst.h" +#include +#include +#include /* for sockaddr_in and inet_ntoa() */ +#include +#include +#include + +#ifdef ZDTM_IPV4V6 +#define ZDTM_FAMILY AF_INET +#define ZDTM_SRV_FAMILY AF_INET6 +#elif defined(ZDTM_IPV6) +#define ZDTM_FAMILY AF_INET6 +#define ZDTM_SRV_FAMILY AF_INET6 +#else +#define ZDTM_FAMILY AF_INET +#define ZDTM_SRV_FAMILY AF_INET +#endif + +const char *test_doc = "Check, that a reseted TCP connection can be restored\n"; +const char *test_author = "Andrey Vagin +#include +#include +#include +#include 
+#include +#include +#include +#include + +static int port = 8880; + /* Connect a client to a local server, write to the accepted socket while an iptables REJECT --reject-with tcp-reset rule covers the server port, remove the rule, and after test_waitsig() check that the 3 bytes sent earlier are still readable and that a further write fails. */ +int main(int argc, char **argv) +{ + int fd, fd_s, clt; + char cmd[4096], buf[10]; + + test_init(argc, argv); + signal(SIGPIPE, SIG_IGN); /* a write on the broken connection must return -1 instead of raising SIGPIPE */ + + if ((fd_s = tcp_init_server(ZDTM_SRV_FAMILY, &port)) < 0) { + pr_err("initializing server failed\n"); + return 1; + } + + + clt = tcp_init_client(ZDTM_FAMILY, "localhost", port); + if (clt < 0) { + pr_perror("Unable to create a client socket"); + return 1; + } + + /* + * parent is server of TCP connection + */ + fd = tcp_accept_server(fd_s); + if (fd < 0) { + pr_err("can't accept client connection\n"); + return 1; + } + if (write(clt, "asd", 3) != 3) { + pr_perror("Unable to write into a socket"); + return 1; + } + snprintf(cmd, sizeof(cmd), "iptables -w -t filter --protocol tcp -A INPUT --dport %d -j REJECT --reject-with tcp-reset", port); + if (system(cmd)) + return 1; + /* written while the reject rule is installed; presumably this is what gets the connection reset - TODO confirm */ + if (write(fd, "asdas", 5) == -1) { + pr_perror("Unable to write into a socket"); + return 1; + } + + snprintf(cmd, sizeof(cmd), "iptables -w -t filter --protocol tcp -D INPUT --dport %d -j REJECT --reject-with tcp-reset", port); + if (system(cmd)) + return 1; + + test_daemon(); + test_waitsig(); + /* exactly the 3 bytes ("asd") the client wrote before the rule was added must be readable */ + if (read(fd, buf, sizeof(buf)) != 3) { + fail("Unable to read data from a socket"); + return 1; + } + /* the write is expected to fail; success is reported as a test failure */ + if (write(fd, buf, 3) != -1) { + fail("Can write into a closed socket"); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/socket-tcp-reseted.desc b/CRIU_code/test/zdtm/static/socket-tcp-reseted.desc new file mode 100644 index 0000000..94425b4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-reseted.desc @@ -0,0 +1,10 @@ +{ 'deps': [ '/bin/sh', + '/sbin/iptables', + '/usr/lib64/xtables/libxt_tcp.so|/lib/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so', +
'/usr/lib64/xtables/libxt_standard.so|/lib/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so', + '/usr/lib64/xtables/libipt_REJECT.so|/lib/xtables/libipt_REJECT.so|/usr/lib/powerpc64le-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/x86_64-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/xtables/libipt_REJECT.so|/usr/lib/s390x-linux-gnu/xtables/libipt_REJECT.so', + ], + 'opts': '--tcp-established', + 'flags': 'suid nouser samens', + 'feature' : 'tcp_half_closed' +} diff --git a/CRIU_code/test/zdtm/static/socket-tcp-reseted.hook b/CRIU_code/test/zdtm/static/socket-tcp-reseted.hook new file mode 100644 index 0000000..73d7da1 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-reseted.hook @@ -0,0 +1 @@ +socket-tcp-fin-wait1.hook \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp-reuseport.c b/CRIU_code/test/zdtm/static/socket-tcp-reuseport.c new file mode 100644 index 0000000..4cd1802 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-reuseport.c @@ -0,0 +1,171 @@ +#include "zdtmtst.h" + +#ifdef ZDTM_IPV6 +#define ZDTM_FAMILY AF_INET6 +#else +#define ZDTM_FAMILY AF_INET +#endif + +const char *test_doc = "Check a case when one port is shared between two listening sockets\n"; +const char *test_author = "Andrey Vagin +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for sockaddr_in and inet_ntoa() */ + +#define BUF_SIZE 4096 + /* Read exactly size bytes from fd into buf. Returns 0 on success, -1 as soon as read() returns an error or 0. */ +int read_data(int fd, unsigned char *buf, int size) +{ + int cur = 0; + int ret; + while (cur != size) { + ret = read(fd, buf + cur, size - cur); + if (ret <= 0) + return -1; + cur += ret; + } + + return 0; +} + /* Write exactly size bytes from buf to fd, retrying after short writes. Returns 0 on success, -1 as soon as write() returns an error or 0. */ +int write_data(int fd, const unsigned char *buf, int size) +{ + int cur = 0; + int ret; + + while (cur != size) { + ret = write(fd, buf + cur, size - cur); + if (ret <= 0) + return -1;
+ cur += ret; + } + + return 0; +} + /* Open two listeners on one port with SO_REUSEPORT, clear the option on the first one, and after test_waitsig() check that each listener reports the same option value as before and that connections to the port still carry data. */ +int main(int argc, char **argv) +{ + struct zdtm_tcp_opts opts = { .reuseaddr = false, + .reuseport = true, + .flags = SOCK_NONBLOCK}; + unsigned char buf[BUF_SIZE]; + int port = 8880, port2; + int fd, fd_s, fd_s2, clt, i; + socklen_t optlen; + int no = 0, val; + uint32_t crc; + + test_init(argc, argv); + + if ((fd_s = tcp_init_server_with_opts(ZDTM_FAMILY, &port, &opts)) < 0) { + pr_err("initializing server failed\n"); + return 1; + } + + port2 = port; + if ((fd_s2 = tcp_init_server_with_opts(ZDTM_FAMILY, &port2, &opts)) < 0) { + pr_err("initializing server failed\n"); + return 1; + } + if (port != port2) + return 1; + /* clear SO_REUSEPORT on the first listener only; the second one keeps it */ + if (setsockopt(fd_s, SOL_SOCKET, SO_REUSEPORT, &no, sizeof(int)) == -1 ) { + pr_perror("Unable to set SO_REUSEPORT"); + return -1; + } + + + clt = tcp_init_client(ZDTM_FAMILY, "localhost", port); + if (clt < 0) + return 1; + + /* + * parent is server of TCP connection + */ + fd = tcp_accept_server(fd_s); + if (fd < 0) + fd = tcp_accept_server(fd_s2); + if (fd < 0) { + pr_err("can't accept client connection\n"); + return 1; + } + + test_daemon(); + test_waitsig(); + + /* the first listener must still have the option cleared, the second one still set */ + optlen = sizeof(val); + if (getsockopt(fd_s, SOL_SOCKET, SO_REUSEPORT, &val, &optlen)) { + pr_perror("getsockopt"); + return 1; + } + if (val == 1) { + fail("SO_REUSEPORT is set for %d\n", fd_s); + return 1; + } + optlen = sizeof(val); + if (getsockopt(fd_s2, SOL_SOCKET, SO_REUSEPORT, &val, &optlen)) { + pr_perror("getsockopt"); + return 1; + } + if (val != 1) { + fail("SO_REUSEPORT is not set for %d\n", fd_s2); + return 1; + } + /* three rounds (i = 0..2): send a generated buffer over the accepted socket, verify it on the client, then reconnect */ + for (i = 0; ; i++) { + crc = 0; + datagen(buf, BUF_SIZE, &crc); + if (write_data(fd, buf, BUF_SIZE)) { + pr_perror("can't write"); + return 1; + } + + memset(buf, 0, BUF_SIZE); + if (read_data(clt, buf, BUF_SIZE)) { + pr_perror("read less then have to"); + return 1; + } + crc = 0; + if (datachk(buf, BUF_SIZE, &crc)) + return 2; + + close(clt); + close(fd); + + if (i == 2) + break; + + clt = tcp_init_client(ZDTM_FAMILY,
"localhost", port); + if (clt < 0) + return 1; + + /* + * parent is server of TCP connection + */ + fd = tcp_accept_server(fd_s2); + if (fd < 0) { + fd = tcp_accept_server(fd_s); + close(fd_s); /* closed whether or not the accept on it succeeded */ + } else { + close(fd_s2); + } + if (fd < 0) { + pr_err("can't accept client connection %d\n", i); + return 1; + } + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/socket-tcp-reuseport.desc b/CRIU_code/test/zdtm/static/socket-tcp-reuseport.desc new file mode 100644 index 0000000..6a406f6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-reuseport.desc @@ -0,0 +1 @@ +{'flavor': 'h ns uns', 'opts': '--tcp-established', 'flags': 'nouser samens'} diff --git a/CRIU_code/test/zdtm/static/socket-tcp-skip-in-flight.c b/CRIU_code/test/zdtm/static/socket-tcp-skip-in-flight.c new file mode 100644 index 0000000..99ca03a --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-skip-in-flight.c @@ -0,0 +1,89 @@ +#include +#include "zdtmtst.h" + +#ifdef ZDTM_IPV4V6 +#define ZDTM_FAMILY AF_INET +#elif defined(ZDTM_IPV6) +#define ZDTM_FAMILY AF_INET6 +#else +#define ZDTM_FAMILY AF_INET +#endif + +const char *test_doc = "Check that in-flight TCP connections are ignored\n"; +const char *test_author = "Radostin Stoyanov "; + +/* Description: + * Initialise server and client tcp sockets and verify that + * in-flight TCP connections are ignored.
+ */ + +#include +#include + +#include +#include +#include + + /* Leave a client connection un-accepted on a non-blocking listener across test_waitsig(), then check that the listener still accepts a freshly made connection. */ +int main(int argc, char **argv) +{ + struct pollfd poll_set[1]; + int port = 9990; + int fd_s, fd_c, fd; + int ret; + + test_init(argc, argv); + + fd_s = tcp_init_server(ZDTM_FAMILY, &port); + if (fd_s < 0) + return -1; + + if (set_nonblock(fd_s, true)) { + pr_perror("setting O_NONBLOCK failed"); + return -1; + } + /* connected but deliberately not accepted before the wait */ + fd_c = tcp_init_client(ZDTM_FAMILY, "localhost", port); + if (fd_c < 0) + return -1; + + test_daemon(); + test_waitsig(); + + if (close(fd_c)) { + fail("Unable to close a client socket"); + return 1; + } + /* the earlier connection may or may not still be queued on the listener; both outcomes are tolerated */ + fd = tcp_accept_server(fd_s); + if (fd >= 0) + close(fd); + + fd_c = tcp_init_client(ZDTM_FAMILY, "localhost", port); + if (fd_c < 0) { + fail("Unable to create a client socket"); + return -1; + } + /* the listener is non-blocking, so wait (no timeout) until it is readable before accepting */ + memset(poll_set, '\0', sizeof(poll_set)); + poll_set[0].fd = fd_s; + poll_set[0].events = POLLIN; + ret = poll(poll_set, 1, -1); + if (ret < 0) { + pr_perror("poll() failed"); + return 1; + } + + fd = tcp_accept_server(fd_s); + if (fd < 0) { + fail("Unable to accept a new connection"); + return 1; + } + close(fd); + + close(fd_c); + close(fd_s); + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/socket-tcp-skip-in-flight.desc b/CRIU_code/test/zdtm/static/socket-tcp-skip-in-flight.desc new file mode 100644 index 0000000..0ef6e6d --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-skip-in-flight.desc @@ -0,0 +1 @@ +{'opts': '--tcp-established --skip-in-flight'} diff --git a/CRIU_code/test/zdtm/static/socket-tcp-syn-sent.c b/CRIU_code/test/zdtm/static/socket-tcp-syn-sent.c new file mode 100644 index 0000000..cf4c3bb --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-syn-sent.c @@ -0,0 +1,143 @@ +#include "zdtmtst.h" + +#ifdef ZDTM_IPV4V6 +#define ZDTM_FAMILY AF_INET +#define ZDTM_SRV_FAMILY AF_INET6 +#elif defined(ZDTM_IPV6) +#define ZDTM_FAMILY AF_INET6 +#define ZDTM_SRV_FAMILY AF_INET6 +#else +#define ZDTM_FAMILY AF_INET +#define ZDTM_SRV_FAMILY AF_INET +#endif
+ +const char *test_doc = "Check unconnected tcp sockets\n"; +const char *test_author = "Andrey Vagin +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int port = 8880; + +union sockaddr_inet { + struct sockaddr addr; + struct sockaddr_in v4; + struct sockaddr_in6 v6; +}; + +int main(int argc, char **argv) +{ + int fd, fd_s, sock, sk; + union sockaddr_inet addr; + char cmd[4096]; + + test_init(argc, argv); + + sk = socket(ZDTM_FAMILY, SOCK_STREAM, 0); + if (sk < 0) { + pr_perror("socket"); + return 1; + } + + if ((fd_s = tcp_init_server(ZDTM_SRV_FAMILY, &port)) < 0) { + pr_err("initializing server failed\n"); + return 1; + } + + + if ((sock = socket(ZDTM_FAMILY, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP)) < 0) { + pr_perror("can't create socket"); + return -1; + } + + /* Construct the server address structure */ + memset(&addr, 0, sizeof(addr)); + if (ZDTM_FAMILY == AF_INET) { + addr.v4.sin_family = AF_INET; + inet_pton(AF_INET, "localhost", &addr.v4.sin_addr); + } else { + addr.v6.sin6_family = AF_INET6; + inet_pton(AF_INET6, "localhost", &addr.v6.sin6_addr); + } + if (bind(sock, (struct sockaddr *) &addr, sizeof(addr)) < 0) { + pr_perror("can't connect to server"); + return -1; + } + + snprintf(cmd, sizeof(cmd), "iptables -w -t filter --protocol tcp -A INPUT --dport %d -j DROP", port); + if (system(cmd)) + return -1; + + /* Construct the server address structure */ + memset(&addr, 0, sizeof(addr)); + if (ZDTM_FAMILY == AF_INET) { + addr.v4.sin_family = AF_INET; + addr.v4.sin_port = htons(port); + inet_pton(AF_INET, "localhost", &addr.v4.sin_addr); + } else { + addr.v6.sin6_family = AF_INET6; + addr.v6.sin6_port = htons(port); + inet_pton(AF_INET6, "localhost", &addr.v6.sin6_addr); + } + + errno = 0; + if (connect(sock, (struct sockaddr *) &addr, sizeof(addr)) == 0 || errno != EINPROGRESS) { + pr_perror("can't connect to server"); + return -1; + } + + test_daemon(); + test_waitsig(); + + snprintf(cmd, sizeof(cmd), 
"iptables -w -t filter --protocol tcp -D INPUT --dport %d -j DROP", port); + if (system(cmd)) + return -1; + + /* + * parent is server of TCP connection + */ + fd = tcp_accept_server(fd_s); + if (fd < 0) { + pr_err("can't accept client connection\n"); + return 1; + } + close(fd_s); + + fcntl(sock, F_SETFL, 0); + + char c = 5; + if (write(sock, &c, 1) != 1) { + fail("Unable to send data"); + return 1; + } + + c = 0; + if (read(fd, &c, 1) != 1 || c != 5) { + fail("Unable to recv data"); + return 1; + } + + c = 6; + if (write(fd, &c, 1) != 1) { + fail("Unable to send data"); + return 1; + } + + c = 0; + if (read(sock, &c, 1) != 1 || c != 6) { + fail("Unable to recv data"); + return 1; + } + + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/socket-tcp-syn-sent.desc b/CRIU_code/test/zdtm/static/socket-tcp-syn-sent.desc new file mode 100644 index 0000000..b9f3d5e --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-syn-sent.desc @@ -0,0 +1,9 @@ +{ 'deps': [ '/bin/sh', + '/sbin/iptables', + '/usr/lib64/xtables/libxt_tcp.so|/lib/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so', + '/usr/lib64/xtables/libxt_standard.so|/lib/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so', + ], + 'opts': '--tcp-established', + 'flags': 'suid nouser samens', + 'feature' : 'tcp_half_closed' +} diff --git a/CRIU_code/test/zdtm/static/socket-tcp-syn-sent.hook b/CRIU_code/test/zdtm/static/socket-tcp-syn-sent.hook new file mode 100644 index 0000000..73d7da1 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-syn-sent.hook @@ -0,0 +1 @@ +socket-tcp-fin-wait1.hook \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp-unconn.c 
b/CRIU_code/test/zdtm/static/socket-tcp-unconn.c new file mode 100644 index 0000000..e943b7c --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-unconn.c @@ -0,0 +1,122 @@ +#include "zdtmtst.h" + +#ifdef ZDTM_IPV4V6 +#define ZDTM_FAMILY AF_INET +#define ZDTM_SRV_FAMILY AF_INET6 +#elif defined(ZDTM_IPV6) +#define ZDTM_FAMILY AF_INET6 +#define ZDTM_SRV_FAMILY AF_INET6 +#else +#define ZDTM_FAMILY AF_INET +#define ZDTM_SRV_FAMILY AF_INET +#endif + +const char *test_doc = "Check unconnected tcp sockets\n"; +const char *test_author = "Andrey Vagin +#include +#include +#include +#include +#include +#include +#include +#include + +static int port = 8880; + /* One storage area large enough for either an IPv4 or an IPv6 socket address. */ +union sockaddr_inet { + struct sockaddr addr; + struct sockaddr_in v4; + struct sockaddr_in6 v6; +}; + /* Bind a TCP socket without connecting it, record its local address, and after test_waitsig() check that getsockname() still returns the same address and that the socket can connect to the local server. */ +int main(int argc, char **argv) +{ + int fd, fd_s, sock, sk; + union sockaddr_inet addr, src_addr; + socklen_t aux; + + test_init(argc, argv); + + sk = socket(ZDTM_FAMILY, SOCK_STREAM, 0); /* created and then left untouched for the rest of the test */ + if (sk < 0) { + pr_perror("socket"); + return 1; + } + + if ((fd_s = tcp_init_server(ZDTM_SRV_FAMILY, &port)) < 0) { + pr_err("initializing server failed\n"); + return 1; + } + + + if ((sock = socket(ZDTM_FAMILY, SOCK_STREAM, IPPROTO_TCP)) < 0) { + pr_perror("can't create socket"); + return -1; + } + + /* Construct the server address structure */ + memset(&addr, 0, sizeof(addr)); + if (ZDTM_FAMILY == AF_INET) { + addr.v4.sin_family = AF_INET; + inet_pton(AF_INET, "localhost", &addr.v4.sin_addr); /* NOTE(review): inet_pton() parses numeric addresses only and its result is unchecked, so the address presumably stays all-zero from the memset() - confirm intended */ + } else { + addr.v6.sin6_family = AF_INET6; + inet_pton(AF_INET6, "localhost", &addr.v6.sin6_addr); + } + if (bind(sock, (struct sockaddr *) &addr, sizeof(addr)) < 0) { + pr_perror("can't bind socket"); + return -1; + } + aux = sizeof(src_addr); + memset(&src_addr, 0, sizeof(src_addr)); + if (getsockname(sock, &src_addr.addr, &aux)) { /* remember the bound address for the comparison after the wait */ + pr_perror("getsockname"); + return 1; + } + + test_daemon(); + test_waitsig(); + /* aux still holds the length stored by the first getsockname() call */ + memset(&addr, 0, sizeof(addr)); + if (getsockname(sock, &addr.addr, &aux)) { + pr_perror("getsockname"); +
return 1; + } + if (memcmp(&addr, &src_addr, aux)) { + pr_err("A source address mismatch"); + return 1; + } + + /* Construct the server address structure */ + memset(&addr, 0, sizeof(addr)); + if (ZDTM_FAMILY == AF_INET) { + addr.v4.sin_family = AF_INET; + addr.v4.sin_port = htons(port); + inet_pton(AF_INET, "localhost", &addr.v4.sin_addr); + } else { + addr.v6.sin6_family = AF_INET6; + addr.v6.sin6_port = htons(port); + inet_pton(AF_INET6, "localhost", &addr.v6.sin6_addr); + } + if (connect(sock, (struct sockaddr *) &addr, sizeof(addr)) < 0) { + pr_perror("can't connect to server"); + return -1; + } + + /* + * parent is server of TCP connection + */ + fd = tcp_accept_server(fd_s); + if (fd < 0) { + pr_err("can't accept client connection\n"); + return 1; + } + close(fd_s); + + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/socket-tcp-unconn.desc b/CRIU_code/test/zdtm/static/socket-tcp-unconn.desc new file mode 100644 index 0000000..155e103 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp-unconn.desc @@ -0,0 +1 @@ +{'opts': '--tcp-established', 'flags': 'nouser samens', 'feature' : 'tcp_half_closed'} diff --git a/CRIU_code/test/zdtm/static/socket-tcp.c b/CRIU_code/test/zdtm/static/socket-tcp.c new file mode 100644 index 0000000..5158fe3 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp.c @@ -0,0 +1,219 @@ +#include "zdtmtst.h" + +#ifdef ZDTM_IPV4V6 +#define ZDTM_FAMILY AF_INET +#define ZDTM_SRV_FAMILY AF_INET6 +#elif defined(ZDTM_IPV6) +#define ZDTM_FAMILY AF_INET6 +#define ZDTM_SRV_FAMILY AF_INET6 +#else +#define ZDTM_FAMILY AF_INET +#define ZDTM_SRV_FAMILY AF_INET +#endif + +const char *test_doc = "Check, that a TCP connection can be restored\n"; +const char *test_author = "Andrey Vagin +#include +#include +#include +#include +#include +#include +#include +#include + +static int port = 8880; + +#define BUF_SIZE 4096 + /* Read exactly size bytes from fd into buf. Returns 0 on success, -1 as soon as read() returns an error or 0. */ +int read_data(int fd, unsigned char *buf, int size) +{ + int cur = 0; + int ret; + while (cur != size) { + ret =
read(fd, buf + cur, size - cur); + if (ret <= 0) + return -1; + cur += ret; + } + + return 0; +} + /* Write exactly size bytes from buf to fd, retrying after short writes. Returns 0 on success, -1 as soon as write() returns an error or 0. */ +int write_data(int fd, const unsigned char *buf, int size) +{ + int cur = 0; + int ret; + + while (cur != size) { + ret = write(fd, buf + cur, size - cur); + if (ret <= 0) + return -1; + cur += ret; + } + + return 0; +} + /* Parent acts as TCP server, forked child as client; they exchange BUF_SIZE-byte buffers produced by datagen() and verified by datachk(), and the parent finally checks that SO_REUSEADDR is still set on the accepted socket. */ +int main(int argc, char **argv) +{ + unsigned char buf[BUF_SIZE]; + int fd, fd_s; + pid_t extpid; + uint32_t crc; /* NOTE(review): never assigned before its address is passed to datachk()/datagen() - confirm those helpers do not depend on the incoming value */ + int pfd[2]; + int val; + socklen_t optlen; + +#ifdef ZDTM_CONNTRACK + if (unshare(CLONE_NEWNET)) { + pr_perror("unshare"); + return 1; + } + if (system("ip link set up dev lo")) + return 1; + if (system("iptables -w -A INPUT -i lo -p tcp -m state --state NEW,ESTABLISHED -j ACCEPT")) + return 1; + if (system("iptables -w -A INPUT -j DROP")) + return 1; +#endif + +#ifdef ZDTM_TCP_LOCAL + test_init(argc, argv); +#endif + + if (pipe(pfd)) { + pr_perror("pipe() failed"); + return 1; + } + + extpid = fork(); + if (extpid < 0) { + pr_perror("fork() failed"); + return 1; + } else if (extpid == 0) { /* child: client side */ +#ifndef ZDTM_TCP_LOCAL + test_ext_init(argc, argv); +#endif + /* the parent sends the server port through the pipe */ + close(pfd[1]); + if (read(pfd[0], &port, sizeof(port)) != sizeof(port)) { + pr_perror("Can't read port"); + return 1; + } + + fd = tcp_init_client(ZDTM_FAMILY, "localhost", port); + if (fd < 0) + return 1; + +#ifdef STREAM + while (1) { /* no exit on success: the parent ends this child with SIGKILL */ + if (read_data(fd, buf, BUF_SIZE)) { + pr_perror("read less then have to"); + return 1; + } + if (datachk(buf, BUF_SIZE, &crc)) + return 2; + + datagen(buf, BUF_SIZE, &crc); + if (write_data(fd, buf, BUF_SIZE)) { + pr_perror("can't write"); + return 1; + } + } +#else + if (read_data(fd, buf, BUF_SIZE)) { + pr_perror("read less then have to"); + return 1; + } + if (datachk(buf, BUF_SIZE, &crc)) + return 2; + + datagen(buf, BUF_SIZE, &crc); + if (write_data(fd, buf, BUF_SIZE)) { + pr_perror("can't write"); + return 1; + } +#endif + return 0; + } + +#ifndef ZDTM_TCP_LOCAL + test_init(argc, argv); +#endif + + if ((fd_s = tcp_init_server(ZDTM_SRV_FAMILY,
&port)) < 0) { + pr_err("initializing server failed\n"); + return 1; + } + + close(pfd[0]); + if (write(pfd[1], &port, sizeof(port)) != sizeof(port)) { + pr_perror("Can't send port"); + return 1; + } + close(pfd[1]); + + /* + * parent is server of TCP connection + */ + fd = tcp_accept_server(fd_s); + if (fd < 0) { + pr_err("can't accept client connection\n"); + return 1; + } + /* set SO_REUSEADDR on the accepted socket; its value is read back at the end of the test */ + val = 1; + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val))) { + pr_perror("setsockopt"); + return 1; + } + + test_daemon(); +#ifdef STREAM + while (test_go()) { + datagen(buf, BUF_SIZE, &crc); + if (write_data(fd, buf, BUF_SIZE)) { + pr_perror("can't write"); + return 1; + } + + if (read_data(fd, buf, BUF_SIZE)) { + pr_perror("read less then have to"); + return 1; + } + if (datachk(buf, BUF_SIZE, &crc)) + return 2; + } + kill(extpid, SIGKILL); /* the STREAM child loops forever, so it is stopped explicitly */ +#else + test_waitsig(); + + datagen(buf, BUF_SIZE, &crc); + if (write_data(fd, buf, BUF_SIZE)) { + pr_perror("can't write"); + return 1; + } + + if (read_data(fd, buf, BUF_SIZE)) { + pr_perror("read less then have to"); + return 1; + } + if (datachk(buf, BUF_SIZE, &crc)) + return 2; +#endif + optlen = sizeof(val); /* the option set before test_daemon() must still read back as 1 */ + if (getsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &val, &optlen)) { + pr_perror("getsockopt"); + return 1; + } + if (val != 1) { + fail("SO_REUSEADDR are not set for %d\n", fd); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/socket-tcp.desc b/CRIU_code/test/zdtm/static/socket-tcp.desc new file mode 100644 index 0000000..ca3268b --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp.desc @@ -0,0 +1 @@ +{'flavor': 'h', 'opts': '--tcp-established', 'flags': 'nouser samens'} diff --git a/CRIU_code/test/zdtm/static/socket-tcp4v6-close-wait.c b/CRIU_code/test/zdtm/static/socket-tcp4v6-close-wait.c new file mode 100644 index 0000000..20d7e78 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp4v6-close-wait.c @@ -0,0 +1 @@ +socket-tcp-close-wait.c \ No newline at end of file diff --git
a/CRIU_code/test/zdtm/static/socket-tcp4v6-close-wait.desc b/CRIU_code/test/zdtm/static/socket-tcp4v6-close-wait.desc new file mode 100644 index 0000000..c8cfe5e --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp4v6-close-wait.desc @@ -0,0 +1 @@ +socket-tcp6-close-wait.desc \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp4v6-closed.c b/CRIU_code/test/zdtm/static/socket-tcp4v6-closed.c new file mode 100644 index 0000000..b0c363c --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp4v6-closed.c @@ -0,0 +1 @@ +socket-tcp-closed.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp4v6-closed.desc b/CRIU_code/test/zdtm/static/socket-tcp4v6-closed.desc new file mode 100644 index 0000000..9644990 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp4v6-closed.desc @@ -0,0 +1 @@ +socket-tcp6-closed.desc \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp4v6-closing.c b/CRIU_code/test/zdtm/static/socket-tcp4v6-closing.c new file mode 100644 index 0000000..154ef8b --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp4v6-closing.c @@ -0,0 +1 @@ +socket-tcp-closing.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp4v6-closing.desc b/CRIU_code/test/zdtm/static/socket-tcp4v6-closing.desc new file mode 100644 index 0000000..6981d0a --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp4v6-closing.desc @@ -0,0 +1 @@ +socket-tcp6-closing.desc \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp4v6-fin-wait1.c b/CRIU_code/test/zdtm/static/socket-tcp4v6-fin-wait1.c new file mode 100644 index 0000000..2dede5f --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp4v6-fin-wait1.c @@ -0,0 +1 @@ +socket-tcp-fin-wait1.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp4v6-fin-wait1.desc b/CRIU_code/test/zdtm/static/socket-tcp4v6-fin-wait1.desc new file mode 100644 index 0000000..82ad86e --- 
/dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp4v6-fin-wait1.desc @@ -0,0 +1 @@ +socket-tcp6-fin-wait1.desc \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp4v6-fin-wait2.c b/CRIU_code/test/zdtm/static/socket-tcp4v6-fin-wait2.c new file mode 100644 index 0000000..a95fa07 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp4v6-fin-wait2.c @@ -0,0 +1 @@ +socket-tcp-fin-wait2.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp4v6-fin-wait2.desc b/CRIU_code/test/zdtm/static/socket-tcp4v6-fin-wait2.desc new file mode 100644 index 0000000..f393f96 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp4v6-fin-wait2.desc @@ -0,0 +1 @@ +socket-tcp6-fin-wait2.desc \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp4v6-last-ack.c b/CRIU_code/test/zdtm/static/socket-tcp4v6-last-ack.c new file mode 100644 index 0000000..1f0bedf --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp4v6-last-ack.c @@ -0,0 +1 @@ +socket-tcp-last-ack.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp4v6-last-ack.desc b/CRIU_code/test/zdtm/static/socket-tcp4v6-last-ack.desc new file mode 100644 index 0000000..4b39f99 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp4v6-last-ack.desc @@ -0,0 +1 @@ +socket-tcp6-last-ack.desc \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp4v6-local.c b/CRIU_code/test/zdtm/static/socket-tcp4v6-local.c new file mode 100644 index 0000000..8cb60dd --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp4v6-local.c @@ -0,0 +1 @@ +socket-tcp.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp4v6-local.desc b/CRIU_code/test/zdtm/static/socket-tcp4v6-local.desc new file mode 100644 index 0000000..ce647d0 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp4v6-local.desc @@ -0,0 +1 @@ +socket-tcp6-local.desc \ No newline at end of file diff --git 
a/CRIU_code/test/zdtm/static/socket-tcp4v6.c b/CRIU_code/test/zdtm/static/socket-tcp4v6.c new file mode 100644 index 0000000..8cb60dd --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp4v6.c @@ -0,0 +1 @@ +socket-tcp.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp4v6.desc b/CRIU_code/test/zdtm/static/socket-tcp4v6.desc new file mode 100644 index 0000000..094f5b8 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp4v6.desc @@ -0,0 +1 @@ +socket-tcp6.desc \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp6-close-wait.c b/CRIU_code/test/zdtm/static/socket-tcp6-close-wait.c new file mode 100644 index 0000000..20d7e78 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp6-close-wait.c @@ -0,0 +1 @@ +socket-tcp-close-wait.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp6-close-wait.desc b/CRIU_code/test/zdtm/static/socket-tcp6-close-wait.desc new file mode 100644 index 0000000..df1973c --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp6-close-wait.desc @@ -0,0 +1 @@ +socket-tcp-close-wait.desc \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp6-closed.c b/CRIU_code/test/zdtm/static/socket-tcp6-closed.c new file mode 100644 index 0000000..b0c363c --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp6-closed.c @@ -0,0 +1 @@ +socket-tcp-closed.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp6-closed.desc b/CRIU_code/test/zdtm/static/socket-tcp6-closed.desc new file mode 100644 index 0000000..a365149 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp6-closed.desc @@ -0,0 +1 @@ +socket-tcp-closed.desc \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp6-closing.c b/CRIU_code/test/zdtm/static/socket-tcp6-closing.c new file mode 100644 index 0000000..154ef8b --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp6-closing.c @@ -0,0 +1 @@ +socket-tcp-closing.c \ No 
newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp6-closing.desc b/CRIU_code/test/zdtm/static/socket-tcp6-closing.desc new file mode 100644 index 0000000..63046e2 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp6-closing.desc @@ -0,0 +1 @@ +socket-tcp-closing.desc \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp6-closing.hook b/CRIU_code/test/zdtm/static/socket-tcp6-closing.hook new file mode 100644 index 0000000..d87972b --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp6-closing.hook @@ -0,0 +1 @@ +socket-tcp-closing.hook \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp6-fin-wait1.c b/CRIU_code/test/zdtm/static/socket-tcp6-fin-wait1.c new file mode 100644 index 0000000..2dede5f --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp6-fin-wait1.c @@ -0,0 +1 @@ +socket-tcp-fin-wait1.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp6-fin-wait1.desc b/CRIU_code/test/zdtm/static/socket-tcp6-fin-wait1.desc new file mode 100644 index 0000000..fba0e07 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp6-fin-wait1.desc @@ -0,0 +1 @@ +socket-tcp-fin-wait1.desc \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp6-fin-wait2.c b/CRIU_code/test/zdtm/static/socket-tcp6-fin-wait2.c new file mode 100644 index 0000000..a95fa07 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp6-fin-wait2.c @@ -0,0 +1 @@ +socket-tcp-fin-wait2.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp6-fin-wait2.desc b/CRIU_code/test/zdtm/static/socket-tcp6-fin-wait2.desc new file mode 100644 index 0000000..fba0e07 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp6-fin-wait2.desc @@ -0,0 +1 @@ +socket-tcp-fin-wait1.desc \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp6-last-ack.c b/CRIU_code/test/zdtm/static/socket-tcp6-last-ack.c new file mode 100644 index 
0000000..1f0bedf --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp6-last-ack.c @@ -0,0 +1 @@ +socket-tcp-last-ack.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp6-last-ack.desc b/CRIU_code/test/zdtm/static/socket-tcp6-last-ack.desc new file mode 100644 index 0000000..caace98 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp6-last-ack.desc @@ -0,0 +1 @@ +socket-tcp-last-ack.desc \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp6-local.c b/CRIU_code/test/zdtm/static/socket-tcp6-local.c new file mode 100644 index 0000000..8cb60dd --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp6-local.c @@ -0,0 +1 @@ +socket-tcp.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp6-local.desc b/CRIU_code/test/zdtm/static/socket-tcp6-local.desc new file mode 100644 index 0000000..c915663 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp6-local.desc @@ -0,0 +1 @@ +socket-tcp-local.desc \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp6-unconn.c b/CRIU_code/test/zdtm/static/socket-tcp6-unconn.c new file mode 100644 index 0000000..59efc05 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp6-unconn.c @@ -0,0 +1 @@ +socket-tcp-unconn.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp6-unconn.desc b/CRIU_code/test/zdtm/static/socket-tcp6-unconn.desc new file mode 100644 index 0000000..426e48c --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp6-unconn.desc @@ -0,0 +1 @@ +socket-tcp-unconn.desc \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp6.c b/CRIU_code/test/zdtm/static/socket-tcp6.c new file mode 100644 index 0000000..8cb60dd --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp6.c @@ -0,0 +1 @@ +socket-tcp.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcp6.desc b/CRIU_code/test/zdtm/static/socket-tcp6.desc new file mode 100644 index 
0000000..ca3268b --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcp6.desc @@ -0,0 +1 @@ +{'flavor': 'h', 'opts': '--tcp-established', 'flags': 'nouser samens'} diff --git a/CRIU_code/test/zdtm/static/socket-tcpbuf-local.c b/CRIU_code/test/zdtm/static/socket-tcpbuf-local.c new file mode 100644 index 0000000..58c46c7 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcpbuf-local.c @@ -0,0 +1 @@ +socket-tcpbuf.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcpbuf-local.desc b/CRIU_code/test/zdtm/static/socket-tcpbuf-local.desc new file mode 100644 index 0000000..6a406f6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcpbuf-local.desc @@ -0,0 +1 @@ +{'flavor': 'h ns uns', 'opts': '--tcp-established', 'flags': 'nouser samens'} diff --git a/CRIU_code/test/zdtm/static/socket-tcpbuf.c b/CRIU_code/test/zdtm/static/socket-tcpbuf.c new file mode 100644 index 0000000..f61f6a5 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcpbuf.c @@ -0,0 +1,321 @@ +#include "zdtmtst.h" + +#ifdef ZDTM_IPV6 +#define ZDTM_FAMILY AF_INET6 +#else +#define ZDTM_FAMILY AF_INET +#endif + +const char *test_doc = "Check full tcp buffers with custom sizes\n"; +const char *test_author = "Andrey Vagin +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int port = 8880; + +#define BUF_SIZE 4096 +#define TCP_MAX_BUF (100 << 20) + +static void read_safe(int fd, void *buf, size_t size) +{ + if (read(fd, buf, size) != size) { + pr_perror("Unable to read from %d", fd); + exit(1); + } +} + +static void write_safe(int fd, void *buf, size_t size) +{ + if (write(fd, buf, size) != size) { + pr_perror("Unable to write to %d", fd); + exit(1); + } +} + +static int fill_sock_buf(int fd) +{ + int flags; + int size; + int ret; + + flags = fcntl(fd, F_GETFL, 0); + if (flags == -1) { + pr_perror("Can't get flags"); + return -1; + } + if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1) { + pr_perror("Can't set flags"); + 
return -1; + } + + size = 0; + while (1) { + char zdtm[] = "zdtm test packet"; + ret = write(fd, zdtm, sizeof(zdtm)); + if (ret == -1) { + if (errno == EAGAIN) + break; + pr_perror("write"); + return -1; + } + size += ret; + } + + if (fcntl(fd, F_SETFL, flags) == -1) { + pr_perror("Can't set flags"); + return -1; + } + + return size; +} + +static int clean_sk_buf(int fd, int limit) +{ + int size, ret; + char buf[BUF_SIZE]; + + size = 0; + while (1) { + ret = read(fd, buf, sizeof(buf)); + if (ret == -1) { + pr_perror("read"); + return -11; + } + + if (ret == 0) + break; + + size += ret; + + if (limit && size >= limit) + break; + } + + return size; +} + +int main(int argc, char **argv) +{ + int fd, fd_s, ctl_fd; + pid_t extpid; + int pfd[2]; + int sk_bsize; + int ret, snd, snd_size, rcv_size = 0, rcv_buf_size; + +#ifdef ZDTM_TCP_LOCAL + test_init(argc, argv); +#endif + + if (pipe(pfd)) { + pr_perror("pipe() failed"); + return 1; + } + + extpid = fork(); + if (extpid < 0) { + pr_perror("fork() failed"); + return 1; + } else if (extpid == 0) { + int size; + char c; + +#ifndef ZDTM_TCP_LOCAL + test_ext_init(argc, argv); +#endif + + close(pfd[1]); + read_safe(pfd[0], &port, sizeof(port)); + + fd = tcp_init_client(ZDTM_FAMILY, "127.0.0.1", port); + if (fd < 0) + return 1; + + ctl_fd = tcp_init_client(ZDTM_FAMILY, "127.0.0.1", port); + if (fd < 0) + return 1; + + snd_size = fill_sock_buf(fd); + if (snd_size <= 0) + return 1; + + write_safe(ctl_fd, &snd_size, sizeof(snd_size)); + + read_safe(ctl_fd, &rcv_buf_size, sizeof(rcv_buf_size)); + + while (1) { + /* heart beat */ + read_safe(ctl_fd, &ret, sizeof(ret)); + if (ret < 0) + break; + rcv_buf_size += ret; + + snd = fill_sock_buf(fd); + if (snd < 0) + return -1; + snd_size += snd; + + if (rcv_buf_size / 2) { + ret = clean_sk_buf(fd, rcv_buf_size / 2); + if (ret <= 0) + return 1; + } else + ret = 0; + + rcv_buf_size -= ret; + rcv_size += ret; + + write_safe(ctl_fd, &snd, sizeof(snd)); + } + + read_safe(ctl_fd, &ret, 
sizeof(ret)); + rcv_buf_size += ret; + + write_safe(ctl_fd, &snd_size, sizeof(snd_size)); + + if (read(ctl_fd, &c, 1) != 0) { + pr_perror("read"); + return 1; + } + + if (shutdown(fd, SHUT_WR) == -1) { + pr_perror("shutdown"); + return 1; + } + + size = clean_sk_buf(fd, 0); + if (size < 0) + return 1; + + if (size != rcv_buf_size) { + fail("the received buffer contains only %d bytes (%d)\n", size, rcv_buf_size); + } + + rcv_size += size; + + write_safe(ctl_fd, &rcv_size, sizeof(rcv_size)); + close(fd); + + return 0; + } + +#ifndef ZDTM_TCP_LOCAL + test_init(argc, argv); +#endif + + if ((fd_s = tcp_init_server(ZDTM_FAMILY, &port)) < 0) { + pr_err("initializing server failed\n"); + return 1; + } + + close(pfd[0]); + write_safe(pfd[1], &port, sizeof(port)); + close(pfd[1]); + + /* + * parent is server of TCP connection + */ + fd = tcp_accept_server(fd_s); + if (fd < 0) { + pr_err("can't accept client connection\n"); + return 1; + } + + ctl_fd = tcp_accept_server(fd_s); + if (ctl_fd < 0) { + pr_err("can't accept client connection\n"); + return 1; + } + + sk_bsize = TCP_MAX_BUF; + if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, + &sk_bsize, sizeof(sk_bsize)) == -1) { + pr_perror("Can't set snd buf"); + return 1; + } + + sk_bsize = TCP_MAX_BUF; + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, + &sk_bsize, sizeof(sk_bsize)) == -1) { + pr_perror("Can't set snd buf"); + return 1; + } + + snd_size = fill_sock_buf(fd); + if (snd_size <= 0) + return 1; + + read_safe(ctl_fd, &rcv_buf_size, sizeof(rcv_buf_size)); + + write_safe(ctl_fd, &snd_size, sizeof(snd_size)); + + test_daemon(); + + snd = 0; + while (test_go()) { + /* heart beat */ + if (rcv_buf_size / 2) { + ret = clean_sk_buf(fd, rcv_buf_size / 2); + if (ret <= 0) + return 1; + } else + ret = 0; + + rcv_size += ret; + rcv_buf_size -= ret; + + write_safe(ctl_fd, &snd, sizeof(snd)); + read_safe(ctl_fd, &ret, sizeof(ret)); + + rcv_buf_size += ret; + + snd = fill_sock_buf(fd); + if (snd < 0) + return -1; + snd_size += snd; + } + + ret = 
-1; + write_safe(ctl_fd, &ret, sizeof(ret)); + write_safe(ctl_fd, &snd, sizeof(ret)); + read_safe(ctl_fd, &snd, sizeof(snd)); + + if (shutdown(ctl_fd, SHUT_WR) == -1) { + pr_perror("shutdown"); + return 1; + } + + if (shutdown(fd, SHUT_WR) == -1) { + pr_perror("shutdown"); + return 1; + } + + ret = clean_sk_buf(fd, 0); + if (ret != rcv_buf_size) { + fail("the received buffer contains only %d bytes (%d)\n", ret, rcv_buf_size); + } + rcv_size += ret; + + if (snd != rcv_size) { + fail("The child sent %d bytes, but the parent received %d bytes\n", rcv_buf_size, rcv_size); + return 1; + } + + read_safe(ctl_fd, &ret, sizeof(ret)); + + if (ret != snd_size) { + fail("The parent sent %d bytes, but the child received %d bytes\n", snd_size, ret); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/socket-tcpbuf.desc b/CRIU_code/test/zdtm/static/socket-tcpbuf.desc new file mode 100644 index 0000000..ca3268b --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcpbuf.desc @@ -0,0 +1 @@ +{'flavor': 'h', 'opts': '--tcp-established', 'flags': 'nouser samens'} diff --git a/CRIU_code/test/zdtm/static/socket-tcpbuf6-local.c b/CRIU_code/test/zdtm/static/socket-tcpbuf6-local.c new file mode 100644 index 0000000..58c46c7 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcpbuf6-local.c @@ -0,0 +1 @@ +socket-tcpbuf.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcpbuf6-local.desc b/CRIU_code/test/zdtm/static/socket-tcpbuf6-local.desc new file mode 100644 index 0000000..3c9afd2 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcpbuf6-local.desc @@ -0,0 +1 @@ +socket-tcpbuf-local.desc \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket-tcpbuf6.c b/CRIU_code/test/zdtm/static/socket-tcpbuf6.c new file mode 100644 index 0000000..58c46c7 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcpbuf6.c @@ -0,0 +1 @@ +socket-tcpbuf.c \ No newline at end of file diff --git 
a/CRIU_code/test/zdtm/static/socket-tcpbuf6.desc b/CRIU_code/test/zdtm/static/socket-tcpbuf6.desc new file mode 100644 index 0000000..ca3268b --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket-tcpbuf6.desc @@ -0,0 +1 @@ +{'flavor': 'h', 'opts': '--tcp-established', 'flags': 'nouser samens'} diff --git a/CRIU_code/test/zdtm/static/socket6_udp.c b/CRIU_code/test/zdtm/static/socket6_udp.c new file mode 100644 index 0000000..b81a6af --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket6_udp.c @@ -0,0 +1,124 @@ +#include "zdtmtst.h" + +const char *test_doc = "Static test for IP6/UDP socket\n"; +const char *test_author = "Cyrill Gorcunov \n"; + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for sockaddr_in and inet_ntoa() */ +#include + +static int port = 8880; +static char buf[64]; + +#define MSG1 "msg1" +#define MSG2 "msg_2" + +int main(int argc, char **argv) +{ + int ret, sk1, sk2; + socklen_t len = sizeof(struct sockaddr_in6); + struct sockaddr_in6 addr1, addr2, addr; + + test_init(argc, argv); + + sk1 = socket(PF_INET6, SOCK_DGRAM, IPPROTO_UDP); + if (sk1 < 0) { + pr_perror("Can't create socket"); + return 1; + } + + memset(&addr1, 0, sizeof(addr1)); + addr1.sin6_family = AF_INET6; + addr1.sin6_port = htons(port); + inet_pton(AF_INET6, "::1", &addr1.sin6_addr); + + ret = bind(sk1, (struct sockaddr *)&addr1, len); + if (ret < 0) { + pr_perror("Can't bind socket"); + return 1; + } + + sk2 = socket(PF_INET6, SOCK_DGRAM, IPPROTO_UDP); + if (sk2 < 0) { + pr_perror("Can't create socket"); + return 1; + } + + memset(&addr2, 0, sizeof(addr2)); + addr2.sin6_family = AF_INET6; + addr2.sin6_port = htons(port+1); + inet_pton(AF_INET6, "::1", &addr2.sin6_addr); + + ret = bind(sk2, (struct sockaddr *)&addr2, len); + if (ret < 0) { + pr_perror("Can't bind socket"); + return 1; + } + + ret = connect(sk2, (struct sockaddr *)&addr1, len); + if (ret < 0) { + pr_perror("Can't connect"); + return 1; + } + + test_daemon(); + 
test_waitsig(); + + ret = sendto(sk1, MSG1, sizeof(MSG1), 0, + (struct sockaddr *)&addr2, len); + if (ret < 0) { + fail("Can't send"); + return 1; + } + + ret = send(sk2, MSG2, sizeof(MSG2), 0); + if (ret < 0) { + fail("Can't send C"); + return 1; + } + + ret = recvfrom(sk1, buf, sizeof(buf), 0, + (struct sockaddr *)&addr, &len); + if (ret <= 0) { + fail("Can't recv C"); + return 1; + } + + if (len != sizeof(struct sockaddr_in6) || memcmp(&addr2, &addr, len)) { + fail("Wrong peer C"); + return 1; + } + + if (ret != sizeof(MSG2) || memcmp(buf, MSG2, ret)) { + fail("Wrong message C"); + return 1; + } + + ret = recvfrom(sk2, buf, sizeof(buf), 0, + (struct sockaddr *)&addr, &len); + if (ret <= 0) { + fail("Can't recv"); + return 1; + } + + if (len != sizeof(struct sockaddr_in6) || memcmp(&addr1, &addr, len)) { + fail("Wrong peer"); + return 1; + } + + if (ret != sizeof(MSG1) || memcmp(buf, MSG1, ret)) { + fail("Wrong message"); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/socket_aio.c b/CRIU_code/test/zdtm/static/socket_aio.c new file mode 100644 index 0000000..b276adb --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket_aio.c @@ -0,0 +1,145 @@ +#include "zdtmtst.h" + +const char *test_doc = "static test for AIO\n"; +const char *test_author = "Andrew Vagin "; + +/* Description: + * Create two tcp socket, server send asynchronous request on + * read data and clietn write data after migration + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int port = 8880; + +#define BUF_SIZE 1024 + +int main(int argc, char **argv) +{ + char buf[BUF_SIZE]; + int fd, fd_s; + struct aiocb aiocb; + int status; + pid_t pid; + int ret, res; + const struct aiocb *aioary[1]; + task_waiter_t child_waiter; + + test_init(argc, argv); + + task_waiter_init(&child_waiter); + + if ((fd_s = tcp_init_server(AF_INET, &port)) < 0) { + pr_err("initializing server failed\n"); + return 1; + } + + pid = 
test_fork(); + if (pid < 0) { + pr_perror("fork failed"); + return 1; + } + + if (pid == 0) { + /* + * Chiled is client of TCP connection + */ + close(fd_s); + fd = tcp_init_client(AF_INET, "127.0.0.1", port); + if (fd < 0) + return 1; + + memset(&aiocb, 0, sizeof(struct aiocb)); + aiocb.aio_fildes = fd; + aiocb.aio_buf = buf; + aiocb.aio_nbytes = BUF_SIZE; + ret = aio_read(&aiocb); + if (ret < 0) { + pr_perror("aio_read failed"); + return 1; + } + + task_waiter_complete_current(&child_waiter); + + /* Wait for request completion */ + aioary[0] = &aiocb; + ret = aio_error(&aiocb); +#ifdef DEBUG + test_msg("."); +#endif + res = 0; +again: + if (aio_suspend(aioary, 1, NULL) < 0 && errno != EINTR) { + pr_perror("aio_suspend failed"); + res = 1; + } + + ret = aio_error(&aiocb); + if (!res && ret == EINPROGRESS) { +#ifdef DEBUG + test_msg("restart aio_suspend\n"); +#endif + goto again; + } + if (ret != 0) { + pr_err("Error at aio_error(): %s\n", strerror(ret)); + res = 1; + } + + if (aio_return(&aiocb) != BUF_SIZE) { + pr_perror("Error at aio_return()"); + res = 1; + } + + close(fd); + return res; + } + + /* + * parent is server of TCP connection + */ + fd = tcp_accept_server(fd_s); + close(fd_s); + if (fd < 0) { + pr_err("can't accept client connection\n"); + goto error; + } + + task_waiter_wait4(&child_waiter, pid); + + test_daemon(); + test_waitsig(); + + if (write(fd, buf, BUF_SIZE) < BUF_SIZE) { + pr_perror("can't write"); + goto error; + } + close(fd); + + + if (wait(&status) < 0) { + pr_perror("wait failed"); + goto error; + } + + if (WIFEXITED(status) && WEXITSTATUS(status) != 0) { + pr_err("child failed with exit code %d\n", WEXITSTATUS(status)); + return 1; + } + + pass(); + return 0; +error: + kill(pid, SIGKILL); + wait(&status); + return -1; +} diff --git a/CRIU_code/test/zdtm/static/socket_aio.desc b/CRIU_code/test/zdtm/static/socket_aio.desc new file mode 100644 index 0000000..756028b --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket_aio.desc @@ -0,0 +1 
@@ +{'opts': '--tcp-established', 'flags' : 'nouser'} diff --git a/CRIU_code/test/zdtm/static/socket_close_data.c b/CRIU_code/test/zdtm/static/socket_close_data.c new file mode 100644 index 0000000..de552ad --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket_close_data.c @@ -0,0 +1,43 @@ +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check one end of socketpair with data"; +const char *test_author = "Andrew Vagin 0 ? ret : 0] = 0; + if (ret != sizeof(MSG)) { + fail("%d: %s", ret, buf); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/socket_close_data01.c b/CRIU_code/test/zdtm/static/socket_close_data01.c new file mode 100644 index 0000000..df4b894 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket_close_data01.c @@ -0,0 +1,115 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check data of bound socket and possibility to connect"; +const char *test_author = "Kirill Tkhai 0 ? ret : 0] = 0; + if (ret != sizeof(MSG)) { + fail("%d: %s", ret, buf); + ret = 1; + goto unlink; + } + + /* Test2: check it's still possible to connect to the bound socket */ + if (fork() == 0) { + exit(client("(iter2)")); + } + + if (wait(&status) < 0) { + fail("wait failed"); + goto unlink; + } + + if (WEXITSTATUS(status) != 0) { + fail("connect failed"); + goto unlink; + } + + ret = 0; + pass(); +unlink: + unlink(filename); + return ret; +} diff --git a/CRIU_code/test/zdtm/static/socket_dgram_data.c b/CRIU_code/test/zdtm/static/socket_dgram_data.c new file mode 100644 index 0000000..2f635b6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket_dgram_data.c @@ -0,0 +1,81 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that data in dgram socket are restored correctly"; +const char *test_author = "Andrew Vagin 0 ? 
ret : 0] = 0; + if (ret != sizeof(MSG)) { + fail("%d: %s", ret, buf); + return 1; + } + + ret = read(srv, buf, sizeof(buf)); + if (ret != -1 || errno != EAGAIN) { + fail("unexpected data: %d", ret); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/socket_listen.c b/CRIU_code/test/zdtm/static/socket_listen.c new file mode 100644 index 0000000..d0acfd9 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket_listen.c @@ -0,0 +1,123 @@ +#include "zdtmtst.h" + +#ifdef ZDTM_IPV4V6 +#define ZDTM_FAMILY AF_INET +#define ZDTM_SRV_FAMILY AF_INET6 +#elif defined(ZDTM_IPV6) +#define ZDTM_FAMILY AF_INET6 +#define ZDTM_SRV_FAMILY AF_INET6 +#else +#define ZDTM_FAMILY AF_INET +#define ZDTM_SRV_FAMILY AF_INET +#endif + +const char *test_doc = "static test for listening socket\n"; +const char *test_author = "Stanislav Kinsbursky "; + +/* Description: + * Create two tcp socket, server send asynchronous request on + * read data and clietn write data after migration + */ + +#include +#include +#include +#include +#include +#include +#include + +static int port = 8880; + +#define BUF_SIZE 1024 + +static void sig_hand(int signo) {} + +int main(int argc, char **argv) +{ + unsigned char buf[BUF_SIZE]; + int fd, fd_s; + int status; + pid_t pid; + int res; + uint32_t crc; + struct sigaction sa = { + .sa_handler = sig_hand, + /* don't set SA_RESTART */ + }; + + test_init(argc, argv); + + if ((fd_s = tcp_init_server(ZDTM_SRV_FAMILY, &port)) < 0) { + pr_err("initializing server failed\n"); + return 1; + } + + test_daemon(); + test_waitsig(); + + sigemptyset(&sa.sa_mask); + if (sigaction(SIGCHLD, &sa, NULL)) + pr_perror("Can't set SIGCHLD handler"); + + pid = test_fork(); + if (pid < 0) { + pr_perror("fork failed"); + return 1; + } + + if (pid == 0) { + /* + * Chiled is client of TCP connection + */ + close(fd_s); + fd = tcp_init_client(ZDTM_FAMILY, "localhost", port); + if (fd < 0) + return 1; + + res = read(fd, buf, BUF_SIZE); + close(fd); + if (res != BUF_SIZE) 
{ + pr_perror("read less then have to: %d instead of %d", res, BUF_SIZE); + return -1; + } + if (datachk(buf, BUF_SIZE, &crc)) + return -2; + return 0; + } + + /* + * parent is server of TCP connection + */ + fd = tcp_accept_server(fd_s); + close(fd_s); + if (fd < 0) { + pr_err("can't accept client connection\n"); + goto error; + } + + datagen(buf, BUF_SIZE, &crc); + if (write(fd, buf, BUF_SIZE) < BUF_SIZE) { + pr_perror("can't write"); + goto error; + } + close(fd); + + + if (wait(&status) < 0) { + pr_perror("wait failed"); + goto error; + } + + if (WIFEXITED(status) && WEXITSTATUS(status) != 0) { + pr_err("child failed with exit code %d\n", WEXITSTATUS(status)); + return 1; + } + + pass(); + return 0; +error: + kill(pid, SIGKILL); + wait(&status); + return -1; +} diff --git a/CRIU_code/test/zdtm/static/socket_listen4v6.c b/CRIU_code/test/zdtm/static/socket_listen4v6.c new file mode 100644 index 0000000..d026b3b --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket_listen4v6.c @@ -0,0 +1 @@ +socket_listen.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket_listen6.c b/CRIU_code/test/zdtm/static/socket_listen6.c new file mode 100644 index 0000000..d026b3b --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket_listen6.c @@ -0,0 +1 @@ +socket_listen.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/socket_queues.c b/CRIU_code/test/zdtm/static/socket_queues.c new file mode 100644 index 0000000..375b449 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket_queues.c @@ -0,0 +1,108 @@ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +/* FIXME Need gram sockets tests */ + +const char *test_doc = "Test unix sockets queues (2 messages in queue)\n"; +const char *test_author = "Stanislav Kinsbursky \n"; + +#define SK_DATA_S1 "packet stream left" +#define SK_DATA_S2 "packet stream right" +#define SK_DATA_D1 "packet dgram left" +#define 
SK_DATA_D2 "packet dgram right" + +int main(int argc, char *argv[]) +{ + int ssk_pair_d[2]; + int ssk_pair_s[2]; + char buf_left[64], buf_right[64]; + + test_init(argc, argv); + + if (socketpair(AF_UNIX, SOCK_STREAM, 0, ssk_pair_s) == -1) { + fail("socketpair\n"); + exit(1); + } + + write(ssk_pair_s[0], SK_DATA_S1, sizeof(SK_DATA_S1)); + write(ssk_pair_s[0], SK_DATA_S2, sizeof(SK_DATA_S2)); + write(ssk_pair_s[1], SK_DATA_S2, sizeof(SK_DATA_S2)); + write(ssk_pair_s[1], SK_DATA_S1, sizeof(SK_DATA_S1)); + + if (socketpair(AF_UNIX, SOCK_DGRAM, 0, ssk_pair_d) == -1) { + fail("socketpair\n"); + exit(1); + } + + write(ssk_pair_d[0], SK_DATA_D1, sizeof(SK_DATA_D1)); + write(ssk_pair_d[0], SK_DATA_D2, sizeof(SK_DATA_D2)); + write(ssk_pair_d[1], SK_DATA_D2, sizeof(SK_DATA_D2)); + write(ssk_pair_d[1], SK_DATA_D1, sizeof(SK_DATA_D1)); + + test_daemon(); + test_waitsig(); + + read(ssk_pair_s[1], buf_left, strlen(SK_DATA_S1) + 1); + if (strcmp(buf_left, SK_DATA_S1)) { + fail("SK_DATA_S2: '%s\n", SK_DATA_S1); + exit(1); + } + read(ssk_pair_s[1], buf_right, strlen(SK_DATA_S2) + 1); + if (strcmp(buf_right, SK_DATA_S2)) { + fail("data corrupted\n"); + exit(1); + } + test_msg("stream1 : '%s' '%s'\n", buf_left, buf_right); + + read(ssk_pair_s[0], buf_left, strlen(SK_DATA_S2) + 1); + if (strcmp(buf_left, SK_DATA_S2)) { + fail("data corrupted\n"); + exit(1); + } + read(ssk_pair_s[0], buf_right, strlen(SK_DATA_S1) + 1); + if (strcmp(buf_right, SK_DATA_S1)) { + fail("data corrupted\n"); + exit(1); + } + test_msg("stream2 : '%s' '%s'\n", buf_left, buf_right); + + read(ssk_pair_d[1], buf_left, strlen(SK_DATA_D1) + 1); + if (strcmp(buf_left, SK_DATA_D1)) { + fail("data corrupted\n"); + exit(1); + } + read(ssk_pair_d[1], buf_right, strlen(SK_DATA_D2) + 1); + if (strcmp(buf_right, SK_DATA_D2)) { + fail("data corrupted\n"); + exit(1); + } + test_msg("dgram1 : '%s' '%s'\n", buf_left, buf_right); + + read(ssk_pair_d[0], buf_left, strlen(SK_DATA_D2) + 1); + if (strcmp(buf_left, SK_DATA_D2)) { + 
fail("data corrupted\n"); + exit(1); + } + read(ssk_pair_d[0], buf_right,strlen(SK_DATA_D1) + 1); + if (strcmp(buf_right, SK_DATA_D1)) { + fail("data corrupted\n"); + exit(1); + } + test_msg("dgram2 : '%s' '%s'\n", buf_left, buf_right); + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/socket_snd_addr.c b/CRIU_code/test/zdtm/static/socket_snd_addr.c new file mode 100644 index 0000000..7e5376e --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket_snd_addr.c @@ -0,0 +1,100 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that sender addresses are restored"; +const char *test_author = "Andrew Vagin 0 ? ret : 0] = 0; + if (ret != sizeof(MSG)) { + fail("%d: %s", ret, buf); + return 1; + } + if (hdr.msg_namelen > sizeof(addr.sun_family) + 1) + pr_perror("%d, %s", hdr.msg_namelen, addr.sun_path + 1); + if (memcmp(addr.sun_path, sk_names[i], sizeof(SK_NAME))) { + fail("A sender address is mismatch"); + return 1; + } + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/socket_snd_addr.desc b/CRIU_code/test/zdtm/static/socket_snd_addr.desc new file mode 100644 index 0000000..95c58b4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket_snd_addr.desc @@ -0,0 +1 @@ +{'flags': 'noauto'} diff --git a/CRIU_code/test/zdtm/static/socket_udp-broadcast.c b/CRIU_code/test/zdtm/static/socket_udp-broadcast.c new file mode 100644 index 0000000..a5fb554 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket_udp-broadcast.c @@ -0,0 +1,47 @@ +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "test checkpoint/restore of SO_BROADCAST\n"; +const char *test_author = "Radostin Stoyanov \n"; + +/* Description: + * Create UDP socket, set SO_BROADCAST and verify its value after restore. 
+ */ + +int main(int argc, char **argv) +{ + int sockfd; + int val; + socklen_t len = sizeof(val); + + test_init(argc, argv); + + sockfd = socket(AF_INET, SOCK_DGRAM, 0); + if (sockfd < 0) { + pr_perror("Can't create socket"); + return 1; + } + + if (setsockopt(sockfd, SOL_SOCKET, SO_BROADCAST, &(int){ 1 }, len)) { + pr_perror("setsockopt"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (getsockopt(sockfd, SOL_SOCKET, SO_BROADCAST, &val, &len)) { + pr_perror("getsockopt"); + return 1; + } + + if (len != sizeof(val) || val != 1) { + fail("SO_BROADCAST not set"); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/socket_udp-corked.c b/CRIU_code/test/zdtm/static/socket_udp-corked.c new file mode 100644 index 0000000..30cfac5 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket_udp-corked.c @@ -0,0 +1,76 @@ +#include "zdtmtst.h" + +const char *test_doc = "static test for UDP socket\n"; +const char *test_author = "Pavel Emelyanov \n"; + +/* Description: + * Create two tcp socket, server send asynchronous request on + * read data and clietn write data after migration + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for sockaddr_in and inet_ntoa() */ +#include +#include + +static int port = 8880; + +#define MSG1 "msg1" + +int main(int argc, char **argv) +{ + int ret, sk1; + socklen_t len = sizeof(struct sockaddr_in); + struct sockaddr_in addr1; + int opt; + + test_init(argc, argv); + + sk1 = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); + if (sk1 < 0) { + pr_perror("Can't create socket"); + return 1; + } + + memset(&addr1, 0, sizeof(addr1)); + addr1.sin_family = AF_INET; + addr1.sin_addr.s_addr = inet_addr("127.0.0.1"); + addr1.sin_port = htons(port); + + ret = bind(sk1, (struct sockaddr *)&addr1, len); + if (ret < 0) { + pr_perror("Can't bind socket"); + return 1; + } + ret = connect(sk1, (struct sockaddr *)&addr1, len); + if (ret < 0) { + pr_perror("Can't connect"); + 
return 1; + } + + opt = 1; + if (setsockopt(sk1, SOL_UDP, UDP_CORK, &opt, sizeof(opt))) { + pr_perror("Unable to set UDP_CORK"); + return 1; + } + + if (write(sk1, MSG1, sizeof(MSG1)) != sizeof(MSG1)) { + pr_perror("write"); + return 1; + } + + test_daemon(); + test_waitsig(); + + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/socket_udp-corked.desc b/CRIU_code/test/zdtm/static/socket_udp-corked.desc new file mode 100644 index 0000000..ded8987 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket_udp-corked.desc @@ -0,0 +1 @@ +{'flags': 'crfail'} diff --git a/CRIU_code/test/zdtm/static/socket_udp.c b/CRIU_code/test/zdtm/static/socket_udp.c new file mode 100644 index 0000000..36a9542 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket_udp.c @@ -0,0 +1,129 @@ +#include "zdtmtst.h" + +const char *test_doc = "static test for UDP socket\n"; +const char *test_author = "Pavel Emelyanov \n"; + +/* Description: + * Create two tcp socket, server send asynchronous request on + * read data and clietn write data after migration + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for sockaddr_in and inet_ntoa() */ +#include + +static int port = 8880; +static char buf[8]; + +#define MSG1 "msg1" +#define MSG2 "msg_2" + +int main(int argc, char **argv) +{ + int ret, sk1, sk2; + socklen_t len = sizeof(struct sockaddr_in); + struct sockaddr_in addr1, addr2, addr; + + test_init(argc, argv); + + sk1 = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); + if (sk1 < 0) { + pr_perror("Can't create socket"); + return 1; + } + + memset(&addr1, 0, sizeof(addr1)); + addr1.sin_family = AF_INET; + addr1.sin_addr.s_addr = inet_addr("127.0.0.1"); + addr1.sin_port = htons(port); + + ret = bind(sk1, (struct sockaddr *)&addr1, len); + if (ret < 0) { + pr_perror("Can't bind socket"); + return 1; + } + + sk2 = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); + if (sk2 < 0) { + pr_perror("Can't create socket"); + return 1; + } + + 
memset(&addr2, 0, sizeof(addr1)); + addr2.sin_family = AF_INET; + addr2.sin_addr.s_addr = inet_addr("127.0.0.1"); + addr2.sin_port = htons(port + 1); + + ret = bind(sk2, (struct sockaddr *)&addr2, len); + if (ret < 0) { + pr_perror("Can't bind socket"); + return 1; + } + + ret = connect(sk2, (struct sockaddr *)&addr1, len); + if (ret < 0) { + pr_perror("Can't connect"); + return 1; + } + + test_daemon(); + test_waitsig(); + + ret = sendto(sk1, MSG1, sizeof(MSG1), 0, + (struct sockaddr *)&addr2, len); + if (ret < 0) { + fail("Can't send"); + return 1; + } + + ret = send(sk2, MSG2, sizeof(MSG2), 0); + if (ret < 0) { + fail("Can't send C"); + return 1; + } + + ret = recvfrom(sk1, buf, sizeof(buf), 0, + (struct sockaddr *)&addr, &len); + if (ret <= 0) { + fail("Can't recv C"); + return 1; + } + + if (len != sizeof(struct sockaddr_in) || memcmp(&addr2, &addr, len)) { + fail("Wrong peer C"); + return 1; + } + + if (ret != sizeof(MSG2) || memcmp(buf, MSG2, ret)) { + fail("Wrong message C"); + return 1; + } + + ret = recvfrom(sk2, buf, sizeof(buf), 0, + (struct sockaddr *)&addr, &len); + if (ret <= 0) { + fail("Can't recv"); + return 1; + } + + if (len != sizeof(struct sockaddr_in) || memcmp(&addr1, &addr, len)) { + fail("Wrong peer"); + return 1; + } + + if (ret != sizeof(MSG1) || memcmp(buf, MSG1, ret)) { + fail("Wrong message"); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/socket_udp_shutdown.c b/CRIU_code/test/zdtm/static/socket_udp_shutdown.c new file mode 100644 index 0000000..ae148c5 --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket_udp_shutdown.c @@ -0,0 +1,128 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for sockaddr_in and inet_ntoa() */ +#include + +#include "zdtmtst.h" + +const char *test_doc = "static test for UDP shutdown'ed socket"; +const char *test_author = "Cyrill Gorcunov "; + +static int port = 8881; + +#define MSG1 "msg1" + +int main(int argc, char 
**argv) +{ + socklen_t len = sizeof(struct sockaddr_in); + struct sockaddr_in addr1, addr2, addr; + int ret, sk1, sk2; + char buf[512]; + + test_init(argc, argv); + + sk1 = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); + sk2 = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); + if (sk1 < 0 || sk2 < 0) { + pr_err("Can't create socket"); + exit(1); + return 1; + } + + memset(&addr1, 0, sizeof(addr1)); + memset(&addr2, 0, sizeof(addr1)); + + addr1.sin_family = AF_INET; + addr1.sin_addr.s_addr = inet_addr("127.0.0.10"); + addr1.sin_port = htons(port); + + addr2.sin_family = AF_INET; + addr2.sin_addr.s_addr = inet_addr("127.0.0.10"); + addr2.sin_port = htons(port + 1); + + if (bind(sk1, (struct sockaddr *)&addr1, len) < 0 || + bind(sk2, (struct sockaddr *)&addr2, len) < 0) { + pr_err("Can't bind socket"); + return 1; + } + + if (connect(sk1, (struct sockaddr *)&addr2, len) || + connect(sk2, (struct sockaddr *)&addr1, len)) { + pr_err("Can't connect"); + return 1; + } + + if (shutdown(sk1, SHUT_WR) || + shutdown(sk2, SHUT_RD)) { + pr_err("Can't shutdown\n"); + return 1; + } + + ret = sendto(sk2, MSG1, sizeof(MSG1), 0, + (struct sockaddr *)&addr1, len); + if (ret < 0) { + pr_perror("Can't send"); + return 1; + } + + ret = recvfrom(sk1, buf, sizeof(buf), 0, + (struct sockaddr *)&addr, &len); + if (ret <= 0) { + pr_err("Can't receive data"); + return 1; + } + + if (len != sizeof(struct sockaddr_in) || memcmp(&addr2, &addr, len)) { + pr_err("Data received from wrong peer"); + return 1; + } + + if (ret != sizeof(MSG1) || memcmp(buf, MSG1, ret)) { + pr_err("Wrong message received"); + return 1; + } + + test_daemon(); + test_waitsig(); + + ret = sendto(sk2, MSG1, sizeof(MSG1), 0, + (struct sockaddr *)&addr1, len); + if (ret < 0) { + pr_perror("Can't send"); + return 1; + } + + ret = recvfrom(sk1, buf, sizeof(buf), 0, + (struct sockaddr *)&addr, &len); + if (ret <= 0) { + pr_err("Can't receive data"); + return 1; + } + + if (len != sizeof(struct sockaddr_in) || memcmp(&addr2, &addr, len)) { 
+ pr_err("Data received from wrong peer"); + return 1; + } + + if (ret != sizeof(MSG1) || memcmp(buf, MSG1, ret)) { + pr_err("Wrong message received"); + return 1; + } + + ret = sendto(sk1, MSG1, sizeof(MSG1), 0, + (struct sockaddr *)&addr2, len); + if (ret >= 0) { + fail("Sent to write-shutdown'ed socket"); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/socket_udplite.c b/CRIU_code/test/zdtm/static/socket_udplite.c new file mode 100644 index 0000000..229005a --- /dev/null +++ b/CRIU_code/test/zdtm/static/socket_udplite.c @@ -0,0 +1,185 @@ +#include "zdtmtst.h" + +const char *test_doc = "static test for UDP socket\n"; +const char *test_author = "Pavel Emelyanov \n"; + +/* Description: + * Create two tcp socket, server send asynchronous request on + * read data and clietn write data after migration + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for sockaddr_in and inet_ntoa() */ +#include + +static int port = 8890; +static char buf[8]; + +#define MSG1 "msg1" +#define MSG2 "msg_2" + +int main(int argc, char **argv) +{ + int ret, sk1, sk2, sk3, sk4; + socklen_t len = sizeof(struct sockaddr_in); + struct sockaddr_in addr1, addr2, addr3, addr4, addr; + + test_init(argc, argv); + + sk1 = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDPLITE); + if (sk1 < 0) { + pr_perror("Can't create socket"); + return 1; + } + + memset(&addr1, 0, sizeof(addr1)); + addr1.sin_family = AF_INET; + addr1.sin_addr.s_addr = inet_addr("127.0.0.1"); + addr1.sin_port = htons(port); + + ret = bind(sk1, (struct sockaddr *)&addr1, len); + if (ret < 0) { + pr_perror("Can't bind socket"); + return 1; + } + + sk2 = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDPLITE); + if (sk2 < 0) { + pr_perror("Can't create socket"); + return 1; + } + + memset(&addr2, 0, sizeof(addr1)); + addr2.sin_family = AF_INET; + addr2.sin_addr.s_addr = inet_addr("127.0.0.1"); + addr2.sin_port = htons(port + 1); + + ret = bind(sk2, (struct 
sockaddr *)&addr2, len); + if (ret < 0) { + pr_perror("Can't bind socket"); + return 1; + } + + ret = connect(sk2, (struct sockaddr *)&addr1, len); + if (ret < 0) { + pr_perror("Can't connect"); + return 1; + } + + sk3 = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDPLITE); + if (sk3 < 0) { + pr_perror("Can't create socket"); + return 1; + } + + memset(&addr3, 0, sizeof(addr3)); + addr3.sin_family = AF_INET; + addr3.sin_addr.s_addr = inet_addr("127.0.0.1"); + addr3.sin_port = htons(port + 2); + + ret = bind(sk3, (struct sockaddr *)&addr3, len); + if (ret < 0) { + pr_perror("Can't bind socket"); + return 1; + } + + sk4 = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDPLITE); + if (sk4 < 0) { + pr_perror("Can't create socket"); + return 1; + } + + memset(&addr4, 0, sizeof(addr4)); + addr4.sin_family = AF_INET; + addr4.sin_addr.s_addr = inet_addr("0.0.0.0"); + addr4.sin_port = htons(0); + + ret = bind(sk4, (struct sockaddr *)&addr4, len); + if (ret < 0) { + pr_perror("Can't bind socket"); + return 1; + } + + ret = connect(sk4, (struct sockaddr *)&addr3, len); + if (ret < 0) { + pr_perror("Can't connect"); + return 1; + } + + ret = connect(sk3, (struct sockaddr *)&addr4, len); + if (ret < 0) { + pr_perror("Can't connect"); + return 1; + } + + if (shutdown(sk4, SHUT_RDWR)) { + pr_perror("Can't shutdown socket"); + return 1; + } + + if (shutdown(sk3, SHUT_RDWR)) { + pr_perror("Can't shutdown socket"); + return 1; + } + + test_daemon(); + test_waitsig(); + + ret = sendto(sk1, MSG1, sizeof(MSG1), 0, + (struct sockaddr *)&addr2, len); + if (ret < 0) { + fail("Can't send"); + return 1; + } + + ret = send(sk2, MSG2, sizeof(MSG2), 0); + if (ret < 0) { + fail("Can't send C"); + return 1; + } + + ret = recvfrom(sk1, buf, sizeof(buf), 0, + (struct sockaddr *)&addr, &len); + if (ret <= 0) { + fail("Can't recv C"); + return 1; + } + + if (len != sizeof(struct sockaddr_in) || memcmp(&addr2, &addr, len)) { + fail("Wrong peer C"); + return 1; + } + + if (ret != sizeof(MSG2) || memcmp(buf, MSG2, ret)) 
{ + fail("Wrong message C"); + return 1; + } + + ret = recvfrom(sk2, buf, sizeof(buf), 0, + (struct sockaddr *)&addr, &len); + if (ret <= 0) { + fail("Can't recv"); + return 1; + } + + if (len != sizeof(struct sockaddr_in) || memcmp(&addr1, &addr, len)) { + fail("Wrong peer"); + return 1; + } + + if (ret != sizeof(MSG1) || memcmp(buf, MSG1, ret)) { + fail("Wrong message"); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/sockets00.c b/CRIU_code/test/zdtm/static/sockets00.c new file mode 100644 index 0000000..41c64c7 --- /dev/null +++ b/CRIU_code/test/zdtm/static/sockets00.c @@ -0,0 +1,164 @@ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test unix stream sockets\n"; +const char *test_author = "Cyrill Gorcunov = sizeof(addr.sun_path)) + return 1; + memcpy(addr.sun_path, path, addrlen); + addrlen += sizeof(addr.sun_family); + + ssk_icon[0] = socket(AF_UNIX, SOCK_STREAM, 0); + ssk_icon[1] = socket(AF_UNIX, SOCK_STREAM, 0); + ssk_icon[2] = socket(AF_UNIX, SOCK_STREAM, 0); + if (ssk_icon[0] < 0 || ssk_icon[1] < 0 || ssk_icon[2] < 0) { + fail("socket\n"); + exit(1); + } + + ret = bind(ssk_icon[0], (struct sockaddr *) &addr, addrlen); + if (ret) { + fail("bind\n"); + exit(1); + } + + ret = chmod(path, TEST_MODE); + if (ret) { + pr_perror("chmod"); + exit(1); + } + + ret = chown(path, uid, gid); + if (ret) { + pr_perror("chown"); + exit(1); + } + + ret = listen(ssk_icon[0], 16); + if (ret) { + fail("bind\n"); + exit(1); + } + + ret = connect(ssk_icon[2], (struct sockaddr *) &addr, addrlen); + if (ret) { + fail("connect\n"); + exit(1); + } + + ssk_icon[3] = accept(ssk_icon[0], NULL, NULL); + if (ssk_icon[3] < 0) { + fail("accept"); + exit(1); + } + + ret = connect(ssk_icon[1], (struct sockaddr *) &addr, addrlen); + if (ret) { + fail("connect\n"); + exit(1); + } + + ret = stat(path, &st_b); + if (ret) { + 
fail("stat"); + exit(1); + } + + test_daemon(); + test_waitsig(); + + ret = stat(path, &st_a); + if (ret) { + fail("stat"); + exit(1); + } + + if (st_b.st_mode != st_a.st_mode) { + fail("The file permissions for %s were changed %o %o\n", + path, st_b.st_mode, st_a.st_mode); + exit(1); + } + + if (st_b.st_uid != uid || st_b.st_gid != gid) { + fail("Owner user or group for %s corrupted, uid=%d, gid=%d", + path, st_b.st_uid, st_b.st_gid); + exit(1); + } + + ret = accept(ssk_icon[0], NULL, NULL); + if (ret < 0) { + fail("accept\n"); + exit(1); + } + + memset(buf, 0, sizeof(buf)); + write(ssk_icon[1], SK_DATA, sizeof(SK_DATA)); + read(ret, &buf, sizeof(buf)); + if (strcmp(buf, SK_DATA)) { + fail("data corrupted\n"); + exit(1); + } + test_msg("stream1 : '%s'\n", buf); + + memset(buf, 0, sizeof(buf)); + write(ssk_icon[2], SK_DATA, sizeof(SK_DATA)); + read(ssk_icon[3], &buf, sizeof(buf)); + if (strcmp(buf, SK_DATA)) { + fail("data corrupted\n"); + exit(1); + } + test_msg("stream2 : '%s'\n", buf); + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/sockets00.desc b/CRIU_code/test/zdtm/static/sockets00.desc new file mode 100644 index 0000000..2eac7e6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/sockets00.desc @@ -0,0 +1 @@ +{'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/sockets01.c b/CRIU_code/test/zdtm/static/sockets01.c new file mode 100644 index 0000000..1265053 --- /dev/null +++ b/CRIU_code/test/zdtm/static/sockets01.c @@ -0,0 +1,148 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test unix sockets shutdown"; +const char *test_author = "Pavel Emelyanov "; + +#define fin(msg) do { pr_perror(msg); exit(1); } while (0) +#define ffin(msg) do { fail(msg); exit(1); } while (0) + +#define TEST_MSG "test-message" +static char buf[sizeof(TEST_MSG)]; + +int main(int argc, char *argv[]) +{ + int spu[2], spb[2], dpu[2], dpb[2], 
dpd[2]; + int ret; + + test_init(argc, argv); + + signal(SIGPIPE, SIG_IGN); + + /* spu -- stream pair, unidirectional shutdown */ + if (socketpair(PF_UNIX, SOCK_STREAM, 0, spu) < 0) + fin("no stream pair 1"); + + if (shutdown(spu[0], SHUT_RD) < 0) + fin("no stream shutdown 1"); + + /* spb -- stream pair, bidirectional shutdown */ + if (socketpair(PF_UNIX, SOCK_STREAM, 0, spb) < 0) + fin("no stream pair 2"); + + if (shutdown(spb[0], SHUT_RDWR) < 0) + fin("no stream shutdown 2"); + + /* dpu -- dgram pair, one end read shutdown */ + if (socketpair(PF_UNIX, SOCK_DGRAM, 0, dpu) < 0) + fin("no dgram pair 1"); + + if (shutdown(dpu[0], SHUT_RD) < 0) + fin("no dgram shutdown 1"); + + /* dpb -- dgram pair, one end read-write shutdown */ + if (socketpair(PF_UNIX, SOCK_DGRAM, 0, dpb) < 0) + fin("no dgram pair 2"); + + if (shutdown(dpb[0], SHUT_RDWR) < 0) + fin("no dgram shutdown 2"); + + /* dpd -- dgram pair, one end write shutdown with data */ + if (socketpair(PF_UNIX, SOCK_DGRAM, 0, dpd) < 0) + fin("no dgram pair 3"); + + if (write(dpd[0], TEST_MSG, sizeof(TEST_MSG)) < 0) + fin("no dgram write"); + + if (shutdown(dpd[0], SHUT_WR) < 0) + fin("no dgram shutdown 3"); + + test_daemon(); + test_waitsig(); + + /* + * spu -- check that one direction is blocked and + * the other one is not + */ + + ret = write(spu[0], TEST_MSG, sizeof(TEST_MSG)); + if (ret < 0) + ffin("SU shutdown broken 1"); + + ret = read(spu[1], buf, sizeof(buf)); + if (ret < 0) + ffin("SU shutdown broken 2"); + + ret = write(spu[1], TEST_MSG, sizeof(TEST_MSG)); + if (ret >= 0) + ffin("SU shutdown broken 3"); + + /* + * spb -- check that both ends are off + */ + + ret = write(spb[0], TEST_MSG, sizeof(TEST_MSG)); + if (ret >= 0) + ffin("SB shutdown broken 1"); + + ret = write(spb[1], TEST_MSG, sizeof(TEST_MSG)); + if (ret >= 0) + ffin("SB shutdown broken 2"); + + /* + * dpu -- check that one direction works, and + * the other does not + */ + + ret = write(dpu[0], TEST_MSG, sizeof(TEST_MSG)); + if (ret < 0) + 
ffin("DU shutdown broken 1"); + + ret = read(dpu[1], buf, sizeof(buf)); + if (ret < 0) + ffin("DU shutdown broken 2"); + + ret = write(dpu[1], TEST_MSG, sizeof(TEST_MSG)); + if (ret >= 0) + ffin("DU shutdown broken 3"); + + /* + * dpb -- check that both ends are read + */ + + ret = write(dpb[0], TEST_MSG, sizeof(TEST_MSG)); + if (ret >= 0) + ffin("DB shutdown broken 1"); + + ret = write(dpb[1], TEST_MSG, sizeof(TEST_MSG)); + if (ret >= 0) + ffin("DB shutdown broken 2"); + + /* + * dpd -- check that data is in there, but can't + * feed more + */ + + ret = read(dpd[1], buf, sizeof(buf)); + if (ret < 0) + ffin("DD shutdown nodata"); + + ret = write(dpd[0], TEST_MSG, sizeof(buf)); + if (ret >= 0) + ffin("DB shutdown broken"); + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/sockets02.c b/CRIU_code/test/zdtm/static/sockets02.c new file mode 100644 index 0000000..ed4afbb --- /dev/null +++ b/CRIU_code/test/zdtm/static/sockets02.c @@ -0,0 +1,65 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test semi-closed unix stream connection\n"; +const char *test_author = "Pavel Emelyanov \n"; + +int main(int argc, char *argv[]) +{ + int ssk_pair[2], ret; + char aux, data; + + test_init(argc, argv); + + data = (char)lrand48(); + + if (socketpair(AF_UNIX, SOCK_STREAM, 0, ssk_pair) == -1) { + fail("socketpair\n"); + exit(1); + } + + if (write(ssk_pair[1], &data, sizeof(data)) != sizeof(data)) { + fail("write\n"); + exit(1); + } + + close(ssk_pair[1]); + + test_daemon(); + test_waitsig(); + + ret = read(ssk_pair[0], &aux, sizeof(aux)); + if (ret != sizeof(data) && aux != data) { + fail("Data loss (write %d, read %d)", data, aux); + return 1; + } + + errno = 0; + ret = read(ssk_pair[0], &aux, sizeof(aux)); + if (ret != 0 || errno != 0) { + fail("Opened end in wrong state (%d/%d)", ret, errno); + return 0; + } + + errno = 0; + ret = 
read(ssk_pair[1], &aux, sizeof(aux)); + if (ret != -1 || errno != EBADF) { + fail("Closed end in wrong state (%d/%d)", ret, errno); + return 0; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/sockets03.c b/CRIU_code/test/zdtm/static/sockets03.c new file mode 100644 index 0000000..e4c647d --- /dev/null +++ b/CRIU_code/test/zdtm/static/sockets03.c @@ -0,0 +1,121 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test unix stream sockets with mismatch in shutdown state\n"; +const char *test_author = "Andrey Ryabinin "; + +#define SK_DATA "packet" + +char *filename; +TEST_OPTION(filename, string, "socket file name", 1); + +int main(int argc, char *argv[]) +{ + int sk[3]; + struct sockaddr_un addr; + unsigned int addrlen; + char path[PATH_MAX]; + char buf[64]; + char *cwd; + int ret; + + test_init(argc, argv); + + signal(SIGPIPE, SIG_IGN); + + cwd = get_current_dir_name(); + if (!cwd) { + fail("getcwd\n"); + exit(1); + } + + snprintf(path, sizeof(path), "%s/%s", cwd, filename); + unlink(path); + + addr.sun_family = AF_UNIX; + addrlen = strlen(path); + if (addrlen >= sizeof(addr.sun_path)) + return 1; + memcpy(addr.sun_path, path, addrlen); + addrlen += sizeof(addr.sun_family); + + sk[0] = socket(AF_UNIX, SOCK_STREAM, 0); + sk[1] = socket(AF_UNIX, SOCK_STREAM, 0); + if (sk[0] < 0 || sk[1] < 0) { + fail("socket\n"); + exit(1); + } + + ret = bind(sk[0], (struct sockaddr *) &addr, addrlen); + if (ret) { + fail("bind\n"); + exit(1); + } + + ret = listen(sk[0], 16); + if (ret) { + fail("listen\n"); + exit(1); + } + + ret = shutdown(sk[1], SHUT_RD); + if (ret) { + fail("shutdown\n"); + exit(1); + } + + ret = connect(sk[1], (struct sockaddr *) &addr, addrlen); + if (ret) { + fail("connect\n"); + exit(1); + } + + sk[2] = accept(sk[0], NULL, NULL); + if (sk[2] < 0) { + fail("accept"); + exit(1); + } + + test_daemon(); + 
test_waitsig(); + + if (write(sk[1], SK_DATA, sizeof(SK_DATA)) < 0) { + fail("write\n"); + exit(1); + } + + if (read(sk[2], &buf, sizeof(buf)) < 0) { + fail("read\n"); + exit(1); + } + + if (strncmp(buf, SK_DATA, sizeof(SK_DATA))) { + fail("data corrupted\n"); + exit(1); + } + + if (write(sk[2], SK_DATA, sizeof(SK_DATA)) >= 0) { + fail("successful write to shutdown receiver\n"); + exit(1); + } + + close(sk[0]); + close(sk[1]); + close(sk[2]); + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/sockets03.desc b/CRIU_code/test/zdtm/static/sockets03.desc new file mode 100644 index 0000000..2eac7e6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/sockets03.desc @@ -0,0 +1 @@ +{'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/sockets_dgram.c b/CRIU_code/test/zdtm/static/sockets_dgram.c new file mode 100644 index 0000000..f135a3b --- /dev/null +++ b/CRIU_code/test/zdtm/static/sockets_dgram.c @@ -0,0 +1,210 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test unix dgram sockets\n"; +const char *test_author = "Cyrill Gorcunov = sizeof(name_bound.sun_path)) { + fail("too long path"); + exit(1); + } + + name_bound.sun_family = AF_UNIX; + strncpy(name_bound.sun_path, path, sizeof(name_bound.sun_path)); + + snprintf(path, sizeof(path), "%s/%s.conn", dirname, filename); + unlink(path); + if (strlen(path) >= sizeof(name_conn.sun_path)) { + fail("too long path"); + exit(1); + } + + name_conn.sun_family = AF_UNIX; + strncpy(name_conn.sun_path, path, sizeof(name_conn.sun_path)); + + snprintf(path, sizeof(path), "%s/%s.bound-conn", dirname, filename); + unlink(path); + if (strlen(path) >= sizeof(name_bound_conn.sun_path)) { + fail("too long path"); + exit(1); + } + + name_bound_conn.sun_family = AF_UNIX; + strncpy(name_bound_conn.sun_path, path, sizeof(name_bound_conn.sun_path)); + + ret = bind(sk_dgram_bound_server, (struct sockaddr *) 
&name_bound, sizeof(name_bound)); + if (ret) { + fail("bind"); + exit(1); + } + + ret = bind(sk_dgram_conn_server, (struct sockaddr *) &name_conn, sizeof(name_conn)); + if (ret) { + fail("bind"); + exit(1); + } + + ret = connect(sk_dgram_conn_client, (struct sockaddr *) &name_conn, sizeof(name_conn)); + if (ret) { + fail("connect"); + exit(1); + } + + ret = connect(sk_dgram_conn_client2, (struct sockaddr *) &name_conn, sizeof(name_conn)); + if (ret) { + fail("connect"); + exit(1); + } + + ret = bind(sk_dgram_bound_conn, (struct sockaddr *) &name_bound_conn, sizeof(name_bound_conn)); + if (ret) { + fail("bind"); + exit(1); + } + + /* Note, it's already bound, so make it more idiotic! */ + ret = connect(sk_dgram_bound_conn, (struct sockaddr *) &name_bound_conn, sizeof(name_bound_conn)); + if (ret) { + fail("connect"); + exit(1); + } + + memset(buf, 0, sizeof(buf)); + sendto(sk_dgram_bound_client, SK_DATA_BOUND, sizeof(SK_DATA_BOUND), 0, + (struct sockaddr *) &name_bound, sizeof(name_bound)); + read(sk_dgram_bound_server, &buf, sizeof(buf)); + if (strcmp(buf, SK_DATA_BOUND)) { + fail("data corrupted\n"); + exit(1); + } + test_msg("dgram-bound : '%s'\n", buf); + + memset(buf, 0, sizeof(buf)); + write(sk_dgram_conn_client, SK_DATA_CONN, sizeof(SK_DATA_CONN)); + read(sk_dgram_conn_server, &buf, sizeof(buf)); + if (strcmp(buf, SK_DATA_CONN)) { + fail("data corrupted\n"); + exit(1); + } + test_msg("dgram-conn : '%s'\n", buf); + + memset(buf, 0, sizeof(buf)); + write(sk_dgram_bound_conn, SK_DATA_BOUND_CONN, sizeof(SK_DATA_BOUND_CONN)); + read(sk_dgram_bound_conn, &buf, sizeof(buf)); + if (strcmp(buf, SK_DATA_BOUND_CONN)) { + fail("data corrupted\n"); + exit(1); + } + test_msg("dgram-bound-conn : '%s'\n", buf); + + test_daemon(); + test_waitsig(); + + memset(buf, 0, sizeof(buf)); + sendto(sk_dgram_bound_client, SK_DATA_BOUND, sizeof(SK_DATA_BOUND), 0, + (struct sockaddr *) &name_bound, sizeof(name_bound)); + read(sk_dgram_bound_server, &buf, sizeof(buf)); + if (strcmp(buf, 
SK_DATA_BOUND)) { + fail("data corrupted\n"); + exit(1); + } + test_msg("dgram-bound : '%s'\n", buf); + + memset(buf, 0, sizeof(buf)); + write(sk_dgram_conn_client, SK_DATA_CONN, sizeof(SK_DATA_CONN)); + read(sk_dgram_conn_server, &buf, sizeof(buf)); + if (strcmp(buf, SK_DATA_CONN)) { + fail("data corrupted\n"); + exit(1); + } + test_msg("dgram-conn : '%s'\n", buf); + + memset(buf, 0, sizeof(buf)); + write(sk_dgram_bound_conn, SK_DATA_BOUND_CONN, sizeof(SK_DATA_BOUND_CONN)); + read(sk_dgram_bound_conn, &buf, sizeof(buf)); + if (strcmp(buf, SK_DATA_BOUND_CONN)) { + fail("data corrupted\n"); + exit(1); + } + test_msg("dgram-bound-conn : '%s'\n", buf); + + pass(); + + /* + * Do cleanup work + */ + unlink(name_bound.sun_path); + unlink(name_conn.sun_path); + unlink(name_bound_conn.sun_path); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/sockets_spair.c b/CRIU_code/test/zdtm/static/sockets_spair.c new file mode 100644 index 0000000..ed9ffa9 --- /dev/null +++ b/CRIU_code/test/zdtm/static/sockets_spair.c @@ -0,0 +1,56 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test unix stream socketpair\n"; +const char *test_author = "Cyrill Gorcunov +#include + +#include "zdtmtst.h" + +const char *test_doc = "Start a calculation, leaving SSE in a certain state,\n" + "before migration, continue after"; +const char *test_author = "Pavel Emelianov "; + +#if defined(__i386__) || defined(__x86_64__) +void start(float *in) +{ + __asm__ volatile ( + "movaps %0, %%xmm0\n" + "movaps %1, %%xmm1\n" + "addps %%xmm0, %%xmm1\n" + "sqrtps %%xmm1, %%xmm2\n" + : + : "m" (in[0]), "m" (in[4]) + ); +} + +void finish(float *out) +{ + __asm__ volatile ( + "movaps %%xmm1, %0\n" + "movaps %%xmm2, %1\n" + : "=m" (out[0]), "=m" (out[4]) + ); +} + +static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) +{ + 
__asm__("cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (op), "c"(0)); +} + +int chk_proc_sse(void) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(1, &eax, &ebx, &ecx, &edx); + return edx & (1 << 25); +} +#endif + +int main(int argc, char **argv) +{ +#if defined(__i386__) || defined(__x86_64__) + float input[8] __attribute__((aligned(16))); + float res1[8] __attribute__((aligned(16))); + float res2[8] __attribute__((aligned(16))); + int i; +#endif + + test_init(argc, argv); +#if defined(__i386__) || defined(__x86_64__) + if (!chk_proc_sse()) { + skip("SSE not supported"); + return 1; + } + for (i = 0; i < sizeof(input) / sizeof(float); i++) + input[i] = drand48(); + + start(input); + finish(res1); + + start(input); + finish(res1); + + test_daemon(); + test_waitsig(); + + finish(res2); + + if (memcmp((uint8_t *) res1, (uint8_t *) res2, sizeof(res1))) + fail("results differ\n"); + else + pass(); +#else + skip("Unsupported arch"); +#endif + return 0; +} diff --git a/CRIU_code/test/zdtm/static/sse00.desc b/CRIU_code/test/zdtm/static/sse00.desc new file mode 100644 index 0000000..d2f501d --- /dev/null +++ b/CRIU_code/test/zdtm/static/sse00.desc @@ -0,0 +1 @@ +{'arch': 'x86_64'} diff --git a/CRIU_code/test/zdtm/static/sse20.c b/CRIU_code/test/zdtm/static/sse20.c new file mode 100644 index 0000000..912528b --- /dev/null +++ b/CRIU_code/test/zdtm/static/sse20.c @@ -0,0 +1,88 @@ +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Start a calculation, leaving SSE2 in a certain state,\n" + "before migration, continue after"; +const char *test_author = "Pavel Emelianov "; + +#if defined(__i386__) || defined(__x86_64__) +void start(double *in) +{ + __asm__ volatile ( + "movapd %0, %%xmm0\n" + "movapd %1, %%xmm1\n" + "addpd %%xmm0, %%xmm1\n" + "sqrtpd %%xmm1, %%xmm2\n" + : + : "m" (in[0]), "m" (in[2]) + ); +} + +void finish(double *out) +{ + __asm__ volatile ( + "movapd %%xmm1, %0\n" + "movapd %%xmm2, %1\n" + : "=m" (out[0]), "=m" 
(out[2]) + ); +} + +static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) +{ + __asm__("cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (op), "c"(0)); +} + +int chk_proc_sse2(void) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(1, &eax, &ebx, &ecx, &edx); + return edx & (1 << 26); +} +#endif + +int main(int argc, char **argv) +{ +#if defined(__i386__) || defined(__x86_64__) + double input[4] __attribute__((aligned(16))); + double res1[4] __attribute__((aligned(16))); + double res2[4] __attribute__((aligned(16))); + int i; +#endif + + test_init(argc, argv); +#if defined(__i386__) || defined(__x86_64__) + if (!chk_proc_sse2()) { + skip("SSE2 not supported"); + return 1; + } + + for (i = 0; i < sizeof(input) / sizeof(double); i++) + input[i] = drand48(); + + start(input); + finish(res1); + + start(input); + + test_daemon(); + test_waitsig(); + + finish(res2); + + if (memcmp((uint8_t *) res1, (uint8_t *) res2, sizeof(res1))) + fail("results differ\n"); + else + pass(); +#else + skip("Unsupported arch"); +#endif + return 0; +} diff --git a/CRIU_code/test/zdtm/static/sse20.desc b/CRIU_code/test/zdtm/static/sse20.desc new file mode 100644 index 0000000..d2f501d --- /dev/null +++ b/CRIU_code/test/zdtm/static/sse20.desc @@ -0,0 +1 @@ +{'arch': 'x86_64'} diff --git a/CRIU_code/test/zdtm/static/stopped.c b/CRIU_code/test/zdtm/static/stopped.c new file mode 100644 index 0000000..9bb8493 --- /dev/null +++ b/CRIU_code/test/zdtm/static/stopped.c @@ -0,0 +1,87 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check, that stopped tasts are restored correctly"; +const char *test_author = "Andrew Vagin "; + +int main(int argc, char **argv) +{ + pid_t pid; + siginfo_t infop; + int p[2], ret, status; + + test_init(argc, argv); + + if (pipe(p)) { + pr_perror("Unable to create pipe"); + return 1; + } + + pid = test_fork(); + if (pid 
< 0) + return -1; + else if (pid == 0) { + char c; + + close(p[1]); + ret = read(p[0], &c, 1); + if (ret != 1) { + pr_perror("Unable to read: %d", ret); + return 1; + } + + return 0; + } + close(p[0]); + + kill(pid, SIGSTOP); + if (waitid(P_PID, pid, &infop, WNOWAIT | WSTOPPED) < 0) { + pr_perror("waitid"); + return 1; + } +#ifdef ZDTM_STOPPED_TKILL + syscall(__NR_tkill, pid, SIGSTOP); +#endif +#ifdef ZDTM_STOPPED_KILL + kill(pid, SIGSTOP); +#endif + + write(p[1], "0", 1); + close(p[1]); + + test_daemon(); + test_waitsig(); + + // Return immediately if child run or stopped(by SIGSTOP) + if (waitpid(pid, &status, WUNTRACED | WCONTINUED) == -1) { + pr_perror("Unable to wait child"); + goto out; + } + + if (WIFSTOPPED(status)) + test_msg("The procces stopped\n"); + else{ + fail("The process doesn't stopped"); + goto out; + } + + kill(pid, SIGCONT); + + if (waitpid(pid, &status, 0) == -1) { + pr_perror("Unable to wait child"); + goto out; + } + + if (WIFEXITED(status)) + pass(); + else + fail("The process doesn't continue"); +out: + return 0; +} diff --git a/CRIU_code/test/zdtm/static/stopped01.c b/CRIU_code/test/zdtm/static/stopped01.c new file mode 100644 index 0000000..8779030 --- /dev/null +++ b/CRIU_code/test/zdtm/static/stopped01.c @@ -0,0 +1 @@ +stopped.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/stopped02.c b/CRIU_code/test/zdtm/static/stopped02.c new file mode 100644 index 0000000..8779030 --- /dev/null +++ b/CRIU_code/test/zdtm/static/stopped02.c @@ -0,0 +1 @@ +stopped.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/stopped12.c b/CRIU_code/test/zdtm/static/stopped12.c new file mode 100644 index 0000000..8779030 --- /dev/null +++ b/CRIU_code/test/zdtm/static/stopped12.c @@ -0,0 +1 @@ +stopped.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/tempfs.c b/CRIU_code/test/zdtm/static/tempfs.c new file mode 100644 index 0000000..8a103be --- /dev/null +++ b/CRIU_code/test/zdtm/static/tempfs.c @@ -0,0 
+1,111 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check tmpfs mount"; +const char *test_author = "Pavel Emelianov "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +#define TEST_WORD "testtest" +#define TEST_WORD2 "TESTTEST" + +int main(int argc, char **argv) +{ + int fd, fdo, ret = 1; + char buf[1024], fname[PATH_MAX], overmount[PATH_MAX]; + + test_init(argc, argv); + + mkdir(dirname, 0700); + if (mount("none", dirname, "tmpfs", 0, "") < 0) { + fail("Can't mount tmpfs"); + return 1; + } + + ssprintf(fname, "%s/test.file", dirname); + fdo = open(fname, O_RDWR | O_CREAT, 0644); + if (fdo < 0) { + pr_perror("open failed"); + goto err; + } + + if (write(fdo, TEST_WORD, sizeof(TEST_WORD)) != sizeof(TEST_WORD)) { + pr_perror("write() failed"); + goto err; + } + + ssprintf(overmount, "%s/test", dirname); + mkdir(overmount, 0700); + + ssprintf(fname, "%s/test.file", overmount); + fd = open(fname, O_RDWR | O_CREAT, 0644); + if (fd < 0) { + pr_perror("open failed"); + goto err; + } + + if (write(fd, TEST_WORD2, sizeof(TEST_WORD2)) != sizeof(TEST_WORD2)) { + pr_perror("write() failed"); + goto err; + } + close(fd); + + if (mount("none", overmount, "tmpfs", 0, "") < 0) { + fail("Can't mount tmpfs"); + goto err; + } + + test_daemon(); + test_waitsig(); + + if (umount(overmount) < 0) { + fail("Can't mount tmpfs"); + goto err; + } + + lseek(fdo, 0, SEEK_SET); + buf[sizeof(TEST_WORD) + 1] = '\0'; + if (read(fdo, buf, sizeof(TEST_WORD)) != sizeof(TEST_WORD)) { + fail("Read failed"); + goto err; + } + close(fdo); + + if (strcmp(buf, TEST_WORD)) { + fail("File corrupted"); + goto err; + } + + fd = open(fname, O_RDONLY); + if (fd < 0) { + pr_perror("open failed"); + goto err; + } + + buf[sizeof(TEST_WORD2) + 1] = '\0'; + if (read(fd, buf, sizeof(TEST_WORD2)) != sizeof(TEST_WORD2)) { + fail("Read failed"); + goto err; + } + close(fd); + + if (strcmp(buf, TEST_WORD2)) { + 
fail("File corrupted"); + goto err; + } + + pass(); + ret = 0; +err: + umount2(dirname, MNT_DETACH); + rmdir(dirname); + return ret; +} diff --git a/CRIU_code/test/zdtm/static/tempfs.desc b/CRIU_code/test/zdtm/static/tempfs.desc new file mode 100644 index 0000000..7657ba4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/tempfs.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/tempfs_overmounted.c b/CRIU_code/test/zdtm/static/tempfs_overmounted.c new file mode 100644 index 0000000..29eca00 --- /dev/null +++ b/CRIU_code/test/zdtm/static/tempfs_overmounted.c @@ -0,0 +1,68 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check tmpfs mount"; +const char *test_author = "Pavel Emelianov "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + test_init(argc, argv); + + mkdir(dirname, 0700); + if (chdir(dirname)) { + pr_perror("chdir"); + return 1; + } + + mkdir("a", 0777); + mkdir("a/b", 0777); + + mount(NULL, "/", NULL, MS_PRIVATE, ""); + if (mount("none", "a/b", "tmpfs", 0, "") < 0) { + fail("Can't mount tmpfs"); + return 1; + } + if (mount("none", "a/b", "tmpfs", 0, "") < 0) { + fail("Can't mount tmpfs"); + return 1; + } + mkdir("a/b/c", 0777); + if (mount("none", "a/b/c", "tmpfs", 0, "") < 0) { + fail("Can't mount tmpfs"); + return 1; + } + if (mount("none", "a", "tmpfs", 0, "") < 0) { + fail("Can't mount tmpfs"); + return 1; + } + if (mount("none", "a", "tmpfs", 0, "") < 0) { + fail("Can't mount tmpfs"); + return 1; + } + mkdir("a/b", 0777); + if (mount("none", "a/b", "tmpfs", 0, "") < 0) { + fail("Can't mount tmpfs"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (umount("a/b") || umount("a") || umount("a") || umount("a/b/c") || umount("a/b") || umount("a/b")) { + pr_err("umount"); + return 1; + } + + pass(); + return 0; +} diff --git 
a/CRIU_code/test/zdtm/static/tempfs_overmounted.desc b/CRIU_code/test/zdtm/static/tempfs_overmounted.desc new file mode 100644 index 0000000..7657ba4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/tempfs_overmounted.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/tempfs_overmounted01.c b/CRIU_code/test/zdtm/static/tempfs_overmounted01.c new file mode 100644 index 0000000..e868b18 --- /dev/null +++ b/CRIU_code/test/zdtm/static/tempfs_overmounted01.c @@ -0,0 +1,118 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check how file systems are dumped if some mount points are overmounted"; +const char *test_author = "Andrei Vagin "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + task_waiter_t lock; + int pid, status = -1; + + test_init(argc, argv); + + task_waiter_init(&lock); + + mkdir(dirname, 0700); + + pid = fork(); + if (pid < 0) { + pr_perror("fork"); + return 1; + } + if (pid == 0) { + if (mount("zdtm", dirname, "tmpfs", 0, "") < 0) { + pr_err("Can't mount tmpfs"); + return 1; + } + if (chdir(dirname)) { + pr_err("chdir"); + return 1; + } + + /* + * We don't know a direction in which criu enumerates mount, + * so lets create two chains of mounts. 
+ */ + + /* Create a chain when a parent mount is overmounted */ + mkdir("a", 0700); + mkdir("b", 0700); + if (mount("zdtm1", "a", "tmpfs", 0, "") || + mount("a", "b", NULL, MS_BIND, "")) { + pr_perror("Can't mount tmpfs"); + return 1; + } + + mkdir("a/b", 0700); + mkdir("a/b/c", 0700); + if (mount("a/b", "a", NULL, MS_BIND, "")) { + pr_perror("mount"); + return 1; + } + + if (mount("b", "a/c", NULL, MS_MOVE, "")) { + pr_perror("Can't mount tmpfs"); + return 1; + } + + /* create a second chain where a child mount is overmounted*/ + if (mount("zdtm2", "b", "tmpfs", 0, "")) { + pr_perror("can't mount tmpfs"); + return 1; + } + mkdir("b/b", 0700); + mkdir("b/b/z", 0700); + if (mount("b", "b/b", NULL, MS_BIND, NULL) || + mount("b/b/b", "b/b", NULL, MS_BIND, NULL)) { + pr_perror("can't mount tmpfs"); + return 1; + } + + task_waiter_complete(&lock, 1); + + test_waitsig(); + if (umount2("a", MNT_DETACH)) { + pr_perror("umount"); + return 1; + } + if (umount2("b/b", MNT_DETACH) || + umount2("b/b", MNT_DETACH)) { + pr_perror("umount"); + return 1; + } + + if (access("a/b/c", R_OK) || access("b/b/z", R_OK)) { + pr_perror("access"); + return 1; + } + return 0; + } + + task_waiter_wait4(&lock, 1); + + test_daemon(); + test_waitsig(); + + kill(pid, SIGTERM); + wait(&status); + if (status) { + fail(); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/tempfs_overmounted01.desc b/CRIU_code/test/zdtm/static/tempfs_overmounted01.desc new file mode 100644 index 0000000..7657ba4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/tempfs_overmounted01.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/tempfs_ro.c b/CRIU_code/test/zdtm/static/tempfs_ro.c new file mode 100644 index 0000000..f30ae8d --- /dev/null +++ b/CRIU_code/test/zdtm/static/tempfs_ro.c @@ -0,0 +1,78 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check read-only tmpfs 
mount"; +const char *test_author = "Andrew Vagin "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +#define TEST_WORD "testtest" + +int main(int argc, char **argv) +{ + int fd, ret = 1; + char buf[1024], fname[PATH_MAX]; + + test_init(argc, argv); + + mkdir(dirname, 0700); + if (mount("none", dirname, "tmpfs", 0, "") < 0) { + fail("Can't mount tmpfs"); + return 1; + } + + snprintf(fname, sizeof(buf), "%s/test.file", dirname); + fd = open(fname, O_RDWR | O_CREAT, 0644); + if (fd < 0) { + pr_perror("open failed"); + goto err; + } + + if (write(fd, TEST_WORD, sizeof(TEST_WORD)) != sizeof(TEST_WORD)) { + pr_perror("write() failed"); + goto err; + } + close(fd); + + if (mount(NULL, dirname, "tmpfs", MS_REMOUNT | MS_RDONLY, NULL) < 0) { + fail("Can't mount tmpfs"); + return 1; + } + + test_daemon(); + test_waitsig(); + + + fd = open(fname, O_RDONLY); + if (fd < 0) { + pr_perror("open failed"); + goto err; + } + + buf[sizeof(TEST_WORD) + 1] = '\0'; + if (read(fd, buf, sizeof(TEST_WORD)) != sizeof(TEST_WORD)) { + fail("Read failed"); + goto err; + } + close(fd); + + if (strcmp(buf, TEST_WORD)) { + fail("File corrupted"); + goto err; + } + + pass(); + ret = 0; +err: + umount2(dirname, MNT_DETACH); + rmdir(dirname); + return ret; +} diff --git a/CRIU_code/test/zdtm/static/tempfs_ro.desc b/CRIU_code/test/zdtm/static/tempfs_ro.desc new file mode 100644 index 0000000..dfe829b --- /dev/null +++ b/CRIU_code/test/zdtm/static/tempfs_ro.desc @@ -0,0 +1 @@ +{'flavor': 'ns', 'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/tempfs_ro02.c b/CRIU_code/test/zdtm/static/tempfs_ro02.c new file mode 100644 index 0000000..7b70b86 --- /dev/null +++ b/CRIU_code/test/zdtm/static/tempfs_ro02.c @@ -0,0 +1,50 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check read-only tmpfs mount"; +const char *test_author = "Andrew Vagin "; + +char *dirname; +TEST_OPTION(dirname, string, "directory 
name", 1); + +#define TEST_WORD "testtest" + +int main(int argc, char **argv) +{ + int fd, ret = 1; + char buf[1024], fname[PATH_MAX]; + + test_init(argc, argv); + + mkdir(dirname, 0700); + if (mount("none", dirname, "tmpfs", MS_RDONLY, "") < 0) { + fail("Can't mount tmpfs"); + return 1; + } + + snprintf(fname, sizeof(buf), "%s/test.file", dirname); + + test_daemon(); + test_waitsig(); + + + fd = open(fname, O_RDWR | O_CREAT, 0777); + if (fd >= 0 || errno != EROFS) { + pr_perror("open failed -> %d", fd); + goto err; + } + + pass(); + ret = 0; +err: + umount2(dirname, MNT_DETACH); + rmdir(dirname); + return ret; +} diff --git a/CRIU_code/test/zdtm/static/tempfs_ro02.desc b/CRIU_code/test/zdtm/static/tempfs_ro02.desc new file mode 100644 index 0000000..7657ba4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/tempfs_ro02.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/tempfs_subns.c b/CRIU_code/test/zdtm/static/tempfs_subns.c new file mode 100644 index 0000000..610f427 --- /dev/null +++ b/CRIU_code/test/zdtm/static/tempfs_subns.c @@ -0,0 +1,135 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check tmpfs in a non-root mntns"; +const char *test_author = "Andrew Vagin +#include +#include "zdtmtst.h" +#include "get_smaps_bits.h" + +#ifndef MADV_DONTDUMP +#define MADV_DONTDUMP 16 +#endif + +const char *test_doc = "Test prctl(THP_DISABLE) behaviour"; +const char *test_author = "Mike Rapoport "; + +#define MEM_SIZE (2 << 20) + +int main(int argc, char **argv) +{ + unsigned long orig_flags = 0, new_flags = 0; + unsigned long orig_madv = 0, new_madv = 0; + void *area; + + test_init(argc, argv); + + area = mmap(NULL, MEM_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (area == MAP_FAILED) { + pr_perror("mmap failed"); + return -1; + } + + test_msg("Fetch existing flags/adv\n"); + if (get_smaps_bits((unsigned 
long)area, &orig_flags, &orig_madv)) + return -1; + + if (prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0)) { + pr_perror("Disabling THP failed"); + return -1; + } + + test_daemon(); + test_waitsig(); + + if (prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0)) { + pr_perror("Enabling THP failed"); + return -1; + } + + test_msg("Fetch restored flags/adv\n"); + if (get_smaps_bits((unsigned long)area, &new_flags, &new_madv)) + return -1; + + if (orig_flags != new_flags) { + pr_err("Flags are changed %lx -> %lx\n", orig_flags, new_flags); + fail(); + return -1; + } + + if (orig_madv != new_madv) { + pr_err("Madvs are changed %lx -> %lx\n", orig_madv, new_madv); + fail(); + return -1; + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/thread_different_uid_gid.c b/CRIU_code/test/zdtm/static/thread_different_uid_gid.c new file mode 100644 index 0000000..1a6cdc9 --- /dev/null +++ b/CRIU_code/test/zdtm/static/thread_different_uid_gid.c @@ -0,0 +1,163 @@ +/* + * Check that we can dump a process with threads having mismatching UID/GID + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "zdtmtst.h" + +#define exit_group(code) \ + syscall(__NR_exit_group, code) + +const char *test_doc = "Acquire UID/GID setting caps, create thread and drop thread to non-root by changing UID/GID\n"; +const char *test_author = "Vitaly Ostrosablin "; + +unsigned int gid; +unsigned int uid; +pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; +pthread_cond_t cond = PTHREAD_COND_INITIALIZER; +task_waiter_t t; + +int done = 0; + +void *chg_uid_gid(void *arg) +{ + cap_t newcaps; + cap_t mycaps; + int ret; + + test_msg("Aux thread runs as UID: %d; GID: %d\n", getuid(), getgid()); + + newcaps = cap_from_text("cap_setgid,cap_setuid=+eip"); + if (!newcaps) { + pr_perror("Failed to get capability struct\n"); + exit(1); + } + + ret = cap_set_proc(newcaps); + if (ret) { + pr_perror("Failed to set capabilities for the process\n"); + exit(1); + } + + 
mycaps = cap_get_proc(); + if (!mycaps) { + pr_perror("Failed to get child thread capabilities\n"); + exit_group(2); + } + + test_msg("Child capabilities: %s\n", cap_to_text(mycaps, NULL)); + test_msg("Changing UID/GID in child thread to %d:%d\n", uid, gid); + + ret = syscall(SYS_setresgid, gid, gid, gid); + if (ret >= 0) { + syscall(SYS_setresuid, uid, uid, uid); + } else if (ret < 0) { + pr_perror("Failed to change UID/GID\n"); + exit_group(2); + } + + gid = getgid(); + uid = getuid(); + test_msg("Now aux thread runs as UID: %d; GID: %d\n", uid, gid); + + test_msg("Child thread is waiting for main thread's signal\n"); + task_waiter_complete(&t, 1); + + pthread_mutex_lock(&mutex); + while (!done) { + pthread_cond_wait(&cond, &mutex); + } + pthread_mutex_unlock(&mutex); + + test_msg("Child thread returns\n"); + return NULL; +} + +int main(int argc, char **argv) +{ + pthread_t diff_cred_thread; + cap_t newcaps; + int maingroup; + int mainuser; + int ret; + + test_init(argc, argv); + task_waiter_init(&t); + + if (getuid() != 0) { + fail("Test is expected to be run with root privileges\n"); + exit(1); + } + + test_msg("Acquiring CAP_SETGID and CAP_SETUID...\n"); + + newcaps = cap_from_text("cap_setgid,cap_setuid=+eip"); + if (!newcaps) { + pr_perror("Failed to get capability struct\n"); + exit(1); + } + ret = cap_set_proc(newcaps); + if (ret) { + pr_perror("Failed to set capabilities for the process\n"); + exit(1); + } + ret = prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0); + if (ret) { + pr_perror("Unable to set KEEPCAPS\n"); + exit(1); + } + + test_msg("Main thread runs as UID: %d; GID: %d\n", getuid(), getgid()); + gid = 99; + uid = 99; + maingroup = 8; + mainuser = 12; + + test_msg("Creating thread with different UID/GID\n"); + ret = pthread_create(&diff_cred_thread, NULL, &chg_uid_gid, NULL); + task_waiter_wait4(&t, 1); + + test_msg("Relinquishing root privileges\n"); + ret = syscall(SYS_setresgid, maingroup, maingroup, maingroup); + if (ret >= 0) { + ret = 
syscall(SYS_setresuid, mainuser, mainuser, mainuser); + } else if (ret < 0) { + pr_perror("Failed to drop privileges\n"); + exit(1); + } + test_msg("Now main thread runs as UID: %d; GID: %d\n", getuid(), getgid()); + if (gid == getgid() || uid == getuid()) { + pr_perror("Thread credentials match\n"); + exit(1); + } + test_msg("Main thread is waiting for signal\n"); + + test_daemon(); + test_waitsig(); + + if (gid == getgid() || uid == getuid()) { + pr_perror("Thread credentials match after restore\n"); + exit(1); + } + + pthread_mutex_lock(&mutex); + done = 1; + pthread_cond_signal(&cond); + pthread_mutex_unlock(&mutex); + pthread_join(diff_cred_thread, NULL); + test_msg("Threads joined\n"); + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/thread_different_uid_gid.desc b/CRIU_code/test/zdtm/static/thread_different_uid_gid.desc new file mode 100644 index 0000000..2eac7e6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/thread_different_uid_gid.desc @@ -0,0 +1 @@ +{'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/timerfd.c b/CRIU_code/test/zdtm/static/timerfd.c new file mode 100644 index 0000000..132c3ac --- /dev/null +++ b/CRIU_code/test/zdtm/static/timerfd.c @@ -0,0 +1,166 @@ +#include +#include +#include +#include +#include + +#include + +#include "zdtmtst.h" + +const char *test_doc = "Checks timerfd survives checkpoint/restore\n"; +const char *test_author = "Cyrill Gorcunov "; + +#define TIMERFD_VNSEC 50000 +#define TIMERFD_ISEC 4 + +struct timerfd_status { + int clockid; + uint64_t ticks; + int settime_flags; + struct itimerspec v; +}; + +static void show_timerfd(char *prefix, struct timerfd_status *s) +{ + test_msg("\t%s clockid %d ticks %llu settime_flags %d it_value(%llu, %llu) it_interval(%llu, %llu)\n", + prefix, + s->clockid, + (unsigned long long)s->ticks, + s->settime_flags, + (unsigned long long)s->v.it_value.tv_sec, + (unsigned long long)s->v.it_value.tv_nsec, + (unsigned long long)s->v.it_interval.tv_sec, + (unsigned long 
long)s->v.it_interval.tv_nsec); +} + +static int parse_self_fdinfo(int fd, struct timerfd_status *s) +{ + char buf[256]; + int ret = -1; + FILE *f; + + sprintf(buf, "/proc/self/fdinfo/%d", fd); + f = fopen(buf, "r"); + if (!f) { + pr_perror("Can't open %s to parse", buf); + return -1; + } + + memset(s, 0, sizeof(*s)); + + /* + * clockid: 0 + * ticks: 0 + * settime flags: 01 + * it_value: (0, 49406829) + * it_interval: (1, 0) + */ + while (fgets(buf, sizeof(buf), f)) { + if (strncmp(buf, "clockid:", 8)) + continue; + + if (sscanf(buf, "clockid: %d", &s->clockid) != 1) + goto parse_err; + + if (!fgets(buf, sizeof(buf), f)) + goto parse_err; + if (sscanf(buf, "ticks: %llu", (unsigned long long *)&s->ticks) != 1) + goto parse_err; + + if (!fgets(buf, sizeof(buf), f)) + goto parse_err; + if (sscanf(buf, "settime flags: 0%o", &s->settime_flags) != 1) + goto parse_err; + + if (!fgets(buf, sizeof(buf), f)) + goto parse_err; + if (sscanf(buf, "it_value: (%llu, %llu)", + (unsigned long long *)&s->v.it_value.tv_sec, + (unsigned long long *)&s->v.it_value.tv_nsec) != 2) + goto parse_err; + + if (!fgets(buf, sizeof(buf), f)) + goto parse_err; + if (sscanf(buf, "it_interval: (%llu, %llu)", + (unsigned long long *)&s->v.it_interval.tv_sec, + (unsigned long long *)&s->v.it_interval.tv_nsec) != 2) + goto parse_err; + + ret = 0; + break; + } + + if (ret) + goto parse_err; +err: + fclose(f); + return ret; + +parse_err: + pr_perror("Format error"); + goto err; +} + +static int check_timerfd(int fd, struct timerfd_status *old) +{ + struct timerfd_status new; + + if (parse_self_fdinfo(fd, &new)) + return -1; + show_timerfd("restored", &new); + + if (old->clockid != new.clockid || + old->settime_flags != new.settime_flags || + old->ticks > new.ticks || + old->v.it_value.tv_sec > new.v.it_value.tv_sec || + old->v.it_interval.tv_sec != new.v.it_interval.tv_sec) + return -1; + + return 0; +} + +int main(int argc, char *argv[]) +{ + struct timerfd_status old = { + .clockid = CLOCK_MONOTONIC, 
+ .ticks = 0, + .settime_flags = 0, + .v = { + .it_value = { + .tv_sec = 0, + .tv_nsec= TIMERFD_VNSEC, + }, + .it_interval = { + .tv_sec = TIMERFD_ISEC, + .tv_nsec= 0, + }, + }, + }; + int timerfd = 0, ret; + + test_init(argc, argv); + + timerfd = timerfd_create(old.clockid, 0); + if (timerfd < 0) { + pr_perror("timerfd_create failed"); + return -1; + } + + show_timerfd("setup", &old); + if (timerfd_settime(timerfd, old.settime_flags, &old.v, NULL)) { + pr_perror("timerfd_settime failed"); + return -1; + } + sleep(1); + + test_daemon(); + test_waitsig(); + + ret = check_timerfd(timerfd, &old); + if (ret) + fail(); + else + pass(); + return ret; +} diff --git a/CRIU_code/test/zdtm/static/timerfd.desc b/CRIU_code/test/zdtm/static/timerfd.desc new file mode 100644 index 0000000..6f54193 --- /dev/null +++ b/CRIU_code/test/zdtm/static/timerfd.desc @@ -0,0 +1 @@ +{'feature': 'timerfd'} diff --git a/CRIU_code/test/zdtm/static/timers.c b/CRIU_code/test/zdtm/static/timers.c new file mode 100644 index 0000000..256a5c1 --- /dev/null +++ b/CRIU_code/test/zdtm/static/timers.c @@ -0,0 +1,93 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Checks timers keep ticking after migration\n"; +const char *test_author = "Pavel Emelianov "; + +static struct { + const int timer_type; + const int signal; + volatile sig_atomic_t count; +} timer_tests[] = { /* from slowest to fastest */ + { ITIMER_VIRTUAL, SIGVTALRM }, + { ITIMER_PROF, SIGPROF }, + { ITIMER_REAL, SIGALRM }, +}; + +#define NUM_TIMERS (sizeof(timer_tests) / sizeof(timer_tests[0])) +#define MAX_TIMER_COUNT 10 + +static void timer_tick(int sig) +{ + int i; + for (i = 0; i < NUM_TIMERS; i++) + if (timer_tests[i].signal == sig) { + /* don't go beyond MAX_TIMER_COUNT, to avoid overflow */ + if (timer_tests[i].count < MAX_TIMER_COUNT) + timer_tests[i].count++; + break; + } +} + +static void setup_timers(void) +{ + int i; + struct itimerval tv = { + .it_interval = { + .tv_sec = 0, + .tv_usec = 100000 
+ }, + .it_value = { + .tv_sec = 0, + .tv_usec = 100 + }, + }; + + for (i = 0; i < NUM_TIMERS; i++) { + if (signal(timer_tests[i].signal, timer_tick) == SIG_ERR) { + pr_perror("can't set signal handler %d", i); + exit(1); + } + + if (setitimer(timer_tests[i].timer_type, &tv, NULL) < 0) { + pr_perror("can't set timer %d", i); + exit(1); + } + } +} + +static void check_timers(void) +{ + int i; + volatile unsigned int j; /* avoid optimizing the loop away */ + + for (i = 0; i < NUM_TIMERS; i++) /* reset counters first */ + timer_tests[i].count = 0; + + /* waste some real and CPU time: run for MAX_TIMER_COUNT ticks or until + * j overflows */ + for (j = 1; j && timer_tests[0].count < MAX_TIMER_COUNT; j++); + + for (i = 0; i < NUM_TIMERS; i++) + if (!timer_tests[i].count) { + fail("timer %d stuck", i); + return; + } + pass(); +} + +int main(int argc, char **argv) +{ + test_init(argc, argv); + + setup_timers(); + + test_daemon(); + test_waitsig(); + + check_timers(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/tty00.c b/CRIU_code/test/zdtm/static/tty00.c new file mode 100644 index 0000000..473650c --- /dev/null +++ b/CRIU_code/test/zdtm/static/tty00.c @@ -0,0 +1,108 @@ +#define _XOPEN_SOURCE 500 +#include +#include "zdtmtst.h" +#include +#include +#include +#include +#include +#include +#include +#include + +const char *test_doc = "Check that a control terminal is restored"; +const char *test_author = "Andrey Vagin "; + +static int sighup = 0; +static void sighup_handler(int signo) +{ + test_msg("SIGHUP is here\n"); + sighup = 1; +} + +int main(int argc, char ** argv) +{ + int fdm, fds, status; + task_waiter_t t; + char *slavename; + pid_t pid; + + test_init(argc, argv); + + task_waiter_init(&t); + + fdm = open("/dev/ptmx", O_RDWR); + if (fdm == -1) { + pr_perror("Can't open a master pseudoterminal"); + return 1; + } + + grantpt(fdm); + unlockpt(fdm); + slavename = ptsname(fdm); + + pid = test_fork(); + if (pid < 0) { + pr_perror("fork() failed"); + return 1; + 
} + + if (pid == 0) { + close(fdm); + signal(SIGHUP, sighup_handler); + + if (setsid() == -1) + return 1; + + /* set up a controlling terminal */ + fds = open(slavename, O_RDWR); + if (fds == -1) { + pr_perror("Can't open a slave pseudoterminal %s", slavename); + return 1; + } + + if (ioctl(fdm, TIOCSCTTY, 1) < 0) { + pr_perror("Can't setup a controlling terminal"); + return 1; + } + close(fds); + + task_waiter_complete_current(&t); + + test_waitsig(); + if (sighup) + return 0; + return 1; + } + + task_waiter_wait4(&t, pid); + + test_daemon(); + + test_waitsig(); + + close(fdm); + + if (kill(pid, SIGTERM) == -1) { + pr_perror("kill failed"); + return 1; + } + + pid = waitpid(pid, &status, 0); + if (pid < 0) + return 1; + + if (WIFEXITED(status)) { + if (WEXITSTATUS(status)) { + fail("The child returned %d", WEXITSTATUS(status)); + return 1; + } + } else { + test_msg("The child has been killed by %d\n", WTERMSIG(status)); + return 1; + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/tty02.c b/CRIU_code/test/zdtm/static/tty02.c new file mode 100644 index 0000000..bac0d82 --- /dev/null +++ b/CRIU_code/test/zdtm/static/tty02.c @@ -0,0 +1,53 @@ +#define _XOPEN_SOURCE 500 +#include +#include "zdtmtst.h" +#include +#include +#include +#include +#include +#include +#include + +const char *test_doc = "Check a non-controling terminal"; +const char *test_author = "Andrey Vagin "; + +int main(int argc, char ** argv) +{ + int fdm, fds; + char *slavename; + pid_t sid; + + test_init(argc, argv); + + setsid(); + + fdm = open("/dev/ptmx", O_RDWR); + if (fdm == -1) { + pr_perror("Can't open a master pseudoterminal"); + return 1; + } + + grantpt(fdm); + unlockpt(fdm); + slavename = ptsname(fdm); + + /* set up a controlling terminal */ + fds = open(slavename, O_RDWR | O_NOCTTY); + if (fds == -1) { + pr_perror("Can't open a slave pseudoterminal %s", slavename); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (ioctl(fds, TIOCGSID, &sid) != -1 || errno 
!= ENOTTY) { + fail("The tty is a controlling for someone"); + return 1; + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/tty03.c b/CRIU_code/test/zdtm/static/tty03.c new file mode 100644 index 0000000..a582f10 --- /dev/null +++ b/CRIU_code/test/zdtm/static/tty03.c @@ -0,0 +1,121 @@ +#define _XOPEN_SOURCE 500 +#include +#include "zdtmtst.h" +#include +#include +#include +#include +#include +#include +#include + +const char *test_doc = "Check a controlling terminal, if a proper fd belongs to another session leader"; +const char *test_author = "Andrey Vagin "; + +int main(int argc, char ** argv) +{ + int fdm, fds, exit_code = 1, status; + task_waiter_t t; + char *slavename; + pid_t sid_b, sid_a, pid; + int pfd[2]; + + test_init(argc, argv); + task_waiter_init(&t); + + if (pipe(pfd) == -1) { + pr_perror("pipe"); + return 1; + } + + fdm = open("/dev/ptmx", O_RDWR); + if (fdm == -1) { + pr_perror("Can't open a master pseudoterminal"); + return 1; + } + + grantpt(fdm); + unlockpt(fdm); + slavename = ptsname(fdm); + + pid = test_fork(); + if (pid == 0) { + if (setsid() == -1) { + pr_perror("setsid"); + return 1; + } + + close(pfd[0]); + + /* set up a controlling terminal */ + fds = open(slavename, O_RDWR | O_NOCTTY); + if (fds == -1) { + pr_perror("Can't open a slave pseudoterminal %s", slavename); + return 1; + } + ioctl(fds, TIOCSCTTY, 1); + + pid = test_fork(); + if (pid == 0) { + if (setsid() == -1) { + pr_perror("setsid"); + return 1; + } + + close(pfd[1]); + + task_waiter_complete(&t, 1); + test_waitsig(); + exit(0); + } + + close(fds); + close(pfd[1]); + + task_waiter_wait4(&t, 1); + task_waiter_complete(&t, 0); + + test_waitsig(); + + kill(pid, SIGTERM); + wait(&status); + + exit(status); + } + + close(pfd[1]); + if (read(pfd[0], &sid_a, 1) != 0) { + pr_perror("read"); + goto out; + } + + if (ioctl(fdm, TIOCGSID, &sid_b) == -1) { + pr_perror("The tty is not a controlling"); + goto out; + } + + task_waiter_wait4(&t, 0); + test_daemon(); + 
test_waitsig(); + + if (ioctl(fdm, TIOCGSID, &sid_a) == -1) { + fail("The tty is not a controlling"); + goto out; + } + + if (sid_b != sid_a) { + fail("The tty is controlling for someone else"); + goto out; + } + + exit_code = 0; + +out: + kill(pid, SIGTERM); + wait(&status); + + if (status == 0 && exit_code == 0) + pass(); + + return exit_code; +} diff --git a/CRIU_code/test/zdtm/static/tun.c b/CRIU_code/test/zdtm/static/tun.c new file mode 100644 index 0000000..f6bfeb9 --- /dev/null +++ b/CRIU_code/test/zdtm/static/tun.c @@ -0,0 +1,238 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test TUN/TAP devices\n"; +const char *test_author = "Pavel Emelianov "; + +#define TUN_DEVICE "/dev/net/tun" +#ifndef IFF_MULTI_QUEUE +#define IFF_MULTI_QUEUE 0x0100 +#define IFF_ATTACH_QUEUE 0x0200 +#define IFF_DETACH_QUEUE 0x0400 +#define IFF_PERSIST 0x0800 +#endif + +#ifndef TUNSETQUEUE +#define TUNSETQUEUE _IOW('T', 217, int) +#endif + +static int any_fail = 0; + +static int __open_tun(void) +{ + int fd; + + fd = open(TUN_DEVICE, O_RDWR); + if (fd < 0) + pr_perror("Can't open tun file %s", TUN_DEVICE); + + return fd; +} + +static int set_tun_queue(int fd, const char *name, unsigned flags) +{ + struct ifreq ifr = { .ifr_flags = flags, }; + + if (ioctl(fd, TUNSETQUEUE, &ifr) < 0) { + pr_perror("Can't set queue on %s", name); + return -1; + } + + return 0; +} + +static int __attach_tun(int fd, const char *name, unsigned flags) +{ + struct ifreq ifr = { .ifr_flags = flags, }; + + strncpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)-1); + + if (ioctl(fd, TUNSETIFF, &ifr) < 0) { + if (!(flags & IFF_TUN_EXCL)) + pr_perror("Can't attach iff %s", name); + return -1; + } + + return fd; +} + +static int open_tun(const char *name, unsigned flags) +{ + int fd; + + fd = __open_tun(); + if (fd < 0) + return -1; + + return __attach_tun(fd, name, flags); +} + +static void check_tun(int fd, const char *name, unsigned 
flags) +{ + struct ifreq ifr = { }; + + if (ioctl(fd, TUNGETIFF, &ifr) > 0) { + any_fail = 1; + fail("Attached tun %s file lost device", name); + } + + if (strcmp(ifr.ifr_name, name)) { + any_fail = 1; + fail("Attached tun %s wrong device", name); + } + + if ((ifr.ifr_flags & flags) != flags) { + any_fail = 1; + fail("Attached tun %s wrong device type", name); + } +} + +static int dev_get_hwaddr(int fd, const char *name, char *a) +{ + struct ifreq ifr = { }; + + if (ioctl(fd, SIOCGIFHWADDR, &ifr) < 0) { + pr_perror("Can't get hwaddr on %s", name); + return -1; + } + + memcpy(a, ifr.ifr_hwaddr.sa_data, ETH_ALEN); + return 0; +} + +int main(int argc, char **argv) +{ + int fds[5], ret; + char addr[ETH_ALEN], a2[ETH_ALEN]; + + test_init(argc, argv); +#ifdef TUN_NS + if (unshare(CLONE_NEWNET)) { + pr_perror("unshare"); + return 1; + } + system("ip link set up dev lo"); +#endif + /* fd[0] -- opened file */ + fds[0] = __open_tun(); + if (fds[0] < 0) { + pr_perror("No file 0"); + return 1; + } + + /* fd[1] -- opened file with tun device */ + fds[1] = open_tun("tunx0", IFF_TUN); + if (fds[1] < 0) { + pr_perror("No file 1"); + return 1; + } + + /* fd[2] and [3] -- two-queued device, with 3 detached */ + fds[2] = open_tun("tunx1", IFF_TUN | IFF_MULTI_QUEUE); + if (fds[2] < 0) { + pr_perror("No file 2"); + return 1; + } + + fds[3] = open_tun("tunx1", IFF_TUN | IFF_MULTI_QUEUE); + if (fds[3] < 0) { + pr_perror("No file 3"); + return 1; + } + + ret = set_tun_queue(fds[3], "tunx1", IFF_DETACH_QUEUE); + if (ret < 0) + return 1; + + /* special case -- persistent device */ + ret = open_tun("tunx2", IFF_TUN); + if (ret < 0) { + pr_perror("No persistent device"); + return 1; + } + + if (ioctl(ret, TUNSETPERSIST, 1) < 0) { + pr_perror("Can't make persistent"); + return 1; + } + + /* and one tap in fd[4] */ + fds[4] = open_tun("tapx0", IFF_TAP); + if (fds[4] < 0) { + pr_perror("No tap"); + return 1; + } + + if (dev_get_hwaddr(fds[4], "tapx0", addr) < 0) { + pr_perror("No hwaddr for 
tap?"); + return 1; + } + + close(ret); + + test_daemon(); + test_waitsig(); + + /* check fds[0] is not attached to device */ + ret = __attach_tun(fds[0], "tunx3", IFF_TUN); + if (ret < 0) { + any_fail = 1; + fail("Opened tun file broken"); + } + + /* check that fds[1] has device */ + check_tun(fds[1], "tunx0", IFF_TUN); + + /* check that fds[2] and [3] are at MQ device with */ + check_tun(fds[2], "tunx1", IFF_TUN | IFF_MULTI_QUEUE); + check_tun(fds[3], "tunx1", IFF_TUN | IFF_MULTI_QUEUE); + + ret = set_tun_queue(fds[2], "tunx1", IFF_DETACH_QUEUE); + if (ret < 0) { + any_fail = 1; + fail("Queue not attached"); + } + + ret = set_tun_queue(fds[3], "tunx1", IFF_ATTACH_QUEUE); + if (ret < 0) { + any_fail = 1; + fail("Queue not detached"); + } + + /* check persistent device */ + ret = open_tun("tunx2", IFF_TUN | IFF_TUN_EXCL); + if (ret >= 0) { + any_fail = 1; + fail("Persistent device lost"); + } else { + ret = open_tun("tunx2", IFF_TUN); + if (ret < 0) + pr_perror("Can't attach tun2"); + else + ioctl(ret, TUNSETPERSIST, 0); + } + + check_tun(fds[4], "tapx0", IFF_TAP); + if (dev_get_hwaddr(fds[4], "tapx0", a2) < 0) { + pr_perror("No hwaddr for tap? 
(2)"); + any_fail = 1; + } else if (memcmp(addr, a2, sizeof(addr))) { + fail("Address mismatch on tap %x:%x -> %x:%x", + (int)addr[0], (int)addr[1], + (int)a2[0], (int)a2[1]); + any_fail = 1; + } + + if (!any_fail) + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/tun.desc b/CRIU_code/test/zdtm/static/tun.desc new file mode 100644 index 0000000..eac32c2 --- /dev/null +++ b/CRIU_code/test/zdtm/static/tun.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid noauto', 'feature': 'tun'} diff --git a/CRIU_code/test/zdtm/static/tun_ns.c b/CRIU_code/test/zdtm/static/tun_ns.c new file mode 100644 index 0000000..bbc509a --- /dev/null +++ b/CRIU_code/test/zdtm/static/tun_ns.c @@ -0,0 +1 @@ +tun.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/tun_ns.desc b/CRIU_code/test/zdtm/static/tun_ns.desc new file mode 100644 index 0000000..cea7cfd --- /dev/null +++ b/CRIU_code/test/zdtm/static/tun_ns.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid', 'feature': 'tun tun_ns'} diff --git a/CRIU_code/test/zdtm/static/uffd-events.c b/CRIU_code/test/zdtm/static/uffd-events.c new file mode 100644 index 0000000..5a46d7e --- /dev/null +++ b/CRIU_code/test/zdtm/static/uffd-events.c @@ -0,0 +1,187 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "Test uffd events"; +const char *test_author = "Mike Rapoport "; + +#define NR_MAPS 5 +#define MAP_SIZE (1 << 20) + +static void *map[NR_MAPS]; + +static int create_mappings(void) +{ + uint32_t crc; + int i; + + for (i = 0; i < NR_MAPS; i++) { + map[i] = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (map[i] == MAP_FAILED) { + fail("mmap failed"); + return 1; + } + + crc = i; + datagen(map[i], MAP_SIZE, &crc); + } + + return 0; +} + +static int verify_zeroes(void *m) +{ + int i; + + for (i = 0; i < MAP_SIZE; i += PAGE_SIZE) { + char *p = m + i; + if (*p != 
0) + return 1; + } + + return 0; +} + +static int check_madv_dn(int idx) +{ + void *m = map[idx]; + + if (madvise(m, MAP_SIZE, MADV_DONTNEED)) { + fail("madvise failed"); + return 1; + } + + if (verify_zeroes(m)) { + fail("not zero"); + return 1; + } + + return 0; +} + +static int check_mremap_grow(int idx) +{ + void *m = map[idx]; + uint32_t crc = idx; + + m = mremap(m, MAP_SIZE, MAP_SIZE * 2, MREMAP_MAYMOVE); + if (m == MAP_FAILED) { + fail("mremap failed"); + return 1; + } + + if (datachk(m, MAP_SIZE, &crc)) { + fail("Mem corrupted"); + return 1; + } + + /* the new part of the mapping should be filled with zeroes */ + m += MAP_SIZE; + if (verify_zeroes(m)) { + fail("not zeroes"); + return 1; + } + + return 0; +} + +static int check_swapped_mappings(int idx) +{ + uint32_t crc = idx; + void *m1 = map[idx]; + void *m2 = map[idx + 1]; + void *p = map[0]; + + p = mremap(m1, MAP_SIZE, MAP_SIZE, MREMAP_MAYMOVE | MREMAP_FIXED, p); + if (p == MAP_FAILED) { + fail("mremap failed"); + return 1; + } + + m1 = mremap(m2, MAP_SIZE, MAP_SIZE, MREMAP_MAYMOVE | MREMAP_FIXED, m1); + if (m1 == MAP_FAILED) { + fail("mremap failed"); + return 1; + } + + m2 = mremap(p, MAP_SIZE, MAP_SIZE, MREMAP_MAYMOVE | MREMAP_FIXED, m2); + if (m2 == MAP_FAILED) { + fail("mremap failed"); + return 1; + } + + if (datachk(m2, MAP_SIZE, &crc)) { + fail("Mem corrupted"); + return 1; + } + + crc = idx + 1; + if (datachk(m1, MAP_SIZE, &crc)) { + fail("Mem corrupted"); + return 1; + } + + return 0; +} + +int main(int argc, char ** argv) +{ + uint32_t crc; + int pid; + + test_init(argc, argv); + + if (create_mappings()) + return -1; + + test_daemon(); + test_waitsig(); + + /* run some page faults */ + crc = 0; + if (datachk(map[0], MAP_SIZE, &crc)) { + fail("Mem corrupted"); + return 1; + } + + pid = fork(); + if (pid < 0) { + fail("Can't fork"); + return 1; + } + + /* check madvise(MADV_DONTNEED) */ + if (check_madv_dn(1)) + return 1; + + /* check growing mremap */ + if (check_mremap_grow(2)) + return 1; + 
+ /* check swapped mappings */ + if (check_swapped_mappings(3)) + return 1; + + if (pid) { + int status; + + waitpid(-1, &status, 0); + if (status) { + fail("child failed"); + return status; + } + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/umask00.c b/CRIU_code/test/zdtm/static/umask00.c new file mode 100644 index 0000000..1157f0d --- /dev/null +++ b/CRIU_code/test/zdtm/static/umask00.c @@ -0,0 +1,30 @@ +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that umask didn't change"; +const char *test_author = "Pavel Emelianov "; + +unsigned int mask; +TEST_OPTION(mask, uint, "umask", 1); + +int main(int argc, char **argv) +{ + unsigned int cur_mask, mask2; + + test_init(argc, argv); + + cur_mask = umask(mask); + + test_daemon(); + test_waitsig(); + + mask2 = umask(0); + if (mask != mask2) + fail("mask changed: %o != %o\n", mask, mask2); + else + pass(); + + umask(cur_mask); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/unbound_sock.c b/CRIU_code/test/zdtm/static/unbound_sock.c new file mode 100644 index 0000000..be8318c --- /dev/null +++ b/CRIU_code/test/zdtm/static/unbound_sock.c @@ -0,0 +1,42 @@ +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Create a socket before migration, and bind to it after\n"; +const char *test_author = "Roman Kagan "; + +#define TEST_PORT 59687 +#define TEST_ADDR INADDR_ANY + +int main(int argc, char ** argv) +{ + int sock; + struct sockaddr_in name = { + .sin_family = AF_INET, + .sin_port = htons(TEST_PORT), + .sin_addr.s_addr = htonl(TEST_ADDR), + }; + + test_init(argc, argv); + + sock = socket(PF_INET, SOCK_STREAM, 0); + if (sock < 0) { + pr_perror("can't create socket"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (bind(sock, (struct sockaddr *) &name, sizeof(name)) < 0) + fail("can't bind to a socket: %m"); + else + pass(); + + close(sock); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/unhashed_proc.c 
b/CRIU_code/test/zdtm/static/unhashed_proc.c new file mode 100644 index 0000000..1fdc38f --- /dev/null +++ b/CRIU_code/test/zdtm/static/unhashed_proc.c @@ -0,0 +1,81 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Chdir into unhashed proc entry"; +const char *test_author = "Konstantin Khlebnikov "; + +int main(int argc, char ** argv) +{ /* chdir into /proc/<child>, kill the child so the cwd becomes unhashed, then check that getcwd() still fails with ENOENT after restore */ + int pid, len; + char cwd1[PATH_MAX], cwd2[PATH_MAX]; + + test_init(argc, argv); + + pid = fork(); + if (pid < 0) { + pr_perror("fork failed"); + exit(1); + } else if (!pid) { + pause(); + return 0; + } + + sprintf(cwd1, "/proc/%d", pid); + + if (chdir(cwd1) < 0) { + kill(pid, SIGKILL); + pr_perror("chdir failed"); + exit(1); + } + + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + + if (getcwd(cwd1, sizeof(cwd1))) { + pr_perror("successful getcwd: %s", cwd1); + exit(1); + } else if (errno != ENOENT) { + pr_perror("wrong errno"); + exit(1); + } + + len = readlink("/proc/self/cwd", cwd1, sizeof(cwd1)-1); /* readlink() does not NUL-terminate; keep one byte free for the terminator written below */ + if (len < 0) { + pr_perror("can't read cwd symlink"); + exit(1); + } + cwd1[len] = 0; + + test_daemon(); + test_waitsig(); + + if (getcwd(cwd2, sizeof(cwd2))) { + fail("successful getcwd: %s\n", cwd2); + exit(1); + } else if (errno != ENOENT) { + fail("wrong errno: %m\n"); + exit(1); + } + + len = readlink("/proc/self/cwd", cwd2, sizeof(cwd2)-1); + if (len < 0) { + fail("can't read cwd symlink %m\n"); + exit(1); + } + cwd2[len] = 0; + + if (strcmp(cwd1, cwd2)) + test_msg("cwd differs: %s != %s\n", cwd1, cwd2); + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/unhashed_proc.desc b/CRIU_code/test/zdtm/static/unhashed_proc.desc new file mode 100644 index 0000000..847e3b2 --- /dev/null +++ b/CRIU_code/test/zdtm/static/unhashed_proc.desc @@ -0,0 +1 @@ +{'flags': 'crfail', 'opts' : '--link-remap'} diff --git a/CRIU_code/test/zdtm/static/unlink_fifo.c b/CRIU_code/test/zdtm/static/unlink_fifo.c new file mode 100644 index 0000000..765f5eb 
--- /dev/null +++ b/CRIU_code/test/zdtm/static/unlink_fifo.c @@ -0,0 +1,50 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that we can migrate with a named pipe " + "open and then unlinked"; +const char *test_author = "Roman Kagan "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +int main(int argc, char **argv) +{ + int fd; + mode_t mode = S_IFIFO | 0700; + + test_init(argc, argv); + + if (mknod(filename, mode, 0)) { + pr_perror("can't make fifo \"%s\"", filename); + exit(1); + } + + fd = open(filename, O_RDWR); + if (fd < 0) { + pr_perror("can't open %s", filename); + return 1; + } + + if (unlink(filename) < 0) { + pr_perror("can't unlink %s", filename); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (close(fd) < 0) { + fail("can't close %s: %m", filename); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/unlink_fifo_wronly.c b/CRIU_code/test/zdtm/static/unlink_fifo_wronly.c new file mode 100644 index 0000000..5fb4c34 --- /dev/null +++ b/CRIU_code/test/zdtm/static/unlink_fifo_wronly.c @@ -0,0 +1,60 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that we can migrate with a named pipe, " + "opened in WRONLY mode and then unlinked"; +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +int main(int argc, char **argv) +{ + int fd, fd1; + mode_t mode = S_IFIFO | 0600; + + test_init(argc, argv); + + if (mknod(filename, mode, 0)) { + pr_perror("can't make fifo \"%s\"", filename); + exit(1); + } + + fd = open(filename, O_RDONLY | O_NONBLOCK); + if (fd < 0) { + pr_perror("open(%s, O_RDONLY | O_NONBLOCK) Failed", + filename); + return 1; + } + + fd1 = open(filename, O_WRONLY); + if (fd1 < 0) { + pr_perror("open(%s, O_WRONLY) Failed", filename); + return 1; + } + + if (unlink(filename) < 0) { + pr_perror("can't unlink %s", filename); + return 1; + } + 
+ test_daemon(); + test_waitsig(); + + if (close(fd) < 0) { + fail("can't close (O_RDONLY | O_NONBLOCK) %s: %m", filename); + return 1; + } + + if (close(fd1) < 0) { + fail("can't close (O_WRONLY) %s: %m", filename); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/unlink_fstat00.c b/CRIU_code/test/zdtm/static/unlink_fstat00.c new file mode 100644 index 0000000..79ea902 --- /dev/null +++ b/CRIU_code/test/zdtm/static/unlink_fstat00.c @@ -0,0 +1,173 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +#ifndef __O_TMPFILE +#define __O_TMPFILE 020000000 +#endif + +#ifndef O_TMPFILE +#define O_TMPFILE (__O_TMPFILE | O_DIRECTORY) +#endif + +const char *test_doc = "Open, unlink, change size, seek, migrate, check size"; + +#ifdef UNLINK_FSTAT04 +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); +#else +char *filename; +TEST_OPTION(filename, string, "file name", 1); +#endif + +int main(int argc, char ** argv) +{ + int fd; + size_t fsize=1000; + mode_t mode; + uid_t uid; + gid_t gid; + uint8_t buf[fsize]; + struct stat fst; + uint32_t crc; +#ifdef UNLINK_FSTAT04 + char filename[PATH_MAX]; +#endif + + test_init(argc, argv); + +#ifdef UNLINK_FSTAT04 + snprintf(filename, sizeof(filename), "%s/test\\file'\n\"un%%linkfstat00", dirname); + + mkdir(dirname, 0700); +#endif +#ifndef UNLINK_FSTAT041 + fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644); +#else + fd = open(dirname, O_RDWR | O_TMPFILE, 0644); +#endif + if (fd < 0) { + pr_perror("can't open %s", filename); + exit(1); + } + +#ifdef UNLINK_FSTAT04 + if (chmod(dirname, 0500)) { + pr_perror("chmod"); + exit(1); + } +#endif + + if (fstat(fd, &fst) < 0) { + pr_perror("can't get file info %s before", filename); + goto failed; + } + +#ifndef UNLINK_FSTAT041 + if (unlink(filename) < 0) { + pr_perror("can't unlink %s", filename); + goto failed; + } +#endif + /* Change file size */ + if (fst.st_size != 0) { + pr_perror("%s 
file size eq %ld", filename, (long)fst.st_size); + goto failed; + } + + crc = ~0; + datagen(buf, sizeof(buf), &crc); + if (write(fd, buf, sizeof(buf)) != sizeof(buf)) { + pr_perror("can't write %s", filename); + goto failed; + } + /* Change file mode */ + if ((fst.st_mode & S_IXOTH) == 0) + mode = (fst.st_mode | S_IXOTH); + else + mode = (fst.st_mode ^ S_IXOTH); + + if (fchmod(fd, mode) < 0) { + pr_perror("can't chmod %s", filename); + goto failed; + } + + if (getuid()) { + uid = getuid(); + gid = getgid(); + } else { + /* Change uid, gid */ + if (fchown(fd, (uid = fst.st_uid + 1), (gid = fst.st_gid + 1)) < 0) { + pr_perror("can't chown %s", filename); + goto failed; + } + } + + if (lseek(fd, 0, SEEK_SET) != 0) { + pr_perror("can't reposition to 0"); + goto failed; + } + + test_daemon(); + test_waitsig(); + + if (fstat(fd, &fst) < 0) { + pr_perror("can't get %s file info after", filename); + goto failed; + } + + /* Check file size */ + if (fst.st_size != fsize) { + fail("(via fstat): file size changed to %ld", + (long)fst.st_size); + goto failed; + } + fst.st_size = lseek(fd, 0, SEEK_END); + if (fst.st_size != fsize) { + fail("(via lseek): file size changed to %ld", + (long)fst.st_size); + goto failed; + } + /* Check mode */ + if (fst.st_mode != mode) { + fail("mode is changed to %o(%o)", fst.st_mode, mode); + goto failed; + } + /* Check uid, gid */ + if (fst.st_uid != uid || fst.st_gid != gid) { + fail("u(g)id changed: uid=%d(%d), gid=%d(%d)", + fst.st_uid, uid, fst.st_gid, gid); + goto failed; + } + + if (lseek(fd, 0, SEEK_SET) != 0) { + pr_perror("can't reposition to 0"); + goto failed; + } + if (read(fd, buf, sizeof(buf)) != sizeof(buf)) { + fail("can't read %s: %m\n", filename); + goto failed; + } + + crc = ~0; + if (datachk(buf, sizeof(buf), &crc)) { + fail("CRC mismatch\n"); + goto failed; + } + + close(fd); + + pass(); + return 0; +failed: + unlink(filename); + close(fd); + return 1; +} diff --git a/CRIU_code/test/zdtm/static/unlink_fstat00.hook 
b/CRIU_code/test/zdtm/static/unlink_fstat00.hook new file mode 100644 index 0000000..dfae0f6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/unlink_fstat00.hook @@ -0,0 +1,11 @@ +#!/bin/bash + +[ "$1" == "--fault" -a "$2" == "restore" ] || exit 0 + +if [ $(find -name 'unlink_fstat00*ghost' | wc -l ) -ne 0 ]; then + echo "Dangling ghost file" + exit 1 +fi + +echo "Restore fault handled" +exit 0 diff --git a/CRIU_code/test/zdtm/static/unlink_fstat01+.c b/CRIU_code/test/zdtm/static/unlink_fstat01+.c new file mode 100644 index 0000000..232e698 --- /dev/null +++ b/CRIU_code/test/zdtm/static/unlink_fstat01+.c @@ -0,0 +1 @@ +unlink_fstat01.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/unlink_fstat01.c b/CRIU_code/test/zdtm/static/unlink_fstat01.c new file mode 100644 index 0000000..43ea6b9 --- /dev/null +++ b/CRIU_code/test/zdtm/static/unlink_fstat01.c @@ -0,0 +1,93 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Open, unlink, change size, migrate, check size"; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +int main(int argc, char ** argv) +{ + int fd; + size_t fsize=1000; + uint8_t buf[fsize]; + struct stat fst; + + test_init(argc, argv); + + fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (fd < 0) { + pr_perror("can't open %s", filename); + exit(1); + } + + if (fstat(fd, &fst) < 0) { + pr_perror("can't get file info %s before", filename); + goto failed; + } + + if (fst.st_size != 0) { + pr_perror("%s file size eq %ld", filename, (long)fst.st_size); + goto failed; + } + + if (unlink(filename) < 0) { + pr_perror("can't unlink %s", filename); + goto failed; + } + +#ifdef UNLINK_OVER +{ + int fdo; + + fdo = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (fdo < 0) { + pr_perror("can't open %s", filename); + exit(1); + } +} +#endif + + memset(buf, '0', sizeof(buf)); + if (write(fd, buf, sizeof(buf)) != sizeof(buf)) { + 
pr_perror("can't write %s", filename); + goto failed; + } + + test_daemon(); + test_waitsig(); + + if (fstat(fd, &fst) < 0) { + pr_perror("can't get %s file info after", filename); + goto failed; + } + + if (fst.st_size != fsize) { + fail("(via fstat): file size changed to %lld", + (long long)fst.st_size); + goto failed; + } + + fst.st_size = lseek(fd, 0, SEEK_END); + if (fst.st_size != fsize) { + fail("(via lseek): file size changed to %lld", + (long long)fst.st_size); + goto failed; + } + + close(fd); + + pass(); + return 0; +failed: + unlink(filename); + close(fd); + return 1; +} diff --git a/CRIU_code/test/zdtm/static/unlink_fstat02.c b/CRIU_code/test/zdtm/static/unlink_fstat02.c new file mode 100644 index 0000000..1ffeffd --- /dev/null +++ b/CRIU_code/test/zdtm/static/unlink_fstat02.c @@ -0,0 +1,115 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Open, link, unlink x2, change size, migrate, check size"; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); +static char link_name[1024]; + +int main(int argc, char ** argv) +{ + int fd[2]; + size_t fsize=1000; + uint8_t buf[fsize]; + struct stat fst, fst2; + + test_init(argc, argv); + + fd[0] = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (fd[0] < 0) { + pr_perror("can't open %s", filename); + exit(1); + } + + sprintf(link_name, "%s.link", filename); + if (link(filename, link_name)) { + pr_perror("can't link files"); + goto failed0; + } + + fd[1] = open(link_name, O_RDONLY); + if (fd[1] < 0) { + pr_perror("can't open %s", link_name); + goto failed0; + } + + if (fstat(fd[0], &fst) < 0) { + pr_perror("can't get file info %s before", filename); + goto failed; + } + + if (fst.st_size != 0) { + pr_perror("%s file size eq %lld", + filename, (long long)fst.st_size); + goto failed; + } + + if (unlink(filename) < 0) { + pr_perror("can't unlink %s", filename); + goto failed; + } + + if (unlink(link_name) < 0) { + 
pr_perror("can't unlink %s", link_name); + goto failed; + } + + memset(buf, '0', sizeof(buf)); + if (write(fd[0], buf, sizeof(buf)) != sizeof(buf)) { + pr_perror("can't write %s", filename); + goto failed; + } + + test_daemon(); + test_waitsig(); + + if (fstat(fd[0], &fst) < 0) { + pr_perror("can't get %s file info after", filename); + goto failed; + } + + if (fstat(fd[1], &fst2) < 0) { + pr_perror("can't get %s file2 info after", link_name); + goto failed; + } + + if ((fst.st_dev != fst2.st_dev) || (fst.st_ino != fst2.st_ino)) { + fail("files differ after restore\n"); + goto failed; + } + + if (fst.st_size != fsize) { + fail("(via fstat): file size changed to %lld", + (long long)fst.st_size); + goto failed; + } + + fst.st_size = lseek(fd[0], 0, SEEK_END); + if (fst.st_size != fsize) { + fail("(via lseek): file size changed to %lld", + (long long)fst.st_size); + goto failed; + } + + close(fd[0]); + close(fd[1]); + + pass(); + return 0; + +failed: + unlink(link_name); + close(fd[1]); +failed0: + unlink(filename); + close(fd[0]); + return 1; +} diff --git a/CRIU_code/test/zdtm/static/unlink_fstat03.c b/CRIU_code/test/zdtm/static/unlink_fstat03.c new file mode 100644 index 0000000..b31ef19 --- /dev/null +++ b/CRIU_code/test/zdtm/static/unlink_fstat03.c @@ -0,0 +1,111 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Open, link, unlink former, change size, migrate, check size"; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); +static char link_name[1024]; + +int main(int argc, char ** argv) +{ + int fd; + size_t fsize=1000; + uint8_t buf[fsize]; + struct stat fst, fst2; + struct statfs fsst; + + test_init(argc, argv); + + fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (fd < 0) { + pr_perror("can't open %s", filename); + exit(1); + } + + sprintf(link_name, "%s.link", filename); + if (link(filename, link_name)) { + pr_perror("can't link files"); 
+ goto failed0; + } + + if (fstat(fd, &fst) < 0) { + pr_perror("can't get file info %s before", filename); + goto failed; + } + + if (fst.st_size != 0) { + pr_perror("%s file size eq %lld", filename, + (long long)fst.st_size); + goto failed; + } + + if (unlink(filename) < 0) { + pr_perror("can't unlink %s", filename); + goto failed; + } + + memset(buf, '0', sizeof(buf)); + if (write(fd, buf, sizeof(buf)) != sizeof(buf)) { + pr_perror("can't write %s", filename); + goto failed; + } + + test_daemon(); + test_waitsig(); + + if (statfs(link_name, &fsst) < 0) { + pr_perror("statfs(%s)", link_name); + goto failed; + } + + if (fstat(fd, &fst2) < 0) { + pr_perror("can't get %s file info after", filename); + goto failed; + } + + /* An NFS mount is restored with another st_dev */ + if (fsst.f_type != NFS_SUPER_MAGIC && fst.st_dev != fst2.st_dev) { + fail("files differ after restore\n"); + goto failed; + } + + if (fst.st_ino != fst2.st_ino) { + fail("files differ after restore\n"); + goto failed; + } + + /* fst holds the pre-dump stat (size 0); fst2 is the post-restore one being checked, so report fst2 */ + if (fst2.st_size != fsize) { + fail("(via fstat): file size changed to %lld", + (long long)fst2.st_size); + goto failed; + } + + fst2.st_size = lseek(fd, 0, SEEK_END); + if (fst2.st_size != fsize) { + fail("(via lseek): file size changed to %lld", + (long long)fst2.st_size); + goto failed; + } + + close(fd); + + pass(); + return 0; + +failed: + unlink(link_name); +failed0: + unlink(filename); + close(fd); + return 1; +} diff --git a/CRIU_code/test/zdtm/static/unlink_fstat03.desc b/CRIU_code/test/zdtm/static/unlink_fstat03.desc new file mode 100644 index 0000000..083b583 --- /dev/null +++ b/CRIU_code/test/zdtm/static/unlink_fstat03.desc @@ -0,0 +1 @@ +{'opts': '--link-remap', 'flags': 'nouser'} diff --git a/CRIU_code/test/zdtm/static/unlink_fstat04.c b/CRIU_code/test/zdtm/static/unlink_fstat04.c new file mode 100644 index 0000000..f51b3c6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/unlink_fstat04.c @@ -0,0 +1 @@ +unlink_fstat00.c \ No newline at end of file diff --git 
a/CRIU_code/test/zdtm/static/unlink_fstat04.desc b/CRIU_code/test/zdtm/static/unlink_fstat04.desc new file mode 100644 index 0000000..48e131a --- /dev/null +++ b/CRIU_code/test/zdtm/static/unlink_fstat04.desc @@ -0,0 +1 @@ +{ "flags" : "suid" } diff --git a/CRIU_code/test/zdtm/static/unlink_fstat041.c b/CRIU_code/test/zdtm/static/unlink_fstat041.c new file mode 100644 index 0000000..f51b3c6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/unlink_fstat041.c @@ -0,0 +1 @@ +unlink_fstat00.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/static/unlink_largefile.c b/CRIU_code/test/zdtm/static/unlink_largefile.c new file mode 100644 index 0000000..f473912 --- /dev/null +++ b/CRIU_code/test/zdtm/static/unlink_largefile.c @@ -0,0 +1,59 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Checkpointing/restore of big (2Gb) unlinked files"; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +int main(int argc, char ** argv) +{ + int fd; + char buf[1000000]; + off64_t offset= 0x80002000ULL; + size_t count; + + test_init(argc, argv); + + fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_LARGEFILE, 0644); + if (fd < 0) { + pr_perror("can't open %s", filename); + exit(1); + } + + if (lseek64(fd, offset, SEEK_SET) < 0) { + pr_perror("can't lseek %s, offset= %llx", filename, + (long long unsigned)offset); + goto failed; + } + + count = sizeof(buf); + memset(buf, 0, count); + if (write(fd, buf, count) != count) { + pr_perror("can't write %s", filename); + goto failed; + } + + if (unlink(filename) < 0) { + pr_perror("can't unlink %s", filename); + goto failed; + } + + test_daemon(); + test_waitsig(); + + close(fd); + + pass(); + return 0; +failed: + unlink(filename); + close(fd); + return 1; +} diff --git a/CRIU_code/test/zdtm/static/unlink_largefile.desc b/CRIU_code/test/zdtm/static/unlink_largefile.desc new file mode 100644 index 0000000..ded8987 --- /dev/null +++ 
b/CRIU_code/test/zdtm/static/unlink_largefile.desc @@ -0,0 +1 @@ +{'flags': 'crfail'} diff --git a/CRIU_code/test/zdtm/static/unlink_mmap00.c b/CRIU_code/test/zdtm/static/unlink_mmap00.c new file mode 100644 index 0000000..03509aa --- /dev/null +++ b/CRIU_code/test/zdtm/static/unlink_mmap00.c @@ -0,0 +1,78 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test mmaped and unlinked files"; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +static void touch_file_page(int fd, unsigned long off, char c) +{ + if (lseek(fd, off, SEEK_SET) != off) { + pr_perror("Lseek fail"); + exit(1); + } + + if (write(fd, &c, 1) != 1) { + pr_perror("Write fail"); + exit(1); + } +} + +int main(int argc, char ** argv) +{ + int fd; + char *mem_a, *mem_b; + + test_init(argc, argv); + + fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644); + if (fd < 0) { + pr_perror("can't open file"); + exit(1); + } + + + touch_file_page(fd, 0, 'a'); + touch_file_page(fd, PAGE_SIZE, 'b'); + touch_file_page(fd, 2 * PAGE_SIZE - 1, 'c'); /* for aligned file */ + + /* map with different prots to create 2 regions */ + mem_a = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_PRIVATE | MAP_FILE, fd, 0); + mem_b = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FILE, fd, PAGE_SIZE); + if (mem_a == MAP_FAILED || mem_b == MAP_FAILED) { + pr_perror("can't map file"); + exit(1); + } + + if (unlink(filename) < 0) { + pr_perror("can't unlink file"); + exit(1); + } + + close(fd); + + test_daemon(); + test_waitsig(); + + if (mem_a[0] != 'a') + fail("1st region fail"); + else if (mem_b[0] != 'b' || mem_b[PAGE_SIZE - 1] != 'c') + fail("2nd regin fail"); + else + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/unlink_mmap00.desc b/CRIU_code/test/zdtm/static/unlink_mmap00.desc new file mode 100644 index 0000000..1fda483 --- /dev/null +++ 
b/CRIU_code/test/zdtm/static/unlink_mmap00.desc @@ -0,0 +1 @@ +{'flags': 'nouser'} diff --git a/CRIU_code/test/zdtm/static/unlink_mmap01.c b/CRIU_code/test/zdtm/static/unlink_mmap01.c new file mode 100644 index 0000000..66c1bc3 --- /dev/null +++ b/CRIU_code/test/zdtm/static/unlink_mmap01.c @@ -0,0 +1,102 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test mmaped and unlinked files (2, with hard links)"; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); +static char linkname[4096]; + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +static void touch_file_page(int fd, unsigned long off, char c) +{ + if (lseek(fd, off, SEEK_SET) != off) { + pr_perror("Lseek fail"); + exit(1); + } + + if (write(fd, &c, 1) != 1) { + pr_perror("Write fail"); + exit(1); + } +} + +int main(int argc, char ** argv) +{ + int fd; + char *mem_a, *mem_b; + + test_init(argc, argv); + + fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644); + if (fd < 0) { + pr_perror("can't open file"); + exit(1); + } + + touch_file_page(fd, 0, 'a'); + touch_file_page(fd, PAGE_SIZE - 1, 'b');/* for aligned file */ + + mem_a = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_PRIVATE | MAP_FILE, fd, 0); + if (mem_a == MAP_FAILED) { + pr_perror("can't map file"); + exit(1); + } + + sprintf(linkname, "%s.lnk", filename); + if (link(filename, linkname)) { + pr_perror("can't link file"); + exit(1); + } + + if (unlink(filename) < 0) { + pr_perror("can't unlink file"); + exit(1); + } + + close(fd); + + fd = open(linkname, O_RDWR); + if (fd < 0) { + pr_perror("can't open link"); + exit(1); + } + + mem_b = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_PRIVATE | MAP_FILE, fd, 0); + if (mem_b == MAP_FAILED) { + pr_perror("can't map link"); + exit(1); + } + + if (unlink(linkname) < 0) { + pr_perror("can't unlink link"); + exit(1); + } + + close(fd); + + test_daemon(); + test_waitsig(); + + if (mem_a[0] != 'a' || 
mem_a[PAGE_SIZE - 1] != 'b') + fail("1st region fail"); + else if (mem_b[0] != 'a' || mem_b[PAGE_SIZE - 1] != 'b') + fail("2nd regin fail"); + else + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/unlink_mmap01.desc b/CRIU_code/test/zdtm/static/unlink_mmap01.desc new file mode 100644 index 0000000..1fda483 --- /dev/null +++ b/CRIU_code/test/zdtm/static/unlink_mmap01.desc @@ -0,0 +1 @@ +{'flags': 'nouser'} diff --git a/CRIU_code/test/zdtm/static/unlink_mmap02.c b/CRIU_code/test/zdtm/static/unlink_mmap02.c new file mode 100644 index 0000000..85d6b38 --- /dev/null +++ b/CRIU_code/test/zdtm/static/unlink_mmap02.c @@ -0,0 +1,77 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test mmaped, opened and unlinked files"; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +static void touch_file_page(int fd, unsigned long off, char c) +{ + if (lseek(fd, off, SEEK_SET) != off) { + pr_perror("Lseek fail"); + exit(1); + } + + if (write(fd, &c, 1) != 1) { + pr_perror("Write fail"); + exit(1); + } +} + +int main(int argc, char ** argv) +{ + int fd; + char *mem_a, *mem_b; + + test_init(argc, argv); + + fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644); + if (fd < 0) { + pr_perror("can't open file"); + exit(1); + } + + + touch_file_page(fd, 2 * PAGE_SIZE - 1, 'c'); /* for aligned file */ + + /* map with different prots to create 2 regions */ + mem_a = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_PRIVATE | MAP_FILE, fd, 0); + mem_b = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FILE, fd, PAGE_SIZE); + if (mem_a == MAP_FAILED || mem_b == MAP_FAILED) { + pr_perror("can't map file"); + exit(1); + } + + if (unlink(filename) < 0) { + pr_perror("can't unlink file"); + exit(1); + } + + test_daemon(); + test_waitsig(); + + touch_file_page(fd, 0, 'a'); + touch_file_page(fd, PAGE_SIZE, 'b'); + + if (mem_a[0] 
!= 'a') + fail("1st region fail"); + else if (mem_b[0] != 'b' || mem_b[PAGE_SIZE - 1] != 'c') + fail("2nd regin fail"); + else + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/unlink_mmap02.desc b/CRIU_code/test/zdtm/static/unlink_mmap02.desc new file mode 100644 index 0000000..1fda483 --- /dev/null +++ b/CRIU_code/test/zdtm/static/unlink_mmap02.desc @@ -0,0 +1 @@ +{'flags': 'nouser'} diff --git a/CRIU_code/test/zdtm/static/unlink_multiple_largefiles.c b/CRIU_code/test/zdtm/static/unlink_multiple_largefiles.c new file mode 100644 index 0000000..7cf6286 --- /dev/null +++ b/CRIU_code/test/zdtm/static/unlink_multiple_largefiles.c @@ -0,0 +1,267 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include "zdtmtst.h" + + +#define FSIZE 0x3B600000ULL +#define NFILES 10 +#define BUFSIZE (1<<20) + +const char *test_doc = "C/R of ten big (951MiB) unlinked files in root dir"; +const char *test_author = "Vitaly Ostrosablin "; + +void create_check_pattern(char *buf, size_t count, unsigned char seed) +{ + int i; + + for (i = 0; i < count; i++) + buf[i] = seed++; +} + +struct fiemap *read_fiemap(int fd) +{ + test_msg("Obtaining fiemap for fd %d\n", fd); + struct fiemap *fiemap, *tmp; + int extents_size; + + fiemap = malloc(sizeof(struct fiemap)); + if (fiemap == NULL) { + pr_perror("Cannot allocate fiemap"); + return NULL; + } + memset(fiemap, 0, sizeof(struct fiemap)); + + fiemap->fm_length = FIEMAP_MAX_OFFSET; + fiemap->fm_start = 0; + fiemap->fm_flags = 0; + fiemap->fm_extent_count = 0; + + if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) { + pr_perror("FIEMAP ioctl failed"); + free(fiemap); + return NULL; + } + + extents_size = sizeof(struct fiemap_extent) * fiemap->fm_mapped_extents; + + tmp = realloc(fiemap, sizeof(struct fiemap) + extents_size); + if (tmp == NULL) { + free(fiemap); + pr_perror("Cannot resize fiemap"); + return NULL; + } + fiemap = tmp; + memset(fiemap->fm_extents, 0, 
extents_size); + + fiemap->fm_extent_count = fiemap->fm_mapped_extents; + fiemap->fm_mapped_extents = 0; + + if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) { + pr_perror("fiemap ioctl() failed"); + free(fiemap); + return NULL; + } + test_msg("Debugkillme: %x\n", fiemap->fm_mapped_extents); + + return fiemap; +} + +void check_extent_map(struct fiemap *map) +{ + int i; + unsigned long long datasize = 0; + unsigned long long holesize = 0; + + test_msg("Verifying extent map...\n"); + + for (i = 0; i < map->fm_mapped_extents; i++) { + test_msg("Extent %d, start %llx, length %llx\n", + i, + (long long) map->fm_extents[i].fe_logical, + (long long) map->fm_extents[i].fe_length); + + if (i == 0) + holesize = map->fm_extents[i].fe_logical; + datasize += map->fm_extents[i].fe_length; + } + if (holesize != FSIZE) { + pr_err("Unexpected hole size %llx != %llx\n", + (long long) holesize, (unsigned long long) FSIZE); + exit(1); + } + if (datasize != BUFSIZE) { + pr_err("Unexpected data size %llx != %llx\n", + (long long) datasize, (unsigned long long) BUFSIZE); + exit(1); + } +} + +void compare_file_content(int fildes, int seed) +{ + char ebuf[BUFSIZE]; + char rbuf[BUFSIZE]; + char linkpath[PATH_MAX]; + int fd; + struct fiemap *fiemap; + + ssprintf(linkpath, "/proc/self/fd/%d", fildes); + + fd = open(linkpath, O_RDONLY | O_LARGEFILE); + if (fd < 0) { + pr_perror("Cannot open unlinked file %s", linkpath); + exit(1); + } + + memset(ebuf, 0, BUFSIZE); + + fiemap = read_fiemap(fd); + check_extent_map(fiemap); + free(fiemap); + + lseek64(fd, FSIZE, SEEK_SET); + + create_check_pattern(ebuf, BUFSIZE, seed); + + if (read(fd, rbuf, BUFSIZE) != BUFSIZE) { + pr_perror("Cannot read %i bytes from file", BUFSIZE); + goto failed; + } + + if (memcmp(&ebuf, &rbuf, BUFSIZE)) { + pr_err("Control Block: Data mismatch detected\n"); + goto failed; + } + + close(fd); + return; +failed: + close(fd); + exit(1); +} + +void read_proc_fd_link(int fd, char *buf) +{ + ssize_t res; + char linkpath[PATH_MAX]; + + 
ssprintf(linkpath, "/proc/%d/fd/%d", getpid(), fd); + + res = readlink(linkpath, buf, PATH_MAX - 1); + buf[res] = 0; + if (res < 0) { + pr_perror("Cannot read fd symlink %s", linkpath); + exit(1); + } +} + +int create_unlinked_file(int fileno) +{ + int fd; + char buf[BUFSIZE]; + char fnm[PATH_MAX]; + + ssprintf(fnm, "/unlinked%d", fileno); + fd = open(fnm, O_WRONLY | O_CREAT | O_TRUNC | O_LARGEFILE, 0644); + if (fd < 0) { + pr_perror("Cannot create file %s", fnm); + exit(1); + } + test_msg("Created file: %s, fd %d\n", fnm, fd); + + if (lseek64(fd, FSIZE, SEEK_SET) < 0) { + pr_perror("Cannot seek to offset %llx", FSIZE); + goto failed; + } + test_msg("File positioning done, offset=%llx\n", FSIZE); + + create_check_pattern(&buf[0], BUFSIZE, fileno); + if (write(fd, buf, BUFSIZE) != BUFSIZE) { + pr_perror("Cannot write %i bytes to file", BUFSIZE); + goto failed; + } + test_msg("%i bytes written to file\n", BUFSIZE); + + if (unlink(fnm) < 0) { + pr_perror("Cannot unlink file %s", fnm); + goto failed; + } + test_msg("File %s is unlinked\n", fnm); + + return fd; +failed: + unlink(fnm); + close(fd); + return -1; +} + +int main(int argc, char **argv) +{ + int fd[NFILES] = {0}; + char links[NFILES][PATH_MAX]; + char link[PATH_MAX]; + int count = 0; + int tempfd; + + test_init(argc, argv); + + /* We need to create 10 unlinked files, each is around 1GB in size */ + for (count = 0; count < NFILES; count++) { + + test_msg("Creating unlinked file %d/%d\n", count + 1, NFILES); + tempfd = create_unlinked_file(count); + + if (tempfd < 0) { + pr_err("Cannot create unlinked file %d/%d\n", + count + 1, NFILES); + return 1; + } + + memset(&links[count][0], 0, PATH_MAX); + read_proc_fd_link(tempfd, &links[count][0]); + + fd[count] = tempfd; + } + test_msg("Created %d unlinked files\n", NFILES); + + test_daemon(); + test_msg("Test daemonized, PID %d\n", getpid()); + test_waitsig(); + + test_msg("PID %d resumed, doing final checks...\n", getpid()); + + for (count = 0; count < NFILES; 
count++) { + test_msg("Processing fd #%d (%d)\n", count, fd[count]); + + test_msg("Checking symlink consistency...\n"); + memset(&link[0], 0, PATH_MAX); + read_proc_fd_link(fd[count], &link[0]); + + if (strcmp(&links[count][0], &link[0])) { + pr_err("Symlink target %s has changed to %s\n", + links[count], link); + return 1; + } + + test_msg("Checking file contents...\n"); + compare_file_content(fd[count], count); + + test_msg("Closing file descriptor...\n"); + if (close(fd[count]) == -1) { + pr_perror("Close failed"); + return 1; + } + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/unlink_multiple_largefiles.desc b/CRIU_code/test/zdtm/static/unlink_multiple_largefiles.desc new file mode 100644 index 0000000..b4ab1a8 --- /dev/null +++ b/CRIU_code/test/zdtm/static/unlink_multiple_largefiles.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'opts': '--ghost-limit 1G'} diff --git a/CRIU_code/test/zdtm/static/unlink_regular00.c b/CRIU_code/test/zdtm/static/unlink_regular00.c new file mode 100644 index 0000000..383fabb --- /dev/null +++ b/CRIU_code/test/zdtm/static/unlink_regular00.c @@ -0,0 +1,110 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Checkpointing/restore of unlinked file inside unlinked directory"; +const char *test_author = "Kirill Tkhai "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +#define SUBDIR "subdir" +#define FNAME "testfile" +#define MSG "Hello!!!111" + +int main(int argc, char ** argv) +{ + char subdir[PATH_MAX], fname[PATH_MAX], lname[PATH_MAX]; + char buf[sizeof(MSG) + 1]; + int fd, ret = -1; + + test_init(argc, argv); + + memset(buf, 0, sizeof(buf)); + + if (mkdir(dirname, 0777)) { + fail("can't create %s", dirname); + exit(1); + } + + if (mount("none", dirname, "tmpfs", 0, "") < 0) { + fail("can't mount tmpfs to %s", dirname); + goto rm_topdir; + } + + sprintf(subdir, "%s/" SUBDIR, dirname); + + if 
(mkdir(subdir, 0777)) { + fail("can't create %s", subdir); + goto umount; + } + + sprintf(fname, "%s/" SUBDIR "/" FNAME, dirname); + sprintf(lname, "%s/" FNAME, dirname); + + fd = open(fname, O_RDWR | O_CREAT, 0644); + if (fd < 0) { + fail("can't open %s", fname); + rmdir(subdir); + goto umount; + } + + if (link(fname, lname) < 0) { + fail("can't link %s to %s", fname, lname); + unlink(fname); + rmdir(subdir); + goto umount; + } + + if (unlink(fname) || rmdir(subdir)) { + fail("can't unlink %s or %s", fname, subdir); + goto close_file; + } + + if (write(fd, MSG, sizeof(MSG)) != sizeof(MSG)) { + fail("can't write %s", fname); + goto close_file; + } + + test_daemon(); + test_waitsig(); + + if (lseek(fd, 0, SEEK_SET) != 0) { + fail("can't lseek %s", fname); + goto close_file; + } + + if (read(fd, buf, sizeof(MSG)) != sizeof(MSG)) { + fail("can't read %s", fname); + goto close_file; + } + + if (strcmp(buf, MSG)) { + fail("content differs: %s, %s, sizeof=%zu", + buf, MSG, sizeof(MSG)); + goto close_file; + } + + ret = 0; + pass(); + +close_file: + close(fd); + unlink(lname); +umount: + if (umount(dirname) < 0) + pr_err("Can't umount\n"); +rm_topdir: + if (rmdir(dirname) < 0) + pr_err("Can't rmdir()\n"); + + return ret; +} diff --git a/CRIU_code/test/zdtm/static/unlink_regular00.desc b/CRIU_code/test/zdtm/static/unlink_regular00.desc new file mode 100644 index 0000000..4e3f10a --- /dev/null +++ b/CRIU_code/test/zdtm/static/unlink_regular00.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid', 'opts': '--link-remap'} diff --git a/CRIU_code/test/zdtm/static/unsupported_children_collision.c b/CRIU_code/test/zdtm/static/unsupported_children_collision.c new file mode 100644 index 0000000..15e8f3b --- /dev/null +++ b/CRIU_code/test/zdtm/static/unsupported_children_collision.c @@ -0,0 +1,110 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check unsupported children collision for mounts"; +const char *test_author = "Pavel Tikhomirov "; + 
+char *dirname;
+TEST_OPTION(dirname, string, "directory name", 1);
+
+/*
+ * Build a small mount tree under dirname, hand control to the test
+ * harness (test_daemon()/test_waitsig()), and once that returns unmount
+ * everything again in reverse order of creation.
+ *
+ *   dirname                tmpfs "zdtm_fs", marked MS_PRIVATE
+ *   dirname/share1         tmpfs "share",   marked MS_SHARED
+ *   dirname/share1/child   tmpfs "child1"
+ *   dirname/share2         MS_BIND of share1
+ *   dirname/share2/child   tmpfs "child2"
+ *
+ * NOTE(review): share2 is a bind of the shared share1, so the child2
+ * mount presumably propagates onto share1/child too - that would be the
+ * "children collision" named in test_doc; confirm against the kernel's
+ * shared-subtree rules.
+ *
+ * Returns 0 after pass(); returns 1 as soon as any mkdir/mount/umount
+ * fails (nothing already set up is torn down on those early exits).
+ */
+int main(int argc, char **argv)
+{
+	char share1[PATH_MAX], share2[PATH_MAX];
+	char child1[PATH_MAX], child2[PATH_MAX];
+
+	test_init(argc, argv);
+
+	/* Private tmpfs root that holds the whole tree. */
+	if (mkdir(dirname, 0700)) {
+		pr_perror("mkdir");
+		return 1;
+	}
+
+	if (mount("zdtm_fs", dirname, "tmpfs", 0, NULL)) {
+		pr_perror("mount");
+		return 1;
+	}
+
+	if (mount(NULL, dirname, NULL, MS_PRIVATE, NULL)) {
+		pr_perror("mount");
+		return 1;
+	}
+
+	/* share1: a tmpfs switched to shared propagation. */
+	snprintf(share1, sizeof(share1), "%s/share1", dirname);
+	if (mkdir(share1, 0700)) {
+		pr_perror("mkdir");
+		return 1;
+	}
+
+	if (mount("share", share1, "tmpfs", 0, NULL)) {
+		pr_perror("mount");
+		return 1;
+	}
+
+	if (mount(NULL, share1, NULL, MS_SHARED, NULL)) {
+		pr_perror("mount");
+		return 1;
+	}
+
+	/* First child, mounted through the share1 path. */
+	snprintf(child1, sizeof(child1), "%s/share1/child", dirname);
+	if (mkdir(child1, 0700)) {
+		pr_perror("mkdir");
+		return 1;
+	}
+
+	if (mount("child1", child1, "tmpfs", 0, NULL)) {
+		pr_perror("mount");
+		return 1;
+	}
+
+	/* share2: bind mount of share1. */
+	snprintf(share2, sizeof(share2), "%s/share2", dirname);
+	if (mkdir(share2, 0700)) {
+		pr_perror("mkdir");
+		return 1;
+	}
+
+	if (mount(share1, share2, NULL, MS_BIND, NULL)) {
+		pr_perror("mount");
+		return 1;
+	}
+
+	/*
+	 * Second child, mounted through the share2 path. No mkdir here:
+	 * the "child" directory was created inside share1's tmpfs above
+	 * and share2 is a bind of that same filesystem.
+	 */
+	snprintf(child2, sizeof(child2), "%s/share2/child", dirname);
+	if (mount("child2", child2, "tmpfs", 0, NULL)) {
+		pr_perror("mount");
+		return 1;
+	}
+
+	test_daemon();
+	test_waitsig();
+
+	/* Tear down in reverse order of creation. */
+	if (umount(child2)) {
+		/* Report the path that actually failed (was share1). */
+		pr_perror("Unable to umount %s", child2);
+		return 1;
+	}
+
+	if (umount(share2)) {
+		pr_perror("Unable to umount %s", share2);
+		return 1;
+	}
+
+	if (umount(child1)) {
+		pr_perror("Unable to umount %s", child1);
+		return 1;
+	}
+
+	if (umount(share1)) {
+		pr_perror("Unable to umount %s", share1);
+		return 1;
+	}
+
+	if (umount(dirname)) {
+		pr_perror("Unable to umount %s", dirname);
+		return 1;
+	}
+
+	pass();
+
+	return 0;
+}
diff --git a/CRIU_code/test/zdtm/static/unsupported_children_collision.desc
b/CRIU_code/test/zdtm/static/unsupported_children_collision.desc new file mode 100644 index 0000000..6da6e74 --- /dev/null +++ b/CRIU_code/test/zdtm/static/unsupported_children_collision.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid noauto crfail'} diff --git a/CRIU_code/test/zdtm/static/uptime_grow.c b/CRIU_code/test/zdtm/static/uptime_grow.c new file mode 100644 index 0000000..6d99509 --- /dev/null +++ b/CRIU_code/test/zdtm/static/uptime_grow.c @@ -0,0 +1,51 @@ +#include "zdtmtst.h" + +const char *test_doc = "test to ensure that monotonic clock doesn't decrease"; +const char *test_author = "Evgeny Antysev "; + +#include +#include + +# define tv_ge(a, b) \ + (((a)->tv_sec == (b)->tv_sec) ? \ + ((a)->tv_nsec >= (b)->tv_nsec) : \ + ((a)->tv_sec > (b)->tv_sec)) + +int main(int argc, char **argv) +{ + struct timespec tm_old, tm, ts; + double diff_nsec; + ts.tv_sec = 0; + ts.tv_nsec = 1000000; + + test_init(argc, argv); + + if (clock_gettime(CLOCK_MONOTONIC, &tm_old)) { + pr_perror("clock_gettime failed"); + exit(1); + } + + test_daemon(); + + while (test_go()) { + if (clock_gettime(CLOCK_MONOTONIC, &tm)) { + pr_perror("clock_gettime failed"); + exit(1); + } + if (!tv_ge(&tm, &tm_old)) { + diff_nsec = (tm_old.tv_sec - tm.tv_sec) * 1.0E9 +\ + (tm_old.tv_nsec - tm.tv_nsec); + fail("clock step backward for %e nsec\n", diff_nsec); + exit(1); + } + tm_old = tm; + /* + Kernel can't suspend container by design if calls + clock_gettime() in a loop, so we need to sleep + between clock_gettime(). 
+ */ + nanosleep(&ts, NULL); + } + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/uptime_grow.desc b/CRIU_code/test/zdtm/static/uptime_grow.desc new file mode 100644 index 0000000..95c58b4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/uptime_grow.desc @@ -0,0 +1 @@ +{'flags': 'noauto'} diff --git a/CRIU_code/test/zdtm/static/utsname.c b/CRIU_code/test/zdtm/static/utsname.c new file mode 100644 index 0000000..964548a --- /dev/null +++ b/CRIU_code/test/zdtm/static/utsname.c @@ -0,0 +1,46 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that utsname hasn't changed"; +const char *test_author = "Pavel Emelianov "; + +static struct utsname after; + +#define ZDTM_NODE "zdtm.nodename.ru" +#define ZDTM_DOMAIN "zdtm.nodename.ru" + +int main(int argc, char **argv) +{ + test_init(argc, argv); + + if (sethostname(ZDTM_NODE, sizeof(ZDTM_NODE))) { + pr_perror("Unable to set hostname"); + return 1; + } + + if (setdomainname(ZDTM_DOMAIN, sizeof(ZDTM_DOMAIN))) { + pr_perror("Unable to set domainname"); + return 1; + } + + test_daemon(); + test_waitsig(); + + uname(&after); + + if (strcmp(ZDTM_NODE, after.nodename)) { + fail("Nodename doesn't match"); + return 1; + } + if (strcmp(ZDTM_DOMAIN, after.domainname)) { + fail("Domainname doesn't match"); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/utsname.desc b/CRIU_code/test/zdtm/static/utsname.desc new file mode 100644 index 0000000..7657ba4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/utsname.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/vdso-proxy.c b/CRIU_code/test/zdtm/static/vdso-proxy.c new file mode 100644 index 0000000..2946eb7 --- /dev/null +++ b/CRIU_code/test/zdtm/static/vdso-proxy.c @@ -0,0 +1,170 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Compare mappings before/after C/R for vdso/vvar presence. 
Should run iterative under vdso proxy fault-injection.\n"; +const char *test_author = "Dmitry Safonov "; + +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) +#define VSYSCALL_START 0xffffffffff600000ULL +/* + * Use constant MAX_VMAS - to minimize the risk of allocating a new + * mapping or changing the size of existent VMA with realloc() + */ +#define MAX_VMAS 80 +#define BUF_SIZE 1024 + +/* + * After C/R with vdso trampolines insertion, there should + * be added one or two vmas: vdso and possibly vvar. + * We need to check that nr. vmas after C/R <= +2 new vmas. + * Also previous vdso/vvar vma should still be present after C/R. + */ +struct vm_area { + /* + * Intentionally use 64bit integer to make sure that it's possible + * to parse mappings >4Gb - those might appear on ia32 + * that's restored by x86_64 CRIU ¯\(°_o)/¯ + */ + uint64_t start; + uint64_t end; + bool is_vvar_or_vdso; +}; + +static char buf[BUF_SIZE]; + +static int parse_maps(struct vm_area *vmas) +{ + FILE *maps; + int i; + + maps = fopen("/proc/self/maps", "r"); + if (maps == NULL) { + pr_err("Failed to open maps file: %m\n"); + return -1; + } + + for (i = 0; i < MAX_VMAS; i++) { + struct vm_area *v = &vmas[i]; + char *end; + + if (fgets(buf, BUF_SIZE, maps) == NULL) + break; + + v->start = strtoull(buf, &end, 16); + v->end = strtoull(end + 1, NULL, 16); + +#if defined(__i386__) + /* + * XXX: ia32 is being restored from x86_64 and leaves + * emulated vsyscall "mapping". Hopefully, will be done + * per-process, ignore for now. 
+ */ + if (v->start == VSYSCALL_START) { + i--; + continue; + } +#endif + v->is_vvar_or_vdso |= strstr(buf, "[vdso]") != NULL; + v->is_vvar_or_vdso |= strstr(buf, "[vvar]") != NULL; + test_msg("[NOTE]\tVMA: [%#" PRIx64 ", %#" PRIx64 "]\n", + v->start, v->end); + } + + if (fclose(maps)) { + pr_err("Failed to close maps file: %m\n"); + return -1; + } + + if (i == MAX_VMAS) { + pr_err("Number of VMAs is bigger than reserved array's size\n"); + return -1; + } + + return i; +} + +int compare_vmas(struct vm_area *vmax, struct vm_area *vmay) +{ + if (vmax->start > vmay->start) + return 1; + if (vmax->start < vmay->start) + return -1; + if (vmax->end > vmay->end) + return 1; + if (vmax->end < vmay->end) + return -1; + + return 0; +} + +static int check_vvar_vdso(struct vm_area *before, struct vm_area *after) +{ + int i, j = 0; + + for (i = 0; i < MAX_VMAS && j < MAX_VMAS; i++, j++) { + int cmp = compare_vmas(&before[i], &after[j]); + + if (cmp == 0) + continue; + + if (cmp < 0) {/* Lost mapping */ + test_msg("[NOTE]\tLost mapping: %#" PRIx64 "-%#" PRIx64 "\n", + before[i].start, before[i].end); + j--; + if (before[i].is_vvar_or_vdso) { + fail("Lost vvar/vdso mapping"); + return -1; + } + continue; + } + + test_msg("[NOTE]\tNew mapping appeared: %#" PRIx64 "-%#" PRIx64 "\n", + after[j].start, after[j].end); + i--; + } + + return 0; +} + +static struct vm_area vmas_before[MAX_VMAS]; +static struct vm_area vmas_after[MAX_VMAS]; + +int main(int argc, char *argv[]) +{ + int nr_before, nr_after; + + test_init(argc, argv); + + test_msg("[NOTE]\tMappings before:\n"); + nr_before = parse_maps(vmas_before); + if (nr_before < 0) { + pr_perror("Failed to parse maps"); + return -1; + } + + test_daemon(); + test_waitsig(); + + test_msg("[NOTE]\tMappings after:\n"); + nr_after = parse_maps(vmas_after); + if (nr_after < 0) { + pr_perror("Failed to parse maps"); + return -1; + } + + /* After restore vDSO/VVAR blobs must remain in the old place. 
*/ + if (check_vvar_vdso(vmas_before, vmas_after)) + return -1; + + if (nr_before + 2 < nr_after) { + fail("There is more than two (VVAR/vDSO) vmas added after C/R"); + return -1; + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/vdso00.c b/CRIU_code/test/zdtm/static/vdso00.c new file mode 100644 index 0000000..8ac4cca --- /dev/null +++ b/CRIU_code/test/zdtm/static/vdso00.c @@ -0,0 +1,34 @@ +#include +#include + +#include + +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "Check if we can use vDSO after restore\n"; +const char *test_author = "Cyrill Gorcunov +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check if we can use vDSO using direct vDSO calls\n"; +const char *test_author = "Cyrill Gorcunov offset == VDSO_BAD_ADDR && s->name[0] == '\0'; +} + +enum { + VDSO_SYMBOL_CLOCK_GETTIME, + VDSO_SYMBOL_GETCPU, + VDSO_SYMBOL_GETTIMEOFDAY, + VDSO_SYMBOL_TIME, + + VDSO_SYMBOL_MAX +}; + +const char *vdso_symbols[VDSO_SYMBOL_MAX] = { + [VDSO_SYMBOL_CLOCK_GETTIME] = "__vdso_clock_gettime", + [VDSO_SYMBOL_GETCPU] = "__vdso_getcpu", + [VDSO_SYMBOL_GETTIMEOFDAY] = "__vdso_gettimeofday", + [VDSO_SYMBOL_TIME] = "__vdso_time", +}; + +struct vdso_symtable { + unsigned long vma_start; + unsigned long vma_end; + struct vdso_symbol symbols[VDSO_SYMBOL_MAX]; +}; + +#define VDSO_SYMTABLE_INIT \ + { \ + .vma_start = VDSO_BAD_ADDR, \ + .vma_end = VDSO_BAD_ADDR, \ + .symbols = { \ + [0 ... 
VDSO_SYMBOL_MAX - 1] = \ + (struct vdso_symbol)VDSO_SYMBOL_INIT, \ + }, \ + } + +static bool __ptr_oob(void *ptr, void *start, size_t size) +{ + void *end = (void *)((unsigned long)start + size); + return ptr > end || ptr < start; +} + +static unsigned long elf_hash(const unsigned char *name) +{ + unsigned long h = 0, g; + + while (*name) { + h = (h << 4) + *name++; + g = h & 0xf0000000ul; + if (g) + h ^= g >> 24; + h &= ~g; + } + return h; +} + +static int vdso_fill_symtable(char *mem, size_t size, struct vdso_symtable *t) +{ + Phdr_t *dynamic = NULL, *load = NULL; + Ehdr_t *ehdr = (void *)mem; + Dyn_t *dyn_strtab = NULL; + Dyn_t *dyn_symtab = NULL; + Dyn_t *dyn_strsz = NULL; + Dyn_t *dyn_syment = NULL; + Dyn_t *dyn_hash = NULL; + Word_t *hash = NULL; + Phdr_t *phdr; + Dyn_t *d; + + Word_t *bucket, *chain; + Word_t nbucket, nchain; + + char *dynsymbol_names; + unsigned int i, j, k; + + BUILD_BUG_ON(sizeof(elf_ident) != sizeof(ehdr->e_ident)); + + test_msg("Parsing at %lx %lx\n", (long)mem, (long)mem + (long)size); + + /* + * Make sure it's a file we support. + */ + if (memcmp(ehdr->e_ident, elf_ident, sizeof(elf_ident))) { + pr_perror("Elf header magic mismatch"); + return -EINVAL; + } + + /* + * We need PT_LOAD and PT_DYNAMIC here. Each once. + */ + phdr = (void *)&mem[ehdr->e_phoff]; + for (i = 0; i < ehdr->e_phnum; i++, phdr++) { + if (__ptr_oob(phdr, mem, size)) + goto err_oob; + switch (phdr->p_type) { + case PT_DYNAMIC: + if (dynamic) { + pr_perror("Second PT_DYNAMIC header"); + return -EINVAL; + } + dynamic = phdr; + break; + case PT_LOAD: + if (load) { + pr_perror("Second PT_LOAD header"); + return -EINVAL; + } + load = phdr; + break; + } + } + + if (!load || !dynamic) { + pr_perror("One of obligated program headers is missed"); + return -EINVAL; + } + + test_msg("PT_LOAD p_vaddr: %lx\n", (unsigned long)load->p_vaddr); + + /* + * Dynamic section tags should provide us the rest of information + * needed. Note that we're interested in a small set of tags. 
+ */ + d = (void *)&mem[dynamic->p_offset]; + for (i = 0; i < dynamic->p_filesz / sizeof(*d); i++, d++) { + if (__ptr_oob(d, mem, size)) + goto err_oob; + + if (d->d_tag == DT_NULL) { + break; + } else if (d->d_tag == DT_STRTAB) { + dyn_strtab = d; + } else if (d->d_tag == DT_SYMTAB) { + dyn_symtab = d; + } else if (d->d_tag == DT_STRSZ) { + dyn_strsz = d; + } else if (d->d_tag == DT_SYMENT) { + dyn_syment = d; + } else if (d->d_tag == DT_HASH) { + dyn_hash = d; + } + } + + if (!dyn_strtab || !dyn_symtab || !dyn_strsz || !dyn_syment || !dyn_hash) { + pr_perror("Not all dynamic entries are present"); + return -EINVAL; + } + + dynsymbol_names = &mem[dyn_strtab->d_un.d_val - load->p_vaddr]; + if (__ptr_oob(dynsymbol_names, mem, size)) + goto err_oob; + + hash = (void *)&mem[(unsigned long)dyn_hash->d_un.d_ptr - (unsigned long)load->p_vaddr]; + if (__ptr_oob(hash, mem, size)) + goto err_oob; + + nbucket = hash[0]; + nchain = hash[1]; + bucket = &hash[2]; + chain = &hash[nbucket + 2]; + + test_msg("nbucket %lu nchain %lu bucket %p chain %p\n", + (long)nbucket, (long)nchain, bucket, chain); + + for (i = 0; i < ARRAY_SIZE(vdso_symbols); i++) { + k = elf_hash((const unsigned char *)vdso_symbols[i]); + + for (j = bucket[k % nbucket]; j < nchain && j != STN_UNDEF; j = chain[j]) { + Sym_t *sym = (void *)&mem[dyn_symtab->d_un.d_ptr - load->p_vaddr]; + char *name; + + sym = &sym[j]; + if (__ptr_oob(sym, mem, size)) + continue; + + if (ELF_ST_TYPE(sym->st_info) != STT_FUNC && + ELF_ST_BIND(sym->st_info) != STB_GLOBAL) + continue; + + name = &dynsymbol_names[sym->st_name]; + if (__ptr_oob(name, mem, size)) + continue; + + if (strcmp(name, vdso_symbols[i])) + continue; + + memcpy(t->symbols[i].name, name, sizeof(t->symbols[i].name)); + t->symbols[i].offset = (unsigned long)sym->st_value - load->p_vaddr; + test_msg("symbol %s offset %lx\n", t->symbols[i].name, t->symbols[i].offset); + break; + } + } + + return 0; + +err_oob: + pr_perror("Corrupted Elf data"); + return -EFAULT; +} + 
+static int vdso_fill_self_symtable(struct vdso_symtable *s) +{ + char buf[512]; + int ret = -1; + FILE *maps; + + *s = (struct vdso_symtable)VDSO_SYMTABLE_INIT; + + maps = fopen("/proc/self/maps", "r"); + if (!maps) { + pr_perror("Can't open self-vma"); + return -1; + } + + while (fgets(buf, sizeof(buf), maps)) { + unsigned long start, end; + + if (!strstr(buf, "[vdso]")) + continue; + + ret = sscanf(buf, "%lx-%lx", &start, &end); + if (ret != 2) { + ret = -1; + pr_perror("Can't find vDSO bounds"); + goto err; + } + + s->vma_start = start; + s->vma_end = end; + + ret = vdso_fill_symtable((void *)start, end - start, s); + break; + } + + test_msg("[vdso] %lx-%lx\n", s->vma_start, s->vma_end); +err: + fclose(maps); + return ret; +} + +static int vdso_clock_gettime_handler(void *func) +{ + __vdso_clock_gettime_t *vdso_clock_gettime = func; + struct timespec ts1, ts2; + + clock_gettime(CLOCK_REALTIME, &ts1); + vdso_clock_gettime(CLOCK_REALTIME, &ts2); + + test_msg("clock_gettime: tv_sec %li vdso_clock_gettime: tv_sec %li\n", + ts1.tv_sec, ts2.tv_sec); + + if (labs(ts1.tv_sec - ts2.tv_sec) > TIME_DELTA_SEC) { + pr_perror("Delta is too big"); + return -1; + } + + return 0; +} + +static int vdso_getcpu_handler(void *func) +{ + __vdso_getcpu_t *vdso_getcpu = func; + unsigned cpu, node; + + vdso_getcpu(&cpu, &node, NULL); + test_msg("vdso_getcpu: cpu %d node %d\n", cpu, node); + + return 0; +} + +static int vdso_gettimeofday_handler(void *func) +{ + __vdso_gettimeofday_t *vdso_gettimeofday = func; + struct timeval tv1, tv2; + struct timezone tz; + + gettimeofday(&tv1, &tz); + vdso_gettimeofday(&tv2, &tz); + + test_msg("gettimeofday: tv_sec %li vdso_gettimeofday: tv_sec %li\n", + tv1.tv_sec, tv2.tv_sec); + + if (labs(tv1.tv_sec - tv2.tv_sec) > TIME_DELTA_SEC) { + pr_perror("Delta is too big"); + return -1; + } + + return 0; +} + +static int vdso_time_handler(void *func) +{ + __vdso_time_t *vdso_time = func; + time_t t1, t2; + + t1 = time(NULL); + t2 = vdso_time(NULL); + + 
test_msg("time: %li vdso_time: %li\n", (long)t1, (long)t1); + + if (labs(t1 - t2) > TIME_DELTA_SEC) { + pr_perror("Delta is too big"); + return -1; + } + + return 0; +} + +static int call_handlers(struct vdso_symtable *symtable) +{ + typedef int (handler_t)(void *func); + handler_t *handlers[VDSO_SYMBOL_MAX] = { + [VDSO_SYMBOL_CLOCK_GETTIME] = vdso_clock_gettime_handler, + [VDSO_SYMBOL_GETCPU] = vdso_getcpu_handler, + [VDSO_SYMBOL_GETTIMEOFDAY] = vdso_gettimeofday_handler, + [VDSO_SYMBOL_TIME] = vdso_time_handler, + }; + size_t i; + + for (i = 0; i < ARRAY_SIZE(symtable->symbols); i++) { + struct vdso_symbol *s = &symtable->symbols[i]; + handler_t *func; + + if (vdso_symbol_empty(s) || i > ARRAY_SIZE(handlers)) + continue; + func = handlers[i]; + + if (func((void *)(s->offset + symtable->vma_start))) { + pr_perror("Handler error"); + return -1; + } + } + + return 0; +} + +int main(int argc, char *argv[]) +{ + struct vdso_symtable symtable; + + test_init(argc, argv); + + if (vdso_fill_self_symtable(&symtable)) { + pr_perror("Failed to parse vdso"); + return -1; + } + + if (call_handlers(&symtable)) + return -1; + + test_daemon(); + test_waitsig(); + + /* + * After restore the vDSO must remain in old place. + */ + if (call_handlers(&symtable)) { + fail("Failed to call vdso handlers from symtable after C/R"); + return -1; + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/vdso01.desc b/CRIU_code/test/zdtm/static/vdso01.desc new file mode 100644 index 0000000..d2f501d --- /dev/null +++ b/CRIU_code/test/zdtm/static/vdso01.desc @@ -0,0 +1 @@ +{'arch': 'x86_64'} diff --git a/CRIU_code/test/zdtm/static/vdso02.c b/CRIU_code/test/zdtm/static/vdso02.c new file mode 100644 index 0000000..f0047bc --- /dev/null +++ b/CRIU_code/test/zdtm/static/vdso02.c @@ -0,0 +1,231 @@ +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Restoring task with unmapped vDSO blob. 
Poor man's test for C/R on vdso64_enabled=0 booted kernel.\n"; +const char *test_author = "Dmitry Safonov "; + +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) +#define VDSO_BAD_ADDR (-1ul) +#define VVAR_BAD_ADDR (-1ul) +#define BUF_SZ 1024 + +struct vm_area { + unsigned long start; + unsigned long end; +}; + +static int parse_vm_area(char *buf, struct vm_area *vma) +{ + if (sscanf(buf, "%lx-%lx", &vma->start, &vma->end) == 2) + return 0; + + pr_perror("Can't find VMA bounds"); + return -1; +} + +static int find_blobs(pid_t pid, struct vm_area *vdso, struct vm_area *vvar) +{ + char buf[BUF_SZ]; + int ret = -1; + FILE *maps; + + vdso->start = VDSO_BAD_ADDR; + vdso->end = VDSO_BAD_ADDR; + vvar->start = VVAR_BAD_ADDR; + vvar->end = VVAR_BAD_ADDR; + + if (snprintf(buf, BUF_SZ, "/proc/%d/maps", pid) < 0) { + pr_perror("snprintf() failure for path"); + return -1; + } + + maps = fopen(buf, "r"); + if (!maps) { + pr_perror("Can't open maps for %d", pid); + return -1; + } + + while (fgets(buf, sizeof(buf), maps)) { + if (strstr(buf, "[vdso]") && parse_vm_area(buf, vdso)) + goto err; + + if (strstr(buf, "[vvar]") && parse_vm_area(buf, vvar)) + goto err; + } + + if (vdso->start != VDSO_BAD_ADDR) + test_msg("[vdso] %lx-%lx\n", vdso->start, vdso->end); + if (vvar->start != VVAR_BAD_ADDR) + test_msg("[vvar] %lx-%lx\n", vvar->start, vvar->end); + ret = 0; +err: + fclose(maps); + return ret; +} + +#ifdef __i386__ +/* + * On i386 syscalls for speed are optimized trough vdso, + * call raw int80 as vdso is unmapped. 
+ */ +#define __NR32_munmap 91 +#define __NR32_kill 37 +#define __NR32_exit 1 +struct syscall_args32 { + uint32_t nr, arg0, arg1; +}; + +static inline void do_full_int80(struct syscall_args32 *args) +{ + asm volatile ( + "int $0x80\n\t" + : "+a" (args->nr), + "+b" (args->arg0), "+c" (args->arg1)); +} + +int sys_munmap(void *addr, size_t len) +{ + struct syscall_args32 s = {0}; + + s.nr = __NR32_munmap; + s.arg0 = (uint32_t)(uintptr_t)addr; + s.arg1 = (uint32_t)len; + + do_full_int80(&s); + + return (int)s.nr; +} + +int sys_kill(pid_t pid, int sig) +{ + struct syscall_args32 s = {0}; + + s.nr = __NR32_kill; + s.arg0 = (uint32_t)pid; + s.arg1 = (uint32_t)sig; + + do_full_int80(&s); + + return (int)s.nr; +} + +void sys_exit(int status) +{ + struct syscall_args32 s = {0}; + + s.nr = __NR32_exit; + s.arg0 = (uint32_t)status; + + do_full_int80(&s); +} + +#else /* !__i386__ */ + +int sys_munmap(void *addr, size_t len) +{ + return syscall(SYS_munmap, addr, len); +} + +int sys_kill(pid_t pid, int sig) +{ + return syscall(SYS_kill, pid, sig); +} + +void sys_exit(int status) +{ + syscall(SYS_exit, status); +} + +#endif + +static int unmap_blobs(void) +{ + struct vm_area vdso, vvar; + int ret; + + if (find_blobs(getpid(), &vdso, &vvar)) + return -1; + + if (vdso.start != VDSO_BAD_ADDR) { + ret = sys_munmap((void*)vdso.start, vdso.end - vdso.start); + if (ret) + return ret; + } + if (vvar.start != VVAR_BAD_ADDR) { + ret = sys_munmap((void*)vvar.start, vvar.end - vvar.start); + if (ret) + return ret; + } + + return 0; +} + +int main(int argc, char *argv[]) +{ + struct vm_area vdso, vvar; + pid_t child; + int status, ret = -1; + + test_init(argc, argv); + + child = fork(); + if (child < 0) { + pr_perror("fork() failed"); + exit(1); + } + + if (child == 0) { + child = getpid(); + if (unmap_blobs() < 0) + syscall(SYS_exit, 1); + sys_kill(child, SIGSTOP); + sys_exit(2); + } + + waitpid(child, &status, WUNTRACED); + if (WIFEXITED(status)) { + int ret = WEXITSTATUS(status); + + 
pr_err("Child unexpectedly exited with %d\n", ret); + goto out_kill; + } else if (WIFSIGNALED(status)) { + int sig = WTERMSIG(status); + + pr_err("Child unexpectedly signaled with %d: %s\n", + sig, strsignal(sig)); + goto out_kill; + } else if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGSTOP) { + pr_err("Child is unstoppable or was stopped by other means\n"); + goto out_kill; + } + + if (find_blobs(child, &vdso, &vvar)) + goto out_kill; + if (vdso.start != VDSO_BAD_ADDR || vvar.start != VVAR_BAD_ADDR) { + pr_err("Found vvar or vdso blob(s) in child, which should have unmapped them\n"); + goto out_kill; + } + + test_daemon(); + test_waitsig(); + + if (find_blobs(child, &vdso, &vvar)) + goto out_kill; + if (vdso.start != VDSO_BAD_ADDR || vvar.start != VVAR_BAD_ADDR) { + pr_err("Child without vdso got it after C/R\n"); + fail(); + goto out_kill; + } + + pass(); + + ret = 0; +out_kill: + kill(child, SIGKILL); + return ret; +} diff --git a/CRIU_code/test/zdtm/static/vfork00.c b/CRIU_code/test/zdtm/static/vfork00.c new file mode 100644 index 0000000..002ddee --- /dev/null +++ b/CRIU_code/test/zdtm/static/vfork00.c @@ -0,0 +1,80 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Block migration by a pending (non-exec()-ed) vfork()"; +const char *test_author = "Pavel Emelianov "; + +int main(int argc, char ** argv) +{ + int ret = 0; + pid_t pid; + + test_init(argc, argv); + + /* vfork() won't let us control the test, so fork() first, and vfork() + * in the child */ + pid = fork(); + if (pid < 0) { + pr_err("fork failed: %m"); + exit(1); + } + + if (pid == 0) { + int ret2; + + pid = vfork(); + if (pid < 0) + ret = errno; + + /* wait for signal in _both_ branches */ + test_waitsig(); + + /* vforked guy shouldn't return, hence we _exit() */ + if (pid == 0) + _exit(0); + + if (wait(&ret2) != pid) + ret = errno; + + _exit(ret); + } + + test_daemon(); + test_waitsig(); + + /* signal the whole 
process group, because our child is suspended until + * the grand-child has exec()-ed, but we don't know the pid of the + * latter */ + if (kill(0, SIGTERM)) { + fail("terminating the children failed: %m"); + exit(1); + } + + if (wait(&ret) != pid) { + fail("wait() returned wrong pid: %m"); + exit(1); + } + + if (WIFEXITED(ret)) { + ret = WEXITSTATUS(ret); + if (ret) { + fail("child exited with nonzero code %d (%s)", ret, strerror(ret)); + exit(1); + } + } + if (WIFSIGNALED(ret)) { + fail("child exited on unexpected signal %d", WTERMSIG(ret)); + exit(1); + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/vfork00.desc b/CRIU_code/test/zdtm/static/vfork00.desc new file mode 100644 index 0000000..802caed --- /dev/null +++ b/CRIU_code/test/zdtm/static/vfork00.desc @@ -0,0 +1 @@ +{'flags': 'noauto crfail'} diff --git a/CRIU_code/test/zdtm/static/vsx.c b/CRIU_code/test/zdtm/static/vsx.c new file mode 100644 index 0000000..be02cfe --- /dev/null +++ b/CRIU_code/test/zdtm/static/vsx.c @@ -0,0 +1,400 @@ +#include +#include +#include + +#include + +#include "zdtmtst.h" + +/* + * This test is specific to PowerPC + */ +#ifndef _ARCH_PPC64 +int main(int argc, char *argv[]) +{ + test_init(argc, argv); + skip("Unsupported arch"); + return 0; +} + +#else + +#include +#include + +/* + * This test verifies that data stored in the VSX registers are still there + * once the restart is done. + * + * The test is filling the registers with dedicated values and then check + * their content. 
+ */ + +const char *test_doc = "Test if data in vector registers do survive the c/r"; +const char *test_author = "Laurent Dufour "; + +int is_test_doable(void) +{ + unsigned long val; + + val = getauxval(AT_HWCAP); +#define CHECK_FEATURE(f) do { \ + if (!(val & f)) { \ + test_msg("CPU feature " #f " is missing\n"); \ + return 0; \ + } \ + } while(0) + + CHECK_FEATURE(PPC_FEATURE_64); + CHECK_FEATURE(PPC_FEATURE_HAS_ALTIVEC); + CHECK_FEATURE(PPC_FEATURE_HAS_VSX); + return 1; +} + +void fill_vsx(uint64_t *pt) +{ + asm volatile( + "lis 3, 0 \n" + + "lxvd2x 0, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 1, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 2, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 3, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 4, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 5, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 6, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 7, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 8, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 9, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + + "lxvd2x 10, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 11, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 12, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 13, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 14, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 15, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 16, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 17, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 18, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 19, 3, %0 \n" + "addi 3, 3, 16 \n" /* 
move to the next qword */ + + "lxvd2x 20, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 21, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 22, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 23, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 24, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 25, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 26, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 27, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 28, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 29, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + + "lxvd2x 30, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 31, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 32, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 33, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 34, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 35, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 36, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 37, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 38, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 39, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + + "lxvd2x 40, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 41, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 42, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 43, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 44, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 45, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 46, 3, %0 \n" + "addi 3, 3, 16 
\n" /* move to the next qword */ + "lxvd2x 47, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 48, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 49, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + + "lxvd2x 50, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 51, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 52, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 53, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 54, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 55, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 56, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 57, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 58, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 59, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + + "lxvd2x 60, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 61, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 62, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "lxvd2x 63, 3, %0 \n" + : /* no output */ + : "r" (pt) + : "3"); +} + +void read_vsx(uint64_t *pt) +{ + asm volatile( + "lis 3, 0 \n" + + "stxvd2x 0, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 1, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 2, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 3, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 4, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 5, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 6, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 7, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 8, 3, %0 \n" + "addi 3, 3, 16 \n" /* 
move to the next qword */ + "stxvd2x 9, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + + "stxvd2x 10, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 11, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 12, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 13, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 14, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 15, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 16, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 17, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 18, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 19, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + + "stxvd2x 20, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 21, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 22, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 23, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 24, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 25, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 26, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 27, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 28, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 29, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + + "stxvd2x 30, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 31, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 32, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 33, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 34, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 35, 3, 
%0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 36, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 37, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 38, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 39, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + + "stxvd2x 40, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 41, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 42, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 43, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 44, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 45, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 46, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 47, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 48, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 49, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + + "stxvd2x 50, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 51, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 52, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 53, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 54, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 55, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 56, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 57, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 58, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 59, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + + "stxvd2x 60, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 61, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the 
next qword */ + "stxvd2x 62, 3, %0 \n" + "addi 3, 3, 16 \n" /* move to the next qword */ + "stxvd2x 63, 3, %0 \n" + + : /* no output */ + : "r" (pt) + : "3"); +} + +int main(int argc, char *argv[]) +{ + /* A random buffer of 1024 bytes (64 * 128bit registers to fill) */ + static const char ibuffer[128/8*64]= + "sahwoleiGun9loosliz0Aech9aiph5eiIengei7Ogh8zu7ye" + "Aeshie6vai0thaehool1ooK6ayaj3Neitahn8yeeh5ahfuiT" + "uCeir1bife4ieceema8choo2Wengaec1seDaidohteipa4ai" + "aequee7AiriejaeJar1giak8Gei2uathloh5uemaeG6EiSoo" + "PhaenaethoPhej8nEecheegeihosho8Zohroo8ea6Juuheif" + "nu2Hahvai1tuf0Zeeeveephu2EitaexiVaekieboac7Nushu" + "aeTh6Quoo3iozeisaudaGheed0aPah2Schoog0eiChaeN5su" + "xoo1phoic1mahXohSai1thoogo0oesooeaxai7eBahHahMue" + "quiloh2ooPahpiujeithae0Dau0shuwicobinaaYooj0ajiw" + "iiheeS4awoh3haevlaiGe8phaev3eiluaChaF6ieng4aith4" + "aif3TooYo1aigoomZiuhai8eesoo4maiLahr3PoM8Eir5ooz" + "Iequ9ahre4Op4bahaiso6ohnah8Shokimooch1Oafahf5aih" + "xohphee1pi5Iecaiaigh7Eisah2uew5acie7wi6Zo0Eelah9" + "woi8QueerohfeiThaBoh5jaic3peiPohAhng0bu5shoop7ca" + "Qui5kodaika8quioahmohreeVe8loquaeeLi5ze3oceiHa0l" + "roh8Ooxae7uish9ioog7ieS3aibeo2thOosiuvaiS5lohp4U" + "emieG0eit6Bien8EzaiwiTh3geighaexshee8eHiec1TooH2" + "Eeceacai0inaejieboo8NeishieweiraHooj9apeecooy0th" + "daThei6aexeisahdsei3keik0diPheejchais6ezo0iep5Ae" + "Wiqu6aepeing4ba8diek3aev9waYooveAebai9eef6Iex6vo" + "Quee9MeitahMighoHuo3seveeMoh3ohtoxaib6ootaiF5EeT" + "Ohb9eijoonoh6ich"; + char obuffer[128/8*64]; + int do_test; + + test_init(argc, argv); + + do_test = is_test_doable(); + + if (do_test) { + memset(obuffer, 0xFF, sizeof(obuffer)); + fill_vsx((uint64_t *)ibuffer); + } + + test_daemon(); + test_waitsig(); + + if (do_test) { + read_vsx((uint64_t *)obuffer); + + if (!memcmp(ibuffer, obuffer, sizeof(ibuffer))) + pass(); + else { + test_msg("Data mismatch\n"); + fail(); + } + } + else { + test_msg("The CPU is missing some features.\n"); + fail(); + } + + return 0; +} + +#endif /* _ARCH_PPC64 */ diff --git 
a/CRIU_code/test/zdtm/static/vsx.desc b/CRIU_code/test/zdtm/static/vsx.desc new file mode 100644 index 0000000..2ba6eda --- /dev/null +++ b/CRIU_code/test/zdtm/static/vsx.desc @@ -0,0 +1 @@ +{'arch': 'ppc64le'} diff --git a/CRIU_code/test/zdtm/static/vt.c b/CRIU_code/test/zdtm/static/vt.c new file mode 100644 index 0000000..0d843c4 --- /dev/null +++ b/CRIU_code/test/zdtm/static/vt.c @@ -0,0 +1,64 @@ +#include +#include +#include +#include +#include + +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check c/r of a virtual terminal"; +const char *test_author = "Ruslan Kuprieiev "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +#ifdef __s390x__ +#define MINOR 64 /* ttyS0 */ +#else +#define MINOR 5 +#endif + +int main(int argc, char **argv) +{ + struct stat st1, st2; + int fd; + + test_init(argc, argv); + + if (mknod(filename, S_IFCHR | S_IRUSR | S_IWUSR, makedev(4, MINOR))) { + pr_perror("Can't create virtual terminal %s", filename); + return 1; + } + + fd = open(filename, O_RDONLY); + if (fd < 0) { + pr_perror("Open virtual terminal %s failed", filename); + return 1; + } + + if (fstat(fd, &st1)) { + pr_perror("Can't stat %s virtual terminal", filename); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (fstat(fd, &st2)) { + pr_perror("Can't stat %s virtual terminal", filename); + return 1; + } + + if (st1.st_rdev != st2.st_rdev) { + fail("Virtual terminal rdev mismatch %x != %x on %s", + (int)st1.st_rdev, (int)st2.st_rdev, + filename); + return 1; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/vt.desc b/CRIU_code/test/zdtm/static/vt.desc new file mode 100644 index 0000000..d969725 --- /dev/null +++ b/CRIU_code/test/zdtm/static/vt.desc @@ -0,0 +1 @@ +{'flavor': 'h ns', 'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/static/wait00.c b/CRIU_code/test/zdtm/static/wait00.c new file mode 100644 index 0000000..f16505c --- /dev/null +++ b/CRIU_code/test/zdtm/static/wait00.c @@ -0,0 +1,61 @@ 
+#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "See if we can wait() for a child after migration"; +const char *test_author = "Roman Kagan "; + +int main(int argc, char ** argv) +{ + int ret; + pid_t pid; + + test_init(argc, argv); + + pid = fork(); + if (pid < 0) { + pr_perror("fork failed"); + exit(1); + } + + if (pid == 0) { + test_waitsig(); + _exit(0); + } + + test_daemon(); + test_waitsig(); + + if (kill(pid, SIGTERM)) { + fail("terminating the child failed: %m\n"); + goto out; + } + + if (wait(&ret) != pid) { + fail("wait() returned wrong pid: %m\n"); + goto out; + } + + if (WIFEXITED(ret)) { + ret = WEXITSTATUS(ret); + if (ret) { + fail("child exited with nonzero code %d (%s)\n", ret, strerror(ret)); + goto out; + } + } + if (WIFSIGNALED(ret)) { + fail("child exited on unexpected signal %d\n", WTERMSIG(ret)); + goto out; + } + + pass(); + +out: + return 0; +} diff --git a/CRIU_code/test/zdtm/static/write_read00.c b/CRIU_code/test/zdtm/static/write_read00.c new file mode 100644 index 0000000..1648e35 --- /dev/null +++ b/CRIU_code/test/zdtm/static/write_read00.c @@ -0,0 +1,61 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Write file before migration, read after"; +const char *test_author = "Roman Kagan "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +int main(int argc, char ** argv) +{ + int fd; + uint32_t crc; + uint8_t buf[1000000]; + + test_init(argc, argv); + + fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (fd < 0) { + pr_perror("can't open %s", filename); + exit(1); + } + + crc = ~0; + datagen(buf, sizeof(buf), &crc); + if (write(fd, buf, sizeof(buf)) != sizeof(buf)) { + pr_perror("can't write %s", filename); + exit(1); + } + + close(fd); + + test_daemon(); + test_waitsig(); + + fd = open(filename, O_RDONLY); + if (fd < 0) { + fail("can't open %s: %m\n", filename); + exit(1); + } + + if (read(fd, 
buf, sizeof(buf)) != sizeof(buf)) {
+		fail("can't read %s: %m\n", filename);
+		goto out;
+	}
+
+	crc = ~0;
+	if (datachk(buf, sizeof(buf), &crc)) {
+		fail("CRC mismatch\n");
+		goto out;
+	}
+
+	pass();
+out:
+	unlink(filename);
+	return 0;
+}
diff --git a/CRIU_code/test/zdtm/static/write_read01.c b/CRIU_code/test/zdtm/static/write_read01.c
new file mode 100644
index 0000000..0d41767
--- /dev/null
+++ b/CRIU_code/test/zdtm/static/write_read01.c
@@ -0,0 +1,69 @@
+#include
+#include
+#include
+#include
+
+#include "zdtmtst.h"
+
+const char *test_doc = "Write and half way read file before migration, complete after";
+const char *test_author = "Roman Kagan ";
+
+char *filename;
+TEST_OPTION(filename, string, "file name", 1);
+
+int main(int argc, char ** argv)
+{
+	int fd;
+	int len;
+	uint32_t crc = ~0;
+	uint8_t buf[1000000];
+
+	test_init(argc, argv);
+
+	fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644);
+	if (fd < 0) {
+		pr_perror("can't open %s", filename);
+		exit(1);
+	}
+
+	crc = ~0;
+	datagen(buf, sizeof(buf), &crc);
+	if (write(fd, buf, sizeof(buf)) != sizeof(buf)) {
+		pr_perror("can't write %s", filename);
+		exit(1);
+	}
+
+	close(fd);
+
+	fd = open(filename, O_RDONLY);
+	if (fd < 0) {
+		pr_perror("can't open %s", filename);
+		exit(1);
+	}
+
+	len = sizeof(buf) / 2;
+	if (read(fd, buf, len) != len) {
+		pr_perror("can't read %s", filename);
+		exit(1);
+	}
+
+	test_daemon();
+	test_waitsig();
+
+	/* recover reading */
+	if (read(fd, buf + len, sizeof(buf) - len) != (sizeof(buf) - len)) {
+		fail("can't read %s: %m\n", filename);
+		goto out;
+	}
+
+	crc = ~0;
+	if (datachk(buf, sizeof(buf), &crc)) {
+		fail("CRC mismatch\n");
+		goto out;
+	}
+
+	pass();
+out:
+	unlink(filename);
+	return 0;
+}
diff --git a/CRIU_code/test/zdtm/static/write_read02.c b/CRIU_code/test/zdtm/static/write_read02.c
new file mode 100644
index 0000000..0a80558
--- /dev/null
+++ b/CRIU_code/test/zdtm/static/write_read02.c
@@ -0,0 +1,97 @@
+#include
+#include
+#include
+#include
+
+#include "zdtmtst.h"
+
+const char *test_doc = "Write file half way before migration, complete and read after";
+const char *test_author = "Roman Kagan ";
+
+char *filename;
+TEST_OPTION(filename, string, "file name", 1);
+
+/*
+ * Write a reference copy ("standard_<filename>") of the generated data, write
+ * the first half of the same data to <filename> before migration and the
+ * second half after it, then read the file back and verify its CRC.
+ */
+int main(int argc, char ** argv)
+{
+	int fd, fd1;
+	int len, full_len, n;
+	uint32_t crc;
+	uint8_t buf[1000000];
+	/* roomy enough for "standard_" plus any sane file name */
+	char str[4096];
+
+	test_init(argc, argv);
+
+	fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644);
+	if (fd < 0) {
+		pr_perror("can't open %s", filename);
+		exit(1);
+	}
+
+	crc = ~0;
+	datagen(buf, sizeof(buf), &crc);
+
+	full_len = sizeof(buf);
+	/*
+	 * Create the standard file.  The name is built with a bounded
+	 * snprintf(); a truncated name is rejected instead of being used.
+	 */
+	n = snprintf(str, sizeof(str), "standard_%s", filename);
+	if (n < 0 || (size_t)n >= sizeof(str)) {
+		pr_err("file name %s is too long\n", filename);
+		exit(1);
+	}
+	fd1 = open(str, O_WRONLY | O_CREAT | O_TRUNC, 0644);
+	if (fd1 < 0) {
+		pr_perror("can't open %s", str);
+		exit(1);
+	}
+	if (write(fd1, buf, full_len) != full_len) {
+		pr_perror("can't write %s", str);
+		exit(1);
+	}
+	close(fd1);
+
+	len = sizeof(buf) / 2;
+	if (write(fd, buf, len) != len) {
+		pr_perror("can't write %s", filename);
+		exit(1);
+	}
+
+	test_daemon();
+	test_waitsig();
+
+	if (write(fd, buf + len, sizeof(buf) - len) != (sizeof(buf) - len)) {
+		fail("can't write %s: %m\n", filename);
+		goto out;
+	}
+
+	close(fd);
+
+	fd = open(filename, O_RDONLY);
+	if (fd < 0) {
+		fail("can't open %s: %m\n", filename);
+		return 1;
+	}
+
+	if (read(fd, buf, full_len) != full_len) {
+		fail("can't read %s: %m\n", filename);
+		return 1;
+	}
+
+	crc = ~0;
+	if (datachk(buf, full_len, &crc)) {
+		fail("CRC mismatch\n");
+		return 1;
+	}
+
+	pass();
+out:
+	unlink(filename);
+	return 0;
+}
diff --git a/CRIU_code/test/zdtm/static/write_read10.c b/CRIU_code/test/zdtm/static/write_read10.c
new file mode 100644
index 0000000..4e8a67e
--- /dev/null
+++ b/CRIU_code/test/zdtm/static/write_read10.c
@@ -0,0 +1,130 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "zdtmtst.h"
+
+const char *test_doc = "Open r/w and unlink file, and fork before migration;\n"
+			"check that the child can write to it and the parent\n"
+			"can read from it after migration";
+const char
*test_author = "Roman Kagan "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +int main(int argc, char ** argv) +{ + int fd, child_fd, ret; + pid_t pid; + uint32_t crc; + uint8_t buf[1000000]; + task_waiter_t t; + + test_init(argc, argv); + + fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644); + if (fd < 0) { + pr_perror("can't open %s", filename); + exit(1); + } + + child_fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644); + if (child_fd < 0) { + pr_perror("can't open %s", filename); + exit(1); + } + + if (unlink(filename)) { + pr_perror("can't unlink %s", filename); + exit(1); + } + + task_waiter_init(&t); + + pid = fork(); + if (pid < 0) { + pr_perror("can't fork"); + exit(1); + } + + if (pid == 0) { /* child writes to the unlinked file and returns */ + close(fd); + task_waiter_complete_current(&t); + test_waitsig(); + + crc = ~0; + datagen(buf, sizeof(buf), &crc); + if (write(child_fd, buf, sizeof(buf)) != sizeof(buf)) + _exit(errno); + + close(child_fd); + _exit(0); + } else + task_waiter_wait4(&t, pid); + + close(child_fd); + + test_daemon(); + test_waitsig(); + + if (kill(pid, SIGTERM)) { + fail("terminating the child failed: %m\n"); + goto out; + } + + if (wait(&ret) != pid) { + fail("wait() returned wrong pid %d: %m\n", pid); + goto out; + } + + if (WIFEXITED(ret)) { + ret = WEXITSTATUS(ret); + if (ret) { + fail("child exited with nonzero code %d (%s)\n", ret, strerror(ret)); + goto out; + } + } + if (WIFSIGNALED(ret)) { + fail("child exited on unexpected signal %d\n", WTERMSIG(ret)); + goto out; + } + + if (lseek(fd, 0, SEEK_SET) < 0) { + fail("lseeking to the beginning of file failed: %m\n"); + goto out; + } + + if (read(fd, buf, sizeof(buf)) != sizeof(buf)) { + fail("can't read %s: %m\n", filename); + goto out; + } + + crc = ~0; + if (datachk(buf, sizeof(buf), &crc)) { + fail("CRC mismatch\n"); + goto out; + } + + + if (close(fd)) { + fail("close failed: %m\n"); + goto out_noclose; + } + + if (unlink(filename) != -1 || errno 
!= ENOENT) { + fail("file %s should have been deleted before migration: unlink: %m\n", + filename); + goto out_noclose; + } + + pass(); + +out: + close(fd); +out_noclose: + return 0; +} diff --git a/CRIU_code/test/zdtm/static/xids00.c b/CRIU_code/test/zdtm/static/xids00.c new file mode 100644 index 0000000..0e8f324 --- /dev/null +++ b/CRIU_code/test/zdtm/static/xids00.c @@ -0,0 +1,127 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that environment didn't change"; +const char *test_author = "Pavel Emelianov "; + +int main(int argc, char **argv) +{ + int tmp_pipe[2], i; + int pids[2], syncfd[2], stat, fail = 0; + + test_init(argc, argv); + + pipe(tmp_pipe); + pids[0] = test_fork(); + if (pids[0] == 0) { + close(tmp_pipe[0]); + + setsid(); + + close(tmp_pipe[1]); + test_waitsig(); + + if (getpid() != getsid(0)) + exit(1); + + if (getpid() != getpgid(0)) + exit(2); + + test_msg("P1 OK\n"); + exit(0); + } + close(tmp_pipe[1]); + syncfd[0] = tmp_pipe[0]; + + pipe(tmp_pipe); + pids[1] = test_fork(); + if (pids[1] == 0) { + int tmp_pipe_sub[2], pid; + + close(tmp_pipe[0]); + + setsid(); + + pipe(tmp_pipe_sub); + pid = test_fork(); + if (pid == 0) { + close(tmp_pipe[1]); + close(tmp_pipe_sub[0]); + + setpgid(0, 0); + + close(tmp_pipe_sub[1]); + test_waitsig(); + + if (getsid(0) != getppid()) + exit(1); + if (getpgid(0) != getpid()) + exit(1); + + exit(0); + } + close(tmp_pipe_sub[1]); + + read(tmp_pipe_sub[0], &stat, 1); + close(tmp_pipe_sub[0]); + + close(tmp_pipe[1]); + + test_waitsig(); + + if (getpid() != getsid(0)) + exit(1); + + if (getpid() != getpgid(0)) + exit(2); + + kill(pid, SIGTERM); + if (waitpid(pid, &stat, 0) < 0) { + pr_perror("Unable to wait P2 %d", pid); + exit(3); + } else if (!WIFEXITED(stat) || WEXITSTATUS(stat)) { + pr_perror("P2 stat %d/%d/%d/%d", WIFEXITED(stat), WEXITSTATUS(stat), + WIFSIGNALED(stat), WTERMSIG(stat)); + exit(3); + } + + exit(0); + } + close(tmp_pipe[1]); + 
syncfd[1] = tmp_pipe[0]; + + read(syncfd[0], &stat, 1); + close(syncfd[0]); + read(syncfd[1], &stat, 1); + close(syncfd[1]); + + test_daemon(); + test_waitsig(); + + for (i = 0; i < sizeof(pids) / sizeof(pids[0]); i++) + kill(pids[i], SIGTERM); + + for (i = 0; i < sizeof(pids) / sizeof(pids[0]); i++) { + if (waitpid(pids[i], &stat, 0) < 0) { + pr_perror("Unable to wait %d", pids[i]); + fail = 1; + } else if (!WIFEXITED(stat) || WEXITSTATUS(stat)) { + pr_perror("P%d stat %d/%d/%d/%d", i, WIFEXITED(stat), WEXITSTATUS(stat), + WIFSIGNALED(stat), WTERMSIG(stat)); + fail = 1; + } + } + + if (fail) + fail("Something failed"); + else + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/static/zombie00.c b/CRIU_code/test/zdtm/static/zombie00.c new file mode 100644 index 0000000..df12a70 --- /dev/null +++ b/CRIU_code/test/zdtm/static/zombie00.c @@ -0,0 +1,110 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "See if we can wait() for a zombified child after migration"; +const char *test_author = "Roman Kagan "; + +struct zombie { + int pid; + int exited; + int exitcode; +}; + +#define NR_ZOMBIES 4 + +int main(int argc, char ** argv) +{ + int i, status; + struct zombie zombie[NR_ZOMBIES]; + + zombie[0].exited = 1; + zombie[0].exitcode = 0; + + zombie[1].exited = 1; + zombie[1].exitcode = 3; + + zombie[2].exited = 0; + zombie[2].exitcode = SIGKILL; + + zombie[3].exited = 0; + zombie[3].exitcode = SIGSEGV; + + test_init(argc, argv); + + for (i = 0; i < NR_ZOMBIES; i++) { + zombie[i].pid = fork(); + if (zombie[i].pid < 0) { + pr_perror("fork failed"); + exit(1); + } + + if (zombie[i].pid == 0) { + if (zombie[i].exited) + _exit(zombie[i].exitcode); + else if (zombie[i].exitcode == SIGSEGV) + *(volatile int *)NULL = 0; + else + kill(getpid(), zombie[i].exitcode); + + _exit(13); /* just in case */ + } + + test_msg("kid %d will %d/%d\n", zombie[i].pid, + zombie[i].exited, zombie[i].exitcode); + } + + /* 
+ * We must wait for zombies to appear, but we cannot use + * wait4 here :( Use sleep. + */ + + for (i = 0; i < NR_ZOMBIES; i++) { + siginfo_t siginfo; + if (waitid(P_PID, zombie[i].pid, &siginfo, WNOWAIT | WEXITED)) { + pr_perror("Unable to wait %d", zombie[i].pid); + exit(1); + } + } + + test_daemon(); + test_waitsig(); + + for (i = 0; i < NR_ZOMBIES; i++) { + if (waitpid(zombie[i].pid, &status, 0) != zombie[i].pid) { + fail("Exit with wrong pid\n"); + exit(1); + } + + if (zombie[i].exited) { + if (!WIFEXITED(status)) { + fail("Not exited, but should (%d)\n", zombie[i].pid); + exit(1); + } + + if (WEXITSTATUS(status) != zombie[i].exitcode) { + fail("Exit with wrong status (%d)\n", zombie[i].pid); + exit(1); + } + } else { + if (!WIFSIGNALED(status)) { + fail("Not killed, but should (%d)\n", zombie[i].pid); + exit(1); + } + + if (WTERMSIG(status) != zombie[i].exitcode) { + fail("Killed with wrong signal (%d)\n", zombie[i].pid); + exit(1); + } + } + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/zombie01.c b/CRIU_code/test/zdtm/static/zombie01.c new file mode 100644 index 0000000..6e904c0 --- /dev/null +++ b/CRIU_code/test/zdtm/static/zombie01.c @@ -0,0 +1,80 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that zombie pgid is restored"; +const char *test_author = "Kirill Tkhai "; + +int main(int argc, char **argv) +{ + pid_t pid, pgrp; + siginfo_t info; + int status; + + test_init(argc, argv); + + pid = fork(); + if (pid < 0) { + fail("fork"); + exit(1); + } + + if (!pid) { + /* Child */ + if (setpgid(0, 0) < 0) { + fail("setpgid"); + exit(1); + } + pid = sys_clone_unified(CLONE_PARENT|SIGCHLD, NULL, NULL, NULL, 0); + if (pid < 0) { + fail("fork"); + exit(1); + } + + exit(0); + } + + if (waitpid(pid, &status, 0) < 0) { + fail("waitpid"); + exit(1); + } + if (!WIFEXITED(status) || WEXITSTATUS(status)) { + pr_err("Exited with problems: status=%d\n", status); + 
fail("fail"); + exit(1); + } + + if (waitid(P_ALL, 0, &info, WEXITED|WNOWAIT) < 0) { + fail("waitpid"); + exit(1); + } + + test_daemon(); + test_waitsig(); + + if (waitid(P_ALL, 0, &info, WEXITED|WNOWAIT) < 0) { + fail("waitpid"); + exit(1); + } + + pgrp = getpgid(info.si_pid); + if (pgrp < 0) { + fail("getpgrp"); + exit(1); + } + + if (pgrp != pid) { + pr_err("Wrong pgrp: %d != %d\n", pgrp, pid); + fail("fail"); + exit(1); + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/static/zombie01.desc b/CRIU_code/test/zdtm/static/zombie01.desc new file mode 100644 index 0000000..2eac7e6 --- /dev/null +++ b/CRIU_code/test/zdtm/static/zombie01.desc @@ -0,0 +1 @@ +{'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/transition/Makefile b/CRIU_code/test/zdtm/transition/Makefile new file mode 100644 index 0000000..35301ac --- /dev/null +++ b/CRIU_code/test/zdtm/transition/Makefile @@ -0,0 +1,91 @@ +LIBDIR := ../lib +LIB := $(LIBDIR)/libzdtmtst.a +LDLIBS += $(LIB) +CPPFLAGS += -I$(LIBDIR) + +TST_NOFILE = \ + ipc \ + ptrace \ + epoll \ + fork \ + fork2 \ + thread-bomb \ + maps007 \ + maps008 \ + pipe_loop00 \ + pipe_shared00 \ + socket_loop00 \ + netlink00 \ + file_aio \ + socket-tcp \ + socket-tcp6 \ + shmem \ + lazy-thp \ + pid_reuse \ + + +TST_FILE = \ + file_read \ + unix_sock \ + fifo_dyn \ + fifo_loop \ + + +TST = $(TST_NOFILE) $(TST_FILE) +SRC = $(TST:%=%.c) +OBJ = $(SRC:%.c=%.o) +DEP = $(SRC:%.c=%.d) +PID = $(TST:%=%.pid) +OUT = $(TST:%=%.out) + +include ../Makefile.inc + +all: $(TST) +install: all +.PHONY: all install + +$(TST_NOFILE:%=%.pid): %.pid: % + $(/dev/null` 2>/dev/null || break; \ + sleep 1; \ + done + +$(TST): | $(LIB) + +file_aio: LDLIBS += -lrt -pthread +socket-tcp: CFLAGS += -D STREAM +socket-tcp6: CFLAGS += -D ZDTM_IPV6 -D STREAM +ptrace.o: CFLAGS += -pthread +ptrace: LDFLAGS += -pthread +fork2: CFLAGS += -D FORK2 +thread-bomb.o: CFLAGS += -pthread +thread-bomb: LDFLAGS += -pthread + +%: %.sh + cp $< $@ + chmod +x $@ + +$(LIB): force + $(Q) 
$(MAKE) -C $(LIBDIR)
+
+.PHONY: force start check_start stop wait_stop
diff --git a/CRIU_code/test/zdtm/transition/epoll.c b/CRIU_code/test/zdtm/transition/epoll.c
new file mode 100644
index 0000000..4eac521
--- /dev/null
+++ b/CRIU_code/test/zdtm/transition/epoll.c
@@ -0,0 +1,232 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "zdtmtst.h"
+
+const char *test_doc = "migrate application using epoll";
+
+#define MAX_SCALE 128
+
+enum child_exit_codes {
+	SUCCESS = 0,
+	GETTIMEOFDAYERROR,
+	WRITEERROR,
+
+	MAX_EXIT_CODE
+};
+
+static char *child_fail_reason[] = {
+	"Success",
+	"Can't get time",
+	"Can't write"
+};
+
+int scale = 13;
+TEST_OPTION(scale, int, "How many children should perform testing", 0);
+
+static int pids[MAX_SCALE];
+static int fds[MAX_SCALE][2];
+/* number of children that were actually forked; slots past it are unset */
+static int nr_children;
+static volatile int stop = 0;
+
+/*
+ * Tear down on a fatal error: close the read ends and ask every child that
+ * was really started to stop.  Unset slots are skipped on purpose: their
+ * zero-initialized pid would make kill(0, ...) signal the whole process
+ * group, and their zero fds would close descriptor 0.
+ */
+static void killall(void)
+{
+	int i;
+
+	for (i = 0; i < nr_children; i++) {
+		close(fds[i][0]);
+		kill(pids[i], SIGUSR2);
+	}
+}
+
+/* SIGUSR2 handler: tells the writer loop in run_child() to finish */
+static void do_stop(int sig)
+{
+	stop = 1;
+}
+
+/*
+ * Child body: write a fixed data block into pipe fds[num][1] at random
+ * intervals until SIGUSR2 arrives, then exit with a child_exit_codes value.
+ */
+static void run_child(int num)
+{
+	int fd = fds[num][1];
+	uint32_t crc = ~0;
+	size_t buf_size=512;
+	uint8_t buf[buf_size];
+	struct timeval tv;
+	struct timespec ts;
+	int rv;
+
+	close(fds[num][0]);
+
+	datagen(buf, sizeof(buf), &crc);
+
+	if (gettimeofday(&tv, NULL) < 0) {
+		rv = GETTIMEOFDAYERROR;
+		goto out;
+	}
+
+	srand(tv.tv_sec + tv.tv_usec);
+
+	ts.tv_sec = 0;
+	while (!stop) {
+		ts.tv_nsec = rand() % 999999999;
+		nanosleep(&ts, &ts);
+		if (write(fd, buf, buf_size) < 0 &&
+		    (!stop /* signal SIGUSR2 NOT received */ ||
+		     (errno != EINTR && errno != EPIPE))) {
+			fail("child write: %m\n");
+			rv = WRITEERROR;
+			goto out;
+		}
+	}
+	rv = SUCCESS;
+out:	close(fds[num][1]);
+	exit(rv);
+}
+
+/*
+ * Parent: fork `scale` writers, drain their pipes through epoll while the
+ * test is running, then stop the children and check their exit codes.
+ */
+int main(int argc, char **argv)
+{
+	int rv, i;
+	int counter = 0;
+	int efd;
+	size_t buf_size=512;
+	char buf[buf_size];
+	struct epoll_event event = {
+		.events = EPOLLIN
+	}, *events;
+
+	test_init(argc, argv);
+
+	if (scale > MAX_SCALE) {
+		pr_err("Too many children specified\n");
+		exit(1);
+	}
+
+	if (signal(SIGUSR2, do_stop) == SIG_ERR) {
+		pr_perror("Can't setup signal handler");
+		exit(1);
+	}
+
+	if ((efd = epoll_create(scale)) < 0) {
+		pr_perror("Can't create epoll");
+		exit(1);
+	}
+
+	for (i = 0; i < scale; i++) {
+		if (pipe(fds[i]) < 0) {
+			pr_perror("Can't create pipe[%d]", i);
+			killall();
+			exit(1);
+		}
+		if (fcntl(fds[i][0], F_SETFL, O_NONBLOCK) < 0) {
+			pr_perror("Can't set O_NONBLOCK flag on fd[%d]", i);
+			killall();
+			exit(1);
+		}
+		event.data.fd = fds[i][0];
+		if (epoll_ctl(efd, EPOLL_CTL_ADD, fds[i][0], &event) < 0) {
+			pr_perror("Can't add fd[%d]", i);
+			killall();
+			exit(1);
+		}
+
+		if ((rv = test_fork()) < 0) {
+			pr_perror("Can't fork[%d]", i);
+			killall();
+			exit(1);
+		}
+		if (rv == 0)
+			run_child(i);
+		close(fds[i][1]);
+		pids[i] = rv;
+		nr_children = i + 1;
+	}
+
+	events = malloc(sizeof(*events) * scale);
+	if (events == NULL) {
+		pr_perror("Can't allocate memory");
+		killall();
+		exit(1);
+	}
+
+	test_daemon();
+
+	while (test_go()) {
+		if ((rv = epoll_wait(efd, events, scale, rand() % 999)) < 0 && errno != EINTR) {
+			pr_perror("epoll_wait error");
+			killall();
+			exit(1);
+		}
+		for (i = 0; i < rv; i++) {
+			ssize_t n;
+
+			/* drain the pipe; it is non-blocking, so this ends with EAGAIN */
+			do {
+				n = read(events[i].data.fd, buf, buf_size);
+			} while (n > 0);
+
+			/*
+			 * Only look at errno when read() itself failed: after an
+			 * EOF (n == 0) errno is stale and must not be interpreted.
+			 */
+			if (n < 0 && errno != EAGAIN && errno != EINTR) {
+				pr_perror("read error");
+				killall();
+				exit(1);
+			}
+		}
+	}
+
+	test_waitsig();
+
+	for (i = 0; i < scale; i++) {
+		kill(pids[i], SIGUSR2);
+		if (waitpid(pids[i], &rv, 0) < 0) {
+			fail("waitpid error: %m\n");
+			counter++;
+			continue;
+		}
+		/* WEXITSTATUS() is only meaningful for a normal exit */
+		if (!WIFEXITED(rv)) {
+			fail("Child %d did not exit normally (status %d)\n", pids[i], rv);
+			counter++;
+			continue;
+		}
+		rv = WEXITSTATUS(rv);
+		if (rv < MAX_EXIT_CODE && rv > SUCCESS) {
+			fail("Child failed: %s (%d)\n",
+			     child_fail_reason[rv], rv);
+			counter++;
+		} else if (rv != SUCCESS) {
+			fail("Unknown exitcode from child: %d\n", rv);
+			counter++;
+		}
+	}
+	if (counter == 0)
+		pass();
+	return 0;
+}
diff --git a/CRIU_code/test/zdtm/transition/fifo_dyn.c
b/CRIU_code/test/zdtm/transition/fifo_dyn.c new file mode 100644 index 0000000..62ebce4 --- /dev/null +++ b/CRIU_code/test/zdtm/transition/fifo_dyn.c @@ -0,0 +1,151 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "dynamic FIFO test"; + +#define PROCS_DEF 2 /* 0 - parent, 1 - child */ +#define BUF_SIZE 256 +unsigned int num_procs = PROCS_DEF; +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +int main(int argc, char **argv) +{ + int ret = 0; + int readfd, writefd; + mode_t mode = S_IFIFO | 0600; + char path[PROCS_DEF][BUF_SIZE]; + pid_t pid; + int i; + uint8_t buf[0x100000]; + int chret; + char *file_path; + + test_init(argc, argv); + + for (i = 0; i < PROCS_DEF; i++) { + file_path = path[i]; + if (snprintf(file_path, BUF_SIZE, "%s-%02d", filename, i) >= BUF_SIZE) { + pr_perror("filename %s is too long", filename); + exit(1); + } + if (mkfifo(file_path, mode)) { + pr_perror("can't make fifo \"%s\"", file_path); + exit(1); + } + } + + pid = test_fork(); + if (pid < 0) { + pr_perror("Can't fork"); + kill(0, SIGKILL); + exit(1); + } + if (pid == 0) { + file_path = path[0]; + readfd = open(file_path, O_RDONLY); + if (readfd < 0) { + pr_perror("open(%s, O_RDONLY) Failed", file_path); + ret = errno; + return ret; + } + file_path = path[1]; + writefd = open(file_path, O_WRONLY); + if (writefd < 0) { + pr_perror("open(%s, O_WRONLY) Failed", file_path); + ret = errno; + return ret; + } + + if (pipe_in2out(readfd, writefd, buf, sizeof(buf)) < 0) + /* pass errno as exit code to the parent */ + if (test_go() /* signal NOT delivered */ || + (errno != EINTR && errno != EPIPE)) + ret = errno; + close(readfd); + close(writefd); + exit(ret); + } + file_path = path[0]; + writefd = open(file_path, O_WRONLY); + if (writefd < 0) { + pr_perror("open(%s, O_WRONLY) Failed", file_path); + kill(pid, SIGKILL); + return 1; + } + + file_path = path[1]; + readfd = 
open(file_path, O_RDONLY); + if (readfd < 0) { + pr_perror("open(%s, O_RDONLY) Failed", file_path); + kill(pid, SIGKILL); + return 1; + } + test_daemon(); + + while (test_go()) { + int len, rlen = 0, wlen; + uint8_t rbuf[sizeof(buf)], *p; + + datagen(buf, sizeof(buf), NULL); + wlen = write(writefd, buf, sizeof(buf)); + if (wlen < 0) { + if (errno == EINTR) + continue; + else { + fail("write failed: %m\n"); + ret = 1; + break; + } + } + + for (p = rbuf, len = wlen; len > 0; p += rlen, len -= rlen) { + rlen = read(readfd, p, len); + if (rlen <= 0) + break; + } + + if (rlen < 0 && errno == EINTR) + continue; + + if (len > 0) { + fail("read failed: %m\n"); + ret = 1; + break; + } + + if (memcmp(buf, rbuf, wlen)) { + fail("data mismatch\n"); + ret = 1; + break; + } + } + + close(writefd); + test_waitsig(); + + wait(&chret); + chret = WEXITSTATUS(chret); + if (chret) { + fail("child exited with non-zero code %d (%s)\n", + chret, strerror(chret)); + return 1; + } + if (!ret) + pass(); + close(readfd); + for (i = 0; i < PROCS_DEF; i++) + unlink(path[i]); + return 0; +} diff --git a/CRIU_code/test/zdtm/transition/fifo_dyn.desc b/CRIU_code/test/zdtm/transition/fifo_dyn.desc new file mode 100644 index 0000000..95c58b4 --- /dev/null +++ b/CRIU_code/test/zdtm/transition/fifo_dyn.desc @@ -0,0 +1 @@ +{'flags': 'noauto'} diff --git a/CRIU_code/test/zdtm/transition/fifo_loop.c b/CRIU_code/test/zdtm/transition/fifo_loop.c new file mode 100644 index 0000000..2e28320 --- /dev/null +++ b/CRIU_code/test/zdtm/transition/fifo_loop.c @@ -0,0 +1,194 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Multi-process fifo loop"; +#define BUF_SIZE 256 +#define PROCS_DEF 4 +unsigned int num_procs = PROCS_DEF; +TEST_OPTION(num_procs, uint, "# processes to create " + "(default " __stringify(PROCS_DEF) ")", 0); +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +static int 
pids[PROCS_DEF]; + +volatile sig_atomic_t num_exited = 0; +void inc_num_exited(int signo) +{ + num_exited++; +} + +int main(int argc, char **argv) +{ + int ret = 0; + int readfd, writefd; + mode_t mode = S_IFIFO | 0644; + char path[PROCS_DEF][BUF_SIZE]; + pid_t pid; + int i; + uint8_t buf[0x100000]; + char *file_path; + + test_init(argc, argv); + + for (i = 0; i < PROCS_DEF; i++) { + file_path = path[i]; + if (snprintf(file_path, BUF_SIZE, "%s-%02d", filename, i) >= BUF_SIZE) { + pr_err("filename %s is too long\n", filename); + exit(1); + } + if (mkfifo(file_path, mode)) { + pr_perror("can't make fifo \"%s\"", file_path); + exit(1); + } + } + + if (signal(SIGCHLD, inc_num_exited) == SIG_ERR) { + pr_perror("can't set SIGCHLD handler"); + exit(1); + } + + for (i = 1; i < num_procs; i++) { /* i = 0 - parent */ + pid = test_fork(); + if (pid < 0) { + pr_perror("Can't fork"); + kill(0, SIGKILL); + exit(1); + } + if (pid == 0) { + file_path = path[i - 1]; + readfd = open(file_path, O_RDONLY); + if (readfd < 0) { + pr_perror("open(%s, O_RDONLY) failed", + file_path); + ret = errno; + return ret; + } + file_path = path[i]; + writefd = open(file_path, O_WRONLY); + if (writefd < 0) { + pr_perror("open(%s, O_WRONLY) failed", + file_path); + ret = errno; + return ret; + } + signal(SIGPIPE, SIG_IGN); + if (pipe_in2out(readfd, writefd, buf, sizeof(buf)) < 0) + /* pass errno as exit code to the parent */ + if (test_go() /* signal NOT delivered */ || + (errno != EINTR && errno != EPIPE)) + ret = errno; + close(readfd); + close(writefd); + exit(ret); + } + pids[i] = pid; + } + + file_path = path[0]; + writefd = open(file_path, O_WRONLY); + if (writefd < 0) { + pr_perror("open(%s, O_WRONLY) failed", file_path); + kill(0, SIGKILL); + exit(1); + } + + file_path = path[i - 1]; + readfd = open(file_path, O_RDONLY); + if (readfd < 0) { + pr_perror("open(%s, O_RDONLY) failed", file_path); + kill(0, SIGKILL); + exit(1); + } + + if (num_exited) { + pr_err("Some children died 
unexpectedly\n"); + kill(0, SIGKILL); + exit(1); + } + + test_daemon(); + + while (test_go()) { + int len, rlen = 0, wlen; + uint8_t rbuf[sizeof(buf)], *p; + + datagen(buf, sizeof(buf), NULL); + wlen = write(writefd, buf, sizeof(buf)); + if (wlen < 0) { + if (errno == EINTR) + continue; + else { + fail("write failed: %m\n"); + ret = 1; + break; + } + } + + for (p = rbuf, len = wlen; len > 0; p += rlen, len -= rlen) { + rlen = read(readfd, p, len); + if (rlen <= 0) + break; + } + + if (rlen < 0 && errno == EINTR) + continue; + + if (len > 0) { + fail("read failed: %m\n"); + ret = 1; + break; + } + + if (memcmp(buf, rbuf, wlen)) { + fail("data mismatch\n"); + ret = 1; + break; + } + } + + close(writefd); + + test_waitsig(); /* even if failed, wait for migration to complete */ + + if (kill(0, SIGTERM)) { + fail("failed to send SIGTERM to my process group: %m\n"); + return 1; /* shouldn't wait() in this case */ + } + close(readfd); + + for (i = 1; i < num_procs; i++) { /* i = 0 - parent */ + int chret; + if (waitpid(pids[i], &chret, 0) < 0) { + fail("waitpid error: %m\n"); + ret = 1; + continue; + } + + chret = WEXITSTATUS(chret); + if (chret) { + fail("child %d exited with non-zero code %d (%s)\n", + i, chret, strerror(chret)); + ret = 1; + continue; + } + } + + if (!ret) + pass(); + + for (i = 0; i < PROCS_DEF; i++) + unlink(path[i]); + return 0; +} diff --git a/CRIU_code/test/zdtm/transition/file_aio.c b/CRIU_code/test/zdtm/transition/file_aio.c new file mode 100644 index 0000000..a160101 --- /dev/null +++ b/CRIU_code/test/zdtm/transition/file_aio.c @@ -0,0 +1,102 @@ +#include "zdtmtst.h" + +const char *test_doc = "test for AIO"; +const char *test_author = "Andrew Vagin "; + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define BUF_SIZE 1024 + +int main(int argc, char **argv) +{ + test_init(argc, argv); + char buf[BUF_SIZE]; + int fd; + struct aiocb aiocb; + const struct aiocb *aioary[1]; + char 
tmpfname[256]="/tmp/file_aio.XXXXXX"; + int ret; + + fd = mkstemp(tmpfname); + if (fd == -1) { + pr_perror("mkstemp() failed"); + exit(1); + } + + unlink(tmpfname); + + if (write(fd, buf, BUF_SIZE) != BUF_SIZE) { + pr_perror("Error at write()"); + exit(1); + } + + test_daemon(); + + while (test_go()) { + memset(&aiocb, 0, sizeof(struct aiocb)); + aiocb.aio_offset = 0; + aiocb.aio_fildes = fd; + aiocb.aio_buf = buf; + aiocb.aio_nbytes = BUF_SIZE; + + ret = aio_read(&aiocb); + if (ret < 0) { + if ((errno == EINTR) && (!test_go())) + break; + pr_perror("aio_read failed"); + return 1; + } + + if (ret < 0) { + pr_perror("aio_read failed"); + exit(1); + } + /* Wait for request completion */ + aioary[0] = &aiocb; +again: + ret = aio_suspend(aioary, 1, NULL); + if (ret < 0) { + if ((errno == EINTR) && (! test_go())) + break; + if (errno != EINTR) { + pr_perror("aio_suspend failed"); + return 1; + } + } + + ret = aio_error(&aiocb); + if (ret == EINPROGRESS) { +#ifdef DEBUG + test_msg("restart aio_suspend\n"); +#endif + goto again; + } + if (ret != 0) { + pr_err("Error at aio_error(): %s\n", strerror(ret)); + return 1; + } + + ret = aio_return(&aiocb); + if (ret < 0) { + if ((errno == EINTR) && (!test_go())) + break; + pr_perror("aio_return failed"); + return 1; + } + if (ret != BUF_SIZE) { + pr_perror("Error at aio_return()"); + exit(1); + } + } + close(fd); + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/transition/file_read.c b/CRIU_code/test/zdtm/transition/file_read.c new file mode 100644 index 0000000..50dffd8 --- /dev/null +++ b/CRIU_code/test/zdtm/transition/file_read.c @@ -0,0 +1,240 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Fill/read file continuously to check" + "it's migrated at the right moment"; +const char *test_author = "Pavel Emelianov "; + +#define MAX_SCALE 128 +#define FILE_SIZE (16 * 1024) + +enum kids_exit_codes { + SUCCESS = 0, + 
FILE_CORRUPTED, + MMAP_FAILED, + OPEN_FAILED, + WRITE_FAILED, + READ_FAILED, + FSYNC_FAILED, + SEEK_FAILED, + + MAX_EXIT_CODE_VAL +}; + +static char *kids_fail_reasons[] = { + "Success", + /* 1 */ "File corrupted", + /* 2 */ "Map failed", + /* 3 */ "Open (create) failed", + /* 4 */ "Write failed", + /* 5 */ "Read failed", + /* 6 */ "Fsync failed", + /* 7 */ "Lseek failed" +}; + +int scale = 13; +TEST_OPTION(scale, int, "How many children should perform testing", 0); + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +static int pids[MAX_SCALE]; +static volatile int stop = 0; + +static void killall(void) +{ + int i; + + for (i = 0; i < MAX_SCALE; i++) + kill(pids[i], SIGUSR2); +} + +static void do_stop(int sig) +{ + stop = 1; +} + +static char *buf; + +static void prepare_buf(void) +{ + int i; + + for (i = 0; i < FILE_SIZE; i++) + buf[i] = rand(); +} + +static int fill_file(int fd) +{ + int rv, wr; + + if (lseek(fd, 0, SEEK_SET) == -1) + return -2; + + wr = 0; + while (1) { + rv = write(fd, buf + wr, FILE_SIZE - wr); + if (rv <= 0) + return -1; + wr += rv; + if (wr == FILE_SIZE) + break; + } + return 0; +} + +static int check_file(int fd) +{ + char rbuf[1024]; + int rv, rd; + + if (lseek(fd, 0, SEEK_SET) == -1) + return -2; + + rd = 0; + while (1) { + rv = read(fd, rbuf, 1024); + if (rv <= 0) + return -1; + if (memcmp(buf + rd, rbuf, rv)) + return 1; + rd += rv; + if (rd == FILE_SIZE) + break; + } + return 0; +} + +static void chew_some_file(int num) +{ + int fd, rv; + char chew_file[PATH_MAX]; + + buf = mmap(NULL, FILE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + rv = MMAP_FAILED; + if (buf == MAP_FAILED) + goto out_exit; + + sprintf(chew_file, "chew_%s.%d", filename, num); + fd = open(chew_file, O_CREAT | O_EXCL | O_RDWR, 0666); + rv = OPEN_FAILED; + if (fd == -1) + goto out_unmap; + + while (!stop) { + prepare_buf(); + switch (fill_file(fd)) { + case -1: + rv = WRITE_FAILED; + goto out_exit; + case -2: + rv = 
SEEK_FAILED; + goto out_exit; + } + if (fsync(fd) == -1) { + rv = FSYNC_FAILED; + goto out_exit; + } + if (fsync(fd) == -1) { + rv = FSYNC_FAILED; + goto out_exit; + } + switch (check_file(fd)) { + case -1: + rv = READ_FAILED; + goto out_exit; + case -2: + rv = SEEK_FAILED; + goto out_exit; + case 1: + rv = FILE_CORRUPTED; + int fd1; + char str[PATH_MAX]; + // create standard file + sprintf(str, "standard_%s.%d", filename, num); + fd1 = open(str, O_WRONLY | O_CREAT | O_TRUNC, 0666); + if (write(fd1, buf, FILE_SIZE) != FILE_SIZE) + pr_perror("can't write %s", str); + close(fd1); + goto out_exit; + } + } + rv = SUCCESS; + close(fd); + unlink(chew_file); +out_unmap: + munmap(buf, FILE_SIZE); +out_exit: + exit(rv); +} + +int main(int argc, char **argv) +{ + int rv, i; + int counter = 0; + + test_init(argc, argv); + + if (scale > MAX_SCALE) { + pr_err("Too many children specified\n"); + exit(-1); + } + + if (signal(SIGUSR2, do_stop) == SIG_ERR) { + pr_perror("Can't setup signal handler"); + exit(-1); + } + + for (i = 0; i < scale; i++) { + rv = test_fork(); + if (rv == -1) { + pr_perror("Can't fork"); + killall(); + exit(-1); + } + if (rv == 0) + chew_some_file(i); + pids[i] = rv; + } + + test_daemon(); + test_waitsig(); + + killall(); + for (i = 0; i < scale; i++) { + if (waitpid(pids[i], &rv, 0) == -1) { + fail("Can't wipe up the kid\n"); + counter++; + continue; + } + if (!WIFEXITED(rv)) { + fail("Kid was killed\n"); + counter++; + } else { + rv = WEXITSTATUS(rv); + if (rv < MAX_EXIT_CODE_VAL && rv > SUCCESS) { + fail("Kid failed: %s (%d)\n", + kids_fail_reasons[rv], rv); + counter++; + } else if (rv != SUCCESS) { + fail("Unknown exitcode from kid: %d\n", rv); + counter++; + } + } + } + + if (counter == 0) + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/transition/fork.c b/CRIU_code/test/zdtm/transition/fork.c new file mode 100644 index 0000000..9ab1605 --- /dev/null +++ b/CRIU_code/test/zdtm/transition/fork.c @@ -0,0 +1,94 @@ +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Tests that forking tasks are handled properly"; +const char *test_author = "Pavel Emelyanov "; + +char children[] = "0123456789"; + +int main(int argc, char **argv) +{ + int pid, wpid, status; + int p[2]; + + test_init(argc, argv); + + if (pipe(p)) { + pr_perror("pipe"); + return -1; + } + + if (write(p[1], children, sizeof(children)) != sizeof(children)) { + pr_perror("write"); + return -1; + } + + test_daemon(); + + while (test_go()) { + char c = 0; + int ret; + + ret = read(p[0], &children, sizeof(children)); + if (ret <= 0) { + pr_perror("read"); + return 1; + } + + for (; ret > 0; ret--) { + pid = fork(); + if (pid < 0) { + fail("Can't fork"); + goto out; + } + + if (pid == 0) { +#ifdef FORK2 + usleep(10000); +#endif + if (write(p[1], &c, 1) != 1) { + pr_perror("write"); + return 1; + } + exit(0); + } + } + + while (1) { + wpid = waitpid(-1, &status, WNOHANG); + if (wpid < 0) { + if (errno == ECHILD) + break; + pr_perror("waitpid"); + return -1; + } + if (wpid == 0) + break; + + if (!WIFEXITED(status)) { + fail("Task %d didn't exit with status %d", wpid, status); + goto out; + } + + if (WEXITSTATUS(status) != 0) { + fail("Task %d exited with wrong status %d", wpid, status); + goto out; + } + } + + } + pass(); +out: + return 0; +} diff --git a/CRIU_code/test/zdtm/transition/fork2.c b/CRIU_code/test/zdtm/transition/fork2.c new file mode 100644 index 0000000..c2e58c0 --- /dev/null +++ b/CRIU_code/test/zdtm/transition/fork2.c @@ -0,0 +1 @@ +fork.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/transition/ipc.c b/CRIU_code/test/zdtm/transition/ipc.c new file mode 100644 index 0000000..be52d73 --- /dev/null +++ b/CRIU_code/test/zdtm/transition/ipc.c @@ -0,0 +1,201 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char 
*test_doc="Tests ipc sems and shmems migrate fine"; +const char *test_author="Pavel Emelianov "; + +static struct sembuf unlock = { + .sem_op = 1, + .sem_num = 0, + .sem_flg = 0, +}; + +static struct sembuf lock = { + .sem_op = -1, + .sem_num = 0, + .sem_flg = 0, +}; + +#define DEF_MEM_SIZE (40960) +unsigned int shmem_size = DEF_MEM_SIZE; +TEST_OPTION(shmem_size, uint, "Size of shared memory segment", 0); + +#define INIT_CRC (~0) + +#define POISON 0xac +static inline void poison_area(int *mem) +{ + memset(mem, POISON, shmem_size); +} + +static int child(key_t key) +{ + int sem, shm, ret, res = 0; + uint8_t *mem; + uint32_t crc; + + sem = semget(key, 1, 0777); + if (sem == -1) + return -1; + shm = shmget(key, shmem_size, 0777); + if (shm == -1) + return -2; + mem = shmat(shm, NULL, 0); + if (mem == (uint8_t *)-1) + return -3; + + while (test_go()) { + ret = semop(sem, &lock, 1); + if (ret) { + if (errno == EINTR) + continue; + fail("Error in semop lock"); + res = errno; + break; + } + crc = INIT_CRC; + datagen(mem, shmem_size, &crc); + while ((ret = semop(sem, &unlock, 1)) && (errno == EINTR)); + if (ret) { + fail("Error in semop unlock"); + res = errno; + break; + } + } + shmdt(mem); + return res; +} + +int main(int argc, char **argv) +{ + key_t key; + int sem, shm, pid1, pid2; + int fail_count = 0; + uint8_t *mem; + uint32_t crc; + int ret; + + test_init(argc, argv); + + key = ftok(argv[0], 822155650); + if (key == -1) { + pr_perror("Can't make key"); + goto out; + } + + sem = semget(key, 1, 0777 | IPC_CREAT | IPC_EXCL); + if (sem == -1) { + pr_perror("Can't get sem"); + goto out; + } + + if (semctl(sem, 0, SETVAL, 1) == -1) { + pr_perror("Can't init sem"); + fail_count++; + goto out_sem; + } + + shm = shmget(key, shmem_size, 0777 | IPC_CREAT | IPC_EXCL); + if (shm == -1) { + pr_perror("Can't get shm"); + fail_count++; + goto out_sem; + } + + mem = shmat(shm, NULL, 0); + if (mem == (void *)-1) { + pr_perror("Can't attach shm"); + fail_count++; + goto out_shm; + } 
+ + poison_area((int *)mem); + + pid1 = test_fork(); + if (pid1 == -1) { + pr_perror("Can't fork 1st time"); + goto out_shdt; + } else if (pid1 == 0) + exit(child(key)); + + pid2 = test_fork(); + if (pid2 == -1) { + pr_perror("Can't fork 2nd time"); + fail_count++; + goto out_child; + } else if (pid2 == 0) + exit(child(key)); + + test_daemon(); + while (test_go()) { + ret = semop(sem, &lock, 1); + if (ret) { + if (errno == EINTR) + continue; + fail_count++; + fail("Error in semop lock"); + break; + } + if (mem[0] != POISON) { + crc = INIT_CRC; + if (datachk(mem, shmem_size, &crc)) { + fail_count++; + fail("Semaphore protection is broken or " + "shmem pages are messed"); + semop(sem, &unlock, 1); + break; + } + poison_area((int *)mem); + } + while ((ret = semop(sem, &unlock, 1)) && (errno == EINTR)); + if (ret) { + fail_count++; + fail("Error in semop unlock"); + break; + } + } + test_waitsig(); + + kill(pid2, SIGTERM); + waitpid(pid2, &ret, 0); + if (!WIFEXITED(ret)) { + fail_count++; + pr_perror("Child 2 was killed"); + } else if (WEXITSTATUS(ret)) { + fail_count++; + pr_perror("Child 2 couldn't inititalise"); + } +out_child: + kill(pid1, SIGTERM); + waitpid(pid1, &ret, 0); + if (!WIFEXITED(ret)) { + fail_count++; + pr_perror("Child 1 was killed"); + } else if (WEXITSTATUS(ret)) { + fail_count++; + pr_perror("Child 1 couldn't inititalise"); + } +out_shdt: + shmdt(mem); +out_shm: + shmctl(shm, IPC_RMID, NULL); +out_sem: + semctl(sem, 1, IPC_RMID); + if (fail_count == 0) + pass(); +out: + return 0; +} diff --git a/CRIU_code/test/zdtm/transition/ipc.desc b/CRIU_code/test/zdtm/transition/ipc.desc new file mode 100644 index 0000000..5939706 --- /dev/null +++ b/CRIU_code/test/zdtm/transition/ipc.desc @@ -0,0 +1 @@ +{'flavor': 'ns'} diff --git a/CRIU_code/test/zdtm/transition/lazy-thp.c b/CRIU_code/test/zdtm/transition/lazy-thp.c new file mode 100644 index 0000000..a0cf330 --- /dev/null +++ b/CRIU_code/test/zdtm/transition/lazy-thp.c @@ -0,0 +1,63 @@ +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +#define N_PAGES 1024 + +const char *test_doc = "Test interaction between THP and lazy-pages"; + +/* The test is based on example by Adrian Reber */ +const char *test_author = "Mike Rapoport "; + +int main(int argc, char ** argv) +{ + char *mem, *org, *m; + int count; + + test_init(argc, argv); + + /* we presume that malloc returns not page aliged address */ + mem = malloc(PAGE_SIZE * N_PAGES); + org = malloc(PAGE_SIZE); + if (!mem || !org) { + fail("malloc failed\n"); + exit(1); + } + + memset(mem, 0x42, PAGE_SIZE * N_PAGES); + memset(org, 0x42, PAGE_SIZE); + + test_daemon(); + while (test_go()) { + for (count = 0; count < N_PAGES; count += 2) { + m = mem + (count * PAGE_SIZE) + 128; + *m = count; + } + + for (count = 0; count < N_PAGES; count++) { + m = mem+(count*PAGE_SIZE); + org[128] = (count % 2 == 0) ? count : 0x42; + + if (memcmp(org, m, PAGE_SIZE)) { + fail("memory corruption\n"); + return 1; + } + } + + sleep(1); + } + + pass(); + free(org); + free(mem); + return 0; +} diff --git a/CRIU_code/test/zdtm/transition/maps007.c b/CRIU_code/test/zdtm/transition/maps007.c new file mode 100644 index 0000000..ee5e7c7 --- /dev/null +++ b/CRIU_code/test/zdtm/transition/maps007.c @@ -0,0 +1,178 @@ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" +#include "lock.h" + +#define MAP_SIZE (1UL << 20) +#define MEM_SIZE (1UL << 29) + +const char *test_doc = "create random mappings and touch memory"; + +int sys_process_vm_readv(pid_t pid, void *addr, void *buf, int size) +{ + struct iovec lvec = {.iov_base = buf, .iov_len = size }; + struct iovec rvec = {.iov_base = addr, .iov_len = size }; + /* workaround bug in glibc with sixth argument of syscall */ + char nop[PAGE_SIZE]; + + memset(nop, 0, sizeof(nop)); + + return syscall(__NR_process_vm_readv, pid, &lvec, 1, &rvec, 1, 0); +} + +/* The child 
follows the parents two steps behind. */ +#define MAX_DELTA 1000 +int main(int argc, char **argv) +{ + void *start, *end, *p; + pid_t child; + struct { + futex_t delta; + futex_t stop; + } *shm; + uint32_t v; + unsigned long long count = 0; + int i; + + test_init(argc, argv); + + /* shared memory for synchronization */ + shm = mmap(NULL, PAGE_SIZE, PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_SHARED, -1, 0); + if (shm == MAP_FAILED) + return -1; + + /* allocate workspace */ + start = mmap(NULL, MEM_SIZE, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (start == MAP_FAILED) + return -1; + + test_msg("%p-%p\n", start, start + MEM_SIZE); + + end = start + MEM_SIZE; + + v = 0; + futex_set(&shm->delta, v); + futex_set(&shm->stop, 0); + + child = fork(); + if (child < 0) { + pr_perror("fork"); + return 1; + } + + while (1) { + void *ret; + unsigned long size; + int prot = PROT_NONE; + + if (child) { + if (!test_go()) + break; + futex_wait_while_gt(&shm->delta, 2 * MAX_DELTA); + futex_inc_and_wake(&shm->delta); + } else { + if (!futex_get(&shm->stop)) + /* shm->delta must be always bigger than MAX_DELTA */ + futex_wait_while_lt(&shm->delta, MAX_DELTA + 2); + else if (count % 100 == 0) + test_msg("count %llu delta %d\n", + count, futex_get(&shm->delta)); /* heartbeat */ + + if (futex_get(&shm->stop) && atomic_get(&shm->delta.raw) == MAX_DELTA) + break; + futex_dec_and_wake(&shm->delta); + } + + count++; + if (child && count == MAX_DELTA + 1) + test_daemon(); + + p = start + ((lrand48() * PAGE_SIZE) % MEM_SIZE); + size = lrand48() * PAGE_SIZE; + size %= (end - p); + size %= MAP_SIZE; + if (size == 0) + size = PAGE_SIZE; + + if (lrand48() % 2) + prot |= PROT_READ; + if (lrand48() % 2) + prot |= PROT_EXEC; + if (lrand48() % 2) + prot |= PROT_WRITE; + + ret = mmap(p, size, prot, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + if (ret == MAP_FAILED) { + pr_perror("%p-%p", p, p + size); + goto err; + } + + if (!(prot & PROT_WRITE)) + continue; + + for (i = 0; i < 
lrand48() % 50; i++) { + char *t = p + (lrand48() * PAGE_SIZE) % (size); + t[0] = lrand48(); + } + } + test_msg("count %llu\n", count); + + if (child == 0) { + if (!test_go()) + pr_perror("unexpected state"); + futex_set_and_wake(&shm->stop, 2); + test_waitsig(); + return 0; + } else { + int readable = 0, status = -1; + + /* stop the child */ + futex_set(&shm->stop, 1); + futex_add_and_wake(&shm->delta, MAX_DELTA); + /* wait until the child will be in the same point */ + futex_wait_until(&shm->stop, 2); + + /* check that child and parent have the identical content of memory */ + for (p = start; p < end; p += PAGE_SIZE) { + char rbuf[PAGE_SIZE], lbuf[PAGE_SIZE]; + int rret, lret; + + lret = sys_process_vm_readv(getpid(), p, lbuf, PAGE_SIZE); + rret = sys_process_vm_readv(child, p, rbuf, PAGE_SIZE); + if (rret != lret) { + pr_perror("%p %d %d", p, lret, rret); + goto err; + } + if (lret < 0) + continue; + readable++; + if (memcmp(rbuf, lbuf, PAGE_SIZE)) { + pr_perror("%p", p); + goto err; + } + } + test_msg("readable %d\n", readable); + kill(child, SIGTERM); + wait(&status); + if (status != 0) { + pr_perror("Non-zero exit code: %d", status); + goto err; + } + pass(); + } + + return 0; +err: + kill(child, SIGSEGV); + *((volatile int *) 0) = 0; + return 1; +} diff --git a/CRIU_code/test/zdtm/transition/maps007.desc b/CRIU_code/test/zdtm/transition/maps007.desc new file mode 100644 index 0000000..2eac7e6 --- /dev/null +++ b/CRIU_code/test/zdtm/transition/maps007.desc @@ -0,0 +1 @@ +{'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/transition/maps008.c b/CRIU_code/test/zdtm/transition/maps008.c new file mode 100644 index 0000000..5f6eb08 --- /dev/null +++ b/CRIU_code/test/zdtm/transition/maps008.c @@ -0,0 +1,512 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" +#include "lock.h" + +const char *test_doc = "ps tree with anon shared vmas for dedup"; + +/* + * 1. 
ps tree with non triavial anon shmem vmas is created first. + * 2. Each process gets its portion of shmem vmas. + * 3. Each process continuously datagens its portion until + * criu dump is finished. + * 4. Each process datachecks all its shmem portions after restore. + * 5. Contents of anon shmem vmas are checked for equality in + * different processes. + */ + +typedef int (*proc_func_t)(task_waiter_t *setup_waiter); + +static pid_t fork_and_setup(proc_func_t pfunc) +{ + task_waiter_t setup_waiter; + pid_t pid; + + task_waiter_init(&setup_waiter); + pid = test_fork(); + if (pid < 0) { + pr_perror("fork failed"); + exit(1); + } + + if (pid == 0) + exit(pfunc(&setup_waiter)); + + task_waiter_wait4(&setup_waiter, pid); + task_waiter_fini(&setup_waiter); + return pid; +} + +static void cont_and_wait_child(pid_t pid) +{ + int status; + + kill(pid, SIGTERM); + waitpid(pid, &status, 0); + if (WIFEXITED(status)) { + if (WEXITSTATUS(status)) + exit(WEXITSTATUS(status)); + } else + exit(1); +} + +static void *mmap_ashmem(size_t size) +{ + void *mem = mmap(NULL, size, PROT_WRITE | PROT_READ, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { + pr_perror("Can't map shmem %zx", size); + exit(1); + } + return mem; +} + +static void *mmap_proc_mem(pid_t pid, unsigned long addr, + unsigned long size) +{ + int fd; + void *mem; + char path[PATH_MAX]; + + snprintf(path, PATH_MAX, "/proc/%d/map_files/%lx-%lx", + (int)pid, addr, addr + size); + fd = open(path, O_RDWR); + if (fd == -1) { + pr_perror("Can't open file %s", path); + exit(1); + } + + mem = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0); + close(fd); + if (mem == MAP_FAILED) { + pr_perror("Can't map file %s", path); + exit(1); + } + return mem; +} + +static void check_mem_eq(void *addr1, size_t size1, void *addr2, size_t size2) +{ + unsigned long min_size = size1 < size2 ? 
size1 : size2; + + if (memcmp(addr1, addr2, min_size)) { + pr_err("Mem differs %lx %lx %lx", (unsigned long)addr1, + (unsigned long)addr2, min_size); + exit(1); + } +} + +static void xmunmap(void *map, size_t size) +{ + if (munmap(map, size)) { + pr_err("xmunmap"); + exit(1); + } +} + +static void chk_proc_mem_eq(pid_t pid1, void *addr1, unsigned long size1, + pid_t pid2, void *addr2, unsigned long size2) +{ + void *map1, *map2; + + map1 = mmap_proc_mem(pid1, (unsigned long)addr1, size1); + map2 = mmap_proc_mem(pid2, (unsigned long)addr2, size2); + check_mem_eq(map1, size1, map2, size2); + xmunmap(map1, size1); + xmunmap(map2, size2); +} + +/* + * ps tree: + * proc1_______________ + * | | | + * proc11___ proc12 proc13 + * | | | + * proc111 proc112 proc131 + */ +#define PROC_CNT 7 + +#define PROC1_PGIX 0 +#define PROC11_PGIX 1 +#define PROC12_PGIX 2 +#define PROC13_PGIX 3 +#define PROC111_PGIX 4 +#define PROC112_PGIX 5 +#define PROC131_PGIX 6 +#define ZERO_PGIX 7 +/* unused pgix: 8 */ +#define MEM_PERIOD (9 * PAGE_SIZE) + +struct pstree { + pid_t proc1; + pid_t proc11; + pid_t proc12; + pid_t proc13; + pid_t proc111; + pid_t proc112; + pid_t proc131; +}; +struct pstree *pstree; + +struct test_sync { + futex_t datagen; + futex_t datagen_exit_cnt; +}; +struct test_sync *test_sync; + +size_t mem1_size, mem2_size, mem3_size; +uint8_t *mem1, *mem2, *mem3; + +#define CRC_EPOCH_OFFSET (PAGE_SIZE - sizeof(uint32_t)) + +static void read_each_pg(volatile uint8_t *mem, size_t size, size_t off) +{ + if (!mem) + return; + + while (off < size) { + (mem + off)[0]; + off += MEM_PERIOD; + } +} + +void datagen_each_pg(uint8_t *mem, size_t size, size_t off, uint32_t crc_epoch) +{ + if (!mem) + return; + + while (futex_get(&test_sync->datagen) && (off < size)) { + uint32_t crc = crc_epoch; + + datagen(mem + off, CRC_EPOCH_OFFSET, &crc); + *(uint32_t *)(mem + off + CRC_EPOCH_OFFSET) = crc_epoch; + off += MEM_PERIOD; + } +} + +void datachck_each_pg(uint8_t *mem, size_t size, size_t off) 
+{ + if (!mem) + return; + + while (off < size) { + uint32_t crc = *(uint32_t *)(mem + off + CRC_EPOCH_OFFSET); + + if (datachk(mem + off, CRC_EPOCH_OFFSET, &crc)) + exit(1); + off += MEM_PERIOD; + } +} + +static void mems_read_each_pgix(size_t pgix) +{ + const size_t off = pgix * PAGE_SIZE; + + read_each_pg(mem1, mem1_size, off); + read_each_pg(mem2, mem2_size, off); + read_each_pg(mem3, mem3_size, off); +} + +static void mems_datagen_each_pgix(size_t pgix, uint32_t *crc_epoch) +{ + const size_t off = pgix * PAGE_SIZE; + + ++(*crc_epoch); + datagen_each_pg(mem1, mem1_size, off, *crc_epoch); + datagen_each_pg(mem2, mem2_size, off, *crc_epoch); + datagen_each_pg(mem3, mem3_size, off, *crc_epoch); +} + +static void mems_datachck_each_pgix(size_t pgix) +{ + const size_t off = pgix * PAGE_SIZE; + + datachck_each_pg(mem1, mem1_size, off); + datachck_each_pg(mem2, mem2_size, off); + datachck_each_pg(mem3, mem3_size, off); +} + +static int proc131_func(task_waiter_t *setup_waiter) +{ + uint32_t crc_epoch = 0; + + pstree->proc131 = getpid(); + mems_datagen_each_pgix(PROC131_PGIX, &crc_epoch); + task_waiter_complete_current(setup_waiter); + + while (futex_get(&test_sync->datagen)) + mems_datagen_each_pgix(PROC131_PGIX, &crc_epoch); + futex_inc_and_wake(&test_sync->datagen_exit_cnt); + test_waitsig(); + + mems_datachck_each_pgix(PROC131_PGIX); + return 0; +} + +static int proc13_func(task_waiter_t *setup_waiter) +{ + size_t MEM1_HOLE_START = 2 * MEM_PERIOD; + size_t MEM1_HOLE_SIZE = 1 * MEM_PERIOD; + uint32_t crc_epoch = 0; + + pstree->proc13 = getpid(); + xmunmap(mem1 + MEM1_HOLE_START, MEM1_HOLE_SIZE); + xmunmap(mem2, mem2_size); + xmunmap(mem3, mem3_size); + mem2 = mem1 + MEM1_HOLE_START + MEM1_HOLE_SIZE; + mem2_size = mem1_size - (mem2 - mem1); + mem1_size = MEM1_HOLE_START; + mem3 = mmap_ashmem(mem3_size); + mems_datagen_each_pgix(PROC13_PGIX, &crc_epoch); + fork_and_setup(proc131_func); + task_waiter_complete_current(setup_waiter); + + while 
(futex_get(&test_sync->datagen)) + mems_datagen_each_pgix(PROC13_PGIX, &crc_epoch); + futex_inc_and_wake(&test_sync->datagen_exit_cnt); + test_waitsig(); + + mems_datachck_each_pgix(PROC13_PGIX); + + chk_proc_mem_eq(pstree->proc13, mem1, mem1_size, + pstree->proc131, mem1, mem1_size); + chk_proc_mem_eq(pstree->proc13, mem2, mem2_size, + pstree->proc131, mem2, mem2_size); + chk_proc_mem_eq(pstree->proc13, mem3, mem3_size, + pstree->proc131, mem3, mem3_size); + + cont_and_wait_child(pstree->proc131); + return 0; +} + +static int proc12_func(task_waiter_t *setup_waiter) +{ + uint32_t crc_epoch = 0; + + pstree->proc12 = getpid(); + mems_datagen_each_pgix(PROC12_PGIX, &crc_epoch); + task_waiter_complete_current(setup_waiter); + + while (futex_get(&test_sync->datagen)) + mems_datagen_each_pgix(PROC12_PGIX, &crc_epoch); + futex_inc_and_wake(&test_sync->datagen_exit_cnt); + test_waitsig(); + + mems_datachck_each_pgix(PROC12_PGIX); + + return 0; +} + +static int proc111_func(task_waiter_t *setup_waiter) +{ + uint32_t crc_epoch = 0; + + pstree->proc111 = getpid(); + mems_datagen_each_pgix(PROC111_PGIX, &crc_epoch); + task_waiter_complete_current(setup_waiter); + + while (futex_get(&test_sync->datagen)) + mems_datagen_each_pgix(PROC111_PGIX, &crc_epoch); + futex_inc_and_wake(&test_sync->datagen_exit_cnt); + test_waitsig(); + + mems_datachck_each_pgix(PROC111_PGIX); + return 0; +} + +static int proc112_func(task_waiter_t *setup_waiter) +{ + uint32_t crc_epoch = 0; + + pstree->proc112 = getpid(); + mems_datagen_each_pgix(PROC112_PGIX, &crc_epoch); + task_waiter_complete_current(setup_waiter); + + while (futex_get(&test_sync->datagen)) + mems_datagen_each_pgix(PROC112_PGIX, &crc_epoch); + futex_inc_and_wake(&test_sync->datagen_exit_cnt); + test_waitsig(); + + mems_datachck_each_pgix(PROC112_PGIX); + return 0; +} + +static int proc11_func(task_waiter_t *setup_waiter) +{ + const size_t MEM3_START_CUT = 1 * MEM_PERIOD; + const size_t MEM3_END_CUT = 2 * MEM_PERIOD; + void *mem3_old 
= mem3; + size_t mem3_size_old = mem3_size; + uint32_t crc_epoch = 0; + + pstree->proc11 = getpid(); + xmunmap(mem3, MEM3_START_CUT); + mem3 += MEM3_START_CUT; + mem3_size -= MEM3_START_CUT; + fork_and_setup(proc111_func); + fork_and_setup(proc112_func); + xmunmap(mem3 + mem3_size - MEM3_END_CUT, MEM3_END_CUT); + mem3_size -= MEM3_END_CUT; + mems_datagen_each_pgix(PROC11_PGIX, &crc_epoch); + task_waiter_complete_current(setup_waiter); + + while (futex_get(&test_sync->datagen)) + mems_datagen_each_pgix(PROC11_PGIX, &crc_epoch); + futex_inc_and_wake(&test_sync->datagen_exit_cnt); + test_waitsig(); + + mems_datachck_each_pgix(PROC11_PGIX); + + chk_proc_mem_eq(pstree->proc11, mem1, mem1_size, + pstree->proc111, mem1, mem1_size); + chk_proc_mem_eq(pstree->proc11, mem1, mem1_size, + pstree->proc112, mem1, mem1_size); + + chk_proc_mem_eq(pstree->proc11, mem2, mem2_size, + pstree->proc111, mem2, mem2_size); + chk_proc_mem_eq(pstree->proc11, mem2, mem2_size, + pstree->proc112, mem2, mem2_size); + + chk_proc_mem_eq(pstree->proc11, mem3, mem3_size, + pstree->proc111, mem3, mem3_size + MEM3_END_CUT); + chk_proc_mem_eq(pstree->proc11, mem3, mem3_size, + pstree->proc112, mem3, mem3_size + MEM3_END_CUT); + + uint8_t *proc1_mem3 = mmap_proc_mem(pstree->proc1, + (unsigned long)mem3_old, mem3_size_old); + check_mem_eq(mem3, mem3_size, proc1_mem3 + MEM3_START_CUT, mem3_size); + xmunmap(proc1_mem3, mem3_size_old); + + cont_and_wait_child(pstree->proc111); + cont_and_wait_child(pstree->proc112); + return 0; +} + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MB(n) ((n) * (1UL << 20)) + +static int proc1_func(void) +{ + uint32_t crc_epoch = 0; + uint8_t *mem2_old = NULL; + + /* + * Min mem size: + * At least 5 mem periods for mem pages and vma holes. + * At least 1 MB mem size not to test on tiny working set. 
+ */ + mem1_size = MEM_PERIOD * MAX(5, MB(1) / MEM_PERIOD + 1); + mem2_size = mem1_size * 2; + mem3_size = mem2_size * 3; + + futex_set(&test_sync->datagen, 1); + pstree->proc1 = getpid(); + mem1 = mmap_ashmem(mem1_size); + mem2 = mmap_ashmem(mem2_size); + mem3 = mmap_ashmem(mem3_size); + mems_datagen_each_pgix(PROC1_PGIX, &crc_epoch); + mems_read_each_pgix(ZERO_PGIX); + + fork_and_setup(proc11_func); + fork_and_setup(proc12_func); + fork_and_setup(proc13_func); + + xmunmap(mem1, mem1_size); + if (mremap(mem2, mem2_size, mem1_size, MREMAP_MAYMOVE | MREMAP_FIXED, + mem1) != mem1) { + pr_perror("proc1 mem2 remap"); + exit(1); + } + mem2_old = mem2; + mem2 = NULL; + + test_daemon(); + while (test_go()) + mems_datagen_each_pgix(PROC1_PGIX, &crc_epoch); + test_waitsig(); + futex_set(&test_sync->datagen_exit_cnt, 0); + futex_set(&test_sync->datagen, 0); + futex_wait_while(&test_sync->datagen_exit_cnt, PROC_CNT); + + mems_datachck_each_pgix(PROC1_PGIX); + + chk_proc_mem_eq(pstree->proc1, mem1, mem1_size, + pstree->proc11, mem2_old, mem2_size); + chk_proc_mem_eq(pstree->proc1, mem1, mem1_size, + pstree->proc12, mem2_old, mem2_size); + + chk_proc_mem_eq(pstree->proc1, mem3, mem3_size, + pstree->proc12, mem3, mem3_size); + + cont_and_wait_child(pstree->proc11); + cont_and_wait_child(pstree->proc12); + cont_and_wait_child(pstree->proc13); + + pass(); + return 0; +} + +static void kill_pstree_from_root(void) +{ + if (getpid() != pstree->proc1) + return; + + kill(pstree->proc11, SIGKILL); + kill(pstree->proc12, SIGKILL); + kill(pstree->proc13, SIGKILL); + kill(pstree->proc111, SIGKILL); + kill(pstree->proc112, SIGKILL); + kill(pstree->proc131, SIGKILL); +} + +static void sigchld_hand(int signo, siginfo_t *info, void *ucontext) +{ + if (info->si_code != CLD_EXITED) + return; + if (!info->si_status) + return; + + /* + * If we are not ps tree root then propagate child error to parent. 
+ * If we are ps tree root then also call all + * atexit handlers set up by zdtm test framework and this test. + * exit() is not async signal safe but it's ok for testing purposes. + * exit() usage allows us to use very simple error handling + * and pstree killing logic. + */ + exit(info->si_status); +} + +int main(int argc, char **argv) +{ + test_init(argc, argv); + + pstree = (struct pstree *)mmap_ashmem(PAGE_SIZE); + test_sync = (struct test_sync *)mmap_ashmem(sizeof(*test_sync)); + + struct sigaction sa = { + .sa_sigaction = sigchld_hand, + .sa_flags = SA_RESTART | SA_SIGINFO | SA_NOCLDSTOP + }; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGCHLD, &sa, NULL)) { + pr_perror("SIGCHLD handler setup"); + exit(1); + }; + + if (atexit(kill_pstree_from_root)) { + pr_err("Can't setup atexit cleanup func"); + exit(1); + } + return proc1_func(); +} diff --git a/CRIU_code/test/zdtm/transition/maps008.desc b/CRIU_code/test/zdtm/transition/maps008.desc new file mode 100644 index 0000000..fa2c82d --- /dev/null +++ b/CRIU_code/test/zdtm/transition/maps008.desc @@ -0,0 +1 @@ +{'flavor': 'h', 'flags': 'suid'} diff --git a/CRIU_code/test/zdtm/transition/netlink00.c b/CRIU_code/test/zdtm/transition/netlink00.c new file mode 100644 index 0000000..c9b2303 --- /dev/null +++ b/CRIU_code/test/zdtm/transition/netlink00.c @@ -0,0 +1,305 @@ +/* Description: testcase for netlink sockets migration. + * e.g. 
+ * ip rule show + * ip rule add + * ip rule show + * ip rule del + * in a loop + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +#undef DEBUG +//#define DEBUG + +const char *test_doc = "Netlink socket loop"; +const char *test_author = "Andrew Vagin (avagin@parallels.com)"; + +//buffer to hold the RTNETLINK request +struct { + struct nlmsghdr nl; + struct rtmsg rt; + char buf[8192]; +} req; + +// variables used for +// socket communications +int fd; +struct sockaddr_nl la; +struct sockaddr_nl pa; +struct msghdr msg; +struct iovec iov; +int rtn; +// buffer to hold the RTNETLINK reply(ies) +char buf[8192]; +char dsts[24] = "192.168.0.255"; +int pn = 32;//network prefix + +// RTNETLINK message pointers & lengths +// used when processing messages +struct nlmsghdr *nlp; +int nll; +struct rtmsg *rtp; +int rtl; +struct rtattr *rtap; + +int send_request(); +int recv_reply(); +int form_request_add(); +int form_request_del(); +int read_reply(); +typedef int (*cmd_t)(); +#define CMD_NUM 2 +cmd_t cmd[CMD_NUM]={form_request_add, form_request_del}; + + +int main(int argc, char *argv[]) +{ + int i; + + test_init(argc, argv); + + fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (fd<0){ + pr_perror("socket"); + goto out; + } + // setup local address & bind using + // this address + bzero(&la, sizeof(la)); + la.nl_family = AF_NETLINK; + la.nl_pid = getpid(); + if (bind(fd, (struct sockaddr*) &la, sizeof(la))){ + pr_perror("bind failed"); + goto out; + } + //Preparation: + form_request_del(); + send_request(); + recv_reply(); + + test_daemon(); + + while (test_go()){ + for (i=0; i < CMD_NUM; i++){ + cmd[i](); + if (send_request() < 0){ + fail("send_request failed"); + goto out; + }; + if (recv_reply() < 0){ + fail("RTNETLINK answers: %m"); + goto out; + }; + +#ifdef DEBUG + if (read_reply() < 0){ + fail("read_reply failed"); + goto out; + } +#endif + } + } + + 
pass(); + +out: + return 0; +} + +int send_request() +{ + // create the remote address + // to communicate + bzero(&pa, sizeof(pa)); + pa.nl_family = AF_NETLINK; + // initialize & create the struct msghdr supplied + // to the sendmsg() function + bzero(&msg, sizeof(msg)); + msg.msg_name = (void *) &pa; + msg.msg_namelen = sizeof(pa); + // place the pointer & size of the RTNETLINK + // message in the struct msghdr + iov.iov_base = (void *) &req.nl; + iov.iov_len = req.nl.nlmsg_len; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + // send the RTNETLINK message to kernel + rtn = sendmsg(fd, &msg, 0); + if (rtn<0){ + pr_perror("sendmsg failed"); + return -1; + } + return 0; +} +int recv_reply() +{ + char *p; + // initialize the socket read buffer + bzero(buf, sizeof(buf)); + p = buf; + nll = 0; + // read from the socket until the NLMSG_DONE is + // returned in the type of the RTNETLINK message + // or if it was a monitoring socket + while(1) { + rtn = recv(fd, p, sizeof(buf) - nll, 0); + if (rtn < 0) { + pr_perror("recv failed"); + return -1; + } + + if (rtn == 0) { + pr_err("EOF on netlink\n"); + return -1; + } + + nlp = (struct nlmsghdr *) p; + if(nlp->nlmsg_type == NLMSG_DONE) + return 0; + if (nlp->nlmsg_type == NLMSG_ERROR) { + struct nlmsgerr *err = (struct nlmsgerr*)NLMSG_DATA(nlp); + errno=-err->error; + if (errno) { + return -1; + } + return 0; + } + // increment the buffer pointer to place + // next message + p += rtn; + // increment the total size by the size of + // the last received message + nll += rtn; + if((la.nl_groups & RTMGRP_IPV4_ROUTE) + == RTMGRP_IPV4_ROUTE) + break; + } + return 0; +} + +int read_reply() +{ + //string to hold content of the route + // table (i.e. 
one entry) + char dsts[24], gws[24], ifs[16], ms[24]; + // outer loop: loops thru all the NETLINK + // headers that also include the route entry + // header + nlp = (struct nlmsghdr *) buf; + for(; NLMSG_OK(nlp, nll); nlp = NLMSG_NEXT(nlp, nll)) + { + // get route entry header + rtp = (struct rtmsg *) NLMSG_DATA(nlp); + // we are only concerned about the + // main route table + if(rtp->rtm_table != RT_TABLE_MAIN) + continue; + // init all the strings + bzero(dsts, sizeof(dsts)); + bzero(gws, sizeof(gws)); + bzero(ifs, sizeof(ifs)); + bzero(ms, sizeof(ms)); + // inner loop: loop thru all the attributes of + // one route entry + rtap = (struct rtattr *) RTM_RTA(rtp); + rtl = RTM_PAYLOAD(nlp); + for( ; RTA_OK(rtap, rtl); rtap = RTA_NEXT(rtap,rtl)) + { + switch(rtap->rta_type) + { + // destination IPv4 address + case RTA_DST: + inet_ntop(AF_INET, RTA_DATA(rtap), + dsts, 24); + break; + // next hop IPv4 address + case RTA_GATEWAY: + inet_ntop(AF_INET, RTA_DATA(rtap), + gws, 24); + break; + // unique ID associated with the network + // interface + case RTA_OIF: + sprintf(ifs, "%d", + *((int *) RTA_DATA(rtap))); + default: + break; + } + } + sprintf(ms, "%d", rtp->rtm_dst_len); + test_msg("dst %s/%s gw %s if %s\n", + dsts, ms, gws, ifs); + } + return 0; +} + +#define NLMSG_TAIL(nmsg) \ + ((struct rtattr *) (((void *) (nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len))) + +int form_request_del() +{ + bzero(&req, sizeof(req)); + req.nl.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)); + + rtap = NLMSG_TAIL(&req.nl); + rtap->rta_type = RTA_DST; + rtap->rta_len = RTA_LENGTH(4); + inet_pton(AF_INET, dsts, + ((char *)rtap) + sizeof(struct rtattr)); + req.nl.nlmsg_len = NLMSG_ALIGN(req.nl.nlmsg_len) + RTA_ALIGN(rtap->rta_len); + req.nl.nlmsg_flags = NLM_F_CREATE | NLM_F_ACK | NLM_F_REQUEST; + req.nl.nlmsg_type = RTM_DELROUTE; + req.rt.rtm_family = AF_INET; + req.rt.rtm_table = RT_TABLE_MAIN; + req.rt.rtm_protocol = RTPROT_STATIC; + req.rt.rtm_scope = RT_SCOPE_UNIVERSE; + req.rt.rtm_type = 
RTN_UNICAST; + req.rt.rtm_dst_len = pn; + return 0; +} + +int form_request_add() +{ + int ifcn = 1; //interface number + + bzero(&req, sizeof(req)); + req.nl.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)); + rtap = NLMSG_TAIL(&req.nl); + rtap->rta_type = RTA_DST; + rtap->rta_len = RTA_LENGTH(4); + inet_pton(AF_INET, dsts, + ((char *)rtap) + sizeof(struct rtattr)); + req.nl.nlmsg_len = NLMSG_ALIGN(req.nl.nlmsg_len) + RTA_ALIGN(rtap->rta_len); + + rtap = NLMSG_TAIL(&req.nl); + rtap->rta_type = RTA_OIF;//Output interface index + rtap->rta_len = RTA_LENGTH(sizeof(int)); + memcpy(((char *)rtap) + sizeof(struct rtattr), + &ifcn, sizeof(int)); + + req.nl.nlmsg_len = NLMSG_ALIGN(req.nl.nlmsg_len) + RTA_ALIGN(rtap->rta_len); + req.nl.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_ACK; + req.nl.nlmsg_type = RTM_NEWROUTE; + + req.rt.rtm_family = AF_INET; + req.rt.rtm_table = RT_TABLE_MAIN; + req.rt.rtm_protocol = RTPROT_STATIC; + req.rt.rtm_scope = RT_SCOPE_UNIVERSE; + req.rt.rtm_type = RTN_UNICAST; + req.rt.rtm_dst_len = pn; + return 0; +} diff --git a/CRIU_code/test/zdtm/transition/netlink00.desc b/CRIU_code/test/zdtm/transition/netlink00.desc new file mode 100644 index 0000000..95c58b4 --- /dev/null +++ b/CRIU_code/test/zdtm/transition/netlink00.desc @@ -0,0 +1 @@ +{'flags': 'noauto'} diff --git a/CRIU_code/test/zdtm/transition/pid_reuse.c b/CRIU_code/test/zdtm/transition/pid_reuse.c new file mode 100644 index 0000000..f8f9230 --- /dev/null +++ b/CRIU_code/test/zdtm/transition/pid_reuse.c @@ -0,0 +1,116 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Tests that forking tasks with same pid does not break iterative dump"; +const char *test_author = "Pavel Tikhomirov "; + +enum { + VALUE_A = 1, + VALUE_B = 2, +}; + +#define CHILD_NS_PID 11235 + +static int set_ns_next_pid(pid_t pid) +{ + char buf[32]; + int len, fd; + + fd = open("/proc/sys/kernel/ns_last_pid", O_WRONLY); + if (fd < 0) { + pr_perror("Failed to open ns_last_pid"); + 
return -1; + } + + len = snprintf(buf, sizeof(buf), "%d", pid - 1); + len -= write(fd, buf, len); + if (len) + pr_perror("Can't set ns_last_pid"); + close(fd); + + return len ? -1 : 0; +} + +int main(int argc, char **argv) +{ + int pid, wpid, status; + bool overwrite = true; + bool wait = true; + int *variable; + void *mem; + + test_init(argc, argv); + + mem = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (mem == MAP_FAILED) { + pr_perror("Can't mmap memory region"); + return 1; + } + + variable = (int *)mem; + *variable = VALUE_A; + + test_daemon(); + + while (wait) { + if (set_ns_next_pid(CHILD_NS_PID)) + return 1; + + pid = fork(); + if (pid == -1) { + pr_perror("fork"); + return 1; + } else if (pid == 0) { + if (overwrite) + *variable = VALUE_B; + + /* Reuse test_waitsig to wait SIGTERM from parent */ + test_waitsig(); + + if (*variable != (overwrite ? VALUE_B : VALUE_A)) { + pr_err("Wrong value in a variable after restore\n"); + exit(1); + } + exit(0); + } + + if (pid != CHILD_NS_PID) { + pr_err("Child started with wrong pid %d (expected %d)\n", pid, CHILD_NS_PID); + kill(pid, SIGKILL); + waitpid(pid, NULL, 0); + return 1; + } + + /* Notify we are ready for a next pre-dump/dump */ + if (!overwrite) + test_wait_pre_dump_ack(); + + /* Wait for next pre-dump/dump finish */ + if (test_wait_pre_dump()) + wait = false; + + if (kill(pid, SIGTERM)) { + pr_perror("kill"); + return 1; + } + + wpid = waitpid(pid, &status, 0); + if (wpid <= 0) { + pr_perror("waitpid"); + return 1; + } + + if (status) { + fail("Task %d died with exit status %x", wpid, status); + return 1; + } + + overwrite = false; + } + + pass(); + return 0; +} diff --git a/CRIU_code/test/zdtm/transition/pid_reuse.desc b/CRIU_code/test/zdtm/transition/pid_reuse.desc new file mode 100644 index 0000000..474f44e --- /dev/null +++ b/CRIU_code/test/zdtm/transition/pid_reuse.desc @@ -0,0 +1 @@ +{'flags': 'suid pre-dump-notify'} diff --git 
a/CRIU_code/test/zdtm/transition/pipe_loop00.c b/CRIU_code/test/zdtm/transition/pipe_loop00.c new file mode 100644 index 0000000..01964c6 --- /dev/null +++ b/CRIU_code/test/zdtm/transition/pipe_loop00.c @@ -0,0 +1,176 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Multi-process pipe loop"; +const char *test_author = "Pavel Emelianov "; + +#define PROCS_DEF 4 +#define PROCS_MAX 64 +unsigned int num_procs = PROCS_DEF; +TEST_OPTION(num_procs, uint, "# processes to create " + "(default " __stringify(PROCS_DEF) + ", max " __stringify(PROCS_MAX) ")", 0); + +volatile sig_atomic_t num_exited = 0; +void inc_num_exited(int signo) +{ + num_exited++; +} + +int main(int argc, char **argv) +{ + int ret = 0; + pid_t pid; + int i; + uint8_t buf[0x100000]; + int pipes[PROCS_MAX * 2]; + int in, out; + + test_init(argc, argv); + + if (num_procs > PROCS_MAX) { + pr_err("%d processes is too many: max = %d\n", num_procs, PROCS_MAX); + exit(1); + } + + for (i = 0; i < num_procs; i++) + if (pipe(pipes + i * 2)) { + pr_perror("Can't create pipes"); + exit(1); + } + + if (signal(SIGCHLD, inc_num_exited) == SIG_ERR) { + pr_perror("can't set SIGCHLD handler"); + exit(1); + } + + for (i = 1; i < num_procs; i++) { /* i = 0 - parent */ + pid = test_fork(); + if (pid < 0) { + pr_perror("Can't fork"); + kill(0, SIGKILL); + exit(1); + } + + if (pid == 0) { + int j; + in = i * 2; + out = in - 1; + for (j = 0; j < num_procs * 2; j++) + if (j != in && j != out) + close(pipes[j]); + + signal(SIGPIPE, SIG_IGN); + if (pipe_in2out(pipes[in], pipes[out], buf, sizeof(buf)) < 0) + /* pass errno as exit code to the parent */ + if (test_go() /* signal NOT delivered */ || + (errno != EINTR && errno != EPIPE)) + ret = errno; + + test_waitsig(); /* even if failed, wait for migration to complete */ + + close(pipes[in]); + close(pipes[out]); + exit(ret); + } + } + + for (i = 1; i < num_procs * 2 - 1; i++) + 
close(pipes[i]); + in = pipes[0]; + out = pipes[num_procs * 2 - 1]; + + /* don't block on writing, _do_ block on reading */ + if (set_nonblock(out,1) < 0) { + pr_perror("setting O_NONBLOCK failed"); + exit(1); + } + + if (num_exited) { + pr_err("Some children died unexpectedly\n"); + kill(0, SIGKILL); + exit(1); + } + + test_daemon(); + + while (test_go()) { + int len, rlen = 0, wlen; + uint8_t rbuf[sizeof(buf)], *p; + + datagen(buf, sizeof(buf), NULL); + wlen = write(out, buf, sizeof(buf)); + if (wlen < 0) { + if (errno == EINTR) + continue; + else { + fail("write failed\n"); + ret = 1; + break; + } + } + + for (p = rbuf, len = wlen; len > 0; p += rlen, len -= rlen) { + rlen = read(in, p, len); + if (rlen <= 0) + break; + } + + if (rlen < 0 && errno == EINTR) + continue; + + if (len > 0) { + fail("read failed: %m\n"); + ret = 1; + break; + } + + if (memcmp(buf, rbuf, wlen)) { + fail("data mismatch\n"); + ret = 1; + break; + } + } + + close(out); + + test_waitsig(); /* even if failed, wait for migration to complete */ + + if (kill(0, SIGTERM)) { + fail("failed to send SIGTERM to my process group: %m\n"); + goto out; /* shouldn't wait() in this case */ + } + + for (i = 1; i < num_procs; i++) { /* i = 0 - parent */ + int chret; + if (wait(&chret) < 0) { + fail("can't wait for a child: %m\n"); + ret = 1; + continue; + } + + chret = WEXITSTATUS(chret); + if (chret) { + fail("child %d exited with non-zero code %d (%s)\n", + i, chret, strerror(chret)); + ret = 1; + continue; + } + } + + if (!ret) + pass(); + +out: + close(in); + return 0; +} diff --git a/CRIU_code/test/zdtm/transition/pipe_shared00.c b/CRIU_code/test/zdtm/transition/pipe_shared00.c new file mode 100644 index 0000000..64708d9 --- /dev/null +++ b/CRIU_code/test/zdtm/transition/pipe_shared00.c @@ -0,0 +1,140 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Multi-process pipe split"; +const char *test_author = 
"Pavel Emelianov "; + +#define PROCS_DEF 4 +#define PROCS_MAX 64 +unsigned int num_procs = PROCS_DEF; +TEST_OPTION(num_procs, uint, "# processes to create " + "(default " __stringify(PROCS_DEF) + ", max " __stringify(PROCS_MAX) ")", 0); + +volatile sig_atomic_t num_exited = 0; +void inc_num_exited(int signo) +{ + num_exited++; +} + +#define SND_CHR 'y' + +int main(int argc, char **argv) +{ + int ret = 0; + pid_t pid; + int i; + uint8_t buf[PIPE_BUF * 100]; + int pipes[2]; + + test_init(argc, argv); + + if (num_procs > PROCS_MAX) { + pr_err("%d processes is too many: max = %d\n", num_procs, PROCS_MAX); + exit(1); + } + + if (pipe(pipes)) { + pr_perror("Can't create pipes"); + exit(1); + } + + if (signal(SIGCHLD, inc_num_exited) == SIG_ERR) { + pr_perror("can't set SIGCHLD handler"); + exit(1); + } + + for (i = 1; i < num_procs; i++) { /* i = 0 - parent */ + pid = test_fork(); + if (pid < 0) { + pr_perror("can't fork"); + kill(0, SIGKILL); + exit(1); + } + + if (pid == 0) { + close(pipes[1]); + + while (test_go()) { + int rlen = read(pipes[0], buf, sizeof(buf)); + if (rlen == 0) + break; + else if (rlen < 0) { + ret = errno; /* pass errno as exit code to the parent */ + break; + } + + for (i = 0; i < rlen && buf[i] == SND_CHR; i++) + ; + if (i < rlen) { + ret = EILSEQ; + break; + } + } + + test_waitsig(); /* even if failed, wait for migration to complete */ + + close(pipes[0]); + exit(ret); + } + } + + close(pipes[0]); + + if (num_exited) { + pr_err("Some children died unexpectedly\n"); + kill(0, SIGKILL); + exit(1); + } + + test_daemon(); + + memset(buf, SND_CHR, sizeof(buf)); + while(test_go()) + if (write(pipes[1], buf, sizeof(buf)) < 0 && + (errno != EINTR || test_go())) { /* only SIGTERM may stop us */ + fail("write failed: %m\n"); + ret = 1; + break; + } + close(pipes[1]); + + test_waitsig(); /* even if failed, wait for migration to complete */ + + if (kill(0, SIGTERM)) { + fail("failed to send SIGTERM to my process group: %m\n"); + goto out; /* shouldn't 
wait() in this case */ + } + + for (i = 1; i < num_procs; i++) { /* i = 0 - parent */ + int chret; + if (wait(&chret) < 0) { + fail("can't wait for a child: %m\n"); + ret = 1; + continue; + } + + chret = WEXITSTATUS(chret); + if (chret) { + fail("child exited with non-zero code %d (%s)\n", + chret, strerror(chret)); + ret = 1; + continue; + } + } + + if (!ret) + pass(); + +out: + return 0; +} diff --git a/CRIU_code/test/zdtm/transition/ptrace.c b/CRIU_code/test/zdtm/transition/ptrace.c new file mode 100644 index 0000000..29e2e01 --- /dev/null +++ b/CRIU_code/test/zdtm/transition/ptrace.c @@ -0,0 +1,123 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Tests that ptraced thread do not escape from tracing"; +const char *test_author = "Pavel Emelianov "; + +#define NR_THREADS 2 +unsigned int nr_threads = NR_THREADS; +TEST_OPTION(nr_threads, uint, "Number of threads", 0); + +static void *thread(void *arg) +{ + *(int *)arg = syscall(SYS_gettid); + while (1) + sleep(1); + return NULL; +} + +int main(int argc, char **argv) +{ + int pid, status, i, stopped; +#define PT_REGS_SIZE 4096 /* big enough for any arch */ +#define PT_REGS_ALIGN 16 /* big enough for any arch */ + char regs[PT_REGS_SIZE] __attribute__((aligned(PT_REGS_ALIGN))); + + int *pids; + + test_init(argc, argv); + + pids = (int *)mmap(NULL, sizeof(int) * nr_threads, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, 0, 0); + if (pids == MAP_FAILED) { + pr_perror("Can't map"); + exit(1); + } + + memset(pids, 0, sizeof(int) * nr_threads); + + pid = fork(); + if (pid < 0) { + pr_perror("Can't fork"); + goto out; + } else if (pid == 0) { + pthread_t pt[nr_threads]; + + for (i = 0; i < nr_threads - 1; i++) { + if (pthread_create(&pt[i], NULL, thread, pids + i)) { + pr_perror("Can't make thread"); + goto out_th; + } + } + thread(pids + i); +out_th: + for (i--; i >=0; i--) { + pthread_kill(pt[i], 
SIGKILL); + pthread_join(pt[i], NULL); + } + return 0; + } + + for (i = 0; i < nr_threads; i++) { + while (pids[i] == 0) + sched_yield(); + if (ptrace(PTRACE_ATTACH, pids[i], (char *)1, NULL) == -1) { + pr_perror("Can't attach"); + goto out_pt; + } + } + + test_daemon(); + + while (test_go()) { + for (i = 0; i < nr_threads; i++) + if (pids[i]) + break; + if (i == nr_threads) + break; + stopped = wait4(-1, &status, __WALL, NULL); + if (stopped == -1) { + pr_perror("Can't wait"); + break; + } + + if (WIFSTOPPED(status)) { + if (ptrace(PTRACE_GETSIGINFO, stopped, NULL, regs)) { + /* FAIL */ + fail("Ptrace won't work"); + break; + } + + for (i = 0; i < nr_threads; i++) + if (pids[i] == stopped) + break; + if (i == nr_threads) + continue; + + pids[i] = 0; + ptrace(PTRACE_DETACH, stopped, (char *)1, NULL); + ptrace(PTRACE_CONT, stopped, (char *)1, NULL); + continue; + } + } + test_waitsig(); + pass(); +out_pt: + kill(pid, SIGKILL); + wait(NULL); +out: + munmap(pids, sizeof(int) * nr_threads); + return 0; +} diff --git a/CRIU_code/test/zdtm/transition/ptrace.desc b/CRIU_code/test/zdtm/transition/ptrace.desc new file mode 100644 index 0000000..95c58b4 --- /dev/null +++ b/CRIU_code/test/zdtm/transition/ptrace.desc @@ -0,0 +1 @@ +{'flags': 'noauto'} diff --git a/CRIU_code/test/zdtm/transition/shmem.c b/CRIU_code/test/zdtm/transition/shmem.c new file mode 100644 index 0000000..56b0f67 --- /dev/null +++ b/CRIU_code/test/zdtm/transition/shmem.c @@ -0,0 +1,81 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_author = "Andrei Vagin "; + +#define MEM_SIZE (1<<25) + +int main(int argc, char **argv) +{ + pid_t pid; + void *addr; + int *sum, status; + long size; + + test_init(argc, argv); + + sum = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); + if (sum == MAP_FAILED) + return 1; + addr = mmap(NULL, MEM_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); + if (addr == 
MAP_FAILED) + return 1; + + pid = fork(); + if (pid < 0) + return 1; + + if (pid == 0) { + int i = 0; + long size = PAGE_SIZE, old_size = MEM_SIZE; + + status = 0; + while (test_go()) { + addr = mremap(addr, old_size, size, MREMAP_MAYMOVE); + + status -= *((int *)(addr + size - PAGE_SIZE)); + + *((int *)(addr + size - PAGE_SIZE)) = i++; + + status += *((int *)(addr + size - PAGE_SIZE)); + + old_size = size; + size += PAGE_SIZE; + if (size > MEM_SIZE) + size = PAGE_SIZE; + } + *sum = status; + return 0; + } + + test_daemon(); + test_waitsig(); + + kill(pid, SIGTERM); + status = -1; + waitpid(pid, &status, 0); + if (status) { + pr_err("The child return non-zero code: %d\n", status); + return 1; + } + + status = 0; + for (size = PAGE_SIZE; size <= MEM_SIZE; size += PAGE_SIZE) { + status += *((int *)(addr + size - PAGE_SIZE)); + } + + if (status != *sum) { + fail("checksum mismatch: %x %x\n", status, *sum); + return 1; + } + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/transition/socket-tcp.c b/CRIU_code/test/zdtm/transition/socket-tcp.c new file mode 100644 index 0000000..772a5dd --- /dev/null +++ b/CRIU_code/test/zdtm/transition/socket-tcp.c @@ -0,0 +1 @@ +../static/socket-tcp.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/transition/socket-tcp.desc b/CRIU_code/test/zdtm/transition/socket-tcp.desc new file mode 100644 index 0000000..ca3268b --- /dev/null +++ b/CRIU_code/test/zdtm/transition/socket-tcp.desc @@ -0,0 +1 @@ +{'flavor': 'h', 'opts': '--tcp-established', 'flags': 'nouser samens'} diff --git a/CRIU_code/test/zdtm/transition/socket-tcp6.c b/CRIU_code/test/zdtm/transition/socket-tcp6.c new file mode 100644 index 0000000..772a5dd --- /dev/null +++ b/CRIU_code/test/zdtm/transition/socket-tcp6.c @@ -0,0 +1 @@ +../static/socket-tcp.c \ No newline at end of file diff --git a/CRIU_code/test/zdtm/transition/socket-tcp6.desc b/CRIU_code/test/zdtm/transition/socket-tcp6.desc new file mode 100644 index 0000000..ca3268b --- /dev/null +++ 
b/CRIU_code/test/zdtm/transition/socket-tcp6.desc @@ -0,0 +1 @@ +{'flavor': 'h', 'opts': '--tcp-established', 'flags': 'nouser samens'} diff --git a/CRIU_code/test/zdtm/transition/socket_loop00.c b/CRIU_code/test/zdtm/transition/socket_loop00.c new file mode 100644 index 0000000..1d7097b --- /dev/null +++ b/CRIU_code/test/zdtm/transition/socket_loop00.c @@ -0,0 +1,187 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Multi-process socket loop"; +const char *test_author = "Pavel Emelianov "; + +#define PROCS_DEF 4 +#define PROCS_MAX 64 +unsigned int num_procs = PROCS_DEF; +TEST_OPTION(num_procs, uint, "# processes to create " + "(default " __stringify(PROCS_DEF) + ", max " __stringify(PROCS_MAX) ")", 0); + +volatile sig_atomic_t num_exited = 0; +void inc_num_exited(int signo) +{ + num_exited++; +} + +int main(int argc, char **argv) +{ + int ret = 0; + pid_t pid; + int i; + uint8_t buf[0x100000]; + int socks[PROCS_MAX * 2]; + int in, out; + + test_init(argc, argv); + + if (num_procs > PROCS_MAX) { + pr_err("%d processes is too many: max = %d\n", num_procs, PROCS_MAX); + exit(1); + } + + for (i = 0; i < num_procs; i++) + if (socketpair(AF_LOCAL, SOCK_STREAM, 0, socks + i * 2)) { + pr_perror("can't create sockets"); + exit(1); + } + + if (signal(SIGCHLD, inc_num_exited) == SIG_ERR) { + pr_perror("can't set SIGCHLD handler"); + exit(1); + } + + for (i = 1; i < num_procs; i++) { /* i = 0 - parent */ + pid = test_fork(); + if (pid < 0) { + pr_perror("Can't fork"); + kill(0, SIGKILL); + exit(1); + } + + if (pid == 0) { + int j; + in = i * 2; + out = in - 1; + for (j = 0; j < num_procs * 2; j++) + if (j != in && j != out) + close(socks[j]); + + signal(SIGPIPE, SIG_IGN); + if (pipe_in2out(socks[in], socks[out], buf, sizeof(buf)) < 0) + /* pass errno as exit code to the parent */ + if (test_go() /* signal NOT delivered */ || + (errno != EINTR && errno != EPIPE + && errno != 
ECONNRESET)) + ret = errno; + + test_waitsig(); /* even if failed, wait for migration to complete */ + + close(socks[in]); + close(socks[out]); + exit(ret); + } + } + + for (i = 1; i < num_procs * 2 - 1; i++) + close(socks[i]); + in = socks[0]; + out = socks[num_procs * 2 - 1]; + + /* don't block on writing, _do_ block on reading */ + if (set_nonblock(out,1) < 0) { + pr_perror("setting O_NONBLOCK failed"); + exit(1); + } + + if (num_exited) { + pr_err("Some children died unexpectedly\n"); + kill(0, SIGKILL); + exit(1); + } + + test_daemon(); + + while (test_go()) { + int len, rlen = 0, wlen; + uint8_t rbuf[sizeof(buf)], *p; + + datagen(buf, sizeof(buf), NULL); + wlen = write(out, buf, sizeof(buf)); + if (wlen < 0) { + if (errno == EINTR) + continue; + else { + fail("write failed\n"); + ret = 1; + break; + } + } + + for (p = rbuf, len = wlen; len > 0; p += rlen, len -= rlen) { + rlen = read(in, p, len); + if (rlen <= 0) + break; + } + + if (rlen < 0 && errno == EINTR) + continue; + + if (len > 0) { + fail("read failed: %m\n"); + ret = 1; + break; + } + + if (memcmp(buf, rbuf, wlen)) { + fail("data mismatch\n"); + ret = 1; + break; + } + } + + + test_waitsig(); /* even if failed, wait for migration to complete */ + + /* We expect that write(2) in child may return error only after signal + * has been received. Thus, send signal before closing parent fds. + */ + if (kill(0, SIGTERM)) { + fail("failed to send SIGTERM to my process group: %m\n"); + goto out; /* shouldn't wait() in this case */ + } + if (close(out)) + fail("Failed to close parent fd 'out': %m\n"); + /* If last child in the chain (from whom we read data) receives signal + * after parent has finished reading but before calling write(2), this + * child can block forever. To avoid this, close 'in' fd. 
+ */ + if (close(in)) + fail("failed to close parent fd 'in': %m\n"); + + for (i = 1; i < num_procs; i++) { /* i = 0 - parent */ + int chret; + if (wait(&chret) < 0) { + fail("can't wait for a child: %m\n"); + ret = 1; + continue; + } + + chret = WEXITSTATUS(chret); + if (chret) { + fail("child %d exited with non-zero code %d (%s)\n", + i, chret, strerror(chret)); + ret = 1; + continue; + } + } + + if (!ret) + pass(); + +out: + return 0; +} diff --git a/CRIU_code/test/zdtm/transition/thread-bomb.c b/CRIU_code/test/zdtm/transition/thread-bomb.c new file mode 100644 index 0000000..6621b18 --- /dev/null +++ b/CRIU_code/test/zdtm/transition/thread-bomb.c @@ -0,0 +1,76 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +#define exit_group(code) \ + syscall(__NR_exit_group, code) + +static pthread_attr_t attr; +/* Having in mind setup with 64 Kb large pages */ +static const size_t stack_size = 64 * 1024; + +static void *thread_fn(void *arg) +{ + pthread_t t, p, *self; + + if (arg) { + p = *(pthread_t *)arg; + pthread_join(p, NULL); + free(arg); + } + + self = malloc(sizeof(*self)); + *self = pthread_self(); + + pthread_create(&t, &attr, thread_fn, self); + return NULL; +} + +int main(int argc, char **argv) +{ + int max_nr = 1024, i; + char *val; + int err; + + err = pthread_attr_init(&attr); + if (err) { + pr_err("pthread_attr_init(): %d\n", err); + exit(1); + } + + err = pthread_attr_setstacksize(&attr, stack_size); + if (err) { + pr_err("pthread_attr_setstacksize(): %d\n", err); + exit(1); + } + + val = getenv("ZDTM_THREAD_BOMB"); + if (val) + max_nr = atoi(val); + + test_msg("%d\n", max_nr); + + test_init(argc, argv); + + for (i = 0; i < max_nr; i++) { + pthread_t p; + err = pthread_create(&p, &attr, thread_fn, NULL); + if (err) { + pr_err("pthread_create(): %d\n", err); + exit(1); + } + } + + test_daemon(); + test_waitsig(); + + pass(); + + return 0; +} diff --git a/CRIU_code/test/zdtm/transition/thread-bomb.desc 
b/CRIU_code/test/zdtm/transition/thread-bomb.desc new file mode 100644 index 0000000..95c58b4 --- /dev/null +++ b/CRIU_code/test/zdtm/transition/thread-bomb.desc @@ -0,0 +1 @@ +{'flags': 'noauto'} diff --git a/CRIU_code/test/zdtm/transition/unix_sock.c b/CRIU_code/test/zdtm/transition/unix_sock.c new file mode 100644 index 0000000..64a4919 --- /dev/null +++ b/CRIU_code/test/zdtm/transition/unix_sock.c @@ -0,0 +1,288 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Multi-client - server app"; +const char *test_author = "Roman Kagan "; + +#define PROCS_DEF 4 +#define PROCS_MAX 64 +unsigned int num_procs = PROCS_DEF; +TEST_OPTION(num_procs, uint, "# processes to create " + "(default " __stringify(PROCS_DEF) + ", max " __stringify(PROCS_MAX) ")", 0); + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +#define ACCEPT_TIMEOUT 100 /* max delay for the child to connect */ + +static int fill_sock_name(struct sockaddr_un *name, const char *filename) +{ + if (strlen(filename) >= sizeof(name->sun_path)) + return -1; + + name->sun_family = AF_LOCAL; + strcpy(name->sun_path, filename); + return 0; +} + +static int setup_srv_sock(void) +{ + struct sockaddr_un name; + int sock; + + if (fill_sock_name(&name, filename) < 0) { + pr_err("filename \"%s\" is too long\n", filename); + return -1; + } + + sock = socket(PF_LOCAL, SOCK_STREAM, 0); + if (sock < 0) { + pr_perror("can't create socket"); + return -1; + } + + if (bind(sock, (struct sockaddr *) &name, SUN_LEN(&name)) < 0) { + pr_perror("can't bind to socket \"%s\"", filename); + goto err; + } + + if (fcntl(sock, F_SETFL, O_NONBLOCK) < 0) { + pr_perror("can't make socket \"%s\" non-blocking", filename); + goto err; + } + + if (listen(sock, 1) < 0) { + pr_perror("can't listen on a socket \"%s\"", filename); + goto err; + } + + return sock; +err: + close(sock); + return -1; +} + +static int 
accept_one_conn(int sock) +{ + int acc_sock; + fd_set fds; + struct timeval timeout = { + .tv_sec = ACCEPT_TIMEOUT, + }; + + FD_ZERO(&fds); + FD_SET(sock, &fds); + + switch (select(FD_SETSIZE, &fds, NULL, NULL, &timeout)) { + case 1: + break; + case 0: + pr_err("timeout accepting a connection\n"); + return -1; + default: + pr_perror("error while waiting for a connection"); + return -1; + } + + acc_sock = accept(sock, NULL, NULL); + if (acc_sock < 0) + pr_perror("error accepting a connection"); + return acc_sock; +} + +static int setup_clnt_sock(void) +{ + struct sockaddr_un name; + int sock; + int ret = 0; + + if (fill_sock_name(&name, filename) < 0) { + pr_err("filename \"%s\" is too long\n", filename); + return -1; + } + + sock = socket(PF_LOCAL, SOCK_STREAM, 0); + if (sock < 0) { + ret = -errno; + pr_perror("can't create socket"); + return ret; + } + + if (connect(sock, (struct sockaddr *) &name, SUN_LEN(&name)) < 0) { + ret = -errno; + pr_perror("can't connect"); + goto err; + } + + return sock; +err: + close(sock); + return ret; +} + +#define BUFLEN 1000 + +static int child(void) +{ + int ret = 1; + uint8_t buf[BUFLEN]; + uint32_t crc = ~0; + int sock = setup_clnt_sock(); + + if (sock < 0) { + ret = -sock; + goto out; + } + + signal(SIGPIPE, SIG_IGN); + while (test_go()) { + datagen(buf, sizeof(buf), &crc); + if (write(sock, buf, sizeof(buf)) < 0 && + (test_go() /* signal NOT received */ || + (errno != EINTR && errno != EPIPE && \ + errno != ECONNRESET))) { + ret = errno; + fail("child write: %m\n"); + goto out; + } + } + + ret = 0; +out: + close(sock); + return ret; +} + +int main(int argc, char **argv) +{ + struct { + pid_t pid; + int sock; + uint32_t crc; + } child_desc[PROCS_MAX]; + int i, nproc; + int sock; + uint8_t buf[BUFLEN]; + fd_set active_fds, read_fds; + + test_init(argc, argv); + + if (num_procs > PROCS_MAX) { + pr_err("%d processes is too many: max = %d\n", num_procs, PROCS_MAX); + exit(1); + } + + sock = setup_srv_sock(); + if (sock < 0) + 
exit(1); + + FD_ZERO(&active_fds); + for (nproc = 0; nproc < num_procs; nproc++) { + child_desc[nproc].pid = test_fork(); + if (child_desc[nproc].pid < 0) { + pr_perror("can't fork"); + goto cleanup; + } + + if (child_desc[nproc].pid == 0) { + close(sock); + exit(child()); + } + + child_desc[nproc].sock = accept_one_conn(sock); + if (child_desc[nproc].sock < 0) { + kill(child_desc[nproc].pid, SIGKILL); + goto cleanup; + } + + child_desc[nproc].crc = ~0; + FD_SET(child_desc[nproc].sock, &active_fds); + } + + close(sock); /* no more connections */ + test_daemon(); + + while (test_go()) { + read_fds = active_fds; + if (select(FD_SETSIZE, &read_fds, NULL, NULL, NULL) < 0 && + errno != EINTR) { + fail("error waiting for data: %m"); + goto out; + } + + for (i = 0; i < num_procs; i++) + if (FD_ISSET(child_desc[i].sock, &read_fds)) { + if (read(child_desc[i].sock, buf, sizeof(buf)) < 0) { + if(errno == EINTR) /* we're asked to stop */ + break; + else { + fail("error reading data from socket: %m"); + goto out; + } + } + + if (datachk(buf, sizeof(buf), &child_desc[i].crc)) { + fail("CRC mismatch"); + goto out; + } + } + } + + +out: + test_waitsig(); + + if (kill(0, SIGTERM)) { + fail("failed to send SIGTERM to my process group: %m\n"); + goto cleanup; /* shouldn't wait() in this case */ + } + + while (nproc-- > 0) { + int chret; + /* + * Close socket to make sure that child's write() returns. + * This is to avoid race when server stopped reading & sent + * signal to child, child has checked for signal & found none + * (not yet delivered), then called write(), blocking forever. 
+ */ + if(close(child_desc[nproc].sock)) + fail("Can't close server socket: %m\n"); + + if (wait(&chret) < 0) { + fail("can't wait for a child: %m\n"); + goto cleanup; + } + + chret = WEXITSTATUS(chret); + if (chret) { + fail("child exited with non-zero code %d (%s)\n", + chret, strerror(chret)); + goto cleanup; + } + } + + pass(); + +cleanup: + while (nproc-- > 0) { + close(child_desc[nproc].sock); + if (child_desc[nproc].pid > 0) + kill(child_desc[nproc].pid, SIGKILL); + } + close(sock); + unlink(filename); + return 0; +} diff --git a/CRIU_code/test/zdtm_ct.c b/CRIU_code/test/zdtm_ct.c new file mode 100644 index 0000000..bc88dad --- /dev/null +++ b/CRIU_code/test/zdtm_ct.c @@ -0,0 +1,67 @@ +#include +#include +#include +#include +#include +#include +#include + +int main(int argc, char **argv) +{ + pid_t pid; + int status; + + /* + * pidns is used to avoid conflicts + * mntns is used to mount /proc + * net is used to avoid conflicts of parasite sockets + */ + if (unshare(CLONE_NEWNS | CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC)) + return 1; + pid = fork(); + if (pid == 0) { + if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL)) { + fprintf(stderr, "mount(/, S_REC | MS_SLAVE)): %m"); + return 1; + } + umount2("/proc", MNT_DETACH); + umount2("/dev/pts", MNT_DETACH); + if (mount("zdtm_proc", "/proc", "proc", 0, NULL)) { + fprintf(stderr, "mount(/proc): %m"); + return 1; + } + if (mount("zdtm_devpts", "/dev/pts", "devpts", 0, + "newinstance,ptmxmode=0666")) { + fprintf(stderr, "mount(pts): %m"); + return 1; + } + if (mount("zdtm_binfmt", "/proc/sys/fs/binfmt_misc", "binfmt_misc", 0, + NULL)) { + fprintf(stderr, "mount(binfmt_misc): %m"); + return 1; + } + if (mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL)) { + fprintf(stderr, "mount(ptmx): %m"); + return 1; + } + if (system("ip link set up dev lo")) + return 1; + execv(argv[1], argv + 1); + fprintf(stderr, "execve: %m"); + return 1; + } + + if (waitpid(pid, &status, 0) != pid) { + fprintf(stderr, "waitpid: 
%m"); + return 1; + } + + if (WIFEXITED(status)) + return WEXITSTATUS(status); + else if (WIFSIGNALED(status)) + kill(getpid(), WTERMSIG(status)); + else + fprintf(stderr, "Unexpected exit status: %x\n", status); + + return 1; +} diff --git a/CRIU_code/test/zdtm_mount_cgroups b/CRIU_code/test/zdtm_mount_cgroups new file mode 100644 index 0000000..34e0e85 --- /dev/null +++ b/CRIU_code/test/zdtm_mount_cgroups @@ -0,0 +1,16 @@ +#!/bin/sh + +# If a controller is created during dumping processes, criu may fail with error: +# Error (cgroup.c:768): cg: Set 3 is not subset of 2 +# so lets create all test controllers before executing tests. + +cat /proc/self/cgroup | grep -q zdtmtst.defaultroot && exit + +tdir=`mktemp -d zdtm.XXXXXX` +for i in "zdtmtst" "zdtmtst.defaultroot"; do + mount -t cgroup -o none,name=$i zdtm $tdir && + # a fake group prevents destroying of a controller + mkdir -p $tdir/holder && + umount -l $tdir || exit 1 +done +rmdir $tdir diff --git a/CRIU_code/test/zdtm_umount_cgroups b/CRIU_code/test/zdtm_umount_cgroups new file mode 100644 index 0000000..75a8ea2 --- /dev/null +++ b/CRIU_code/test/zdtm_umount_cgroups @@ -0,0 +1,16 @@ +#!/bin/sh + +# Lets delete all test controllers after executing tests. + +cat /proc/self/cgroup | grep -q zdtmtst.defaultroot || exit 0 + +tdir=`mktemp -d zdtm.XXXXXX` +for i in "zdtmtst" "zdtmtst.defaultroot"; do + mount -t cgroup -o none,name=$i zdtm $tdir || { rmdir $tdir; exit 1; } + # remove a fake group if exists + if [ -d "$tdir/holder" ]; then + rmdir $tdir/holder || { umount -l $tdir && rmdir $tdir; exit 1; } + fi + umount -l $tdir || exit 1; +done +rmdir $tdir diff --git a/README.md b/README.md index 5b1ea97..0dfbb4b 100644 --- a/README.md +++ b/README.md @@ -1,37 +1,30 @@ -# DyscheOS-utils +# 项目功能说明书 +@(team-1243952768)[openEuler全架构集成CRIU工具] -#### 介绍 -It provides utility tools for DyscheOS, including management tools, scripts, user-guide and kernel modules. 
+### 一、开发思路 -#### 软件架构 -软件架构说明 +本赛题的目标是添加CRIU对riscv架构的支持以及补全对aarch64架构的支持,我们通过阅读较为熟悉的x86架构相关部分,提炼出CRIU在冻结进程时保存的内容,即通用寄存器组、控制寄存器、浮点寄存器组、线程的TLS以及CPU信息,CPU信息包括开发商、产品系列编号等内容。之后根据aarch64架构与riscv架构的寄存器和CPU编写相关的代码,包括寄存器的结构体、CPU信息结构体、线程信息结构体以及实现系统调用、设置进程断点、读取寄存器组信息、读取CPU信息和读取TLS部分的汇编代码。通用寄存器组结构体以及浮点寄存器组结构体采用Linux内核中的对应源码,方便调用内核接口获取。 + +------------------ -#### 安装教程 +### 二、代码架构 +aarch64相关: +/compel/arch/aarch64/src/lib/include/uapi/asm/cpu.h +/compel/arch/aarch64/src/lib/cpu.c +/criu/include/cpu.h +/criu/arch/aarch64/cpu.c +/images/cpuinfo.proto中添加cpuinfo_aarch64_entry -1. xxxx -2. xxxx -3. xxxx +riscv相关: +/compel/arch/riscv/ +/criu/arch/riscv/ +/include/common/arch/riscv/ +/images/core-riscv.proto +/images/cpuinfo.proto中添加cpuinfo_riscv_entry +/images/core.proto中的core_entry中添加thread_info_riscv,以及core_entry中的march中添加RISCV +根目录的Makefile -#### 使用说明 +### 三、测试方法 +在riscv虚拟机中编译criu,启动simple-loop进程,并成功执行criu checkpoint/restore;restore后进程可延续之前状态继续执行。 -1. xxxx -2. xxxx -3. xxxx - -#### 参与贡献 - -1. Fork 本仓库 -2. 新建 Feat_xxx 分支 -3. 提交代码 -4. 新建 Pull Request - - -#### 特技 - -1. 使用 Readme\_XXX.md 来支持不同的语言,例如 Readme\_en.md, Readme\_zh.md -2. Gitee 官方博客 [blog.gitee.com](https://blog.gitee.com) -3. 你可以 [https://gitee.com/explore](https://gitee.com/explore) 这个地址来了解 Gitee 上的优秀开源项目 -4. [GVP](https://gitee.com/gvp) 全称是 Gitee 最有价值开源项目,是综合评定出的优秀开源项目 -5. Gitee 官方提供的使用手册 [https://gitee.com/help](https://gitee.com/help) -6. Gitee 封面人物是一档用来展示 Gitee 会员风采的栏目 [https://gitee.com/gitee-stars/](https://gitee.com/gitee-stars/) -- Gitee