From adb4b4a2d44f0067898144b1504317cba74a6fb7 Mon Sep 17 00:00:00 2001 From: fangzhiyi18 Date: Mon, 14 Oct 2024 03:44:41 +0000 Subject: [PATCH] =?UTF-8?q?=E5=8D=87=E7=BA=A72.7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: fangzhiyi18 --- .github/workflows/build.yml | 40 +- .github/workflows/shellcheck.yml | 2 +- .gitignore | 9 + CHANGELOG | 65 + Makefile | 26 +- Makefile.common | 1 + README | 48 + README.OpenSource | 4 +- bundle.json | 2 +- configure | 153 +- debian/changelog | 11 + debian/compat | 1 - debian/control | 23 +- debian/liburing-dev.manpages | 11 +- debian/liburing1-udeb.install | 1 - debian/liburing1.install | 1 - debian/liburing1.symbols | 32 - debian/liburing2.install | 1 + debian/liburing2.symbols | 56 + debian/rules | 80 +- examples/Makefile | 20 +- examples/helpers.c | 62 + examples/helpers.h | 7 + examples/io_uring-close-test.c | 123 ++ examples/io_uring-test.c | 2 +- examples/io_uring-udp.c | 14 +- examples/napi-busy-poll-client.c | 509 ++++++ examples/napi-busy-poll-server.c | 450 ++++++ examples/proxy.c | 2461 ++++++++++++++++++++++++++++++ examples/proxy.h | 102 ++ examples/rsrc-update-bench.c | 100 ++ examples/send-zerocopy.c | 441 +++++- examples/ucontext-cp.c | 19 +- liburing-ffi.pc.in | 12 + liburing.pc.in | 4 +- liburing.spec | 4 +- make-debs.sh | 12 +- src/Makefile | 57 +- src/arch/aarch64/lib.h | 3 +- src/arch/riscv64/lib.h | 48 + src/arch/riscv64/syscall.h | 100 ++ src/ffi.c | 15 + src/include/liburing.h | 587 ++++--- src/include/liburing/io_uring.h | 172 ++- src/int_flags.h | 2 + src/lib.h | 23 +- src/liburing-ffi.map | 206 +++ src/liburing.map | 28 + src/nolibc.c | 11 +- src/queue.c | 24 +- src/register.c | 189 ++- src/setup.c | 500 ++++-- src/setup.h | 9 + src/syscall.h | 2 + src/version.c | 21 + test/232c93d07b74.c | 12 +- test/35fa71a030ca.c | 6 +- test/500f9fbadef8.c | 2 + test/917257daa0fe.c | 2 +- test/Makefile | 76 +- test/a0908ae19763.c | 4 +- test/a4c0b3decb33.c | 
6 +- test/accept-link.c | 8 +- test/accept-non-empty.c | 256 ++++ test/accept-reuse.c | 14 +- test/accept.c | 92 +- test/b19062a56726.c | 2 +- test/bind-listen.c | 408 +++++ test/buf-ring-nommap.c | 123 ++ test/buf-ring-put.c | 83 + test/buf-ring.c | 223 +-- test/ce593a6c480a.c | 4 +- test/close-opath.c | 3 +- test/config | 4 + test/connect-rep.c | 204 +++ test/connect.c | 65 +- test/coredump.c | 60 + test/cq-overflow.c | 22 +- test/d4ae271dfaae.c | 2 +- test/defer-taskrun.c | 73 +- test/defer-tw-timeout.c | 173 +++ test/defer.c | 8 +- test/double-poll-crash.c | 6 +- test/eeed8b54e0df.c | 11 +- test/eploop.c | 74 + test/eventfd-reg.c | 2 +- test/eventfd-ring.c | 2 +- test/eventfd.c | 2 +- test/evloop.c | 73 + test/exit-no-cleanup.c | 2 +- test/fadvise.c | 2 +- test/fallocate.c | 9 + test/fc2a85cb02ef.c | 6 +- test/fd-install.c | 500 ++++++ test/fd-pass.c | 84 +- test/file-register.c | 71 +- test/file-verify.c | 39 +- test/files-exit-hang-timeout.c | 4 +- test/fixed-buf-merge.c | 101 ++ test/fixed-hugepage.c | 411 +++++ test/fixed-link.c | 2 +- test/fpos.c | 2 +- test/fsnotify.c | 118 ++ test/futex.c | 571 +++++++ test/hardlink.c | 110 +- test/helpers.c | 56 +- test/helpers.h | 14 + test/ignore-single-mmap.c | 48 + test/init-mem.c | 164 ++ test/io-cancel.c | 24 +- test/io_uring_enter.c | 11 +- test/io_uring_passthrough.c | 47 +- test/io_uring_register.c | 53 +- test/io_uring_setup.c | 85 +- test/iopoll-overflow.c | 118 ++ test/iopoll.c | 94 +- test/lfs-openat-write.c | 16 +- test/lfs-openat.c | 14 +- test/link_drain.c | 36 +- test/madvise.c | 2 +- test/msg-ring-fd.c | 331 ++++ test/msg-ring-flags.c | 212 +++ test/msg-ring-overflow.c | 159 ++ test/msg-ring.c | 257 +++- test/multicqes_drain.c | 53 +- test/no-mmap-inval.c | 42 + test/nolibc.c | 8 +- test/nvme.h | 7 +- test/ooo-file-unreg.c | 82 + test/openat2.c | 6 +- test/pipe-bug.c | 95 ++ test/poll-cancel-all.c | 34 +- test/poll-cancel.c | 4 +- test/poll-link.c | 6 +- test/poll-many.c | 60 +- 
test/poll-mshot-overflow.c | 107 +- test/poll-race-mshot.c | 276 ++++ test/poll-race.c | 105 ++ test/poll.c | 270 +++- test/pollfree.c | 426 ------ test/read-before-exit.c | 23 +- test/read-mshot-empty.c | 153 ++ test/read-mshot.c | 404 +++++ test/read-write.c | 55 + test/recv-msgall.c | 44 +- test/recv-multishot.c | 107 +- test/recvsend_bundle.c | 691 +++++++++ test/reg-fd-only.c | 131 ++ test/reg-hint.c | 56 + test/reg-reg-ring.c | 90 ++ test/regbuf-merge.c | 91 ++ test/rename.c | 2 +- test/ring-leak.c | 25 +- test/ring-leak2.c | 2 +- test/ringbuf-read.c | 24 +- test/ringbuf-status.c | 242 +++ test/runtests-loop.sh | 0 test/runtests-quiet.sh | 0 test/runtests.sh | 0 test/send-zerocopy.c | 648 +++++--- test/send_recv.c | 117 +- test/send_recvmsg.c | 21 +- test/sendmsg_fs_cve.c | 200 --- test/shutdown.c | 3 +- test/single-issuer.c | 16 +- test/skip-cqe.c | 7 +- test/socket-getsetsock-cmd.c | 346 +++++ test/socket-io-cmd.c | 237 +++ test/socket-rw-eagain.c | 3 +- test/socket-rw-offset.c | 3 +- test/socket-rw.c | 3 +- test/socket.c | 1 - test/sq-poll-dup.c | 13 +- test/sq-poll-kthread.c | 2 +- test/sq-poll-share.c | 5 +- test/sqpoll-cancel-hang.c | 157 -- test/sqpoll-disable-exit.c | 8 +- test/sqpoll-exec.c | 132 ++ test/symlink.c | 3 +- test/test.h | 3 +- test/timeout-new.c | 18 +- test/timeout-overflow.c | 204 --- test/timeout.c | 281 +++- test/truncate.c | 186 +++ test/unlink.c | 2 +- test/version.c | 25 + test/waitid.c | 373 +++++ test/wakeup-hang.c | 2 +- test/wq-aff.c | 146 ++ test/xattr.c | 75 +- 190 files changed, 17175 insertions(+), 2570 deletions(-) mode change 100755 => 100644 configure delete mode 100644 debian/compat delete mode 100644 debian/liburing1-udeb.install delete mode 100644 debian/liburing1.install delete mode 100644 debian/liburing1.symbols create mode 100644 debian/liburing2.install create mode 100644 debian/liburing2.symbols mode change 100755 => 100644 debian/rules create mode 100644 examples/helpers.c create mode 100644 examples/helpers.h 
create mode 100644 examples/io_uring-close-test.c create mode 100644 examples/napi-busy-poll-client.c create mode 100644 examples/napi-busy-poll-server.c create mode 100644 examples/proxy.c create mode 100644 examples/proxy.h create mode 100644 examples/rsrc-update-bench.c create mode 100644 liburing-ffi.pc.in mode change 100755 => 100644 make-debs.sh create mode 100644 src/arch/riscv64/lib.h create mode 100644 src/arch/riscv64/syscall.h create mode 100644 src/ffi.c create mode 100644 src/liburing-ffi.map create mode 100644 src/setup.h create mode 100644 src/version.c create mode 100644 test/accept-non-empty.c create mode 100644 test/bind-listen.c create mode 100644 test/buf-ring-nommap.c create mode 100644 test/buf-ring-put.c create mode 100644 test/connect-rep.c create mode 100644 test/coredump.c create mode 100644 test/defer-tw-timeout.c create mode 100644 test/eploop.c create mode 100644 test/evloop.c create mode 100644 test/fd-install.c create mode 100644 test/fixed-buf-merge.c create mode 100644 test/fixed-hugepage.c create mode 100644 test/fsnotify.c create mode 100644 test/futex.c create mode 100644 test/ignore-single-mmap.c create mode 100644 test/init-mem.c create mode 100644 test/iopoll-overflow.c create mode 100644 test/msg-ring-fd.c create mode 100644 test/msg-ring-flags.c create mode 100644 test/msg-ring-overflow.c create mode 100644 test/no-mmap-inval.c create mode 100644 test/ooo-file-unreg.c create mode 100644 test/pipe-bug.c create mode 100644 test/poll-race-mshot.c create mode 100644 test/poll-race.c delete mode 100644 test/pollfree.c create mode 100644 test/read-mshot-empty.c create mode 100644 test/read-mshot.c create mode 100644 test/recvsend_bundle.c create mode 100644 test/reg-fd-only.c create mode 100644 test/reg-hint.c create mode 100644 test/reg-reg-ring.c create mode 100644 test/regbuf-merge.c create mode 100644 test/ringbuf-status.c mode change 100755 => 100644 test/runtests-loop.sh mode change 100755 => 100644 test/runtests-quiet.sh 
mode change 100755 => 100644 test/runtests.sh delete mode 100644 test/sendmsg_fs_cve.c create mode 100644 test/socket-getsetsock-cmd.c create mode 100644 test/socket-io-cmd.c delete mode 100644 test/sqpoll-cancel-hang.c create mode 100644 test/sqpoll-exec.c delete mode 100644 test/timeout-overflow.c create mode 100644 test/truncate.c create mode 100644 test/version.c create mode 100644 test/waitid.c create mode 100644 test/wq-aff.c diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b0e669d..8298608 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -26,7 +26,8 @@ jobs: cxx_pkg: clang cc: clang cxx: clang++ - extra_flags: -Wshorten-64-to-32 + liburing_extra_flags: -Wshorten-64-to-32 + extra_flags: -Wmissing-prototypes -Wstrict-prototypes -Wunreachable-code-loop-increment -Wunreachable-code -Wmissing-variable-declarations -Wextra-semi-stmt # x86 (32-bit) gcc - arch: i686 @@ -49,6 +50,13 @@ jobs: cc: arm-linux-gnueabi-gcc cxx: arm-linux-gnueabi-g++ + # riscv64 + - arch: riscv64 + cc_pkg: gcc-riscv64-linux-gnu + cxx_pkg: g++-riscv64-linux-gnu + cc: riscv64-linux-gnu-gcc + cxx: riscv64-linux-gnu-g++ + # powerpc64 - arch: powerpc64 cc_pkg: gcc-powerpc64-linux-gnu @@ -84,23 +92,31 @@ jobs: cc: mips-linux-gnu-gcc cxx: mips-linux-gnu-g++ + # hppa + - arch: hppa + cc_pkg: gcc-hppa-linux-gnu + cxx_pkg: g++-hppa-linux-gnu + cc: hppa-linux-gnu-gcc + cxx: hppa-linux-gnu-g++ + env: - FLAGS: -g -O3 -Wall -Wextra -Werror + FLAGS: -g -O3 -Wall -Wextra -Werror -Wno-sign-compare ${{matrix.extra_flags}} # Flags for building sources in src/ dir only. 
- LIBURING_CFLAGS: ${{matrix.extra_flags}} + LIBURING_CFLAGS: ${{matrix.liburing_extra_flags}} steps: - name: Checkout source - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Install Compilers run: | if [[ "${{matrix.cc_pkg}}" == "clang" ]]; then \ wget https://apt.llvm.org/llvm.sh -O /tmp/llvm.sh; \ - sudo bash /tmp/llvm.sh 16; \ - sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-16 400; \ - sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-16 400; \ + sudo apt-get purge --auto-remove llvm python3-lldb-14 llvm-14 -y; \ + sudo bash /tmp/llvm.sh 17; \ + sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-17 400; \ + sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-17 400; \ else \ sudo apt-get update -y; \ sudo apt-get install -y ${{matrix.cc_pkg}} ${{matrix.cxx_pkg}}; \ @@ -116,16 +132,6 @@ jobs: ./configure --cc=${{matrix.cc}} --cxx=${{matrix.cxx}}; make -j$(nproc) V=1 CPPFLAGS="-Werror" CFLAGS="$FLAGS" CXXFLAGS="$FLAGS"; - - name: Build nolibc - run: | - if [[ "${{matrix.arch}}" == "x86_64" || "${{matrix.arch}}" == "i686" || "${{matrix.arch}}" == "aarch64" ]]; then \ - make clean; \ - ./configure --cc=${{matrix.cc}} --cxx=${{matrix.cxx}} --nolibc; \ - make -j$(nproc) V=1 CPPFLAGS="-Werror" CFLAGS="$FLAGS" CXXFLAGS="$FLAGS"; \ - else \ - echo "Skipping nolibc build, this arch doesn't support building liburing without libc"; \ - fi; - - name: Test install command run: | sudo make install; diff --git a/.github/workflows/shellcheck.yml b/.github/workflows/shellcheck.yml index 148a6b3..306caa2 100644 --- a/.github/workflows/shellcheck.yml +++ b/.github/workflows/shellcheck.yml @@ -11,7 +11,7 @@ jobs: steps: - name: Checkout source - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Display shellcheck version run: shellcheck --version diff --git a/.gitignore b/.gitignore index 6e8a2f7..94966e7 100644 --- a/.gitignore +++ b/.gitignore @@ -9,15 +9,23 
@@ /src/liburing.a /src/liburing.so* +/src/liburing-ffi.a +/src/liburing-ffi.so* /src/include/liburing/compat.h +/src/include/liburing/io_uring_version.h +/examples/io_uring-close-test /examples/io_uring-cp /examples/io_uring-test /examples/io_uring-udp /examples/link-cp +/examples/napi-busy-poll-client +/examples/napi-busy-poll-server /examples/ucontext-cp /examples/poll-bench +/examples/proxy /examples/send-zerocopy +/examples/rsrc-update-bench /test/*.t /test/*.dmesg @@ -28,5 +36,6 @@ config-host.mak config.log liburing.pc +liburing-ffi.pc cscope.out diff --git a/CHANGELOG b/CHANGELOG index 09511af..4eb15f3 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,68 @@ +liburing-2.7 release + +- Man page updates +- Sync with kernel 6.10 + - send/recv bundle support + - accept nowait and CQE_F_MORE +- Add and update test cases +- Fix io_uring_queue_init_mem() returning a value that was too small, + potentially causing memory corruption in userspace by overwriting + 64 bytes beyond the returned value. Also add test case for that. 
+- Add 64-bit length variants of io_uring_prep_{m,f}advise() +- Add BIND/LISTEN support and helpers / man pages +- Add io_uring_enable_rings.3 man page +- Fix bug in io_uring_prep_read_multishot() +- Fixup bundle test cases +- Add fixed-hugepage test case +- Fix io_uring_prep_fixed_fd_install.3 man page +- Note 'len' == 0 requirement in io_uring_prep_send.3 man page +- Fix some test cases for skipping on older kernels + +liburing-2.6 release + +- Add getsockopt and setsockopt socket commands +- Add test cases to test/hardlink +- Man page fixes +- Add futex support, and test cases +- Add waitid support, and test cases +- Add read multishot, and test cases +- Add support for IORING_SETUP_NO_SQARRAY +- Use IORING_SETUP_NO_SQARRAY as the default +- Add support for IORING_OP_FIXED_FD_INSTALL +- Add io_uring_prep_fixed_fd_install() helper +- Support for napi busy polling +- Improve/add test cases +- Man page fixes +- Add sample 'proxy' example + +liburing-2.5 release + +- Add support for io_uring_prep_cmd_sock() +- Add support for application allocated ring memory, for placing rings + in huge mem. Available through io_uring_queue_init_mem(). +- Add support for registered ring fds +- Various documentation updates +- Various fixes + +liburing-2.4 release + +- Add io_uring_{major,minor,check}_version() functions. +- Add IO_URING_{MAJOR,MINOR,CHECK}_VERSION() macros. +- FFI support (for non-C/C++ languages integration). +- Add io_uring_prep_msg_ring_cqe_flags() function. +- Deprecate --nolibc configure option. +- CONFIG_NOLIBC is always enabled on x86-64, x86, and aarch64. +- Add support for IORING_REGISTER_USE_REGISTERED_RING and use if available. +- Add io_uring_close_ring_fd() function. +- Add io_uring_prep_msg_ring_fd_alloc function. +- Add io_uring_free_buf_ring() and io_uring_setup_buf_ring() functions. 
+- Ensure that io_uring_prep_accept_direct(), io_uring_prep_openat_direct(), + io_uring_prep_openat2_direct(), io_uring_prep_msg_ring_fd(), and + io_uring_prep_socket_direct() factor in being called with + IORING_FILE_INDEX_ALLOC for allocating a direct descriptor. +- Add io_uring_prep_sendto() function. +- Add io_uring_prep_cmd_sock() function. + liburing-2.3 release - Support non-libc build for aarch64. diff --git a/Makefile b/Makefile index 686be4f..6a2d63b 100644 --- a/Makefile +++ b/Makefile @@ -11,11 +11,11 @@ all: @$(MAKE) -C test @$(MAKE) -C examples -.PHONY: all install default clean test -.PHONY: FORCE cscope +library: + @$(MAKE) -C src -partcheck: all - @echo "make partcheck => TODO add tests with out kernel support" +.PHONY: all install default clean test library +.PHONY: FORCE cscope runtests: all @$(MAKE) -C test runtests @@ -25,7 +25,7 @@ runtests-parallel: all @$(MAKE) -C test runtests-parallel config-host.mak: configure - @if [ ! -e "$@" ]; then \ + +@if [ ! -e "$@" ]; then \ echo "Running configure ..."; \ ./configure; \ else \ @@ -45,13 +45,14 @@ endif -e "s%@VERSION@%$(VERSION)%g" \ $< >$@ -install: $(NAME).pc +install: $(NAME).pc $(NAME)-ffi.pc @$(MAKE) -C src install prefix=$(DESTDIR)$(prefix) \ includedir=$(DESTDIR)$(includedir) \ libdir=$(DESTDIR)$(libdir) \ libdevdir=$(DESTDIR)$(libdevdir) \ relativelibdir=$(relativelibdir) $(INSTALL) -D -m 644 $(NAME).pc $(DESTDIR)$(libdevdir)/pkgconfig/$(NAME).pc + $(INSTALL) -D -m 644 $(NAME)-ffi.pc $(DESTDIR)$(libdevdir)/pkgconfig/$(NAME)-ffi.pc $(INSTALL) -m 755 -d $(DESTDIR)$(mandir)/man2 $(INSTALL) -m 644 man/*.2 $(DESTDIR)$(mandir)/man2 $(INSTALL) -m 755 -d $(DESTDIR)$(mandir)/man3 @@ -59,11 +60,22 @@ install: $(NAME).pc $(INSTALL) -m 755 -d $(DESTDIR)$(mandir)/man7 $(INSTALL) -m 644 man/*.7 $(DESTDIR)$(mandir)/man7 +uninstall: + @$(MAKE) -C src uninstall prefix=$(DESTDIR)$(prefix) datadir=$(DESTDIR)$(datadir) + @rm -f $(DESTDIR)$(libdevdir)/pkgconfig/$(NAME).pc + @rm -f 
$(DESTDIR)$(libdevdir)/pkgconfig/$(NAME)-ffi.pc + @rm -rf $(DESTDIR)$(mandir)/man2/io_uring*.2 + @rm -rf $(DESTDIR)$(mandir)/man3/io_uring*.3 + @rm -rf $(DESTDIR)$(mandir)/man7/io_uring*.7 + install-tests: @$(MAKE) -C test install prefix=$(DESTDIR)$(prefix) datadir=$(DESTDIR)$(datadir) +uninstall-tests: + @$(MAKE) -C test uninstall prefix=$(DESTDIR)$(prefix) datadir=$(DESTDIR)$(datadir) + clean: - @rm -f config-host.mak config-host.h cscope.out $(NAME).pc test/*.dmesg + @rm -f config-host.mak config-host.h cscope.out $(NAME).pc $(NAME)-ffi.pc test/*.dmesg @$(MAKE) -C src clean @$(MAKE) -C test clean @$(MAKE) -C examples clean diff --git a/Makefile.common b/Makefile.common index 27fc233..ea75c34 100644 --- a/Makefile.common +++ b/Makefile.common @@ -3,4 +3,5 @@ NAME=liburing SPECFILE=$(TOP)/$(NAME).spec VERSION=$(shell awk '/Version:/ { print $$2 }' $(SPECFILE)) VERSION_MAJOR=$(shell echo $(VERSION) | cut -d. -f1) +VERSION_MINOR=$(shell echo $(VERSION) | cut -d. -f2) TAG = $(NAME)-$(VERSION) diff --git a/README b/README index 80d2b3d..3e41a8f 100644 --- a/README +++ b/README @@ -47,6 +47,54 @@ the kernel io_uring support. Please note that this suite isn't expected to pass on older kernels, and may even crash or hang older kernels! +Building liburing +----------------- + + # + # Prepare build config (optional). + # + # --cc specifies the C compiler. + # --cxx specifies the C++ compiler. + # + ./configure --cc=gcc --cxx=g++; + + # + # Build liburing. + # + make -j$(nproc); + + # + # Install liburing (headers, shared/static libs, and manpage). + # + sudo make install; + +See './configure --help' for more information about build config options. + + +FFI support +----------- + +By default, the build results in 4 lib files: + + 2 shared libs: + + liburing.so + liburing-ffi.so + + 2 static libs: + + liburing.a + liburing-ffi.a + +Languages and applications that can't use 'static inline' functions in +liburing.h should use the FFI variants. 
+ +liburing's main public interface lives in liburing.h as 'static inline' +functions. Users wishing to consume liburing purely as a binary dependency +should link against liburing-ffi. It contains definitions for every 'static +inline' function. + + License ------- diff --git a/README.OpenSource b/README.OpenSource index 2fd6195..a122a0c 100644 --- a/README.OpenSource +++ b/README.OpenSource @@ -3,9 +3,9 @@ "Name": "liburing", "License": "MIT License", "License File": "LICENSE", - "Version Number": "2.3", + "Version Number": "2.7", "Owner": "mailto:maojingjing1@huawei.com", - "Upstream URL": "https://github.com/axboe/liburing.git", + "Upstream URL": "https://github.com/axboe/liburing/releases/tag/liburing-2.7", "Description": "liburing provides helpers to setup and reardown io_uring instances, and also a simplified interface for applications that don't need (or want) to deal with the full kernel side implementation." } ] \ No newline at end of file diff --git a/bundle.json b/bundle.json index c811f88..3b1668e 100644 --- a/bundle.json +++ b/bundle.json @@ -1,7 +1,7 @@ { "name": "@ohos/liburing", "description": "liburing provides helpers to setup and teardown io_uring instances, and also a simplified interface for applications that don't need (or want) to deal with the full kernel side implementation.", - "version": "2.3", + "version": "2.7", "license": "MIT License", "publishAs": "code-segment", "segment": { diff --git a/configure b/configure old mode 100755 new mode 100644 index 1b0cc50..21a9356 --- a/configure +++ b/configure @@ -10,7 +10,7 @@ for opt do case "$opt" in --help|-h) show_help=yes ;; - --prefix=*) prefix="$optarg" + --prefix=*) prefix="$(realpath -s $optarg)" ;; --includedir=*) includedir="$optarg" ;; @@ -26,7 +26,7 @@ for opt do ;; --cxx=*) cxx="$optarg" ;; - --nolibc) liburing_nolibc="yes" + --use-libc) use_libc=yes ;; *) echo "ERROR: unknown option $opt" @@ -75,7 +75,7 @@ Options: [defaults in brackets after descriptions] --datadir=PATH install 
shared data in PATH [$datadir] --cc=CMD use CMD as the C compiler --cxx=CMD use CMD as the C++ compiler - --nolibc build liburing without libc + --use-libc use libc for liburing (useful for hardening) EOF exit 0 fi @@ -115,7 +115,7 @@ print_config() { } # Default CFLAGS -CFLAGS="-D_GNU_SOURCE -include config-host.h" +CFLAGS="-D_GNU_SOURCE -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -include config-host.h" BUILD_CFLAGS="" # Print configure header at the top of $config_host_h @@ -202,6 +202,15 @@ print_and_output_mak "relativelibdir" "$relativelibdir" print_and_output_mak "mandir" "$mandir" print_and_output_mak "datadir" "$datadir" +#################################################### +# Check for correct compiler runtime library to link with +libgcc_link_flag="-lgcc" +if $cc -print-libgcc-file-name >/dev/null 2>&1; then + libgcc_link_flag="$($cc $CFLAGS $LDFLAGS -print-libgcc-file-name)" +fi +print_and_output_mak "libgcc_link_flag" "$libgcc_link_flag" +#################################################### + ########################################## # check for compiler -Wstringop-overflow stringop_overflow="no" @@ -384,14 +393,87 @@ if compile_prog "" "" "nvme uring cmd"; then fi print_config "NVMe uring command support" "$nvme_uring_cmd" +########################################## +# Check futexv support +futexv="no" +cat > $TMPC << EOF +#include +#include +#include +int main(void) +{ + struct futex_waitv fw; + + memset(&fw, FUTEX_32, sizeof(fw)); + + return sizeof(struct futex_waitv); +} +EOF +if compile_prog "" "" "futexv"; then + futexv="yes" +fi +print_config "futex waitv support" "$futexv" + +########################################## +# Check idtype_t support +has_idtype_t="no" +cat > $TMPC << EOF +#include +int main(void) +{ + idtype_t v; + return 0; +} +EOF +if compile_prog "" "" "idtype_t"; then + has_idtype_t="yes" +fi +print_config "has_idtype_t" "$has_idtype_t" + +############################################################################# 
+liburing_nolibc="no" +if test "$use_libc" != "yes"; then + + # + # Currently, CONFIG_NOLIBC only supports x86-64, x86 (32-bit), aarch64 and riscv64. + # + cat > $TMPC << EOF +int main(void){ +#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || (defined(__riscv) && __riscv_xlen == 64) + return 0; +#else +#error libc is needed +#endif +} +EOF + + if compile_prog "" "" "nolibc"; then + liburing_nolibc="yes" + fi +fi + +print_config "nolibc" "$liburing_nolibc"; ############################################################################# + +#################################################### +# Most Android devices don't have sys/fanotify.h +has_fanotify="no" +cat > $TMPC << EOF +#include +int main(void) +{ + return 0; +} +EOF +if compile_prog "" "" "fanotify"; then + has_fanotify="yes" +fi +print_config "has_fanotify" "$has_fanotify" +#################################################### + if test "$liburing_nolibc" = "yes"; then output_sym "CONFIG_NOLIBC" -else - liburing_nolibc="no" fi -print_config "liburing_nolibc" "$liburing_nolibc" - if test "$__kernel_rwf_t" = "yes"; then output_sym "CONFIG_HAVE_KERNEL_RWF_T" fi @@ -422,12 +504,36 @@ fi if test "$nvme_uring_cmd" = "yes"; then output_sym "CONFIG_HAVE_NVME_URING" fi +if test "$has_fanotify" = "yes"; then + output_sym "CONFIG_HAVE_FANOTIFY" +fi +if test "$futexv" = "yes"; then + output_sym "CONFIG_HAVE_FUTEXV" +fi echo "CC=$cc" >> $config_host_mak print_config "CC" "$cc" echo "CXX=$cxx" >> $config_host_mak print_config "CXX" "$cxx" +# generate io_uring_version.h +# Reset MAKEFLAGS +MAKEFLAGS= +MAKE_PRINT_VARS="include Makefile.common\nprint-%%: ; @echo \$(\$*)\n" +VERSION_MAJOR=$(printf "$MAKE_PRINT_VARS" | make -s --no-print-directory -f - print-VERSION_MAJOR) +VERSION_MINOR=$(printf "$MAKE_PRINT_VARS" | make -s --no-print-directory -f - print-VERSION_MINOR) +io_uring_version_h="src/include/liburing/io_uring_version.h" +cat > $io_uring_version_h << EOF +/* SPDX-License-Identifier: MIT */ 
+#ifndef LIBURING_VERSION_H +#define LIBURING_VERSION_H + +#define IO_URING_VERSION_MAJOR $VERSION_MAJOR +#define IO_URING_VERSION_MINOR $VERSION_MINOR + +#endif +EOF + # generate compat.h compat_h="src/include/liburing/compat.h" cat > $compat_h << EOF @@ -452,10 +558,15 @@ struct __kernel_timespec { long long tv_nsec; }; +/* is not available, so it can't be included */ +#define UAPI_LINUX_IO_URING_H_SKIP_LINUX_TIME_TYPES_H 1 + EOF else cat >> $compat_h << EOF #include +/* is included above and not needed again */ +#define UAPI_LINUX_IO_URING_H_SKIP_LINUX_TIME_TYPES_H 1 EOF fi @@ -481,7 +592,33 @@ cat >> $compat_h << EOF EOF fi +if test "$futexv" != "yes"; then +cat >> $compat_h << EOF +#include + +#define FUTEX_32 2 +#define FUTEX_WAITV_MAX 128 + +struct futex_waitv { + uint64_t val; + uint64_t uaddr; + uint32_t flags; + uint32_t __reserved; +}; +EOF +fi + +if test "$has_idtype_t" != "yes"; then +cat >> $compat_h << EOF +typedef enum +{ + P_ALL, /* Wait for any child. */ + P_PID, /* Wait for specified process. */ + P_PGID /* Wait for members of process group. 
*/ +} idtype_t; +EOF +fi cat >> $compat_h << EOF #endif EOF diff --git a/debian/changelog b/debian/changelog index f0032e3..4c06b4f 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,14 @@ +liburing (2.2-1) stable; urgency=low + + * Update to 2.2 + * Bump up so version to 2 + * Drop liburing1-udeb + * Package using dh instead of using dh_* helpers manually + * Add linux header dependency to liburing-dev + * Bump up debhelper-compact level to 13 + + -- Kefu Chai Sun, 16 Oct 2022 16:30:48 +0800 + liburing (0.7-1) stable; urgency=low * Update to 0.7 diff --git a/debian/compat b/debian/compat deleted file mode 100644 index ec63514..0000000 --- a/debian/compat +++ /dev/null @@ -1 +0,0 @@ -9 diff --git a/debian/control b/debian/control index 831a314..c4da982 100644 --- a/debian/control +++ b/debian/control @@ -2,13 +2,14 @@ Source: liburing Section: libs Priority: optional Maintainer: Liu Changcheng -Build-Depends: debhelper (>=9) +Build-Depends: + debhelper-compat (= 13) Standards-Version: 4.1.4 Homepage: https://git.kernel.dk/cgit/liburing/tree/README Vcs-Git: https://git.kernel.dk/liburing Vcs-Browser: https://git.kernel.dk/cgit/liburing/ -Package: liburing1 +Package: liburing2 Architecture: linux-any Multi-Arch: same Pre-Depends: ${misc:Pre-Depends} @@ -21,24 +22,14 @@ Description: userspace library for using io_uring . This package contains the shared library. -Package: liburing1-udeb -Package-Type: udeb -Section: debian-installer -Architecture: linux-any -Depends: ${misc:Depends}, ${shlibs:Depends}, -Description: userspace library for using io_uring - io_uring is kernel feature to improve development - The newese Linux IO interface, io_uring could improve - system performance a lot. liburing is the userpace - library to use io_uring feature. - . - This package contains the udeb shared library. 
- Package: liburing-dev Section: libdevel Architecture: linux-any Multi-Arch: same -Depends: ${misc:Depends}, liburing1 (= ${binary:Version}), +Depends: + ${misc:Depends}, + liburing2 (= ${binary:Version}), + linux-libc-dev (>= 5.1) Description: userspace library for using io_uring io_uring is kernel feature to improve development The newese Linux IO interface, io_uring could improve diff --git a/debian/liburing-dev.manpages b/debian/liburing-dev.manpages index fbbee23..ff885fd 100644 --- a/debian/liburing-dev.manpages +++ b/debian/liburing-dev.manpages @@ -1,6 +1,5 @@ -man/io_uring_setup.2 -man/io_uring_enter.2 -man/io_uring_register.2 -man/io_uring_queue_exit.3 -man/io_uring_queue_init.3 -man/io_uring_get_sqe.3 +usr/share/man/man2/io_uring_*.2 +usr/share/man/man3/io_uring_*.3 +usr/share/man/man7/io_uring.7 +usr/share/man/man3/IO_URING_*.3 +usr/share/man/man3/__io_uring_*.3 diff --git a/debian/liburing1-udeb.install b/debian/liburing1-udeb.install deleted file mode 100644 index 622f9ef..0000000 --- a/debian/liburing1-udeb.install +++ /dev/null @@ -1 +0,0 @@ -lib/*/lib*.so.* diff --git a/debian/liburing1.install b/debian/liburing1.install deleted file mode 100644 index 622f9ef..0000000 --- a/debian/liburing1.install +++ /dev/null @@ -1 +0,0 @@ -lib/*/lib*.so.* diff --git a/debian/liburing1.symbols b/debian/liburing1.symbols deleted file mode 100644 index 29109f2..0000000 --- a/debian/liburing1.symbols +++ /dev/null @@ -1,32 +0,0 @@ -liburing.so.1 liburing1 #MINVER# - (symver)LIBURING_0.1 0.1-1 - io_uring_get_sqe@LIBURING_0.1 0.1-1 - io_uring_queue_exit@LIBURING_0.1 0.1-1 - io_uring_queue_init@LIBURING_0.1 0.1-1 - io_uring_queue_mmap@LIBURING_0.1 0.1-1 - io_uring_register_buffers@LIBURING_0.1 0.1-1 - io_uring_register_eventfd@LIBURING_0.1 0.1-1 - io_uring_register_eventfd_async@LIBURING_0.6 0.6-1 - io_uring_register_files@LIBURING_0.1 0.1-1 - io_uring_submit@LIBURING_0.1 0.1-1 - io_uring_submit_and_wait@LIBURING_0.1 0.1-1 - io_uring_unregister_buffers@LIBURING_0.1 
0.1-1 - io_uring_unregister_files@LIBURING_0.1 0.1-1 - (symver)LIBURING_0.2 0.2-1 - __io_uring_get_cqe@LIBURING_0.2 0.2-1 - io_uring_queue_init_params@LIBURING_0.2 0.2-1 - io_uring_register_files_update@LIBURING_0.2 0.2-1 - io_uring_peek_batch_cqe@LIBURING_0.2 0.2-1 - io_uring_wait_cqe_timeout@LIBURING_0.2 0.2-1 - io_uring_wait_cqes@LIBURING_0.2 0.2-1 - (symver)LIBURING_0.3 0.3-1 - (symver)LIBURING_0.4 0.4-1 - (symver)LIBURING_0.5 0.5-1 - (symver)LIBURING_0.6 0.6-1 - (symver)LIBURING_0.7 0.7-1 - io_uring_get_probe@LIBURING_0.4 0.4-1 - io_uring_get_probe_ring@LIBURING_0.4 0.4-1 - io_uring_register_personality@LIBURING_0.4 0.4-1 - io_uring_register_probe@LIBURING_0.4 0.4-1 - io_uring_ring_dontfork@LIBURING_0.4 0.4-1 - io_uring_unregister_personality@LIBURING_0.4 0.4-1 diff --git a/debian/liburing2.install b/debian/liburing2.install new file mode 100644 index 0000000..3ddde58 --- /dev/null +++ b/debian/liburing2.install @@ -0,0 +1 @@ +usr/lib/*/lib*.so.* diff --git a/debian/liburing2.symbols b/debian/liburing2.symbols new file mode 100644 index 0000000..725a039 --- /dev/null +++ b/debian/liburing2.symbols @@ -0,0 +1,56 @@ +liburing.so.2 liburing2 #MINVER# [47/1887] + LIBURING_2.0@LIBURING_2.0 0.7-1 + LIBURING_2.1@LIBURING_2.1 0.7-1 + LIBURING_2.2@LIBURING_2.2 0.7-1 + LIBURING_2.3@LIBURING_2.3 0.7-1 + __io_uring_get_cqe@LIBURING_2.0 0.7-1 + __io_uring_sqring_wait@LIBURING_2.0 0.7-1 + io_uring_enter2@LIBURING_2.3 0.7-1 + io_uring_enter@LIBURING_2.3 0.7-1 + io_uring_free_probe@LIBURING_2.0 0.7-1 + io_uring_get_events@LIBURING_2.3 0.7-1 + io_uring_get_probe@LIBURING_2.0 0.7-1 + io_uring_get_probe_ring@LIBURING_2.0 0.7-1 + io_uring_get_sqe@LIBURING_2.0 0.7-1 + io_uring_mlock_size@LIBURING_2.1 0.7-1 + io_uring_mlock_size_params@LIBURING_2.1 0.7-1 + io_uring_peek_batch_cqe@LIBURING_2.0 0.7-1 + io_uring_queue_exit@LIBURING_2.0 0.7-1 + io_uring_queue_init@LIBURING_2.0 0.7-1 + io_uring_queue_init_params@LIBURING_2.0 0.7-1 + io_uring_queue_mmap@LIBURING_2.0 0.7-1 + 
io_uring_register@LIBURING_2.3 0.7-1 + io_uring_register_buf_ring@LIBURING_2.2 0.7-1 + io_uring_register_buffers@LIBURING_2.0 0.7-1 + io_uring_register_buffers_sparse@LIBURING_2.2 0.7-1 + io_uring_register_buffers_tags@LIBURING_2.1 0.7-1 + io_uring_register_buffers_update_tag@LIBURING_2.1 0.7-1 + io_uring_register_eventfd@LIBURING_2.0 0.7-1 + io_uring_register_eventfd_async@LIBURING_2.0 0.7-1 + io_uring_register_file_alloc_range@LIBURING_2.3 0.7-1 + io_uring_register_files@LIBURING_2.0 0.7-1 + io_uring_register_files_sparse@LIBURING_2.2 0.7-1 + io_uring_register_files_tags@LIBURING_2.1 0.7-1 + io_uring_register_files_update@LIBURING_2.0 0.7-1 + io_uring_register_files_update_tag@LIBURING_2.1 0.7-1 + io_uring_register_iowq_aff@LIBURING_2.1 0.7-1 + io_uring_register_iowq_max_workers@LIBURING_2.1 0.7-1 + io_uring_register_personality@LIBURING_2.0 0.7-1 + io_uring_register_probe@LIBURING_2.0 0.7-1 + io_uring_register_ring_fd@LIBURING_2.2 0.7-1 + io_uring_register_sync_cancel@LIBURING_2.3 0.7-1 + io_uring_ring_dontfork@LIBURING_2.0 0.7-1 + io_uring_setup@LIBURING_2.3 0.7-1 + io_uring_submit@LIBURING_2.0 0.7-1 + io_uring_submit_and_get_events@LIBURING_2.3 0.7-1 + io_uring_submit_and_wait@LIBURING_2.0 0.7-1 + io_uring_submit_and_wait_timeout@LIBURING_2.2 0.7-1 + io_uring_unregister_buf_ring@LIBURING_2.2 0.7-1 + io_uring_unregister_buffers@LIBURING_2.0 0.7-1 + io_uring_unregister_eventfd@LIBURING_2.0 0.7-1 + io_uring_unregister_files@LIBURING_2.0 0.7-1 + io_uring_unregister_iowq_aff@LIBURING_2.1 0.7-1 + io_uring_unregister_personality@LIBURING_2.0 0.7-1 + io_uring_unregister_ring_fd@LIBURING_2.2 0.7-1 + io_uring_wait_cqe_timeout@LIBURING_2.0 0.7-1 + io_uring_wait_cqes@LIBURING_2.0 0.7-1 diff --git a/debian/rules b/debian/rules old mode 100755 new mode 100644 index 1a334b3..cdc0a60 --- a/debian/rules +++ b/debian/rules @@ -5,77 +5,25 @@ DEB_BUILD_MAINT_OPTIONS = hardening=+bindnow DEB_CFLAGS_MAINT_PREPEND = -Wall +DEB_BUILD_OPTIONS += nocheck include 
/usr/share/dpkg/default.mk include /usr/share/dpkg/buildtools.mk -export CC +%: + dh $@ --parallel -lib := liburing1 -libdbg := $(lib)-dbg -libudeb := $(lib)-udeb -libdev := liburing-dev - -build-indep: - -build-arch: - dh_testdir - - $(MAKE) CPPFLAGS="$(CPPFLAGS)" CFLAGS="$(CFLAGS)" LDFLAGS="$(LDFLAGS)" - -build: build-indep build-arch - -clean: - dh_testdir - dh_testroot - - $(MAKE) clean - - dh_clean - -check-arch: build-arch - dh_testdir +override_dh_auto_configure: + ./configure \ + --prefix=/usr \ + --includedir=/usr/include \ + --datadir=/usr/share \ + --mandir=/usr/share/man \ + --libdir=/usr/lib/$(DEB_HOST_MULTIARCH) \ + --libdevdir=/usr/lib/$(DEB_HOST_MULTIARCH) \ + --cc=$(CC) +override_dh_auto_test: ifeq (,$(filter nocheck,$(DEB_BUILD_OPTIONS))) - $(MAKE) CPPFLAGS="$(CPPFLAGS)" CFLAGS="$(CFLAGS)" LDFLAGS="$(LDFLAGS)" \ - partcheck + $(MAKE) runtests endif - -install-arch: check-arch - dh_testdir - dh_testroot - dh_clean - dh_installdirs - - $(MAKE) install \ - DESTDIR=$(CURDIR)/debian/tmp \ - libdir=/lib/$(DEB_HOST_MULTIARCH) \ - libdevdir=/usr/lib/$(DEB_HOST_MULTIARCH) \ - relativelibdir=/lib/$(DEB_HOST_MULTIARCH)/ - -binary: binary-indep binary-arch - -binary-indep: - # Nothing to do. 
- -binary-arch: install-arch - dh_testdir - dh_testroot - dh_install -a - dh_installdocs -a - dh_installexamples -a - dh_installman -a - dh_lintian -a - dh_link -a - dh_strip -a --ddeb-migration='$(libdbg) (<< 0.3)' - dh_compress -a - dh_fixperms -a - dh_makeshlibs -a --add-udeb '$(libudeb)' - dh_shlibdeps -a - dh_installdeb -a - dh_gencontrol -a - dh_md5sums -a - dh_builddeb -a - -.PHONY: clean build-indep build-arch build -.PHONY: install-arch binary-indep binary-arch binary diff --git a/examples/Makefile b/examples/Makefile index e561e05..7c27d8f 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -10,13 +10,21 @@ ifneq ($(MAKECMDGOALS),clean) include ../config-host.mak endif +LDFLAGS ?= +override LDFLAGS += -L../src/ -luring -lpthread + example_srcs := \ + io_uring-close-test.c \ io_uring-cp.c \ io_uring-test.c \ io_uring-udp.c \ link-cp.c \ + napi-busy-poll-client.c \ + napi-busy-poll-server.c \ poll-bench.c \ - send-zerocopy.c + send-zerocopy.c \ + rsrc-update-bench.c \ + proxy.c all_targets := @@ -24,16 +32,20 @@ all_targets := ifdef CONFIG_HAVE_UCONTEXT example_srcs += ucontext-cp.c endif -all_targets += ucontext-cp +all_targets += ucontext-cp helpers.o example_targets := $(patsubst %.c,%,$(patsubst %.cc,%,$(example_srcs))) all_targets += $(example_targets) +helpers = helpers.o all: $(example_targets) -%: %.c ../src/liburing.a - $(QUIET_CC)$(CC) $(CPPFLAGS) $(CFLAGS) -o $@ $< $(LDFLAGS) +helpers.o: helpers.c + $(QUIET_CC)$(CC) $(CPPFLAGS) $(CFLAGS) -o $@ -c $< + +%: %.c $(helpers) ../src/liburing.a + $(QUIET_CC)$(CC) $(CPPFLAGS) $(CFLAGS) -o $@ $< $(helpers) $(LDFLAGS) clean: @rm -f $(all_targets) diff --git a/examples/helpers.c b/examples/helpers.c new file mode 100644 index 0000000..b70ce7c --- /dev/null +++ b/examples/helpers.c @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: MIT */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "helpers.h" + +int setup_listening_socket(int port, int 
ipv6) +{ + struct sockaddr_in srv_addr = { }; + struct sockaddr_in6 srv_addr6 = { }; + int fd, enable, ret, domain; + + if (ipv6) + domain = AF_INET6; + else + domain = AF_INET; + + fd = socket(domain, SOCK_STREAM, 0); + if (fd == -1) { + perror("socket()"); + return -1; + } + + enable = 1; + ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int)); + if (ret < 0) { + perror("setsockopt(SO_REUSEADDR)"); + return -1; + } + + if (ipv6) { + srv_addr6.sin6_family = AF_INET6; + srv_addr6.sin6_port = htons(port); + srv_addr6.sin6_addr = in6addr_any; + ret = bind(fd, (const struct sockaddr *)&srv_addr6, sizeof(srv_addr6)); + } else { + srv_addr.sin_family = AF_INET; + srv_addr.sin_port = htons(port); + srv_addr.sin_addr.s_addr = htonl(INADDR_ANY); + ret = bind(fd, (const struct sockaddr *)&srv_addr, sizeof(srv_addr)); + } + + if (ret < 0) { + perror("bind()"); + return -1; + } + + if (listen(fd, 1024) < 0) { + perror("listen()"); + return -1; + } + + return fd; +} diff --git a/examples/helpers.h b/examples/helpers.h new file mode 100644 index 0000000..9b1cf34 --- /dev/null +++ b/examples/helpers.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: MIT */ +#ifndef LIBURING_EX_HELPERS_H +#define LIBURING_EX_HELPERS_H + +int setup_listening_socket(int port, int ipv6); + +#endif diff --git a/examples/io_uring-close-test.c b/examples/io_uring-close-test.c new file mode 100644 index 0000000..3936d41 --- /dev/null +++ b/examples/io_uring-close-test.c @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Simple app that demonstrates how to setup an io_uring interface, and use it + * via a registered ring fd, without leaving the original fd open. 
+ * + * gcc -Wall -O2 -D_GNU_SOURCE -o io_uring-close-test io_uring-close-test.c -luring + */ +#include +#include +#include +#include +#include +#include +#include +#include "liburing.h" + +#define QD 4 + +int main(int argc, char *argv[]) +{ + struct io_uring ring; + int i, fd, ret, pending, done; + struct io_uring_sqe *sqe; + struct io_uring_cqe *cqe; + struct iovec *iovecs; + struct stat sb; + ssize_t fsize; + off_t offset; + void *buf; + + if (argc < 2) { + printf("%s: file\n", argv[0]); + return 1; + } + + ret = io_uring_queue_init(QD, &ring, 0); + if (ret < 0) { + fprintf(stderr, "queue_init: %s\n", strerror(-ret)); + return 1; + } + + ret = io_uring_register_ring_fd(&ring); + if (ret < 0) { + fprintf(stderr, "register_ring_fd: %s\n", strerror(-ret)); + return 1; + } + ret = io_uring_close_ring_fd(&ring); + if (ret < 0) { + fprintf(stderr, "close_ring_fd: %s\n", strerror(-ret)); + return 1; + } + + fd = open(argv[1], O_RDONLY); + if (fd < 0) { + perror("open"); + return 1; + } + + if (fstat(fd, &sb) < 0) { + perror("fstat"); + return 1; + } + + fsize = 0; + iovecs = calloc(QD, sizeof(struct iovec)); + for (i = 0; i < QD; i++) { + if (posix_memalign(&buf, 4096, 4096)) + return 1; + iovecs[i].iov_base = buf; + iovecs[i].iov_len = 4096; + fsize += 4096; + } + + offset = 0; + i = 0; + do { + sqe = io_uring_get_sqe(&ring); + if (!sqe) + break; + io_uring_prep_readv(sqe, fd, &iovecs[i], 1, offset); + offset += iovecs[i].iov_len; + i++; + if (offset > sb.st_size) + break; + } while (1); + + ret = io_uring_submit(&ring); + if (ret < 0) { + fprintf(stderr, "io_uring_submit: %s\n", strerror(-ret)); + return 1; + } else if (ret != i) { + fprintf(stderr, "io_uring_submit submitted less %d\n", ret); + return 1; + } + + done = 0; + pending = ret; + fsize = 0; + for (i = 0; i < pending; i++) { + ret = io_uring_wait_cqe(&ring, &cqe); + if (ret < 0) { + fprintf(stderr, "io_uring_wait_cqe: %s\n", strerror(-ret)); + return 1; + } + + done++; + ret = 0; + if (cqe->res != 4096 && 
cqe->res + fsize != sb.st_size) { + fprintf(stderr, "ret=%d, wanted 4096\n", cqe->res); + ret = 1; + } + fsize += cqe->res; + io_uring_cqe_seen(&ring, cqe); + if (ret) + break; + } + + printf("Submitted=%d, completed=%d, bytes=%lu\n", pending, done, + (unsigned long) fsize); + close(fd); + io_uring_queue_exit(&ring); + return 0; +} diff --git a/examples/io_uring-test.c b/examples/io_uring-test.c index 1a68536..d3fcc9e 100644 --- a/examples/io_uring-test.c +++ b/examples/io_uring-test.c @@ -69,7 +69,7 @@ int main(int argc, char *argv[]) io_uring_prep_readv(sqe, fd, &iovecs[i], 1, offset); offset += iovecs[i].iov_len; i++; - if (offset > sb.st_size) + if (offset >= sb.st_size) break; } while (1); diff --git a/examples/io_uring-udp.c b/examples/io_uring-udp.c index a07c3e2..4697af1 100644 --- a/examples/io_uring-udp.c +++ b/examples/io_uring-udp.c @@ -271,14 +271,22 @@ static int process_cqe_recv(struct ctx *ctx, struct io_uring_cqe *cqe, } if (ctx->verbose) { + struct sockaddr_in *addr = io_uring_recvmsg_name(o); + struct sockaddr_in6 *addr6 = (void *)addr; char buff[INET6_ADDRSTRLEN + 1]; const char *name; - struct sockaddr_in *addr = io_uring_recvmsg_name(o); + void *paddr; - name = inet_ntop(ctx->af, addr, buff, sizeof(buff)); + if (ctx->af == AF_INET6) + paddr = &addr6->sin6_addr; + else + paddr = &addr->sin_addr; + + name = inet_ntop(ctx->af, paddr, buff, sizeof(buff)); if (!name) name = ""; - fprintf(stderr, "received %u bytes %d from %s:%d\n", + + fprintf(stderr, "received %u bytes %d from [%s]:%d\n", io_uring_recvmsg_payload_length(o, cqe->res, &ctx->msg), o->namelen, name, (int)ntohs(addr->sin_port)); } diff --git a/examples/napi-busy-poll-client.c b/examples/napi-busy-poll-client.c new file mode 100644 index 0000000..43dcca8 --- /dev/null +++ b/examples/napi-busy-poll-client.c @@ -0,0 +1,509 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Simple ping/pong client which can use the io_uring NAPI support. 
+ * + * Needs to be run as root because it sets SCHED_FIFO scheduling class, + * but will work without that. + * + * Example: + * + * sudo examples/napi-busy-poll-client -a 192.168.2.2 -n100000 -p4444 \ + * -b -t10 -u + * + * send and receive 100k packets, using NAPI. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAXBUFLEN 100 +#define PORTNOLEN 10 +#define ADDRLEN 80 +#define RINGSIZE 1024 + +#define printable(ch) (isprint((unsigned char)ch) ? ch : '#') + +enum { + IOURING_RECV, + IOURING_SEND, + IOURING_RECVMSG, + IOURING_SENDMSG +}; + +struct ctx +{ + struct io_uring ring; + union { + struct sockaddr_in6 saddr6; + struct sockaddr_in saddr; + }; + + int sockfd; + int buffer_len; + int num_pings; + bool napi_check; + + union { + char buffer[MAXBUFLEN]; + struct timespec ts; + }; + + int rtt_index; + double *rtt; +}; + +struct options +{ + int num_pings; + __u32 timeout; + + bool sq_poll; + bool defer_tw; + bool busy_loop; + bool prefer_busy_poll; + bool ipv6; + + char port[PORTNOLEN]; + char addr[ADDRLEN]; +}; + +static struct option longopts[] = +{ + {"address" , 1, NULL, 'a'}, + {"busy" , 0, NULL, 'b'}, + {"help" , 0, NULL, 'h'}, + {"num_pings", 1, NULL, 'n'}, + {"port" , 1, NULL, 'p'}, + {"prefer" , 1, NULL, 'u'}, + {"sqpoll" , 0, NULL, 's'}, + {"timeout" , 1, NULL, 't'}, + {NULL , 0, NULL, 0 } +}; + +static void printUsage(const char *name) +{ + fprintf(stderr, + "Usage: %s [-l|--listen] [-a|--address ip_address] [-p|--port port-no] [-s|--sqpoll]" + " [-b|--busy] [-n|--num pings] [-t|--timeout busy-poll-timeout] [-u||--prefer] [-6] [-h|--help]\n" + "--address\n" + "-a : remote or local ipv6 address\n" + "--busy\n" + "-b : busy poll io_uring instead of blocking.\n" + "--num_pings\n" + "-n : number of pings\n" + "--port\n" + "-p : port\n" + "--sqpoll\n" + "-s : Configure io_uring to use SQPOLL thread\n" + "--timeout\n" + "-t 
: Configure NAPI busy poll timeout" + "--prefer\n" + "-u : prefer NAPI busy poll\n" + "-6 : use IPV6\n" + "--help\n" + "-h : Display this usage message\n\n", + name); +} + +static void printError(const char *msg, int opt) +{ + if (msg && opt) + fprintf(stderr, "%s (-%c)\n", msg, printable(opt)); +} + +static void setProcessScheduler(void) +{ + struct sched_param param; + + param.sched_priority = sched_get_priority_max(SCHED_FIFO); + if (sched_setscheduler(0, SCHED_FIFO, &param) < 0) + fprintf(stderr, "sched_setscheduler() failed: (%d) %s\n", + errno, strerror(errno)); +} + +static double diffTimespec(const struct timespec *time1, const struct timespec *time0) +{ + return (time1->tv_sec - time0->tv_sec) + + (time1->tv_nsec - time0->tv_nsec) / 1000000000.0; +} + +static uint64_t encodeUserData(char type, int fd) +{ + return (uint32_t)fd | ((uint64_t)type << 56); +} + +static void decodeUserData(uint64_t data, char *type, int *fd) +{ + *type = data >> 56; + *fd = data & 0xffffffffU; +} + +static const char *opTypeToStr(char type) +{ + const char *res; + + switch (type) { + case IOURING_RECV: + res = "IOURING_RECV"; + break; + case IOURING_SEND: + res = "IOURING_SEND"; + break; + case IOURING_RECVMSG: + res = "IOURING_RECVMSG"; + break; + case IOURING_SENDMSG: + res = "IOURING_SENDMSG"; + break; + default: + res = "Unknown"; + } + + return res; +} + +static void reportNapi(struct ctx *ctx) +{ + unsigned int napi_id = 0; + socklen_t len = sizeof(napi_id); + + getsockopt(ctx->sockfd, SOL_SOCKET, SO_INCOMING_NAPI_ID, &napi_id, &len); + if (napi_id) + printf(" napi id: %d\n", napi_id); + else + printf(" unassigned napi id\n"); + + ctx->napi_check = true; +} + +static void sendPing(struct ctx *ctx) +{ + struct io_uring_sqe *sqe = io_uring_get_sqe(&ctx->ring); + + clock_gettime(CLOCK_REALTIME, (struct timespec *)ctx->buffer); + io_uring_prep_send(sqe, ctx->sockfd, ctx->buffer, sizeof(struct timespec), 0); + sqe->user_data = encodeUserData(IOURING_SEND, ctx->sockfd); +} + +static
void receivePing(struct ctx *ctx) +{ + struct io_uring_sqe *sqe = io_uring_get_sqe(&ctx->ring); + + io_uring_prep_recv(sqe, ctx->sockfd, ctx->buffer, MAXBUFLEN, 0); + sqe->user_data = encodeUserData(IOURING_RECV, ctx->sockfd); +} + +static void recordRTT(struct ctx *ctx) +{ + struct timespec startTs = ctx->ts; + + // Send next ping. + sendPing(ctx); + + // Store round-trip time. + ctx->rtt[ctx->rtt_index] = diffTimespec(&ctx->ts, &startTs); + ctx->rtt_index++; +} + +static void printStats(struct ctx *ctx) +{ + double minRTT = DBL_MAX; + double maxRTT = 0.0; + double avgRTT = 0.0; + double stddevRTT = 0.0; + + // Calculate min, max, avg. + for (int i = 0; i < ctx->rtt_index; i++) { + if (ctx->rtt[i] < minRTT) + minRTT = ctx->rtt[i]; + if (ctx->rtt[i] > maxRTT) + maxRTT = ctx->rtt[i]; + + avgRTT += ctx->rtt[i]; + } + avgRTT /= ctx->rtt_index; + + // Calculate stddev. + for (int i = 0; i < ctx->rtt_index; i++) + stddevRTT += fabs(ctx->rtt[i] - avgRTT); + stddevRTT /= ctx->rtt_index; + + fprintf(stdout, " rtt(us) min/avg/max/mdev = %.3f/%.3f/%.3f/%.3f\n", + minRTT * 1000000, avgRTT * 1000000, maxRTT * 1000000, stddevRTT * 1000000); +} + +static int completion(struct ctx *ctx, struct io_uring_cqe *cqe) +{ + char type; + int fd; + int res = cqe->res; + + decodeUserData(cqe->user_data, &type, &fd); + if (res < 0) { + fprintf(stderr, "unexpected %s failure: (%d) %s\n", + opTypeToStr(type), -res, strerror(-res)); + return -1; + } + + switch (type) { + case IOURING_SEND: + receivePing(ctx); + break; + case IOURING_RECV: + if (res != sizeof(struct timespec)) { + fprintf(stderr, "unexpected ping reply len: %d\n", res); + abort(); + } + + if (!ctx->napi_check) { + reportNapi(ctx); + sendPing(ctx); + } else { + recordRTT(ctx); + } + + --ctx->num_pings; + break; + + default: + fprintf(stderr, "unexpected %s completion\n", + opTypeToStr(type)); + return -1; + break; + } + + return 0; +} + +int main(int argc, char *argv[]) +{ + struct ctx ctx; + struct options opt; + struct 
__kernel_timespec *tsPtr; + struct __kernel_timespec ts; + struct io_uring_params params; + struct io_uring_napi napi; + int flag, ret, af; + + memset(&opt, 0, sizeof(struct options)); + + // Process flags. + while ((flag = getopt_long(argc, argv, ":hs:bua:n:p:t:6d:", longopts, NULL)) != -1) { + switch (flag) { + case 'a': + strcpy(opt.addr, optarg); + break; + case 'b': + opt.busy_loop = true; + break; + case 'h': + printUsage(argv[0]); + exit(0); + break; + case 'n': + opt.num_pings = atoi(optarg) + 1; + break; + case 'p': + strcpy(opt.port, optarg); + break; + case 's': + opt.sq_poll = !!atoi(optarg); + break; + case 't': + opt.timeout = atoi(optarg); + break; + case 'u': + opt.prefer_busy_poll = true; + break; + case '6': + opt.ipv6 = true; + break; + case 'd': + opt.defer_tw = !!atoi(optarg); + break; + case ':': + printError("Missing argument", optopt); + printUsage(argv[0]); + exit(-1); + break; + case '?': + printError("Unrecognized option", optopt); + printUsage(argv[0]); + exit(-1); + break; + + default: + fprintf(stderr, "Fatal: Unexpected case in CmdLineProcessor switch()\n"); + exit(-1); + break; + } + } + + if (strlen(opt.addr) == 0) { + fprintf(stderr, "address option is mandatory\n"); + printUsage(argv[0]); + exit(1); + } + + if (opt.ipv6) { + af = AF_INET6; + ctx.saddr6.sin6_port = htons(atoi(opt.port)); + ctx.saddr6.sin6_family = AF_INET6; + } else { + af = AF_INET; + ctx.saddr.sin_port = htons(atoi(opt.port)); + ctx.saddr.sin_family = AF_INET; + } + + if (opt.ipv6) + ret = inet_pton(af, opt.addr, &ctx.saddr6.sin6_addr); + else + ret = inet_pton(af, opt.addr, &ctx.saddr.sin_addr); + if (ret <= 0) { + fprintf(stderr, "inet_pton error for %s\n", optarg); + printUsage(argv[0]); + exit(1); + } + + // Connect to server. + fprintf(stdout, "Connecting to %s... 
(port=%s) to send %d pings\n", opt.addr, opt.port, opt.num_pings - 1); + + if ((ctx.sockfd = socket(af, SOCK_DGRAM, 0)) < 0) { + fprintf(stderr, "socket() failed: (%d) %s\n", errno, strerror(errno)); + exit(1); + } + + if (opt.ipv6) + ret = connect(ctx.sockfd, (struct sockaddr *)&ctx.saddr6, sizeof(struct sockaddr_in6)); + else + ret = connect(ctx.sockfd, (struct sockaddr *)&ctx.saddr, sizeof(struct sockaddr_in)); + if (ret < 0) { + fprintf(stderr, "connect() failed: (%d) %s\n", errno, strerror(errno)); + exit(1); + } + + // Setup ring. + memset(&params, 0, sizeof(params)); + memset(&ts, 0, sizeof(ts)); + memset(&napi, 0, sizeof(napi)); + + params.flags = IORING_SETUP_SINGLE_ISSUER; + if (opt.defer_tw) { + params.flags |= IORING_SETUP_DEFER_TASKRUN; + } else if (opt.sq_poll) { + params.flags = IORING_SETUP_SQPOLL; + params.sq_thread_idle = 50; + } else { + params.flags |= IORING_SETUP_COOP_TASKRUN; + } + + ret = io_uring_queue_init_params(RINGSIZE, &ctx.ring, &params); + if (ret) { + fprintf(stderr, "io_uring_queue_init_params() failed: (%d) %s\n", + ret, strerror(-ret)); + exit(1); + } + + if (opt.timeout || opt.prefer_busy_poll) { + napi.prefer_busy_poll = opt.prefer_busy_poll; + napi.busy_poll_to = opt.timeout; + + ret = io_uring_register_napi(&ctx.ring, &napi); + if (ret) { + fprintf(stderr, "io_uring_register_napi: %d\n", ret); + exit(1); + } + } + + if (opt.busy_loop) + tsPtr = &ts; + else + tsPtr = NULL; + + // Use realtime scheduler. + setProcessScheduler(); + + // Copy payload. + clock_gettime(CLOCK_REALTIME, &ctx.ts); + + // Setup context. + ctx.napi_check = false; + ctx.buffer_len = sizeof(struct timespec); + ctx.num_pings = opt.num_pings; + + ctx.rtt_index = 0; + ctx.rtt = (double *)malloc(sizeof(double) * opt.num_pings); + if (!ctx.rtt) { + fprintf(stderr, "Cannot allocate results array\n"); + exit(1); + } + + // Send initial message to get napi id.
+ sendPing(&ctx); + + while (ctx.num_pings != 0) { + int res; + unsigned num_completed = 0; + unsigned head; + struct io_uring_cqe *cqe; + + do { + res = io_uring_submit_and_wait_timeout(&ctx.ring, &cqe, 1, tsPtr, NULL); + if (res >= 0) + break; + else if (res == -ETIME) + continue; + fprintf(stderr, "submit_and_wait: %d\n", res); + exit(1); + } while (1); + + io_uring_for_each_cqe(&ctx.ring, head, cqe) { + ++num_completed; + if (completion(&ctx, cqe)) + goto out; + } + + if (num_completed) + io_uring_cq_advance(&ctx.ring, num_completed); + } + + printStats(&ctx); + +out: + // Clean up. + if (opt.timeout || opt.prefer_busy_poll) { + ret = io_uring_unregister_napi(&ctx.ring, &napi); + if (ret) + fprintf(stderr, "io_uring_unregister_napi: %d\n", ret); + if (opt.timeout != napi.busy_poll_to || + opt.prefer_busy_poll != napi.prefer_busy_poll) { + fprintf(stderr, "Expected busy poll to = %d, got %d\n", + opt.timeout, napi.busy_poll_to); + fprintf(stderr, "Expected prefer busy poll = %d, got %d\n", + opt.prefer_busy_poll, napi.prefer_busy_poll); + } + } else { + ret = io_uring_unregister_napi(&ctx.ring, NULL); + if (ret) + fprintf(stderr, "io_uring_unregister_napi: %d\n", ret); + } + + io_uring_queue_exit(&ctx.ring); + free(ctx.rtt); + close(ctx.sockfd); + return 0; +} diff --git a/examples/napi-busy-poll-server.c b/examples/napi-busy-poll-server.c new file mode 100644 index 0000000..584632a --- /dev/null +++ b/examples/napi-busy-poll-server.c @@ -0,0 +1,450 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Simple ping/pong backend which can use the io_uring NAPI support. + * + * Needs to be run as root because it sets SCHED_FIFO scheduling class, + * but will work without that. + * + * Example: + * + * sudo examples/napi-busy-poll-server -l -a 192.168.2.2 -n100000 \ + * -p4444 -t10 -b -u + * + * will respond to 100k packages, using NAPI. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAXBUFLEN 100 +#define PORTNOLEN 10 +#define ADDRLEN 80 +#define RINGSIZE 1024 + +#define printable(ch) (isprint((unsigned char)ch) ? ch : '#') + +enum { + IOURING_RECV, + IOURING_SEND, + IOURING_RECVMSG, + IOURING_SENDMSG +}; + +struct ctx +{ + struct io_uring ring; + union { + struct sockaddr_in6 saddr6; + struct sockaddr_in saddr; + }; + struct iovec iov; + struct msghdr msg; + + int sockfd; + int buffer_len; + int num_pings; + bool napi_check; + + union { + char buffer[MAXBUFLEN]; + struct timespec ts; + }; +}; + +struct options +{ + int num_pings; + __u32 timeout; + + bool listen; + bool defer_tw; + bool sq_poll; + bool busy_loop; + bool prefer_busy_poll; + bool ipv6; + + char port[PORTNOLEN]; + char addr[ADDRLEN]; +}; + +static struct options opt; + +static struct option longopts[] = +{ + {"address" , 1, NULL, 'a'}, + {"busy" , 0, NULL, 'b'}, + {"help" , 0, NULL, 'h'}, + {"listen" , 0, NULL, 'l'}, + {"num_pings", 1, NULL, 'n'}, + {"port" , 1, NULL, 'p'}, + {"prefer" , 1, NULL, 'u'}, + {"sqpoll" , 0, NULL, 's'}, + {"timeout" , 1, NULL, 't'}, + {NULL , 0, NULL, 0 } +}; + +static void printUsage(const char *name) +{ + fprintf(stderr, + "Usage: %s [-l|--listen] [-a|--address ip_address] [-p|--port port-no] [-s|--sqpoll]" + " [-b|--busy] [-n|--num pings] [-t|--timeout busy-poll-timeout] [-u|--prefer] [-6] [-h|--help]\n" + " --listen\n" + "-l : Server mode\n" + "--address\n" + "-a : remote or local ipv6 address\n" + "--busy\n" + "-b : busy poll io_uring instead of blocking.\n" + "--num_pings\n" + "-n : number of pings\n" + "--port\n" + "-p : port\n" + "--sqpoll\n" + "-s : Configure io_uring to use SQPOLL thread\n" + "--timeout\n" + "-t : Configure NAPI busy poll timeout" + "--prefer\n" + "-u : prefer NAPI busy poll\n" + "-6 : use IPV6\n" + "--help\n" + "-h : Display this usage message\n\n", + 
name); +} + +static void printError(const char *msg, int opt) +{ + if (msg && opt) + fprintf(stderr, "%s (-%c)\n", msg, printable(opt)); +} + +static void setProcessScheduler(void) +{ + struct sched_param param; + + param.sched_priority = sched_get_priority_max(SCHED_FIFO); + if (sched_setscheduler(0, SCHED_FIFO, &param) < 0) + fprintf(stderr, "sched_setscheduler() failed: (%d) %s\n", + errno, strerror(errno)); +} + +static uint64_t encodeUserData(char type, int fd) +{ + return (uint32_t)fd | ((__u64)type << 56); +} + +static void decodeUserData(uint64_t data, char *type, int *fd) +{ + *type = data >> 56; + *fd = data & 0xffffffffU; +} + +static const char *opTypeToStr(char type) +{ + const char *res; + + switch (type) { + case IOURING_RECV: + res = "IOURING_RECV"; + break; + case IOURING_SEND: + res = "IOURING_SEND"; + break; + case IOURING_RECVMSG: + res = "IOURING_RECVMSG"; + break; + case IOURING_SENDMSG: + res = "IOURING_SENDMSG"; + break; + default: + res = "Unknown"; + } + + return res; +} + +static void reportNapi(struct ctx *ctx) +{ + unsigned int napi_id = 0; + socklen_t len = sizeof(napi_id); + + getsockopt(ctx->sockfd, SOL_SOCKET, SO_INCOMING_NAPI_ID, &napi_id, &len); + if (napi_id) + printf(" napi id: %d\n", napi_id); + else + printf(" unassigned napi id\n"); + + ctx->napi_check = true; +} + +static void sendPing(struct ctx *ctx) +{ + struct io_uring_sqe *sqe = io_uring_get_sqe(&ctx->ring); + + io_uring_prep_sendmsg(sqe, ctx->sockfd, &ctx->msg, 0); + sqe->user_data = encodeUserData(IOURING_SENDMSG, ctx->sockfd); +} + +static void receivePing(struct ctx *ctx) +{ + struct io_uring_sqe *sqe; + + bzero(&ctx->msg, sizeof(struct msghdr)); + if (opt.ipv6) { + ctx->msg.msg_name = &ctx->saddr6; + ctx->msg.msg_namelen = sizeof(struct sockaddr_in6); + } else { + ctx->msg.msg_name = &ctx->saddr; + ctx->msg.msg_namelen = sizeof(struct sockaddr_in); + } + ctx->iov.iov_base = ctx->buffer; + ctx->iov.iov_len = MAXBUFLEN; + ctx->msg.msg_iov = &ctx->iov; +
ctx->msg.msg_iovlen = 1; + + sqe = io_uring_get_sqe(&ctx->ring); + io_uring_prep_recvmsg(sqe, ctx->sockfd, &ctx->msg, 0); + sqe->user_data = encodeUserData(IOURING_RECVMSG, ctx->sockfd); +} + +static void completion(struct ctx *ctx, struct io_uring_cqe *cqe) +{ + char type; + int fd; + int res = cqe->res; + + decodeUserData(cqe->user_data, &type, &fd); + if (res < 0) { + fprintf(stderr, "unexpected %s failure: (%d) %s\n", + opTypeToStr(type), -res, strerror(-res)); + abort(); + } + + switch (type) { + case IOURING_SENDMSG: + receivePing(ctx); + --ctx->num_pings; + break; + case IOURING_RECVMSG: + ctx->iov.iov_len = res; + sendPing(ctx); + if (!ctx->napi_check) + reportNapi(ctx); + break; + default: + fprintf(stderr, "unexpected %s completion\n", + opTypeToStr(type)); + abort(); + break; + } +} + +int main(int argc, char *argv[]) +{ + int flag; + struct ctx ctx; + struct __kernel_timespec *tsPtr; + struct __kernel_timespec ts; + struct io_uring_params params; + struct io_uring_napi napi; + int ret, af; + + memset(&opt, 0, sizeof(struct options)); + + // Process flags. 
+ while ((flag = getopt_long(argc, argv, ":lhs:bua:n:p:t:6d:", longopts, NULL)) != -1) { + switch (flag) { + case 'a': + strcpy(opt.addr, optarg); + break; + case 'b': + opt.busy_loop = true; + break; + case 'h': + printUsage(argv[0]); + exit(0); + break; + case 'l': + opt.listen = true; + break; + case 'n': + opt.num_pings = atoi(optarg) + 1; + break; + case 'p': + strcpy(opt.port, optarg); + break; + case 's': + opt.sq_poll = !!atoi(optarg); + break; + case 't': + opt.timeout = atoi(optarg); + break; + case 'u': + opt.prefer_busy_poll = true; + break; + case '6': + opt.ipv6 = true; + break; + case 'd': + opt.defer_tw = !!atoi(optarg); + break; + case ':': + printError("Missing argument", optopt); + printUsage(argv[0]); + exit(-1); + break; + case '?': + printError("Unrecognized option", optopt); + printUsage(argv[0]); + exit(-1); + break; + + default: + fprintf(stderr, "Fatal: Unexpected case in CmdLineProcessor switch()\n"); + exit(-1); + break; + } + } + + if (strlen(opt.addr) == 0) { + fprintf(stderr, "address option is mandatory\n"); + printUsage(argv[0]); + exit(1); + } + + if (opt.ipv6) { + af = AF_INET6; + ctx.saddr6.sin6_port = htons(atoi(opt.port)); + ctx.saddr6.sin6_family = AF_INET6; + } else { + af = AF_INET; + ctx.saddr.sin_port = htons(atoi(opt.port)); + ctx.saddr.sin_family = AF_INET; + } + + if (opt.ipv6) + ret = inet_pton(AF_INET6, opt.addr, &ctx.saddr6.sin6_addr); + else + ret = inet_pton(AF_INET, opt.addr, &ctx.saddr.sin_addr); + if (ret <= 0) { + fprintf(stderr, "inet_pton error for %s\n", optarg); + printUsage(argv[0]); + exit(1); + } + + // Connect to server. 
+ fprintf(stdout, "Listening %s : %s...\n", opt.addr, opt.port); + + if ((ctx.sockfd = socket(af, SOCK_DGRAM, 0)) < 0) { + fprintf(stderr, "socket() failed: (%d) %s\n", errno, strerror(errno)); + exit(1); + } + + if (opt.ipv6) + ret = bind(ctx.sockfd, (struct sockaddr *)&ctx.saddr6, sizeof(struct sockaddr_in6)); + else + ret = bind(ctx.sockfd, (struct sockaddr *)&ctx.saddr, sizeof(struct sockaddr_in)); + if (ret < 0) { + fprintf(stderr, "bind() failed: (%d) %s\n", errno, strerror(errno)); + exit(1); + } + + // Setup ring. + memset(&params, 0, sizeof(params)); + memset(&ts, 0, sizeof(ts)); + memset(&napi, 0, sizeof(napi)); + + params.flags = IORING_SETUP_SINGLE_ISSUER; + if (opt.defer_tw) { + params.flags |= IORING_SETUP_DEFER_TASKRUN; + } else if (opt.sq_poll) { + params.flags = IORING_SETUP_SQPOLL; + params.sq_thread_idle = 50; + } else { + params.flags |= IORING_SETUP_COOP_TASKRUN; + } + + ret = io_uring_queue_init_params(RINGSIZE, &ctx.ring, &params); + if (ret) { + fprintf(stderr, "io_uring_queue_init_params() failed: (%d) %s\n", + ret, strerror(-ret)); + exit(1); + } + + if (opt.timeout || opt.prefer_busy_poll) { + napi.prefer_busy_poll = opt.prefer_busy_poll; + napi.busy_poll_to = opt.timeout; + + ret = io_uring_register_napi(&ctx.ring, &napi); + if (ret) { + fprintf(stderr, "io_uring_register_napi: %d\n", ret); + exit(1); + } + } + + if (opt.busy_loop) + tsPtr = &ts; + else + tsPtr = NULL; + + // Use realtime scheduler. + setProcessScheduler(); + + // Copy payload. + clock_gettime(CLOCK_REALTIME, &ctx.ts); + + // Setup context. + ctx.napi_check = false; + ctx.buffer_len = sizeof(struct timespec); + ctx.num_pings = opt.num_pings; + + // Receive initial message to get napi id.
+ receivePing(&ctx); + + while (ctx.num_pings != 0) { + int res; + unsigned int num_completed = 0; + unsigned int head; + struct io_uring_cqe *cqe; + + do { + res = io_uring_submit_and_wait_timeout(&ctx.ring, &cqe, 1, tsPtr, NULL); + if (res >= 0) + break; + else if (res == -ETIME) + continue; + fprintf(stderr, "submit_and_wait: %d\n", res); + exit(1); + } while (1); + + io_uring_for_each_cqe(&ctx.ring, head, cqe) { + ++num_completed; + completion(&ctx, cqe); + } + + if (num_completed) + io_uring_cq_advance(&ctx.ring, num_completed); + } + + // Clean up. + if (opt.timeout || opt.prefer_busy_poll) { + ret = io_uring_unregister_napi(&ctx.ring, &napi); + if (ret) + fprintf(stderr, "io_uring_unregister_napi: %d\n", ret); + } + + io_uring_queue_exit(&ctx.ring); + close(ctx.sockfd); + return 0; +} diff --git a/examples/proxy.c b/examples/proxy.c new file mode 100644 index 0000000..67df81e --- /dev/null +++ b/examples/proxy.c @@ -0,0 +1,2461 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Sample program that can act either as a packet sink, where it just receives + * packets and doesn't do anything with them, or it can act as a proxy where it + * receives packets and then sends them to a new destination. The proxy can + * be unidirectional (-B0), or bi-direction (-B1). + * + * Examples: + * + * Act as a proxy, listening on port 4444, and send data to 192.168.2.6 on port + * 4445. Use multishot receive, DEFER_TASKRUN, and fixed files + * + * ./proxy -m1 -r4444 -H 192.168.2.6 -p4445 + * + * Same as above, but utilize send bundles (-C1, requires -u1 send_ring) as well + * with ring provided send buffers, and recv bundles (-c1). + * + * ./proxy -m1 -c1 -u1 -C1 -r4444 -H 192.168.2.6 -p4445 + * + * Act as a bi-directional proxy, listening on port 8888, and send data back + * and forth between host and 192.168.2.6 on port 22. Use multishot receive, + * DEFER_TASKRUN, fixed files, and buffers of size 1500. 
+ * + * ./proxy -m1 -B1 -b1500 -r8888 -H 192.168.2.6 -p22 + * + * Act a sink, listening on port 4445, using multishot receive, DEFER_TASKRUN, + * and fixed files: + * + * ./proxy -m1 -s1 -r4445 + * + * Run with -h to see a list of options, and their defaults. + * + * (C) 2024 Jens Axboe + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "proxy.h" +#include "helpers.h" + +/* + * Will go away once/if bundles are upstreamed and we put the generic + * definitions in the kernel header. + */ +#ifndef IORING_RECVSEND_BUNDLE +#define IORING_RECVSEND_BUNDLE (1U << 4) +#endif +#ifndef IORING_FEAT_SEND_BUF_SELECT +#define IORING_FEAT_SEND_BUF_SELECT (1U << 14) +#endif + +static int cur_bgid = 1; +static int nr_conns; +static int open_conns; +static long page_size; + +static unsigned long event_loops; +static unsigned long events; + +static int recv_mshot = 1; +static int sqpoll; +static int defer_tw = 1; +static int is_sink; +static int fixed_files = 1; +static char *host = "192.168.3.2"; +static int send_port = 4445; +static int receive_port = 4444; +static int buf_size = 32; +static int bidi; +static int ipv6; +static int napi; +static int napi_timeout; +static int wait_batch = 1; +static int wait_usec = 1000000; +static int rcv_msg; +static int snd_msg; +static int snd_zc; +static int send_ring = -1; +static int snd_bundle; +static int rcv_bundle; +static int use_huge; +static int ext_stat; +static int verbose; + +static int nr_bufs = 256; +static int br_mask; + +static int ring_size = 128; + +static pthread_mutex_t thread_lock; +static struct timeval last_housekeeping; + +/* + * For sendmsg/recvmsg. recvmsg just has a single vec, sendmsg will have + * two vecs - one that is currently submitted and being sent, and one that + * is being prepared. When a new sendmsg is issued, we'll swap which one we + * use. 
For send, even though we don't pass in the iovec itself, we use the
+ * vec to serialize the sends to avoid reordering.
+ */
+struct msg_vec {
+	struct iovec *iov;
+	/* length of allocated vec */
+	int vec_size;
+	/* length currently being used */
+	int iov_len;
+	/* only for send, current index we're processing */
+	int cur_iov;
+};
+
+struct io_msg {
+	struct msghdr msg;
+	struct msg_vec vecs[2];
+	/* current msg_vec being prepared */
+	int vec_index;
+};
+
+/*
+ * Per socket stats per connection. For bi-directional, we'll have both
+ * sends and receives on each socket, this helps track them separately.
+ * For sink or one directional, each of the two stats will be only sends
+ * or receives, not both.
+ */
+struct conn_dir {
+	int index;	/* slot of this entry in conn->cd[], set by handle_accept() */
+
+	int pending_shutdown;	/* set by close_cd() */
+	int pending_send;	/* set by submit_send() when a send is queued */
+	int pending_recv;	/* set by __submit_receive() when a recv is armed */
+
+	int snd_notif;	/* bumped for each zero-copy send that is queued */
+
+	int out_buffers;	/* buffers queued for sending, see __handle_recv() */
+
+	int rcv, rcv_shrt, rcv_enobufs, rcv_mshot;
+	int snd, snd_shrt, snd_enobufs, snd_busy, snd_mshot;
+
+	int snd_next_bid;	/* starts out as -1 */
+	int rcv_next_bid;	/* starts out as -1 */
+
+	int *rcv_bucket;	/* only allocated with ext_stat, nr_bufs + 1 counters */
+	int *snd_bucket;	/* only allocated with ext_stat, nr_bufs + 1 counters */
+
+	unsigned long in_bytes, out_bytes;
+
+	/* only ever have a single recv pending */
+	struct io_msg io_rcv_msg;
+
+	/* one send that is inflight, and one being prepared for the next one */
+	struct io_msg io_snd_msg;
+};
+
+enum {	/* bit values kept in conn->flags */
+	CONN_F_STARTED		= 1,
+	CONN_F_DISCONNECTING	= 2,
+	CONN_F_DISCONNECTED	= 4,
+	CONN_F_PENDING_SHUTDOWN	= 8,
+	CONN_F_STATS_SHOWN	= 16,
+	CONN_F_END_TIME		= 32,
+	CONN_F_REAPED		= 64,
+};
+
+/*
+ * buffer ring belonging to a connection
+ */
+struct conn_buf_ring {
+	struct io_uring_buf_ring *br;
+	void *buf;
+	int bgid;
+};
+
+struct conn {
+	struct io_uring ring;
+
+	/* receive side buffer ring, new data arrives here */
+	struct conn_buf_ring in_br;
+	/* if send_ring is used, outgoing data to send */
+	struct conn_buf_ring out_br;
+
+	int tid;	/* index of this connection in conns[] */
+	int in_fd, out_fd;
+	int pending_cancels;
+	int flags;	/* CONN_F_* bits */
+
+	struct conn_dir cd[2];
+
+	struct timeval start_time, end_time;
+
+	union {
+		struct sockaddr_in addr;
+		struct sockaddr_in6 addr6;
+	};
+
+	pthread_t thread;
+	pthread_barrier_t startup_barrier;
+};
+
+#define MAX_CONNS	1024
+static struct conn conns[MAX_CONNS];
+
+#define vlog(str, ...) do {						\
+	if (verbose)							\
+		printf(str, ##__VA_ARGS__);				\
+} while (0)
+
+static int prep_next_send(struct io_uring *ring, struct conn *c,
+			  struct conn_dir *cd, int fd);
+static void *thread_main(void *data);
+
+static struct conn *cqe_to_conn(struct io_uring_cqe *cqe)
+{
+	struct userdata ud = { .val = cqe->user_data };
+
+	return &conns[ud.op_tid & TID_MASK];
+}
+
+static struct conn_dir *cqe_to_conn_dir(struct conn *c,
+					struct io_uring_cqe *cqe)
+{
+	int fd = cqe_to_fd(cqe);
+
+	return &c->cd[fd != c->in_fd];	/* cd[0] for in_fd, cd[1] for any other fd */
+}
+
+static int other_dir_fd(struct conn *c, int fd)
+{
+	if (c->in_fd == fd)
+		return c->out_fd;
+	return c->in_fd;
+}
+
+/* currently active msg_vec */
+static struct msg_vec *msg_vec(struct io_msg *imsg)
+{
+	return &imsg->vecs[imsg->vec_index];
+}
+
+static struct msg_vec *snd_msg_vec(struct conn_dir *cd)
+{
+	return msg_vec(&cd->io_snd_msg);
+}
+
+/*
+ * Goes from accept new connection -> create socket, connect to end
+ * point, prepare recv, on receive do send (unless sink). If either ends
+ * disconnects, we transition to shutdown and then close.
+ */
+enum {
+	__ACCEPT	= 1,
+	__SOCK		= 2,
+	__CONNECT	= 3,
+	__RECV		= 4,
+	__RECVMSG	= 5,
+	__SEND		= 6,
+	__SENDMSG	= 7,
+	__SHUTDOWN	= 8,
+	__CANCEL	= 9,
+	__CLOSE		= 10,
+	__FD_PASS	= 11,
+	__NOP		= 12,
+	__STOP		= 13,
+};
+
+/*
+ * Per-opcode error handling entry: 'name' is used when reporting the error,
+ * 'error_fn' (may be NULL) is the handler for a failed completion.
+ */
+struct error_handler {
+	const char *name;
+	int (*error_fn)(struct error_handler *, struct io_uring *, struct io_uring_cqe *);
+};
+
+static int recv_error(struct error_handler *err, struct io_uring *ring,
+		      struct io_uring_cqe *cqe);
+static int send_error(struct error_handler *err, struct io_uring *ring,
+		      struct io_uring_cqe *cqe);
+
+/*
+ * Generic handler: report the failing opcode name, the error string and the
+ * fd/bid encoded in the CQE, and return 1.
+ */
+static int default_error(struct error_handler *err,
+			 struct io_uring __attribute__((__unused__)) *ring,
+			 struct io_uring_cqe *cqe)
+{
+	struct conn *c = cqe_to_conn(cqe);
+
+	fprintf(stderr, "%d: %s error %s\n", c->tid, err->name, strerror(-cqe->res));
+	fprintf(stderr, "fd=%d, bid=%d\n", cqe_to_fd(cqe), cqe_to_bid(cqe));
+	return 1;
+}
+
+/*
+ * Move error handling out of the normal handling path, cleanly separating
+ * them. If an opcode doesn't need any error handling, set it to NULL. If
+ * it wants to stop the connection at that point and not do anything else,
+ * then the default handler can be used. Only receive has proper error
+ * handling, as we can get -ENOBUFS which is not a fatal condition. It just
+ * means we need to wait on buffer replenishing before re-arming the receive.
+ *
+ * Entries are keyed by opcode with designated indices, so the table stays
+ * correct even if the opcode enum above is reordered or extended.
+ */
+static struct error_handler error_handlers[] = {
+	[0]		= { .name = "NULL",	.error_fn = NULL, },
+	[__ACCEPT]	= { .name = "ACCEPT",	.error_fn = default_error, },
+	[__SOCK]	= { .name = "SOCK",	.error_fn = default_error, },
+	[__CONNECT]	= { .name = "CONNECT",	.error_fn = default_error, },
+	[__RECV]	= { .name = "RECV",	.error_fn = recv_error, },
+	[__RECVMSG]	= { .name = "RECVMSG",	.error_fn = recv_error, },
+	[__SEND]	= { .name = "SEND",	.error_fn = send_error, },
+	[__SENDMSG]	= { .name = "SENDMSG",	.error_fn = send_error, },
+	[__SHUTDOWN]	= { .name = "SHUTDOWN",	.error_fn = NULL, },
+	[__CANCEL]	= { .name = "CANCEL",	.error_fn = NULL, },
+	[__CLOSE]	= { .name = "CLOSE",	.error_fn = NULL, },
+	[__FD_PASS]	= { .name = "FD_PASS",	.error_fn = default_error, },
+	[__NOP]		= { .name = "NOP",	.error_fn = NULL, },
+	[__STOP]	= { .name = "STOP",	.error_fn = default_error, },
+};
+
+/*
+ * Unregister and free one buffer ring plus its backing memory, if any.
+ * Safe to call on a ring that was never set up (cbr->br == NULL).
+ */
+static void free_buffer_ring(struct io_uring *ring, struct conn_buf_ring *cbr)
+{
+	if (!cbr->br)
+		return;
+
+	io_uring_free_buf_ring(ring, cbr->br, nr_bufs, cbr->bgid);
+	cbr->br = NULL;
+
+	/* the send ring has no backing memory of its own, nothing to release */
+	if (!cbr->buf)
+		return;
+	if (use_huge)
+		munmap(cbr->buf, (size_t) buf_size * nr_bufs);
+	else
+		free(cbr->buf);
+	cbr->buf = NULL;
+}
+
+static void free_buffer_rings(struct io_uring *ring, struct conn *c)
+{
+	free_buffer_ring(ring, &c->in_br);
+	free_buffer_ring(ring, &c->out_br);
+}
+
+/*
+ * Setup a ring provided buffer ring for each connection. If we get -ENOBUFS
+ * on receive, for multishot receive we'll wait for half the provided buffers
+ * to be returned by pending sends, then re-arm the multishot receive. If
+ * this happens too frequently (see enobufs= stat), then the ring size is
+ * likely too small. Use -nXX to make it bigger. See recv_enobufs().
+ *
+ * The alternative here would be to use the older style provided buffers,
+ * where you simply setup a buffer group and use SQEs with
+ * io_uring_prep_provide_buffers() to add to the pool. But that approach is
+ * slower and has been deprecated by using the faster ring provided buffers.
+ */
+static int setup_recv_ring(struct io_uring *ring, struct conn *c)
+{
+	struct conn_buf_ring *cbr = &c->in_br;
+	int ret, i;
+	size_t len;
+	void *ptr;
+
+	/* widen before multiplying so large buffer settings can't overflow int */
+	len = (size_t) buf_size * nr_bufs;
+	if (use_huge) {
+		cbr->buf = mmap(NULL, len, PROT_READ|PROT_WRITE,
+				MAP_PRIVATE|MAP_HUGETLB|MAP_HUGE_2MB|MAP_ANONYMOUS,
+				-1, 0);
+		if (cbr->buf == MAP_FAILED) {
+			perror("mmap");
+			cbr->buf = NULL;
+			return 1;
+		}
+	} else {
+		/*
+		 * posix_memalign() reports failure through its return value
+		 * and does not set errno, so perror() would print garbage.
+		 */
+		ret = posix_memalign(&cbr->buf, page_size, len);
+		if (ret) {
+			fprintf(stderr, "posix memalign: %s\n", strerror(ret));
+			cbr->buf = NULL;
+			return 1;
+		}
+	}
+	cbr->br = io_uring_setup_buf_ring(ring, nr_bufs, cbr->bgid, 0, &ret);
+	if (!cbr->br) {
+		fprintf(stderr, "Buffer ring register failed %d\n", ret);
+		/*
+		 * free_buffer_ring() bails out when ->br is NULL, so the
+		 * backing memory has to be released here or it is leaked.
+		 */
+		if (use_huge)
+			munmap(cbr->buf, len);
+		else
+			free(cbr->buf);
+		cbr->buf = NULL;
+		return 1;
+	}
+
+	/* hand every buffer to the kernel, buffer ID == index into ->buf */
+	ptr = cbr->buf;
+	for (i = 0; i < nr_bufs; i++) {
+		vlog("%d: add bid %d, data %p\n", c->tid, i, ptr);
+		io_uring_buf_ring_add(cbr->br, ptr, buf_size, i, br_mask, i);
+		ptr += buf_size;
+	}
+	io_uring_buf_ring_advance(cbr->br, nr_bufs);
+	printf("%d: recv buffer ring bgid %d, bufs %d\n", c->tid, cbr->bgid, nr_bufs);
+	return 0;
+}
+
+/*
+ * If 'send_ring' is used and the kernel supports it, we can skip serializing
+ * sends as the data will be ordered regardless. This reduces the send handling
+ * complexity, as buffers can always be added to the outgoing ring and will be
+ * processed in the order in which they were added.
+ */
+static int setup_send_ring(struct io_uring *ring, struct conn *c)
+{
+	struct conn_buf_ring *cbr = &c->out_br;
+	int ret;
+
+	cbr->br = io_uring_setup_buf_ring(ring, nr_bufs, cbr->bgid, 0, &ret);
+	if (!cbr->br) {
+		fprintf(stderr, "Buffer ring register failed %d\n", ret);
+		return 1;
+	}
+
+	printf("%d: send buffer ring bgid %d, bufs %d\n", c->tid, cbr->bgid, nr_bufs);
+	return 0;
+}
+
+/*
+ * Register the receive buffers as fixed buffers, one per buffer ID, for use
+ * by zero-copy send. Nothing is registered when sendmsg is used. Returns 0
+ * on success, non-zero on failure.
+ */
+static int setup_send_zc(struct io_uring *ring, struct conn *c)
+{
+	struct iovec *iovs;
+	void *buf;
+	int i, ret;
+
+	if (snd_msg)
+		return 0;
+
+	buf = c->in_br.buf;
+	iovs = calloc(nr_bufs, sizeof(struct iovec));
+	if (!iovs) {
+		fprintf(stderr, "failed allocating iovecs for buffer registration\n");
+		return 1;
+	}
+	for (i = 0; i < nr_bufs; i++) {
+		iovs[i].iov_base = buf;
+		iovs[i].iov_len = buf_size;
+		buf += buf_size;
+	}
+
+	ret = io_uring_register_buffers(ring, iovs, nr_bufs);
+	if (ret)
+		fprintf(stderr, "failed registering buffers: %d\n", ret);
+
+	/* the iovec array is only needed for the registration call itself */
+	free(iovs);
+	return ret;
+}
+
+/*
+ * Setup an input and output buffer ring. On failure, anything that was
+ * already set up is torn down again.
+ */
+static int setup_buffer_rings(struct io_uring *ring, struct conn *c)
+{
+	int ret;
+
+	/* no locking needed on cur_bgid, parent serializes setup */
+	c->in_br.bgid = cur_bgid++;
+	c->out_br.bgid = cur_bgid++;
+	c->out_br.br = NULL;
+
+	ret = setup_recv_ring(ring, c);
+	if (ret)
+		return ret;
+	if (is_sink)
+		return 0;
+	if (snd_zc) {
+		ret = setup_send_zc(ring, c);
+		if (ret)
+			goto err;
+	}
+	if (send_ring) {
+		ret = setup_send_ring(ring, c);
+		if (ret)
+			goto err;
+	}
+
+	return 0;
+err:
+	free_buffer_ring(ring, &c->in_br);
+	return ret;
+}
+
+struct bucket_stat {
+	int nr_packets;
+	int count;
+};
+
+/* qsort() comparator, orders bucket_stat entries by descending count */
+static int stat_cmp(const void *p1, const void *p2)
+{
+	const struct bucket_stat *b1 = p1;
+	const struct bucket_stat *b2 = p2;
+
+	if (b1->count < b2->count)
+		return 1;
+	else if (b1->count > b2->count)
+		return -1;
+	return 0;
+}
+
+/*
+ * Print the packets-per-recv/send histogram for one direction, most
+ * frequent buckets first. Does nothing if the buckets were never allocated
+ * or if nothing was counted.
+ */
+static void show_buckets(struct conn_dir *cd)
+{
+	unsigned long snd_total, rcv_total;
+	struct bucket_stat *rstat, *sstat;
+	int i;
+
+	if (!cd->rcv_bucket || !cd->snd_bucket)
+		return;
+
+	rstat = calloc(nr_bufs + 1, sizeof(struct bucket_stat));
+	sstat = calloc(nr_bufs + 1, sizeof(struct bucket_stat));
+	if (!rstat || !sstat)
+		goto out;
+
+	snd_total = rcv_total = 0;
+	for (i = 0; i <= nr_bufs; i++) {
+		snd_total += cd->snd_bucket[i];
+		sstat[i].nr_packets = i;
+		sstat[i].count = cd->snd_bucket[i];
+		rcv_total += cd->rcv_bucket[i];
+		rstat[i].nr_packets = i;
+		rstat[i].count = cd->rcv_bucket[i];
+	}
+
+	/*
+	 * Nothing counted, nothing to show. This must leave the function:
+	 * falling through would touch the arrays after they are freed and
+	 * then free them a second time.
+	 */
+	if (!snd_total && !rcv_total)
+		goto out;
+
+	/* the arrays hold nr_bufs + 1 entries, sort all of them */
+	if (snd_total)
+		qsort(sstat, nr_bufs + 1, sizeof(struct bucket_stat), stat_cmp);
+	if (rcv_total)
+		qsort(rstat, nr_bufs + 1, sizeof(struct bucket_stat), stat_cmp);
+
+	printf("\t Packets per recv/send:\n");
+	for (i = 0; i <= nr_bufs; i++) {
+		double snd_prc = 0.0, rcv_prc = 0.0;
+		if (!rstat[i].count && !sstat[i].count)
+			continue;
+		if (rstat[i].count)
+			rcv_prc = 100.0 * (rstat[i].count / (double) rcv_total);
+		if (sstat[i].count)
+			snd_prc = 100.0 * (sstat[i].count / (double) snd_total);
+		printf("\t bucket(%3d/%3d): rcv=%u (%.2f%%) snd=%u (%.2f%%)\n",
+				rstat[i].nr_packets, sstat[i].nr_packets,
+				rstat[i].count, rcv_prc,
+				sstat[i].count, snd_prc);
+	}
+
+out:
+	free(sstat);
+	free(rstat);
+}
+
+/*
+ * Print the stats of one connection, once. Connections that never started,
+ * were already reaped or shown, or saw no traffic are skipped.
+ */
+static void __show_stats(struct conn *c)
+{
+	unsigned long msec, qps;
+	unsigned long bytes, bw;
+	struct conn_dir *cd;
+	int i;
+
+	if (c->flags & (CONN_F_STATS_SHOWN | CONN_F_REAPED))
+		return;
+	if (!(c->flags & CONN_F_STARTED))
+		return;
+
+	if (!(c->flags & CONN_F_END_TIME))
+		gettimeofday(&c->end_time, NULL);
+
+	msec = (c->end_time.tv_sec - c->start_time.tv_sec) * 1000;
+	msec += (c->end_time.tv_usec - c->start_time.tv_usec) / 1000;
+
+	qps = 0;
+	for (i = 0; i < 2; i++)
+		qps += c->cd[i].rcv + c->cd[i].snd;
+
+	if (!qps)
+		return;
+
+	if (msec)
+		qps = (qps * 1000) / msec;
+
+	printf("Conn %d/(in_fd=%d, out_fd=%d): qps=%lu, msec=%lu\n", c->tid,
+		c->in_fd, c->out_fd, qps, msec);
+
+	bytes = 0;
+	for (i = 0; i < 2; i++) {
+		cd = &c->cd[i];
+
+		if (!cd->in_bytes && !cd->out_bytes && !cd->snd && !cd->rcv)
+			continue;
+
+		bytes += cd->in_bytes;
+		bytes += cd->out_bytes;
+
+		printf("\t%3d: rcv=%u (short=%u, enobufs=%d), snd=%u (short=%u,"
+			" busy=%u, enobufs=%d)\n", i, cd->rcv, cd->rcv_shrt,
+			cd->rcv_enobufs, cd->snd, cd->snd_shrt, cd->snd_busy,
+			cd->snd_enobufs);
+		printf("\t : in_bytes=%lu (Kb %lu), out_bytes=%lu (Kb %lu)\n",
+			cd->in_bytes, cd->in_bytes >> 10,
+			cd->out_bytes, cd->out_bytes >> 10);
+		printf("\t : mshot_rcv=%d, mshot_snd=%d\n", cd->rcv_mshot,
+			cd->snd_mshot);
+		show_buckets(cd);
+
+	}
+	if (msec) {
+		bytes *= 8UL;
+		bw = bytes / 1000;
+		bw /= msec;
+		printf("\tBW=%'luMbit\n", bw);
+	}
+
+	c->flags |= CONN_F_STATS_SHOWN;
+}
+
+/* Print the global event loop stats and every connection's stats, once. */
+static void show_stats(void)
+{
+	float events_per_loop = 0.0;
+	static int stats_shown;
+	int i;
+
+	if (stats_shown)
+		return;
+
+	if (events)
+		events_per_loop = (float) events / (float) event_loops;
+
+	printf("Event loops: %lu, events %lu, events per loop %.2f\n", event_loops,
+		events, events_per_loop);
+
+	for (i = 0; i < MAX_CONNS; i++) {
+		struct conn *c = &conns[i];
+
+		__show_stats(c);
+	}
+	stats_shown = 1;
+}
+
+static void sig_int(int __attribute__((__unused__)) sig)
+{
+	printf("\n");
+	show_stats();
+	exit(1);
+}
+
+/*
+ * Special cased for SQPOLL only, as we don't control when SQEs are consumed if
+ * that is used. Hence we may need to wait for the SQPOLL thread to keep up
+ * until we can get a new SQE. All other cases will break immediately, with a
+ * fresh SQE.
+ *
+ * If we grossly undersized our SQ ring, getting a NULL sqe can happen even
+ * for the !SQPOLL case if we're handling a lot of CQEs in our event loop
+ * and multishot isn't used. We can do io_uring_submit() to flush what we
+ * have here. Only caveat here is that if linked requests are used, SQEs
+ * would need to be allocated upfront as a link chain is only valid within
+ * a single submission cycle.
+ */
+static struct io_uring_sqe *get_sqe(struct io_uring *ring)
+{
+	struct io_uring_sqe *sqe;
+
+	do {
+		sqe = io_uring_get_sqe(ring);
+		if (sqe)
+			break;
+		if (!sqpoll)
+			io_uring_submit(ring);	/* flush queued SQEs to make room */
+		else
+			io_uring_sqring_wait(ring);
+	} while (1);
+
+	return sqe;	/* never NULL, the loop only exits with a valid SQE */
+}
+
+/*
+ * See __encode_userdata() for how we encode sqe->user_data, which is passed
+ * back as cqe->user_data at completion time.
+ */
+static void encode_userdata(struct io_uring_sqe *sqe, struct conn *c, int op,
+			    int bid, int fd)
+{
+	__encode_userdata(sqe, c->tid, op, bid, fd);
+}
+
+static void __submit_receive(struct io_uring *ring, struct conn *c,
+			     struct conn_dir *cd, int fd)
+{
+	struct conn_buf_ring *cbr = &c->in_br;
+	struct io_uring_sqe *sqe;
+
+	vlog("%d: submit receive fd=%d\n", c->tid, fd);
+
+	assert(!cd->pending_recv);
+	cd->pending_recv = 1;
+
+	/*
+	 * For both recv and multishot receive, we use the ring provided
+	 * buffers. These are handed to the application ahead of time, and
+	 * are consumed when a receive triggers. Note that the address and
+	 * length of the receive are set to NULL/0, and we assign the
+	 * sqe->buf_group to tell the kernel which buffer group ID to pick
+	 * a buffer from. Finally, IOSQE_BUFFER_SELECT is set to tell the
+	 * kernel that we want a buffer picked for this request, we are not
+	 * passing one in with the request.
+	 */
+	sqe = get_sqe(ring);
+	if (rcv_msg) {
+		struct io_msg *imsg = &cd->io_rcv_msg;
+		struct msghdr *msg = &imsg->msg;
+
+		memset(msg, 0, sizeof(*msg));
+		msg->msg_iov = msg_vec(imsg)->iov;
+		msg->msg_iovlen = msg_vec(imsg)->iov_len;
+
+		if (recv_mshot) {
+			cd->rcv_mshot++;
+			io_uring_prep_recvmsg_multishot(sqe, fd, &imsg->msg, 0);
+		} else {
+			io_uring_prep_recvmsg(sqe, fd, &imsg->msg, 0);
+		}
+	} else {
+		if (recv_mshot) {
+			cd->rcv_mshot++;
+			io_uring_prep_recv_multishot(sqe, fd, NULL, 0, 0);
+		} else {
+			io_uring_prep_recv(sqe, fd, NULL, 0, 0);
+		}
+	}
+	encode_userdata(sqe, c, __RECV, 0, fd);	/* __RECV is used for the recvmsg variants too */
+	sqe->buf_group = cbr->bgid;
+	sqe->flags |= IOSQE_BUFFER_SELECT;
+	if (fixed_files)
+		sqe->flags |= IOSQE_FIXED_FILE;
+	if (rcv_bundle)
+		sqe->ioprio |= IORING_RECVSEND_BUNDLE;
+}
+
+/*
+ * One directional just arms receive on our in_fd
+ */
+static void submit_receive(struct io_uring *ring, struct conn *c)
+{
+	__submit_receive(ring, c, &c->cd[0], c->in_fd);
+}
+
+/*
+ * Bi-directional arms receive on both in and out fd
+ */
+static void submit_bidi_receive(struct io_uring *ring, struct conn *c)
+{
+	__submit_receive(ring, c, &c->cd[0], c->in_fd);
+	__submit_receive(ring, c, &c->cd[1], c->out_fd);
+}
+
+/*
+ * We hit -ENOBUFS, which means that we ran out of buffers in our current
+ * provided buffer group. This can happen if there's an imbalance between the
+ * receives coming in and the sends being processed, particularly with multishot
+ * receive as they can trigger very quickly. If this happens, defer arming a
+ * new receive until we've replenished half of the buffer pool by processing
+ * pending sends.
+ */
+static void recv_enobufs(struct io_uring *ring, struct conn *c,
+			 struct conn_dir *cd, int fd)
+{
+	vlog("%d: enobufs hit\n", c->tid);
+
+	cd->rcv_enobufs++;
+
+	/*
+	 * If we're a sink, mark rcv as rearm. If we're not, then mark us as
+	 * needing a rearm for receive and send. The completing send will
+	 * kick the recv rearm.
+	 */
+	if (!is_sink) {
+		int do_recv_arm = 1;
+
+		if (!cd->pending_send)
+			do_recv_arm = !prep_next_send(ring, c, cd, fd);	/* re-arm now only if no send was queued */
+		if (do_recv_arm)
+			__submit_receive(ring, c, &c->cd[0], c->in_fd);	/* NOTE(review): always re-arms cd[0]/in_fd, even when 'cd' is cd[1] - confirm for bidi */
+	} else {
+		__submit_receive(ring, c, &c->cd[0], c->in_fd);
+	}
+}
+
+/*
+ * Kill this socket - submit a shutdown and link a close to it. We don't
+ * care about shutdown status, so mark it as not needing to post a CQE unless
+ * it fails.
+ */
+static void queue_shutdown_close(struct io_uring *ring, struct conn *c, int fd)
+{
+	struct io_uring_sqe *sqe1, *sqe2;
+
+	/*
+	 * On the off chance that we run out of SQEs after the first one,
+	 * grab two upfront. This is to prevent our link not working if
+	 * get_sqe() ends up doing submissions to free up an SQE, as links
+	 * are not valid across separate submissions.
+	 */
+	sqe1 = get_sqe(ring);
+	sqe2 = get_sqe(ring);
+
+	io_uring_prep_shutdown(sqe1, fd, SHUT_RDWR);
+	if (fixed_files)
+		sqe1->flags |= IOSQE_FIXED_FILE;
+	sqe1->flags |= IOSQE_IO_LINK | IOSQE_CQE_SKIP_SUCCESS;
+	encode_userdata(sqe1, c, __SHUTDOWN, 0, fd);
+
+	if (fixed_files)
+		io_uring_prep_close_direct(sqe2, fd);
+	else
+		io_uring_prep_close(sqe2, fd);
+	encode_userdata(sqe2, c, __CLOSE, 0, fd);
+}
+
+/*
+ * This connection is going away, queue a cancel for any pending recv, for
+ * example, we have pending for this ring. For completeness, we issue a cancel
+ * for any request we have pending for both in_fd and out_fd.
+ */
+static void queue_cancel(struct io_uring *ring, struct conn *c)
+{
+	struct io_uring_sqe *sqe;
+	int flags = 0;
+
+	if (fixed_files)
+		flags |= IORING_ASYNC_CANCEL_FD_FIXED;
+
+	sqe = get_sqe(ring);
+	io_uring_prep_cancel_fd(sqe, c->in_fd, flags);
+	encode_userdata(sqe, c, __CANCEL, 0, c->in_fd);
+	c->pending_cancels++;
+
+	/* out_fd is -1 if no outgoing socket was ever set up */
+	if (c->out_fd != -1) {
+		sqe = get_sqe(ring);
+		io_uring_prep_cancel_fd(sqe, c->out_fd, flags);
+		encode_userdata(sqe, c, __CANCEL, 0, c->out_fd);
+		c->pending_cancels++;
+	}
+
+	io_uring_submit(ring);
+}
+
+/* Number of directions (0, 1 or 2) that have a shutdown pending. */
+static int pending_shutdown(struct conn *c)
+{
+	return c->cd[0].pending_shutdown + c->cd[1].pending_shutdown;
+}
+
+/*
+ * Returns true once a shutdown is pending and everything received has also
+ * been sent on: byte counts must match for a one directional proxy, receive
+ * and send counts must match in both directions for a bi-directional one.
+ * A sink never sends, so it can shut down right away.
+ */
+static bool should_shutdown(struct conn *c)
+{
+	if (!pending_shutdown(c))
+		return false;
+	if (is_sink)
+		return true;
+	if (!bidi)
+		return c->cd[0].in_bytes == c->cd[1].out_bytes;
+
+	/* both directions are checked explicitly, no loop needed */
+	if (c->cd[0].rcv != c->cd[1].snd)
+		return false;
+	if (c->cd[1].rcv != c->cd[0].snd)
+		return false;
+
+	return true;
+}
+
+/*
+ * Close this connection - send a ring message to the connection with intent
+ * to stop. When the client gets the message, it will initiate the stop.
+ */
+static void __close_conn(struct io_uring *ring, struct conn *c)
+{
+	struct io_uring_sqe *sqe;
+	uint64_t user_data;
+
+	printf("Client %d: queueing stop\n", c->tid);
+
+	user_data = __raw_encode(c->tid, __STOP, 0, 0);
+	/*
+	 * io_uring_get_sqe() returns NULL when the SQ ring is full, use
+	 * get_sqe() which flushes/waits until an SQE is available.
+	 */
+	sqe = get_sqe(ring);
+	io_uring_prep_msg_ring(sqe, c->ring.ring_fd, 0, user_data, 0);
+	encode_userdata(sqe, c, __NOP, 0, 0);
+	io_uring_submit(ring);
+}
+
+/*
+ * Mark this direction as shutting down. The connection wide shutdown flag
+ * and end time are only set once no send is pending for this direction.
+ */
+static void close_cd(struct conn *c, struct conn_dir *cd)
+{
+	cd->pending_shutdown = 1;
+
+	if (cd->pending_send)
+		return;
+
+	if (!(c->flags & CONN_F_PENDING_SHUTDOWN)) {
+		gettimeofday(&c->end_time, NULL);
+		c->flags |= CONN_F_PENDING_SHUTDOWN | CONN_F_END_TIME;
+	}
+}
+
+/*
+ * We're done with this buffer, add it back to our pool so the kernel is
+ * free to use it again.
+ */
+static int replenish_buffer(struct conn_buf_ring *cbr, int bid, int offset)
+{
+	void *this_buf;
+
+	/* validate the buffer ID before it is used to compute an address */
+	assert(bid < nr_bufs);
+
+	this_buf = cbr->buf + bid * buf_size;
+	io_uring_buf_ring_add(cbr->br, this_buf, buf_size, bid, br_mask, offset);
+	return buf_size;
+}
+
+/*
+ * Iterate buffers from '*bid' and with a total size of 'bytes' and add them
+ * back to our receive ring so they can be reused for new receives. '*bid' is
+ * advanced past the returned buffers, the number of buffers is returned.
+ */
+static int replenish_buffers(struct conn *c, int *bid, int bytes)
+{
+	struct conn_buf_ring *cbr = &c->in_br;
+	int nr_packets = 0;
+
+	while (bytes) {
+		int this_len = replenish_buffer(cbr, *bid, nr_packets);
+
+		/* the last buffer may only have been partially filled */
+		if (this_len > bytes)
+			this_len = bytes;
+		bytes -= this_len;
+
+		*bid = (*bid + 1) & (nr_bufs - 1);
+		nr_packets++;
+	}
+
+	io_uring_buf_ring_advance(cbr->br, nr_packets);
+	return nr_packets;
+}
+
+static void free_mvec(struct msg_vec *mvec)
+{
+	free(mvec->iov);
+	mvec->iov = NULL;
+}
+
+/*
+ * Start out with room for a single iovec, send_append_vec() grows it as
+ * needed. Running out of memory here is fatal, the vec is indexed later
+ * without any further checks.
+ */
+static void init_mvec(struct msg_vec *mvec)
+{
+	memset(mvec, 0, sizeof(*mvec));
+	mvec->iov = malloc(sizeof(struct iovec));
+	if (!mvec->iov) {
+		perror("malloc");
+		exit(1);
+	}
+	mvec->vec_size = 1;
+}
+
+/* Two vecs for send (in flight + being prepared), a single one for recv. */
+static void init_msgs(struct conn_dir *cd)
+{
+	memset(&cd->io_snd_msg, 0, sizeof(cd->io_snd_msg));
+	memset(&cd->io_rcv_msg, 0, sizeof(cd->io_rcv_msg));
+	init_mvec(&cd->io_snd_msg.vecs[0]);
+	init_mvec(&cd->io_snd_msg.vecs[1]);
+	init_mvec(&cd->io_rcv_msg.vecs[0]);
+}
+
+static void free_msgs(struct conn_dir *cd)
+{
+	free_mvec(&cd->io_snd_msg.vecs[0]);
+	free_mvec(&cd->io_snd_msg.vecs[1]);
+	free_mvec(&cd->io_rcv_msg.vecs[0]);
+}
+
+/*
+ * Multishot accept completion triggered. If we're acting as a sink, we're
+ * good to go. Just issue a receive for that case. If we're acting as a proxy,
+ * then start opening a socket that we can use to connect to the other end.
+ */
+static int handle_accept(struct io_uring *ring, struct io_uring_cqe *cqe)
+{
+	struct conn *c;
+	int i, ret;
+
+	if (nr_conns == MAX_CONNS) {
+		fprintf(stderr, "max clients reached %d\n", nr_conns);
+		return 1;
+	}
+
+	/* main thread handles this, which is obviously serialized */
+	c = &conns[nr_conns];
+	c->tid = nr_conns++;
+	c->in_fd = -1;
+	c->out_fd = -1;
+
+	for (i = 0; i < 2; i++) {
+		struct conn_dir *cd = &c->cd[i];
+
+		cd->index = i;
+		cd->snd_next_bid = -1;
+		cd->rcv_next_bid = -1;
+		if (ext_stat) {
+			cd->rcv_bucket = calloc(nr_bufs + 1, sizeof(int));
+			cd->snd_bucket = calloc(nr_bufs + 1, sizeof(int));
+		}
+		init_msgs(cd);
+	}
+
+	printf("New client: id=%d, in=%d\n", c->tid, c->in_fd);
+	gettimeofday(&c->start_time, NULL);
+
+	pthread_barrier_init(&c->startup_barrier, NULL, 2);
+	ret = pthread_create(&c->thread, NULL, thread_main, c);
+	if (ret) {
+		/*
+		 * Without a thread nobody will ever show up at the barrier,
+		 * waiting on it would hang the accept path forever.
+		 */
+		fprintf(stderr, "pthread_create: %s\n", strerror(ret));
+		pthread_barrier_destroy(&c->startup_barrier);
+		return 1;
+	}
+
+	/*
+	 * Wait for thread to have its ring setup, then either assign the fd
+	 * if it's non-fixed, or pass the fixed one
+	 */
+	pthread_barrier_wait(&c->startup_barrier);
+	if (!fixed_files) {
+		c->in_fd = cqe->res;
+	} else {
+		struct io_uring_sqe *sqe;
+		uint64_t user_data;
+
+		/*
+		 * Ring has just been setup, we'll use index 0 as the descriptor
+		 * value.
+		 */
+		user_data = __raw_encode(c->tid, __FD_PASS, 0, 0);
+		/* get_sqe() never returns NULL, unlike io_uring_get_sqe() */
+		sqe = get_sqe(ring);
+		io_uring_prep_msg_ring_fd(sqe, c->ring.ring_fd, cqe->res, 0,
+					  user_data, 0);
+		encode_userdata(sqe, c, __NOP, 0, cqe->res);
+	}
+
+	return 0;
+}
+
+/*
+ * Our socket request completed, issue a connect request to the other end.
+ */
+static int handle_sock(struct io_uring *ring, struct io_uring_cqe *cqe)
+{
+	struct conn *c = cqe_to_conn(cqe);
+	struct io_uring_sqe *sqe;
+	int ret;
+
+	vlog("%d: sock: res=%d\n", c->tid, cqe->res);
+
+	c->out_fd = cqe->res;	/* the completed socket request yields the outgoing fd */
+
+	if (ipv6) {
+		memset(&c->addr6, 0, sizeof(c->addr6));
+		c->addr6.sin6_family = AF_INET6;
+		c->addr6.sin6_port = htons(send_port);
+		ret = inet_pton(AF_INET6, host, &c->addr6.sin6_addr);
+	} else {
+		memset(&c->addr, 0, sizeof(c->addr));
+		c->addr.sin_family = AF_INET;
+		c->addr.sin_port = htons(send_port);
+		ret = inet_pton(AF_INET, host, &c->addr.sin_addr);
+	}
+	if (ret <= 0) {	/* 0: 'host' could not be parsed, < 0: inet_pton() failed */
+		if (!ret)
+			fprintf(stderr, "host not in right format\n");
+		else
+			perror("inet_pton");
+		return 1;
+	}
+
+	sqe = get_sqe(ring);
+	if (ipv6) {
+		io_uring_prep_connect(sqe, c->out_fd,
+					(struct sockaddr *) &c->addr6,
+					sizeof(c->addr6));
+	} else {
+		io_uring_prep_connect(sqe, c->out_fd,
+					(struct sockaddr *) &c->addr,
+					sizeof(c->addr));
+	}
+	encode_userdata(sqe, c, __CONNECT, 0, c->out_fd);
+	if (fixed_files)
+		sqe->flags |= IOSQE_FIXED_FILE;
+	return 0;
+}
+
+/*
+ * Connection to the other end is done, submit a receive to start receiving
+ * data. If we're a bidirectional proxy, issue a receive on both ends. If not,
+ * then just a single recv will do.
+ */
+static int handle_connect(struct io_uring *ring, struct io_uring_cqe *cqe)
+{
+	struct conn *c = cqe_to_conn(cqe);
+
+	pthread_mutex_lock(&thread_lock);	/* open_conns is updated under thread_lock */
+	open_conns++;
+	pthread_mutex_unlock(&thread_lock);
+
+	if (bidi)
+		submit_bidi_receive(ring, c);
+	else
+		submit_receive(ring, c);
+
+	return 0;
+}
+
+/*
+ * Append new segment to our currently active msg_vec. This will be submitted
+ * as a sendmsg (with all of it), or as separate sends, later. If we're using
+ * send_ring, then we won't hit this path. Instead, outgoing buffers are
+ * added directly to our outgoing send buffer ring.
+ */
+static void send_append_vec(struct conn_dir *cd, void *data, int len)
+{
+	struct msg_vec *mvec = snd_msg_vec(cd);
+
+	if (mvec->iov_len == mvec->vec_size) {
+		/* double the capacity, starting from 1 for an empty vec */
+		int new_size = mvec->vec_size ? mvec->vec_size << 1 : 1;
+		struct iovec *iov;
+
+		/*
+		 * Keep the old pointer until realloc() has succeeded, the
+		 * queued segments would otherwise be lost and the store
+		 * below would go through a NULL pointer.
+		 */
+		iov = realloc(mvec->iov, new_size * sizeof(struct iovec));
+		if (!iov) {
+			perror("realloc");
+			exit(1);
+		}
+		mvec->iov = iov;
+		mvec->vec_size = new_size;
+	}
+
+	mvec->iov[mvec->iov_len].iov_base = data;
+	mvec->iov[mvec->iov_len].iov_len = len;
+	mvec->iov_len++;
+}
+
+/*
+ * Queue a send based on the data received in this cqe, which came from
+ * a completed receive operation.
+ */
+static void send_append(struct conn *c, struct conn_dir *cd, void *data,
+			int bid, int len)
+{
+	vlog("%d: send %d (%p, bid %d)\n", c->tid, len, data, bid);
+
+	assert(bid < nr_bufs);
+
+	/* if using provided buffers for send, add it upfront */
+	if (send_ring) {
+		struct conn_buf_ring *cbr = &c->out_br;
+
+		io_uring_buf_ring_add(cbr->br, data, len, bid, br_mask, 0);
+		io_uring_buf_ring_advance(cbr->br, 1);
+	} else {
+		send_append_vec(cd, data, len);
+	}
+}
+
+/*
+ * For non recvmsg && multishot, a zero receive marks the end. For recvmsg
+ * with multishot, we always get the header regardless. Hence a "zero receive"
+ * is the size of the header.
+ *
+ * Returns 1 if 'res' marks the end of the stream, 0 otherwise.
+ */
+static int recv_done_res(int res)
+{
+	if (!res)
+		return 1;
+	if (rcv_msg && recv_mshot && res == sizeof(struct io_uring_recvmsg_out))
+		return 1;
+	return 0;
+}
+
+/*
+ * Any receive that isn't recvmsg with multishot can be handled the same way.
+ * Iterate from '*bid' and 'in_bytes' in total, and append the data to the
+ * outgoing queue.
+ */ +static int recv_bids(struct conn *c, struct conn_dir *cd, int *bid, int in_bytes) +{ + struct conn_buf_ring *cbr = &c->out_br; + struct conn_buf_ring *in_cbr = &c->in_br; + struct io_uring_buf *buf; + int nr_packets = 0; + + while (in_bytes) { + int this_bytes; + void *data; + + buf = &in_cbr->br->bufs[*bid]; + data = (void *) (unsigned long) buf->addr; + this_bytes = buf->len; + if (this_bytes > in_bytes) + this_bytes = in_bytes; + + in_bytes -= this_bytes; + + if (send_ring) + io_uring_buf_ring_add(cbr->br, data, this_bytes, *bid, + br_mask, nr_packets); + else + send_append(c, cd, data, *bid, this_bytes); + + *bid = (*bid + 1) & (nr_bufs - 1); + nr_packets++; + } + + if (send_ring) + io_uring_buf_ring_advance(cbr->br, nr_packets); + + return nr_packets; +} + +/* + * Special handling of recvmsg with multishot + */ +static int recv_mshot_msg(struct conn *c, struct conn_dir *cd, int *bid, + int in_bytes) +{ + struct conn_buf_ring *cbr = &c->out_br; + struct conn_buf_ring *in_cbr = &c->in_br; + struct io_uring_buf *buf; + int nr_packets = 0; + + while (in_bytes) { + struct io_uring_recvmsg_out *pdu; + int this_bytes; + void *data; + + buf = &in_cbr->br->bufs[*bid]; + + /* + * multishot recvmsg puts a header in front of the data - we + * have to take that into account for the send setup, and + * adjust the actual data read to not take this metadata into + * account. For this use case, namelen and controllen will not + * be set. If they were, they would need to be factored in too. 
+ */ + buf->len -= sizeof(struct io_uring_recvmsg_out); + in_bytes -= sizeof(struct io_uring_recvmsg_out); + + pdu = (void *) (unsigned long) buf->addr; + vlog("pdu namelen %d, controllen %d, payload %d flags %x\n", + pdu->namelen, pdu->controllen, pdu->payloadlen, + pdu->flags); + data = (void *) (pdu + 1); + + this_bytes = pdu->payloadlen; + if (this_bytes > in_bytes) + this_bytes = in_bytes; + + in_bytes -= this_bytes; + + if (send_ring) + io_uring_buf_ring_add(cbr->br, data, this_bytes, *bid, + br_mask, nr_packets); + else + send_append(c, cd, data, *bid, this_bytes); + + *bid = (*bid + 1) & (nr_bufs - 1); + nr_packets++; + } + + if (send_ring) + io_uring_buf_ring_advance(cbr->br, nr_packets); + + return nr_packets; +} + +static int __handle_recv(struct io_uring *ring, struct conn *c, + struct conn_dir *cd, struct io_uring_cqe *cqe) +{ + struct conn_dir *ocd = &c->cd[!cd->index]; + int bid, nr_packets; + + /* + * Not having a buffer attached should only happen if we get a zero + * sized receive, because the other end closed the connection. It + * cannot happen otherwise, as all our receives are using provided + * buffers and hence it's not possible to return a CQE with a non-zero + * result and not have a buffer attached. + */ + if (!(cqe->flags & IORING_CQE_F_BUFFER)) { + cd->pending_recv = 0; + + if (!recv_done_res(cqe->res)) { + fprintf(stderr, "no buffer assigned, res=%d\n", cqe->res); + return 1; + } +start_close: + prep_next_send(ring, c, ocd, other_dir_fd(c, cqe_to_fd(cqe))); + close_cd(c, cd); + return 0; + } + + if (cqe->res && cqe->res < buf_size) + cd->rcv_shrt++; + + bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT; + + /* + * BIDI will use the same buffer pool and do receive on both CDs, + * so can't reliably check. TODO. 
+ */ + if (!bidi && cd->rcv_next_bid != -1 && bid != cd->rcv_next_bid) { + fprintf(stderr, "recv bid %d, wanted %d\n", bid, cd->rcv_next_bid); + goto start_close; + } + + vlog("%d: recv: bid=%d, res=%d, cflags=%x\n", c->tid, bid, cqe->res, cqe->flags); + /* + * If we're a sink, we're done here. Just replenish the buffer back + * to the pool. For proxy mode, we will send the data to the other + * end and the buffer will be replenished once the send is done with + * it. + */ + if (is_sink) + nr_packets = replenish_buffers(c, &bid, cqe->res); + else if (rcv_msg && recv_mshot) + nr_packets = recv_mshot_msg(c, ocd, &bid, cqe->res); + else + nr_packets = recv_bids(c, ocd, &bid, cqe->res); + + if (cd->rcv_bucket) + cd->rcv_bucket[nr_packets]++; + + if (!is_sink) { + ocd->out_buffers += nr_packets; + assert(ocd->out_buffers <= nr_bufs); + } + + cd->rcv++; + cd->rcv_next_bid = bid; + + /* + * If IORING_CQE_F_MORE isn't set, then this is either a normal recv + * that needs rearming, or it's a multishot that won't post any further + * completions. Setup a new one for these cases. + */ + if (!(cqe->flags & IORING_CQE_F_MORE)) { + cd->pending_recv = 0; + if (recv_done_res(cqe->res)) + goto start_close; + if (is_sink) + __submit_receive(ring, c, &c->cd[0], c->in_fd); + } + + /* + * Submit a send if we won't get anymore notifications from this + * recv, or if we have nr_bufs / 2 queued up. If BIDI mode, send + * every buffer. We assume this is interactive mode, and hence don't + * delay anything. 
+ */ + if (((!ocd->pending_send && (bidi || (ocd->out_buffers >= nr_bufs / 2))) || + !(cqe->flags & IORING_CQE_F_MORE)) && !is_sink) + prep_next_send(ring, c, ocd, other_dir_fd(c, cqe_to_fd(cqe))); + + if (!recv_done_res(cqe->res)) + cd->in_bytes += cqe->res; + return 0; +} + +static int handle_recv(struct io_uring *ring, struct io_uring_cqe *cqe) +{ + struct conn *c = cqe_to_conn(cqe); + struct conn_dir *cd = cqe_to_conn_dir(c, cqe); + + return __handle_recv(ring, c, cd, cqe); +} + +static int recv_error(struct error_handler *err, struct io_uring *ring, + struct io_uring_cqe *cqe) +{ + struct conn *c = cqe_to_conn(cqe); + struct conn_dir *cd = cqe_to_conn_dir(c, cqe); + + cd->pending_recv = 0; + + if (cqe->res != -ENOBUFS) + return default_error(err, ring, cqe); + + recv_enobufs(ring, c, cd, other_dir_fd(c, cqe_to_fd(cqe))); + return 0; +} + +static void submit_send(struct io_uring *ring, struct conn *c, + struct conn_dir *cd, int fd, void *data, int len, + int bid, int flags) +{ + struct io_uring_sqe *sqe; + int bgid = c->out_br.bgid; + + if (cd->pending_send) + return; + cd->pending_send = 1; + + flags |= MSG_WAITALL | MSG_NOSIGNAL; + + sqe = get_sqe(ring); + if (snd_msg) { + struct io_msg *imsg = &cd->io_snd_msg; + + if (snd_zc) { + io_uring_prep_sendmsg_zc(sqe, fd, &imsg->msg, flags); + cd->snd_notif++; + } else { + io_uring_prep_sendmsg(sqe, fd, &imsg->msg, flags); + } + } else if (send_ring) { + io_uring_prep_send(sqe, fd, NULL, 0, flags); + } else if (!snd_zc) { + io_uring_prep_send(sqe, fd, data, len, flags); + } else { + io_uring_prep_send_zc(sqe, fd, data, len, flags, 0); + sqe->ioprio |= IORING_RECVSEND_FIXED_BUF; + sqe->buf_index = bid; + cd->snd_notif++; + } + encode_userdata(sqe, c, __SEND, bid, fd); + if (fixed_files) + sqe->flags |= IOSQE_FIXED_FILE; + if (send_ring) { + sqe->flags |= IOSQE_BUFFER_SELECT; + sqe->buf_group = bgid; + } + if (snd_bundle) { + sqe->ioprio |= IORING_RECVSEND_BUNDLE; + cd->snd_mshot++; + } else if (send_ring) + 
cd->snd_mshot++; +} + +/* + * Prepare the next send request, if we need to. If one is already pending, + * or if we're a sink and we don't need to do sends, then there's nothing + * to do. + * + * Return 1 if another send completion is expected, 0 if not. + */ +static int prep_next_send(struct io_uring *ring, struct conn *c, + struct conn_dir *cd, int fd) +{ + int bid; + + if (cd->pending_send || is_sink) + return 0; + if (!cd->out_buffers) + return 0; + + bid = cd->snd_next_bid; + if (bid == -1) + bid = 0; + + if (send_ring) { + /* + * send_ring mode is easy, there's nothing to do but submit + * our next send request. That will empty the entire outgoing + * queue. + */ + submit_send(ring, c, cd, fd, NULL, 0, bid, 0); + return 1; + } else if (snd_msg) { + /* + * For sendmsg mode, submit our currently prepared iovec, if + * we have one, and swap our iovecs so that any further + * receives will start preparing that one. + */ + struct io_msg *imsg = &cd->io_snd_msg; + + if (!msg_vec(imsg)->iov_len) + return 0; + imsg->msg.msg_iov = msg_vec(imsg)->iov; + imsg->msg.msg_iovlen = msg_vec(imsg)->iov_len; + msg_vec(imsg)->iov_len = 0; + imsg->vec_index = !imsg->vec_index; + submit_send(ring, c, cd, fd, NULL, 0, bid, 0); + return 1; + } else { + /* + * send without send_ring - submit the next available vec, + * if any. If this vec is the last one in the current series, + * then swap to the next vec. We flag each send with MSG_MORE, + * unless this is the last part of the current vec. + */ + struct io_msg *imsg = &cd->io_snd_msg; + struct msg_vec *mvec = msg_vec(imsg); + int flags = !snd_zc ? 
MSG_MORE : 0; + struct iovec *iov; + + if (mvec->iov_len == mvec->cur_iov) + return 0; + imsg->msg.msg_iov = msg_vec(imsg)->iov; + iov = &mvec->iov[mvec->cur_iov]; + mvec->cur_iov++; + if (mvec->cur_iov == mvec->iov_len) { + mvec->iov_len = 0; + mvec->cur_iov = 0; + imsg->vec_index = !imsg->vec_index; + flags = 0; + } + submit_send(ring, c, cd, fd, iov->iov_base, iov->iov_len, bid, flags); + return 1; + } +} + +/* + * Handling a send with an outgoing send ring. Get the buffers from the + * receive side, and add them to the ingoing buffer ring again. + */ +static int handle_send_ring(struct conn *c, struct conn_dir *cd, + int bid, int bytes) +{ + struct conn_buf_ring *in_cbr = &c->in_br; + struct conn_buf_ring *out_cbr = &c->out_br; + int i = 0; + + while (bytes) { + struct io_uring_buf *buf = &out_cbr->br->bufs[bid]; + int this_bytes; + void *this_buf; + + this_bytes = buf->len; + if (this_bytes > bytes) + this_bytes = bytes; + + cd->out_bytes += this_bytes; + + vlog("%d: send: bid=%d, len=%d\n", c->tid, bid, this_bytes); + + this_buf = in_cbr->buf + bid * buf_size; + io_uring_buf_ring_add(in_cbr->br, this_buf, buf_size, bid, br_mask, i); + /* + * Find the provided buffer that the receive consumed, and + * which we then used for the send, and add it back to the + * pool so it can get picked by another receive. Once the send + * is done, we're done with it. + */ + bid = (bid + 1) & (nr_bufs - 1); + bytes -= this_bytes; + i++; + } + cd->snd_next_bid = bid; + io_uring_buf_ring_advance(in_cbr->br, i); + + if (pending_shutdown(c)) + close_cd(c, cd); + + return i; +} + +/* + * sendmsg, or send without a ring. Just add buffers back to the ingoing + * ring for receives. 
+ */ +static int handle_send_buf(struct conn *c, struct conn_dir *cd, int bid, + int bytes) +{ + struct conn_buf_ring *in_cbr = &c->in_br; + int i = 0; + + while (bytes) { + struct io_uring_buf *buf = &in_cbr->br->bufs[bid]; + int this_bytes; + + this_bytes = bytes; + if (this_bytes > buf->len) + this_bytes = buf->len; + + vlog("%d: send: bid=%d, len=%d\n", c->tid, bid, this_bytes); + + cd->out_bytes += this_bytes; + /* each recvmsg mshot package has this overhead */ + if (rcv_msg && recv_mshot) + cd->out_bytes += sizeof(struct io_uring_recvmsg_out); + replenish_buffer(in_cbr, bid, i); + bid = (bid + 1) & (nr_bufs - 1); + bytes -= this_bytes; + i++; + } + io_uring_buf_ring_advance(in_cbr->br, i); + cd->snd_next_bid = bid; + return i; +} + +static int __handle_send(struct io_uring *ring, struct conn *c, + struct conn_dir *cd, struct io_uring_cqe *cqe) +{ + struct conn_dir *ocd; + int bid, nr_packets; + + if (send_ring) { + if (!(cqe->flags & IORING_CQE_F_BUFFER)) { + fprintf(stderr, "no buffer in send?! %d\n", cqe->res); + return 1; + } + bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT; + } else { + bid = cqe_to_bid(cqe); + } + + /* + * CQE notifications only happen with send/sendmsg zerocopy. They + * tell us that the data has been acked, and that hence the buffer + * is now free to reuse. Waiting on an ACK for each packet will slow + * us down tremendously, so do all of our sends and then wait for + * the ACKs to come in. They tend to come in bundles anyway. Once + * all acks are done (cd->snd_notif == 0), then fire off the next + * receive. + */ + if (cqe->flags & IORING_CQE_F_NOTIF) { + cd->snd_notif--; + } else { + if (cqe->res && cqe->res < buf_size) + cd->snd_shrt++; + + /* + * BIDI will use the same buffer pool and do sends on both CDs, + * so can't reliably check. TODO. 
+ */ + if (!bidi && send_ring && cd->snd_next_bid != -1 && + bid != cd->snd_next_bid) { + fprintf(stderr, "send bid %d, wanted %d at %lu\n", bid, + cd->snd_next_bid, cd->out_bytes); + goto out_close; + } + + assert(bid <= nr_bufs); + + vlog("send: got %d, %lu\n", cqe->res, cd->out_bytes); + + if (send_ring) + nr_packets = handle_send_ring(c, cd, bid, cqe->res); + else + nr_packets = handle_send_buf(c, cd, bid, cqe->res); + + if (cd->snd_bucket) + cd->snd_bucket[nr_packets]++; + + cd->out_buffers -= nr_packets; + assert(cd->out_buffers >= 0); + + cd->snd++; + } + + if (!(cqe->flags & IORING_CQE_F_MORE)) { + int do_recv_arm; + + cd->pending_send = 0; + + /* + * send done - see if the current vec has data to submit, and + * do so if it does. if it doesn't have data yet, nothing to + * do. + */ + do_recv_arm = !prep_next_send(ring, c, cd, cqe_to_fd(cqe)); + + ocd = &c->cd[!cd->index]; + if (!cd->snd_notif && do_recv_arm && !ocd->pending_recv) { + int fd = other_dir_fd(c, cqe_to_fd(cqe)); + + __submit_receive(ring, c, ocd, fd); + } +out_close: + if (pending_shutdown(c)) + close_cd(c, cd); + } + + vlog("%d: pending sends %d\n", c->tid, cd->pending_send); + return 0; +} + +static int handle_send(struct io_uring *ring, struct io_uring_cqe *cqe) +{ + struct conn *c = cqe_to_conn(cqe); + struct conn_dir *cd = cqe_to_conn_dir(c, cqe); + + return __handle_send(ring, c, cd, cqe); +} + +static int send_error(struct error_handler *err, struct io_uring *ring, + struct io_uring_cqe *cqe) +{ + struct conn *c = cqe_to_conn(cqe); + struct conn_dir *cd = cqe_to_conn_dir(c, cqe); + + cd->pending_send = 0; + + /* res can have high bit set */ + if (cqe->flags & IORING_CQE_F_NOTIF) + return handle_send(ring, cqe); + if (cqe->res != -ENOBUFS) + return default_error(err, ring, cqe); + + cd->snd_enobufs++; + return 0; +} + +/* + * We don't expect to get here, as we marked it with skipping posting a + * CQE if it was successful. 
If it does trigger, that means it failed and + * that our close has not been done. Log the shutdown error and issue a new + * separate close. + */ +static int handle_shutdown(struct io_uring *ring, struct io_uring_cqe *cqe) +{ + struct conn *c = cqe_to_conn(cqe); + struct io_uring_sqe *sqe; + int fd = cqe_to_fd(cqe); + + fprintf(stderr, "Got shutdown notication on fd %d\n", fd); + + if (!cqe->res) + fprintf(stderr, "Unexpected success shutdown CQE\n"); + else if (cqe->res < 0) + fprintf(stderr, "Shutdown got %s\n", strerror(-cqe->res)); + + sqe = get_sqe(ring); + if (fixed_files) + io_uring_prep_close_direct(sqe, fd); + else + io_uring_prep_close(sqe, fd); + encode_userdata(sqe, c, __CLOSE, 0, fd); + return 0; +} + +/* + * Final stage of a connection, the shutdown and close have finished. Mark + * it as disconnected and let the main loop reap it. + */ +static int handle_close(struct io_uring *ring, struct io_uring_cqe *cqe) +{ + struct conn *c = cqe_to_conn(cqe); + int fd = cqe_to_fd(cqe); + + printf("Closed client: id=%d, in_fd=%d, out_fd=%d\n", c->tid, c->in_fd, c->out_fd); + if (fd == c->in_fd) + c->in_fd = -1; + else if (fd == c->out_fd) + c->out_fd = -1; + + if (c->in_fd == -1 && c->out_fd == -1) { + c->flags |= CONN_F_DISCONNECTED; + + pthread_mutex_lock(&thread_lock); + __show_stats(c); + open_conns--; + pthread_mutex_unlock(&thread_lock); + free_buffer_rings(ring, c); + free_msgs(&c->cd[0]); + free_msgs(&c->cd[1]); + free(c->cd[0].rcv_bucket); + free(c->cd[0].snd_bucket); + } + + return 0; +} + +static int handle_cancel(struct io_uring *ring, struct io_uring_cqe *cqe) +{ + struct conn *c = cqe_to_conn(cqe); + int fd = cqe_to_fd(cqe); + + c->pending_cancels--; + + vlog("%d: got cancel fd %d, refs %d\n", c->tid, fd, c->pending_cancels); + + if (!c->pending_cancels) { + queue_shutdown_close(ring, c, c->in_fd); + if (c->out_fd != -1) + queue_shutdown_close(ring, c, c->out_fd); + io_uring_submit(ring); + } + + return 0; +} + +static void open_socket(struct conn 
*c) +{ + if (is_sink) { + pthread_mutex_lock(&thread_lock); + open_conns++; + pthread_mutex_unlock(&thread_lock); + + submit_receive(&c->ring, c); + } else { + struct io_uring_sqe *sqe; + int domain; + + if (ipv6) + domain = AF_INET6; + else + domain = AF_INET; + + /* + * If fixed_files is set, proxy will use fixed files for any new + * file descriptors it instantiates. Fixed files, or fixed + * descriptors, are io_uring private file descriptors. They + * cannot be accessed outside of io_uring. io_uring holds a + * fixed reference to them, which means that we do not need to + * grab per-request references to them. Particularly for + * threaded applications, grabbing and dropping file references + * for each operation can be costly as the file table is shared. + * This generally shows up as fget/fput related overhead in any + * workload profiles. + * + * Fixed descriptors are passed in via the 'fd' field just like + * regular descriptors, and then marked as such by setting the + * IOSQE_FIXED_FILE flag in the sqe->flags field. Some helpers + * do that automatically, like the below, others will need it + * set manually if they don't have a *direct*() helper. + * + * For operations that instantiate them, like the opening of a + * direct socket, the application may either ask the kernel to + * find a free one (as is done below), or the application may + * manage the space itself and pass in an index for a currently + * free slot in the table. If the kernel is asked to allocate a + * free direct descriptor, note that io_uring does not abide by + * the POSIX mandated "lowest free must be returned". It may + * return any free descriptor of its choosing. + */ + sqe = get_sqe(&c->ring); + if (fixed_files) + io_uring_prep_socket_direct_alloc(sqe, domain, SOCK_STREAM, 0, 0); + else + io_uring_prep_socket(sqe, domain, SOCK_STREAM, 0, 0); + encode_userdata(sqe, c, __SOCK, 0, 0); + } +} + +/* + * Start of connection, we got our in descriptor. 
+ */ +static int handle_fd_pass(struct io_uring_cqe *cqe) +{ + struct conn *c = cqe_to_conn(cqe); + int fd = cqe_to_fd(cqe); + + vlog("%d: got fd pass %d\n", c->tid, fd); + c->in_fd = fd; + open_socket(c); + return 0; +} + +static int handle_stop(struct io_uring_cqe *cqe) +{ + struct conn *c = cqe_to_conn(cqe); + + printf("Client %d: queueing shutdown\n", c->tid); + queue_cancel(&c->ring, c); + return 0; +} + +/* + * Called for each CQE that we receive. Decode the request type that it + * came from, and call the appropriate handler. + */ +static int handle_cqe(struct io_uring *ring, struct io_uring_cqe *cqe) +{ + int ret; + + /* + * Unlikely, but there's an error in this CQE. If an error handler + * is defined, call it, and that will deal with it. If no error + * handler is defined, the opcode handler either doesn't care or will + * handle it on its own. + */ + if (cqe->res < 0) { + struct error_handler *err = &error_handlers[cqe_to_op(cqe)]; + + if (err->error_fn) + return err->error_fn(err, ring, cqe); + } + + switch (cqe_to_op(cqe)) { + case __ACCEPT: + ret = handle_accept(ring, cqe); + break; + case __SOCK: + ret = handle_sock(ring, cqe); + break; + case __CONNECT: + ret = handle_connect(ring, cqe); + break; + case __RECV: + case __RECVMSG: + ret = handle_recv(ring, cqe); + break; + case __SEND: + case __SENDMSG: + ret = handle_send(ring, cqe); + break; + case __CANCEL: + ret = handle_cancel(ring, cqe); + break; + case __SHUTDOWN: + ret = handle_shutdown(ring, cqe); + break; + case __CLOSE: + ret = handle_close(ring, cqe); + break; + case __FD_PASS: + ret = handle_fd_pass(cqe); + break; + case __STOP: + ret = handle_stop(cqe); + break; + case __NOP: + ret = 0; + break; + default: + fprintf(stderr, "bad user data %lx\n", (long) cqe->user_data); + return 1; + } + + return ret; +} + +static void house_keeping(struct io_uring *ring) +{ + static unsigned long last_bytes; + unsigned long bytes, elapsed; + struct conn *c; + int i, j; + + vlog("House keeping 
entered\n"); + + bytes = 0; + for (i = 0; i < nr_conns; i++) { + c = &conns[i]; + + for (j = 0; j < 2; j++) { + struct conn_dir *cd = &c->cd[j]; + + bytes += cd->in_bytes + cd->out_bytes; + } + if (c->flags & CONN_F_DISCONNECTED) { + vlog("%d: disconnected\n", i); + + if (!(c->flags & CONN_F_REAPED)) { + void *ret; + + pthread_join(c->thread, &ret); + c->flags |= CONN_F_REAPED; + } + continue; + } + if (c->flags & CONN_F_DISCONNECTING) + continue; + + if (should_shutdown(c)) { + __close_conn(ring, c); + c->flags |= CONN_F_DISCONNECTING; + } + } + + elapsed = mtime_since_now(&last_housekeeping); + if (bytes && elapsed >= 900) { + unsigned long bw; + + bw = (8 * (bytes - last_bytes) / 1000UL) / elapsed; + if (bw) { + if (open_conns) + printf("Bandwidth (threads=%d): %'luMbit\n", open_conns, bw); + gettimeofday(&last_housekeeping, NULL); + last_bytes = bytes; + } + } +} + +/* + * Event loop shared between the parent, and the connections. Could be + * split in two, as they don't handle the same types of events. For the per + * connection loop, 'c' is valid. For the main loop, it's NULL. + */ +static int __event_loop(struct io_uring *ring, struct conn *c) +{ + struct __kernel_timespec active_ts, idle_ts; + int flags; + + idle_ts.tv_sec = 0; + idle_ts.tv_nsec = 100000000LL; + active_ts = idle_ts; + if (wait_usec > 1000000) { + active_ts.tv_sec = wait_usec / 1000000; + wait_usec -= active_ts.tv_sec * 1000000; + } + active_ts.tv_nsec = wait_usec * 1000; + + gettimeofday(&last_housekeeping, NULL); + + flags = 0; + while (1) { + struct __kernel_timespec *ts = &idle_ts; + struct io_uring_cqe *cqe; + unsigned int head; + int ret, i, to_wait; + + /* + * If wait_batch is set higher than 1, then we'll wait on + * that amount of CQEs to be posted each loop. If used with + * DEFER_TASKRUN, this can provide a substantial reduction + * in context switch rate as the task isn't woken until the + * requested number of events can be returned. 
+ * + * Can be used with -t to set a wait_usec timeout as well. + * For example, if an application can deal with 250 usec + * of wait latencies, it can set -w8 -t250 which will cause + * io_uring to return when either 8 events have been received, + * or if 250 usec of waiting has passed. + * + * If we don't have any open connections, wait on just 1 + * always. + */ + to_wait = 1; + if (open_conns && !flags) { + ts = &active_ts; + to_wait = wait_batch; + } + + vlog("Submit and wait for %d\n", to_wait); + ret = io_uring_submit_and_wait_timeout(ring, &cqe, to_wait, ts, NULL); + + if (*ring->cq.koverflow) + printf("overflow %u\n", *ring->cq.koverflow); + if (*ring->sq.kflags & IORING_SQ_CQ_OVERFLOW) + printf("saw overflow\n"); + + vlog("Submit and wait: %d\n", ret); + + i = flags = 0; + io_uring_for_each_cqe(ring, head, cqe) { + if (handle_cqe(ring, cqe)) + return 1; + flags |= cqe_to_conn(cqe)->flags; + ++i; + } + + vlog("Handled %d events\n", i); + + /* + * Advance the CQ ring for seen events when we've processed + * all of them in this loop. This can also be done with + * io_uring_cqe_seen() in each handler above, which just marks + * that single CQE as seen. However, it's more efficient to + * mark a batch as seen when we're done with that batch. + */ + if (i) { + io_uring_cq_advance(ring, i); + events += i; + } + + event_loops++; + if (c) { + if (c->flags & CONN_F_DISCONNECTED) + break; + } else { + house_keeping(ring); + } + } + + return 0; +} + +/* + * Main event loop, Submit our multishot accept request, and then just loop + * around handling incoming connections. + */ +static int parent_loop(struct io_uring *ring, int fd) +{ + struct io_uring_sqe *sqe; + + /* + * proxy provides a way to use either multishot receive or not, but + * for accept, we always use multishot. A multishot accept request + * needs only be armed once, and then it'll trigger a completion and + * post a CQE whenever a new connection is accepted. 
No need to do + * anything else, unless the multishot accept terminates. This happens + * if it encounters an error. Applications should check for + * IORING_CQE_F_MORE in cqe->flags - this tells you if more completions + * are expected from this request or not. Non-multishot never have + * this set, where multishot will always have this set unless an error + * occurs. + */ + sqe = get_sqe(ring); + if (fixed_files) + io_uring_prep_multishot_accept_direct(sqe, fd, NULL, NULL, 0); + else + io_uring_prep_multishot_accept(sqe, fd, NULL, NULL, 0); + __encode_userdata(sqe, 0, __ACCEPT, 0, fd); + + return __event_loop(ring, NULL); +} + +static int init_ring(struct io_uring *ring, int nr_files) +{ + struct io_uring_params params; + int ret; + + /* + * By default, set us up with a big CQ ring. Not strictly needed + * here, but it's very important to never overflow the CQ ring. + * Events will not be dropped if this happens, but it does slow + * the application down in dealing with overflown events. + * + * Set SINGLE_ISSUER, which tells the kernel that only one thread + * is doing IO submissions. This enables certain optimizations in + * the kernel. + */ + memset(&params, 0, sizeof(params)); + params.flags |= IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_CLAMP; + params.flags |= IORING_SETUP_CQSIZE; + params.cq_entries = 1024; + + /* + * If use_huge is set, setup the ring with IORING_SETUP_NO_MMAP. This + * means that the application allocates the memory for the ring, and + * the kernel maps it. The alternative is having the kernel allocate + * the memory, and then liburing will mmap it. But we can't really + * support huge pages that way. If this fails, then ensure that the + * system has huge pages set aside upfront. + */ + if (use_huge) + params.flags |= IORING_SETUP_NO_MMAP; + + /* + * DEFER_TASKRUN decouples async event reaping and retrying from + * regular system calls. If this isn't set, then io_uring uses + * normal task_work for this. 
task_work is always being run on any + * exit to userspace. Real applications do more than just call IO + * related system calls, and hence we can be running this work way + * too often. Using DEFER_TASKRUN defers any task_work running to + * when the application enters the kernel anyway to wait on new + * events. It's generally the preferred and recommended way to setup + * a ring. + */ + if (defer_tw) { + params.flags |= IORING_SETUP_DEFER_TASKRUN; + sqpoll = 0; + } + + /* + * SQPOLL offloads any request submission and retry operations to a + * dedicated thread. This enables an application to do IO without + * ever having to enter the kernel itself. The SQPOLL thread will + * stay busy as long as there's work to do, and go to sleep if + * sq_thread_idle msecs have passed. If it's running, submitting new + * IO just needs to make them visible to the SQPOLL thread, it needs + * not enter the kernel. For submission, the application will only + * enter the kernel if the SQPOLL has been idle long enough that it + * has gone to sleep. + * + * Waiting on events still need to enter the kernel, if none are + * available. The application may also use io_uring_peek_cqe() to + * check for new events without entering the kernel, as completions + * will be continually produced to the CQ ring by the SQPOLL thread + * as they occur. + */ + if (sqpoll) { + params.flags |= IORING_SETUP_SQPOLL; + params.sq_thread_idle = 1000; + defer_tw = 0; + } + + /* + * If neither DEFER_TASKRUN or SQPOLL is used, set COOP_TASKRUN. This + * avoids heavy signal based notifications, which can force an + * application to enter the kernel and process it as soon as they + * occur. + */ + if (!sqpoll && !defer_tw) + params.flags |= IORING_SETUP_COOP_TASKRUN; + + /* + * The SQ ring size need not be larger than any batch of requests + * that need to be prepared before submit. Normally in a loop we'd + * only need a few, if any, particularly if multishot is used. 
+ */ + ret = io_uring_queue_init_params(ring_size, ring, &params); + if (ret) { + fprintf(stderr, "%s\n", strerror(-ret)); + return 1; + } + + /* + * If send serialization is available and no option was given to use + * it or not, default it to on. If it was turned on and the kernel + * doesn't support it, turn it off. + */ + if (params.features & IORING_FEAT_SEND_BUF_SELECT) { + if (send_ring == -1) + send_ring = 1; + } else { + if (send_ring == 1) { + fprintf(stderr, "Kernel doesn't support ring provided " + "buffers for sends, disabled\n"); + } + send_ring = 0; + } + + if (!send_ring && snd_bundle) { + fprintf(stderr, "Can't use send bundle without send_ring\n"); + snd_bundle = 0; + } + + if (fixed_files) { + /* + * If fixed files are used, we need to allocate a fixed file + * table upfront where new direct descriptors can be managed. + */ + ret = io_uring_register_files_sparse(ring, nr_files); + if (ret) { + fprintf(stderr, "file register: %d\n", ret); + return 1; + } + + /* + * If fixed files are used, we also register the ring fd. See + * comment near io_uring_prep_socket_direct_alloc() further + * up. This avoids the fget/fput overhead associated with + * the io_uring_enter(2) system call itself, which is used to + * submit and wait on events. + */ + ret = io_uring_register_ring_fd(ring); + if (ret != 1) { + fprintf(stderr, "ring register: %d\n", ret); + return 1; + } + } + + if (napi) { + struct io_uring_napi n = { + .prefer_busy_poll = napi > 1 ? 
1 : 0, + .busy_poll_to = napi_timeout, + }; + + ret = io_uring_register_napi(ring, &n); + if (ret) { + fprintf(stderr, "io_uring_register_napi: %d\n", ret); + if (ret != -EINVAL) + return 1; + fprintf(stderr, "NAPI not available, turned off\n"); + } + } + + return 0; +} + +static void *thread_main(void *data) +{ + struct conn *c = data; + int ret; + + c->flags |= CONN_F_STARTED; + + /* we need a max of 4 descriptors for each client */ + ret = init_ring(&c->ring, 4); + if (ret) + goto done; + + if (setup_buffer_rings(&c->ring, c)) + goto done; + + /* + * If we're using fixed files, then we need to wait for the parent + * to install the c->in_fd into our direct descriptor table. When + * that happens, we'll set things up. If we're not using fixed files, + * we can set up the receive or connect now. + */ + if (!fixed_files) + open_socket(c); + + /* we're ready */ + pthread_barrier_wait(&c->startup_barrier); + + __event_loop(&c->ring, c); +done: + return NULL; +} + +static void usage(const char *name) +{ + printf("%s:\n", name); + printf("\t-m:\t\tUse multishot receive (%d)\n", recv_mshot); + printf("\t-d:\t\tUse DEFER_TASKRUN (%d)\n", defer_tw); + printf("\t-S:\t\tUse SQPOLL (%d)\n", sqpoll); + printf("\t-f:\t\tUse only fixed files (%d)\n", fixed_files); + printf("\t-a:\t\tUse huge pages for the ring (%d)\n", use_huge); + printf("\t-t:\t\tTimeout for waiting on CQEs (usec) (%d)\n", wait_usec); + printf("\t-w:\t\tNumber of CQEs to wait for each loop (%d)\n", wait_batch); + printf("\t-B:\t\tUse bi-directional mode (%d)\n", bidi); + printf("\t-s:\t\tAct only as a sink (%d)\n", is_sink); + printf("\t-q:\t\tRing size to use (%d)\n", ring_size); + printf("\t-H:\t\tHost to connect to (%s)\n", host); + printf("\t-r:\t\tPort to receive on (%d)\n", receive_port); + printf("\t-p:\t\tPort to connect to (%d)\n", send_port); + printf("\t-6:\t\tUse IPv6 (%d)\n", ipv6); + printf("\t-N:\t\tUse NAPI polling (%d)\n", napi); + printf("\t-T:\t\tNAPI timeout (usec) (%d)\n", napi_timeout); 
+ printf("\t-b:\t\tSend/receive buf size (%d)\n", buf_size); + printf("\t-n:\t\tNumber of provided buffers (pow2) (%d)\n", nr_bufs); + printf("\t-u:\t\tUse provided buffers for send (%d)\n", send_ring); + printf("\t-C:\t\tUse bundles for send (%d)\n", snd_bundle); + printf("\t-z:\t\tUse zerocopy send (%d)\n", snd_zc); + printf("\t-c:\t\tUse bundles for recv (%d)\n", snd_bundle); + printf("\t-M:\t\tUse sendmsg (%d)\n", snd_msg); + printf("\t-M:\t\tUse recvmsg (%d)\n", rcv_msg); + printf("\t-x:\t\tShow extended stats (%d)\n", ext_stat); + printf("\t-V:\t\tIncrease verbosity (%d)\n", verbose); +} + +/* + * Options parsing the ring / net setup + */ +int main(int argc, char *argv[]) +{ + struct io_uring ring; + struct sigaction sa = { }; + const char *optstring; + int opt, ret, fd; + + setlocale(LC_NUMERIC, "en_US"); + + page_size = sysconf(_SC_PAGESIZE); + if (page_size < 0) { + perror("sysconf(_SC_PAGESIZE)"); + return 1; + } + + pthread_mutex_init(&thread_lock, NULL); + + optstring = "m:d:S:s:b:f:H:r:p:n:B:N:T:w:t:M:R:u:c:C:q:a:x:z:6Vh?"; + while ((opt = getopt(argc, argv, optstring)) != -1) { + switch (opt) { + case 'm': + recv_mshot = !!atoi(optarg); + break; + case 'S': + sqpoll = !!atoi(optarg); + break; + case 'd': + defer_tw = !!atoi(optarg); + break; + case 'b': + buf_size = atoi(optarg); + break; + case 'n': + nr_bufs = atoi(optarg); + break; + case 'u': + send_ring = !!atoi(optarg); + break; + case 'c': + rcv_bundle = !!atoi(optarg); + break; + case 'C': + snd_bundle = !!atoi(optarg); + break; + case 'w': + wait_batch = atoi(optarg); + break; + case 't': + wait_usec = atoi(optarg); + break; + case 's': + is_sink = !!atoi(optarg); + break; + case 'f': + fixed_files = !!atoi(optarg); + break; + case 'H': + host = strdup(optarg); + break; + case 'r': + receive_port = atoi(optarg); + break; + case 'p': + send_port = atoi(optarg); + break; + case 'B': + bidi = !!atoi(optarg); + break; + case 'N': + napi = !!atoi(optarg); + break; + case 'T': + napi_timeout = 
atoi(optarg); + break; + case '6': + ipv6 = true; + break; + case 'M': + snd_msg = !!atoi(optarg); + break; + case 'z': + snd_zc = !!atoi(optarg); + break; + case 'R': + rcv_msg = !!atoi(optarg); + break; + case 'q': + ring_size = atoi(optarg); + break; + case 'a': + use_huge = !!atoi(optarg); + break; + case 'x': + ext_stat = !!atoi(optarg); + break; + case 'V': + verbose++; + break; + case 'h': + default: + usage(argv[0]); + return 1; + } + } + + if (bidi && is_sink) { + fprintf(stderr, "Can't be both bidi proxy and sink\n"); + return 1; + } + if (snd_msg && sqpoll) { + fprintf(stderr, "SQPOLL with msg variants disabled\n"); + snd_msg = 0; + } + if (rcv_msg && rcv_bundle) { + fprintf(stderr, "Can't use bundles with recvmsg\n"); + rcv_msg = 0; + } + if (snd_msg && snd_bundle) { + fprintf(stderr, "Can't use bundles with sendmsg\n"); + snd_msg = 0; + } + if (snd_msg && send_ring) { + fprintf(stderr, "Can't use send ring sendmsg\n"); + snd_msg = 0; + } + if (snd_zc && (send_ring || snd_bundle)) { + fprintf(stderr, "Can't use send zc with bundles or ring\n"); + send_ring = snd_bundle = 0; + } + /* + * For recvmsg w/multishot, we waste some data at the head of the + * packet every time. Adjust the buffer size to account for that, + * so we're still handing 'buf_size' actual payload of data. 
+ */ + if (rcv_msg && recv_mshot) { + fprintf(stderr, "Adjusted buf size for recvmsg w/multishot\n"); + buf_size += sizeof(struct io_uring_recvmsg_out); + } + + br_mask = nr_bufs - 1; + + fd = setup_listening_socket(receive_port, ipv6); + if (is_sink) + send_port = -1; + + if (fd == -1) + return 1; + + atexit(show_stats); + sa.sa_handler = sig_int; + sa.sa_flags = SA_RESTART; + sigaction(SIGINT, &sa, NULL); + + ret = init_ring(&ring, MAX_CONNS * 3); + if (ret) + return ret; + + printf("Backend: sqpoll=%d, defer_tw=%d, fixed_files=%d, " + "is_sink=%d, buf_size=%d, nr_bufs=%d, host=%s, send_port=%d, " + "receive_port=%d, napi=%d, napi_timeout=%d, huge_page=%d\n", + sqpoll, defer_tw, fixed_files, is_sink, + buf_size, nr_bufs, host, send_port, receive_port, + napi, napi_timeout, use_huge); + printf(" recv options: recvmsg=%d, recv_mshot=%d, recv_bundle=%d\n", + rcv_msg, recv_mshot, rcv_bundle); + printf(" send options: sendmsg=%d, send_ring=%d, send_bundle=%d, " + "send_zerocopy=%d\n", snd_msg, send_ring, snd_bundle, + snd_zc); + + return parent_loop(&ring, fd); +} diff --git a/examples/proxy.h b/examples/proxy.h new file mode 100644 index 0000000..3fa187b --- /dev/null +++ b/examples/proxy.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: MIT */ +#ifndef LIBURING_PROXY_H +#define LIBURING_PROXY_H + +#include + +/* + * Generic opcode agnostic encoding to sqe/cqe->user_data + */ +struct userdata { + union { + struct { + uint16_t op_tid; /* 4 bits op, 12 bits tid */ + uint16_t bid; + uint16_t fd; + }; + uint64_t val; + }; +}; + +#define OP_SHIFT (12) +#define TID_MASK ((1U << 12) - 1) + +/* + * Packs the information that we will need at completion time into the + * sqe->user_data field, which is passed back in the completion in + * cqe->user_data. 
Some apps would need more space than this, and in fact + * I'd love to pack the requested IO size in here, and it's not uncommon to + * see apps use this field as just a cookie to either index a data structure + * at completion time, or even just put the pointer to the associated + * structure into this field. + */ +static inline void __encode_userdata(struct io_uring_sqe *sqe, int tid, int op, + int bid, int fd) +{ + struct userdata ud = { + .op_tid = (op << OP_SHIFT) | tid, + .bid = bid, + .fd = fd + }; + + io_uring_sqe_set_data64(sqe, ud.val); +} + +static inline uint64_t __raw_encode(int tid, int op, int bid, int fd) +{ + struct userdata ud = { + .op_tid = (op << OP_SHIFT) | tid, + .bid = bid, + .fd = fd + }; + + return ud.val; +} + +static inline int cqe_to_op(struct io_uring_cqe *cqe) +{ + struct userdata ud = { .val = cqe->user_data }; + + return ud.op_tid >> OP_SHIFT; +} + +static inline int cqe_to_bid(struct io_uring_cqe *cqe) +{ + struct userdata ud = { .val = cqe->user_data }; + + return ud.bid; +} + +static inline int cqe_to_fd(struct io_uring_cqe *cqe) +{ + struct userdata ud = { .val = cqe->user_data }; + + return ud.fd; +} + +static unsigned long long mtime_since(const struct timeval *s, + const struct timeval *e) +{ + long long sec, usec; + + sec = e->tv_sec - s->tv_sec; + usec = (e->tv_usec - s->tv_usec); + if (sec > 0 && usec < 0) { + sec--; + usec += 1000000; + } + + sec *= 1000; + usec /= 1000; + return sec + usec; +} + +static unsigned long long mtime_since_now(struct timeval *tv) +{ + struct timeval end; + + gettimeofday(&end, NULL); + return mtime_since(tv, &end); +} + +#endif diff --git a/examples/rsrc-update-bench.c b/examples/rsrc-update-bench.c new file mode 100644 index 0000000..5e3cd99 --- /dev/null +++ b/examples/rsrc-update-bench.c @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: MIT */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "liburing.h" + +static unsigned long runtime_ms = 
10000; + +static unsigned long gettimeofday_ms(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + return (tv.tv_sec * 1000) + (tv.tv_usec / 1000); +} + +int main(void) +{ + unsigned long tstop; + unsigned long nr_reqs = 0; + struct io_uring_cqe *cqe; + struct io_uring_sqe *sqe; + struct io_uring ring; + int pipe1[2]; + int ret, i, qd = 32; + int table_size = 128; + + if (pipe(pipe1) != 0) { + perror("pipe"); + return 1; + } + + ret = io_uring_queue_init(1024, &ring, IORING_SETUP_SINGLE_ISSUER | + IORING_SETUP_DEFER_TASKRUN); + if (ret) { + fprintf(stderr, "io_uring_queue_init failed: %d\n", ret); + return 1; + } + ret = io_uring_register_ring_fd(&ring); + if (ret < 0) { + fprintf(stderr, "io_uring_register_ring_fd failed\n"); + return 1; + } + ret = io_uring_register_files_sparse(&ring, table_size); + if (ret < 0) { + fprintf(stderr, "io_uring_register_files_sparse failed\n"); + return 1; + } + + for (i = 0; i < table_size; i++) { + ret = io_uring_register_files_update(&ring, i, pipe1, 1); + if (ret < 0) { + fprintf(stderr, "io_uring_register_files_update failed\n"); + return 1; + } + } + + srand(time(NULL)); + + tstop = gettimeofday_ms() + runtime_ms; + do { + int off = rand(); + + for (i = 0; i < qd; i++) { + sqe = io_uring_get_sqe(&ring); + int roff = (off + i) % table_size; + io_uring_prep_files_update(sqe, pipe1, 1, roff); + } + + ret = io_uring_submit(&ring); + if (ret != qd) { + fprintf(stderr, "child: sqe submit failed: %d\n", ret); + return 1; + } + + for (i = 0; i < qd; i++) { + ret = io_uring_wait_cqe(&ring, &cqe); + if (ret < 0) { + fprintf(stderr, "child: wait completion %d\n", ret); + break; + } + io_uring_cqe_seen(&ring, cqe); + nr_reqs++; + } + } while (gettimeofday_ms() < tstop); + + fprintf(stderr, "max updates/s: %lu\n", nr_reqs * 1000UL / runtime_ms); + + io_uring_queue_exit(&ring); + close(pipe1[0]); + close(pipe1[1]); + return 0; +} diff --git a/examples/send-zerocopy.c b/examples/send-zerocopy.c index 7f5f2b1..691a0cc 100644 --- 
a/examples/send-zerocopy.c +++ b/examples/send-zerocopy.c @@ -5,15 +5,17 @@ #include #include #include -#include #include #include #include #include +#include #include +#include +#include +#include #include -#include #include #include #include @@ -35,27 +37,107 @@ #include #include #include +#include +#include +#include #include "liburing.h" #define ZC_TAG 0xfffffffULL #define MAX_SUBMIT_NR 512 +#define MAX_THREADS 100 + +struct thread_data { + pthread_t thread; + void *ret; + int idx; + unsigned long long packets; + unsigned long long bytes; + unsigned long long dt_ms; + struct sockaddr_storage dst_addr; + int fd; +}; static bool cfg_reg_ringfd = true; static bool cfg_fixed_files = 1; static bool cfg_zc = 1; static int cfg_nr_reqs = 8; static bool cfg_fixed_buf = 1; +static bool cfg_hugetlb = 0; +static bool cfg_defer_taskrun = 0; +static int cfg_cpu = -1; +static bool cfg_rx = 0; +static unsigned cfg_nr_threads = 1; static int cfg_family = PF_UNSPEC; +static int cfg_type = 0; static int cfg_payload_len; static int cfg_port = 8000; static int cfg_runtime_ms = 4200; +static bool cfg_rx_poll = false; static socklen_t cfg_alen; -static struct sockaddr_storage cfg_dst_addr; +static char *str_addr = NULL; + +static char payload_buf[IP_MAXPACKET] __attribute__((aligned(4096))); +static char *payload; +static struct thread_data threads[MAX_THREADS]; +static pthread_barrier_t barrier; + +static bool should_stop = false; + +static void sigint_handler(__attribute__((__unused__)) int sig) +{ + /* kill if should_stop can't unblock threads fast enough */ + if (should_stop) + _exit(-1); + should_stop = true; +} + +/* + * Implementation of error(3), prints an error message and exits. + */ +static void t_error(int status, int errnum, const char *format, ...) 
+{ + va_list args; + va_start(args, format); + + vfprintf(stderr, format, args); + if (errnum) + fprintf(stderr, ": %s", strerror(errnum)); + + fprintf(stderr, "\n"); + va_end(args); + exit(status); +} + +static void set_cpu_affinity(void) +{ + cpu_set_t mask; -static char payload[IP_MAXPACKET] __attribute__((aligned(4096))); + if (cfg_cpu == -1) + return; + + CPU_ZERO(&mask); + CPU_SET(cfg_cpu, &mask); + if (sched_setaffinity(0, sizeof(mask), &mask)) + t_error(1, errno, "unable to pin cpu\n"); +} + +static void set_iowq_affinity(struct io_uring *ring) +{ + cpu_set_t mask; + int ret; + + if (cfg_cpu == -1) + return; + + CPU_ZERO(&mask); + CPU_SET(cfg_cpu, &mask); + ret = io_uring_register_iowq_aff(ring, 1, &mask); + if (ret) + t_error(1, ret, "unabled to set io-wq affinity\n"); +} static unsigned long gettimeofday_ms(void) { @@ -68,7 +150,7 @@ static unsigned long gettimeofday_ms(void) static void do_setsockopt(int fd, int level, int optname, int val) { if (setsockopt(fd, level, optname, &val, sizeof(val))) - error(1, errno, "setsockopt %d.%d: %d", level, optname, val); + t_error(1, errno, "setsockopt %d.%d: %d", level, optname, val); } static void setup_sockaddr(int domain, const char *str_addr, @@ -76,42 +158,156 @@ static void setup_sockaddr(int domain, const char *str_addr, { struct sockaddr_in6 *addr6 = (void *) sockaddr; struct sockaddr_in *addr4 = (void *) sockaddr; + int port = cfg_port; switch (domain) { case PF_INET: memset(addr4, 0, sizeof(*addr4)); addr4->sin_family = AF_INET; - addr4->sin_port = htons(cfg_port); + addr4->sin_port = htons(port); if (str_addr && inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1) - error(1, 0, "ipv4 parse error: %s", str_addr); + t_error(1, 0, "ipv4 parse error: %s", str_addr); break; case PF_INET6: memset(addr6, 0, sizeof(*addr6)); addr6->sin6_family = AF_INET6; - addr6->sin6_port = htons(cfg_port); + addr6->sin6_port = htons(port); if (str_addr && inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1) - error(1, 
0, "ipv6 parse error: %s", str_addr); + t_error(1, 0, "ipv6 parse error: %s", str_addr); break; default: - error(1, 0, "illegal domain"); + t_error(1, 0, "illegal domain"); } } -static int do_setup_tx(int domain, int type, int protocol) +static int do_poll(int fd, int events) { - int fd; + struct pollfd pfd; + int ret; + + pfd.events = events; + pfd.revents = 0; + pfd.fd = fd; + + ret = poll(&pfd, 1, -1); + if (ret == -1) + t_error(1, errno, "poll"); + + return ret && (pfd.revents & events); +} + +/* Flush all outstanding bytes for the tcp receive queue */ +static int do_flush_tcp(struct thread_data *td, int fd) +{ + int ret; + + /* MSG_TRUNC flushes up to len bytes */ + ret = recv(fd, NULL, 1 << 21, MSG_TRUNC | MSG_DONTWAIT); + if (ret == -1 && errno == EAGAIN) + return 0; + if (ret == -1) + t_error(1, errno, "flush"); + if (!ret) + return 1; + + td->packets++; + td->bytes += ret; + return 0; +} + +/* Flush all outstanding datagrams. Verify first few bytes of each. */ +static int do_flush_datagram(struct thread_data *td, int fd) +{ + long ret, off = 0; + char buf[64]; + + /* MSG_TRUNC will return full datagram length */ + ret = recv(fd, buf, sizeof(buf), MSG_DONTWAIT | MSG_TRUNC); + if (ret == -1 && errno == EAGAIN) + return 0; + + if (ret == -1) + t_error(1, errno, "recv"); + if (ret != cfg_payload_len) + t_error(1, 0, "recv: ret=%u != %u", ret, cfg_payload_len); + if ((unsigned long) ret > sizeof(buf) - off) + ret = sizeof(buf) - off; + if (memcmp(buf + off, payload, ret)) + t_error(1, 0, "recv: data mismatch"); + + td->packets++; + td->bytes += cfg_payload_len; + return 0; +} + +static void do_setup_rx(int domain, int type, int protocol) +{ + struct sockaddr_storage addr = {}; + struct thread_data *td; + int listen_fd, fd; + unsigned int i; fd = socket(domain, type, protocol); if (fd == -1) - error(1, errno, "socket t"); + t_error(1, errno, "socket r"); + + do_setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, 1); + + setup_sockaddr(cfg_family, str_addr, &addr); + + if 
(bind(fd, (void *)&addr, cfg_alen)) + t_error(1, errno, "bind"); + + if (type != SOCK_STREAM) { + if (cfg_nr_threads != 1) + t_error(1, 0, "udp rx cant multithread"); + threads[0].fd = fd; + return; + } + + listen_fd = fd; + if (listen(listen_fd, cfg_nr_threads)) + t_error(1, errno, "listen"); + + for (i = 0; i < cfg_nr_threads; i++) { + td = &threads[i]; + + fd = accept(listen_fd, NULL, NULL); + if (fd == -1) + t_error(1, errno, "accept"); + td->fd = fd; + } + + if (close(listen_fd)) + t_error(1, errno, "close listen sock"); +} + +static void *do_rx(void *arg) +{ + struct thread_data *td = arg; + const int cfg_receiver_wait_ms = 400; + uint64_t tstop; + int ret, fd = td->fd; + + tstop = gettimeofday_ms() + cfg_runtime_ms + cfg_receiver_wait_ms; + do { + if (cfg_type == SOCK_STREAM) + ret = do_flush_tcp(td, fd); + else + ret = do_flush_datagram(td, fd); - do_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, 1 << 21); + if (ret) + break; + + do_poll(fd, POLLIN); + } while (gettimeofday_ms() < tstop); - if (connect(fd, (void *) &cfg_dst_addr, cfg_alen)) - error(1, errno, "connect"); - return fd; + if (close(fd)) + t_error(1, errno, "close"); + pthread_exit(&td->ret); + return NULL; } static inline struct io_uring_cqe *wait_cqe_fast(struct io_uring *ring) @@ -125,35 +321,47 @@ static inline struct io_uring_cqe *wait_cqe_fast(struct io_uring *ring) ret = io_uring_wait_cqe(ring, &cqe); if (ret) - error(1, ret, "wait cqe"); + t_error(1, ret, "wait cqe"); return cqe; } -static void do_tx(int domain, int type, int protocol) +static void do_tx(struct thread_data *td, int domain, int type, int protocol) { - unsigned long packets = 0; - unsigned long bytes = 0; + const int notif_slack = 128; struct io_uring ring; struct iovec iov; - uint64_t tstop; + uint64_t tstart; int i, fd, ret; int compl_cqes = 0; + int ring_flags = IORING_SETUP_COOP_TASKRUN | IORING_SETUP_SINGLE_ISSUER; + unsigned loop = 0; + + if (cfg_defer_taskrun) + ring_flags |= IORING_SETUP_DEFER_TASKRUN; + + fd = 
socket(domain, type, protocol); + if (fd == -1) + t_error(1, errno, "socket t"); - fd = do_setup_tx(domain, type, protocol); + if (connect(fd, (void *)&td->dst_addr, cfg_alen)) + t_error(1, errno, "connect, idx %i", td->idx); - ret = io_uring_queue_init(512, &ring, IORING_SETUP_COOP_TASKRUN); + ret = io_uring_queue_init(512, &ring, ring_flags); if (ret) - error(1, ret, "io_uring: queue init"); + t_error(1, ret, "io_uring: queue init"); + + set_cpu_affinity(); + set_iowq_affinity(&ring); if (cfg_fixed_files) { ret = io_uring_register_files(&ring, &fd, 1); if (ret < 0) - error(1, ret, "io_uring: files registration"); + t_error(1, ret, "io_uring: files registration"); } if (cfg_reg_ringfd) { ret = io_uring_register_ring_fd(&ring); if (ret < 0) - error(1, ret, "io_uring: io_uring_register_ring_fd"); + t_error(1, ret, "io_uring: io_uring_register_ring_fd"); } iov.iov_base = payload; @@ -161,9 +369,22 @@ static void do_tx(int domain, int type, int protocol) ret = io_uring_register_buffers(&ring, &iov, 1); if (ret) - error(1, ret, "io_uring: buffer registration"); + t_error(1, ret, "io_uring: buffer registration"); + + if (cfg_rx_poll) { + struct io_uring_sqe *sqe; + + sqe = io_uring_get_sqe(&ring); + io_uring_prep_poll_add(sqe, fd, POLLIN); + + ret = io_uring_submit(&ring); + if (ret != 1) + t_error(1, ret, "submit poll"); + } - tstop = gettimeofday_ms() + cfg_runtime_ms; + pthread_barrier_wait(&barrier); + + tstart = gettimeofday_ms(); do { struct io_uring_sqe *sqe; struct io_uring_cqe *cqe; @@ -191,16 +412,20 @@ static void do_tx(int domain, int type, int protocol) } } - ret = io_uring_submit(&ring); + if (cfg_defer_taskrun && compl_cqes >= notif_slack) + ret = io_uring_submit_and_get_events(&ring); + else + ret = io_uring_submit(&ring); + if (ret != cfg_nr_reqs) - error(1, ret, "submit"); + t_error(1, ret, "submit"); for (i = 0; i < cfg_nr_reqs; i++) { cqe = wait_cqe_fast(&ring); if (cqe->flags & IORING_CQE_F_NOTIF) { if (cqe->flags & IORING_CQE_F_MORE) - error(1, 
-EINVAL, "F_MORE notif"); + t_error(1, -EINVAL, "F_MORE notif"); compl_cqes--; i--; io_uring_cqe_seen(&ring, cqe); @@ -210,28 +435,27 @@ static void do_tx(int domain, int type, int protocol) compl_cqes++; if (cqe->res >= 0) { - packets++; - bytes += cqe->res; + td->packets++; + td->bytes += cqe->res; } else if (cqe->res == -ECONNREFUSED || cqe->res == -EPIPE || cqe->res == -ECONNRESET) { - fprintf(stderr, "Connection failure"); + fprintf(stderr, "Connection failure\n"); goto out_fail; } else if (cqe->res != -EAGAIN) { - error(1, cqe->res, "send failed"); + t_error(1, cqe->res, "send failed"); } io_uring_cqe_seen(&ring, cqe); } - } while (gettimeofday_ms() < tstop); + if (should_stop) + break; + } while ((++loop % 16 != 0) || gettimeofday_ms() < tstart + cfg_runtime_ms); + + td->dt_ms = gettimeofday_ms() - tstart; out_fail: shutdown(fd, SHUT_RDWR); if (close(fd)) - error(1, errno, "close"); - - fprintf(stderr, "tx=%lu (MB=%lu), tx/s=%lu (MB/s=%lu)\n", - packets, bytes >> 20, - packets / (cfg_runtime_ms / 1000), - (bytes >> 20) / (cfg_runtime_ms / 1000)); + t_error(1, errno, "close"); while (compl_cqes) { struct io_uring_cqe *cqe = wait_cqe_fast(&ring); @@ -242,47 +466,67 @@ out_fail: io_uring_queue_exit(&ring); } -static void do_test(int domain, int type, int protocol) +static void *do_test(void *arg) { - int i; + struct thread_data *td = arg; + int protocol = 0; - for (i = 0; i < IP_MAXPACKET; i++) - payload[i] = 'a' + (i % 26); + setup_sockaddr(cfg_family, str_addr, &td->dst_addr); - do_tx(domain, type, protocol); + do_tx(td, cfg_family, cfg_type, protocol); + pthread_exit(&td->ret); + return NULL; } static void usage(const char *filepath) { - error(1, 0, "Usage: %s [-n] [-z] [-s] " - "(-4|-6) [-t