From 4080d0ad8ad635213ff952b3e5cfb73e41601496 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 5 Apr 2024 12:52:07 +0200 Subject: [PATCH 1/9] selftests: mptcp: export ip_mptcp to mptcp_lib mainline inclusion from mainline-v6.10-rc1 commit 29aa32fee7d0fdc71221158719137c1f4f5de3b4 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9VYQ9 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=29aa32fee7d0fdc71221158719137c1f4f5de3b4 -------------------------------- This patch exports ip_mptcp into mptcp_lib.sh as a public variable, named MPTCP_LIB_IP_MPTCP. Add a helper mptcp_lib_set_ip_mptcp() to set it, and a helper mptcp_lib_is_ip_mptcp() to test whether it is set. Use these two helpers in mptcp_join.sh. This patch is prepared for coming commits. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 17 ++++++++--------- tools/testing/selftests/net/mptcp/mptcp_lib.sh | 9 +++++++++ 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index a1df06f741ae1..b9d82f10c32c7 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -31,7 +31,6 @@ timeout_poll=30 timeout_test=$((timeout_poll * 2 + 1)) capture=false checksum=false -ip_mptcp=0 check_invert=0 validate_checksum=false init=0 @@ -616,7 +615,7 @@ pm_nl_set_limits() local addrs=$2 local subflows=$3 - if [ $ip_mptcp -eq 1 ]; then + if mptcp_lib_is_ip_mptcp; then ip -n $ns mptcp limits set add_addr_accepted $addrs subflows $subflows else ip netns exec $ns ./pm_nl_ctl limits $addrs $subflows @@ -656,7 +655,7 @@ pm_nl_add_endpoint() nr=$((nr + 1)) done - if [ $ip_mptcp -eq 1 ]; then + if mptcp_lib_is_ip_mptcp; then ip -n $ns mptcp endpoint add $addr ${_flags//","/" "} $dev $id $port else ip netns exec $ns ./pm_nl_ctl add $addr $flags $dev $id $port @@ -669,7 +668,7 @@ pm_nl_del_endpoint() local id=$2 local addr=$3 - if [ $ip_mptcp -eq 1 ]; then + if mptcp_lib_is_ip_mptcp; then [ $id -ne 0 ] && addr='' ip -n $ns mptcp endpoint delete id $id $addr else @@ -681,7 +680,7 @@ pm_nl_flush_endpoint() { local ns=$1 - if [ $ip_mptcp -eq 1 ]; then + if mptcp_lib_is_ip_mptcp; then ip -n $ns mptcp endpoint flush else ip netns exec $ns ./pm_nl_ctl flush @@ -692,7 +691,7 @@ pm_nl_show_endpoints() { local ns=$1 - if [ $ip_mptcp -eq 1 ]; then + if mptcp_lib_is_ip_mptcp; then ip -n $ns mptcp endpoint show else ip netns exec $ns ./pm_nl_ctl dump @@ -705,7 +704,7 @@ pm_nl_change_endpoint() local id=$2 local flags=$3 - if [ $ip_mptcp -eq 1 ]; then + if mptcp_lib_is_ip_mptcp; then ip -n $ns mptcp endpoint change id $id ${flags//","/" "} else ip netns exec $ns ./pm_nl_ctl set id $id flags $flags @@ -755,7 +754,7 @@ pm_nl_check_endpoint() return fi - if [ $ip_mptcp -eq 1 ]; then + if mptcp_lib_is_ip_mptcp; then # get line and trim trailing whitespace line=$(ip -n $ns mptcp endpoint show $id) line="${line% }" @@ -3963,7 +3962,7 @@ while getopts "${all_tests_args}cCih" opt; do checksum=true ;; i) - ip_mptcp=1 + mptcp_lib_set_ip_mptcp ;; h) usage diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index d529b4b37af86..3986adfece448 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -23,6 +23,7 @@ MPTCP_LIB_SUBTESTS=() MPTCP_LIB_SUBTESTS_DUPLICATED=0 MPTCP_LIB_TEST_COUNTER=0 MPTCP_LIB_TEST_FORMAT="%02u %-50s" +MPTCP_LIB_IP_MPTCP=0 # only if supported (or forced) and not disabled, see no-color.org if { [ -t 1 ] || [ "${SELFTESTS_MPTCP_LIB_COLOR_FORCE:-}" = "1" ]; } && @@ -505,3 +506,11 @@ mptcp_lib_verify_listener_events() { mptcp_lib_check_expected "type" "family" "saddr" "sport" || rc="${?}" return "${rc}" } + +mptcp_lib_set_ip_mptcp() { + MPTCP_LIB_IP_MPTCP=1 +} + +mptcp_lib_is_ip_mptcp() { + [ "${MPTCP_LIB_IP_MPTCP}" = "1" ] +} -- Gitee From cd20358b7aca6da2aba8353415fe68aa32b753e4 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 24 May 2024 18:30:56 +0200 Subject: [PATCH 2/9] selftests: mptcp: lib: support flaky subtests mainline inclusion from mainline-v6.10-rc2 commit 5597613fb3cf0e36d26cfd8fb2a63196da249333 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9VYQ9 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=5597613fb3cf0e36d26cfd8fb2a63196da249333 -------------------------------- Some subtests can be unstable, failing once every X runs. Fixing them can take time: there could be an issue in the kernel or in the subtest, and it is then important to do a proper analysis, not to hide real bugs. To avoid creating noises on the different CIs, it is important to have a simple way to mark subtests as flaky, and ignore the errors. This is what this patch introduces: subtests can be marked as flaky by setting MPTCP_LIB_SUBTEST_FLAKY env var to 1, e.g. MPTCP_LIB_SUBTEST_FLAKY=1 The subtest will be executed, and errors (if any) will be ignored. It is still good to run these subtests, as it exercises code, and the results can still be useful for the on-going investigations. Note that the MPTCP CI will continue to track these flaky subtests by setting SELFTESTS_MPTCP_LIB_OVERRIDE_FLAKY env var to 1, and a ticket has to be created before marking subtests as flaky. Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240524-upstream-net-20240524-selftests-mptcp-flaky-v1-1-a352362f3f8e@kernel.org Signed-off-by: Jakub Kicinski --- .../testing/selftests/net/mptcp/mptcp_lib.sh | 30 +++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index 3986adfece448..77cf3303e4fa1 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -21,6 +21,7 @@ declare -rx MPTCP_LIB_AF_INET6=10 MPTCP_LIB_SUBTESTS=() MPTCP_LIB_SUBTESTS_DUPLICATED=0 +MPTCP_LIB_SUBTEST_FLAKY=0 MPTCP_LIB_TEST_COUNTER=0 MPTCP_LIB_TEST_FORMAT="%02u %-50s" MPTCP_LIB_IP_MPTCP=0 @@ -41,6 +42,16 @@ else readonly MPTCP_LIB_COLOR_RESET= fi +# SELFTESTS_MPTCP_LIB_OVERRIDE_FLAKY env var can be set not to ignore errors +# from subtests marked as flaky +mptcp_lib_override_flaky() { + [ "${SELFTESTS_MPTCP_LIB_OVERRIDE_FLAKY:-}" = 1 ] +} + +mptcp_lib_subtest_is_flaky() { + [ "${MPTCP_LIB_SUBTEST_FLAKY}" = 1 ] && ! mptcp_lib_override_flaky +} + # $1: color, $2: text mptcp_lib_print_color() { echo -e "${MPTCP_LIB_START_PRINT:-}${*}${MPTCP_LIB_COLOR_RESET}" @@ -72,7 +83,16 @@ mptcp_lib_pr_skip() { } mptcp_lib_pr_fail() { - mptcp_lib_print_err "[FAIL]${1:+ ${*}}" + local title cmt + + if mptcp_lib_subtest_is_flaky; then + title="IGNO" + cmt=" (flaky)" + else + title="FAIL" + fi + + mptcp_lib_print_err "[${title}]${cmt}${1:+ ${*}}" } mptcp_lib_pr_info() { @@ -208,7 +228,13 @@ mptcp_lib_result_pass() { # $1: test name mptcp_lib_result_fail() { - __mptcp_lib_result_add "not ok" "${1}" + if mptcp_lib_subtest_is_flaky; then + # It might sound better to use 'not ok # TODO' or 'ok # SKIP', + # but some CIs don't understand 'TODO' and treat SKIP as errors. + __mptcp_lib_result_add "ok" "${1} # IGNORE Flaky" + else + __mptcp_lib_result_add "not ok" "${1}" + fi } # $1: test name -- Gitee From 447c3f2647337c7953bd921ae8fb9e043d240d4f Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 24 May 2024 18:30:58 +0200 Subject: [PATCH 3/9] selftests: mptcp: join: mark 'fastclose' tests as flaky mainline inclusion from mainline-v6.10-rc2 commit 8c06ac2178a9dee887929232226e35a5cdda1793 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9VYQ9 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=8c06ac2178a9dee887929232226e35a5cdda1793 -------------------------------- These tests are flaky since their introduction. This might be less or not visible depending on the CI running the tests, especially if it is also busy doing other tasks in parallel, and if a debug kernel config is being used. It looks like this issue is often present with the NetDev CI. While this is being investigated, the tests are marked as flaky not to create noises on such CIs. Fixes: 01542c9bf9ab ("selftests: mptcp: add fastclose testcase") Link: https://github.com/multipath-tcp/mptcp_net-next/issues/324 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240524-upstream-net-20240524-selftests-mptcp-flaky-v1-3-a352362f3f8e@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index b9d82f10c32c7..05429017beb7f 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -261,6 +261,8 @@ reset() TEST_NAME="${1}" + MPTCP_LIB_SUBTEST_FLAKY=0 # reset if modified + if skip_test; then MPTCP_LIB_TEST_COUNTER=$((MPTCP_LIB_TEST_COUNTER+1)) last_test_ignored=1 @@ -454,7 +456,9 @@ reset_with_tcp_filter() # $1: err msg fail_test() { - ret=${KSFT_FAIL} + if ! mptcp_lib_subtest_is_flaky; then + ret=${KSFT_FAIL} + fi if [ ${#} -gt 0 ]; then print_fail "${@}" @@ -3258,6 +3262,7 @@ fullmesh_tests() fastclose_tests() { if reset_check_counter "fastclose test" "MPTcpExtMPFastcloseTx"; then + MPTCP_LIB_SUBTEST_FLAKY=1 test_linkfail=1024 fastclose=client \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 0 0 0 @@ -3266,6 +3271,7 @@ fastclose_tests() fi if reset_check_counter "fastclose server test" "MPTcpExtMPFastcloseRx"; then + MPTCP_LIB_SUBTEST_FLAKY=1 test_linkfail=1024 fastclose=server \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 0 0 0 0 0 0 1 -- Gitee From bd6b6c2cb9d5fd921758cfba150ab3d2f61d4664 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 30 May 2024 16:07:30 +0200 Subject: [PATCH 4/9] doc: mptcp: add missing 'available_schedulers' entry mainline inclusion from mainline-v6.11-rc1 commit ccf45c92d746e8eac2cb1e60243d38a479e8dc0a category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9VYQ9 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=ccf45c92d746e8eac2cb1e60243d38a479e8dc0a -------------------------------- This sysctl knob has been added recently, but the documentation has not been updated. This knob is used to show the available schedulers choices that are registered, similar to 'net.ipv4.tcp_available_congestion_control'. Fixes: 73c900aa3660 ("mptcp: add net.mptcp.available_schedulers") Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240530-upstream-net-20240520-mptcp-doc-v3-1-e94cdd9f2673@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/mptcp-sysctl.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/networking/mptcp-sysctl.rst b/Documentation/networking/mptcp-sysctl.rst index 69975ce25a029..102a45e7bfa8a 100644 --- a/Documentation/networking/mptcp-sysctl.rst +++ b/Documentation/networking/mptcp-sysctl.rst @@ -93,3 +93,7 @@ scheduler - STRING sysctl. Default: "default" + +available_schedulers - STRING + Shows the available schedulers choices that are registered. More packet + schedulers may be available, but not loaded. -- Gitee From 845a152d74bb1565886992699c06a3bd82a3b715 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 30 May 2024 16:07:31 +0200 Subject: [PATCH 5/9] doc: mptcp: alphabetical order mainline inclusion from mainline-v6.11-rc1 commit a32c6966b23d987520f7c5ac12d153eaff425553 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9VYQ9 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=a32c6966b23d987520f7c5ac12d153eaff425553 -------------------------------- Similar to what is done in other 'sysctl' pages: it looks clearer from a readability perspective. This might cause troubles in the short/mid-term with the backports, but by not putting new entries at the end, this can help to reduce conflicts in case of backports in the long term. We don't change the information here too often, so it looks OK to do that. Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240530-upstream-net-20240520-mptcp-doc-v3-2-e94cdd9f2673@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/mptcp-sysctl.rst | 74 +++++++++++------------ 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/Documentation/networking/mptcp-sysctl.rst b/Documentation/networking/mptcp-sysctl.rst index 102a45e7bfa8a..fd514bba8c431 100644 --- a/Documentation/networking/mptcp-sysctl.rst +++ b/Documentation/networking/mptcp-sysctl.rst @@ -7,14 +7,6 @@ MPTCP Sysfs variables /proc/sys/net/mptcp/* Variables =============================== -enabled - BOOLEAN - Control whether MPTCP sockets can be created. - - MPTCP sockets can be created if the value is 1. This is a - per-namespace sysctl. - - Default: 1 (enabled) - add_addr_timeout - INTEGER (seconds) Set the timeout after which an ADD_ADDR control message will be resent to an MPTCP peer that has not acknowledged a previous @@ -25,16 +17,22 @@ add_addr_timeout - INTEGER (seconds) Default: 120 -close_timeout - INTEGER (seconds) - Set the make-after-break timeout: in absence of any close or - shutdown syscall, MPTCP sockets will maintain the status - unchanged for such time, after the last subflow removal, before - moving to TCP_CLOSE. +allow_join_initial_addr_port - BOOLEAN + Allow peers to send join requests to the IP address and port number used + by the initial subflow if the value is 1. This controls a flag that is + sent to the peer at connection time, and whether such join requests are + accepted or denied. - The default value matches TCP_TIMEWAIT_LEN. This is a per-namespace - sysctl. + Joins to addresses advertised with ADD_ADDR are not affected by this + value. - Default: 60 + This is a per-namespace sysctl. + + Default: 1 + +available_schedulers - STRING + Shows the available schedulers choices that are registered. More packet + schedulers may be available, but not loaded. checksum_enabled - BOOLEAN Control whether DSS checksum can be enabled. @@ -44,18 +42,24 @@ checksum_enabled - BOOLEAN Default: 0 -allow_join_initial_addr_port - BOOLEAN - Allow peers to send join requests to the IP address and port number used - by the initial subflow if the value is 1. This controls a flag that is - sent to the peer at connection time, and whether such join requests are - accepted or denied. +close_timeout - INTEGER (seconds) + Set the make-after-break timeout: in absence of any close or + shutdown syscall, MPTCP sockets will maintain the status + unchanged for such time, after the last subflow removal, before + moving to TCP_CLOSE. - Joins to addresses advertised with ADD_ADDR are not affected by this - value. + The default value matches TCP_TIMEWAIT_LEN. This is a per-namespace + sysctl. - This is a per-namespace sysctl. + Default: 60 - Default: 1 +enabled - BOOLEAN + Control whether MPTCP sockets can be created. + + MPTCP sockets can be created if the value is 1. This is a + per-namespace sysctl. + + Default: 1 (enabled) pm_type - INTEGER Set the default path manager type to use for each new MPTCP @@ -74,6 +78,14 @@ pm_type - INTEGER Default: 0 +scheduler - STRING + Select the scheduler of your choice. + + Support for selection of different schedulers. This is a per-namespace + sysctl. + + Default: "default" + stale_loss_cnt - INTEGER The number of MPTCP-level retransmission intervals with no traffic and pending outstanding data on a given subflow required to declare it stale. @@ -85,15 +97,3 @@ stale_loss_cnt - INTEGER This is a per-namespace sysctl. Default: 4 - -scheduler - STRING - Select the scheduler of your choice. - - Support for selection of different schedulers. This is a per-namespace - sysctl. - - Default: "default" - -available_schedulers - STRING - Shows the available schedulers choices that are registered. More packet - schedulers may be available, but not loaded. -- Gitee From 60aaae041eda91a6d057d384d2051a9fdc8cdc4a Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 9 Dec 2024 17:01:04 +0800 Subject: [PATCH 6/9] doc: new 'mptcp' page in 'networking' mainline inclusion from mainline-v6.11-rc1 commit c049275f24de70edd43e521f5a549c9e63165906 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9VYQ9 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=c049275f24de70edd43e521f5a549c9e63165906 -------------------------------- A general documentation about MPTCP was missing since its introduction in v5.6. Most of what is there comes from our recently updated mptcp.dev website, with additional links to resources from the kernel documentation. This is a first version, mainly targeting app developers and users. Link: https://www.mptcp.dev Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240530-upstream-net-20240520-mptcp-doc-v3-3-e94cdd9f2673@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/index.rst | 1 + Documentation/networking/mptcp.rst | 156 +++++++++++++++++++++++++++++ MAINTAINERS | 2 +- 3 files changed, 158 insertions(+), 1 deletion(-) create mode 100644 Documentation/networking/mptcp.rst diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 5b75c3f7a137b..7a356dc40ca2e 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -72,6 +72,7 @@ Contents: mac80211-injection mctp mpls-sysctl + mptcp mptcp-sysctl multiqueue napi diff --git a/Documentation/networking/mptcp.rst b/Documentation/networking/mptcp.rst new file mode 100644 index 0000000000000..17f2bab611644 --- /dev/null +++ b/Documentation/networking/mptcp.rst @@ -0,0 +1,156 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================== +Multipath TCP (MPTCP) +===================== + +Introduction +============ + +Multipath TCP or MPTCP is an extension to the standard TCP and is described in +`RFC 8684 (MPTCPv1) `_. It allows a +device to make use of multiple interfaces at once to send and receive TCP +packets over a single MPTCP connection. MPTCP can aggregate the bandwidth of +multiple interfaces or prefer the one with the lowest latency. It also allows a +fail-over if one path is down, and the traffic is seamlessly reinjected on other +paths. + +For more details about Multipath TCP in the Linux kernel, please see the +official website: `mptcp.dev `_. + + +Use cases +========= + +Thanks to MPTCP, being able to use multiple paths in parallel or simultaneously +brings new use-cases, compared to TCP: + +- Seamless handovers: switching from one path to another while preserving + established connections, e.g. to be used in mobility use-cases, like on + smartphones. +- Best network selection: using the "best" available path depending on some + conditions, e.g. latency, losses, cost, bandwidth, etc. +- Network aggregation: using multiple paths at the same time to have a higher + throughput, e.g. to combine fixed and mobile networks to send files faster. + + +Concepts +======== + +Technically, when a new socket is created with the ``IPPROTO_MPTCP`` protocol +(Linux-specific), a *subflow* (or *path*) is created. This *subflow* consists of +a regular TCP connection that is used to transmit data through one interface. +Additional *subflows* can be negotiated later between the hosts. For the remote +host to be able to detect the use of MPTCP, a new field is added to the TCP +*option* field of the underlying TCP *subflow*. This field contains, amongst +other things, a ``MP_CAPABLE`` option that tells the other host to use MPTCP if +it is supported. If the remote host or any middlebox in between does not support +it, the returned ``SYN+ACK`` packet will not contain MPTCP options in the TCP +*option* field. In that case, the connection will be "downgraded" to plain TCP, +and it will continue with a single path. + +This behavior is made possible by two internal components: the path manager, and +the packet scheduler. + +Path Manager +------------ + +The Path Manager is in charge of *subflows*, from creation to deletion, and also +address announcements. Typically, it is the client side that initiates subflows, +and the server side that announces additional addresses via the ``ADD_ADDR`` and +``REMOVE_ADDR`` options. + +Path managers are controlled by the ``net.mptcp.pm_type`` sysctl knob -- see +mptcp-sysctl.rst. There are two types: the in-kernel one (type ``0``) where the +same rules are applied for all the connections (see: ``ip mptcp``) ; and the +userspace one (type ``1``), controlled by a userspace daemon (i.e. `mptcpd +`_) where different rules can be applied for each +connection. The path managers can be controlled via a Netlink API; see +netlink_spec/mptcp_pm.rst. + +To be able to use multiple IP addresses on a host to create multiple *subflows* +(paths), the default in-kernel MPTCP path-manager needs to know which IP +addresses can be used. This can be configured with ``ip mptcp endpoint`` for +example. + +Packet Scheduler +---------------- + +The Packet Scheduler is in charge of selecting which available *subflow(s)* to +use to send the next data packet. It can decide to maximize the use of the +available bandwidth, only to pick the path with the lower latency, or any other +policy depending on the configuration. + +Packet schedulers are controlled by the ``net.mptcp.scheduler`` sysctl knob -- +see mptcp-sysctl.rst. + + +Sockets API +=========== + +Creating MPTCP sockets +---------------------- + +On Linux, MPTCP can be used by selecting MPTCP instead of TCP when creating the +``socket``: + +.. code-block:: C + + int sd = socket(AF_INET(6), SOCK_STREAM, IPPROTO_MPTCP); + +Note that ``IPPROTO_MPTCP`` is defined as ``262``. + +If MPTCP is not supported, ``errno`` will be set to: + +- ``EINVAL``: (*Invalid argument*): MPTCP is not available, on kernels < 5.6. +- ``EPROTONOSUPPORT`` (*Protocol not supported*): MPTCP has not been compiled, + on kernels >= v5.6. +- ``ENOPROTOOPT`` (*Protocol not available*): MPTCP has been disabled using + ``net.mptcp.enabled`` sysctl knob; see mptcp-sysctl.rst. + +MPTCP is then opt-in: applications need to explicitly request it. Note that +applications can be forced to use MPTCP with different techniques, e.g. +``LD_PRELOAD`` (see ``mptcpize``), eBPF (see ``mptcpify``), SystemTAP, +``GODEBUG`` (``GODEBUG=multipathtcp=1``), etc. + +Switching to ``IPPROTO_MPTCP`` instead of ``IPPROTO_TCP`` should be as +transparent as possible for the userspace applications. + +Socket options +-------------- + +MPTCP supports most socket options handled by TCP. It is possible some less +common options are not supported, but contributions are welcome. + +Generally, the same value is propagated to all subflows, including the ones +created after the calls to ``setsockopt()``. eBPF can be used to set different +values per subflow. + +There are some MPTCP specific socket options at the ``SOL_MPTCP`` (284) level to +retrieve info. They fill the ``optval`` buffer of the ``getsockopt()`` system +call: + +- ``MPTCP_INFO``: Uses ``struct mptcp_info``. +- ``MPTCP_TCPINFO``: Uses ``struct mptcp_subflow_data``, followed by an array of + ``struct tcp_info``. +- ``MPTCP_SUBFLOW_ADDRS``: Uses ``struct mptcp_subflow_data``, followed by an + array of ``mptcp_subflow_addrs``. +- ``MPTCP_FULL_INFO``: Uses ``struct mptcp_full_info``, with one pointer to an + array of ``struct mptcp_subflow_info`` (including the + ``struct mptcp_subflow_addrs``), and one pointer to an array of + ``struct tcp_info``, followed by the content of ``struct mptcp_info``. + +Note that at the TCP level, ``TCP_IS_MPTCP`` socket option can be used to know +if MPTCP is currently being used: the value will be set to 1 if it is. + + +Design choices +============== + +A new socket type has been added for MPTCP for the userspace-facing socket. The +kernel is in charge of creating subflow sockets: they are TCP sockets where the +behavior is modified using TCP-ULP. + +MPTCP listen sockets will create "plain" *accepted* TCP sockets if the +connection request from the client didn't ask for MPTCP, making the performance +impact minimal when MPTCP is enabled by default. diff --git a/MAINTAINERS b/MAINTAINERS index 008a26b0f027d..e73f18b24e000 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15042,7 +15042,7 @@ B: https://github.com/multipath-tcp/mptcp_net-next/issues T: git https://github.com/multipath-tcp/mptcp_net-next.git export-net T: git https://github.com/multipath-tcp/mptcp_net-next.git export F: Documentation/netlink/specs/mptcp_pm.yaml -F: Documentation/networking/mptcp-sysctl.rst +F: Documentation/networking/mptcp*.rst F: include/net/mptcp.h F: include/trace/events/mptcp.h F: include/uapi/linux/mptcp*.h -- Gitee From 6529c26dd3f7ec72a534325490143d1129fa3cdd Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 5 Jun 2024 09:15:40 +0200 Subject: [PATCH 7/9] mptcp: use mptcp_win_from_space helper mainline inclusion from mainline-v6.11-rc1 commit 5f0d0649c83f72399c19b18591ea1413ca94c015 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9VYQ9 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=5f0d0649c83f72399c19b18591ea1413ca94c015 -------------------------------- The MPTCP dedicated win_from_space helper mptcp_win_from_space() is defined in protocol.h, use it in mptcp_rcv_space_adjust() instead of using the TCP one. Here scaling_ratio is the same as msk->scaling_ratio. Signed-off-by: Geliang Tang Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- net/mptcp/protocol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 63e16eba83aa6..2d5b48efeb23b 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2061,7 +2061,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) if (rcvbuf > sk->sk_rcvbuf) { u32 window_clamp; - window_clamp = __tcp_win_from_space(scaling_ratio, rcvbuf); + window_clamp = mptcp_win_from_space(sk, rcvbuf); WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); /* Make subflows follow along. If we do not do this, we -- Gitee From ec0aff17fdf42358d1431a80b9bf8dba57aa2c64 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 5 Jun 2024 09:15:41 +0200 Subject: [PATCH 8/9] mptcp: add mptcp_space_from_win helper mainline inclusion from mainline-v6.11-rc1 commit 5cdedad62eaba22c45b9c45c8199bacd461afd87 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9VYQ9 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=5cdedad62eaba22c45b9c45c8199bacd461afd87 -------------------------------- As a wrapper of __tcp_space_from_win(), this patch adds a MPTCP dedicated space_from_win helper mptcp_space_from_win() in protocol.h to paired with mptcp_win_from_space(). Use it instead of __tcp_space_from_win() in both mptcp_rcv_space_adjust() and mptcp_set_rcvlowat(). Signed-off-by: Geliang Tang Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- net/mptcp/protocol.c | 2 +- net/mptcp/protocol.h | 5 +++++ net/mptcp/sockopt.c | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 2d5b48efeb23b..05270dababa7c 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2055,7 +2055,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) do_div(grow, msk->rcvq_space.space); rcvwin += (grow << 1); - rcvbuf = min_t(u64, __tcp_space_from_win(scaling_ratio, rcvwin), + rcvbuf = min_t(u64, mptcp_space_from_win(sk, rcvwin), READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); if (rcvbuf > sk->sk_rcvbuf) { diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 6b2cfe96c9099..3bfa29b59f3a2 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -384,6 +384,11 @@ static inline int mptcp_win_from_space(const struct sock *sk, int space) return __tcp_win_from_space(mptcp_sk(sk)->scaling_ratio, space); } +static inline int mptcp_space_from_win(const struct sock *sk, int win) +{ + return __tcp_space_from_win(mptcp_sk(sk)->scaling_ratio, win); +} + static inline int __mptcp_space(const struct sock *sk) { return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk)); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index ee3ecd3af4722..6df94572bc0cb 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -1576,7 +1576,7 @@ int mptcp_set_rcvlowat(struct sock *sk, int val) if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) return 0; - space = __tcp_space_from_win(mptcp_sk(sk)->scaling_ratio, val); + space = mptcp_space_from_win(sk, val); if (space <= sk->sk_rcvbuf) return 0; -- Gitee From 248fef7833300b748cc71d06bb369cc8691df799 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 5 Jun 2024 09:15:42 +0200 Subject: [PATCH 9/9] mptcp: refer to 'MPTCP' socket in comments mainline inclusion from mainline-v6.11-rc1 commit 92f74c1e05b044b51398d6d4a85e659e4384f2cb category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I9VYQ9 CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=92f74c1e05b044b51398d6d4a85e659e4384f2cb -------------------------------- We used to call it 'master' socket at the early stages of MPTCP development, but the correct wording is 'MPTCP' socket opposed to 'TCP subflows': convert the last 3 comments to use a more appropriate term. Signed-off-by: Davide Caratti Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni --- net/mptcp/protocol.c | 4 ++-- net/mptcp/subflow.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 05270dababa7c..d3a81296f7cd3 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2217,7 +2217,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk)) continue; - /* only the master socket status is relevant here. The exit + /* only the MPTCP socket status is relevant here. The exit * conditions mirror closely tcp_recvmsg() */ if (copied >= target) @@ -3548,7 +3548,7 @@ void mptcp_subflow_process_delegated(struct sock *ssk, long status) static int mptcp_hash(struct sock *sk) { /* should never be called, - * we hash the TCP subflows not the master socket + * we hash the TCP subflows not the MPTCP socket */ WARN_ON_ONCE(1); return 0; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 719dc80c0bf57..4897e47c4edab 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1721,7 +1721,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, mptcp_sockopt_sync_locked(mptcp_sk(sk), sf->sk); release_sock(sf->sk); - /* the newly created socket really belongs to the owning MPTCP master + /* the newly created socket really belongs to the owning MPTCP * socket, even if for additional subflows the allocation is performed * by a kernel workqueue. Adjust inode references, so that the * procfs/diag interfaces really show this one belonging to the correct -- Gitee