diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 5b75c3f7a137b3dbde03903cb2a6a61c0f2facea..7a356dc40ca2e7fa18f412b0211a6453a10302e8 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -72,6 +72,7 @@ Contents: mac80211-injection mctp mpls-sysctl + mptcp mptcp-sysctl multiqueue napi diff --git a/Documentation/networking/mptcp-sysctl.rst b/Documentation/networking/mptcp-sysctl.rst index 69975ce25a029400e4e55779ee8220a322b930f9..fd514bba8c431ca6489d76fc4d6b0a25ff52f490 100644 --- a/Documentation/networking/mptcp-sysctl.rst +++ b/Documentation/networking/mptcp-sysctl.rst @@ -7,14 +7,6 @@ MPTCP Sysfs variables /proc/sys/net/mptcp/* Variables =============================== -enabled - BOOLEAN - Control whether MPTCP sockets can be created. - - MPTCP sockets can be created if the value is 1. This is a - per-namespace sysctl. - - Default: 1 (enabled) - add_addr_timeout - INTEGER (seconds) Set the timeout after which an ADD_ADDR control message will be resent to an MPTCP peer that has not acknowledged a previous @@ -25,16 +17,22 @@ add_addr_timeout - INTEGER (seconds) Default: 120 -close_timeout - INTEGER (seconds) - Set the make-after-break timeout: in absence of any close or - shutdown syscall, MPTCP sockets will maintain the status - unchanged for such time, after the last subflow removal, before - moving to TCP_CLOSE. +allow_join_initial_addr_port - BOOLEAN + Allow peers to send join requests to the IP address and port number used + by the initial subflow if the value is 1. This controls a flag that is + sent to the peer at connection time, and whether such join requests are + accepted or denied. - The default value matches TCP_TIMEWAIT_LEN. This is a per-namespace - sysctl. + Joins to addresses advertised with ADD_ADDR are not affected by this + value. - Default: 60 + This is a per-namespace sysctl. + + Default: 1 + +available_schedulers - STRING + Shows the available schedulers choices that are registered. More packet + schedulers may be available, but not loaded. checksum_enabled - BOOLEAN Control whether DSS checksum can be enabled. @@ -44,18 +42,24 @@ checksum_enabled - BOOLEAN Default: 0 -allow_join_initial_addr_port - BOOLEAN - Allow peers to send join requests to the IP address and port number used - by the initial subflow if the value is 1. This controls a flag that is - sent to the peer at connection time, and whether such join requests are - accepted or denied. +close_timeout - INTEGER (seconds) + Set the make-after-break timeout: in absence of any close or + shutdown syscall, MPTCP sockets will maintain the status + unchanged for such time, after the last subflow removal, before + moving to TCP_CLOSE. - Joins to addresses advertised with ADD_ADDR are not affected by this - value. + The default value matches TCP_TIMEWAIT_LEN. This is a per-namespace + sysctl. - This is a per-namespace sysctl. + Default: 60 - Default: 1 +enabled - BOOLEAN + Control whether MPTCP sockets can be created. + + MPTCP sockets can be created if the value is 1. This is a + per-namespace sysctl. + + Default: 1 (enabled) pm_type - INTEGER Set the default path manager type to use for each new MPTCP @@ -74,6 +78,14 @@ pm_type - INTEGER Default: 0 +scheduler - STRING + Select the scheduler of your choice. + + Support for selection of different schedulers. This is a per-namespace + sysctl. + + Default: "default" + stale_loss_cnt - INTEGER The number of MPTCP-level retransmission intervals with no traffic and pending outstanding data on a given subflow required to declare it stale. @@ -85,11 +97,3 @@ stale_loss_cnt - INTEGER This is a per-namespace sysctl. Default: 4 - -scheduler - STRING - Select the scheduler of your choice. - - Support for selection of different schedulers. This is a per-namespace - sysctl. - - Default: "default" diff --git a/Documentation/networking/mptcp.rst b/Documentation/networking/mptcp.rst new file mode 100644 index 0000000000000000000000000000000000000000..17f2bab611644727e19c3969fa08fa974c702d92 --- /dev/null +++ b/Documentation/networking/mptcp.rst @@ -0,0 +1,156 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================== +Multipath TCP (MPTCP) +===================== + +Introduction +============ + +Multipath TCP or MPTCP is an extension to the standard TCP and is described in +`RFC 8684 (MPTCPv1) `_. It allows a +device to make use of multiple interfaces at once to send and receive TCP +packets over a single MPTCP connection. MPTCP can aggregate the bandwidth of +multiple interfaces or prefer the one with the lowest latency. It also allows a +fail-over if one path is down, and the traffic is seamlessly reinjected on other +paths. + +For more details about Multipath TCP in the Linux kernel, please see the +official website: `mptcp.dev `_. + + +Use cases +========= + +Thanks to MPTCP, being able to use multiple paths in parallel or simultaneously +brings new use-cases, compared to TCP: + +- Seamless handovers: switching from one path to another while preserving + established connections, e.g. to be used in mobility use-cases, like on + smartphones. +- Best network selection: using the "best" available path depending on some + conditions, e.g. latency, losses, cost, bandwidth, etc. +- Network aggregation: using multiple paths at the same time to have a higher + throughput, e.g. to combine fixed and mobile networks to send files faster. + + +Concepts +======== + +Technically, when a new socket is created with the ``IPPROTO_MPTCP`` protocol +(Linux-specific), a *subflow* (or *path*) is created. This *subflow* consists of +a regular TCP connection that is used to transmit data through one interface. +Additional *subflows* can be negotiated later between the hosts. For the remote +host to be able to detect the use of MPTCP, a new field is added to the TCP +*option* field of the underlying TCP *subflow*. This field contains, amongst +other things, a ``MP_CAPABLE`` option that tells the other host to use MPTCP if +it is supported. If the remote host or any middlebox in between does not support +it, the returned ``SYN+ACK`` packet will not contain MPTCP options in the TCP +*option* field. In that case, the connection will be "downgraded" to plain TCP, +and it will continue with a single path. + +This behavior is made possible by two internal components: the path manager, and +the packet scheduler. + +Path Manager +------------ + +The Path Manager is in charge of *subflows*, from creation to deletion, and also +address announcements. Typically, it is the client side that initiates subflows, +and the server side that announces additional addresses via the ``ADD_ADDR`` and +``REMOVE_ADDR`` options. + +Path managers are controlled by the ``net.mptcp.pm_type`` sysctl knob -- see +mptcp-sysctl.rst. There are two types: the in-kernel one (type ``0``) where the +same rules are applied for all the connections (see: ``ip mptcp``) ; and the +userspace one (type ``1``), controlled by a userspace daemon (i.e. `mptcpd +`_) where different rules can be applied for each +connection. The path managers can be controlled via a Netlink API; see +netlink_spec/mptcp_pm.rst. + +To be able to use multiple IP addresses on a host to create multiple *subflows* +(paths), the default in-kernel MPTCP path-manager needs to know which IP +addresses can be used. This can be configured with ``ip mptcp endpoint`` for +example. + +Packet Scheduler +---------------- + +The Packet Scheduler is in charge of selecting which available *subflow(s)* to +use to send the next data packet. It can decide to maximize the use of the +available bandwidth, only to pick the path with the lower latency, or any other +policy depending on the configuration. + +Packet schedulers are controlled by the ``net.mptcp.scheduler`` sysctl knob -- +see mptcp-sysctl.rst. + + +Sockets API +=========== + +Creating MPTCP sockets +---------------------- + +On Linux, MPTCP can be used by selecting MPTCP instead of TCP when creating the +``socket``: + +.. code-block:: C + + int sd = socket(AF_INET(6), SOCK_STREAM, IPPROTO_MPTCP); + +Note that ``IPPROTO_MPTCP`` is defined as ``262``. + +If MPTCP is not supported, ``errno`` will be set to: + +- ``EINVAL``: (*Invalid argument*): MPTCP is not available, on kernels < 5.6. +- ``EPROTONOSUPPORT`` (*Protocol not supported*): MPTCP has not been compiled, + on kernels >= v5.6. +- ``ENOPROTOOPT`` (*Protocol not available*): MPTCP has been disabled using + ``net.mptcp.enabled`` sysctl knob; see mptcp-sysctl.rst. + +MPTCP is then opt-in: applications need to explicitly request it. Note that +applications can be forced to use MPTCP with different techniques, e.g. +``LD_PRELOAD`` (see ``mptcpize``), eBPF (see ``mptcpify``), SystemTAP, +``GODEBUG`` (``GODEBUG=multipathtcp=1``), etc. + +Switching to ``IPPROTO_MPTCP`` instead of ``IPPROTO_TCP`` should be as +transparent as possible for the userspace applications. + +Socket options +-------------- + +MPTCP supports most socket options handled by TCP. It is possible some less +common options are not supported, but contributions are welcome. + +Generally, the same value is propagated to all subflows, including the ones +created after the calls to ``setsockopt()``. eBPF can be used to set different +values per subflow. + +There are some MPTCP specific socket options at the ``SOL_MPTCP`` (284) level to +retrieve info. They fill the ``optval`` buffer of the ``getsockopt()`` system +call: + +- ``MPTCP_INFO``: Uses ``struct mptcp_info``. +- ``MPTCP_TCPINFO``: Uses ``struct mptcp_subflow_data``, followed by an array of + ``struct tcp_info``. +- ``MPTCP_SUBFLOW_ADDRS``: Uses ``struct mptcp_subflow_data``, followed by an + array of ``mptcp_subflow_addrs``. +- ``MPTCP_FULL_INFO``: Uses ``struct mptcp_full_info``, with one pointer to an + array of ``struct mptcp_subflow_info`` (including the + ``struct mptcp_subflow_addrs``), and one pointer to an array of + ``struct tcp_info``, followed by the content of ``struct mptcp_info``. + +Note that at the TCP level, ``TCP_IS_MPTCP`` socket option can be used to know +if MPTCP is currently being used: the value will be set to 1 if it is. + + +Design choices +============== + +A new socket type has been added for MPTCP for the userspace-facing socket. The +kernel is in charge of creating subflow sockets: they are TCP sockets where the +behavior is modified using TCP-ULP. + +MPTCP listen sockets will create "plain" *accepted* TCP sockets if the +connection request from the client didn't ask for MPTCP, making the performance +impact minimal when MPTCP is enabled by default. diff --git a/MAINTAINERS b/MAINTAINERS index 008a26b0f027d852f0ff5be17bcc3884584a3988..e73f18b24e00025633055bb3d774d827a495bf42 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15042,7 +15042,7 @@ B: https://github.com/multipath-tcp/mptcp_net-next/issues T: git https://github.com/multipath-tcp/mptcp_net-next.git export-net T: git https://github.com/multipath-tcp/mptcp_net-next.git export F: Documentation/netlink/specs/mptcp_pm.yaml -F: Documentation/networking/mptcp-sysctl.rst +F: Documentation/networking/mptcp*.rst F: include/net/mptcp.h F: include/trace/events/mptcp.h F: include/uapi/linux/mptcp*.h diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 63e16eba83aa699e855b24abf4f7947a8953d57d..d3a81296f7cd3426327195285d3784e6f578e45b 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2055,13 +2055,13 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) do_div(grow, msk->rcvq_space.space); rcvwin += (grow << 1); - rcvbuf = min_t(u64, __tcp_space_from_win(scaling_ratio, rcvwin), + rcvbuf = min_t(u64, mptcp_space_from_win(sk, rcvwin), READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); if (rcvbuf > sk->sk_rcvbuf) { u32 window_clamp; - window_clamp = __tcp_win_from_space(scaling_ratio, rcvbuf); + window_clamp = mptcp_win_from_space(sk, rcvbuf); WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); /* Make subflows follow along. If we do not do this, we @@ -2217,7 +2217,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk)) continue; - /* only the master socket status is relevant here. The exit + /* only the MPTCP socket status is relevant here. The exit * conditions mirror closely tcp_recvmsg() */ if (copied >= target) @@ -3548,7 +3548,7 @@ void mptcp_subflow_process_delegated(struct sock *ssk, long status) static int mptcp_hash(struct sock *sk) { /* should never be called, - * we hash the TCP subflows not the master socket + * we hash the TCP subflows not the MPTCP socket */ WARN_ON_ONCE(1); return 0; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 6b2cfe96c9099bd741eba738817ee2cd84718a26..3bfa29b59f3a2470c5955f4982be88829c144e73 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -384,6 +384,11 @@ static inline int mptcp_win_from_space(const struct sock *sk, int space) return __tcp_win_from_space(mptcp_sk(sk)->scaling_ratio, space); } +static inline int mptcp_space_from_win(const struct sock *sk, int win) +{ + return __tcp_space_from_win(mptcp_sk(sk)->scaling_ratio, win); +} + static inline int __mptcp_space(const struct sock *sk) { return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk)); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index ee3ecd3af47220ea74b19d1fc201d4129d91d9b7..6df94572bc0cb8b493c0cb23470cf64dc749f48d 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -1576,7 +1576,7 @@ int mptcp_set_rcvlowat(struct sock *sk, int val) if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) return 0; - space = __tcp_space_from_win(mptcp_sk(sk)->scaling_ratio, val); + space = mptcp_space_from_win(sk, val); if (space <= sk->sk_rcvbuf) return 0; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 719dc80c0bf57d62ee849b8a18647173ac68c3a4..4897e47c4edab27c29fbc80e7587b9530e8fe094 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1721,7 +1721,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, mptcp_sockopt_sync_locked(mptcp_sk(sk), sf->sk); release_sock(sf->sk); - /* the newly created socket really belongs to the owning MPTCP master + /* the newly created socket really belongs to the owning MPTCP * socket, even if for additional subflows the allocation is performed * by a kernel workqueue. Adjust inode references, so that the * procfs/diag interfaces really show this one belonging to the correct diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index a1df06f741ae1f787d411af093986d3189f939fc..05429017beb7f57da0982ef8c59bfe98dbf3a3b3 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -31,7 +31,6 @@ timeout_poll=30 timeout_test=$((timeout_poll * 2 + 1)) capture=false checksum=false -ip_mptcp=0 check_invert=0 validate_checksum=false init=0 @@ -262,6 +261,8 @@ reset() TEST_NAME="${1}" + MPTCP_LIB_SUBTEST_FLAKY=0 # reset if modified + if skip_test; then MPTCP_LIB_TEST_COUNTER=$((MPTCP_LIB_TEST_COUNTER+1)) last_test_ignored=1 @@ -455,7 +456,9 @@ reset_with_tcp_filter() # $1: err msg fail_test() { - ret=${KSFT_FAIL} + if ! mptcp_lib_subtest_is_flaky; then + ret=${KSFT_FAIL} + fi if [ ${#} -gt 0 ]; then print_fail "${@}" @@ -616,7 +619,7 @@ pm_nl_set_limits() local addrs=$2 local subflows=$3 - if [ $ip_mptcp -eq 1 ]; then + if mptcp_lib_is_ip_mptcp; then ip -n $ns mptcp limits set add_addr_accepted $addrs subflows $subflows else ip netns exec $ns ./pm_nl_ctl limits $addrs $subflows @@ -656,7 +659,7 @@ pm_nl_add_endpoint() nr=$((nr + 1)) done - if [ $ip_mptcp -eq 1 ]; then + if mptcp_lib_is_ip_mptcp; then ip -n $ns mptcp endpoint add $addr ${_flags//","/" "} $dev $id $port else ip netns exec $ns ./pm_nl_ctl add $addr $flags $dev $id $port @@ -669,7 +672,7 @@ pm_nl_del_endpoint() local id=$2 local addr=$3 - if [ $ip_mptcp -eq 1 ]; then + if mptcp_lib_is_ip_mptcp; then [ $id -ne 0 ] && addr='' ip -n $ns mptcp endpoint delete id $id $addr else @@ -681,7 +684,7 @@ pm_nl_flush_endpoint() { local ns=$1 - if [ $ip_mptcp -eq 1 ]; then + if mptcp_lib_is_ip_mptcp; then ip -n $ns mptcp endpoint flush else ip netns exec $ns ./pm_nl_ctl flush @@ -692,7 +695,7 @@ pm_nl_show_endpoints() { local ns=$1 - if [ $ip_mptcp -eq 1 ]; then + if mptcp_lib_is_ip_mptcp; then ip -n $ns mptcp endpoint show else ip netns exec $ns ./pm_nl_ctl dump @@ -705,7 +708,7 @@ pm_nl_change_endpoint() local id=$2 local flags=$3 - if [ $ip_mptcp -eq 1 ]; then + if mptcp_lib_is_ip_mptcp; then ip -n $ns mptcp endpoint change id $id ${flags//","/" "} else ip netns exec $ns ./pm_nl_ctl set id $id flags $flags @@ -755,7 +758,7 @@ pm_nl_check_endpoint() return fi - if [ $ip_mptcp -eq 1 ]; then + if mptcp_lib_is_ip_mptcp; then # get line and trim trailing whitespace line=$(ip -n $ns mptcp endpoint show $id) line="${line% }" @@ -3259,6 +3262,7 @@ fullmesh_tests() fastclose_tests() { if reset_check_counter "fastclose test" "MPTcpExtMPFastcloseTx"; then + MPTCP_LIB_SUBTEST_FLAKY=1 test_linkfail=1024 fastclose=client \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 0 0 0 @@ -3267,6 +3271,7 @@ fastclose_tests() fi if reset_check_counter "fastclose server test" "MPTcpExtMPFastcloseRx"; then + MPTCP_LIB_SUBTEST_FLAKY=1 test_linkfail=1024 fastclose=server \ run_tests $ns1 $ns2 10.0.1.1 chk_join_nr 0 0 0 0 0 0 1 @@ -3963,7 +3968,7 @@ while getopts "${all_tests_args}cCih" opt; do checksum=true ;; i) - ip_mptcp=1 + mptcp_lib_set_ip_mptcp ;; h) usage diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index d529b4b37af86954c90699e1b5e0cb376c147d8c..77cf3303e4fa1f9714690b7378d57b9e2d1cee8b 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -21,8 +21,10 @@ declare -rx MPTCP_LIB_AF_INET6=10 MPTCP_LIB_SUBTESTS=() MPTCP_LIB_SUBTESTS_DUPLICATED=0 +MPTCP_LIB_SUBTEST_FLAKY=0 MPTCP_LIB_TEST_COUNTER=0 MPTCP_LIB_TEST_FORMAT="%02u %-50s" +MPTCP_LIB_IP_MPTCP=0 # only if supported (or forced) and not disabled, see no-color.org if { [ -t 1 ] || [ "${SELFTESTS_MPTCP_LIB_COLOR_FORCE:-}" = "1" ]; } && @@ -40,6 +42,16 @@ else readonly MPTCP_LIB_COLOR_RESET= fi +# SELFTESTS_MPTCP_LIB_OVERRIDE_FLAKY env var can be set not to ignore errors +# from subtests marked as flaky +mptcp_lib_override_flaky() { + [ "${SELFTESTS_MPTCP_LIB_OVERRIDE_FLAKY:-}" = 1 ] +} + +mptcp_lib_subtest_is_flaky() { + [ "${MPTCP_LIB_SUBTEST_FLAKY}" = 1 ] && ! mptcp_lib_override_flaky +} + # $1: color, $2: text mptcp_lib_print_color() { echo -e "${MPTCP_LIB_START_PRINT:-}${*}${MPTCP_LIB_COLOR_RESET}" @@ -71,7 +83,16 @@ mptcp_lib_pr_skip() { } mptcp_lib_pr_fail() { - mptcp_lib_print_err "[FAIL]${1:+ ${*}}" + local title cmt + + if mptcp_lib_subtest_is_flaky; then + title="IGNO" + cmt=" (flaky)" + else + title="FAIL" + fi + + mptcp_lib_print_err "[${title}]${cmt}${1:+ ${*}}" } mptcp_lib_pr_info() { @@ -207,7 +228,13 @@ mptcp_lib_result_pass() { # $1: test name mptcp_lib_result_fail() { - __mptcp_lib_result_add "not ok" "${1}" + if mptcp_lib_subtest_is_flaky; then + # It might sound better to use 'not ok # TODO' or 'ok # SKIP', + # but some CIs don't understand 'TODO' and treat SKIP as errors. + __mptcp_lib_result_add "ok" "${1} # IGNORE Flaky" + else + __mptcp_lib_result_add "not ok" "${1}" + fi } # $1: test name @@ -505,3 +532,11 @@ mptcp_lib_verify_listener_events() { mptcp_lib_check_expected "type" "family" "saddr" "sport" || rc="${?}" return "${rc}" } + +mptcp_lib_set_ip_mptcp() { + MPTCP_LIB_IP_MPTCP=1 +} + +mptcp_lib_is_ip_mptcp() { + [ "${MPTCP_LIB_IP_MPTCP}" = "1" ] +}