From e256088d4d58e1688e43b3b69fd6ecede79f9bf9 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 16 Aug 2023 09:11:56 +0800 Subject: [PATCH 1/5] bpf: Add update_socket_protocol hook commit 0dd061a6a115f25132989cbd591a25afb2dee086 upstream. Add a hook named update_socket_protocol in __sys_socket(), for bpf progs to attach to and update socket protocol. One use case is to force legacy TCP apps to create and use MPTCP sockets instead of TCP ones. Define a fmod_ret set named bpf_mptcp_fmodret_ids, add the hook update_socket_protocol into this set, and register it in bpf_mptcp_kfunc_init(). D. Wythe: Some additional adjustments were made in the adaptation for 5.10, and this patch does not fully align with the upstream. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/79 Acked-by: Matthieu Baerts Acked-by: Yonghong Song Signed-off-by: Geliang Tang Link: https://lore.kernel.org/r/ac84be00f97072a46f8a72b4e2be46cbb7fa5053.1692147782.git.geliang.tang@suse.com Signed-off-by: Martin KaFai Lau Signed-off-by: D. 
Wythe --- kernel/bpf/verifier.c | 7 +++++++ net/socket.c | 26 +++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 31f2fc2c3a40..2df03143b24e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12875,6 +12875,10 @@ BTF_ID(func, check_preempt_wakeup) BTF_ID(func, select_idle_cpu) BTF_SET_END(btf_customized_allow_fmodret) +BTF_SET_START(bpf_mptcp_fmodret_ids) +BTF_ID(func, update_socket_protocol) +BTF_SET_END(bpf_mptcp_fmodret_ids) + int sysctl_bpf_customized_fmodret __read_mostly; static int check_attach_modify_return(unsigned long addr, const char *func_name, u32 btf_id) { @@ -12886,6 +12890,9 @@ static int check_attach_modify_return(unsigned long addr, const char *func_name, btf_id_set_contains(&btf_customized_allow_fmodret, btf_id)) return 0; + if (btf_id_set_contains(&bpf_mptcp_fmodret_ids, btf_id)) + return 0; + return -EINVAL; } diff --git a/net/socket.c b/net/socket.c index 2c3e14d5382a..c7e567671a83 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1507,6 +1507,29 @@ int sock_create_kern(struct net *net, int family, int type, int protocol, struct } EXPORT_SYMBOL(sock_create_kern); +/* A hook for bpf progs to attach to and update socket protocol. + * + * A static noinline declaration here could cause the compiler to + * optimize away the function. A global noinline declaration will + * keep the definition, but may optimize away the callsite. + * Therefore, __weak is needed to ensure that the call is still + * emitted, by telling the compiler that we don't know what the + * function might eventually be. + * + * __diag_* below are needed to dismiss the missing prototype warning. 
+ */ + +__diag_push(); +__diag_ignore(GCC, 8, "-Wmissing-prototypes", + "A fmod_ret entry point for BPF programs"); +__weak noinline int update_socket_protocol(int family, int type, int protocol) +{ + return protocol; +} + +__diag_pop(); + + int __sys_socket(int family, int type, int protocol) { int retval; @@ -1536,7 +1559,8 @@ int __sys_socket(int family, int type, int protocol) SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol) { - return __sys_socket(family, type, protocol); + return __sys_socket(family, type, + update_socket_protocol(family, type, protocol)); } /* -- Gitee From 276b26a286080db18fb1d3bece908874dbca2cbf Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Fri, 24 May 2024 10:32:27 +0800 Subject: [PATCH 2/5] anolis: net/smc: restore the correspondence between sk_{snd|rcv}buf and sndbuf/RMB ANBZ: #9167 The patch[1], which was merged from upstream, changes the correspondence between sk_sndbuf/sk_rcvbuf and sndbuf/RMB. - the sk_{snd|rcv}buf value is equal to 2 * sysctl net.smc.{w|r}mem, and is equal to 2 * the value set by setsockopt(SO_{SND|RCV}BUF); - the sndbuf/RMB size is equal to sk_sndbuf/sk_rcvbuf value / 2; Therefore, the size of sndbuf/RMB will be the value of sysctl net.smc.{w|r}mem or the value set by setsockopt(SO_{SND|RCV}BUF). However, this is inconsistent with the behavior of TCP, and SMC is mainly used to transparently upgrade TCP. Therefore, the relationship between sk_{snd|rcv}buf and sndbuf/RMB in SMC is restored to that before the introduction of [1], that is: - the sk_{snd|rcv}buf value is equal to sysctl net.smc.{w|r}mem, and is equal to 2 * the value set by setsockopt(SO_{SND|RCV}BUF); - the sndbuf/RMB size is equal to sk_sndbuf/sk_rcvbuf value; Then, the size of sndbuf/RMB will be the value of sysctl net.smc.{w|r}mem or *2 times* the value set by setsockopt(SO_{SND|RCV}BUF). 
[1] 8658623023f7 ("net/smc: Fix setsockopt and sysctl to specify same buffer size again") Fixes: 8658623023f7 ("net/smc: Fix setsockopt and sysctl to specify same buffer size again") Fixes: 9eedcb7a19cd ("net/smc: Use correct buffer sizes when switching between TCP and SMC") Signed-off-by: Wen Gu --- net/smc/af_smc.c | 10 +++++----- net/smc/smc_core.c | 9 ++++----- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 6e45b3038f28..03e9596f32c8 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -480,7 +480,7 @@ static void smc_sock_init_passive(struct sock *par, struct sock *sk) smc_sock_init_common(sk); smc_sk(sk)->listen_smc = parent; /* restore the smc_sk_sndbuf before handshake */ - smc_sk(sk)->smc_sk_sndbuf = 2 * READ_ONCE(sock_net(sk)->smc.sysctl_wmem); + smc_sk(sk)->smc_sk_sndbuf = READ_ONCE(sock_net(sk)->smc.sysctl_wmem); smc_sock_clone_negotiator_ops(par, sk); @@ -503,8 +503,8 @@ static void smc_sock_init(struct sock *sk, struct net *net) INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_LIST_HEAD(&smc->accept_q); spin_lock_init(&smc->accept_q_lock); - WRITE_ONCE(sk->sk_sndbuf, 2 * READ_ONCE(net->smc.sysctl_wmem)); - WRITE_ONCE(sk->sk_rcvbuf, 2 * READ_ONCE(net->smc.sysctl_rmem)); + WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem)); + WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem)); smc->limit_smc_hs = net->smc.limit_smc_hs; smc_sock_assign_negotiator_ops(smc, "anolis"); @@ -610,7 +610,7 @@ static void smc_adjust_sock_bufsizes(struct sock *nsk, struct sock *osk, READ_ONCE(nnet->ipv4.sysctl_tcp_wmem[1])); else WRITE_ONCE(nsk->sk_sndbuf, - 2 * READ_ONCE(nnet->smc.sysctl_wmem)); + READ_ONCE(nnet->smc.sysctl_wmem)); } if (osk->sk_userlocks & SOCK_RCVBUF_LOCK) { nsk->sk_rcvbuf = osk->sk_rcvbuf; @@ -620,7 +620,7 @@ static void smc_adjust_sock_bufsizes(struct sock *nsk, struct sock *osk, READ_ONCE(nnet->ipv4.sysctl_tcp_rmem[1])); else WRITE_ONCE(nsk->sk_rcvbuf, - 2 * 
READ_ONCE(nnet->smc.sysctl_rmem)); + READ_ONCE(nnet->smc.sysctl_rmem)); } } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 22db9b0b864c..b388ac099874 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -2487,11 +2487,10 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (is_rmb) /* use socket recv buffer size (w/o overhead) as start value */ - bufsize = smc->sk.sk_rcvbuf / 2; + bufsize = smc->sk.sk_rcvbuf; else /* only inet sock will set smc_sk_sndbuf, for smc sock it is zero */ - bufsize = smc->smc_sk_sndbuf ? - smc->smc_sk_sndbuf / 2 : smc->sk.sk_sndbuf / 2; + bufsize = smc->smc_sk_sndbuf ?: smc->sk.sk_sndbuf; for (bufsize_comp = smc_compress_bufsize(bufsize, is_smcd, is_rmb); bufsize_comp >= 0; bufsize_comp--) { @@ -2550,7 +2549,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (is_rmb) { conn->rmb_desc = buf_desc; conn->rmbe_size_comp = bufsize_comp; - smc->sk.sk_rcvbuf = bufsize * 2; + smc->sk.sk_rcvbuf = bufsize; atomic_set(&conn->bytes_to_rcv, 0); conn->rmbe_update_limit = smc_rmb_wnd_update_limit(buf_desc->len); @@ -2558,7 +2557,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */ } else { conn->sndbuf_desc = buf_desc; - smc->sk.sk_sndbuf = bufsize * 2; + smc->sk.sk_sndbuf = bufsize; atomic_set(&conn->sndbuf_space, bufsize); } return 0; -- Gitee From 19f967a9d4f3fa9161b5304bebbe7124ac2c3514 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 10 Feb 2021 06:41:44 -0800 Subject: [PATCH 3/5] net: initialize net->net_cookie at netns setup It is simpler to make net->net_cookie a plain u64 written once in setup_net() instead of looping and using atomic64 helpers. Lorenz Bauer wants to add the SO_NETNS_COOKIE socket option and this patch would make his patch series simpler. 
Signed-off-by: Eric Dumazet Cc: Daniel Borkmann Cc: Lorenz Bauer Acked-by: Daniel Borkmann Tested-by: Lorenz Bauer Signed-off-by: David S. Miller --- include/net/net_namespace.h | 4 +--- net/core/filter.c | 8 +++----- net/core/net_namespace.c | 19 +++---------------- 3 files changed, 7 insertions(+), 24 deletions(-) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 098cbc479acc..807cd375d0ba 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -174,7 +174,7 @@ struct net { struct netns_xfrm xfrm; #endif - atomic64_t net_cookie; /* written once */ + u64 net_cookie; /* written once */ #if IS_ENABLED(CONFIG_IP_VS) struct netns_ipvs *ipvs; @@ -249,8 +249,6 @@ extern struct list_head net_namespace_list; struct net *get_net_ns_by_pid(pid_t pid); struct net *get_net_ns_by_fd(int fd); -u64 __net_gen_cookie(struct net *net); - #ifdef CONFIG_SYSCTL void ipx_register_sysctl(void); void ipx_unregister_sysctl(void); diff --git a/net/core/filter.c b/net/core/filter.c index 0bc3b4eafa7d..021c1e7a13fd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4664,11 +4664,9 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = { static u64 __bpf_get_netns_cookie(struct sock *sk) { -#ifdef CONFIG_NET_NS - return __net_gen_cookie(sk ? sk->sk_net.net : &init_net); -#else - return 0; -#endif + const struct net *net = sk ? 
sock_net(sk) : &init_net; + + return net->net_cookie; } BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index cbff7d94b993..79591408d3ec 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -72,18 +72,6 @@ static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; DEFINE_COOKIE(net_cookie); -u64 __net_gen_cookie(struct net *net) -{ - while (1) { - u64 res = atomic64_read(&net->net_cookie); - - if (res) - return res; - res = gen_cookie_next(&net_cookie); - atomic64_cmpxchg(&net->net_cookie, 0, res); - } -} - static struct net_generic *net_alloc_generic(void) { struct net_generic *ng; @@ -334,6 +322,9 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) refcount_set(&net->count, 1); refcount_set(&net->passive, 1); get_random_bytes(&net->hash_mix, sizeof(u32)); + preempt_disable(); + net->net_cookie = gen_cookie_next(&net_cookie); + preempt_enable(); net->dev_base_seq = 1; net->user_ns = user_ns; idr_init(&net->netns_ids); @@ -1121,10 +1112,6 @@ static int __init net_ns_init(void) rcu_assign_pointer(init_net.gen, ng); - preempt_disable(); - __net_gen_cookie(&init_net); - preempt_enable(); - down_write(&pernet_ops_rwsem); if (setup_net(&init_net, &init_user_ns)) panic("Could not setup the initial network namespace"); -- Gitee From 97c75e8e1da6a99cae77ba183e0aa9b88459baff Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Mon, 27 May 2024 19:05:59 +0800 Subject: [PATCH 4/5] anolis: net/smc: fix incorrect sndbuf setting Fix an incorrect judgment, which results in the function smc_copy_sock_settings_to_clc() always returning prematurely. Make sure smc_sk_sndbuf is set only when it's an inet sock. Fixes: 63c0f133958a ("anolis: net/smc: fix incorrect sk_buf_size via inet sock") Signed-off-by: D. 
Wythe --- net/smc/af_smc.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 03e9596f32c8..31e0b839e275 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -480,8 +480,8 @@ static void smc_sock_init_passive(struct sock *par, struct sock *sk) smc_sock_init_common(sk); smc_sk(sk)->listen_smc = parent; /* restore the smc_sk_sndbuf before handshake */ - smc_sk(sk)->smc_sk_sndbuf = READ_ONCE(sock_net(sk)->smc.sysctl_wmem); - + if (smc_sock_is_inet_sock(sk)) + smc_sk(sk)->smc_sk_sndbuf = READ_ONCE(sk->sk_sndbuf); smc_sock_clone_negotiator_ops(par, sk); clcsk = smc_sock_is_inet_sock(sk) ? sk : smc_sk(sk)->clcsock->sk; @@ -595,6 +595,11 @@ static int smc_bind(struct socket *sock, struct sockaddr *uaddr, (1UL << SOCK_FILTER_LOCKED) | \ (1UL << SOCK_TSTAMP_NEW)) +#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \ + (1UL << SOCK_KEEPOPEN) | \ + (1UL << SOCK_LINGER) | \ + (1UL << SOCK_DBG)) + /* if set, use value set by setsockopt() - else use IPv4 or SMC sysctl value */ static void smc_adjust_sock_bufsizes(struct sock *nsk, struct sock *osk, unsigned long mask) @@ -628,7 +633,8 @@ static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, unsigned long mask) { /* no need for inet smc */ - if (smc_sock_is_inet_sock(nsk)) + if (((mask == SK_FLAGS_SMC_TO_CLC) && smc_sock_is_inet_sock(osk)) || + ((mask == SK_FLAGS_CLC_TO_SMC) && smc_sock_is_inet_sock(nsk))) return; /* options we don't get control via setsockopt for */ @@ -652,10 +658,6 @@ static void smc_copy_sock_settings_to_clc(struct smc_sock *smc) smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC); } -#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \ - (1UL << SOCK_KEEPOPEN) | \ - (1UL << SOCK_LINGER) | \ - (1UL << SOCK_DBG)) /* copy only settings and flags relevant for smc from clc to smc socket */ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) { -- Gitee From 
c2745bd74869b8f6b96af792152c871e3391c6d9 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Thu, 19 Dec 2024 20:06:41 +0800 Subject: [PATCH 5/5] anolis: net/smc: fix deadlock on inet sock with cgroup bpf Signed-off-by: D. Wythe --- net/smc/smc_inet.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_inet.c b/net/smc/smc_inet.c index d430890a2314..21e657204ad4 100644 --- a/net/smc/smc_inet.c +++ b/net/smc/smc_inet.c @@ -415,12 +415,21 @@ static int smc_inet_clcsock_release(struct socket *sock) static int smc_inet_clcsock_getname(struct socket *sock, struct sockaddr *addr, int peer) { - return sock->sk->sk_family == PF_INET ? inet_getname(sock, addr, peer) : + struct smc_sock *smc = smc_sk(sock->sk); + int err; + + release_sock(sock->sk); + err = sock->sk->sk_family == PF_INET ? inet_getname(sock, addr, peer) : #if IS_ENABLED(CONFIG_IPV6) inet6_getname(sock, addr, peer); #else -EINVAL; #endif + /* since we release sock before, there might be state changed */ + lock_sock(sock->sk); + if (smc_sk_state(&smc->sk) != SMC_INIT) + err = -ENOTCONN; + return err; } static __poll_t smc_inet_clcsock_poll(struct file *file, struct socket *sock, -- Gitee