diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..0f69df1b7ee28e3a07ad4817b7f60747b78db003
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,48 @@
+NewIP - New IP Stack
+Copyright (c) 2022 Huawei Device Co., Ltd. All rights reserved.
+
+NewIP is dual licensed: you can use it either under the terms of
+the GPL V2, or the BSD2 license, at your option.
+a) GNU General Public License version 2, (https://opensource.org/licenses/GPL-2.0)
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License as
+published by the Free Software Foundation; either version 2 of the
+License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public
+License along with this library; if not, write to the Free
+Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
+MA 02110-1301 USA
+
+Alternatively,
+b) The BSD2 License, (https://opensource.org/licenses/BSD-2-Clause)
+Redistribution and use in source and binary forms, with or
+without modification, are permitted provided that the following
+conditions are met:
+
+1. Redistributions of source code must retain the above
+   copyright notice, this list of conditions and the following
+   disclaimer.
+2. Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following
+   disclaimer in the documentation and/or other materials
+   provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/apply_newip.sh b/apply_newip.sh
new file mode 100755
index 0000000000000000000000000000000000000000..3d37496decbb84764dc77e14fac068835a9d2462
--- /dev/null
+++ b/apply_newip.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+#
+# Copyright (c) 2022 Huawei Device Co., Ltd.
+#
+# NewIP is dual licensed: you can use it either under the terms of
+# the GPL, or the BSD license, at your option.
+# See the LICENSE file in directory / of this repository for complete details.
+#
+
+set -e
+
+OHOS_SOURCE_ROOT=$1
+KERNEL_BUILD_ROOT=$2
+PRODUCT_NAME=$3
+KERNEL_VERSION=$4
+
+PATCH_FILE=$OHOS_SOURCE_ROOT/foundation/communication/sfc/newip/patches/$KERNEL_VERSION/newip.patch
+PRODUCT_SWITCH=$OHOS_SOURCE_ROOT/foundation/communication/sfc/newip/patches/$PRODUCT_NAME.flag
+function main()
+{
+	if [ ! -f $PATCH_FILE ]; then
+		echo "newip not supported! kernel=$KERNEL_VERSION"
+		return;
+	fi
+	if [ ! -f $PRODUCT_SWITCH ]; then
+		echo "newip not supported! product=$PRODUCT_NAME"
+ return; + fi + + + cd $KERNEL_BUILD_ROOT + echo "patch for newip..." + patch -p1 < $PATCH_FILE + + ln -sf $OHOS_SOURCE_ROOT/foundation/communication/sfc/newip/code/net/newip net/ + cp -arfL $OHOS_SOURCE_ROOT/foundation/communication/sfc/newip/code/include/* include/ + cd - +} + +main diff --git a/code/include/linux/newip_route.h b/code/include/linux/newip_route.h new file mode 100644 index 0000000000000000000000000000000000000000..d38c087369299a45afcd44a3bcbf6f8ae98ff10a --- /dev/null +++ b/code/include/linux/newip_route.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * Linux NewIP INET implementation + * + * Based on include/uapi/linux/ipv6_route.h + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _LINUX_NEWIP_ROUTE_H +#define _LINUX_NEWIP_ROUTE_H + +#include + +#endif + diff --git a/code/include/linux/nip.h b/code/include/linux/nip.h new file mode 100644 index 0000000000000000000000000000000000000000..d65846afd6ebd4d2231fd3f67f952bfdcd55dcfa --- /dev/null +++ b/code/include/linux/nip.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * Based on include/linux/ipv6.h + */ +#ifndef _NIP_H +#define _NIP_H + +#include +#include +#include +#include + +struct nip_devconf { + __s32 forwarding; + __s32 mtu; + __s32 ignore_routes_with_linkdown; + + __s32 disable_nip; + __s32 nndisc_notify; + __s32 use_oif_addrs_only; + __s32 keep_addr_on_down; + + struct ctl_table_header *sysctl_header; +}; + +/* This structure contains results of exthdrs parsing + * as offsets from skb->nh. + */ +#pragma pack(1) +struct ninet_skb_parm { + struct nip_addr dstaddr; + struct nip_addr srcaddr; + u8 nexthdr; +}; +#pragma pack() + +struct tcp_nip_request_sock { + struct tcp_request_sock tcp_nip_rsk_tcp; +}; + +struct nip_udp_sock { + struct udp_sock udp; +}; + +struct tcp_nip_sock { + struct tcp_sock tcp; +}; + +int find_nip_forward_stamp(struct net *net, void __user *arg); + +#endif /* _NIP_H */ diff --git a/code/include/linux/nip_addr.h b/code/include/linux/nip_addr.h new file mode 100644 index 0000000000000000000000000000000000000000..c43d217b32fe823e55032239048ad37c9bb2a061 --- /dev/null +++ b/code/include/linux/nip_addr.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: BSD-2-Clause */ +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, this list + * of conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _NIP_ADDR_H +#define _NIP_ADDR_H + +#include + +#endif /* _NIP_ADDR_H */ diff --git a/code/include/linux/nip_icmp.h b/code/include/linux/nip_icmp.h new file mode 100644 index 0000000000000000000000000000000000000000..bb67221e2be007f64c3b9e4a27481e1a765cef94 --- /dev/null +++ b/code/include/linux/nip_icmp.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * NewIP INET + * An implementation of the TCP/IP protocol suite for the LINUX + * operating system. NewIP INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Definitions for the NewIP ICMP protocol. + * + * Based on include/linux/icmp.h + */ +#ifndef _LINUX_NIP_ICMP_H +#define _LINUX_NIP_ICMP_H + +#include +#include +#include + +static inline struct nip_icmp_hdr *nip_icmp_header(const struct sk_buff *skb) +{ + return (struct nip_icmp_hdr *)skb_transport_header(skb); +} + +int nip_icmp_init(void); + +#endif diff --git a/code/include/net/flow_nip.h b/code/include/net/flow_nip.h new file mode 100644 index 0000000000000000000000000000000000000000..fe625d0b63d570fc418016d21e1aad0239d94a91 --- /dev/null +++ b/code/include/net/flow_nip.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * NewIP Generic internet FLOW. + * + * Based on include/net/flow.h + */ +#ifndef _NET_FLOW_NIP_H +#define _NET_FLOW_NIP_H + +#include + +struct flow_nip { + struct flowi_common __fl_common; +#define flowin_oif __fl_common.flowic_oif +#define flowin_iif __fl_common.flowic_iif + struct nip_addr daddr; + struct nip_addr saddr; + union flowi_uli uli; +#define fln_sport uli.ports.sport +#define fln_dport uli.ports.dport +} __attribute__((__aligned__(BITS_PER_LONG / 8))); + +#endif diff --git a/code/include/net/if_ninet.h b/code/include/net/if_ninet.h new file mode 100644 index 0000000000000000000000000000000000000000..e04df5c85bd9420fa8f3fb5235b5a6cb8ebaf6cd --- /dev/null +++ b/code/include/net/if_ninet.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * NewIP inet interface/address list definitions + * Linux NewIP INET implementation + * + * Based on include/net/if_inet6.h + */ +#ifndef _NET_IF_NINET_H +#define _NET_IF_NINET_H + +#include +#include + +enum { + NINET_IFADDR_STATE_NEW, + NINET_IFADDR_STATE_DEAD, +}; + +struct ninet_ifaddr { + struct nip_addr addr; + + /* In seconds, relative to tstamp. Expiry is at tstamp + HZ * lft. 
*/ + __u32 valid_lft; + __u32 preferred_lft; + refcount_t refcnt; + + /* protect one ifaddr itself */ + spinlock_t lock; + + int state; + + __u32 flags; + + unsigned long cstamp; /* created timestamp */ + unsigned long tstamp; /* updated timestamp */ + + struct ninet_dev *idev; + struct nip_rt_info *rt; + + struct hlist_node addr_lst; + struct list_head if_list; + + struct rcu_head rcu; +}; + +struct ninet_dev { + struct net_device *dev; + + struct list_head addr_list; + + rwlock_t lock; + refcount_t refcnt; + __u32 if_flags; + int dead; + + struct neigh_parms *nd_parms; + struct nip_devconf cnf; + + unsigned long tstamp; /* newip InterfaceTable update timestamp */ + struct rcu_head rcu; +}; + +#endif diff --git a/code/include/net/netns/nip.h b/code/include/net/netns/nip.h new file mode 100644 index 0000000000000000000000000000000000000000..ed9ceb2e2806d12e28e1fdd0a64c0881dc674559 --- /dev/null +++ b/code/include/net/netns/nip.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * NewIP in net namespaces + * + * Based on include/net/netns/ipv6.h + */ +#ifndef __NETNS_NEWIP_H__ +#define __NETNS_NEWIP_H__ + +#include +#include + +struct ctl_table_header; + +struct netns_sysctl_newip { + int nip_rt_gc_interval; +}; +struct netns_newip { + uint32_t resv; + struct netns_sysctl_newip sysctl; + struct nip_devconf *devconf_dflt; + + struct nip_rt_info *nip_null_entry; + struct nip_rt_info *nip_broadcast_entry; + + struct dst_ops nip_dst_ops; + struct nip_fib_table *nip_fib_main_tbl; + struct nip_fib_table *nip_fib_local_tbl; +}; + +#endif + diff --git a/code/include/net/ninet_connection_sock.h b/code/include/net/ninet_connection_sock.h new file mode 100644 index 0000000000000000000000000000000000000000..1c13485ba677aa39ef6de86f180eaedb6599f8fc --- /dev/null +++ b/code/include/net/ninet_connection_sock.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * NewIP NET + * Generic infrastructure for NewIP INET connection oriented protocols. + * + * Based on include/net/inet_connection_sock.h + */ +#ifndef _NINET_CONNECTION_SOCK_H +#define _NINET_CONNECTION_SOCK_H + +#include +#include +#include + +struct inet_bind_bucket; +struct request_sock; +struct sk_buff; +struct sock; +struct sockaddr; + +int ninet_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb, bool relax); +int ninet_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl); +void ninet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, + unsigned long timeout); + +#endif /* _NINET_CONNECTION_SOCK_H */ diff --git a/code/include/net/ninet_hashtables.h b/code/include/net/ninet_hashtables.h new file mode 100644 index 0000000000000000000000000000000000000000..81f00e39427fd70269de921bd0a422592512255f --- /dev/null +++ b/code/include/net/ninet_hashtables.h @@ -0,0 +1,113 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * NewIP INET + * An implementation of the TCP/IP protocol suite for the LINUX + * operating system. NewIP INET is implemented using the BSD Socket + * interface as the means of communication with the user level. 
+ * + * Based on include/net/inet6_hashtables.h + */ +#ifndef NINET_HASHTABLES_H +#define NINET_HASHTABLES_H + +#if IS_ENABLED(CONFIG_NEWIP) +#include +#include +#include +#include + +#include + +#include +#include + +struct inet_hashinfo; + +int ninet_hash(struct sock *sk); +void ninet_unhash(struct sock *sk); +int ninet_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk); + +int __ninet_hash(struct sock *sk, struct sock *osk); + + +static inline unsigned int __ninet_ehashfn(const u32 lhash, + const u16 lport, + const u32 fhash, + const __be16 fport, + const u32 initval) +{ + const u32 ports = (((u32) lport) << 16) | (__force u32) fport; + + return jhash_3words(lhash, fhash, ports, initval); +} + +struct sock *__ninet_lookup_established(struct net *net, + struct inet_hashinfo *hashinfo, + const struct nip_addr *saddr, + const __be16 sport, + const struct nip_addr *daddr, + const u16 hnum, const int dif); + +struct sock *ninet_lookup_listener(struct net *net, + struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, + const struct nip_addr *saddr, + const __be16 sport, + const struct nip_addr *daddr, + const unsigned short hnum, const int dif, const int sdif); + +static inline struct sock *__ninet_lookup(struct net *net, + struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, + const struct nip_addr *saddr, + const __be16 sport, + const struct nip_addr *daddr, + const u16 hnum, + const int dif, bool *refcounted) +{ + struct sock *sk = __ninet_lookup_established(net, hashinfo, saddr, + sport, daddr, hnum, dif); + *refcounted = true; + if (sk) + return sk; + *refcounted = false; + return ninet_lookup_listener(net, hashinfo, skb, doff, saddr, sport, + daddr, hnum, dif, 0); +} + +static inline struct sock *__ninet_lookup_skb(struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, + const __be16 sport, + const __be16 dport, + int iif, bool *refcounted) +{ + struct sock *sk; + + *refcounted = true; + sk = skb_steal_sock(skb, refcounted); + if (sk) + return sk; + + return __ninet_lookup(dev_net(skb->dev), hashinfo, skb, + doff, &(NIPCB(skb)->srcaddr), sport, + &(NIPCB(skb)->dstaddr), ntohs(dport), + iif, refcounted); +} + +#define NINET_MATCH(__sk, __net, __saddr, __daddr, __ports, __dif) \ + (((__sk)->sk_portpair == (__ports)) && \ + ((__sk)->sk_family == AF_NINET) && \ + nip_addr_eq(&(__sk)->sk_nip_daddr, (__saddr)) && \ + nip_addr_eq(&(__sk)->sk_nip_rcv_saddr, (__daddr)) && \ + (!(__sk)->sk_bound_dev_if || \ + ((__sk)->sk_bound_dev_if == (__dif))) && \ + net_eq(sock_net(__sk), (__net))) + +int ninet_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk); + +#endif /* IS_ENABLED(CONFIG_NEWIP) */ +#endif /* _NINET_HASHTABLES_H */ diff --git a/code/include/net/nip.h b/code/include/net/nip.h new file mode 100644 index 0000000000000000000000000000000000000000..9c66e10c2a5c323edf18487dbd8654b48434dd20 --- /dev/null +++ b/code/include/net/nip.h @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * NewIP INET + * An implementation of the TCP/IP protocol suite for the LINUX + * operating system. NewIP INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Definitions for the NewIP module. 
+ * + * Based on include/net/ip.h + * Based on include/net/protocol.h + */ +#ifndef _NET_NEWIP_H +#define _NET_NEWIP_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "if_ninet.h" +#include "flow_nip.h" + +#define NIP_MAX_SOCKET_NUM 1024 + +struct ninet_protocol { + void (*early_demux)(struct sk_buff *skb); + + int (*handler)(struct sk_buff *skb); + + void (*err_handler)(struct sk_buff *skb, + struct ninet_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info); + unsigned int flags; +}; + +#define NIPCB(skb) ((struct ninet_skb_parm *)&(TCP_SKB_CB(skb)->header.hnip)) + +extern const struct ninet_protocol __rcu *ninet_protos[MAX_INET_PROTOS]; + +int ninet_add_protocol(const struct ninet_protocol *prot, + unsigned char protocol); +int ninet_del_protocol(const struct ninet_protocol *prot, + unsigned char protocol); +int ninet_register_protosw(struct inet_protosw *p); +void ninet_unregister_protosw(struct inet_protosw *p); + +extern const struct proto_ops ninet_dgram_ops; +extern const struct proto_ops ninet_stream_ops; +extern struct neigh_table nnd_tbl; + +int tcp_nip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl); +void tcp_nip_actual_send_reset(struct sock *sk, struct sk_buff *skb, u32 seq, + u32 ack_seq, u32 win, int rst, u32 priority); +int nip_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev); +struct nip_rt_info *nip_dst_alloc(struct net *net, struct net_device *dev, + int flags); + +static inline bool nip_addr_eq(const struct nip_addr *a1, + const struct nip_addr *a2) +{ + return (a1->bitlen == a2->bitlen) && (a1->bitlen <= NIP_ADDR_BIT_LEN_MAX) && + (memcmp(&a1->v.u, &a2->v.u, a1->bitlen >> 3) == 0); +}; + +static inline u32 nip_addr_hash(const struct nip_addr *a) +{ + u32 tmp[4]; + u8 len = a->bitlen >> 3; + + /* set unused bit to 0 */ + memset(tmp, 0, NIP_ADDR_BIT_LEN_16); + memcpy(tmp, &a->v.u, + len > NIP_ADDR_BIT_LEN_16 ? 
NIP_ADDR_BIT_LEN_16 : len);
+
+	return (__force u32)(tmp[0] ^ tmp[1] ^ tmp[2] ^ tmp[3]);
+}
+
+int nip_send_skb(struct sk_buff *skb);
+
+void ninet_destroy_sock(struct sock *sk);
+int nip_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len);
+int nip_datagram_connect_v6_only(struct sock *sk, struct sockaddr *addr,
+				 int addr_len);
+int nip_datagram_dst_update(struct sock *sk, bool fix_sk_saddr);
+void nip_datagram_release_cb(struct sock *sk);
+int ninet_add_protocol(const struct ninet_protocol *prot,
+		       unsigned char protocol);
+int ninet_del_protocol(const struct ninet_protocol *prot,
+		       unsigned char protocol);
+int ninet_register_protosw(struct inet_protosw *p);
+void ninet_unregister_protosw(struct inet_protosw *p);
+int nip_input(struct sk_buff *skb);
+int nip_output(struct net *net, struct sock *sk, struct sk_buff *skb);
+int nip_forward(struct sk_buff *skb);
+
+unsigned int tcp_nip_sync_mss(struct sock *sk, u32 pmtu);
+unsigned int tcp_nip_current_mss(struct sock *sk);
+int tcp_nip_send_mss(struct sock *sk, int *size_goal, int flags);
+
+struct nip_addr *nip_nexthop(struct nip_rt_info *rt, struct nip_addr *daddr);
+struct dst_entry *nip_sk_dst_lookup_flow(struct sock *sk, struct flow_nip *fln);
+struct dst_entry *nip_dst_lookup_flow(struct net *net, const struct sock *sk,
+				      struct flow_nip *fln,
+				      const struct nip_addr *final_dst);
+u_char *nip_get_mac(struct nip_addr *nipaddr, struct net_device *dev);
+struct net_device *nip_get_defaultdev(void);
+int nip_init_dev(void);
+
+int _nip_udp_output(struct sock *sk, void *from, int datalen,
+		    int transhdrlen, const struct nip_addr *saddr,
+		    ushort sport, const struct nip_addr *daddr,
+		    ushort dport, struct dst_entry *dst);
+
+/* functions defined in nip_sockglue.c */
+int nip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
+		   unsigned int optlen);
+int nip_getsockopt(struct sock *sk, int level,
+		   int optname, char __user *optval, int __user *optlen);
+
+/* functions defined in nip_addrconf.c */
+int nip_addrconf_get_ifaddr(struct net *net, unsigned int cmd, void __user *arg);
+/* 0 - No log
+ * 1 - Log to the kernel log (for the official version)
+ * 2 - Logs are directly printed on the screen for debugging
+ */
+#define __NIP_DEBUG 0
+
+#if __NIP_DEBUG >= 2
+#define TRACE_OUT(fmt, ...) \
+	do { \
+		pr_crit("%s:%s:%d", __FILE__, __func__, __LINE__); \
+		pr_crit(fmt, ##__VA_ARGS__); \
+		pr_crit("\n"); \
+	} while (0)
+#define TRACE(fmt, ...) pr_crit(fmt, ##__VA_ARGS__)
+#elif __NIP_DEBUG >= 1
+#define TRACE_OUT(fmt, ...) \
+	do { \
+		pr_warn("%s:%s:%d", __FILE__, __func__, __LINE__); \
+		pr_warn(fmt, ##__VA_ARGS__); \
+		pr_warn("\n"); \
+	} while (0)
+#define TRACE(fmt, ...) pr_warn(fmt, ##__VA_ARGS__)
+#else
+#define TRACE(fmt, ...)
+#define TRACE_OUT(fmt, ...)
+#endif
+
+#define DEBUG(format, ...) TRACE(format, ##__VA_ARGS__)
+#define DEBUG_TRACE(format, ...) TRACE_OUT(format, ##__VA_ARGS__)
+
+#endif
diff --git a/code/include/net/nip_addrconf.h b/code/include/net/nip_addrconf.h
new file mode 100644
index 0000000000000000000000000000000000000000..d38487d10554f270cdfe766de3e564cf925213b3
--- /dev/null
+++ b/code/include/net/nip_addrconf.h
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2022 Huawei Device Co., Ltd.
+ * + * Based on include/net/addrconf.h + */ +#ifndef _NIP_ADDRCONF_H +#define _NIP_ADDRCONF_H + +#include +#include + +#include +#include +#include +#include + +#define ADDRCONF_NOTIFY_PRIORITY 0 +#define NIN_ADDR_HSIZE_SHIFT (4) +#define NIN_ADDR_HSIZE (1 << NIN_ADDR_HSIZE_SHIFT) + +int nip_addrconf_add_ifaddr(struct net *net, void __user *arg); +int nip_addrconf_del_ifaddr(struct net *net, void __user *arg); + +int nip_dev_get_saddr(struct net *net, const struct net_device *dev, + const struct nip_addr *daddr, struct nip_addr *saddr); + +int nip_addrconf_init(void); +void nip_addrconf_cleanup(void); + +/** + * __nin_dev_get - get ninet_dev pointer from netdevice + * @dev: network device + * + * Caller must hold rcu_read_lock or RTNL, because this function + * does not take a reference on the ninet_dev. + */ +static inline struct ninet_dev *__nin_dev_get(const struct net_device *dev) +{ + return rcu_dereference_rtnl(dev->nip_ptr); +} + +/** + * nin_dev_get - get ninet_dev pointer from netdevice + * @dev: network device + */ +static inline struct ninet_dev *nin_dev_get(const struct net_device *dev) +{ + struct ninet_dev *idev; + + rcu_read_lock(); + idev = rcu_dereference(dev->nip_ptr); + if (idev) + refcount_inc(&idev->refcnt); + rcu_read_unlock(); + return idev; +} + +static inline struct neigh_parms *__nin_dev_nd_parms_get_rcu( + const struct net_device *dev) +{ + struct ninet_dev *idev = __nin_dev_get(dev); + + return idev ? idev->nd_parms : NULL; +} + +void nin_dev_finish_destroy(struct ninet_dev *idev); + +static inline void nin_dev_put(struct ninet_dev *idev) +{ + if (refcount_dec_and_test(&idev->refcnt)) + nin_dev_finish_destroy(idev); +} + +static inline void nin_dev_put_clear(struct ninet_dev **pidev) +{ + struct ninet_dev *idev = *pidev; + + if (idev) { + nin_dev_put(idev); + *pidev = NULL; + } +} + +static inline void __nin_dev_put(struct ninet_dev *idev) +{ + refcount_dec(&idev->refcnt); +} + +static inline void nin_dev_hold(struct ninet_dev *idev) +{ + refcount_inc(&idev->refcnt); +} + +void ninet_ifa_finish_destroy(struct ninet_ifaddr *ifp); + +static inline void nin_ifa_put(struct ninet_ifaddr *ifp) +{ + if (refcount_dec_and_test(&ifp->refcnt)) + ninet_ifa_finish_destroy(ifp); +} + +static inline void __nin_ifa_put(struct ninet_ifaddr *ifp) +{ + refcount_dec(&ifp->refcnt); +} + +static inline void nin_ifa_hold(struct ninet_ifaddr *ifp) +{ + refcount_inc(&ifp->refcnt); +} + +#endif diff --git a/code/include/net/nip_fib.h b/code/include/net/nip_fib.h new file mode 100644 index 0000000000000000000000000000000000000000..13ac586af93a52725a7c3e46c6d966563e58eff1 --- /dev/null +++ b/code/include/net/nip_fib.h @@ -0,0 +1,154 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. 
+ * + * Linux NewIP INET implementation + * + * Based on include/net/ip6_fib.h + */ +#ifndef _NET_NEWIP_FIB_H +#define _NET_NEWIP_FIB_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "nip.h" +#include "flow_nip.h" + +#define NIN_ROUTE_HSIZE_SHIFT 4 +#define NIN_ROUTE_HSIZE (1 << NIN_ROUTE_HSIZE_SHIFT) + +struct nip_fib_config { + u32 fc_table; + u32 fc_metric; + int fc_ifindex; + u32 fc_flags; + u32 fc_protocol; + u32 fc_type; /* only 8 bits are used */ + + struct nip_addr fc_dst; + struct nip_addr fc_src; + struct nip_addr fc_gateway; + + struct nl_info fc_nlinfo; + unsigned long fc_expires; +}; + +struct nip_fib_node { + struct hlist_node fib_hlist; + struct nip_rt_info *nip_route_info; + struct rcu_head rcu; +}; + +struct nip_fib_table; + +struct nip_rt_info { + struct dst_entry dst; + struct dst_entry *from; + struct nip_fib_table *rt_table; + struct nip_fib_node __rcu *rt_node; + struct ninet_dev *rt_idev; + struct nip_rt_info *__percpu *rt_pcpu; + + atomic_t rt_ref; + + uint32_t rt_flags; + struct nip_addr gateway; + struct nip_addr rt_dst; + struct nip_addr rt_src; + + u32 rt_metric; + u32 rt_pmtu; + u8 rt_protocol; +}; + +static inline struct ninet_dev *nip_dst_idev(struct dst_entry *dst) +{ + return ((struct nip_rt_info *)dst)->rt_idev; +} + +struct nip_fib_table { + u32 nip_tb_id; + spinlock_t nip_tb_lock; + struct hlist_head nip_tb_head[NIN_ROUTE_HSIZE]; + unsigned int flags; +}; + +#define NIP_RT_TABLE_MAIN RT_TABLE_MAIN +#define NIP_RT_TABLE_LOCAL RT_TABLE_LOCAL + +typedef struct nip_rt_info *(*nip_pol_lookup_t) (struct net *, + struct nip_fib_table *, + struct flow_nip *, int); + +struct nip_fib_table *nip_fib_get_table(struct net *net, u32 id); + +struct dst_entry *nip_fib_rule_lookup(struct net *net, struct flow_nip *fln, + int flags, nip_pol_lookup_t lookup); + +#define NIP_RT_EXPIRES_FLAGS 12 +static inline void nip_rt_set_expires(struct nip_rt_info *rt, + unsigned long expires) +{ + rt->dst.expires = expires; + + rt->rt_flags |= NIP_RT_EXPIRES_FLAGS; +} + +static inline void nip_rt_clean_expires(struct nip_rt_info *rt) +{ + rt->rt_flags &= ~NIP_RT_EXPIRES_FLAGS; + rt->dst.expires = 0; +} + +static inline void nip_rt_put(struct nip_rt_info *rt) +{ + BUILD_BUG_ON(offsetof(struct nip_rt_info, dst) != 0); + dst_release(&rt->dst); +} + +void nip_rt_free_pcpu(struct nip_rt_info *non_pcpu_rt); + +static inline void nip_rt_hold(struct nip_rt_info *rt) +{ + atomic_inc(&rt->rt_ref); +} + +static inline void nip_rt_release(struct nip_rt_info *rt) +{ + if (atomic_dec_and_test(&rt->rt_ref)) { + nip_rt_free_pcpu(rt); + dst_dev_put(&rt->dst); + + dst_release(&rt->dst); + } +} + +int nip_fib_init(void); + +void nip_fib_gc_cleanup(void); + +struct nip_fib_node *nip_fib_locate(struct hlist_head *nip_tb_head, + const struct nip_addr *daddr); + +void nip_fib_clean_all(struct net *net, + int (*func)(struct nip_rt_info *, void *arg), void *arg); + +int nip_fib_add(struct hlist_head *nip_tb_head, struct nip_rt_info *rt); + +int nip_fib_del(struct nip_rt_info *rt_info, struct nl_info *info); + +int nip_set_route_netlink(struct net *net, struct nip_rtmsg *rtmsg); + +int nip_del_route_netlink(struct net *net, struct nip_rtmsg *rtmsg); + +#endif /* _NET_NEWIP_FIB_H */ diff --git a/code/include/net/nip_route.h b/code/include/net/nip_route.h new file mode 100644 index 0000000000000000000000000000000000000000..0cc11e15a5464b020a0331431e0d1e8513438ee9 --- /dev/null +++ b/code/include/net/nip_route.h @@ -0,0 +1,66 @@ +/* 
SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * Based on include/net/ip6_route.h + */ +#ifndef _NET_NIP_ROUTE_H +#define _NET_NIP_ROUTE_H + +#include +#include "nip_fib.h" +#include "nip_addrconf.h" + +#define NIP_RT_PRIO_USER 1024 + +struct nip_rt_info *nip_addrconf_dst_alloc(struct ninet_dev *idev, + const struct nip_addr *addr); + + +void nip_route_input(struct sk_buff *skb); +struct dst_entry *nip_route_input_lookup(struct net *net, + struct net_device *dev, + struct flow_nip *fln, int flags); + +struct dst_entry *nip_route_output_flags(struct net *net, const struct sock *sk, + struct flow_nip *fln, int flags); + + +static inline struct dst_entry *nip_route_output(struct net *net, + const struct sock *sk, + struct flow_nip *fln) +{ + return nip_route_output_flags(net, sk, fln, 0); +} + +struct nip_rt_info *nip_pol_route(struct net *net, struct nip_fib_table *table, + int oif, struct flow_nip *fln, int flags); + +bool nip_bind_addr_check(struct net *net, + struct nip_addr *addr); + +int nip_ins_rt(struct nip_rt_info *rt); +int nip_del_rt(struct nip_rt_info *rt); + +static inline int nip_route_get_saddr(struct net *net, struct nip_rt_info *rt, + const struct nip_addr *daddr, + struct nip_addr *saddr) +{ + struct ninet_dev *idev = + rt ? nip_dst_idev((struct dst_entry *)rt) : NULL; + int err = 0; + + err = nip_dev_get_saddr(net, idev ? idev->dev : NULL, daddr, saddr); + + return err; +} + +void nip_rt_ifdown(struct net *net, struct net_device *dev); + +int nip_route_ioctl(struct net *net, unsigned int cmd, struct nip_rtmsg *rtmsg); + +int nip_route_init(void); + +void nip_route_cleanup(void); + +#endif /*_NET_NIP_ROUTE_H*/ diff --git a/code/include/net/nip_udp.h b/code/include/net/nip_udp.h new file mode 100644 index 0000000000000000000000000000000000000000..62f225526c33b9f3a201305f12a74bfed8f836cd --- /dev/null +++ b/code/include/net/nip_udp.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * NewIP INET + * An implementation of the TCP/IP protocol suite for the LINUX + * operating system. NewIP INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Definitions for the NewIP UDP module. + * + * Based on include/net/udp.h + */ +#ifndef _NET_NEWIP_UDP_H +#define _NET_NEWIP_UDP_H + +#include +#include +#include +#include +#include +#include + +#define NIP_UDP_HSLOT_COUNT 10 + +int nip_udp_init(void); + +int nip_udp_output(struct sock *sk, struct msghdr *msg, size_t len); + +int nip_udp_input(struct sk_buff *skb); +int nip_udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int noblock, int flags, int *addr_len); + +#endif diff --git a/code/include/net/nndisc.h b/code/include/net/nndisc.h new file mode 100644 index 0000000000000000000000000000000000000000..bd4bf77ac4b953506ab08b8068a3707798223929 --- /dev/null +++ b/code/include/net/nndisc.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. 
+ * + * Based on include/net/ndisc.h + */ +#ifndef _NNDISC_H +#define _NNDISC_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#define NEWIP_NEIGH_BUCKET_MAX 8 +extern struct neigh_table nnd_tbl; + +#define NIP_ARP_NS 0x01 /* ARP request */ +#define NIP_ARP_NA 0x02 /* ARP response */ + +struct nnd_msg { + struct nip_icmp_hdr icmph; + __u8 data[0]; +}; + +static inline bool neigh_key_eq800(const struct neighbour *n, const void *pkey) +{ + struct nip_addr *a1, *a2; + + a1 = (struct nip_addr *)(pkey); + a2 = (struct nip_addr *)(n->primary_key); + +#define RIGHT_POS_3 3 + return a1->bitlen == a2->bitlen && a1->bitlen <= NIP_ADDR_BIT_LEN_MAX && + memcmp(&a1->v.u, &a2->v.u, a1->bitlen >> RIGHT_POS_3) == 0; +} + +static inline u32 nndisc_hashfn(const void *pkey, const struct net_device *dev, + __u32 *hash_rnd) +{ + return (*(int *)pkey % NEWIP_NEIGH_BUCKET_MAX); +} + +static inline struct neighbour *__nip_neigh_lookup_noref(struct net_device *dev, + const void *pkey) +{ + return ___neigh_lookup_noref(&nnd_tbl, neigh_key_eq800, nndisc_hashfn, + pkey, dev); +} + +static inline struct neighbour *__nip_neigh_lookup(struct net_device *dev, + const void *pkey) +{ + struct neighbour *n; + + rcu_read_lock_bh(); + n = __nip_neigh_lookup_noref(dev, pkey); + if (n && !refcount_inc_not_zero(&n->refcnt)) + n = NULL; + rcu_read_unlock_bh(); + + return n; +} + +int nndisc_rcv(struct sk_buff *skb); + +int nndisc_init(void); + +#endif diff --git a/code/include/net/tcp_nip.h b/code/include/net/tcp_nip.h new file mode 100644 index 0000000000000000000000000000000000000000..1e39fd5b64e4435c6cc43d6dd53ce074d5c2d6b0 --- /dev/null +++ b/code/include/net/tcp_nip.h @@ -0,0 +1,182 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * NewIP INET + * An implementation of the TCP/IP protocol suite for the LINUX + * operating system. NewIP INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Definitions for the NewIP TCP module. 
+ * + * Based on include/net/tcp.h + */ +#ifndef _TCP_NIP_H +#define _TCP_NIP_H + +#define FASTRETRANS_DEBUG 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +extern struct proto tcp_nip_prot; + +#define TCP_HDR_LEN_OFFSET 6 +#define TCP_HDR_LEN_POS_PAYLOAD 12 +#define TCP_NIP_4BYTE_PAYLOAD 2 + +#define TCP_OPT_MSS_PAYLOAD 24 +#define TCP_OLEN_MSS_PAYLOAD 16 + +#define TCP_NUM_2 2 +#define TCP_NUM_4 4 + +#define TCP_ARRAY_INDEX_2 2 + +#define TCP_NIP_KEEPALIVE_CYCLE_MS_DIVISOR 20 /* 1 HZ = 1 seconds */ +#define TCP_NIP_CSK_KEEPALIVE_CYCLE 10 /* 1 HZ = 1 seconds */ + +#define TCP_NIP_WINDOW_MAX 65535U + +/* init */ +int tcp_nip_init(void); +void tcp_nip_exit(void); + +void tcp_nip_done(struct sock *sk); +int tcp_direct_connect(struct sock *sk, void __user *arg); +void tcp_nip_rcv_established( + struct sock *sk, + struct sk_buff *skb, + const struct tcphdr *th, + unsigned int len); + +void __tcp_nip_push_pending_frames( + struct sock *sk, + unsigned int cur_mss, + int nonagle); + +u32 __nip_tcp_select_window(struct sock *sk); +unsigned short nip_get_output_checksum_tcp(struct sk_buff *skb, struct nip_addr src_addr, + struct nip_addr dst_addr); +void tcp_nip_rearm_rto(struct sock *sk); + +int tcp_nip_rcv_state_process(struct sock *sk, struct sk_buff *skb); + +/* tcp_nip_output */ +int tcp_nip_transmit_skb( + struct sock *sk, + struct sk_buff *skb, + int clone_it, + gfp_t gfp_mask); +int __tcp_nip_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); +int tcp_nip_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); +void tcp_nip_send_fin(struct sock *sk); +void tcp_nip_send_active_reset(struct sock *sk, gfp_t priority); +void tcp_nip_send_probe0(struct sock *sk); +int tcp_nip_write_wakeup(struct sock *sk, int mib); + +/* tcp_nip_timer */ +void tcp_nip_init_xmit_timers(struct sock *sk); +void tcp_nip_clear_xmit_timers(struct sock *sk); +void tcp_nip_delack_timer_handler(struct sock *sk); +void tcp_nip_write_timer_handler(struct sock *sk); + +/* check probe0 timer */ +static inline void tcp_nip_check_probe_timer(struct sock *sk) +{ + if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending) + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + tcp_probe0_base(sk), TCP_RTO_MAX); +} + +static inline struct sk_buff *tcp_nip_send_head(const struct sock *sk) +{ + return sk->sk_send_head; +} + +static inline void tcp_nip_add_write_queue_tail( + struct sock *sk, + struct sk_buff *skb) +{ + __skb_queue_tail(&sk->sk_write_queue, skb); + + if (sk->sk_send_head == NULL) + sk->sk_send_head = skb; +} + +static inline void tcp_nip_write_queue_purge(struct sock *sk) +{ + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { + tcp_skb_tsorted_anchor_cleanup(skb); + sk_wmem_free_skb(sk, skb); + } + + tcp_clear_all_retrans_hints(tcp_sk(sk)); + sk->sk_send_head = NULL; + inet_csk(sk)->icsk_backoff = 0; +} + +static inline bool tcp_nip_write_queue_empty(struct sock *sk) +{ + return skb_queue_empty(&sk->sk_write_queue); +} + +/* connect */ +int __tcp_nip_connect(struct sock *sk); +int tcp_newip_conn_request(struct request_sock_ops *rsk_ops, + const struct tcp_request_sock_ops *af_ops, + struct sock *sk, struct sk_buff *skb); +struct sk_buff *tcp_nip_make_synack( + const struct sock *sk, + struct dst_entry *dst, + struct request_sock *req, + struct 
tcp_fastopen_cookie *foc, + enum tcp_synack_type synack_type); +int nip_send_synack(struct request_sock *req, struct sk_buff *skb); +struct sock *tcp_nip_check_req(struct sock *sk, struct sk_buff *skb, + struct request_sock *req); +int tcp_nip_child_process(struct sock *parent, struct sock *child, + struct sk_buff *skb); +int tcp_nip_rtx_synack(const struct sock *sk, struct request_sock *req); + +/* client send ack */ +void tcp_nip_send_ack(struct sock *sk); +struct sock *tcp_nip_create_openreq_child(const struct sock *sk, + struct request_sock *req, + struct sk_buff *skb); +void tcp_nip_initialize_rcv_mss(struct sock *sk); + +/* release */ +void tcp_nip_release_cb(struct sock *sk); + +void tcp_nip_keepalive_enable(struct sock *sk); +void tcp_nip_keepalive_disable(struct sock *sk); + +#endif /* _NIP_TCP_H */ diff --git a/code/include/net/transp_nip.h b/code/include/net/transp_nip.h new file mode 100644 index 0000000000000000000000000000000000000000..2688e55f5f7799e5ed816cb117e878f247985961 --- /dev/null +++ b/code/include/net/transp_nip.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * Based on include/net/transp_v6.h + */ +#ifndef _TRANSP_NIP_H +#define _TRANSP_NIP_H + +extern struct proto nip_udp_prot; + +int nip_udp_init(void); +void nip_udp_exit(void); + +int nip_udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); + +void nip_datagram_recv_ctl(struct sock *sk, struct msghdr *msg, + struct sk_buff *skb); +void nip_datagram_recv_common_ctl(struct sock *sk, struct msghdr *msg, + struct sk_buff *skb); +void nip_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, + struct sk_buff *skb); + +void nip_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, __u16 srcp, + __u16 destp, int bucket); + +void ninet_destroy_sock(struct sock *sk); + +#define NEWIP_SEQ_DGRAM_HEADER \ + " s1 " \ + "local_address " \ + "remote_address " \ + "st tx_queue rc_queue tr tm->when retrnsmt" \ + " uid timeout inode ref pointer drops\n" + +#endif diff --git a/code/include/uapi/linux/newip_route.h b/code/include/uapi/linux/newip_route.h new file mode 100644 index 0000000000000000000000000000000000000000..15495b3a9a8292401f7a28051e8059c220abb7e0 --- /dev/null +++ b/code/include/uapi/linux/newip_route.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * Linux NewIP INET implementation + * + * Based on include/uapi/linux/ipv6_route.h + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +#ifndef _UAPI_LINUX_NEWIP_ROUTE_H +#define _UAPI_LINUX_NEWIP_ROUTE_H + +#include "nip_addr.h" + +struct nip_rtmsg { + struct nip_addr rtmsg_dst; + struct nip_addr rtmsg_src; + struct nip_addr rtmsg_gateway; + char dev_name[10]; + unsigned int rtmsg_type; + int rtmsg_ifindex; + unsigned int rtmsg_metric; + unsigned long rtmsg_info; + unsigned int rtmsg_flags; +}; +#endif /* _UAPI_LINUX_NEWIP_ROUTE_H */ diff --git a/code/include/uapi/linux/nip.h b/code/include/uapi/linux/nip.h new file mode 100644 index 0000000000000000000000000000000000000000..745f2090f15f5bc5d27ade144067e5c2b3cf5de3 --- /dev/null +++ b/code/include/uapi/linux/nip.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * Based on include/uapi/linux/ipv6.h + */ +#ifndef _UAPI_NEWIP_H +#define _UAPI_NEWIP_H + +#include +#include +#include +#include +#include + +struct nip_ifreq { + struct nip_addr ifrn_addr; + int ifrn_ifindex; +}; + +struct nip_devreq { + char nip_ifr_name[IFNAMSIZ]; /* if name, e.g. "eth0", "wlan0" */ + + union { + struct sockaddr_nin addr; + short flags; + } devreq; +}; + +#define nip_dev_addr devreq.addr /* nip address */ +#define nip_dev_flags devreq.flags /* net device flags */ + +#endif /*_UAPI_NEWIP_H*/ diff --git a/code/include/uapi/linux/nip_addr.h b/code/include/uapi/linux/nip_addr.h new file mode 100644 index 0000000000000000000000000000000000000000..b58a0883ce273aacea09d3739062d751b876f12f --- /dev/null +++ b/code/include/uapi/linux/nip_addr.h @@ -0,0 +1,128 @@ +/* SPDX-License-Identifier: BSD-2-Clause */ +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, this list + * of conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+#ifndef _UAPI_NEWIP_ADDR_H
+#define _UAPI_NEWIP_ADDR_H
+
+#define NIP_ADDR_LEN_1 1
+#define NIP_ADDR_LEN_2 2
+#define NIP_ADDR_LEN_3 3
+#define NIP_ADDR_LEN_4 4
+#define NIP_ADDR_LEN_5 5
+
+#define NIP_ADDR_BIT_LEN_8 8
+#define NIP_ADDR_BIT_LEN_16 16
+#define NIP_ADDR_BIT_LEN_24 24
+#define NIP_ADDR_BIT_LEN_40 40
+#define NIP_ADDR_BIT_LEN_MAX 64
+
+enum nip_addr_check_value {
+	ADDR_FIRST_DC = 0xDC,
+	ADDR_FIRST_F0 = 0xF0,
+	ADDR_FIRST_F1,
+	ADDR_FIRST_F2,
+	ADDR_FIRST_F3,
+	ADDR_FIRST_F4,
+	ADDR_FIRST_FF = 0xFF,
+	ADDR_SECOND_MIN_DD = 0xDD,
+	ADDR_SECOND_MIN_F1 = 0x14, /* f1 14 00 */
+	ADDR_THIRD_MIN_F2 = 0x01, /* f2 00 01 00 00 */
+};
+
+enum nip_8bit_addr_index {
+	NIP_8BIT_ADDR_INDEX_0 = 0,
+	NIP_8BIT_ADDR_INDEX_1 = 1,
+	NIP_8BIT_ADDR_INDEX_2 = 2,
+	NIP_8BIT_ADDR_INDEX_3 = 3,
+	NIP_8BIT_ADDR_INDEX_4 = 4,
+	NIP_8BIT_ADDR_INDEX_5 = 5,
+	NIP_8BIT_ADDR_INDEX_6 = 6,
+	NIP_8BIT_ADDR_INDEX_7 = 7,
+	NIP_8BIT_ADDR_INDEX_MAX,
+};
+
+enum nip_16bit_addr_index {
+	NIP_16BIT_ADDR_INDEX_0 = 0,
+	NIP_16BIT_ADDR_INDEX_1 = 1,
+	NIP_16BIT_ADDR_INDEX_2 = 2,
+	NIP_16BIT_ADDR_INDEX_3 = 3,
+	NIP_16BIT_ADDR_INDEX_MAX,
+};
+
+enum nip_32bit_addr_index {
+	NIP_32BIT_ADDR_INDEX_0 = 0,
+	NIP_32BIT_ADDR_INDEX_1 = 1,
+	NIP_32BIT_ADDR_INDEX_MAX,
+};
+
+#define nip_addr_field8 v.u.u8
+#define nip_addr_field16 v.u.u16
+#define nip_addr_field32 v.u.u32
+
+#pragma pack(1)
+struct nip_addr_field {
+	union {
+		unsigned char u8[NIP_8BIT_ADDR_INDEX_MAX];
+		unsigned short u16[NIP_16BIT_ADDR_INDEX_MAX]; /* big-endian */
+		unsigned int u32[NIP_32BIT_ADDR_INDEX_MAX]; /* big-endian */
+	} u;
+};
+
+struct nip_addr {
+	unsigned char bitlen;
+	struct nip_addr_field v;
+};
+#pragma pack()
+
+#define POD_SOCKADDR_SIZE 10
+
+struct sockaddr_nin {
+	unsigned short sin_family; /* AF_NINET */
+	unsigned short sin_port;   /* Transport layer port, big-endian */
+	struct nip_addr sin_addr;  /* NIP address */
+
+	/* Pad to the size of struct sockaddr.
+	 * This field is never used. Because nip_addr has a flexible size,
+	 * we consider the extreme case: nip_addr can be as small as 2 bytes,
+	 * so 10 bytes of padding are needed to make sure the structure is at
+	 * least as large as struct sockaddr. Increasing the length of
+	 * nip_addr causes no trouble.
+	 */
+	unsigned char sin_zero[POD_SOCKADDR_SIZE];
+};
+
+extern const struct nip_addr nip_any_addr;
+extern const struct nip_addr nip_broadcast_addr_arp;
+
+int nip_addr_invalid(const struct nip_addr *addr);
+int nip_addr_public(const struct nip_addr *addr);
+int nip_addr_any(const struct nip_addr *ad);
+int get_nip_addr_len(const struct nip_addr *addr);
+unsigned char *build_nip_addr(const struct nip_addr *addr, unsigned char *buf);
+unsigned char *decode_nip_addr(unsigned char *buf, struct nip_addr *addr);
+
+#endif /* _UAPI_NEWIP_ADDR_H */
+
diff --git a/code/include/uapi/linux/nip_icmp.h b/code/include/uapi/linux/nip_icmp.h
new file mode 100644
index 0000000000000000000000000000000000000000..7970fe14624e77d48cec6a726d5481efd1555e01
--- /dev/null
+++ b/code/include/uapi/linux/nip_icmp.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+/*
+ * Copyright (c) 2022 Huawei Device Co., Ltd.
+ *
+ * NewIP INET
+ * An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. NewIP INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Definitions for the NewIP ICMP protocol.
+ * + * Based on include/uapi/linux/icmp.h + */ +#ifndef _UAPI_LINUX_NIP_ICMP_H +#define _UAPI_LINUX_NIP_ICMP_H + +#include +#include + +struct nip_icmp_hdr { + __u8 nip_icmp_type; + __u8 nip_icmp_code; + __sum16 nip_icmp_cksum; +}; + +#endif diff --git a/code/net/newip/Kconfig b/code/net/newip/Kconfig new file mode 100644 index 0000000000000000000000000000000000000000..85b19828550ffd3f8ad95f5bfa128c3f34796369 --- /dev/null +++ b/code/net/newip/Kconfig @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Copyright (c) 2022 Huawei Device Co., Ltd. +# +# NewIP configuration +# + +# NewIP as module will cause a CRASH if you try to unload it +menuconfig NEWIP + tristate "The NewIP protocol" + default y + help + Support for NewIP. + + To compile this protocol support as a module, choose M here: the + module will be called NewIP. diff --git a/code/net/newip/Makefile b/code/net/newip/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..c97368ec8555c534beb118fbdfd4810057dc32ab --- /dev/null +++ b/code/net/newip/Makefile @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (c) 2022 Huawei Device Co., Ltd. +# +# Makefile for the Linux newip layer +# + +obj-$(CONFIG_NEWIP) += newip.o + + +newip-objs := nip_addr.o nip_hdr_encap.o nip_hdr_decap.o nip_checksum.o af_ninet.o nip_input.o udp.o protocol.o nip_output.o datagram.o nip_addrconf.o nip_addrconf_core.o route.o nip_fib.o nip_fib_rules.o nndisc.o icmp.o tcp_nip_parameter.o +newip-objs += tcp_nip.o ninet_connection_sock.o ninet_hashtables.o tcp_nip_output.o tcp_nip_input.o tcp_nip_timer.o nip_sockglue.o +EXTRA_CFLAGS := -I$(src)/include diff --git a/code/net/newip/af_ninet.c b/code/net/newip/af_ninet.c new file mode 100644 index 0000000000000000000000000000000000000000..cda24d130c1a3c50d3b7b3049c7fa737c2477eac --- /dev/null +++ b/code/net/newip/af_ninet.c @@ -0,0 +1,763 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. 
+ *
+ * NewIP INET socket protocol family
+ * Linux NewIP INET implementation
+ *
+ * Based on linux/net/ipv6/af_inet6.c
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include /* for signal_pending() */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+MODULE_DESCRIPTION("NewIP protocol stack for Linux");
+
+/* The inetsw_nip table contains everything that ninet_create needs to
+ * build a new socket.
+ */
+static struct list_head inetsw_nip[SOCK_MAX];
+static DEFINE_SPINLOCK(inetsw_nip_lock);
+/* count the socket number */
+atomic_t g_nip_socket_number = ATOMIC_INIT(0);
+
+static int disable_nip_mod;
+module_param_named(disable, disable_nip_mod, int, 0444);
+MODULE_PARM_DESC(disable,
+		 "Disable NewIP module such that it is non-functional");
+
+bool newip_mod_enabled(void)
+{
+	return disable_nip_mod == 0;
+}
+EXPORT_SYMBOL_GPL(newip_mod_enabled);
+
+static int ninet_create(struct net *net, struct socket *sock, int protocol,
+			int kern)
+{
+	struct inet_sock *inet;
+	struct sock *sk;
+	struct inet_protosw *answer;
+	struct proto *answer_prot;
+	unsigned char answer_flags;
+	int err;
+	int num;
+
+	if (protocol < 0 ||
+	    protocol >= IPPROTO_MAX ||
+	    sock->type >= SOCK_MAX)
+		return -EINVAL;
+
+	num = atomic_add_return(1, &g_nip_socket_number);
+	if (num > NIP_MAX_SOCKET_NUM) {
+		DEBUG("The number of sockets is bigger than 1024!");
+		err = -EPERM;
+		goto number_sub;
+	}
+
+	sock->state = SS_UNCONNECTED;
+	/* look for the requested type/protocol pair. */
+	err = -ESOCKTNOSUPPORT;
+	rcu_read_lock();
+	list_for_each_entry_rcu(answer, &inetsw_nip[sock->type], list) {
+		err = 0;
+		/* Check the non-wild match */
+		if (protocol == answer->protocol) {
+			if (protocol != IPPROTO_IP)
+				break;
+		} else {
+			/* check for the two wild cases.
*/ + if (protocol == IPPROTO_IP) { + protocol = answer->protocol; + break; + } + if (answer->protocol == IPPROTO_IP) + break; + } + err = -EPROTONOSUPPORT; + } + + if (err) + goto out_rcu_unlock; + + err = -EPERM; + + sock->ops = answer->ops; + answer_prot = answer->prot; + answer_flags = answer->flags; + rcu_read_unlock(); + + WARN_ON(!answer_prot->slab); + + err = -ENOBUFS; + sk = sk_alloc(net, PF_NINET, GFP_KERNEL, answer_prot, kern); + if (!sk) + goto number_sub; + + sock_init_data(sock, sk); + + err = 0; + if (answer_flags & INET_PROTOSW_REUSE) + sk->sk_reuse = SK_CAN_REUSE; + inet = inet_sk(sk); + inet->is_icsk = (answer_flags & INET_PROTOSW_ICSK) != 0; + inet->nodefrag = 0; + + if (sock->type == SOCK_RAW) { + inet->inet_num = protocol; + if (protocol == IPPROTO_RAW) + inet->hdrincl = 1; + } + + sk->sk_destruct = inet_sock_destruct; + sk->sk_family = PF_NINET; + sk->sk_protocol = protocol; + sk->sk_backlog_rcv = answer->prot->backlog_rcv; + sk->sk_nip_daddr = nip_any_addr; + sk->sk_nip_rcv_saddr = nip_any_addr; + + inet->uc_ttl = -1; + inet->mc_loop = 1; + inet->mc_ttl = 1; + inet->mc_all = 1; + inet->mc_index = 0; + inet->mc_list = NULL; + inet->rcv_tos = 0; + sk_refcnt_debug_inc(sk); + + if (inet->inet_num) { + inet->inet_sport = htons(inet->inet_num); + err = sk->sk_prot->hash(sk); + if (err) { + sk_common_release(sk); + goto number_sub; + } + } + if (sk->sk_prot->init) { + err = sk->sk_prot->init(sk); + if (err) { + sk_common_release(sk); + goto number_sub; + } + } +out: + DEBUG("The final number of socket is: %d", num); + return err; +out_rcu_unlock: + rcu_read_unlock(); +number_sub: + atomic_dec_if_positive(&g_nip_socket_number); + num = atomic_read(&g_nip_socket_number); + DEBUG("The final number of socket is: %d", num); + goto out; +} + +int ninet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_nin *addr = (struct sockaddr_nin *)uaddr; + struct sock *sk = sock->sk; + struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); + u_short snum; + int err = 0; + + /* If the socket has its own bind function then use it */ + if (sk->sk_prot->bind) + return sk->sk_prot->bind(sk, uaddr, addr_len); + + if (addr_len < sizeof(struct sockaddr_nin)) + return -EINVAL; + + snum = ntohs(addr->sin_port); + if (snum && snum < PROT_SOCK) + return -EACCES; + + if (nip_bind_addr_check(net, &addr->sin_addr) == false) { + DEBUG("%s: binding-addr invalid.", __func__); + return -EADDRNOTAVAIL; + } + lock_sock(sk); + + /* check these errors (active socket, double bind) */ + if (sk->sk_state != TCP_CLOSE || inet->inet_num) { + err = -EINVAL; + goto out; + } + + sk->sk_nip_rcv_saddr = addr->sin_addr; + + /* make sure we are allowed to bind here */ + if ((snum || !inet->bind_address_no_port) && + sk->sk_prot->get_port(sk, snum)) { + inet->inet_saddr = 0; + err = -EADDRINUSE; + goto out; + } + inet->inet_sport = htons(inet->inet_num); + inet->inet_daddr = 0; + inet->inet_dport = 0; + sk_dst_reset(sk); + +out: + release_sock(sk); + return err; +} + +/* Function + * Move a socket into listening state. 
+ * Parameter + * sock: The socket + * backlog: Specifies the number of clients that use a three-way handshake + * to establish a TCP connection + */ +int ninet_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + unsigned char old_state; + int err; + + lock_sock(sk); + + err = -EINVAL; + if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) + goto out; + + old_state = sk->sk_state; + if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) + goto out; + + WRITE_ONCE(sk->sk_max_ack_backlog, backlog); + /* Really, if the socket is already in listen state + * we can only allow the backlog to be adjusted. + */ + if (old_state != TCP_LISTEN) { + err = inet_csk_listen_start(sk, backlog); + if (err) + goto out; + } + err = 0; + +out: + release_sock(sk); + return err; +} + +int ninet_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (!sk) + return -EINVAL; + + atomic_dec_if_positive(&g_nip_socket_number); + return inet_release(sock); +} + +void ninet_destroy_sock(struct sock *sk) +{ + ; +} + +int ninet_getname(struct socket *sock, struct sockaddr *uaddr, + int peer) +{ + struct sock *sk = sock->sk; + struct inet_sock *inet = inet_sk(sk); + DECLARE_SOCKADDR(struct sockaddr_nin *, sin, uaddr); + + sin->sin_family = AF_NINET; + if (peer) { + if (!inet->inet_dport) + return -ENOTCONN; + if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && + peer == 1) + return -ENOTCONN; + sin->sin_port = inet->inet_dport; + sin->sin_addr = sk->sk_nip_daddr; + } else { + sin->sin_port = inet->inet_sport; + sin->sin_addr = sk->sk_nip_rcv_saddr; + } + return sizeof(*sin); +} + +static long ninet_wait_for_connect(struct sock *sk, long timeo, int writebias) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + + add_wait_queue(sk_sleep(sk), &wait); + sk->sk_write_pending += writebias; + + /* Basic assumption: if someone sets sk->sk_err, he _must_ + * change state of the socket from TCP_SYN_*. + * Connect() does not allow to get error notifications + * without closing the socket. 
+ */ + while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { + release_sock(sk); + timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); + lock_sock(sk); + if (signal_pending(current) || !timeo) + break; + } + remove_wait_queue(sk_sleep(sk), &wait); + sk->sk_write_pending -= writebias; + return timeo; +} + +/* Function + * The client socket layer is used to establish connection requests + * Parameter + * sock: The socket + * uaddr:The destination address + */ +int __ninet_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + int err; + long timeo; + + if (uaddr) { + if (addr_len < sizeof(uaddr->sa_family)) + return -EINVAL; + } + + switch (sock->state) { + default: + err = -EINVAL; + goto out; + case SS_CONNECTED: + err = -EISCONN; + goto out; + case SS_CONNECTING: + err = -EALREADY; + break; + case SS_UNCONNECTED: + err = -EISCONN; + if (sk->sk_state != TCP_CLOSE) + goto out; + /* Call the tcp_nip_connect function */ + err = sk->sk_prot->connect(sk, uaddr, addr_len); + if (err < 0) + goto out; + /* Switch to connecting, and then perform subsequent operations */ + sock->state = SS_CONNECTING; + err = -EINPROGRESS; + break; + } + + /* Get blocking time */ + timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { + int writebias = 0; + /* Error code is set above */ + if (!timeo || !ninet_wait_for_connect(sk, timeo, writebias)) + goto out; + + err = sock_intr_errno(timeo); + if (signal_pending(current)) + goto out; + } + + if (sk->sk_state == TCP_CLOSE) + goto sock_error; + sock->state = SS_CONNECTED; + err = 0; + +out: + return err; +sock_error: + err = sock_error(sk) ? : -ECONNABORTED; + sock->state = SS_DISCONNECTING; + goto out; +} + +int ninet_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + int err; + + lock_sock(sock->sk); + err = __ninet_stream_connect(sock, uaddr, addr_len, flags); + release_sock(sock->sk); + return err; +} + +int ninet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + + DEBUG("%s: cmd=0x%x.", __func__, cmd); + switch (cmd) { + case SIOCADDRT: + case SIOCDELRT: { + struct nip_rtmsg rtmsg; + + if (copy_from_user(&rtmsg, (void __user *)arg, sizeof(rtmsg))) { + DEBUG("%s: fail to copy route cfg data.", __func__); + return -EFAULT; + } + return nip_route_ioctl(net, cmd, &rtmsg); + } + case SIOCSIFADDR: + return nip_addrconf_add_ifaddr(net, (void __user *)arg); + case SIOCDIFADDR: + return nip_addrconf_del_ifaddr(net, (void __user *)arg); + case SIOCGIFADDR: + return nip_addrconf_get_ifaddr(net, cmd, (void __user *)arg); + + default: + if (!sk->sk_prot->ioctl) { + DEBUG("%s: sock sk_prot ioctl is null, cmd=0x%x.", __func__, cmd); + return -ENOIOCTLCMD; + } + return sk->sk_prot->ioctl(sk, cmd, arg); + } +} + +#ifdef CONFIG_COMPAT +struct compat_nip_rtmsg { + struct nip_addr rtmsg_dst; + struct nip_addr rtmsg_src; + struct nip_addr rtmsg_gateway; + char dev_name[10]; + unsigned int rtmsg_type; + int rtmsg_ifindex; + unsigned int rtmsg_metric; + unsigned int rtmsg_info; /* long convert to int */ + unsigned int rtmsg_flags; +}; + +static int ninet_compat_routing_ioctl(struct sock *sk, unsigned int cmd, + struct compat_nip_rtmsg __user *ur) +{ + struct nip_rtmsg rt; + + if (copy_from_user(&rt.rtmsg_dst, &ur->rtmsg_dst, 3 * sizeof(struct nip_addr)) || + copy_from_user(&rt.dev_name, &ur->dev_name, sizeof(rt.dev_name)) || + 
get_user(rt.rtmsg_type, &ur->rtmsg_type) || + get_user(rt.rtmsg_ifindex, &ur->rtmsg_ifindex) || + get_user(rt.rtmsg_metric, &ur->rtmsg_metric) || + get_user(rt.rtmsg_info, &ur->rtmsg_info) || + get_user(rt.rtmsg_flags, &ur->rtmsg_flags)) { + DEBUG("%s: fail to convert input para, cmd=0x%x.", __func__, cmd); + return -EFAULT; + } + + DEBUG("%s: cmd=0x%x.", __func__, cmd); + return nip_route_ioctl(sock_net(sk), cmd, &rt); +} + +int ninet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + void __user *argp = compat_ptr(arg); + struct sock *sk = sock->sk; + + switch (cmd) { + case SIOCADDRT: + case SIOCDELRT: + return ninet_compat_routing_ioctl(sk, cmd, argp); + default: + return -ENOIOCTLCMD; + } +} +EXPORT_SYMBOL_GPL(ninet_compat_ioctl); +#endif /* CONFIG_COMPAT */ + +/* register new IP socket */ +const struct proto_ops ninet_dgram_ops = { + .family = PF_NINET, + .owner = THIS_MODULE, + .release = ninet_release, + .bind = ninet_bind, + .connect = inet_dgram_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = ninet_getname, + .poll = datagram_poll, + .ioctl = ninet_ioctl, + .gettstamp = sock_gettstamp, + .listen = sock_no_listen, + .shutdown = inet_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = inet_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, + .set_peek_off = sk_set_peek_off, +#ifdef CONFIG_COMPAT + .compat_ioctl = ninet_compat_ioctl, +#endif +}; + +const struct proto_ops ninet_stream_ops = { + .family = PF_NINET, + .owner = THIS_MODULE, + .release = ninet_release, + .bind = ninet_bind, + .connect = ninet_stream_connect, + .socketpair = sock_no_socketpair, + .accept = inet_accept, + .getname = ninet_getname, + .poll = tcp_poll, + .ioctl = ninet_ioctl, + .listen = ninet_listen, + .shutdown = inet_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = inet_recvmsg, + .mmap = sock_no_mmap, + .sendpage = inet_sendpage, +#ifdef CONFIG_COMPAT + .compat_ioctl = ninet_compat_ioctl, +#endif +}; + +static const struct net_proto_family ninet_family_ops = { + .family = PF_NINET, + .create = ninet_create, + .owner = THIS_MODULE, +}; + +int ninet_register_protosw(struct inet_protosw *p) +{ + struct list_head *lh; + struct inet_protosw *answer; + struct list_head *last_perm; + int protocol = p->protocol; + int ret; + + spin_lock_bh(&inetsw_nip_lock); + + ret = -EINVAL; + if (p->type >= SOCK_MAX) + goto out_illegal; + + /* If we are trying to override a permanent protocol, bail. */ + answer = NULL; + ret = -EPERM; + last_perm = &inetsw_nip[p->type]; + list_for_each(lh, &inetsw_nip[p->type]) { + answer = list_entry(lh, struct inet_protosw, list); + + /* Check only the non-wild match. 
+		 */
+		if (answer->flags & INET_PROTOSW_PERMANENT) {
+			if (protocol == answer->protocol)
+				break;
+			last_perm = lh;
+		}
+
+		answer = NULL;
+	}
+	if (answer)
+		goto out_permanent;
+
+	list_add_rcu(&p->list, last_perm);
+	ret = 0;
+out:
+	spin_unlock_bh(&inetsw_nip_lock);
+	return ret;
+
+out_permanent:
+	pr_err("Attempt to override permanent protocol %d\n", protocol);
+	goto out;
+
+out_illegal:
+	pr_err("Ignoring attempt to register invalid socket type %d\n",
+	       p->type);
+	goto out;
+}
+
+void ninet_unregister_protosw(struct inet_protosw *p)
+{
+	if (INET_PROTOSW_PERMANENT & p->flags) {
+		pr_err("Attempt to unregister permanent protocol %d\n",
+		       p->protocol);
+	} else {
+		spin_lock_bh(&inetsw_nip_lock);
+		list_del_rcu(&p->list);
+		spin_unlock_bh(&inetsw_nip_lock);
+
+		synchronize_net();
+	}
+}
+
+int ninet_sk_rebuild_header(struct sock *sk)
+{
+	return 0;
+}
+
+/* register to data link layer */
+static struct packet_type nip_packet_type __read_mostly = {
+	.type = cpu_to_be16(ETH_P_NEWIP),
+	.func = nip_rcv,
+};
+
+static int __init nip_packet_init(void)
+{
+	dev_add_pack(&nip_packet_type);
+	return 0;
+}
+
+static int __net_init ninet_net_init(struct net *net)
+{
+	int err = 0;
+	return err;
+}
+
+static void __net_exit ninet_net_exit(struct net *net)
+{
+	;
+}
+
+static struct pernet_operations ninet_net_ops = {
+	.init = ninet_net_init,
+	.exit = ninet_net_exit,
+};
+
+static int __init ninet_init(void)
+{
+	struct list_head *r;
+	int err = 0;
+
+	sock_skb_cb_check_size(sizeof(struct ninet_skb_parm));
+
+	DEBUG("NET: start to init nip network.\n");
+	/* register the socket-side information for ninet_create */
+	for (r = &inetsw_nip[0]; r < &inetsw_nip[SOCK_MAX]; ++r)
+		INIT_LIST_HEAD(r);
+
+	if (disable_nip_mod) {
+		DEBUG("Loaded, but administratively disabled,");
+		DEBUG("reboot required to enable\n");
+		goto out;
+	}
+
+	err = proto_register(&tcp_nip_prot, 1);
+	if (err)
+		goto out;
+
+	err = proto_register(&nip_udp_prot, 1);
+	if (err) {
+		DEBUG_TRACE("failed to register udp proto!\n");
+		goto out_udp_register_fail;
+	}
+
+	err = sock_register(&ninet_family_ops);
+	if (err) {
+		DEBUG_TRACE("failed to register newip_family_ops!");
+		goto out_sock_register_fail;
+	}
+
+	err = register_pernet_subsys(&ninet_net_ops);
+	if (err) {
+		DEBUG_TRACE("failed to register ninet_net_ops!\n");
+		goto register_pernet_fail;
+	}
+
+	err = nip_icmp_init();
+	if (err) {
+		DEBUG_TRACE("nip_icmp_init failed!\n");
+		goto nip_icmp_fail;
+	}
+
+	err = nndisc_init();
+	if (err) {
+		DEBUG_TRACE("nndisc_init failed!\n");
+		goto nndisc_fail;
+	}
+
+	err = nip_route_init();
+	if (err)
+		goto nip_route_fail;
+
+	err = nip_addrconf_init();
+	if (err)
+		goto nip_addr_fail;
+
+	err = nip_udp_init();
+	if (err) {
+		DEBUG_TRACE("failed to init udp layer!\n");
+		goto udp_fail;
+	}
+
+	err = tcp_nip_init();
+	if (err) {
+		DEBUG("failed to init tcp layer!\n");
+		goto tcp_fail;
+	} else {
+		DEBUG("nip_tcp_init ok!");
+	}
+
+	err = nip_packet_init();
+	if (err) {
+		DEBUG_TRACE("failed to register to l2 layer!\n");
+		goto nip_packet_fail;
+	}
+
+	DEBUG("NewIP: init newip address family ok!");
+
+out:
+	return err;
+
+nip_packet_fail:
+udp_fail:
+tcp_fail:
+	nip_addrconf_cleanup();
+nip_addr_fail:
+	nip_route_cleanup();
+nip_route_fail:
+nndisc_fail:
+nip_icmp_fail:
+	unregister_pernet_subsys(&ninet_net_ops);
+register_pernet_fail:
+	sock_unregister(PF_NINET);
+out_sock_register_fail:
+	proto_unregister(&nip_udp_prot);
+out_udp_register_fail:
+	DEBUG_TRACE("newip family init failed!!!\n");
+	goto out;
+}
+
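Taken together, ninet_init() and ninet_family_ops expose the stack to user space as a new socket family. A minimal user-space sketch of driving it follows; this is an illustration only, assuming AF_NINET and the struct sockaddr_nin / struct nip_ifreq / struct nip_addr layouts come from the patch's uapi headers (the header name <linux/nip.h> and the NIP_ADDR_BIT_LEN_16 == 16 encoding are assumptions), and that the caller has CAP_NET_ADMIN for the address ioctl:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <unistd.h>
#include <linux/nip.h>	/* assumed uapi header exporting the NewIP structs */

int main(void)
{
	struct nip_ifreq ireq;
	struct sockaddr_nin sin;
	int fd = socket(AF_NINET, SOCK_STREAM, IPPROTO_TCP);

	if (fd < 0) {
		perror("socket(AF_NINET)");
		return 1;
	}

	/* Configure the 2-byte short address 0xDE01 on eth0; this path is
	 * serviced by nip_addrconf_add_ifaddr() via SIOCSIFADDR and needs
	 * CAP_NET_ADMIN. "eth0" is a placeholder interface name.
	 */
	memset(&ireq, 0, sizeof(ireq));
	ireq.ifrn_ifindex = if_nametoindex("eth0");
	ireq.ifrn_addr.bitlen = 16;	/* assumed NIP_ADDR_BIT_LEN_16 */
	ireq.ifrn_addr.nip_addr_field8[0] = 0xDE;
	ireq.ifrn_addr.nip_addr_field8[1] = 0x01;
	if (ioctl(fd, SIOCSIFADDR, &ireq) < 0)
		perror("ioctl(SIOCSIFADDR)");

	/* Bind to the configured address and listen; ninet_bind and
	 * ninet_listen above service these calls.
	 */
	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_NINET;
	sin.sin_port = htons(5000);
	sin.sin_addr = ireq.ifrn_addr;
	if (bind(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0 ||
	    listen(fd, 5) < 0)
		perror("bind/listen");

	close(fd);
	return 0;
}

Note that ninet_ioctl() dispatches the SIOC*IFADDR commands regardless of socket type, so the same fd can both configure addresses and carry traffic.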
+module_init(ninet_init); + +MODULE_ALIAS_NETPROTO(PF_NINET); + diff --git a/code/net/newip/datagram.c b/code/net/newip/datagram.c new file mode 100644 index 0000000000000000000000000000000000000000..e4e6709ba29d9bb4d3a7fe4c07c737bc7e68253c --- /dev/null +++ b/code/net/newip/datagram.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * NewIP common UDP code + * Linux NewIP INET implementation + * + * Adapted from linux/net/ipv6/datagram.c + */ +#include +#include +#include +#include +#include + +int nip_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + int res = 0; + return res; +} + +void nip_datagram_release_cb(struct sock *sk) +{ + ; +} + diff --git a/code/net/newip/icmp.c b/code/net/newip/icmp.c new file mode 100644 index 0000000000000000000000000000000000000000..01ab70822453204b9671fac61bf52531a5c46cd4 --- /dev/null +++ b/code/net/newip/icmp.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * Internet Control Message Protocol (NewIP ICMP) + * Linux NewIP INET implementation + * + * Based on net/ipv6/icmp.c + * Based on net/ipv4/af_inet.c + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nip_hdr.h" + +int nip_icmp_rcv(struct sk_buff *skb) +{ + int ret = 0; + struct nip_icmp_hdr *hdr = nip_icmp_header(skb); + u8 type = hdr->nip_icmp_type; + + DEBUG("rcv newip icmp packet. type = %u\n", type); + switch (type) { + case NIP_ARP_NS: + case NIP_ARP_NA: + ret = nndisc_rcv(skb); + break; + default: + DEBUG("nip icmp packet type error\n"); + } + return ret; +} + +static void nip_icmp_err(struct sk_buff *skb, + struct ninet_skb_parm *opt, + u8 type, uint8_t code, + int offset, __be32 info) +{ +} + +static const struct ninet_protocol nip_icmp_protocol = { + .handler = nip_icmp_rcv, + .err_handler = nip_icmp_err, + .flags = 0, +}; + +int __init nip_icmp_init(void) +{ + int ret; + + ret = ninet_add_protocol(&nip_icmp_protocol, IPPROTO_NIP_ICMP); + return ret; +} diff --git a/code/net/newip/ninet_connection_sock.c b/code/net/newip/ninet_connection_sock.c new file mode 100644 index 0000000000000000000000000000000000000000..2879dac2345cf2144ff1191f97bfff1492513094 --- /dev/null +++ b/code/net/newip/ninet_connection_sock.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * NewIP INET + * An implementation of the TCP/IP protocol suite for the LINUX + * operating system. NewIP INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Support for NewIP INET connection oriented protocols. 
+ * + * Based on net/ipv4/inet_connection_sock.c + */ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Function + * Timeout handler for request processing, used to retransmit SYN+ACK + * Parameter + * t: Request control block + */ +static void ninet_reqsk_timer_handler(struct timer_list *t) +{ + struct request_sock *req = from_timer(req, t, rsk_timer); + struct sock *sk_listener = req->rsk_listener; + struct net *net = sock_net(sk_listener); + struct inet_connection_sock *icsk = inet_csk(sk_listener); + struct request_sock_queue *queue = &icsk->icsk_accept_queue; + int max_retries, thresh; + + /* Defines the maximum number of retransmissions. Thresh defaults to 5 */ + max_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries; + thresh = max_retries; + + /* Check timeout times. SYN+ACK retransmission times +1 */ + if (req->num_timeout <= thresh) { + unsigned long timeo; + + req->rsk_ops->rtx_syn_ack(sk_listener, req); + req->num_retrans++; + /* If the number of times out is still 0, the number is increased by 1 + * to determine whether it is the first time out + */ + if (req->num_timeout++ == 0) + atomic_dec(&queue->young); + timeo = min(TCP_TIMEOUT_INIT, TCP_RTO_MAX); + mod_timer(&req->rsk_timer, jiffies + timeo); + return; + } + + inet_csk_reqsk_queue_drop_and_put(sk_listener, req); +} + +/* Function + * Add request_SOCK to the connection queue and ehash table, + * and set the SYNACK timeout retransmission timer + * Parameter + * sk: Transmission control block + * req: Connection request block + * timeout: The initial timeout period + */ +void ninet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, + unsigned long timeout) +{ + req->num_retrans = 0; + req->num_timeout = 0; + req->sk = NULL; + + timer_setup(&req->rsk_timer, ninet_reqsk_timer_handler, + TIMER_PINNED); + mod_timer(&req->rsk_timer, jiffies + timeout); + + inet_ehash_insert(req_to_sk(req), NULL, NULL); + + smp_wmb(); /* memory barrier */ + refcount_set(&req->rsk_refcnt, TCP_NUM_2 + 1); + + inet_csk_reqsk_queue_added(sk); +} + +/* Function + * Check whether the socket conflicts with the linked list. If no, 0 is returned + * Parameter + * sk: The transport control block to listen + * tb: bind bucket, sock list for storing bind + */ +int ninet_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb, bool relax) +{ + return 0; +} + diff --git a/code/net/newip/ninet_hashtables.c b/code/net/newip/ninet_hashtables.c new file mode 100644 index 0000000000000000000000000000000000000000..0e3419bef1dc5228c910fa530c4b16c5aac067dc --- /dev/null +++ b/code/net/newip/ninet_hashtables.c @@ -0,0 +1,417 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * NewIP INET + * An implementation of the TCP/IP protocol suite for the LINUX + * operating system. NewIP INET is implemented using the BSD Socket + * interface as the means of communication with the user level. 
+ *
+ * Generic NewIP INET transport hashtables
+ *
+ * Based on net/ipv4/inet_hashtables.c
+ * Based on net/ipv6/inet6_hashtables.c
+ * Based on include/net/ip.h
+ * Based on include/net/ipv6.h
+ */
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+static inline u32 nip_portaddr_hash(const struct net *net,
+				    const struct nip_addr *saddr,
+				    unsigned int port)
+{
+	u32 v = (__force u32)saddr->nip_addr_field32[0] ^ (__force u32)saddr->nip_addr_field32[1];
+
+	return jhash_1word(v, net_hash_mix(net)) ^ port;
+}
+
+static u32 __nip_addr_jhash(const struct nip_addr *a, const u32 initval)
+{
+	u32 v = (__force u32)a->nip_addr_field32[0] ^ (__force u32)a->nip_addr_field32[1];
+
+	return jhash_3words(v,
+			    (__force u32)a->nip_addr_field32[0],
+			    (__force u32)a->nip_addr_field32[1],
+			    initval);
+}
+
+static struct inet_listen_hashbucket *
+ninet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
+{
+	u32 hash = nip_portaddr_hash(sock_net(sk),
+				     &sk->sk_nip_rcv_saddr,
+				     inet_sk(sk)->inet_num);
+	return inet_lhash2_bucket(h, hash);
+}
+
+static void ninet_hash2(struct inet_hashinfo *h, struct sock *sk)
+{
+	struct inet_listen_hashbucket *ilb2;
+
+	if (!h->lhash2)
+		return;
+
+	ilb2 = ninet_lhash2_bucket_sk(h, sk);
+
+	spin_lock(&ilb2->lock);
+	hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, &ilb2->head);
+
+	ilb2->count++;
+	spin_unlock(&ilb2->lock);
+}
+
+/* Function
+ *	Returns the hash value based on the passed argument
+ * Parameter
+ *	net: The namespace
+ *	laddr: The destination address
+ *	lport: Destination port
+ *	faddr: Source address
+ *	fport: Source port
+ */
+u32 ninet_ehashfn(const struct net *net,
+		  const struct nip_addr *laddr, const u16 lport,
+		  const struct nip_addr *faddr, const __be16 fport)
+{
+	static u32 ninet_ehash_secret __read_mostly;
+	static u32 ninet_hash_secret __read_mostly;
+
+	u32 lhash, fhash;
+
+	net_get_random_once(&ninet_ehash_secret, sizeof(ninet_ehash_secret));
+	net_get_random_once(&ninet_hash_secret, sizeof(ninet_hash_secret));
+
+	/* IPv6 uses s6_addr32[3], the last 32 bits of the address */
+	lhash = (__force u32)laddr->nip_addr_field32[0];
+	fhash = __nip_addr_jhash(faddr, ninet_hash_secret);
+
+	return __ninet_ehashfn(lhash, lport, fhash, fport,
+			       ninet_ehash_secret + net_hash_mix(net));
+}
+
+/* Function
+ *	Put the socket into the listen hash so that the server can find it
+ *	during the second handshake
+ * Parameter
+ *	sk: Transmission control block
+ *	osk: old socket
+ */
+int __ninet_hash(struct sock *sk, struct sock *osk)
+{
+	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+	struct inet_listen_hashbucket *ilb;
+	int err = 0;
+
+	if (sk->sk_state != TCP_LISTEN) {
+		inet_ehash_nolisten(sk, osk, NULL);
+		return 0;
+	}
+	WARN_ON(!sk_unhashed(sk));
+	ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+
+	spin_lock(&ilb->lock);
+
+	__sk_nulls_add_node_rcu(sk, &ilb->nulls_head);
+
+	ninet_hash2(hashinfo, sk);
+	ilb->count++;
+	sock_set_flag(sk, SOCK_RCU_FREE);
+	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+
+	spin_unlock(&ilb->lock);
+
+	return err;
+}
+
+int ninet_hash(struct sock *sk)
+{
+	int err = 0;
+
+	if (sk->sk_state != TCP_CLOSE) {
+		local_bh_disable();
+		err = __ninet_hash(sk, NULL);
+		local_bh_enable();
+	}
+
+	return err;
+}
+
+static void ninet_unhash2(struct inet_hashinfo *h, struct sock *sk)
+{
+	struct inet_listen_hashbucket *ilb2;
+
+	if (!h->lhash2 ||
+	    WARN_ON_ONCE(hlist_unhashed(&inet_csk(sk)->icsk_listen_portaddr_node)))
+		return;
+
+	ilb2 = ninet_lhash2_bucket_sk(h, sk);
+
+	spin_lock(&ilb2->lock);
+	hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node);
+	ilb2->count--;
+	spin_unlock(&ilb2->lock);
+}
+
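The SO_REUSEPORT tie-break in ninet_lhash2_lookup() below relies on the kernel's reciprocal_scale() helper, which maps a 32-bit hash onto [0, n) with one multiply and a shift instead of a modulo. A stand-alone user-space illustration, with reciprocal_scale() restated from its kernel definition:

#include <stdint.h>
#include <stdio.h>

/* Same formula as the kernel helper: map val (full 32-bit range)
 * onto [0, ep_ro) without a division.
 */
static inline uint32_t reciprocal_scale(uint32_t val, uint32_t ep_ro)
{
	return (uint32_t)(((uint64_t)val * ep_ro) >> 32);
}

int main(void)
{
	/* stand-in for a ninet_ehashfn() flow hash, picking one of
	 * 4 reuseport listeners
	 */
	uint32_t phash = 0x9e3779b9;
	uint32_t listeners = 4;

	printf("selected index: %u\n", reciprocal_scale(phash, listeners));
	return 0;
}

In the lookup itself the test is reciprocal_scale(phash, matches) == 0, i.e. each additional equal-score match replaces the current result with probability 1/matches, which yields a uniform pick over all matches.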
+void ninet_unhash(struct sock *sk)
+{
+	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+	struct inet_listen_hashbucket *ilb = NULL;
+	spinlock_t *lock; /* Spin lock (note deleted alarm) */
+
+	if (sk_unhashed(sk))
+		return;
+
+	if (sk->sk_state == TCP_LISTEN) {
+		ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+		lock = &ilb->lock;
+	} else {
+		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+	}
+	spin_lock_bh(lock);
+	if (sk_unhashed(sk))
+		goto unlock;
+
+	if (ilb) {
+		ninet_unhash2(hashinfo, sk);
+		ilb->count--;
+	}
+	__sk_nulls_del_node_init_rcu(sk);
+	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+
+unlock:
+	spin_unlock_bh(lock);
+}
+
+/* Function
+ *	Find the transport control block in the ehash table by address and port.
+ *	If it is found, the three-way handshake has completed, a connection is
+ *	established, and normal communication can proceed.
+ * Parameter
+ *	net: The namespace
+ *	hashinfo: The global tcp_hashinfo table, which stores the tcp_sock entries
+ *	          (including ESTABLISHED, listen, and bind) of the current system.
+ *	saddr: Source address
+ *	sport: Source port
+ *	daddr: The destination address
+ *	hnum: Destination port
+ */
+struct sock *__ninet_lookup_established(struct net *net,
+					struct inet_hashinfo *hashinfo,
+					const struct nip_addr *saddr,
+					const __be16 sport,
+					const struct nip_addr *daddr,
+					const u16 hnum,
+					const int dif)
+{
+	struct sock *sk;
+	const struct hlist_nulls_node *node;
+
+	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
+
+	unsigned int hash = ninet_ehashfn(net, daddr, hnum, saddr, sport);
+	unsigned int slot = hash & hashinfo->ehash_mask;
+
+	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
+
+begin:
+	sk_nulls_for_each_rcu(sk, node, &head->chain) {
+		DEBUG("%s: sk->sk_hash:%u", __func__, sk->sk_hash);
+		DEBUG("%s: dif:%d", __func__, dif);
+		if (sk->sk_hash != hash)
+			continue;
+		if (!NINET_MATCH(sk, net, saddr, daddr, ports, dif))
+			continue;
+		if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) {
+			DEBUG("[nip]%s:sk->sk_refcnt == 0!!!!\n", __func__);
+			goto out;
+		}
+
+		if (unlikely(!NINET_MATCH(sk, net, saddr, daddr, ports, dif))) {
+			sock_gen_put(sk);
+			goto begin;
+		}
+		DEBUG("%s: find sock in ehash table!", __func__);
+		goto found;
+	}
+	if (get_nulls_value(node) != slot)
+		goto begin;
+out:
+	sk = NULL;
+found:
+	return sk;
+}
+
+static inline int nip_tcp_compute_score(struct sock *sk, struct net *net,
+					const unsigned short hnum,
+					const struct nip_addr *daddr,
+					const int dif, int sdif)
+{
+	int score = -1;
+
+	if (inet_sk(sk)->inet_num == hnum && sk->sk_family == PF_NINET &&
+	    net_eq(sock_net(sk), net)) {
+		score = 1;
+		if (!nip_addr_eq(&sk->sk_nip_rcv_saddr, &nip_any_addr)) {
+			if (!nip_addr_eq(&sk->sk_nip_rcv_saddr, daddr))
+				return -1;
+			score++;
+		}
+		if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
+			return -1;
+		score++;
+		if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
+			score++;
+	}
+
+	return score;
+}
+
+static struct sock *ninet_lhash2_lookup(struct net *net,
+					struct inet_listen_hashbucket *ilb2,
+					struct sk_buff *skb, int doff,
+					const struct nip_addr *saddr, __be16 sport,
+					const struct nip_addr *daddr, const unsigned short hnum,
+					const int dif, const int sdif)
+{
+	struct inet_connection_sock *icsk;
+	struct sock *sk, *result = NULL;
+	int score, hiscore
= 0, matches = 0, reuseport = 0; + u32 phash = 0; + + inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { + sk = (struct sock *)icsk; + score = nip_tcp_compute_score(sk, net, hnum, daddr, dif, sdif); + if (score > hiscore) { + DEBUG("%s: find sock in lhash table", __func__); + result = sk; + hiscore = score; + reuseport = sk->sk_reuseport; + if (reuseport) { + DEBUG("%s: find reuseport sock in lhash table", __func__); + phash = ninet_ehashfn(net, daddr, hnum, saddr, sport); + matches = 1; + } + } else if (score == hiscore && reuseport) { + matches++; + if (reciprocal_scale(phash, matches) == 0) + result = sk; + phash = next_pseudo_random32(phash); + } + } + return result; +} + +struct sock *ninet_lookup_listener(struct net *net, + struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, + const struct nip_addr *saddr, + const __be16 sport, const struct nip_addr *daddr, + const unsigned short hnum, const int dif, const int sdif) +{ + struct inet_listen_hashbucket *ilb2; + struct sock *result = NULL; + unsigned int hash2 = nip_portaddr_hash(net, daddr, hnum); + + ilb2 = inet_lhash2_bucket(hashinfo, hash2); + + result = ninet_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, daddr, hnum, + dif, sdif); + if (result) + goto done; + + hash2 = nip_portaddr_hash(net, &nip_any_addr, hnum); + ilb2 = inet_lhash2_bucket(hashinfo, hash2); + + result = ninet_lhash2_lookup(net, ilb2, skb, doff, + saddr, sport, &nip_any_addr, hnum, + dif, sdif); +done: + if (IS_ERR(result)) + return NULL; + return result; +} + +/* Check whether the quad information in sock is bound by ehash. If not, + * the SK is inserted into the ehash and 0 is returned + */ +static int __ninet_check_established(struct inet_timewait_death_row *death_row, + struct sock *sk, const __u16 lport, + struct inet_timewait_sock **twp) +{ + struct inet_hashinfo *hinfo = death_row->hashinfo; + struct inet_sock *inet = inet_sk(sk); + struct nip_addr *daddr = &sk->sk_nip_rcv_saddr; + struct nip_addr *saddr = &sk->sk_nip_daddr; + int dif = sk->sk_bound_dev_if; + struct net *net = sock_net(sk); + const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); + unsigned int hash = ninet_ehashfn(net, daddr, lport, + saddr, inet->inet_dport); + struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); + spinlock_t *lock = inet_ehash_lockp(hinfo, hash); + struct sock *sk2; + const struct hlist_nulls_node *node; + + spin_lock(lock); + + sk_nulls_for_each(sk2, node, &head->chain) { + if (sk2->sk_hash != hash) + continue; + + if (likely(NINET_MATCH(sk2, net, + saddr, daddr, ports, dif))) { + DEBUG("%s: found same sk in ehash!\n", __func__); + goto not_unique; + } + } + + /* Must record num and sport now. Otherwise we will see + * in hash table socket with a funny identity. + */ + DEBUG("%s: add tcp sock into ehash table. 
sport=%u\n", + __func__, lport); + inet->inet_num = lport; + inet->inet_sport = htons(lport); + sk->sk_hash = hash; + WARN_ON(!sk_unhashed(sk)); + __sk_nulls_add_node_rcu(sk, &head->chain); + + spin_unlock(lock); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + return 0; + +not_unique: + spin_unlock(lock); + return -EADDRNOTAVAIL; +} + +static u64 ninet_sk_port_offset(const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + + return secure_newip_port_ephemeral(sk->sk_nip_rcv_saddr.nip_addr_field32, + sk->sk_nip_daddr.nip_addr_field32, + inet->inet_dport); +} + +/* Bind local ports randomly */ +int ninet_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk) +{ + u64 port_offset = 0; + + if (!inet_sk(sk)->inet_num) + port_offset = ninet_sk_port_offset(sk); + + return __inet_hash_connect(death_row, sk, port_offset, + __ninet_check_established); +} + diff --git a/code/net/newip/nip_addr.c b/code/net/newip/nip_addr.c new file mode 100644 index 0000000000000000000000000000000000000000..fb41473f3fcffd83211092efd04c42a302aa9939 --- /dev/null +++ b/code/net/newip/nip_addr.c @@ -0,0 +1,270 @@ +// SPDX-License-Identifier: BSD-2-Clause +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, this list + * of conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include + +/* This is similar to 0.0.0.0 in IPv4. Does not appear as a real address, + * just a constant used by the native for special processing + */ +const struct nip_addr nip_any_addr = { + .bitlen = NIP_ADDR_BIT_LEN_16, + .nip_addr_field8[0] = 0xFF, /* 0xFF09 addr, big-endian */ + .nip_addr_field8[1] = 0x09, +}; + +const struct nip_addr nip_broadcast_addr_arp = { + .bitlen = NIP_ADDR_BIT_LEN_16, + .nip_addr_field8[0] = 0xFF, /* 0xFF04 addr, big-endian */ + .nip_addr_field8[1] = 0x04, +}; + +/* Short address range: + * 【1-byte】0 ~ 220 + * 00 ~ DC + * + * 【2-byte】221 ~ 5119 + * DD/DE/.../F0 is a 2-byte address descriptor followed by the address value + * DDDD ~ DDFF : 221 ~ 255 + * DE00 ~ DEFF : 256 ~ 511 + * DF00 ~ DFFF : 512 ~ 767 + * ... 
+ * F000 ~ F0FF : 4864 ~ 5119
+ *
+ * 【3-byte】5120 ~ 65535
+ * F1 is a 3-byte address descriptor followed by the address value
+ * F1 1400 ~ F1 FFFF
+ *
+ * 【5-byte】65536 ~ 4,294,967,295
+ * F2 is a 5-byte address descriptor followed by the address value
+ * F2 0001 0000 ~ F2 FFFF FFFF
+ *
+ * 【7-byte】4,294,967,296 ~ 281,474,976,710,655
+ * F3 is a 7-byte address descriptor followed by the address value
+ * F3 0001 0000 0000 ~ F3 FFFF FFFF FFFF
+ *
+ * 【9-byte】281,474,976,710,656 ~ xxxx
+ * F4 is a 9-byte address descriptor followed by the address value
+ * F4 0001 0000 0000 0000 ~ F4 FFFF FFFF FFFF FFFF
+ *
+ * 0xFF00 - The loopback address
+ * 0xFF01 - Public address for access authentication
+ * 0xFF02 - Public address of access authentication
+ * 0xFF03 - Neighbor discovery public address
+ * 0xFF04 - Address resolution (ARP)
+ * 0xFF05 - DHCP public address
+ * 0xFF06 - Public address for minimalist access authentication
+ * 0xFF07 - Self-organizing protocol public address
+ * 0xFF08 - The IEEE EUI-64 address
+ * 0xFF09 - any_addr
+ */
+int nip_addr_invalid(const struct nip_addr *addr)
+{
+	unsigned char first_byte, second_byte, third_byte;
+	int addr_len, i, err;
+
+	first_byte = addr->nip_addr_field8[NIP_8BIT_ADDR_INDEX_0];
+	second_byte = addr->nip_addr_field8[NIP_8BIT_ADDR_INDEX_1];
+	third_byte = addr->nip_addr_field8[NIP_8BIT_ADDR_INDEX_2];
+	addr_len = addr->bitlen / NIP_ADDR_BIT_LEN_8;
+
+	/* The value of the field after the effective length of the short address should be 0 */
+	for (i = addr_len; i < NIP_8BIT_ADDR_INDEX_MAX; i++) {
+		if (addr->nip_addr_field8[i] > 0x00) {
+			/* newip bitlen error */
+			err = 1;
+			return err;
+		}
+	}
+
+	if (first_byte <= ADDR_FIRST_DC && addr_len == NIP_ADDR_LEN_1) {
+		err = 0;
+	} else if (first_byte <= ADDR_FIRST_F0 && addr_len == NIP_ADDR_LEN_2) {
+		if (first_byte > ADDR_FIRST_DC + 1 ||
+		    second_byte >= ADDR_SECOND_MIN_DD) {
+			err = 0;
+		} else {
+			/* addr2 is not valid */
+			err = 1;
+		}
+	} else if (first_byte == ADDR_FIRST_F1 && addr_len == NIP_ADDR_LEN_3) {
+		if (second_byte >= ADDR_SECOND_MIN_F1) {
+			err = 0;
+		} else {
+			/* addr3 is not valid */
+			err = 1;
+		}
+	} else if (first_byte == ADDR_FIRST_F2 && addr_len == NIP_ADDR_LEN_5) {
+		if (second_byte > 0 || third_byte >= ADDR_THIRD_MIN_F2) {
+			err = 0;
+		} else {
+			/* addr5 is not valid */
+			err = 1;
+		}
+	} else if (first_byte == ADDR_FIRST_FF && addr_len == NIP_ADDR_LEN_2) {
+		err = 0;
+	} else {
+		/* addr check fail */
+		err = 1;
+	}
+	return err;
+}
+
+/* 0xFF00 - The loopback address
+ * 0xFF01 - Public address for access authentication
+ * 0xFF02 - Public address of access authentication
+ * 0xFF03 - Neighbor discovery public address
+ * 0xFF04 - Address resolution (ARP)
+ * 0xFF05 - DHCP public address
+ * 0xFF06 - Public address for minimalist access authentication
+ * 0xFF07 - Self-organizing protocol public address
+ * 0xFF08 - The IEEE EUI-64 address
+ * 0xFF09 - any_addr
+ */
+int nip_addr_public(const struct nip_addr *addr)
+{
+	if (addr->bitlen == NIP_ADDR_BIT_LEN_16 &&
+	    addr->nip_addr_field8[NIP_8BIT_ADDR_INDEX_0] == ADDR_FIRST_FF)
+		return 1;
+	else
+		return 0;
+}
+
+/* Judge whether the nip_addr is equal to 0xFF09 */
+int nip_addr_any(const struct nip_addr *ad)
+{
+	int result = 0;
+
+	if (ad->bitlen == NIP_ADDR_BIT_LEN_16) {
+		if (ad->nip_addr_field16[0] == nip_any_addr.nip_addr_field16[0] &&
+		    ad->nip_addr_field16[1] == nip_any_addr.nip_addr_field16[1])
+			result = 1;
+	}
+	return result;
+}
+
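The variable-length encoding documented above is driven entirely by the first byte of an address. A stand-alone sketch of that dispatch, mirroring get_nip_addr_len() below; the numeric prefix values are an assumption read off the comment table, since the kernel's ADDR_FIRST_* macros are defined in a header outside this patch:

#include <stdio.h>

/* First-byte prefixes as implied by the comment table above */
#define FIRST_DC 0xDC	/* 1-byte addresses end here */
#define FIRST_F0 0xF0	/* DD..F0 introduce 2-byte addresses */
#define FIRST_F1 0xF1	/* 3-byte descriptor */
#define FIRST_F2 0xF2	/* 5-byte descriptor */
#define FIRST_FF 0xFF	/* 2-byte public/special addresses */

/* Mirror of get_nip_addr_len(): total encoded length from the first byte */
static int nip_len_from_first_byte(unsigned char b)
{
	if (b <= FIRST_DC)
		return 1;
	if ((b > FIRST_DC && b <= FIRST_F0) || b == FIRST_FF)
		return 2;
	if (b == FIRST_F1)
		return 3;
	if (b == FIRST_F2)
		return 5;
	return 0;	/* unknown prefix */
}

int main(void)
{
	unsigned char firsts[] = { 0x7B, 0xDE, 0xF1, 0xF2, 0xFF };

	for (unsigned int i = 0; i < sizeof(firsts); i++)
		printf("first byte 0x%02X -> %d byte(s)\n",
		       firsts[i], nip_len_from_first_byte(firsts[i]));
	return 0;
}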
+int get_nip_addr_len(const struct nip_addr *addr)
+{
+	int len = 0;
+
+	if (addr->nip_addr_field8[0] <= ADDR_FIRST_DC)
+		len = NIP_ADDR_LEN_1;
+	else if ((addr->nip_addr_field8[0] > ADDR_FIRST_DC &&
+		  addr->nip_addr_field8[0] <= ADDR_FIRST_F0) ||
+		 addr->nip_addr_field8[0] == ADDR_FIRST_FF)
+		len = NIP_ADDR_LEN_2;
+	else if (addr->nip_addr_field8[0] == ADDR_FIRST_F1)
+		len = NIP_ADDR_LEN_3;
+	else if (addr->nip_addr_field8[0] == ADDR_FIRST_F2)
+		len = NIP_ADDR_LEN_5;
+	else
+		return 0;
+	return len;
+}
+
+unsigned char *build_nip_addr(const struct nip_addr *addr, unsigned char *buf)
+{
+	unsigned char *p = buf;
+	int i;
+
+	if (addr->nip_addr_field8[0] <= ADDR_FIRST_DC) {
+		*p = addr->nip_addr_field8[0];
+	} else if (((addr->nip_addr_field8[0] > ADDR_FIRST_DC) &&
+		    (addr->nip_addr_field8[0] <= ADDR_FIRST_F0)) ||
+		   (addr->nip_addr_field8[0] == ADDR_FIRST_FF)) {
+		*p = addr->nip_addr_field8[0];
+		p++;
+		*p = addr->nip_addr_field8[NIP_8BIT_ADDR_INDEX_1];
+	} else if (addr->nip_addr_field8[0] == ADDR_FIRST_F1) {
+		for (i = 0; i < NIP_ADDR_LEN_2; i++) {
+			*p = addr->nip_addr_field8[i];
+			p++;
+		}
+		*p = addr->nip_addr_field8[NIP_8BIT_ADDR_INDEX_2];
+	} else if (addr->nip_addr_field8[0] == ADDR_FIRST_F2) {
+		for (i = 0; i < NIP_ADDR_LEN_4; i++) {
+			*p = addr->nip_addr_field8[i];
+			p++;
+		}
+		*p = addr->nip_addr_field8[NIP_8BIT_ADDR_INDEX_4];
+	} else {
+		return NULL;
+	}
+
+	return ++p;
+}
+
+unsigned char *decode_nip_addr(unsigned char *buf, struct nip_addr *addr)
+{
+	unsigned char *p = buf;
+	int i;
+
+	if (*p <= ADDR_FIRST_DC) {
+		addr->nip_addr_field8[0] = *p;
+		p++;
+		addr->bitlen = NIP_ADDR_BIT_LEN_8;
+	} else if (*p > ADDR_FIRST_DC && *p <= ADDR_FIRST_F0) {
+		if (*p > ADDR_FIRST_DC + 1 || *(p + 1) >= ADDR_SECOND_MIN_DD) {
+			addr->nip_addr_field8[0] = *p;
+			p++;
+			addr->nip_addr_field8[1] = *p;
+			p++;
+			addr->bitlen = NIP_ADDR_BIT_LEN_16;
+		} else {
+			return NULL;
+		}
+	} else if (*p == ADDR_FIRST_F1) {
+		if (*(p + 1) >= ADDR_SECOND_MIN_F1) {
+			for (i = 0; i < NIP_ADDR_LEN_3; i++) {
+				addr->nip_addr_field8[i] = *p;
+				p++;
+			}
+			addr->bitlen = NIP_ADDR_BIT_LEN_24;
+		} else {
+			return NULL;
+		}
+	} else if (*p == ADDR_FIRST_F2) {
+		if (*(p + 1) > 0 || *(p + 2) >= ADDR_THIRD_MIN_F2) { /* offset 2 */
+			for (i = 0; i < NIP_ADDR_LEN_5; i++) {
+				addr->nip_addr_field8[i] = *p;
+				p++;
+			}
+			addr->bitlen = NIP_ADDR_BIT_LEN_40;
+		} else {
+			return NULL;
+		}
+	} else if (*p == ADDR_FIRST_FF) {
+		addr->nip_addr_field8[0] = *p;
+		p++;
+		addr->nip_addr_field8[1] = *p;
+		p++;
+		addr->bitlen = NIP_ADDR_BIT_LEN_16;
+	} else {
+		return NULL;
+	}
+
+	return p;
+}
+
diff --git a/code/net/newip/nip_addrconf.c b/code/net/newip/nip_addrconf.c
new file mode 100644
index 0000000000000000000000000000000000000000..f8ac0fe5dc8b53fc36d1ad886b61eee8d595f26b
--- /dev/null
+++ b/code/net/newip/nip_addrconf.c
@@ -0,0 +1,886 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022 Huawei Device Co., Ltd.
+ * + * NewIP Address [auto]configuration + * Linux NewIP INET implementation + * + * Based on net/ipv6/addrconf.c + */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nip_hdr.h" + +#define INFINITY_LIFE_TIME 0xFFFFFFFF + +/* Configured unicast address hash table */ +static struct hlist_head ninet_addr_lst[NIN_ADDR_HSIZE]; +static DEFINE_SPINLOCK(addrconf_hash_lock); + +static bool nip_chk_same_addr(struct net *net, const struct nip_addr *addr, + struct net_device *dev); +static int nip_get_firstaddr(const struct net_device *dev, + struct nip_addr *addr); +static int nip_addrconf_ifdown(struct net_device *dev, int how); + +static struct nip_devconf newip_devconf_dflt __read_mostly = { + .forwarding = 0, + .mtu = NIP_MIN_MTU, + .disable_nip = 0, + .ignore_routes_with_linkdown = 0, +}; + +/* Check if link is ready: is it up and is a valid qdisc available */ +static inline bool nip_addrconf_link_ready(const struct net_device *dev) +{ + return netif_oper_up(dev) && !qdisc_tx_is_noop(dev); +} + +static void nip_link_dev_addr(struct ninet_dev *idev, struct ninet_ifaddr *ifp) +{ + list_add_tail(&ifp->if_list, &idev->addr_list); +} + +static u32 ninet_addr_hash(const struct nip_addr *addr) +{ + return hash_32(nip_addr_hash(addr), NIN_ADDR_HSIZE_SHIFT); +} + +static struct ninet_ifaddr *nip_add_addr(struct ninet_dev *idev, + const struct nip_addr *addr, + u32 flags, u32 valid_lft, + u32 preferred_lft) +{ + struct ninet_ifaddr *ifa = NULL; + struct nip_rt_info *rt = NULL; + unsigned int hash; + int err = 0; + + rcu_read_lock_bh(); + + nin_dev_hold(idev); + + if (idev->dead) { + err = -ENODEV; + goto out2; + } + + if (idev->cnf.disable_nip) { + err = -EACCES; + goto out2; + } + + spin_lock(&addrconf_hash_lock); + + /* Do not configure two same addresses in a netdevice */ + if (nip_chk_same_addr(dev_net(idev->dev), addr, idev->dev)) { + DEBUG("%s: already assigned\n", __func__); + err = -EEXIST; + goto out; + } + + ifa = kzalloc(sizeof(*ifa), GFP_ATOMIC); + if (!ifa) { + DEBUG("%s: malloc failed\n", __func__); + err = -ENOBUFS; + goto out; + } + + rt = nip_addrconf_dst_alloc(idev, addr); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto out; + } + + neigh_parms_data_state_setall(idev->nd_parms); + + ifa->addr = *addr; + + spin_lock_init(&ifa->lock); + INIT_HLIST_NODE(&ifa->addr_lst); + ifa->flags = flags; + ifa->valid_lft = valid_lft; + ifa->preferred_lft = preferred_lft; + ifa->tstamp = jiffies; + ifa->cstamp = ifa->tstamp; + + ifa->rt = rt; + + ifa->idev = idev; + refcount_set(&ifa->refcnt, 1); + + /* Add to big hash table */ + hash = ninet_addr_hash(addr); + + hlist_add_head_rcu(&ifa->addr_lst, &ninet_addr_lst[hash]); + spin_unlock(&addrconf_hash_lock); + + write_lock(&idev->lock); + /* Add to ninet_dev unicast addr list. */ + nip_link_dev_addr(idev, ifa); + + nin_ifa_hold(ifa); + write_unlock(&idev->lock); + +out2: + rcu_read_unlock_bh(); + + if (likely(err == 0)) { + DEBUG("%s: success! 
idev->refcnt=%u\n", __func__,
+		      refcount_read(&idev->refcnt));
+	} else {
+		kfree(ifa);
+		nin_dev_put(idev);
+		ifa = ERR_PTR(err);
+	}
+
+	return ifa;
+out:
+	spin_unlock(&addrconf_hash_lock);
+	goto out2;
+}
+
+static struct ninet_dev *nip_add_dev(struct net_device *dev)
+{
+	struct ninet_dev *ndev;
+	int err = -ENOMEM;
+
+	ASSERT_RTNL();
+
+	if (dev->mtu < NIP_MIN_MTU)
+		return ERR_PTR(-EINVAL);
+
+	ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
+	if (!ndev)
+		return ERR_PTR(err);
+
+	rwlock_init(&ndev->lock);
+	ndev->dev = dev;
+	INIT_LIST_HEAD(&ndev->addr_list);
+	memcpy(&ndev->cnf, dev_net(dev)->newip.devconf_dflt, sizeof(ndev->cnf));
+
+	ndev->cnf.mtu = dev->mtu;
+	ndev->nd_parms = neigh_parms_alloc(dev, &nnd_tbl);
+	if (!ndev->nd_parms) {
+		kfree(ndev);
+		return ERR_PTR(err);
+	}
+
+	/* We refer to the device */
+	dev_hold(dev);
+
+	refcount_set(&ndev->refcnt, 1);
+
+	DEBUG("%s: init ninet_dev success, set ndev->refcnt=1\n", __func__);
+
+	if (netif_running(dev) && nip_addrconf_link_ready(dev))
+		ndev->if_flags |= IF_READY;
+
+	/* protected by rtnl_lock */
+	rcu_assign_pointer(dev->nip_ptr, ndev);
+	return ndev;
+}
+
+static struct ninet_dev *nip_find_idev(struct net_device *dev)
+{
+	struct ninet_dev *idev;
+
+	ASSERT_RTNL();
+
+	idev = __nin_dev_get(dev);
+	if (!idev) {
+		idev = nip_add_dev(dev);
+		if (IS_ERR(idev))
+			return NULL;
+	}
+	return idev;
+}
+
+static struct ninet_dev *nip_addrconf_add_dev(struct net_device *dev)
+{
+	struct ninet_dev *idev;
+
+	ASSERT_RTNL();
+
+	idev = nip_find_idev(dev);
+	if (!idev)
+		return ERR_PTR(-ENOBUFS);
+
+	if (idev->cnf.disable_nip)
+		return ERR_PTR(-EACCES);
+
+	return idev;
+}
+
+/* Manual configuration of address on an interface */
+static int ninet_addr_add(struct net *net, int ifindex,
+			  const struct nip_addr *pfx,
+			  __u32 ifa_flags, __u32 preferred_lft, __u32 valid_lft)
+{
+	struct ninet_ifaddr *ifp;
+	struct ninet_dev *idev;
+	struct net_device *dev;
+	unsigned long timeout;
+	clock_t expires;
+	u32 flags;
+	__u32 ifa_flags_tmp = ifa_flags;
+	__u32 valid_lft_tmp = valid_lft;
+
+	ASSERT_RTNL();
+
+	/* check the lifetime */
+	if (!valid_lft_tmp || preferred_lft > valid_lft_tmp)
+		return -EINVAL;
+
+	dev = __dev_get_by_index(net, ifindex);
+	if (!dev)
+		return -ENODEV;
+
+	idev = nip_addrconf_add_dev(dev);	/* attach dev and idev */
+	if (IS_ERR(idev))
+		return PTR_ERR(idev);
+
+	timeout = addrconf_timeout_fixup(valid_lft_tmp, HZ);
+	if (addrconf_finite_timeout(timeout)) {
+		expires = jiffies_to_clock_t(timeout * HZ);
+		valid_lft_tmp = timeout;
+	} else {
+		expires = 0;
+		flags = 0;
+		ifa_flags_tmp |= IFA_F_PERMANENT;
+	}
+
+	timeout = addrconf_timeout_fixup(preferred_lft, HZ);
+	if (addrconf_finite_timeout(timeout)) {
+		if (timeout == 0)
+			ifa_flags_tmp |= IFA_F_DEPRECATED;
+		preferred_lft = timeout;
+	}
+
+	ifp = nip_add_addr(idev, pfx, ifa_flags_tmp,
+			   valid_lft_tmp,
+			   preferred_lft);
+	if (!IS_ERR(ifp)) {
+		nin_ifa_put(ifp);
+		nip_ins_rt(ifp->rt);
+		DEBUG("%s: success! ifp->refcnt=%u\n", __func__,
+		      refcount_read(&ifp->refcnt));
+		return 0;
+	}
+
+	return PTR_ERR(ifp);
+}
+
+/* Nobody refers to this ifaddr, destroy it */
+void ninet_ifa_finish_destroy(struct ninet_ifaddr *ifp)
+{
+	WARN_ON(!hlist_unhashed(&ifp->addr_lst));
+
+	DEBUG("%s: before idev put. idev->refcnt=%u\n", __func__,
+	      refcount_read(&ifp->idev->refcnt));
+
+	nin_dev_put(ifp->idev);
+
+	nip_rt_put(ifp->rt);
+
+	kfree_rcu(ifp, rcu);
+}
+
+static void nip_del_addr(struct ninet_ifaddr *ifp)
+{
+	int state;
+
+	ASSERT_RTNL();
+
+	spin_lock_bh(&ifp->lock);
+	state = ifp->state;
+	ifp->state = NINET_IFADDR_STATE_DEAD;
+	spin_unlock_bh(&ifp->lock);
+
+	if (state == NINET_IFADDR_STATE_DEAD)
+		goto out;
+
+	spin_lock_bh(&addrconf_hash_lock);
+	hlist_del_init_rcu(&ifp->addr_lst);
+	spin_unlock_bh(&addrconf_hash_lock);
+
+	write_lock_bh(&ifp->idev->lock);
+
+	list_del_init(&ifp->if_list);
+	__nin_ifa_put(ifp);
+
+	write_unlock_bh(&ifp->idev->lock);
+
+	if (ifp->rt) {
+		/* If the ifp->rt does not belong to any nip_fib_node,
+		 * the dst reference count does not change
+		 */
+		if (dst_hold_safe(&ifp->rt->dst))
+			nip_del_rt(ifp->rt);
+	}
+
+out:
+	nin_ifa_put(ifp);
+}
+
+static int ninet_addr_del(struct net *net, int ifindex, u32 ifa_flags,
+			  const struct nip_addr *pfx)
+{
+	struct ninet_ifaddr *ifp;
+	struct ninet_dev *idev;
+	struct net_device *dev;
+
+	dev = __dev_get_by_index(net, ifindex);
+	if (!dev)
+		return -ENODEV;
+
+	idev = __nin_dev_get(dev);
+	if (!idev)
+		return -ENXIO;
+
+	read_lock_bh(&idev->lock);
+	list_for_each_entry(ifp, &idev->addr_list, if_list) {
+		if (nip_addr_eq(pfx, &ifp->addr)) {
+			nin_ifa_hold(ifp);
+			read_unlock_bh(&idev->lock);
+
+			nip_del_addr(ifp);
+			DEBUG("%s: success!", __func__);
+			return 0;
+		}
+	}
+	read_unlock_bh(&idev->lock);
+	return -EADDRNOTAVAIL;
+}
+
+int nip_addrconf_add_ifaddr(struct net *net, void __user *arg)
+{
+	struct nip_ifreq ireq;
+	int err;
+
+	if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) {
+		DEBUG("%s: not admin, can't cfg.", __func__);
+		return -EPERM;
+	}
+
+	if (copy_from_user(&ireq, arg, sizeof(struct nip_ifreq))) {
+		DEBUG("%s: fail to copy cfg data.", __func__);
+		return -EFAULT;
+	}
+
+	if (nip_addr_invalid(&ireq.ifrn_addr)) {
+		DEBUG("%s: nip addr invalid.", __func__);
+		return -EFAULT;
+	}
+
+	if (nip_addr_public(&ireq.ifrn_addr)) {
+		DEBUG("%s: The public address cannot be configured.", __func__);
+		return -EFAULT;
+	}
+
+	rtnl_lock();
+	err = ninet_addr_add(net, ireq.ifrn_ifindex, &ireq.ifrn_addr,
+			     IFA_F_PERMANENT, INFINITY_LIFE_TIME,
+			     INFINITY_LIFE_TIME);
+	rtnl_unlock();
+	return err;
+}
+
+int nip_addrconf_del_ifaddr(struct net *net, void __user *arg)
+{
+	struct nip_ifreq ireq;
+	int err;
+
+	if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) {
+		DEBUG("%s: not admin, can't cfg.", __func__);
+		return -EPERM;
+	}
+
+	if (copy_from_user(&ireq, arg, sizeof(struct nip_ifreq))) {
+		DEBUG("%s: fail to copy cfg data.", __func__);
+		return -EFAULT;
+	}
+
+	if (nip_addr_invalid(&ireq.ifrn_addr)) {
+		DEBUG("%s: nip addr invalid.", __func__);
+		return -EFAULT;
+	}
+
+	if (nip_addr_public(&ireq.ifrn_addr)) {
+		DEBUG("%s: Public addresses cannot be deleted.", __func__);
+		return -EFAULT;
+	}
+
+	rtnl_lock();
+	err = ninet_addr_del(net, ireq.ifrn_ifindex, 0, &ireq.ifrn_addr);
+	rtnl_unlock();
+	return err;
+}
+
+static bool nip_chk_same_addr(struct net *net, const struct nip_addr *addr,
+			      struct net_device *dev)
+{
+	unsigned int hash = ninet_addr_hash(addr);
+	struct ninet_ifaddr *ifp;
+
+	hlist_for_each_entry(ifp, &ninet_addr_lst[hash], addr_lst) {
+		if (!net_eq(dev_net(ifp->idev->dev), net))
+			continue;
+		if (nip_addr_eq(&ifp->addr, addr)) {
+			if (!dev || ifp->idev->dev == dev)
+				return true;
+		}
+	}
+	return false;
+}
+
+int __nip_get_lladdr(struct ninet_dev *idev, struct nip_addr *addr, u32 banned_flags)
+{
+	struct
ninet_ifaddr *ifp; + int err = -EADDRNOTAVAIL; + + list_for_each_entry_reverse(ifp, &idev->addr_list, if_list) { + if (!(ifp->flags & banned_flags)) { + *addr = ifp->addr; + err = 0; + break; + } + } + return err; +} + +int nip_get_lladdr(struct net_device *dev, struct nip_addr *addr, u32 banned_flags) +{ + struct ninet_dev *idev; + int err = -EADDRNOTAVAIL; + + rcu_read_lock(); + idev = __nin_dev_get(dev); + if (idev) { + read_lock_bh(&idev->lock); + err = __nip_get_lladdr(idev, addr, banned_flags); + read_unlock_bh(&idev->lock); + } + rcu_read_unlock(); + return err; +} + +static int __nip_get_firstaddr(struct ninet_dev *idev, struct nip_addr *addr) +{ + struct ninet_ifaddr *ifp; + int err = -EADDRNOTAVAIL; + + list_for_each_entry(ifp, &idev->addr_list, if_list) { + *addr = ifp->addr; + err = 0; + break; + } + return err; +} + +static int nip_get_firstaddr(const struct net_device *dev, + struct nip_addr *addr) +{ + struct ninet_dev *idev; + int err = -EADDRNOTAVAIL; + + rcu_read_lock(); + idev = __nin_dev_get(dev); + if (idev) { + read_lock_bh(&idev->lock); + err = __nip_get_firstaddr(idev, addr); + read_unlock_bh(&idev->lock); + } + rcu_read_unlock(); + return err; +} + +int nip_dev_get_saddr(struct net *net, const struct net_device *dev, + const struct nip_addr *daddr, struct nip_addr *saddr) +{ + if (!dev || !saddr) + return -EADDRNOTAVAIL; + + return nip_get_firstaddr(dev, saddr); +} + +static int nip_addrconf_notify(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct ninet_dev *idev = __nin_dev_get(dev); + struct net *net = dev_net(dev); + + switch (event) { + case NETDEV_REGISTER: + if (!idev && dev->mtu >= NIP_MIN_MTU) { + DEBUG("NIP_ADDRCONF(NETDEV_REGISTER): "); + idev = nip_add_dev(dev); + if (IS_ERR(idev)) + return notifier_from_errno(PTR_ERR(idev)); + } + break; + + case NETDEV_CHANGEMTU: + /* if MTU under NIP_MIN_MTU stop New IP on this interface. */ + if (dev->mtu < NIP_MIN_MTU) { + nip_addrconf_ifdown(dev, dev != net->loopback_dev); + break; + } + + if (idev) { + idev->cnf.mtu = dev->mtu; + break; + } + + /* allocate new idev */ + idev = nip_add_dev(dev); + if (IS_ERR(idev)) + break; + + /* device is still not ready */ + if (!(idev->if_flags & IF_READY)) + break; + + fallthrough; + case NETDEV_UP: + case NETDEV_CHANGE: + if (dev->flags & IFF_SLAVE) + break; + + if (idev && idev->cnf.disable_nip) + break; + + if (event == NETDEV_UP) { + if (!nip_addrconf_link_ready(dev)) { + /* device is not ready yet. */ + DEBUG("NIP_ADDRCONF(NETDEV_UP): "); + DEBUG("%s:link is not ready\n", dev->name); + break; + } + + if (!idev && dev->mtu >= NIP_MIN_MTU) + idev = nip_add_dev(dev); + + if (!IS_ERR_OR_NULL(idev)) + idev->if_flags |= IF_READY; + } else if (event == NETDEV_CHANGE) { + if (!nip_addrconf_link_ready(dev)) { + /* device is still not ready. */ + break; + } + + if (idev) + idev->if_flags |= IF_READY; + + DEBUG("NIP_ADDRCONF(NETDEV_CHANGE):"); + DEBUG("%s:link becomes ready\n", dev->name); + } + + if (!IS_ERR_OR_NULL(idev)) { + /* If the MTU changed during the interface down, + * when the interface up, the changed MTU must be + * reflected in the idev as well as routers. + */ + if (idev->cnf.mtu != dev->mtu && + dev->mtu >= NIP_MIN_MTU) { + idev->cnf.mtu = dev->mtu; + } + idev->tstamp = jiffies; + + /* If the changed mtu during down is lower than + * NIP_MIN_MTU stop New IP on this interface. 
+ */ + if (dev->mtu < NIP_MIN_MTU) + nip_addrconf_ifdown(dev, + dev != net->loopback_dev); + } + break; + + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + /* Remove all addresses from this interface. */ + nip_addrconf_ifdown(dev, event != NETDEV_DOWN); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static int nip_addrconf_ifdown(struct net_device *dev, int how) +{ + struct net *net = dev_net(dev); + struct ninet_dev *idev = __nin_dev_get(dev); + struct ninet_ifaddr *ifa, *tmp; + struct list_head del_list; + int state, i; + + ASSERT_RTNL(); + + nip_rt_ifdown(net, dev); + neigh_ifdown(&nnd_tbl, dev); + if (!idev) + return -ENODEV; + + /* Step 1: remove reference to newip device from parent device. + * Do not dev_put! + */ + if (how) { + idev->dead = 1; + + /* protected by rtnl_lock */ + RCU_INIT_POINTER(dev->nip_ptr, NULL); + } + + /* Step 2: clear hash table */ + for (i = 0; i < NIN_ADDR_HSIZE; i++) { + struct hlist_head *h = &ninet_addr_lst[i]; + + spin_lock_bh(&addrconf_hash_lock); + hlist_for_each_entry_rcu(ifa, h, addr_lst) { + if (ifa->idev == idev) + hlist_del_init_rcu(&ifa->addr_lst); + } + spin_unlock_bh(&addrconf_hash_lock); + } + + write_lock_bh(&idev->lock); + + /* Step 2: clear flags for stateless addrconf */ + if (!how) + idev->if_flags &= ~(IF_RS_SENT | IF_RA_RCVD | IF_READY); + + /* Step 3: clear addr list in idev */ + INIT_LIST_HEAD(&del_list); + list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) { + list_move(&ifa->if_list, &del_list); + + write_unlock_bh(&idev->lock); + spin_lock_bh(&ifa->lock); + + state = ifa->state; + ifa->state = NINET_IFADDR_STATE_DEAD; + + spin_unlock_bh(&ifa->lock); + write_lock_bh(&idev->lock); + } + write_unlock_bh(&idev->lock); + + /* now clean up addresses to be removed */ + while (!list_empty(&del_list)) { + ifa = list_first_entry(&del_list, struct ninet_ifaddr, if_list); + list_del(&ifa->if_list); + nin_ifa_put(ifa); + } + + /* Last: Shot the device (if unregistered) */ + if (how) { + neigh_parms_release(&nnd_tbl, idev->nd_parms); + neigh_ifdown(&nnd_tbl, dev); + DEBUG("%s: before idev put. idev->refcnt=%u\n", __func__, + refcount_read(&idev->refcnt)); + nin_dev_put(idev); + } + return 0; +} + +static int nip_addr_proc_show(struct seq_file *seq, void *v) +{ + struct net *net = seq->private; + struct ninet_ifaddr *ifp; + int i, j; + + rcu_read_lock(); + for (i = 0; i < NIN_ADDR_HSIZE; i++) { + hlist_for_each_entry_rcu(ifp, &ninet_addr_lst[i], addr_lst) { + if (!net_eq(dev_net(ifp->idev->dev), net)) + continue; + + for (j = 0; j < ifp->addr.bitlen / NIP_ADDR_BIT_LEN_8; + j++) { + seq_printf(seq, "%02x", + ifp->addr.nip_addr_field8[j]); + } + seq_printf(seq, "\t%8s\n", + ifp->idev->dev ? 
ifp->idev->dev->name : ""); + } + } + rcu_read_unlock(); + return 0; +} + +static int __net_init nip_addr_net_init(struct net *net) +{ + int err = -ENOMEM; + struct nip_devconf *dflt; + + dflt = kmemdup(&newip_devconf_dflt, + sizeof(newip_devconf_dflt), + GFP_KERNEL); + if (!dflt) + goto err_alloc_dflt; + + net->newip.devconf_dflt = dflt; + + if (!proc_create_net_single("nip_addr", 0444, net->proc_net, + nip_addr_proc_show, NULL)) { + goto err_addr_proc; + } + + return 0; + +err_addr_proc: + kfree(dflt); +err_alloc_dflt: + return err; +} + +static void __net_exit nip_addr_net_exit(struct net *net) +{ + kfree(net->newip.devconf_dflt); + remove_proc_entry("nip_addr", net->proc_net); +} + +static struct pernet_operations nip_route_proc_net_ops = { + .init = nip_addr_net_init, + .exit = nip_addr_net_exit, +}; + +/* addrconf module should be notified of a device going up + */ +static struct notifier_block nip_dev_notf = { + .notifier_call = nip_addrconf_notify, + .priority = ADDRCONF_NOTIFY_PRIORITY, +}; + +int __init nip_addrconf_init(void) +{ + int err; + + err = register_pernet_subsys(&nip_route_proc_net_ops); + if (err < 0) { + DEBUG("%s: register_pernet_subsys failed!\n", __func__); + goto out; + } + + register_netdevice_notifier(&nip_dev_notf); + +out: + return err; +} + +void nip_addrconf_cleanup(void) +{ + struct net_device *dev; + int i; + + unregister_netdevice_notifier(&nip_dev_notf); + unregister_pernet_subsys(&nip_route_proc_net_ops); + + rtnl_lock(); + + /* clean dev list */ + for_each_netdev(&init_net, dev) { + if (!__nin_dev_get(dev)) + continue; + nip_addrconf_ifdown(dev, 1); + } + + /* Check hash table. */ + spin_lock_bh(&addrconf_hash_lock); + for (i = 0; i < NIN_ADDR_HSIZE; i++) + WARN_ON(!hlist_empty(&ninet_addr_lst[i])); + spin_unlock_bh(&addrconf_hash_lock); + rtnl_unlock(); +} + +static int ninet_addr_get(const struct net_device *dev, struct ninet_ifaddr *ifa) +{ + int err; + struct nip_addr addr; + + err = nip_get_firstaddr(dev, &addr); + if (err) + return err; + ifa->addr = addr; + + return err; +} + +int nip_addrconf_get_ifaddr(struct net *net, unsigned int cmd, void __user *arg) +{ + struct nip_devreq ifr; + struct sockaddr_nin *snin; + struct ninet_ifaddr ifa; + struct net_device *dev; + void __user *p = (void __user *)arg; + int ret = -EFAULT; + + if (copy_from_user(&ifr, p, sizeof(struct nip_ifreq))) + goto out; + + ifr.nip_ifr_name[IFNAMSIZ - 1] = 0; + snin = (struct sockaddr_nin *)&ifr.nip_dev_addr; + + DEBUG("%s, dev name is %s", __func__, ifr.nip_ifr_name); + dev_load(net, ifr.nip_ifr_name); + + if (cmd == SIOCGIFADDR) { + memset(snin, 0, sizeof(*snin)); + snin->sin_family = AF_NINET; + } else { + goto out; + } + + rtnl_lock(); + + dev = __dev_get_by_name(net, ifr.nip_ifr_name); + if (!dev) + goto done; + + ret = ninet_addr_get(dev, &ifa); + if (ret) + goto done; + /* Get interface address */ + snin->sin_addr = ifa.addr; + + if (copy_to_user(p, &ifr, sizeof(struct nip_devreq))) + ret = -EFAULT; + +done: + rtnl_unlock(); +out: + return ret; +} diff --git a/code/net/newip/nip_addrconf_core.c b/code/net/newip/nip_addrconf_core.c new file mode 100644 index 0000000000000000000000000000000000000000..12d104dde28a2b834dca8c9c66a150fdef00047e --- /dev/null +++ b/code/net/newip/nip_addrconf_core.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * NewIP library code, needed by static components when full NewIP support is + * not configured or static. 
+ * + * Based on net/ipv6/addrconf_core.c + */ +#include +#include +#include +#include + +static void nin_dev_finish_destroy_rcu(struct rcu_head *head) +{ + struct ninet_dev *idev = container_of(head, struct ninet_dev, rcu); + + kfree(idev); +} + +void nin_dev_finish_destroy(struct ninet_dev *idev) +{ + struct net_device *dev = idev->dev; + + WARN_ON(!list_empty(&idev->addr_list)); + + dev_put(dev); + if (!idev->dead) { + DEBUG(KERN_WARNING "Freeing alive ninet device.\n"); + return; + } + call_rcu(&idev->rcu, nin_dev_finish_destroy_rcu); +} + diff --git a/code/net/newip/nip_checksum.c b/code/net/newip/nip_checksum.c new file mode 100644 index 0000000000000000000000000000000000000000..437e52bdfb2d5d5ac7bae700a20c7f374d45ae9a --- /dev/null +++ b/code/net/newip/nip_checksum.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: BSD-2-Clause +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, this list + * of conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+#include
+#include "nip_hdr.h"
+#include "nip_checksum.h"
+
+#define USHORT_PAYLOAD 16
+#define NIP_CHECKSUM_UINT8_PAYLOAD 8
+unsigned int _nip_check_sum(unsigned char *data, unsigned short data_len)
+{
+	unsigned int i = 0, sum = 0;
+
+	while (i + 1 < data_len) {
+		sum += (data[i] << NIP_CHECKSUM_UINT8_PAYLOAD) + data[i + 1];
+		i += 2; /* Offset 2 bytes */
+	}
+
+	if (i < (unsigned int)data_len)
+		sum += (data[i] << NIP_CHECKSUM_UINT8_PAYLOAD);
+
+	return sum;
+}
+
+unsigned int _nip_header_chksum(struct nip_pseudo_header *chksum_header)
+{
+	int i, j;
+	int addr_len;
+	unsigned char pseudo_header[NIP_HDR_MAX] = {0};
+	unsigned short hdr_len = 0;
+
+	addr_len = chksum_header->saddr.bitlen / NIP_ADDR_BIT_LEN_8;
+	if (addr_len) {
+		j = 0;
+		for (i = 0; i < addr_len; i++, j++)
+			pseudo_header[j] = chksum_header->saddr.nip_addr_field8[i];
+		hdr_len += addr_len;
+	}
+
+	addr_len = chksum_header->daddr.bitlen / NIP_ADDR_BIT_LEN_8;
+	if (addr_len) {
+		j = hdr_len;
+		for (i = 0; i < addr_len; i++, j++)
+			pseudo_header[j] = chksum_header->daddr.nip_addr_field8[i];
+		hdr_len += addr_len;
+	}
+
+	/* chksum_header->check_len is in network byte order (big-endian) */
+	*(unsigned short *)(pseudo_header + hdr_len) = chksum_header->check_len;
+	hdr_len += sizeof(chksum_header->check_len);
+	*(pseudo_header + hdr_len) = chksum_header->nexthdr;
+	hdr_len += sizeof(chksum_header->nexthdr);
+
+	return _nip_check_sum(pseudo_header, hdr_len);
+}
+
+/* The checksum is calculated when the packet is received
+ * Note:
+ * 1. chksum_header->check_len is in network byte order (big-endian).
+ * 2. check_len is in host byte order.
+ */
+unsigned short nip_check_sum_parse(unsigned char *data,
+				   unsigned short check_len,
+				   struct nip_pseudo_header *chksum_header)
+{
+	unsigned int sum = 0;
+
+	sum = _nip_check_sum(data, check_len);
+	sum += _nip_header_chksum(chksum_header);
+
+	while (sum >> USHORT_PAYLOAD)
+		sum = (sum >> USHORT_PAYLOAD) + (sum & 0xffff);
+	return (unsigned short)sum;
+}
+
+/* The checksum is calculated when the packet is sent
+ * Note:
+ * 1. chksum_header->check_len is in network byte order (big-endian).
+ * 2. data_len is in host byte order.
+ */
+unsigned short nip_check_sum_build(unsigned char *data,
+				   unsigned short data_len,
+				   struct nip_pseudo_header *chksum_header)
+{
+	unsigned int sum = 0;
+
+	sum = _nip_check_sum(data, data_len);
+	sum += _nip_header_chksum(chksum_header);
+
+	while (sum >> USHORT_PAYLOAD)
+		sum = (sum >> USHORT_PAYLOAD) + (sum & 0xffff);
+	return (unsigned short)(~sum);
+}
+
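nip_check_sum_build() returns the ones'-complement of the folded 16-bit sum, so re-running the same sum over the data with the checksum included must fold to 0xFFFF. A stand-alone round-trip sketch of that property (user space, pseudo header omitted for brevity):

#include <stdio.h>

/* Same folding sum as _nip_check_sum(): big-endian 16-bit words */
static unsigned int sum16(const unsigned char *data, unsigned short len)
{
	unsigned int i = 0, sum = 0;

	while (i + 1 < len) {
		sum += (data[i] << 8) + data[i + 1];
		i += 2;
	}
	if (i < len)
		sum += data[i] << 8;
	return sum;
}

/* End-around-carry fold of a 32-bit accumulator to 16 bits */
static unsigned short fold(unsigned int sum)
{
	while (sum >> 16)
		sum = (sum >> 16) + (sum & 0xffff);
	return (unsigned short)sum;
}

int main(void)
{
	unsigned char pkt[6] = { 0xDE, 0x01, 0xDE, 0x02 };	/* 4-byte payload + csum */
	unsigned short csum = ~fold(sum16(pkt, 4));		/* "build" side */

	pkt[4] = csum >> 8;	/* store big-endian, like the wire format */
	pkt[5] = csum & 0xff;

	/* "parse" side: summing payload + checksum folds to 0xFFFF when intact */
	printf("verify: 0x%04X\n", fold(sum16(pkt, 6)));
	return 0;
}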
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _NIP_CHECKSUM_H +#define _NIP_CHECKSUM_H + +#include + +struct nip_pseudo_header { + struct nip_addr saddr; /* Source address, network order.(big end) */ + struct nip_addr daddr; /* Destination address, network order.(big end) */ + unsigned short check_len; /* network order.(big end) */ + unsigned char nexthdr; /* Upper-layer Protocol Type: IPPROTO_UDP */ +}; + +/* The checksum is calculated when the packet is received + * Note: + * 1.chksum_header->check_len is network order.(big end) + * 2.data_len is host order. + */ +unsigned short nip_check_sum_parse(unsigned char *data, + unsigned short check_len, + struct nip_pseudo_header *chksum_header); + +/* The checksum is calculated when the packet is sent + * Note: + * 1.chksum_header->check_len is network order.(big end) + * 2.data_len is host order. + */ +unsigned short nip_check_sum_build(unsigned char *data, + unsigned short data_len, + struct nip_pseudo_header *chksum_header); + +#endif /* _NIP_CHECKSUM_H */ + diff --git a/code/net/newip/nip_fib.c b/code/net/newip/nip_fib.c new file mode 100644 index 0000000000000000000000000000000000000000..f369620804b1aad58eea6a65bf897857fb527c8c --- /dev/null +++ b/code/net/newip/nip_fib.c @@ -0,0 +1,295 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. 
+ *
+ * Linux NewIP INET implementation
+ * Forwarding Information Database
+ *
+ * Based on net/ipv6/ip6_fib.c
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+
+static struct kmem_cache *nip_fib_node_kmem __read_mostly;
+
+struct nip_fib_table *nip_fib_get_table(struct net *net, u32 id)
+{
+	if (id == NIP_RT_TABLE_MAIN)
+		return net->newip.nip_fib_main_tbl;
+	else if (id == NIP_RT_TABLE_LOCAL)
+		return net->newip.nip_fib_local_tbl;
+	else
+		return NULL;
+}
+
+static struct nip_fib_node *nip_node_alloc(void)
+{
+	struct nip_fib_node *fn;
+
+	fn = kmem_cache_zalloc(nip_fib_node_kmem, GFP_ATOMIC);
+
+	return fn;
+}
+
+void nip_rt_free_pcpu(struct nip_rt_info *non_pcpu_rt)
+{
+	int cpu;
+
+	if (!non_pcpu_rt->rt_pcpu)
+		return;
+
+	for_each_possible_cpu(cpu) {
+		struct nip_rt_info **ppcpu_rt;
+		struct nip_rt_info *pcpu_rt;
+
+		ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt_pcpu, cpu);
+		pcpu_rt = *ppcpu_rt;
+		if (pcpu_rt) {
+			dst_dev_put(&pcpu_rt->dst);
+			dst_release(&pcpu_rt->dst);
+			*ppcpu_rt = NULL;
+		}
+	}
+
+	free_percpu(non_pcpu_rt->rt_pcpu);
+	non_pcpu_rt->rt_pcpu = NULL;
+}
+
+static u32 ninet_route_hash(const struct nip_addr *addr)
+{
+	return hash_32(nip_addr_hash(addr), NIN_ROUTE_HSIZE_SHIFT);
+}
+
+struct nip_fib_node *nip_fib_locate(struct hlist_head *nip_tb_head,
+				    const struct nip_addr *daddr)
+{
+	struct nip_fib_node *fib_node;
+	struct hlist_head *h;
+	unsigned int hash;
+
+	hash = ninet_route_hash(daddr);
+	h = &nip_tb_head[hash];
+
+	hlist_for_each_entry_rcu(fib_node, h, fib_hlist) {
+		if (nip_addr_eq(&fib_node->nip_route_info->rt_dst, daddr))
+			return fib_node;
+	}
+
+	/* find default route */
+	hash = ninet_route_hash(&nip_any_addr);
+	h = &nip_tb_head[hash];
+
+	hlist_for_each_entry_rcu(fib_node, h, fib_hlist) {
+		if (nip_addr_eq
+		    (&fib_node->nip_route_info->rt_dst, &nip_any_addr)) {
+			return fib_node;
+		}
+	}
+
+	return NULL;
+}
+
+/* nip_tb_lock must be taken to avoid racing */
+int nip_fib_add(struct hlist_head *nip_tb_head, struct nip_rt_info *rt)
+{
+	struct nip_fib_node *fib_node, *new_node;
+	int err = 0;
+	struct hlist_head *h;
+	unsigned int hash;
+
+	hash = ninet_route_hash(&rt->rt_dst);
+	h = &nip_tb_head[hash];
+
+	hlist_for_each_entry(fib_node, h, fib_hlist) {
+		if (nip_addr_eq(&fib_node->nip_route_info->rt_dst,
+				&rt->rt_dst)) {
+			err = -EEXIST;
+			goto fail;
+		}
+	}
+
+	new_node = nip_node_alloc();
+	if (!new_node) {
+		DEBUG("%s: failed to alloc memory.", __func__);
+		err = -ENOMEM;
+		goto fail;
+	}
+	new_node->nip_route_info = rt;
+	rcu_assign_pointer(rt->rt_node, new_node);
+	atomic_inc(&rt->rt_ref);
+	hlist_add_tail_rcu(&new_node->fib_hlist, h);
+
+out:
+	return err;
+
+fail:
+	dst_release_immediate(&rt->dst);
+	goto out;
+}
+
+static void nip_fib_destroy_rcu(struct rcu_head *head)
+{
+	struct nip_fib_node *fn = container_of(head, struct nip_fib_node, rcu);
+
+	nip_rt_release(fn->nip_route_info);
+	kfree(fn);
+}
+
+/* nip_tb_lock must be taken to avoid racing */
+int nip_fib_del(struct nip_rt_info *rt, struct nl_info *info)
+{
+	struct nip_fib_node *fn;
+	struct net *net = info->nl_net;
+
+	fn = rcu_dereference_protected(rt->rt_node,
+				       lockdep_is_held(&rt->rt_table->nip_tb_lock));
+	if (!fn || rt == net->newip.nip_null_entry)
+		return -ENOENT;
+
+	hlist_del_init_rcu(&fn->fib_hlist);
+
+	/* The route_info pointed to by fib_node may be freed only after
+	 * the fib_node itself has been freed
+	 */
+	RCU_INIT_POINTER(rt->rt_node, NULL);
+	call_rcu(&fn->rcu, nip_fib_destroy_rcu);
+
+	return 0;
+}
+
+static void nip_fib_free_table(struct 
nip_fib_table *table) +{ + kfree(table); +} + +/* caller must hold nip_tb_lock */ +static void nip_fib_clean_hash(struct net *net, struct hlist_head *nip_tb_head, + int (*func)(struct nip_rt_info *, void *arg), + void *arg) +{ + int i; + struct nip_fib_node *fn; + struct hlist_node *tmp; + struct nl_info info = { + .nl_net = net, + }; + + for (i = 0; i < NIN_ROUTE_HSIZE; i++) { + struct hlist_head *h = &nip_tb_head[i]; + + hlist_for_each_entry_safe(fn, tmp, h, fib_hlist) { + if (func(fn->nip_route_info, arg) < 0) { + DEBUG("%s: try to del nip_rt_info\n", __func__); + nip_fib_del(fn->nip_route_info, &info); + } + } + } +} + +void nip_fib_clean_all(struct net *net, + int (*func)(struct nip_rt_info *, void *arg), void *arg) +{ + struct nip_fib_table *main_tbl = net->newip.nip_fib_main_tbl; + struct nip_fib_table *local_tbl = net->newip.nip_fib_local_tbl; + + spin_lock_bh(&main_tbl->nip_tb_lock); + nip_fib_clean_hash(net, main_tbl->nip_tb_head, func, arg); + spin_unlock_bh(&main_tbl->nip_tb_lock); + + spin_lock_bh(&local_tbl->nip_tb_lock); + nip_fib_clean_hash(net, local_tbl->nip_tb_head, func, arg); + spin_unlock_bh(&local_tbl->nip_tb_lock); +} + +static void nip_fib_link_table(struct nip_fib_table *tb) +{ + /* You need to initialize multiple routing tables */ + spin_lock_init(&tb->nip_tb_lock); +} + +static void __net_init nip_fib_tables_init(struct net *net) +{ + nip_fib_link_table(net->newip.nip_fib_main_tbl); + nip_fib_link_table(net->newip.nip_fib_local_tbl); +} + +static int __net_init nip_fib_net_init(struct net *net) +{ + net->newip.nip_fib_main_tbl = + kzalloc(sizeof(*net->newip.nip_fib_main_tbl), GFP_KERNEL); + if (!net->newip.nip_fib_main_tbl) + goto out_fib_table_hash; + + net->newip.nip_fib_main_tbl->nip_tb_id = NIP_RT_TABLE_MAIN; + net->newip.nip_fib_main_tbl->flags = 1; + + net->newip.nip_fib_local_tbl = + kzalloc(sizeof(*net->newip.nip_fib_local_tbl), GFP_KERNEL); + if (!net->newip.nip_fib_local_tbl) + goto out_main_tbl; + + net->newip.nip_fib_local_tbl->nip_tb_id = NIP_RT_TABLE_LOCAL; + + nip_fib_tables_init(net); + + return 0; + +out_main_tbl: + kfree(net->newip.nip_fib_main_tbl); +out_fib_table_hash: + return -ENOMEM; +} + +static void nip_fib_net_exit(struct net *net) +{ + nip_fib_free_table(net->newip.nip_fib_main_tbl); + nip_fib_free_table(net->newip.nip_fib_local_tbl); +} + +static struct pernet_operations nip_fib_net_ops = { + .init = nip_fib_net_init, + .exit = nip_fib_net_exit, +}; + +int __init nip_fib_init(void) +{ + int ret = -ENOMEM; + + nip_fib_node_kmem = kmem_cache_create("nip_fib_nodes", + sizeof(struct nip_fib_node), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!nip_fib_node_kmem) + goto out; + + DEBUG("nip_fib_node size is %lu\n", + sizeof(struct nip_fib_node) + sizeof(struct nip_rt_info)); + + ret = register_pernet_subsys(&nip_fib_net_ops); + if (ret) + goto out_kmem_cache_create; + +out: + return ret; + +out_kmem_cache_create: + kmem_cache_destroy(nip_fib_node_kmem); + goto out; +} + +void nip_fib_gc_cleanup(void) +{ + unregister_pernet_subsys(&nip_fib_net_ops); + kmem_cache_destroy(nip_fib_node_kmem); +} + diff --git a/code/net/newip/nip_fib_rules.c b/code/net/newip/nip_fib_rules.c new file mode 100644 index 0000000000000000000000000000000000000000..1f3481b20a463a4b6916e97a448bafc80bad2a75 --- /dev/null +++ b/code/net/newip/nip_fib_rules.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. 
+ * + * NewIP Routing Policy Rules + * + * Based on net/ipv6/fib_rules.c + * Based on net/ipv6/fib6_rules.c + */ +#include +#include +#include +#include + +struct dst_entry *nip_fib_rule_lookup(struct net *net, struct flow_nip *fln, + int flags, nip_pol_lookup_t lookup) +{ + struct nip_rt_info *rt; + + rt = lookup(net, net->newip.nip_fib_local_tbl, fln, flags); + if (rt != net->newip.nip_null_entry) + return &rt->dst; + nip_rt_put(rt); + rt = lookup(net, net->newip.nip_fib_main_tbl, fln, flags); + if (rt != net->newip.nip_null_entry) + return &rt->dst; + nip_rt_put(rt); + + dst_hold(&net->newip.nip_null_entry->dst); + return &net->newip.nip_null_entry->dst; +} diff --git a/code/net/newip/nip_hdr.h b/code/net/newip/nip_hdr.h new file mode 100644 index 0000000000000000000000000000000000000000..1622c5d9113e9cc0eb164645fdadebb0389af798 --- /dev/null +++ b/code/net/newip/nip_hdr.h @@ -0,0 +1,234 @@ +/* SPDX-License-Identifier: BSD-2-Clause */ +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, this list + * of conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+#ifndef _NEWIP_HDR_H
+#define _NEWIP_HDR_H
+
+#include
+
+/* Ethernet header 14B, +2B for byte alignment, +66 to avoid a
+ * coredump problem caused by HMAC driver SKB space expansion
+ */
+#define NIP_ETH_HDR_BASE_LEN 14
+#define NIP_ETH_HDR_LEN (NIP_ETH_HDR_BASE_LEN + 2 + 66)
+
+/* bitmap1 + bitmap2 + TTL + total len + nexthdr + daddr + saddr
+ * 1B        1B        1B    2B          1B        7B      7B    = 20B
+ * NIP_HDR_MAX can be reduced from 50 to 20
+ * V4 TCP 1448
+ * NIP TCP 1430 + 30 = 1460
+ */
+#define NIP_HDR_MAX 8 // 50 -> 8
+#define NIP_UDP_HDR_LEN 8
+#define NIP_MIN_MTU (NIP_HDR_MAX + NIP_UDP_HDR_LEN)
+#define NIP_BYTE_ALIGNMENT 2
+
+#define NIP_BITMAP_HAVE_MORE_BIT 0x01
+
+/* Bitmap 1st Byte: bit0 - bit7 */
+#define NIP_BITMAP_INVALID_SET 0x80 /* Bit 0 is set */
+#define NIP_BITMAP_INCLUDE_TTL 0x40 /* Bit 1 is set */
+#define NIP_BITMAP_INCLUDE_TOTAL_LEN 0x20 /* Bit 2 is set */
+#define NIP_BITMAP_INCLUDE_NEXT_HDR 0x10 /* Bit 3 is set */
+#define NIP_BITMAP_INCLUDE_RES1 0x08 /* Bit 4 is set */
+#define NIP_BITMAP_INCLUDE_DADDR 0x04 /* Bit 5 is set */
+#define NIP_BITMAP_INCLUDE_SADDR 0x02 /* Bit 6 is set */
+#define NIP_BITMAP_HAVE_BYTE_2 NIP_BITMAP_HAVE_MORE_BIT /* Bit 7 is set */
+
+/* Bitmap 2nd Byte: bit0 - bit7 */
+#define NIP_BITMAP_INCLUDE_HDR_LEN 0x80 /* Bit 0 is set */
+#define NIP_BITMAP_INCLUDE_RES2 0x40 /* Bit 1 is set */
+#define NIP_BITMAP_INCLUDE_RES3 0x20 /* Bit 2 is set */
+#define NIP_BITMAP_INCLUDE_RES4 0x10 /* Bit 3 is set */
+#define NIP_BITMAP_INCLUDE_RES5 0x08 /* Bit 4 is set */
+#define NIP_BITMAP_INCLUDE_RES6 0x04 /* Bit 5 is set */
+#define NIP_BITMAP_INCLUDE_RES7 0x02 /* Bit 6 is set */
+#define NIP_BITMAP_HAVE_BYTE_3 NIP_BITMAP_HAVE_MORE_BIT /* Bit 7 is set */
+
+/* Bitmap 1st Byte:
+ * | valid | ttl | total_len | next_hdr | res1 | daddr | saddr | have byte2 |
+ * | 0     | 1   | 0         | 1        | 0    | 1     | 1     | 0          |
+ */
+#define NIP_UDP_BITMAP_1 0x56
+#define NIP_UDP_BITMAP_1_INC_2 0x57
+
+/* Bitmap 1st Byte:
+ * | valid | ttl | total_len | next_hdr | res1 | daddr | saddr | have byte2 |
+ * | 0     | 1   | 1         | 1        | 0    | 1     | 1     | 0          |
+ */
+#define NIP_NORMAL_BITMAP_1 0x76
+#define NIP_NORMAL_BITMAP_1_INC_2 0x77
+
+/* Bitmap 2nd Byte:
+ * | hdr_len | res2 | res2 | res2 | res2 | res2 | res2 | have byte3 |
+ * | 0 or 1  | 0    | 0    | 0    | 0    | 0    | 0    | 0          |
+ */
+#define NIP_NODATA_BITMAP_2 0x00
+#define NIP_NORMAL_BITMAP_2 0x80
+
+/* invalid Bitmap 2nd Byte:
+ * | hdr_len | res2 | res2 | res2 | res2 | res2 | res2 | have byte3 |
+ * | 0 or 1  | 1    | 1    | 1    | 1    | 1    | 1    | 1          |
+ */
+#define NIP_INVALID_BITMAP_2 0x7F
+
+#define NIP_DEFAULT_TTL 128
+#define NIP_ARP_DEFAULT_TTL 64
+#define IPPROTO_NIP_ICMP 0xB1
+
+enum NIP_HDR_DECAP_ERR {
+	NIP_HDR_BITMAP_INVALID = 1,
+	NIP_HDR_BITMAP_NUM_OUT_RANGE = 2,
+	NIP_HDR_NO_TTL = 3,
+	NIP_HDR_NO_NEXT_HDR = 4,
+	NIP_HDR_NO_DADDR = 5,
+	NIP_HDR_DECAP_DADDR_ERR = 6,
+	NIP_HDR_DADDR_INVALID = 7,
+	NIP_HDR_DECAP_SADDR_ERR = 8,
+	NIP_HDR_SADDR_INVALID = 9,
+	NIP_HDR_RCV_BUF_READ_OUT_RANGE = 10,
+	NIP_HDR_UNKNOWN_AND_NO_HDR_LEN = 11,
+	NIP_HDR_LEN_INVALID = 12,
+	NIP_HDR_LEN_OUT_RANGE = 13,
+
+	NIP_HDR_DECAP_ERRCODE_MAX,
+};
+
+/* The newIP header contains variable-length fields.
+ * The header structure is defined only for function parameter transmission. 
+ * The fields are parsed from the original packet and saved
+ */
+struct nip_hdr_decap {
+	struct nip_addr saddr;		/* Source address, network order (big endian) */
+	struct nip_addr daddr;		/* Destination address, network order (big endian) */
+
+	unsigned char ttl;		/* Hop count limit */
+	unsigned char nexthdr;		/* Upper-layer Protocol Type: IPPROTO_UDP */
+	unsigned char hdr_len;		/* Header length carried in the packet */
+	unsigned char hdr_real_len;	/* Header length actually parsed */
+
+	unsigned short total_len;	/* Total length (header + payload), network order (big endian) */
+	unsigned short no_hdr_len : 1;	/* The header does not contain a header length field */
+	unsigned short include_unknown_bit : 1;	/* An unrecognized bitmap bit was present */
+	unsigned short include_saddr : 1;
+	unsigned short include_daddr : 1;
+	unsigned short include_ttl : 1;
+	unsigned short include_nexthdr : 1;
+	unsigned short include_hdr_len : 1;
+	unsigned short include_total_len : 1;
+	unsigned short res : 8;
+
+	unsigned int rcv_buf_len;
+};
+
+/* This structure passes parameters into and out of the header encapsulation
+ * functions; it is not the on-wire packet layout
+ */
+#define BITMAP_MAX 8
+#define RES_NUM 2
+struct nip_hdr_encap {
+	struct nip_addr daddr;		/* Destination address, network order (big endian) */
+	struct nip_addr saddr;		/* Source address, network order (big endian) */
+
+	unsigned char ttl;		/* Hop count limit */
+	unsigned char nexthdr;		/* Upper-layer Protocol Type: IPPROTO_UDP */
+	unsigned short total_len;	/* Packet header length + packet data length */
+
+	void *usr_data;			/* User data pointer */
+	unsigned int usr_data_len;	/* Length of data sent by the user */
+	unsigned int trans_hdr_len;	/* Transport layer header length */
+
+	unsigned short sport;
+	unsigned short dport;
+
+	/* The following are the output parameters */
+	unsigned char bitmap[BITMAP_MAX];	/* Bitmap currently supports a maximum of 8 bytes */
+	unsigned int bitmap_num;	/* Number of valid elements in the bitmap array */
+
+	unsigned char *hdr_buf;		/* Cache the newIP header */
+	unsigned int hdr_buf_pos;	/* Writable offset into hdr_buf */
+	unsigned short *frag_id_pos;	/* Fragment Offset in the original packet */
+	unsigned char *hdr_len_pos;	/* Position of the header length field in hdr_buf */
+	unsigned short *total_len_pos;	/* Position of the total length field in hdr_buf */
+
+	/* Flags recording which fields the header bitmap carries */
+	unsigned char encap_ttl : 1;
+	unsigned char encap_hdr_len : 1;
+	unsigned char encap_daddr : 1;
+	unsigned char encap_saddr : 1;
+	unsigned char encap_total_len : 1;
+	unsigned char encap_res : 3;
+};
+
+/* Packet segment information */
+struct nip_pkt_seg_info {
+	unsigned int mid_pkt_num;	/* Number of intermediate segments */
+	unsigned int last_pkt_num;	/* Number of last segments */
+
+	unsigned int mid_usr_pkt_len;	/* Middle segment data length (8B aligned) */
+	unsigned int last_usr_pkt_len;	/* Length of the last data segment */
+
+	unsigned char *usr_data;	/* Holds a pointer to the user's raw data */
+	unsigned int usr_data_len;	/* Length of user data read this time */
+};
+
+void nip_calc_pkt_frag_num(unsigned int mtu, unsigned int usr_data_len,
+			   struct nip_pkt_seg_info *seg_info);
+
+void nip_hdr_udp_encap(struct nip_hdr_encap *head);
+
+/* The total length must be updated after this function; call nip_update_total_len */
+void nip_hdr_comm_encap(struct nip_hdr_encap *head);
+
+/* input must be network order. 
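+ * e.g. callers in this patch do: nip_update_total_len(&head, htons(head.total_len))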
*/ +void nip_update_total_len(struct nip_hdr_encap *head, unsigned short total_len); + +/* Note: a function call requires its own byte order conversion.(niph->total_len) */ +int nip_hdr_parse(unsigned char *buf, unsigned int buf_len, struct nip_hdr_decap *niph); + +struct udp_hdr { + unsigned short sport; + unsigned short dport; + unsigned short len; + unsigned short checksum; +}; + +/* input must be network order. */ +static inline void nip_build_udp_hdr(unsigned short sport, unsigned short dport, + unsigned short len, unsigned char *buf, + unsigned short checksum) +{ + struct udp_hdr *uh; + + uh = (struct udp_hdr *)buf; + uh->sport = sport; + uh->dport = dport; + uh->len = len; + uh->checksum = checksum; +} + +#endif /* _NEWIP_HDR_H */ + diff --git a/code/net/newip/nip_hdr_decap.c b/code/net/newip/nip_hdr_decap.c new file mode 100644 index 0000000000000000000000000000000000000000..bf8743bc67490469d253c2d0c56d6f0718fb4b1b --- /dev/null +++ b/code/net/newip/nip_hdr_decap.c @@ -0,0 +1,292 @@ +// SPDX-License-Identifier: BSD-2-Clause +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, this list + * of conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+#include "nip_hdr.h"
+
+/* The packet must carry this field */
+static int _get_nip_hdr_bitmap(unsigned char *buf,
+			       unsigned char bitmap[],
+			       unsigned char bitmap_index_max)
+{
+	int i = 0;
+	unsigned char *p = buf;
+
+	if (*p & NIP_BITMAP_INVALID_SET)
+		return -NIP_HDR_BITMAP_INVALID;
+
+	do {
+		if (i >= bitmap_index_max)
+			return -NIP_HDR_BITMAP_NUM_OUT_RANGE;
+
+		bitmap[i] = *p;
+		p++;
+	} while (bitmap[i++] & NIP_BITMAP_HAVE_MORE_BIT);
+
+	return i;
+}
+
+/* The packet must carry this field */
+static int _get_nip_hdr_ttl(unsigned char *buf,
+			    unsigned char bitmap,
+			    struct nip_hdr_decap *niph)
+{
+	if (!(bitmap & NIP_BITMAP_INCLUDE_TTL))
+		return -NIP_HDR_NO_TTL;
+
+	niph->ttl = *buf;
+	niph->include_ttl = 1;
+
+	return sizeof(niph->ttl);
+}
+
+/* Optional field */
+/* Communication between devices of the same version may not carry the packet
+ * header length, but communication between devices of different versions must
+ * carry the packet header length
+ */
+static int _get_nip_hdr_len(unsigned char *buf,
+			    unsigned char bitmap,
+			    struct nip_hdr_decap *niph)
+{
+	if (!(bitmap & NIP_BITMAP_INCLUDE_HDR_LEN))
+		return 0;
+
+	/* hdr_len is a single byte, so no byte-order conversion is needed */
+	niph->hdr_len = *buf;
+	niph->include_hdr_len = 1;
+
+	if (niph->include_total_len && niph->hdr_len >= niph->rcv_buf_len)
+		return -NIP_HDR_LEN_OUT_RANGE;
+
+	return sizeof(niph->hdr_len);
+}
+
+/* The packet must carry this field */
+static int _get_nip_hdr_nexthdr(unsigned char *buf,
+				unsigned char bitmap,
+				struct nip_hdr_decap *niph)
+{
+	if (!(bitmap & NIP_BITMAP_INCLUDE_NEXT_HDR))
+		return -NIP_HDR_NO_NEXT_HDR;
+
+	niph->nexthdr = *buf;
+	niph->include_nexthdr = 1;
+
+	return sizeof(niph->nexthdr);
+}
+
+/* The packet must carry this field */
+/* Note: niph->daddr is network order (big endian) */
+static int _get_nip_hdr_daddr(unsigned char *buf,
+			      unsigned char bitmap,
+			      struct nip_hdr_decap *niph)
+{
+	unsigned char *p;
+
+	if (!(bitmap & NIP_BITMAP_INCLUDE_DADDR))
+		return -NIP_HDR_NO_DADDR;
+
+	p = decode_nip_addr(buf, &niph->daddr);
+	if (!p)
+		return -NIP_HDR_DECAP_DADDR_ERR;
+
+	if (nip_addr_invalid(&niph->daddr))
+		return -NIP_HDR_DADDR_INVALID;
+
+	niph->include_daddr = 1;
+	return (niph->daddr.bitlen / NIP_ADDR_BIT_LEN_8);
+}
+
+/* Optional field */
+/* Note: niph->saddr is network order (big endian) */
+static int _get_nip_hdr_saddr(unsigned char *buf,
+			      unsigned char bitmap,
+			      struct nip_hdr_decap *niph)
+{
+	unsigned char *p;
+
+	if (!(bitmap & NIP_BITMAP_INCLUDE_SADDR))
+		return 0;
+
+	p = decode_nip_addr(buf, &niph->saddr);
+	if (!p)
+		return -NIP_HDR_DECAP_SADDR_ERR;
+
+	if (nip_addr_invalid(&niph->saddr))
+		return -NIP_HDR_SADDR_INVALID;
+
+	niph->include_saddr = 1;
+	return (niph->saddr.bitlen / NIP_ADDR_BIT_LEN_8);
+}
+
+/* Optional field: needed by TCP/ARP, not needed by UDP */
+/* Note: niph->total_len is network order (big endian) and must be converted to host order */
+static int _get_nip_total_len(unsigned char *buf,
+			      unsigned char bitmap,
+			      struct nip_hdr_decap *niph)
+{
+	if (!(bitmap & NIP_BITMAP_INCLUDE_TOTAL_LEN))
+		return 0;
+
+	/* total_len is in network byte order and cannot be
+	 * compared directly with host-order values
+	 */
+	niph->total_len = *((unsigned short *)buf);
+	niph->include_total_len = 1;
+
+	return sizeof(niph->total_len);
+}
+
+static int _nip_hdr_bitmap0_parse(unsigned char *buf,
+				  unsigned char bitmap,
+				  struct nip_hdr_decap *niph)
+{
+	int len;
+	int len_total = 0;
+
+	len = _get_nip_hdr_ttl(buf, bitmap, niph);
+	if (len < 0)
+		return len;
+	len_total 
+= len; + + /* Optional fields */ + len = _get_nip_total_len(buf + len_total, bitmap, niph); + if (len < 0) + return len; + len_total += len; + + len = _get_nip_hdr_nexthdr(buf + len_total, bitmap, niph); + if (len < 0) + return len; + len_total += len; + + len = _get_nip_hdr_daddr(buf + len_total, bitmap, niph); + if (len < 0) + return len; + len_total += len; + + len = _get_nip_hdr_saddr(buf + len_total, bitmap, niph); + if (len < 0) + return len; + len_total += len; + + return len_total; +} + +static int _nip_hdr_bitmap1_parse(unsigned char *buf, + unsigned char bitmap, + struct nip_hdr_decap *niph) +{ + int len; + int len_total = 0; + + /* If add new field needs to be modified with the macro definition */ + if (bitmap & NIP_INVALID_BITMAP_2) + niph->include_unknown_bit = 1; + + /* Optional fields */ + len = _get_nip_hdr_len(buf + len_total, bitmap, niph); + if (len < 0) + return len; + len_total += len; + + return len_total; +} + +static int _nip_hdr_unknown_bit_check(unsigned char *buf, + unsigned char bitmap, + struct nip_hdr_decap *niph) +{ + niph->include_unknown_bit = 1; + return 0; +} + +#define FACTORY_NUM_MAX 3 +static int (*hdr_parse_factory[FACTORY_NUM_MAX])(unsigned char *, + unsigned char, + struct nip_hdr_decap *) = { + _nip_hdr_bitmap0_parse, + _nip_hdr_bitmap1_parse, + _nip_hdr_unknown_bit_check, +}; + +static int nip_hdr_check(struct nip_hdr_decap *niph) +{ + if (niph->include_unknown_bit && !niph->include_hdr_len) + /* different ver pkt but no hdr len */ + return -NIP_HDR_UNKNOWN_AND_NO_HDR_LEN; + + if (niph->include_hdr_len) { + if (niph->hdr_len == 0 || + niph->hdr_len < niph->hdr_real_len) + return -NIP_HDR_LEN_INVALID; + } + + return 0; +} + +/* Note: + * 1.niph->total_len is network order.(big end), need change to host order + * 2.niph->saddr/daddr is network order.(big end) + */ +int nip_hdr_parse(unsigned char *buf, unsigned int buf_len, struct nip_hdr_decap *niph) +{ + int i = 0; + int len; + int ret; + unsigned char bitmap[BITMAP_MAX] = {0}; + int num = _get_nip_hdr_bitmap(buf, bitmap, BITMAP_MAX); + + if (num <= 0) + return num; + + niph->hdr_real_len = num * sizeof(bitmap[0]); + buf += niph->hdr_real_len; + + niph->rcv_buf_len = buf_len; + while (i < num) { + if (i >= FACTORY_NUM_MAX) + break; + len = hdr_parse_factory[i](buf, bitmap[i], niph); + if (len < 0) + return len; + + buf += len; + niph->hdr_real_len += len; + if (niph->hdr_real_len >= buf_len) + return -NIP_HDR_RCV_BUF_READ_OUT_RANGE; + i++; + } + + ret = nip_hdr_check(niph); + if (ret < 0) + return ret; + + return niph->hdr_len > niph->hdr_real_len ? + niph->hdr_len : niph->hdr_real_len; +} + diff --git a/code/net/newip/nip_hdr_encap.c b/code/net/newip/nip_hdr_encap.c new file mode 100644 index 0000000000000000000000000000000000000000..4a034da22679275854048eb980ce49531ef1c888 --- /dev/null +++ b/code/net/newip/nip_hdr_encap.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: BSD-2-Clause +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, this list + * of conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include "nip_hdr.h" + +#define INTEGER_MULTIPLE_OF_8 (~7) /* ~7 is an integer multiple of 8 */ +#define FMT_FACTORY_NUM_MAX 1 +#define ENCAP_FACTORY_NUM_MAX 1 + +void nip_calc_pkt_frag_num(unsigned int mtu, unsigned int usr_data_len, + struct nip_pkt_seg_info *seg_info) +{ + unsigned int mid_usr_pkt_len = (mtu - NIP_HDR_MAX - NIP_UDP_HDR_LEN) & + INTEGER_MULTIPLE_OF_8; + unsigned int mid_pkt_num = usr_data_len / mid_usr_pkt_len; + unsigned int last_usr_pkt_len = 0; + + if (usr_data_len != 0) { + last_usr_pkt_len = usr_data_len % mid_usr_pkt_len; + if (last_usr_pkt_len == 0) { + last_usr_pkt_len = mid_usr_pkt_len; + mid_pkt_num--; + } + } + + seg_info->last_pkt_num = 1; + seg_info->mid_pkt_num = mid_pkt_num; + seg_info->mid_usr_pkt_len = mid_usr_pkt_len; + seg_info->last_usr_pkt_len = last_usr_pkt_len; +} + +static inline void _nip_hdr_ttl_encap(struct nip_hdr_encap *head) +{ + *(head->hdr_buf + head->hdr_buf_pos) = head->ttl; + head->hdr_buf_pos += sizeof(head->ttl); +} + +static inline void _nip_hdr_len_encap(struct nip_hdr_encap *head) +{ + head->hdr_len_pos = head->hdr_buf + head->hdr_buf_pos; + head->hdr_buf_pos += 1; +} + +static inline void _nip_update_hdr_len(struct nip_hdr_encap *head) +{ + *head->hdr_len_pos = head->hdr_buf_pos; +} + +static inline void _nip_hdr_nexthdr_encap(struct nip_hdr_encap *head) +{ + *(head->hdr_buf + head->hdr_buf_pos) = head->nexthdr; + head->hdr_buf_pos += sizeof(head->nexthdr); +} + +static inline void _nip_hdr_daddr_encap(struct nip_hdr_encap *head) +{ + (void)build_nip_addr(&head->daddr, (head->hdr_buf + head->hdr_buf_pos)); + head->hdr_buf_pos += (head->daddr.bitlen / NIP_ADDR_BIT_LEN_8); +} + +static inline void _nip_hdr_saddr_encap(struct nip_hdr_encap *head) +{ + (void)build_nip_addr(&head->saddr, (head->hdr_buf + head->hdr_buf_pos)); + head->hdr_buf_pos += (head->saddr.bitlen / NIP_ADDR_BIT_LEN_8); +} + +static inline void _nip_hdr_total_len_encap(struct nip_hdr_encap *head) +{ + head->total_len_pos = (unsigned short *)(head->hdr_buf + head->hdr_buf_pos); + head->hdr_buf_pos += sizeof(head->total_len); +} + +/* total_len must be network order.(big end) */ +void nip_update_total_len(struct nip_hdr_encap *head, unsigned short total_len) +{ + *head->total_len_pos = total_len; +} + +static inline void _nip_hdr_encap_udp_bitmap(struct nip_hdr_encap *head) +{ + /* bitmap(1B) + ttl(1B) + total_len(2B) + nexthdr(1B) + daddr(xB) + saddr(xB) */ + /* If the length of the destination address and the source address is even, + * the length of the packet header must be odd. 
You need to add 1-byte alignment + * and 1-byte bitmap + */ + if (((head->daddr.bitlen / NIP_ADDR_BIT_LEN_8) + (head->saddr.bitlen / NIP_ADDR_BIT_LEN_8)) + % NIP_BYTE_ALIGNMENT != 0) { + head->hdr_buf[0] = NIP_UDP_BITMAP_1; + head->hdr_buf_pos = 1; + } else { + head->hdr_buf[0] = NIP_UDP_BITMAP_1_INC_2; + head->hdr_buf[1] = NIP_NODATA_BITMAP_2; + head->hdr_buf_pos = 2; + } +} + +static inline void _nip_hdr_encap_comm_bitmap(struct nip_hdr_encap *head) +{ + /* bitmap(1B) + ttl(1B) + nexthdr(1B) + daddr(xB) + saddr(xB) */ + /* If the length of the destination address and the source address is even, + * the length of the packet header must be odd. You need to add 1-byte alignment + * and 1-byte bitmap + */ + if (((head->daddr.bitlen / NIP_ADDR_BIT_LEN_8) + (head->saddr.bitlen / NIP_ADDR_BIT_LEN_8)) + % NIP_BYTE_ALIGNMENT != 0) { + head->hdr_buf[0] = NIP_NORMAL_BITMAP_1; + head->hdr_buf_pos = 1; + } else { + head->hdr_buf[0] = NIP_NORMAL_BITMAP_1_INC_2; + head->hdr_buf[1] = NIP_NODATA_BITMAP_2; + head->hdr_buf_pos = 2; + } +} + +#define NEWIP_BYTE_ALIGNMENT_ENABLE 1 // 0: disable; 1: enable + +void nip_hdr_udp_encap(struct nip_hdr_encap *head) +{ + /* Encapsulate the bitmap into the newIP packet header BUF */ +#if (NEWIP_BYTE_ALIGNMENT_ENABLE == 1) + _nip_hdr_encap_udp_bitmap(head); +#else + head->hdr_buf[0] = NIP_UDP_BITMAP_1; + head->hdr_buf_pos = 1; +#endif + + /* Encapsulate bitmap fields into newIP packet header BUF */ + _nip_hdr_ttl_encap(head); + _nip_hdr_nexthdr_encap(head); + _nip_hdr_daddr_encap(head); + _nip_hdr_saddr_encap(head); +} + +/* need update total len after this func, call nip_update_total_len */ +void nip_hdr_comm_encap(struct nip_hdr_encap *head) +{ + /* Encapsulate the bitmap into the newIP packet header BUF */ +#if (NEWIP_BYTE_ALIGNMENT_ENABLE == 1) + _nip_hdr_encap_comm_bitmap(head); +#else + head->hdr_buf[0] = NIP_NORMAL_BITMAP_1; + head->hdr_buf_pos = 1; +#endif + + /* Encapsulate bitmap fields into newIP packet header BUF */ + _nip_hdr_ttl_encap(head); + _nip_hdr_total_len_encap(head); /* ARP/TCP need include hdr total len */ + _nip_hdr_nexthdr_encap(head); + _nip_hdr_daddr_encap(head); + _nip_hdr_saddr_encap(head); +} + diff --git a/code/net/newip/nip_input.c b/code/net/newip/nip_input.c new file mode 100644 index 0000000000000000000000000000000000000000..f0fa06aa66f732c703147bb315dc50dde9f1647c --- /dev/null +++ b/code/net/newip/nip_input.c @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. 
+ *
+ * NewIP input
+ * Linux NewIP INET implementation
+ *
+ * Based on net/ipv6/ip6_input.c
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "nip_hdr.h"
+
+static int _nip_update_recv_skb_len(struct sk_buff *skb,
+				    struct nip_hdr_decap *niph)
+{
+	if (!niph->include_total_len)
+		return 0;
+
+	if (niph->total_len > skb->len) {
+		DEBUG("%s: total_len(%u) is bigger than skb_len(%u), Drop a packet.",
+		      __func__, niph->total_len, skb->len);
+		return NET_RX_DROP;
+	}
+
+	skb->len = niph->total_len;
+	return 0;
+}
+
+static int nip_rcv_finish(struct sk_buff *skb)
+{
+	struct net *net = dev_net(skb->dev);
+	void (*edemux)(struct sk_buff *skb) = NULL;
+
+	/* set /proc/sys/net/ipv4/ip_early_demux to change sysctl_ip_early_demux,
+	 * which is used by ipv4, ipv6 and newip
+	 */
+	if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && !skb->sk) {
+		const struct ninet_protocol *ipprot;
+
+		DEBUG("%s: try to early demux skb.\n", __func__);
+		ipprot = rcu_dereference(ninet_protos[NIPCB(skb)->nexthdr]);
+		if (ipprot)
+			edemux = READ_ONCE(ipprot->early_demux);
+		if (edemux)
+			edemux(skb);
+	}
+
+	/* nip_route_input will set nip_null_entry
+	 * instead of NULL in skb when the lookup fails.
+	 */
+	if (!skb_valid_dst(skb))
+		nip_route_input(skb);
+
+	return dst_input(skb);
+}
+
+int nip_rcv(struct sk_buff *skb, struct net_device *dev,
+	    struct packet_type *pt, struct net_device *orig_dev)
+{
+	int offset = 0;
+	struct nip_hdr_decap niph = {0};
+
+	if (skb->pkt_type == PACKET_OTHERHOST) {
+		kfree_skb(skb);
+		return NET_RX_DROP;
+	}
+
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (!skb)
+		goto out;
+
+	memset(NIPCB(skb), 0, sizeof(struct ninet_skb_parm));
+	offset = nip_hdr_parse(skb->data, skb->len, &niph);
+	if (offset <= 0) {
+		DEBUG("%s: header parse failed, errcode=%d, Drop a packet.(nexthdr=%u, hdr_len=%u)",
+		      __func__, offset, niph.nexthdr, niph.hdr_len);
+		goto drop;
+	}
+
+	if (niph.nexthdr != IPPROTO_UDP && niph.nexthdr != IPPROTO_TCP &&
+	    niph.nexthdr != IPPROTO_NIP_ICMP) {
+		DEBUG("%s nexthdr(%u) invalid, Drop a packet.", __func__, niph.nexthdr);
+		goto drop;
+	}
+
+	niph.total_len = ntohs(niph.total_len);
+	NIPCB(skb)->dstaddr = niph.daddr;
+	NIPCB(skb)->srcaddr = niph.saddr;
+	NIPCB(skb)->nexthdr = niph.nexthdr;
+	skb->transport_header = skb->network_header + offset;
+	skb_orphan(skb);
+
+	/* Trim the SKB length to the total_len carried in the header */
+	if (_nip_update_recv_skb_len(skb, &niph))
+		goto drop;
+
+	return nip_rcv_finish(skb);
+drop:
+	kfree_skb(skb);
+out:
+	return NET_RX_DROP;
+}
+
+/* Deliver the packet to transport layer,
+ * including TCP, UDP and ICMP.
+ * Caller must hold rcu. 
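+ * nip_input() below is the dst_input() entry point; it wraps this
+ * function in rcu_read_lock()/rcu_read_unlock().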
+ */
+void nip_protocol_deliver_rcu(struct sk_buff *skb)
+{
+	const struct ninet_protocol *ipprot;
+
+	if (!pskb_pull(skb, skb_transport_offset(skb)))
+		goto discard;
+
+	ipprot = rcu_dereference(ninet_protos[NIPCB(skb)->nexthdr]);
+	if (ipprot) {
+		ipprot->handler(skb);
+	} else {
+		kfree_skb(skb);
+		DEBUG("transport protocol not found, drop this packet!");
+	}
+	return;
+
+discard:
+	kfree_skb(skb);
+}
+
+/* Generally called by dst_input */
+int nip_input(struct sk_buff *skb)
+{
+	rcu_read_lock();
+	nip_protocol_deliver_rcu(skb);
+	rcu_read_unlock();
+
+	return 0;
+}
diff --git a/code/net/newip/nip_output.c b/code/net/newip/nip_output.c
new file mode 100644
index 0000000000000000000000000000000000000000..2fee31d185cc10adba140cb252d1d99e2ee53b58
--- /dev/null
+++ b/code/net/newip/nip_output.c
@@ -0,0 +1,506 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022 Huawei Device Co., Ltd.
+ *
+ * NewIP output functions
+ * Linux NewIP INET implementation
+ *
+ * Based on net/ipv6/ip6_output.c
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "nip_hdr.h"
+#include "nip_checksum.h"
+
+#define NIP_BIT_TO_BYTE 1024 /* mem_unit is in bytes; divide by 1024 to get KB */
+void update_memory_rate(const char *upper_fun)
+{
+	struct sysinfo mem_info;
+	unsigned long total;
+	unsigned long free;
+	unsigned long used;
+	unsigned int uint_kb;
+
+	si_meminfo(&mem_info);
+	uint_kb = mem_info.mem_unit / NIP_BIT_TO_BYTE;
+	total = (unsigned long)mem_info.totalram * uint_kb;
+	free = (unsigned long)mem_info.freeram * uint_kb;
+	used = total - free;
+	DEBUG("%s -> %s mem total: %ld KB, mem used: %ld KB", upper_fun, __func__, total, used);
+}
+
+int nip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct nip_addr *nexthop;
+	struct neighbour *neigh;
+	int ret = 0;
+	int res;
+	struct net_device *dev = skb_dst(skb)->dev;
+	bool is_v6gw = false;
+
+	skb->protocol = htons(ETH_P_NEWIP);
+	skb->dev = dev;
+
+	/* prepare to build ethernet header */
+	nexthop = nip_nexthop((struct nip_rt_info *)dst, &NIPCB(skb)->dstaddr);
+
+	rcu_read_lock_bh();
+
+	neigh = __nip_neigh_lookup_noref(dev, nexthop);
+	if (unlikely(!neigh))
+		neigh = __neigh_create(&nnd_tbl, nexthop, dev, false);
+	if (!IS_ERR(neigh)) {
+		res = neigh_output(neigh, skb, is_v6gw);
+		rcu_read_unlock_bh();
+		return res;
+	}
+	DEBUG("failed to find or create neigh!");
+
+	rcu_read_unlock_bh();
+	kfree_skb(skb);
+	return ret;
+}
+
+int nip_forward(struct sk_buff *skb)
+{
+	return nip_output(NULL, NULL, skb);
+}
+
+static int nip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	int err;
+
+	err = dst_output(net, sk, skb);
+	return err;
+}
+
+int nip_send_skb(struct sk_buff *skb)
+{
+	struct net *net;
+	int err = 0;
+
+	net = skb->sk ? 
sock_net(skb->sk) : dev_net(skb_dst(skb)->dev);
+	err = nip_local_out(net, skb->sk, skb);
+	if (err)
+		DEBUG("%s: failed to output skb!", __func__);
+
+	return err;
+}
+
+unsigned short nip_get_output_checksum(struct sk_buff *skb,
+				       struct nip_hdr_encap *head)
+{
+	struct nip_pseudo_header nph = {0};
+	u8 *udp_hdr = skb_transport_header(skb);
+	unsigned short check_len = head->trans_hdr_len + head->usr_data_len;
+
+	nph.nexthdr = IPPROTO_UDP;
+	nph.saddr = NIPCB(skb)->srcaddr;
+	nph.daddr = NIPCB(skb)->dstaddr;
+	nph.check_len = htons(check_len);
+	return nip_check_sum_build(udp_hdr, check_len, &nph);
+}
+
+static struct sk_buff *_nip_alloc_skb(struct sock *sk,
+				      struct nip_hdr_encap *head,
+				      struct nip_pkt_seg_info *seg_info,
+				      struct dst_entry *dst)
+{
+	int len = NIP_ETH_HDR_LEN + NIP_HDR_MAX +
+		  head->trans_hdr_len + seg_info->mid_usr_pkt_len;
+	struct sk_buff *skb = alloc_skb(len, 0);
+
+	if (!skb) {
+		DEBUG("%s: no space for skb", __func__);
+		return NULL;
+	}
+
+	skb->protocol = htons(ETH_P_NEWIP);
+	skb->ip_summed = CHECKSUM_NONE;
+	skb->csum = 0;
+	skb->sk = sk;
+
+	dst_hold(dst);
+	DEBUG("%s: malloc_len=%d, dst->__refcnt=%u", __func__,
+	      len, atomic_read(&dst->__refcnt));
+	skb_dst_set(skb, dst);
+	memset(NIPCB(skb), 0, sizeof(struct ninet_skb_parm));
+
+	return skb;
+}
+
+static int _nip_udp_single_output(struct sock *sk,
+				  struct nip_hdr_encap *head,
+				  struct nip_pkt_seg_info *seg_info,
+				  struct dst_entry *dst)
+{
+	int len;
+	int ret;
+	struct msghdr *from = (struct msghdr *)head->usr_data;
+	struct sk_buff *skb = _nip_alloc_skb(sk, head, seg_info, dst);
+	unsigned short check = 0;
+
+	if (IS_ERR_OR_NULL(skb)) {
+		DEBUG("%s: skb alloc fail", __func__);
+		return -ENOMEM;
+	}
+
+	/* Reserve room for the Ethernet header; it is filled in when the
+	 * packet is handed to the link layer
+	 */
+	skb_reserve(skb, NIP_ETH_HDR_LEN);
+
+	/* Fill in the Network-layer Header (newIP) */
+	skb_reset_network_header(skb);
+	head->hdr_buf = skb->data;
+	nip_hdr_udp_encap(head);
+	skb_reserve(skb, head->hdr_buf_pos);
+	NIPCB(skb)->dstaddr = head->daddr;
+	NIPCB(skb)->srcaddr = head->saddr;
+	NIPCB(skb)->nexthdr = IPPROTO_UDP;
+
+	/* Fill in the Transport Layer Header (UDP) */
+	skb_reset_transport_header(skb);
+	nip_build_udp_hdr(head->sport, head->dport,
+			  htons(head->trans_hdr_len + head->usr_data_len),
+			  skb->data, htons(0));
+	skb_reserve(skb, head->trans_hdr_len);
+	len = copy_from_iter(skb->data, head->usr_data_len, &from->msg_iter);
+	if (len != head->usr_data_len) {
+		/* The DST has been set to the SKB. When the SKB is released,
+		 * the DST is automatically released
+		 */
+		DEBUG("%s: copy from iter fail.(datalen=%u)",
+		      __func__, head->usr_data_len);
+		kfree_skb(skb);
+		return -EFBIG;
+	}
+
+	/* insert check sum */
+	check = nip_get_output_checksum(skb, head);
+	nip_build_udp_hdr(head->sport, head->dport,
+			  htons(head->trans_hdr_len + head->usr_data_len),
+			  skb->data - head->trans_hdr_len, htons(check));
+
+	/* Refresh the data/tail of the SKB after the packet copy is complete */
+	skb_put(skb, head->usr_data_len);
+	skb->data = skb_network_header(skb);
+	skb->len = head->hdr_buf_pos + head->trans_hdr_len +
+		   head->usr_data_len;
+
+	/* Add the actual size of the current SKB to the SOCK send cache count
+	 * and set destructor to __sock_wfree to reduce the SOCK send cache size
+	 * when the SKB is released. 
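+	 * (__sock_wfree subtracts skb->truesize from sk->sk_wmem_alloc,
+	 * mirroring the refcount_add() below.)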
+	 */
+	skb->destructor = __sock_wfree;
+	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
+	skb->priority = sk->sk_priority;
+
+	ret = nip_send_skb(skb);
+	DEBUG("%s: newip output finish.(ret=%d, datalen=%u)",
+	      __func__, ret, head->usr_data_len);
+	update_memory_rate(__func__);
+	return ret;
+}
+
+int _nip_udp_output(struct sock *sk, void *from, int datalen,
+		    int transhdrlen, const struct nip_addr *saddr,
+		    ushort sport, const struct nip_addr *daddr,
+		    ushort dport, struct dst_entry *dst)
+{
+	int i;
+	u32 ret = 0;
+	u32 mtu = dst_mtu(dst);
+	struct nip_pkt_seg_info seg_info = {0};
+	struct nip_hdr_encap head = {0};
+
+	head.saddr = *saddr;
+	head.daddr = *daddr;
+	head.sport = sport;
+	head.dport = dport;
+	head.usr_data = from;
+	head.ttl = NIP_DEFAULT_TTL;
+	head.nexthdr = IPPROTO_UDP;
+	head.trans_hdr_len = transhdrlen;
+
+	nip_calc_pkt_frag_num(mtu, datalen, &seg_info);
+
+	/* Send intermediate data segments */
+	for (i = 0; i < seg_info.mid_pkt_num; i++) {
+		head.usr_data_len = seg_info.mid_usr_pkt_len;
+		ret = _nip_udp_single_output(sk, &head, &seg_info, dst);
+		if (ret)
+			goto end;
+	}
+
+	/* Send the last data segment */
+	if (seg_info.last_pkt_num) {
+		head.usr_data_len = seg_info.last_usr_pkt_len;
+		ret = _nip_udp_single_output(sk, &head, &seg_info, dst);
+	}
+
+end:
+	return ret;
+}
+
+static int nip_sk_dst_check(struct dst_entry *dst,
+			    struct flow_nip *fln)
+{
+	int err = 0;
+
+	if (!dst)
+		goto out;
+
+	if (fln->flowin_oif && fln->flowin_oif != dst->dev->ifindex)
+		err = -EPERM;
+
+out:
+	return err;
+}
+
+/* 1. Based on FLN, the routing table is searched to obtain the corresponding DST
+ * 2. The newIP address of the source end is obtained based on the routing table
+ *    search result and stored in fln->saddr
+ */
+static int nip_dst_lookup_tail(struct net *net, const struct sock *sk,
+			       struct dst_entry **dst, struct flow_nip *fln)
+{
+	int err;
+	struct nip_rt_info *rt;
+
+	if (!(*dst))
+		*dst = nip_route_output(net, sk, fln);
+
+	err = (*dst)->error;
+	if (err) {
+		rt = NULL;
+		DEBUG("%s: nip_route_output search error!", __func__);
+		goto out_err_release;
+	}
+
+	err = nip_sk_dst_check(*dst, fln);
+	if (err)
+		goto out_err_release;
+
+	rt = (struct nip_rt_info *)*dst;
+	if (*dst == &net->newip.nip_broadcast_entry->dst) {
+		/* Broadcast routes carry no source address in the FIB;
+		 * fall back to the destination address
+		 */
+		fln->saddr = fln->daddr;
+		err = 0;
+	} else {
+		err = nip_route_get_saddr(net, rt, &fln->daddr, &fln->saddr);
+	}
+
+	if (err)
+		goto out_err_release;
+
+	return 0;
+
+out_err_release:
+	dst_release(*dst);
+	*dst = NULL;
+
+	return err;
+}
+
+struct dst_entry *nip_dst_lookup_flow(struct net *net, const struct sock *sk,
+				      struct flow_nip *fln,
+				      const struct nip_addr *final_dst)
+{
+	struct dst_entry *dst = NULL;
+	int err;
+
+	err = nip_dst_lookup_tail(net, sk, &dst, fln);
+	if (err)
+		return ERR_PTR(err);
+	if (final_dst)
+		fln->daddr = *final_dst;
+
+	return dst;
+}
+
+struct dst_entry *nip_sk_dst_lookup_flow(struct sock *sk, struct flow_nip *fln)
+{
+	struct dst_entry *dst = NULL;
+	int err;
+
+	err = nip_dst_lookup_tail(sock_net(sk), sk, &dst, fln);
+	if (err)
+		return ERR_PTR(err);
+
+	return dst;
+}
+
+int tcp_nip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
+{
+	int err = -EFAULT;
+	struct net *net = sock_net(sk);
+	struct nip_addr *saddr, *daddr;
+	struct dst_entry *dst;
+	struct flow_nip fln;
+	struct nip_hdr_encap head = {0};
+	unsigned char hdr_buf[NIP_HDR_MAX]; /* Cache the newIP header */
+
+	rcu_read_lock();
+	skb->protocol = htons(ETH_P_NEWIP);
+	skb->ip_summed = CHECKSUM_NONE;
+	skb->csum = 
0; + saddr = &sk->sk_nip_rcv_saddr; + daddr = &sk->sk_nip_daddr; + + head.saddr = *saddr; + head.daddr = *daddr; + head.ttl = NIP_DEFAULT_TTL; + head.nexthdr = IPPROTO_TCP; + head.hdr_buf = hdr_buf; + nip_hdr_comm_encap(&head); + head.total_len = head.hdr_buf_pos + skb->len; + nip_update_total_len(&head, htons(head.total_len)); + + fln.daddr = sk->sk_nip_daddr; + dst = __sk_dst_check(sk, 0); + if (!dst) { + DEBUG("%s: no dst cache for sk, search newip rt.", __func__); + dst = nip_route_output(net, sk, &fln); + if (!dst) { + DEBUG("%s: cannot find dst.", __func__); + goto out; + } + sk_dst_set(sk, dst); + } + skb_dst_set_noref(skb, dst); + + /* build nwk header */ + skb_push(skb, head.hdr_buf_pos); + memcpy(skb->data, head.hdr_buf, head.hdr_buf_pos); + + skb_reset_network_header(skb); + NIPCB(skb)->srcaddr = *saddr; + NIPCB(skb)->dstaddr = *daddr; + NIPCB(skb)->nexthdr = head.nexthdr; + + skb->priority = sk->sk_priority; + head.total_len = skb->len; + err = nip_send_skb(skb); + if (err) + DEBUG("%s: failed to send skb, skb->len=%u", __func__, head.total_len); + else + DEBUG("%s: send skb ok, skb->len=%u", __func__, head.total_len); + +out: + rcu_read_unlock(); + return err; +} + +void tcp_nip_actual_send_reset(struct sock *sk, struct sk_buff *skb, u32 seq, + u32 ack_seq, u32 win, int rst, u32 priority) +{ + const struct tcphdr *th = tcp_hdr(skb); + struct tcphdr *t1; + struct sk_buff *buff; + struct flow_nip fln; + struct net *net; + struct nip_addr *saddr, *daddr; + unsigned int tot_len = sizeof(struct tcphdr); + struct nip_hdr_encap head = {0}; + unsigned char hdr_buf[NIP_HDR_MAX]; + struct dst_entry *dst; + int err; + + net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); + + /* alloc skb */ + buff = alloc_skb(MAX_TCP_HEADER, priority); + if (!buff) { + DEBUG("%s: alloc_skb failed.\n", __func__); + return; + } + skb_reserve(buff, MAX_TCP_HEADER); + + buff->sk = sk; // sk could be NULL + saddr = &(NIPCB(skb)->dstaddr); + daddr = &(NIPCB(skb)->srcaddr); + + /* Fill in tcp header */ + t1 = skb_push(buff, sizeof(struct tcphdr)); + skb_reset_transport_header(buff); + memset(t1, 0, sizeof(*t1)); + t1->dest = th->source; + t1->source = th->dest; + t1->doff = tot_len / TCP_NUM_4; + t1->seq = htonl(seq); + t1->ack_seq = htonl(ack_seq); + t1->ack = !rst || !th->ack; + t1->rst = rst; + t1->window = htons(win); + t1->check = htons(nip_get_output_checksum_tcp(buff, *saddr, *daddr)); + DEBUG("%s: host dport==%u, net dport==%x, host sport==%u, net sport==%x\n", + __func__, ntohs(t1->dest), t1->dest, ntohs(t1->source), t1->source); + DEBUG("%s: host seq==%u, net seq==%x, host ack_seq==%u, net ack_seq==%x\n", + __func__, seq, t1->seq, ack_seq, t1->ack_seq); + + buff->protocol = htons(ETH_P_NEWIP); + buff->ip_summed = CHECKSUM_NONE; + buff->csum = 0; + + /* Fill in nip header */ + head.saddr = *saddr; + head.daddr = *daddr; + head.ttl = NIP_DEFAULT_TTL; + head.nexthdr = IPPROTO_TCP; + head.hdr_buf = hdr_buf; + nip_hdr_comm_encap(&head); + head.total_len = head.hdr_buf_pos + buff->len; + nip_update_total_len(&head, htons(head.total_len)); + + /* Check routine */ + fln.daddr = *daddr; + dst = nip_route_output(net, sk, &fln); // here, sk not used. 
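+	/* Like nip_fib_rule_lookup(), nip_route_output() is expected to return
+	 * nip_null_entry rather than NULL on a failed lookup, so this NULL
+	 * check mainly guards against allocation failure.
+	 */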
+ if (!dst) { + DEBUG("%s: cannot find dst.", __func__); + goto out; + } + skb_dst_set_noref(buff, dst); + + /* Build newip header */ + skb_push(buff, head.hdr_buf_pos); + memcpy(buff->data, head.hdr_buf, head.hdr_buf_pos); + + skb_reset_network_header(buff); + NIPCB(buff)->srcaddr = *saddr; + NIPCB(buff)->dstaddr = *daddr; + NIPCB(buff)->nexthdr = head.nexthdr; + + buff->priority = priority; + head.total_len = buff->len; + err = nip_send_skb(buff); + if (err) + DEBUG("%s: failed to send skb, skb->len=%u", __func__, head.total_len); + else + DEBUG("%s: send skb ok, skb->len=%u", __func__, head.total_len); + +out: + return; +} diff --git a/code/net/newip/nip_sockglue.c b/code/net/newip/nip_sockglue.c new file mode 100644 index 0000000000000000000000000000000000000000..a94fbef7341a9ef919920630956680234c952a97 --- /dev/null +++ b/code/net/newip/nip_sockglue.c @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * NewIP INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. NewIP is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The NewIP to API glue. + * + * Based on net/ipv4/ip_sockglue.c + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NIP_OPTNAME_MAX 255 + +static void __nip_set_sock_tos(struct sock *sk, int val) +{ + sk->sk_priority = rt_tos2priority(val); + sk_dst_reset(sk); +} + +static bool nip_setsockopt_needs_rtnl(int optname) +{ + switch (optname) { + case IP_MSFILTER: + return true; + } + return false; +} + +static bool nip_getsockopt_needs_rtnl(int optname) +{ + switch (optname) { + case IP_MSFILTER: + return true; + } + return false; +} + +static int do_nip_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct inet_sock *inet = inet_sk(sk); + int val = 0, err = 0; + bool needs_rtnl = nip_setsockopt_needs_rtnl(optname); + + if (optlen >= sizeof(int)) { + if (copy_from_sockptr(&val, optval, sizeof(val))) + return -EFAULT; + } else if (optlen >= sizeof(char)) { + unsigned char ucval; + + if (copy_from_sockptr(&ucval, optval, sizeof(ucval))) + return -EFAULT; + val = (int)ucval; + } + + if (needs_rtnl) + rtnl_lock(); + lock_sock(sk); + + switch (optname) { + case IP_TOS: + inet->tos = val; + __nip_set_sock_tos(sk, val); + break; + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + if (needs_rtnl) + rtnl_unlock(); + + return err; +} + +int nip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen) +{ + int err; + + if (level != SOL_IP) + return -ENOPROTOOPT; + + err = do_nip_setsockopt(sk, level, optname, optval, optlen); + + return err; +} + +static int do_nip_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct inet_sock *inet = inet_sk(sk); + bool needs_rtnl = nip_getsockopt_needs_rtnl(optname); + int val, err = 0; + int len; + + if (level != SOL_IP) + return -EOPNOTSUPP; + if (get_user(len, optlen)) + return -EFAULT; + if (len < 0) + return -EINVAL; + + if (needs_rtnl) + rtnl_lock(); + lock_sock(sk); + + switch (optname) { + case IP_TOS: + val = inet->tos; + break; + default: + err = -ENOPROTOOPT; + goto out; + } + + if (len < sizeof(int) && len > 0 && val >= 0 && val <= NIP_OPTNAME_MAX) { + unsigned char ucval = (unsigned char)val; + + len = 1; + if (put_user(len, optlen)) { + err = -EFAULT; + goto 
out; + } + if (copy_to_user(optval, &ucval, 1)) { + err = -EFAULT; + goto out; + } + } else { + len = min_t(unsigned int, sizeof(int), len); + if (put_user(len, optlen)) { + err = -EFAULT; + goto out; + } + if (copy_to_user(optval, &val, len)) { + err = -EFAULT; + goto out; + } + } +out: + release_sock(sk); + if (needs_rtnl) + rtnl_unlock(); + + return err; +} + +int nip_getsockopt(struct sock *sk, int level, + int optname, char __user *optval, int __user *optlen) +{ + return do_nip_getsockopt(sk, level, optname, optval, optlen); +} + diff --git a/code/net/newip/nndisc.c b/code/net/newip/nndisc.c new file mode 100644 index 0000000000000000000000000000000000000000..a75bd599f23de6dde59f58dcc703591413e3ba8f --- /dev/null +++ b/code/net/newip/nndisc.c @@ -0,0 +1,603 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * Neighbour Discovery for NewIP + * Linux NewIP INET implementation + * + * Based on net/ipv6/ndisc.c + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nip_hdr.h" +#include "nip_checksum.h" + +/* NUD_INCOMPLETE + * The neighbor request packet has been sent but no response has been received + * NUD_REACHABLE + * Reachable: Indicates that the neighbor is reachable + * NUD_STAL + * Idle state, which has not been confirmed for a long time, + * and the idle time exceeds the rated time + * NUD_DELAY + * If the acknowledgment time expires but the idle time does not exceed the rated time, + * you need to obtain the acknowledgment packet + * NUD_PROBE + * After NUD_DELAY does not receive confirmation for a long time, ARP request messages are sent + * NUD_FAILED + * The neighbor is unreachable + * NUD_NOARP + * Indicates the status of the neighbor that does not need the ARP status change + * NUD_PERMANENT + * Indicates that the status of the neighbor item is permanent and does not need to change + * NUD_NONE + * Initialization status of the neighbor item + */ +static void nndisc_solicit(struct neighbour *neigh, struct sk_buff *skb); + +static u32 nndisc_hash(const void *pkey, + const struct net_device *dev, __u32 *fhash_rnd); +static bool nndisc_key_eq(const struct neighbour *neigh, const void *pkey); +static int nndisc_constructor(struct neighbour *neigh); + +static void nndisc_error_report(struct neighbour *neigh, struct sk_buff *skb) +{ + kfree_skb(skb); +} + +static const struct neigh_ops nndisc_generic_ops = { + .family = AF_NINET, + .solicit = nndisc_solicit, + .output = neigh_resolve_output, + .connected_output = neigh_connected_output, +}; + +static const struct neigh_ops nndisc_hh_ops = { + .family = AF_NINET, + .solicit = nndisc_solicit, + .error_report = nndisc_error_report, + .output = neigh_resolve_output, + .connected_output = neigh_resolve_output, +}; + +static const struct neigh_ops nndisc_direct_ops = { + .family = AF_NINET, + .output = neigh_direct_output, + .connected_output = neigh_direct_output, +}; + +#define NIP_NEIGH_MCAST_PROBES 4 +#define NIP_NEIGH_UCAST_PROBES 4 +#define NIP_NEIGH_DELAY_PROBE_TIME (5 * HZ) +#define NIP_NEIGH_GC_STALETIME (60 * HZ) +#define NIP_NEIGH_QUEUE_LEN_BYTES (64 * 1024) +#define NIP_NEIGH_PROXY_QLEN 64 +#define NIP_NEIGH_ANYCAST_DELAY (1 * HZ) +#define NIP_NEIGH_PROXY_DELAY ((8 * HZ) / 10) +#define NIP_NEIGH_GC_INTERVAL (30 * HZ) +#define 
NIP_NEIGH_GC_THRESH_1 128 +#define NIP_NEIGH_GC_THRESH_2 512 +#define NIP_NEIGH_GC_THRESH_3 1024 + +struct neigh_table nnd_tbl = { + .family = AF_NINET, + .key_len = sizeof(struct nip_addr), + .protocol = cpu_to_be16(ETH_P_NEWIP), + .hash = nndisc_hash, + .key_eq = nndisc_key_eq, + .constructor = nndisc_constructor, + .id = "nndisc_cache", + .parms = { + .tbl = &nnd_tbl, + .reachable_time = ND_REACHABLE_TIME, + .data = { + [NEIGH_VAR_MCAST_PROBES] = NIP_NEIGH_MCAST_PROBES, + [NEIGH_VAR_UCAST_PROBES] = NIP_NEIGH_UCAST_PROBES, + [NEIGH_VAR_RETRANS_TIME] = ND_RETRANS_TIMER, + [NEIGH_VAR_BASE_REACHABLE_TIME] = ND_REACHABLE_TIME, + [NEIGH_VAR_DELAY_PROBE_TIME] = NIP_NEIGH_DELAY_PROBE_TIME, + [NEIGH_VAR_GC_STALETIME] = NIP_NEIGH_GC_STALETIME, + [NEIGH_VAR_QUEUE_LEN_BYTES] = NIP_NEIGH_QUEUE_LEN_BYTES, + [NEIGH_VAR_PROXY_QLEN] = NIP_NEIGH_PROXY_QLEN, + [NEIGH_VAR_ANYCAST_DELAY] = NIP_NEIGH_ANYCAST_DELAY, + [NEIGH_VAR_PROXY_DELAY] = NIP_NEIGH_PROXY_DELAY, + }, + }, + .gc_interval = NIP_NEIGH_GC_INTERVAL, + .gc_thresh1 = NIP_NEIGH_GC_THRESH_1, + .gc_thresh2 = NIP_NEIGH_GC_THRESH_2, + .gc_thresh3 = NIP_NEIGH_GC_THRESH_3, +}; + +static u32 nndisc_hash(const void *pkey, + const struct net_device *dev, __u32 *hash_rnd) +{ + return nndisc_hashfn(pkey, dev, hash_rnd); +} + +static bool nndisc_key_eq(const struct neighbour *n, const void *pkey) +{ + return neigh_key_eq800(n, pkey); +} + +static int nndisc_constructor(struct neighbour *neigh) +{ + struct nip_addr *addr = (struct nip_addr *)&neigh->primary_key; + struct net_device *dev = neigh->dev; + struct ninet_dev *nin_dev; + struct neigh_parms *parms; + bool is_broadcast = (bool)nip_addr_eq(addr, &nip_broadcast_addr_arp); + + nin_dev = nin_dev_get(dev); + if (!nin_dev) + return -EINVAL; + + parms = nin_dev->nd_parms; + __neigh_parms_put(neigh->parms); + neigh->parms = neigh_parms_clone(parms); + neigh->type = RTN_UNICAST; + if (!dev->header_ops) { + neigh->nud_state = NUD_NOARP; + neigh->ops = &nndisc_direct_ops; + neigh->output = neigh_direct_output; + } else { + if (is_broadcast || + (dev->flags & IFF_POINTOPOINT)) { + neigh->nud_state = NUD_NOARP; + memcpy(neigh->ha, dev->broadcast, dev->addr_len); + } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) { + neigh->nud_state = NUD_NOARP; + memcpy(neigh->ha, dev->dev_addr, dev->addr_len); + if (dev->flags & IFF_LOOPBACK) + neigh->type = RTN_LOCAL; + } + + if (dev->header_ops->cache) + neigh->ops = &nndisc_hh_ops; + else + neigh->ops = &nndisc_generic_ops; + + if (neigh->nud_state & NUD_VALID) + neigh->output = neigh->ops->connected_output; + else + neigh->output = neigh->ops->output; + } + + nin_dev_put(nin_dev); + + return 0; +} + +void nip_insert_nndisc_send_checksum(struct sk_buff *skb, u_short checksum) +{ +#define NNDISC_CHECKSUM_BIAS 2 + *(__u16 *)(skb_transport_header(skb) + NNDISC_CHECKSUM_BIAS) = + htons(checksum); +} + +unsigned short nip_get_nndisc_send_checksum(struct sk_buff *skb, + struct nip_hdr_encap *head, + int payload_len) +{ + struct nip_pseudo_header nph = {0}; + + nph.nexthdr = head->nexthdr; + nph.saddr = head->saddr; + nph.daddr = head->daddr; + nph.check_len = htons(payload_len); + return nip_check_sum_build(skb_transport_header(skb), + payload_len, &nph); +} + +bool nip_get_nndisc_rcv_checksum(struct sk_buff *skb, + u_char *transport_tail) +{ + struct nip_pseudo_header nph = {0}; + unsigned short check_len = (unsigned short)(transport_tail - (skb_transport_header(skb))); + + nph.nexthdr = NIPCB(skb)->nexthdr; + nph.saddr = NIPCB(skb)->srcaddr; + nph.daddr = NIPCB(skb)->dstaddr; 
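+	/* On receive, the pseudo-header fields mirror those the sender used in
+	 * nip_get_nndisc_send_checksum(); a valid packet folds to 0xffff below.
+	 */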
+ nph.check_len = htons(check_len); + + return nip_check_sum_parse(skb_transport_header(skb), check_len, &nph) + == 0xffff ? true : false; +} + +static void nndisc_payload_ns_pack(const struct nip_addr *solicit, + struct sk_buff *skb) +{ + struct nnd_msg *msg = (struct nnd_msg *)skb->data; + u_char *p = msg->data; + + memset(&msg->icmph, 0, sizeof(msg->icmph)); + msg->icmph.nip_icmp_type = NIP_ARP_NS; + msg->icmph.nip_icmp_cksum = 0; + p = build_nip_addr(solicit, p); +} + +static struct dst_entry *nndisc_dst_alloc(struct net_device *dev) +{ + struct nip_rt_info *rt; + struct net *net = dev_net(dev); + + rt = nip_dst_alloc(net, dev, 0); + if (!rt) + return NULL; + + rt->dst.flags |= DST_HOST; + rt->dst.input = nip_input; + rt->dst.output = nip_output; + atomic_set(&rt->dst.__refcnt, 1); + + return &rt->dst; +} + +static int get_ns_payload_len(const struct nip_addr *solicit) +{ + return sizeof(struct nip_icmp_hdr) + get_nip_addr_len(solicit); +} + +static void nndisc_send_ns(struct net_device *dev, + const struct nip_addr *solicit, + const struct nip_addr *daddr, + const struct nip_addr *saddr) +{ + int ret; + struct sk_buff *skb; + struct dst_entry *dst; + struct net *net; + struct sock *sk = NULL; + int payload_len = get_ns_payload_len(solicit); + int len = NIP_ETH_HDR_LEN + NIP_HDR_MAX + payload_len; + struct nip_hdr_encap head = {0}; + unsigned short checksum; + + head.saddr = *saddr; + head.daddr = *daddr; + head.ttl = NIP_ARP_DEFAULT_TTL; + head.nexthdr = IPPROTO_NIP_ICMP; + + skb = alloc_skb(len, 0); + if (!skb) { + DEBUG("%s: no space for skbuff!", __func__); + return; + } + + skb->protocol = htons(ETH_P_NEWIP); + skb->dev = dev; + skb->ip_summed = CHECKSUM_NONE; + skb->csum = 0; + memset(NIPCB(skb), 0, sizeof(struct ninet_skb_parm)); + + NIPCB(skb)->dstaddr = head.daddr; + NIPCB(skb)->srcaddr = head.saddr; + NIPCB(skb)->nexthdr = head.nexthdr; + + /* reserve space for hardware header */ + skb_reserve(skb, NIP_ETH_HDR_LEN); + skb_reset_network_header(skb); + + /* build nwk header */ + head.hdr_buf = (unsigned char *)skb->data; + nip_hdr_comm_encap(&head); + head.total_len = head.hdr_buf_pos + payload_len; + nip_update_total_len(&head, htons(head.total_len)); + skb_reserve(skb, head.hdr_buf_pos); + skb_reset_transport_header(skb); + + /* build transport header */ + nndisc_payload_ns_pack(solicit, skb); + skb_reserve(skb, payload_len); + + skb->data = skb_network_header(skb); + skb->len = head.hdr_buf_pos + payload_len; + + dst = nndisc_dst_alloc(dev); + if (!dst) { + kfree_skb(skb); + return; + } + + /* add check sum*/ + checksum = nip_get_nndisc_send_checksum(skb, &head, payload_len); + nip_insert_nndisc_send_checksum(skb, checksum); + + skb_dst_set(skb, dst); + net = dev_net(skb->dev); + + /* DST is set to SKB, and output is used to release SKB regardless of success or failure */ + ret = dst_output(net, sk, skb); + if (ret) + DEBUG("%s: dst output fail.", __func__); +} + +static void nndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) +{ + struct net_device *dev = neigh->dev; + struct nip_addr *target = (struct nip_addr *)&neigh->primary_key; + struct nip_addr *saddr = NULL; + struct ninet_dev *idev; + + /* Obtain the NewIP address from the current dev as + * the source address of the request packet + */ + rcu_read_lock(); + idev = __nin_dev_get(dev); + if (idev) { + read_lock_bh(&idev->lock); + if (!list_empty(&idev->addr_list)) { + struct ninet_ifaddr *ifp; + + list_for_each_entry(ifp, &idev->addr_list, if_list) { + saddr = &ifp->addr; + nndisc_send_ns(dev, target, + 
&nip_broadcast_addr_arp,
+ saddr);
+ }
+ }
+ read_unlock_bh(&idev->lock);
+ } else {
+ DEBUG("%s: idev doesn't exist.", __func__);
+ }
+ rcu_read_unlock();
+}
+
+static void build_na_hdr(u_char *smac, u_char mac_len, struct sk_buff *skb)
+{
+ struct nnd_msg *msg = (struct nnd_msg *)skb->data;
+ u_char *p = msg->data;
+
+ memset(&msg->icmph, 0, sizeof(msg->icmph));
+ msg->icmph.nip_icmp_type = NIP_ARP_NA;
+ msg->icmph.nip_icmp_cksum = 0;
+ *p = mac_len;
+ p++;
+ memcpy(p, smac, mac_len);
+}
+
+static int get_na_payload_len(struct net_device *dev)
+{
+ /* ICMP header length
+ * + one byte for the MAC address length field
+ * + the MAC address length itself
+ */
+ return sizeof(struct nip_icmp_hdr) + 1 + dev->addr_len;
+}
+
+static void nndisc_send_na(struct net_device *dev,
+ const struct nip_addr *daddr,
+ const struct nip_addr *saddr)
+{
+ int ret;
+ struct sk_buff *skb = NULL;
+ struct dst_entry *dst = NULL;
+ struct sock *sk = NULL;
+ int csummode = CHECKSUM_NONE;
+ int payload_len = get_na_payload_len(dev);
+ int len = NIP_ETH_HDR_LEN + NIP_HDR_MAX + payload_len;
+ u_char *smac = dev->dev_addr;
+ struct nip_hdr_encap head = {0};
+ u_short checksum = 0;
+
+ head.saddr = *saddr;
+ head.daddr = *daddr;
+ head.ttl = NIP_ARP_DEFAULT_TTL;
+ head.nexthdr = IPPROTO_NIP_ICMP;
+
+ skb = alloc_skb(len, 0);
+ if (!skb) {
+ DEBUG("%s: no space for skbuff!", __func__);
+ return;
+ }
+ skb->protocol = htons(ETH_P_NEWIP);
+ skb->ip_summed = csummode;
+ skb->csum = 0;
+ skb->dev = dev;
+ memset(NIPCB(skb), 0, sizeof(struct ninet_skb_parm));
+
+ NIPCB(skb)->dstaddr = head.daddr;
+ NIPCB(skb)->srcaddr = head.saddr;
+ NIPCB(skb)->nexthdr = head.nexthdr;
+
+ /* reserve space for hardware header */
+ skb_reserve(skb, NIP_ETH_HDR_LEN);
+ skb_reset_network_header(skb);
+
+ /* build nwk header */
+ head.hdr_buf = (unsigned char *)skb->data;
+ nip_hdr_comm_encap(&head);
+ head.total_len = head.hdr_buf_pos + payload_len;
+ nip_update_total_len(&head, htons(head.total_len));
+ skb_reserve(skb, head.hdr_buf_pos);
+ skb_reset_transport_header(skb);
+
+ /* build na header */
+ build_na_hdr(smac, dev->addr_len, skb);
+
+ /* skip transport hdr */
+ skb_reserve(skb, payload_len);
+
+ /* set skb->data to point to the network header */
+ skb->data = skb_network_header(skb);
+ skb->len = head.hdr_buf_pos + payload_len;
+
+ dst = nndisc_dst_alloc(dev);
+ if (!dst) {
+ kfree_skb(skb);
+ return;
+ }
+
+ /* add checksum */
+ checksum = nip_get_nndisc_send_checksum(skb, &head, payload_len);
+ nip_insert_nndisc_send_checksum(skb, checksum);
+
+ skb_dst_set(skb, dst);
+ ret = dst_output(dev_net(skb->dev), sk, skb);
+ if (ret)
+ DEBUG("%s: dst output fail.", __func__);
+}
+
+bool nip_addr_local(struct net_device *dev, struct nip_addr *addr)
+{
+ struct ninet_dev *idev;
+ bool ret = false;
+
+ rcu_read_lock();
+ idev = __nin_dev_get(dev);
+ if (idev) {
+ read_lock_bh(&idev->lock);
+ if (!list_empty(&idev->addr_list)) {
+ struct ninet_ifaddr *ifp;
+
+ list_for_each_entry(ifp, &idev->addr_list, if_list) {
+ if (nip_addr_eq(addr, &ifp->addr)) {
+ ret = true;
+ break;
+ }
+ }
+ }
+ read_unlock_bh(&idev->lock);
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+int nndisc_rcv_ns(struct sk_buff *skb)
+{
+ struct nnd_msg *msg = (struct nnd_msg *)skb_transport_header(skb);
+ u_char *p = msg->data;
+ u_char *lladdr;
+ struct nip_addr addr = {0};
+ struct neighbour *neigh;
+ struct ethhdr *eth;
+ struct net_device *dev = skb->dev;
+ int err = 0;
+
+ p = decode_nip_addr(p, &addr);
+ if (!p) {
+ DEBUG("failed to decode the source address!");
+ err = -EFAULT;
+ goto
out; + } + + if (nip_addr_invalid(&addr)) { + DEBUG("%s: icmp hdr addr invalid.", __func__); + err = -EFAULT; + goto out; + } + + if (!nip_addr_local(dev, &addr)) { + err = -ENXIO; + goto out; + } + + eth = (struct ethhdr *)skb_mac_header(skb); + lladdr = eth->h_source; + + /* checksum parse*/ + if (!nip_get_nndisc_rcv_checksum(skb, p)) { + DEBUG("%s:ns ICMP checksum failed, drop the packet", __func__); + err = -EINVAL; + goto out; + } + + neigh = __neigh_lookup(&nnd_tbl, &NIPCB(skb)->srcaddr, dev, lladdr || + !dev->addr_len); + if (neigh) { + neigh_update(neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_OVERRIDE, + 0); + neigh_release(neigh); + } + + nndisc_send_na(dev, &NIPCB(skb)->srcaddr, &addr); +out: + kfree_skb(skb); + return err; +} + +int nndisc_rcv_na(struct sk_buff *skb) +{ + struct nnd_msg *msg = (struct nnd_msg *)skb_transport_header(skb); + u_char *p = msg->data; + u_char len; + u8 lladdr[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))]; + struct net_device *dev = skb->dev; + struct neighbour *neigh; + + len = *p; + p++; + memset(lladdr, 0, ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))); + memcpy(lladdr, p, len); + + if (!nip_get_nndisc_rcv_checksum(skb, p + len)) { + DEBUG("%s:na ICMP checksum failed! drop the packet!" + , __func__); + kfree_skb(skb); + return 0; + } + + neigh = neigh_lookup(&nnd_tbl, &NIPCB(skb)->srcaddr, dev); + if (neigh) { + neigh_update(neigh, lladdr, NUD_REACHABLE, + NEIGH_UPDATE_F_OVERRIDE, 0); + neigh_release(neigh); + kfree_skb(skb); + return 0; + } + kfree_skb(skb); + return -EFAULT; +} + +int nndisc_rcv(struct sk_buff *skb) +{ + int ret = 0; + struct nip_icmp_hdr *hdr = nip_icmp_header(skb); + u8 type = hdr->nip_icmp_type; + + switch (type) { + case NIP_ARP_NS: + ret = nndisc_rcv_ns(skb); + break; + case NIP_ARP_NA: + ret = nndisc_rcv_na(skb); + break; + default: + DEBUG("arp packet type error"); + } + + return ret; +} + +int __init nndisc_init(void) +{ + neigh_table_init(NEIGH_NND_TABLE, &nnd_tbl); + return 0; +} diff --git a/code/net/newip/protocol.c b/code/net/newip/protocol.c new file mode 100644 index 0000000000000000000000000000000000000000..1f35d9ee2c77758d1c0dc2ffe0983e91298dca34 --- /dev/null +++ b/code/net/newip/protocol.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * NewIP INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. NewIP INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * NewIP INET protocol dispatch tables. + * + * Based on net/ipv6/protocol.c + */ +#include +#include +#include +#include + +const struct ninet_protocol __rcu *ninet_protos[MAX_INET_PROTOS] __read_mostly; + +int ninet_add_protocol(const struct ninet_protocol *prot, + unsigned char protocol) +{ + return !cmpxchg((const struct ninet_protocol **)&ninet_protos[protocol], + NULL, prot) ? 0 : -1; +} + +int ninet_del_protocol(const struct ninet_protocol *prot, + unsigned char protocol) +{ + int ret; + + ret = (cmpxchg((const struct ninet_protocol **)&ninet_protos[protocol], + prot, NULL) == prot) ? 0 : -1; + + synchronize_net(); + + return ret; +} + diff --git a/code/net/newip/route.c b/code/net/newip/route.c new file mode 100644 index 0000000000000000000000000000000000000000..b9fdb06fbc4e7c3564a8276bf85a758ef08addd6 --- /dev/null +++ b/code/net/newip/route.c @@ -0,0 +1,960 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. 
+ * + * NewIP INET + * An implementation of the TCP/IP protocol suite for the LINUX + * operating system. NewIP INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * ROUTE - implementation of the NewIP router. + * + * Based on net/ipv4/route.c + * Based on net/ipv6/route.c + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include /*copy_from_user()*/ +#include /*rtnl_lock()*/ + +#include +#include +#include +#include +#include + +#include +#include +#include "nip_hdr.h" + +static int nip_pkt_discard(struct sk_buff *skb); +static int nip_pkt_discard_out(struct net *net, struct sock *sk, + struct sk_buff *skb); +static unsigned int nip_mtu(const struct dst_entry *dst); + +static const struct nip_rt_info nip_null_entry_template = { + .dst = { + .__refcnt = ATOMIC_INIT(1), + .__use = 1, + .obsolete = DST_OBSOLETE_FORCE_CHK, + .error = -ENETUNREACH, + .input = nip_pkt_discard, + .output = nip_pkt_discard_out, + }, + .rt_ref = ATOMIC_INIT(1), +}; + +static const struct nip_rt_info nip_broadcast_entry_template = { + .dst = { + .__refcnt = ATOMIC_INIT(1), + .__use = 1, + .obsolete = DST_OBSOLETE_FORCE_CHK, + .input = nip_input, + .output = nip_output, + }, + .rt_ref = ATOMIC_INIT(1), +}; + +struct nip_addr *nip_nexthop(struct nip_rt_info *rt, struct nip_addr *daddr) +{ + if (rt->rt_flags & RTF_GATEWAY) + return &rt->gateway; + else + return daddr; +} + +static void rtmsg_to_fibni_config(struct net *net, struct nip_rtmsg *rtmsg, + struct nip_fib_config *cfg) +{ + memset(cfg, 0, sizeof(*cfg)); + + cfg->fc_table = NIP_RT_TABLE_MAIN; + cfg->fc_ifindex = rtmsg->rtmsg_ifindex; + cfg->fc_metric = rtmsg->rtmsg_metric; + cfg->fc_expires = rtmsg->rtmsg_info; + + cfg->fc_flags = rtmsg->rtmsg_flags; + + cfg->fc_nlinfo.nl_net = net; + + cfg->fc_dst = rtmsg->rtmsg_dst; + cfg->fc_src = rtmsg->rtmsg_src; + cfg->fc_gateway = rtmsg->rtmsg_gateway; +} + +static void nip_rt_info_init(struct nip_rt_info *rt) +{ + struct dst_entry *dst = &rt->dst; + + memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); + rt->from = NULL; +} + +static struct nip_rt_info *__nip_dst_alloc(struct net *net, + struct net_device *dev, int flags) +{ + struct nip_rt_info *rt = + dst_alloc(&net->newip.nip_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, + flags); + + if (rt) + nip_rt_info_init(rt); + + return rt; +} + +struct nip_rt_info *nip_dst_alloc(struct net *net, struct net_device *dev, + int flags) +{ + struct nip_rt_info *rt = __nip_dst_alloc(net, dev, flags); + + if (rt) { + rt->rt_pcpu = + alloc_percpu_gfp(struct nip_rt_info *, GFP_ATOMIC); + if (rt->rt_pcpu) { + int cpu; + + for_each_possible_cpu(cpu) { + struct nip_rt_info **p; + + p = per_cpu_ptr(rt->rt_pcpu, cpu); + /* no one shares rt */ + *p = NULL; + } + } else { + dst_destroy((struct dst_entry *)rt); + return NULL; + } + } + + return rt; +} + +static void nip_rt_dst_from_metrics_check(struct nip_rt_info *rt) +{ + if (rt->from && + dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->from)) + dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->from), true); +} + +static struct nip_rt_info *nip_rt_get_pcpu_route(struct nip_rt_info *rt) +{ + struct nip_rt_info *pcpu_rt, **p; + + p = this_cpu_ptr(rt->rt_pcpu); + pcpu_rt = *p; + + if (pcpu_rt) { + dst_hold(&pcpu_rt->dst); + nip_rt_dst_from_metrics_check(pcpu_rt); + } + return pcpu_rt; +} + +static void nip_rt_set_from(struct nip_rt_info *rt, struct nip_rt_info *from) +{ + 
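+ /* Origin-chaining note: a pcpu clone holds a reference on the route it
+  * was copied from and shares that route's metrics block instead of
+  * duplicating it; nip_rt_dst_from_metrics_check() above re-links the
+  * metrics pointer if the two ever diverge. Chains of ->from are never
+  * expected here, hence the WARN below.
+  */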
WARN_ON(from->from); + + rt->rt_flags &= ~RTF_EXPIRES; + dst_hold(&from->dst); + rt->from = &from->dst; + dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true); +} + +static void nip_rt_copy_init(struct nip_rt_info *rt, struct nip_rt_info *ort) +{ + rt->dst.input = ort->dst.input; + rt->dst.output = ort->dst.output; + rt->rt_dst = ort->rt_dst; + rt->dst.error = ort->dst.error; + rt->rt_idev = ort->rt_idev; + if (rt->rt_idev) + nin_dev_hold(rt->rt_idev); + + rt->dst.lastuse = jiffies; + rt->gateway = ort->gateway; + rt->rt_flags = ort->rt_flags; + nip_rt_set_from(rt, ort); + rt->rt_metric = ort->rt_metric; + rt->rt_table = ort->rt_table; + rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate); +} + +static struct nip_rt_info *nip_rt_pcpu_alloc(struct nip_rt_info *rt) +{ + struct nip_rt_info *pcpu_rt; + + pcpu_rt = __nip_dst_alloc(dev_net(rt->dst.dev), + rt->dst.dev, rt->dst.flags); + if (!pcpu_rt) + return NULL; + nip_rt_copy_init(pcpu_rt, rt); + pcpu_rt->rt_protocol = rt->rt_protocol; + pcpu_rt->rt_flags |= RTF_PCPU; + return pcpu_rt; +} + +static struct nip_rt_info *nip_rt_make_pcpu_route(struct nip_rt_info *rt) +{ + struct nip_rt_info *pcpu_rt, *prev, **p; + + pcpu_rt = nip_rt_pcpu_alloc(rt); + if (!pcpu_rt) { + struct net *net = dev_net(rt->dst.dev); + + dst_hold(&net->newip.nip_null_entry->dst); + return net->newip.nip_null_entry; + } + + rcu_read_lock_bh(); + if (rt->rt_pcpu) { + p = this_cpu_ptr(rt->rt_pcpu); + prev = cmpxchg(p, NULL, pcpu_rt); + if (prev) { + /* If someone did it before us, return prev instead */ + dst_destroy(&pcpu_rt->dst); + pcpu_rt = prev; + } + } else { + dst_destroy(&pcpu_rt->dst); + pcpu_rt = rt; + } + dst_hold(&pcpu_rt->dst); + nip_rt_dst_from_metrics_check(pcpu_rt); + rcu_read_unlock_bh(); + return pcpu_rt; +} + +static struct nip_rt_info *nip_pol_route_input(struct net *net, + struct nip_fib_table *table, + struct flow_nip *fln, int flags) +{ + return nip_pol_route(net, table, fln->flowin_iif, fln, flags); +} + +struct dst_entry *nip_route_input_lookup(struct net *net, + struct net_device *dev, + struct flow_nip *fln, int flags) +{ + return nip_fib_rule_lookup(net, fln, flags, nip_pol_route_input); +} + +void nip_route_input(struct sk_buff *skb) +{ + struct net *net = dev_net(skb->dev); + int flags = 0; + struct flow_nip fln = { + .flowin_iif = skb->skb_iif, + .daddr = NIPCB(skb)->dstaddr, + .saddr = NIPCB(skb)->srcaddr, + }; + + if (nip_addr_eq(&fln.daddr, &nip_broadcast_addr_arp)) { + DEBUG("%s: recv broadcast packet!\n", __func__); + dst_hold(&net->newip.nip_broadcast_entry->dst); + skb_dst_set(skb, + (struct dst_entry *)net->newip.nip_broadcast_entry); + return; + } + + skb_dst_set(skb, nip_route_input_lookup(net, skb->dev, &fln, flags)); +} + +static struct nip_rt_info *nip_pol_route_output(struct net *net, + struct nip_fib_table *table, + struct flow_nip *fln, int flags) +{ + return nip_pol_route(net, table, fln->flowin_oif, fln, flags); +} + +struct dst_entry *nip_route_output_flags(struct net *net, const struct sock *sk, + struct flow_nip *fln, int flags) +{ + struct dst_entry *dst; + struct nip_rt_info *rt; + + dst = nip_fib_rule_lookup(net, fln, flags, nip_pol_route_output); + rt = (struct nip_rt_info *)dst; + + if (rt->rt_flags & RTF_LOCAL) { + rcu_read_lock(); + if (rt->rt_idev) { + read_lock_bh(&rt->rt_idev->lock); + /* search saddr in idev->addr */ + if (!list_empty(&rt->rt_idev->addr_list)) { + struct ninet_ifaddr *ifp; + + list_for_each_entry(ifp, &rt->rt_idev->addr_list, if_list) { + fln->saddr = ifp->addr; + break; + } + } + 
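+ /* Source-address selection sketch: for RTF_LOCAL routes the first
+  * address configured on the output interface is taken as fln->saddr
+  * (the loop breaks on the head entry); a host carrying several NewIP
+  * addresses per device would need a finer policy.
+  */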
read_unlock_bh(&rt->rt_idev->lock); + } + rcu_read_unlock(); + + dst_release(dst); + dst_hold(&net->newip.nip_broadcast_entry->dst); + return &net->newip.nip_broadcast_entry->dst; + } + + return dst; +} + +struct nip_rt_info *nip_pol_route(struct net *net, struct nip_fib_table *table, + int oif, struct flow_nip *fln, int flags) +{ + struct nip_fib_node *fn; + struct nip_rt_info *rt, *pcpu_rt; + + rcu_read_lock_bh(); + fn = nip_fib_locate(table->nip_tb_head, &fln->daddr); + if (!fn) { + rcu_read_unlock_bh(); + DEBUG("%s: search fail!\n", __func__); + rt = net->newip.nip_null_entry; + dst_hold_and_use(&rt->dst, jiffies); + return rt; + } + rt = fn->nip_route_info; + + /* Get a percpu copy */ + rt->dst.lastuse = jiffies; + rt->dst.__use++; + pcpu_rt = nip_rt_get_pcpu_route(rt); + + DEBUG("%s: cpu id = %d\n", __func__, smp_processor_id()); + + if (pcpu_rt) { + rcu_read_unlock_bh(); + DEBUG("%s: pcpu found!\n", __func__); + } else { + dst_hold(&rt->dst); + rcu_read_unlock_bh(); + pcpu_rt = nip_rt_make_pcpu_route(rt); + dst_release(&rt->dst); + } + + DEBUG("%s: rt dst.__refcnt = %d ; pcpu dst.__refcnt = %d\n", __func__, + atomic_read(&rt->dst.__refcnt), + atomic_read(&pcpu_rt->dst.__refcnt)); + return pcpu_rt; +} + +bool nip_bind_addr_check(struct net *net, + struct nip_addr *addr) +{ + struct nip_fib_node *fn; + struct nip_fib_table *fib_tbl = net->newip.nip_fib_local_tbl; + + if (nip_addr_invalid(addr)) { + DEBUG("%s: binding-addr invalid.", __func__); + return false; + } + + if (nip_addr_eq(addr, &nip_any_addr)) { + DEBUG("%s: binding-addr is any addr.", __func__); + return true; + } + + rcu_read_lock_bh(); + fn = nip_fib_locate(fib_tbl->nip_tb_head, addr); + rcu_read_unlock_bh(); + if (!fn) { + DEBUG("%s: binding-addr is not local addr.", __func__); + return false; + } + + DEBUG("%s: binding-addr is local addr.", __func__); + return true; +} + +static struct nip_rt_info *nip_route_info_create(struct nip_fib_config *cfg) +{ + struct net *net = cfg->fc_nlinfo.nl_net; + struct nip_rt_info *rt = NULL; + struct net_device *dev = NULL; + struct ninet_dev *idev = NULL; + struct nip_fib_table *table; + int err = -ENODEV; + + /* find net_device */ + dev = dev_get_by_index(net, cfg->fc_ifindex); + if (!dev) { + DEBUG("%s: fail to get dev by ifindex(%u).", __func__, cfg->fc_ifindex); + goto out; + } + + /* find ninet_dev,which has the newip address list */ + idev = nin_dev_get(dev); + if (!idev) { + DEBUG("%s: fail to get ninet dev.(ifindex=%u)", __func__, cfg->fc_ifindex); + goto out; + } + + if (cfg->fc_metric == 0) + cfg->fc_metric = NIP_RT_PRIO_USER; + + err = -ENOBUFS; + table = nip_fib_get_table(net, cfg->fc_table); + if (!table) { + DEBUG("%s: fail to get fib table.(fc_table=%u)", __func__, cfg->fc_table); + goto out; + } + + rt = nip_dst_alloc(net, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 
0 : DST_NOCOUNT);
+ if (!rt) {
+ DEBUG("%s: failed to alloc dst mem.", __func__);
+ err = -ENOMEM;
+ goto out;
+ }
+
+ nip_rt_clean_expires(rt);
+
+ if (cfg->fc_protocol == RTPROT_UNSPEC)
+ cfg->fc_protocol = RTPROT_BOOT;
+ rt->rt_protocol = cfg->fc_protocol;
+
+ if (cfg->fc_flags & RTF_LOCAL) {
+ rt->dst.input = nip_input;
+ DEBUG("rt->dst.input = nip_input, ifindex=%u", cfg->fc_ifindex);
+ } else {
+ rt->dst.input = nip_forward;
+ DEBUG("rt->dst.input = nip_forward, ifindex=%u", cfg->fc_ifindex);
+ }
+
+ rt->dst.output = nip_output;
+ rt->rt_dst = cfg->fc_dst;
+ rt->rt_src = cfg->fc_src;
+ rt->rt_metric = cfg->fc_metric;
+
+ if (cfg->fc_flags & RTF_GATEWAY)
+ rt->gateway = cfg->fc_gateway;
+ else
+ rt->gateway = nip_any_addr;
+
+ rt->rt_flags = cfg->fc_flags;
+ rt->dst.dev = dev;
+ rt->rt_idev = idev;
+ rt->rt_table = table;
+
+ return rt;
+out:
+ if (dev)
+ dev_put(dev);
+ if (idev)
+ nin_dev_put(idev);
+ return ERR_PTR(err);
+}
+
+/* __nip_ins_rt is called with table->nip_tb_lock NOT held.
+ * It takes the new route entry; if the addition fails for any reason,
+ * the route is released.
+ */
+static int __nip_ins_rt(struct nip_rt_info *rt)
+{
+ int err;
+ struct nip_fib_table *table;
+
+ table = rt->rt_table;
+
+ spin_lock_bh(&table->nip_tb_lock);
+ err = nip_fib_add(table->nip_tb_head, rt);
+ spin_unlock_bh(&table->nip_tb_lock);
+
+ return err;
+}
+
+int nip_ins_rt(struct nip_rt_info *rt)
+{
+ /* Hold dst to account for the reference from the nip fib hash */
+ dst_hold(&rt->dst);
+ return __nip_ins_rt(rt);
+}
+
+int nip_route_add(struct nip_fib_config *cfg)
+{
+ struct nip_rt_info *rt;
+ int err;
+
+ rt = nip_route_info_create(cfg);
+ if (IS_ERR(rt)) {
+ DEBUG("%s: failed to create route info.", __func__);
+ err = PTR_ERR(rt);
+ rt = NULL;
+ goto out;
+ }
+
+ err = __nip_ins_rt(rt);
+out:
+ return err;
+}
+
+static int __nip_del_rt(struct nip_rt_info *rt, struct nl_info *info)
+{
+ int err;
+ struct nip_fib_table *table;
+ struct net *net = dev_net(rt->dst.dev);
+
+ if (rt == net->newip.nip_null_entry) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ table = rt->rt_table;
+ spin_lock_bh(&table->nip_tb_lock);
+ err = nip_fib_del(rt, info);
+ spin_unlock_bh(&table->nip_tb_lock);
+
+out:
+ nip_rt_put(rt);
+ return err;
+}
+
+int nip_del_rt(struct nip_rt_info *rt)
+{
+ struct nl_info info = {
+ .nl_net = dev_net(rt->dst.dev),
+ };
+ return __nip_del_rt(rt, &info);
+}
+
+static int nip_route_del(struct nip_fib_config *cfg)
+{
+ struct net *net = cfg->fc_nlinfo.nl_net;
+ struct nip_fib_table *table;
+ struct nip_fib_node *fn;
+ struct nip_rt_info *rt;
+ int err = -ESRCH;
+
+ table = nip_fib_get_table(net, cfg->fc_table);
+ if (!table)
+ return err;
+
+ rcu_read_lock_bh();
+ fn = nip_fib_locate(table->nip_tb_head, &cfg->fc_dst);
+ if (fn) {
+ rt = fn->nip_route_info;
+ dst_hold(&rt->dst);
+ rcu_read_unlock_bh();
+
+ return __nip_del_rt(rt, &cfg->fc_nlinfo);
+ }
+ rcu_read_unlock_bh();
+
+ return err;
+}
+
+int nip_route_ioctl(struct net *net, unsigned int cmd, struct nip_rtmsg *rtmsg)
+{
+ struct nip_fib_config cfg;
+ int err;
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) {
+ DEBUG("%s: no CAP_NET_ADMIN, route configuration denied.", __func__);
+ return -EPERM;
+ }
+
+ rtmsg_to_fibni_config(net, rtmsg, &cfg);
+ if (nip_addr_invalid(&cfg.fc_dst)) {
+ DEBUG("%s: nip daddr invalid.", __func__);
+ return -EFAULT;
+ }
+
+ if (cfg.fc_flags & RTF_GATEWAY) {
+ if (nip_addr_invalid(&cfg.fc_gateway)) {
+ DEBUG("%s: nip gateway address invalid.", __func__);
+ return -EFAULT;
+ }
+ }
+
+ rtnl_lock();
+ switch (cmd) {
+ case SIOCADDRT: /* Add a route */
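+ /* Illustrative userspace call path (sketch only; the AF_NINET
+  * datagram socket and the header providing struct nip_rtmsg are
+  * assumptions, not defined by this patch):
+  *
+  *   struct nip_rtmsg rm = {0};
+  *   rm.rtmsg_ifindex = if_nametoindex("eth0");
+  *   rm.rtmsg_dst     = dst;               // struct nip_addr
+  *   rm.rtmsg_flags   = RTF_UP;
+  *   fd = socket(AF_NINET, SOCK_DGRAM, 0);
+  *   ioctl(fd, SIOCADDRT, &rm);            // dispatches to nip_route_ioctl()
+  */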
+ err = nip_route_add(&cfg); + break; + case SIOCDELRT: /* Delete a route */ + err = nip_route_del(&cfg); + break; + default: + err = -EINVAL; + } + rtnl_unlock(); + + return err; +} + +static void nip_dst_destroy(struct dst_entry *dst) +{ + struct nip_rt_info *rt = (struct nip_rt_info *)dst; + struct dst_entry *from = rt->from; + struct ninet_dev *idev; + + dst_destroy_metrics_generic(dst); + free_percpu(rt->rt_pcpu); + + idev = rt->rt_idev; + if (idev) { + rt->rt_idev = NULL; + DEBUG("%s: idev->refcnt=%u\n", __func__, + refcount_read(&idev->refcnt)); + nin_dev_put(idev); + } + + if (from) { + DEBUG("%s: from->__refcnt = %d\n", __func__, + atomic_read(&from->__refcnt)); + } + rt->from = NULL; + dst_release(from); +} + +static inline const void *nip_choose_neigh_daddr(struct nip_rt_info *rt, + struct sk_buff *skb, + const void *daddr) +{ + struct nip_addr *p = &rt->gateway; + + if (rt->rt_flags & RTF_GATEWAY) + return (const void *)p; + else if (skb) + return &NIPCB(skb)->dstaddr; + return daddr; +} + +static struct neighbour *nip_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr) +{ + struct nip_rt_info *rt = (struct nip_rt_info *)dst; + struct neighbour *n; + + daddr = nip_choose_neigh_daddr(rt, skb, daddr); + n = __nip_neigh_lookup(dst->dev, daddr); + if (n) + return n; + return neigh_create(&nnd_tbl, daddr, dst->dev); +} + +static struct dst_entry *nip_dst_check(struct dst_entry *dst, u32 cookie) +{ + return dst; +} + +/* Used to calculate the MSS value required by TCP + * Because there is no MSS in the TCP of NewIP, + * the value is calculated based on the MTU of the network port + */ +static unsigned int nip_default_advmss(const struct dst_entry *dst) +{ + unsigned int mtu = dst_mtu(dst); + + mtu -= NIP_HDR_MAX + sizeof(struct tcphdr); + + return mtu; +} + +static unsigned int nip_mtu(const struct dst_entry *dst) +{ + unsigned int mtu; + struct ninet_dev *idev; + + mtu = NIP_MIN_MTU; + + rcu_read_lock(); + idev = __nin_dev_get(dst->dev); + if (idev) + mtu = idev->cnf.mtu; + rcu_read_unlock(); + + return mtu; +} + +static struct dst_ops nip_dst_ops_template = { + .family = AF_NINET, + .destroy = nip_dst_destroy, + .neigh_lookup = nip_neigh_lookup, + .check = nip_dst_check, + .default_advmss = nip_default_advmss, + .mtu = nip_mtu, +}; + +static int nip_pkt_discard(struct sk_buff *skb) +{ + kfree_skb(skb); + return 0; +} + +static int nip_pkt_discard_out(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + kfree_skb(skb); + return 0; +} + +struct nip_rt_info *nip_addrconf_dst_alloc(struct ninet_dev *idev, + const struct nip_addr *addr) +{ + u32 tb_id; + struct net *net = dev_net(idev->dev); + struct net_device *dev = idev->dev; + struct nip_rt_info *rt; + + rt = nip_dst_alloc(net, dev, DST_NOCOUNT); + if (!rt) + return ERR_PTR(-ENOMEM); + + nin_dev_hold(idev); + + rt->dst.flags |= DST_HOST; + rt->dst.input = nip_input; + rt->dst.output = nip_output; + rt->rt_idev = idev; + + rt->rt_protocol = RTPROT_KERNEL; + rt->rt_flags = RTF_UP | RTF_NONEXTHOP; + rt->rt_flags |= RTF_LOCAL; + + rt->gateway = *addr; + rt->rt_dst = *addr; + tb_id = NIP_RT_TABLE_LOCAL; + rt->rt_table = nip_fib_get_table(net, tb_id); + + return rt; +} + +struct arg_dev_net { + struct net_device *dev; + struct net *net; +}; + +/* Determine whether an RT should be deleted along with ifDown + * called with nip_tb_lock held for table with rt + */ +static int nip_fib_ifdown(struct nip_rt_info *rt, void *arg) +{ + const struct arg_dev_net *adn = arg; + const struct net_device *dev = 
adn->dev; + + if ((rt->dst.dev == dev || !dev) && + rt != adn->net->newip.nip_null_entry && + rt != adn->net->newip.nip_broadcast_entry && + ((dev && netdev_unregistering(dev)) || + !rt->rt_idev->cnf.ignore_routes_with_linkdown)) + return -1; + + return 0; +} + +void nip_rt_ifdown(struct net *net, struct net_device *dev) +{ + struct arg_dev_net adn = { + .dev = dev, + .net = net, + }; + + nip_fib_clean_all(net, nip_fib_ifdown, &adn); +} + +static int __net_init nip_route_net_init(struct net *net) +{ + int ret = -ENOMEM; + + memcpy(&net->newip.nip_dst_ops, &nip_dst_ops_template, + sizeof(net->newip.nip_dst_ops)); + + if (dst_entries_init(&net->newip.nip_dst_ops) < 0) + goto out; + + net->newip.nip_null_entry = kmemdup(&nip_null_entry_template, + sizeof(*net->newip.nip_null_entry), + GFP_KERNEL); + if (!net->newip.nip_null_entry) + goto out_nip_dst_entries; + net->newip.nip_null_entry->dst.ops = &net->newip.nip_dst_ops; + dst_init_metrics(&net->newip.nip_null_entry->dst, dst_default_metrics.metrics, true); + + net->newip.nip_broadcast_entry = + kmemdup(&nip_broadcast_entry_template, + sizeof(*net->newip.nip_broadcast_entry), + GFP_KERNEL); + if (!net->newip.nip_broadcast_entry) + goto out_nip_null_entry; + net->newip.nip_broadcast_entry->dst.ops = &net->newip.nip_dst_ops; + dst_init_metrics(&net->newip.nip_broadcast_entry->dst, dst_default_metrics.metrics, true); + ret = 0; +out: + return ret; + +out_nip_null_entry: + kfree(net->newip.nip_null_entry); +out_nip_dst_entries: + dst_entries_destroy(&net->newip.nip_dst_ops); + goto out; +} + +static void __net_exit nip_route_net_exit(struct net *net) +{ + kfree(net->newip.nip_broadcast_entry); + kfree(net->newip.nip_null_entry); + dst_entries_destroy(&net->newip.nip_dst_ops); +} + +static struct pernet_operations nip_route_net_ops = { + .init = nip_route_net_init, + .exit = nip_route_net_exit, +}; + +static int nip_route_dev_notify(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + + if (!(dev->flags & IFF_LOOPBACK)) + return NOTIFY_OK; + + if (event == NETDEV_REGISTER) { + net->newip.nip_null_entry->dst.dev = dev; + net->newip.nip_null_entry->rt_idev = nin_dev_get(dev); + + net->newip.nip_broadcast_entry->dst.dev = dev; + net->newip.nip_broadcast_entry->rt_idev = nin_dev_get(dev); + } else if (event == NETDEV_UNREGISTER && + dev->reg_state != NETREG_UNREGISTERED) { + nin_dev_put_clear(&net->newip.nip_null_entry->rt_idev); + nin_dev_put_clear(&net->newip.nip_broadcast_entry->rt_idev); + } + + return NOTIFY_OK; +} + +static void seq_printf_nipaddr_to_proc(struct seq_file *seq, + struct nip_addr *addr) +{ + int i = 0; + + for (i = 0; i < addr->bitlen / NIP_ADDR_BIT_LEN_8; i++) + seq_printf(seq, "%02x", addr->nip_addr_field8[i]); + + seq_puts(seq, "\t"); +} + +static void nip_route_show_table(struct seq_file *seq, + struct nip_fib_table *table) +{ + struct nip_fib_node *fn; + int i; + + rcu_read_lock_bh(); + for (i = 0; i < NIN_ROUTE_HSIZE; i++) { + hlist_for_each_entry_rcu(fn, &table->nip_tb_head[i], + fib_hlist) { + struct nip_rt_info *rt = fn->nip_route_info; + + seq_printf_nipaddr_to_proc(seq, &rt->rt_dst); + seq_printf_nipaddr_to_proc(seq, &rt->gateway); + seq_printf(seq, "%4u %4s\n", rt->rt_flags, + rt->dst.dev ? 
rt->dst.dev->name : ""); + } + } + rcu_read_unlock_bh(); +} + +static int nip_route_proc_show(struct seq_file *seq, void *v) +{ + struct net *net = seq->private; + + nip_route_show_table(seq, net->newip.nip_fib_main_tbl); + nip_route_show_table(seq, net->newip.nip_fib_local_tbl); + + return 0; +} + +static int __net_init nip_route_net_init_late(struct net *net) +{ + proc_create_net_single("nip_route", 0444, net->proc_net, + nip_route_proc_show, NULL); + return 0; +} + +static void __net_exit nip_route_net_exit_late(struct net *net) +{ + remove_proc_entry("nip_route", net->proc_net); +} + +static struct pernet_operations nip_route_net_late_ops = { + .init = nip_route_net_init_late, + .exit = nip_route_net_exit_late, +}; + +static struct notifier_block nip_route_dev_notifier = { + .notifier_call = nip_route_dev_notify, + .priority = ADDRCONF_NOTIFY_PRIORITY - 10, +}; + +int __init nip_route_init(void) +{ + int ret; + + ret = -ENOMEM; + + nip_dst_ops_template.kmem_cachep = + kmem_cache_create("nip_dst_cache", sizeof(struct nip_rt_info), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!nip_dst_ops_template.kmem_cachep) + goto out; + + ret = register_pernet_subsys(&nip_route_net_ops); + if (ret) + goto out_kmem_cache; + + ret = nip_fib_init(); + if (ret) + goto out_register_subsys; + + ret = register_pernet_subsys(&nip_route_net_late_ops); + if (ret) + goto out_nip_fib_init; + + ret = register_netdevice_notifier(&nip_route_dev_notifier); + if (ret) + goto out_register_late_subsys; + +out: + return ret; + +out_register_late_subsys: + unregister_pernet_subsys(&nip_route_net_late_ops); +out_nip_fib_init: + nip_fib_gc_cleanup(); +out_register_subsys: + unregister_pernet_subsys(&nip_route_net_ops); +out_kmem_cache: + kmem_cache_destroy(nip_dst_ops_template.kmem_cachep); + goto out; +} + +void nip_route_cleanup(void) +{ + unregister_pernet_subsys(&nip_route_net_late_ops); + nip_fib_gc_cleanup(); + unregister_pernet_subsys(&nip_route_net_ops); + kmem_cache_destroy(nip_dst_ops_template.kmem_cachep); +} + diff --git a/code/net/newip/tcp_nip.c b/code/net/newip/tcp_nip.c new file mode 100644 index 0000000000000000000000000000000000000000..8c278528478be7e9a2ff52856357fe876fce62c7 --- /dev/null +++ b/code/net/newip/tcp_nip.c @@ -0,0 +1,1631 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * NewIP INET + * An implementation of the TCP/IP protocol suite for the LINUX + * operating system. NewIP INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * TCP over NewIP + * + * Based on net/ipv4/tcp.c + * Based on net/ipv4/tcp_ipv4.c + * Based on net/ipv6/tcp_ipv6.c + * Based on net/core/stream.c + * + * Description of States: + * + * TCP_SYN_SENT sent a connection request, waiting for ack + * + * TCP_SYN_RECV received a connection request, sent ack, + * waiting for final ack in three-way handshake. + * + * TCP_ESTABLISHED connection established + * + * TCP_FIN_WAIT1 our side has shutdown, waiting to complete + * transmission of remaining buffered data + * + * TCP_FIN_WAIT2 all buffered data sent, waiting for remote + * to shutdown + * + * TCP_CLOSING both sides have shutdown but we still have + * data we have to finish sending + * + * TCP_TIME_WAIT timeout to catch resent junk before entering + * closed, can only be entered from FIN_WAIT2 + * or CLOSING. 
Required because the other end + * may not have gotten our last ACK causing it + * to retransmit the data packet (which we ignore) + * + * TCP_CLOSE_WAIT remote side has shutdown and is waiting for + * us to finish writing our data and to shutdown + * (we have to close() to move on to LAST_ACK) + * + * TCP_LAST_ACK out side has shutdown after remote has + * shutdown. There may still be data in our + * buffer that we have to finish sending + * + * TCP_CLOSE socket is finished + */ +#define pr_fmt(fmt) "NIP-TCP: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nip_checksum.h" +#include "tcp_nip_parameter.h" + +static const struct inet_connection_sock_af_ops newip_specific; + +static void tcp_nip_push(struct sock *sk, int flags, int mss_now, + int nonagle, int size_goal) +{ + __tcp_nip_push_pending_frames(sk, mss_now, nonagle); +} + +static const unsigned char new_state[16] = { + /* current state: new state: action: */ +[0 /* (Invalid) */] = TCP_CLOSE, +[TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, +[TCP_SYN_SENT] = TCP_CLOSE, +[TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, +[TCP_FIN_WAIT1] = TCP_FIN_WAIT1, +[TCP_FIN_WAIT2] = TCP_FIN_WAIT2, +[TCP_TIME_WAIT] = TCP_CLOSE, +[TCP_CLOSE] = TCP_CLOSE, +[TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN, +[TCP_LAST_ACK] = TCP_LAST_ACK, +[TCP_LISTEN] = TCP_CLOSE, +[TCP_CLOSING] = TCP_CLOSING, +[TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */ +}; + +bool nip_get_tcp_input_checksum(struct sk_buff *skb) +{ + struct nip_pseudo_header nph = {0}; + + nph.nexthdr = NIPCB(skb)->nexthdr; + nph.saddr = NIPCB(skb)->srcaddr; + nph.daddr = NIPCB(skb)->dstaddr; + + nph.check_len = htons(skb->len); + return nip_check_sum_parse(skb_transport_header(skb), + skb->len, &nph) + == 0xffff ? true : false; +} + +static int tcp_nip_close_state(struct sock *sk) +{ + int next = (int)new_state[sk->sk_state]; + int ns = next & TCP_STATE_MASK; + + tcp_set_state(sk, ns); + + return next & TCP_ACTION_FIN; +} + +void sk_nip_stream_kill_queues(struct sock *sk) +{ + /* First the read buffer. */ + __skb_queue_purge(&sk->sk_receive_queue); + + /* Next, the error queue. */ + __skb_queue_purge(&sk->sk_error_queue); + + /* Next, the write queue. */ + WARN_ON(!skb_queue_empty(&sk->sk_write_queue)); + + WARN_ON(sk->sk_wmem_queued); +} + +void tcp_nip_shutdown(struct sock *sk, int how) +{ + if (!(how & SEND_SHUTDOWN)) + return; + + /* If we've already sent a FIN, or it's a closed state, skip this. */ + if ((1 << sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_SYN_SENT | + TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) { + /* Clear out any half completed packets. FIN if needed. 
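+ * (new_state[] above encodes the transition: e.g. TCP_ESTABLISHED maps
+ * to TCP_FIN_WAIT1 | TCP_ACTION_FIN, so tcp_nip_close_state() moves the
+ * socket to FIN_WAIT1 and the TCP_ACTION_FIN bit triggers the FIN below.)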
*/ + if (tcp_nip_close_state(sk)) + tcp_nip_send_fin(sk); + } +} + +void tcp_nip_close(struct sock *sk, long timeout) +{ + struct sk_buff *skb; + int data_was_unread = 0; + int state; + + lock_sock(sk); + sk->sk_shutdown = SHUTDOWN_MASK; + + DEBUG("%s: sk_state:%d\n", __func__, sk->sk_state); + + if (sk->sk_state == TCP_LISTEN) { + tcp_set_state(sk, TCP_CLOSE); + + inet_csk_listen_stop(sk); + + goto adjudge_to_death; + } + + while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { + u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq; + + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + len--; + data_was_unread += len; + __kfree_skb(skb); + } + + if (sk->sk_state == TCP_CLOSE) + goto adjudge_to_death; + + if (data_was_unread) { + tcp_set_state(sk, TCP_CLOSE); + tcp_nip_send_active_reset(sk, sk->sk_allocation); + } else if (tcp_nip_close_state(sk)) { + /* RED-PEN. Formally speaking, we have broken TCP state + * machine. State transitions: + * + * TCP_ESTABLISHED -> TCP_FIN_WAIT1 + * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible) + * TCP_CLOSE_WAIT -> TCP_LAST_ACK + */ + DEBUG("%s: ready to send fin, sk_state:%d\n", __func__, sk->sk_state); + tcp_nip_send_fin(sk); + } + +adjudge_to_death: + state = sk->sk_state; + sock_hold(sk); + sock_orphan(sk); + + /* It is the last release_sock in its life. It will remove backlog. */ + release_sock(sk); + + local_bh_disable(); + bh_lock_sock(sk); + WARN_ON(sock_owned_by_user(sk)); + + this_cpu_dec(*sk->sk_prot->orphan_count); + + if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) + goto out; + + if (sk->sk_state == TCP_CLOSE) + inet_csk_destroy_sock(sk); + +out: + bh_unlock_sock(sk); + local_bh_enable(); + sock_put(sk); +} + +/* These states need RST on ABORT according to RFC793 */ +static inline bool tcp_nip_need_reset(int state) +{ + return (1 << state) & + (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | + TCPF_FIN_WAIT2 | TCPF_SYN_RECV); +} + +/* Function + * Initialize some of the parameters in request_sock + * Parameter + * req: Request connection control block + * sk_listener: Transmission control block + * skb: Transfer control block buffer + */ +static void tcp_nip_init_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) +{ + struct inet_request_sock *ireq = inet_rsk(req); + + ireq->ir_nip_rmt_addr = NIPCB(skb)->srcaddr; + ireq->ir_nip_loc_addr = NIPCB(skb)->dstaddr; +} + +/* Function + * Initialize The initialization number SEQ. 
Calculate the initial serial number of + * the server based on part of the source address source port, part of the destination + * address, and destination port + * Parameter + * skb: Transfer control block buffer + */ +static __u32 tcp_nip_init_sequence(const struct sk_buff *skb) +{ + return secure_tcp_nip_sequence_number(NIPCB(skb)->dstaddr.nip_addr_field32, + NIPCB(skb)->srcaddr.nip_addr_field32, + tcp_hdr(skb)->dest, + tcp_hdr(skb)->source); +} + +static struct dst_entry *tcp_nip_route_req(const struct sock *sk, + struct flowi *fl, + const struct request_sock *req) +{ + struct dst_entry *dst; + struct inet_request_sock *ireq = inet_rsk(req); + struct flow_nip fln; + + fln.daddr = ireq->ir_nip_rmt_addr; + dst = nip_route_output(sock_net(sk), sk, &fln); + return dst; +} + +/* Function + * Functions used by the client transport layer to connect requests + * This parameter is used to set the source address, destination address and interface + * Parameter + * sk: Transmission control block + * uaddr:The destination address + * addr_len:Destination address Length + */ +static int tcp_nip_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + struct sockaddr_nin *usin = (struct sockaddr_nin *)uaddr; + struct inet_sock *inet = inet_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + __be16 orig_dport; + struct nip_addr *daddr; + struct dst_entry *dst; + int err; + struct ip_options_rcu *inet_opt; + struct inet_timewait_death_row *tcp_death_row; + struct flow_nip fln; + + fln.daddr = usin->sin_addr; + + if (addr_len < sizeof(struct sockaddr_nin)) + return -EINVAL; + + if (usin->sin_family != AF_NINET) + return -EAFNOSUPPORT; + + inet_opt = rcu_dereference_protected(inet->inet_opt, + lockdep_sock_is_held(sk)); + /* Destination ADDRESS and port */ + daddr = &usin->sin_addr; + orig_dport = usin->sin_port; + + /* Find the route and obtain the source address */ + DEBUG("%s, sk->sk_bound_dev_if is %d", __func__, sk->sk_bound_dev_if); + fln.flowin_oif = sk->sk_bound_dev_if; + dst = nip_dst_lookup_flow(sock_net(sk), sk, &fln, NULL); + if (IS_ERR(dst)) { + DEBUG("%s cannot find dst\n", __func__); + err = PTR_ERR(dst); + goto failure; + } + + /* find the actual source addr for sk->sk_nip_rcv_saddr */ + if (nip_addr_eq(&sk->sk_nip_rcv_saddr, &nip_any_addr)) + sk->sk_nip_rcv_saddr = fln.saddr; + fln.saddr = sk->sk_nip_rcv_saddr; + + if (nip_addr_invalid(&fln.daddr)) { + DEBUG("%s: nip daddr invalid.", __func__); + err = -EFAULT; + goto failure; + } + + if (nip_addr_invalid(&fln.saddr)) { + DEBUG("%s: nip saddr invalid.", __func__); + err = -EFAULT; + goto failure; + } + + /* The destination address and port are set to the transport control block */ + inet->inet_dport = usin->sin_port; + sk->sk_nip_daddr = usin->sin_addr; + + inet_csk(sk)->icsk_ext_hdr_len = 0; + if (inet_opt) + inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; + + tcp_set_state(sk, TCP_SYN_SENT); + sk_set_txhash(sk); + sk_dst_set(sk, dst); + + /* Dynamically bind local ports */ + tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; + err = ninet_hash_connect(tcp_death_row, sk); + if (err) + goto late_failure; + + /* Class if the transport control block has already been linked */ + if (tp->rx_opt.ts_recent_stamp) { + /* Reset inherited state */ + tp->rx_opt.ts_recent = 0; + tp->rx_opt.ts_recent_stamp = 0; + if (likely(!tp->repair)) + tp->write_seq = 0; + } + + if (!tp->write_seq) + tp->write_seq = + secure_tcp_nip_sequence_number(sk->sk_nip_rcv_saddr.nip_addr_field32, + sk->sk_nip_daddr.nip_addr_field32, + inet->inet_sport, + 
usin->sin_port); + + inet->inet_id = prandom_u32(); + + /* Call tcp_connect to send the SYN field */ + err = __tcp_nip_connect(sk); + if (err) + goto late_failure; + + return 0; + +/* failure after tcp_set_state(sk, TCP_SYN_SENT) */ +late_failure: + tcp_set_state(sk, TCP_CLOSE); +failure: + sk->sk_route_caps = 0; + inet->inet_dport = 0; + return err; +} + +static void tcp_nip_send_reset(struct sock *sk, struct sk_buff *skb) +{ + const struct tcphdr *th = tcp_hdr(skb); + u32 seq = 0, ack_seq = 0, priority = gfp_any(); + + /* Never send a reset in response to a reset. */ + if (th->rst) + return; + + DEBUG("%s: send RST!\n", __func__); + + if (th->ack) + seq = ntohl(th->ack_seq); + else + ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len - + (th->doff << 2); + + tcp_nip_actual_send_reset(sk, skb, seq, ack_seq, 0, 1, priority); +} + +/* Function + * function used by the server to send SYN+ACK segments + * Parameter + * sk: Transmission control block + * dst: routing。 + * flowi: Flow control block + * req: Request connection control block + * foc: Fast open options + * synack_type: Type of the SYN+ACK segment + */ +static int tcp_nip_send_synack(const struct sock *sk, struct dst_entry *dst, + struct flowi *fl, + struct request_sock *req, + struct tcp_fastopen_cookie *foc, + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb) +{ + struct sk_buff *skb; + int err = -ENOMEM; + + skb = tcp_nip_make_synack(sk, dst, req, foc, synack_type); + if (skb) { + DEBUG("%s: TCP server create SYN+ACK skb successfully!", __func__); + rcu_read_lock(); + err = nip_send_synack(req, skb); + rcu_read_unlock(); + } + + return err; +} + +static void tcp_nip_reqsk_destructor(struct request_sock *req) +{ + kfree_skb(inet_rsk(req)->nip_pktopts); +} + +struct request_sock_ops tcp_nip_request_sock_ops __read_mostly = { + .family = AF_NINET, + .obj_size = sizeof(struct tcp_nip_request_sock), + .rtx_syn_ack = tcp_nip_rtx_synack, + .send_ack = NULL, + .destructor = tcp_nip_reqsk_destructor, + .send_reset = NULL, + .syn_ack_timeout = NULL, +}; + +static const struct tcp_request_sock_ops tcp_request_sock_newip_ops = { + .mss_clamp = TCP_BASE_MSS, +#ifdef CONFIG_TCP_MD5SIG + .req_md5_lookup = NULL, + .calc_md5_hash = NULL, +#endif + .init_req = tcp_nip_init_req, +#ifdef CONFIG_SYN_COOKIES + .cookie_init_seq = NULL, +#endif + .route_req = tcp_nip_route_req, + .init_seq = tcp_nip_init_sequence, + .send_synack = tcp_nip_send_synack, +}; + +/* Function + * The route cache saves the transport control block from the SKB + * Parameter + * sk: Transmission control block + * skb: Transfer control block buffer + * req: Request connection control block + * dst: routing + * req_unhash: Request connection control block + */ +void ninet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + + if (dst && dst_hold_safe(dst)) { + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; + } +} + +/* Function + * A function used by the server to process client connection requests + * Parameter + * sk: Transmission control block + * skb: Transfer control block buffer + */ +static int tcp_nip_conn_request(struct sock *sk, struct sk_buff *skb) +{ + return tcp_newip_conn_request(&tcp_nip_request_sock_ops, + &tcp_request_sock_newip_ops, sk, skb); +} + +/* Function + * Create child control blocks + * Parameter + * sk: Transmission control block + * skb: Transfer control block buffer + * req: Request connection control block + * dst: routing + * req_unhash: Request connection control 
block + */ +static struct sock *tcp_nip_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst, + struct request_sock *req_unhash, + bool *own_req) +{ + struct inet_request_sock *ireq = inet_rsk(req); + bool found_dup_sk = false; + struct tcp_nip_sock *newtcpnipsk; + struct inet_sock *newinet; + struct tcp_sock *newtp; + struct sock *newsk; + struct flow_nip fln; + + if (sk_acceptq_is_full(sk)) + goto out_overflow; + + fln.daddr = ireq->ir_nip_rmt_addr; + if (!dst) { + dst = nip_route_output(sock_net(sk), sk, &fln); + if (!dst) + goto out; + } + + newsk = tcp_nip_create_openreq_child(sk, req, skb); + if (!newsk) + goto out_nonewsk; + + /* Save the received route cache */ + ninet_sk_rx_dst_set(newsk, skb); + + newtcpnipsk = (struct tcp_nip_sock *)newsk; + + newtp = tcp_sk(newsk); + newinet = inet_sk(newsk); + + newsk->sk_nip_daddr = ireq->ir_nip_rmt_addr; + newsk->sk_nip_rcv_saddr = ireq->ir_nip_loc_addr; + + newinet->inet_opt = NULL; + + inet_csk(newsk)->icsk_ext_hdr_len = 0; + + newtp->retrans_stamp = jiffies; + + /* Negotiate MSS */ + newtp->mss_cache = TCP_BASE_MSS; + newtp->nip_out_of_order_queue = NULL; + newtp->advmss = dst_metric_advmss(dst); + if (tcp_sk(sk)->rx_opt.user_mss && + tcp_sk(sk)->rx_opt.user_mss < newtp->advmss) + newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; + + tcp_nip_initialize_rcv_mss(newsk); + if (__inet_inherit_port(sk, newsk) < 0) + goto put_and_exit; + /* Deleting the old sock from the ehash table and adding the new sock to the + * ehash table succeeds *own_req equals true + */ + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), + &found_dup_sk); + + /* newip newsk doesn't save this dst. release it. */ + dst_release(dst); + return newsk; + +out_overflow: + __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); +out_nonewsk: +out: + /* newip newsk doesn't save this dst. release it. 
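+ * (Error path: the reference taken by nip_route_output() above, if any,
+ * still has to be dropped before tcp_listendrop(); dst_release() accepts
+ * NULL, so the !dst case is safe as well.)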
*/ + dst_release(dst); + tcp_listendrop(sk); + return NULL; +put_and_exit: + newinet->inet_opt = NULL; + inet_csk_prepare_forced_close(newsk); + tcp_nip_done(newsk); + goto out; +} + +static const struct inet_connection_sock_af_ops newip_specific = { + .queue_xmit = tcp_nip_queue_xmit, + .send_check = NULL, + .rebuild_header = NULL, + .sk_rx_dst_set = ninet_sk_rx_dst_set, + .conn_request = tcp_nip_conn_request, + .syn_recv_sock = tcp_nip_syn_recv_sock, + .net_header_len = 0, + .net_frag_header_len = 0, + .setsockopt = nip_setsockopt, + .getsockopt = nip_getsockopt, + .addr2sockaddr = NULL, + .sockaddr_len = sizeof(struct sockaddr_nin), + + .mtu_reduced = NULL, +}; + +#define MAX_NIP_TCP_KEEPIDLE 32767 +#define MAX_NIP_TCP_KEEPINTVL 32767 +#define MAX_NIP_TCP_KEEPCNT 255 +static int tcp_nip_keepalive_para_update(struct sock *sk, + u32 keepalive_time, + u32 keepalive_intvl, + u8 keepalive_probes) +{ + int val; + struct tcp_sock *tp = tcp_sk(sk); + + /* set keep idle (TCP_KEEPIDLE) */ + val = keepalive_time; + if (val < 1 || val > MAX_NIP_TCP_KEEPIDLE) { + pr_crit("%s keepalive_time(%u) invalid.", __func__, val); + return -EINVAL; + } + + tp->keepalive_time = val; + if (sock_flag(sk, SOCK_KEEPOPEN) && + !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { + u32 elapsed = keepalive_time_elapsed(tp); + + if (tp->keepalive_time > elapsed) + elapsed = tp->keepalive_time - elapsed; + else + elapsed = 0; + inet_csk_reset_keepalive_timer(sk, elapsed); + } + + /* set keep intvl (TCP_KEEPINTVL) */ + val = keepalive_intvl; + if (val < 1 || val > MAX_NIP_TCP_KEEPINTVL) { + pr_crit("%s keepalive_intvl(%u) invalid.", __func__, val); + return -EINVAL; + } + tp->keepalive_intvl = val; + + /* set keep cnt (TCP_KEEPCNT) */ + val = keepalive_probes; + if (val < 1 || val > MAX_NIP_TCP_KEEPCNT) { + pr_crit("%s keepalive_probes(%u) invalid.", __func__, val); + return -EINVAL; + } + tp->keepalive_probes = val; + + /* enable keepalive (SO_KEEPALIVE) */ + if (sk->sk_prot->keepalive) { + sk->sk_prot->keepalive(sk, 1); + sock_valbool_flag(sk, SOCK_KEEPOPEN, 1); + } else { + pr_crit("%s keepalive func is null.", __func__); + } + + return 0; +} + +void tcp_nip_keepalive_enable(struct sock *sk) +{ + int ret; + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->nip_keepalive_enable) + return; + + ret = tcp_nip_keepalive_para_update(sk, g_nip_keepalive_time, + g_nip_keepalive_intvl, + g_nip_keepalive_probes); + if (ret != 0) { + pr_crit("%s fail", __func__); + return; + } + + pr_crit("%s ok", __func__); + tp->nip_keepalive_enable = true; +} + +void tcp_nip_keepalive_disable(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!tp->nip_keepalive_enable) + return; + + if (tp->idle_ka_probes_out < g_nip_idle_ka_probes_out) + return; + + /* enable keepalive (SO_KEEPALIVE) */ + if (sk->sk_prot->keepalive) + sk->sk_prot->keepalive(sk, 0); + sock_valbool_flag(sk, SOCK_KEEPOPEN, 0); + + pr_crit("%s ok, idle_ka_probes_out=%u", __func__, g_nip_idle_ka_probes_out); + tp->nip_keepalive_enable = false; +} + +/* Function + * Example Initialize sock information in TCP + * Parameter + * sk: Sock to be initialized + * Note: Currently, this function does not initialize timer, pre-queue, and congestion control, + * and does not allow fast retransmission. 
No function is set to adjust MSS + */ +static int tcp_nip_init_sock(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + + tp->out_of_order_queue = RB_ROOT; + tcp_nip_init_xmit_timers(sk); + INIT_LIST_HEAD(&tp->tsq_node); + + icsk->icsk_rto = g_nip_rto == 0 ? TCP_TIMEOUT_INIT : (unsigned int)(HZ / g_nip_rto); + icsk->icsk_rto_min = TCP_RTO_MIN; + icsk->icsk_delack_max = TCP_DELACK_MAX; + tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); + minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U); + + tp->snd_cwnd = TCP_INIT_CWND; + tp->app_limited = ~0U; + tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + tp->snd_cwnd_clamp = ~0; + tp->mss_cache = TCP_MSS_DEFAULT; + + tp->sacked_out = 0; + tp->rcv_tstamp = 0; + tp->selective_acks[0].start_seq = 0; + tp->selective_acks[0].end_seq = 0; + tp->ack_retrans_seq = 0; + tp->ack_retrans_num = 0; + tp->nip_ssthresh = g_nip_ssthresh_default; + tp->nip_ssthresh_reset = 0; + tp->nip_keepalive_enable = false; + tp->idle_ka_probes_out = 0; + tp->nip_keepalive_timeout_scale = 0; + + tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; + tp->tsoffset = 0; + sk->sk_state = TCP_CLOSE; + sk->sk_write_space = sk_stream_write_space; + sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); + + icsk->icsk_sync_mss = tcp_nip_sync_mss; + + WRITE_ONCE(sk->sk_sndbuf, g_nip_sndbuf); // sock_net(sk)->ipv4.sysctl_tcp_wmem[1] + WRITE_ONCE(sk->sk_rcvbuf, g_nip_rcvbuf); // sock_net(sk)->ipv4.sysctl_tcp_rmem[1] + + local_bh_disable(); + sk_sockets_allocated_inc(sk); + local_bh_enable(); + + icsk->icsk_af_ops = &newip_specific; + + return 0; +} + +static void skb_nip_entail(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + + skb->csum = 0; + tcb->seq = tp->write_seq; + tcb->end_seq = tp->write_seq; + tcb->tcp_flags = TCPHDR_ACK; + tcb->sacked = 0; + + tcp_nip_add_write_queue_tail(sk, skb); + + sk->sk_wmem_queued += skb->truesize; + sk_mem_charge(sk, skb->truesize); +} + +static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, + int large_allowed) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 new_size_goal, size_goal; + + if (!large_allowed) + return mss_now; + + /* Note : tcp_tso_autosize() will eventually split this later */ + new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER; + new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal); + + /* We try hard to avoid divides here */ + size_goal = tp->gso_segs * mss_now; + if (unlikely(new_size_goal < size_goal || + new_size_goal >= size_goal + mss_now)) { + tp->gso_segs = min_t(u16, new_size_goal / mss_now, + sk->sk_gso_max_segs); + size_goal = tp->gso_segs * mss_now; + } + + return max(size_goal, mss_now); +} + +int tcp_nip_send_mss(struct sock *sk, int *size_goal, int flags) +{ + int mss_now; + + mss_now = tcp_nip_current_mss(sk); + *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB)); + + DEBUG("%snip_send_mss%d", __func__, mss_now); + return mss_now; +} + +int tcp_nip_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + int flags, err, copied = 0; + int mss_now = 0, size_goal; + bool process_backlog = false; + long timeo; + + lock_sock(sk); + + flags = msg->msg_flags; + + timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + + if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && + !tcp_passive_fastopen(sk)) { + err = sk_stream_wait_connect(sk, &timeo); + if (err != 0) + goto do_error; + } + + /* This should be 
in poll */ + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); + + copied = 0; + +restart: + mss_now = tcp_nip_send_mss(sk, &size_goal, flags); + + DEBUG("%s: tcp_nip_send_mss %d\n", __func__, mss_now); + + err = -EPIPE; + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) + goto do_error; + + while (msg_data_left(msg)) { + int copy = 0; + int max = mss_now; + + bool first_skb; + + if (!sk_stream_memory_free(sk)) + goto wait_for_sndbuf; + + if (process_backlog && sk_flush_backlog(sk)) { + process_backlog = false; + goto restart; + } + first_skb = skb_queue_empty(&sk->sk_write_queue); + skb = sk_stream_alloc_skb(sk, mss_now, sk->sk_allocation, first_skb); + if (!skb) + goto wait_for_memory; + + skb->tstamp = 0; + process_backlog = true; + + skb_nip_entail(sk, skb); + copy = mss_now; + max = mss_now; + + /* Try to append data to the end of skb. */ + if (copy > msg_data_left(msg)) + copy = msg_data_left(msg); + + if (skb_availroom(skb) > 0) { + /* We have some space in skb head. Superb! */ + copy = min_t(int, copy, skb_availroom(skb)); + err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy); + if (err) + goto do_fault; + } else { + DEBUG("%s: msg too big! tcp cannot devide packet now\n", __func__); + goto out; + } + + if (!copied) + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; + + tp->write_seq += copy; + TCP_SKB_CB(skb)->end_seq += copy; + tcp_skb_pcount_set(skb, 0); + copied += copy; + if (!msg_data_left(msg)) { + if (unlikely(flags & MSG_EOR)) + TCP_SKB_CB(skb)->eor = 1; + goto out; + } + + continue; + +wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +wait_for_memory: + if (copied) + tcp_nip_push(sk, flags & ~MSG_MORE, mss_now, + TCP_NAGLE_PUSH, size_goal); + + err = sk_stream_wait_memory(sk, &timeo); + if (err != 0) + goto do_error; + + mss_now = tcp_nip_send_mss(sk, &size_goal, flags); + } + +out: + if (copied) + tcp_nip_push(sk, flags, mss_now, tp->nonagle, size_goal); + release_sock(sk); + return copied; + +do_fault: + if (!skb->len) { + tcp_unlink_write_queue(skb, sk); + sk_wmem_free_skb(sk, skb); + } + +do_error: + if (copied) + goto out; + + err = sk_stream_error(sk, flags, err); + /* make sure we wake any epoll edge trigger waiter */ + if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) + sk->sk_write_space(sk); + release_sock(sk); + return err; +} + +/* Clean up the receive buffer for full frames taken by the user, + * then send an ACK if necessary. COPIED is the number of bytes + * tcp_recvmsg has given to the user so far, it speeds up the + * calculation of whether or not we must ACK for the sake of + * a window update. + */ +void tcp_nip_cleanup_rbuf(struct sock *sk, int copied) +{ + struct tcp_sock *tp = tcp_sk(sk); + bool time_to_ack = false; + + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + + WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), + "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", + tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); + + if (inet_csk_ack_scheduled(sk)) { + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (/* Once-per-two-segments ACK was not sent */ + tp->rcv_nxt - tp->rcv_wup > (g_ack_num * 20 * icsk->icsk_ack.rcv_mss) || + /* If this read emptied read buffer, we send ACK, if + * connection is not bidirectional, user drained + * receive buffer and there was a small segment + * in queue. 
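+ * (Worked example with illustrative numbers only: for rcv_mss = 1448
+ * and g_ack_num = 5 the threshold above is 5 * 20 * 1448 = 144800
+ * bytes, i.e. an ACK is forced once roughly 141 KB have been received
+ * without acknowledgement.)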
+ */ + (copied > 0 && + ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) || + ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && + !inet_csk_in_pingpong_mode(sk))) && + !atomic_read(&sk->sk_rmem_alloc))) { + time_to_ack = true; + } + } + + /* We send an ACK if we can now advertise a non-zero window + * which has been raised "significantly". + * + * Even if window raised up to infinity, do not send window open ACK + * in states, where we will not receive more. It is useless. + */ + if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) { + __u32 rcv_window_now = tcp_receive_window(tp); + + /* Optimize, __nip_tcp_select_window() is not cheap. */ + if (2 * rcv_window_now <= tp->window_clamp) { + __u32 new_window = __nip_tcp_select_window(sk); + + /* Send ACK now, if this read freed lots of space + * in our buffer. Certainly, new_window is new window. + * We can advertise it now, if it is not less than current one. + * "Lots" means "at least twice" here. + */ + if (new_window && new_window >= 2 * rcv_window_now) + time_to_ack = true; + } + } + if (time_to_ack) + tcp_nip_send_ack(sk); +} + +int tcp_nip_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, + int flags, int *addr_len) +{ + struct tcp_sock *tp = tcp_sk(sk); + int copied = 0; + u32 *seq; + unsigned long used; + int err; + int target; + long timeo; + size_t len_tmp = len; + struct sk_buff *skb, *last; + + lock_sock(sk); + + if (sk->sk_state == TCP_LISTEN) + goto out; + + timeo = sock_rcvtimeo(sk, nonblock); + + seq = &tp->copied_seq; + + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len_tmp); + + do { + u32 offset; + /* Next get a buffer. */ + last = skb_peek_tail(&sk->sk_receive_queue); + skb_queue_walk(&sk->sk_receive_queue, skb) { + last = skb; + /* Now that we have two receive queues this + * shouldn't happen. + */ + if (WARN(before(*seq, TCP_SKB_CB(skb)->seq), + "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n", + *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, + flags)) + break; + offset = *seq - TCP_SKB_CB(skb)->seq; + if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { + pr_err_once("%s: found a SYN, please report !\n", __func__); + offset--; + } + if (offset < skb->len) + goto found_ok_skb; + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + goto found_fin_ok; + /* If the first SKB in the current SK_receive_queue is not the SKB to + * be replicated, then MSG_PEEK should be set in flags + */ + WARN(!(flags & MSG_PEEK), + "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n", + *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags); + } + + /* If the program is executed at this point, the SK_receive_queue is finished */ + /* If there is no data in the backlog, stop reading at target */ + if (copied >= target && !sk->sk_backlog.tail) + break; + + if (copied) { + if (sk->sk_err || + sk->sk_state == TCP_CLOSE || + (sk->sk_shutdown & RCV_SHUTDOWN) || + !timeo || + signal_pending(current)) + break; + } else { + if (sock_flag(sk, SOCK_DONE)) + break; + + if (sk->sk_err) { + copied = sock_error(sk); + break; + } + + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + + if (sk->sk_state == TCP_CLOSE) { + if (!sock_flag(sk, SOCK_DONE)) { + /* This occurs when user tries to read + * from never connected socket. + */ + copied = -ENOTCONN; + break; + } + break; + } + + if (!timeo) { + copied = -EAGAIN; + break; + } + + if (signal_pending(current)) { + copied = sock_intr_errno(timeo); + break; + } + } + + tcp_nip_cleanup_rbuf(sk, copied); + + if (copied >= target) { + /* Do not sleep, just process backlog. 
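+ * (Aside: the release_sock()/lock_sock() pair below is the standard
+ * backlog-drain idiom: release_sock() runs __release_sock(), which
+ * feeds every skb queued on sk->sk_backlog through sk_backlog_rcv(),
+ * i.e. tcp_nip_do_rcv() per tcp_nip_prot.backlog_rcv, before the
+ * lock is re-acquired.)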
*/ + release_sock(sk); + lock_sock(sk); + } else { + DEBUG("%s: no enough data receive queue, wait\n", __func__); + sk_wait_data(sk, &timeo, last); + } + continue; +found_ok_skb: + used = skb->len - offset; + if (len_tmp < used) + used = len_tmp; + DEBUG("%s: copy data into msg, len=%ld\n", __func__, used); + if (!(flags & MSG_TRUNC)) { + err = skb_copy_datagram_msg(skb, offset, msg, used); + if (err) { + DEBUG("%s: copy data failed!\n", __func__); + if (!copied) + copied = -EFAULT; + break; + } + } + *seq += used; + len_tmp -= used; + copied += used; + + if (used + offset < skb->len) + continue; + + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + goto found_fin_ok; + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); + continue; + +found_fin_ok: + /* Process the FIN. */ + ++*seq; + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); + break; + } while (len_tmp > 0); + + /* Clean up data we have read: This will do ACK frames. */ + tcp_nip_cleanup_rbuf(sk, copied); + + release_sock(sk); + return copied; + +out: + release_sock(sk); + return err; +} + +void skb_nip_ofo_queue_purge(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + while ((skb = tp->nip_out_of_order_queue) != NULL) { + tp->nip_out_of_order_queue = tp->nip_out_of_order_queue->next; + kfree_skb(skb); + } +} + +void tcp_nip_destroy_sock(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tcp_nip_clear_xmit_timers(sk); + + tcp_nip_write_queue_purge(sk); + + skb_nip_ofo_queue_purge(sk); + + if (inet_csk(sk)->icsk_bind_hash) + inet_put_port(sk); + + tcp_saved_syn_free(tp); + local_bh_disable(); + sk_sockets_allocated_dec(sk); + local_bh_enable(); +} + +/* Function + * The sock handler for THE LISTEN and ESTABLISHED states is called by tcp_nip_rCV + * Parameter + * skb: Packets received from the network layer + * sk: A SOCK instance needs to be processed + */ +static int tcp_nip_do_rcv(struct sock *sk, struct sk_buff *skb) +{ + DEBUG("%s: received newip tcp skb, sk_state=%d\n", __func__, sk->sk_state); + + if (sk->sk_state == TCP_ESTABLISHED) { + tcp_nip_rcv_established(sk, skb, tcp_hdr(skb), skb->len); + return 0; + } + + /* The connection is established in cookie mode to defend against SYN-flood attacks */ + if (sk->sk_state == TCP_LISTEN) + DEBUG("found TCP_LISTEN SOCK!!!\n"); + + if (tcp_nip_rcv_state_process(sk, skb)) + goto discard; + return 0; + +discard: + kfree_skb(skb); + return 0; +} + +/* Function: + * Fill the TCP header field in SKB into the TCP private control block, + * because the TCP header field in SKB is the network byte order, + * in order to facilitate later call, need to convert the host byte order + * and store in the TCP control block. + * Parameter: + * skb:Packets delivered by the network layer + * th:TCP header field in a packet + */ +static void tcp_nip_fill_cb(struct sk_buff *skb, const struct tcphdr *th) +{ + barrier(); + + TCP_SKB_CB(skb)->seq = ntohl(th->seq); + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + + skb->len - th->doff * TCP_NUM_4); + + TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); + TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); + TCP_SKB_CB(skb)->tcp_tw_isn = 0; + TCP_SKB_CB(skb)->sacked = 0; +} + +static bool tcp_nip_add_backlog(struct sock *sk, struct sk_buff *skb) +{ + u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf); + + /* Only socket owner can try to collapse/prune rx queues + * to reduce memory overhead, so add a little headroom here. + * Few sockets backlog are possibly concurrently non empty. 
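+ *
+ * Worked example (buffer sizes are assumptions; the real values come
+ * from the g_nip_rcvbuf/g_nip_sndbuf module parameters): with 1 MiB
+ * receive and send buffers, limit = 1048576 + 1048576 + 65536 =
+ * 2162688 bytes, beyond which sk_add_backlog() below rejects the skb.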
+ */ + limit += 64 * 1024; + + /* In case all data was pulled from skb frags (in __pskb_pull_tail()), + * we can fix skb->truesize to its real value to avoid future drops. + * This is valid because skb is not yet charged to the socket. + * It has been noticed pure SACK packets were sometimes dropped + * (if cooked by drivers without copybreak feature). + */ + skb_condense(skb); + + if (unlikely(sk_add_backlog(sk, skb, limit))) { + bh_unlock_sock(sk); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); + DEBUG("%s: insert backlog fail.\n", __func__); + return true; + } + return false; +} + +/* Function + * TCP is the gateway from the network layer to the transport layer + * and receives data packets from the network layer + * Parameter + * skb:Packets delivered by the network layer + */ +static int tcp_nip_rcv(struct sk_buff *skb) +{ + const struct tcphdr *th; + bool refcounted; + struct sock *sk; + int ret; + int dif = skb->skb_iif; + + if (skb->pkt_type != PACKET_HOST) + goto discard_it; + + if (!nip_get_tcp_input_checksum(skb)) + goto discard_it; + + th = (const struct tcphdr *)skb->data; + + if (unlikely(th->doff < sizeof(struct tcphdr) / TCP_NUM_4)) + goto bad_packet; + + sk = __ninet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), + th->source, th->dest, dif, + &refcounted); + if (!sk) + goto no_tcp_socket; + + if (sk->sk_state == TCP_TIME_WAIT) + goto do_time_wait; + if (sk->sk_state == TCP_NEW_SYN_RECV) { + struct request_sock *req = inet_reqsk(sk); + struct sock *nsk; + + DEBUG("%s: TCP server into third shake hands! sk->sk_state:%d", + __func__, sk->sk_state); + sk = req->rsk_listener; + + sock_hold(sk); + refcounted = true; + nsk = NULL; + /* You need to create a new SOCK and enter TCP_SYN_RECV, + * which is then set to Established + */ + if (!tcp_filter(sk, skb)) { + th = (const struct tcphdr *)skb->data; + tcp_nip_fill_cb(skb, th); + nsk = tcp_nip_check_req(sk, skb, req); + } + if (!nsk || nsk == sk) { + DEBUG("%s skb info error and create newsk failure!!!", __func__); + reqsk_put(req); + goto discard_and_relse; + } + if (tcp_nip_child_process(sk, nsk, skb)) { + goto discard_and_relse; + } else { + sock_put(sk); + return 0; + } + } + + tcp_nip_fill_cb(skb, th); + + if (tcp_filter(sk, skb)) + goto discard_and_relse; + th = (const struct tcphdr *)skb->data; + skb->dev = NULL; + + if (sk->sk_state == TCP_LISTEN) { + DEBUG("%s: TCP server into first shake hands! sk->sk_state:%d", + __func__, sk->sk_state); + ret = tcp_nip_do_rcv(sk, skb); + goto put_and_return; + } + bh_lock_sock_nested(sk); + + ret = 0; + if (!sock_owned_by_user(sk)) { + ret = tcp_nip_do_rcv(sk, skb); + } else { + DEBUG("%s: sock locked by user! put packet into backlog\n", + __func__); + if (tcp_nip_add_backlog(sk, skb)) + goto discard_and_relse; + } + + bh_unlock_sock(sk); + +put_and_return: + if (refcounted) + sock_put(sk); + return ret ? 
-1 : 0; + +no_tcp_socket: + /* Checksum checked, send reset back */ + tcp_nip_send_reset(NULL, skb); + DEBUG("%s: cannot find related tcp sock for skb", __func__); + goto discard_it; +bad_packet: + goto discard_it; +discard_it: + DEBUG("%s: drop tcp newip skb and release it\n", __func__); + kfree_skb(skb); + return 0; + +discard_and_relse: + sk_drops_add(sk, skb); + if (refcounted) + sock_put(sk); + goto discard_it; +/* Handles the SK portion of the interrupt state */ +do_time_wait: + goto discard_it; +} + +static void tcp_nip_early_demux(struct sk_buff *skb) +{ + const struct tcphdr *th; + struct sock *sk; + + if (skb->pkt_type != PACKET_HOST) + return; + + if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) + return; + + th = tcp_hdr(skb); + if (th->doff < sizeof(struct tcphdr) / 4) + return; + + sk = __ninet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, + &NIPCB(skb)->srcaddr, th->source, + &NIPCB(skb)->dstaddr, ntohs(th->dest), skb->skb_iif); + if (sk) { + DEBUG("%s: find related sock in ehash.", __func__); + skb->sk = sk; + skb->destructor = sock_edemux; + if (sk_fullsock(sk)) { + struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); + + if (dst && + inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) { + DEBUG("%s: set dst for skb.", __func__); + skb_dst_set_noref(skb, dst); + } + } + } +} + +void tcp_nip_done(struct sock *sk) +{ + struct request_sock *req = tcp_sk(sk)->fastopen_rsk; + + if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) + TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); + + tcp_set_state(sk, TCP_CLOSE); + inet_csk_clear_xmit_timers(sk); + if (req) + reqsk_fastopen_remove(sk, req, false); + + sk->sk_shutdown = SHUTDOWN_MASK; + + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + } else { + WARN_ON(sk->sk_state != TCP_CLOSE); + WARN_ON(!sock_flag(sk, SOCK_DEAD)); + + /* It cannot be in hash table! 
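+ *
+ * Both WARN_ON()s below assert teardown invariants only: the socket
+ * must already be unhashed, and a nonzero inet_num (local port) must
+ * still own its bind bucket so that inet_put_port() in the destroy
+ * callback has something to release.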
*/ + WARN_ON(!sk_unhashed(sk)); + + /* If it has not 0 inet_sk(sk)->inet_num, it must be bound */ + WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash); + sk->sk_prot->destroy(sk); + + sk_nip_stream_kill_queues(sk); + + local_bh_disable(); + this_cpu_dec(*sk->sk_prot->orphan_count); + local_bh_enable(); + sock_put(sk); + DEBUG("%s: close sock done!!\n", __func__); + } +} + +/* Function + * Disconnect the connection to the peer end, non-blocking + * Release read/write queue, send RST (not sent yet), clear timer + * Parameter + * sk: Transmission control block + */ +int tcp_nip_disconnect(struct sock *sk, int flags) +{ + struct inet_sock *inet = inet_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + int err = 0; + int old_state = sk->sk_state; + + if (old_state != TCP_CLOSE) + tcp_set_state(sk, TCP_CLOSE); + + if (old_state == TCP_LISTEN) { + inet_csk_listen_stop(sk); + } else if (tcp_nip_need_reset(old_state) || + (tp->snd_nxt != tp->write_seq && + (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { + tcp_nip_send_active_reset(sk, gfp_any()); + sk->sk_err = ECONNRESET; + } else if (old_state == TCP_SYN_SENT) { + sk->sk_err = ECONNRESET; + } + + tcp_nip_clear_xmit_timers(sk); + __skb_queue_purge(&sk->sk_receive_queue); + tcp_write_queue_purge(sk); + + inet->inet_dport = 0; + sk->sk_shutdown = 0; + sock_reset_flag(sk, SOCK_DONE); + tp->srtt_us = 0; + tp->write_seq += tp->max_window + TCP_NUM_2; + if (tp->write_seq == 0) + tp->write_seq = 1; + tp->snd_cwnd = TCP_NUM_2; + icsk->icsk_probes_out = 0; + tp->packets_out = 0; + tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + tp->snd_cwnd_cnt = 0; + tp->window_clamp = 0; + tp->delivered = 0; + tcp_clear_retrans(tp); + tp->total_retrans = 0; + inet_csk_delack_init(sk); + + icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; + sk->sk_send_head = NULL; + memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); + __sk_dst_reset(sk); + dst_release(sk->sk_rx_dst); + sk->sk_rx_dst = NULL; + tp->segs_in = 0; + tp->segs_out = 0; + tp->bytes_acked = 0; + tp->bytes_received = 0; + tp->data_segs_in = 0; + tp->data_segs_out = 0; + + WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); + + if (sk->sk_frag.page) { + put_page(sk->sk_frag.page); + sk->sk_frag.page = NULL; + sk->sk_frag.offset = 0; + } + + sk->sk_error_report(sk); + return err; +} + +struct proto tcp_nip_prot = { + .name = "NIP_TCP", + .owner = THIS_MODULE, + .close = tcp_nip_close, + .connect = tcp_nip_connect, + .disconnect = tcp_nip_disconnect, + .accept = inet_csk_accept, + .ioctl = tcp_ioctl, + .init = tcp_nip_init_sock, + .destroy = tcp_nip_destroy_sock, + .shutdown = tcp_nip_shutdown, + .setsockopt = tcp_setsockopt, + .getsockopt = tcp_getsockopt, + .keepalive = tcp_set_keepalive, + .recvmsg = tcp_nip_recvmsg, + .sendmsg = tcp_nip_sendmsg, + .sendpage = NULL, + .backlog_rcv = tcp_nip_do_rcv, + .release_cb = tcp_nip_release_cb, + .hash = ninet_hash, + .unhash = ninet_unhash, + .get_port = inet_csk_get_port, + .sockets_allocated = &tcp_sockets_allocated, + .orphan_count = &tcp_orphan_count, + .memory_allocated = &tcp_memory_allocated, + .memory_pressure = &tcp_memory_pressure, + .sysctl_mem = sysctl_tcp_mem, + .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), + .max_header = MAX_TCP_HEADER, + .obj_size = sizeof(struct tcp_nip_sock), + .rsk_prot = &tcp_nip_request_sock_ops, + .h.hashinfo = &tcp_hashinfo, + .no_autobind = true, +}; + +static const struct ninet_protocol tcp_nip_protocol = 
{ + .early_demux = tcp_nip_early_demux, + .handler = tcp_nip_rcv, + .flags = 0, +}; + +static struct inet_protosw tcp_nip_protosw = { + .type = SOCK_STREAM, + .protocol = IPPROTO_TCP, + .prot = &tcp_nip_prot, + .ops = &ninet_stream_ops, + .flags = INET_PROTOSW_PERMANENT | + INET_PROTOSW_ICSK, +}; + +int __init tcp_nip_init(void) +{ + int ret; + + ret = ninet_add_protocol(&tcp_nip_protocol, IPPROTO_TCP); + if (ret) + goto out; + + /* register ninet protocol */ + ret = ninet_register_protosw(&tcp_nip_protosw); + if (ret) + goto out_nip_tcp_protocol; + +out: + return ret; + +out_nip_tcp_protocol: + ninet_del_protocol(&tcp_nip_protocol, IPPROTO_TCP); + goto out; +} + +void tcp_nip_exit(void) +{ + ninet_unregister_protosw(&tcp_nip_protosw); + ninet_del_protocol(&tcp_nip_protocol, IPPROTO_TCP); +} + diff --git a/code/net/newip/tcp_nip_input.c b/code/net/newip/tcp_nip_input.c new file mode 100644 index 0000000000000000000000000000000000000000..d37d1d6b5329a8370439c52e7fa63bf20df1a1a2 --- /dev/null +++ b/code/net/newip/tcp_nip_input.c @@ -0,0 +1,1694 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * NewIP INET + * An implementation of the TCP/IP protocol suite for the LINUX + * operating system. NewIP INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Based on net/ipv4/tcp_input.c + * Based on net/ipv4/tcp_output.c + * Based on net/ipv4/tcp_minisocks.c + */ +#define pr_fmt(fmt) "NIP-TCP: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include "tcp_nip_parameter.h" + +#define FLAG_DATA 0x01 /* Incoming frame contained data. */ +#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ +#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ +#define FLAG_RETRANS_DATA_ACKED 0x08 /* some of which was retransmitted. */ +#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ +#define FLAG_DATA_SACKED 0x20 /* New SACK. 
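+ *
+ * Composition example for these bits: an ACK that acknowledged fresh
+ * data and opened the peer window carries
+ * flag = FLAG_DATA_ACKED | FLAG_WIN_UPDATE = 0x04 | 0x02 = 0x06, so
+ * both (flag & FLAG_ACKED) and (flag & FLAG_NOT_DUP) evaluate
+ * non-zero.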
*/ +#define FLAG_ECE 0x40 /* ECE in this ACK */ +#define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */ +#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ +#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ +#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ +#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ +#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ +#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ +#define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */ + +#define FLAG_ACKED (FLAG_DATA_ACKED | FLAG_SYN_ACKED) +#define FLAG_NOT_DUP (FLAG_DATA | FLAG_WIN_UPDATE | FLAG_ACKED) +#define FLAG_CA_ALERT (FLAG_DATA_SACKED | FLAG_ECE) +#define FLAG_FORWARD_PROGRESS (FLAG_ACKED | FLAG_DATA_SACKED) + +#define TCP_REMNANT (TCP_FLAG_FIN | TCP_FLAG_URG | TCP_FLAG_SYN | TCP_FLAG_PSH) +#define TCP_HP_BITS (~(TCP_RESERVED_BITS | TCP_FLAG_PSH)) + +#define REXMIT_NONE 0 /* no loss recovery to do */ +#define REXMIT_LOST 1 /* retransmit packets marked lost */ +#define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */ + +#define TCP_MAX_MSS 1460 + +void tcp_nip_fin(struct sock *sk) +{ + inet_csk_schedule_ack(sk); + + sk->sk_shutdown |= RCV_SHUTDOWN; + sock_set_flag(sk, SOCK_DONE); + + switch (sk->sk_state) { + case TCP_SYN_RECV: + case TCP_ESTABLISHED: + /* Move to CLOSE_WAIT */ + tcp_set_state(sk, TCP_CLOSE_WAIT); + inet_csk(sk)->icsk_ack.pingpong = 1; + break; + + case TCP_CLOSE_WAIT: + case TCP_CLOSING: + /* Received a retransmission of the FIN, do + * nothing. + */ + break; + case TCP_LAST_ACK: + /* RFC793: Remain in the LAST-ACK state. */ + break; + + case TCP_FIN_WAIT1: + /* This case occurs when a simultaneous close + * happens, we must ack the received FIN and + * enter the CLOSING state. + */ + tcp_nip_send_ack(sk); + tcp_set_state(sk, TCP_CLOSING); + break; + case TCP_FIN_WAIT2: + /* Received a FIN -- send ACK and enter TIME_WAIT. */ + tcp_nip_send_ack(sk); + inet_csk_reset_keepalive_timer(sk, TCP_TIMEWAIT_LEN); + break; + default: + /* Only TCP_LISTEN and TCP_CLOSE are left, in these + * cases we should never reach this piece of code. + */ + pr_err("%s: Impossible, sk->sk_state=%d\n", + __func__, sk->sk_state); + break; + } + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); +} + +static void tcp_nip_overlap_handle(struct tcp_sock *tp, struct sk_buff *skb) +{ + u32 diff = tp->rcv_nxt - TCP_SKB_CB(skb)->seq; + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + + skb->data += diff; + skb->len -= diff; + tcb->seq += diff; +} + +static void tcp_nip_ofo_queue(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + while (tp->nip_out_of_order_queue) { + skb = tp->nip_out_of_order_queue; + if (after(TCP_SKB_CB(tp->nip_out_of_order_queue)->seq, tp->rcv_nxt)) + return; + tp->nip_out_of_order_queue = tp->nip_out_of_order_queue->next; + skb->next = NULL; + if (tp->rcv_nxt != TCP_SKB_CB(skb)->seq) + tcp_nip_overlap_handle(tp, skb); + + __skb_queue_tail(&sk->sk_receive_queue, skb); + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + + while (tp->nip_out_of_order_queue && + before(TCP_SKB_CB(tp->nip_out_of_order_queue)->end_seq, tp->rcv_nxt)) { + struct sk_buff *tmp_skb = tp->nip_out_of_order_queue; + + tp->nip_out_of_order_queue = tp->nip_out_of_order_queue->next; + tmp_skb->next = NULL; + __kfree_skb(tmp_skb); + } + } +} + + /* Maintain a sort list order by the seq. 
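+ *
+ * Worked example of the rules below, writing ranges as [seq, end_seq):
+ * with the list holding [100,200) -> [300,400), an arriving [200,300)
+ * is linked between the two; a duplicate [100,200) is freed; a longer
+ * [100,250) replaces the stored [100,200).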
*/ +static void tcp_nip_data_queue_ofo(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *pre_skb, *cur_skb; + + inet_csk_schedule_ack(sk); + skb->next = NULL; + if (!tp->nip_out_of_order_queue) { + tp->nip_out_of_order_queue = skb; + skb_set_owner_r(skb, sk); + return; + } + pre_skb = tp->nip_out_of_order_queue; + cur_skb = pre_skb->next; + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(pre_skb)->seq) { + if (TCP_SKB_CB(skb)->end_seq > TCP_SKB_CB(pre_skb)->end_seq) { + skb->next = pre_skb->next; + pre_skb->next = NULL; + skb_set_owner_r(skb, sk); + __kfree_skb(pre_skb); + return; + } + __kfree_skb(skb); + return; + } else if (TCP_SKB_CB(skb)->seq < TCP_SKB_CB(pre_skb)->seq) { + tp->nip_out_of_order_queue = skb; + skb->next = pre_skb; + skb_set_owner_r(skb, sk); + return; + } + while (cur_skb) { + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(cur_skb)->seq) { + /* Same seq, if skb end_seq is bigger, replace. */ + if (TCP_SKB_CB(skb)->end_seq > TCP_SKB_CB(cur_skb)->end_seq) { + pre_skb->next = skb; + skb->next = cur_skb->next; + cur_skb->next = NULL; + skb_set_owner_r(skb, sk); + __kfree_skb(cur_skb); + } else { + __kfree_skb(skb); + } + return; + } else if (TCP_SKB_CB(skb)->seq < TCP_SKB_CB(cur_skb)->seq) { + pre_skb->next = skb; + skb->next = cur_skb; + skb_set_owner_r(skb, sk); + return; + } + pre_skb = pre_skb->next; + cur_skb = cur_skb->next; + } + pre_skb->next = skb; + skb_set_owner_r(skb, sk); +} + +static void tcp_drop(struct sock *sk, struct sk_buff *skb) +{ + sk_drops_add(sk, skb); + __kfree_skb(skb); +} + +static void tcp_nip_data_queue(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { + DEBUG("%s: no data, only handle ack.\n", __func__); + __kfree_skb(skb); + return; + } + + if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { + if (tcp_receive_window(tp) == 0) + goto out_of_window; + } + + if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_wup + tp->rcv_wnd)) { + DEBUG("seq is %u and %u\n", TCP_SKB_CB(skb)->seq, tp->rcv_nxt); + __kfree_skb(skb); + return; + } + + if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { +out_of_window: + inet_csk_schedule_ack(sk); + __kfree_skb(skb); + return; + } + icsk->icsk_ack.lrcvtime = tcp_jiffies32; + __skb_pull(skb, tcp_hdr(skb)->doff * TCP_NUM_4); + + if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt || + (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt) && + after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) { + sk->sk_data_ready(sk); + tcp_drop(sk, skb); + return; + } + + if (TCP_SKB_CB(skb)->seq != tp->rcv_nxt) + tcp_nip_overlap_handle(tp, skb); + + DEBUG("%s: tcp newip packet received. 
data len:%d\n", __func__, skb->len); + + __skb_queue_tail(&sk->sk_receive_queue, skb); + skb_set_owner_r(skb, sk); + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + inet_csk_schedule_ack(sk); + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + tcp_nip_fin(sk); + if (tp->nip_out_of_order_queue) + tcp_nip_ofo_queue(sk); + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk); + return; + } + tcp_nip_data_queue_ofo(sk, skb); +} + +static inline void tcp_nip_push_pending_frames(struct sock *sk) +{ + if (tcp_nip_send_head(sk)) { + struct tcp_sock *tp = tcp_sk(sk); + u32 cur_mss = tcp_nip_current_mss(sk); // TCP_BASE_MSS + + __tcp_nip_push_pending_frames(sk, cur_mss, tp->nonagle); + } +} + +static void tcp_nip_new_space(struct sock *sk) +{ + sk->sk_write_space(sk); +} + +static void tcp_nip_check_space(struct sock *sk) +{ + /* Invoke memory barrier (annotated prior to checkpatch requirements) */ + smp_mb(); + if (sk->sk_socket && + test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) + tcp_nip_new_space(sk); +} + +static inline void tcp_nip_data_snd_check(struct sock *sk) +{ + tcp_nip_push_pending_frames(sk); + tcp_nip_check_space(sk); +} + +#define TCP_NIP_DELACK_MIN (HZ / 50) +void tcp_nip_send_delayed_ack(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + int ato = TCP_NIP_DELACK_MIN; // rtt + unsigned long timeout; + + icsk->icsk_ack.ato = TCP_DELACK_MIN; + + /* Stay within the limit we were given */ + timeout = jiffies + ato; + + /* Use new timeout only if there wasn't a older one earlier. */ + if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { + if (time_before_eq(icsk->icsk_ack.timeout, + jiffies + (ato >> TCP_NIP_4BYTE_PAYLOAD))) { + tcp_nip_send_ack(sk); + return; + } + + if (!time_before(timeout, icsk->icsk_ack.timeout)) + timeout = icsk->icsk_ack.timeout; + } + icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; + icsk->icsk_ack.timeout = timeout; + sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); +} + +static void __tcp_nip_ack_snd_check(struct sock *sk, int ofo_possible) +{ + struct tcp_sock *tp = tcp_sk(sk); + + inet_csk(sk)->icsk_ack.rcv_mss = tcp_nip_current_mss(sk); // TCP_BASE_MSS + + /* More than n full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > g_ack_num * inet_csk(sk)->icsk_ack.rcv_mss && + __nip_tcp_select_window(sk) >= tp->rcv_wnd) || + /* We have out of order data. */ + (ofo_possible && tp->nip_out_of_order_queue)) { + tcp_nip_send_ack(sk); + } else { + /* Else, send delayed ack. */ + DEBUG("%s: send delayed ack!!", __func__); + tcp_nip_send_delayed_ack(sk); + } +} + +static inline void tcp_nip_ack_snd_check(struct sock *sk) +{ + if (!inet_csk_ack_scheduled(sk)) { + /* We sent a data segment already. 
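+ *
+ * Timing note for the delayed path taken by __tcp_nip_ack_snd_check()
+ * above: ato = TCP_NIP_DELACK_MIN = HZ / 50 jiffies, i.e. roughly
+ * 20 ms independent of HZ; if an already-armed delack timer is about
+ * to expire anyway, tcp_nip_send_delayed_ack() sends the ACK at once.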
*/ + DEBUG("We sent a data segment already.!!\n"); + return; + } + __tcp_nip_ack_snd_check(sk, 1); +} + +static void tcp_nip_snd_una_update(struct tcp_sock *tp, u32 ack) +{ + u32 delta = ack - tp->snd_una; + + sock_owned_by_me((struct sock *)tp); + tp->bytes_acked += delta; + tp->snd_una = ack; +} + +void tcp_nip_rearm_rto(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!tp->packets_out) { + inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); + } else { + u32 rto = inet_csk(sk)->icsk_rto; + + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, + TCP_RTO_MAX); + } +} + +static int tcp_nip_clean_rtx_queue(struct sock *sk, ktime_t *skb_snd_tstamp) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + int flag = 0; + struct inet_connection_sock *icsk = inet_csk(sk); + + while ((skb = tcp_write_queue_head(sk)) && skb != tcp_nip_send_head(sk)) { + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + u32 acked_pcount; + + if (after(scb->end_seq, tp->snd_una)) { + if (tcp_skb_pcount(skb) == 1 || !after(tp->snd_una, scb->seq)) + break; + DEBUG("%s: ack error!\n", __func__); + } else { + prefetchw(skb->next); + acked_pcount = tcp_skb_pcount(skb); + } + + if (likely(!(scb->tcp_flags & TCPHDR_SYN))) { + flag |= FLAG_DATA_ACKED; + } else { + flag |= FLAG_SYN_ACKED; + tp->retrans_stamp = 0; + } + + tp->packets_out -= acked_pcount; + + if (*skb_snd_tstamp == 0) + *skb_snd_tstamp = skb->tstamp; + + tcp_unlink_write_queue(skb, sk); + sk_wmem_free_skb(sk, skb); + } + + if ((*skb_snd_tstamp != 0) && (tp->rcv_tstamp - *skb_snd_tstamp) >= g_rtt_tstamp_rto_up) + icsk->icsk_rto = (unsigned int)(HZ / g_nip_rto_up); + else + icsk->icsk_rto = (unsigned int)(HZ / g_nip_rto); + + if (flag & FLAG_ACKED) + tcp_nip_rearm_rto(sk); + return 0; +} + +/* Function + * Allocate a connection request block that holds connection request information. + * At the same time, initialize the set of operations used to send ACK/RST segments + * during connection, so that these interfaces can be easily called during establishment. + * Set the socket state to TCP_NEW_SYN_RECV + * Parameter + * ops: Request the functional interface of the control block + * sk_listener: Transmission control block + * attach_listener: Whether to set cookies + */ +struct request_sock *ninet_reqsk_alloc(const struct request_sock_ops *ops, + struct sock *sk_listener, + bool attach_listener) +{ + struct request_sock *req = reqsk_alloc(ops, sk_listener, + attach_listener); + + if (req) { + struct inet_request_sock *ireq = inet_rsk(req); + + ireq->ireq_opt = NULL; + ireq->nip_pktopts = NULL; + atomic64_set(&ireq->ir_cookie, 0); + ireq->ireq_state = TCP_NEW_SYN_RECV; + write_pnet(&ireq->ireq_net, sock_net(sk_listener)); + ireq->ireq_family = sk_listener->sk_family; + } + + return req; +} + +static void tcp_nip_drop(struct sock *sk, struct sk_buff *skb) +{ + sk_drops_add(sk, skb); + __kfree_skb(skb); +} + +void tcp_nip_parse_mss(struct tcp_options_received *opt_rx, + const struct tcphdr *th, + const unsigned char *ptr, + int opsize, + int estab) +{ + if (opsize == TCPOLEN_MSS && th->syn && !estab) { + u16 in_mss = get_unaligned_be16(ptr); + + DEBUG("%s: in_mss %d\n", __func__, in_mss); + + if (in_mss) { + if (opt_rx->user_mss && + opt_rx->user_mss < in_mss) + in_mss = opt_rx->user_mss; + opt_rx->mss_clamp = in_mss; + } + } +} + +/* Function + * Look for tcp options. Normally only called on SYN and SYNACK packets. 
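+ *
+ * Wire-format refresher for the one option this parser acts on (MSS,
+ * kind 2, length 4): the bytes 02 04 05 b4 advertise MSS 0x05b4 =
+ * 1460, read with get_unaligned_be16() in tcp_nip_parse_mss() above
+ * and clamped by any smaller user_mss.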
+ * Parsing of TCP options in SKB + * Parameter + * skb: Transfer control block buffer + * opt_rx: Saves the structure for TCP options + * estab: WANTCOOKIE + * foc: Len field + */ +void tcp_nip_parse_options(const struct sk_buff *skb, + struct tcp_options_received *opt_rx, int estab, + struct tcp_fastopen_cookie *foc) +{ + const unsigned char *ptr; + const struct tcphdr *th = tcp_hdr(skb); + /* The length of the TCP option = Length of TCP header - The length of the TCP structure */ + int length = (th->doff * 4) - sizeof(struct tcphdr); + + /* A pointer to the option position */ + ptr = (const unsigned char *)(th + 1); + opt_rx->saw_tstamp = 0; + + while (length > 0) { + int opcode = *ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2) /* "2 - silly options" */ + return; + if (opsize > length) + return; /* don't parse partial options */ + switch (opcode) { + case TCPOPT_MSS: + tcp_nip_parse_mss(opt_rx, th, ptr, opsize, estab); + break; + default: + break; + } + ptr += opsize - TCP_NUM_2; + length -= opsize; + } + } +} + +/* Function + * Initializes the connection request block information based + * on the options and sequence number in the received SYN segment + * Parameter + * req: Request connection control block + * rx_opt: Saves the structure for TCP options + * skb: Transfer control block buffer. + * sk: transmission control block. + */ +static void tcp_nip_openreq_init(struct request_sock *req, + const struct tcp_options_received *rx_opt, + struct sk_buff *skb, const struct sock *sk) +{ + struct inet_request_sock *ireq = inet_rsk(req); + + req->rsk_rcv_wnd = 0; + tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; + tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + tcp_rsk(req)->snt_synack = tcp_clock_us(); + tcp_rsk(req)->last_oow_ack_time = 0; + req->mss = rx_opt->mss_clamp; + req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; + ireq->tstamp_ok = rx_opt->tstamp_ok; + ireq->snd_wscale = rx_opt->snd_wscale; + + if (g_wscale_enable == 1) { + ireq->wscale_ok = 1; + ireq->snd_wscale = g_wscale; // rx_opt->snd_wscale; + ireq->rcv_wscale = g_wscale; + } + + ireq->acked = 0; + ireq->ecn_ok = 0; + ireq->ir_rmt_port = tcp_hdr(skb)->source; + ireq->ir_num = ntohs(tcp_hdr(skb)->dest); + ireq->ir_mark = sk->sk_mark; +} + +/* Function + * Based on listening SOCK and REQ, create a transport control block + * for the new connection and initialize it. + * Parameter + * sk: the listening transmission control block. + * req: Request connection control block + * skb: Transfer control block buffer. + */ +struct sock *tcp_nip_create_openreq_child(const struct sock *sk, + struct request_sock *req, + struct sk_buff *skb) +{ + /* Clone a transport control block and lock the new transport control block */ + struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); + + if (newsk) { + const struct inet_request_sock *ireq = inet_rsk(req); + struct tcp_request_sock *treq = tcp_rsk(req); + struct inet_connection_sock *newicsk = inet_csk(newsk); + struct tcp_sock *newtp = tcp_sk(newsk); + + /* Now setup tcp_sock */ + newtp->pred_flags = 0; + + /* The variables related to the receiving and sending serial numbers + * are initialized. 
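+ * Sequence-number sketch with illustrative ISNs: if the peer's SYN
+ * carried rcv_isn = 1000 and our SYN+ACK used snt_isn = 5000, the
+ * child below starts at rcv_nxt = copied_seq = rcv_wup = 1001 and
+ * snd_una = snd_nxt = write_seq = 5001; the +1 on each side accounts
+ * for the SYN flag consuming one sequence number.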
The second handshake sends an ACK in the SYN+ACK segment + */ + newtp->rcv_wup = treq->rcv_isn + 1; + newtp->copied_seq = treq->rcv_isn + 1; + newtp->rcv_nxt = treq->rcv_isn + 1; + newtp->segs_in = 1; + /* The second handshake sends seq+1 in the SYN+ACK segment */ + newtp->snd_sml = treq->snt_isn + 1; + newtp->snd_una = treq->snt_isn + 1; + newtp->snd_nxt = treq->snt_isn + 1; + newtp->snd_up = treq->snt_isn + 1; + + INIT_LIST_HEAD(&newtp->tsq_node); + + /* The ACK segment number of the send window that + * received the first handshake update + */ + tcp_init_wl(newtp, treq->rcv_isn); + + /* Initialization of delay-related variables */ + minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U); + newicsk->icsk_rto = g_nip_rto == 0 ? TCP_TIMEOUT_INIT : (HZ / g_nip_rto); + newicsk->icsk_ack.lrcvtime = tcp_jiffies32; + + /* The congestion control-related variables are initialized */ + newtp->packets_out = 0; + + newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + + newtp->lsndtime = tcp_jiffies32; + + newtp->total_retrans = req->num_retrans; + + newtp->snd_cwnd = TCP_INIT_CWND; + + /* There's a bubble in the pipe until at least the first ACK. */ + newtp->app_limited = ~0U; + + /* Initialize several timers */ + tcp_nip_init_xmit_timers(newsk); + newtp->write_seq = treq->snt_isn + 1; + newtp->pushed_seq = treq->snt_isn + 1; + + /* TCP option correlation */ + newtp->rx_opt.saw_tstamp = 0; + + newtp->rx_opt.dsack = 0; + newtp->rx_opt.num_sacks = 0; + + newtp->urg_data = 0; + + newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; + newtp->window_clamp = req->rsk_window_clamp; + newtp->rcv_ssthresh = req->rsk_rcv_wnd; + newtp->rcv_wnd = req->rsk_rcv_wnd; + newtp->rx_opt.wscale_ok = ireq->wscale_ok; + if (newtp->rx_opt.wscale_ok) { + newtp->rx_opt.snd_wscale = ireq->snd_wscale; + newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; + } else { + newtp->rx_opt.snd_wscale = 0; + newtp->rx_opt.rcv_wscale = 0; + newtp->window_clamp = min(newtp->window_clamp, 65535U); + } + newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) << + newtp->rx_opt.snd_wscale); + newtp->max_window = newtp->snd_wnd; + + if (newtp->rx_opt.tstamp_ok) { + newtp->rx_opt.ts_recent = req->ts_recent; + newtp->rx_opt.ts_recent_stamp = get_seconds(); + newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + } else { + newtp->rx_opt.ts_recent_stamp = 0; + newtp->tcp_header_len = sizeof(struct tcphdr); + } + newtp->tsoffset = 0; + + /* Determines the size of the last passed segment */ + if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) + newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; + newtp->rx_opt.mss_clamp = req->mss; + newtp->fastopen_req = NULL; + newtp->fastopen_rsk = NULL; + newtp->syn_data_acked = 0; + newtp->rack.mstamp = 0; + newtp->rack.advanced = 0; + + __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); + } + return newsk; +} + +void tcp_nip_openreq_init_rwin(struct request_sock *req, + const struct sock *sk_listener, + const struct dst_entry *dst) +{ + struct inet_request_sock *ireq = inet_rsk(req); + const struct tcp_sock *tp = tcp_sk(sk_listener); + int full_space = tcp_full_space(sk_listener); + int mss; + u32 window_clamp; + __u8 rcv_wscale; + int sysctl_tcp_nip_window_scaling = 0; + + mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); + + window_clamp = READ_ONCE(tp->window_clamp); + /* Set this up on the first call only */ + req->rsk_window_clamp = window_clamp ? 
: dst_metric(dst, RTAX_WINDOW); + + /* limit the window selection if the user enforce a smaller rx buffer */ + if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK && + (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) + req->rsk_window_clamp = full_space; + + /* tcp_full_space because it is guaranteed to be the first packet */ + tcp_select_initial_window(sk_listener, full_space, + mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), + &req->rsk_rcv_wnd, + &req->rsk_window_clamp, + sysctl_tcp_nip_window_scaling, + &rcv_wscale, + 0); + ireq->rcv_wscale = g_wscale_enable == 1 ? g_wscale : rcv_wscale; +} + +/* Function + * A function used by the server to process client connection requests. + * Parameter + * rsk_ops: Functional interface to request control blocks. + * af_ops: The functional interface of the TCP request block. + * sk: transmission control block. + * skb: Transfer control block buffer. + */ +int tcp_newip_conn_request(struct request_sock_ops *rsk_ops, + const struct tcp_request_sock_ops *af_ops, + struct sock *sk, struct sk_buff *skb) +{ + struct tcp_fastopen_cookie foc = { .len = -1 }; + + __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; + /* All received TCP options are resolved into this structure */ + struct tcp_options_received tmp_opt; + struct tcp_sock *tp = tcp_sk(sk); + struct dst_entry *dst = NULL; + struct request_sock *req; + + /* If the half-connection queue length has reached the upper limit, + * the current request is discarded + */ + if (inet_csk_reqsk_queue_is_full(sk) && !isn) { + DEBUG("inet_csk_reqsk_queue_is_full!!!!!\n"); + goto drop; + } + + /* If the queue holds the socket that has completed the connection (full connection queue) + * The length has reached its upper limit + * The current request is discarded + */ + if (sk_acceptq_is_full(sk)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); + DEBUG("sk_acceptq_is_full!!!!!\n"); + goto drop; + } + + /* Allocate a connection request block that holds connection request information + * While initializing the connection process + * The set of operations that send ACK/RST segments + * These interfaces can be easily invoked during the setup process. 
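+ *
+ * The rest of this function is the passive-open pipeline in
+ * miniature: allocate the request sock, parse the SYN's options,
+ * fill the request (tcp_nip_openreq_init), pick an ISN and a route,
+ * size the receive window, hash the request with a SYN+ACK timer
+ * (TCP_TIMEOUT_INIT), then answer through af_ops->send_synack().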
+ */ + req = ninet_reqsk_alloc(rsk_ops, sk, true); + if (!req) + goto drop; + + tcp_rsk(req)->af_specific = af_ops; + + tcp_clear_options(&tmp_opt); + /* Maximum MSS negotiated during connection establishment */ + tmp_opt.mss_clamp = af_ops->mss_clamp; + /* The best way to do this is to prink the value of user_mss and see if it is 0 */ + tmp_opt.user_mss = tp->rx_opt.user_mss; + /* Parsing of TCP options in SKB */ + tcp_nip_parse_options(skb, &tmp_opt, 0, false); + + /* Tstamp_ok indicates the TIMESTAMP seen on the received SYN packet */ + tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; + /* Initializes the connection request block information based on the options + * and sequence number in the received SYN segment + */ + tcp_nip_openreq_init(req, &tmp_opt, skb, sk); + + inet_rsk(req)->ir_iif = sk->sk_bound_dev_if; + + af_ops->init_req(req, sk, skb); + + if (!isn) + isn = af_ops->init_seq(skb); + + if (!dst) { + dst = af_ops->route_req(sk, NULL, req); + if (!dst) + goto drop_and_free; + } + + tcp_rsk(req)->snt_isn = isn; + tcp_rsk(req)->txhash = net_tx_rndhash(); + /* Initialize the receive window */ + tcp_nip_openreq_init_rwin(req, sk, dst); + /* Record the syn */ + tcp_rsk(req)->tfo_listener = false; + /* Add a timer to add reQ to the ehash table */ + ninet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + + af_ops->send_synack(sk, dst, NULL, req, &foc, TCP_SYNACK_NORMAL, NULL); + + reqsk_put(req); + return 0; + +drop_and_free: + reqsk_free(req); +drop: + tcp_listendrop(sk); + return 0; +} + +static inline bool tcp_nip_paws_check(const struct tcp_options_received *rx_opt, + int paws_win) +{ + if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win) + return true; + if (unlikely(get_seconds() >= rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS)) + return true; + + if (!rx_opt->ts_recent) + return true; + return false; +} + +static inline bool tcp_nip_may_update_window(const struct tcp_sock *tp, + const u32 ack, const u32 ack_seq, + const u32 nwin) +{ + return after(ack, tp->snd_una) || + after(ack_seq, tp->snd_wl1) || + (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd); +} + +static int tcp_nip_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack, + u32 ack_seq) +{ + struct tcp_sock *tp = tcp_sk(sk); + int flag = 0; + u32 nwin = ntohs(tcp_hdr(skb)->window); + + if (likely(!tcp_hdr(skb)->syn)) + nwin <<= tp->rx_opt.snd_wscale; + + if (tcp_nip_may_update_window(tp, ack, ack_seq, nwin)) { + flag |= FLAG_WIN_UPDATE; + tcp_update_wl(tp, ack_seq); + + if (tp->snd_wnd != nwin) { + tp->snd_wnd = nwin; + tp->pred_flags = 0; + } + } + + return flag; +} + +/* Check whether the ACK returned by the packet is detected + *and whether the peer window is opened + */ +static void tcp_nip_ack_probe(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + + if (!after(TCP_SKB_CB(tcp_nip_send_head(sk))->end_seq, tcp_wnd_end(tp))) { + icsk->icsk_backoff = 0; + inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); + /* Socket must be waked up by subsequent tcp_data_snd_check(). + * This function is not for random using! 
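+ *
+ * In the else branch the peer window is still closed, so the probe0
+ * timer is armed instead; tcp_probe0_when() (a mainline helper, not
+ * part of this patch) scales the probe base derived from the current
+ * RTO by 2^icsk_backoff and caps it at TCP_RTO_MAX, giving the usual
+ * doubling zero-window probe schedule.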
+ */ + } else { + unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX); + + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + when, TCP_RTO_MAX); + } +} + +#define DUP_ACK 0 +#define NOR_ACK 1 +#define ACK_DEF 2 +static void tcp_nip_ack_retrans(struct sock *sk, u32 ack, int ack_type) +{ + int skb_index = 0; + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb, *tmp; + const char *ack_str[ACK_DEF] = {"dup", "nor"}; + int index = ack_type == DUP_ACK ? DUP_ACK : NOR_ACK; + + skb_queue_walk_safe(&sk->sk_write_queue, skb, tmp) { + if (skb == tcp_nip_send_head(sk)) { + SSTHRESH_DBG("%s %s ack retrans(%u) end, ack=%u, seq=%u~%u, pkt_out=%u", + __func__, ack_str[index], tp->ack_retrans_num, ack, + tp->selective_acks[0].start_seq, + tp->selective_acks[0].end_seq, tp->packets_out); + tp->selective_acks[0].start_seq = 0; + tp->selective_acks[0].end_seq = 0; + tp->ack_retrans_seq = 0; + tp->ack_retrans_num = 0; + break; + } + + if (TCP_SKB_CB(skb)->seq > tp->selective_acks[0].end_seq) { + SSTHRESH_DBG("%s %s ack retrans(%u) finish, ack=%u, seq=%u~%u, pkt_out=%u", + __func__, ack_str[index], tp->ack_retrans_num, ack, + tp->selective_acks[0].start_seq, + tp->selective_acks[0].end_seq, tp->packets_out); + + tp->selective_acks[0].start_seq = 0; + tp->selective_acks[0].end_seq = 0; + tp->ack_retrans_seq = 0; + tp->ack_retrans_num = 0; + break; + } + + if (TCP_SKB_CB(skb)->seq != tp->ack_retrans_seq) + continue; + + if (skb_index < g_ack_retrans_num) { + tcp_nip_retransmit_skb(sk, skb, 1); + skb_index++; + tp->ack_retrans_num++; + tp->ack_retrans_seq = TCP_SKB_CB(skb)->end_seq; + } else { + RETRANS_DBG("%s %s ack retrans(%u) no end, ack=%u, seq=%u~%u, pkt_out=%u", + __func__, ack_str[index], tp->ack_retrans_num, ack, + tp->selective_acks[0].start_seq, + tp->selective_acks[0].end_seq, tp->packets_out); + break; + } + } +} + +#define DUP_ACK_RETRANS_START_NUM 3 +#define DIVIDEND_UP 3 +#define DIVIDEND_DOWN 5 +static void tcp_nip_dup_ack_retrans(struct sock *sk, u32 ack) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tcp_write_queue_head(sk)) { + tp->sacked_out++; + if (tp->sacked_out == DUP_ACK_RETRANS_START_NUM) { + int last_nip_ssthresh = tp->nip_ssthresh; + int nip_ssthresh = (tp->nip_ssthresh * DIVIDEND_UP) / DIVIDEND_DOWN; + + tp->nip_ssthresh = nip_ssthresh < g_ssthresh_low ? 
+ g_ssthresh_low : nip_ssthresh; + if (tp->selective_acks[0].end_seq) + SSTHRESH_DBG("%s last retans(%u) not end, seq=%u~%u, pkt_out=%u", + __func__, tp->ack_retrans_num, + tp->selective_acks[0].start_seq, + tp->selective_acks[0].end_seq, + tp->packets_out); + + SSTHRESH_DBG("%s new dup ack, win %u to %u, seq=%u~%u", + __func__, last_nip_ssthresh, tp->nip_ssthresh, + ack, tp->snd_nxt); + + tp->selective_acks[0].start_seq = ack; + tp->selective_acks[0].end_seq = tp->snd_nxt; + tp->ack_retrans_seq = ack; + tp->ack_retrans_num = 0; + + tcp_nip_ack_retrans(sk, ack, DUP_ACK); + } + } +} + +static void tcp_nip_nor_ack_retrans(struct sock *sk, u32 ack) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->selective_acks[0].end_seq != 0) { + if (ack >= tp->selective_acks[0].end_seq || + (ack >= ((tp->selective_acks[0].end_seq - tp->selective_acks[0].start_seq) / + g_retrans_seg_end_divisor) + tp->selective_acks[0].start_seq)) { + SSTHRESH_DBG("%s nor ack retrans(%u) resume, seq=%u~%u, pkt_out=%u, ack=%u", + __func__, tp->ack_retrans_num, + tp->selective_acks[0].start_seq, + tp->selective_acks[0].end_seq, tp->packets_out, ack); + tp->selective_acks[0].start_seq = 0; + tp->selective_acks[0].end_seq = 0; + tp->ack_retrans_seq = 0; + tp->ack_retrans_num = 0; + + tp->sacked_out = 0; + return; + } + + tcp_nip_ack_retrans(sk, ack, NOR_ACK); + } + + tp->sacked_out = 0; +} + +static void tcp_nip_ack_calc_ssthresh(struct sock *sk, u32 ack, int icsk_rto_last, + ktime_t skb_snd_tstamp) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + int ack_reset = ack / g_nip_ssthresh_reset; + u32 nip_ssthresh; + + if (tp->nip_ssthresh_reset != ack_reset) { + SSTHRESH_DBG("%s ack reset win %u to %u, ack=%u", + __func__, tp->nip_ssthresh, g_ssthresh_low, ack); + tp->nip_ssthresh_reset = ack_reset; + tp->nip_ssthresh = g_ssthresh_low; + } else { + if (skb_snd_tstamp) { + u32 rtt_tstamp = tp->rcv_tstamp - skb_snd_tstamp; + + if (rtt_tstamp >= g_rtt_tstamp_rto_up) { + SSTHRESH_DBG("%s rtt %u >= %u, win %u to %u, rto %u to %u, ack=%u", + __func__, rtt_tstamp, g_rtt_tstamp_rto_up, + tp->nip_ssthresh, g_ssthresh_low_min, + icsk_rto_last, icsk->icsk_rto, ack); + + tp->nip_ssthresh = g_ssthresh_low_min; + } else if (rtt_tstamp >= g_rtt_tstamp_high) { + SSTHRESH_DBG("%s rtt %u >= %u, win %u to %u, ack=%u", + __func__, rtt_tstamp, g_rtt_tstamp_high, + tp->nip_ssthresh, g_ssthresh_low, ack); + + tp->nip_ssthresh = g_ssthresh_low; + } else if (rtt_tstamp >= g_rtt_tstamp_mid_high) { + SSTHRESH_DBG("%s rtt %u >= %u, win %u to %u, ack=%u", + __func__, rtt_tstamp, g_rtt_tstamp_mid_high, + tp->nip_ssthresh, g_ssthresh_mid_low, ack); + + tp->nip_ssthresh = g_ssthresh_mid_low; + } else if (rtt_tstamp >= g_rtt_tstamp_mid_low) { + u32 rtt_tstamp_scale = g_rtt_tstamp_mid_high - rtt_tstamp; + int half_mid_high = g_ssthresh_mid_high / 2; + + nip_ssthresh = half_mid_high + rtt_tstamp_scale * half_mid_high / + (g_rtt_tstamp_mid_high - g_rtt_tstamp_mid_low); + + tp->nip_ssthresh = tp->nip_ssthresh > g_ssthresh_mid_high ? 
+ half_mid_high : tp->nip_ssthresh; + nip_ssthresh = (tp->nip_ssthresh * g_ssthresh_high_step + + nip_ssthresh) / (g_ssthresh_high_step + 1); + + SSTHRESH_DBG("%s rtt %u >= %u, win %u to %u, ack=%u", + __func__, rtt_tstamp, g_rtt_tstamp_mid_low, + tp->nip_ssthresh, nip_ssthresh, ack); + + tp->nip_ssthresh = nip_ssthresh; + } else if (rtt_tstamp != 0) { + nip_ssthresh = (tp->nip_ssthresh * g_ssthresh_high_step + + g_ssthresh_high) / (g_ssthresh_high_step + 1); + + SSTHRESH_DBG("%s rtt %u < %u, win %u to %u, ack=%u", + __func__, rtt_tstamp, g_rtt_tstamp_mid_low, + tp->nip_ssthresh, nip_ssthresh, ack); + + tp->nip_ssthresh = nip_ssthresh; + } + } + } +} + +static int tcp_nip_ack(struct sock *sk, const struct sk_buff *skb, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + u32 prior_snd_una = tp->snd_una; + u32 ack_seq = TCP_SKB_CB(skb)->seq; + u32 ack = TCP_SKB_CB(skb)->ack_seq; + int prior_packets = tp->packets_out; + ktime_t skb_snd_tstamp = 0; + int icsk_rto_last; + + if (before(ack, prior_snd_una)) + return 0; + if (after(ack, tp->snd_nxt)) + return -1; + + flag |= tcp_nip_ack_update_window(sk, skb, ack, ack_seq); + + if (!prior_packets) { + DEBUG("No prior pack and ack is %u\n", ack); + if (tcp_nip_send_head(sk)) + tcp_nip_ack_probe(sk); + } + + icsk->icsk_probes_out = 0; + tp->nip_keepalive_timeout_scale = 0; + tp->rcv_tstamp = tcp_jiffies32; + + if (after(ack, prior_snd_una)) { + icsk->icsk_retransmits = 0; + tp->retrans_stamp = tcp_time_stamp(tp); + tp->rcv_tstamp = tcp_jiffies32; + tcp_nip_snd_una_update(tp, ack); + + icsk_rto_last = icsk->icsk_rto; + tcp_nip_clean_rtx_queue(sk, &skb_snd_tstamp); + + tcp_nip_ack_calc_ssthresh(sk, ack, icsk_rto_last, skb_snd_tstamp); + tcp_nip_nor_ack_retrans(sk, ack); + return 1; + } + + // ack == tp->snd_una + tcp_nip_dup_ack_retrans(sk, ack); + + return 1; +} + +static inline bool tcp_nip_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) +{ + /* False is returned if end_seq has been received, + * or if SEq is not behind the receive window + */ + return !before(end_seq, tp->rcv_wup) && + !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); +} + +/* When we get a reset we do this. */ +void tcp_nip_reset(struct sock *sk) +{ + DEBUG("%s: handle RST!", __func__); + + /* We want the right error as BSD sees it (and indeed as we do). 
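+ *
+ * Mapping chosen below: an RST arriving in SYN_SENT surfaces to the
+ * caller as ECONNREFUSED (connect() refused), in CLOSE_WAIT as EPIPE
+ * (the peer already sent its FIN), and in any other live state as
+ * ECONNRESET; a socket already in TCP_CLOSE is left untouched.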
*/ + switch (sk->sk_state) { + case TCP_SYN_SENT: + sk->sk_err = ECONNREFUSED; + break; + case TCP_CLOSE_WAIT: + sk->sk_err = EPIPE; + break; + case TCP_CLOSE: + return; + default: + sk->sk_err = ECONNRESET; + } + /* This barrier is coupled with smp_rmb() in tcp_poll() */ + smp_wmb(); + + tcp_nip_write_queue_purge(sk); + tcp_nip_done(sk); + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); +} + +/* Reack some incorrect packets, because if you do not ACK these packets, + * they may be retransmitted frequently + */ +static void tcp_nip_send_dupack(struct sock *sk, const struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && + before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); + } + DEBUG("[nip]%s send dupack!\n", __func__); + tcp_nip_send_ack(sk); +} + +static bool tcp_nip_reset_check(const struct sock *sk, const struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) && + (1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK | + TCPF_CLOSING)); +} + +/* This function is used to process the SYN received in RST packets + * and illegal SEQ packets in ESTABLISHED state. Currently only seQ checks are included + */ +static bool tcp_nip_validate_incoming(struct sock *sk, struct sk_buff *skb, + const struct tcphdr *th, int syn_inerr) +{ + struct tcp_sock *tp = tcp_sk(sk); + bool rst_seq_match = false; + + /* Step 1: check sequence number */ + /* Check for unexpected packets. For some probe packets, + * unexpected packets do not need to be processed, but reply for an ACK + */ + if (!tcp_nip_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { + DEBUG("%s receive an err seq and seq is %u, ack is %u\n", __func__, + TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + if (!th->rst) + tcp_nip_send_dupack(sk, skb); + else if (tcp_nip_reset_check(sk, skb)) + tcp_nip_reset(sk); + goto discard; + } + + /* Step 2: check RST bit */ + if (th->rst) { + if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt || tcp_nip_reset_check(sk, skb)) + rst_seq_match = true; + if (rst_seq_match) + tcp_nip_reset(sk); + goto discard; + } + + return true; + +discard: + tcp_drop(sk, skb); + return false; +} + +void tcp_nip_rcv_established(struct sock *sk, struct sk_buff *skb, + const struct tcphdr *th, unsigned int len) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tcp_mstamp_refresh(tp); + if (!tcp_nip_validate_incoming(sk, skb, th, 1)) + return; + + if (tcp_nip_ack(sk, skb, 0) < 0) + goto discard; + + tcp_nip_data_queue(sk, skb); + tcp_nip_data_snd_check(sk); + tcp_nip_ack_snd_check(sk); + + return; + +discard: + tcp_drop(sk, skb); +} + +static u32 tcp_default_init_rwnd(u32 mss) +{ + u32 init_rwnd = TCP_INIT_CWND * 2; + + if (mss > TCP_MAX_MSS) + init_rwnd = max((TCP_MAX_MSS * init_rwnd) / mss, 2U); + return init_rwnd; +} + +static void tcp_nip_fixup_rcvbuf(struct sock *sk) +{ + u32 mss = TCP_BASE_MSS; + int rcvmem; + + rcvmem = TCP_NUM_2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) * + tcp_default_init_rwnd(mss); + + if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) + rcvmem <<= TCP_NIP_4BYTE_PAYLOAD; + + if (sk->sk_rcvbuf < rcvmem) + sk->sk_rcvbuf = min(rcvmem, + sock_net(sk)->ipv4.sysctl_tcp_rmem[TCP_ARRAY_INDEX_2]); +} + +#define TCP_NIP_SND_BUF_SIZE 30720 +void tcp_nip_init_buffer_space(struct sock *sk) +{ + int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win; + struct tcp_sock *tp = tcp_sk(sk); + int maxwin; + + if (!(sk->sk_userlocks & 
SOCK_RCVBUF_LOCK)) + tcp_nip_fixup_rcvbuf(sk); + + tp->rcvq_space.space = tp->rcv_wnd; + tcp_mstamp_refresh(tp); + tp->rcvq_space.time = jiffies; + tp->rcvq_space.seq = tp->copied_seq; + maxwin = tcp_full_space(sk); + if (tp->window_clamp >= maxwin) { + tp->window_clamp = maxwin; + if (tcp_app_win && maxwin > TCP_NUM_4 * tp->advmss) + tp->window_clamp = max(maxwin - + (maxwin >> tcp_app_win), + TCP_NUM_4 * tp->advmss); + } + /* Force reservation of one segment. */ + if (tcp_app_win && + tp->window_clamp > TCP_NUM_2 * tp->advmss && + tp->window_clamp + tp->advmss > maxwin) + tp->window_clamp = max(TCP_NUM_2 * tp->advmss, maxwin - tp->advmss); + tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); + tp->snd_cwnd_stamp = tcp_jiffies32; +} + +void tcp_nip_finish_connect(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_set_state(sk, TCP_ESTABLISHED); + icsk->icsk_ack.lrcvtime = tcp_jiffies32; + if (skb) { + icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); + security_inet_conn_established(sk, skb); + } + + tp->lsndtime = tcp_jiffies32; + + tcp_nip_init_buffer_space(sk); +} + +/* Function: + * A function that handles the second handshake + * Parameter: + * sk: transmission control block + * skb: Transfer control block buffer + * Th: TCP header field + */ +static int tcp_nip_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, + const struct tcphdr *th) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + int saved_clamp = tp->rx_opt.mss_clamp; + + /* TCP Option Parsing */ + tcp_nip_parse_options(skb, &tp->rx_opt, 0, NULL); + /* Rcv_tsecr saves the timestamp of the last TCP segment received from the peer end */ + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) + tp->rx_opt.rcv_tsecr -= tp->tsoffset; + + if (th->ack) { + /* Whether the ACK value is between the initial send sequence number + * and the next sequence number + */ + if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) || + after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) + goto reset_and_undo; + /* Must be within the corresponding time*/ + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && + !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, tcp_time_stamp(tp))) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED); + goto reset_and_undo; + } + + if (th->rst) { + tcp_nip_reset(sk); + goto discard; + } + + if (!th->syn) + goto discard_and_undo; + + tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); + + tcp_nip_ack(sk, skb, FLAG_SLOWPATH); + tp->nip_out_of_order_queue = NULL; + /* The next data number expected to be accepted is +1 */ + tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + /* Accept the left margin of the window +1 */ + tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; + tp->snd_wnd = ntohs(th->window); + + if (g_wscale_enable == 1) { + tp->rx_opt.wscale_ok = 1; + tp->rx_opt.snd_wscale = g_wscale; + tp->rx_opt.rcv_wscale = g_wscale; + } + + if (!tp->rx_opt.wscale_ok) { + tp->rx_opt.snd_wscale = 0; + tp->rx_opt.rcv_wscale = 0; + tp->window_clamp = min(tp->window_clamp, 65535U); + } + + if (tp->rx_opt.saw_tstamp) { + tp->rx_opt.tstamp_ok = 1; + tp->tcp_header_len = + sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; + tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; + tp->rx_opt.ts_recent_stamp = get_seconds(); + } else { + tp->tcp_header_len = sizeof(struct tcphdr); + } + + tp->copied_seq = tp->rcv_nxt; + /* Invoke memory barrier (annotated prior to checkpatch requirements) */ + 
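+ /* A plausible reading of the barrier (an assumption, the patch does
+  * not say): it orders the copied_seq/rcv_nxt stores above before the
+  * socket turns ESTABLISHED in tcp_nip_finish_connect(), since
+  * tcp_poll() inspects these fields without the socket lock; mainline
+  * tcp_rcv_synsent_state_process() carries the same smp_mb().
+  */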
smp_mb(); + + tcp_nip_sync_mss(sk, icsk->icsk_pmtu_cookie); + tcp_nip_initialize_rcv_mss(sk); + + tcp_nip_finish_connect(sk, skb); + /* Wake up the process */ + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + rcu_read_lock(); + sock_wake_async(rcu_dereference(sk->sk_wq), SOCK_WAKE_IO, POLL_OUT); + rcu_read_unlock(); + } + + tcp_nip_send_ack(sk); + return -1; +discard: + tcp_drop(sk, skb); + return 0; + } + +discard_and_undo: + tcp_clear_options(&tp->rx_opt); + tp->rx_opt.mss_clamp = saved_clamp; + goto discard; + +reset_and_undo: + tcp_clear_options(&tp->rx_opt); + tp->rx_opt.mss_clamp = saved_clamp; + return 1; +} + +/* Function: + * TCP processing function that is differentiated according to + * different states after receiving data packets + * Parameter: + * sk: transmission control block + * skb: Transfer control block buffer + * Note: Currently this function only has code for handling the first handshake packet + * Implementation of the third handshake ACK to handle the code + */ +int tcp_nip_rcv_state_process(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcphdr *th = tcp_hdr(skb); + int queued = 0; + bool acceptable; + + /* Step 1: Connect handshake packet processing */ + switch (sk->sk_state) { + case TCP_CLOSE: + goto discard; + + case TCP_LISTEN: + if (th->ack) + return 1; + + if (th->rst) + goto discard; + + if (th->syn) { + if (th->fin) + goto discard; + + rcu_read_lock(); + local_bh_disable(); + acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0; + local_bh_enable(); + rcu_read_unlock(); + + if (!acceptable) + return 1; + consume_skb(skb); + return 0; + } + goto discard; + case TCP_SYN_SENT: + DEBUG("%s TCP_SYN_SENT!!\n", __func__); + tp->rx_opt.saw_tstamp = 0; + tcp_mstamp_refresh(tp); + queued = tcp_nip_rcv_synsent_state_process(sk, skb, th); + if (queued >= 0) + return queued; + __kfree_skb(skb); + return 0; + } + tcp_mstamp_refresh(tp); + tp->rx_opt.saw_tstamp = 0; + + if (!th->ack && !th->rst && !th->syn) + goto discard; + + if (!tcp_nip_validate_incoming(sk, skb, th, 0)) + return 0; + + acceptable = tcp_nip_ack(sk, skb, 0); + + /* If the third handshake ACK is invalid, 1 is returned + * and the SKB is discarded in tcp_nip_rcv + */ + if (!acceptable) { + if (sk->sk_state == TCP_SYN_RECV) + return 1; + goto discard; + } + + switch (sk->sk_state) { + case TCP_SYN_RECV: + tp->copied_seq = tp->rcv_nxt; + tcp_nip_init_buffer_space(sk); + /* Invoke memory barrier (annotated prior to checkpatch requirements) */ + smp_mb(); + tcp_set_state(sk, TCP_ESTABLISHED); + DEBUG("TCP_ESTABLISHED!!!!!\n"); + sk->sk_state_change(sk); + + /* Sets the part to be sent, and the size of the send window */ + tp->snd_una = TCP_SKB_CB(skb)->ack_seq; + tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; + tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); + + tp->lsndtime = tcp_jiffies32; + + tcp_initialize_rcv_mss(sk); + break; + case TCP_FIN_WAIT1: { + if (tp->snd_una != tp->write_seq) { + DEBUG("%s: tp->snd_una != tp->write_seq!!\n", __func__); + break; + } + + tcp_set_state(sk, TCP_FIN_WAIT2); + sk->sk_shutdown |= SEND_SHUTDOWN; + + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { + tcp_nip_done(sk); + DEBUG("%s: received payload packets, call tcp_nip_done.\n", __func__); + return 1; + } + + DEBUG("%s: TCP_FIN_WAIT1: recvd ack for fin.Wait for fin from other side.\n", + __func__); + inet_csk_reset_keepalive_timer(sk, 
TCP_NIP_CSK_KEEPALIVE_CYCLE * HZ); + + break; + } + + case TCP_CLOSING: + if (tp->snd_una == tp->write_seq) { + DEBUG("%s: TCP_CLOSING: recvd ack for fin.Ready to destroy.\n", __func__); + inet_csk_reset_keepalive_timer(sk, TCP_TIMEWAIT_LEN); + goto discard; + } + break; + case TCP_LAST_ACK: + DEBUG("tcp_nip_rcv_state_process_2: TCP_LAST_ACK\n"); + if (tp->snd_una == tp->write_seq) { + DEBUG("%s: LAST_ACK: recvd ack for fin.Directly destroy.\n", __func__); + tcp_nip_done(sk); + goto discard; + } + break; + } + + switch (sk->sk_state) { + case TCP_CLOSE_WAIT: + DEBUG("%s: into TCP_CLOSE_WAIT, rst = %d, seq = %u, end_seq = %u, rcv_nxt = %u\n", + __func__, th->rst, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt); + fallthrough; + case TCP_CLOSING: + case TCP_LAST_ACK: + if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + DEBUG("%s: break in TCP_LAST_ACK\n", __func__); + break; + } + DEBUG("tcp_nip_rcv_state_process_3: TCP_LAST_ACK_2\n"); + fallthrough; + case TCP_FIN_WAIT1: + case TCP_FIN_WAIT2: + /* Reset is required according to RFC 1122. + * Do not enter the reset process temporarily + */ + if (sk->sk_shutdown & RCV_SHUTDOWN) { + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { + tcp_nip_reset(sk); + DEBUG("%s: call tcp_nip_reset\n", __func__); + return 1; + } + } + fallthrough; + case TCP_ESTABLISHED: + tcp_nip_data_queue(sk, skb); + queued = 1; + break; + } + + if (sk->sk_state != TCP_CLOSE) { + tcp_nip_data_snd_check(sk); + tcp_nip_ack_snd_check(sk); + } + + if (!queued) { +discard: + tcp_nip_drop(sk, skb); + } + return 0; +} + +/* Function + * Initialize RCV_MSS + * Parameter + * sk: transmission control block + */ +void tcp_nip_initialize_rcv_mss(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); + + hint = min(hint, tp->rcv_wnd / TCP_NUM_2); + hint = min(hint, TCP_MSS_DEFAULT); + hint = max(hint, TCP_MIN_MSS); + + inet_csk(sk)->icsk_ack.rcv_mss = hint; +} + +/* Function + * Handle the third handshake ACK and return the new control block successfully. + * Is the core process for handling ACKS. + * (1)Create a child control block. Note that the state of the child control + * block is TCP_SYN_RECV + * This is different from the TCP_NEW_SYN_RECV control block created when syn was received. 
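+ *    (TCP_NEW_SYN_RECV marks the lightweight request_sock queued when the
+ *    SYN arrived; the sock created here is a full tcp_nip_sock carrying
+ *    complete TCP state.)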
+ * (2)Remove the request control block from the incomplete connection queue
+ *    and add it to the completed connection queue
+ * Parameter
+ * sk: transmission control block
+ * skb: socket buffer of the received segment
+ * req: Request connection control block
+ */
+struct sock *tcp_nip_check_req(struct sock *sk, struct sk_buff *skb,
+			       struct request_sock *req)
+{
+	struct tcp_options_received tmp_opt;
+	struct sock *child;
+	const struct tcphdr *th = tcp_hdr(skb);
+	__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST | TCP_FLAG_SYN | TCP_FLAG_ACK);
+	bool own_req;
+
+	tmp_opt.saw_tstamp = 0;
+	/* Check whether the TCP option exists */
+	if (th->doff > (sizeof(struct tcphdr) >> TCP_NIP_4BYTE_PAYLOAD)) {
+		/* Parsing TCP options */
+		tcp_nip_parse_options(skb, &tmp_opt, 0, NULL);
+	}
+
+	/* ACK present but its ack_seq does not match snt_isn + 1:
+	 * return the original listening sock; the caller does no
+	 * further processing
+	 */
+	if ((flg & TCP_FLAG_ACK) &&
+	    (TCP_SKB_CB(skb)->ack_seq !=
+	     tcp_rsk(req)->snt_isn + 1)) {
+		DEBUG("%s ack_seq is wrong!", __func__);
+		return sk;
+	}
+
+	/* Everything from here on requires an ACK; without one there is nothing to do */
+	if (!(flg & TCP_FLAG_ACK)) {
+		DEBUG("%s No TCP_FLAG_ACK !!!!", __func__);
+		return NULL;
+	}
+
+	/* The ack is valid and the child control block is created.
+	 * Note that the state of the child control block is TCP_SYN_RECV
+	 */
+	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
+							 req, &own_req);
+	if (!child) {
+		DEBUG("%s syn_recv_sock failed, handle listen overflow!", __func__);
+		goto listen_overflow;
+	}
+	DEBUG("%s create child sock successfully!", __func__);
+
+	sock_rps_save_rxhash(child, skb);
+	/* Measure the RTT between our SYN+ACK and the ACK completing the handshake */
+	tcp_synack_rtt_meas(child, req);
+	/* Remove the request control block from the incomplete queue
+	 * and add the new sock to the completed queue
+	 */
+	return inet_csk_complete_hashdance(sk, child, req, own_req);
+
+listen_overflow:
+	if (!sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow) {
+		inet_rsk(req)->acked = 1;
+		return NULL;
+	}
+	return NULL;
+}
+
diff --git a/code/net/newip/tcp_nip_output.c b/code/net/newip/tcp_nip_output.c
new file mode 100644
index 0000000000000000000000000000000000000000..c9030b448ce1e297519956945eae5a625aef18bf
--- /dev/null
+++ b/code/net/newip/tcp_nip_output.c
@@ -0,0 +1,1257 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2022 Huawei Device Co., Ltd.
+ *
+ * NewIP INET
+ * An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. NewIP INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Implementation of the Transmission Control Protocol(TCP).
+ * + * Based on net/ipv4/tcp_output.c + * Based on net/ipv4/tcp_minisocks.c + */ +#define pr_fmt(fmt) "NIP-TCP: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include "nip_hdr.h" +#include "nip_checksum.h" +#include "tcp_nip_parameter.h" + +#define OPTION_SACK_ADVERTISE BIT(0) +#define OPTION_TS BIT(1) +#define OPTION_MD5 BIT(2) +#define OPTION_WSCALE BIT(3) +#define OPTION_FAST_OPEN_COOKIE BIT(8) + +/* Store the options contained in TCP when sending TCP packets */ +struct tcp_nip_out_options { + u16 options; /* bit field of OPTION_* */ + u16 mss; /* If it is zero, the MSS option is disabled */ + + u8 ws; /* window scale, 0 to disable, If the window is enlarged, + * 0 indicates that the option is disabled + */ + u8 hash_size; /* bytes in hash_location */ + __u8 *hash_location; /* temporary pointer, overloaded */ + __u32 tsval, tsecr; /* need to include OPTION_TS */ +}; + +static bool tcp_nip_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + int push_one, gfp_t gfp); + +static void tcp_nip_event_data_sent(struct tcp_sock *tp, + struct sock *sk) +{ +} + +/* Calculate MSS not accounting any TCP options. */ +static inline int __tcp_nip_mtu_to_mss(struct sock *sk, int pmtu) +{ + const struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); + int mss_now; + + /* Calculate base mss without TCP options: It is MMS_S - sizeof(tcphdr) of rfc1122 */ + mss_now = pmtu - NIP_HDR_MAX - sizeof(struct tcphdr); + + /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */ + if (icsk->icsk_af_ops->net_frag_header_len) { + const struct dst_entry *dst = __sk_dst_get(sk); + + if (dst && dst_allfrag(dst)) + mss_now -= icsk->icsk_af_ops->net_frag_header_len; + } + + /* Clamp it (mss_clamp does not include tcp options) */ + if (mss_now > tp->rx_opt.mss_clamp) + mss_now = tp->rx_opt.mss_clamp; + + /* Now subtract optional transport overhead */ + mss_now -= icsk->icsk_ext_hdr_len; + + /* Then reserve room for full set of TCP options and 8 bytes of data */ + mss_now = max(mss_now, sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss); + return mss_now; +} + +/* Calculate MSS. Not accounting for SACKs here. 
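+ * As a rough worked example (assuming a 1500-byte link MTU, the full
+ * NIP_HDR_MAX bytes of NewIP header, a bare 20-byte TCP header and no
+ * extension headers): __tcp_nip_mtu_to_mss() yields about
+ *	1500 - NIP_HDR_MAX - sizeof(struct tcphdr)
+ * before the mss_clamp / min_snd_mss clamping, and this wrapper then
+ * subtracts whatever option space is currently counted in
+ * tp->tcp_header_len, e.g. 12 bytes when timestamps are on.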
*/ +int tcp_nip_mtu_to_mss(struct sock *sk, int pmtu) +{ + /* Subtract TCP options size, not including SACKs */ + return __tcp_nip_mtu_to_mss(sk, pmtu) - + (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr)); +} + +/* Inverse of above */ +int tcp_nip_mss_to_mtu(struct sock *sk, int mss) +{ + const struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); + int mtu; + + mtu = mss + + tp->tcp_header_len + + icsk->icsk_ext_hdr_len + + NIP_HDR_MAX; + /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */ + if (icsk->icsk_af_ops->net_frag_header_len) { + const struct dst_entry *dst = __sk_dst_get(sk); + + if (dst && dst_allfrag(dst)) + mtu += icsk->icsk_af_ops->net_frag_header_len; + } + return mtu; +} + +static inline void tcp_advance_send_head(struct sock *sk, const struct sk_buff *skb) +{ + if (tcp_skb_is_last(sk, skb)) + sk->sk_send_head = NULL; + else + sk->sk_send_head = skb_queue_next(&sk->sk_write_queue, skb); +} + +static void tcp_nip_event_new_data_sent(struct sock *sk, struct sk_buff *skb) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + unsigned int prior_packets = tp->packets_out; + + tcp_advance_send_head(sk, skb); + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + tp->packets_out += tcp_skb_pcount(skb); + if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { + tcp_nip_rearm_rto(sk); + } +} + +void __tcp_nip_push_pending_frames(struct sock *sk, unsigned int cur_mss, + int nonagle) +{ + if (unlikely(sk->sk_state == TCP_CLOSE)) + return; + + if (tcp_nip_write_xmit(sk, cur_mss, nonagle, 0, + sk_gfp_mask(sk, GFP_ATOMIC))) { + DEBUG("%s check probe0 timer!\n", __func__); + tcp_nip_check_probe_timer(sk); + } +} + +u32 __nip_tcp_select_window(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + int mss = tcp_nip_current_mss(sk); // TCP_BASE_MSS + int free_space; + int allowed_space; + int full_space; + int window; + + if (g_rcv_win_max) { + allowed_space = g_rcv_win_max; + full_space = allowed_space; + WRITE_ONCE(sk->sk_rcvbuf, g_nip_rcvbuf); + free_space = tcp_space(sk); + } else { + allowed_space = tcp_full_space(sk); + full_space = min_t(int, tp->window_clamp, allowed_space); + free_space = tcp_space(sk); + } + + if (unlikely(mss > full_space)) { + mss = full_space; + if (mss <= 0) + return 0; + } + + if (free_space < (full_space >> 1)) { + icsk->icsk_ack.quick = 0; + + free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale); + if (free_space < (allowed_space >> TCP_NUM_4) || free_space < mss) + return 0; + } + + if (g_nip_tcp_rcv_win_enable) { + if (g_ssthresh_enable == 1) + free_space = free_space > tp->nip_ssthresh ? tp->nip_ssthresh : free_space; + else + free_space = free_space > tp->rcv_ssthresh ? tp->rcv_ssthresh : free_space; + } else { + free_space = free_space > g_ssthresh_high ? 
g_ssthresh_high : free_space; + } + + window = tp->rcv_wnd; + if (tp->rx_opt.rcv_wscale) { + window = free_space; + if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window) + window = (((window >> tp->rx_opt.rcv_wscale) + 1) + << tp->rx_opt.rcv_wscale); + } else { + if (window <= free_space - mss || window > free_space) + window = (free_space / mss) * mss; + else if (mss == full_space && + free_space > window + (full_space >> 1)) + window = free_space; + } + return window; +} + +static u16 nip_tcp_select_window(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 old_win = tp->rcv_wnd; + u32 cur_win = tcp_receive_window(tp); + u32 new_win = __nip_tcp_select_window(sk); + + if (new_win < cur_win) { + if (new_win == 0) + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPWANTZEROWINDOWADV); + new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); + } + tp->rcv_wnd = new_win; + tp->rcv_wup = tp->rcv_nxt; + + if (!tp->rx_opt.rcv_wscale && + sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows) + new_win = min(new_win, MAX_TCP_WINDOW); + else + new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); + + new_win >>= tp->rx_opt.rcv_wscale; + if (new_win == 0) { + tp->pred_flags = 0; + if (old_win) + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPTOZEROWINDOWADV); + } else if (old_win == 0) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV); + } + + return new_win; +} + +/* Function + * Initialize transport layer parameters. + * Parameter + * sk: transmission control block. + */ +static void tcp_nip_connect_init(struct sock *sk) +{ + const struct dst_entry *dst = __sk_dst_get(sk); + struct tcp_sock *tp = tcp_sk(sk); + __u8 rcv_wscale = 0; + int sysctl_tcp_nip_window_scaling = 0; + + /* Header structure length + timestamp length */ + tp->tcp_header_len = sizeof(struct tcphdr); + if (sock_net(sk)->ipv4.sysctl_tcp_timestamps) + tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED; + + if (tp->rx_opt.user_mss) + tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; + tp->max_window = 0; + + tcp_mtup_init(sk); + tp->rx_opt.mss_clamp = tcp_nip_sync_mss(sk, dst_mtu(dst)); + + if (!tp->window_clamp) + tp->window_clamp = dst_metric(dst, RTAX_WINDOW); + tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); + + tcp_initialize_rcv_mss(sk); + + /* Initialization window */ + tcp_select_initial_window(sk, tcp_full_space(sk), + tp->advmss - (tp->rx_opt.ts_recent_stamp ? + tp->tcp_header_len - sizeof(struct tcphdr) : 0), + &tp->rcv_wnd, + &tp->window_clamp, + sysctl_tcp_nip_window_scaling, + &rcv_wscale, + 0); + + tp->rx_opt.rcv_wscale = g_wscale_enable == 1 ? g_wscale : rcv_wscale; + tp->rcv_ssthresh = tp->rcv_wnd; + + sk->sk_err = 0; + sock_reset_flag(sk, SOCK_DONE); + tp->snd_wnd = 0; + tp->snd_wl1 = 0; + tcp_write_queue_purge(sk); + + tp->snd_una = tp->write_seq; + tp->snd_sml = tp->write_seq; + tp->snd_up = tp->write_seq; + tp->snd_nxt = tp->write_seq; + + tp->rcv_nxt = 0; + tp->rcv_wup = tp->rcv_nxt; + tp->copied_seq = tp->rcv_nxt; + inet_csk(sk)->icsk_rto = g_nip_rto == 0 ? 
TCP_TIMEOUT_INIT : (HZ / g_nip_rto); + inet_csk(sk)->icsk_retransmits = 0; + tcp_clear_retrans(tp); +} + +static void tcp_nip_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) +{ + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum = 0; + + TCP_SKB_CB(skb)->tcp_flags = flags; + TCP_SKB_CB(skb)->sacked = 0; + + tcp_skb_pcount_set(skb, 1); + + TCP_SKB_CB(skb)->seq = seq; + if (flags & (TCPHDR_SYN | TCPHDR_FIN)) + seq++; + TCP_SKB_CB(skb)->end_seq = seq; +} + +#define OPTION_TS BIT(1) +#define OPTION_WSCALE BIT(3) + +static void tcp_nip_connect_queue_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + + tcb->end_seq += skb->len; + __skb_header_release(skb); + __skb_queue_tail(&sk->sk_write_queue, skb); + sk->sk_wmem_queued += skb->truesize; + sk_mem_charge(sk, skb->truesize); + WRITE_ONCE(tp->write_seq, tcb->end_seq); + tp->packets_out += tcp_skb_pcount(skb); +} + +static __u16 tcp_nip_advertise_mss(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + const struct dst_entry *dst = __sk_dst_get(sk); + int mss = tp->advmss; + + if (dst) { + unsigned int metric = dst_metric_advmss(dst); + + if (metric < mss) { + mss = metric; + tp->advmss = mss; + } + } + + return (__u16)mss; +} + +/* Compute TCP options for SYN packets. This is not the final + * network wire format yet. + */ +static unsigned int tcp_nip_syn_options(struct sock *sk, struct sk_buff *skb, + struct tcp_nip_out_options *opts) +{ + unsigned int remaining = MAX_TCP_OPTION_SPACE; + + opts->mss = tcp_nip_advertise_mss(sk); + DEBUG("advertise mss%d", opts->mss); + remaining -= TCPOLEN_MSS_ALIGNED; + + return MAX_TCP_OPTION_SPACE - remaining; +} + +/* Compute TCP options for ESTABLISHED sockets. This is not the + * final wire format yet. + */ +static unsigned int tcp_nip_established_options(struct sock *sk, struct sk_buff *skb, + struct tcp_nip_out_options *opts) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned int size = 0; + + opts->options = 0; + + if (likely(tp->rx_opt.tstamp_ok)) { + opts->options |= OPTION_TS; + opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0; + opts->tsecr = tp->rx_opt.ts_recent; + size += TCPOLEN_TSTAMP_ALIGNED; + } + return size; +} + +/* Function + * Put the parameters from the TCP option into SKB. + * Write previously computed TCP options to the packet. + * Parameter + * ptr: pointer to TCP options in SKB. + * tp: transmission control block. + * opts: structure to be sent to temporarily load TCP options. 
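+ *
+ * Only the MSS option is written at present. It occupies one 32-bit
+ * word: kind (TCPOPT_MSS = 2) in the top byte, length (TCPOLEN_MSS = 4)
+ * next, then the 16-bit MSS value, so a SYN advertising mss = 1400 is
+ * encoded as htonl((2 << 24) | (4 << 16) | 1400).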
+ */ +static void tcp_nip_options_write(__be32 *ptr, struct tcp_sock *tp, + struct tcp_nip_out_options *opts) +{ + if (unlikely(opts->mss)) { + *ptr++ = htonl((TCPOPT_MSS << TCP_OPT_MSS_PAYLOAD) | + (TCPOLEN_MSS << TCP_OLEN_MSS_PAYLOAD) | + opts->mss); + } +} + +static inline void tcp_nip_event_ack_sent(struct sock *sk, unsigned int pkts, + u32 rcv_nxt) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (unlikely(rcv_nxt != tp->rcv_nxt)) + return; + inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); +} + +unsigned short nip_get_output_checksum_tcp(struct sk_buff *skb, struct nip_addr src_addr, + struct nip_addr dst_addr) +{ + struct nip_pseudo_header nph = {0}; + u8 *tcp_hdr = skb_transport_header(skb); + + nph.nexthdr = IPPROTO_TCP; + nph.saddr = src_addr; + nph.daddr = dst_addr; + + nph.check_len = htons(skb->len); + return nip_check_sum_build(tcp_hdr, skb->len, &nph); +} + +static int __tcp_nip_transmit_skb(struct sock *sk, struct sk_buff *skb, + int clone_it, gfp_t gfp_mask, u32 rcv_nxt) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_sock *inet; + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_skb_cb *tcb; + struct tcp_nip_out_options opts; + unsigned int tcp_options_size, tcp_header_size; + struct sk_buff *oskb = NULL; + struct tcphdr *th; + int err = 0; + __be16 len; + unsigned short check = 0; + + if (skb->tstamp == 0) + skb->tstamp = tcp_jiffies32; + + if (clone_it) { + TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq + - tp->snd_una; + oskb = skb; + + tcp_skb_tsorted_save(oskb) { + if (unlikely(skb_cloned(oskb))) + skb = pskb_copy(oskb, gfp_mask); + else + skb = skb_clone(oskb, gfp_mask); + } tcp_skb_tsorted_restore(oskb); + + if (unlikely(!skb)) + return -ENOBUFS; + } + + inet = inet_sk(sk); + tcb = TCP_SKB_CB(skb); + memset(&opts, 0, sizeof(opts)); + + if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) + tcp_options_size = tcp_nip_syn_options(sk, skb, &opts); + else + tcp_options_size = tcp_nip_established_options(sk, skb, &opts); + tcp_header_size = tcp_options_size + sizeof(struct tcphdr); + + skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1); + /* The data pointer moves up */ + skb_push(skb, tcp_header_size); + skb_reset_transport_header(skb); + + /* Disassociate the control block */ + skb_orphan(skb); + + /* Establishes associations with control blocks */ + skb->sk = sk; + skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree; + skb_set_hash_from_sk(skb, sk); + /* Increase allocated memory */ + refcount_add(skb->truesize, &sk->sk_wmem_alloc); + DEBUG("th->inet_sport==%u, th->inet_dport==%u\n", + ntohs(inet->inet_sport), ntohs(inet->inet_dport)); + DEBUG("sk->sk_rcvbuf==%d, sk->sk_rmem_alloc==%d\n", + sk->sk_rcvbuf, atomic_read(&sk->sk_rmem_alloc)); + /* Build TCP header and checksum it. */ + th = (struct tcphdr *)skb->data; + th->source = inet->inet_sport; + th->dest = inet->inet_dport; + th->seq = htonl(tcb->seq); + th->ack_seq = htonl(rcv_nxt); + /* TCP's header offset is measured in 4 bytes, so moving two to the right + * means dividing by 4. In addition, according to the position of the offset + * field in the packet, the offset field is at the beginning of a short type, + * accounting for 4 bits. 
Therefore, the offset field should be shifted 12 bits + * to the left + */ + len = htons(((tcp_header_size >> TCP_NIP_4BYTE_PAYLOAD) << TCP_HDR_LEN_POS_PAYLOAD) | + tcb->tcp_flags); + *(((__be16 *)th) + TCP_HDR_LEN_OFFSET) = len; + + th->check = 0; + th->urg_ptr = 0; + + /* Write TCP option */ + tcp_nip_options_write((__be32 *)(th + 1), tp, &opts); + + /* Window Settings */ + if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) + th->window = htons(nip_tcp_select_window(sk)); + else + th->window = htons(min(tp->rcv_wnd, TCP_NIP_WINDOW_MAX)); + + /* Fill in checksum */ + check = nip_get_output_checksum_tcp(skb, sk->sk_nip_rcv_saddr, sk->sk_nip_daddr); + th->check = htons(check); + + if (likely(tcb->tcp_flags & TCPHDR_ACK)) + tcp_nip_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt); + + /* There's data to send */ + if (skb->len != tcp_header_size) { + tcp_nip_event_data_sent(tp, sk); + tp->data_segs_out += tcp_skb_pcount(skb); + } + + memset(skb->cb, 0, sizeof(struct ninet_skb_parm)); + err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); + return err; +} + +/* Function + * TCP's transport layer sends code that builds and initializes the TCP header + * Construct the SK_buff call transport layer to network layer interface + * Parameter + * sk: Transmission control block. + * skb: Structure stores all information about network datagrams + */ +int tcp_nip_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, + gfp_t gfp_mask) +{ + return __tcp_nip_transmit_skb(sk, skb, clone_it, gfp_mask, + tcp_sk(sk)->rcv_nxt); +} + +static void tcp_nip_queue_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* Advance write_seq and place onto the write_queue. */ + tp->write_seq = TCP_SKB_CB(skb)->end_seq; + tcp_nip_add_write_queue_tail(sk, skb); + sk->sk_wmem_queued += skb->truesize; + sk_mem_charge(sk, skb->truesize); +} + +/* Function + * A function used by the client transport layer to connect requests. + * Parameter + * sk: transmission control block. + */ +int __tcp_nip_connect(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; + int err; + + tcp_nip_connect_init(sk); + buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true); + if (unlikely(!buff)) + return -ENOBUFS; + + /* Initializes the SYN flag bit */ + tcp_nip_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); + tcp_mstamp_refresh(tp); + tp->retrans_stamp = tcp_time_stamp(tp); + tcp_nip_init_xmit_timers(sk); + + tcp_nip_connect_queue_skb(sk, buff); + + /* Send off SYN */ + err = tcp_nip_transmit_skb(sk, buff, 1, sk->sk_allocation); + if (err == -ECONNREFUSED) + return err; + + tp->snd_nxt = tp->write_seq; + tp->pushed_seq = tp->write_seq; + buff = tcp_nip_send_head(sk); + + TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); + + /* Timer for repeating the SYN until an answer. 
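+ * The period is the icsk_rto initialized in tcp_nip_connect_init(), i.e.
+ * HZ / g_nip_rto by default (TCP_TIMEOUT_INIT when g_nip_rto is 0), and
+ * it doubles in tcp_nip_retransmit_timer() up to TCP_RTO_MAX.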
*/ + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, TCP_RTO_MAX); + + return 0; +} + +unsigned int tcp_nip_sync_mss(struct sock *sk, u32 pmtu) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + int mss_now; + + if (icsk->icsk_mtup.search_high > pmtu) + icsk->icsk_mtup.search_high = pmtu; + + mss_now = tcp_nip_mtu_to_mss(sk, pmtu); + DEBUG("%s: sync mtu_to_mss %d\n", __func__, mss_now); + mss_now = tcp_bound_to_half_wnd(tp, mss_now); + DEBUG("%s: sync bound to half wnd %d\n", __func__, mss_now); + + /* And store cached results */ + icsk->icsk_pmtu_cookie = pmtu; + if (icsk->icsk_mtup.enabled) + mss_now = min(mss_now, tcp_nip_mtu_to_mss(sk, icsk->icsk_mtup.search_low)); + tp->mss_cache = mss_now; + + DEBUG("%s: sync final mss %d\n", __func__, mss_now); + + return mss_now; +} + +unsigned int tcp_nip_current_mss(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + const struct dst_entry *dst = __sk_dst_get(sk); + u32 mss_now; + unsigned int header_len; + + struct tcp_nip_out_options opts; + + mss_now = tp->mss_cache; + + DEBUG("%s: mss_cache %d\n", __func__, mss_now); + + if (dst) { + u32 mtu = dst_mtu(dst); + + if (mtu != inet_csk(sk)->icsk_pmtu_cookie) + mss_now = tcp_nip_sync_mss(sk, mtu); + DEBUG("%s: mtu %d\n", __func__, mtu); + } + + header_len = tcp_nip_established_options(sk, NULL, &opts) + + sizeof(struct tcphdr); + + if (header_len != tp->tcp_header_len) { + int delta = (int)header_len - tp->tcp_header_len; + + mss_now -= delta; + } + DEBUG("%s:after sync_mss%d\n", __func__, mss_now); + return mss_now; +} + +/* Function: + * Set up TCP options for SYN-ACKs. + * Initializes the TCP option for the SYN-ACK segment. Returns the SIZE of the TCP header. + * Parameter + * req: Request connection control block. + * mss: maximum segment length. + * skb: Transfer control block buffer. + * opts: stores the options contained in TCP packets when they are sent. + * foc: Fast Open option. + * synack_type: type of SYN+ACK segment. + */ +static unsigned int tcp_nip_synack_options(struct request_sock *req, + unsigned int mss, struct sk_buff *skb, + struct tcp_nip_out_options *opts, + const struct tcp_md5sig_key *md5, + struct tcp_fastopen_cookie *foc, + enum tcp_synack_type synack_type) +{ + struct inet_request_sock *ireq = inet_rsk(req); + unsigned int remaining = MAX_TCP_OPTION_SPACE; + + /* We always send an MSS option. */ + opts->mss = mss; + remaining -= TCPOLEN_MSS_ALIGNED; + + if (likely(ireq->tstamp_ok)) { + opts->options |= OPTION_TS; + opts->tsval = tcp_skb_timestamp(skb); + opts->tsecr = req->ts_recent; + remaining -= TCPOLEN_TSTAMP_ALIGNED; + } + return MAX_TCP_OPTION_SPACE - remaining; +} + +/* Function + * The SYN + ACK segment is constructed based on the current transport control block, + * routing information, and request information. + * Parameter + * sk: transmission control block. + * dst: routing. + * req: Request connection control block. + * foc: Fast Open option. + * synack_type: type of SYN+ACK segment. 
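+ * (MD5 signing is not supported: the local md5 key below stays NULL and
+ * tcp_nip_synack_options() ignores it.)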
+ */ +struct sk_buff *tcp_nip_make_synack(const struct sock *sk, struct dst_entry *dst, + struct request_sock *req, + struct tcp_fastopen_cookie *foc, + enum tcp_synack_type synack_type) +{ + struct inet_request_sock *ireq = inet_rsk(req); + const struct tcp_sock *tp = tcp_sk(sk); + struct tcp_md5sig_key *md5 = NULL; + struct tcp_nip_out_options opts; + struct sk_buff *skb; + int tcp_header_size; + struct tcphdr *th; + u16 user_mss; + int mss; + unsigned short check = 0; + + skb = alloc_skb(MAX_TCP_HEADER, 0); + if (unlikely(!skb)) { + dst_release(dst); + return NULL; + } + + /* Reserve space for headers. */ + skb_reserve(skb, MAX_TCP_HEADER); + + switch (synack_type) { + case TCP_SYNACK_NORMAL: + /* Release the original SKB and treat itself as the SKB of the current SK */ + skb_set_owner_w(skb, req_to_sk(req)); + break; + default: + break; + } + skb_dst_set(skb, dst); + /* set skb priority from sk */ + skb->priority = sk->sk_priority; + + mss = dst_metric_advmss(dst); + user_mss = READ_ONCE(tp->rx_opt.user_mss); + if (user_mss && user_mss < mss) + mss = user_mss; + + /* Clear the options and set the associated timestamp */ + memset(&opts, 0, sizeof(opts)); + skb->skb_mstamp_ns = tcp_clock_us(); + + /* Get the TCP header size, then set the size and reset the transport layer header */ + skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4); + tcp_header_size = tcp_nip_synack_options(req, mss, skb, &opts, md5, + foc, synack_type) + sizeof(*th); + skb_push(skb, tcp_header_size); + skb_reset_transport_header(skb); + + /* Clear the TCP header and set the fields of the TCP header */ + th = (struct tcphdr *)skb->data; + memset(th, 0, sizeof(struct tcphdr)); + th->syn = 1; + th->ack = 1; + if (inet_rsk(req)->ecn_ok) + th->ece = 1; + th->source = htons(ireq->ir_num); + th->dest = ireq->ir_rmt_port; + skb->ip_summed = CHECKSUM_PARTIAL; + th->seq = htonl(tcp_rsk(req)->snt_isn); + th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt); + th->check = 0; + + th->window = htons(min(req->rsk_rcv_wnd, 65535U)); + + tcp_nip_options_write((__be32 *)(th + 1), NULL, &opts); + /* TCP data offset, divided by 4 because doff is a 32-bit word + * That is, words four bytes long are counted in units + */ + th->doff = (tcp_header_size >> 2); + __TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); + + /* Fill in checksum */ + check = nip_get_output_checksum_tcp(skb, ireq->ir_nip_loc_addr, ireq->ir_nip_rmt_addr); + th->check = htons(check); + + /* Do not fool tcpdump (if any), clean our debris */ + skb->tstamp = 0; + return skb; +} + +/* Function + * Send SKB packets with SYN+ACK segments to the network layer. + * Parameter + * req: Request connection control block. + * skb: Transfer control block buffer. 
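+ *
+ * The NewIP header is first encapsulated into a small stack buffer
+ * (hdr_buf) and then pushed in front of the TCP header, roughly:
+ *	nip_hdr_comm_encap(&head) -> skb_push(skb, head.hdr_buf_pos) ->
+ *	memcpy(skb->data, head.hdr_buf, head.hdr_buf_pos) -> nip_send_skb(skb)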
+ */
+int __nip_send_synack(struct request_sock *req, struct sk_buff *skb)
+{
+	struct inet_request_sock *ireq = inet_rsk(req); /* connection request block */
+	int err = -EFAULT;
+	int csummode = CHECKSUM_NONE;
+	struct nip_addr *saddr, *daddr;
+	struct nip_hdr_encap head = {0};
+	unsigned char hdr_buf[NIP_HDR_MAX]; /* Cache the newIP header */
+
+	skb->protocol = htons(ETH_P_NEWIP);
+	skb->ip_summed = csummode;
+	skb->csum = 0;
+	saddr = &ireq->ir_nip_loc_addr;
+	daddr = &ireq->ir_nip_rmt_addr;
+
+	head.saddr = *saddr;
+	head.daddr = *daddr;
+	head.ttl = NIP_DEFAULT_TTL;
+	head.nexthdr = IPPROTO_TCP;
+	head.hdr_buf = hdr_buf;
+	nip_hdr_comm_encap(&head);
+	head.total_len = head.hdr_buf_pos + skb->len;
+	nip_update_total_len(&head, htons(head.total_len));
+
+	skb_push(skb, head.hdr_buf_pos);
+	memcpy(skb->data, head.hdr_buf, head.hdr_buf_pos);
+	skb_reset_network_header(skb);
+	NIPCB(skb)->srcaddr = *saddr;
+	NIPCB(skb)->dstaddr = *daddr;
+	NIPCB(skb)->nexthdr = head.nexthdr;
+
+	head.total_len = skb->len;
+	err = nip_send_skb(skb);
+	if (err)
+		DEBUG("%s: failed to send skb, skb->len=%u", __func__, head.total_len);
+	else
+		DEBUG("%s: send skb ok, skb->len=%u", __func__, head.total_len);
+
+	return err;
+}
+
+int nip_send_synack(struct request_sock *req, struct sk_buff *skb)
+{
+	return __nip_send_synack(req, skb);
+}
+
+/* Function:
+ * Drives the child transmission control block through state processing
+ * to complete the establishment of the three-way handshake
+ * Parameter:
+ * parent: indicates the parent transmission control block
+ * child: indicates the child transmission control block
+ * skb: socket buffer of the received segment
+ */
+int tcp_nip_child_process(struct sock *parent, struct sock *child,
+			  struct sk_buff *skb)
+{
+	int ret = 0;
+	int state = child->sk_state;
+	/* Child is not occupied by the user process */
+	if (!sock_owned_by_user(child)) {
+		ret = tcp_nip_rcv_state_process(child, skb);
+		/* At this point the state of the child has been migrated;
+		 * wake up the process on the listening socket,
+		 * which may be blocked in accept()
+		 */
+		if (state == TCP_SYN_RECV && child->sk_state != state)
+			parent->sk_data_ready(parent);
+	} else {
+		__sk_add_backlog(child, skb);
+	}
+	bh_unlock_sock(child);
+	sock_put(child);
+	return ret;
+}
+
+static inline __u32 tcp_nip_acceptable_seq(const struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!before(tcp_wnd_end(tp), tp->snd_nxt))
+		return tp->snd_nxt;
+	else
+		return tcp_wnd_end(tp);
+}
+
+/* Function:
+ * The client sends an ACK
+ * Parameter:
+ * sk: transmission control block
+ * rcv_nxt: sequence number to acknowledge
+ */
+void __tcp_nip_send_ack(struct sock *sk, u32 rcv_nxt)
+{
+	struct sk_buff *buff;
+
+	if (sk->sk_state == TCP_CLOSE)
+		return;
+
+	buff = alloc_skb(MAX_TCP_HEADER,
+			 sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
+	if (unlikely(!buff))
+		return;
+
+	/* Reserve space for the header. */
+	skb_reserve(buff, MAX_TCP_HEADER);
+	/* Initialize SKB without data */
+	tcp_nip_init_nondata_skb(buff, tcp_nip_acceptable_seq(sk), TCPHDR_ACK);
+
+	/* Mark pure ACK; skb->truesize is set to 2 */
+	skb_set_tcp_pure_ack(buff);
+
+	/* Record the timestamp and send the SKB.
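+	 * rcv_nxt is passed through explicitly rather than read from
+	 * tcp_sk(sk), so callers may acknowledge a sequence number of
+	 * their own choosing.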
*/ + __tcp_nip_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt); +} + +void tcp_nip_send_ack(struct sock *sk) +{ + __tcp_nip_send_ack(sk, tcp_sk(sk)->rcv_nxt); +} + +void tcp_nip_send_fin(struct sock *sk) +{ + struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk); + struct tcp_sock *tp = tcp_sk(sk); + u32 cur_mss; + + DEBUG("%s: send fin!\n", __func__); + /* Set the fin position of the last packet to 1 */ + if (tskb && tcp_nip_send_head(sk)) { +coalesce: + TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; + TCP_SKB_CB(tskb)->end_seq++; + tp->write_seq++; + } else { + skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation); + if (unlikely(!skb)) { + if (tskb) + goto coalesce; + return; + } + skb_reserve(skb, MAX_TCP_HEADER); + + tcp_nip_init_nondata_skb(skb, tp->write_seq, + TCPHDR_ACK | TCPHDR_FIN); + tcp_nip_queue_skb(sk, skb); + } + + cur_mss = tcp_nip_current_mss(sk); // TCP_BASE_MSS + __tcp_nip_push_pending_frames(sk, cur_mss, TCP_NAGLE_OFF); +} + +void tcp_nip_send_active_reset(struct sock *sk, gfp_t priority) +{ + struct sk_buff *skb; + + DEBUG("%s: send RST!\n", __func__); + /* NOTE: No TCP options attached and we never retransmit this. */ + skb = alloc_skb(MAX_TCP_HEADER, priority); + if (!skb) { + DEBUG("%s: alloc_skb failed.\n", __func__); + return; + } + /* Reserve space for headers and prepare control bits. */ + skb_reserve(skb, MAX_TCP_HEADER); + tcp_nip_init_nondata_skb(skb, tcp_nip_acceptable_seq(sk), + TCPHDR_ACK | TCPHDR_RST); + /* Send it off. */ + tcp_nip_transmit_skb(sk, skb, 0, priority); +} + +static bool tcp_nip_snd_wnd_test(const struct tcp_sock *tp, + const struct sk_buff *skb, + unsigned int cur_mss) +{ + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + + if (skb->len > cur_mss) + end_seq = TCP_SKB_CB(skb)->seq + cur_mss; + + return !after(end_seq, tcp_wnd_end(tp)); +} + +static void tcp_nip_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) +{ + if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) { + /* Avoid the costly divide in the normal + * non-TSO case. + */ + tcp_skb_pcount_set(skb, 1); + TCP_SKB_CB(skb)->tcp_gso_size = 0; + } else { + tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now)); + TCP_SKB_CB(skb)->tcp_gso_size = mss_now; + } +} + +static int tcp_nip_init_tso_segs(struct sk_buff *skb, unsigned int mss_now) +{ + int tso_segs = tcp_skb_pcount(skb); + + if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) { + tcp_nip_set_skb_tso_segs(skb, mss_now); + tso_segs = tcp_skb_pcount(skb); + } + return tso_segs; +} + +static bool tcp_nip_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + int push_one, gfp_t gfp) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + u32 snd_num = g_nip_tcp_snd_win_enable ? 
(tp->nip_ssthresh / mss_now) : 0xFFFFFFFF; + u32 last_nip_ssthresh = tp->nip_ssthresh; + + tcp_nip_keepalive_enable(sk); + tp->idle_ka_probes_out = 0; + + tcp_mstamp_refresh(tp); + + if (tp->rcv_tstamp) { + u32 tstamp = tcp_jiffies32 - tp->rcv_tstamp; + + if (tstamp >= g_ack_to_nxt_snd_tstamp) { + tp->nip_ssthresh = g_ssthresh_low_min; + snd_num = tp->nip_ssthresh / mss_now; + SSTHRESH_DBG("%s new snd tstamp %u >= %u, ssthresh %u to %u, snd_num=%u", + __func__, tstamp, g_ack_to_nxt_snd_tstamp, + last_nip_ssthresh, tp->nip_ssthresh, snd_num); + } + } + + while ((skb = tcp_nip_send_head(sk)) && (snd_num--)) { + DEBUG("%s:tcp_nip_send_head head found!\n", __func__); + tcp_nip_init_tso_segs(skb, mss_now); + if (unlikely(!tcp_nip_snd_wnd_test(tp, skb, mss_now))) + break; + + if (unlikely(tcp_nip_transmit_skb(sk, skb, 1, gfp))) + break; + + tcp_nip_event_new_data_sent(sk, skb); + + if (push_one) + break; + } + return !tp->packets_out && tcp_nip_send_head(sk); +} + +int tcp_nip_rtx_synack(const struct sock *sk, struct request_sock *req) +{ + const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific; + int res; + struct dst_entry *dst; + + dst = af_ops->route_req(sk, NULL, req); + tcp_rsk(req)->txhash = net_tx_rndhash(); + + res = af_ops->send_synack(sk, dst, NULL, req, NULL, TCP_SYNACK_NORMAL, + NULL); + + return res; +} + +static void tcp_nip_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->packets_out -= decr; +} + +int __tcp_nip_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned int cur_mss; + int diff, len, err; + + if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) { + if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) { + WARN_ON_ONCE(1); + return -EINVAL; + } + if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) + return -ENOMEM; + } + + cur_mss = tcp_nip_current_mss(sk); // TCP_BASE_MSS + + if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) && + TCP_SKB_CB(skb)->seq != tp->snd_una) + return -EAGAIN; + + len = cur_mss * segs; + if (skb->len > len) { + if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, + skb, len, cur_mss, GFP_ATOMIC)) + return -ENOMEM; /* We'll try again later. */ + } else { + diff = tcp_skb_pcount(skb); + tcp_nip_set_skb_tso_segs(skb, cur_mss); + diff -= tcp_skb_pcount(skb); + if (diff) + tcp_nip_adjust_pcount(sk, skb, diff); + } + + err = tcp_nip_transmit_skb(sk, skb, 1, GFP_ATOMIC); + if (likely(!err)) { + segs = tcp_skb_pcount(skb); + + tp->total_retrans += segs; + } + return err; +} + +int tcp_nip_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) +{ + struct tcp_sock *tp = tcp_sk(sk); + int err = __tcp_nip_retransmit_skb(sk, skb, segs); + + if (err == 0) { + TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; + tp->retrans_out += tcp_skb_pcount(skb); + + /* Save stamp of the first retransmit. 
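+	 * (This is the same retrans_stamp field that the timestamp sanity
+	 * check in tcp_nip_rcv_synsent_state_process() validates rcv_tsecr
+	 * against.)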
*/ + if (!tp->retrans_stamp) + tp->retrans_stamp = tcp_skb_timestamp(skb); + } else if (err != -EBUSY) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); + } + + return err; +} + +#define TCP_NIP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \ + (1UL << TCP_NIP_WRITE_TIMER_DEFERRED) | \ + (1UL << TCP_NIP_DELACK_TIMER_DEFERRED) | \ + (1UL << TCP_MTU_REDUCED_DEFERRED)) + +void tcp_nip_release_cb(struct sock *sk) +{ + unsigned long flags, nflags; + + /* perform an atomic operation only if at least one flag is set */ + do { + flags = sk->sk_tsq_flags; + if (!(flags & TCP_NIP_DEFERRED_ALL)) + return; + nflags = flags & ~TCP_NIP_DEFERRED_ALL; + } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags); + + sock_release_ownership(sk); + if (flags & (1UL << TCP_NIP_WRITE_TIMER_DEFERRED)) { + tcp_nip_write_timer_handler(sk); + __sock_put(sk); + } + if (flags & (1UL << TCP_NIP_DELACK_TIMER_DEFERRED)) { + tcp_nip_delack_timer_handler(sk); + __sock_put(sk); + } + if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) { + inet_csk(sk)->icsk_af_ops->mtu_reduced(sk); + __sock_put(sk); + } +} + +static int tcp_nip_xmit_probe_skb(struct sock *sk, int urgent, int mib) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + /* We don't queue it, tcp_transmit_skb() sets ownership. */ + skb = alloc_skb(MAX_TCP_HEADER, + sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN)); + if (!skb) + return -1; + + /* Reserve space for headers and set control bits. */ + skb_reserve(skb, MAX_TCP_HEADER); + + tcp_nip_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); + + NET_INC_STATS(sock_net(sk), mib); + DEBUG("[nip]%s: send probe packet!\n", __func__); + return tcp_nip_transmit_skb(sk, skb, 0, (__force gfp_t)0); +} + +int tcp_nip_write_wakeup(struct sock *sk, int mib) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + if (sk->sk_state == TCP_CLOSE) + return -1; + + skb = tcp_nip_send_head(sk); + /* If the serial number of the next packet is in the sending window */ + if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) { + int err; + unsigned int mss = tcp_nip_current_mss(sk); + unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; + + if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) + tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; + /* If the current window size is not enough to send a complete packet */ + if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) { + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; + err = tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, + skb, seg_size, mss, GFP_ATOMIC); + if (err) { + DEBUG("[nip]:tcp_fragment return err = %d!\n", err); + return -1; + } + } + err = tcp_nip_transmit_skb(sk, skb, 1, GFP_ATOMIC); + if (!err) + tcp_nip_event_new_data_sent(sk, skb); + return err; + } else { + return tcp_nip_xmit_probe_skb(sk, 0, mib); + } +} + +/* The 0 window probe packet is sent */ +void tcp_nip_send_probe0(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); + unsigned long probe_max; + int err; + /* An ACK packet with snd_UNa-1 and length 0 is sent as a zero-window detection packet */ + err = tcp_nip_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); + + /* If there are packets to be sent on the network and no packets to be + * sent in the send queue, the packet is returned directly + */ + if (tp->packets_out || !tcp_nip_send_head(sk)) { + /* Cancel probe timer, if it is not required. 
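+	 * Otherwise the probe is rescheduled below with exponential backoff;
+	 * tcp_probe0_when() is expected to yield roughly
+	 *	min(icsk_rto << icsk_backoff, probe_max)
+	 * so a probe that was sent (err <= 0) backs off toward TCP_RTO_MAX,
+	 * while one dropped by local congestion (err > 0) retries after the
+	 * shorter TCP_RESOURCE_PROBE_INTERVAL.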
*/ + icsk->icsk_probes_out = 0; + icsk->icsk_backoff = 0; + return; + } + + if (err <= 0) { + if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2) + icsk->icsk_backoff++; + icsk->icsk_probes_out++; /* Number of probes +1 */ + probe_max = TCP_RTO_MAX; + } else { + if (!icsk->icsk_probes_out) + icsk->icsk_probes_out = 1; + probe_max = TCP_RESOURCE_PROBE_INTERVAL; + } + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + tcp_probe0_when(sk, probe_max), + TCP_RTO_MAX); +} diff --git a/code/net/newip/tcp_nip_parameter.c b/code/net/newip/tcp_nip_parameter.c new file mode 100644 index 0000000000000000000000000000000000000000..18094c9d96b172865b9942115b2dcdb728908046 --- /dev/null +++ b/code/net/newip/tcp_nip_parameter.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * NewIP INET + * An implementation of the TCP/IP protocol suite for the LINUX + * operating system. NewIP INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Definitions for the NewIP parameter module. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +/*********************************************************************************************/ +/* Newip protocol name */ +/*********************************************************************************************/ +int g_af_ninet = AF_NINET; +module_param_named(af_ninet, g_af_ninet, int, 0444); + +/*********************************************************************************************/ +/* Rto timeout timer period (HZ/n) */ +/*********************************************************************************************/ +/* RTT RTO in the small-delay scenario */ +int g_nip_rto = 50; +module_param_named(nip_rto, g_nip_rto, int, 0644); + +/* RTT RTO of a large delay scenario */ +int g_nip_rto_up = 100; +module_param_named(nip_rto_up, g_nip_rto_up, int, 0644); + +/*********************************************************************************************/ +/* TCP sending and receiving buffer configuration */ +/*********************************************************************************************/ +int g_nip_sndbuf = 1050000; // 1M +module_param_named(nip_sndbuf, g_nip_sndbuf, int, 0644); + +int g_nip_rcvbuf = 2000000; // 2M +module_param_named(nip_rcvbuf, g_nip_rcvbuf, int, 0644); + +/*********************************************************************************************/ +/* Window configuration */ +/*********************************************************************************************/ +/* Maximum receiving window */ +int g_wscale_enable = 1; +module_param_named(wscale_enable, g_wscale_enable, int, 0644); + +/* Window scale configuration, 2^n */ +int g_wscale = 7; +module_param_named(wscale, g_wscale, int, 0644); + +/*********************************************************************************************/ +/* Enables the debugging of special scenarios */ +/*********************************************************************************************/ +/* After receiving n packets, an ACK packet is sent */ +int g_ack_num = 5; +module_param_named(ack_num, g_ack_num, int, 0644); + +/* Reset the packet sending window threshold after receiving n ACK packets */ +int g_nip_ssthresh_reset = 10000000; // 10M +module_param_named(nip_ssthresh_reset, g_nip_ssthresh_reset, int, 0644); + +/*********************************************************************************************/ +/* Enables 
the debugging of special scenarios */ +/*********************************************************************************************/ +/* Debugging of threshold change */ +int g_rtt_ssthresh_debug; +module_param_named(rtt_ssthresh_debug, g_rtt_ssthresh_debug, int, 0644); + +/* Debugging of packet retransmission after ACK */ +int g_ack_retrans_debug; +module_param_named(ack_retrans_debug, g_ack_retrans_debug, int, 0644); + +/*********************************************************************************************/ +/* Retransmission parameters after ACK */ +/*********************************************************************************************/ +/* Three DUP ACK packets indicates the number of retransmission packets */ +int g_dup_ack_retrans_num = 5; +module_param_named(dup_ack_retrans_num, g_dup_ack_retrans_num, int, 0644); + +/* Common ACK Indicates the number of retransmissions */ +int g_ack_retrans_num = 1; +module_param_named(ack_retrans_num, g_ack_retrans_num, int, 0644); + +/* Ack retransmission seg retransmission divisor */ +int g_retrans_seg_end_divisor = 1; +module_param_named(retrans_seg_end_divisor, g_retrans_seg_end_divisor, int, 0644); + +/*********************************************************************************************/ +/* RTT timestamp parameters */ +/*********************************************************************************************/ +int g_rtt_tstamp_rto_up = 100; // rtt_tstamp >= 100 ==> shorten rto +module_param_named(rtt_tstamp_rto_up, g_rtt_tstamp_rto_up, int, 0644); + +int g_rtt_tstamp_high = 30; // rtt_tstamp >= 30 ==> ssthresh = 100K +module_param_named(rtt_tstamp_high, g_rtt_tstamp_high, int, 0644); + +int g_rtt_tstamp_mid_high = 20; // rtt_tstamp >= 20 ==> ssthresh = 250K +module_param_named(rtt_tstamp_mid_high, g_rtt_tstamp_mid_high, int, 0644); + +/* rtt_tstamp >= 10 ==> ssthresh = 1M (500K ~ 1M) + * rtt_tstamp < 10 ==> ssthresh = 1.5M + */ +int g_rtt_tstamp_mid_low = 10; +module_param_named(rtt_tstamp_mid_low, g_rtt_tstamp_mid_low, int, 0644); + +int g_ack_to_nxt_snd_tstamp = 100; +module_param_named(ack_to_nxt_snd_tstamp, g_ack_to_nxt_snd_tstamp, int, 0644); + +/*********************************************************************************************/ +/* Window threshold parameters */ +/*********************************************************************************************/ +int g_ssthresh_enable = 1; +module_param_named(ssthresh_enable, g_ssthresh_enable, int, 0644); + +int g_nip_ssthresh_default = 300000; // 300K +module_param_named(nip_ssthresh_default, g_nip_ssthresh_default, int, 0644); + +int g_ssthresh_high = 1500000; // rtt_tstamp < 10 ==> ssthresh = 1.5M +module_param_named(ssthresh_high, g_ssthresh_high, int, 0644); + +int g_ssthresh_mid_high = 1000000; // rtt_tstamp >= 10 ==> ssthresh = 1M (500K ~ 1M) +module_param_named(ssthresh_mid_high, g_ssthresh_mid_high, int, 0644); + +int g_ssthresh_mid_low = 250000; // rtt_tstamp >= 20 ==> ssthresh = 250K +module_param_named(ssthresh_mid_low, g_ssthresh_mid_low, int, 0644); + +int g_ssthresh_low = 100000; // rtt_tstamp >= 30 ==> ssthresh = 100K +module_param_named(ssthresh_low, g_ssthresh_low, int, 0644); + +int g_ssthresh_low_min = 10000; // rtt_tstamp >= 100 ==> ssthresh = 10K +module_param_named(ssthresh_low_min, g_ssthresh_low_min, int, 0644); + +int g_ssthresh_high_step = 1; +module_param_named(ssthresh_high_step, g_ssthresh_high_step, int, 0644); + +int g_rcv_win_max = 512000; +module_param_named(rcv_win_max, g_rcv_win_max, int, 0644); + 
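+/* The knobs in this file are exported with module_param_named(); those
+ * with mode 0644 can be read and tuned at runtime through sysfs, for
+ * example (the module directory name "newip" is an assumption here):
+ *
+ *	cat /sys/module/newip/parameters/rcv_win_max
+ *	echo 256000 > /sys/module/newip/parameters/rcv_win_max
+ */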
+/*********************************************************************************************/
+/* keepalive parameters */
+/*********************************************************************************************/
+int g_nip_idle_ka_probes_out = 200;
+module_param_named(nip_idle_ka_probes_out, g_nip_idle_ka_probes_out, int, 0644);
+
+int g_nip_keepalive_time = 25;
+module_param_named(nip_keepalive_time, g_nip_keepalive_time, int, 0644);
+
+int g_nip_keepalive_intvl = 25;
+module_param_named(nip_keepalive_intvl, g_nip_keepalive_intvl, int, 0644);
+
+int g_nip_keepalive_probes = 255;
+module_param_named(nip_keepalive_probes, g_nip_keepalive_probes, int, 0644);
+
+/*********************************************************************************************/
+/* zero-window probe parameters */
+/*********************************************************************************************/
+int g_nip_tcp_zero_probe = 20;
+module_param_named(nip_tcp_zero_probe, g_nip_tcp_zero_probe, int, 0644);
+
+/*********************************************************************************************/
+/* window mode parameters */
+/*********************************************************************************************/
+bool g_nip_tcp_snd_win_enable;
+module_param_named(nip_tcp_snd_win_enable, g_nip_tcp_snd_win_enable, bool, 0644);
+
+bool g_nip_tcp_rcv_win_enable = true;
+module_param_named(nip_tcp_rcv_win_enable, g_nip_tcp_rcv_win_enable, bool, 0644);
+
diff --git a/code/net/newip/tcp_nip_parameter.h b/code/net/newip/tcp_nip_parameter.h
new file mode 100644
index 0000000000000000000000000000000000000000..7467dde6fa742a2f9b972056dc0cc44ada39c2ec
--- /dev/null
+++ b/code/net/newip/tcp_nip_parameter.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2022 Huawei Device Co., Ltd.
+ *
+ * NewIP INET
+ * An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. NewIP INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * Definitions for the NewIP parameter module.
+ */ +#ifndef _TCP_NIP_PARAMETER_H +#define _TCP_NIP_PARAMETER_H + +/*********************************************************************************************/ +/* Rto timeout timer period (HZ/n) */ +/*********************************************************************************************/ +extern int g_nip_rto; +extern int g_nip_rto_up; + +/*********************************************************************************************/ +/* TCP sending and receiving buffer configuration */ +/*********************************************************************************************/ +extern int g_nip_sndbuf; +extern int g_nip_rcvbuf; + +/*********************************************************************************************/ +/* Window configuration */ +/*********************************************************************************************/ +extern int g_wscale_enable; +extern int g_wscale; + +/*********************************************************************************************/ +/* Enables the debugging of special scenarios */ +/*********************************************************************************************/ +extern int g_ack_num; +extern int g_nip_ssthresh_reset; + +/*********************************************************************************************/ +/* Enables the debugging of special scenarios */ +/*********************************************************************************************/ +extern int g_rtt_ssthresh_debug; +#define SSTHRESH_DBG(fmt, ...) \ +do { \ + if (g_rtt_ssthresh_debug) \ + pr_crit(fmt, ##__VA_ARGS__); \ +} while (0) + +extern int g_ack_retrans_debug; +#define RETRANS_DBG(fmt, ...) \ +do { \ + if (g_ack_retrans_debug) \ + pr_crit(fmt, ##__VA_ARGS__); \ +} while (0) + +/*********************************************************************************************/ +/* Retransmission parameters after ACK */ +/*********************************************************************************************/ +extern int g_dup_ack_retrans_num; +extern int g_ack_retrans_num; +extern int g_retrans_seg_end_divisor; + +/*********************************************************************************************/ +/* RTT timestamp parameters */ +/*********************************************************************************************/ +extern int g_rtt_tstamp_rto_up; +extern int g_rtt_tstamp_high; +extern int g_rtt_tstamp_mid_high; +extern int g_rtt_tstamp_mid_low; +extern int g_ack_to_nxt_snd_tstamp; + +/*********************************************************************************************/ +/* Window threshold parameters */ +/*********************************************************************************************/ +extern int g_ssthresh_enable; +extern int g_nip_ssthresh_default; +extern int g_ssthresh_high; +extern int g_ssthresh_mid_high; +extern int g_ssthresh_mid_low; +extern int g_ssthresh_low; +extern int g_ssthresh_low_min; +extern int g_ssthresh_high_step; +extern int g_rcv_win_max; + +/*********************************************************************************************/ +/* keepalive parameters */ +/*********************************************************************************************/ +extern int g_nip_idle_ka_probes_out; +extern int g_nip_keepalive_time; +extern int g_nip_keepalive_intvl; +extern int g_nip_keepalive_probes; + +/*********************************************************************************************/ +/* zero probeparameters */ 
+/*********************************************************************************************/ +extern int g_nip_tcp_zero_probe; + +/*********************************************************************************************/ +/* window mode parameters */ +/*********************************************************************************************/ +extern bool g_nip_tcp_snd_win_enable; +extern bool g_nip_tcp_rcv_win_enable; + +#endif /* _TCP_NIP_PARAMETER_H */ diff --git a/code/net/newip/tcp_nip_timer.c b/code/net/newip/tcp_nip_timer.c new file mode 100644 index 0000000000000000000000000000000000000000..4f3c44cc182ae83e76b13d12206d7d8cc34cb0db --- /dev/null +++ b/code/net/newip/tcp_nip_timer.c @@ -0,0 +1,381 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Huawei Device Co., Ltd. + * + * NewIP INET + * An implementation of the TCP/IP protocol suite for the LINUX + * operating system. NewIP INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Based on net/ipv4/tcp_timer.c + */ +#define pr_fmt(fmt) "NIP-TCP: " fmt + +#include +#include +#include "tcp_nip_parameter.h" + +/** + * tcp_nip_orphan_retries() - Returns maximal number of retries on an orphaned socket + * @sk: Pointer to the current socket. + * @alive: bool, socket alive state + */ +static int tcp_nip_orphan_retries(struct sock *sk, bool alive) +{ + int retries = sock_net(sk)->ipv4.sysctl_tcp_orphan_retries; /* May be zero. */ + + /* We know from an ICMP that something is wrong. */ + if (sk->sk_err_soft && !alive) + retries = 0; + + /* However, if socket sent something recently, select some safe + * number of retries. 8 corresponds to >100 seconds with minimal + * RTO of 200msec. + */ + if (retries == 0 && alive) + retries = 8; + return retries; +} + +void tcp_nip_delack_timer_handler(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) || + !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) + goto out; + + if (time_after(icsk->icsk_ack.timeout, jiffies)) { + sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); + goto out; + } + icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; + + if (inet_csk_ack_scheduled(sk)) { + icsk->icsk_ack.ato = TCP_ATO_MIN; + tcp_mstamp_refresh(tcp_sk(sk)); + tcp_nip_send_ack(sk); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS); + } + +out:; +} + +static void tcp_nip_write_err(struct sock *sk) +{ + sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; + sk->sk_error_report(sk); + /* Releasing TCP Resources */ + tcp_nip_done(sk); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT); +} + +static void tcp_nip_delack_timer(struct timer_list *t) +{ + struct inet_connection_sock *icsk = + from_timer(icsk, t, icsk_delack_timer); + struct sock *sk = &icsk->icsk_inet.sk; + + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) { + tcp_nip_delack_timer_handler(sk); + } else { + __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); + /* deleguate our work to tcp_release_cb() */ + if (!test_and_set_bit(TCP_NIP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags)) + sock_hold(sk); + } + bh_unlock_sock(sk); + sock_put(sk); +} + +static bool retransmits_nip_timed_out(struct sock *sk, + unsigned int boundary, + unsigned int timeout, + bool syn_set) +{ + /* Newip does not support the calculation of the timeout period based on the timestamp. 
+ * Currently, it determines whether the timeout period is based on + * the retransmission times + */ + DEBUG("%s: icsk->retransmits=%u\n", __func__, + inet_csk(sk)->icsk_retransmits); + return inet_csk(sk)->icsk_retransmits > boundary; +} + +static int tcp_nip_write_timeout(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct net *net = sock_net(sk); + int retry_until; + bool syn_set = false; + + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { + retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; + syn_set = true; + } else { + retry_until = net->ipv4.sysctl_tcp_retries2; + if (sock_flag(sk, SOCK_DEAD)) { + const bool alive = icsk->icsk_rto < TCP_RTO_MAX; + + /* In the case of SOCK_DEAD, the retry_until value is smaller */ + retry_until = tcp_nip_orphan_retries(sk, alive); + } + } + + if (retransmits_nip_timed_out(sk, retry_until, + syn_set ? 0 : icsk->icsk_user_timeout, syn_set)) { + DEBUG("%s: tcp retransmit time out!!!\n", __func__); + tcp_nip_write_err(sk); + return 1; + } + return 0; +} + +void tcp_nip_retransmit_timer(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + struct sk_buff *skb = tcp_write_queue_head(sk); + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + u32 icsk_rto_last; + + if (!tp->packets_out) + return; + + if (tcp_nip_write_queue_empty(sk)) + return; + + tp->tlp_high_seq = 0; + + if (tcp_nip_write_timeout(sk)) + return; + + if (tcp_nip_retransmit_skb(sk, skb, 1) > 0) { + if (!icsk->icsk_retransmits) + icsk->icsk_retransmits = 1; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL), + TCP_RTO_MAX); + + SSTHRESH_DBG("%s seq %u retransmit fail, win=%u, rto=%u, pkt_out=%u", + __func__, scb->seq, tp->nip_ssthresh, icsk->icsk_rto, tp->packets_out); + return; + } + icsk->icsk_backoff++; + icsk->icsk_retransmits++; + + icsk_rto_last = icsk->icsk_rto; + icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); + + SSTHRESH_DBG("%s seq %u, reset win %u to %u, rto %u to %u, pkt_out=%u", + __func__, scb->seq, tp->nip_ssthresh, g_ssthresh_low, + icsk_rto_last, icsk->icsk_rto, tp->packets_out); + + tp->nip_ssthresh = g_ssthresh_low; + + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); +} + +void tcp_nip_probe_timer(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + int max_probes; + + if (tp->packets_out || !tcp_nip_send_head(sk)) { + icsk->icsk_probes_out = 0; + DEBUG("[nip]%s packets_out!=0 or send_head=NULL, don't send probe packet.", + __func__); + return; + } + + max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; + if (sock_flag(sk, SOCK_DEAD)) { + const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; + + max_probes = tcp_nip_orphan_retries(sk, alive); + if (!alive && icsk->icsk_backoff >= max_probes) + goto abort; + } + + if (icsk->icsk_probes_out >= max_probes) { +abort: tcp_nip_write_err(sk); + } else { + /* Only send another probe if we didn't close things up. 
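+	 * The icsk_probes_out >= max_probes check above guarantees forward
+	 * progress: once the limit is reached the abort path runs instead.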
+void tcp_nip_probe_timer(struct sock *sk)
+{
+    struct inet_connection_sock *icsk = inet_csk(sk);
+    struct tcp_sock *tp = tcp_sk(sk);
+    int max_probes;
+
+    if (tp->packets_out || !tcp_nip_send_head(sk)) {
+        icsk->icsk_probes_out = 0;
+        DEBUG("[nip]%s packets_out!=0 or send_head=NULL, don't send probe packet.",
+              __func__);
+        return;
+    }
+
+    max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2;
+    if (sock_flag(sk, SOCK_DEAD)) {
+        const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
+
+        max_probes = tcp_nip_orphan_retries(sk, alive);
+        if (!alive && icsk->icsk_backoff >= max_probes)
+            goto abort;
+    }
+
+    if (icsk->icsk_probes_out >= max_probes) {
+abort:        tcp_nip_write_err(sk);
+    } else {
+        /* Only send another probe if we didn't close things up. */
+        tcp_nip_send_probe0(sk);
+    }
+}
+
+void tcp_nip_write_timer_handler(struct sock *sk)
+{
+    struct inet_connection_sock *icsk = inet_csk(sk);
+    int event;
+
+    if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
+        !icsk->icsk_pending)
+        goto out;
+
+    if (time_after(icsk->icsk_timeout, jiffies)) {
+        sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
+        goto out;
+    }
+    tcp_mstamp_refresh(tcp_sk(sk));
+    event = icsk->icsk_pending;
+
+    switch (event) {
+    case ICSK_TIME_RETRANS:
+        icsk->icsk_pending = 0;
+        tcp_nip_retransmit_timer(sk);
+        break;
+    case ICSK_TIME_PROBE0:
+        icsk->icsk_pending = 0;
+        tcp_nip_probe_timer(sk);
+        break;
+    default:
+        break;
+    }
+
+out:;
+}
+
+static void tcp_nip_write_timer(struct timer_list *t)
+{
+    struct inet_connection_sock *icsk =
+        from_timer(icsk, t, icsk_retransmit_timer);
+    struct sock *sk = &icsk->icsk_inet.sk;
+
+    bh_lock_sock(sk);
+    if (!sock_owned_by_user(sk)) {
+        tcp_nip_write_timer_handler(sk);
+    } else {
+        /* delegate our work to tcp_release_cb() */
+        if (!test_and_set_bit(TCP_NIP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags))
+            sock_hold(sk);
+    }
+    bh_unlock_sock(sk);
+    sock_put(sk);
+}
+
+#define NIP_KA_TIMEOUT_SCALE_MAX 1000
+static void tcp_nip_keepalive_timeout(struct sock *sk)
+{
+    struct inet_connection_sock *icsk = inet_csk(sk);
+    struct tcp_sock *tp = tcp_sk(sk);
+    u32 keepalive_time = keepalive_time_when(tp);
+
+    if (keepalive_time > HZ) {
+        pr_crit("%s keepalive timed out, disconnecting sock.", __func__);
+        tcp_nip_write_err(sk);
+        return;
+    }
+
+    tp->nip_keepalive_timeout_scale++;
+    if (tp->nip_keepalive_timeout_scale <= NIP_KA_TIMEOUT_SCALE_MAX) {
+        icsk->icsk_probes_out = 0;
+        inet_csk_reset_keepalive_timer(sk, keepalive_time);
+
+        pr_crit("%s ms keepalive scale (%u) below threshold, keeping sock alive.",
+            __func__, tp->nip_keepalive_timeout_scale);
+    } else {
+        pr_crit("%s ms keepalive timed out (scale=%u), disconnecting sock.",
+            __func__, tp->nip_keepalive_timeout_scale);
+        tcp_nip_write_err(sk);
+    }
+}
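tcp_nip_keepalive_timeout() implements a sub-second ("ms") keepalive: it only applies when keepalive_time is at most one second's worth of jiffies, every expiry bumps nip_keepalive_timeout_scale, and the connection is torn down once the counter exceeds NIP_KA_TIMEOUT_SCALE_MAX. The tolerated silence is therefore roughly scale_max * keepalive_time. A back-of-the-envelope check, with an interval value assumed purely for illustration:

#include <stdio.h>

#define NIP_KA_TIMEOUT_SCALE_MAX 1000 /* threshold from the patch above */

int main(void)
{
    unsigned int keepalive_ms = 100; /* assumed ms-level keepalive interval */
    unsigned int scale, elapsed_ms = 0;

    /* Each timer expiry increments the scale until the threshold is crossed. */
    for (scale = 1; scale <= NIP_KA_TIMEOUT_SCALE_MAX; scale++)
        elapsed_ms += keepalive_ms;

    printf("connection dropped after ~%u ms of silence\n", elapsed_ms);
    return 0;
}

With these assumed numbers a 100 ms keepalive survives roughly 100 s of silence before the scale counter trips, which is why the code resets icsk_probes_out on each sub-threshold expiry instead of counting probes the way standard TCP keepalive does.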
+static void tcp_nip_keepalive_timer(struct timer_list *t)
+{
+    struct sock *sk = from_timer(sk, t, sk_timer);
+    struct inet_connection_sock *icsk = inet_csk(sk);
+    struct tcp_sock *tp = tcp_sk(sk);
+    u32 elapsed;
+
+    /* Only process if socket is not in use. */
+    bh_lock_sock(sk);
+    if (sock_owned_by_user(sk)) {
+        /* Try again later. */
+        inet_csk_reset_keepalive_timer(sk, HZ / TCP_NIP_KEEPALIVE_CYCLE_MS_DIVISOR);
+        goto out;
+    }
+
+    if (sk->sk_state == TCP_LISTEN) {
+        pr_err("Hmm... keepalive on a LISTEN\n");
+        goto out;
+    }
+    tcp_mstamp_refresh(tp);
+    /* 2022-02-18
+     * NewIP TCP doesn't have a TIME_WAIT state, so a socket in TCP_CLOSING
+     * uses the keepalive timer to release the socket.
+     */
+    if ((sk->sk_state == TCP_FIN_WAIT2 || sk->sk_state == TCP_CLOSING) &&
+        sock_flag(sk, SOCK_DEAD)) {
+        DEBUG("%s: finish wait, close sock\n", __func__);
+        goto death;
+    }
+
+    if (!sock_flag(sk, SOCK_KEEPOPEN) ||
+        ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)))
+        goto out;
+
+    elapsed = keepalive_time_when(tp);
+
+    /* It is alive without keepalive 8) */
+    if (tp->packets_out || !tcp_write_queue_empty(sk))
+        goto resched;
+
+    elapsed = keepalive_time_elapsed(tp);
+    if (elapsed >= keepalive_time_when(tp)) {
+        /* If the TCP_USER_TIMEOUT option is enabled, use that
+         * to determine when to timeout instead.
+         */
+        if ((icsk->icsk_user_timeout != 0 &&
+             elapsed >= msecs_to_jiffies(icsk->icsk_user_timeout) &&
+             icsk->icsk_probes_out > 0) ||
+            (icsk->icsk_user_timeout == 0 &&
+             icsk->icsk_probes_out >= keepalive_probes(tp))) {
+            tcp_nip_keepalive_timeout(sk);
+            goto out;
+        }
+        if (tcp_nip_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) {
+            icsk->icsk_probes_out++;
+            tp->idle_ka_probes_out++;
+            elapsed = keepalive_intvl_when(tp);
+        } else {
+            /* If keepalive was lost due to local congestion,
+             * try harder.
+             */
+            elapsed = TCP_RESOURCE_PROBE_INTERVAL;
+        }
+    } else {
+        /* It is tp->rcv_tstamp + keepalive_time_when(tp) */
+        elapsed = keepalive_time_when(tp) - elapsed;
+    }
+
+    sk_mem_reclaim(sk);
+
+resched:
+    inet_csk_reset_keepalive_timer(sk, elapsed);
+    goto out;
+
+death:
+    tcp_nip_done(sk);
+
+out:
+    tcp_nip_keepalive_disable(sk);
+    bh_unlock_sock(sk);
+    sock_put(sk);
+}
+
+void tcp_nip_init_xmit_timers(struct sock *sk)
+{
+    inet_csk_init_xmit_timers(sk, &tcp_nip_write_timer, &tcp_nip_delack_timer,
+                  &tcp_nip_keepalive_timer);
+}
+
+void tcp_nip_clear_xmit_timers(struct sock *sk)
+{
+    inet_csk_clear_xmit_timers(sk);
+}
diff --git a/code/net/newip/udp.c b/code/net/newip/udp.c
new file mode 100644
index 0000000000000000000000000000000000000000..f8114a1416a67a05309069b0eed2ac1bf5aeafb7
--- /dev/null
+++ b/code/net/newip/udp.c
@@ -0,0 +1,507 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022 Huawei Device Co., Ltd.
+ *
+ * NewIP INET
+ * An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. NewIP INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * The User Datagram Protocol (NewIP UDP).
+ *
+ * Based on net/ipv4/udp.c
+ * Based on net/ipv6/udp.c
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "nip_hdr.h"
+#include "nip_checksum.h"
+
+static u32 nip_udp_portaddr_hash(const struct net *net,
+                 const struct nip_addr *niaddr,
+                 u_short port)
+{
+    u32 hash;
+    u32 mix = net_hash_mix(net);
+
+    /* use nip_addr_hash() to obtain a hash result of nip_addr */
+    hash = jhash_1word(nip_addr_hash(niaddr), mix);
+
+    return hash ^ port;
+}
+
+/* Called during the bind & sendto procedure, binds ports */
+int nip_udp_get_port(struct sock *sk, unsigned short snum)
+{
+    unsigned int hash2_nulladdr, hash2_partial;
+
+    hash2_nulladdr = nip_udp_portaddr_hash(sock_net(sk),
+                           &nip_any_addr, snum);
+    /* hash2_partial is the hash result of nip_addr only */
+    hash2_partial = nip_udp_portaddr_hash(sock_net(sk),
+                          &sk->sk_nip_rcv_saddr, 0);
+
+    /* precompute partial secondary hash */
+    udp_sk(sk)->udp_portaddr_hash = hash2_partial;
+    return udp_lib_get_port(sk, snum, hash2_nulladdr);
+}
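nip_udp_portaddr_hash() mixes a 32-bit hash of the NewIP address with a per-namespace salt via jhash and XORs in the port; the result selects the secondary (address+port) hash slot used for bind-conflict checks and fast receive lookup. A user-space sketch of the same shape, where a djb-style fold and a fixed salt stand in for nip_addr_hash(), jhash_1word() and net_hash_mix(), which are kernel helpers:

#include <stdint.h>
#include <stdio.h>

/* Stand-in for nip_addr_hash(): fold the address bytes down to 32 bits. */
static uint32_t addr_hash(const uint8_t *addr, size_t len)
{
    uint32_t h = 5381;
    for (size_t i = 0; i < len; i++)
        h = h * 33 + addr[i];
    return h;
}

/* Shape of nip_udp_portaddr_hash(): hash(addr) mixed with salt, XOR port. */
static uint32_t portaddr_hash(const uint8_t *addr, size_t len,
                              uint32_t net_salt, uint16_t port)
{
    uint32_t h = addr_hash(addr, len) ^ net_salt; /* jhash_1word() in-kernel */
    return h ^ port;
}

int main(void)
{
    const uint8_t addr[2] = { 0xde, 0xad }; /* hypothetical short NewIP address */

    printf("slot hash = 0x%x\n",
           portaddr_hash(addr, sizeof(addr), 0x12345678u, 9000));
    return 0;
}

Hashing the address once with port 0 (the "partial" hash cached in udp_portaddr_hash) means only the final XOR differs per port, which is why nip_udp_get_port() can precompute it before udp_lib_get_port() probes candidate ports.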
+static int nip_udp_compute_score(struct sock *sk, struct net *net,
+                 const struct nip_addr *saddr, __be16 sport,
+                 const struct nip_addr *daddr, unsigned short hnum,
+                 int dif, int sdif)
+{
+    bool dev_match;
+    int score = 0;
+    struct inet_sock *inet;
+
+    if (!net_eq(sock_net(sk), net) ||
+        udp_sk(sk)->udp_port_hash != hnum ||
+        sk->sk_family != PF_NINET)
+        return -1;
+
+    /* Destination port of the peer device.
+     * In the header sent by the peer, it is the source port.
+     */
+    inet = inet_sk(sk);
+    if (inet->inet_dport) {
+        if (inet->inet_dport != sport)
+            return -1;
+        score++;
+    }
+
+    /* Source address of the local device.
+     * In the header sent by the peer device, it is the destination address.
+     */
+    if (!nip_addr_any(&sk->sk_nip_rcv_saddr)) {
+        if (!nip_addr_eq(&sk->sk_nip_rcv_saddr, daddr))
+            return -1;
+        score++;
+    }
+
+    /* Address of the peer device.
+     * In the header sent by the peer device, it is the source address.
+     */
+    if (!nip_addr_any(&sk->sk_nip_daddr)) {
+        if (!nip_addr_eq(&sk->sk_nip_daddr, saddr))
+            return -1;
+        score++;
+    }
+
+    /* Check the dev index */
+    if (sk->sk_bound_dev_if) {
+        dev_match = dif == sk->sk_bound_dev_if ||
+                sdif == sk->sk_bound_dev_if;
+        if (!dev_match)
+            return -1;
+        score++;
+    }
+
+    if (sk->sk_incoming_cpu == raw_smp_processor_id())
+        score++;
+    return score;
+}
+
+static struct sock *nip_udp_lib_lookup2(struct net *net,
+                    const struct nip_addr *saddr,
+                    u_short sport,
+                    const struct nip_addr *daddr,
+                    unsigned short hnum,
+                    int dif, int sdif,
+                    struct udp_hslot *hslot2,
+                    struct sk_buff *skb)
+{
+    struct sock *sk;
+    struct sock *result = NULL;
+    int score, badness;
+
+    badness = -1;
+    udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
+        score = nip_udp_compute_score(sk, net, saddr, sport, daddr,
+                          hnum, dif, sdif);
+        if (score > badness) {
+            result = sk;
+            badness = score;
+        }
+    }
+    return result;
+}
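nip_udp_compute_score() returns -1 for a definite mismatch and otherwise counts how many optional attributes (peer port, bound local address, peer address, bound device, receiving CPU) match; the lookup loops then simply keep the highest-scoring socket, so an exactly-bound socket beats a wildcard one. A compact, self-contained sketch of that selection rule (the two-field candidate struct is invented for illustration):

#include <stdio.h>

struct candidate { int bound_port; int dev_match; };

/* -1 = hard mismatch, otherwise the number of matching optional attributes */
static int score(const struct candidate *c, int want_port)
{
    int s = 0;

    if (c->bound_port) {
        if (c->bound_port != want_port)
            return -1;   /* bound to a different port: never matches */
        s++;             /* exact port binding beats a wildcard */
    }
    if (c->dev_match)
        s++;
    return s;
}

int main(void)
{
    struct candidate socks[] = {
        { 0,    0 },  /* wildcard socket: score 0 */
        { 5000, 1 },  /* exact port + device: score 2, wins */
        { 6000, 1 },  /* wrong port: score -1, skipped */
    };
    int best = -1, badness = -1;

    for (int i = 0; i < 3; i++) {
        int s = score(&socks[i], 5000);
        if (s > badness) { best = i; badness = s; }
    }
    printf("best socket index = %d (score %d)\n", best, badness);
    return 0;
}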
+struct sock *__nip_udp_lib_lookup(struct net *net,
+                  const struct nip_addr *saddr, __be16 sport,
+                  const struct nip_addr *daddr, __be16 dport,
+                  int dif, int sdif, struct udp_table *udptable,
+                  struct sk_buff *skb)
+{
+    unsigned short hnum = ntohs(dport);
+    unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
+    unsigned int old_slot2;
+    int score, badness;
+    struct sock *sk, *result;
+    struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
+
+    if (hslot->count > NIP_UDP_HSLOT_COUNT) {
+        hash2 = nip_udp_portaddr_hash(net, daddr, hnum);
+        DEBUG("hash2 is: 0x%x", hash2);
+        slot2 = hash2 & udptable->mask;
+        hslot2 = &udptable->hash2[slot2];
+        if (hslot->count < hslot2->count)
+            goto begin;
+
+        result = nip_udp_lib_lookup2(net, saddr, sport,
+                         daddr, hnum, dif, sdif,
+                         hslot2, skb);
+        if (!result) {
+            old_slot2 = slot2;
+
+            hash2 = nip_udp_portaddr_hash(net, &nip_any_addr, hnum);
+            slot2 = hash2 & udptable->mask;
+            /* avoid searching the same slot again. */
+            if (unlikely(slot2 == old_slot2))
+                return result;
+
+            hslot2 = &udptable->hash2[slot2];
+            if (hslot->count < hslot2->count)
+                goto begin;
+
+            result = nip_udp_lib_lookup2(net, saddr, sport,
+                             daddr, hnum, dif, sdif,
+                             hslot2, skb);
+        }
+        return result;
+    }
+begin:
+    result = NULL;
+    badness = -1;
+    sk_for_each_rcu(sk, &hslot->head) {
+        score = nip_udp_compute_score(sk, net, saddr, sport, daddr,
+                          hnum, dif, sdif);
+        if (score > badness) {
+            result = sk;
+            badness = score;
+        }
+        DEBUG("score is: %d", score);
+    }
+    return result;
+}
+
+static struct sock *__nip_udp_lib_lookup_skb(struct sk_buff *skb,
+                         __be16 sport, __be16 dport,
+                         struct udp_table *udptable)
+{
+    return __nip_udp_lib_lookup(dev_net(skb->dev),
+                    &NIPCB(skb)->srcaddr, sport,
+                    &NIPCB(skb)->dstaddr, dport, skb->skb_iif,
+                    0, udptable, skb);
+}
+
+void udp_table_del(struct sock *sk)
+{
+    udp_lib_unhash(sk);
+}
+
+int nip_udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+            int noblock, int flags, int *addr_len)
+{
+    struct sk_buff *skb;
+    unsigned int ulen, copied, datalen;
+    int peeking, off;
+    int err;
+
+    off = sk_peek_offset(sk, flags);
+    peeking = off; /* Fetch the SKB from the queue */
+    skb = __skb_recv_udp(sk, flags, noblock, &off, &err);
+    if (!skb)
+        return err;
+    ulen = skb->len;
+    copied = len;
+    if (copied > ulen - off)
+        copied = ulen - off;
+    else if (copied < ulen)
+        msg->msg_flags |= MSG_TRUNC;
+
+    /* copy data */
+    datalen = copy_to_iter(skb->data, copied, &msg->msg_iter);
+    if (datalen < copied) {
+        /* copy_to_iter() returns the number of bytes copied; the unsigned
+         * result can never be negative, so a short copy is detected by
+         * comparing against the requested length
+         */
+        DEBUG("%s: copy_to_iter failed! datalen=%u, copied=%u",
+              __func__, datalen, copied);
+        err = -EFAULT;
+        return err;
+    }
+
+    sock_recv_ts_and_drops(msg, sk, skb);
+    /* Update information such as the timestamp received
+     * by the last datagram in the transport control block
+     */
+    /* copy the address */
+    if (msg->msg_name) {
+        DECLARE_SOCKADDR(struct sockaddr_nin *, sin, msg->msg_name);
+
+        sin->sin_family = AF_NINET;
+        sin->sin_port = udp_hdr(skb)->source;
+        sin->sin_addr = NIPCB(skb)->srcaddr;
+        *addr_len = sizeof(*sin);
+    }
+
+    err = copied;
+    if (flags & MSG_TRUNC)
+        err = ulen;
+
+    skb_consume_udp(sk, skb, peeking ? -err : err);
+    return err;
+}
+
+static void nip_udp_err(struct sk_buff *skb,
+            struct ninet_skb_parm *opt,
+            u8 type,
+            u8 code, int offset,
+            __be32 info)
+{
+}
+
+static int __nip_udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+    int rc;
+
+    sk_incoming_cpu_update(sk);
+
+    rc = __udp_enqueue_schedule_skb(sk, skb);
+    if (rc < 0) {
+        kfree_skb(skb);
+        return -1;
+    }
+    return 0;
+}
+
+bool nip_get_udp_input_checksum(struct sk_buff *skb)
+{
+    struct nip_pseudo_header nph = {0};
+    struct udphdr *udphead = udp_hdr(skb);
+    unsigned short check_len = ntohs(udphead->len);
+
+    nph.nexthdr = NIPCB(skb)->nexthdr;
+    nph.saddr = NIPCB(skb)->srcaddr;
+    nph.daddr = NIPCB(skb)->dstaddr;
+    nph.check_len = udphead->len;
+
+    return nip_check_sum_parse(skb_transport_header(skb), check_len, &nph) == 0xffff;
+}
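nip_get_udp_input_checksum() validates the UDP checksum over a NewIP pseudo-header (next header, source/destination address, length) plus the transport header and payload: a packet verifies when the ones'-complement sum of everything, including the received checksum field, folds to 0xffff. A self-contained sketch of that rule; the byte layout below is invented for illustration and is not the NewIP wire format:

#include <stdint.h>
#include <stdio.h>

/* Ones'-complement sum over 16-bit words, as in Internet checksums. */
static uint16_t csum16(const uint8_t *data, size_t len, uint32_t sum)
{
    for (size_t i = 0; i + 1 < len; i += 2)
        sum += (uint32_t)data[i] << 8 | data[i + 1];
    if (len & 1)
        sum += (uint32_t)data[len - 1] << 8; /* pad the odd trailing byte */
    while (sum >> 16)
        sum = (sum & 0xffff) + (sum >> 16);  /* fold carries back in */
    return (uint16_t)sum;
}

int main(void)
{
    /* Illustrative pseudo-header and UDP segment (checksum field at seg[6..7]). */
    uint8_t pseudo[4] = { 0x11, 0x00, 0x00, 0x0c };
    uint8_t seg[12]   = { 0x23, 0x28, 0x00, 0x35, 0x00, 0x0c,
                          0x00, 0x00, 'h', 'i', '!', 0 };

    /* Sender: checksum over pseudo-header + segment with the field zeroed. */
    uint16_t c = (uint16_t)~csum16(seg, sizeof(seg), csum16(pseudo, sizeof(pseudo), 0));
    seg[6] = c >> 8;
    seg[7] = c & 0xff;

    /* Receiver rule from nip_get_udp_input_checksum(): total must be 0xffff. */
    uint16_t total = csum16(seg, sizeof(seg), csum16(pseudo, sizeof(pseudo), 0));
    printf("packet %s\n", total == 0xffff ? "accepted" : "dropped");
    return 0;
}

Because sum + ~sum always folds to 0xffff in ones'-complement arithmetic, re-summing with the filled-in checksum field yields 0xffff for an intact packet, which is exactly the comparison the kernel function performs.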
+/* UDP packets are received at the network layer */
+int nip_udp_input(struct sk_buff *skb)
+{
+    struct sock *sk;
+    int rc = 0;
+    struct udphdr *udphead = udp_hdr(skb);
+
+    if (!nip_get_udp_input_checksum(skb)) {
+        DEBUG("%s: checksum failed, drop the packet.", __func__);
+        kfree_skb(skb);
+        rc = -1;
+        goto end;
+    }
+
+    sk = __nip_udp_lib_lookup_skb(skb, udphead->source,
+                      udphead->dest, &udp_table);
+    if (!sk) {
+        DEBUG("%s: dport not match, drop the packet. sport=%u, dport=%u, data_len=%u.",
+              __func__, ntohs(udphead->source), ntohs(udphead->dest), ntohs(udphead->len));
+        kfree_skb(skb);
+        rc = -1;
+        goto end;
+    }
+
+    skb_pull(skb, sizeof(struct udphdr));
+    skb->len = ntohs(udphead->len) - sizeof(struct udphdr);
+
+    skb_dst_drop(skb);
+    /* enqueue */
+    rc = __nip_udp_queue_rcv_skb(sk, skb);
+end:
+    return rc;
+}
+
+int nip_udp_output(struct sock *sk, struct msghdr *msg, size_t len)
+{
+    DECLARE_SOCKADDR(struct sockaddr_nin *, sin, msg->msg_name);
+    struct flow_nip fln;
+    u_short sport, dport;
+    struct dst_entry *dst;
+    int err = 0;
+    struct inet_sock *inet;
+
+    if (!sin) {
+        /* The UDP socket connect() path is not implemented yet, so the
+         * destination address and port must be provided by sendto().
+         * The NULL check must come before any dereference of sin.
+         */
+        return -EDESTADDRREQ;
+    }
+    if (sin->sin_family != AF_NINET) {
+        DEBUG("%s: invalid sin_family.", __func__);
+        return -EAFNOSUPPORT;
+    }
+    if (nip_addr_invalid(&sin->sin_addr)) {
+        DEBUG("%s: invalid sin_addr.", __func__);
+        return -EFAULT;
+    }
+
+    inet = inet_sk(sk);
+    /* Destination address and port (network byte order) come from sendto() */
+    dport = sin->sin_port;
+    fln.daddr = sin->sin_addr;
+    sport = htons(inet->inet_num);
+
+    /* Check the dev index */
+    fln.flowin_oif = sk->sk_bound_dev_if;
+
+    /* Query the route & obtain the saddr */
+    dst = nip_sk_dst_lookup_flow(sk, &fln);
+    if (IS_ERR(dst)) {
+        err = PTR_ERR(dst);
+        dst = NULL;
+        goto out;
+    }
+
+    err = _nip_udp_output(sk, msg, len,
+                  sizeof(struct udphdr), &fln.saddr,
+                  sport, &fln.daddr,
+                  dport, dst);
+
+out:
+    dst_release(dst);
+    if (!err)
+        return len;
+
+    return err;
+}
+
+/* Release resources when the socket is closed */
+void nip_udp_destroy_sock(struct sock *sk)
+{
+    udp_table_del(sk);
+    ninet_destroy_sock(sk);
+}
+
+/* socket option code for udp */
+int nip_udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
+               unsigned int optlen)
+{
+    if (level == SOL_UDP || level == SOL_UDPLITE)
+        return 0;
+    return nip_setsockopt(sk, level, optname, optval, optlen);
+}
+
+int nip_udp_getsockopt(struct sock *sk, int level,
+               int optname, char __user *optval,
+               int __user *optlen)
+{
+    if (level == SOL_UDP || level == SOL_UDPLITE)
+        return 0;
+    return nip_getsockopt(sk, level, optname, optval, optlen);
+}
+
+static const struct ninet_protocol nip_udp_protocol = {
+    .handler     = nip_udp_input,
+    .err_handler = nip_udp_err,
+    .flags       = 0,
+};
+
+int udp_stub_hash(struct sock *sk)
+{
+    return 0;
+}
+
+void udp_stub_unhash(struct sock *sk)
+{
+}
+
+void udp_stub_rehash(struct sock *sk)
+{
+}
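Taken together, nip_udp_output() and nip_udp_recvmsg() give NewIP UDP the ordinary BSD datagram API, with struct sockaddr_nin playing the role of sockaddr_in. A hedged user-space sketch of a one-shot sender follows; the uapi header path is an assumption, the address value is left unfilled because the NewIP address format is out of scope here, and AF_NINET is the family value (45) added by the kernel patch further below:

/* Hypothetical user-space sender; assumes a uapi header exporting
 * AF_NINET, struct sockaddr_nin and struct nip_addr.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>
#include <linux/nip.h>              /* assumed uapi export; path may differ */

int main(void)
{
    int fd = socket(AF_NINET, SOCK_DGRAM, IPPROTO_UDP);
    if (fd < 0) {
        perror("socket(AF_NINET)");  /* kernel built without NewIP support */
        return 1;
    }

    struct sockaddr_nin dst;
    memset(&dst, 0, sizeof(dst));
    dst.sin_family = AF_NINET;
    dst.sin_port = htons(9000);
    /* dst.sin_addr = ...; a NewIP address must be filled in here */

    /* connect() is unimplemented for NewIP UDP, so sendto() is mandatory;
     * a NULL destination yields -EDESTADDRREQ, as nip_udp_output() shows.
     */
    if (sendto(fd, "hi", 2, 0, (struct sockaddr *)&dst, sizeof(dst)) < 0)
        perror("sendto");

    close(fd);
    return 0;
}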
+/* NewIP UDP operations */
+struct proto nip_udp_prot = {
+    .name = "nip_udp",
+    .owner = THIS_MODULE,
+    .close = udp_lib_close,
+    .connect = nip_datagram_connect,
+    .disconnect = udp_disconnect,
+    .ioctl = udp_ioctl,
+    .init = udp_init_sock,
+    .destroy = nip_udp_destroy_sock,
+    .setsockopt = nip_udp_setsockopt,
+    .getsockopt = nip_udp_getsockopt,
+    .sendmsg = nip_udp_output,
+    .recvmsg = nip_udp_recvmsg,
+    .backlog_rcv = __nip_udp_queue_rcv_skb,
+    .release_cb = nip_datagram_release_cb,
+    .hash = udp_lib_hash,
+    .unhash = udp_lib_unhash,
+    .rehash = udp_stub_rehash,
+    .get_port = nip_udp_get_port,
+    .memory_allocated = &udp_memory_allocated,
+    .sysctl_mem = sysctl_udp_mem,
+    .obj_size = sizeof(struct nip_udp_sock),
+    .h.udp_table = &udp_table,
+    .diag_destroy = udp_abort,
+};
+
+/* Registration information used to create NewIP UDP sockets */
+static struct inet_protosw nip_udp_protosw = {
+    .type = SOCK_DGRAM,
+    .protocol = IPPROTO_UDP,
+    .prot = &nip_udp_prot,
+    .ops = &ninet_dgram_ops,
+    .flags = INET_PROTOSW_PERMANENT,
+};
+
+/* Called during AF_NINET initialization */
+int __init nip_udp_init(void)
+{
+    int ret;
+
+    ret = ninet_add_protocol(&nip_udp_protocol, IPPROTO_UDP);
+    if (ret)
+        goto out;
+
+    ret = ninet_register_protosw(&nip_udp_protosw);
+    if (ret)
+        goto out_nip_udp_protocol;
+out:
+    return ret;
+
+out_nip_udp_protocol:
+    ninet_del_protocol(&nip_udp_protocol, IPPROTO_UDP);
+    goto out;
+}
+
+void nip_udp_exit(void)
+{
+    ninet_unregister_protosw(&nip_udp_protosw);
+    ninet_del_protocol(&nip_udp_protocol, IPPROTO_UDP);
+}
diff --git a/patches/hispark_taurus.flag b/patches/hispark_taurus.flag
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/patches/linux-5.10/newip.patch b/patches/linux-5.10/newip.patch
new file mode 100644
index 0000000000000000000000000000000000000000..b1b62edce4f9ebce0309f280647d5d2484c0f8ce
--- /dev/null
+++ b/patches/linux-5.10/newip.patch
@@ -0,0 +1,486 @@
+diff -Naur old/include/linux/netdevice.h new/include/linux/netdevice.h
+--- old/include/linux/netdevice.h	2022-07-15 20:22:22.732134658 +0800
++++ new/include/linux/netdevice.h	2022-07-15 20:22:31.002134775 +0800
+@@ -2016,6 +2016,9 @@
+ 	struct dn_dev __rcu *dn_ptr;
+ #endif
+ 	struct inet6_dev __rcu *ip6_ptr;
++#if IS_ENABLED(CONFIG_NEWIP)
++	struct ninet_dev __rcu *nip_ptr;	/* NIP */
++#endif
+ #if IS_ENABLED(CONFIG_AX25)
+ 	void *ax25_ptr;
+ #endif
+diff -Naur old/include/linux/socket.h new/include/linux/socket.h
+--- old/include/linux/socket.h	2022-07-15 20:22:22.732134658 +0800
++++ new/include/linux/socket.h	2022-07-15 20:22:31.002134775 +0800
+@@ -223,8 +223,8 @@
+  * reuses AF_INET address family
+  */
+ #define AF_XDP		44	/* XDP sockets */
+-
+-#define AF_MAX		45	/* For now.. */
++#define AF_NINET	45	/* NIP */
++#define AF_MAX		46	/* For now.. */
+ 
+ /* Protocol families, same as address families. */
+ #define PF_UNSPEC	AF_UNSPEC
+@@ -274,6 +274,7 @@
+ #define PF_QIPCRTR	AF_QIPCRTR
+ #define PF_SMC		AF_SMC
+ #define PF_XDP		AF_XDP
++#define PF_NINET	AF_NINET	/* NIP */
+ #define PF_MAX		AF_MAX
+ 
+ /* Maximum queue length specifiable by listen. */
+diff -Naur old/include/linux/tcp.h new/include/linux/tcp.h
+--- old/include/linux/tcp.h	2022-07-15 20:22:22.732134658 +0800
++++ new/include/linux/tcp.h	2022-07-15 20:22:31.002134775 +0800
+@@ -317,6 +317,9 @@
+ 
+ 	/* OOO segments go in this rbtree. Socket lock must be held.
*/ + struct rb_root out_of_order_queue; ++#ifdef CONFIG_NEWIP ++ struct sk_buff *nip_out_of_order_queue; /* NIP */ ++#endif + struct sk_buff *ooo_last_skb; /* cache rb_last(out_of_order_queue) */ + + /* SACKs data, these 2 need to be together (see tcp_options_write) */ +@@ -412,6 +415,17 @@ + */ + struct request_sock __rcu *fastopen_rsk; + struct saved_syn *saved_syn; ++ ++#ifdef CONFIG_NEWIP ++/* newip tcp retrans */ ++ u32 ack_retrans_num; ++ u32 ack_retrans_seq; ++ u32 nip_ssthresh; ++ u32 nip_ssthresh_reset; ++ bool nip_keepalive_enable; ++ u32 idle_ka_probes_out; ++ u32 nip_keepalive_timeout_scale; ++#endif + }; + + enum tsq_enum { +@@ -423,6 +437,10 @@ + TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call + * tcp_v{4|6}_mtu_reduced() + */ ++#ifdef CONFIG_NEWIP ++ TCP_NIP_WRITE_TIMER_DEFERRED, /* NIP */ ++ TCP_NIP_DELACK_TIMER_DEFERRED, /* NIP */ ++#endif + }; + + enum tsq_flags { +diff -Naur old/include/net/dst.h new/include/net/dst.h +--- old/include/net/dst.h 2022-07-15 20:22:22.732134658 +0800 ++++ new/include/net/dst.h 2022-07-15 20:22:31.002134775 +0800 +@@ -35,6 +35,8 @@ + int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb); + + unsigned short flags; ++ ++#define DST_HOST 0x0001 /* NIP */ + #define DST_NOXFRM 0x0002 + #define DST_NOPOLICY 0x0004 + #define DST_NOCOUNT 0x0008 +diff -Naur old/include/net/inet_hashtables.h new/include/net/inet_hashtables.h +--- old/include/net/inet_hashtables.h 2022-07-15 20:22:22.732134658 +0800 ++++ new/include/net/inet_hashtables.h 2022-07-15 20:22:31.002134775 +0800 +@@ -83,6 +83,9 @@ + #if IS_ENABLED(CONFIG_IPV6) + struct in6_addr fast_v6_rcv_saddr; + #endif ++#if IS_ENABLED(CONFIG_NEWIP) ++ struct nip_addr fast_nip_rcv_saddr; ++#endif + __be32 fast_rcv_saddr; + unsigned short fast_sk_family; + bool fast_ipv6_only; +@@ -402,6 +405,13 @@ + const struct in6_addr *laddr, const u16 lport, + const struct in6_addr *faddr, const __be16 fport); + ++#ifdef CONFIG_NEWIP ++/* NIP */ ++u32 ninet_ehashfn(const struct net *net, ++ const struct nip_addr *laddr, const u16 lport, ++ const struct nip_addr *faddr, const __be16 fport); ++#endif ++ + static inline void sk_daddr_set(struct sock *sk, __be32 addr) + { + sk->sk_daddr = addr; /* alias of inet_daddr */ +diff -Naur old/include/net/inet_sock.h new/include/net/inet_sock.h +--- old/include/net/inet_sock.h 2022-07-15 20:22:22.732134658 +0800 ++++ new/include/net/inet_sock.h 2022-07-15 20:22:31.002134775 +0800 +@@ -73,6 +73,10 @@ + #define ir_rmt_port req.__req_common.skc_dport + #define ir_v6_rmt_addr req.__req_common.skc_v6_daddr + #define ir_v6_loc_addr req.__req_common.skc_v6_rcv_saddr ++ ++#define ir_nip_rmt_addr req.__req_common.nip_daddr /* NIP */ ++#define ir_nip_loc_addr req.__req_common.nip_rcv_saddr /* NIP */ ++ + #define ir_iif req.__req_common.skc_bound_dev_if + #define ir_cookie req.__req_common.skc_cookie + #define ireq_net req.__req_common.skc_net +@@ -97,6 +101,12 @@ + struct sk_buff *pktopts; + }; + #endif ++#if IS_ENABLED(CONFIG_NEWIP) ++ /* NIP */ ++ struct { ++ struct sk_buff *nip_pktopts; ++ }; ++#endif + }; + }; + +diff -Naur old/include/net/neighbour.h new/include/net/neighbour.h +--- old/include/net/neighbour.h 2022-07-15 20:22:22.732134658 +0800 ++++ new/include/net/neighbour.h 2022-07-15 20:22:31.002134775 +0800 +@@ -232,6 +232,9 @@ + NEIGH_ARP_TABLE = 0, + NEIGH_ND_TABLE = 1, + NEIGH_DN_TABLE = 2, ++#ifdef CONFIG_NEWIP ++ NEIGH_NND_TABLE = 3, /* NIP */ ++#endif + NEIGH_NR_TABLES, + NEIGH_LINK_TABLE = NEIGH_NR_TABLES /* Pseudo table for neigh_xmit */ + }; 
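The inet_hashtables.h hunk above declares ninet_ehashfn(), the NewIP flavour of the established-table hash over the (local address, local port, remote address, remote port) 4-tuple. A sketch of the shape such a function takes; the mixing function and per-boot secret are stand-ins, since the real implementation would typically use jhash or siphash with a boot-time key, as the secure_seq.c hunk further down does for sequence numbers:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct nip_addr_toy { uint8_t bytes[8]; }; /* stand-in for struct nip_addr */

static uint32_t mix(uint32_t h, uint32_t v)
{
    h ^= v;
    h *= 0x9e3779b1u; /* Fibonacci-hashing constant; illustrative only */
    return h ^ (h >> 16);
}

/* Shape of ninet_ehashfn(): hash the full 4-tuple plus a per-boot secret. */
static uint32_t toy_ehashfn(const struct nip_addr_toy *laddr, uint16_t lport,
                            const struct nip_addr_toy *faddr, uint16_t fport,
                            uint32_t secret)
{
    uint32_t h = secret, w;

    for (size_t i = 0; i < sizeof(laddr->bytes); i += 4) {
        memcpy(&w, &laddr->bytes[i], 4); h = mix(h, w);
        memcpy(&w, &faddr->bytes[i], 4); h = mix(h, w);
    }
    return mix(h, (uint32_t)lport << 16 | fport);
}

int main(void)
{
    struct nip_addr_toy l = { { 1 } }, f = { { 2 } };

    printf("ehash slot = 0x%x\n", toy_ehashfn(&l, 5000, &f, 9000, 0xdeadbeefu));
    return 0;
}

Keying the hash with a secret keeps the established-table bucket placement unpredictable to remote peers, which is the same hardening rationale behind the siphash-based helpers added to secure_seq.c below.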
+diff -Naur old/include/net/net_namespace.h new/include/net/net_namespace.h +--- old/include/net/net_namespace.h 2022-07-15 20:22:22.732134658 +0800 ++++ new/include/net/net_namespace.h 2022-07-15 20:22:31.002134775 +0800 +@@ -38,6 +38,9 @@ + #include + #include + #include ++#ifdef CONFIG_NEWIP ++#include ++#endif + + struct user_namespace; + struct proc_dir_entry; +@@ -127,6 +130,9 @@ + #if IS_ENABLED(CONFIG_IPV6) + struct netns_ipv6 ipv6; + #endif ++#if IS_ENABLED(CONFIG_NEWIP) ++ struct netns_newip newip; /* NIP */ ++#endif + #if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN) + struct netns_ieee802154_lowpan ieee802154_lowpan; + #endif +diff -Naur old/include/net/secure_seq.h new/include/net/secure_seq.h +--- old/include/net/secure_seq.h 2022-07-15 20:22:22.732134658 +0800 ++++ new/include/net/secure_seq.h 2022-07-15 20:22:31.002134775 +0800 +@@ -19,4 +19,11 @@ + u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, + __be16 sport, __be16 dport); + ++#ifdef CONFIG_NEWIP ++/* NIP */ ++u64 secure_newip_port_ephemeral(const __be32 *saddr, const __be32 *daddr, ++ __be16 dport); ++__u32 secure_tcp_nip_sequence_number(const __be32 *saddr, const __be32 *daddr, ++ __be16 sport, __be16 dport); ++#endif + #endif /* _NET_SECURE_SEQ */ +diff -Naur old/include/net/sock.h new/include/net/sock.h +--- old/include/net/sock.h 2022-07-15 20:22:22.732134658 +0800 ++++ new/include/net/sock.h 2022-07-15 20:22:31.012134775 +0800 +@@ -68,6 +68,9 @@ + #include + #include + #include ++#ifdef CONFIG_NEWIP ++#include /* NIP */ ++#endif + + /* + * This structure really needs to be cleaned up. +@@ -202,6 +205,11 @@ + struct in6_addr skc_v6_rcv_saddr; + #endif + ++#if IS_ENABLED(CONFIG_NEWIP) ++ struct nip_addr nip_daddr; /* NIP */ ++ struct nip_addr nip_rcv_saddr; /* NIP */ ++#endif ++ + atomic64_t skc_cookie; + + /* following fields are padding to force +@@ -379,6 +387,8 @@ + #define sk_net __sk_common.skc_net + #define sk_v6_daddr __sk_common.skc_v6_daddr + #define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr ++#define sk_nip_daddr __sk_common.nip_daddr /* NIP */ ++#define sk_nip_rcv_saddr __sk_common.nip_rcv_saddr /* NIP */ + #define sk_cookie __sk_common.skc_cookie + #define sk_incoming_cpu __sk_common.skc_incoming_cpu + #define sk_flags __sk_common.skc_flags +diff -Naur old/include/net/tcp.h new/include/net/tcp.h +--- old/include/net/tcp.h 2022-07-15 20:22:22.742134658 +0800 ++++ new/include/net/tcp.h 2022-07-15 20:22:31.012134775 +0800 +@@ -40,7 +40,9 @@ + #include + #include + #include +- ++#ifdef CONFIG_NEWIP ++#include /* NIP */ ++#endif + #include + #include + #include +@@ -869,6 +871,9 @@ + #if IS_ENABLED(CONFIG_IPV6) + struct inet6_skb_parm h6; + #endif ++#if IS_ENABLED(CONFIG_NEWIP) ++ struct ninet_skb_parm hnip; /* NIP */ ++#endif + } header; /* For incoming skbs */ + struct { + __u32 flags; +diff -Naur old/include/uapi/linux/if_ether.h new/include/uapi/linux/if_ether.h +--- old/include/uapi/linux/if_ether.h 2022-07-15 20:22:22.742134658 +0800 ++++ new/include/uapi/linux/if_ether.h 2022-07-15 20:22:31.012134775 +0800 +@@ -72,6 +72,7 @@ + #define ETH_P_ERSPAN 0x88BE /* ERSPAN type II */ + #define ETH_P_IPX 0x8137 /* IPX over DIX */ + #define ETH_P_IPV6 0x86DD /* IPv6 over bluebook */ ++#define ETH_P_NEWIP 0xEADD /* NIP */ + #define ETH_P_PAUSE 0x8808 /* IEEE Pause frames. See 802.3 31B */ + #define ETH_P_SLOW 0x8809 /* Slow Protocol. 
See 802.3ad 43B */ + #define ETH_P_WCCP 0x883E /* Web-cache coordination protocol +diff -Naur old/net/Kconfig new/net/Kconfig +--- old/net/Kconfig 2022-07-15 20:22:22.742134658 +0800 ++++ new/net/Kconfig 2022-07-15 20:22:31.012134775 +0800 +@@ -93,6 +93,7 @@ + if INET + source "net/ipv4/Kconfig" + source "net/ipv6/Kconfig" ++source "net/newip/Kconfig" + source "net/netlabel/Kconfig" + source "net/mptcp/Kconfig" + +diff -Naur old/net/Makefile new/net/Makefile +--- old/net/Makefile 2022-07-15 20:22:22.742134658 +0800 ++++ new/net/Makefile 2022-07-15 20:22:31.012134775 +0800 +@@ -20,6 +20,7 @@ + obj-$(CONFIG_XFRM) += xfrm/ + obj-$(CONFIG_UNIX_SCM) += unix/ + obj-$(CONFIG_NET) += ipv6/ ++obj-$(CONFIG_NET) += newip/ + obj-$(CONFIG_BPFILTER) += bpfilter/ + obj-$(CONFIG_PACKET) += packet/ + obj-$(CONFIG_NET_KEY) += key/ +diff -Naur old/net/core/neighbour.c new/net/core/neighbour.c +--- old/net/core/neighbour.c 2022-07-15 20:22:22.742134658 +0800 ++++ new/net/core/neighbour.c 2022-07-15 20:22:31.012134775 +0800 +@@ -1779,6 +1779,11 @@ + case AF_DECnet: + tbl = neigh_tables[NEIGH_DN_TABLE]; + break; ++#ifdef CONFIG_NEWIP ++ case AF_NINET: /* NIP */ ++ tbl = neigh_tables[NEIGH_NND_TABLE]; ++ break; ++#endif + } + + return tbl; +diff -Naur old/net/core/secure_seq.c new/net/core/secure_seq.c +--- old/net/core/secure_seq.c 2022-07-15 20:22:22.742134658 +0800 ++++ new/net/core/secure_seq.c 2022-07-15 20:22:31.012134775 +0800 +@@ -151,6 +151,51 @@ + EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral); + #endif + ++#ifdef CONFIG_NEWIP ++/* NIP */ ++__u32 secure_tcp_nip_sequence_number(const __be32 *saddr, const __be32 *daddr, ++ __be16 sport, __be16 dport) ++{ ++ const struct { ++ struct nip_addr saddr; ++ struct nip_addr daddr; ++ __be16 sport; ++ __be16 dport; ++ } __aligned(SIPHASH_ALIGNMENT) combined = { ++ .saddr = *(struct nip_addr *)saddr, ++ .daddr = *(struct nip_addr *)daddr, ++ .sport = sport, ++ .dport = dport, ++ }; ++ u32 hash; ++ ++ net_secret_init(); ++ hash = siphash(&combined, offsetofend(typeof(combined), dport), ++ &net_secret); ++ return seq_scale(hash); ++} ++EXPORT_SYMBOL_GPL(secure_tcp_nip_sequence_number); ++ ++/* NIP */ ++u64 secure_newip_port_ephemeral(const __be32 *saddr, const __be32 *daddr, ++ __be16 dport) ++{ ++ const struct { ++ struct nip_addr saddr; ++ struct nip_addr daddr; ++ __be16 dport; ++ } __aligned(SIPHASH_ALIGNMENT) combined = { ++ .saddr = *(struct nip_addr *)saddr, ++ .daddr = *(struct nip_addr *)daddr, ++ .dport = dport, ++ }; ++ net_secret_init(); ++ return siphash(&combined, offsetofend(typeof(combined), dport), ++ &net_secret); ++} ++EXPORT_SYMBOL_GPL(secure_newip_port_ephemeral); ++#endif ++ + #if IS_ENABLED(CONFIG_IP_DCCP) + u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, + __be16 sport, __be16 dport) +diff -Naur old/net/ipv4/inet_connection_sock.c new/net/ipv4/inet_connection_sock.c +--- old/net/ipv4/inet_connection_sock.c 2022-07-15 20:22:22.742134658 +0800 ++++ new/net/ipv4/inet_connection_sock.c 2022-07-15 20:22:31.012134775 +0800 +@@ -22,7 +22,34 @@ + #include + #include + #include ++#if IS_ENABLED(CONFIG_NEWIP) ++#include ++#include ++#endif + ++#if IS_ENABLED(CONFIG_NEWIP) ++/* only match New IP sock ++ * match_sk*_wildcard == true: NIP_ADDR_ANY equals to any New IP addresses ++ * ++ * match_sk*_wildcard == false: addresses must be exactly the same, i.e. 
++ * NIP_ADDR_ANY only equals to NIP_ADDR_ANY ++ */ ++static bool nip_rcv_saddr_equal(const struct nip_addr *sk1_rcv_saddr, ++ const struct nip_addr *sk2_rcv_saddr, ++ bool sk2_isnewip, ++ bool match_sk1_wildcard, ++ bool match_sk2_wildcard) ++{ ++ if (!sk2_isnewip) ++ return false; ++ if (nip_addr_eq(sk1_rcv_saddr, sk2_rcv_saddr)) ++ return true; ++ return (match_sk1_wildcard && ++ nip_addr_eq(sk1_rcv_saddr, &nip_any_addr)) || ++ (match_sk2_wildcard && ++ nip_addr_eq(sk2_rcv_saddr, &nip_any_addr)); ++} ++#endif + #if IS_ENABLED(CONFIG_IPV6) + /* match_sk*_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses + * if IPv6 only, and any IPv4 addresses +@@ -102,6 +129,16 @@ + match_wildcard, + match_wildcard); + #endif ++ ++#if IS_ENABLED(CONFIG_NEWIP) ++ if (sk->sk_family == AF_NINET) ++ return nip_rcv_saddr_equal(&sk->sk_nip_rcv_saddr, ++ &sk2->sk_nip_rcv_saddr, ++ sk2->sk_family == AF_NINET, ++ match_wildcard, ++ match_wildcard); ++#endif ++ + return ipv4_rcv_saddr_equal(sk->sk_rcv_saddr, sk2->sk_rcv_saddr, + ipv6_only_sock(sk2), match_wildcard, + match_wildcard); +@@ -292,6 +329,13 @@ + tb->fast_ipv6_only, + ipv6_only_sock(sk), true, false); + #endif ++#if IS_ENABLED(CONFIG_NEWIP) ++ if (tb->fast_sk_family == AF_NINET) ++ return nip_rcv_saddr_equal(&tb->fast_nip_rcv_saddr, ++ &sk->sk_nip_rcv_saddr, ++ sk->sk_family == AF_NINET, ++ true, false); ++#endif + return ipv4_rcv_saddr_equal(tb->fast_rcv_saddr, sk->sk_rcv_saddr, + ipv6_only_sock(sk), true, false); + } +@@ -313,6 +357,9 @@ + #if IS_ENABLED(CONFIG_IPV6) + tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; + #endif ++#if IS_ENABLED(CONFIG_NEWIP) ++ tb->fast_nip_rcv_saddr = sk->sk_nip_rcv_saddr; ++#endif + } else { + tb->fastreuseport = 0; + } +@@ -340,6 +387,9 @@ + #if IS_ENABLED(CONFIG_IPV6) + tb->fast_v6_rcv_saddr = sk->sk_v6_rcv_saddr; + #endif ++#if IS_ENABLED(CONFIG_NEWIP) ++ tb->fast_nip_rcv_saddr = sk->sk_nip_rcv_saddr; ++#endif + } + } else { + tb->fastreuseport = 0; +diff -Naur old/net/ipv4/inet_hashtables.c new/net/ipv4/inet_hashtables.c +--- old/net/ipv4/inet_hashtables.c 2022-07-15 20:22:22.742134658 +0800 ++++ new/net/ipv4/inet_hashtables.c 2022-07-15 20:22:31.012134775 +0800 +@@ -52,6 +52,15 @@ + &sk->sk_v6_rcv_saddr, sk->sk_num, + &sk->sk_v6_daddr, sk->sk_dport); + #endif ++ ++#if IS_ENABLED(CONFIG_NEWIP) ++ /* NIP */ ++ if (sk->sk_family == AF_NINET) ++ return ninet_ehashfn(sock_net(sk), ++ &sk->sk_nip_rcv_saddr, sk->sk_num, ++ &sk->sk_nip_daddr, sk->sk_dport); ++#endif ++ + return inet_ehashfn(sock_net(sk), + sk->sk_rcv_saddr, sk->sk_num, + sk->sk_daddr, sk->sk_dport); +diff -Naur old/security/selinux/hooks.c new/security/selinux/hooks.c +--- old/security/selinux/hooks.c 2022-07-15 20:22:22.742134658 +0800 ++++ new/security/selinux/hooks.c 2022-07-15 20:22:31.012134775 +0800 +@@ -1271,7 +1271,7 @@ + return SECCLASS_SMC_SOCKET; + case PF_XDP: + return SECCLASS_XDP_SOCKET; +-#if PF_MAX > 45 ++#if PF_MAX > 46 + #error New address family defined, please update this function. + #endif + } +diff -Naur old/security/selinux/include/classmap.h new/security/selinux/include/classmap.h +--- old/security/selinux/include/classmap.h 2022-07-15 20:22:22.742134658 +0800 ++++ new/security/selinux/include/classmap.h 2022-07-15 20:22:31.012134775 +0800 +@@ -253,6 +253,6 @@ + { NULL } + }; + +-#if PF_MAX > 45 ++#if PF_MAX > 46 + #error New address family defined, please update secclass_map. 
+ #endif diff --git a/patches/rk3568.flag b/patches/rk3568.flag new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
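The if_ether.h hunk in the patch above registers 0xEADD as ETH_P_NEWIP, so NewIP frames are distinguishable at layer 2. For protocol debugging, NewIP traffic can therefore be captured with an ordinary packet socket, as the sketch below shows (requires CAP_NET_RAW; binding to a specific interface is omitted for brevity):

#include <arpa/inet.h>
#include <linux/if_packet.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

#define ETH_P_NEWIP 0xEADD /* value added by the patch above */

int main(void)
{
    /* SOCK_RAW on AF_PACKET delivers whole frames for the given ethertype. */
    int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_NEWIP));
    if (fd < 0) {
        perror("socket(AF_PACKET)");
        return 1;
    }

    unsigned char frame[2048];
    ssize_t n = recv(fd, frame, sizeof(frame), 0); /* blocks for one NewIP frame */
    if (n >= 0)
        printf("captured a %zd-byte NewIP frame\n", n);

    close(fd);
    return 0;
}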