diff --git a/include/linux/newip_route.h b/include/linux/newip_route.h new file mode 100755 index 0000000000000000000000000000000000000000..05ad294e14367d150e644f5eef5efcf3c6f34a9f --- /dev/null +++ b/include/linux/newip_route.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Linux newIP implementation + * + */ +#ifndef _LINUX_NEWIP_ROUTE_H +#define _LINUX_NEWIP_ROUTE_H + +#include + +#endif + diff --git a/include/linux/nin.h b/include/linux/nin.h new file mode 100755 index 0000000000000000000000000000000000000000..ea548bcddd4c152a4a216c2cbc29adec44f97b23 --- /dev/null +++ b/include/linux/nin.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _LINUX_NIN_H +#define _LINUX_NIN_H + +#include + +#endif diff --git a/include/linux/nip.h b/include/linux/nip.h new file mode 100755 index 0000000000000000000000000000000000000000..c49366035f303340b0766291a38416adf52bad46 --- /dev/null +++ b/include/linux/nip.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NIP_H +#define _NIP_H + +#include +#include +#include +#include + +/* 取目前NewIP使用的最大包头大小,作为最小MTU值 + * NewIP协议栈不使用MTU小于此值的网口 + */ +#define NEWIP_MIN_MTU (NIP_HDR_MAX + NIP_UDP_HDR_LEN) + +/* This structure contains configuration options per NewIP link. + */ +struct nip_devconf { + __s32 forwarding; + __s32 mtu; + __s32 ignore_routes_with_linkdown; + + __s32 disable_nip; + __s32 nndisc_notify; + __s32 use_oif_addrs_only; + __s32 keep_addr_on_down; + + struct ctl_table_header *sysctl_header; +}; + +/* This structure contains results of exthdrs parsing + * as offsets from skb->nh. 
+ */ + +#pragma pack(1) +struct ninet_skb_parm { + struct nip_addr dstaddr; + struct nip_addr srcaddr; + u8 nexthdr; +}; +#pragma pack() + +struct tcp_nip_request_sock { + struct tcp_request_sock tcp_nip_rsk_tcp; +}; + +struct nip_udp_sock { + struct udp_sock udp; +}; + +struct tcp_nip_sock { + struct tcp_sock tcp; +}; + +int find_nip_forward_stamp(struct net *net, void __user *arg); + +#endif /* _NIP_H */ diff --git a/include/linux/nip_icmp.h b/include/linux/nip_icmp.h new file mode 100755 index 0000000000000000000000000000000000000000..9a71979d56b57d872b4801abe9eb17d657fd49cc --- /dev/null +++ b/include/linux/nip_icmp.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _LINUX_NIP_ICMP_H +#define _LINUX_NIP_ICMP_H + +#include +#include +#include + +static inline struct nip_icmp_hdr *nip_icmp_header(const struct sk_buff *skb) +{ + return (struct nip_icmp_hdr *)skb_transport_header(skb); +} + +int nip_icmp_init(void); + +#endif diff --git a/include/net/flow_nip.h b/include/net/flow_nip.h new file mode 100755 index 0000000000000000000000000000000000000000..4e5f4bb236f4c547e2883a253dbd87b047a61608 --- /dev/null +++ b/include/net/flow_nip.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * internet FLOW for newip + * newip暂未使用flow的任何功能。使用此结构体 + * 目前只是为了未来可能接入flow做准备 + */ +#ifndef _NET_FLOW_NIP_H +#define _NET_FLOW_NIP_H + +#include + +struct flow_nip { + struct flowi_common __fl_common; +#define flowin_oif __fl_common.flowic_oif +#define flowin_iif __fl_common.flowic_iif + struct nip_addr daddr; + struct nip_addr saddr; + union flowi_uli uli; +#define fln_sport uli.ports.sport +#define fln_dport uli.ports.dport +} __attribute__((__aligned__(BITS_PER_LONG / 8))); + +#endif diff --git a/include/net/if_ninet.h b/include/net/if_ninet.h new file mode 100755 index 0000000000000000000000000000000000000000..fabbfa1ddac7a92c429208cb7a760c29495d4dfa --- /dev/null +++ b/include/net/if_ninet.h @@ -0,0 +1,63 @@ +/* 
SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * ninet interface/address list definitions + * Linux NEWIP implementation + * + */ + +#ifndef _NET_IF_NINET_H +#define _NET_IF_NINET_H + +#include +#include + +enum { + NINET_IFADDR_STATE_NEW, + NINET_IFADDR_STATE_DEAD, +}; + +struct ninet_ifaddr { + struct nip_addr addr; + + /* In seconds, relative to tstamp. Expiry is at tstamp + HZ * lft. */ + __u32 valid_lft; + __u32 preferred_lft; + refcount_t refcnt; + + /* protect one ifaddr itself */ + spinlock_t lock; + + int state; + + __u32 flags; + + unsigned long cstamp; /* created timestamp */ + unsigned long tstamp; /* updated timestamp */ + + struct ninet_dev *idev; + struct nip_rt_info *rt; + + struct hlist_node addr_lst; + struct list_head if_list; + + struct rcu_head rcu; +}; + +struct ninet_dev { + struct net_device *dev; + + struct list_head addr_list; + + rwlock_t lock; + refcount_t refcnt; + __u32 if_flags; + int dead; + + struct neigh_parms *nd_parms; + struct nip_devconf cnf; + + unsigned long tstamp; /* newip InterfaceTable update timestamp */ + struct rcu_head rcu; +}; + +#endif diff --git a/include/net/ninet_connection_sock.h b/include/net/ninet_connection_sock.h new file mode 100755 index 0000000000000000000000000000000000000000..b7fb534622d3affd4f25e2ed8d811acafd3a4b54 --- /dev/null +++ b/include/net/ninet_connection_sock.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * NET Generic infrastructure for NINET connection oriented protocols. 
+ */ +#ifndef _NINET_CONNECTION_SOCK_H +#define _NINET_CONNECTION_SOCK_H + +#include +#include +#include + +struct inet_bind_bucket; +struct request_sock; +struct sk_buff; +struct sock; +struct sockaddr; + +int ninet_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb, bool relax); +int ninet_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl); +void ninet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, + unsigned long timeout); + +#endif /* _NINET_CONNECTION_SOCK_H */ diff --git a/include/net/ninet_hashtables.h b/include/net/ninet_hashtables.h new file mode 100755 index 0000000000000000000000000000000000000000..7653dc8a947e0b0b6c044fbb0fbb00b01f22d9ed --- /dev/null +++ b/include/net/ninet_hashtables.h @@ -0,0 +1,130 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef NINET_HASHTABLES_H +#define NINET_HASHTABLES_H + +#if IS_ENABLED(CONFIG_NEWIP) +#include +#include +#include +#include + +#include + +#include +#include + +struct inet_hashinfo; + +int ninet_hash(struct sock *sk); +void ninet_unhash(struct sock *sk); +int ninet_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk); + +static inline unsigned int __ninet_ehashfn(const u32 lhash, + const u16 lport, + const u32 fhash, + const __be16 fport, + const u32 initval) +{ + const u32 ports = (((u32) lport) << 16) | (__force u32) fport; + + return jhash_3words(lhash, fhash, ports, initval); +} + +struct sock *__ninet_lookup_established(struct net *net, + struct inet_hashinfo *hashinfo, + const struct nip_addr *saddr, + const __be16 sport, + const struct nip_addr *daddr, + const u16 hnum, const int dif); + +struct sock *ninet_lookup_listener(struct net *net, + struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, + const struct nip_addr *saddr, + const __be16 sport, + const struct nip_addr *daddr, + const unsigned short hnum, const int dif); + +/* Function: + * 在ehash或者lhash散列表中根据地址和端口来查找传输控制块 + * 如果在 ehash 
中找到,则表示已经经历了三次握手并且已建 + * 立了连接,可以进行正常的通信。 + * 如果在 lhash 中找到,则表示已经绑定已经绑定了端口,处 + * 于侦听状态。如果在两个散列表中都查找不到,说明此时 + * 对应的传输控制块还没有创建。 + * Parameter: + * net: 命名空间。 + * hashinfo: 类型为tcp_hashinfo的全局标量,保存当前系统的各种 + * 状态的tcp_sock(包括established, listen, bind) + * skb: 传输控制块缓冲区。 + * doff: TCP头的大小,其数值乘4就是TCP头的字节数。 + * saddr: 源地址。 + * sport: 源端口。 + * daddr: 目的地址。 + * hnum: 目的端口。 + */ +static inline struct sock *__ninet_lookup(struct net *net, + struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, + const struct nip_addr *saddr, + const __be16 sport, + const struct nip_addr *daddr, + const u16 hnum, + const int dif, bool *refcounted) +{ + struct sock *sk = __ninet_lookup_established(net, hashinfo, saddr, + sport, daddr, hnum, dif); + *refcounted = true; + if (sk) + return sk; + *refcounted = false; + return ninet_lookup_listener(net, hashinfo, skb, doff, saddr, sport, + daddr, hnum, dif); +} + +/* Function: + * 在ehash或lhash散列表中根据地址和端口来查找传输控制块。 + * Parameter: + * hashinfo: 类型为tcp_hashinfo的全局标量,保存当前系统的各种 + * 状态的tcp_sock(包括established, listen, bind) + * skb: 传输控制块缓冲区。 + * doff: TCP头的大小,其数值乘4就是TCP头的字节数。 + * sport: 源端口。 + * dport: 目的端口。 + */ +static inline struct sock *__ninet_lookup_skb(struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, + const __be16 sport, + const __be16 dport, + int iif, bool *refcounted) +{ + struct sock *sk; + + *refcounted = true; + sk = skb_steal_sock(skb, refcounted); + if (sk) + return sk; + + return __ninet_lookup(dev_net(skb->dev), hashinfo, skb, + doff, &(NIPCB(skb)->srcaddr), sport, + &(NIPCB(skb)->dstaddr), ntohs(dport), + iif, refcounted); +} + +#endif /* IS_ENABLED(CONFIG_NEWIP) */ + +#define NINET_MATCH(__sk, __net, __saddr, __daddr, __ports, __dif) \ + (((__sk)->sk_portpair == (__ports)) && \ + ((__sk)->sk_family == AF_NINET) && \ + nip_addr_eq(&(__sk)->sk_nip_daddr, (__saddr)) && \ + nip_addr_eq(&(__sk)->sk_nip_rcv_saddr, (__daddr)) && \ + (!(__sk)->sk_bound_dev_if || \ + ((__sk)->sk_bound_dev_if == (__dif))) && \ + 
net_eq(sock_net(__sk), (__net)))
+
+int ninet_hash_connect(struct inet_timewait_death_row *death_row,
+		       struct sock *sk);
+
+#endif /* NINET_HASHTABLES_H */
diff --git a/include/net/nip.h b/include/net/nip.h
new file mode 100755
index 0000000000000000000000000000000000000000..d658a646ff4b1644433e28e5ee9f1c1962cee4e0
--- /dev/null
+++ b/include/net/nip.h
@@ -0,0 +1,368 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _NET_NEWIP_H
+#define _NET_NEWIP_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include "if_ninet.h"
+#include "flow_nip.h"
+
+/* Ethernet header is 14 bytes, plus 2 bytes of alignment padding,
+ * plus 66 bytes to avoid a coredump when the hmac driver expands the skb
+ */
+#define NIP_ETH_HDLEN 14
+#define NIP_ETH_HDR_LEN (NIP_ETH_HDLEN + 2 + 66)
+#define NIP_HDR_MAX 50
+
+#define NIP_UDP_HDR_LEN 8
+/* maximum socket number is 1024 */
+#define NIP_MAX_SOCKET_NUM 1024
+
+enum NIP_HDR_TYPE {
+	NIP_HDR_T_NEXT_HDR = 0x0,
+	NIP_HDR_T_SRC_ADDR,
+	NIP_HDR_T_DST_ADDR,
+	NIP_HDR_T_AUTH_CODE,
+	NIP_HDR_T_FRAG_HDR,
+
+	NIP_HDR_T_EXTEND_HDR_TYPE = 0xF,
+};
+
+#ifndef DESC
+#define DESC(n) (1)
+#endif
+
+/* Examples of how many flag bits a bitmap carries (bit 0 is 0x80):
+ * binary 1xxx xxx0: at most 6 flag bits follow
+ * binary 1xxx xxx1 xxxx xxx0: at most 13 flag bits follow
+ */
+#define NIP_BITMAP_VALID_SET 0x80 /* bit 0 set */
+#define NIP_BITMAP_INCLUDE_TTL 0x40 /* bit 1 set */
+#define NIP_BITMAP_INCLUDE_HDR_LEN 0x20 /* bit 2 set */
+#define NIP_BITMAP_INCLUDE_NEXT_HDR 0x10 /* bit 3 set */
+
+#define NIP_BITMAP_INCLUDE_DADDR 0x08 /* bit 4 set */
+#define NIP_BITMAP_INCLUDE_SADDR 0x04 /* bit 5 set */
+#define NIP_BITMAP_INCLUDE_TOTAL_LEN 0x02 /* bit 6 set */
+
+#define NIP_BITMAP_HAVE_MORE_BIT 0x01 /* bit 7 set */
+
+#define NIP_MORE_FRAG_SET (0x8000)
+#define NIP_MORE_FRAG_CLEAR (~0x8000)
+
+#define NIP_ADDR_LEN_1 1
+#define NIP_ADDR_LEN_2 2
+#define NIP_ADDR_LEN_3 3
+#define NIP_ADDR_LEN_4 4
+#define NIP_ADDR_LEN_5 5
+
+#define NIP_ADDR_BIT_LEN_8 8
+#define NIP_ADDR_BIT_LEN_16 16
+#define NIP_ADDR_BIT_LEN_24 24
+#define 
NIP_ADDR_BIT_LEN_40 40 + +#define USHORT_PAYLOAD 16 + +enum NIP_ADDR_CHECK { + ADDR_FIRST_DC = 0xDC, + ADDR_FIRST_F0 = 0xF0, + ADDR_FIRST_F1, + ADDR_FIRST_F2, + ADDR_FIRST_F3, + ADDR_FIRST_F4, + ADDR_FIRST_FF = 0xFF, + ADDR_SECOND_MIN_DD = 0xDD, + ADDR_SECOND_MIN_F1 = 0x14, /* f1 14 00 */ + ADDR_THIRD_MIN_F2 = 0x01 /* f2 00 01 00 00 */ +}; + +#define NIP_ADDR_LEN_MAX 8 + +/* newIP报头里都是变长字段, + * 此处定义报头结构只是为了函数传参, + * 字段由原始报文中解析后保存 + */ +struct nip_hdr { + struct nip_addr saddr; /* 源地址,长度可变 */ + struct nip_addr daddr; /* 目的地址,长度可变 */ + + u8 ttl; /* 跳数限制 */ + u8 nexthdr; /* 上层协议类型,IPPROTO_UDP */ + u8 hdr_len; /* newip报文头长度 */ + u8 hdr_real_len; /* newip报文头实际长度 */ + + u16 total_len; /* newip报文长度(报文头+报文) */ + u16 res1; /* 分片在原始报文中偏移量 */ + + u16 identification;/* 报文序列号 */ + u16 more_frag : 1; /* true表示后面“还有分片” */ + u16 is_fragment : 1; /* 分片标志 */ + u16 is_bad_hdr : 1; /* 坏包标志 */ + u16 no_hdr_len : 1; /* 报文头中没有报文头长度字段 */ + u16 include_unknown_bit : 1; /* 有不识别的bitmap字段 */ + u16 include_saddr : 1; + u16 include_daddr : 1; + u16 include_ttl : 1; + u16 include_nexthdr : 1; + u16 include_hdr_len : 1; + u16 include_total_len : 1; + u16 res : 5; + /* The options start here. 
*/ +}; + +/* newip报文头函数入参/出参,并非真实封装到报文中内容 */ +#define BITMAP_MAX 8 +struct nip_head_para { + struct nip_addr daddr; /* 目的地址,长度可变 */ + struct nip_addr saddr; /* 源地址,长度可变 */ + + u8 ttl; /* 跳数限制 */ + u8 nexthdr; /* 上层协议类型,IPPROTO_UDP */ + u8 res1[2]; + + u16 total_len; /* 网络层报文头长度+报文数据长度 */ + u16 res2; /* 保留字段 */ + + void *usr_data; /* 用户数据指针 */ + u32 usr_data_len; /* 用户发送数据长度 */ + u32 trans_hdr_len; /* 传输层报头长度 */ + + u16 sport; + u16 dport; + + /* 以下属于输出参数 */ +#define INDEX_MAX 8 + u8 bitmap[BITMAP_MAX]; /* bitmap目前最大支持8字节 */ + u32 bitmap_num; /* bitmap数组有效元素个数 */ + + u8 hdr_buf[NIP_HDR_MAX]; /* 缓存newip报头 */ + u32 hdr_buf_pos; /* buf缓冲区可写地址偏移 */ + u16 *frag_id_pos; /* 分片在原始报文中的偏移 */ + u8 *hdr_len_pos; /* 报文头实际长度位置 */ + u16 *total_len_pos; /* 报文总长度位置 */ + + /* 报文头bitmap是否携带标志 */ + u8 encap_ttl : 1; + u8 encap_hdr_len : 1; + u8 encap_daddr : 1; + u8 encap_saddr : 1; + u8 encap_total_len : 1; + u8 encap_res : 3; +}; + +/* 报文分段信息 */ +struct nip_pkt_seg_info { + u32 mid_pkt_num; /* 中间段个数 */ + u32 last_pkt_num; /* 最后段个数 */ + + u32 mid_usr_pkt_len; /* 中间段数据长度(8B对齐) */ + u32 last_usr_pkt_len; /* 最后段数据长度 */ + + u8 *usr_data; /* 存放用户原始数据指针 */ + u32 usr_data_len; /* 用户数据本次读取长度 */ +}; + +enum NIP_TYPE { + NIP_TYPE_WELL_KNOWN = 0X00, + NIP_TYPE_SINGLE_BYTE_IP_MIN = 0x01, + NIP_TYPE_SINGLE_BYTE_IP_MAX = 0xEF, + NIP_TYPE_VLEN_BYTE_1, + NIP_TYPE_VLEN_BYTE_2, + NIP_TYPE_VLEN_BYTE_4, + NIP_TYPE_VLEN_BYTE_8, + NIP_TYPE_VLEN_BYTE_16, + NIP_TYPE_VLEN_BYTE_X, + NIP_TYPE_VLEVEL_LEVEL_2, + NIP_TYPE_VLEVEL_LEVEL_3, + NIP_TYPE_VLEVEL_LEVEL_4, + NIP_TYPE_VLEVEL_LEVEL_Y, + NIP_TYPE_NON_TOP, + NIP_TYPE_MULTI_SEMANTIC_2, + NIP_TYPE_MULTI_SEMANTIC_3, + NIP_TYPE_MULTI_SEMANTIC_Z, + NIP_TYPE_RESERVED_MIN, + NIP_TYPE_RESERVED_MAX = 0xFF +}; + +enum NIP_NONE_TOP_SEM_TYPE { + NIP_NONE_TOP_SEM_TYPE_LOCATOR = 0x01, + NIP_NONE_TOP_SEM_TYPE_ID, + NIP_NONE_TOP_SEM_TYPE_SERVICE_ID, + NIP_NONE_TOP_SEM_TYPE_CONTEXT_ID, + NIP_NONE_TOP_SEM_TYPE_TRUST_ID, + + NIP_NONE_TOP_SEM_TYPE_VLEN = 0xFF +}; + +enum 
NIP_SERVICE_ID { + NIP_PROTOCOL_TYPE_ARP = 0x01, + NIP_PROTOCOL_TYPE_DHCP, + NIP_PROTOCOL_TYPE_AUTH +}; + +enum NIP_PROTOCOL_TYPE { + NIP_PROTOCOL_TYPE_AMP = 0x01, + NIP_PROTOCOL_TYPE_TCP = 0x06, + NIP_PROTOCOL_TYPE_UDP = 0x11 +}; + +struct ninet_protocol { + void (*early_demux)(struct sk_buff *skb); + + int (*handler)(struct sk_buff *skb); + + void (*err_handler)(struct sk_buff *skb, + struct ninet_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info); + unsigned int flags; +}; + +#define NIPCB(skb) ((struct ninet_skb_parm *)&(TCP_SKB_CB(skb)->header.hnip)) + +extern const struct ninet_protocol __rcu *ninet_protos[MAX_INET_PROTOS]; + +int ninet_add_protocol(const struct ninet_protocol *prot, + unsigned char protocol); +int ninet_del_protocol(const struct ninet_protocol *prot, + unsigned char protocol); +int ninet_register_protosw(struct inet_protosw *p); +void ninet_unregister_protosw(struct inet_protosw *p); + +#define NIP_DEFAULT_TTL 128 +#define NIP_ARP_DEFAULT_TTL 64 +#define IPPROTO_NIP_ICMP 0xB1 + +extern const struct proto_ops ninet_dgram_ops; +extern const struct proto_ops ninet_stream_ops; + +extern struct nip_addr nip_local_addr; +extern struct nip_addr nip_broadcast_addr_arp; +extern struct neigh_table nnd_tbl; + +int nip_round_up(int len); +int nip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl); +u_char *decode_nip_addr(u_char *buf, struct nip_addr *addr); +int parse_newiphdr(struct sk_buff *skb); +int nip_nwk_input(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev); +u_char *build_nip_addr(const struct nip_addr *addr, u_char *buf); +int get_nip_addr_len(const struct nip_addr *addr); +u_char *build_tlv(u_char type, u_char len, u_char *value, u_char *buf); +struct nip_rt_info *nip_dst_alloc(struct net *net, struct net_device *dev, + int flags); + +static inline bool nip_addr_eq(const struct nip_addr *a1, + const struct nip_addr *a2) +{ + return (a1->bitlen == a2->bitlen) && 
(a1->bitlen <= NIP_BITLEN_MAX) &&
+	       (memcmp(&a1->v.u, &a2->v.u, a1->bitlen >> 3) == 0);
+}
+
+/* Fold the address bytes of @a into a 32-bit hash value.
+ *
+ * At most sizeof(tmp) bytes of a->v.u are copied into a zeroed scratch
+ * buffer and its four 32-bit words are XORed together, so shorter
+ * addresses are zero-padded and longer ones are truncated.
+ */
+static inline u32 nip_addr_hash(const struct nip_addr *a)
+{
+	u32 tmp[4];
+	u8 len = a->bitlen >> 3;	/* address length in bytes */
+
+	/* clear the whole buffer so bytes past the address hash as zero;
+	 * sizes are taken from the buffer itself, not from a bit-length
+	 * constant that merely happens to have the same value
+	 */
+	memset(tmp, 0, sizeof(tmp));
+	memcpy(tmp, &a->v.u,
+	       len > sizeof(tmp) ? sizeof(tmp) : len);
+
+	return (__force u32)(tmp[0] ^ tmp[1] ^ tmp[2] ^ tmp[3]);
+}
+
+int nip_nwk_output(struct sk_buff *skb);
+
+void ninet_destroy_sock(struct sock *sk);
+int __nip_datagram_connect(struct sock *sk, struct sockaddr *addr,
+			   int addr_len);
+int nip_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len);
+int nip_datagram_connect_v6_only(struct sock *sk, struct sockaddr *addr,
+				 int addr_len);
+int nip_datagram_dst_update(struct sock *sk, bool fix_sk_saddr);
+void nip_datagram_release_cb(struct sock *sk);
+int ninet_add_protocol(const struct ninet_protocol *prot,
+		       unsigned char protocol);
+int ninet_del_protocol(const struct ninet_protocol *prot,
+		       unsigned char protocol);
+int ninet_register_protosw(struct inet_protosw *p);
+void ninet_unregister_protosw(struct inet_protosw *p);
+int nip_nwk_input_up(struct sk_buff *skb);
+int nip_nwk_output_down(struct net *net, struct sock *sk, struct sk_buff *skb);
+int nip_nwk_forward(struct sk_buff *skb);
+
+/* Encapsulate the NewIP header bitmap for a packet to be sent over UDP */
+void _nip_udp_bitmap_flag_encap(struct nip_head_para *head);
+
+/* Encapsulate the NewIP header bitmap for a packet to be sent over TCP */
+/* This interface is used by TCP, ARP and ICMP */
+void _nip_comm_bitmap_flag_encap(struct nip_head_para *head);
+
+/* Update the total length of the NewIP packet */
+/* head->total_len must be set before calling */
+void _nip_update_total_len(struct nip_head_para *head);
+
+void _nip_hdr_encap(struct nip_head_para *head);
+struct nip_addr *nip_nexthop(struct nip_rt_info *rt, struct nip_addr *daddr);
+struct dst_entry *nip_sk_dst_lookup_flow(struct sock *sk, struct flow_nip *fln);
+struct dst_entry *nip_dst_lookup_flow(struct net *net, const struct sock *sk,
+				      struct flow_nip *fln,
+				      const struct nip_addr *final_dst);
+int nip_addr_check(struct nip_addr *addr); +u_char *nip_get_mac(struct nip_addr *nipaddr, struct net_device *dev); +struct net_device *nip_get_defaultdev(void); +int nip_init_dev(void); + +/* 报文分段发送接口 */ +int nip_segment_output(struct sock *sk, void *from, int datalen, + int transhdrlen, const struct nip_addr *saddr, + ushort sport, const struct nip_addr *daddr, + ushort dport, struct dst_entry *dst); + +/* 从接收到的报文中解析newip报文头, + * 返回报文头长度,解析失败则返回0 + */ +int nip_hdr_parse(struct sk_buff *skb, struct nip_hdr *niph); + +/* 0 - 无LOG + * 1 - 记录内核日志(正式版本使用) + * 2 - 日志直接打印屏幕,调试时使用 + */ +#define __NIP_DEBUG 1 + +#if __NIP_DEBUG >= 2 +#define TRACE_OUT(fmt, ...) \ + do { \ + pr_crit("%s:%s:%d", __FILE__, __func__, __LINE__); \ + pr_crit(fmt, ##__VA_ARGS__); \ + pr_crit("\n"); \ + } while (0) +#define TRACE(fmt, ...) pr_crit(fmt, ##__VA_ARGS__) +#elif __NIP_DEBUG >= 1 +#define TRACE_OUT(fmt, ...) \ + do { \ + pr_warn("%s:%s:%d", __FILE__, __func__, __LINE__); \ + pr_warn(fmt, ##__VA_ARGS__); \ + pr_warn("\n"); \ + } while (0) +#define TRACE(fmt, ...) pr_debug(fmt, ##__VA_ARGS__) +#else +#define TRACE(fmt, ...) +#define TRACE_OUT(fmt, ...) +#endif + +#define DEBUG(format, ...) TRACE(format, ##__VA_ARGS__) +#define DEBUG_TRACE(format, ...) 
TRACE_OUT(format, ##__VA_ARGS__) + +#endif diff --git a/include/net/nip_addrconf.h b/include/net/nip_addrconf.h new file mode 100755 index 0000000000000000000000000000000000000000..ee055c9f69f6f3283f771420afe1aaed2d73566f --- /dev/null +++ b/include/net/nip_addrconf.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NIP_ADDRCONF_H +#define _NIP_ADDRCONF_H + +#include +#include +#include +#include +#include +#include + +#define MAX_RTR_SOLICITATIONS (-1) /* unlimited */ +#define RTR_SOLICITATION_INTERVAL (4 * HZ) +#define RTR_SOLICITATION_MAX_INTERVAL (3600 * HZ) /* 1 hour */ + +#define MIN_VALID_LIFETIME (2 * 3600) /* 2 hours */ + +#define TEMP_VALID_LIFETIME (7 * 86400) +#define TEMP_PREFERRED_LIFETIME (86400) +#define REGEN_MAX_RETRY (3) +#define MAX_DESYNC_FACTOR (600) + +#define ADDR_CHECK_FREQUENCY (120 * HZ) + +#define NIP_MAX_ADDRESSES (16) + +#define ADDRCONF_TIMER_FUZZ_MINUS (HZ > 50 ? HZ / 50 : 1) +#define ADDRCONF_TIMER_FUZZ (HZ / 4) +#define ADDRCONF_TIMER_FUZZ_MAX (HZ) + +#define ADDRCONF_NOTIFY_PRIORITY (0) + +#define NIN_ADDR_HSIZE_SHIFT (4) +#define NIN_ADDR_HSIZE (1 << NIN_ADDR_HSIZE_SHIFT) + +int nip_addrconf_add_ifaddr(struct net *net, void __user *arg); +int nip_addrconf_del_ifaddr(struct net *net, void __user *arg); + +int nip_dev_get_saddr(struct net *net, const struct net_device *dev, + const struct nip_addr *daddr, struct nip_addr *saddr); +int __nip_get_lladdr(struct ninet_dev *idev, struct nip_addr *addr, + u32 banned_flags); + +int nip_addrconf_init(void); +void nip_addrconf_cleanup(void); + +/** + * __nin_dev_get - get ninet_dev pointer from netdevice + * @dev: network device + * + * Caller must hold rcu_read_lock or RTNL, because this function + * does not take a reference on the ninet_dev. 
+ */ +static inline struct ninet_dev *__nin_dev_get(const struct net_device *dev) +{ + return rcu_dereference_rtnl(dev->nip_ptr); +} + +/** + * nin_dev_get - get ninet_dev pointer from netdevice + * @dev: network device + */ +static inline struct ninet_dev *nin_dev_get(const struct net_device *dev) +{ + struct ninet_dev *idev; + + rcu_read_lock(); + idev = rcu_dereference(dev->nip_ptr); + if (idev) + refcount_inc(&idev->refcnt); + rcu_read_unlock(); + return idev; +} + +static inline struct neigh_parms *__nin_dev_nd_parms_get_rcu( + const struct net_device *dev) +{ + struct ninet_dev *idev = __nin_dev_get(dev); + + return idev ? idev->nd_parms : NULL; +} + +void nin_dev_finish_destroy(struct ninet_dev *idev); + +static inline void nin_dev_put(struct ninet_dev *idev) +{ + if (refcount_dec_and_test(&idev->refcnt)) + nin_dev_finish_destroy(idev); +} + +static inline void nin_dev_put_clear(struct ninet_dev **pidev) +{ + struct ninet_dev *idev = *pidev; + + if (idev) { + nin_dev_put(idev); + *pidev = NULL; + } +} + +static inline void __nin_dev_put(struct ninet_dev *idev) +{ + refcount_dec(&idev->refcnt); +} + +static inline void nin_dev_hold(struct ninet_dev *idev) +{ + refcount_inc(&idev->refcnt); +} + +void ninet_ifa_finish_destroy(struct ninet_ifaddr *ifp); + +static inline void nin_ifa_put(struct ninet_ifaddr *ifp) +{ + if (refcount_dec_and_test(&ifp->refcnt)) + ninet_ifa_finish_destroy(ifp); +} + +static inline void __nin_ifa_put(struct ninet_ifaddr *ifp) +{ + refcount_dec(&ifp->refcnt); +} + +static inline void nin_ifa_hold(struct ninet_ifaddr *ifp) +{ + refcount_inc(&ifp->refcnt); +} + +#endif diff --git a/include/net/nip_fib.h b/include/net/nip_fib.h new file mode 100755 index 0000000000000000000000000000000000000000..49f4ca35bd2c781f08f25a8af9964da31656f5bb --- /dev/null +++ b/include/net/nip_fib.h @@ -0,0 +1,148 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_NEWIP_FIB_H +#define _NET_NEWIP_FIB_H + +#include +#include +#include +#include 
+#include +#include +#include +#include +#include + +#include +#include +#include "nip.h" +#include "flow_nip.h" + +#define NIN_ROUTE_HSIZE_SHIFT 4 +#define NIN_ROUTE_HSIZE (1 << NIN_ROUTE_HSIZE_SHIFT) + +extern const struct nip_addr nip_any_addr; + +struct nip_fib_config { + u32 fc_table; + u32 fc_metric; + int fc_ifindex; + u32 fc_flags; + u32 fc_protocol; + u32 fc_type; /* only 8 bits are used */ + + struct nip_addr fc_dst; + struct nip_addr fc_src; + struct nip_addr fc_gateway; + + struct nl_info fc_nlinfo; + unsigned long fc_expires; +}; + +struct nip_fib_node { + struct hlist_node fib_hlist; + struct nip_rt_info *nip_route_info; + struct rcu_head rcu; +}; + +struct nip_fib_table; + +struct nip_rt_info { + struct dst_entry dst; + struct dst_entry *from; + struct nip_fib_table *rt_table; + struct nip_fib_node __rcu *rt_node; + struct ninet_dev *rt_idev; + struct nip_rt_info *__percpu *rt_pcpu; + + atomic_t rt_ref; + + uint32_t rt_flags; + struct nip_addr gateway; + struct nip_addr rt_dst; + struct nip_addr rt_src; + + u32 rt_metric; + u32 rt_pmtu; + u8 rt_protocol; +}; + +static inline struct ninet_dev *nip_dst_idev(struct dst_entry *dst) +{ + return ((struct nip_rt_info *)dst)->rt_idev; +} + +struct nip_fib_table { + u32 nip_tb_id; + spinlock_t nip_tb_lock; + struct hlist_head nip_tb_head[NIN_ROUTE_HSIZE]; + unsigned int flags; +}; + +#define NIP_RT_TABLE_MAIN RT_TABLE_MAIN +#define NIP_RT_TABLE_LOCAL RT_TABLE_LOCAL + +typedef struct nip_rt_info *(*nip_pol_lookup_t) (struct net *, + struct nip_fib_table *, + struct flow_nip *, int); + +struct nip_fib_table *nip_fib_get_table(struct net *net, u32 id); + +struct dst_entry *nip_fib_rule_lookup(struct net *net, struct flow_nip *fln, + int flags, nip_pol_lookup_t lookup); + +static inline void nip_rt_set_expires(struct nip_rt_info *rt, + unsigned long expires) +{ + rt->dst.expires = expires; + + rt->rt_flags |= 12; +} + +static inline void nip_rt_clean_expires(struct nip_rt_info *rt) +{ + rt->rt_flags &= ~12; + 
rt->dst.expires = 0; +} + +static inline void nip_rt_put(struct nip_rt_info *rt) +{ + BUILD_BUG_ON(offsetof(struct nip_rt_info, dst) != 0); + dst_release(&rt->dst); +} + +void nip_rt_free_pcpu(struct nip_rt_info *non_pcpu_rt); + +static inline void nip_rt_hold(struct nip_rt_info *rt) +{ + atomic_inc(&rt->rt_ref); +} + +static inline void nip_rt_release(struct nip_rt_info *rt) +{ + if (atomic_dec_and_test(&rt->rt_ref)) { + nip_rt_free_pcpu(rt); + dst_dev_put(&rt->dst); + + dst_release(&rt->dst); + } +} + +int nip_fib_init(void); + +void nip_fib_gc_cleanup(void); + +struct nip_fib_node *nip_fib_locate(struct hlist_head *nip_tb_head, + const struct nip_addr *daddr); + +void nip_fib_clean_all(struct net *net, + int (*func)(struct nip_rt_info *, void *arg), void *arg); + +int nip_fib_add(struct hlist_head *nip_tb_head, struct nip_rt_info *rt); + +int nip_fib_del(struct nip_rt_info *rt_info, struct nl_info *info); + +int nip_set_route_netlink(struct net *net, struct nip_rtmsg *rtmsg); + +int nip_del_route_netlink(struct net *net, struct nip_rtmsg *rtmsg); + +#endif /* _NET_NEWIP_FIB_H */ diff --git a/include/net/nip_route.h b/include/net/nip_route.h new file mode 100755 index 0000000000000000000000000000000000000000..da7eb5248bfe710da6144f8746bd4579f2095d0e --- /dev/null +++ b/include/net/nip_route.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_NIP_ROUTE_H +#define _NET_NIP_ROUTE_H + +#include +#include "nip_fib.h" +#include "nip_addrconf.h" + +#define NIP_RT_PRIO_USER 1024 + +struct nip_rt_info *nip_addrconf_dst_alloc(struct ninet_dev *idev, + const struct nip_addr *addr); + + +void nip_route_input(struct sk_buff *skb); +struct dst_entry *nip_route_input_lookup(struct net *net, + struct net_device *dev, + struct flow_nip *fln, int flags); + +struct dst_entry *nip_route_output_flags(struct net *net, const struct sock *sk, + struct flow_nip *fln, int flags); + + +static inline struct dst_entry *nip_route_output(struct net *net, + const 
struct sock *sk, + struct flow_nip *fln) +{ + return nip_route_output_flags(net, sk, fln, 0); +} + +struct nip_rt_info *nip_pol_route(struct net *net, struct nip_fib_table *table, + int oif, struct flow_nip *fln, int flags); + +bool nip_bind_addr_check(struct net *net, + struct nip_addr *addr); + +int nip_ins_rt(struct nip_rt_info *rt); +int nip_del_rt(struct nip_rt_info *rt); + +static inline int nip_route_get_saddr(struct net *net, struct nip_rt_info *rt, + const struct nip_addr *daddr, + struct nip_addr *saddr) +{ + struct ninet_dev *idev = + rt ? nip_dst_idev((struct dst_entry *)rt) : NULL; + int err = 0; + + err = nip_dev_get_saddr(net, idev ? idev->dev : NULL, daddr, saddr); + + return err; +} + +void nip_rt_ifdown(struct net *net, struct net_device *dev); + +int nip_route_ioctl(struct net *net, unsigned int cmd, + void __user *arg); + +int nip_route_init(void); + +void nip_route_cleanup(void); + +#endif /*_NET_NIP_ROUTE_H*/ diff --git a/include/net/nip_udp.h b/include/net/nip_udp.h new file mode 100755 index 0000000000000000000000000000000000000000..8d8b3d8f0bc0fe0fc91b0d896bb02db7ea68966b --- /dev/null +++ b/include/net/nip_udp.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_NEWIP_UDP_H +#define _NET_NEWIP_UDP_H + +#include +#include +#include +#include +#include +#include + +#define NIP_UDP_CONNS 8 +#define NIP_UDP_HSLOT_COUNT 10 + +struct nip_udp_conn { + struct nip_addr ripaddr; + u16 lport; + u16 rport; + struct sock *sk; +}; + +int nip_udp_init(void); + +int nip_udp_output(struct sock *sk, struct msghdr *msg, size_t len); + +int nip_udp_input(struct sk_buff *skb); +int nip_udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int noblock, int flags, int *addr_len); + +struct nip_pseudo_header { + struct nip_addr src_addr; + struct nip_addr dst_addr; + unsigned short check_len; + unsigned char next_header; +}; + +unsigned int nip_check_sum(unsigned char *data, unsigned short data_len); +unsigned int 
nip_header_chksum(struct nip_pseudo_header *chksum_header);
+
+unsigned short nip_check_sum_build(unsigned char *data,
+				   unsigned short data_len,
+				   struct nip_pseudo_header *chksum_header);
+unsigned short nip_check_sum_parse(unsigned char *data,
+				   unsigned short check_len,
+				   struct nip_pseudo_header *chksum_header);
+
+#endif
diff --git a/include/net/nndisc.h b/include/net/nndisc.h
new file mode 100755
index 0000000000000000000000000000000000000000..19c7ee86074f667f6fbc60f2cb04ede38834f206
--- /dev/null
+++ b/include/net/nndisc.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _NNDISC_H
+#define _NNDISC_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define NEWIP_NEIGH_BUCKET_MAX 8
+extern struct neigh_table nnd_tbl;
+
+#define NIP_ARP_NS 0x01 /* ARP request */
+#define NIP_ARP_NA 0x02 /* ARP reply */
+
+/* Neighbour discovery message: an ICMP header followed by a
+ * variable-length byte array (flexible array member).
+ */
+struct nnd_msg {
+	struct nip_icmp_hdr icmph;
+	__u8 data[];
+};
+
+/* Key comparison callback passed to ___neigh_lookup_noref().
+ * Returns true when @pkey and the primary key of @n have the same bit
+ * length, that length does not exceed NIP_BITLEN_MAX, and the first
+ * (bitlen / 8) address bytes are identical.
+ */
+static inline bool neigh_key_eq800(const struct neighbour *n, const void *pkey)
+{
+	const struct nip_addr *a1, *a2;
+
+	a1 = (const struct nip_addr *)(pkey);
+	a2 = (const struct nip_addr *)(n->primary_key);
+
+#define RIGHT_POS_3 3
+	return a1->bitlen == a2->bitlen && a1->bitlen <= NIP_BITLEN_MAX &&
+	       memcmp(&a1->v.u, &a2->v.u, a1->bitlen >> RIGHT_POS_3) == 0;
+}
+
+/* Hash callback passed to ___neigh_lookup_noref(): the first int-sized
+ * word of @pkey modulo NEWIP_NEIGH_BUCKET_MAX. @dev and @hash_rnd are
+ * not used.
+ * NOTE(review): the operand is a signed int, so the remainder can be
+ * negative before conversion to u32; presumably this must stay in sync
+ * with the hash function installed in nnd_tbl - confirm before changing.
+ */
+static inline u32 nndisc_hashfn(const void *pkey, const struct net_device *dev,
+				__u32 *hash_rnd)
+{
+	return (*(const int *)pkey % NEWIP_NEIGH_BUCKET_MAX);
+}
+
+/* Look up @pkey on @dev in nnd_tbl without taking a reference on the
+ * returned neighbour.
+ */
+static inline struct neighbour *__nip_neigh_lookup_noref(struct net_device *dev,
+							 const void *pkey)
+{
+	return ___neigh_lookup_noref(&nnd_tbl, neigh_key_eq800, nndisc_hashfn,
+				     pkey, dev);
+}
+
+/* Look up @pkey on @dev under rcu_read_lock_bh() and take a reference on
+ * the result. Returns NULL when no entry is found or when the entry's
+ * refcount has already dropped to zero.
+ */
+static inline struct neighbour *__nip_neigh_lookup(struct net_device *dev,
+						   const void *pkey)
+{
+	struct neighbour *n;
+
+	rcu_read_lock_bh();
+	n = __nip_neigh_lookup_noref(dev, pkey);
+	if (n && !refcount_inc_not_zero(&n->refcnt))
+		n = NULL;
+	rcu_read_unlock_bh();
+
+	return n;
+}
+
+int nndisc_rcv(struct sk_buff *skb);
+
+int 
nndisc_init(void); + +#endif diff --git a/include/net/tcp_nip.h b/include/net/tcp_nip.h new file mode 100755 index 0000000000000000000000000000000000000000..80311cdf727f3d85d8c68f43b9c0f6e0a98f4d06 --- /dev/null +++ b/include/net/tcp_nip.h @@ -0,0 +1,138 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _TCP_NIP_H +#define _TCP_NIP_H + +#define FASTRETRANS_DEBUG 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +/* 本文件是TCP协议主要的头文件 + * 包含tcp注册,建立连接,输入输出的接口函数 + */ + +extern struct proto tcp_nip_prot; + +#define TCP_HEADERLEN_OFFSET 6 + +/* init */ +int tcp_nip_init(void); +void tcp_nip_exit(void); + +void tcp_nip_done(struct sock *sk); +int tcp_direct_connect(struct sock *sk, void __user *arg); +void tcp_nip_rcv_established( + struct sock *sk, + struct sk_buff *skb, + const struct tcphdr *th, + unsigned int len); + +void __tcp_nip_push_pending_frames( + struct sock *sk, + unsigned int cur_mss, + int nonagle); + +u32 __nip_tcp_select_window(struct sock *sk); +void tcp_nip_rearm_rto(struct sock *sk); + +int tcp_nip_rcv_state_process(struct sock *sk, struct sk_buff *skb); + +/* tcp_nip_output */ +int tcp_nip_transmit_skb( + struct sock *sk, + struct sk_buff *skb, + int clone_it, + gfp_t gfp_mask); +int __tcp_nip_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); +int tcp_nip_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); +void tcp_nip_send_fin(struct sock *sk); +void tcp_nip_send_probe0(struct sock *sk); +int tcp_nip_write_wakeup(struct sock *sk, int mib); + +/* tcp_nip_timer */ +void tcp_nip_init_xmit_timers(struct sock *sk); +void tcp_nip_clear_xmit_timers(struct sock *sk); +void tcp_nip_delack_timer_handler(struct sock *sk); +void tcp_nip_write_timer_handler(struct sock *sk); + +/* check probe0 timer */ +static inline 
void tcp_nip_check_probe_timer(struct sock *sk) +{ + if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending) + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + tcp_probe0_base(sk), TCP_RTO_MAX); +} + +static inline struct sk_buff *tcp_nip_send_head(const struct sock *sk) +{ + return sk->sk_send_head; +} + +static inline void tcp_nip_add_write_queue_tail( + struct sock *sk, + struct sk_buff *skb) +{ + __skb_queue_tail(&sk->sk_write_queue, skb); + + if (sk->sk_send_head == NULL) + sk->sk_send_head = skb; +} + +static inline bool tcp_nip_write_queue_empty(struct sock *sk) +{ + return skb_queue_empty(&sk->sk_write_queue); +} + +/* connect */ +int __tcp_nip_connect(struct sock *sk); +int tcp_newip_conn_request(struct request_sock_ops *rsk_ops, + const struct tcp_request_sock_ops *af_ops, + struct sock *sk, struct sk_buff *skb); +struct sk_buff *tcp_nip_make_synack( + const struct sock *sk, + struct dst_entry *dst, + struct request_sock *req, + struct tcp_fastopen_cookie *foc, + enum tcp_synack_type synack_type); +int nip_send_synack(struct request_sock *req, struct sk_buff *skb); +struct sock *tcp_nip_check_req(struct sock *sk, struct sk_buff *skb, + struct request_sock *req); +int tcp_nip_child_process(struct sock *parent, struct sock *child, + struct sk_buff *skb); +int tcp_nip_rtx_synack(const struct sock *sk, struct request_sock *req); +/* client send ack */ +void tcp_nip_send_ack(struct sock *sk); +struct sock *tcp_nip_create_openreq_child(const struct sock *sk, + struct request_sock *req, + struct sk_buff *skb); +void tcp_nip_initialize_rcv_mss(struct sock *sk); + +/* release */ +void tcp_nip_release_cb(struct sock *sk); + +#endif /* _NIP_TCP_H */ diff --git a/include/net/transp_nip.h b/include/net/transp_nip.h new file mode 100755 index 0000000000000000000000000000000000000000..e2e302854d63e63329c485b24e9a2b6c67f65ea7 --- /dev/null +++ b/include/net/transp_nip.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _TRANSP_NIP_H 
+#define _TRANSP_NIP_H + +extern struct proto nip_udp_prot; + +int nip_udp_init(void); +void nip_udp_exit(void); + +int nip_udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); + +void nip_datagram_recv_ctl(struct sock *sk, struct msghdr *msg, + struct sk_buff *skb); +void nip_datagram_recv_common_ctl(struct sock *sk, struct msghdr *msg, + struct sk_buff *skb); +void nip_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, + struct sk_buff *skb); + +void nip_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, __u16 srcp, + __u16 destp, int bucket); + +void ninet_destroy_sock(struct sock *sk); + +#define NEWIP_SEQ_DGRAM_HEADER \ + " s1 " \ + "local_address " \ + "remote_address " \ + "st tx_queue rc_queue tr tm->when retrnsmt" \ + " uid timeout inode ref pointer drops\n" + +#endif diff --git a/include/uapi/linux/newip_route.h b/include/uapi/linux/newip_route.h new file mode 100755 index 0000000000000000000000000000000000000000..ff24542852e1219ec2f2965db650ef4e179fe87f --- /dev/null +++ b/include/uapi/linux/newip_route.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _UAPI_LINUX_NEWIP_ROUTE_H +#define _UAPI_LINUX_NEWIP_ROUTE_H + +#include +#include +#include +#include "nin.h" + +struct nip_arpmsg { + struct nip_addr dst_addr; + char ifrn_name[10]; + __u8 lladdr[10]; +}; + +struct nip_rtmsg { + struct nip_addr rtmsg_dst; + struct nip_addr rtmsg_src; + struct nip_addr rtmsg_gateway; + char dev_name[10]; + __u32 rtmsg_type; + int rtmsg_ifindex; + __u32 rtmsg_metric; + unsigned long rtmsg_info; + __u32 rtmsg_flags; +}; +#endif /* _UAPI_LINUX_NEWIP_ROUTE_H */ diff --git a/include/uapi/linux/nin.h b/include/uapi/linux/nin.h new file mode 100755 index 0000000000000000000000000000000000000000..6e4083e6e22374def9948f75d918b7c0243bc1e2 --- /dev/null +++ b/include/uapi/linux/nin.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Types and definitions for AF_NINET + * Linux NEWIP 
implementation + * + */ + +#ifndef _UAPI_LINUX_NIN_H +#define _UAPI_LINUX_NIN_H + +#include +#include + +#define NIP_BITLEN_MAX 64 + +#define nip_addr_field8 v.u.u8 +#define nip_addr_field16 v.u.u16 +#define nip_addr_field32 v.u.u32 + +/* NewIP address field */ +#pragma pack(1) +struct nip_addr_field { + union { + __u8 u8[8]; + __be16 u16[4]; + __be32 u32[2]; + } u; +}; +#pragma pack() + +/* NewIP topology address structure */ +#pragma pack(1) +struct nip_addr { + uint8_t bitlen; + struct nip_addr_field v; +}; +#pragma pack() + +/* NewIP network socket address structure */ +struct sockaddr_nin { + unsigned short int sin_family; /* AF_NINET */ + __be16 sin_port; /* Transport layer port */ + struct nip_addr sin_addr; /* NIP address */ +}; + +#endif /* _UAPI_LINUX_NIN_H */ diff --git a/include/uapi/linux/nip.h b/include/uapi/linux/nip.h new file mode 100755 index 0000000000000000000000000000000000000000..38dfa71c5d531190bc251c82d9deeda6db67a4f4 --- /dev/null +++ b/include/uapi/linux/nip.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _UAPI_NEWIP_H +#define _UAPI_NEWIP_H + +#include +#include +#include +#include "nin.h" + +/* The latest drafts raised the minimum MTU to 1280.
*/ +#define NIP_MIN_MTU 1280 + +struct nip_ifreq { + struct nip_addr ifrn_addr; + int ifrn_ifindex; +}; + +struct locator_secgroup_map { + uint64_t id; + struct nip_addr addr; + __u16 secgroup; +}; + +struct nip_connection_msg { + struct nip_addr saddr; + __u16 snum; + struct nip_addr daddr; + __be16 dport; +}; + +#endif /*_UAPI_NEWIP_H*/ diff --git a/include/uapi/linux/nip_icmp.h b/include/uapi/linux/nip_icmp.h new file mode 100755 index 0000000000000000000000000000000000000000..78b4fda29cc716324b89a80e12a58740e6032979 --- /dev/null +++ b/include/uapi/linux/nip_icmp.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _UAPI_LINUX_NIP_ICMP_H +#define _UAPI_LINUX_NIP_ICMP_H + +#include +#include + +struct nip_icmp_hdr { + __u8 nip_icmp_type; + __u8 nip_icmp_code; + __sum16 nip_icmp_cksum; +}; + +#endif diff --git a/net/newip/Kconfig b/net/newip/Kconfig new file mode 100755 index 0000000000000000000000000000000000000000..89265f6c49f90a9a8561eb0bd1576b6d25fcdbe5 --- /dev/null +++ b/net/newip/Kconfig @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# NewIP configuration +# + +# Building NewIP as a module will cause a crash if you try to unload it +menuconfig NEWIP + tristate "The NewIP protocol" + default y + help + Support for NewIP. + + To compile this protocol support as a module, choose M here: the + module will be called newip.
\ No newline at end of file diff --git a/net/newip/Makefile b/net/newip/Makefile new file mode 100755 index 0000000000000000000000000000000000000000..59ab06a6c145b9b73a496144f1fdbe5f1964b144 --- /dev/null +++ b/net/newip/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the Linux newip layer +# + +obj-$(CONFIG_NEWIP) += newip.o + + +newip-objs := af_ninet.o nip_input.o udp.o protocol.o nip_output.o datagram.o nip_addrconf.o nip_addrconf_core.o route.o nip_fib.o nip_fib_rules.o nndisc.o icmp.o +newip-objs += tcp_nip.o ninet_connection_sock.o ninet_hashtables.o tcp_nip_output.o tcp_nip_input.o tcp_nip_timer.o +EXTRA_CFLAGS := -I$(src)/include diff --git a/net/newip/af_ninet.c b/net/newip/af_ninet.c new file mode 100755 index 0000000000000000000000000000000000000000..32f3d6422bfaf2b5ebdbfe1d59b55f2b42ebdf9d --- /dev/null +++ b/net/newip/af_ninet.c @@ -0,0 +1,695 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * newIP protocal family + * + * + * 在系统中注册newIP网络层协议。 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for signal_pending() */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +MODULE_DESCRIPTION("NewiIP protocol stack for linux"); + +/* The inetsw_nip table contains everything that ninet_create needs to + * build a new socket + */ +static struct list_head inetsw_nip[SOCK_MAX]; +static DEFINE_SPINLOCK(inetsw_nip_lock); +/* count the socket number */ +atomic_t g_nip_socket_number = ATOMIC_INIT(0); + +static int disable_nip_mod; +module_param_named(disable, disable_nip_mod, int, 0444); +MODULE_PARM_DESC(disable, + "Disable NewIP module such that it is non_functional"); + +bool newip_mod_enabled(void) +{ + return 
disable_nip_mod == 0; +} +EXPORT_SYMBOL_GPL(newip_mod_enabled); + +static int ninet_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct inet_sock *inet; + struct sock *sk; + struct inet_protosw *answer; + struct proto *answer_prot; + unsigned char answer_flags; + int err; + int num; + + if (protocol < 0 || + protocol >= IPPROTO_MAX || + sock->type >= SOCK_MAX) + return -EINVAL; + + num = atomic_add_return(1, &g_nip_socket_number); + if (num > NIP_MAX_SOCKET_NUM) { + err = -EPERM; + goto number_sub; + } + + /* 初始化状态为未连接 */ + sock->state = SS_UNCONNECTED; + /* look for the requested type/protocol pair. */ + err = -ESOCKTNOSUPPORT; + rcu_read_lock(); + list_for_each_entry_rcu(answer, &inetsw_nip[sock->type], list) { + err = 0; + /* Check the non-wild matcg */ + if (protocol == answer->protocol) { + if (protocol != IPPROTO_IP) + break; + } else { + /* check for the two wild case. */ + if (protocol == IPPROTO_IP) { + protocol = answer->protocol; + break; + } + if (answer->protocol == IPPROTO_IP) + break; + } + err = -EPROTONOSUPPORT; + } + + if (err) + goto out_rcu_unlock; + + err = -EPERM; + + sock->ops = answer->ops; + answer_prot = answer->prot; + answer_flags = answer->flags; + rcu_read_unlock(); + + WARN_ON(!answer_prot->slab); + + err = -ENOBUFS; + sk = sk_alloc(net, PF_NINET, GFP_KERNEL, answer_prot, kern); + if (!sk) + goto number_sub; + + sock_init_data(sock, sk); + + err = 0; + if (answer_flags & INET_PROTOSW_REUSE) + sk->sk_reuse = SK_CAN_REUSE; + inet = inet_sk(sk); + inet->is_icsk = (answer_flags & INET_PROTOSW_ICSK) != 0; + + if (sock->type == SOCK_RAW) { + inet->inet_num = protocol; + if (protocol == IPPROTO_RAW) + inet->hdrincl = 1; + } + + sk->sk_destruct = inet_sock_destruct; + sk->sk_family = PF_NINET; + sk->sk_protocol = protocol; + sk->sk_backlog_rcv = answer->prot->backlog_rcv; + /* initiate sk->sk_nip_daddr */ + sk->sk_nip_daddr = nip_any_addr; + sk->sk_nip_rcv_saddr = nip_any_addr; + + inet->uc_ttl = -1; + 
sk_refcnt_debug_inc(sk); + + if (inet->inet_num) { + inet->inet_sport = htons(inet->inet_num); + err = sk->sk_prot->hash(sk); + if (err) { + sk_common_release(sk); + goto number_sub; + } + } + if (sk->sk_prot->init) { + err = sk->sk_prot->init(sk); + if (err) { + sk_common_release(sk); + goto number_sub; + } + } +out: + DEBUG("number of socket is: %d", num); + return err; +out_rcu_unlock: + rcu_read_unlock(); +number_sub: + atomic_sub(1, &g_nip_socket_number); + goto out; +} + +/* bind for ninet API */ +int ninet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_nin *addr = (struct sockaddr_nin *)uaddr; + struct sock *sk = sock->sk; + struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); + u_short snum; + int err = 0; + + /* If the socket has its own bind function then use it */ + if (sk->sk_prot->bind) + return sk->sk_prot->bind(sk, uaddr, addr_len); + + snum = ntohs(addr->sin_port); + if (snum && snum < PROT_SOCK) + return -EACCES; + lock_sock(sk); + + /* check these errors (active socket, double bind) */ + if (sk->sk_state != TCP_CLOSE || inet->inet_num) { + err = -EINVAL; + goto out; + } + + if (nip_bind_addr_check(net, &addr->sin_addr) == false) { + err = -EFAULT; + DEBUG("%s: binding-addr invalid.", __func__); + goto out; + } + + sk->sk_nip_rcv_saddr = addr->sin_addr; + + /* make sure we are allowed to bind here */ + if ((snum || !inet->bind_address_no_port) && + sk->sk_prot->get_port(sk, snum)) { + err = -EADDRINUSE; + goto out; + } + +out: + release_sock(sk); + return err; +} +EXPORT_SYMBOL(ninet_bind); + +/* Function: + * Move a socket into listening state. 
+ * Parameter: + * sock: 创建的socket + * backlog: 指定在TCP连接时,同时进行3次握手 + * 建立连接的客户端数量 + */ +int ninet_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + unsigned char old_state; + int err; + + lock_sock(sk); + + err = -EINVAL; + if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) + goto out; + + old_state = sk->sk_state; + if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) + goto out; + + /* Really, if the socket is already in listen state + * we can only allow the backlog to be adjusted. + */ + if (old_state != TCP_LISTEN) { + err = inet_csk_listen_start(sk, backlog); + if (err) + goto out; + } + sk->sk_max_ack_backlog = backlog; + err = 0; + +out: + release_sock(sk); + return err; +} +EXPORT_SYMBOL(ninet_listen); + +int ninet_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (!sk) + return -EINVAL; + + atomic_sub(1, &g_nip_socket_number); + return inet_release(sock); +} +EXPORT_SYMBOL(ninet_release); + +void ninet_destroy_sock(struct sock *sk) +{ + ; +} +EXPORT_SYMBOL_GPL(ninet_destroy_sock); + +int ninet_getname(struct socket *sock, struct sockaddr *uaddr, + int peer) +{ + struct sock *sk = sock->sk; + struct inet_sock *inet = inet_sk(sk); + DECLARE_SOCKADDR(struct sockaddr_nin *, sin, uaddr); + + sin->sin_family = AF_NINET; + if (peer) { + if (!inet->inet_dport) + return -ENOTCONN; + if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && + peer == 1) + return -ENOTCONN; + sin->sin_port = inet->inet_dport; + sin->sin_addr = sk->sk_nip_daddr; + } else { + sin->sin_port = inet->inet_sport; + sin->sin_addr = sk->sk_nip_rcv_saddr; + } + return sizeof(*sin); +} +EXPORT_SYMBOL(ninet_getname); + +static long ninet_wait_for_connect(struct sock *sk, long timeo, int writebias) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + + add_wait_queue(sk_sleep(sk), &wait); + sk->sk_write_pending += writebias; + + /* Basic assumption: if someone sets sk->sk_err, he _must_ + * change state of the socket from TCP_SYN_*. 
+ * Connect() does not allow to get error notifications + * without closing the socket. + */ + while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { + release_sock(sk); + timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); + lock_sock(sk); + if (signal_pending(current) || !timeo) + break; + } + remove_wait_queue(sk_sleep(sk), &wait); + sk->sk_write_pending -= writebias; + return timeo; +} + +/* Function: + * 客户端套接口层用来建立连接请求的函数。 + * Parameter: + * sock: 套接字。 + * uaddr:目的地址。 + */ +int __ninet_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + int err; + long timeo; + + switch (sock->state) { + default: + err = -EINVAL; + goto out; + case SS_CONNECTED: /* 如果已经建立 */ + err = -EISCONN; + goto out; + case SS_CONNECTING: /* 如果正在建立 */ + err = -EALREADY; + break; + case SS_UNCONNECTED: + err = -EISCONN; + if (sk->sk_state != TCP_CLOSE) + goto out; + /* 调用tcp_nip_connect函数 */ + err = sk->sk_prot->connect(sk, uaddr, addr_len); + if (err < 0) + goto out; + /* 先切换成正在连接,等后续操作 */ + sock->state = SS_CONNECTING; + err = -EINPROGRESS; + break; + } + /* 获取阻塞时间 */ + timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { + int writebias = 0; + /* Error code is set above */ + if (!timeo || !ninet_wait_for_connect(sk, timeo, writebias)) + goto out; + + err = sock_intr_errno(timeo); + if (signal_pending(current)) + goto out; + } + + if (sk->sk_state == TCP_CLOSE) + goto sock_error; + sock->state = SS_CONNECTED; + err = 0; + +out: + return err; +sock_error: + err = sock_error(sk) ? 
: -ECONNABORTED; + sock->state = SS_UNCONNECTED; + + sock->state = SS_DISCONNECTING; + goto out; +} +EXPORT_SYMBOL(__ninet_stream_connect); + +int ninet_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + int err; + + lock_sock(sock->sk); + err = __ninet_stream_connect(sock, uaddr, addr_len, flags); + release_sock(sock->sk); + return err; +} +EXPORT_SYMBOL(ninet_stream_connect); + +int ninet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + + switch (cmd) { + case SIOCADDRT: + case SIOCDELRT: + return nip_route_ioctl(net, cmd, (void __user *)arg); + case SIOCSIFADDR: + return nip_addrconf_add_ifaddr(net, (void __user *)arg); + case SIOCDIFADDR: + return nip_addrconf_del_ifaddr(net, (void __user *)arg); + default: + if (!sk->sk_prot->ioctl) + return -ENOIOCTLCMD; + return sk->sk_prot->ioctl(sk, cmd, arg); + } + + return 0; +} +EXPORT_SYMBOL(ninet_ioctl); + +/* register new IP socket */ +const struct proto_ops ninet_dgram_ops = { + .family = PF_NINET, + .owner = THIS_MODULE, + .release = ninet_release, + .bind = ninet_bind, + .connect = inet_dgram_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = ninet_getname, + .poll = datagram_poll, + .ioctl = ninet_ioctl, + .listen = sock_no_listen, + .shutdown = inet_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = inet_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, + .set_peek_off = sk_set_peek_off, +}; + +const struct proto_ops ninet_stream_ops = { + .family = PF_NINET, + .owner = THIS_MODULE, + .release = ninet_release, + .bind = ninet_bind, + .connect = ninet_stream_connect, + .socketpair = sock_no_socketpair, + .accept = inet_accept, + .getname = ninet_getname, + .poll = tcp_poll, + .ioctl = ninet_ioctl, + .listen = ninet_listen, + .shutdown = inet_shutdown, + 
.setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = inet_recvmsg, + .mmap = sock_no_mmap, + .sendpage = inet_sendpage, +}; + +static const struct net_proto_family ninet_family_ops = { + .family = PF_NINET, + .create = ninet_create, + .owner = THIS_MODULE, +}; + +int ninet_register_protosw(struct inet_protosw *p) +{ + struct list_head *lh; + struct inet_protosw *answer; + struct list_head *last_perm; + int protocol = p->protocol; + int ret; + + spin_lock_bh(&inetsw_nip_lock); + + ret = -EINVAL; + if (p->type >= SOCK_MAX) + goto out_illegal; + + /* If we are trying to override a permanent protocol, bail. */ + answer = NULL; + ret = -EPERM; + last_perm = &inetsw_nip[p->type]; + list_for_each(lh, &inetsw_nip[p->type]) { + answer = list_entry(lh, struct inet_protosw, list); + + /* Check only the non-wild match. */ + if (answer->flags & INET_PROTOSW_PERMANENT) { + if (protocol == answer->protocol) + break; + last_perm = lh; + } + + answer = NULL; + } + if (answer) + goto out_permanent; + + list_add_rcu(&p->list, last_perm); + ret = 0; +out: + spin_unlock_bh(&inetsw_nip_lock); + return ret; + +out_permanent: + pr_err("Attempt to override permanent protocol %d\n", protocol); + goto out; + +out_illegal: + pr_err("Ignoring attempt to register invalid socket type %d\n", + p->type); + goto out; +} +EXPORT_SYMBOL(ninet_register_protosw); + +void ninet_unregister_protosw(struct inet_protosw *p) +{ + if (INET_PROTOSW_PERMANENT & p->flags) { + pr_err("Attempt to unregister permanent protocol %d\n", + p->protocol); + } else { + spin_lock_bh(&inetsw_nip_lock); + list_del_rcu(&p->list); + spin_unlock_bh(&inetsw_nip_lock); + + synchronize_net(); + } +} +EXPORT_SYMBOL(ninet_unregister_protosw); + +int ninet_sk_rebuild_header(struct sock *sk) +{ + return 0; +} +EXPORT_SYMBOL_GPL(ninet_sk_rebuild_header); + +/* register to data link layer */ +static struct packet_type nip_packet_type __read_mostly = { + .type = 
cpu_to_be16(ETH_P_NEWIP), + .func = nip_nwk_input, +}; + +static int __init nip_packet_init(void) +{ + dev_add_pack(&nip_packet_type); + return 0; +} + +static int __net_init ninet_net_init(struct net *net) +{ + int err = 0; + return err; +} + +static void __net_exit ninet_net_exit(struct net *net) +{ + ; +} + +static struct pernet_operations ninet_net_ops = { + .init = ninet_net_init, + .exit = ninet_net_exit, +}; + +static int __init nip_nwk_init(void) +{ + struct list_head *r; + int err = 0; + + sock_skb_cb_check_size(sizeof(struct ninet_skb_parm)); + + DEBUG("NET: start to init nip network.\n"); + /* register the socket-side information for ninet_create */ + for (r = &inetsw_nip[0]; r < &inetsw_nip[SOCK_MAX]; ++r) + INIT_LIST_HEAD(r); + + if (disable_nip_mod) { + DEBUG("Loaded, but adminstratively disabled,"); + DEBUG("reboot required to enable\n"); + goto out; + } + + err = proto_register(&tcp_nip_prot, 1); + if (err) + goto out; + + err = proto_register(&nip_udp_prot, 1); + if (err) { + DEBUG_TRACE("failed to register udp proto!\n"); + goto out_udp_register_fail; + } + + err = sock_register(&ninet_family_ops); + if (err) { + DEBUG_TRACE("failed to register newip_family_ops!"); + goto out_sock_register_fail; + } + + err = register_pernet_subsys(&ninet_net_ops); + if (err) { + DEBUG_TRACE("failed to register ninet_net_ops!\n"); + goto register_pernet_fail; + } + + err = nip_icmp_init(); + if (err) { + DEBUG_TRACE("nip_icmp_init failed!\n"); + goto nip_icmp_fail; + } + + err = nndisc_init(); + if (err) { + DEBUG_TRACE("nndisc_init failed!\n"); + goto nndisc_fail; + } + + err = nip_route_init(); + if (err) + goto nip_route_fail; + + err = nip_addrconf_init(); + if (err) + goto nip_addr_fail; + + err = nip_udp_init(); + if (err) { + DEBUG_TRACE("failed to init udp layer!\n"); + goto udp_fail; + } + + + err = tcp_nip_init(); + if (err) { + DEBUG("failed to init tcp layer!\n"); + goto tcp_fail; + } else { + DEBUG("nip_tcp_init ok!"); + } + + err = nip_packet_init(); 
+ if (err) { + DEBUG_TRACE("failed to register to l2 layer!\n"); + goto nip_packet_fail; + } + + DEBUG("NewIP: init newip address family ok!"); + +out: + return err; + +nip_packet_fail: +udp_fail: +tcp_fail: + nip_addrconf_cleanup(); +nip_addr_fail: + nip_route_cleanup(); +nip_route_fail: +nndisc_fail: +nip_icmp_fail: + unregister_pernet_subsys(&ninet_net_ops); +register_pernet_fail: + sock_unregister(PF_NINET); +out_sock_register_fail: + proto_unregister(&nip_udp_prot); +out_udp_register_fail: + DEBUG_TRACE("newip family init failed!!!\n"); + goto out; +} + +module_init(nip_nwk_init); + +MODULE_ALIAS_NETPROTO(PF_NINET); + diff --git a/net/newip/datagram.c b/net/newip/datagram.c new file mode 100755 index 0000000000000000000000000000000000000000..1addd5b344158e5f69d073e2638f084d8cc5c9f2 --- /dev/null +++ b/net/newip/datagram.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include +#include +#include +#include +#include + +int nip_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + int res = 0; + return res; +} +EXPORT_SYMBOL_GPL(nip_datagram_connect); + +void nip_datagram_release_cb(struct sock *sk) +{ + ; +} +EXPORT_SYMBOL_GPL(nip_datagram_release_cb); + diff --git a/net/newip/icmp.c b/net/newip/icmp.c new file mode 100755 index 0000000000000000000000000000000000000000..7e6c1b140eaa9434f37bde93a8c9c15b5b661f9c --- /dev/null +++ b/net/newip/icmp.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_SYSCTL +#include +#endif + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +int nip_icmp_rcv(struct sk_buff *skb) +{ + int ret = 0; + struct nip_icmp_hdr *hdr = nip_icmp_header(skb); + u8 type = hdr->nip_icmp_type; + + DEBUG("rcv newip icmp packet. 
type = %u\n", type); + switch (type) { + case NIP_ARP_NS: + case NIP_ARP_NA: + ret = nndisc_rcv(skb); + break; + default: + DEBUG("nip icmp packet type error\n"); + } + return ret; +} + +static void nip_icmp_err(struct sk_buff *skb, + struct ninet_skb_parm *opt, + u8 type, uint8_t code, + int offset, __be32 info) +{ +} + +static const struct ninet_protocol nip_icmp_protocol = { + .handler = nip_icmp_rcv, + .err_handler = nip_icmp_err, + .flags = 0, +}; + +int __init nip_icmp_init(void) +{ + int ret; + + ret = ninet_add_protocol(&nip_icmp_protocol, IPPROTO_NIP_ICMP); + return ret; +} diff --git a/net/newip/ninet_connection_sock.c b/net/newip/ninet_connection_sock.c new file mode 100755 index 0000000000000000000000000000000000000000..43256f19577816fd6fc48b730b1fa96c73dfe2c0 --- /dev/null +++ b/net/newip/ninet_connection_sock.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Function: + * 请求处理的超时处理函数,用于重传SYN+ACK + * Parameter: + * data:请求控制块 + */ +static void ninet_reqsk_timer_handler(struct timer_list *t) +{ + struct request_sock *req = from_timer(req, t, rsk_timer); + struct sock *sk_listener = req->rsk_listener; + struct net *net = sock_net(sk_listener); + struct inet_connection_sock *icsk = inet_csk(sk_listener); + struct request_sock_queue *queue = &icsk->icsk_accept_queue; + int qlen, expire = 0, resend = 0; + int max_retries, thresh; + u8 defer_accept; + + /* 定义最大重传次数,thresh默认为5 */ + max_retries = icsk->icsk_syn_retries ? 
: net->ipv4.sysctl_tcp_synack_retries; + thresh = max_retries; + + /* 检查超时次数,重传SYN+ACK,重传次数+1 */ + if (req->num_timeout <= thresh) { + unsigned long timeo; + + req->rsk_ops->rtx_syn_ack(sk_listener, req); + req->num_retrans++; + /* 如果超时次数还是0,则次数加1,判断是否为首次超时 */ + if (req->num_timeout++ == 0) + atomic_dec(&queue->young); + /* 重置定时器,这里间隔只设置为1s */ + timeo = min(TCP_TIMEOUT_INIT, TCP_RTO_MAX); + mod_timer(&req->rsk_timer, jiffies + timeo); + return; + } + +drop: + inet_csk_reqsk_queue_drop_and_put(sk_listener, req); +} + +/* Function: + * 将request_sock加入连接队列和ehash表中,并设置SYNACK的超时重传定时器 + * Parameter: + * sk:传输控制块 + * req:连接请求块 + * timeout:初始的超时时间 + */ +void ninet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, + unsigned long timeout) +{ + req->num_retrans = 0; + req->num_timeout = 0; + req->sk = NULL; + + timer_setup(&req->rsk_timer, ninet_reqsk_timer_handler, + TIMER_PINNED); + mod_timer(&req->rsk_timer, jiffies + timeout); + + inet_ehash_insert(req_to_sk(req), NULL, NULL); + + /* 计数设置 */ + smp_wmb(); + refcount_set(&req->rsk_refcnt, 2 + 1); + + inet_csk_reqsk_queue_added(sk); +} +EXPORT_SYMBOL_GPL(ninet_csk_reqsk_queue_hash_add); + + +/* Function: + * 查看socket和链表中的是否冲突,不冲突则返回0 + * Parameter: + * sk: 要listen的传输控制块。 + * tb: bind bucket,储存bind的sock的链表 + */ +int ninet_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb, bool relax) +{ + return 0; +} +EXPORT_SYMBOL_GPL(ninet_csk_bind_conflict); + diff --git a/net/newip/ninet_hashtables.c b/net/newip/ninet_hashtables.c new file mode 100755 index 0000000000000000000000000000000000000000..42258f4ae168c5b8da63321b31ab469469bf4627 --- /dev/null +++ b/net/newip/ninet_hashtables.c @@ -0,0 +1,266 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +static u32 __nip_addr_jhash(const struct nip_addr *a, const u32 initval) +{ + u32 v = (__force u32)a->nip_addr_field32[0] ^ (__force 
u32)a->nip_addr_field32[1]; + + return jhash_3words(v, + (__force u32)a->nip_addr_field32[0], + (__force u32)a->nip_addr_field32[1], + initval); +} + +/* Function: + * 根据传入的参数返回hash值。 + * Parameter: + * net: 命名空间。 + * laddr: 目的地址。 + * lport: 目的端口。 + * faddr: 源地址。 + * fport: 源端口。 + */ +u32 ninet_ehashfn(const struct net *net, + const struct nip_addr *laddr, const u16 lport, + const struct nip_addr *faddr, const __be16 fport) +{ + static u32 ninet_ehash_secret __read_mostly; + static u32 ninet_hash_secret __read_mostly; + + u32 lhash, fhash; + + net_get_random_once(&ninet_ehash_secret, sizeof(ninet_ehash_secret)); + net_get_random_once(&ninet_hash_secret, sizeof(ninet_hash_secret)); + + lhash = (__force u32)laddr->nip_addr_field32[0]; //ipv6用的是s6_addr32[3],地址的最后32bits + fhash = __nip_addr_jhash(faddr, ninet_hash_secret); + + return __ninet_ehashfn(lhash, lport, fhash, fport, + ninet_ehash_secret + net_hash_mix(net)); +} + +/* Function: + * 将socket放进listen hash散列表中,以备后面服务器第二次握手查找到相应的socket + * Parameter: + * sk: 传输控制块 + * osk: old socket + * saddr_same: 地址比较函数 + */ +int __ninet_hash(struct sock *sk, struct sock *osk) +{ + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + struct inet_listen_hashbucket *ilb; + int err = 0; + + if (sk->sk_state != TCP_LISTEN) { + inet_ehash_nolisten(sk, osk, NULL); + return 0; + } + WARN_ON(!sk_unhashed(sk)); + ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; + + spin_lock(&ilb->lock); + + __sk_nulls_add_node_rcu(sk, &ilb->nulls_head); + sock_set_flag(sk, SOCK_RCU_FREE); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); +unlock: + spin_unlock(&ilb->lock); + + return err; +} +EXPORT_SYMBOL(__ninet_hash); + +int ninet_hash(struct sock *sk) +{ + int err = 0; + + if (sk->sk_state != TCP_CLOSE) { + local_bh_disable(); + err = __ninet_hash(sk, NULL); + local_bh_enable(); + } + + return err; +} +EXPORT_SYMBOL_GPL(ninet_hash); + +/* Function: 将sock从hash表中移除 */ +void ninet_unhash(struct sock *sk) +{ + struct inet_hashinfo 
*hashinfo = sk->sk_prot->h.hashinfo; + struct inet_listen_hashbucket *ilb = NULL; + spinlock_t *lock; + + if (sk_unhashed(sk)) + return; + + if (sk->sk_state == TCP_LISTEN) { + ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; + lock = &ilb->lock; + } else { + lock = inet_ehash_lockp(hashinfo, sk->sk_hash); + } + spin_lock_bh(lock); + if (sk_unhashed(sk)) + goto unlock; + + __sk_nulls_del_node_init_rcu(sk); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + +unlock: + spin_unlock_bh(lock); +} +EXPORT_SYMBOL_GPL(ninet_unhash); + +/* Function: + * 在ehash表中根据地址和端口来查找传输控制块。 + * 如果找到则表示已经经历了 + * 三次握手并且已建立了连接,可以进行正常的通信。 + * Parameter: + * net: 命名空间。 + * hashinfo: 类型为tcp_hashinfo的全局标量,保存当前系统的各种 + * 状态的tcp_sock(包括established, listen, bind) + * saddr: 源地址。 + * sport: 源端口。 + * daddr: 目的地址。 + * hnum: 目的端口。 + */ +struct sock *__ninet_lookup_established(struct net *net, + struct inet_hashinfo *hashinfo, + const struct nip_addr *saddr, + const __be16 sport, + const struct nip_addr *daddr, + const u16 hnum, + const int dif) +{ + struct sock *sk; + const struct hlist_nulls_node *node; + + const __portpair ports = INET_COMBINED_PORTS(sport, hnum); + + unsigned int hash = ninet_ehashfn(net, daddr, hnum, saddr, sport); + unsigned int slot = hash & hashinfo->ehash_mask; + + struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; + +begin: + sk_nulls_for_each_rcu(sk, node, &head->chain) { + DEBUG("%s: sk->sk_hash:%u", __func__, sk->sk_hash); + DEBUG("%s: dif:%d", __func__, dif); + if (sk->sk_hash != hash) + continue; + if (!NINET_MATCH(sk, net, saddr, daddr, ports, dif)) + continue; + if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) + goto out; + + if (unlikely(!NINET_MATCH(sk, net, saddr, daddr, ports, dif))) { + sock_gen_put(sk); + goto begin; + } + DEBUG("%s: find sock in ehash table!", __func__); + goto found; + } + if (get_nulls_value(node) != slot) + goto begin; +out: + sk = NULL; +found: + return sk; +} +EXPORT_SYMBOL(__ninet_lookup_established); + +static 
inline int nip_tcp_compute_score(struct sock *sk, struct net *net, + const unsigned short hnum, + const struct nip_addr *daddr, + const int dif, bool exact_dif) +{ + int score = -1; + + if (inet_sk(sk)->inet_num == hnum && sk->sk_family == PF_NINET && + net_eq(sock_net(sk), net)) { + score = 1; + } + + return score; +} + +struct sock *ninet_lookup_listener(struct net *net, + struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, + const struct nip_addr *saddr, + const __be16 sport, const struct nip_addr *daddr, + const unsigned short hnum, const int dif) +{ + unsigned int hash = inet_lhashfn(net, hnum); + struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; + int score, hiscore = 0, matches = 0, reuseport = 0; + bool exact_dif = false; + struct sock *sk, *result = NULL; + struct hlist_nulls_node *node; + u32 phash = 0; + + DEBUG("%s before nip_compute_score hnum=%d", __func__, hnum); + + sk_nulls_for_each(sk, node, &ilb->nulls_head) { + score = nip_tcp_compute_score(sk, net, hnum, daddr, dif, exact_dif); + + if (score > hiscore) { + result = sk; + hiscore = score; + DEBUG("%s: find sock in lhash table", __func__); + + return result; + } else if (score == hiscore && reuseport) { + matches++; + if (reciprocal_scale(phash, matches) == 0) + result = sk; + phash = next_pseudo_random32(phash); + } + } + DEBUG("%s: don't find sock in lhash table", __func__); + return result; +} +EXPORT_SYMBOL_GPL(ninet_lookup_listener); + +static int __ninet_check_established(struct inet_timewait_death_row *death_row, + struct sock *sk, const __u16 lport, + struct inet_timewait_sock **twp) +{ + return 0; +} + +static u32 ninet_sk_port_offset(const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + + return secure_newip_port_ephemeral(sk->sk_nip_rcv_saddr.nip_addr_field32, + sk->sk_nip_daddr.nip_addr_field32, + inet->inet_dport); +} + +/* Function: 随机绑定本地端口 */ +int ninet_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk) 
+{ + u32 port_offset = 0; + + if (!inet_sk(sk)->inet_num) + port_offset = ninet_sk_port_offset(sk); + + return __inet_hash_connect(death_row, sk, port_offset, + __ninet_check_established); +} +EXPORT_SYMBOL_GPL(ninet_hash_connect); + diff --git a/net/newip/nip.c b/net/newip/nip.c new file mode 100755 index 0000000000000000000000000000000000000000..3278617ea0377e9bc073c880c2b36ac12c8d94b5 --- /dev/null +++ b/net/newip/nip.c @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +struct nip_addr *nip_nexthop(struct nip_rt_info *rt, struct nip_addr *daddr) +{ + if (rt->rt_flags & RTF_GATEWAY) + return &rt->gateway; + else + return daddr; +} + +struct neighbour *nip_get_neigh(struct nip_addr *nipaddr, + struct net_device *dev) +{ + struct neighbour *neigh; + + neigh = neigh_lookup(&nnd_tbl, nipaddr, dev); + + return neigh; +} + +int nip_addr_check(struct nip_addr *addr) +{ + unsigned char first_byte, second_byte; + int addr_len, i, err; + + first_byte = addr->nip_addr_field8[0]; + second_byte = addr->nip_addr_field8[1]; + addr_len = addr->bitlen / NIP_ADDR_BIT_LEN_8; + + for (i = addr_len; i < NIP_ADDR_LEN_MAX; i++) { + if (addr->nip_addr_field8[i] > 0x00) { + DEBUG("%s: newip bitlen error\n", __func__); + err = 1; + return err; + } + } + + if (first_byte <= ADDR_FIRST_DC && addr_len == NIP_ADDR_LEN_1) + err = 0; + else if (first_byte <= ADDR_FIRST_F0 && addr_len == NIP_ADDR_LEN_2) + err = 0; + else if (first_byte == ADDR_FIRST_F1 && addr_len == NIP_ADDR_LEN_3) { + if (second_byte >= ADDR_SECOND_MIN) + err = 0; + else { + DEBUG("%s: addr3 check fail, second_byte too small", + __func__); + err = 1; + } + } else if (first_byte == ADDR_FIRST_F2 && addr_len == NIP_ADDR_LEN_5) { + if (second_byte >= ADDR_SECOND_MIN) + err = 0; + else { + DEBUG("%s: addr5 check fail, second_byte too small", + __func__); + err = 1; + } + } else if (first_byte 
== ADDR_FIRST_FF && addr_len == NIP_ADDR_LEN_2) + err = 0; /* 功能性地址 */ + else { + DEBUG("%s: addr check fail", __func__); + err = 1; + } + return err; +} diff --git a/net/newip/nip_addrconf.c b/net/newip/nip_addrconf.c new file mode 100755 index 0000000000000000000000000000000000000000..2a426ee3ba73202e099966a8236e6e2d0114c435 --- /dev/null +++ b/net/newip/nip_addrconf.c @@ -0,0 +1,883 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * newIP command kernel processing + * newIP Address [auto]configuration + * Linux newIP implementation + * + */ +#define pr_fmt(fmt) "NewIP ADDRCONF: " fmt + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_SYSCTL +#include +#endif +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define INFINITY_LIFE_TIME 0xFFFFFFFF + +/* Configured unicast address hash table */ +static struct hlist_head ninet_addr_lst[NIN_ADDR_HSIZE]; +static DEFINE_SPINLOCK(addrconf_hash_lock); + +static bool nip_chk_same_addr(struct net *net, const struct nip_addr *addr, + struct net_device *dev); +static int nip_get_firstaddr(const struct net_device *dev, + struct nip_addr *addr); +static int nip_addrconf_ifdown(struct net_device *dev, int how); + +static struct nip_devconf newip_devconf_dflt __read_mostly = { + .forwarding = 0, + .mtu = NEWIP_MIN_MTU, + .disable_nip = 0, + .ignore_routes_with_linkdown = 0, +}; + +/* Check if link is ready: is it up and is a valid qdisc available */ +static inline bool nip_addrconf_link_ready(const struct net_device *dev) +{ + return netif_oper_up(dev) && !qdisc_tx_is_noop(dev); +} + +static void nip_link_dev_addr(struct ninet_dev *idev, struct ninet_ifaddr *ifp) +{ + list_add_tail(&ifp->if_list, &idev->addr_list); +} + +static u32 ninet_addr_hash(const struct nip_addr *addr) +{ + 
/* Create a unicast address entry for @idev and publish it.
 *
 * On success the new entry is linked into the global ninet_addr_lst hash
 * and into idev->addr_list. Its refcnt is set to 1 for the caller and
 * nin_ifa_hold() is called once more after it is put on the device list.
 * The nin_dev_hold() taken at entry is kept on success and dropped on
 * failure.
 *
 * @idev:          NewIP device the address is added to
 * @addr:          address to add
 * @flags:         stored in ifa->flags
 * @valid_lft:     stored in ifa->valid_lft
 * @preferred_lft: stored in ifa->preferred_lft
 *
 * Return: the new entry, or ERR_PTR() with -ENODEV (device marked dead),
 * -EACCES (disable_nip set), -EEXIST (address already present in this
 * namespace), -ENOBUFS (allocation failed) or the error reported by
 * nip_addrconf_dst_alloc().
 */
static struct ninet_ifaddr *nip_add_addr(struct ninet_dev *idev,
					 const struct nip_addr *addr,
					 u32 flags, u32 valid_lft,
					 u32 preferred_lft)
{
	struct ninet_ifaddr *ifa = NULL;
	struct nip_rt_info *rt = NULL;
	unsigned int hash;
	int err = 0;

	rcu_read_lock_bh();

	nin_dev_hold(idev);

	if (idev->dead) {
		err = -ENODEV;
		goto out2;
	}

	if (idev->cnf.disable_nip) {
		err = -EACCES;
		goto out2;
	}

	spin_lock(&addrconf_hash_lock);

	/* The same address must not be configured twice within one network
	 * namespace (dev == NULL: match on any device).
	 */
	if (nip_chk_same_addr(dev_net(idev->dev), addr, NULL)) {
		DEBUG("%s: already assigned\n", __func__);
		err = -EEXIST;
		goto out;
	}

	/* GFP_ATOMIC: addrconf_hash_lock is held and BHs are disabled. */
	ifa = kzalloc(sizeof(*ifa), GFP_ATOMIC);
	if (!ifa) {
		DEBUG("%s: malloc failed\n", __func__);
		err = -ENOBUFS;
		goto out;
	}

	rt = nip_addrconf_dst_alloc(idev, addr);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		goto out;
	}

	neigh_parms_data_state_setall(idev->nd_parms);

	ifa->addr = *addr;

	spin_lock_init(&ifa->lock);
	INIT_HLIST_NODE(&ifa->addr_lst);
	ifa->flags = flags;
	ifa->valid_lft = valid_lft;
	ifa->preferred_lft = preferred_lft;
	ifa->tstamp = jiffies;
	ifa->cstamp = ifa->tstamp;

	ifa->rt = rt;

	ifa->idev = idev;
	/* For caller */
	refcount_set(&ifa->refcnt, 1);

	/* Add to big hash table */
	hash = ninet_addr_hash(addr);

	hlist_add_head_rcu(&ifa->addr_lst, &ninet_addr_lst[hash]);
	spin_unlock(&addrconf_hash_lock);

	write_lock(&idev->lock);
	/* Add to ninet_dev unicast addr list. */
	nip_link_dev_addr(idev, ifa);

	/* Called right after the entry is linked into idev->addr_list */
	nin_ifa_hold(ifa);
	write_unlock(&idev->lock);

out2:
	rcu_read_unlock_bh();

	if (likely(err == 0)) {
		DEBUG("%s: success! idev->refcnt=%u\n", __func__,
		      refcount_read(&idev->refcnt));
	} else {
		/* ifa is NULL or not yet published here, so a plain kfree()
		 * is enough; also drop the device reference taken at entry.
		 */
		kfree(ifa);
		nin_dev_put(idev);
		ifa = ERR_PTR(err);
	}

	return ifa;
out:
	/* Error exit for paths that still hold addrconf_hash_lock. */
	spin_unlock(&addrconf_hash_lock);
	goto out2;
}
addr_len, i, err; + + first_byte = addr->nip_addr_field8[0]; + second_byte = addr->nip_addr_field8[1]; + third_byte = addr->nip_addr_field8[2]; /* 2表示第3个B短地址 */ + addr_len = addr->bitlen / NIP_ADDR_BIT_LEN_8; + /* 短地址有效长度后面的字段数值应该都是0 */ + for (i = addr_len; i < NIP_ADDR_LEN_MAX; i++) { + if (addr->nip_addr_field8[i] > 0x00) { + DEBUG("%s: newip bitlen error\n", __func__); + err = 1; + return err; + } + } + + if (first_byte <= ADDR_FIRST_DC && addr_len == NIP_ADDR_LEN_1) { + err = 0; + } else if (first_byte <= ADDR_FIRST_F0 && addr_len == NIP_ADDR_LEN_2) { + if (first_byte > ADDR_FIRST_DC + 1 || + second_byte >= ADDR_SECOND_MIN_DD) { + err = 0; + } else { + DEBUG("%s: addr2 is not valid", __func__); + err = 1; + } + } else if (first_byte == ADDR_FIRST_F1 && addr_len == NIP_ADDR_LEN_3) { + if (second_byte >= ADDR_SECOND_MIN_F1) { + err = 0; + } else { + DEBUG("%s: addr3 is not valid", __func__); + err = 1; + } + } else if (first_byte == ADDR_FIRST_F2 && addr_len == NIP_ADDR_LEN_5) { + if (second_byte > 0 || third_byte >= ADDR_THIRD_MIN_F2) { + err = 0; + } else { + DEBUG("%s: addr5 is not valid", __func__); + err = 1; + } + } else if (first_byte == ADDR_FIRST_FF && addr_len == NIP_ADDR_LEN_2) { + /* 功能性地址 */ + err = 0; + } else { + DEBUG("%s: addr check fail", __func__); + err = 1; + } + return err; +} + +/* Manual configuration of address on an interface */ +static int ninet_addr_add(struct net *net, int ifindex, + const struct nip_addr *pfx, + __u32 ifa_flags, __u32 preferred_lft, __u32 valid_lft) +{ + struct ninet_ifaddr *ifp; + struct ninet_dev *idev; + struct net_device *dev; + unsigned long timeout; + clock_t expires; + u32 flags; + __u32 ifa_flags_tmp = ifa_flags; + __u32 valid_lft_tmp = valid_lft; + + ASSERT_RTNL(); + + /* check the lifetime */ + if (!valid_lft_tmp || preferred_lft > valid_lft_tmp) + return -EINVAL; + + dev = __dev_get_by_index(net, ifindex); + if (!dev) + return -ENODEV; + + idev = nip_addrconf_add_dev(dev); /* 挂接dev和idev */ + if 
(IS_ERR(idev)) + return PTR_ERR(idev); + + timeout = addrconf_timeout_fixup(valid_lft_tmp, HZ); + if (addrconf_finite_timeout(timeout)) { + expires = jiffies_to_clock_t(timeout * HZ); + valid_lft_tmp = timeout; + } else { + expires = 0; + flags = 0; + ifa_flags_tmp |= IFA_F_PERMANENT; + } + + timeout = addrconf_timeout_fixup(preferred_lft, HZ); + if (addrconf_finite_timeout(timeout)) { + if (timeout == 0) + ifa_flags_tmp |= IFA_F_DEPRECATED; + preferred_lft = timeout; + } + + ifp = nip_add_addr(idev, pfx, ifa_flags_tmp, + valid_lft_tmp, + preferred_lft); + if (!IS_ERR(ifp)) { + nin_ifa_put(ifp); + nip_ins_rt(ifp->rt); + DEBUG("%s: success! ifp->refcnt=%u\n", __func__, + refcount_read(&ifp->refcnt)); + return 0; + } + + return PTR_ERR(ifp); +} + +/* Nobody refers to this ifaddr, destroy it */ +void ninet_ifa_finish_destroy(struct ninet_ifaddr *ifp) +{ + WARN_ON(!hlist_unhashed(&ifp->addr_lst)); + + DEBUG("%s: before idev put. idev->refcnt=%u\n", __func__, + refcount_read(&ifp->idev->refcnt)); + + nin_dev_put(ifp->idev); + + nip_rt_put(ifp->rt); + + kfree_rcu(ifp, rcu); +} + +static void nip_del_addr(struct ninet_ifaddr *ifp) +{ + int state; + + ASSERT_RTNL(); + + spin_lock_bh(&ifp->lock); + state = ifp->state; + ifp->state = NINET_IFADDR_STATE_DEAD; + spin_unlock_bh(&ifp->lock); + + if (state == NINET_IFADDR_STATE_DEAD) + goto out; + + spin_lock_bh(&addrconf_hash_lock); + hlist_del_init_rcu(&ifp->addr_lst); + spin_unlock_bh(&addrconf_hash_lock); + + write_lock_bh(&ifp->idev->lock); + + list_del_init(&ifp->if_list); + __nin_ifa_put(ifp); + + write_unlock_bh(&ifp->idev->lock); + + if (ifp->rt) { + /* 若ifp->rt不属于任何一个nip_fib_node, + * 此项删除操作后,rt->dst的引用计数不变 + */ + if (dst_hold_safe(&ifp->rt->dst)) + nip_del_rt(ifp->rt); + } + +out: + nin_ifa_put(ifp); +} + +static int ninet_addr_del(struct net *net, int ifindex, u32 ifa_flags, + const struct nip_addr *pfx) +{ + struct ninet_ifaddr *ifp; + struct ninet_dev *idev; + struct net_device *dev; + + dev = 
__dev_get_by_index(net, ifindex); + if (!dev) + return -ENODEV; + + idev = __nin_dev_get(dev); + if (!idev) + return -ENXIO; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifp, &idev->addr_list, if_list) { + if (nip_addr_eq(pfx, &ifp->addr)) { + nin_ifa_hold(ifp); + read_unlock_bh(&idev->lock); + + nip_del_addr(ifp); + DEBUG("nip_addr_del: success!"); + return 0; + } + } + read_unlock_bh(&idev->lock); + return -EADDRNOTAVAIL; +} + +int nip_addrconf_add_ifaddr(struct net *net, void __user *arg) +{ + struct nip_ifreq ireq; + + int err; + + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + if (copy_from_user(&ireq, arg, sizeof(struct nip_ifreq))) + return -EFAULT; + + rtnl_lock(); + err = ninet_addr_add(net, ireq.ifrn_ifindex, &ireq.ifrn_addr, + IFA_F_PERMANENT, INFINITY_LIFE_TIME, + INFINITY_LIFE_TIME); + rtnl_unlock(); + return err; +} + +int nip_addrconf_del_ifaddr(struct net *net, void __user *arg) +{ + struct nip_ifreq ireq; + int err; + + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + if (copy_from_user(&ireq, arg, sizeof(struct nip_ifreq))) + return -EFAULT; + + rtnl_lock(); + err = ninet_addr_del(net, ireq.ifrn_ifindex, 0, &ireq.ifrn_addr); + rtnl_unlock(); + return err; +} + +static bool nip_chk_same_addr(struct net *net, const struct nip_addr *addr, + struct net_device *dev) +{ + unsigned int hash = ninet_addr_hash(addr); + struct ninet_ifaddr *ifp; + + hlist_for_each_entry(ifp, &ninet_addr_lst[hash], addr_lst) { + if (!net_eq(dev_net(ifp->idev->dev), net)) + continue; + if (nip_addr_eq(&ifp->addr, addr)) { + if (!dev || ifp->idev->dev == dev) + return true; + } + } + return false; +} + +int nip_get_lladdr(struct net_device *dev, struct nip_addr *addr, + u32 banned_flags) +{ + struct ninet_dev *idev; + int err = -EADDRNOTAVAIL; + + rcu_read_lock(); + idev = __nin_dev_get(dev); + if (idev) { + read_lock_bh(&idev->lock); + err = __nip_get_lladdr(idev, addr, banned_flags); + read_unlock_bh(&idev->lock); + } + 
/* Netdevice notifier callback for NewIP address configuration.
 *
 * Creates the per-device NewIP state (ninet_dev) when a device with a
 * large enough MTU is registered or brought up, keeps idev->cnf.mtu and
 * the IF_READY flag in sync with the device, and tears NewIP down on the
 * device when it goes down, is unregistered, or its MTU drops below
 * NEWIP_MIN_MTU.
 *
 * @this:  notifier block (unused)
 * @event: NETDEV_* event code
 * @ptr:   notifier info, converted with netdev_notifier_info_to_dev()
 *
 * Return: NOTIFY_OK, or notifier_from_errno() of the nip_add_dev() error
 * when creating the ninet_dev fails during NETDEV_REGISTER.
 *
 * NOTE(review): run_pending is assigned in several branches but never read
 * in this function.
 */
static int nip_addrconf_notify(struct notifier_block *this, unsigned long event,
			       void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct ninet_dev *idev = __nin_dev_get(dev);
	struct net *net = dev_net(dev);
	int run_pending = 0;

	switch (event) {
	case NETDEV_REGISTER:
		/* Only attach NewIP state to devices whose MTU is usable. */
		if (!idev && dev->mtu >= NEWIP_MIN_MTU) {
			DEBUG("NIP_ADDRCONF(NETDEV_REGISTER): ");
			idev = nip_add_dev(dev);
			if (IS_ERR(idev))
				return notifier_from_errno(PTR_ERR(idev));
		}
		break;

	case NETDEV_CHANGEMTU:
		/* if MTU under NEWIP_MIN_MTU stop New IP on this interface. */
		if (dev->mtu < NEWIP_MIN_MTU) {
			/* "how" is non-zero for every device but loopback */
			nip_addrconf_ifdown(dev, dev != net->loopback_dev);
			break;
		}

		/* Existing state: just record the new MTU. */
		if (idev) {
			idev->cnf.mtu = dev->mtu;
			break;
		}

		/* allocate new idev */
		idev = nip_add_dev(dev);
		if (IS_ERR(idev))
			break;

		/* device is still not ready */
		if (!(idev->if_flags & IF_READY))
			break;

		run_pending = 1;
		/* fall through */
	case NETDEV_UP:
	case NETDEV_CHANGE:
		if (dev->flags & IFF_SLAVE)
			break;

		if (idev && idev->cnf.disable_nip)
			break;

		if (event == NETDEV_UP) {
			if (!nip_addrconf_link_ready(dev)) {
				/* device is not ready yet. */
				DEBUG("NIP_ADDRCONF(NETDEV_UP): ");
				DEBUG("%s:link is not ready\n", dev->name);
				break;
			}

			/* May yield an ERR_PTR; checked just below. */
			if (!idev && dev->mtu >= NEWIP_MIN_MTU)
				idev = nip_add_dev(dev);

			if (!IS_ERR_OR_NULL(idev)) {
				idev->if_flags |= IF_READY;
				run_pending = 1;
			}
		} else if (event == NETDEV_CHANGE) {
			if (!nip_addrconf_link_ready(dev)) {
				/* device is still not ready. */
				break;
			}

			if (idev)
				idev->if_flags |= IF_READY;

			DEBUG("NIP_ADDRCONF(NETDEV_CHANGE):");
			DEBUG("%s:link becomes ready\n", dev->name);

			run_pending = 1;
		}

		if (!IS_ERR_OR_NULL(idev)) {
			/* If the MTU changed during the interface down,
			 * when the interface up, the changed MTU must be
			 * reflected in the idev as well as routers.
			 */
			if (idev->cnf.mtu != dev->mtu &&
			    dev->mtu >= NEWIP_MIN_MTU) {
				idev->cnf.mtu = dev->mtu;
			}
			idev->tstamp = jiffies;

			/* If the changed mtu during down is lower than
			 * NEWIP_MIN_MTU stop New IP on this interface.
			 */
			if (dev->mtu < NEWIP_MIN_MTU)
				nip_addrconf_ifdown(dev,
						    dev != net->loopback_dev);
		}
		break;

	case NETDEV_DOWN:
	case NETDEV_UNREGISTER:
		/* Remove all addresses from this interface.
		 * "how" is 0 for NETDEV_DOWN and 1 for NETDEV_UNREGISTER.
		 */
		nip_addrconf_ifdown(dev, event != NETDEV_DOWN);
		break;
	default:
		break;
	}

	return NOTIFY_OK;
}
idev->refcnt=%u\n", __func__, + refcount_read(&idev->refcnt)); + nin_dev_put(idev); + } + return 0; +} + +static int nip_addr_proc_show(struct seq_file *seq, void *v) +{ + struct net *net = seq->private; + struct ninet_ifaddr *ifp; + int i, j; + + rcu_read_lock(); + for (i = 0; i < NIN_ADDR_HSIZE; i++) { + hlist_for_each_entry_rcu(ifp, &ninet_addr_lst[i], addr_lst) { + if (!net_eq(dev_net(ifp->idev->dev), net)) + continue; + + for (j = 0; j < ifp->addr.bitlen / NIP_ADDR_BIT_LEN_8; + j++) { + seq_printf(seq, "%02x", + ifp->addr.nip_addr_field8[j]); + } + seq_printf(seq, "\t%8s\n", + ifp->idev->dev ? ifp->idev->dev->name : ""); + } + } + rcu_read_unlock(); + return 0; +} + +static int __net_init nip_addr_net_init(struct net *net) +{ + int err = -ENOMEM; + struct nip_devconf *dflt; + + dflt = kmemdup(&newip_devconf_dflt, + sizeof(newip_devconf_dflt), + GFP_KERNEL); + if (!dflt) + goto err_alloc_dflt; + + net->newip.devconf_dflt = dflt; + + if (!proc_create_net_single("nip_addr", 0444, net->proc_net, + nip_addr_proc_show, NULL)) { + goto err_addr_proc; + } + + return 0; + +err_addr_proc: + kfree(dflt); +err_alloc_dflt: + return err; +} + +static void __net_exit nip_addr_net_exit(struct net *net) +{ + kfree(net->newip.devconf_dflt); + remove_proc_entry("nip_addr", net->proc_net); +} + +static struct pernet_operations nip_route_proc_net_ops = { + .init = nip_addr_net_init, + .exit = nip_addr_net_exit, +}; + +/* addrconf module should be notified of a device going up + */ +static struct notifier_block nip_dev_notf = { + .notifier_call = nip_addrconf_notify, + .priority = ADDRCONF_NOTIFY_PRIORITY, +}; + +int __init nip_addrconf_init(void) +{ + int err; + + err = register_pernet_subsys(&nip_route_proc_net_ops); + if (err < 0) { + DEBUG("%s: register_pernet_subsys failed!\n", __func__); + goto out; + } + + register_netdevice_notifier(&nip_dev_notf); + +out: + return err; +} + +void nip_addrconf_cleanup(void) +{ + struct net_device *dev; + int i; + + 
unregister_netdevice_notifier(&nip_dev_notf); + unregister_pernet_subsys(&nip_route_proc_net_ops); + + rtnl_lock(); + + /* clean dev list */ + for_each_netdev(&init_net, dev) { + if (!__nin_dev_get(dev)) + continue; + nip_addrconf_ifdown(dev, 1); + } + + /* Check hash table. */ + spin_lock_bh(&addrconf_hash_lock); + for (i = 0; i < NIN_ADDR_HSIZE; i++) + WARN_ON(!hlist_empty(&ninet_addr_lst[i])); + spin_unlock_bh(&addrconf_hash_lock); + rtnl_unlock(); +} + diff --git a/net/newip/nip_addrconf_core.c b/net/newip/nip_addrconf_core.c new file mode 100755 index 0000000000000000000000000000000000000000..8f563846d418119507b8121c13a48116f39ab7bf --- /dev/null +++ b/net/newip/nip_addrconf_core.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include + +static void nin_dev_finish_destroy_rcu(struct rcu_head *head) +{ + struct ninet_dev *idev = container_of(head, struct ninet_dev, rcu); + + kfree(idev); +} + +void nin_dev_finish_destroy(struct ninet_dev *idev) +{ + struct net_device *dev = idev->dev; + + WARN_ON(!list_empty(&idev->addr_list)); + + dev_put(dev); + if (!idev->dead) { + DEBUG(KERN_WARNING "Freeing alive ninet device.\n"); + return; + } + call_rcu(&idev->rcu, nin_dev_finish_destroy_rcu); +} +EXPORT_SYMBOL(nin_dev_finish_destroy); diff --git a/net/newip/nip_fib.c b/net/newip/nip_fib.c new file mode 100755 index 0000000000000000000000000000000000000000..d131b7287e6884e3b9b59e8f508479a20be8aec3 --- /dev/null +++ b/net/newip/nip_fib.c @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +static struct kmem_cache *nip_fib_node_kmem __read_mostly; + +/* 作用类似于IPv4中的0.0.0.0。 + * 不会作为真实地址出现,仅为本机用来特殊处理的常量 + */ +const struct nip_addr nip_any_addr = { + .bitlen = NIP_ADDR_BIT_LEN_16, + .nip_addr_field16[0] = htons(0xff09), +}; + +struct nip_fib_table *nip_fib_get_table(struct net *net, u32 
id) +{ + if (id == NIP_RT_TABLE_MAIN) + return net->newip.nip_fib_main_tbl; + else if (id == NIP_RT_TABLE_LOCAL) + return net->newip.nip_fib_local_tbl; + else + return NULL; +} + +static struct nip_fib_node *nip_node_alloc(void) +{ + struct nip_fib_node *fn; + + fn = kmem_cache_zalloc(nip_fib_node_kmem, GFP_ATOMIC); + + return fn; +} + +void nip_rt_free_pcpu(struct nip_rt_info *non_pcpu_rt) +{ + int cpu; + + if (!non_pcpu_rt->rt_pcpu) + return; + + for_each_possible_cpu(cpu) { + struct nip_rt_info **ppcpu_rt; + struct nip_rt_info *pcpu_rt; + + ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt_pcpu, cpu); + pcpu_rt = *ppcpu_rt; + if (pcpu_rt) { + dst_dev_put(&pcpu_rt->dst); + dst_release(&pcpu_rt->dst); + *ppcpu_rt = NULL; + } + } + + free_percpu(non_pcpu_rt->rt_pcpu); + non_pcpu_rt->rt_pcpu = NULL; +} +EXPORT_SYMBOL_GPL(nip_rt_free_pcpu); + +static u32 ninet_route_hash(const struct nip_addr *addr) +{ + return hash_32(nip_addr_hash(addr), NIN_ROUTE_HSIZE_SHIFT); +} + +struct nip_fib_node *nip_fib_locate(struct hlist_head *nip_tb_head, + const struct nip_addr *daddr) +{ + struct nip_fib_node *fib_node; + struct hlist_head *h; + unsigned int hash; + + hash = ninet_route_hash(daddr); + h = &nip_tb_head[hash]; + + hlist_for_each_entry_rcu(fib_node, h, fib_hlist) { + if (nip_addr_eq(&fib_node->nip_route_info->rt_dst, daddr)) + return fib_node; + } + + /* find default route */ + hash = ninet_route_hash(&nip_any_addr); + h = &nip_tb_head[hash]; + + hlist_for_each_entry_rcu(fib_node, h, fib_hlist) { + if (nip_addr_eq + (&fib_node->nip_route_info->rt_dst, &nip_any_addr)) { + return fib_node; + } + } + + return NULL; +} + +/* nip_tb_lock must be taken to avoid racing */ +int nip_fib_add(struct hlist_head *nip_tb_head, struct nip_rt_info *rt) +{ + struct nip_fib_node *fib_node, *new_node; + int err = 0; + struct hlist_head *h; + unsigned int hash; + + hash = ninet_route_hash(&rt->rt_dst); + h = &nip_tb_head[hash]; + + hlist_for_each_entry(fib_node, h, fib_hlist) { + if 
(nip_addr_eq(&fib_node->nip_route_info->rt_dst, + &rt->rt_dst)) { + err = -EEXIST; + goto fail; + } + } + + new_node = nip_node_alloc(); + if (!new_node) { + err = -ENOMEM; + goto fail; + } + new_node->nip_route_info = rt; + rcu_assign_pointer(rt->rt_node, new_node); + atomic_inc(&rt->rt_ref); + hlist_add_tail_rcu(&new_node->fib_hlist, h); + +out: + return err; + +fail: + dst_release_immediate(&rt->dst); + goto out; +} + +static void nip_fib_destroy_rcu(struct rcu_head *head) +{ + struct nip_fib_node *fn = container_of(head, struct nip_fib_node, rcu); + + nip_rt_release(fn->nip_route_info); + kfree(fn); +} + +/* nip_tb_lock must be taken to avoid racing */ +int nip_fib_del(struct nip_rt_info *rt, struct nl_info *info) +{ + struct nip_fib_node *fn = rcu_dereference_protected( + rt->rt_node, + lockdep_is_held(&rt->rt_table->nip_tb_lock)); + struct net *net = info->nl_net; + + if (!fn || rt == net->newip.nip_null_entry) + return -ENOENT; + + hlist_del_init_rcu(&fn->fib_hlist); + + /* 当fib_node释放后,fib_node指向的route_info才可释放 */ + RCU_INIT_POINTER(rt->rt_node, NULL); + call_rcu(&fn->rcu, nip_fib_destroy_rcu); + + return 0; +} + +static void nip_fib_free_table(struct nip_fib_table *table) +{ + kfree(table); +} + +/* caller must hold nip_tb_lock */ +static void nip_fib_clean_hash(struct net *net, struct hlist_head *nip_tb_head, + int (*func)(struct nip_rt_info *, void *arg), + void *arg) +{ + int i; + struct nip_fib_node *fn; + struct hlist_node *tmp; + struct nl_info info = { + .nl_net = net, + }; + + for (i = 0; i < NIN_ROUTE_HSIZE; i++) { + struct hlist_head *h = &nip_tb_head[i]; + + hlist_for_each_entry_safe(fn, tmp, h, fib_hlist) { + if (func(fn->nip_route_info, arg) < 0) { + DEBUG("%s: try to del nip_rt_info\n", __func__); + nip_fib_del(fn->nip_route_info, &info); + } + } + } +} + +void nip_fib_clean_all(struct net *net, + int (*func)(struct nip_rt_info *, void *arg), void *arg) +{ + struct nip_fib_table *main_tbl = net->newip.nip_fib_main_tbl; + struct nip_fib_table 
*local_tbl = net->newip.nip_fib_local_tbl; + + spin_lock_bh(&main_tbl->nip_tb_lock); + nip_fib_clean_hash(net, main_tbl->nip_tb_head, func, arg); + spin_unlock_bh(&main_tbl->nip_tb_lock); + + spin_lock_bh(&local_tbl->nip_tb_lock); + nip_fib_clean_hash(net, local_tbl->nip_tb_head, func, arg); + spin_unlock_bh(&local_tbl->nip_tb_lock); +} + +static void nip_fib_link_table(struct nip_fib_table *tb) +{ + /* 后续若加入多路由表,需在此处进行初始化 */ + spin_lock_init(&tb->nip_tb_lock); +} + +static void __net_init nip_fib_tables_init(struct net *net) +{ + nip_fib_link_table(net->newip.nip_fib_main_tbl); + nip_fib_link_table(net->newip.nip_fib_local_tbl); +} + +static int __net_init nip_fib_net_init(struct net *net) +{ + net->newip.nip_fib_main_tbl = + kzalloc(sizeof(*net->newip.nip_fib_main_tbl), GFP_KERNEL); + if (!net->newip.nip_fib_main_tbl) + goto out_fib_table_hash; + + net->newip.nip_fib_main_tbl->nip_tb_id = NIP_RT_TABLE_MAIN; + net->newip.nip_fib_main_tbl->flags = 1; + + net->newip.nip_fib_local_tbl = + kzalloc(sizeof(*net->newip.nip_fib_local_tbl), GFP_KERNEL); + if (!net->newip.nip_fib_local_tbl) + goto out_main_tbl; + + net->newip.nip_fib_local_tbl->nip_tb_id = NIP_RT_TABLE_LOCAL; + + nip_fib_tables_init(net); + + return 0; + +out_main_tbl: + kfree(net->newip.nip_fib_main_tbl); +out_fib_table_hash: + return -ENOMEM; +} + +static void nip_fib_net_exit(struct net *net) +{ + nip_fib_free_table(net->newip.nip_fib_main_tbl); + nip_fib_free_table(net->newip.nip_fib_local_tbl); +} + +static struct pernet_operations nip_fib_net_ops = { + .init = nip_fib_net_init, + .exit = nip_fib_net_exit, +}; + +int __init nip_fib_init(void) +{ + int ret = -ENOMEM; + + nip_fib_node_kmem = kmem_cache_create("nip_fib_nodes", + sizeof(struct nip_fib_node), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!nip_fib_node_kmem) + goto out; + + DEBUG("nip_fib_node size is %lu\n", + sizeof(struct nip_fib_node) + sizeof(struct nip_rt_info)); + + ret = register_pernet_subsys(&nip_fib_net_ops); + if (ret) + goto 
out_kmem_cache_create; + +out: + return ret; + +out_kmem_cache_create: + kmem_cache_destroy(nip_fib_node_kmem); + goto out; +} + +void nip_fib_gc_cleanup(void) +{ + unregister_pernet_subsys(&nip_fib_net_ops); + kmem_cache_destroy(nip_fib_node_kmem); +} + diff --git a/net/newip/nip_fib_rules.c b/net/newip/nip_fib_rules.c new file mode 100755 index 0000000000000000000000000000000000000000..b91c9ea9a834dc3f8e9de8bbd057b325ef39f33b --- /dev/null +++ b/net/newip/nip_fib_rules.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include +#include +#include + +struct dst_entry *nip_fib_rule_lookup(struct net *net, struct flow_nip *fln, + int flags, nip_pol_lookup_t lookup) +{ + struct nip_rt_info *rt; + + rt = lookup(net, net->newip.nip_fib_local_tbl, fln, flags); + if (rt != net->newip.nip_null_entry) + return &rt->dst; + nip_rt_put(rt); + rt = lookup(net, net->newip.nip_fib_main_tbl, fln, flags); + if (rt != net->newip.nip_null_entry) + return &rt->dst; + nip_rt_put(rt); + + dst_hold(&net->newip.nip_null_entry->dst); + return &net->newip.nip_null_entry->dst; +} diff --git a/net/newip/nip_input.c b/net/newip/nip_input.c new file mode 100755 index 0000000000000000000000000000000000000000..0a5bcf69ce11ac56789f0c9727aa008aba3989df --- /dev/null +++ b/net/newip/nip_input.c @@ -0,0 +1,422 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +u_char *decode_nip_addr(u_char *buf, struct nip_addr *addr) +{ + u_char *p; + int i; + + p = buf; + + if (*p <= ADDR_FIRST_DC) { + addr->nip_addr_field8[0] = *p; + p++; + addr->bitlen = NIP_ADDR_BIT_LEN_8; + } else if (*p > ADDR_FIRST_DC && *p <= ADDR_FIRST_F0) { + if (*p > ADDR_FIRST_DC + 1 || *(p + 1) >= ADDR_SECOND_MIN_DD) { + addr->nip_addr_field8[0] = *p; + p++; + addr->nip_addr_field8[1] = *p; + p++; + addr->bitlen = NIP_ADDR_BIT_LEN_16; + } else { + return 
NULL; + } + } else if (*p == ADDR_FIRST_F1) { + if (*(p + 1) >= ADDR_SECOND_MIN_F1) { + for (i = 0; i < NIP_ADDR_LEN_3; i++) { + addr->nip_addr_field8[i] = *p; + p++; + } + addr->bitlen = NIP_ADDR_BIT_LEN_24; + } else { + return NULL; + } + } else if (*p == ADDR_FIRST_F2) { + if (*(p + 1) > 0 || *(p + 2) >= ADDR_THIRD_MIN_F2) { /* 偏移2 */ + for (i = 0; i < NIP_ADDR_LEN_5; i++) { + addr->nip_addr_field8[i] = *p; + p++; + } + addr->bitlen = NIP_ADDR_BIT_LEN_40; + } else { + return NULL; + } + } else if (*p == ADDR_FIRST_FF) { + addr->nip_addr_field8[0] = *p; + p++; + addr->nip_addr_field8[1] = *p; + p++; + addr->bitlen = NIP_ADDR_BIT_LEN_16; + } else { + return NULL; + } + + return p; +} + +/* 底层驱动对报文接收skb len会补充字节对齐数据,skb->len会大于数据实际长度 + * 此函数根据nip报头中携带的total len实际长度刷新skb->len + */ +static void _nip_update_recv_skb_len(struct sk_buff *skb, + struct nip_hdr *niph) +{ + if (!niph->include_total_len) + return; + + skb->len = niph->total_len; +} + +int nip_nwk_input(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + int offset = 0; + struct nip_hdr niph = {0}; + + if (skb->pkt_type == PACKET_OTHERHOST) { + kfree_skb(skb); + return NET_RX_DROP; + } + + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + goto out; + + memset(NIPCB(skb), 0, sizeof(struct ninet_skb_parm)); + + offset = nip_hdr_parse(skb, &niph); + if (offset <= 0) { + DEBUG("%s check in failure. 
Drop a packet!\n", __func__); + goto drop; + } + + skb->transport_header = skb->network_header + offset; + skb_orphan(skb); + + /* skb复制后刷新nip skb->len */ + _nip_update_recv_skb_len(skb, &niph); + + nip_route_input(skb); + if (!skb_dst(skb)) { + DEBUG("%s: drop the packet!", __func__); + goto drop; + } + return dst_input(skb); + +drop: + kfree_skb(skb); +out: + return NET_RX_DROP; +} + +int nip_nwk_input_up(struct sk_buff *skb) +{ + const struct ninet_protocol *ipprot; + + rcu_read_lock(); + if (!pskb_pull(skb, skb_transport_offset(skb))) + goto discard; + + ipprot = rcu_dereference(ninet_protos[NIPCB(skb)->nexthdr]); + if (ipprot) { + ipprot->handler(skb); + } else { + kfree_skb(skb); + DEBUG("not found transport protol, drop this packet!"); + } + rcu_read_unlock(); + return 0; + +discard: + rcu_read_unlock(); + kfree_skb(skb); + return 0; +} +EXPORT_SYMBOL_GPL(nip_nwk_input_up); + +#if DESC("报文头解封装") +/* 从接收报文buf中获取bitmap, + * 并返回bitmap字节数,0表示获取失败 + */ +static int _get_nip_hdr_bitmap(u_char *buf, u8 bitmap[], u8 bitmap_index_max) +{ + int i = 0; + u_char *p = buf; + + if (!(*p & NIP_BITMAP_VALID_SET)) + return 0; + + do { + /* 避免报文攻击,构造无限长bitmap触发死循环 */ + if (i >= bitmap_index_max) + return 0; + bitmap[i++] = *p; + p++; + } while (*p & NIP_BITMAP_HAVE_MORE_BIT); + + DEBUG("%s: head->bitmap_num=%u", __func__, i); + return i; +} + +/* 获取接收报文头中ttl字段, + * 获取成功返回字段长度,否则返回0 + */ +static int _get_nip_hdr_ttl(u_char *buf, u8 bitmap, struct nip_hdr *niph) +{ + if (!(bitmap & NIP_BITMAP_INCLUDE_TTL)) + return 0; + + niph->ttl = *buf; + niph->include_ttl = true; + + return sizeof(niph->ttl); +} + +/* 获取接收报文头中hdr len字段, + * 获取成功返回字段长度,否则返回0 + */ +static int _get_nip_hdr_len(u_char *buf, u8 bitmap, struct nip_hdr *niph) +{ + if (!(bitmap & NIP_BITMAP_INCLUDE_HDR_LEN)) { + DEBUG("%s: no hdr len.", __func__); + return 0; + } + + niph->hdr_len = *buf; + niph->include_hdr_len = true; + + return sizeof(niph->hdr_len); +} + +/* 获取接收报文头中next_protocol字段, + * 获取成功返回字段长度,否则返回0 + */ 
+static int _get_nip_hdr_nexthdr(u_char *buf, u8 bitmap, + struct nip_hdr *niph) +{ + if (!(bitmap & NIP_BITMAP_INCLUDE_NEXT_HDR)) { + DEBUG("%s: no nexthdr.", __func__); + niph->is_bad_hdr = true; + return 0; + } + + niph->nexthdr = *buf; + niph->include_nexthdr = true; + + return sizeof(niph->nexthdr); +} + +/* 获取接收报文头中saddr字段, + * 获取成功返回字段长度,否则返回0 + */ +static int _get_nip_hdr_daddr(u_char *buf, u8 bitmap, + struct nip_hdr *niph) +{ + u_char *p; + + if (!(bitmap & NIP_BITMAP_INCLUDE_DADDR)) { + DEBUG("%s: no daddr.", __func__); + niph->is_bad_hdr = true; + return 0; + } + + p = decode_nip_addr(buf, &niph->daddr); + if (!p) { + DEBUG("%s: daddr decode fail.", __func__); + niph->is_bad_hdr = true; + return 0; + } + + if (nip_addr_check(&niph->daddr)) { + DEBUG("%s: daddr invalid.", __func__); + niph->is_bad_hdr = true; + return 0; + } + + niph->include_daddr = true; + + return (niph->daddr.bitlen / NIP_ADDR_BIT_LEN_8); +} + +/* 获取接收报文头中saddr字段, + * 获取成功返回字段长度,否则返回0 + */ +static int _get_nip_hdr_saddr(u_char *buf, u8 bitmap, struct nip_hdr *niph) +{ + u_char *p; + + if (!(bitmap & NIP_BITMAP_INCLUDE_SADDR)) + return 0; + + p = decode_nip_addr(buf, &niph->saddr); + if (!p) { + DEBUG("%s: saddr decode fail.", __func__); + niph->is_bad_hdr = true; + return 0; + } + + if (nip_addr_check(&niph->saddr)) { + DEBUG("%s: saddr invalid.", __func__); + niph->is_bad_hdr = true; + return 0; + } + + niph->include_saddr = true; + + return (niph->saddr.bitlen / NIP_ADDR_BIT_LEN_8); +} + +/* 获取接收报文头中hdr len字段, + * 获取成功返回字段长度,否则返回0 + */ +static int _get_nip_total_len(u_char *buf, u8 bitmap, struct nip_hdr *niph) +{ + if (!(bitmap & NIP_BITMAP_INCLUDE_TOTAL_LEN)) { + DEBUG("%s: no total len.", __func__); + return 0; + } + + niph->total_len = ntohs(*((u16 *)buf)); + niph->include_total_len = true; + + return sizeof(niph->total_len); +} + +static int _nip_hdr_bitmap0_parse(u_char *buf, u8 bitmap, struct nip_hdr *niph) +{ + int len = 0; + + len = _get_nip_hdr_ttl(buf, bitmap, niph); + 
len += _get_nip_hdr_len(buf + len, bitmap, niph); + len += _get_nip_hdr_nexthdr(buf + len, bitmap, niph); + len += _get_nip_hdr_daddr(buf + len, bitmap, niph); + len += _get_nip_hdr_saddr(buf + len, bitmap, niph); + len += _get_nip_total_len(buf + len, bitmap, niph); + return len; +} + +/* 接收报文头中包含未知字段检查函数 + * PS:永远注册在有效字段解析函数的后面 + */ +static int _nip_hdr_unknown_bit_check(u_char *buf, u8 bitmap, + struct nip_hdr *niph) +{ + niph->include_unknown_bit = true; + + DEBUG("%s: recv different ver pkt.", __func__); + return 0; +} + +static int (*hdr_parse_factory[2])(u_char *, u8, struct nip_hdr *) = { + _nip_hdr_bitmap0_parse, + _nip_hdr_unknown_bit_check, +}; + +static int factory_num = ARRAY_SIZE(hdr_parse_factory); + +/* 接收报文头校验,校验通过返回true,校验失败返回false */ +static bool nip_hdr_check(struct nip_hdr *niph) +{ + if (niph->include_saddr && + niph->include_daddr && + nip_addr_eq(&niph->saddr, &niph->daddr)) { + DEBUG("%s: saddr can`t be the same as daddr.", __func__); + return false; + } + + /* 相同版本收发包通讯可以不携带hdr_len + * 不同版本间收发包必须携带hdr + * 接收到新版本发送的报文头中未包含报文头长度 + * 字段,旧版本解析完认识字段后无法通过报文头 + * 长度偏移到报文payload部分 + */ + if (niph->include_unknown_bit && !niph->include_hdr_len) { + DEBUG("%s: different ver pkt but no hdr len.", __func__); + return false; + } + + if (niph->include_hdr_len) { + if (niph->hdr_len == 0 || + niph->hdr_len > NIP_HDR_MAX || + niph->hdr_len < niph->hdr_real_len) { + DEBUG("%s: hdr len invalid, hdr_len=%u, real_len=%u", + __func__, niph->hdr_len, niph->hdr_real_len); + return false; + } + } + + return true; +} + +/* 从接收到的报文中解析newip报文头, + * 返回报文头长度,解析失败则返回0 + * 调试OK后 nip_nwk_input 调用下面函数 + */ +int nip_hdr_parse(struct sk_buff *skb, struct nip_hdr *niph) +{ + int i = 0; + int len; + u_char *buf = skb->data; + u8 bitmap[BITMAP_MAX] = {0}; + int num = _get_nip_hdr_bitmap(buf, bitmap, BITMAP_MAX); + + if (num <= 0) { + DEBUG("%s: bitmap invalid, bitmap_num=%d", __func__, num); + return 0; + } + + /* 指向报文头bitmap后面的数据 */ + niph->hdr_real_len = num * 
sizeof(bitmap[0]); + buf += niph->hdr_real_len; + DEBUG("%s: after bitmap parse, hdr_real_len=%u, bitmap_num=%u", + __func__, niph->hdr_real_len, num); + + /* 开始获取报文头字段 */ + while (i < num) { + if (i >= factory_num) + break; + len = hdr_parse_factory[i](buf, bitmap[i], niph); + if (niph->is_bad_hdr) + return 0; + + buf += len; + niph->hdr_real_len += len; + if (niph->hdr_real_len >= skb->len) { + DEBUG("%s: skb read overflow, hdr_len=%u, skb_len=%u", + __func__, niph->hdr_real_len, skb->len); + return 0; + } + i++; + } + + if (nip_hdr_check(niph) == false) + return 0; + + NIPCB(skb)->dstaddr = niph->daddr; + NIPCB(skb)->srcaddr = niph->saddr; + NIPCB(skb)->nexthdr = niph->nexthdr; + + /* 对于相同版本交互,报头中不携带报头长度时, + * hdr_len等于0,此时使用hdr_real_len作为报文头长度 + */ + return niph->hdr_len > niph->hdr_real_len ? + niph->hdr_len : niph->hdr_real_len; +} +#endif + diff --git a/net/newip/nip_output.c b/net/newip/nip_output.c new file mode 100755 index 0000000000000000000000000000000000000000..c84987a2569f3bd0e90cec431d203fd5b2208026 --- /dev/null +++ b/net/newip/nip_output.c @@ -0,0 +1,694 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include + +unsigned short nip_check_sum_build(unsigned char *data, + unsigned short data_len, + struct nip_pseudo_header *chksum_header) +{ + unsigned int sum = 0; + + sum = nip_check_sum(data, data_len); + sum += nip_header_chksum(chksum_header); + + while (sum >> USHORT_PAYLOAD) + sum = (sum >> USHORT_PAYLOAD) + (sum & 0xffff); + + return (unsigned short)(~sum); +} + +void nip_insert_udp_hdr_checksum(u_char *buf, u_short check) +{ + if (buf) { + struct udphdr *uh; + + uh = (struct udphdr *)buf; + uh->check = htons(check); + } else { + DEBUG("%s skb->data = NULL", __func__); + } +} + +static void nip_build_udp_hdr(u_short sport, u_short 
dport, + u_short len, u_char *buf, + u_short check) +{ + struct udphdr *uh; + + if (buf) { + uh = (struct udphdr *)buf; + uh->source = sport; + uh->dest = dport; + uh->len = htons(len); + uh->check = htons(check); + } else { + DEBUG("%s skb->data = NULL", __func__); + } +} + +int nip_nwk_output_down(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct nip_addr *nexthop; + struct neighbour *neigh; + int ret = 0; + int res; + struct net_device *dev = skb_dst(skb)->dev; + bool is_v6gw = false; + + skb->protocol = htons(ETH_P_NEWIP); + skb->dev = dev; + + /* prepare to build ethernet header */ + nexthop = nip_nexthop((struct nip_rt_info *)dst, &NIPCB(skb)->dstaddr); + + rcu_read_lock_bh(); + + neigh = __nip_neigh_lookup_noref(dev, nexthop); + if (unlikely(!neigh)) + neigh = __neigh_create(&nnd_tbl, nexthop, dev, false); + if (!IS_ERR(neigh)) { + res = neigh_output(neigh, skb, is_v6gw); + + rcu_read_unlock_bh(); + return res; + } + DEBUG("find neigh and create neigh failed!"); + + rcu_read_unlock_bh(); + kfree_skb(skb); + return ret; +} + +int nip_nwk_forward(struct sk_buff *skb) +{ + return nip_nwk_output_down(NULL, NULL, skb); +} +EXPORT_SYMBOL_GPL(nip_nwk_forward); + +static int nip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + int err; + + err = dst_output(net, sk, skb); + return err; +} + +int nip_nwk_output(struct sk_buff *skb) +{ + struct net *net; + int err = 0; + + net = sock_net(skb->sk); + err = nip_local_out(net, skb->sk, skb); + if (err) + DEBUG("%s: failed to out skb!", __func__); + + return err; +} +EXPORT_SYMBOL_GPL(nip_nwk_output); + +int get_nip_addr_len(const struct nip_addr *addr) +{ + int len = 0; + + if (addr->nip_addr_field8[0] <= ADDR_FIRST_DC) + len = NIP_ADDR_LEN_1; + else if ((addr->nip_addr_field8[0] > ADDR_FIRST_DC && + addr->nip_addr_field8[0] <= ADDR_FIRST_F0) || + addr->nip_addr_field8[0] == ADDR_FIRST_FF) + len = NIP_ADDR_LEN_2; + else if 
(addr->nip_addr_field8[0] == ADDR_FIRST_F1) + len = NIP_ADDR_LEN_3; + else if (addr->nip_addr_field8[0] == ADDR_FIRST_F2) + len = NIP_ADDR_LEN_5; + else + return 0; + return len; +} + +u_char *build_nip_addr(const struct nip_addr *addr, u_char *buf) +{ + u_char *p = buf; + int i; + + if (addr->nip_addr_field8[0] <= ADDR_FIRST_DC) { + *p = addr->nip_addr_field8[0]; + } else if (((addr->nip_addr_field8[0] > ADDR_FIRST_DC) && + (addr->nip_addr_field8[0] <= ADDR_FIRST_F0)) || + (addr->nip_addr_field8[0] == ADDR_FIRST_FF)) { + *p = addr->nip_addr_field8[0]; + p++; + *p = addr->nip_addr_field8[NIP_ADDR_LEN_1]; + } else if (addr->nip_addr_field8[0] == ADDR_FIRST_F1) { + for (i = 0; i < NIP_ADDR_LEN_2; i++) { + *p = addr->nip_addr_field8[i]; + p++; + } + *p = addr->nip_addr_field8[NIP_ADDR_LEN_2]; + } else if (addr->nip_addr_field8[0] == ADDR_FIRST_F2) { + for (i = 0; i < NIP_ADDR_LEN_4; i++) { + *p = addr->nip_addr_field8[i]; + p++; + } + *p = addr->nip_addr_field8[NIP_ADDR_LEN_4]; + } else { + return NULL; + } + + return ++p; +} + +#if DESC("报文头封装") +/* 计算报文分片信息(分片个数,分片包长等) */ +static void _nip_calc_pkt_frag_num(u32 mtu, u32 usr_data_len, + struct nip_pkt_seg_info *seg_info) +{ + u32 mid_usr_pkt_len = (mtu - NIP_HDR_MAX - NIP_UDP_HDR_LEN) & + (~7); /* 8B整数倍 */ + u32 mid_pkt_num = usr_data_len / mid_usr_pkt_len; + u32 last_usr_pkt_len = 0; + + if (usr_data_len != 0) { + last_usr_pkt_len = usr_data_len % mid_usr_pkt_len; + if (last_usr_pkt_len == 0) { + last_usr_pkt_len = mid_usr_pkt_len; + mid_pkt_num--; + } + } + + seg_info->last_pkt_num = 1; + seg_info->mid_pkt_num = mid_pkt_num; + seg_info->mid_usr_pkt_len = mid_usr_pkt_len; + seg_info->last_usr_pkt_len = last_usr_pkt_len; +} + +static void _nip_hdr_bitmap0_fmt(struct nip_head_para *head) +{ + head->bitmap[0] |= NIP_BITMAP_VALID_SET; + head->bitmap[0] |= NIP_BITMAP_INCLUDE_NEXT_HDR; + + if (head->encap_ttl) + head->bitmap[0] |= NIP_BITMAP_INCLUDE_TTL; + + if (head->encap_hdr_len) + head->bitmap[0] |= 
NIP_BITMAP_INCLUDE_HDR_LEN; + + if (head->encap_daddr) + head->bitmap[0] |= NIP_BITMAP_INCLUDE_DADDR; + + if (head->encap_saddr) + head->bitmap[0] |= NIP_BITMAP_INCLUDE_SADDR; + + if (head->encap_total_len) + head->bitmap[0] |= NIP_BITMAP_INCLUDE_TOTAL_LEN; + + head->bitmap[0] &= (~NIP_BITMAP_HAVE_MORE_BIT); /* bitmap结束标志 */ + head->bitmap_num = 1; +} + +/* 封装newip报头的ttl字段 */ +static void _nip_hdr_ttl_encap(struct nip_head_para *head) +{ + u8 *buf; + + if (!(head->bitmap[0] & NIP_BITMAP_INCLUDE_TTL)) + return; + + buf = head->hdr_buf + head->hdr_buf_pos; + *buf = head->ttl; + head->hdr_buf_pos += sizeof(head->ttl); +} + +/* 封装newip报头长度 */ +static void _nip_hdr_len_encap(struct nip_head_para *head) +{ + u8 *buf; + + if (!(head->bitmap[0] & NIP_BITMAP_INCLUDE_HDR_LEN)) + return; + + /* 报文头长度初始先填写0,等后面报文头 + * 全部封装完成得到实际长度后再刷新 + */ + buf = head->hdr_buf + head->hdr_buf_pos; + head->hdr_len_pos = buf; + *buf = 0; + head->hdr_buf_pos += 1; +} + +/* 更新newip报头长度 */ +static void _nip_update_hdr_len(struct nip_head_para *head) +{ + *head->hdr_len_pos = head->hdr_buf_pos; +} + +/* 封装newip报头的next header字段 */ +static void _nip_hdr_nexthdr_encap(struct nip_head_para *head) +{ + u8 *buf; + + if (!(head->bitmap[0] & NIP_BITMAP_INCLUDE_NEXT_HDR)) + return; + + buf = head->hdr_buf + head->hdr_buf_pos; + *buf = head->nexthdr; + head->hdr_buf_pos += sizeof(head->nexthdr); +} + +/* 封装newip报头的目的地址字段 */ +static void _nip_hdr_daddr_encap(struct nip_head_para *head) +{ + u8 *buf; + + if (!(head->bitmap[0] & NIP_BITMAP_INCLUDE_DADDR)) + return; + + buf = head->hdr_buf + head->hdr_buf_pos; + buf = build_nip_addr(&head->daddr, buf); + head->hdr_buf_pos += (head->daddr.bitlen / NIP_ADDR_BIT_LEN_8); +} + +/* 封装newip报头的源地址字段 */ +static void _nip_hdr_saddr_encap(struct nip_head_para *head) +{ + u8 *buf; + + if (!(head->bitmap[0] & NIP_BITMAP_INCLUDE_SADDR)) + return; + + buf = head->hdr_buf + head->hdr_buf_pos; + buf = build_nip_addr(&head->saddr, buf); + head->hdr_buf_pos += (head->saddr.bitlen 
/ NIP_ADDR_BIT_LEN_8); +} + +/* 封装newip报文总长度 */ +static void _nip_hdr_total_len_encap(struct nip_head_para *head) +{ + u16 *buf; + + if (!(head->bitmap[0] & NIP_BITMAP_INCLUDE_TOTAL_LEN)) + return; + + /* 报文总长度初始先填写0,等后面报文 + * 全部封装完成得到实际长度后再刷新 + */ + buf = (u16 *)(head->hdr_buf + head->hdr_buf_pos); + head->total_len_pos = buf; + *buf = 0; + head->hdr_buf_pos += sizeof(head->total_len); +} + +/* 更新newip报文总长度 */ +/* 调用前需要设置head->total_len */ +void _nip_update_total_len(struct nip_head_para *head) +{ + if (!(head->bitmap[0] & NIP_BITMAP_INCLUDE_TOTAL_LEN)) + return; + + *head->total_len_pos = htons(head->total_len); +} + +/* bitmap格式化函数列表 */ +static void (*bitmap_fmt_factory[1])(struct nip_head_para *) = { + _nip_hdr_bitmap0_fmt, +}; + +static int fmt_factory_num = ARRAY_SIZE(bitmap_fmt_factory); + +/* 将bitmap封装到newip报文头buf中 */ +static void _nip_hdr_bitmap_encap(struct nip_head_para *head) +{ + int i; + u8 *buf; + + for (i = 0; i < head->bitmap_num; i++) { + buf = head->hdr_buf + head->hdr_buf_pos; + *buf = head->bitmap[i]; + head->hdr_buf_pos += 1; + } +} + +/* 将bitmap0对应的字段封装到newip报文头buf中 */ +static void _nip_hdr_bitmap0_encap(struct nip_head_para *head) +{ + /* 函数调用顺序必须和bitmap格式设置顺序保持一致 */ + _nip_hdr_ttl_encap(head); + _nip_hdr_len_encap(head); + _nip_hdr_nexthdr_encap(head); + _nip_hdr_daddr_encap(head); + _nip_hdr_saddr_encap(head); + _nip_hdr_total_len_encap(head); +} + +static void (*hdr_encap_factory[1])(struct nip_head_para *) = { + _nip_hdr_bitmap0_encap, +}; + +static int encap_factory_num = ARRAY_SIZE(hdr_encap_factory); + +/* 封装待发送报文newip报头 */ +void _nip_hdr_encap(struct nip_head_para *head) +{ + int i; + + /* bitmap格式化 */ + for (i = 0; i < fmt_factory_num; i++) + bitmap_fmt_factory[i](head); + + /* 将bitmap封装到newip报文头buf中 */ + _nip_hdr_bitmap_encap(head); + + /* 将bitmap对应的字段封装到newip报文头buf中 */ + for (i = 0; i < encap_factory_num; i++) + hdr_encap_factory[i](head); + + /* 根据报文头实际长度刷新报文头长度字段 */ + _nip_update_hdr_len(head); +} + +/* 封装udp发送报文newip报头bitmap */ 
+void _nip_udp_bitmap_flag_encap(struct nip_head_para *head) +{ + head->encap_ttl = 1; + head->encap_daddr = 1; + head->encap_saddr = 1; + head->encap_hdr_len = 1; +} + +/* 封装待tcp发送报文newip报头bitmap */ +/* tcp, arp icmp使用此接口 */ +void _nip_comm_bitmap_flag_encap(struct nip_head_para *head) +{ + head->encap_ttl = 1; + head->encap_daddr = 1; + head->encap_saddr = 1; + head->encap_hdr_len = 1; + head->encap_total_len = 1; +} + +unsigned short nip_get_output_checksum(struct sk_buff *skb, + struct nip_head_para *head) +{ + struct nip_pseudo_header nph = {0}; + u8 *udp_hdr = skb_transport_header(skb); + + nph.next_header = IPPROTO_UDP; + nph.src_addr = NIPCB(skb)->srcaddr; + nph.dst_addr = NIPCB(skb)->dstaddr; + + nph.check_len = head->trans_hdr_len + head->usr_data_len; + return nip_check_sum_build(udp_hdr, nph.check_len, &nph); +} +#endif + +#if DESC("UDP报文分段发送") +static struct sk_buff *_nip_alloc_skb(struct sock *sk, + struct nip_head_para *head, + struct nip_pkt_seg_info *seg_info, + struct dst_entry *dst) +{ + int len = NIP_ETH_HDR_LEN + NIP_HDR_MAX + + head->trans_hdr_len + seg_info->mid_usr_pkt_len; + struct sk_buff *skb = alloc_skb(len, 0); + + if (!skb) { + DEBUG("%s: no space for skb", __func__); + return NULL; + } + + skb->protocol = htons(ETH_P_NEWIP); + skb->ip_summed = CHECKSUM_NONE; + skb->csum = 0; + skb->sk = sk; + + /* 可能会分段发送,多次使用dst, + * 因此每次申请skb时对dst增加1次引用计数。 + * 但上层路由查找dst匹配时已增加过1次, + * 对于发送流程会多增加1次引用计数, + * 因此在所有分段都发送完后还需要 + * 手动减少一次引用计数, + * 调用dst_release,内部判断引用计数为0时释放dst + */ + dst_hold(dst); + DEBUG("%s: malloc_len=%d, dst->__refcnt=%u", __func__, + len, atomic_read(&dst->__refcnt)); + skb_dst_set(skb, dst); + memset(NIPCB(skb), 0, sizeof(struct ninet_skb_parm)); + + return skb; +} + +static int _nip_segment_output(struct sock *sk, + struct nip_head_para *head, + struct nip_pkt_seg_info *seg_info, + struct dst_entry *dst) +{ + int len; + int ret; + struct msghdr *from = (struct msghdr *)head->usr_data; + struct sk_buff *skb = _nip_alloc_skb(sk, 
head, seg_info, dst); + unsigned short check = 0; + + if (IS_ERR_OR_NULL(skb)) { + DEBUG("%s: skb alloc fail", __func__); + return -ENOMEM; + } + + /* 预留以太头位置(下发到链路层后再填写) */ + skb_reserve(skb, NIP_ETH_HDR_LEN); + + /* 填写网络层报文头(newip) */ + skb_reset_network_header(skb); + memcpy(skb->data, head->hdr_buf, head->hdr_buf_pos); + skb_reserve(skb, head->hdr_buf_pos); + NIPCB(skb)->dstaddr = head->daddr; + NIPCB(skb)->srcaddr = head->saddr; + + /* 填写传输层报文头(UDP) */ + skb_reset_transport_header(skb); + nip_build_udp_hdr(head->sport, head->dport, + head->trans_hdr_len + head->usr_data_len, + skb->data, 0); + skb_reserve(skb, head->trans_hdr_len); + len = copy_from_iter(skb->data, head->usr_data_len, &from->msg_iter); + if (len < 0) { + /* dst已经set到skb,skb释放时会自动释放dst */ + DEBUG("%s: copy from iter fail.(datalen=%u)", + __func__, head->usr_data_len); + kfree_skb(skb); + return -EFBIG; + } + + /* insert check sum */ + check = nip_get_output_checksum(skb, head); + nip_build_udp_hdr(head->sport, head->dport, + head->trans_hdr_len + head->usr_data_len, + skb->data - head->trans_hdr_len, check); + + /* 报文拷贝完成后刷新skb的data/tail */ + /* skb->data 至 skb->tail 间是newip完整数据 */ + skb_put(skb, head->usr_data_len); + skb->data = skb_network_header(skb); + /* 网络层报头+传输层报头+传输层数据 */ + skb->len = head->hdr_buf_pos + head->trans_hdr_len + + head->usr_data_len; + + ret = nip_nwk_output(skb); + DEBUG("%s: newip output finish.(ret=%d, datalen=%u)", + __func__, ret, head->usr_data_len); + return ret; +} + +/* 报文分段发送接口 */ +int nip_segment_output(struct sock *sk, void *from, int datalen, + int transhdrlen, const struct nip_addr *saddr, + ushort sport, const struct nip_addr *daddr, + ushort dport, struct dst_entry *dst) +{ + int i; + u32 ret = 0; + u32 mtu = dst_mtu(dst); + struct nip_pkt_seg_info seg_info = {0}; + struct nip_head_para head = {0}; + + head.saddr = *saddr; + head.daddr = *daddr; + head.sport = sport; + head.dport = dport; + head.usr_data = from; + head.ttl = NIP_DEFAULT_TTL; + head.nexthdr 
= IPPROTO_UDP; + head.trans_hdr_len = transhdrlen; + + _nip_calc_pkt_frag_num(mtu, datalen, &seg_info); + _nip_udp_bitmap_flag_encap(&head); + _nip_hdr_encap(&head); + + /* 发送中间数据段 */ + for (i = 0; i < seg_info.mid_pkt_num; i++) { + head.usr_data_len = seg_info.mid_usr_pkt_len; + ret = _nip_segment_output(sk, &head, &seg_info, dst); + if (ret) + goto end; + } + + /* 发送最后数据段 */ + if (seg_info.last_pkt_num) { + head.usr_data_len = seg_info.last_usr_pkt_len; + ret = _nip_segment_output(sk, &head, &seg_info, dst); + } + +end: + dst_release(dst); + return ret; +} +#endif + +/* 主要功能: + * 1. 根据fln,查找路由表,得到对应的dst + * 2. 根据路由表查找结果,得到源端newip地址, + * 并存入fln->saddr中 + */ +static int nip_dst_lookup_tail(struct net *net, const struct sock *sk, + struct dst_entry **dst, struct flow_nip *fln) +{ + int err; + struct nip_rt_info *rt; + + if (!(*dst)) + *dst = nip_route_output(net, sk, fln); + + err = (*dst)->error; + if (err) { + rt = NULL; + DEBUG("%s: nip_route_output search error!", __func__); + goto out_err_release; + } + + rt = (struct nip_rt_info *)*dst; + err = nip_route_get_saddr(net, rt, &fln->daddr, &fln->saddr); + + if (err) + goto out_err_release; + + return 0; + +out_err_release: + dst_release(*dst); + *dst = NULL; + + return err; +} + +struct dst_entry *nip_dst_lookup_flow(struct net *net, const struct sock *sk, + struct flow_nip *fln, + const struct nip_addr *final_dst) +{ + struct dst_entry *dst = NULL; + int err; + + err = nip_dst_lookup_tail(net, sk, &dst, fln); + if (err) + return ERR_PTR(err); + if (final_dst) + fln->daddr = *final_dst; + + return dst; +} +EXPORT_SYMBOL_GPL(nip_dst_lookup_flow); + +struct dst_entry *nip_sk_dst_lookup_flow(struct sock *sk, struct flow_nip *fln) +{ + struct dst_entry *dst = NULL; + int err; + + err = nip_dst_lookup_tail(sock_net(sk), sk, &dst, fln); + if (err) + return ERR_PTR(err); + + return dst; +} + +int nip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) +{ + int err = -EFAULT; + struct net *net = 
sock_net(sk); + struct nip_addr *saddr, *daddr; + struct dst_entry *dst; + struct flow_nip fln; + struct nip_head_para head = {0}; + + rcu_read_lock(); + skb->protocol = htons(ETH_P_NEWIP); + skb->ip_summed = CHECKSUM_NONE; + skb->csum = 0; + saddr = &(sk->sk_nip_rcv_saddr); + daddr = &(sk->sk_nip_daddr); + + head.saddr = *saddr; + head.daddr = *daddr; + head.ttl = NIP_DEFAULT_TTL; + head.nexthdr = IPPROTO_TCP; + _nip_comm_bitmap_flag_encap(&head); + _nip_hdr_encap(&head); + + /* nip报文头长度 + (tcp传输层包头长度 + 用户数据长度) */ + head.total_len = head.hdr_buf_pos + skb->len; + + _nip_update_total_len(&head); + fln.daddr = sk->sk_nip_daddr; + dst = __sk_dst_check(sk, 0); + if (!dst) { + DEBUG("%s: no dst cache for sk, search newip rt.", __func__); + dst = nip_route_output(net, sk, &fln); + if (!dst) { + DEBUG("%s: cannot find dst.", __func__); + goto error; + } + sk_dst_set(sk, dst); + } + skb_dst_set_noref(skb, dst); + + /* build nwk header */ + skb_push(skb, head.hdr_buf_pos); + memcpy(skb->data, head.hdr_buf, head.hdr_buf_pos); + + skb_reset_network_header(skb); + NIPCB(skb)->dstaddr = *daddr; + + err = nip_nwk_output(skb); + if (err) + DEBUG("%s: failed to send skb.", __func__); + else + DEBUG("%s: send a skb ok.", __func__); + + rcu_read_unlock(); + return 0; +error: + rcu_read_unlock(); + return -1; +} diff --git a/net/newip/nndisc.c b/net/newip/nndisc.c new file mode 100755 index 0000000000000000000000000000000000000000..46812b82d2d7df688ca91ec5586c2373792a8a8e --- /dev/null +++ b/net/newip/nndisc.c @@ -0,0 +1,605 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_SYSCTL +#include +#endif + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include + +/* #define NUD_INCOMPLETE 0x01 + * 
邻居请求报文已经发送,但尚未收到应答的状态 + * #define NUD_REACHABLE 0x02 + * 可达状态,已经收到了邻居应答报文,表明邻居可达 + * #define NUD_STALE 0x04 + * 闲置状态,长时间未得到确认,且闲置时间超过额定时间 + * #define NUD_DELAY 0x08 + * 确认时间超时,但闲置未超过额定时间,需得到确认报文 + * #define NUD_PROBE 0x10 + * 由NUD_DELAY长时间未收到确认得到,发送ARP请求报文 + * #define NUD_FAILED 0x20 + * 邻居不可达状态 + * #define NUD_NOARP 0x40 + * 表示不需要ARP状态转变的邻居项状态 + * #define NUD_PERMANENT 0x80 + * 表示该邻居项状态是永久的,不用转变 + * #define NUD_NONE 0x00 + * 邻居项初始化状态 + */ + +static void nndisc_solicit(struct neighbour *neigh, struct sk_buff *skb); + +struct nip_addr nip_broadcast_addr_arp = { + .bitlen = NIP_ADDR_BIT_LEN_16, + .nip_addr_field8[0] = 0xFF, + .nip_addr_field8[1] = 0x04, +}; + +static u32 nndisc_hash(const void *pkey, + const struct net_device *dev, __u32 *fhash_rnd); +static bool nndisc_key_eq(const struct neighbour *neigh, const void *pkey); +static int nndisc_constructor(struct neighbour *neigh); + +static void nndisc_error_report(struct neighbour *neigh, struct sk_buff *skb) +{ + kfree_skb(skb); +} + +/* 普通场景 */ +static const struct neigh_ops nndisc_generic_ops = { + .family = AF_NINET, + .solicit = nndisc_solicit, + .output = neigh_resolve_output, + .connected_output = neigh_connected_output, +}; + +/* cache存在场景 */ +static const struct neigh_ops nndisc_hh_ops = { + .family = AF_NINET, + .solicit = nndisc_solicit, + .error_report = nndisc_error_report, + .output = neigh_resolve_output, + .connected_output = neigh_resolve_output, +}; + +/* 未挂接操作函数场景,采用不封装报文直接发送的形式 */ +static const struct neigh_ops nndisc_direct_ops = { + .family = AF_NINET, + .output = neigh_direct_output, + .connected_output = neigh_direct_output, +}; + +struct neigh_table nnd_tbl = { + .family = AF_NINET, + .key_len = sizeof(struct nip_addr), + .protocol = cpu_to_be16(ETH_P_NEWIP), + .hash = nndisc_hash, + .key_eq = nndisc_key_eq, + .constructor = nndisc_constructor, + .id = "nndisc_cache", + .parms = { + .tbl = &nnd_tbl, + .reachable_time = ND_REACHABLE_TIME, + .data = { + [NEIGH_VAR_MCAST_PROBES] = 3, + 
/*
 * Constructor hook of nnd_tbl: initialise a freshly created neighbour
 * entry for its device.
 *
 * Selects the neighbour parameters, the initial NUD state, the hardware
 * address (for entries that need no resolution) and the ops/output
 * functions.
 *
 * Returns 0 on success, -EINVAL when the device has no NewIP device
 * data (nin_dev_get() returned NULL).
 */
static int nndisc_constructor(struct neighbour *neigh)
{
	struct nip_addr *addr = (struct nip_addr *)&neigh->primary_key;
	struct net_device *dev = neigh->dev;
	struct ninet_dev *nin_dev;
	struct neigh_parms *parms;
	bool is_broadcast = nip_addr_eq(addr, &nip_broadcast_addr_arp);

	nin_dev = nin_dev_get(dev);
	if (!nin_dev)
		return -EINVAL;

	/* replace the entry's parms reference with a clone of the
	 * device's nd_parms
	 */
	parms = nin_dev->nd_parms;
	__neigh_parms_put(neigh->parms);
	neigh->parms = neigh_parms_clone(parms);

	neigh->type = RTN_UNICAST;
	if (!dev->header_ops) {
		/* no link-layer header to build: send packets directly */
		neigh->nud_state = NUD_NOARP;
		neigh->ops = &nndisc_direct_ops;
		neigh->output = neigh_direct_output;
	} else {
		if (is_broadcast ||
		    (dev->flags & IFF_POINTOPOINT)) {
			/* broadcast target or point-to-point link: use the
			 * device's broadcast hardware address
			 */
			neigh->nud_state = NUD_NOARP;
			memcpy(neigh->ha, dev->broadcast, dev->addr_len);
		} else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) {
			/* NOARP or loopback device: use the device's own
			 * hardware address
			 */
			neigh->nud_state = NUD_NOARP;
			memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
			if (dev->flags & IFF_LOOPBACK)
				neigh->type = RTN_LOCAL;
		}

		/* pick the ops table depending on whether the device
		 * provides a header cache callback
		 */
		if (dev->header_ops->cache)
			neigh->ops = &nndisc_hh_ops;
		else
			neigh->ops = &nndisc_generic_ops;

		/* entries already in a valid state use the connected
		 * output path, all others the resolving one
		 */
		if (neigh->nud_state & NUD_VALID)
			neigh->output = neigh->ops->connected_output;
		else
			neigh->output = neigh->ops->output;
	}

	/* drop the reference taken by nin_dev_get() */
	nin_dev_put(nin_dev);

	return 0;
}
true : false;
+}
+/* Write a neighbour-solicitation payload at skb->data: a zeroed ICMP header
+ * typed NIP_ARP_NS with checksum 0, followed by whatever build_nip_addr()
+ * emits for @solicit into msg->data.  The pointer build_nip_addr() returns
+ * is stored in the local p and not used afterwards.
+ */
+static void nndisc_payload_ns_pack(const struct nip_addr *solicit,
+				   struct sk_buff *skb)
+{
+	struct nnd_msg *msg = (struct nnd_msg *)skb->data;
+	u_char *p = msg->data;
+
+	memset(&msg->icmph, 0, sizeof(msg->icmph));
+	msg->icmph.nip_icmp_type = NIP_ARP_NS;
+	msg->icmph.nip_icmp_cksum = 0;
+	p = build_nip_addr(solicit, p);
+}
+/* Allocate a route entry for @dev through nip_dst_alloc() (flags 0), mark it
+ * DST_HOST, point input/output at nip_nwk_input_up/nip_nwk_output_down and
+ * set its refcount to 1.  Returns the embedded dst_entry, or NULL when the
+ * allocation fails.
+ */
+static struct dst_entry *nndisc_dst_alloc(struct net_device *dev)
+{
+	struct nip_rt_info *rt;
+	struct net *net = dev_net(dev);
+
+	rt = nip_dst_alloc(net, dev, 0);
+	if (!rt)
+		return NULL;
+
+	rt->dst.flags |= DST_HOST;
+	rt->dst.input = nip_nwk_input_up;
+	rt->dst.output = nip_nwk_output_down;
+	atomic_set(&rt->dst.__refcnt, 1);
+
+	return &rt->dst;
+}
+/* NS payload length: ICMP header plus get_nip_addr_len(@solicit). */
+static int get_ns_payload_len(const struct nip_addr *solicit)
+{
+	return sizeof(struct nip_icmp_hdr) + get_nip_addr_len(solicit);
+}
+/* Build one NS packet asking about @solicit, addressed from @saddr to @daddr,
+ * and hand it to dst_output() on @dev.  An skb or dst allocation failure
+ * aborts the send (the skb case is logged); a non-zero dst_output() result is
+ * only logged.
+ */
+static void nndisc_send_ns(struct net_device *dev,
+			   const struct nip_addr *solicit,
+			   const struct nip_addr *daddr,
+			   const struct nip_addr *saddr)
+{
+	int ret;
+	struct sk_buff *skb;
+	struct dst_entry *dst;
+	struct net *net;
+	struct sock *sk = NULL;
+	int payload_len = get_ns_payload_len(solicit);
+	int len = NIP_ETH_HDR_LEN + NIP_HDR_MAX + payload_len;
+	struct nip_head_para head = {0};
+	unsigned short checksum;
+
+	head.saddr = *saddr;
+	head.daddr = *daddr;
+	head.ttl = NIP_ARP_DEFAULT_TTL;
+	head.nexthdr = IPPROTO_NIP_ICMP;
+
+	_nip_comm_bitmap_flag_encap(&head);
+	_nip_hdr_encap(&head);
+
+	head.total_len = head.hdr_buf_pos + payload_len;
+	_nip_update_total_len(&head);
+
+	skb = alloc_skb(len, 0);
+	if (!skb) {
+		DEBUG("%s: no space for skbuff!", __func__);
+		return;
+	}
+
+	skb->protocol = htons(ETH_P_NEWIP);
+	skb->dev = dev;
+	skb->ip_summed = CHECKSUM_NONE;
+	skb->csum = 0;
+	memset(NIPCB(skb), 0, sizeof(struct ninet_skb_parm));
+
+	/* reserve space for hardware header */
+	skb_reserve(skb, NIP_ETH_HDR_LEN);
+	skb_reset_network_header(skb);
+
+	/* build nwk header */
+	memcpy(skb->data, head.hdr_buf, head.hdr_buf_pos);
+	skb_reserve(skb, head.hdr_buf_pos);
+	skb_reset_transport_header(skb);
+	NIPCB(skb)->dstaddr = head.daddr;
+
+	/* build transport header */
+	nndisc_payload_ns_pack(solicit, skb);
+	skb_reserve(skb, payload_len);
+
+	skb->data = skb_network_header(skb); /* rewind to the network header */
+	skb->len = head.hdr_buf_pos + payload_len;
+
+	dst = nndisc_dst_alloc(dev);
+	if (!dst) {
+		kfree_skb(skb);
+		return;
+	}
+
+	/* add check sum*/
+	checksum = nip_get_nndisc_send_checksum(skb, &head, payload_len);
+	nip_insert_nndisc_send_checksum(skb, checksum);
+
+	skb_dst_set(skb, dst);
+	net = dev_net(skb->dev);
+
+	/* Once the dst is attached to the skb and the output function has
+	 * been called, the output function releases both skb and dst whether
+	 * it succeeds or fails.
+	 */
+	ret = dst_output(net, sk, skb);
+	if (ret)
+		DEBUG("%s: dst output fail.", __func__);
+}
+/* Send one NS for @neigh's primary key from every NewIP address configured on
+ * neigh->dev, each addressed to nip_broadcast_addr_arp.  @skb is not used.
+ * NOTE(review): presumably installed as the neighbour solicit hook - the
+ * registration is not visible in this chunk.
+ */
+static void nndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
+{
+	struct net_device *dev = neigh->dev;
+	struct nip_addr *target = (struct nip_addr *)&neigh->primary_key;
+	struct nip_addr *saddr = NULL;
+	struct ninet_dev *idev;
+
+	/* Take the NewIP addresses of this dev as the source address of the
+	 * request packets.
+	 */
+	rcu_read_lock();
+	idev = __nin_dev_get(dev);
+	if (idev) {
+		read_lock_bh(&idev->lock);
+		if (!list_empty(&idev->addr_list)) {
+			struct ninet_ifaddr *ifp;
+
+			list_for_each_entry(ifp, &idev->addr_list, if_list) {
+				saddr = &ifp->addr;
+				nndisc_send_ns(dev, target,
+					       &nip_broadcast_addr_arp,
+					       saddr);
+			}
+		}
+		read_unlock_bh(&idev->lock);
+	} else {
+		DEBUG("%s:idev don't exist!!!!!!", __func__);
+	}
+	rcu_read_unlock();
+}
+/* Write a neighbour-advertisement payload at skb->data: a zeroed ICMP header
+ * typed NIP_ARP_NA, then one byte holding @mac_len, then @mac_len bytes
+ * copied from @smac.
+ */
+static void build_na_hdr(u_char *smac, u_char mac_len, struct sk_buff *skb)
+{
+	struct nnd_msg *msg = (struct nnd_msg *)skb->data;
+	u_char *p = msg->data;
+
+	memset(&msg->icmph, 0, sizeof(msg->icmph));
+	msg->icmph.nip_icmp_type = NIP_ARP_NA;
+	msg->icmph.nip_icmp_cksum = 0;
+	*p = mac_len;
+	p++;
+	memcpy(p, smac, mac_len);
+}
+/* NA payload length for @dev (see the breakdown inside). */
+static int get_na_payload_len(struct net_device *dev)
+{
+	/* ICMP header length +
+	 * one byte for the MAC-address-length field +
+	 * the MAC address length itself
+	 */
+	return sizeof(struct nip_icmp_hdr) + 1 + dev->addr_len;
+}
+/* Build one NA packet carrying @dev's hardware address, addressed from
+ * @saddr to @daddr, and hand it to dst_output().  An skb or dst allocation
+ * failure aborts the send; the dst_output() result is ignored.
+ */
+static void nndisc_send_na(struct net_device *dev, const struct nip_addr *daddr,
+			   const struct nip_addr *saddr)
+{
+	struct sk_buff *skb = NULL;
+	struct dst_entry *dst = NULL;
+	struct sock *sk = NULL;
+	int csummode = CHECKSUM_NONE;
+	int payload_len = get_na_payload_len(dev);
+	int len = NIP_ETH_HDR_LEN + NIP_HDR_MAX + payload_len;
+	u_char *smac = dev->dev_addr;
+	struct nip_head_para head = {0};
+	u_short checksum = 0;
+
+	head.saddr = *saddr;
+	head.daddr = *daddr;
+	head.ttl = NIP_ARP_DEFAULT_TTL;
+	head.nexthdr = IPPROTO_NIP_ICMP;
+
+	_nip_comm_bitmap_flag_encap(&head);
+	_nip_hdr_encap(&head);
+
+	head.total_len = head.hdr_buf_pos + payload_len;
+	_nip_update_total_len(&head);
+
+	skb = alloc_skb(len, 0);
+	if (!skb) {
+		DEBUG("%s: no space for skbuff!", __func__);
+		return;
+	}
+	skb->protocol = htons(ETH_P_NEWIP);
+	skb->ip_summed = csummode;
+	skb->csum = 0;
+	skb->dev = dev;
+	memset(NIPCB(skb), 0, sizeof(struct ninet_skb_parm));
+
+	/* reserve space for hardware header */
+	skb_reserve(skb, NIP_ETH_HDR_LEN);
+	skb_reset_network_header(skb);
+
+	/* build nwk header */
+	memcpy(skb->data, head.hdr_buf, head.hdr_buf_pos);
+	skb_reserve(skb, head.hdr_buf_pos);
+	skb_reset_transport_header(skb);
+	NIPCB(skb)->dstaddr = *daddr;
+
+	/* build na header */
+	build_na_hdr(smac, dev->addr_len, skb);
+
+	/* skip transport hdr */
+	skb_reserve(skb, payload_len);
+
+	/* set skb->data to point network header */
+	skb->data = skb_network_header(skb);
+	skb->len = head.hdr_buf_pos + payload_len;
+
+	dst = nndisc_dst_alloc(dev);
+	if (!dst) {
+		kfree_skb(skb);
+		return;
+	}
+
+	/* add check sum*/
+	checksum = nip_get_nndisc_send_checksum(skb, &head, payload_len);
+	nip_insert_nndisc_send_checksum(skb, checksum);
+
+	skb_dst_set(skb, dst);
+	dst_output(dev_net(skb->dev), sk, skb);
+}
+/* Return true when @addr equals one of the addresses on @dev's ninet_dev
+ * address list; false otherwise, including when @dev has no ninet_dev.
+ */
+bool nip_addr_local(struct net_device *dev, struct nip_addr *addr)
+{
+	struct ninet_dev *idev;
+	bool ret = false;
+
+	rcu_read_lock();
+	idev = __nin_dev_get(dev);
+	if (idev) {
+
read_lock_bh(&idev->lock); + /* 根据idev->addr给addr */ + if (!list_empty(&idev->addr_list)) { + struct ninet_ifaddr *ifp; + + list_for_each_entry(ifp, &idev->addr_list, if_list) { + if (nip_addr_eq(addr, &ifp->addr)) { + ret = true; + break; + } + } + } + read_unlock_bh(&idev->lock); + } + rcu_read_unlock(); + + return ret; +} + +int nndisc_rcv_ns(struct sk_buff *skb) +{ + struct nnd_msg *msg = (struct nnd_msg *)skb_transport_header(skb); + u_char *p = msg->data; + u_char *lladdr; + struct nip_addr addr = {0}; + struct neighbour *neigh; + struct ethhdr *eth; + struct net_device *dev = skb->dev; + int err = 0; + + p = decode_nip_addr(p, &addr); + if (!p) { + DEBUG("failure when decode source address!"); + err = -EFAULT; + goto out; + } + + if (nip_addr_check(&addr)) { + DEBUG("%s: icmp hdr addr invalid.", __func__); + err = -EFAULT; + goto out; + } + + if (!nip_addr_local(dev, &addr)) { + err = -ENXIO; + goto out; + } + + eth = (struct ethhdr *)skb_mac_header(skb); + lladdr = eth->h_source; + + /* checksum parse*/ + if (!nip_get_nndisc_rcv_checksum(skb, p)) { + DEBUG("%s:ns ICMP checksum failed, drop the packet", __func__); + err = -EINVAL; + goto out; + } + + neigh = __neigh_lookup(&nnd_tbl, &NIPCB(skb)->srcaddr, dev, lladdr || + !dev->addr_len); + if (neigh) { + neigh_update(neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_OVERRIDE, + 0); + neigh_release(neigh); + } + nndisc_send_na(dev, &NIPCB(skb)->srcaddr, &addr); +out: + kfree_skb(skb); + return err; +} + +int nndisc_rcv_na(struct sk_buff *skb) +{ + struct nnd_msg *msg = (struct nnd_msg *)skb_transport_header(skb); + u_char *p = msg->data; + u_char len; + u8 lladdr[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))]; + struct net_device *dev = skb->dev; + struct neighbour *neigh; + + len = *p; + p++; + memset(lladdr, 0, ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))); + memcpy(lladdr, p, len); + + /* checksum parse*/ + /* p+len=tail */ + /* 修改指针位置时需要特别注意确认 + * checksum输入是否改变 + */ + if (!nip_get_nndisc_rcv_checksum(skb, p + len)) { 
+ DEBUG("%s:na ICMP checksum failed! drop the packet!" + , __func__); + kfree_skb(skb); + return 0; + } + + neigh = neigh_lookup(&nnd_tbl, &NIPCB(skb)->srcaddr, dev); + if (neigh) { + neigh_update(neigh, lladdr, NUD_REACHABLE, + NEIGH_UPDATE_F_OVERRIDE, 0); + neigh_release(neigh); + kfree_skb(skb); + return 0; + } + kfree_skb(skb); + return -EFAULT; +} + +int nndisc_rcv(struct sk_buff *skb) +{ + int ret = 0; + struct nip_icmp_hdr *hdr = nip_icmp_header(skb); + u8 type = hdr->nip_icmp_type; + + switch (type) { + case NIP_ARP_NS: + ret = nndisc_rcv_ns(skb); + break; + case NIP_ARP_NA: + ret = nndisc_rcv_na(skb); + break; + default: + DEBUG("arp packet type error"); + } + + return ret; +} + +int __init nndisc_init(void) +{ + neigh_table_init(NEIGH_NND_TABLE, &nnd_tbl); + return 0; +} diff --git a/net/newip/protocol.c b/net/newip/protocol.c new file mode 100755 index 0000000000000000000000000000000000000000..7a0265fd5647f3191c994dc9894062d8de87547b --- /dev/null +++ b/net/newip/protocol.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include + +const struct ninet_protocol __rcu *ninet_protos[MAX_INET_PROTOS] __read_mostly; +EXPORT_SYMBOL(ninet_protos); + +int ninet_add_protocol(const struct ninet_protocol *prot, + unsigned char protocol) +{ + return !cmpxchg((const struct ninet_protocol **)&ninet_protos[protocol], + NULL, prot) ? 0 : -1; +} +EXPORT_SYMBOL(ninet_add_protocol); + +int ninet_del_protocol(const struct ninet_protocol *prot, + unsigned char protocol) +{ + int ret; + + ret = (cmpxchg((const struct ninet_protocol **)&ninet_protos[protocol], + prot, NULL) == prot) ? 
0 : -1;
+
+	synchronize_net();
+
+	return ret;
+}
+EXPORT_SYMBOL(ninet_del_protocol);
diff --git a/net/newip/route.c b/net/newip/route.c
new file mode 100755
index 0000000000000000000000000000000000000000..e8119bfaf7741dbcb807a93f021e8e6ac016f369
--- /dev/null
+++ b/net/newip/route.c
@@ -0,0 +1,922 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include /*copy_from_user()*/
+#include /*rtnl_lock()*/
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+static int nip_pkt_discard(struct sk_buff *skb);
+static int nip_pkt_discard_out(struct net *net, struct sock *sk,
+			       struct sk_buff *skb);
+static unsigned int nip_mtu(const struct dst_entry *dst);
+/* Template for the per-net "no route" entry: both hooks drop the packet and
+ * dst.error is -ENETUNREACH.  Copied per namespace in nip_route_net_init().
+ */
+static const struct nip_rt_info nip_null_entry_template = {
+	.dst = {
+		.__refcnt	= ATOMIC_INIT(1),
+		.__use		= 1,
+		.obsolete	= DST_OBSOLETE_FORCE_CHK,
+		.error		= -ENETUNREACH,
+		.input		= nip_pkt_discard,
+		.output		= nip_pkt_discard_out,
+	},
+	.rt_ref = ATOMIC_INIT(1),
+};
+/* Template for the per-net broadcast entry: packets go up through
+ * nip_nwk_input_up and down through nip_nwk_output_down.
+ * NOTE(review): .error is -ENETUNREACH here too - confirm that is intended.
+ */
+static const struct nip_rt_info nip_broadcast_entry_template = {
+	.dst = {
+		.__refcnt	= ATOMIC_INIT(1),
+		.__use		= 1,
+		.obsolete	= DST_OBSOLETE_FORCE_CHK,
+		.error		= -ENETUNREACH,
+		.input		= nip_nwk_input_up,
+		.output		= nip_nwk_output_down,
+	},
+	.rt_ref = ATOMIC_INIT(1),
+};
+/* Next hop for @daddr over @rt: the route's gateway when RTF_GATEWAY is set,
+ * otherwise @daddr itself.
+ */
+struct nip_addr *nip_nexthop(struct nip_rt_info *rt, struct nip_addr *daddr)
+{
+	if (rt->rt_flags & RTF_GATEWAY)
+		return &rt->gateway;
+	else
+		return daddr;
+}
+/* Zero @cfg and fill it from the user-supplied @rtmsg; the table is always
+ * NIP_RT_TABLE_MAIN and the namespace is @net.
+ */
+static void rtmsg_to_fibni_config(struct net *net, struct nip_rtmsg *rtmsg,
+				  struct nip_fib_config *cfg)
+{
+	memset(cfg, 0, sizeof(*cfg));
+
+	cfg->fc_table = NIP_RT_TABLE_MAIN;
+	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
+	cfg->fc_metric = rtmsg->rtmsg_metric;
+	cfg->fc_expires = rtmsg->rtmsg_info;
+
+	cfg->fc_flags = rtmsg->rtmsg_flags;
+
+	cfg->fc_nlinfo.nl_net = net;
+
+	cfg->fc_dst = rtmsg->rtmsg_dst;
+	cfg->fc_src = rtmsg->rtmsg_src;
+	cfg->fc_gateway = rtmsg->rtmsg_gateway;
+}
+/* Zero every nip_rt_info field that follows the embedded dst_entry. */
+static void nip_rt_info_init(struct nip_rt_info *rt)
+{
+	struct dst_entry *dst = &rt->dst;
+
+	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
+	rt->from = NULL;
+}
+/* dst_alloc() a nip_rt_info from the namespace's dst_ops with an initial
+ * reference of 1 and zero its NewIP-specific part.  NULL on failure.
+ */
+static struct nip_rt_info *__nip_dst_alloc(struct net *net,
+					   struct net_device *dev, int flags)
+{
+	struct nip_rt_info *rt =
+	    dst_alloc(&net->newip.nip_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
+		      flags);
+
+	if (rt)
+		nip_rt_info_init(rt);
+
+	return rt;
+}
+/* Allocate a route entry plus its per-cpu cache array (GFP_ATOMIC), with
+ * every per-cpu slot initialised to NULL.  When the per-cpu allocation fails
+ * the entry is destroyed and NULL is returned.
+ */
+struct nip_rt_info *nip_dst_alloc(struct net *net, struct net_device *dev,
+				  int flags)
+{
+	struct nip_rt_info *rt = __nip_dst_alloc(net, dev, flags);
+
+	if (rt) {
+		rt->rt_pcpu =
+		    alloc_percpu_gfp(struct nip_rt_info *, GFP_ATOMIC);
+		if (rt->rt_pcpu) {
+			int cpu;
+
+			for_each_possible_cpu(cpu) {
+				struct nip_rt_info **p;
+
+				p = per_cpu_ptr(rt->rt_pcpu, cpu);
+				/* no one shares rt */
+				*p = NULL;
+			}
+		} else {
+			dst_destroy((struct dst_entry *)rt);
+			return NULL;
+		}
+	}
+
+	return rt;
+}
+EXPORT_SYMBOL(nip_dst_alloc);
+/* Re-point @rt's metrics at those of rt->from when the two differ. */
+static void nip_rt_dst_from_metrics_check(struct nip_rt_info *rt)
+{
+	if (rt->from &&
+	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->from))
+		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->from), true);
+}
+/* Return this CPU's cached copy of @rt with a reference taken, or NULL when
+ * the slot is empty.
+ */
+static struct nip_rt_info *nip_rt_get_pcpu_route(struct nip_rt_info *rt)
+{
+	struct nip_rt_info *pcpu_rt, **p;
+
+	p = this_cpu_ptr(rt->rt_pcpu);
+	pcpu_rt = *p;
+
+	if (pcpu_rt) {
+		dst_hold(&pcpu_rt->dst);
+		nip_rt_dst_from_metrics_check(pcpu_rt);
+	}
+	return pcpu_rt;
+}
+/* Link @rt to its origin @from: clears RTF_EXPIRES, takes a reference on
+ * @from and shares its metrics (read-only).
+ */
+static void nip_rt_set_from(struct nip_rt_info *rt, struct nip_rt_info *from)
+{
+	WARN_ON(from->from);
+
+	rt->rt_flags &= ~RTF_EXPIRES;
+	dst_hold(&from->dst);
+	rt->from = &from->dst;
+	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
+}
+/* Initialise the copy @rt from the original route @ort. */
+static void nip_rt_copy_init(struct nip_rt_info *rt, struct nip_rt_info *ort)
+{
+	rt->dst.input = ort->dst.input;
+	rt->dst.output = ort->dst.output;
+	rt->rt_dst = ort->rt_dst;
+	rt->dst.error =
ort->dst.error;
+	rt->rt_idev = ort->rt_idev;
+	if (rt->rt_idev)
+		nin_dev_hold(rt->rt_idev);
+
+	rt->dst.lastuse = jiffies;
+	rt->gateway = ort->gateway;
+	rt->rt_flags = ort->rt_flags;
+	nip_rt_set_from(rt, ort);
+	rt->rt_metric = ort->rt_metric;
+	rt->rt_table = ort->rt_table;
+	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
+}
+/* Allocate a copy of @rt (no per-cpu array of its own) flagged RTF_PCPU.
+ * Returns NULL when the allocation fails.
+ */
+static struct nip_rt_info *nip_rt_pcpu_alloc(struct nip_rt_info *rt)
+{
+	struct nip_rt_info *pcpu_rt;
+
+	pcpu_rt = __nip_dst_alloc(dev_net(rt->dst.dev),
+				  rt->dst.dev, rt->dst.flags);
+	if (!pcpu_rt)
+		return NULL;
+	nip_rt_copy_init(pcpu_rt, rt);
+	pcpu_rt->rt_protocol = rt->rt_protocol;
+	pcpu_rt->rt_flags |= RTF_PCPU;
+	return pcpu_rt;
+}
+/* Create this CPU's cached copy of @rt and publish it with cmpxchg().
+ * Returns, with a reference held, the installed copy, the copy another
+ * context installed first, @rt itself when it has no per-cpu array, or the
+ * namespace's null entry when the allocation fails.
+ */
+static struct nip_rt_info *nip_rt_make_pcpu_route(struct nip_rt_info *rt)
+{
+	struct nip_rt_info *pcpu_rt, *prev, **p;
+
+	pcpu_rt = nip_rt_pcpu_alloc(rt);
+	if (!pcpu_rt) {
+		struct net *net = dev_net(rt->dst.dev);
+
+		dst_hold(&net->newip.nip_null_entry->dst);
+		return net->newip.nip_null_entry;
+	}
+
+	rcu_read_lock_bh();
+	if (rt->rt_pcpu) {
+		p = this_cpu_ptr(rt->rt_pcpu);
+		prev = cmpxchg(p, NULL, pcpu_rt);
+		if (prev) {
+			/* If someone did it before us, return prev instead */
+			dst_destroy(&pcpu_rt->dst);
+			pcpu_rt = prev;
+		}
+	} else {
+		dst_destroy(&pcpu_rt->dst);
+		pcpu_rt = rt;
+	}
+	dst_hold(&pcpu_rt->dst);
+	nip_rt_dst_from_metrics_check(pcpu_rt);
+	rcu_read_unlock_bh();
+	return pcpu_rt;
+}
+/* Lookup callback for the input path: keys nip_pol_route() on the flow's
+ * incoming interface.
+ */
+static struct nip_rt_info *nip_pol_route_input(struct net *net,
+					       struct nip_fib_table *table,
+					       struct flow_nip *fln, int flags)
+{
+	return nip_pol_route(net, table, fln->flowin_iif, fln, flags);
+}
+/* Route lookup for a received flow; @dev is not used here. */
+struct dst_entry *nip_route_input_lookup(struct net *net,
+					 struct net_device *dev,
+					 struct flow_nip *fln, int flags)
+{
+	return nip_fib_rule_lookup(net, fln, flags, nip_pol_route_input);
+}
+EXPORT_SYMBOL_GPL(nip_route_input_lookup);
+/* Attach a route to a received @skb.  Packets addressed to
+ * nip_broadcast_addr_arp get the namespace's broadcast entry (reference
+ * taken); everything else gets the result of a routing lookup built from the
+ * addresses stored in NIPCB(skb).
+ */
+void nip_route_input(struct sk_buff *skb)
+{
+	struct net *net = dev_net(skb->dev);
+	int flags = 0;
+	struct flow_nip fln = {
+		.flowin_iif = skb->dev->ifindex,
+		.daddr = NIPCB(skb)->dstaddr,
+		.saddr = NIPCB(skb)->srcaddr,
+	};
+
+	if (nip_addr_eq(&fln.daddr, &nip_broadcast_addr_arp)) {
+		DEBUG("%s: recv broadcast packet!\n", __func__);
+		dst_hold(&net->newip.nip_broadcast_entry->dst);
+		skb_dst_set(skb,
+			    (struct dst_entry *)net->newip.nip_broadcast_entry);
+		return;
+	}
+
+	skb_dst_set(skb, nip_route_input_lookup(net, skb->dev, &fln, flags));
+}
+/* Lookup callback for the output path: keys nip_pol_route() on the flow's
+ * outgoing interface.
+ */
+static struct nip_rt_info *nip_pol_route_output(struct net *net,
+						struct nip_fib_table *table,
+						struct flow_nip *fln, int flags)
+{
+	return nip_pol_route(net, table, fln->flowin_oif, fln, flags);
+}
+/* Route lookup for a locally generated flow; @sk is not used here. */
+struct dst_entry *nip_route_output_flags(struct net *net, const struct sock *sk,
+					 struct flow_nip *fln, int flags)
+{
+	return nip_fib_rule_lookup(net, fln, flags, nip_pol_route_output);
+}
+/* Find the route for fln->daddr in @table and return this CPU's cached copy
+ * of it with a reference held, creating the copy on first use.  When the
+ * lookup finds nothing, the namespace's null entry is returned (held and
+ * marked used).  @oif is not consulted by this function.
+ */
+struct nip_rt_info *nip_pol_route(struct net *net, struct nip_fib_table *table,
+				  int oif, struct flow_nip *fln, int flags)
+{
+	struct nip_fib_node *fn;
+	struct nip_rt_info *rt, *pcpu_rt;
+
+	rcu_read_lock_bh();
+	fn = nip_fib_locate(table->nip_tb_head, &fln->daddr);
+	if (!fn) {
+		rcu_read_unlock_bh();
+		DEBUG("%s: search fail!\n", __func__);
+		rt = net->newip.nip_null_entry;
+		dst_hold_and_use(&rt->dst, jiffies);
+		return rt;
+	}
+	rt = fn->nip_route_info;
+
+	/* Get a percpu copy */
+	rt->dst.lastuse = jiffies;
+	rt->dst.__use++;
+	pcpu_rt = nip_rt_get_pcpu_route(rt);
+
+	DEBUG("%s: cpu id = %d\n", __func__, smp_processor_id());
+
+	if (pcpu_rt) {
+		rcu_read_unlock_bh();
+		DEBUG("%s: pcpu found!\n", __func__);
+	} else {
+		dst_hold(&rt->dst); /* keep rt alive outside the RCU section */
+		rcu_read_unlock_bh();
+		pcpu_rt = nip_rt_make_pcpu_route(rt);
+		dst_release(&rt->dst);
+	}
+
+	DEBUG("%s: rt dst.__refcnt = %d ; pcpu dst.__refcnt = %d\n", __func__,
+	      atomic_read(&rt->dst.__refcnt),
+	      atomic_read(&pcpu_rt->dst.__refcnt));
+	return pcpu_rt;
+}
+
+/* Check whether @addr may be bound: true means acceptable, false means not.
+ * The any-address is always acceptable; any other address must pass
+ * nip_addr_check() and be present in the namespace's local FIB table.
+ */
+bool nip_bind_addr_check(struct net *net,
+			 struct nip_addr *addr)
+{
+	struct nip_fib_node *fn;
+	struct nip_fib_table
*fib_tbl = net->newip.nip_fib_local_tbl;
+
+	if (nip_addr_check(addr)) {
+		DEBUG("%s: binding-addr invalid.", __func__);
+		return false;
+	}
+
+	if (nip_addr_eq(addr, &nip_any_addr)) {
+		DEBUG("%s: binding-addr is any addr.", __func__);
+		return true;
+	}
+
+	rcu_read_lock_bh();
+	fn = nip_fib_locate(fib_tbl->nip_tb_head, addr);
+	rcu_read_unlock_bh();
+	if (!fn) {
+		DEBUG("%s: binding-addr is not local addr.", __func__);
+		return false;
+	}
+
+	DEBUG("%s: binding-addr is local addr.", __func__);
+	return true;
+}
+/* Build a route entry from @cfg.
+ *
+ * Takes a reference on the device named by cfg->fc_ifindex and on its
+ * ninet_dev; both are handed to the new entry on success and dropped on
+ * failure.  May rewrite cfg->fc_metric and cfg->fc_protocol with defaults.
+ * Returns the entry, or ERR_PTR(-ENODEV / -ENOBUFS / -ENOMEM).
+ */
+static struct nip_rt_info *nip_route_info_create(struct nip_fib_config *cfg)
+{
+	struct net *net = cfg->fc_nlinfo.nl_net;
+	struct nip_rt_info *rt = NULL;
+	struct net_device *dev = NULL;
+	struct ninet_dev *idev = NULL;
+	struct nip_fib_table *table;
+	int err = -ENODEV;
+
+	/* find net_device */
+	dev = dev_get_by_index(net, cfg->fc_ifindex);
+	if (!dev)
+		goto out;
+	/* find ninet_dev,which has the newip address list */
+	idev = nin_dev_get(dev);
+	if (!idev)
+		goto out;
+
+	/* A supplied fc_metric of 0 is replaced by a default.  IPv6 uses
+	 * IP6_RT_PRIO_USER (1024) here and the NewIP counterpart is
+	 * NIP_RT_PRIO_USER; it is unclear whether anything compares bits of
+	 * the IPv6 value, so the value was picked cautiously.
+	 */
+	if (cfg->fc_metric == 0)
+		cfg->fc_metric = NIP_RT_PRIO_USER;
+
+	err = -ENOBUFS;
+
+	table = nip_fib_get_table(net, cfg->fc_table);
+	if (!table)
+		goto out;
+
+	rt = nip_dst_alloc(net, NULL,
+			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
+	if (!rt) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	nip_rt_clean_expires(rt);
+
+	if (cfg->fc_protocol == RTPROT_UNSPEC)
+		cfg->fc_protocol = RTPROT_BOOT;
+	rt->rt_protocol = cfg->fc_protocol;
+
+	if (cfg->fc_flags & RTF_LOCAL) {
+		rt->dst.input = nip_nwk_input_up;
+		DEBUG("rt->dst.input = nip_nwk_input_up\n");
+	} else {
+		rt->dst.input = nip_nwk_forward;
+		DEBUG("rt->dst.input = nip_nwk_forward\n");
+	}
+
+	rt->dst.output = nip_nwk_output_down;
+
+	rt->rt_dst = cfg->fc_dst;
+	rt->rt_src = cfg->fc_src;
+	rt->rt_metric = cfg->fc_metric;
+
+	if (cfg->fc_flags & RTF_GATEWAY)
+		rt->gateway = cfg->fc_gateway;
+	else
+		rt->gateway = nip_any_addr;
+
+	rt->rt_flags = cfg->fc_flags;
+
+	rt->dst.dev = dev; /* the dev_get_by_index() reference moves here */
+	rt->rt_idev = idev; /* the nin_dev_get() reference moves here */
+	rt->rt_table = table;
+
+	return rt;
+out:
+	if (dev)
+		dev_put(dev);
+	if (idev)
+		nin_dev_put(idev);
+	DEBUG("%s failed!\n", __func__);
+	return ERR_PTR(err);
+}
+
+/* __nip_ins_rt is called with FREE table->nip_tb_lock.
+ * It takes a new route entry; if the addition fails for any reason the
+ * route is released.
+ * NOTE(review): nothing in this function releases rt on error - confirm that
+ * nip_fib_add() does so itself.
+ */
+static int __nip_ins_rt(struct nip_rt_info *rt)
+{
+	int err;
+	struct nip_fib_table *table;
+
+	table = rt->rt_table;
+
+	spin_lock_bh(&table->nip_tb_lock);
+	err = nip_fib_add(table->nip_tb_head, rt);
+	spin_unlock_bh(&table->nip_tb_lock);
+
+	return err;
+}
+/* Insert @rt into its table, taking the reference the table will own. */
+int nip_ins_rt(struct nip_rt_info *rt)
+{
+	/* Hold dst to account for the reference from the nip fib hash */
+	dst_hold(&rt->dst);
+	return __nip_ins_rt(rt);
+}
+/* Create a route from @cfg and insert it.  Returns 0 or a negative errno. */
+int nip_route_add(struct nip_fib_config *cfg)
+{
+	struct nip_rt_info *rt;
+	int err;
+
+	rt = nip_route_info_create(cfg);
+	if (IS_ERR(rt)) {
+		err = PTR_ERR(rt);
+		rt = NULL;
+		goto out;
+	}
+
+	err = __nip_ins_rt(rt);
+out:
+	return err;
+}
+/* Remove @rt from its table under the table lock and drop the caller's
+ * reference.  The namespace's null entry is never removed (-ENOENT), but the
+ * reference is dropped all the same.
+ */
+static int __nip_del_rt(struct nip_rt_info *rt, struct nl_info *info)
+{
+	int err;
+	struct nip_fib_table *table;
+	struct net *net = dev_net(rt->dst.dev);
+
+	if (rt == net->newip.nip_null_entry) {
+		err = -ENOENT;
+		goto out;
+	}
+
+	table = rt->rt_table;
+	spin_lock_bh(&table->nip_tb_lock);
+	err = nip_fib_del(rt, info);
+	spin_unlock_bh(&table->nip_tb_lock);
+
+out:
+	nip_rt_put(rt);
+	return err;
+}
+/* Delete @rt; consumes one reference on it. */
+int nip_del_rt(struct nip_rt_info *rt)
+{
+	struct nl_info info = {
+		.nl_net = dev_net(rt->dst.dev),
+	};
+	return __nip_del_rt(rt, &info);
+}
+/* Delete the route whose destination is cfg->fc_dst from table
+ * cfg->fc_table.  Returns -ESRCH when the table or the route is missing.
+ */
+static int nip_route_del(struct nip_fib_config *cfg)
+{
+	struct net *net = cfg->fc_nlinfo.nl_net;
+	struct nip_fib_table *table;
+	struct nip_fib_node *fn;
+	struct nip_rt_info *rt;
+	int err = -ESRCH;
+
+	table = nip_fib_get_table(net, cfg->fc_table);
+	if (!table)
+		return err;
+
+	rcu_read_lock_bh();
+	fn = nip_fib_locate(table->nip_tb_head, &cfg->fc_dst);
+	if (fn) {
+		rt = fn->nip_route_info;
+		dst_hold(&rt->dst); /* dropped again by __nip_del_rt() */
+		rcu_read_unlock_bh();
+
+		return __nip_del_rt(rt, &cfg->fc_nlinfo);
+	}
+	rcu_read_unlock_bh();
+
+	return err;
+}
+/* SIOCADDRT / SIOCDELRT handler: needs CAP_NET_ADMIN in the namespace's user
+ * namespace, copies a struct nip_rtmsg from @arg and adds or deletes the
+ * route under the RTNL lock.  Any other @cmd yields -EINVAL.
+ */
+int nip_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+{
+	struct nip_fib_config cfg;
+	struct nip_rtmsg rtmsg;
+	int err;
+
+	switch (cmd) {
+	case SIOCADDRT: /* Add a route */
+	case SIOCDELRT:
/* Delete a route */
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+			return -EPERM;
+		err = copy_from_user(&rtmsg, arg, sizeof(struct nip_rtmsg));
+		if (err)
+			return -EFAULT;
+
+		rtmsg_to_fibni_config(net, &rtmsg, &cfg);
+
+		rtnl_lock();
+		switch (cmd) {
+		case SIOCADDRT:
+			err = nip_route_add(&cfg);
+			break;
+		case SIOCDELRT:
+			err = nip_route_del(&cfg);
+			break;
+		default:
+			err = -EINVAL;
+		}
+		rtnl_unlock();
+
+		return err;
+	default:
+		break;
+	}
+
+	return -EINVAL;
+}
+
+/* dst_ops.destroy hook: frees the metrics and the per-cpu cache array, then
+ * drops the references the entry holds on its ninet_dev and on the route it
+ * was copied from.
+ */
+static void nip_dst_destroy(struct dst_entry *dst)
+{
+	struct nip_rt_info *rt = (struct nip_rt_info *)dst;
+	struct dst_entry *from = rt->from;
+	struct ninet_dev *idev;
+
+	dst_destroy_metrics_generic(dst);
+	free_percpu(rt->rt_pcpu);
+
+	idev = rt->rt_idev;
+	if (idev) {
+		rt->rt_idev = NULL;
+		DEBUG("%s: idev->refcnt=%u\n", __func__,
+		      refcount_read(&idev->refcnt));
+		nin_dev_put(idev);
+	}
+
+	if (from) {
+		DEBUG("%s: from->__refcnt = %d\n", __func__,
+		      atomic_read(&from->__refcnt));
+	}
+	rt->from = NULL;
+	dst_release(from);
+}
+
+/* Address to resolve for a packet using @rt: the gateway for gateway routes,
+ * otherwise the destination stored in @skb's control block, otherwise
+ * @daddr.
+ */
+static inline const void *nip_choose_neigh_daddr(struct nip_rt_info *rt,
+						 struct sk_buff *skb,
+						 const void *daddr)
+{
+	struct nip_addr *p = &rt->gateway;
+
+	if (rt->rt_flags & RTF_GATEWAY)
+		return (const void *)p;
+	else if (skb)
+		return &NIPCB(skb)->dstaddr;
+	return daddr;
+}
+
+/* dst_ops.neigh_lookup hook: return the existing neighbour entry for the
+ * chosen address on dst->dev, or the result of neigh_create() for it.
+ */
+static struct neighbour *nip_neigh_lookup(const struct dst_entry *dst,
+					  struct sk_buff *skb,
+					  const void *daddr)
+{
+	struct nip_rt_info *rt = (struct nip_rt_info *)dst;
+	struct neighbour *n;
+
+	daddr = nip_choose_neigh_daddr(rt, skb, daddr);
+	n = __nip_neigh_lookup(dst->dev, daddr);
+	if (n)
+		return n;
+	return neigh_create(&nnd_tbl, daddr, dst->dev);
+}
+
+/* dst_ops.check hook: every entry is reported as still valid. */
+static struct dst_entry *nip_dst_check(struct dst_entry *dst, u32 cookie)
+{
+	return dst;
+}
+
+/* Compute the MSS advertised by TCP.
+ * NewIP TCP has no MSS rules of its own yet, so the value is derived from
+ * the interface MTU alone: MTU minus the largest NewIP header and the TCP
+ * header.
+ *
+ * The subtraction is unsigned, so it saturates at 0 instead of wrapping to
+ * a huge value when the MTU does not exceed that overhead (the minimum MTU
+ * only reserves room for a UDP header).
+ */
+static unsigned int nip_default_advmss(const struct dst_entry *dst)
+{
+	unsigned int mtu = dst_mtu(dst);
+	unsigned int overhead = NIP_HDR_MAX + sizeof(struct tcphdr);
+
+	if (mtu <= overhead)
+		return 0;
+
+	return mtu - overhead;
+}
+
+/* dst_ops.mtu hook: the MTU configured on the device's ninet_dev, or
+ * NIP_MIN_MTU when the device has none.
+ */
+static unsigned int nip_mtu(const struct dst_entry *dst)
+{
+	unsigned int mtu;
+	struct ninet_dev *idev;
+
+	mtu = NIP_MIN_MTU;
+
+	rcu_read_lock();
+	idev = __nin_dev_get(dst->dev);
+	if (idev)
+		mtu = idev->cnf.mtu;
+	rcu_read_unlock();
+
+	return mtu;
+}
+
+/* Copied into every namespace by nip_route_net_init(). */
+static struct dst_ops nip_dst_ops_template = {
+	.family = AF_NINET,
+	.destroy = nip_dst_destroy,
+	.neigh_lookup = nip_neigh_lookup,
+	.check = nip_dst_check,
+	.default_advmss = nip_default_advmss,
+	.mtu = nip_mtu,
+};
+
+/* Input hook of the null entry: drop the packet. */
+static int nip_pkt_discard(struct sk_buff *skb)
+{
+	kfree_skb(skb);
+	return 0;
+}
+
+/* Output hook of the null entry: drop the packet. */
+static int nip_pkt_discard_out(struct net *net, struct sock *sk,
+			       struct sk_buff *skb)
+{
+	kfree_skb(skb);
+	return 0;
+}
+
+/* Allocate the local host route for @addr on @idev's device: DST_HOST,
+ * RTF_UP | RTF_NONEXTHOP | RTF_LOCAL, protocol RTPROT_KERNEL, bound to the
+ * local table.  Takes a reference on @idev.  Returns the entry or
+ * ERR_PTR(-ENOMEM).
+ */
+struct nip_rt_info *nip_addrconf_dst_alloc(struct ninet_dev *idev,
+					   const struct nip_addr *addr)
+{
+	u32 tb_id;
+	struct net *net = dev_net(idev->dev);
+	struct net_device *dev = idev->dev;
+	struct nip_rt_info *rt;
+
+	rt = nip_dst_alloc(net, dev, DST_NOCOUNT);
+	if (!rt)
+		return ERR_PTR(-ENOMEM);
+
+	nin_dev_hold(idev);
+
+	rt->dst.flags |= DST_HOST;
+	rt->dst.input = nip_nwk_input_up;
+	rt->dst.output = nip_nwk_output_down;
+	rt->rt_idev = idev;
+
+	rt->rt_protocol = RTPROT_KERNEL;
+	rt->rt_flags = RTF_UP | RTF_NONEXTHOP;
+	rt->rt_flags |= RTF_LOCAL;
+
+	rt->gateway = *addr;
+	rt->rt_dst = *addr;
+	tb_id = NIP_RT_TABLE_LOCAL;
+	rt->rt_table = nip_fib_get_table(net, tb_id);
+
+	return rt;
+}
+
+/* Argument bundle handed to nip_fib_ifdown() through nip_fib_clean_all(). */
+struct arg_dev_net {
+	struct net_device *dev;
+	struct net *net;
+};
+
+/* Decide whether a route should be deleted together with an interface going
+ * down.  Returns -1 to delete @rt, 0 to keep it.
+ * called with nip_tb_lock held for table with rt
+ */
+static int nip_fib_ifdown(struct nip_rt_info *rt, void *arg)
+{
+	const struct arg_dev_net *adn = arg;
+	const struct net_device *dev = adn->dev;
+
+	if ((rt->dst.dev == dev || !dev) &&
+	    rt != adn->net->newip.nip_null_entry &&
+	    rt != adn->net->newip.nip_broadcast_entry &&
+	    ((dev && netdev_unregistering(dev)) ||
+	     !rt->rt_idev->cnf.ignore_routes_with_linkdown))
+		return -1;
+
+	return 0;
+}
+
+void nip_rt_ifdown(struct net *net, struct net_device *dev)
+{
+	struct arg_dev_net adn = {
+		.dev = dev,
+		.net = net,
+	};
+
+	nip_fib_clean_all(net, nip_fib_ifdown, &adn); /* drop routes on dev */
+}
+/* Per-namespace setup: copy the dst_ops template, initialise its entry
+ * counter and duplicate the null and broadcast entry templates.  Returns 0
+ * or -ENOMEM, undoing the earlier steps on failure.
+ */
+static int __net_init nip_route_net_init(struct net *net)
+{
+	int ret = -ENOMEM;
+
+	memcpy(&net->newip.nip_dst_ops, &nip_dst_ops_template,
+	       sizeof(net->newip.nip_dst_ops));
+
+	if (dst_entries_init(&net->newip.nip_dst_ops) < 0)
+		goto out;
+
+	net->newip.nip_null_entry = kmemdup(&nip_null_entry_template,
+					    sizeof(*net->newip.nip_null_entry),
+					    GFP_KERNEL);
+	if (!net->newip.nip_null_entry)
+		goto out_nip_dst_entries;
+	net->newip.nip_null_entry->dst.ops = &net->newip.nip_dst_ops;
+
+	net->newip.nip_broadcast_entry =
+		kmemdup(&nip_broadcast_entry_template,
+			sizeof(*net->newip.nip_broadcast_entry),
+			GFP_KERNEL);
+	if (!net->newip.nip_broadcast_entry)
+		goto out_nip_null_entry;
+	net->newip.nip_broadcast_entry->dst.ops = &net->newip.nip_dst_ops;
+
+	ret = 0;
+out:
+	return ret;
+
+out_nip_null_entry:
+	kfree(net->newip.nip_null_entry);
+out_nip_dst_entries:
+	dst_entries_destroy(&net->newip.nip_dst_ops);
+	goto out;
+}
+/* Per-namespace teardown: frees what nip_route_net_init() allocated. */
+static void __net_exit nip_route_net_exit(struct net *net)
+{
+	kfree(net->newip.nip_broadcast_entry);
+	kfree(net->newip.nip_null_entry);
+	dst_entries_destroy(&net->newip.nip_dst_ops);
+}
+
+static struct pernet_operations nip_route_net_ops = {
+	.init = nip_route_net_init,
+	.exit = nip_route_net_exit,
+};
+/* Netdevice notifier, acting on loopback devices only: on NETDEV_REGISTER
+ * the null and broadcast entries are bound to the device and each takes a
+ * ninet_dev reference; on NETDEV_UNREGISTER those references are dropped.
+ */
+static int nip_route_dev_notify(struct notifier_block *this,
+				unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct net *net = dev_net(dev);
+
+	if (!(dev->flags & IFF_LOOPBACK))
+		return NOTIFY_OK;
+
+	if (event == NETDEV_REGISTER) {
+		net->newip.nip_null_entry->dst.dev = dev;
+		net->newip.nip_null_entry->rt_idev = nin_dev_get(dev);
+
+		net->newip.nip_broadcast_entry->dst.dev = dev;
+		net->newip.nip_broadcast_entry->rt_idev = nin_dev_get(dev);
+	} else if (event == NETDEV_UNREGISTER &&
+		   dev->reg_state != NETREG_UNREGISTERED) {
+		nin_dev_put_clear(&net->newip.nip_null_entry->rt_idev);
+		nin_dev_put_clear(&net->newip.nip_broadcast_entry->rt_idev);
+	}
+
+	return NOTIFY_OK;
+}
+/* Print the first bitlen / NIP_ADDR_BIT_LEN_8 bytes of @addr as two-digit
+ * hex values, followed by a tab.
+ */
+static void seq_printf_nipaddr_to_proc(struct seq_file *seq,
+				       struct nip_addr *addr)
+{
+	int i = 0;
+
+	for (i = 0; i < addr->bitlen / NIP_ADDR_BIT_LEN_8; i++)
+		seq_printf(seq, "%02x", addr->nip_addr_field8[i]);
+
+	seq_puts(seq, "\t");
+}
+/* Emit one line per route in @table: destination, gateway, flags and the
+ * device name (empty when the route has no device).
+ */
+static void nip_route_show_table(struct seq_file *seq,
+				 struct nip_fib_table *table)
+{
+	struct nip_fib_node *fn;
+	int i;
+
+	rcu_read_lock_bh();
+	for (i = 0; i < NIN_ROUTE_HSIZE; i++) {
+		hlist_for_each_entry_rcu(fn, &table->nip_tb_head[i],
+					 fib_hlist) {
+			struct nip_rt_info *rt = fn->nip_route_info;
+
+			seq_printf_nipaddr_to_proc(seq, &rt->rt_dst);
+			seq_printf_nipaddr_to_proc(seq, &rt->gateway);
+			seq_printf(seq, "%4u %4s\n", rt->rt_flags,
+				   rt->dst.dev ? rt->dst.dev->name : "");
+		}
+	}
+	rcu_read_unlock_bh();
+}
+/* seq_file show callback: dumps the main table, then the local table. */
+static int nip_route_proc_show(struct seq_file *seq, void *v)
+{
+	struct net *net = seq->private;
+
+	nip_route_show_table(seq, net->newip.nip_fib_main_tbl);
+	nip_route_show_table(seq, net->newip.nip_fib_local_tbl);
+
+	return 0;
+}
+/* Create the read-only "nip_route" proc entry for the namespace.  The result
+ * of proc_create_net_single() is not checked.
+ */
+static int __net_init nip_route_net_init_late(struct net *net)
+{
+	proc_create_net_single("nip_route", 0444, net->proc_net,
+			       nip_route_proc_show, NULL);
+	return 0;
+}
+/* Remove the "nip_route" proc entry of the namespace. */
+static void __net_exit nip_route_net_exit_late(struct net *net)
+{
+	remove_proc_entry("nip_route", net->proc_net);
+}
+
+static struct pernet_operations nip_route_net_late_ops = {
+	.init = nip_route_net_init_late,
+	.exit = nip_route_net_exit_late,
+};
+
+static struct notifier_block nip_route_dev_notifier = {
+	.notifier_call = nip_route_dev_notify,
+	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
+};
+/* Module-level routing setup: dst slab cache, per-net state, the FIB, the
+ * proc entry and the netdevice notifier, unwound in reverse on failure.
+ */
+int __init nip_route_init(void)
+{
+	int ret;
+
+	ret = -ENOMEM;
+
+	nip_dst_ops_template.kmem_cachep =
+	    kmem_cache_create("nip_dst_cache", sizeof(struct nip_rt_info), 0,
+			      SLAB_HWCACHE_ALIGN, NULL);
+	if
(!nip_dst_ops_template.kmem_cachep) + goto out; + + ret = register_pernet_subsys(&nip_route_net_ops); + if (ret) + goto out_kmem_cache; + + ret = nip_fib_init(); + if (ret) + goto out_register_subsys; + + ret = register_pernet_subsys(&nip_route_net_late_ops); + if (ret) + goto out_nip_fib_init; + + ret = register_netdevice_notifier(&nip_route_dev_notifier); + if (ret) + goto out_register_late_subsys; + +out: + return ret; + +out_register_late_subsys: + unregister_pernet_subsys(&nip_route_net_late_ops); +out_nip_fib_init: + nip_fib_gc_cleanup(); +out_register_subsys: + unregister_pernet_subsys(&nip_route_net_ops); +out_kmem_cache: + kmem_cache_destroy(nip_dst_ops_template.kmem_cachep); + goto out; +} + +void nip_route_cleanup(void) +{ + unregister_pernet_subsys(&nip_route_net_late_ops); + nip_fib_gc_cleanup(); + unregister_pernet_subsys(&nip_route_net_ops); + kmem_cache_destroy(nip_dst_ops_template.kmem_cachep); +} + diff --git a/net/newip/tcp_nip.c b/net/newip/tcp_nip.c new file mode 100755 index 0000000000000000000000000000000000000000..eb06994664ac09ef37bfe9b1eb0500c48e6898f0 --- /dev/null +++ b/net/newip/tcp_nip.c @@ -0,0 +1,1275 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +static const struct inet_connection_sock_af_ops newip_specific; + +static void tcp_nip_push(struct sock *sk, int flags, int mss_now, + int nonagle, int size_goal) +{ + __tcp_nip_push_pending_frames(sk, mss_now, nonagle); +} + +static const unsigned char new_state[16] = { + /* current state: new state: action: */ +[0 /* (Invalid) */] = TCP_CLOSE, 
+[TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, +[TCP_SYN_SENT] = TCP_CLOSE, +[TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, +[TCP_FIN_WAIT1] = TCP_FIN_WAIT1, +[TCP_FIN_WAIT2] = TCP_FIN_WAIT2, +[TCP_TIME_WAIT] = TCP_CLOSE, +[TCP_CLOSE] = TCP_CLOSE, +[TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN, +[TCP_LAST_ACK] = TCP_LAST_ACK, +[TCP_LISTEN] = TCP_CLOSE, +[TCP_CLOSING] = TCP_CLOSING, +[TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */ +}; + +static int tcp_nip_close_state(struct sock *sk) +{ + int next = (int)new_state[sk->sk_state]; + int ns = next & TCP_STATE_MASK; + + tcp_set_state(sk, ns); + + return next & TCP_ACTION_FIN; +} + +void sk_nip_stream_kill_queues(struct sock *sk) +{ + /* First the read buffer. */ + __skb_queue_purge(&sk->sk_receive_queue); + + /* Next, the error queue. */ + __skb_queue_purge(&sk->sk_error_queue); + + /* Next, the write queue. */ + WARN_ON(!skb_queue_empty(&sk->sk_write_queue)); + + WARN_ON(sk->sk_wmem_queued); +} + +void tcp_nip_shutdown(struct sock *sk, int how) +{ + if (!(how & SEND_SHUTDOWN)) + return; + + /* If we've already sent a FIN, or it's a closed state, skip this. */ + if ((1 << sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_SYN_SENT | + TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) { + /* Clear out any half completed packets. FIN if needed. 
*/ + if (tcp_nip_close_state(sk)) + tcp_nip_send_fin(sk); + } +} +EXPORT_SYMBOL(tcp_nip_shutdown); + +void tcp_nip_close(struct sock *sk, long timeout) +{ + struct sk_buff *skb; + int data_was_unread = 0; + int state; + + lock_sock(sk); + sk->sk_shutdown = SHUTDOWN_MASK; + + DEBUG("%s: sk_state:%d\n", __func__, sk->sk_state); + + if (sk->sk_state == TCP_LISTEN) { + tcp_set_state(sk, TCP_CLOSE); + + inet_csk_listen_stop(sk); + + goto adjudge_to_death; + } + + while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { + u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq; + + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + len--; + data_was_unread += len; + __kfree_skb(skb); + } + + if (sk->sk_state == TCP_CLOSE) + goto adjudge_to_death; + + if (tcp_nip_close_state(sk)) { + /* RED-PEN. Formally speaking, we have broken TCP state + * machine. State transitions: + * + * TCP_ESTABLISHED -> TCP_FIN_WAIT1 + * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible) + * TCP_CLOSE_WAIT -> TCP_LAST_ACK + */ + DEBUG("%s: ready to send fin, sk_state:%d\n", __func__, sk->sk_state); + tcp_nip_send_fin(sk); + } + +adjudge_to_death: + state = sk->sk_state; + sock_hold(sk); + sock_orphan(sk); + + /* It is the last release_sock in its life. It will remove backlog. 
*/ + release_sock(sk); + + local_bh_disable(); + bh_lock_sock(sk); + WARN_ON(sock_owned_by_user(sk)); + + percpu_counter_inc(sk->sk_prot->orphan_count); + + if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) + goto out; + + if (sk->sk_state == TCP_CLOSE) + inet_csk_destroy_sock(sk); + +out: + bh_unlock_sock(sk); + local_bh_enable(); + sock_put(sk); +} + +/* Function: + * 初始化request_sock中的部分参数。 + * Parameter: + * req: 请求连接控制块。 + * sk_listener: 传输控制块。 + * skb: 传输控制块缓冲区。 + */ +static void tcp_nip_init_req(struct request_sock *req, + const struct sock *sk_listener, + struct sk_buff *skb) +{ + struct inet_request_sock *ireq = inet_rsk(req); + + ireq->ir_nip_rmt_addr = NIPCB(skb)->srcaddr; + ireq->ir_nip_loc_addr = NIPCB(skb)->dstaddr; +} + +/* Function: + * 初始化初始序号seq。通过源地址中的一部分源端口, + * 目的地址的一部分以及目的端口计算出服务端初始序 + * 列号。 + * Parameter: + * skb: 传输控制块缓冲区。 + */ +static __u32 tcp_nip_init_sequence(const struct sk_buff *skb) +{ + return secure_tcpv6_seq(NIPCB(skb)->dstaddr.nip_addr_field32, + NIPCB(skb)->srcaddr.nip_addr_field32, + tcp_hdr(skb)->dest, + tcp_hdr(skb)->source); +} + +static struct dst_entry *tcp_nip_route_req(const struct sock *sk, + struct flowi *fl, + const struct request_sock *req) +{ + struct dst_entry *dst; + struct inet_request_sock *ireq = inet_rsk(req); + struct flow_nip fln; + + fln.daddr = ireq->ir_nip_rmt_addr; + dst = nip_route_output(sock_net(sk), sk, &fln); + return dst; +} + +/* Function: + * 客户端传输层用来连接请求的函数 + * 主要用来设置源、目的地址与接口 + * Parameter: + * sk: 传输控制块。 + * uaddr:目的地址结构 + * addr_len:目的地址结构长度 + */ +static int tcp_nip_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + struct sockaddr_nin *usin = (struct sockaddr_nin *)uaddr; + struct inet_sock *inet = inet_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + __be16 orig_sport, orig_dport; + struct nip_addr *saddr = NULL, *daddr; + struct dst_entry *dst; + int err; + struct ip_options_rcu *inet_opt; + struct inet_timewait_death_row 
*tcp_death_row; + struct flow_nip fln; + + fln.daddr = usin->sin_addr; + + /* 检查地址族 */ + if (usin->sin_family != AF_NINET) + return -EAFNOSUPPORT; + /* RCU锁 */ + inet_opt = rcu_dereference_protected(inet->inet_opt, + lockdep_sock_is_held(sk)); + /* 目的地址及端口 */ + daddr = &usin->sin_addr; + orig_dport = usin->sin_port; + + /* 查找路由,并获取源地址 */ + dst = nip_dst_lookup_flow(sock_net(sk), sk, &fln, NULL); + if (IS_ERR(dst)) { + DEBUG("%s cannot find dst\n", __func__); + return -1; + } + + if (!saddr) { + saddr = &fln.saddr; + sk->sk_nip_rcv_saddr = *saddr; + } + + /* 目标地址及端口设置到传输控制块中 */ + inet->inet_dport = usin->sin_port; + sk->sk_nip_daddr = usin->sin_addr; + + inet_csk(sk)->icsk_ext_hdr_len = 0; + if (inet_opt) + inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; + + tcp_set_state(sk, TCP_SYN_SENT); + sk_set_txhash(sk); + sk_dst_set(sk, dst); + + /* 动态绑定本地端口 */ + tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; + err = ninet_hash_connect(tcp_death_row, sk); + + /* 如果传输控制块已经建立过链接则初始化 */ + if (tp->rx_opt.ts_recent_stamp) { + /* Reset inherited state */ + tp->rx_opt.ts_recent = 0; + tp->rx_opt.ts_recent_stamp = 0; + if (likely(!tp->repair)) + tp->write_seq = 0; + } + /* 初始化write_seq */ + if (!tp->write_seq) + tp->write_seq = + secure_tcp_nip_sequence_number(sk->sk_nip_rcv_saddr.nip_addr_field32, + sk->sk_nip_daddr.nip_addr_field32, + inet->inet_sport, + usin->sin_port); + + inet->inet_id = prandom_u32(); + + /* 调用tcp_connect 发送SYN字段 */ + err = __tcp_nip_connect(sk); + + dst = NULL; + if (err) + goto failure; + + return 0; + +failure: + + tcp_set_state(sk, TCP_CLOSE); + sk->sk_route_caps = 0; + inet->inet_dport = 0; + return err; +} + +/* Function: + * 服务端用来发送SYN+ACK段的函数. 
+ * Parameter: + * sk: listening socket. + * dst: route. + * fl: flow (not used here). + * req: connection request block. + * foc: fast open cookie. + * synack_type: type of the SYN+ACK segment. + * syn_skb: received SYN (not used here). + * Returns the result of nip_send_synack(), or -ENOMEM when no skb + * could be built. + */ +static int tcp_nip_send_synack(const struct sock *sk, struct dst_entry *dst, + struct flowi *fl, + struct request_sock *req, + struct tcp_fastopen_cookie *foc, + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb) +{ + struct sk_buff *skb; + int err = -ENOMEM; + + /* Build the SYN+ACK segment */ + skb = tcp_nip_make_synack(sk, dst, req, foc, synack_type); + if (skb) { + DEBUG("%s: TCP server create SYN+ACK skb successfully!", __func__); + rcu_read_lock(); + err = nip_send_synack(req, skb); + rcu_read_unlock(); + } + +done: + return err; +} + +/* Error handler placeholder: intentionally empty. */ +static void tcp_nip_err(struct sk_buff *skb, struct ninet_skb_parm *opt, + __u8 type, __u8 code, int offset, __be32 info) +{ +} + +/* Free the packet options skb attached to the request, if any. */ +static void tcp_nip_reqsk_destructor(struct request_sock *req) +{ + kfree_skb(inet_rsk(req)->nip_pktopts); +} + +/* Generic request_sock operations for NewIP TCP. */ +struct request_sock_ops tcp_nip_request_sock_ops __read_mostly = { + .family = AF_NINET, + .obj_size = sizeof(struct tcp_nip_request_sock), + .rtx_syn_ack = tcp_nip_rtx_synack, + .send_ack = NULL, + .destructor = tcp_nip_reqsk_destructor, + .send_reset = NULL, + .syn_ack_timeout = NULL, +}; + +/* TCP-specific request_sock operations; MD5 and syncookie hooks are unset. */ +static const struct tcp_request_sock_ops tcp_request_sock_newip_ops = { + .mss_clamp = TCP_BASE_MSS, +#ifdef CONFIG_TCP_MD5SIG + .req_md5_lookup = NULL, + .calc_md5_hash = NULL, +#endif + .init_req = tcp_nip_init_req, +#ifdef CONFIG_SYN_COOKIES + .cookie_init_seq = NULL, +#endif + .route_req = tcp_nip_route_req, + .init_seq = tcp_nip_init_sequence, + .send_synack = tcp_nip_send_synack, +}; + +/* Function: + * Cache the route carried by skb as the socket's receive route. + * The route is stored only if a reference could be taken on it. + * Parameter: + * sk: socket that receives the cached route. + * skb: received segment whose dst and input interface are recorded. + */ +void ninet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + + if (dst && dst_hold_safe(dst)) { + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; + } +}
+EXPORT_SYMBOL(ninet_sk_rx_dst_set); + +/* Function: + * Server-side handler for a client's connection request. + * Parameter: + * sk: listening socket. + * skb: received segment. + */ +static int tcp_nip_conn_request(struct sock *sk, struct sk_buff *skb) +{ + return tcp_newip_conn_request(&tcp_nip_request_sock_ops, + &tcp_request_sock_newip_ops, sk, skb); +} + +/* Function: + * Create the child socket for an accepted connection request. + * Parameter: + * sk: listening socket. + * skb: received segment. + * req: connection request block. + * dst: route; looked up here when NULL. + * req_unhash: request block passed to inet_ehash_nolisten(). + * own_req: set from the result of inet_ehash_nolisten(). + */ +static struct sock *tcp_nip_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst, + struct request_sock *req_unhash, + bool *own_req) +{ + struct inet_request_sock *ireq = inet_rsk(req); + bool found_dup_sk = false; + struct tcp_nip_sock *newtcpnipsk; + struct inet_sock *newinet; + struct tcp_sock *newtp; + struct sock *newsk; + struct flow_nip fln; + + if (sk_acceptq_is_full(sk)) + goto out_overflow; + + /* NOTE(review): fln is only partially initialized (daddr) before the lookup */ + fln.daddr = ireq->ir_nip_rmt_addr; + if (!dst) { + dst = nip_route_output(sock_net(sk), sk, &fln); + if (!dst) + goto out; + } + + newsk = tcp_nip_create_openreq_child(sk, req, skb); + if (!newsk) + goto out_nonewsk; + + /* Save the receive route cache */ + ninet_sk_rx_dst_set(newsk, skb); + + newtcpnipsk = (struct tcp_nip_sock *)newsk; + + newtp = tcp_sk(newsk); + newinet = inet_sk(newsk); + + newsk->sk_nip_daddr = ireq->ir_nip_rmt_addr; + newsk->sk_nip_rcv_saddr = ireq->ir_nip_loc_addr; + + newinet->inet_opt = NULL; + + inet_csk(newsk)->icsk_ext_hdr_len = 0; + + newtp->retrans_stamp = jiffies; + + /* Negotiate the MSS */ + newtp->mss_cache = TCP_BASE_MSS; + newtp->nip_out_of_order_queue = NULL; + newtp->advmss = dst_metric_advmss(dst); + if (tcp_sk(sk)->rx_opt.user_mss && + tcp_sk(sk)->rx_opt.user_mss < newtp->advmss) + newtp->advmss = tcp_sk(sk)->rx_opt.user_mss; + + /* Initialize RCV_MSS */ + tcp_nip_initialize_rcv_mss(newsk); + if (__inet_inherit_port(sk, newsk) < 0) + goto put_and_exit; + /* Remove the old sock from the ehash table and insert the newly + * created one; *own_req receives the result. + */ + *own_req = inet_ehash_nolisten(newsk,
req_to_sk(req_unhash), + &found_dup_sk); + + return newsk; + +out_overflow: + __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); +out_nonewsk: + dst_release(dst); +out: + tcp_listendrop(sk); + return NULL; +put_and_exit: + newinet->inet_opt = NULL; + inet_csk_prepare_forced_close(newsk); + tcp_nip_done(newsk); + goto out; +} + +/* Address-family specific connection-socket operations for NewIP. + * Hooks set to NULL are not provided by this implementation. + */ +static const struct inet_connection_sock_af_ops newip_specific = { + .queue_xmit = nip_queue_xmit, + .send_check = NULL, + .rebuild_header = NULL, + .sk_rx_dst_set = ninet_sk_rx_dst_set, + .conn_request = tcp_nip_conn_request, + .syn_recv_sock = tcp_nip_syn_recv_sock, + .net_header_len = 0, + .net_frag_header_len = 0, + .setsockopt = NULL, + .getsockopt = NULL, + .addr2sockaddr = NULL, + .sockaddr_len = sizeof(struct sockaddr_nin), + + .mtu_reduced = NULL, +}; + +/* Function: + * Initialize the TCP-related state of a sock. + * Parameter: + * sk: sock to initialize. + * Note: transmit timers are set up through tcp_nip_init_xmit_timers(); + * no congestion-control ops are assigned here and no MSS + * adjustment (mtu_reduced) hook is installed. + */ +static int tcp_nip_init_sock(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + + tp->out_of_order_queue = RB_ROOT; + tcp_nip_init_xmit_timers(sk); + INIT_LIST_HEAD(&tp->tsq_node); + + icsk->icsk_rto = TCP_TIMEOUT_INIT; + tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); + minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U); + + tp->snd_cwnd = TCP_INIT_CWND; + tp->app_limited = ~0U; + tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + tp->snd_cwnd_clamp = ~0; + tp->mss_cache = TCP_MSS_DEFAULT; + + tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; + tp->tsoffset = 0; + sk->sk_state = TCP_CLOSE; + sk->sk_write_space = sk_stream_write_space; + sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); + + /* Default buffer sizes come from the per-netns TCP sysctls */ + WRITE_ONCE(sk->sk_sndbuf, sock_net(sk)->ipv4.sysctl_tcp_wmem[1]); + WRITE_ONCE(sk->sk_rcvbuf, sock_net(sk)->ipv4.sysctl_tcp_rmem[1]); + + local_bh_disable(); + sk_sockets_allocated_inc(sk); + local_bh_enable(); + + icsk->icsk_af_ops = &newip_specific; + + return 0; +} + +static void skb_nip_entail(struct sock
*sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + + skb->csum = 0; + tcb->seq = tp->write_seq; + tcb->end_seq = tp->write_seq; + tcb->tcp_flags = TCPHDR_ACK; + tcb->sacked = 0; + + tcp_nip_add_write_queue_tail(sk, skb); + + sk->sk_wmem_queued += skb->truesize; + sk_mem_charge(sk, skb->truesize); +} + +/* Function: + * Copy user data from msg into newly allocated skbs, append them to + * the write queue and push them out. + * Parameter: + * sk: sending socket. + * msg: user message to send. + * size: not used; the amount left in msg drives the loop. + * Returns the number of bytes queued, or a negative errno when nothing + * was queued. + */ +int tcp_nip_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + int flags, err, copied = 0; + /* size_goal is handed to tcp_nip_push(); it must never be read + * uninitialized, so it starts at the fixed segment size. + */ + int mss_now = 0, size_goal = TCP_BASE_MSS; + bool process_backlog = false; + long timeo; + + lock_sock(sk); + + flags = msg->msg_flags; + + timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + + /* Wait for the connection unless it is already usable for sending */ + if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && + !tcp_passive_fastopen(sk)) { + err = sk_stream_wait_connect(sk, &timeo); + if (err != 0) + goto do_error; + } + + /* This should be in poll */ + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); + + copied = 0; + +restart: + mss_now = TCP_BASE_MSS; + + err = -EPIPE; + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) + goto do_error; + + while (msg_data_left(msg)) { + int copy = 0; + int max = mss_now; + + bool first_skb; +new_segment: + if (!sk_stream_memory_free(sk)) + goto wait_for_sndbuf; + + if (process_backlog && sk_flush_backlog(sk)) { + process_backlog = false; + goto restart; + } + first_skb = skb_queue_empty(&sk->sk_write_queue); + skb = sk_stream_alloc_skb(sk, mss_now, sk->sk_allocation, first_skb); + if (!skb) + goto wait_for_memory; + + process_backlog = true; + + skb_nip_entail(sk, skb); + copy = mss_now; + max = mss_now; + + /* Try to append data to the end of skb. */ + if (copy > msg_data_left(msg)) + copy = msg_data_left(msg); + + if (skb_availroom(skb) > 0) { + /* We have some space in skb head. Superb!
*/ + copy = min_t(int, copy, skb_availroom(skb)); + err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy); + if (err) + goto do_fault; + } else { + DEBUG("%s: msg too big! tcp cannot devide packet now\n", __func__); + goto out; + } + + if (!copied) + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; + + tp->write_seq += copy; + TCP_SKB_CB(skb)->end_seq += copy; + tcp_skb_pcount_set(skb, 0); + copied += copy; + if (!msg_data_left(msg)) { + if (unlikely(flags & MSG_EOR)) + TCP_SKB_CB(skb)->eor = 1; + goto out; + } + + continue; + +wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +wait_for_memory: + if (copied) + tcp_nip_push(sk, flags & ~MSG_MORE, mss_now, + TCP_NAGLE_PUSH, size_goal); + + err = sk_stream_wait_memory(sk, &timeo); + if (err != 0) + goto do_error; + + mss_now = 1024; + } + +out: + if (copied) + tcp_nip_push(sk, flags, mss_now, tp->nonagle, size_goal); + release_sock(sk); + return copied; + +do_fault: + /* Nothing was copied into this skb: take it off the queue again */ + if (!skb->len) { + tcp_unlink_write_queue(skb, sk); + sk_wmem_free_skb(sk, skb); + } + +do_error: + if (copied) + goto out; +out_err: + err = sk_stream_error(sk, flags, err); + /* make sure we wake any epoll edge trigger waiter */ + if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) + sk->sk_write_space(sk); + release_sock(sk); + return err; +} +EXPORT_SYMBOL(tcp_nip_sendmsg); + +/* Function: + * Copy received data from the socket receive queue into msg. + * Parameter: + * sk: receiving socket. + * msg: destination user message. + * len: maximum number of bytes to copy. + * nonblock: non-zero for a non-blocking receive. + * flags: MSG_* flags. + * addr_len: not used. + * Returns the number of bytes copied or a negative errno. + */ +int tcp_nip_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, + int flags, int *addr_len) +{ + struct tcp_sock *tp = tcp_sk(sk); + int copied = 0; + u32 peek_seq; + u32 *seq; + unsigned long used; + int err; + int target; + long timeo; + size_t len_tmp = len; + struct sk_buff *skb, *last; + + lock_sock(sk); + + /* A listening socket has no data stream; err must be set before + * jumping to out, which returns it. + */ + err = -ENOTCONN; + if (sk->sk_state == TCP_LISTEN) + goto out; + + timeo = sock_rcvtimeo(sk, nonblock); + + seq = &tp->copied_seq; + + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len_tmp); + + do { + u32 offset; + /* Next get a buffer.
*/ + last = skb_peek_tail(&sk->sk_receive_queue); + skb_queue_walk(&sk->sk_receive_queue, skb) { + last = skb; + /* Now that we have two receive queues this + * shouldn't happen. + */ + if (WARN(before(*seq, TCP_SKB_CB(skb)->seq), + "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n", + *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, + flags)) + break; + offset = *seq - TCP_SKB_CB(skb)->seq; + if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { + pr_err_once("%s: found a SYN, please report !\n", __func__); + offset--; + } + if (offset < skb->len) + goto found_ok_skb; + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + goto found_fin_ok; + /* If the first skb in sk_receive_queue is not the one to + * copy from, MSG_PEEK is expected to be set in flags. + */ + WARN(!(flags & MSG_PEEK), + "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n", + *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags); + } + + /* Reaching this point means sk_receive_queue has been fully read */ + /* If the backlog is empty too, stop once target bytes were read */ + if (copied >= target && !sk->sk_backlog.tail) + break; + + if (copied) { + if (sk->sk_err || + sk->sk_state == TCP_CLOSE || + (sk->sk_shutdown & RCV_SHUTDOWN) || + !timeo || + signal_pending(current)) + break; + } else { + if (sock_flag(sk, SOCK_DONE)) + break; + + if (sk->sk_err) { + copied = sock_error(sk); + break; + } + + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + + if (sk->sk_state == TCP_CLOSE) { + if (!sock_flag(sk, SOCK_DONE)) { + /* This occurs when user tries to read + * from never connected socket. + */ + copied = -ENOTCONN; + break; + } + break; + } + + if (!timeo) { + copied = -EAGAIN; + break; + } + + if (signal_pending(current)) { + copied = sock_intr_errno(timeo); + break; + } + } + + if (copied >= target) { + /* Do not sleep, just process backlog.
*/ + release_sock(sk); + lock_sock(sk); + } else { + DEBUG("%s: no enough data receive queue, wait\n", __func__); + sk_wait_data(sk, &timeo, last); + } + continue; +found_ok_skb: + /* Copy at most len_tmp bytes starting at offset inside this skb */ + used = skb->len - offset; + if (len_tmp < used) + used = len_tmp; + DEBUG("%s: copy data into msg, len=%ld\n", __func__, used); + if (!(flags & MSG_TRUNC)) { + err = skb_copy_datagram_msg(skb, offset, msg, used); + if (err) { + DEBUG("%s: copy data failed!\n", __func__); + if (!copied) + copied = -EFAULT; + break; + } + } + *seq += used; + len_tmp -= used; + copied += used; + +skip_copy: + /* The skb still holds unread data: keep it queued */ + if (used + offset < skb->len) + continue; + + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + goto found_fin_ok; + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); + continue; + +found_fin_ok: + /* Process the FIN. */ + ++*seq; + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); + break; + } while (len_tmp > 0); + + release_sock(sk); + return copied; + +out: + release_sock(sk); + return err; +} + +/* Free every skb on the write queue and reset the send-side bookkeeping. */ +static inline void tcp_nip_write_queue_purge(struct sock *sk) +{ + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) + sk_wmem_free_skb(sk, skb); + + tcp_clear_all_retrans_hints(tcp_sk(sk)); + sk->sk_send_head = NULL; + inet_csk(sk)->icsk_backoff = 0; +} + +/* Free every skb on the singly linked out-of-order list. */ +void skb_nip_ofo_queue_purge(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + while ((skb = tp->nip_out_of_order_queue) != NULL) { + tp->nip_out_of_order_queue = tp->nip_out_of_order_queue->next; + kfree_skb(skb); + } +} + +/* Release the per-socket TCP resources: timers, write queue, out-of-order + * list, bound port and saved SYN. + */ +void tcp_nip_destroy_sock(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tcp_nip_clear_xmit_timers(sk); + + tcp_nip_write_queue_purge(sk); + + skb_nip_ofo_queue_purge(sk); + + if (inet_csk(sk)->icsk_bind_hash) + inet_put_port(sk); + + tcp_saved_syn_free(tp); + local_bh_disable(); + sk_sockets_allocated_dec(sk); + local_bh_enable(); +} +EXPORT_SYMBOL(tcp_nip_destroy_sock); + +/* Function: + * Handler for socks in the listen and established states, called by tcp_nip_rcv + * Parameter: + * skb:
packet received from the network layer + * sk: sock instance to process + */ +static int tcp_nip_do_rcv(struct sock *sk, struct sk_buff *skb) +{ + DEBUG("%s: received newip tcp skb, sk_state=%d\n", __func__, sk->sk_state); + + if (sk->sk_state == TCP_ESTABLISHED) { + tcp_nip_rcv_established(sk, skb, tcp_hdr(skb), skb->len); + return 0; + } + + /* Placeholder for cookie-based connection setup, whose purpose is + * mainly SYN-flood protection; currently this only logs. + */ + if (sk->sk_state == TCP_LISTEN) + DEBUG("found TCP_LISTEN SOCK!!!\n"); + + if (tcp_nip_rcv_state_process(sk, skb)) + goto discard; + return 0; + +discard: + kfree_skb(skb); + return 0; +} + +/* Function: + * Fill the TCP private control block of the skb from the TCP header. + * The header fields are in network byte order; they are converted to + * host byte order here so later code can use them directly. + * Parameter: + * skb: packet delivered by the network layer + * th: TCP header of the packet + */ +static void tcp_nip_fill_cb(struct sk_buff *skb, const struct tcphdr *th) +{ + barrier(); + + TCP_SKB_CB(skb)->seq = ntohl(th->seq); + /* SYN and FIN each consume one sequence number */ + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + + skb->len - th->doff * 4); + + TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); + TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); + TCP_SKB_CB(skb)->tcp_tw_isn = 0; + TCP_SKB_CB(skb)->sacked = 0; +} + +/* Function: + * Main TCP entry point from the network layer to the transport layer; + * receives the packets delivered by the network layer. + * Parameter: + * skb: packet delivered by the network layer + */ +static int tcp_nip_rcv(struct sk_buff *skb) +{ + const struct tcphdr *th; + bool refcounted; + struct sock *sk; + int ret; + struct net *net = dev_net(skb->dev); + + if (skb->pkt_type != PACKET_HOST) + goto discard_it; + + th = (const struct tcphdr *)skb->data; + + /* A data offset smaller than the fixed header is malformed */ + if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) + goto bad_packet; + + sk = __ninet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), + th->source, th->dest, 1, + &refcounted); + if (!sk) + goto no_tcp_socket; + + if (sk->sk_state == TCP_TIME_WAIT) + goto do_time_wait; + if (sk->sk_state == TCP_NEW_SYN_RECV) { + DEBUG("%s: TCP server into third shake hands!
sk->sk_state:%d", + __func__, sk->sk_state); + struct request_sock *req = inet_reqsk(sk); + struct sock *nsk; + + /* Continue on the listener that owns this request */ + sk = req->rsk_listener; + tcp_nip_fill_cb(skb, th); + + sock_hold(sk); + refcounted = true; + /* A new sock is created next; it enters TCP_SYN_RECV + * and is then moved to the Established state. + */ + nsk = tcp_nip_check_req(sk, skb, req); + if (!nsk) { + DEBUG("%s create newsk failure!!!", __func__); + reqsk_put(req); + goto discard_and_relse; + } + if (tcp_nip_child_process(sk, nsk, skb)) { + goto discard_and_relse; + } else { + sock_put(sk); + return 0; + } + } + + tcp_nip_fill_cb(skb, th); + + if (tcp_filter(sk, skb)) + goto discard_and_relse; + th = (const struct tcphdr *)skb->data; + skb->dev = NULL; + + if (sk->sk_state == TCP_LISTEN) { + DEBUG("%s: TCP server into first shake hands! sk->sk_state:%d", + __func__, sk->sk_state); + ret = tcp_nip_do_rcv(sk, skb); + goto put_and_return; + } + bh_lock_sock_nested(sk); + + ret = 0; + /* Process directly unless user context owns the socket, in which + * case the packet is queued on the backlog. + */ + if (!sock_owned_by_user(sk)) { + ret = tcp_nip_do_rcv(sk, skb); + } else { + DEBUG("%s: sock locked by user! put packet into backlog\n", + __func__); + if (tcp_add_backlog(sk, skb)) + goto discard_and_relse; + } + + bh_unlock_sock(sk); + +put_and_return: + if (refcounted) + sock_put(sk); + return ret ?
-1 : 0; + +no_tcp_socket: + /* Checksum verification is not handled yet; just drop the skb */ + DEBUG("%s: cannot find related tcp sock for skb", __func__); + goto discard_it; +bad_packet: + goto discard_it; +discard_it: + DEBUG("%s: drop tcp newip skb and release it\n", __func__); + kfree_skb(skb); + return 0; + +discard_and_relse: + sk_drops_add(sk, skb); + if (refcounted) + sock_put(sk); + goto discard_it; +/* TIME_WAIT sockets get no special handling here: the skb is dropped */ +do_time_wait: + goto discard_it; +} + +/* Early demux placeholder: intentionally empty. */ +static void tcp_nip_early_demux(struct sk_buff *skb) +{ +} + +/* Move the socket to TCP_CLOSE, stop its timers and either notify the + * owner (socket still alive) or destroy it (socket already dead). + */ +void tcp_nip_done(struct sock *sk) +{ + struct request_sock *req = tcp_sk(sk)->fastopen_rsk; + + if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) + TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); + + tcp_set_state(sk, TCP_CLOSE); + inet_csk_clear_xmit_timers(sk); + if (req) + reqsk_fastopen_remove(sk, req, false); + + sk->sk_shutdown = SHUTDOWN_MASK; + + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + } else { + /* Sanity checks; each WARN_ON is silent when its condition is 0 */ + WARN_ON(sk->sk_state != TCP_CLOSE); + WARN_ON(!sock_flag(sk, SOCK_DEAD)); + + /* It cannot be in hash table!
*/ + WARN_ON(!sk_unhashed(sk)); + + /* If it has not 0 inet_sk(sk)->inet_num, it must be bound */ + WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash); + sk->sk_prot->destroy(sk); + + sk_nip_stream_kill_queues(sk); + + local_bh_disable(); + percpu_counter_dec(sk->sk_prot->orphan_count); + local_bh_enable(); + sock_put(sk); + DEBUG("%s: close sock done!!\n", __func__); + } +} +EXPORT_SYMBOL_GPL(tcp_nip_done); + +/* Function: + * Disconnect from the peer, non-blocking. + * Frees the read and write queues, sends an RST (not sent for now) + * and clears the timers. + * Parameter: + * sk: socket to disconnect. + * flags: not used. + */ +int tcp_nip_disconnect(struct sock *sk, int flags) +{ + struct inet_sock *inet = inet_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + int err = 0; + int old_state = sk->sk_state; + + if (old_state != TCP_CLOSE) + tcp_set_state(sk, TCP_CLOSE); + + if (old_state == TCP_LISTEN) + inet_csk_listen_stop(sk); + else if (old_state == TCP_SYN_SENT) + sk->sk_err = ECONNRESET; + + tcp_nip_clear_xmit_timers(sk); + __skb_queue_purge(&sk->sk_receive_queue); + tcp_write_queue_purge(sk); + + /* Clearing the out-of-order queue (now a singly linked list) would + * go here. NOTE(review): no purge call is made at this point. + */ + inet->inet_dport = 0; + + sk->sk_shutdown = 0; + sock_reset_flag(sk, SOCK_DONE); + tp->srtt_us = 0; + /* Advance write_seq past the old window; 0 is avoided */ + tp->write_seq += tp->max_window + 2; + if (tp->write_seq == 0) + tp->write_seq = 1; + tp->snd_cwnd = 2; + icsk->icsk_probes_out = 0; + tp->packets_out = 0; + tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + tp->snd_cwnd_cnt = 0; + tp->window_clamp = 0; + tp->delivered = 0; + tcp_clear_retrans(tp); + tp->total_retrans = 0; + inet_csk_delack_init(sk); + + icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; + sk->sk_send_head = NULL; + memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); + __sk_dst_reset(sk); + dst_release(sk->sk_rx_dst); + sk->sk_rx_dst = NULL; + tp->segs_in = 0; + tp->segs_out = 0; + tp->bytes_acked = 0; + tp->bytes_received = 0; + tp->data_segs_in = 0; + tp->data_segs_out = 0; + + WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); + + if (sk->sk_frag.page) { + put_page(sk->sk_frag.page); + sk->sk_frag.page = NULL;
+ sk->sk_frag.offset = 0; + } + + sk->sk_error_report(sk); + return err; +} +EXPORT_SYMBOL(tcp_nip_disconnect); + +/* Protocol operations for NewIP TCP sockets. Generic TCP/inet helpers are + * reused where no tcp_nip_* variant is named; memory accounting is shared + * with the regular TCP counters and sysctls listed below. + */ +struct proto tcp_nip_prot = { + .name = "NIP_TCP", + .owner = THIS_MODULE, + .close = tcp_nip_close, + .connect = tcp_nip_connect, + .disconnect = tcp_nip_disconnect, + .accept = inet_csk_accept, + .ioctl = tcp_ioctl, + .init = tcp_nip_init_sock, + .destroy = tcp_nip_destroy_sock, + .shutdown = tcp_nip_shutdown, + .setsockopt = tcp_setsockopt, + .getsockopt = tcp_getsockopt, + .recvmsg = tcp_nip_recvmsg, + .sendmsg = tcp_nip_sendmsg, + .sendpage = NULL, + .backlog_rcv = tcp_nip_do_rcv, + .release_cb = tcp_nip_release_cb, + .hash = ninet_hash, + .unhash = ninet_unhash, + .get_port = inet_csk_get_port, + .sockets_allocated = &tcp_sockets_allocated, + .orphan_count = &tcp_orphan_count, + .memory_allocated = &tcp_memory_allocated, + .memory_pressure = &tcp_memory_pressure, + .sysctl_mem = sysctl_tcp_mem, + .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), + .max_header = MAX_TCP_HEADER, + .obj_size = sizeof(struct tcp_nip_sock), + .rsk_prot = &tcp_nip_request_sock_ops, + .h.hashinfo = &tcp_hashinfo, + .no_autobind = true, +}; + +/* Receive-side registration: tcp_nip_rcv handles IPPROTO_TCP packets. */ +static const struct ninet_protocol tcp_nip_protocol = { + .early_demux = tcp_nip_early_demux, + .handler = tcp_nip_rcv, + .flags = 0, +}; + +/* Socket-side registration: SOCK_STREAM/IPPROTO_TCP maps to tcp_nip_prot. */ +static struct inet_protosw tcp_nip_protosw = { + .type = SOCK_STREAM, + .protocol = IPPROTO_TCP, + .prot = &tcp_nip_prot, + .ops = &ninet_stream_ops, + .flags = INET_PROTOSW_PERMANENT | + INET_PROTOSW_ICSK, +}; + +/* Initialize the TCP protocol: register the receive handler first, then + * the socket interface; the handler is removed again if the second step + * fails. + */ +int __init tcp_nip_init(void) +{ + int ret; + + ret = ninet_add_protocol(&tcp_nip_protocol, IPPROTO_TCP); + if (ret) + goto out; + + /* register ninet protocol */ + ret = ninet_register_protosw(&tcp_nip_protosw); + if (ret) + goto out_nip_tcp_protocol; + +out: + return ret; + +out_nip_tcp_protosw: + ninet_unregister_protosw(&tcp_nip_protosw); +out_nip_tcp_protocol: +
ninet_del_protocol(&tcp_nip_protocol, IPPROTO_TCP); + goto out; +} + +/* Undo tcp_nip_init(): unregister the socket interface, then the handler. */ +void tcp_nip_exit(void) +{ + ninet_unregister_protosw(&tcp_nip_protosw); + ninet_del_protocol(&tcp_nip_protocol, IPPROTO_TCP); +} + diff --git a/net/newip/tcp_nip_input.c b/net/newip/tcp_nip_input.c new file mode 100755 index 0000000000000000000000000000000000000000..4f89c25e6ee4092fc10415da53a91d4ad9c0291f --- /dev/null +++ b/net/newip/tcp_nip_input.c @@ -0,0 +1,1401 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#define pr_fmt(fmt) "TCP: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define FLAG_DATA 0x01 /* Incoming frame contained data. */ +#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ +#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ +#define FLAG_RETRANS_DATA_ACKED 0x08 /* some of which was retransmitted. */ +#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ +#define FLAG_DATA_SACKED 0x20 /* New SACK.
*/ +#define FLAG_ECE 0x40 /* ECE in this ACK */ +#define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */ +#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ +#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ +#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ +#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ +#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ +#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ +#define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */ + +#define FLAG_ACKED (FLAG_DATA_ACKED | FLAG_SYN_ACKED) +#define FLAG_NOT_DUP (FLAG_DATA | FLAG_WIN_UPDATE | FLAG_ACKED) +#define FLAG_CA_ALERT (FLAG_DATA_SACKED | FLAG_ECE) +#define FLAG_FORWARD_PROGRESS (FLAG_ACKED | FLAG_DATA_SACKED) + +#define TCP_REMNANT (TCP_FLAG_FIN | TCP_FLAG_URG | TCP_FLAG_SYN | TCP_FLAG_PSH) +#define TCP_HP_BITS (~(TCP_RESERVED_BITS | TCP_FLAG_PSH)) + +#define REXMIT_NONE 0 /* no loss recovery to do */ +#define REXMIT_LOST 1 /* retransmit packets marked lost */ +#define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */ + +#define TCP_MAX_MSS 1460 + +/* Handle a received FIN: schedule an ACK, mark the receive side shut down + * and advance the connection state according to the current state. + */ +void tcp_nip_fin(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + inet_csk_schedule_ack(sk); + + sk->sk_shutdown |= RCV_SHUTDOWN; + sock_set_flag(sk, SOCK_DONE); + + switch (sk->sk_state) { + case TCP_SYN_RECV: + case TCP_ESTABLISHED: + /* Move to CLOSE_WAIT */ + tcp_set_state(sk, TCP_CLOSE_WAIT); + inet_csk(sk)->icsk_ack.pingpong = 1; + break; + + case TCP_CLOSE_WAIT: + case TCP_CLOSING: + /* Received a retransmission of the FIN, do + * nothing. + */ + break; + case TCP_LAST_ACK: + /* RFC793: Remain in the LAST-ACK state. */ + break; + + case TCP_FIN_WAIT1: + /* This case occurs when a simultaneous close + * happens, we must ack the received FIN and + * enter the CLOSING state.
+ */ + tcp_nip_send_ack(sk); + tcp_set_state(sk, TCP_CLOSING); + break; + case TCP_FIN_WAIT2: + /* Received a FIN -- send ACK and enter TIME_WAIT. */ + tcp_nip_send_ack(sk); + inet_csk_reset_keepalive_timer(sk, TCP_TIMEWAIT_LEN); + break; + default: + /* Only TCP_LISTEN and TCP_CLOSE are left, in these + * cases we should never reach this piece of code. + */ + pr_err("%s: Impossible, sk->sk_state=%d\n", + __func__, sk->sk_state); + break; + } + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); +} + +/* Move segments from the head of the out-of-order list to the receive + * queue for as long as they are contiguous with rcv_nxt. + */ +static void tcp_nip_ofo_queue(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + while ((skb = tp->nip_out_of_order_queue) != NULL) { + /* The head still leaves a hole after rcv_nxt: stop here */ + if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) + return; + tp->nip_out_of_order_queue = skb->next; + skb->next = NULL; + /* Segment carries nothing beyond rcv_nxt: delivering it would + * duplicate data and move rcv_nxt backwards, so drop it. + */ + if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { + sk_drops_add(sk, skb); + __kfree_skb(skb); + continue; + } + __skb_queue_tail(&sk->sk_receive_queue, skb); + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + } +} + +/* Insert skb into the out-of-order list, which is a singly linked list + * kept sorted by starting sequence number. A segment whose starting + * sequence number is already present is freed. Ordering uses before() + * so that it stays correct across 32-bit sequence wraparound. + */ +static void tcp_nip_data_queue_ofo(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *pre_skb, *cur_skb; + + inet_csk_schedule_ack(sk); + skb->next = NULL; + if (!tp->nip_out_of_order_queue) { + tp->nip_out_of_order_queue = skb; + skb_set_owner_r(skb, sk); + return; + } + pre_skb = tp->nip_out_of_order_queue; + cur_skb = pre_skb->next; + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(pre_skb)->seq) { + __kfree_skb(skb); + return; + } else if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(pre_skb)->seq)) { + /* New head of the list */ + tp->nip_out_of_order_queue = skb; + skb->next = pre_skb; + skb_set_owner_r(skb, sk); + return; + } + while (cur_skb) { + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(cur_skb)->seq) { + __kfree_skb(skb); + return; + } else if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(cur_skb)->seq)) { + /* Insert between pre_skb and cur_skb */ + pre_skb->next = skb; + skb->next = cur_skb; + skb_set_owner_r(skb, sk); + return; + } + pre_skb = pre_skb->next; + cur_skb = cur_skb->next; + } + /* Largest sequence number so far: append at the tail */ + pre_skb->next = skb; + skb_set_owner_r(skb, sk); +} + +static void
tcp_nip_data_queue(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + bool fragstolen = false; + int eaten = -1; + + /* Pure ACK: there is no payload to queue */ + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { + DEBUG("%s: no data, only handle ack.\n", __func__); + __kfree_skb(skb); + return; + } + + if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { + if (tcp_receive_window(tp) == 0) + inet_csk_schedule_ack(sk); + } + + /* Drop segments that start beyond the window or end at/before rcv_nxt */ + if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_wup + tp->rcv_wnd) || + !after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { + DEBUG("seq is %d and %d\n", TCP_SKB_CB(skb)->seq, tp->rcv_nxt); + __kfree_skb(skb); + return; + } + + /* Strip the TCP header so only payload remains */ + __skb_pull(skb, tcp_hdr(skb)->doff * 4); + if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { + DEBUG("%s: tcp newip packet received. data len:%d\n", __func__, skb->len); + __skb_queue_tail(&sk->sk_receive_queue, skb); + skb_set_owner_r(skb, sk); + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + + inet_csk_schedule_ack(sk); + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + tcp_nip_fin(sk); + + if (tp->nip_out_of_order_queue) + tcp_nip_ofo_queue(sk); + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk); + return; + } + + /* Not the next expected segment: keep it on the out-of-order list */ + tcp_nip_data_queue_ofo(sk, skb); +} + +/* Account a drop on the socket and free the skb. */ +static void tcp_drop(struct sock *sk, struct sk_buff *skb) +{ + sk_drops_add(sk, skb); + __kfree_skb(skb); +} + +/* Push queued segments out if there is anything left to send. */ +static inline void tcp_nip_push_pending_frames(struct sock *sk) +{ + if (tcp_nip_send_head(sk)) { + struct tcp_sock *tp = tcp_sk(sk); + + __tcp_nip_push_pending_frames(sk, TCP_BASE_MSS, tp->nonagle); + } +} + +/* Notify the socket that write space is available. */ +static void tcp_nip_new_space(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + sk->sk_write_space(sk); +} + +static void tcp_nip_check_space(struct sock *sk) +{ + /* Full memory barrier before testing SOCK_NOSPACE (comment placed + * before the barrier as checkpatch requires) + */ + smp_mb(); + if (sk->sk_socket && + test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) + tcp_nip_new_space(sk); +} + +/* After ACK processing: send pending data, then wake writers if needed. */ +static inline void tcp_nip_data_snd_check(struct sock *sk) +{ + tcp_nip_push_pending_frames(sk); + tcp_nip_check_space(sk); +} + +void tcp_nip_send_delayed_ack(struct
sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + int ato = icsk->icsk_ack.ato; + unsigned long timeout; + + /* The delayed-ACK timeout is pinned to TCP_DELACK_MIN here */ + icsk->icsk_ack.ato = TCP_DELACK_MIN; + ato = TCP_DELACK_MIN; + + /* Stay within the limit we were given */ + timeout = jiffies + ato; + + /* Use new timeout only if there wasn't a older one earlier. */ + if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { + /* The pending timer is about to fire anyway: ACK right now */ + if (time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) { + tcp_nip_send_ack(sk); + return; + } + + if (!time_before(timeout, icsk->icsk_ack.timeout)) + timeout = icsk->icsk_ack.timeout; + } + icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; + icsk->icsk_ack.timeout = timeout; + sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); +} + +/* Decide between an immediate ACK and a delayed ACK. + * ofo_possible: non-zero when a non-empty out-of-order list should force + * an immediate ACK. + */ +static void __tcp_nip_ack_snd_check(struct sock *sk, int ofo_possible) +{ + struct tcp_sock *tp = tcp_sk(sk); + + inet_csk(sk)->icsk_ack.rcv_mss = TCP_BASE_MSS; + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && + __nip_tcp_select_window(sk) >= tp->rcv_wnd) || + /* We have out of order data. */ + (ofo_possible && tp->nip_out_of_order_queue)) { + tcp_nip_send_ack(sk); + } else { + /* Else, send delayed ack. */ + DEBUG("%s: send delayed ack!!", __func__); + tcp_nip_send_delayed_ack(sk); + } +} + +/* Send an ACK (immediate or delayed) only if one has been scheduled. */ +static inline void tcp_nip_ack_snd_check(struct sock *sk) +{ + if (!inet_csk_ack_scheduled(sk)) { + /* We sent a data segment already.
*/ + DEBUG("We sent a data segment already.!!\n"); + return; + } + __tcp_nip_ack_snd_check(sk, 1); +} + +static void tcp_nip_snd_una_update(struct tcp_sock *tp, u32 ack) +{ + u32 delta = ack - tp->snd_una; + + sock_owned_by_me((struct sock *)tp); + tp->bytes_acked += delta; + tp->snd_una = ack; +} + +void tcp_nip_rearm_rto(struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + + if (!tp->packets_out) { + inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); + } else { + u32 rto = inet_csk(sk)->icsk_rto; + + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, + TCP_RTO_MAX); + } +} + +static int tcp_nip_clean_rtx_queue(struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + u32 prior_sacked = tp->sacked_out; + struct sk_buff *skb; + int flag = 0; + + while ((skb = tcp_write_queue_head(sk)) && skb != tcp_nip_send_head(sk)) { + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + u8 sacked = scb->sacked; + u32 acked_pcount; + + if (after(scb->end_seq, tp->snd_una)) { + if (tcp_skb_pcount(skb) == 1 || !after(tp->snd_una, scb->seq)) + break; + DEBUG("%s: ack error!\n", __func__); + } else { + prefetchw(skb->next); + acked_pcount = tcp_skb_pcount(skb); + } + + if (likely(!(scb->tcp_flags & TCPHDR_SYN))) { + flag |= FLAG_DATA_ACKED; + } else { + flag |= FLAG_SYN_ACKED; + tp->retrans_stamp = 0; + } + + tp->packets_out -= acked_pcount; + + tcp_unlink_write_queue(skb, sk); + sk_wmem_free_skb(sk, skb); + } + if (flag & FLAG_ACKED) + tcp_nip_rearm_rto(sk); + return 0; +} + +/* Function: + * 分配一个连接请求块,用于保存连接请求信息 + * 同时初始化连接过程中用来 + * 发送ACK/RST段的操作集合,以便在建立过程中能方便的调用这些接口。 + * 设置socket状态为TCP_NEW_SYN_RECV + * Parameter: + * ops: 请求控制块的函数接口。 + * sk_listener: 传输控制块。 + * attach_listener: 是否设定cookie。 + */ +struct request_sock *ninet_reqsk_alloc(const struct request_sock_ops *ops, + struct sock *sk_listener, + bool attach_listener) +{ + struct request_sock *req = reqsk_alloc(ops, 
sk_listener, + attach_listener); + + if (req) { + struct inet_request_sock *ireq = inet_rsk(req); + + ireq->ireq_opt = NULL; + ireq->nip_pktopts = NULL; + atomic64_set(&ireq->ir_cookie, 0); + ireq->ireq_state = TCP_NEW_SYN_RECV; + write_pnet(&ireq->ireq_net, sock_net(sk_listener)); + ireq->ireq_family = sk_listener->sk_family; + } + + return req; +} +EXPORT_SYMBOL(ninet_reqsk_alloc); + +static void tcp_nip_drop(struct sock *sk, struct sk_buff *skb) +{ + sk_drops_add(sk, skb); + __kfree_skb(skb); +} + +void tcp_nip_parse_mss( + struct tcp_options_received *opt_rx, + const struct tcphdr *th, + const unsigned char *ptr, + int opsize, + int estab) +{ + if (opsize == TCPOLEN_MSS && th->syn && !estab) { + u16 in_mss = get_unaligned_be16(ptr); + + if (in_mss) { + if (opt_rx->user_mss && + opt_rx->user_mss < in_mss) + in_mss = opt_rx->user_mss; + opt_rx->mss_clamp = in_mss; + } + } +} + +/* Function: + * Look for tcp options. Normally only called on SYN and SYNACK packets. + * 对skb中TCP选项的解析 + * Parameter: + * skb: 传输控制块缓冲区。 + * opt_rx: 保存TCP选项的结构体。 + * estab: WANTCOOKIE + * foc: len字段。 + */ +void tcp_nip_parse_options(const struct sk_buff *skb, + struct tcp_options_received *opt_rx, int estab, + struct tcp_fastopen_cookie *foc) +{ + const unsigned char *ptr; + const struct tcphdr *th = tcp_hdr(skb); + /* TCP option长度 = TCP头的长度 - TCP结构体的长度 */ + int length = (th->doff * 4) - sizeof(struct tcphdr); + + /* 指向了option位置的指针 */ + ptr = (const unsigned char *)(th + 1); + opt_rx->saw_tstamp = 0; + + while (length > 0) { + int opcode = *ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2) /* "2 - silly options" */ + return; + if (opsize > length) + return; /* don't parse partial options */ + switch (opcode) { + case TCPOPT_MSS: + tcp_nip_parse_mss(opt_rx, th, ptr, opsize, estab); + break; + default: + break; + } + ptr += opsize - 2; + length -= opsize; + } + } +} 
+EXPORT_SYMBOL(tcp_nip_parse_options); + +/*Function: + * 记录syn中的信息。 + *Parameter: + * req: 请求连接控制块。 + * rx_opt: 保存TCP选项的结构体。 + * skb: 传输控制块缓冲区。 + * sk: 传输控制块。 + */ +static void tcp_nip_reqsk_record_syn(const struct sock *sk, + struct request_sock *req, + const struct sk_buff *skb) +{ + if (tcp_sk(sk)->save_syn) { + u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb); + struct saved_syn *saved_syn; + u32 mac_hdrlen; + void *base; + + if (tcp_sk(sk)->save_syn == 2) { /* 2 - Save full header. */ + base = skb_mac_header(skb); + mac_hdrlen = skb_mac_header_len(skb); + len += mac_hdrlen; + } else { + base = skb_network_header(skb); + mac_hdrlen = 0; + } + + saved_syn = kmalloc(struct_size(saved_syn, data, len), + GFP_ATOMIC); + if (saved_syn) { + saved_syn->mac_hdrlen = mac_hdrlen; + saved_syn->network_hdrlen = skb_network_header_len(skb); + saved_syn->tcp_hdrlen = tcp_hdrlen(skb); + memcpy(saved_syn->data, base, len); + req->saved_syn = saved_syn; + } + } +} + +/* Function: + * 根据收到SYN段中的选项和序号来初始化连接请求块信息。 + * Parameter: + * req: 请求连接控制块。 + * rx_opt: 保存TCP选项的结构体。 + * skb: 传输控制块缓冲区。 + * sk: 传输控制块。 + */ +static void tcp_nip_openreq_init(struct request_sock *req, + const struct tcp_options_received *rx_opt, + struct sk_buff *skb, const struct sock *sk) +{ + struct inet_request_sock *ireq = inet_rsk(req); + + req->rsk_rcv_wnd = 0; + tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; + tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + tcp_rsk(req)->snt_synack = tcp_clock_us(); + tcp_rsk(req)->last_oow_ack_time = 0; + req->mss = rx_opt->mss_clamp; + req->ts_recent = rx_opt->saw_tstamp ? 
rx_opt->rcv_tsval : 0; + ireq->tstamp_ok = rx_opt->tstamp_ok; + ireq->snd_wscale = rx_opt->snd_wscale; + ireq->acked = 0; + ireq->ecn_ok = 0; + ireq->ir_rmt_port = tcp_hdr(skb)->source; + ireq->ir_num = ntohs(tcp_hdr(skb)->dest); + ireq->ir_mark = sk->sk_mark; +} + +/* Function: + * 根据监听sock和req,为新连接创建一个传输控制块,并初始化。 + * Parameter: + * sk: 监听的传输控制块。 + * req: 请求连接控制块。 + * skb: 传输控制块缓冲区。 + */ +struct sock *tcp_nip_create_openreq_child(const struct sock *sk, + struct request_sock *req, + struct sk_buff *skb) +{ + /* 克隆一个传输控制块,并对新的传输控制块上锁 + * 觉得可以复用这个公共的,待检测 + */ + struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); + + if (newsk) { + const struct inet_request_sock *ireq = inet_rsk(req); + struct tcp_request_sock *treq = tcp_rsk(req); + struct inet_connection_sock *newicsk = inet_csk(newsk); + struct tcp_sock *newtp = tcp_sk(newsk); + + /* Now setup tcp_sock */ + newtp->pred_flags = 0; + + /* 接收序号、发送序号相关变量初始化, + * 第二次握手服务器发送SYN+ACK段中的ack + */ + newtp->rcv_wup = treq->rcv_isn + 1; + newtp->copied_seq = treq->rcv_isn + 1; + newtp->rcv_nxt = treq->rcv_isn + 1; + newtp->segs_in = 1; + /* 第二次握手服务器发送SYN+ACK段中的seq+1 */ + newtp->snd_sml = treq->snt_isn + 1; + newtp->snd_una = treq->snt_isn + 1; + newtp->snd_nxt = treq->snt_isn + 1; + newtp->snd_up = treq->snt_isn + 1; + + /* prequeue队列初始化 */ + INIT_LIST_HEAD(&newtp->tsq_node); + + /* 接收第一次握手更新的发送窗口的ACK段序号 */ + tcp_init_wl(newtp, treq->rcv_isn); + + /* 时延相关变量初始化 */ + minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U); + newicsk->icsk_rto = TCP_TIMEOUT_INIT; + newicsk->icsk_ack.lrcvtime = tcp_jiffies32; + + /* 拥塞控制相关变量初始化 */ + newtp->packets_out = 0; + + newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + + newtp->lsndtime = tcp_jiffies32; + + newtp->total_retrans = req->num_retrans; + + newtp->snd_cwnd = TCP_INIT_CWND; + + /* There's a bubble in the pipe until at least the first ACK. 
*/ + newtp->app_limited = ~0U; + + /* 初始化几个定时器 */ + tcp_nip_init_xmit_timers(newsk); + newtp->write_seq = treq->snt_isn + 1; + newtp->pushed_seq = treq->snt_isn + 1; + + /* TCP选项相关 */ + newtp->rx_opt.saw_tstamp = 0; + + newtp->rx_opt.dsack = 0; + newtp->rx_opt.num_sacks = 0; + + newtp->urg_data = 0; + + newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; + newtp->window_clamp = req->rsk_window_clamp; + newtp->rcv_ssthresh = req->rsk_rcv_wnd; + newtp->rcv_wnd = req->rsk_rcv_wnd; + newtp->rx_opt.wscale_ok = ireq->wscale_ok; + if (newtp->rx_opt.wscale_ok) { + newtp->rx_opt.snd_wscale = ireq->snd_wscale; + newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; + } else { + newtp->rx_opt.snd_wscale = 0; + newtp->rx_opt.rcv_wscale = 0; + newtp->window_clamp = min(newtp->window_clamp, 65535U); + } + newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) << + newtp->rx_opt.snd_wscale); + newtp->max_window = newtp->snd_wnd; + + if (newtp->rx_opt.tstamp_ok) { + newtp->rx_opt.ts_recent = req->ts_recent; + newtp->rx_opt.ts_recent_stamp = get_seconds(); + newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + } else { + newtp->rx_opt.ts_recent_stamp = 0; + newtp->tcp_header_len = sizeof(struct tcphdr); + } + newtp->tsoffset = 0; + + /* 确定最后传入段的大小 */ + if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) + newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; + newtp->rx_opt.mss_clamp = req->mss; + newtp->fastopen_req = NULL; + newtp->fastopen_rsk = NULL; + newtp->syn_data_acked = 0; + newtp->rack.mstamp = 0; + newtp->rack.advanced = 0; + + __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); + } + return newsk; +} +EXPORT_SYMBOL(tcp_nip_create_openreq_child); + +void tcp_nip_openreq_init_rwin(struct request_sock *req, + const struct sock *sk_listener, + const struct dst_entry *dst) +{ + struct inet_request_sock *ireq = inet_rsk(req); + const struct tcp_sock *tp = tcp_sk(sk_listener); + u16 user_mss = READ_ONCE(tp->rx_opt.user_mss); + int full_space = 
tcp_full_space(sk_listener); + int mss = dst_metric_advmss(dst); + u32 window_clamp; + __u8 rcv_wscale; + int sysctl_tcp_nip_window_scaling = 0; + + mss = TCP_BASE_MSS; + + window_clamp = READ_ONCE(tp->window_clamp); + /* Set this up on the first call only */ + req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW); + + /* limit the window selection if the user enforce a smaller rx buffer */ + if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK && + (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) + req->rsk_window_clamp = full_space; + + /* tcp_full_space because it is guaranteed to be the first packet */ + tcp_select_initial_window(sk_listener, full_space, + mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), + &req->rsk_rcv_wnd, + &req->rsk_window_clamp, + sysctl_tcp_nip_window_scaling, + &rcv_wscale, + 0); + ireq->rcv_wscale = rcv_wscale; +} + +/* Function: + * 服务端用来处理客户端连接请求的函数。 + * Parameter: + * rsk_ops: 请求控制块的函数接口。 + * af_ops: TCP请求块的函数接口。 + * sk: 传输控制块。 + * skb: 传输控制块缓冲区。 + */ +int tcp_newip_conn_request(struct request_sock_ops *rsk_ops, + const struct tcp_request_sock_ops *af_ops, + struct sock *sk, struct sk_buff *skb) +{ + struct tcp_fastopen_cookie foc = { .len = -1 }; + + __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; + /* 接收到的TCP选项都是解析到此结构体中 */ + struct tcp_options_received tmp_opt; + struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); + struct dst_entry *dst = NULL; + struct request_sock *req; + + /* 如果半连接队列的长度已经达到了上线,则丢弃当前请求 */ + if (inet_csk_reqsk_queue_is_full(sk) && !isn) { + DEBUG("inet_csk_reqsk_queue_is_full!!!!!\n"); + goto drop; + } + + /* 如果存放已经完成连接的套接字的队列(全连接队列) + * 长度已经达到上限 + * 则丢弃当前请求 + */ + if (sk_acceptq_is_full(sk)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); + DEBUG("sk_acceptq_is_full!!!!!\n"); + goto drop; + } + + /* 分配一个连接请求块,用于保存连接请求信息 + * 同时初始化连接过程中用来 + * 发送ACK/RST段的操作集合 + * 以便在建立过程中能方便的调用这些接口。 + */ + req = ninet_reqsk_alloc(rsk_ops, sk, true); + if (!req) + goto drop; 
+ /* 挂接操作集 */ + tcp_rsk(req)->af_specific = af_ops; + + /* 清除TCP选项 */ + tcp_clear_options(&tmp_opt); + /* 最大MSS,在建立连接过程中协商 */ + tmp_opt.mss_clamp = af_ops->mss_clamp; + /* 进行这一步的时候最好prink下user_mss的值,看是否为0,to be done */ + tmp_opt.user_mss = tp->rx_opt.user_mss; + /* 对skb中TCP选项的解析 */ + tcp_nip_parse_options(skb, &tmp_opt, 0, false); + + /* tstamp_ok表示在收到的SYN包上看到的TIMESTAMP */ + tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; + /* 根据收到SYN段中的选项和序号来初始化连接请求块信息。 */ + tcp_nip_openreq_init(req, &tmp_opt, skb, sk); + + inet_rsk(req)->ir_iif = 1; + + af_ops->init_req(req, sk, skb); + + if (!isn) + isn = af_ops->init_seq(skb); + + if (!dst) { + dst = af_ops->route_req(sk, NULL, req); + if (!dst) + goto drop_and_free; + } + + tcp_rsk(req)->snt_isn = isn; + tcp_rsk(req)->txhash = net_tx_rndhash(); + /* 初始化接收窗口 */ + tcp_nip_openreq_init_rwin(req, sk, dst); + /* 记录syn*/ + + tcp_rsk(req)->tfo_listener = false; + /* 添加定时器,将req加入ehash表 */ + ninet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + + af_ops->send_synack(sk, dst, NULL, req, &foc, TCP_SYNACK_NORMAL, NULL); + + reqsk_put(req); + return 0; + +drop_and_release: + dst_release(dst); +drop_and_free: + reqsk_free(req); +drop: + tcp_listendrop(sk); + return 0; +} +EXPORT_SYMBOL(tcp_newip_conn_request); + +static void tcp_nip_store_ts_recent(struct tcp_sock *tp) +{ + tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; + tp->rx_opt.ts_recent_stamp = get_seconds(); +} + +static inline bool tcp_nip_paws_check(const struct tcp_options_received *rx_opt, + int paws_win) +{ + if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win) + return true; + if (unlikely(get_seconds() >= rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS)) + return true; + + if (!rx_opt->ts_recent) + return true; + return false; +} + +static void tcp_nip_replace_ts_recent(struct tcp_sock *tp, u32 seq) +{ + if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) { + if (tcp_nip_paws_check(&tp->rx_opt, 0)) + tcp_nip_store_ts_recent(tp); + } +} + +static inline bool 
tcp_nip_may_update_window(const struct tcp_sock *tp, + const u32 ack, const u32 ack_seq, + const u32 nwin) +{ + return after(ack, tp->snd_una) || + after(ack_seq, tp->snd_wl1) || + (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd); +} + +static int tcp_nip_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack, + u32 ack_seq) +{ + struct tcp_sock *tp = tcp_sk(sk); + int flag = 0; + u32 nwin = ntohs(tcp_hdr(skb)->window); + + if (likely(!tcp_hdr(skb)->syn)) + nwin <<= tp->rx_opt.snd_wscale; + + if (tcp_nip_may_update_window(tp, ack, ack_seq, nwin)) { + flag |= FLAG_WIN_UPDATE; + tcp_update_wl(tp, ack_seq); + + if (tp->snd_wnd != nwin) { + tp->snd_wnd = nwin; + tp->pred_flags = 0; + } + } + + return flag; +} + +/* 检查是否探测报文返回的ACK,同时查看对端窗口是否打开 */ +static void tcp_nip_ack_probe(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + + if (!after(TCP_SKB_CB(tcp_nip_send_head(sk))->end_seq, tcp_wnd_end(tp))) { + icsk->icsk_backoff = 0; + inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); + /* Socket must be waked up by subsequent tcp_data_snd_check(). + * This function is not for random using! 
+ */ + } else { + unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX); + + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + when, TCP_RTO_MAX); + } +} + +static int tcp_nip_ack(struct sock *sk, const struct sk_buff *skb, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + u32 prior_snd_una = tp->snd_una; + u32 ack_seq = TCP_SKB_CB(skb)->seq; + u32 ack = TCP_SKB_CB(skb)->ack_seq; + int prior_packets = tp->packets_out; + + if (before(ack, prior_snd_una)) + return 0; + if (after(ack, tp->snd_nxt)) + return -1; + + flag |= tcp_nip_ack_update_window(sk, skb, ack, ack_seq); + + if (!prior_packets) { + DEBUG("No prior pack and ack is %d\n", ack); + if (tcp_nip_send_head(sk)) + tcp_nip_ack_probe(sk); + } + + if (after(ack, prior_snd_una)) { + icsk->icsk_probes_out = 0; + icsk->icsk_retransmits = 0; + icsk->icsk_rto = TCP_TIMEOUT_INIT; + tp->retrans_stamp = tcp_time_stamp(tp); + tp->rcv_tstamp = tcp_jiffies32; + tcp_nip_snd_una_update(tp, ack); + tcp_nip_clean_rtx_queue(sk); + return 1; + } + return 1; +} + +static inline bool tcp_nip_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) +{ + /* 若end_seq已经接收过,或者seq不在接收窗口之后,就会返回false */ + return !before(end_seq, tp->rcv_wup) && + !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); +} + +/* 该函数主要是对一些错误的包重发ACK,因为如果不发ACK + * 这些包可能会被经常重传 + */ +static void tcp_nip_send_dupack(struct sock *sk, const struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && + before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); + } + + tcp_nip_send_ack(sk); +} + +/* 该函数主要是用于对RST报文,非法seq报文 + * ESTABLISHED状态下收到SYN时的处理 + * 当前只包含对seq的检查 + */ +static bool tcp_nip_validate_incoming(struct sock *sk, struct sk_buff *skb, + const struct tcphdr *th, int syn_inerr) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* Step 1: check sequence number */ + /* 检查非预期包,对于一些探测包就是非预期的 + * 这类包不需要处理,但是要回复ACK + */ 
+ if (!tcp_nip_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { + DEBUG("%s receive an err seq and seq is %d, ack is %d\n", __func__, + TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + if (!th->rst) + tcp_nip_send_dupack(sk, skb); + goto discard; + } + + return true; + +discard: + tcp_drop(sk, skb); + return false; +} + +void tcp_nip_rcv_established(struct sock *sk, struct sk_buff *skb, + const struct tcphdr *th, unsigned int len) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tcp_mstamp_refresh(tp); + if (!tcp_nip_validate_incoming(sk, skb, th, 1)) + return; + + if (tcp_nip_ack(sk, skb, 0) < 0) + goto discard; + + tcp_nip_data_queue(sk, skb); + tcp_nip_data_snd_check(sk); + tcp_nip_ack_snd_check(sk); + + return; + +discard: + tcp_drop(sk, skb); +} +EXPORT_SYMBOL(tcp_nip_rcv_established); + +static u32 tcp_default_init_rwnd(u32 mss) +{ + u32 init_rwnd = TCP_INIT_CWND * 2; + + if (mss > TCP_MAX_MSS) + init_rwnd = max((TCP_MAX_MSS * init_rwnd) / mss, 2U); + return init_rwnd; +} + +static void tcp_nip_fixup_rcvbuf(struct sock *sk) +{ + u32 mss = TCP_BASE_MSS; + int rcvmem; + + rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) * + tcp_default_init_rwnd(mss); + + if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) + rcvmem <<= 2; + + if (sk->sk_rcvbuf < rcvmem) + sk->sk_rcvbuf = min(rcvmem, + sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); +} + +void tcp_nip_init_buffer_space(struct sock *sk) +{ + int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win; + struct tcp_sock *tp = tcp_sk(sk); + int maxwin; + + if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) + tcp_nip_fixup_rcvbuf(sk); + sk->sk_sndbuf = 30720; + + tp->rcvq_space.space = tp->rcv_wnd; + tcp_mstamp_refresh(tp); + tp->rcvq_space.time = tcp_time_stamp; + tp->rcvq_space.seq = tp->copied_seq; + maxwin = tcp_full_space(sk); + if (tp->window_clamp >= maxwin) { + tp->window_clamp = maxwin; + if (tcp_app_win && maxwin > 4 * tp->advmss) + tp->window_clamp = max(maxwin - + (maxwin >> tcp_app_win), + 4 * tp->advmss); + } + 
/* Force reservation of one segment. */ + if (tcp_app_win && + tp->window_clamp > 2 * tp->advmss && + tp->window_clamp + tp->advmss > maxwin) + tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss); + tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); + tp->snd_cwnd_stamp = tcp_jiffies32; +} + +void tcp_nip_finish_connect(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_set_state(sk, TCP_ESTABLISHED); + icsk->icsk_ack.lrcvtime = tcp_jiffies32; + if (skb) { + icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); + security_inet_conn_established(sk, skb); + } + + tp->lsndtime = tcp_jiffies32; + + tcp_nip_init_buffer_space(sk); +} + +/* Function: + * tcp处理第二次握手的函数 + * Parameter: + * sk:传输控制块 + * skb:传输控制块缓冲区 + * th:TCP首部字段 + * Note: 静态函数,只在该文件中引用,不需在头文件中声明 + */ +static int tcp_nip_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, + const struct tcphdr *th) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + int saved_clamp = tp->rx_opt.mss_clamp; + + /* TCP选项解析 */ + tcp_nip_parse_options(skb, &tp->rx_opt, 0, NULL); + /* rcv_tsecr 保存最近一次接收到对端的 TCP 段的时间戳 + * 选项中的时间戳回显应答 + */ + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) + tp->rx_opt.rcv_tsecr -= tp->tsoffset; + + if (th->ack) { + /* ACK 的值是否在初始发送序号和下一个序号之间 */ + if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) || + after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) + goto reset_and_undo; + /* 必须在对应的时间内*/ + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && + !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, tcp_time_stamp(tp))) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED); + goto reset_and_undo; + } + + if (th->rst) + goto discard; + + if (!th->syn) + goto discard_and_undo; + + tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); + + tcp_nip_ack(sk, skb, FLAG_SLOWPATH); + tp->nip_out_of_order_queue = NULL; + /* 期望接受的下一个数据序号 +1 */ + tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 
1; + /* 接受窗口的左边界 +1 */ + tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; + tp->snd_wnd = ntohs(th->window); + if (!tp->rx_opt.wscale_ok) { + tp->rx_opt.snd_wscale = 0; + tp->rx_opt.rcv_wscale = 0; + tp->window_clamp = min(tp->window_clamp, 65535U); + } + + if (tp->rx_opt.saw_tstamp) { + tp->rx_opt.tstamp_ok = 1; + tp->tcp_header_len = + sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; + tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; + tp->rx_opt.ts_recent_stamp = get_seconds(); + } else { + tp->tcp_header_len = sizeof(struct tcphdr); + } + + tp->copied_seq = tp->rcv_nxt; + /* 调用内存屏障 (根据checkpatch要求在之前加注释) */ + smp_mb(); + + tcp_nip_finish_connect(sk, skb); + /* 唤醒进程 */ + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + rcu_read_lock(); + sock_wake_async(rcu_dereference(sk->sk_wq), SOCK_WAKE_IO, POLL_OUT); + rcu_read_unlock(); + } + + tcp_nip_send_ack(sk); +discard: + return 0; + } + +discard_and_undo: + tcp_clear_options(&tp->rx_opt); + tp->rx_opt.mss_clamp = saved_clamp; + goto discard; + +reset_and_undo: + tcp_clear_options(&tp->rx_opt); + tp->rx_opt.mss_clamp = saved_clamp; + return 1; +} + +/* Function: + * TCP接收数据包后,根据不同状态区分的处理函数 + * Parameter: + * sk:传输控制块 + * skb:传输控制块缓冲区 + * Note: 当前该函数只有第一次握手数据包的处理代码 + * 实现对第三次握手的ACK进行处理的代码 + */ +int tcp_nip_rcv_state_process(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcphdr *th = tcp_hdr(skb); + struct request_sock *req = NULL; + int queued = 0; + bool acceptable; + + /* 第一步:连接握手包处理 */ + switch (sk->sk_state) { + case TCP_CLOSE: + goto discard; + + case TCP_LISTEN: + if (th->ack) + return 1; + + if (th->rst) + goto discard; + + if (th->syn) { + if (th->fin) + goto discard; + + rcu_read_lock(); + local_bh_disable(); + acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0; + local_bh_enable(); + rcu_read_unlock(); + + if (!acceptable) + return 1; + consume_skb(skb); + return 0; 
+ } + goto discard; + case TCP_SYN_SENT: + DEBUG("%s TCP_SYN_SENT!!\n", __func__); + tp->rx_opt.saw_tstamp = 0; + tcp_mstamp_refresh(tp); + queued = tcp_nip_rcv_synsent_state_process(sk, skb, th); + if (queued >= 0) + return queued; + __kfree_skb(skb); + return 0; + } + tcp_mstamp_refresh(tp); + tp->rx_opt.saw_tstamp = 0; + + if (!th->ack && !th->rst && !th->syn) + goto discard; + + acceptable = tcp_nip_ack(sk, skb, 0); + + /* 若第三次握手ack不合法,则直接返回1,在tcp_nip_rcv中会丢弃该skb */ + if (!acceptable) { + if (sk->sk_state == TCP_SYN_RECV) + return 1; + goto discard; + } + + switch (sk->sk_state) { + case TCP_SYN_RECV: + tp->copied_seq = tp->rcv_nxt; + tcp_nip_init_buffer_space(sk); + /* 调用内存屏障 (根据checkpatch要求在之前加注释) */ + smp_mb(); + tcp_set_state(sk, TCP_ESTABLISHED); + DEBUG("TCP_ESTABLISHED!!!!!\n"); + sk->sk_state_change(sk); + + /* 设置即将要发送的部分,以及发送窗口大小 */ + tp->snd_una = TCP_SKB_CB(skb)->ack_seq; + tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; + tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); + + tp->lsndtime = tcp_jiffies32; + + tcp_initialize_rcv_mss(sk); + break; + case TCP_FIN_WAIT1: { + struct dst_entry *dst; + int tmo; + + if (tp->snd_una != tp->write_seq) { + DEBUG("%s: tp->snd_una != tp->write_seq!!\n", __func__); + break; + } + + tcp_set_state(sk, TCP_FIN_WAIT2); + sk->sk_shutdown |= SEND_SHUTDOWN; + + DEBUG("%s: TCP_FIN_WAIT1: recvd ack for fin.Wait for fin from other side.\n", + __func__); + inet_csk_reset_keepalive_timer(sk, 10 * HZ); + + break; + } + + case TCP_CLOSING: + if (tp->snd_una == tp->write_seq) { + DEBUG("%s: TCP_CLOSING: recvd ack for fin.Ready to destroy.\n", __func__); + inet_csk_reset_keepalive_timer(sk, TCP_TIMEWAIT_LEN); + goto discard; + } + break; + case TCP_LAST_ACK: + DEBUG("tcp_nip_rcv_state_process_2: TCP_LAST_ACK\n"); + if (tp->snd_una == tp->write_seq) { + DEBUG("%s: LAST_ACK: recvd ack for fin.Directly destroy.\n", __func__); + tcp_nip_done(sk); + goto discard; + } + break; + } + + switch (sk->sk_state) { + case TCP_CLOSE_WAIT: + 
case TCP_CLOSING: + case TCP_LAST_ACK: + if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) + break; + DEBUG("tcp_nip_rcv_state_process_3: TCP_LAST_ACK_2\n"); + case TCP_FIN_WAIT1: + case TCP_FIN_WAIT2: + if (sk->sk_shutdown & RCV_SHUTDOWN) { + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { + return 1; + } + } + case TCP_ESTABLISHED: + tcp_nip_data_queue(sk, skb); + queued = 1; + break; + } + + if (sk->sk_state != TCP_CLOSE) { + tcp_nip_data_snd_check(sk); + tcp_nip_ack_snd_check(sk); + } + + if (!queued) { +discard: + tcp_nip_drop(sk, skb); + } + return 0; +} +EXPORT_SYMBOL(tcp_nip_rcv_state_process); + +static bool tcp_nip_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) +{ + if (seq == s_win) + return true; + if (after(end_seq, s_win) && before(seq, e_win)) + return true; + return seq == e_win && seq == end_seq; +} + +/* Function: + * 初始化RCV_MSS + * Parameter: + * sk:传输控制块 + */ +void tcp_nip_initialize_rcv_mss(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); + + hint = min(hint, tp->rcv_wnd / 2); + hint = min(hint, TCP_MSS_DEFAULT); + hint = max(hint, TCP_MIN_MSS); + + inet_csk(sk)->icsk_ack.rcv_mss = hint; +} +EXPORT_SYMBOL(tcp_nip_initialize_rcv_mss); + +/* Function: + * 处理第三次握手ack,成功返回新控制块。是处理ack的核心流程。 + * (1)调用child = + * inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, req, &own_req); + * 创建子控制块,这里需要注意,子控制块的状态为TCP_SYN_RECV + * 这与刚收到syn建立的控制块状态不一样,那 + * 时创建的控制块为TCP_NEW_SYN_RECV; + * (2)将请求控制块从未完成连接队列中删除 + * 加入到已完成连接队列中; + * Parameter: + * sk:传输控制块 + * skb:传输控制块缓冲区 + * req: 请求连接控制块。 + */ +struct sock *tcp_nip_check_req(struct sock *sk, struct sk_buff *skb, + struct request_sock *req) +{ + struct tcp_options_received tmp_opt; + struct sock *child; + const struct tcphdr *th = tcp_hdr(skb); + __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST | TCP_FLAG_SYN | TCP_FLAG_ACK); + bool paws_reject = false; + 
bool own_req; + + tmp_opt.saw_tstamp = 0; + /* 判断是否有tcp选项。 */ + if (th->doff > (sizeof(struct tcphdr) >> 2)) { + /* 解析tcp选项。 */ + tcp_nip_parse_options(skb, &tmp_opt, 0, NULL); + } + + /* ACK但是序号对不上,返回 原有控制块,外面不做处理 */ + if ((flg & TCP_FLAG_ACK) && + (TCP_SKB_CB(skb)->ack_seq != + tcp_rsk(req)->snt_isn + 1)) { + DEBUG("%s ack_seq is wrong!", __func__); + return sk; + } + + /* 上面流程保证了有ack,若没有,直接返回 */ + if (!(flg & TCP_FLAG_ACK)) { + DEBUG("%s No TCP_FLAG_ACK !!!!", __func__); + return NULL; + } + + /* ack有效,创建子控制块,注意此时子控制块的状态为TCP_SYN_RECV */ + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, + req, &own_req); + if (!child) { + DEBUG("%s No listen_overflow!!!!", __func__); + goto listen_overflow; + } + DEBUG("%s creat child sock successfully!", __func__); + + sock_rps_save_rxhash(child, skb); + /* 计算三次握手中synack-ack消耗的时间 */ + tcp_synack_rtt_meas(child, req); + /* 从未完成队列删除原控制块,加入到已完成队列 */ + return inet_csk_complete_hashdance(sk, child, req, own_req); + +listen_overflow: + if (!sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow) { + inet_rsk(req)->acked = 1; + return NULL; + } + return NULL; +} +EXPORT_SYMBOL(tcp_nip_check_req); + diff --git a/net/newip/tcp_nip_output.c b/net/newip/tcp_nip_output.c new file mode 100755 index 0000000000000000000000000000000000000000..fd996d3a2973e5eb7b46feb61f55a49f71049327 --- /dev/null +++ b/net/newip/tcp_nip_output.c @@ -0,0 +1,1033 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#define pr_fmt(fmt) "TCP: " fmt + +#include + +#include +#include +#include + +#include +#include +#include + +#define OPTION_SACK_ADVERTISE BIT(0) +#define OPTION_TS BIT(1) +#define OPTION_MD5 BIT(2) +#define OPTION_WSCALE BIT(3) +#define OPTION_FAST_OPEN_COOKIE BIT(8) + +/* 存放发送TCP包时,TCP所包含的选项 */ +struct tcp_nip_out_options { + u16 options; /* bit field of OPTION_*, OPTION_*的位域 */ + u16 mss; /* 0 to disable, 如果为零则表示关闭mss选项 */ + /* window scale, 0 to disable, 窗口放大,0表示关闭该选项 */ + u8 ws; + u8 hash_size; /* bytes in hash_location */ + __u8 
*hash_location; /* temporary pointer, overloaded */ + __u32 tsval, tsecr; /* need to include OPTION_TS */ +}; + +static bool tcp_nip_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + int push_one, gfp_t gfp); + +static void tcp_nip_event_data_sent(struct tcp_sock *tp, + struct sock *sk) +{ +} + +static inline void tcp_advance_send_head(struct sock *sk, const struct sk_buff *skb) +{ + if (tcp_skb_is_last(sk, skb)) + sk->sk_send_head = NULL; + else + sk->sk_send_head = skb_queue_next(&sk->sk_write_queue, skb); +} + +static void tcp_nip_event_new_data_sent(struct sock *sk, struct sk_buff *skb) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + unsigned int prior_packets = tp->packets_out; + + tcp_advance_send_head(sk, skb); + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + tp->packets_out += tcp_skb_pcount(skb); + if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { + tcp_nip_rearm_rto(sk); + } +} + +void __tcp_nip_push_pending_frames(struct sock *sk, unsigned int cur_mss, + int nonagle) +{ + if (unlikely(sk->sk_state == TCP_CLOSE)) + return; + + if (tcp_nip_write_xmit(sk, cur_mss, nonagle, 0, + sk_gfp_mask(sk, GFP_ATOMIC))) { + DEBUG("%s check probe0 timer!\n", __func__); + tcp_nip_check_probe_timer(sk); + } +} + +u32 __nip_tcp_select_window(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + int mss = TCP_BASE_MSS; + int free_space = tcp_space(sk); + int allowed_space = tcp_full_space(sk); + int full_space = min_t(int, tp->window_clamp, allowed_space); + int window; + + if (unlikely(mss > full_space)) { + mss = full_space; + if (mss <= 0) + return 0; + } + if (free_space < (full_space >> 1)) { + icsk->icsk_ack.quick = 0; + + free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale); + if (free_space < (allowed_space >> 4) || free_space < mss) + return 0; + } + + if (free_space > 
tp->rcv_ssthresh) + free_space = tp->rcv_ssthresh; + + window = tp->rcv_wnd; + if (tp->rx_opt.rcv_wscale) { + window = free_space; + if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window) + window = (((window >> tp->rx_opt.rcv_wscale) + 1) + << tp->rx_opt.rcv_wscale); + } else { + if (window <= free_space - mss || window > free_space) + window = (free_space / mss) * mss; + else if (mss == full_space && + free_space > window + (full_space >> 1)) + window = free_space; + } + return window; +} + +static u16 nip_tcp_select_window(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 old_win = tp->rcv_wnd; + u32 cur_win = tcp_receive_window(tp); + u32 new_win = __nip_tcp_select_window(sk); + + if (new_win < cur_win) { + if (new_win == 0) + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPWANTZEROWINDOWADV); + new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); + } + tp->rcv_wnd = new_win; + tp->rcv_wup = tp->rcv_nxt; + + if (!tp->rx_opt.rcv_wscale && + sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows) + new_win = min(new_win, MAX_TCP_WINDOW); + else + new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); + + new_win >>= tp->rx_opt.rcv_wscale; + if (new_win == 0) { + tp->pred_flags = 0; + if (old_win) + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPTOZEROWINDOWADV); + } else if (old_win == 0) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV); + } + + return new_win; +} + +/* Function: + * 初始化传输层相关参数。 + * Parameter: + * sk: 传输控制块。 + */ +static void tcp_nip_connect_init(struct sock *sk) +{ + const struct dst_entry *dst = __sk_dst_get(sk); + struct tcp_sock *tp = tcp_sk(sk); + __u8 rcv_wscale = 0; + int sysctl_tcp_nip_window_scaling = 0; + /* 头部结构长度+时间戳长度 */ + tp->tcp_header_len = sizeof(struct tcphdr); + if (sock_net(sk)->ipv4.sysctl_tcp_timestamps) + tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED; + /* 设置MSS为默认值,536u */ + tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; + /* mss的上限值,用户指定优先级更高)*/ + if (tp->rx_opt.user_mss) + 
tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; + tp->max_window = 0; + + tp->rx_opt.mss_clamp = TCP_BASE_MSS; + tp->advmss = dst_metric_advmss(dst); + + /* 初始化窗口 */ + tcp_select_initial_window(sk, tcp_full_space(sk), + tp->advmss - (tp->rx_opt.ts_recent_stamp ? + tp->tcp_header_len - sizeof(struct tcphdr) : 0), + &tp->rcv_wnd, + &tp->window_clamp, + sysctl_tcp_nip_window_scaling, + &rcv_wscale, + 0); + + tp->rx_opt.rcv_wscale = rcv_wscale; + tp->rcv_ssthresh = tp->rcv_wnd; + + sk->sk_err = 0; + sock_reset_flag(sk, SOCK_DONE); + tp->snd_wnd = 0; + tp->snd_wl1 = 0; + tcp_write_queue_purge(sk); + + tp->snd_una = tp->write_seq; + tp->snd_sml = tp->write_seq; + tp->snd_up = tp->write_seq; + tp->snd_nxt = tp->write_seq; + + tp->rcv_nxt = 0; + tp->rcv_wup = tp->rcv_nxt; + tp->copied_seq = tp->rcv_nxt; + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; + inet_csk(sk)->icsk_retransmits = 0; + tcp_clear_retrans(tp); +} + +static void tcp_nip_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) +{ + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum = 0; + + TCP_SKB_CB(skb)->tcp_flags = flags; + TCP_SKB_CB(skb)->sacked = 0; + + tcp_skb_pcount_set(skb, 1); + + TCP_SKB_CB(skb)->seq = seq; + if (flags & (TCPHDR_SYN | TCPHDR_FIN)) + seq++; + TCP_SKB_CB(skb)->end_seq = seq; +} + +#define OPTION_TS BIT(1) +#define OPTION_WSCALE BIT(3) + +static void tcp_nip_connect_queue_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + + tcb->end_seq += skb->len; + __skb_header_release(skb); + __skb_queue_tail(&sk->sk_write_queue, skb); + sk->sk_wmem_queued += skb->truesize; + sk_mem_charge(sk, skb->truesize); + WRITE_ONCE(tp->write_seq, tcb->end_seq); + tp->packets_out += tcp_skb_pcount(skb); +} + +/* Compute TCP options for SYN packets. This is not the final + * network wire format yet. 
+ */ +static unsigned int tcp_nip_syn_options(struct sock *sk, struct sk_buff *skb, + struct tcp_nip_out_options *opts) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned int remaining = MAX_TCP_OPTION_SPACE; + + opts->mss = tp->advmss; + remaining -= TCPOLEN_MSS_ALIGNED; + + if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps)) { + opts->options |= OPTION_TS; + opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset; + opts->tsecr = tp->rx_opt.ts_recent; + remaining -= TCPOLEN_TSTAMP_ALIGNED; + } + return MAX_TCP_OPTION_SPACE - remaining; +} + +/* Compute TCP options for ESTABLISHED sockets. This is not the + * final wire format yet. + */ +static unsigned int tcp_nip_established_options(struct sock *sk, struct sk_buff *skb, + struct tcp_nip_out_options *opts) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned int size = 0; + + opts->options = 0; + + if (likely(tp->rx_opt.tstamp_ok)) { + opts->options |= OPTION_TS; + opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0; + opts->tsecr = tp->rx_opt.ts_recent; + size += TCPOLEN_TSTAMP_ALIGNED; + } + return size; +} + +/* Function: + * 将TCP选项中的参数放到SKB中。 + * Write previously computed TCP options to the packet. 
+ * Parameter: + * ptr: 指向skb中TCP选项的指针。 + * tp: 传输控制块。 + * opts: 要发送出去的暂装载TCP选项的结构体。 + */ +static void tcp_nip_options_write(__be32 *ptr, struct tcp_sock *tp, + struct tcp_nip_out_options *opts) +{ + u16 options = opts->options; /* mungable copy */ + + if (unlikely(opts->mss)) { + *ptr++ = htonl((TCPOPT_MSS << 24) | + (TCPOLEN_MSS << 16) | + opts->mss); + } +} + +static inline void tcp_nip_event_ack_sent(struct sock *sk, unsigned int pkts, + u32 rcv_nxt) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (unlikely(rcv_nxt != tp->rcv_nxt)) + return; + inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); +} + +static int __tcp_nip_transmit_skb(struct sock *sk, struct sk_buff *skb, + int clone_it, gfp_t gfp_mask, u32 rcv_nxt) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_sock *inet; + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_skb_cb *tcb; + struct tcp_nip_out_options opts; + unsigned int tcp_options_size, tcp_header_size; + struct sk_buff *oskb = NULL; + struct tcphdr *th; + int err = 0; + + if (clone_it) { + TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq + - tp->snd_una; + oskb = skb; + + tcp_skb_tsorted_save(oskb) { + if (unlikely(skb_cloned(oskb))) + skb = pskb_copy(oskb, gfp_mask); + else + skb = skb_clone(oskb, gfp_mask); + } tcp_skb_tsorted_restore(oskb); + + if (unlikely(!skb)) + return -ENOBUFS; + } + + inet = inet_sk(sk); + tcb = TCP_SKB_CB(skb); + memset(&opts, 0, sizeof(opts)); + + if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) + tcp_options_size = tcp_nip_syn_options(sk, skb, &opts); + else + tcp_options_size = tcp_nip_established_options(sk, skb, &opts); + tcp_header_size = tcp_options_size + sizeof(struct tcphdr); + + skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1); + /* data指针上移 */ + skb_push(skb, tcp_header_size); + skb_reset_transport_header(skb); + + /* 与控制块解除关联 */ + skb_orphan(skb); + + /* 与控制块建立关联 */ + skb->sk = sk; + skb->destructor = skb_is_tcp_pure_ack(skb) ? 
__sock_wfree : tcp_wfree; + skb_set_hash_from_sk(skb, sk); + /* 增加分配的内存 */ + refcount_add(skb->truesize, &sk->sk_wmem_alloc); + DEBUG("th->inet_sport==%d, th->inet_dport==%d\n", inet->inet_sport, inet->inet_dport); + DEBUG("sk->sk_rcvbuf==%d, sk->sk_rmem_alloc==%d\n", + sk->sk_rcvbuf, atomic_read(&sk->sk_rmem_alloc)); + /* Build TCP header and checksum it. */ + th = (struct tcphdr *)skb->data; + th->source = inet->inet_sport; + th->dest = inet->inet_dport; + th->seq = htonl(tcb->seq); + th->ack_seq = htonl(rcv_nxt); + *(((__be16 *)th) + TCP_HEADERLEN_OFFSET) = htons(((tcp_header_size >> 2) << 12) | + tcb->tcp_flags); + + th->check = 0; + th->urg_ptr = 0; + + /* 写入tcp选项 */ + tcp_nip_options_write((__be32 *)(th + 1), tp, &opts); + + /* 窗口设置 */ + if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) + th->window = htons(nip_tcp_select_window(sk)); + else + th->window = htons(min(tp->rcv_wnd, 65535U)); + + if (likely(tcb->tcp_flags & TCPHDR_ACK)) + tcp_nip_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt); + + /* 有数据要发送 */ + if (skb->len != tcp_header_size) { + tcp_nip_event_data_sent(tp, sk); + tp->data_segs_out += tcp_skb_pcount(skb); + } + + skb->tstamp = 0; + + memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), + sizeof(struct inet6_skb_parm))); + + err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); + return err; +} + +/* Function: + * TCP的传输层发送代码,主要功能是构建并初始化TCP头部 + * 构造sk_buff调用传输层到网络层接口 + * Parameter: + * sk: 传输控制块; + * skb: 结构体存储了网络数据报; + * 的所有信息 + */ +int tcp_nip_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, + gfp_t gfp_mask) +{ + return __tcp_nip_transmit_skb(sk, skb, clone_it, gfp_mask, + tcp_sk(sk)->rcv_nxt); +} + +static void tcp_nip_queue_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* Advance write_seq and place onto the write_queue. 
*/ + tp->write_seq = TCP_SKB_CB(skb)->end_seq; + tcp_nip_add_write_queue_tail(sk, skb); + sk->sk_wmem_queued += skb->truesize; + sk_mem_charge(sk, skb->truesize); +} + +/* Function: + * 客户端传输层用来连接请求的函数,主要用来。 + * Parameter: + * sk: 传输控制块。 + */ +int __tcp_nip_connect(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; + int err; + /* 初始化tcp相关参数 */ + tcp_nip_connect_init(sk); + /* 分配sk_buff空间 */ + buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true); + if (unlikely(!buff)) + return -ENOBUFS; + + /* 初始化SYN标志位 */ + tcp_nip_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); + tcp_mstamp_refresh(tp); + tp->retrans_stamp = tcp_time_stamp(tp); + tcp_nip_init_xmit_timers(sk); + + tcp_nip_connect_queue_skb(sk, buff); + + /* Send off SYN */ + err = tcp_nip_transmit_skb(sk, buff, 1, sk->sk_allocation); + if (err == -ECONNREFUSED) + return err; + + tp->snd_nxt = tp->write_seq; + tp->pushed_seq = tp->write_seq; + buff = tcp_nip_send_head(sk); + + TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); + + /* Timer for repeating the SYN until an answer. */ + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, TCP_RTO_MAX); + + return 0; +} +EXPORT_SYMBOL(__tcp_nip_connect); + +/* Function: + * Set up TCP options for SYN-ACKs. + * 初始化SYN-ACK段的TCP选项。返回TCP头部大小。 + * Parameter: + * req: 请求连接控制块。 + * mss: 最大分段长度。 + * skb: 传输控制块缓冲区。 + * opts: 存放发送TCP包时,TCP所包含的选项。 + * foc: fast open选项。 + * synack_type: SYN+ACK段的类型。 + */ +static unsigned int tcp_nip_synack_options(struct request_sock *req, + unsigned int mss, struct sk_buff *skb, + struct tcp_nip_out_options *opts, + const struct tcp_md5sig_key *md5, + struct tcp_fastopen_cookie *foc, + enum tcp_synack_type synack_type) +{ + struct inet_request_sock *ireq = inet_rsk(req); + unsigned int remaining = MAX_TCP_OPTION_SPACE; + + /* We always send an MSS option. 
*/ + opts->mss = mss; + remaining -= TCPOLEN_MSS_ALIGNED; + + if (likely(ireq->tstamp_ok)) { + opts->options |= OPTION_TS; + opts->tsval = tcp_skb_timestamp(skb); + opts->tsecr = req->ts_recent; + remaining -= TCPOLEN_TSTAMP_ALIGNED; + } + return MAX_TCP_OPTION_SPACE - remaining; +} + +/* Function: + * 根据当前的传输控制块,路由信息,请求等信息构建syn+ack段。 + * Parameter: + * sk: 传输控制块。 + * dst: 路由。 + * req: 请求连接控制块。 + * foc: fast open选项。 + * synack_type: SYN+ACK段的类型。 + */ +struct sk_buff *tcp_nip_make_synack(const struct sock *sk, struct dst_entry *dst, + struct request_sock *req, + struct tcp_fastopen_cookie *foc, + enum tcp_synack_type synack_type) +{ + struct inet_request_sock *ireq = inet_rsk(req); + const struct tcp_sock *tp = tcp_sk(sk); + struct tcp_md5sig_key *md5 = NULL; + struct tcp_nip_out_options opts; + struct sk_buff *skb; + int tcp_header_size; + struct tcphdr *th; + u16 user_mss; + int mss; + + /* 申请缓存skb */ + skb = alloc_skb(MAX_TCP_HEADER, 0); + if (unlikely(!skb)) { + dst_release(dst); + return NULL; + } + + /* Reserve space for headers. 
+ * 为MAC层,IP层,TCP层首部预留必要的空间 + */ + skb_reserve(skb, MAX_TCP_HEADER); + + /* 这里只考虑NORMAL的情况。若有需要以后添加。 */ + switch (synack_type) { + case TCP_SYNACK_NORMAL: + /* 释放掉原SKB,把自己当作当前SK的SKB */ + skb_set_owner_w(skb, req_to_sk(req)); + break; + default: + break; + } + skb_dst_set(skb, dst); + + mss = dst_metric_advmss(dst); + user_mss = READ_ONCE(tp->rx_opt.user_mss); + if (user_mss && user_mss < mss) + mss = user_mss; + + /* 清空选项,并且设置相关的时间戳 */ + memset(&opts, 0, sizeof(opts)); + skb->skb_mstamp_ns = tcp_clock_us(); + + /* 得到tcp头部大小,然后进行大小设置,并且重置传输层头部 */ + skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4); + tcp_header_size = tcp_nip_synack_options(req, mss, skb, &opts, md5, + foc, synack_type) + sizeof(*th); + skb_push(skb, tcp_header_size); + skb_reset_transport_header(skb); + + /* 清空TCP头部,并设置TCP头部的各个字段 */ + th = (struct tcphdr *)skb->data; + memset(th, 0, sizeof(struct tcphdr)); + th->syn = 1; + th->ack = 1; + if (inet_rsk(req)->ecn_ok) + th->ece = 1; + th->source = htons(ireq->ir_num); + th->dest = ireq->ir_rmt_port; + skb->ip_summed = CHECKSUM_PARTIAL; + th->seq = htonl(tcp_rsk(req)->snt_isn); + th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt); + + th->window = htons(min(req->rsk_rcv_wnd, 65535U)); + + tcp_nip_options_write((__be32 *)(th + 1), NULL, &opts); + /* TCP数据偏移,除以4是因为doff的单位是32位字 + * 即以四个字节长的字为计算单位 + */ + th->doff = (tcp_header_size >> 2); + __TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); + + /* Do not fool tcpdump (if any), clean our debris */ + skb->tstamp = 0; + return skb; +} +EXPORT_SYMBOL(tcp_nip_make_synack); + +/* Function: + * 发送SYN+ACK段的skb包到网络层。 + * Parameter: + * req: 请求连接控制块。 + * skb: 传输控制块缓冲区。 + */ +int __nip_send_synack(struct request_sock *req, struct sk_buff *skb) +{ + struct inet_request_sock *ireq = inet_rsk(req); /* 连接请求块 */ + int err = -EFAULT; + int csummode = CHECKSUM_NONE; + int nwkhdr_len = 0; + struct nip_addr *saddr, *daddr; + struct nip_head_para head = {0}; + + skb->protocol = htons(ETH_P_NEWIP); + skb->ip_summed = csummode; + 
skb->csum = 0; + saddr = &ireq->ir_nip_loc_addr; + daddr = &ireq->ir_nip_rmt_addr; + + head.saddr = *saddr; + head.daddr = *daddr; + head.ttl = NIP_DEFAULT_TTL; + head.nexthdr = IPPROTO_TCP; + _nip_comm_bitmap_flag_encap(&head); + _nip_hdr_encap(&head); + + /* nip报文头长度 + (tcp传输层包头长度 + 用户数据长度) */ + head.total_len = head.hdr_buf_pos + skb->len; + _nip_update_total_len(&head); + + unsigned char *tmp_nip_hdr = kmalloc(NIP_HDR_MAX, GFP_KERNEL); + + skb_push(skb, head.hdr_buf_pos); + memcpy(skb->data, head.hdr_buf, head.hdr_buf_pos); + skb_reset_network_header(skb); + NIPCB(skb)->srcaddr = *saddr; + NIPCB(skb)->dstaddr = *daddr; + kfree(tmp_nip_hdr); + + err = nip_nwk_output(skb); + if (err) + DEBUG("%s: failed to send skb!\n", __func__); + else + DEBUG("%s: send a skb ok!", __func__); + + return 0; +error: + return -1; +} + +int nip_send_synack(struct request_sock *req, struct sk_buff *skb) +{ + return __nip_send_synack(req, skb); +} + +/* Function: + * 创建一个子传输块,用于完成三次握手的建立 + * Parameter: + * parent:父传输控制块 + * child:子传输控制块 + * skb:传输控制块缓冲区 + */ +int tcp_nip_child_process(struct sock *parent, struct sock *child, + struct sk_buff *skb) +{ + int ret = 0; + int state = child->sk_state; + /* child 没被用户进程占用 */ + if (!sock_owned_by_user(child)) { + ret = tcp_nip_rcv_state_process(child, skb); + /* 此时child的状态发生了迁移,唤醒监听套接字上的进程 + * 可能由于accept而阻塞 + */ + if (state == TCP_SYN_RECV && child->sk_state != state) + parent->sk_data_ready(parent); + } else { + __sk_add_backlog(child, skb); + } + bh_unlock_sock(child); + sock_put(child); + return ret; +} + +static inline __u32 tcp_nip_acceptable_seq(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + if (!before(tcp_wnd_end(tp), tp->snd_nxt)) + return tp->snd_nxt; + else + return tcp_wnd_end(tp); +} + +/* Function: + * 客户端发送ACK + * Parameter: + * sk: 传输控制块 + * rcv_nxt:期望接受的序号 + */ +void __tcp_nip_send_ack(struct sock *sk, u32 rcv_nxt) +{ + struct sk_buff *buff; + + if (sk->sk_state == TCP_CLOSE) + return; + + /* 
为数据包分配空间 */ + buff = alloc_skb(MAX_TCP_HEADER, + sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN)); + + /* 为header预留空间. */ + skb_reserve(buff, MAX_TCP_HEADER); + /* 初始化不含数据的 skb */ + tcp_nip_init_nondata_skb(buff, tcp_nip_acceptable_seq(sk), TCPHDR_ACK); + + /* 标记纯ack,做法是将skb->truesize设置为2 */ + skb_set_tcp_pure_ack(buff); + + /* 记录时间戳,发送skb. */ + __tcp_nip_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt); +} +EXPORT_SYMBOL_GPL(__tcp_nip_send_ack); + +void tcp_nip_send_ack(struct sock *sk) +{ + __tcp_nip_send_ack(sk, tcp_sk(sk)->rcv_nxt); +} + +void tcp_nip_send_fin(struct sock *sk) +{ + struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk); + struct tcp_sock *tp = tcp_sk(sk); + + DEBUG("%s: send fin!\n", __func__); + /* 优化:直接将最后一个数据包fin位置为1 */ + if (tskb && tcp_nip_send_head(sk)) { +coalesce: + TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; + TCP_SKB_CB(tskb)->end_seq++; + tp->write_seq++; + } else { + skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation); + if (unlikely(!skb)) { + if (tskb) + goto coalesce; + return; + } + skb_reserve(skb, MAX_TCP_HEADER); + + tcp_nip_init_nondata_skb(skb, tp->write_seq, + TCPHDR_ACK | TCPHDR_FIN); + tcp_nip_queue_skb(sk, skb); + } + __tcp_nip_push_pending_frames(sk, TCP_BASE_MSS, TCP_NAGLE_OFF); +} + +static bool tcp_nip_snd_wnd_test(const struct tcp_sock *tp, + const struct sk_buff *skb, + unsigned int cur_mss) +{ + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + + if (skb->len > cur_mss) + end_seq = TCP_SKB_CB(skb)->seq + cur_mss; + + return !after(end_seq, tcp_wnd_end(tp)); +} + +static void tcp_nip_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) +{ + if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) { + /* Avoid the costly divide in the normal + * non-TSO case. 
+ */ + tcp_skb_pcount_set(skb, 1); + TCP_SKB_CB(skb)->tcp_gso_size = 0; + } else { + tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now)); + TCP_SKB_CB(skb)->tcp_gso_size = mss_now; + } +} + +static int tcp_nip_init_tso_segs(struct sk_buff *skb, unsigned int mss_now) +{ + int tso_segs = tcp_skb_pcount(skb); + + if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) { + tcp_nip_set_skb_tso_segs(skb, mss_now); + tso_segs = tcp_skb_pcount(skb); + } + return tso_segs; +} + +static bool tcp_nip_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + int push_one, gfp_t gfp) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + unsigned int tso_segs, sent_pkts; + int cwnd_quota; + int result; + bool is_cwnd_limited = false; + u32 max_segs; + + sent_pkts = 0; + + tcp_mstamp_refresh(tp); + + while ((skb = tcp_nip_send_head(sk))) { + DEBUG("%s:tcp_nip_send_head head found!\n", __func__); + tcp_nip_init_tso_segs(skb, mss_now); + if (unlikely(!tcp_nip_snd_wnd_test(tp, skb, mss_now))) + break; + + if (unlikely(tcp_nip_transmit_skb(sk, skb, 1, gfp))) + break; + +repair: + tcp_nip_event_new_data_sent(sk, skb); + + if (push_one) + break; + } + return !tp->packets_out && tcp_nip_send_head(sk); +} + +int tcp_nip_rtx_synack(const struct sock *sk, struct request_sock *req) +{ + const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific; + struct flowi fl; + int res; + struct dst_entry *dst; + + dst = af_ops->route_req(sk, NULL, req); + tcp_rsk(req)->txhash = net_tx_rndhash(); + + res = af_ops->send_synack(sk, dst, NULL, req, NULL, TCP_SYNACK_NORMAL, + NULL); + + return res; +} +EXPORT_SYMBOL(tcp_nip_rtx_synack); +static void tcp_nip_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->packets_out -= decr; +} + +int __tcp_nip_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + 
unsigned int cur_mss; + int diff, len, err; + + if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) { + if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) { + WARN_ON_ONCE(1); + return -EINVAL; + } + if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) + return -ENOMEM; + } + + cur_mss = TCP_BASE_MSS; + + if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) && + TCP_SKB_CB(skb)->seq != tp->snd_una) + return -EAGAIN; + + len = cur_mss * segs; + if (skb->len > len) { + if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, + skb, len, cur_mss, GFP_ATOMIC)) + return -ENOMEM; /* We'll try again later. */ + } else { + diff = tcp_skb_pcount(skb); + tcp_nip_set_skb_tso_segs(skb, cur_mss); + diff -= tcp_skb_pcount(skb); + if (diff) + tcp_nip_adjust_pcount(sk, skb, diff); + } + + err = tcp_nip_transmit_skb(sk, skb, 1, GFP_ATOMIC); + if (likely(!err)) { + segs = tcp_skb_pcount(skb); + + tp->total_retrans += segs; + } + return err; +} + +int tcp_nip_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) +{ + struct tcp_sock *tp = tcp_sk(sk); + int err = __tcp_nip_retransmit_skb(sk, skb, segs); + + if (err == 0) { + TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; + tp->retrans_out += tcp_skb_pcount(skb); + + /* Save stamp of the first retransmit. 
*/ + if (!tp->retrans_stamp) + tp->retrans_stamp = tcp_skb_timestamp(skb); + } else if (err != -EBUSY) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); + } + + return err; +} + +#define TCP_NIP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \ + (1UL << TCP_NIP_WRITE_TIMER_DEFERRED) | \ + (1UL << TCP_NIP_DELACK_TIMER_DEFERRED) | \ + (1UL << TCP_MTU_REDUCED_DEFERRED)) + +void tcp_nip_release_cb(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned long flags, nflags; + /* perform an atomic operation only if at least one flag is set */ + do { + flags = sk->sk_tsq_flags; + if (!(flags & TCP_NIP_DEFERRED_ALL)) + return; + nflags = flags & ~TCP_NIP_DEFERRED_ALL; + } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags); + + sock_release_ownership(sk); + if (flags & (1UL << TCP_NIP_WRITE_TIMER_DEFERRED)) { + tcp_nip_write_timer_handler(sk); + __sock_put(sk); + } + if (flags & (1UL << TCP_NIP_DELACK_TIMER_DEFERRED)) { + tcp_nip_delack_timer_handler(sk); + __sock_put(sk); + } + if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) { + inet_csk(sk)->icsk_af_ops->mtu_reduced(sk); + __sock_put(sk); + } +} + +static int tcp_nip_xmit_probe_skb(struct sock *sk, int urgent, int mib) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + /* We don't queue it, tcp_transmit_skb() sets ownership. */ + skb = alloc_skb(MAX_TCP_HEADER, + sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN)); + if (!skb) + return -1; + + /* Reserve space for headers and set control bits. 
*/ + skb_reserve(skb, MAX_TCP_HEADER); + + tcp_nip_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); + + NET_INC_STATS(sock_net(sk), mib); + return tcp_nip_transmit_skb(sk, skb, 0, (__force gfp_t)0); +} + +int tcp_nip_write_wakeup(struct sock *sk, int mib) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + if (sk->sk_state == TCP_CLOSE) + return -1; + + skb = tcp_nip_send_head(sk); + /* 若此时下一个包的序号在发送窗口内 */ + if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) { + int err; + + unsigned int mss = TCP_BASE_MSS; + unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; + + if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) + tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; + /* 若当前窗口大小不够发送一个完整的数据包 */ + if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) + return -1; + err = tcp_nip_transmit_skb(sk, skb, 1, GFP_ATOMIC); + if (!err) + tcp_nip_event_new_data_sent(sk, skb); + return err; + } else { + return tcp_nip_xmit_probe_skb(sk, 0, mib); + } +} + +/* 发送0窗口探测报文 */ +void tcp_nip_send_probe0(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); + unsigned long probe_max; + int err; + /* 发送一个序号为snd_una - 1,长度为0的ACK包作为零窗口探测报文 */ + err = tcp_nip_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); + + /* 若网络中有发送的数据包且发送队列中没有待发送的数据包了 + * 就直接返回 + */ + if (tp->packets_out || !tcp_nip_send_head(sk)) { + /* Cancel probe timer, if it is not required. 
*/ + icsk->icsk_probes_out = 0; + icsk->icsk_backoff = 0; + return; + } + /* err = 0:发送成功 err = -1:发送失败 */ + if (err <= 0) { + if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2) + icsk->icsk_backoff++; + icsk->icsk_probes_out++; /* 探测次数+1 */ + probe_max = TCP_RTO_MAX; + } else { + if (!icsk->icsk_probes_out) + icsk->icsk_probes_out = 1; + probe_max = TCP_RESOURCE_PROBE_INTERVAL; + } + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + tcp_probe0_when(sk, probe_max), + TCP_RTO_MAX); +} diff --git a/net/newip/tcp_nip_timer.c b/net/newip/tcp_nip_timer.c new file mode 100755 index 0000000000000000000000000000000000000000..a44c0769d14e0b08bbe9cbe24a2be6af3737ae7c --- /dev/null +++ b/net/newip/tcp_nip_timer.c @@ -0,0 +1,270 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include +#include + +void tcp_nip_delack_timer_handler(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + + if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) || + !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) + goto out; + + if (time_after(icsk->icsk_ack.timeout, jiffies)) { + sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); + goto out; + } + icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; + + if (inet_csk_ack_scheduled(sk)) { + icsk->icsk_ack.ato = TCP_ATO_MIN; + tcp_mstamp_refresh(tcp_sk(sk)); + tcp_nip_send_ack(sk); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS); + } + +out:; +} + +static void tcp_nip_write_err(struct sock *sk) +{ + sk->sk_err = sk->sk_err_soft ? 
: ETIMEDOUT; + sk->sk_error_report(sk); + /* 释放TCP资源 */ + tcp_nip_done(sk); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT); +} + +static void tcp_nip_delack_timer(struct timer_list *t) +{ + struct inet_connection_sock *icsk = + from_timer(icsk, t, icsk_delack_timer); + struct sock *sk = &icsk->icsk_inet.sk; + + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) { + tcp_nip_delack_timer_handler(sk); + } else { + __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); + /* deleguate our work to tcp_release_cb() */ + if (!test_and_set_bit(TCP_NIP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags)) + sock_hold(sk); + } + bh_unlock_sock(sk); + sock_put(sk); +} + +static bool retransmits_nip_timed_out(struct sock *sk, + unsigned int boundary, + unsigned int timeout, + bool syn_set) +{ + unsigned int linear_backoff_thresh, start_ts; + unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN; + + if (!inet_csk(sk)->icsk_retransmits) + return false; + + start_ts = tcp_sk(sk)->retrans_stamp; + if (unlikely(!start_ts)) + start_ts = tcp_skb_timestamp(tcp_write_queue_head(sk)); + + if (likely(timeout == 0)) { + linear_backoff_thresh = ilog2(TCP_RTO_MAX / rto_base); + if (boundary <= linear_backoff_thresh) + timeout = ((2 << boundary) - 1) * rto_base; + else + timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + + (boundary - linear_backoff_thresh) * TCP_RTO_MAX; + } + DEBUG("tcp_time_stamp(tcp_sk(sk)): %d, start_ts: %d,", + "tcp_time_stamp(tcp_sk(sk)) - start_ts: %d\n", + tcp_time_stamp(tcp_sk(sk)), start_ts, tcp_time_stamp(tcp_sk(sk)) - start_ts); + DEBUG("timeout: %d\n", jiffies_to_msecs(timeout)); + + return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= jiffies_to_msecs(timeout); +} + +static int tcp_nip_write_timeout(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); + int retry_until; + bool do_reset, syn_set = false; + + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | 
TCPF_SYN_RECV)) { + retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; + syn_set = true; + } else { + retry_until = net->ipv4.sysctl_tcp_retries2; + } + + if (retransmits_nip_timed_out(sk, retry_until, + syn_set ? 0 : icsk->icsk_user_timeout, syn_set)) { + DEBUG("%s: tcp retransmit time out!!!\n", __func__); + + return 1; + } + return 0; +} + +void tcp_nip_retransmit_timer(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + + if (!tp->packets_out) + goto out; + + WARN_ON(tcp_nip_write_queue_empty(sk)); + + tp->tlp_high_seq = 0; + + if (tcp_nip_write_timeout(sk)) + goto out; + + if (tcp_nip_retransmit_skb(sk, tcp_write_queue_head(sk), 1) > 0) { + if (!icsk->icsk_retransmits) + icsk->icsk_retransmits = 1; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL), + TCP_RTO_MAX); + goto out; + } + icsk->icsk_backoff++; + icsk->icsk_retransmits++; + +out_reset_timer: + icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); + +out:; +} + +void tcp_nip_probe_timer(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + int max_probes; + + if (tp->packets_out || !tcp_nip_send_head(sk)) { + icsk->icsk_probes_out = 0; + return; + } + + max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; + if (sock_flag(sk, SOCK_DEAD)) { + const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; + + max_probes = 3; + if (!alive && icsk->icsk_backoff >= max_probes) + goto abort; + + tcp_nip_done(sk); + return; + } + + if (icsk->icsk_probes_out >= max_probes) { +abort: tcp_nip_write_err(sk); + } else { + /* Only send another probe if we didn't close things up. 
*/ + tcp_nip_send_probe0(sk); + } +} + +void tcp_nip_write_timer_handler(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + int event; + + if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) || + !icsk->icsk_pending) + goto out; + + if (time_after(icsk->icsk_timeout, jiffies)) { + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); + goto out; + } + tcp_mstamp_refresh(tcp_sk(sk)); + event = icsk->icsk_pending; + + switch (event) { + case ICSK_TIME_RETRANS: + icsk->icsk_pending = 0; + tcp_nip_retransmit_timer(sk); + break; + case ICSK_TIME_PROBE0: + icsk->icsk_pending = 0; + tcp_nip_probe_timer(sk); + break; + default: + break; + } + +out:; +} + +static void tcp_nip_write_timer(struct timer_list *t) +{ + struct inet_connection_sock *icsk = + from_timer(icsk, t, icsk_retransmit_timer); + struct sock *sk = &icsk->icsk_inet.sk; + + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) { + tcp_nip_write_timer_handler(sk); + } else { + /* delegate our work to tcp_release_cb() */ + //这一bit怎么办。。。 + if (!test_and_set_bit(TCP_NIP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags)) + sock_hold(sk); + } + bh_unlock_sock(sk); + sock_put(sk); +} + +static void tcp_nip_keepalive_timer(struct timer_list *t) +{ + struct sock *sk = from_timer(sk, t, sk_timer); + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + u32 elapsed; + + /* Only process if socket is not in use. */ + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later. */ + inet_csk_reset_keepalive_timer(sk, HZ / 20); + goto out; + } + + if (sk->sk_state == TCP_LISTEN) { + pr_err("Hmm... 
keepalive on a LISTEN ???\n"); + goto out; + } + tcp_mstamp_refresh(tp); + if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { + DEBUG("%s: finish wait, close sock\n", __func__); + goto death; + } +death: + tcp_nip_done(sk); +out: + bh_unlock_sock(sk); + sock_put(sk); +} + +void tcp_nip_init_xmit_timers(struct sock *sk) +{ + inet_csk_init_xmit_timers(sk, &tcp_nip_write_timer, &tcp_nip_delack_timer, + &tcp_nip_keepalive_timer); +} + +void tcp_nip_clear_xmit_timers(struct sock *sk) +{ + inet_csk_clear_xmit_timers(sk); +} diff --git a/net/newip/udp.c b/net/newip/udp.c new file mode 100755 index 0000000000000000000000000000000000000000..ec2a0f345b774592ed2a854f6a4216f0be3a86d1 --- /dev/null +++ b/net/newip/udp.c @@ -0,0 +1,553 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned int nip_check_sum(unsigned char *data, unsigned short data_len) +{ + unsigned int i = 0, sum = 0; + + while (i + 1 < data_len) { + sum += (data[i] << 8) + data[i + 1]; + i += 2; /* 偏移2字节 */ + } + + if (i < (unsigned int)data_len) + sum += (data[i] << 8); + + return sum; +} + +unsigned int nip_header_chksum(struct nip_pseudo_header *chksum_header) +{ + unsigned char pseudo_header[NIP_HDR_MAX] = {0}; + unsigned short hdr_len = 0; + + if (chksum_header->src_addr.bitlen / NIP_ADDR_BIT_LEN_8) { + memcpy(pseudo_header + hdr_len, + chksum_header->src_addr.nip_addr_field8, + chksum_header->src_addr.bitlen / NIP_ADDR_BIT_LEN_8); + hdr_len += chksum_header->src_addr.bitlen / NIP_ADDR_BIT_LEN_8; + } + + if (chksum_header->dst_addr.bitlen / NIP_ADDR_BIT_LEN_8) { + memcpy(pseudo_header + hdr_len, + chksum_header->dst_addr.nip_addr_field8, + chksum_header->dst_addr.bitlen / NIP_ADDR_BIT_LEN_8); + hdr_len += chksum_header->dst_addr.bitlen / NIP_ADDR_BIT_LEN_8; + } + + *(unsigned 
short *)(pseudo_header + hdr_len) = + htons(chksum_header->check_len); + hdr_len += sizeof(chksum_header->check_len); + *(pseudo_header + hdr_len) = chksum_header->next_header; + hdr_len += sizeof(chksum_header->next_header); + + return nip_check_sum(pseudo_header, hdr_len); +} + +unsigned short nip_check_sum_parse(unsigned char *data, + unsigned short check_len, + struct nip_pseudo_header *chksum_header) +{ + unsigned int sum = 0; + + sum = nip_check_sum(data, check_len); + sum += nip_header_chksum(chksum_header); + + while (sum >> USHORT_PAYLOAD) + sum = (sum >> USHORT_PAYLOAD) + (sum & 0xffff); + + return (unsigned short)sum; +} + +static u32 nip_udp_portaddr_hash(const struct net *net, + const struct nip_addr *niaddr, + u_short port) +{ + u32 hash; + u32 mix = net_hash_mix(net); + + /* use nip_addr_hash() to obtain a hash result of nip_addr */ + hash = jhash_1word(nip_addr_hash(niaddr), mix); + + return hash ^ port; +} + +/* bind & sendto过程中调用,绑定端口 */ +int nip_udp_get_port(struct sock *sk, unsigned short snum) +{ + unsigned int hash2_nulladdr, hash2_partial; + + hash2_nulladdr = nip_udp_portaddr_hash(sock_net(sk), + &nip_any_addr, snum); + /* hash2_partial is the hash result of nip_addr only */ + hash2_partial = nip_udp_portaddr_hash(sock_net(sk), + &sk->sk_nip_rcv_saddr, 0); + + /* precompute partial secondary hash*/ + udp_sk(sk)->udp_portaddr_hash = hash2_partial; + return udp_lib_get_port(sk, snum, hash2_nulladdr); +} + +/* judge whether the nip_addr is equal to 0xFF09 */ +static bool nip_addr_any(const struct nip_addr *ad) +{ + bool result; + + if (ad->bitlen == NIP_ADDR_BIT_LEN_16) { + if (ad->nip_addr_field16[0] == + nip_any_addr.nip_addr_field16[0]) + result = 1; + else + result = 0; + } else { + result = 0; + } + return result; +} + +static int nip_udp_compute_score(struct sock *sk, struct net *net, + const struct nip_addr *saddr, __be16 sport, + const struct nip_addr *daddr, unsigned short hnum, + int dif, int sdif, bool exact_dif) +{ + /* we don't 
consider dif and sdif at the moment */ + int score = 0; + struct inet_sock *inet; + + if (!net_eq(sock_net(sk), net) || + udp_sk(sk)->udp_port_hash != hnum || + sk->sk_family != PF_NINET) + return -1; + + /* 本端设备侧的对端设备目的端口 + * 在对端发送报文头中是源端口 + */ + inet = inet_sk(sk); + if (inet->inet_dport) { + if (inet->inet_dport != sport) + return -1; + score++; + } + + /* 本端设备侧的源地址 + * 在对端设备发送报文头中是目的地址 + */ + if (!nip_addr_any(&sk->sk_nip_rcv_saddr)) { + if (!nip_addr_eq(&sk->sk_nip_rcv_saddr, daddr)) + return -1; + score++; + } + + /* 本端设备侧目的地址 + * 在对端设备发送报文头中是源地址 + */ + if (!nip_addr_any(&sk->sk_nip_daddr)) { + if (!nip_addr_eq(&sk->sk_nip_daddr, saddr)) + return -1; + score++; + } + + if (sk->sk_incoming_cpu == raw_smp_processor_id()) + score++; + return score; +} + +static struct sock *nip_udp_lib_lookup2(struct net *net, + const struct nip_addr *saddr, + u_short sport, + const struct nip_addr *daddr, + unsigned short hnum, + int dif, int sdif, bool exact_dif, + struct udp_hslot *hslot2, + struct sk_buff *skb) +{ + struct sock *sk; + struct sock *result = NULL; + int score, badness; + + badness = -1; + udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { + score = nip_udp_compute_score(sk, net, saddr, sport, + daddr, hnum, dif, sdif, exact_dif); + if (score > badness) { + result = sk; + badness = score; + } + } + return result; +} + +struct sock *__nip_udp_lib_lookup(struct net *net, + const struct nip_addr *saddr, __be16 sport, + const struct nip_addr *daddr, __be16 dport, + int dif, int sdif, struct udp_table *udptable, + struct sk_buff *skb) +{ + unsigned short hnum = ntohs(dport); + unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); + unsigned int old_slot2; + bool exact_dif = 0; + int score, badness; + struct sock *sk, *result; + struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; + + if (hslot->count > NIP_UDP_HSLOT_COUNT) { + hash2 = nip_udp_portaddr_hash(net, daddr, hnum); + DEBUG("hash2 is: 0x%x", hash2); + slot2 = hash2 & 
udptable->mask; + hslot2 = &udptable->hash2[slot2]; + if (hslot->count < hslot2->count) + goto begin; + + result = nip_udp_lib_lookup2(net, saddr, sport, + daddr, hnum, dif, sdif, exact_dif, + hslot2, skb); + if (!result) { + old_slot2 = slot2; + + hash2 = nip_udp_portaddr_hash(net, &nip_any_addr, hnum); + slot2 = hash2 & udptable->mask; + /* avoid searching the same slot again. */ + if (unlikely(slot2 == old_slot2)) + return result; + + hslot2 = &udptable->hash2[slot2]; + if (hslot->count < hslot2->count) + goto begin; + + result = nip_udp_lib_lookup2(net, saddr, sport, + daddr, hnum, dif, sdif, + exact_dif, hslot2, + skb); + } + return result; + } +begin: + result = NULL; + badness = -1; + sk_for_each_rcu(sk, &hslot->head) { + score = nip_udp_compute_score(sk, net, + saddr, sport, daddr, hnum, + dif, sdif, exact_dif); + if (score > badness) { + result = sk; + badness = score; + } + DEBUG("score is: %d", score); + } + return result; +} +EXPORT_SYMBOL_GPL(__nip_udp_lib_lookup); + +static struct sock *__nip_udp_lib_lookup_skb(struct sk_buff *skb, + __be16 sport, __be16 dport, + struct udp_table *udptable) +{ + return __nip_udp_lib_lookup(dev_net(skb->dev), + &NIPCB(skb)->srcaddr, sport, + &NIPCB(skb)->dstaddr, dport, 0, + 0, udptable, skb); +} + +void udp_table_del(struct sock *sk) +{ + udp_lib_unhash(sk); +} + +int nip_udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int noblock, int flags, int *addr_len) +{ + struct sk_buff *skb; + unsigned int ulen, copied, datalen; + int peeking, off; + int err; + + off = sk_peek_offset(sk, flags); + peeking = off; /* 从队列里取出skb */ + skb = __skb_recv_udp(sk, flags, noblock, &off, &err); + if (!skb) + return err; + ulen = skb->len; + copied = len; + if (copied > ulen - off) + copied = ulen - off; + else if (copied < ulen) + msg->msg_flags |= MSG_TRUNC; + /* 计算复制数据的长度,不包括udp首部 */ + DEBUG_TRACE("copied = %d", copied); + /* copy data */ + datalen = copy_to_iter(skb->data, copied, &msg->msg_iter); + if (datalen < 0) { + 
DEBUG("%s: copy to iter in failure! len = %d", __func__, + datalen); + err = -EFAULT; + return err; + } + DEBUG_TRACE("udp_payloadlen = %d", datalen); + + sock_recv_ts_and_drops(msg, sk, skb); + /* 更新传输控制块中最后一个数据报接收 + * 的时间戳等信息 + */ + /* copy the address */ + if (msg->msg_name) { + DECLARE_SOCKADDR(struct sockaddr_nin *, sin, msg->msg_name); + + sin->sin_family = AF_NINET; + sin->sin_port = udp_hdr(skb)->source; + sin->sin_addr = NIPCB(skb)->srcaddr; + *addr_len = sizeof(*sin); + } + + err = copied; + if (flags & MSG_TRUNC) + err = ulen; + + skb_consume_udp(sk, skb, peeking ? -err : err); + return err; +} + +static void nip_udp_err(struct sk_buff *skb, + struct ninet_skb_parm *opt, + u8 type, + u8 code, int offset, + __be32 info) +{ +} + +static int __nip_udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + int rc; + + sk_incoming_cpu_update(sk); + + rc = __udp_enqueue_schedule_skb(sk, skb); + if (rc < 0) { + kfree_skb(skb); + return -1; + } + return 0; +} + +bool nip_get_udp_input_checksum(struct sk_buff *skb) +{ + struct nip_pseudo_header nph = {0}; + struct udphdr *udphead = udp_hdr(skb); + + nph.next_header = NIPCB(skb)->nexthdr; + nph.src_addr = NIPCB(skb)->srcaddr; + nph.dst_addr = NIPCB(skb)->dstaddr; + nph.check_len = ntohs(udphead->len); + + return nip_check_sum_parse(skb_transport_header(skb), + nph.check_len, &nph) + == 0xffff ? true : false; +} + +/* udp接收,网络层接收 */ +int nip_udp_input(struct sk_buff *skb) +{ + struct sock *sk; + int rc = 0; + struct udphdr *udphead = udp_hdr(skb); + + if (!nip_get_udp_input_checksum(skb)) { + DEBUG("%s: checksum failed, drop the packet. 
", + __func__); + kfree_skb(skb); + rc = -1; + goto end; + } + + sk = __nip_udp_lib_lookup_skb(skb, udphead->source, + udphead->dest, &udp_table); + if (!sk) { + kfree_skb(skb); + DEBUG("%s: dport not match, free the skb.", __func__); + rc = -1; + goto end; + } + + skb_pull(skb, sizeof(struct udphdr)); + skb->len = ntohs(udphead->len) - sizeof(struct udphdr); + + skb_dst_drop(skb); + /* enqueue */ + rc = __nip_udp_queue_rcv_skb(sk, skb); +end: + return rc; +} + +int nip_udp_output(struct sock *sk, struct msghdr *msg, size_t len) +{ + DECLARE_SOCKADDR(struct sockaddr_nin *, sin, msg->msg_name); + struct flow_nip fln; + u_short sport, dport; + struct dst_entry *dst; + int err = 0; + struct inet_sock *inet; + + if (sin->sin_family != AF_NINET) { + DEBUG("%s: sin_family false.", __func__); + return -EAFNOSUPPORT; + } + if (nip_addr_check(&sin->sin_addr)) { + DEBUG("%s: sin_addr false.", __func__); + return -EADDRNOTAVAIL; + } + + inet = inet_sk(sk); + if (sin) { + /* sendto时必须指定目的地址,端口(网络序) */ + dport = sin->sin_port; + fln.daddr = sin->sin_addr; + } else { + /* 目前未实现udp socket connect 函数, + * 目的地址与端口必须由sendto直接提供 + */ + return -EDESTADDRREQ; + } + sport = htons(inet->inet_num); + + /* 查路由 & 获取saddr */ + dst = nip_sk_dst_lookup_flow(sk, &fln); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; + goto out; + } + + err = nip_segment_output(sk, msg, len, + sizeof(struct udphdr), &fln.saddr, + sport, &fln.daddr, + dport, dst); + +out: + if (!err) + return len; + + return err; +} + +/* 关闭连接用到 */ +void nip_udp_destroy_sock(struct sock *sk) +{ + udp_table_del(sk); + ninet_destroy_sock(sk); +} + +/* socket option code for udp */ +int nip_udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen) +{ + return 0; +} + +int nip_udp_getsockopt(struct sock *sk, int level, + int optname, char __user *optval, + int __user *optlen) +{ + return 0; +} + +static const struct ninet_protocol nip_udp_protocol = { + .handler = nip_udp_input, + 
.err_handler = nip_udp_err, + .flags = 0, +}; + +int udp_stub_hash(struct sock *sk) +{ + return 0; +} + +void udp_stub_unhash(struct sock *sk) +{ +} + +void udp_stub_rehash(struct sock *sk) +{ +} + +/* newip udp 相关操作 */ +struct proto nip_udp_prot = { + .name = "nip_udp", + .owner = THIS_MODULE, + .close = udp_lib_close, + .connect = nip_datagram_connect, + .disconnect = udp_disconnect, + .ioctl = udp_ioctl, + .init = udp_init_sock, + .destroy = nip_udp_destroy_sock, + .setsockopt = nip_udp_setsockopt, + .getsockopt = nip_udp_getsockopt, + .sendmsg = nip_udp_output, + .recvmsg = nip_udp_recvmsg, + .backlog_rcv = __nip_udp_queue_rcv_skb, + .release_cb = nip_datagram_release_cb, + .hash = udp_lib_hash, + .unhash = udp_lib_unhash, + .rehash = udp_stub_rehash, + .get_port = nip_udp_get_port, + .memory_allocated = &udp_memory_allocated, + .sysctl_mem = sysctl_udp_mem, + .obj_size = sizeof(struct nip_udp_sock), + .h.udp_table = &udp_table, + .diag_destroy = udp_abort, +}; + +/* 创建newip socket相关信息 */ +static struct inet_protosw nip_udp_protosw = { + .type = SOCK_DGRAM, + .protocol = IPPROTO_UDP, + .prot = &nip_udp_prot, + .ops = &ninet_dgram_ops, + .flags = INET_PROTOSW_PERMANENT, +}; + +/* af_NINET 初始化调用 */ +int __init nip_udp_init(void) +{ + int ret; + + ret = ninet_add_protocol(&nip_udp_protocol, IPPROTO_UDP); + if (ret) + goto out; + + ret = ninet_register_protosw(&nip_udp_protosw); + if (ret) + goto out_nip_udp_protocol; +out: + return ret; + +out_nip_udp_protocol: + ninet_del_protocol(&nip_udp_protocol, IPPROTO_UDP); + goto out; +} + +void nip_udp_exit(void) +{ + ninet_unregister_protosw(&nip_udp_protosw); + ninet_del_protocol(&nip_udp_protocol, IPPROTO_UDP); +}