From 757b5b7627524ba4b2d4d2182224e338d6ba7ad5 Mon Sep 17 00:00:00 2001
From: Wang Liang
Date: Tue, 26 Aug 2025 20:30:34 +0800
Subject: [PATCH 1/7] net/oenetcls: introduce oenetcls for network optimization

commit 4bed6ba0e88f50484fd5fb06bd993727b981b718 openEuler
commit 75b4c5128ba410608241b54d9dfd13711804917a openEuler
commit 22d4075bf5ef29ba4b329f954ac28a7de1d69a65 openEuler

This introduces a network optimization mechanism named oenetcls. It
configures ntuple rules and binds interrupts to netdev queues
automatically.

Signed-off-by: Yue Haibing
Signed-off-by: Wang Liang
Signed-off-by: Liu Jian
Signed-off-by: yuelg
Signed-off-by: liujian <66liujian@163.com>
---
 arch/arm64/configs/tencent.config |    1 +
 include/linux/oenetcls.h          |   81 +++
 include/net/netdev_rx_queue.h     |    2 +-
 kernel/irq/irqdesc.c              |    2 +-
 net/Kconfig                       |    1 +
 net/Makefile                      |    1 +
 net/core/dev.c                    |   23 +
 net/ipv4/af_inet.c                |    7 +
 net/ipv4/tcp.c                    |    9 +
 net/oenetcls/Kconfig              |    9 +
 net/oenetcls/Makefile             |    8 +
 net/oenetcls/asmdefs.h            |   61 ++
 net/oenetcls/memcpy-sve.S         |  157 +++++
 net/oenetcls/oenetcls.h           |  177 +++++
 net/oenetcls/oenetcls_flow.c      |  406 +++++++++++
 net/oenetcls/oenetcls_main.c      | 1076 +++++++++++++++++++++++++++++
 net/oenetcls/oenetcls_ntuple.c    |  572 +++++++++++++++
 17 files changed, 2591 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/oenetcls.h
 create mode 100644 net/oenetcls/Kconfig
 create mode 100644 net/oenetcls/Makefile
 create mode 100644 net/oenetcls/asmdefs.h
 create mode 100644 net/oenetcls/memcpy-sve.S
 create mode 100644 net/oenetcls/oenetcls.h
 create mode 100644 net/oenetcls/oenetcls_flow.c
 create mode 100644 net/oenetcls/oenetcls_main.c
 create mode 100644 net/oenetcls/oenetcls_ntuple.c

diff --git a/arch/arm64/configs/tencent.config b/arch/arm64/configs/tencent.config
index 4268a2642279..4883f8d928f6 100644
--- a/arch/arm64/configs/tencent.config
+++ b/arch/arm64/configs/tencent.config
@@ -194,6 +194,7 @@ CONFIG_SMC=m
 CONFIG_SMC_DIAG=m
 CONFIG_XDP_SOCKETS=y
 CONFIG_XDP_SOCKETS_DIAG=m
+CONFIG_OENETCLS=m
 CONFIG_INET=y
 CONFIG_IP_MULTICAST=y
 CONFIG_IP_ADVANCED_ROUTER=y
diff --git a/include/linux/oenetcls.h b/include/linux/oenetcls.h
new file mode 100644
index 000000000000..29c0db40971f
--- /dev/null
+++ b/include/linux/oenetcls.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _LINUX_OENETCLS_H
+#define _LINUX_OENETCLS_H
+
+struct oecls_hook_ops {
+	void (*oecls_cfg_rxcls)(struct sock *sk, int is_del);
+	void (*oecls_flow_update)(struct sock *sk);
+	void (*oecls_set_cpu)(struct sk_buff *skb);
+	bool (*oecls_timeout)(struct net_device *dev, u16 rxq_index,
+			      u32 flow_id, u16 filter_id);
+};
+
+extern const struct oecls_hook_ops __rcu *oecls_ops;
+
+static inline void oenetcls_cfg_rxcls(struct sock *sk, int is_del)
+{
+	const struct oecls_hook_ops *ops;
+
+	rcu_read_lock();
+	ops = rcu_dereference(oecls_ops);
+	if (ops && ops->oecls_cfg_rxcls)
+		ops->oecls_cfg_rxcls(sk, is_del);
+	rcu_read_unlock();
+}
+
+static inline void oenetcls_flow_update(struct sock *sk)
+{
+	const struct oecls_hook_ops *ops;
+
+	rcu_read_lock();
+	ops = rcu_dereference(oecls_ops);
+	if (ops && ops->oecls_flow_update)
+		ops->oecls_flow_update(sk);
+	rcu_read_unlock();
+}
+
+static inline void oenetcls_skb_set_cpu(struct sk_buff *skb)
+{
+	const struct oecls_hook_ops *ops;
+
+	rcu_read_lock();
+	ops = rcu_dereference(oecls_ops);
+	if (ops && ops->oecls_set_cpu)
+		ops->oecls_set_cpu(skb);
+	rcu_read_unlock();
+}
+
+static inline void oenetcls_skblist_set_cpu(struct list_head *head)
+{
+	const struct oecls_hook_ops
*ops; + struct sk_buff *skb, *next; + + rcu_read_lock(); + ops = rcu_dereference(oecls_ops); + if (ops && ops->oecls_set_cpu) { + list_for_each_entry_safe(skb, next, head, list) + ops->oecls_set_cpu(skb); + } + rcu_read_unlock(); +} + +static inline bool oenetcls_may_expire_flow(struct net_device *dev, + u16 rxq_index, u32 flow_id, + u16 filter_id, bool *expire) +{ + const struct oecls_hook_ops *ops; + + rcu_read_lock(); + ops = rcu_dereference(oecls_ops); + if (ops && ops->oecls_timeout) { + *expire = ops->oecls_timeout(dev, rxq_index, flow_id, filter_id); + rcu_read_unlock(); + return true; + } + rcu_read_unlock(); + + return false; +} + +#endif /* _LINUX_OENETCLS_H */ + diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index bafc4fbba51b..06cc07d5a816 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -23,7 +23,7 @@ struct netdev_rx_queue { struct xsk_buff_pool *pool; #endif - KABI_RESERVE(1); + KABI_USE(1, void *__rcu oecls_ftb); KABI_RESERVE(2); } ____cacheline_aligned_in_smp; diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 1a9303b5c37f..93f61ddc8da7 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -428,7 +428,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) { return mtree_load(&sparse_irqs, irq); } -#ifdef CONFIG_KVM_BOOK3S_64_HV_MODULE +#if defined(CONFIG_KVM_BOOK3S_64_HV_MODULE) || IS_ENABLED(CONFIG_OENETCLS) EXPORT_SYMBOL_GPL(irq_to_desc); #endif diff --git a/net/Kconfig b/net/Kconfig index 6fa566507e60..2b0e2fd67584 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -73,6 +73,7 @@ source "net/iucv/Kconfig" source "net/smc/Kconfig" source "net/xdp/Kconfig" source "net/toa/Kconfig" +source "net/oenetcls/Kconfig" config NET_HANDSHAKE bool diff --git a/net/Makefile b/net/Makefile index dea256b1b293..c740ec85824b 100644 --- a/net/Makefile +++ b/net/Makefile @@ -79,5 +79,6 @@ obj-$(CONFIG_NET_NCSI) += ncsi/ obj-$(CONFIG_XDP_SOCKETS) += xdp/ obj-$(CONFIG_MPTCP) += mptcp/ obj-$(CONFIG_MCTP) += mctp/ +obj-$(CONFIG_OENETCLS) += oenetcls/ obj-$(CONFIG_NET_HANDSHAKE) += handshake/ obj-$(CONFIG_TOA) += toa/ diff --git a/net/core/dev.c b/net/core/dev.c index f03459445e55..2947a9369362 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -158,6 +158,12 @@ #include "dev.h" #include "net-sysfs.h" +#if IS_ENABLED(CONFIG_OENETCLS) +#include +const struct oecls_hook_ops __rcu *oecls_ops __read_mostly; +EXPORT_SYMBOL_GPL(oecls_ops); +#endif + static DEFINE_SPINLOCK(ptype_lock); struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; struct list_head ptype_all __read_mostly; /* Taps */ @@ -4732,6 +4738,10 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, bool expire = true; unsigned int cpu; +#if IS_ENABLED(CONFIG_OENETCLS) + if (oenetcls_may_expire_flow(dev, rxq_index, flow_id, filter_id, &expire)) + return expire; +#endif rcu_read_lock(); flow_table = rcu_dereference(rxqueue->rps_flow_table); if (flow_table && flow_id <= flow_table->mask) { @@ -5813,6 +5823,11 @@ static int netif_receive_skb_internal(struct sk_buff *skb) } } #endif + +#if IS_ENABLED(CONFIG_OENETCLS) + oenetcls_skb_set_cpu(skb); +#endif + ret = __netif_receive_skb(skb); rcu_read_unlock(); return ret; @@ -5847,6 +5862,11 @@ void netif_receive_skb_list_internal(struct list_head *head) } } #endif + +#if IS_ENABLED(CONFIG_OENETCLS) + oenetcls_skblist_set_cpu(head); +#endif + __netif_receive_skb_list(head); rcu_read_unlock(); } @@ -9978,6 +9998,9 @@ int __netdev_update_features(struct net_device *dev) return err < 0 ? 
0 : 1; } +#if IS_ENABLED(CONFIG_OENETCLS) +EXPORT_SYMBOL(__netdev_update_features); +#endif /** * netdev_update_features - recalculate device features diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 2c7091fc4ddd..84aca4df2b11 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -125,6 +125,10 @@ #include #include +#if IS_ENABLED(CONFIG_OENETCLS) +#include +#endif + /* The inetsw table contains everything that inet_create needs to * build a new socket. */ @@ -226,6 +230,9 @@ int __inet_listen_sk(struct sock *sk, int backlog) tcp_set_state(sk, TCP_CLOSE); return -EPERM; } +#if IS_ENABLED(CONFIG_OENETCLS) + oenetcls_cfg_rxcls(sk, 0); +#endif } return 0; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b8bb4bd3b1e7..f8e7e9c35ed2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -279,6 +279,9 @@ #include #include #include +#if IS_ENABLED(CONFIG_OENETCLS) +#include +#endif #include "netlat.h" /* Track pending CMSGs. */ @@ -2600,6 +2603,9 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); +#if IS_ENABLED(CONFIG_OENETCLS) + oenetcls_flow_update(sk); +#endif if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) && sk->sk_state == TCP_ESTABLISHED) @@ -2965,6 +2971,9 @@ void __tcp_close(struct sock *sk, long timeout) void tcp_close(struct sock *sk, long timeout) { lock_sock(sk); +#if IS_ENABLED(CONFIG_OENETCLS) + oenetcls_cfg_rxcls(sk, 1); +#endif __tcp_close(sk, timeout); release_sock(sk); if (!sk->sk_net_refcnt) diff --git a/net/oenetcls/Kconfig b/net/oenetcls/Kconfig new file mode 100644 index 000000000000..68d5c6904319 --- /dev/null +++ b/net/oenetcls/Kconfig @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0-only +config OENETCLS + tristate "Network classification" + default n + help + Allows to configure ntuple rule, and bind interrupt to netdev + automatically. + Use OENETCLS && OENETCLS_HOOKS to enable oenetcls feature. + Use parameter mode to decide running mode. diff --git a/net/oenetcls/Makefile b/net/oenetcls/Makefile new file mode 100644 index 000000000000..cdf17ea096d3 --- /dev/null +++ b/net/oenetcls/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_OENETCLS) = oenetcls.o +oenetcls-y := oenetcls_main.o oenetcls_ntuple.o oenetcls_flow.o +ifeq ($(CONFIG_ARM64_SVE),y) +oenetcls-y += memcpy-sve.o +endif + diff --git a/net/oenetcls/asmdefs.h b/net/oenetcls/asmdefs.h new file mode 100644 index 000000000000..8138a94c18af --- /dev/null +++ b/net/oenetcls/asmdefs.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASMDEFS_H +#define _ASMDEFS_H + +/* Branch Target Identitication support. */ +#define BTI_C hint 34 +#define BTI_J hint 36 +/* Return address signing support (pac-ret). */ +#define PACIASP hint 25; .cfi_window_save +#define AUTIASP hint 29; .cfi_window_save + +/* GNU_PROPERTY_AARCH64_* macros from elf.h. */ +#define FEATURE_1_AND 0xc0000000 +#define FEATURE_1_BTI 1 +#define FEATURE_1_PAC 2 + +/* Add a NT_GNU_PROPERTY_TYPE_0 note. */ +#define GNU_PROPERTY(type, value) \ + .section .note.gnu.property, "a"; \ + .p2align 3; \ + .word 4; \ + .word 16; \ + .word 5; \ + .asciz "GNU"; \ + .word type; \ + .word 4; \ + .word value; \ + .word 0; \ + .text + +#ifndef WANT_GNU_PROPERTY +#define WANT_GNU_PROPERTY 1 +#endif + +#if WANT_GNU_PROPERTY +/* Add property note with supported features to all asm files. 
*/ +GNU_PROPERTY(FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) +#endif + +#define ENTRY_ALIGN(name, alignment) \ + .global name; \ + .type name, %function; \ + .align alignment; \ +name: \ + .cfi_startproc; \ + BTI_C; + +#define ENTRY(name) ENTRY_ALIGN(name, 6) + +#define ENTRY_ALIAS(name) \ + .global name; \ + .type name, %function; \ + name: + +#define END(name) \ + .cfi_endproc; \ + .size name, .-name; + +#define L(l) .L ## l + +#endif diff --git a/net/oenetcls/memcpy-sve.S b/net/oenetcls/memcpy-sve.S new file mode 100644 index 000000000000..106e4c30294c --- /dev/null +++ b/net/oenetcls/memcpy-sve.S @@ -0,0 +1,157 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#include "asmdefs.h" + +.arch armv8-a+sve + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define tmp1 x6 +#define vlen x6 + +#define A_q q0 +#define B_q q1 +#define C_q q2 +#define D_q q3 +#define E_q q4 +#define F_q q5 +#define G_q q6 +#define H_q q7 + +/* This implementation handles overlaps and supports both memcpy and memmove + from a single entry point. It uses unaligned accesses and branchless + sequences to keep the code small, simple and improve performance. + SVE vectors are used to speedup small copies. + + Copies are split into 3 main cases: small copies of up to 32 bytes, medium + copies of up to 128 bytes, and large copies. The overhead of the overlap + check is negligible since it is only required for large copies. + + Large copies use a software pipelined loop processing 64 bytes per iteration. + The source pointer is 16-byte aligned to minimize unaligned accesses. + The loop tail is handled by always copying 64 bytes from the end. +*/ + +ENTRY_ALIAS (__memmove_aarch64_sve) +ENTRY (__memcpy_aarch64_sve) + cmp count, 128 + b.hi L(copy_long) + cntb vlen + cmp count, vlen, lsl 1 + b.hi L(copy32_128) + + whilelo p0.b, xzr, count + whilelo p1.b, vlen, count + ld1b z0.b, p0/z, [src, 0, mul vl] + ld1b z1.b, p1/z, [src, 1, mul vl] + st1b z0.b, p0, [dstin, 0, mul vl] + st1b z1.b, p1, [dstin, 1, mul vl] + ret + + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + add srcend, src, count + add dstend, dstin, count + ldp A_q, B_q, [src] + ldp C_q, D_q, [srcend, -32] + cmp count, 64 + b.hi L(copy128) + stp A_q, B_q, [dstin] + stp C_q, D_q, [dstend, -32] + ret + + /* Copy 65..128 bytes. */ +L(copy128): + ldp E_q, F_q, [src, 32] + cmp count, 96 + b.ls L(copy96) + ldp G_q, H_q, [srcend, -64] + stp G_q, H_q, [dstend, -64] +L(copy96): + stp A_q, B_q, [dstin] + stp E_q, F_q, [dstin, 32] + stp C_q, D_q, [dstend, -32] + ret + + /* Copy more than 128 bytes. */ +L(copy_long): + add srcend, src, count + add dstend, dstin, count + + /* Use backwards copy if there is an overlap. */ + sub tmp1, dstin, src + cmp tmp1, count + b.lo L(copy_long_backwards) + + /* Copy 16 bytes and then align src to 16-byte alignment. */ + ldr D_q, [src] + and tmp1, src, 15 + bic src, src, 15 + sub dst, dstin, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_q, B_q, [src, 16] + str D_q, [dstin] + ldp C_q, D_q, [src, 48] + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls L(copy64_from_end) +L(loop64): + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [src, 80] + stp C_q, D_q, [dst, 48] + ldp C_q, D_q, [src, 112] + add src, src, 64 + add dst, dst, 64 + subs count, count, 64 + b.hi L(loop64) + + /* Write the last iteration and copy 64 bytes from the end. 
*/ +L(copy64_from_end): + ldp E_q, F_q, [srcend, -64] + stp A_q, B_q, [dst, 16] + ldp A_q, B_q, [srcend, -32] + stp C_q, D_q, [dst, 48] + stp E_q, F_q, [dstend, -64] + stp A_q, B_q, [dstend, -32] + ret + + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align srcend to 16-byte alignment. */ +L(copy_long_backwards): + cbz tmp1, L(return) + ldr D_q, [srcend, -16] + and tmp1, srcend, 15 + bic srcend, srcend, 15 + sub count, count, tmp1 + ldp A_q, B_q, [srcend, -32] + str D_q, [dstend, -16] + ldp C_q, D_q, [srcend, -64] + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls L(copy64_from_start) + +L(loop64_backwards): + str B_q, [dstend, -16] + str A_q, [dstend, -32] + ldp A_q, B_q, [srcend, -96] + str D_q, [dstend, -48] + str C_q, [dstend, -64]! + ldp C_q, D_q, [srcend, -128] + sub srcend, srcend, 64 + subs count, count, 64 + b.hi L(loop64_backwards) + + /* Write the last iteration and copy 64 bytes from the start. */ +L(copy64_from_start): + ldp E_q, F_q, [src, 32] + stp A_q, B_q, [dstend, -32] + ldp A_q, B_q, [src] + stp C_q, D_q, [dstend, -64] + stp E_q, F_q, [dstin, 32] + stp A_q, B_q, [dstin] +L(return): + ret + +END (__memcpy_aarch64_sve) diff --git a/net/oenetcls/oenetcls.h b/net/oenetcls/oenetcls.h new file mode 100644 index 000000000000..123c568e1096 --- /dev/null +++ b/net/oenetcls/oenetcls.h @@ -0,0 +1,177 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _NET_OENETCLS_H +#define _NET_OENETCLS_H +#include +#include +#include + +#define OECLS_MAX_NETDEV_NUM 8 +#define OECLS_MAX_RXQ_NUM_PER_DEV 256 +#define OECLS_MAX_CPU_NUM 1024 + +#define OECLS_TIMEOUT (5 * HZ) +#define OECLS_NO_FILTER 0xffff +#define OECLS_NO_CPU 0xffff + +struct oecls_netdev_queue_info { + int irq; + int affinity_cpu; +}; + +struct oecls_netdev_info { + char dev_name[IFNAMSIZ]; + struct net_device *netdev; + int rxq_num; + struct oecls_netdev_queue_info rxq[OECLS_MAX_RXQ_NUM_PER_DEV]; + int old_filter_state; +}; + +struct oecls_rxq { + int rxq_id; + int status; +}; + +struct oecls_numa_clusterinfo { + int cluster_id; + int cur_freeidx; + struct oecls_rxq rxqs[OECLS_MAX_RXQ_NUM_PER_DEV]; +}; + +struct oecls_numa_bound_dev_info { + DECLARE_BITMAP(bitmap_rxq, OECLS_MAX_RXQ_NUM_PER_DEV); + struct oecls_numa_clusterinfo *cluster_info; +}; + +struct oecls_numa_info { + DECLARE_BITMAP(avail_cpus, OECLS_MAX_CPU_NUM); + struct oecls_numa_bound_dev_info bound_dev[OECLS_MAX_NETDEV_NUM]; +}; + +struct cmd_context { + char netdev[IFNAMSIZ]; + u32 dip4; + u16 dport; + u16 action; + u32 ruleid; + u32 del_ruleid; + int ret_loc; +}; + +#define OECLS_SK_RULE_HASHSIZE 256 +#define OECLS_SK_RULE_HASHMASK (OECLS_SK_RULE_HASHSIZE - 1) + +struct oecls_sk_rule_list { + struct hlist_head hash[OECLS_SK_RULE_HASHSIZE]; + /* Mutex to synchronize access to ntuple rule locking */ + struct mutex mutex; +}; + +struct oecls_sk_rule { + struct hlist_node node; + int devid; + void *sk; + int dip4; + int dport; + int action; + int ruleid; + int nid; +}; + +struct oecls_sk_entry { + struct hlist_node node; + void *sk; + u32 sk_rule_hash; +}; + +struct oecls_dev_flow { + unsigned short cpu; + unsigned short filter; + unsigned int last_qtail; + int isvalid; + unsigned long timeout; +}; + +struct oecls_dev_flow_table { + unsigned int mask; + struct rcu_head rcu; + struct oecls_dev_flow flows[]; +}; + +struct oecls_sock_flow_table { + u32 mask; + u32 ents[] ____cacheline_aligned_in_smp; +}; + +#define OECLS_DEV_FLOW_TABLE_NUM 0x1000 +#define OECLS_SOCK_FLOW_TABLE_NUM 0x100000 +#define OECLS_DEV_FLOW_TABLE_SIZE(_num) 
(sizeof(struct oecls_dev_flow_table) + \ + ((_num) * sizeof(struct oecls_dev_flow))) +#define OECLS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct oecls_sock_flow_table, ents[_num])) + +#define ETH_ALL_FLAGS (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \ + ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH) +#define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_CTAG_RX | \ + NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_NTUPLE | \ + NETIF_F_RXHASH) + +struct rmgr_ctrl { + int driver_select; + unsigned long *slot; + __u32 n_rules; + __u32 size; +}; + +extern int match_ip_flag; +extern int oecls_debug_lvl; +extern int oecls_netdev_num; +extern int oecls_numa_num; + +#define oecls_debug(fmt, ...) \ + do { \ + if (oecls_debug_lvl) \ + trace_printk(fmt, ## __VA_ARGS__); \ + } while (0) + +#define oecls_error(fmt, ...) \ + do { \ + pr_err("oenetcls [%s:%d]: " fmt, __FILE__, __LINE__, ## __VA_ARGS__); \ + trace_printk(fmt, ## __VA_ARGS__); \ + } while (0) + +struct oecls_netdev_info *get_oecls_netdev_info(unsigned int index); + +#define for_each_oecls_netdev(devid, oecls_dev) \ + for (devid = 0, oecls_dev = get_oecls_netdev_info(devid); \ + (devid < oecls_netdev_num) && oecls_dev; \ + devid++, oecls_dev = get_oecls_netdev_info(devid)) + +struct oecls_numa_info *get_oecls_numa_info(unsigned int nid); + +#define for_each_oecls_numa(nid, numa_info) \ + for (nid = 0, numa_info = get_oecls_numa_info(nid); \ + (nid < oecls_numa_num) && numa_info; \ + nid++, numa_info = get_oecls_numa_info(nid)) + +#ifdef CONFIG_ARM64_SVE +void *__memcpy_aarch64_sve(void *, const void *, size_t); +#define memcpy_r(dst, src, len) \ + do { \ + if (system_supports_sve()) \ + __memcpy_aarch64_sve(dst, src, len); \ + else \ + memcpy(dst, src, len); \ + } while (0) +#else +#define memcpy_r(dst, src, len) memcpy(dst, src, len) +#endif + +int check_appname(char *task_name); +int send_ethtool_ioctl(struct cmd_context *ctx, void *cmd); +int alloc_rxq_id(int nid, int devid); +void free_rxq_id(int nid, int devid, int rxq_id); +void oecls_ntuple_res_init(void); +void oecls_ntuple_res_clean(void); +void oecls_flow_res_init(void); +void oecls_flow_res_clean(void); + +#endif /* _NET_OENETCLS_H */ diff --git a/net/oenetcls/oenetcls_flow.c b/net/oenetcls/oenetcls_flow.c new file mode 100644 index 000000000000..aaa5881a817c --- /dev/null +++ b/net/oenetcls/oenetcls_flow.c @@ -0,0 +1,406 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "oenetcls.h" + +static u32 oecls_cpu_mask; +static struct oecls_sock_flow_table __rcu *oecls_sock_flow_table; +static DEFINE_MUTEX(oecls_sock_flow_mutex); +static DEFINE_SPINLOCK(oecls_dev_flow_lock); + +bool is_oecls_config_netdev(const char *name) +{ + struct oecls_netdev_info *netdev_info; + int netdev_loop; + + for_each_oecls_netdev(netdev_loop, netdev_info) + if (strcmp(netdev_info->dev_name, name) == 0) + return true; + + return false; +} + +static bool _oecls_timeout(struct net_device *dev, u16 rxq_index, + u32 flow_id, u16 filter_id) +{ + struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; + struct oecls_dev_flow_table *flow_table; + struct oecls_dev_flow *rflow; + bool expire = true; + unsigned int cpu; + + rcu_read_lock(); + flow_table = rcu_dereference(rxqueue->oecls_ftb); + if (flow_table && flow_id <= flow_table->mask) { + rflow = &flow_table->flows[flow_id]; + cpu = READ_ONCE(rflow->cpu); + oecls_debug("dev:%s, rxq:%d, flow_id:%u, filter_id:%d/%d, cpu:%d\n", dev->name, + rxq_index, flow_id, filter_id, rflow->filter, 
cpu); + + if (rflow->filter == filter_id && cpu < nr_cpu_ids) { + if (time_before(jiffies, rflow->timeout + OECLS_TIMEOUT)) { + expire = false; + } else { + rflow->isvalid = 0; + WRITE_ONCE(rflow->cpu, OECLS_NO_CPU); + } + } + } + rcu_read_unlock(); + oecls_debug("%s, dev:%s, rxq:%d, flow_id:%u, filter_id:%d, expire:%d\n", __func__, + dev->name, rxq_index, flow_id, filter_id, expire); + return expire; +} + +static void _oecls_flow_update(struct sock *sk) +{ + struct oecls_sock_flow_table *tb; + unsigned int hash, index; + u32 val; + u32 cpu = raw_smp_processor_id(); + + if (sk->sk_state != TCP_ESTABLISHED) + return; + + if (check_appname(current->comm)) + return; + + rcu_read_lock(); + tb = rcu_dereference(oecls_sock_flow_table); + hash = READ_ONCE(sk->sk_rxhash); + if (tb && hash) { + index = hash & tb->mask; + val = hash & ~oecls_cpu_mask; + val |= cpu; + + if (READ_ONCE(tb->ents[index]) != val) { + WRITE_ONCE(tb->ents[index], val); + + oecls_debug("[%s] sk:%p, hash:0x%x, index:0x%x, val:0x%x, cpu:%d\n", + current->comm, sk, hash, index, val, cpu); + } + } + rcu_read_unlock(); +} + +static int flow_get_queue_idx(struct net_device *dev, int nid, struct sk_buff *skb) +{ + struct oecls_netdev_info *netdev_info; + int netdev_loop; + u32 hash, index; + struct oecls_numa_info *numa_info; + struct oecls_numa_bound_dev_info *bound_dev = NULL; + int rxq_id, rxq_num, i; + + numa_info = get_oecls_numa_info(nid); + if (!numa_info) + return -1; + + for_each_oecls_netdev(netdev_loop, netdev_info) { + if (strcmp(netdev_info->dev_name, dev->name) == 0) { + bound_dev = &numa_info->bound_dev[netdev_loop]; + break; + } + } + + if (!bound_dev) + return -1; + rxq_num = bitmap_weight(bound_dev->bitmap_rxq, OECLS_MAX_RXQ_NUM_PER_DEV); + if (rxq_num == 0) + return -1; + + hash = skb_get_hash(skb); + index = hash % rxq_num; + + i = 0; + for_each_set_bit(rxq_id, bound_dev->bitmap_rxq, OECLS_MAX_RXQ_NUM_PER_DEV) + if (index == i++) + return rxq_id; + + return -1; +} + +static void set_oecls_cpu(struct net_device *dev, struct sk_buff *skb, + struct oecls_dev_flow *old_rflow, int old_rxq_id, u16 next_cpu) +{ + struct netdev_rx_queue *rxqueue; + struct oecls_dev_flow_table *dtb; + struct oecls_dev_flow *rflow; + u32 flow_id, hash; + u16 rxq_index; + int rc; + + if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || + !(dev->features & NETIF_F_NTUPLE)) + return; + + rxq_index = flow_get_queue_idx(dev, cpu_to_node(next_cpu), skb); + if (rxq_index == skb_get_rx_queue(skb) || rxq_index < 0) + return; + + rxqueue = dev->_rx + rxq_index; + dtb = rcu_dereference(rxqueue->oecls_ftb); + if (!dtb) + return; + + hash = skb_get_hash(skb); + flow_id = hash & dtb->mask; + rflow = &dtb->flows[flow_id]; + if (rflow->isvalid && rflow->cpu == next_cpu) { + rflow->timeout = jiffies; + return; + } + + rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); + oecls_debug("skb:%p, rxq:%d, hash:0x%x, flow_id:%u, old_rxq_id:%d, next_cpu:%d, rc:%d\n", + skb, rxq_index, hash, flow_id, old_rxq_id, next_cpu, rc); + if (rc < 0) + return; + + rflow->filter = rc; + rflow->isvalid = 1; + rflow->timeout = jiffies; + if (old_rflow->filter == rflow->filter) + old_rflow->filter = OECLS_NO_FILTER; + rflow->cpu = next_cpu; +} + +static void __oecls_set_cpu(struct sk_buff *skb, struct net_device *ndev, + struct oecls_sock_flow_table *tb, struct oecls_dev_flow_table *dtb, + int old_rxq_id) +{ + struct oecls_dev_flow *rflow; + u32 last_recv_cpu, hash, val; + u32 tcpu = 0; + u32 cpu = raw_smp_processor_id(); + + skb_reset_network_header(skb); 
+ hash = skb_get_hash(skb); + if (!hash) + return; + + val = READ_ONCE(tb->ents[hash & tb->mask]); + last_recv_cpu = val & oecls_cpu_mask; + rflow = &dtb->flows[hash & dtb->mask]; + tcpu = rflow->cpu; + + if ((val ^ hash) & ~oecls_cpu_mask) + return; + + if (cpu_to_node(cpu) == cpu_to_node(last_recv_cpu)) + return; + + if (tcpu >= nr_cpu_ids) + set_oecls_cpu(ndev, skb, rflow, old_rxq_id, last_recv_cpu); +} + +static void _oecls_set_cpu(struct sk_buff *skb) +{ + struct net_device *ndev = skb->dev; + struct oecls_sock_flow_table *stb; + struct oecls_dev_flow_table *dtb; + struct netdev_rx_queue *rxqueue; + int rxq_id = -1; + + if (!ndev) + return; + + if (!is_oecls_config_netdev(ndev->name)) + return; + + rxqueue = ndev->_rx; + if (skb_rx_queue_recorded(skb)) { + rxq_id = skb_get_rx_queue(skb); + if (rxq_id >= ndev->real_num_rx_queues) { + oecls_debug("ndev:%s, rxq:%d, real_num:%d\n", ndev->name, + rxq_id, ndev->real_num_rx_queues); + return; + } + rxqueue += rxq_id; + } + + // oecls_debug("skb:%px, dev:%s, rxq_id:%d\n", skb, ndev->name, rxq_id); + if (rxq_id < 0) + return; + + rcu_read_lock(); + stb = rcu_dereference(oecls_sock_flow_table); + dtb = rcu_dereference(rxqueue->oecls_ftb); + if (stb && dtb) + __oecls_set_cpu(skb, ndev, stb, dtb, rxq_id); + + rcu_read_unlock(); +} + +static void oecls_dev_flow_table_free(struct rcu_head *rcu) +{ + struct oecls_dev_flow_table *table = container_of(rcu, + struct oecls_dev_flow_table, rcu); + vfree(table); +} + +static void oecls_dev_flow_table_cleanup(struct net_device *netdev, int qid) +{ + struct oecls_dev_flow_table *dtb; + struct netdev_rx_queue *queue; + int i; + + spin_lock(&oecls_dev_flow_lock); + for (i = 0; i < qid; i++) { + queue = netdev->_rx + i; + dtb = rcu_dereference_protected(queue->oecls_ftb, + lockdep_is_held(&oecls_dev_flow_lock)); + rcu_assign_pointer(queue->oecls_ftb, NULL); + } + spin_unlock(&oecls_dev_flow_lock); + call_rcu(&dtb->rcu, oecls_dev_flow_table_free); +} + +static int oecls_dev_flow_table_release(void) +{ + struct oecls_netdev_info *netdev_info; + int netdev_loop; + struct net_device *netdev; + + for_each_oecls_netdev(netdev_loop, netdev_info) { + netdev = netdev_info->netdev; + if (!netdev) + continue; + oecls_dev_flow_table_cleanup(netdev, netdev->num_rx_queues); + } + + return 0; +} + +static int _oecls_dev_flow_table_init(struct net_device *netdev) +{ + struct oecls_dev_flow_table *table; + int size = OECLS_DEV_FLOW_TABLE_NUM; + struct netdev_rx_queue *queue; + int i, j, ret = 0; + + size = roundup_pow_of_two(size); + oecls_debug("dev:%s, num_rx_queues:%d, mask:0x%x\n", netdev->name, netdev->num_rx_queues, + size - 1); + + for (i = 0; i < netdev->num_rx_queues; i++) { + table = vmalloc(OECLS_DEV_FLOW_TABLE_SIZE(size)); + if (!table) { + ret = -ENOMEM; + goto fail; + } + + table->mask = size - 1; + for (j = 0; j < size; j++) { + table->flows[j].cpu = OECLS_NO_CPU; + table->flows[j].isvalid = 0; + } + + queue = netdev->_rx + i; + + spin_lock(&oecls_dev_flow_lock); + rcu_assign_pointer(queue->oecls_ftb, table); + spin_unlock(&oecls_dev_flow_lock); + } + return ret; +fail: + oecls_dev_flow_table_cleanup(netdev, i); + return ret; +} + +static int oecls_dev_flow_table_init(void) +{ + struct oecls_netdev_info *netdev_info; + int netdev_loop; + struct net_device *ndev; + int i, err; + + for_each_oecls_netdev(netdev_loop, netdev_info) { + ndev = netdev_info->netdev; + if (!ndev) + continue; + err = _oecls_dev_flow_table_init(ndev); + if (err) + goto out; + } + + return 0; +out: + for (i = 0; i < netdev_loop; i++) { + 
netdev_info = get_oecls_netdev_info(i); + ndev = netdev_info->netdev; + if (!ndev) + continue; + oecls_dev_flow_table_cleanup(ndev, ndev->num_rx_queues); + } + return err; +} + +static int oecls_sock_flow_table_release(void) +{ + struct oecls_sock_flow_table *tb; + + mutex_lock(&oecls_sock_flow_mutex); + tb = rcu_dereference_protected(oecls_sock_flow_table, + lockdep_is_held(&oecls_sock_flow_mutex)); + if (tb) + rcu_assign_pointer(oecls_sock_flow_table, NULL); + mutex_unlock(&oecls_sock_flow_mutex); + synchronize_rcu(); + vfree(tb); + + return 0; +} + +static int oecls_sock_flow_table_init(void) +{ + struct oecls_sock_flow_table *table; + int size = OECLS_SOCK_FLOW_TABLE_NUM; + int i; + + size = roundup_pow_of_two(size); + table = vmalloc(OECLS_SOCK_FLOW_TABLE_SIZE(size)); + if (!table) + return -ENOMEM; + + oecls_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1; + oecls_debug("nr_cpu_ids:%d, oecls_cpu_mask:0x%x\n", nr_cpu_ids, oecls_cpu_mask); + + table->mask = size - 1; + for (i = 0; i < size; i++) + table->ents[i] = OECLS_NO_CPU; + + mutex_lock(&oecls_sock_flow_mutex); + rcu_assign_pointer(oecls_sock_flow_table, table); + mutex_unlock(&oecls_sock_flow_mutex); + + return 0; +} + +static const struct oecls_hook_ops oecls_flow_ops = { + .oecls_flow_update = _oecls_flow_update, + .oecls_set_cpu = _oecls_set_cpu, + .oecls_timeout = _oecls_timeout, + .oecls_cfg_rxcls = NULL, +}; + +void oecls_flow_res_init(void) +{ + oecls_sock_flow_table_init(); + oecls_dev_flow_table_init(); + RCU_INIT_POINTER(oecls_ops, &oecls_flow_ops); +} + +void oecls_flow_res_clean(void) +{ + RCU_INIT_POINTER(oecls_ops, NULL); + oecls_sock_flow_table_release(); + oecls_dev_flow_table_release(); +} diff --git a/net/oenetcls/oenetcls_main.c b/net/oenetcls/oenetcls_main.c new file mode 100644 index 000000000000..4be09a3f56cb --- /dev/null +++ b/net/oenetcls/oenetcls_main.c @@ -0,0 +1,1076 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include +#include "oenetcls.h" + +int oecls_netdev_num; +static struct oecls_netdev_info oecls_netdev_info_table[OECLS_MAX_NETDEV_NUM]; + +int oecls_numa_num; +static int oecls_cluster_cpu_num, oecls_cluster_per_numa; +static struct oecls_numa_info *oecls_numa_info_table; + +int oecls_debug_lvl; +module_param(oecls_debug_lvl, int, 0644); +MODULE_PARM_DESC(oecls_debug_lvl, "debug switch"); + +static int mode; +module_param(mode, int, 0444); +MODULE_PARM_DESC(mode, "mode, default 0"); + +static char ifname[64] = { 0 }; +module_param_string(ifname, ifname, sizeof(ifname), 0444); +MODULE_PARM_DESC(ifname, "ifname"); + +static char appname[64] = "redis-server"; +module_param_string(appname, appname, sizeof(appname), 0644); +MODULE_PARM_DESC(appname, "appname, default redis-server"); + +int match_ip_flag = 1; +module_param(match_ip_flag, int, 0644); +MODULE_PARM_DESC(match_ip_flag, "match ip flag"); + +static int strategy; +module_param(strategy, int, 0444); +MODULE_PARM_DESC(strategy, "strategy, default 0"); + +static bool check_params(void) +{ + if (mode != 0 && mode != 1) + return false; + + if (strlen(ifname) == 0) + return false; + + return true; +} + +int check_appname(char *task_name) +{ + char *start = appname, *end; + + if (!strlen(appname)) + return 0; + + // support appname: app1#app2#appN + while (*start != '\0') { + end = strchr(start, '#'); + if (end == start) { + start++; + continue; + } + + if (!end) { + if (!strncmp(task_name, start, strlen(start))) + return 0; + break; + } + + if (!strncmp(task_name, start, end - start)) + 
return 0; + start = end + 1; + } + return -EOPNOTSUPP; +} + +static u32 __ethtool_get_flags(struct net_device *dev) +{ + u32 flags = 0; + + if (dev->features & NETIF_F_LRO) + flags |= ETH_FLAG_LRO; + if (dev->features & NETIF_F_HW_VLAN_CTAG_RX) + flags |= ETH_FLAG_RXVLAN; + if (dev->features & NETIF_F_HW_VLAN_CTAG_TX) + flags |= ETH_FLAG_TXVLAN; + if (dev->features & NETIF_F_NTUPLE) + flags |= ETH_FLAG_NTUPLE; + if (dev->features & NETIF_F_RXHASH) + flags |= ETH_FLAG_RXHASH; + + return flags; +} + +static int __ethtool_set_flags(struct net_device *dev, u32 data) +{ + netdev_features_t features = 0, changed; + + if (data & ~ETH_ALL_FLAGS) + return -EINVAL; + + if (data & ETH_FLAG_LRO) + features |= NETIF_F_LRO; + if (data & ETH_FLAG_RXVLAN) + features |= NETIF_F_HW_VLAN_CTAG_RX; + if (data & ETH_FLAG_TXVLAN) + features |= NETIF_F_HW_VLAN_CTAG_TX; + if (data & ETH_FLAG_NTUPLE) + features |= NETIF_F_NTUPLE; + if (data & ETH_FLAG_RXHASH) + features |= NETIF_F_RXHASH; + + /* allow changing only bits set in hw_features */ + changed = (features ^ dev->features) & ETH_ALL_FEATURES; + if (changed & ~dev->hw_features) + return (changed & dev->hw_features) ? -EINVAL : -EOPNOTSUPP; + + dev->wanted_features = + (dev->wanted_features & ~changed) | (features & changed); + + __netdev_update_features(dev); + + return 0; +} + +static void ethtool_rxnfc_copy_to_user(void *useraddr, + const struct ethtool_rxnfc *rxnfc, + size_t size, const u32 *rule_buf) +{ + memcpy_r(useraddr, rxnfc, size); + useraddr += offsetof(struct ethtool_rxnfc, rule_locs); + + if (rule_buf) + memcpy_r(useraddr, rule_buf, rxnfc->rule_cnt * sizeof(u32)); +} + +static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, + u32 cmd, void *useraddr) +{ + struct ethtool_rxnfc info; + size_t info_size = sizeof(info); + int rc; + + if (!dev->ethtool_ops->set_rxnfc) + return -EOPNOTSUPP; + + if (cmd == ETHTOOL_SRXFH) + info_size = (offsetof(struct ethtool_rxnfc, data) + + sizeof(info.data)); + + memcpy_r(&info, useraddr, info_size); + rc = dev->ethtool_ops->set_rxnfc(dev, &info); + if (rc) + return rc; + + if (cmd == ETHTOOL_SRXCLSRLINS) + ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL); + + return 0; +} + +static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, + u32 cmd, void *useraddr) +{ + struct ethtool_rxnfc info; + size_t info_size = sizeof(info); + const struct ethtool_ops *ops = dev->ethtool_ops; + int ret; + void *rule_buf = NULL; + + if (!ops->get_rxnfc) + return -EOPNOTSUPP; + + if (cmd == ETHTOOL_GRXFH) + info_size = (offsetof(struct ethtool_rxnfc, data) + + sizeof(info.data)); + + memcpy_r(&info, useraddr, info_size); + + /* If FLOW_RSS was requested then user-space must be using the + * new definition, as FLOW_RSS is newer. + */ + if (cmd == ETHTOOL_GRXFH && info.flow_type & FLOW_RSS) { + info_size = sizeof(info); + memcpy_r(&info, useraddr, info_size); + /* Since malicious users may modify the original data, + * we need to check whether FLOW_RSS is still requested. 
+ */ + if (!(info.flow_type & FLOW_RSS)) + return -EINVAL; + } + + if (info.cmd != cmd) + return -EINVAL; + + if (info.cmd == ETHTOOL_GRXCLSRLALL) { + if (info.rule_cnt > 0) { + if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) + rule_buf = kcalloc(info.rule_cnt, sizeof(u32), + GFP_KERNEL); + if (!rule_buf) + return -ENOMEM; + } + } + + ret = ops->get_rxnfc(dev, &info, rule_buf); + if (ret < 0) + goto err_out; + + ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, rule_buf); +err_out: + kfree(rule_buf); + + return ret; +} + +static noinline_for_stack int ethtool_get_channels(struct net_device *dev, + void *useraddr) +{ + struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS }; + + if (!dev->ethtool_ops->get_channels) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_channels(dev, &channels); + + memcpy_r(useraddr, &channels, sizeof(channels)); + return 0; +} + +static int ethtool_get_value(struct net_device *dev, char *useraddr, + u32 cmd, u32 (*actor)(struct net_device *)) +{ + struct ethtool_value edata = { .cmd = cmd }; + + if (!actor) + return -EOPNOTSUPP; + + edata.data = actor(dev); + + memcpy_r(useraddr, &edata, sizeof(edata)); + return 0; +} + +static int ethtool_set_value(struct net_device *dev, char *useraddr, + int (*actor)(struct net_device *, u32)) +{ + struct ethtool_value edata; + + if (!actor) + return -EOPNOTSUPP; + + memcpy_r(&edata, useraddr, sizeof(edata)); + + return actor(dev, edata.data); +} + +static int dev_ethtool_kern(struct net *net, struct ifreq *ifr) +{ + struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); + void *useraddr = ifr->ifr_data; + u32 ethcmd, sub_cmd; + int rc; + netdev_features_t old_features; + + if (!dev || !netif_device_present(dev)) + return -ENODEV; + + memcpy_r(ðcmd, useraddr, sizeof(ethcmd)); + + if (ethcmd == ETHTOOL_PERQUEUE) + memcpy_r(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd)); + else + sub_cmd = ethcmd; + + /* Allow some commands to be done by anyone */ + switch (sub_cmd) { + case ETHTOOL_GFLAGS: + case ETHTOOL_GRXFH: + case ETHTOOL_GRXRINGS: + case ETHTOOL_GRXCLSRLCNT: + case ETHTOOL_GRXCLSRULE: + case ETHTOOL_GRXCLSRLALL: + case ETHTOOL_GCHANNELS: + break; + default: + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + } + + if (dev->ethtool_ops->begin) { + rc = dev->ethtool_ops->begin(dev); + if (rc < 0) + return rc; + } + old_features = dev->features; + + switch (ethcmd) { + case ETHTOOL_GFLAGS: + rc = ethtool_get_value(dev, useraddr, ethcmd, + __ethtool_get_flags); + break; + case ETHTOOL_SFLAGS: + rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags); + break; + case ETHTOOL_GRXFH: + case ETHTOOL_GRXRINGS: + case ETHTOOL_GRXCLSRLCNT: + case ETHTOOL_GRXCLSRULE: + case ETHTOOL_GRXCLSRLALL: + rc = ethtool_get_rxnfc(dev, ethcmd, useraddr); + break; + case ETHTOOL_SRXFH: + case ETHTOOL_SRXCLSRLDEL: + case ETHTOOL_SRXCLSRLINS: + rc = ethtool_set_rxnfc(dev, ethcmd, useraddr); + break; + case ETHTOOL_GCHANNELS: + rc = ethtool_get_channels(dev, useraddr); + break; + default: + rc = -EOPNOTSUPP; + } + + if (dev->ethtool_ops->complete) + dev->ethtool_ops->complete(dev); + + if (old_features != dev->features) + netdev_features_change(dev); + + return rc; +} + +int send_ethtool_ioctl(struct cmd_context *ctx, void *cmd) +{ + struct ifreq ifr = {0}; + int ret; + + strscpy(ifr.ifr_name, ctx->netdev, sizeof(ifr.ifr_name)); + ifr.ifr_data = cmd; + + rtnl_lock(); + ret = dev_ethtool_kern(&init_net, &ifr); + rtnl_unlock(); + + return ret; +} + +struct oecls_netdev_info 
*get_oecls_netdev_info(unsigned int index) +{ + if (index >= OECLS_MAX_NETDEV_NUM) + return NULL; + return &oecls_netdev_info_table[index]; +} + +static struct oecls_netdev_info *alloc_oecls_netdev_info(void) +{ + if (oecls_netdev_num >= OECLS_MAX_NETDEV_NUM) + return NULL; + + return &oecls_netdev_info_table[oecls_netdev_num++]; +} + +static bool check_irq_name(const char *irq_name, struct oecls_netdev_info *oecls_dev) +{ + if (!strstr(irq_name, "TxRx") && !strstr(irq_name, "comp") && !strstr(irq_name, "rx")) + return false; + + if (strstr(irq_name, oecls_dev->dev_name)) + return true; + + if (oecls_dev->netdev->dev.parent && + strstr(irq_name, dev_name(oecls_dev->netdev->dev.parent))) + return true; + + return false; +} + +static void get_netdev_queue_info(struct oecls_netdev_info *oecls_dev) +{ + struct oecls_netdev_queue_info *rxq_info; + struct irq_desc *desc; + int irq, cpu; + + for_each_irq_desc(irq, desc) { + if (!desc->action) + continue; + if (!desc->action->name) + continue; + if (!check_irq_name(desc->action->name, oecls_dev)) + continue; + if (oecls_dev->rxq_num >= OECLS_MAX_RXQ_NUM_PER_DEV) + break; + rxq_info = &oecls_dev->rxq[oecls_dev->rxq_num++]; + rxq_info->irq = irq; + cpu = cpumask_first(irq_data_get_effective_affinity_mask(&desc->irq_data)); + rxq_info->affinity_cpu = cpu; + oecls_debug("irq=%d, [%s], rxq_id=%d affinity_cpu:%d\n", + irq, desc->action->name, oecls_dev->rxq_num, cpu); + } +} + +static int oecls_filter_enable(const char *dev_name, bool *old_state) +{ + struct ethtool_value eval = {0}; + struct cmd_context ctx = {0}; + int ret; + + strscpy(ctx.netdev, dev_name, IFNAMSIZ); + + eval.cmd = ETHTOOL_GFLAGS; + ret = send_ethtool_ioctl(&ctx, &eval); + if (ret != 0) { + oecls_error("get %s flags fail, ret:%d\n", dev_name, ret); + return ret; + } + if (eval.data & ETH_FLAG_NTUPLE) { + *old_state = true; + oecls_debug("%s ntuple is already on\n", dev_name); + return 0; + } + + // Set ntuple feature + eval.cmd = ETHTOOL_SFLAGS; + eval.data |= ETH_FLAG_NTUPLE; + ret = send_ethtool_ioctl(&ctx, &eval); + if (ret != 0) { + oecls_error("set %s flags fail, ret:%d\n", dev_name, ret); + return ret; + } + + // Get ntuple feature + eval.cmd = ETHTOOL_GFLAGS; + eval.data = 0; + ret = send_ethtool_ioctl(&ctx, &eval); + if (ret != 0) { + oecls_error("get %s flags fail, ret:%d\n", dev_name, ret); + return ret; + } + if (!(eval.data & ETH_FLAG_NTUPLE)) { + oecls_error("enable ntuple feature fail!\n"); + return -EOPNOTSUPP; + } + + return 0; +} + +static void oecls_filter_restore(const char *dev_name, bool old_state) +{ + struct ethtool_value eval = {0}; + struct cmd_context ctx = {0}; + bool cur_filter_state; + int ret; + + strscpy(ctx.netdev, dev_name, IFNAMSIZ); + + eval.cmd = ETHTOOL_GFLAGS; + ret = send_ethtool_ioctl(&ctx, &eval); + if (ret != 0) { + oecls_error("get %s flags fail, ret:%d\n", dev_name, ret); + return; + } + + cur_filter_state = (eval.data & ETH_FLAG_NTUPLE) ? true : false; + if (cur_filter_state == old_state) + return; + + // Set ntuple feature + eval.cmd = ETHTOOL_SFLAGS; + if (old_state) + eval.data |= ETH_FLAG_NTUPLE; + else + eval.data &= ~ETH_FLAG_NTUPLE; + ret = send_ethtool_ioctl(&ctx, &eval); + if (ret != 0) { + oecls_error("set %s flags fail, ret:%d\n", dev_name, ret); + return; + } +} + +static int init_single_oecls_dev(char *if_name, unsigned int length) +{ + struct oecls_netdev_info *oecls_dev; + char dev_name[IFNAMSIZ] = { 0 }; + struct net_device *netdev; + int cpy_len = length < IFNAMSIZ ? 
length : IFNAMSIZ; + bool old_state = false; + int ret; + + strscpy(dev_name, if_name, cpy_len); + netdev = dev_get_by_name(&init_net, dev_name); + if (!netdev) { + oecls_error("dev [%s] is not exist!\n", dev_name); + return -ENODEV; + } + + if (!(netdev->flags & IFF_UP)) { + ret = -ENETDOWN; + oecls_error("dev:%s not up! flags=%d.\n", dev_name, netdev->flags); + goto out; + } + + if (netdev->flags & IFF_LOOPBACK) { + ret = -EOPNOTSUPP; + oecls_error("Do not support loopback.\n"); + goto out; + } + + ret = oecls_filter_enable(dev_name, &old_state); + if (ret) { + oecls_error("dev [%s] not support ntuple! ret=%d\n", dev_name, ret); + goto out; + } + + oecls_dev = alloc_oecls_netdev_info(); + if (!oecls_dev) { + ret = -ENOMEM; + oecls_filter_restore(dev_name, old_state); + oecls_error("alloc oecls_dev fail! oecls_netdev_num:%d\n", oecls_netdev_num); + goto out; + } + + memcpy_r(oecls_dev->dev_name, dev_name, IFNAMSIZ); + oecls_dev->old_filter_state = old_state; + oecls_dev->netdev = netdev; + get_netdev_queue_info(oecls_dev); + return 0; + +out: + dev_put(netdev); + return ret; +} + +static void clean_oecls_netdev_info(void) +{ + struct oecls_netdev_info *oecls_dev; + struct net_device *netdev; + int devid; + + for_each_oecls_netdev(devid, oecls_dev) { + oecls_filter_restore(oecls_dev->dev_name, oecls_dev->old_filter_state); + netdev = oecls_dev->netdev; + if (netdev) { + oecls_dev->netdev = NULL; + dev_put(netdev); + } + } + + oecls_netdev_num = 0; +} + +static int init_oecls_netdev_info(char *netdev_str) +{ + char *start = netdev_str, *end; + int err = -ENODEV; + + while (*start != '\0') { + // skip start # + end = strchr(start, '#'); + if (end == start) { + start++; + continue; + } + + // find the last ifname + if (!end) { + err = init_single_oecls_dev(start, strlen(start)); + break; + } + + err = init_single_oecls_dev(start, end - start); + if (err) + break; + start = end + 1; + } + + return err; +} + +struct oecls_numa_info *get_oecls_numa_info(unsigned int nid) +{ + if (nid >= oecls_numa_num) + return NULL; + return &oecls_numa_info_table[nid]; +} + +static void clean_oecls_numa_info(void) +{ + oecls_numa_num = 0; + kfree(oecls_numa_info_table); +} + +static void init_numa_avail_cpus(int nid, struct oecls_numa_info *numa_info) +{ + int cpu; + + oecls_debug("numa node %d: %*pb, %*pbl\n", nid, cpumask_pr_args(cpumask_of_node(nid)), + cpumask_pr_args(cpumask_of_node(nid))); + + bitmap_zero(numa_info->avail_cpus, OECLS_MAX_CPU_NUM); + for_each_cpu(cpu, cpumask_of_node(nid)) { + if (cpu >= OECLS_MAX_CPU_NUM) + return; + set_bit(cpu, numa_info->avail_cpus); + } +} + +static void clean_oecls_rxq(void) +{ + struct oecls_numa_bound_dev_info *bound_dev; + struct oecls_netdev_info *oecls_dev; + struct oecls_numa_info *numa_info; + int nid, devid; + + for_each_oecls_numa(nid, numa_info) { + for_each_oecls_netdev(devid, oecls_dev) { + bound_dev = &numa_info->bound_dev[devid]; + kfree(bound_dev->cluster_info); + } + } +} + +static int init_numa_rxq_bitmap(int nid, struct oecls_numa_info *numa_info) +{ + int bound_rxq_num, cluster_id, cluster_idx, cur_idx; + struct oecls_numa_bound_dev_info *bound_dev; + struct oecls_netdev_info *oecls_dev; + int rxq_id, devid, cpu, ret = 0; + + for_each_oecls_netdev(devid, oecls_dev) { + bound_rxq_num = 0; + bound_dev = &numa_info->bound_dev[devid]; + bitmap_zero(bound_dev->bitmap_rxq, OECLS_MAX_RXQ_NUM_PER_DEV); + bound_dev->cluster_info = kcalloc(oecls_cluster_per_numa, + sizeof(*bound_dev->cluster_info), GFP_ATOMIC); + if (!bound_dev->cluster_info) { + ret = 
-ENOMEM; + goto out; + } + + for (rxq_id = 0; rxq_id < oecls_dev->rxq_num; rxq_id++) { + cpu = oecls_dev->rxq[rxq_id].affinity_cpu; + if (cpu_to_node(cpu) == nid) { + set_bit(rxq_id, bound_dev->bitmap_rxq); + cluster_id = cpu / oecls_cluster_cpu_num; + cluster_idx = cluster_id % oecls_cluster_per_numa; + bound_dev->cluster_info[cluster_idx].cluster_id = cluster_id; + cur_idx = bound_dev->cluster_info[cluster_idx].cur_freeidx++; + bound_dev->cluster_info[cluster_idx].rxqs[cur_idx].rxq_id = rxq_id; + bound_dev->cluster_info[cluster_idx].rxqs[cur_idx].status = 1; + bound_rxq_num++; + oecls_debug("cpu:%d cluster_id:%d cluster_idx:%d rxq_id:%d cur_idx:%d\n", + cpu, cluster_id, cluster_idx, rxq_id, cur_idx); + } + } + + oecls_debug("nid:%d, dev_id:%d, dev:%s, rxq_num:%d, bit_num:%d, bitmap_rxq:%*pbl\n", + nid, devid, oecls_dev->dev_name, oecls_dev->rxq_num, + bound_rxq_num, OECLS_MAX_RXQ_NUM_PER_DEV, bound_dev->bitmap_rxq); + } + return ret; + +out: + clean_oecls_rxq(); + return ret; +} + +static int get_cluster_rxq(struct oecls_numa_bound_dev_info *bound_dev) +{ + int cpu = raw_smp_processor_id(); + int cluster_id = cpu / oecls_cluster_cpu_num; + int i, j, rxq_id; + + for (i = 0; i < oecls_cluster_per_numa; i++) { + if (cluster_id != bound_dev->cluster_info[i].cluster_id) + continue; + for (j = 0; j < OECLS_MAX_RXQ_NUM_PER_DEV; j++) { + if (bound_dev->cluster_info[i].rxqs[j].status == 1) { + bound_dev->cluster_info[i].rxqs[j].status = 2; + rxq_id = bound_dev->cluster_info[i].rxqs[j].rxq_id; + oecls_debug("cluster:%d cpu:%d alloc rxq_id:%d\n", + cluster_id, cpu, rxq_id); + return rxq_id; + } + } + } + oecls_debug("cluster:%d no free rxq for cpu:%d\n", cluster_id, cpu); + return -1; +} + +static int put_cluster_rxq(struct oecls_numa_bound_dev_info *bound_dev, int rxq_id) +{ + int i, j; + + for (i = 0; i < oecls_cluster_per_numa; i++) { + for (j = 0; j < OECLS_MAX_RXQ_NUM_PER_DEV; j++) { + if (bound_dev->cluster_info[i].rxqs[j].status == 2 && + bound_dev->cluster_info[i].rxqs[j].rxq_id == rxq_id) { + bound_dev->cluster_info[i].rxqs[j].status = 1; + oecls_debug("free rxq_id:%d\n", rxq_id); + return 0; + } + } + } + oecls_debug("no match malloced rxq_id:%d\n", rxq_id); + return -1; +} + +int alloc_rxq_id(int nid, int devid) +{ + struct oecls_numa_bound_dev_info *bound_dev; + struct oecls_numa_info *numa_info; + int rxq_id; + + numa_info = get_oecls_numa_info(nid); + if (!numa_info) { + oecls_error("error nid:%d\n", nid); + return -EINVAL; + } + + if (devid >= OECLS_MAX_NETDEV_NUM) { + oecls_error("error bound_dev index:%d\n", devid); + return -EINVAL; + } + bound_dev = &numa_info->bound_dev[devid]; + + if (strategy == 1) { + rxq_id = get_cluster_rxq(bound_dev); + if (rxq_id < 0 || rxq_id >= OECLS_MAX_RXQ_NUM_PER_DEV) + pr_info("failed to get rxq_id:%d in cluster, try numa\n", rxq_id); + else + goto found; + } + + rxq_id = find_first_bit(bound_dev->bitmap_rxq, OECLS_MAX_RXQ_NUM_PER_DEV); + if (rxq_id >= OECLS_MAX_RXQ_NUM_PER_DEV) { + oecls_error("error rxq_id:%d\n", rxq_id); + return -EINVAL; + } + +found: + clear_bit(rxq_id, bound_dev->bitmap_rxq); + oecls_debug("alloc nid:%d, dev_id:%d, rxq_id:%d\n", nid, devid, rxq_id); + return rxq_id; +} + +void free_rxq_id(int nid, int devid, int rxq_id) +{ + struct oecls_numa_bound_dev_info *bound_dev; + struct oecls_numa_info *numa_info; + + numa_info = get_oecls_numa_info(nid); + if (!numa_info) { + oecls_error("error nid:%d\n", nid); + return; + } + + if (devid >= OECLS_MAX_NETDEV_NUM) { + oecls_error("error bound_dev index:%d\n", devid); + return; + } + 
bound_dev = &numa_info->bound_dev[devid]; + + if (rxq_id >= OECLS_MAX_RXQ_NUM_PER_DEV) { + oecls_error("error rxq_id:%d\n", rxq_id); + return; + } + + if (strategy == 1) + put_cluster_rxq(bound_dev, rxq_id); + + if (test_bit(rxq_id, bound_dev->bitmap_rxq)) { + oecls_error("error nid:%d, devid:%d, rxq_id:%d\n", nid, devid, rxq_id); + return; + } + + set_bit(rxq_id, bound_dev->bitmap_rxq); + oecls_debug("free nid:%d, dev_id:%d, rxq_id:%d\n", nid, devid, rxq_id); +} + +static int init_oecls_numa_info(void) +{ + struct oecls_numa_info *numa_info; + int nid, ret = 0; + + oecls_numa_num = num_online_nodes(); + oecls_numa_info_table = kcalloc(oecls_numa_num, sizeof(*oecls_numa_info_table), + GFP_ATOMIC); + if (!oecls_numa_info_table) { + ret = -ENOMEM; + oecls_error("oecls_numa_info_table alloc failed:%d\n", ret); + return ret; + } + + oecls_cluster_cpu_num = cpumask_weight(topology_cluster_cpumask(raw_smp_processor_id())); + oecls_cluster_per_numa = (nr_cpu_ids / oecls_cluster_cpu_num) / oecls_numa_num; + oecls_debug("oecls_numa_num=%d cluster_cpu_num:%d cluster_cpu_num:%d\n", + oecls_numa_num, oecls_cluster_per_numa, oecls_cluster_cpu_num); + + for_each_oecls_numa(nid, numa_info) + init_numa_avail_cpus(nid, numa_info); + + return ret; +} + +static int alloc_available_cpu(int nid, struct oecls_numa_info *numa_info) +{ + int cpu; + + cpu = find_first_bit(numa_info->avail_cpus, OECLS_MAX_CPU_NUM); + if (cpu >= OECLS_MAX_CPU_NUM) { + oecls_error("no available cpus: nid=%d, cpu=%d\n", nid, cpu); + return -1; + } + + clear_bit(cpu, numa_info->avail_cpus); + return cpu; +} + +static void add_netdev_irq_affinity_cpu(struct oecls_netdev_info *oecls_dev, int rxq_id, int cpu) +{ + struct oecls_netdev_queue_info *rxq_info; + + if (rxq_id >= OECLS_MAX_RXQ_NUM_PER_DEV) + return; + + rxq_info = &oecls_dev->rxq[rxq_id]; + rxq_info->affinity_cpu = cpu; +} + +static void config_affinity_strategy_default(struct oecls_netdev_info *oecls_dev) +{ + struct oecls_numa_info *numa_info; + int rxq_num = oecls_dev->rxq_num; + int rxq_per_numa = rxq_num / oecls_numa_num; + int remain = rxq_num - rxq_per_numa * oecls_numa_num; + int numa_rxq_id, rxq_id, nid, cpu; + + oecls_debug("dev=%s, rxq_num=%d, rxq_per_numa=%d, remain=%d\n", oecls_dev->dev_name, + rxq_num, rxq_per_numa, remain); + + // average config rxq to every numa + for_each_oecls_numa(nid, numa_info) { + for (numa_rxq_id = 0; numa_rxq_id < rxq_per_numa; numa_rxq_id++) { + cpu = alloc_available_cpu(nid, numa_info); + if (cpu < 0) + break; + + rxq_id = rxq_per_numa * nid + numa_rxq_id; + add_netdev_irq_affinity_cpu(oecls_dev, rxq_id, cpu); + } + } + + if (!remain) + return; + + // config remain rxq to every numa + numa_rxq_id = 0; + for_each_oecls_numa(nid, numa_info) { + if (numa_rxq_id >= remain) + break; + cpu = alloc_available_cpu(nid, numa_info); + if (cpu < 0) + break; + + rxq_id = rxq_per_numa * oecls_numa_num + numa_rxq_id; + numa_rxq_id++; + add_netdev_irq_affinity_cpu(oecls_dev, rxq_id, cpu); + } +} + +static void config_affinity_strategy_cluster(struct oecls_netdev_info *oecls_dev) +{ + int rxq_num = oecls_dev->rxq_num; + int rxq_per_numa = rxq_num / oecls_numa_num; + int remain = rxq_num - rxq_per_numa * oecls_numa_num; + int cpu_idx = oecls_cluster_cpu_num - 1; + int cluster, cpu, rxq_id = 0, round; + + round = rxq_per_numa < oecls_cluster_per_numa ? 
rxq_per_numa : oecls_cluster_per_numa; + if (remain > 0) + round++; + oecls_debug("round=%d\n", round); + + while (rxq_id < oecls_dev->rxq_num) { + for (cluster = 0; cluster < oecls_cluster_per_numa * oecls_numa_num; cluster++) { + if (cluster % oecls_cluster_per_numa >= round) + continue; + cpu = cluster * oecls_cluster_cpu_num + cpu_idx; + if (rxq_id >= oecls_dev->rxq_num) + break; + add_netdev_irq_affinity_cpu(oecls_dev, rxq_id++, cpu); + } + cpu_idx--; + if (--cpu_idx < 0) + cpu_idx = oecls_cluster_cpu_num - 1; + } +} + +static void config_affinity_strategy_numa(struct oecls_netdev_info *oecls_dev) +{ + int rxq_num = oecls_dev->rxq_num; + int rxq_per_numa = rxq_num / oecls_numa_num; + int cpu_per_numa = nr_cpu_ids / oecls_numa_num; + int remain = rxq_num - rxq_per_numa * oecls_numa_num; + struct oecls_numa_info *numa_info; + int numa_start_cpu, numa_cpu_id; + int rxq_id = 0, nid, cpu; + + for_each_oecls_numa(nid, numa_info) { + numa_start_cpu = find_first_bit(numa_info->avail_cpus, OECLS_MAX_CPU_NUM); + for (numa_cpu_id = 0; numa_cpu_id < rxq_per_numa; numa_cpu_id++) { + cpu = numa_start_cpu + (numa_cpu_id % cpu_per_numa); + if (rxq_id >= oecls_dev->rxq_num) + break; + add_netdev_irq_affinity_cpu(oecls_dev, rxq_id++, cpu); + } + if (remain-- > 0) { + cpu = numa_start_cpu + (numa_cpu_id % cpu_per_numa); + add_netdev_irq_affinity_cpu(oecls_dev, rxq_id++, cpu); + } + } +} + +static void config_affinity_strategy_custom(struct oecls_netdev_info *oecls_dev) +{ + oecls_debug("dev=%s\n", oecls_dev->dev_name); +} + +static void config_affinity_strategy(void) +{ + struct oecls_netdev_info *oecls_dev; + int devid; + + for_each_oecls_netdev(devid, oecls_dev) { + switch (strategy) { + case 1: + config_affinity_strategy_cluster(oecls_dev); + break; + case 2: + config_affinity_strategy_numa(oecls_dev); + break; + case 3: + config_affinity_strategy_custom(oecls_dev); + break; + case 0: + default: + config_affinity_strategy_default(oecls_dev); + break; + } + } +} + +static inline void irq_set_affinity_wrapper(int rxq, int irq, int cpu) +{ + int err = 0; + + err = irq_set_affinity(irq, get_cpu_mask(cpu)); + oecls_debug("rxq=%d, irq=%d, cpu=%d, err=%d\n", rxq, irq, cpu, err); +} + +static void enable_affinity_strategy(void) +{ + struct oecls_netdev_queue_info *rxq_info; + struct oecls_netdev_info *oecls_dev; + int rxq_id, devid; + + for_each_oecls_netdev(devid, oecls_dev) { + for (rxq_id = 0; rxq_id < oecls_dev->rxq_num; rxq_id++) { + rxq_info = &oecls_dev->rxq[rxq_id]; + irq_set_affinity_wrapper(rxq_id, rxq_info->irq, rxq_info->affinity_cpu); + } + } +} + +static inline void netif_set_xps_queue_wrapper(struct net_device *netdev, int rxq_id, + const struct cpumask *cpu_mask) +{ + int err = 0; + + err = netif_set_xps_queue(netdev, cpu_mask, rxq_id); + oecls_debug("name=%s, rxq_id=%d, mask=%*pbl, err=%d\n", netdev->name, rxq_id, + cpumask_pr_args(cpu_mask), err); +} + +static void set_netdev_xps_queue(bool enable) +{ + const struct cpumask clear_mask = { 0 }; + struct oecls_netdev_info *oecls_dev; + const struct cpumask *cpu_mask; + int rxq_id, devid, cpu, nid; + + for_each_oecls_netdev(devid, oecls_dev) { + for (rxq_id = 0; rxq_id < oecls_dev->rxq_num; rxq_id++) { + cpu = oecls_dev->rxq[rxq_id].affinity_cpu; + nid = cpu_to_node(cpu); + if (enable) + cpu_mask = cpumask_of_node(nid); + else + cpu_mask = &clear_mask; + + netif_set_xps_queue_wrapper(oecls_dev->netdev, rxq_id, cpu_mask); + } + } +} + +static __init int oecls_init(void) +{ + struct oecls_numa_info *numa_info; + int nid, err; + + if 
(!check_params()) + return -EINVAL; + + err = init_oecls_numa_info(); + if (err) + return err; + + err = init_oecls_netdev_info(ifname); + if (err) + goto clean_numa; + + // Set irq affinity + config_affinity_strategy(); + enable_affinity_strategy(); + + // Calculate rxq bounded to one numa + for_each_oecls_numa(nid, numa_info) { + err = init_numa_rxq_bitmap(nid, numa_info); + if (err) + goto clean_rxq; + } + +#ifdef CONFIG_XPS + set_netdev_xps_queue(true); +#endif + + if (mode == 0) + oecls_ntuple_res_init(); + else + oecls_flow_res_init(); + + return 0; + +clean_rxq: +clean_numa: + clean_oecls_netdev_info(); + clean_oecls_numa_info(); + return err; +} + +static __exit void oecls_exit(void) +{ + if (mode == 0) + oecls_ntuple_res_clean(); + else + oecls_flow_res_clean(); + +#ifdef CONFIG_XPS + set_netdev_xps_queue(false); +#endif + + clean_oecls_rxq(); + clean_oecls_netdev_info(); + clean_oecls_numa_info(); +} + +module_init(oecls_init); +module_exit(oecls_exit); + +MODULE_DESCRIPTION("oenetcls"); +MODULE_LICENSE("GPL v2"); diff --git a/net/oenetcls/oenetcls_ntuple.c b/net/oenetcls/oenetcls_ntuple.c new file mode 100644 index 000000000000..3b69f927d25b --- /dev/null +++ b/net/oenetcls/oenetcls_ntuple.c @@ -0,0 +1,572 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "oenetcls.h" + +struct oecls_sk_rule_list oecls_sk_rules, oecls_sk_list; + +static void init_oecls_sk_rules(void) +{ + unsigned int i; + + for (i = 0; i < OECLS_SK_RULE_HASHSIZE; i++) + INIT_HLIST_HEAD(oecls_sk_rules.hash + i); + mutex_init(&oecls_sk_rules.mutex); +} + +static inline struct hlist_head *get_rule_hashlist(u32 dip4, u16 dport) +{ + return oecls_sk_rules.hash + (jhash_2words(dip4, dport, 0) & OECLS_SK_RULE_HASHMASK); +} + +static inline struct hlist_head *get_sk_hashlist(void *sk) +{ + return oecls_sk_list.hash + (jhash(sk, sizeof(sk), 0) & OECLS_SK_RULE_HASHMASK); +} + +static void add_sk_rule(int devid, u32 dip4, u16 dport, void *sk, int action, + int ruleid, int nid) +{ + struct hlist_head *hlist = get_rule_hashlist(dip4, dport); + struct hlist_head *sk_hlist = get_sk_hashlist(sk); + struct oecls_sk_rule *rule; + struct oecls_sk_entry *entry; + + rule = kzalloc(sizeof(*rule), GFP_ATOMIC); + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (!rule || !entry) + goto out; + + rule->sk = sk; + rule->dip4 = dip4; + rule->dport = dport; + rule->devid = devid; + rule->action = action; + rule->ruleid = ruleid; + rule->nid = nid; + hlist_add_head(&rule->node, hlist); + + entry->sk = sk; + entry->sk_rule_hash = jhash_2words(dip4, dport, 0); + hlist_add_head(&entry->node, sk_hlist); + return; +out: + oecls_debug("alloc failed rule:%p entry:%p\n", rule, entry); + kfree(entry); + kfree(rule); +} + +static struct oecls_sk_entry *get_sk_entry(void *sk) +{ + struct hlist_head *sk_hlist = get_sk_hashlist(sk); + struct oecls_sk_entry *entry = NULL; + + hlist_for_each_entry(entry, sk_hlist, node) { + if (entry->sk == sk) + break; + } + return entry; +} + +static void del_sk_rule(struct oecls_sk_rule *rule) +{ + struct oecls_sk_entry *entry; + + entry = get_sk_entry(rule->sk); + if (!entry) + return; + hlist_del_init(&entry->node); + kfree(entry); + + oecls_debug("del rule=%p\n", rule); + hlist_del_init(&rule->node); + kfree(rule); +} + +static struct oecls_sk_rule *get_sk_rule(int devid, u32 dip4, u16 dport) +{ + struct hlist_head *hlist = get_rule_hashlist(dip4, dport); + struct oecls_sk_rule *rule = NULL; + + 
hlist_for_each_entry(rule, hlist, node) { + if (rule->devid == devid && rule->dip4 == dip4 && rule->dport == dport) + break; + } + return rule; +} + +static struct oecls_sk_rule *get_rule_from_sk(int devid, void *sk) +{ + struct oecls_sk_rule *rule = NULL; + struct oecls_sk_entry *entry; + struct hlist_head *hlist; + + entry = get_sk_entry(sk); + if (!entry) + return NULL; + + hlist = oecls_sk_rules.hash + (entry->sk_rule_hash & OECLS_SK_RULE_HASHMASK); + hlist_for_each_entry(rule, hlist, node) { + if (rule->devid == devid && rule->sk == sk) + break; + } + return rule; +} + +static inline bool reuseport_check(int devid, u32 dip4, u16 dport) +{ + return !!get_sk_rule(devid, dip4, dport); +} + +static u32 get_first_ip4_addr(struct net *net) +{ + struct in_device *in_dev; + struct net_device *dev; + struct in_ifaddr *ifa; + u32 dip4 = 0; + + rtnl_lock(); + rcu_read_lock(); + for_each_netdev(net, dev) { + if (dev->flags & IFF_LOOPBACK || !(dev->flags & IFF_UP)) + continue; + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) + continue; + + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (!strcmp(dev->name, ifa->ifa_label)) { + dip4 = ifa->ifa_local; + oecls_debug("dev: %s, dip4:%pI4\n", dev->name, &dip4); + goto out; + } + } + } +out: + rcu_read_unlock(); + rtnl_unlock(); + return dip4; +} + +static void get_sk_rule_addr(struct sock *sk, u32 *dip4, u16 *dport) +{ + *dport = htons(sk->sk_num); + + if (!match_ip_flag) { + *dip4 = 0; + return; + } + + if (sk->sk_rcv_saddr) + *dip4 = sk->sk_rcv_saddr; + else + *dip4 = get_first_ip4_addr(sock_net(sk)); +} + +static int rxclass_rule_del(struct cmd_context *ctx, __u32 loc) +{ + struct ethtool_rxnfc nfccmd; + int err; + + nfccmd.cmd = ETHTOOL_SRXCLSRLDEL; + nfccmd.fs.location = loc; + err = send_ethtool_ioctl(ctx, &nfccmd); + if (err < 0) + oecls_debug("rmgr: Cannot delete RX class rule, loc:%u\n", loc); + return err; +} + +static int rmgr_ins(struct rmgr_ctrl *rmgr, __u32 loc) +{ + if (loc >= rmgr->size) { + oecls_error("rmgr: Location out of range\n"); + return -1; + } + + set_bit(loc, rmgr->slot); + return 0; +} + +static int rmgr_find_empty_slot(struct rmgr_ctrl *rmgr, struct ethtool_rx_flow_spec *fsp) +{ + __u32 loc, slot_num; + + if (rmgr->driver_select) + return 0; + + loc = rmgr->size - 1; + slot_num = loc / BITS_PER_LONG; + if (!~(rmgr->slot[slot_num] | (~1UL << rmgr->size % BITS_PER_LONG))) { + loc -= 1 + (loc % BITS_PER_LONG); + slot_num--; + } + + while (loc < rmgr->size && !~(rmgr->slot[slot_num])) { + loc -= BITS_PER_LONG; + slot_num--; + } + + while (loc < rmgr->size && test_bit(loc, rmgr->slot)) + loc--; + + if (loc < rmgr->size) { + fsp->location = loc; + return rmgr_ins(rmgr, loc); + } + + return -1; +} + +static int rxclass_get_dev_info(struct cmd_context *ctx, __u32 *count, int *driver_select) +{ + struct ethtool_rxnfc nfccmd; + int err; + + nfccmd.cmd = ETHTOOL_GRXCLSRLCNT; + nfccmd.data = 0; + err = send_ethtool_ioctl(ctx, &nfccmd); + *count = nfccmd.rule_cnt; + if (driver_select) + *driver_select = !!(nfccmd.data & RX_CLS_LOC_SPECIAL); + if (err < 0) + oecls_debug("rxclass: Cannot get RX class rule count\n"); + + return err; +} + +static int rmgr_init(struct cmd_context *ctx, struct rmgr_ctrl *rmgr) +{ + struct ethtool_rxnfc *nfccmd; + __u32 *rule_locs; + int i, err = 0; + + memset(rmgr, 0, sizeof(*rmgr)); + err = rxclass_get_dev_info(ctx, &rmgr->n_rules, &rmgr->driver_select); + if (err < 0) + return err; + + if (rmgr->driver_select) + return err; + + nfccmd = kzalloc(sizeof(*nfccmd) + (rmgr->n_rules * sizeof(__u32)), GFP_ATOMIC); + 
if (!nfccmd) { + oecls_error("rmgr: Cannot allocate memory for RX class rule locations\n"); + err = -ENOMEM; + goto out; + } + + nfccmd->cmd = ETHTOOL_GRXCLSRLALL; + nfccmd->rule_cnt = rmgr->n_rules; + err = send_ethtool_ioctl(ctx, nfccmd); + if (err < 0) { + oecls_debug("rmgr: Cannot get RX class rules\n"); + goto out; + } + + rmgr->size = nfccmd->data; + if (rmgr->size == 0 || rmgr->size < rmgr->n_rules) { + oecls_error("rmgr: Invalid RX class rules table size\n"); + err = -EINVAL; + goto out; + } + + rmgr->slot = kzalloc(BITS_TO_LONGS(rmgr->size) * sizeof(long), GFP_ATOMIC); + if (!rmgr->slot) { + oecls_error("rmgr: Cannot allocate memory for RX class rules\n"); + err = -ENOMEM; + goto out; + } + + rule_locs = nfccmd->rule_locs; + for (i = 0; i < rmgr->n_rules; i++) { + err = rmgr_ins(rmgr, rule_locs[i]); + if (err < 0) + break; + } + +out: + kfree(nfccmd); + return err; +} + +static void rmgr_cleanup(struct rmgr_ctrl *rmgr) +{ + kfree(rmgr->slot); + rmgr->slot = NULL; + rmgr->size = 0; +} + +static int rmgr_set_location(struct cmd_context *ctx, + struct ethtool_rx_flow_spec *fsp) +{ + struct rmgr_ctrl rmgr; + int ret; + + ret = rmgr_init(ctx, &rmgr); + if (ret < 0) + goto out; + + ret = rmgr_find_empty_slot(&rmgr, fsp); +out: + rmgr_cleanup(&rmgr); + return ret; +} + +static int rxclass_rule_ins(struct cmd_context *ctx, + struct ethtool_rx_flow_spec *fsp, u32 rss_context) +{ + struct ethtool_rxnfc nfccmd; + u32 loc = fsp->location; + int ret; + + if (loc & RX_CLS_LOC_SPECIAL) { + ret = rmgr_set_location(ctx, fsp); + if (ret < 0) + return ret; + } + + nfccmd.cmd = ETHTOOL_SRXCLSRLINS; + nfccmd.rss_context = rss_context; + nfccmd.fs = *fsp; + ret = send_ethtool_ioctl(ctx, &nfccmd); + if (ret < 0) { + oecls_debug("Can not insert the clasification rule\n"); + return ret; + } + + if (loc & RX_CLS_LOC_SPECIAL) + oecls_debug("Added rule with ID %d\n", nfccmd.fs.location); + + return 0; +} + +static void flow_spec_to_ntuple(struct ethtool_rx_flow_spec *fsp, + struct ethtool_rx_ntuple_flow_spec *ntuple) +{ + int i; + + memset(ntuple, ~0, sizeof(*ntuple)); + ntuple->flow_type = fsp->flow_type; + ntuple->action = fsp->ring_cookie; + memcpy_r(&ntuple->h_u, &fsp->h_u, sizeof(fsp->h_u)); + memcpy_r(&ntuple->m_u, &fsp->m_u, sizeof(fsp->m_u)); + for (i = 0; i < sizeof(ntuple->m_u); i++) + ntuple->m_u.hdata[i] ^= 0xFF; + ntuple->flow_type &= ~FLOW_EXT; +} + +static int do_srxntuple(struct cmd_context *ctx, struct ethtool_rx_flow_spec *fsp) +{ + struct ethtool_rx_ntuple ntuplecmd; + struct ethtool_value eval; + int ret = 0; + + flow_spec_to_ntuple(fsp, &ntuplecmd.fs); + + eval.cmd = ETHTOOL_GFLAGS; + ret = send_ethtool_ioctl(ctx, &eval); + if (ret || !(eval.data & ETH_FLAG_NTUPLE)) + return -1; + + ntuplecmd.cmd = ETHTOOL_SRXNTUPLE; + ret = send_ethtool_ioctl(ctx, &ntuplecmd); + if (ret) + oecls_debug("Cannot add new rule via N-tuple, ret:%d\n", ret); + + return ret; +} + +static int cfg_ethtool_rule(struct cmd_context *ctx, bool is_del) +{ + struct ethtool_rx_flow_spec *fsp, rx_rule_fs; + u32 rss_context = 0; + int ret; + + oecls_debug("is_del:%d netdev:%s, dip4:%pI4, dport:%d, action:%d, ruleid:%u, del_ruleid:%u\n", + is_del, ctx->netdev, &ctx->dip4, ntohs(ctx->dport), ctx->action, ctx->ruleid, + ctx->del_ruleid); + + if (is_del) + return rxclass_rule_del(ctx, ctx->del_ruleid); + + ctx->ret_loc = -1; + + fsp = &rx_rule_fs; + memset(fsp, 0, sizeof(*fsp)); + fsp->flow_type = TCP_V4_FLOW; + fsp->location = RX_CLS_LOC_ANY; + fsp->h_u.tcp_ip4_spec.ip4dst = ctx->dip4; + fsp->h_u.tcp_ip4_spec.pdst = 
ctx->dport; + if (ctx->dip4) + fsp->m_u.tcp_ip4_spec.ip4dst = (u32)~0ULL; + fsp->m_u.tcp_ip4_spec.pdst = (u16)~0ULL; + if (ctx->ruleid) + fsp->location = ctx->ruleid; + fsp->ring_cookie = ctx->action; + + ret = do_srxntuple(ctx, &rx_rule_fs); + if (!ret) + return 0; + + ret = rxclass_rule_ins(ctx, &rx_rule_fs, rss_context); + if (!ret) + ctx->ret_loc = rx_rule_fs.location; + return ret; +} + +static void del_ntuple_rule(struct sock *sk) +{ + struct oecls_netdev_info *oecls_dev; + struct cmd_context ctx = { 0 }; + struct oecls_sk_rule *rule; + int devid; + u16 dport; + u32 dip4; + int err; + + get_sk_rule_addr(sk, &dip4, &dport); + + mutex_lock(&oecls_sk_rules.mutex); + for_each_oecls_netdev(devid, oecls_dev) { + strncpy(ctx.netdev, oecls_dev->dev_name, IFNAMSIZ); + rule = get_rule_from_sk(devid, sk); + if (!rule) { + oecls_debug("rule not found! sk:%p, devid:%d, dip4:%pI4, dport:%d\n", + sk, devid, &dip4, ntohs(dport)); + continue; + } + + // Config Ntuple rule to dev + ctx.del_ruleid = rule->ruleid; + err = cfg_ethtool_rule(&ctx, true); + if (err) { + oecls_error("del sk:%p, nid:%d, devid:%d, action:%d, ruleid:%d, err:%d\n", + sk, rule->nid, devid, rule->action, rule->ruleid, err); + } + + // Free the bound queue + free_rxq_id(rule->nid, devid, rule->action); + + // Delete sk rule + del_sk_rule(rule); + } + mutex_unlock(&oecls_sk_rules.mutex); +} + +static void add_ntuple_rule(struct sock *sk) +{ + struct oecls_netdev_info *oecls_dev; + struct cmd_context ctx = { 0 }; + int cpu = raw_smp_processor_id(); + int nid = cpu_to_node(cpu); + int rxq_id; + int devid; + int err; + + if (check_appname(current->comm)) + return; + get_sk_rule_addr(sk, &ctx.dip4, &ctx.dport); + + mutex_lock(&oecls_sk_rules.mutex); + for_each_oecls_netdev(devid, oecls_dev) { + strncpy(ctx.netdev, oecls_dev->dev_name, IFNAMSIZ); + if (reuseport_check(devid, ctx.dip4, ctx.dport)) { + oecls_error("dip4:%pI4, dport:%d reuse!\n", &ctx.dip4, ctx.dport); + continue; + } + + // Calculate the bound queue + rxq_id = alloc_rxq_id(nid, devid); + if (rxq_id < 0) + continue; + + // Config Ntuple rule to dev + ctx.action = (u16)rxq_id; + err = cfg_ethtool_rule(&ctx, false); + if (err) { + oecls_error("add sk:%p, nid:%d, devid:%d, action:%d, ruleid:%d, err:%d\n", + sk, nid, devid, ctx.action, ctx.ret_loc, err); + continue; + } + + // Add sk rule + add_sk_rule(devid, ctx.dip4, ctx.dport, sk, ctx.action, ctx.ret_loc, nid); + } + mutex_unlock(&oecls_sk_rules.mutex); +} + +static void ethtool_cfg_rxcls(struct sock *sk, int is_del) +{ + if (sk->sk_state != TCP_LISTEN) + return; + + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) + return; + + oecls_debug("[cpu:%d] app:%s, sk:%p, is_del:%d, ip:%pI4, port:%d\n", raw_smp_processor_id(), + current->comm, sk, is_del, &sk->sk_rcv_saddr, (u16)sk->sk_num); + + if (is_del) + del_ntuple_rule(sk); + else + add_ntuple_rule(sk); +} + +static void clean_oecls_sk_rules(void) +{ + struct oecls_netdev_info *oecls_dev; + struct cmd_context ctx = { 0 }; + struct oecls_sk_rule *rule; + struct hlist_head *hlist; + struct hlist_node *n; + unsigned int i; + int err; + + mutex_lock(&oecls_sk_rules.mutex); + for (i = 0; i < OECLS_SK_RULE_HASHSIZE; i++) { + hlist = &oecls_sk_rules.hash[i]; + + hlist_for_each_entry_safe(rule, n, hlist, node) { + oecls_dev = get_oecls_netdev_info(rule->devid); + if (!oecls_dev) + continue; + strncpy(ctx.netdev, oecls_dev->dev_name, IFNAMSIZ); + ctx.del_ruleid = rule->ruleid; + err = cfg_ethtool_rule(&ctx, true); + oecls_debug("sk:%p, dev_id:%d, action:%d, ruleid:%d, 
err:%d\n", rule->sk, + rule->devid, rule->action, rule->ruleid, err); + + hlist_del(&rule->node); + oecls_debug("clean rule=%p\n", rule); + kfree(rule); + } + } + mutex_unlock(&oecls_sk_rules.mutex); +} + +static const struct oecls_hook_ops oecls_ntuple_ops = { + .oecls_flow_update = NULL, + .oecls_set_cpu = NULL, + .oecls_timeout = NULL, + .oecls_cfg_rxcls = ethtool_cfg_rxcls, +}; + +void oecls_ntuple_res_init(void) +{ + init_oecls_sk_rules(); + RCU_INIT_POINTER(oecls_ops, &oecls_ntuple_ops); +} + +void oecls_ntuple_res_clean(void) +{ + RCU_INIT_POINTER(oecls_ops, NULL); + clean_oecls_sk_rules(); +} -- Gitee From 9436575a55aad3653530dcd4215389c233450e21 Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Fri, 12 Sep 2025 20:31:10 +0800 Subject: [PATCH 2/7] net/oenetcls: use workqueue for ntuple cfg commit c9f5e390a63c45ee67328962d3fd0731c5f4c8c4 openEuler Use workqueue for ntuple cfg to avoid mutex_lock in rcu context. Fixes: 4bed6ba0e88f ("net/oenetcls: introduce oenetcls for network optimization") Signed-off-by: Wang Liang Signed-off-by: liujian <66liujian@163.com> --- net/oenetcls/oenetcls.h | 10 +++ net/oenetcls/oenetcls_ntuple.c | 132 +++++++++++++++++++-------------- 2 files changed, 85 insertions(+), 57 deletions(-) diff --git a/net/oenetcls/oenetcls.h b/net/oenetcls/oenetcls.h index 123c568e1096..2b403a886032 100644 --- a/net/oenetcls/oenetcls.h +++ b/net/oenetcls/oenetcls.h @@ -121,6 +121,16 @@ struct rmgr_ctrl { __u32 size; }; +struct cfg_param { + struct work_struct work; + struct cmd_context ctx; + struct oecls_sk_rule *rule; + struct sock *sk; + bool is_del; + int devid; + int nid; +}; + extern int match_ip_flag; extern int oecls_debug_lvl; extern int oecls_netdev_num; diff --git a/net/oenetcls/oenetcls_ntuple.c b/net/oenetcls/oenetcls_ntuple.c index 3b69f927d25b..7df40f618fdd 100644 --- a/net/oenetcls/oenetcls_ntuple.c +++ b/net/oenetcls/oenetcls_ntuple.c @@ -12,6 +12,7 @@ #include "oenetcls.h" struct oecls_sk_rule_list oecls_sk_rules, oecls_sk_list; +static struct workqueue_struct *do_cfg_workqueue; static void init_oecls_sk_rules(void) { @@ -422,85 +423,96 @@ static int cfg_ethtool_rule(struct cmd_context *ctx, bool is_del) return ret; } -static void del_ntuple_rule(struct sock *sk) +static void cfg_work(struct work_struct *work) { + struct cfg_param *ctx_p = container_of(work, struct cfg_param, work); struct oecls_netdev_info *oecls_dev; - struct cmd_context ctx = { 0 }; struct oecls_sk_rule *rule; - int devid; - u16 dport; - u32 dip4; + int devid, rxq_id; int err; - get_sk_rule_addr(sk, &dip4, &dport); - mutex_lock(&oecls_sk_rules.mutex); for_each_oecls_netdev(devid, oecls_dev) { - strncpy(ctx.netdev, oecls_dev->dev_name, IFNAMSIZ); - rule = get_rule_from_sk(devid, sk); - if (!rule) { - oecls_debug("rule not found! 
sk:%p, devid:%d, dip4:%pI4, dport:%d\n", - sk, devid, &dip4, ntohs(dport)); - continue; - } + strscpy(ctx_p->ctx.netdev, oecls_dev->dev_name, IFNAMSIZ); + if (!ctx_p->is_del) { + if (reuseport_check(devid, ctx_p->ctx.dip4, ctx_p->ctx.dport)) { + oecls_error("dip4:%pI4, dport:%d reuse!\n", &ctx_p->ctx.dip4, + ctx_p->ctx.dport); + continue; + } - // Config Ntuple rule to dev - ctx.del_ruleid = rule->ruleid; - err = cfg_ethtool_rule(&ctx, true); - if (err) { - oecls_error("del sk:%p, nid:%d, devid:%d, action:%d, ruleid:%d, err:%d\n", - sk, rule->nid, devid, rule->action, rule->ruleid, err); - } + // Calculate the bound queue + rxq_id = alloc_rxq_id(ctx_p->nid, devid); + if (rxq_id < 0) + continue; - // Free the bound queue - free_rxq_id(rule->nid, devid, rule->action); + // Config Ntuple rule to dev + ctx_p->ctx.action = (u16)rxq_id; + err = cfg_ethtool_rule(&ctx_p->ctx, ctx_p->is_del); + // Add sk rule only on success + if (err) { + free_rxq_id(ctx_p->nid, devid, rxq_id); + continue; + } + add_sk_rule(ctx_p->devid, ctx_p->ctx.dip4, ctx_p->ctx.dport, ctx_p->sk, + ctx_p->ctx.action, ctx_p->ctx.ret_loc, ctx_p->nid); + } else { + rule = get_rule_from_sk(ctx_p->devid, ctx_p->sk); + if (!rule) { + oecls_debug("rule not found! sk:%p, devid:%d, dip4:%pI4, dport:%d\n", + ctx_p->sk, ctx_p->devid, &ctx_p->ctx.dip4, + ntohs(ctx_p->ctx.dport)); + continue; + } - // Delete sk rule - del_sk_rule(rule); + // Config Ntuple rule to dev + ctx_p->ctx.del_ruleid = rule->ruleid; + ctx_p->rule = rule; + err = cfg_ethtool_rule(&ctx_p->ctx, ctx_p->is_del); + // Free the bound queue + free_rxq_id(ctx_p->rule->nid, ctx_p->devid, ctx_p->rule->action); + // Delete sk rule + del_sk_rule(ctx_p->rule); + } } mutex_unlock(&oecls_sk_rules.mutex); + kfree(ctx_p); +} + +static void del_ntuple_rule(struct sock *sk) +{ + struct cfg_param *ctx_p; + + ctx_p = kzalloc(sizeof(*ctx_p), GFP_ATOMIC); + if (!ctx_p) + return; + get_sk_rule_addr(sk, &ctx_p->ctx.dip4, &ctx_p->ctx.dport); + + ctx_p->is_del = true; + ctx_p->sk = sk; + INIT_WORK(&ctx_p->work, cfg_work); + queue_work(do_cfg_workqueue, &ctx_p->work); } static void add_ntuple_rule(struct sock *sk) { - struct oecls_netdev_info *oecls_dev; - struct cmd_context ctx = { 0 }; + struct cfg_param *ctx_p; int cpu = raw_smp_processor_id(); int nid = cpu_to_node(cpu); - int rxq_id; - int devid; - int err; if (check_appname(current->comm)) return; - get_sk_rule_addr(sk, &ctx.dip4, &ctx.dport); - - mutex_lock(&oecls_sk_rules.mutex); - for_each_oecls_netdev(devid, oecls_dev) { - strncpy(ctx.netdev, oecls_dev->dev_name, IFNAMSIZ); - if (reuseport_check(devid, ctx.dip4, ctx.dport)) { - oecls_error("dip4:%pI4, dport:%d reuse!\n", &ctx.dip4, ctx.dport); - continue; - } - // Calculate the bound queue - rxq_id = alloc_rxq_id(nid, devid); - if (rxq_id < 0) - continue; - - // Config Ntuple rule to dev - ctx.action = (u16)rxq_id; - err = cfg_ethtool_rule(&ctx, false); - if (err) { - oecls_error("add sk:%p, nid:%d, devid:%d, action:%d, ruleid:%d, err:%d\n", - sk, nid, devid, ctx.action, ctx.ret_loc, err); - continue; - } + ctx_p = kzalloc(sizeof(*ctx_p), GFP_ATOMIC); + if (!ctx_p) + return; + get_sk_rule_addr(sk, &ctx_p->ctx.dip4, &ctx_p->ctx.dport); - // Add sk rule - add_sk_rule(devid, ctx.dip4, ctx.dport, sk, ctx.action, ctx.ret_loc, nid); - } - mutex_unlock(&oecls_sk_rules.mutex); + ctx_p->is_del = false; + ctx_p->sk = sk; + ctx_p->nid = nid; + INIT_WORK(&ctx_p->work, cfg_work); + queue_work(do_cfg_workqueue, &ctx_p->work); } static void ethtool_cfg_rxcls(struct sock *sk, int is_del) @@ -538,7 
+550,7 @@ static void clean_oecls_sk_rules(void) oecls_dev = get_oecls_netdev_info(rule->devid); if (!oecls_dev) continue; - strncpy(ctx.netdev, oecls_dev->dev_name, IFNAMSIZ); + strscpy(ctx.netdev, oecls_dev->dev_name, IFNAMSIZ); ctx.del_ruleid = rule->ruleid; err = cfg_ethtool_rule(&ctx, true); oecls_debug("sk:%p, dev_id:%d, action:%d, ruleid:%d, err:%d\n", rule->sk, @@ -561,6 +573,12 @@ static const struct oecls_hook_ops oecls_ntuple_ops = { void oecls_ntuple_res_init(void) { + do_cfg_workqueue = alloc_ordered_workqueue("oecls_cfg", 0); + if (!do_cfg_workqueue) { + oecls_debug("alloc_ordered_workqueue fails\n"); + return; + } + init_oecls_sk_rules(); RCU_INIT_POINTER(oecls_ops, &oecls_ntuple_ops); } -- Gitee From 8493fd10d056824029df606d9beb0b3c0bb2aae1 Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Fri, 12 Sep 2025 20:31:11 +0800 Subject: [PATCH 3/7] net/oenetcls: clean resource when init ntuple/flow fail commit 15ef092d637bc84e09ba0e10ae915c6a0829340a openEuler Clean resource when init ntuple/flow fail. Fixes: 4bed6ba0e88f ("net/oenetcls: introduce oenetcls for network optimization") Signed-off-by: Wang Liang Signed-off-by: liujian <66liujian@163.com> --- net/oenetcls/oenetcls.h | 4 ++-- net/oenetcls/oenetcls_flow.c | 17 ++++++++++++++--- net/oenetcls/oenetcls_main.c | 7 +++++-- net/oenetcls/oenetcls_ntuple.c | 5 +++-- 4 files changed, 24 insertions(+), 9 deletions(-) diff --git a/net/oenetcls/oenetcls.h b/net/oenetcls/oenetcls.h index 2b403a886032..efeb88e5dd2e 100644 --- a/net/oenetcls/oenetcls.h +++ b/net/oenetcls/oenetcls.h @@ -179,9 +179,9 @@ int check_appname(char *task_name); int send_ethtool_ioctl(struct cmd_context *ctx, void *cmd); int alloc_rxq_id(int nid, int devid); void free_rxq_id(int nid, int devid, int rxq_id); -void oecls_ntuple_res_init(void); +int oecls_ntuple_res_init(void); void oecls_ntuple_res_clean(void); -void oecls_flow_res_init(void); +int oecls_flow_res_init(void); void oecls_flow_res_clean(void); #endif /* _NET_OENETCLS_H */ diff --git a/net/oenetcls/oenetcls_flow.c b/net/oenetcls/oenetcls_flow.c index aaa5881a817c..9a7550544305 100644 --- a/net/oenetcls/oenetcls_flow.c +++ b/net/oenetcls/oenetcls_flow.c @@ -391,11 +391,22 @@ static const struct oecls_hook_ops oecls_flow_ops = { .oecls_cfg_rxcls = NULL, }; -void oecls_flow_res_init(void) +int oecls_flow_res_init(void) { - oecls_sock_flow_table_init(); - oecls_dev_flow_table_init(); + int err; + + err = oecls_sock_flow_table_init(); + if (err) + return err; + + err = oecls_dev_flow_table_init(); + if (err) { + oecls_sock_flow_table_release(); + return err; + } + RCU_INIT_POINTER(oecls_ops, &oecls_flow_ops); + return 0; } void oecls_flow_res_clean(void) diff --git a/net/oenetcls/oenetcls_main.c b/net/oenetcls/oenetcls_main.c index 4be09a3f56cb..69bcb6101966 100644 --- a/net/oenetcls/oenetcls_main.c +++ b/net/oenetcls/oenetcls_main.c @@ -1040,9 +1040,12 @@ static __init int oecls_init(void) #endif if (mode == 0) - oecls_ntuple_res_init(); + err = oecls_ntuple_res_init(); else - oecls_flow_res_init(); + err = oecls_flow_res_init(); + + if (err) + goto clean_rxq; return 0; diff --git a/net/oenetcls/oenetcls_ntuple.c b/net/oenetcls/oenetcls_ntuple.c index 7df40f618fdd..d79eb4a40276 100644 --- a/net/oenetcls/oenetcls_ntuple.c +++ b/net/oenetcls/oenetcls_ntuple.c @@ -571,16 +571,17 @@ static const struct oecls_hook_ops oecls_ntuple_ops = { .oecls_cfg_rxcls = ethtool_cfg_rxcls, }; -void oecls_ntuple_res_init(void) +int oecls_ntuple_res_init(void) { do_cfg_workqueue = alloc_ordered_workqueue("oecls_cfg", 0); if 
(!do_cfg_workqueue) { oecls_debug("alloc_ordered_workqueue fails\n"); - return; + return -ENOMEM; } init_oecls_sk_rules(); RCU_INIT_POINTER(oecls_ops, &oecls_ntuple_ops); + return 0; } void oecls_ntuple_res_clean(void) -- Gitee From 404c554eca4a71c95fb2773e52dd3b9b89a2af23 Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Mon, 22 Sep 2025 15:16:03 +0800 Subject: [PATCH 4/7] net/oenetcls: Fix crash when removing module commit d10ae31e539a86a4f2bd2f0a18d732714464584d openEuler Destroy workqueue in oecls_ntuple_res_clean() and use 'oecls_worker_count' to avoid visiting oenetcls code after removed module. Moreover, call synchronize_rcu() to keep the rcu data consistency. Fixes: c9f5e390a63c ("net/oenetcls: use workqueue for ntuple cfg") Signed-off-by: Wang Liang Signed-off-by: liujian <66liujian@163.com> --- net/oenetcls/oenetcls.h | 6 ++-- net/oenetcls/oenetcls_flow.c | 5 ++- net/oenetcls/oenetcls_main.c | 43 +++++++++---------------- net/oenetcls/oenetcls_ntuple.c | 57 ++++++++++++++++++++++++---------- 4 files changed, 62 insertions(+), 49 deletions(-) diff --git a/net/oenetcls/oenetcls.h b/net/oenetcls/oenetcls.h index efeb88e5dd2e..86b8a1d1ac98 100644 --- a/net/oenetcls/oenetcls.h +++ b/net/oenetcls/oenetcls.h @@ -74,7 +74,7 @@ struct oecls_sk_rule { int dport; int action; int ruleid; - int nid; + int cpu; }; struct oecls_sk_entry { @@ -124,11 +124,9 @@ struct rmgr_ctrl { struct cfg_param { struct work_struct work; struct cmd_context ctx; - struct oecls_sk_rule *rule; struct sock *sk; bool is_del; - int devid; - int nid; + int cpu; }; extern int match_ip_flag; diff --git a/net/oenetcls/oenetcls_flow.c b/net/oenetcls/oenetcls_flow.c index 9a7550544305..87efb5e6a126 100644 --- a/net/oenetcls/oenetcls_flow.c +++ b/net/oenetcls/oenetcls_flow.c @@ -406,12 +406,15 @@ int oecls_flow_res_init(void) } RCU_INIT_POINTER(oecls_ops, &oecls_flow_ops); + synchronize_rcu(); return 0; } void oecls_flow_res_clean(void) { - RCU_INIT_POINTER(oecls_ops, NULL); + rcu_assign_pointer(oecls_ops, NULL); + synchronize_rcu(); + oecls_sock_flow_table_release(); oecls_dev_flow_table_release(); } diff --git a/net/oenetcls/oenetcls_main.c b/net/oenetcls/oenetcls_main.c index 69bcb6101966..998117bb56c5 100644 --- a/net/oenetcls/oenetcls_main.c +++ b/net/oenetcls/oenetcls_main.c @@ -23,11 +23,11 @@ static int mode; module_param(mode, int, 0444); MODULE_PARM_DESC(mode, "mode, default 0"); -static char ifname[64] = { 0 }; +static char ifname[128] = { 0 }; module_param_string(ifname, ifname, sizeof(ifname), 0444); MODULE_PARM_DESC(ifname, "ifname"); -static char appname[64] = "redis-server"; +static char appname[256] = "redis-server"; module_param_string(appname, appname, sizeof(appname), 0644); MODULE_PARM_DESC(appname, "appname, default redis-server"); @@ -39,6 +39,10 @@ static int strategy; module_param(strategy, int, 0444); MODULE_PARM_DESC(strategy, "strategy, default 0"); +static int check_cap = 1; +module_param(check_cap, int, 0444); +MODULE_PARM_DESC(check_cap, "check_cap, default 1"); + static bool check_params(void) { if (mode != 0 && mode != 1) @@ -218,20 +222,6 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, return ret; } -static noinline_for_stack int ethtool_get_channels(struct net_device *dev, - void *useraddr) -{ - struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS }; - - if (!dev->ethtool_ops->get_channels) - return -EOPNOTSUPP; - - dev->ethtool_ops->get_channels(dev, &channels); - - memcpy_r(useraddr, &channels, sizeof(channels)); - return 0; -} - static int 
ethtool_get_value(struct net_device *dev, char *useraddr, u32 cmd, u32 (*actor)(struct net_device *)) { @@ -285,10 +275,9 @@ static int dev_ethtool_kern(struct net *net, struct ifreq *ifr) case ETHTOOL_GRXCLSRLCNT: case ETHTOOL_GRXCLSRULE: case ETHTOOL_GRXCLSRLALL: - case ETHTOOL_GCHANNELS: break; default: - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + if (check_cap && !ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; } @@ -319,9 +308,6 @@ static int dev_ethtool_kern(struct net *net, struct ifreq *ifr) case ETHTOOL_SRXCLSRLINS: rc = ethtool_set_rxnfc(dev, ethcmd, useraddr); break; - case ETHTOOL_GCHANNELS: - rc = ethtool_get_channels(dev, useraddr); - break; default: rc = -EOPNOTSUPP; } @@ -400,7 +386,7 @@ static void get_netdev_queue_info(struct oecls_netdev_info *oecls_dev) cpu = cpumask_first(irq_data_get_effective_affinity_mask(&desc->irq_data)); rxq_info->affinity_cpu = cpu; oecls_debug("irq=%d, [%s], rxq_id=%d affinity_cpu:%d\n", - irq, desc->action->name, oecls_dev->rxq_num, cpu); + irq, desc->action->name, oecls_dev->rxq_num - 1, cpu); } } @@ -669,9 +655,8 @@ static int init_numa_rxq_bitmap(int nid, struct oecls_numa_info *numa_info) return ret; } -static int get_cluster_rxq(struct oecls_numa_bound_dev_info *bound_dev) +static int get_cluster_rxq(int cpu, struct oecls_numa_bound_dev_info *bound_dev) { - int cpu = raw_smp_processor_id(); int cluster_id = cpu / oecls_cluster_cpu_num; int i, j, rxq_id; @@ -710,10 +695,11 @@ static int put_cluster_rxq(struct oecls_numa_bound_dev_info *bound_dev, int rxq_ return -1; } -int alloc_rxq_id(int nid, int devid) +int alloc_rxq_id(int cpu, int devid) { struct oecls_numa_bound_dev_info *bound_dev; struct oecls_numa_info *numa_info; + int nid = cpu_to_node(cpu); int rxq_id; numa_info = get_oecls_numa_info(nid); @@ -729,7 +715,7 @@ int alloc_rxq_id(int nid, int devid) bound_dev = &numa_info->bound_dev[devid]; if (strategy == 1) { - rxq_id = get_cluster_rxq(bound_dev); + rxq_id = get_cluster_rxq(cpu, bound_dev); if (rxq_id < 0 || rxq_id >= OECLS_MAX_RXQ_NUM_PER_DEV) pr_info("failed to get rxq_id:%d in cluster, try numa\n", rxq_id); else @@ -744,14 +730,15 @@ int alloc_rxq_id(int nid, int devid) found: clear_bit(rxq_id, bound_dev->bitmap_rxq); - oecls_debug("alloc nid:%d, dev_id:%d, rxq_id:%d\n", nid, devid, rxq_id); + oecls_debug("alloc cpu:%d, nid:%d, devid:%d, rxq_id:%d\n", cpu, nid, devid, rxq_id); return rxq_id; } -void free_rxq_id(int nid, int devid, int rxq_id) +void free_rxq_id(int cpu, int devid, int rxq_id) { struct oecls_numa_bound_dev_info *bound_dev; struct oecls_numa_info *numa_info; + int nid = cpu_to_node(cpu); numa_info = get_oecls_numa_info(nid); if (!numa_info) { diff --git a/net/oenetcls/oenetcls_ntuple.c b/net/oenetcls/oenetcls_ntuple.c index d79eb4a40276..d616420aa4fc 100644 --- a/net/oenetcls/oenetcls_ntuple.c +++ b/net/oenetcls/oenetcls_ntuple.c @@ -13,6 +13,7 @@ struct oecls_sk_rule_list oecls_sk_rules, oecls_sk_list; static struct workqueue_struct *do_cfg_workqueue; +static atomic_t oecls_worker_count = ATOMIC_INIT(0); static void init_oecls_sk_rules(void) { @@ -33,8 +34,7 @@ static inline struct hlist_head *get_sk_hashlist(void *sk) return oecls_sk_list.hash + (jhash(sk, sizeof(sk), 0) & OECLS_SK_RULE_HASHMASK); } -static void add_sk_rule(int devid, u32 dip4, u16 dport, void *sk, int action, - int ruleid, int nid) +static void add_sk_rule(int devid, u32 dip4, u16 dport, void *sk, int action, int ruleid, int cpu) { struct hlist_head *hlist = get_rule_hashlist(dip4, dport); struct hlist_head *sk_hlist = 
get_sk_hashlist(sk); @@ -52,7 +52,7 @@ static void add_sk_rule(int devid, u32 dip4, u16 dport, void *sk, int action, rule->devid = devid; rule->action = action; rule->ruleid = ruleid; - rule->nid = nid; + rule->cpu = cpu; hlist_add_head(&rule->node, hlist); entry->sk = sk; @@ -442,7 +442,7 @@ static void cfg_work(struct work_struct *work) } // Calculate the bound queue - rxq_id = alloc_rxq_id(ctx_p->nid, devid); + rxq_id = alloc_rxq_id(ctx_p->cpu, devid); if (rxq_id < 0) continue; @@ -451,38 +451,55 @@ static void cfg_work(struct work_struct *work) err = cfg_ethtool_rule(&ctx_p->ctx, ctx_p->is_del); // Add sk rule only on success if (err) { - free_rxq_id(ctx_p->nid, devid, rxq_id); + free_rxq_id(ctx_p->cpu, devid, rxq_id); continue; } - add_sk_rule(ctx_p->devid, ctx_p->ctx.dip4, ctx_p->ctx.dport, ctx_p->sk, - ctx_p->ctx.action, ctx_p->ctx.ret_loc, ctx_p->nid); + add_sk_rule(devid, ctx_p->ctx.dip4, ctx_p->ctx.dport, ctx_p->sk, + ctx_p->ctx.action, ctx_p->ctx.ret_loc, ctx_p->cpu); } else { - rule = get_rule_from_sk(ctx_p->devid, ctx_p->sk); + rule = get_rule_from_sk(devid, ctx_p->sk); if (!rule) { oecls_debug("rule not found! sk:%p, devid:%d, dip4:%pI4, dport:%d\n", - ctx_p->sk, ctx_p->devid, &ctx_p->ctx.dip4, + ctx_p->sk, devid, &ctx_p->ctx.dip4, ntohs(ctx_p->ctx.dport)); continue; } // Config Ntuple rule to dev ctx_p->ctx.del_ruleid = rule->ruleid; - ctx_p->rule = rule; err = cfg_ethtool_rule(&ctx_p->ctx, ctx_p->is_del); // Free the bound queue - free_rxq_id(ctx_p->rule->nid, ctx_p->devid, ctx_p->rule->action); + free_rxq_id(rule->cpu, devid, rule->action); // Delete sk rule - del_sk_rule(ctx_p->rule); + del_sk_rule(rule); } } mutex_unlock(&oecls_sk_rules.mutex); kfree(ctx_p); + atomic_dec(&oecls_worker_count); +} + +static bool has_sock_rule(struct sock *sk) +{ + struct oecls_netdev_info *oecls_dev; + struct oecls_sk_rule *rule; + int devid; + + for_each_oecls_netdev(devid, oecls_dev) { + rule = get_rule_from_sk(devid, sk); + if (rule) + return true; + } + return false; } static void del_ntuple_rule(struct sock *sk) { struct cfg_param *ctx_p; + if (!has_sock_rule(sk)) + return; + ctx_p = kzalloc(sizeof(*ctx_p), GFP_ATOMIC); if (!ctx_p) return; @@ -492,13 +509,12 @@ static void del_ntuple_rule(struct sock *sk) ctx_p->sk = sk; INIT_WORK(&ctx_p->work, cfg_work); queue_work(do_cfg_workqueue, &ctx_p->work); + atomic_inc(&oecls_worker_count); } static void add_ntuple_rule(struct sock *sk) { struct cfg_param *ctx_p; - int cpu = raw_smp_processor_id(); - int nid = cpu_to_node(cpu); if (check_appname(current->comm)) return; @@ -510,9 +526,10 @@ static void add_ntuple_rule(struct sock *sk) ctx_p->is_del = false; ctx_p->sk = sk; - ctx_p->nid = nid; + ctx_p->cpu = raw_smp_processor_id(); INIT_WORK(&ctx_p->work, cfg_work); queue_work(do_cfg_workqueue, &ctx_p->work); + atomic_inc(&oecls_worker_count); } static void ethtool_cfg_rxcls(struct sock *sk, int is_del) @@ -581,11 +598,19 @@ int oecls_ntuple_res_init(void) init_oecls_sk_rules(); RCU_INIT_POINTER(oecls_ops, &oecls_ntuple_ops); + synchronize_rcu(); return 0; } void oecls_ntuple_res_clean(void) { - RCU_INIT_POINTER(oecls_ops, NULL); + rcu_assign_pointer(oecls_ops, NULL); + synchronize_rcu(); + + oecls_debug("oecls_worker_count:%d\n", atomic_read(&oecls_worker_count)); + while (atomic_read(&oecls_worker_count) != 0) + mdelay(1); + + destroy_workqueue(do_cfg_workqueue); clean_oecls_sk_rules(); } -- Gitee From 5a4ba9148d214eb65b8ed1bf7d996a045492db00 Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Tue, 14 Oct 2025 13:19:07 +0800 Subject: [PATCH 5/7] 
net/oenetcls: Fix memleak when clean flow table

commit 85c0f905ae0aae2c37818335de614bd2fb6754ac openEuler

When cleaning the dev flow table in oecls_dev_flow_table_cleanup(), the
oecls_ftb memory of every dev queue should be freed.

Fixes: 4bed6ba0e88f ("net/oenetcls: introduce oenetcls for network optimization")
Signed-off-by: Wang Liang
Signed-off-by: liujian <66liujian@163.com>
---
 net/oenetcls/oenetcls_flow.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/net/oenetcls/oenetcls_flow.c b/net/oenetcls/oenetcls_flow.c
index 87efb5e6a126..eb7fae400e8e 100644
--- a/net/oenetcls/oenetcls_flow.c
+++ b/net/oenetcls/oenetcls_flow.c
@@ -252,15 +252,16 @@ static void oecls_dev_flow_table_cleanup(struct net_device *netdev, int qid)
 	struct netdev_rx_queue *queue;
 	int i;
 
-	spin_lock(&oecls_dev_flow_lock);
 	for (i = 0; i < qid; i++) {
 		queue = netdev->_rx + i;
+		spin_lock(&oecls_dev_flow_lock);
 		dtb = rcu_dereference_protected(queue->oecls_ftb,
 						lockdep_is_held(&oecls_dev_flow_lock));
 		rcu_assign_pointer(queue->oecls_ftb, NULL);
+		spin_unlock(&oecls_dev_flow_lock);
+		if (dtb)
+			call_rcu(&dtb->rcu, oecls_dev_flow_table_free);
 	}
-	spin_unlock(&oecls_dev_flow_lock);
-	call_rcu(&dtb->rcu, oecls_dev_flow_table_free);
 }
 
 static int oecls_dev_flow_table_release(void)
--
Gitee

From 64fd711d7ec1f26ecdbdb80d31190b4c0faa95c0 Mon Sep 17 00:00:00 2001
From: Yue Haibing
Date: Wed, 21 Jan 2026 16:52:24 +0800
Subject: [PATCH 6/7] net/oenetcls: Balancing softirq to improve performance

commit 79cd29819d65a5f030c418cf92b26a3a6ad9ac12 openEuler

Add NUMA-affinity RPS to balance softirq load; this fixes a performance
regression under high load.

Fixes: 4bed6ba0e88f ("net/oenetcls: introduce oenetcls for network optimization")
Signed-off-by: Yue Haibing
Signed-off-by: Liu Jian
Signed-off-by: liujian <66liujian@163.com>
---
 include/linux/oenetcls.h       | 32 ++++++++++++----
 net/core/dev.c                 | 19 ++++++++-
 net/oenetcls/oenetcls_flow.c   | 70 +++++++++++++++++++++++++++-------
 net/oenetcls/oenetcls_main.c   |  7 +++-
 net/oenetcls/oenetcls_ntuple.c |  2 +-
 5 files changed, 106 insertions(+), 24 deletions(-)

diff --git a/include/linux/oenetcls.h b/include/linux/oenetcls.h
index 29c0db40971f..09f89131f32b 100644
--- a/include/linux/oenetcls.h
+++ b/include/linux/oenetcls.h
@@ -5,12 +5,14 @@
 struct oecls_hook_ops {
 	void (*oecls_cfg_rxcls)(struct sock *sk, int is_del);
 	void (*oecls_flow_update)(struct sock *sk);
-	void (*oecls_set_cpu)(struct sk_buff *skb);
+	void (*oecls_set_cpu)(struct sk_buff *skb, int *cpu, int *last_qtail);
 	bool (*oecls_timeout)(struct net_device *dev, u16 rxq_index,
 			      u32 flow_id, u16 filter_id);
 };
 
+typedef int (*enqueue_f)(struct sk_buff *skb, int cpu, unsigned int *qtail);
 extern const struct oecls_hook_ops __rcu *oecls_ops;
+extern struct static_key_false oecls_rps_needed;
 
 static inline void oenetcls_cfg_rxcls(struct sock *sk, int is_del)
 {
@@ -34,27 +36,43 @@ static inline void oenetcls_flow_update(struct sock *sk)
 	rcu_read_unlock();
 }
 
-static inline void oenetcls_skb_set_cpu(struct sk_buff *skb)
+static inline bool
+oenetcls_skb_set_cpu(struct sk_buff *skb, enqueue_f enq_func, int *ret)
 {
 	const struct oecls_hook_ops *ops;
+	int cpu, last_qtail;
+	bool result = false;
 
 	rcu_read_lock();
 	ops = rcu_dereference(oecls_ops);
-	if (ops && ops->oecls_set_cpu)
-		ops->oecls_set_cpu(skb);
+	if (ops && ops->oecls_set_cpu) {
+		ops->oecls_set_cpu(skb, &cpu, &last_qtail);
+		if (cpu >= 0) {
+			*ret = enq_func(skb, cpu, &last_qtail);
+			result = true;
+		}
+	}
 	rcu_read_unlock();
+	return result;
 }
 
-static inline void
oenetcls_skblist_set_cpu(struct list_head *head) +static inline void +oenetcls_skblist_set_cpu(struct list_head *head, enqueue_f enq_func) { const struct oecls_hook_ops *ops; struct sk_buff *skb, *next; + int cpu, last_qtail; rcu_read_lock(); ops = rcu_dereference(oecls_ops); if (ops && ops->oecls_set_cpu) { - list_for_each_entry_safe(skb, next, head, list) - ops->oecls_set_cpu(skb); + list_for_each_entry_safe(skb, next, head, list) { + ops->oecls_set_cpu(skb, &cpu, &last_qtail); + if (cpu >= 0) { + skb_list_del_init(skb); + enq_func(skb, cpu, &last_qtail); + } + } } rcu_read_unlock(); } diff --git a/net/core/dev.c b/net/core/dev.c index 2947a9369362..46e3bd2aa2da 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -162,6 +162,8 @@ #include const struct oecls_hook_ops __rcu *oecls_ops __read_mostly; EXPORT_SYMBOL_GPL(oecls_ops); +struct static_key_false oecls_rps_needed __read_mostly; +EXPORT_SYMBOL(oecls_rps_needed); #endif static DEFINE_SPINLOCK(ptype_lock); @@ -5812,6 +5814,10 @@ static int netif_receive_skb_internal(struct sk_buff *skb) rcu_read_lock(); #ifdef CONFIG_RPS +#if IS_ENABLED(CONFIG_OENETCLS) + if (static_branch_unlikely(&oecls_rps_needed)) + goto oecls_rps; +#endif if (static_branch_unlikely(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu = get_rps_cpu(skb->dev, skb, &rflow); @@ -5825,7 +5831,11 @@ static int netif_receive_skb_internal(struct sk_buff *skb) #endif #if IS_ENABLED(CONFIG_OENETCLS) - oenetcls_skb_set_cpu(skb); +oecls_rps: + if (oenetcls_skb_set_cpu(skb, enqueue_to_backlog, &ret)) { + rcu_read_unlock(); + return ret; + } #endif ret = __netif_receive_skb(skb); @@ -5849,6 +5859,10 @@ void netif_receive_skb_list_internal(struct list_head *head) rcu_read_lock(); #ifdef CONFIG_RPS +#if IS_ENABLED(CONFIG_OENETCLS) + if (static_branch_unlikely(&oecls_rps_needed)) + goto oecls_rps_list; +#endif if (static_branch_unlikely(&rps_needed)) { list_for_each_entry_safe(skb, next, head, list) { struct rps_dev_flow voidflow, *rflow = &voidflow; @@ -5864,7 +5878,8 @@ void netif_receive_skb_list_internal(struct list_head *head) #endif #if IS_ENABLED(CONFIG_OENETCLS) - oenetcls_skblist_set_cpu(head); +oecls_rps_list: + oenetcls_skblist_set_cpu(head, enqueue_to_backlog); #endif __netif_receive_skb_list(head); diff --git a/net/oenetcls/oenetcls_flow.c b/net/oenetcls/oenetcls_flow.c index eb7fae400e8e..d4d9a8f15660 100644 --- a/net/oenetcls/oenetcls_flow.c +++ b/net/oenetcls/oenetcls_flow.c @@ -134,8 +134,7 @@ static void set_oecls_cpu(struct net_device *dev, struct sk_buff *skb, struct oecls_dev_flow_table *dtb; struct oecls_dev_flow *rflow; u32 flow_id, hash; - u16 rxq_index; - int rc; + int rxq_index, rc; if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || !(dev->features & NETIF_F_NTUPLE)) @@ -153,7 +152,8 @@ static void set_oecls_cpu(struct net_device *dev, struct sk_buff *skb, hash = skb_get_hash(skb); flow_id = hash & dtb->mask; rflow = &dtb->flows[flow_id]; - if (rflow->isvalid && rflow->cpu == next_cpu) { + //Return if someone has configured this. 
+ if (rflow->isvalid && cpu_to_node(rflow->cpu) == cpu_to_node(next_cpu)) { rflow->timeout = jiffies; return; } @@ -172,15 +172,41 @@ static void set_oecls_cpu(struct net_device *dev, struct sk_buff *skb, rflow->cpu = next_cpu; } +static int get_cpu_in_mask(int tcpu, u32 hash) +{ + const struct cpumask *mask; + int nr_cpus, cpu, index; + + mask = cpumask_of_node(cpu_to_node(tcpu)); + + nr_cpus = cpumask_weight(mask); + if (nr_cpus == 0) + return -1; + + index = reciprocal_scale(hash, nr_cpus); + if (index < 0) + return -1; + + cpu = cpumask_first(mask); + while (--nr_cpus > 0) { + if (index == 0) + break; + cpu = cpumask_next(cpu, mask); + index--; + } + + return cpu; +} + static void __oecls_set_cpu(struct sk_buff *skb, struct net_device *ndev, struct oecls_sock_flow_table *tb, struct oecls_dev_flow_table *dtb, - int old_rxq_id) + int old_rxq_id, int *rcpu, int *last_qtail) { + u32 last_recv_cpu, hash, val, cpu, tcpu; struct oecls_dev_flow *rflow; - u32 last_recv_cpu, hash, val; - u32 tcpu = 0; - u32 cpu = raw_smp_processor_id(); + int newcpu; + cpu = raw_smp_processor_id(); skb_reset_network_header(skb); hash = skb_get_hash(skb); if (!hash) @@ -194,14 +220,20 @@ static void __oecls_set_cpu(struct sk_buff *skb, struct net_device *ndev, if ((val ^ hash) & ~oecls_cpu_mask) return; - if (cpu_to_node(cpu) == cpu_to_node(last_recv_cpu)) + newcpu = get_cpu_in_mask(last_recv_cpu, hash); + if (newcpu >= 0) + *rcpu = newcpu; + else + newcpu = last_recv_cpu; + + if (cpu_to_node(cpu) == cpu_to_node(newcpu)) return; if (tcpu >= nr_cpu_ids) - set_oecls_cpu(ndev, skb, rflow, old_rxq_id, last_recv_cpu); + set_oecls_cpu(ndev, skb, rflow, old_rxq_id, newcpu); } -static void _oecls_set_cpu(struct sk_buff *skb) +static void _oecls_set_cpu(struct sk_buff *skb, int *cpu, int *last_qtail) { struct net_device *ndev = skb->dev; struct oecls_sock_flow_table *stb; @@ -209,6 +241,8 @@ static void _oecls_set_cpu(struct sk_buff *skb) struct netdev_rx_queue *rxqueue; int rxq_id = -1; + *cpu = -1; + last_qtail = 0;//unused if (!ndev) return; @@ -234,7 +268,7 @@ static void _oecls_set_cpu(struct sk_buff *skb) stb = rcu_dereference(oecls_sock_flow_table); dtb = rcu_dereference(rxqueue->oecls_ftb); if (stb && dtb) - __oecls_set_cpu(skb, ndev, stb, dtb, rxq_id); + __oecls_set_cpu(skb, ndev, stb, dtb, rxq_id, cpu, last_qtail); rcu_read_unlock(); } @@ -246,13 +280,13 @@ static void oecls_dev_flow_table_free(struct rcu_head *rcu) vfree(table); } -static void oecls_dev_flow_table_cleanup(struct net_device *netdev, int qid) +static void oecls_dev_flow_table_cleanup(struct net_device *netdev, int queues) { struct oecls_dev_flow_table *dtb; struct netdev_rx_queue *queue; int i; - for (i = 0; i < qid; i++) { + for (i = 0; i < queues; i++) { queue = netdev->_rx + i; spin_lock(&oecls_dev_flow_lock); dtb = rcu_dereference_protected(queue->oecls_ftb, @@ -408,11 +442,21 @@ int oecls_flow_res_init(void) RCU_INIT_POINTER(oecls_ops, &oecls_flow_ops); synchronize_rcu(); + +#ifdef CONFIG_RPS + static_branch_inc(&oecls_rps_needed); + oecls_debug("oecls_rps_needed true\n"); +#endif + return 0; } void oecls_flow_res_clean(void) { +#ifdef CONFIG_RPS + static_branch_dec(&oecls_rps_needed); + oecls_debug("oecls_rps_needed false\n"); +#endif rcu_assign_pointer(oecls_ops, NULL); synchronize_rcu(); diff --git a/net/oenetcls/oenetcls_main.c b/net/oenetcls/oenetcls_main.c index 998117bb56c5..f9797cce19a9 100644 --- a/net/oenetcls/oenetcls_main.c +++ b/net/oenetcls/oenetcls_main.c @@ -43,6 +43,10 @@ static int check_cap = 1; module_param(check_cap, 
int, 0444); MODULE_PARM_DESC(check_cap, "check_cap, default 1"); +static char irqname[64] = "comp"; +module_param_string(irqname, irqname, sizeof(irqname), 0644); +MODULE_PARM_DESC(irqname, "nic irq name string, default comp"); + static bool check_params(void) { if (mode != 0 && mode != 1) @@ -353,7 +357,8 @@ static struct oecls_netdev_info *alloc_oecls_netdev_info(void) static bool check_irq_name(const char *irq_name, struct oecls_netdev_info *oecls_dev) { - if (!strstr(irq_name, "TxRx") && !strstr(irq_name, "comp") && !strstr(irq_name, "rx")) + if (!strstr(irq_name, "TxRx") && !strstr(irq_name, "comp") && !strstr(irq_name, "rx") && + strlen(irqname) > 0 && !strstr(irq_name, irqname)) return false; if (strstr(irq_name, oecls_dev->dev_name)) diff --git a/net/oenetcls/oenetcls_ntuple.c b/net/oenetcls/oenetcls_ntuple.c index d616420aa4fc..5dbbc0ce7b0f 100644 --- a/net/oenetcls/oenetcls_ntuple.c +++ b/net/oenetcls/oenetcls_ntuple.c @@ -60,7 +60,7 @@ static void add_sk_rule(int devid, u32 dip4, u16 dport, void *sk, int action, in hlist_add_head(&entry->node, sk_hlist); return; out: - oecls_debug("alloc failed rule:%p entry:%p\n", rule, entry); + oecls_debug("alloc rule failed\n"); kfree(entry); kfree(rule); } -- Gitee From 6758a63aabfcaec93e8e7eea8078b5fe2967b2f6 Mon Sep 17 00:00:00 2001 From: Liu Jian Date: Wed, 21 Jan 2026 16:52:25 +0800 Subject: [PATCH 7/7] net/oenetcls: add a switch to enable/disable checking NIC ntuple feature commit 5524fd8f2a91c035c70aacb1c93b5550d0857e9b openEuler Some NICs support the ntuple feature, but due to driver issues or other reasons, they do not allow external modification. Therefore, we have added a switch here to control whether to check the NIC's ntuple feature. Fixes: 4bed6ba0e88f ("net/oenetcls: introduce oenetcls for network optimization") Signed-off-by: Liu Jian Signed-off-by: liujian <66liujian@163.com> --- net/oenetcls/oenetcls.h | 1 + net/oenetcls/oenetcls_flow.c | 2 +- net/oenetcls/oenetcls_main.c | 10 ++++++++++ net/oenetcls/oenetcls_ntuple.c | 10 ++++++---- 4 files changed, 18 insertions(+), 5 deletions(-) diff --git a/net/oenetcls/oenetcls.h b/net/oenetcls/oenetcls.h index 86b8a1d1ac98..f0af6d25a4f5 100644 --- a/net/oenetcls/oenetcls.h +++ b/net/oenetcls/oenetcls.h @@ -133,6 +133,7 @@ extern int match_ip_flag; extern int oecls_debug_lvl; extern int oecls_netdev_num; extern int oecls_numa_num; +extern int check_nic_feature; #define oecls_debug(fmt, ...) 
\ do { \ diff --git a/net/oenetcls/oenetcls_flow.c b/net/oenetcls/oenetcls_flow.c index d4d9a8f15660..e9730c8abd2e 100644 --- a/net/oenetcls/oenetcls_flow.c +++ b/net/oenetcls/oenetcls_flow.c @@ -137,7 +137,7 @@ static void set_oecls_cpu(struct net_device *dev, struct sk_buff *skb, int rxq_index, rc; if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || - !(dev->features & NETIF_F_NTUPLE)) + (!(dev->features & NETIF_F_NTUPLE) && check_nic_feature)) return; rxq_index = flow_get_queue_idx(dev, cpu_to_node(next_cpu), skb); diff --git a/net/oenetcls/oenetcls_main.c b/net/oenetcls/oenetcls_main.c index f9797cce19a9..f83dba5aa878 100644 --- a/net/oenetcls/oenetcls_main.c +++ b/net/oenetcls/oenetcls_main.c @@ -47,6 +47,10 @@ static char irqname[64] = "comp"; module_param_string(irqname, irqname, sizeof(irqname), 0644); MODULE_PARM_DESC(irqname, "nic irq name string, default comp"); +int check_nic_feature = 1; +module_param(check_nic_feature, int, 0444); +MODULE_PARM_DESC(check_nic_feature, "check nic feature, default 1"); + static bool check_params(void) { if (mode != 0 && mode != 1) @@ -401,6 +405,9 @@ static int oecls_filter_enable(const char *dev_name, bool *old_state) struct cmd_context ctx = {0}; int ret; + if (!check_nic_feature) + return 0; + strscpy(ctx.netdev, dev_name, IFNAMSIZ); eval.cmd = ETHTOOL_GFLAGS; @@ -447,6 +454,9 @@ static void oecls_filter_restore(const char *dev_name, bool old_state) bool cur_filter_state; int ret; + if (!check_nic_feature) + return; + strscpy(ctx.netdev, dev_name, IFNAMSIZ); eval.cmd = ETHTOOL_GFLAGS; diff --git a/net/oenetcls/oenetcls_ntuple.c b/net/oenetcls/oenetcls_ntuple.c index 5dbbc0ce7b0f..87e3b4bac3ea 100644 --- a/net/oenetcls/oenetcls_ntuple.c +++ b/net/oenetcls/oenetcls_ntuple.c @@ -372,10 +372,12 @@ static int do_srxntuple(struct cmd_context *ctx, struct ethtool_rx_flow_spec *fs flow_spec_to_ntuple(fsp, &ntuplecmd.fs); - eval.cmd = ETHTOOL_GFLAGS; - ret = send_ethtool_ioctl(ctx, &eval); - if (ret || !(eval.data & ETH_FLAG_NTUPLE)) - return -1; + if (check_nic_feature) { + eval.cmd = ETHTOOL_GFLAGS; + ret = send_ethtool_ioctl(ctx, &eval); + if (ret || !(eval.data & ETH_FLAG_NTUPLE)) + return -1; + } ntuplecmd.cmd = ETHTOOL_SRXNTUPLE; ret = send_ethtool_ioctl(ctx, &ntuplecmd); -- Gitee
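
Editor's note: the deferral pattern that patches 2 and 4 converge on -- queue the
ethtool configuration from RCU/atomic context onto an ordered workqueue, and on
teardown unpublish the hook, synchronize RCU, and drain in-flight work before
destroying the workqueue -- is summarized in the minimal sketch below. This is
illustrative only and is not oenetcls code: the demo_* names are invented, and
error handling is reduced to the bare minimum.

/*
 * Minimal sketch of the defer-and-drain pattern (not oenetcls code).
 * Requests arriving in RCU read-side context are queued to an ordered
 * workqueue; teardown waits for outstanding workers before destroying it.
 */
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/atomic.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <linux/mutex.h>

struct demo_req {
	struct work_struct work;
	bool is_del;
	int cpu;
};

static struct workqueue_struct *demo_wq;
static atomic_t demo_inflight = ATOMIC_INIT(0);
static DEFINE_MUTEX(demo_lock);

static void demo_cfg_work(struct work_struct *work)
{
	struct demo_req *req = container_of(work, struct demo_req, work);

	/* Process context: sleeping and mutexes are allowed here,
	 * unlike in the RCU read-side caller that queued the request.
	 */
	mutex_lock(&demo_lock);
	/* ... install or remove the hardware rule for req->cpu ... */
	mutex_unlock(&demo_lock);

	kfree(req);
	atomic_dec(&demo_inflight);
}

/* Called from atomic/RCU context: allocate, fill, and defer. */
static void demo_submit(bool is_del, int cpu)
{
	struct demo_req *req = kzalloc(sizeof(*req), GFP_ATOMIC);

	if (!req)
		return;
	req->is_del = is_del;
	req->cpu = cpu;
	INIT_WORK(&req->work, demo_cfg_work);
	atomic_inc(&demo_inflight);
	queue_work(demo_wq, &req->work);
}

static int demo_init(void)
{
	demo_wq = alloc_ordered_workqueue("demo_cfg", 0);
	return demo_wq ? 0 : -ENOMEM;
}

static void demo_teardown(void)
{
	/* The series first clears its hook pointer and calls
	 * synchronize_rcu(), so no new work is queued past this point;
	 * then it busy-waits for in-flight workers to finish.
	 */
	synchronize_rcu();
	while (atomic_read(&demo_inflight))
		mdelay(1);
	destroy_workqueue(demo_wq);
}

An ordered workqueue is used so rule insertions and deletions for the same socket
are serialized without holding a mutex in softirq or RCU read-side context.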