From 91d1bd952f38a0ce0c07051346e61fc022465f6c Mon Sep 17 00:00:00 2001
From: luzhongjun23
Date: Sun, 11 Jun 2023 20:27:24 +0800
Subject: [PATCH 1/8] drivers/block/zram: Replace bit spinlocks with rtmutex
 for -rt

commit 8a1176aa4e84170b10ae1bb39169e015bfa308e6 upstream.

They're nondeterministic, and lead to ___might_sleep() splats in -rt.
OTOH, they're a lot less wasteful than an rtmutex per page.

Signed-off-by: Mike Galbraith
Signed-off-by: Sebastian Andrzej Siewior
---
 drivers/block/zram/zram_drv.c | 36 ++++++++++++++++++++++++++++++++++++
 drivers/block/zram/zram_drv.h |  1 +
 2 files changed, 37 insertions(+)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 0636df6b67db..1a7523cefbe9 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -59,6 +59,40 @@ static void zram_free_page(struct zram *zram, size_t index);
 static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
 			  u32 index, int offset, struct bio *bio);
 
+#ifdef CONFIG_PREEMPT_RT
+static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages)
+{
+	size_t index;
+
+	for (index = 0; index < num_pages; index++)
+		spin_lock_init(&zram->table[index].lock);
+}
+
+static int zram_slot_trylock(struct zram *zram, u32 index)
+{
+	int ret;
+
+	ret = spin_trylock(&zram->table[index].lock);
+	if (ret)
+		__set_bit(ZRAM_LOCK, &zram->table[index].flags);
+	return ret;
+}
+
+static void zram_slot_lock(struct zram *zram, u32 index)
+{
+	spin_lock(&zram->table[index].lock);
+	__set_bit(ZRAM_LOCK, &zram->table[index].flags);
+}
+
+static void zram_slot_unlock(struct zram *zram, u32 index)
+{
+	__clear_bit(ZRAM_LOCK, &zram->table[index].flags);
+	spin_unlock(&zram->table[index].lock);
+}
+
+#else
+
+static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) { }
 
 static int zram_slot_trylock(struct zram *zram, u32 index)
 {
@@ -74,6 +108,7 @@ static void zram_slot_unlock(struct zram *zram, u32 index)
 {
 	bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags);
 }
+#endif
 
 static inline bool init_done(struct zram *zram)
 {
@@ -1165,6 +1200,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
 
 	if (!huge_class_size)
 		huge_class_size = zs_huge_class_size(zram->mem_pool);
+	zram_meta_init_table_locks(zram, num_pages);
 	return true;
 }
 
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index f2fd46daa760..7e4dd447e1dd 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -63,6 +63,7 @@ struct zram_table_entry {
 		unsigned long element;
 	};
 	unsigned long flags;
+	spinlock_t lock;
 #ifdef CONFIG_ZRAM_MEMORY_TRACKING
 	ktime_t ac_time;
 #endif
--
Gitee
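
For readers new to the RT patchset, the core of the change above: bit_spin_lock()
always busy-waits with preemption disabled, which is what makes it nondeterministic
on -rt, whereas a spinlock_t becomes a preemptible, rtmutex-backed lock under
PREEMPT_RT. A kernel-style sketch of that contrast follows; it is illustrative
only, is not meant to compile standalone, and obj_lock/obj_flags are made-up names.
Note also that the RT variants in the patch still maintain the ZRAM_LOCK bit under
the new lock, so code that inspects the flags word sees the same state as before.

    /* Illustrative sketch only -- not part of the patch above. */
    #include <linux/spinlock.h>
    #include <linux/bit_spinlock.h>

    static DEFINE_SPINLOCK(obj_lock);   /* rtmutex-backed on PREEMPT_RT */
    static unsigned long obj_flags;

    static void lock_obj_bitlock(void)
    {
            /* Always spins: preempt_disable() + busy-wait, RT or not. */
            bit_spin_lock(0, &obj_flags);
    }

    static void lock_obj_spinlock(void)
    {
            /* On PREEMPT_RT this may sleep instead of spinning. */
            spin_lock(&obj_lock);
    }
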
From 9102edf959d802a374b8c70c2a129b1e55a36e71 Mon Sep 17 00:00:00 2001
From: luzhongjun23
Date: Sun, 11 Jun 2023 20:27:57 +0800
Subject: [PATCH 2/8] tpm_tis: fix stall after iowrite*()s

commit 7be548e782e5edd1d6275899310aa657f787d756 upstream.

ioread8() operations to TPM MMIO addresses can stall the CPU when
immediately following a sequence of iowrite*()s to the same region.

For example, cyclictest measures ~400us latency spikes when a non-RT
usermode application communicates with an SPI-based TPM chip (Intel
Atom E3940 system, PREEMPT_RT kernel). The spikes are caused by a
stalling ioread8() operation following a sequence of 30+ iowrite8()s
to the same address. I believe this happens because the write sequence
is buffered (in the CPU or somewhere along the bus), and gets flushed
on the first LOAD instruction (ioread*()) that follows.

The enclosed change appears to fix this issue: read the TPM chip's
access register (status code) after every iowrite*() operation to
amortize the cost of flushing data to the chip across multiple
instructions.

Signed-off-by: Haris Okanovic
Signed-off-by: Sebastian Andrzej Siewior
---
 drivers/char/tpm/tpm_tis.c | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c
index 4ed6e660273a..c2bd0d40b5fc 100644
--- a/drivers/char/tpm/tpm_tis.c
+++ b/drivers/char/tpm/tpm_tis.c
@@ -50,6 +50,31 @@ static inline struct tpm_tis_tcg_phy *to_tpm_tis_tcg_phy(struct tpm_tis_data *da
 	return container_of(data, struct tpm_tis_tcg_phy, priv);
 }
 
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * Flushes previous write operations to chip so that a subsequent
+ * ioread*()s won't stall a cpu.
+ */
+static inline void tpm_tis_flush(void __iomem *iobase)
+{
+	ioread8(iobase + TPM_ACCESS(0));
+}
+#else
+#define tpm_tis_flush(iobase) do { } while (0)
+#endif
+
+static inline void tpm_tis_iowrite8(u8 b, void __iomem *iobase, u32 addr)
+{
+	iowrite8(b, iobase + addr);
+	tpm_tis_flush(iobase);
+}
+
+static inline void tpm_tis_iowrite32(u32 b, void __iomem *iobase, u32 addr)
+{
+	iowrite32(b, iobase + addr);
+	tpm_tis_flush(iobase);
+}
+
 static int interrupts = -1;
 module_param(interrupts, int, 0444);
 MODULE_PARM_DESC(interrupts, "Enable interrupts");
@@ -169,7 +194,7 @@ static int tpm_tcg_write_bytes(struct tpm_tis_data *data, u32 addr, u16 len,
 	struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data);
 
 	while (len--)
-		iowrite8(*value++, phy->iobase + addr);
+		tpm_tis_iowrite8(*value++, phy->iobase, addr);
 
 	return 0;
 }
@@ -196,7 +221,7 @@ static int tpm_tcg_write32(struct tpm_tis_data *data, u32 addr, u32 value)
 {
 	struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data);
 
-	iowrite32(value, phy->iobase + addr);
+	tpm_tis_iowrite32(value, phy->iobase, addr);
 
 	return 0;
 }
--
Gitee
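
The fix above is an instance of a general MMIO idiom: writes issued through
iowrite*() may be posted, and a read from the same device forces them out.
Reading a harmless register after each write spreads the flush cost across the
whole sequence instead of paying it all at the first ioread8(), which is what
produced the latency spike. A minimal generic sketch, illustrative only; the
device, iobase and register offsets here are hypothetical:

    /* Generic "read back to flush" pattern for posted MMIO writes. */
    static inline void dev_write8_flushed(u8 val, void __iomem *iobase,
                                          u32 reg, u32 harmless_ro_reg)
    {
            iowrite8(val, iobase + reg);             /* may be posted/buffered */
            (void)ioread8(iobase + harmless_ro_reg); /* a load forces the flush */
    }
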
From 218a03886745685caf23411722f1713162451c32 Mon Sep 17 00:00:00 2001
From: luzhongjun23
Date: Sun, 11 Jun 2023 20:30:41 +0800
Subject: [PATCH 3/8] signals: Allow rt tasks to cache one sigqueue struct

commit 24c619b1ce4a5276a3aa5ea4018c457ac3b13309 upstream.

To avoid allocation, allow rt tasks to cache one sigqueue struct in
task struct.

Signed-off-by: Thomas Gleixner
---
 include/linux/sched.h  |  1 +
 include/linux/signal.h |  1 +
 kernel/exit.c          |  2 +-
 kernel/fork.c          |  1 +
 kernel/signal.c        | 69 +++++++++++++++++++++++++++++++++++++++---
 5 files changed, 69 insertions(+), 5 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 50fa3b2ee06d..d59aa4f198d9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1095,6 +1095,7 @@ struct task_struct {
 	/* Signal handlers: */
 	struct signal_struct		*signal;
 	struct sighand_struct __rcu	*sighand;
+	struct sigqueue			*sigqueue_cache;
 	sigset_t			blocked;
 	sigset_t			real_blocked;
 	/* Restored if set_restore_sigmask() was used: */
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 3038a0610407..fff1656c6b6f 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -265,6 +265,7 @@ static inline void init_sigpending(struct sigpending *sig)
 }
 
 extern void flush_sigqueue(struct sigpending *queue);
+extern void flush_task_sigqueue(struct task_struct *tsk);
 
 /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
 static inline int valid_signal(unsigned long sig)
diff --git a/kernel/exit.c b/kernel/exit.c
index 68ed2393636b..71428ef49e00 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -155,7 +155,7 @@ static void __exit_signal(struct task_struct *tsk)
 	 * Do this under ->siglock, we can race with another thread
 	 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
 	 */
-	flush_sigqueue(&tsk->pending);
+	flush_task_sigqueue(tsk);
 	tsk->sighand = NULL;
 	spin_unlock(&sighand->siglock);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index ab5865ddbfe2..b4e29f27b682 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2073,6 +2073,7 @@ static __latent_entropy struct task_struct *copy_process(
 	spin_lock_init(&p->alloc_lock);
 
 	init_sigpending(&p->pending);
+	p->sigqueue_cache = NULL;
 
 	p->utime = p->stime = p->gtime = 0;
 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
diff --git a/kernel/signal.c b/kernel/signal.c
index 6896dd4da09a..3175452a5cfa 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -20,6 +20,7 @@
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
 #include <linux/sched/cputime.h>
+#include <linux/sched/rt.h>
 #include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/proc_fs.h>
@@ -405,13 +406,30 @@ void task_join_group_stop(struct task_struct *task)
 		task_set_jobctl_pending(task, mask | JOBCTL_STOP_PENDING);
 }
 
+static inline struct sigqueue *get_task_cache(struct task_struct *t)
+{
+	struct sigqueue *q = t->sigqueue_cache;
+
+	if (cmpxchg(&t->sigqueue_cache, q, NULL) != q)
+		return NULL;
+	return q;
+}
+
+static inline int put_task_cache(struct task_struct *t, struct sigqueue *q)
+{
+	if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL)
+		return 0;
+	return 1;
+}
+
 /*
  * allocate a new signal queue record
  * - this may be called without locks if and only if t == current, otherwise an
  *   appropriate lock must be held to stop the target task from exiting
  */
 static struct sigqueue *
-__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
+__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags,
+		    int override_rlimit, int fromslab)
 {
 	struct sigqueue *q = NULL;
 	struct user_struct *user;
@@ -433,7 +451,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
 	rcu_read_unlock();
 
 	if (override_rlimit ||
 	    likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
-		q = kmem_cache_alloc(sigqueue_cachep, flags);
+		if (!fromslab)
+			q = get_task_cache(t);
+		if (!q)
+			q = kmem_cache_alloc(sigqueue_cachep, flags);
 	} else {
 		print_dropped_signal(sig);
 	}
@@ -450,6 +471,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
 	return q;
 }
 
+static struct sigqueue *
+__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags,
+		 int override_rlimit)
+{
+	return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0);
+}
+
 static void __sigqueue_free(struct sigqueue *q)
 {
 	if (q->flags & SIGQUEUE_PREALLOC)
@@ -459,6 +487,21 @@ static void __sigqueue_free(struct sigqueue *q)
 	kmem_cache_free(sigqueue_cachep, q);
 }
 
+static void sigqueue_free_current(struct sigqueue *q)
+{
+	struct user_struct *up;
+
+	if (q->flags & SIGQUEUE_PREALLOC)
+		return;
+
+	up = q->user;
+	if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) {
+		atomic_dec(&up->sigpending);
+		free_uid(up);
+	} else
+		__sigqueue_free(q);
+}
+
 void flush_sigqueue(struct sigpending *queue)
 {
 	struct sigqueue *q;
@@ -471,6 +514,21 @@ void flush_sigqueue(struct sigpending *queue)
 	}
 }
 
+/*
+ * Called from __exit_signal. Flush tsk->pending and
+ * tsk->sigqueue_cache
+ */
+void flush_task_sigqueue(struct task_struct *tsk)
+{
+	struct sigqueue *q;
+
+	flush_sigqueue(&tsk->pending);
+
+	q = get_task_cache(tsk);
+	if (q)
+		kmem_cache_free(sigqueue_cachep, q);
+}
+
 /*
  * Flush all pending signals for this kthread.
  */
@@ -595,7 +653,7 @@ static void collect_signal(int sig, struct sigpending *list, kernel_siginfo_t *i
 			(info->si_code == SI_TIMER) &&
 			(info->si_sys_private);
 
-		__sigqueue_free(first);
+		sigqueue_free_current(first);
 	} else {
 		/*
 		 * Ok, it wasn't in the queue.  This must be
@@ -632,6 +690,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, kernel_siginfo_t *in
 	bool resched_timer = false;
 	int signr;
 
+	WARN_ON_ONCE(tsk != current);
+
 	/* We only dequeue private signals from ourselves, we don't let
 	 * signalfd steal them
 	 */
@@ -1808,7 +1868,8 @@ EXPORT_SYMBOL(kill_pid);
  */
 struct sigqueue *sigqueue_alloc(void)
 {
-	struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
+	/* Preallocated sigqueue objects always from the slabcache ! */
+	struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1);
 
 	if (q)
 		q->flags |= SIGQUEUE_PREALLOC;
--
Gitee
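
A userspace analog may help clarify the lockless one-slot cache introduced
above: get_task_cache()/put_task_cache() are just cmpxchg() races over a single
pointer. The following runnable sketch uses C11 atomics in place of the kernel's
cmpxchg(); it is illustrative only, not kernel code (build: cc -std=c11 demo.c):

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    static _Atomic(void *) slot_cache;

    static void *cache_get(void)
    {
            void *q = atomic_load(&slot_cache);

            /* Claim the cached object iff nobody raced us for it. */
            if (!q || !atomic_compare_exchange_strong(&slot_cache, &q, NULL))
                    return NULL;
            return q;
    }

    static int cache_put(void *q)
    {
            void *expected = NULL;

            /* Park the object iff the slot is empty; else the caller frees. */
            return atomic_compare_exchange_strong(&slot_cache, &expected, q) ? 0 : 1;
    }

    int main(void)
    {
            void *q = malloc(64);

            if (cache_put(q))
                    free(q);                /* slot was occupied: really free */

            void *again = cache_get();      /* reuses q without another malloc() */
            printf("cache hit: %s\n", again == q ? "yes" : "no");
            free(again);
            return 0;
    }
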
From 52d682807b6f74b498af85e25e2caeca359c5e06 Mon Sep 17 00:00:00 2001
From: luzhongjun23
Date: Sun, 11 Jun 2023 20:31:00 +0800
Subject: [PATCH 4/8] signal: Prevent double-free of user struct

commit 9e7c7f0309fdacd81a41a4643075bca20aefc7a9 upstream.

The way user struct reference counting works changed significantly with

  fda31c50292a ("signal: avoid double atomic counter increments for user
  accounting")

Now user structs are only freed once the last pending signal is
dequeued. Make sigqueue_free_current() follow this new convention to
avoid freeing the user struct multiple times and triggering this
warning:

  refcount_t: underflow; use-after-free.
  WARNING: CPU: 0 PID: 6794 at lib/refcount.c:288 refcount_dec_not_one+0x45/0x50

  Call Trace:
   refcount_dec_and_lock_irqsave+0x16/0x60
   free_uid+0x31/0xa0
   __dequeue_signal+0x17c/0x190
   dequeue_signal+0x5a/0x1b0
   do_sigtimedwait+0x208/0x250
   __x64_sys_rt_sigtimedwait+0x6f/0xd0
   do_syscall_64+0x72/0x200
   entry_SYSCALL_64_after_hwframe+0x49/0xbe

Signed-off-by: Matt Fleming
Reported-by: Daniel Wagner
Signed-off-by: Sebastian Andrzej Siewior
---
 kernel/signal.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index 3175452a5cfa..c0ec1343aee0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -496,8 +496,8 @@ static void sigqueue_free_current(struct sigqueue *q)
 
 	up = q->user;
 	if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) {
-		atomic_dec(&up->sigpending);
-		free_uid(up);
+		if (atomic_dec_and_test(&up->sigpending))
+			free_uid(up);
 	} else
 		__sigqueue_free(q);
 }
--
Gitee
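
The convention this fix adopts is the standard "free on the last put" refcount
pattern: the decrement and the zero test must be a single atomic operation, and
the release happens only on the transition to zero. A runnable illustration in
plain C follows; the struct here is hypothetical, not the kernel's user_struct
(build: cc -std=c11 demo.c):

    #include <stdatomic.h>
    #include <stdio.h>

    struct user {
            atomic_int sigpending;
    };

    static void put_user_ref(struct user *up)
    {
            /*
             * Decrement and test atomically; release only on the 1 -> 0
             * transition. An unconditional dec + free (the bug fixed above)
             * would release once per caller instead of once overall.
             */
            if (atomic_fetch_sub(&up->sigpending, 1) == 1)
                    printf("last reference dropped: free %p\n", (void *)up);
    }

    int main(void)
    {
            struct user u = { .sigpending = 2 };

            put_user_ref(&u);       /* still referenced, nothing freed */
            put_user_ref(&u);       /* "frees" exactly once */
            return 0;
    }
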
From 2023c1375aa03c796a8452c69bf6f96891384cb6 Mon Sep 17 00:00:00 2001
From: luzhongjun23
Date: Sun, 11 Jun 2023 20:35:26 +0800
Subject: [PATCH 5/8] genirq: Disable irqpoll on -rt

commit 0bd3f3fc362a13a983a3c8c3f0393d3606680b16 upstream.

The irqfixup/irqpoll mechanisms create long latencies for no value.

Signed-off-by: Ingo Molnar
Signed-off-by: Thomas Gleixner
---
 kernel/irq/spurious.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index f865e5f4d382..dc7311dd74b1 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -443,6 +443,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
 
 static int __init irqfixup_setup(char *str)
 {
+#ifdef CONFIG_PREEMPT_RT
+	pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT\n");
+	return 1;
+#endif
 	irqfixup = 1;
 	printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
 	printk(KERN_WARNING "This may impact system performance.\n");
@@ -455,6 +459,10 @@ module_param(irqfixup, int, 0644);
 
 static int __init irqpoll_setup(char *str)
 {
+#ifdef CONFIG_PREEMPT_RT
+	pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT\n");
+	return 1;
+#endif
 	irqfixup = 2;
 	printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
 	       "enabled\n");
--
Gitee

From cd6e3a96c1ebb80dfd883f67f03d5997cf85fb2d Mon Sep 17 00:00:00 2001
From: luzhongjun23
Date: Sun, 11 Jun 2023 20:35:35 +0800
Subject: [PATCH 6/8] sysfs: Add /sys/kernel/realtime entry

commit 8a6a69a711796651f613dbb1bbee304d2cbfc45c upstream.

Add a /sys/kernel entry to indicate that the kernel is a
realtime kernel.

Clark says that he needs this for udev rules: udev needs to evaluate
whether it's a PREEMPT_RT kernel a few thousand times, and parsing
uname output is too slow or so.

Are there better solutions? Should it exist and return 0 on !-rt?

Signed-off-by: Clark Williams
Signed-off-by: Peter Zijlstra
---
 kernel/ksysfs.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 35859da8bd4f..dfff31ed644a 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -138,6 +138,15 @@ KERNEL_ATTR_RO(vmcoreinfo);
 
 #endif /* CONFIG_CRASH_CORE */
 
+#if defined(CONFIG_PREEMPT_RT)
+static ssize_t realtime_show(struct kobject *kobj,
+			     struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", 1);
+}
+KERNEL_ATTR_RO(realtime);
+#endif
+
 /* whether file capabilities are enabled */
 static ssize_t fscaps_show(struct kobject *kobj,
 			   struct kobj_attribute *attr, char *buf)
@@ -228,6 +237,9 @@ static struct attribute * kernel_attrs[] = {
 #ifndef CONFIG_TINY_RCU
 	&rcu_expedited_attr.attr,
 	&rcu_normal_attr.attr,
+#endif
+#ifdef CONFIG_PREEMPT_RT
+	&realtime_attr.attr,
 #endif
 	NULL
 };
--
Gitee
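
As a usage sketch for the attribute added above: a helper such as udev could
test the new file with a single open/read instead of parsing uname output, and
on a !PREEMPT_RT kernel the attribute simply does not exist, so an open failure
means "not realtime". Illustrative userspace code, not part of the patch
(build: cc demo.c):

    #include <stdio.h>

    static int kernel_is_realtime(void)
    {
            FILE *f = fopen("/sys/kernel/realtime", "r");
            int rt = 0;

            if (!f)
                    return 0;       /* attribute absent: not an RT kernel */
            if (fscanf(f, "%d", &rt) != 1)
                    rt = 0;
            fclose(f);
            return rt == 1;
    }

    int main(void)
    {
            printf("realtime kernel: %s\n", kernel_is_realtime() ? "yes" : "no");
            return 0;
    }
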
From a49e4bcc9b636c59b1038f934e0027e587629bce Mon Sep 17 00:00:00 2001
From: luzhongjun23
Date: Sun, 11 Jun 2023 20:35:57 +0800
Subject: [PATCH 7/8] net: xfrm: Use sequence counter with associated spinlock

commit 624a1127c6286b21a24111b4ab00acfdbc8c9528 upstream.

A sequence counter write section must be serialized or its internal
state can get corrupted. A plain seqcount_t does not contain the
information of which lock must be held to guarantee write side
serialization.

For xfrm_state_hash_generation, use seqcount_spinlock_t instead of
plain seqcount_t. This allows the spinlock used for write serialization
to be associated with the sequence counter. It thus enables lockdep to
verify that the write serialization lock is indeed held before entering
the sequence counter write section.

If lockdep is disabled, this lock association is compiled out and has
neither storage size nor runtime overhead.

Signed-off-by: Ahmed S. Darwish
Signed-off-by: Steffen Klassert
---
 include/net/netns/xfrm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h
index e18feffdade1..0ddc1e66ed83 100644
--- a/include/net/netns/xfrm.h
+++ b/include/net/netns/xfrm.h
@@ -77,7 +77,7 @@ struct netns_xfrm {
 	struct dst_ops		xfrm6_dst_ops;
 #endif
 	spinlock_t		xfrm_state_lock;
-	seqcount_t		xfrm_state_hash_generation;
+	seqcount_spinlock_t	xfrm_state_hash_generation;
 	seqcount_spinlock_t	xfrm_policy_hash_generation;
 
 	spinlock_t		xfrm_policy_lock;
--
Gitee

From 7cec1b1914d4ed6ae04d33eaa12942891ed25f51 Mon Sep 17 00:00:00 2001
From: luzhongjun23
Date: Sun, 11 Jun 2023 20:36:08 +0800
Subject: [PATCH 8/8] Add localversion for -RT release

commit 7166b400d601acc616ff6534ab5ca71f47362568 upstream.

Signed-off-by: Thomas Gleixner
---
 localversion-rt | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 localversion-rt

diff --git a/localversion-rt b/localversion-rt
new file mode 100644
index 000000000000..21988f9ad53f
--- /dev/null
+++ b/localversion-rt
@@ -0,0 +1 @@
+-rt34
--
Gitee
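
With the series applied, the localversion-rt file above makes the -rt34 suffix
part of the kernel release string (e.g. a release ending in "-rt34"). An
illustrative userspace check, not part of the series (build: cc demo.c):

    #include <stdio.h>
    #include <string.h>
    #include <sys/utsname.h>

    int main(void)
    {
            struct utsname u;

            if (uname(&u) != 0)
                    return 1;
            /* localversion-rt appends "-rt34" to the release string. */
            printf("release: %s (-rt suffix: %s)\n", u.release,
                   strstr(u.release, "-rt") ? "present" : "absent");
            return 0;
    }
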