From 2403e8044f222e7c816fb2416661f5f469662973 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 21 Mar 2022 18:41:46 -0700 Subject: [PATCH 01/75] rcu: Make normal polling GP be more precise about sequence numbers Currently, poll_state_synchronize_rcu() uses rcu_seq_done() to check whether the specified grace period has completed. However, rcu_seq_done() does a simple comparison that reserves half of the sequence-number space for uncompleted grace periods. This has the unfortunate side-effect of not handling sequence-number wrap gracefully. Of course, one can argue that if someone has already waited for half of the full range of grace periods, they can wait for the other half, but why wait at all in this case? This commit therefore creates a rcu_seq_done_exact() that counts as uncompleted only the two grace periods during which the sequence number might have been handed out, while still being uncompleted. This way, if sequence-number wrap happens to hit that range, at most two additional grace periods need be waited for. This commit is in preparation for polled expedited grace periods. Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/ Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing Cc: Brian Foster Cc: Dave Chinner Cc: Al Viro Cc: Ian Kent Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 12 ++++++++++++ kernel/rcu/tree.c | 4 ++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 4916077119f3..0adb55941aeb 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -119,6 +119,18 @@ static inline bool rcu_seq_done(unsigned long *sp, unsigned long s) return ULONG_CMP_GE(READ_ONCE(*sp), s); } +/* + * Given a snapshot from rcu_seq_snap(), determine whether or not a + * full update-side operation has occurred, but do not allow the + * (ULONG_MAX / 2) safety-factor/guard-band. + */ +static inline bool rcu_seq_done_exact(unsigned long *sp, unsigned long s) +{ + unsigned long cur_s = READ_ONCE(*sp); + + return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_STATE_MASK + 1)); +} + /* * Has a grace period completed since the time the old gp_seq was collected? */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c25ba442044a..ec28e259774e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3911,7 +3911,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); * * Yes, this function does not take counter wrap into account. * But counter wrap is harmless. If the counter wraps, we have waited for - * more than 2 billion grace periods (and way more on a 64-bit system!). + * more than a billion grace periods (and way more on a 64-bit system!). * Those needing to keep oldstate values for very long time periods * (many hours even on 32-bit systems) should check them occasionally * and either refresh them or set a flag indicating that the grace period @@ -3924,7 +3924,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); */ bool poll_state_synchronize_rcu(unsigned long oldstate) { - if (rcu_seq_done(&rcu_state.gp_seq, oldstate)) { + if (rcu_seq_done_exact(&rcu_state.gp_seq, oldstate)) { smp_mb(); /* Ensure GP ends before subsequent accesses. */ return true; } From 414c12385d4741e35d88670c6cc2f40a77809734 Mon Sep 17 00:00:00 2001 From: "Paul E.
McKenney" Date: Wed, 13 Apr 2022 15:17:25 -0700 Subject: [PATCH 02/75] rcu: Provide a get_completed_synchronize_rcu() function It is currently up to the caller to handle stale return values from get_state_synchronize_rcu(). If poll_state_synchronize_rcu() returned true once, a grace period has elapsed, regardless of the fact that counter wrap might cause some future poll_state_synchronize_rcu() invocation to return false. For example, the caller might store a separate flag that indicates whether some previous call to poll_state_synchronize_rcu() determined that the relevant grace period had already ended. This approach works, but it requires extra storage and is easy to get wrong. This commit therefore introduces a get_completed_synchronize_rcu() that returns a cookie that causes poll_state_synchronize_rcu() to always return true. This already-completed cookie can be stored in place of the cookie that previously caused poll_state_synchronize_rcu() to return true. It can also be used to flag a given structure as not having been exposed to readers, and thus not requiring a grace period to elapse. This commit is in preparation for polled expedited grace periods. Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/ Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing Cc: Brian Foster Cc: Dave Chinner Cc: Al Viro Cc: Ian Kent Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 1 + kernel/rcu/rcu.h | 3 +++ kernel/rcu/tiny.c | 4 ++-- kernel/rcu/tree.c | 3 ++- kernel/rcu/update.c | 13 +++++++++++++ 5 files changed, 21 insertions(+), 3 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 1a32036c918c..7f12daa4708b 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -41,6 +41,7 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func); void rcu_barrier_tasks(void); void rcu_barrier_tasks_rude(void); void synchronize_rcu(void); +unsigned long get_completed_synchronize_rcu(void); #ifdef CONFIG_PREEMPT_RCU diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 0adb55941aeb..32291f4eefde 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -23,6 +23,9 @@ #define RCU_SEQ_CTR_SHIFT 2 #define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1) +/* Low-order bit definition for polled grace-period APIs. 
*/ +#define RCU_GET_STATE_COMPLETED 0x1 + extern int sysctl_sched_rt_runtime; /* diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 340b3f8b090d..dbee6bea6726 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -58,7 +58,7 @@ void rcu_qs(void) rcu_ctrlblk.donetail = rcu_ctrlblk.curtail; raise_softirq_irqoff(RCU_SOFTIRQ); } - WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 1); + WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2); local_irq_restore(flags); } @@ -213,7 +213,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); */ bool poll_state_synchronize_rcu(unsigned long oldstate) { - return READ_ONCE(rcu_ctrlblk.gp_seq) != oldstate; + return oldstate == RCU_GET_STATE_COMPLETED || READ_ONCE(rcu_ctrlblk.gp_seq) != oldstate; } EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ec28e259774e..46cfceea8784 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3924,7 +3924,8 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); */ bool poll_state_synchronize_rcu(unsigned long oldstate) { - if (rcu_seq_done_exact(&rcu_state.gp_seq, oldstate)) { + if (oldstate == RCU_GET_STATE_COMPLETED || + rcu_seq_done_exact(&rcu_state.gp_seq, oldstate)) { smp_mb(); /* Ensure GP ends before subsequent accesses. */ return true; } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index fc7fef575606..2e93acad1e31 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -516,6 +516,19 @@ int rcu_cpu_stall_suppress_at_boot __read_mostly; // !0 = suppress boot stalls. EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress_at_boot); module_param(rcu_cpu_stall_suppress_at_boot, int, 0444); +/** + * get_completed_synchronize_rcu - Return a pre-completed polled state cookie + * + * Returns a value that will always be treated by functions like + * poll_state_synchronize_rcu() as a cookie whose grace period has already + * completed. + */ +unsigned long get_completed_synchronize_rcu(void) +{ + return RCU_GET_STATE_COMPLETED; +} +EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu); + #ifdef CONFIG_PROVE_RCU /* From d0eac20f9909de044fbeb958960b08ff235cf8a2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 13 Apr 2022 16:14:02 -0700 Subject: [PATCH 03/75] rcutorture: Validate get_completed_synchronize_rcu() This commit verifies that the RCU grace-period state cookie returned from get_completed_synchronize_rcu() causes poll_state_synchronize_rcu() to return true, as required. This commit is in preparation for polled expedited grace periods. Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/ Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing Cc: Brian Foster Cc: Dave Chinner Cc: Al Viro Cc: Ian Kent Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcutorture.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7120165a9342..4ceec9f4169c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -338,6 +338,7 @@ struct rcu_torture_ops { void (*sync)(void); void (*exp_sync)(void); unsigned long (*get_gp_state)(void); + unsigned long (*get_gp_completed)(void); unsigned long (*start_gp_poll)(void); bool (*poll_gp_state)(unsigned long oldstate); void (*cond_sync)(unsigned long oldstate); @@ -504,6 +505,7 @@ static struct rcu_torture_ops rcu_ops = { .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, .get_gp_state = get_state_synchronize_rcu, + .get_gp_completed = get_completed_synchronize_rcu, .start_gp_poll = start_poll_synchronize_rcu, .poll_gp_state = poll_state_synchronize_rcu, .cond_sync = cond_synchronize_rcu, @@ -1254,6 +1256,10 @@ rcu_torture_writer(void *arg) rcu_torture_writer_state_getname(), rcu_torture_writer_state, cookie, cur_ops->get_gp_state()); + if (cur_ops->get_gp_completed) { + cookie = cur_ops->get_gp_completed(); + WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie)); + } cur_ops->readunlock(idx); } switch (synctype[torture_random(&rand) % nsynctypes]) { From 4cf0585c4d663654fefa0b359f1908b5cd72802b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 6 Dec 2021 16:19:40 -0800 Subject: [PATCH 04/75] rcu-tasks: Check for abandoned callbacks This commit adds a debugging scan for callbacks that got lost during a callback-queueing transition. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 3925e32159b5..b8690a412c5b 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -439,6 +439,11 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) WRITE_ONCE(rtp->percpu_dequeue_lim, 1); pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name); } + for (cpu = rtp->percpu_dequeue_lim; cpu < nr_cpu_ids; cpu++) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + WARN_ON_ONCE(rcu_segcblist_n_cbs(&rtpcp->cblist)); + } raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); } From d96225fd09ffb3a6424cebe01c8b1e1bddc30f07 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Apr 2022 10:47:28 -0700 Subject: [PATCH 05/75] rcu-tasks: Split rcu_tasks_one_gp() from rcu_tasks_kthread() This commit abstracts most of the rcu_tasks_kthread() function's loop body into a new rcu_tasks_one_gp() function. It also introduces a new ->tasks_gp_mutex to synchronize concurrent calls to this new rcu_tasks_one_gp() function. This commit is preparation for allowing RCU tasks grace periods to be driven by the calling task during the mid-boot dead zone. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 58 ++++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index b8690a412c5b..d7b12f524e81 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -48,6 +48,7 @@ struct rcu_tasks_percpu { * struct rcu_tasks - Definition for a Tasks-RCU-like mechanism. * @cbs_wait: RCU wait allowing a new callback to get kthread's attention. * @cbs_gbl_lock: Lock protecting callback list. + * @tasks_gp_mutex: Mutex protecting grace period, needed during mid-boot dead zone. * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. * @gp_func: This flavor's grace-period-wait function. 
* @gp_state: Grace period's most recent state transition (debugging). @@ -79,6 +80,7 @@ struct rcu_tasks_percpu { struct rcu_tasks { struct rcuwait cbs_wait; raw_spinlock_t cbs_gbl_lock; + struct mutex tasks_gp_mutex; int gp_state; int gp_sleep; int init_fract; @@ -119,6 +121,7 @@ static struct rcu_tasks rt_name = \ { \ .cbs_wait = __RCUWAIT_INITIALIZER(rt_name.wait), \ .cbs_gbl_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_gbl_lock), \ + .tasks_gp_mutex = __MUTEX_INITIALIZER(rt_name.tasks_gp_mutex), \ .gp_func = gp, \ .call_func = call, \ .rtpcpu = &rt_name ## __percpu, \ @@ -502,10 +505,37 @@ static void rcu_tasks_invoke_cbs_wq(struct work_struct *wp) rcu_tasks_invoke_cbs(rtp, rtpcp); } -/* RCU-tasks kthread that detects grace periods and invokes callbacks. */ -static int __noreturn rcu_tasks_kthread(void *arg) +// Wait for one grace period. +static void rcu_tasks_one_gp(struct rcu_tasks *rtp) { int needgpcb; + + mutex_lock(&rtp->tasks_gp_mutex); + set_tasks_gp_state(rtp, RTGS_WAIT_CBS); + + // If there were none, wait a bit and start over. + rcuwait_wait_event(&rtp->cbs_wait, + (needgpcb = rcu_tasks_need_gpcb(rtp)), + TASK_IDLE); + + if (needgpcb & 0x2) { + // Wait for one grace period. + set_tasks_gp_state(rtp, RTGS_WAIT_GP); + rtp->gp_start = jiffies; + rcu_seq_start(&rtp->tasks_gp_seq); + rtp->gp_func(rtp); + rcu_seq_end(&rtp->tasks_gp_seq); + } + + // Invoke callbacks. + set_tasks_gp_state(rtp, RTGS_INVOKE_CBS); + rcu_tasks_invoke_cbs(rtp, per_cpu_ptr(rtp->rtpcpu, 0)); + mutex_unlock(&rtp->tasks_gp_mutex); +} + +// RCU-tasks kthread that detects grace periods and invokes callbacks. +static int __noreturn rcu_tasks_kthread(void *arg) +{ struct rcu_tasks *rtp = arg; /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ @@ -519,27 +549,11 @@ static int __noreturn rcu_tasks_kthread(void *arg) * This loop is terminated by the system going down. ;-) */ for (;;) { - set_tasks_gp_state(rtp, RTGS_WAIT_CBS); + // Wait for one grace period and invoke any callbacks + // that are ready. + rcu_tasks_one_gp(rtp); - /* If there were none, wait a bit and start over. */ - rcuwait_wait_event(&rtp->cbs_wait, - (needgpcb = rcu_tasks_need_gpcb(rtp)), - TASK_IDLE); - - if (needgpcb & 0x2) { - // Wait for one grace period. - set_tasks_gp_state(rtp, RTGS_WAIT_GP); - rtp->gp_start = jiffies; - rcu_seq_start(&rtp->tasks_gp_seq); - rtp->gp_func(rtp); - rcu_seq_end(&rtp->tasks_gp_seq); - } - - /* Invoke callbacks. */ - set_tasks_gp_state(rtp, RTGS_INVOKE_CBS); - rcu_tasks_invoke_cbs(rtp, per_cpu_ptr(rtp->rtpcpu, 0)); - - /* Paranoid sleep to keep this from entering a tight loop */ + // Paranoid sleep to keep this from entering a tight loop. schedule_timeout_idle(rtp->gp_sleep); } } From 68cb47204db478302a37adb2aaa26a05882dbaa1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Apr 2022 11:06:03 -0700 Subject: [PATCH 06/75] rcu-tasks: Move synchronize_rcu_tasks_generic() down This is strictly a code-motion commit that moves the synchronize_rcu_tasks_generic() down to where it can invoke rcu_tasks_one_gp() without the need for a forward declaration. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tasks.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d7b12f524e81..ad993c4ed924 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -326,17 +326,6 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, irq_work_queue(&rtpcp->rtp_irq_work); } -// Wait for a grace period for the specified flavor of Tasks RCU. -static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) -{ - /* Complain if the scheduler has not started. */ - RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, - "synchronize_rcu_tasks called too soon"); - - /* Wait for the grace period. */ - wait_rcu_gp(rtp->call_func); -} - // RCU callback function for rcu_barrier_tasks_generic(). static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp) { @@ -558,6 +547,17 @@ static int __noreturn rcu_tasks_kthread(void *arg) } } +// Wait for a grace period for the specified flavor of Tasks RCU. +static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) +{ + /* Complain if the scheduler has not started. */ + RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, + "synchronize_rcu_tasks called too soon"); + + /* Wait for the grace period. */ + wait_rcu_gp(rtp->call_func); +} + /* Spawn RCU-tasks grace-period kthread. */ static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp) { From 4a8cc433b8bf3106cf7b1173a936c62d77b40b40 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Apr 2022 15:41:38 -0700 Subject: [PATCH 07/75] rcu-tasks: Drive synchronous grace periods from calling task This commit causes synchronous grace periods to be driven from the task invoking synchronize_rcu_*(), allowing these functions to be invoked from the mid-boot dead zone extending from when the scheduler was initialized to the point that the various RCU tasks grace-period kthreads are spawned. This change will allow the self-tests to run in a consistent manner. Reported-by: Matthew Wilcox Reported-by: Zhouyi Zhou Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index ad993c4ed924..bd9f2e24f5c7 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -495,17 +495,21 @@ static void rcu_tasks_invoke_cbs_wq(struct work_struct *wp) } // Wait for one grace period. -static void rcu_tasks_one_gp(struct rcu_tasks *rtp) +static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot) { int needgpcb; mutex_lock(&rtp->tasks_gp_mutex); - set_tasks_gp_state(rtp, RTGS_WAIT_CBS); // If there were none, wait a bit and start over. - rcuwait_wait_event(&rtp->cbs_wait, - (needgpcb = rcu_tasks_need_gpcb(rtp)), - TASK_IDLE); + if (unlikely(midboot)) { + needgpcb = 0x2; + } else { + set_tasks_gp_state(rtp, RTGS_WAIT_CBS); + rcuwait_wait_event(&rtp->cbs_wait, + (needgpcb = rcu_tasks_need_gpcb(rtp)), + TASK_IDLE); + } if (needgpcb & 0x2) { // Wait for one grace period. @@ -540,7 +544,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) for (;;) { // Wait for one grace period and invoke any callbacks // that are ready. - rcu_tasks_one_gp(rtp); + rcu_tasks_one_gp(rtp, false); // Paranoid sleep to keep this from entering a tight loop.
schedule_timeout_idle(rtp->gp_sleep); @@ -554,8 +558,12 @@ static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, "synchronize_rcu_tasks called too soon"); - /* Wait for the grace period. */ - wait_rcu_gp(rtp->call_func); + // If the grace-period kthread is running, use it. + if (READ_ONCE(rtp->kthread_ptr)) { + wait_rcu_gp(rtp->call_func); + return; + } + rcu_tasks_one_gp(rtp, true); } /* Spawn RCU-tasks grace-period kthread. */ From 3847b64570b1753e9863a4106aec03f4d68e7b17 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 May 2022 20:50:11 -0700 Subject: [PATCH 08/75] rcu-tasks: Merge state into .b.need_qs and atomically update This commit gets rid of the task_struct structure's ->trc_reader_checked field, making it instead be a bit within the task_struct structure's existing ->trc_reader_special.b.need_qs field. This commit also atomically loads, stores, and checks the resulting combination of the reader-checked and need-quiescent state flags. This will in turn allow significant simplification of the rcu_tasks_trace_postgp() function as well as elimination of the trc_n_readers_need_end counter in later commits. These changes will in turn simplify later elimination of the RCU Tasks Trace scan of the task list, which will make RCU Tasks Trace grace periods less CPU-intensive. [ paulmck: Apply kernel test robot feedback. ] Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- include/linux/rcupdate.h | 18 ++++--- include/linux/sched.h | 1 - kernel/rcu/tasks.h | 103 +++++++++++++++++++++++++++------------ 3 files changed, 82 insertions(+), 40 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 1a32036c918c..1e728d544fc1 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -169,13 +169,17 @@ void synchronize_rcu_tasks(void); # endif # ifdef CONFIG_TASKS_TRACE_RCU -# define rcu_tasks_trace_qs(t) \ - do { \ - if (!likely(READ_ONCE((t)->trc_reader_checked)) && \ - !unlikely(READ_ONCE((t)->trc_reader_nesting))) { \ - smp_store_release(&(t)->trc_reader_checked, true); \ - smp_mb(); /* Readers partitioned by store. */ \ - } \ +// Bits for ->trc_reader_special.b.need_qs field. +#define TRC_NEED_QS 0x1 // Task needs a quiescent state. +#define TRC_NEED_QS_CHECKED 0x2 // Task has been checked for needing quiescent state. 
+ +u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new); + +# define rcu_tasks_trace_qs(t) \ + do { \ + if (likely(!READ_ONCE((t)->trc_reader_special.b.need_qs)) && \ + likely(!READ_ONCE((t)->trc_reader_nesting))) \ + rcu_trc_cmpxchg_need_qs((t), 0, TRC_NEED_QS_CHECKED); \ } while (0) # else # define rcu_tasks_trace_qs(t) do { } while (0) diff --git a/include/linux/sched.h b/include/linux/sched.h index c46f3a63b758..e6eb5871593e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -843,7 +843,6 @@ struct task_struct { int trc_reader_nesting; int trc_ipi_to_cpu; union rcu_special trc_reader_special; - bool trc_reader_checked; struct list_head trc_holdout_list; #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index bd9f2e24f5c7..7bdc62606816 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1208,6 +1208,39 @@ void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, "RCU Tasks Trace"); +/* Load from ->trc_reader_special.b.need_qs with proper ordering. */ +static u8 rcu_ld_need_qs(struct task_struct *t) +{ + smp_mb(); // Enforce full grace-period ordering. + return smp_load_acquire(&t->trc_reader_special.b.need_qs); +} + +/* Store to ->trc_reader_special.b.need_qs with proper ordering. */ +static void rcu_st_need_qs(struct task_struct *t, u8 v) +{ + smp_store_release(&t->trc_reader_special.b.need_qs, v); + smp_mb(); // Enforce full grace-period ordering. +} + +/* + * Do a cmpxchg() on ->trc_reader_special.b.need_qs, allowing for + * the four-byte operand-size restriction of some platforms. + * Returns the old value, which is often ignored. + */ +u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new) +{ + union rcu_special ret; + union rcu_special trs_old = READ_ONCE(t->trc_reader_special); + union rcu_special trs_new = trs_old; + + if (trs_old.b.need_qs != old) + return trs_old.b.need_qs; + trs_new.b.need_qs = new; + ret.s = cmpxchg(&t->trc_reader_special.s, trs_old.s, trs_new.s); + return ret.b.need_qs; +} +EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs); + /* * This irq_work handler allows rcu_read_unlock_trace() to be invoked * while the scheduler locks are held. @@ -1221,16 +1254,20 @@ static DEFINE_IRQ_WORK(rcu_tasks_trace_iw, rcu_read_unlock_iw); /* If we are the last reader, wake up the grace-period kthread. */ void rcu_read_unlock_trace_special(struct task_struct *t) { - int nq = READ_ONCE(t->trc_reader_special.b.need_qs); + int nqs = (rcu_ld_need_qs(t) == (TRC_NEED_QS_CHECKED | TRC_NEED_QS)); - if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && - t->trc_reader_special.b.need_mb) + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && t->trc_reader_special.b.need_mb) smp_mb(); // Pairs with update-side barriers. // Update .need_qs before ->trc_reader_nesting for irq/NMI handlers. 
- if (nq) - WRITE_ONCE(t->trc_reader_special.b.need_qs, false); + if (nqs) { + u8 result = rcu_trc_cmpxchg_need_qs(t, TRC_NEED_QS_CHECKED | TRC_NEED_QS, + TRC_NEED_QS_CHECKED); + + WARN_ONCE(result != (TRC_NEED_QS_CHECKED | TRC_NEED_QS), + "%s: result = %d", __func__, result); + } WRITE_ONCE(t->trc_reader_nesting, 0); - if (nq && atomic_dec_and_test(&trc_n_readers_need_end)) + if (nqs && atomic_dec_and_test(&trc_n_readers_need_end)) irq_work_queue(&rcu_tasks_trace_iw); } EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special); @@ -1260,27 +1297,24 @@ static void trc_read_check_handler(void *t_in) struct task_struct *texp = t_in; // If the task is no longer running on this CPU, leave. - if (unlikely(texp != t)) { + if (unlikely(texp != t)) goto reset_ipi; // Already on holdout list, so will check later. - } // If the task is not in a read-side critical section, and // if this is the last reader, awaken the grace-period kthread. if (likely(!READ_ONCE(t->trc_reader_nesting))) { - WRITE_ONCE(t->trc_reader_checked, true); + rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); goto reset_ipi; } // If we are racing with an rcu_read_unlock_trace(), try again later. if (unlikely(READ_ONCE(t->trc_reader_nesting) < 0)) goto reset_ipi; - WRITE_ONCE(t->trc_reader_checked, true); // Get here if the task is in a read-side critical section. Set // its state so that it will awaken the grace-period kthread upon // exit from that critical section. - atomic_inc(&trc_n_readers_need_end); // One more to wait on. - WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)); - WRITE_ONCE(t->trc_reader_special.b.need_qs, true); + if (!rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED)) + atomic_inc(&trc_n_readers_need_end); // One more to wait on. reset_ipi: // Allow future IPIs to be sent on CPU and for task. @@ -1291,8 +1325,9 @@ reset_ipi: } /* Callback function for scheduler to check locked-down task. */ -static int trc_inspect_reader(struct task_struct *t, void *arg) +static int trc_inspect_reader(struct task_struct *t, void *bhp_in) { + struct list_head *bhp = bhp_in; int cpu = task_cpu(t); int nesting; bool ofl = cpu_is_offline(cpu); @@ -1323,16 +1358,19 @@ static int trc_inspect_reader(struct task_struct *t, void *arg) // If not exiting a read-side critical section, mark as checked // so that the grace-period kthread will remove it from the // holdout list. - t->trc_reader_checked = nesting >= 0; - if (nesting <= 0) + if (nesting <= 0) { + if (!nesting) + rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); return nesting ? -EINVAL : 0; // If in QS, done, otherwise try again later. + } // The task is in a read-side critical section, so set up its // state so that it will awaken the grace-period kthread upon exit // from that critical section. - atomic_inc(&trc_n_readers_need_end); // One more to wait on. - WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)); - WRITE_ONCE(t->trc_reader_special.b.need_qs, true); + if (!rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED)) { + atomic_inc(&trc_n_readers_need_end); // One more to wait on. + trc_add_holdout(t, bhp); + } return 0; } @@ -1348,14 +1386,14 @@ static void trc_wait_for_one_reader(struct task_struct *t, // The current task had better be in a quiescent state. if (t == current) { - t->trc_reader_checked = true; + rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting)); return; } // Attempt to nail down the task for inspection. 
get_task_struct(t); - if (!task_call_func(t, trc_inspect_reader, NULL)) { + if (!task_call_func(t, trc_inspect_reader, bhp)) { put_task_struct(t); return; } @@ -1419,8 +1457,7 @@ static void rcu_tasks_trace_pertask(struct task_struct *t, if (unlikely(t == NULL)) return; - WRITE_ONCE(t->trc_reader_special.b.need_qs, false); - WRITE_ONCE(t->trc_reader_checked, false); + rcu_st_need_qs(t, 0); t->trc_ipi_to_cpu = -1; trc_wait_for_one_reader(t, hop); } @@ -1442,7 +1479,8 @@ static void rcu_tasks_trace_postscan(struct list_head *hop) // Wait for late-stage exiting tasks to finish exiting. // These might have passed the call to exit_tasks_rcu_finish(). synchronize_rcu(); - // Any tasks that exit after this point will set ->trc_reader_checked. + // Any tasks that exit after this point will set + // TRC_NEED_QS_CHECKED in ->trc_reader_special.b.need_qs. } /* Communicate task state back to the RCU tasks trace stall warning request. */ @@ -1460,7 +1498,7 @@ static int trc_check_slow_task(struct task_struct *t, void *arg) return false; // It is running, so decline to inspect it. trc_rdrp->nesting = READ_ONCE(t->trc_reader_nesting); trc_rdrp->ipi_to_cpu = READ_ONCE(t->trc_ipi_to_cpu); - trc_rdrp->needqs = READ_ONCE(t->trc_reader_special.b.need_qs); + trc_rdrp->needqs = rcu_ld_need_qs(t); return true; } @@ -1514,12 +1552,12 @@ static void check_all_holdout_tasks_trace(struct list_head *hop, list_for_each_entry_safe(t, g, hop, trc_holdout_list) { // If safe and needed, try to check the current task. if (READ_ONCE(t->trc_ipi_to_cpu) == -1 && - !READ_ONCE(t->trc_reader_checked)) + !(rcu_ld_need_qs(t) & TRC_NEED_QS_CHECKED)) trc_wait_for_one_reader(t, hop); // If check succeeded, remove this task from the list. if (smp_load_acquire(&t->trc_ipi_to_cpu) == -1 && - READ_ONCE(t->trc_reader_checked)) + rcu_ld_need_qs(t) == TRC_NEED_QS_CHECKED) trc_del_holdout(t); else if (needreport) show_stalled_task_trace(t, firstreport); @@ -1574,12 +1612,12 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) // Stall warning time, so make a list of the offenders. rcu_read_lock(); for_each_process_thread(g, t) - if (READ_ONCE(t->trc_reader_special.b.need_qs)) + if (rcu_ld_need_qs(t) & TRC_NEED_QS) trc_add_holdout(t, &holdouts); rcu_read_unlock(); firstreport = true; list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) { - if (READ_ONCE(t->trc_reader_special.b.need_qs)) + if (rcu_ld_need_qs(t) & TRC_NEED_QS) show_stalled_task_trace(t, &firstreport); trc_del_holdout(t); // Release task_struct reference. } @@ -1595,11 +1633,12 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) /* Report any needed quiescent state for this exiting task. */ static void exit_tasks_rcu_finish_trace(struct task_struct *t) { - WRITE_ONCE(t->trc_reader_checked, true); + rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting)); - WRITE_ONCE(t->trc_reader_nesting, 0); - if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs))) + if (WARN_ON_ONCE(rcu_ld_need_qs(t) & TRC_NEED_QS)) rcu_read_unlock_trace_special(t); + else + WRITE_ONCE(t->trc_reader_nesting, 0); } /** From 550611269b153dc17b44fa2d692c30f628d1c65c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 May 2022 13:37:36 -0700 Subject: [PATCH 09/75] rcu-tasks: Remove rcu_tasks_trace_postgp() wait for counter Now that tasks are not removed from the list until they have responded to any needed request for a quiescent state, it is no longer necessary to wait for the trc_n_readers_need_end counter to go to zero. 
This commit therefore removes that waiting code. It is therefore also no longer necessary for rcu_tasks_trace_postgp() to do the final decrement of this counter, so that code is also removed. This in turn means that trc_n_readers_need_end counter itself can be removed, as can the rcu_tasks_trace_iw irq_work structure and the rcu_read_unlock_iw() function. [ paulmck: Apply feedback from Zqiang. ] Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 62 +++------------------------------------------- 1 file changed, 3 insertions(+), 59 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 7bdc62606816..561d24f7f73c 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1192,9 +1192,6 @@ EXPORT_SYMBOL_GPL(rcu_trace_lock_map); #ifdef CONFIG_TASKS_TRACE_RCU -static atomic_t trc_n_readers_need_end; // Number of waited-for readers. -static DECLARE_WAIT_QUEUE_HEAD(trc_wait); // List of holdout tasks. - // Record outstanding IPIs to each CPU. No point in sending two... static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); @@ -1241,16 +1238,6 @@ u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new) } EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs); -/* - * This irq_work handler allows rcu_read_unlock_trace() to be invoked - * while the scheduler locks are held. - */ -static void rcu_read_unlock_iw(struct irq_work *iwp) -{ - wake_up(&trc_wait); -} -static DEFINE_IRQ_WORK(rcu_tasks_trace_iw, rcu_read_unlock_iw); - /* If we are the last reader, wake up the grace-period kthread. */ void rcu_read_unlock_trace_special(struct task_struct *t) { @@ -1267,8 +1254,6 @@ void rcu_read_unlock_trace_special(struct task_struct *t) "%s: result = %d", __func__, result); } WRITE_ONCE(t->trc_reader_nesting, 0); - if (nqs && atomic_dec_and_test(&trc_n_readers_need_end)) - irq_work_queue(&rcu_tasks_trace_iw); } EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special); @@ -1313,8 +1298,7 @@ static void trc_read_check_handler(void *t_in) // Get here if the task is in a read-side critical section. Set // its state so that it will awaken the grace-period kthread upon // exit from that critical section. - if (!rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED)) - atomic_inc(&trc_n_readers_need_end); // One more to wait on. + rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED); reset_ipi: // Allow future IPIs to be sent on CPU and for task. @@ -1367,10 +1351,8 @@ static int trc_inspect_reader(struct task_struct *t, void *bhp_in) // The task is in a read-side critical section, so set up its // state so that it will awaken the grace-period kthread upon exit // from that critical section. - if (!rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED)) { - atomic_inc(&trc_n_readers_need_end); // One more to wait on. + if (!rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED)) trc_add_holdout(t, bhp); - } return 0; } @@ -1436,9 +1418,6 @@ static void rcu_tasks_trace_pregp_step(void) { int cpu; - // Allow for fast-acting IPIs. - atomic_set(&trc_n_readers_need_end, 1); - // There shouldn't be any old IPIs, but... for_each_possible_cpu(cpu) WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu)); @@ -1581,10 +1560,6 @@ static void rcu_tasks_trace_empty_fn(void *unused) static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) { int cpu; - bool firstreport; - struct task_struct *g, *t; - LIST_HEAD(holdouts); - long ret; // Wait for any lingering IPI handlers to complete. 
Note that // if a CPU has gone offline or transitioned to userspace in the @@ -1595,37 +1570,6 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) if (WARN_ON_ONCE(smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu)))) smp_call_function_single(cpu, rcu_tasks_trace_empty_fn, NULL, 1); - // Remove the safety count. - smp_mb__before_atomic(); // Order vs. earlier atomics - atomic_dec(&trc_n_readers_need_end); - smp_mb__after_atomic(); // Order vs. later atomics - - // Wait for readers. - set_tasks_gp_state(rtp, RTGS_WAIT_READERS); - for (;;) { - ret = wait_event_idle_exclusive_timeout( - trc_wait, - atomic_read(&trc_n_readers_need_end) == 0, - READ_ONCE(rcu_task_stall_timeout)); - if (ret) - break; // Count reached zero. - // Stall warning time, so make a list of the offenders. - rcu_read_lock(); - for_each_process_thread(g, t) - if (rcu_ld_need_qs(t) & TRC_NEED_QS) - trc_add_holdout(t, &holdouts); - rcu_read_unlock(); - firstreport = true; - list_for_each_entry_safe(t, g, &holdouts, trc_holdout_list) { - if (rcu_ld_need_qs(t) & TRC_NEED_QS) - show_stalled_task_trace(t, &firstreport); - trc_del_holdout(t); // Release task_struct reference. - } - if (firstreport) - pr_err("INFO: rcu_tasks_trace detected stalls? (Counter/taskslist mismatch?)\n"); - show_stalled_ipi_trace(); - pr_err("\t%d holdouts\n", atomic_read(&trc_n_readers_need_end)); - } smp_mb(); // Caller's code must be ordered after wakeup. // Pairs with pretty much every ordering primitive. } @@ -1725,7 +1669,7 @@ void show_rcu_tasks_trace_gp_kthread(void) { char buf[64]; - sprintf(buf, "N%d h:%lu/%lu/%lu", atomic_read(&trc_n_readers_need_end), + sprintf(buf, "h:%lu/%lu/%lu", data_race(n_heavy_reader_ofl_updates), data_race(n_heavy_reader_updates), data_race(n_heavy_reader_attempts)); From 9ff86b4c443cc93bf3a9b624e3385e4114388e85 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 26 May 2022 16:12:51 -0700 Subject: [PATCH 10/75] rcu-tasks: Make trc_read_check_handler() fetch ->trc_reader_nesting only once This commit replaces the pair of READ_ONCE(t->trc_reader_nesting) calls with a single such call and a local variable. This makes the code's intent more clear. Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 561d24f7f73c..8fe78a7fecaf 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1278,6 +1278,7 @@ static void trc_del_holdout(struct task_struct *t) /* IPI handler to check task state. */ static void trc_read_check_handler(void *t_in) { + int nesting; struct task_struct *t = current; struct task_struct *texp = t_in; @@ -1287,12 +1288,13 @@ static void trc_read_check_handler(void *t_in) // If the task is not in a read-side critical section, and // if this is the last reader, awaken the grace-period kthread. - if (likely(!READ_ONCE(t->trc_reader_nesting))) { + nesting = READ_ONCE(t->trc_reader_nesting); + if (likely(!nesting)) { rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); goto reset_ipi; } // If we are racing with an rcu_read_unlock_trace(), try again later. - if (unlikely(READ_ONCE(t->trc_reader_nesting) < 0)) + if (unlikely(nesting < 0)) goto reset_ipi; // Get here if the task is in a read-side critical section. Set From 5c9a9ca44fda41c5e82f50efced5297a9c19760d Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 24 May 2022 16:59:52 -0700 Subject: [PATCH 11/75] rcu-tasks: Idle tasks on offline CPUs are in quiescent states Any idle task corresponding to an offline CPU is in an RCU Tasks Trace quiescent state. This commit causes rcu_tasks_trace_postscan() to ignore idle tasks for offline CPUs, which it can do safely due to CPU-hotplug operations being disabled. Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 8fe78a7fecaf..ec68bfe98c95 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1451,7 +1451,7 @@ static void rcu_tasks_trace_postscan(struct list_head *hop) { int cpu; - for_each_possible_cpu(cpu) + for_each_online_cpu(cpu) rcu_tasks_trace_pertask(idle_task(cpu), hop); // Re-enable CPU hotplug now that the tasklist scan has completed. From 897ba84dc5aa7a5518e19da180d2985790723d30 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 May 2022 18:02:40 -0700 Subject: [PATCH 12/75] rcu-tasks: Handle idle tasks for recently offlined CPUs This commit identifies idle tasks for recently offlined CPUs as residing in a quiescent state. This is safe only because CPU-hotplug operations are excluded during these checks. Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index ec68bfe98c95..414861d65196 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1318,27 +1318,26 @@ static int trc_inspect_reader(struct task_struct *t, void *bhp_in) int nesting; bool ofl = cpu_is_offline(cpu); - if (task_curr(t)) { - WARN_ON_ONCE(ofl && !is_idle_task(t)); - + if (task_curr(t) && !ofl) { // If no chance of heavyweight readers, do it the hard way. - if (!ofl && !IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) + if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) return -EINVAL; // If heavyweight readers are enabled on the remote task, // we can inspect its state despite its currently running. // However, we cannot safely change its state. n_heavy_reader_attempts++; - if (!ofl && // Check for "running" idle tasks on offline CPUs. - !rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting)) + // Check for "running" idle tasks on offline CPUs. + if (!rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting)) return -EINVAL; // No quiescent state, do it the hard way. n_heavy_reader_updates++; - if (ofl) - n_heavy_reader_ofl_updates++; nesting = 0; } else { // The task is not running, so C-language access is safe. nesting = t->trc_reader_nesting; + WARN_ON_ONCE(ofl && task_curr(t) && !is_idle_task(t)); + if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && ofl) + n_heavy_reader_ofl_updates++; } // If not exiting a read-side critical section, mark as checked From 5d4c90d755d5703d5a74fac0870c0a697250ec33 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 May 2022 18:19:17 -0700 Subject: [PATCH 13/75] rcu-tasks: RCU Tasks Trace grace-period kthread has implicit QS Because the task driving the grace-period kthread is in quiescent state throughout, this commit excludes it from the list of tasks from which a quiescent state is needed. 
This does mean that attaching a sleepable BPF program to function in kernel/rcu/tasks.h is a bad idea, by the way. Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 414861d65196..554b2e59a1d5 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1433,8 +1433,9 @@ static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop) { // During early boot when there is only the one boot CPU, there - // is no idle task for the other CPUs. Just return. - if (unlikely(t == NULL)) + // is no idle task for the other CPUs. Also, the grace-period + // kthread is always in a quiescent state. Either way, just return. + if (unlikely(t == NULL) || t == current) return; rcu_st_need_qs(t, 0); From 6a694411977a6d57ff76a896a745c2f717372dac Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 May 2022 20:33:17 -0700 Subject: [PATCH 14/75] rcu-tasks: Make rcu_note_context_switch() unconditionally call rcu_tasks_qs() This commit makes rcu_note_context_switch() unconditionally invoke the rcu_tasks_qs() function, as opposed to doing so only when RCU (as opposed to RCU Tasks Trace) urgently needs a grace period to end. Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c8ba0fe17267..c966d680b789 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -899,8 +899,8 @@ void rcu_note_context_switch(bool preempt) this_cpu_write(rcu_data.rcu_urgent_qs, false); if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) rcu_momentary_dyntick_idle(); - rcu_tasks_qs(current, preempt); out: + rcu_tasks_qs(current, preempt); trace_rcu_utilization(TPS("End context switch")); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); From 0968e8920b5b11b7a33982890ad9150e09e1cb1f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 31 May 2022 21:38:15 -0700 Subject: [PATCH 15/75] rcu-tasks: Simplify trc_inspect_reader() QS logic Currently, trc_inspect_reader() does one check for nesting less than or equal to zero, then sorts out the distinctions within this single "if" statement. This commit simplifies the logic by providing one "if" statement for quiescent states (nesting of zero) and another "if" statement for transitioning from one nesting level to another or the outermost rcu_read_unlock_trace() (negative nesting). Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 554b2e59a1d5..6b44c69eeca8 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1343,15 +1343,16 @@ static int trc_inspect_reader(struct task_struct *t, void *bhp_in) // If not exiting a read-side critical section, mark as checked // so that the grace-period kthread will remove it from the // holdout list. - if (nesting <= 0) { - if (!nesting) - rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); - return nesting ? -EINVAL : 0; // If in QS, done, otherwise try again later. 
+ if (!nesting) { + rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); + return 0; // In QS, so done. } + if (nesting < 0) + return -EINVAL; // QS transitioning, try again later. // The task is in a read-side critical section, so set up its - // state so that it will awaken the grace-period kthread upon exit - // from that critical section. + // state so that it will update state upon exit from that critical + // section. if (!rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED)) trc_add_holdout(t, bhp); return 0; From 9f3eb5fb8e468d3c3a6073d0c816405fa73c8038 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 May 2022 16:05:15 -0700 Subject: [PATCH 16/75] rcu-tasks: Add slow-IPI indicator to RCU Tasks Trace stall warnings This commit adds a "I" indicator to the RCU Tasks Trace CPU stall warning when an IPI directed to a task has thus far failed to arrive. This serves as a debugging aid. Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 6b44c69eeca8..1cfbebf2b597 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1497,8 +1497,9 @@ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) } cpu = task_cpu(t); if (!task_call_func(t, trc_check_slow_task, &trc_rdr)) - pr_alert("P%d: %c\n", + pr_alert("P%d: %c%c\n", t->pid, + ".I"[t->trc_ipi_to_cpu >= 0], ".i"[is_idle_tsk]); else pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d\n", From c8c03ad9d7cd694b88ab8db4199b4e4d3c6cc5aa Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 May 2022 20:36:08 -0700 Subject: [PATCH 17/75] rcu-tasks: Flag offline CPUs in RCU Tasks Trace stall warnings This commit tags offline CPUs with "(offline)" in RCU Tasks Trace CPU stall warnings. This is a debugging aid. Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 1cfbebf2b597..93096188d363 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1502,14 +1502,14 @@ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) ".I"[t->trc_ipi_to_cpu >= 0], ".i"[is_idle_tsk]); else - pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d\n", + pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d%s\n", t->pid, ".I"[trc_rdr.ipi_to_cpu >= 0], ".i"[is_idle_tsk], ".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)], trc_rdr.nesting, " N"[!!trc_rdr.needqs], - cpu); + cpu, cpu_online(cpu) ? "" : "(offline)"); sched_show_task(t); } From be15a16486dd6513ad801ea320eb21e10eec2b55 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 May 2022 09:20:59 -0700 Subject: [PATCH 18/75] rcu-tasks: Make RCU Tasks Trace stall warnings print full .b.need_qs field Currently, the RCU Tasks Trace CPU stall warning simply indicates whether or not the .b.need_qs field is zero. This commit shows the three permitted values and flags other values with either "!" or "?". This is a debugging aid. Signed-off-by: Paul E. 
McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 93096188d363..5eefbab7f2ed 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1502,13 +1502,14 @@ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) ".I"[t->trc_ipi_to_cpu >= 0], ".i"[is_idle_tsk]); else - pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d%s\n", + pr_alert("P%d: %c%c%c nesting: %d%c%c cpu: %d%s\n", t->pid, ".I"[trc_rdr.ipi_to_cpu >= 0], ".i"[is_idle_tsk], ".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)], trc_rdr.nesting, - " N"[!!trc_rdr.needqs], + " !CN"[trc_rdr.needqs & 0x3], + " ?"[trc_rdr.needqs > 0x3], cpu, cpu_online(cpu) ? "" : "(offline)"); sched_show_task(t); } From f90f19da88bfe32dd1fdfd104de4c0526a3be701 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 May 2022 09:49:26 -0700 Subject: [PATCH 19/75] rcu-tasks: Make RCU Tasks Trace stall warning handle idle offline tasks When a CPU is offline, its idle task can appear to be running, but it cannot be doing anything while CPU-hotplug operations are excluded. This commit takes advantage of that fact by making trc_check_slow_task() check for task_curr(t) && cpu_online(task_cpu(t)), and recording full information in that case. Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 5eefbab7f2ed..64eb4d7b142e 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1476,7 +1476,7 @@ static int trc_check_slow_task(struct task_struct *t, void *arg) { struct trc_stall_chk_rdr *trc_rdrp = arg; - if (task_curr(t)) + if (task_curr(t) && cpu_online(task_cpu(t))) return false; // It is running, so decline to inspect it. trc_rdrp->nesting = READ_ONCE(t->trc_reader_nesting); trc_rdrp->ipi_to_cpu = READ_ONCE(t->trc_ipi_to_cpu); From 434c9eefb959c36331a93617ea95df903469b99f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 May 2022 17:56:16 -0700 Subject: [PATCH 20/75] rcu-tasks: Add data structures for lightweight grace periods This commit adds fields to task_struct and to rcu_tasks_percpu that will be used to avoid the task-list scan for RCU Tasks Trace grace periods, and also initializes these fields. Signed-off-by: Paul E. 
McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- include/linux/sched.h | 2 ++ init/init_task.c | 1 + kernel/fork.c | 1 + kernel/rcu/tasks.h | 4 ++++ 4 files changed, 8 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index e6eb5871593e..b88caf54e168 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -844,6 +844,8 @@ struct task_struct { int trc_ipi_to_cpu; union rcu_special trc_reader_special; struct list_head trc_holdout_list; + struct list_head trc_blkd_node; + int trc_blkd_cpu; #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ struct sched_info sched_info; diff --git a/init/init_task.c b/init/init_task.c index 73cc8f03511a..ff6c4b9bfe6b 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -157,6 +157,7 @@ struct task_struct init_task .trc_reader_nesting = 0, .trc_reader_special.s = 0, .trc_holdout_list = LIST_HEAD_INIT(init_task.trc_holdout_list), + .trc_blkd_node = LIST_HEAD_INIT(init_task.trc_blkd_node), #endif #ifdef CONFIG_CPUSETS .mems_allowed_seq = SEQCNT_SPINLOCK_ZERO(init_task.mems_allowed_seq, diff --git a/kernel/fork.c b/kernel/fork.c index 9d44f2d46c69..1950eb870244 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1814,6 +1814,7 @@ static inline void rcu_copy_process(struct task_struct *p) p->trc_reader_nesting = 0; p->trc_reader_special.s = 0; INIT_LIST_HEAD(&p->trc_holdout_list); + INIT_LIST_HEAD(&p->trc_blkd_node); #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 64eb4d7b142e..fd4508af055e 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -29,6 +29,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @rtp_work: Work queue for invoking callbacks. * @rtp_irq_work: IRQ work queue for deferred wakeups. * @barrier_q_head: RCU callback for barrier operation. + * @rtp_blkd_tasks: List of tasks blocked as readers. * @cpu: CPU number corresponding to this entry. * @rtpp: Pointer to the rcu_tasks structure. */ @@ -40,6 +41,7 @@ struct rcu_tasks_percpu { struct work_struct rtp_work; struct irq_work rtp_irq_work; struct rcu_head barrier_q_head; + struct list_head rtp_blkd_tasks; int cpu; struct rcu_tasks *rtpp; }; @@ -256,6 +258,8 @@ static void cblist_init_generic(struct rcu_tasks *rtp) INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq); rtpcp->cpu = cpu; rtpcp->rtpp = rtp; + if (!rtpcp->rtp_blkd_tasks.next) + INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks); raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled. } raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); From 0356d4e66214569de674ab2684f2e0b440a466ab Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 May 2022 11:30:32 -0700 Subject: [PATCH 21/75] rcu-tasks: Track blocked RCU Tasks Trace readers This commit places any task that has ever blocked within its current RCU Tasks Trace read-side critical section on a per-CPU list within the rcu_tasks_percpu structure. Tasks are removed from this list when they exit by the exit_tasks_rcu_finish_trace() function. The purpose of this commit is to provide the information needed to eliminate the current scan of the full task list. This commit offsets the INT_MIN value for ->trc_reader_nesting with the new nesting level in order to avoid queueing tasks that are exiting their read-side critical sections. [ paulmck: Apply kernel test robot feedback. ] [ paulmck: Apply feedback from syzbot+9bb26e7c5e8e4fa7e641@syzkaller.appspotmail.com ] Signed-off-by: Paul E. 
McKenney Tested-by: syzbot Tested-by: "Zhang, Qiang1" Cc: Peter Zijlstra Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- include/linux/rcupdate.h | 11 +++++++++-- include/linux/rcupdate_trace.h | 2 +- kernel/rcu/tasks.h | 22 +++++++++++++++++++++- 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 1e728d544fc1..ebdfeead44e5 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -174,12 +174,19 @@ void synchronize_rcu_tasks(void); #define TRC_NEED_QS_CHECKED 0x2 // Task has been checked for needing quiescent state. u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new); +void rcu_tasks_trace_qs_blkd(struct task_struct *t); # define rcu_tasks_trace_qs(t) \ do { \ + int ___rttq_nesting = READ_ONCE((t)->trc_reader_nesting); \ + \ if (likely(!READ_ONCE((t)->trc_reader_special.b.need_qs)) && \ - likely(!READ_ONCE((t)->trc_reader_nesting))) \ + likely(!___rttq_nesting)) { \ rcu_trc_cmpxchg_need_qs((t), 0, TRC_NEED_QS_CHECKED); \ + } else if (___rttq_nesting && ___rttq_nesting != INT_MIN && \ + !READ_ONCE((t)->trc_reader_special.b.blocked)) { \ + rcu_tasks_trace_qs_blkd(t); \ + } \ } while (0) # else # define rcu_tasks_trace_qs(t) do { } while (0) @@ -188,7 +195,7 @@ u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new); #define rcu_tasks_qs(t, preempt) \ do { \ rcu_tasks_classic_qs((t), (preempt)); \ - rcu_tasks_trace_qs((t)); \ + rcu_tasks_trace_qs(t); \ } while (0) # ifdef CONFIG_TASKS_RUDE_RCU diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h index 6f9c35817398..9bc8cbb33340 100644 --- a/include/linux/rcupdate_trace.h +++ b/include/linux/rcupdate_trace.h @@ -75,7 +75,7 @@ static inline void rcu_read_unlock_trace(void) nesting = READ_ONCE(t->trc_reader_nesting) - 1; barrier(); // Critical section before disabling. // Disable IPI-based setting of .need_qs. - WRITE_ONCE(t->trc_reader_nesting, INT_MIN); + WRITE_ONCE(t->trc_reader_nesting, INT_MIN + nesting); if (likely(!READ_ONCE(t->trc_reader_special.s)) || nesting) { WRITE_ONCE(t->trc_reader_nesting, nesting); return; // We assume shallow reader nesting. diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index fd4508af055e..bab75ec26bdb 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1261,6 +1261,24 @@ void rcu_read_unlock_trace_special(struct task_struct *t) } EXPORT_SYMBOL_GPL(rcu_read_unlock_trace_special); +/* Add a newly blocked reader task to its CPU's list. */ +void rcu_tasks_trace_qs_blkd(struct task_struct *t) +{ + unsigned long flags; + struct rcu_tasks_percpu *rtpcp; + + local_irq_save(flags); + rtpcp = this_cpu_ptr(rcu_tasks_trace.rtpcpu); + raw_spin_lock_rcu_node(rtpcp); // irqs already disabled + t->trc_blkd_cpu = smp_processor_id(); + if (!rtpcp->rtp_blkd_tasks.next) + INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks); + list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks); + t->trc_reader_special.b.blocked = true; + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); +} +EXPORT_SYMBOL_GPL(rcu_tasks_trace_qs_blkd); + /* Add a task to the holdout list, if it is not already on the list. */ static void trc_add_holdout(struct task_struct *t, struct list_head *bhp) { @@ -1586,9 +1604,11 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) /* Report any needed quiescent state for this exiting task. 
*/ static void exit_tasks_rcu_finish_trace(struct task_struct *t) { + union rcu_special trs = READ_ONCE(t->trc_reader_special); + rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting)); - if (WARN_ON_ONCE(rcu_ld_need_qs(t) & TRC_NEED_QS)) + if (WARN_ON_ONCE(rcu_ld_need_qs(t) & TRC_NEED_QS) || trs.b.blocked) rcu_read_unlock_trace_special(t); else WRITE_ONCE(t->trc_reader_nesting, 0); From 0bcb386857376224b5fd3f38b6e3173ec74d8d36 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 May 2022 15:01:14 -0700 Subject: [PATCH 22/75] rcu-tasks: Untrack blocked RCU Tasks Trace at reader end This commit causes rcu_read_unlock_trace() to check for the current task being on a per-CPU list within the rcu_tasks_percpu structure, and removes it from that list if so. This has the effect of curtailing tracking of a task that blocked within an RCU Tasks Trace read-side critical section once it exits that critical section. Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index bab75ec26bdb..eb87a759ef0b 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1245,17 +1245,29 @@ EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs); /* If we are the last reader, wake up the grace-period kthread. */ void rcu_read_unlock_trace_special(struct task_struct *t) { - int nqs = (rcu_ld_need_qs(t) == (TRC_NEED_QS_CHECKED | TRC_NEED_QS)); + unsigned long flags; + struct rcu_tasks_percpu *rtpcp; + union rcu_special trs; + + // Open-coded full-word version of rcu_ld_need_qs(). + smp_mb(); // Enforce full grace-period ordering. + trs = smp_load_acquire(&t->trc_reader_special); if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && t->trc_reader_special.b.need_mb) smp_mb(); // Pairs with update-side barriers. // Update .need_qs before ->trc_reader_nesting for irq/NMI handlers. 
- if (nqs) { + if (trs.b.need_qs == (TRC_NEED_QS_CHECKED | TRC_NEED_QS)) { u8 result = rcu_trc_cmpxchg_need_qs(t, TRC_NEED_QS_CHECKED | TRC_NEED_QS, TRC_NEED_QS_CHECKED); - WARN_ONCE(result != (TRC_NEED_QS_CHECKED | TRC_NEED_QS), - "%s: result = %d", __func__, result); + WARN_ONCE(result != trs.b.need_qs, "%s: result = %d", __func__, result); + } + if (trs.b.blocked) { + rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, t->trc_blkd_cpu); + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + list_del_init(&t->trc_blkd_node); + WRITE_ONCE(t->trc_reader_special.b.blocked, false); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); } WRITE_ONCE(t->trc_reader_nesting, 0); } @@ -1274,7 +1286,7 @@ void rcu_tasks_trace_qs_blkd(struct task_struct *t) if (!rtpcp->rtp_blkd_tasks.next) INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks); list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks); - t->trc_reader_special.b.blocked = true; + WRITE_ONCE(t->trc_reader_special.b.blocked, true); raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); } EXPORT_SYMBOL_GPL(rcu_tasks_trace_qs_blkd); @@ -1608,7 +1620,7 @@ static void exit_tasks_rcu_finish_trace(struct task_struct *t) rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS_CHECKED); WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting)); - if (WARN_ON_ONCE(rcu_ld_need_qs(t) & TRC_NEED_QS) || trs.b.blocked) + if (WARN_ON_ONCE(rcu_ld_need_qs(t) & TRC_NEED_QS || trs.b.blocked)) rcu_read_unlock_trace_special(t); else WRITE_ONCE(t->trc_reader_nesting, 0); From 387c0ad702296c4eb7b95322c8af0e4391bf146f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 May 2022 16:47:40 -0700 Subject: [PATCH 23/75] rcu-tasks: Add blocked-task indicator to RCU Tasks Trace stall warnings This commit adds a "B" indicator to the RCU Tasks Trace CPU stall warning when the task has blocked within its current read-side critical section. This serves as a debugging aid. Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index eb87a759ef0b..6f4b89f9517e 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1536,11 +1536,12 @@ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) ".I"[t->trc_ipi_to_cpu >= 0], ".i"[is_idle_tsk]); else - pr_alert("P%d: %c%c%c nesting: %d%c%c cpu: %d%s\n", + pr_alert("P%d: %c%c%c%c nesting: %d%c%c cpu: %d%s\n", t->pid, ".I"[trc_rdr.ipi_to_cpu >= 0], ".i"[is_idle_tsk], ".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)], + ".B"[!!data_race(t->trc_reader_special.b.blocked)], trc_rdr.nesting, " !CN"[trc_rdr.needqs & 0x3], " ?"[trc_rdr.needqs > 0x3], From 1fa98e2e40e5a46fa528b6ab256ba00dfb319dde Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 May 2022 17:11:37 -0700 Subject: [PATCH 24/75] rcu-tasks: Move rcu_tasks_trace_pertask() before rcu_tasks_trace_pregp_step() This is a code-motion-only commit that moves rcu_tasks_trace_pertask() to precede rcu_tasks_trace_pregp_step(), so that the latter will be able to invoke the other without forward references. Signed-off-by: Paul E. 
McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 6f4b89f9517e..66d8473f1bda 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1449,20 +1449,6 @@ static void trc_wait_for_one_reader(struct task_struct *t, } } -/* Initialize for a new RCU-tasks-trace grace period. */ -static void rcu_tasks_trace_pregp_step(void) -{ - int cpu; - - // There shouldn't be any old IPIs, but... - for_each_possible_cpu(cpu) - WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu)); - - // Disable CPU hotplug across the tasklist scan. - // This also waits for all readers in CPU-hotplug code paths. - cpus_read_lock(); -} - /* Do first-round processing for the specified task. */ static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop) @@ -1478,6 +1464,20 @@ static void rcu_tasks_trace_pertask(struct task_struct *t, trc_wait_for_one_reader(t, hop); } +/* Initialize for a new RCU-tasks-trace grace period. */ +static void rcu_tasks_trace_pregp_step(void) +{ + int cpu; + + // There shouldn't be any old IPIs, but... + for_each_possible_cpu(cpu) + WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu)); + + // Disable CPU hotplug across the tasklist scan. + // This also waits for all readers in CPU-hotplug code paths. + cpus_read_lock(); +} + /* * Do intermediate processing between task and holdout scans and * pick up the idle tasks. From 19415004d5221ba59be6ef566fdbb52c44808f7e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 May 2022 17:38:02 -0700 Subject: [PATCH 25/75] rcu-tasks: Avoid rcu_tasks_trace_pertask() duplicate list additions This commit adds checks within rcu_tasks_trace_pertask() to avoid duplicate (and destructive) additions to the holdouts list. These checks will be required later due to the possibility of a given task having blocked while in an RCU Tasks Trace read-side critical section, but now running on a CPU. Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 66d8473f1bda..1aa6a24a9bc2 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1454,9 +1454,10 @@ static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop) { // During early boot when there is only the one boot CPU, there - // is no idle task for the other CPUs. Also, the grace-period - // kthread is always in a quiescent state. Either way, just return. - if (unlikely(t == NULL) || t == current) + // is no idle task for the other CPUs. Also, the grace-period + // kthread is always in a quiescent state. In addition, just return + // if this task is already on the list. + if (unlikely(t == NULL) || t == current || !list_empty(&t->trc_holdout_list)) return; rcu_st_need_qs(t, 0); From 7460ade1fc6e3f62d5c0006c972755f0aefd41b2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 May 2022 16:06:55 -0700 Subject: [PATCH 26/75] rcu-tasks: Scan running tasks for RCU Tasks Trace readers A running task might be within an RCU Tasks Trace read-side critical section for any length of time, but will not be placed on any of the per-CPU rcu_tasks_percpu structure's ->rtp_blkd_tasks lists. 
Therefore any RCU Tasks Trace grace-period processing that does not scan the full task list must interact with the running tasks. This commit therefore causes the rcu_tasks_trace_pregp_step() function to IPI each CPU in order to place the corresponding task on the holdouts list and to record whether or not it was in an RCU Tasks Trace read-side critical section. Yes, it is possible to avoid adding it to that list if it is not a reader, but that would prevent the system from remembering that this task was in a quiescent state. Which is why the running tasks are unconditionally added to the holdout list. Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 51 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 40 insertions(+), 11 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 1aa6a24a9bc2..a8f95864c921 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -14,7 +14,7 @@ struct rcu_tasks; typedef void (*rcu_tasks_gp_func_t)(struct rcu_tasks *rtp); -typedef void (*pregp_func_t)(void); +typedef void (*pregp_func_t)(struct list_head *hop); typedef void (*pertask_func_t)(struct task_struct *t, struct list_head *hop); typedef void (*postscan_func_t)(struct list_head *hop); typedef void (*holdouts_func_t)(struct list_head *hop, bool ndrpt, bool *frptp); @@ -661,7 +661,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) struct task_struct *t; set_tasks_gp_state(rtp, RTGS_PRE_WAIT_GP); - rtp->pregp_func(); + rtp->pregp_func(&holdouts); /* * There were callbacks, so we need to wait for an RCU-tasks @@ -791,7 +791,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) // disabling. /* Pre-grace-period preparation. */ -static void rcu_tasks_pregp_step(void) +static void rcu_tasks_pregp_step(struct list_head *hop) { /* * Wait for all pre-existing t->on_rq and t->nvcsw transitions @@ -1449,24 +1449,48 @@ static void trc_wait_for_one_reader(struct task_struct *t, } } -/* Do first-round processing for the specified task. */ -static void rcu_tasks_trace_pertask(struct task_struct *t, - struct list_head *hop) +/* + * Initialize for first-round processing for the specified task. + * Return false if task is NULL or already taken care of, true otherwise. + */ +static bool rcu_tasks_trace_pertask_prep(struct task_struct *t, bool notself) { // During early boot when there is only the one boot CPU, there // is no idle task for the other CPUs. Also, the grace-period // kthread is always in a quiescent state. In addition, just return // if this task is already on the list. - if (unlikely(t == NULL) || t == current || !list_empty(&t->trc_holdout_list)) - return; + if (unlikely(t == NULL) || (t == current && notself) || !list_empty(&t->trc_holdout_list)) + return false; rcu_st_need_qs(t, 0); t->trc_ipi_to_cpu = -1; - trc_wait_for_one_reader(t, hop); + return true; +} + +/* Do first-round processing for the specified task. */ +static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop) +{ + if (rcu_tasks_trace_pertask_prep(t, true)) + trc_wait_for_one_reader(t, hop); +} + +/* + * Get the current CPU's current task on the holdout list. + * Calls to this function must be serialized. 
+ */ +static void rcu_tasks_trace_pertask_handler(void *hop_in) +{ + struct list_head *hop = hop_in; + struct task_struct *t = current; + + // Pull in the currently running task, but only if it is currently + // in an RCU tasks trace read-side critical section. + if (rcu_tasks_trace_pertask_prep(t, false)) + trc_add_holdout(t, hop); } /* Initialize for a new RCU-tasks-trace grace period. */ -static void rcu_tasks_trace_pregp_step(void) +static void rcu_tasks_trace_pregp_step(struct list_head *hop) { int cpu; @@ -1474,9 +1498,14 @@ static void rcu_tasks_trace_pregp_step(void) for_each_possible_cpu(cpu) WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu)); - // Disable CPU hotplug across the tasklist scan. + // Disable CPU hotplug across the CPU scan. // This also waits for all readers in CPU-hotplug code paths. cpus_read_lock(); + + // These smp_call_function_single() calls are serialized to + // allow safe access to the hop list. + for_each_online_cpu(cpu) + smp_call_function_single(cpu, rcu_tasks_trace_pertask_handler, hop, 1); } /* From dc7d54b45170e1e3ced9f86718aa4274fd727790 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 May 2022 17:19:27 -0700 Subject: [PATCH 27/75] rcu-tasks: Pull in tasks blocked within RCU Tasks Trace readers This commit scans each CPU's ->rtp_blkd_tasks list, adding them to the list of holdout tasks. This will cause the current RCU Tasks Trace grace period to wait until these tasks exit their RCU Tasks Trace read-side critical sections. This commit will enable later work omitting the scan of the full task list. Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index a8f95864c921..d318cdfd2309 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1492,7 +1492,11 @@ static void rcu_tasks_trace_pertask_handler(void *hop_in) /* Initialize for a new RCU-tasks-trace grace period. */ static void rcu_tasks_trace_pregp_step(struct list_head *hop) { + LIST_HEAD(blkd_tasks); int cpu; + unsigned long flags; + struct rcu_tasks_percpu *rtpcp; + struct task_struct *t; // There shouldn't be any old IPIs, but... for_each_possible_cpu(cpu) @@ -1506,6 +1510,26 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop) // allow safe access to the hop list. for_each_online_cpu(cpu) smp_call_function_single(cpu, rcu_tasks_trace_pertask_handler, hop, 1); + + // Only after all running tasks have been accounted for is it + // safe to take care of the tasks that have blocked within their + // current RCU tasks trace read-side critical section. + for_each_possible_cpu(cpu) { + rtpcp = per_cpu_ptr(rcu_tasks_trace.rtpcpu, cpu); + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + list_splice_init(&rtpcp->rtp_blkd_tasks, &blkd_tasks); + while (!list_empty(&blkd_tasks)) { + rcu_read_lock(); + t = list_first_entry(&blkd_tasks, struct task_struct, trc_blkd_node); + list_del_init(&t->trc_blkd_node); + list_add(&t->trc_blkd_node, &rtpcp->rtp_blkd_tasks); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + rcu_tasks_trace_pertask(t, hop); + rcu_read_unlock(); + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + } + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + } } /* From 955a0192082023bf08f1be279182090264cb2557 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 18 May 2022 17:19:27 -0700 Subject: [PATCH 28/75] rcu-tasks: Stop RCU Tasks Trace from scanning idle tasks Now that RCU scans both running tasks and tasks that have blocked within their current RCU Tasks Trace read-side critical section, there is no need for it to scan the idle tasks. After all, an idle loop should not remain within an RCU Tasks Trace read-side critical section across exit from idle, and from a BPF viewpoint, functions invoked from the idle loop should not sleep. So only running idle tasks can be within RCU Tasks Trace read-side critical sections. This commit therefore removes the scan of the idle tasks from the rcu_tasks_trace_postscan() function. Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d318cdfd2309..272c905995e5 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1533,16 +1533,10 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop) } /* - * Do intermediate processing between task and holdout scans and - * pick up the idle tasks. + * Do intermediate processing between task and holdout scans. */ static void rcu_tasks_trace_postscan(struct list_head *hop) { - int cpu; - - for_each_online_cpu(cpu) - rcu_tasks_trace_pertask(idle_task(cpu), hop); - // Re-enable CPU hotplug now that the tasklist scan has completed. cpus_read_unlock(); From ab69d3c8b99435ebd2ac4a241a5d931024c83c02 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Apr 2022 15:17:37 -0700 Subject: [PATCH 29/75] torture: Make kvm-remote.sh announce which system is being waited on If a remote system fails in certain ways, for example, if it is rebooted without removing the contents of the /tmp directory, its remote.run file will never be removed and the kvm-remote.sh script will loop waiting forever. The manual workaround for this (hopefully!) rare event is to manually remove the file, which will cause the results up to the reboot to be collected and evaluated. Unfortunately, to work out which system is holding things up, the user must refer to the name of the last system whose results were collected, then look up the name of the next system in sequence, then manually remove the remote.run file. Even more unfortunately, this procedure can be fooled in runs where each system handles more than one batch should a given system take longer than expected, causing the systems to be handled out of order. This commit therefore causes kvm-remote.sh to print out the name of the system it will wait on next, allowing the user to refer directly to that name. Making the kvm-remote.sh script automatically handle unscheduled termination of the qemu processes is left as future work. Quite possibly deep future work. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-remote.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh index 0ff59bd8b640..9f0a5d5ff2dd 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh @@ -262,6 +262,7 @@ echo All batches started. `date` | tee -a "$oldrun/remote-log" # Wait for all remaining scenarios to complete and collect results.
for i in $systems do + echo " ---" Waiting for $i `date` | tee -a "$oldrun/remote-log" while checkremotefile "$i" "$resdir/$ds/remote.run" do sleep 30 From 1a4a8153e0df1705397aa3ff561dd8fdc2e6fc23 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 20 May 2022 10:21:00 -0700 Subject: [PATCH 30/75] rcu-tasks: Stop RCU Tasks Trace from scanning full tasks list This commit takes off the training wheels and relies only on scanning currently running tasks and tasks that have blocked or been preempted within their current RCU Tasks Trace read-side critical section. Before this commit, the time complexity of an RCU Tasks Trace grace period is O(T), where T is the number of tasks. After this commit, this time complexity is O(C+B), where C is the number of CPUs and B is the number of tasks that have blocked (or been preempted) at least once during their current RCU Tasks Trace read-side critical sections. Of course, if all tasks have blocked (or been preempted) at least once during their current RCU Tasks Trace read-side critical sections, this is still O(T), but current expectations are that RCU Tasks Trace read-side critical sections will be short and that there will normally not be large numbers of tasks blocked within such a critical section. Dave Marchevsky kindly measured the effects of this commit on the RCU Tasks Trace grace-period latency and the rcu_tasks_trace_kthread task's CPU consumption per RCU Tasks Trace grace period over the course of a fixed test, all in milliseconds:

                        Before                  After

    GP latency    22.3 ms stddev > 0.1    17.0 ms stddev < 0.1

    GP CPU         2.3 ms stddev 0.3       1.1 ms stddev 0.2

This was on a system with 15,000 tasks, so it is reasonable to expect much larger savings on the systems on which this issue was first noted, given that they sport well in excess of 100,000 tasks. CPU consumption was measured using profiling techniques. Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh Tested-by: Dave Marchevsky --- kernel/rcu/tasks.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 272c905995e5..fe0552086ccf 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -670,10 +670,12 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) * and make a list of them in holdouts. */ set_tasks_gp_state(rtp, RTGS_SCAN_TASKLIST); - rcu_read_lock(); - for_each_process_thread(g, t) - rtp->pertask_func(t, &holdouts); - rcu_read_unlock(); + if (rtp->pertask_func) { + rcu_read_lock(); + for_each_process_thread(g, t) + rtp->pertask_func(t, &holdouts); + rcu_read_unlock(); + } set_tasks_gp_state(rtp, RTGS_POST_SCAN_TASKLIST); rtp->postscan_func(&holdouts); @@ -1746,7 +1748,6 @@ static int __init rcu_spawn_tasks_trace_kthread(void) rcu_tasks_trace.init_fract = 1; } rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step; - rcu_tasks_trace.pertask_func = rcu_tasks_trace_pertask; rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan; rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace; rcu_tasks_trace.postgp_func = rcu_tasks_trace_postgp; From ffcc21a315e1ebafad51b318e8ac0cb884df0cdc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 1 Jun 2022 21:26:57 -0700 Subject: [PATCH 31/75] rcu-tasks: Maintain a count of tasks blocking RCU Tasks Trace grace period This commit maintains a new n_trc_holdouts counter that tracks the number of tasks blocking the RCU Tasks Trace grace period.
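As a condensed sketch of the hunks below (not the literal diff), the counter is adjusted only under the existing emptiness checks, so each holdout task is counted exactly once:

	/* In trc_add_holdout(): count a task only on its first addition. */
	if (list_empty(&t->trc_holdout_list)) {
		get_task_struct(t);
		list_add(&t->trc_holdout_list, bhp);
		n_trc_holdouts++;
	}

	/* In trc_del_holdout(): decrement only if the task is still queued. */
	if (!list_empty(&t->trc_holdout_list)) {
		list_del_init(&t->trc_holdout_list);
		put_task_struct(t);
		n_trc_holdouts--;
	}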
This counter is useful for debugging, and its value has been added to a diagnostic message. Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index fe0552086ccf..9d7d6fd4b8a7 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1206,6 +1206,7 @@ static DEFINE_PER_CPU(bool, trc_ipi_to_cpu); static unsigned long n_heavy_reader_attempts; static unsigned long n_heavy_reader_updates; static unsigned long n_heavy_reader_ofl_updates; +static unsigned long n_trc_holdouts; void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); DEFINE_RCU_TASKS(rcu_tasks_trace, rcu_tasks_wait_gp, call_rcu_tasks_trace, @@ -1299,6 +1300,7 @@ static void trc_add_holdout(struct task_struct *t, struct list_head *bhp) if (list_empty(&t->trc_holdout_list)) { get_task_struct(t); list_add(&t->trc_holdout_list, bhp); + n_trc_holdouts++; } } @@ -1308,6 +1310,7 @@ static void trc_del_holdout(struct task_struct *t) if (!list_empty(&t->trc_holdout_list)) { list_del_init(&t->trc_holdout_list); put_task_struct(t); + n_trc_holdouts--; } } @@ -1760,7 +1763,8 @@ void show_rcu_tasks_trace_gp_kthread(void) { char buf[64]; - sprintf(buf, "h:%lu/%lu/%lu", + sprintf(buf, "N%lu h:%lu/%lu/%lu", + data_race(n_trc_holdouts), data_race(n_heavy_reader_ofl_updates), data_race(n_heavy_reader_updates), data_race(n_heavy_reader_attempts)); From e386b6725798eec07facedf4d4bb710c079fd25c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 Jun 2022 17:30:01 -0700 Subject: [PATCH 32/75] rcu-tasks: Eliminate RCU Tasks Trace IPIs to online CPUs Currently, the RCU Tasks Trace grace-period kthread IPIs each online CPU using smp_call_function_single() in order to track any tasks currently in RCU Tasks Trace read-side critical sections during which the corresponding task has neither blocked nor been preempted. These IPIs are annoying and are also not strictly necessary because any task that blocks or is preempted within its current RCU Tasks Trace read-side critical section will be tracked on one of the per-CPU rcu_tasks_percpu structure's ->rtp_blkd_tasks lists. So the only time that this is a problem is if one of the CPUs runs through a long-duration RCU Tasks Trace read-side critical section without a context switch. Note that the task_call_func() function cannot help here because there is no safe way to identify the target task. Of course, the task_call_func() function will be very useful later, when processing the list of tasks, but it needs to know the task. This commit therefore creates a cpu_curr_snapshot() function that returns a pointer to the task_struct structure of some task that happened to be running on the specified CPU more or less during the time that the cpu_curr_snapshot() function was executing. If there was no context switch during this time, this function will return a pointer to the task_struct structure of the task that was running throughout. If there was a context switch, then the outgoing task will be taken care of by RCU's context-switch hook, and the incoming task was either already taken care of during some previous context switch, or it is not currently within an RCU Tasks Trace read-side critical section. And in this latter case, the grace period already started, so there is no need to wait on this task.
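For orientation, the loop that this patch adds to rcu_tasks_trace_pregp_step() (shown in full in the hunk below) boils down to roughly the following condensed sketch, with comments added here for illustration:

	for_each_online_cpu(cpu) {
		rcu_read_lock();			// Keep the snapshotted task_struct stable.
		t = cpu_curr_snapshot(cpu);		// Some task running on this CPU "about now".
		if (rcu_tasks_trace_pertask_prep(t, true))
			trc_add_holdout(t, hop);	// Might be a reader, so the grace period must wait for it.
		rcu_read_unlock();
	}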
This new cpu_curr_snapshot() function is invoked on each CPU early in the RCU Tasks Trace grace-period processing, and the resulting tasks are queued for later quiescent-state inspection. Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- include/linux/sched.h | 1 + kernel/rcu/tasks.h | 24 +++++++----------------- kernel/sched/core.c | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 17 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index b88caf54e168..72242bc73d85 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2224,6 +2224,7 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) extern bool sched_task_on_rq(struct task_struct *p); extern unsigned long get_wchan(struct task_struct *p); +extern struct task_struct *cpu_curr_snapshot(int cpu); /* * In order to reduce various lock holder preemption latencies provide an diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 9d7d6fd4b8a7..c2aae2643a0b 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1479,21 +1479,6 @@ static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop trc_wait_for_one_reader(t, hop); } -/* - * Get the current CPU's current task on the holdout list. - * Calls to this function must be serialized. - */ -static void rcu_tasks_trace_pertask_handler(void *hop_in) -{ - struct list_head *hop = hop_in; - struct task_struct *t = current; - - // Pull in the currently running task, but only if it is currently - // in an RCU tasks trace read-side critical section. - if (rcu_tasks_trace_pertask_prep(t, false)) - trc_add_holdout(t, hop); -} - /* Initialize for a new RCU-tasks-trace grace period. */ static void rcu_tasks_trace_pregp_step(struct list_head *hop) { @@ -1513,8 +1498,13 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop) // These smp_call_function_single() calls are serialized to // allow safe access to the hop list. - for_each_online_cpu(cpu) - smp_call_function_single(cpu, rcu_tasks_trace_pertask_handler, hop, 1); + for_each_online_cpu(cpu) { + rcu_read_lock(); + t = cpu_curr_snapshot(cpu); + if (rcu_tasks_trace_pertask_prep(t, true)) + trc_add_holdout(t, hop); + rcu_read_unlock(); + } // Only after all running tasks have been accounted for is it // safe to take care of the tasks that have blocked within their diff --git a/kernel/sched/core.c b/kernel/sched/core.c index da0bf6fe9ecd..9568019be124 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4263,6 +4263,38 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg) return ret; } +/** + * cpu_curr_snapshot - Return a snapshot of the currently running task + * @cpu: The CPU on which to snapshot the task. + * + * Returns the task_struct pointer of the task "currently" running on + * the specified CPU. If the same task is running on that CPU throughout, + * the return value will be a pointer to that task's task_struct structure. + * If the CPU did any context switches even vaguely concurrently with the + * execution of this function, the return value will be a pointer to the + * task_struct structure of a randomly chosen task that was running on + * that CPU somewhere around the time that this function was executing. 
+ * + * If the specified CPU was offline, the return value is whatever it + * is, perhaps a pointer to the task_struct structure of that CPU's idle + * task, but there is no guarantee. Callers wishing a useful return + * value must take some action to ensure that the specified CPU remains + * online throughout. + * + * This function executes full memory barriers before and after fetching + * the pointer, which permits the caller to confine this function's fetch + * with respect to the caller's accesses to other shared variables. + */ +struct task_struct *cpu_curr_snapshot(int cpu) +{ + struct task_struct *t; + + smp_mb(); /* Pairing determined by caller's synchronization design. */ + t = rcu_dereference(cpu_curr(cpu)); + smp_mb(); /* Pairing determined by caller's synchronization design. */ + return t; +} + /** * wake_up_process - Wake up a specific process * @p: The process to be woken up. From 56096ecd5b04148b6d292e3847c23d4a2a454e94 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 7 Jun 2022 17:25:03 -0700 Subject: [PATCH 33/75] rcu-tasks: Disable and enable CPU hotplug in same function The rcu_tasks_trace_pregp_step() function invokes cpus_read_lock() to disable CPU hotplug, and a later call to the rcu_tasks_trace_postscan() function invokes cpus_read_unlock() to re-enable it. This was absolutely necessary in the past in order to protect the intervening scan of the full tasks list, but there is no longer such a scan. This commit therefore improves readability by moving the cpus_read_unlock() call to the end of the rcu_tasks_trace_pregp_step() function. This commit is a pure code-motion commit without any (intended) change in functionality. Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index c2aae2643a0b..bf9cc5bc4ae5 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1525,6 +1525,9 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop) } raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); } + + // Re-enable CPU hotplug now that the holdout list is populated. + cpus_read_unlock(); } /* @@ -1532,9 +1535,6 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop) */ static void rcu_tasks_trace_postscan(struct list_head *hop) { - // Re-enable CPU hotplug now that the tasklist scan has completed. - cpus_read_unlock(); - // Wait for late-stage exiting tasks to finish exiting. // These might have passed the call to exit_tasks_rcu_finish(). synchronize_rcu(); From eea3423b162d5d5cdc08af23e8ee2c2d1134fd07 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 6 Jun 2022 21:30:38 -0700 Subject: [PATCH 34/75] rcu-tasks: Update comments This commit updates comments to reflect the changes in the series of commits that eliminated the full task-list scan. Signed-off-by: Paul E. McKenney Cc: Neeraj Upadhyay Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: KP Singh --- kernel/rcu/tasks.h | 71 +++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 38 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index bf9cc5bc4ae5..df6b2cb2f205 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1138,11 +1138,10 @@ EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread); // 3. 
Avoids expensive read-side instructions, having overhead similar // to that of Preemptible RCU. // -// There are of course downsides. The grace-period code can send IPIs to -// CPUs, even when those CPUs are in the idle loop or in nohz_full userspace. -// It is necessary to scan the full tasklist, much as for Tasks RCU. There -// is a single callback queue guarded by a single lock, again, much as for -// Tasks RCU. If needed, these downsides can be at least partially remedied. +// There are of course downsides. For example, the grace-period code +// can send IPIs to CPUs, even when those CPUs are in the idle loop or +// in nohz_full userspace. If needed, these downsides can be at least +// partially remedied. // // Perhaps most important, this variant of RCU does not affect the vanilla // flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace @@ -1155,38 +1154,30 @@ EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread); // invokes these functions in this order: // // rcu_tasks_trace_pregp_step(): -// Initialize the count of readers and block CPU-hotplug operations. -// rcu_tasks_trace_pertask(), invoked on every non-idle task: -// Initialize per-task state and attempt to identify an immediate -// quiescent state for that task, or, failing that, attempt to -// set that task's .need_qs flag so that task's next outermost -// rcu_read_unlock_trace() will report the quiescent state (in which -// case the count of readers is incremented). If both attempts fail, -// the task is added to a "holdout" list. Note that IPIs are used -// to invoke trc_read_check_handler() in the context of running tasks -// in order to avoid ordering overhead on common-case shared-variable -// accessses. +// Disables CPU hotplug, adds all currently executing tasks to the +// holdout list, then checks the state of all tasks that blocked +// or were preempted within their current RCU Tasks Trace read-side +// critical section, adding them to the holdout list if appropriate. +// Finally, this function re-enables CPU hotplug. +// The ->pertask_func() pointer is NULL, so there is no per-task processing. // rcu_tasks_trace_postscan(): -// Initialize state and attempt to identify an immediate quiescent -// state as above (but only for idle tasks), unblock CPU-hotplug -// operations, and wait for an RCU grace period to avoid races with -// tasks that are in the process of exiting. +// Invokes synchronize_rcu() to wait for late-stage exiting tasks +// to finish exiting. // check_all_holdout_tasks_trace(), repeatedly until holdout list is empty: // Scans the holdout list, attempting to identify a quiescent state // for each task on the list. If there is a quiescent state, the -// corresponding task is removed from the holdout list. +// corresponding task is removed from the holdout list. Once this +// list is empty, the grace period has completed. // rcu_tasks_trace_postgp(): -// Wait for the count of readers do drop to zero, reporting any stalls. -// Also execute full memory barriers to maintain ordering with code -// executing after the grace period. +// Provides the needed full memory barrier and does debug checks. // // The exit_tasks_rcu_finish_trace() synchronizes with exiting tasks. // -// Pre-grace-period update-side code is ordered before the grace -// period via the ->cbs_lock and barriers in rcu_tasks_kthread(). 
-// Pre-grace-period read-side code is ordered before the grace period by -// atomic_dec_and_test() of the count of readers (for IPIed readers) and by -// scheduler context-switch ordering (for locked-down non-running readers). +// Pre-grace-period update-side code is ordered before the grace period +// via the ->cbs_lock and barriers in rcu_tasks_kthread(). Pre-grace-period +// read-side code is ordered before the grace period by atomic operations +// on .b.need_qs flag of each task involved in this process, or by scheduler +// context-switch ordering (for locked-down non-running readers). // The lockdep state must be outside of #ifdef to be useful. #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -1245,7 +1236,10 @@ u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new) } EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs); -/* If we are the last reader, wake up the grace-period kthread. */ +/* + * If we are the last reader, signal the grace-period kthread. + * Also remove from the per-CPU list of blocked tasks. + */ void rcu_read_unlock_trace_special(struct task_struct *t) { unsigned long flags; @@ -1336,9 +1330,9 @@ static void trc_read_check_handler(void *t_in) if (unlikely(nesting < 0)) goto reset_ipi; - // Get here if the task is in a read-side critical section. Set - // its state so that it will awaken the grace-period kthread upon - // exit from that critical section. + // Get here if the task is in a read-side critical section. + // Set its state so that it will update state for the grace-period + // kthread upon exit from that critical section. rcu_trc_cmpxchg_need_qs(t, 0, TRC_NEED_QS | TRC_NEED_QS_CHECKED); reset_ipi: @@ -1387,7 +1381,7 @@ static int trc_inspect_reader(struct task_struct *t, void *bhp_in) return 0; // In QS, so done. } if (nesting < 0) - return -EINVAL; // QS transitioning, try again later. + return -EINVAL; // Reader transitioning, try again later. // The task is in a read-side critical section, so set up its // state so that it will update state upon exit from that critical @@ -1492,11 +1486,12 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop) for_each_possible_cpu(cpu) WARN_ON_ONCE(per_cpu(trc_ipi_to_cpu, cpu)); - // Disable CPU hotplug across the CPU scan. - // This also waits for all readers in CPU-hotplug code paths. + // Disable CPU hotplug across the CPU scan for the benefit of + // any IPIs that might be needed. This also waits for all readers + // in CPU-hotplug code paths. cpus_read_lock(); - // These smp_call_function_single() calls are serialized to + // These rcu_tasks_trace_pertask_prep() calls are serialized to // allow safe access to the hop list. for_each_online_cpu(cpu) { rcu_read_lock(); @@ -1608,7 +1603,7 @@ static void check_all_holdout_tasks_trace(struct list_head *hop, { struct task_struct *g, *t; - // Disable CPU hotplug across the holdout list scan. + // Disable CPU hotplug across the holdout list scan for IPIs. cpus_read_lock(); list_for_each_entry_safe(t, g, hop, trc_holdout_list) { From 1cf1144e8473e8c3180ac8b91309e29b6acfd95f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 7 Jun 2022 15:23:52 -0700 Subject: [PATCH 35/75] rcu-tasks: Be more patient for RCU Tasks boot-time testing The RCU-Tasks family of grace-period primitives can take some time to complete, and the amount of time can depend on the exact hardware and software configuration. Some configurations boot up fast enough that the RCU-Tasks verification process gets false-positive failures. 
This commit therefore allows up to 30 seconds for the grace periods to complete, with this value adjustable downwards using the rcupdate.rcu_task_stall_timeout kernel boot parameter. Reported-by: Matthew Wilcox Reported-by: Zhouyi Zhou Signed-off-by: Paul E. McKenney Tested-by: Zhouyi Zhou Tested-by: Mark Rutland --- kernel/rcu/tasks.h | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index df6b2cb2f205..fcbd0ec33c86 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -145,6 +145,7 @@ static int rcu_task_ipi_delay __read_mostly = RCU_TASK_IPI_DELAY; module_param(rcu_task_ipi_delay, int, 0644); /* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ +#define RCU_TASK_BOOT_STALL_TIMEOUT (HZ * 30) #define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; module_param(rcu_task_stall_timeout, int, 0644); @@ -1776,23 +1777,24 @@ struct rcu_tasks_test_desc { struct rcu_head rh; const char *name; bool notrun; + unsigned long runstart; }; static struct rcu_tasks_test_desc tests[] = { { .name = "call_rcu_tasks()", /* If not defined, the test is skipped. */ - .notrun = !IS_ENABLED(CONFIG_TASKS_RCU), + .notrun = IS_ENABLED(CONFIG_TASKS_RCU), }, { .name = "call_rcu_tasks_rude()", /* If not defined, the test is skipped. */ - .notrun = !IS_ENABLED(CONFIG_TASKS_RUDE_RCU), + .notrun = IS_ENABLED(CONFIG_TASKS_RUDE_RCU), }, { .name = "call_rcu_tasks_trace()", /* If not defined, the test is skipped. */ - .notrun = !IS_ENABLED(CONFIG_TASKS_TRACE_RCU) + .notrun = IS_ENABLED(CONFIG_TASKS_TRACE_RCU) } }; @@ -1803,23 +1805,28 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp) pr_info("Callback from %s invoked.\n", rttd->name); - rttd->notrun = true; + rttd->notrun = false; } static void rcu_tasks_initiate_self_tests(void) { + unsigned long j = jiffies; + pr_info("Running RCU-tasks wait API self tests\n"); #ifdef CONFIG_TASKS_RCU + tests[0].runstart = j; synchronize_rcu_tasks(); call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback); #endif #ifdef CONFIG_TASKS_RUDE_RCU + tests[1].runstart = j; synchronize_rcu_tasks_rude(); call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback); #endif #ifdef CONFIG_TASKS_TRACE_RCU + tests[2].runstart = j; synchronize_rcu_tasks_trace(); call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback); #endif @@ -1829,11 +1836,18 @@ static int rcu_tasks_verify_self_tests(void) { int ret = 0; int i; + unsigned long bst = rcu_task_stall_timeout; + if (bst <= 0 || bst > RCU_TASK_BOOT_STALL_TIMEOUT) + bst = RCU_TASK_BOOT_STALL_TIMEOUT; for (i = 0; i < ARRAY_SIZE(tests); i++) { - if (!tests[i].notrun) { // still hanging. - pr_err("%s has been failed.\n", tests[i].name); - ret = -1; + while (tests[i].notrun) { // still hanging. + if (time_after(jiffies, tests[i].runstart + bst)) { + pr_err("%s has failed boot-time tests.\n", tests[i].name); + ret = -1; + break; + } + schedule_timeout_uninterruptible(1); } } From e72ee5e1a866b85cb6c3d4c80a1125976020a7e8 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 14 Jun 2022 08:06:20 -0400 Subject: [PATCH 36/75] rcu-tasks: Use delayed_work to delay rcu_tasks_verify_self_tests() Commit 2585014188d5 ("rcu-tasks: Be more patient for RCU Tasks boot-time testing") fixes false positive rcu_tasks verification check failure by repeating the test once every second until timeout using schedule_timeout_uninterruptible(). 
Since rcu_tasks_verify_self_tests() is called from do_initcalls() as a late_initcall, this has the undesirable side effect of delaying other late_initcalls queued after it by a second or more. Fix this by instead using delayed_work to repeat the verification check. Fixes: 2585014188d5 ("rcu-tasks: Be more patient for RCU Tasks boot-time testing") Signed-off-by: Waiman Long Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 37 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index fcbd0ec33c86..83c7e6620d40 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1832,6 +1832,11 @@ static void rcu_tasks_initiate_self_tests(void) #endif } +/* + * Return: 0 - test passed + * 1 - test failed, but have not timed out yet + * -1 - test failed and timed out + */ static int rcu_tasks_verify_self_tests(void) { int ret = 0; @@ -1847,16 +1852,38 @@ static int rcu_tasks_verify_self_tests(void) ret = -1; break; } - schedule_timeout_uninterruptible(1); + ret = 1; + break; } } - - if (ret) - WARN_ON(1); + WARN_ON(ret < 0); return ret; } -late_initcall(rcu_tasks_verify_self_tests); + +/* + * Repeat the rcu_tasks_verify_self_tests() call once every second until the + * test passes or has timed out. + */ +static struct delayed_work rcu_tasks_verify_work; +static void rcu_tasks_verify_work_fn(struct work_struct *work __maybe_unused) +{ + int ret = rcu_tasks_verify_self_tests(); + + if (ret <= 0) + return; + + /* Test fails but not timed out yet, reschedule another check */ + schedule_delayed_work(&rcu_tasks_verify_work, HZ); +} + +static int rcu_tasks_verify_schedule_work(void) +{ + INIT_DELAYED_WORK(&rcu_tasks_verify_work, rcu_tasks_verify_work_fn); + rcu_tasks_verify_work_fn(NULL); + return 0; +} +late_initcall(rcu_tasks_verify_schedule_work); #else /* #ifdef CONFIG_PROVE_RCU */ static void rcu_tasks_initiate_self_tests(void) { } #endif /* #else #ifdef CONFIG_PROVE_RCU */ From 14c0017c19ead104511dbcf6a59b83b2456d09af Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 11 Apr 2022 17:19:03 +0200 Subject: [PATCH 37/75] rcu/torture: Change order of warning and trace dump Dumping a big ftrace buffer could lead to an RCU stall. So there is the ftrace buffer and the stall information which needs to be printed. When there is additionally a WARN_ON() which describes the reason for the ftrace buffer dump and the WARN_ON() is executed _after_ the ftrace buffer dump, the information gets lost in the middle of the RCU stall information. Therefore print the WARN_ON() message before dumping the ftrace buffer in rcu_torture_writer(). [ paulmck: Add tracing_off() to avoid cruft from WARN(). ] Signed-off-by: Anna-Maria Behnsen Reviewed-by: Benedikt Spranger Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7120165a9342..3032dd7c7ad3 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1321,8 +1321,9 @@ rcu_torture_writer(void *arg) if (list_empty(&rcu_tortures[i].rtort_free) && rcu_access_pointer(rcu_torture_current) != &rcu_tortures[i]) { - rcu_ftrace_dump(DUMP_ALL); + tracing_off(); WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); + rcu_ftrace_dump(DUMP_ALL); } if (stutter_waited) sched_set_normal(current, oldnice); From d984114ec23818670c8873939eac81ba6e104ff5 Mon Sep 17 00:00:00 2001 From: "Paul E.
McKenney" Date: Wed, 27 Apr 2022 11:46:02 -0700 Subject: [PATCH 38/75] rcutorture: Simplify rcu_torture_read_exit_child() loop The existing loop has an implicit manual loop that obscures the flow and requires an extra control variable. This commit makes this implicit loop explicit, thus saving several lines of code. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 47 ++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 3032dd7c7ad3..eed32cea938b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2874,7 +2874,6 @@ static int rcu_torture_read_exit_child(void *trsp_in) // Parent kthread which creates and destroys read-exit child kthreads. static int rcu_torture_read_exit(void *unused) { - int count = 0; bool errexit = false; int i; struct task_struct *tsp; @@ -2886,34 +2885,28 @@ static int rcu_torture_read_exit(void *unused) // Each pass through this loop does one read-exit episode. do { - if (++count > read_exit_burst) { - VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode"); - rcu_barrier(); // Wait for task_struct free, avoid OOM. - for (i = 0; i < read_exit_delay; i++) { - schedule_timeout_uninterruptible(HZ); - if (READ_ONCE(read_exit_child_stop)) - break; + VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode"); + for (i = 0; i < read_exit_burst; i++) { + if (READ_ONCE(read_exit_child_stop)) + break; + stutter_wait("rcu_torture_read_exit"); + // Spawn child. + tsp = kthread_run(rcu_torture_read_exit_child, + &trs, "%s", "rcu_torture_read_exit_child"); + if (IS_ERR(tsp)) { + TOROUT_ERRSTRING("out of memory"); + errexit = true; + break; } - if (!READ_ONCE(read_exit_child_stop)) - VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode"); - count = 0; + cond_resched(); + kthread_stop(tsp); + n_read_exits++; } - if (READ_ONCE(read_exit_child_stop)) - break; - // Spawn child. - tsp = kthread_run(rcu_torture_read_exit_child, - &trs, "%s", - "rcu_torture_read_exit_child"); - if (IS_ERR(tsp)) { - TOROUT_ERRSTRING("out of memory"); - errexit = true; - tsp = NULL; - break; - } - cond_resched(); - kthread_stop(tsp); - n_read_exits ++; - stutter_wait("rcu_torture_read_exit"); + VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode"); + rcu_barrier(); // Wait for task_struct free, avoid OOM. + i = 0; + for (; !errexit && !READ_ONCE(read_exit_child_stop) && i < read_exit_delay; i++) + schedule_timeout_uninterruptible(HZ); } while (!errexit && !READ_ONCE(read_exit_child_stop)); // Clean up and exit. From 98ea20328786372cbbc90c601be168f5fe1f8845 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Wed, 27 Apr 2022 15:15:20 +0800 Subject: [PATCH 39/75] rcutorture: Fix memory leak in rcu_test_debug_objects() The kernel memory leak detector located the following: unreferenced object 0xffff95d941135b50 (size 16): comm "swapper/0", pid 1, jiffies 4294667610 (age 1367.451s) hex dump (first 16 bytes): f0 c6 c2 bd d9 95 ff ff 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000bc81d9b1>] kmem_cache_alloc_trace+0x2f6/0x500 [<00000000d28be229>] rcu_torture_init+0x1235/0x1354 [<0000000032c3acd9>] do_one_initcall+0x51/0x210 [<000000003c117727>] kernel_init_freeable+0x205/0x259 [<000000003961f965>] kernel_init+0x1a/0x120 [<000000001998f890>] ret_from_fork+0x22/0x30 This is caused by the rcu_test_debug_objects() function allocating an rcu_head structure, then failing to free it. 
This commit therefore adds the needed kfree() after the last use of this structure. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index eed32cea938b..62841e9cd268 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -3116,6 +3116,7 @@ static void rcu_test_debug_objects(void) pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME); destroy_rcu_head_on_stack(&rh1); destroy_rcu_head_on_stack(&rh2); + kfree(rhp); #else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME); #endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ From 5c92d7501699a5deb72a579f808500db5bb6f92a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 May 2022 13:22:28 -0700 Subject: [PATCH 40/75] torture: Adjust to again produce debugging information A recent change to the DEBUG_INFO Kconfig option means that simply adding CONFIG_DEBUG_INFO=y to the .config file and running "make oldconfig" no longer works. It is instead necessary to add CONFIG_DEBUG_INFO_NONE=n and (for example) CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y. This combination will then result in CONFIG_DEBUG_INFO being selected. This commit therefore updates the Kconfig options produced in response to the kvm.sh --gdb, --kasan, and --kcsan Kconfig options. Fixes: f9b3cd245784 ("Kconfig.debug: make DEBUG_INFO selectable from a choice") Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 263e16aeca0e..6c734818a875 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -164,7 +164,7 @@ do shift ;; --gdb) - TORTURE_KCONFIG_GDB_ARG="CONFIG_DEBUG_INFO=y"; export TORTURE_KCONFIG_GDB_ARG + TORTURE_KCONFIG_GDB_ARG="CONFIG_DEBUG_INFO_NONE=n CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y"; export TORTURE_KCONFIG_GDB_ARG TORTURE_BOOT_GDB_ARG="nokaslr"; export TORTURE_BOOT_GDB_ARG TORTURE_QEMU_GDB_ARG="-s -S"; export TORTURE_QEMU_GDB_ARG ;; @@ -180,7 +180,7 @@ do shift ;; --kasan) - TORTURE_KCONFIG_KASAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KASAN=y"; export TORTURE_KCONFIG_KASAN_ARG + TORTURE_KCONFIG_KASAN_ARG="CONFIG_DEBUG_INFO_NONE=n CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_KASAN=y"; export TORTURE_KCONFIG_KASAN_ARG if test -n "$torture_qemu_mem_default" then TORTURE_QEMU_MEM=2G @@ -192,7 +192,7 @@ do shift ;; --kcsan) - TORTURE_KCONFIG_KCSAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KCSAN=y CONFIG_KCSAN_STRICT=y CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y"; export TORTURE_KCONFIG_KCSAN_ARG + TORTURE_KCONFIG_KCSAN_ARG="CONFIG_DEBUG_INFO_NONE=n CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y CONFIG_KCSAN=y CONFIG_KCSAN_STRICT=y CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y"; export TORTURE_KCONFIG_KCSAN_ARG ;; --kmake-arg|--kmake-args) checkarg --kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$' From 8c0666d320f2fff6bc7cf76422bfbe90c20f53cc Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 20 May 2022 13:18:16 -0700 Subject: [PATCH 41/75] rcutorture: Make failure indication note reader-batch overflow The loop scanning the pipesummary[] array currently skips the last element, which means that the diagnostics ignore those rarest of situations, namely where some readers persist across more than ten grace periods, but all other readers avoid spanning a full grace period. This commit therefore adjusts the scan to include the last element of this array. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 62841e9cd268..7e7c3518ab06 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1869,7 +1869,7 @@ rcu_torture_stats_print(void) batchsummary[i] += READ_ONCE(per_cpu(rcu_torture_batch, cpu)[i]); } } - for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { + for (i = RCU_TORTURE_PIPE_LEN; i >= 0; i--) { if (pipesummary[i] != 0) break; } From 92366810644d5675043c792abb70eaf974a77384 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Sat, 21 May 2022 14:56:26 +0800 Subject: [PATCH 42/75] rcuscale: Fix smp_processor_id()-in-preemptible warnings Systems built with CONFIG_DEBUG_PREEMPT=y can trigger the following BUG while running the rcuscale performance test: BUG: using smp_processor_id() in preemptible [00000000] code: rcu_scale_write/69 CPU: 0 PID: 66 Comm: rcu_scale_write Not tainted 5.18.0-rc7-next-20220517-yoctodev-standard+ caller is debug_smp_processor_id+0x17/0x20 Call Trace: dump_stack_lvl+0x49/0x5e dump_stack+0x10/0x12 check_preemption_disabled+0xdf/0xf0 debug_smp_processor_id+0x17/0x20 rcu_scale_writer+0x2b5/0x580 kthread+0x177/0x1b0 ret_from_fork+0x22/0x30 Reproduction method: runqemu kvm slirp nographic qemuparams="-m 4096 -smp 8" bootparams="isolcpus=2,3 nohz_full=2,3 rcu_nocbs=2,3 rcutree.dump_tree=1 rcuscale.shutdown=false rcuscale.gp_async=true" -d The problem is that the rcu_scale_writer() kthreads fail to set the PF_NO_SETAFFINITY flags, which causes is_percpu_thread() to assume that the kthread's affinity might change at any time, thus the BUG noted above. This commit therefore causes rcu_scale_writer() to set PF_NO_SETAFFINITY in its kthread's ->flags field, thus preventing this BUG. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuscale.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 277a5bfb37d4..3ef02d4a8108 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -419,6 +419,7 @@ rcu_scale_writer(void *arg) VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started"); WARN_ON(!wdpp); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + current->flags |= PF_NO_SETAFFINITY; sched_set_fifo_low(current); if (holdoff) From 148df92fb14e5aab1c84b8ca4d2181e7b117290d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 Jun 2022 19:53:52 -0700 Subject: [PATCH 43/75] torture: Create kvm-check-branches.sh output in proper location Currently, kvm-check-branches.sh causes each kvm.sh invocation create a separate date-stamped directory, then after that invocation completes, moves it into the *-group/NNNN directory. This works, but makes it more difficult to monitor an ongoing run. This commit therefore uses the kvm.sh --datestamp argument to make kvm.sh put the output in the right place to start with, and also dispenses with the additional level of datestamping. (Those wanting datestamps can find them in the log files.) 
Signed-off-by: Paul E. McKenney --- .../selftests/rcutorture/bin/kvm-check-branches.sh | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh b/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh index f17000a2ccf1..ed0ec7f0927e 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh @@ -35,7 +35,7 @@ then exit 1 fi -# Remember where we started so that we can get back and the end. +# Remember where we started so that we can get back at the end. curcommit="`git status | head -1 | awk '{ print $NF }'`" nfail=0 @@ -73,15 +73,10 @@ do # Test the specified commit. git checkout $i > $resdir/$ds/$idir/git-checkout.out 2>&1 echo git checkout return code: $? "(Commit $ntry: $i)" - kvm.sh --allcpus --duration 3 --trust-make > $resdir/$ds/$idir/kvm.sh.out 2>&1 + kvm.sh --allcpus --duration 3 --trust-make --datestamp "$ds/$idir" > $resdir/$ds/$idir/kvm.sh.out 2>&1 ret=$? echo kvm.sh return code $ret for commit $i from branch $gitbr - - # Move the build products to their resting place. - runresdir="`grep -m 1 '^Results directory:' < $resdir/$ds/$idir/kvm.sh.out | sed -e 's/^Results directory://'`" - mv $runresdir $resdir/$ds/$idir - rrd="`echo $runresdir | sed -e 's,^.*/,,'`" - echo Run results: $resdir/$ds/$idir/$rrd + echo Run results: $resdir/$ds/$idir if test "$ret" -ne 0 then # Failure, so leave all evidence intact. From 3002153a91a9732a6d1d0bb95138593c7da15743 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 10 Jun 2022 15:03:57 +0200 Subject: [PATCH 44/75] rcutorture: Fix ksoftirqd boosting timing and iteration The RCU priority boosting can fail in two situations: 1) If (nr_cpus= > maxcpus=), which means if the total number of CPUs is higher than those brought online at boot, then torture_onoff() may later bring up CPUs that weren't online on boot. Now since rcutorture initialization only boosts the ksoftirqds of the CPUs that have been set online on boot, the CPUs later set online by torture_onoff won't benefit from the boost, making RCU priority boosting fail. 2) The ksoftirqd kthreads are boosted after the creation of rcu_torture_boost() kthreads, which opens a window large enough for these rcu_torture_boost() kthreads to wait (despite running at FIFO priority) for ksoftirqds that are still running at SCHED_NORMAL priority. The issues can trigger for example with: ./kvm.sh --configs TREE01 --kconfig "CONFIG_RCU_BOOST=y" [ 34.968561] rcu-torture: !!! 
[ 34.968627] ------------[ cut here ]------------ [ 35.014054] WARNING: CPU: 4 PID: 114 at kernel/rcu/rcutorture.c:1979 rcu_torture_stats_print+0x5ad/0x610 [ 35.052043] Modules linked in: [ 35.069138] CPU: 4 PID: 114 Comm: rcu_torture_sta Not tainted 5.18.0-rc1 #1 [ 35.096424] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.14.0-0-g155821a-rebuilt.opensuse.org 04/01/2014 [ 35.154570] RIP: 0010:rcu_torture_stats_print+0x5ad/0x610 [ 35.198527] Code: 63 1b 02 00 74 02 0f 0b 48 83 3d 35 63 1b 02 00 74 02 0f 0b 48 83 3d 21 63 1b 02 00 74 02 0f 0b 48 83 3d 0d 63 1b 02 00 74 02 <0f> 0b 83 eb 01 0f 8e ba fc ff ff 0f 0b e9 b3 fc ff f82 [ 37.251049] RSP: 0000:ffffa92a0050bdf8 EFLAGS: 00010202 [ 37.277320] rcu: De-offloading 8 [ 37.290367] RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000001 [ 37.290387] RDX: 0000000000000000 RSI: 00000000ffffbfff RDI: 00000000ffffffff [ 37.290398] RBP: 000000000000007b R08: 0000000000000000 R09: c0000000ffffbfff [ 37.290407] R10: 000000000000002a R11: ffffa92a0050bc18 R12: ffffa92a0050be20 [ 37.290417] R13: ffffa92a0050be78 R14: 0000000000000000 R15: 000000000001bea0 [ 37.290427] FS: 0000000000000000(0000) GS:ffff96045eb00000(0000) knlGS:0000000000000000 [ 37.290448] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 37.290460] CR2: 0000000000000000 CR3: 000000001dc0c000 CR4: 00000000000006e0 [ 37.290470] Call Trace: [ 37.295049] [ 37.295065] ? preempt_count_add+0x63/0x90 [ 37.295095] ? _raw_spin_lock_irqsave+0x12/0x40 [ 37.295125] ? rcu_torture_stats_print+0x610/0x610 [ 37.295143] rcu_torture_stats+0x29/0x70 [ 37.295160] kthread+0xe3/0x110 [ 37.295176] ? kthread_complete_and_exit+0x20/0x20 [ 37.295193] ret_from_fork+0x22/0x30 [ 37.295218] Fix this with boosting the ksoftirqds kthreads from the boosting hotplug callback itself and before the boosting kthreads are created. Fixes: ea6d962e80b6 ("rcutorture: Judge RCU priority boosting on grace periods, not callbacks") Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7e7c3518ab06..6ba1cf8b3b71 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2076,6 +2076,19 @@ static int rcutorture_booster_init(unsigned int cpu) if (boost_tasks[cpu] != NULL) return 0; /* Already created, nothing more to do. */ + // Testing RCU priority boosting requires rcutorture do + // some serious abuse. Counter this by running ksoftirqd + // at higher priority. + if (IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) { + struct sched_param sp; + struct task_struct *t; + + t = per_cpu(ksoftirqd, cpu); + WARN_ON_ONCE(!t); + sp.sched_priority = 2; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + } + /* Don't allow time recalculation while creating a new task. */ mutex_lock(&boost_mutex); rcu_torture_disable_rt_throttle(); @@ -3324,21 +3337,6 @@ rcu_torture_init(void) rcutor_hp = firsterr; if (torture_init_error(firsterr)) goto unwind; - - // Testing RCU priority boosting requires rcutorture do - // some serious abuse. Counter this by running ksoftirqd - // at higher priority. 
- if (IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) { - for_each_online_cpu(cpu) { - struct sched_param sp; - struct task_struct *t; - - t = per_cpu(ksoftirqd, cpu); - WARN_ON_ONCE(!t); - sp.sched_priority = 2; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - } - } } shutdown_jiffies = jiffies + shutdown_secs * HZ; firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); From 1a5ca5e09811dec9472f50b98eeb3e3b2442d050 Mon Sep 17 00:00:00 2001 From: Li Qiong Date: Sun, 12 Jun 2022 14:48:25 +0800 Subject: [PATCH 45/75] rcutorture: Handle failure of memory allocation functions This commit adds warnings for allocation failure during the mem_dump_obj() tests. It also terminates these tests upon such failure. Signed-off-by: Li Qiong Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 6ba1cf8b3b71..2ba74498d36d 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1991,7 +1991,13 @@ static void rcu_torture_mem_dump_obj(void) static int z; kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL); + if (WARN_ON_ONCE(!kcp)) + return; rhp = kmem_cache_alloc(kcp, GFP_KERNEL); + if (WARN_ON_ONCE(!rhp)) { + kmem_cache_destroy(kcp); + return; + } pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z); pr_alert("mem_dump_obj(ZERO_SIZE_PTR):"); mem_dump_obj(ZERO_SIZE_PTR); @@ -2008,6 +2014,8 @@ static void rcu_torture_mem_dump_obj(void) kmem_cache_free(kcp, rhp); kmem_cache_destroy(kcp); rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); + if (WARN_ON_ONCE(!rhp)) + return; pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp); pr_alert("mem_dump_obj(kmalloc %px):", rhp); mem_dump_obj(rhp); @@ -2015,6 +2023,8 @@ static void rcu_torture_mem_dump_obj(void) mem_dump_obj(&rhp->func); kfree(rhp); rhp = vmalloc(4096); + if (WARN_ON_ONCE(!rhp)) + return; pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp); pr_alert("mem_dump_obj(vmalloc %px):", rhp); mem_dump_obj(rhp); From 7bf336fb8dac718febb7bf4fe79e1be0c5e4a631 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Sun, 12 Jun 2022 10:02:25 +0800 Subject: [PATCH 46/75] refscale: Convert test_lock spinlock to raw_spinlock In kernels built with CONFIG_PREEMPT_RT=y, spinlocks are replaced by rt_mutex, which can sleep. This means that acquiring a non-raw spinlock in a critical section where preemption is disabled can trigger the following BUG: BUG: scheduling while atomic: ref_scale_reade/76/0x00000002 Preemption disabled at: ref_lock_section+0x16/0x80 Call Trace: dump_stack_lvl+0x5b/0x82 dump_stack+0x10/0x12 __schedule_bug.cold+0x9c/0xad __schedule+0x839/0xc00 schedule_rtlock+0x22/0x40 rtlock_slowlock_locked+0x460/0x1350 rt_spin_lock+0x61/0xe0 ref_lock_section+0x29/0x80 rcu_scale_one_reader+0x52/0x60 ref_scale_reader+0x28d/0x490 kthread+0x128/0x150 ret_from_fork+0x22/0x30 This commit therefore converts spinlock to raw_spinlock. Signed-off-by: Zqiang Signed-off-by: Paul E. 
McKenney --- kernel/rcu/refscale.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 909644abee67..435c884c02b5 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -385,7 +385,7 @@ static struct ref_scale_ops rwsem_ops = { }; // Definitions for global spinlock -static DEFINE_SPINLOCK(test_lock); +static DEFINE_RAW_SPINLOCK(test_lock); static void ref_lock_section(const int nloops) { @@ -393,8 +393,8 @@ static void ref_lock_section(const int nloops) preempt_disable(); for (i = nloops; i >= 0; i--) { - spin_lock(&test_lock); - spin_unlock(&test_lock); + raw_spin_lock(&test_lock); + raw_spin_unlock(&test_lock); } preempt_enable(); } @@ -405,9 +405,9 @@ static void ref_lock_delay_section(const int nloops, const int udl, const int nd preempt_disable(); for (i = nloops; i >= 0; i--) { - spin_lock(&test_lock); + raw_spin_lock(&test_lock); un_delay(udl, ndl); - spin_unlock(&test_lock); + raw_spin_unlock(&test_lock); } preempt_enable(); } @@ -427,8 +427,8 @@ static void ref_lock_irq_section(const int nloops) preempt_disable(); for (i = nloops; i >= 0; i--) { - spin_lock_irqsave(&test_lock, flags); - spin_unlock_irqrestore(&test_lock, flags); + raw_spin_lock_irqsave(&test_lock, flags); + raw_spin_unlock_irqrestore(&test_lock, flags); } preempt_enable(); } @@ -440,9 +440,9 @@ static void ref_lock_irq_delay_section(const int nloops, const int udl, const in preempt_disable(); for (i = nloops; i >= 0; i--) { - spin_lock_irqsave(&test_lock, flags); + raw_spin_lock_irqsave(&test_lock, flags); un_delay(udl, ndl); - spin_unlock_irqrestore(&test_lock, flags); + raw_spin_unlock_irqrestore(&test_lock, flags); } preempt_enable(); } From 8f870e6eb8c0c3f9869bf3fcf9db39f86cfcea49 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 12 Jun 2022 15:00:06 -0700 Subject: [PATCH 47/75] srcu: Block less aggressively for expedited grace periods Commit 282d8998e997 ("srcu: Prevent expedited GPs and blocking readers from consuming CPU") fixed a problem where a long-running expedited SRCU grace period could block kernel live patching. It did so by giving up on expediting once a given SRCU expedited grace period grew too old. Unfortunately, this added excessive delays to boots of virtual embedded systems specifying "-bios QEMU_EFI.fd" to qemu. This commit therefore makes the transition away from expediting less aggressive, increasing the per-grace-period phase number of non-sleeping polls of readers from one to three and increasing the required grace-period age from one jiffy (actually from zero to one jiffies) to two jiffies (actually from one to two jiffies). Fixes: 282d8998e997 ("srcu: Prevent expedited GPs and blocking readers from consuming CPU") Signed-off-by: Paul E. McKenney Reported-by: Zhangfei Gao Reported-by: chenxiang (M)" Cc: Shameerali Kolothum Thodi Cc: Paolo Bonzini Reviewed-by: Neeraj Upadhyay Link: https://lore.kernel.org/all/20615615-0013-5adc-584f-2b1d5c03ebfc@linaro.org/ --- kernel/rcu/srcutree.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 50ba70f019de..0db7873f4e95 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -513,7 +513,7 @@ static bool srcu_readers_active(struct srcu_struct *ssp) #define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending. #define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers. 
-#define SRCU_MAX_NODELAY_PHASE 1 // Maximum per-GP-phase consecutive no-delay instances. +#define SRCU_MAX_NODELAY_PHASE 3 // Maximum per-GP-phase consecutive no-delay instances. #define SRCU_MAX_NODELAY 100 // Maximum consecutive no-delay instances. /* @@ -522,16 +522,22 @@ static bool srcu_readers_active(struct srcu_struct *ssp) */ static unsigned long srcu_get_delay(struct srcu_struct *ssp) { + unsigned long gpstart; + unsigned long j; unsigned long jbase = SRCU_INTERVAL; if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp))) jbase = 0; - if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))) - jbase += jiffies - READ_ONCE(ssp->srcu_gp_start); - if (!jbase) { - WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1); - if (READ_ONCE(ssp->srcu_n_exp_nodelay) > SRCU_MAX_NODELAY_PHASE) - jbase = 1; + if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))) { + j = jiffies - 1; + gpstart = READ_ONCE(ssp->srcu_gp_start); + if (time_after(j, gpstart)) + jbase += j - gpstart; + if (!jbase) { + WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1); + if (READ_ONCE(ssp->srcu_n_exp_nodelay) > SRCU_MAX_NODELAY_PHASE) + jbase = 1; + } } return jbase > SRCU_MAX_INTERVAL ? SRCU_MAX_INTERVAL : jbase; } From 4f2bfd9494a072d58203600de6bedd72680e612a Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Fri, 1 Jul 2022 08:45:45 +0530 Subject: [PATCH 48/75] srcu: Make expedited RCU grace periods block even less frequently MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The purpose of commit 282d8998e997 ("srcu: Prevent expedited GPs and blocking readers from consuming CPU") was to prevent a long series of never-blocking expedited SRCU grace periods from blocking kernel-live-patching (KLP) progress. Although it was successful, it also resulted in excessive boot times on certain embedded workloads running under qemu with the "-bios QEMU_EFI.fd" command line. Here "excessive" means increasing the boot time up into the three-to-four minute range. This increase in boot time was due to the more than 6000 back-to-back invocations of synchronize_rcu_expedited() within the KVM host OS, which in turn resulted from qemu's emulation of a long series of MMIO accesses. Commit 640a7d37c3f4 ("srcu: Block less aggressively for expedited grace periods") did not significantly help this particular use case. Zhangfei Gao and Shameerali Kolothum Thodi did experiments varying the value of SRCU_MAX_NODELAY_PHASE with HZ=250 and with various values of non-sleeping per phase counts on a system with preemption enabled, and observed the following boot times: +──────────────────────────+────────────────+ | SRCU_MAX_NODELAY_PHASE | Boot time (s) | +──────────────────────────+────────────────+ | 100 | 30.053 | | 150 | 25.151 | | 200 | 20.704 | | 250 | 15.748 | | 500 | 11.401 | | 1000 | 11.443 | | 10000 | 11.258 | | 1000000 | 11.154 | +──────────────────────────+────────────────+ Analysis on the experiment results show additional improvements with CPU-bound delays approaching one jiffy in duration. This improvement was also seen when number of per-phase iterations were scaled to one jiffy. This commit therefore scales per-grace-period phase number of non-sleeping polls so that non-sleeping polls extend for about one jiffy. In addition, the delay-calculation call to srcu_get_delay() in srcu_gp_end() is replaced with a simple check for an expedited grace period. 
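As a rough illustration of the per-phase scaling described above, the standalone sketch below reproduces the clamp arithmetic under the assumptions of a 5-microsecond per-poll delay and a [3, 1000] clamp range (the defaults this patch introduces); it is not the kernel code itself, only a way to check the numbers against the table.

#include <stdio.h>

/*
 * Illustrative reconstruction of the per-phase no-delay scaling; the
 * real macros live in kernel/rcu/srcutree.c.  The 5-microsecond poll
 * delay and the [3, 1000] clamp range are assumed defaults.
 */
#define USEC_PER_SEC          1000000UL
#define RETRY_CHECK_DELAY_US        5UL
#define PHASE_LO                    3UL
#define PHASE_HI                 1000UL

static unsigned long max_nodelay_phase(unsigned long hz)
{
	/* No-delay polls that fit in one jiffy, doubled to account for
	 * the extra srcu_get_delay() call made from process_srcu(). */
	unsigned long val = 2UL * USEC_PER_SEC / hz / RETRY_CHECK_DELAY_US;

	if (val < PHASE_LO)
		val = PHASE_LO;
	if (val > PHASE_HI)
		val = PHASE_HI;
	return val;
}

int main(void)
{
	printf("HZ=250  -> %lu\n", max_nodelay_phase(250));   /* 1600, clamped to 1000 */
	printf("HZ=1000 -> %lu\n", max_nodelay_phase(1000));  /* 400 */
	return 0;
}

At HZ=250 the unclamped value is 1600, which the upper bound trims to 1000, consistent with the plateau seen in the measured boot times above. Returning to the srcu_gp_end() simplification just mentioned: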
This change schedules callback invocation immediately after expedited grace periods complete, which results in greatly improved boot times. Testing done by Marc and Zhangfei confirms that this change recovers most of the performance degradation in boottime; for CONFIG_HZ_250 configuration, specifically, boot times improve from 3m50s to 41s on Marc's setup; and from 2m40s to ~9.7s on Zhangfei's setup. In addition to the changes to default per phase delays, this change adds 3 new kernel parameters - srcutree.srcu_max_nodelay, srcutree.srcu_max_nodelay_phase, and srcutree.srcu_retry_check_delay. This allows users to configure the srcu grace period scanning delays in order to more quickly react to additional use cases. Fixes: 640a7d37c3f4 ("srcu: Block less aggressively for expedited grace periods") Fixes: 282d8998e997 ("srcu: Prevent expedited GPs and blocking readers from consuming CPU") Reported-by: Zhangfei Gao Reported-by: yueluck Signed-off-by: Neeraj Upadhyay Tested-by: Marc Zyngier Tested-by: Zhangfei Gao Link: https://lore.kernel.org/all/20615615-0013-5adc-584f-2b1d5c03ebfc@linaro.org/ Signed-off-by: Paul E. McKenney --- .../admin-guide/kernel-parameters.txt | 18 ++++ kernel/rcu/srcutree.c | 82 ++++++++++++++----- 2 files changed, 81 insertions(+), 19 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2522b11e593f..c3245baf588f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5771,6 +5771,24 @@ expediting. Set to zero to disable automatic expediting. + srcutree.srcu_max_nodelay [KNL] + Specifies the number of no-delay instances + per jiffy for which the SRCU grace period + worker thread will be rescheduled with zero + delay. Beyond this limit, worker thread will + be rescheduled with a sleep delay of one jiffy. + + srcutree.srcu_max_nodelay_phase [KNL] + Specifies the per-grace-period phase, number of + non-sleeping polls of readers. Beyond this limit, + grace period worker thread will be rescheduled + with a sleep delay of one jiffy, between each + rescan of the readers, for a grace period phase. + + srcutree.srcu_retry_check_delay [KNL] + Specifies number of microseconds of non-sleeping + delay between each non-sleeping poll of readers. + srcutree.small_contention_lim [KNL] Specifies the number of update-side contention events per jiffy will be tolerated before diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 0db7873f4e95..1c304fec89c0 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -511,10 +511,52 @@ static bool srcu_readers_active(struct srcu_struct *ssp) return sum; } -#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending. -#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers. -#define SRCU_MAX_NODELAY_PHASE 3 // Maximum per-GP-phase consecutive no-delay instances. -#define SRCU_MAX_NODELAY 100 // Maximum consecutive no-delay instances. +/* + * We use an adaptive strategy for synchronize_srcu() and especially for + * synchronize_srcu_expedited(). We spin for a fixed time period + * (defined below, boot time configurable) to allow SRCU readers to exit + * their read-side critical sections. If there are still some readers + * after one jiffy, we repeatedly block for one jiffy time periods. + * The blocking time is increased as the grace-period age increases, + * with max blocking time capped at 10 jiffies. 
+ */ +#define SRCU_DEFAULT_RETRY_CHECK_DELAY 5 + +static ulong srcu_retry_check_delay = SRCU_DEFAULT_RETRY_CHECK_DELAY; +module_param(srcu_retry_check_delay, ulong, 0444); + +#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending. +#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers. + +#define SRCU_DEFAULT_MAX_NODELAY_PHASE_LO 3UL // Lowmark on default per-GP-phase + // no-delay instances. +#define SRCU_DEFAULT_MAX_NODELAY_PHASE_HI 1000UL // Highmark on default per-GP-phase + // no-delay instances. + +#define SRCU_UL_CLAMP_LO(val, low) ((val) > (low) ? (val) : (low)) +#define SRCU_UL_CLAMP_HI(val, high) ((val) < (high) ? (val) : (high)) +#define SRCU_UL_CLAMP(val, low, high) SRCU_UL_CLAMP_HI(SRCU_UL_CLAMP_LO((val), (low)), (high)) +// per-GP-phase no-delay instances adjusted to allow non-sleeping poll upto +// one jiffies time duration. Mult by 2 is done to factor in the srcu_get_delay() +// called from process_srcu(). +#define SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED \ + (2UL * USEC_PER_SEC / HZ / SRCU_DEFAULT_RETRY_CHECK_DELAY) + +// Maximum per-GP-phase consecutive no-delay instances. +#define SRCU_DEFAULT_MAX_NODELAY_PHASE \ + SRCU_UL_CLAMP(SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED, \ + SRCU_DEFAULT_MAX_NODELAY_PHASE_LO, \ + SRCU_DEFAULT_MAX_NODELAY_PHASE_HI) + +static ulong srcu_max_nodelay_phase = SRCU_DEFAULT_MAX_NODELAY_PHASE; +module_param(srcu_max_nodelay_phase, ulong, 0444); + +// Maximum consecutive no-delay instances. +#define SRCU_DEFAULT_MAX_NODELAY (SRCU_DEFAULT_MAX_NODELAY_PHASE > 100 ? \ + SRCU_DEFAULT_MAX_NODELAY_PHASE : 100) + +static ulong srcu_max_nodelay = SRCU_DEFAULT_MAX_NODELAY; +module_param(srcu_max_nodelay, ulong, 0444); /* * Return grace-period delay, zero if there are expedited grace @@ -535,7 +577,7 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) jbase += j - gpstart; if (!jbase) { WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1); - if (READ_ONCE(ssp->srcu_n_exp_nodelay) > SRCU_MAX_NODELAY_PHASE) + if (READ_ONCE(ssp->srcu_n_exp_nodelay) > srcu_max_nodelay_phase) jbase = 1; } } @@ -612,15 +654,6 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx) } EXPORT_SYMBOL_GPL(__srcu_read_unlock); -/* - * We use an adaptive strategy for synchronize_srcu() and especially for - * synchronize_srcu_expedited(). We spin for a fixed time period - * (defined below) to allow SRCU readers to exit their read-side critical - * sections. If there are still some readers after a few microseconds, - * we repeatedly block for 1-millisecond time periods. - */ -#define SRCU_RETRY_CHECK_DELAY 5 - /* * Start an SRCU grace period. 
*/ @@ -706,7 +739,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp */ static void srcu_gp_end(struct srcu_struct *ssp) { - unsigned long cbdelay; + unsigned long cbdelay = 1; bool cbs; bool last_lvl; int cpu; @@ -726,7 +759,9 @@ static void srcu_gp_end(struct srcu_struct *ssp) spin_lock_irq_rcu_node(ssp); idx = rcu_seq_state(ssp->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); - cbdelay = !!srcu_get_delay(ssp); + if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp))) + cbdelay = 0; + WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns()); rcu_seq_end(&ssp->srcu_gp_seq); gpseq = rcu_seq_current(&ssp->srcu_gp_seq); @@ -927,12 +962,16 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, */ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) { + unsigned long curdelay; + + curdelay = !srcu_get_delay(ssp); + for (;;) { if (srcu_readers_active_idx_check(ssp, idx)) return true; - if (--trycount + !srcu_get_delay(ssp) <= 0) + if ((--trycount + curdelay) <= 0) return false; - udelay(SRCU_RETRY_CHECK_DELAY); + udelay(srcu_retry_check_delay); } } @@ -1588,7 +1627,7 @@ static void process_srcu(struct work_struct *work) j = jiffies; if (READ_ONCE(ssp->reschedule_jiffies) == j) { WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1); - if (READ_ONCE(ssp->reschedule_count) > SRCU_MAX_NODELAY) + if (READ_ONCE(ssp->reschedule_count) > srcu_max_nodelay) curdelay = 1; } else { WRITE_ONCE(ssp->reschedule_count, 1); @@ -1680,6 +1719,11 @@ static int __init srcu_bootup_announce(void) pr_info("Hierarchical SRCU implementation.\n"); if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF) pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff); + if (srcu_retry_check_delay != SRCU_DEFAULT_RETRY_CHECK_DELAY) + pr_info("\tNon-default retry check delay of %lu us.\n", srcu_retry_check_delay); + if (srcu_max_nodelay != SRCU_DEFAULT_MAX_NODELAY) + pr_info("\tNon-default max no-delay of %lu.\n", srcu_max_nodelay); + pr_info("\tMax phase no-delay instances is %lu.\n", srcu_max_nodelay_phase); return 0; } early_initcall(srcu_bootup_announce); From fb77dccfc701b6ebcc232574c828bc69146cf90a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Apr 2022 15:08:14 -0700 Subject: [PATCH 49/75] rcu: Decrease FQS scan wait time in case of callback overloading The force-quiesce-state loop function rcu_gp_fqs_loop() checks for callback overloading and does an immediate initial scan for idle CPUs if so. However, subsequent rescans will be carried out at as leisurely a rate as they always are, as specified by the rcutree.jiffies_till_next_fqs module parameter. It might be tempting to just continue immediately rescanning, but this turns the RCU grace-period kthread into a CPU hog. It might also be tempting to reduce the time between rescans to a single jiffy, but this can be problematic on larger systems. This commit therefore divides the normal time between rescans by three, rounding up. Thus a small system running at HZ=1000 that is suffering from callback overload will wait only one jiffy instead of the normal three between rescans. [ paulmck: Apply Neeraj Upadhyay feedback. ] Signed-off-by: Paul E. 
McKenney Reviewed-by: Neeraj Upadhyay --- kernel/rcu/tree.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c25ba442044a..52094e72866e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1983,7 +1983,12 @@ static noinline_for_stack void rcu_gp_fqs_loop(void) gf = RCU_GP_FLAG_OVLD; ret = 0; for (;;) { - if (!ret) { + if (rcu_state.cbovld) { + j = (j + 2) / 3; + if (j <= 0) + j = 1; + } + if (!ret || time_before(jiffies + j, rcu_state.jiffies_force_qs)) { WRITE_ONCE(rcu_state.jiffies_force_qs, jiffies + j); /* * jiffies_force_qs before RCU_GP_WAIT_FQS state From 48f8070f5dd8e13148ae4647780a452d53c457a2 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Tue, 26 Apr 2022 18:45:02 +0800 Subject: [PATCH 50/75] rcu: Avoid tracing a few functions executed in stop machine Stop-machine recently started calling additional functions while waiting: ---------------------------------------------------------------- Former stop machine wait loop: do { cpu_relax(); => macro ... } while (curstate != STOPMACHINE_EXIT); ----------------------------------------------------------------- Current stop machine wait loop: do { stop_machine_yield(cpumask); => function (notraced) ... touch_nmi_watchdog(); => function (notraced, inside calls also notraced) ... rcu_momentary_dyntick_idle(); => function (notraced, inside calls traced) } while (curstate != MULTI_STOP_EXIT); ------------------------------------------------------------------ These functions (and the functions that they call) must be marked notrace to prevent them from being updated while they are executing. The consequences of failing to mark these functions can be severe: rcu: INFO: rcu_preempt detected stalls on CPUs/tasks: rcu: 1-...!: (0 ticks this GP) idle=14f/1/0x4000000000000000 softirq=3397/3397 fqs=0 rcu: 3-...!: (0 ticks this GP) idle=ee9/1/0x4000000000000000 softirq=5168/5168 fqs=0 (detected by 0, t=8137 jiffies, g=5889, q=2 ncpus=4) Task dump for CPU 1: task:migration/1 state:R running task stack: 0 pid: 19 ppid: 2 flags:0x00000000 Stopper: multi_cpu_stop+0x0/0x18c <- stop_machine_cpuslocked+0x128/0x174 Call Trace: Task dump for CPU 3: task:migration/3 state:R running task stack: 0 pid: 29 ppid: 2 flags:0x00000000 Stopper: multi_cpu_stop+0x0/0x18c <- stop_machine_cpuslocked+0x128/0x174 Call Trace: rcu: rcu_preempt kthread timer wakeup didn't happen for 8136 jiffies! g5889 f0x0 RCU_GP_WAIT_FQS(5) ->state=0x402 rcu: Possible timer handling issue on cpu=2 timer-softirq=594 rcu: rcu_preempt kthread starved for 8137 jiffies! g5889 f0x0 RCU_GP_WAIT_FQS(5) ->state=0x402 ->cpu=2 rcu: Unless rcu_preempt kthread gets sufficient CPU time, OOM is now expected behavior. rcu: RCU grace-period kthread stack dump: task:rcu_preempt state:I stack: 0 pid: 14 ppid: 2 flags:0x00000000 Call Trace: schedule+0x56/0xc2 schedule_timeout+0x82/0x184 rcu_gp_fqs_loop+0x19a/0x318 rcu_gp_kthread+0x11a/0x140 kthread+0xee/0x118 ret_from_exception+0x0/0x14 rcu: Stack dump where RCU GP kthread last ran: Task dump for CPU 2: task:migration/2 state:R running task stack: 0 pid: 24 ppid: 2 flags:0x00000000 Stopper: multi_cpu_stop+0x0/0x18c <- stop_machine_cpuslocked+0x128/0x174 Call Trace: This commit therefore marks these functions notrace: rcu_preempt_deferred_qs() rcu_preempt_need_deferred_qs() rcu_preempt_deferred_qs_irqrestore() [ paulmck: Apply feedback from Neeraj Upadhyay. ] Signed-off-by: Patrick Wang Acked-by: Steven Rostedt (Google) Signed-off-by: Paul E. 
McKenney Reviewed-by: Neeraj Upadhyay --- kernel/rcu/tree_plugin.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c8ba0fe17267..7a07f2ca153e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -460,7 +460,7 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) * be quite short, for example, in the case of the call from * rcu_read_unlock_special(). */ -static void +static notrace void rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) { bool empty_exp; @@ -581,7 +581,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) * is disabled. This function cannot be expected to understand these * nuances, so the caller must handle them. */ -static bool rcu_preempt_need_deferred_qs(struct task_struct *t) +static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t) { return (__this_cpu_read(rcu_data.cpu_no_qs.b.exp) || READ_ONCE(t->rcu_read_unlock_special.s)) && @@ -595,7 +595,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) * evaluate safety in terms of interrupt, softirq, and preemption * disabling. */ -static void rcu_preempt_deferred_qs(struct task_struct *t) +static notrace void rcu_preempt_deferred_qs(struct task_struct *t) { unsigned long flags; @@ -926,7 +926,7 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) * Because there is no preemptible RCU, there can be no deferred quiescent * states. */ -static bool rcu_preempt_need_deferred_qs(struct task_struct *t) +static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t) { return false; } @@ -935,7 +935,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) // period for a quiescent state from this CPU. Note that requests from // tasks are handled when removing the task from the blocked-tasks list // below. -static void rcu_preempt_deferred_qs(struct task_struct *t) +static notrace void rcu_preempt_deferred_qs(struct task_struct *t) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); From 52c1d81ee2911ef592048582c6d07975b7399726 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Thu, 5 May 2022 23:52:36 +0800 Subject: [PATCH 51/75] rcu: Add rnp->cbovldmask check in rcutree_migrate_callbacks() Currently, the rcu_node structure's ->cbovlmask field is set in call_rcu() when a given CPU is suffering from callback overload. But if that CPU goes offline, the outgoing CPU's callbacks is migrated to the running CPU, which is likely to overload the running CPU. However, that CPU's bit in its leaf rcu_node structure's ->cbovlmask field remains zero. Initially, this is OK because the outgoing CPU's bit remains set. However, that bit will be cleared at the next end of a grace period, at which time it is quite possible that the running CPU will still be overloaded. If the running CPU invokes call_rcu(), then overload will be checked for and the bit will be set. Except that there is no guarantee that the running CPU will invoke call_rcu(), in which case the next grace period will fail to take the running CPU's overload condition into account. Plus, because the bit is not set, the end of the grace period won't check for overload on this CPU. This commit therefore adds a call to check_cb_ovld_locked() in rcutree_migrate_callbacks() to set the running CPU's ->cbovlmask bit appropriately. Signed-off-by: Zqiang Signed-off-by: Paul E. 
McKenney Reviewed-by: Neeraj Upadhyay --- kernel/rcu/tree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 52094e72866e..7c62af481981 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4491,6 +4491,7 @@ void rcutree_migrate_callbacks(int cpu) needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp); rcu_segcblist_disable(&rdp->cblist); WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist)); + check_cb_ovld_locked(my_rdp, my_rnp); if (rcu_rdp_is_offloaded(my_rdp)) { raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */ __call_rcu_nocb_wake(my_rdp, true, flags); From 70a82c3c55c8665d3996dcb9968adcf24d52bbc4 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Fri, 13 May 2022 08:42:55 +0800 Subject: [PATCH 52/75] rcu: Immediately boost preempted readers for strict grace periods The intent of the CONFIG_RCU_STRICT_GRACE_PERIOD Konfig option is to cause normal grace periods to complete quickly in order to better catch errors resulting from improperly leaking pointers from RCU read-side critical sections. However, kernels built with this option enabled still wait for some hundreds of milliseconds before boosting RCU readers that have been preempted within their current critical section. The value of this delay is set by the CONFIG_RCU_BOOST_DELAY Kconfig option, which defaults to 500 milliseconds. This commit therefore causes kernels build with strict grace periods to ignore CONFIG_RCU_BOOST_DELAY. This causes rcu_initiate_boost() to start boosting immediately after all CPUs on a given leaf rcu_node structure have passed through their quiescent states. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney Reviewed-by: Neeraj Upadhyay --- kernel/rcu/tree_plugin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7a07f2ca153e..4c92034e2670 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1140,7 +1140,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) (rnp->gp_tasks != NULL && rnp->boost_tasks == NULL && rnp->qsmask == 0 && - (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld))) { + (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld || + IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)))) { if (rnp->exp_tasks == NULL) WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); From b3ade95b8ee507d650d9e163abfdf645a9f3886d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 May 2022 18:03:36 -0700 Subject: [PATCH 53/75] rcu: Forbid RCU_STRICT_GRACE_PERIOD in TINY_RCU kernels The RCU_STRICT_GRACE_PERIOD Kconfig option does nothing in kernels built with CONFIG_TINY_RCU=y, so this commit adjusts the dependencies to disallow this combination. Signed-off-by: Paul E. 
McKenney Reviewed-by: Neeraj Upadhyay --- kernel/rcu/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 9b64e55d4f61..4da05beb13d7 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -121,7 +121,7 @@ config RCU_EQS_DEBUG config RCU_STRICT_GRACE_PERIOD bool "Provide debug RCU implementation with short grace periods" - depends on DEBUG_KERNEL && RCU_EXPERT && NR_CPUS <= 4 + depends on DEBUG_KERNEL && RCU_EXPERT && NR_CPUS <= 4 && !TINY_RCU default n select PREEMPT_COUNT if PREEMPT=n help From 9c9b26b0df270d4f9246e483a44686fca951a29c Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Tue, 10 May 2022 17:46:39 +0800 Subject: [PATCH 54/75] locking/csd_lock: Change csdlock_debug from early_param to __setup The csdlock_debug kernel-boot parameter is parsed by the early_param() function csdlock_debug(). If set, csdlock_debug() invokes static_branch_enable() to enable csd_lock_wait feature, which triggers a panic on arm64 for kernels built with CONFIG_SPARSEMEM=y and CONFIG_SPARSEMEM_VMEMMAP=n. With CONFIG_SPARSEMEM_VMEMMAP=n, __nr_to_section is called in static_key_enable() and returns NULL, resulting in a NULL dereference because mem_section is initialized only later in sparse_init(). This is also a problem for powerpc because early_param() functions are invoked earlier than jump_label_init(), also resulting in static_key_enable() failures. These failures cause the warning "static key 'xxx' used before call to jump_label_init()". Thus, early_param is too early for csd_lock_wait to run static_branch_enable(), so changes it to __setup to fix these. Fixes: 8d0968cc6b8f ("locking/csd_lock: Add boot parameter for controlling CSD lock debugging") Cc: stable@vger.kernel.org Reported-by: Chen jingwen Signed-off-by: Chen Zhongjin Signed-off-by: Paul E. McKenney --- kernel/smp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/smp.c b/kernel/smp.c index dd215f439426..650810a6f29b 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -174,9 +174,9 @@ static int __init csdlock_debug(char *str) if (val) static_branch_enable(&csdlock_debug_enabled); - return 0; + return 1; } -early_param("csdlock_debug", csdlock_debug); +__setup("csdlock_debug=", csdlock_debug); static DEFINE_PER_CPU(call_single_data_t *, cur_csd); static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); From 800d6acf40e5ce676b53a1259fb4e93e56279367 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 27 May 2022 17:07:45 +0200 Subject: [PATCH 55/75] rcu: tiny: Record kvfree_call_rcu() call stack for KASAN When running KASAN with Tiny RCU (e.g. under ARCH=um, where a working KASAN patch is now available), we don't get any information on the original kfree_rcu() (or similar) caller when a problem is reported, as Tiny RCU doesn't record this. Add the recording, which required pulling kvfree_call_rcu() out of line for the KASAN case since the recording function (kasan_record_aux_stack_noalloc) is neither exported, nor can we include kasan.h into rcutiny.h. 
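Recovering the original allocation from the rcu_head is what makes the recorded stack useful. For readers who have not seen the kfree_rcu() encoding before, the small userspace sketch below mimics it: the "callback" is really the offset of the rcu_head within the enclosing object, so subtracting that offset from the head pointer yields the pointer that kasan_record_aux_stack_noalloc() wants to see. The struct names here are invented purely for illustration.

#include <stdio.h>
#include <stddef.h>

/* Stand-ins for the kernel types, just enough for the pointer math. */
struct rcu_head { void *next; void *func; };

struct foo {
	int payload[4];
	struct rcu_head rh;	/* embedded somewhere in the object */
};

int main(void)
{
	struct foo f;
	/* kfree_rcu(&f, rh) passes offsetof(struct foo, rh) in place of a
	 * real callback function; emulate that encoding by hand. */
	unsigned long offset = offsetof(struct foo, rh);
	struct rcu_head *head = &f.rh;

	/* Same recovery the Tiny RCU change performs before recording
	 * the auxiliary stack for KASAN. */
	void *ptr = (char *)head - offset;

	printf("object at %p, recovered %p\n", (void *)&f, ptr);
	return 0;
}

As for the cost of the real change: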
without KASAN, the patch has no size impact (ARCH=um kernel): text data bss dec hex filename 6151515 4423154 33148520 43723189 29b29b5 linux 6151515 4423154 33148520 43723189 29b29b5 linux + patch with KASAN, the impact on my build was minimal: text data bss dec hex filename 13915539 7388050 33282304 54585893 340ea25 linux 13911266 7392114 33282304 54585684 340e954 linux + patch -4273 +4064 +-0 -209 Acked-by: Dmitry Vyukov Signed-off-by: Johannes Berg Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 11 ++++++++++- kernel/rcu/tiny.c | 14 ++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 5fed476f977f..d84e13f2c384 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -38,7 +38,7 @@ static inline void synchronize_rcu_expedited(void) */ extern void kvfree(const void *addr); -static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +static inline void __kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) { if (head) { call_rcu(head, func); @@ -51,6 +51,15 @@ static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) kvfree((void *) func); } +#ifdef CONFIG_KASAN_GENERIC +void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func); +#else +static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +{ + __kvfree_call_rcu(head, func); +} +#endif + void rcu_qs(void); static inline void rcu_softirq_qs(void) diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 340b3f8b090d..58ff3721d975 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -217,6 +217,20 @@ bool poll_state_synchronize_rcu(unsigned long oldstate) } EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); +#ifdef CONFIG_KASAN_GENERIC +void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +{ + if (head) { + void *ptr = (void *) head - (unsigned long) func; + + kasan_record_aux_stack_noalloc(ptr); + } + + __kvfree_call_rcu(head, func); +} +EXPORT_SYMBOL_GPL(kvfree_call_rcu); +#endif + void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); From e2bb1288a381e9239aaf606ae8c1e20ea71c20bd Mon Sep 17 00:00:00 2001 From: Zqiang Date: Thu, 26 May 2022 09:55:12 +0800 Subject: [PATCH 56/75] rcu: Cleanup RCU urgency state for offline CPU When a CPU is slow to provide a quiescent state for a given grace period, RCU takes steps to encourage that CPU to get with the quiescent-state program in a more timely fashion. These steps include these flags in the rcu_data structure: 1. ->rcu_urgent_qs, which causes the scheduling-clock interrupt to request an otherwise pointless context switch from the scheduler. 2. ->rcu_need_heavy_qs, which causes both cond_resched() and RCU's context-switch hook to do an immediate momentary quiscent state. 3. ->rcu_need_heavy_qs, which causes the scheduler-clock tick to be enabled even on nohz_full CPUs with only one runnable task. These flags are of course cleared once the corresponding CPU has passed through a quiescent state. Unless that quiescent state is the CPU going offline, which means that when the CPU comes back online, it will needlessly consume additional CPU time and incur additional latency, which constitutes a minor but very real performance bug. This commit therefore adds the call to rcu_disable_urgency_upon_qs() that clears these flags to the CPU-hotplug offlining code path. Signed-off-by: Zqiang Signed-off-by: Paul E. 
McKenney Reviewed-by: Neeraj Upadhyay --- kernel/rcu/tree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7c62af481981..9d9a2a657823 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4446,6 +4446,7 @@ void rcu_report_dead(unsigned int cpu) rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags); if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */ /* Report quiescent state -before- changing ->qsmaskinitnext! */ + rcu_disable_urgency_upon_qs(rdp); rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); raw_spin_lock_irqsave_rcu_node(rnp, flags); } From 82d26c36cc68e781400eb4e541f943008208f2d6 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 2 Jun 2022 10:06:43 +0200 Subject: [PATCH 57/75] rcu/kvfree: Remove useless monitor_todo flag monitor_todo is not needed as the work struct already tracks if work is pending. Just use that to know if work is pending using schedule_delayed_work() helper. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney Reviewed-by: Neeraj Upadhyay --- kernel/rcu/tree.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9d9a2a657823..6f4656aed962 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3216,7 +3216,6 @@ struct kfree_rcu_cpu_work { * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES - * @monitor_todo: Tracks whether a @monitor_work delayed work is pending * @initialized: The @rcu_work fields have been initialized * @count: Number of objects for which GP not started * @bkvcache: @@ -3241,7 +3240,6 @@ struct kfree_rcu_cpu { struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; raw_spinlock_t lock; struct delayed_work monitor_work; - bool monitor_todo; bool initialized; int count; @@ -3421,6 +3419,18 @@ static void kfree_rcu_work(struct work_struct *work) } } +static bool +need_offload_krc(struct kfree_rcu_cpu *krcp) +{ + int i; + + for (i = 0; i < FREE_N_CHANNELS; i++) + if (krcp->bkvhead[i]) + return true; + + return !!krcp->head; +} + /* * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. */ @@ -3477,9 +3487,7 @@ static void kfree_rcu_monitor(struct work_struct *work) // of the channels that is still busy we should rearm the // work to repeat an attempt. Because previous batches are // still in progress. - if (!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head) - krcp->monitor_todo = false; - else + if (need_offload_krc(krcp)) schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); raw_spin_unlock_irqrestore(&krcp->lock, flags); @@ -3667,11 +3675,8 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) WRITE_ONCE(krcp->count, krcp->count + 1); // Set timer to drain after KFREE_DRAIN_JIFFIES. 
- if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && - !krcp->monitor_todo) { - krcp->monitor_todo = true; + if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING) schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); - } unlock_return: krc_this_cpu_unlock(krcp, flags); @@ -3746,14 +3751,8 @@ void __init kfree_rcu_scheduler_running(void) struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); raw_spin_lock_irqsave(&krcp->lock, flags); - if ((!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head) || - krcp->monitor_todo) { - raw_spin_unlock_irqrestore(&krcp->lock, flags); - continue; - } - krcp->monitor_todo = true; - schedule_delayed_work_on(cpu, &krcp->monitor_work, - KFREE_DRAIN_JIFFIES); + if (need_offload_krc(krcp)) + schedule_delayed_work_on(cpu, &krcp->monitor_work, KFREE_DRAIN_JIFFIES); raw_spin_unlock_irqrestore(&krcp->lock, flags); } } From 9bdb5b3a8d8ad1c92db309219859fe1c87c95351 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 8 Jun 2022 09:34:10 -0700 Subject: [PATCH 58/75] rcu: Initialize first_gp_fqs at declaration in rcu_gp_fqs() This commit saves a line of code by initializing the rcu_gp_fqs() function's first_gp_fqs local variable in its declaration. Reported-by: Frederic Weisbecker Reported-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6f4656aed962..a23877a773be 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1971,13 +1971,12 @@ static void rcu_gp_fqs(bool first_time) */ static noinline_for_stack void rcu_gp_fqs_loop(void) { - bool first_gp_fqs; + bool first_gp_fqs = true; int gf = 0; unsigned long j; int ret; struct rcu_node *rnp = rcu_get_root(); - first_gp_fqs = true; j = READ_ONCE(jiffies_till_first_fqs); if (rcu_state.cbovld) gf = RCU_GP_FLAG_OVLD; From a03ae49c4785c1bc7b940e38bbdf2e63d79d1470 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 9 Jun 2022 12:43:40 +0530 Subject: [PATCH 59/75] rcu/tree: Add comment to describe GP-done condition in fqs loop Add a comment to explain why !rcu_preempt_blocked_readers_cgp() condition is required on root rnp node, for GP completion check in rcu_gp_fqs_loop(). Reviewed-by: Joel Fernandes (Google) Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a23877a773be..7c2231823e84 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2005,7 +2005,15 @@ static noinline_for_stack void rcu_gp_fqs_loop(void) rcu_gp_torture_wait(); WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS); /* Locking provides needed memory barriers. */ - /* If grace period done, leave loop. */ + /* + * Exit the loop if the root rcu_node structure indicates that the grace period + * has ended, leave the loop. The rcu_preempt_blocked_readers_cgp(rnp) check + * is required only for single-node rcu_node trees because readers blocking + * the current grace period are queued only on leaf rcu_node structures. + * For multi-node trees, checking the root node's ->qsmask suffices, because a + * given root node's ->qsmask bit is cleared only when all CPUs and tasks from + * the corresponding leaf nodes have passed through their quiescent state. 
+ */ if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) break; From 1598f4a4762be0ea6a1bcd229c2c9ff1ebb212bb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 19 Apr 2022 14:23:18 +0200 Subject: [PATCH 60/75] rcu/nocb: Add/del rdp to iterate from rcuog itself NOCB rdp's are part of a group whose list is iterated by the corresponding rdp leader. This list is RCU traversed because an rdp can be either added or deleted concurrently. Upon addition, a new iteration to the list after a synchronization point (a pair of LOCK/UNLOCK ->nocb_gp_lock) is forced to make sure: 1) we didn't miss a new element added in the middle of an iteration 2) we didn't ignore a whole subset of the list due to an element being quickly deleted and then re-added. 3) we prevent from probably other surprises... Although this layout is expected to be safe, it doesn't help anybody to sleep well. Simplify instead the nocb state toggling with moving the list modification from the nocb (de-)offloading workqueue to the rcuog kthreads instead. Whenever the rdp leader is expected to (re-)set the SEGCBLIST_KTHREAD_GP flag of a target rdp, the latter is queued so that the leader handles the flag flip along with adding or deleting the target rdp to the list to iterate. This way the list modification and iteration happen from the same kthread and those operations can't race altogether. As a bonus, the flags for each rdp don't need to be checked locklessly before each iteration, which is one less opportunity to produce nightmares. Signed-off-by: Frederic Weisbecker Cc: Neeraj Upadhyay Cc: Boqun Feng Cc: Uladzislau Rezki Cc: Joel Fernandes Cc: Zqiang Signed-off-by: Paul E. McKenney Reviewed-by: Neeraj Upadhyay --- kernel/rcu/tree.h | 1 + kernel/rcu/tree_nocb.h | 138 +++++++++++++++++++++-------------------- 2 files changed, 71 insertions(+), 68 deletions(-) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 2ccf5845957d..4f8532c33558 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -235,6 +235,7 @@ struct rcu_data { * if rdp_gp. */ struct list_head nocb_entry_rdp; /* rcu_data node in wakeup chain. */ + struct rcu_data *nocb_toggling_rdp; /* rdp queued for (de-)offloading */ /* The following fields are used by CB kthread, hence new cacheline. */ struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp; diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 46694e13398a..dac74952e1d1 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -546,52 +546,43 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, } } -/* - * Check if we ignore this rdp. 
- * - * We check that without holding the nocb lock but - * we make sure not to miss a freshly offloaded rdp - * with the current ordering: - * - * rdp_offload_toggle() nocb_gp_enabled_cb() - * ------------------------- ---------------------------- - * WRITE flags LOCK nocb_gp_lock - * LOCK nocb_gp_lock READ/WRITE nocb_gp_sleep - * READ/WRITE nocb_gp_sleep UNLOCK nocb_gp_lock - * UNLOCK nocb_gp_lock READ flags - */ -static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp) -{ - u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP; - - return rcu_segcblist_test_flags(&rdp->cblist, flags); -} - -static inline bool nocb_gp_update_state_deoffloading(struct rcu_data *rdp, - bool *needwake_state) +static int nocb_gp_toggle_rdp(struct rcu_data *rdp, + bool *wake_state) { struct rcu_segcblist *cblist = &rdp->cblist; + unsigned long flags; + int ret; - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { - rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP); - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) - *needwake_state = true; - } - return false; + rcu_nocb_lock_irqsave(rdp, flags); + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) && + !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { + /* + * Offloading. Set our flag and notify the offload worker. + * We will handle this rdp until it ever gets de-offloaded. + */ + rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP); + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) + *wake_state = true; + ret = 1; + } else if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) && + rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { + /* + * De-offloading. Clear our flag and notify the de-offload worker. + * We will ignore this rdp until it ever gets re-offloaded. + */ + rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) + *wake_state = true; + ret = 0; + } else { + WARN_ON_ONCE(1); + ret = -1; } - /* - * De-offloading. Clear our flag and notify the de-offload worker. - * We will ignore this rdp until it ever gets re-offloaded. - */ - WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); - rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) - *needwake_state = true; - return true; -} + rcu_nocb_unlock_irqrestore(rdp, flags); + return ret; +} /* * No-CBs GP kthreads come here to wait for additional callbacks to show up @@ -609,7 +600,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) bool needwait_gp = false; // This prevents actual uninitialized use. bool needwake; bool needwake_gp; - struct rcu_data *rdp; + struct rcu_data *rdp, *rdp_toggling = NULL; struct rcu_node *rnp; unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning. bool wasempty = false; @@ -634,19 +625,10 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) * is added to the list, so the skipped-over rcu_data structures * won't be ignored for long. 
*/ - list_for_each_entry_rcu(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp, 1) { - bool needwake_state = false; - - if (!nocb_gp_enabled_cb(rdp)) - continue; + list_for_each_entry(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp) { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check")); rcu_nocb_lock_irqsave(rdp, flags); - if (nocb_gp_update_state_deoffloading(rdp, &needwake_state)) { - rcu_nocb_unlock_irqrestore(rdp, flags); - if (needwake_state) - swake_up_one(&rdp->nocb_state_wq); - continue; - } + lockdep_assert_held(&rdp->nocb_lock); bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); if (bypass_ncbs && (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) || @@ -656,8 +638,6 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) { rcu_nocb_unlock_irqrestore(rdp, flags); - if (needwake_state) - swake_up_one(&rdp->nocb_state_wq); continue; /* No callbacks here, try next. */ } if (bypass_ncbs) { @@ -705,8 +685,6 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) } if (needwake_gp) rcu_gp_kthread_wake(); - if (needwake_state) - swake_up_one(&rdp->nocb_state_wq); } my_rdp->nocb_gp_bypass = bypass; @@ -739,15 +717,49 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) !READ_ONCE(my_rdp->nocb_gp_sleep)); trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait")); } + if (!rcu_nocb_poll) { raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); + // (De-)queue an rdp to/from the group if its nocb state is changing + rdp_toggling = my_rdp->nocb_toggling_rdp; + if (rdp_toggling) + my_rdp->nocb_toggling_rdp = NULL; + if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); del_timer(&my_rdp->nocb_timer); } WRITE_ONCE(my_rdp->nocb_gp_sleep, true); raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); + } else { + rdp_toggling = READ_ONCE(my_rdp->nocb_toggling_rdp); + if (rdp_toggling) { + /* + * Paranoid locking to make sure nocb_toggling_rdp is well + * reset *before* we (re)set SEGCBLIST_KTHREAD_GP or we could + * race with another round of nocb toggling for this rdp. + * Nocb locking should prevent from that already but we stick + * to paranoia, especially in rare path. + */ + raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); + my_rdp->nocb_toggling_rdp = NULL; + raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); + } } + + if (rdp_toggling) { + bool wake_state = false; + int ret; + + ret = nocb_gp_toggle_rdp(rdp_toggling, &wake_state); + if (ret == 1) + list_add_tail(&rdp_toggling->nocb_entry_rdp, &my_rdp->nocb_head_rdp); + else if (ret == 0) + list_del(&rdp_toggling->nocb_entry_rdp); + if (wake_state) + swake_up_one(&rdp_toggling->nocb_state_wq); + } + my_rdp->nocb_gp_seq = -1; WARN_ON(signal_pending(current)); } @@ -966,6 +978,8 @@ static int rdp_offload_toggle(struct rcu_data *rdp, swake_up_one(&rdp->nocb_cb_wq); raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); + // Queue this rdp for add/del to/from the list to iterate on rcuog + WRITE_ONCE(rdp_gp->nocb_toggling_rdp, rdp); if (rdp_gp->nocb_gp_sleep) { rdp_gp->nocb_gp_sleep = false; wake_gp = true; @@ -1013,8 +1027,6 @@ static long rcu_nocb_rdp_deoffload(void *arg) swait_event_exclusive(rdp->nocb_state_wq, !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP)); - /* Stop nocb_gp_wait() from iterating over this structure. 
*/ - list_del_rcu(&rdp->nocb_entry_rdp); /* * Lock one last time to acquire latest callback updates from kthreads * so we can later handle callbacks locally without locking. @@ -1079,16 +1091,6 @@ static long rcu_nocb_rdp_offload(void *arg) pr_info("Offloading %d\n", rdp->cpu); - /* - * Cause future nocb_gp_wait() invocations to iterate over - * structure, resetting ->nocb_gp_sleep and waking up the related - * "rcuog". Since nocb_gp_wait() in turn locks ->nocb_gp_lock - * before setting ->nocb_gp_sleep again, we are guaranteed to - * iterate this newly added structure before "rcuog" goes to - * sleep again. - */ - list_add_tail_rcu(&rdp->nocb_entry_rdp, &rdp->nocb_gp_rdp->nocb_head_rdp); - /* * Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING * is set. From 24a57affd242566fef2935f04f062350b8275187 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Tue, 19 Apr 2022 14:23:19 +0200 Subject: [PATCH 61/75] rcu/nocb: Invert rcu_state.barrier_mutex VS hotplug lock locking order In case of failure to spawn either rcuog or rcuo[p] kthreads for a given rdp, rcu_nocb_rdp_deoffload() needs to be called with the hotplug lock and the barrier_mutex held. However, the CPUs write lock is already held while calling rcutree_prepare_cpu(). It is not possible to call rcu_nocb_rdp_deoffload() from there while taking only the barrier_mutex, as this would result in a locking inversion against rcu_nocb_cpu_deoffload(), which holds both locks in the reverse order. Solve this by simply inverting the locking order inside rcu_nocb_cpu_[de]offload(). This will in any case be a prerequisite for toggling NOCB states via cpusets. Signed-off-by: Zqiang Cc: Neeraj Upadhyay Cc: Boqun Feng Cc: Uladzislau Rezki Cc: Joel Fernandes Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Neeraj Upadhyay --- kernel/rcu/tree_nocb.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index dac74952e1d1..f2f2cab6285a 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1055,8 +1055,8 @@ int rcu_nocb_cpu_deoffload(int cpu) struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); int ret = 0; - mutex_lock(&rcu_state.barrier_mutex); cpus_read_lock(); + mutex_lock(&rcu_state.barrier_mutex); if (rcu_rdp_is_offloaded(rdp)) { if (cpu_online(cpu)) { ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp); @@ -1067,8 +1067,8 @@ int rcu_nocb_cpu_deoffload(int cpu) ret = -EINVAL; } } - cpus_read_unlock(); mutex_unlock(&rcu_state.barrier_mutex); + cpus_read_unlock(); return ret; } @@ -1134,8 +1134,8 @@ int rcu_nocb_cpu_offload(int cpu) struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); int ret = 0; - mutex_lock(&rcu_state.barrier_mutex); cpus_read_lock(); + mutex_lock(&rcu_state.barrier_mutex); if (!rcu_rdp_is_offloaded(rdp)) { if (cpu_online(cpu)) { ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp); @@ -1146,8 +1146,8 @@ int rcu_nocb_cpu_offload(int cpu) ret = -EINVAL; } } - cpus_read_unlock(); mutex_unlock(&rcu_state.barrier_mutex); + cpus_read_unlock(); return ret; } From 3a5761dc025da47960755ac64d9fbf1c32e8cd80 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Tue, 19 Apr 2022 14:23:20 +0200 Subject: [PATCH 62/75] rcu/nocb: Fix NOCB kthreads spawn failure with rcu_nocb_rdp_deoffload() direct call If spawning the rcuog or rcuo[p] kthreads fails, the offloaded rdp needs to be explicitly deoffloaded; otherwise the target rdp is still considered offloaded even though nothing actually handles its callbacks.
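In other words, a failed kthread spawn must now fall back to de-offloading the CPU it was meant to serve. The following is a minimal sketch of the resulting error path, condensed from the rcu_spawn_cpu_nocb_kthread() hunk in the tree_nocb.h diff below (the rcuog spawn step and the SCHED_FIFO priority setup are elided); it is illustrative only, not the complete function:

	/* Sketch only: condensed from this patch's rcu_spawn_cpu_nocb_kthread(). */
	static void rcu_spawn_cpu_nocb_kthread(int cpu)
	{
		struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
		struct task_struct *t;

		/* ... spawn the group's rcuog kthread, goto end on failure ... */

		t = kthread_run(rcu_nocb_cb_kthread, rdp, "rcuo%c/%d",
				rcu_state.abbr, cpu);
		if (IS_ERR(t))
			goto end;	/* Spawn failed: undo the offloaded state. */
		WRITE_ONCE(rdp->nocb_cb_kthread, t);
		return;
	end:
		/* Direct call: rdp->cpu is not yet online, so no work_on_cpu(). */
		mutex_lock(&rcu_state.barrier_mutex);
		if (rcu_rdp_is_offloaded(rdp)) {
			rcu_nocb_rdp_deoffload(rdp);
			cpumask_clear_cpu(cpu, rcu_nocb_mask);
		}
		mutex_unlock(&rcu_state.barrier_mutex);
	}

This direct call is also why the WARN_ON_ONCE() in rcu_nocb_rdp_deoffload() below now tolerates running on a CPU other than rdp->cpu, as long as rdp->cpu is not yet online.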
Signed-off-by: Zqiang Cc: Neeraj Upadhyay Cc: Boqun Feng Cc: Uladzislau Rezki Cc: Joel Fernandes Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Neeraj Upadhyay --- kernel/rcu/tree_nocb.h | 80 +++++++++++++++++++++++++++++++++--------- 1 file changed, 64 insertions(+), 16 deletions(-) diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index f2f2cab6285a..4cf9a29bba79 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -986,10 +986,7 @@ static int rdp_offload_toggle(struct rcu_data *rdp, } raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); - if (wake_gp) - wake_up_process(rdp_gp->nocb_gp_kthread); - - return 0; + return wake_gp; } static long rcu_nocb_rdp_deoffload(void *arg) @@ -997,9 +994,15 @@ static long rcu_nocb_rdp_deoffload(void *arg) struct rcu_data *rdp = arg; struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; - int ret; + int wake_gp; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; - WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); + /* + * rcu_nocb_rdp_deoffload() may be called directly if + * rcuog/o[p] spawn failed, because at this time the rdp->cpu + * is not online yet. + */ + WARN_ON_ONCE((rdp->cpu != raw_smp_processor_id()) && cpu_online(rdp->cpu)); pr_info("De-offloading %d\n", rdp->cpu); @@ -1023,10 +1026,41 @@ static long rcu_nocb_rdp_deoffload(void *arg) */ rcu_segcblist_set_flags(cblist, SEGCBLIST_RCU_CORE); invoke_rcu_core(); - ret = rdp_offload_toggle(rdp, false, flags); - swait_event_exclusive(rdp->nocb_state_wq, - !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB | - SEGCBLIST_KTHREAD_GP)); + wake_gp = rdp_offload_toggle(rdp, false, flags); + + mutex_lock(&rdp_gp->nocb_gp_kthread_mutex); + if (rdp_gp->nocb_gp_kthread) { + if (wake_gp) + wake_up_process(rdp_gp->nocb_gp_kthread); + + /* + * If rcuo[p] kthread spawn failed, directly remove SEGCBLIST_KTHREAD_CB. + * Just wait SEGCBLIST_KTHREAD_GP to be cleared by rcuog. + */ + if (!rdp->nocb_cb_kthread) { + rcu_nocb_lock_irqsave(rdp, flags); + rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB); + rcu_nocb_unlock_irqrestore(rdp, flags); + } + + swait_event_exclusive(rdp->nocb_state_wq, + !rcu_segcblist_test_flags(cblist, + SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP)); + } else { + /* + * No kthread to clear the flags for us or remove the rdp from the nocb list + * to iterate. Do it here instead. Locking doesn't look stricly necessary + * but we stick to paranoia in this rare path. + */ + rcu_nocb_lock_irqsave(rdp, flags); + rcu_segcblist_clear_flags(&rdp->cblist, + SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP); + rcu_nocb_unlock_irqrestore(rdp, flags); + + list_del(&rdp->nocb_entry_rdp); + } + mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); + /* * Lock one last time to acquire latest callback updates from kthreads * so we can later handle callbacks locally without locking. 
@@ -1047,7 +1081,7 @@ static long rcu_nocb_rdp_deoffload(void *arg) WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); - return ret; + return 0; } int rcu_nocb_cpu_deoffload(int cpu) @@ -1079,7 +1113,8 @@ static long rcu_nocb_rdp_offload(void *arg) struct rcu_data *rdp = arg; struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; - int ret; + int wake_gp; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); /* @@ -1089,6 +1124,9 @@ static long rcu_nocb_rdp_offload(void *arg) if (!rdp->nocb_gp_rdp) return -EINVAL; + if (WARN_ON_ONCE(!rdp_gp->nocb_gp_kthread)) + return -EINVAL; + pr_info("Offloading %d\n", rdp->cpu); /* @@ -1113,7 +1151,9 @@ static long rcu_nocb_rdp_offload(void *arg) * WRITE flags READ callbacks * rcu_nocb_unlock() rcu_nocb_unlock() */ - ret = rdp_offload_toggle(rdp, true, flags); + wake_gp = rdp_offload_toggle(rdp, true, flags); + if (wake_gp) + wake_up_process(rdp_gp->nocb_gp_kthread); swait_event_exclusive(rdp->nocb_state_wq, rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) && rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); @@ -1126,7 +1166,7 @@ static long rcu_nocb_rdp_offload(void *arg) rcu_segcblist_clear_flags(cblist, SEGCBLIST_RCU_CORE); rcu_nocb_unlock_irqrestore(rdp, flags); - return ret; + return 0; } int rcu_nocb_cpu_offload(int cpu) @@ -1248,7 +1288,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) "rcuog/%d", rdp_gp->cpu); if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) { mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); - return; + goto end; } WRITE_ONCE(rdp_gp->nocb_gp_kthread, t); if (kthread_prio) @@ -1260,12 +1300,20 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) t = kthread_run(rcu_nocb_cb_kthread, rdp, "rcuo%c/%d", rcu_state.abbr, cpu); if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__)) - return; + goto end; if (kthread_prio) sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); WRITE_ONCE(rdp->nocb_cb_kthread, t); WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); + return; +end: + mutex_lock(&rcu_state.barrier_mutex); + if (rcu_rdp_is_offloaded(rdp)) { + rcu_nocb_rdp_deoffload(rdp); + cpumask_clear_cpu(cpu, rcu_nocb_mask); + } + mutex_unlock(&rcu_state.barrier_mutex); } /* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */ From b37a667c62421b34e96b05613457b9fb0ed66ea1 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Fri, 22 Apr 2022 17:52:47 +0000 Subject: [PATCH 63/75] rcu/nocb: Add an option to offload all CPUs on boot Systems built with CONFIG_RCU_NOCB_CPU=y but booted without either the rcu_nocbs= or rcu_nohz_full= kernel-boot parameters will not have callback offloading on any of the CPUs, nor can any of the CPUs be switched to enable callback offloading at runtime. Although this is intentional, it would be nice to have a way to offload all the CPUs without having to make random bootloaders specify either the rcu_nocbs= or the rcu_nohz_full= kernel-boot parameters. This commit therefore provides a new CONFIG_RCU_NOCB_CPU_DEFAULT_ALL Kconfig option that switches the default so as to offload callback processing on all of the CPUs. This default can still be overridden using the rcu_nocbs= and rcu_nohz_full= kernel-boot parameters. Reviewed-by: Kalesh Singh Reviewed-by: Uladzislau Rezki (In v4.1, fixed issues with CONFIG maze reported by kernel test robot). 
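Concretely, the only decision that changes is how rcu_init_nohz() picks the initial set of offloaded CPUs. The following is a condensed sketch of that logic, assembled from the tree_nocb.h diff below (mask allocation and the nonexistent-CPU check are elided); it is illustrative only, not the full function:

	void __init rcu_init_nohz(void)
	{
		bool need_rcu_nocb_mask = false;
		bool offload_all = false;

	#if defined(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL)
		if (!rcu_state.nocb_is_setup) {	/* No rcu_nocbs= was given. */
			need_rcu_nocb_mask = true;
			offload_all = true;
		}
	#endif

	#if defined(CONFIG_NO_HZ_FULL)
		if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask)) {
			need_rcu_nocb_mask = true;
			offload_all = false;	/* NO_HZ_FULL has its own mask. */
		}
	#endif

		/* ... allocate rcu_nocb_mask and fold in tick_nohz_full_mask ... */

		if (offload_all)
			cpumask_setall(rcu_nocb_mask);
		/* ... */
	}

As the kernel-parameters.txt hunks below note, the rcu_nocbs= and nohz_full= boot parameters continue to take precedence over this new Kconfig default.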
Reported-by: kernel test robot Signed-off-by: Joel Fernandes Signed-off-by: Paul E. McKenney Reviewed-by: Neeraj Upadhyay --- Documentation/admin-guide/kernel-parameters.txt | 6 ++++++ kernel/rcu/Kconfig | 13 +++++++++++++ kernel/rcu/tree_nocb.h | 17 +++++++++++++++-- 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2522b11e593f..34605c275294 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3659,6 +3659,9 @@ just as if they had also been called out in the rcu_nocbs= boot parameter. + Note that this argument takes precedence over + the CONFIG_RCU_NOCB_CPU_DEFAULT_ALL option. + noiotrap [SH] Disables trapped I/O port accesses. noirqdebug [X86-32] Disables the code which attempts to detect and @@ -4557,6 +4560,9 @@ no-callback mode from boot but the mode may be toggled at runtime via cpusets. + Note that this argument takes precedence over + the CONFIG_RCU_NOCB_CPU_DEFAULT_ALL option. + rcu_nocb_poll [KNL] Rather than requiring that offloaded CPUs (specified by rcu_nocbs= above) explicitly diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 1c630e573548..27aab870ae4c 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -262,6 +262,19 @@ config RCU_NOCB_CPU Say Y here if you need reduced OS jitter, despite added overhead. Say N here if you are unsure. +config RCU_NOCB_CPU_DEFAULT_ALL + bool "Offload RCU callback processing from all CPUs by default" + depends on RCU_NOCB_CPU + default n + help + Use this option to offload callback processing from all CPUs + by default, in the absence of the rcu_nocbs or nohz_full boot + parameter. This also avoids the need to use any boot parameters + to achieve the effect of offloading all CPUs on boot. + + Say Y here if you want offload all CPUs by default on boot. + Say N here if you are unsure. + config TASKS_TRACE_RCU_READ_MB bool "Tasks Trace RCU readers use memory barriers in user and idle" depends on RCU_EXPERT && TASKS_TRACE_RCU diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 4cf9a29bba79..60cc92cc6655 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1197,11 +1197,21 @@ void __init rcu_init_nohz(void) { int cpu; bool need_rcu_nocb_mask = false; + bool offload_all = false; struct rcu_data *rdp; -#if defined(CONFIG_NO_HZ_FULL) - if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask)) +#if defined(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL) + if (!rcu_state.nocb_is_setup) { need_rcu_nocb_mask = true; + offload_all = true; + } +#endif /* #if defined(CONFIG_RCU_NOCB_CPU_DEFAULT_ALL) */ + +#if defined(CONFIG_NO_HZ_FULL) + if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask)) { + need_rcu_nocb_mask = true; + offload_all = false; /* NO_HZ_FULL has its own mask. 
*/ + } #endif /* #if defined(CONFIG_NO_HZ_FULL) */ if (need_rcu_nocb_mask) { @@ -1222,6 +1232,9 @@ void __init rcu_init_nohz(void) cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); #endif /* #if defined(CONFIG_NO_HZ_FULL) */ + if (offload_all) + cpumask_setall(rcu_nocb_mask); + if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n"); cpumask_and(rcu_nocb_mask, cpu_possible_mask, From 5103850654fdc651f0a7076ac753b958f018bb85 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Fri, 29 Apr 2022 20:42:22 +0800 Subject: [PATCH 64/75] rcu: Add nocb_cb_kthread check to rcu_is_callbacks_kthread() Callbacks are invoked in RCU kthreads when callbacks are offloaded (rcu_nocbs boot parameter) or when RCU's softirq handler has been offloaded to rcuc kthreads (use_softirq==0). The current code handles the use_softirq case but not the rcu_nocbs case. This commit adds support for the rcu_nocbs case. Reported-by: kernel test robot Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney Reviewed-by: Neeraj Upadhyay --- kernel/rcu/tree.c | 4 ++-- kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 33 +++++++++++++++++++-------------- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c25ba442044a..74455671e6cf 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2530,7 +2530,7 @@ static void rcu_do_batch(struct rcu_data *rdp) trace_rcu_batch_end(rcu_state.name, 0, !rcu_segcblist_empty(&rdp->cblist), need_resched(), is_idle_task(current), - rcu_is_callbacks_kthread()); + rcu_is_callbacks_kthread(rdp)); return; } @@ -2608,7 +2608,7 @@ static void rcu_do_batch(struct rcu_data *rdp) rcu_nocb_lock_irqsave(rdp, flags); rdp->n_cbs_invoked += count; trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(), - is_idle_task(current), rcu_is_callbacks_kthread()); + is_idle_task(current), rcu_is_callbacks_kthread(rdp)); /* Update counts and requeue any remaining callbacks. */ rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 4f8532c33558..649ad4f0129b 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -426,7 +426,7 @@ static void rcu_flavor_sched_clock_irq(int user); static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); -static bool rcu_is_callbacks_kthread(void); +static bool rcu_is_callbacks_kthread(struct rcu_data *rdp); static void rcu_cpu_kthread_setup(unsigned int cpu); static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp); static bool rcu_preempt_has_tasks(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c8ba0fe17267..0483e1338c41 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1012,6 +1012,25 @@ static void rcu_cpu_kthread_setup(unsigned int cpu) WRITE_ONCE(rdp->rcuc_activity, jiffies); } +static bool rcu_is_callbacks_nocb_kthread(struct rcu_data *rdp) +{ +#ifdef CONFIG_RCU_NOCB_CPU + return rdp->nocb_cb_kthread == current; +#else + return false; +#endif +} + +/* + * Is the current CPU running the RCU-callbacks kthread? + * Caller must have preemption disabled.
+ */ +static bool rcu_is_callbacks_kthread(struct rcu_data *rdp) +{ + return rdp->rcu_cpu_kthread_task == current || + rcu_is_callbacks_nocb_kthread(rdp); +} + #ifdef CONFIG_RCU_BOOST /* @@ -1151,15 +1170,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) } } -/* - * Is the current CPU running the RCU-callbacks kthread? - * Caller must have preemption disabled. - */ -static bool rcu_is_callbacks_kthread(void) -{ - return __this_cpu_read(rcu_data.rcu_cpu_kthread_task) == current; -} - #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) /* @@ -1242,11 +1252,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } -static bool rcu_is_callbacks_kthread(void) -{ - return false; -} - static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) { } From 8f489b4da5278fc6e5fc8f0029ae7fb51c060215 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 11 May 2022 10:57:03 +0200 Subject: [PATCH 65/75] rcu/nocb: Add option to opt rcuo kthreads out of RT priority This commit introduces a RCU_NOCB_CPU_CB_BOOST Kconfig option that prevents rcuo kthreads from running at real-time priority, even in kernels built with RCU_BOOST. This capability is important to devices needing low-latency (as in a few milliseconds) response from expedited RCU grace periods, but which are not running a classic real-time workload. On such devices, permitting the rcuo kthreads to run at real-time priority results in unacceptable latencies imposed on the application tasks, which run as SCHED_OTHER. See for example the following trace output: <...>-60 [006] d..1 2979.028717: rcu_batch_start: rcu_preempt CBs=34619 bl=270 If that rcuop kthread were permitted to run at real-time SCHED_FIFO priority, it would monopolize its CPU for hundreds of milliseconds while invoking those 34619 RCU callback functions, which would cause an unacceptably long latency spike for many application stacks on Android platforms. However, some existing real-time workloads require that callback invocation run at SCHED_FIFO priority, for example, those running on systems with heavy SCHED_OTHER background loads. (It is the real-time system's administrator's responsibility to make sure that important real-time tasks run at a higher priority than do RCU's kthreads.) Therefore, this new RCU_NOCB_CPU_CB_BOOST Kconfig option defaults to "y" on kernels built with PREEMPT_RT and defaults to "n" otherwise. The effect is to preserve current behavior for real-time systems, but for other systems to allow expedited RCU grace periods to run with real-time priority while continuing to invoke RCU callbacks as SCHED_OTHER. As you would expect, this RCU_NOCB_CPU_CB_BOOST Kconfig option has no effect except on CPUs with offloaded RCU callbacks. Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney Acked-by: Joel Fernandes (Google) Reviewed-by: Neeraj Upadhyay --- kernel/rcu/Kconfig | 16 ++++++++++++++++ kernel/rcu/tree.c | 6 +++++- kernel/rcu/tree_nocb.h | 3 ++- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 27aab870ae4c..c05ca52cdf64 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -275,6 +275,22 @@ config RCU_NOCB_CPU_DEFAULT_ALL Say Y here if you want offload all CPUs by default on boot. Say N here if you are unsure. 
+config RCU_NOCB_CPU_CB_BOOST + bool "Offload RCU callback from real-time kthread" + depends on RCU_NOCB_CPU && RCU_BOOST + default y if PREEMPT_RT + help + Use this option to invoke offloaded callbacks as SCHED_FIFO + to avoid starvation by heavy SCHED_OTHER background load. + Of course, running as SCHED_FIFO during callback floods will + cause the rcuo[ps] kthreads to monopolize the CPU for hundreds + of milliseconds or more. Therefore, when enabling this option, + it is your responsibility to ensure that latency-sensitive + tasks either run with higher priority or run on some other CPU. + + Say Y here if you want to set RT priority for offloading kthreads. + Say N here if you are building a !PREEMPT_RT kernel and are unsure. + config TASKS_TRACE_RCU_READ_MB bool "Tasks Trace RCU readers use memory barriers in user and idle" depends on RCU_EXPERT && TASKS_TRACE_RCU diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 74455671e6cf..3b9f45ebb499 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -154,7 +154,11 @@ static void sync_sched_exp_online_cleanup(int cpu); static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp); static bool rcu_rdp_is_offloaded(struct rcu_data *rdp); -/* rcuc/rcub/rcuop kthread realtime priority */ +/* + * rcuc/rcub/rcuop kthread realtime priority. The "rcuop" + * real-time priority(enabling/disabling) is controlled by + * the extra CONFIG_RCU_NOCB_CPU_CB_BOOST configuration. + */ static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; module_param(kthread_prio, int, 0444); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 60cc92cc6655..fa8e4f82e60c 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1315,8 +1315,9 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__)) goto end; - if (kthread_prio) + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio) sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + WRITE_ONCE(rdp->nocb_cb_kthread, t); WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); return; From 0578e14c945b1739e15c0e993280151fa5b99ca2 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Sat, 11 Jun 2022 19:00:44 +0800 Subject: [PATCH 66/75] rcu/nocb: Avoid polling when my_rdp->nocb_head_rdp list is empty Currently, if the 'rcu_nocb_poll' kernel boot parameter is enabled, all rcuog kthreads enter polling mode. However, if all of a given group of rcuo kthreads correspond to CPUs that have been de-offloaded, the corresponding rcuog kthread will nonetheless still wake up periodically, unnecessarily consuming power and perturbing workloads. Fortunately, this situation is easily detected by the fact that the rcuog kthread's CPU's rcu_data structure's ->nocb_head_rdp list is empty. This commit saves power and avoids unnecessarily perturbing workloads by putting an rcuog kthread to sleep during any time period when all of its rcuo kthreads' CPUs are de-offloaded. Co-developed-by: Frederic Weisbecker Signed-off-by: Frederic Weisbecker Signed-off-by: Zqiang Signed-off-by: Paul E. 
McKenney Reviewed-by: Neeraj Upadhyay --- kernel/rcu/tree_nocb.h | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index fa8e4f82e60c..a8f574d8850d 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -584,6 +584,14 @@ static int nocb_gp_toggle_rdp(struct rcu_data *rdp, return ret; } +static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu) +{ + trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep")); + swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq, + !READ_ONCE(my_rdp->nocb_gp_sleep)); + trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep")); +} + /* * No-CBs GP kthreads come here to wait for additional callbacks to show up * or for grace periods to end. @@ -701,13 +709,19 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) /* Polling, so trace if first poll in the series. */ if (gotcbs) trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll")); - schedule_timeout_idle(1); + if (list_empty(&my_rdp->nocb_head_rdp)) { + raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); + if (!my_rdp->nocb_toggling_rdp) + WRITE_ONCE(my_rdp->nocb_gp_sleep, true); + raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); + /* Wait for any offloading rdp */ + nocb_gp_sleep(my_rdp, cpu); + } else { + schedule_timeout_idle(1); + } } else if (!needwait_gp) { /* Wait for callbacks to appear. */ - trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep")); - swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq, - !READ_ONCE(my_rdp->nocb_gp_sleep)); - trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep")); + nocb_gp_sleep(my_rdp, cpu); } else { rnp = my_rdp->mynode; trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait")); From bf95b2bc3e42f11f4d7a5e8a98376c2b4a2aa82f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 13 Apr 2022 17:46:15 -0700 Subject: [PATCH 67/75] rcu: Switch polled grace-period APIs to ->gp_seq_polled This commit switches the existing polled grace-period APIs to use a new ->gp_seq_polled counter in the rcu_state structure. An additional ->gp_seq_polled_snap counter in that same structure allows the normal grace period kthread to interact properly with the !SMP !PREEMPT fastpath through synchronize_rcu(). The first of the two to note the end of a given grace period will make knowledge of this transition available to the polled API. This commit is in preparation for polled expedited grace periods. [ paulmck: Fix use of rcu_state.gp_seq_polled to start normal grace period. ] Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/ Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing Cc: Brian Foster Cc: Dave Chinner Cc: Al Viro Cc: Ian Kent Co-developed-by: Boqun Feng Signed-off-by: Boqun Feng Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 98 +++++++++++++++++++++++++++++++++++++++++++++-- kernel/rcu/tree.h | 2 + 2 files changed, 96 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 46cfceea8784..b40a5a19ddd2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1775,6 +1775,78 @@ static void rcu_strict_gp_boundary(void *unused) invoke_rcu_core(); } +// Has rcu_init() been invoked? This is used (for example) to determine +// whether spinlocks may be acquired safely. +static bool rcu_init_invoked(void) +{ + return !!rcu_state.n_online_cpus; +} + +// Make the polled API aware of the beginning of a grace period. 
+static void rcu_poll_gp_seq_start(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_init_invoked()) + raw_lockdep_assert_held_rcu_node(rnp); + + // If RCU was idle, note beginning of GP. + if (!rcu_seq_state(rcu_state.gp_seq_polled)) + rcu_seq_start(&rcu_state.gp_seq_polled); + + // Either way, record current state. + *snap = rcu_state.gp_seq_polled; +} + +// Make the polled API aware of the end of a grace period. +static void rcu_poll_gp_seq_end(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_init_invoked()) + raw_lockdep_assert_held_rcu_node(rnp); + + // If the previously noted GP is still in effect, record the + // end of that GP. Either way, zero counter to avoid counter-wrap + // problems. + if (*snap && *snap == rcu_state.gp_seq_polled) { + rcu_seq_end(&rcu_state.gp_seq_polled); + rcu_state.gp_seq_polled_snap = 0; + } else { + *snap = 0; + } +} + +// Make the polled API aware of the beginning of a grace period, but +// where caller does not hold the root rcu_node structure's lock. +static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_init_invoked()) { + lockdep_assert_irqs_enabled(); + raw_spin_lock_irq_rcu_node(rnp); + } + rcu_poll_gp_seq_start(snap); + if (rcu_init_invoked()) + raw_spin_unlock_irq_rcu_node(rnp); +} + +// Make the polled API aware of the end of a grace period, but where +// caller does not hold the root rcu_node structure's lock. +static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_init_invoked()) { + lockdep_assert_irqs_enabled(); + raw_spin_lock_irq_rcu_node(rnp); + } + rcu_poll_gp_seq_end(snap); + if (rcu_init_invoked()) + raw_spin_unlock_irq_rcu_node(rnp); +} + /* * Initialize a new grace period. Return false if no grace period required. */ @@ -1810,6 +1882,7 @@ static noinline_for_stack bool rcu_gp_init(void) rcu_seq_start(&rcu_state.gp_seq); ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); + rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap); raw_spin_unlock_irq_rcu_node(rnp); /* @@ -2069,6 +2142,7 @@ static noinline void rcu_gp_cleanup(void) * safe for us to drop the lock in order to mark the grace * period as completed in all of the rcu_node structures. */ + rcu_poll_gp_seq_end(&rcu_state.gp_seq_polled_snap); raw_spin_unlock_irq_rcu_node(rnp); /* @@ -3837,8 +3911,18 @@ void synchronize_rcu(void) lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu() in RCU read-side critical section"); - if (rcu_blocking_is_gp()) + if (rcu_blocking_is_gp()) { + // Note well that this code runs with !PREEMPT && !SMP. + // In addition, all code that advances grace periods runs + // at process level. Therefore, this GP overlaps with other + // GPs only by being fully nested within them, which allows + // reuse of ->gp_seq_polled_snap. + rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap); + rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap); + if (rcu_init_invoked()) + cond_resched_tasks_rcu_qs(); return; // Context allows vacuous grace periods. + } if (rcu_gp_is_expedited()) synchronize_rcu_expedited(); else @@ -3860,7 +3944,7 @@ unsigned long get_state_synchronize_rcu(void) * before the load from ->gp_seq. 
*/ smp_mb(); /* ^^^ */ - return rcu_seq_snap(&rcu_state.gp_seq); + return rcu_seq_snap(&rcu_state.gp_seq_polled); } EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); @@ -3889,7 +3973,13 @@ unsigned long start_poll_synchronize_rcu(void) rdp = this_cpu_ptr(&rcu_data); rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); // irqs already disabled. - needwake = rcu_start_this_gp(rnp, rdp, gp_seq); + // Note it is possible for a grace period to have elapsed between + // the above call to get_state_synchronize_rcu() and the below call + // to rcu_seq_snap. This is OK, the worst that happens is that we + // get a grace period that no one needed. These accesses are ordered + // by smp_mb(), and we are accessing them in the opposite order + // from which they are updated at grace-period start, as required. + needwake = rcu_start_this_gp(rnp, rdp, rcu_seq_snap(&rcu_state.gp_seq)); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) rcu_gp_kthread_wake(); @@ -3925,7 +4015,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); bool poll_state_synchronize_rcu(unsigned long oldstate) { if (oldstate == RCU_GET_STATE_COMPLETED || - rcu_seq_done_exact(&rcu_state.gp_seq, oldstate)) { + rcu_seq_done_exact(&rcu_state.gp_seq_polled, oldstate)) { smp_mb(); /* Ensure GP ends before subsequent accesses. */ return true; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 2ccf5845957d..9c853033f159 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -323,6 +323,8 @@ struct rcu_state { short gp_state; /* GP kthread sleep state. */ unsigned long gp_wake_time; /* Last GP kthread wake. */ unsigned long gp_wake_seq; /* ->gp_seq at ^^^. */ + unsigned long gp_seq_polled; /* GP seq for polled API. */ + unsigned long gp_seq_polled_snap; /* ->gp_seq_polled at normal GP start. */ /* End of fields guarded by root rcu_node's lock. */ From dd04140531b5d38b77ad9ff7b18117654be5bf5c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 Apr 2022 06:56:35 -0700 Subject: [PATCH 68/75] rcu: Make polled grace-period API account for expedited grace periods Currently, this code could splat: oldstate = get_state_synchronize_rcu(); synchronize_rcu_expedited(); WARN_ON_ONCE(!poll_state_synchronize_rcu(oldstate)); This situation is counter-intuitive and user-unfriendly. After all, there really was a perfectly valid full grace period right after the call to get_state_synchronize_rcu(), so why shouldn't poll_state_synchronize_rcu() know about it? This commit therefore makes the polled grace-period API aware of expedited grace periods in addition to the normal grace periods that it is already aware of. With this change, the above code is guaranteed not to splat. Please note that the above code can still splat due to counter wrap on the one hand and situations involving partially overlapping normal/expedited grace periods on the other. On 64-bit systems, the second is of course much more likely than the first. It is possible to modify this approach to prevent overlapping grace periods from causing splats, but only at the expense of greatly increasing the probability of counter wrap, as in within milliseconds on 32-bit systems and within minutes on 64-bit systems. This commit is in preparation for polled expedited grace periods. Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/ Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing Cc: Brian Foster Cc: Dave Chinner Cc: Al Viro Cc: Ian Kent Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 9 +++++---- kernel/rcu/tree.h | 1 + kernel/rcu/tree_exp.h | 16 ++++++++++++++-- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b40a5a19ddd2..1505b02b4e53 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1812,6 +1812,7 @@ static void rcu_poll_gp_seq_end(unsigned long *snap) if (*snap && *snap == rcu_state.gp_seq_polled) { rcu_seq_end(&rcu_state.gp_seq_polled); rcu_state.gp_seq_polled_snap = 0; + rcu_state.gp_seq_polled_exp_snap = 0; } else { *snap = 0; } @@ -3913,10 +3914,10 @@ void synchronize_rcu(void) "Illegal synchronize_rcu() in RCU read-side critical section"); if (rcu_blocking_is_gp()) { // Note well that this code runs with !PREEMPT && !SMP. - // In addition, all code that advances grace periods runs - // at process level. Therefore, this GP overlaps with other - // GPs only by being fully nested within them, which allows - // reuse of ->gp_seq_polled_snap. + // In addition, all code that advances grace periods runs at + // process level. Therefore, this normal GP overlaps with + // other normal GPs only by being fully nested within them, + // which allows reuse of ->gp_seq_polled_snap. rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap); rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap); if (rcu_init_invoked()) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9c853033f159..5634e76106c4 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -325,6 +325,7 @@ struct rcu_state { unsigned long gp_wake_seq; /* ->gp_seq at ^^^. */ unsigned long gp_seq_polled; /* GP seq for polled API. */ unsigned long gp_seq_polled_snap; /* ->gp_seq_polled at normal GP start. */ + unsigned long gp_seq_polled_exp_snap; /* ->gp_seq_polled at expedited GP start. */ /* End of fields guarded by root rcu_node's lock. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 0f70f62039a9..e0258066b881 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -18,6 +18,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp); static void rcu_exp_gp_seq_start(void) { rcu_seq_start(&rcu_state.expedited_sequence); + rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap); } /* @@ -34,6 +35,7 @@ static __maybe_unused unsigned long rcu_exp_gp_seq_endval(void) */ static void rcu_exp_gp_seq_end(void) { + rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap); rcu_seq_end(&rcu_state.expedited_sequence); smp_mb(); /* Ensure that consecutive grace periods serialize. */ } @@ -913,8 +915,18 @@ void synchronize_rcu_expedited(void) "Illegal synchronize_rcu_expedited() in RCU read-side critical section"); /* Is the state is such that the call is a grace period? */ - if (rcu_blocking_is_gp()) - return; + if (rcu_blocking_is_gp()) { + // Note well that this code runs with !PREEMPT && !SMP. + // In addition, all code that advances grace periods runs + // at process level. Therefore, this expedited GP overlaps + // with other expedited GPs only by being fully nested within + // them, which allows reuse of ->gp_seq_polled_exp_snap. + rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap); + rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap); + if (rcu_init_invoked()) + cond_resched(); + return; // Context allows vacuous grace periods. + } /* If expedited grace periods are prohibited, fall back to normal. */ if (rcu_gp_is_normal()) { From 7f4535366f8f77b3ddbc79d4ba82df966c5c2aab Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 14 Apr 2022 11:49:58 -0700 Subject: [PATCH 69/75] rcu: Make Tiny RCU grace periods visible to polled APIs This commit makes the Tiny RCU implementation of synchronize_rcu() increment the rcu_ctrlblk.gp_seq counter, thus making both synchronize_rcu() and synchronize_rcu_expedited() visible to get_state_synchronize_rcu() and friends. Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/ Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing Cc: Brian Foster Cc: Dave Chinner Cc: Al Viro Cc: Ian Kent Signed-off-by: Paul E. McKenney --- kernel/rcu/tiny.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index dbee6bea6726..60071817d939 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -139,8 +139,10 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused /* * Wait for a grace period to elapse. But it is illegal to invoke * synchronize_rcu() from within an RCU read-side critical section. - * Therefore, any legal call to synchronize_rcu() is a quiescent - * state, and so on a UP system, synchronize_rcu() need do nothing. + * Therefore, any legal call to synchronize_rcu() is a quiescent state, + * and so on a UP system, synchronize_rcu() need do nothing, other than + * let the polled APIs know that another grace period elapsed. + * * (But Lai Jiangshan points out the benefits of doing might_sleep() * to reduce latency.) * @@ -152,6 +154,7 @@ void synchronize_rcu(void) lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu() in RCU read-side critical section"); + WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2); } EXPORT_SYMBOL_GPL(synchronize_rcu); From e4333cb20f047d96485a9416a93ae4b2ec3b27dd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 Apr 2022 09:09:11 -0700 Subject: [PATCH 70/75] rcutorture: Verify that polled GP API sees synchronous grace periods This commit causes rcu_torture_writer() to use WARN_ON_ONCE() to check that the cookie returned by the current RCU flavor's ->get_gp_state() function (get_state_synchronize_rcu() for vanilla RCU) causes that flavor's ->poll_gp_state function (poll_state_synchronize_rcu() for vanilla RCU) to unconditionally return true. Note that a pair calls to synchronous grace-period-wait functions are used. This is necessary to account for partially overlapping normal and expedited grace periods aligning in just the wrong way with polled API invocations, which can cause those polled API invocations to ignore one or the other of those partially overlapping grace periods. It is unlikely that this sort of ignored grace period will be a problem in production, but rcutorture can make it happen quite within a few tens of seconds. This commit is in preparation for polled expedited grace periods. [ paulmck: Apply feedback from Frederic Weisbecker. ] Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/ Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing Cc: Brian Foster Cc: Dave Chinner Cc: Al Viro Cc: Ian Kent Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcutorture.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 4ceec9f4169c..d2edc763bb92 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1269,7 +1269,12 @@ rcu_torture_writer(void *arg) break; case RTWS_EXP_SYNC: rcu_torture_writer_state = RTWS_EXP_SYNC; + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + cookie = cur_ops->get_gp_state(); cur_ops->exp_sync(); + cur_ops->exp_sync(); + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie)); rcu_torture_pipe_update(old_rp); break; case RTWS_COND_GET: @@ -1291,7 +1296,12 @@ rcu_torture_writer(void *arg) break; case RTWS_SYNC: rcu_torture_writer_state = RTWS_SYNC; + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + cookie = cur_ops->get_gp_state(); cur_ops->sync(); + cur_ops->sync(); + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie)); rcu_torture_pipe_update(old_rp); break; default: From d96c52fe4907c68adc5e61a0bef7aec0933223d5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 15 Apr 2022 10:55:42 -0700 Subject: [PATCH 71/75] rcu: Add polled expedited grace-period primitives This commit adds expedited grace-period functionality to RCU's polled grace-period API, adding start_poll_synchronize_rcu_expedited() and cond_synchronize_rcu_expedited(), which are similar to the existing start_poll_synchronize_rcu() and cond_synchronize_rcu() functions, respectively. Note that although start_poll_synchronize_rcu_expedited() can be invoked very early, the resulting expedited grace periods are not guaranteed to start until after workqueues are fully initialized. On the other hand, both synchronize_rcu() and synchronize_rcu_expedited() can also be invoked very early, and the resulting grace periods will be taken into account as they occur. [ paulmck: Apply feedback from Neeraj Upadhyay. ] Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/ Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing Cc: Brian Foster Cc: Dave Chinner Cc: Al Viro Cc: Ian Kent Signed-off-by: Paul E. 
McKenney --- include/linux/rcutiny.h | 10 +++++ include/linux/rcutree.h | 2 + kernel/rcu/tree.c | 17 ++++++--- kernel/rcu/tree.h | 7 ++++ kernel/rcu/tree_exp.h | 85 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 116 insertions(+), 5 deletions(-) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 5fed476f977f..ab7e20dfb07b 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -23,6 +23,16 @@ static inline void cond_synchronize_rcu(unsigned long oldstate) might_sleep(); } +static inline unsigned long start_poll_synchronize_rcu_expedited(void) +{ + return start_poll_synchronize_rcu(); +} + +static inline void cond_synchronize_rcu_expedited(unsigned long oldstate) +{ + cond_synchronize_rcu(oldstate); +} + extern void rcu_barrier(void); static inline void synchronize_rcu_expedited(void) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 9c6cfb742504..20dbaa9a3882 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -40,6 +40,8 @@ bool rcu_eqs_special_set(int cpu); void rcu_momentary_dyntick_idle(void); void kfree_rcu_scheduler_running(void); bool rcu_gp_might_be_stalled(void); +unsigned long start_poll_synchronize_rcu_expedited(void); +void cond_synchronize_rcu_expedited(unsigned long oldstate); unsigned long get_state_synchronize_rcu(void); unsigned long start_poll_synchronize_rcu(void); bool poll_state_synchronize_rcu(unsigned long oldstate); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1505b02b4e53..6cf5b51622cd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4027,20 +4027,20 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); /** * cond_synchronize_rcu - Conditionally wait for an RCU grace period * - * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu() + * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited() * * If a full RCU grace period has elapsed since the earlier call to * get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return. * Otherwise, invoke synchronize_rcu() to wait for a full grace period. * - * Yes, this function does not take counter wrap into account. But - * counter wrap is harmless. If the counter wraps, we have waited for + * Yes, this function does not take counter wrap into account. + * But counter wrap is harmless. If the counter wraps, we have waited for * more than 2 billion grace periods (and way more on a 64-bit system!), - * so waiting for one additional grace period should be just fine. + * so waiting for a couple of additional grace periods should be just fine. * * This function provides the same memory-ordering guarantees that * would be provided by a synchronize_rcu() that was invoked at the call - * to the function that provided @oldstate, and that returned at the end + * to the function that provided @oldstate and that returned at the end * of this function. */ void cond_synchronize_rcu(unsigned long oldstate) @@ -4793,6 +4793,9 @@ static void __init rcu_init_one(void) init_waitqueue_head(&rnp->exp_wq[3]); spin_lock_init(&rnp->exp_lock); mutex_init(&rnp->boost_kthread_mutex); + raw_spin_lock_init(&rnp->exp_poll_lock); + rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED; + INIT_WORK(&rnp->exp_poll_wq, sync_rcu_do_polled_gp); } } @@ -5018,6 +5021,10 @@ void __init rcu_init(void) qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark; else qovld_calc = qovld; + + // Kick-start any polled grace periods that started early. 
+ if (!(per_cpu_ptr(&rcu_data, cpu)->mynode->exp_seq_poll_rq & 0x1)) + (void)start_poll_synchronize_rcu_expedited(); } #include "tree_stall.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 5634e76106c4..fb77deca5f5c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -133,6 +133,10 @@ struct rcu_node { wait_queue_head_t exp_wq[4]; struct rcu_exp_work rew; bool exp_need_flush; /* Need to flush workitem? */ + raw_spinlock_t exp_poll_lock; + /* Lock and data for polled expedited grace periods. */ + unsigned long exp_seq_poll_rq; + struct work_struct exp_poll_wq; } ____cacheline_internodealigned_in_smp; /* @@ -484,3 +488,6 @@ static void rcu_iw_handler(struct irq_work *iwp); static void check_cpu_stall(struct rcu_data *rdp); static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, const unsigned long gpssdelay); + +/* Forward declarations for tree_exp.h. */ +static void sync_rcu_do_polled_gp(struct work_struct *wp); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index e0258066b881..571b0a700cce 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -962,3 +962,88 @@ void synchronize_rcu_expedited(void) synchronize_rcu_expedited_destroy_work(&rew); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +/* + * Ensure that start_poll_synchronize_rcu_expedited() has the expedited + * RCU grace periods that it needs. + */ +static void sync_rcu_do_polled_gp(struct work_struct *wp) +{ + unsigned long flags; + struct rcu_node *rnp = container_of(wp, struct rcu_node, exp_poll_wq); + unsigned long s; + + raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags); + s = rnp->exp_seq_poll_rq; + rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED; + raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags); + if (s == RCU_GET_STATE_COMPLETED) + return; + while (!poll_state_synchronize_rcu(s)) + synchronize_rcu_expedited(); + raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags); + s = rnp->exp_seq_poll_rq; + if (poll_state_synchronize_rcu(s)) + rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED; + raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags); +} + +/** + * start_poll_synchronize_rcu_expedited - Snapshot current RCU state and start expedited grace period + * + * Returns a cookie to pass to a call to cond_synchronize_rcu(), + * cond_synchronize_rcu_expedited(), or poll_state_synchronize_rcu(), + * allowing them to determine whether or not any sort of grace period has + * elapsed in the meantime. If the needed expedited grace period is not + * already slated to start, initiates that grace period. 
+ */ +unsigned long start_poll_synchronize_rcu_expedited(void) +{ + unsigned long flags; + struct rcu_data *rdp; + struct rcu_node *rnp; + unsigned long s; + + s = get_state_synchronize_rcu(); + rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id()); + rnp = rdp->mynode; + if (rcu_init_invoked()) + raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags); + if (!poll_state_synchronize_rcu(s)) { + rnp->exp_seq_poll_rq = s; + if (rcu_init_invoked()) + queue_work(rcu_gp_wq, &rnp->exp_poll_wq); + } + if (rcu_init_invoked()) + raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags); + + return s; +} +EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited); + +/** + * cond_synchronize_rcu_expedited - Conditionally wait for an expedited RCU grace period + * + * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited() + * + * If any type of full RCU grace period has elapsed since the earlier + * call to get_state_synchronize_rcu(), start_poll_synchronize_rcu(), + * or start_poll_synchronize_rcu_expedited(), just return. Otherwise, + * invoke synchronize_rcu_expedited() to wait for a full grace period. + * + * Yes, this function does not take counter wrap into account. + * But counter wrap is harmless. If the counter wraps, we have waited for + * more than 2 billion grace periods (and way more on a 64-bit system!), + * so waiting for a couple of additional grace periods should be just fine. + * + * This function provides the same memory-ordering guarantees that + * would be provided by a synchronize_rcu() that was invoked at the call + * to the function that provided @oldstate and that returned at the end + * of this function. + */ +void cond_synchronize_rcu_expedited(unsigned long oldstate) +{ + if (!poll_state_synchronize_rcu(oldstate)) + synchronize_rcu_expedited(); +} +EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited); From 11d62f0f43a35d7c62aabc06a99cd4691a47ccb4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Feb 2022 07:01:20 -0800 Subject: [PATCH 72/75] rcutorture: Test polled expedited grace-period primitives This commit adds tests of start_poll_synchronize_rcu_expedited() and poll_state_synchronize_rcu_expedited(). Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/ Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing Cc: Brian Foster Cc: Dave Chinner Cc: Al Viro Cc: Ian Kent Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcutorture.c | 87 +++++++++++++++++++++++++++++++++++------ 1 file changed, 74 insertions(+), 13 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d2edc763bb92..0788ef2a4491 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -86,10 +86,12 @@ torture_param(int, fwd_progress_holdoff, 60, torture_param(bool, fwd_progress_need_resched, 1, "Hide cond_resched() behind need_resched()"); torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives"); +torture_param(bool, gp_cond_exp, false, "Use conditional/async expedited GP wait primitives"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives"); torture_param(bool, gp_poll, false, "Use polling GP wait primitives"); +torture_param(bool, gp_poll_exp, false, "Use polling expedited GP wait primitives"); torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers"); @@ -209,12 +211,16 @@ static int rcu_torture_writer_state; #define RTWS_DEF_FREE 3 #define RTWS_EXP_SYNC 4 #define RTWS_COND_GET 5 -#define RTWS_COND_SYNC 6 -#define RTWS_POLL_GET 7 -#define RTWS_POLL_WAIT 8 -#define RTWS_SYNC 9 -#define RTWS_STUTTER 10 -#define RTWS_STOPPING 11 +#define RTWS_COND_GET_EXP 6 +#define RTWS_COND_SYNC 7 +#define RTWS_COND_SYNC_EXP 8 +#define RTWS_POLL_GET 9 +#define RTWS_POLL_GET_EXP 10 +#define RTWS_POLL_WAIT 11 +#define RTWS_POLL_WAIT_EXP 12 +#define RTWS_SYNC 13 +#define RTWS_STUTTER 14 +#define RTWS_STOPPING 15 static const char * const rcu_torture_writer_state_names[] = { "RTWS_FIXED_DELAY", "RTWS_DELAY", @@ -222,9 +228,13 @@ static const char * const rcu_torture_writer_state_names[] = { "RTWS_DEF_FREE", "RTWS_EXP_SYNC", "RTWS_COND_GET", + "RTWS_COND_GET_EXP", "RTWS_COND_SYNC", + "RTWS_COND_SYNC_EXP", "RTWS_POLL_GET", + "RTWS_POLL_GET_EXP", "RTWS_POLL_WAIT", + "RTWS_POLL_WAIT_EXP", "RTWS_SYNC", "RTWS_STUTTER", "RTWS_STOPPING", @@ -337,6 +347,10 @@ struct rcu_torture_ops { void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*exp_sync)(void); + unsigned long (*get_gp_state_exp)(void); + unsigned long (*start_gp_poll_exp)(void); + bool (*poll_gp_state_exp)(unsigned long oldstate); + void (*cond_sync_exp)(unsigned long oldstate); unsigned long (*get_gp_state)(void); unsigned long (*get_gp_completed)(void); unsigned long (*start_gp_poll)(void); @@ -509,6 +523,10 @@ static struct rcu_torture_ops rcu_ops = { .start_gp_poll = start_poll_synchronize_rcu, .poll_gp_state = poll_state_synchronize_rcu, .cond_sync = cond_synchronize_rcu, + .get_gp_state_exp = get_state_synchronize_rcu, + .start_gp_poll_exp = start_poll_synchronize_rcu_expedited, + .poll_gp_state_exp = poll_state_synchronize_rcu, + .cond_sync_exp = cond_synchronize_rcu_expedited, .call = call_rcu, .cb_barrier = rcu_barrier, .fqs = rcu_force_quiescent_state, @@ -1138,9 +1156,8 @@ rcu_torture_fqs(void *arg) return 0; } -// Used by writers to randomly choose from the available grace-period -// primitives. The only purpose of the initialization is to size the array. -static int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, RTWS_COND_GET, RTWS_POLL_GET, RTWS_SYNC }; +// Used by writers to randomly choose from the available grace-period primitives. 
+static int synctype[ARRAY_SIZE(rcu_torture_writer_state_names)] = { }; static int nsynctypes; /* @@ -1148,18 +1165,27 @@ static int nsynctypes; */ static void rcu_torture_write_types(void) { - bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; - bool gp_poll1 = gp_poll, gp_sync1 = gp_sync; + bool gp_cond1 = gp_cond, gp_cond_exp1 = gp_cond_exp, gp_exp1 = gp_exp; + bool gp_poll_exp1 = gp_poll_exp, gp_normal1 = gp_normal, gp_poll1 = gp_poll; + bool gp_sync1 = gp_sync; /* Initialize synctype[] array. If none set, take default. */ - if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_poll1 && !gp_sync1) - gp_cond1 = gp_exp1 = gp_normal1 = gp_poll1 = gp_sync1 = true; + if (!gp_cond1 && !gp_cond_exp1 && !gp_exp1 && !gp_poll_exp && + !gp_normal1 && !gp_poll1 && !gp_sync1) + gp_cond1 = gp_cond_exp1 = gp_exp1 = gp_poll_exp1 = + gp_normal1 = gp_poll1 = gp_sync1 = true; if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) { synctype[nsynctypes++] = RTWS_COND_GET; pr_info("%s: Testing conditional GPs.\n", __func__); } else if (gp_cond && (!cur_ops->get_gp_state || !cur_ops->cond_sync)) { pr_alert("%s: gp_cond without primitives.\n", __func__); } + if (gp_cond_exp1 && cur_ops->get_gp_state_exp && cur_ops->cond_sync_exp) { + synctype[nsynctypes++] = RTWS_COND_GET_EXP; + pr_info("%s: Testing conditional expedited GPs.\n", __func__); + } else if (gp_cond_exp && (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp)) { + pr_alert("%s: gp_cond_exp without primitives.\n", __func__); + } if (gp_exp1 && cur_ops->exp_sync) { synctype[nsynctypes++] = RTWS_EXP_SYNC; pr_info("%s: Testing expedited GPs.\n", __func__); @@ -1178,6 +1204,12 @@ static void rcu_torture_write_types(void) } else if (gp_poll && (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)) { pr_alert("%s: gp_poll without primitives.\n", __func__); } + if (gp_poll_exp1 && cur_ops->start_gp_poll_exp && cur_ops->poll_gp_state_exp) { + synctype[nsynctypes++] = RTWS_POLL_GET_EXP; + pr_info("%s: Testing polling expedited GPs.\n", __func__); + } else if (gp_poll_exp && (!cur_ops->start_gp_poll_exp || !cur_ops->poll_gp_state_exp)) { + pr_alert("%s: gp_poll_exp without primitives.\n", __func__); + } if (gp_sync1 && cur_ops->sync) { synctype[nsynctypes++] = RTWS_SYNC; pr_info("%s: Testing normal GPs.\n", __func__); @@ -1285,6 +1317,14 @@ rcu_torture_writer(void *arg) cur_ops->cond_sync(gp_snap); rcu_torture_pipe_update(old_rp); break; + case RTWS_COND_GET_EXP: + rcu_torture_writer_state = RTWS_COND_GET_EXP; + gp_snap = cur_ops->get_gp_state_exp(); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + rcu_torture_writer_state = RTWS_COND_SYNC_EXP; + cur_ops->cond_sync_exp(gp_snap); + rcu_torture_pipe_update(old_rp); + break; case RTWS_POLL_GET: rcu_torture_writer_state = RTWS_POLL_GET; gp_snap = cur_ops->start_gp_poll(); @@ -1294,6 +1334,15 @@ rcu_torture_writer(void *arg) &rand); rcu_torture_pipe_update(old_rp); break; + case RTWS_POLL_GET_EXP: + rcu_torture_writer_state = RTWS_POLL_GET_EXP; + gp_snap = cur_ops->start_gp_poll_exp(); + rcu_torture_writer_state = RTWS_POLL_WAIT_EXP; + while (!cur_ops->poll_gp_state_exp(gp_snap)) + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); + rcu_torture_pipe_update(old_rp); + break; case RTWS_SYNC: rcu_torture_writer_state = RTWS_SYNC; if (cur_ops->get_gp_state && cur_ops->poll_gp_state) @@ -1400,6 +1449,11 @@ rcu_torture_fakewriter(void *arg) torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); cur_ops->cond_sync(gp_snap); break; + case RTWS_COND_GET_EXP: + 
+				gp_snap = cur_ops->get_gp_state_exp();
+				torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+				cur_ops->cond_sync_exp(gp_snap);
+				break;
			case RTWS_POLL_GET:
				gp_snap = cur_ops->start_gp_poll();
				while (!cur_ops->poll_gp_state(gp_snap)) {
@@ -1407,6 +1461,13 @@
					&rand);
				}
				break;
+			case RTWS_POLL_GET_EXP:
+				gp_snap = cur_ops->start_gp_poll_exp();
+				while (!cur_ops->poll_gp_state_exp(gp_snap)) {
+					torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+								  &rand);
+				}
+				break;
			case RTWS_SYNC:
				cur_ops->sync();
				break;
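
The two writer-side patterns exercised by the new RTWS_COND_GET_EXP and
RTWS_POLL_GET_EXP cases above correspond to the following update-side usage
of the polled expedited grace-period primitives.  This is a minimal
illustrative sketch rather than code from this series: struct my_item,
my_unpublish(), and the two my_reclaim_*() functions are hypothetical
stand-ins, while the grace-period calls are the ones wired into rcu_ops
above.

#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/slab.h>

struct my_item;				/* Hypothetical item type, defined elsewhere. */
void my_unpublish(struct my_item *p);	/* Hypothetical: unlinks p from the RCU-protected structure. */

/* Conditional pattern, as in the RTWS_COND_GET_EXP case. */
static void my_reclaim_cond_exp(struct my_item *p)
{
	unsigned long cookie;

	my_unpublish(p);			/* New readers can no longer find p. */
	cookie = get_state_synchronize_rcu();	/* Snapshot the grace-period sequence. */
	/* ... do other useful work ... */
	cond_synchronize_rcu_expedited(cookie);	/* Expedited wait, but only if still needed. */
	kfree(p);				/* Pre-existing readers are now done. */
}

/* Polling pattern, as in the RTWS_POLL_GET_EXP case. */
static void my_reclaim_poll_exp(struct my_item *p)
{
	unsigned long cookie;

	my_unpublish(p);
	cookie = start_poll_synchronize_rcu_expedited();	/* Kick off an expedited GP. */
	while (!poll_state_synchronize_rcu(cookie))		/* Has that GP completed yet? */
		schedule_timeout_uninterruptible(1);		/* Or do other work and re-poll. */
	kfree(p);
}

The conditional form blocks only when the snapshotted grace period has not
yet elapsed, whereas the polling form never waits inside the RCU API itself,
which is why rcu_torture_writer() re-polls from a timed loop.
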
From ef4f9d9b9230fcf4fca9801f03c31d99ed06a716 Mon Sep 17 00:00:00 2001
From: Zqiang
Date: Fri, 22 Apr 2022 21:15:18 +0800
Subject: [PATCH 73/75] rcu: Put panic_on_rcu_stall() after expedited RCU CPU stall warnings

When a normal RCU CPU stall warning is encountered while the
panic_on_rcu_stall sysctl is set, the system panics only after the stall
warning is printed.  But when an expedited RCU CPU stall warning is
encountered while that same sysctl is set, the system panics first, and
thus never prints the stall warning.

This commit therefore brings the expedited stall warning into line with
the normal stall warning by printing first and panicking afterwards.

Signed-off-by: Zqiang
Signed-off-by: Paul E. McKenney
---
 kernel/rcu/tree_exp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 571b0a700cce..f05a15b11fa0 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -623,7 +623,6 @@ static void synchronize_rcu_expedited_wait(void)
			return;
		if (rcu_stall_is_suppressed())
			continue;
-		panic_on_rcu_stall();
		trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall"));
		pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
		       rcu_state.name);
@@ -671,6 +670,7 @@ static void synchronize_rcu_expedited_wait(void)
			}
		}
		jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3;
+		panic_on_rcu_stall();
	}
 }

From 82e445697d6a14d6b7462c13c613ebdd96468818 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Mon, 9 May 2022 09:49:05 -0700
Subject: [PATCH 74/75] rcu: Diagnose extended sync_rcu_do_polled_gp() loops

This commit dumps out state as a debugging aid when the
sync_rcu_do_polled_gp() function loops more times than expected.

Signed-off-by: Paul E. McKenney
---
 kernel/rcu/tree_exp.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index f05a15b11fa0..4c7037b50703 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -970,6 +970,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
 static void sync_rcu_do_polled_gp(struct work_struct *wp)
 {
	unsigned long flags;
+	int i = 0;
	struct rcu_node *rnp = container_of(wp, struct rcu_node, exp_poll_wq);
	unsigned long s;

@@ -979,8 +980,12 @@ static void sync_rcu_do_polled_gp(struct work_struct *wp)
	raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags);
	if (s == RCU_GET_STATE_COMPLETED)
		return;
-	while (!poll_state_synchronize_rcu(s))
+	while (!poll_state_synchronize_rcu(s)) {
		synchronize_rcu_expedited();
+		if (i == 10 || i == 20)
+			pr_info("%s: i = %d s = %lx gp_seq_polled = %lx\n", __func__, i, s, READ_ONCE(rcu_state.gp_seq_polled));
+		i++;
+	}
	raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags);
	s = rnp->exp_seq_poll_rq;
	if (poll_state_synchronize_rcu(s))

From 28787e04fb67963673cbe6f77fb27137eba42718 Mon Sep 17 00:00:00 2001
From: Zqiang
Date: Wed, 18 May 2022 19:43:10 +0800
Subject: [PATCH 75/75] rcu: Add irqs-disabled indicator to expedited RCU CPU stall warnings

If a CPU has interrupts disabled continuously starting before the
beginning of a given expedited RCU grace period, that CPU will not
execute that grace period's IPI handler.  This will in turn mean that
the ->cpu_no_qs.b.exp field in that CPU's rcu_data structure will
continue to contain the boolean value false.

Knowing whether or not a CPU has had interrupts disabled can be helpful
when debugging an expedited RCU CPU stall warning, so this commit adds
a "D" indicator to expedited RCU CPU stall warnings that signifies that
the corresponding CPU has had interrupts disabled throughout.

This capability was tested as follows:

runqemu kvm slirp nographic qemuparams="-m 4096 -smp 4"
bootparams="isolcpus=2,3 nohz_full=2,3 rcu_nocbs=2,3 rcutree.dump_tree=1
rcutorture.stall_cpu_holdoff=30 rcutorture.stall_cpu=40
rcutorture.stall_cpu_irqsoff=1 rcutorture.stall_cpu_block=0
rcutorture.stall_no_softlockup=1" -d

The rcu_torture_stall() function ran on CPU 1, which displays the "D"
as expected given the rcutorture.stall_cpu_irqsoff=1 module parameter:

............
rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 1-...D } 26467 jiffies s: 13317 root: 0x1/.
rcu: blocking rcu_node structures (internal RCU debug): l=1:0-1:0x2/.
Task dump for CPU 1:
task:rcu_torture_sta state:R  running task
stack:    0 pid:   76 ppid:     2 flags:0x00004008

Signed-off-by: Zqiang
Signed-off-by: Paul E. McKenney
---
 kernel/rcu/tree_exp.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 4c7037b50703..f092c7f18a5f 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -637,10 +637,11 @@ static void synchronize_rcu_expedited_wait(void)
				continue;
			ndetected++;
			rdp = per_cpu_ptr(&rcu_data, cpu);
-			pr_cont(" %d-%c%c%c", cpu,
+			pr_cont(" %d-%c%c%c%c", cpu,
				"O."[!!cpu_online(cpu)],
				"o."[!!(rdp->grpmask & rnp->expmaskinit)],
-				"N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
+				"N."[!!(rdp->grpmask & rnp->expmaskinitnext)],
+				"D."[!!(rdp->cpu_no_qs.b.exp)]);
		}
	}
	pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",