[linux] 26/29: sched/rt: Avoid ABI change in 4.9.66.
debian-kernel at lists.debian.org
Tue Jan 23 17:14:08 UTC 2018
This is an automated email from the git hooks/post-receive script.
corsac pushed a commit to branch stretch
in repository linux.
commit a4856155cee17e595db82b57c81a892fc758d855
Author: Yves-Alexis Perez <corsac at corsac.net>
Date: Tue Jan 23 10:33:41 2018 +0100
sched/rt: Avoid ABI change in 4.9.66.
---
debian/changelog | 1 +
...d-rt-Simplify-the-IPI-based-RT-balancing-.patch | 398 +++++++++++++++++++++
debian/patches/series | 1 +
3 files changed, 400 insertions(+)
diff --git a/debian/changelog b/debian/changelog
index c1597a6..44856ae 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -797,6 +797,7 @@ linux (4.9.77-1) UNRELEASED; urgency=medium
- bugfix/all/bluetooth-prevent-stack-info-leak-from-the-efs-element.patch
* bpf: avoid ABI change in 4.9.77.
* Ignore ABI change for cpu_tlbstate (symbol not exported _GPL anymore)
+ * sched/rt: Avoid ABI change in 4.9.66.
-- Ben Hutchings <ben at decadent.org.uk> Thu, 28 Dec 2017 02:16:23 +0000
diff --git a/debian/patches/debian/revert-sched-rt-Simplify-the-IPI-based-RT-balancing-.patch b/debian/patches/debian/revert-sched-rt-Simplify-the-IPI-based-RT-balancing-.patch
new file mode 100644
index 0000000..275a42d
--- /dev/null
+++ b/debian/patches/debian/revert-sched-rt-Simplify-the-IPI-based-RT-balancing-.patch
@@ -0,0 +1,398 @@
+From 3a7cd37c72cf0cd3f7ae82237ce71ee2c88df147 Mon Sep 17 00:00:00 2001
+From: Yves-Alexis Perez <corsac at debian.org>
+Date: Tue, 23 Jan 2018 08:23:44 +0100
+Subject: [PATCH] Revert "sched/rt: Simplify the IPI based RT balancing logic"
+
+This reverts commit 1c37ff78298a6b6063649123356a312e1cce12ca, which is
+commit 4bdced5c9a2922521e325896a7bbbf0132c94e56 upstream. That commit
+removes several fields from struct rt_rq, which is used in struct
+sched_rt_entity and, in turn, in struct task_struct. This amounts to a
+large ABI change for an enhancement fix, so revert it for now.
+
+---
+ kernel/sched/core.c | 6 --
+ kernel/sched/rt.c | 235 ++++++++++++++++++++++++++-------------------------
+ kernel/sched/sched.h | 24 ++----
+ 3 files changed, 127 insertions(+), 138 deletions(-)
+
+diff --git a/kernel/sched/core.c b/kernel/sched/core.c
+index e5066955cc3a..d748d4e5455d 100644
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -5877,12 +5877,6 @@ static int init_rootdomain(struct root_domain *rd)
+ if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+ goto free_dlo_mask;
+
+-#ifdef HAVE_RT_PUSH_IPI
+- rd->rto_cpu = -1;
+- raw_spin_lock_init(&rd->rto_lock);
+- init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
+-#endif
+-
+ init_dl_bw(&rd->dl_bw);
+ if (cpudl_init(&rd->cpudl) != 0)
+ goto free_dlo_mask;
+diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
+index 7a360d6f6798..34b1133cbac3 100644
+--- a/kernel/sched/rt.c
++++ b/kernel/sched/rt.c
+@@ -72,6 +72,10 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+ raw_spin_unlock(&rt_b->rt_runtime_lock);
+ }
+
++#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI)
++static void push_irq_work_func(struct irq_work *work);
++#endif
++
+ void init_rt_rq(struct rt_rq *rt_rq)
+ {
+ struct rt_prio_array *array;
+@@ -91,6 +95,13 @@ void init_rt_rq(struct rt_rq *rt_rq)
+ rt_rq->rt_nr_migratory = 0;
+ rt_rq->overloaded = 0;
+ plist_head_init(&rt_rq->pushable_tasks);
++
++#ifdef HAVE_RT_PUSH_IPI
++ rt_rq->push_flags = 0;
++ rt_rq->push_cpu = nr_cpu_ids;
++ raw_spin_lock_init(&rt_rq->push_lock);
++ init_irq_work(&rt_rq->push_work, push_irq_work_func);
++#endif
+ #endif /* CONFIG_SMP */
+ /* We start is dequeued state, because no RT tasks are queued */
+ rt_rq->rt_queued = 0;
+@@ -1853,166 +1864,160 @@ static void push_rt_tasks(struct rq *rq)
+ }
+
+ #ifdef HAVE_RT_PUSH_IPI
+-
+ /*
+- * When a high priority task schedules out from a CPU and a lower priority
+- * task is scheduled in, a check is made to see if there's any RT tasks
+- * on other CPUs that are waiting to run because a higher priority RT task
+- * is currently running on its CPU. In this case, the CPU with multiple RT
+- * tasks queued on it (overloaded) needs to be notified that a CPU has opened
+- * up that may be able to run one of its non-running queued RT tasks.
+- *
+- * All CPUs with overloaded RT tasks need to be notified as there is currently
+- * no way to know which of these CPUs have the highest priority task waiting
+- * to run. Instead of trying to take a spinlock on each of these CPUs,
+- * which has shown to cause large latency when done on machines with many
+- * CPUs, sending an IPI to the CPUs to have them push off the overloaded
+- * RT tasks waiting to run.
+- *
+- * Just sending an IPI to each of the CPUs is also an issue, as on large
+- * count CPU machines, this can cause an IPI storm on a CPU, especially
+- * if its the only CPU with multiple RT tasks queued, and a large number
+- * of CPUs scheduling a lower priority task at the same time.
+- *
+- * Each root domain has its own irq work function that can iterate over
+- * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
+- * tassk must be checked if there's one or many CPUs that are lowering
+- * their priority, there's a single irq work iterator that will try to
+- * push off RT tasks that are waiting to run.
+- *
+- * When a CPU schedules a lower priority task, it will kick off the
+- * irq work iterator that will jump to each CPU with overloaded RT tasks.
+- * As it only takes the first CPU that schedules a lower priority task
+- * to start the process, the rto_start variable is incremented and if
+- * the atomic result is one, then that CPU will try to take the rto_lock.
+- * This prevents high contention on the lock as the process handles all
+- * CPUs scheduling lower priority tasks.
+- *
+- * All CPUs that are scheduling a lower priority task will increment the
+- * rt_loop_next variable. This will make sure that the irq work iterator
+- * checks all RT overloaded CPUs whenever a CPU schedules a new lower
+- * priority task, even if the iterator is in the middle of a scan. Incrementing
+- * the rt_loop_next will cause the iterator to perform another scan.
++ * The search for the next cpu always starts at rq->cpu and ends
++ * when we reach rq->cpu again. It will never return rq->cpu.
++ * This returns the next cpu to check, or nr_cpu_ids if the loop
++ * is complete.
+ *
++ * rq->rt.push_cpu holds the last cpu returned by this function,
++ * or if this is the first instance, it must hold rq->cpu.
+ */
+ static int rto_next_cpu(struct rq *rq)
+ {
+- struct root_domain *rd = rq->rd;
+- int next;
++ int prev_cpu = rq->rt.push_cpu;
+ int cpu;
+
++ cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
++
+ /*
+- * When starting the IPI RT pushing, the rto_cpu is set to -1,
+- * rt_next_cpu() will simply return the first CPU found in
+- * the rto_mask.
+- *
+- * If rto_next_cpu() is called with rto_cpu is a valid cpu, it
+- * will return the next CPU found in the rto_mask.
+- *
+- * If there are no more CPUs left in the rto_mask, then a check is made
+- * against rto_loop and rto_loop_next. rto_loop is only updated with
+- * the rto_lock held, but any CPU may increment the rto_loop_next
+- * without any locking.
++ * If the previous cpu is less than the rq's CPU, then it already
++ * passed the end of the mask, and has started from the beginning.
++ * We end if the next CPU is greater or equal to rq's CPU.
+ */
+- for (;;) {
+-
+- /* When rto_cpu is -1 this acts like cpumask_first() */
+- cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
+-
+- rd->rto_cpu = cpu;
+-
+- if (cpu < nr_cpu_ids)
+- return cpu;
+-
+- rd->rto_cpu = -1;
++ if (prev_cpu < rq->cpu) {
++ if (cpu >= rq->cpu)
++ return nr_cpu_ids;
+
++ } else if (cpu >= nr_cpu_ids) {
+ /*
+- * ACQUIRE ensures we see the @rto_mask changes
+- * made prior to the @next value observed.
+- *
+- * Matches WMB in rt_set_overload().
++ * We passed the end of the mask, start at the beginning.
++ * If the result is greater or equal to the rq's CPU, then
++ * the loop is finished.
+ */
+- next = atomic_read_acquire(&rd->rto_loop_next);
+-
+- if (rd->rto_loop == next)
+- break;
+-
+- rd->rto_loop = next;
++ cpu = cpumask_first(rq->rd->rto_mask);
++ if (cpu >= rq->cpu)
++ return nr_cpu_ids;
+ }
++ rq->rt.push_cpu = cpu;
+
+- return -1;
++ /* Return cpu to let the caller know if the loop is finished or not */
++ return cpu;
+ }
+
+-static inline bool rto_start_trylock(atomic_t *v)
++static int find_next_push_cpu(struct rq *rq)
+ {
+- return !atomic_cmpxchg_acquire(v, 0, 1);
+-}
++ struct rq *next_rq;
++ int cpu;
+
+-static inline void rto_start_unlock(atomic_t *v)
+-{
+- atomic_set_release(v, 0);
++ while (1) {
++ cpu = rto_next_cpu(rq);
++ if (cpu >= nr_cpu_ids)
++ break;
++ next_rq = cpu_rq(cpu);
++
++ /* Make sure the next rq can push to this rq */
++ if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
++ break;
++ }
++
++ return cpu;
+ }
+
++#define RT_PUSH_IPI_EXECUTING 1
++#define RT_PUSH_IPI_RESTART 2
++
+ static void tell_cpu_to_push(struct rq *rq)
+ {
+- int cpu = -1;
+-
+- /* Keep the loop going if the IPI is currently active */
+- atomic_inc(&rq->rd->rto_loop_next);
+-
+- /* Only one CPU can initiate a loop at a time */
+- if (!rto_start_trylock(&rq->rd->rto_loop_start))
+- return;
++ int cpu;
+
+- raw_spin_lock(&rq->rd->rto_lock);
++ if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
++ raw_spin_lock(&rq->rt.push_lock);
++ /* Make sure it's still executing */
++ if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
++ /*
++ * Tell the IPI to restart the loop as things have
++ * changed since it started.
++ */
++ rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
++ raw_spin_unlock(&rq->rt.push_lock);
++ return;
++ }
++ raw_spin_unlock(&rq->rt.push_lock);
++ }
+
+- /*
+- * The rto_cpu is updated under the lock, if it has a valid cpu
+- * then the IPI is still running and will continue due to the
+- * update to loop_next, and nothing needs to be done here.
+- * Otherwise it is finishing up and an ipi needs to be sent.
+- */
+- if (rq->rd->rto_cpu < 0)
+- cpu = rto_next_cpu(rq);
++ /* When here, there's no IPI going around */
+
+- raw_spin_unlock(&rq->rd->rto_lock);
++ rq->rt.push_cpu = rq->cpu;
++ cpu = find_next_push_cpu(rq);
++ if (cpu >= nr_cpu_ids)
++ return;
+
+- rto_start_unlock(&rq->rd->rto_loop_start);
++ rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
+
+- if (cpu >= 0)
+- irq_work_queue_on(&rq->rd->rto_push_work, cpu);
++ irq_work_queue_on(&rq->rt.push_work, cpu);
+ }
+
+ /* Called from hardirq context */
+-void rto_push_irq_work_func(struct irq_work *work)
++static void try_to_push_tasks(void *arg)
+ {
+- struct rq *rq;
++ struct rt_rq *rt_rq = arg;
++ struct rq *rq, *src_rq;
++ int this_cpu;
+ int cpu;
+
+- rq = this_rq();
++ this_cpu = rt_rq->push_cpu;
+
+- /*
+- * We do not need to grab the lock to check for has_pushable_tasks.
+- * When it gets updated, a check is made if a push is possible.
+- */
++ /* Paranoid check */
++ BUG_ON(this_cpu != smp_processor_id());
++
++ rq = cpu_rq(this_cpu);
++ src_rq = rq_of_rt_rq(rt_rq);
++
++again:
+ if (has_pushable_tasks(rq)) {
+ raw_spin_lock(&rq->lock);
+- push_rt_tasks(rq);
++ push_rt_task(rq);
+ raw_spin_unlock(&rq->lock);
+ }
+
+- raw_spin_lock(&rq->rd->rto_lock);
+-
+ /* Pass the IPI to the next rt overloaded queue */
+- cpu = rto_next_cpu(rq);
++ raw_spin_lock(&rt_rq->push_lock);
++ /*
++ * If the source queue changed since the IPI went out,
++ * we need to restart the search from that CPU again.
++ */
++ if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
++ rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
++ rt_rq->push_cpu = src_rq->cpu;
++ }
+
+- raw_spin_unlock(&rq->rd->rto_lock);
++ cpu = find_next_push_cpu(src_rq);
+
+- if (cpu < 0)
++ if (cpu >= nr_cpu_ids)
++ rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
++ raw_spin_unlock(&rt_rq->push_lock);
++
++ if (cpu >= nr_cpu_ids)
+ return;
+
++ /*
++ * It is possible that a restart caused this CPU to be
++ * chosen again. Don't bother with an IPI, just see if we
++ * have more to push.
++ */
++ if (unlikely(cpu == rq->cpu))
++ goto again;
++
+ /* Try the next RT overloaded CPU */
+- irq_work_queue_on(&rq->rd->rto_push_work, cpu);
++ irq_work_queue_on(&rt_rq->push_work, cpu);
++}
++
++static void push_irq_work_func(struct irq_work *work)
++{
++ struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
++
++ try_to_push_tasks(rt_rq);
+ }
+ #endif /* HAVE_RT_PUSH_IPI */
+
+diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
+index cff985feb6e7..ad77d666583c 100644
+--- a/kernel/sched/sched.h
++++ b/kernel/sched/sched.h
+@@ -463,7 +463,7 @@ static inline int rt_bandwidth_enabled(void)
+ }
+
+ /* RT IPI pull logic requires IRQ_WORK */
+-#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP)
++#ifdef CONFIG_IRQ_WORK
+ # define HAVE_RT_PUSH_IPI
+ #endif
+
+@@ -485,6 +485,12 @@ struct rt_rq {
+ unsigned long rt_nr_total;
+ int overloaded;
+ struct plist_head pushable_tasks;
++#ifdef HAVE_RT_PUSH_IPI
++ int push_flags;
++ int push_cpu;
++ struct irq_work push_work;
++ raw_spinlock_t push_lock;
++#endif
+ #endif /* CONFIG_SMP */
+ int rt_queued;
+
+@@ -566,19 +572,6 @@ struct root_domain {
+ struct dl_bw dl_bw;
+ struct cpudl cpudl;
+
+-#ifdef HAVE_RT_PUSH_IPI
+- /*
+- * For IPI pull requests, loop across the rto_mask.
+- */
+- struct irq_work rto_push_work;
+- raw_spinlock_t rto_lock;
+- /* These are only updated and read within rto_lock */
+- int rto_loop;
+- int rto_cpu;
+- /* These atomics are updated outside of a lock */
+- atomic_t rto_loop_next;
+- atomic_t rto_loop_start;
+-#endif
+ /*
+ * The "RT overload" flag: it gets set if a CPU has more than
+ * one runnable RT task.
+@@ -591,9 +584,6 @@ struct root_domain {
+
+ extern struct root_domain def_root_domain;
+
+-#ifdef HAVE_RT_PUSH_IPI
+-extern void rto_push_irq_work_func(struct irq_work *work);
+-#endif
+ #endif /* CONFIG_SMP */
+
+ /*
+--
+2.15.1
+
diff --git a/debian/patches/series b/debian/patches/series
index db05bd7..5c954c3 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -189,3 +189,4 @@ debian/revert-dma-fence-Introduce-drm_fence_set_error-helpe.patch
debian/revert-lib-genalloc.c-make-the-avail-variable-an-ato.patch
debian/revert-tcp-invalidate-rate-samples-during-SACK-reneg.patch
debian/bpf-avoid-abi-change-in-4.9.77.patch
+debian/revert-sched-rt-Simplify-the-IPI-based-RT-balancing-.patch
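For context on the ABI concern described in the revert above: struct rt_rq is embedded in other scheduler structures, so removing members shrinks the containing structure and shifts the offsets of everything that follows it, which is the kind of silent layout change that shows up as an ABI break for modules built against the previous headers. Below is a minimal user-space sketch of that effect; the struct and member names are invented for illustration and are not the kernel's.

/*
 * Illustrative sketch only: shows why dropping members from an embedded
 * struct changes the layout seen by code compiled against the old headers.
 * "inner"/"outer" are made-up stand-ins for struct rt_rq embedded in a
 * larger scheduler structure.
 */
#include <stdio.h>
#include <stddef.h>

struct inner_old {              /* before: carries the push_* fields   */
	int  flags;
	int  push_cpu;
	long push_lock;
};

struct inner_new {              /* after: the extra fields are removed */
	int  flags;
};

struct outer_old {
	struct inner_old rt;
	long clock;             /* members after the embedded struct...*/
};

struct outer_new {
	struct inner_new rt;
	long clock;             /* ...now live at different offsets     */
};

int main(void)
{
	printf("old: sizeof(outer)=%zu offsetof(clock)=%zu\n",
	       sizeof(struct outer_old), offsetof(struct outer_old, clock));
	printf("new: sizeof(outer)=%zu offsetof(clock)=%zu\n",
	       sizeof(struct outer_new), offsetof(struct outer_new, clock));
	return 0;
}

Compiled with any C compiler, the two printf lines show the later member moving to a smaller offset once the embedded fields disappear, which is why the Debian packaging treats the upstream change as an ABI change rather than a transparent bug fix.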
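The long comment block deleted by the revert (the lines beginning "When a high priority task schedules out from a CPU...") describes the upstream replacement mechanism: a single irq-work iterator walks the rto_mask, only the first CPU that lowers its priority starts it (an atomic try-lock on rto_loop_start), and every later CPU merely bumps rto_loop_next so the iterator rescans before it finishes. The following is a rough, single-threaded user-space model of that pattern; the variable names and the printf are invented for the sketch, and the real code drives the scan from hard-irq context via irq_work_queue_on().

#include <stdatomic.h>
#include <stdio.h>

/* Invented names; they mirror rto_loop_start / rto_loop_next / rto_loop
 * conceptually, not literally. */
static atomic_int loop_start = 0;   /* 0 = no iterator currently running */
static atomic_int loop_next  = 0;   /* bumped by every "please rescan"   */
static int        loop_seen  = 0;   /* last loop_next value acted upon   */

/* Called by any CPU that starts running a lower-priority task. */
static void request_push_scan(void)
{
	atomic_fetch_add(&loop_next, 1);        /* keep an active scan going  */

	int expected = 0;
	if (!atomic_compare_exchange_strong(&loop_start, &expected, 1))
		return;                         /* someone else runs the scan */

	/* This caller won the right to drive the iterator. */
	do {
		int target = atomic_load(&loop_next);
		printf("scanning overloaded CPUs (pass for request %d)\n",
		       target);
		loop_seen = target;
		/* ...push waiting RT tasks on each overloaded CPU here... */
	} while (loop_seen != atomic_load(&loop_next)); /* rescan if bumped */

	atomic_store(&loop_start, 0);
}

int main(void)
{
	request_push_scan();
	return 0;
}

The point of the compare-exchange is that contending CPUs return immediately instead of serializing on a per-CPU run-queue lock, which is the latency problem the removed comment describes; the revert restores the older per-rq push_flags state machine instead.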
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/kernel/linux.git