[linux] 26/29: sched/rt: Avoid ABI change in 4.9.66.
debian-kernel at lists.debian.org
Tue Jan 23 17:14:08 UTC 2018
This is an automated email from the git hooks/post-receive script.
corsac pushed a commit to branch stretch
in repository linux.
commit a4856155cee17e595db82b57c81a892fc758d855
Author: Yves-Alexis Perez <corsac at corsac.net>
Date: Tue Jan 23 10:33:41 2018 +0100
sched/rt: Avoid ABI change in 4.9.66.
---
debian/changelog | 1 +
...d-rt-Simplify-the-IPI-based-RT-balancing-.patch | 398 +++++++++++++++++++++
debian/patches/series | 1 +
3 files changed, 400 insertions(+)
diff --git a/debian/changelog b/debian/changelog
index c1597a6..44856ae 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -797,6 +797,7 @@ linux (4.9.77-1) UNRELEASED; urgency=medium
- bugfix/all/bluetooth-prevent-stack-info-leak-from-the-efs-element.patch
* bpf: avoid ABI change in 4.9.77.
* Ignore ABI change for cpu_tlbstate (symbol not exported _GPL anymore)
+ * sched/rt: Avoid ABI change in 4.9.66.
-- Ben Hutchings <ben at decadent.org.uk> Thu, 28 Dec 2017 02:16:23 +0000
diff --git a/debian/patches/debian/revert-sched-rt-Simplify-the-IPI-based-RT-balancing-.patch b/debian/patches/debian/revert-sched-rt-Simplify-the-IPI-based-RT-balancing-.patch
new file mode 100644
index 0000000..275a42d
--- /dev/null
+++ b/debian/patches/debian/revert-sched-rt-Simplify-the-IPI-based-RT-balancing-.patch
@@ -0,0 +1,398 @@
+From 3a7cd37c72cf0cd3f7ae82237ce71ee2c88df147 Mon Sep 17 00:00:00 2001
+From: Yves-Alexis Perez <corsac at debian.org>
+Date: Tue, 23 Jan 2018 08:23:44 +0100
+Subject: [PATCH] Revert "sched/rt: Simplify the IPI based RT balancing logic"
+
+This reverts commit 1c37ff78298a6b6063649123356a312e1cce12ca, which is
+commit 4bdced5c9a2922521e325896a7bbbf0132c94e56 upstream. That commit
+removes several fields from struct rt_rq, which is used in struct
+sched_rt_entity and, in turn, in struct task_struct. This amounts to a
+large ABI change for an enhancement fix, so revert it for now.
+
+---
+ kernel/sched/core.c | 6 --
+ kernel/sched/rt.c | 235 ++++++++++++++++++++++++++-------------------------
+ kernel/sched/sched.h | 24 ++----
+ 3 files changed, 127 insertions(+), 138 deletions(-)
+
+diff --git a/kernel/sched/core.c b/kernel/sched/core.c
+index e5066955cc3a..d748d4e5455d 100644
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -5877,12 +5877,6 @@ static int init_rootdomain(struct root_domain *rd)
+ if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+ goto free_dlo_mask;
+
+-#ifdef HAVE_RT_PUSH_IPI
+- rd->rto_cpu = -1;
+- raw_spin_lock_init(&rd->rto_lock);
+- init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
+-#endif
+-
+ init_dl_bw(&rd->dl_bw);
+ if (cpudl_init(&rd->cpudl) != 0)
+ goto free_dlo_mask;
+diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
+index 7a360d6f6798..34b1133cbac3 100644
+--- a/kernel/sched/rt.c
++++ b/kernel/sched/rt.c
+@@ -72,6 +72,10 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+ raw_spin_unlock(&rt_b->rt_runtime_lock);
+ }
+
++#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI)
++static void push_irq_work_func(struct irq_work *work);
++#endif
++
+ void init_rt_rq(struct rt_rq *rt_rq)
+ {
+ struct rt_prio_array *array;
+@@ -91,6 +95,13 @@ void init_rt_rq(struct rt_rq *rt_rq)
+ rt_rq->rt_nr_migratory = 0;
+ rt_rq->overloaded = 0;
+ plist_head_init(&rt_rq->pushable_tasks);
++
++#ifdef HAVE_RT_PUSH_IPI
++ rt_rq->push_flags = 0;
++ rt_rq->push_cpu = nr_cpu_ids;
++ raw_spin_lock_init(&rt_rq->push_lock);
++ init_irq_work(&rt_rq->push_work, push_irq_work_func);
++#endif
+ #endif /* CONFIG_SMP */
+ /* We start is dequeued state, because no RT tasks are queued */
+ rt_rq->rt_queued = 0;
+@@ -1853,166 +1864,160 @@ static void push_rt_tasks(struct rq *rq)
+ }
+
+ #ifdef HAVE_RT_PUSH_IPI
+-
+ /*
+- * When a high priority task schedules out from a CPU and a lower priority
+- * task is scheduled in, a check is made to see if there's any RT tasks
+- * on other CPUs that are waiting to run because a higher priority RT task
+- * is currently running on its CPU. In this case, the CPU with multiple RT
+- * tasks queued on it (overloaded) needs to be notified that a CPU has opened
+- * up that may be able to run one of its non-running queued RT tasks.
+- *
+- * All CPUs with overloaded RT tasks need to be notified as there is currently
+- * no way to know which of these CPUs have the highest priority task waiting
+- * to run. Instead of trying to take a spinlock on each of these CPUs,
+- * which has shown to cause large latency when done on machines with many
+- * CPUs, sending an IPI to the CPUs to have them push off the overloaded
+- * RT tasks waiting to run.
+- *
+- * Just sending an IPI to each of the CPUs is also an issue, as on large
+- * count CPU machines, this can cause an IPI storm on a CPU, especially
+- * if its the only CPU with multiple RT tasks queued, and a large number
+- * of CPUs scheduling a lower priority task at the same time.
+- *
+- * Each root domain has its own irq work function that can iterate over
+- * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
+- * tassk must be checked if there's one or many CPUs that are lowering
+- * their priority, there's a single irq work iterator that will try to
+- * push off RT tasks that are waiting to run.
+- *
+- * When a CPU schedules a lower priority task, it will kick off the
+- * irq work iterator that will jump to each CPU with overloaded RT tasks.
+- * As it only takes the first CPU that schedules a lower priority task
+- * to start the process, the rto_start variable is incremented and if
+- * the atomic result is one, then that CPU will try to take the rto_lock.
+- * This prevents high contention on the lock as the process handles all
+- * CPUs scheduling lower priority tasks.
+- *
+- * All CPUs that are scheduling a lower priority task will increment the
+- * rt_loop_next variable. This will make sure that the irq work iterator
+- * checks all RT overloaded CPUs whenever a CPU schedules a new lower
+- * priority task, even if the iterator is in the middle of a scan. Incrementing
+- * the rt_loop_next will cause the iterator to perform another scan.
++ * The search for the next cpu always starts at rq->cpu and ends
++ * when we reach rq->cpu again. It will never return rq->cpu.
++ * This returns the next cpu to check, or nr_cpu_ids if the loop
++ * is complete.
+ *
++ * rq->rt.push_cpu holds the last cpu returned by this function,
++ * or if this is the first instance, it must hold rq->cpu.
+ */
+ static int rto_next_cpu(struct rq *rq)
+ {
+- struct root_domain *rd = rq->rd;
+- int next;
++ int prev_cpu = rq->rt.push_cpu;
+ int cpu;
+
++ cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
++
+ /*
+- * When starting the IPI RT pushing, the rto_cpu is set to -1,
+- * rt_next_cpu() will simply return the first CPU found in
+- * the rto_mask.
+- *
+- * If rto_next_cpu() is called with rto_cpu is a valid cpu, it
+- * will return the next CPU found in the rto_mask.
+- *
+- * If there are no more CPUs left in the rto_mask, then a check is made
+- * against rto_loop and rto_loop_next. rto_loop is only updated with
+- * the rto_lock held, but any CPU may increment the rto_loop_next
+- * without any locking.
++ * If the previous cpu is less than the rq's CPU, then it already
++ * passed the end of the mask, and has started from the beginning.
++ * We end if the next CPU is greater or equal to rq's CPU.
+ */
+- for (;;) {
+-
+- /* When rto_cpu is -1 this acts like cpumask_first() */
+- cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
+-
+- rd->rto_cpu = cpu;
+-
+- if (cpu < nr_cpu_ids)
+- return cpu;
+-
+- rd->rto_cpu = -1;
++ if (prev_cpu < rq->cpu) {
++ if (cpu >= rq->cpu)
++ return nr_cpu_ids;
+
++ } else if (cpu >= nr_cpu_ids) {
+ /*
+- * ACQUIRE ensures we see the @rto_mask changes
+- * made prior to the @next value observed.
+- *
+- * Matches WMB in rt_set_overload().
++ * We passed the end of the mask, start at the beginning.
++ * If the result is greater or equal to the rq's CPU, then
++ * the loop is finished.
+ */
+- next = atomic_read_acquire(&rd->rto_loop_next);
+-
+- if (rd->rto_loop == next)
+- break;
+-
+- rd->rto_loop = next;
++ cpu = cpumask_first(rq->rd->rto_mask);
++ if (cpu >= rq->cpu)
++ return nr_cpu_ids;
+ }
++ rq->rt.push_cpu = cpu;
+
+- return -1;
++ /* Return cpu to let the caller know if the loop is finished or not */
++ return cpu;
+ }
+
+-static inline bool rto_start_trylock(atomic_t *v)
++static int find_next_push_cpu(struct rq *rq)
+ {
+- return !atomic_cmpxchg_acquire(v, 0, 1);
+-}
++ struct rq *next_rq;
++ int cpu;
+
+-static inline void rto_start_unlock(atomic_t *v)
+-{
+- atomic_set_release(v, 0);
++ while (1) {
++ cpu = rto_next_cpu(rq);
++ if (cpu >= nr_cpu_ids)
++ break;
++ next_rq = cpu_rq(cpu);
++
++ /* Make sure the next rq can push to this rq */
++ if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
++ break;
++ }
++
++ return cpu;
+ }
+
++#define RT_PUSH_IPI_EXECUTING 1
++#define RT_PUSH_IPI_RESTART 2
++
+ static void tell_cpu_to_push(struct rq *rq)
+ {
+- int cpu = -1;
+-
+- /* Keep the loop going if the IPI is currently active */
+- atomic_inc(&rq->rd->rto_loop_next);
+-
+- /* Only one CPU can initiate a loop at a time */
+- if (!rto_start_trylock(&rq->rd->rto_loop_start))
+- return;
++ int cpu;
+
+- raw_spin_lock(&rq->rd->rto_lock);
++ if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
++ raw_spin_lock(&rq->rt.push_lock);
++ /* Make sure it's still executing */
++ if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
++ /*
++ * Tell the IPI to restart the loop as things have
++ * changed since it started.
++ */
++ rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
++ raw_spin_unlock(&rq->rt.push_lock);
++ return;
++ }
++ raw_spin_unlock(&rq->rt.push_lock);
++ }
+
+- /*
+- * The rto_cpu is updated under the lock, if it has a valid cpu
+- * then the IPI is still running and will continue due to the
+- * update to loop_next, and nothing needs to be done here.
+- * Otherwise it is finishing up and an ipi needs to be sent.
+- */
+- if (rq->rd->rto_cpu < 0)
+- cpu = rto_next_cpu(rq);
++ /* When here, there's no IPI going around */
+
+- raw_spin_unlock(&rq->rd->rto_lock);
++ rq->rt.push_cpu = rq->cpu;
++ cpu = find_next_push_cpu(rq);
++ if (cpu >= nr_cpu_ids)
++ return;
+
+- rto_start_unlock(&rq->rd->rto_loop_start);
++ rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
+
+- if (cpu >= 0)
+- irq_work_queue_on(&rq->rd->rto_push_work, cpu);
++ irq_work_queue_on(&rq->rt.push_work, cpu);
+ }
+
+ /* Called from hardirq context */
+-void rto_push_irq_work_func(struct irq_work *work)
++static void try_to_push_tasks(void *arg)
+ {
+- struct rq *rq;
++ struct rt_rq *rt_rq = arg;
++ struct rq *rq, *src_rq;
++ int this_cpu;
+ int cpu;
+
+- rq = this_rq();
++ this_cpu = rt_rq->push_cpu;
+
+- /*
+- * We do not need to grab the lock to check for has_pushable_tasks.
+- * When it gets updated, a check is made if a push is possible.
+- */
++ /* Paranoid check */
++ BUG_ON(this_cpu != smp_processor_id());
++
++ rq = cpu_rq(this_cpu);
++ src_rq = rq_of_rt_rq(rt_rq);
++
++again:
+ if (has_pushable_tasks(rq)) {
+ raw_spin_lock(&rq->lock);
+- push_rt_tasks(rq);
++ push_rt_task(rq);
+ raw_spin_unlock(&rq->lock);
+ }
+
+- raw_spin_lock(&rq->rd->rto_lock);
+-
+ /* Pass the IPI to the next rt overloaded queue */
+- cpu = rto_next_cpu(rq);
++ raw_spin_lock(&rt_rq->push_lock);
++ /*
++ * If the source queue changed since the IPI went out,
++ * we need to restart the search from that CPU again.
++ */
++ if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
++ rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
++ rt_rq->push_cpu = src_rq->cpu;
++ }
+
+- raw_spin_unlock(&rq->rd->rto_lock);
++ cpu = find_next_push_cpu(src_rq);
+
+- if (cpu < 0)
++ if (cpu >= nr_cpu_ids)
++ rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
++ raw_spin_unlock(&rt_rq->push_lock);
++
++ if (cpu >= nr_cpu_ids)
+ return;
+
++ /*
++ * It is possible that a restart caused this CPU to be
++ * chosen again. Don't bother with an IPI, just see if we
++ * have more to push.
++ */
++ if (unlikely(cpu == rq->cpu))
++ goto again;
++
+ /* Try the next RT overloaded CPU */
+- irq_work_queue_on(&rq->rd->rto_push_work, cpu);
++ irq_work_queue_on(&rt_rq->push_work, cpu);
++}
++
++static void push_irq_work_func(struct irq_work *work)
++{
++ struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
++
++ try_to_push_tasks(rt_rq);
+ }
+ #endif /* HAVE_RT_PUSH_IPI */
+
+diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
+index cff985feb6e7..ad77d666583c 100644
+--- a/kernel/sched/sched.h
++++ b/kernel/sched/sched.h
+@@ -463,7 +463,7 @@ static inline int rt_bandwidth_enabled(void)
+ }
+
+ /* RT IPI pull logic requires IRQ_WORK */
+-#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP)
++#ifdef CONFIG_IRQ_WORK
+ # define HAVE_RT_PUSH_IPI
+ #endif
+
+@@ -485,6 +485,12 @@ struct rt_rq {
+ unsigned long rt_nr_total;
+ int overloaded;
+ struct plist_head pushable_tasks;
++#ifdef HAVE_RT_PUSH_IPI
++ int push_flags;
++ int push_cpu;
++ struct irq_work push_work;
++ raw_spinlock_t push_lock;
++#endif
+ #endif /* CONFIG_SMP */
+ int rt_queued;
+
+@@ -566,19 +572,6 @@ struct root_domain {
+ struct dl_bw dl_bw;
+ struct cpudl cpudl;
+
+-#ifdef HAVE_RT_PUSH_IPI
+- /*
+- * For IPI pull requests, loop across the rto_mask.
+- */
+- struct irq_work rto_push_work;
+- raw_spinlock_t rto_lock;
+- /* These are only updated and read within rto_lock */
+- int rto_loop;
+- int rto_cpu;
+- /* These atomics are updated outside of a lock */
+- atomic_t rto_loop_next;
+- atomic_t rto_loop_start;
+-#endif
+ /*
+ * The "RT overload" flag: it gets set if a CPU has more than
+ * one runnable RT task.
+@@ -591,9 +584,6 @@ struct root_domain {
+
+ extern struct root_domain def_root_domain;
+
+-#ifdef HAVE_RT_PUSH_IPI
+-extern void rto_push_irq_work_func(struct irq_work *work);
+-#endif
+ #endif /* CONFIG_SMP */
+
+ /*
+--
+2.15.1
+
diff --git a/debian/patches/series b/debian/patches/series
index db05bd7..5c954c3 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -189,3 +189,4 @@ debian/revert-dma-fence-Introduce-drm_fence_set_error-helpe.patch
debian/revert-lib-genalloc.c-make-the-avail-variable-an-ato.patch
debian/revert-tcp-invalidate-rate-samples-during-SACK-reneg.patch
debian/bpf-avoid-abi-change-in-4.9.77.patch
+debian/revert-sched-rt-Simplify-the-IPI-based-RT-balancing-.patch
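For context on the ABI concern described in the revert above: struct rt_rq is embedded in other scheduler structures, so removing members shrinks the containing structure and shifts the offsets of everything that follows it, which is the kind of silent layout change that shows up as an ABI break for modules built against the previous headers. Below is a minimal user-space sketch of that effect; the struct and member names are invented for illustration and are not the kernel's.

/*
 * Illustrative sketch only: shows why dropping members from an embedded
 * struct changes the layout seen by code compiled against the old headers.
 * "inner"/"outer" are made-up stand-ins for struct rt_rq embedded in a
 * larger scheduler structure.
 */
#include <stdio.h>
#include <stddef.h>

struct inner_old {              /* before: carries the push_* fields   */
	int  flags;
	int  push_cpu;
	long push_lock;
};

struct inner_new {              /* after: the extra fields are removed */
	int  flags;
};

struct outer_old {
	struct inner_old rt;
	long clock;             /* members after the embedded struct...*/
};

struct outer_new {
	struct inner_new rt;
	long clock;             /* ...now live at different offsets     */
};

int main(void)
{
	printf("old: sizeof(outer)=%zu offsetof(clock)=%zu\n",
	       sizeof(struct outer_old), offsetof(struct outer_old, clock));
	printf("new: sizeof(outer)=%zu offsetof(clock)=%zu\n",
	       sizeof(struct outer_new), offsetof(struct outer_new, clock));
	return 0;
}

Compiled with any C compiler, the two printf lines show the later member moving to a smaller offset once the embedded fields disappear, which is why the Debian packaging treats the upstream change as an ABI change rather than a transparent bug fix.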
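The long comment block deleted by the revert (the lines beginning "When a high priority task schedules out from a CPU...") describes the upstream replacement mechanism: a single irq-work iterator walks the rto_mask, only the first CPU that lowers its priority starts it (an atomic try-lock on rto_loop_start), and every later CPU merely bumps rto_loop_next so the iterator rescans before it finishes. The following is a rough, single-threaded user-space model of that pattern; the variable names and the printf are invented for the sketch, and the real code drives the scan from hard-irq context via irq_work_queue_on().

#include <stdatomic.h>
#include <stdio.h>

/* Invented names; they mirror rto_loop_start / rto_loop_next / rto_loop
 * conceptually, not literally. */
static atomic_int loop_start = 0;   /* 0 = no iterator currently running */
static atomic_int loop_next  = 0;   /* bumped by every "please rescan"   */
static int        loop_seen  = 0;   /* last loop_next value acted upon   */

/* Called by any CPU that starts running a lower-priority task. */
static void request_push_scan(void)
{
	atomic_fetch_add(&loop_next, 1);        /* keep an active scan going  */

	int expected = 0;
	if (!atomic_compare_exchange_strong(&loop_start, &expected, 1))
		return;                         /* someone else runs the scan */

	/* This caller won the right to drive the iterator. */
	do {
		int target = atomic_load(&loop_next);
		printf("scanning overloaded CPUs (pass for request %d)\n",
		       target);
		loop_seen = target;
		/* ...push waiting RT tasks on each overloaded CPU here... */
	} while (loop_seen != atomic_load(&loop_next)); /* rescan if bumped */

	atomic_store(&loop_start, 0);
}

int main(void)
{
	request_push_scan();
	return 0;
}

The point of the compare-exchange is that contending CPUs return immediately instead of serializing on a per-CPU run-queue lock, which is the latency problem the removed comment describes; the revert restores the older per-rq push_flags state machine instead.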
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/kernel/linux.git