[linux] 09/11: mm: Avoid ABI change in 4.9.79.

debian-kernel at lists.debian.org
Fri Feb 2 15:17:08 UTC 2018


This is an automated email from the git hooks/post-receive script.

corsac pushed a commit to branch stretch
in repository linux.

commit f7fb895cc26fd5070b3420e5f06e0fa738a1528c
Author: Yves-Alexis Perez <corsac at debian.org>
Date:   Fri Feb 2 10:35:17 2018 +0100

    mm: Avoid ABI change in 4.9.79.
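    
    The reverted upstream fix adds an int kswapd_failures counter in the
    middle of struct pglist_data, which shifts the offset of every member
    declared after it and so breaks binary compatibility with modules
    built against the 4.9.78 layout. As a minimal sketch of the effect
    (simplified, hypothetical struct names, not the real kernel headers):
    
        #include <stdio.h>
        #include <stddef.h>
        
        /* Layout as modules built against 4.9.78 expect it. */
        struct pgdat_old {
                int kswapd_order;
                int kswapd_classzone_idx;
                int kcompactd_max_order;   /* some fixed offset */
        };
        
        /* Layout after the upstream fix inserts a new field. */
        struct pgdat_new {
                int kswapd_order;
                int kswapd_classzone_idx;
                int kswapd_failures;       /* new field inserted here */
                int kcompactd_max_order;   /* offset shifted by 4 bytes */
        };
        
        int main(void)
        {
                printf("old: %zu, new: %zu\n",
                       offsetof(struct pgdat_old, kcompactd_max_order),
                       offsetof(struct pgdat_new, kcompactd_max_order));
                return 0;
        }
    
    A module compiled against the old offsets would access the wrong
    member through the new layout, hence the revert rather than an ABI
    bump.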
---
 debian/changelog                                   |   1 +
 ...ix-100-CPU-kswapd-busyloop-on-unreclaimab.patch | 244 +++++++++++++++++++++
 debian/patches/series                              |   1 +
 3 files changed, 246 insertions(+)

diff --git a/debian/changelog b/debian/changelog
index 5712101..604af63 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -870,6 +870,7 @@ linux (4.9.79-1) UNRELEASED; urgency=medium
   * RT patchset:
     - fix context against 4.9.78 (164, 165, 229, 230)
     - refresh for fuzz (228)
+  * mm: Avoid ABI change in 4.9.79.
 
   [ Salvatore Bonaccorso ]
   * nfsd: auth: Fix gid sorting when rootsquash enabled (CVE-2018-1000028)
diff --git a/debian/patches/debian/revert-mm-fix-100-CPU-kswapd-busyloop-on-unreclaimab.patch b/debian/patches/debian/revert-mm-fix-100-CPU-kswapd-busyloop-on-unreclaimab.patch
new file mode 100644
index 0000000..5b72794
--- /dev/null
+++ b/debian/patches/debian/revert-mm-fix-100-CPU-kswapd-busyloop-on-unreclaimab.patch
@@ -0,0 +1,244 @@
+From 41080b302b0434b448531d75ec61b7df900511bc Mon Sep 17 00:00:00 2001
+From: Yves-Alexis Perez <corsac at debian.org>
+Date: Fri, 2 Feb 2018 09:46:54 +0100
+Subject: [PATCH] Revert "mm: fix 100% CPU kswapd busyloop on unreclaimable nodes"
+Forwarded: not-needed
+
+This reverts commit 19a7db1e2ef38865a704ea4dfd178b02a8026ada, which is
+c73322d098e4b6f5f0f0fa1330bf57e218775539 upstream. The upstream commit adds a
+new field to struct pglist_data and therefore changes the ABI. Since the
+problem it fixes doesn't seem to occur often, revert the change for now.
+---
+ include/linux/mmzone.h |  2 --
+ mm/internal.h          |  6 ------
+ mm/page_alloc.c        |  9 +++++++--
+ mm/vmscan.c            | 47 +++++++++++++++--------------------------------
+ mm/vmstat.c            |  2 +-
+ 5 files changed, 23 insertions(+), 43 deletions(-)
+
+diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
+index e3d7754f25f0..490f5a83f947 100644
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -633,8 +633,6 @@ typedef struct pglist_data {
+ 	int kswapd_order;
+ 	enum zone_type kswapd_classzone_idx;
+ 
+-	int kswapd_failures;		/* Number of 'reclaimed == 0' runs */
+-
+ #ifdef CONFIG_COMPACTION
+ 	int kcompactd_max_order;
+ 	enum zone_type kcompactd_classzone_idx;
+diff --git a/mm/internal.h b/mm/internal.h
+index 3e2d01694747..34a5459e5989 100644
+--- a/mm/internal.h
++++ b/mm/internal.h
+@@ -73,12 +73,6 @@ static inline void set_page_refcounted(struct page *page)
+ 
+ extern unsigned long highest_memmap_pfn;
+ 
+-/*
+- * Maximum number of reclaim retries without progress before the OOM
+- * killer is consider the only way forward.
+- */
+-#define MAX_RECLAIM_RETRIES 16
+-
+ /*
+  * in mm/vmscan.c:
+  */
+diff --git a/mm/page_alloc.c b/mm/page_alloc.c
+index 94018ea5f935..546713b3f762 100644
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -3421,6 +3421,12 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+ 	return false;
+ }
+ 
++/*
++ * Maximum number of reclaim retries without any progress before OOM killer
++ * is consider as the only way to move forward.
++ */
++#define MAX_RECLAIM_RETRIES 16
++
+ /*
+  * Checks whether it makes sense to retry the reclaim to make a forward progress
+  * for the given allocation request.
+@@ -4379,8 +4385,7 @@ void show_free_areas(unsigned int filter)
+ 			K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
+ 			K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
+ 			node_page_state(pgdat, NR_PAGES_SCANNED),
+-			pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
+-				"yes" : "no");
++			!pgdat_reclaimable(pgdat) ? "yes" : "no");
+ 	}
+ 
+ 	for_each_populated_zone(zone) {
+diff --git a/mm/vmscan.c b/mm/vmscan.c
+index f118dc23f662..30a88b945a44 100644
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2606,15 +2606,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
+ 	} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
+ 					 sc->nr_scanned - nr_scanned, sc));
+ 
+-	/*
+-	 * Kswapd gives up on balancing particular nodes after too
+-	 * many failures to reclaim anything from them and goes to
+-	 * sleep. On reclaim progress, reset the failure counter. A
+-	 * successful direct reclaim run will revive a dormant kswapd.
+-	 */
+-	if (reclaimable)
+-		pgdat->kswapd_failures = 0;
+-
+ 	return reclaimable;
+ }
+ 
+@@ -2689,6 +2680,10 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
+ 						 GFP_KERNEL | __GFP_HARDWALL))
+ 				continue;
+ 
++			if (sc->priority != DEF_PRIORITY &&
++			    !pgdat_reclaimable(zone->zone_pgdat))
++				continue;	/* Let kswapd poll it */
++
+ 			/*
+ 			 * If we already have plenty of memory free for
+ 			 * compaction in this zone, don't free any more.
+@@ -2825,7 +2820,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
+ 	return 0;
+ }
+ 
+-static bool allow_direct_reclaim(pg_data_t *pgdat)
++static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
+ {
+ 	struct zone *zone;
+ 	unsigned long pfmemalloc_reserve = 0;
+@@ -2833,9 +2828,6 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
+ 	int i;
+ 	bool wmark_ok;
+ 
+-	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+-		return true;
+-
+ 	for (i = 0; i <= ZONE_NORMAL; i++) {
+ 		zone = &pgdat->node_zones[i];
+ 		if (!managed_zone(zone) ||
+@@ -2916,7 +2908,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
+ 
+ 		/* Throttle based on the first usable node */
+ 		pgdat = zone->zone_pgdat;
+-		if (allow_direct_reclaim(pgdat))
++		if (pfmemalloc_watermark_ok(pgdat))
+ 			goto out;
+ 		break;
+ 	}
+@@ -2938,14 +2930,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
+ 	 */
+ 	if (!(gfp_mask & __GFP_FS)) {
+ 		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
+-			allow_direct_reclaim(pgdat), HZ);
++			pfmemalloc_watermark_ok(pgdat), HZ);
+ 
+ 		goto check_pending;
+ 	}
+ 
+ 	/* Throttle until kswapd wakes the process */
+ 	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
+-		allow_direct_reclaim(pgdat));
++		pfmemalloc_watermark_ok(pgdat));
+ 
+ check_pending:
+ 	if (fatal_signal_pending(current))
+@@ -3124,7 +3116,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
+ 
+ 	/*
+ 	 * The throttled processes are normally woken up in balance_pgdat() as
+-	 * soon as allow_direct_reclaim() is true. But there is a potential
++	 * soon as pfmemalloc_watermark_ok() is true. But there is a potential
+ 	 * race between when kswapd checks the watermarks and a process gets
+ 	 * throttled. There is also a potential race if processes get
+ 	 * throttled, kswapd wakes, a large process exits thereby balancing the
+@@ -3138,10 +3130,6 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
+ 	if (waitqueue_active(&pgdat->pfmemalloc_wait))
+ 		wake_up_all(&pgdat->pfmemalloc_wait);
+ 
+-	/* Hopeless node, leave it to direct reclaim */
+-	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+-		return true;
+-
+ 	for (i = 0; i <= classzone_idx; i++) {
+ 		struct zone *zone = pgdat->node_zones + i;
+ 
+@@ -3228,9 +3216,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
+ 	count_vm_event(PAGEOUTRUN);
+ 
+ 	do {
+-		unsigned long nr_reclaimed = sc.nr_reclaimed;
+ 		bool raise_priority = true;
+ 
++		sc.nr_reclaimed = 0;
+ 		sc.reclaim_idx = classzone_idx;
+ 
+ 		/*
+@@ -3309,7 +3297,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
+ 		 * able to safely make forward progress. Wake them
+ 		 */
+ 		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
+-				allow_direct_reclaim(pgdat))
++				pfmemalloc_watermark_ok(pgdat))
+ 			wake_up_all(&pgdat->pfmemalloc_wait);
+ 
+ 		/* Check if kswapd should be suspending */
+@@ -3320,14 +3308,10 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
+ 		 * Raise priority if scanning rate is too low or there was no
+ 		 * progress in reclaiming pages
+ 		 */
+-		nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
+-		if (raise_priority || !nr_reclaimed)
++		if (raise_priority || !sc.nr_reclaimed)
+ 			sc.priority--;
+ 	} while (sc.priority >= 1);
+ 
+-	if (!sc.nr_reclaimed)
+-		pgdat->kswapd_failures++;
+-
+ out:
+ 	/*
+ 	 * Return the order kswapd stopped reclaiming at as
+@@ -3527,10 +3511,6 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
+ 	if (!waitqueue_active(&pgdat->kswapd_wait))
+ 		return;
+ 
+-	/* Hopeless node, leave it to direct reclaim */
+-	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+-		return;
+-
+ 	/* Only wake kswapd if all zones are unbalanced */
+ 	for (z = 0; z <= classzone_idx; z++) {
+ 		zone = pgdat->node_zones + z;
+@@ -3801,6 +3781,9 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
+ 	    sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
+ 		return NODE_RECLAIM_FULL;
+ 
++	if (!pgdat_reclaimable(pgdat))
++		return NODE_RECLAIM_FULL;
++
+ 	/*
+ 	 * Do not scan if the allocation should not be delayed.
+ 	 */
+diff --git a/mm/vmstat.c b/mm/vmstat.c
+index 3863b5d6d598..6a088df04b29 100644
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -1421,7 +1421,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
+ 		   "\n  node_unreclaimable:  %u"
+ 		   "\n  start_pfn:           %lu"
+ 		   "\n  node_inactive_ratio: %u",
+-		   pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
++		   !pgdat_reclaimable(zone->zone_pgdat),
+ 		   zone->zone_start_pfn,
+ 		   zone->zone_pgdat->inactive_ratio);
+ 	seq_putc(m, '\n');
+-- 
+2.15.1
+
diff --git a/debian/patches/series b/debian/patches/series
index 77b25ae..5a283c8 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -187,3 +187,4 @@ debian/revert-lib-genalloc.c-make-the-avail-variable-an-ato.patch
 debian/revert-tcp-invalidate-rate-samples-during-SACK-reneg.patch
 debian/bpf-avoid-abi-change-in-4.9.77.patch
 debian/revert-sched-rt-Simplify-the-IPI-based-RT-balancing-.patch
+debian/revert-mm-fix-100-CPU-kswapd-busyloop-on-unreclaimab.patch

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/kernel/linux.git
