[kernel] r12470 - in dists/sid/linux-2.6/debian: . patches/bugfix/all patches/series
Dann Frazier
dannf at alioth.debian.org
Tue Dec 2 07:07:00 UTC 2008
Author: dannf
Date: Tue Dec 2 07:06:59 2008
New Revision: 12470
Log:
Fix race conditions between inotify removal and umount (CVE-2008-5182)
Added:
dists/sid/linux-2.6/debian/patches/bugfix/all/inotify-watch-removal-umount-races.patch
Modified:
dists/sid/linux-2.6/debian/changelog
dists/sid/linux-2.6/debian/patches/series/12
Modified: dists/sid/linux-2.6/debian/changelog
==============================================================================
--- dists/sid/linux-2.6/debian/changelog (original)
+++ dists/sid/linux-2.6/debian/changelog Tue Dec 2 07:06:59 2008
@@ -5,8 +5,9 @@
[ dann frazier ]
* Make sendmsg() block during UNIX garbage collection (CVE-2008-5300)
+ * Fix race conditions between inotify removal and umount (CVE-2008-5182)
- -- dann frazier <dannf at debian.org> Mon, 01 Dec 2008 09:59:41 -0700
+ -- dann frazier <dannf at debian.org> Mon, 01 Dec 2008 23:32:05 -0700
linux-2.6 (2.6.26-11) unstable; urgency=low
Added: dists/sid/linux-2.6/debian/patches/bugfix/all/inotify-watch-removal-umount-races.patch
==============================================================================
--- (empty file)
+++ dists/sid/linux-2.6/debian/patches/bugfix/all/inotify-watch-removal-umount-races.patch Tue Dec 2 07:06:59 2008
@@ -0,0 +1,564 @@
+commit 8f7b0ba1c853919b85b54774775f567f30006107
+Author: Al Viro <viro at ZenIV.linux.org.uk>
+Date: Sat Nov 15 01:15:43 2008 +0000
+
+ Fix inotify watch removal/umount races
+
+ Inotify watch removals suck violently.
+
+ To kick the watch out we need (in this order) inode->inotify_mutex and
+ ih->mutex. That's fine if we have a hold on inode; however, for all
+ other cases we need to make damn sure we don't race with umount. We can
+ *NOT* just grab a reference to a watch - inotify_unmount_inodes() will
+ happily sail past it and we'll end with reference to inode potentially
+ outliving its superblock.
+
+ Ideally we just want to grab an active reference to superblock if we
+ can; that will make sure we won't go into inotify_umount_inodes() until
+ we are done. Cleanup is just deactivate_super().
+
+ However, that leaves a messy case - what if we *are* racing with
+ umount() and active references to superblock can't be acquired anymore?
+ We can bump ->s_count, grab ->s_umount, which will almost certainly wait
+ until the superblock is shut down and the watch in question is pining
+ for fjords. That's fine, but there is a problem - we might have hit the
+ window between ->s_active getting to 0 / ->s_count - below S_BIAS (i.e.
+ the moment when superblock is past the point of no return and is heading
+ for shutdown) and the moment when deactivate_super() acquires
+ ->s_umount.
+
+ We could just do drop_super() yield() and retry, but that's rather
+ antisocial and this stuff is luser-triggerable. OTOH, having grabbed
+ ->s_umount and having found that we'd got there first (i.e. that
+ ->s_root is non-NULL) we know that we won't race with
+ inotify_umount_inodes().
+
+ So we could grab a reference to watch and do the rest as above, just
+ with drop_super() instead of deactivate_super(), right? Wrong. We had
+ to drop ih->mutex before we could grab ->s_umount. So the watch
+ could've been gone already.
+
+ That still can be dealt with - we need to save watch->wd, do idr_find()
+ and compare its result with our pointer. If they match, we either have
+ the damn thing still alive or we'd lost not one but two races at once,
+ the watch had been killed and a new one got created with the same ->wd
+ at the same address. That couldn't have happened in inotify_destroy(),
+ but inotify_rm_wd() could run into that. Still, "new one got created"
+ is not a problem - we have every right to kill it or leave it alone,
+ whatever's more convenient.
+
+ So we can use idr_find(...) == watch && watch->inode->i_sb == sb as
+ "grab it and kill it" check. If it's been our original watch, we are
+ fine, if it's a newcomer - nevermind, just pretend that we'd won the
+ race and kill the fscker anyway; we are safe since we know that its
+ superblock won't be going away.
+
+ And yes, this is far beyond mere "not very pretty"; so's the entire
+ concept of inotify to start with.
+
+ Signed-off-by: Al Viro <viro at zeniv.linux.org.uk>
+ Acked-by: Greg KH <greg at kroah.com>
+ Signed-off-by: Linus Torvalds <torvalds at linux-foundation.org>
+
+diff --git a/fs/inotify.c b/fs/inotify.c
+index 690e725..7bbed1b 100644
+--- a/fs/inotify.c
++++ b/fs/inotify.c
+@@ -106,6 +106,20 @@ void get_inotify_watch(struct inotify_watch *watch)
+ }
+ EXPORT_SYMBOL_GPL(get_inotify_watch);
+
++int pin_inotify_watch(struct inotify_watch *watch)
++{
++ struct super_block *sb = watch->inode->i_sb;
++ spin_lock(&sb_lock);
++ if (sb->s_count >= S_BIAS) {
++ atomic_inc(&sb->s_active);
++ spin_unlock(&sb_lock);
++ atomic_inc(&watch->count);
++ return 1;
++ }
++ spin_unlock(&sb_lock);
++ return 0;
++}
++
+ /**
+ * put_inotify_watch - decrements the ref count on a given watch. cleans up
+ * watch references if the count reaches zero. inotify_watch is freed by
+@@ -124,6 +138,13 @@ void put_inotify_watch(struct inotify_watch *watch)
+ }
+ EXPORT_SYMBOL_GPL(put_inotify_watch);
+
++void unpin_inotify_watch(struct inotify_watch *watch)
++{
++ struct super_block *sb = watch->inode->i_sb;
++ put_inotify_watch(watch);
++ deactivate_super(sb);
++}
++
+ /*
+ * inotify_handle_get_wd - returns the next WD for use by the given handle
+ *
+@@ -479,6 +500,112 @@ void inotify_init_watch(struct inotify_watch *watch)
+ }
+ EXPORT_SYMBOL_GPL(inotify_init_watch);
+
++/*
++ * Watch removals suck violently. To kick the watch out we need (in this
++ * order) inode->inotify_mutex and ih->mutex. That's fine if we have
++ * a hold on inode; however, for all other cases we need to make damn sure
++ * we don't race with umount. We can *NOT* just grab a reference to a
++ * watch - inotify_unmount_inodes() will happily sail past it and we'll end
++ * with reference to inode potentially outliving its superblock. Ideally
++ * we just want to grab an active reference to superblock if we can; that
++ * will make sure we won't go into inotify_umount_inodes() until we are
++ * done. Cleanup is just deactivate_super(). However, that leaves a messy
++ * case - what if we *are* racing with umount() and active references to
++ * superblock can't be acquired anymore? We can bump ->s_count, grab
++ * ->s_umount, which will almost certainly wait until the superblock is shut
++ * down and the watch in question is pining for fjords. That's fine, but
++ * there is a problem - we might have hit the window between ->s_active
++ * getting to 0 / ->s_count - below S_BIAS (i.e. the moment when superblock
++ * is past the point of no return and is heading for shutdown) and the
++ * moment when deactivate_super() acquires ->s_umount. We could just do
++ * drop_super() yield() and retry, but that's rather antisocial and this
++ * stuff is luser-triggerable. OTOH, having grabbed ->s_umount and having
++ * found that we'd got there first (i.e. that ->s_root is non-NULL) we know
++ * that we won't race with inotify_umount_inodes(). So we could grab a
++ * reference to watch and do the rest as above, just with drop_super() instead
++ * of deactivate_super(), right? Wrong. We had to drop ih->mutex before we
++ * could grab ->s_umount. So the watch could've been gone already.
++ *
++ * That still can be dealt with - we need to save watch->wd, do idr_find()
++ * and compare its result with our pointer. If they match, we either have
++ * the damn thing still alive or we'd lost not one but two races at once,
++ * the watch had been killed and a new one got created with the same ->wd
++ * at the same address. That couldn't have happened in inotify_destroy(),
++ * but inotify_rm_wd() could run into that. Still, "new one got created"
++ * is not a problem - we have every right to kill it or leave it alone,
++ * whatever's more convenient.
++ *
++ * So we can use idr_find(...) == watch && watch->inode->i_sb == sb as
++ * "grab it and kill it" check. If it's been our original watch, we are
++ * fine, if it's a newcomer - nevermind, just pretend that we'd won the
++ * race and kill the fscker anyway; we are safe since we know that its
++ * superblock won't be going away.
++ *
++ * And yes, this is far beyond mere "not very pretty"; so's the entire
++ * concept of inotify to start with.
++ */
++
++/**
++ * pin_to_kill - pin the watch down for removal
++ * @ih: inotify handle
++ * @watch: watch to kill
++ *
++ * Called with ih->mutex held, drops it. Possible return values:
++ * 0 - nothing to do, it has died
++ * 1 - remove it, drop the reference and deactivate_super()
++ * 2 - remove it, drop the reference and drop_super(); we tried hard to avoid
++ * that variant, since it involved a lot of PITA, but that's the best that
++ * could've been done.
++ */
++static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch)
++{
++ struct super_block *sb = watch->inode->i_sb;
++ s32 wd = watch->wd;
++
++ spin_lock(&sb_lock);
++ if (sb->s_count >= S_BIAS) {
++ atomic_inc(&sb->s_active);
++ spin_unlock(&sb_lock);
++ get_inotify_watch(watch);
++ mutex_unlock(&ih->mutex);
++ return 1; /* the best outcome */
++ }
++ sb->s_count++;
++ spin_unlock(&sb_lock);
++ mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */
++ down_read(&sb->s_umount);
++ if (likely(!sb->s_root)) {
++ /* fs is already shut down; the watch is dead */
++ drop_super(sb);
++ return 0;
++ }
++ /* raced with the final deactivate_super() */
++ mutex_lock(&ih->mutex);
++ if (idr_find(&ih->idr, wd) != watch || watch->inode->i_sb != sb) {
++ /* the watch is dead */
++ mutex_unlock(&ih->mutex);
++ drop_super(sb);
++ return 0;
++ }
++ /* still alive or freed and reused with the same sb and wd; kill */
++ get_inotify_watch(watch);
++ mutex_unlock(&ih->mutex);
++ return 2;
++}
++
++static void unpin_and_kill(struct inotify_watch *watch, int how)
++{
++ struct super_block *sb = watch->inode->i_sb;
++ put_inotify_watch(watch);
++ switch (how) {
++ case 1:
++ deactivate_super(sb);
++ break;
++ case 2:
++ drop_super(sb);
++ }
++}
++
+ /**
+ * inotify_destroy - clean up and destroy an inotify instance
+ * @ih: inotify handle
+@@ -490,11 +617,15 @@ void inotify_destroy(struct inotify_handle *ih)
+ * pretty. We cannot do a simple iteration over the list, because we
+ * do not know the inode until we iterate to the watch. But we need to
+ * hold inode->inotify_mutex before ih->mutex. The following works.
++ *
++ * AV: it had to become even uglier to start working ;-/
+ */
+ while (1) {
+ struct inotify_watch *watch;
+ struct list_head *watches;
++ struct super_block *sb;
+ struct inode *inode;
++ int how;
+
+ mutex_lock(&ih->mutex);
+ watches = &ih->watches;
+@@ -503,8 +634,10 @@ void inotify_destroy(struct inotify_handle *ih)
+ break;
+ }
+ watch = list_first_entry(watches, struct inotify_watch, h_list);
+- get_inotify_watch(watch);
+- mutex_unlock(&ih->mutex);
++ sb = watch->inode->i_sb;
++ how = pin_to_kill(ih, watch);
++ if (!how)
++ continue;
+
+ inode = watch->inode;
+ mutex_lock(&inode->inotify_mutex);
+@@ -518,7 +651,7 @@ void inotify_destroy(struct inotify_handle *ih)
+
+ mutex_unlock(&ih->mutex);
+ mutex_unlock(&inode->inotify_mutex);
+- put_inotify_watch(watch);
++ unpin_and_kill(watch, how);
+ }
+
+ /* free this handle: the put matching the get in inotify_init() */
+@@ -719,7 +852,9 @@ void inotify_evict_watch(struct inotify_watch *watch)
+ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
+ {
+ struct inotify_watch *watch;
++ struct super_block *sb;
+ struct inode *inode;
++ int how;
+
+ mutex_lock(&ih->mutex);
+ watch = idr_find(&ih->idr, wd);
+@@ -727,9 +862,12 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
+ mutex_unlock(&ih->mutex);
+ return -EINVAL;
+ }
+- get_inotify_watch(watch);
++ sb = watch->inode->i_sb;
++ how = pin_to_kill(ih, watch);
++ if (!how)
++ return 0;
++
+ inode = watch->inode;
+- mutex_unlock(&ih->mutex);
+
+ mutex_lock(&inode->inotify_mutex);
+ mutex_lock(&ih->mutex);
+@@ -740,7 +878,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
+
+ mutex_unlock(&ih->mutex);
+ mutex_unlock(&inode->inotify_mutex);
+- put_inotify_watch(watch);
++ unpin_and_kill(watch, how);
+
+ return 0;
+ }
+diff --git a/include/linux/inotify.h b/include/linux/inotify.h
+index bd57857..37ea289 100644
+--- a/include/linux/inotify.h
++++ b/include/linux/inotify.h
+@@ -134,6 +134,8 @@ extern void inotify_remove_watch_locked(struct inotify_handle *,
+ struct inotify_watch *);
+ extern void get_inotify_watch(struct inotify_watch *);
+ extern void put_inotify_watch(struct inotify_watch *);
++extern int pin_inotify_watch(struct inotify_watch *);
++extern void unpin_inotify_watch(struct inotify_watch *);
+
+ #else
+
+@@ -228,6 +230,15 @@ static inline void put_inotify_watch(struct inotify_watch *watch)
+ {
+ }
+
++extern inline int pin_inotify_watch(struct inotify_watch *watch)
++{
++ return 0;
++}
++
++extern inline void unpin_inotify_watch(struct inotify_watch *watch)
++{
++}
++
+ #endif /* CONFIG_INOTIFY */
+
+ #endif /* __KERNEL __ */
+diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
+index 8ba0e0d..8b50944 100644
+--- a/kernel/audit_tree.c
++++ b/kernel/audit_tree.c
+@@ -24,6 +24,7 @@ struct audit_chunk {
+ struct list_head trees; /* with root here */
+ int dead;
+ int count;
++ atomic_long_t refs;
+ struct rcu_head head;
+ struct node {
+ struct list_head list;
+@@ -56,7 +57,8 @@ static LIST_HEAD(prune_list);
+ * tree is refcounted; one reference for "some rules on rules_list refer to
+ * it", one for each chunk with pointer to it.
+ *
+- * chunk is refcounted by embedded inotify_watch.
++ * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount
++ * of watch contributes 1 to .refs).
+ *
+ * node.index allows to get from node.list to containing chunk.
+ * MSB of that sucker is stolen to mark taggings that we might have to
+@@ -121,6 +123,7 @@ static struct audit_chunk *alloc_chunk(int count)
+ INIT_LIST_HEAD(&chunk->hash);
+ INIT_LIST_HEAD(&chunk->trees);
+ chunk->count = count;
++ atomic_long_set(&chunk->refs, 1);
+ for (i = 0; i < count; i++) {
+ INIT_LIST_HEAD(&chunk->owners[i].list);
+ chunk->owners[i].index = i;
+@@ -129,9 +132,8 @@ static struct audit_chunk *alloc_chunk(int count)
+ return chunk;
+ }
+
+-static void __free_chunk(struct rcu_head *rcu)
++static void free_chunk(struct audit_chunk *chunk)
+ {
+- struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head);
+ int i;
+
+ for (i = 0; i < chunk->count; i++) {
+@@ -141,14 +143,16 @@ static void __free_chunk(struct rcu_head *rcu)
+ kfree(chunk);
+ }
+
+-static inline void free_chunk(struct audit_chunk *chunk)
++void audit_put_chunk(struct audit_chunk *chunk)
+ {
+- call_rcu(&chunk->head, __free_chunk);
++ if (atomic_long_dec_and_test(&chunk->refs))
++ free_chunk(chunk);
+ }
+
+-void audit_put_chunk(struct audit_chunk *chunk)
++static void __put_chunk(struct rcu_head *rcu)
+ {
+- put_inotify_watch(&chunk->watch);
++ struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head);
++ audit_put_chunk(chunk);
+ }
+
+ enum {HASH_SIZE = 128};
+@@ -176,7 +180,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
+
+ list_for_each_entry_rcu(p, list, hash) {
+ if (p->watch.inode == inode) {
+- get_inotify_watch(&p->watch);
++ atomic_long_inc(&p->refs);
+ return p;
+ }
+ }
+@@ -194,17 +198,49 @@ int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree)
+
+ /* tagging and untagging inodes with trees */
+
+-static void untag_chunk(struct audit_chunk *chunk, struct node *p)
++static struct audit_chunk *find_chunk(struct node *p)
++{
++ int index = p->index & ~(1U<<31);
++ p -= index;
++ return container_of(p, struct audit_chunk, owners[0]);
++}
++
++static void untag_chunk(struct node *p)
+ {
++ struct audit_chunk *chunk = find_chunk(p);
+ struct audit_chunk *new;
+ struct audit_tree *owner;
+ int size = chunk->count - 1;
+ int i, j;
+
++ if (!pin_inotify_watch(&chunk->watch)) {
++ /*
++ * Filesystem is shutting down; all watches are getting
++ * evicted, just take it off the node list for this
++ * tree and let the eviction logics take care of the
++ * rest.
++ */
++ owner = p->owner;
++ if (owner->root == chunk) {
++ list_del_init(&owner->same_root);
++ owner->root = NULL;
++ }
++ list_del_init(&p->list);
++ p->owner = NULL;
++ put_tree(owner);
++ return;
++ }
++
++ spin_unlock(&hash_lock);
++
++ /*
++ * pin_inotify_watch() succeeded, so the watch won't go away
++ * from under us.
++ */
+ mutex_lock(&chunk->watch.inode->inotify_mutex);
+ if (chunk->dead) {
+ mutex_unlock(&chunk->watch.inode->inotify_mutex);
+- return;
++ goto out;
+ }
+
+ owner = p->owner;
+@@ -221,7 +257,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct node *p)
+ inotify_evict_watch(&chunk->watch);
+ mutex_unlock(&chunk->watch.inode->inotify_mutex);
+ put_inotify_watch(&chunk->watch);
+- return;
++ goto out;
+ }
+
+ new = alloc_chunk(size);
+@@ -263,7 +299,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct node *p)
+ inotify_evict_watch(&chunk->watch);
+ mutex_unlock(&chunk->watch.inode->inotify_mutex);
+ put_inotify_watch(&chunk->watch);
+- return;
++ goto out;
+
+ Fallback:
+ // do the best we can
+@@ -277,6 +313,9 @@ Fallback:
+ put_tree(owner);
+ spin_unlock(&hash_lock);
+ mutex_unlock(&chunk->watch.inode->inotify_mutex);
++out:
++ unpin_inotify_watch(&chunk->watch);
++ spin_lock(&hash_lock);
+ }
+
+ static int create_chunk(struct inode *inode, struct audit_tree *tree)
+@@ -387,13 +426,6 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
+ return 0;
+ }
+
+-static struct audit_chunk *find_chunk(struct node *p)
+-{
+- int index = p->index & ~(1U<<31);
+- p -= index;
+- return container_of(p, struct audit_chunk, owners[0]);
+-}
+-
+ static void kill_rules(struct audit_tree *tree)
+ {
+ struct audit_krule *rule, *next;
+@@ -431,17 +463,10 @@ static void prune_one(struct audit_tree *victim)
+ spin_lock(&hash_lock);
+ while (!list_empty(&victim->chunks)) {
+ struct node *p;
+- struct audit_chunk *chunk;
+
+ p = list_entry(victim->chunks.next, struct node, list);
+- chunk = find_chunk(p);
+- get_inotify_watch(&chunk->watch);
+- spin_unlock(&hash_lock);
+-
+- untag_chunk(chunk, p);
+
+- put_inotify_watch(&chunk->watch);
+- spin_lock(&hash_lock);
++ untag_chunk(p);
+ }
+ spin_unlock(&hash_lock);
+ put_tree(victim);
+@@ -469,7 +494,6 @@ static void trim_marked(struct audit_tree *tree)
+
+ while (!list_empty(&tree->chunks)) {
+ struct node *node;
+- struct audit_chunk *chunk;
+
+ node = list_entry(tree->chunks.next, struct node, list);
+
+@@ -477,14 +501,7 @@ static void trim_marked(struct audit_tree *tree)
+ if (!(node->index & (1U<<31)))
+ break;
+
+- chunk = find_chunk(node);
+- get_inotify_watch(&chunk->watch);
+- spin_unlock(&hash_lock);
+-
+- untag_chunk(chunk, node);
+-
+- put_inotify_watch(&chunk->watch);
+- spin_lock(&hash_lock);
++ untag_chunk(node);
+ }
+ if (!tree->root && !tree->goner) {
+ tree->goner = 1;
+@@ -878,7 +895,7 @@ static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask,
+ static void destroy_watch(struct inotify_watch *watch)
+ {
+ struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch);
+- free_chunk(chunk);
++ call_rcu(&chunk->head, __put_chunk);
+ }
+
+ static const struct inotify_operations rtree_inotify_ops = {
+diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
+index b7d354e..9fd85a4 100644
+--- a/kernel/auditfilter.c
++++ b/kernel/auditfilter.c
+@@ -1094,8 +1094,8 @@ static void audit_inotify_unregister(struct list_head *in_list)
+ list_for_each_entry_safe(p, n, in_list, ilist) {
+ list_del(&p->ilist);
+ inotify_rm_watch(audit_ih, &p->wdata);
+- /* the put matching the get in audit_do_del_rule() */
+- put_inotify_watch(&p->wdata);
++ /* the unpin matching the pin in audit_do_del_rule() */
++ unpin_inotify_watch(&p->wdata);
+ }
+ }
+
+@@ -1389,9 +1389,13 @@ static inline int audit_del_rule(struct audit_entry *entry,
+ /* Put parent on the inotify un-registration
+ * list. Grab a reference before releasing
+ * audit_filter_mutex, to be released in
+- * audit_inotify_unregister(). */
+- list_add(&parent->ilist, &inotify_list);
+- get_inotify_watch(&parent->wdata);
++ * audit_inotify_unregister().
++ * If filesystem is going away, just leave
++ * the sucker alone, eviction will take
++ * care of it.
++ */
++ if (pin_inotify_watch(&parent->wdata))
++ list_add(&parent->ilist, &inotify_list);
+ }
+ }
+ }
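
For reference, the pin_inotify_watch()/unpin_inotify_watch() interface added above is meant to be used the way the kernel/auditfilter.c hunks use it: pin the watch while queueing it for removal, then unpin once inotify_rm_watch() has been called. Below is a minimal caller-side sketch of that pattern; it is not part of the patch, and the my_watcher structure, the deferred-removal list, and the my_* function names are hypothetical, used only to illustrate the calling convention.

/*
 * Illustrative sketch only: caller-side use of the new
 * pin_inotify_watch()/unpin_inotify_watch() interface, mirroring
 * audit_del_rule()/audit_inotify_unregister() above.
 */
#include <linux/inotify.h>
#include <linux/list.h>

struct my_watcher {
	struct inotify_watch wdata;	/* embedded inotify watch */
	struct list_head ilist;		/* deferred-removal list entry */
};

/* Called under a subsystem lock that may not nest inside ->s_umount. */
static void my_queue_for_removal(struct my_watcher *w, struct list_head *list)
{
	/*
	 * pin_inotify_watch() takes an active superblock reference plus a
	 * watch reference.  If it fails, the filesystem is already being
	 * unmounted and umount-time eviction will dispose of the watch,
	 * so we must not queue it (or touch it) at all.
	 */
	if (pin_inotify_watch(&w->wdata))
		list_add(&w->ilist, list);
}

/* Called after the subsystem lock has been dropped. */
static void my_unregister_all(struct inotify_handle *ih, struct list_head *list)
{
	struct my_watcher *w, *next;

	list_for_each_entry_safe(w, next, list, ilist) {
		list_del(&w->ilist);
		inotify_rm_watch(ih, &w->wdata);
		/* drops the watch reference and deactivate_super()s the sb */
		unpin_inotify_watch(&w->wdata);
	}
}

The early return in my_queue_for_removal() is exactly the case described in the commit message: if an active superblock reference can no longer be obtained, grabbing a bare watch reference would let the inode outlive its superblock, so the watch is simply left for umount-time eviction.
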
Modified: dists/sid/linux-2.6/debian/patches/series/12
==============================================================================
--- dists/sid/linux-2.6/debian/patches/series/12 (original)
+++ dists/sid/linux-2.6/debian/patches/series/12 Tue Dec 2 07:06:59 2008
@@ -1 +1,2 @@
 + bugfix/all/net-unix-gc-fix-soft-lockups-oom-issues.patch
++ bugfix/all/inotify-watch-removal-umount-races.patch
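
For clarity, debian/patches/series/12 is an incremental series file, so the leading "+" on each entry is the series-file directive to apply that patch (the extra "+" on the new line above is the diff marker). After this change the file should read:

+ bugfix/all/net-unix-gc-fix-soft-lockups-oom-issues.patch
+ bugfix/all/inotify-watch-removal-umount-races.patch
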