[Pkg-ceph-commits] [ceph] 01/01: Imported Upstream version 0.80.6

Dmitry Smirnov onlyjob at moszumanska.debian.org
Thu Oct 2 12:58:20 UTC 2014


This is an automated email from the git hooks/post-receive script.

onlyjob pushed a commit to branch upstream
in repository ceph.

commit 680e2ae (upstream)
Author: Dmitry Smirnov <onlyjob at member.fsf.org>
Date:   Thu Oct 2 12:57:25 2014

    Imported Upstream version 0.80.6
---
 ceph.spec                                        |   2 +-
 configure                                        | 240 ++++++++-
 configure.ac                                     |   9 +-
 src/.git_version                                 |   4 +-
 src/Makefile.in                                  |  86 ++--
 src/acconfig.h.in                                |   3 +
 src/ceph-disk                                    | 376 +++++++++++----
 src/ceph.in                                      |  35 +-
 src/ceph_common.sh                               |   5 +-
 src/ceph_mon.cc                                  |  22 +-
 src/ceph_osd.cc                                  |  20 +
 src/cls/rgw/cls_rgw.cc                           |   2 +-
 src/common/Finisher.h                            |   9 +
 src/common/LogClient.cc                          |   3 +-
 src/common/Makefile.am                           |   3 +
 src/common/Thread.cc                             |  34 +-
 src/common/Thread.h                              |   5 +
 src/common/WorkQueue.cc                          |  21 +
 src/common/WorkQueue.h                           |   4 +
 src/common/blkdev.cc                             |   2 +-
 src/common/config.cc                             |  10 +-
 src/common/config_opts.h                         |  12 +
 src/common/io_priority.cc                        |  54 +++
 src/common/io_priority.h                         |  44 ++
 src/common/random_cache.hpp                      | 111 +++++
 src/common/str_map.cc                            |   2 +-
 src/common/strtol.cc                             |  43 ++
 src/common/strtol.h                              |   5 +
 src/crush/CrushWrapper.cc                        | 109 ++++-
 src/crush/CrushWrapper.h                         |  14 +
 src/erasure-code/ErasureCodeInterface.h          |   2 +-
 src/erasure-code/ErasureCodePlugin.cc            |  28 ++
 src/erasure-code/ErasureCodePlugin.h             |   3 +
 src/erasure-code/jerasure/ErasureCodeJerasure.cc |   8 +-
 src/global/global_init.cc                        |  10 +-
 src/include/atomic.h                             | 123 +++--
 src/include/intarith.h                           |   2 +-
 src/include/rbd/librbd.h                         |   9 +
 src/include/rbd/librbd.hpp                       |   8 +
 src/include/str_map.h                            |   2 +-
 src/init-ceph.in                                 |  33 +-
 src/init-radosgw.sysv                            |  13 +-
 src/librados/RadosClient.cc                      |   8 +-
 src/librbd/ImageCtx.cc                           |  20 +-
 src/librbd/ImageCtx.h                            |   2 +-
 src/librbd/internal.cc                           |  42 +-
 src/librbd/internal.h                            |   2 +
 src/librbd/librbd.cc                             |  12 +
 src/mds/Locker.cc                                |   8 +-
 src/mds/MDCache.cc                               |   2 +
 src/messages/MOSDSubOp.h                         |  10 +-
 src/mon/DataHealthService.cc                     |   2 +-
 src/mon/MonCommands.h                            |   6 +-
 src/mon/Monitor.cc                               |  41 +-
 src/mon/Monitor.h                                |   3 +
 src/mon/MonmapMonitor.cc                         |   5 +
 src/mon/OSDMonitor.cc                            | 276 ++++++++++-
 src/mon/OSDMonitor.h                             |   2 +-
 src/mon/PGMonitor.cc                             |  83 +++-
 src/mon/PGMonitor.h                              |   6 +-
 src/mon/Paxos.cc                                 |   3 +-
 src/msg/SimpleMessenger.cc                       |   3 +
 src/os/FileJournal.cc                            |   7 +-
 src/os/FileStore.cc                              |  15 +-
 src/os/FileStore.h                               |  72 ++-
 src/os/GenericObjectMap.cc                       |  46 +-
 src/os/GenericObjectMap.h                        |  37 +-
 src/os/KeyValueStore.cc                          | 587 ++++++++++++-----------
 src/os/KeyValueStore.h                           | 182 ++++---
 src/os/LFNIndex.cc                               |  88 +++-
 src/os/LFNIndex.h                                |   8 +-
 src/os/MemStore.cc                               |   7 +-
 src/os/ObjectStore.cc                            |   6 +-
 src/os/ObjectStore.h                             |  26 +
 src/osd/ECBackend.cc                             |  10 +-
 src/osd/ECBackend.h                              |   2 +
 src/osd/ECMsgTypes.cc                            |  28 +-
 src/osd/ECMsgTypes.h                             |   5 +-
 src/osd/HitSet.h                                 |   2 +-
 src/osd/OSD.cc                                   | 168 +++++--
 src/osd/OSD.h                                    |  58 ++-
 src/osd/OSDMap.cc                                |  49 +-
 src/osd/OSDMap.h                                 |   3 +
 src/osd/OpRequest.cc                             |   2 +-
 src/osd/OpRequest.h                              |   4 +
 src/osd/PG.cc                                    | 275 +++++++++--
 src/osd/PG.h                                     | 125 ++++-
 src/osd/PGBackend.cc                             |  11 +-
 src/osd/PGBackend.h                              |   5 +-
 src/osd/PGLog.cc                                 |  42 +-
 src/osd/PGLog.h                                  |  71 ++-
 src/osd/ReplicatedBackend.cc                     |  10 +-
 src/osd/ReplicatedBackend.h                      |   2 +
 src/osd/ReplicatedPG.cc                          | 176 ++++---
 src/osd/ReplicatedPG.h                           |   3 +-
 src/osd/osd_types.cc                             |  22 +-
 src/osd/osd_types.h                              | 121 ++++-
 src/osdc/Objecter.cc                             |  12 +-
 src/osdc/Objecter.h                              |   2 +-
 src/pybind/rados.py                              |   7 +-
 src/pybind/rbd.py                                |   8 +
 src/rgw/rgw_common.cc                            |  10 +-
 src/rgw/rgw_op.cc                                |  72 +--
 src/rgw/rgw_rados.cc                             | 174 +++++--
 src/rgw/rgw_rados.h                              |  20 +-
 src/rgw/rgw_rest.cc                              |   4 +-
 src/rgw/rgw_rest_swift.cc                        |   6 +-
 src/test/crush/TestCrushWrapper.cc               |   5 +
 src/test/erasure-code/TestErasureCodeJerasure.cc |  30 ++
 src/test/librados/TestCase.cc                    |  32 +-
 src/test/librados/TestCase.h                     |   7 +-
 src/test/librados/io.cc                          |  52 ++
 src/test/librados/tier.cc                        | 538 +++++++--------------
 src/test/objectstore/store_test.cc               | 105 ++++
 src/test/osd/TestOSDMap.cc                       |  57 ++-
 src/test/osd/osd-test-helpers.sh                 |   1 +
 src/test/strtol.cc                               |  75 +++
 117 files changed, 4097 insertions(+), 1461 deletions(-)

diff --git a/ceph.spec b/ceph.spec
index 31b8960..20937c2 100644
--- a/ceph.spec
+++ b/ceph.spec
@@ -9,7 +9,7 @@
 # common
 #################################################################################
 Name:		ceph
-Version:	0.80.5
+Version:	0.80.6
 Release:	0%{?dist}
 Summary:	User space components of the Ceph file system
 License:	GPL-2.0
diff --git a/configure b/configure
index 8d8c0ed..81da86d 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.68 for ceph 0.80.5.
+# Generated by GNU Autoconf 2.68 for ceph 0.80.6.
 #
 # Report bugs to <ceph-devel at vger.kernel.org>.
 #
@@ -570,8 +570,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='ceph'
 PACKAGE_TARNAME='ceph'
-PACKAGE_VERSION='0.80.5'
-PACKAGE_STRING='ceph 0.80.5'
+PACKAGE_VERSION='0.80.6'
+PACKAGE_STRING='ceph 0.80.6'
 PACKAGE_BUGREPORT='ceph-devel at vger.kernel.org'
 PACKAGE_URL=''
 
@@ -1441,7 +1441,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures ceph 0.80.5 to adapt to many kinds of systems.
+\`configure' configures ceph 0.80.6 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1512,7 +1512,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of ceph 0.80.5:";;
+     short | recursive ) echo "Configuration of ceph 0.80.6:";;
    esac
   cat <<\_ACEOF
 
@@ -1657,7 +1657,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-ceph configure 0.80.5
+ceph configure 0.80.6
 generated by GNU Autoconf 2.68
 
 Copyright (C) 2010 Free Software Foundation, Inc.
@@ -2144,6 +2144,184 @@ fi
 
 } # ac_fn_c_check_header_mongrel
 
+# ac_fn_c_compute_int LINENO EXPR VAR INCLUDES
+# --------------------------------------------
+# Tries to find the compile-time value of EXPR in a program that includes
+# INCLUDES, setting VAR accordingly. Returns whether the value could be
+# computed
+ac_fn_c_compute_int ()
+{
+  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+  if test "$cross_compiling" = yes; then
+    # Depending upon the size, compute the lo and hi bounds.
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$4
+int
+main ()
+{
+static int test_array [1 - 2 * !(($2) >= 0)];
+test_array [0] = 0
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_lo=0 ac_mid=0
+  while :; do
+    cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$4
+int
+main ()
+{
+static int test_array [1 - 2 * !(($2) <= $ac_mid)];
+test_array [0] = 0
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_hi=$ac_mid; break
+else
+  as_fn_arith $ac_mid + 1 && ac_lo=$as_val
+			if test $ac_lo -le $ac_mid; then
+			  ac_lo= ac_hi=
+			  break
+			fi
+			as_fn_arith 2 '*' $ac_mid + 1 && ac_mid=$as_val
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+  done
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$4
+int
+main ()
+{
+static int test_array [1 - 2 * !(($2) < 0)];
+test_array [0] = 0
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_hi=-1 ac_mid=-1
+  while :; do
+    cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$4
+int
+main ()
+{
+static int test_array [1 - 2 * !(($2) >= $ac_mid)];
+test_array [0] = 0
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_lo=$ac_mid; break
+else
+  as_fn_arith '(' $ac_mid ')' - 1 && ac_hi=$as_val
+			if test $ac_mid -le $ac_hi; then
+			  ac_lo= ac_hi=
+			  break
+			fi
+			as_fn_arith 2 '*' $ac_mid && ac_mid=$as_val
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+  done
+else
+  ac_lo= ac_hi=
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+# Binary search between lo and hi bounds.
+while test "x$ac_lo" != "x$ac_hi"; do
+  as_fn_arith '(' $ac_hi - $ac_lo ')' / 2 + $ac_lo && ac_mid=$as_val
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$4
+int
+main ()
+{
+static int test_array [1 - 2 * !(($2) <= $ac_mid)];
+test_array [0] = 0
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_hi=$ac_mid
+else
+  as_fn_arith '(' $ac_mid ')' + 1 && ac_lo=$as_val
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+done
+case $ac_lo in #((
+?*) eval "$3=\$ac_lo"; ac_retval=0 ;;
+'') ac_retval=1 ;;
+esac
+  else
+    cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$4
+static long int longval () { return $2; }
+static unsigned long int ulongval () { return $2; }
+#include <stdio.h>
+#include <stdlib.h>
+int
+main ()
+{
+
+  FILE *f = fopen ("conftest.val", "w");
+  if (! f)
+    return 1;
+  if (($2) < 0)
+    {
+      long int i = longval ();
+      if (i != ($2))
+	return 1;
+      fprintf (f, "%ld", i);
+    }
+  else
+    {
+      unsigned long int i = ulongval ();
+      if (i != ($2))
+	return 1;
+      fprintf (f, "%lu", i);
+    }
+  /* Do not output a trailing newline, as this causes \r\n confusion
+     on some platforms.  */
+  return ferror (f) || fclose (f) != 0;
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_run "$LINENO"; then :
+  echo >>conftest.val; read $3 <conftest.val; ac_retval=0
+else
+  ac_retval=1
+fi
+rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \
+  conftest.$ac_objext conftest.beam conftest.$ac_ext
+rm -f conftest.val
+
+  fi
+  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+  as_fn_set_status $ac_retval
+
+} # ac_fn_c_compute_int
+
 # ac_fn_cxx_check_header_mongrel LINENO HEADER VAR INCLUDES
 # ---------------------------------------------------------
 # Tests whether HEADER exists, giving a warning if it cannot be compiled using
@@ -2504,7 +2682,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by ceph $as_me 0.80.5, which was
+It was created by ceph $as_me 0.80.6, which was
 generated by GNU Autoconf 2.68.  Invocation command line was
 
   $ $0 $@
@@ -4504,7 +4682,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='ceph'
- VERSION='0.80.5'
+ VERSION='0.80.6'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -12482,7 +12660,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='ceph'
- VERSION='0.80.5'
+ VERSION='0.80.6'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -18906,7 +19084,7 @@ else
 JAVA_TEST=Test.java
 CLASS_TEST=Test.class
 cat << \EOF > $JAVA_TEST
-/* #line 18909 "configure" */
+/* #line 19087 "configure" */
 public class Test {
 }
 EOF
@@ -19239,12 +19417,50 @@ fi
 fi
 if test "$HAVE_ATOMIC_OPS" = "1"; then :
 
+         # The cast to long int works around a bug in the HP C Compiler
+# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects
+# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'.
+# This bug is HP SR number 8606223364.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of AO_t" >&5
+$as_echo_n "checking size of AO_t... " >&6; }
+if ${ac_cv_sizeof_AO_t+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (AO_t))" "ac_cv_sizeof_AO_t"        "
+                                #include <atomic_ops.h>
+
+"; then :
+
+else
+  if test "$ac_cv_type_AO_t" = yes; then
+     { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error 77 "cannot compute sizeof (AO_t)
+See \`config.log' for more details" "$LINENO" 5; }
+   else
+     ac_cv_sizeof_AO_t=0
+   fi
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_AO_t" >&5
+$as_echo "$ac_cv_sizeof_AO_t" >&6; }
+
+
+
+cat >>confdefs.h <<_ACEOF
+#define SIZEOF_AO_T $ac_cv_sizeof_AO_t
+_ACEOF
+
+
+
 else
 
 $as_echo "#define NO_ATOMIC_OPS 1" >>confdefs.h
 
 fi
 
+
  if test "$HAVE_ATOMIC_OPS" = "1"; then
   WITH_LIBATOMIC_TRUE=
   WITH_LIBATOMIC_FALSE='#'
@@ -22248,7 +22464,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by ceph $as_me 0.80.5, which was
+This file was extended by ceph $as_me 0.80.6, which was
 generated by GNU Autoconf 2.68.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -22314,7 +22530,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-ceph config.status 0.80.5
+ceph config.status 0.80.6
 configured by $0, generated by GNU Autoconf 2.68,
   with options \\"\$ac_cs_config\\"
 
diff --git a/configure.ac b/configure.ac
index fb54df1..eb16aa5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -8,7 +8,7 @@ AC_PREREQ(2.59)
 # VERSION define is not used by the code.  It gets a version string
 # from 'git describe'; see src/ceph_ver.[ch]
 
-AC_INIT([ceph], [0.80.5], [ceph-devel at vger.kernel.org])
+AC_INIT([ceph], [0.80.6], [ceph-devel at vger.kernel.org])
 
 # Create release string.  Used with VERSION for RPMs.
 RPM_RELEASE=0
@@ -472,9 +472,14 @@ AS_IF([test "x$with_libatomic_ops" != xno],
                    [no libatomic-ops found (use --without-libatomic-ops to disable)])
               ])])
 AS_IF([test "$HAVE_ATOMIC_OPS" = "1"],
-	[],
+	[
+         AC_CHECK_SIZEOF(AO_t, [], [
+                                #include <atomic_ops.h>
+                                ])
+         ],
 	[AC_DEFINE([NO_ATOMIC_OPS], [1], [Defined if you do not have atomic_ops])])
 
+
 AM_CONDITIONAL(WITH_LIBATOMIC, [test "$HAVE_ATOMIC_OPS" = "1"])
 
 # newsyn?  requires mpi.
diff --git a/src/.git_version b/src/.git_version
index 6bd39d8..338f76a 100644
--- a/src/.git_version
+++ b/src/.git_version
@@ -1,2 +1,2 @@
-38b73c67d375a2552d8ed67843c8a65c2c0feba6
-v0.80.5
+f93610a4421cb670b08e974c6550ee715ac528ae
+v0.80.6
diff --git a/src/Makefile.in b/src/Makefile.in
index e42ddf4..afa524b 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -487,11 +487,11 @@ am_libcommon_la_OBJECTS = ceph_ver.lo common/DecayCounter.lo \
 	common/BackTrace.lo common/perf_counters.lo common/Mutex.lo \
 	common/OutputDataSocket.lo common/admin_socket.lo \
 	common/admin_socket_client.lo common/cmdparse.lo \
-	common/escape.lo common/Clock.lo common/Throttle.lo \
-	common/Timer.lo common/Finisher.lo common/environment.lo \
-	common/assert.lo common/run_cmd.lo common/WorkQueue.lo \
-	common/ConfUtils.lo common/MemoryModel.lo common/armor.lo \
-	common/fd.lo common/xattr.lo common/safe_io.lo \
+	common/escape.lo common/io_priority.lo common/Clock.lo \
+	common/Throttle.lo common/Timer.lo common/Finisher.lo \
+	common/environment.lo common/assert.lo common/run_cmd.lo \
+	common/WorkQueue.lo common/ConfUtils.lo common/MemoryModel.lo \
+	common/armor.lo common/fd.lo common/xattr.lo common/safe_io.lo \
 	common/snap_types.lo common/str_list.lo common/str_map.lo \
 	common/errno.lo common/RefCountedObj.lo common/blkdev.lo \
 	common/common_init.lo common/pipe.lo common/ceph_argparse.lo \
@@ -1854,11 +1854,11 @@ am__test_build_libcommon_SOURCES_DIST = test/buildtest_skeleton.cc \
 	common/perf_counters.cc common/Mutex.cc \
 	common/OutputDataSocket.cc common/admin_socket.cc \
 	common/admin_socket_client.cc common/cmdparse.cc \
-	common/escape.c common/Clock.cc common/Throttle.cc \
-	common/Timer.cc common/Finisher.cc common/environment.cc \
-	common/assert.cc common/run_cmd.cc common/WorkQueue.cc \
-	common/ConfUtils.cc common/MemoryModel.cc common/armor.c \
-	common/fd.cc common/xattr.c common/safe_io.c \
+	common/escape.c common/io_priority.cc common/Clock.cc \
+	common/Throttle.cc common/Timer.cc common/Finisher.cc \
+	common/environment.cc common/assert.cc common/run_cmd.cc \
+	common/WorkQueue.cc common/ConfUtils.cc common/MemoryModel.cc \
+	common/armor.c common/fd.cc common/xattr.c common/safe_io.c \
 	common/snap_types.cc common/str_list.cc common/str_map.cc \
 	common/errno.cc common/RefCountedObj.cc common/blkdev.cc \
 	common/common_init.cc common/pipe.c common/ceph_argparse.cc \
@@ -1891,6 +1891,7 @@ am__objects_15 = test_build_libcommon-ceph_ver.$(OBJEXT) \
 	common/test_build_libcommon-admin_socket_client.$(OBJEXT) \
 	common/test_build_libcommon-cmdparse.$(OBJEXT) \
 	common/test_build_libcommon-escape.$(OBJEXT) \
+	common/test_build_libcommon-io_priority.$(OBJEXT) \
 	common/test_build_libcommon-Clock.$(OBJEXT) \
 	common/test_build_libcommon-Throttle.$(OBJEXT) \
 	common/test_build_libcommon-Timer.$(OBJEXT) \
@@ -3098,18 +3099,18 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/neon.h arch/probe.h \
 	common/Cond.h common/ConfUtils.h common/DecayCounter.h \
 	common/Finisher.h common/Formatter.h common/perf_counters.h \
 	common/OutputDataSocket.h common/admin_socket.h \
-	common/admin_socket_client.h common/shared_cache.hpp \
-	common/tracked_int_ptr.hpp common/simple_cache.hpp \
-	common/sharedptr_registry.hpp common/map_cacher.hpp \
-	common/MemoryModel.h common/Mutex.h \
+	common/admin_socket_client.h common/random_cache.hpp \
+	common/shared_cache.hpp common/tracked_int_ptr.hpp \
+	common/simple_cache.hpp common/sharedptr_registry.hpp \
+	common/map_cacher.hpp common/MemoryModel.h common/Mutex.h \
 	common/PrebufferedStreambuf.h common/RWLock.h \
 	common/Semaphore.h common/SimpleRNG.h common/TextTable.h \
 	common/Thread.h common/Throttle.h common/Timer.h \
 	common/TrackedOp.h common/arch.h common/armor.h \
-	common/common_init.h common/pipe.h common/code_environment.h \
-	common/signal.h common/simple_spin.h common/run_cmd.h \
-	common/safe_io.h common/config.h common/config_obs.h \
-	common/config_opts.h common/ceph_crypto.h \
+	common/common_init.h common/io_priority.h common/pipe.h \
+	common/code_environment.h common/signal.h common/simple_spin.h \
+	common/run_cmd.h common/safe_io.h common/config.h \
+	common/config_obs.h common/config_opts.h common/ceph_crypto.h \
 	common/ceph_crypto_cms.h common/ceph_json.h common/lru_map.h \
 	common/utf8.h common/mime.h common/pick_address.h \
 	common/secret.h common/strtol.h common/static_assert.h \
@@ -3642,18 +3643,18 @@ noinst_HEADERS = arch/intel.h arch/neon.h arch/probe.h \
 	common/Cond.h common/ConfUtils.h common/DecayCounter.h \
 	common/Finisher.h common/Formatter.h common/perf_counters.h \
 	common/OutputDataSocket.h common/admin_socket.h \
-	common/admin_socket_client.h common/shared_cache.hpp \
-	common/tracked_int_ptr.hpp common/simple_cache.hpp \
-	common/sharedptr_registry.hpp common/map_cacher.hpp \
-	common/MemoryModel.h common/Mutex.h \
+	common/admin_socket_client.h common/random_cache.hpp \
+	common/shared_cache.hpp common/tracked_int_ptr.hpp \
+	common/simple_cache.hpp common/sharedptr_registry.hpp \
+	common/map_cacher.hpp common/MemoryModel.h common/Mutex.h \
 	common/PrebufferedStreambuf.h common/RWLock.h \
 	common/Semaphore.h common/SimpleRNG.h common/TextTable.h \
 	common/Thread.h common/Throttle.h common/Timer.h \
 	common/TrackedOp.h common/arch.h common/armor.h \
-	common/common_init.h common/pipe.h common/code_environment.h \
-	common/signal.h common/simple_spin.h common/run_cmd.h \
-	common/safe_io.h common/config.h common/config_obs.h \
-	common/config_opts.h common/ceph_crypto.h \
+	common/common_init.h common/io_priority.h common/pipe.h \
+	common/code_environment.h common/signal.h common/simple_spin.h \
+	common/run_cmd.h common/safe_io.h common/config.h \
+	common/config_obs.h common/config_opts.h common/ceph_crypto.h \
 	common/ceph_crypto_cms.h common/ceph_json.h common/lru_map.h \
 	common/utf8.h common/mime.h common/pick_address.h \
 	common/secret.h common/strtol.h common/static_assert.h \
@@ -4242,11 +4243,11 @@ libcommon_la_SOURCES = ceph_ver.c common/DecayCounter.cc \
 	common/BackTrace.cc common/perf_counters.cc common/Mutex.cc \
 	common/OutputDataSocket.cc common/admin_socket.cc \
 	common/admin_socket_client.cc common/cmdparse.cc \
-	common/escape.c common/Clock.cc common/Throttle.cc \
-	common/Timer.cc common/Finisher.cc common/environment.cc \
-	common/assert.cc common/run_cmd.cc common/WorkQueue.cc \
-	common/ConfUtils.cc common/MemoryModel.cc common/armor.c \
-	common/fd.cc common/xattr.c common/safe_io.c \
+	common/escape.c common/io_priority.cc common/Clock.cc \
+	common/Throttle.cc common/Timer.cc common/Finisher.cc \
+	common/environment.cc common/assert.cc common/run_cmd.cc \
+	common/WorkQueue.cc common/ConfUtils.cc common/MemoryModel.cc \
+	common/armor.c common/fd.cc common/xattr.c common/safe_io.c \
 	common/snap_types.cc common/str_list.cc common/str_map.cc \
 	common/errno.cc common/RefCountedObj.cc common/blkdev.cc \
 	common/common_init.cc common/pipe.c common/ceph_argparse.cc \
@@ -5765,6 +5766,8 @@ common/cmdparse.lo: common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
 common/escape.lo: common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
+common/io_priority.lo: common/$(am__dirstamp) \
+	common/$(DEPDIR)/$(am__dirstamp)
 common/Clock.lo: common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
 common/Throttle.lo: common/$(am__dirstamp) \
@@ -7614,6 +7617,8 @@ common/test_build_libcommon-cmdparse.$(OBJEXT):  \
 	common/$(am__dirstamp) common/$(DEPDIR)/$(am__dirstamp)
 common/test_build_libcommon-escape.$(OBJEXT): common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
+common/test_build_libcommon-io_priority.$(OBJEXT):  \
+	common/$(am__dirstamp) common/$(DEPDIR)/$(am__dirstamp)
 common/test_build_libcommon-Clock.$(OBJEXT): common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
 common/test_build_libcommon-Throttle.$(OBJEXT):  \
@@ -8634,6 +8639,8 @@ mostlyclean-compile:
 	-rm -f common/histogram.lo
 	-rm -f common/hobject.$(OBJEXT)
 	-rm -f common/hobject.lo
+	-rm -f common/io_priority.$(OBJEXT)
+	-rm -f common/io_priority.lo
 	-rm -f common/ipaddr.$(OBJEXT)
 	-rm -f common/ipaddr.lo
 	-rm -f common/libcommon_crc_la-crc32c.$(OBJEXT)
@@ -8735,6 +8742,7 @@ mostlyclean-compile:
 	-rm -f common/test_build_libcommon-hex.$(OBJEXT)
 	-rm -f common/test_build_libcommon-histogram.$(OBJEXT)
 	-rm -f common/test_build_libcommon-hobject.$(OBJEXT)
+	-rm -f common/test_build_libcommon-io_priority.$(OBJEXT)
 	-rm -f common/test_build_libcommon-ipaddr.$(OBJEXT)
 	-rm -f common/test_build_libcommon-linux_version.$(OBJEXT)
 	-rm -f common/test_build_libcommon-lockdep.$(OBJEXT)
@@ -9666,6 +9674,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/hex.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/histogram.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/hobject.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/io_priority.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/ipaddr.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/libcommon_crc_la-crc32c.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/libcommon_crc_la-crc32c_intel_baseline.Plo@am__quote@
@@ -9743,6 +9752,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/test_build_libcommon-hex.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/test_build_libcommon-histogram.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/test_build_libcommon-hobject.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/test_build_libcommon-io_priority.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/test_build_libcommon-ipaddr.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/test_build_libcommon-linux_version.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/test_build_libcommon-lockdep.Po@am__quote@
@@ -12855,6 +12865,20 @@ common/test_build_libcommon-cmdparse.obj: common/cmdparse.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_libcommon_CXXFLAGS) $(CXXFLAGS) -c -o common/test_build_libcommon-cmdparse.obj `if test -f 'common/cmdparse.cc'; then $(CYGPATH_W) 'common/cmdparse.cc'; else $(CYGPATH_W) '$(srcdir)/common/cmdparse.cc'; fi`
 
+common/test_build_libcommon-io_priority.o: common/io_priority.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_libcommon_CXXFLAGS) $(CXXFLAGS) -MT common/test_build_libcommon-io_priority.o -MD -MP -MF common/$(DEPDIR)/test_build_libcommon-io_priority.Tpo -c -o common/test_build_libcommon-io_priority.o `test -f 'common/io_priority.cc' || echo '$(srcdir)/'`common/io_priority.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) common/$(DEPDIR)/test_build_libcommon-io_priority.Tpo common/$(DEPDIR)/test_build_libcommon-io_priority.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='common/io_priority.cc' object='common/test_build_libcommon-io_priority.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_libcommon_CXXFLAGS) $(CXXFLAGS) -c -o common/test_build_libcommon-io_priority.o `test -f 'common/io_priority.cc' || echo '$(srcdir)/'`common/io_priority.cc
+
+common/test_build_libcommon-io_priority.obj: common/io_priority.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_libcommon_CXXFLAGS) $(CXXFLAGS) -MT common/test_build_libcommon-io_priority.obj -MD -MP -MF common/$(DEPDIR)/test_build_libcommon-io_priority.Tpo -c -o common/test_build_libcommon-io_priority.obj `if test -f 'common/io_priority.cc'; then $(CYGPATH_W) 'common/io_priority.cc'; else $(CYGPATH_W) '$(srcdir)/common/io_priority.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) common/$(DEPDIR)/test_build_libcommon-io_priority.Tpo common/$(DEPDIR)/test_build_libcommon-io_priority.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='common/io_priority.cc' object='common/test_build_libcommon-io_priority.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_libcommon_CXXFLAGS) $(CXXFLAGS) -c -o common/test_build_libcommon-io_priority.obj `if test -f 'common/io_priority.cc'; then $(CYGPATH_W) 'common/io_priority.cc'; else $(CYGPATH_W) '$(srcdir)/common/io_priority.cc'; fi`
+
 common/test_build_libcommon-Clock.o: common/Clock.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_build_libcommon_CXXFLAGS) $(CXXFLAGS) -MT common/test_build_libcommon-Clock.o -MD -MP -MF common/$(DEPDIR)/test_build_libcommon-Clock.Tpo -c -o common/test_build_libcommon-Clock.o `test -f 'common/Clock.cc' || echo '$(srcdir)/'`common/Clock.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) common/$(DEPDIR)/test_build_libcommon-Clock.Tpo common/$(DEPDIR)/test_build_libcommon-Clock.Po
diff --git a/src/acconfig.h.in b/src/acconfig.h.in
index 165c967..bed7a05 100644
--- a/src/acconfig.h.in
+++ b/src/acconfig.h.in
@@ -361,6 +361,9 @@
    your system. */
 #undef PTHREAD_CREATE_JOINABLE
 
+/* The size of `AO_t', as computed by sizeof. */
+#undef SIZEOF_AO_T
+
 /* Define to 1 if you have the ANSI C header files. */
 #undef STDC_HEADERS
 
diff --git a/src/ceph-disk b/src/ceph-disk
index c67f2f3..5d6071d 100755
--- a/src/ceph-disk
+++ b/src/ceph-disk
@@ -119,6 +119,9 @@ STATEDIR = '/var/lib/ceph'
 
 SYSCONFDIR = '/etc/ceph'
 
+# only warn once about some things
+warned_about = {}
+
 # Nuke the TERM variable to avoid confusing any subprocesses we call.
 # For example, libreadline will print weird control sequences for some
 # TERM values.
@@ -131,8 +134,6 @@ if LOG_NAME == '__main__':
 LOG = logging.getLogger(LOG_NAME)
 
 
-
-
 ###### lock ########
 
 class filelock(object):
@@ -150,8 +151,10 @@ class filelock(object):
         fcntl.lockf(self.fd, fcntl.LOCK_UN)
         self.fd = None
 
+
 ###### exceptions ########
 
+
 class Error(Exception):
     """
     Error
@@ -161,36 +164,43 @@ class Error(Exception):
         doc = self.__doc__.strip()
         return ': '.join([doc] + [str(a) for a in self.args])
 
+
 class MountError(Error):
     """
     Mounting filesystem failed
     """
 
+
 class UnmountError(Error):
     """
     Unmounting filesystem failed
     """
 
+
 class BadMagicError(Error):
     """
     Does not look like a Ceph OSD, or incompatible version
     """
 
+
 class TruncatedLineError(Error):
     """
     Line is truncated
     """
 
+
 class TooManyLinesError(Error):
     """
     Too many lines
     """
 
+
 class FilesystemTypeError(Error):
     """
     Cannot discover filesystem type
      """
 
+
 class CephDiskException(Exception):
     """
     A base exception for ceph-disk to provide custom (ad-hoc) messages that
@@ -198,12 +208,14 @@ class CephDiskException(Exception):
     """
     pass
 
+
 class ExecutableNotFound(CephDiskException):
     """
     Exception to report on executables not available in PATH
     """
     pass
 
+
 ####### utils
 
 
@@ -300,7 +312,7 @@ def command_check_call(arguments):
     otherwise.
     """
     arguments = _get_command_executable(arguments)
-    LOG.info('Running command: %s' % ' '.join(arguments))
+    LOG.info('Running command: %s', ' '.join(arguments))
     return subprocess.check_call(arguments)
 
 
@@ -340,26 +352,35 @@ def platform_information():
     )
 
 
-# a device "name" is something like
-#  sdb
-#  cciss!c0d1
 def get_dev_name(path):
     """
-    get device name from path.  e.g., /dev/sda -> sdas, /dev/cciss/c0d1 -> cciss!c0d1
+    get device name from path.  e.g.::
+
+        /dev/sda -> sdas, /dev/cciss/c0d1 -> cciss!c0d1
+
+    a device "name" is something like::
+
+        sdb
+        cciss!c0d1
+
     """
     assert path.startswith('/dev/')
     base = path[5:]
     return base.replace('/', '!')
 
-# a device "path" is something like
-#  /dev/sdb
-#  /dev/cciss/c0d1
+
 def get_dev_path(name):
     """
     get a path (/dev/...) from a name (cciss!c0d1)
+    a device "path" is something like::
+
+        /dev/sdb
+        /dev/cciss/c0d1
+
     """
     return '/dev/' + name.replace('!', '/')
 
+
 def get_dev_relpath(name):
     """
     get a relative path to /dev from a name (cciss!c0d1)
@@ -367,6 +388,29 @@ def get_dev_relpath(name):
     return name.replace('!', '/')
 
 
+def get_dev_size(dev, size='megabytes'):
+    """
+    Attempt to get the size of a device so that we can prevent errors
+    from actions to devices that are smaller, and improve error reporting.
+
+    Because we want to avoid breakage in case this approach is not robust, we
+    will issue a warning if we failed to get the size.
+
+    :param size: bytes or megabytes
+    :param dev: the device to calculate the size
+    """
+    fd = os.open(dev, os.O_RDONLY)
+    dividers = {'bytes': 1, 'megabytes': 1024*1024}
+    try:
+        device_size = os.lseek(fd, 0, os.SEEK_END)
+        divider = dividers.get(size, 1024*1024)  # default to megabytes
+        return device_size/divider
+    except Exception as error:
+        LOG.warning('failed to get size of %s: %s' % (dev, str(error)))
+    finally:
+        os.close(fd)
+
+
 def get_partition_dev(dev, pnum):
     """
     get the device name for a partition
@@ -389,6 +433,7 @@ def get_partition_dev(dev, pnum):
     else:
         raise Error('partition %d for %s does not appear to exist' % (pnum, dev))
 
+
 def list_all_partitions():
     """
     Return a list of devices and partitions
@@ -403,6 +448,7 @@ def list_all_partitions():
         dev_part_list[name] = list_partitions(name)
     return dev_part_list
 
+
 def list_partitions(basename):
     """
     Return a list of partitions on the given device name
@@ -413,6 +459,23 @@ def list_partitions(basename):
             partitions.append(name)
     return partitions
 
+def get_partition_base(dev):
+    """
+    Get the base device for a partition
+    """
+    dev = os.path.realpath(dev)
+    if not stat.S_ISBLK(os.lstat(dev).st_mode):
+        raise Error('not a block device', dev)
+
+    name = get_dev_name(dev)
+    if os.path.exists(os.path.join('/sys/block', name)):
+        raise Error('not a partition', dev)
+
+    # find the base
+    for basename in os.listdir('/sys/block'):
+        if os.path.exists(os.path.join('/sys/block', basename, name)):
+            return '/dev/' + basename
+    raise Error('no parent device for partition', dev)
 
 def is_partition(dev):
     """
@@ -476,7 +539,7 @@ def is_held(dev):
     return []
 
 
-def verify_not_in_use(dev):
+def verify_not_in_use(dev, check_partitions=False):
     """
     Verify if a given device (path) is in use (e.g. mounted or
     in use by device-mapper).
@@ -484,13 +547,13 @@ def verify_not_in_use(dev):
     :raises: Error if device is in use.
     """
     assert os.path.exists(dev)
-    if is_partition(dev):
-        if is_mounted(dev):
-            raise Error('Device is mounted', dev)
-        holders = is_held(dev)
-        if holders:
-            raise Error('Device is in use by a device-mapper mapping (dm-crypt?)' % dev, ','.join(holders))
-    else:
+    if is_mounted(dev):
+        raise Error('Device is mounted', dev)
+    holders = is_held(dev)
+    if holders:
+        raise Error('Device is in use by a device-mapper mapping (dm-crypt?)' % dev, ','.join(holders))
+
+    if check_partitions and not is_partition(dev):
         basename = get_dev_name(os.path.realpath(dev))
         for partname in list_partitions(basename):
             partition = get_dev_path(partname)
@@ -536,10 +599,12 @@ def read_one_line(parent, name):
     try:
         line = must_be_one_line(line)
     except (TruncatedLineError, TooManyLinesError) as e:
-        raise Error('File is corrupt: {path}: {msg}'.format(
+        raise Error(
+            'File is corrupt: {path}: {msg}'.format(
                 path=path,
                 msg=e,
-                ))
+            )
+        )
     return line
 
 
@@ -746,7 +811,7 @@ def dmcrypt_map(
 
     :return: Path to the dmcrypt device.
     """
-    dev = '/dev/mapper/'+ _uuid
+    dev = '/dev/mapper/' + _uuid
     args = [
         'cryptsetup',
         '--key-file',
@@ -792,6 +857,12 @@ def mount(
     Mounts a device with given filessystem type and
     mount options to a tempfile path under /var/lib/ceph/tmp.
     """
+    # sanity check: none of the arguments are None
+    if dev is None:
+        raise ValueError('dev may not be None')
+    if fstype is None:
+        raise ValueError('fstype may not be None')
+
     # pick best-of-breed mount options based on fs type
     if options is None:
         options = MOUNT_OPTIONS.get(fstype, '')
@@ -967,6 +1038,15 @@ def prepare_journal_dev(
             )
         LOG.warning('OSD will not be hot-swappable if journal is not the same device as the osd data')
 
+    dev_size = get_dev_size(journal)
+
+    if journal_size > dev_size:
+        LOG.error('refusing to create journal on %s' % journal)
+        LOG.error('journal size (%sM) is bigger than device (%sM)' % (journal_size, dev_size))
+        raise Error(
+            '%s device size (%sM) is not big enough for journal' % (journal, dev_size)
+        )
+
     try:
         LOG.debug('Creating journal partition num %d size %d on %s', num, journal_size, journal)
         command_check_call(
@@ -1044,7 +1124,7 @@ def prepare_journal_file(
 
     if not os.path.exists(journal):
         LOG.debug('Creating journal file %s with size 0 (ceph-osd will resize and allocate)', journal)
-        with file(journal, 'wb') as journal_file:
+        with file(journal, 'wb') as journal_file:  # noqa
             pass
 
     LOG.debug('Journal is file %s', journal)
@@ -1110,13 +1190,14 @@ def adjust_symlink(target, path):
         except:
             raise Error('unable to create symlink %s -> %s' % (path, target))
 
+
 def prepare_dir(
     path,
     journal,
     cluster_uuid,
     osd_uuid,
     journal_uuid,
-    journal_dmcrypt = None,
+    journal_dmcrypt=None,
     ):
 
     if os.path.exists(os.path.join(path, 'magic')):
@@ -1183,9 +1264,6 @@ def prepare_dev(
         LOG.debug('OSD data device %s is a partition', data)
         rawdev = data
     else:
-        if journal_dmcrypt is not None:
-            dmcrypt_unmap(journal)
-
         LOG.debug('Creating osd partition on %s', data)
         try:
             command_check_call(
@@ -1238,9 +1316,9 @@ def prepare_dev(
         else:
             args.extend(MKFS_ARGS.get(fstype, []))
         args.extend([
-                '--',
-                dev,
-                ])
+            '--',
+            dev,
+            ])
         try:
             LOG.debug('Creating %s fs on %s', fstype, dev)
             command_check_call(args)
@@ -1267,8 +1345,6 @@ def prepare_dev(
     finally:
         if rawdev != dev:
             dmcrypt_unmap(osd_uuid)
-        if journal_dmcrypt is not None:
-            dmcrypt_unmap(journal)
 
     if not is_partition(data):
         try:
@@ -1289,7 +1365,7 @@ def main_prepare(args):
     osd_dm_keypath = None
 
     try:
-        prepare_lock.acquire()
+        prepare_lock.acquire()  # noqa
         if not os.path.exists(args.data):
             if args.data_dev:
                 raise Error('data path does not exist', args.data)
@@ -1299,12 +1375,12 @@ def main_prepare(args):
         # in use?
         dmode = os.stat(args.data).st_mode
         if stat.S_ISBLK(dmode):
-            verify_not_in_use(args.data)
+            verify_not_in_use(args.data, True)
 
         if args.journal and os.path.exists(args.journal):
             jmode = os.stat(args.journal).st_mode
             if stat.S_ISBLK(jmode):
-                verify_not_in_use(args.journal)
+                verify_not_in_use(args.journal, False)
 
         if args.zap_disk is not None:
             if stat.S_ISBLK(dmode) and not is_partition(args.data):
@@ -1421,7 +1497,7 @@ def main_prepare(args):
                 )
         else:
             raise Error('not a dir or block device', args.data)
-        prepare_lock.release()
+        prepare_lock.release()  # noqa
 
         if stat.S_ISBLK(dmode):
             # try to make sure the kernel refreshes the table.  note
@@ -1457,7 +1533,7 @@ def main_prepare(args):
             os.unlink(journal_dm_keypath)
         if osd_dm_keypath:
             os.unlink(osd_dm_keypath)
-        prepare_lock.release()
+        prepare_lock.release()  # noqa
         raise e
 
 
@@ -1623,18 +1699,21 @@ def start_daemon(
                 [
                     svc,
                     'ceph',
+                    '--cluster',
+                    '{cluster}'.format(cluster=cluster),
                     'start',
                     'osd.{osd_id}'.format(osd_id=osd_id),
                     ],
                 )
         else:
             raise Error('{cluster} osd.{osd_id} is not tagged with an init system'.format(
-                    cluster=cluster,
-                    osd_id=osd_id,
-                    ))
+                cluster=cluster,
+                osd_id=osd_id,
+            ))
     except subprocess.CalledProcessError as e:
         raise Error('ceph osd start failed', e)
 
+
 def detect_fstype(
     dev,
     ):
@@ -1704,8 +1783,8 @@ def mount_activate(
         src_dev = os.stat(path).st_dev
         try:
             dst_dev = os.stat((STATEDIR + '/osd/{cluster}-{osd_id}').format(
-                    cluster=cluster,
-                    osd_id=osd_id)).st_dev
+                cluster=cluster,
+                osd_id=osd_id)).st_dev
             if src_dev == dst_dev:
                 active = True
             else:
@@ -1760,7 +1839,7 @@ def activate_dir(
 
     (osd_id, cluster) = activate(path, activate_key_template, init)
 
-    if init not in ( None, 'none' ):
+    if init not in (None, 'none' ):
         canonical = (STATEDIR + '/osd/{cluster}-{osd_id}').format(
             cluster=cluster,
             osd_id=osd_id)
@@ -1815,6 +1894,7 @@ def find_cluster_by_uuid(_uuid):
         return 'ceph'
     return None
 
+
 def activate(
     path,
     activate_key_template,
@@ -1861,7 +1941,7 @@ def activate(
             keyring=keyring,
             )
 
-    if init not in ( None, 'none' ):
+    if init not in (None, 'none' ):
         if init == 'auto':
             conf_val = get_conf(
                 cluster=cluster,
@@ -1912,7 +1992,7 @@ def main_activate(args):
         LOG.info('suppressed activate request on %s', args.path)
         return
 
-    activate_lock.acquire()
+    activate_lock.acquire()  # noqa
     try:
         mode = os.stat(args.path).st_mode
         if stat.S_ISBLK(mode):
@@ -1932,7 +2012,7 @@ def main_activate(args):
             if args.mark_init == 'none':
                 command_check_call(
                     [
-                    'ceph-osd',
+                        'ceph-osd',
                         '--cluster={cluster}'.format(cluster=cluster),
                         '--id={osd_id}'.format(osd_id=osd_id),
                         '--osd-data={path}'.format(path=args.path),
@@ -1943,7 +2023,7 @@ def main_activate(args):
         else:
             raise Error('%s is not a directory or block device' % args.path)
 
-        if args.mark_init not in ( None, 'none' ):
+        if args.mark_init not in (None, 'none' ):
 
             start_daemon(
                 cluster=cluster,
@@ -1951,7 +2031,7 @@ def main_activate(args):
             )
 
     finally:
-        activate_lock.release()
+        activate_lock.release()  # noqa
 
 
 ###########################
@@ -1984,6 +2064,7 @@ def get_journal_osd_uuid(path):
     LOG.debug('Journal %s has OSD UUID %s', path, value)
     return value
 
+
 def main_activate_journal(args):
     if not os.path.exists(args.dev):
         raise Error('%s does not exist' % args.dev)
@@ -1991,7 +2072,7 @@ def main_activate_journal(args):
     cluster = None
     osd_id = None
     osd_uuid = None
-    activate_lock.acquire()
+    activate_lock.acquire()  # noqa
     try:
         osd_uuid = get_journal_osd_uuid(args.dev)
         path = os.path.join('/dev/disk/by-partuuid/', osd_uuid.lower())
@@ -2008,10 +2089,12 @@ def main_activate_journal(args):
             )
 
     finally:
-        activate_lock.release()
+        activate_lock.release()  # noqa
+
 
 ###########################
 
+
 def main_activate_all(args):
     dir = '/dev/disk/by-parttypeuuid'
     LOG.debug('Scanning %s', dir)
@@ -2022,10 +2105,16 @@ def main_activate_all(args):
         if name.find('.') < 0:
             continue
         (tag, uuid) = name.split('.')
-        if tag == OSD_UUID:
-            path = os.path.join(dir, name)
+
+        if tag == OSD_UUID or tag == DMCRYPT_OSD_UUID:
+
+            if tag == DMCRYPT_OSD_UUID:
+                path = os.path.join('/dev/mapper', uuid)
+            else:
+                path = os.path.join(dir, name)
+
             LOG.info('Activating %s', path)
-            activate_lock.acquire()
+            activate_lock.acquire()  # noqa
             try:
                 (cluster, osd_id) = mount_activate(
                     dev=path,
@@ -2045,7 +2134,7 @@ def main_activate_all(args):
                 err = True
 
             finally:
-                activate_lock.release()
+                activate_lock.release()  # noqa
     if err:
         raise Error('One or more partitions failed to activate')
 
@@ -2066,6 +2155,7 @@ def is_swap(dev):
                     return True
     return False
 
+
 def get_oneliner(base, name):
     path = os.path.join(base, name)
     if os.path.isfile(path):
@@ -2073,6 +2163,7 @@ def get_oneliner(base, name):
             return _file.readline().rstrip()
     return None
 
+
 def get_dev_fs(dev):
     fscheck, _ = command(
         [
@@ -2088,7 +2179,56 @@ def get_dev_fs(dev):
     else:
         return None
 
+
 def get_partition_type(part):
+    """
+    Get the GPT partition type UUID.  If we have an old blkid and can't
+    get it that way, use sgdisk and use the description instead (and hope
+    dmcrypt isn't being used).
+    """
+    blkid, _ = command(
+        [
+            'blkid',
+            '-p',
+            '-o', 'udev',
+            part,
+        ]
+    )
+    saw_part_entry = False
+    for line in blkid.splitlines():
+        (key, value) = line.split('=')
+        if key == 'ID_PART_ENTRY_TYPE':
+            return value
+        if key == 'ID_PART_ENTRY_SCHEME':
+            table_type = value
+        if key.startswith('ID_PART_ENTRY_'):
+            saw_part_entry = True
+
+    # hmm, is it in fact GPT?
+    table_type = None
+    base = get_partition_base(part)
+    blkid, _ = command(
+        [
+            'blkid',
+            '-p',
+            '-o', 'udev',
+            base
+        ]
+    )
+    for line in blkid.splitlines():
+        (key, value) = line.split('=')
+        if key == 'ID_PART_TABLE_TYPE':
+            table_type = value
+    if table_type != 'gpt':
+        return None    # not even GPT
+
+    if saw_part_entry:
+        return None    # GPT, and blkid appears to be new, so we're done.
+
+    # bah, fall back to sgdisk.
+    if 'blkid' not in warned_about:
+        LOG.warning('Old blkid does not support ID_PART_ENTRY_* fields, trying sgdisk; may not correctly identify ceph volumes with dmcrypt')
+        warned_about['blkid'] = True
     (base, partnum) = re.match('(\D+)(\d+)', part).group(1, 2)
     sgdisk, _ = command(
         [
@@ -2104,9 +2244,16 @@ def get_partition_type(part):
             num = m.group(1)
             if num != partnum:
                 continue
-            return m.group(2)
+            desc = m.group(2)
+            # assume unencrypted ... blkid has failed us :(
+            if desc == 'ceph data':
+                return OSD_UUID
+            if desc == 'ceph journal':
+                return JOURNAL_UUID
+
     return None
 
+
 def get_partition_uuid(dev):
     (base, partnum) = re.match('(\D+)(\d+)', dev).group(1, 2)
     out, _ = command(['sgdisk', '-i', partnum, base])
@@ -2116,6 +2263,7 @@ def get_partition_uuid(dev):
             return m.group(1).lower()
     return None
 
+
 def more_osd_info(path, uuid_map):
     desc = []
     ceph_fsid = get_oneliner(path, 'ceph_fsid')
@@ -2138,6 +2286,27 @@ def more_osd_info(path, uuid_map):
 
     return desc
 
+def list_dev_osd(dev, uuid_map):
+    path = is_mounted(dev)
+    fs_type = get_dev_fs(dev)
+    desc = []
+    if path:
+        desc.append('active')
+        desc.extend(more_osd_info(path, uuid_map))
+    elif fs_type:
+        try:
+            tpath = mount(dev=dev, fstype=fs_type, options='')
+            if tpath:
+                try:
+                    magic = get_oneliner(tpath, 'magic')
+                    if magic is not None:
+                        desc.append('prepared')
+                        desc.extend(more_osd_info(tpath, uuid_map))
+                finally:
+                    unmount(tpath)
+        except MountError:
+            pass
+    return desc
 
 def list_dev(dev, uuid_map, journal_map):
     ptype = 'unknown'
@@ -2145,37 +2314,41 @@ def list_dev(dev, uuid_map, journal_map):
     if is_partition(dev):
         ptype = get_partition_type(dev)
         prefix = ' '
-    fs_type = get_dev_fs(dev)
-    path = is_mounted(dev)
 
     desc = []
-    if ptype == 'ceph data':
-        if path:
-            desc.append('active')
-            desc.extend(more_osd_info(path, uuid_map))
-        elif fs_type:
-            try:
-                tpath = mount(dev=dev, fstype=fs_type, options='')
-                if tpath:
-                    try:
-                        magic = get_oneliner(tpath, 'magic')
-                        if magic is not None:
-                            desc.append('prepared')
-                            desc.extend(more_osd_info(tpath, uuid_map))
-                    finally:
-                        unmount(tpath)
-            except MountError:
-                pass
+    if ptype == OSD_UUID:
+        desc = list_dev_osd(dev, uuid_map)
         if desc:
             desc = ['ceph data'] + desc
         else:
             desc = ['ceph data', 'unprepared']
-    elif ptype == 'ceph journal':
+    elif ptype == DMCRYPT_OSD_UUID:
+        holders = is_held(dev)
+        if not holders:
+            desc = ['ceph data (dmcrypt)', 'not currently mapped']
+        elif len(holders) == 1:
+            holder = '/dev/' + holders[0]
+            fs_desc = list_dev_osd(holder, uuid_map)
+            desc = ['ceph data (dmcrypt %s)' % holder] + fs_desc
+        else:
+            desc = ['ceph data (dmcrypt)', 'holders: ' + ','.join(holders)]
+    elif ptype == JOURNAL_UUID:
         desc.append('ceph journal')
         part_uuid = get_partition_uuid(dev)
         if part_uuid and part_uuid in journal_map:
             desc.append('for %s' % journal_map[part_uuid])
+    elif ptype == DMCRYPT_JOURNAL_UUID:
+        holders = is_held(dev)
+        if len(holders) == 1:
+            desc = ['ceph journal (dmcrypt /dev/%s)' % holders[0]]
+        else:
+            desc = ['ceph journal (dmcrypt)']
+        part_uuid = get_partition_uuid(dev)
+        if part_uuid and part_uuid in journal_map:
+            desc.append('for %s' % journal_map[part_uuid])
     else:
+        path = is_mounted(dev)
+        fs_type = get_dev_fs(dev)
         if is_swap(dev):
             desc.append('swap')
         else:
@@ -2190,7 +2363,6 @@ def list_dev(dev, uuid_map, journal_map):
     print '%s%s %s' % (prefix, dev, ', '.join(desc))
 
 
-
 def main_list(args):
     partmap = list_all_partitions()
 
@@ -2203,18 +2375,35 @@ def main_list(args):
             if part_uuid:
                 uuid_map[part_uuid] = dev
             ptype = get_partition_type(dev)
-            if ptype == 'ceph data':
+            if ptype == OSD_UUID:
                 fs_type = get_dev_fs(dev)
-                try:
-                    tpath = mount(dev=dev, fstype=fs_type, options='')
+                if fs_type is not None:
                     try:
-                        journal_uuid = get_oneliner(tpath, 'journal_uuid')
-                        if journal_uuid:
-                            journal_map[journal_uuid.lower()] = dev
-                    finally:
-                        unmount(tpath)
-                except MountError:
-                    pass
+                        tpath = mount(dev=dev, fstype=fs_type, options='')
+                        try:
+                            journal_uuid = get_oneliner(tpath, 'journal_uuid')
+                            if journal_uuid:
+                                journal_map[journal_uuid.lower()] = dev
+                        finally:
+                            unmount(tpath)
+                    except MountError:
+                        pass
+            if ptype == DMCRYPT_OSD_UUID:
+                holders = is_held(dev)
+                if len(holders) == 1:
+                    holder = '/dev/' + holders[0]
+                    fs_type = get_dev_fs(holder)
+                    if fs_type is not None:
+                        try:
+                            tpath = mount(dev=holder, fstype=fs_type, options='')
+                            try:
+                                journal_uuid = get_oneliner(tpath, 'journal_uuid')
+                                if journal_uuid:
+                                    journal_map[journal_uuid.lower()] = dev
+                            finally:
+                                unmount(tpath)
+                        except MountError:
+                            pass
 
     for base, parts in sorted(partmap.iteritems()):
         if parts:
@@ -2244,12 +2433,13 @@ def is_suppressed(path):
             return False
         base = get_dev_name(disk)
         while len(base):
-            if os.path.exists(SUPPRESS_PREFIX + base):
+            if os.path.exists(SUPPRESS_PREFIX + base):  # noqa
                 return True
             base = base[:-1]
     except:
         return False
 
+
 def set_suppress(path):
     disk = os.path.realpath(path)
     if not os.path.exists(disk):
@@ -2258,10 +2448,11 @@ def set_suppress(path):
         raise Error('not a block device', path)
     base = get_dev_name(disk)
 
-    with file(SUPPRESS_PREFIX + base, 'w') as f:
+    with file(SUPPRESS_PREFIX + base, 'w') as f:  # noqa
         pass
     LOG.info('set suppress flag on %s', base)
 
+
 def unset_suppress(path):
     disk = os.path.realpath(path)
     if not os.path.exists(disk):
@@ -2271,7 +2462,7 @@ def unset_suppress(path):
     assert disk.startswith('/dev/')
     base = get_dev_name(disk)
 
-    fn = SUPPRESS_PREFIX + base
+    fn = SUPPRESS_PREFIX + base  # noqa
     if not os.path.exists(fn):
         raise Error('not marked as suppressed', path)
 
@@ -2285,16 +2476,22 @@ def unset_suppress(path):
 def main_suppress(args):
     set_suppress(args.path)
 
+
 def main_unsuppress(args):
     unset_suppress(args.path)
 
+
 def main_zap(args):
     for dev in args.dev:
         zap(dev)
 
 ###########################
 
+
 def setup_statedir(dir):
+    # XXX The following use of globals makes linting
+    # really hard. Global state in Python is iffy and
+    # should be avoided.
     global STATEDIR
     STATEDIR = dir
 
@@ -2312,10 +2509,12 @@ def setup_statedir(dir):
     global SUPPRESS_PREFIX
     SUPPRESS_PREFIX = STATEDIR + '/tmp/suppress-activate.'
 
+
 def setup_sysconfdir(dir):
     global SYSCONFDIR
     SYSCONFDIR = dir
 
+
 def parse_args():
     parser = argparse.ArgumentParser(
         'ceph-disk',
@@ -2589,3 +2788,4 @@ def main():
 
 if __name__ == '__main__':
     main()
+    warned_about = {}
diff --git a/src/ceph.in b/src/ceph.in
index 0978882..82c9085 100755
--- a/src/ceph.in
+++ b/src/ceph.in
@@ -106,6 +106,14 @@ def mdsids():
         l.append(mdsdict['name'])
     return l
 
+# these args must be passed to all child programs
+GLOBAL_ARGS = {
+    'client_id': '--id',
+    'client_name': '--name',
+    'cluster': '--cluster',
+    'cephconf': '--conf',
+}
+
 def parse_cmdargs(args=None, target=''):
     # alias: let the line-wrapping be sane
     AP = argparse.ArgumentParser
@@ -339,15 +347,23 @@ def admin_socket(asok_path, cmd, format=''):
     return ret
 
 
-def ceph_conf(field, name):
+def ceph_conf(parsed_args, field, name):
+    args=['ceph-conf']
+
+    if name:
+        args.extend(['--name', name])
+
+    # add any args in GLOBAL_ARGS
+    for key, val in GLOBAL_ARGS.iteritems():
+        # ignore name in favor of argument name, if any
+        if name and key == 'client_name':
+            continue
+        if getattr(parsed_args, key):
+            args.extend([val, getattr(parsed_args, key)])
+
+    args.extend(['--show-config-value', field])
     p = subprocess.Popen(
-        args=[
-            'ceph-conf',
-	    '--show-config-value',
-            field,
-            '-n',
-            name,
-            ],
+        args,
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE)
     outdata, errdata = p.communicate()
@@ -538,7 +554,8 @@ def main():
             else:
                 # try resolve daemon name
                 try:
-                    sockpath = ceph_conf('admin_socket', childargs[1])
+                    sockpath = ceph_conf(parsed_args, 'admin_socket',
+                                         childargs[1])
                 except Exception as e:
                     print >> sys.stderr, \
                         'Can\'t get admin socket path: ' + str(e)
diff --git a/src/ceph_common.sh b/src/ceph_common.sh
index d78f831..07faddc 100644
--- a/src/ceph_common.sh
+++ b/src/ceph_common.sh
@@ -50,12 +50,13 @@ check_host() {
 
     #echo host for $name is $host, i am $hostname
 
-    if [ -e "/var/lib/ceph/$type/ceph-$id/upstart" ]; then
+    cluster=$1
+    if [ -e "/var/lib/ceph/$type/$cluster-$id/upstart" ]; then
 	return 1
     fi
 
     # sysvinit managed instance in standard location?
-    if [ -e "/var/lib/ceph/$type/ceph-$id/sysvinit" ]; then
+    if [ -e "/var/lib/ceph/$type/$cluster-$id/sysvinit" ]; then
 	host="$hostname"
 	echo "=== $type.$id === "
 	return 0
diff --git a/src/ceph_mon.cc b/src/ceph_mon.cc
index 4e84b4d..80b17a1 100644
--- a/src/ceph_mon.cc
+++ b/src/ceph_mon.cc
@@ -43,6 +43,8 @@ using namespace std;
 
 #include "include/assert.h"
 
+#include "erasure-code/ErasureCodePlugin.h"
+
 #define dout_subsys ceph_subsys_mon
 
 Monitor *mon = NULL;
@@ -184,6 +186,21 @@ void usage()
   generic_server_usage();
 }
 
+int preload_erasure_code()
+{
+  string directory = g_conf->osd_pool_default_erasure_code_directory;
+  string plugins = g_conf->osd_erasure_code_plugins;
+  stringstream ss;
+  int r = ErasureCodePluginRegistry::instance().preload(plugins,
+							directory,
+							ss);
+  if (r)
+    derr << ss.str() << dendl;
+  else
+    dout(10) << ss.str() << dendl;
+  return r;
+}
+
 int main(int argc, const char **argv) 
 {
   int err;
@@ -406,8 +423,7 @@ int main(int argc, const char **argv)
   // screwing us over
   Preforker prefork;
   if (!(flags & CINIT_FLAG_NO_DAEMON_ACTIONS)) {
-    if (g_conf->daemonize) {
-      global_init_prefork(g_ceph_context, 0);
+    if (global_init_prefork(g_ceph_context, 0) >= 0) {
       prefork.prefork();
       if (prefork.is_parent()) {
 	return prefork.parent_wait();
@@ -416,6 +432,8 @@ int main(int argc, const char **argv)
     }
     common_init_finish(g_ceph_context);
     global_init_chdir(g_ceph_context);
+    if (preload_erasure_code() < -1)
+      prefork.exit(1);
   }
 
   MonitorDBStore *store = new MonitorDBStore(g_conf->mon_data);
diff --git a/src/ceph_osd.cc b/src/ceph_osd.cc
index 029ef28..a2f4542 100644
--- a/src/ceph_osd.cc
+++ b/src/ceph_osd.cc
@@ -48,6 +48,8 @@ using namespace std;
 
 #include "include/assert.h"
 
+#include "erasure-code/ErasureCodePlugin.h"
+
 #define dout_subsys ceph_subsys_osd
 
 OSD *osd = NULL;
@@ -66,6 +68,21 @@ void usage()
   generic_server_usage();
 }
 
+int preload_erasure_code()
+{
+  string directory = g_conf->osd_pool_default_erasure_code_directory;
+  string plugins = g_conf->osd_erasure_code_plugins;
+  stringstream ss;
+  int r = ErasureCodePluginRegistry::instance().preload(plugins,
+							directory,
+							ss);
+  if (r)
+    derr << ss.str() << dendl;
+  else
+    dout(10) << ss.str() << dendl;
+  return r;
+}
+
 int main(int argc, const char **argv) 
 {
   vector<const char*> args;
@@ -451,6 +468,9 @@ int main(int argc, const char **argv)
     return -1;
   global_init_chdir(g_ceph_context);
 
+  if (preload_erasure_code() < -1)
+    return -1;
+
   osd = new OSD(g_ceph_context,
 		store,
 		whoami,
diff --git a/src/cls/rgw/cls_rgw.cc b/src/cls/rgw/cls_rgw.cc
index cf301f7..7a15a90 100644
--- a/src/cls/rgw/cls_rgw.cc
+++ b/src/cls/rgw/cls_rgw.cc
@@ -670,7 +670,7 @@ int rgw_bucket_complete_op(cls_method_context_t hctx, bufferlist *in, bufferlist
     unaccount_entry(header, remove_entry);
 
     if (op.log_op) {
-      rc = log_index_operation(hctx, op.name, CLS_RGW_OP_DEL, op.tag, remove_entry.meta.mtime,
+      rc = log_index_operation(hctx, remove_oid_name, CLS_RGW_OP_DEL, op.tag, remove_entry.meta.mtime,
                                remove_entry.ver, CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker);
       if (rc < 0)
         continue;
diff --git a/src/common/Finisher.h b/src/common/Finisher.h
index 173b924..610470e 100644
--- a/src/common/Finisher.h
+++ b/src/common/Finisher.h
@@ -77,6 +77,15 @@ class Finisher {
     if (logger)
       logger->inc(l_finisher_queue_len);
   }
+  void queue(list<Context*>& ls) {
+    finisher_lock.Lock();
+    finisher_queue.insert(finisher_queue.end(), ls.begin(), ls.end());
+    finisher_cond.Signal();
+    finisher_lock.Unlock();
+    ls.clear();
+    if (logger)
+      logger->inc(l_finisher_queue_len);
+  }
   
   void start();
   void stop();
diff --git a/src/common/LogClient.cc b/src/common/LogClient.cc
index 1e290b1..e4536c7 100644
--- a/src/common/LogClient.cc
+++ b/src/common/LogClient.cc
@@ -124,6 +124,7 @@ bool LogClient::are_pending()
 
 Message *LogClient::_get_mon_log_message()
 {
+  assert(log_lock.is_locked());
    if (log_queue.empty())
      return NULL;
 
@@ -149,7 +150,7 @@ Message *LogClient::_get_mon_log_message()
   assert(num_unsent <= log_queue.size());
   std::deque<LogEntry>::iterator p = log_queue.begin();
   std::deque<LogEntry> o;
-  while (p->seq < last_log_sent) {
+  while (p->seq <= last_log_sent) {
     ++p;
     assert(p != log_queue.end());
   }
diff --git a/src/common/Makefile.am b/src/common/Makefile.am
index 9769e2f..69e5ad3 100644
--- a/src/common/Makefile.am
+++ b/src/common/Makefile.am
@@ -13,6 +13,7 @@ libcommon_la_SOURCES = \
 	common/admin_socket_client.cc \
 	common/cmdparse.cc \
 	common/escape.c \
+	common/io_priority.cc \
 	common/Clock.cc \
 	common/Throttle.cc \
 	common/Timer.cc \
@@ -156,6 +157,7 @@ noinst_HEADERS += \
 	common/OutputDataSocket.h \
 	common/admin_socket.h \
 	common/admin_socket_client.h \
+	common/random_cache.hpp \
 	common/shared_cache.hpp \
 	common/tracked_int_ptr.hpp \
 	common/simple_cache.hpp \
@@ -175,6 +177,7 @@ noinst_HEADERS += \
 	common/arch.h \
 	common/armor.h \
 	common/common_init.h \
+	common/io_priority.h \
 	common/pipe.h \
 	common/code_environment.h \
 	common/signal.h \
diff --git a/src/common/Thread.cc b/src/common/Thread.cc
index 0f4e322..7be0013 100644
--- a/src/common/Thread.cc
+++ b/src/common/Thread.cc
@@ -16,6 +16,7 @@
 #include "common/code_environment.h"
 #include "common/debug.h"
 #include "common/signal.h"
+#include "common/io_priority.h"
 
 #include <dirent.h>
 #include <errno.h>
@@ -29,7 +30,10 @@
 
 
 Thread::Thread()
-  : thread_id(0)
+  : thread_id(0),
+    pid(0),
+    ioprio_class(-1),
+    ioprio_priority(-1)
 {
 }
 
@@ -38,10 +42,24 @@ Thread::~Thread()
 }
 
 void *Thread::_entry_func(void *arg) {
-  void *r = ((Thread*)arg)->entry();
+  void *r = ((Thread*)arg)->entry_wrapper();
   return r;
 }
 
+void *Thread::entry_wrapper()
+{
+  int p = ceph_gettid(); // may return -ENOSYS on other platforms
+  if (p > 0)
+    pid = p;
+  if (ioprio_class >= 0 &&
+      ioprio_priority >= 0) {
+    ceph_ioprio_set(IOPRIO_WHO_PROCESS,
+		    pid,
+		    IOPRIO_PRIO_VALUE(ioprio_class, ioprio_priority));
+  }
+  return entry();
+}
+
 const pthread_t &Thread::get_thread_id()
 {
   return thread_id;
@@ -128,3 +146,15 @@ int Thread::detach()
 {
   return pthread_detach(thread_id);
 }
+
+int Thread::set_ioprio(int cls, int prio)
+{
+  // fixme, maybe: this can race with create()
+  ioprio_class = cls;
+  ioprio_priority = prio;
+  if (pid && cls >= 0 && prio >= 0)
+    return ceph_ioprio_set(IOPRIO_WHO_PROCESS,
+			   pid,
+			   IOPRIO_PRIO_VALUE(cls, prio));
+  return 0;
+}
diff --git a/src/common/Thread.h b/src/common/Thread.h
index 4bc0254..95f63b4 100644
--- a/src/common/Thread.h
+++ b/src/common/Thread.h
@@ -21,6 +21,10 @@
 class Thread {
  private:
   pthread_t thread_id;
+  pid_t pid;
+  int ioprio_class, ioprio_priority;
+
+  void *entry_wrapper();
 
  public:
   Thread(const Thread& other);
@@ -44,6 +48,7 @@ class Thread {
   void create(size_t stacksize = 0);
   int join(void **prval = 0);
   int detach();
+  int set_ioprio(int cls, int prio);
 };
 
 #endif
diff --git a/src/common/WorkQueue.cc b/src/common/WorkQueue.cc
index f47435b..42f402f 100644
--- a/src/common/WorkQueue.cc
+++ b/src/common/WorkQueue.cc
@@ -16,6 +16,7 @@
 
 #include "include/types.h"
 #include "include/utime.h"
+#include "common/errno.h"
 #include "WorkQueue.h"
 
 #include "common/config.h"
@@ -33,6 +34,8 @@ ThreadPool::ThreadPool(CephContext *cct_, string nm, int n, const char *option)
     _stop(false),
     _pause(0),
     _draining(0),
+    ioprio_class(-1),
+    ioprio_priority(-1),
     _num_threads(n),
     last_work_queue(0),
     processing(0)
@@ -156,6 +159,11 @@ void ThreadPool::start_threads()
     WorkThread *wt = new WorkThread(this);
     ldout(cct, 10) << "start_threads creating and starting " << wt << dendl;
     _threads.insert(wt);
+
+    int r = wt->set_ioprio(ioprio_class, ioprio_priority);
+    if (r < 0)
+      lderr(cct) << " set_ioprio got " << cpp_strerror(r) << dendl;
+
     wt->create();
   }
 }
@@ -255,3 +263,16 @@ void ThreadPool::drain(WorkQueue_* wq)
   _lock.Unlock();
 }
 
+void ThreadPool::set_ioprio(int cls, int priority)
+{
+  Mutex::Locker l(_lock);
+  ioprio_class = cls;
+  ioprio_priority = priority;
+  for (set<WorkThread*>::iterator p = _threads.begin();
+       p != _threads.end();
+       ++p) {
+    int r = (*p)->set_ioprio(cls, priority);
+    if (r < 0)
+      lderr(cct) << " set_ioprio got " << cpp_strerror(r) << dendl;
+  }
+}
diff --git a/src/common/WorkQueue.h b/src/common/WorkQueue.h
index 794b577..cbf49a8 100644
--- a/src/common/WorkQueue.h
+++ b/src/common/WorkQueue.h
@@ -33,6 +33,7 @@ class ThreadPool : public md_config_obs_t {
   int _pause;
   int _draining;
   Cond _wait_cond;
+  int ioprio_class, ioprio_priority;
 
 public:
   class TPHandle {
@@ -388,6 +389,9 @@ public:
   void unpause();
   /// wait for all work to complete
   void drain(WorkQueue_* wq = 0);
+
+  /// set io priority
+  void set_ioprio(int cls, int priority);
 };
 
 class GenContextWQ :
diff --git a/src/common/blkdev.cc b/src/common/blkdev.cc
index 9c7240c..8b19abb 100644
--- a/src/common/blkdev.cc
+++ b/src/common/blkdev.cc
@@ -10,7 +10,7 @@ int get_block_device_size(int fd, int64_t *psize)
 {
 #ifdef BLKGETSIZE64
   int ret = ::ioctl(fd, BLKGETSIZE64, psize);
-#elif BLKGETSIZE
+#elif defined(BLKGETSIZE)
   unsigned long sectors = 0;
   int ret = ::ioctl(fd, BLKGETSIZE, &sectors);
   *psize = sectors * 512ULL;
diff --git a/src/common/config.cc b/src/common/config.cc
index 0ee7f58..23bfe35 100644
--- a/src/common/config.cc
+++ b/src/common/config.cc
@@ -389,12 +389,10 @@ int md_config_t::parse_argv(std::vector<const char*>& args)
     }
     else if (ceph_argparse_flag(args, i, "--foreground", "-f", (char*)NULL)) {
       set_val_or_die("daemonize", "false");
-      set_val_or_die("pid_file", "");
     }
     else if (ceph_argparse_flag(args, i, "-d", (char*)NULL)) {
       set_val_or_die("daemonize", "false");
       set_val_or_die("log_file", "");
-      set_val_or_die("pid_file", "");
       set_val_or_die("log_to_stderr", "true");
       set_val_or_die("err_to_stderr", "true");
       set_val_or_die("log_to_syslog", "false");
@@ -879,7 +877,7 @@ int md_config_t::set_val_raw(const char *val, const config_option *opt)
   switch (opt->type) {
     case OPT_INT: {
       std::string err;
-      int f = strict_strtol(val, 10, &err);
+      int f = strict_sistrtoll(val, &err);
       if (!err.empty())
 	return -EINVAL;
       *(int*)opt->conf_ptr(this) = f;
@@ -887,7 +885,7 @@ int md_config_t::set_val_raw(const char *val, const config_option *opt)
     }
     case OPT_LONGLONG: {
       std::string err;
-      long long f = strict_strtoll(val, 10, &err);
+      long long f = strict_sistrtoll(val, &err);
       if (!err.empty())
 	return -EINVAL;
       *(long long*)opt->conf_ptr(this) = f;
@@ -917,7 +915,7 @@ int md_config_t::set_val_raw(const char *val, const config_option *opt)
       return 0;
     case OPT_U32: {
       std::string err;
-      int f = strict_strtol(val, 10, &err);
+      int f = strict_sistrtoll(val, &err);
       if (!err.empty())
 	return -EINVAL;
       *(uint32_t*)opt->conf_ptr(this) = f;
@@ -925,7 +923,7 @@ int md_config_t::set_val_raw(const char *val, const config_option *opt)
     }
     case OPT_U64: {
       std::string err;
-      long long f = strict_strtoll(val, 10, &err);
+      long long f = strict_sistrtoll(val, &err);
       if (!err.empty())
 	return -EINVAL;
       *(uint64_t*)opt->conf_ptr(this) = f;
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index f8dd5f0..fe00c76 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -177,6 +177,7 @@ OPTION(mon_force_standby_active, OPT_BOOL, true) // should mons force standby-re
 OPTION(mon_warn_on_old_mons, OPT_BOOL, true) // should mons set health to WARN if part of quorum is old?
 OPTION(mon_warn_on_legacy_crush_tunables, OPT_BOOL, true) // warn if crush tunables are not optimal
 OPTION(mon_warn_on_osd_down_out_interval_zero, OPT_BOOL, true) // warn if 'mon_osd_down_out_interval == 0'
+OPTION(mon_warn_on_cache_pools_without_hit_sets, OPT_BOOL, true)
 OPTION(mon_min_osdmap_epochs, OPT_INT, 500)
 OPTION(mon_max_pgmap_epochs, OPT_INT, 500)
 OPTION(mon_max_log_epochs, OPT_INT, 500)
@@ -434,6 +435,7 @@ OPTION(osd_pool_default_erasure_code_profile,
        "k=2 "
        "m=1 "
        ) // default properties of osd pool create
+OPTION(osd_erasure_code_plugins, OPT_STR, "jerasure") // list of erasure code plugins
 OPTION(osd_pool_default_flags, OPT_INT, 0)   // default flags for new pools
 OPTION(osd_pool_default_flag_hashpspool, OPT_BOOL, true)   // use new pg hashing to prevent pool/pg overlap
 OPTION(osd_pool_default_hit_set_bloom_fpp, OPT_FLOAT, .05)
@@ -442,6 +444,7 @@ OPTION(osd_pool_default_cache_target_full_ratio, OPT_FLOAT, .8)
 OPTION(osd_pool_default_cache_min_flush_age, OPT_INT, 0)  // seconds
 OPTION(osd_pool_default_cache_min_evict_age, OPT_INT, 0)  // seconds
 OPTION(osd_hit_set_min_size, OPT_INT, 1000)  // min target size for a HitSet
+OPTION(osd_hit_set_max_size, OPT_INT, 100000)  // max target size for a HitSet
 OPTION(osd_hit_set_namespace, OPT_STR, ".ceph-internal") // rados namespace for hit_set tracking
 
 OPTION(osd_tier_default_cache_mode, OPT_STR, "writeback")
@@ -450,6 +453,7 @@ OPTION(osd_tier_default_cache_hit_set_period, OPT_INT, 1200)
 OPTION(osd_tier_default_cache_hit_set_type, OPT_STR, "bloom")
 
 OPTION(osd_map_dedup, OPT_BOOL, true)
+OPTION(osd_map_max_advance, OPT_INT, 200) // make this < cache_size!
 OPTION(osd_map_cache_size, OPT_INT, 500)
 OPTION(osd_map_message_max, OPT_INT, 100)  // max maps per MOSDMap message
 OPTION(osd_map_share_max_epochs, OPT_INT, 100)  // cap on # of inc maps we send to peers, clients
@@ -458,6 +462,8 @@ OPTION(osd_peering_wq_batch_size, OPT_U64, 20)
 OPTION(osd_op_pq_max_tokens_per_priority, OPT_U64, 4194304)
 OPTION(osd_op_pq_min_cost, OPT_U64, 65536)
 OPTION(osd_disk_threads, OPT_INT, 1)
+OPTION(osd_disk_thread_ioprio_class, OPT_STR, "") // rt realtime be besteffort best effort idle
+OPTION(osd_disk_thread_ioprio_priority, OPT_INT, -1) // 0-7
 OPTION(osd_recovery_threads, OPT_INT, 1)
 OPTION(osd_recover_clone_overlap, OPT_BOOL, true)   // preserve clone_overlap during recovery/migration
 
@@ -473,6 +479,7 @@ OPTION(osd_snap_trim_thread_timeout, OPT_INT, 60*60*1)
 OPTION(osd_snap_trim_sleep, OPT_FLOAT, 0)
 OPTION(osd_scrub_thread_timeout, OPT_INT, 60)
 OPTION(osd_scrub_finalize_thread_timeout, OPT_INT, 60*10)
+OPTION(osd_scrub_invalid_stats, OPT_BOOL, true)
 OPTION(osd_remove_thread_timeout, OPT_INT, 60*60)
 OPTION(osd_command_thread_timeout, OPT_INT, 10*60)
 OPTION(osd_age, OPT_FLOAT, .8)
@@ -509,6 +516,7 @@ OPTION(osd_scrub_min_interval, OPT_FLOAT, 60*60*24)    // if load is low
 OPTION(osd_scrub_max_interval, OPT_FLOAT, 7*60*60*24)  // regardless of load
 OPTION(osd_scrub_chunk_min, OPT_INT, 5)
 OPTION(osd_scrub_chunk_max, OPT_INT, 25)
+OPTION(osd_scrub_sleep, OPT_FLOAT, 0)   // sleep between [deep]scrub ops
 OPTION(osd_deep_scrub_interval, OPT_FLOAT, 60*60*24*7) // once a week
 OPTION(osd_deep_scrub_stride, OPT_INT, 524288)
 OPTION(osd_scan_list_ping_tp_interval, OPT_U64, 100)
@@ -690,6 +698,9 @@ OPTION(keyvaluestore_debug_check_backend, OPT_BOOL, 0) // Expensive debugging ch
 OPTION(keyvaluestore_op_threads, OPT_INT, 2)
 OPTION(keyvaluestore_op_thread_timeout, OPT_INT, 60)
 OPTION(keyvaluestore_op_thread_suicide_timeout, OPT_INT, 180)
+OPTION(keyvaluestore_default_strip_size, OPT_INT, 4096) // Only affect new object
+OPTION(keyvaluestore_max_expected_write_size, OPT_U64, 1ULL << 24) // bytes
+OPTION(keyvaluestore_header_cache_size, OPT_INT, 4096)    // Header cache size
 
 // max bytes to search ahead in journal searching for corruption
 OPTION(journal_max_corrupt_search, OPT_U64, 10<<20)
@@ -713,6 +724,7 @@ OPTION(rbd_cache_size, OPT_LONGLONG, 32<<20)         // cache size in bytes
 OPTION(rbd_cache_max_dirty, OPT_LONGLONG, 24<<20)    // dirty limit in bytes - set to 0 for write-through caching
 OPTION(rbd_cache_target_dirty, OPT_LONGLONG, 16<<20) // target dirty limit in bytes
 OPTION(rbd_cache_max_dirty_age, OPT_FLOAT, 1.0)      // seconds in cache before writeback starts
+OPTION(rbd_cache_max_dirty_object, OPT_INT, 0)       // dirty limit for objects - set to 0 for auto calculate from rbd_cache_size
 OPTION(rbd_cache_block_writes_upfront, OPT_BOOL, false) // whether to block writes to the cache before the aio_write call completes (true), or block before the aio completion is called (false)
 OPTION(rbd_concurrent_management_ops, OPT_INT, 10) // how many operations can be in flight for a management operation like deleting or resizing an image
 OPTION(rbd_balance_snap_reads, OPT_BOOL, false)
diff --git a/src/common/io_priority.cc b/src/common/io_priority.cc
new file mode 100644
index 0000000..b9eeae8
--- /dev/null
+++ b/src/common/io_priority.cc
@@ -0,0 +1,54 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <algorithm>
+#include <errno.h>
+
+#include "common/errno.h"
+#include "io_priority.h"
+
+pid_t ceph_gettid(void)
+{
+#ifdef __linux__
+  return syscall(SYS_gettid);
+#else
+  return -ENOSYS;
+#endif
+}
+
+int ceph_ioprio_set(int whence, int who, int ioprio)
+{
+#ifdef __linux__
+  return syscall(SYS_ioprio_set, whence, who, ioprio);
+#else
+  return -ENOSYS;
+#endif
+}
+
+int ceph_ioprio_string_to_class(const std::string& s)
+{
+  std::string l;
+  std::transform(s.begin(), s.end(), l.begin(), ::tolower);
+
+  if (l == "idle")
+    return IOPRIO_CLASS_IDLE;
+  if (l == "be" || l == "besteffort" || l == "best effort")
+    return IOPRIO_CLASS_BE;
+  if (l == "rt" || l == "realtime" || l == "real time")
+    return IOPRIO_CLASS_RT;
+  return -EINVAL;
+}
diff --git a/src/common/io_priority.h b/src/common/io_priority.h
new file mode 100644
index 0000000..91ebf42
--- /dev/null
+++ b/src/common/io_priority.h
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2012 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_IO_PRIORITY_H
+#define CEPH_COMMON_IO_PRIORITY_H
+
+#include <string>
+
+extern pid_t ceph_gettid();
+
+#ifndef IOPRIO_WHO_PROCESS
+# define IOPRIO_WHO_PROCESS 1
+#endif
+#ifndef IOPRIO_PRIO_VALUE
+# define IOPRIO_CLASS_SHIFT 13
+# define IOPRIO_PRIO_VALUE(class, data) \
+		(((class) << IOPRIO_CLASS_SHIFT) | (data))
+#endif
+#ifndef IOPRIO_CLASS_RT
+# define IOPRIO_CLASS_RT 1
+#endif
+#ifndef IOPRIO_CLASS_BE
+# define IOPRIO_CLASS_BE 2
+#endif
+#ifndef IOPRIO_CLASS_IDLE
+# define IOPRIO_CLASS_IDLE 3
+#endif
+
+extern int ceph_ioprio_set(int whence, int who, int ioprio);
+
+extern int ceph_ioprio_string_to_class(const std::string& s);
+
+#endif
diff --git a/src/common/random_cache.hpp b/src/common/random_cache.hpp
new file mode 100644
index 0000000..c627847
--- /dev/null
+++ b/src/common/random_cache.hpp
@@ -0,0 +1,111 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai at unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang at gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_RANDOMCACHE_H
+#define CEPH_RANDOMCACHE_H
+
+#include "common/Mutex.h"
+#include "include/compat.h"
+#include "include/unordered_map.h"
+
+
+// Although this is a random cache implementation, it still tries to make
+// the trim process more reasonable. Each item owns its lookup frequency;
+// when the cache is full it will randomly pick up several items and compare
+// their associated frequencies. The least frequently used items are evicted.
+template <class K, class V>
+class RandomCache {
+  // The first element of pair is the frequency of item, it's used to evict item
+  ceph::unordered_map<K, pair<uint64_t, V> > contents;
+  Mutex lock;
+  uint64_t max_size;
+  K last_trim_key;
+
+  // When the cache is full, consider evicting a certain number of items
+  static const uint64_t EVICT_COUNT = 5;
+  // Avoid too much overhead on comparing items' frequencies; the number of
+  // compared items is expected to be small.
+  static const uint64_t COMPARE_COUNT = 3;
+
+  // In order to make the cache eviction process more lightweight and effective,
+  // several items are expected to be evicted in one call
+  void trim_cache(uint64_t evict_count) {
+    typename ceph::unordered_map<K, pair<uint64_t, V> >::iterator it = contents.find(last_trim_key);
+    uint64_t total_compare = evict_count * COMPARE_COUNT;
+    map<uint64_t, K> candidates;
+
+    while (total_compare--) {
+      if (it == contents.end()) {
+        it = contents.begin();
+      }
+
+      candidates[it->second.first] = it->first;
+      it++;
+    }
+    if (it != contents.end())
+      last_trim_key = it->first;
+    else
+      last_trim_key = contents.begin()->first;
+
+    for (typename map<uint64_t, K>::iterator j = candidates.begin(); j != candidates.end(); j++) {
+      contents.erase(j->second);
+      evict_count--;
+      if (!evict_count)
+        break;
+    }
+  }
+
+ public:
+  RandomCache(size_t max_size=20) : lock("RandomCache::lock"),
+                                    max_size(max_size) {}
+  ~RandomCache() {
+    contents.clear();
+  }
+
+  void clear(K key) {
+    Mutex::Locker l(lock);
+    contents.erase(key);
+  }
+
+  void set_size(size_t new_size) {
+    Mutex::Locker l(lock);
+    max_size = new_size;
+    if (max_size <= contents.size()) {
+      trim_cache(contents.size() - max_size);
+    }
+  }
+
+  bool lookup(K key, V *out) {
+    Mutex::Locker l(lock);
+    typename ceph::unordered_map<K, pair<uint64_t, V> >::iterator it = contents.find(key);
+    if (it != contents.end()) {
+      it->second.first++;
+      *out = it->second.second;
+      return true;
+    }
+    return false;
+  }
+
+  void add(K key, V value) {
+    Mutex::Locker l(lock);
+    if (max_size <= contents.size()) {
+      trim_cache(EVICT_COUNT);
+    }
+    contents[key] = make_pair(1, value);
+  }
+};
+
+#endif
diff --git a/src/common/str_map.cc b/src/common/str_map.cc
index e635159..ef9b7d4 100644
--- a/src/common/str_map.cc
+++ b/src/common/str_map.cc
@@ -24,7 +24,7 @@
 using namespace std;
 
 int get_str_map(const string &str,
-                stringstream &ss,
+                ostream &ss,
                 map<string,string> *str_map)
 {
   json_spirit::mValue json;
diff --git a/src/common/strtol.cc b/src/common/strtol.cc
index 8f12f08..840b3d9 100644
--- a/src/common/strtol.cc
+++ b/src/common/strtol.cc
@@ -17,6 +17,9 @@
 #include <sstream>
 #include <stdlib.h>
 #include <string>
+extern "C" {
+#include <stdint.h>
+}
 
 using std::ostringstream;
 
@@ -124,3 +127,43 @@ float strict_strtof(const char *str, std::string *err)
   *err = "";
   return ret;
 }
+
+uint64_t strict_sistrtoll(const char *str, std::string *err)
+{
+  std::string s(str);
+  if (s.size() == 0) {
+    ostringstream oss;
+    oss << "strict_sistrtoll: value not specified";
+    *err = oss.str();
+    return 0;
+  }
+  const char &u = s.at(s.size()-1); //str[std::strlen(str)-1];
+  int m = 0;
+  if (u == 'B')
+    m = 0;
+  else if (u == 'K')
+    m = 10;
+  else if (u == 'M')
+    m = 20;
+  else if (u == 'G')
+    m = 30;
+  else if (u == 'T')
+    m = 40;
+  else if (u == 'P')
+    m = 50;
+  else if (u == 'E')
+    m = 60;
+  else
+    m = -1;
+
+  const char *v = NULL;
+  if (m >= 0)
+    s = std::string(str, s.size()-1);
+  v = s.c_str();
+
+  uint64_t r = strict_strtoll(v, 10, err);
+  if (err->empty() && m > 0) {
+    r = (r << m);
+  }
+  return r;
+}
diff --git a/src/common/strtol.h b/src/common/strtol.h
index 80b5a3f..ea0a469 100644
--- a/src/common/strtol.h
+++ b/src/common/strtol.h
@@ -16,6 +16,9 @@
 #define CEPH_COMMON_STRTOL_H
 
 #include <string>
+extern "C" {
+#include <stdint.h>
+}
 
 long long strict_strtoll(const char *str, int base, std::string *err);
 
@@ -25,4 +28,6 @@ double strict_strtod(const char *str, std::string *err);
 
 float strict_strtof(const char *str, std::string *err);
 
+uint64_t strict_sistrtoll(const char *str, std::string *err);
+
 #endif
diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc
index 4ed3fa9..31da4f5 100644
--- a/src/crush/CrushWrapper.cc
+++ b/src/crush/CrushWrapper.cc
@@ -10,17 +10,28 @@
 
 bool CrushWrapper::has_v2_rules() const
 {
-  // check rules for use of indep or new SET_* rule steps
   for (unsigned i=0; i<crush->max_rules; i++) {
-    crush_rule *r = crush->rules[i];
-    if (!r)
-      continue;
-    for (unsigned j=0; j<r->len; j++) {
-      if (r->steps[j].op == CRUSH_RULE_CHOOSE_INDEP ||
-	  r->steps[j].op == CRUSH_RULE_CHOOSELEAF_INDEP ||
-	  r->steps[j].op == CRUSH_RULE_SET_CHOOSE_TRIES ||
-	  r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_TRIES)
-	return true;
+    if (is_v2_rule(i)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool CrushWrapper::is_v2_rule(unsigned ruleid) const
+{
+  // check rule for use of indep or new SET_* rule steps
+  if (ruleid >= crush->max_rules)
+    return false;
+  crush_rule *r = crush->rules[ruleid];
+  if (!r)
+    return false;
+  for (unsigned j=0; j<r->len; j++) {
+    if (r->steps[j].op == CRUSH_RULE_CHOOSE_INDEP ||
+	r->steps[j].op == CRUSH_RULE_CHOOSELEAF_INDEP ||
+	r->steps[j].op == CRUSH_RULE_SET_CHOOSE_TRIES ||
+	r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_TRIES) {
+      return true;
     }
   }
   return false;
@@ -28,14 +39,25 @@ bool CrushWrapper::has_v2_rules() const
 
 bool CrushWrapper::has_v3_rules() const
 {
-  // check rules for use of SET_CHOOSELEAF_VARY_R step
   for (unsigned i=0; i<crush->max_rules; i++) {
-    crush_rule *r = crush->rules[i];
-    if (!r)
-      continue;
-    for (unsigned j=0; j<r->len; j++) {
-      if (r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_VARY_R)
-	return true;
+    if (is_v3_rule(i)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool CrushWrapper::is_v3_rule(unsigned ruleid) const
+{
+  // check rule for use of SET_CHOOSELEAF_VARY_R step
+  if (ruleid >= crush->max_rules)
+    return false;
+  crush_rule *r = crush->rules[ruleid];
+  if (!r)
+    return false;
+  for (unsigned j=0; j<r->len; j++) {
+    if (r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_VARY_R) {
+      return true;
     }
   }
   return false;
@@ -794,6 +816,59 @@ int CrushWrapper::add_simple_ruleset(string name, string root_name,
   return rno;
 }
 
+int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap)
+{
+  if (ruleno >= crush->max_rules)
+    return -ENOENT;
+  if (crush->rules[ruleno] == NULL)
+    return -ENOENT;
+  crush_rule *rule = crush->rules[ruleno];
+
+  // build a weight map for each TAKE in the rule, and then merge them
+  for (unsigned i=0; i<rule->len; ++i) {
+    map<int,float> m;
+    float sum = 0;
+    if (rule->steps[i].op == CRUSH_RULE_TAKE) {
+      int n = rule->steps[i].arg1;
+      if (n >= 0) {
+	m[n] = 1.0;
+	sum = 1.0;
+      } else {
+	list<int> q;
+	q.push_back(n);
+	//breadth first iterate the OSD tree
+	while (!q.empty()) {
+	  int bno = q.front();
+	  q.pop_front();
+	  crush_bucket *b = crush->buckets[-1-bno];
+	  assert(b);
+	  for (unsigned j=0; j<b->size; ++j) {
+	    int item_id = b->items[j];
+	    if (item_id >= 0) //it's an OSD
+	    {
+	      float w = crush_get_bucket_item_weight(b, j);
+	      m[item_id] = w;
+	      sum += w;
+	    }
+	    else //not an OSD, expand the child later
+	      q.push_back(item_id);
+	  }
+	}
+      }
+    }
+    for (map<int,float>::iterator p = m.begin(); p != m.end(); ++p) {
+      map<int,float>::iterator q = pmap->find(p->first);
+      if (q == pmap->end()) {
+	(*pmap)[p->first] = p->second / sum;
+      } else {
+	q->second += p->second / sum;
+      }
+    }
+  }
+
+  return 0;
+}
+
 int CrushWrapper::remove_rule(int ruleno)
 {
   if (ruleno >= (int)crush->max_rules)
diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h
index 282cbeb..d5d4f4f 100644
--- a/src/crush/CrushWrapper.h
+++ b/src/crush/CrushWrapper.h
@@ -216,6 +216,8 @@ public:
   bool has_v2_rules() const;
   bool has_v3_rules() const;
 
+  bool is_v2_rule(unsigned ruleid) const;
+  bool is_v3_rule(unsigned ruleid) const;
 
   // bucket types
   int get_num_type_names() const {
@@ -631,6 +633,18 @@ public:
     return s->arg2;
   }
 
+  /**
+   * calculate a map of osds to weights for a given rule
+   *
+   * Generate a map of which OSDs get how much relative weight for a
+   * given rule.
+   *
+   * @param ruleno [in] rule id
+   * @param pmap [out] map of osd to weight
+   * @return 0 for success, or negative error code
+   */
+  int get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap);
+
   /* modifiers */
   int add_rule(int len, int ruleset, int type, int minsize, int maxsize, int ruleno) {
     if (!crush) return -ENOENT;
diff --git a/src/erasure-code/ErasureCodeInterface.h b/src/erasure-code/ErasureCodeInterface.h
index f8e22d1..1dc12c5 100644
--- a/src/erasure-code/ErasureCodeInterface.h
+++ b/src/erasure-code/ErasureCodeInterface.h
@@ -167,7 +167,7 @@ namespace ceph {
      * @param [in] name of the ruleset to create
      * @param [in] crush crushmap in which the ruleset is created
      * @param [out] ss contains informative messages when an error occurs
-     * @return **0** on success or a negative errno on error.
+     * @return a ruleset on success or a negative errno on error.
      */
     virtual int create_ruleset(const string &name,
 			       CrushWrapper &crush,
diff --git a/src/erasure-code/ErasureCodePlugin.cc b/src/erasure-code/ErasureCodePlugin.cc
index da075d2..3ce0563 100644
--- a/src/erasure-code/ErasureCodePlugin.cc
+++ b/src/erasure-code/ErasureCodePlugin.cc
@@ -4,6 +4,7 @@
  * Ceph - scalable distributed file system
  *
  * Copyright (C) 2013,2014 Cloudwatt <libre.licensing at cloudwatt.com>
+ * Copyright (C) 2014 Red Hat <contact at redhat.com>
  *
  * Author: Loic Dachary <loic at dachary.org>
  *
@@ -19,6 +20,7 @@
 
 #include "ErasureCodePlugin.h"
 #include "common/errno.h"
+#include "include/str_list.h"
 
 #define PLUGIN_PREFIX "libec_"
 #define PLUGIN_SUFFIX ".so"
@@ -130,6 +132,32 @@ int ErasureCodePluginRegistry::load(const std::string &plugin_name,
 
   (*plugin)->library = library;
 
+  ss << __func__ << ": " << plugin_name << " ";
+
   return 0;
 }
 
+int ErasureCodePluginRegistry::preload(const std::string &plugins,
+				       const std::string &directory,
+				       ostream &ss)
+{
+  map<string,string> profile;
+  profile["directory"] = directory;
+  list<string> plugins_list;
+  get_str_list(plugins, plugins_list);
+  for (list<string>::iterator i = plugins_list.begin();
+       i != plugins_list.end();
+       i++) {
+    ErasureCodePlugin *plugin;
+    int r = load(*i, profile, &plugin, ss);
+    if (r)
+      return r;
+
+    ErasureCodeInterfaceRef erasure_code;
+    profile["technique"] = "reed_sol_van";
+    r = plugin->factory(profile, &erasure_code);
+    if (r)
+      return r;
+  }
+  return 0;
+}
diff --git a/src/erasure-code/ErasureCodePlugin.h b/src/erasure-code/ErasureCodePlugin.h
index e891079..7f0b1e9 100644
--- a/src/erasure-code/ErasureCodePlugin.h
+++ b/src/erasure-code/ErasureCodePlugin.h
@@ -67,6 +67,9 @@ namespace ceph {
 	     ErasureCodePlugin **plugin,
 	     ostream &ss);
 
+    int preload(const std::string &plugins,
+		const std::string &directory,
+		ostream &ss);
   };
 }
 
diff --git a/src/erasure-code/jerasure/ErasureCodeJerasure.cc b/src/erasure-code/jerasure/ErasureCodeJerasure.cc
index 6d0f653..06ccc58 100644
--- a/src/erasure-code/jerasure/ErasureCodeJerasure.cc
+++ b/src/erasure-code/jerasure/ErasureCodeJerasure.cc
@@ -44,8 +44,12 @@ int ErasureCodeJerasure::create_ruleset(const string &name,
 					CrushWrapper &crush,
 					ostream *ss) const
 {
-  return crush.add_simple_ruleset(name, ruleset_root, ruleset_failure_domain,
-				  "indep", pg_pool_t::TYPE_ERASURE, ss);
+  int ruleid = crush.add_simple_ruleset(name, ruleset_root, ruleset_failure_domain,
+					"indep", pg_pool_t::TYPE_ERASURE, ss);
+  if (ruleid < 0)
+    return ruleid;
+  else
+    return crush.get_rule_mask_ruleset(ruleid);
 }
 
 void ErasureCodeJerasure::init(const map<string,string> &parameters)
diff --git a/src/global/global_init.cc b/src/global/global_init.cc
index 7b20343..f03677c 100644
--- a/src/global/global_init.cc
+++ b/src/global/global_init.cc
@@ -166,8 +166,16 @@ int global_init_prefork(CephContext *cct, int flags)
   if (g_code_env != CODE_ENVIRONMENT_DAEMON)
     return -1;
   const md_config_t *conf = cct->_conf;
-  if (!conf->daemonize)
+  if (!conf->daemonize) {
+    if (atexit(pidfile_remove_void)) {
+      derr << "global_init_daemonize: failed to set pidfile_remove function "
+	   << "to run at exit." << dendl;
+    }
+
+    pidfile_write(g_conf);
+
     return -1;
+  }
 
   // stop log thread
   g_ceph_context->_log->flush();
diff --git a/src/include/atomic.h b/src/include/atomic.h
index 537fa98..c1650be 100644
--- a/src/include/atomic.h
+++ b/src/include/atomic.h
@@ -21,10 +21,66 @@
 #endif
 
 #include <stdlib.h>
+#include "include/Spinlock.h"
+
+namespace ceph {
+  template <class T>
+  class atomic_spinlock_t {
+    mutable ceph_spinlock_t lock;
+    T val;
+  public:
+    atomic_spinlock_t(T i=0)
+      : val(i) {
+      ceph_spin_init(&lock);
+    }
+    ~atomic_spinlock_t() {
+      ceph_spin_destroy(&lock);
+    }
+    void set(T v) {
+      ceph_spin_lock(&lock);
+      val = v;
+      ceph_spin_unlock(&lock);
+    }
+    T inc() {
+      ceph_spin_lock(&lock);
+      T r = ++val;
+      ceph_spin_unlock(&lock);
+      return r;
+    }
+    T dec() {
+      ceph_spin_lock(&lock);
+      T r = --val;
+      ceph_spin_unlock(&lock);
+      return r;
+    }
+    void add(T d) {
+      ceph_spin_lock(&lock);
+      val += d;
+      ceph_spin_unlock(&lock);
+    }
+    void sub(T d) {
+      ceph_spin_lock(&lock);
+      val -= d;
+      ceph_spin_unlock(&lock);
+    }
+    T read() const {
+      T ret;
+      ceph_spin_lock(&lock);
+      ret = val;
+      ceph_spin_unlock(&lock);
+      return ret;
+    }
+  private:
+    // forbid copying
+    atomic_spinlock_t(const atomic_spinlock_t<T> &other);
+    atomic_spinlock_t &operator=(const atomic_spinlock_t<T> &rhs);
+  };
+}
 
 #ifndef NO_ATOMIC_OPS
 
 // libatomic_ops implementation
+#define AO_REQUIRE_CAS
 #include <atomic_ops.h>
 
 // reinclude our assert to clobber the system one
@@ -35,7 +91,7 @@ namespace ceph {
     AO_t val;
   public:
     atomic_t(AO_t i=0) : val(i) {}
-    void set(size_t v) {
+    void set(AO_t v) {
       AO_store(&val, v);
     }
     AO_t inc() {
@@ -47,8 +103,8 @@ namespace ceph {
     void add(AO_t add_me) {
       AO_fetch_and_add(&val, add_me);
     }
-    void sub(int sub_me) {
-      int negsub = 0 - sub_me;
+    void sub(AO_t sub_me) {
+      AO_t negsub = 0 - sub_me;
       AO_fetch_and_add_write(&val, (AO_t)negsub);
     }
     AO_t read() const {
@@ -62,7 +118,15 @@ namespace ceph {
     atomic_t(const atomic_t &other);
     atomic_t &operator=(const atomic_t &rhs);
   };
+
+#if SIZEOF_AO_T == 8
+  typedef atomic_t atomic64_t;
+#else
+  typedef atomic_spinlock_t<unsigned long long> atomic64_t;
+#endif
+
 }
+
 #else
 /*
  * crappy slow implementation that uses a pthreads spinlock.
@@ -70,56 +134,9 @@ namespace ceph {
 #include "include/Spinlock.h"
 
 namespace ceph {
-  class atomic_t {
-    mutable ceph_spinlock_t lock;
-    signed long val;
-  public:
-    atomic_t(int i=0)
-      : val(i) {
-      ceph_spin_init(&lock);
-    }
-    ~atomic_t() {
-      ceph_spin_destroy(&lock);
-    }
-    void set(size_t v) {
-      ceph_spin_lock(&lock);
-      val = v;
-      ceph_spin_unlock(&lock);
-    }
-    int inc() {
-      ceph_spin_lock(&lock);
-      int r = ++val;
-      ceph_spin_unlock(&lock);
-      return r;
-    }
-    int dec() {
-      ceph_spin_lock(&lock);
-      int r = --val;
-      ceph_spin_unlock(&lock);
-      return r;
-    }
-    void add(int d) {
-      ceph_spin_lock(&lock);
-      val += d;
-      ceph_spin_unlock(&lock);
-    }
-    void sub(int d) {
-      ceph_spin_lock(&lock);
-      val -= d;
-      ceph_spin_unlock(&lock);
-    }
-    int read() const {
-      signed long ret;
-      ceph_spin_lock(&lock);
-      ret = val;
-      ceph_spin_unlock(&lock);
-      return ret;
-    }
-  private:
-    // forbid copying
-    atomic_t(const atomic_t &other);
-    atomic_t &operator=(const atomic_t &rhs);
-  };
+  typedef atomic_spinlock_t<unsigned> atomic_t;
+  typedef atomic_spinlock_t<unsigned long long> atomic64_t;
 }
+
 #endif
 #endif
diff --git a/src/include/intarith.h b/src/include/intarith.h
index 640129c..2c27cec 100644
--- a/src/include/intarith.h
+++ b/src/include/intarith.h
@@ -28,7 +28,7 @@
 #endif
 
 #ifndef ROUND_UP_TO
-# define ROUND_UP_TO(n, d) (((n)+(d)-1) & ~((d)-1))
+# define ROUND_UP_TO(n, d) ((n)%(d) ? ((n)+(d)-(n)%(d)) : (n))
 #endif
 
 #ifndef SHIFT_ROUND_UP
diff --git a/src/include/rbd/librbd.h b/src/include/rbd/librbd.h
index 5be8203..1e87af9 100644
--- a/src/include/rbd/librbd.h
+++ b/src/include/rbd/librbd.h
@@ -39,6 +39,7 @@ extern "C" {
 
 #define LIBRBD_SUPPORTS_WATCH 0
 #define LIBRBD_SUPPORTS_AIO_FLUSH 1
+#define LIBRBD_SUPPORTS_INVALIDATE 1
 
 typedef void *rbd_snap_t;
 typedef void *rbd_image_t;
@@ -376,6 +377,14 @@ int rbd_flush(rbd_image_t image);
  */
 int rbd_aio_flush(rbd_image_t image, rbd_completion_t c);
 
+/**
+ * Drop any cached data for an image
+ *
+ * @param image the image to invalidate cached data for
+ * @returns 0 on success, negative error code on failure
+ */
+int rbd_invalidate_cache(rbd_image_t image);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/include/rbd/librbd.hpp b/src/include/rbd/librbd.hpp
index 697fc6c..caf61a6 100644
--- a/src/include/rbd/librbd.hpp
+++ b/src/include/rbd/librbd.hpp
@@ -216,6 +216,14 @@ public:
    */
   int aio_flush(RBD::AioCompletion *c);
 
+  /**
+   * Drop any cached data for an image
+   *
+   * @param image the image to invalidate cached data for
+   * @returns 0 on success, negative error code on failure
+   */
+  int invalidate_cache();
+
 private:
   friend class RBD;
 
diff --git a/src/include/str_map.h b/src/include/str_map.h
index efae903..eabe8d2 100644
--- a/src/include/str_map.h
+++ b/src/include/str_map.h
@@ -53,7 +53,7 @@
  * @return **0** on success or a -EINVAL on error.
  */
 extern int get_str_map(const std::string &str,
-		       std::stringstream &ss,
+		       std::ostream &ss,
 		       std::map<std::string,std::string> *str_map);
 
 #endif
diff --git a/src/init-ceph.in b/src/init-ceph.in
index 95723b0..7276830 100644
--- a/src/init-ceph.in
+++ b/src/init-ceph.in
@@ -31,6 +31,7 @@ fi
 usage_exit() {
     echo "usage: $0 [options] {start|stop|restart|condrestart} [mon|osd|mds]..."
     printf "\t-c ceph.conf\n"
+    printf "\t--cluster [cluster name]\tdefine the cluster name\n"
     printf "\t--valgrind\trun via valgrind\n"
     printf "\t--hostname [hostname]\toverride hostname lookup\n"
     exit
@@ -113,6 +114,8 @@ monaddr=
 dofsmount=1
 dofsumount=0
 verbose=0
+use_default_conf=1
+
 
 while echo $1 | grep -q '^-'; do     # FIXME: why not '^-'?
 case $1 in
@@ -153,8 +156,15 @@ case $1 in
 	    [ -z "$2" ] && usage_exit
 	    options="$options $1"
 	    shift
+        use_default_conf=0
 	    conf=$1
 	    ;;
+    --cluster )
+	    [ -z "$2" ] && usage_exit
+	    options="$options $1"
+	    shift
+	    cluster=$1
+	    ;;
     --hostname )
 	    [ -z "$2" ] && usage_exit
 	    options="$options $1"
@@ -170,6 +180,20 @@ options="$options $1"
 shift
 done
 
+
+# if `--cluster` was not passed in, fallback to looking at the config name
+if [ -z "$cluster" ]; then
+    cluster=`echo $conf | awk -F'/' '{print $(NF)}' | cut -d'.' -f 1`
+else
+    # if we were told to use a given cluster name then $conf needs to be updated
+    # but just define it if `--conf` was not specified, otherwise we would be silently
+    # overriding $conf even if it was defined with `--conf`
+    if [ $use_default_conf -eq 1 ]; then
+        conf="/etc/ceph/$cluster.conf"
+    fi
+fi
+
+
 verify_conf
 
 command=$1
@@ -189,11 +213,10 @@ fi
 for name in $what; do
     type=`echo $name | cut -c 1-3`   # e.g. 'mon', if $item is 'mon1'
     id=`echo $name | cut -c 4- | sed 's/^\\.//'`
-    cluster=`echo $conf | awk -F'/' '{print $(NF)}' | cut -d'.' -f 1`
     num=$id
     name="$type.$id"
 
-    check_host || continue
+    check_host $cluster || continue
 
     binary="$BINDIR/ceph-$type"
     cmd="$binary -i $id"
@@ -235,7 +258,7 @@ for name in $what; do
     cmd="$cmd -c $conf"
 
     if echo $name | grep -q ^osd; then
-	get_conf osd_data "/var/lib/ceph/osd/ceph-$id" "osd data"
+	get_conf osd_data "/var/lib/ceph/osd/$cluster-$id" "osd data"
 	get_conf fs_path "$osd_data" "fs path"  # mount point defaults so osd data
         get_conf fs_devs "" "devs"
 	if [ -z "$fs_devs" ]; then
@@ -335,7 +358,7 @@ for name in $what; do
 		if [ "${update_crush:-1}" = "1" -o "${update_crush:-1}" = "true" ]; then
 		    # update location in crush
 		    get_conf osd_location_hook "$BINDIR/ceph-crush-location" "osd crush location hook"
-		    osd_location=`$osd_location_hook --cluster ceph --id $id --type osd`
+		    osd_location=`$osd_location_hook --cluster $cluster --id $id --type osd`
 		    get_conf osd_weight "" "osd crush initial weight"
 		    defaultweight="$(df -P -k $osd_data/. | tail -1 | awk '{ print sprintf("%.2f",$2/1073741824) }')"
 		    get_conf osd_keyring "$osd_data/keyring" "keyring"
@@ -366,7 +389,7 @@ for name in $what; do
 		get_conf mon_data "/var/lib/ceph/mon/ceph-$id" "mon data"
 		if [ "$mon_data" = "/var/lib/ceph/mon/ceph-$id" -a "$asok" = "/var/run/ceph/ceph-mon.$id.asok" ]; then
 		    echo Starting ceph-create-keys on $host...
-		    cmd2="$SBINDIR/ceph-create-keys -i $id 2> /dev/null &"
+		    cmd2="$SBINDIR/ceph-create-keys --cluster $cluster -i $id 2> /dev/null &"
 		    do_cmd "$cmd2"
 		fi
 	    fi
diff --git a/src/init-radosgw.sysv b/src/init-radosgw.sysv
index ab6b250..dd3dbb0 100644
--- a/src/init-radosgw.sysv
+++ b/src/init-radosgw.sysv
@@ -15,6 +15,7 @@ PATH=/sbin:/bin:/usr/bin
 
 daemon_is_running() {
     daemon=$1
+    sleep 1
     if pidof $daemon >/dev/null; then
         echo "$daemon is running."
         exit 0
@@ -44,6 +45,10 @@ if [ ! -x "$RADOSGW" ]; then
     exit 1
 fi
 
+# detect systemd
+SYSTEMD=0
+grep -qs systemd /proc/1/comm && SYSTEMD=1
+
 case "$1" in
     start)
         echo "Starting radosgw instance(s)..."
@@ -79,8 +84,12 @@ case "$1" in
                 chown $user $log_file
             fi
 
-            #start-stop-daemon --start -u $user -x $RADOSGW -- -n $name
-            daemon --user="$user" "ulimit -n 32768; $RADOSGW -n $name"
+            if [ $SYSTEMD -eq 1 ]; then
+                systemd-run -r bash -c "ulimit -n 32768; $RADOSGW -n $name"
+            else
+                #start-stop-daemon --start -u $user -x $RADOSGW -- -n $name
+                daemon --user="$user" "ulimit -n 32768; $RADOSGW -n $name"
+            fi
             echo "Starting $name..."
         done
         daemon_is_running $RADOSGW
diff --git a/src/librados/RadosClient.cc b/src/librados/RadosClient.cc
index 249c34f..9330e65 100644
--- a/src/librados/RadosClient.cc
+++ b/src/librados/RadosClient.cc
@@ -103,8 +103,10 @@ int64_t librados::RadosClient::lookup_pool(const char *name)
   lock.Lock();
 
   int r = wait_for_osdmap();
-  if (r < 0)
+  if (r < 0) {
+    lock.Unlock();
     return r;
+  }
   int64_t ret = osdmap.lookup_pg_pool_name(name);
   pool_cache_rwl.get_write();
   lock.Unlock();
@@ -582,8 +584,10 @@ int librados::RadosClient::pool_delete(const char *name)
 {
   lock.Lock();
   int r = wait_for_osdmap();
-  if (r < 0)
+  if (r < 0) {
+    lock.Unlock();
     return r;
+  }
   int tmp_pool_id = osdmap.lookup_pg_pool_name(name);
   if (tmp_pool_id < 0) {
     lock.Unlock();
diff --git a/src/librbd/ImageCtx.cc b/src/librbd/ImageCtx.cc
index 6477e8d..b5c2db6 100644
--- a/src/librbd/ImageCtx.cc
+++ b/src/librbd/ImageCtx.cc
@@ -185,10 +185,14 @@ namespace librbd {
 
     // size object cache appropriately
     if (object_cacher) {
-      uint64_t obj = cct->_conf->rbd_cache_size / (1ull << order);
+      uint64_t obj = cct->_conf->rbd_cache_max_dirty_object;
+      if (!obj) {
+        obj = cct->_conf->rbd_cache_size / (1ull << order);
+        obj = obj * 4 + 10;
+      }
       ldout(cct, 10) << " cache bytes " << cct->_conf->rbd_cache_size << " order " << (int)order
 		     << " -> about " << obj << " objects" << dendl;
-      object_cacher->set_max_objects(obj * 4 + 10);
+      object_cacher->set_max_objects(obj);
     }
 
     ldout(cct, 10) << "init_layout stripe_unit " << stripe_unit
@@ -573,9 +577,9 @@ namespace librbd {
     object_cacher->stop();
   }
 
-  void ImageCtx::invalidate_cache() {
+  int ImageCtx::invalidate_cache() {
     if (!object_cacher)
-      return;
+      return 0;
     cache_lock.Lock();
     object_cacher->release_set(object_set);
     cache_lock.Unlock();
@@ -585,8 +589,12 @@ namespace librbd {
     cache_lock.Lock();
     bool unclean = object_cacher->release_set(object_set);
     cache_lock.Unlock();
-    if (unclean)
-      lderr(cct) << "could not release all objects from cache" << dendl;
+    if (unclean) {
+      lderr(cct) << "could not release all objects from cache: "
+                 << unclean << " bytes remain" << dendl;
+      return -EBUSY;
+    }
+    return r;
   }
 
   void ImageCtx::clear_nonexistence_cache() {
diff --git a/src/librbd/ImageCtx.h b/src/librbd/ImageCtx.h
index 026a3e0..83ed044 100644
--- a/src/librbd/ImageCtx.h
+++ b/src/librbd/ImageCtx.h
@@ -139,7 +139,7 @@ namespace librbd {
     void flush_cache_aio(Context *onfinish);
     int flush_cache();
     void shutdown_cache();
-    void invalidate_cache();
+    int invalidate_cache();
     void clear_nonexistence_cache();
     int register_watch();
     void unregister_watch();
diff --git a/src/librbd/internal.cc b/src/librbd/internal.cc
index 127be38..afa4660 100644
--- a/src/librbd/internal.cc
+++ b/src/librbd/internal.cc
@@ -832,6 +832,9 @@ reprotect_and_return_err:
 	     bool old_format, uint64_t features, int *order,
 	     uint64_t stripe_unit, uint64_t stripe_count)
   {
+    if (!order)
+      return -EINVAL;
+
     CephContext *cct = (CephContext *)io_ctx.cct();
     ldout(cct, 20) << "create " << &io_ctx << " name = " << imgname
 		   << " size = " << size << " old_format = " << old_format
@@ -857,9 +860,6 @@ reprotect_and_return_err:
       return -EEXIST;
     }
 
-    if (!order)
-      return -EINVAL;
-
     if (!*order)
       *order = cct->_conf->rbd_default_order;
     if (!*order)
@@ -1275,6 +1275,19 @@ reprotect_and_return_err:
       return r;
     }
     ictx->parent->snap_set(ictx->parent->snap_name);
+    ictx->parent->parent_lock.get_write();
+    r = refresh_parent(ictx->parent);
+    if (r < 0) {
+      lderr(ictx->cct) << "error refreshing parent snapshot "
+		       << ictx->parent->id << " "
+		       << ictx->parent->snap_name << dendl;
+      ictx->parent->parent_lock.put_write();
+      ictx->parent->snap_lock.put_write();
+      close_image(ictx->parent);
+      ictx->parent = NULL;
+      return r;
+    }
+    ictx->parent->parent_lock.put_write();
     ictx->parent->snap_lock.put_write();
 
     return 0;
@@ -1504,7 +1517,9 @@ reprotect_and_return_err:
     if (size < ictx->size && ictx->object_cacher) {
       // need to invalidate since we're deleting objects, and
       // ObjectCacher doesn't track non-existent objects
-      ictx->invalidate_cache();
+      r = ictx->invalidate_cache();
+      if (r < 0)
+	return r;
     }
     resize_helper(ictx, size, prog_ctx);
 
@@ -1847,7 +1862,9 @@ reprotect_and_return_err:
     // need to flush any pending writes before resizing and rolling back -
     // writes might create new snapshots. Rolling back will replace
     // the current version, so we have to invalidate that too.
-    ictx->invalidate_cache();
+    r = ictx->invalidate_cache();
+    if (r < 0)
+      return r;
 
     ldout(cct, 2) << "resizing to snapshot size..." << dendl;
     NoOpProgressContext no_op;
@@ -2071,7 +2088,7 @@ reprotect_and_return_err:
 			 << ictx->snap_name << "'" << dendl;
     int r = ictx->init();
     if (r < 0)
-      return r;
+      goto err_close;
 
     if (!ictx->read_only) {
       r = ictx->register_watch();
@@ -2877,6 +2894,19 @@ reprotect_and_return_err:
     return r;
   }
 
+  int invalidate_cache(ImageCtx *ictx)
+  {
+    CephContext *cct = ictx->cct;
+    ldout(cct, 20) << "invalidate_cache " << ictx << dendl;
+
+    int r = ictx_check(ictx);
+    if (r < 0)
+      return r;
+
+    RWLock::WLocker l(ictx->md_lock);
+    return ictx->invalidate_cache();
+  }
+
   int aio_write(ImageCtx *ictx, uint64_t off, size_t len, const char *buf,
 		AioCompletion *c)
   {
diff --git a/src/librbd/internal.h b/src/librbd/internal.h
index 4345888..1e9fd9a 100644
--- a/src/librbd/internal.h
+++ b/src/librbd/internal.h
@@ -114,6 +114,7 @@ namespace librbd {
 			bool *is_protected);
   int add_snap(ImageCtx *ictx, const char *snap_name);
   int rm_snap(ImageCtx *ictx, const char *snap_name);
+  int refresh_parent(ImageCtx *ictx);
   int ictx_check(ImageCtx *ictx);
   int ictx_refresh(ImageCtx *ictx);
   int copy(ImageCtx *ictx, IoCtx& dest_md_ctx, const char *destname,
@@ -188,6 +189,7 @@ namespace librbd {
   int aio_flush(ImageCtx *ictx, AioCompletion *c);
   int flush(ImageCtx *ictx);
   int _flush(ImageCtx *ictx);
+  int invalidate_cache(ImageCtx *ictx);
 
   ssize_t handle_sparse_read(CephContext *cct,
 			     ceph::bufferlist data_bl,
diff --git a/src/librbd/librbd.cc b/src/librbd/librbd.cc
index cad0c5e..658f24b 100644
--- a/src/librbd/librbd.cc
+++ b/src/librbd/librbd.cc
@@ -514,6 +514,12 @@ namespace librbd {
     return librbd::aio_flush(ictx, (librbd::AioCompletion *)c->pc);
   }
 
+  int Image::invalidate_cache()
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    return librbd::invalidate_cache(ictx);
+  }
+
 } // namespace librbd
 
 extern "C" void rbd_version(int *major, int *minor, int *extra)
@@ -1130,6 +1136,12 @@ extern "C" int rbd_aio_flush(rbd_image_t image, rbd_completion_t c)
   return librbd::aio_flush(ictx, (librbd::AioCompletion *)comp->pc);
 }
 
+extern "C" int rbd_invalidate_cache(rbd_image_t image)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  return librbd::invalidate_cache(ictx);
+}
+
 extern "C" int rbd_aio_is_complete(rbd_completion_t c)
 {
   librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index e5fe00c..74305b9 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -2062,7 +2062,13 @@ public:
 void Locker::calc_new_client_ranges(CInode *in, uint64_t size, map<client_t,client_writeable_range_t>& new_ranges)
 {
   inode_t *latest = in->get_projected_inode();
-  uint64_t ms = ROUND_UP_TO((size+1)<<1, latest->get_layout_size_increment());
+  uint64_t ms;
+  if(latest->has_layout()) {
+    ms = ROUND_UP_TO((size+1)<<1, latest->get_layout_size_increment());
+  } else {
+    // Layout-less directories like ~mds0/, have zero size
+    ms = 0;
+  }
 
   // increase ranges as appropriate.
   // shrink to 0 if no WR|BUFFER caps issued.
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 71a4b33..d6cfebd 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -349,6 +349,7 @@ void MDCache::create_empty_hierarchy(C_Gather *gather)
 
   root->inode.dirstat = rootdir->fnode.fragstat;
   root->inode.rstat = rootdir->fnode.rstat;
+  ++root->inode.rstat.rsubdirs;
   root->inode.accounted_rstat = root->inode.rstat;
 
   rootdir->mark_complete();
@@ -399,6 +400,7 @@ void MDCache::create_mydir_hierarchy(C_Gather *gather)
 
   myin->inode.dirstat = mydir->fnode.fragstat;
   myin->inode.rstat = mydir->fnode.rstat;
+  ++myin->inode.rstat.rsubdirs;
   myin->inode.accounted_rstat = myin->inode.rstat;
 
 
diff --git a/src/messages/MOSDSubOp.h b/src/messages/MOSDSubOp.h
index 6a38186..7b40c0a 100644
--- a/src/messages/MOSDSubOp.h
+++ b/src/messages/MOSDSubOp.h
@@ -25,7 +25,7 @@
 
 class MOSDSubOp : public Message {
 
-  static const int HEAD_VERSION = 10;
+  static const int HEAD_VERSION = 11;
   static const int COMPAT_VERSION = 1;
 
 public:
@@ -63,6 +63,8 @@ public:
 
   // piggybacked osd/og state
   eversion_t pg_trim_to;   // primary->replica: trim to here
+  eversion_t pg_trim_rollback_to;   // primary->replica: trim rollback
+                                    // info to here
   osd_peer_stat_t peer_stat;
 
   map<string,bufferlist> attrset;
@@ -175,6 +177,11 @@ public:
     if (header.version >= 10) {
       ::decode(updated_hit_set_history, p);
     }
+    if (header.version >= 11) {
+      ::decode(pg_trim_rollback_to, p);
+    } else {
+      pg_trim_rollback_to = pg_trim_to;
+    }
   }
 
   virtual void encode_payload(uint64_t features) {
@@ -224,6 +231,7 @@ public:
     ::encode(from, payload);
     ::encode(pgid.shard, payload);
     ::encode(updated_hit_set_history, payload);
+    ::encode(pg_trim_rollback_to, payload);
   }
 
   MOSDSubOp()
diff --git a/src/mon/DataHealthService.cc b/src/mon/DataHealthService.cc
index 78732ac..6c6ed29 100644
--- a/src/mon/DataHealthService.cc
+++ b/src/mon/DataHealthService.cc
@@ -228,7 +228,7 @@ void DataHealthService::service_tick()
     if (ours.latest_avail_percent != last_warned_percent)
       mon->clog.warn()
 	<< "reached concerning levels of available space on local monitor storage"
-	<< " (" << ours.latest_avail_percent << "\% free)\n";
+	<< " (" << ours.latest_avail_percent << "% free)\n";
     last_warned_percent = ours.latest_avail_percent;
   } else {
     last_warned_percent = 0;
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h
index eb63303..bd9dd2e 100644
--- a/src/mon/MonCommands.h
+++ b/src/mon/MonCommands.h
@@ -552,7 +552,7 @@ COMMAND("osd pool rename " \
 	"rename <srcpool> to <destpool>", "osd", "rw", "cli,rest")
 COMMAND("osd pool get " \
 	"name=pool,type=CephPoolname " \
-	"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid", \
+	"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile", \
 	"get pool parameter <var>", "osd", "r", "cli,rest")
 COMMAND("osd pool set " \
 	"name=pool,type=CephPoolname " \
@@ -568,6 +568,10 @@ COMMAND("osd pool set-quota " \
 	"name=field,type=CephChoices,strings=max_objects|max_bytes " \
 	"name=val,type=CephString",
 	"set object or byte limit on pool", "osd", "rw", "cli,rest")
+COMMAND("osd pool get-quota " \
+        "name=pool,type=CephPoolname ",
+        "obtain object or byte limits for pool",
+        "osd", "r", "cli,rest")
 COMMAND("osd pool stats " \
         "name=name,type=CephString,req=false",
         "obtain stats from all pools, or from specified pool",
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index cd447e7..fd3a358 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -621,6 +621,21 @@ void Monitor::init_paxos()
 void Monitor::refresh_from_paxos(bool *need_bootstrap)
 {
   dout(10) << __func__ << dendl;
+
+  bufferlist bl;
+  int r = store->get(MONITOR_NAME, "cluster_fingerprint", bl);
+  if (r >= 0) {
+    try {
+      bufferlist::iterator p = bl.begin();
+      ::decode(fingerprint, p);
+    }
+    catch (buffer::error& e) {
+      dout(10) << __func__ << " failed to decode cluster_fingerprint" << dendl;
+    }
+  } else {
+    dout(10) << __func__ << " no cluster_fingerprint" << dendl;
+  }
+
   for (int i = 0; i < PAXOS_NUM; ++i) {
     paxos_service[i]->refresh(need_bootstrap);
   }
@@ -2393,6 +2408,7 @@ void Monitor::handle_command(MMonCommand *m)
     if (!f)
       f.reset(new_formatter("json-pretty"));
     f->open_object_section("report");
+    f->dump_stream("cluster_fingerprint") << fingerprint;
     f->dump_string("version", ceph_version_to_str());
     f->dump_string("commit", git_version_to_str());
     f->dump_stream("timestamp") << ceph_clock_now(NULL);
@@ -2866,8 +2882,9 @@ bool Monitor::_ms_dispatch(Message *m)
         return dispatch(s, m, false);
       }
       dout(1) << __func__ << " dropping stray message " << *m
-        << " from " << m->get_source_inst() << dendl;
-      return false;
+	      << " from " << m->get_source_inst() << dendl;
+      m->put();
+      return true;
     }
 
     if (!exited_quorum.is_zero() && !src_is_mon) {
@@ -3847,9 +3864,29 @@ void Monitor::tick()
     finish_contexts(g_ceph_context, maybe_wait_for_quorum);
   }
 
+  if (is_leader() && paxos->is_active() && fingerprint.is_zero()) {
+    // this is only necessary on upgraded clusters.
+    MonitorDBStore::Transaction t;
+    prepare_new_fingerprint(&t);
+    bufferlist tbl;
+    t.encode(tbl);
+    paxos->propose_new_value(tbl, new C_NoopContext);
+  }
+
   new_tick();
 }
 
+void Monitor::prepare_new_fingerprint(MonitorDBStore::Transaction *t)
+{
+  uuid_d nf;
+  nf.generate_random();
+  dout(10) << __func__ << " proposing cluster_fingerprint " << nf << dendl;
+
+  bufferlist bl;
+  ::encode(nf, bl);
+  t->put(MONITOR_NAME, "cluster_fingerprint", bl);
+}
+
 int Monitor::check_fsid()
 {
   if (!store->exists(MONITOR_NAME, "cluster_uuid"))
diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h
index 59292ec..42e148e 100644
--- a/src/mon/Monitor.h
+++ b/src/mon/Monitor.h
@@ -128,6 +128,7 @@ public:
   void unregister_cluster_logger();
 
   MonMap *monmap;
+  uuid_d fingerprint;
 
   set<entity_addr_t> extra_probe_peers;
 
@@ -190,6 +191,8 @@ public:
 
   const utime_t &get_leader_since() const;
 
+  void prepare_new_fingerprint(MonitorDBStore::Transaction *t);
+
   // -- elector --
 private:
   Paxos *paxos;
diff --git a/src/mon/MonmapMonitor.cc b/src/mon/MonmapMonitor.cc
index 5940724..3890704 100644
--- a/src/mon/MonmapMonitor.cc
+++ b/src/mon/MonmapMonitor.cc
@@ -97,6 +97,11 @@ void MonmapMonitor::encode_pending(MonitorDBStore::Transaction *t)
 
   put_version(t, pending_map.epoch, bl);
   put_last_committed(t, pending_map.epoch);
+
+  // generate a cluster fingerprint, too?
+  if (pending_map.epoch == 1) {
+    mon->prepare_new_fingerprint(t);
+  }
 }
 
 void MonmapMonitor::on_active()
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index eab5122..7e469b2 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -2067,6 +2067,32 @@ void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
       }
     }
 
+    // hit_set-less cache_mode?
+    if (g_conf->mon_warn_on_cache_pools_without_hit_sets) {
+      int problem_cache_pools = 0;
+      for (map<int64_t, pg_pool_t>::const_iterator p = osdmap.pools.begin();
+	   p != osdmap.pools.end();
+	   ++p) {
+	const pg_pool_t& info = p->second;
+	if (info.cache_mode_requires_hit_set() &&
+	    info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
+	  ++problem_cache_pools;
+	  if (detail) {
+	    ostringstream ss;
+	    ss << "pool '" << osdmap.get_pool_name(p->first)
+	       << "' with cache_mode " << info.get_cache_mode_name()
+	       << " needs hit_set_type to be set but it is not";
+	    detail->push_back(make_pair(HEALTH_WARN, ss.str()));
+	  }
+	}
+      }
+      if (problem_cache_pools) {
+	ostringstream ss;
+	ss << problem_cache_pools << " cache pools are missing hit_sets";
+	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
+      }
+    }
+
     // Warn if 'mon_osd_down_out_interval' is set to zero.
     // Having this option set to zero on the leader acts much like the
     // 'noout' flag.  It's hard to figure out what's going wrong with clusters
@@ -2453,6 +2479,26 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
     string var;
     cmd_getval(g_ceph_context, cmdmap, "var", var);
 
+    if (!p->is_tier() &&
+        (var == "hit_set_type" || var == "hit_set_period" ||
+         var == "hit_set_count" || var == "hit_set_fpp" ||
+         var == "target_max_objects" || var == "target_max_bytes" ||
+         var == "cache_target_full_ratio" ||
+         var == "cache_target_dirty_ratio" ||
+         var == "cache_min_flush_age" || var == "cache_min_evict_age")) {
+      ss << "pool '" << poolstr
+         << "' is not a tier pool: variable not applicable";
+      r = -EACCES;
+      goto reply;
+    }
+
+    if (!p->is_erasure() && var == "erasure_code_profile") {
+      ss << "pool '" << poolstr
+         << "' is not a erasure pool: variable not applicable";
+      r = -EACCES;
+      goto reply;
+    }
+
     if (f) {
       f->open_object_section("pool");
       f->dump_string("pool", poolstr);
@@ -2488,6 +2534,26 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
 	  BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
 	  f->dump_float("hit_set_fpp", bloomp->get_fpp());
 	}
+      } else if (var == "target_max_objects") {
+        f->dump_unsigned("target_max_objects", p->target_max_objects);
+      } else if (var == "target_max_bytes") {
+        f->dump_unsigned("target_max_bytes", p->target_max_bytes);
+      } else if (var == "cache_target_dirty_ratio") {
+        f->dump_unsigned("cache_target_dirty_ratio_micro",
+                         p->cache_target_dirty_ratio_micro);
+        f->dump_float("cache_target_dirty_ratio",
+                      ((float)p->cache_target_dirty_ratio_micro/1000000));
+      } else if (var == "cache_target_full_ratio") {
+        f->dump_unsigned("cache_target_full_ratio_micro",
+                         p->cache_target_full_ratio_micro);
+        f->dump_float("cache_target_full_ratio",
+                      ((float)p->cache_target_full_ratio_micro/1000000));
+      } else if (var == "cache_min_flush_age") {
+        f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
+      } else if (var == "cache_min_evict_age") {
+        f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
+      } else if (var == "erasure_code_profile") {
+       f->dump_string("erasure_code_profile", p->erasure_code_profile);
       }
 
       f->close_section();
@@ -2521,7 +2587,24 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
 	}
 	BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
 	ss << "hit_set_fpp: " << bloomp->get_fpp();
+      } else if (var == "target_max_objects") {
+        ss << "target_max_objects: " << p->target_max_objects;
+      } else if (var == "target_max_bytes") {
+        ss << "target_max_bytes: " << p->target_max_bytes;
+      } else if (var == "cache_target_dirty_ratio") {
+        ss << "cache_target_dirty_ratio: "
+          << ((float)p->cache_target_dirty_ratio_micro/1000000);
+      } else if (var == "cache_target_full_ratio") {
+        ss << "cache_target_full_ratio: "
+          << ((float)p->cache_target_full_ratio_micro/1000000);
+      } else if (var == "cache_min_flush_age") {
+        ss << "cache_min_flush_age: " << p->cache_min_flush_age;
+      } else if (var == "cache_min_evict_age") {
+        ss << "cache_min_evict_age: " << p->cache_min_evict_age;
+      } else if (var == "erasure_code_profile") {
+       ss << "erasure_code_profile: " << p->erasure_code_profile;
       }
+
       rdata.append(ss);
       ss.str("");
     }
@@ -2626,6 +2709,45 @@ stats_out:
     rdata.append("\n");
     r = 0;
 
+  } else if (prefix == "osd pool get-quota") {
+    string pool_name;
+    cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
+
+    int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
+    if (poolid < 0) {
+      assert(poolid == -ENOENT);
+      ss << "unrecognized pool '" << pool_name << "'";
+      r = -ENOENT;
+      goto reply;
+    }
+    const pg_pool_t *p = osdmap.get_pg_pool(poolid);
+
+    if (f) {
+      f->open_object_section("pool_quotas");
+      f->dump_string("pool_name", pool_name);
+      f->dump_unsigned("pool_id", poolid);
+      f->dump_unsigned("quota_max_objects", p->quota_max_objects);
+      f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
+      f->close_section();
+      f->flush(rdata);
+    } else {
+      stringstream rs;
+      rs << "quotas for pool '" << pool_name << "':\n"
+         << "  max objects: ";
+      if (p->quota_max_objects == 0)
+        rs << "N/A";
+      else
+        rs << si_t(p->quota_max_objects) << " objects";
+      rs << "\n"
+         << "  max bytes  : ";
+      if (p->quota_max_bytes == 0)
+        rs << "N/A";
+      else
+        rs << si_t(p->quota_max_bytes) << "B";
+      rdata.append(rs.str());
+    }
+    rdata.append("\n");
+    r = 0;
   } else if (prefix == "osd crush rule list" ||
 	     prefix == "osd crush rule ls") {
     string format;
@@ -2925,15 +3047,18 @@ int OSDMonitor::crush_ruleset_create_erasure(const string &name,
 					     int *ruleset,
 					     stringstream &ss)
 {
-  *ruleset = osdmap.crush->get_rule_id(name);
-  if (*ruleset != -ENOENT)
+  int ruleid = osdmap.crush->get_rule_id(name);
+  if (ruleid != -ENOENT) {
+    *ruleset = osdmap.crush->get_rule_mask_ruleset(ruleid);
     return -EEXIST;
+  }
 
   CrushWrapper newcrush;
   _get_pending_crush(newcrush);
 
-  *ruleset = newcrush.get_rule_id(name);
-  if (*ruleset != -ENOENT) {
+  ruleid = newcrush.get_rule_id(name);
+  if (ruleid != -ENOENT) {
+    *ruleset = newcrush.get_rule_mask_ruleset(ruleid);
     return -EALREADY;
   } else {
     ErasureCodeInterfaceRef erasure_code;
@@ -3089,20 +3214,23 @@ int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_pr
 
 int OSDMonitor::prepare_pool_size(const unsigned pool_type,
 				  const string &erasure_code_profile,
-				  unsigned *size,
+				  unsigned *size, unsigned *min_size,
 				  stringstream &ss)
 {
   int err = 0;
   switch (pool_type) {
   case pg_pool_t::TYPE_REPLICATED:
     *size = g_conf->osd_pool_default_size;
+    *min_size = g_conf->get_osd_pool_default_min_size();
     break;
   case pg_pool_t::TYPE_ERASURE:
     {
       ErasureCodeInterfaceRef erasure_code;
       err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
-      if (err == 0)
+      if (err == 0) {
 	*size = erasure_code->get_chunk_count();
+	*min_size = erasure_code->get_data_chunk_count();
+      }
     }
     break;
   default:
@@ -3219,8 +3347,8 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid,
 				 crush_ruleset_name, &crush_ruleset, ss);
   if (r)
     return r;
-  unsigned size;
-  r = prepare_pool_size(pool_type, erasure_code_profile, &size, ss);
+  unsigned size, min_size;
+  r = prepare_pool_size(pool_type, erasure_code_profile, &size, &min_size, ss);
   if (r)
     return r;
   uint32_t stripe_width = 0;
@@ -3246,7 +3374,7 @@ int OSDMonitor::prepare_new_pool(string& name, uint64_t auid,
     pi->flags |= pg_pool_t::FLAG_HASHPSPOOL;
 
   pi->size = size;
-  pi->min_size = g_conf->get_osd_pool_default_min_size();
+  pi->min_size = min_size;
   pi->crush_ruleset = crush_ruleset;
   pi->object_hash = CEPH_STR_HASH_RJENKINS;
   pi->set_pg_num(pg_num ? pg_num : g_conf->osd_pool_default_pg_num);
@@ -3336,6 +3464,7 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
   string interr, floaterr;
   int64_t n = 0;
   double f = 0;
+  int64_t uf = 0;  // micro-f
   if (!cmd_getval(g_ceph_context, cmdmap, "val", val)) {
     // wasn't a string; maybe an older mon forwarded json with an int?
     if (!cmd_getval(g_ceph_context, cmdmap, "val", n))
@@ -3345,6 +3474,17 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
     n = strict_strtoll(val.c_str(), 10, &interr);
     // or a float
     f = strict_strtod(val.c_str(), &floaterr);
+    uf = llrintl(f * (double)1000000.0);
+  }
+
+  if (!p.is_tier() &&
+      (var == "hit_set_type" || var == "hit_set_period" ||
+       var == "hit_set_count" || var == "hit_set_fpp" ||
+       var == "target_max_objects" || var == "target_max_bytes" ||
+       var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
+       var == "cache_min_flush_age" || var == "cache_min_evict_age")) {
+    ss << "pool '" << poolstr << "' is not a tier pool: variable not applicable";
+    return -EACCES;
   }
 
   if (var == "size") {
@@ -3399,7 +3539,7 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
       ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling.  use --yes-i-really-mean-it to force.";
       return -EPERM;
     }
-    int expected_osds = MIN(p.get_pg_num(), osdmap.get_num_osds());
+    int expected_osds = MAX(1, MIN(p.get_pg_num(), osdmap.get_num_osds()));
     int64_t new_pgs = n - p.get_pg_num();
     int64_t pgs_per_osd = new_pgs / expected_osds;
     if (pgs_per_osd > g_conf->mon_osd_max_split_count) {
@@ -3487,6 +3627,7 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
     }
     p.hit_set_period = n;
   } else if (var == "hit_set_count") {
+
     if (interr.length()) {
       ss << "error parsing integer value '" << val << "': " << interr;
       return -EINVAL;
@@ -3528,7 +3669,7 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
       ss << "value must be in the range 0..1";
       return -ERANGE;
     }
-    p.cache_target_dirty_ratio_micro = f * 1000000;
+    p.cache_target_dirty_ratio_micro = uf;
   } else if (var == "cache_target_full_ratio") {
     if (floaterr.length()) {
       ss << "error parsing float '" << val << "': " << floaterr;
@@ -3538,7 +3679,7 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
       ss << "value must be in the range 0..1";
       return -ERANGE;
     }
-    p.cache_target_full_ratio_micro = f * 1000000;
+    p.cache_target_full_ratio_micro = uf;
   } else if (var == "cache_min_flush_age") {
     if (interr.length()) {
       ss << "error parsing int '" << val << "': " << interr;
@@ -4172,6 +4313,24 @@ bool OSDMonitor::prepare_command_impl(MMonCommand *m,
     cmd_getval(g_ceph_context, cmdmap, "profile", profile);
     if (profile == "")
       profile = "default";
+    if (profile == "default") {
+      if (!osdmap.has_erasure_code_profile(profile)) {
+	if (pending_inc.has_erasure_code_profile(profile)) {
+	  dout(20) << "erasure code profile " << profile << " already pending" << dendl;
+	  goto wait;
+	}
+
+	map<string,string> profile_map;
+	err = osdmap.get_erasure_code_profile_default(g_ceph_context,
+						      profile_map,
+						      &ss);
+	if (err)
+	  goto reply;
+	dout(20) << "erasure code profile " << profile << " set" << dendl;
+	pending_inc.set_erasure_code_profile(profile, profile_map);
+	goto wait;
+      }
+    }
 
     int ruleset;
     err = crush_ruleset_create_erasure(name, profile, &ruleset, ss);
@@ -4847,6 +5006,25 @@ done:
     cmd_getval(g_ceph_context, cmdmap, "erasure_code_profile", erasure_code_profile);
     if (erasure_code_profile == "")
       erasure_code_profile = "default";
+    if (erasure_code_profile == "default") {
+      if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
+	if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
+	  dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
+	  goto wait;
+	}
+
+	map<string,string> profile_map;
+	err = osdmap.get_erasure_code_profile_default(g_ceph_context,
+						      profile_map,
+						      &ss);
+	if (err)
+	  goto reply;
+	dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
+	pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
+	goto wait;
+      }
+    }
+
     if (ruleset_name == "") {
       if (erasure_code_profile == "default") {
 	ruleset_name = "erasure-code";
@@ -5054,7 +5232,10 @@ done:
       goto reply;
     }
     if (tp->tier_of != pool_id) {
-      ss << "tier pool '" << tierpoolstr << "' is a tier of '" << tp->tier_of << "'";
+      ss << "tier pool '" << tierpoolstr << "' is a tier of '"
+         << osdmap.get_pool_name(tp->tier_of) << "': "
+         // be scary about it; this is an inconsistency and bells must go off
+         << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
       err = -EINVAL;
       goto reply;
     }
@@ -5182,8 +5363,67 @@ done:
       err = -EINVAL;
       goto reply;
     }
+
+    // pool already has this cache-mode set and there are no pending changes
+    if (p->cache_mode == mode &&
+	(pending_inc.new_pools.count(pool_id) == 0 ||
+	 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
+      ss << "set cache-mode for pool '" << poolstr << "'"
+         << " to " << pg_pool_t::get_cache_mode_name(mode);
+      err = 0;
+      goto reply;
+    }
+
+    /* Mode description:
+     *
+     *  none:       No cache-mode defined
+     *  forward:    Forward all reads and writes to base pool
+     *  writeback:  Cache writes, promote reads from base pool
+     *  readonly:   Forward writes to base pool
+     *
+     * Hence, these are the allowed transitions:
+     *
+     *  none -> any
+     *  forward -> writeback || any IF num_objects_dirty == 0
+     *  writeback -> forward
+     *  readonly -> any
+     */
+
+    // We check if the transition is valid against the current pool mode, as
+    // it is the only committed state thus far.  We will blantly squash
+    // whatever mode is on the pending state.
+
+    if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
+        mode != pg_pool_t::CACHEMODE_FORWARD) {
+      ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
+         << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
+         << "' pool; only '"
+         << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_FORWARD)
+        << "' allowed.";
+      err = -EINVAL;
+      goto reply;
+    }
+    if (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
+               mode != pg_pool_t::CACHEMODE_WRITEBACK) {
+
+      const pool_stat_t& tier_stats =
+        mon->pgmon()->pg_map.get_pg_pool_sum_stat(pool_id);
+
+      if (tier_stats.stats.sum.num_objects_dirty > 0) {
+        ss << "unable to set cache-mode '"
+           << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
+           << "': dirty objects found";
+        err = -EBUSY;
+        goto reply;
+      }
+    }
+
     // go
-    pending_inc.get_new_pool(pool_id, p)->cache_mode = mode;
+    pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
+    np->cache_mode = mode;
+    // set this both when moving to and from cache_mode NONE.  this is to
+    // capture legacy pools that were set up before this flag existed.
+    np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
     ss << "set cache-mode for pool '" << poolstr
 	<< "' to " << pg_pool_t::get_cache_mode_name(mode);
     wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(),
@@ -5623,8 +5863,12 @@ int OSDMonitor::_check_remove_pool(int64_t pool, const pg_pool_t *p,
     return -EBUSY;
   }
   if (!p->tiers.empty()) {
-    *ss << "pool '" << poolstr << "' includes tiers "
-	<< p->tiers;
+    *ss << "pool '" << poolstr << "' has tiers";
+    for(std::set<uint64_t>::iterator i = p->tiers.begin(); i != p->tiers.end(); ++i) {
+      const char *name = osdmap.get_pool_name(*i);
+      assert(name != NULL);
+      *ss << " " << name;
+    }
     return -EBUSY;
   }
   *ss << "pool '" << poolstr << "' removed";
diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h
index 2d4f379..fbce5fe 100644
--- a/src/mon/OSDMonitor.h
+++ b/src/mon/OSDMonitor.h
@@ -272,7 +272,7 @@ private:
 				 stringstream &ss);
   int prepare_pool_size(const unsigned pool_type,
 			const string &erasure_code_profile,
-			unsigned *size,
+			unsigned *size, unsigned *min_size,
 			stringstream &ss);
   int prepare_pool_stripe_width(const unsigned pool_type,
 				const string &erasure_code_profile,
diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc
index ae8f6e7..15f6746 100644
--- a/src/mon/PGMonitor.cc
+++ b/src/mon/PGMonitor.cc
@@ -1215,11 +1215,13 @@ inline string percentify(const float& a) {
 
 //void PGMonitor::dump_object_stat_sum(stringstream& ss, Formatter *f,
 void PGMonitor::dump_object_stat_sum(TextTable &tbl, Formatter *f,
-    object_stat_sum_t &sum, bool verbose)
+				     object_stat_sum_t &sum, uint64_t avail,
+				     bool verbose)
 {
   if (f) {
     f->dump_int("kb_used", SHIFT_ROUND_UP(sum.num_bytes, 10));
     f->dump_int("bytes_used", sum.num_bytes);
+    f->dump_unsigned("max_avail", avail);
     f->dump_int("objects", sum.num_objects);
     if (verbose) {
       f->dump_int("dirty", sum.num_objects_dirty);
@@ -1232,6 +1234,7 @@ void PGMonitor::dump_object_stat_sum(TextTable &tbl, Formatter *f,
     tbl << stringify(si_t(sum.num_bytes));
     int64_t kb_used = SHIFT_ROUND_UP(sum.num_bytes, 10);
     tbl << percentify(((float)kb_used / pg_map.osd_sum.kb)*100);
+    tbl << si_t(avail);
     tbl << sum.num_objects;
     if (verbose) {
       tbl << stringify(si_t(sum.num_objects_dirty))
@@ -1241,6 +1244,24 @@ void PGMonitor::dump_object_stat_sum(TextTable &tbl, Formatter *f,
   }
 }
 
+int64_t PGMonitor::get_rule_avail(OSDMap& osdmap, int ruleno)
+{
+  map<int,float> wm;
+  int r = osdmap.crush->get_rule_weight_osd_map(ruleno, &wm);
+  if (r < 0)
+    return r;
+  if(wm.size() == 0)
+    return 0;
+  int64_t min = -1;
+  for (map<int,float>::iterator p = wm.begin(); p != wm.end(); ++p) {
+    int64_t proj = (float)(pg_map.osd_stat[p->first].kb_avail * 1024ull) /
+      (double)p->second;
+    if (min < 0 || proj < min)
+      min = proj;
+  }
+  return min;
+}
+
 void PGMonitor::dump_pool_stats(stringstream &ss, Formatter *f, bool verbose)
 {
   TextTable tbl;
@@ -1252,16 +1273,18 @@ void PGMonitor::dump_pool_stats(stringstream &ss, Formatter *f, bool verbose)
     tbl.define_column("ID", TextTable::LEFT, TextTable::LEFT);
     if (verbose)
       tbl.define_column("CATEGORY", TextTable::LEFT, TextTable::LEFT);
-    tbl.define_column("USED", TextTable::LEFT, TextTable::LEFT);
-    tbl.define_column("\%USED", TextTable::LEFT, TextTable::LEFT);
-    tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
+    tbl.define_column("%USED", TextTable::LEFT, TextTable::RIGHT);
+    tbl.define_column("MAX AVAIL", TextTable::LEFT, TextTable::RIGHT);
+    tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
     if (verbose) {
-      tbl.define_column("DIRTY", TextTable::LEFT, TextTable::LEFT);
-      tbl.define_column("READ", TextTable::LEFT, TextTable::LEFT);
-      tbl.define_column("WRITE", TextTable::LEFT, TextTable::LEFT);
+      tbl.define_column("DIRTY", TextTable::LEFT, TextTable::RIGHT);
+      tbl.define_column("READ", TextTable::LEFT, TextTable::RIGHT);
+      tbl.define_column("WRITE", TextTable::LEFT, TextTable::RIGHT);
     }
   }
 
+  map<int,uint64_t> avail_by_rule;
   OSDMap &osdmap = mon->osdmon()->osdmap;
   for (map<int64_t,pg_pool_t>::const_iterator p = osdmap.get_pools().begin();
        p != osdmap.get_pools().end(); ++p) {
@@ -1271,6 +1294,38 @@ void PGMonitor::dump_pool_stats(stringstream &ss, Formatter *f, bool verbose)
     string pool_name = osdmap.get_pool_name(pool_id);
     pool_stat_t &stat = pg_map.pg_pool_sum[pool_id];
 
+    const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
+    int ruleno = osdmap.crush->find_rule(pool->get_crush_ruleset(),
+					 pool->get_type(),
+					 pool->get_size());
+    uint64_t avail;
+    if (avail_by_rule.count(ruleno) == 0) {
+      avail = get_rule_avail(osdmap, ruleno);
+      avail_by_rule[ruleno] = avail;
+    } else {
+      avail = avail_by_rule[ruleno];
+    }
+    switch (pool->get_type()) {
+    case pg_pool_t::TYPE_REPLICATED:
+      avail /= pool->get_size();
+      break;
+    case pg_pool_t::TYPE_ERASURE:
+      {
+	const map<string,string>& ecp =
+	  osdmap.get_erasure_code_profile(pool->erasure_code_profile);
+	map<string,string>::const_iterator pm = ecp.find("m");
+	map<string,string>::const_iterator pk = ecp.find("k");
+	if (pm != ecp.end() && pk != ecp.end()) {
+	  int k = atoi(pk->second.c_str());
+	  int m = atoi(pm->second.c_str());
+	  avail = avail * k / (m + k);
+	}
+      }
+      break;
+    default:
+      assert(0 == "unrecognized pool type");
+    }
+
     if (f) {
       f->open_object_section("pool");
       f->dump_string("name", pool_name);
@@ -1282,7 +1337,7 @@ void PGMonitor::dump_pool_stats(stringstream &ss, Formatter *f, bool verbose)
       if (verbose)
         tbl << "-";
     }
-    dump_object_stat_sum(tbl, f, stat.stats.sum, verbose);
+    dump_object_stat_sum(tbl, f, stat.stats.sum, avail, verbose);
     if (f)
       f->close_section(); // stats
     else
@@ -1301,7 +1356,7 @@ void PGMonitor::dump_pool_stats(stringstream &ss, Formatter *f, bool verbose)
               << ""
               << it->first;
         }
-        dump_object_stat_sum(tbl, f, it->second, verbose);
+        dump_object_stat_sum(tbl, f, it->second, avail, verbose);
         if (f)
           f->close_section(); // category name
         else
@@ -1335,12 +1390,12 @@ void PGMonitor::dump_fs_stats(stringstream &ss, Formatter *f, bool verbose)
     f->close_section();
   } else {
     TextTable tbl;
-    tbl.define_column("SIZE", TextTable::LEFT, TextTable::LEFT);
-    tbl.define_column("AVAIL", TextTable::LEFT, TextTable::LEFT);
-    tbl.define_column("RAW USED", TextTable::LEFT, TextTable::LEFT);
-    tbl.define_column("\%RAW USED", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
+    tbl.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
+    tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
+    tbl.define_column("%RAW USED", TextTable::LEFT, TextTable::RIGHT);
     if (verbose) {
-      tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::LEFT);
+      tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
     }
     tbl << stringify(si_t(pg_map.osd_sum.kb*1024))
         << stringify(si_t(pg_map.osd_sum.kb_avail*1024))
diff --git a/src/mon/PGMonitor.h b/src/mon/PGMonitor.h
index 09dd009..f007378 100644
--- a/src/mon/PGMonitor.h
+++ b/src/mon/PGMonitor.h
@@ -146,7 +146,11 @@ private:
 			  vector<string>& args) const;
 
   void dump_object_stat_sum(TextTable &tbl, Formatter *f,
-                            object_stat_sum_t &sum, bool verbose);
+                            object_stat_sum_t &sum,
+			    uint64_t avail,
+			    bool verbose);
+
+  int64_t get_rule_avail(OSDMap& osdmap, int ruleno);
 
 public:
   PGMonitor(Monitor *mn, Paxos *p, const string& service_name)
diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc
index 3883a32..b38b111 100644
--- a/src/mon/Paxos.cc
+++ b/src/mon/Paxos.cc
@@ -1264,7 +1264,8 @@ void Paxos::dispatch(PaxosServiceMessage *m)
 
 bool Paxos::is_readable(version_t v)
 {
-  dout(1) << "is_readable now=" << ceph_clock_now(g_ceph_context) << " lease_expire=" << lease_expire
+  dout(5) << "is_readable now=" << ceph_clock_now(g_ceph_context)
+	  << " lease_expire=" << lease_expire
 	  << " has v" << v << " lc " << last_committed << dendl;
   if (v > last_committed)
     return false;
diff --git a/src/msg/SimpleMessenger.cc b/src/msg/SimpleMessenger.cc
index 2070fe5..ce7f1fd 100644
--- a/src/msg/SimpleMessenger.cc
+++ b/src/msg/SimpleMessenger.cc
@@ -86,6 +86,9 @@ int SimpleMessenger::shutdown()
   ldout(cct,10) << "shutdown " << get_myaddr() << dendl;
   mark_down_all();
   dispatch_queue.shutdown();
+
+  // break ref cycles on the loopback connection
+  local_connection->set_priv(NULL);
   return 0;
 }
 
diff --git a/src/os/FileJournal.cc b/src/os/FileJournal.cc
index c6bd616..7eb7927 100644
--- a/src/os/FileJournal.cc
+++ b/src/os/FileJournal.cc
@@ -1758,7 +1758,12 @@ FileJournal::read_entry_result FileJournal::do_read_entry(
   // ok!
   if (seq)
     *seq = h->seq;
-  journalq.push_back(pair<uint64_t,off64_t>(h->seq, pos));
+
+  // works around an apparent GCC 4.8(?) compiler bug about unaligned
+  // bind by reference to (packed) h->seq
+  journalq.push_back(
+    pair<uint64_t,off64_t>(static_cast<uint64_t>(h->seq),
+			   static_cast<off64_t>(pos)));
 
   if (next_pos)
     *next_pos = pos;
diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc
index f73d930..1b3dd5e 100644
--- a/src/os/FileStore.cc
+++ b/src/os/FileStore.cc
@@ -126,7 +126,7 @@ void FileStore::FSPerfTracker::update_from_perfcounters(
 {
   os_commit_latency.consume_next(
     logger.get_tavg_ms(
-      l_os_commit_lat));
+      l_os_j_lat));
   os_apply_latency.consume_next(
     logger.get_tavg_ms(
       l_os_apply_lat));
@@ -1558,6 +1558,8 @@ int FileStore::umount()
     backend = generic_backend;
   }
 
+  force_sync = false;
+
   object_map.reset();
 
   {
@@ -1711,7 +1713,8 @@ void FileStore::_do_op(OpSequencer *osr, ThreadPool::TPHandle &handle)
 
 void FileStore::_finish_op(OpSequencer *osr)
 {
-  Op *o = osr->dequeue();
+  list<Context*> to_queue;
+  Op *o = osr->dequeue(&to_queue);
   
   dout(10) << "_finish_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << dendl;
   osr->apply_lock.Unlock();  // locked in _do_op
@@ -1729,6 +1732,7 @@ void FileStore::_finish_op(OpSequencer *osr)
   if (o->onreadable) {
     op_finisher.queue(o->onreadable);
   }
+  op_finisher.queue(to_queue);
   delete o;
 }
 
@@ -1844,7 +1848,8 @@ void FileStore::_journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk)
   // this should queue in order because the journal does it's completions in order.
   queue_op(osr, o);
 
-  osr->dequeue_journal();
+  list<Context*> to_queue;
+  osr->dequeue_journal(&to_queue);
 
   // do ondisk completions async, to prevent any onreadable_sync completions
   // getting blocked behind an ondisk completion.
@@ -1852,6 +1857,7 @@ void FileStore::_journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk)
     dout(10) << " queueing ondisk " << ondisk << dendl;
     ondisk_finisher.queue(ondisk);
   }
+  ondisk_finisher.queue(to_queue);
 }
 
 int FileStore::_do_transactions(
@@ -2545,11 +2551,12 @@ unsigned FileStore::_do_transaction(
 	f.close_section();
 	f.flush(*_dout);
 	*_dout << dendl;
-	assert(0 == "unexpected error");
 
 	if (r == -EMFILE) {
 	  dump_open_fds(g_ceph_context);
 	}
+
+	assert(0 == "unexpected error");
       }
     }
 
diff --git a/src/os/FileStore.h b/src/os/FileStore.h
index 4c9ffdb..3fcd89a 100644
--- a/src/os/FileStore.h
+++ b/src/os/FileStore.h
@@ -193,19 +193,70 @@ private:
     Mutex qlock; // to protect q, for benefit of flush (peek/dequeue also protected by lock)
     list<Op*> q;
     list<uint64_t> jq;
+    list<pair<uint64_t, Context*> > flush_commit_waiters;
     Cond cond;
   public:
     Sequencer *parent;
     Mutex apply_lock;  // for apply mutual exclusion
     
+    /// get_max_uncompleted
+    bool _get_max_uncompleted(
+      uint64_t *seq ///< [out] max uncompleted seq
+      ) {
+      assert(qlock.is_locked());
+      assert(seq);
+      *seq = 0;
+      if (q.empty() && jq.empty())
+	return true;
+
+      if (!q.empty())
+	*seq = q.back()->op;
+      if (!jq.empty() && jq.back() > *seq)
+	*seq = jq.back();
+
+      return false;
+    } /// @returns true if both queues are empty
+
+    /// get_min_uncompleted
+    bool _get_min_uncompleted(
+      uint64_t *seq ///< [out] min uncompleted seq
+      ) {
+      assert(qlock.is_locked());
+      assert(seq);
+      *seq = 0;
+      if (q.empty() && jq.empty())
+	return true;
+
+      if (!q.empty())
+	*seq = q.front()->op;
+      if (!jq.empty() && jq.front() < *seq)
+	*seq = jq.front();
+
+      return false;
+    } /// @returns true if both queues are empty
+
+    void _wake_flush_waiters(list<Context*> *to_queue) {
+      uint64_t seq;
+      if (_get_min_uncompleted(&seq))
+	seq = -1;
+
+      for (list<pair<uint64_t, Context*> >::iterator i =
+	     flush_commit_waiters.begin();
+	   i != flush_commit_waiters.end() && i->first < seq;
+	   flush_commit_waiters.erase(i++)) {
+	to_queue->push_back(i->second);
+      }
+    }
+
     void queue_journal(uint64_t s) {
       Mutex::Locker l(qlock);
       jq.push_back(s);
     }
-    void dequeue_journal() {
+    void dequeue_journal(list<Context*> *to_queue) {
       Mutex::Locker l(qlock);
       jq.pop_front();
       cond.Signal();
+      _wake_flush_waiters(to_queue);
     }
     void queue(Op *o) {
       Mutex::Locker l(qlock);
@@ -215,20 +266,26 @@ private:
       assert(apply_lock.is_locked());
       return q.front();
     }
-    Op *dequeue() {
+
+    Op *dequeue(list<Context*> *to_queue) {
+      assert(to_queue);
       assert(apply_lock.is_locked());
       Mutex::Locker l(qlock);
       Op *o = q.front();
       q.pop_front();
       cond.Signal();
+
+      _wake_flush_waiters(to_queue);
       return o;
     }
+
     void flush() {
       Mutex::Locker l(qlock);
 
       while (g_conf->filestore_blackhole)
 	cond.Wait(qlock);  // wait forever
 
+
       // get max for journal _or_ op queues
       uint64_t seq = 0;
       if (!q.empty())
@@ -243,6 +300,17 @@ private:
 	  cond.Wait(qlock);
       }
     }
+    bool flush_commit(Context *c) {
+      Mutex::Locker l(qlock);
+      uint64_t seq = 0;
+      if (_get_max_uncompleted(&seq)) {
+	delete c;
+	return true;
+      } else {
+	flush_commit_waiters.push_back(make_pair(seq, c));
+	return false;
+      }
+    }
 
     OpSequencer()
       : qlock("FileStore::OpSequencer::qlock", false, false),
diff --git a/src/os/GenericObjectMap.cc b/src/os/GenericObjectMap.cc
index 4d41c50..011c83b 100644
--- a/src/os/GenericObjectMap.cc
+++ b/src/os/GenericObjectMap.cc
@@ -689,8 +689,6 @@ void GenericObjectMap::rename(const Header old_header, const coll_t &cid,
   old_header->cid = cid;
   old_header->oid = target;
   set_header(cid, target, *old_header, t);
-
-  // "in_use" still hold the "seq"
 }
 
 int GenericObjectMap::init(bool do_upgrade)
@@ -926,35 +924,18 @@ GenericObjectMap::Header GenericObjectMap::_lookup_header(
   to_get.insert(header_key(cid, oid));
   _Header header;
 
-  while (1) {
-    map<string, bufferlist> out;
-    bool try_again = false;
-
-    int r = db->get(GHOBJECT_TO_SEQ_PREFIX, to_get, &out);
-    if (r < 0)
-      return Header();
-    if (out.empty())
-      return Header();
-
-    bufferlist::iterator iter = out.begin()->second.begin();
-    header.decode(iter);
-
-    while (in_use.count(header.seq)) {
-      header_cond.Wait(header_lock);
+  map<string, bufferlist> out;
 
-      // Another thread is hold this header, wait for it.
-      // Because the seq of this object may change, such as clone
-      // and rename operation, here need to look up "seq" again
-      try_again = true;
-    }
+  int r = db->get(GHOBJECT_TO_SEQ_PREFIX, to_get, &out);
+  if (r < 0)
+    return Header();
+  if (out.empty())
+    return Header();
 
-    if (!try_again) {
-      break;
-    }
-  }
+  bufferlist::iterator iter = out.begin()->second.begin();
+  header.decode(iter);
 
-  Header ret = Header(new _Header(header), RemoveOnDelete(this));
-  in_use.insert(ret->seq);
+  Header ret = Header(new _Header(header));
   return ret;
 }
 
@@ -962,7 +943,7 @@ GenericObjectMap::Header GenericObjectMap::_generate_new_header(
     const coll_t &cid, const ghobject_t &oid, Header parent,
     KeyValueDB::Transaction t)
 {
-  Header header = Header(new _Header(), RemoveOnDelete(this));
+  Header header = Header(new _Header());
   header->seq = state.seq++;
   if (parent) {
     header->parent = parent->seq;
@@ -970,8 +951,6 @@ GenericObjectMap::Header GenericObjectMap::_generate_new_header(
   header->num_children = 1;
   header->oid = oid;
   header->cid = cid;
-  assert(!in_use.count(header->seq));
-  in_use.insert(header->seq);
 
   write_state(t);
   return header;
@@ -980,8 +959,6 @@ GenericObjectMap::Header GenericObjectMap::_generate_new_header(
 GenericObjectMap::Header GenericObjectMap::lookup_parent(Header input)
 {
   Mutex::Locker l(header_lock);
-  while (in_use.count(input->parent))
-    header_cond.Wait(header_lock);
   map<string, bufferlist> out;
   set<string> keys;
   keys.insert(PARENT_KEY);
@@ -999,13 +976,12 @@ GenericObjectMap::Header GenericObjectMap::lookup_parent(Header input)
     return Header();
   }
 
-  Header header = Header(new _Header(), RemoveOnDelete(this));
+  Header header = Header(new _Header());
   header->seq = input->parent;
   bufferlist::iterator iter = out.begin()->second.begin();
   header->decode(iter);
   dout(20) << "lookup_parent: parent seq is " << header->seq << " with parent "
            << header->parent << dendl;
-  in_use.insert(header->seq);
   return header;
 }
 
diff --git a/src/os/GenericObjectMap.h b/src/os/GenericObjectMap.h
index c9c64bc..3c5e3cb 100644
--- a/src/os/GenericObjectMap.h
+++ b/src/os/GenericObjectMap.h
@@ -74,12 +74,6 @@ class GenericObjectMap {
    * Serializes access to next_seq as well as the in_use set
    */
   Mutex header_lock;
-  Cond header_cond;
-
-  /**
-   * Set of headers currently in use
-   */
-  set<uint64_t> in_use;
 
   GenericObjectMap(KeyValueDB *db) : db(db), header_lock("GenericObjectMap") {}
 
@@ -371,6 +365,12 @@ protected:
     return GenericObjectMapIterator(new GenericObjectMapIteratorImpl(this, header, prefix));
   }
 
+  Header generate_new_header(const coll_t &cid, const ghobject_t &oid,
+                             Header parent, KeyValueDB::Transaction t) {
+    Mutex::Locker l(header_lock);
+    return _generate_new_header(cid, oid, parent, t);
+  }
+
   // Scan keys in header into out_keys and out_values (if nonnull)
   int scan(Header header, const string &prefix, const set<string> &in_keys,
            set<string> *out_keys, map<string, bufferlist> *out_values);
@@ -394,11 +394,6 @@ protected:
    */
   Header _generate_new_header(const coll_t &cid, const ghobject_t &oid,
                               Header parent, KeyValueDB::Transaction t);
-  Header generate_new_header(const coll_t &cid, const ghobject_t &oid,
-                             Header parent, KeyValueDB::Transaction t) {
-    Mutex::Locker l(header_lock);
-    return _generate_new_header(cid, oid, parent, t);
-  }
 
   // Lookup leaf header for c oid
   Header _lookup_header(const coll_t &cid, const ghobject_t &oid);
@@ -425,26 +420,6 @@ protected:
   // Sets header @see set_header
   void _set_header(Header header, const bufferlist &bl,
                    KeyValueDB::Transaction t);
-
-  /** 
-   * Removes header seq lock once Header is out of scope
-   * @see _lookup_header
-   * @see lookup_parent
-   * @see generate_new_header
-   */
-  class RemoveOnDelete {
-  public:
-    GenericObjectMap *db;
-    RemoveOnDelete(GenericObjectMap *db) :
-      db(db) {}
-    void operator() (_Header *header) {
-      Mutex::Locker l(db->header_lock);
-      db->in_use.erase(header->seq);
-      db->header_cond.Signal();
-      delete header;
-    }
-  };
-  friend class RemoveOnDelete;
 };
 WRITE_CLASS_ENCODER(GenericObjectMap::_Header)
 WRITE_CLASS_ENCODER(GenericObjectMap::State)
diff --git a/src/os/KeyValueStore.cc b/src/os/KeyValueStore.cc
index fb459b2..17c0c3b 100644
--- a/src/os/KeyValueStore.cc
+++ b/src/os/KeyValueStore.cc
@@ -69,68 +69,49 @@ const string KeyValueStore::COLLECTION_ATTR = "__COLL_ATTR__";
 
 // ============== StripObjectMap Implementation =================
 
-void StripObjectMap::sync_wrap(StripObjectHeader &strip_header,
-                               KeyValueDB::Transaction t,
-                               const SequencerPosition &spos)
-{
-  dout(10) << __func__ << " cid: " << strip_header.cid << "oid: "
-           << strip_header.oid << " setting spos to " << strip_header.spos
-           << dendl;
-  strip_header.spos = spos;
-  strip_header.header->data.clear();
-  ::encode(strip_header, strip_header.header->data);
-
-  sync(strip_header.header, t);
-}
-
-bool StripObjectMap::check_spos(const StripObjectHeader &header,
-                                const SequencerPosition &spos)
-{
-  if (spos > header.spos) {
-    stringstream out;
-    dout(10) << "cid: " << "oid: " << header.oid
-             << " not skipping op, *spos " << spos << dendl;
-    dout(10) << " > header.spos " << header.spos << dendl;
-    return false;
-  } else {
-    dout(10) << "cid: " << "oid: " << header.oid << " skipping op, spos "
-             << spos << " <= header.spos " << header.spos << dendl;
-    return true;
-  }
-}
-
-int StripObjectMap::save_strip_header(StripObjectHeader &strip_header,
-                                      const SequencerPosition &spos,
+int StripObjectMap::save_strip_header(StripObjectHeaderRef strip_header,
                                       KeyValueDB::Transaction t)
 {
-  strip_header.spos = spos;
-  strip_header.header->data.clear();
-  ::encode(strip_header, strip_header.header->data);
+  strip_header->header->data.clear();
+  ::encode(*strip_header, strip_header->header->data);
 
-  set_header(strip_header.cid, strip_header.oid, *(strip_header.header), t);
+  set_header(strip_header->cid, strip_header->oid, *(strip_header->header), t);
   return 0;
 }
 
 int StripObjectMap::create_strip_header(const coll_t &cid,
                                         const ghobject_t &oid,
-                                        StripObjectHeader &strip_header,
+                                        StripObjectHeaderRef *strip_header,
                                         KeyValueDB::Transaction t)
 {
-  Header header = lookup_create_header(cid, oid, t);
+  Header header = generate_new_header(cid, oid, Header(), t);
   if (!header)
     return -EINVAL;
 
-  strip_header.oid = oid;
-  strip_header.cid = cid;
-  strip_header.header = header;
+  StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader());
+  tmp->oid = oid;
+  tmp->cid = cid;
+  tmp->header = header;
+  if (strip_header)
+    *strip_header = tmp;
 
   return 0;
 }
 
 int StripObjectMap::lookup_strip_header(const coll_t &cid,
                                         const ghobject_t &oid,
-                                        StripObjectHeader &strip_header)
-{
+                                        StripObjectHeaderRef *strip_header)
+{
+  if (cid != coll_t()) {
+    Mutex::Locker l(lock);
+    pair<coll_t, StripObjectHeaderRef> p;
+    if (caches.lookup(oid, &p)) {
+      if (p.first == cid) {
+        *strip_header = p.second;
+        return 0;
+      }
+    }
+  }
   Header header = lookup_header(cid, oid);
 
   if (!header) {
@@ -139,18 +120,25 @@ int StripObjectMap::lookup_strip_header(const coll_t &cid,
     return -ENOENT;
   }
 
+
+  StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader());
   if (header->data.length()) {
     bufferlist::iterator bliter = header->data.begin();
-    ::decode(strip_header, bliter);
+    ::decode(*tmp, bliter);
   }
 
-  if (strip_header.strip_size == 0)
-    strip_header.strip_size = default_strip_size;
+  if (tmp->strip_size == 0)
+    tmp->strip_size = default_strip_size;
 
-  strip_header.oid = oid;
-  strip_header.cid = cid;
-  strip_header.header = header;
+  tmp->oid = oid;
+  tmp->cid = cid;
+  tmp->header = header;
 
+  {
+    Mutex::Locker l(lock);
+    caches.add(oid, make_pair(cid, tmp));
+  }
+  *strip_header = tmp;
   dout(10) << "lookup_strip_header done " << " cid " << cid << " oid "
            << oid << dendl;
   return 0;
@@ -194,57 +182,62 @@ int StripObjectMap::file_to_extents(uint64_t offset, size_t len,
   return 0;
 }
 
-void StripObjectMap::clone_wrap(StripObjectHeader &old_header,
+void StripObjectMap::clone_wrap(StripObjectHeaderRef old_header,
                                 const coll_t &cid, const ghobject_t &oid,
                                 KeyValueDB::Transaction t,
-                                StripObjectHeader *origin_header,
-                                StripObjectHeader *target_header)
+                                StripObjectHeaderRef *target_header)
 {
   Header new_origin_header;
+  StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader());
 
-  if (target_header)
-    *target_header = old_header;
-  if (origin_header)
-    *origin_header = old_header;
-
-  clone(old_header.header, cid, oid, t, &new_origin_header,
-        &target_header->header);
+  clone(old_header->header, cid, oid, t, &new_origin_header,
+        &tmp->header);
 
-  if(origin_header)
-    origin_header->header = new_origin_header;
+  tmp->oid = oid;
+  tmp->cid = cid;
+  tmp->strip_size = old_header->strip_size;
+  tmp->max_size = old_header->max_size;
+  tmp->bits = old_header->bits;
+  old_header->header = new_origin_header;
 
-  if (target_header) {
-    target_header->oid = oid;
-    target_header->cid = cid;
-  }
+  if (target_header)
+    *target_header = tmp;
 }
 
-void StripObjectMap::rename_wrap(const coll_t &cid, const ghobject_t &oid,
+void StripObjectMap::rename_wrap(StripObjectHeaderRef old_header, const coll_t &cid, const ghobject_t &oid,
                                  KeyValueDB::Transaction t,
-                                 StripObjectHeader *header)
+                                 StripObjectHeaderRef *new_header)
 {
-  assert(header);
-  rename(header->header, cid, oid, t);
+  rename(old_header->header, cid, oid, t);
 
-  if (header) {
-    header->oid = oid;
-    header->cid = cid;
-  }
+  StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader());
+  tmp->strip_size = old_header->strip_size;
+  tmp->max_size = old_header->max_size;
+  tmp->bits = old_header->bits;
+  tmp->header = old_header->header;
+  tmp->oid = oid;
+  tmp->cid = cid;
+
+  if (new_header)
+    *new_header = tmp;
+
+  old_header->header = Header();
+  old_header->deleted = true;
 }
 
-int StripObjectMap::get_values_with_header(const StripObjectHeader &header,
+int StripObjectMap::get_values_with_header(const StripObjectHeaderRef header,
                                            const string &prefix,
                                            const set<string> &keys,
                                            map<string, bufferlist> *out)
 {
-  return scan(header.header, prefix, keys, 0, out);
+  return scan(header->header, prefix, keys, 0, out);
 }
 
-int StripObjectMap::get_keys_with_header(const StripObjectHeader &header,
+int StripObjectMap::get_keys_with_header(const StripObjectHeaderRef header,
                                          const string &prefix,
                                          set<string> *keys)
 {
-  ObjectMap::ObjectMapIterator iter = _get_iterator(header.header, prefix);
+  ObjectMap::ObjectMapIterator iter = _get_iterator(header->header, prefix);
   for (; iter->valid(); iter->next()) {
     if (iter->status())
       return iter->status();
@@ -253,10 +246,10 @@ int StripObjectMap::get_keys_with_header(const StripObjectHeader &header,
   return 0;
 }
 
-int StripObjectMap::get_with_header(const StripObjectHeader &header,
+int StripObjectMap::get_with_header(const StripObjectHeaderRef header,
                         const string &prefix, map<string, bufferlist> *out)
 {
-  ObjectMap::ObjectMapIterator iter = _get_iterator(header.header, prefix);
+  ObjectMap::ObjectMapIterator iter = _get_iterator(header->header, prefix);
   for (iter->seek_to_first(); iter->valid(); iter->next()) {
     if (iter->status())
       return iter->status();
@@ -265,52 +258,36 @@ int StripObjectMap::get_with_header(const StripObjectHeader &header,
 
   return 0;
 }
-// =========== KeyValueStore::SubmitManager Implementation ==============
-
-uint64_t KeyValueStore::SubmitManager::op_submit_start()
-{
-  lock.Lock();
-  uint64_t op = ++op_seq;
-  dout(10) << "op_submit_start " << op << dendl;
-  return op;
-}
-
-void KeyValueStore::SubmitManager::op_submit_finish(uint64_t op)
-{
-  dout(10) << "op_submit_finish " << op << dendl;
-  if (op != op_submitted + 1) {
-      dout(0) << "op_submit_finish " << op << " expected " << (op_submitted + 1)
-          << ", OUT OF ORDER" << dendl;
-      assert(0 == "out of order op_submit_finish");
-  }
-  op_submitted = op;
-  lock.Unlock();
-}
-
 
 // ========= KeyValueStore::BufferTransaction Implementation ============
 
 int KeyValueStore::BufferTransaction::lookup_cached_header(
     const coll_t &cid, const ghobject_t &oid,
-    StripObjectMap::StripObjectHeader **strip_header,
+    StripObjectMap::StripObjectHeaderRef *strip_header,
     bool create_if_missing)
 {
-  StripObjectMap::StripObjectHeader header;
+  StripObjectMap::StripObjectHeaderRef header;
   int r = 0;
 
   StripHeaderMap::iterator it = strip_headers.find(make_pair(cid, oid));
   if (it != strip_headers.end()) {
-    if (it->second.deleted)
+
+    if (!it->second->deleted) {
+      if (strip_header)
+        *strip_header = it->second;
+      return 0;
+    } else if (!create_if_missing) {
       return -ENOENT;
+    }
 
-    if (strip_header)
-      *strip_header = &it->second;
-    return 0;
+    // If (it->second.deleted && create_if_missing) go down
+    r = -ENOENT;
+  } else {
+    r = store->backend->lookup_strip_header(cid, oid, &header);
   }
 
-  r = store->backend->lookup_strip_header(cid, oid, header);
-  if (r < 0 && create_if_missing) {
-    r = store->backend->create_strip_header(cid, oid, header, t);
+  if (r == -ENOENT && create_if_missing) {
+    r = store->backend->create_strip_header(cid, oid, &header, t);
   }
 
   if (r < 0) {
@@ -321,21 +298,21 @@ int KeyValueStore::BufferTransaction::lookup_cached_header(
 
   strip_headers[make_pair(cid, oid)] = header;
   if (strip_header)
-    *strip_header = &strip_headers[make_pair(cid, oid)];
+    *strip_header = strip_headers[make_pair(cid, oid)];
   return r;
 }
 
 int KeyValueStore::BufferTransaction::get_buffer_keys(
-    StripObjectMap::StripObjectHeader &strip_header, const string &prefix,
+    StripObjectMap::StripObjectHeaderRef strip_header, const string &prefix,
     const set<string> &keys, map<string, bufferlist> *out)
 {
   set<string> need_lookup;
 
   for (set<string>::iterator it = keys.begin(); it != keys.end(); ++it) {
     map<pair<string, string>, bufferlist>::iterator i =
-        strip_header.buffers.find(make_pair(prefix, *it));
+        strip_header->buffers.find(make_pair(prefix, *it));
 
-    if (i != strip_header.buffers.end()) {
+    if (i != strip_header->buffers.end()) {
       (*out)[*it].swap(i->second);
     } else {
       need_lookup.insert(*it);
@@ -346,8 +323,8 @@ int KeyValueStore::BufferTransaction::get_buffer_keys(
     int r = store->backend->get_values_with_header(strip_header, prefix,
                                                    need_lookup, out);
     if (r < 0) {
-      dout(10) << __func__  << " " << strip_header.cid << "/"
-               << strip_header.oid << " " << " r = " << r << dendl;
+      dout(10) << __func__  << " " << strip_header->cid << "/"
+               << strip_header->oid << " " << " r = " << r << dendl;
       return r;
     }
   }
@@ -356,78 +333,77 @@ int KeyValueStore::BufferTransaction::get_buffer_keys(
 }
 
 void KeyValueStore::BufferTransaction::set_buffer_keys(
-     StripObjectMap::StripObjectHeader &strip_header,
+     StripObjectMap::StripObjectHeaderRef strip_header,
      const string &prefix, map<string, bufferlist> &values)
 {
-  store->backend->set_keys(strip_header.header, prefix, values, t);
+  store->backend->set_keys(strip_header->header, prefix, values, t);
 
   for (map<string, bufferlist>::iterator iter = values.begin();
        iter != values.end(); ++iter) {
-    strip_header.buffers[make_pair(prefix, iter->first)].swap(iter->second);
+    strip_header->buffers[make_pair(prefix, iter->first)].swap(iter->second);
   }
 }
 
 int KeyValueStore::BufferTransaction::remove_buffer_keys(
-     StripObjectMap::StripObjectHeader &strip_header, const string &prefix,
+     StripObjectMap::StripObjectHeaderRef strip_header, const string &prefix,
      const set<string> &keys)
 {
   for (set<string>::iterator iter = keys.begin(); iter != keys.end(); ++iter) {
-    strip_header.buffers[make_pair(prefix, *iter)] = bufferlist();
+    strip_header->buffers[make_pair(prefix, *iter)] = bufferlist();
   }
 
-  return store->backend->rm_keys(strip_header.header, prefix, keys, t);
+  return store->backend->rm_keys(strip_header->header, prefix, keys, t);
 }
 
 void KeyValueStore::BufferTransaction::clear_buffer_keys(
-     StripObjectMap::StripObjectHeader &strip_header, const string &prefix)
+     StripObjectMap::StripObjectHeaderRef strip_header, const string &prefix)
 {
-  for (map<pair<string, string>, bufferlist>::iterator iter = strip_header.buffers.begin();
-       iter != strip_header.buffers.end(); ++iter) {
+  for (map<pair<string, string>, bufferlist>::iterator iter = strip_header->buffers.begin();
+       iter != strip_header->buffers.end(); ++iter) {
     if (iter->first.first == prefix)
       iter->second = bufferlist();
   }
 }
 
 int KeyValueStore::BufferTransaction::clear_buffer(
-     StripObjectMap::StripObjectHeader &strip_header)
+     StripObjectMap::StripObjectHeaderRef strip_header)
 {
-  strip_header.deleted = true;
+  strip_header->deleted = true;
 
-  return store->backend->clear(strip_header.header, t);
+  InvalidateCacheContext *c = new InvalidateCacheContext(store, strip_header->cid, strip_header->oid);
+  finishes.push_back(c);
+  return store->backend->clear(strip_header->header, t);
 }
 
 void KeyValueStore::BufferTransaction::clone_buffer(
-    StripObjectMap::StripObjectHeader &old_header,
+    StripObjectMap::StripObjectHeaderRef old_header,
     const coll_t &cid, const ghobject_t &oid)
 {
   // Remove target ahead to avoid dead lock
   strip_headers.erase(make_pair(cid, oid));
 
-  StripObjectMap::StripObjectHeader new_origin_header, new_target_header;
+  StripObjectMap::StripObjectHeaderRef new_target_header;
 
-  store->backend->clone_wrap(old_header, cid, oid, t,
-                             &new_origin_header, &new_target_header);
+  store->backend->clone_wrap(old_header, cid, oid, t, &new_target_header);
 
   // FIXME: Lacking of lock for origin header(now become parent), it will
   // cause other operation can get the origin header while submitting
   // transactions
-  strip_headers[make_pair(cid, old_header.oid)] = new_origin_header;
   strip_headers[make_pair(cid, oid)] = new_target_header;
 }
 
 void KeyValueStore::BufferTransaction::rename_buffer(
-    StripObjectMap::StripObjectHeader &old_header,
+    StripObjectMap::StripObjectHeaderRef old_header,
     const coll_t &cid, const ghobject_t &oid)
 {
-  if (store->backend->check_spos(old_header, spos))
-    return ;
-
   // FIXME: Lacking of lock for origin header, it will cause other operation
   // can get the origin header while submitting transactions
-  store->backend->rename_wrap(cid, oid, t, &old_header);
+  StripObjectMap::StripObjectHeaderRef new_header;
+  store->backend->rename_wrap(old_header, cid, oid, t, &new_header);
 
-  strip_headers.erase(make_pair(old_header.cid, old_header.oid));
-  strip_headers[make_pair(cid, oid)] = old_header;
+  InvalidateCacheContext *c = new InvalidateCacheContext(store, old_header->cid, old_header->oid);
+  finishes.push_back(c);
+  strip_headers[make_pair(cid, oid)] = new_header;
 }
 
 int KeyValueStore::BufferTransaction::submit_transaction()
@@ -436,25 +412,27 @@ int KeyValueStore::BufferTransaction::submit_transaction()
 
   for (StripHeaderMap::iterator header_iter = strip_headers.begin();
        header_iter != strip_headers.end(); ++header_iter) {
-    StripObjectMap::StripObjectHeader header = header_iter->second;
+    StripObjectMap::StripObjectHeaderRef header = header_iter->second;
 
-    if (store->backend->check_spos(header, spos))
+    if (header->deleted)
       continue;
 
-    if (header.deleted)
-      continue;
+    r = store->backend->save_strip_header(header, t);
 
-    r = store->backend->save_strip_header(header, spos, t);
     if (r < 0) {
       dout(10) << __func__ << " save strip header failed " << dendl;
       goto out;
     }
   }
 
-out:
+  r = store->backend->submit_transaction(t);
+  for (list<Context*>::iterator it = finishes.begin(); it != finishes.end(); ++it) {
+    (*it)->complete(r);
+  }
 
+out:
   dout(5) << __func__ << " r = " << r << dendl;
-  return store->backend->submit_transaction(t);
+  return r;
 }
 
 // =========== KeyValueStore Intern Helper Implementation ==============
@@ -495,7 +473,7 @@ KeyValueStore::KeyValueStore(const std::string &base,
   ObjectStore(base),
   internal_name(name),
   basedir(base),
-  fsid_fd(-1), op_fd(-1), current_fd(-1),
+  fsid_fd(-1), current_fd(-1),
   kv_type(KV_TYPE_NONE),
   backend(NULL),
   ondisk_finisher(g_ceph_context),
@@ -906,10 +884,6 @@ int KeyValueStore::umount()
     VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
     fsid_fd = -1;
   }
-  if (op_fd >= 0) {
-    VOID_TEMP_FAILURE_RETRY(::close(op_fd));
-    op_fd = -1;
-  }
   if (current_fd >= 0) {
     VOID_TEMP_FAILURE_RETRY(::close(current_fd));
     current_fd = -1;
@@ -963,14 +937,9 @@ int KeyValueStore::queue_transactions(Sequencer *posr, list<Transaction*> &tls,
 
   Op *o = build_op(tls, ondisk, onreadable, onreadable_sync, osd_op);
   op_queue_reserve_throttle(o, handle);
-  uint64_t op = submit_manager.op_submit_start();
-  o->op = op;
-  dout(5) << "queue_transactions (trailing journal) " << op << " "
-          << tls <<dendl;
+  dout(5) << "queue_transactions (trailing journal) " << " " << tls <<dendl;
   queue_op(osr, o);
 
-  submit_manager.op_submit_finish(op);
-
   return 0;
 }
 
@@ -1088,7 +1057,8 @@ void KeyValueStore::_do_op(OpSequencer *osr, ThreadPool::TPHandle &handle)
 
 void KeyValueStore::_finish_op(OpSequencer *osr)
 {
-  Op *o = osr->dequeue();
+  list<Context*> to_queue;
+  Op *o = osr->dequeue(&to_queue);
 
   dout(10) << "_finish_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << dendl;
   osr->apply_lock.Unlock();  // locked in _do_op
@@ -1102,6 +1072,7 @@ void KeyValueStore::_finish_op(OpSequencer *osr)
     o->onreadable_sync->complete(0);
   }
   op_finisher.queue(o->onreadable);
+  op_finisher.queue(to_queue);
   delete o;
 }
 
@@ -1126,13 +1097,12 @@ int KeyValueStore::_do_transactions(list<Transaction*> &tls, uint64_t op_seq,
   }
 
   int trans_num = 0;
-  SequencerPosition spos(op_seq, trans_num, 0);
-  BufferTransaction bt(this, spos);
+  BufferTransaction bt(this);
 
   for (list<Transaction*>::iterator p = tls.begin();
        p != tls.end();
        ++p, trans_num++) {
-    r = _do_transaction(**p, bt, spos, handle);
+    r = _do_transaction(**p, bt, handle);
     if (r < 0)
       break;
     if (handle)
@@ -1149,12 +1119,12 @@ int KeyValueStore::_do_transactions(list<Transaction*> &tls, uint64_t op_seq,
 
 unsigned KeyValueStore::_do_transaction(Transaction& transaction,
                                         BufferTransaction &t,
-                                        SequencerPosition& spos,
                                         ThreadPool::TPHandle *handle)
 {
   dout(10) << "_do_transaction on " << &transaction << dendl;
 
   Transaction::iterator i = transaction.begin();
+  uint64_t op_num = 0;
 
   while (i.have_op()) {
     if (handle)
@@ -1449,7 +1419,13 @@ unsigned KeyValueStore::_do_transaction(Transaction& transaction,
       break;
 
     case Transaction::OP_SETALLOCHINT:
-      // TODO: can kvstore make use of the hint?
+      {
+        // TODO: can kvstore make use of the hint?
+        coll_t cid(i.get_cid());
+        ghobject_t oid = i.get_oid();
+        (void)i.get_length();  // discard result
+        (void)i.get_length();  // discard result
+      }
       break;
 
     default:
@@ -1487,8 +1463,7 @@ unsigned KeyValueStore::_do_transaction(Transaction& transaction,
         }
 
         dout(0) << " error " << cpp_strerror(r) << " not handled on operation "
-                << op << " (" << spos << ", or op " << spos.op
-                << ", counting from 0)" << dendl;
+                << op << " op " << op_num << ", counting from 0)" << dendl;
         dout(0) << msg << dendl;
         dout(0) << " transaction dump:\n";
         JSONFormatter f(true);
@@ -1505,7 +1480,7 @@ unsigned KeyValueStore::_do_transaction(Transaction& transaction,
       }
     }
 
-    spos.op++;
+    op_num++;
   }
 
   return 0;  // FIXME count errors
@@ -1520,9 +1495,9 @@ bool KeyValueStore::exists(coll_t cid, const ghobject_t& oid)
   dout(10) << __func__ << "collection: " << cid << " object: " << oid
            << dendl;
   int r;
-  StripObjectMap::StripObjectHeader header;
+  StripObjectMap::StripObjectHeaderRef header;
 
-  r = backend->lookup_strip_header(cid, oid, header);
+  r = backend->lookup_strip_header(cid, oid, &header);
   if (r < 0) {
     return false;
   }
@@ -1535,42 +1510,42 @@ int KeyValueStore::stat(coll_t cid, const ghobject_t& oid,
 {
   dout(10) << "stat " << cid << "/" << oid << dendl;
 
-  StripObjectMap::StripObjectHeader header;
+  StripObjectMap::StripObjectHeaderRef header;
 
-  int r = backend->lookup_strip_header(cid, oid, header);
+  int r = backend->lookup_strip_header(cid, oid, &header);
   if (r < 0) {
     dout(10) << "stat " << cid << "/" << oid << "=" << r << dendl;
     return -ENOENT;
   }
 
-  st->st_blocks = header.max_size / header.strip_size;
-  if (header.max_size % header.strip_size)
+  st->st_blocks = header->max_size / header->strip_size;
+  if (header->max_size % header->strip_size)
     st->st_blocks++;
   st->st_nlink = 1;
-  st->st_size = header.max_size;
-  st->st_blksize = header.strip_size;
+  st->st_size = header->max_size;
+  st->st_blksize = header->strip_size;
 
   return r;
 }
 
-int KeyValueStore::_generic_read(StripObjectMap::StripObjectHeader &header,
+int KeyValueStore::_generic_read(StripObjectMap::StripObjectHeaderRef header,
                                  uint64_t offset, size_t len, bufferlist& bl,
                                  bool allow_eio, BufferTransaction *bt)
 {
-  if (header.max_size < offset) {
-    dout(10) << __func__ << " " << header.cid << "/" << header.oid << ")"
+  if (header->max_size < offset) {
+    dout(10) << __func__ << " " << header->cid << "/" << header->oid << ")"
              << " offset exceed the length of bl"<< dendl;
     return 0;
   }
 
   if (len == 0)
-    len = header.max_size - offset;
+    len = header->max_size - offset;
 
-  if (offset + len > header.max_size)
-    len = header.max_size - offset;
+  if (offset + len > header->max_size)
+    len = header->max_size - offset;
 
   vector<StripObjectMap::StripExtent> extents;
-  StripObjectMap::file_to_extents(offset, len, header.strip_size,
+  StripObjectMap::file_to_extents(offset, len, header->strip_size,
                                   extents);
   map<string, bufferlist> out;
   set<string> keys;
@@ -1580,23 +1555,23 @@ int KeyValueStore::_generic_read(StripObjectMap::StripObjectHeader &header,
     bufferlist old;
     string key = strip_object_key(iter->no);
 
-    if (bt && header.buffers.count(make_pair(OBJECT_STRIP_PREFIX, key))) {
+    if (bt && header->buffers.count(make_pair(OBJECT_STRIP_PREFIX, key))) {
       // use strip_header buffer
-      assert(header.bits[iter->no]);
-      out[key] = header.buffers[make_pair(OBJECT_STRIP_PREFIX, key)];
-    } else if (header.bits[iter->no]) {
+      assert(header->bits[iter->no]);
+      out[key] = header->buffers[make_pair(OBJECT_STRIP_PREFIX, key)];
+    } else if (header->bits[iter->no]) {
       keys.insert(key);
     }
   }
 
   int r = backend->get_values_with_header(header, OBJECT_STRIP_PREFIX, keys, &out);
   if (r < 0) {
-    dout(10) << __func__ << " " << header.cid << "/" << header.oid << " "
+    dout(10) << __func__ << " " << header->cid << "/" << header->oid << " "
              << offset << "~" << len << " = " << r << dendl;
     return r;
   } else if (out.size() != keys.size()) {
     dout(0) << __func__ << " broken header or missing data in backend "
-            << header.cid << "/" << header.oid << " " << offset << "~"
+            << header->cid << "/" << header->oid << " " << offset << "~"
             << len << " = " << r << dendl;
     return -EBADF;
   }
@@ -1605,8 +1580,8 @@ int KeyValueStore::_generic_read(StripObjectMap::StripObjectHeader &header,
        iter != extents.end(); ++iter) {
     string key = strip_object_key(iter->no);
 
-    if (header.bits[iter->no]) {
-      if (iter->len == header.strip_size) {
+    if (header->bits[iter->no]) {
+      if (iter->len == header->strip_size) {
         bl.claim_append(out[key]);
       } else {
         out[key].copy(iter->offset, iter->len, bl);
@@ -1616,7 +1591,7 @@ int KeyValueStore::_generic_read(StripObjectMap::StripObjectHeader &header,
     }
   }
 
-  dout(10) << __func__ << " " << header.cid << "/" << header.oid << " "
+  dout(10) << __func__ << " " << header->cid << "/" << header->oid << " "
            << offset << "~" << bl.length() << "/" << len << " r = " << r
            << dendl;
 
@@ -1630,9 +1605,9 @@ int KeyValueStore::read(coll_t cid, const ghobject_t& oid, uint64_t offset,
   dout(15) << __func__ << " " << cid << "/" << oid << " " << offset << "~"
            << len << dendl;
 
-  StripObjectMap::StripObjectHeader header;
+  StripObjectMap::StripObjectHeaderRef header;
 
-  int r = backend->lookup_strip_header(cid, oid, header);
+  int r = backend->lookup_strip_header(cid, oid, &header);
 
   if (r < 0) {
     dout(10) << __func__ << " " << cid << "/" << oid << " " << offset << "~"
@@ -1649,9 +1624,9 @@ int KeyValueStore::fiemap(coll_t cid, const ghobject_t& oid,
   dout(10) << __func__ << " " << cid << " " << oid << " " << offset << "~"
            << len << dendl;
   int r;
-  StripObjectMap::StripObjectHeader header;
+  StripObjectMap::StripObjectHeaderRef header;
 
-  r = backend->lookup_strip_header(cid, oid, header);
+  r = backend->lookup_strip_header(cid, oid, &header);
   if (r < 0) {
     dout(10) << "fiemap " << cid << "/" << oid << " " << offset << "~" << len
              << " failed to get header: r = " << r << dendl;
@@ -1659,13 +1634,14 @@ int KeyValueStore::fiemap(coll_t cid, const ghobject_t& oid,
   }
 
   vector<StripObjectMap::StripExtent> extents;
-  StripObjectMap::file_to_extents(offset, len, header.strip_size,
+  StripObjectMap::file_to_extents(offset, len, header->strip_size,
                                   extents);
 
   map<uint64_t, uint64_t> m;
   for (vector<StripObjectMap::StripExtent>::iterator iter = extents.begin();
        iter != extents.end(); ++iter) {
-    m[iter->offset] = iter->len;
+    uint64_t off = iter->no * header->strip_size + iter->offset;
+    m[off] = iter->len;
   }
   ::encode(m, bl);
   return 0;
@@ -1677,7 +1653,7 @@ int KeyValueStore::_remove(coll_t cid, const ghobject_t& oid,
   dout(15) << __func__ << " " << cid << "/" << oid << dendl;
 
   int r;
-  StripObjectMap::StripObjectHeader *header;
+  StripObjectMap::StripObjectHeaderRef header;
 
   r = t.lookup_cached_header(cid, oid, &header, false);
   if (r < 0) {
@@ -1686,7 +1662,9 @@ int KeyValueStore::_remove(coll_t cid, const ghobject_t& oid,
     return r;
   }
 
-  r = t.clear_buffer(*header);
+  header->max_size = 0;
+  header->bits.clear();
+  r = t.clear_buffer(header);
 
   dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl;
   return r;
@@ -1699,7 +1677,7 @@ int KeyValueStore::_truncate(coll_t cid, const ghobject_t& oid, uint64_t size,
            << dendl;
 
   int r;
-  StripObjectMap::StripObjectHeader *header;
+  StripObjectMap::StripObjectHeaderRef header;
 
   r = t.lookup_cached_header(cid, oid, &header, false);
   if (r < 0) {
@@ -1725,7 +1703,7 @@ int KeyValueStore::_truncate(coll_t cid, const ghobject_t& oid, uint64_t size,
       string key = strip_object_key(iter->no);
 
       lookup_keys.insert(key);
-      r = t.get_buffer_keys(*header, OBJECT_STRIP_PREFIX,
+      r = t.get_buffer_keys(header, OBJECT_STRIP_PREFIX,
                             lookup_keys, &values);
       if (r < 0) {
         dout(10) << __func__ << " " << cid << "/" << oid << " "
@@ -1743,7 +1721,7 @@ int KeyValueStore::_truncate(coll_t cid, const ghobject_t& oid, uint64_t size,
       assert(value.length() == header->strip_size);
       value.swap(values[key]);
 
-      t.set_buffer_keys(*header, OBJECT_STRIP_PREFIX, values);
+      t.set_buffer_keys(header, OBJECT_STRIP_PREFIX, values);
       ++iter;
     }
 
@@ -1754,7 +1732,7 @@ int KeyValueStore::_truncate(coll_t cid, const ghobject_t& oid, uint64_t size,
         header->bits[iter->no] = 0;
       }
     }
-    r = t.remove_buffer_keys(*header, OBJECT_STRIP_PREFIX, keys);
+    r = t.remove_buffer_keys(header, OBJECT_STRIP_PREFIX, keys);
     if (r < 0) {
       dout(10) << __func__ << " " << cid << "/" << oid << " "
                << size << " = " << r << dendl;
@@ -1776,7 +1754,7 @@ int KeyValueStore::_touch(coll_t cid, const ghobject_t& oid,
   dout(15) << __func__ << " " << cid << "/" << oid << dendl;
 
   int r;
-  StripObjectMap::StripObjectHeader *header;
+  StripObjectMap::StripObjectHeaderRef header;
 
   r = t.lookup_cached_header(cid, oid, &header, true);
   if (r < 0) {
@@ -1790,7 +1768,7 @@ int KeyValueStore::_touch(coll_t cid, const ghobject_t& oid,
   return r;
 }
 
-int KeyValueStore::_generic_write(StripObjectMap::StripObjectHeader &header,
+int KeyValueStore::_generic_write(StripObjectMap::StripObjectHeaderRef header,
                                   uint64_t offset, size_t len,
                                   const bufferlist& bl, BufferTransaction &t,
                                   bool replica)
@@ -1798,34 +1776,34 @@ int KeyValueStore::_generic_write(StripObjectMap::StripObjectHeader &header,
   if (len > bl.length())
     len = bl.length();
 
-  if (len + offset > header.max_size) {
-    header.max_size = len + offset;
-    header.bits.resize(header.max_size/header.strip_size+1);
+  if (len + offset > header->max_size) {
+    header->max_size = len + offset;
+    header->bits.resize(header->max_size/header->strip_size+1);
   }
 
   vector<StripObjectMap::StripExtent> extents;
-  StripObjectMap::file_to_extents(offset, len, header.strip_size,
+  StripObjectMap::file_to_extents(offset, len, header->strip_size,
                                   extents);
 
   map<string, bufferlist> out;
   set<string> keys;
   for (vector<StripObjectMap::StripExtent>::iterator iter = extents.begin();
        iter != extents.end(); ++iter) {
-    if (header.bits[iter->no] && !(iter->offset == 0 &&
-                                   iter->len == header.strip_size))
+    if (header->bits[iter->no] && !(iter->offset == 0 &&
+                                   iter->len == header->strip_size))
       keys.insert(strip_object_key(iter->no));
   }
 
   int r = t.get_buffer_keys(header, OBJECT_STRIP_PREFIX, keys, &out);
   if (r < 0) {
-    dout(10) << __func__ << " failed to get value " << header.cid << "/"
-              << header.oid << " " << offset << "~" << len << " = " << r
+    dout(10) << __func__ << " failed to get value " << header->cid << "/"
+              << header->oid << " " << offset << "~" << len << " = " << r
               << dendl;
     return r;
   } else if (keys.size() != out.size()) {
     // Error on header.bits or the corresponding key/value pair is missing
     dout(0) << __func__ << " broken header or missing data in backend "
-            << header.cid << "/" << header.oid << " " << offset << "~"
+            << header->cid << "/" << header->oid << " " << offset << "~"
             << len << " = " << r << dendl;
     return -EBADF;
   }
@@ -1836,19 +1814,19 @@ int KeyValueStore::_generic_write(StripObjectMap::StripObjectHeader &header,
        iter != extents.end(); ++iter) {
     bufferlist value;
     string key = strip_object_key(iter->no);
-    if (header.bits[iter->no]) {
-      if (iter->offset == 0 && iter->len == header.strip_size) {
+    if (header->bits[iter->no]) {
+      if (iter->offset == 0 && iter->len == header->strip_size) {
         bl.copy(bl_offset, iter->len, value);
         bl_offset += iter->len;
       } else {
-        assert(out[key].length() == header.strip_size);
+        assert(out[key].length() == header->strip_size);
 
         out[key].copy(0, iter->offset, value);
         bl.copy(bl_offset, iter->len, value);
         bl_offset += iter->len;
 
-        if (value.length() != header.strip_size)
-          out[key].copy(value.length(), header.strip_size-value.length(),
+        if (value.length() != header->strip_size)
+          out[key].copy(value.length(), header->strip_size-value.length(),
                         value);
       }
     } else {
@@ -1857,18 +1835,18 @@ int KeyValueStore::_generic_write(StripObjectMap::StripObjectHeader &header,
       bl.copy(bl_offset, iter->len, value);
       bl_offset += iter->len;
 
-      if (value.length() < header.strip_size)
-        value.append_zero(header.strip_size-value.length());
+      if (value.length() < header->strip_size)
+        value.append_zero(header->strip_size-value.length());
 
-      header.bits[iter->no] = 1;
+      header->bits[iter->no] = 1;
     }
-    assert(value.length() == header.strip_size);
+    assert(value.length() == header->strip_size);
     values[key].swap(value);
   }
   assert(bl_offset == len);
 
   t.set_buffer_keys(header, OBJECT_STRIP_PREFIX, values);
-  dout(10) << __func__ << " " << header.cid << "/" << header.oid << " "
+  dout(10) << __func__ << " " << header->cid << "/" << header->oid << " "
            << offset << "~" << len << " = " << r << dendl;
 
   return r;
@@ -1882,7 +1860,7 @@ int KeyValueStore::_write(coll_t cid, const ghobject_t& oid,
            << len << dendl;
 
   int r;
-  StripObjectMap::StripObjectHeader *header;
+  StripObjectMap::StripObjectHeaderRef header;
 
   r = t.lookup_cached_header(cid, oid, &header, true);
   if (r < 0) {
@@ -1891,7 +1869,7 @@ int KeyValueStore::_write(coll_t cid, const ghobject_t& oid,
     return r;
   }
 
-  return _generic_write(*header, offset, len, bl, t, replica);
+  return _generic_write(header, offset, len, bl, t, replica);
 }
 
 int KeyValueStore::_zero(coll_t cid, const ghobject_t& oid, uint64_t offset,
@@ -1920,7 +1898,7 @@ int KeyValueStore::_clone(coll_t cid, const ghobject_t& oldoid,
     return 0;
 
   int r;
-  StripObjectMap::StripObjectHeader *old_header;
+  StripObjectMap::StripObjectHeaderRef old_header;
 
   r = t.lookup_cached_header(cid, oldoid, &old_header, false);
   if (r < 0) {
@@ -1929,7 +1907,7 @@ int KeyValueStore::_clone(coll_t cid, const ghobject_t& oldoid,
     return r;
   }
 
-  t.clone_buffer(*old_header, cid, newoid);
+  t.clone_buffer(old_header, cid, newoid);
 
   dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/"
            << newoid << " = " << r << dendl;
@@ -1948,7 +1926,7 @@ int KeyValueStore::_clone_range(coll_t cid, const ghobject_t& oldoid,
   int r;
   bufferlist bl;
 
-  StripObjectMap::StripObjectHeader *old_header, *new_header;
+  StripObjectMap::StripObjectHeaderRef old_header, new_header;
 
   r = t.lookup_cached_header(cid, oldoid, &old_header, false);
   if (r < 0) {
@@ -1966,11 +1944,11 @@ int KeyValueStore::_clone_range(coll_t cid, const ghobject_t& oldoid,
     return r;
   }
 
-  r = _generic_read(*old_header, srcoff, len, bl, &t);
+  r = _generic_read(old_header, srcoff, len, bl, &t);
   if (r < 0)
     goto out;
 
-  r = _generic_write(*new_header, dstoff, len, bl, t);
+  r = _generic_write(new_header, dstoff, len, bl, t);
 
  out:
   dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/"
@@ -1990,9 +1968,17 @@ int KeyValueStore::getattr(coll_t cid, const ghobject_t& oid, const char *name,
   int r;
   map<string, bufferlist> got;
   set<string> to_get;
+  StripObjectMap::StripObjectHeaderRef header;
 
   to_get.insert(string(name));
-  r = backend->get_values(cid, oid, OBJECT_XATTR, to_get, &got);
+
+  r = backend->lookup_strip_header(cid, oid, &header);
+  if (r < 0) {
+    dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
+    return r;
+  }
+
+  r = backend->get_values_with_header(header, OBJECT_XATTR, to_get, &got);
   if (r < 0 && r != -ENOENT) {
     dout(10) << __func__ << " get_xattrs err r =" << r << dendl;
     goto out;
@@ -2056,7 +2042,7 @@ int KeyValueStore::_setattrs(coll_t cid, const ghobject_t& oid,
 
   int r;
 
-  StripObjectMap::StripObjectHeader *header;
+  StripObjectMap::StripObjectHeaderRef header;
   map<string, bufferlist> attrs;
 
   r = t.lookup_cached_header(cid, oid, &header, false);
@@ -2068,7 +2054,7 @@ int KeyValueStore::_setattrs(coll_t cid, const ghobject_t& oid,
     attrs[it->first].push_back(it->second);
   }
 
-  t.set_buffer_keys(*header, OBJECT_XATTR, attrs);
+  t.set_buffer_keys(header, OBJECT_XATTR, attrs);
 
 out:
   dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl;
@@ -2084,7 +2070,7 @@ int KeyValueStore::_rmattr(coll_t cid, const ghobject_t& oid, const char *name,
 
   int r;
   set<string> to_remove;
-  StripObjectMap::StripObjectHeader *header;
+  StripObjectMap::StripObjectHeaderRef header;
 
   r = t.lookup_cached_header(cid, oid, &header, false);
   if (r < 0) {
@@ -2094,7 +2080,7 @@ int KeyValueStore::_rmattr(coll_t cid, const ghobject_t& oid, const char *name,
   }
 
   to_remove.insert(string(name));
-  r = t.remove_buffer_keys(*header, OBJECT_XATTR, to_remove);
+  r = t.remove_buffer_keys(header, OBJECT_XATTR, to_remove);
 
   dout(10) << __func__ << " " << cid << "/" << oid << " '" << name << "' = "
            << r << dendl;
@@ -2109,7 +2095,7 @@ int KeyValueStore::_rmattrs(coll_t cid, const ghobject_t& oid,
   int r;
   set<string> attrs;
 
-  StripObjectMap::StripObjectHeader *header;
+  StripObjectMap::StripObjectHeaderRef header;
 
   r = t.lookup_cached_header(cid, oid, &header, false);
   if (r < 0) {
@@ -2118,14 +2104,14 @@ int KeyValueStore::_rmattrs(coll_t cid, const ghobject_t& oid,
     return r;
   }
 
-  r = backend->get_keys_with_header(*header, OBJECT_XATTR, &attrs);
+  r = backend->get_keys_with_header(header, OBJECT_XATTR, &attrs);
   if (r < 0 && r != -ENOENT) {
     dout(10) << __func__ << " could not get attrs r = " << r << dendl;
     return r;
   }
 
-  r = t.remove_buffer_keys(*header, OBJECT_XATTR, attrs);
-  t.clear_buffer_keys(*header, OBJECT_XATTR);
+  r = t.remove_buffer_keys(header, OBJECT_XATTR, attrs);
+  t.clear_buffer_keys(header, OBJECT_XATTR);
 
   dout(10) << __func__ <<  " " << cid << "/" << oid << " = " << r << dendl;
   return r;
@@ -2168,10 +2154,18 @@ int KeyValueStore::collection_getattr(coll_t c, const char *name,
 
   set<string> keys;
   map<string, bufferlist> out;
+  StripObjectMap::StripObjectHeaderRef header;
+
   keys.insert(string(name));
 
-  int r = backend->get_values(get_coll_for_coll(), make_ghobject_for_coll(c),
-                              COLLECTION_ATTR, keys, &out);
+  int r = backend->lookup_strip_header(get_coll_for_coll(),
+                                       make_ghobject_for_coll(c), &header);
+  if (r < 0) {
+    dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
+    return r;
+  }
+
+  r = backend->get_values_with_header(header, COLLECTION_ATTR, keys, &out);
   if (r < 0) {
     dout(10) << __func__ << " could not get key" << string(name) << dendl;
     r = -EINVAL;
@@ -2192,14 +2186,21 @@ int KeyValueStore::collection_getattrs(coll_t cid,
 
   map<string, bufferlist> out;
   set<string> keys;
+  StripObjectMap::StripObjectHeaderRef header;
 
   for (map<string, bufferptr>::iterator it = aset.begin();
        it != aset.end(); ++it) {
       keys.insert(it->first);
   }
 
-  int r = backend->get_values(get_coll_for_coll(), make_ghobject_for_coll(cid),
-                              COLLECTION_ATTR, keys, &out);
+  int r = backend->lookup_strip_header(get_coll_for_coll(),
+                                       make_ghobject_for_coll(cid), &header);
+  if (r < 0) {
+    dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
+    return r;
+  }
+
+  r = backend->get_values_with_header(header, COLLECTION_ATTR, keys, &out);
   if (r < 0) {
     dout(10) << __func__ << " could not get keys" << dendl;
     r = -EINVAL;
@@ -2227,7 +2228,7 @@ int KeyValueStore::_collection_setattr(coll_t c, const char *name,
   int r;
   bufferlist bl;
   map<string, bufferlist> out;
-  StripObjectMap::StripObjectHeader *header;
+  StripObjectMap::StripObjectHeaderRef header;
 
   r = t.lookup_cached_header(get_coll_for_coll(),
                              make_ghobject_for_coll(c),
@@ -2240,7 +2241,7 @@ int KeyValueStore::_collection_setattr(coll_t c, const char *name,
   bl.append(reinterpret_cast<const char*>(value), size);
   out.insert(make_pair(string(name), bl));
 
-  t.set_buffer_keys(*header, COLLECTION_ATTR, out);
+  t.set_buffer_keys(header, COLLECTION_ATTR, out);
 
   dout(10) << __func__ << " " << c << " '"
            << name << "' len " << size << " = " << r << dendl;
@@ -2254,7 +2255,7 @@ int KeyValueStore::_collection_rmattr(coll_t c, const char *name,
 
   bufferlist bl;
   set<string> out;
-  StripObjectMap::StripObjectHeader *header;
+  StripObjectMap::StripObjectHeaderRef header;
 
   int r = t.lookup_cached_header(get_coll_for_coll(),
                                  make_ghobject_for_coll(c), &header, false);
@@ -2264,7 +2265,7 @@ int KeyValueStore::_collection_rmattr(coll_t c, const char *name,
   }
 
   out.insert(string(name));
-  r = t.remove_buffer_keys(*header, COLLECTION_ATTR, out);
+  r = t.remove_buffer_keys(header, COLLECTION_ATTR, out);
 
   dout(10) << __func__ << " " << c << " = " << r << dendl;
   return r;
@@ -2277,7 +2278,7 @@ int KeyValueStore::_collection_setattrs(coll_t cid,
   dout(15) << __func__ << " " << cid << dendl;
 
   map<string, bufferlist> attrs;
-  StripObjectMap::StripObjectHeader *header;
+  StripObjectMap::StripObjectHeaderRef header;
   int r = t.lookup_cached_header(get_coll_for_coll(),
                                  make_ghobject_for_coll(cid),
                                  &header, false);
@@ -2291,7 +2292,7 @@ int KeyValueStore::_collection_setattrs(coll_t cid,
     attrs[it->first].push_back(it->second);
   }
 
-  t.set_buffer_keys(*header, COLLECTION_ATTR, attrs);
+  t.set_buffer_keys(header, COLLECTION_ATTR, attrs);
 
   dout(10) << __func__ << " " << cid << " = " << r << dendl;
   return r;
@@ -2305,7 +2306,7 @@ int KeyValueStore::_create_collection(coll_t c, BufferTransaction &t)
   dout(15) << __func__ << " " << c << dendl;
 
   int r;
-  StripObjectMap::StripObjectHeader *header;
+  StripObjectMap::StripObjectHeaderRef header;
   bufferlist bl;
 
   r = t.lookup_cached_header(get_coll_for_coll(),
@@ -2330,7 +2331,7 @@ int KeyValueStore::_destroy_collection(coll_t c, BufferTransaction &t)
 
   int r;
   uint64_t modified_object = 0;
-  StripObjectMap::StripObjectHeader *header;
+  StripObjectMap::StripObjectHeaderRef header;
   vector<ghobject_t> oids;
 
   r = t.lookup_cached_header(get_coll_for_coll(), make_ghobject_for_coll(c),
@@ -2347,7 +2348,7 @@ int KeyValueStore::_destroy_collection(coll_t c, BufferTransaction &t)
       continue;
 
     modified_object++;
-    if (!iter->second.deleted) {
+    if (!iter->second->deleted) {
       r = -ENOTEMPTY;
       goto out;
     }
@@ -2369,7 +2370,7 @@ int KeyValueStore::_destroy_collection(coll_t c, BufferTransaction &t)
     }
   }
 
-  r = t.clear_buffer(*header);
+  r = t.clear_buffer(header);
 
 out:
   dout(10) << __func__ << " " << c << " = " << r << dendl;
@@ -2385,7 +2386,7 @@ int KeyValueStore::_collection_add(coll_t c, coll_t oldcid,
            << o << dendl;
 
   bufferlist bl;
-  StripObjectMap::StripObjectHeader *header, *old_header;
+  StripObjectMap::StripObjectHeaderRef header, old_header;
 
   int r = t.lookup_cached_header(oldcid, o, &old_header, false);
   if (r < 0) {
@@ -2400,13 +2401,13 @@ int KeyValueStore::_collection_add(coll_t c, coll_t oldcid,
     goto out;
   }
 
-  r = _generic_read(*old_header, 0, old_header->max_size, bl, &t);
+  r = _generic_read(old_header, 0, old_header->max_size, bl, &t);
   if (r < 0) {
     r = -EINVAL;
     goto out;
   }
 
-  r = _generic_write(*header, 0, bl.length(), bl, t);
+  r = _generic_write(header, 0, bl.length(), bl, t);
   if (r < 0) {
     r = -EINVAL;
   }
@@ -2425,7 +2426,7 @@ int KeyValueStore::_collection_move_rename(coll_t oldcid,
   dout(15) << __func__ << " " << c << "/" << o << " from " << oldcid << "/"
            << oldoid << dendl;
   int r;
-  StripObjectMap::StripObjectHeader *header;
+  StripObjectMap::StripObjectHeaderRef header;
 
   r = t.lookup_cached_header(c, o, &header, false);
   if (r == 0) {
@@ -2441,7 +2442,7 @@ int KeyValueStore::_collection_move_rename(coll_t oldcid,
     return r;
   }
 
-  t.rename_buffer(*header, c, o);
+  t.rename_buffer(header, c, o);
 
   dout(10) << __func__ << " " << c << "/" << o << " from " << oldcid << "/"
            << oldoid << " = " << r << dendl;
@@ -2453,7 +2454,7 @@ int KeyValueStore::_collection_remove_recursive(const coll_t &cid,
 {
   dout(15) << __func__ << " " << cid << dendl;
 
-  StripObjectMap::StripObjectHeader *header;
+  StripObjectMap::StripObjectHeaderRef header;
 
   int r = t.lookup_cached_header(get_coll_for_coll(),
                                  make_ghobject_for_coll(cid),
@@ -2478,7 +2479,7 @@ int KeyValueStore::_collection_remove_recursive(const coll_t &cid,
     }
   }
 
-  r = t.clear_buffer(*header);
+  r = t.clear_buffer(header);
 
   dout(10) << __func__ << " " << cid  << " r = " << r << dendl;
   return 0;
@@ -2490,7 +2491,7 @@ int KeyValueStore::_collection_rename(const coll_t &cid, const coll_t &ncid,
   dout(10) << __func__ << " origin cid " << cid << " new cid " << ncid
            << dendl;
 
-  StripObjectMap::StripObjectHeader *header;
+  StripObjectMap::StripObjectHeaderRef header;
 
   int r = t.lookup_cached_header(get_coll_for_coll(),
                                  make_ghobject_for_coll(ncid),
@@ -2532,7 +2533,7 @@ int KeyValueStore::_collection_rename(const coll_t &cid, const coll_t &ncid,
     current = next;
   }
 
-  t.rename_buffer(*header, get_coll_for_coll(), make_ghobject_for_coll(ncid));
+  t.rename_buffer(header, get_coll_for_coll(), make_ghobject_for_coll(ncid));
 
   dout(10) << __func__ << " origin cid " << cid << " new cid " << ncid
            << dendl;
@@ -2560,9 +2561,9 @@ bool KeyValueStore::collection_exists(coll_t c)
 {
   dout(10) << __func__ << " " << dendl;
 
-  StripObjectMap::StripObjectHeader header;
+  StripObjectMap::StripObjectHeaderRef header;
   int r = backend->lookup_strip_header(get_coll_for_coll(),
-                                       make_ghobject_for_coll(c), header);
+                                       make_ghobject_for_coll(c), &header);
   if (r < 0) {
     return false;
   }
@@ -2652,15 +2653,14 @@ int KeyValueStore::omap_get(coll_t c, const ghobject_t &hoid,
 {
   dout(15) << __func__ << " " << c << "/" << hoid << dendl;
 
-  StripObjectMap::StripObjectHeader header;
+  StripObjectMap::StripObjectHeaderRef header;
 
-  int r = backend->lookup_strip_header(c, hoid, header);
+  int r = backend->lookup_strip_header(c, hoid, &header);
   if (r < 0) {
     dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
     return r;
   }
 
-
   r = backend->get_with_header(header, OBJECT_OMAP, out);
   if (r < 0 && r != -ENOENT) {
     dout(10) << __func__ << " err r =" << r << dendl;
@@ -2692,9 +2692,16 @@ int KeyValueStore::omap_get_header(coll_t c, const ghobject_t &hoid,
 
   set<string> keys;
   map<string, bufferlist> got;
+  StripObjectMap::StripObjectHeaderRef header;
+
+  int r = backend->lookup_strip_header(c, hoid, &header);
+  if (r < 0) {
+    dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
+    return r;
+  }
 
   keys.insert(OBJECT_OMAP_HEADER_KEY);
-  int r = backend->get_values(c, hoid, OBJECT_OMAP_HEADER, keys, &got);
+  r = backend->get_values_with_header(header, OBJECT_OMAP_HEADER, keys, &got);
   if (r < 0 && r != -ENOENT) {
     dout(10) << __func__ << " err r =" << r << dendl;
     return r;
@@ -2712,7 +2719,14 @@ int KeyValueStore::omap_get_keys(coll_t c, const ghobject_t &hoid, set<string> *
 {
   dout(15) << __func__ << " " << c << "/" << hoid << dendl;
 
-  int r = backend->get_keys(c, hoid, OBJECT_OMAP, keys);
+  StripObjectMap::StripObjectHeaderRef header;
+  int r = backend->lookup_strip_header(c, hoid, &header);
+  if (r < 0) {
+    dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
+    return r;
+  }
+
+  r = backend->get_keys_with_header(header, OBJECT_OMAP, keys);
   if (r < 0 && r != -ENOENT) {
     return r;
   }
@@ -2725,7 +2739,14 @@ int KeyValueStore::omap_get_values(coll_t c, const ghobject_t &hoid,
 {
   dout(15) << __func__ << " " << c << "/" << hoid << dendl;
 
-  int r = backend->get_values(c, hoid, OBJECT_OMAP, keys, out);
+  StripObjectMap::StripObjectHeaderRef header;
+  int r = backend->lookup_strip_header(c, hoid, &header);
+  if (r < 0) {
+    dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
+    return r;
+  }
+
+  r = backend->get_values_with_header(header, OBJECT_OMAP, keys, out);
   if (r < 0 && r != -ENOENT) {
     return r;
   }
@@ -2756,7 +2777,7 @@ int KeyValueStore::_omap_clear(coll_t cid, const ghobject_t &hoid,
 {
   dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
 
-  StripObjectMap::StripObjectHeader *header;
+  StripObjectMap::StripObjectHeaderRef header;
 
   int r = t.lookup_cached_header(cid, hoid, &header, false);
   if (r < 0) {
@@ -2766,13 +2787,13 @@ int KeyValueStore::_omap_clear(coll_t cid, const ghobject_t &hoid,
   }
 
   set<string> keys;
-  r = backend->get_keys_with_header(*header, OBJECT_OMAP, &keys);
+  r = backend->get_keys_with_header(header, OBJECT_OMAP, &keys);
   if (r < 0 && r != -ENOENT) {
     dout(10) << __func__ << " could not get omap_keys r = " << r << dendl;
     return r;
   }
 
-  r = t.remove_buffer_keys(*header, OBJECT_OMAP, keys);
+  r = t.remove_buffer_keys(header, OBJECT_OMAP, keys);
   if (r < 0) {
     dout(10) << __func__ << " could not remove keys r = " << r << dendl;
     return r;
@@ -2780,13 +2801,13 @@ int KeyValueStore::_omap_clear(coll_t cid, const ghobject_t &hoid,
 
   keys.clear();
   keys.insert(OBJECT_OMAP_HEADER_KEY);
-  r = t.remove_buffer_keys(*header, OBJECT_OMAP_HEADER, keys);
+  r = t.remove_buffer_keys(header, OBJECT_OMAP_HEADER, keys);
   if (r < 0) {
     dout(10) << __func__ << " could not remove keys r = " << r << dendl;
     return r;
   }
 
-  t.clear_buffer_keys(*header, OBJECT_OMAP_HEADER);
+  t.clear_buffer_keys(header, OBJECT_OMAP_HEADER);
 
   dout(10) << __func__ << " " << cid << "/" << hoid << " r = " << r << dendl;
   return 0;
@@ -2798,7 +2819,7 @@ int KeyValueStore::_omap_setkeys(coll_t cid, const ghobject_t &hoid,
 {
   dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
 
-  StripObjectMap::StripObjectHeader *header;
+  StripObjectMap::StripObjectHeaderRef header;
 
   int r = t.lookup_cached_header(cid, hoid, &header, false);
   if (r < 0) {
@@ -2807,7 +2828,7 @@ int KeyValueStore::_omap_setkeys(coll_t cid, const ghobject_t &hoid,
     return r;
   }
 
-  t.set_buffer_keys(*header, OBJECT_OMAP, aset);
+  t.set_buffer_keys(header, OBJECT_OMAP, aset);
 
   return 0;
 }
@@ -2818,7 +2839,7 @@ int KeyValueStore::_omap_rmkeys(coll_t cid, const ghobject_t &hoid,
 {
   dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
 
-  StripObjectMap::StripObjectHeader *header;
+  StripObjectMap::StripObjectHeaderRef header;
 
   int r = t.lookup_cached_header(cid, hoid, &header, false);
   if (r < 0) {
@@ -2827,7 +2848,7 @@ int KeyValueStore::_omap_rmkeys(coll_t cid, const ghobject_t &hoid,
     return r;
   }
 
-  r = t.remove_buffer_keys(*header, OBJECT_OMAP, keys);
+  r = t.remove_buffer_keys(header, OBJECT_OMAP, keys);
 
   dout(10) << __func__ << " " << cid << "/" << hoid << " r = " << r << dendl;
   return r;
@@ -2861,7 +2882,7 @@ int KeyValueStore::_omap_setheader(coll_t cid, const ghobject_t &hoid,
   dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
 
   map<string, bufferlist> sets;
-  StripObjectMap::StripObjectHeader *header;
+  StripObjectMap::StripObjectHeaderRef header;
 
   int r = t.lookup_cached_header(cid, hoid, &header, false);
   if (r < 0) {
@@ -2871,7 +2892,7 @@ int KeyValueStore::_omap_setheader(coll_t cid, const ghobject_t &hoid,
   }
 
   sets[OBJECT_OMAP_HEADER_KEY] = bl;
-  t.set_buffer_keys(*header, OBJECT_OMAP_HEADER, sets);
+  t.set_buffer_keys(header, OBJECT_OMAP_HEADER, sets);
   return 0;
 }
 
@@ -2881,7 +2902,7 @@ int KeyValueStore::_split_collection(coll_t cid, uint32_t bits, uint32_t rem,
   {
     dout(15) << __func__ << " " << cid << " bits: " << bits << dendl;
 
-    StripObjectMap::StripObjectHeader *header;
+    StripObjectMap::StripObjectHeaderRef header;
 
     int r = t.lookup_cached_header(get_coll_for_coll(),
                                    make_ghobject_for_coll(cid),
diff --git a/src/os/KeyValueStore.h b/src/os/KeyValueStore.h
index d7b9c0a..bc36103 100644
--- a/src/os/KeyValueStore.h
+++ b/src/os/KeyValueStore.h
@@ -36,8 +36,8 @@ using namespace std;
 
 #include "common/Mutex.h"
 #include "GenericObjectMap.h"
-#include "SequencerPosition.h"
 #include "KeyValueDB.h"
+#include "common/random_cache.hpp"
 
 #include "include/uuid.h"
 
@@ -48,6 +48,8 @@ enum kvstore_types {
 };
 
 
+static uint64_t default_strip_size = 1024;
+
 class StripObjectMap: public GenericObjectMap {
  public:
 
@@ -65,7 +67,6 @@ class StripObjectMap: public GenericObjectMap {
     uint64_t strip_size;
     uint64_t max_size;
     vector<char> bits;
-    SequencerPosition spos;
 
     // soft state
     Header header; // FIXME: Hold lock to avoid concurrent operations, it will
@@ -82,7 +83,6 @@ class StripObjectMap: public GenericObjectMap {
       ::encode(strip_size, bl);
       ::encode(max_size, bl);
       ::encode(bits, bl);
-      ::encode(spos, bl);
       ENCODE_FINISH(bl);
     }
 
@@ -91,56 +91,56 @@ class StripObjectMap: public GenericObjectMap {
       ::decode(strip_size, bl);
       ::decode(max_size, bl);
       ::decode(bits, bl);
-      ::decode(spos, bl);
       DECODE_FINISH(bl);
     }
   };
-
-  bool check_spos(const StripObjectHeader &header,
-                  const SequencerPosition &spos);
-  void sync_wrap(StripObjectHeader &strip_header, KeyValueDB::Transaction t,
-                 const SequencerPosition &spos);
+  typedef ceph::shared_ptr<StripObjectHeader> StripObjectHeaderRef;
 
   static int file_to_extents(uint64_t offset, size_t len, uint64_t strip_size,
                              vector<StripExtent> &extents);
   int lookup_strip_header(const coll_t & cid, const ghobject_t &oid,
-                          StripObjectHeader &header);
-  int save_strip_header(StripObjectHeader &header,
-                        const SequencerPosition &spos,
-                        KeyValueDB::Transaction t);
+                          StripObjectHeaderRef *header);
+  int save_strip_header(StripObjectHeaderRef header, KeyValueDB::Transaction t);
   int create_strip_header(const coll_t &cid, const ghobject_t &oid,
-                          StripObjectHeader &strip_header,
+                          StripObjectHeaderRef *strip_header,
                           KeyValueDB::Transaction t);
-  void clone_wrap(StripObjectHeader &old_header,
+  void clone_wrap(StripObjectHeaderRef old_header,
                   const coll_t &cid, const ghobject_t &oid,
                   KeyValueDB::Transaction t,
-                  StripObjectHeader *origin_header,
-                  StripObjectHeader *target_header);
-  void rename_wrap(const coll_t &cid, const ghobject_t &oid,
+                  StripObjectHeaderRef *target_header);
+  void rename_wrap(StripObjectHeaderRef old_header, const coll_t &cid, const ghobject_t &oid,
                    KeyValueDB::Transaction t,
-                   StripObjectHeader *header);
+                   StripObjectHeaderRef *new_header);
   // Already hold header to avoid lock header seq again
   int get_with_header(
-    const StripObjectHeader &header,
+    const StripObjectHeaderRef header,
     const string &prefix,
     map<string, bufferlist> *out
     );
 
   int get_values_with_header(
-    const StripObjectHeader &header,
+    const StripObjectHeaderRef header,
     const string &prefix,
     const set<string> &keys,
     map<string, bufferlist> *out
     );
   int get_keys_with_header(
-    const StripObjectHeader &header,
+    const StripObjectHeaderRef header,
     const string &prefix,
     set<string> *keys
     );
 
-  StripObjectMap(KeyValueDB *db): GenericObjectMap(db) {}
+  Mutex lock;
+  void invalidate_cache(const coll_t &c, const ghobject_t &oid) {
+    Mutex::Locker l(lock);
+    caches.clear(oid);
+  }
 
-  static const uint64_t default_strip_size = 1024;
+  RandomCache<ghobject_t, pair<coll_t, StripObjectHeaderRef> > caches;
+  StripObjectMap(KeyValueDB *db): GenericObjectMap(db),
+                                  lock("StripObjectMap::lock"),
+                                  caches(g_conf->keyvaluestore_header_cache_size)
+  {}
 };
 
 
@@ -161,7 +161,7 @@ class KeyValueStore : public ObjectStore,
   std::string current_op_seq_fn;
   uuid_d fsid;
 
-  int fsid_fd, op_fd, current_fd;
+  int fsid_fd, current_fd;
 
   enum kvstore_types kv_type;
 
@@ -210,39 +210,49 @@ class KeyValueStore : public ObjectStore,
   // 4. Clone or rename
   struct BufferTransaction {
     typedef pair<coll_t, ghobject_t> uniq_id;
-    typedef map<uniq_id, StripObjectMap::StripObjectHeader> StripHeaderMap;
+    typedef map<uniq_id, StripObjectMap::StripObjectHeaderRef> StripHeaderMap;
 
     //Dirty records
     StripHeaderMap strip_headers;
+    list<Context*> finishes;
 
     KeyValueStore *store;
 
-    SequencerPosition spos;
     KeyValueDB::Transaction t;
 
     int lookup_cached_header(const coll_t &cid, const ghobject_t &oid,
-                             StripObjectMap::StripObjectHeader **strip_header,
+                             StripObjectMap::StripObjectHeaderRef *strip_header,
                              bool create_if_missing);
-    int get_buffer_keys(StripObjectMap::StripObjectHeader &strip_header,
+    int get_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header,
                         const string &prefix, const set<string> &keys,
                         map<string, bufferlist> *out);
-    void set_buffer_keys(StripObjectMap::StripObjectHeader &strip_header,
+    void set_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header,
                          const string &prefix, map<string, bufferlist> &bl);
-    int remove_buffer_keys(StripObjectMap::StripObjectHeader &strip_header,
+    int remove_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header,
                            const string &prefix, const set<string> &keys);
-    void clear_buffer_keys(StripObjectMap::StripObjectHeader &strip_header,
+    void clear_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header,
                            const string &prefix);
-    int clear_buffer(StripObjectMap::StripObjectHeader &strip_header);
-    void clone_buffer(StripObjectMap::StripObjectHeader &old_header,
+    int clear_buffer(StripObjectMap::StripObjectHeaderRef strip_header);
+    void clone_buffer(StripObjectMap::StripObjectHeaderRef old_header,
                       const coll_t &cid, const ghobject_t &oid);
-    void rename_buffer(StripObjectMap::StripObjectHeader &old_header,
+    void rename_buffer(StripObjectMap::StripObjectHeaderRef old_header,
                        const coll_t &cid, const ghobject_t &oid);
     int submit_transaction();
 
-    BufferTransaction(KeyValueStore *store,
-                      SequencerPosition &spos): store(store), spos(spos) {
+    BufferTransaction(KeyValueStore *store): store(store) {
       t = store->backend->get_transaction();
     }
+
+    struct InvalidateCacheContext : public Context {
+      KeyValueStore *store;
+      const coll_t cid;
+      const ghobject_t oid;
+      InvalidateCacheContext(KeyValueStore *s, const coll_t &c, const ghobject_t &oid): store(s), cid(c), oid(oid) {}
+      void finish(int r) {
+      if (r == 0)
+        store->backend->invalidate_cache(cid, oid);
+      }
+    };
   };
 
   // -- op workqueue --
@@ -257,28 +267,79 @@ class KeyValueStore : public ObjectStore,
   class OpSequencer : public Sequencer_impl {
     Mutex qlock; // to protect q, for benefit of flush (peek/dequeue also protected by lock)
     list<Op*> q;
-    list<uint64_t> jq;
     Cond cond;
+    list<pair<uint64_t, Context*> > flush_commit_waiters;
+    uint64_t op; // used by flush() to know the sequence of op
    public:
     Sequencer *parent;
     Mutex apply_lock;  // for apply mutual exclusion
+    
+    /// get_max_uncompleted
+    bool _get_max_uncompleted(
+      uint64_t *seq ///< [out] max uncompleted seq
+      ) {
+      assert(qlock.is_locked());
+      assert(seq);
+      *seq = 0;
+      if (q.empty()) {
+	return true;
+      } else {
+	*seq = q.back()->op;
+	return false;
+      }
+    } /// @returns true if the queue is empty
+
+    /// get_min_uncompleted
+    bool _get_min_uncompleted(
+      uint64_t *seq ///< [out] min uncompleted seq
+      ) {
+      assert(qlock.is_locked());
+      assert(seq);
+      *seq = 0;
+      if (q.empty()) {
+	return true;
+      } else {
+	*seq = q.front()->op;
+	return false;
+      }
+    } /// @returns true if both queues are empty
+
+    void _wake_flush_waiters(list<Context*> *to_queue) {
+      uint64_t seq;
+      if (_get_min_uncompleted(&seq))
+	seq = -1;
+
+      for (list<pair<uint64_t, Context*> >::iterator i =
+	     flush_commit_waiters.begin();
+	   i != flush_commit_waiters.end() && i->first < seq;
+	   flush_commit_waiters.erase(i++)) {
+	to_queue->push_back(i->second);
+      }
+    }
 
     void queue(Op *o) {
       Mutex::Locker l(qlock);
       q.push_back(o);
+      op++;
+      o->op = op;
     }
     Op *peek_queue() {
       assert(apply_lock.is_locked());
       return q.front();
     }
-    Op *dequeue() {
+
+    Op *dequeue(list<Context*> *to_queue) {
+      assert(to_queue);
       assert(apply_lock.is_locked());
       Mutex::Locker l(qlock);
       Op *o = q.front();
       q.pop_front();
       cond.Signal();
+
+      _wake_flush_waiters(to_queue);
       return o;
     }
+
     void flush() {
       Mutex::Locker l(qlock);
 
@@ -286,21 +347,29 @@ class KeyValueStore : public ObjectStore,
       uint64_t seq = 0;
       if (!q.empty())
         seq = q.back()->op;
-      if (!jq.empty() && jq.back() > seq)
-        seq = jq.back();
 
       if (seq) {
         // everything prior to our watermark to drain through either/both
         // queues
-        while ((!q.empty() && q.front()->op <= seq) ||
-                (!jq.empty() && jq.front() <= seq))
+        while (!q.empty() && q.front()->op <= seq)
           cond.Wait(qlock);
       }
     }
+    bool flush_commit(Context *c) {
+      Mutex::Locker l(qlock);
+      uint64_t seq = 0;
+      if (_get_max_uncompleted(&seq)) {
+	delete c;
+	return true;
+      } else {
+	flush_commit_waiters.push_back(make_pair(seq, c));
+	return false;
+      }
+    }
 
     OpSequencer()
       : qlock("KeyValueStore::OpSequencer::qlock", false, false),
-	parent(0),
+        op(0), parent(0),
 	apply_lock("KeyValueStore::OpSequencer::apply_lock", false, false) {}
     ~OpSequencer() {
       assert(q.empty());
@@ -417,7 +486,6 @@ class KeyValueStore : public ObjectStore,
   }
   unsigned _do_transaction(Transaction& transaction,
                            BufferTransaction &bt,
-                           SequencerPosition& spos,
                            ThreadPool::TPHandle *handle);
 
   int queue_transactions(Sequencer *osr, list<Transaction*>& tls,
@@ -428,10 +496,10 @@ class KeyValueStore : public ObjectStore,
   // ------------------
   // objects
 
-  int _generic_read(StripObjectMap::StripObjectHeader &header,
+  int _generic_read(StripObjectMap::StripObjectHeaderRef header,
                     uint64_t offset, size_t len, bufferlist& bl,
                     bool allow_eio = false, BufferTransaction *bt = 0);
-  int _generic_write(StripObjectMap::StripObjectHeader &header,
+  int _generic_write(StripObjectMap::StripObjectHeaderRef header,
                      uint64_t offset, size_t len, const bufferlist& bl,
                      BufferTransaction &t, bool replica = false);
 
@@ -572,26 +640,6 @@ class KeyValueStore : public ObjectStore,
   static const string COLLECTION;
   static const string COLLECTION_ATTR;
   static const uint32_t COLLECTION_VERSION = 1;
-
-  class SubmitManager {
-    Mutex lock;
-    uint64_t op_seq;
-    uint64_t op_submitted;
-   public:
-    SubmitManager() :
-        lock("JOS::SubmitManager::lock", false, true, false, g_ceph_context),
-        op_seq(0), op_submitted(0)
-    {}
-    uint64_t op_submit_start();
-    void op_submit_finish(uint64_t op);
-    void set_op_seq(uint64_t seq) {
-        Mutex::Locker l(lock);
-        op_submitted = op_seq = seq;
-    }
-    uint64_t get_op_seq() {
-        return op_seq;
-    }
-  } submit_manager;
 };
 
 WRITE_CLASS_ENCODER(StripObjectMap::StripObjectHeader)
diff --git a/src/os/LFNIndex.cc b/src/os/LFNIndex.cc
index a460e5c..e017f83 100644
--- a/src/os/LFNIndex.cc
+++ b/src/os/LFNIndex.cc
@@ -61,6 +61,17 @@ void LFNIndex::maybe_inject_failure()
   }
 }
 
+// Helper to close fd's when we leave scope.  This is useful when used
+// in combination with RetryException, thrown by the above.
+struct FDCloser {
+  int fd;
+  FDCloser(int f) : fd(f) {}
+  ~FDCloser() {
+    VOID_TEMP_FAILURE_RETRY(::close(fd));
+  }
+};
+
+
 /* Public methods */
 
 void LFNIndex::set_ref(ceph::shared_ptr<CollectionIndex> ref)
@@ -160,9 +171,9 @@ int LFNIndex::fsync_dir(const vector<string> &path)
   int fd = ::open(get_full_path_subdir(path).c_str(), O_RDONLY);
   if (fd < 0)
     return -errno;
+  FDCloser f(fd);
   maybe_inject_failure();
   int r = ::fsync(fd);
-  VOID_TEMP_FAILURE_RETRY(::close(fd));
   maybe_inject_failure();
   if (r < 0)
     return -errno;
@@ -753,7 +764,8 @@ int LFNIndex::lfn_get_name(const vector<string> &path,
   for ( ; ; ++i) {
     candidate = lfn_get_short_name(oid, i);
     candidate_path = get_full_path(path, candidate);
-    r = chain_getxattr(candidate_path.c_str(), get_lfn_attr().c_str(), buf, sizeof(buf));
+    r = chain_getxattr(candidate_path.c_str(), get_lfn_attr().c_str(),
+		       buf, sizeof(buf));
     if (r < 0) {
       if (errno != ENODATA && errno != ENOENT)
 	return -errno;
@@ -784,6 +796,38 @@ int LFNIndex::lfn_get_name(const vector<string> &path,
 	*exists = 1;
       return 0;
     }
+    r = chain_getxattr(candidate_path.c_str(), get_alt_lfn_attr().c_str(),
+		       buf, sizeof(buf));
+    if (r > 0) {
+      // only consider alt name if nlink > 1
+      struct stat st;
+      int rc = ::stat(candidate_path.c_str(), &st);
+      if (rc < 0)
+	return -errno;
+      if (st.st_nlink <= 1) {
+	// left over from incomplete unlink, remove
+	maybe_inject_failure();
+	dout(20) << __func__ << " found extra alt attr for " << candidate_path
+		 << ", long name " << string(buf, r) << dendl;
+	rc = chain_removexattr(candidate_path.c_str(),
+			       get_alt_lfn_attr().c_str());
+	maybe_inject_failure();
+	if (rc < 0)
+	  return rc;
+	continue;
+      }
+      buf[MIN((int)sizeof(buf) - 1, r)] = '\0';
+      if (!strcmp(buf, full_name.c_str())) {
+	dout(20) << __func__ << " used alt attr for " << full_name << dendl;
+	if (mangled_name)
+	  *mangled_name = candidate;
+	if (out_path)
+	  *out_path = candidate_path;
+	if (exists)
+	  *exists = 1;
+	return 0;
+      }
+    }
   }
   assert(0); // Unreachable
   return 0;
@@ -798,7 +842,24 @@ int LFNIndex::lfn_created(const vector<string> &path,
   string full_path = get_full_path(path, mangled_name);
   string full_name = lfn_generate_object_name(oid);
   maybe_inject_failure();
-  return chain_setxattr(full_path.c_str(), get_lfn_attr().c_str(), 
+
+  // if the main attr exists and is different, move it to the alt attr.
+  char buf[FILENAME_MAX_LEN + 1];
+  int r = chain_getxattr(full_path.c_str(), get_lfn_attr().c_str(),
+			 buf, sizeof(buf));
+  if (r >= 0 && (r != (int)full_name.length() ||
+		 memcmp(buf, full_name.c_str(), full_name.length()))) {
+    dout(20) << __func__ << " " << mangled_name
+	     << " moving old name to alt attr "
+	     << string(buf, r)
+	     << ", new name is " << full_name << dendl;
+    r = chain_setxattr(full_path.c_str(), get_alt_lfn_attr().c_str(),
+		       buf, r);
+    if (r < 0)
+      return r;
+  }
+
+  return chain_setxattr(full_path.c_str(), get_lfn_attr().c_str(),
 		     full_name.c_str(), full_name.size());
 }
 
@@ -839,26 +900,35 @@ int LFNIndex::lfn_unlink(const vector<string> &path,
       }
     }
   }
+  string full_path = get_full_path(path, mangled_name);
+  int fd = ::open(full_path.c_str(), O_RDONLY);
+  if (fd < 0)
+    return -errno;
+  FDCloser f(fd);
   if (i == removed_index + 1) {
-    string full_path = get_full_path(path, mangled_name);
     maybe_inject_failure();
     int r = ::unlink(full_path.c_str());
     maybe_inject_failure();
     if (r < 0)
       return -errno;
-    else
-      return 0;
   } else {
-    string rename_to = get_full_path(path, mangled_name);
+    string& rename_to = full_path;
     string rename_from = get_full_path(path, lfn_get_short_name(oid, i - 1));
     maybe_inject_failure();
     int r = ::rename(rename_from.c_str(), rename_to.c_str());
     maybe_inject_failure();
     if (r < 0)
       return -errno;
-    else
-      return 0;
   }
+  struct stat st;
+  int r = ::fstat(fd, &st);
+  if (r == 0 && st.st_nlink > 0) {
+    // remove alt attr
+    dout(20) << __func__ << " removing alt attr from " << full_path << dendl;
+    fsync_dir(path);
+    chain_fremovexattr(fd, get_alt_lfn_attr().c_str());
+  }
+  return r;
 }
 
 int LFNIndex::lfn_translate(const vector<string> &path,
diff --git a/src/os/LFNIndex.h b/src/os/LFNIndex.h
index c9c7f5e..646e726 100644
--- a/src/os/LFNIndex.h
+++ b/src/os/LFNIndex.h
@@ -123,7 +123,7 @@ protected:
   }
 
 private:
-  string lfn_attribute;
+  string lfn_attribute, lfn_alt_attribute;
   coll_t collection;
 
 public:
@@ -146,7 +146,8 @@ public:
       char buf[100];
       snprintf(buf, sizeof(buf), "%d", index_version);
       lfn_attribute = LFN_ATTR + string(buf);
-    }
+      lfn_alt_attribute = LFN_ATTR + string(buf) + "-alt";
+    }
   }
 
   coll_t coll() const { return collection; }
@@ -423,6 +424,9 @@ private:
   const string &get_lfn_attr() const {
     return lfn_attribute;
   }
+  const string &get_alt_lfn_attr() const {
+    return lfn_alt_attribute;
+  }
 
   /**
    * Gets the filename corresponsing to oid in path.
diff --git a/src/os/MemStore.cc b/src/os/MemStore.cc
index 9e75b76..952866a 100644
--- a/src/os/MemStore.cc
+++ b/src/os/MemStore.cc
@@ -950,7 +950,12 @@ void MemStore::_do_transaction(Transaction& t)
       break;
 
     case Transaction::OP_SETALLOCHINT:
-      // nop
+      {
+        coll_t cid(i.get_cid());
+        ghobject_t oid = i.get_oid();
+        (void)i.get_length();  // discard result
+        (void)i.get_length();  // discard result
+      }
       break;
 
     default:
diff --git a/src/os/ObjectStore.cc b/src/os/ObjectStore.cc
index e4e2257..afa90b1 100644
--- a/src/os/ObjectStore.cc
+++ b/src/os/ObjectStore.cc
@@ -144,7 +144,11 @@ int ObjectStore::collection_list_range(coll_t c, hobject_t start, hobject_t end,
 			    snapid_t seq, vector<hobject_t> *ls)
 {
   vector<ghobject_t> go;
-  ghobject_t gstart(start), gend(end);
+  // Starts with the smallest shard id and generation to
+  // make sure the result list has the marker object
+  ghobject_t gstart(start, 0, shard_id_t(0));
+  // Exclusive end, choose the smallest end ghobject
+  ghobject_t gend(end, 0, shard_id_t(0));
   int ret = collection_list_range(c, gstart, gend, seq, &go);
   if (ret == 0) {
     ls->reserve(go.size());
diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h
index 15ed31f..a5f5fcb 100644
--- a/src/os/ObjectStore.h
+++ b/src/os/ObjectStore.h
@@ -128,6 +128,22 @@ public:
    */
   struct Sequencer_impl {
     virtual void flush() = 0;
+
+    /**
+     * Async flush_commit
+     *
+     * There are two cases:
+     * 1) sequencer is currently idle: the method returns true and
+     *    c is deleted
+     * 2) sequencer is not idle: the method returns false and c is
+     *    called asynchronously with a value of 0 once all transactions
+     *    queued on this sequencer prior to the call have been applied
+     *    and committed.
+     */
+    virtual bool flush_commit(
+      Context *c ///< [in] context to call upon flush/commit
+      ) = 0; ///< @return true if idle, false otherwise
+
     virtual ~Sequencer_impl() {}
   };
 
@@ -153,6 +169,16 @@ public:
       if (p)
 	p->flush();
     }
+
+    /// @see Sequencer_impl::flush_commit()
+    bool flush_commit(Context *c) {
+      if (!p) {
+	delete c;
+	return true;
+      } else {
+	return p->flush_commit(c);
+      }
+    }
   };
 
   /*********************************
diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc
index b69d77a..aefbb5e 100644
--- a/src/osd/ECBackend.cc
+++ b/src/osd/ECBackend.cc
@@ -105,13 +105,13 @@ ostream &operator<<(ostream &lhs, const ECBackend::ReadOp &rhs)
 
 void ECBackend::ReadOp::dump(Formatter *f) const
 {
-  f->dump_stream("tid") << tid;
+  f->dump_unsigned("tid", tid);
   if (op && op->get_req()) {
     f->dump_stream("op") << *(op->get_req());
   }
   f->dump_stream("to_read") << to_read;
   f->dump_stream("complete") << complete;
-  f->dump_stream("priority") << priority;
+  f->dump_int("priority", priority);
   f->dump_stream("obj_to_source") << obj_to_source;
   f->dump_stream("source_to_obj") << source_to_obj;
   f->dump_stream("in_progress") << in_progress;
@@ -158,7 +158,7 @@ void ECBackend::RecoveryOp::dump(Formatter *f) const
   f->dump_stream("missing_on_shards") << missing_on_shards;
   f->dump_stream("recovery_info") << recovery_info;
   f->dump_stream("recovery_progress") << recovery_progress;
-  f->dump_stream("pending_read") << pending_read;
+  f->dump_bool("pending_read", pending_read);
   f->dump_stream("state") << tostr(state);
   f->dump_stream("waiting_on_pushes") << waiting_on_pushes;
   f->dump_stream("extent_requested") << extent_requested;
@@ -829,6 +829,7 @@ void ECBackend::handle_sub_write(
     op.log_entries,
     op.updated_hit_set_history,
     op.trim_to,
+    op.trim_rollback_to,
     !(op.t.empty()),
     localt);
   localt->append(op.t);
@@ -1211,6 +1212,7 @@ void ECBackend::submit_transaction(
   const eversion_t &at_version,
   PGTransaction *_t,
   const eversion_t &trim_to,
+  const eversion_t &trim_rollback_to,
   vector<pg_log_entry_t> &log_entries,
   boost::optional<pg_hit_set_history_t> &hset_history,
   Context *on_local_applied_sync,
@@ -1226,6 +1228,7 @@ void ECBackend::submit_transaction(
   op->hoid = hoid;
   op->version = at_version;
   op->trim_to = trim_to;
+  op->trim_rollback_to = trim_rollback_to;
   op->log_entries.swap(log_entries);
   std::swap(op->updated_hit_set_history, hset_history);
   op->on_local_applied_sync = on_local_applied_sync;
@@ -1532,6 +1535,7 @@ void ECBackend::start_write(Op *op) {
       should_send ? iter->second : ObjectStore::Transaction(),
       op->version,
       op->trim_to,
+      op->trim_rollback_to,
       op->log_entries,
       op->updated_hit_set_history,
       op->temp_added,
diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h
index c13f30f..28bcf8a 100644
--- a/src/osd/ECBackend.h
+++ b/src/osd/ECBackend.h
@@ -97,6 +97,7 @@ public:
     const eversion_t &at_version,
     PGTransaction *t,
     const eversion_t &trim_to,
+    const eversion_t &trim_rollback_to,
     vector<pg_log_entry_t> &log_entries,
     boost::optional<pg_hit_set_history_t> &hset_history,
     Context *on_local_applied_sync,
@@ -326,6 +327,7 @@ public:
     hobject_t hoid;
     eversion_t version;
     eversion_t trim_to;
+    eversion_t trim_rollback_to;
     vector<pg_log_entry_t> log_entries;
     boost::optional<pg_hit_set_history_t> updated_hit_set_history;
     Context *on_local_applied_sync;
diff --git a/src/osd/ECMsgTypes.cc b/src/osd/ECMsgTypes.cc
index 4e4c8e3..ba02d83 100644
--- a/src/osd/ECMsgTypes.cc
+++ b/src/osd/ECMsgTypes.cc
@@ -16,7 +16,7 @@
 
 void ECSubWrite::encode(bufferlist &bl) const
 {
-  ENCODE_START(2, 1, bl);
+  ENCODE_START(3, 1, bl);
   ::encode(from, bl);
   ::encode(tid, bl);
   ::encode(reqid, bl);
@@ -29,12 +29,13 @@ void ECSubWrite::encode(bufferlist &bl) const
   ::encode(temp_added, bl);
   ::encode(temp_removed, bl);
   ::encode(updated_hit_set_history, bl);
+  ::encode(trim_rollback_to, bl);
   ENCODE_FINISH(bl);
 }
 
 void ECSubWrite::decode(bufferlist::iterator &bl)
 {
-  DECODE_START(2, bl);
+  DECODE_START(3, bl);
   ::decode(from, bl);
   ::decode(tid, bl);
   ::decode(reqid, bl);
@@ -49,6 +50,11 @@ void ECSubWrite::decode(bufferlist::iterator &bl)
   if (struct_v >= 2) {
     ::decode(updated_hit_set_history, bl);
   }
+  if (struct_v >= 3) {
+    ::decode(trim_rollback_to, bl);
+  } else {
+    trim_rollback_to = trim_to;
+  }
   DECODE_FINISH(bl);
 }
 
@@ -58,7 +64,8 @@ std::ostream &operator<<(
   lhs << "ECSubWrite(tid=" << rhs.tid
       << ", reqid=" << rhs.reqid
       << ", at_version=" << rhs.at_version
-      << ", trim_to=" << rhs.trim_to;
+      << ", trim_to=" << rhs.trim_to
+      << ", trim_rollback_to=" << rhs.trim_rollback_to;
   if (rhs.updated_hit_set_history)
     lhs << ", has_updated_hit_set_history";
   return lhs <<  ")";
@@ -66,10 +73,11 @@ std::ostream &operator<<(
 
 void ECSubWrite::dump(Formatter *f) const
 {
-  f->dump_stream("tid") << tid;
+  f->dump_unsigned("tid", tid);
   f->dump_stream("reqid") << reqid;
   f->dump_stream("at_version") << at_version;
   f->dump_stream("trim_to") << trim_to;
+  f->dump_stream("trim_rollback_to") << trim_rollback_to;
   f->dump_stream("has_updated_hit_set_history")
     << static_cast<bool>(updated_hit_set_history);
 }
@@ -85,6 +93,12 @@ void ECSubWrite::generate_test_instances(list<ECSubWrite*> &o)
   o.back()->reqid = osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678);
   o.back()->at_version = eversion_t(10, 300);
   o.back()->trim_to = eversion_t(5, 42);
+  o.push_back(new ECSubWrite());
+  o.back()->tid = 9;
+  o.back()->reqid = osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678);
+  o.back()->at_version = eversion_t(10, 300);
+  o.back()->trim_to = eversion_t(5, 42);
+  o.back()->trim_rollback_to = eversion_t(8, 250);
 }
 
 void ECSubWriteReply::encode(bufferlist &bl) const
@@ -121,7 +135,7 @@ std::ostream &operator<<(
 
 void ECSubWriteReply::dump(Formatter *f) const
 {
-  f->dump_stream("tid") << tid;
+  f->dump_unsigned("tid", tid);
   f->dump_stream("last_complete") << last_complete;
   f->dump_stream("committed") << committed;
   f->dump_stream("applied") << applied;
@@ -171,7 +185,7 @@ std::ostream &operator<<(
 void ECSubRead::dump(Formatter *f) const
 {
   f->dump_stream("from") << from;
-  f->dump_stream("tid") << tid;
+  f->dump_unsigned("tid", tid);
   f->open_array_section("objects");
   for (map<hobject_t, list<pair<uint64_t, uint64_t> > >::const_iterator i =
 	 to_read.begin();
@@ -259,7 +273,7 @@ std::ostream &operator<<(
 void ECSubReadReply::dump(Formatter *f) const
 {
   f->dump_stream("from") << from;
-  f->dump_stream("tid") << tid;
+  f->dump_unsigned("tid", tid);
   f->open_array_section("buffers_read");
   for (map<hobject_t, list<pair<uint64_t, bufferlist> > >::const_iterator i =
 	 buffers_read.begin();
diff --git a/src/osd/ECMsgTypes.h b/src/osd/ECMsgTypes.h
index 11c519d..1cdfa57 100644
--- a/src/osd/ECMsgTypes.h
+++ b/src/osd/ECMsgTypes.h
@@ -28,6 +28,7 @@ struct ECSubWrite {
   ObjectStore::Transaction t;
   eversion_t at_version;
   eversion_t trim_to;
+  eversion_t trim_rollback_to;
   vector<pg_log_entry_t> log_entries;
   set<hobject_t> temp_added;
   set<hobject_t> temp_removed;
@@ -42,6 +43,7 @@ struct ECSubWrite {
     const ObjectStore::Transaction &t,
     eversion_t at_version,
     eversion_t trim_to,
+    eversion_t trim_rollback_to,
     vector<pg_log_entry_t> log_entries,
     boost::optional<pg_hit_set_history_t> updated_hit_set_history,
     const set<hobject_t> &temp_added,
@@ -49,7 +51,8 @@ struct ECSubWrite {
     : from(from), tid(tid), reqid(reqid),
       soid(soid), stats(stats), t(t),
       at_version(at_version),
-      trim_to(trim_to), log_entries(log_entries),
+      trim_to(trim_to), trim_rollback_to(trim_rollback_to),
+      log_entries(log_entries),
       temp_added(temp_added),
       temp_removed(temp_removed),
       updated_hit_set_history(updated_hit_set_history) {}
diff --git a/src/osd/HitSet.h b/src/osd/HitSet.h
index 391dd63..476678e 100644
--- a/src/osd/HitSet.h
+++ b/src/osd/HitSet.h
@@ -369,7 +369,7 @@ public:
       return (double)fpp_micro / 1000000.0;
     }
     void set_fpp(double f) {
-      fpp_micro = (unsigned)(f * 1000000.0);
+      fpp_micro = (unsigned)(llrintl(f * (double)1000000.0));
     }
 
     void encode(bufferlist& bl) const {
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 5c8f0d6..dc67fdd 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -42,6 +42,7 @@
 
 #include "common/ceph_argparse.h"
 #include "common/version.h"
+#include "common/io_priority.h"
 
 #include "os/ObjectStore.h"
 
@@ -191,6 +192,7 @@ OSDService::OSDService(OSD *osd) :
   push_wq("push_wq", cct->_conf->osd_recovery_thread_timeout, &osd->recovery_tp),
   gen_wq("gen_wq", cct->_conf->osd_recovery_thread_timeout, &osd->recovery_tp),
   class_handler(osd->class_handler),
+  pg_epoch_lock("OSDService::pg_epoch_lock"),
   publish_lock("OSDService::publish_lock"),
   pre_publish_lock("OSDService::pre_publish_lock"),
   sched_scrub_lock("OSDService::sched_scrub_lock"), scrubs_pending(0),
@@ -1277,6 +1279,8 @@ int OSD::init()
   disk_tp.start();
   command_tp.start();
 
+  set_disk_tp_priority();
+
   // start the heartbeat
   heartbeat_thread.create();
 
@@ -1305,6 +1309,8 @@ int OSD::init()
   if (is_stopping())
     return 0;
 
+  check_config();
+
   dout(10) << "ensuring pgs have consumed prior maps" << dendl;
   consume_map();
   peering_wq.drain();
@@ -1663,8 +1669,10 @@ int OSD::shutdown()
   dout(10) << "recovery tp stopped" << dendl;
 
   op_tp.drain();
+  peering_wq.clear();
+  scrub_finalize_wq.clear();
   op_tp.stop();
-  dout(10) << "op tp stopped" << dendl;
+  dout(10) << "osd tp stopped" << dendl;
 
   command_tp.drain();
   command_tp.stop();
@@ -1708,7 +1716,6 @@ int OSD::shutdown()
     assert(pg_stat_queue.empty());
   }
 
-  peering_wq.clear();
   // Remove PGs
 #ifdef PG_DEBUG_REFS
   service.dump_live_pgids();
@@ -1854,6 +1861,8 @@ PG *OSD::_open_lock_pg(
 
   pg_map[pgid] = pg;
 
+  service.pg_add_epoch(pg->info.pgid, createmap->get_epoch());
+
   pg->lock(no_lockdep_check);
   pg->get("PGMap");  // because it's in pg_map
   return pg;
@@ -1885,6 +1894,7 @@ void OSD::add_newly_split_pg(PG *pg, PG::RecoveryCtx *rctx)
   epoch_t e(service.get_osdmap()->get_epoch());
   pg->get("PGMap");  // For pg_map
   pg_map[pg->info.pgid] = pg;
+  service.pg_add_epoch(pg->info.pgid, pg->get_osdmap()->get_epoch());
   dout(10) << "Adding newly split pg " << *pg << dendl;
   vector<int> up, acting;
   pg->get_osdmap()->pg_to_up_acting_osds(pg->info.pgid.pgid, up, acting);
@@ -4392,9 +4402,8 @@ void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, buffe
       // However, to avoid the osd from getting hung on this and having
       // timers being triggered, we are going to limit the count assuming
       // a configurable throughput and duration.
-      int64_t total_throughput =
+      int64_t max_count =
         g_conf->osd_bench_large_size_max_throughput * duration;
-      int64_t max_count = (int64_t) (total_throughput / bsize);
       if (count > max_count) {
         ss << "'count' values greater than " << max_count
            << " for a block size of " << prettybyte_t(bsize) << ", assuming "
@@ -5713,11 +5722,12 @@ void OSD::check_osdmap_features(ObjectStore *fs)
     }
   }
   {
-    Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_MON);
+    Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
     uint64_t mask;
     uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
     if ((p.features_required & mask) != features) {
       dout(0) << "crush map has features " << features
+	      << " was " << p.features_required
 	      << ", adjusting msgr requires for mons" << dendl;
       p.features_required = (p.features_required & ~mask) | features;
       client_messenger->set_policy(entity_name_t::TYPE_MON, p);
@@ -5748,7 +5758,7 @@ void OSD::check_osdmap_features(ObjectStore *fs)
   }
 }
 
-void OSD::advance_pg(
+bool OSD::advance_pg(
   epoch_t osd_epoch, PG *pg,
   ThreadPool::TPHandle &handle,
   PG::RecoveryCtx *rctx,
@@ -5759,11 +5769,19 @@ void OSD::advance_pg(
   OSDMapRef lastmap = pg->get_osdmap();
 
   if (lastmap->get_epoch() == osd_epoch)
-    return;
+    return true;
   assert(lastmap->get_epoch() < osd_epoch);
 
+  epoch_t min_epoch = service.get_min_pg_epoch();
+  epoch_t max;
+  if (min_epoch) {
+    max = min_epoch + g_conf->osd_map_max_advance;
+  } else {
+    max = next_epoch + g_conf->osd_map_max_advance;
+  }
+
   for (;
-       next_epoch <= osd_epoch;
+       next_epoch <= osd_epoch && next_epoch <= max;
        ++next_epoch) {
     OSDMapRef nextmap = service.try_get_map(next_epoch);
     if (!nextmap)
@@ -5795,7 +5813,15 @@ void OSD::advance_pg(
     lastmap = nextmap;
     handle.reset_tp_timeout();
   }
+  service.pg_update_epoch(pg->info.pgid, lastmap->get_epoch());
   pg->handle_activate_map(rctx);
+  if (next_epoch <= osd_epoch) {
+    dout(10) << __func__ << " advanced by max " << g_conf->osd_map_max_advance
+	     << " past min epoch " << min_epoch
+	     << " ... will requeue " << *pg << dendl;
+    return false;
+  }
+  return true;
 }
 
 /** 
@@ -6127,7 +6153,7 @@ bool OSD::require_mon_peer(Message *m)
   return true;
 }
 
-bool OSD::require_osd_peer(OpRequestRef op)
+bool OSD::require_osd_peer(OpRequestRef& op)
 {
   if (!op->get_req()->get_connection()->peer_is_osd()) {
     dout(0) << "require_osd_peer received from non-osd " << op->get_req()->get_connection()->get_peer_addr()
@@ -6137,11 +6163,64 @@ bool OSD::require_osd_peer(OpRequestRef op)
   return true;
 }
 
+bool OSD::require_self_aliveness(OpRequestRef& op, epoch_t epoch)
+{
+  if (epoch < up_epoch) {
+    dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
+    return false;
+  }
+
+  if (!is_active()) {
+    dout(7) << "still in boot state, dropping message " << *op->get_req() << dendl;
+    return false;
+  }
+
+  return true;
+}
+
+bool OSD::require_same_peer_instance(OpRequestRef& op, OSDMapRef& map)
+{
+  Message *m = op->get_req();
+  int from = m->get_source().num();
+
+  if (!map->have_inst(from) ||
+      (map->get_cluster_addr(from) != m->get_source_inst().addr)) {
+    dout(5) << "from dead osd." << from << ", marking down, "
+	    << " msg was " << m->get_source_inst().addr
+	    << " expected " << (map->have_inst(from) ?
+				map->get_cluster_addr(from) : entity_addr_t())
+	    << dendl;
+    ConnectionRef con = m->get_connection();
+    cluster_messenger->mark_down(con.get());
+    Session *s = static_cast<Session*>(con->get_priv());
+    if (s) {
+      con->set_priv(NULL);   // break ref <-> session cycle, if any
+      s->put();
+    }
+    return false;
+  }
+  return true;
+}
+
+bool OSD::require_up_osd_peer(OpRequestRef& op, OSDMapRef& map,
+                              epoch_t their_epoch)
+{
+  if (!require_self_aliveness(op, their_epoch)) {
+    return false;
+  } else if (!require_osd_peer(op)) {
+    return false;
+  } else if (map->get_epoch() >= their_epoch &&
+	     !require_same_peer_instance(op, map)) {
+    return false;
+  }
+  return true;
+}
+
 /*
  * require that we have same (or newer) map, and that
  * the source is the pg primary.
  */
-bool OSD::require_same_or_newer_map(OpRequestRef op, epoch_t epoch)
+bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch)
 {
   Message *m = op->get_req();
   dout(15) << "require_same_or_newer_map " << epoch << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
@@ -6155,30 +6234,13 @@ bool OSD::require_same_or_newer_map(OpRequestRef op, epoch_t epoch)
     return false;
   }
 
-  if (epoch < up_epoch) {
-    dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
+  if (!require_self_aliveness(op, epoch)) {
     return false;
   }
 
   // ok, our map is same or newer.. do they still exist?
-  if (m->get_connection()->get_messenger() == cluster_messenger) {
-    int from = m->get_source().num();
-    if (!osdmap->have_inst(from) ||
-	osdmap->get_cluster_addr(from) != m->get_source_inst().addr) {
-      dout(5) << "from dead osd." << from << ", marking down, "
-	      << " msg was " << m->get_source_inst().addr
-	      << " expected " << (osdmap->have_inst(from) ? osdmap->get_cluster_addr(from) : entity_addr_t())
-	      << dendl;
-      ConnectionRef con = m->get_connection();
-      con->set_priv(NULL);   // break ref <-> session cycle, if any
-      cluster_messenger->mark_down(con.get());
-      return false;
-    }
-  }
-
-  // ok, we have at least as new a map as they do.  are we (re)booting?
-  if (!is_active()) {
-    dout(7) << "still in boot state, dropping message " << *m << dendl;
+  if (m->get_connection()->get_messenger() == cluster_messenger &&
+      !require_same_peer_instance(op, osdmap)) {
     return false;
   }
 
@@ -7142,6 +7204,8 @@ void OSD::_remove_pg(PG *pg)
     );
   remove_wq.queue(make_pair(PGRef(pg), deleting));
 
+  service.pg_remove_epoch(pg->info.pgid);
+
   // remove from map
   pg_map.erase(pg->info.pgid);
   pg->put("PGMap"); // since we've taken it out of map
@@ -7555,7 +7619,7 @@ void OSD::handle_replica_op(OpRequestRef op)
     return;
   }
 
-  if (!require_osd_peer(op))
+  if (!require_up_osd_peer(op, osdmap, m->map_epoch))
     return;
 
   // must be a rep op.
@@ -7770,8 +7834,9 @@ void OSD::process_peering_events(
       pg->unlock();
       continue;
     }
-    advance_pg(curmap->get_epoch(), pg, handle, &rctx, &split_pgs);
-    if (!pg->peering_queue.empty()) {
+    if (!advance_pg(curmap->get_epoch(), pg, handle, &rctx, &split_pgs)) {
+      pg->queue_null(curmap->get_epoch(), curmap->get_epoch());
+    } else if (!pg->peering_queue.empty()) {
       PG::CephPeeringEvtRef evt = pg->peering_queue.front();
       pg->peering_queue.pop_front();
       pg->handle_peering_event(evt, &rctx);
@@ -7808,6 +7873,11 @@ const char** OSD::get_tracked_conf_keys() const
     "osd_max_backfills",
     "osd_op_complaint_time", "osd_op_log_threshold",
     "osd_op_history_size", "osd_op_history_duration",
+    "osd_map_cache_size",
+    "osd_map_max_advance",
+    "osd_pg_epoch_persisted_max_stale",
+    "osd_disk_thread_ioprio_class",
+    "osd_disk_thread_ioprio_priority",
     NULL
   };
   return KEYS;
@@ -7830,6 +7900,38 @@ void OSD::handle_conf_change(const struct md_config_t *conf,
     op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
                                              cct->_conf->osd_op_history_duration);
   }
+  if (changed.count("osd_disk_thread_ioprio_class") ||
+      changed.count("osd_disk_thread_ioprio_priority")) {
+    set_disk_tp_priority();
+  }
+
+  check_config();
+}
+
+void OSD::check_config()
+{
+  // some sanity checks
+  if (g_conf->osd_map_cache_size <= g_conf->osd_map_max_advance + 2) {
+    clog.warn() << "osd_map_cache_size (" << g_conf->osd_map_cache_size << ")"
+		<< " is not > osd_map_max_advance ("
+		<< g_conf->osd_map_max_advance << ")";
+  }
+  if (g_conf->osd_map_cache_size <= (int)g_conf->osd_pg_epoch_persisted_max_stale + 2) {
+    clog.warn() << "osd_map_cache_size (" << g_conf->osd_map_cache_size << ")"
+		<< " is not > osd_pg_epoch_persisted_max_stale ("
+		<< g_conf->osd_pg_epoch_persisted_max_stale << ")";
+  }
+}
+
+void OSD::set_disk_tp_priority()
+{
+  dout(10) << __func__
+	   << " class " << cct->_conf->osd_disk_thread_ioprio_class
+	   << " priority " << cct->_conf->osd_disk_thread_ioprio_priority
+	   << dendl;
+  int cls =
+    ceph_ioprio_string_to_class(cct->_conf->osd_disk_thread_ioprio_class);
+  disk_tp.set_ioprio(cls, cct->_conf->osd_disk_thread_ioprio_priority);
 }
 
 // --------------------------------
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index ae8d74e..e2a3c8e 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -334,6 +334,42 @@ public:
 
   void dequeue_pg(PG *pg, list<OpRequestRef> *dequeued);
 
+  // -- map epoch lower bound --
+  Mutex pg_epoch_lock;
+  multiset<epoch_t> pg_epochs;
+  map<spg_t,epoch_t> pg_epoch;
+
+  void pg_add_epoch(spg_t pgid, epoch_t epoch) {
+    Mutex::Locker l(pg_epoch_lock);
+    map<spg_t,epoch_t>::iterator t = pg_epoch.find(pgid);
+    assert(t == pg_epoch.end());
+    pg_epoch[pgid] = epoch;
+    pg_epochs.insert(epoch);
+  }
+  void pg_update_epoch(spg_t pgid, epoch_t epoch) {
+    Mutex::Locker l(pg_epoch_lock);
+    map<spg_t,epoch_t>::iterator t = pg_epoch.find(pgid);
+    assert(t != pg_epoch.end());
+    pg_epochs.erase(pg_epochs.find(t->second));
+    t->second = epoch;
+    pg_epochs.insert(epoch);
+  }
+  void pg_remove_epoch(spg_t pgid) {
+    Mutex::Locker l(pg_epoch_lock);
+    map<spg_t,epoch_t>::iterator t = pg_epoch.find(pgid);
+    if (t != pg_epoch.end()) {
+      pg_epochs.erase(pg_epochs.find(t->second));
+      pg_epoch.erase(t);
+    }
+  }
+  epoch_t get_min_pg_epoch() {
+    Mutex::Locker l(pg_epoch_lock);
+    if (pg_epochs.empty())
+      return 0;
+    else
+      return *pg_epochs.begin();
+  }
+
   // -- superblock --
   Mutex publish_lock, pre_publish_lock; // pre-publish orders before publish
   OSDSuperblock superblock;
@@ -784,6 +820,7 @@ public:
   virtual const char** get_tracked_conf_keys() const;
   virtual void handle_conf_change(const struct md_config_t *conf,
 				  const std::set <std::string> &changed);
+  void check_config();
 
 protected:
   Mutex osd_lock;			// global lock
@@ -944,6 +981,8 @@ private:
 
   bool paused_recovery;
 
+  void set_disk_tp_priority();
+
   // -- sessions --
 public:
   struct Session : public RefCountedObject {
@@ -1255,7 +1294,7 @@ private:
   void note_down_osd(int osd);
   void note_up_osd(int osd);
   
-  void advance_pg(
+  bool advance_pg(
     epoch_t advance_to, PG *pg,
     ThreadPool::TPHandle &handle,
     PG::RecoveryCtx *rctx,
@@ -1513,9 +1552,22 @@ protected:
   void repeer(PG *pg, map< int, map<spg_t,pg_query_t> >& query_map);
 
   bool require_mon_peer(Message *m);
-  bool require_osd_peer(OpRequestRef op);
+  bool require_osd_peer(OpRequestRef& op);
+  /**
+   * Verifies that we were alive in the given epoch, and that
+   * still are.
+   */
+  bool require_self_aliveness(OpRequestRef& op, epoch_t alive_since);
+  /**
+   * Verifies that the OSD who sent the given op has the same
+   * address as in the given map.
+   * @pre op was sent by an OSD using the cluster messenger
+   */
+  bool require_same_peer_instance(OpRequestRef& op, OSDMapRef& map);
+  bool require_up_osd_peer(OpRequestRef& op, OSDMapRef& map,
+                           epoch_t their_epoch);
 
-  bool require_same_or_newer_map(OpRequestRef op, epoch_t e);
+  bool require_same_or_newer_map(OpRequestRef& op, epoch_t e);
 
   void handle_pg_query(OpRequestRef op);
   void handle_pg_notify(OpRequestRef op);
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index 8c66c76..645a6f7 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -959,10 +959,7 @@ uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const
     features |= CEPH_FEATURE_CRUSH_TUNABLES;
   if (crush->has_nondefault_tunables2())
     features |= CEPH_FEATURE_CRUSH_TUNABLES2;
-  if (crush->has_v2_rules())
-    features |= CEPH_FEATURE_CRUSH_V2;
-  if (crush->has_nondefault_tunables3() ||
-      crush->has_v3_rules())
+  if (crush->has_nondefault_tunables3())
     features |= CEPH_FEATURE_CRUSH_TUNABLES3;
   mask |= CEPH_FEATURES_CRUSH;
 
@@ -978,6 +975,15 @@ uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const
 	p->second.is_tier()) {
       features |= CEPH_FEATURE_OSD_CACHEPOOL;
     }
+    int ruleid = crush->find_rule(p->second.get_crush_ruleset(),
+				  p->second.get_type(),
+				  p->second.get_size());
+    if (ruleid >= 0) {
+      if (crush->is_v2_rule(ruleid))
+	features |= CEPH_FEATURE_CRUSH_V2;
+      if (crush->is_v3_rule(ruleid))
+	features |= CEPH_FEATURE_CRUSH_TUNABLES3;
+    }
   }
   mask |= CEPH_FEATURE_OSDHASHPSPOOL | CEPH_FEATURE_OSD_CACHEPOOL;
   if (entity_type != CEPH_ENTITY_TYPE_CLIENT)
@@ -1801,7 +1807,15 @@ void OSDMap::encode(bufferlist& bl, uint64_t features) const
     ENCODE_START(1, 1, bl); // extended, osd-only data
     ::encode(osd_addrs->hb_back_addr, bl);
     ::encode(osd_info, bl);
-    ::encode(blacklist, bl);
+    {
+      // put this in a sorted, ordered map<> so that we encode in a
+      // deterministic order.
+      map<entity_addr_t,utime_t> blacklist_map;
+      for (ceph::unordered_map<entity_addr_t,utime_t>::const_iterator p =
+	     blacklist.begin(); p != blacklist.end(); ++p)
+	blacklist_map.insert(make_pair(p->first, p->second));
+      ::encode(blacklist_map, bl);
+    }
     ::encode(osd_addrs->cluster_addr, bl);
     ::encode(cluster_snapshot_epoch, bl);
     ::encode(cluster_snapshot, bl);
@@ -2159,6 +2173,7 @@ void OSDMap::generate_test_instances(list<OSDMap*>& o)
   uuid_d fsid;
   o.back()->build_simple(cct, 1, fsid, 16, 7, 8);
   o.back()->created = o.back()->modified = utime_t(1, 2);  // fix timestamp
+  o.back()->blacklist[entity_addr_t()] = utime_t(5, 6);
   cct->put();
 }
 
@@ -2551,13 +2566,25 @@ int OSDMap::build_simple(CephContext *cct, epoch_t e, uuid_d &fsid,
     set_weight(i, CEPH_OSD_OUT);
   }
 
-  map<string,string> erasure_code_profile_map;
-  r = get_str_map(cct->_conf->osd_pool_default_erasure_code_profile,
-		  ss,
-		  &erasure_code_profile_map);
-  erasure_code_profile_map["directory"] =
+  map<string,string> profile_map;
+  r = get_erasure_code_profile_default(cct, profile_map, &ss);
+  if (r < 0) {
+    lderr(cct) << ss.str() << dendl;
+    return r;
+  }
+  set_erasure_code_profile("default", profile_map);
+  return 0;
+}
+
+int OSDMap::get_erasure_code_profile_default(CephContext *cct,
+					     map<string,string> &profile_map,
+					     ostream *ss)
+{
+  int r = get_str_map(cct->_conf->osd_pool_default_erasure_code_profile,
+		      *ss,
+		      &profile_map);
+  profile_map["directory"] =
     cct->_conf->osd_pool_default_erasure_code_directory;
-  set_erasure_code_profile("default", erasure_code_profile_map);
   return r;
 }
 
diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h
index 08064f8..a347583 100644
--- a/src/osd/OSDMap.h
+++ b/src/osd/OSDMap.h
@@ -380,6 +380,9 @@ public:
       erasure_code_profiles.find(name);
     return i != erasure_code_profiles.end();
   }
+  int get_erasure_code_profile_default(CephContext *cct,
+				       map<string,string> &profile_map,
+				       ostream *ss);
   void set_erasure_code_profile(const string &name,
 				const map<string,string> &profile) {
     erasure_code_profiles[name] = profile;
diff --git a/src/osd/OpRequest.cc b/src/osd/OpRequest.cc
index 33e7fbd..bfa819d 100644
--- a/src/osd/OpRequest.cc
+++ b/src/osd/OpRequest.cc
@@ -33,7 +33,7 @@ void OpRequest::_dump(utime_t now, Formatter *f) const
     stringstream client_name;
     client_name << m->get_orig_source();
     f->dump_string("client", client_name.str());
-    f->dump_int("tid", m->get_tid());
+    f->dump_unsigned("tid", m->get_tid());
     f->close_section(); // client_info
   }
   {
diff --git a/src/osd/OpRequest.h b/src/osd/OpRequest.h
index 569b6fc..b074bee 100644
--- a/src/osd/OpRequest.h
+++ b/src/osd/OpRequest.h
@@ -74,6 +74,10 @@ struct OpRequest : public TrackedOp {
 
   void _dump(utime_t now, Formatter *f) const;
 
+  bool has_feature(uint64_t f) const {
+    return request->get_connection()->has_feature(f);
+  }
+
 private:
   osd_reqid_t reqid;
   uint8_t hit_flag_points;
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 42099fb..11a34a2 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -868,6 +868,10 @@ map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
   for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
        i != infos.end();
        ++i) {
+    if (max_last_epoch_started_found < i->second.history.last_epoch_started) {
+      min_last_update_acceptable = eversion_t::max();
+      max_last_epoch_started_found = i->second.history.last_epoch_started;
+    }
     if (max_last_epoch_started_found < i->second.last_epoch_started) {
       min_last_update_acceptable = eversion_t::max();
       max_last_epoch_started_found = i->second.last_epoch_started;
@@ -877,7 +881,8 @@ map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
 	min_last_update_acceptable = i->second.last_update;
     }
   }
-  assert(min_last_update_acceptable != eversion_t::max());
+  if (min_last_update_acceptable == eversion_t::max())
+    return infos.end();
 
   map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
   // find osd with newest last_update (oldest for ec_pool).
@@ -1269,11 +1274,19 @@ bool PG::choose_acting(pg_shard_t &auth_log_shard_id)
       ss);
   dout(10) << ss.str() << dendl;
 
-  // This might cause a problem if min_size is large
-  // and we need to backfill more than 1 osd.  Older
-  // code would only include 1 backfill osd and now we
-  // have the resize above.
-  if (want_acting_backfill.size() < pool.info.min_size) {
+  unsigned num_want_acting = 0;
+  for (vector<int>::iterator i = want.begin();
+       i != want.end();
+       ++i) {
+    if (*i != CRUSH_ITEM_NONE)
+      ++num_want_acting;
+  }
+  assert(want_acting_backfill.size() - want_backfill.size() == num_want_acting);
+
+  // This is a bit of a problem, if we allow the pg to go active with
+  // want.size() < min_size, we won't consider the pg to have been
+  // maybe_went_rw in build_prior.
+  if (num_want_acting < pool.info.min_size) {
     want_acting.clear();
     return false;
   }
@@ -1443,7 +1456,7 @@ void PG::activate(ObjectStore::Transaction& t,
     min_last_complete_ondisk = eversion_t(0,0);  // we don't know (yet)!
   }
   last_update_applied = info.last_update;
-
+  last_rollback_info_trimmed_to_applied = pg_log.get_rollback_trimmed_to();
 
   need_up_thru = false;
 
@@ -1474,12 +1487,6 @@ void PG::activate(ObjectStore::Transaction& t,
   } else {
     dout(10) << "activate - not complete, " << missing << dendl;
     pg_log.activate_not_complete(info);
-    if (is_primary()) {
-      dout(10) << "activate - starting recovery" << dendl;
-      osd->queue_for_recovery(this);
-      if (have_unfound())
-	discover_all_missing(query_map);
-    }
   }
     
   log_weirdness();
@@ -1642,6 +1649,11 @@ void PG::activate(ObjectStore::Transaction& t,
       }
 
       build_might_have_unfound();
+
+      dout(10) << "activate - starting recovery" << dendl;
+      osd->queue_for_recovery(this);
+      if (have_unfound())
+	discover_all_missing(query_map);
     }
 
     // degraded?
@@ -2347,6 +2359,7 @@ void PG::init(
     dout(10) << __func__ << ": Setting backfill" << dendl;
     info.last_backfill = hobject_t();
     info.last_complete = info.last_update;
+    pg_log.mark_log_for_rewrite();
   }
 
   reg_next_scrub();
@@ -2641,7 +2654,10 @@ void PG::add_log_entry(pg_log_entry_t& e, bufferlist& log_bl)
 
 
 void PG::append_log(
-  vector<pg_log_entry_t>& logv, eversion_t trim_to, ObjectStore::Transaction &t,
+  vector<pg_log_entry_t>& logv,
+  eversion_t trim_to,
+  eversion_t trim_rollback_to,
+  ObjectStore::Transaction &t,
   bool transaction_applied)
 {
   if (transaction_applied)
@@ -2655,13 +2671,33 @@ void PG::append_log(
     p->offset = 0;
     add_log_entry(*p, keys[p->get_key_name()]);
   }
-  if (!transaction_applied)
-    pg_log.clear_can_rollback_to();
+
+  PGLogEntryHandler handler;
+  if (!transaction_applied) {
+    pg_log.clear_can_rollback_to(&handler);
+    t.register_on_applied(
+      new C_UpdateLastRollbackInfoTrimmedToApplied(
+	this,
+	get_osdmap()->get_epoch(),
+	info.last_update));
+  } else if (trim_rollback_to > pg_log.get_rollback_trimmed_to()) {
+    pg_log.trim_rollback_info(
+      trim_rollback_to,
+      &handler);
+    t.register_on_applied(
+      new C_UpdateLastRollbackInfoTrimmedToApplied(
+	this,
+	get_osdmap()->get_epoch(),
+	trim_rollback_to));
+  }
 
   dout(10) << "append_log  adding " << keys.size() << " keys" << dendl;
   t.omap_setkeys(coll_t::META_COLL, log_oid, keys);
-  PGLogEntryHandler handler;
+
   pg_log.trim(&handler, trim_to, info);
+
+  dout(10) << __func__ << ": trimming to " << trim_rollback_to
+	   << " entries " << handler.to_trim << dendl;
   handler.apply(this, &t);
 
   // update the local pg, pg log
@@ -3004,7 +3040,8 @@ bool PG::sched_scrub()
 
 void PG::reg_next_scrub()
 {
-  if (scrubber.must_scrub) {
+  if (scrubber.must_scrub ||
+      (info.stats.stats_invalid && g_conf->osd_scrub_invalid_stats)) {
     scrubber.scrub_reg_stamp = utime_t();
   } else {
     scrubber.scrub_reg_stamp = info.history.last_scrub_stamp;
@@ -3262,6 +3299,34 @@ void PG::scrub_unreserve_replicas()
   }
 }
 
+void PG::_scan_rollback_obs(
+  const vector<ghobject_t> &rollback_obs,
+  ThreadPool::TPHandle &handle)
+{
+  ObjectStore::Transaction *t = NULL;
+  eversion_t trimmed_to = last_rollback_info_trimmed_to_applied;
+  for (vector<ghobject_t>::const_iterator i = rollback_obs.begin();
+       i != rollback_obs.end();
+       ++i) {
+    if (i->generation < trimmed_to.version) {
+      osd->clog.error() << "osd." << osd->whoami
+			<< " pg " << info.pgid
+			<< " found obsolete rollback obj "
+			<< *i << " generation < trimmed_to "
+			<< trimmed_to
+			<< "...repaired";
+      if (!t)
+	t = new ObjectStore::Transaction;
+      t->remove(coll, *i);
+    }
+  }
+  if (t) {
+    derr << __func__ << ": queueing trans to clean up obsolete rollback objs"
+	 << dendl;
+    osd->store->queue_transaction_and_cleanup(osr.get(), t);
+  }
+}
+
 void PG::_scan_snaps(ScrubMap &smap) 
 {
   for (map<hobject_t, ScrubMap::object>::iterator i = smap.objects.begin();
@@ -3349,13 +3414,21 @@ int PG::build_scrub_map_chunk(
 
   // objects
   vector<hobject_t> ls;
-  int ret = get_pgbackend()->objects_list_range(start, end, 0, &ls);
+  vector<ghobject_t> rollback_obs;
+  int ret = get_pgbackend()->objects_list_range(
+    start,
+    end,
+    0,
+    &ls,
+    &rollback_obs);
   if (ret < 0) {
     dout(5) << "objects_list_range error: " << ret << dendl;
     return ret;
   }
 
+
   get_pgbackend()->be_scan_list(map, ls, deep, handle);
+  _scan_rollback_obs(rollback_obs, handle);
   _scan_snaps(map);
 
   // pg attrs
@@ -3578,6 +3651,17 @@ void PG::replica_scrub(
 void PG::scrub(ThreadPool::TPHandle &handle)
 {
   lock();
+  if (g_conf->osd_scrub_sleep > 0 &&
+      (scrubber.state == PG::Scrubber::NEW_CHUNK ||
+       scrubber.state == PG::Scrubber::INACTIVE)) {
+    dout(20) << __func__ << " state is INACTIVE|NEW_CHUNK, sleeping" << dendl;
+    unlock();
+    utime_t t;
+    t.set_from_double(g_conf->osd_scrub_sleep);
+    t.sleep();
+    lock();
+    dout(20) << __func__ << " slept for " << t << dendl;
+  }
   if (deleting) {
     unlock();
     return;
@@ -4631,6 +4715,21 @@ void PG::start_flush(ObjectStore::Transaction *t,
   on_safe->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
 }
 
+void PG::reset_interval_flush()
+{
+  dout(10) << "Clearing blocked outgoing recovery messages" << dendl;
+  recovery_state.clear_blocked_outgoing();
+  
+  if (!osr->flush_commit(
+      new QueuePeeringEvt<IntervalFlush>(
+	this, get_osdmap()->get_epoch(), IntervalFlush()))) {
+    dout(10) << "Beginning to block outgoing recovery messages" << dendl;
+    recovery_state.begin_block_outgoing();
+  } else {
+    dout(10) << "Not blocking outgoing recovery messages" << dendl;
+  }
+}
+
 /* Called before initializing peering during advance_map */
 void PG::start_peering_interval(
   const OSDMapRef lastmap,
@@ -4641,6 +4740,7 @@ void PG::start_peering_interval(
   const OSDMapRef osdmap = get_osdmap();
 
   set_last_peering_reset();
+  reset_interval_flush();
 
   vector<int> oldacting, oldup;
   int oldrole = get_role();
@@ -5050,7 +5150,7 @@ bool PG::can_discard_request(OpRequestRef op)
   case MSG_OSD_PG_PUSH_REPLY:
     return can_discard_replica_op<MOSDPGPushReply, MSG_OSD_PG_PUSH_REPLY>(op);
   case MSG_OSD_SUBOPREPLY:
-    return false;
+    return can_discard_replica_op<MOSDSubOpReply, MSG_OSD_SUBOPREPLY>(op);
 
   case MSG_OSD_EC_WRITE:
     return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op);
@@ -5386,6 +5486,15 @@ PG::RecoveryState::Started::Started(my_context ctx)
 }
 
 boost::statechart::result
+PG::RecoveryState::Started::react(const IntervalFlush&)
+{
+  dout(10) << "Ending blocked outgoing recovery messages" << dendl;
+  context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
+  return discard_event();
+}
+
+
+boost::statechart::result
 PG::RecoveryState::Started::react(const FlushedEvt&)
 {
   PG *pg = context< RecoveryMachine >().pg;
@@ -5436,6 +5545,7 @@ PG::RecoveryState::Reset::Reset(my_context ctx)
 {
   context< RecoveryMachine >().log_enter(state_name);
   PG *pg = context< RecoveryMachine >().pg;
+
   pg->flushes_in_progress = 0;
   pg->set_last_peering_reset();
 }
@@ -5448,6 +5558,14 @@ PG::RecoveryState::Reset::react(const FlushedEvt&)
   return discard_event();
 }
 
+boost::statechart::result
+PG::RecoveryState::Reset::react(const IntervalFlush&)
+{
+  dout(10) << "Ending blocked outgoing recovery messages" << dendl;
+  context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
+  return discard_event();
+}
+
 boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap)
 {
   PG *pg = context< RecoveryMachine >().pg;
@@ -5715,7 +5833,7 @@ void PG::RecoveryState::Backfilling::exit()
 PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
   : my_base(ctx),
     NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/WaitRemoteBackfillReserved"),
-    backfill_osd_it(context< Active >().sorted_backfill_set.begin())
+    backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
 {
   context< RecoveryMachine >().log_enter(state_name);
   PG *pg = context< RecoveryMachine >().pg;
@@ -5728,7 +5846,7 @@ PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserve
 {
   PG *pg = context< RecoveryMachine >().pg;
 
-  if (backfill_osd_it != context< Active >().sorted_backfill_set.end()) {
+  if (backfill_osd_it != context< Active >().remote_shards_to_reserve_backfill.end()) {
     //The primary never backfills itself
     assert(*backfill_osd_it != pg->pg_whoami);
     ConnectionRef con = pg->osd->get_con_osd_cluster(
@@ -5770,8 +5888,8 @@ PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationReje
 
   // Send REJECT to all previously acquired reservations
   set<pg_shard_t>::const_iterator it, begin, end, next;
-  begin = context< Active >().sorted_backfill_set.begin();
-  end = context< Active >().sorted_backfill_set.end();
+  begin = context< Active >().remote_shards_to_reserve_backfill.begin();
+  end = context< Active >().remote_shards_to_reserve_backfill.end();
   assert(begin != end);
   for (next = it = begin, ++next ; next != backfill_osd_it; ++it, ++next) {
     //The primary never backfills itself
@@ -5830,6 +5948,18 @@ PG::RecoveryState::NotBackfilling::NotBackfilling(my_context ctx)
   context< RecoveryMachine >().log_enter(state_name);
 }
 
+boost::statechart::result
+PG::RecoveryState::NotBackfilling::react(const RemoteBackfillReserved &evt)
+{
+  return discard_event();
+}
+
+boost::statechart::result
+PG::RecoveryState::NotBackfilling::react(const RemoteReservationRejected &evt)
+{
+  return discard_event();
+}
+
 void PG::RecoveryState::NotBackfilling::exit()
 {
   context< RecoveryMachine >().log_exit(state_name, enter_time);
@@ -6021,7 +6151,7 @@ void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
 PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
   : my_base(ctx),
     NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
-    acting_osd_it(context< Active >().sorted_actingbackfill_set.begin())
+    remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
 {
   context< RecoveryMachine >().log_enter(state_name);
   post_event(RemoteRecoveryReserved());
@@ -6031,28 +6161,26 @@ boost::statechart::result
 PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
   PG *pg = context< RecoveryMachine >().pg;
 
-  if (acting_osd_it != context< Active >().sorted_actingbackfill_set.end()) {
-    // skip myself
-    if (*acting_osd_it == pg->pg_whoami)
-      ++acting_osd_it;
+  if (remote_recovery_reservation_it != context< Active >().remote_shards_to_reserve_recovery.end()) {
+    assert(*remote_recovery_reservation_it != pg->pg_whoami);
   }
 
-  if (acting_osd_it != context< Active >().sorted_actingbackfill_set.end()) {
+  if (remote_recovery_reservation_it != context< Active >().remote_shards_to_reserve_recovery.end()) {
     ConnectionRef con = pg->osd->get_con_osd_cluster(
-      acting_osd_it->osd, pg->get_osdmap()->get_epoch());
+      remote_recovery_reservation_it->osd, pg->get_osdmap()->get_epoch());
     if (con) {
       if (con->has_feature(CEPH_FEATURE_RECOVERY_RESERVATION)) {
 	pg->osd->send_message_osd_cluster(
           new MRecoveryReserve(
 	    MRecoveryReserve::REQUEST,
-	    spg_t(pg->info.pgid.pgid, acting_osd_it->shard),
+	    spg_t(pg->info.pgid.pgid, remote_recovery_reservation_it->shard),
 	    pg->get_osdmap()->get_epoch()),
 	  con.get());
       } else {
 	post_event(RemoteRecoveryReserved());
       }
     }
-    ++acting_osd_it;
+    ++remote_recovery_reservation_it;
   } else {
     post_event(AllRemotesReserved());
   }
@@ -6086,8 +6214,8 @@ void PG::RecoveryState::Recovering::release_reservations()
 
   // release remote reservations
   for (set<pg_shard_t>::const_iterator i =
-	 context< Active >().sorted_actingbackfill_set.begin();
-        i != context< Active >().sorted_actingbackfill_set.end();
+	 context< Active >().remote_shards_to_reserve_recovery.begin();
+        i != context< Active >().remote_shards_to_reserve_recovery.end();
         ++i) {
     if (*i == pg->pg_whoami) // skip myself
       continue;
@@ -6196,16 +6324,34 @@ void PG::RecoveryState::Clean::exit()
   pg->osd->recoverystate_perf->tinc(rs_clean_latency, dur);
 }
 
+template <typename T>
+set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
+{
+  set<int> osds_found;
+  set<pg_shard_t> out;
+  for (typename T::const_iterator i = in.begin();
+       i != in.end();
+       ++i) {
+    if (*i != skip && !osds_found.count(i->osd)) {
+      osds_found.insert(i->osd);
+      out.insert(*i);
+    }
+  }
+  return out;
+}
+
 /*---------Active---------*/
 PG::RecoveryState::Active::Active(my_context ctx)
   : my_base(ctx),
     NamedState(context< RecoveryMachine >().pg->cct, "Started/Primary/Active"),
-    sorted_actingbackfill_set(
-      context< RecoveryMachine >().pg->actingbackfill.begin(),
-      context< RecoveryMachine >().pg->actingbackfill.end()),
-    sorted_backfill_set(
-      context< RecoveryMachine >().pg->backfill_targets.begin(),
-      context< RecoveryMachine >().pg->backfill_targets.end()),
+    remote_shards_to_reserve_recovery(
+      unique_osd_shard_set(
+	context< RecoveryMachine >().pg->pg_whoami,
+	context< RecoveryMachine >().pg->actingbackfill)),
+    remote_shards_to_reserve_backfill(
+      unique_osd_shard_set(
+	context< RecoveryMachine >().pg->pg_whoami,
+	context< RecoveryMachine >().pg->backfill_targets)),
     all_replicas_activated(false)
 {
   context< RecoveryMachine >().log_enter(state_name);
@@ -6588,6 +6734,7 @@ boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt)
   MOSDPGLog *msg = logevt.msg.get();
   dout(10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
 
+  ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
   if (msg->info.last_backfill == hobject_t()) {
     // restart backfill
     pg->unreg_next_scrub();
@@ -6595,10 +6742,13 @@ boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt)
     pg->reg_next_scrub();
     pg->dirty_info = true;
     pg->dirty_big_info = true;  // maybe.
-    pg->pg_log.claim_log(msg->log);
+
+    PGLogEntryHandler rollbacker;
+    pg->pg_log.claim_log_and_clear_rollback_info(msg->log, &rollbacker);
+    rollbacker.apply(pg, t);
+
     pg->pg_log.reset_backfill();
   } else {
-    ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
     pg->merge_log(*t, msg->info, msg->log, logevt.from);
   }
 
@@ -7492,9 +7642,40 @@ bool PG::PriorSet::affected_by_map(const OSDMapRef osdmap, const PG *debug_pg) c
 
 void PG::RecoveryState::start_handle(RecoveryCtx *new_ctx) {
   assert(!rctx);
-  rctx = new_ctx;
-  if (rctx)
+  assert(!orig_ctx);
+  orig_ctx = new_ctx;
+  if (new_ctx) {
+    if (messages_pending_flush) {
+      rctx = RecoveryCtx(*messages_pending_flush, *new_ctx);
+    } else {
+      rctx = *new_ctx;
+    }
     rctx->start_time = ceph_clock_now(pg->cct);
+  }
+}
+
+void PG::RecoveryState::begin_block_outgoing() {
+  assert(!messages_pending_flush);
+  assert(orig_ctx);
+  assert(rctx);
+  messages_pending_flush = BufferedRecoveryMessages();
+  rctx = RecoveryCtx(*messages_pending_flush, *orig_ctx);
+}
+
+void PG::RecoveryState::clear_blocked_outgoing() {
+  assert(orig_ctx);
+  assert(rctx);
+  messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
+}
+
+void PG::RecoveryState::end_block_outgoing() {
+  assert(messages_pending_flush);
+  assert(orig_ctx);
+  assert(rctx);
+
+  rctx = RecoveryCtx(*orig_ctx);
+  rctx->accept_buffered_messages(*messages_pending_flush);
+  messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
 }
 
 void PG::RecoveryState::end_handle() {
@@ -7502,8 +7683,10 @@ void PG::RecoveryState::end_handle() {
     utime_t dur = ceph_clock_now(pg->cct) - rctx->start_time;
     machine.event_time += dur;
   }
+
   machine.event_count++;
-  rctx = 0;
+  rctx = boost::optional<RecoveryCtx>();
+  orig_ctx = NULL;
 }
 
 void intrusive_ptr_add_ref(PG *pg) { pg->get("intptr"); }
diff --git a/src/osd/PG.h b/src/osd/PG.h
index e9f3981..1aadaf0 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -447,6 +447,25 @@ public:
   eversion_t  last_complete_ondisk;  // last_complete that has committed.
   eversion_t  last_update_applied;
 
+
+  struct C_UpdateLastRollbackInfoTrimmedToApplied : Context {
+    PGRef pg;
+    epoch_t e;
+    eversion_t v;
+    C_UpdateLastRollbackInfoTrimmedToApplied(PG *pg, epoch_t e, eversion_t v)
+      : pg(pg), e(e), v(v) {}
+    void finish(int) {
+      pg->lock();
+      if (!pg->pg_has_reset_since(e)) {
+	pg->last_rollback_info_trimmed_to_applied = v;
+      }
+      pg->unlock();
+    }
+  };
+  // entries <= last_rollback_info_trimmed_to_applied have been trimmed,
+  // and the transaction has applied
+  eversion_t  last_rollback_info_trimmed_to_applied;
+
   // primary state
  public:
   pg_shard_t primary;
@@ -487,6 +506,12 @@ public:
 
 
 public:    
+  struct BufferedRecoveryMessages {
+    map<int, map<spg_t, pg_query_t> > query_map;
+    map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > info_map;
+    map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > notify_list;
+  };
+
   struct RecoveryCtx {
     utime_t start_time;
     map<int, map<spg_t, pg_query_t> > *query_map;
@@ -508,6 +533,48 @@ public:
 	on_applied(on_applied),
 	on_safe(on_safe),
 	transaction(transaction) {}
+
+    RecoveryCtx(BufferedRecoveryMessages &buf, RecoveryCtx &rctx)
+      : query_map(&(buf.query_map)),
+	info_map(&(buf.info_map)),
+	notify_list(&(buf.notify_list)),
+	on_applied(rctx.on_applied),
+	on_safe(rctx.on_safe),
+	transaction(rctx.transaction) {}
+
+    void accept_buffered_messages(BufferedRecoveryMessages &m) {
+      assert(query_map);
+      assert(info_map);
+      assert(notify_list);
+      for (map<int, map<spg_t, pg_query_t> >::iterator i = m.query_map.begin();
+	   i != m.query_map.end();
+	   ++i) {
+	map<spg_t, pg_query_t> &omap = (*query_map)[i->first];
+	for (map<spg_t, pg_query_t>::iterator j = i->second.begin();
+	     j != i->second.end();
+	     ++j) {
+	  omap[j->first] = j->second;
+	}
+      }
+      for (map<int, vector<pair<pg_notify_t, pg_interval_map_t> > >::iterator i
+	     = m.info_map.begin();
+	   i != m.info_map.end();
+	   ++i) {
+	vector<pair<pg_notify_t, pg_interval_map_t> > &ovec =
+	  (*info_map)[i->first];
+	ovec.reserve(ovec.size() + i->second.size());
+	ovec.insert(ovec.end(), i->second.begin(), i->second.end());
+      }
+      for (map<int, vector<pair<pg_notify_t, pg_interval_map_t> > >::iterator i
+	     = m.notify_list.begin();
+	   i != m.notify_list.end();
+	   ++i) {
+	vector<pair<pg_notify_t, pg_interval_map_t> > &ovec =
+	  (*notify_list)[i->first];
+	ovec.reserve(ovec.size() + i->second.size());
+	ovec.insert(ovec.end(), i->second.begin(), i->second.end());
+      }
+    }
   };
 
   struct NamedState {
@@ -1108,6 +1175,9 @@ public:
   void scrub_clear_state();
   bool scrub_gather_replica_maps();
   void _scan_snaps(ScrubMap &map);
+  void _scan_rollback_obs(
+    const vector<ghobject_t> &rollback_obs,
+    ThreadPool::TPHandle &handle);
   void _request_scrub_map_classic(pg_shard_t replica, eversion_t version);
   void _request_scrub_map(pg_shard_t replica, eversion_t version,
                           hobject_t start, hobject_t end, bool deep);
@@ -1333,10 +1403,17 @@ public:
 
   TrivialEvent(AllReplicasActivated)
 
+  TrivialEvent(IntervalFlush)
+
   /* Encapsulates PG recovery process */
   class RecoveryState {
     void start_handle(RecoveryCtx *new_ctx);
     void end_handle();
+  public:
+    void begin_block_outgoing();
+    void end_block_outgoing();
+    void clear_blocked_outgoing();
+  private:
 
     /* States */
     struct Initial;
@@ -1360,40 +1437,47 @@ public:
 
       /* Accessor functions for state methods */
       ObjectStore::Transaction* get_cur_transaction() {
+	assert(state->rctx);
 	assert(state->rctx->transaction);
 	return state->rctx->transaction;
       }
 
       void send_query(pg_shard_t to, const pg_query_t &query) {
+	assert(state->rctx);
 	assert(state->rctx->query_map);
 	(*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] =
 	  query;
       }
 
       map<int, map<spg_t, pg_query_t> > *get_query_map() {
+	assert(state->rctx);
 	assert(state->rctx->query_map);
 	return state->rctx->query_map;
       }
 
       map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > *get_info_map() {
+	assert(state->rctx);
 	assert(state->rctx->info_map);
 	return state->rctx->info_map;
       }
 
       list< Context* > *get_on_safe_context_list() {
+	assert(state->rctx);
 	assert(state->rctx->on_safe);
 	return &(state->rctx->on_safe->contexts);
       }
 
       list< Context * > *get_on_applied_context_list() {
+	assert(state->rctx);
 	assert(state->rctx->on_applied);
 	return &(state->rctx->on_applied->contexts);
       }
 
-      RecoveryCtx *get_recovery_ctx() { return state->rctx; }
+      RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); }
 
       void send_notify(pg_shard_t to,
 		       const pg_notify_t &info, const pg_interval_map_t &pi) {
+	assert(state->rctx);
 	assert(state->rctx->notify_list);
 	(*state->rctx->notify_list)[to.osd].push_back(make_pair(info, pi));
       }
@@ -1439,12 +1523,14 @@ public:
 	boost::statechart::custom_reaction< ActMap >,
 	boost::statechart::custom_reaction< NullEvt >,
 	boost::statechart::custom_reaction< FlushedEvt >,
+	boost::statechart::custom_reaction< IntervalFlush >,
 	boost::statechart::transition< boost::statechart::event_base, Crashed >
 	> reactions;
       boost::statechart::result react(const QueryState& q);
       boost::statechart::result react(const AdvMap&);
       boost::statechart::result react(const ActMap&);
       boost::statechart::result react(const FlushedEvt&);
+      boost::statechart::result react(const IntervalFlush&);
       boost::statechart::result react(const boost::statechart::event_base&) {
 	return discard_event();
       }
@@ -1461,11 +1547,13 @@ public:
 	boost::statechart::custom_reaction< AdvMap >,
 	boost::statechart::custom_reaction< NullEvt >,
 	boost::statechart::custom_reaction< FlushedEvt >,
+	boost::statechart::custom_reaction< IntervalFlush >,
 	boost::statechart::transition< boost::statechart::event_base, Crashed >
 	> reactions;
       boost::statechart::result react(const QueryState& q);
       boost::statechart::result react(const AdvMap&);
       boost::statechart::result react(const FlushedEvt&);
+      boost::statechart::result react(const IntervalFlush&);
       boost::statechart::result react(const boost::statechart::event_base&) {
 	return discard_event();
       }
@@ -1555,8 +1643,8 @@ public:
       Active(my_context ctx);
       void exit();
 
-      const set<pg_shard_t> sorted_actingbackfill_set;
-      const set<pg_shard_t> sorted_backfill_set;
+      const set<pg_shard_t> remote_shards_to_reserve_recovery;
+      const set<pg_shard_t> remote_shards_to_reserve_backfill;
       bool all_replicas_activated;
 
       typedef boost::mpl::list <
@@ -1635,10 +1723,14 @@ public:
 
     struct NotBackfilling : boost::statechart::state< NotBackfilling, Active>, NamedState {
       typedef boost::mpl::list<
-	boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>
+	boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>,
+	boost::statechart::custom_reaction< RemoteBackfillReserved >,
+	boost::statechart::custom_reaction< RemoteReservationRejected >
 	> reactions;
       NotBackfilling(my_context ctx);
       void exit();
+      boost::statechart::result react(const RemoteBackfillReserved& evt);
+      boost::statechart::result react(const RemoteReservationRejected& evt);
     };
 
     struct RepNotRecovering;
@@ -1721,7 +1813,7 @@ public:
 	boost::statechart::custom_reaction< RemoteRecoveryReserved >,
 	boost::statechart::transition< AllRemotesReserved, Recovering >
 	> reactions;
-      set<pg_shard_t>::const_iterator acting_osd_it;
+      set<pg_shard_t>::const_iterator remote_recovery_reservation_it;
       WaitRemoteRecoveryReserved(my_context ctx);
       boost::statechart::result react(const RemoteRecoveryReserved &evt);
       void exit();
@@ -1855,10 +1947,23 @@ public:
 
     RecoveryMachine machine;
     PG *pg;
-    RecoveryCtx *rctx;
+
+    /// context passed in by state machine caller
+    RecoveryCtx *orig_ctx;
+
+    /// populated if we are buffering messages pending a flush
+    boost::optional<BufferedRecoveryMessages> messages_pending_flush;
+
+    /**
+     * populated between start_handle() and end_handle(), points into
+     * the message lists for messages_pending_flush while blocking messages
+     * or into orig_ctx otherwise
+     */
+    boost::optional<RecoveryCtx> rctx;
 
   public:
-    RecoveryState(PG *pg) : machine(this, pg), pg(pg), rctx(0) {
+    RecoveryState(PG *pg)
+      : machine(this, pg), pg(pg), orig_ctx(0) {
       machine.initiate();
     }
 
@@ -1996,7 +2101,10 @@ public:
 
   void add_log_entry(pg_log_entry_t& e, bufferlist& log_bl);
   void append_log(
-    vector<pg_log_entry_t>& logv, eversion_t trim_to, ObjectStore::Transaction &t,
+    vector<pg_log_entry_t>& logv,
+    eversion_t trim_to,
+    eversion_t trim_rollback_to,
+    ObjectStore::Transaction &t,
     bool transaction_applied = true);
   bool check_log_for_corruption(ObjectStore *store);
   void trim_peers();
@@ -2026,6 +2134,7 @@ public:
   /// share new pg log entries after a pg is active
   void share_pg_log();
 
+  void reset_interval_flush();
   void start_peering_interval(
     const OSDMapRef lastmap,
     const vector<int>& newup, int up_primary,
diff --git a/src/osd/PGBackend.cc b/src/osd/PGBackend.cc
index e1aceef..57faadd 100644
--- a/src/osd/PGBackend.cc
+++ b/src/osd/PGBackend.cc
@@ -115,7 +115,11 @@ int PGBackend::objects_list_partial(
   hobject_t *next)
 {
   assert(ls);
-  ghobject_t _next(begin);
+  // Starts with the smallest shard id and generation to
+  // make sure the result list has the marker object (
+  // it might have multiple generations though, which would
+  // be filtered).
+  ghobject_t _next(begin, 0, shard_id_t(0));
   ls->reserve(max);
   int r = 0;
   while (!_next.is_max() && ls->size() < (unsigned)min) {
@@ -147,7 +151,8 @@ int PGBackend::objects_list_range(
   const hobject_t &start,
   const hobject_t &end,
   snapid_t seq,
-  vector<hobject_t> *ls)
+  vector<hobject_t> *ls,
+  vector<ghobject_t> *gen_obs)
 {
   assert(ls);
   vector<ghobject_t> objects;
@@ -163,6 +168,8 @@ int PGBackend::objects_list_range(
        ++i) {
     if (i->is_no_gen()) {
       ls->push_back(i->hobj);
+    } else if (gen_obs) {
+      gen_obs->push_back(*i);
     }
   }
   return r;
diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h
index e10201a..4070752 100644
--- a/src/osd/PGBackend.h
+++ b/src/osd/PGBackend.h
@@ -177,6 +177,7 @@
        vector<pg_log_entry_t> &logv,
        boost::optional<pg_hit_set_history_t> &hset_history,
        const eversion_t &trim_to,
+       const eversion_t &trim_rollback_to,
        bool transaction_applied,
        ObjectStore::Transaction *t) = 0;
 
@@ -496,6 +497,7 @@
      const eversion_t &at_version,        ///< [in] version
      PGTransaction *t,                    ///< [in] trans to execute
      const eversion_t &trim_to,           ///< [in] trim log to here
+     const eversion_t &trim_rollback_to,  ///< [in] trim rollback info to here
      vector<pg_log_entry_t> &log_entries, ///< [in] log entries for t
      /// [in] hitset history (if updated with this transaction)
      boost::optional<pg_hit_set_history_t> &hset_history,
@@ -555,7 +557,8 @@
      const hobject_t &start,
      const hobject_t &end,
      snapid_t seq,
-     vector<hobject_t> *ls);
+     vector<hobject_t> *ls,
+     vector<ghobject_t> *gen_obs=0);
 
    int objects_get_attr(
      const hobject_t &hoid,
diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc
index c3addd7..9523b12 100644
--- a/src/osd/PGLog.cc
+++ b/src/osd/PGLog.cc
@@ -24,6 +24,25 @@
 
 //////////////////// PGLog::IndexedLog ////////////////////
 
+void PGLog::IndexedLog::advance_rollback_info_trimmed_to(
+  eversion_t to,
+  LogEntryHandler *h)
+{
+  assert(to <= can_rollback_to);
+
+  if (to > rollback_info_trimmed_to)
+    rollback_info_trimmed_to = to;
+
+  while (rollback_info_trimmed_to_riter != log.rbegin()) {
+    --rollback_info_trimmed_to_riter;
+    if (rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to) {
+      ++rollback_info_trimmed_to_riter;
+      break;
+    }
+    h->trim(*rollback_info_trimmed_to_riter);
+  }
+}
+
 void PGLog::IndexedLog::split_into(
   pg_t child_pgid,
   unsigned split_bits,
@@ -47,9 +66,11 @@ void PGLog::IndexedLog::split_into(
     oldlog.erase(i++);
   }
 
+
+  olog->can_rollback_to = can_rollback_to;
+
   olog->index();
   index();
-  olog->can_rollback_to = can_rollback_to;
 }
 
 void PGLog::IndexedLog::trim(
@@ -59,10 +80,15 @@ void PGLog::IndexedLog::trim(
 {
   if (complete_to != log.end() &&
       complete_to->version <= s) {
-    generic_dout(0) << " bad trim to " << s << " when complete_to is " << complete_to->version
+    generic_dout(0) << " bad trim to " << s << " when complete_to is "
+		    << complete_to->version
 		    << " on " << *this << dendl;
   }
 
+  if (s > can_rollback_to)
+    can_rollback_to = s;
+  advance_rollback_info_trimmed_to(s, handler);
+
   while (!log.empty()) {
     pg_log_entry_t &e = *log.begin();
     if (e.version > s)
@@ -70,9 +96,15 @@ void PGLog::IndexedLog::trim(
     generic_dout(20) << "trim " << e << dendl;
     if (trimmed)
       trimmed->insert(e.version);
-    handler->trim(e);
+
     unindex(e);         // remove from index,
-    log.pop_front();    // from log
+
+    if (e.version == rollback_info_trimmed_to_riter->version) {
+      log.pop_front();
+      rollback_info_trimmed_to_riter = log.rend();
+    } else {
+      log.pop_front();
+    }
   }
 
   // raise tail?
@@ -104,7 +136,7 @@ void PGLog::reset_backfill()
 void PGLog::clear() {
   divergent_priors.clear();
   missing.clear();
-  log.zero();
+  log.clear();
   log_keys_debug.clear();
   undirty();
 }
diff --git a/src/osd/PGLog.h b/src/osd/PGLog.h
index f793cbd..1744cc8 100644
--- a/src/osd/PGLog.h
+++ b/src/osd/PGLog.h
@@ -62,12 +62,34 @@ struct PGLog {
     list<pg_log_entry_t>::iterator complete_to;  // not inclusive of referenced item
     version_t last_requested;           // last object requested by primary
 
+    //
+  private:
+    /**
+     * rollback_info_trimmed_to_riter points to the first log entry <=
+     * rollback_info_trimmed_to
+     *
+     * It's a reverse_iterator because rend() is a natural representation for
+     * tail, and rbegin() works nicely for head.
+     */
+    list<pg_log_entry_t>::reverse_iterator rollback_info_trimmed_to_riter;
+  public:
+    void advance_rollback_info_trimmed_to(eversion_t to, LogEntryHandler *h);
+
     /****/
-    IndexedLog() : last_requested(0) {}
+    IndexedLog() :
+      complete_to(log.end()),
+      last_requested(0),
+      rollback_info_trimmed_to_riter(log.rbegin())
+      {}
+
+    void claim_log_and_clear_rollback_info(const pg_log_t& o) {
+      // we must have already trimmed the old entries
+      assert(rollback_info_trimmed_to == head);
+      assert(rollback_info_trimmed_to_riter == log.rbegin());
 
-    void claim_log(const pg_log_t& o) {
       log = o.log;
       head = o.head;
+      rollback_info_trimmed_to = head;
       tail = o.tail;
       index();
     }
@@ -78,10 +100,20 @@ struct PGLog {
       IndexedLog *olog);
 
     void zero() {
+      // we must have already trimmed the old entries
+      assert(rollback_info_trimmed_to == head);
+      assert(rollback_info_trimmed_to_riter == log.rbegin());
+
       unindex();
       pg_log_t::clear();
+      rollback_info_trimmed_to_riter = log.rbegin();
       reset_recovery_pointers();
     }
+    void clear() {
+      rollback_info_trimmed_to = head;
+      rollback_info_trimmed_to_riter = log.rbegin();
+      zero();
+    }
     void reset_recovery_pointers() {
       complete_to = log.end();
       last_requested = 0;
@@ -112,6 +144,11 @@ struct PGLog {
 	  caller_ops[i->reqid] = &(*i);
 	}
       }
+
+      rollback_info_trimmed_to_riter = log.rbegin();
+      while (rollback_info_trimmed_to_riter != log.rend() &&
+	     rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to)
+	rollback_info_trimmed_to_riter++;
     }
 
     void index(pg_log_entry_t& e) {
@@ -141,6 +178,11 @@ struct PGLog {
     void add(pg_log_entry_t& e) {
       // add to log
       log.push_back(e);
+
+      // riter previously pointed to the previous entry
+      if (rollback_info_trimmed_to_riter == log.rbegin())
+	++rollback_info_trimmed_to_riter;
+
       assert(e.version > head);
       assert(head.version == 0 || e.version.version > head.version);
       head = e.version;
@@ -325,14 +367,33 @@ public:
     eversion_t trim_to,
     pg_info_t &info);
 
-  void clear_can_rollback_to() {
+  void trim_rollback_info(
+    eversion_t trim_rollback_to,
+    LogEntryHandler *h) {
+    if (trim_rollback_to > log.can_rollback_to)
+      log.can_rollback_to = trim_rollback_to;
+    log.advance_rollback_info_trimmed_to(
+      trim_rollback_to,
+      h);
+  }
+
+  eversion_t get_rollback_trimmed_to() const {
+    return log.rollback_info_trimmed_to;
+  }
+
+  void clear_can_rollback_to(LogEntryHandler *h) {
     log.can_rollback_to = log.head;
+    log.advance_rollback_info_trimmed_to(
+      log.head,
+      h);
   }
 
   //////////////////// get or set log & missing ////////////////////
 
-  void claim_log(const pg_log_t &o) {
-    log.claim_log(o);
+  void claim_log_and_clear_rollback_info(const pg_log_t &o, LogEntryHandler *h) {
+    log.can_rollback_to = log.head;
+    log.advance_rollback_info_trimmed_to(log.head, h);
+    log.claim_log_and_clear_rollback_info(o);
     missing.clear();
     mark_dirty_to(eversion_t::max());
   }
diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc
index 489aee4..4430b39 100644
--- a/src/osd/ReplicatedBackend.cc
+++ b/src/osd/ReplicatedBackend.cc
@@ -494,6 +494,7 @@ void ReplicatedBackend::submit_transaction(
   const eversion_t &at_version,
   PGTransaction *_t,
   const eversion_t &trim_to,
+  const eversion_t &trim_rollback_to,
   vector<pg_log_entry_t> &log_entries,
   boost::optional<pg_hit_set_history_t> &hset_history,
   Context *on_local_applied_sync,
@@ -534,6 +535,7 @@ void ReplicatedBackend::submit_transaction(
     tid,
     reqid,
     trim_to,
+    trim_rollback_to,
     t->get_temp_added().size() ? *(t->get_temp_added().begin()) : hobject_t(),
     t->get_temp_cleared().size() ?
       *(t->get_temp_cleared().begin()) :hobject_t(),
@@ -549,7 +551,13 @@ void ReplicatedBackend::submit_transaction(
   }
   clear_temp_objs(t->get_temp_cleared());
 
-  parent->log_operation(log_entries, hset_history, trim_to, true, &local_t);
+  parent->log_operation(
+    log_entries,
+    hset_history,
+    trim_to,
+    trim_rollback_to,
+    true,
+    &local_t);
   local_t.append(*op_t);
   local_t.swap(*op_t);
   
diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h
index 2d75d42..5e1f0ec 100644
--- a/src/osd/ReplicatedBackend.h
+++ b/src/osd/ReplicatedBackend.h
@@ -342,6 +342,7 @@ public:
     const eversion_t &at_version,
     PGTransaction *t,
     const eversion_t &trim_to,
+    const eversion_t &trim_rollback_to,
     vector<pg_log_entry_t> &log_entries,
     boost::optional<pg_hit_set_history_t> &hset_history,
     Context *on_local_applied_sync,
@@ -359,6 +360,7 @@ private:
     ceph_tid_t tid,
     osd_reqid_t reqid,
     eversion_t pg_trim_to,
+    eversion_t pg_trim_rollback_to,
     hobject_t new_temp_oid,
     hobject_t discard_temp_oid,
     vector<pg_log_entry_t> &log_entries,
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index 15d2edf..5600466 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -1120,6 +1120,12 @@ void ReplicatedPG::do_request(
       waiting_for_active.push_back(op);
       return;
     }
+    // verify client features
+    if ((pool.info.has_tiers() || pool.info.is_tier()) &&
+	!op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
+      osd->reply_op_error(op, -EOPNOTSUPP);
+      return;
+    }
     do_op(op); // do it now
     break;
 
@@ -1352,9 +1358,10 @@ void ReplicatedPG::do_op(OpRequestRef op)
 	hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
       hit_set_persist();
     }
+  }
 
-    if (agent_state)
-      agent_choose_mode();
+  if (agent_state) {
+    agent_choose_mode();
   }
 
   if ((m->get_flags() & CEPH_OSD_FLAG_IGNORE_CACHE) == 0 &&
@@ -4854,8 +4861,9 @@ void ReplicatedPG::make_writeable(OpContext *ctx)
       if (pool.info.require_rollback())
 	ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
       snap_oi = &ctx->clone_obc->obs.oi;
-      bool got = ctx->clone_obc->get_write(ctx->op);
+      bool got = ctx->clone_obc->get_write_greedy(ctx->op);
       assert(got);
+      dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
     } else {
       snap_oi = &static_snap_oi;
     }
@@ -5160,8 +5168,9 @@ void ReplicatedPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc
 					0, osd_reqid_t(), ctx->mtime));
 
       ctx->snapset_obc = get_object_context(snapoid, true);
-      bool got = ctx->snapset_obc->get_write(ctx->op);
+      bool got = ctx->snapset_obc->get_write_greedy(ctx->op);
       assert(got);
+      dout(20) << " got greedy write on snapset_obc " << *ctx->snapset_obc << dendl;
       ctx->release_snapset_obc = true;
       if (pool.info.require_rollback() && !ctx->snapset_obc->obs.exists) {
 	ctx->log.back().mod_desc.create();
@@ -6026,6 +6035,11 @@ void ReplicatedPG::cancel_copy(CopyOpRef cop, bool requeue)
   cop->results.should_requeue = requeue;
   CopyCallbackResults result(-ECANCELED, &cop->results);
   cop->cb->complete(result);
+
+  // There may still be an objecter callback referencing this copy op.
+  // That callback will not need the obc since it's been canceled, and
+  // we need the obc reference to go away prior to flush.
+  cop->obc = ObjectContextRef();
 }
 
 void ReplicatedPG::cancel_copy_ops(bool requeue)
@@ -6169,58 +6183,53 @@ int ReplicatedPG::start_flush(
     cancel_flush(fop, false);
   }
 
-  // construct a SnapContext appropriate for this clone/head
-  SnapContext dsnapc;
-  dsnapc.seq = 0;
-  SnapContext snapc;
-  if (soid.snap == CEPH_NOSNAP) {
-    snapc.seq = snapset.seq;
-    snapc.snaps = snapset.snaps;
+  /**
+   * In general, we need to send two deletes and a copyfrom.
+   * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
+   * where 4 is marked as clean.  To flush 10, we have to:
+   * 1) delete 4:[4,3,2] -- ensure head is created at cloneid 4
+   * 2) delete (8-1):[4,3,2] -- ensure that the object does not exist at 8
+   * 3) copyfrom 8:[8,4,3,2] -- flush object excluding snap 8
+   *
+   * The second delete is required in case at some point in the past
+   * there had been a clone 7(7,6), which we had flushed.  Without
+   * the second delete, the object would appear in the base pool to
+   * have existed.
+   */
 
-    if (!snapset.clones.empty() && snapset.clones.back() != snapset.seq) {
-      dsnapc.seq = snapset.clones.back();
-      vector<snapid_t>::iterator p = snapset.snaps.begin();
-      while (p != snapset.snaps.end() && *p > dsnapc.seq)
-	++p;
-      dsnapc.snaps = vector<snapid_t>(p, snapset.snaps.end());
+  SnapContext snapc, dsnapc, dsnapc2;
+  if (snapset.seq != 0) {
+    if (soid.snap == CEPH_NOSNAP) {
+      snapc.seq = snapset.seq;
+      snapc.snaps = snapset.snaps;
+    } else {
+      snapid_t min_included_snap = oi.snaps.back();
+      snapc = snapset.get_ssc_as_of(min_included_snap - 1);
     }
-  } else {
-    vector<snapid_t>::iterator citer = std::find(
-      snapset.clones.begin(),
-      snapset.clones.end(),
-      soid.snap);
-    assert(citer != snapset.clones.end());
-    snapid_t prev_snapc = (citer == snapset.clones.begin()) ?
-      snapid_t(0) : *(citer - 1);
-
-    vector<snapid_t>::iterator p = snapset.snaps.begin();
-    while (p != snapset.snaps.end() && *p >= oi.snaps.back())
-      ++p;
-    snapc.snaps = vector<snapid_t>(p, snapset.snaps.end());
 
-    while (p != snapset.snaps.end() && *p >= oi.snaps.back())
-      ++p;
-    vector<snapid_t>::iterator dnewest = p;
-
-    // we may need to send a delete first
-    while (p != snapset.snaps.end() && *p > prev_snapc)
-      ++p;
-    dsnapc.snaps = vector<snapid_t>(p, snapset.snaps.end());
+    snapid_t prev_snapc = 0;
+    for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin();
+	 citer != snapset.clones.rend();
+	 ++citer) {
+      if (*citer < soid.snap) {
+	prev_snapc = *citer;
+	break;
+      }
+    }
 
-    if (p == dnewest) {
-      // no snaps between the oldest in this clone and prev_snapc
-      snapc.seq = prev_snapc;
-    } else {
-      // snaps between oldest in this clone and prev_snapc, send delete
-      dsnapc.seq = prev_snapc;
-      snapc.seq = oi.snaps.back() - 1;
+    if (prev_snapc != snapc.seq) {
+      dsnapc = snapset.get_ssc_as_of(prev_snapc);
+      snapid_t first_snap_after_prev_snapc =
+	snapset.get_first_snap_after(prev_snapc, snapc.seq);
+      dsnapc2 = snapset.get_ssc_as_of(
+	first_snap_after_prev_snapc - 1);
     }
   }
 
   object_locator_t base_oloc(soid);
   base_oloc.pool = pool.info.tier_of;
 
-  if (dsnapc.seq > 0) {
+  if (dsnapc.seq > 0 && dsnapc.seq < snapc.seq) {
     ObjectOperation o;
     o.remove();
     osd->objecter_lock.Lock();
@@ -6238,6 +6247,24 @@ int ReplicatedPG::start_flush(
     osd->objecter_lock.Unlock();
   }
 
+  if (dsnapc2.seq > dsnapc.seq && dsnapc2.seq < snapc.seq) {
+    ObjectOperation o;
+    o.remove();
+    osd->objecter_lock.Lock();
+    osd->objecter->mutate(
+      soid.oid,
+      base_oloc,
+      o,
+      dsnapc2,
+      oi.mtime,
+      (CEPH_OSD_FLAG_IGNORE_OVERLAY |
+       CEPH_OSD_FLAG_ORDERSNAP |
+       CEPH_OSD_FLAG_ENFORCE_SNAPC),
+      NULL,
+      NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
+    osd->objecter_lock.Unlock();
+  }
+
   FlushOpRef fop(new FlushOp);
   fop->obc = obc;
   fop->flushed_version = oi.user_version;
@@ -6442,7 +6469,7 @@ void ReplicatedPG::cancel_flush_ops(bool requeue)
 
 bool ReplicatedPG::is_present_clone(hobject_t coid)
 {
-  if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
+  if (!pool.info.allow_incomplete_clones())
     return true;
   if (is_missing_object(coid))
     return true;
@@ -6735,6 +6762,7 @@ void ReplicatedPG::issue_repop(RepGather *repop, utime_t now)
     repop->ctx->at_version,
     repop->ctx->op_t,
     pg_trim_to,
+    min_last_complete_ondisk,
     repop->ctx->log,
     repop->ctx->updated_hset_history,
     onapplied_sync,
@@ -6752,6 +6780,7 @@ void ReplicatedBackend::issue_op(
   ceph_tid_t tid,
   osd_reqid_t reqid,
   eversion_t pg_trim_to,
+  eversion_t pg_trim_rollback_to,
   hobject_t new_temp_oid,
   hobject_t discard_temp_oid,
   vector<pg_log_entry_t> &log_entries,
@@ -6807,6 +6836,7 @@ void ReplicatedBackend::issue_op(
       wr->pg_stats = get_info().stats;
     
     wr->pg_trim_to = pg_trim_to;
+    wr->pg_trim_rollback_to = pg_trim_rollback_to;
 
     wr->new_temp_oid = new_temp_oid;
     wr->discard_temp_oid = discard_temp_oid;
@@ -6841,6 +6871,12 @@ ReplicatedPG::RepGather *ReplicatedPG::new_repop(OpContext *ctx, ObjectContextRe
 void ReplicatedPG::remove_repop(RepGather *repop)
 {
   dout(20) << __func__ << " " << *repop << dendl;
+  if (repop->ctx->obc)
+    dout(20) << " obc " << *repop->ctx->obc << dendl;
+  if (repop->ctx->clone_obc)
+    dout(20) << " clone_obc " << *repop->ctx->clone_obc << dendl;
+  if (repop->ctx->snapset_obc)
+    dout(20) << " snapset_obc " << *repop->ctx->snapset_obc << dendl;
   release_op_ctx_locks(repop->ctx);
   repop->ctx->finish(0);  // FIXME: return value here is sloppy
   repop_map.erase(repop->rep_tid);
@@ -7607,6 +7643,7 @@ void ReplicatedBackend::sub_op_modify(OpRequestRef op)
       log,
       m->updated_hit_set_history,
       m->pg_trim_to,
+      m->pg_trim_rollback_to,
       update_snaps,
       &(rm->localt));
       
@@ -7702,8 +7739,8 @@ void ReplicatedBackend::calc_head_subsets(
   if (size)
     data_subset.insert(0, size);
 
-  if (get_parent()->get_pool().cache_mode != pg_pool_t::CACHEMODE_NONE) {
-    dout(10) << __func__ << ": caching enabled, skipping clone subsets" << dendl;
+  if (get_parent()->get_pool().allow_incomplete_clones()) {
+    dout(10) << __func__ << ": caching (was) enabled, skipping clone subsets" << dendl;
     return;
   }
 
@@ -7762,8 +7799,8 @@ void ReplicatedBackend::calc_clone_subsets(
   if (size)
     data_subset.insert(0, size);
 
-  if (get_parent()->get_pool().cache_mode != pg_pool_t::CACHEMODE_NONE) {
-    dout(10) << __func__ << ": caching enabled, skipping clone subsets" << dendl;
+  if (get_parent()->get_pool().allow_incomplete_clones()) {
+    dout(10) << __func__ << ": caching (was) enabled, skipping clone subsets" << dendl;
     return;
   }
 
@@ -9465,6 +9502,17 @@ void ReplicatedPG::on_role_change()
 void ReplicatedPG::on_pool_change()
 {
   dout(10) << __func__ << dendl;
+  // requeue cache full waiters just in case the cache_mode is
+  // changing away from writeback mode.  note that if we are not
+  // active the normal requeuing machinery is sufficient (and properly
+  // ordered).
+  if (is_active() &&
+      pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
+      !waiting_for_cache_not_full.empty()) {
+    dout(10) << __func__ << " requeuing full waiters (not in writeback) "
+	     << dendl;
+    requeue_ops(waiting_for_cache_not_full);
+  }
   hit_set_setup();
   agent_setup();
 }
@@ -10701,6 +10749,9 @@ void ReplicatedPG::hit_set_create()
     if (p->target_size < static_cast<uint64_t>(g_conf->osd_hit_set_min_size))
       p->target_size = g_conf->osd_hit_set_min_size;
 
+    if (p->target_size > static_cast<uint64_t>(g_conf->osd_hit_set_max_size))
+      p->target_size = g_conf->osd_hit_set_max_size;
+
     p->seed = now.sec();
 
     dout(10) << __func__ << " target_size " << p->target_size
@@ -11289,7 +11340,8 @@ bool ReplicatedPG::agent_maybe_evict(ObjectContextRef& obc)
     }
   }
 
-  if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
+  if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL &&
+      hit_set) {
     // is this object old and/or cold enough?
     int atime = -1, temp = 0;
     agent_estimate_atime_temp(soid, &atime, NULL /*FIXME &temp*/);
@@ -11421,7 +11473,11 @@ void ReplicatedPG::agent_choose_mode(bool restart)
       num_dirty = 0;
   }
 
-  dout(10) << __func__ << ": "
+  dout(10) << __func__
+	   << " flush_mode: "
+	   << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
+	   << " evict_mode: "
+	   << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
 	   << " num_objects: " << info.stats.stats.sum.num_objects
 	   << " num_bytes: " << info.stats.stats.sum.num_bytes
 	   << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
@@ -11435,7 +11491,7 @@ void ReplicatedPG::agent_choose_mode(bool restart)
   // get dirty, full ratios
   uint64_t dirty_micro = 0;
   uint64_t full_micro = 0;
-  if (pool.info.target_max_bytes && info.stats.stats.sum.num_objects) {
+  if (pool.info.target_max_bytes && info.stats.stats.sum.num_objects > 0) {
     uint64_t avg_size = info.stats.stats.sum.num_bytes /
       info.stats.stats.sum.num_objects;
     dirty_micro =
@@ -11445,7 +11501,7 @@ void ReplicatedPG::agent_choose_mode(bool restart)
       num_user_objects * avg_size * 1000000 /
       MAX(pool.info.target_max_bytes / divisor, 1);
   }
-  if (pool.info.target_max_objects) {
+  if (pool.info.target_max_objects > 0) {
     uint64_t dirty_objects_micro =
       num_dirty * 1000000 /
       MAX(pool.info.target_max_objects / divisor, 1);
@@ -11531,8 +11587,10 @@ void ReplicatedPG::agent_choose_mode(bool restart)
 	    << " -> "
 	    << TierAgentState::get_evict_mode_name(evict_mode)
 	    << dendl;
-    if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
+    if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
+	is_active()) {
       requeue_ops(waiting_for_cache_not_full);
+      requeue_ops(waiting_for_active);
     }
     agent_state->evict_mode = evict_mode;
   }
@@ -11660,7 +11718,7 @@ void ReplicatedPG::_scrub(ScrubMap& scrubmap)
 
       // did we finish the last oid?
       if (head != hobject_t() &&
-	  pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
+	  !pool.info.allow_incomplete_clones()) {
 	osd->clog.error() << mode << " " << info.pgid << " " << head
 			  << " missing clones";
         ++scrubber.shallow_errors;
@@ -11721,7 +11779,7 @@ void ReplicatedPG::_scrub(ScrubMap& scrubmap)
     //
 
     if (!next_clone.is_min() && next_clone != soid &&
-	pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE) {
+	pool.info.allow_incomplete_clones()) {
       // it is okay to be missing one or more clones in a cache tier.
       // skip higher-numbered clones in the list.
       while (curclone != snapset.clones.rend() &&
@@ -11809,7 +11867,7 @@ void ReplicatedPG::_scrub(ScrubMap& scrubmap)
   }
 
   if (!next_clone.is_min() &&
-      pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
+      !pool.info.allow_incomplete_clones()) {
     osd->clog.error() << mode << " " << info.pgid
 		      << " expected clone " << next_clone;
     ++scrubber.shallow_errors;
diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h
index 562cb06..9ef131c 100644
--- a/src/osd/ReplicatedPG.h
+++ b/src/osd/ReplicatedPG.h
@@ -347,13 +347,14 @@ public:
     vector<pg_log_entry_t> &logv,
     boost::optional<pg_hit_set_history_t> &hset_history,
     const eversion_t &trim_to,
+    const eversion_t &trim_rollback_to,
     bool transaction_applied,
     ObjectStore::Transaction *t) {
     if (hset_history) {
       info.hit_set = *hset_history;
       dirty_info = true;
     }
-    append_log(logv, trim_to, *t, transaction_applied);
+    append_log(logv, trim_to, trim_rollback_to, *t, transaction_applied);
   }
 
   void op_applied(
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index c57ee86..16bdbaf 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -2102,8 +2102,8 @@ void pg_notify_t::dump(Formatter *f) const
 {
   f->dump_int("from", from);
   f->dump_int("to", to);
-  f->dump_stream("query_epoch") << query_epoch;
-  f->dump_stream("epoch_sent") << epoch_sent;
+  f->dump_unsigned("query_epoch", query_epoch);
+  f->dump_unsigned("epoch_sent", epoch_sent);
   {
     f->open_object_section("info");
     info.dump(f);
@@ -2461,8 +2461,8 @@ struct DumpVisitor : public ObjectModDesc::Visitor {
 void ObjectModDesc::dump(Formatter *f) const
 {
   f->open_object_section("object_mod_desc");
-  f->dump_stream("can_local_rollback") << can_local_rollback;
-  f->dump_stream("stashed") << stashed;
+  f->dump_bool("can_local_rollback", can_local_rollback);
+  f->dump_bool("rollback_info_completed", rollback_info_completed);
   {
     f->open_array_section("ops");
     DumpVisitor vis(f);
@@ -2497,7 +2497,7 @@ void ObjectModDesc::encode(bufferlist &_bl) const
 {
   ENCODE_START(1, 1, _bl);
   ::encode(can_local_rollback, _bl);
-  ::encode(stashed, _bl);
+  ::encode(rollback_info_completed, _bl);
   ::encode(bl, _bl);
   ENCODE_FINISH(_bl);
 }
@@ -2505,7 +2505,7 @@ void ObjectModDesc::decode(bufferlist::iterator &_bl)
 {
   DECODE_START(1, _bl);
   ::decode(can_local_rollback, _bl);
-  ::decode(stashed, _bl);
+  ::decode(rollback_info_completed, _bl);
   ::decode(bl, _bl);
   DECODE_FINISH(_bl);
 }
@@ -2680,17 +2680,18 @@ ostream& operator<<(ostream& out, const pg_log_entry_t& e)
 
 void pg_log_t::encode(bufferlist& bl) const
 {
-  ENCODE_START(5, 3, bl);
+  ENCODE_START(6, 3, bl);
   ::encode(head, bl);
   ::encode(tail, bl);
   ::encode(log, bl);
   ::encode(can_rollback_to, bl);
+  ::encode(rollback_info_trimmed_to, bl);
   ENCODE_FINISH(bl);
 }
  
 void pg_log_t::decode(bufferlist::iterator &bl, int64_t pool)
 {
-  DECODE_START_LEGACY_COMPAT_LEN(5, 3, 3, bl);
+  DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl);
   ::decode(head, bl);
   ::decode(tail, bl);
   if (struct_v < 2) {
@@ -2700,6 +2701,11 @@ void pg_log_t::decode(bufferlist::iterator &bl, int64_t pool)
   ::decode(log, bl);
   if (struct_v >= 5)
     ::decode(can_rollback_to, bl);
+
+  if (struct_v >= 6)
+    ::decode(rollback_info_trimmed_to, bl);
+  else
+    rollback_info_trimmed_to = tail;
   DECODE_FINISH(bl);
 
   // handle hobject_t format change
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index b70951c..8e9cf6f 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -811,9 +811,10 @@ struct pg_pool_t {
   }
 
   enum {
-    FLAG_HASHPSPOOL = 1, // hash pg seed and pool together (instead of adding)
-    FLAG_FULL       = 2, // pool is full
+    FLAG_HASHPSPOOL = 1<<0, // hash pg seed and pool together (instead of adding)
+    FLAG_FULL       = 1<<1, // pool is full
     FLAG_DEBUG_FAKE_EC_POOL = 1<<2, // require ReplicatedPG to act like an EC pg
+    FLAG_INCOMPLETE_CLONES = 1<<3, // may have incomplete clones (bc we are/were an overlay)
   };
 
   static const char *get_flag_name(int f) {
@@ -821,6 +822,7 @@ struct pg_pool_t {
     case FLAG_HASHPSPOOL: return "hashpspool";
     case FLAG_FULL: return "full";
     case FLAG_DEBUG_FAKE_EC_POOL: return "require_local_rollback";
+    case FLAG_INCOMPLETE_CLONES: return "incomplete_clones";
     default: return "???";
     }
   }
@@ -868,6 +870,18 @@ struct pg_pool_t {
   const char *get_cache_mode_name() const {
     return get_cache_mode_name(cache_mode);
   }
+  bool cache_mode_requires_hit_set() const {
+    switch (cache_mode) {
+    case CACHEMODE_NONE:
+    case CACHEMODE_FORWARD:
+    case CACHEMODE_READONLY:
+      return false;
+    case CACHEMODE_WRITEBACK:
+      return true;
+    default:
+      assert(0 == "implement me");
+    }
+  }
 
   uint64_t flags;           ///< FLAG_*
   __u8 type;                ///< TYPE_*
@@ -916,11 +930,29 @@ public:
 
   bool is_tier() const { return tier_of >= 0; }
   bool has_tiers() const { return !tiers.empty(); }
-  void clear_tier() { tier_of = -1; }
+  void clear_tier() {
+    tier_of = -1;
+    clear_read_tier();
+    clear_write_tier();
+    clear_tier_tunables();
+  }
   bool has_read_tier() const { return read_tier >= 0; }
   void clear_read_tier() { read_tier = -1; }
   bool has_write_tier() const { return write_tier >= 0; }
   void clear_write_tier() { write_tier = -1; }
+  void clear_tier_tunables() {
+    if (cache_mode != CACHEMODE_NONE)
+      flags |= FLAG_INCOMPLETE_CLONES;
+    cache_mode = CACHEMODE_NONE;
+
+    target_max_bytes = 0;
+    target_max_objects = 0;
+    cache_target_dirty_ratio_micro = 0;
+    cache_target_full_ratio_micro = 0;
+    hit_set_params = HitSet::Params();
+    hit_set_period = 0;
+    hit_set_count = 0;
+  }
 
   uint64_t target_max_bytes;   ///< tiering: target max pool size
   uint64_t target_max_objects; ///< tiering: target max pool size
@@ -964,6 +996,7 @@ public:
   void dump(Formatter *f) const;
 
   uint64_t get_flags() const { return flags; }
+  bool has_flag(uint64_t f) const { return flags & f; }
 
   /// This method will later return true for ec pools as well
   bool ec_pool() const {
@@ -973,6 +1006,11 @@ public:
     return ec_pool() || flags & FLAG_DEBUG_FAKE_EC_POOL;
   }
 
+  /// true if incomplete clones may be present
+  bool allow_incomplete_clones() const {
+    return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES);
+  }
+
   unsigned get_type() const { return type; }
   unsigned get_size() const { return size; }
   unsigned get_min_size() const { return min_size; }
@@ -1811,7 +1849,7 @@ inline ostream& operator<<(ostream& out, const pg_query_t& q) {
 class PGBackend;
 class ObjectModDesc {
   bool can_local_rollback;
-  bool stashed;
+  bool rollback_info_completed;
 public:
   class Visitor {
   public:
@@ -1831,22 +1869,22 @@ public:
     CREATE = 4,
     UPDATE_SNAPS = 5
   };
-  ObjectModDesc() : can_local_rollback(true), stashed(false) {}
+  ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) {}
   void claim(ObjectModDesc &other) {
     bl.clear();
     bl.claim(other.bl);
     can_local_rollback = other.can_local_rollback;
-    stashed = other.stashed;
+    rollback_info_completed = other.rollback_info_completed;
   }
   void claim_append(ObjectModDesc &other) {
-    if (!can_local_rollback || stashed)
+    if (!can_local_rollback || rollback_info_completed)
       return;
     if (!other.can_local_rollback) {
       mark_unrollbackable();
       return;
     }
     bl.claim_append(other.bl);
-    stashed = other.stashed;
+    rollback_info_completed = other.rollback_info_completed;
   }
   void swap(ObjectModDesc &other) {
     bl.swap(other.bl);
@@ -1855,16 +1893,16 @@ public:
     other.can_local_rollback = can_local_rollback;
     can_local_rollback = temp;
 
-    temp = other.stashed;
-    other.stashed = stashed;
-    stashed = temp;
+    temp = other.rollback_info_completed;
+    other.rollback_info_completed = rollback_info_completed;
+    rollback_info_completed = temp;
   }
   void append_id(ModID id) {
     uint8_t _id(id);
     ::encode(_id, bl);
   }
   void append(uint64_t old_size) {
-    if (!can_local_rollback || stashed)
+    if (!can_local_rollback || rollback_info_completed)
       return;
     ENCODE_START(1, 1, bl);
     append_id(APPEND);
@@ -1872,7 +1910,7 @@ public:
     ENCODE_FINISH(bl);
   }
   void setattrs(map<string, boost::optional<bufferlist> > &old_attrs) {
-    if (!can_local_rollback || stashed)
+    if (!can_local_rollback || rollback_info_completed)
       return;
     ENCODE_START(1, 1, bl);
     append_id(SETATTRS);
@@ -1880,24 +1918,25 @@ public:
     ENCODE_FINISH(bl);
   }
   bool rmobject(version_t deletion_version) {
-    if (!can_local_rollback || stashed)
+    if (!can_local_rollback || rollback_info_completed)
       return false;
     ENCODE_START(1, 1, bl);
     append_id(DELETE);
     ::encode(deletion_version, bl);
     ENCODE_FINISH(bl);
-    stashed = true;
+    rollback_info_completed = true;
     return true;
   }
   void create() {
-    if (!can_local_rollback || stashed)
+    if (!can_local_rollback || rollback_info_completed)
       return;
+    rollback_info_completed = true;
     ENCODE_START(1, 1, bl);
     append_id(CREATE);
     ENCODE_FINISH(bl);
   }
   void update_snaps(set<snapid_t> &old_snaps) {
-    if (!can_local_rollback || stashed)
+    if (!can_local_rollback || rollback_info_completed)
       return;
     ENCODE_START(1, 1, bl);
     append_id(UPDATE_SNAPS);
@@ -2061,6 +2100,10 @@ struct pg_log_t {
   // We can rollback rollback-able entries > can_rollback_to
   eversion_t can_rollback_to;
 
+  // always <= can_rollback_to, indicates how far stashed rollback
+  // data can be found
+  eversion_t rollback_info_trimmed_to;
+
   list<pg_log_entry_t> log;  // the actual log.
   
   pg_log_t() {}
@@ -2492,6 +2535,29 @@ struct SnapSet {
   void decode(bufferlist::iterator& bl);
   void dump(Formatter *f) const;
   static void generate_test_instances(list<SnapSet*>& o);  
+
+  SnapContext get_ssc_as_of(snapid_t as_of) const {
+    SnapContext out;
+    out.seq = as_of;
+    for (vector<snapid_t>::const_iterator i = snaps.begin();
+	 i != snaps.end();
+	 ++i) {
+      if (*i <= as_of)
+	out.snaps.push_back(*i);
+    }
+    return out;
+  }
+
+  // return min element of snaps > after, return max if no such element
+  snapid_t get_first_snap_after(snapid_t after, snapid_t max) const {
+    for (vector<snapid_t>::const_reverse_iterator i = snaps.rbegin();
+	 i != snaps.rend();
+	 ++i) {
+      if (*i > after)
+	return *i;
+    }
+    return max;
+  }
 };
 WRITE_CLASS_ENCODER(SnapSet)
 
@@ -2762,19 +2828,21 @@ public:
       }
     }
 
-    bool get_write(OpRequestRef op) {
-      if (get_write_lock()) {
+    bool get_write(OpRequestRef op, bool greedy=false) {
+      if (get_write_lock(greedy)) {
 	return true;
       } // else
       if (op)
 	waiters.push_back(op);
       return false;
     }
-    bool get_write_lock() {
-      // don't starve anybody!
-      if (!waiters.empty() ||
-	  backfill_read_marker) {
-	return false;
+    bool get_write_lock(bool greedy=false) {
+      if (!greedy) {
+	// don't starve anybody!
+	if (!waiters.empty() ||
+	    backfill_read_marker) {
+	  return false;
+	}
       }
       switch (state) {
       case RWNONE:
@@ -2823,7 +2891,10 @@ public:
     return rwstate.get_read(op);
   }
   bool get_write(OpRequestRef op) {
-    return rwstate.get_write(op);
+    return rwstate.get_write(op, false);
+  }
+  bool get_write_greedy(OpRequestRef op) {
+    return rwstate.get_write(op, true);
   }
   bool get_snaptrimmer_write() {
     if (rwstate.get_write_lock()) {
diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc
index e165266..d82b3e1 100644
--- a/src/osdc/Objecter.cc
+++ b/src/osdc/Objecter.cc
@@ -1364,6 +1364,11 @@ int Objecter::op_cancel(ceph_tid_t tid, int r)
 
   ldout(cct, 10) << __func__ << " tid " << tid << dendl;
   Op *op = p->second;
+  if (op->con) {
+    ldout(cct, 20) << " revoking rx buffer for " << tid
+		   << " on " << op->con << dendl;
+    op->con->revoke_rx_buffer(tid);
+  }
   if (op->onack) {
     op->onack->complete(r);
     op->onack = NULL;
@@ -1434,7 +1439,7 @@ int64_t Objecter::get_object_pg_hash_position(int64_t pool, const string& key,
   return p->raw_hash_to_pg(p->hash_key(key, ns));
 }
 
-int Objecter::calc_target(op_target_t *t)
+int Objecter::calc_target(op_target_t *t, bool any_change)
 {
   bool is_read = t->flags & CEPH_OSD_FLAG_READ;
   bool is_write = t->flags & CEPH_OSD_FLAG_WRITE;
@@ -1491,7 +1496,8 @@ int Objecter::calc_target(op_target_t *t)
   }
 
   if (t->pgid != pgid ||
-      is_pg_changed(t->primary, t->acting, primary, acting, t->used_replica) ||
+      is_pg_changed(
+	t->primary, t->acting, primary, acting, t->used_replica || any_change) ||
       force_resend) {
     t->pgid = pgid;
     t->acting = acting;
@@ -1570,7 +1576,7 @@ int Objecter::recalc_op_target(Op *op)
 
 bool Objecter::recalc_linger_op_target(LingerOp *linger_op)
 {
-  int r = calc_target(&linger_op->target);
+  int r = calc_target(&linger_op->target, true);
   if (r == RECALC_OP_TARGET_NEED_RESEND) {
     ldout(cct, 10) << "recalc_linger_op_target tid " << linger_op->linger_id
 		   << " pgid " << linger_op->target.pgid
diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h
index 1e6fcf3..2ede888 100644
--- a/src/osdc/Objecter.h
+++ b/src/osdc/Objecter.h
@@ -1480,7 +1480,7 @@ public:
   bool osdmap_full_flag() const;
   bool target_should_be_paused(op_target_t *op);
 
-  int calc_target(op_target_t *t);
+  int calc_target(op_target_t *t, bool any_change=false);
   int recalc_op_target(Op *op);
   bool recalc_linger_op_target(LingerOp *op);
 
diff --git a/src/pybind/rados.py b/src/pybind/rados.py
index e5da077..0fbd10e 100644
--- a/src/pybind/rados.py
+++ b/src/pybind/rados.py
@@ -1089,8 +1089,11 @@ class Ioctx(object):
         :returns: completion object
         """
         buf = create_string_buffer(length)
-        def oncomplete_(completion):
-            return oncomplete(completion, buf.value)
+        def oncomplete_(completion_v):
+            return_value = completion_v.get_return_value()
+            return oncomplete(completion_v,
+                              ctypes.string_at(buf, return_value) if return_value >= 0 else None)
+
         completion = self.__get_completion(oncomplete_, None)
         ret = run_in_thread(self.librados.rados_aio_read,
                             (self.io, c_char_p(object_name),
diff --git a/src/pybind/rbd.py b/src/pybind/rbd.py
index bf07576..ab093ce 100644
--- a/src/pybind/rbd.py
+++ b/src/pybind/rbd.py
@@ -750,6 +750,14 @@ written." % (self.name, ret, length))
         if ret < 0:
             raise make_ex(ret, 'error flushing image')
 
+    def invalidate_cache(self):
+        """
+        Drop any cached data for the image.
+        """
+        ret = self.librbd.rbd_invalidate_cache(self.image)
+        if ret < 0:
+            raise make_ex(ret, 'error invalidating cache')
+
     def stripe_unit(self):
         """
         Returns the stripe unit used for the image.
diff --git a/src/rgw/rgw_common.cc b/src/rgw/rgw_common.cc
index 58913cc..5a1043f 100644
--- a/src/rgw/rgw_common.cc
+++ b/src/rgw/rgw_common.cc
@@ -697,13 +697,15 @@ bool url_decode(string& src_str, string& dest_str)
   int pos = 0;
   char c;
 
+  bool in_query = false;
   while (*src) {
     if (*src != '%') {
-      if (*src != '+') {
-	dest[pos++] = *src++;
+      if (!in_query || *src != '+') {
+        if (*src == '?') in_query = true;
+        dest[pos++] = *src++;
       } else {
-	dest[pos++] = ' ';
-	++src;
+        dest[pos++] = ' ';
+        ++src;
       }
     } else {
       src++;
diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc
index 8979619..7694748 100644
--- a/src/rgw/rgw_op.cc
+++ b/src/rgw/rgw_op.cc
@@ -1380,7 +1380,10 @@ public:
 
 int RGWPutObjProcessor_Multipart::prepare(RGWRados *store, void *obj_ctx, string *oid_rand)
 {
-  RGWPutObjProcessor::prepare(store, obj_ctx, NULL);
+  int r = prepare_init(store, obj_ctx, NULL);
+  if (r < 0) {
+    return r;
+  }
 
   string oid = obj_str;
   upload_id = s->info.args.get("uploadId");
@@ -1419,7 +1422,7 @@ int RGWPutObjProcessor_Multipart::prepare(RGWRados *store, void *obj_ctx, string
 
   manifest.set_multipart_part_rule(store->ctx()->_conf->rgw_obj_stripe_size, num);
 
-  int r = manifest_gen.create_begin(store->ctx(), &manifest, bucket, target_obj);
+  r = manifest_gen.create_begin(store->ctx(), &manifest, bucket, target_obj);
   if (r < 0) {
     return r;
   }
@@ -1560,6 +1563,36 @@ int RGWPutObj::user_manifest_iterate_cb(rgw_bucket& bucket, RGWObjEnt& ent, RGWA
   return 0;
 }
 
+static int put_data_and_throttle(RGWPutObjProcessor *processor, bufferlist& data, off_t ofs,
+                                 MD5 *hash, bool need_to_wait)
+{
+  const unsigned char *data_ptr = (hash ? (const unsigned char *)data.c_str() : NULL);
+  bool again;
+  uint64_t len = data.length();
+
+  do {
+    void *handle;
+
+    int ret = processor->handle_data(data, ofs, &handle, &again);
+    if (ret < 0)
+      return ret;
+
+    if (hash) {
+      hash->Update(data_ptr, len);
+      hash = NULL; /* only calculate hash once */
+    }
+
+    ret = processor->throttle_data(handle, need_to_wait);
+    if (ret < 0)
+      return ret;
+
+    need_to_wait = false; /* the need to wait only applies to the first iteration */
+  } while (again);
+
+  return 0;
+}
+
+
 void RGWPutObj::execute()
 {
   RGWPutObjProcessor *processor = NULL;
@@ -1633,23 +1666,12 @@ void RGWPutObj::execute()
     if (!len)
       break;
 
-    void *handle;
-    const unsigned char *data_ptr = (const unsigned char *)data.c_str();
-
-    ret = processor->handle_data(data, ofs, &handle);
-    if (ret < 0)
-      goto done;
-
-    if (need_calc_md5) {
-      hash.Update(data_ptr, len);
-    }
-
     /* do we need this operation to be synchronous? if we're dealing with an object with immutable
      * head, e.g., multipart object we need to make sure we're the first one writing to this object
      */
     bool need_to_wait = (ofs == 0) && multipart;
 
-    ret = processor->throttle_data(handle, need_to_wait);
+    ret = put_data_and_throttle(processor, data, ofs, (need_calc_md5 ? &hash : NULL), need_to_wait);
     if (ret < 0) {
       if (!need_to_wait || ret != -EEXIST) {
         ldout(s->cct, 20) << "processor->thottle_data() returned ret=" << ret << dendl;
@@ -1674,15 +1696,8 @@ void RGWPutObj::execute()
         goto done;
       }
 
-      ret = processor->handle_data(data, ofs, &handle);
+      ret = put_data_and_throttle(processor, data, ofs, NULL, false);
       if (ret < 0) {
-        ldout(s->cct, 0) << "ERROR: processor->handle_data() returned " << ret << dendl;
-        goto done;
-      }
-
-      ret = processor->throttle_data(handle, false);
-      if (ret < 0) {
-        ldout(s->cct, 0) << "ERROR: processor->throttle_data() returned " << ret << dendl;
         goto done;
       }
     }
@@ -1846,18 +1861,7 @@ void RGWPostObj::execute()
      if (!len)
        break;
 
-     void *handle;
-     const unsigned char *data_ptr = (const unsigned char *)data.c_str();
-
-     ret = processor->handle_data(data, ofs, &handle);
-     if (ret < 0)
-       goto done;
-
-     hash.Update(data_ptr, len);
-
-     ret = processor->throttle_data(handle, false);
-     if (ret < 0)
-       goto done;
+     ret = put_data_and_throttle(processor, data, ofs, &hash, false);
 
      ofs += len;
 
diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc
index 7ca4a9d..e22bef0 100644
--- a/src/rgw/rgw_rados.cc
+++ b/src/rgw/rgw_rados.cc
@@ -900,8 +900,10 @@ int RGWPutObjProcessor_Plain::prepare(RGWRados *store, void *obj_ctx, string *oi
   return 0;
 };
 
-int RGWPutObjProcessor_Plain::handle_data(bufferlist& bl, off_t _ofs, void **phandle)
+int RGWPutObjProcessor_Plain::handle_data(bufferlist& bl, off_t _ofs, void **phandle, bool *again)
 {
+  *again = false;
+
   if (ofs != _ofs)
     return -EINVAL;
 
@@ -1026,8 +1028,10 @@ int RGWPutObjProcessor_Atomic::write_data(bufferlist& bl, off_t ofs, void **phan
   return RGWPutObjProcessor_Aio::handle_obj_data(cur_obj, bl, ofs - cur_part_ofs, ofs, phandle, exclusive);
 }
 
-int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **phandle)
+int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **phandle, bool *again)
 {
+  *again = false;
+
   *phandle = NULL;
   if (extra_data_len) {
     size_t extra_len = bl.length();
@@ -1044,13 +1048,16 @@ int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **pha
     }
   }
 
-  uint64_t max_chunk_size = store->get_max_chunk_size();
+  uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
 
   pending_data_bl.claim_append(bl);
-  if (pending_data_bl.length() < max_chunk_size)
+  if (pending_data_bl.length() < max_write_size)
     return 0;
 
-  pending_data_bl.splice(0, max_chunk_size, &bl);
+  pending_data_bl.splice(0, max_write_size, &bl);
+
+  /* have we accumulated enough pending data that still needs to be written? */
+  *again = (pending_data_bl.length() >= max_chunk_size);
 
   if (!data_ofs && !immutable_head()) {
     first_chunk.claim(bl);
@@ -1070,17 +1077,30 @@ int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **pha
   return write_data(bl, write_ofs, phandle, exclusive);
 }
 
-int RGWPutObjProcessor_Atomic::prepare(RGWRados *store, void *obj_ctx, string *oid_rand)
+
+int RGWPutObjProcessor_Atomic::prepare_init(RGWRados *store, void *obj_ctx, string *oid_rand)
 {
   RGWPutObjProcessor::prepare(store, obj_ctx, oid_rand);
 
-  head_obj.init(bucket, obj_str);
+  int r = store->get_max_chunk_size(bucket, &max_chunk_size);
+  if (r < 0) {
+    return r;
+  }
+
+  return 0;
+}
 
-  uint64_t max_chunk_size = store->get_max_chunk_size();
+int RGWPutObjProcessor_Atomic::prepare(RGWRados *store, void *obj_ctx, string *oid_rand)
+{
+  int r = prepare_init(store, obj_ctx, oid_rand);
+  if (r < 0) {
+    return r;
+  }
+  head_obj.init(bucket, obj_str);
 
   manifest.set_trivial_rule(max_chunk_size, store->ctx()->_conf->rgw_obj_stripe_size);
 
-  int r = manifest_gen.create_begin(store->ctx(), &manifest, bucket, head_obj);
+  r = manifest_gen.create_begin(store->ctx(), &manifest, bucket, head_obj);
   if (r < 0) {
     return r;
   }
@@ -1201,6 +1221,44 @@ void RGWRadosCtx::set_prefetch_data(rgw_obj& obj) {
   }
 }
 
+int RGWRados::get_required_alignment(rgw_bucket& bucket, uint64_t *alignment)
+{
+  IoCtx ioctx;
+  int r = open_bucket_data_ctx(bucket, ioctx);
+  if (r < 0) {
+    ldout(cct, 0) << "ERROR: open_bucket_data_ctx() returned " << r << dendl;
+    return r;
+  }
+
+  *alignment = ioctx.pool_required_alignment();
+  return 0;
+}
+
+int RGWRados::get_max_chunk_size(rgw_bucket& bucket, uint64_t *max_chunk_size)
+{
+  uint64_t alignment;
+  int r = get_required_alignment(bucket, &alignment);
+  if (r < 0) {
+    return r;
+  }
+
+  uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
+
+  if (alignment == 0) {
+    *max_chunk_size = config_chunk_size;
+    return 0;
+  }
+
+  if (config_chunk_size <= alignment) {
+    *max_chunk_size = alignment;
+    return 0;
+  }
+
+  *max_chunk_size = config_chunk_size - (config_chunk_size % alignment);
+
+  return 0;
+}
+
 void RGWRados::finalize()
 {
   if (need_watch_notify()) {
@@ -1236,8 +1294,6 @@ int RGWRados::init_rados()
 {
   int ret;
 
-  max_chunk_size = cct->_conf->rgw_max_chunk_size;
-
   rados = new Rados();
   if (!rados)
     return -ENOMEM;
@@ -2957,25 +3013,33 @@ public:
   int handle_data(bufferlist& bl, off_t ofs, off_t len) {
     progress_cb(ofs, progress_data);
 
-    void *handle;
-    int ret = processor->handle_data(bl, ofs, &handle);
-    if (ret < 0)
-      return ret;
+    bool again;
 
-    if (opstate) {
-      /* need to update opstate repository with new state. This is ratelimited, so we're not
-       * really doing it every time
-       */
-      ret = opstate->renew_state();
-      if (ret < 0) {
-        /* could not renew state! might have been marked as cancelled */
+    bool need_opstate = true;
+
+    do {
+      void *handle;
+      int ret = processor->handle_data(bl, ofs, &handle, &again);
+      if (ret < 0)
         return ret;
+
+      if (need_opstate && opstate) {
+        /* need to update opstate repository with new state. This is ratelimited, so we're not
+         * really doing it every time
+         */
+        ret = opstate->renew_state();
+        if (ret < 0) {
+          /* could not renew state! might have been marked as cancelled */
+          return ret;
+        }
+
+        need_opstate = false;
       }
-    }
 
-    ret = processor->throttle_data(handle, false);
-    if (ret < 0)
-      return ret;
+      ret = processor->throttle_data(handle, false);
+      if (ret < 0)
+        return ret;
+    } while (again);
 
     return 0;
   }
@@ -3192,24 +3256,6 @@ set_err_state:
 
   vector<rgw_obj> ref_objs;
 
-  bool copy_data = !astate->has_manifest;
-  bool copy_first = false;
-  if (astate->has_manifest) {
-    if (!astate->manifest.has_tail()) {
-      copy_data = true;
-    } else {
-      uint64_t head_size = astate->manifest.get_head_size();
-
-      if (head_size > 0) {
-	if (head_size > max_chunk_size)  // should never happen
-	  copy_data = true;
-	else
-          copy_first = true;
-      }
-    }
-  }
-
-
   if (remote_dest) {
     /* dest is in a different region, copy it there */
 
@@ -3230,8 +3276,35 @@ set_err_state:
       return ret;
 
     return 0;
-  } else if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
-    return copy_obj_data(ctx, dest_bucket_info.owner, &handle, end, dest_obj, src_obj, mtime, src_attrs, category, ptag, err);
+  }
+  
+  uint64_t max_chunk_size;
+
+  ret = get_max_chunk_size(dest_obj.bucket, &max_chunk_size);
+  if (ret < 0) {
+    ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj.bucket << dendl;
+    return ret;
+  }
+
+  bool copy_data = !astate->has_manifest;
+  bool copy_first = false;
+  if (astate->has_manifest) {
+    if (!astate->manifest.has_tail()) {
+      copy_data = true;
+    } else {
+      uint64_t head_size = astate->manifest.get_head_size();
+
+      if (head_size > 0) {
+	if (head_size > max_chunk_size)
+	  copy_data = true;
+	else
+          copy_first = true;
+      }
+    }
+  }
+
+  if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
+    return copy_obj_data(ctx, dest_bucket_info.owner, &handle, end, dest_obj, src_obj, max_chunk_size, mtime, src_attrs, category, ptag, err);
   }
 
   RGWObjManifest::obj_iterator miter = astate->manifest.obj_begin();
@@ -3341,6 +3414,7 @@ int RGWRados::copy_obj_data(void *ctx,
 	       void **handle, off_t end,
                rgw_obj& dest_obj,
                rgw_obj& src_obj,
+               uint64_t max_chunk_size,
 	       time_t *mtime,
                map<string, bufferlist>& attrs,
                RGWObjCategory category,
@@ -4473,6 +4547,8 @@ int RGWRados::get_obj(void *ctx, RGWObjVersionTracker *objv_tracker, void **hand
   bool merge_bl = false;
   bufferlist *pbl = &bl;
   bufferlist read_bl;
+  uint64_t max_chunk_size;
+
 
   get_obj_bucket_and_oid_key(obj, bucket, oid, key);
 
@@ -4505,6 +4581,12 @@ int RGWRados::get_obj(void *ctx, RGWObjVersionTracker *objv_tracker, void **hand
     }
   }
 
+  r = get_max_chunk_size(bucket, &max_chunk_size);
+  if (r < 0) {
+    ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << bucket << dendl;
+    goto done_ret;
+  }
+
   if (len > max_chunk_size)
     len = max_chunk_size;
 
diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h
index d50fb59..d811b49 100644
--- a/src/rgw/rgw_rados.h
+++ b/src/rgw/rgw_rados.h
@@ -548,7 +548,7 @@ public:
     obj_ctx = _o;
     return 0;
   };
-  virtual int handle_data(bufferlist& bl, off_t ofs, void **phandle) = 0;
+  virtual int handle_data(bufferlist& bl, off_t ofs, void **phandle, bool *again) = 0;
   virtual int throttle_data(void *handle, bool need_to_wait) = 0;
   virtual int complete(string& etag, time_t *mtime, time_t set_mtime, map<string, bufferlist>& attrs);
 };
@@ -564,7 +564,7 @@ class RGWPutObjProcessor_Plain : public RGWPutObjProcessor
 
 protected:
   int prepare(RGWRados *store, void *obj_ctx, string *oid_rand);
-  int handle_data(bufferlist& bl, off_t ofs, void **phandle);
+  int handle_data(bufferlist& bl, off_t ofs, void **phandle, bool *again);
   int do_complete(string& etag, time_t *mtime, time_t set_mtime, map<string, bufferlist>& attrs);
 
 public:
@@ -613,6 +613,8 @@ class RGWPutObjProcessor_Atomic : public RGWPutObjProcessor_Aio
   uint64_t extra_data_len;
   bufferlist extra_data_bl;
   bufferlist pending_data_bl;
+  uint64_t max_chunk_size;
+
 protected:
   rgw_bucket bucket;
   string obj_str;
@@ -631,6 +633,8 @@ protected:
   int complete_parts();
   int complete_writing_data();
 
+  int prepare_init(RGWRados *store, void *obj_ctx, string *oid_rand);
+
 public:
   ~RGWPutObjProcessor_Atomic() {}
   RGWPutObjProcessor_Atomic(const string& bucket_owner, rgw_bucket& _b, const string& _o, uint64_t _p, const string& _t) :
@@ -641,6 +645,7 @@ public:
                                 cur_part_id(0),
                                 data_ofs(0),
                                 extra_data_len(0),
+                                max_chunk_size(0),
                                 bucket(_b),
                                 obj_str(_o),
                                 unique_tag(_t) {}
@@ -649,7 +654,7 @@ public:
   void set_extra_data_len(uint64_t len) {
     extra_data_len = len;
   }
-  virtual int handle_data(bufferlist& bl, off_t ofs, void **phandle);
+  virtual int handle_data(bufferlist& bl, off_t ofs, void **phandle, bool *again);
   bufferlist& get_extra_data() { return extra_data_bl; }
 };
 
@@ -1221,8 +1226,6 @@ class RGWRados
   int get_obj_ref(const rgw_obj& obj, rgw_rados_ref *ref, rgw_bucket *bucket, bool ref_system_obj = false);
   uint64_t max_bucket_id;
 
-  uint64_t max_chunk_size;
-
   int get_obj_state(RGWRadosCtx *rctx, rgw_obj& obj, RGWObjState **state, RGWObjVersionTracker *objv_tracker);
   int append_atomic_test(RGWRadosCtx *rctx, rgw_obj& obj,
                          librados::ObjectOperation& op, RGWObjState **state);
@@ -1287,7 +1290,6 @@ public:
                num_watchers(0), watchers(NULL), watch_handles(NULL),
                watch_initialized(false),
                bucket_id_lock("rados_bucket_id"), max_bucket_id(0),
-               max_chunk_size(0),
                cct(NULL), rados(NULL),
                pools_initialized(false),
                quota_handler(NULL),
@@ -1325,9 +1327,8 @@ public:
     }
   }
 
-  uint64_t get_max_chunk_size() {
-    return max_chunk_size;
-  }
+  int get_required_alignment(rgw_bucket& bucket, uint64_t *alignment);
+  int get_max_chunk_size(rgw_bucket& bucket, uint64_t *max_chunk_size);
 
   int list_raw_objects(rgw_bucket& pool, const string& prefix_filter, int max,
                        RGWListRawObjsCtx& ctx, list<string>& oids,
@@ -1563,6 +1564,7 @@ public:
 	       void **handle, off_t end,
                rgw_obj& dest_obj,
                rgw_obj& src_obj,
+               uint64_t max_chunk_size,
 	       time_t *mtime,
                map<string, bufferlist>& attrs,
                RGWObjCategory category,
diff --git a/src/rgw/rgw_rest.cc b/src/rgw/rgw_rest.cc
index 69948a6..b74002d 100644
--- a/src/rgw/rgw_rest.cc
+++ b/src/rgw/rgw_rest.cc
@@ -180,7 +180,7 @@ void rgw_flush_formatter_and_reset(struct req_state *s, Formatter *formatter)
   std::ostringstream oss;
   formatter->flush(oss);
   std::string outs(oss.str());
-  if (!outs.empty()) {
+  if (!outs.empty() && s->op != OP_HEAD) {
     s->cio->write(outs.c_str(), outs.size());
   }
 
@@ -192,7 +192,7 @@ void rgw_flush_formatter(struct req_state *s, Formatter *formatter)
   std::ostringstream oss;
   formatter->flush(oss);
   std::string outs(oss.str());
-  if (!outs.empty()) {
+  if (!outs.empty() && s->op != OP_HEAD) {
     s->cio->write(outs.c_str(), outs.size());
   }
 }
diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc
index 507a7ff..b562079 100644
--- a/src/rgw/rgw_rest_swift.cc
+++ b/src/rgw/rgw_rest_swift.cc
@@ -627,18 +627,16 @@ void RGWOptionsCORS_ObjStore_SWIFT::send_response()
   uint32_t max_age = CORS_MAX_AGE_INVALID;
   /*EACCES means, there is no CORS registered yet for the bucket
    *ENOENT means, there is no match of the Origin in the list of CORSRule
-   *ENOTSUPP means, the HTTP_METHOD is not supported
    */
   if (ret == -ENOENT)
     ret = -EACCES;
-  if (ret != -EACCES) {
-    get_response_params(hdrs, exp_hdrs, &max_age);
-  } else {
+  if (ret < 0) {
     set_req_state_err(s, ret);
     dump_errno(s);
     end_header(s, NULL);
     return;
   }
+  get_response_params(hdrs, exp_hdrs, &max_age);
   dump_errno(s);
   dump_access_control(s, origin, req_meth, hdrs.c_str(), exp_hdrs.c_str(), max_age); 
   end_header(s, NULL);
diff --git a/src/test/crush/TestCrushWrapper.cc b/src/test/crush/TestCrushWrapper.cc
index d70a525..34d6401 100644
--- a/src/test/crush/TestCrushWrapper.cc
+++ b/src/test/crush/TestCrushWrapper.cc
@@ -538,6 +538,11 @@ TEST(CrushWrapper, dump_rules) {
 	      ss.str().find("<item_name>default</item_name></step>"));
   }
 
+  map<int,float> wm;
+  c->get_rule_weight_osd_map(0, &wm);
+  ASSERT_TRUE(wm.size() == 1);
+  ASSERT_TRUE(wm[0] == 1.0);
+
   delete c;
 }
 
diff --git a/src/test/erasure-code/TestErasureCodeJerasure.cc b/src/test/erasure-code/TestErasureCodeJerasure.cc
index 4b768a8..5c637da 100644
--- a/src/test/erasure-code/TestErasureCodeJerasure.cc
+++ b/src/test/erasure-code/TestErasureCodeJerasure.cc
@@ -288,6 +288,36 @@ TEST(ErasureCodeTest, create_ruleset)
     }
   }
 
+  //
+  // The ruleid may be different from the ruleset when a crush rule is
+  // removed because the removed ruleid will be reused but the removed
+  // ruleset will not be reused. 
+  //
+  // This also asserts that the create_ruleset() method returns a
+  // ruleset and not a ruleid http://tracker.ceph.com/issues/9044
+  //
+  {
+    stringstream ss;
+    ErasureCodeJerasureReedSolomonVandermonde jerasure;
+    map<std::string,std::string> parameters;
+    parameters["k"] = "2";
+    parameters["m"] = "2";
+    parameters["w"] = "8";
+    jerasure.init(parameters);
+    int FIRST = jerasure.create_ruleset("FIRST", *c, &ss);
+    int SECOND = jerasure.create_ruleset("SECOND", *c, &ss);
+    int FIRST_ruleid = c->get_rule_id("FIRST");
+    EXPECT_EQ(0, c->remove_rule(FIRST_ruleid));
+    int ruleset = jerasure.create_ruleset("myrule", *c, &ss);
+    EXPECT_NE(FIRST, ruleset);
+    EXPECT_NE(SECOND, ruleset);
+    EXPECT_NE(ruleset, c->get_rule_id("myrule"));
+    int SECOND_ruleid = c->get_rule_id("SECOND");
+    EXPECT_EQ(0, c->remove_rule(SECOND_ruleid));
+    int myrule_ruleid = c->get_rule_id("myrule");
+    EXPECT_EQ(0, c->remove_rule(myrule_ruleid));
+  }
+
   {
     stringstream ss;
     ErasureCodeJerasureReedSolomonVandermonde jerasure;
diff --git a/src/test/librados/TestCase.cc b/src/test/librados/TestCase.cc
index 9f68af1..7f072fd 100644
--- a/src/test/librados/TestCase.cc
+++ b/src/test/librados/TestCase.cc
@@ -8,6 +8,7 @@
 using namespace librados;
 
 std::string RadosTest::pool_name;
+std::string RadosTest::nspace;
 rados_t RadosTest::s_cluster = NULL;
 
 void RadosTest::SetUpTestCase()
@@ -25,7 +26,7 @@ void RadosTest::SetUp()
 {
   cluster = RadosTest::s_cluster;
   ASSERT_EQ(0, rados_ioctx_create(cluster, pool_name.c_str(), &ioctx));
-  std::string nspace = get_temp_pool_name();
+  nspace = get_temp_pool_name();
   rados_ioctx_set_namespace(ioctx, nspace.c_str());
   ASSERT_FALSE(rados_ioctx_pool_requires_alignment(ioctx));
 }
@@ -206,24 +207,6 @@ void RadosTestEC::TearDown()
   rados_ioctx_destroy(ioctx);
 }
 
-void RadosTestEC::cleanup_default_namespace(rados_ioctx_t ioctx)
-{
-  // remove all objects from the default namespace to avoid polluting
-  // other tests
-  rados_ioctx_set_namespace(ioctx, "");
-  rados_list_ctx_t list_ctx;
-  ASSERT_EQ(0, rados_objects_list_open(ioctx, &list_ctx));
-  int r;
-  const char *entry = NULL;
-  const char *key = NULL;
-  while ((r = rados_objects_list_next(list_ctx, &entry, &key)) != -ENOENT) {
-    ASSERT_EQ(0, r);
-    rados_ioctx_locator_set_key(ioctx, key);
-    ASSERT_EQ(0, rados_remove(ioctx, entry));
-  }
-  rados_objects_list_close(list_ctx);
-}
-
 std::string RadosTestECPP::pool_name;
 Rados RadosTestECPP::s_cluster;
 
@@ -254,14 +237,3 @@ void RadosTestECPP::TearDown()
   ioctx.close();
 }
 
-void RadosTestECPP::cleanup_default_namespace(librados::IoCtx ioctx)
-{
-  // remove all objects from the default namespace to avoid polluting
-  // other tests
-  ioctx.set_namespace("");
-  for (ObjectIterator it = ioctx.objects_begin();
-       it != ioctx.objects_end(); ++it) {
-    ioctx.locator_set_key(it->second);
-    ASSERT_EQ(0, ioctx.remove(it->first));
-  }
-}
diff --git a/src/test/librados/TestCase.h b/src/test/librados/TestCase.h
index 5bd084f..4ede5e9 100644
--- a/src/test/librados/TestCase.h
+++ b/src/test/librados/TestCase.h
@@ -28,6 +28,7 @@ protected:
   static void cleanup_default_namespace(rados_ioctx_t ioctx);
   static rados_t s_cluster;
   static std::string pool_name;
+  static std::string nspace;
 
   virtual void SetUp();
   virtual void TearDown();
@@ -72,14 +73,13 @@ protected:
   std::string ns;
 };
 
-class RadosTestEC : public ::testing::Test {
+class RadosTestEC : public RadosTest {
 public:
   RadosTestEC() {}
   virtual ~RadosTestEC() {}
 protected:
   static void SetUpTestCase();
   static void TearDownTestCase();
-  static void cleanup_default_namespace(rados_ioctx_t ioctx);
   static rados_t s_cluster;
   static std::string pool_name;
 
@@ -90,14 +90,13 @@ protected:
   uint64_t alignment;
 };
 
-class RadosTestECPP : public ::testing::Test {
+class RadosTestECPP : public RadosTestPP {
 public:
   RadosTestECPP() : cluster(s_cluster) {};
   virtual ~RadosTestECPP() {};
 protected:
   static void SetUpTestCase();
   static void TearDownTestCase();
-  static void cleanup_default_namespace(librados::IoCtx ioctx);
   static librados::Rados s_cluster;
   static std::string pool_name;
 
diff --git a/src/test/librados/io.cc b/src/test/librados/io.cc
index 5daca3c..0bb805f 100644
--- a/src/test/librados/io.cc
+++ b/src/test/librados/io.cc
@@ -25,6 +25,58 @@ TEST_F(LibRadosIo, SimpleWrite) {
   ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
 }
 
+TEST_F(LibRadosIo, ReadTimeout) {
+  char buf[128];
+  memset(buf, 'a', sizeof(buf));
+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+
+  {
+    // set up a second client
+    rados_t cluster;
+    rados_ioctx_t ioctx;
+    rados_create(&cluster, "admin");
+    rados_conf_read_file(cluster, NULL);
+    rados_conf_parse_env(cluster, NULL);
+    rados_conf_set(cluster, "rados_osd_op_timeout", "0.00001"); // use any small value that will result in a timeout
+    rados_connect(cluster);
+    rados_ioctx_create(cluster, pool_name.c_str(), &ioctx);
+    rados_ioctx_set_namespace(ioctx, nspace.c_str());
+
+    // then we show that the buffer is changed after rados_read returned
+    // with a timeout
+    for (int i=0; i<5; i++) {
+      char buf2[sizeof(buf)];
+      memset(buf2, 0, sizeof(buf2));
+      int err = rados_read(ioctx, "foo", buf2, sizeof(buf2), 0);
+      if (err == -110) {
+	int startIndex = 0;
+	// find the index until which librados already read the object before the timeout occurred
+	for (unsigned b=0; b<sizeof(buf); b++) {
+	  if (buf2[b] != buf[b]) {
+	    startIndex = b;
+	    break;
+	  }
+	}
+
+	// wait some time to give librados a chance to do something
+	sleep(1);
+
+	// then check if the buffer was changed after the call
+	if (buf2[startIndex] == 'a') {
+	  printf("byte at index %d was changed after the timeout to %d\n",
+		 startIndex, (int)buf[startIndex]);
+	  ASSERT_TRUE(0);
+	  break;
+	}
+      } else {
+	printf("no timeout :/\n");
+      }
+    }
+    rados_ioctx_destroy(ioctx);
+    rados_shutdown(cluster);
+  }
+}
+
 TEST_F(LibRadosIoPP, SimpleWritePP) {
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
diff --git a/src/test/librados/tier.cc b/src/test/librados/tier.cc
index 611e17e..4267389 100644
--- a/src/test/librados/tier.cc
+++ b/src/test/librados/tier.cc
@@ -34,6 +34,38 @@ using std::string;
 typedef RadosTestPP LibRadosTierPP;
 typedef RadosTestECPP LibRadosTierECPP;
 
+void flush_evict_all(librados::Rados& cluster, librados::IoCtx& cache_ioctx)
+{
+  bufferlist inbl;
+  cache_ioctx.set_namespace("");
+  for (ObjectIterator it = cache_ioctx.objects_begin();
+       it != cache_ioctx.objects_end(); ++it) {
+    cache_ioctx.locator_set_key(it->second);
+    {
+      ObjectReadOperation op;
+      op.cache_flush();
+      librados::AioCompletion *completion = cluster.aio_create_completion();
+      cache_ioctx.aio_operate(
+        it->first, completion, &op,
+	librados::OPERATION_IGNORE_OVERLAY, NULL);
+      completion->wait_for_safe();
+      completion->get_return_value();
+      completion->release();
+    }
+    {
+      ObjectReadOperation op;
+      op.cache_evict();
+      librados::AioCompletion *completion = cluster.aio_create_completion();
+      cache_ioctx.aio_operate(
+        it->first, completion, &op,
+	librados::OPERATION_IGNORE_OVERLAY, NULL);
+      completion->wait_for_safe();
+      completion->get_return_value();
+      completion->release();
+    }
+  }
+}
+
 class LibRadosTwoPoolsPP : public RadosTestPP
 {
 public:
@@ -59,7 +91,26 @@ protected:
   }
   virtual void TearDown() {
     RadosTestPP::TearDown();
+
+    // flush + evict cache
+    flush_evict_all(cluster, cache_ioctx);
+
+    bufferlist inbl;
+    // tear down tiers
+    ASSERT_EQ(0, cluster.mon_command(
+      "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+      "\"}",
+      inbl, NULL, NULL));
+    ASSERT_EQ(0, cluster.mon_command(
+      "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+      "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+    inbl, NULL, NULL));
+
+    // wait for maps to settle before next test
+    cluster.wait_for_latest_osdmap();
+
     cleanup_default_namespace(cache_ioctx);
+
     cache_ioctx.close();
   }
   librados::IoCtx cache_ioctx;
@@ -180,19 +231,6 @@ TEST_F(LibRadosTwoPoolsPP, Overlay) {
     completion->release();
     ASSERT_EQ('b', bl[0]);
   }
-
-  // tear down tiers
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-    "\"}",
-    inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-    inbl, NULL, NULL));
-
-  // wait for maps to settle before next test
-  cluster.wait_for_latest_osdmap();
 }
 
 TEST_F(LibRadosTwoPoolsPP, Promote) {
@@ -247,19 +285,6 @@ TEST_F(LibRadosTwoPoolsPP, Promote) {
     ++it;
     ASSERT_TRUE(it == cache_ioctx.objects_end());
   }
-
-  // tear down tiers
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-    "\"}",
-    inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-    inbl, NULL, NULL));
-
-  // wait for maps to settle before next test
-  cluster.wait_for_latest_osdmap();
 }
 
 TEST_F(LibRadosTwoPoolsPP, PromoteSnap) {
@@ -400,19 +425,6 @@ TEST_F(LibRadosTwoPoolsPP, PromoteSnap) {
     bufferlist bl;
     ASSERT_EQ(-ENOENT, ioctx.read("baz", bl, 1, 0));
   }
-
-  // tear down tiers
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-    "\"}",
-    inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-    inbl, NULL, NULL));
-
-  // wait for maps to settle before next test
-  cluster.wait_for_latest_osdmap();
 }
 
 TEST_F(LibRadosTwoPoolsPP, PromoteSnapScrub) {
@@ -509,19 +521,6 @@ TEST_F(LibRadosTwoPoolsPP, PromoteSnapScrub) {
   }
 
   ioctx.snap_set_read(librados::SNAP_HEAD);
-
-  // tear down tiers
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-    "\"}",
-    inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-    inbl, NULL, NULL));
-
-  // wait for maps to settle before next test
-  cluster.wait_for_latest_osdmap();
 }
 
 
@@ -577,19 +576,6 @@ TEST_F(LibRadosTwoPoolsPP, PromoteSnapTrimRace) {
     bufferlist bl;
     ASSERT_EQ(-ENOENT, ioctx.read("foo", bl, 1, 0));
   }
-
-  // tear down tiers
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-    "\"}",
-    inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-    inbl, NULL, NULL));
-
-  // wait for maps to settle before next test
-  cluster.wait_for_latest_osdmap();
 }
 
 TEST_F(LibRadosTwoPoolsPP, Whiteout) {
@@ -653,19 +639,6 @@ TEST_F(LibRadosTwoPoolsPP, Whiteout) {
     ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
     ASSERT_EQ('h', bl[0]);
   }
-
-  // tear down tiers
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-    "\"}",
-    inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-    inbl, NULL, NULL));
-
-  // wait for maps to settle before next test
-  cluster.wait_for_latest_osdmap();
 }
 
 TEST_F(LibRadosTwoPoolsPP, Evict) {
@@ -756,19 +729,6 @@ TEST_F(LibRadosTwoPoolsPP, Evict) {
     ASSERT_EQ(-EBUSY, completion->get_return_value());
     completion->release();
   }
-
-  // tear down tiers
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-    "\"}",
-    inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-    inbl, NULL, NULL));
-
-  // wait for maps to settle before next test
-  cluster.wait_for_latest_osdmap();
 }
 
 TEST_F(LibRadosTwoPoolsPP, EvictSnap) {
@@ -1004,19 +964,6 @@ TEST_F(LibRadosTwoPoolsPP, EvictSnap) {
     ASSERT_EQ(0, completion->get_return_value());
     completion->release();
   }
-
-  // tear down tiers
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-    "\"}",
-    inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-    inbl, NULL, NULL));
-
-  // wait for maps to settle before next test
-  cluster.wait_for_latest_osdmap();
 }
 
 TEST_F(LibRadosTwoPoolsPP, TryFlush) {
@@ -1125,19 +1072,6 @@ TEST_F(LibRadosTwoPoolsPP, TryFlush) {
     ObjectIterator it = cache_ioctx.objects_begin();
     ASSERT_TRUE(it == cache_ioctx.objects_end());
   }
-
-  // tear down tiers
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-    "\"}",
-    inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-    inbl, NULL, NULL));
-
-  // wait for maps to settle before next test
-  cluster.wait_for_latest_osdmap();
 }
 
 TEST_F(LibRadosTwoPoolsPP, Flush) {
@@ -1298,19 +1232,6 @@ TEST_F(LibRadosTwoPoolsPP, Flush) {
     ObjectIterator it = ioctx.objects_begin();
     ASSERT_TRUE(it == ioctx.objects_end());
   }
-
-  // tear down tiers
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-    "\"}",
-    inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-    inbl, NULL, NULL));
-
-  // wait for maps to settle before next test
-  cluster.wait_for_latest_osdmap();
 }
 
 TEST_F(LibRadosTwoPoolsPP, FlushSnap) {
@@ -1470,18 +1391,11 @@ TEST_F(LibRadosTwoPoolsPP, FlushSnap) {
     ASSERT_EQ('a', bl[0]);
   }
 
-  // tear down tiers
+  // remove overlay
   ASSERT_EQ(0, cluster.mon_command(
     "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
     "\"}",
     inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-    inbl, NULL, NULL));
-
-  // wait for maps to settle
-  cluster.wait_for_latest_osdmap();
 
   // verify i can read the snaps from the base pool
   ioctx.snap_set_read(librados::SNAP_HEAD);
@@ -1502,6 +1416,11 @@ TEST_F(LibRadosTwoPoolsPP, FlushSnap) {
     ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
     ASSERT_EQ('a', bl[0]);
   }
+
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier set-overlay\", \"pool\": \"" + pool_name +
+    "\", \"overlaypool\": \"" + cache_pool_name + "\"}",
+    inbl, NULL, NULL));
 }
 
 TEST_F(LibRadosTierPP, FlushWriteRaces) {
@@ -1786,19 +1705,6 @@ TEST_F(LibRadosTwoPoolsPP, FlushTryFlushRaces) {
     completion->release();
     completion2->release();
   }
-
-  // tear down tiers
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-    "\"}",
-    inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-    inbl, NULL, NULL));
-
-  // wait for maps to settle before next test
-  cluster.wait_for_latest_osdmap();
 }
 
 
@@ -1895,19 +1801,6 @@ TEST_F(LibRadosTwoPoolsPP, TryFlushReadRace) {
   while (num_reads > 0)
     cond.Wait(test_lock);
   test_lock.Unlock();
-
-  // tear down tiers
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-    "\"}",
-    inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-    inbl, NULL, NULL));
-
-  // wait for maps to settle before next test
-  cluster.wait_for_latest_osdmap();
 }
 
 TEST_F(LibRadosTierPP, HitSetNone) {
@@ -1944,21 +1837,28 @@ string set_pool_str(string pool, string var, int val)
     + stringify(val) + string("\"}");
 }
 
-TEST_F(LibRadosTierPP, HitSetRead) {
-  // enable hitset tracking for this pool
+TEST_F(LibRadosTwoPoolsPP, HitSetRead) {
+  // make it a tier
   bufferlist inbl;
-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", 2),
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
+    "\", \"tierpool\": \"" + cache_pool_name +
+    "\", \"force_nonempty\": \"--force-nonempty\" }",
+    inbl, NULL, NULL));
+
+  // enable hitset tracking for this pool
+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", 2),
 						inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", 600),
+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", 600),
 						inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type",
+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type",
 						"explicit_object"),
 				   inbl, NULL, NULL));
 
   // wait for maps to settle
   cluster.wait_for_latest_osdmap();
 
-  ioctx.set_namespace("");
+  cache_ioctx.set_namespace("");
 
   // keep reading until we see our object appear in the HitSet
   utime_t start = ceph_clock_now(NULL);
@@ -1969,16 +1869,16 @@ TEST_F(LibRadosTierPP, HitSetRead) {
     ASSERT_TRUE(now < hard_stop);
 
     string name = "foo";
-    uint32_t hash = ioctx.get_object_hash_position(name);
+    uint32_t hash = cache_ioctx.get_object_hash_position(name);
     hobject_t oid(sobject_t(name, CEPH_NOSNAP), "", hash,
-		  cluster.pool_lookup(pool_name.c_str()), "");
+		  cluster.pool_lookup(cache_pool_name.c_str()), "");
 
     bufferlist bl;
-    ASSERT_EQ(-ENOENT, ioctx.read("foo", bl, 1, 0));
+    ASSERT_EQ(-ENOENT, cache_ioctx.read("foo", bl, 1, 0));
 
     bufferlist hbl;
     AioCompletion *c = librados::Rados::aio_create_completion();
-    ASSERT_EQ(0, ioctx.hit_set_get(hash, c, now.sec(), &hbl));
+    ASSERT_EQ(0, cache_ioctx.hit_set_get(hash, c, now.sec(), &hbl));
     c->wait_for_complete();
     c->release();
 
@@ -2028,30 +1928,39 @@ static int _get_pg_num(Rados& cluster, string pool_name)
 }
 
 
-TEST_F(LibRadosTierPP, HitSetWrite) {
+TEST_F(LibRadosTwoPoolsPP, HitSetWrite) {
   int num_pg = _get_pg_num(cluster, pool_name);
   assert(num_pg > 0);
 
-  // enable hitset tracking for this pool
+  // make it a tier
   bufferlist inbl;
-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", 8),
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
+    "\", \"tierpool\": \"" + cache_pool_name +
+    "\", \"force_nonempty\": \"--force-nonempty\" }",
+    inbl, NULL, NULL));
+
+  // enable hitset tracking for this pool
+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", 8),
 						inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", 600),
+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", 600),
 						inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type",
+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type",
 						"explicit_hash"),
 				   inbl, NULL, NULL));
 
   // wait for maps to settle
   cluster.wait_for_latest_osdmap();
 
-  ioctx.set_namespace("");
+  cache_ioctx.set_namespace("");
+
+  int num = 200;
 
   // do a bunch of writes
-  for (int i=0; i<1000; ++i) {
+  for (int i=0; i<num; ++i) {
     bufferlist bl;
     bl.append("a");
-    ASSERT_EQ(0, ioctx.write(stringify(i), bl, 1, 0));
+    ASSERT_EQ(0, cache_ioctx.write(stringify(i), bl, 1, 0));
   }
 
   // get HitSets
@@ -2059,7 +1968,7 @@ TEST_F(LibRadosTierPP, HitSetWrite) {
   for (int i=0; i<num_pg; ++i) {
     list< pair<time_t,time_t> > ls;
     AioCompletion *c = librados::Rados::aio_create_completion();
-    ASSERT_EQ(0, ioctx.hit_set_list(i, c, &ls));
+    ASSERT_EQ(0, cache_ioctx.hit_set_list(i, c, &ls));
     c->wait_for_complete();
     c->release();
     std::cout << "pg " << i << " ls " << ls << std::endl;
@@ -2068,7 +1977,7 @@ TEST_F(LibRadosTierPP, HitSetWrite) {
     // get the latest
     c = librados::Rados::aio_create_completion();
     bufferlist bl;
-    ASSERT_EQ(0, ioctx.hit_set_get(i, c, ls.back().first, &bl));
+    ASSERT_EQ(0, cache_ioctx.hit_set_get(i, c, ls.back().first, &bl));
     c->wait_for_complete();
     c->release();
 
@@ -2081,14 +1990,14 @@ TEST_F(LibRadosTierPP, HitSetWrite) {
 
     // cope with racing splits by refreshing pg_num
     if (i == num_pg - 1)
-      num_pg = _get_pg_num(cluster, pool_name);
+      num_pg = _get_pg_num(cluster, cache_pool_name);
   }
 
-  for (int i=0; i<1000; ++i) {
+  for (int i=0; i<num; ++i) {
     string n = stringify(i);
-    uint32_t hash = ioctx.get_object_hash_position(n);
+    uint32_t hash = cache_ioctx.get_object_hash_position(n);
     hobject_t oid(sobject_t(n, CEPH_NOSNAP), "", hash,
-		  cluster.pool_lookup(pool_name.c_str()), "");
+		  cluster.pool_lookup(cache_pool_name.c_str()), "");
     std::cout << "checking for " << oid << std::endl;
     bool found = false;
     for (int p=0; p<num_pg; ++p) {
@@ -2101,25 +2010,32 @@ TEST_F(LibRadosTierPP, HitSetWrite) {
   }
 }
 
-TEST_F(LibRadosTierPP, HitSetTrim) {
+TEST_F(LibRadosTwoPoolsPP, HitSetTrim) {
   unsigned count = 3;
   unsigned period = 3;
 
-  // enable hitset tracking for this pool
+  // make it a tier
   bufferlist inbl;
-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", count),
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
+    "\", \"tierpool\": \"" + cache_pool_name +
+    "\", \"force_nonempty\": \"--force-nonempty\" }",
+    inbl, NULL, NULL));
+
+  // enable hitset tracking for this pool
+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", count),
 						inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", period),
+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", period),
 						inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type", "bloom"),
+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type", "bloom"),
 				   inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_fpp", ".01"),
+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_fpp", ".01"),
 				   inbl, NULL, NULL));
 
   // wait for maps to settle
   cluster.wait_for_latest_osdmap();
 
-  ioctx.set_namespace("");
+  cache_ioctx.set_namespace("");
 
   // do a bunch of writes and make sure the hitsets rotate
   utime_t start = ceph_clock_now(NULL);
@@ -2128,16 +2044,16 @@ TEST_F(LibRadosTierPP, HitSetTrim) {
   time_t first = 0;
   while (true) {
     string name = "foo";
-    uint32_t hash = ioctx.get_object_hash_position(name);
+    uint32_t hash = cache_ioctx.get_object_hash_position(name);
     hobject_t oid(sobject_t(name, CEPH_NOSNAP), "", hash, -1, "");
 
     bufferlist bl;
     bl.append("f");
-    ASSERT_EQ(0, ioctx.write("foo", bl, 1, 0));
+    ASSERT_EQ(0, cache_ioctx.write("foo", bl, 1, 0));
 
     list<pair<time_t, time_t> > ls;
     AioCompletion *c = librados::Rados::aio_create_completion();
-    ASSERT_EQ(0, ioctx.hit_set_list(hash, c, &ls));
+    ASSERT_EQ(0, cache_ioctx.hit_set_list(hash, c, &ls));
     c->wait_for_complete();
     c->release();
 
@@ -2187,9 +2103,29 @@ protected:
   }
   virtual void TearDown() {
     RadosTestECPP::TearDown();
+
+    // flush + evict cache
+    flush_evict_all(cluster, cache_ioctx);
+
+    bufferlist inbl;
+    // tear down tiers
+    ASSERT_EQ(0, cluster.mon_command(
+      "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+      "\"}",
+      inbl, NULL, NULL));
+    ASSERT_EQ(0, cluster.mon_command(
+      "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+      "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+    inbl, NULL, NULL));
+
+    // wait for maps to settle before next test
+    cluster.wait_for_latest_osdmap();
+
     cleanup_default_namespace(cache_ioctx);
+
     cache_ioctx.close();
   }
+
   librados::IoCtx cache_ioctx;
 };
 
@@ -2308,19 +2244,6 @@ TEST_F(LibRadosTwoPoolsECPP, Overlay) {
     completion->release();
     ASSERT_EQ('b', bl[0]);
   }
-
-  // tear down tiers
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-    "\"}",
-    inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-    inbl, NULL, NULL));
-
-  // wait for maps to settle before next test
-  cluster.wait_for_latest_osdmap();
 }
 
 TEST_F(LibRadosTwoPoolsECPP, Promote) {
@@ -2375,19 +2298,6 @@ TEST_F(LibRadosTwoPoolsECPP, Promote) {
     ++it;
     ASSERT_TRUE(it == cache_ioctx.objects_end());
   }
-
-  // tear down tiers
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-    "\"}",
-    inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-    inbl, NULL, NULL));
-
-  // wait for maps to settle before next test
-  cluster.wait_for_latest_osdmap();
 }
 
 TEST_F(LibRadosTwoPoolsECPP, PromoteSnap) {
@@ -2552,19 +2462,6 @@ TEST_F(LibRadosTwoPoolsECPP, PromoteSnap) {
     bufferlist bl;
     ASSERT_EQ(-ENOENT, ioctx.read("baz", bl, 1, 0));
   }
-
-  // tear down tiers
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-    "\"}",
-    inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-    inbl, NULL, NULL));
-
-  // wait for maps to settle before next test
-  cluster.wait_for_latest_osdmap();
 }
 
 TEST_F(LibRadosTwoPoolsECPP, PromoteSnapTrimRace) {
@@ -2619,19 +2516,6 @@ TEST_F(LibRadosTwoPoolsECPP, PromoteSnapTrimRace) {
     bufferlist bl;
     ASSERT_EQ(-ENOENT, ioctx.read("foo", bl, 1, 0));
   }
-
-  // tear down tiers
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-    "\"}",
-    inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-    inbl, NULL, NULL));
-
-  // wait for maps to settle before next test
-  cluster.wait_for_latest_osdmap();
 }
 
 TEST_F(LibRadosTwoPoolsECPP, Whiteout) {
@@ -2695,19 +2579,6 @@ TEST_F(LibRadosTwoPoolsECPP, Whiteout) {
     ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
     ASSERT_EQ('h', bl[0]);
   }
-
-  // tear down tiers
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-    "\"}",
-    inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-    inbl, NULL, NULL));
-
-  // wait for maps to settle before next test
-  cluster.wait_for_latest_osdmap();
 }
 
 TEST_F(LibRadosTwoPoolsECPP, Evict) {
@@ -2798,19 +2669,6 @@ TEST_F(LibRadosTwoPoolsECPP, Evict) {
     ASSERT_EQ(-EBUSY, completion->get_return_value());
     completion->release();
   }
-
-  // tear down tiers
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-    "\"}",
-    inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-    inbl, NULL, NULL));
-
-  // wait for maps to settle before next test
-  cluster.wait_for_latest_osdmap();
 }
 
 TEST_F(LibRadosTwoPoolsECPP, EvictSnap) {
@@ -3046,19 +2904,6 @@ TEST_F(LibRadosTwoPoolsECPP, EvictSnap) {
     ASSERT_EQ(0, completion->get_return_value());
     completion->release();
   }
-
-  // tear down tiers
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-    "\"}",
-    inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-    inbl, NULL, NULL));
-
-  // wait for maps to settle before next test
-  cluster.wait_for_latest_osdmap();
 }
 
 TEST_F(LibRadosTwoPoolsECPP, TryFlush) {
@@ -3167,19 +3012,6 @@ TEST_F(LibRadosTwoPoolsECPP, TryFlush) {
     ObjectIterator it = cache_ioctx.objects_begin();
     ASSERT_TRUE(it == cache_ioctx.objects_end());
   }
-
-  // tear down tiers
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-    "\"}",
-    inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-    inbl, NULL, NULL));
-
-  // wait for maps to settle before next test
-  cluster.wait_for_latest_osdmap();
 }
 
 TEST_F(LibRadosTwoPoolsECPP, Flush) {
@@ -3340,19 +3172,6 @@ TEST_F(LibRadosTwoPoolsECPP, Flush) {
     ObjectIterator it = ioctx.objects_begin();
     ASSERT_TRUE(it == ioctx.objects_end());
   }
-
-  // tear down tiers
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-    "\"}",
-    inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-    inbl, NULL, NULL));
-
-  // wait for maps to settle before next test
-  cluster.wait_for_latest_osdmap();
 }
 
 TEST_F(LibRadosTwoPoolsECPP, FlushSnap) {
@@ -3517,10 +3336,6 @@ TEST_F(LibRadosTwoPoolsECPP, FlushSnap) {
     "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
     "\"}",
     inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-    inbl, NULL, NULL));
 
   // wait for maps to settle
   cluster.wait_for_latest_osdmap();
@@ -3544,6 +3359,11 @@ TEST_F(LibRadosTwoPoolsECPP, FlushSnap) {
     ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
     ASSERT_EQ('a', bl[0]);
   }
+
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier set-overlay\", \"pool\": \"" + pool_name +
+    "\", \"overlaypool\": \"" + cache_pool_name + "\"}",
+    inbl, NULL, NULL));
 }
 
 TEST_F(LibRadosTierECPP, FlushWriteRaces) {
@@ -3828,19 +3648,6 @@ TEST_F(LibRadosTwoPoolsECPP, FlushTryFlushRaces) {
     completion->release();
     completion2->release();
   }
-
-  // tear down tiers
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-    "\"}",
-    inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-    inbl, NULL, NULL));
-
-  // wait for maps to settle before next test
-  cluster.wait_for_latest_osdmap();
 }
 
 TEST_F(LibRadosTwoPoolsECPP, TryFlushReadRace) {
@@ -3903,19 +3710,6 @@ TEST_F(LibRadosTwoPoolsECPP, TryFlushReadRace) {
   while (num_reads > 0)
     cond.Wait(test_lock);
   test_lock.Unlock();
-
-  // tear down tiers
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-    "\"}",
-    inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(
-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-    inbl, NULL, NULL));
-
-  // wait for maps to settle before next test
-  cluster.wait_for_latest_osdmap();
 }
 
 TEST_F(LibRadosTierECPP, HitSetNone) {
@@ -3938,21 +3732,28 @@ TEST_F(LibRadosTierECPP, HitSetNone) {
   }
 }
 
-TEST_F(LibRadosTierECPP, HitSetRead) {
-  // enable hitset tracking for this pool
+TEST_F(LibRadosTwoPoolsECPP, HitSetRead) {
+  // make it a tier
   bufferlist inbl;
-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", 2),
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
+    "\", \"tierpool\": \"" + cache_pool_name +
+    "\", \"force_nonempty\": \"--force-nonempty\" }",
+    inbl, NULL, NULL));
+
+  // enable hitset tracking for this pool
+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", 2),
 						inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", 600),
+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", 600),
 						inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type",
+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type",
 						"explicit_object"),
 				   inbl, NULL, NULL));
 
   // wait for maps to settle
   cluster.wait_for_latest_osdmap();
 
-  ioctx.set_namespace("");
+  cache_ioctx.set_namespace("");
 
   // keep reading until we see our object appear in the HitSet
   utime_t start = ceph_clock_now(NULL);
@@ -3963,16 +3764,16 @@ TEST_F(LibRadosTierECPP, HitSetRead) {
     ASSERT_TRUE(now < hard_stop);
 
     string name = "foo";
-    uint32_t hash = ioctx.get_object_hash_position(name);
+    uint32_t hash = cache_ioctx.get_object_hash_position(name);
     hobject_t oid(sobject_t(name, CEPH_NOSNAP), "", hash,
-		  cluster.pool_lookup(pool_name.c_str()), "");
+		  cluster.pool_lookup(cache_pool_name.c_str()), "");
 
     bufferlist bl;
-    ASSERT_EQ(-ENOENT, ioctx.read("foo", bl, 1, 0));
+    ASSERT_EQ(-ENOENT, cache_ioctx.read("foo", bl, 1, 0));
 
     bufferlist hbl;
     AioCompletion *c = librados::Rados::aio_create_completion();
-    ASSERT_EQ(0, ioctx.hit_set_get(hash, c, now.sec(), &hbl));
+    ASSERT_EQ(0, cache_ioctx.hit_set_get(hash, c, now.sec(), &hbl));
     c->wait_for_complete();
     c->release();
 
@@ -4069,25 +3870,32 @@ TEST_F(LibRadosTierECPP, HitSetWrite) {
 }
 #endif
 
-TEST_F(LibRadosTierECPP, HitSetTrim) {
+TEST_F(LibRadosTwoPoolsECPP, HitSetTrim) {
   unsigned count = 3;
   unsigned period = 3;
 
-  // enable hitset tracking for this pool
+  // make it a tier
   bufferlist inbl;
-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", count),
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
+    "\", \"tierpool\": \"" + cache_pool_name +
+    "\", \"force_nonempty\": \"--force-nonempty\" }",
+    inbl, NULL, NULL));
+
+  // enable hitset tracking for this pool
+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", count),
 						inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", period),
+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", period),
 						inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type", "bloom"),
+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type", "bloom"),
 				   inbl, NULL, NULL));
-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_fpp", ".01"),
+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_fpp", ".01"),
 				   inbl, NULL, NULL));
 
   // wait for maps to settle
   cluster.wait_for_latest_osdmap();
 
-  ioctx.set_namespace("");
+  cache_ioctx.set_namespace("");
 
   // do a bunch of writes and make sure the hitsets rotate
   utime_t start = ceph_clock_now(NULL);
@@ -4100,16 +3908,16 @@ TEST_F(LibRadosTierECPP, HitSetTrim) {
 
   while (true) {
     string name = "foo";
-    uint32_t hash = ioctx.get_object_hash_position(name);
+    uint32_t hash = cache_ioctx.get_object_hash_position(name);
     hobject_t oid(sobject_t(name, CEPH_NOSNAP), "", hash, -1, "");
 
     bufferlist bl;
     bl.append(buf, bsize);
-    ASSERT_EQ(0, ioctx.append("foo", bl, bsize));
+    ASSERT_EQ(0, cache_ioctx.append("foo", bl, bsize));
 
     list<pair<time_t, time_t> > ls;
     AioCompletion *c = librados::Rados::aio_create_completion();
-    ASSERT_EQ(0, ioctx.hit_set_list(hash, c, &ls));
+    ASSERT_EQ(0, cache_ioctx.hit_set_list(hash, c, &ls));
     c->wait_for_complete();
     c->release();
 
diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc
index 8133d4d..7c5dc58 100644
--- a/src/test/objectstore/store_test.cc
+++ b/src/test/objectstore/store_test.cc
@@ -1115,6 +1115,111 @@ TEST_P(StoreTest, MoveRename) {
     ASSERT_TRUE(newomap.count("omap_key"));
     ASSERT_TRUE(newomap["omap_key"].contents_equal(omap["omap_key"]));
   }
+  {
+    ObjectStore::Transaction t;
+    t.remove(cid, oid);
+    t.remove_collection(cid);
+    t.remove_collection(temp_cid);
+    r = store->apply_transaction(t);
+    ASSERT_EQ(r, 0);
+  }
+}
+
+TEST_P(StoreTest, BigRGWObjectName) {
+  store->set_allow_sharded_objects();
+  store->sync_and_flush();
+  coll_t temp_cid("mytemp");
+  hobject_t temp_oid("tmp_oid", "", CEPH_NOSNAP, 0, 0, "");
+  coll_t cid("dest");
+  ghobject_t oid(
+    hobject_t(
+      "default.4106.50_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa [...]
+      "",
+      CEPH_NOSNAP,
+      0x81920472,
+      3,
+      ""),
+    15,
+    shard_id_t(1));
+  ghobject_t oid2(oid);
+  oid2.generation = 17;
+  ghobject_t oidhead(oid);
+  oidhead.generation = ghobject_t::NO_GEN;
+
+  int r;
+  {
+    ObjectStore::Transaction t;
+    t.create_collection(cid);
+    t.touch(cid, oidhead);
+    t.collection_move_rename(cid, oidhead, cid, oid);
+    t.touch(cid, oidhead);
+    t.collection_move_rename(cid, oidhead, cid, oid2);
+    r = store->apply_transaction(t);
+    ASSERT_EQ(r, 0);
+  }
+
+  {
+    ObjectStore::Transaction t;
+    t.remove(cid, oid);
+    r = store->apply_transaction(t);
+    ASSERT_EQ(r, 0);
+  }
+
+  {
+    vector<ghobject_t> objects;
+    r = store->collection_list(cid, objects);
+    ASSERT_EQ(r, 0);
+    ASSERT_EQ(objects.size(), 1u);
+    ASSERT_EQ(objects[0], oid2);
+  }
+
+  ASSERT_FALSE(store->exists(cid, oid));
+
+  {
+    ObjectStore::Transaction t;
+    t.remove(cid, oid2);
+    t.remove_collection(cid);
+    r = store->apply_transaction(t);
+    ASSERT_EQ(r, 0);
+
+  }
+}
+
+TEST_P(StoreTest, SetAllocHint) {
+  coll_t cid("alloc_hint");
+  ghobject_t hoid(hobject_t("test_hint", "", CEPH_NOSNAP, 0, 0, ""));
+  int r;
+  {
+    ObjectStore::Transaction t;
+    t.create_collection(cid);
+    t.touch(cid, hoid);
+    r = store->apply_transaction(t);
+    ASSERT_EQ(r, 0);
+  }
+  {
+    ObjectStore::Transaction t;
+    t.set_alloc_hint(cid, hoid, 4*1024*1024, 1024*4);
+    r = store->apply_transaction(t);
+    ASSERT_EQ(r, 0);
+  }
+  {
+    ObjectStore::Transaction t;
+    t.remove(cid, hoid);
+    r = store->apply_transaction(t);
+    ASSERT_EQ(r, 0);
+  }
+  {
+    ObjectStore::Transaction t;
+    t.set_alloc_hint(cid, hoid, 4*1024*1024, 1024*4);
+    r = store->apply_transaction(t);
+    ASSERT_EQ(r, 0);
+  }
+  {
+    ObjectStore::Transaction t;
+    t.remove_collection(cid);
+    r = store->apply_transaction(t);
+    ASSERT_EQ(r, 0);
+  }
 }
 
 INSTANTIATE_TEST_CASE_P(
diff --git a/src/test/osd/TestOSDMap.cc b/src/test/osd/TestOSDMap.cc
index 0ff12c8..451b6b2 100644
--- a/src/test/osd/TestOSDMap.cc
+++ b/src/test/osd/TestOSDMap.cc
@@ -50,13 +50,24 @@ public:
     }
     osdmap.apply_incremental(pending_inc);
 
-    // kludge to get an erasure coding rule and pool
+    // Create an EC ruleset and a pool using it
     int r = osdmap.crush->add_simple_ruleset("erasure", "default", "osd",
 					     "indep", pg_pool_t::TYPE_ERASURE,
 					     &cerr);
-    pg_pool_t *p = (pg_pool_t *)osdmap.get_pg_pool(2);
+
+    OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
+    new_pool_inc.new_pool_max = osdmap.get_pool_max();
+    new_pool_inc.fsid = osdmap.get_fsid();
+    pg_pool_t empty;
+    uint64_t pool_id = ++new_pool_inc.new_pool_max;
+    pg_pool_t *p = new_pool_inc.get_new_pool(pool_id, &empty);
+    p->size = 3;
+    p->set_pg_num(64);
+    p->set_pgp_num(64);
     p->type = pg_pool_t::TYPE_ERASURE;
     p->crush_ruleset = r;
+    new_pool_inc.new_pool_names[pool_id] = "ec";
+    osdmap.apply_incremental(new_pool_inc);
   }
   unsigned int get_num_osds() { return num_osds; }
 
@@ -86,6 +97,48 @@ TEST_F(OSDMapTest, Create) {
   ASSERT_EQ(get_num_osds(), osdmap.get_num_in_osds());
 }
 
+TEST_F(OSDMapTest, Features) {
+  // with EC pool
+  set_up_map();
+  uint64_t features = osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
+  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
+  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
+  ASSERT_FALSE(features & CEPH_FEATURE_CRUSH_TUNABLES3);
+  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_V2);
+  ASSERT_TRUE(features & CEPH_FEATURE_OSD_ERASURE_CODES);
+  ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
+  ASSERT_FALSE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);
+
+  // clients have a slightly different view
+  features = osdmap.get_features(CEPH_ENTITY_TYPE_CLIENT, NULL);
+  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
+  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
+  ASSERT_FALSE(features & CEPH_FEATURE_CRUSH_TUNABLES3);
+  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_V2);
+  ASSERT_FALSE(features & CEPH_FEATURE_OSD_ERASURE_CODES);  // don't need this
+  ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
+  ASSERT_FALSE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);
+
+  // remove the EC pool, but leave the rule.  Add primary affinity.
+  {
+    OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
+    new_pool_inc.old_pools.insert(osdmap.lookup_pg_pool_name("ec"));
+    new_pool_inc.new_primary_affinity[0] = 0x8000;
+    osdmap.apply_incremental(new_pool_inc);
+  }
+
+  features = osdmap.get_features(CEPH_ENTITY_TYPE_MON, NULL);
+  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
+  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
+  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES3); // shared bit with primary affinity
+  ASSERT_FALSE(features & CEPH_FEATURE_CRUSH_V2);
+  ASSERT_FALSE(features & CEPH_FEATURE_OSD_ERASURE_CODES);
+  ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
+  ASSERT_TRUE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);
+
+  // FIXME: test tiering feature bits
+}
+
 TEST_F(OSDMapTest, MapPG) {
   set_up_map();
 
diff --git a/src/test/osd/osd-test-helpers.sh b/src/test/osd/osd-test-helpers.sh
index 5117ae3..1ea17dd 100644
--- a/src/test/osd/osd-test-helpers.sh
+++ b/src/test/osd/osd-test-helpers.sh
@@ -37,6 +37,7 @@ function run_osd() {
     ceph_args+=" --osd-journal-size=100"
     ceph_args+=" --osd-data=$osd_data"
     ceph_args+=" --chdir="
+    ceph_args+=" --osd-pool-default-erasure-code-directory=.libs"
     ceph_args+=" --run-dir=$dir"
     ceph_args+=" --debug-osd=20"
     ceph_args+=" --log-file=$dir/osd-\$id.log"
diff --git a/src/test/strtol.cc b/src/test/strtol.cc
index d3f0ae0..08ba081 100644
--- a/src/test/strtol.cc
+++ b/src/test/strtol.cc
@@ -14,6 +14,7 @@
 
 #include "common/strtol.h"
 #include <string>
+#include <map>
 
 #include "gtest/gtest.h"
 
@@ -134,3 +135,77 @@ TEST(StrToL, Error1) {
 
   test_strict_strtof_err("0.05.0");
 }
+
+
+static void test_strict_sistrtoll(const char *str)
+{
+  std::string err;
+  strict_sistrtoll(str, &err);
+  ASSERT_EQ(err, "");
+}
+
+static void test_strict_sistrtoll_units(const std::string& foo,
+                                      char u, const int m)
+{
+  std::string s(foo);
+  s.push_back(u);
+  const char *str = s.c_str();
+  std::string err;
+  uint64_t r = strict_sistrtoll(str, &err);
+  ASSERT_EQ(err, "");
+
+  str = foo.c_str();
+  std::string err2;
+  long long tmp = strict_strtoll(str, 10, &err2);
+  ASSERT_EQ(err2, "");
+  tmp = (tmp << m);
+  ASSERT_EQ(tmp, (long long)r);
+}
+
+TEST(SIStrToLL, WithUnits) {
+  std::map<char,int> units;
+  units['B'] = 0;
+  units['K'] = 10;
+  units['M'] = 20;
+  units['G'] = 30;
+  units['T'] = 40;
+  units['P'] = 50;
+  units['E'] = 60;
+
+  for (std::map<char,int>::iterator p = units.begin();
+       p != units.end(); ++p) {
+    test_strict_sistrtoll_units("1024", p->first, p->second);
+    test_strict_sistrtoll_units("1", p->first, p->second);
+    test_strict_sistrtoll_units("0", p->first, p->second);
+  }
+}
+
+TEST(SIStrToLL, WithoutUnits) {
+  test_strict_sistrtoll("1024");
+  test_strict_sistrtoll("1152921504606846976");
+  test_strict_sistrtoll("0");
+}
+
+static void test_strict_sistrtoll_err(const char *str)
+{
+  std::string err;
+  strict_sistrtoll(str, &err);
+  ASSERT_NE(err, "");
+}
+
+TEST(SIStrToLL, Error) {
+  test_strict_sistrtoll_err("1024F");
+  test_strict_sistrtoll_err("QDDSA");
+  test_strict_sistrtoll_err("1b");
+  test_strict_sistrtoll_err("100k");
+  test_strict_sistrtoll_err("1000m");
+  test_strict_sistrtoll_err("1g");
+  test_strict_sistrtoll_err("20t");
+  test_strict_sistrtoll_err("100p");
+  test_strict_sistrtoll_err("1000e");
+  test_strict_sistrtoll_err("B");
+  test_strict_sistrtoll_err("M");
+  test_strict_sistrtoll_err("BM");
+  test_strict_sistrtoll_err("B0wef");
+  test_strict_sistrtoll_err("0m");
+}

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ceph/ceph.git



More information about the Pkg-ceph-commits mailing list