[Pkg-ceph-commits] [ceph] 01/05: re-patch with latest Firefly HEAD.

Mon Sep 15 19:45:53 UTC 2014

This is an automated email from the git hooks/post-receive script.

onlyjob pushed a commit to branch master
in repository ceph.

commit ce32a12
Author: Dmitry Smirnov <onlyjob at member.fsf.org>
Date:   Mon Sep 15 17:35:22 2014

    re-patch with latest Firefly HEAD.
---
 debian/patches/backfill-prio.patch               |    68 +-
 debian/patches/bug-8342.patch                    |     8 +-
 debian/patches/bug-8821.patch                    |    28 +-
 debian/patches/ceph-ao-require-cas.patch         |    16 -
 debian/patches/client-sleep1.patch               |     4 +-
 debian/patches/client-sleep2.patch               |     8 +-
 debian/patches/client-sleep3.patch               |     4 +-
 debian/patches/firefly-latest.patch              | 11172 +++++++++++++++++++++
 debian/patches/fix-blkdev-BLKGETSIZE-check.patch |    35 -
 debian/patches/series                            |     3 +-
 debian/patches/sleep-recover.patch               |     4 +-
 11 files changed, 11265 insertions(+), 85 deletions(-)

diff --git a/debian/patches/backfill-prio.patch b/debian/patches/backfill-prio.patch
index 8ac72ee..ae3669e 100644
--- a/debian/patches/backfill-prio.patch
+++ b/debian/patches/backfill-prio.patch
@@ -11,7 +11,8 @@ Date:   Tue Jun 24 02:09:49 2014
 
 --- a/src/common/AsyncReserver.h
 +++ b/src/common/AsyncReserver.h
-@@ -33,6 +33,7 @@ template <typename T>
+@@ -32,8 +32,9 @@
+ template <typename T>
  class AsyncReserver {
    Finisher *f;
    unsigned max_allowed;
@@ -19,7 +20,9 @@ Date:   Tue Jun 24 02:09:49 2014
    Mutex lock;
  
    map<unsigned, list<pair<T, Context*> > > queues;
-@@ -42,7 +43,9 @@ class AsyncReserver {
+   map<T, pair<unsigned, typename list<pair<T, Context*> >::iterator > > queue_pointers;
+@@ -41,9 +42,11 @@
+ 
    void do_queues() {
      typename map<unsigned, list<pair<T, Context*> > >::reverse_iterator it;
      for (it = queues.rbegin();
@@ -30,7 +33,9 @@ Date:   Tue Jun 24 02:09:49 2014
           ++it) {
        while (in_progress.size() < max_allowed &&
               !it->second.empty()) {
-@@ -57,8 +60,12 @@ class AsyncReserver {
+         pair<T, Context*> p = it->second.front();
+@@ -56,17 +59,27 @@
+   }
  public:
    AsyncReserver(
      Finisher *f,
@@ -45,7 +50,7 @@ Date:   Tue Jun 24 02:09:49 2014
  
    void set_max(unsigned max) {
      Mutex::Locker l(lock);
-@@ -66,6 +73,12 @@ public:
+     max_allowed = max;
      do_queues();
    }
  
@@ -58,9 +63,11 @@ Date:   Tue Jun 24 02:09:49 2014
    /**
     * Requests a reservation
     *
+    * Note, on_reserved may be called following cancel_reservation.  Thus,
 --- a/src/common/config_opts.h
 +++ b/src/common/config_opts.h
-@@ -389,6 +389,9 @@ OPTION(osd_compact_leveldb_on_mount, OPT
+@@ -389,8 +389,11 @@
+ 
  // Maximum number of backfills to or from a single osd
  OPTION(osd_max_backfills, OPT_U64, 10)
  
@@ -70,9 +77,11 @@ Date:   Tue Jun 24 02:09:49 2014
  // Refuse backfills when OSD full ratio is above this value
  OPTION(osd_backfill_full_ratio, OPT_FLOAT, 0.85)
  
+ // Seconds to wait before retrying refused backfills
 --- a/src/messages/MBackfillReserve.h
 +++ b/src/messages/MBackfillReserve.h
-@@ -28,8 +28,8 @@ public:
+@@ -27,10 +27,10 @@
+     REQUEST = 0,
      GRANT = 1,
      REJECT = 2,
    };
@@ -83,9 +92,11 @@ Date:   Tue Jun 24 02:09:49 2014
  
    MBackfillReserve()
      : Message(MSG_OSD_BACKFILL_RESERVE, HEAD_VERSION, COMPAT_VERSION),
+       query_epoch(0), type(-1), priority(-1) {}
 --- a/src/osd/OSD.cc
 +++ b/src/osd/OSD.cc
-@@ -217,8 +217,10 @@ OSDService::OSDService(OSD *osd) :
+@@ -218,10 +218,12 @@
+   backfill_request_timer(cct, backfill_request_lock, false),
    last_tid(0),
    tid_lock("OSDService::tid_lock"),
    reserver_finisher(cct),
@@ -98,15 +109,19 @@ Date:   Tue Jun 24 02:09:49 2014
    pg_temp_lock("OSDService::pg_temp_lock"),
    map_cache_lock("OSDService::map_lock"),
    map_cache(cct->_conf->osd_map_cache_size),
-@@ -7806,6 +7808,7 @@ const char** OSD::get_tracked_conf_keys(
+   map_bl_cache(cct->_conf->osd_map_cache_size),
+@@ -7870,8 +7872,9 @@
+ const char** OSD::get_tracked_conf_keys() const
  {
    static const char* KEYS[] = {
      "osd_max_backfills",
 +    "osd_min_recovery_priority",
      "osd_op_complaint_time", "osd_op_log_threshold",
      "osd_op_history_size", "osd_op_history_duration",
-     NULL
-@@ -7820,6 +7823,10 @@ void OSD::handle_conf_change(const struc
+     "osd_map_cache_size",
+     "osd_map_max_advance",
+@@ -7889,8 +7892,12 @@
+   if (changed.count("osd_max_backfills")) {
      service.local_reserver.set_max(cct->_conf->osd_max_backfills);
      service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
    }
@@ -117,9 +132,11 @@ Date:   Tue Jun 24 02:09:49 2014
    if (changed.count("osd_op_complaint_time") ||
        changed.count("osd_op_log_threshold")) {
      op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
+                                            cct->_conf->osd_op_log_threshold);
 --- a/src/osd/OSD.h
 +++ b/src/osd/OSD.h
-@@ -594,11 +594,6 @@ public:
+@@ -629,13 +629,8 @@
+     return t;
    }
  
    // -- backfill_reservation --
@@ -131,9 +148,11 @@ Date:   Tue Jun 24 02:09:49 2014
    Finisher reserver_finisher;
    AsyncReserver<spg_t> local_reserver;
    AsyncReserver<spg_t> remote_reserver;
+ 
 --- a/src/osd/PG.cc
 +++ b/src/osd/PG.cc
-@@ -1874,6 +1874,26 @@ void PG::mark_clean()
+@@ -1873,8 +1873,28 @@
+ 
    dirty_info = true;
  }
  
@@ -160,7 +179,9 @@ Date:   Tue Jun 24 02:09:49 2014
  void PG::finish_recovery(list<Context*>& tfin)
  {
    dout(10) << "finish_recovery" << dendl;
-@@ -5735,13 +5755,12 @@ PG::RecoveryState::WaitRemoteBackfillRes
+   assert(info.last_complete == info.last_update);
+@@ -5839,15 +5859,14 @@
+     ConnectionRef con = pg->osd->get_con_osd_cluster(
        backfill_osd_it->osd, pg->get_osdmap()->get_epoch());
      if (con) {
        if (con->has_feature(CEPH_FEATURE_BACKFILL_RESERVATION)) {
@@ -176,7 +197,9 @@ Date:   Tue Jun 24 02:09:49 2014
  	con.get());
        } else {
          post_event(RemoteBackfillReserved());
-@@ -5810,8 +5829,8 @@ PG::RecoveryState::WaitLocalBackfillRese
+       }
+@@ -5914,10 +5933,10 @@
+   pg->osd->local_reserver.request_reservation(
      pg->info.pgid,
      new QueuePeeringEvt<LocalBackfillReserved>(
        pg, pg->get_osdmap()->get_epoch(),
@@ -187,7 +210,9 @@ Date:   Tue Jun 24 02:09:49 2014
  }
  
  void PG::RecoveryState::WaitLocalBackfillReserved::exit()
-@@ -5866,7 +5885,8 @@ PG::RecoveryState::RepWaitRecoveryReserv
+ {
+@@ -5982,9 +6001,10 @@
+   pg->osd->remote_reserver.request_reservation(
      pg->info.pgid,
      new QueuePeeringEvt<RemoteRecoveryReserved>(
        pg, pg->get_osdmap()->get_epoch(),
@@ -197,7 +222,9 @@ Date:   Tue Jun 24 02:09:49 2014
  }
  
  boost::statechart::result
-@@ -6007,7 +6027,8 @@ PG::RecoveryState::WaitLocalRecoveryRese
+ PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
+@@ -6123,9 +6143,10 @@
+   pg->osd->local_reserver.request_reservation(
      pg->info.pgid,
      new QueuePeeringEvt<LocalRecoveryReserved>(
        pg, pg->get_osdmap()->get_epoch(),
@@ -207,9 +234,11 @@ Date:   Tue Jun 24 02:09:49 2014
  }
  
  void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
+ {
 --- a/src/osd/PG.h
 +++ b/src/osd/PG.h
-@@ -710,6 +710,11 @@ public:
+@@ -776,8 +776,13 @@
+   
    bool needs_recovery() const;
    bool needs_backfill() const;
  
@@ -221,9 +250,11 @@ Date:   Tue Jun 24 02:09:49 2014
    void mark_clean();  ///< mark an active pg clean
  
    bool _calc_past_interval_range(epoch_t *start, epoch_t *end);
+   void generate_past_intervals();
 --- a/src/osd/osd_types.h
 +++ b/src/osd/osd_types.h
-@@ -56,6 +56,10 @@
+@@ -55,8 +55,12 @@
+ #define CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER CompatSet::Feature(10, "snapmapper")
  #define CEPH_OSD_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(11, "sharded objects")
  
  
@@ -234,3 +265,4 @@ Date:   Tue Jun 24 02:09:49 2014
  typedef hobject_t collection_list_handle_t;
  
  typedef uint8_t shard_id_t;
+ 
diff --git a/debian/patches/bug-8342.patch b/debian/patches/bug-8342.patch
index 138626c..0de003b 100644
--- a/debian/patches/bug-8342.patch
+++ b/debian/patches/bug-8342.patch
@@ -11,7 +11,8 @@ Description:  [Fixes:#8342]
 
 --- a/src/init-ceph.in
 +++ b/src/init-ceph.in
-@@ -339,7 +339,11 @@ for name in $what; do
+@@ -361,9 +361,13 @@
+ 		    osd_location=`$osd_location_hook --cluster $cluster --id $id --type osd`
  		    get_conf osd_weight "" "osd crush initial weight"
  		    defaultweight="$(df -P -k $osd_data/. | tail -1 | awk '{ print sprintf("%.2f",$2/1073741824) }')"
  		    get_conf osd_keyring "$osd_data/keyring" "keyring"
@@ -24,7 +25,9 @@ Description:  [Fixes:#8342]
  		fi
  	    fi
  
-@@ -353,6 +357,7 @@ for name in $what; do
+ 	    echo Starting Ceph $name on $host...
+@@ -375,8 +379,9 @@
+ 	    [ -n "$pre_start" ] && do_cmd "$pre_start"
  	    do_cmd_okfail "$cmd" $runarg
  	    if [ "$ERR" != "0" ]; then
  		EXIT_STATUS=$ERR
@@ -32,3 +35,4 @@ Description:  [Fixes:#8342]
  	    fi
  
  	    if [ "$type" = "mon" ]; then
+ 		# this will only work if we are using default paths
diff --git a/debian/patches/bug-8821.patch b/debian/patches/bug-8821.patch
index 9e80569..fe8b99c 100644
--- a/debian/patches/bug-8821.patch
+++ b/debian/patches/bug-8821.patch
@@ -30,7 +30,8 @@ Subject: [PATCH 3/3] rbd: respect rbd_default_* parameters
 
 --- a/src/common/config_opts.h
 +++ b/src/common/config_opts.h
-@@ -738,8 +738,8 @@ OPTION(rbd_localize_parent_reads, OPT_BO
+@@ -748,10 +748,10 @@
+  * affected by rbd_default_order.
   */
  OPTION(rbd_default_format, OPT_INT, 1)
  OPTION(rbd_default_order, OPT_INT, 22)
@@ -41,9 +42,11 @@ Subject: [PATCH 3/3] rbd: respect rbd_default_* parameters
  OPTION(rbd_default_features, OPT_INT, 3) // 1 for layering, 3 for layering+stripingv2. only applies to format 2 images
  
  OPTION(nss_db_path, OPT_STR, "") // path to nss db
+ 
 --- a/src/rbd.cc
 +++ b/src/rbd.cc
-@@ -2345,7 +2345,8 @@ int main(int argc, const char **argv)
+@@ -2344,9 +2344,10 @@
+ 
    const char *poolname = NULL;
    uint64_t size = 0;  // in bytes
    int order = 0;
@@ -53,7 +56,9 @@ Subject: [PATCH 3/3] rbd: respect rbd_default_* parameters
    int format = 1;
    uint64_t features = RBD_FEATURE_LAYERING;
    const char *imgname = NULL, *snapname = NULL, *destname = NULL,
-@@ -2359,7 +2360,7 @@ int main(int argc, const char **argv)
+     *dest_poolname = NULL, *dest_snapname = NULL, *path = NULL,
+@@ -2358,9 +2359,9 @@
+   long long stripe_unit = 0, stripe_count = 0;
    long long bench_io_size = 4096, bench_io_threads = 16, bench_bytes = 1 << 30;
    string bench_pattern = "seq";
  
@@ -62,7 +67,9 @@ Subject: [PATCH 3/3] rbd: respect rbd_default_* parameters
    std::ostringstream err;
    long long sizell = 0;
    std::vector<const char*>::iterator i;
-@@ -2375,13 +2376,15 @@ int main(int argc, const char **argv)
+   for (i = args.begin(); i != args.end(); ) {
+@@ -2374,15 +2375,17 @@
+       return 0;
      } else if (ceph_argparse_flag(args, i, "--new-format", (char*)NULL)) {
        format = 2;
        format_specified = true;
@@ -81,7 +88,9 @@ Subject: [PATCH 3/3] rbd: respect rbd_default_* parameters
      } else if (ceph_argparse_witharg(args, i, &val, "-p", "--pool", (char*)NULL)) {
        poolname = strdup(val.c_str());
      } else if (ceph_argparse_witharg(args, i, &val, "--dest-pool", (char*)NULL)) {
-@@ -2416,7 +2419,6 @@ int main(int argc, const char **argv)
+       dest_poolname = strdup(val.c_str());
+@@ -2415,9 +2418,8 @@
+     } else if (ceph_argparse_withlonglong(args, i, &bench_io_size, &err, "--io-size", (char*)NULL)) {
      } else if (ceph_argparse_withlonglong(args, i, &bench_io_threads, &err, "--io-threads", (char*)NULL)) {
      } else if (ceph_argparse_withlonglong(args, i, &bench_bytes, &err, "--io-total", (char*)NULL)) {
      } else if (ceph_argparse_witharg(args, i, &bench_pattern, &err, "--io-pattern", (char*)NULL)) {
@@ -89,7 +98,9 @@ Subject: [PATCH 3/3] rbd: respect rbd_default_* parameters
      } else if (ceph_argparse_witharg(args, i, &val, "--path", (char*)NULL)) {
        path = strdup(val.c_str());
      } else if (ceph_argparse_witharg(args, i, &val, "--dest", (char*)NULL)) {
-@@ -2441,9 +2443,9 @@ int main(int argc, const char **argv)
+       destname = strdup(val.c_str());
+@@ -2440,11 +2442,11 @@
+       progress = false;
      } else if (ceph_argparse_flag(args, i , "--allow-shrink", (char *)NULL)) {
        resize_allow_shrink = true;
      } else if (ceph_argparse_witharg(args, i, &val, "--format", (char *) NULL)) {
@@ -102,7 +113,9 @@ Subject: [PATCH 3/3] rbd: respect rbd_default_* parameters
  	format = ret;
  	format_specified = true;
  	cerr << "rbd: using --format for specifying the rbd image format is"
-@@ -2557,6 +2559,17 @@ if (!set_conf_param(v, p1, p2, p3)) { \
+ 	     << " deprecated, use --image-format instead"
+@@ -2556,8 +2558,19 @@
+ 	break;
      }
    }
  
@@ -120,6 +133,7 @@ Subject: [PATCH 3/3] rbd: respect rbd_default_* parameters
    if (format_specified && opt_cmd != OPT_IMPORT && opt_cmd != OPT_CREATE) {
      cerr << "rbd: image format can only be set when "
  	 << "creating or importing an image" << std::endl;
+     return EXIT_FAILURE;
 --- /dev/null
 +++ b/src/test/cli-integration/rbd/defaults.t
 @@ -0,0 +1,214 @@
diff --git a/debian/patches/ceph-ao-require-cas.patch b/debian/patches/ceph-ao-require-cas.patch
deleted file mode 100644
index 0a893a3..0000000
--- a/debian/patches/ceph-ao-require-cas.patch
+++ /dev/null
@@ -1,16 +0,0 @@
-Last-Update: 2014-05-21
-Forwarded: https://github.com/ceph/ceph/pull/1844
-Bug-Debian: http://bugs.debian.org/748571
-Author: John David Anglin <dave.anglin at bell.net>
-Description: Define AO_REQUIRE_CAS to fix FTBFS on 'hppa'.
-
---- a/src/include/atomic.h
-+++ b/src/include/atomic.h
-@@ -25,6 +25,7 @@
- #ifndef NO_ATOMIC_OPS
- 
- // libatomic_ops implementation
-+#define AO_REQUIRE_CAS
- #include <atomic_ops.h>
- 
- // reinclude our assert to clobber the system one
diff --git a/debian/patches/client-sleep1.patch b/debian/patches/client-sleep1.patch
index 7334ee6..f3b2367 100644
--- a/debian/patches/client-sleep1.patch
+++ b/debian/patches/client-sleep1.patch
@@ -11,7 +11,8 @@ Signed-off-by: Yan, Zheng <zheng.z.yan at intel.com>
 
 --- a/src/client/Client.cc
 +++ b/src/client/Client.cc
-@@ -3075,12 +3075,27 @@ void Client::remove_all_caps(Inode *in)
+@@ -3108,14 +3108,29 @@
+   while (!in->caps.empty())
      remove_cap(in->caps.begin()->second, true);
  }
  
@@ -42,3 +43,4 @@ Signed-off-by: Yan, Zheng <zheng.z.yan at intel.com>
  }
  
  void Client::trim_caps(MetaSession *s, int max)
+ {
diff --git a/debian/patches/client-sleep2.patch b/debian/patches/client-sleep2.patch
index 0faea5e..67ce24a 100644
--- a/debian/patches/client-sleep2.patch
+++ b/debian/patches/client-sleep2.patch
@@ -10,7 +10,8 @@ Signed-off-by: Yan, Zheng <zheng.z.yan at intel.com>
 
 --- a/src/client/Client.cc
 +++ b/src/client/Client.cc
-@@ -2352,6 +2352,9 @@ void Client::put_cap_ref(Inode *in, int
+@@ -2385,8 +2385,11 @@
+ 
  int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
  {
    while (1) {
@@ -20,7 +21,9 @@ Signed-off-by: Yan, Zheng <zheng.z.yan at intel.com>
      if (endoff > 0 &&
  	(endoff >= (loff_t)in->max_size ||
  	 endoff > (loff_t)(in->size << 1)) &&
-@@ -3083,9 +3086,13 @@ void Client::remove_session_caps(MetaSes
+ 	endoff > (loff_t)in->wanted_max_size) {
+@@ -3116,11 +3119,15 @@
+   while (s->caps.size()) {
      Cap *cap = *s->caps.begin();
      Inode *in = cap->inode;
      int dirty_caps = 0;
@@ -35,3 +38,4 @@ Signed-off-by: Yan, Zheng <zheng.z.yan at intel.com>
      if (dirty_caps) {
        lderr(cct) << "remove_session_caps still has dirty|flushing caps on " << *in << dendl;
        if (in->flushing_caps)
+ 	num_flushing_caps--;
diff --git a/debian/patches/client-sleep3.patch b/debian/patches/client-sleep3.patch
index 8dabc7b..edf1a36 100644
--- a/debian/patches/client-sleep3.patch
+++ b/debian/patches/client-sleep3.patch
@@ -10,7 +10,8 @@ Signed-off-by: Yan, Zheng <zheng.z.yan at intel.com>
 
 --- a/src/client/Client.cc
 +++ b/src/client/Client.cc
-@@ -2090,15 +2090,21 @@ void Client::kick_requests_closed(MetaSe
+@@ -2123,17 +2123,23 @@
+ void Client::kick_requests_closed(MetaSession *session)
  {
    ldout(cct, 10) << "kick_requests_closed for mds." << session->mds_num << dendl;
    for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
@@ -40,3 +41,4 @@ Signed-off-by: Yan, Zheng <zheng.z.yan at intel.com>
      }
    }
    assert(session->requests.empty());
+   assert(session->unsafe_requests.empty());
diff --git a/debian/patches/firefly-latest.patch b/debian/patches/firefly-latest.patch
new file mode 100644
index 0000000..8414fc0
--- /dev/null
+++ b/debian/patches/firefly-latest.patch
@@ -0,0 +1,11172 @@
+Last-Update: 2014-09-16
+Forwarded: not-needed
+Origin: upstream
+Author: Dmitry Smirnov <onlyjob at member.fsf.org>
+Description: fixes from "firefly" branch since 0.80.5 release
+
+--- a/configure.ac
++++ b/configure.ac
+@@ -471,11 +471,16 @@
+              [AC_MSG_FAILURE(
+                    [no libatomic-ops found (use --without-libatomic-ops to disable)])
+               ])])
+ AS_IF([test "$HAVE_ATOMIC_OPS" = "1"],
+-	[],
++	[
++         AC_CHECK_SIZEOF(AO_t, [], [
++                                #include <atomic_ops.h>
++                                ])
++         ],
+ 	[AC_DEFINE([NO_ATOMIC_OPS], [1], [Defined if you do not have atomic_ops])])
+ 
++
+ AM_CONDITIONAL(WITH_LIBATOMIC, [test "$HAVE_ATOMIC_OPS" = "1"])
+ 
+ # newsyn?  requires mpi.
+ #AC_ARG_WITH([newsyn],
+--- /dev/null
++++ b/doc/_templates/layout.html
+@@ -0,0 +1,5 @@
++{% extends "!layout.html" %}
++
++{%- block extrahead %}
++    <script type="text/javascript" src="http://ayni.ceph.com/public/js/ceph.js"></script>
++{% endblock %}
+--- a/src/ceph-disk
++++ b/src/ceph-disk
+@@ -118,8 +118,11 @@
+ STATEDIR = '/var/lib/ceph'
+ 
+ SYSCONFDIR = '/etc/ceph'
+ 
++# only warn once about some things
++warned_about = {}
++
+ # Nuke the TERM variable to avoid confusing any subprocesses we call.
+ # For example, libreadline will print weird control sequences for some
+ # TERM values.
+ if 'TERM' in os.environ:
+@@ -130,10 +133,8 @@
+     LOG_NAME = os.path.basename(sys.argv[0])
+ LOG = logging.getLogger(LOG_NAME)
+ 
+ 
+-
+-
+ ###### lock ########
+ 
+ class filelock(object):
+     def __init__(self, fn):
+@@ -149,10 +150,12 @@
+         assert self.fd
+         fcntl.lockf(self.fd, fcntl.LOCK_UN)
+         self.fd = None
+ 
++
+ ###### exceptions ########
+ 
++
+ class Error(Exception):
+     """
+     Error
+     """
+@@ -160,51 +163,60 @@
+     def __str__(self):
+         doc = self.__doc__.strip()
+         return ': '.join([doc] + [str(a) for a in self.args])
+ 
++
+ class MountError(Error):
+     """
+     Mounting filesystem failed
+     """
+ 
++
+ class UnmountError(Error):
+     """
+     Unmounting filesystem failed
+     """
+ 
++
+ class BadMagicError(Error):
+     """
+     Does not look like a Ceph OSD, or incompatible version
+     """
+ 
++
+ class TruncatedLineError(Error):
+     """
+     Line is truncated
+     """
+ 
++
+ class TooManyLinesError(Error):
+     """
+     Too many lines
+     """
+ 
++
+ class FilesystemTypeError(Error):
+     """
+     Cannot discover filesystem type
+      """
+ 
++
+ class CephDiskException(Exception):
+     """
+     A base exception for ceph-disk to provide custom (ad-hoc) messages that
+     will be caught and dealt with when main() is executed
+     """
+     pass
+ 
++
+ class ExecutableNotFound(CephDiskException):
+     """
+     Exception to report on executables not available in PATH
+     """
+     pass
+ 
++
+ ####### utils
+ 
+ 
+ def maybe_mkdir(*a, **kw):
+@@ -299,9 +311,9 @@
+     of making sure that executables *will* be found and will error nicely
+     otherwise.
+     """
+     arguments = _get_command_executable(arguments)
+-    LOG.info('Running command: %s' % ' '.join(arguments))
++    LOG.info('Running command: %s', ' '.join(arguments))
+     return subprocess.check_call(arguments)
+ 
+ 
+ def platform_distro():
+@@ -339,35 +351,67 @@
+         str(codename).strip()
+     )
+ 
+ 
+-# a device "name" is something like
+-#  sdb
+-#  cciss!c0d1
+ def get_dev_name(path):
+     """
+-    get device name from path.  e.g., /dev/sda -> sdas, /dev/cciss/c0d1 -> cciss!c0d1
++    get device name from path.  e.g.::
++
++        /dev/sda -> sda, /dev/cciss/c0d1 -> cciss!c0d1
++
++    a device "name" is something like::
++
++        sdb
++        cciss!c0d1
++
+     """
+     assert path.startswith('/dev/')
+     base = path[5:]
+     return base.replace('/', '!')
+ 
+-# a device "path" is something like
+-#  /dev/sdb
+-#  /dev/cciss/c0d1
++
+ def get_dev_path(name):
+     """
+     get a path (/dev/...) from a name (cciss!c0d1)
++    a device "path" is something like::
++
++        /dev/sdb
++        /dev/cciss/c0d1
++
+     """
+     return '/dev/' + name.replace('!', '/')
+ 
++
+ def get_dev_relpath(name):
+     """
+     get a relative path to /dev from a name (cciss!c0d1)
+     """
+     return name.replace('!', '/')
+ 
+ 
++def get_dev_size(dev, size='megabytes'):
++    """
++    Attempt to get the size of a device so that we can prevent errors
++    from actions to devices that are smaller, and improve error reporting.
++
++    Because we want to avoid breakage in case this approach is not robust, we
++    will issue a warning if we failed to get the size.
++
++    :param size: bytes or megabytes
++    :param dev: the device to calculate the size
++    """
++    fd = os.open(dev, os.O_RDONLY)
++    dividers = {'bytes': 1, 'megabytes': 1024*1024}
++    try:
++        device_size = os.lseek(fd, 0, os.SEEK_END)
++        divider = dividers.get(size, 1024*1024)  # default to megabytes
++        return device_size/divider
++    except Exception as error:
++        LOG.warning('failed to get size of %s: %s' % (dev, str(error)))
++    finally:
++        os.close(fd)
++
++
+ def get_partition_dev(dev, pnum):
+     """
+     get the device name for a partition
+ 
+@@ -388,8 +432,9 @@
+         return get_dev_path(partname)
+     else:
+         raise Error('partition %d for %s does not appear to exist' % (pnum, dev))
+ 
++
+ def list_all_partitions():
+     """
+     Return a list of devices and partitions
+     """
+@@ -402,8 +447,9 @@
+             continue
+         dev_part_list[name] = list_partitions(name)
+     return dev_part_list
+ 
++
+ def list_partitions(basename):
+     """
+     Return a list of partitions on the given device name
+     """
+@@ -412,8 +458,25 @@
+         if name.startswith(basename):
+             partitions.append(name)
+     return partitions
+ 
++def get_partition_base(dev):
++    """
++    Get the base device for a partition
++    """
++    dev = os.path.realpath(dev)
++    if not stat.S_ISBLK(os.lstat(dev).st_mode):
++        raise Error('not a block device', dev)
++
++    name = get_dev_name(dev)
++    if os.path.exists(os.path.join('/sys/block', name)):
++        raise Error('not a partition', dev)
++
++    # find the base
++    for basename in os.listdir('/sys/block'):
++        if os.path.exists(os.path.join('/sys/block', basename, name)):
++            return '/dev/' + basename
++    raise Error('no parent device for partition', dev)
+ 
+ def is_partition(dev):
+     """
+     Check whether a given device path is a partition or a full disk.
+@@ -475,23 +538,23 @@
+         base = base[:-1]
+     return []
+ 
+ 
+-def verify_not_in_use(dev):
++def verify_not_in_use(dev, check_partitions=False):
+     """
+     Verify if a given device (path) is in use (e.g. mounted or
+     in use by device-mapper).
+ 
+     :raises: Error if device is in use.
+     """
+     assert os.path.exists(dev)
+-    if is_partition(dev):
+-        if is_mounted(dev):
+-            raise Error('Device is mounted', dev)
+-        holders = is_held(dev)
+-        if holders:
+-            raise Error('Device is in use by a device-mapper mapping (dm-crypt?)' % dev, ','.join(holders))
+-    else:
++    if is_mounted(dev):
++        raise Error('Device is mounted', dev)
++    holders = is_held(dev)
++    if holders:
++        raise Error('Device is in use by a device-mapper mapping (dm-crypt?)' % dev, ','.join(holders))
++
++    if check_partitions and not is_partition(dev):
+         basename = get_dev_name(os.path.realpath(dev))
+         for partname in list_partitions(basename):
+             partition = get_dev_path(partname)
+             if is_mounted(partition):
+@@ -535,12 +598,14 @@
+ 
+     try:
+         line = must_be_one_line(line)
+     except (TruncatedLineError, TooManyLinesError) as e:
+-        raise Error('File is corrupt: {path}: {msg}'.format(
++        raise Error(
++            'File is corrupt: {path}: {msg}'.format(
+                 path=path,
+                 msg=e,
+-                ))
++            )
++        )
+     return line
+ 
+ 
+ def write_one_line(parent, name, text):
+@@ -745,9 +810,9 @@
+     Maps a device to a dmcrypt device.
+ 
+     :return: Path to the dmcrypt device.
+     """
+-    dev = '/dev/mapper/'+ _uuid
++    dev = '/dev/mapper/' + _uuid
+     args = [
+         'cryptsetup',
+         '--key-file',
+         keypath,
+@@ -791,8 +856,14 @@
+     """
+     Mounts a device with given filessystem type and
+     mount options to a tempfile path under /var/lib/ceph/tmp.
+     """
++    # sanity check: none of the arguments are None
++    if dev is None:
++        raise ValueError('dev may not be None')
++    if fstype is None:
++        raise ValueError('fstype may not be None')
++
+     # pick best-of-breed mount options based on fs type
+     if options is None:
+         options = MOUNT_OPTIONS.get(fstype, '')
+ 
+@@ -966,8 +1037,17 @@
+             size=journal_size,
+             )
+         LOG.warning('OSD will not be hot-swappable if journal is not the same device as the osd data')
+ 
++    dev_size = get_dev_size(journal)
++
++    if journal_size > dev_size:
++        LOG.error('refusing to create journal on %s' % journal)
++        LOG.error('journal size (%sM) is bigger than device (%sM)' % (journal_size, dev_size))
++        raise Error(
++            '%s device size (%sM) is not big enough for journal' % (journal, dev_size)
++        )
++
+     try:
+         LOG.debug('Creating journal partition num %d size %d on %s', num, journal_size, journal)
+         command_check_call(
+             [
+@@ -1043,9 +1123,9 @@
+     journal):
+ 
+     if not os.path.exists(journal):
+         LOG.debug('Creating journal file %s with size 0 (ceph-osd will resize and allocate)', journal)
+-        with file(journal, 'wb') as journal_file:
++        with file(journal, 'wb') as journal_file:  # noqa
+             pass
+ 
+     LOG.debug('Journal is file %s', journal)
+     LOG.warning('OSD will not be hot-swappable if journal is not the same device as the osd data')
+@@ -1109,15 +1189,16 @@
+             os.symlink(target, path)
+         except:
+             raise Error('unable to create symlink %s -> %s' % (path, target))
+ 
++
+ def prepare_dir(
+     path,
+     journal,
+     cluster_uuid,
+     osd_uuid,
+     journal_uuid,
+-    journal_dmcrypt = None,
++    journal_dmcrypt=None,
+     ):
+ 
+     if os.path.exists(os.path.join(path, 'magic')):
+         LOG.debug('Data dir %s already exists', path)
+@@ -1182,11 +1263,8 @@
+     if is_partition(data):
+         LOG.debug('OSD data device %s is a partition', data)
+         rawdev = data
+     else:
+-        if journal_dmcrypt is not None:
+-            dmcrypt_unmap(journal)
+-
+         LOG.debug('Creating osd partition on %s', data)
+         try:
+             command_check_call(
+                 [
+@@ -1237,11 +1315,11 @@
+                 args.extend(['-f'])  # always force
+         else:
+             args.extend(MKFS_ARGS.get(fstype, []))
+         args.extend([
+-                '--',
+-                dev,
+-                ])
++            '--',
++            dev,
++            ])
+         try:
+             LOG.debug('Creating %s fs on %s', fstype, dev)
+             command_check_call(args)
+         except subprocess.CalledProcessError as e:
+@@ -1266,10 +1344,8 @@
+             unmount(path)
+     finally:
+         if rawdev != dev:
+             dmcrypt_unmap(osd_uuid)
+-        if journal_dmcrypt is not None:
+-            dmcrypt_unmap(journal)
+ 
+     if not is_partition(data):
+         try:
+             command_check_call(
+@@ -1288,9 +1364,9 @@
+     journal_dm_keypath = None
+     osd_dm_keypath = None
+ 
+     try:
+-        prepare_lock.acquire()
++        prepare_lock.acquire()  # noqa
+         if not os.path.exists(args.data):
+             if args.data_dev:
+                 raise Error('data path does not exist', args.data)
+             else:
+@@ -1298,14 +1374,14 @@
+ 
+         # in use?
+         dmode = os.stat(args.data).st_mode
+         if stat.S_ISBLK(dmode):
+-            verify_not_in_use(args.data)
++            verify_not_in_use(args.data, True)
+ 
+         if args.journal and os.path.exists(args.journal):
+             jmode = os.stat(args.journal).st_mode
+             if stat.S_ISBLK(jmode):
+-                verify_not_in_use(args.journal)
++                verify_not_in_use(args.journal, False)
+ 
+         if args.zap_disk is not None:
+             if stat.S_ISBLK(dmode) and not is_partition(args.data):
+                 zap(args.data)
+@@ -1420,9 +1496,9 @@
+                 osd_dm_keypath=osd_dm_keypath,
+                 )
+         else:
+             raise Error('not a dir or block device', args.data)
+-        prepare_lock.release()
++        prepare_lock.release()  # noqa
+ 
+         if stat.S_ISBLK(dmode):
+             # try to make sure the kernel refreshes the table.  note
+             # that if this gets ebusy, we are probably racing with
+@@ -1456,9 +1532,9 @@
+         if journal_dm_keypath:
+             os.unlink(journal_dm_keypath)
+         if osd_dm_keypath:
+             os.unlink(osd_dm_keypath)
+-        prepare_lock.release()
++        prepare_lock.release()  # noqa
+         raise e
+ 
+ 
+ ###########################
+@@ -1622,20 +1698,23 @@
+             command_check_call(
+                 [
+                     svc,
+                     'ceph',
++                    '--cluster',
++                    '{cluster}'.format(cluster=cluster),
+                     'start',
+                     'osd.{osd_id}'.format(osd_id=osd_id),
+                     ],
+                 )
+         else:
+             raise Error('{cluster} osd.{osd_id} is not tagged with an init system'.format(
+-                    cluster=cluster,
+-                    osd_id=osd_id,
+-                    ))
++                cluster=cluster,
++                osd_id=osd_id,
++            ))
+     except subprocess.CalledProcessError as e:
+         raise Error('ceph osd start failed', e)
+ 
++
+ def detect_fstype(
+     dev,
+     ):
+     fstype = _check_output(
+@@ -1703,10 +1782,10 @@
+         other = False
+         src_dev = os.stat(path).st_dev
+         try:
+             dst_dev = os.stat((STATEDIR + '/osd/{cluster}-{osd_id}').format(
+-                    cluster=cluster,
+-                    osd_id=osd_id)).st_dev
++                cluster=cluster,
++                osd_id=osd_id)).st_dev
+             if src_dev == dst_dev:
+                 active = True
+             else:
+                 parent_dev = os.stat(STATEDIR + '/osd').st_dev
+@@ -1759,9 +1838,9 @@
+             )
+ 
+     (osd_id, cluster) = activate(path, activate_key_template, init)
+ 
+-    if init not in ( None, 'none' ):
++    if init not in (None, 'none' ):
+         canonical = (STATEDIR + '/osd/{cluster}-{osd_id}').format(
+             cluster=cluster,
+             osd_id=osd_id)
+         if path != canonical:
+@@ -1814,8 +1893,9 @@
+         LOG.warning('No fsid defined in ' + SYSCONFDIR + '/ceph.conf; using anyway')
+         return 'ceph'
+     return None
+ 
++
+ def activate(
+     path,
+     activate_key_template,
+     init,
+@@ -1860,9 +1940,9 @@
+             fsid=fsid,
+             keyring=keyring,
+             )
+ 
+-    if init not in ( None, 'none' ):
++    if init not in (None, 'none' ):
+         if init == 'auto':
+             conf_val = get_conf(
+                 cluster=cluster,
+                 variable='init'
+@@ -1911,9 +1991,9 @@
+     if is_suppressed(args.path):
+         LOG.info('suppressed activate request on %s', args.path)
+         return
+ 
+-    activate_lock.acquire()
++    activate_lock.acquire()  # noqa
+     try:
+         mode = os.stat(args.path).st_mode
+         if stat.S_ISBLK(mode):
+             (cluster, osd_id) = mount_activate(
+@@ -1931,9 +2011,9 @@
+ 
+             if args.mark_init == 'none':
+                 command_check_call(
+                     [
+-                    'ceph-osd',
++                        'ceph-osd',
+                         '--cluster={cluster}'.format(cluster=cluster),
+                         '--id={osd_id}'.format(osd_id=osd_id),
+                         '--osd-data={path}'.format(path=args.path),
+                         '--osd-journal={path}/journal'.format(path=args.path),
+@@ -1942,17 +2022,17 @@
+ 
+         else:
+             raise Error('%s is not a directory or block device' % args.path)
+ 
+-        if args.mark_init not in ( None, 'none' ):
++        if args.mark_init not in (None, 'none' ):
+ 
+             start_daemon(
+                 cluster=cluster,
+                 osd_id=osd_id,
+             )
+ 
+     finally:
+-        activate_lock.release()
++        activate_lock.release()  # noqa
+ 
+ 
+ ###########################
+ 
+@@ -1983,16 +2063,17 @@
+     value = str(out).split('\n', 1)[0]
+     LOG.debug('Journal %s has OSD UUID %s', path, value)
+     return value
+ 
++
+ def main_activate_journal(args):
+     if not os.path.exists(args.dev):
+         raise Error('%s does not exist' % args.dev)
+ 
+     cluster = None
+     osd_id = None
+     osd_uuid = None
+-    activate_lock.acquire()
++    activate_lock.acquire()  # noqa
+     try:
+         osd_uuid = get_journal_osd_uuid(args.dev)
+         path = os.path.join('/dev/disk/by-partuuid/', osd_uuid.lower())
+ 
+@@ -2007,12 +2088,14 @@
+             osd_id=osd_id,
+             )
+ 
+     finally:
+-        activate_lock.release()
++        activate_lock.release()  # noqa
++
+ 
+ ###########################
+ 
++
+ def main_activate_all(args):
+     dir = '/dev/disk/by-parttypeuuid'
+     LOG.debug('Scanning %s', dir)
+     if not os.path.exists(dir):
+@@ -2021,12 +2104,18 @@
+     for name in os.listdir(dir):
+         if name.find('.') < 0:
+             continue
+         (tag, uuid) = name.split('.')
+-        if tag == OSD_UUID:
+-            path = os.path.join(dir, name)
++
++        if tag == OSD_UUID or tag == DMCRYPT_OSD_UUID:
++
++            if tag == DMCRYPT_OSD_UUID:
++                path = os.path.join('/dev/mapper', uuid)
++            else:
++                path = os.path.join(dir, name)
++
+             LOG.info('Activating %s', path)
+-            activate_lock.acquire()
++            activate_lock.acquire()  # noqa
+             try:
+                 (cluster, osd_id) = mount_activate(
+                     dev=path,
+                     activate_key_template=args.activate_key_template,
+@@ -2044,9 +2133,9 @@
+                     )
+                 err = True
+ 
+             finally:
+-                activate_lock.release()
++                activate_lock.release()  # noqa
+     if err:
+         raise Error('One or more partitions failed to activate')
+ 
+ 
+@@ -2065,15 +2154,17 @@
+                 if swaps_dev == dev:
+                     return True
+     return False
+ 
++
+ def get_oneliner(base, name):
+     path = os.path.join(base, name)
+     if os.path.isfile(path):
+         with open(path, 'r') as _file:
+             return _file.readline().rstrip()
+     return None
+ 
++
+ def get_dev_fs(dev):
+     fscheck, _ = command(
+         [
+             'blkid',
+@@ -2087,9 +2178,58 @@
+         return fstype
+     else:
+         return None
+ 
++
+ def get_partition_type(part):
++    """
++    Get the GPT partition type UUID.  If we have an old blkid and can't
++    get it that way, use sgdisk and use the description instead (and hope
++    dmcrypt isn't being used).
++    """
++    blkid, _ = command(
++        [
++            'blkid',
++            '-p',
++            '-o', 'udev',
++            part,
++        ]
++    )
++    saw_part_entry = False
++    for line in blkid.splitlines():
++        (key, value) = line.split('=')
++        if key == 'ID_PART_ENTRY_TYPE':
++            return value
++        if key == 'ID_PART_ENTRY_SCHEME':
++            table_type = value
++        if key.startswith('ID_PART_ENTRY_'):
++            saw_part_entry = True
++
++    # hmm, is it in fact GPT?
++    table_type = None
++    base = get_partition_base(part)
++    blkid, _ = command(
++        [
++            'blkid',
++            '-p',
++            '-o', 'udev',
++            base
++        ]
++    )
++    for line in blkid.splitlines():
++        (key, value) = line.split('=')
++        if key == 'ID_PART_TABLE_TYPE':
++            table_type = value
++    if table_type != 'gpt':
++        return None    # not even GPT
++
++    if saw_part_entry:
++        return None    # GPT, and blkid appears to be new, so we're done.
++
++    # bah, fall back to sgdisk.
++    if 'blkid' not in warned_about:
++        LOG.warning('Old blkid does not support ID_PART_ENTRY_* fields, trying sgdisk; may not correctly identify ceph volumes with dmcrypt')
++        warned_about['blkid'] = True
+     (base, partnum) = re.match('(\D+)(\d+)', part).group(1, 2)
+     sgdisk, _ = command(
+         [
+             'sgdisk',
+@@ -2103,11 +2243,18 @@
+         if m is not None:
+             num = m.group(1)
+             if num != partnum:
+                 continue
+-            return m.group(2)
++            desc = m.group(2)
++            # assume unencrypted ... blkid has failed us :(
++            if desc == 'ceph data':
++                return OSD_UUID
++            if desc == 'ceph journal':
++                return JOURNAL_UUID
++
+     return None
+ 
++
+ def get_partition_uuid(dev):
+     (base, partnum) = re.match('(\D+)(\d+)', dev).group(1, 2)
+     out, _ = command(['sgdisk', '-i', partnum, base])
+     for line in out.splitlines():
+@@ -2115,8 +2262,9 @@
+         if m:
+             return m.group(1).lower()
+     return None
+ 
++
+ def more_osd_info(path, uuid_map):
+     desc = []
+     ceph_fsid = get_oneliner(path, 'ceph_fsid')
+     if ceph_fsid:
+@@ -2137,46 +2285,71 @@
+             desc.append('journal %s' % uuid_map[journal_uuid])
+ 
+     return desc
+ 
++def list_dev_osd(dev, uuid_map):
++    path = is_mounted(dev)
++    fs_type = get_dev_fs(dev)
++    desc = []
++    if path:
++        desc.append('active')
++        desc.extend(more_osd_info(path, uuid_map))
++    elif fs_type:
++        try:
++            tpath = mount(dev=dev, fstype=fs_type, options='')
++            if tpath:
++                try:
++                    magic = get_oneliner(tpath, 'magic')
++                    if magic is not None:
++                        desc.append('prepared')
++                        desc.extend(more_osd_info(tpath, uuid_map))
++                finally:
++                    unmount(tpath)
++        except MountError:
++            pass
++    return desc
+ 
+ def list_dev(dev, uuid_map, journal_map):
+     ptype = 'unknown'
+     prefix = ''
+     if is_partition(dev):
+         ptype = get_partition_type(dev)
+         prefix = ' '
+-    fs_type = get_dev_fs(dev)
+-    path = is_mounted(dev)
+ 
+     desc = []
+-    if ptype == 'ceph data':
+-        if path:
+-            desc.append('active')
+-            desc.extend(more_osd_info(path, uuid_map))
+-        elif fs_type:
+-            try:
+-                tpath = mount(dev=dev, fstype=fs_type, options='')
+-                if tpath:
+-                    try:
+-                        magic = get_oneliner(tpath, 'magic')
+-                        if magic is not None:
+-                            desc.append('prepared')
+-                            desc.extend(more_osd_info(tpath, uuid_map))
+-                    finally:
+-                        unmount(tpath)
+-            except MountError:
+-                pass
++    if ptype == OSD_UUID:
++        desc = list_dev_osd(dev, uuid_map)
+         if desc:
+             desc = ['ceph data'] + desc
+         else:
+             desc = ['ceph data', 'unprepared']
+-    elif ptype == 'ceph journal':
++    elif ptype == DMCRYPT_OSD_UUID:
++        holders = is_held(dev)
++        if not holders:
++            desc = ['ceph data (dmcrypt)', 'not currently mapped']
++        elif len(holders) == 1:
++            holder = '/dev/' + holders[0]
++            fs_desc = list_dev_osd(holder, uuid_map)
++            desc = ['ceph data (dmcrypt %s)' % holder] + fs_desc
++        else:
++            desc = ['ceph data (dmcrypt)', 'holders: ' + ','.join(holders)]
++    elif ptype == JOURNAL_UUID:
+         desc.append('ceph journal')
+         part_uuid = get_partition_uuid(dev)
+         if part_uuid and part_uuid in journal_map:
+             desc.append('for %s' % journal_map[part_uuid])
++    elif ptype == DMCRYPT_JOURNAL_UUID:
++        holders = is_held(dev)
++        if len(holders) == 1:
++            desc = ['ceph journal (dmcrypt /dev/%s)' % holders[0]]
++        else:
++            desc = ['ceph journal (dmcrypt)']
++        part_uuid = get_partition_uuid(dev)
++        if part_uuid and part_uuid in journal_map:
++            desc.append('for %s' % journal_map[part_uuid])
+     else:
++        path = is_mounted(dev)
++        fs_type = get_dev_fs(dev)
+         if is_swap(dev):
+             desc.append('swap')
+         else:
+             desc.append('other')
+@@ -2189,9 +2362,8 @@
+ 
+     print '%s%s %s' % (prefix, dev, ', '.join(desc))
+ 
+ 
+-
+ def main_list(args):
+     partmap = list_all_partitions()
+ 
+     uuid_map = {}
+@@ -2202,20 +2374,37 @@
+             part_uuid = get_partition_uuid(dev)
+             if part_uuid:
+                 uuid_map[part_uuid] = dev
+             ptype = get_partition_type(dev)
+-            if ptype == 'ceph data':
++            if ptype == OSD_UUID:
+                 fs_type = get_dev_fs(dev)
+-                try:
+-                    tpath = mount(dev=dev, fstype=fs_type, options='')
++                if fs_type is not None:
+                     try:
+-                        journal_uuid = get_oneliner(tpath, 'journal_uuid')
+-                        if journal_uuid:
+-                            journal_map[journal_uuid.lower()] = dev
+-                    finally:
+-                        unmount(tpath)
+-                except MountError:
+-                    pass
++                        tpath = mount(dev=dev, fstype=fs_type, options='')
++                        try:
++                            journal_uuid = get_oneliner(tpath, 'journal_uuid')
++                            if journal_uuid:
++                                journal_map[journal_uuid.lower()] = dev
++                        finally:
++                            unmount(tpath)
++                    except MountError:
++                        pass
++            if ptype == DMCRYPT_OSD_UUID:
++                holders = is_held(dev)
++                if len(holders) == 1:
++                    holder = '/dev/' + holders[0]
++                    fs_type = get_dev_fs(holder)
++                    if fs_type is not None:
++                        try:
++                            tpath = mount(dev=holder, fstype=fs_type, options='')
++                            try:
++                                journal_uuid = get_oneliner(tpath, 'journal_uuid')
++                                if journal_uuid:
++                                    journal_map[journal_uuid.lower()] = dev
++                            finally:
++                                unmount(tpath)
++                        except MountError:
++                            pass
+ 
+     for base, parts in sorted(partmap.iteritems()):
+         if parts:
+             print '%s :' % get_dev_path(base)
+@@ -2243,26 +2432,28 @@
+         if not disk.startswith('/dev/') or not stat.S_ISBLK(os.lstat(path).st_mode):
+             return False
+         base = get_dev_name(disk)
+         while len(base):
+-            if os.path.exists(SUPPRESS_PREFIX + base):
++            if os.path.exists(SUPPRESS_PREFIX + base):  # noqa
+                 return True
+             base = base[:-1]
+     except:
+         return False
+ 
++
+ def set_suppress(path):
+     disk = os.path.realpath(path)
+     if not os.path.exists(disk):
+         raise Error('does not exist', path)
+     if not stat.S_ISBLK(os.lstat(path).st_mode):
+         raise Error('not a block device', path)
+     base = get_dev_name(disk)
+ 
+-    with file(SUPPRESS_PREFIX + base, 'w') as f:
++    with file(SUPPRESS_PREFIX + base, 'w') as f:  # noqa
+         pass
+     LOG.info('set suppress flag on %s', base)
+ 
++
+ def unset_suppress(path):
+     disk = os.path.realpath(path)
+     if not os.path.exists(disk):
+         raise Error('does not exist', path)
+@@ -2270,9 +2461,9 @@
+         raise Error('not a block device', path)
+     assert disk.startswith('/dev/')
+     base = get_dev_name(disk)
+ 
+-    fn = SUPPRESS_PREFIX + base
++    fn = SUPPRESS_PREFIX + base  # noqa
+     if not os.path.exists(fn):
+         raise Error('not marked as suppressed', path)
+ 
+     try:
+@@ -2284,18 +2475,24 @@
+ 
+ def main_suppress(args):
+     set_suppress(args.path)
+ 
++
+ def main_unsuppress(args):
+     unset_suppress(args.path)
+ 
++
+ def main_zap(args):
+     for dev in args.dev:
+         zap(dev)
+ 
+ ###########################
+ 
++
+ def setup_statedir(dir):
++    # XXX The following use of globals makes linting
++    # really hard. Global state in Python is iffy and
++    # should be avoided.
+     global STATEDIR
+     STATEDIR = dir
+ 
+     if not os.path.exists(STATEDIR):
+@@ -2311,12 +2508,14 @@
+ 
+     global SUPPRESS_PREFIX
+     SUPPRESS_PREFIX = STATEDIR + '/tmp/suppress-activate.'
+ 
++
+ def setup_sysconfdir(dir):
+     global SYSCONFDIR
+     SYSCONFDIR = dir
+ 
++
+ def parse_args():
+     parser = argparse.ArgumentParser(
+         'ceph-disk',
+         )
+@@ -2588,4 +2787,5 @@
+ 
+ 
+ if __name__ == '__main__':
++    warned_about = {}  # must exist before main() runs: get_partition_type() reads and updates it
+     main()
+--- a/src/ceph.in
++++ b/src/ceph.in
+@@ -105,8 +105,16 @@
+     for mdsdict in infodict.values():
+         l.append(mdsdict['name'])
+     return l
+ 
++# these args must be passed to all child programs
++GLOBAL_ARGS = {
++    'client_id': '--id',
++    'client_name': '--name',
++    'cluster': '--cluster',
++    'cephconf': '--conf',
++}
++
+ def parse_cmdargs(args=None, target=''):
+     # alias: let the line-wrapping be sane
+     AP = argparse.ArgumentParser
+ 
+@@ -338,17 +346,25 @@
+ 
+     return ret
+ 
+ 
+-def ceph_conf(field, name):
++def ceph_conf(parsed_args, field, name):
++    args=['ceph-conf']
++
++    if name:
++        args.extend(['--name', name])
++
++    # add any args in GLOBAL_ARGS
++    for key, val in GLOBAL_ARGS.iteritems():
++        # ignore name in favor of argument name, if any
++        if name and key == 'client_name':
++            continue
++        if getattr(parsed_args, key):
++            args.extend([val, getattr(parsed_args, key)])
++
++    args.extend(['--show-config-value', field])
+     p = subprocess.Popen(
+-        args=[
+-            'ceph-conf',
+-	    '--show-config-value',
+-            field,
+-            '-n',
+-            name,
+-            ],
++        args,
+         stdout=subprocess.PIPE,
+         stderr=subprocess.PIPE)
+     outdata, errdata = p.communicate()
+     if (len(errdata)):
+@@ -537,9 +553,10 @@
+                 sockpath = childargs[1]
+             else:
+                 # try resolve daemon name
+                 try:
+-                    sockpath = ceph_conf('admin_socket', childargs[1])
++                    sockpath = ceph_conf(parsed_args, 'admin_socket',
++                                         childargs[1])
+                 except Exception as e:
+                     print >> sys.stderr, \
+                         'Can\'t get admin socket path: ' + str(e)
+                     return errno.EINVAL
+--- a/src/ceph_common.sh
++++ b/src/ceph_common.sh
+@@ -49,14 +49,15 @@
+     get_conf user "" "user"
+ 
+     #echo host for $name is $host, i am $hostname
+ 
+-    if [ -e "/var/lib/ceph/$type/ceph-$id/upstart" ]; then
++    cluster=$1
++    if [ -e "/var/lib/ceph/$type/$cluster-$id/upstart" ]; then
+ 	return 1
+     fi
+ 
+     # sysvinit managed instance in standard location?
+-    if [ -e "/var/lib/ceph/$type/ceph-$id/sysvinit" ]; then
++    if [ -e "/var/lib/ceph/$type/$cluster-$id/sysvinit" ]; then
+ 	host="$hostname"
+ 	echo "=== $type.$id === "
+ 	return 0
+     fi
+--- a/src/ceph_mon.cc
++++ b/src/ceph_mon.cc
+@@ -42,8 +42,10 @@
+ #include "global/signal_handler.h"
+ 
+ #include "include/assert.h"
+ 
++#include "erasure-code/ErasureCodePlugin.h"
++
+ #define dout_subsys ceph_subsys_mon
+ 
+ Monitor *mon = NULL;
+ 
+@@ -183,8 +185,23 @@
+   cerr << "        where the mon store and keyring are located\n";
+   generic_server_usage();
+ }
+ 
++int preload_erasure_code()
++{
++  string directory = g_conf->osd_pool_default_erasure_code_directory;
++  string plugins = g_conf->osd_erasure_code_plugins;
++  stringstream ss;
++  int r = ErasureCodePluginRegistry::instance().preload(plugins,
++							directory,
++							ss);
++  if (r)
++    derr << ss.str() << dendl;
++  else
++    dout(10) << ss.str() << dendl;
++  return r;
++}
++
+ int main(int argc, const char **argv) 
+ {
+   int err;
+ 
+@@ -415,8 +432,10 @@
+       global_init_postfork_start(g_ceph_context);
+     }
+     common_init_finish(g_ceph_context);
+     global_init_chdir(g_ceph_context);
++    if (preload_erasure_code() < 0)  // any negative result is a failure (was "< -1", which let -1 through)
++      prefork.exit(1);
+   }
+ 
+   MonitorDBStore *store = new MonitorDBStore(g_conf->mon_data);
+ 
+--- a/src/ceph_osd.cc
++++ b/src/ceph_osd.cc
+@@ -47,8 +47,10 @@
+ #include "perfglue/heap_profiler.h"
+ 
+ #include "include/assert.h"
+ 
++#include "erasure-code/ErasureCodePlugin.h"
++
+ #define dout_subsys ceph_subsys_osd
+ 
+ OSD *osd = NULL;
+ 
+@@ -65,8 +67,23 @@
+   derr << "   --debug_osd N   set debug level (e.g. 10)" << dendl;
+   generic_server_usage();
+ }
+ 
++int preload_erasure_code()
++{
++  string directory = g_conf->osd_pool_default_erasure_code_directory;
++  string plugins = g_conf->osd_erasure_code_plugins;
++  stringstream ss;
++  int r = ErasureCodePluginRegistry::instance().preload(plugins,
++							directory,
++							ss);
++  if (r)
++    derr << ss.str() << dendl;
++  else
++    dout(10) << ss.str() << dendl;
++  return r;
++}
++
+ int main(int argc, const char **argv) 
+ {
+   vector<const char*> args;
+   argv_to_vec(argc, argv, args);
+@@ -450,8 +467,11 @@
+   if (mc.build_initial_monmap() < 0)
+     return -1;
+   global_init_chdir(g_ceph_context);
+ 
++  if (preload_erasure_code() < 0)  // any negative result is a failure (was "< -1", which let -1 through)
++    return -1;
++
+   osd = new OSD(g_ceph_context,
+ 		store,
+ 		whoami,
+ 		ms_cluster,
+--- a/src/cls/rgw/cls_rgw.cc
++++ b/src/cls/rgw/cls_rgw.cc
+@@ -669,9 +669,9 @@
+     CLS_LOG(0, "rgw_bucket_complete_op(): entry.name=%s entry.meta.category=%d\n", remove_entry.name.c_str(), remove_entry.meta.category);
+     unaccount_entry(header, remove_entry);
+ 
+     if (op.log_op) {
+-      rc = log_index_operation(hctx, op.name, CLS_RGW_OP_DEL, op.tag, remove_entry.meta.mtime,
++      rc = log_index_operation(hctx, remove_oid_name, CLS_RGW_OP_DEL, op.tag, remove_entry.meta.mtime,
+                                remove_entry.ver, CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker);
+       if (rc < 0)
+         continue;
+     }
+--- a/src/common/Finisher.h
++++ b/src/common/Finisher.h
+@@ -76,8 +76,17 @@
+     ls.clear();
+     if (logger)
+       logger->inc(l_finisher_queue_len);
+   }
++  void queue(list<Context*>& ls) {
++    finisher_lock.Lock();
++    finisher_queue.insert(finisher_queue.end(), ls.begin(), ls.end());
++    finisher_cond.Signal();
++    finisher_lock.Unlock();
++    ls.clear();
++    if (logger)
++      logger->inc(l_finisher_queue_len);
++  }
+   
+   void start();
+   void stop();
+ 
+--- a/src/common/LogClient.cc
++++ b/src/common/LogClient.cc
+@@ -123,8 +123,9 @@
+ }
+ 
+ Message *LogClient::_get_mon_log_message()
+ {
++  assert(log_lock.is_locked());
+    if (log_queue.empty())
+      return NULL;
+ 
+   // only send entries that haven't been sent yet during this mon
+@@ -148,9 +149,9 @@
+ 		<< " sending " << num_send << dendl;
+   assert(num_unsent <= log_queue.size());
+   std::deque<LogEntry>::iterator p = log_queue.begin();
+   std::deque<LogEntry> o;
+-  while (p->seq < last_log_sent) {
++  while (p->seq <= last_log_sent) {
+     ++p;
+     assert(p != log_queue.end());
+   }
+   while (num_send--) {
+--- a/src/common/Makefile.am
++++ b/src/common/Makefile.am
+@@ -12,8 +12,9 @@
+ 	common/admin_socket.cc \
+ 	common/admin_socket_client.cc \
+ 	common/cmdparse.cc \
+ 	common/escape.c \
++	common/io_priority.cc \
+ 	common/Clock.cc \
+ 	common/Throttle.cc \
+ 	common/Timer.cc \
+ 	common/Finisher.cc \
+@@ -155,8 +156,9 @@
+ 	common/perf_counters.h \
+ 	common/OutputDataSocket.h \
+ 	common/admin_socket.h \
+ 	common/admin_socket_client.h \
++	common/random_cache.hpp \
+ 	common/shared_cache.hpp \
+ 	common/tracked_int_ptr.hpp \
+ 	common/simple_cache.hpp \
+ 	common/sharedptr_registry.hpp \
+@@ -174,8 +176,9 @@
+ 	common/TrackedOp.h \
+ 	common/arch.h \
+ 	common/armor.h \
+ 	common/common_init.h \
++	common/io_priority.h \
+ 	common/pipe.h \
+ 	common/code_environment.h \
+ 	common/signal.h \
+ 	common/simple_spin.h \
+--- a/src/common/Thread.cc
++++ b/src/common/Thread.cc
+@@ -15,8 +15,9 @@
+ #include "common/Thread.h"
+ #include "common/code_environment.h"
+ #include "common/debug.h"
+ #include "common/signal.h"
++#include "common/io_priority.h"
+ 
+ #include <dirent.h>
+ #include <errno.h>
+ #include <iostream>
+@@ -28,21 +29,38 @@
+ #include <sys/types.h>
+ 
+ 
+ Thread::Thread()
+-  : thread_id(0)
++  : thread_id(0),
++    pid(0),
++    ioprio_class(-1),
++    ioprio_priority(-1)
+ {
+ }
+ 
+ Thread::~Thread()
+ {
+ }
+ 
+ void *Thread::_entry_func(void *arg) {
+-  void *r = ((Thread*)arg)->entry();
++  void *r = ((Thread*)arg)->entry_wrapper();
+   return r;
+ }
+ 
++void *Thread::entry_wrapper()
++{
++  int p = ceph_gettid(); // may return -ENOSYS on other platforms
++  if (p > 0)
++    pid = p;
++  if (ioprio_class >= 0 &&
++      ioprio_priority >= 0) {
++    ceph_ioprio_set(IOPRIO_WHO_PROCESS,
++		    pid,
++		    IOPRIO_PRIO_VALUE(ioprio_class, ioprio_priority));
++  }
++  return entry();
++}
++
+ const pthread_t &Thread::get_thread_id()
+ {
+   return thread_id;
+ }
+@@ -127,4 +145,16 @@
+ int Thread::detach()
+ {
+   return pthread_detach(thread_id);
+ }
++
++int Thread::set_ioprio(int cls, int prio)
++{
++  // fixme, maybe: this can race with create()
++  ioprio_class = cls;
++  ioprio_priority = prio;
++  if (pid && cls >= 0 && prio >= 0)
++    return ceph_ioprio_set(IOPRIO_WHO_PROCESS,
++			   pid,
++			   IOPRIO_PRIO_VALUE(cls, prio));
++  return 0;
++}
+--- a/src/common/Thread.h
++++ b/src/common/Thread.h
+@@ -20,8 +20,12 @@
+ 
+ class Thread {
+  private:
+   pthread_t thread_id;
++  pid_t pid;
++  int ioprio_class, ioprio_priority;
++
++  void *entry_wrapper();
+ 
+  public:
+   Thread(const Thread& other);
+   const Thread& operator=(const Thread& other);
+@@ -43,7 +47,8 @@
+   int try_create(size_t stacksize);
+   void create(size_t stacksize = 0);
+   int join(void **prval = 0);
+   int detach();
++  int set_ioprio(int cls, int prio);
+ };
+ 
+ #endif
+--- a/src/common/WorkQueue.cc
++++ b/src/common/WorkQueue.cc
+@@ -15,8 +15,9 @@
+ #include <sstream>
+ 
+ #include "include/types.h"
+ #include "include/utime.h"
++#include "common/errno.h"
+ #include "WorkQueue.h"
+ 
+ #include "common/config.h"
+ #include "common/HeartbeatMap.h"
+@@ -32,8 +33,10 @@
+     _lock(lockname.c_str()),  // this should be safe due to declaration order
+     _stop(false),
+     _pause(0),
+     _draining(0),
++    ioprio_class(-1),
++    ioprio_priority(-1),
+     _num_threads(n),
+     last_work_queue(0),
+     processing(0)
+ {
+@@ -155,8 +158,13 @@
+   while (_threads.size() < _num_threads) {
+     WorkThread *wt = new WorkThread(this);
+     ldout(cct, 10) << "start_threads creating and starting " << wt << dendl;
+     _threads.insert(wt);
++
++    int r = wt->set_ioprio(ioprio_class, ioprio_priority);
++    if (r < 0)
++      lderr(cct) << " set_ioprio got " << cpp_strerror(r) << dendl;
++
+     wt->create();
+   }
+ }
+ 
+@@ -254,4 +262,17 @@
+   _draining--;
+   _lock.Unlock();
+ }
+ 
++void ThreadPool::set_ioprio(int cls, int priority)
++{
++  Mutex::Locker l(_lock);
++  ioprio_class = cls;
++  ioprio_priority = priority;
++  for (set<WorkThread*>::iterator p = _threads.begin();
++       p != _threads.end();
++       ++p) {
++    int r = (*p)->set_ioprio(cls, priority);
++    if (r < 0)
++      lderr(cct) << " set_ioprio got " << cpp_strerror(r) << dendl;
++  }
++}
+--- a/src/common/WorkQueue.h
++++ b/src/common/WorkQueue.h
+@@ -32,8 +32,9 @@
+   bool _stop;
+   int _pause;
+   int _draining;
+   Cond _wait_cond;
++  int ioprio_class, ioprio_priority;
+ 
+ public:
+   class TPHandle {
+     friend class ThreadPool;
+@@ -387,8 +388,11 @@
+   /// resume work in thread pool.  must match each pause() call 1:1 to resume.
+   void unpause();
+   /// wait for all work to complete
+   void drain(WorkQueue_* wq = 0);
++
++  /// set io priority
++  void set_ioprio(int cls, int priority);
+ };
+ 
+ class GenContextWQ :
+   public ThreadPool::WorkQueueVal<GenContext<ThreadPool::TPHandle&>*> {
+--- a/src/common/blkdev.cc
++++ b/src/common/blkdev.cc
+@@ -9,9 +9,9 @@
+ int get_block_device_size(int fd, int64_t *psize)
+ {
+ #ifdef BLKGETSIZE64
+   int ret = ::ioctl(fd, BLKGETSIZE64, psize);
+-#elif BLKGETSIZE
++#elif defined(BLKGETSIZE)
+   unsigned long sectors = 0;
+   int ret = ::ioctl(fd, BLKGETSIZE, &sectors);
+   *psize = sectors * 512ULL;
+ #else
+--- a/src/common/config.cc
++++ b/src/common/config.cc
+@@ -878,17 +878,17 @@
+   assert(lock.is_locked());
+   switch (opt->type) {
+     case OPT_INT: {
+       std::string err;
+-      int f = strict_strtol(val, 10, &err);
++      int f = strict_sistrtoll(val, &err);
+       if (!err.empty())
+ 	return -EINVAL;
+       *(int*)opt->conf_ptr(this) = f;
+       return 0;
+     }
+     case OPT_LONGLONG: {
+       std::string err;
+-      long long f = strict_strtoll(val, 10, &err);
++      long long f = strict_sistrtoll(val, &err);
+       if (!err.empty())
+ 	return -EINVAL;
+       *(long long*)opt->conf_ptr(this) = f;
+       return 0;
+@@ -916,17 +916,17 @@
+       }
+       return 0;
+     case OPT_U32: {
+       std::string err;
+-      int f = strict_strtol(val, 10, &err);
++      int f = strict_sistrtoll(val, &err);
+       if (!err.empty())
+ 	return -EINVAL;
+       *(uint32_t*)opt->conf_ptr(this) = f;
+       return 0;
+     }
+     case OPT_U64: {
+       std::string err;
+-      long long f = strict_strtoll(val, 10, &err);
++      long long f = strict_sistrtoll(val, &err);
+       if (!err.empty())
+ 	return -EINVAL;
+       *(uint64_t*)opt->conf_ptr(this) = f;
+       return 0;
+--- a/src/common/config_opts.h
++++ b/src/common/config_opts.h
+@@ -176,8 +176,9 @@
+ OPTION(mon_force_standby_active, OPT_BOOL, true) // should mons force standby-replay mds to be active
+ OPTION(mon_warn_on_old_mons, OPT_BOOL, true) // should mons set health to WARN if part of quorum is old?
+ OPTION(mon_warn_on_legacy_crush_tunables, OPT_BOOL, true) // warn if crush tunables are not optimal
+ OPTION(mon_warn_on_osd_down_out_interval_zero, OPT_BOOL, true) // warn if 'mon_osd_down_out_interval == 0'
++OPTION(mon_warn_on_cache_pools_without_hit_sets, OPT_BOOL, true)
+ OPTION(mon_min_osdmap_epochs, OPT_INT, 500)
+ OPTION(mon_max_pgmap_epochs, OPT_INT, 500)
+ OPTION(mon_max_log_epochs, OPT_INT, 500)
+ OPTION(mon_max_mdsmap_epochs, OPT_INT, 500)
+@@ -433,8 +434,9 @@
+        "technique=reed_sol_van "
+        "k=2 "
+        "m=1 "
+        ) // default properties of osd pool create
++OPTION(osd_erasure_code_plugins, OPT_STR, "jerasure") // list of erasure code plugins
+ OPTION(osd_pool_default_flags, OPT_INT, 0)   // default flags for new pools
+ OPTION(osd_pool_default_flag_hashpspool, OPT_BOOL, true)   // use new pg hashing to prevent pool/pg overlap
+ OPTION(osd_pool_default_hit_set_bloom_fpp, OPT_FLOAT, .05)
+ OPTION(osd_pool_default_cache_target_dirty_ratio, OPT_FLOAT, .4)
+@@ -449,16 +451,19 @@
+ OPTION(osd_tier_default_cache_hit_set_period, OPT_INT, 1200)
+ OPTION(osd_tier_default_cache_hit_set_type, OPT_STR, "bloom")
+ 
+ OPTION(osd_map_dedup, OPT_BOOL, true)
++OPTION(osd_map_max_advance, OPT_INT, 200) // make this < cache_size!
+ OPTION(osd_map_cache_size, OPT_INT, 500)
+ OPTION(osd_map_message_max, OPT_INT, 100)  // max maps per MOSDMap message
+ OPTION(osd_map_share_max_epochs, OPT_INT, 100)  // cap on # of inc maps we send to peers, clients
+ OPTION(osd_op_threads, OPT_INT, 2)    // 0 == no threading
+ OPTION(osd_peering_wq_batch_size, OPT_U64, 20)
+ OPTION(osd_op_pq_max_tokens_per_priority, OPT_U64, 4194304)
+ OPTION(osd_op_pq_min_cost, OPT_U64, 65536)
+ OPTION(osd_disk_threads, OPT_INT, 1)
++OPTION(osd_disk_thread_ioprio_class, OPT_STR, "") // rt realtime be besteffort best effort idle
++OPTION(osd_disk_thread_ioprio_priority, OPT_INT, -1) // 0-7
+ OPTION(osd_recovery_threads, OPT_INT, 1)
+ OPTION(osd_recover_clone_overlap, OPT_BOOL, true)   // preserve clone_overlap during recovery/migration
+ 
+ // Only use clone_overlap for recovery if there are fewer than
+@@ -472,8 +477,9 @@
+ OPTION(osd_snap_trim_thread_timeout, OPT_INT, 60*60*1)
+ OPTION(osd_snap_trim_sleep, OPT_FLOAT, 0)
+ OPTION(osd_scrub_thread_timeout, OPT_INT, 60)
+ OPTION(osd_scrub_finalize_thread_timeout, OPT_INT, 60*10)
++OPTION(osd_scrub_invalid_stats, OPT_BOOL, true)
+ OPTION(osd_remove_thread_timeout, OPT_INT, 60*60)
+ OPTION(osd_command_thread_timeout, OPT_INT, 10*60)
+ OPTION(osd_age, OPT_FLOAT, .8)
+ OPTION(osd_age_time, OPT_INT, 0)
+@@ -508,8 +514,9 @@
+ OPTION(osd_scrub_min_interval, OPT_FLOAT, 60*60*24)    // if load is low
+ OPTION(osd_scrub_max_interval, OPT_FLOAT, 7*60*60*24)  // regardless of load
+ OPTION(osd_scrub_chunk_min, OPT_INT, 5)
+ OPTION(osd_scrub_chunk_max, OPT_INT, 25)
++OPTION(osd_scrub_sleep, OPT_FLOAT, 0)   // sleep between [deep]scrub ops
+ OPTION(osd_deep_scrub_interval, OPT_FLOAT, 60*60*24*7) // once a week
+ OPTION(osd_deep_scrub_stride, OPT_INT, 524288)
+ OPTION(osd_scan_list_ping_tp_interval, OPT_U64, 100)
+ OPTION(osd_auto_weight, OPT_BOOL, false)
+@@ -689,8 +696,11 @@
+ OPTION(keyvaluestore_debug_check_backend, OPT_BOOL, 0) // Expensive debugging check on sync
+ OPTION(keyvaluestore_op_threads, OPT_INT, 2)
+ OPTION(keyvaluestore_op_thread_timeout, OPT_INT, 60)
+ OPTION(keyvaluestore_op_thread_suicide_timeout, OPT_INT, 180)
++OPTION(keyvaluestore_default_strip_size, OPT_INT, 4096) // Only affect new object
++OPTION(keyvaluestore_max_expected_write_size, OPT_U64, 1ULL << 24) // bytes
++OPTION(keyvaluestore_header_cache_size, OPT_INT, 4096)    // Header cache size
+ 
+ // max bytes to search ahead in journal searching for corruption
+ OPTION(journal_max_corrupt_search, OPT_U64, 10<<20)
+ OPTION(journal_block_align, OPT_BOOL, true)
+@@ -712,8 +722,9 @@
+ OPTION(rbd_cache_size, OPT_LONGLONG, 32<<20)         // cache size in bytes
+ OPTION(rbd_cache_max_dirty, OPT_LONGLONG, 24<<20)    // dirty limit in bytes - set to 0 for write-through caching
+ OPTION(rbd_cache_target_dirty, OPT_LONGLONG, 16<<20) // target dirty limit in bytes
+ OPTION(rbd_cache_max_dirty_age, OPT_FLOAT, 1.0)      // seconds in cache before writeback starts
++OPTION(rbd_cache_max_dirty_object, OPT_INT, 0)       // dirty limit for objects - set to 0 for auto calculate from rbd_cache_size
+ OPTION(rbd_cache_block_writes_upfront, OPT_BOOL, false) // whether to block writes to the cache before the aio_write call completes (true), or block before the aio completion is called (false)
+ OPTION(rbd_concurrent_management_ops, OPT_INT, 10) // how many operations can be in flight for a management operation like deleting or resizing an image
+ OPTION(rbd_balance_snap_reads, OPT_BOOL, false)
+ OPTION(rbd_localize_snap_reads, OPT_BOOL, false)
+--- /dev/null
++++ b/src/common/io_priority.cc
+@@ -0,0 +1,54 @@
++// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
++// vim: ts=8 sw=2 smarttab
++/*
++ * Ceph - scalable distributed file system
++ *
++ * Copyright (C) 2012 Red Hat
++ *
++ * This is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License version 2.1, as published by the Free Software
++ * Foundation.  See file COPYING.
++ *
++ */
++
++#include <sys/types.h>
++#include <unistd.h>
++#include <sys/syscall.h>   /* For SYS_xxx definitions */
++#include <algorithm>
++#include <errno.h>
++
++#include "common/errno.h"
++#include "io_priority.h"
++
++pid_t ceph_gettid(void)   // id of the calling thread as reported by the Linux gettid syscall
++{
++#ifdef __linux__
++  return syscall(SYS_gettid);   // invoked via syscall() rather than a libc gettid() wrapper
++#else
++  return -ENOSYS;   // other platforms: a negative errno is returned in place of an id
++#endif
++}
++
++int ceph_ioprio_set(int whence, int who, int ioprio)   // thin wrapper over the Linux ioprio_set syscall
++{
++#ifdef __linux__
++  return syscall(SYS_ioprio_set, whence, who, ioprio);   // args passed through; failure is -1 with errno set
++#else
++  return -ENOSYS;   // NOTE(review): this branch returns a negative errno, unlike the -1/errno Linux branch
++#endif
++}
++
++int ceph_ioprio_string_to_class(const std::string& s)  // name -> IOPRIO_CLASS_*, or -EINVAL
++{
++  std::string l = s;  // copy first: transform() needs a destination that already holds s.size() chars
++  std::transform(l.begin(), l.end(), l.begin(), ::tolower);  // lower-case in place for a case-insensitive match
++
++  if (l == "idle")
++    return IOPRIO_CLASS_IDLE;
++  if (l == "be" || l == "besteffort" || l == "best effort")
++    return IOPRIO_CLASS_BE;
++  if (l == "rt" || l == "realtime" || l == "real time")
++    return IOPRIO_CLASS_RT;
++  return -EINVAL;  // unrecognized class name
++}
+--- /dev/null
++++ b/src/common/io_priority.h
+@@ -0,0 +1,44 @@
++// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
++// vim: ts=8 sw=2 smarttab
++/*
++ * Ceph - scalable distributed file system
++ *
++ * Copyright (C) 2012 Red Hat
++ *
++ * This is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License version 2.1, as published by the Free Software
++ * Foundation.  See file COPYING.
++ *
++ */
++
++#ifndef CEPH_COMMON_IO_PRIORITY_H
++#define CEPH_COMMON_IO_PRIORITY_H
++
++#include <string>
++
++extern pid_t ceph_gettid();
++
++#ifndef IOPRIO_WHO_PROCESS
++# define IOPRIO_WHO_PROCESS 1
++#endif
++#ifndef IOPRIO_PRIO_VALUE
++# define IOPRIO_CLASS_SHIFT 13
++# define IOPRIO_PRIO_VALUE(class, data) \
++		(((class) << IOPRIO_CLASS_SHIFT) | (data))
++#endif
++#ifndef IOPRIO_CLASS_RT
++# define IOPRIO_CLASS_RT 1
++#endif
++#ifndef IOPRIO_CLASS_BE
++# define IOPRIO_CLASS_BE 2
++#endif
++#ifndef IOPRIO_CLASS_IDLE
++# define IOPRIO_CLASS_IDLE 3
++#endif
++
++extern int ceph_ioprio_set(int whence, int who, int ioprio);
++
++extern int ceph_ioprio_string_to_class(const std::string& s);
++
++#endif
+--- /dev/null
++++ b/src/common/random_cache.hpp
+@@ -0,0 +1,111 @@
++// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
++// vim: ts=8 sw=2 smarttab
++/*
++ * Ceph - scalable distributed file system
++ *
++ * Copyright (C) 2014 UnitedStack <haomai at unitedstack.com>
++ *
++ * Author: Haomai Wang <haomaiwang at gmail.com>
++ *
++ * This is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License version 2.1, as published by the Free Software
++ * Foundation.  See file COPYING.
++ *
++ */
++
++#ifndef CEPH_RANDOMCACHE_H
++#define CEPH_RANDOMCACHE_H
++
++#include "common/Mutex.h"
++#include "include/compat.h"
++#include "include/unordered_map.h"
++
++
++// Although this is a random cache implementation, it still tries to make
++// trimming reasonable. Each item carries its lookup frequency; when the cache
++// is full a few items are sampled and their frequencies compared, and the
++// least frequently used of the sampled items are evicted.
++template <class K, class V>
++class RandomCache {
++  // The first element of the pair is the item's lookup frequency, used for eviction
++  ceph::unordered_map<K, pair<uint64_t, V> > contents;
++  Mutex lock;
++  uint64_t max_size;
++  K last_trim_key;
++
++  // When the cache is full, try to evict this many items at once
++  static const uint64_t EVICT_COUNT = 5;
++  // To keep the cost of comparing frequencies low, only this many items
++  // are sampled per item to evict.
++  static const uint64_t COMPARE_COUNT = 3;
++
++  // Evict up to evict_count items in one call, sampling entries starting at
++  // last_trim_key. Callers in this class hold lock. A no-op on an empty cache.
++  void trim_cache(uint64_t evict_count) {
++    if (contents.empty())
++      return;  // begin() == end() here and must never be dereferenced
++    typename ceph::unordered_map<K, pair<uint64_t, V> >::iterator it = contents.find(last_trim_key);
++    uint64_t total_compare = evict_count * COMPARE_COUNT;
++    map<uint64_t, K> candidates;
++
++    while (total_compare--) {
++      if (it == contents.end())
++        it = contents.begin();  // wrap around to the first entry
++      candidates[it->second.first] = it->first;  // keyed by frequency: equal frequencies overwrite
++      ++it;
++    }
++    if (it != contents.end())
++      last_trim_key = it->first;
++    else
++      last_trim_key = contents.begin()->first;
++
++    for (typename map<uint64_t, K>::iterator j = candidates.begin(); j != candidates.end(); j++) {
++      contents.erase(j->second);  // candidates iterate in ascending frequency order
++      evict_count--;
++      if (!evict_count)
++        break;
++    }
++  }
++
++ public:
++  RandomCache(size_t max_size=20) : lock("RandomCache::lock"),
++                                    max_size(max_size), last_trim_key() {}
++  ~RandomCache() {
++    contents.clear();
++  }
++
++  void clear(K key) {  // drop a single key, if present
++    Mutex::Locker l(lock);
++    contents.erase(key);
++  }
++
++  void set_size(size_t new_size) {  // change the capacity, trimming if already at or over it
++    Mutex::Locker l(lock);
++    max_size = new_size;
++    if (max_size <= contents.size()) {
++      trim_cache(contents.size() - max_size);
++    }
++  }
++
++  bool lookup(K key, V *out) {  // on a hit: bump the frequency, copy the value to *out, return true
++    Mutex::Locker l(lock);
++    typename ceph::unordered_map<K, pair<uint64_t, V> >::iterator it = contents.find(key);
++    if (it != contents.end()) {
++      it->second.first++;
++      *out = it->second.second;
++      return true;
++    }
++    return false;
++  }
++
++  void add(K key, V value) {  // insert or overwrite; the entry's frequency (re)starts at 1
++    Mutex::Locker l(lock);
++    if (max_size <= contents.size()) {
++      trim_cache(EVICT_COUNT);
++    }
++    contents[key] = make_pair(1, value);
++  }
++};
++
++#endif
+--- a/src/common/str_map.cc
++++ b/src/common/str_map.cc
+@@ -23,9 +23,9 @@
+ 
+ using namespace std;
+ 
+ int get_str_map(const string &str,
+-                stringstream &ss,
++                ostream &ss,
+                 map<string,string> *str_map)
+ {
+   json_spirit::mValue json;
+   try {
+--- a/src/common/strtol.cc
++++ b/src/common/strtol.cc
+@@ -16,8 +16,11 @@
+ #include <limits.h>
+ #include <sstream>
+ #include <stdlib.h>
+ #include <string>
++extern "C" {
++#include <stdint.h>
++}
+ 
+ using std::ostringstream;
+ 
+ long long strict_strtoll(const char *str, int base, std::string *err)
+@@ -123,4 +126,44 @@
+   }
+   *err = "";
+   return ret;
+ }
++
++uint64_t strict_sistrtoll(const char *str, std::string *err)
++{
++  // Parse a non-negative base-10 integer with an optional binary unit suffix:
++  // B (x1), K (<<10), M (<<20), G (<<30), T (<<40), P (<<50), E (<<60).
++  // On failure *err receives a message.  As before, success of the numeric
++  // parse is detected by *err being empty after strict_strtoll() returns.
++  std::string s(str);
++  if (s.size() == 0) {
++    *err = "strict_sistrtoll: value not specified";
++    return 0;
++  }
++
++  // The suffix's index in the unit table, times 10, is the shift to apply;
++  // any other trailing character means "no suffix" and is left for the parser.
++  const std::string::size_type unit = std::string("BKMGTPE").find(s[s.size()-1]);
++  const int m = (unit == std::string::npos) ? -1 : static_cast<int>(unit) * 10;
++  if (m >= 0)
++    s.erase(s.size()-1);  // strip the suffix before numeric parsing
++
++  const long long ll = strict_strtoll(s.c_str(), 10, err);
++  if (!err->empty())
++    return ll;  // parse error: hand back strict_strtoll's result untouched
++
++  if (ll < 0) {
++    // a negative input used to wrap silently to a huge unsigned value
++    *err = "strict_sistrtoll: value should not be negative";
++    return 0;
++  }
++
++  const uint64_t r = ll;
++  if (m > 0 && r > (~uint64_t(0) >> m)) {
++    // the shift used to discard the high bits silently
++    *err = "strict_sistrtoll: value seems to be too large";
++    return 0;
++  }
++
++  // m <= 0 covers both the 'B' suffix and the no-suffix case: no scaling
++  return (m > 0) ? (r << m) : r;
++}
+--- a/src/common/strtol.h
++++ b/src/common/strtol.h
+@@ -15,8 +15,11 @@
+ #ifndef CEPH_COMMON_STRTOL_H
+ #define CEPH_COMMON_STRTOL_H
+ 
+ #include <string>
++extern "C" {
++#include <stdint.h>
++}
+ 
+ long long strict_strtoll(const char *str, int base, std::string *err);
+ 
+ int strict_strtol(const char *str, int base, std::string *err);
+@@ -24,5 +27,7 @@
+ double strict_strtod(const char *str, std::string *err);
+ 
+ float strict_strtof(const char *str, std::string *err);
+ 
++uint64_t strict_sistrtoll(const char *str, std::string *err);
++
+ #endif
+--- a/src/crush/CrushWrapper.cc
++++ b/src/crush/CrushWrapper.cc
+@@ -9,34 +9,56 @@
+ #define dout_subsys ceph_subsys_crush
+ 
+ bool CrushWrapper::has_v2_rules() const
+ {
+-  // check rules for use of indep or new SET_* rule steps
+   for (unsigned i=0; i<crush->max_rules; i++) {
+-    crush_rule *r = crush->rules[i];
+-    if (!r)
+-      continue;
+-    for (unsigned j=0; j<r->len; j++) {
+-      if (r->steps[j].op == CRUSH_RULE_CHOOSE_INDEP ||
+-	  r->steps[j].op == CRUSH_RULE_CHOOSELEAF_INDEP ||
+-	  r->steps[j].op == CRUSH_RULE_SET_CHOOSE_TRIES ||
+-	  r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_TRIES)
+-	return true;
++    if (is_v2_rule(i)) {
++      return true;
++    }
++  }
++  return false;
++}
++
++bool CrushWrapper::is_v2_rule(unsigned ruleid) const
++{
++  // check rule for use of indep or new SET_* rule steps
++  if (ruleid >= crush->max_rules)
++    return false;
++  crush_rule *r = crush->rules[ruleid];
++  if (!r)
++    return false;
++  for (unsigned j=0; j<r->len; j++) {
++    if (r->steps[j].op == CRUSH_RULE_CHOOSE_INDEP ||
++	r->steps[j].op == CRUSH_RULE_CHOOSELEAF_INDEP ||
++	r->steps[j].op == CRUSH_RULE_SET_CHOOSE_TRIES ||
++	r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_TRIES) {
++      return true;
+     }
+   }
+   return false;
+ }
+ 
+ bool CrushWrapper::has_v3_rules() const
+ {
+-  // check rules for use of SET_CHOOSELEAF_VARY_R step
+   for (unsigned i=0; i<crush->max_rules; i++) {
+-    crush_rule *r = crush->rules[i];
+-    if (!r)
+-      continue;
+-    for (unsigned j=0; j<r->len; j++) {
+-      if (r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_VARY_R)
+-	return true;
++    if (is_v3_rule(i)) {
++      return true;
++    }
++  }
++  return false;
++}
++
++bool CrushWrapper::is_v3_rule(unsigned ruleid) const
++{
++  // check rule for use of SET_CHOOSELEAF_VARY_R step
++  if (ruleid >= crush->max_rules)
++    return false;
++  crush_rule *r = crush->rules[ruleid];
++  if (!r)
++    return false;
++  for (unsigned j=0; j<r->len; j++) {
++    if (r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_VARY_R) {
++      return true;
+     }
+   }
+   return false;
+ }
+@@ -793,8 +815,61 @@
+   have_rmaps = false;
+   return rno;
+ }
+ 
++int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap)
++{
++  if (ruleno >= crush->max_rules)
++    return -ENOENT;  // rule id out of range
++  if (crush->rules[ruleno] == NULL)
++    return -ENOENT;  // empty rule slot
++  crush_rule *rule = crush->rules[ruleno];
++
++  // build a normalized weight map for each TAKE step in the rule, and accumulate them into *pmap
++  for (unsigned i=0; i<rule->len; ++i) {
++    map<int,float> m;  // weights gathered for this step only
++    float sum = 0;
++    if (rule->steps[i].op == CRUSH_RULE_TAKE) {
++      int n = rule->steps[i].arg1;
++      if (n >= 0) {  // non-negative id: the step takes a single device, which gets the whole share
++	m[n] = 1.0;
++	sum = 1.0;
++      } else {
++	list<int> q;
++	q.push_back(n);
++	// breadth-first walk of the bucket tree below n
++	while (!q.empty()) {
++	  int bno = q.front();
++	  q.pop_front();
++	  crush_bucket *b = crush->buckets[-1-bno];  // NOTE(review): index is not range-checked here
++	  assert(b);
++	  for (unsigned j=0; j<b->size; ++j) {
++	    int item_id = b->items[j];
++	    if (item_id >= 0) //it's an OSD
++	    {
++	      float w = crush_get_bucket_item_weight(b, j);
++	      m[item_id] = w;
++	      sum += w;
++	    }
++	    else //not an OSD, expand the child later
++	      q.push_back(item_id);
++	  }
++	}
++      }
++    }
++    for (map<int,float>::iterator p = m.begin(); p != m.end(); ++p) {
++      map<int,float>::iterator q = pmap->find(p->first);
++      if (q == pmap->end()) {
++	(*pmap)[p->first] = p->second / sum;  // NOTE(review): sum is 0 if every gathered weight is 0
++      } else {
++	q->second += p->second / sum;  // same OSD reached by an earlier TAKE: shares add up
++      }
++    }
++  }
++
++  return 0;
++}
++
+ int CrushWrapper::remove_rule(int ruleno)
+ {
+   if (ruleno >= (int)crush->max_rules)
+     return -ENOENT;
+--- a/src/crush/CrushWrapper.h
++++ b/src/crush/CrushWrapper.h
+@@ -215,8 +215,10 @@
+   }
+   bool has_v2_rules() const;
+   bool has_v3_rules() const;
+ 
++  bool is_v2_rule(unsigned ruleid) const;
++  bool is_v3_rule(unsigned ruleid) const;
+ 
+   // bucket types
+   int get_num_type_names() const {
+     return type_map.size();
+@@ -630,8 +632,20 @@
+     if (IS_ERR(s)) return PTR_ERR(s);
+     return s->arg2;
+   }
+ 
++  /**
++   * calculate a map of osds to weights for a given rule
++   *
++   * Generate a map of which OSDs get how much relative weight for a
++   * given rule.
++   *
++   * @param ruleno [in] rule id
++   * @param pmap [out] map of osd to weight
++   * @return 0 for success, or negative error code
++   */
++  int get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap);
++
+   /* modifiers */
+   int add_rule(int len, int ruleset, int type, int minsize, int maxsize, int ruleno) {
+     if (!crush) return -ENOENT;
+     crush_rule *n = crush_make_rule(len, ruleset, type, minsize, maxsize);
+--- a/src/erasure-code/ErasureCodeInterface.h
++++ b/src/erasure-code/ErasureCodeInterface.h
+@@ -166,9 +166,9 @@
+      *
+      * @param [in] name of the ruleset to create
+      * @param [in] crush crushmap in which the ruleset is created
+      * @param [out] ss contains informative messages when an error occurs
+-     * @return **0** on success or a negative errno on error.
++     * @return a ruleset on success or a negative errno on error.
+      */
+     virtual int create_ruleset(const string &name,
+ 			       CrushWrapper &crush,
+ 			       ostream *ss) const = 0;
+--- a/src/erasure-code/ErasureCodePlugin.cc
++++ b/src/erasure-code/ErasureCodePlugin.cc
+@@ -3,8 +3,9 @@
+ /*
+  * Ceph - scalable distributed file system
+  *
+  * Copyright (C) 2013,2014 Cloudwatt <libre.licensing at cloudwatt.com>
++ * Copyright (C) 2014 Red Hat <contact at redhat.com>
+  *
+  * Author: Loic Dachary <loic at dachary.org>
+  *
+  *  This library is free software; you can redistribute it and/or
+@@ -18,8 +19,9 @@
+ #include <dlfcn.h>
+ 
+ #include "ErasureCodePlugin.h"
+ #include "common/errno.h"
++#include "include/str_list.h"
+ 
+ #define PLUGIN_PREFIX "libec_"
+ #define PLUGIN_SUFFIX ".so"
+ #define PLUGIN_INIT_FUNCTION "__erasure_code_init"
+@@ -129,7 +131,33 @@
+   }
+ 
+   (*plugin)->library = library;
+ 
++  ss << __func__ << ": " << plugin_name << " ";
++
+   return 0;
+ }
+ 
++int ErasureCodePluginRegistry::preload(const std::string &plugins,
++				       const std::string &directory,
++				       ostream &ss)
++{
++  map<string,string> profile;
++  profile["directory"] = directory;  // passed to load() and factory() through the profile
++  list<string> plugins_list;
++  get_str_list(plugins, plugins_list);  // fills plugins_list from the plugins string
++  for (list<string>::iterator i = plugins_list.begin();
++       i != plugins_list.end();
++       i++) {
++    ErasureCodePlugin *plugin;
++    int r = load(*i, profile, &plugin, ss);
++    if (r)
++      return r;  // stop at the first plugin that fails to load
++
++    ErasureCodeInterfaceRef erasure_code;  // created below and then discarded
++    profile["technique"] = "reed_sol_van";  // NOTE(review): hard-coded for every plugin in the list
++    r = plugin->factory(profile, &erasure_code);
++    if (r)
++      return r;  // stop at the first plugin whose factory fails
++  }
++  return 0;
++}
+--- a/src/erasure-code/ErasureCodePlugin.h
++++ b/src/erasure-code/ErasureCodePlugin.h
+@@ -66,8 +66,11 @@
+ 	     const map<std::string,std::string> &parameters,
+ 	     ErasureCodePlugin **plugin,
+ 	     ostream &ss);
+ 
++    int preload(const std::string &plugins,
++		const std::string &directory,
++		ostream &ss);
+   };
+ }
+ 
+ #endif
+--- a/src/erasure-code/jerasure/ErasureCodeJerasure.cc
++++ b/src/erasure-code/jerasure/ErasureCodeJerasure.cc
+@@ -43,10 +43,14 @@
+ int ErasureCodeJerasure::create_ruleset(const string &name,
+ 					CrushWrapper &crush,
+ 					ostream *ss) const
+ {
+-  return crush.add_simple_ruleset(name, ruleset_root, ruleset_failure_domain,
+-				  "indep", pg_pool_t::TYPE_ERASURE, ss);
++  int ruleid = crush.add_simple_ruleset(name, ruleset_root, ruleset_failure_domain,
++					"indep", pg_pool_t::TYPE_ERASURE, ss);
++  if (ruleid < 0)
++    return ruleid;
++  else
++    return crush.get_rule_mask_ruleset(ruleid);
+ }
+ 
+ void ErasureCodeJerasure::init(const map<string,string> &parameters)
+ {
+--- a/src/include/atomic.h
++++ b/src/include/atomic.h
+@@ -20,12 +20,68 @@
+ # include "acconfig.h"
+ #endif
+ 
+ #include <stdlib.h>
++#include "include/Spinlock.h"
++
++namespace ceph {
++  template <class T>
++  class atomic_spinlock_t {  // counter of type T whose every access is serialized by a ceph spinlock
++    mutable ceph_spinlock_t lock;  // mutable so that the const read() can take it
++    T val;
++  public:
++    atomic_spinlock_t(T i=0)
++      : val(i) {
++      ceph_spin_init(&lock);
++    }
++    ~atomic_spinlock_t() {
++      ceph_spin_destroy(&lock);
++    }
++    void set(T v) {  // overwrite the value
++      ceph_spin_lock(&lock);
++      val = v;
++      ceph_spin_unlock(&lock);
++    }
++    T inc() {  // increment and return the new value
++      ceph_spin_lock(&lock);
++      T r = ++val;
++      ceph_spin_unlock(&lock);
++      return r;
++    }
++    T dec() {  // decrement and return the new value
++      ceph_spin_lock(&lock);
++      T r = --val;
++      ceph_spin_unlock(&lock);
++      return r;
++    }
++    void add(T d) {  // add d; nothing is returned
++      ceph_spin_lock(&lock);
++      val += d;
++      ceph_spin_unlock(&lock);
++    }
++    void sub(T d) {  // subtract d; nothing is returned
++      ceph_spin_lock(&lock);
++      val -= d;
++      ceph_spin_unlock(&lock);
++    }
++    T read() const {  // return a copy of the value taken under the lock
++      T ret;
++      ceph_spin_lock(&lock);
++      ret = val;
++      ceph_spin_unlock(&lock);
++      return ret;
++    }
++  private:
++    // forbid copying (declared here, no definition in this header)
++    atomic_spinlock_t(const atomic_spinlock_t<T> &other);
++    atomic_spinlock_t &operator=(const atomic_spinlock_t<T> &rhs);
++  };
++}
+ 
+ #ifndef NO_ATOMIC_OPS
+ 
+ // libatomic_ops implementation
++#define AO_REQUIRE_CAS
+ #include <atomic_ops.h>
+ 
+ // reinclude our assert to clobber the system one
+ #include "include/assert.h"
+@@ -34,9 +90,9 @@
+   class atomic_t {
+     AO_t val;
+   public:
+     atomic_t(AO_t i=0) : val(i) {}
+-    void set(size_t v) {
++    void set(AO_t v) {
+       AO_store(&val, v);
+     }
+     AO_t inc() {
+       return AO_fetch_and_add1(&val) + 1;
+@@ -46,10 +102,10 @@
+     }
+     void add(AO_t add_me) {
+       AO_fetch_and_add(&val, add_me);
+     }
+-    void sub(int sub_me) {
+-      int negsub = 0 - sub_me;
++    void sub(AO_t sub_me) {
++      AO_t negsub = 0 - sub_me;
+       AO_fetch_and_add_write(&val, (AO_t)negsub);
+     }
+     AO_t read() const {
+       // cast away const on the pointer.  this is only needed to build
+@@ -61,65 +117,26 @@
+     // forbid copying
+     atomic_t(const atomic_t &other);
+     atomic_t &operator=(const atomic_t &rhs);
+   };
++
++#if SIZEOF_AO_T == 8
++  typedef atomic_t atomic64_t;
++#else
++  typedef atomic_spinlock_t<unsigned long long> atomic64_t;
++#endif
++
+ }
++
+ #else
+ /*
+  * crappy slow implementation that uses a pthreads spinlock.
+  */
+ #include "include/Spinlock.h"
+ 
+ namespace ceph {
+-  class atomic_t {
+-    mutable ceph_spinlock_t lock;
+-    signed long val;
+-  public:
+-    atomic_t(int i=0)
+-      : val(i) {
+-      ceph_spin_init(&lock);
+-    }
+-    ~atomic_t() {
+-      ceph_spin_destroy(&lock);
+-    }
+-    void set(size_t v) {
+-      ceph_spin_lock(&lock);
+-      val = v;
+-      ceph_spin_unlock(&lock);
+-    }
+-    int inc() {
+-      ceph_spin_lock(&lock);
+-      int r = ++val;
+-      ceph_spin_unlock(&lock);
+-      return r;
+-    }
+-    int dec() {
+-      ceph_spin_lock(&lock);
+-      int r = --val;
+-      ceph_spin_unlock(&lock);
+-      return r;
+-    }
+-    void add(int d) {
+-      ceph_spin_lock(&lock);
+-      val += d;
+-      ceph_spin_unlock(&lock);
+-    }
+-    void sub(int d) {
+-      ceph_spin_lock(&lock);
+-      val -= d;
+-      ceph_spin_unlock(&lock);
+-    }
+-    int read() const {
+-      signed long ret;
+-      ceph_spin_lock(&lock);
+-      ret = val;
+-      ceph_spin_unlock(&lock);
+-      return ret;
+-    }
+-  private:
+-    // forbid copying
+-    atomic_t(const atomic_t &other);
+-    atomic_t &operator=(const atomic_t &rhs);
+-  };
++  typedef atomic_spinlock_t<unsigned> atomic_t;
++  typedef atomic_spinlock_t<unsigned long long> atomic64_t;
+ }
++
+ #endif
+ #endif
+--- a/src/include/intarith.h
++++ b/src/include/intarith.h
+@@ -27,9 +27,9 @@
+ # define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))
+ #endif
+ 
+ #ifndef ROUND_UP_TO
+-# define ROUND_UP_TO(n, d) (((n)+(d)-1) & ~((d)-1))
++# define ROUND_UP_TO(n, d) ((n)%(d) ? ((n)+(d)-(n)%(d)) : (n))
+ #endif
+ 
+ #ifndef SHIFT_ROUND_UP
+ # define SHIFT_ROUND_UP(x,y) (((x)+(1<<(y))-1) >> (y))
+--- a/src/include/rbd/librbd.h
++++ b/src/include/rbd/librbd.h
+@@ -38,8 +38,9 @@
+ #define LIBRBD_VERSION_CODE LIBRBD_VERSION(LIBRBD_VER_MAJOR, LIBRBD_VER_MINOR, LIBRBD_VER_EXTRA)
+ 
+ #define LIBRBD_SUPPORTS_WATCH 0
+ #define LIBRBD_SUPPORTS_AIO_FLUSH 1
++#define LIBRBD_SUPPORTS_INVALIDATE 1
+ 
+ typedef void *rbd_snap_t;
+ typedef void *rbd_image_t;
+ 
+@@ -375,8 +376,16 @@
+  * @returns 0 on success, negative error code on failure
+  */
+ int rbd_aio_flush(rbd_image_t image, rbd_completion_t c);
+ 
++/**
++ * Drop any cached data for an image
++ *
++ * @param image the image to invalidate cached data for
++ * @returns 0 on success, negative error code on failure
++ */
++int rbd_invalidate_cache(rbd_image_t image);
++
+ #ifdef __cplusplus
+ }
+ #endif
+ 
+--- a/src/include/rbd/librbd.hpp
++++ b/src/include/rbd/librbd.hpp
+@@ -215,8 +215,16 @@
+    * @returns 0 on success, negative error code on failure
+    */
+   int aio_flush(RBD::AioCompletion *c);
+ 
++  /**
++   * Drop any cached data for this image
++   *
++   * Takes no arguments: it acts on the image this Image object refers to.
++   * @returns 0 on success, negative error code on failure
++   */
++  int invalidate_cache();
++
+ private:
+   friend class RBD;
+ 
+   Image(const Image& rhs);
+--- a/src/include/str_map.h
++++ b/src/include/str_map.h
+@@ -52,8 +52,8 @@
+  * @param [out] str_map key/value pairs read from str
+  * @return **0** on success or a -EINVAL on error.
+  */
+ extern int get_str_map(const std::string &str,
+-		       std::stringstream &ss,
++		       std::ostream &ss,
+ 		       std::map<std::string,std::string> *str_map);
+ 
+ #endif
+--- a/src/init-ceph.in
++++ b/src/init-ceph.in
+@@ -30,8 +30,9 @@
+ 
+ usage_exit() {
+     echo "usage: $0 [options] {start|stop|restart|condrestart} [mon|osd|mds]..."
+     printf "\t-c ceph.conf\n"
++    printf "\t--cluster [cluster name]\tdefine the cluster name\n"
+     printf "\t--valgrind\trun via valgrind\n"
+     printf "\t--hostname [hostname]\toverride hostname lookup\n"
+     exit
+ }
+@@ -112,8 +113,10 @@
+ monaddr=
+ dofsmount=1
+ dofsumount=0
+ verbose=0
++use_default_conf=1
++
+ 
+ while echo $1 | grep -q '^-'; do     # FIXME: why not '^-'?
+ case $1 in
+     -v | --verbose)
+@@ -152,10 +155,17 @@
+     --conf | -c)
+ 	    [ -z "$2" ] && usage_exit
+ 	    options="$options $1"
+ 	    shift
++        use_default_conf=0
+ 	    conf=$1
+ 	    ;;
++    --cluster )
++	    [ -z "$2" ] && usage_exit
++	    options="$options $1"
++	    shift
++	    cluster=$1
++	    ;;
+     --hostname )
+ 	    [ -z "$2" ] && usage_exit
+ 	    options="$options $1"
+ 	    shift
+@@ -169,8 +179,22 @@
+ options="$options $1"
+ shift
+ done
+ 
++
++# if `--cluster` was not passed in, fallback to looking at the config name
++if [ -z "$cluster" ]; then
++    cluster=`echo $conf | awk -F'/' '{print $(NF)}' | cut -d'.' -f 1`
++else
++    # if we were told to use a given cluster name then $conf needs to be updated
++    # but just define it if `--conf` was not specified, otherwise we would be silently
++    # overriding $conf even if it was defined with `--conf`
++    if [ $use_default_conf -eq 1 ]; then
++        conf="/etc/ceph/$cluster.conf"
++    fi
++fi
++
++
+ verify_conf
+ 
+ command=$1
+ [ -n "$*" ] && shift
+@@ -188,13 +212,12 @@
+ 
+ for name in $what; do
+     type=`echo $name | cut -c 1-3`   # e.g. 'mon', if $item is 'mon1'
+     id=`echo $name | cut -c 4- | sed 's/^\\.//'`
+-    cluster=`echo $conf | awk -F'/' '{print $(NF)}' | cut -d'.' -f 1`
+     num=$id
+     name="$type.$id"
+ 
+-    check_host || continue
++    check_host $cluster || continue
+ 
+     binary="$BINDIR/ceph-$type"
+     cmd="$binary -i $id"
+ 
+@@ -234,9 +257,9 @@
+     # conf file
+     cmd="$cmd -c $conf"
+ 
+     if echo $name | grep -q ^osd; then
+-	get_conf osd_data "/var/lib/ceph/osd/ceph-$id" "osd data"
++	get_conf osd_data "/var/lib/ceph/osd/$cluster-$id" "osd data"
+ 	get_conf fs_path "$osd_data" "fs path"  # mount point defaults so osd data
+         get_conf fs_devs "" "devs"
+ 	if [ -z "$fs_devs" ]; then
+ 	    # try to fallback to old keys
+@@ -334,9 +357,9 @@
+ 		get_conf update_crush "" "osd crush update on start"
+ 		if [ "${update_crush:-1}" = "1" -o "${update_crush:-1}" = "true" ]; then
+ 		    # update location in crush
+ 		    get_conf osd_location_hook "$BINDIR/ceph-crush-location" "osd crush location hook"
+-		    osd_location=`$osd_location_hook --cluster ceph --id $id --type osd`
++		    osd_location=`$osd_location_hook --cluster $cluster --id $id --type osd`
+ 		    get_conf osd_weight "" "osd crush initial weight"
+ 		    defaultweight="$(df -P -k $osd_data/. | tail -1 | awk '{ print sprintf("%.2f",$2/1073741824) }')"
+ 		    get_conf osd_keyring "$osd_data/keyring" "keyring"
+ 		    do_cmd "timeout 30 $BINDIR/ceph -c $conf --name=osd.$id --keyring=$osd_keyring osd crush create-or-move -- $id ${osd_weight:-${defaultweight:-1}} $osd_location"
+@@ -365,9 +388,9 @@
+ 		# in creating these keys.
+ 		get_conf mon_data "/var/lib/ceph/mon/ceph-$id" "mon data"
+ 		if [ "$mon_data" = "/var/lib/ceph/mon/ceph-$id" -a "$asok" = "/var/run/ceph/ceph-mon.$id.asok" ]; then
+ 		    echo Starting ceph-create-keys on $host...
+-		    cmd2="$SBINDIR/ceph-create-keys -i $id 2> /dev/null &"
++		    cmd2="$SBINDIR/ceph-create-keys --cluster $cluster -i $id 2> /dev/null &"
+ 		    do_cmd "$cmd2"
+ 		fi
+ 	    fi
+ 
+--- a/src/init-radosgw.sysv
++++ b/src/init-radosgw.sysv
+@@ -14,8 +14,9 @@
+ . /etc/rc.d/init.d/functions
+ 
+ daemon_is_running() {
+     daemon=$1
++    sleep 1
+     if pidof $daemon >/dev/null; then
+         echo "$daemon is running."
+         exit 0
+     else
+@@ -43,8 +44,12 @@
+     [ $VERBOSE -eq 1 ] && echo "$RADOSGW could not start, it is not executable."
+     exit 1
+ fi
+ 
++# detect systemd
++SYSTEMD=0
++grep -qs systemd /proc/1/comm && SYSTEMD=1
++
+ case "$1" in
+     start)
+         echo "Starting radosgw instance(s)..."
+         for name in `ceph-conf --list-sections $PREFIX`;
+@@ -78,10 +83,14 @@
+                 touch "$log_file"
+                 chown $user $log_file
+             fi
+ 
+-            #start-stop-daemon --start -u $user -x $RADOSGW -- -n $name
+-            daemon --user="$user" "ulimit -n 32768; $RADOSGW -n $name"
++            if [ $SYSTEMD -eq 1 ]; then
++                systemd-run -r bash -c "ulimit -n 32768; $RADOSGW -n $name"
++            else
++                #start-stop-daemon --start -u $user -x $RADOSGW -- -n $name
++                daemon --user="$user" "ulimit -n 32768; $RADOSGW -n $name"
++            fi
+             echo "Starting $name..."
+         done
+         daemon_is_running $RADOSGW
+         ;;
+--- a/src/librados/RadosClient.cc
++++ b/src/librados/RadosClient.cc
+@@ -102,10 +102,12 @@
+ 
+   lock.Lock();
+ 
+   int r = wait_for_osdmap();
+-  if (r < 0)
++  if (r < 0) {
++    lock.Unlock();
+     return r;
++  }
+   int64_t ret = osdmap.lookup_pg_pool_name(name);
+   pool_cache_rwl.get_write();
+   lock.Unlock();
+   if (ret < 0) {
+@@ -581,10 +583,12 @@
+ int librados::RadosClient::pool_delete(const char *name)
+ {
+   lock.Lock();
+   int r = wait_for_osdmap();
+-  if (r < 0)
++  if (r < 0) {
++    lock.Unlock();
+     return r;
++  }
+   int tmp_pool_id = osdmap.lookup_pg_pool_name(name);
+   if (tmp_pool_id < 0) {
+     lock.Unlock();
+     return -ENOENT;
+--- a/src/librbd/ImageCtx.cc
++++ b/src/librbd/ImageCtx.cc
+@@ -184,12 +184,16 @@
+     }
+ 
+     // size object cache appropriately
+     if (object_cacher) {
+-      uint64_t obj = cct->_conf->rbd_cache_size / (1ull << order);
++      uint64_t obj = cct->_conf->rbd_cache_max_dirty_object;
++      if (!obj) {
++        obj = cct->_conf->rbd_cache_size / (1ull << order);
++        obj = obj * 4 + 10;
++      }
+       ldout(cct, 10) << " cache bytes " << cct->_conf->rbd_cache_size << " order " << (int)order
+ 		     << " -> about " << obj << " objects" << dendl;
+-      object_cacher->set_max_objects(obj * 4 + 10);
++      object_cacher->set_max_objects(obj);
+     }
+ 
+     ldout(cct, 10) << "init_layout stripe_unit " << stripe_unit
+ 		   << " stripe_count " << stripe_count
+@@ -572,11 +576,11 @@
+     md_lock.put_write();
+     object_cacher->stop();
+   }
+ 
+-  void ImageCtx::invalidate_cache() {
++  int ImageCtx::invalidate_cache() {
+     if (!object_cacher)
+-      return;
++      return 0;
+     cache_lock.Lock();
+     object_cacher->release_set(object_set);
+     cache_lock.Unlock();
+     int r = flush_cache();
+@@ -584,10 +588,14 @@
+       lderr(cct) << "flush_cache returned " << r << dendl;
+     cache_lock.Lock();
+     bool unclean = object_cacher->release_set(object_set);
+     cache_lock.Unlock();
+-    if (unclean)
+-      lderr(cct) << "could not release all objects from cache" << dendl;
++    if (unclean) {
++      lderr(cct) << "could not release all objects from cache: "
++                 << unclean << " bytes remain" << dendl;
++      return -EBUSY;
++    }
++    return r;
+   }
+ 
+   void ImageCtx::clear_nonexistence_cache() {
+     if (!object_cacher)
+--- a/src/librbd/ImageCtx.h
++++ b/src/librbd/ImageCtx.h
+@@ -138,9 +138,9 @@
+     void user_flushed();
+     void flush_cache_aio(Context *onfinish);
+     int flush_cache();
+     void shutdown_cache();
+-    void invalidate_cache();
++    int invalidate_cache();
+     void clear_nonexistence_cache();
+     int register_watch();
+     void unregister_watch();
+     size_t parent_io_len(uint64_t offset, size_t length,
+--- a/src/librbd/internal.cc
++++ b/src/librbd/internal.cc
+@@ -831,8 +831,11 @@
+   int create(IoCtx& io_ctx, const char *imgname, uint64_t size,
+ 	     bool old_format, uint64_t features, int *order,
+ 	     uint64_t stripe_unit, uint64_t stripe_count)
+   {
++    if (!order)
++      return -EINVAL;
++
+     CephContext *cct = (CephContext *)io_ctx.cct();
+     ldout(cct, 20) << "create " << &io_ctx << " name = " << imgname
+ 		   << " size = " << size << " old_format = " << old_format
+ 		   << " features = " << features << " order = " << *order
+@@ -856,11 +859,8 @@
+       lderr(cct) << "rbd image " << imgname << " already exists" << dendl;
+       return -EEXIST;
+     }
+ 
+-    if (!order)
+-      return -EINVAL;
+-
+     if (!*order)
+       *order = cct->_conf->rbd_default_order;
+     if (!*order)
+       *order = RBD_DEFAULT_OBJ_ORDER;
+@@ -1503,9 +1503,11 @@
+     RWLock::WLocker l(ictx->md_lock);
+     if (size < ictx->size && ictx->object_cacher) {
+       // need to invalidate since we're deleting objects, and
+       // ObjectCacher doesn't track non-existent objects
+-      ictx->invalidate_cache();
++      r = ictx->invalidate_cache();
++      if (r < 0)
++	return r;
+     }
+     resize_helper(ictx, size, prog_ctx);
+ 
+     ldout(cct, 2) << "done." << dendl;
+@@ -1846,9 +1848,11 @@
+ 
+     // need to flush any pending writes before resizing and rolling back -
+     // writes might create new snapshots. Rolling back will replace
+     // the current version, so we have to invalidate that too.
+-    ictx->invalidate_cache();
++    r = ictx->invalidate_cache();
++    if (r < 0)
++      return r;
+ 
+     ldout(cct, 2) << "resizing to snapshot size..." << dendl;
+     NoOpProgressContext no_op;
+     r = resize_helper(ictx, new_size, no_op);
+@@ -2070,9 +2074,9 @@
+ 			 << "' snap_name = '"
+ 			 << ictx->snap_name << "'" << dendl;
+     int r = ictx->init();
+     if (r < 0)
+-      return r;
++      goto err_close;
+ 
+     if (!ictx->read_only) {
+       r = ictx->register_watch();
+       if (r < 0) {
+@@ -2876,8 +2880,21 @@
+ 
+     return r;
+   }
+ 
++  int invalidate_cache(ImageCtx *ictx)
++  {
++    CephContext *cct = ictx->cct;
++    ldout(cct, 20) << "invalidate_cache " << ictx << dendl;
++
++    int r = ictx_check(ictx);
++    if (r < 0)
++      return r;
++
++    RWLock::WLocker l(ictx->md_lock);
++    return ictx->invalidate_cache();
++  }
++
+   int aio_write(ImageCtx *ictx, uint64_t off, size_t len, const char *buf,
+ 		AioCompletion *c)
+   {
+     CephContext *cct = ictx->cct;
+--- a/src/librbd/internal.h
++++ b/src/librbd/internal.h
+@@ -187,8 +187,9 @@
+ 	       char *buf, bufferlist *pbl, AioCompletion *c);
+   int aio_flush(ImageCtx *ictx, AioCompletion *c);
+   int flush(ImageCtx *ictx);
+   int _flush(ImageCtx *ictx);
++  int invalidate_cache(ImageCtx *ictx);
+ 
+   ssize_t handle_sparse_read(CephContext *cct,
+ 			     ceph::bufferlist data_bl,
+ 			     uint64_t block_ofs,
+--- a/src/librbd/librbd.cc
++++ b/src/librbd/librbd.cc
+@@ -513,8 +513,14 @@
+     ImageCtx *ictx = (ImageCtx *)ctx;
+     return librbd::aio_flush(ictx, (librbd::AioCompletion *)c->pc);
+   }
+ 
++  int Image::invalidate_cache()
++  {
++    ImageCtx *ictx = (ImageCtx *)ctx;
++    return librbd::invalidate_cache(ictx);
++  }
++
+ } // namespace librbd
+ 
+ extern "C" void rbd_version(int *major, int *minor, int *extra)
+ {
+@@ -1129,8 +1135,14 @@
+   librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+   return librbd::aio_flush(ictx, (librbd::AioCompletion *)comp->pc);
+ }
+ 
++extern "C" int rbd_invalidate_cache(rbd_image_t image)
++{
++  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
++  return librbd::invalidate_cache(ictx);
++}
++
+ extern "C" int rbd_aio_is_complete(rbd_completion_t c)
+ {
+   librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
+   return comp->is_complete();
+--- a/src/mds/Locker.cc
++++ b/src/mds/Locker.cc
+@@ -2061,9 +2061,15 @@
+ 
+ void Locker::calc_new_client_ranges(CInode *in, uint64_t size, map<client_t,client_writeable_range_t>& new_ranges)
+ {
+   inode_t *latest = in->get_projected_inode();
+-  uint64_t ms = ROUND_UP_TO((size+1)<<1, latest->get_layout_size_increment());
++  uint64_t ms;
++  if(latest->has_layout()) {
++    ms = ROUND_UP_TO((size+1)<<1, latest->get_layout_size_increment());
++  } else {
++    // Layout-less directories, like ~mds0/, have zero size
++    ms = 0;
++  }
+ 
+   // increase ranges as appropriate.
+   // shrink to 0 if no WR|BUFFER caps issued.
+   for (map<client_t,Capability*>::iterator p = in->client_caps.begin();
+--- a/src/mds/MDCache.cc
++++ b/src/mds/MDCache.cc
+@@ -348,8 +348,9 @@
+   rootdir->fnode.accounted_rstat = rootdir->fnode.rstat;
+ 
+   root->inode.dirstat = rootdir->fnode.fragstat;
+   root->inode.rstat = rootdir->fnode.rstat;
++  ++root->inode.rstat.rsubdirs;
+   root->inode.accounted_rstat = root->inode.rstat;
+ 
+   rootdir->mark_complete();
+   rootdir->mark_dirty(rootdir->pre_dirty(), mds->mdlog->get_current_segment());
+@@ -398,8 +399,9 @@
+   mydir->fnode.accounted_rstat = mydir->fnode.rstat;
+ 
+   myin->inode.dirstat = mydir->fnode.fragstat;
+   myin->inode.rstat = mydir->fnode.rstat;
++  ++myin->inode.rstat.rsubdirs;
+   myin->inode.accounted_rstat = myin->inode.rstat;
+ 
+ 
+   mydir->mark_complete();
+--- a/src/messages/MOSDSubOp.h
++++ b/src/messages/MOSDSubOp.h
+@@ -24,9 +24,9 @@
+  */
+ 
+ class MOSDSubOp : public Message {
+ 
+-  static const int HEAD_VERSION = 10;
++  static const int HEAD_VERSION = 11;
+   static const int COMPAT_VERSION = 1;
+ 
+ public:
+   epoch_t map_epoch;
+@@ -62,8 +62,10 @@
+   eversion_t version;
+ 
+   // piggybacked osd/og state
+   eversion_t pg_trim_to;   // primary->replica: trim to here
++  eversion_t pg_trim_rollback_to;   // primary->replica: trim rollback
++                                    // info to here
+   osd_peer_stat_t peer_stat;
+ 
+   map<string,bufferlist> attrset;
+ 
+@@ -174,8 +176,13 @@
+     }
+     if (header.version >= 10) {
+       ::decode(updated_hit_set_history, p);
+     }
++    if (header.version >= 11) {
++      ::decode(pg_trim_rollback_to, p);
++    } else {
++      pg_trim_rollback_to = pg_trim_to;
++    }
+   }
+ 
+   virtual void encode_payload(uint64_t features) {
+     ::encode(map_epoch, payload);
+@@ -223,8 +230,9 @@
+     ::encode(discard_temp_oid, payload);
+     ::encode(from, payload);
+     ::encode(pgid.shard, payload);
+     ::encode(updated_hit_set_history, payload);
++    ::encode(pg_trim_rollback_to, payload);
+   }
+ 
+   MOSDSubOp()
+     : Message(MSG_OSD_SUBOP, HEAD_VERSION, COMPAT_VERSION) { }
+--- a/src/mon/DataHealthService.cc
++++ b/src/mon/DataHealthService.cc
+@@ -227,9 +227,9 @@
+   if (ours.latest_avail_percent <= g_conf->mon_data_avail_warn) {
+     if (ours.latest_avail_percent != last_warned_percent)
+       mon->clog.warn()
+ 	<< "reached concerning levels of available space on local monitor storage"
+-	<< " (" << ours.latest_avail_percent << "\% free)\n";
++	<< " (" << ours.latest_avail_percent << "% free)\n";
+     last_warned_percent = ours.latest_avail_percent;
+   } else {
+     last_warned_percent = 0;
+   }
+--- a/src/mon/MonCommands.h
++++ b/src/mon/MonCommands.h
+@@ -551,9 +551,9 @@
+ 	"name=destpool,type=CephPoolname", \
+ 	"rename <srcpool> to <destpool>", "osd", "rw", "cli,rest")
+ COMMAND("osd pool get " \
+ 	"name=pool,type=CephPoolname " \
+-	"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid", \
++	"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile", \
+ 	"get pool parameter <var>", "osd", "r", "cli,rest")
+ COMMAND("osd pool set " \
+ 	"name=pool,type=CephPoolname " \
+ 	"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid " \
+@@ -567,8 +567,12 @@
+ 	"name=pool,type=CephPoolname " \
+ 	"name=field,type=CephChoices,strings=max_objects|max_bytes " \
+ 	"name=val,type=CephString",
+ 	"set object or byte limit on pool", "osd", "rw", "cli,rest")
++COMMAND("osd pool get-quota " \
++        "name=pool,type=CephPoolname ",
++        "obtain object or byte limits for pool",
++        "osd", "r", "cli,rest")
+ COMMAND("osd pool stats " \
+         "name=name,type=CephString,req=false",
+         "obtain stats from all pools, or from specified pool",
+         "osd", "r", "cli,rest")
+--- a/src/mon/Monitor.cc
++++ b/src/mon/Monitor.cc
+@@ -620,8 +620,23 @@
+ 
+ void Monitor::refresh_from_paxos(bool *need_bootstrap)
+ {
+   dout(10) << __func__ << dendl;
++
++  bufferlist bl;
++  int r = store->get(MONITOR_NAME, "cluster_fingerprint", bl);
++  if (r >= 0) {
++    try {
++      bufferlist::iterator p = bl.begin();
++      ::decode(fingerprint, p);
++    }
++    catch (buffer::error& e) {
++      dout(10) << __func__ << " failed to decode cluster_fingerprint" << dendl;
++    }
++  } else {
++    dout(10) << __func__ << " no cluster_fingerprint" << dendl;
++  }
++
+   for (int i = 0; i < PAXOS_NUM; ++i) {
+     paxos_service[i]->refresh(need_bootstrap);
+   }
+   for (int i = 0; i < PAXOS_NUM; ++i) {
+@@ -2392,8 +2407,9 @@
+     // this must be formatted, in its current form
+     if (!f)
+       f.reset(new_formatter("json-pretty"));
+     f->open_object_section("report");
++    f->dump_stream("cluster_fingerprint") << fingerprint;
+     f->dump_string("version", ceph_version_to_str());
+     f->dump_string("commit", git_version_to_str());
+     f->dump_stream("timestamp") << ceph_clock_now(NULL);
+ 
+@@ -2865,10 +2881,11 @@
+         // let it go through and be dispatched immediately!
+         return dispatch(s, m, false);
+       }
+       dout(1) << __func__ << " dropping stray message " << *m
+-        << " from " << m->get_source_inst() << dendl;
+-      return false;
++	      << " from " << m->get_source_inst() << dendl;
++      m->put();
++      return true;
+     }
+ 
+     if (!exited_quorum.is_zero() && !src_is_mon) {
+       waitlist_or_zap_client(m);
+@@ -3846,11 +3863,31 @@
+   if (!maybe_wait_for_quorum.empty()) {
+     finish_contexts(g_ceph_context, maybe_wait_for_quorum);
+   }
+ 
++  if (is_leader() && paxos->is_active() && fingerprint.is_zero()) {
++    // this is only necessary on upgraded clusters.
++    MonitorDBStore::Transaction t;
++    prepare_new_fingerprint(&t);
++    bufferlist tbl;
++    t.encode(tbl);
++    paxos->propose_new_value(tbl, new C_NoopContext);
++  }
++
+   new_tick();
+ }
+ 
++void Monitor::prepare_new_fingerprint(MonitorDBStore::Transaction *t)
++{
++  uuid_d nf;
++  nf.generate_random();
++  dout(10) << __func__ << " proposing cluster_fingerprint " << nf << dendl;
++
++  bufferlist bl;
++  ::encode(nf, bl);
++  t->put(MONITOR_NAME, "cluster_fingerprint", bl);
++}
++
+ int Monitor::check_fsid()
+ {
+   if (!store->exists(MONITOR_NAME, "cluster_uuid"))
+     return -ENOENT;
+--- a/src/mon/Monitor.h
++++ b/src/mon/Monitor.h
+@@ -127,8 +127,9 @@
+   void register_cluster_logger();
+   void unregister_cluster_logger();
+ 
+   MonMap *monmap;
++  uuid_d fingerprint;
+ 
+   set<entity_addr_t> extra_probe_peers;
+ 
+   LogClient clog;
+@@ -189,8 +190,10 @@
+   bool is_peon() const { return state == STATE_PEON; }
+ 
+   const utime_t &get_leader_since() const;
+ 
++  void prepare_new_fingerprint(MonitorDBStore::Transaction *t);
++
+   // -- elector --
+ private:
+   Paxos *paxos;
+   Elector elector;
+--- a/src/mon/MonmapMonitor.cc
++++ b/src/mon/MonmapMonitor.cc
+@@ -96,8 +96,13 @@
+   pending_map.encode(bl, mon->get_quorum_features());
+ 
+   put_version(t, pending_map.epoch, bl);
+   put_last_committed(t, pending_map.epoch);
++
++  // generate a cluster fingerprint, too?
++  if (pending_map.epoch == 1) {
++    mon->prepare_new_fingerprint(t);
++  }
+ }
+ 
+ void MonmapMonitor::on_active()
+ {
+--- a/src/mon/OSDMonitor.cc
++++ b/src/mon/OSDMonitor.cc
+@@ -2066,8 +2066,34 @@
+ 	}
+       }
+     }
+ 
++    // hit_set-less cache_mode?
++    if (g_conf->mon_warn_on_cache_pools_without_hit_sets) {
++      int problem_cache_pools = 0;
++      for (map<int64_t, pg_pool_t>::const_iterator p = osdmap.pools.begin();
++	   p != osdmap.pools.end();
++	   ++p) {
++	const pg_pool_t& info = p->second;
++	if (info.cache_mode_requires_hit_set() &&
++	    info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
++	  ++problem_cache_pools;
++	  if (detail) {
++	    ostringstream ss;
++	    ss << "pool '" << osdmap.get_pool_name(p->first)
++	       << "' with cache_mode " << info.get_cache_mode_name()
++	       << " needs hit_set_type to be set but it is not";
++	    detail->push_back(make_pair(HEALTH_WARN, ss.str()));
++	  }
++	}
++      }
++      if (problem_cache_pools) {
++	ostringstream ss;
++	ss << problem_cache_pools << " cache pools are missing hit_sets";
++	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
++      }
++    }
++
+     // Warn if 'mon_osd_down_out_interval' is set to zero.
+     // Having this option set to zero on the leader acts much like the
+     // 'noout' flag.  It's hard to figure out what's going wrong with clusters
+     // without the 'noout' flag set but acting like that just the same, so
+@@ -2452,8 +2478,28 @@
+     const pg_pool_t *p = osdmap.get_pg_pool(pool);
+     string var;
+     cmd_getval(g_ceph_context, cmdmap, "var", var);
+ 
++    if (!p->is_tier() &&
++        (var == "hit_set_type" || var == "hit_set_period" ||
++         var == "hit_set_count" || var == "hit_set_fpp" ||
++         var == "target_max_objects" || var == "target_max_bytes" ||
++         var == "cache_target_full_ratio" ||
++         var == "cache_target_dirty_ratio" ||
++         var == "cache_min_flush_age" || var == "cache_min_evict_age")) {
++      ss << "pool '" << poolstr
++         << "' is not a tier pool: variable not applicable";
++      r = -EACCES;
++      goto reply;
++    }
++
++    if (!p->is_erasure() && var == "erasure_code_profile") {
++      ss << "pool '" << poolstr
++         << "' is not a erasure pool: variable not applicable";
++      r = -EACCES;
++      goto reply;
++    }
++
+     if (f) {
+       f->open_object_section("pool");
+       f->dump_string("pool", poolstr);
+       f->dump_int("pool_id", pool);
+@@ -2487,8 +2533,28 @@
+ 	} else {
+ 	  BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
+ 	  f->dump_float("hit_set_fpp", bloomp->get_fpp());
+ 	}
++      } else if (var == "target_max_objects") {
++        f->dump_unsigned("target_max_objects", p->target_max_objects);
++      } else if (var == "target_max_bytes") {
++        f->dump_unsigned("target_max_bytes", p->target_max_bytes);
++      } else if (var == "cache_target_dirty_ratio") {
++        f->dump_unsigned("cache_target_dirty_ratio_micro",
++                         p->cache_target_dirty_ratio_micro);
++        f->dump_float("cache_target_dirty_ratio",
++                      ((float)p->cache_target_dirty_ratio_micro/1000000));
++      } else if (var == "cache_target_full_ratio") {
++        f->dump_unsigned("cache_target_full_ratio_micro",
++                         p->cache_target_full_ratio_micro);
++        f->dump_float("cache_target_full_ratio",
++                      ((float)p->cache_target_full_ratio_micro/1000000));
++      } else if (var == "cache_min_flush_age") {
++        f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
++      } else if (var == "cache_min_evict_age") {
++        f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
++      } else if (var == "erasure_code_profile") {
++       f->dump_string("erasure_code_profile", p->erasure_code_profile);
+       }
+ 
+       f->close_section();
+       f->flush(rdata);
+@@ -2520,9 +2586,26 @@
+ 	  goto reply;
+ 	}
+ 	BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
+ 	ss << "hit_set_fpp: " << bloomp->get_fpp();
++      } else if (var == "target_max_objects") {
++        ss << "target_max_objects: " << p->target_max_objects;
++      } else if (var == "target_max_bytes") {
++        ss << "target_max_bytes: " << p->target_max_bytes;
++      } else if (var == "cache_target_dirty_ratio") {
++        ss << "cache_target_dirty_ratio: "
++          << ((float)p->cache_target_dirty_ratio_micro/1000000);
++      } else if (var == "cache_target_full_ratio") {
++        ss << "cache_target_full_ratio: "
++          << ((float)p->cache_target_full_ratio_micro/1000000);
++      } else if (var == "cache_min_flush_age") {
++        ss << "cache_min_flush_age: " << p->cache_min_flush_age;
++      } else if (var == "cache_min_evict_age") {
++        ss << "cache_min_evict_age: " << p->cache_min_evict_age;
++      } else if (var == "erasure_code_profile") {
++       ss << "erasure_code_profile: " << p->erasure_code_profile;
+       }
++
+       rdata.append(ss);
+       ss.str("");
+     }
+     r = 0;
+@@ -2625,8 +2708,47 @@
+     }
+     rdata.append("\n");
+     r = 0;
+ 
++  } else if (prefix == "osd pool get-quota") {
++    string pool_name;
++    cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
++
++    int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
++    if (poolid < 0) {
++      assert(poolid == -ENOENT);
++      ss << "unrecognized pool '" << pool_name << "'";
++      r = -ENOENT;
++      goto reply;
++    }
++    const pg_pool_t *p = osdmap.get_pg_pool(poolid);
++
++    if (f) {
++      f->open_object_section("pool_quotas");
++      f->dump_string("pool_name", pool_name);
++      f->dump_unsigned("pool_id", poolid);
++      f->dump_unsigned("quota_max_objects", p->quota_max_objects);
++      f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
++      f->close_section();
++      f->flush(rdata);
++    } else {
++      stringstream rs;
++      rs << "quotas for pool '" << pool_name << "':\n"
++         << "  max objects: ";
++      if (p->quota_max_objects == 0)
++        rs << "N/A";
++      else
++        rs << si_t(p->quota_max_objects) << " objects";
++      rs << "\n"
++         << "  max bytes  : ";
++      if (p->quota_max_bytes == 0)
++        rs << "N/A";
++      else
++        rs << si_t(p->quota_max_bytes) << "B";
++      rdata.append(rs.str());
++    }
++    rdata.append("\n");
++    r = 0;
+   } else if (prefix == "osd crush rule list" ||
+ 	     prefix == "osd crush rule ls") {
+     string format;
+     cmd_getval(g_ceph_context, cmdmap, "format", format, string("json-pretty"));
+@@ -2924,17 +3046,20 @@
+ 					     const string &profile,
+ 					     int *ruleset,
+ 					     stringstream &ss)
+ {
+-  *ruleset = osdmap.crush->get_rule_id(name);
+-  if (*ruleset != -ENOENT)
++  int ruleid = osdmap.crush->get_rule_id(name);
++  if (ruleid != -ENOENT) {
++    *ruleset = osdmap.crush->get_rule_mask_ruleset(ruleid);
+     return -EEXIST;
++  }
+ 
+   CrushWrapper newcrush;
+   _get_pending_crush(newcrush);
+ 
+-  *ruleset = newcrush.get_rule_id(name);
+-  if (*ruleset != -ENOENT) {
++  ruleid = newcrush.get_rule_id(name);
++  if (ruleid != -ENOENT) {
++    *ruleset = newcrush.get_rule_mask_ruleset(ruleid);
+     return -EALREADY;
+   } else {
+     ErasureCodeInterfaceRef erasure_code;
+     int err = get_erasure_code(profile, &erasure_code, ss);
+@@ -3088,22 +3213,25 @@
+ }
+ 
+ int OSDMonitor::prepare_pool_size(const unsigned pool_type,
+ 				  const string &erasure_code_profile,
+-				  unsigned *size,
++				  unsigned *size, unsigned *min_size,
+ 				  stringstream &ss)
+ {
+   int err = 0;
+   switch (pool_type) {
+   case pg_pool_t::TYPE_REPLICATED:
+     *size = g_conf->osd_pool_default_size;
++    *min_size = g_conf->get_osd_pool_default_min_size();
+     break;
+   case pg_pool_t::TYPE_ERASURE:
+     {
+       ErasureCodeInterfaceRef erasure_code;
+       err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
+-      if (err == 0)
++      if (err == 0) {
+ 	*size = erasure_code->get_chunk_count();
++	*min_size = erasure_code->get_data_chunk_count();
++      }
+     }
+     break;
+   default:
+     ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
+@@ -3218,10 +3346,10 @@
+   r = prepare_pool_crush_ruleset(pool_type, erasure_code_profile,
+ 				 crush_ruleset_name, &crush_ruleset, ss);
+   if (r)
+     return r;
+-  unsigned size;
+-  r = prepare_pool_size(pool_type, erasure_code_profile, &size, ss);
++  unsigned size, min_size;
++  r = prepare_pool_size(pool_type, erasure_code_profile, &size, &min_size, ss);
+   if (r)
+     return r;
+   uint32_t stripe_width = 0;
+   r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
+@@ -3245,9 +3373,9 @@
+   if (g_conf->osd_pool_default_flag_hashpspool)
+     pi->flags |= pg_pool_t::FLAG_HASHPSPOOL;
+ 
+   pi->size = size;
+-  pi->min_size = g_conf->get_osd_pool_default_min_size();
++  pi->min_size = min_size;
+   pi->crush_ruleset = crush_ruleset;
+   pi->object_hash = CEPH_STR_HASH_RJENKINS;
+   pi->set_pg_num(pg_num ? pg_num : g_conf->osd_pool_default_pg_num);
+   pi->set_pgp_num(pgp_num ? pgp_num : g_conf->osd_pool_default_pgp_num);
+@@ -3335,8 +3463,9 @@
+   string val;
+   string interr, floaterr;
+   int64_t n = 0;
+   double f = 0;
++  int64_t uf = 0;  // micro-f
+   if (!cmd_getval(g_ceph_context, cmdmap, "val", val)) {
+     // wasn't a string; maybe an older mon forwarded json with an int?
+     if (!cmd_getval(g_ceph_context, cmdmap, "val", n))
+       return -EINVAL;  // no value!
+@@ -3344,8 +3473,19 @@
+     // we got a string.  see if it contains an int.
+     n = strict_strtoll(val.c_str(), 10, &interr);
+     // or a float
+     f = strict_strtod(val.c_str(), &floaterr);
++    uf = llrintl(f * (double)1000000.0);
++  }
++
++  if (!p.is_tier() &&
++      (var == "hit_set_type" || var == "hit_set_period" ||
++       var == "hit_set_count" || var == "hit_set_fpp" ||
++       var == "target_max_objects" || var == "target_max_bytes" ||
++       var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
++       var == "cache_min_flush_age" || var == "cache_min_evict_age")) {
++    ss << "pool '" << poolstr << "' is not a tier pool: variable not applicable";
++    return -EACCES;
+   }
+ 
+   if (var == "size") {
+     if (p.type == pg_pool_t::TYPE_ERASURE) {
+@@ -3398,9 +3538,9 @@
+ 	force != "--yes-i-really-mean-it") {
+       ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling.  use --yes-i-really-mean-it to force.";
+       return -EPERM;
+     }
+-    int expected_osds = MIN(p.get_pg_num(), osdmap.get_num_osds());
++    int expected_osds = MAX(1, MIN(p.get_pg_num(), osdmap.get_num_osds()));
+     int64_t new_pgs = n - p.get_pg_num();
+     int64_t pgs_per_osd = new_pgs / expected_osds;
+     if (pgs_per_osd > g_conf->mon_osd_max_split_count) {
+       ss << "specified pg_num " << n << " is too large (creating "
+@@ -3486,8 +3626,9 @@
+       return -EINVAL;
+     }
+     p.hit_set_period = n;
+   } else if (var == "hit_set_count") {
++
+     if (interr.length()) {
+       ss << "error parsing integer value '" << val << "': " << interr;
+       return -EINVAL;
+     }
+@@ -3527,9 +3668,9 @@
+     if (f < 0 || f > 1.0) {
+       ss << "value must be in the range 0..1";
+       return -ERANGE;
+     }
+-    p.cache_target_dirty_ratio_micro = f * 1000000;
++    p.cache_target_dirty_ratio_micro = uf;
+   } else if (var == "cache_target_full_ratio") {
+     if (floaterr.length()) {
+       ss << "error parsing float '" << val << "': " << floaterr;
+       return -EINVAL;
+@@ -3537,9 +3678,9 @@
+     if (f < 0 || f > 1.0) {
+       ss << "value must be in the range 0..1";
+       return -ERANGE;
+     }
+-    p.cache_target_full_ratio_micro = f * 1000000;
++    p.cache_target_full_ratio_micro = uf;
+   } else if (var == "cache_min_flush_age") {
+     if (interr.length()) {
+       ss << "error parsing int '" << val << "': " << interr;
+       return -EINVAL;
+@@ -4171,8 +4312,26 @@
+     string profile;
+     cmd_getval(g_ceph_context, cmdmap, "profile", profile);
+     if (profile == "")
+       profile = "default";
++    if (profile == "default") {
++      if (!osdmap.has_erasure_code_profile(profile)) {
++	if (pending_inc.has_erasure_code_profile(profile)) {
++	  dout(20) << "erasure code profile " << profile << " already pending" << dendl;
++	  goto wait;
++	}
++
++	map<string,string> profile_map;
++	err = osdmap.get_erasure_code_profile_default(g_ceph_context,
++						      profile_map,
++						      &ss);
++	if (err)
++	  goto reply;
++	dout(20) << "erasure code profile " << profile << " set" << dendl;
++	pending_inc.set_erasure_code_profile(profile, profile_map);
++	goto wait;
++      }
++    }
+ 
+     int ruleset;
+     err = crush_ruleset_create_erasure(name, profile, &ruleset, ss);
+     if (err < 0) {
+@@ -4846,8 +5005,27 @@
+     string erasure_code_profile;
+     cmd_getval(g_ceph_context, cmdmap, "erasure_code_profile", erasure_code_profile);
+     if (erasure_code_profile == "")
+       erasure_code_profile = "default";
++    if (erasure_code_profile == "default") {
++      if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
++	if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
++	  dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
++	  goto wait;
++	}
++
++	map<string,string> profile_map;
++	err = osdmap.get_erasure_code_profile_default(g_ceph_context,
++						      profile_map,
++						      &ss);
++	if (err)
++	  goto reply;
++	dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
++	pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
++	goto wait;
++      }
++    }
++
+     if (ruleset_name == "") {
+       if (erasure_code_profile == "default") {
+ 	ruleset_name = "erasure-code";
+       } else {
+@@ -5053,9 +5231,12 @@
+       err = 0;
+       goto reply;
+     }
+     if (tp->tier_of != pool_id) {
+-      ss << "tier pool '" << tierpoolstr << "' is a tier of '" << tp->tier_of << "'";
++      ss << "tier pool '" << tierpoolstr << "' is a tier of '"
++         << osdmap.get_pool_name(tp->tier_of) << "': "
++         // be scary about it; this is an inconsistency and bells must go off
++         << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
+       err = -EINVAL;
+       goto reply;
+     }
+     if (p->read_tier == tierpool_id) {
+@@ -5181,10 +5362,69 @@
+       ss << "'" << modestr << "' is not a valid cache mode";
+       err = -EINVAL;
+       goto reply;
+     }
++
++    // pool already has this cache-mode set and there are no pending changes
++    if (p->cache_mode == mode &&
++	(pending_inc.new_pools.count(pool_id) == 0 ||
++	 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
++      ss << "set cache-mode for pool '" << poolstr << "'"
++         << " to " << pg_pool_t::get_cache_mode_name(mode);
++      err = 0;
++      goto reply;
++    }
++
++    /* Mode description:
++     *
++     *  none:       No cache-mode defined
++     *  forward:    Forward all reads and writes to base pool
++     *  writeback:  Cache writes, promote reads from base pool
++     *  readonly:   Forward writes to base pool
++     *
++     * Hence, these are the allowed transitions:
++     *
++     *  none -> any
++     *  forward -> writeback || any IF num_objects_dirty == 0
++     *  writeback -> forward
++     *  readonly -> any
++     */
++
++    // We check if the transition is valid against the current pool mode, as
++    // it is the only committed state thus far.  We will blatantly squash
++    // whatever mode is on the pending state.
++
++    if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
++        mode != pg_pool_t::CACHEMODE_FORWARD) {
++      ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
++         << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
++         << "' pool; only '"
++         << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_FORWARD)
++        << "' allowed.";
++      err = -EINVAL;
++      goto reply;
++    }
++    if (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
++               mode != pg_pool_t::CACHEMODE_WRITEBACK) {
++
++      const pool_stat_t& tier_stats =
++        mon->pgmon()->pg_map.get_pg_pool_sum_stat(pool_id);
++
++      if (tier_stats.stats.sum.num_objects_dirty > 0) {
++        ss << "unable to set cache-mode '"
++           << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
++           << "': dirty objects found";
++        err = -EBUSY;
++        goto reply;
++      }
++    }
++
+     // go
+-    pending_inc.get_new_pool(pool_id, p)->cache_mode = mode;
++    pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
++    np->cache_mode = mode;
++    // set this both when moving to and from cache_mode NONE.  this is to
++    // capture legacy pools that were set up before this flag existed.
++    np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
+     ss << "set cache-mode for pool '" << poolstr
+ 	<< "' to " << pg_pool_t::get_cache_mode_name(mode);
+     wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(),
+ 					      get_last_committed() + 1));
+@@ -5622,10 +5862,14 @@
+ 	<< osdmap.get_pool_name(p->tier_of) << "'";
+     return -EBUSY;
+   }
+   if (!p->tiers.empty()) {
+-    *ss << "pool '" << poolstr << "' includes tiers "
+-	<< p->tiers;
++    *ss << "pool '" << poolstr << "' has tiers";
++    for(std::set<uint64_t>::iterator i = p->tiers.begin(); i != p->tiers.end(); ++i) {
++      const char *name = osdmap.get_pool_name(*i);
++      assert(name != NULL);
++      *ss << " " << name;
++    }
+     return -EBUSY;
+   }
+   *ss << "pool '" << poolstr << "' removed";
+   return 0;
+--- a/src/mon/OSDMonitor.h
++++ b/src/mon/OSDMonitor.h
+@@ -271,9 +271,9 @@
+ 				 map<string,string> *erasure_code_profile_map,
+ 				 stringstream &ss);
+   int prepare_pool_size(const unsigned pool_type,
+ 			const string &erasure_code_profile,
+-			unsigned *size,
++			unsigned *size, unsigned *min_size,
+ 			stringstream &ss);
+   int prepare_pool_stripe_width(const unsigned pool_type,
+ 				const string &erasure_code_profile,
+ 				unsigned *stripe_width,
+--- a/src/mon/PGMonitor.cc
++++ b/src/mon/PGMonitor.cc
+@@ -1214,13 +1214,15 @@
+ }
+ 
+ //void PGMonitor::dump_object_stat_sum(stringstream& ss, Formatter *f,
+ void PGMonitor::dump_object_stat_sum(TextTable &tbl, Formatter *f,
+-    object_stat_sum_t &sum, bool verbose)
++				     object_stat_sum_t &sum, uint64_t avail,
++				     bool verbose)
+ {
+   if (f) {
+     f->dump_int("kb_used", SHIFT_ROUND_UP(sum.num_bytes, 10));
+     f->dump_int("bytes_used", sum.num_bytes);
++    f->dump_unsigned("max_avail", avail);
+     f->dump_int("objects", sum.num_objects);
+     if (verbose) {
+       f->dump_int("dirty", sum.num_objects_dirty);
+       f->dump_int("rd", sum.num_rd);
+@@ -1231,8 +1233,9 @@
+   } else {
+     tbl << stringify(si_t(sum.num_bytes));
+     int64_t kb_used = SHIFT_ROUND_UP(sum.num_bytes, 10);
+     tbl << percentify(((float)kb_used / pg_map.osd_sum.kb)*100);
++    tbl << si_t(avail);
+     tbl << sum.num_objects;
+     if (verbose) {
+       tbl << stringify(si_t(sum.num_objects_dirty))
+ 	  << stringify(si_t(sum.num_rd))
+@@ -1240,8 +1243,26 @@
+     }
+   }
+ }
+ 
++int64_t PGMonitor::get_rule_avail(OSDMap& osdmap, int ruleno)
++{
++  map<int,float> wm;
++  int r = osdmap.crush->get_rule_weight_osd_map(ruleno, &wm);
++  if (r < 0)
++    return r;
++  if(wm.size() == 0)
++    return 0;
++  int64_t min = -1;
++  for (map<int,float>::iterator p = wm.begin(); p != wm.end(); ++p) {
++    int64_t proj = (float)(pg_map.osd_stat[p->first].kb_avail * 1024ull) /
++      (double)p->second;
++    if (min < 0 || proj < min)
++      min = proj;
++  }
++  return min;
++}
++
+ void PGMonitor::dump_pool_stats(stringstream &ss, Formatter *f, bool verbose)
+ {
+   TextTable tbl;
+ 
+@@ -1251,18 +1272,20 @@
+     tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
+     tbl.define_column("ID", TextTable::LEFT, TextTable::LEFT);
+     if (verbose)
+       tbl.define_column("CATEGORY", TextTable::LEFT, TextTable::LEFT);
+-    tbl.define_column("USED", TextTable::LEFT, TextTable::LEFT);
+-    tbl.define_column("\%USED", TextTable::LEFT, TextTable::LEFT);
+-    tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::LEFT);
+-    if (verbose) {
+-      tbl.define_column("DIRTY", TextTable::LEFT, TextTable::LEFT);
+-      tbl.define_column("READ", TextTable::LEFT, TextTable::LEFT);
+-      tbl.define_column("WRITE", TextTable::LEFT, TextTable::LEFT);
++    tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
++    tbl.define_column("%USED", TextTable::LEFT, TextTable::RIGHT);
++    tbl.define_column("MAX AVAIL", TextTable::LEFT, TextTable::RIGHT);
++    tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
++    if (verbose) {
++      tbl.define_column("DIRTY", TextTable::LEFT, TextTable::RIGHT);
++      tbl.define_column("READ", TextTable::LEFT, TextTable::RIGHT);
++      tbl.define_column("WRITE", TextTable::LEFT, TextTable::RIGHT);
+     }
+   }
+ 
++  map<int,uint64_t> avail_by_rule;
+   OSDMap &osdmap = mon->osdmon()->osdmap;
+   for (map<int64_t,pg_pool_t>::const_iterator p = osdmap.get_pools().begin();
+        p != osdmap.get_pools().end(); ++p) {
+     int64_t pool_id = p->first;
+@@ -1270,8 +1293,40 @@
+       continue;
+     string pool_name = osdmap.get_pool_name(pool_id);
+     pool_stat_t &stat = pg_map.pg_pool_sum[pool_id];
+ 
++    const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
++    int ruleno = osdmap.crush->find_rule(pool->get_crush_ruleset(),
++					 pool->get_type(),
++					 pool->get_size());
++    uint64_t avail;
++    if (avail_by_rule.count(ruleno) == 0) {
++      avail = get_rule_avail(osdmap, ruleno);
++      avail_by_rule[ruleno] = avail;
++    } else {
++      avail = avail_by_rule[ruleno];
++    }
++    switch (pool->get_type()) {
++    case pg_pool_t::TYPE_REPLICATED:
++      avail /= pool->get_size();
++      break;
++    case pg_pool_t::TYPE_ERASURE:
++      {
++	const map<string,string>& ecp =
++	  osdmap.get_erasure_code_profile(pool->erasure_code_profile);
++	map<string,string>::const_iterator pm = ecp.find("m");
++	map<string,string>::const_iterator pk = ecp.find("k");
++	if (pm != ecp.end() && pk != ecp.end()) {
++	  int k = atoi(pk->second.c_str());
++	  int m = atoi(pm->second.c_str());
++	  avail = avail * k / (m + k);
++	}
++      }
++      break;
++    default:
++      assert(0 == "unrecognized pool type");
++    }
++
+     if (f) {
+       f->open_object_section("pool");
+       f->dump_string("name", pool_name);
+       f->dump_int("id", pool_id);
+@@ -1281,9 +1336,9 @@
+           << pool_id;
+       if (verbose)
+         tbl << "-";
+     }
+-    dump_object_stat_sum(tbl, f, stat.stats.sum, verbose);
++    dump_object_stat_sum(tbl, f, stat.stats.sum, avail, verbose);
+     if (f)
+       f->close_section(); // stats
+     else
+       tbl << TextTable::endrow;
+@@ -1300,9 +1355,9 @@
+           tbl << ""
+               << ""
+               << it->first;
+         }
+-        dump_object_stat_sum(tbl, f, it->second, verbose);
++        dump_object_stat_sum(tbl, f, it->second, avail, verbose);
+         if (f)
+           f->close_section(); // category name
+         else
+           tbl << TextTable::endrow;
+@@ -1334,14 +1389,14 @@
+     }
+     f->close_section();
+   } else {
+     TextTable tbl;
+-    tbl.define_column("SIZE", TextTable::LEFT, TextTable::LEFT);
+-    tbl.define_column("AVAIL", TextTable::LEFT, TextTable::LEFT);
+-    tbl.define_column("RAW USED", TextTable::LEFT, TextTable::LEFT);
+-    tbl.define_column("\%RAW USED", TextTable::LEFT, TextTable::LEFT);
++    tbl.define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
++    tbl.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
++    tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
++    tbl.define_column("%RAW USED", TextTable::LEFT, TextTable::RIGHT);
+     if (verbose) {
+-      tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::LEFT);
++      tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
+     }
+     tbl << stringify(si_t(pg_map.osd_sum.kb*1024))
+         << stringify(si_t(pg_map.osd_sum.kb_avail*1024))
+         << stringify(si_t(pg_map.osd_sum.kb_used*1024));
+--- a/src/mon/PGMonitor.h
++++ b/src/mon/PGMonitor.h
+@@ -145,9 +145,13 @@
+ 			  int threshold,
+ 			  vector<string>& args) const;
+ 
+   void dump_object_stat_sum(TextTable &tbl, Formatter *f,
+-                            object_stat_sum_t &sum, bool verbose);
++                            object_stat_sum_t &sum,
++			    uint64_t avail,
++			    bool verbose);
++
++  int64_t get_rule_avail(OSDMap& osdmap, int ruleno);
+ 
+ public:
+   PGMonitor(Monitor *mn, Paxos *p, const string& service_name)
+     : PaxosService(mn, p, service_name),
+--- a/src/mon/Paxos.cc
++++ b/src/mon/Paxos.cc
+@@ -1263,9 +1263,10 @@
+ // -- READ --
+ 
+ bool Paxos::is_readable(version_t v)
+ {
+-  dout(1) << "is_readable now=" << ceph_clock_now(g_ceph_context) << " lease_expire=" << lease_expire
++  dout(5) << "is_readable now=" << ceph_clock_now(g_ceph_context)
++	  << " lease_expire=" << lease_expire
+ 	  << " has v" << v << " lc " << last_committed << dendl;
+   if (v > last_committed)
+     return false;
+   return 
+--- a/src/msg/SimpleMessenger.cc
++++ b/src/msg/SimpleMessenger.cc
+@@ -85,8 +85,11 @@
+ {
+   ldout(cct,10) << "shutdown " << get_myaddr() << dendl;
+   mark_down_all();
+   dispatch_queue.shutdown();
++
++  // break ref cycles on the loopback connection
++  local_connection->set_priv(NULL);
+   return 0;
+ }
+ 
+ int SimpleMessenger::_send_message(Message *m, const entity_inst_t& dest,
+--- a/src/os/FileJournal.cc
++++ b/src/os/FileJournal.cc
+@@ -1757,9 +1757,14 @@
+ 
+   // ok!
+   if (seq)
+     *seq = h->seq;
+-  journalq.push_back(pair<uint64_t,off64_t>(h->seq, pos));
++
++  // works around an apparent GCC 4.8(?) compiler bug about unaligned
++  // bind by reference to (packed) h->seq
++  journalq.push_back(
++    pair<uint64_t,off64_t>(static_cast<uint64_t>(h->seq),
++			   static_cast<off64_t>(pos)));
+ 
+   if (next_pos)
+     *next_pos = pos;
+ 
+--- a/src/os/FileStore.cc
++++ b/src/os/FileStore.cc
+@@ -125,9 +125,9 @@
+   PerfCounters &logger)
+ {
+   os_commit_latency.consume_next(
+     logger.get_tavg_ms(
+-      l_os_commit_lat));
++      l_os_j_lat));
+   os_apply_latency.consume_next(
+     logger.get_tavg_ms(
+       l_os_apply_lat));
+ }
+@@ -1557,8 +1557,10 @@
+     delete backend;
+     backend = generic_backend;
+   }
+ 
++  force_sync = false;
++
+   object_map.reset();
+ 
+   {
+     Mutex::Locker l(sync_entry_timeo_lock);
+@@ -1710,9 +1712,10 @@
+ }
+ 
+ void FileStore::_finish_op(OpSequencer *osr)
+ {
+-  Op *o = osr->dequeue();
++  list<Context*> to_queue;
++  Op *o = osr->dequeue(&to_queue);
+   
+   dout(10) << "_finish_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << dendl;
+   osr->apply_lock.Unlock();  // locked in _do_op
+ 
+@@ -1728,8 +1731,9 @@
+   }
+   if (o->onreadable) {
+     op_finisher.queue(o->onreadable);
+   }
++  op_finisher.queue(to_queue);
+   delete o;
+ }
+ 
+ 
+@@ -1843,16 +1847,18 @@
+ 
+   // this should queue in order because the journal does it's completions in order.
+   queue_op(osr, o);
+ 
+-  osr->dequeue_journal();
++  list<Context*> to_queue;
++  osr->dequeue_journal(&to_queue);
+ 
+   // do ondisk completions async, to prevent any onreadable_sync completions
+   // getting blocked behind an ondisk completion.
+   if (ondisk) {
+     dout(10) << " queueing ondisk " << ondisk << dendl;
+     ondisk_finisher.queue(ondisk);
+   }
++  ondisk_finisher.queue(to_queue);
+ }
+ 
+ int FileStore::_do_transactions(
+   list<Transaction*> &tls,
+@@ -2544,13 +2550,14 @@
+ 	t.dump(&f);
+ 	f.close_section();
+ 	f.flush(*_dout);
+ 	*_dout << dendl;
+-	assert(0 == "unexpected error");
+ 
+ 	if (r == -EMFILE) {
+ 	  dump_open_fds(g_ceph_context);
+ 	}
++
++	assert(0 == "unexpected error");
+       }
+     }
+ 
+     spos.op++;
+--- a/src/os/FileStore.h
++++ b/src/os/FileStore.h
+@@ -192,21 +192,72 @@
+   class OpSequencer : public Sequencer_impl {
+     Mutex qlock; // to protect q, for benefit of flush (peek/dequeue also protected by lock)
+     list<Op*> q;
+     list<uint64_t> jq;
++    list<pair<uint64_t, Context*> > flush_commit_waiters;
+     Cond cond;
+   public:
+     Sequencer *parent;
+     Mutex apply_lock;  // for apply mutual exclusion
+     
++    /// get_max_uncompleted
++    bool _get_max_uncompleted(
++      uint64_t *seq ///< [out] max uncompleted seq
++      ) {
++      assert(qlock.is_locked());
++      assert(seq);
++      *seq = 0;
++      if (q.empty() && jq.empty())
++	return true;
++
++      if (!q.empty())
++	*seq = q.back()->op;
++      if (!jq.empty() && jq.back() > *seq)
++	*seq = jq.back();
++
++      return false;
++    } /// @returns true if both queues are empty
++
++    /// get_min_uncompleted
++    bool _get_min_uncompleted(
++      uint64_t *seq ///< [out] min uncompleted seq
++      ) {
++      assert(qlock.is_locked());
++      assert(seq);
++      *seq = 0;
++      if (q.empty() && jq.empty())
++	return true;
++
++      if (!q.empty())
++	*seq = q.front()->op;
++      if (!jq.empty() && jq.front() < *seq)
++	*seq = jq.front();
++
++      return false;
++    } /// @returns true if both queues are empty
++
++    void _wake_flush_waiters(list<Context*> *to_queue) {
++      uint64_t seq;
++      if (_get_min_uncompleted(&seq))
++	seq = -1;
++
++      for (list<pair<uint64_t, Context*> >::iterator i =
++	     flush_commit_waiters.begin();
++	   i != flush_commit_waiters.end() && i->first < seq;
++	   flush_commit_waiters.erase(i++)) {
++	to_queue->push_back(i->second);
++      }
++    }
++
+     void queue_journal(uint64_t s) {
+       Mutex::Locker l(qlock);
+       jq.push_back(s);
+     }
+-    void dequeue_journal() {
++    void dequeue_journal(list<Context*> *to_queue) {
+       Mutex::Locker l(qlock);
+       jq.pop_front();
+       cond.Signal();
++      _wake_flush_waiters(to_queue);
+     }
+     void queue(Op *o) {
+       Mutex::Locker l(qlock);
+       q.push_back(o);
+@@ -214,22 +265,28 @@
+     Op *peek_queue() {
+       assert(apply_lock.is_locked());
+       return q.front();
+     }
+-    Op *dequeue() {
++
++    Op *dequeue(list<Context*> *to_queue) {
++      assert(to_queue);
+       assert(apply_lock.is_locked());
+       Mutex::Locker l(qlock);
+       Op *o = q.front();
+       q.pop_front();
+       cond.Signal();
++
++      _wake_flush_waiters(to_queue);
+       return o;
+     }
++
+     void flush() {
+       Mutex::Locker l(qlock);
+ 
+       while (g_conf->filestore_blackhole)
+ 	cond.Wait(qlock);  // wait forever
+ 
++
+       // get max for journal _or_ op queues
+       uint64_t seq = 0;
+       if (!q.empty())
+ 	seq = q.back()->op;
+@@ -242,8 +299,19 @@
+ 	       (!jq.empty() && jq.front() <= seq))
+ 	  cond.Wait(qlock);
+       }
+     }
++    bool flush_commit(Context *c) {
++      Mutex::Locker l(qlock);
++      uint64_t seq = 0;
++      if (_get_max_uncompleted(&seq)) {
++	delete c;
++	return true;
++      } else {
++	flush_commit_waiters.push_back(make_pair(seq, c));
++	return false;
++      }
++    }
+ 
+     OpSequencer()
+       : qlock("FileStore::OpSequencer::qlock", false, false),
+ 	parent(0),
+--- a/src/os/GenericObjectMap.cc
++++ b/src/os/GenericObjectMap.cc
+@@ -688,10 +688,8 @@
+   remove_header(old_header->cid, old_header->oid, old_header, t);
+   old_header->cid = cid;
+   old_header->oid = target;
+   set_header(cid, target, *old_header, t);
+-
+-  // "in_use" still hold the "seq"
+ }
+ 
+ int GenericObjectMap::init(bool do_upgrade)
+ {
+@@ -925,64 +923,43 @@
+   set<string> to_get;
+   to_get.insert(header_key(cid, oid));
+   _Header header;
+ 
+-  while (1) {
+-    map<string, bufferlist> out;
+-    bool try_again = false;
++  map<string, bufferlist> out;
+ 
+-    int r = db->get(GHOBJECT_TO_SEQ_PREFIX, to_get, &out);
+-    if (r < 0)
+-      return Header();
+-    if (out.empty())
+-      return Header();
+-
+-    bufferlist::iterator iter = out.begin()->second.begin();
+-    header.decode(iter);
+-
+-    while (in_use.count(header.seq)) {
+-      header_cond.Wait(header_lock);
+-
+-      // Another thread is hold this header, wait for it.
+-      // Because the seq of this object may change, such as clone
+-      // and rename operation, here need to look up "seq" again
+-      try_again = true;
+-    }
++  int r = db->get(GHOBJECT_TO_SEQ_PREFIX, to_get, &out);
++  if (r < 0)
++    return Header();
++  if (out.empty())
++    return Header();
+ 
+-    if (!try_again) {
+-      break;
+-    }
+-  }
++  bufferlist::iterator iter = out.begin()->second.begin();
++  header.decode(iter);
+ 
+-  Header ret = Header(new _Header(header), RemoveOnDelete(this));
+-  in_use.insert(ret->seq);
++  Header ret = Header(new _Header(header));
+   return ret;
+ }
+ 
+ GenericObjectMap::Header GenericObjectMap::_generate_new_header(
+     const coll_t &cid, const ghobject_t &oid, Header parent,
+     KeyValueDB::Transaction t)
+ {
+-  Header header = Header(new _Header(), RemoveOnDelete(this));
++  Header header = Header(new _Header());
+   header->seq = state.seq++;
+   if (parent) {
+     header->parent = parent->seq;
+   }
+   header->num_children = 1;
+   header->oid = oid;
+   header->cid = cid;
+-  assert(!in_use.count(header->seq));
+-  in_use.insert(header->seq);
+ 
+   write_state(t);
+   return header;
+ }
+ 
+ GenericObjectMap::Header GenericObjectMap::lookup_parent(Header input)
+ {
+   Mutex::Locker l(header_lock);
+-  while (in_use.count(input->parent))
+-    header_cond.Wait(header_lock);
+   map<string, bufferlist> out;
+   set<string> keys;
+   keys.insert(PARENT_KEY);
+ 
+@@ -998,15 +975,14 @@
+     assert(0);
+     return Header();
+   }
+ 
+-  Header header = Header(new _Header(), RemoveOnDelete(this));
++  Header header = Header(new _Header());
+   header->seq = input->parent;
+   bufferlist::iterator iter = out.begin()->second.begin();
+   header->decode(iter);
+   dout(20) << "lookup_parent: parent seq is " << header->seq << " with parent "
+            << header->parent << dendl;
+-  in_use.insert(header->seq);
+   return header;
+ }
+ 
+ GenericObjectMap::Header GenericObjectMap::lookup_create_header(
+--- a/src/os/GenericObjectMap.h
++++ b/src/os/GenericObjectMap.h
+@@ -73,14 +73,8 @@
+   /**
+    * Serializes access to next_seq as well as the in_use set
+    */
+   Mutex header_lock;
+-  Cond header_cond;
+-
+-  /**
+-   * Set of headers currently in use
+-   */
+-  set<uint64_t> in_use;
+ 
+   GenericObjectMap(KeyValueDB *db) : db(db), header_lock("GenericObjectMap") {}
+ 
+   int get(
+@@ -370,8 +364,14 @@
+   GenericObjectMapIterator _get_iterator(Header header, string prefix) {
+     return GenericObjectMapIterator(new GenericObjectMapIteratorImpl(this, header, prefix));
+   }
+ 
++  Header generate_new_header(const coll_t &cid, const ghobject_t &oid,
++                             Header parent, KeyValueDB::Transaction t) {
++    Mutex::Locker l(header_lock);
++    return _generate_new_header(cid, oid, parent, t);
++  }
++
+   // Scan keys in header into out_keys and out_values (if nonnull)
+   int scan(Header header, const string &prefix, const set<string> &in_keys,
+            set<string> *out_keys, map<string, bufferlist> *out_values);
+ 
+@@ -393,13 +393,8 @@
+    * Has the side effect of syncronously saving the new GenericObjectMap state
+    */
+   Header _generate_new_header(const coll_t &cid, const ghobject_t &oid,
+                               Header parent, KeyValueDB::Transaction t);
+-  Header generate_new_header(const coll_t &cid, const ghobject_t &oid,
+-                             Header parent, KeyValueDB::Transaction t) {
+-    Mutex::Locker l(header_lock);
+-    return _generate_new_header(cid, oid, parent, t);
+-  }
+ 
+   // Lookup leaf header for c oid
+   Header _lookup_header(const coll_t &cid, const ghobject_t &oid);
+ 
+@@ -424,28 +419,8 @@
+ 
+   // Sets header @see set_header
+   void _set_header(Header header, const bufferlist &bl,
+                    KeyValueDB::Transaction t);
+-
+-  /** 
+-   * Removes header seq lock once Header is out of scope
+-   * @see _lookup_header
+-   * @see lookup_parent
+-   * @see generate_new_header
+-   */
+-  class RemoveOnDelete {
+-  public:
+-    GenericObjectMap *db;
+-    RemoveOnDelete(GenericObjectMap *db) :
+-      db(db) {}
+-    void operator() (_Header *header) {
+-      Mutex::Locker l(db->header_lock);
+-      db->in_use.erase(header->seq);
+-      db->header_cond.Signal();
+-      delete header;
+-    }
+-  };
+-  friend class RemoveOnDelete;
+ };
+ WRITE_CLASS_ENCODER(GenericObjectMap::_Header)
+ WRITE_CLASS_ENCODER(GenericObjectMap::State)
+ 
+--- a/src/os/KeyValueStore.cc
++++ b/src/os/KeyValueStore.cc
+@@ -68,90 +68,78 @@
+ const string KeyValueStore::COLLECTION_ATTR = "__COLL_ATTR__";
+ 
+ // ============== StripObjectMap Implementation =================
+ 
+-void StripObjectMap::sync_wrap(StripObjectHeader &strip_header,
+-                               KeyValueDB::Transaction t,
+-                               const SequencerPosition &spos)
+-{
+-  dout(10) << __func__ << " cid: " << strip_header.cid << "oid: "
+-           << strip_header.oid << " setting spos to " << strip_header.spos
+-           << dendl;
+-  strip_header.spos = spos;
+-  strip_header.header->data.clear();
+-  ::encode(strip_header, strip_header.header->data);
+-
+-  sync(strip_header.header, t);
+-}
+-
+-bool StripObjectMap::check_spos(const StripObjectHeader &header,
+-                                const SequencerPosition &spos)
+-{
+-  if (spos > header.spos) {
+-    stringstream out;
+-    dout(10) << "cid: " << "oid: " << header.oid
+-             << " not skipping op, *spos " << spos << dendl;
+-    dout(10) << " > header.spos " << header.spos << dendl;
+-    return false;
+-  } else {
+-    dout(10) << "cid: " << "oid: " << header.oid << " skipping op, spos "
+-             << spos << " <= header.spos " << header.spos << dendl;
+-    return true;
+-  }
+-}
+-
+-int StripObjectMap::save_strip_header(StripObjectHeader &strip_header,
+-                                      const SequencerPosition &spos,
++int StripObjectMap::save_strip_header(StripObjectHeaderRef strip_header,
+                                       KeyValueDB::Transaction t)
+ {
+-  strip_header.spos = spos;
+-  strip_header.header->data.clear();
+-  ::encode(strip_header, strip_header.header->data);
++  strip_header->header->data.clear();
++  ::encode(*strip_header, strip_header->header->data);
+ 
+-  set_header(strip_header.cid, strip_header.oid, *(strip_header.header), t);
++  set_header(strip_header->cid, strip_header->oid, *(strip_header->header), t);
+   return 0;
+ }
+ 
+ int StripObjectMap::create_strip_header(const coll_t &cid,
+                                         const ghobject_t &oid,
+-                                        StripObjectHeader &strip_header,
++                                        StripObjectHeaderRef *strip_header,
+                                         KeyValueDB::Transaction t)
+ {
+-  Header header = lookup_create_header(cid, oid, t);
++  Header header = generate_new_header(cid, oid, Header(), t);
+   if (!header)
+     return -EINVAL;
+ 
+-  strip_header.oid = oid;
+-  strip_header.cid = cid;
+-  strip_header.header = header;
++  StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader());
++  tmp->oid = oid;
++  tmp->cid = cid;
++  tmp->header = header;
++  if (strip_header)
++    *strip_header = tmp;
+ 
+   return 0;
+ }
+ 
+ int StripObjectMap::lookup_strip_header(const coll_t &cid,
+                                         const ghobject_t &oid,
+-                                        StripObjectHeader &strip_header)
++                                        StripObjectHeaderRef *strip_header)
+ {
++  if (cid != coll_t()) {
++    Mutex::Locker l(lock);
++    pair<coll_t, StripObjectHeaderRef> p;
++    if (caches.lookup(oid, &p)) {
++      if (p.first == cid) {
++        *strip_header = p.second;
++        return 0;
++      }
++    }
++  }
+   Header header = lookup_header(cid, oid);
+ 
+   if (!header) {
+     dout(20) << "lookup_strip_header failed to get strip_header "
+              << " cid " << cid <<" oid " << oid << dendl;
+     return -ENOENT;
+   }
+ 
++
++  StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader());
+   if (header->data.length()) {
+     bufferlist::iterator bliter = header->data.begin();
+-    ::decode(strip_header, bliter);
++    ::decode(*tmp, bliter);
+   }
+ 
+-  if (strip_header.strip_size == 0)
+-    strip_header.strip_size = default_strip_size;
++  if (tmp->strip_size == 0)
++    tmp->strip_size = default_strip_size;
+ 
+-  strip_header.oid = oid;
+-  strip_header.cid = cid;
+-  strip_header.header = header;
++  tmp->oid = oid;
++  tmp->cid = cid;
++  tmp->header = header;
+ 
++  {
++    Mutex::Locker l(lock);
++    caches.add(oid, make_pair(cid, tmp));
++  }
++  *strip_header = tmp;
+   dout(10) << "lookup_strip_header done " << " cid " << cid << " oid "
+            << oid << dendl;
+   return 0;
+ }
+@@ -193,125 +181,114 @@
+   dout(10) << "file_to_extents done " << dendl;
+   return 0;
+ }
+ 
+-void StripObjectMap::clone_wrap(StripObjectHeader &old_header,
++void StripObjectMap::clone_wrap(StripObjectHeaderRef old_header,
+                                 const coll_t &cid, const ghobject_t &oid,
+                                 KeyValueDB::Transaction t,
+-                                StripObjectHeader *origin_header,
+-                                StripObjectHeader *target_header)
++                                StripObjectHeaderRef *target_header)
+ {
+   Header new_origin_header;
++  StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader());
+ 
+-  if (target_header)
+-    *target_header = old_header;
+-  if (origin_header)
+-    *origin_header = old_header;
+-
+-  clone(old_header.header, cid, oid, t, &new_origin_header,
+-        &target_header->header);
++  clone(old_header->header, cid, oid, t, &new_origin_header,
++        &tmp->header);
+ 
+-  if(origin_header)
+-    origin_header->header = new_origin_header;
++  tmp->oid = oid;
++  tmp->cid = cid;
++  tmp->strip_size = old_header->strip_size;
++  tmp->max_size = old_header->max_size;
++  tmp->bits = old_header->bits;
++  old_header->header = new_origin_header;
+ 
+-  if (target_header) {
+-    target_header->oid = oid;
+-    target_header->cid = cid;
+-  }
++  if (target_header)
++    *target_header = tmp;
+ }
+ 
+-void StripObjectMap::rename_wrap(const coll_t &cid, const ghobject_t &oid,
++void StripObjectMap::rename_wrap(StripObjectHeaderRef old_header, const coll_t &cid, const ghobject_t &oid,
+                                  KeyValueDB::Transaction t,
+-                                 StripObjectHeader *header)
++                                 StripObjectHeaderRef *new_header)
+ {
+-  assert(header);
+-  rename(header->header, cid, oid, t);
++  rename(old_header->header, cid, oid, t);
+ 
+-  if (header) {
+-    header->oid = oid;
+-    header->cid = cid;
+-  }
++  StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader());
++  tmp->strip_size = old_header->strip_size;
++  tmp->max_size = old_header->max_size;
++  tmp->bits = old_header->bits;
++  tmp->header = old_header->header;
++  tmp->oid = oid;
++  tmp->cid = cid;
++
++  if (new_header)
++    *new_header = tmp;
++
++  old_header->header = Header();
++  old_header->deleted = true;
+ }
+ 
+-int StripObjectMap::get_values_with_header(const StripObjectHeader &header,
++int StripObjectMap::get_values_with_header(const StripObjectHeaderRef header,
+                                            const string &prefix,
+                                            const set<string> &keys,
+                                            map<string, bufferlist> *out)
+ {
+-  return scan(header.header, prefix, keys, 0, out);
++  return scan(header->header, prefix, keys, 0, out);
+ }
+ 
+-int StripObjectMap::get_keys_with_header(const StripObjectHeader &header,
++int StripObjectMap::get_keys_with_header(const StripObjectHeaderRef header,
+                                          const string &prefix,
+                                          set<string> *keys)
+ {
+-  ObjectMap::ObjectMapIterator iter = _get_iterator(header.header, prefix);
++  ObjectMap::ObjectMapIterator iter = _get_iterator(header->header, prefix);
+   for (; iter->valid(); iter->next()) {
+     if (iter->status())
+       return iter->status();
+     keys->insert(iter->key());
+   }
+   return 0;
+ }
+ 
+-int StripObjectMap::get_with_header(const StripObjectHeader &header,
++int StripObjectMap::get_with_header(const StripObjectHeaderRef header,
+                         const string &prefix, map<string, bufferlist> *out)
+ {
+-  ObjectMap::ObjectMapIterator iter = _get_iterator(header.header, prefix);
++  ObjectMap::ObjectMapIterator iter = _get_iterator(header->header, prefix);
+   for (iter->seek_to_first(); iter->valid(); iter->next()) {
+     if (iter->status())
+       return iter->status();
+     out->insert(make_pair(iter->key(), iter->value()));
+   }
+ 
+   return 0;
+ }
+-// =========== KeyValueStore::SubmitManager Implementation ==============
+-
+-uint64_t KeyValueStore::SubmitManager::op_submit_start()
+-{
+-  lock.Lock();
+-  uint64_t op = ++op_seq;
+-  dout(10) << "op_submit_start " << op << dendl;
+-  return op;
+-}
+-
+-void KeyValueStore::SubmitManager::op_submit_finish(uint64_t op)
+-{
+-  dout(10) << "op_submit_finish " << op << dendl;
+-  if (op != op_submitted + 1) {
+-      dout(0) << "op_submit_finish " << op << " expected " << (op_submitted + 1)
+-          << ", OUT OF ORDER" << dendl;
+-      assert(0 == "out of order op_submit_finish");
+-  }
+-  op_submitted = op;
+-  lock.Unlock();
+-}
+-
+ 
+ // ========= KeyValueStore::BufferTransaction Implementation ============
+ 
+ int KeyValueStore::BufferTransaction::lookup_cached_header(
+     const coll_t &cid, const ghobject_t &oid,
+-    StripObjectMap::StripObjectHeader **strip_header,
++    StripObjectMap::StripObjectHeaderRef *strip_header,
+     bool create_if_missing)
+ {
+-  StripObjectMap::StripObjectHeader header;
++  StripObjectMap::StripObjectHeaderRef header;
+   int r = 0;
+ 
+   StripHeaderMap::iterator it = strip_headers.find(make_pair(cid, oid));
+   if (it != strip_headers.end()) {
+-    if (it->second.deleted)
++
++    if (!it->second->deleted) {
++      if (strip_header)
++        *strip_header = it->second;
++      return 0;
++    } else if (!create_if_missing) {
+       return -ENOENT;
++    }
+ 
+-    if (strip_header)
+-      *strip_header = &it->second;
+-    return 0;
++    // If (it->second.deleted && create_if_missing) go down
++    r = -ENOENT;
++  } else {
++    r = store->backend->lookup_strip_header(cid, oid, &header);
+   }
+ 
+-  r = store->backend->lookup_strip_header(cid, oid, header);
+-  if (r < 0 && create_if_missing) {
+-    r = store->backend->create_strip_header(cid, oid, header, t);
++  if (r == -ENOENT && create_if_missing) {
++    r = store->backend->create_strip_header(cid, oid, &header, t);
+   }
+ 
+   if (r < 0) {
+     dout(10) << __func__  << " " << cid << "/" << oid << " "
+@@ -320,23 +297,23 @@
+   }
+ 
+   strip_headers[make_pair(cid, oid)] = header;
+   if (strip_header)
+-    *strip_header = &strip_headers[make_pair(cid, oid)];
++    *strip_header = strip_headers[make_pair(cid, oid)];
+   return r;
+ }
+ 
+ int KeyValueStore::BufferTransaction::get_buffer_keys(
+-    StripObjectMap::StripObjectHeader &strip_header, const string &prefix,
++    StripObjectMap::StripObjectHeaderRef strip_header, const string &prefix,
+     const set<string> &keys, map<string, bufferlist> *out)
+ {
+   set<string> need_lookup;
+ 
+   for (set<string>::iterator it = keys.begin(); it != keys.end(); ++it) {
+     map<pair<string, string>, bufferlist>::iterator i =
+-        strip_header.buffers.find(make_pair(prefix, *it));
++        strip_header->buffers.find(make_pair(prefix, *it));
+ 
+-    if (i != strip_header.buffers.end()) {
++    if (i != strip_header->buffers.end()) {
+       (*out)[*it].swap(i->second);
+     } else {
+       need_lookup.insert(*it);
+     }
+@@ -345,117 +322,118 @@
+   if (!need_lookup.empty()) {
+     int r = store->backend->get_values_with_header(strip_header, prefix,
+                                                    need_lookup, out);
+     if (r < 0) {
+-      dout(10) << __func__  << " " << strip_header.cid << "/"
+-               << strip_header.oid << " " << " r = " << r << dendl;
++      dout(10) << __func__  << " " << strip_header->cid << "/"
++               << strip_header->oid << " " << " r = " << r << dendl;
+       return r;
+     }
+   }
+ 
+   return 0;
+ }
+ 
+ void KeyValueStore::BufferTransaction::set_buffer_keys(
+-     StripObjectMap::StripObjectHeader &strip_header,
++     StripObjectMap::StripObjectHeaderRef strip_header,
+      const string &prefix, map<string, bufferlist> &values)
+ {
+-  store->backend->set_keys(strip_header.header, prefix, values, t);
++  store->backend->set_keys(strip_header->header, prefix, values, t);
+ 
+   for (map<string, bufferlist>::iterator iter = values.begin();
+        iter != values.end(); ++iter) {
+-    strip_header.buffers[make_pair(prefix, iter->first)].swap(iter->second);
++    strip_header->buffers[make_pair(prefix, iter->first)].swap(iter->second);
+   }
+ }
+ 
+ int KeyValueStore::BufferTransaction::remove_buffer_keys(
+-     StripObjectMap::StripObjectHeader &strip_header, const string &prefix,
++     StripObjectMap::StripObjectHeaderRef strip_header, const string &prefix,
+      const set<string> &keys)
+ {
+   for (set<string>::iterator iter = keys.begin(); iter != keys.end(); ++iter) {
+-    strip_header.buffers[make_pair(prefix, *iter)] = bufferlist();
++    strip_header->buffers[make_pair(prefix, *iter)] = bufferlist();
+   }
+ 
+-  return store->backend->rm_keys(strip_header.header, prefix, keys, t);
++  return store->backend->rm_keys(strip_header->header, prefix, keys, t);
+ }
+ 
+ void KeyValueStore::BufferTransaction::clear_buffer_keys(
+-     StripObjectMap::StripObjectHeader &strip_header, const string &prefix)
++     StripObjectMap::StripObjectHeaderRef strip_header, const string &prefix)
+ {
+-  for (map<pair<string, string>, bufferlist>::iterator iter = strip_header.buffers.begin();
+-       iter != strip_header.buffers.end(); ++iter) {
++  for (map<pair<string, string>, bufferlist>::iterator iter = strip_header->buffers.begin();
++       iter != strip_header->buffers.end(); ++iter) {
+     if (iter->first.first == prefix)
+       iter->second = bufferlist();
+   }
+ }
+ 
+ int KeyValueStore::BufferTransaction::clear_buffer(
+-     StripObjectMap::StripObjectHeader &strip_header)
++     StripObjectMap::StripObjectHeaderRef strip_header)
+ {
+-  strip_header.deleted = true;
++  strip_header->deleted = true;
+ 
+-  return store->backend->clear(strip_header.header, t);
++  InvalidateCacheContext *c = new InvalidateCacheContext(store, strip_header->cid, strip_header->oid);
++  finishes.push_back(c);
++  return store->backend->clear(strip_header->header, t);
+ }
+ 
+ void KeyValueStore::BufferTransaction::clone_buffer(
+-    StripObjectMap::StripObjectHeader &old_header,
++    StripObjectMap::StripObjectHeaderRef old_header,
+     const coll_t &cid, const ghobject_t &oid)
+ {
+   // Remove target ahead to avoid dead lock
+   strip_headers.erase(make_pair(cid, oid));
+ 
+-  StripObjectMap::StripObjectHeader new_origin_header, new_target_header;
++  StripObjectMap::StripObjectHeaderRef new_target_header;
+ 
+-  store->backend->clone_wrap(old_header, cid, oid, t,
+-                             &new_origin_header, &new_target_header);
++  store->backend->clone_wrap(old_header, cid, oid, t, &new_target_header);
+ 
+   // FIXME: Lacking of lock for origin header(now become parent), it will
+   // cause other operation can get the origin header while submitting
+   // transactions
+-  strip_headers[make_pair(cid, old_header.oid)] = new_origin_header;
+   strip_headers[make_pair(cid, oid)] = new_target_header;
+ }
+ 
+ void KeyValueStore::BufferTransaction::rename_buffer(
+-    StripObjectMap::StripObjectHeader &old_header,
++    StripObjectMap::StripObjectHeaderRef old_header,
+     const coll_t &cid, const ghobject_t &oid)
+ {
+-  if (store->backend->check_spos(old_header, spos))
+-    return ;
+-
+   // FIXME: Lacking of lock for origin header, it will cause other operation
+   // can get the origin header while submitting transactions
+-  store->backend->rename_wrap(cid, oid, t, &old_header);
++  StripObjectMap::StripObjectHeaderRef new_header;
++  store->backend->rename_wrap(old_header, cid, oid, t, &new_header);
+ 
+-  strip_headers.erase(make_pair(old_header.cid, old_header.oid));
+-  strip_headers[make_pair(cid, oid)] = old_header;
++  InvalidateCacheContext *c = new InvalidateCacheContext(store, old_header->cid, old_header->oid);
++  finishes.push_back(c);
++  strip_headers[make_pair(cid, oid)] = new_header;
+ }
+ 
+ int KeyValueStore::BufferTransaction::submit_transaction()
+ {
+   int r = 0;
+ 
+   for (StripHeaderMap::iterator header_iter = strip_headers.begin();
+        header_iter != strip_headers.end(); ++header_iter) {
+-    StripObjectMap::StripObjectHeader header = header_iter->second;
++    StripObjectMap::StripObjectHeaderRef header = header_iter->second;
+ 
+-    if (store->backend->check_spos(header, spos))
++    if (header->deleted)
+       continue;
+ 
+-    if (header.deleted)
+-      continue;
++    r = store->backend->save_strip_header(header, t);
+ 
+-    r = store->backend->save_strip_header(header, spos, t);
+     if (r < 0) {
+       dout(10) << __func__ << " save strip header failed " << dendl;
+       goto out;
+     }
+   }
+ 
+-out:
++  r = store->backend->submit_transaction(t);
++  for (list<Context*>::iterator it = finishes.begin(); it != finishes.end(); ++it) {
++    (*it)->complete(r);
++  }
+ 
++out:
+   dout(5) << __func__ << " r = " << r << dendl;
+-  return store->backend->submit_transaction(t);
++  return r;
+ }
+ 
+ // =========== KeyValueStore Intern Helper Implementation ==============
+ 
+@@ -494,9 +472,9 @@
+                              const char *name, bool do_update) :
+   ObjectStore(base),
+   internal_name(name),
+   basedir(base),
+-  fsid_fd(-1), op_fd(-1), current_fd(-1),
++  fsid_fd(-1), current_fd(-1),
+   kv_type(KV_TYPE_NONE),
+   backend(NULL),
+   ondisk_finisher(g_ceph_context),
+   lock("KeyValueStore::lock"),
+@@ -905,12 +883,8 @@
+   if (fsid_fd >= 0) {
+     VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
+     fsid_fd = -1;
+   }
+-  if (op_fd >= 0) {
+-    VOID_TEMP_FAILURE_RETRY(::close(op_fd));
+-    op_fd = -1;
+-  }
+   if (current_fd >= 0) {
+     VOID_TEMP_FAILURE_RETRY(::close(current_fd));
+     current_fd = -1;
+   }
+@@ -962,16 +936,11 @@
+   }
+ 
+   Op *o = build_op(tls, ondisk, onreadable, onreadable_sync, osd_op);
+   op_queue_reserve_throttle(o, handle);
+-  uint64_t op = submit_manager.op_submit_start();
+-  o->op = op;
+-  dout(5) << "queue_transactions (trailing journal) " << op << " "
+-          << tls <<dendl;
++  dout(5) << "queue_transactions (trailing journal) " << " " << tls <<dendl;
+   queue_op(osr, o);
+ 
+-  submit_manager.op_submit_finish(op);
+-
+   return 0;
+ }
+ 
+ 
+@@ -1087,9 +1056,10 @@
+ }
+ 
+ void KeyValueStore::_finish_op(OpSequencer *osr)
+ {
+-  Op *o = osr->dequeue();
++  list<Context*> to_queue;
++  Op *o = osr->dequeue(&to_queue);
+ 
+   dout(10) << "_finish_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << dendl;
+   osr->apply_lock.Unlock();  // locked in _do_op
+   op_queue_release_throttle(o);
+@@ -1101,8 +1071,9 @@
+   if (o->onreadable_sync) {
+     o->onreadable_sync->complete(0);
+   }
+   op_finisher.queue(o->onreadable);
++  op_finisher.queue(to_queue);
+   delete o;
+ }
+ 
+ // Combine all the ops in the same transaction using "BufferTransaction" and
+@@ -1125,15 +1096,14 @@
+     ops += (*p)->get_num_ops();
+   }
+ 
+   int trans_num = 0;
+-  SequencerPosition spos(op_seq, trans_num, 0);
+-  BufferTransaction bt(this, spos);
++  BufferTransaction bt(this);
+ 
+   for (list<Transaction*>::iterator p = tls.begin();
+        p != tls.end();
+        ++p, trans_num++) {
+-    r = _do_transaction(**p, bt, spos, handle);
++    r = _do_transaction(**p, bt, handle);
+     if (r < 0)
+       break;
+     if (handle)
+       handle->reset_tp_timeout();
+@@ -1148,14 +1118,14 @@
+ }
+ 
+ unsigned KeyValueStore::_do_transaction(Transaction& transaction,
+                                         BufferTransaction &t,
+-                                        SequencerPosition& spos,
+                                         ThreadPool::TPHandle *handle)
+ {
+   dout(10) << "_do_transaction on " << &transaction << dendl;
+ 
+   Transaction::iterator i = transaction.begin();
++  uint64_t op_num = 0;
+ 
+   while (i.have_op()) {
+     if (handle)
+       handle->reset_tp_timeout();
+@@ -1448,9 +1418,15 @@
+       }
+       break;
+ 
+     case Transaction::OP_SETALLOCHINT:
+-      // TODO: can kvstore make use of the hint?
++      {
++        // TODO: can kvstore make use of the hint?
++        coll_t cid(i.get_cid());
++        ghobject_t oid = i.get_oid();
++        (void)i.get_length();  // discard result
++        (void)i.get_length();  // discard result
++      }
+       break;
+ 
+     default:
+       derr << "bad op " << op << dendl;
+@@ -1486,10 +1462,9 @@
+           msg = "ENOTEMPTY suggests garbage data in osd data dir";
+         }
+ 
+         dout(0) << " error " << cpp_strerror(r) << " not handled on operation "
+-                << op << " (" << spos << ", or op " << spos.op
+-                << ", counting from 0)" << dendl;
++                << op << " op " << op_num << ", counting from 0)" << dendl;
+         dout(0) << msg << dendl;
+         dout(0) << " transaction dump:\n";
+         JSONFormatter f(true);
+         f.open_object_section("transaction");
+@@ -1504,9 +1479,9 @@
+         }
+       }
+     }
+ 
+-    spos.op++;
++    op_num++;
+   }
+ 
+   return 0;  // FIXME count errors
+ }
+@@ -1519,11 +1494,11 @@
+ {
+   dout(10) << __func__ << "collection: " << cid << " object: " << oid
+            << dendl;
+   int r;
+-  StripObjectMap::StripObjectHeader header;
++  StripObjectMap::StripObjectHeaderRef header;
+ 
+-  r = backend->lookup_strip_header(cid, oid, header);
++  r = backend->lookup_strip_header(cid, oid, &header);
+   if (r < 0) {
+     return false;
+   }
+ 
+@@ -1534,44 +1509,44 @@
+                         struct stat *st, bool allow_eio)
+ {
+   dout(10) << "stat " << cid << "/" << oid << dendl;
+ 
+-  StripObjectMap::StripObjectHeader header;
++  StripObjectMap::StripObjectHeaderRef header;
+ 
+-  int r = backend->lookup_strip_header(cid, oid, header);
++  int r = backend->lookup_strip_header(cid, oid, &header);
+   if (r < 0) {
+     dout(10) << "stat " << cid << "/" << oid << "=" << r << dendl;
+     return -ENOENT;
+   }
+ 
+-  st->st_blocks = header.max_size / header.strip_size;
+-  if (header.max_size % header.strip_size)
++  st->st_blocks = header->max_size / header->strip_size;
++  if (header->max_size % header->strip_size)
+     st->st_blocks++;
+   st->st_nlink = 1;
+-  st->st_size = header.max_size;
+-  st->st_blksize = header.strip_size;
++  st->st_size = header->max_size;
++  st->st_blksize = header->strip_size;
+ 
+   return r;
+ }
+ 
+-int KeyValueStore::_generic_read(StripObjectMap::StripObjectHeader &header,
++int KeyValueStore::_generic_read(StripObjectMap::StripObjectHeaderRef header,
+                                  uint64_t offset, size_t len, bufferlist& bl,
+                                  bool allow_eio, BufferTransaction *bt)
+ {
+-  if (header.max_size < offset) {
+-    dout(10) << __func__ << " " << header.cid << "/" << header.oid << ")"
++  if (header->max_size < offset) {
++    dout(10) << __func__ << " " << header->cid << "/" << header->oid << ")"
+              << " offset exceed the length of bl"<< dendl;
+     return 0;
+   }
+ 
+   if (len == 0)
+-    len = header.max_size - offset;
++    len = header->max_size - offset;
+ 
+-  if (offset + len > header.max_size)
+-    len = header.max_size - offset;
++  if (offset + len > header->max_size)
++    len = header->max_size - offset;
+ 
+   vector<StripObjectMap::StripExtent> extents;
+-  StripObjectMap::file_to_extents(offset, len, header.strip_size,
++  StripObjectMap::file_to_extents(offset, len, header->strip_size,
+                                   extents);
+   map<string, bufferlist> out;
+   set<string> keys;
+ 
+@@ -1579,35 +1554,35 @@
+        iter != extents.end(); ++iter) {
+     bufferlist old;
+     string key = strip_object_key(iter->no);
+ 
+-    if (bt && header.buffers.count(make_pair(OBJECT_STRIP_PREFIX, key))) {
++    if (bt && header->buffers.count(make_pair(OBJECT_STRIP_PREFIX, key))) {
+       // use strip_header buffer
+-      assert(header.bits[iter->no]);
+-      out[key] = header.buffers[make_pair(OBJECT_STRIP_PREFIX, key)];
+-    } else if (header.bits[iter->no]) {
++      assert(header->bits[iter->no]);
++      out[key] = header->buffers[make_pair(OBJECT_STRIP_PREFIX, key)];
++    } else if (header->bits[iter->no]) {
+       keys.insert(key);
+     }
+   }
+ 
+   int r = backend->get_values_with_header(header, OBJECT_STRIP_PREFIX, keys, &out);
+   if (r < 0) {
+-    dout(10) << __func__ << " " << header.cid << "/" << header.oid << " "
++    dout(10) << __func__ << " " << header->cid << "/" << header->oid << " "
+              << offset << "~" << len << " = " << r << dendl;
+     return r;
+   } else if (out.size() != keys.size()) {
+     dout(0) << __func__ << " broken header or missing data in backend "
+-            << header.cid << "/" << header.oid << " " << offset << "~"
++            << header->cid << "/" << header->oid << " " << offset << "~"
+             << len << " = " << r << dendl;
+     return -EBADF;
+   }
+ 
+   for (vector<StripObjectMap::StripExtent>::iterator iter = extents.begin();
+        iter != extents.end(); ++iter) {
+     string key = strip_object_key(iter->no);
+ 
+-    if (header.bits[iter->no]) {
+-      if (iter->len == header.strip_size) {
++    if (header->bits[iter->no]) {
++      if (iter->len == header->strip_size) {
+         bl.claim_append(out[key]);
+       } else {
+         out[key].copy(iter->offset, iter->len, bl);
+       }
+@@ -1615,9 +1590,9 @@
+       bl.append_zero(iter->len);
+     }
+   }
+ 
+-  dout(10) << __func__ << " " << header.cid << "/" << header.oid << " "
++  dout(10) << __func__ << " " << header->cid << "/" << header->oid << " "
+            << offset << "~" << bl.length() << "/" << len << " r = " << r
+            << dendl;
+ 
+   return bl.length();
+@@ -1629,11 +1604,11 @@
+ {
+   dout(15) << __func__ << " " << cid << "/" << oid << " " << offset << "~"
+            << len << dendl;
+ 
+-  StripObjectMap::StripObjectHeader header;
++  StripObjectMap::StripObjectHeaderRef header;
+ 
+-  int r = backend->lookup_strip_header(cid, oid, header);
++  int r = backend->lookup_strip_header(cid, oid, &header);
+ 
+   if (r < 0) {
+     dout(10) << __func__ << " " << cid << "/" << oid << " " << offset << "~"
+               << len << " header isn't exist: r = " << r << dendl;
+@@ -1648,25 +1623,26 @@
+ {
+   dout(10) << __func__ << " " << cid << " " << oid << " " << offset << "~"
+            << len << dendl;
+   int r;
+-  StripObjectMap::StripObjectHeader header;
++  StripObjectMap::StripObjectHeaderRef header;
+ 
+-  r = backend->lookup_strip_header(cid, oid, header);
++  r = backend->lookup_strip_header(cid, oid, &header);
+   if (r < 0) {
+     dout(10) << "fiemap " << cid << "/" << oid << " " << offset << "~" << len
+              << " failed to get header: r = " << r << dendl;
+     return r;
+   }
+ 
+   vector<StripObjectMap::StripExtent> extents;
+-  StripObjectMap::file_to_extents(offset, len, header.strip_size,
++  StripObjectMap::file_to_extents(offset, len, header->strip_size,
+                                   extents);
+ 
+   map<uint64_t, uint64_t> m;
+   for (vector<StripObjectMap::StripExtent>::iterator iter = extents.begin();
+        iter != extents.end(); ++iter) {
+-    m[iter->offset] = iter->len;
++    uint64_t off = iter->no * header->strip_size + iter->offset;
++    m[off] = iter->len;
+   }
+   ::encode(m, bl);
+   return 0;
+ }
+@@ -1676,18 +1652,20 @@
+ {
+   dout(15) << __func__ << " " << cid << "/" << oid << dendl;
+ 
+   int r;
+-  StripObjectMap::StripObjectHeader *header;
++  StripObjectMap::StripObjectHeaderRef header;
+ 
+   r = t.lookup_cached_header(cid, oid, &header, false);
+   if (r < 0) {
+     dout(10) << __func__ << " " << cid << "/" << oid << " "
+              << " failed to get header: r = " << r << dendl;
+     return r;
+   }
+ 
+-  r = t.clear_buffer(*header);
++  header->max_size = 0;
++  header->bits.clear();
++  r = t.clear_buffer(header);
+ 
+   dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl;
+   return r;
+ }
+@@ -1698,9 +1676,9 @@
+   dout(15) << __func__ << " " << cid << "/" << oid << " size " << size
+            << dendl;
+ 
+   int r;
+-  StripObjectMap::StripObjectHeader *header;
++  StripObjectMap::StripObjectHeaderRef header;
+ 
+   r = t.lookup_cached_header(cid, oid, &header, false);
+   if (r < 0) {
+     dout(10) << __func__ << " " << cid << "/" << oid << " " << size
+@@ -1724,9 +1702,9 @@
+       set<string> lookup_keys;
+       string key = strip_object_key(iter->no);
+ 
+       lookup_keys.insert(key);
+-      r = t.get_buffer_keys(*header, OBJECT_STRIP_PREFIX,
++      r = t.get_buffer_keys(header, OBJECT_STRIP_PREFIX,
+                             lookup_keys, &values);
+       if (r < 0) {
+         dout(10) << __func__ << " " << cid << "/" << oid << " "
+                  << size << " = " << r << dendl;
+@@ -1742,9 +1720,9 @@
+       value.append_zero(header->strip_size-iter->offset);
+       assert(value.length() == header->strip_size);
+       value.swap(values[key]);
+ 
+-      t.set_buffer_keys(*header, OBJECT_STRIP_PREFIX, values);
++      t.set_buffer_keys(header, OBJECT_STRIP_PREFIX, values);
+       ++iter;
+     }
+ 
+     set<string> keys;
+@@ -1753,9 +1731,9 @@
+         keys.insert(strip_object_key(iter->no));
+         header->bits[iter->no] = 0;
+       }
+     }
+-    r = t.remove_buffer_keys(*header, OBJECT_STRIP_PREFIX, keys);
++    r = t.remove_buffer_keys(header, OBJECT_STRIP_PREFIX, keys);
+     if (r < 0) {
+       dout(10) << __func__ << " " << cid << "/" << oid << " "
+                << size << " = " << r << dendl;
+       return r;
+@@ -1775,9 +1753,9 @@
+ {
+   dout(15) << __func__ << " " << cid << "/" << oid << dendl;
+ 
+   int r;
+-  StripObjectMap::StripObjectHeader *header;
++  StripObjectMap::StripObjectHeaderRef header;
+ 
+   r = t.lookup_cached_header(cid, oid, &header, true);
+   if (r < 0) {
+     dout(10) << __func__ << " " << cid << "/" << oid << " "
+@@ -1789,44 +1767,44 @@
+   dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl;
+   return r;
+ }
+ 
+-int KeyValueStore::_generic_write(StripObjectMap::StripObjectHeader &header,
++int KeyValueStore::_generic_write(StripObjectMap::StripObjectHeaderRef header,
+                                   uint64_t offset, size_t len,
+                                   const bufferlist& bl, BufferTransaction &t,
+                                   bool replica)
+ {
+   if (len > bl.length())
+     len = bl.length();
+ 
+-  if (len + offset > header.max_size) {
+-    header.max_size = len + offset;
+-    header.bits.resize(header.max_size/header.strip_size+1);
++  if (len + offset > header->max_size) {
++    header->max_size = len + offset;
++    header->bits.resize(header->max_size/header->strip_size+1);
+   }
+ 
+   vector<StripObjectMap::StripExtent> extents;
+-  StripObjectMap::file_to_extents(offset, len, header.strip_size,
++  StripObjectMap::file_to_extents(offset, len, header->strip_size,
+                                   extents);
+ 
+   map<string, bufferlist> out;
+   set<string> keys;
+   for (vector<StripObjectMap::StripExtent>::iterator iter = extents.begin();
+        iter != extents.end(); ++iter) {
+-    if (header.bits[iter->no] && !(iter->offset == 0 &&
+-                                   iter->len == header.strip_size))
++    if (header->bits[iter->no] && !(iter->offset == 0 &&
++                                   iter->len == header->strip_size))
+       keys.insert(strip_object_key(iter->no));
+   }
+ 
+   int r = t.get_buffer_keys(header, OBJECT_STRIP_PREFIX, keys, &out);
+   if (r < 0) {
+-    dout(10) << __func__ << " failed to get value " << header.cid << "/"
+-              << header.oid << " " << offset << "~" << len << " = " << r
++    dout(10) << __func__ << " failed to get value " << header->cid << "/"
++              << header->oid << " " << offset << "~" << len << " = " << r
+               << dendl;
+     return r;
+   } else if (keys.size() != out.size()) {
+     // Error on header.bits or the corresponding key/value pair is missing
+     dout(0) << __func__ << " broken header or missing data in backend "
+-            << header.cid << "/" << header.oid << " " << offset << "~"
++            << header->cid << "/" << header->oid << " " << offset << "~"
+             << len << " = " << r << dendl;
+     return -EBADF;
+   }
+ 
+@@ -1835,41 +1813,41 @@
+   for (vector<StripObjectMap::StripExtent>::iterator iter = extents.begin();
+        iter != extents.end(); ++iter) {
+     bufferlist value;
+     string key = strip_object_key(iter->no);
+-    if (header.bits[iter->no]) {
+-      if (iter->offset == 0 && iter->len == header.strip_size) {
++    if (header->bits[iter->no]) {
++      if (iter->offset == 0 && iter->len == header->strip_size) {
+         bl.copy(bl_offset, iter->len, value);
+         bl_offset += iter->len;
+       } else {
+-        assert(out[key].length() == header.strip_size);
++        assert(out[key].length() == header->strip_size);
+ 
+         out[key].copy(0, iter->offset, value);
+         bl.copy(bl_offset, iter->len, value);
+         bl_offset += iter->len;
+ 
+-        if (value.length() != header.strip_size)
+-          out[key].copy(value.length(), header.strip_size-value.length(),
++        if (value.length() != header->strip_size)
++          out[key].copy(value.length(), header->strip_size-value.length(),
+                         value);
+       }
+     } else {
+       if (iter->offset)
+         value.append_zero(iter->offset);
+       bl.copy(bl_offset, iter->len, value);
+       bl_offset += iter->len;
+ 
+-      if (value.length() < header.strip_size)
+-        value.append_zero(header.strip_size-value.length());
++      if (value.length() < header->strip_size)
++        value.append_zero(header->strip_size-value.length());
+ 
+-      header.bits[iter->no] = 1;
++      header->bits[iter->no] = 1;
+     }
+-    assert(value.length() == header.strip_size);
++    assert(value.length() == header->strip_size);
+     values[key].swap(value);
+   }
+   assert(bl_offset == len);
+ 
+   t.set_buffer_keys(header, OBJECT_STRIP_PREFIX, values);
+-  dout(10) << __func__ << " " << header.cid << "/" << header.oid << " "
++  dout(10) << __func__ << " " << header->cid << "/" << header->oid << " "
+            << offset << "~" << len << " = " << r << dendl;
+ 
+   return r;
+ }
+@@ -1881,18 +1859,18 @@
+   dout(15) << __func__ << " " << cid << "/" << oid << " " << offset << "~"
+            << len << dendl;
+ 
+   int r;
+-  StripObjectMap::StripObjectHeader *header;
++  StripObjectMap::StripObjectHeaderRef header;
+ 
+   r = t.lookup_cached_header(cid, oid, &header, true);
+   if (r < 0) {
+     dout(10) << __func__ << " " << cid << "/" << oid << " " << offset
+              << "~" << len << " failed to get header: r = " << r << dendl;
+     return r;
+   }
+ 
+-  return _generic_write(*header, offset, len, bl, t, replica);
++  return _generic_write(header, offset, len, bl, t, replica);
+ }
+ 
+ int KeyValueStore::_zero(coll_t cid, const ghobject_t& oid, uint64_t offset,
+                          size_t len, BufferTransaction &t)
+@@ -1919,18 +1897,18 @@
+   if (oldoid == newoid)
+     return 0;
+ 
+   int r;
+-  StripObjectMap::StripObjectHeader *old_header;
++  StripObjectMap::StripObjectHeaderRef old_header;
+ 
+   r = t.lookup_cached_header(cid, oldoid, &old_header, false);
+   if (r < 0) {
+     dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/"
+              << newoid << " = " << r << dendl;
+     return r;
+   }
+ 
+-  t.clone_buffer(*old_header, cid, newoid);
++  t.clone_buffer(old_header, cid, newoid);
+ 
+   dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/"
+            << newoid << " = " << r << dendl;
+   return r;
+@@ -1947,9 +1925,9 @@
+ 
+   int r;
+   bufferlist bl;
+ 
+-  StripObjectMap::StripObjectHeader *old_header, *new_header;
++  StripObjectMap::StripObjectHeaderRef old_header, new_header;
+ 
+   r = t.lookup_cached_header(cid, oldoid, &old_header, false);
+   if (r < 0) {
+     dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/"
+@@ -1965,13 +1943,13 @@
+            << " can't create header: r = " << r << dendl;
+     return r;
+   }
+ 
+-  r = _generic_read(*old_header, srcoff, len, bl, &t);
++  r = _generic_read(old_header, srcoff, len, bl, &t);
+   if (r < 0)
+     goto out;
+ 
+-  r = _generic_write(*new_header, dstoff, len, bl, t);
++  r = _generic_write(new_header, dstoff, len, bl, t);
+ 
+  out:
+   dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/"
+            << newoid << " " << srcoff << "~" << len << " to " << dstoff
+@@ -1989,11 +1967,19 @@
+ 
+   int r;
+   map<string, bufferlist> got;
+   set<string> to_get;
++  StripObjectMap::StripObjectHeaderRef header;
+ 
+   to_get.insert(string(name));
+-  r = backend->get_values(cid, oid, OBJECT_XATTR, to_get, &got);
++
++  r = backend->lookup_strip_header(cid, oid, &header);
++  if (r < 0) {
++    dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
++    return r;
++  }
++
++  r = backend->get_values_with_header(header, OBJECT_XATTR, to_get, &got);
+   if (r < 0 && r != -ENOENT) {
+     dout(10) << __func__ << " get_xattrs err r =" << r << dendl;
+     goto out;
+   }
+@@ -2055,9 +2041,9 @@
+   dout(15) << __func__ << " " << cid << "/" << oid << dendl;
+ 
+   int r;
+ 
+-  StripObjectMap::StripObjectHeader *header;
++  StripObjectMap::StripObjectHeaderRef header;
+   map<string, bufferlist> attrs;
+ 
+   r = t.lookup_cached_header(cid, oid, &header, false);
+   if (r < 0)
+@@ -2067,9 +2053,9 @@
+        it != aset.end(); ++it) {
+     attrs[it->first].push_back(it->second);
+   }
+ 
+-  t.set_buffer_keys(*header, OBJECT_XATTR, attrs);
++  t.set_buffer_keys(header, OBJECT_XATTR, attrs);
+ 
+ out:
+   dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl;
+   return r;
+@@ -2083,9 +2069,9 @@
+            << dendl;
+ 
+   int r;
+   set<string> to_remove;
+-  StripObjectMap::StripObjectHeader *header;
++  StripObjectMap::StripObjectHeaderRef header;
+ 
+   r = t.lookup_cached_header(cid, oid, &header, false);
+   if (r < 0) {
+     dout(10) << __func__ << " could not find header r = " << r
+@@ -2093,9 +2079,9 @@
+     return r;
+   }
+ 
+   to_remove.insert(string(name));
+-  r = t.remove_buffer_keys(*header, OBJECT_XATTR, to_remove);
++  r = t.remove_buffer_keys(header, OBJECT_XATTR, to_remove);
+ 
+   dout(10) << __func__ << " " << cid << "/" << oid << " '" << name << "' = "
+            << r << dendl;
+   return r;
+@@ -2108,25 +2094,25 @@
+ 
+   int r;
+   set<string> attrs;
+ 
+-  StripObjectMap::StripObjectHeader *header;
++  StripObjectMap::StripObjectHeaderRef header;
+ 
+   r = t.lookup_cached_header(cid, oid, &header, false);
+   if (r < 0) {
+     dout(10) << __func__ << " could not find header r = " << r
+              << dendl;
+     return r;
+   }
+ 
+-  r = backend->get_keys_with_header(*header, OBJECT_XATTR, &attrs);
++  r = backend->get_keys_with_header(header, OBJECT_XATTR, &attrs);
+   if (r < 0 && r != -ENOENT) {
+     dout(10) << __func__ << " could not get attrs r = " << r << dendl;
+     return r;
+   }
+ 
+-  r = t.remove_buffer_keys(*header, OBJECT_XATTR, attrs);
+-  t.clear_buffer_keys(*header, OBJECT_XATTR);
++  r = t.remove_buffer_keys(header, OBJECT_XATTR, attrs);
++  t.clear_buffer_keys(header, OBJECT_XATTR);
+ 
+   dout(10) << __func__ <<  " " << cid << "/" << oid << " = " << r << dendl;
+   return r;
+ }
+@@ -2167,12 +2153,20 @@
+            << "'" << dendl;
+ 
+   set<string> keys;
+   map<string, bufferlist> out;
++  StripObjectMap::StripObjectHeaderRef header;
++
+   keys.insert(string(name));
+ 
+-  int r = backend->get_values(get_coll_for_coll(), make_ghobject_for_coll(c),
+-                              COLLECTION_ATTR, keys, &out);
++  int r = backend->lookup_strip_header(get_coll_for_coll(),
++                                       make_ghobject_for_coll(c), &header);
++  if (r < 0) {
++    dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
++    return r;
++  }
++
++  r = backend->get_values_with_header(header, COLLECTION_ATTR, keys, &out);
+   if (r < 0) {
+     dout(10) << __func__ << " could not get key" << string(name) << dendl;
+     r = -EINVAL;
+   }
+@@ -2191,16 +2185,23 @@
+   dout(10) << __func__ << " " << cid.to_str() << dendl;
+ 
+   map<string, bufferlist> out;
+   set<string> keys;
++  StripObjectMap::StripObjectHeaderRef header;
+ 
+   for (map<string, bufferptr>::iterator it = aset.begin();
+        it != aset.end(); ++it) {
+       keys.insert(it->first);
+   }
+ 
+-  int r = backend->get_values(get_coll_for_coll(), make_ghobject_for_coll(cid),
+-                              COLLECTION_ATTR, keys, &out);
++  int r = backend->lookup_strip_header(get_coll_for_coll(),
++                                       make_ghobject_for_coll(cid), &header);
++  if (r < 0) {
++    dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
++    return r;
++  }
++
++  r = backend->get_values_with_header(header, COLLECTION_ATTR, keys, &out);
+   if (r < 0) {
+     dout(10) << __func__ << " could not get keys" << dendl;
+     r = -EINVAL;
+     goto out;
+@@ -2226,9 +2227,9 @@
+ 
+   int r;
+   bufferlist bl;
+   map<string, bufferlist> out;
+-  StripObjectMap::StripObjectHeader *header;
++  StripObjectMap::StripObjectHeaderRef header;
+ 
+   r = t.lookup_cached_header(get_coll_for_coll(),
+                              make_ghobject_for_coll(c),
+                              &header, false);
+@@ -2239,9 +2240,9 @@
+ 
+   bl.append(reinterpret_cast<const char*>(value), size);
+   out.insert(make_pair(string(name), bl));
+ 
+-  t.set_buffer_keys(*header, COLLECTION_ATTR, out);
++  t.set_buffer_keys(header, COLLECTION_ATTR, out);
+ 
+   dout(10) << __func__ << " " << c << " '"
+            << name << "' len " << size << " = " << r << dendl;
+   return r;
+@@ -2253,9 +2254,9 @@
+   dout(15) << __func__ << " " << c << dendl;
+ 
+   bufferlist bl;
+   set<string> out;
+-  StripObjectMap::StripObjectHeader *header;
++  StripObjectMap::StripObjectHeaderRef header;
+ 
+   int r = t.lookup_cached_header(get_coll_for_coll(),
+                                  make_ghobject_for_coll(c), &header, false);
+   if (r < 0) {
+@@ -2263,9 +2264,9 @@
+     return r;
+   }
+ 
+   out.insert(string(name));
+-  r = t.remove_buffer_keys(*header, COLLECTION_ATTR, out);
++  r = t.remove_buffer_keys(header, COLLECTION_ATTR, out);
+ 
+   dout(10) << __func__ << " " << c << " = " << r << dendl;
+   return r;
+ }
+@@ -2276,9 +2277,9 @@
+ {
+   dout(15) << __func__ << " " << cid << dendl;
+ 
+   map<string, bufferlist> attrs;
+-  StripObjectMap::StripObjectHeader *header;
++  StripObjectMap::StripObjectHeaderRef header;
+   int r = t.lookup_cached_header(get_coll_for_coll(),
+                                  make_ghobject_for_coll(cid),
+                                  &header, false);
+   if (r < 0) {
+@@ -2290,9 +2291,9 @@
+        ++it) {
+     attrs[it->first].push_back(it->second);
+   }
+ 
+-  t.set_buffer_keys(*header, COLLECTION_ATTR, attrs);
++  t.set_buffer_keys(header, COLLECTION_ATTR, attrs);
+ 
+   dout(10) << __func__ << " " << cid << " = " << r << dendl;
+   return r;
+ }
+@@ -2304,9 +2305,9 @@
+ {
+   dout(15) << __func__ << " " << c << dendl;
+ 
+   int r;
+-  StripObjectMap::StripObjectHeader *header;
++  StripObjectMap::StripObjectHeaderRef header;
+   bufferlist bl;
+ 
+   r = t.lookup_cached_header(get_coll_for_coll(),
+                              make_ghobject_for_coll(c), &header,
+@@ -2329,9 +2330,9 @@
+   dout(15) << __func__ << " " << c << dendl;
+ 
+   int r;
+   uint64_t modified_object = 0;
+-  StripObjectMap::StripObjectHeader *header;
++  StripObjectMap::StripObjectHeaderRef header;
+   vector<ghobject_t> oids;
+ 
+   r = t.lookup_cached_header(get_coll_for_coll(), make_ghobject_for_coll(c),
+                              &header, false);
+@@ -2346,9 +2347,9 @@
+     if (iter->first.first != c)
+       continue;
+ 
+     modified_object++;
+-    if (!iter->second.deleted) {
++    if (!iter->second->deleted) {
+       r = -ENOTEMPTY;
+       goto out;
+     }
+   }
+@@ -2368,9 +2369,9 @@
+       goto out;
+     }
+   }
+ 
+-  r = t.clear_buffer(*header);
++  r = t.clear_buffer(header);
+ 
+ out:
+   dout(10) << __func__ << " " << c << " = " << r << dendl;
+   return r;
+@@ -2384,9 +2385,9 @@
+   dout(15) << __func__ <<  " " << c << "/" << o << " from " << oldcid << "/"
+            << o << dendl;
+ 
+   bufferlist bl;
+-  StripObjectMap::StripObjectHeader *header, *old_header;
++  StripObjectMap::StripObjectHeaderRef header, old_header;
+ 
+   int r = t.lookup_cached_header(oldcid, o, &old_header, false);
+   if (r < 0) {
+     goto out;
+@@ -2399,15 +2400,15 @@
+              << o << " already exist " << dendl;
+     goto out;
+   }
+ 
+-  r = _generic_read(*old_header, 0, old_header->max_size, bl, &t);
++  r = _generic_read(old_header, 0, old_header->max_size, bl, &t);
+   if (r < 0) {
+     r = -EINVAL;
+     goto out;
+   }
+ 
+-  r = _generic_write(*header, 0, bl.length(), bl, t);
++  r = _generic_write(header, 0, bl.length(), bl, t);
+   if (r < 0) {
+     r = -EINVAL;
+   }
+ 
+@@ -2424,9 +2425,9 @@
+ {
+   dout(15) << __func__ << " " << c << "/" << o << " from " << oldcid << "/"
+            << oldoid << dendl;
+   int r;
+-  StripObjectMap::StripObjectHeader *header;
++  StripObjectMap::StripObjectHeaderRef header;
+ 
+   r = t.lookup_cached_header(c, o, &header, false);
+   if (r == 0) {
+     dout(10) << __func__ << " " << oldcid << "/" << oldoid << " -> " << c
+@@ -2440,9 +2441,9 @@
+              << "/" << o << " = " << r << dendl;
+     return r;
+   }
+ 
+-  t.rename_buffer(*header, c, o);
++  t.rename_buffer(header, c, o);
+ 
+   dout(10) << __func__ << " " << c << "/" << o << " from " << oldcid << "/"
+            << oldoid << " = " << r << dendl;
+   return r;
+@@ -2452,9 +2453,9 @@
+                                                 BufferTransaction &t)
+ {
+   dout(15) << __func__ << " " << cid << dendl;
+ 
+-  StripObjectMap::StripObjectHeader *header;
++  StripObjectMap::StripObjectHeaderRef header;
+ 
+   int r = t.lookup_cached_header(get_coll_for_coll(),
+                                  make_ghobject_for_coll(cid),
+                                  &header, false);
+@@ -2477,9 +2478,9 @@
+         return r;
+     }
+   }
+ 
+-  r = t.clear_buffer(*header);
++  r = t.clear_buffer(header);
+ 
+   dout(10) << __func__ << " " << cid  << " r = " << r << dendl;
+   return 0;
+ }
+@@ -2489,9 +2490,9 @@
+ {
+   dout(10) << __func__ << " origin cid " << cid << " new cid " << ncid
+            << dendl;
+ 
+-  StripObjectMap::StripObjectHeader *header;
++  StripObjectMap::StripObjectHeaderRef header;
+ 
+   int r = t.lookup_cached_header(get_coll_for_coll(),
+                                  make_ghobject_for_coll(ncid),
+                                  &header, false);
+@@ -2531,9 +2532,9 @@
+     objects.clear();
+     current = next;
+   }
+ 
+-  t.rename_buffer(*header, get_coll_for_coll(), make_ghobject_for_coll(ncid));
++  t.rename_buffer(header, get_coll_for_coll(), make_ghobject_for_coll(ncid));
+ 
+   dout(10) << __func__ << " origin cid " << cid << " new cid " << ncid
+            << dendl;
+   return 0;
+@@ -2559,11 +2560,11 @@
+ bool KeyValueStore::collection_exists(coll_t c)
+ {
+   dout(10) << __func__ << " " << dendl;
+ 
+-  StripObjectMap::StripObjectHeader header;
++  StripObjectMap::StripObjectHeaderRef header;
+   int r = backend->lookup_strip_header(get_coll_for_coll(),
+-                                       make_ghobject_for_coll(c), header);
++                                       make_ghobject_for_coll(c), &header);
+   if (r < 0) {
+     return false;
+   }
+   return true;
+@@ -2651,17 +2652,16 @@
+                             bufferlist *bl, map<string, bufferlist> *out)
+ {
+   dout(15) << __func__ << " " << c << "/" << hoid << dendl;
+ 
+-  StripObjectMap::StripObjectHeader header;
++  StripObjectMap::StripObjectHeaderRef header;
+ 
+-  int r = backend->lookup_strip_header(c, hoid, header);
++  int r = backend->lookup_strip_header(c, hoid, &header);
+   if (r < 0) {
+     dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
+     return r;
+   }
+ 
+-
+   r = backend->get_with_header(header, OBJECT_OMAP, out);
+   if (r < 0 && r != -ENOENT) {
+     dout(10) << __func__ << " err r =" << r << dendl;
+     return r;
+@@ -2691,11 +2691,18 @@
+   dout(15) << __func__ << " " << c << "/" << hoid << dendl;
+ 
+   set<string> keys;
+   map<string, bufferlist> got;
++  StripObjectMap::StripObjectHeaderRef header;
++
++  int r = backend->lookup_strip_header(c, hoid, &header);
++  if (r < 0) {
++    dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
++    return r;
++  }
+ 
+   keys.insert(OBJECT_OMAP_HEADER_KEY);
+-  int r = backend->get_values(c, hoid, OBJECT_OMAP_HEADER, keys, &got);
++  r = backend->get_values_with_header(header, OBJECT_OMAP_HEADER, keys, &got);
+   if (r < 0 && r != -ENOENT) {
+     dout(10) << __func__ << " err r =" << r << dendl;
+     return r;
+   }
+@@ -2711,9 +2718,16 @@
+ int KeyValueStore::omap_get_keys(coll_t c, const ghobject_t &hoid, set<string> *keys)
+ {
+   dout(15) << __func__ << " " << c << "/" << hoid << dendl;
+ 
+-  int r = backend->get_keys(c, hoid, OBJECT_OMAP, keys);
++  StripObjectMap::StripObjectHeaderRef header;
++  int r = backend->lookup_strip_header(c, hoid, &header);
++  if (r < 0) {
++    dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
++    return r;
++  }
++
++  r = backend->get_keys_with_header(header, OBJECT_OMAP, keys);
+   if (r < 0 && r != -ENOENT) {
+     return r;
+   }
+   return 0;
+@@ -2724,9 +2738,16 @@
+                                    map<string, bufferlist> *out)
+ {
+   dout(15) << __func__ << " " << c << "/" << hoid << dendl;
+ 
+-  int r = backend->get_values(c, hoid, OBJECT_OMAP, keys, out);
++  StripObjectMap::StripObjectHeaderRef header;
++  int r = backend->lookup_strip_header(c, hoid, &header);
++  if (r < 0) {
++    dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
++    return r;
++  }
++
++  r = backend->get_values_with_header(header, OBJECT_OMAP, keys, out);
+   if (r < 0 && r != -ENOENT) {
+     return r;
+   }
+   return 0;
+@@ -2755,9 +2776,9 @@
+                                BufferTransaction &t)
+ {
+   dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
+ 
+-  StripObjectMap::StripObjectHeader *header;
++  StripObjectMap::StripObjectHeaderRef header;
+ 
+   int r = t.lookup_cached_header(cid, hoid, &header, false);
+   if (r < 0) {
+     dout(10) << __func__ << " " << cid << "/" << hoid << " "
+@@ -2765,29 +2786,29 @@
+     return r;
+   }
+ 
+   set<string> keys;
+-  r = backend->get_keys_with_header(*header, OBJECT_OMAP, &keys);
++  r = backend->get_keys_with_header(header, OBJECT_OMAP, &keys);
+   if (r < 0 && r != -ENOENT) {
+     dout(10) << __func__ << " could not get omap_keys r = " << r << dendl;
+     return r;
+   }
+ 
+-  r = t.remove_buffer_keys(*header, OBJECT_OMAP, keys);
++  r = t.remove_buffer_keys(header, OBJECT_OMAP, keys);
+   if (r < 0) {
+     dout(10) << __func__ << " could not remove keys r = " << r << dendl;
+     return r;
+   }
+ 
+   keys.clear();
+   keys.insert(OBJECT_OMAP_HEADER_KEY);
+-  r = t.remove_buffer_keys(*header, OBJECT_OMAP_HEADER, keys);
++  r = t.remove_buffer_keys(header, OBJECT_OMAP_HEADER, keys);
+   if (r < 0) {
+     dout(10) << __func__ << " could not remove keys r = " << r << dendl;
+     return r;
+   }
+ 
+-  t.clear_buffer_keys(*header, OBJECT_OMAP_HEADER);
++  t.clear_buffer_keys(header, OBJECT_OMAP_HEADER);
+ 
+   dout(10) << __func__ << " " << cid << "/" << hoid << " r = " << r << dendl;
+   return 0;
+ }
+@@ -2797,18 +2818,18 @@
+                                  BufferTransaction &t)
+ {
+   dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
+ 
+-  StripObjectMap::StripObjectHeader *header;
++  StripObjectMap::StripObjectHeaderRef header;
+ 
+   int r = t.lookup_cached_header(cid, hoid, &header, false);
+   if (r < 0) {
+     dout(10) << __func__ << " " << cid << "/" << hoid << " "
+              << " failed to get header: r = " << r << dendl;
+     return r;
+   }
+ 
+-  t.set_buffer_keys(*header, OBJECT_OMAP, aset);
++  t.set_buffer_keys(header, OBJECT_OMAP, aset);
+ 
+   return 0;
+ }
+ 
+@@ -2817,18 +2838,18 @@
+                                 BufferTransaction &t)
+ {
+   dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
+ 
+-  StripObjectMap::StripObjectHeader *header;
++  StripObjectMap::StripObjectHeaderRef header;
+ 
+   int r = t.lookup_cached_header(cid, hoid, &header, false);
+   if (r < 0) {
+     dout(10) << __func__ << " " << cid << "/" << hoid << " "
+              << " failed to get header: r = " << r << dendl;
+     return r;
+   }
+ 
+-  r = t.remove_buffer_keys(*header, OBJECT_OMAP, keys);
++  r = t.remove_buffer_keys(header, OBJECT_OMAP, keys);
+ 
+   dout(10) << __func__ << " " << cid << "/" << hoid << " r = " << r << dendl;
+   return r;
+ }
+@@ -2860,9 +2881,9 @@
+ {
+   dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
+ 
+   map<string, bufferlist> sets;
+-  StripObjectMap::StripObjectHeader *header;
++  StripObjectMap::StripObjectHeaderRef header;
+ 
+   int r = t.lookup_cached_header(cid, hoid, &header, false);
+   if (r < 0) {
+     dout(10) << __func__ << " " << cid << "/" << hoid << " "
+@@ -2870,9 +2891,9 @@
+     return r;
+   }
+ 
+   sets[OBJECT_OMAP_HEADER_KEY] = bl;
+-  t.set_buffer_keys(*header, OBJECT_OMAP_HEADER, sets);
++  t.set_buffer_keys(header, OBJECT_OMAP_HEADER, sets);
+   return 0;
+ }
+ 
+ int KeyValueStore::_split_collection(coll_t cid, uint32_t bits, uint32_t rem,
+@@ -2880,9 +2901,9 @@
+ {
+   {
+     dout(15) << __func__ << " " << cid << " bits: " << bits << dendl;
+ 
+-    StripObjectMap::StripObjectHeader *header;
++    StripObjectMap::StripObjectHeaderRef header;
+ 
+     int r = t.lookup_cached_header(get_coll_for_coll(),
+                                    make_ghobject_for_coll(cid),
+                                    &header, false);
+--- a/src/os/KeyValueStore.h
++++ b/src/os/KeyValueStore.h
+@@ -35,10 +35,10 @@
+ #include "common/fd.h"
+ 
+ #include "common/Mutex.h"
+ #include "GenericObjectMap.h"
+-#include "SequencerPosition.h"
+ #include "KeyValueDB.h"
++#include "common/random_cache.hpp"
+ 
+ #include "include/uuid.h"
+ 
+ enum kvstore_types {
+@@ -47,8 +47,10 @@
+     KV_TYPE_OTHER
+ };
+ 
+ 
++static uint64_t default_strip_size = 1024;
++
+ class StripObjectMap: public GenericObjectMap {
+  public:
+ 
+   struct StripExtent {
+@@ -64,9 +66,8 @@
+     // Persistent state
+     uint64_t strip_size;
+     uint64_t max_size;
+     vector<char> bits;
+-    SequencerPosition spos;
+ 
+     // soft state
+     Header header; // FIXME: Hold lock to avoid concurrent operations, it will
+                    // also block read operation which not should be permitted.
+@@ -81,67 +82,66 @@
+       ENCODE_START(1, 1, bl);
+       ::encode(strip_size, bl);
+       ::encode(max_size, bl);
+       ::encode(bits, bl);
+-      ::encode(spos, bl);
+       ENCODE_FINISH(bl);
+     }
+ 
+     void decode(bufferlist::iterator &bl) {
+       DECODE_START(1, bl);
+       ::decode(strip_size, bl);
+       ::decode(max_size, bl);
+       ::decode(bits, bl);
+-      ::decode(spos, bl);
+       DECODE_FINISH(bl);
+     }
+   };
+-
+-  bool check_spos(const StripObjectHeader &header,
+-                  const SequencerPosition &spos);
+-  void sync_wrap(StripObjectHeader &strip_header, KeyValueDB::Transaction t,
+-                 const SequencerPosition &spos);
++  typedef ceph::shared_ptr<StripObjectHeader> StripObjectHeaderRef;
+ 
+   static int file_to_extents(uint64_t offset, size_t len, uint64_t strip_size,
+                              vector<StripExtent> &extents);
+   int lookup_strip_header(const coll_t & cid, const ghobject_t &oid,
+-                          StripObjectHeader &header);
+-  int save_strip_header(StripObjectHeader &header,
+-                        const SequencerPosition &spos,
+-                        KeyValueDB::Transaction t);
++                          StripObjectHeaderRef *header);
++  int save_strip_header(StripObjectHeaderRef header, KeyValueDB::Transaction t);
+   int create_strip_header(const coll_t &cid, const ghobject_t &oid,
+-                          StripObjectHeader &strip_header,
++                          StripObjectHeaderRef *strip_header,
+                           KeyValueDB::Transaction t);
+-  void clone_wrap(StripObjectHeader &old_header,
++  void clone_wrap(StripObjectHeaderRef old_header,
+                   const coll_t &cid, const ghobject_t &oid,
+                   KeyValueDB::Transaction t,
+-                  StripObjectHeader *origin_header,
+-                  StripObjectHeader *target_header);
+-  void rename_wrap(const coll_t &cid, const ghobject_t &oid,
++                  StripObjectHeaderRef *target_header);
++  void rename_wrap(StripObjectHeaderRef old_header, const coll_t &cid, const ghobject_t &oid,
+                    KeyValueDB::Transaction t,
+-                   StripObjectHeader *header);
++                   StripObjectHeaderRef *new_header);
+   // Already hold header to avoid lock header seq again
+   int get_with_header(
+-    const StripObjectHeader &header,
++    const StripObjectHeaderRef header,
+     const string &prefix,
+     map<string, bufferlist> *out
+     );
+ 
+   int get_values_with_header(
+-    const StripObjectHeader &header,
++    const StripObjectHeaderRef header,
+     const string &prefix,
+     const set<string> &keys,
+     map<string, bufferlist> *out
+     );
+   int get_keys_with_header(
+-    const StripObjectHeader &header,
++    const StripObjectHeaderRef header,
+     const string &prefix,
+     set<string> *keys
+     );
+ 
+-  StripObjectMap(KeyValueDB *db): GenericObjectMap(db) {}
++  Mutex lock;
++  void invalidate_cache(const coll_t &c, const ghobject_t &oid) {
++    Mutex::Locker l(lock);
++    caches.clear(oid);
++  }
+ 
+-  static const uint64_t default_strip_size = 1024;
++  RandomCache<ghobject_t, pair<coll_t, StripObjectHeaderRef> > caches;
++  StripObjectMap(KeyValueDB *db): GenericObjectMap(db),
++                                  lock("StripObjectMap::lock"),
++                                  caches(g_conf->keyvaluestore_header_cache_size)
++  {}
+ };
+ 
+ 
+ class KeyValueStore : public ObjectStore,
+@@ -160,9 +160,9 @@
+   std::string current_fn;
+   std::string current_op_seq_fn;
+   uuid_d fsid;
+ 
+-  int fsid_fd, op_fd, current_fd;
++  int fsid_fd, current_fd;
+ 
+   enum kvstore_types kv_type;
+ 
+   deque<uint64_t> snaps;
+@@ -209,41 +209,51 @@
+   // 3. Object modify(including omap, xattr)
+   // 4. Clone or rename
+   struct BufferTransaction {
+     typedef pair<coll_t, ghobject_t> uniq_id;
+-    typedef map<uniq_id, StripObjectMap::StripObjectHeader> StripHeaderMap;
++    typedef map<uniq_id, StripObjectMap::StripObjectHeaderRef> StripHeaderMap;
+ 
+     //Dirty records
+     StripHeaderMap strip_headers;
++    list<Context*> finishes;
+ 
+     KeyValueStore *store;
+ 
+-    SequencerPosition spos;
+     KeyValueDB::Transaction t;
+ 
+     int lookup_cached_header(const coll_t &cid, const ghobject_t &oid,
+-                             StripObjectMap::StripObjectHeader **strip_header,
++                             StripObjectMap::StripObjectHeaderRef *strip_header,
+                              bool create_if_missing);
+-    int get_buffer_keys(StripObjectMap::StripObjectHeader &strip_header,
++    int get_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header,
+                         const string &prefix, const set<string> &keys,
+                         map<string, bufferlist> *out);
+-    void set_buffer_keys(StripObjectMap::StripObjectHeader &strip_header,
++    void set_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header,
+                          const string &prefix, map<string, bufferlist> &bl);
+-    int remove_buffer_keys(StripObjectMap::StripObjectHeader &strip_header,
++    int remove_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header,
+                            const string &prefix, const set<string> &keys);
+-    void clear_buffer_keys(StripObjectMap::StripObjectHeader &strip_header,
++    void clear_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header,
+                            const string &prefix);
+-    int clear_buffer(StripObjectMap::StripObjectHeader &strip_header);
+-    void clone_buffer(StripObjectMap::StripObjectHeader &old_header,
++    int clear_buffer(StripObjectMap::StripObjectHeaderRef strip_header);
++    void clone_buffer(StripObjectMap::StripObjectHeaderRef old_header,
+                       const coll_t &cid, const ghobject_t &oid);
+-    void rename_buffer(StripObjectMap::StripObjectHeader &old_header,
++    void rename_buffer(StripObjectMap::StripObjectHeaderRef old_header,
+                        const coll_t &cid, const ghobject_t &oid);
+     int submit_transaction();
+ 
+-    BufferTransaction(KeyValueStore *store,
+-                      SequencerPosition &spos): store(store), spos(spos) {
++    BufferTransaction(KeyValueStore *store): store(store) {
+       t = store->backend->get_transaction();
+     }
++
++    struct InvalidateCacheContext : public Context {
++      KeyValueStore *store;
++      const coll_t cid;
++      const ghobject_t oid;
++      InvalidateCacheContext(KeyValueStore *s, const coll_t &c, const ghobject_t &oid): store(s), cid(c), oid(oid) {}
++      void finish(int r) {
++      if (r == 0)
++        store->backend->invalidate_cache(cid, oid);
++      }
++    };
+   };
+ 
+   // -- op workqueue --
+   struct Op {
+@@ -256,52 +266,111 @@
+   };
+   class OpSequencer : public Sequencer_impl {
+     Mutex qlock; // to protect q, for benefit of flush (peek/dequeue also protected by lock)
+     list<Op*> q;
+-    list<uint64_t> jq;
+     Cond cond;
++    list<pair<uint64_t, Context*> > flush_commit_waiters;
++    uint64_t op; // used by flush() to know the sequence of op
+    public:
+     Sequencer *parent;
+     Mutex apply_lock;  // for apply mutual exclusion
++    
++    /// get_max_uncompleted
++    bool _get_max_uncompleted(
++      uint64_t *seq ///< [out] max uncompleted seq
++      ) {
++      assert(qlock.is_locked());
++      assert(seq);
++      *seq = 0;
++      if (q.empty()) {
++	return true;
++      } else {
++	*seq = q.back()->op;
++	return false;
++      }
++    } /// @returns true if the queue is empty
++
++    /// get_min_uncompleted
++    bool _get_min_uncompleted(
++      uint64_t *seq ///< [out] min uncompleted seq
++      ) {
++      assert(qlock.is_locked());
++      assert(seq);
++      *seq = 0;
++      if (q.empty()) {
++	return true;
++      } else {
++	*seq = q.front()->op;
++	return false;
++      }
++    } /// @returns true if the queue is empty
++
++    void _wake_flush_waiters(list<Context*> *to_queue) {
++      uint64_t seq;
++      if (_get_min_uncompleted(&seq))
++	seq = -1;
++
++      for (list<pair<uint64_t, Context*> >::iterator i =
++	     flush_commit_waiters.begin();
++	   i != flush_commit_waiters.end() && i->first < seq;
++	   flush_commit_waiters.erase(i++)) {
++	to_queue->push_back(i->second);
++      }
++    }
+ 
+     void queue(Op *o) {
+       Mutex::Locker l(qlock);
+       q.push_back(o);
++      op++;
++      o->op = op;
+     }
+     Op *peek_queue() {
+       assert(apply_lock.is_locked());
+       return q.front();
+     }
+-    Op *dequeue() {
++
++    Op *dequeue(list<Context*> *to_queue) {
++      assert(to_queue);
+       assert(apply_lock.is_locked());
+       Mutex::Locker l(qlock);
+       Op *o = q.front();
+       q.pop_front();
+       cond.Signal();
++
++      _wake_flush_waiters(to_queue);
+       return o;
+     }
++
+     void flush() {
+       Mutex::Locker l(qlock);
+ 
+       // get max for journal _or_ op queues
+       uint64_t seq = 0;
+       if (!q.empty())
+         seq = q.back()->op;
+-      if (!jq.empty() && jq.back() > seq)
+-        seq = jq.back();
+ 
+       if (seq) {
+         // everything prior to our watermark to drain through either/both
+         // queues
+-        while ((!q.empty() && q.front()->op <= seq) ||
+-                (!jq.empty() && jq.front() <= seq))
++        while (!q.empty() && q.front()->op <= seq)
+           cond.Wait(qlock);
+       }
+     }
++    bool flush_commit(Context *c) {
++      Mutex::Locker l(qlock);
++      uint64_t seq = 0;
++      if (_get_max_uncompleted(&seq)) {
++	delete c;
++	return true;
++      } else {
++	flush_commit_waiters.push_back(make_pair(seq, c));
++	return false;
++      }
++    }
+ 
+     OpSequencer()
+       : qlock("KeyValueStore::OpSequencer::qlock", false, false),
+-	parent(0),
++        op(0), parent(0),
+ 	apply_lock("KeyValueStore::OpSequencer::apply_lock", false, false) {}
+     ~OpSequencer() {
+       assert(q.empty());
+     }
+@@ -416,9 +485,8 @@
+     return _do_transactions(tls, op_seq, 0);
+   }
+   unsigned _do_transaction(Transaction& transaction,
+                            BufferTransaction &bt,
+-                           SequencerPosition& spos,
+                            ThreadPool::TPHandle *handle);
+ 
+   int queue_transactions(Sequencer *osr, list<Transaction*>& tls,
+                          TrackedOpRef op = TrackedOpRef(),
+@@ -427,12 +495,12 @@
+ 
+   // ------------------
+   // objects
+ 
+-  int _generic_read(StripObjectMap::StripObjectHeader &header,
++  int _generic_read(StripObjectMap::StripObjectHeaderRef header,
+                     uint64_t offset, size_t len, bufferlist& bl,
+                     bool allow_eio = false, BufferTransaction *bt = 0);
+-  int _generic_write(StripObjectMap::StripObjectHeader &header,
++  int _generic_write(StripObjectMap::StripObjectHeaderRef header,
+                      uint64_t offset, size_t len, const bufferlist& bl,
+                      BufferTransaction &t, bool replica = false);
+ 
+   bool exists(coll_t cid, const ghobject_t& oid);
+@@ -571,28 +639,8 @@
+   static const string OBJECT_OMAP_HEADER_KEY;
+   static const string COLLECTION;
+   static const string COLLECTION_ATTR;
+   static const uint32_t COLLECTION_VERSION = 1;
+-
+-  class SubmitManager {
+-    Mutex lock;
+-    uint64_t op_seq;
+-    uint64_t op_submitted;
+-   public:
+-    SubmitManager() :
+-        lock("JOS::SubmitManager::lock", false, true, false, g_ceph_context),
+-        op_seq(0), op_submitted(0)
+-    {}
+-    uint64_t op_submit_start();
+-    void op_submit_finish(uint64_t op);
+-    void set_op_seq(uint64_t seq) {
+-        Mutex::Locker l(lock);
+-        op_submitted = op_seq = seq;
+-    }
+-    uint64_t get_op_seq() {
+-        return op_seq;
+-    }
+-  } submit_manager;
+ };
+ 
+ WRITE_CLASS_ENCODER(StripObjectMap::StripObjectHeader)
+ 
+--- a/src/os/LFNIndex.cc
++++ b/src/os/LFNIndex.cc
+@@ -60,8 +60,19 @@
+     ++current_failure;
+   }
+ }
+ 
++// Helper to close fd's when we leave scope.  This is useful when used
++// in combination with RetryException, thrown by the above.
++struct FDCloser {
++  int fd;
++  FDCloser(int f) : fd(f) {}
++  ~FDCloser() {
++    VOID_TEMP_FAILURE_RETRY(::close(fd));
++  }
++};
++
++
+ /* Public methods */
+ 
+ void LFNIndex::set_ref(ceph::shared_ptr<CollectionIndex> ref)
+ {
+@@ -159,11 +170,11 @@
+   maybe_inject_failure();
+   int fd = ::open(get_full_path_subdir(path).c_str(), O_RDONLY);
+   if (fd < 0)
+     return -errno;
++  FDCloser f(fd);
+   maybe_inject_failure();
+   int r = ::fsync(fd);
+-  VOID_TEMP_FAILURE_RETRY(::close(fd));
+   maybe_inject_failure();
+   if (r < 0)
+     return -errno;
+   else
+@@ -752,9 +763,10 @@
+   char buf[FILENAME_MAX_LEN + 1];
+   for ( ; ; ++i) {
+     candidate = lfn_get_short_name(oid, i);
+     candidate_path = get_full_path(path, candidate);
+-    r = chain_getxattr(candidate_path.c_str(), get_lfn_attr().c_str(), buf, sizeof(buf));
++    r = chain_getxattr(candidate_path.c_str(), get_lfn_attr().c_str(),
++		       buf, sizeof(buf));
+     if (r < 0) {
+       if (errno != ENODATA && errno != ENOENT)
+ 	return -errno;
+       if (errno == ENODATA) {
+@@ -783,8 +795,40 @@
+       if (exists)
+ 	*exists = 1;
+       return 0;
+     }
++    r = chain_getxattr(candidate_path.c_str(), get_alt_lfn_attr().c_str(),
++		       buf, sizeof(buf));
++    if (r > 0) {
++      // only consider alt name if nlink > 1
++      struct stat st;
++      int rc = ::stat(candidate_path.c_str(), &st);
++      if (rc < 0)
++	return -errno;
++      if (st.st_nlink <= 1) {
++	// left over from incomplete unlink, remove
++	maybe_inject_failure();
++	dout(20) << __func__ << " found extra alt attr for " << candidate_path
++		 << ", long name " << string(buf, r) << dendl;
++	rc = chain_removexattr(candidate_path.c_str(),
++			       get_alt_lfn_attr().c_str());
++	maybe_inject_failure();
++	if (rc < 0)
++	  return rc;
++	continue;
++      }
++      buf[MIN((int)sizeof(buf) - 1, r)] = '\0';
++      if (!strcmp(buf, full_name.c_str())) {
++	dout(20) << __func__ << " used alt attr for " << full_name << dendl;
++	if (mangled_name)
++	  *mangled_name = candidate;
++	if (out_path)
++	  *out_path = candidate_path;
++	if (exists)
++	  *exists = 1;
++	return 0;
++      }
++    }
+   }
+   assert(0); // Unreachable
+   return 0;
+ }
+@@ -797,9 +841,26 @@
+     return 0;
+   string full_path = get_full_path(path, mangled_name);
+   string full_name = lfn_generate_object_name(oid);
+   maybe_inject_failure();
+-  return chain_setxattr(full_path.c_str(), get_lfn_attr().c_str(), 
++
++  // if the main attr exists and is different, move it to the alt attr.
++  char buf[FILENAME_MAX_LEN + 1];
++  int r = chain_getxattr(full_path.c_str(), get_lfn_attr().c_str(),
++			 buf, sizeof(buf));
++  if (r >= 0 && (r != (int)full_name.length() ||
++		 memcmp(buf, full_name.c_str(), full_name.length()))) {
++    dout(20) << __func__ << " " << mangled_name
++	     << " moving old name to alt attr "
++	     << string(buf, r)
++	     << ", new name is " << full_name << dendl;
++    r = chain_setxattr(full_path.c_str(), get_alt_lfn_attr().c_str(),
++		       buf, r);
++    if (r < 0)
++      return r;
++  }
++
++  return chain_setxattr(full_path.c_str(), get_lfn_attr().c_str(),
+ 		     full_name.c_str(), full_name.size());
+ }
+ 
+ int LFNIndex::lfn_unlink(const vector<string> &path,
+@@ -838,28 +899,37 @@
+ 	return -errno;
+       }
+     }
+   }
++  string full_path = get_full_path(path, mangled_name);
++  int fd = ::open(full_path.c_str(), O_RDONLY);
++  if (fd < 0)
++    return -errno;
++  FDCloser f(fd);
+   if (i == removed_index + 1) {
+-    string full_path = get_full_path(path, mangled_name);
+     maybe_inject_failure();
+     int r = ::unlink(full_path.c_str());
+     maybe_inject_failure();
+     if (r < 0)
+       return -errno;
+-    else
+-      return 0;
+   } else {
+-    string rename_to = get_full_path(path, mangled_name);
++    string& rename_to = full_path;
+     string rename_from = get_full_path(path, lfn_get_short_name(oid, i - 1));
+     maybe_inject_failure();
+     int r = ::rename(rename_from.c_str(), rename_to.c_str());
+     maybe_inject_failure();
+     if (r < 0)
+       return -errno;
+-    else
+-      return 0;
+   }
++  struct stat st;
++  int r = ::fstat(fd, &st);
++  if (r == 0 && st.st_nlink > 0) {
++    // remove alt attr
++    dout(20) << __func__ << " removing alt attr from " << full_path << dendl;
++    fsync_dir(path);
++    chain_fremovexattr(fd, get_alt_lfn_attr().c_str());
++  }
++  return r;
+ }
+ 
+ int LFNIndex::lfn_translate(const vector<string> &path,
+ 			    const string &short_name,
+--- a/src/os/LFNIndex.h
++++ b/src/os/LFNIndex.h
+@@ -122,9 +122,9 @@
+     error_injection_enabled = false;
+   }
+ 
+ private:
+-  string lfn_attribute;
++  string lfn_attribute, lfn_alt_attribute;
+   coll_t collection;
+ 
+ public:
+   /// Constructor
+@@ -145,9 +145,10 @@
+     } else {
+       char buf[100];
+       snprintf(buf, sizeof(buf), "%d", index_version);
+       lfn_attribute = LFN_ATTR + string(buf);
+-    }
++      lfn_alt_attribute = LFN_ATTR + string(buf) + "-alt";
++   }
+   }
+ 
+   coll_t coll() const { return collection; }
+ 
+@@ -422,8 +423,11 @@
+    */
+   const string &get_lfn_attr() const {
+     return lfn_attribute;
+   }
++  const string &get_alt_lfn_attr() const {
++    return lfn_alt_attribute;
++  }
+ 
+   /**
+    * Gets the filename corresponsing to oid in path.
+    *
+--- a/src/os/MemStore.cc
++++ b/src/os/MemStore.cc
+@@ -949,9 +949,14 @@
+       }
+       break;
+ 
+     case Transaction::OP_SETALLOCHINT:
+-      // nop
++      {
++        coll_t cid(i.get_cid());
++        ghobject_t oid = i.get_oid();
++        (void)i.get_length();  // discard result
++        (void)i.get_length();  // discard result
++      }
+       break;
+ 
+     default:
+       derr << "bad op " << op << dendl;
+--- a/src/os/ObjectStore.cc
++++ b/src/os/ObjectStore.cc
+@@ -143,9 +143,13 @@
+ int ObjectStore::collection_list_range(coll_t c, hobject_t start, hobject_t end,
+ 			    snapid_t seq, vector<hobject_t> *ls)
+ {
+   vector<ghobject_t> go;
+-  ghobject_t gstart(start), gend(end);
++  // Starts with the smallest shard id and generation to
++  // make sure the result list has the marker object
++  ghobject_t gstart(start, 0, shard_id_t(0));
++  // Exclusive end, choose the smallest end ghobject
++  ghobject_t gend(end, 0, shard_id_t(0));
+   int ret = collection_list_range(c, gstart, gend, seq, &go);
+   if (ret == 0) {
+     ls->reserve(go.size());
+     for (vector<ghobject_t>::iterator i = go.begin(); i != go.end() ; ++i)
+--- a/src/os/ObjectStore.h
++++ b/src/os/ObjectStore.h
+@@ -127,8 +127,24 @@
+    * created in ...::queue_transaction(s)
+    */
+   struct Sequencer_impl {
+     virtual void flush() = 0;
++
++    /**
++     * Async flush_commit
++     *
++     * There are two cases:
++     * 1) sequencer is currently idle: the method returns true and
++     *    c is deleted
++     * 2) sequencer is not idle: the method returns false and c is
++     *    called asynchronously with a value of 0 once all transactions
++     *    queued on this sequencer prior to the call have been applied
++     *    and committed.
++     */
++    virtual bool flush_commit(
++      Context *c ///< [in] context to call upon flush/commit
++      ) = 0; ///< @return true if idle, false otherwise
++
+     virtual ~Sequencer_impl() {}
+   };
+ 
+   /**
+@@ -152,8 +168,18 @@
+     void flush() {
+       if (p)
+ 	p->flush();
+     }
++
++    /// @see Sequencer_impl::flush_commit()
++    bool flush_commit(Context *c) {
++      if (!p) {
++	delete c;
++	return true;
++      } else {
++	return p->flush_commit(c);
++      }
++    }
+   };
+ 
+   /*********************************
+    *
+--- a/src/osd/ECBackend.cc
++++ b/src/osd/ECBackend.cc
+@@ -104,15 +104,15 @@
+ }
+ 
+ void ECBackend::ReadOp::dump(Formatter *f) const
+ {
+-  f->dump_stream("tid") << tid;
++  f->dump_unsigned("tid", tid);
+   if (op && op->get_req()) {
+     f->dump_stream("op") << *(op->get_req());
+   }
+   f->dump_stream("to_read") << to_read;
+   f->dump_stream("complete") << complete;
+-  f->dump_stream("priority") << priority;
++  f->dump_int("priority", priority);
+   f->dump_stream("obj_to_source") << obj_to_source;
+   f->dump_stream("source_to_obj") << source_to_obj;
+   f->dump_stream("in_progress") << in_progress;
+ }
+@@ -157,9 +157,9 @@
+   f->dump_stream("missing_on") << missing_on;
+   f->dump_stream("missing_on_shards") << missing_on_shards;
+   f->dump_stream("recovery_info") << recovery_info;
+   f->dump_stream("recovery_progress") << recovery_progress;
+-  f->dump_stream("pending_read") << pending_read;
++  f->dump_bool("pending_read", pending_read);
+   f->dump_stream("state") << tostr(state);
+   f->dump_stream("waiting_on_pushes") << waiting_on_pushes;
+   f->dump_stream("extent_requested") << extent_requested;
+ }
+@@ -828,8 +828,9 @@
+   get_parent()->log_operation(
+     op.log_entries,
+     op.updated_hit_set_history,
+     op.trim_to,
++    op.trim_rollback_to,
+     !(op.t.empty()),
+     localt);
+   localt->append(op.t);
+   if (on_local_applied_sync) {
+@@ -1210,8 +1211,9 @@
+   const hobject_t &hoid,
+   const eversion_t &at_version,
+   PGTransaction *_t,
+   const eversion_t &trim_to,
++  const eversion_t &trim_rollback_to,
+   vector<pg_log_entry_t> &log_entries,
+   boost::optional<pg_hit_set_history_t> &hset_history,
+   Context *on_local_applied_sync,
+   Context *on_all_applied,
+@@ -1225,8 +1227,9 @@
+   Op *op = &(tid_to_op_map[tid]);
+   op->hoid = hoid;
+   op->version = at_version;
+   op->trim_to = trim_to;
++  op->trim_rollback_to = trim_rollback_to;
+   op->log_entries.swap(log_entries);
+   std::swap(op->updated_hit_set_history, hset_history);
+   op->on_local_applied_sync = on_local_applied_sync;
+   op->on_all_applied = on_all_applied;
+@@ -1531,8 +1534,9 @@
+       stats,
+       should_send ? iter->second : ObjectStore::Transaction(),
+       op->version,
+       op->trim_to,
++      op->trim_rollback_to,
+       op->log_entries,
+       op->updated_hit_set_history,
+       op->temp_added,
+       op->temp_cleared);
+--- a/src/osd/ECBackend.h
++++ b/src/osd/ECBackend.h
+@@ -96,8 +96,9 @@
+     const hobject_t &hoid,
+     const eversion_t &at_version,
+     PGTransaction *t,
+     const eversion_t &trim_to,
++    const eversion_t &trim_rollback_to,
+     vector<pg_log_entry_t> &log_entries,
+     boost::optional<pg_hit_set_history_t> &hset_history,
+     Context *on_local_applied_sync,
+     Context *on_all_applied,
+@@ -325,8 +326,9 @@
+   struct Op {
+     hobject_t hoid;
+     eversion_t version;
+     eversion_t trim_to;
++    eversion_t trim_rollback_to;
+     vector<pg_log_entry_t> log_entries;
+     boost::optional<pg_hit_set_history_t> updated_hit_set_history;
+     Context *on_local_applied_sync;
+     Context *on_all_applied;
+--- a/src/osd/ECMsgTypes.cc
++++ b/src/osd/ECMsgTypes.cc
+@@ -15,9 +15,9 @@
+ #include "ECMsgTypes.h"
+ 
+ void ECSubWrite::encode(bufferlist &bl) const
+ {
+-  ENCODE_START(2, 1, bl);
++  ENCODE_START(3, 1, bl);
+   ::encode(from, bl);
+   ::encode(tid, bl);
+   ::encode(reqid, bl);
+   ::encode(soid, bl);
+@@ -28,14 +28,15 @@
+   ::encode(log_entries, bl);
+   ::encode(temp_added, bl);
+   ::encode(temp_removed, bl);
+   ::encode(updated_hit_set_history, bl);
++  ::encode(trim_rollback_to, bl);
+   ENCODE_FINISH(bl);
+ }
+ 
+ void ECSubWrite::decode(bufferlist::iterator &bl)
+ {
+-  DECODE_START(2, bl);
++  DECODE_START(3, bl);
+   ::decode(from, bl);
+   ::decode(tid, bl);
+   ::decode(reqid, bl);
+   ::decode(soid, bl);
+@@ -48,8 +49,13 @@
+   ::decode(temp_removed, bl);
+   if (struct_v >= 2) {
+     ::decode(updated_hit_set_history, bl);
+   }
++  if (struct_v >= 3) {
++    ::decode(trim_rollback_to, bl);
++  } else {
++    trim_rollback_to = trim_to;
++  }
+   DECODE_FINISH(bl);
+ }
+ 
+ std::ostream &operator<<(
+@@ -57,20 +63,22 @@
+ {
+   lhs << "ECSubWrite(tid=" << rhs.tid
+       << ", reqid=" << rhs.reqid
+       << ", at_version=" << rhs.at_version
+-      << ", trim_to=" << rhs.trim_to;
++      << ", trim_to=" << rhs.trim_to
++      << ", trim_rollback_to=" << rhs.trim_rollback_to;
+   if (rhs.updated_hit_set_history)
+     lhs << ", has_updated_hit_set_history";
+   return lhs <<  ")";
+ }
+ 
+ void ECSubWrite::dump(Formatter *f) const
+ {
+-  f->dump_stream("tid") << tid;
++  f->dump_unsigned("tid", tid);
+   f->dump_stream("reqid") << reqid;
+   f->dump_stream("at_version") << at_version;
+   f->dump_stream("trim_to") << trim_to;
++  f->dump_stream("trim_rollback_to") << trim_rollback_to;
+   f->dump_stream("has_updated_hit_set_history")
+     << static_cast<bool>(updated_hit_set_history);
+ }
+ 
+@@ -84,8 +92,14 @@
+   o.back()->tid = 4;
+   o.back()->reqid = osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678);
+   o.back()->at_version = eversion_t(10, 300);
+   o.back()->trim_to = eversion_t(5, 42);
++  o.push_back(new ECSubWrite());
++  o.back()->tid = 9;
++  o.back()->reqid = osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678);
++  o.back()->at_version = eversion_t(10, 300);
++  o.back()->trim_to = eversion_t(5, 42);
++  o.back()->trim_rollback_to = eversion_t(8, 250);
+ }
+ 
+ void ECSubWriteReply::encode(bufferlist &bl) const
+ {
+@@ -120,9 +134,9 @@
+ }
+ 
+ void ECSubWriteReply::dump(Formatter *f) const
+ {
+-  f->dump_stream("tid") << tid;
++  f->dump_unsigned("tid", tid);
+   f->dump_stream("last_complete") << last_complete;
+   f->dump_stream("committed") << committed;
+   f->dump_stream("applied") << applied;
+ }
+@@ -170,9 +184,9 @@
+ 
+ void ECSubRead::dump(Formatter *f) const
+ {
+   f->dump_stream("from") << from;
+-  f->dump_stream("tid") << tid;
++  f->dump_unsigned("tid", tid);
+   f->open_array_section("objects");
+   for (map<hobject_t, list<pair<uint64_t, uint64_t> > >::const_iterator i =
+ 	 to_read.begin();
+        i != to_read.end();
+@@ -258,9 +272,9 @@
+ 
+ void ECSubReadReply::dump(Formatter *f) const
+ {
+   f->dump_stream("from") << from;
+-  f->dump_stream("tid") << tid;
++  f->dump_unsigned("tid", tid);
+   f->open_array_section("buffers_read");
+   for (map<hobject_t, list<pair<uint64_t, bufferlist> > >::const_iterator i =
+ 	 buffers_read.begin();
+        i != buffers_read.end();
+--- a/src/osd/ECMsgTypes.h
++++ b/src/osd/ECMsgTypes.h
+@@ -27,8 +27,9 @@
+   pg_stat_t stats;
+   ObjectStore::Transaction t;
+   eversion_t at_version;
+   eversion_t trim_to;
++  eversion_t trim_rollback_to;
+   vector<pg_log_entry_t> log_entries;
+   set<hobject_t> temp_added;
+   set<hobject_t> temp_removed;
+   boost::optional<pg_hit_set_history_t> updated_hit_set_history;
+@@ -41,16 +42,18 @@
+     const pg_stat_t &stats,
+     const ObjectStore::Transaction &t,
+     eversion_t at_version,
+     eversion_t trim_to,
++    eversion_t trim_rollback_to,
+     vector<pg_log_entry_t> log_entries,
+     boost::optional<pg_hit_set_history_t> updated_hit_set_history,
+     const set<hobject_t> &temp_added,
+     const set<hobject_t> &temp_removed)
+     : from(from), tid(tid), reqid(reqid),
+       soid(soid), stats(stats), t(t),
+       at_version(at_version),
+-      trim_to(trim_to), log_entries(log_entries),
++      trim_to(trim_to), trim_rollback_to(trim_rollback_to),
++      log_entries(log_entries),
+       temp_added(temp_added),
+       temp_removed(temp_removed),
+       updated_hit_set_history(updated_hit_set_history) {}
+   void encode(bufferlist &bl) const;
+--- a/src/osd/HitSet.h
++++ b/src/osd/HitSet.h
+@@ -368,9 +368,9 @@
+     double get_fpp() const {
+       return (double)fpp_micro / 1000000.0;
+     }
+     void set_fpp(double f) {
+-      fpp_micro = (unsigned)(f * 1000000.0);
++      fpp_micro = (unsigned)(llrintl(f * (double)1000000.0));
+     }
+ 
+     void encode(bufferlist& bl) const {
+       ENCODE_START(1, 1, bl);
+--- a/src/osd/OSD.cc
++++ b/src/osd/OSD.cc
+@@ -41,8 +41,9 @@
+ #include "osdc/Objecter.h"
+ 
+ #include "common/ceph_argparse.h"
+ #include "common/version.h"
++#include "common/io_priority.h"
+ 
+ #include "os/ObjectStore.h"
+ 
+ #include "ReplicatedPG.h"
+@@ -190,8 +191,9 @@
+   rep_scrub_wq(osd->rep_scrub_wq),
+   push_wq("push_wq", cct->_conf->osd_recovery_thread_timeout, &osd->recovery_tp),
+   gen_wq("gen_wq", cct->_conf->osd_recovery_thread_timeout, &osd->recovery_tp),
+   class_handler(osd->class_handler),
++  pg_epoch_lock("OSDService::pg_epoch_lock"),
+   publish_lock("OSDService::publish_lock"),
+   pre_publish_lock("OSDService::pre_publish_lock"),
+   sched_scrub_lock("OSDService::sched_scrub_lock"), scrubs_pending(0),
+   scrubs_active(0),
+@@ -1276,8 +1278,10 @@
+   recovery_tp.start();
+   disk_tp.start();
+   command_tp.start();
+ 
++  set_disk_tp_priority();
++
+   // start the heartbeat
+   heartbeat_thread.create();
+ 
+   // tick
+@@ -1304,8 +1308,10 @@
+   osd_lock.Lock();
+   if (is_stopping())
+     return 0;
+ 
++  check_config();
++
+   dout(10) << "ensuring pgs have consumed prior maps" << dendl;
+   consume_map();
+   peering_wq.drain();
+ 
+@@ -1662,10 +1668,12 @@
+   recovery_tp.stop();
+   dout(10) << "recovery tp stopped" << dendl;
+ 
+   op_tp.drain();
++  peering_wq.clear();
++  scrub_finalize_wq.clear();
+   op_tp.stop();
+-  dout(10) << "op tp stopped" << dendl;
++  dout(10) << "osd tp stopped" << dendl;
+ 
+   command_tp.drain();
+   command_tp.stop();
+   dout(10) << "command tp stopped" << dendl;
+@@ -1707,9 +1715,8 @@
+     Mutex::Locker l(pg_stat_queue_lock);
+     assert(pg_stat_queue.empty());
+   }
+ 
+-  peering_wq.clear();
+   // Remove PGs
+ #ifdef PG_DEBUG_REFS
+   service.dump_live_pgids();
+ #endif
+@@ -1853,8 +1860,10 @@
+   PG* pg = _make_pg(createmap, pgid);
+ 
+   pg_map[pgid] = pg;
+ 
++  service.pg_add_epoch(pg->info.pgid, createmap->get_epoch());
++
+   pg->lock(no_lockdep_check);
+   pg->get("PGMap");  // because it's in pg_map
+   return pg;
+ }
+@@ -1884,8 +1893,9 @@
+ {
+   epoch_t e(service.get_osdmap()->get_epoch());
+   pg->get("PGMap");  // For pg_map
+   pg_map[pg->info.pgid] = pg;
++  service.pg_add_epoch(pg->info.pgid, pg->get_osdmap()->get_epoch());
+   dout(10) << "Adding newly split pg " << *pg << dendl;
+   vector<int> up, acting;
+   pg->get_osdmap()->pg_to_up_acting_osds(pg->info.pgid.pgid, up, acting);
+   int role = OSDMap::calc_pg_role(service.whoami, acting);
+@@ -4391,11 +4401,10 @@
+       // 1MB block sizes are big enough so that we get more stuff done.
+       // However, to avoid the osd from getting hung on this and having
+       // timers being triggered, we are going to limit the count assuming
+       // a configurable throughput and duration.
+-      int64_t total_throughput =
++      int64_t max_count =
+         g_conf->osd_bench_large_size_max_throughput * duration;
+-      int64_t max_count = (int64_t) (total_throughput / bsize);
+       if (count > max_count) {
+         ss << "'count' values greater than " << max_count
+            << " for a block size of " << prettybyte_t(bsize) << ", assuming "
+            << prettybyte_t(g_conf->osd_bench_large_size_max_throughput) << "/s,"
+@@ -5712,13 +5721,14 @@
+       client_messenger->set_default_policy(p);
+     }
+   }
+   {
+-    Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_MON);
++    Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
+     uint64_t mask;
+     uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
+     if ((p.features_required & mask) != features) {
+       dout(0) << "crush map has features " << features
++	      << " was " << p.features_required
+ 	      << ", adjusting msgr requires for mons" << dendl;
+       p.features_required = (p.features_required & ~mask) | features;
+       client_messenger->set_policy(entity_name_t::TYPE_MON, p);
+     }
+@@ -5747,9 +5757,9 @@
+     }
+   }
+ }
+ 
+-void OSD::advance_pg(
++bool OSD::advance_pg(
+   epoch_t osd_epoch, PG *pg,
+   ThreadPool::TPHandle &handle,
+   PG::RecoveryCtx *rctx,
+   set<boost::intrusive_ptr<PG> > *new_pgs)
+@@ -5758,13 +5768,21 @@
+   epoch_t next_epoch = pg->get_osdmap()->get_epoch() + 1;
+   OSDMapRef lastmap = pg->get_osdmap();
+ 
+   if (lastmap->get_epoch() == osd_epoch)
+-    return;
++    return true;
+   assert(lastmap->get_epoch() < osd_epoch);
+ 
++  epoch_t min_epoch = service.get_min_pg_epoch();
++  epoch_t max;
++  if (min_epoch) {
++    max = min_epoch + g_conf->osd_map_max_advance;
++  } else {
++    max = next_epoch + g_conf->osd_map_max_advance;
++  }
++
+   for (;
+-       next_epoch <= osd_epoch;
++       next_epoch <= osd_epoch && next_epoch <= max;
+        ++next_epoch) {
+     OSDMapRef nextmap = service.try_get_map(next_epoch);
+     if (!nextmap)
+       continue;
+@@ -5794,9 +5812,17 @@
+ 
+     lastmap = nextmap;
+     handle.reset_tp_timeout();
+   }
++  service.pg_update_epoch(pg->info.pgid, lastmap->get_epoch());
+   pg->handle_activate_map(rctx);
++  if (next_epoch <= osd_epoch) {
++    dout(10) << __func__ << " advanced by max " << g_conf->osd_map_max_advance
++	     << " past min epoch " << min_epoch
++	     << " ... will requeue " << *pg << dendl;
++    return false;
++  }
++  return true;
+ }
+ 
+ /** 
+  * scan placement groups, initiate any replication
+@@ -6126,9 +6152,9 @@
+   }
+   return true;
+ }
+ 
+-bool OSD::require_osd_peer(OpRequestRef op)
++bool OSD::require_osd_peer(OpRequestRef& op)
+ {
+   if (!op->get_req()->get_connection()->peer_is_osd()) {
+     dout(0) << "require_osd_peer received from non-osd " << op->get_req()->get_connection()->get_peer_addr()
+ 	    << " " << *op->get_req() << dendl;
+@@ -6136,13 +6162,66 @@
+   }
+   return true;
+ }
+ 
++bool OSD::require_self_aliveness(OpRequestRef& op, epoch_t epoch)
++{
++  if (epoch < up_epoch) {
++    dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
++    return false;
++  }
++
++  if (!is_active()) {
++    dout(7) << "still in boot state, dropping message " << *op->get_req() << dendl;
++    return false;
++  }
++
++  return true;
++}
++
++bool OSD::require_same_peer_instance(OpRequestRef& op, OSDMapRef& map)
++{
++  Message *m = op->get_req();
++  int from = m->get_source().num();
++
++  if (!map->have_inst(from) ||
++      (map->get_cluster_addr(from) != m->get_source_inst().addr)) {
++    dout(5) << "from dead osd." << from << ", marking down, "
++	    << " msg was " << m->get_source_inst().addr
++	    << " expected " << (map->have_inst(from) ?
++				map->get_cluster_addr(from) : entity_addr_t())
++	    << dendl;
++    ConnectionRef con = m->get_connection();
++    cluster_messenger->mark_down(con.get());
++    Session *s = static_cast<Session*>(con->get_priv());
++    if (s) {
++      con->set_priv(NULL);   // break ref <-> session cycle, if any
++      s->put();
++    }
++    return false;
++  }
++  return true;
++}
++
++bool OSD::require_up_osd_peer(OpRequestRef& op, OSDMapRef& map,
++                              epoch_t their_epoch)
++{
++  if (!require_self_aliveness(op, their_epoch)) {
++    return false;
++  } else if (!require_osd_peer(op)) {
++    return false;
++  } else if (map->get_epoch() >= their_epoch &&
++	     !require_same_peer_instance(op, map)) {
++    return false;
++  }
++  return true;
++}
++
+ /*
+  * require that we have same (or newer) map, and that
+  * the source is the pg primary.
+  */
+-bool OSD::require_same_or_newer_map(OpRequestRef op, epoch_t epoch)
++bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch)
+ {
+   Message *m = op->get_req();
+   dout(15) << "require_same_or_newer_map " << epoch << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
+ 
+@@ -6154,32 +6233,15 @@
+     wait_for_new_map(op);
+     return false;
+   }
+ 
+-  if (epoch < up_epoch) {
+-    dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
++  if (!require_self_aliveness(op, epoch)) {
+     return false;
+   }
+ 
+   // ok, our map is same or newer.. do they still exist?
+-  if (m->get_connection()->get_messenger() == cluster_messenger) {
+-    int from = m->get_source().num();
+-    if (!osdmap->have_inst(from) ||
+-	osdmap->get_cluster_addr(from) != m->get_source_inst().addr) {
+-      dout(5) << "from dead osd." << from << ", marking down, "
+-	      << " msg was " << m->get_source_inst().addr
+-	      << " expected " << (osdmap->have_inst(from) ? osdmap->get_cluster_addr(from) : entity_addr_t())
+-	      << dendl;
+-      ConnectionRef con = m->get_connection();
+-      con->set_priv(NULL);   // break ref <-> session cycle, if any
+-      cluster_messenger->mark_down(con.get());
+-      return false;
+-    }
+-  }
+-
+-  // ok, we have at least as new a map as they do.  are we (re)booting?
+-  if (!is_active()) {
+-    dout(7) << "still in boot state, dropping message " << *m << dendl;
++  if (m->get_connection()->get_messenger() == cluster_messenger &&
++      !require_same_peer_instance(op, osdmap)) {
+     return false;
+   }
+ 
+   return true;
+@@ -7141,8 +7203,10 @@
+       PGRef(pg))
+     );
+   remove_wq.queue(make_pair(PGRef(pg), deleting));
+ 
++  service.pg_remove_epoch(pg->info.pgid);
++
+   // remove from map
+   pg_map.erase(pg->info.pgid);
+   pg->put("PGMap"); // since we've taken it out of map
+ }
+@@ -7554,9 +7618,9 @@
+     dout(3) << "replica op from before up" << dendl;
+     return;
+   }
+ 
+-  if (!require_osd_peer(op))
++  if (!require_up_osd_peer(op, osdmap, m->map_epoch))
+     return;
+ 
+   // must be a rep op.
+   assert(m->get_source().is_osd());
+@@ -7769,10 +7833,11 @@
+     if (pg->deleting) {
+       pg->unlock();
+       continue;
+     }
+-    advance_pg(curmap->get_epoch(), pg, handle, &rctx, &split_pgs);
+-    if (!pg->peering_queue.empty()) {
++    if (!advance_pg(curmap->get_epoch(), pg, handle, &rctx, &split_pgs)) {
++      pg->queue_null(curmap->get_epoch(), curmap->get_epoch());
++    } else if (!pg->peering_queue.empty()) {
+       PG::CephPeeringEvtRef evt = pg->peering_queue.front();
+       pg->peering_queue.pop_front();
+       pg->handle_peering_event(evt, &rctx);
+     }
+@@ -7807,8 +7872,13 @@
+   static const char* KEYS[] = {
+     "osd_max_backfills",
+     "osd_op_complaint_time", "osd_op_log_threshold",
+     "osd_op_history_size", "osd_op_history_duration",
++    "osd_map_cache_size",
++    "osd_map_max_advance",
++    "osd_pg_epoch_persisted_max_stale",
++    "osd_disk_thread_ioprio_class",
++    "osd_disk_thread_ioprio_priority",
+     NULL
+   };
+   return KEYS;
+ }
+@@ -7829,8 +7899,40 @@
+       changed.count("osd_op_history_duration")) {
+     op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
+                                              cct->_conf->osd_op_history_duration);
+   }
++  if (changed.count("osd_disk_thread_ioprio_class") ||
++      changed.count("osd_disk_thread_ioprio_priority")) {
++    set_disk_tp_priority();
++  }
++
++  check_config();
++}
++
++void OSD::check_config()
++{
++  // some sanity checks
++  if (g_conf->osd_map_cache_size <= g_conf->osd_map_max_advance + 2) {
++    clog.warn() << "osd_map_cache_size (" << g_conf->osd_map_cache_size << ")"
++		<< " is not > osd_map_max_advance ("
++		<< g_conf->osd_map_max_advance << ")";
++  }
++  if (g_conf->osd_map_cache_size <= (int)g_conf->osd_pg_epoch_persisted_max_stale + 2) {
++    clog.warn() << "osd_map_cache_size (" << g_conf->osd_map_cache_size << ")"
++		<< " is not > osd_pg_epoch_persisted_max_stale ("
++		<< g_conf->osd_pg_epoch_persisted_max_stale << ")";
++  }
++}
++
++void OSD::set_disk_tp_priority()
++{
++  dout(10) << __func__
++	   << " class " << cct->_conf->osd_disk_thread_ioprio_class
++	   << " priority " << cct->_conf->osd_disk_thread_ioprio_priority
++	   << dendl;
++  int cls =
++    ceph_ioprio_string_to_class(cct->_conf->osd_disk_thread_ioprio_class);
++  disk_tp.set_ioprio(cls, cct->_conf->osd_disk_thread_ioprio_priority);
+ }
+ 
+ // --------------------------------
+ 
+--- a/src/osd/OSD.h
++++ b/src/osd/OSD.h
+@@ -333,8 +333,44 @@
+   ClassHandler  *&class_handler;
+ 
+   void dequeue_pg(PG *pg, list<OpRequestRef> *dequeued);
+ 
++  // -- map epoch lower bound --
++  Mutex pg_epoch_lock;
++  multiset<epoch_t> pg_epochs;
++  map<spg_t,epoch_t> pg_epoch;
++
++  void pg_add_epoch(spg_t pgid, epoch_t epoch) {
++    Mutex::Locker l(pg_epoch_lock);
++    map<spg_t,epoch_t>::iterator t = pg_epoch.find(pgid);
++    assert(t == pg_epoch.end());
++    pg_epoch[pgid] = epoch;
++    pg_epochs.insert(epoch);
++  }
++  void pg_update_epoch(spg_t pgid, epoch_t epoch) {
++    Mutex::Locker l(pg_epoch_lock);
++    map<spg_t,epoch_t>::iterator t = pg_epoch.find(pgid);
++    assert(t != pg_epoch.end());
++    pg_epochs.erase(pg_epochs.find(t->second));
++    t->second = epoch;
++    pg_epochs.insert(epoch);
++  }
++  void pg_remove_epoch(spg_t pgid) {
++    Mutex::Locker l(pg_epoch_lock);
++    map<spg_t,epoch_t>::iterator t = pg_epoch.find(pgid);
++    if (t != pg_epoch.end()) {
++      pg_epochs.erase(pg_epochs.find(t->second));
++      pg_epoch.erase(t);
++    }
++  }
++  epoch_t get_min_pg_epoch() {
++    Mutex::Locker l(pg_epoch_lock);
++    if (pg_epochs.empty())
++      return 0;
++    else
++      return *pg_epochs.begin();
++  }
++
+   // -- superblock --
+   Mutex publish_lock, pre_publish_lock; // pre-publish orders before publish
+   OSDSuperblock superblock;
+   OSDSuperblock get_superblock() {
+@@ -783,8 +819,9 @@
+   // config observer bits
+   virtual const char** get_tracked_conf_keys() const;
+   virtual void handle_conf_change(const struct md_config_t *conf,
+ 				  const std::set <std::string> &changed);
++  void check_config();
+ 
+ protected:
+   Mutex osd_lock;			// global lock
+   SafeTimer tick_timer;    // safe timer (osd_lock)
+@@ -943,8 +980,10 @@
+   ThreadPool command_tp;
+ 
+   bool paused_recovery;
+ 
++  void set_disk_tp_priority();
++
+   // -- sessions --
+ public:
+   struct Session : public RefCountedObject {
+     EntityName entity_name;
+@@ -1254,9 +1293,9 @@
+   void handle_osd_map(class MOSDMap *m);
+   void note_down_osd(int osd);
+   void note_up_osd(int osd);
+   
+-  void advance_pg(
++  bool advance_pg(
+     epoch_t advance_to, PG *pg,
+     ThreadPool::TPHandle &handle,
+     PG::RecoveryCtx *rctx,
+     set<boost::intrusive_ptr<PG> > *split_pgs
+@@ -1512,11 +1551,24 @@
+ 		OSDMapRef map);
+   void repeer(PG *pg, map< int, map<spg_t,pg_query_t> >& query_map);
+ 
+   bool require_mon_peer(Message *m);
+-  bool require_osd_peer(OpRequestRef op);
++  bool require_osd_peer(OpRequestRef& op);
++  /***
++   * Verifies that we were alive in the given epoch, and that
++   * we still are.
++   */
++  bool require_self_aliveness(OpRequestRef& op, epoch_t alive_since);
++  /**
++   * Verifies that the OSD who sent the given op has the same
++   * address as in the given map.
++   * @pre op was sent by an OSD using the cluster messenger
++   */
++  bool require_same_peer_instance(OpRequestRef& op, OSDMapRef& map);
++  bool require_up_osd_peer(OpRequestRef& Op, OSDMapRef& map,
++                           epoch_t their_epoch);
+ 
+-  bool require_same_or_newer_map(OpRequestRef op, epoch_t e);
++  bool require_same_or_newer_map(OpRequestRef& op, epoch_t e);
+ 
+   void handle_pg_query(OpRequestRef op);
+   void handle_pg_notify(OpRequestRef op);
+   void handle_pg_log(OpRequestRef op);
+--- a/src/osd/OSDMap.cc
++++ b/src/osd/OSDMap.cc
+@@ -958,12 +958,9 @@
+   if (crush->has_nondefault_tunables())
+     features |= CEPH_FEATURE_CRUSH_TUNABLES;
+   if (crush->has_nondefault_tunables2())
+     features |= CEPH_FEATURE_CRUSH_TUNABLES2;
+-  if (crush->has_v2_rules())
+-    features |= CEPH_FEATURE_CRUSH_V2;
+-  if (crush->has_nondefault_tunables3() ||
+-      crush->has_v3_rules())
++  if (crush->has_nondefault_tunables3())
+     features |= CEPH_FEATURE_CRUSH_TUNABLES3;
+   mask |= CEPH_FEATURES_CRUSH;
+ 
+   for (map<int64_t,pg_pool_t>::const_iterator p = pools.begin(); p != pools.end(); ++p) {
+@@ -977,8 +974,17 @@
+     if (!p->second.tiers.empty() ||
+ 	p->second.is_tier()) {
+       features |= CEPH_FEATURE_OSD_CACHEPOOL;
+     }
++    int ruleid = crush->find_rule(p->second.get_crush_ruleset(),
++				  p->second.get_type(),
++				  p->second.get_size());
++    if (ruleid >= 0) {
++      if (crush->is_v2_rule(ruleid))
++	features |= CEPH_FEATURE_CRUSH_V2;
++      if (crush->is_v3_rule(ruleid))
++	features |= CEPH_FEATURE_CRUSH_TUNABLES3;
++    }
+   }
+   mask |= CEPH_FEATURE_OSDHASHPSPOOL | CEPH_FEATURE_OSD_CACHEPOOL;
+   if (entity_type != CEPH_ENTITY_TYPE_CLIENT)
+     mask |= CEPH_FEATURE_OSD_ERASURE_CODES;
+@@ -1800,9 +1806,17 @@
+   {
+     ENCODE_START(1, 1, bl); // extended, osd-only data
+     ::encode(osd_addrs->hb_back_addr, bl);
+     ::encode(osd_info, bl);
+-    ::encode(blacklist, bl);
++    {
++      // put this in a sorted, ordered map<> so that we encode in a
++      // deterministic order.
++      map<entity_addr_t,utime_t> blacklist_map;
++      for (ceph::unordered_map<entity_addr_t,utime_t>::const_iterator p =
++	     blacklist.begin(); p != blacklist.end(); ++p)
++	blacklist_map.insert(make_pair(p->first, p->second));
++      ::encode(blacklist_map, bl);
++    }
+     ::encode(osd_addrs->cluster_addr, bl);
+     ::encode(cluster_snapshot_epoch, bl);
+     ::encode(cluster_snapshot, bl);
+     ::encode(*osd_uuid, bl);
+@@ -2158,8 +2172,9 @@
+   o.push_back(new OSDMap);
+   uuid_d fsid;
+   o.back()->build_simple(cct, 1, fsid, 16, 7, 8);
+   o.back()->created = o.back()->modified = utime_t(1, 2);  // fix timestamp
++  o.back()->blacklist[entity_addr_t()] = utime_t(5, 6);
+   cct->put();
+ }
+ 
+ string OSDMap::get_flag_string(unsigned f)
+@@ -2550,15 +2565,27 @@
+     set_state(i, 0);
+     set_weight(i, CEPH_OSD_OUT);
+   }
+ 
+-  map<string,string> erasure_code_profile_map;
+-  r = get_str_map(cct->_conf->osd_pool_default_erasure_code_profile,
+-		  ss,
+-		  &erasure_code_profile_map);
+-  erasure_code_profile_map["directory"] =
++  map<string,string> profile_map;
++  r = get_erasure_code_profile_default(cct, profile_map, &ss);
++  if (r < 0) {
++    lderr(cct) << ss.str() << dendl;
++    return r;
++  }
++  set_erasure_code_profile("default", profile_map);
++  return 0;
++}
++
++int OSDMap::get_erasure_code_profile_default(CephContext *cct,
++					     map<string,string> &profile_map,
++					     ostream *ss)
++{
++  int r = get_str_map(cct->_conf->osd_pool_default_erasure_code_profile,
++		      *ss,
++		      &profile_map);
++  profile_map["directory"] =
+     cct->_conf->osd_pool_default_erasure_code_directory;
+-  set_erasure_code_profile("default", erasure_code_profile_map);
+   return r;
+ }
+ 
+ int OSDMap::_build_crush_types(CrushWrapper& crush)
+--- a/src/osd/OSDMap.h
++++ b/src/osd/OSDMap.h
+@@ -379,8 +379,11 @@
+     map<string,map<string,string> >::const_iterator i =
+       erasure_code_profiles.find(name);
+     return i != erasure_code_profiles.end();
+   }
++  int get_erasure_code_profile_default(CephContext *cct,
++				       map<string,string> &profile_map,
++				       ostream *ss);
+   void set_erasure_code_profile(const string &name,
+ 				const map<string,string> &profile) {
+     erasure_code_profiles[name] = profile;
+   }
+--- a/src/osd/OpRequest.cc
++++ b/src/osd/OpRequest.cc
+@@ -32,9 +32,9 @@
+     f->open_object_section("client_info");
+     stringstream client_name;
+     client_name << m->get_orig_source();
+     f->dump_string("client", client_name.str());
+-    f->dump_int("tid", m->get_tid());
++    f->dump_unsigned("tid", m->get_tid());
+     f->close_section(); // client_info
+   }
+   {
+     f->open_array_section("events");
+--- a/src/osd/OpRequest.h
++++ b/src/osd/OpRequest.h
+@@ -73,8 +73,12 @@
+   void set_pg_op();
+ 
+   void _dump(utime_t now, Formatter *f) const;
+ 
++  bool has_feature(uint64_t f) const {
++    return request->get_connection()->has_feature(f);
++  }
++
+ private:
+   osd_reqid_t reqid;
+   uint8_t hit_flag_points;
+   uint8_t latest_flag_point;
+--- a/src/osd/PG.cc
++++ b/src/osd/PG.cc
+@@ -1442,9 +1442,9 @@
+     last_update_ondisk = info.last_update;
+     min_last_complete_ondisk = eversion_t(0,0);  // we don't know (yet)!
+   }
+   last_update_applied = info.last_update;
+-
++  last_rollback_info_trimmed_to_applied = pg_log.get_rollback_trimmed_to();
+ 
+   need_up_thru = false;
+ 
+   // write pg info, log
+@@ -2640,9 +2640,12 @@
+ }
+ 
+ 
+ void PG::append_log(
+-  vector<pg_log_entry_t>& logv, eversion_t trim_to, ObjectStore::Transaction &t,
++  vector<pg_log_entry_t>& logv,
++  eversion_t trim_to,
++  eversion_t trim_rollback_to,
++  ObjectStore::Transaction &t,
+   bool transaction_applied)
+ {
+   if (transaction_applied)
+     update_snap_map(logv, t);
+@@ -2654,15 +2657,35 @@
+        ++p) {
+     p->offset = 0;
+     add_log_entry(*p, keys[p->get_key_name()]);
+   }
+-  if (!transaction_applied)
+-    pg_log.clear_can_rollback_to();
++
++  PGLogEntryHandler handler;
++  if (!transaction_applied) {
++    pg_log.clear_can_rollback_to(&handler);
++    t.register_on_applied(
++      new C_UpdateLastRollbackInfoTrimmedToApplied(
++	this,
++	get_osdmap()->get_epoch(),
++	info.last_update));
++  } else if (trim_rollback_to > pg_log.get_rollback_trimmed_to()) {
++    pg_log.trim_rollback_info(
++      trim_rollback_to,
++      &handler);
++    t.register_on_applied(
++      new C_UpdateLastRollbackInfoTrimmedToApplied(
++	this,
++	get_osdmap()->get_epoch(),
++	trim_rollback_to));
++  }
+ 
+   dout(10) << "append_log  adding " << keys.size() << " keys" << dendl;
+   t.omap_setkeys(coll_t::META_COLL, log_oid, keys);
+-  PGLogEntryHandler handler;
++
+   pg_log.trim(&handler, trim_to, info);
++
++  dout(10) << __func__ << ": trimming to " << trim_rollback_to
++	   << " entries " << handler.to_trim << dendl;
+   handler.apply(this, &t);
+ 
+   // update the local pg, pg log
+   dirty_info = true;
+@@ -3003,9 +3026,10 @@
+ }
+ 
+ void PG::reg_next_scrub()
+ {
+-  if (scrubber.must_scrub) {
++  if (scrubber.must_scrub ||
++      (info.stats.stats_invalid && g_conf->osd_scrub_invalid_stats)) {
+     scrubber.scrub_reg_stamp = utime_t();
+   } else {
+     scrubber.scrub_reg_stamp = info.history.last_scrub_stamp;
+   }
+@@ -3261,8 +3285,36 @@
+     osd->send_message_osd_cluster(i->osd, subop, get_osdmap()->get_epoch());
+   }
+ }
+ 
++void PG::_scan_rollback_obs(
++  const vector<ghobject_t> &rollback_obs,
++  ThreadPool::TPHandle &handle)
++{
++  ObjectStore::Transaction *t = NULL;
++  eversion_t trimmed_to = last_rollback_info_trimmed_to_applied;
++  for (vector<ghobject_t>::const_iterator i = rollback_obs.begin();
++       i != rollback_obs.end();
++       ++i) {
++    if (i->generation < trimmed_to.version) {
++      osd->clog.error() << "osd." << osd->whoami
++			<< " pg " << info.pgid
++			<< " found obsolete rollback obj "
++			<< *i << " generation < trimmed_to "
++			<< trimmed_to
++			<< "...repaired";
++      if (!t)
++	t = new ObjectStore::Transaction;
++      t->remove(coll, *i);
++    }
++  }
++  if (t) {
++    derr << __func__ << ": queueing trans to clean up obsolete rollback objs"
++	 << dendl;
++    osd->store->queue_transaction_and_cleanup(osr.get(), t);
++  }
++}
++
+ void PG::_scan_snaps(ScrubMap &smap) 
+ {
+   for (map<hobject_t, ScrubMap::object>::iterator i = smap.objects.begin();
+        i != smap.objects.end();
+@@ -3348,15 +3400,23 @@
+   map.valid_through = info.last_update;
+ 
+   // objects
+   vector<hobject_t> ls;
+-  int ret = get_pgbackend()->objects_list_range(start, end, 0, &ls);
++  vector<ghobject_t> rollback_obs;
++  int ret = get_pgbackend()->objects_list_range(
++    start,
++    end,
++    0,
++    &ls,
++    &rollback_obs);
+   if (ret < 0) {
+     dout(5) << "objects_list_range error: " << ret << dendl;
+     return ret;
+   }
+ 
++
+   get_pgbackend()->be_scan_list(map, ls, deep, handle);
++  _scan_rollback_obs(rollback_obs, handle);
+   _scan_snaps(map);
+ 
+   // pg attrs
+   osd->store->collection_getattrs(coll, map.attrs);
+@@ -3577,8 +3637,19 @@
+  */
+ void PG::scrub(ThreadPool::TPHandle &handle)
+ {
+   lock();
++  if (g_conf->osd_scrub_sleep > 0 &&
++      (scrubber.state == PG::Scrubber::NEW_CHUNK ||
++       scrubber.state == PG::Scrubber::INACTIVE)) {
++    dout(20) << __func__ << " state is INACTIVE|NEW_CHUNK, sleeping" << dendl;
++    unlock();
++    utime_t t;
++    t.set_from_double(g_conf->osd_scrub_sleep);
++    t.sleep();
++    lock();
++    dout(20) << __func__ << " slept for " << t << dendl;
++  }
+   if (deleting) {
+     unlock();
+     return;
+   }
+@@ -4630,8 +4701,23 @@
+   on_applied->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
+   on_safe->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
+ }
+ 
++void PG::reset_interval_flush()
++{
++  dout(10) << "Clearing blocked outgoing recovery messages" << dendl;
++  recovery_state.clear_blocked_outgoing();
++  
++  if (!osr->flush_commit(
++      new QueuePeeringEvt<IntervalFlush>(
++	this, get_osdmap()->get_epoch(), IntervalFlush()))) {
++    dout(10) << "Beginning to block outgoing recovery messages" << dendl;
++    recovery_state.begin_block_outgoing();
++  } else {
++    dout(10) << "Not blocking outgoing recovery messages" << dendl;
++  }
++}
++
+ /* Called before initializing peering during advance_map */
+ void PG::start_peering_interval(
+   const OSDMapRef lastmap,
+   const vector<int>& newup, int new_up_primary,
+@@ -4640,8 +4726,9 @@
+ {
+   const OSDMapRef osdmap = get_osdmap();
+ 
+   set_last_peering_reset();
++  reset_interval_flush();
+ 
+   vector<int> oldacting, oldup;
+   int oldrole = get_role();
+ 
+@@ -5049,9 +5136,9 @@
+     return can_discard_replica_op<MOSDPGPull, MSG_OSD_PG_PULL>(op);
+   case MSG_OSD_PG_PUSH_REPLY:
+     return can_discard_replica_op<MOSDPGPushReply, MSG_OSD_PG_PUSH_REPLY>(op);
+   case MSG_OSD_SUBOPREPLY:
+-    return false;
++    return can_discard_replica_op<MOSDSubOpReply, MSG_OSD_SUBOPREPLY>(op);
+ 
+   case MSG_OSD_EC_WRITE:
+     return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op);
+   case MSG_OSD_EC_WRITE_REPLY:
+@@ -5385,8 +5472,17 @@
+   context< RecoveryMachine >().log_enter(state_name);
+ }
+ 
+ boost::statechart::result
++PG::RecoveryState::Started::react(const IntervalFlush&)
++{
++  dout(10) << "Ending blocked outgoing recovery messages" << dendl;
++  context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
++  return discard_event();
++}
++
++
++boost::statechart::result
+ PG::RecoveryState::Started::react(const FlushedEvt&)
+ {
+   PG *pg = context< RecoveryMachine >().pg;
+   pg->on_flushed();
+@@ -5435,8 +5531,9 @@
+     NamedState(context< RecoveryMachine >().pg->cct, "Reset")
+ {
+   context< RecoveryMachine >().log_enter(state_name);
+   PG *pg = context< RecoveryMachine >().pg;
++
+   pg->flushes_in_progress = 0;
+   pg->set_last_peering_reset();
+ }
+ 
+@@ -5447,8 +5544,16 @@
+   pg->on_flushed();
+   return discard_event();
+ }
+ 
++boost::statechart::result
++PG::RecoveryState::Reset::react(const IntervalFlush&)
++{
++  dout(10) << "Ending blocked outgoing recovery messages" << dendl;
++  context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
++  return discard_event();
++}
++
+ boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap)
+ {
+   PG *pg = context< RecoveryMachine >().pg;
+   dout(10) << "Reset advmap" << dendl;
+@@ -5829,8 +5934,20 @@
+ {
+   context< RecoveryMachine >().log_enter(state_name);
+ }
+ 
++boost::statechart::result
++PG::RecoveryState::NotBackfilling::react(const RemoteBackfillReserved &evt)
++{
++  return discard_event();
++}
++
++boost::statechart::result
++PG::RecoveryState::NotBackfilling::react(const RemoteReservationRejected &evt)
++{
++  return discard_event();
++}
++
+ void PG::RecoveryState::NotBackfilling::exit()
+ {
+   context< RecoveryMachine >().log_exit(state_name, enter_time);
+   PG *pg = context< RecoveryMachine >().pg;
+@@ -6587,19 +6704,23 @@
+   PG *pg = context< RecoveryMachine >().pg;
+   MOSDPGLog *msg = logevt.msg.get();
+   dout(10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
+ 
++  ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
+   if (msg->info.last_backfill == hobject_t()) {
+     // restart backfill
+     pg->unreg_next_scrub();
+     pg->info = msg->info;
+     pg->reg_next_scrub();
+     pg->dirty_info = true;
+     pg->dirty_big_info = true;  // maybe.
+-    pg->pg_log.claim_log(msg->log);
++
++    PGLogEntryHandler rollbacker;
++    pg->pg_log.claim_log_and_clear_rollback_info(msg->log, &rollbacker);
++    rollbacker.apply(pg, t);
++
+     pg->pg_log.reset_backfill();
+   } else {
+-    ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
+     pg->merge_log(*t, msg->info, msg->log, logevt.from);
+   }
+ 
+   assert(pg->pg_log.get_head() == pg->info.last_update);
+@@ -7491,20 +7612,53 @@
+ }
+ 
+ void PG::RecoveryState::start_handle(RecoveryCtx *new_ctx) {
+   assert(!rctx);
+-  rctx = new_ctx;
+-  if (rctx)
++  assert(!orig_ctx);
++  orig_ctx = new_ctx;
++  if (new_ctx) {
++    if (messages_pending_flush) {
++      rctx = RecoveryCtx(*messages_pending_flush, *new_ctx);
++    } else {
++      rctx = *new_ctx;
++    }
+     rctx->start_time = ceph_clock_now(pg->cct);
++  }
++}
++
++void PG::RecoveryState::begin_block_outgoing() {
++  assert(!messages_pending_flush);
++  assert(orig_ctx);
++  assert(rctx);
++  messages_pending_flush = BufferedRecoveryMessages();
++  rctx = RecoveryCtx(*messages_pending_flush, *orig_ctx);
++}
++
++void PG::RecoveryState::clear_blocked_outgoing() {
++  assert(orig_ctx);
++  assert(rctx);
++  messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
++}
++
++void PG::RecoveryState::end_block_outgoing() {
++  assert(messages_pending_flush);
++  assert(orig_ctx);
++  assert(rctx);
++
++  rctx = RecoveryCtx(*orig_ctx);
++  rctx->accept_buffered_messages(*messages_pending_flush);
++  messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
+ }
+ 
+ void PG::RecoveryState::end_handle() {
+   if (rctx) {
+     utime_t dur = ceph_clock_now(pg->cct) - rctx->start_time;
+     machine.event_time += dur;
+   }
++
+   machine.event_count++;
+-  rctx = 0;
++  rctx = boost::optional<RecoveryCtx>();
++  orig_ctx = NULL;
+ }
+ 
+ void intrusive_ptr_add_ref(PG *pg) { pg->get("intptr"); }
+ void intrusive_ptr_release(PG *pg) { pg->put("intptr"); }
+--- a/src/osd/PG.h
++++ b/src/osd/PG.h
+@@ -446,8 +446,27 @@
+   eversion_t  last_update_ondisk;    // last_update that has committed; ONLY DEFINED WHEN is_active()
+   eversion_t  last_complete_ondisk;  // last_complete that has committed.
+   eversion_t  last_update_applied;
+ 
++
++  struct C_UpdateLastRollbackInfoTrimmedToApplied : Context {
++    PGRef pg;
++    epoch_t e;
++    eversion_t v;
++    C_UpdateLastRollbackInfoTrimmedToApplied(PG *pg, epoch_t e, eversion_t v)
++      : pg(pg), e(e), v(v) {}
++    void finish(int) {
++      pg->lock();
++      if (!pg->pg_has_reset_since(e)) {
++	pg->last_rollback_info_trimmed_to_applied = v;
++      }
++      pg->unlock();
++    }
++  };
++  // entries <= last_rollback_info_trimmed_to_applied have been trimmed,
++  // and the transaction has applied
++  eversion_t  last_rollback_info_trimmed_to_applied;
++
+   // primary state
+  public:
+   pg_shard_t primary;
+   pg_shard_t pg_whoami;
+@@ -486,8 +505,14 @@
+   bool may_need_replay(const OSDMapRef osdmap) const;
+ 
+ 
+ public:    
++  struct BufferedRecoveryMessages {
++    map<int, map<spg_t, pg_query_t> > query_map;
++    map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > info_map;
++    map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > notify_list;
++  };
++
+   struct RecoveryCtx {
+     utime_t start_time;
+     map<int, map<spg_t, pg_query_t> > *query_map;
+     map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > *info_map;
+@@ -507,8 +532,50 @@
+ 	notify_list(notify_list),
+ 	on_applied(on_applied),
+ 	on_safe(on_safe),
+ 	transaction(transaction) {}
++
++    RecoveryCtx(BufferedRecoveryMessages &buf, RecoveryCtx &rctx)
++      : query_map(&(buf.query_map)),
++	info_map(&(buf.info_map)),
++	notify_list(&(buf.notify_list)),
++	on_applied(rctx.on_applied),
++	on_safe(rctx.on_safe),
++	transaction(rctx.transaction) {}
++
++    void accept_buffered_messages(BufferedRecoveryMessages &m) {
++      assert(query_map);
++      assert(info_map);
++      assert(notify_list);
++      for (map<int, map<spg_t, pg_query_t> >::iterator i = m.query_map.begin();
++	   i != m.query_map.end();
++	   ++i) {
++	map<spg_t, pg_query_t> &omap = (*query_map)[i->first];
++	for (map<spg_t, pg_query_t>::iterator j = i->second.begin();
++	     j != i->second.end();
++	     ++j) {
++	  omap[j->first] = j->second;
++	}
++      }
++      for (map<int, vector<pair<pg_notify_t, pg_interval_map_t> > >::iterator i
++	     = m.info_map.begin();
++	   i != m.info_map.end();
++	   ++i) {
++	vector<pair<pg_notify_t, pg_interval_map_t> > &ovec =
++	  (*info_map)[i->first];
++	ovec.reserve(ovec.size() + i->second.size());
++	ovec.insert(ovec.end(), i->second.begin(), i->second.end());
++      }
++      for (map<int, vector<pair<pg_notify_t, pg_interval_map_t> > >::iterator i
++	     = m.notify_list.begin();
++	   i != m.notify_list.end();
++	   ++i) {
++	vector<pair<pg_notify_t, pg_interval_map_t> > &ovec =
++	  (*notify_list)[i->first];
++	ovec.reserve(ovec.size() + i->second.size());
++	ovec.insert(ovec.end(), i->second.begin(), i->second.end());
++      }
++    }
+   };
+ 
+   struct NamedState {
+     const char *state_name;
+@@ -1107,8 +1174,11 @@
+   void scrub_finish();
+   void scrub_clear_state();
+   bool scrub_gather_replica_maps();
+   void _scan_snaps(ScrubMap &map);
++  void _scan_rollback_obs(
++    const vector<ghobject_t> &rollback_obs,
++    ThreadPool::TPHandle &handle);
+   void _request_scrub_map_classic(pg_shard_t replica, eversion_t version);
+   void _request_scrub_map(pg_shard_t replica, eversion_t version,
+                           hobject_t start, hobject_t end, bool deep);
+   int build_scrub_map_chunk(
+@@ -1332,12 +1402,19 @@
+   TrivialEvent(GoClean)
+ 
+   TrivialEvent(AllReplicasActivated)
+ 
++  TrivialEvent(IntervalFlush)
++
+   /* Encapsulates PG recovery process */
+   class RecoveryState {
+     void start_handle(RecoveryCtx *new_ctx);
+     void end_handle();
++  public:
++    void begin_block_outgoing();
++    void end_block_outgoing();
++    void clear_blocked_outgoing();
++  private:
+ 
+     /* States */
+     struct Initial;
+     class RecoveryMachine : public boost::statechart::state_machine< RecoveryMachine, Initial > {
+@@ -1359,42 +1436,49 @@
+       RecoveryMachine(RecoveryState *state, PG *pg) : state(state), pg(pg), event_count(0) {}
+ 
+       /* Accessor functions for state methods */
+       ObjectStore::Transaction* get_cur_transaction() {
++	assert(state->rctx);
+ 	assert(state->rctx->transaction);
+ 	return state->rctx->transaction;
+       }
+ 
+       void send_query(pg_shard_t to, const pg_query_t &query) {
++	assert(state->rctx);
+ 	assert(state->rctx->query_map);
+ 	(*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] =
+ 	  query;
+       }
+ 
+       map<int, map<spg_t, pg_query_t> > *get_query_map() {
++	assert(state->rctx);
+ 	assert(state->rctx->query_map);
+ 	return state->rctx->query_map;
+       }
+ 
+       map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > *get_info_map() {
++	assert(state->rctx);
+ 	assert(state->rctx->info_map);
+ 	return state->rctx->info_map;
+       }
+ 
+       list< Context* > *get_on_safe_context_list() {
++	assert(state->rctx);
+ 	assert(state->rctx->on_safe);
+ 	return &(state->rctx->on_safe->contexts);
+       }
+ 
+       list< Context * > *get_on_applied_context_list() {
++	assert(state->rctx);
+ 	assert(state->rctx->on_applied);
+ 	return &(state->rctx->on_applied->contexts);
+       }
+ 
+-      RecoveryCtx *get_recovery_ctx() { return state->rctx; }
++      RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); }
+ 
+       void send_notify(pg_shard_t to,
+ 		       const pg_notify_t &info, const pg_interval_map_t &pi) {
++	assert(state->rctx);
+ 	assert(state->rctx->notify_list);
+ 	(*state->rctx->notify_list)[to.osd].push_back(make_pair(info, pi));
+       }
+     };
+@@ -1438,14 +1522,16 @@
+ 	boost::statechart::custom_reaction< AdvMap >,
+ 	boost::statechart::custom_reaction< ActMap >,
+ 	boost::statechart::custom_reaction< NullEvt >,
+ 	boost::statechart::custom_reaction< FlushedEvt >,
++	boost::statechart::custom_reaction< IntervalFlush >,
+ 	boost::statechart::transition< boost::statechart::event_base, Crashed >
+ 	> reactions;
+       boost::statechart::result react(const QueryState& q);
+       boost::statechart::result react(const AdvMap&);
+       boost::statechart::result react(const ActMap&);
+       boost::statechart::result react(const FlushedEvt&);
++      boost::statechart::result react(const IntervalFlush&);
+       boost::statechart::result react(const boost::statechart::event_base&) {
+ 	return discard_event();
+       }
+     };
+@@ -1460,13 +1546,15 @@
+ 	boost::statechart::custom_reaction< QueryState >,
+ 	boost::statechart::custom_reaction< AdvMap >,
+ 	boost::statechart::custom_reaction< NullEvt >,
+ 	boost::statechart::custom_reaction< FlushedEvt >,
++	boost::statechart::custom_reaction< IntervalFlush >,
+ 	boost::statechart::transition< boost::statechart::event_base, Crashed >
+ 	> reactions;
+       boost::statechart::result react(const QueryState& q);
+       boost::statechart::result react(const AdvMap&);
+       boost::statechart::result react(const FlushedEvt&);
++      boost::statechart::result react(const IntervalFlush&);
+       boost::statechart::result react(const boost::statechart::event_base&) {
+ 	return discard_event();
+       }
+     };
+@@ -1634,12 +1722,16 @@
+     };
+ 
+     struct NotBackfilling : boost::statechart::state< NotBackfilling, Active>, NamedState {
+       typedef boost::mpl::list<
+-	boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>
++	boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>,
++	boost::statechart::custom_reaction< RemoteBackfillReserved >,
++	boost::statechart::custom_reaction< RemoteReservationRejected >
+ 	> reactions;
+       NotBackfilling(my_context ctx);
+       void exit();
++      boost::statechart::result react(const RemoteBackfillReserved& evt);
++      boost::statechart::result react(const RemoteReservationRejected& evt);
+     };
+ 
+     struct RepNotRecovering;
+     struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
+@@ -1854,12 +1946,25 @@
+ 
+ 
+     RecoveryMachine machine;
+     PG *pg;
+-    RecoveryCtx *rctx;
++
++    /// context passed in by state machine caller
++    RecoveryCtx *orig_ctx;
++
++    /// populated if we are buffering messages pending a flush
++    boost::optional<BufferedRecoveryMessages> messages_pending_flush;
++
++    /**
++     * populated between start_handle() and end_handle(), points into
++     * the message lists for messages_pending_flush while blocking messages
++     * or into orig_ctx otherwise
++     */
++    boost::optional<RecoveryCtx> rctx;
+ 
+   public:
+-    RecoveryState(PG *pg) : machine(this, pg), pg(pg), rctx(0) {
++    RecoveryState(PG *pg)
++      : machine(this, pg), pg(pg), orig_ctx(0) {
+       machine.initiate();
+     }
+ 
+     void handle_event(const boost::statechart::event_base &evt,
+@@ -1995,9 +2100,12 @@
+   }
+ 
+   void add_log_entry(pg_log_entry_t& e, bufferlist& log_bl);
+   void append_log(
+-    vector<pg_log_entry_t>& logv, eversion_t trim_to, ObjectStore::Transaction &t,
++    vector<pg_log_entry_t>& logv,
++    eversion_t trim_to,
++    eversion_t trim_rollback_to,
++    ObjectStore::Transaction &t,
+     bool transaction_applied = true);
+   bool check_log_for_corruption(ObjectStore *store);
+   void trim_peers();
+ 
+@@ -2025,8 +2133,9 @@
+   void share_pg_info();
+   /// share new pg log entries after a pg is active
+   void share_pg_log();
+ 
++  void reset_interval_flush();
+   void start_peering_interval(
+     const OSDMapRef lastmap,
+     const vector<int>& newup, int up_primary,
+     const vector<int>& newacting, int acting_primary,
+--- a/src/osd/PGBackend.cc
++++ b/src/osd/PGBackend.cc
+@@ -114,9 +114,13 @@
+   vector<hobject_t> *ls,
+   hobject_t *next)
+ {
+   assert(ls);
+-  ghobject_t _next(begin);
++  // Starts with the smallest shard id and generation to
++  // make sure the result list has the marker object (
++  // it might have multiple generations though, which would
++  // be filtered).
++  ghobject_t _next(begin, 0, shard_id_t(0));
+   ls->reserve(max);
+   int r = 0;
+   while (!_next.is_max() && ls->size() < (unsigned)min) {
+     vector<ghobject_t> objects;
+@@ -146,9 +150,10 @@
+ int PGBackend::objects_list_range(
+   const hobject_t &start,
+   const hobject_t &end,
+   snapid_t seq,
+-  vector<hobject_t> *ls)
++  vector<hobject_t> *ls,
++  vector<ghobject_t> *gen_obs)
+ {
+   assert(ls);
+   vector<ghobject_t> objects;
+   int r = store->collection_list_range(
+@@ -162,8 +167,10 @@
+        i != objects.end();
+        ++i) {
+     if (i->is_no_gen()) {
+       ls->push_back(i->hobj);
++    } else if (gen_obs) {
++      gen_obs->push_back(*i);
+     }
+   }
+   return r;
+ }
+--- a/src/osd/PGBackend.h
++++ b/src/osd/PGBackend.h
+@@ -176,8 +176,9 @@
+      virtual void log_operation(
+        vector<pg_log_entry_t> &logv,
+        boost::optional<pg_hit_set_history_t> &hset_history,
+        const eversion_t &trim_to,
++       const eversion_t &trim_rollback_to,
+        bool transaction_applied,
+        ObjectStore::Transaction *t) = 0;
+ 
+      virtual void update_peer_last_complete_ondisk(
+@@ -495,8 +496,9 @@
+      const hobject_t &hoid,               ///< [in] object
+      const eversion_t &at_version,        ///< [in] version
+      PGTransaction *t,                    ///< [in] trans to execute
+      const eversion_t &trim_to,           ///< [in] trim log to here
++     const eversion_t &trim_rollback_to,  ///< [in] trim rollback info to here
+      vector<pg_log_entry_t> &log_entries, ///< [in] log entries for t
+      /// [in] hitset history (if updated with this transaction)
+      boost::optional<pg_hit_set_history_t> &hset_history,
+      Context *on_local_applied_sync,      ///< [in] called when applied locally
+@@ -554,9 +556,10 @@
+    int objects_list_range(
+      const hobject_t &start,
+      const hobject_t &end,
+      snapid_t seq,
+-     vector<hobject_t> *ls);
++     vector<hobject_t> *ls,
++     vector<ghobject_t> *gen_obs=0);
+ 
+    int objects_get_attr(
+      const hobject_t &hoid,
+      const string &attr,
+--- a/src/osd/PGLog.cc
++++ b/src/osd/PGLog.cc
+@@ -23,8 +23,27 @@
+ #define dout_subsys ceph_subsys_osd
+ 
+ //////////////////// PGLog::IndexedLog ////////////////////
+ 
++void PGLog::IndexedLog::advance_rollback_info_trimmed_to(
++  eversion_t to,
++  LogEntryHandler *h)
++{
++  assert(to <= can_rollback_to);
++
++  if (to > rollback_info_trimmed_to)
++    rollback_info_trimmed_to = to;
++
++  while (rollback_info_trimmed_to_riter != log.rbegin()) {
++    --rollback_info_trimmed_to_riter;
++    if (rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to) {
++      ++rollback_info_trimmed_to_riter;
++      break;
++    }
++    h->trim(*rollback_info_trimmed_to_riter);
++  }
++}
++
+ void PGLog::IndexedLog::split_into(
+   pg_t child_pgid,
+   unsigned split_bits,
+   PGLog::IndexedLog *olog)
+@@ -46,11 +65,13 @@
+     }
+     oldlog.erase(i++);
+   }
+ 
++
++  olog->can_rollback_to = can_rollback_to;
++
+   olog->index();
+   index();
+-  olog->can_rollback_to = can_rollback_to;
+ }
+ 
+ void PGLog::IndexedLog::trim(
+   LogEntryHandler *handler,
+@@ -58,22 +79,33 @@
+   set<eversion_t> *trimmed)
+ {
+   if (complete_to != log.end() &&
+       complete_to->version <= s) {
+-    generic_dout(0) << " bad trim to " << s << " when complete_to is " << complete_to->version
++    generic_dout(0) << " bad trim to " << s << " when complete_to is "
++		    << complete_to->version
+ 		    << " on " << *this << dendl;
+   }
+ 
++  if (s > can_rollback_to)
++    can_rollback_to = s;
++  advance_rollback_info_trimmed_to(s, handler);
++
+   while (!log.empty()) {
+     pg_log_entry_t &e = *log.begin();
+     if (e.version > s)
+       break;
+     generic_dout(20) << "trim " << e << dendl;
+     if (trimmed)
+       trimmed->insert(e.version);
+-    handler->trim(e);
++
+     unindex(e);         // remove from index,
+-    log.pop_front();    // from log
++
++    if (e.version == rollback_info_trimmed_to_riter->version) {
++      log.pop_front();
++      rollback_info_trimmed_to_riter = log.rend();
++    } else {
++      log.pop_front();
++    }
+   }
+ 
+   // raise tail?
+   if (tail < s)
+@@ -103,9 +135,9 @@
+ 
+ void PGLog::clear() {
+   divergent_priors.clear();
+   missing.clear();
+-  log.zero();
++  log.clear();
+   log_keys_debug.clear();
+   undirty();
+ }
+ 
+--- a/src/osd/PGLog.h
++++ b/src/osd/PGLog.h
+@@ -61,13 +61,35 @@
+     // recovery pointers
+     list<pg_log_entry_t>::iterator complete_to;  // not inclusive of referenced item
+     version_t last_requested;           // last object requested by primary
+ 
++    //
++  private:
++    /**
++     * rollback_info_trimmed_to_riter points to the first log entry <=
++     * rollback_info_trimmed_to
++     *
++     * It's a reverse_iterator because rend() is a natural representation for
++     * tail, and rbegin() works nicely for head.
++     */
++    list<pg_log_entry_t>::reverse_iterator rollback_info_trimmed_to_riter;
++  public:
++    void advance_rollback_info_trimmed_to(eversion_t to, LogEntryHandler *h);
++
+     /****/
+-    IndexedLog() : last_requested(0) {}
++    IndexedLog() :
++      complete_to(log.end()),
++      last_requested(0),
++      rollback_info_trimmed_to_riter(log.rbegin())
++      {}
++
++    void claim_log_and_clear_rollback_info(const pg_log_t& o) {
++      // we must have already trimmed the old entries
++      assert(rollback_info_trimmed_to == head);
++      assert(rollback_info_trimmed_to_riter == log.rbegin());
+ 
+-    void claim_log(const pg_log_t& o) {
+       log = o.log;
++      rollback_info_trimmed_to = head;
+       head = o.head;
+       tail = o.tail;
+       index();
+     }
+@@ -77,12 +99,22 @@
+       unsigned split_bits,
+       IndexedLog *olog);
+ 
+     void zero() {
++      // we must have already trimmed the old entries
++      assert(rollback_info_trimmed_to == head);
++      assert(rollback_info_trimmed_to_riter == log.rbegin());
++
+       unindex();
+       pg_log_t::clear();
++      rollback_info_trimmed_to_riter = log.rbegin();
+       reset_recovery_pointers();
+     }
++    void clear() {
++      rollback_info_trimmed_to = head;
++      rollback_info_trimmed_to_riter = log.rbegin();
++      zero();
++    }
+     void reset_recovery_pointers() {
+       complete_to = log.end();
+       last_requested = 0;
+     }
+@@ -111,8 +143,13 @@
+ 	  //assert(caller_ops.count(i->reqid) == 0);  // divergent merge_log indexes new before unindexing old
+ 	  caller_ops[i->reqid] = &(*i);
+ 	}
+       }
++
++      rollback_info_trimmed_to_riter = log.rbegin();
++      while (rollback_info_trimmed_to_riter != log.rend() &&
++	     rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to)
++	rollback_info_trimmed_to_riter++;
+     }
+ 
+     void index(pg_log_entry_t& e) {
+       if (objects.count(e.soid) == 0 || 
+@@ -140,8 +177,13 @@
+     // actors
+     void add(pg_log_entry_t& e) {
+       // add to log
+       log.push_back(e);
++
++      // riter previously pointed to the previous entry
++      if (rollback_info_trimmed_to_riter == log.rbegin())
++	++rollback_info_trimmed_to_riter;
++
+       assert(e.version > head);
+       assert(head.version == 0 || e.version.version > head.version);
+       head = e.version;
+ 
+@@ -324,16 +366,35 @@
+     LogEntryHandler *handler,
+     eversion_t trim_to,
+     pg_info_t &info);
+ 
+-  void clear_can_rollback_to() {
++  void trim_rollback_info(
++    eversion_t trim_rollback_to,
++    LogEntryHandler *h) {
++    if (trim_rollback_to > log.can_rollback_to)
++      log.can_rollback_to = trim_rollback_to;
++    log.advance_rollback_info_trimmed_to(
++      trim_rollback_to,
++      h);
++  }
++
++  eversion_t get_rollback_trimmed_to() const {
++    return log.rollback_info_trimmed_to;
++  }
++
++  void clear_can_rollback_to(LogEntryHandler *h) {
+     log.can_rollback_to = log.head;
++    log.advance_rollback_info_trimmed_to(
++      log.head,
++      h);
+   }
+ 
+   //////////////////// get or set log & missing ////////////////////
+ 
+-  void claim_log(const pg_log_t &o) {
+-    log.claim_log(o);
++  void claim_log_and_clear_rollback_info(const pg_log_t &o, LogEntryHandler *h) {
++    log.can_rollback_to = log.head;
++    log.advance_rollback_info_trimmed_to(log.head, h);
++    log.claim_log_and_clear_rollback_info(o);
+     missing.clear();
+     mark_dirty_to(eversion_t::max());
+   }
+ 
+--- a/src/osd/ReplicatedBackend.cc
++++ b/src/osd/ReplicatedBackend.cc
+@@ -493,8 +493,9 @@
+   const hobject_t &soid,
+   const eversion_t &at_version,
+   PGTransaction *_t,
+   const eversion_t &trim_to,
++  const eversion_t &trim_rollback_to,
+   vector<pg_log_entry_t> &log_entries,
+   boost::optional<pg_hit_set_history_t> &hset_history,
+   Context *on_local_applied_sync,
+   Context *on_all_acked,
+@@ -533,8 +534,9 @@
+     at_version,
+     tid,
+     reqid,
+     trim_to,
++    trim_rollback_to,
+     t->get_temp_added().size() ? *(t->get_temp_added().begin()) : hobject_t(),
+     t->get_temp_cleared().size() ?
+       *(t->get_temp_cleared().begin()) :hobject_t(),
+     log_entries,
+@@ -548,9 +550,15 @@
+     add_temp_objs(t->get_temp_added());
+   }
+   clear_temp_objs(t->get_temp_cleared());
+ 
+-  parent->log_operation(log_entries, hset_history, trim_to, true, &local_t);
++  parent->log_operation(
++    log_entries,
++    hset_history,
++    trim_to,
++    trim_rollback_to,
++    true,
++    &local_t);
+   local_t.append(*op_t);
+   local_t.swap(*op_t);
+   
+   op_t->register_on_applied_sync(on_local_applied_sync);
+--- a/src/osd/ReplicatedBackend.h
++++ b/src/osd/ReplicatedBackend.h
+@@ -341,8 +341,9 @@
+     const hobject_t &hoid,
+     const eversion_t &at_version,
+     PGTransaction *t,
+     const eversion_t &trim_to,
++    const eversion_t &trim_rollback_to,
+     vector<pg_log_entry_t> &log_entries,
+     boost::optional<pg_hit_set_history_t> &hset_history,
+     Context *on_local_applied_sync,
+     Context *on_all_applied,
+@@ -358,8 +359,9 @@
+     const eversion_t &at_version,
+     ceph_tid_t tid,
+     osd_reqid_t reqid,
+     eversion_t pg_trim_to,
++    eversion_t pg_trim_rollback_to,
+     hobject_t new_temp_oid,
+     hobject_t discard_temp_oid,
+     vector<pg_log_entry_t> &log_entries,
+     boost::optional<pg_hit_set_history_t> &hset_history,
+--- a/src/osd/ReplicatedPG.cc
++++ b/src/osd/ReplicatedPG.cc
+@@ -1119,8 +1119,14 @@
+       dout(20) << " replay, waiting for active on " << op << dendl;
+       waiting_for_active.push_back(op);
+       return;
+     }
++    // verify client features
++    if ((pool.info.has_tiers() || pool.info.is_tier()) &&
++	!op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
++      osd->reply_op_error(op, -EOPNOTSUPP);
++      return;
++    }
+     do_op(op); // do it now
+     break;
+ 
+   case MSG_OSD_SUBOP:
+@@ -1351,11 +1357,12 @@
+     if (hit_set->is_full() ||
+ 	hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
+       hit_set_persist();
+     }
++  }
+ 
+-    if (agent_state)
+-      agent_choose_mode();
++  if (agent_state) {
++    agent_choose_mode();
+   }
+ 
+   if ((m->get_flags() & CEPH_OSD_FLAG_IGNORE_CACHE) == 0 &&
+       maybe_handle_cache(op, write_ordered, obc, r, missing_oid, false))
+@@ -4853,10 +4860,11 @@
+       ctx->clone_obc->ssc->ref++;
+       if (pool.info.require_rollback())
+ 	ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
+       snap_oi = &ctx->clone_obc->obs.oi;
+-      bool got = ctx->clone_obc->get_write(ctx->op);
++      bool got = ctx->clone_obc->get_write_greedy(ctx->op);
+       assert(got);
++      dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
+     } else {
+       snap_oi = &static_snap_oi;
+     }
+     snap_oi->version = ctx->at_version;
+@@ -5159,10 +5167,11 @@
+ 	                                eversion_t(),
+ 					0, osd_reqid_t(), ctx->mtime));
+ 
+       ctx->snapset_obc = get_object_context(snapoid, true);
+-      bool got = ctx->snapset_obc->get_write(ctx->op);
++      bool got = ctx->snapset_obc->get_write_greedy(ctx->op);
+       assert(got);
++      dout(20) << " got greedy write on snapset_obc " << *ctx->snapset_obc << dendl;
+       ctx->release_snapset_obc = true;
+       if (pool.info.require_rollback() && !ctx->snapset_obc->obs.exists) {
+ 	ctx->log.back().mod_desc.create();
+       } else if (!pool.info.require_rollback()) {
+@@ -6025,8 +6034,13 @@
+   kick_object_context_blocked(cop->obc);
+   cop->results.should_requeue = requeue;
+   CopyCallbackResults result(-ECANCELED, &cop->results);
+   cop->cb->complete(result);
++
++  // There may still be an objecter callback referencing this copy op.
++  // That callback will not need the obc since it's been canceled, and
++  // we need the obc reference to go away prior to flush.
++  cop->obc = ObjectContextRef();
+ }
+ 
+ void ReplicatedPG::cancel_copy_ops(bool requeue)
+ {
+@@ -6441,9 +6455,9 @@
+ }
+ 
+ bool ReplicatedPG::is_present_clone(hobject_t coid)
+ {
+-  if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
++  if (!pool.info.allow_incomplete_clones())
+     return true;
+   if (is_missing_object(coid))
+     return true;
+   ObjectContextRef obc = get_object_context(coid, false);
+@@ -6734,8 +6748,9 @@
+     soid,
+     repop->ctx->at_version,
+     repop->ctx->op_t,
+     pg_trim_to,
++    min_last_complete_ondisk,
+     repop->ctx->log,
+     repop->ctx->updated_hset_history,
+     onapplied_sync,
+     on_all_applied,
+@@ -6751,8 +6766,9 @@
+   const eversion_t &at_version,
+   ceph_tid_t tid,
+   osd_reqid_t reqid,
+   eversion_t pg_trim_to,
++  eversion_t pg_trim_rollback_to,
+   hobject_t new_temp_oid,
+   hobject_t discard_temp_oid,
+   vector<pg_log_entry_t> &log_entries,
+   boost::optional<pg_hit_set_history_t> &hset_hist,
+@@ -6806,8 +6822,9 @@
+     else
+       wr->pg_stats = get_info().stats;
+     
+     wr->pg_trim_to = pg_trim_to;
++    wr->pg_trim_rollback_to = pg_trim_rollback_to;
+ 
+     wr->new_temp_oid = new_temp_oid;
+     wr->discard_temp_oid = discard_temp_oid;
+     wr->updated_hit_set_history = hset_hist;
+@@ -6840,8 +6857,14 @@
+  
+ void ReplicatedPG::remove_repop(RepGather *repop)
+ {
+   dout(20) << __func__ << " " << *repop << dendl;
++  if (repop->ctx->obc)
++    dout(20) << " obc " << *repop->ctx->obc << dendl;
++  if (repop->ctx->clone_obc)
++    dout(20) << " clone_obc " << *repop->ctx->clone_obc << dendl;
++  if (repop->ctx->snapset_obc)
++    dout(20) << " snapset_obc " << *repop->ctx->snapset_obc << dendl;
+   release_op_ctx_locks(repop->ctx);
+   repop->ctx->finish(0);  // FIXME: return value here is sloppy
+   repop_map.erase(repop->rep_tid);
+   repop->put();
+@@ -7606,8 +7629,9 @@
+     parent->log_operation(
+       log,
+       m->updated_hit_set_history,
+       m->pg_trim_to,
++      m->pg_trim_rollback_to,
+       update_snaps,
+       &(rm->localt));
+       
+     rm->bytes_written = rm->opt.get_encoded_bytes();
+@@ -7701,10 +7725,10 @@
+   uint64_t size = obc->obs.oi.size;
+   if (size)
+     data_subset.insert(0, size);
+ 
+-  if (get_parent()->get_pool().cache_mode != pg_pool_t::CACHEMODE_NONE) {
+-    dout(10) << __func__ << ": caching enabled, skipping clone subsets" << dendl;
++  if (get_parent()->get_pool().allow_incomplete_clones()) {
++    dout(10) << __func__ << ": caching (was) enabled, skipping clone subsets" << dendl;
+     return;
+   }
+ 
+   if (!cct->_conf->osd_recover_clone_overlap) {
+@@ -7761,10 +7785,10 @@
+   uint64_t size = snapset.clone_size[soid.snap];
+   if (size)
+     data_subset.insert(0, size);
+ 
+-  if (get_parent()->get_pool().cache_mode != pg_pool_t::CACHEMODE_NONE) {
+-    dout(10) << __func__ << ": caching enabled, skipping clone subsets" << dendl;
++  if (get_parent()->get_pool().allow_incomplete_clones()) {
++    dout(10) << __func__ << ": caching (was) enabled, skipping clone subsets" << dendl;
+     return;
+   }
+ 
+   if (!cct->_conf->osd_recover_clone_overlap) {
+@@ -9464,8 +9488,19 @@
+ 
+ void ReplicatedPG::on_pool_change()
+ {
+   dout(10) << __func__ << dendl;
++  // requeue cache full waiters just in case the cache_mode is
++  // changing away from writeback mode.  note that if we are not
++  // active the normal requeuing machinery is sufficient (and properly
++  // ordered).
++  if (is_active() &&
++      pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
++      !waiting_for_cache_not_full.empty()) {
++    dout(10) << __func__ << " requeuing full waiters (not in writeback) "
++	     << dendl;
++    requeue_ops(waiting_for_cache_not_full);
++  }
+   hit_set_setup();
+   agent_setup();
+ }
+ 
+@@ -11288,9 +11323,10 @@
+       return false;
+     }
+   }
+ 
+-  if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
++  if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL &&
++      hit_set) {
+     // is this object old and/or cold enough?
+     int atime = -1, temp = 0;
+     agent_estimate_atime_temp(soid, &atime, NULL /*FIXME &temp*/);
+ 
+@@ -11420,9 +11456,13 @@
+     else
+       num_dirty = 0;
+   }
+ 
+-  dout(10) << __func__ << ": "
++  dout(10) << __func__
++	   << " flush_mode: "
++	   << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
++	   << " evict_mode: "
++	   << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
+ 	   << " num_objects: " << info.stats.stats.sum.num_objects
+ 	   << " num_bytes: " << info.stats.stats.sum.num_bytes
+ 	   << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
+ 	   << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
+@@ -11434,9 +11474,9 @@
+ 
+   // get dirty, full ratios
+   uint64_t dirty_micro = 0;
+   uint64_t full_micro = 0;
+-  if (pool.info.target_max_bytes && info.stats.stats.sum.num_objects) {
++  if (pool.info.target_max_bytes && info.stats.stats.sum.num_objects > 0) {
+     uint64_t avg_size = info.stats.stats.sum.num_bytes /
+       info.stats.stats.sum.num_objects;
+     dirty_micro =
+       num_dirty * avg_size * 1000000 /
+@@ -11444,9 +11484,9 @@
+     full_micro =
+       num_user_objects * avg_size * 1000000 /
+       MAX(pool.info.target_max_bytes / divisor, 1);
+   }
+-  if (pool.info.target_max_objects) {
++  if (pool.info.target_max_objects > 0) {
+     uint64_t dirty_objects_micro =
+       num_dirty * 1000000 /
+       MAX(pool.info.target_max_objects / divisor, 1);
+     if (dirty_objects_micro > dirty_micro)
+@@ -11530,10 +11570,12 @@
+ 	    << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
+ 	    << " -> "
+ 	    << TierAgentState::get_evict_mode_name(evict_mode)
+ 	    << dendl;
+-    if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
++    if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
++	is_active()) {
+       requeue_ops(waiting_for_cache_not_full);
++      requeue_ops(waiting_for_active);
+     }
+     agent_state->evict_mode = evict_mode;
+   }
+   uint64_t old_effort = agent_state->evict_effort;
+@@ -11659,9 +11701,9 @@
+       ::decode(snapset, blp);
+ 
+       // did we finish the last oid?
+       if (head != hobject_t() &&
+-	  pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
++	  !pool.info.allow_incomplete_clones()) {
+ 	osd->clog.error() << mode << " " << info.pgid << " " << head
+ 			  << " missing clones";
+         ++scrubber.shallow_errors;
+       }
+@@ -11720,9 +11762,9 @@
+     //assert(data.length() == p->size);
+     //
+ 
+     if (!next_clone.is_min() && next_clone != soid &&
+-	pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE) {
++	pool.info.allow_incomplete_clones()) {
+       // it is okay to be missing one or more clones in a cache tier.
+       // skip higher-numbered clones in the list.
+       while (curclone != snapset.clones.rend() &&
+ 	     soid.snap < *curclone)
+@@ -11808,9 +11850,9 @@
+     scrub_cstat.add(stat, cat);
+   }
+ 
+   if (!next_clone.is_min() &&
+-      pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
++      !pool.info.allow_incomplete_clones()) {
+     osd->clog.error() << mode << " " << info.pgid
+ 		      << " expected clone " << next_clone;
+     ++scrubber.shallow_errors;
+   }
+--- a/src/osd/ReplicatedPG.h
++++ b/src/osd/ReplicatedPG.h
+@@ -346,15 +346,16 @@
+   void log_operation(
+     vector<pg_log_entry_t> &logv,
+     boost::optional<pg_hit_set_history_t> &hset_history,
+     const eversion_t &trim_to,
++    const eversion_t &trim_rollback_to,
+     bool transaction_applied,
+     ObjectStore::Transaction *t) {
+     if (hset_history) {
+       info.hit_set = *hset_history;
+       dirty_info = true;
+     }
+-    append_log(logv, trim_to, *t, transaction_applied);
++    append_log(logv, trim_to, trim_rollback_to, *t, transaction_applied);
+   }
+ 
+   void op_applied(
+     const eversion_t &applied_version);
+--- a/src/osd/osd_types.cc
++++ b/src/osd/osd_types.cc
+@@ -2101,10 +2101,10 @@
+ void pg_notify_t::dump(Formatter *f) const
+ {
+   f->dump_int("from", from);
+   f->dump_int("to", to);
+-  f->dump_stream("query_epoch") << query_epoch;
+-  f->dump_stream("epoch_sent") << epoch_sent;
++  f->dump_unsigned("query_epoch", query_epoch);
++  f->dump_unsigned("epoch_sent", epoch_sent);
+   {
+     f->open_object_section("info");
+     info.dump(f);
+     f->close_section();
+@@ -2460,10 +2460,10 @@
+ 
+ void ObjectModDesc::dump(Formatter *f) const
+ {
+   f->open_object_section("object_mod_desc");
+-  f->dump_stream("can_local_rollback") << can_local_rollback;
+-  f->dump_stream("stashed") << stashed;
++  f->dump_bool("can_local_rollback", can_local_rollback);
++  f->dump_bool("rollback_info_completed", rollback_info_completed);
+   {
+     f->open_array_section("ops");
+     DumpVisitor vis(f);
+     visit(&vis);
+@@ -2496,17 +2496,17 @@
+ void ObjectModDesc::encode(bufferlist &_bl) const
+ {
+   ENCODE_START(1, 1, _bl);
+   ::encode(can_local_rollback, _bl);
+-  ::encode(stashed, _bl);
++  ::encode(rollback_info_completed, _bl);
+   ::encode(bl, _bl);
+   ENCODE_FINISH(_bl);
+ }
+ void ObjectModDesc::decode(bufferlist::iterator &_bl)
+ {
+   DECODE_START(1, _bl);
+   ::decode(can_local_rollback, _bl);
+-  ::decode(stashed, _bl);
++  ::decode(rollback_info_completed, _bl);
+   ::decode(bl, _bl);
+   DECODE_FINISH(_bl);
+ }
+ 
+@@ -2679,19 +2679,20 @@
+ // -- pg_log_t --
+ 
+ void pg_log_t::encode(bufferlist& bl) const
+ {
+-  ENCODE_START(5, 3, bl);
++  ENCODE_START(6, 3, bl);
+   ::encode(head, bl);
+   ::encode(tail, bl);
+   ::encode(log, bl);
+   ::encode(can_rollback_to, bl);
++  ::encode(rollback_info_trimmed_to, bl);
+   ENCODE_FINISH(bl);
+ }
+  
+ void pg_log_t::decode(bufferlist::iterator &bl, int64_t pool)
+ {
+-  DECODE_START_LEGACY_COMPAT_LEN(5, 3, 3, bl);
++  DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl);
+   ::decode(head, bl);
+   ::decode(tail, bl);
+   if (struct_v < 2) {
+     bool backlog;
+@@ -2699,8 +2700,13 @@
+   }
+   ::decode(log, bl);
+   if (struct_v >= 5)
+     ::decode(can_rollback_to, bl);
++
++  if (struct_v >= 6)
++    ::decode(rollback_info_trimmed_to, bl);
++  else
++    rollback_info_trimmed_to = tail;
+   DECODE_FINISH(bl);
+ 
+   // handle hobject_t format change
+   if (struct_v < 4) {
+--- a/src/osd/osd_types.h
++++ b/src/osd/osd_types.h
+@@ -810,18 +810,20 @@
+     return "replicated";
+   }
+ 
+   enum {
+-    FLAG_HASHPSPOOL = 1, // hash pg seed and pool together (instead of adding)
+-    FLAG_FULL       = 2, // pool is full
++    FLAG_HASHPSPOOL = 1<<0, // hash pg seed and pool together (instead of adding)
++    FLAG_FULL       = 1<<1, // pool is full
+     FLAG_DEBUG_FAKE_EC_POOL = 1<<2, // require ReplicatedPG to act like an EC pg
++    FLAG_INCOMPLETE_CLONES = 1<<3, // may have incomplete clones (bc we are/were an overlay)
+   };
+ 
+   static const char *get_flag_name(int f) {
+     switch (f) {
+     case FLAG_HASHPSPOOL: return "hashpspool";
+     case FLAG_FULL: return "full";
+     case FLAG_DEBUG_FAKE_EC_POOL: return "require_local_rollback";
++    case FLAG_INCOMPLETE_CLONES: return "incomplete_clones";
+     default: return "???";
+     }
+   }
+   static string get_flags_string(uint64_t f) {
+@@ -867,8 +869,20 @@
+   }
+   const char *get_cache_mode_name() const {
+     return get_cache_mode_name(cache_mode);
+   }
++  bool cache_mode_requires_hit_set() const {
++    switch (cache_mode) {
++    case CACHEMODE_NONE:
++    case CACHEMODE_FORWARD:
++    case CACHEMODE_READONLY:
++      return false;
++    case CACHEMODE_WRITEBACK:
++      return true;
++    default:
++      assert(0 == "implement me");
++    }
++  }
+ 
+   uint64_t flags;           ///< FLAG_*
+   __u8 type;                ///< TYPE_*
+   __u8 size, min_size;      ///< number of osds in each pg
+@@ -915,13 +929,31 @@
+   cache_mode_t cache_mode;  ///< cache pool mode
+ 
+   bool is_tier() const { return tier_of >= 0; }
+   bool has_tiers() const { return !tiers.empty(); }
+-  void clear_tier() { tier_of = -1; }
++  void clear_tier() {
++    tier_of = -1;
++    clear_read_tier();
++    clear_write_tier();
++    clear_tier_tunables();
++  }
+   bool has_read_tier() const { return read_tier >= 0; }
+   void clear_read_tier() { read_tier = -1; }
+   bool has_write_tier() const { return write_tier >= 0; }
+   void clear_write_tier() { write_tier = -1; }
++  void clear_tier_tunables() {
++    if (cache_mode != CACHEMODE_NONE)
++      flags |= FLAG_INCOMPLETE_CLONES;
++    cache_mode = CACHEMODE_NONE;
++
++    target_max_bytes = 0;
++    target_max_objects = 0;
++    cache_target_dirty_ratio_micro = 0;
++    cache_target_full_ratio_micro = 0;
++    hit_set_params = HitSet::Params();
++    hit_set_period = 0;
++    hit_set_count = 0;
++  }
+ 
+   uint64_t target_max_bytes;   ///< tiering: target max pool size
+   uint64_t target_max_objects; ///< tiering: target max pool size
+ 
+@@ -963,8 +995,9 @@
+ 
+   void dump(Formatter *f) const;
+ 
+   uint64_t get_flags() const { return flags; }
++  bool has_flag(uint64_t f) const { return flags & f; }
+ 
+   /// This method will later return true for ec pools as well
+   bool ec_pool() const {
+     return type == TYPE_ERASURE;
+@@ -972,8 +1005,13 @@
+   bool require_rollback() const {
+     return ec_pool() || flags & FLAG_DEBUG_FAKE_EC_POOL;
+   }
+ 
++  /// true if incomplete clones may be present
++  bool allow_incomplete_clones() const {
++    return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES);
++  }
++
+   unsigned get_type() const { return type; }
+   unsigned get_size() const { return size; }
+   unsigned get_min_size() const { return min_size; }
+   int get_crush_ruleset() const { return crush_ruleset; }
+@@ -1810,9 +1848,9 @@
+ 
+ class PGBackend;
+ class ObjectModDesc {
+   bool can_local_rollback;
+-  bool stashed;
++  bool rollback_info_completed;
+ public:
+   class Visitor {
+   public:
+     virtual void append(uint64_t old_offset) {}
+@@ -1830,75 +1868,76 @@
+     DELETE = 3,
+     CREATE = 4,
+     UPDATE_SNAPS = 5
+   };
+-  ObjectModDesc() : can_local_rollback(true), stashed(false) {}
++  ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) {}
+   void claim(ObjectModDesc &other) {
+     bl.clear();
+     bl.claim(other.bl);
+     can_local_rollback = other.can_local_rollback;
+-    stashed = other.stashed;
++    rollback_info_completed = other.rollback_info_completed;
+   }
+   void claim_append(ObjectModDesc &other) {
+-    if (!can_local_rollback || stashed)
++    if (!can_local_rollback || rollback_info_completed)
+       return;
+     if (!other.can_local_rollback) {
+       mark_unrollbackable();
+       return;
+     }
+     bl.claim_append(other.bl);
+-    stashed = other.stashed;
++    rollback_info_completed = other.rollback_info_completed;
+   }
+   void swap(ObjectModDesc &other) {
+     bl.swap(other.bl);
+ 
+     bool temp = other.can_local_rollback;
+     other.can_local_rollback = can_local_rollback;
+     can_local_rollback = temp;
+ 
+-    temp = other.stashed;
+-    other.stashed = stashed;
+-    stashed = temp;
++    temp = other.rollback_info_completed;
++    other.rollback_info_completed = rollback_info_completed;
++    rollback_info_completed = temp;
+   }
+   void append_id(ModID id) {
+     uint8_t _id(id);
+     ::encode(_id, bl);
+   }
+   void append(uint64_t old_size) {
+-    if (!can_local_rollback || stashed)
++    if (!can_local_rollback || rollback_info_completed)
+       return;
+     ENCODE_START(1, 1, bl);
+     append_id(APPEND);
+     ::encode(old_size, bl);
+     ENCODE_FINISH(bl);
+   }
+   void setattrs(map<string, boost::optional<bufferlist> > &old_attrs) {
+-    if (!can_local_rollback || stashed)
++    if (!can_local_rollback || rollback_info_completed)
+       return;
+     ENCODE_START(1, 1, bl);
+     append_id(SETATTRS);
+     ::encode(old_attrs, bl);
+     ENCODE_FINISH(bl);
+   }
+   bool rmobject(version_t deletion_version) {
+-    if (!can_local_rollback || stashed)
++    if (!can_local_rollback || rollback_info_completed)
+       return false;
+     ENCODE_START(1, 1, bl);
+     append_id(DELETE);
+     ::encode(deletion_version, bl);
+     ENCODE_FINISH(bl);
+-    stashed = true;
++    rollback_info_completed = true;
+     return true;
+   }
+   void create() {
+-    if (!can_local_rollback || stashed)
++    if (!can_local_rollback || rollback_info_completed)
+       return;
++    rollback_info_completed = true;
+     ENCODE_START(1, 1, bl);
+     append_id(CREATE);
+     ENCODE_FINISH(bl);
+   }
+   void update_snaps(set<snapid_t> &old_snaps) {
+-    if (!can_local_rollback || stashed)
++    if (!can_local_rollback || rollback_info_completed)
+       return;
+     ENCODE_START(1, 1, bl);
+     append_id(UPDATE_SNAPS);
+     ::encode(old_snaps, bl);
+@@ -2060,8 +2099,12 @@
+ 
+   // We can rollback rollback-able entries > can_rollback_to
+   eversion_t can_rollback_to;
+ 
++  // always <= can_rollback_to, indicates how far stashed rollback
++  // data can be found
++  eversion_t rollback_info_trimmed_to;
++
+   list<pg_log_entry_t> log;  // the actual log.
+   
+   pg_log_t() {}
+ 
+@@ -2761,21 +2804,23 @@
+ 	return false;
+       }
+     }
+ 
+-    bool get_write(OpRequestRef op) {
+-      if (get_write_lock()) {
++    bool get_write(OpRequestRef op, bool greedy=false) {
++      if (get_write_lock(greedy)) {
+ 	return true;
+       } // else
+       if (op)
+ 	waiters.push_back(op);
+       return false;
+     }
+-    bool get_write_lock() {
+-      // don't starve anybody!
+-      if (!waiters.empty() ||
+-	  backfill_read_marker) {
+-	return false;
++    bool get_write_lock(bool greedy=false) {
++      if (!greedy) {
++	// don't starve anybody!
++	if (!waiters.empty() ||
++	    backfill_read_marker) {
++	  return false;
++	}
+       }
+       switch (state) {
+       case RWNONE:
+ 	assert(count == 0);
+@@ -2822,9 +2867,12 @@
+   bool get_read(OpRequestRef op) {
+     return rwstate.get_read(op);
+   }
+   bool get_write(OpRequestRef op) {
+-    return rwstate.get_write(op);
++    return rwstate.get_write(op, false);
++  }
++  bool get_write_greedy(OpRequestRef op) {
++    return rwstate.get_write(op, true);
+   }
+   bool get_snaptrimmer_write() {
+     if (rwstate.get_write_lock()) {
+       return true;
+--- a/src/osdc/Objecter.cc
++++ b/src/osdc/Objecter.cc
+@@ -1363,8 +1363,13 @@
+   }
+ 
+   ldout(cct, 10) << __func__ << " tid " << tid << dendl;
+   Op *op = p->second;
++  if (op->con) {
++    ldout(cct, 20) << " revoking rx buffer for " << tid
++		   << " on " << op->con << dendl;
++    op->con->revoke_rx_buffer(tid);
++  }
+   if (op->onack) {
+     op->onack->complete(r);
+     op->onack = NULL;
+   }
+@@ -1433,9 +1438,9 @@
+     return -ENOENT;
+   return p->raw_hash_to_pg(p->hash_key(key, ns));
+ }
+ 
+-int Objecter::calc_target(op_target_t *t)
++int Objecter::calc_target(op_target_t *t, bool any_change)
+ {
+   bool is_read = t->flags & CEPH_OSD_FLAG_READ;
+   bool is_write = t->flags & CEPH_OSD_FLAG_WRITE;
+ 
+@@ -1490,9 +1495,10 @@
+     need_resend = true;
+   }
+ 
+   if (t->pgid != pgid ||
+-      is_pg_changed(t->primary, t->acting, primary, acting, t->used_replica) ||
++      is_pg_changed(
++	t->primary, t->acting, primary, acting, t->used_replica || any_change) ||
+       force_resend) {
+     t->pgid = pgid;
+     t->acting = acting;
+     t->primary = primary;
+@@ -1569,9 +1575,9 @@
+ }
+ 
+ bool Objecter::recalc_linger_op_target(LingerOp *linger_op)
+ {
+-  int r = calc_target(&linger_op->target);
++  int r = calc_target(&linger_op->target, true);
+   if (r == RECALC_OP_TARGET_NEED_RESEND) {
+     ldout(cct, 10) << "recalc_linger_op_target tid " << linger_op->linger_id
+ 		   << " pgid " << linger_op->target.pgid
+ 		   << " acting " << linger_op->target.acting << dendl;
+--- a/src/osdc/Objecter.h
++++ b/src/osdc/Objecter.h
+@@ -1479,9 +1479,9 @@
+   };
+   bool osdmap_full_flag() const;
+   bool target_should_be_paused(op_target_t *op);
+ 
+-  int calc_target(op_target_t *t);
++  int calc_target(op_target_t *t, bool any_change=false);
+   int recalc_op_target(Op *op);
+   bool recalc_linger_op_target(LingerOp *op);
+ 
+   void send_linger(LingerOp *info);
+--- a/src/pybind/rbd.py
++++ b/src/pybind/rbd.py
+@@ -749,8 +749,16 @@
+         ret = self.librbd.rbd_flush(self.image)
+         if ret < 0:
+             raise make_ex(ret, 'error flushing image')
+ 
++    def invalidate_cache(self):
++        """
++        Drop any cached data for the image.
++        """
++        ret = self.librbd.rbd_invalidate_cache(self.image)
++        if ret < 0:
++            raise make_ex(ret, 'error invalidating cache')
++
+     def stripe_unit(self):
+         """
+         Returns the stripe unit used for the image.
+         """
+--- a/src/rgw/rgw_common.cc
++++ b/src/rgw/rgw_common.cc
+@@ -696,15 +696,17 @@
+   char dest[src_str.size() + 1];
+   int pos = 0;
+   char c;
+ 
++  bool in_query = false;
+   while (*src) {
+     if (*src != '%') {
+-      if (*src != '+') {
+-	dest[pos++] = *src++;
++      if (!in_query || *src != '+') {
++        if (*src == '?') in_query = true;
++        dest[pos++] = *src++;
+       } else {
+-	dest[pos++] = ' ';
+-	++src;
++        dest[pos++] = ' ';
++        ++src;
+       }
+     } else {
+       src++;
+       if (!*src)
+--- a/src/rgw/rgw_op.cc
++++ b/src/rgw/rgw_op.cc
+@@ -1379,9 +1379,12 @@
+ };
+ 
+ int RGWPutObjProcessor_Multipart::prepare(RGWRados *store, void *obj_ctx, string *oid_rand)
+ {
+-  RGWPutObjProcessor::prepare(store, obj_ctx, NULL);
++  int r = prepare_init(store, obj_ctx, NULL);
++  if (r < 0) {
++    return r;
++  }
+ 
+   string oid = obj_str;
+   upload_id = s->info.args.get("uploadId");
+   if (!oid_rand) {
+@@ -1418,9 +1421,9 @@
+   manifest.set_prefix(upload_prefix);
+ 
+   manifest.set_multipart_part_rule(store->ctx()->_conf->rgw_obj_stripe_size, num);
+ 
+-  int r = manifest_gen.create_begin(store->ctx(), &manifest, bucket, target_obj);
++  r = manifest_gen.create_begin(store->ctx(), &manifest, bucket, target_obj);
+   if (r < 0) {
+     return r;
+   }
+ 
+@@ -1559,8 +1562,38 @@
+ 
+   return 0;
+ }
+ 
++static int put_data_and_throttle(RGWPutObjProcessor *processor, bufferlist& data, off_t ofs,
++                                 MD5 *hash, bool need_to_wait)
++{
++  const unsigned char *data_ptr = (hash ? (const unsigned char *)data.c_str() : NULL);
++  bool again;
++  uint64_t len = data.length();
++
++  do {
++    void *handle;
++
++    int ret = processor->handle_data(data, ofs, &handle, &again);
++    if (ret < 0)
++      return ret;
++
++    if (hash) {
++      hash->Update(data_ptr, len);
++      hash = NULL; /* only calculate hash once */
++    }
++
++    ret = processor->throttle_data(handle, need_to_wait);
++    if (ret < 0)
++      return ret;
++
++    need_to_wait = false; /* the need to wait only applies to the first iteration */
++  } while (again);
++
++  return 0;
++}
++
++
+ void RGWPutObj::execute()
+ {
+   RGWPutObjProcessor *processor = NULL;
+   char supplied_md5_bin[CEPH_CRYPTO_MD5_DIGESTSIZE + 1];
+@@ -1632,25 +1665,14 @@
+     }
+     if (!len)
+       break;
+ 
+-    void *handle;
+-    const unsigned char *data_ptr = (const unsigned char *)data.c_str();
+-
+-    ret = processor->handle_data(data, ofs, &handle);
+-    if (ret < 0)
+-      goto done;
+-
+-    if (need_calc_md5) {
+-      hash.Update(data_ptr, len);
+-    }
+-
+     /* do we need this operation to be synchronous? if we're dealing with an object with immutable
+      * head, e.g., multipart object we need to make sure we're the first one writing to this object
+      */
+     bool need_to_wait = (ofs == 0) && multipart;
+ 
+-    ret = processor->throttle_data(handle, need_to_wait);
++    ret = put_data_and_throttle(processor, data, ofs, (need_calc_md5 ? &hash : NULL), need_to_wait);
+     if (ret < 0) {
+       if (!need_to_wait || ret != -EEXIST) {
+         ldout(s->cct, 20) << "processor->thottle_data() returned ret=" << ret << dendl;
+         goto done;
+@@ -1673,17 +1695,10 @@
+         ldout(s->cct, 0) << "ERROR: processor->prepare() returned " << ret << dendl;
+         goto done;
+       }
+ 
+-      ret = processor->handle_data(data, ofs, &handle);
++      ret = put_data_and_throttle(processor, data, ofs, NULL, false);
+       if (ret < 0) {
+-        ldout(s->cct, 0) << "ERROR: processor->handle_data() returned " << ret << dendl;
+-        goto done;
+-      }
+-
+-      ret = processor->throttle_data(handle, false);
+-      if (ret < 0) {
+-        ldout(s->cct, 0) << "ERROR: processor->throttle_data() returned " << ret << dendl;
+         goto done;
+       }
+     }
+ 
+@@ -1845,20 +1860,9 @@
+ 
+      if (!len)
+        break;
+ 
+-     void *handle;
+-     const unsigned char *data_ptr = (const unsigned char *)data.c_str();
+-
+-     ret = processor->handle_data(data, ofs, &handle);
+-     if (ret < 0)
+-       goto done;
+-
+-     hash.Update(data_ptr, len);
+-
+-     ret = processor->throttle_data(handle, false);
+-     if (ret < 0)
+-       goto done;
++     ret = put_data_and_throttle(processor, data, ofs, &hash, false);
+ 
+      ofs += len;
+ 
+      if (ofs > max_len) {
+--- a/src/rgw/rgw_rados.cc
++++ b/src/rgw/rgw_rados.cc
+@@ -899,10 +899,12 @@
+ 
+   return 0;
+ };
+ 
+-int RGWPutObjProcessor_Plain::handle_data(bufferlist& bl, off_t _ofs, void **phandle)
++int RGWPutObjProcessor_Plain::handle_data(bufferlist& bl, off_t _ofs, void **phandle, bool *again)
+ {
++  *again = false;
++
+   if (ofs != _ofs)
+     return -EINVAL;
+ 
+   data.append(bl);
+@@ -1025,10 +1027,12 @@
+ 
+   return RGWPutObjProcessor_Aio::handle_obj_data(cur_obj, bl, ofs - cur_part_ofs, ofs, phandle, exclusive);
+ }
+ 
+-int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **phandle)
++int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **phandle, bool *again)
+ {
++  *again = false;
++
+   *phandle = NULL;
+   if (extra_data_len) {
+     size_t extra_len = bl.length();
+     if (extra_len > extra_data_len)
+@@ -1043,15 +1047,18 @@
+       return 0;
+     }
+   }
+ 
+-  uint64_t max_chunk_size = store->get_max_chunk_size();
++  uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
+ 
+   pending_data_bl.claim_append(bl);
+-  if (pending_data_bl.length() < max_chunk_size)
++  if (pending_data_bl.length() < max_write_size)
+     return 0;
+ 
+-  pending_data_bl.splice(0, max_chunk_size, &bl);
++  pending_data_bl.splice(0, max_write_size, &bl);
++
++  /* do we have enough data pending accumulated that needs to be written? */
++  *again = (pending_data_bl.length() >= max_chunk_size);
+ 
+   if (!data_ofs && !immutable_head()) {
+     first_chunk.claim(bl);
+     obj_len = (uint64_t)first_chunk.length();
+@@ -1069,19 +1076,32 @@
+                                                         object and cleanup can be messy */
+   return write_data(bl, write_ofs, phandle, exclusive);
+ }
+ 
+-int RGWPutObjProcessor_Atomic::prepare(RGWRados *store, void *obj_ctx, string *oid_rand)
++
++int RGWPutObjProcessor_Atomic::prepare_init(RGWRados *store, void *obj_ctx, string *oid_rand)
+ {
+   RGWPutObjProcessor::prepare(store, obj_ctx, oid_rand);
+ 
+-  head_obj.init(bucket, obj_str);
++  int r = store->get_max_chunk_size(bucket, &max_chunk_size);
++  if (r < 0) {
++    return r;
++  }
++
++  return 0;
++}
+ 
+-  uint64_t max_chunk_size = store->get_max_chunk_size();
++int RGWPutObjProcessor_Atomic::prepare(RGWRados *store, void *obj_ctx, string *oid_rand)
++{
++  int r = prepare_init(store, obj_ctx, oid_rand);
++  if (r < 0) {
++    return r;
++  }
++  head_obj.init(bucket, obj_str);
+ 
+   manifest.set_trivial_rule(max_chunk_size, store->ctx()->_conf->rgw_obj_stripe_size);
+ 
+-  int r = manifest_gen.create_begin(store->ctx(), &manifest, bucket, head_obj);
++  r = manifest_gen.create_begin(store->ctx(), &manifest, bucket, head_obj);
+   if (r < 0) {
+     return r;
+   }
+ 
+@@ -1200,8 +1220,46 @@
+     objs_state[new_obj].prefetch_data = true;
+   }
+ }
+ 
++int RGWRados::get_required_alignment(rgw_bucket& bucket, uint64_t *alignment)
++{
++  IoCtx ioctx;
++  int r = open_bucket_data_ctx(bucket, ioctx);
++  if (r < 0) {
++    ldout(cct, 0) << "ERROR: open_bucket_data_ctx() returned " << r << dendl;
++    return r;
++  }
++
++  *alignment = ioctx.pool_required_alignment();
++  return 0;
++}
++
++int RGWRados::get_max_chunk_size(rgw_bucket& bucket, uint64_t *max_chunk_size)
++{
++  uint64_t alignment;
++  int r = get_required_alignment(bucket, &alignment);
++  if (r < 0) {
++    return r;
++  }
++
++  uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
++
++  if (alignment == 0) {
++    *max_chunk_size = config_chunk_size;
++    return 0;
++  }
++
++  if (config_chunk_size <= alignment) {
++    *max_chunk_size = alignment;
++    return 0;
++  }
++
++  *max_chunk_size = config_chunk_size - (config_chunk_size % alignment);
++
++  return 0;
++}
++
+ void RGWRados::finalize()
+ {
+   if (need_watch_notify()) {
+     finalize_watch();
+@@ -1235,10 +1293,8 @@
+ int RGWRados::init_rados()
+ {
+   int ret;
+ 
+-  max_chunk_size = cct->_conf->rgw_max_chunk_size;
+-
+   rados = new Rados();
+   if (!rados)
+     return -ENOMEM;
+ 
+@@ -2956,27 +3012,35 @@
+                                                                        progress_data(_progress_data) {}
+   int handle_data(bufferlist& bl, off_t ofs, off_t len) {
+     progress_cb(ofs, progress_data);
+ 
+-    void *handle;
+-    int ret = processor->handle_data(bl, ofs, &handle);
+-    if (ret < 0)
+-      return ret;
++    bool again;
+ 
+-    if (opstate) {
+-      /* need to update opstate repository with new state. This is ratelimited, so we're not
+-       * really doing it every time
+-       */
+-      ret = opstate->renew_state();
+-      if (ret < 0) {
+-        /* could not renew state! might have been marked as cancelled */
++    bool need_opstate = true;
++
++    do {
++      void *handle;
++      int ret = processor->handle_data(bl, ofs, &handle, &again);
++      if (ret < 0)
+         return ret;
++
++      if (need_opstate && opstate) {
++        /* need to update opstate repository with new state. This is ratelimited, so we're not
++         * really doing it every time
++         */
++        ret = opstate->renew_state();
++        if (ret < 0) {
++          /* could not renew state! might have been marked as cancelled */
++          return ret;
++        }
++
++        need_opstate = false;
+       }
+-    }
+ 
+-    ret = processor->throttle_data(handle, false);
+-    if (ret < 0)
+-      return ret;
++      ret = processor->throttle_data(handle, false);
++      if (ret < 0)
++        return ret;
++    } while (again);
+ 
+     return 0;
+   }
+ 
+@@ -3191,26 +3255,8 @@
+     return ret;
+ 
+   vector<rgw_obj> ref_objs;
+ 
+-  bool copy_data = !astate->has_manifest;
+-  bool copy_first = false;
+-  if (astate->has_manifest) {
+-    if (!astate->manifest.has_tail()) {
+-      copy_data = true;
+-    } else {
+-      uint64_t head_size = astate->manifest.get_head_size();
+-
+-      if (head_size > 0) {
+-	if (head_size > max_chunk_size)  // should never happen
+-	  copy_data = true;
+-	else
+-          copy_first = true;
+-      }
+-    }
+-  }
+-
+-
+   if (remote_dest) {
+     /* dest is in a different region, copy it there */
+ 
+     string etag;
+@@ -3229,10 +3275,37 @@
+     if (ret < 0)
+       return ret;
+ 
+     return 0;
+-  } else if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
+-    return copy_obj_data(ctx, dest_bucket_info.owner, &handle, end, dest_obj, src_obj, mtime, src_attrs, category, ptag, err);
++  }
++  
++  uint64_t max_chunk_size;
++
++  ret = get_max_chunk_size(dest_obj.bucket, &max_chunk_size);
++  if (ret < 0) {
++    ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj.bucket << dendl;
++    return ret;
++  }
++
++  bool copy_data = !astate->has_manifest;
++  bool copy_first = false;
++  if (astate->has_manifest) {
++    if (!astate->manifest.has_tail()) {
++      copy_data = true;
++    } else {
++      uint64_t head_size = astate->manifest.get_head_size();
++
++      if (head_size > 0) {
++	if (head_size > max_chunk_size)
++	  copy_data = true;
++	else
++          copy_first = true;
++      }
++    }
++  }
++
++  if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
++    return copy_obj_data(ctx, dest_bucket_info.owner, &handle, end, dest_obj, src_obj, max_chunk_size, mtime, src_attrs, category, ptag, err);
+   }
+ 
+   RGWObjManifest::obj_iterator miter = astate->manifest.obj_begin();
+ 
+@@ -3340,8 +3413,9 @@
+                const string& owner,
+ 	       void **handle, off_t end,
+                rgw_obj& dest_obj,
+                rgw_obj& src_obj,
++               uint64_t max_chunk_size,
+ 	       time_t *mtime,
+                map<string, bufferlist>& attrs,
+                RGWObjCategory category,
+                string *ptag,
+@@ -4472,8 +4546,10 @@
+ 
+   bool merge_bl = false;
+   bufferlist *pbl = &bl;
+   bufferlist read_bl;
++  uint64_t max_chunk_size;
++
+ 
+   get_obj_bucket_and_oid_key(obj, bucket, oid, key);
+ 
+   if (!rctx) {
+@@ -4504,8 +4580,14 @@
+       get_obj_bucket_and_oid_key(read_obj, bucket, oid, key);
+     }
+   }
+ 
++  r = get_max_chunk_size(bucket, &max_chunk_size);
++  if (r < 0) {
++    ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << bucket << dendl;
++    goto done_ret;
++  }
++
+   if (len > max_chunk_size)
+     len = max_chunk_size;
+ 
+ 
+--- a/src/rgw/rgw_rados.h
++++ b/src/rgw/rgw_rados.h
+@@ -547,9 +547,9 @@
+     store = _store;
+     obj_ctx = _o;
+     return 0;
+   };
+-  virtual int handle_data(bufferlist& bl, off_t ofs, void **phandle) = 0;
++  virtual int handle_data(bufferlist& bl, off_t ofs, void **phandle, bool *again) = 0;
+   virtual int throttle_data(void *handle, bool need_to_wait) = 0;
+   virtual int complete(string& etag, time_t *mtime, time_t set_mtime, map<string, bufferlist>& attrs);
+ };
+ 
+@@ -563,9 +563,9 @@
+   off_t ofs;
+ 
+ protected:
+   int prepare(RGWRados *store, void *obj_ctx, string *oid_rand);
+-  int handle_data(bufferlist& bl, off_t ofs, void **phandle);
++  int handle_data(bufferlist& bl, off_t ofs, void **phandle, bool *again);
+   int do_complete(string& etag, time_t *mtime, time_t set_mtime, map<string, bufferlist>& attrs);
+ 
+ public:
+   int throttle_data(void *handle, bool need_to_wait) { return 0; }
+@@ -612,8 +612,10 @@
+ 
+   uint64_t extra_data_len;
+   bufferlist extra_data_bl;
+   bufferlist pending_data_bl;
++  uint64_t max_chunk_size;
++
+ protected:
+   rgw_bucket bucket;
+   string obj_str;
+ 
+@@ -630,8 +632,10 @@
+   int prepare_next_part(off_t ofs);
+   int complete_parts();
+   int complete_writing_data();
+ 
++  int prepare_init(RGWRados *store, void *obj_ctx, string *oid_rand);
++
+ public:
+   ~RGWPutObjProcessor_Atomic() {}
+   RGWPutObjProcessor_Atomic(const string& bucket_owner, rgw_bucket& _b, const string& _o, uint64_t _p, const string& _t) :
+                                 RGWPutObjProcessor_Aio(bucket_owner),
+@@ -640,17 +644,18 @@
+                                 next_part_ofs(_p),
+                                 cur_part_id(0),
+                                 data_ofs(0),
+                                 extra_data_len(0),
++                                max_chunk_size(0),
+                                 bucket(_b),
+                                 obj_str(_o),
+                                 unique_tag(_t) {}
+   int prepare(RGWRados *store, void *obj_ctx, string *oid_rand);
+   virtual bool immutable_head() { return false; }
+   void set_extra_data_len(uint64_t len) {
+     extra_data_len = len;
+   }
+-  virtual int handle_data(bufferlist& bl, off_t ofs, void **phandle);
++  virtual int handle_data(bufferlist& bl, off_t ofs, void **phandle, bool *again);
+   bufferlist& get_extra_data() { return extra_data_bl; }
+ };
+ 
+ 
+@@ -1220,10 +1225,8 @@
+   int get_obj_ioctx(const rgw_obj& obj, librados::IoCtx *ioctx);
+   int get_obj_ref(const rgw_obj& obj, rgw_rados_ref *ref, rgw_bucket *bucket, bool ref_system_obj = false);
+   uint64_t max_bucket_id;
+ 
+-  uint64_t max_chunk_size;
+-
+   int get_obj_state(RGWRadosCtx *rctx, rgw_obj& obj, RGWObjState **state, RGWObjVersionTracker *objv_tracker);
+   int append_atomic_test(RGWRadosCtx *rctx, rgw_obj& obj,
+                          librados::ObjectOperation& op, RGWObjState **state);
+   int prepare_atomic_for_write_impl(RGWRadosCtx *rctx, rgw_obj& obj,
+@@ -1286,9 +1289,8 @@
+                gc(NULL), use_gc_thread(false), quota_threads(false),
+                num_watchers(0), watchers(NULL), watch_handles(NULL),
+                watch_initialized(false),
+                bucket_id_lock("rados_bucket_id"), max_bucket_id(0),
+-               max_chunk_size(0),
+                cct(NULL), rados(NULL),
+                pools_initialized(false),
+                quota_handler(NULL),
+                rest_master_conn(NULL),
+@@ -1324,11 +1326,10 @@
+       delete rados;
+     }
+   }
+ 
+-  uint64_t get_max_chunk_size() {
+-    return max_chunk_size;
+-  }
++  int get_required_alignment(rgw_bucket& bucket, uint64_t *alignment);
++  int get_max_chunk_size(rgw_bucket& bucket, uint64_t *max_chunk_size);
+ 
+   int list_raw_objects(rgw_bucket& pool, const string& prefix_filter, int max,
+                        RGWListRawObjsCtx& ctx, list<string>& oids,
+                        bool *is_truncated);
+@@ -1562,8 +1563,9 @@
+                const string& owner,
+ 	       void **handle, off_t end,
+                rgw_obj& dest_obj,
+                rgw_obj& src_obj,
++               uint64_t max_chunk_size,
+ 	       time_t *mtime,
+                map<string, bufferlist>& attrs,
+                RGWObjCategory category,
+                string *ptag,
+--- a/src/rgw/rgw_rest.cc
++++ b/src/rgw/rgw_rest.cc
+@@ -179,9 +179,9 @@
+ {
+   std::ostringstream oss;
+   formatter->flush(oss);
+   std::string outs(oss.str());
+-  if (!outs.empty()) {
++  if (!outs.empty() && s->op != OP_HEAD) {
+     s->cio->write(outs.c_str(), outs.size());
+   }
+ 
+   s->formatter->reset();
+@@ -191,9 +191,9 @@
+ {
+   std::ostringstream oss;
+   formatter->flush(oss);
+   std::string outs(oss.str());
+-  if (!outs.empty()) {
++  if (!outs.empty() && s->op != OP_HEAD) {
+     s->cio->write(outs.c_str(), outs.size());
+   }
+ }
+ 
+--- a/src/rgw/rgw_rest_swift.cc
++++ b/src/rgw/rgw_rest_swift.cc
+@@ -626,20 +626,18 @@
+   string hdrs, exp_hdrs;
+   uint32_t max_age = CORS_MAX_AGE_INVALID;
+   /*EACCES means, there is no CORS registered yet for the bucket
+    *ENOENT means, there is no match of the Origin in the list of CORSRule
+-   *ENOTSUPP means, the HTTP_METHOD is not supported
+    */
+   if (ret == -ENOENT)
+     ret = -EACCES;
+-  if (ret != -EACCES) {
+-    get_response_params(hdrs, exp_hdrs, &max_age);
+-  } else {
++  if (ret < 0) {
+     set_req_state_err(s, ret);
+     dump_errno(s);
+     end_header(s, NULL);
+     return;
+   }
++  get_response_params(hdrs, exp_hdrs, &max_age);
+   dump_errno(s);
+   dump_access_control(s, origin, req_meth, hdrs.c_str(), exp_hdrs.c_str(), max_age); 
+   end_header(s, NULL);
+ }
+--- a/src/test/crush/TestCrushWrapper.cc
++++ b/src/test/crush/TestCrushWrapper.cc
+@@ -537,8 +537,13 @@
+     EXPECT_NE(string::npos,
+ 	      ss.str().find("<item_name>default</item_name></step>"));
+   }
+ 
++  map<int,float> wm;
++  c->get_rule_weight_osd_map(0, &wm);
++  ASSERT_TRUE(wm.size() == 1);
++  ASSERT_TRUE(wm[0] == 1.0);
++
+   delete c;
+ }
+ 
+ TEST(CrushWrapper, distance) {
+--- a/src/test/erasure-code/TestErasureCodeJerasure.cc
++++ b/src/test/erasure-code/TestErasureCodeJerasure.cc
+@@ -287,8 +287,38 @@
+       c->insert_item(g_ceph_context, osd, 1.0, string("osd.") + stringify(osd), loc);
+     }
+   }
+ 
++  //
++  // The ruleid may be different from the ruleset when a crush rule is
++  // removed because the removed ruleid will be reused but the removed
++  // ruleset will not be reused. 
++  //
++  // This also asserts that the create_ruleset() method returns a
++  // ruleset and not a ruleid http://tracker.ceph.com/issues/9044
++  //
++  {
++    stringstream ss;
++    ErasureCodeJerasureReedSolomonVandermonde jerasure;
++    map<std::string,std::string> parameters;
++    parameters["k"] = "2";
++    parameters["m"] = "2";
++    parameters["w"] = "8";
++    jerasure.init(parameters);
++    int FIRST = jerasure.create_ruleset("FIRST", *c, &ss);
++    int SECOND = jerasure.create_ruleset("SECOND", *c, &ss);
++    int FIRST_ruleid = c->get_rule_id("FIRST");
++    EXPECT_EQ(0, c->remove_rule(FIRST_ruleid));
++    int ruleset = jerasure.create_ruleset("myrule", *c, &ss);
++    EXPECT_NE(FIRST, ruleset);
++    EXPECT_NE(SECOND, ruleset);
++    EXPECT_NE(ruleset, c->get_rule_id("myrule"));
++    int SECOND_ruleid = c->get_rule_id("SECOND");
++    EXPECT_EQ(0, c->remove_rule(SECOND_ruleid));
++    int myrule_ruleid = c->get_rule_id("myrule");
++    EXPECT_EQ(0, c->remove_rule(myrule_ruleid));
++  }
++
+   {
+     stringstream ss;
+     ErasureCodeJerasureReedSolomonVandermonde jerasure;
+     map<std::string,std::string> parameters;
+--- a/src/test/librados/TestCase.cc
++++ b/src/test/librados/TestCase.cc
+@@ -7,8 +7,9 @@
+ 
+ using namespace librados;
+ 
+ std::string RadosTest::pool_name;
++std::string RadosTest::nspace;
+ rados_t RadosTest::s_cluster = NULL;
+ 
+ void RadosTest::SetUpTestCase()
+ {
+@@ -24,9 +25,9 @@
+ void RadosTest::SetUp()
+ {
+   cluster = RadosTest::s_cluster;
+   ASSERT_EQ(0, rados_ioctx_create(cluster, pool_name.c_str(), &ioctx));
+-  std::string nspace = get_temp_pool_name();
++  nspace = get_temp_pool_name();
+   rados_ioctx_set_namespace(ioctx, nspace.c_str());
+   ASSERT_FALSE(rados_ioctx_pool_requires_alignment(ioctx));
+ }
+ 
+@@ -205,26 +206,8 @@
+   cleanup_default_namespace(ioctx);
+   rados_ioctx_destroy(ioctx);
+ }
+ 
+-void RadosTestEC::cleanup_default_namespace(rados_ioctx_t ioctx)
+-{
+-  // remove all objects from the default namespace to avoid polluting
+-  // other tests
+-  rados_ioctx_set_namespace(ioctx, "");
+-  rados_list_ctx_t list_ctx;
+-  ASSERT_EQ(0, rados_objects_list_open(ioctx, &list_ctx));
+-  int r;
+-  const char *entry = NULL;
+-  const char *key = NULL;
+-  while ((r = rados_objects_list_next(list_ctx, &entry, &key)) != -ENOENT) {
+-    ASSERT_EQ(0, r);
+-    rados_ioctx_locator_set_key(ioctx, key);
+-    ASSERT_EQ(0, rados_remove(ioctx, entry));
+-  }
+-  rados_objects_list_close(list_ctx);
+-}
+-
+ std::string RadosTestECPP::pool_name;
+ Rados RadosTestECPP::s_cluster;
+ 
+ void RadosTestECPP::SetUpTestCase()
+@@ -253,15 +236,4 @@
+   cleanup_default_namespace(ioctx);
+   ioctx.close();
+ }
+ 
+-void RadosTestECPP::cleanup_default_namespace(librados::IoCtx ioctx)
+-{
+-  // remove all objects from the default namespace to avoid polluting
+-  // other tests
+-  ioctx.set_namespace("");
+-  for (ObjectIterator it = ioctx.objects_begin();
+-       it != ioctx.objects_end(); ++it) {
+-    ioctx.locator_set_key(it->second);
+-    ASSERT_EQ(0, ioctx.remove(it->first));
+-  }
+-}
+--- a/src/test/librados/TestCase.h
++++ b/src/test/librados/TestCase.h
+@@ -27,8 +27,9 @@
+   static void TearDownTestCase();
+   static void cleanup_default_namespace(rados_ioctx_t ioctx);
+   static rados_t s_cluster;
+   static std::string pool_name;
++  static std::string nspace;
+ 
+   virtual void SetUp();
+   virtual void TearDown();
+   rados_t cluster;
+@@ -71,16 +72,15 @@
+   librados::IoCtx ioctx;
+   std::string ns;
+ };
+ 
+-class RadosTestEC : public ::testing::Test {
++class RadosTestEC : public RadosTest {
+ public:
+   RadosTestEC() {}
+   virtual ~RadosTestEC() {}
+ protected:
+   static void SetUpTestCase();
+   static void TearDownTestCase();
+-  static void cleanup_default_namespace(rados_ioctx_t ioctx);
+   static rados_t s_cluster;
+   static std::string pool_name;
+ 
+   virtual void SetUp();
+@@ -89,16 +89,15 @@
+   rados_ioctx_t ioctx;
+   uint64_t alignment;
+ };
+ 
+-class RadosTestECPP : public ::testing::Test {
++class RadosTestECPP : public RadosTestPP {
+ public:
+   RadosTestECPP() : cluster(s_cluster) {};
+   virtual ~RadosTestECPP() {};
+ protected:
+   static void SetUpTestCase();
+   static void TearDownTestCase();
+-  static void cleanup_default_namespace(librados::IoCtx ioctx);
+   static librados::Rados s_cluster;
+   static std::string pool_name;
+ 
+   virtual void SetUp();
+--- a/src/test/librados/io.cc
++++ b/src/test/librados/io.cc
+@@ -24,8 +24,60 @@
+   rados_ioctx_set_namespace(ioctx, "nspace");
+   ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
+ }
+ 
++TEST_F(LibRadosIo, ReadTimeout) {
++  char buf[128];
++  memset(buf, 'a', sizeof(buf));
++  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
++
++  {
++    // set up a second client
++    rados_t cluster;
++    rados_ioctx_t ioctx;
++    rados_create(&cluster, "admin");
++    rados_conf_read_file(cluster, NULL);
++    rados_conf_parse_env(cluster, NULL);
++    rados_conf_set(cluster, "rados_osd_op_timeout", "0.00001"); // use any small value that will result in a timeout
++    rados_connect(cluster);
++    rados_ioctx_create(cluster, pool_name.c_str(), &ioctx);
++    rados_ioctx_set_namespace(ioctx, nspace.c_str());
++
++    // then we show that the buffer is changed after rados_read returned
++    // with a timeout
++    for (int i=0; i<5; i++) {
++      char buf2[sizeof(buf)];
++      memset(buf2, 0, sizeof(buf2));
++      int err = rados_read(ioctx, "foo", buf2, sizeof(buf2), 0);
++      if (err == -110) {
++	int startIndex = 0;
++	// find the index until which librados already read the object before the timeout occurred
++	for (unsigned b=0; b<sizeof(buf); b++) {
++	  if (buf2[b] != buf[b]) {
++	    startIndex = b;
++	    break;
++	  }
++	}
++
++	// wait some time to give librados a chance to do something
++	sleep(1);
++
++	// then check if the buffer was changed after the call
++	if (buf2[startIndex] == 'a') {
++	  printf("byte at index %d was changed after the timeout to %d\n",
++		 startIndex, (int)buf[startIndex]);
++	  ASSERT_TRUE(0);
++	  break;
++	}
++      } else {
++	printf("no timeout :/\n");
++      }
++    }
++    rados_ioctx_destroy(ioctx);
++    rados_shutdown(cluster);
++  }
++}
++
+ TEST_F(LibRadosIoPP, SimpleWritePP) {
+   char buf[128];
+   memset(buf, 0xcc, sizeof(buf));
+   bufferlist bl;
+--- a/src/test/librados/tier.cc
++++ b/src/test/librados/tier.cc
+@@ -33,8 +33,40 @@
+ 
+ typedef RadosTestPP LibRadosTierPP;
+ typedef RadosTestECPP LibRadosTierECPP;
+ 
++void flush_evict_all(librados::Rados& cluster, librados::IoCtx& cache_ioctx)
++{
++  bufferlist inbl;
++  cache_ioctx.set_namespace("");
++  for (ObjectIterator it = cache_ioctx.objects_begin();
++       it != cache_ioctx.objects_end(); ++it) {
++    cache_ioctx.locator_set_key(it->second);
++    {
++      ObjectReadOperation op;
++      op.cache_flush();
++      librados::AioCompletion *completion = cluster.aio_create_completion();
++      cache_ioctx.aio_operate(
++        it->first, completion, &op,
++	librados::OPERATION_IGNORE_OVERLAY, NULL);
++      completion->wait_for_safe();
++      completion->get_return_value();
++      completion->release();
++    }
++    {
++      ObjectReadOperation op;
++      op.cache_evict();
++      librados::AioCompletion *completion = cluster.aio_create_completion();
++      cache_ioctx.aio_operate(
++        it->first, completion, &op,
++	librados::OPERATION_IGNORE_OVERLAY, NULL);
++      completion->wait_for_safe();
++      completion->get_return_value();
++      completion->release();
++    }
++  }
++}
++
+ class LibRadosTwoPoolsPP : public RadosTestPP
+ {
+ public:
+   LibRadosTwoPoolsPP() {};
+@@ -58,9 +90,28 @@
+     cache_ioctx.set_namespace(ns);
+   }
+   virtual void TearDown() {
+     RadosTestPP::TearDown();
++
++    // flush + evict cache
++    flush_evict_all(cluster, cache_ioctx);
++
++    bufferlist inbl;
++    // tear down tiers
++    ASSERT_EQ(0, cluster.mon_command(
++      "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
++      "\"}",
++      inbl, NULL, NULL));
++    ASSERT_EQ(0, cluster.mon_command(
++      "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
++      "\", \"tierpool\": \"" + cache_pool_name + "\"}",
++    inbl, NULL, NULL));
++
++    // wait for maps to settle before next test
++    cluster.wait_for_latest_osdmap();
++
+     cleanup_default_namespace(cache_ioctx);
++
+     cache_ioctx.close();
+   }
+   librados::IoCtx cache_ioctx;
+ };
+@@ -179,21 +230,8 @@
+     ASSERT_EQ(0, completion->get_return_value());
+     completion->release();
+     ASSERT_EQ('b', bl[0]);
+   }
+-
+-  // tear down tiers
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+-    "\"}",
+-    inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+-    inbl, NULL, NULL));
+-
+-  // wait for maps to settle before next test
+-  cluster.wait_for_latest_osdmap();
+ }
+ 
+ TEST_F(LibRadosTwoPoolsPP, Promote) {
+   // create object
+@@ -246,21 +284,8 @@
+     ASSERT_TRUE(it->first == string("foo") || it->first == string("bar"));
+     ++it;
+     ASSERT_TRUE(it == cache_ioctx.objects_end());
+   }
+-
+-  // tear down tiers
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+-    "\"}",
+-    inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+-    inbl, NULL, NULL));
+-
+-  // wait for maps to settle before next test
+-  cluster.wait_for_latest_osdmap();
+ }
+ 
+ TEST_F(LibRadosTwoPoolsPP, PromoteSnap) {
+   // create object
+@@ -399,21 +424,8 @@
+   {
+     bufferlist bl;
+     ASSERT_EQ(-ENOENT, ioctx.read("baz", bl, 1, 0));
+   }
+-
+-  // tear down tiers
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+-    "\"}",
+-    inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+-    inbl, NULL, NULL));
+-
+-  // wait for maps to settle before next test
+-  cluster.wait_for_latest_osdmap();
+ }
+ 
+ TEST_F(LibRadosTwoPoolsPP, PromoteSnapScrub) {
+   int num = 100;
+@@ -508,21 +520,8 @@
+     cout << "done waiting" << std::endl;
+   }
+ 
+   ioctx.snap_set_read(librados::SNAP_HEAD);
+-
+-  // tear down tiers
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+-    "\"}",
+-    inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+-    inbl, NULL, NULL));
+-
+-  // wait for maps to settle before next test
+-  cluster.wait_for_latest_osdmap();
+ }
+ 
+ 
+ TEST_F(LibRadosTwoPoolsPP, PromoteSnapTrimRace) {
+@@ -576,21 +575,8 @@
+   {
+     bufferlist bl;
+     ASSERT_EQ(-ENOENT, ioctx.read("foo", bl, 1, 0));
+   }
+-
+-  // tear down tiers
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+-    "\"}",
+-    inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+-    inbl, NULL, NULL));
+-
+-  // wait for maps to settle before next test
+-  cluster.wait_for_latest_osdmap();
+ }
+ 
+ TEST_F(LibRadosTwoPoolsPP, Whiteout) {
+   // create object
+@@ -652,21 +638,8 @@
+     bufferlist bl;
+     ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
+     ASSERT_EQ('h', bl[0]);
+   }
+-
+-  // tear down tiers
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+-    "\"}",
+-    inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+-    inbl, NULL, NULL));
+-
+-  // wait for maps to settle before next test
+-  cluster.wait_for_latest_osdmap();
+ }
+ 
+ TEST_F(LibRadosTwoPoolsPP, Evict) {
+   // create object
+@@ -755,21 +728,8 @@
+     completion->wait_for_safe();
+     ASSERT_EQ(-EBUSY, completion->get_return_value());
+     completion->release();
+   }
+-
+-  // tear down tiers
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+-    "\"}",
+-    inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+-    inbl, NULL, NULL));
+-
+-  // wait for maps to settle before next test
+-  cluster.wait_for_latest_osdmap();
+ }
+ 
+ TEST_F(LibRadosTwoPoolsPP, EvictSnap) {
+   // create object
+@@ -1003,21 +963,8 @@
+     completion->wait_for_safe();
+     ASSERT_EQ(0, completion->get_return_value());
+     completion->release();
+   }
+-
+-  // tear down tiers
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+-    "\"}",
+-    inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+-    inbl, NULL, NULL));
+-
+-  // wait for maps to settle before next test
+-  cluster.wait_for_latest_osdmap();
+ }
+ 
+ TEST_F(LibRadosTwoPoolsPP, TryFlush) {
+   // configure cache
+@@ -1124,21 +1071,8 @@
+   {
+     ObjectIterator it = cache_ioctx.objects_begin();
+     ASSERT_TRUE(it == cache_ioctx.objects_end());
+   }
+-
+-  // tear down tiers
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+-    "\"}",
+-    inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+-    inbl, NULL, NULL));
+-
+-  // wait for maps to settle before next test
+-  cluster.wait_for_latest_osdmap();
+ }
+ 
+ TEST_F(LibRadosTwoPoolsPP, Flush) {
+   // configure cache
+@@ -1297,21 +1231,8 @@
+   {
+     ObjectIterator it = ioctx.objects_begin();
+     ASSERT_TRUE(it == ioctx.objects_end());
+   }
+-
+-  // tear down tiers
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+-    "\"}",
+-    inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+-    inbl, NULL, NULL));
+-
+-  // wait for maps to settle before next test
+-  cluster.wait_for_latest_osdmap();
+ }
+ 
+ TEST_F(LibRadosTwoPoolsPP, FlushSnap) {
+   // configure cache
+@@ -1469,20 +1390,13 @@
+     ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
+     ASSERT_EQ('a', bl[0]);
+   }
+ 
+-  // tear down tiers
++  // remove overlay
+   ASSERT_EQ(0, cluster.mon_command(
+     "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+     "\"}",
+     inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+-    inbl, NULL, NULL));
+-
+-  // wait for maps to settle
+-  cluster.wait_for_latest_osdmap();
+ 
+   // verify i can read the snaps from the base pool
+   ioctx.snap_set_read(librados::SNAP_HEAD);
+   {
+@@ -1501,8 +1415,13 @@
+     bufferlist bl;
+     ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
+     ASSERT_EQ('a', bl[0]);
+   }
++
++  ASSERT_EQ(0, cluster.mon_command(
++    "{\"prefix\": \"osd tier set-overlay\", \"pool\": \"" + pool_name +
++    "\", \"overlaypool\": \"" + cache_pool_name + "\"}",
++    inbl, NULL, NULL));
+ }
+ 
+ TEST_F(LibRadosTierPP, FlushWriteRaces) {
+   Rados cluster;
+@@ -1785,21 +1704,8 @@
+     ASSERT_EQ(0, completion2->get_return_value());
+     completion->release();
+     completion2->release();
+   }
+-
+-  // tear down tiers
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+-    "\"}",
+-    inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+-    inbl, NULL, NULL));
+-
+-  // wait for maps to settle before next test
+-  cluster.wait_for_latest_osdmap();
+ }
+ 
+ 
+ IoCtx *read_ioctx = 0;
+@@ -1894,21 +1800,8 @@
+   max_reads = 0;
+   while (num_reads > 0)
+     cond.Wait(test_lock);
+   test_lock.Unlock();
+-
+-  // tear down tiers
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+-    "\"}",
+-    inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+-    inbl, NULL, NULL));
+-
+-  // wait for maps to settle before next test
+-  cluster.wait_for_latest_osdmap();
+ }
+ 
+ TEST_F(LibRadosTierPP, HitSetNone) {
+   {
+@@ -1943,23 +1836,30 @@
+     + string("\",\"var\": \"") + var + string("\",\"val\": \"")
+     + stringify(val) + string("\"}");
+ }
+ 
+-TEST_F(LibRadosTierPP, HitSetRead) {
+-  // enable hitset tracking for this pool
++TEST_F(LibRadosTwoPoolsPP, HitSetRead) {
++  // make it a tier
+   bufferlist inbl;
+-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", 2),
++  ASSERT_EQ(0, cluster.mon_command(
++    "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
++    "\", \"tierpool\": \"" + cache_pool_name +
++    "\", \"force_nonempty\": \"--force-nonempty\" }",
++    inbl, NULL, NULL));
++
++  // enable hitset tracking for this pool
++  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", 2),
+ 						inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", 600),
++  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", 600),
+ 						inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type",
++  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type",
+ 						"explicit_object"),
+ 				   inbl, NULL, NULL));
+ 
+   // wait for maps to settle
+   cluster.wait_for_latest_osdmap();
+ 
+-  ioctx.set_namespace("");
++  cache_ioctx.set_namespace("");
+ 
+   // keep reading until we see our object appear in the HitSet
+   utime_t start = ceph_clock_now(NULL);
+   utime_t hard_stop = start + utime_t(600, 0);
+@@ -1968,18 +1868,18 @@
+     utime_t now = ceph_clock_now(NULL);
+     ASSERT_TRUE(now < hard_stop);
+ 
+     string name = "foo";
+-    uint32_t hash = ioctx.get_object_hash_position(name);
++    uint32_t hash = cache_ioctx.get_object_hash_position(name);
+     hobject_t oid(sobject_t(name, CEPH_NOSNAP), "", hash,
+-		  cluster.pool_lookup(pool_name.c_str()), "");
++		  cluster.pool_lookup(cache_pool_name.c_str()), "");
+ 
+     bufferlist bl;
+-    ASSERT_EQ(-ENOENT, ioctx.read("foo", bl, 1, 0));
++    ASSERT_EQ(-ENOENT, cache_ioctx.read("foo", bl, 1, 0));
+ 
+     bufferlist hbl;
+     AioCompletion *c = librados::Rados::aio_create_completion();
+-    ASSERT_EQ(0, ioctx.hit_set_get(hash, c, now.sec(), &hbl));
++    ASSERT_EQ(0, cache_ioctx.hit_set_get(hash, c, now.sec(), &hbl));
+     c->wait_for_complete();
+     c->release();
+ 
+     if (hbl.length()) {
+@@ -2027,49 +1927,58 @@
+   return -1;
+ }
+ 
+ 
+-TEST_F(LibRadosTierPP, HitSetWrite) {
++TEST_F(LibRadosTwoPoolsPP, HitSetWrite) {
+   int num_pg = _get_pg_num(cluster, pool_name);
+   assert(num_pg > 0);
+ 
+-  // enable hitset tracking for this pool
++  // make it a tier
+   bufferlist inbl;
+-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", 8),
++  ASSERT_EQ(0, cluster.mon_command(
++    "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
++    "\", \"tierpool\": \"" + cache_pool_name +
++    "\", \"force_nonempty\": \"--force-nonempty\" }",
++    inbl, NULL, NULL));
++
++  // enable hitset tracking for this pool
++  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", 8),
+ 						inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", 600),
++  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", 600),
+ 						inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type",
++  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type",
+ 						"explicit_hash"),
+ 				   inbl, NULL, NULL));
+ 
+   // wait for maps to settle
+   cluster.wait_for_latest_osdmap();
+ 
+-  ioctx.set_namespace("");
++  cache_ioctx.set_namespace("");
++
++  int num = 200;
+ 
+   // do a bunch of writes
+-  for (int i=0; i<1000; ++i) {
++  for (int i=0; i<num; ++i) {
+     bufferlist bl;
+     bl.append("a");
+-    ASSERT_EQ(0, ioctx.write(stringify(i), bl, 1, 0));
++    ASSERT_EQ(0, cache_ioctx.write(stringify(i), bl, 1, 0));
+   }
+ 
+   // get HitSets
+   std::map<int,HitSet> hitsets;
+   for (int i=0; i<num_pg; ++i) {
+     list< pair<time_t,time_t> > ls;
+     AioCompletion *c = librados::Rados::aio_create_completion();
+-    ASSERT_EQ(0, ioctx.hit_set_list(i, c, &ls));
++    ASSERT_EQ(0, cache_ioctx.hit_set_list(i, c, &ls));
+     c->wait_for_complete();
+     c->release();
+     std::cout << "pg " << i << " ls " << ls << std::endl;
+     ASSERT_FALSE(ls.empty());
+ 
+     // get the latest
+     c = librados::Rados::aio_create_completion();
+     bufferlist bl;
+-    ASSERT_EQ(0, ioctx.hit_set_get(i, c, ls.back().first, &bl));
++    ASSERT_EQ(0, cache_ioctx.hit_set_get(i, c, ls.back().first, &bl));
+     c->wait_for_complete();
+     c->release();
+ 
+     //std::cout << "bl len is " << bl.length() << "\n";
+@@ -2080,16 +1989,16 @@
+     ::decode(hitsets[i], p);
+ 
+     // cope with racing splits by refreshing pg_num
+     if (i == num_pg - 1)
+-      num_pg = _get_pg_num(cluster, pool_name);
++      num_pg = _get_pg_num(cluster, cache_pool_name);
+   }
+ 
+-  for (int i=0; i<1000; ++i) {
++  for (int i=0; i<num; ++i) {
+     string n = stringify(i);
+-    uint32_t hash = ioctx.get_object_hash_position(n);
++    uint32_t hash = cache_ioctx.get_object_hash_position(n);
+     hobject_t oid(sobject_t(n, CEPH_NOSNAP), "", hash,
+-		  cluster.pool_lookup(pool_name.c_str()), "");
++		  cluster.pool_lookup(cache_pool_name.c_str()), "");
+     std::cout << "checking for " << oid << std::endl;
+     bool found = false;
+     for (int p=0; p<num_pg; ++p) {
+       if (hitsets[p].contains(oid)) {
+@@ -2100,45 +2009,52 @@
+     ASSERT_TRUE(found);
+   }
+ }
+ 
+-TEST_F(LibRadosTierPP, HitSetTrim) {
++TEST_F(LibRadosTwoPoolsPP, HitSetTrim) {
+   unsigned count = 3;
+   unsigned period = 3;
+ 
+-  // enable hitset tracking for this pool
++  // make it a tier
+   bufferlist inbl;
+-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", count),
++  ASSERT_EQ(0, cluster.mon_command(
++    "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
++    "\", \"tierpool\": \"" + cache_pool_name +
++    "\", \"force_nonempty\": \"--force-nonempty\" }",
++    inbl, NULL, NULL));
++
++  // enable hitset tracking for this pool
++  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", count),
+ 						inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", period),
++  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", period),
+ 						inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type", "bloom"),
++  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type", "bloom"),
+ 				   inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_fpp", ".01"),
++  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_fpp", ".01"),
+ 				   inbl, NULL, NULL));
+ 
+   // wait for maps to settle
+   cluster.wait_for_latest_osdmap();
+ 
+-  ioctx.set_namespace("");
++  cache_ioctx.set_namespace("");
+ 
+   // do a bunch of writes and make sure the hitsets rotate
+   utime_t start = ceph_clock_now(NULL);
+   utime_t hard_stop = start + utime_t(count * period * 50, 0);
+ 
+   time_t first = 0;
+   while (true) {
+     string name = "foo";
+-    uint32_t hash = ioctx.get_object_hash_position(name);
++    uint32_t hash = cache_ioctx.get_object_hash_position(name);
+     hobject_t oid(sobject_t(name, CEPH_NOSNAP), "", hash, -1, "");
+ 
+     bufferlist bl;
+     bl.append("f");
+-    ASSERT_EQ(0, ioctx.write("foo", bl, 1, 0));
++    ASSERT_EQ(0, cache_ioctx.write("foo", bl, 1, 0));
+ 
+     list<pair<time_t, time_t> > ls;
+     AioCompletion *c = librados::Rados::aio_create_completion();
+-    ASSERT_EQ(0, ioctx.hit_set_list(hash, c, &ls));
++    ASSERT_EQ(0, cache_ioctx.hit_set_list(hash, c, &ls));
+     c->wait_for_complete();
+     c->release();
+ 
+     ASSERT_TRUE(ls.size() <= count + 1);
+@@ -2186,11 +2102,31 @@
+     cache_ioctx.set_namespace(ns);
+   }
+   virtual void TearDown() {
+     RadosTestECPP::TearDown();
++
++    // flush + evict cache
++    flush_evict_all(cluster, cache_ioctx);
++
++    bufferlist inbl;
++    // tear down tiers
++    ASSERT_EQ(0, cluster.mon_command(
++      "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
++      "\"}",
++      inbl, NULL, NULL));
++    ASSERT_EQ(0, cluster.mon_command(
++      "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
++      "\", \"tierpool\": \"" + cache_pool_name + "\"}",
++    inbl, NULL, NULL));
++
++    // wait for maps to settle before next test
++    cluster.wait_for_latest_osdmap();
++
+     cleanup_default_namespace(cache_ioctx);
++
+     cache_ioctx.close();
+   }
++
+   librados::IoCtx cache_ioctx;
+ };
+ 
+ std::string LibRadosTwoPoolsECPP::cache_pool_name;
+@@ -2307,21 +2243,8 @@
+     ASSERT_EQ(0, completion->get_return_value());
+     completion->release();
+     ASSERT_EQ('b', bl[0]);
+   }
+-
+-  // tear down tiers
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+-    "\"}",
+-    inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+-    inbl, NULL, NULL));
+-
+-  // wait for maps to settle before next test
+-  cluster.wait_for_latest_osdmap();
+ }
+ 
+ TEST_F(LibRadosTwoPoolsECPP, Promote) {
+   // create object
+@@ -2374,21 +2297,8 @@
+     ASSERT_TRUE(it->first == string("foo") || it->first == string("bar"));
+     ++it;
+     ASSERT_TRUE(it == cache_ioctx.objects_end());
+   }
+-
+-  // tear down tiers
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+-    "\"}",
+-    inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+-    inbl, NULL, NULL));
+-
+-  // wait for maps to settle before next test
+-  cluster.wait_for_latest_osdmap();
+ }
+ 
+ TEST_F(LibRadosTwoPoolsECPP, PromoteSnap) {
+   // create object
+@@ -2551,21 +2461,8 @@
+   {
+     bufferlist bl;
+     ASSERT_EQ(-ENOENT, ioctx.read("baz", bl, 1, 0));
+   }
+-
+-  // tear down tiers
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+-    "\"}",
+-    inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+-    inbl, NULL, NULL));
+-
+-  // wait for maps to settle before next test
+-  cluster.wait_for_latest_osdmap();
+ }
+ 
+ TEST_F(LibRadosTwoPoolsECPP, PromoteSnapTrimRace) {
+   // create object
+@@ -2618,21 +2515,8 @@
+   {
+     bufferlist bl;
+     ASSERT_EQ(-ENOENT, ioctx.read("foo", bl, 1, 0));
+   }
+-
+-  // tear down tiers
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+-    "\"}",
+-    inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+-    inbl, NULL, NULL));
+-
+-  // wait for maps to settle before next test
+-  cluster.wait_for_latest_osdmap();
+ }
+ 
+ TEST_F(LibRadosTwoPoolsECPP, Whiteout) {
+   // create object
+@@ -2694,21 +2578,8 @@
+     bufferlist bl;
+     ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
+     ASSERT_EQ('h', bl[0]);
+   }
+-
+-  // tear down tiers
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+-    "\"}",
+-    inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+-    inbl, NULL, NULL));
+-
+-  // wait for maps to settle before next test
+-  cluster.wait_for_latest_osdmap();
+ }
+ 
+ TEST_F(LibRadosTwoPoolsECPP, Evict) {
+   // create object
+@@ -2797,21 +2668,8 @@
+     completion->wait_for_safe();
+     ASSERT_EQ(-EBUSY, completion->get_return_value());
+     completion->release();
+   }
+-
+-  // tear down tiers
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+-    "\"}",
+-    inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+-    inbl, NULL, NULL));
+-
+-  // wait for maps to settle before next test
+-  cluster.wait_for_latest_osdmap();
+ }
+ 
+ TEST_F(LibRadosTwoPoolsECPP, EvictSnap) {
+   // create object
+@@ -3045,21 +2903,8 @@
+     completion->wait_for_safe();
+     ASSERT_EQ(0, completion->get_return_value());
+     completion->release();
+   }
+-
+-  // tear down tiers
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+-    "\"}",
+-    inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+-    inbl, NULL, NULL));
+-
+-  // wait for maps to settle before next test
+-  cluster.wait_for_latest_osdmap();
+ }
+ 
+ TEST_F(LibRadosTwoPoolsECPP, TryFlush) {
+   // configure cache
+@@ -3166,21 +3011,8 @@
+   {
+     ObjectIterator it = cache_ioctx.objects_begin();
+     ASSERT_TRUE(it == cache_ioctx.objects_end());
+   }
+-
+-  // tear down tiers
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+-    "\"}",
+-    inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+-    inbl, NULL, NULL));
+-
+-  // wait for maps to settle before next test
+-  cluster.wait_for_latest_osdmap();
+ }
+ 
+ TEST_F(LibRadosTwoPoolsECPP, Flush) {
+   // configure cache
+@@ -3339,21 +3171,8 @@
+   {
+     ObjectIterator it = ioctx.objects_begin();
+     ASSERT_TRUE(it == ioctx.objects_end());
+   }
+-
+-  // tear down tiers
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+-    "\"}",
+-    inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+-    inbl, NULL, NULL));
+-
+-  // wait for maps to settle before next test
+-  cluster.wait_for_latest_osdmap();
+ }
+ 
+ TEST_F(LibRadosTwoPoolsECPP, FlushSnap) {
+   // configure cache
+@@ -3516,12 +3335,8 @@
+   ASSERT_EQ(0, cluster.mon_command(
+     "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+     "\"}",
+     inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+-    inbl, NULL, NULL));
+ 
+   // wait for maps to settle
+   cluster.wait_for_latest_osdmap();
+ 
+@@ -3543,8 +3358,13 @@
+     bufferlist bl;
+     ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
+     ASSERT_EQ('a', bl[0]);
+   }
++
++  ASSERT_EQ(0, cluster.mon_command(
++    "{\"prefix\": \"osd tier set-overlay\", \"pool\": \"" + pool_name +
++    "\", \"overlaypool\": \"" + cache_pool_name + "\"}",
++    inbl, NULL, NULL));
+ }
+ 
+ TEST_F(LibRadosTierECPP, FlushWriteRaces) {
+   Rados cluster;
+@@ -3827,21 +3647,8 @@
+     ASSERT_EQ(0, completion2->get_return_value());
+     completion->release();
+     completion2->release();
+   }
+-
+-  // tear down tiers
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+-    "\"}",
+-    inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+-    inbl, NULL, NULL));
+-
+-  // wait for maps to settle before next test
+-  cluster.wait_for_latest_osdmap();
+ }
+ 
+ TEST_F(LibRadosTwoPoolsECPP, TryFlushReadRace) {
+   // configure cache
+@@ -3902,21 +3709,8 @@
+   max_reads = 0;
+   while (num_reads > 0)
+     cond.Wait(test_lock);
+   test_lock.Unlock();
+-
+-  // tear down tiers
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+-    "\"}",
+-    inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(
+-    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+-    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+-    inbl, NULL, NULL));
+-
+-  // wait for maps to settle before next test
+-  cluster.wait_for_latest_osdmap();
+ }
+ 
+ TEST_F(LibRadosTierECPP, HitSetNone) {
+   {
+@@ -3937,23 +3731,30 @@
+     c->release();
+   }
+ }
+ 
+-TEST_F(LibRadosTierECPP, HitSetRead) {
+-  // enable hitset tracking for this pool
++TEST_F(LibRadosTwoPoolsECPP, HitSetRead) {
++  // make it a tier
+   bufferlist inbl;
+-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", 2),
++  ASSERT_EQ(0, cluster.mon_command(
++    "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
++    "\", \"tierpool\": \"" + cache_pool_name +
++    "\", \"force_nonempty\": \"--force-nonempty\" }",
++    inbl, NULL, NULL));
++
++  // enable hitset tracking for this pool
++  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", 2),
+ 						inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", 600),
++  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", 600),
+ 						inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type",
++  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type",
+ 						"explicit_object"),
+ 				   inbl, NULL, NULL));
+ 
+   // wait for maps to settle
+   cluster.wait_for_latest_osdmap();
+ 
+-  ioctx.set_namespace("");
++  cache_ioctx.set_namespace("");
+ 
+   // keep reading until we see our object appear in the HitSet
+   utime_t start = ceph_clock_now(NULL);
+   utime_t hard_stop = start + utime_t(600, 0);
+@@ -3962,18 +3763,18 @@
+     utime_t now = ceph_clock_now(NULL);
+     ASSERT_TRUE(now < hard_stop);
+ 
+     string name = "foo";
+-    uint32_t hash = ioctx.get_object_hash_position(name);
++    uint32_t hash = cache_ioctx.get_object_hash_position(name);
+     hobject_t oid(sobject_t(name, CEPH_NOSNAP), "", hash,
+-		  cluster.pool_lookup(pool_name.c_str()), "");
++		  cluster.pool_lookup(cache_pool_name.c_str()), "");
+ 
+     bufferlist bl;
+-    ASSERT_EQ(-ENOENT, ioctx.read("foo", bl, 1, 0));
++    ASSERT_EQ(-ENOENT, cache_ioctx.read("foo", bl, 1, 0));
+ 
+     bufferlist hbl;
+     AioCompletion *c = librados::Rados::aio_create_completion();
+-    ASSERT_EQ(0, ioctx.hit_set_get(hash, c, now.sec(), &hbl));
++    ASSERT_EQ(0, cache_ioctx.hit_set_get(hash, c, now.sec(), &hbl));
+     c->wait_for_complete();
+     c->release();
+ 
+     if (hbl.length()) {
+@@ -4068,27 +3869,34 @@
+   }
+ }
+ #endif
+ 
+-TEST_F(LibRadosTierECPP, HitSetTrim) {
++TEST_F(LibRadosTwoPoolsECPP, HitSetTrim) {
+   unsigned count = 3;
+   unsigned period = 3;
+ 
+-  // enable hitset tracking for this pool
++  // make it a tier
+   bufferlist inbl;
+-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", count),
++  ASSERT_EQ(0, cluster.mon_command(
++    "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
++    "\", \"tierpool\": \"" + cache_pool_name +
++    "\", \"force_nonempty\": \"--force-nonempty\" }",
++    inbl, NULL, NULL));
++
++  // enable hitset tracking for this pool
++  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", count),
+ 						inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", period),
++  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", period),
+ 						inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type", "bloom"),
++  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type", "bloom"),
+ 				   inbl, NULL, NULL));
+-  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_fpp", ".01"),
++  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_fpp", ".01"),
+ 				   inbl, NULL, NULL));
+ 
+   // wait for maps to settle
+   cluster.wait_for_latest_osdmap();
+ 
+-  ioctx.set_namespace("");
++  cache_ioctx.set_namespace("");
+ 
+   // do a bunch of writes and make sure the hitsets rotate
+   utime_t start = ceph_clock_now(NULL);
+   utime_t hard_stop = start + utime_t(count * period * 50, 0);
+@@ -4099,18 +3907,18 @@
+   memset(buf, 'f', bsize);
+ 
+   while (true) {
+     string name = "foo";
+-    uint32_t hash = ioctx.get_object_hash_position(name);
++    uint32_t hash = cache_ioctx.get_object_hash_position(name);
+     hobject_t oid(sobject_t(name, CEPH_NOSNAP), "", hash, -1, "");
+ 
+     bufferlist bl;
+     bl.append(buf, bsize);
+-    ASSERT_EQ(0, ioctx.append("foo", bl, bsize));
++    ASSERT_EQ(0, cache_ioctx.append("foo", bl, bsize));
+ 
+     list<pair<time_t, time_t> > ls;
+     AioCompletion *c = librados::Rados::aio_create_completion();
+-    ASSERT_EQ(0, ioctx.hit_set_list(hash, c, &ls));
++    ASSERT_EQ(0, cache_ioctx.hit_set_list(hash, c, &ls));
+     c->wait_for_complete();
+     c->release();
+ 
+     ASSERT_TRUE(ls.size() <= count + 1);
+--- a/src/test/objectstore/store_test.cc
++++ b/src/test/objectstore/store_test.cc
+@@ -1114,8 +1114,113 @@
+     ASSERT_EQ(1u, newomap.size());
+     ASSERT_TRUE(newomap.count("omap_key"));
+     ASSERT_TRUE(newomap["omap_key"].contents_equal(omap["omap_key"]));
+   }
++  {
++    ObjectStore::Transaction t;
++    t.remove(cid, oid);
++    t.remove_collection(cid);
++    t.remove_collection(temp_cid);
++    r = store->apply_transaction(t);
++    ASSERT_EQ(r, 0);
++  }
++}
++
++TEST_P(StoreTest, BigRGWObjectName) {
++  store->set_allow_sharded_objects();
++  store->sync_and_flush();
++  coll_t temp_cid("mytemp");
++  hobject_t temp_oid("tmp_oid", "", CEPH_NOSNAP, 0, 0, "");
++  coll_t cid("dest");
++  ghobject_t oid(
++    hobject_t(
++      "default.4106.50_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa [...]
++      "",
++      CEPH_NOSNAP,
++      0x81920472,
++      3,
++      ""),
++    15,
++    shard_id_t(1));
++  ghobject_t oid2(oid);
++  oid2.generation = 17;
++  ghobject_t oidhead(oid);
++  oidhead.generation = ghobject_t::NO_GEN;
++
++  int r;
++  {
++    ObjectStore::Transaction t;
++    t.create_collection(cid);
++    t.touch(cid, oidhead);
++    t.collection_move_rename(cid, oidhead, cid, oid);
++    t.touch(cid, oidhead);
++    t.collection_move_rename(cid, oidhead, cid, oid2);
++    r = store->apply_transaction(t);
++    ASSERT_EQ(r, 0);
++  }
++
++  {
++    ObjectStore::Transaction t;
++    t.remove(cid, oid);
++    r = store->apply_transaction(t);
++    ASSERT_EQ(r, 0);
++  }
++
++  {
++    vector<ghobject_t> objects;
++    r = store->collection_list(cid, objects);
++    ASSERT_EQ(r, 0);
++    ASSERT_EQ(objects.size(), 1u);
++    ASSERT_EQ(objects[0], oid2);
++  }
++
++  ASSERT_FALSE(store->exists(cid, oid));
++
++  {
++    ObjectStore::Transaction t;
++    t.remove(cid, oid2);
++    t.remove_collection(cid);
++    r = store->apply_transaction(t);
++    ASSERT_EQ(r, 0);
++
++  }
++}
++
++TEST_P(StoreTest, SetAllocHint) {
++  coll_t cid("alloc_hint");
++  ghobject_t hoid(hobject_t("test_hint", "", CEPH_NOSNAP, 0, 0, ""));
++  int r;
++  {
++    ObjectStore::Transaction t;
++    t.create_collection(cid);
++    t.touch(cid, hoid);
++    r = store->apply_transaction(t);
++    ASSERT_EQ(r, 0);
++  }
++  {
++    ObjectStore::Transaction t;
++    t.set_alloc_hint(cid, hoid, 4*1024*1024, 1024*4);
++    r = store->apply_transaction(t);
++    ASSERT_EQ(r, 0);
++  }
++  {
++    ObjectStore::Transaction t;
++    t.remove(cid, hoid);
++    r = store->apply_transaction(t);
++    ASSERT_EQ(r, 0);
++  }
++  {
++    ObjectStore::Transaction t;
++    t.set_alloc_hint(cid, hoid, 4*1024*1024, 1024*4);
++    r = store->apply_transaction(t);
++    ASSERT_EQ(r, 0);
++  }
++  {
++    ObjectStore::Transaction t;
++    t.remove_collection(cid);
++    r = store->apply_transaction(t);
++    ASSERT_EQ(r, 0);
++  }
+ }
+ 
+ INSTANTIATE_TEST_CASE_P(
+   ObjectStore,
+--- a/src/test/osd/TestOSDMap.cc
++++ b/src/test/osd/TestOSDMap.cc
+@@ -49,15 +49,26 @@
+       pending_inc.new_uuid[i] = sample_uuid;
+     }
+     osdmap.apply_incremental(pending_inc);
+ 
+-    // kludge to get an erasure coding rule and pool
++    // Create an EC ruleset and a pool using it
+     int r = osdmap.crush->add_simple_ruleset("erasure", "default", "osd",
+ 					     "indep", pg_pool_t::TYPE_ERASURE,
+ 					     &cerr);
+-    pg_pool_t *p = (pg_pool_t *)osdmap.get_pg_pool(2);
++
++    OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
++    new_pool_inc.new_pool_max = osdmap.get_pool_max();
++    new_pool_inc.fsid = osdmap.get_fsid();
++    pg_pool_t empty;
++    uint64_t pool_id = ++new_pool_inc.new_pool_max;
++    pg_pool_t *p = new_pool_inc.get_new_pool(pool_id, &empty);
++    p->size = 3;
++    p->set_pg_num(64);
++    p->set_pgp_num(64);
+     p->type = pg_pool_t::TYPE_ERASURE;
+     p->crush_ruleset = r;
++    new_pool_inc.new_pool_names[pool_id] = "ec";
++    osdmap.apply_incremental(new_pool_inc);
+   }
+   unsigned int get_num_osds() { return num_osds; }
+ 
+   void test_mappings(int pool,
+@@ -85,8 +96,50 @@
+   ASSERT_EQ(get_num_osds(), (unsigned)osdmap.get_max_osd());
+   ASSERT_EQ(get_num_osds(), osdmap.get_num_in_osds());
+ }
+ 
++TEST_F(OSDMapTest, Features) {
++  // with EC pool
++  set_up_map();
++  uint64_t features = osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
++  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
++  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
++  ASSERT_FALSE(features & CEPH_FEATURE_CRUSH_TUNABLES3);
++  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_V2);
++  ASSERT_TRUE(features & CEPH_FEATURE_OSD_ERASURE_CODES);
++  ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
++  ASSERT_FALSE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);
++
++  // clients have a slightly different view
++  features = osdmap.get_features(CEPH_ENTITY_TYPE_CLIENT, NULL);
++  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
++  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
++  ASSERT_FALSE(features & CEPH_FEATURE_CRUSH_TUNABLES3);
++  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_V2);
++  ASSERT_FALSE(features & CEPH_FEATURE_OSD_ERASURE_CODES);  // don't need this
++  ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
++  ASSERT_FALSE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);
++
++  // remove the EC pool, but leave the rule.  add primary affinity.
++  {
++    OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
++    new_pool_inc.old_pools.insert(osdmap.lookup_pg_pool_name("ec"));
++    new_pool_inc.new_primary_affinity[0] = 0x8000;
++    osdmap.apply_incremental(new_pool_inc);
++  }
++
++  features = osdmap.get_features(CEPH_ENTITY_TYPE_MON, NULL);
++  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
++  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
++  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES3); // shared bit with primary affinity
++  ASSERT_FALSE(features & CEPH_FEATURE_CRUSH_V2);
++  ASSERT_FALSE(features & CEPH_FEATURE_OSD_ERASURE_CODES);
++  ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
++  ASSERT_TRUE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);
++
++  // FIXME: test tiering feature bits
++}
++
+ TEST_F(OSDMapTest, MapPG) {
+   set_up_map();
+ 
+   pg_t rawpg(0, 0, -1);
+--- a/src/test/osd/osd-test-helpers.sh
++++ b/src/test/osd/osd-test-helpers.sh
+@@ -36,8 +36,9 @@
+     local ceph_args="$CEPH_ARGS"
+     ceph_args+=" --osd-journal-size=100"
+     ceph_args+=" --osd-data=$osd_data"
+     ceph_args+=" --chdir="
++    ceph_args+=" --osd-pool-default-erasure-code-directory=.libs"
+     ceph_args+=" --run-dir=$dir"
+     ceph_args+=" --debug-osd=20"
+     ceph_args+=" --log-file=$dir/osd-\$id.log"
+     ceph_args+=" --pid-file=$dir/osd-\$id.pidfile"
+--- a/src/test/strtol.cc
++++ b/src/test/strtol.cc
+@@ -13,8 +13,9 @@
+  */
+ 
+ #include "common/strtol.h"
+ #include <string>
++#include <map>
+ 
+ #include "gtest/gtest.h"
+ 
+ static void test_strict_strtoll(const char *str, long long expected)
+@@ -133,4 +134,78 @@
+   test_strict_strtod_err("34.0 garbo");
+ 
+   test_strict_strtof_err("0.05.0");
+ }
++
++
++static void test_strict_sistrtoll(const char *str)
++{
++  std::string err;
++  strict_sistrtoll(str, &err);
++  ASSERT_EQ(err, "");
++}
++
++static void test_strict_sistrtoll_units(const std::string& foo,
++                                      char u, const int m)
++{
++  std::string s(foo);
++  s.push_back(u);
++  const char *str = s.c_str();
++  std::string err;
++  uint64_t r = strict_sistrtoll(str, &err);
++  ASSERT_EQ(err, "");
++
++  str = foo.c_str();
++  std::string err2;
++  long long tmp = strict_strtoll(str, 10, &err2);
++  ASSERT_EQ(err2, "");
++  tmp = (tmp << m);
++  ASSERT_EQ(tmp, (long long)r);
++}
++
++TEST(SIStrToLL, WithUnits) {
++  std::map<char,int> units;
++  units['B'] = 0;
++  units['K'] = 10;
++  units['M'] = 20;
++  units['G'] = 30;
++  units['T'] = 40;
++  units['P'] = 50;
++  units['E'] = 60;
++
++  for (std::map<char,int>::iterator p = units.begin();
++       p != units.end(); ++p) {
++    test_strict_sistrtoll_units("1024", p->first, p->second);
++    test_strict_sistrtoll_units("1", p->first, p->second);
++    test_strict_sistrtoll_units("0", p->first, p->second);
++  }
++}
++
++TEST(SIStrToLL, WithoutUnits) {
++  test_strict_sistrtoll("1024");
++  test_strict_sistrtoll("1152921504606846976");
++  test_strict_sistrtoll("0");
++}
++
++static void test_strict_sistrtoll_err(const char *str)
++{
++  std::string err;
++  strict_sistrtoll(str, &err);
++  ASSERT_NE(err, "");
++}
++
++TEST(SIStrToLL, Error) {
++  test_strict_sistrtoll_err("1024F");
++  test_strict_sistrtoll_err("QDDSA");
++  test_strict_sistrtoll_err("1b");
++  test_strict_sistrtoll_err("100k");
++  test_strict_sistrtoll_err("1000m");
++  test_strict_sistrtoll_err("1g");
++  test_strict_sistrtoll_err("20t");
++  test_strict_sistrtoll_err("100p");
++  test_strict_sistrtoll_err("1000e");
++  test_strict_sistrtoll_err("B");
++  test_strict_sistrtoll_err("M");
++  test_strict_sistrtoll_err("BM");
++  test_strict_sistrtoll_err("B0wef");
++  test_strict_sistrtoll_err("0m");
++}
diff --git a/debian/patches/fix-blkdev-BLKGETSIZE-check.patch b/debian/patches/fix-blkdev-BLKGETSIZE-check.patch
deleted file mode 100644
index bef37bb..0000000
--- a/debian/patches/fix-blkdev-BLKGETSIZE-check.patch
+++ /dev/null
@@ -1,35 +0,0 @@
-Last-Update: 2014-08-24
-Forwarded: https://github.com/ceph/ceph/pull/2311
-From: Michael Cree <mcree at orcon.net.nz>
-Bug-Debian: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=756892
-Description: fix FTBFS on alpha due to incorrect check on BLKGETSIZE
- Ceph FTBFS on Alpha with:
-
- ~~~~
- libtool: compile:  g++ -DHAVE_CONFIG_H -I. -D__CEPH__ -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE -D__STDC_FORMAT_MACROS -D_GNU_SOURCE -DCEPH_LIBDIR=\"/usr/lib/alpha-linux-gnu\" -DCEPH_PKGLIBDIR=\"/usr/lib/alpha-linux-gnu/ceph\" -DGTEST_HAS_TR1_TUPLE=0 -D_FORTIFY_SOURCE=2 -I/usr/include/nss -I/usr/include/nspr -Wall -Wtype-limits -Wignored-qualifiers -Winit-self -Wpointer-arith -Werror=format-security -fno-strict-aliasing -fsigned-char -rdynamic -ftemplate-depth-1024 -Wnon-virtua [...]
- In file included from /usr/include/alpha-linux-gnu/asm/ioctls.h:4:0,
-                  from /usr/include/alpha-linux-gnu/bits/ioctls.h:23,
-                  from /usr/include/alpha-linux-gnu/sys/ioctl.h:26,
-                  from common/blkdev.cc:3:
- common/blkdev.cc:13:7: error: missing binary operator before token "int"
-  #elif BLKGETSIZE
-        ^
- ~~~~
-
- This error occurs because the value of BLKGETSIZE is tested in a
- c-preprocessor conditional compilation test whereas the test should
- be for existence.
-
---- a/src/common/blkdev.cc
-+++ b/src/common/blkdev.cc
-@@ -9,9 +9,9 @@
- int get_block_device_size(int fd, int64_t *psize)
- {
- #ifdef BLKGETSIZE64
-   int ret = ::ioctl(fd, BLKGETSIZE64, psize);
--#elif BLKGETSIZE
-+#elif defined(BLKGETSIZE)
-   unsigned long sectors = 0;
-   int ret = ::ioctl(fd, BLKGETSIZE, &sectors);
-   *psize = sectors * 512ULL;
- #else
diff --git a/debian/patches/series b/debian/patches/series
index 24098c2..b46c8ca 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -1,4 +1,5 @@
 ## Backported / Upstream
+firefly-latest.patch
 bug-8342.patch
 bug-8624a.patch
 bug-8624b.patch
@@ -9,7 +10,6 @@ client-sleep3.patch
 sleep-recover.patch
 backfill-prio.patch
 bash-completion.patch
-ceph-ao-require-cas.patch
 rbdmap1-mount.patch
 rbdmap2-hooks.patch
 
@@ -19,4 +19,3 @@ arch.patch
 modules.patch
 sample.ceph.conf.patch
 virtualenv-never-download.patch
-fix-blkdev-BLKGETSIZE-check.patch
diff --git a/debian/patches/sleep-recover.patch b/debian/patches/sleep-recover.patch
index 23c42ce..bf89722 100644
--- a/debian/patches/sleep-recover.patch
+++ b/debian/patches/sleep-recover.patch
@@ -6,7 +6,8 @@ Description: fix fuse-client hang after wake-up from suspend.
 
 --- a/src/client/Client.cc
 +++ b/src/client/Client.cc
-@@ -9015,6 +9015,7 @@ void Client::ms_handle_remote_reset(Conn
+@@ -9048,8 +9048,9 @@
+ 
  	case MetaSession::STATE_OPEN:
  	  ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
  	  s->state = MetaSession::STATE_STALE;
@@ -14,3 +15,4 @@ Description: fix fuse-client hang after wake-up from suspend.
  	  break;
  
  	case MetaSession::STATE_NEW:
+ 	case MetaSession::STATE_CLOSED:

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ceph/ceph.git