[Pkg-ceph-commits] [ceph] 01/01: New upstream release [0.80.6]

Dmitry Smirnov onlyjob at moszumanska.debian.org
Thu Oct 2 16:07:46 UTC 2014


This is an automated email from the git hooks/post-receive script.

onlyjob pushed a commit to branch master
in repository ceph.

commit 7aac2f4 (HEAD, master)
Author: Dmitry Smirnov <onlyjob at member.fsf.org>
Date:   Thu Oct 2 14:00:36 2014

    New upstream release [0.80.6]
---
 debian/changelog                    |     7 +
 debian/copyright                    |    19 +-
 debian/patches/backfill-prio.patch  |    10 +-
 debian/patches/bug-8821.patch       |     2 +-
 debian/patches/firefly-latest.patch | 11188 +---------------------------------
 5 files changed, 43 insertions(+), 11183 deletions(-)

diff --git a/debian/changelog b/debian/changelog
index c25f4db..37b801f 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,10 @@
+ceph (0.80.6-1) unstable; urgency=medium
+
+  * New upstream release [October 2014].
+  * Standards-Version: 3.9.6.
+
+ -- Dmitry Smirnov <onlyjob at debian.org>  Thu, 02 Oct 2014 23:07:04 +1000
+
 ceph (0.80.5-2) unstable; urgency=low
 
   * Patchworks:
diff --git a/debian/copyright b/debian/copyright
index 462177b..da07f5f 100644
--- a/debian/copyright
+++ b/debian/copyright
@@ -8,6 +8,7 @@ Copyright: 2004-2013 Sage Weil <sage at newdream.net>
            2004-2014 Inktank <info at inktank.com>
                      Inktank, Inc
                      Inktank Storage, Inc.
+           2012-2014 Red Hat <contact at redhat.com>
            2013-2014 Cloudwatt <libre.licensing at cloudwatt.com>
            2004-2011 Dreamhost
            2013      eNovance SAS <licensing at enovance.com>
@@ -16,12 +17,11 @@ Copyright: 2004-2013 Sage Weil <sage at newdream.net>
            2014      John Spray <john.spray at inktank.com
            2004-2012 New Dream Network
            2011      Stanislav Sedov <stas at FreeBSD.org>
-           2013      UnitedStack <haomai at unitedstack.com>
+           2013-2014 UnitedStack <haomai at unitedstack.com>
            2011      Wido den Hollander <wido at widodh.nl>
 License: LGPL-2.1
 
-Files: src/erasure-code/jerasure/vectorop.h
-       src/erasure-code/jerasure/ErasureCode*
+Files: src/erasure-code/jerasure/ErasureCode*
        src/erasure-code/ErasureCode*
        src/include/str_map.h
        src/test/common/test_str_map.cc
@@ -36,7 +36,7 @@ Files: src/mount/canonicalize.c
        src/test/common/test_config.cc
        src/test/crush/TestCrushWrapper.cc
        src/test/common/Throttle.cc
-       src/test/filestore/chain_xattr.cc
+       src/test/objectstore/chain_xattr.cc
        src/test/mon/mon-test-helpers.sh
        src/test/objectstore/chain_xattr.cc
        src/test/osd/osd-test-helpers.sh
@@ -53,7 +53,7 @@ Copyright: 2007 Oracle.  All rights reserved.
            2014 Inktank <info at inktank.com>
 License: GPL-2
 
-Files: src/include/ceph_hash.cc
+Files: src/common/ceph_hash.cc
 Copyright: 1995-1997 Robert J. Jenkins Jr.
 License: public-domain
   This file uses Robert Jenkin's hash function as detailed at:
@@ -177,7 +177,7 @@ License: BSD-3-clause
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  THE POSSIBILITY OF SUCH DAMAGE.
 
-Files: src/ceph/ceph-0.79/src/erasure-code/jerasure/gf-complete/*
+Files: src/erasure-code/jerasure/gf-complete/*/*
 Copyright: 2013 James S. Plank
                 Ethan L. Miller
                 Kevin M. Greenan
@@ -217,12 +217,7 @@ License: BSD-3-clause
 Comment:
  https://bitbucket.org/jimplank/gf-complete
 
-Files: src/erasure-code/jerasure/cauchy.*
-       src/erasure-code/jerasure/galois.*
-       src/erasure-code/jerasure/jerasure.*
-       src/erasure-code/jerasure/liberation.*
-       src/erasure-code/jerasure/reed_sol.*
-       src/erasure-code/jerasure/jerasure/*
+Files: src/erasure-code/jerasure/jerasure/*/*
 Copyright: 2011-2013 James S. Plank <plank at cs.utk.edu>
            2013      Kevin Greenan
 License: BSD-3-clause
diff --git a/debian/patches/backfill-prio.patch b/debian/patches/backfill-prio.patch
index ae3669e..c163aeb 100644
--- a/debian/patches/backfill-prio.patch
+++ b/debian/patches/backfill-prio.patch
@@ -151,7 +151,7 @@ Date:   Tue Jun 24 02:09:49 2014
  
 --- a/src/osd/PG.cc
 +++ b/src/osd/PG.cc
-@@ -1873,8 +1873,28 @@
+@@ -1885,8 +1885,28 @@
  
    dirty_info = true;
  }
@@ -180,7 +180,7 @@ Date:   Tue Jun 24 02:09:49 2014
  {
    dout(10) << "finish_recovery" << dendl;
    assert(info.last_complete == info.last_update);
-@@ -5839,15 +5859,14 @@
+@@ -5852,15 +5872,14 @@
      ConnectionRef con = pg->osd->get_con_osd_cluster(
        backfill_osd_it->osd, pg->get_osdmap()->get_epoch());
      if (con) {
@@ -198,7 +198,7 @@ Date:   Tue Jun 24 02:09:49 2014
        } else {
          post_event(RemoteBackfillReserved());
        }
-@@ -5914,10 +5933,10 @@
+@@ -5927,10 +5946,10 @@
    pg->osd->local_reserver.request_reservation(
      pg->info.pgid,
      new QueuePeeringEvt<LocalBackfillReserved>(
@@ -211,7 +211,7 @@ Date:   Tue Jun 24 02:09:49 2014
  
  void PG::RecoveryState::WaitLocalBackfillReserved::exit()
  {
-@@ -5982,9 +6001,10 @@
+@@ -5995,9 +6014,10 @@
    pg->osd->remote_reserver.request_reservation(
      pg->info.pgid,
      new QueuePeeringEvt<RemoteRecoveryReserved>(
@@ -223,7 +223,7 @@ Date:   Tue Jun 24 02:09:49 2014
  
  boost::statechart::result
  PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
-@@ -6123,9 +6143,10 @@
+@@ -6136,9 +6156,10 @@
    pg->osd->local_reserver.request_reservation(
      pg->info.pgid,
      new QueuePeeringEvt<LocalRecoveryReserved>(
diff --git a/debian/patches/bug-8821.patch b/debian/patches/bug-8821.patch
index fe8b99c..d2abce2 100644
--- a/debian/patches/bug-8821.patch
+++ b/debian/patches/bug-8821.patch
@@ -30,7 +30,7 @@ Subject: [PATCH 3/3] rbd: respect rbd_default_* parameters
 
 --- a/src/common/config_opts.h
 +++ b/src/common/config_opts.h
-@@ -748,10 +748,10 @@
+@@ -749,10 +749,10 @@
   * affected by rbd_default_order.
   */
  OPTION(rbd_default_format, OPT_INT, 1)
diff --git a/debian/patches/firefly-latest.patch b/debian/patches/firefly-latest.patch
index 8414fc0..b042b23 100644
--- a/debian/patches/firefly-latest.patch
+++ b/debian/patches/firefly-latest.patch
@@ -1,11172 +1,30 @@
-Last-Update: 2014-09-16
+Last-Update: 2014-10-02
 Forwarded: not-needed
 Origin: upstream
 Author: Dmitry Smirnov <onlyjob at member.fsf.org>
-Description: fixes from "firefly" branch since 0.80.5 release
+Description: fixes from "firefly" branch since 0.80.6 release
 
---- a/configure.ac
-+++ b/configure.ac
-@@ -471,11 +471,16 @@
-              [AC_MSG_FAILURE(
-                    [no libatomic-ops found (use --without-libatomic-ops to disable)])
-               ])])
- AS_IF([test "$HAVE_ATOMIC_OPS" = "1"],
--	[],
-+	[
-+         AC_CHECK_SIZEOF(AO_t, [], [
-+                                #include <atomic_ops.h>
-+                                ])
-+         ],
- 	[AC_DEFINE([NO_ATOMIC_OPS], [1], [Defined if you do not have atomic_ops])])
- 
-+
- AM_CONDITIONAL(WITH_LIBATOMIC, [test "$HAVE_ATOMIC_OPS" = "1"])
- 
- # newsyn?  requires mpi.
- #AC_ARG_WITH([newsyn],
---- /dev/null
-+++ b/doc/_templates/layout.html
-@@ -0,0 +1,5 @@
-+{% extends "!layout.html" %}
-+
-+{%- block extrahead %}
-+    <script type="text/javascript" src="http://ayni.ceph.com/public/js/ceph.js"></script>
-+{% endblock %}
---- a/src/ceph-disk
-+++ b/src/ceph-disk
-@@ -118,8 +118,11 @@
- STATEDIR = '/var/lib/ceph'
- 
- SYSCONFDIR = '/etc/ceph'
- 
-+# only warn once about some things
-+warned_about = {}
-+
- # Nuke the TERM variable to avoid confusing any subprocesses we call.
- # For example, libreadline will print weird control sequences for some
- # TERM values.
- if 'TERM' in os.environ:
-@@ -130,10 +133,8 @@
-     LOG_NAME = os.path.basename(sys.argv[0])
- LOG = logging.getLogger(LOG_NAME)
- 
- 
--
--
- ###### lock ########
- 
- class filelock(object):
-     def __init__(self, fn):
-@@ -149,10 +150,12 @@
-         assert self.fd
-         fcntl.lockf(self.fd, fcntl.LOCK_UN)
-         self.fd = None
- 
-+
- ###### exceptions ########
- 
-+
- class Error(Exception):
-     """
-     Error
-     """
-@@ -160,51 +163,60 @@
-     def __str__(self):
-         doc = self.__doc__.strip()
-         return ': '.join([doc] + [str(a) for a in self.args])
- 
-+
- class MountError(Error):
-     """
-     Mounting filesystem failed
-     """
- 
-+
- class UnmountError(Error):
-     """
-     Unmounting filesystem failed
-     """
- 
-+
- class BadMagicError(Error):
-     """
-     Does not look like a Ceph OSD, or incompatible version
-     """
- 
-+
- class TruncatedLineError(Error):
-     """
-     Line is truncated
-     """
- 
-+
- class TooManyLinesError(Error):
-     """
-     Too many lines
-     """
- 
-+
- class FilesystemTypeError(Error):
-     """
-     Cannot discover filesystem type
-      """
- 
-+
- class CephDiskException(Exception):
-     """
-     A base exception for ceph-disk to provide custom (ad-hoc) messages that
-     will be caught and dealt with when main() is executed
-     """
-     pass
- 
-+
- class ExecutableNotFound(CephDiskException):
-     """
-     Exception to report on executables not available in PATH
-     """
-     pass
- 
-+
- ####### utils
- 
- 
- def maybe_mkdir(*a, **kw):
-@@ -299,9 +311,9 @@
-     of making sure that executables *will* be found and will error nicely
-     otherwise.
-     """
-     arguments = _get_command_executable(arguments)
--    LOG.info('Running command: %s' % ' '.join(arguments))
-+    LOG.info('Running command: %s', ' '.join(arguments))
-     return subprocess.check_call(arguments)
- 
- 
- def platform_distro():
-@@ -339,35 +351,67 @@
-         str(codename).strip()
-     )
- 
- 
--# a device "name" is something like
--#  sdb
--#  cciss!c0d1
- def get_dev_name(path):
-     """
--    get device name from path.  e.g., /dev/sda -> sdas, /dev/cciss/c0d1 -> cciss!c0d1
-+    get device name from path.  e.g.::
-+
-+        /dev/sda -> sdas, /dev/cciss/c0d1 -> cciss!c0d1
-+
-+    a device "name" is something like::
-+
-+        sdb
-+        cciss!c0d1
-+
-     """
-     assert path.startswith('/dev/')
-     base = path[5:]
-     return base.replace('/', '!')
- 
--# a device "path" is something like
--#  /dev/sdb
--#  /dev/cciss/c0d1
-+
- def get_dev_path(name):
-     """
-     get a path (/dev/...) from a name (cciss!c0d1)
-+    a device "path" is something like::
-+
-+        /dev/sdb
-+        /dev/cciss/c0d1
-+
-     """
-     return '/dev/' + name.replace('!', '/')
- 
-+
- def get_dev_relpath(name):
-     """
-     get a relative path to /dev from a name (cciss!c0d1)
-     """
-     return name.replace('!', '/')
- 
- 
-+def get_dev_size(dev, size='megabytes'):
-+    """
-+    Attempt to get the size of a device so that we can prevent errors
-+    from actions to devices that are smaller, and improve error reporting.
-+
-+    Because we want to avoid breakage in case this approach is not robust, we
-+    will issue a warning if we failed to get the size.
-+
-+    :param size: bytes or megabytes
-+    :param dev: the device to calculate the size
-+    """
-+    fd = os.open(dev, os.O_RDONLY)
-+    dividers = {'bytes': 1, 'megabytes': 1024*1024}
-+    try:
-+        device_size = os.lseek(fd, 0, os.SEEK_END)
-+        divider = dividers.get(size, 1024*1024)  # default to megabytes
-+        return device_size/divider
-+    except Exception as error:
-+        LOG.warning('failed to get size of %s: %s' % (dev, str(error)))
-+    finally:
-+        os.close(fd)
-+
-+
- def get_partition_dev(dev, pnum):
-     """
-     get the device name for a partition
- 
-@@ -388,8 +432,9 @@
-         return get_dev_path(partname)
-     else:
-         raise Error('partition %d for %s does not appear to exist' % (pnum, dev))
- 
-+
- def list_all_partitions():
-     """
-     Return a list of devices and partitions
-     """
-@@ -402,8 +447,9 @@
-             continue
-         dev_part_list[name] = list_partitions(name)
-     return dev_part_list
- 
-+
- def list_partitions(basename):
-     """
-     Return a list of partitions on the given device name
-     """
-@@ -412,8 +458,25 @@
-         if name.startswith(basename):
-             partitions.append(name)
-     return partitions
- 
-+def get_partition_base(dev):
-+    """
-+    Get the base device for a partition
-+    """
-+    dev = os.path.realpath(dev)
-+    if not stat.S_ISBLK(os.lstat(dev).st_mode):
-+        raise Error('not a block device', dev)
-+
-+    name = get_dev_name(dev)
-+    if os.path.exists(os.path.join('/sys/block', name)):
-+        raise Error('not a partition', dev)
-+
-+    # find the base
-+    for basename in os.listdir('/sys/block'):
-+        if os.path.exists(os.path.join('/sys/block', basename, name)):
-+            return '/dev/' + basename
-+    raise Error('no parent device for partition', dev)
- 
- def is_partition(dev):
-     """
-     Check whether a given device path is a partition or a full disk.
-@@ -475,23 +538,23 @@
-         base = base[:-1]
-     return []
- 
- 
--def verify_not_in_use(dev):
-+def verify_not_in_use(dev, check_partitions=False):
-     """
-     Verify if a given device (path) is in use (e.g. mounted or
-     in use by device-mapper).
- 
-     :raises: Error if device is in use.
-     """
-     assert os.path.exists(dev)
--    if is_partition(dev):
--        if is_mounted(dev):
--            raise Error('Device is mounted', dev)
--        holders = is_held(dev)
--        if holders:
--            raise Error('Device is in use by a device-mapper mapping (dm-crypt?)' % dev, ','.join(holders))
--    else:
-+    if is_mounted(dev):
-+        raise Error('Device is mounted', dev)
-+    holders = is_held(dev)
-+    if holders:
-+        raise Error('Device is in use by a device-mapper mapping (dm-crypt?)' % dev, ','.join(holders))
-+
-+    if check_partitions and not is_partition(dev):
-         basename = get_dev_name(os.path.realpath(dev))
-         for partname in list_partitions(basename):
-             partition = get_dev_path(partname)
-             if is_mounted(partition):
-@@ -535,12 +598,14 @@
- 
-     try:
-         line = must_be_one_line(line)
-     except (TruncatedLineError, TooManyLinesError) as e:
--        raise Error('File is corrupt: {path}: {msg}'.format(
-+        raise Error(
-+            'File is corrupt: {path}: {msg}'.format(
-                 path=path,
-                 msg=e,
--                ))
-+            )
-+        )
-     return line
- 
- 
- def write_one_line(parent, name, text):
-@@ -745,9 +810,9 @@
-     Maps a device to a dmcrypt device.
- 
-     :return: Path to the dmcrypt device.
-     """
--    dev = '/dev/mapper/'+ _uuid
-+    dev = '/dev/mapper/' + _uuid
-     args = [
-         'cryptsetup',
-         '--key-file',
-         keypath,
-@@ -791,8 +856,14 @@
-     """
-     Mounts a device with given filessystem type and
-     mount options to a tempfile path under /var/lib/ceph/tmp.
-     """
-+    # sanity check: none of the arguments are None
-+    if dev is None:
-+        raise ValueError('dev may not be None')
-+    if fstype is None:
-+        raise ValueError('fstype may not be None')
-+
-     # pick best-of-breed mount options based on fs type
-     if options is None:
-         options = MOUNT_OPTIONS.get(fstype, '')
- 
-@@ -966,8 +1037,17 @@
-             size=journal_size,
-             )
-         LOG.warning('OSD will not be hot-swappable if journal is not the same device as the osd data')
- 
-+    dev_size = get_dev_size(journal)
-+
-+    if journal_size > dev_size:
-+        LOG.error('refusing to create journal on %s' % journal)
-+        LOG.error('journal size (%sM) is bigger than device (%sM)' % (journal_size, dev_size))
-+        raise Error(
-+            '%s device size (%sM) is not big enough for journal' % (journal, dev_size)
-+        )
-+
-     try:
-         LOG.debug('Creating journal partition num %d size %d on %s', num, journal_size, journal)
-         command_check_call(
-             [
-@@ -1043,9 +1123,9 @@
-     journal):
- 
-     if not os.path.exists(journal):
-         LOG.debug('Creating journal file %s with size 0 (ceph-osd will resize and allocate)', journal)
--        with file(journal, 'wb') as journal_file:
-+        with file(journal, 'wb') as journal_file:  # noqa
-             pass
- 
-     LOG.debug('Journal is file %s', journal)
-     LOG.warning('OSD will not be hot-swappable if journal is not the same device as the osd data')
-@@ -1109,15 +1189,16 @@
-             os.symlink(target, path)
-         except:
-             raise Error('unable to create symlink %s -> %s' % (path, target))
- 
-+
- def prepare_dir(
-     path,
-     journal,
-     cluster_uuid,
-     osd_uuid,
-     journal_uuid,
--    journal_dmcrypt = None,
-+    journal_dmcrypt=None,
-     ):
- 
-     if os.path.exists(os.path.join(path, 'magic')):
-         LOG.debug('Data dir %s already exists', path)
-@@ -1182,11 +1263,8 @@
-     if is_partition(data):
-         LOG.debug('OSD data device %s is a partition', data)
-         rawdev = data
-     else:
--        if journal_dmcrypt is not None:
--            dmcrypt_unmap(journal)
--
-         LOG.debug('Creating osd partition on %s', data)
-         try:
-             command_check_call(
-                 [
-@@ -1237,11 +1315,11 @@
-                 args.extend(['-f'])  # always force
-         else:
-             args.extend(MKFS_ARGS.get(fstype, []))
-         args.extend([
--                '--',
--                dev,
--                ])
-+            '--',
-+            dev,
-+            ])
-         try:
-             LOG.debug('Creating %s fs on %s', fstype, dev)
-             command_check_call(args)
-         except subprocess.CalledProcessError as e:
-@@ -1266,10 +1344,8 @@
-             unmount(path)
-     finally:
-         if rawdev != dev:
-             dmcrypt_unmap(osd_uuid)
--        if journal_dmcrypt is not None:
--            dmcrypt_unmap(journal)
- 
-     if not is_partition(data):
-         try:
-             command_check_call(
-@@ -1288,9 +1364,9 @@
-     journal_dm_keypath = None
-     osd_dm_keypath = None
- 
-     try:
--        prepare_lock.acquire()
-+        prepare_lock.acquire()  # noqa
-         if not os.path.exists(args.data):
-             if args.data_dev:
-                 raise Error('data path does not exist', args.data)
-             else:
-@@ -1298,14 +1374,14 @@
- 
-         # in use?
-         dmode = os.stat(args.data).st_mode
-         if stat.S_ISBLK(dmode):
--            verify_not_in_use(args.data)
-+            verify_not_in_use(args.data, True)
- 
-         if args.journal and os.path.exists(args.journal):
-             jmode = os.stat(args.journal).st_mode
-             if stat.S_ISBLK(jmode):
--                verify_not_in_use(args.journal)
-+                verify_not_in_use(args.journal, False)
- 
-         if args.zap_disk is not None:
-             if stat.S_ISBLK(dmode) and not is_partition(args.data):
-                 zap(args.data)
-@@ -1420,9 +1496,9 @@
-                 osd_dm_keypath=osd_dm_keypath,
-                 )
-         else:
-             raise Error('not a dir or block device', args.data)
--        prepare_lock.release()
-+        prepare_lock.release()  # noqa
- 
-         if stat.S_ISBLK(dmode):
-             # try to make sure the kernel refreshes the table.  note
-             # that if this gets ebusy, we are probably racing with
-@@ -1456,9 +1532,9 @@
-         if journal_dm_keypath:
-             os.unlink(journal_dm_keypath)
-         if osd_dm_keypath:
-             os.unlink(osd_dm_keypath)
--        prepare_lock.release()
-+        prepare_lock.release()  # noqa
-         raise e
- 
- 
- ###########################
-@@ -1622,20 +1698,23 @@
-             command_check_call(
-                 [
-                     svc,
-                     'ceph',
-+                    '--cluster',
-+                    '{cluster}'.format(cluster=cluster),
-                     'start',
-                     'osd.{osd_id}'.format(osd_id=osd_id),
-                     ],
-                 )
-         else:
-             raise Error('{cluster} osd.{osd_id} is not tagged with an init system'.format(
--                    cluster=cluster,
--                    osd_id=osd_id,
--                    ))
-+                cluster=cluster,
-+                osd_id=osd_id,
-+            ))
-     except subprocess.CalledProcessError as e:
-         raise Error('ceph osd start failed', e)
- 
-+
- def detect_fstype(
-     dev,
-     ):
-     fstype = _check_output(
-@@ -1703,10 +1782,10 @@
-         other = False
-         src_dev = os.stat(path).st_dev
-         try:
-             dst_dev = os.stat((STATEDIR + '/osd/{cluster}-{osd_id}').format(
--                    cluster=cluster,
--                    osd_id=osd_id)).st_dev
-+                cluster=cluster,
-+                osd_id=osd_id)).st_dev
-             if src_dev == dst_dev:
-                 active = True
-             else:
-                 parent_dev = os.stat(STATEDIR + '/osd').st_dev
-@@ -1759,9 +1838,9 @@
-             )
- 
-     (osd_id, cluster) = activate(path, activate_key_template, init)
- 
--    if init not in ( None, 'none' ):
-+    if init not in (None, 'none' ):
-         canonical = (STATEDIR + '/osd/{cluster}-{osd_id}').format(
-             cluster=cluster,
-             osd_id=osd_id)
-         if path != canonical:
-@@ -1814,8 +1893,9 @@
-         LOG.warning('No fsid defined in ' + SYSCONFDIR + '/ceph.conf; using anyway')
-         return 'ceph'
-     return None
- 
-+
- def activate(
-     path,
-     activate_key_template,
-     init,
-@@ -1860,9 +1940,9 @@
-             fsid=fsid,
-             keyring=keyring,
-             )
- 
--    if init not in ( None, 'none' ):
-+    if init not in (None, 'none' ):
-         if init == 'auto':
-             conf_val = get_conf(
-                 cluster=cluster,
-                 variable='init'
-@@ -1911,9 +1991,9 @@
-     if is_suppressed(args.path):
-         LOG.info('suppressed activate request on %s', args.path)
-         return
- 
--    activate_lock.acquire()
-+    activate_lock.acquire()  # noqa
-     try:
-         mode = os.stat(args.path).st_mode
-         if stat.S_ISBLK(mode):
-             (cluster, osd_id) = mount_activate(
-@@ -1931,9 +2011,9 @@
- 
-             if args.mark_init == 'none':
-                 command_check_call(
-                     [
--                    'ceph-osd',
-+                        'ceph-osd',
-                         '--cluster={cluster}'.format(cluster=cluster),
-                         '--id={osd_id}'.format(osd_id=osd_id),
-                         '--osd-data={path}'.format(path=args.path),
-                         '--osd-journal={path}/journal'.format(path=args.path),
-@@ -1942,17 +2022,17 @@
- 
-         else:
-             raise Error('%s is not a directory or block device' % args.path)
- 
--        if args.mark_init not in ( None, 'none' ):
-+        if args.mark_init not in (None, 'none' ):
- 
-             start_daemon(
-                 cluster=cluster,
-                 osd_id=osd_id,
-             )
- 
-     finally:
--        activate_lock.release()
-+        activate_lock.release()  # noqa
- 
- 
- ###########################
- 
-@@ -1983,16 +2063,17 @@
-     value = str(out).split('\n', 1)[0]
-     LOG.debug('Journal %s has OSD UUID %s', path, value)
-     return value
- 
-+
- def main_activate_journal(args):
-     if not os.path.exists(args.dev):
-         raise Error('%s does not exist' % args.dev)
- 
-     cluster = None
-     osd_id = None
-     osd_uuid = None
--    activate_lock.acquire()
-+    activate_lock.acquire()  # noqa
-     try:
-         osd_uuid = get_journal_osd_uuid(args.dev)
-         path = os.path.join('/dev/disk/by-partuuid/', osd_uuid.lower())
- 
-@@ -2007,12 +2088,14 @@
-             osd_id=osd_id,
-             )
- 
-     finally:
--        activate_lock.release()
-+        activate_lock.release()  # noqa
-+
- 
- ###########################
- 
-+
- def main_activate_all(args):
-     dir = '/dev/disk/by-parttypeuuid'
-     LOG.debug('Scanning %s', dir)
-     if not os.path.exists(dir):
-@@ -2021,12 +2104,18 @@
-     for name in os.listdir(dir):
-         if name.find('.') < 0:
-             continue
-         (tag, uuid) = name.split('.')
--        if tag == OSD_UUID:
--            path = os.path.join(dir, name)
-+
-+        if tag == OSD_UUID or tag == DMCRYPT_OSD_UUID:
-+
-+            if tag == DMCRYPT_OSD_UUID:
-+                path = os.path.join('/dev/mapper', uuid)
-+            else:
-+                path = os.path.join(dir, name)
-+
-             LOG.info('Activating %s', path)
--            activate_lock.acquire()
-+            activate_lock.acquire()  # noqa
-             try:
-                 (cluster, osd_id) = mount_activate(
-                     dev=path,
-                     activate_key_template=args.activate_key_template,
-@@ -2044,9 +2133,9 @@
-                     )
-                 err = True
- 
-             finally:
--                activate_lock.release()
-+                activate_lock.release()  # noqa
-     if err:
-         raise Error('One or more partitions failed to activate')
- 
- 
-@@ -2065,15 +2154,17 @@
-                 if swaps_dev == dev:
-                     return True
-     return False
- 
-+
- def get_oneliner(base, name):
-     path = os.path.join(base, name)
-     if os.path.isfile(path):
-         with open(path, 'r') as _file:
-             return _file.readline().rstrip()
-     return None
- 
-+
- def get_dev_fs(dev):
-     fscheck, _ = command(
-         [
-             'blkid',
-@@ -2087,9 +2178,58 @@
-         return fstype
-     else:
-         return None
- 
-+
- def get_partition_type(part):
-+    """
-+    Get the GPT partition type UUID.  If we have an old blkid and can't
-+    get it that way, use sgdisk and use the description instead (and hope
-+    dmcrypt isn't being used).
-+    """
-+    blkid, _ = command(
-+        [
-+            'blkid',
-+            '-p',
-+            '-o', 'udev',
-+            part,
-+        ]
-+    )
-+    saw_part_entry = False
-+    for line in blkid.splitlines():
-+        (key, value) = line.split('=')
-+        if key == 'ID_PART_ENTRY_TYPE':
-+            return value
-+        if key == 'ID_PART_ENTRY_SCHEME':
-+            table_type = value
-+        if key.startswith('ID_PART_ENTRY_'):
-+            saw_part_entry = True
-+
-+    # hmm, is it in fact GPT?
-+    table_type = None
-+    base = get_partition_base(part)
-+    blkid, _ = command(
-+        [
-+            'blkid',
-+            '-p',
-+            '-o', 'udev',
-+            base
-+        ]
-+    )
-+    for line in blkid.splitlines():
-+        (key, value) = line.split('=')
-+        if key == 'ID_PART_TABLE_TYPE':
-+            table_type = value
-+    if table_type != 'gpt':
-+        return None    # not even GPT
-+
-+    if saw_part_entry:
-+        return None    # GPT, and blkid appears to be new, so we're done.
-+
-+    # bah, fall back to sgdisk.
-+    if 'blkid' not in warned_about:
-+        LOG.warning('Old blkid does not support ID_PART_ENTRY_* fields, trying sgdisk; may not correctly identify ceph volumes with dmcrypt')
-+        warned_about['blkid'] = True
-     (base, partnum) = re.match('(\D+)(\d+)', part).group(1, 2)
-     sgdisk, _ = command(
-         [
-             'sgdisk',
-@@ -2103,11 +2243,18 @@
-         if m is not None:
-             num = m.group(1)
-             if num != partnum:
-                 continue
--            return m.group(2)
-+            desc = m.group(2)
-+            # assume unencrypted ... blkid has failed us :(
-+            if desc == 'ceph data':
-+                return OSD_UUID
-+            if desc == 'ceph journal':
-+                return JOURNAL_UUID
-+
-     return None
- 
-+
- def get_partition_uuid(dev):
-     (base, partnum) = re.match('(\D+)(\d+)', dev).group(1, 2)
-     out, _ = command(['sgdisk', '-i', partnum, base])
-     for line in out.splitlines():
-@@ -2115,8 +2262,9 @@
-         if m:
-             return m.group(1).lower()
-     return None
- 
-+
- def more_osd_info(path, uuid_map):
-     desc = []
-     ceph_fsid = get_oneliner(path, 'ceph_fsid')
-     if ceph_fsid:
-@@ -2137,46 +2285,71 @@
-             desc.append('journal %s' % uuid_map[journal_uuid])
- 
-     return desc
- 
-+def list_dev_osd(dev, uuid_map):
-+    path = is_mounted(dev)
-+    fs_type = get_dev_fs(dev)
-+    desc = []
-+    if path:
-+        desc.append('active')
-+        desc.extend(more_osd_info(path, uuid_map))
-+    elif fs_type:
-+        try:
-+            tpath = mount(dev=dev, fstype=fs_type, options='')
-+            if tpath:
-+                try:
-+                    magic = get_oneliner(tpath, 'magic')
-+                    if magic is not None:
-+                        desc.append('prepared')
-+                        desc.extend(more_osd_info(tpath, uuid_map))
-+                finally:
-+                    unmount(tpath)
-+        except MountError:
-+            pass
-+    return desc
- 
- def list_dev(dev, uuid_map, journal_map):
-     ptype = 'unknown'
-     prefix = ''
-     if is_partition(dev):
-         ptype = get_partition_type(dev)
-         prefix = ' '
--    fs_type = get_dev_fs(dev)
--    path = is_mounted(dev)
- 
-     desc = []
--    if ptype == 'ceph data':
--        if path:
--            desc.append('active')
--            desc.extend(more_osd_info(path, uuid_map))
--        elif fs_type:
--            try:
--                tpath = mount(dev=dev, fstype=fs_type, options='')
--                if tpath:
--                    try:
--                        magic = get_oneliner(tpath, 'magic')
--                        if magic is not None:
--                            desc.append('prepared')
--                            desc.extend(more_osd_info(tpath, uuid_map))
--                    finally:
--                        unmount(tpath)
--            except MountError:
--                pass
-+    if ptype == OSD_UUID:
-+        desc = list_dev_osd(dev, uuid_map)
-         if desc:
-             desc = ['ceph data'] + desc
-         else:
-             desc = ['ceph data', 'unprepared']
--    elif ptype == 'ceph journal':
-+    elif ptype == DMCRYPT_OSD_UUID:
-+        holders = is_held(dev)
-+        if not holders:
-+            desc = ['ceph data (dmcrypt)', 'not currently mapped']
-+        elif len(holders) == 1:
-+            holder = '/dev/' + holders[0]
-+            fs_desc = list_dev_osd(holder, uuid_map)
-+            desc = ['ceph data (dmcrypt %s)' % holder] + fs_desc
-+        else:
-+            desc = ['ceph data (dmcrypt)', 'holders: ' + ','.join(holders)]
-+    elif ptype == JOURNAL_UUID:
-         desc.append('ceph journal')
-         part_uuid = get_partition_uuid(dev)
-         if part_uuid and part_uuid in journal_map:
-             desc.append('for %s' % journal_map[part_uuid])
-+    elif ptype == DMCRYPT_JOURNAL_UUID:
-+        holders = is_held(dev)
-+        if len(holders) == 1:
-+            desc = ['ceph journal (dmcrypt /dev/%s)' % holders[0]]
-+        else:
-+            desc = ['ceph journal (dmcrypt)']
-+        part_uuid = get_partition_uuid(dev)
-+        if part_uuid and part_uuid in journal_map:
-+            desc.append('for %s' % journal_map[part_uuid])
-     else:
-+        path = is_mounted(dev)
-+        fs_type = get_dev_fs(dev)
-         if is_swap(dev):
-             desc.append('swap')
-         else:
-             desc.append('other')
-@@ -2189,9 +2362,8 @@
- 
-     print '%s%s %s' % (prefix, dev, ', '.join(desc))
- 
- 
--
- def main_list(args):
-     partmap = list_all_partitions()
- 
-     uuid_map = {}
-@@ -2202,20 +2374,37 @@
-             part_uuid = get_partition_uuid(dev)
-             if part_uuid:
-                 uuid_map[part_uuid] = dev
-             ptype = get_partition_type(dev)
--            if ptype == 'ceph data':
-+            if ptype == OSD_UUID:
-                 fs_type = get_dev_fs(dev)
--                try:
--                    tpath = mount(dev=dev, fstype=fs_type, options='')
-+                if fs_type is not None:
-                     try:
--                        journal_uuid = get_oneliner(tpath, 'journal_uuid')
--                        if journal_uuid:
--                            journal_map[journal_uuid.lower()] = dev
--                    finally:
--                        unmount(tpath)
--                except MountError:
--                    pass
-+                        tpath = mount(dev=dev, fstype=fs_type, options='')
-+                        try:
-+                            journal_uuid = get_oneliner(tpath, 'journal_uuid')
-+                            if journal_uuid:
-+                                journal_map[journal_uuid.lower()] = dev
-+                        finally:
-+                            unmount(tpath)
-+                    except MountError:
-+                        pass
-+            if ptype == DMCRYPT_OSD_UUID:
-+                holders = is_held(dev)
-+                if len(holders) == 1:
-+                    holder = '/dev/' + holders[0]
-+                    fs_type = get_dev_fs(holder)
-+                    if fs_type is not None:
-+                        try:
-+                            tpath = mount(dev=holder, fstype=fs_type, options='')
-+                            try:
-+                                journal_uuid = get_oneliner(tpath, 'journal_uuid')
-+                                if journal_uuid:
-+                                    journal_map[journal_uuid.lower()] = dev
-+                            finally:
-+                                unmount(tpath)
-+                        except MountError:
-+                            pass
- 
-     for base, parts in sorted(partmap.iteritems()):
-         if parts:
-             print '%s :' % get_dev_path(base)
-@@ -2243,26 +2432,28 @@
-         if not disk.startswith('/dev/') or not stat.S_ISBLK(os.lstat(path).st_mode):
-             return False
-         base = get_dev_name(disk)
-         while len(base):
--            if os.path.exists(SUPPRESS_PREFIX + base):
-+            if os.path.exists(SUPPRESS_PREFIX + base):  # noqa
-                 return True
-             base = base[:-1]
-     except:
-         return False
- 
-+
- def set_suppress(path):
-     disk = os.path.realpath(path)
-     if not os.path.exists(disk):
-         raise Error('does not exist', path)
-     if not stat.S_ISBLK(os.lstat(path).st_mode):
-         raise Error('not a block device', path)
-     base = get_dev_name(disk)
- 
--    with file(SUPPRESS_PREFIX + base, 'w') as f:
-+    with file(SUPPRESS_PREFIX + base, 'w') as f:  # noqa
-         pass
-     LOG.info('set suppress flag on %s', base)
- 
-+
- def unset_suppress(path):
-     disk = os.path.realpath(path)
-     if not os.path.exists(disk):
-         raise Error('does not exist', path)
-@@ -2270,9 +2461,9 @@
-         raise Error('not a block device', path)
-     assert disk.startswith('/dev/')
-     base = get_dev_name(disk)
- 
--    fn = SUPPRESS_PREFIX + base
-+    fn = SUPPRESS_PREFIX + base  # noqa
-     if not os.path.exists(fn):
-         raise Error('not marked as suppressed', path)
- 
-     try:
-@@ -2284,18 +2475,24 @@
- 
- def main_suppress(args):
-     set_suppress(args.path)
- 
-+
- def main_unsuppress(args):
-     unset_suppress(args.path)
- 
-+
- def main_zap(args):
-     for dev in args.dev:
-         zap(dev)
- 
- ###########################
- 
-+
- def setup_statedir(dir):
-+    # XXX The following use of globals makes linting
-+    # really hard. Global state in Python is iffy and
-+    # should be avoided.
-     global STATEDIR
-     STATEDIR = dir
- 
-     if not os.path.exists(STATEDIR):
-@@ -2311,12 +2508,14 @@
- 
-     global SUPPRESS_PREFIX
-     SUPPRESS_PREFIX = STATEDIR + '/tmp/suppress-activate.'
- 
-+
- def setup_sysconfdir(dir):
-     global SYSCONFDIR
-     SYSCONFDIR = dir
- 
-+
- def parse_args():
-     parser = argparse.ArgumentParser(
-         'ceph-disk',
-         )
-@@ -2588,4 +2787,5 @@
- 
- 
- if __name__ == '__main__':
-     main()
-+    warned_about = {}
---- a/src/ceph.in
-+++ b/src/ceph.in
-@@ -105,8 +105,16 @@
-     for mdsdict in infodict.values():
-         l.append(mdsdict['name'])
-     return l
- 
-+# these args must be passed to all child programs
-+GLOBAL_ARGS = {
-+    'client_id': '--id',
-+    'client_name': '--name',
-+    'cluster': '--cluster',
-+    'cephconf': '--conf',
-+}
-+
- def parse_cmdargs(args=None, target=''):
-     # alias: let the line-wrapping be sane
-     AP = argparse.ArgumentParser
- 
-@@ -338,17 +346,25 @@
- 
-     return ret
- 
- 
--def ceph_conf(field, name):
-+def ceph_conf(parsed_args, field, name):
-+    args=['ceph-conf']
-+
-+    if name:
-+        args.extend(['--name', name])
-+
-+    # add any args in GLOBAL_ARGS
-+    for key, val in GLOBAL_ARGS.iteritems():
-+        # ignore name in favor of argument name, if any
-+        if name and key == 'client_name':
-+            continue
-+        if getattr(parsed_args, key):
-+            args.extend([val, getattr(parsed_args, key)])
-+
-+    args.extend(['--show-config-value', field])
-     p = subprocess.Popen(
--        args=[
--            'ceph-conf',
--	    '--show-config-value',
--            field,
--            '-n',
--            name,
--            ],
-+        args,
-         stdout=subprocess.PIPE,
-         stderr=subprocess.PIPE)
-     outdata, errdata = p.communicate()
-     if (len(errdata)):
-@@ -537,9 +553,10 @@
-                 sockpath = childargs[1]
-             else:
-                 # try resolve daemon name
-                 try:
--                    sockpath = ceph_conf('admin_socket', childargs[1])
-+                    sockpath = ceph_conf(parsed_args, 'admin_socket',
-+                                         childargs[1])
-                 except Exception as e:
-                     print >> sys.stderr, \
-                         'Can\'t get admin socket path: ' + str(e)
-                     return errno.EINVAL
---- a/src/ceph_common.sh
-+++ b/src/ceph_common.sh
-@@ -49,14 +49,15 @@
-     get_conf user "" "user"
- 
-     #echo host for $name is $host, i am $hostname
- 
--    if [ -e "/var/lib/ceph/$type/ceph-$id/upstart" ]; then
-+    cluster=$1
-+    if [ -e "/var/lib/ceph/$type/$cluster-$id/upstart" ]; then
- 	return 1
-     fi
- 
-     # sysvinit managed instance in standard location?
--    if [ -e "/var/lib/ceph/$type/ceph-$id/sysvinit" ]; then
-+    if [ -e "/var/lib/ceph/$type/$cluster-$id/sysvinit" ]; then
- 	host="$hostname"
- 	echo "=== $type.$id === "
- 	return 0
-     fi
---- a/src/ceph_mon.cc
-+++ b/src/ceph_mon.cc
-@@ -42,8 +42,10 @@
- #include "global/signal_handler.h"
- 
- #include "include/assert.h"
- 
-+#include "erasure-code/ErasureCodePlugin.h"
-+
- #define dout_subsys ceph_subsys_mon
- 
- Monitor *mon = NULL;
- 
-@@ -183,8 +185,23 @@
-   cerr << "        where the mon store and keyring are located\n";
-   generic_server_usage();
- }
- 
-+int preload_erasure_code()
-+{
-+  string directory = g_conf->osd_pool_default_erasure_code_directory;
-+  string plugins = g_conf->osd_erasure_code_plugins;
-+  stringstream ss;
-+  int r = ErasureCodePluginRegistry::instance().preload(plugins,
-+							directory,
-+							ss);
-+  if (r)
-+    derr << ss.str() << dendl;
-+  else
-+    dout(10) << ss.str() << dendl;
-+  return r;
-+}
-+
- int main(int argc, const char **argv) 
- {
-   int err;
- 
-@@ -415,8 +432,10 @@
-       global_init_postfork_start(g_ceph_context);
-     }
-     common_init_finish(g_ceph_context);
-     global_init_chdir(g_ceph_context);
-+    if (preload_erasure_code() < -1)
-+      prefork.exit(1);
-   }
- 
-   MonitorDBStore *store = new MonitorDBStore(g_conf->mon_data);
- 
---- a/src/ceph_osd.cc
-+++ b/src/ceph_osd.cc
-@@ -47,8 +47,10 @@
- #include "perfglue/heap_profiler.h"
- 
- #include "include/assert.h"
- 
-+#include "erasure-code/ErasureCodePlugin.h"
-+
- #define dout_subsys ceph_subsys_osd
- 
- OSD *osd = NULL;
- 
-@@ -65,8 +67,23 @@
-   derr << "   --debug_osd N   set debug level (e.g. 10)" << dendl;
-   generic_server_usage();
- }
- 
-+int preload_erasure_code()
-+{
-+  string directory = g_conf->osd_pool_default_erasure_code_directory;
-+  string plugins = g_conf->osd_erasure_code_plugins;
-+  stringstream ss;
-+  int r = ErasureCodePluginRegistry::instance().preload(plugins,
-+							directory,
-+							ss);
-+  if (r)
-+    derr << ss.str() << dendl;
-+  else
-+    dout(10) << ss.str() << dendl;
-+  return r;
-+}
-+
- int main(int argc, const char **argv) 
- {
-   vector<const char*> args;
-   argv_to_vec(argc, argv, args);
-@@ -450,8 +467,11 @@
-   if (mc.build_initial_monmap() < 0)
-     return -1;
-   global_init_chdir(g_ceph_context);
- 
-+  if (preload_erasure_code() < -1)
-+    return -1;
-+
-   osd = new OSD(g_ceph_context,
- 		store,
- 		whoami,
- 		ms_cluster,
---- a/src/cls/rgw/cls_rgw.cc
-+++ b/src/cls/rgw/cls_rgw.cc
-@@ -669,9 +669,9 @@
-     CLS_LOG(0, "rgw_bucket_complete_op(): entry.name=%s entry.meta.category=%d\n", remove_entry.name.c_str(), remove_entry.meta.category);
-     unaccount_entry(header, remove_entry);
- 
-     if (op.log_op) {
--      rc = log_index_operation(hctx, op.name, CLS_RGW_OP_DEL, op.tag, remove_entry.meta.mtime,
-+      rc = log_index_operation(hctx, remove_oid_name, CLS_RGW_OP_DEL, op.tag, remove_entry.meta.mtime,
-                                remove_entry.ver, CLS_RGW_STATE_COMPLETE, header.ver, header.max_marker);
-       if (rc < 0)
-         continue;
-     }
---- a/src/common/Finisher.h
-+++ b/src/common/Finisher.h
-@@ -76,8 +76,17 @@
-     ls.clear();
-     if (logger)
-       logger->inc(l_finisher_queue_len);
-   }
-+  void queue(list<Context*>& ls) {
-+    finisher_lock.Lock();
-+    finisher_queue.insert(finisher_queue.end(), ls.begin(), ls.end());
-+    finisher_cond.Signal();
-+    finisher_lock.Unlock();
-+    ls.clear();
-+    if (logger)
-+      logger->inc(l_finisher_queue_len);
-+  }
-   
-   void start();
-   void stop();
- 
---- a/src/common/LogClient.cc
-+++ b/src/common/LogClient.cc
-@@ -123,8 +123,9 @@
- }
- 
- Message *LogClient::_get_mon_log_message()
- {
-+  assert(log_lock.is_locked());
-    if (log_queue.empty())
-      return NULL;
- 
-   // only send entries that haven't been sent yet during this mon
-@@ -148,9 +149,9 @@
- 		<< " sending " << num_send << dendl;
-   assert(num_unsent <= log_queue.size());
-   std::deque<LogEntry>::iterator p = log_queue.begin();
-   std::deque<LogEntry> o;
--  while (p->seq < last_log_sent) {
-+  while (p->seq <= last_log_sent) {
-     ++p;
-     assert(p != log_queue.end());
-   }
-   while (num_send--) {
---- a/src/common/Makefile.am
-+++ b/src/common/Makefile.am
-@@ -12,8 +12,9 @@
- 	common/admin_socket.cc \
- 	common/admin_socket_client.cc \
- 	common/cmdparse.cc \
- 	common/escape.c \
-+	common/io_priority.cc \
- 	common/Clock.cc \
- 	common/Throttle.cc \
- 	common/Timer.cc \
- 	common/Finisher.cc \
-@@ -155,8 +156,9 @@
- 	common/perf_counters.h \
- 	common/OutputDataSocket.h \
- 	common/admin_socket.h \
- 	common/admin_socket_client.h \
-+	common/random_cache.hpp \
- 	common/shared_cache.hpp \
- 	common/tracked_int_ptr.hpp \
- 	common/simple_cache.hpp \
- 	common/sharedptr_registry.hpp \
-@@ -174,8 +176,9 @@
- 	common/TrackedOp.h \
- 	common/arch.h \
- 	common/armor.h \
- 	common/common_init.h \
-+	common/io_priority.h \
- 	common/pipe.h \
- 	common/code_environment.h \
- 	common/signal.h \
- 	common/simple_spin.h \
---- a/src/common/Thread.cc
-+++ b/src/common/Thread.cc
-@@ -15,8 +15,9 @@
- #include "common/Thread.h"
- #include "common/code_environment.h"
- #include "common/debug.h"
- #include "common/signal.h"
-+#include "common/io_priority.h"
- 
- #include <dirent.h>
- #include <errno.h>
- #include <iostream>
-@@ -28,21 +29,38 @@
- #include <sys/types.h>
- 
- 
- Thread::Thread()
--  : thread_id(0)
-+  : thread_id(0),
-+    pid(0),
-+    ioprio_class(-1),
-+    ioprio_priority(-1)
- {
- }
- 
- Thread::~Thread()
- {
- }
- 
- void *Thread::_entry_func(void *arg) {
--  void *r = ((Thread*)arg)->entry();
-+  void *r = ((Thread*)arg)->entry_wrapper();
-   return r;
- }
- 
-+void *Thread::entry_wrapper()
-+{
-+  int p = ceph_gettid(); // may return -ENOSYS on other platforms
-+  if (p > 0)
-+    pid = p;
-+  if (ioprio_class >= 0 &&
-+      ioprio_priority >= 0) {
-+    ceph_ioprio_set(IOPRIO_WHO_PROCESS,
-+		    pid,
-+		    IOPRIO_PRIO_VALUE(ioprio_class, ioprio_priority));
-+  }
-+  return entry();
-+}
-+
- const pthread_t &Thread::get_thread_id()
- {
-   return thread_id;
- }
-@@ -127,4 +145,16 @@
- int Thread::detach()
- {
-   return pthread_detach(thread_id);
- }
-+
-+int Thread::set_ioprio(int cls, int prio)
-+{
-+  // fixme, maybe: this can race with create()
-+  ioprio_class = cls;
-+  ioprio_priority = prio;
-+  if (pid && cls >= 0 && prio >= 0)
-+    return ceph_ioprio_set(IOPRIO_WHO_PROCESS,
-+			   pid,
-+			   IOPRIO_PRIO_VALUE(cls, prio));
-+  return 0;
-+}
---- a/src/common/Thread.h
-+++ b/src/common/Thread.h
-@@ -20,8 +20,12 @@
- 
- class Thread {
-  private:
-   pthread_t thread_id;
-+  pid_t pid;
-+  int ioprio_class, ioprio_priority;
-+
-+  void *entry_wrapper();
- 
-  public:
-   Thread(const Thread& other);
-   const Thread& operator=(const Thread& other);
-@@ -43,7 +47,8 @@
-   int try_create(size_t stacksize);
-   void create(size_t stacksize = 0);
-   int join(void **prval = 0);
-   int detach();
-+  int set_ioprio(int cls, int prio);
- };
- 
- #endif
---- a/src/common/WorkQueue.cc
-+++ b/src/common/WorkQueue.cc
-@@ -15,8 +15,9 @@
- #include <sstream>
- 
- #include "include/types.h"
- #include "include/utime.h"
-+#include "common/errno.h"
- #include "WorkQueue.h"
- 
- #include "common/config.h"
- #include "common/HeartbeatMap.h"
-@@ -32,8 +33,10 @@
-     _lock(lockname.c_str()),  // this should be safe due to declaration order
-     _stop(false),
-     _pause(0),
-     _draining(0),
-+    ioprio_class(-1),
-+    ioprio_priority(-1),
-     _num_threads(n),
-     last_work_queue(0),
-     processing(0)
- {
-@@ -155,8 +158,13 @@
-   while (_threads.size() < _num_threads) {
-     WorkThread *wt = new WorkThread(this);
-     ldout(cct, 10) << "start_threads creating and starting " << wt << dendl;
-     _threads.insert(wt);
-+
-+    int r = wt->set_ioprio(ioprio_class, ioprio_priority);
-+    if (r < 0)
-+      lderr(cct) << " set_ioprio got " << cpp_strerror(r) << dendl;
-+
-     wt->create();
-   }
- }
- 
-@@ -254,4 +262,17 @@
-   _draining--;
-   _lock.Unlock();
- }
- 
-+void ThreadPool::set_ioprio(int cls, int priority)
-+{
-+  Mutex::Locker l(_lock);
-+  ioprio_class = cls;
-+  ioprio_priority = priority;
-+  for (set<WorkThread*>::iterator p = _threads.begin();
-+       p != _threads.end();
-+       ++p) {
-+    int r = (*p)->set_ioprio(cls, priority);
-+    if (r < 0)
-+      lderr(cct) << " set_ioprio got " << cpp_strerror(r) << dendl;
-+  }
-+}
---- a/src/common/WorkQueue.h
-+++ b/src/common/WorkQueue.h
-@@ -32,8 +32,9 @@
-   bool _stop;
-   int _pause;
-   int _draining;
-   Cond _wait_cond;
-+  int ioprio_class, ioprio_priority;
- 
- public:
-   class TPHandle {
-     friend class ThreadPool;
-@@ -387,8 +388,11 @@
-   /// resume work in thread pool.  must match each pause() call 1:1 to resume.
-   void unpause();
-   /// wait for all work to complete
-   void drain(WorkQueue_* wq = 0);
-+
-+  /// set io priority
-+  void set_ioprio(int cls, int priority);
- };
- 
- class GenContextWQ :
-   public ThreadPool::WorkQueueVal<GenContext<ThreadPool::TPHandle&>*> {
---- a/src/common/blkdev.cc
-+++ b/src/common/blkdev.cc
-@@ -9,9 +9,9 @@
- int get_block_device_size(int fd, int64_t *psize)
- {
- #ifdef BLKGETSIZE64
-   int ret = ::ioctl(fd, BLKGETSIZE64, psize);
--#elif BLKGETSIZE
-+#elif defined(BLKGETSIZE)
-   unsigned long sectors = 0;
-   int ret = ::ioctl(fd, BLKGETSIZE, &sectors);
-   *psize = sectors * 512ULL;
- #else
 --- a/src/common/config.cc
 +++ b/src/common/config.cc
-@@ -878,17 +878,17 @@
-   assert(lock.is_locked());
-   switch (opt->type) {
-     case OPT_INT: {
-       std::string err;
--      int f = strict_strtol(val, 10, &err);
-+      int f = strict_sistrtoll(val, &err);
-       if (!err.empty())
- 	return -EINVAL;
-       *(int*)opt->conf_ptr(this) = f;
-       return 0;
-     }
-     case OPT_LONGLONG: {
-       std::string err;
--      long long f = strict_strtoll(val, 10, &err);
-+      long long f = strict_sistrtoll(val, &err);
-       if (!err.empty())
- 	return -EINVAL;
-       *(long long*)opt->conf_ptr(this) = f;
-       return 0;
-@@ -916,17 +916,17 @@
+@@ -946,9 +946,9 @@
+   return -ENOSYS;
+ }
+ 
+ static const char *CONF_METAVARIABLES[] =
+-  { "cluster", "type", "name", "host", "num", "id", "pid" };
++  { "cluster", "type", "name", "host", "num", "id", "pid", "cctid" };
+ static const int NUM_CONF_METAVARIABLES =
+       (sizeof(CONF_METAVARIABLES) / sizeof(CONF_METAVARIABLES[0]));
+ 
+ void md_config_t::expand_all_meta()
+@@ -1058,8 +1058,10 @@
+ 	else if (var == "id")
+ 	  out += name.get_id().c_str();
+ 	else if (var == "pid")
+ 	  out += stringify(getpid());
++	else if (var == "cctid")
++	  out += stringify((unsigned long long)this);
+ 	else
+ 	  assert(0); // unreachable
+ 	expanded = true;
        }
-       return 0;
-     case OPT_U32: {
-       std::string err;
--      int f = strict_strtol(val, 10, &err);
-+      int f = strict_sistrtoll(val, &err);
-       if (!err.empty())
- 	return -EINVAL;
-       *(uint32_t*)opt->conf_ptr(this) = f;
-       return 0;
-     }
-     case OPT_U64: {
-       std::string err;
--      long long f = strict_strtoll(val, 10, &err);
-+      long long f = strict_sistrtoll(val, &err);
-       if (!err.empty())
- 	return -EINVAL;
-       *(uint64_t*)opt->conf_ptr(this) = f;
-       return 0;
---- a/src/common/config_opts.h
-+++ b/src/common/config_opts.h
-@@ -176,8 +176,9 @@
- OPTION(mon_force_standby_active, OPT_BOOL, true) // should mons force standby-replay mds to be active
- OPTION(mon_warn_on_old_mons, OPT_BOOL, true) // should mons set health to WARN if part of quorum is old?
- OPTION(mon_warn_on_legacy_crush_tunables, OPT_BOOL, true) // warn if crush tunables are not optimal
- OPTION(mon_warn_on_osd_down_out_interval_zero, OPT_BOOL, true) // warn if 'mon_osd_down_out_interval == 0'
-+OPTION(mon_warn_on_cache_pools_without_hit_sets, OPT_BOOL, true)
- OPTION(mon_min_osdmap_epochs, OPT_INT, 500)
- OPTION(mon_max_pgmap_epochs, OPT_INT, 500)
- OPTION(mon_max_log_epochs, OPT_INT, 500)
- OPTION(mon_max_mdsmap_epochs, OPT_INT, 500)
-@@ -433,8 +434,9 @@
-        "technique=reed_sol_van "
-        "k=2 "
-        "m=1 "
-        ) // default properties of osd pool create
-+OPTION(osd_erasure_code_plugins, OPT_STR, "jerasure") // list of erasure code plugins
- OPTION(osd_pool_default_flags, OPT_INT, 0)   // default flags for new pools
- OPTION(osd_pool_default_flag_hashpspool, OPT_BOOL, true)   // use new pg hashing to prevent pool/pg overlap
- OPTION(osd_pool_default_hit_set_bloom_fpp, OPT_FLOAT, .05)
- OPTION(osd_pool_default_cache_target_dirty_ratio, OPT_FLOAT, .4)
-@@ -449,16 +451,19 @@
- OPTION(osd_tier_default_cache_hit_set_period, OPT_INT, 1200)
- OPTION(osd_tier_default_cache_hit_set_type, OPT_STR, "bloom")
- 
- OPTION(osd_map_dedup, OPT_BOOL, true)
-+OPTION(osd_map_max_advance, OPT_INT, 200) // make this < cache_size!
- OPTION(osd_map_cache_size, OPT_INT, 500)
- OPTION(osd_map_message_max, OPT_INT, 100)  // max maps per MOSDMap message
- OPTION(osd_map_share_max_epochs, OPT_INT, 100)  // cap on # of inc maps we send to peers, clients
- OPTION(osd_op_threads, OPT_INT, 2)    // 0 == no threading
- OPTION(osd_peering_wq_batch_size, OPT_U64, 20)
- OPTION(osd_op_pq_max_tokens_per_priority, OPT_U64, 4194304)
- OPTION(osd_op_pq_min_cost, OPT_U64, 65536)
- OPTION(osd_disk_threads, OPT_INT, 1)
-+OPTION(osd_disk_thread_ioprio_class, OPT_STR, "") // rt realtime be besteffort best effort idle
-+OPTION(osd_disk_thread_ioprio_priority, OPT_INT, -1) // 0-7
- OPTION(osd_recovery_threads, OPT_INT, 1)
- OPTION(osd_recover_clone_overlap, OPT_BOOL, true)   // preserve clone_overlap during recovery/migration
- 
- // Only use clone_overlap for recovery if there are fewer than
-@@ -472,8 +477,9 @@
- OPTION(osd_snap_trim_thread_timeout, OPT_INT, 60*60*1)
- OPTION(osd_snap_trim_sleep, OPT_FLOAT, 0)
- OPTION(osd_scrub_thread_timeout, OPT_INT, 60)
- OPTION(osd_scrub_finalize_thread_timeout, OPT_INT, 60*10)
-+OPTION(osd_scrub_invalid_stats, OPT_BOOL, true)
- OPTION(osd_remove_thread_timeout, OPT_INT, 60*60)
- OPTION(osd_command_thread_timeout, OPT_INT, 10*60)
- OPTION(osd_age, OPT_FLOAT, .8)
- OPTION(osd_age_time, OPT_INT, 0)
-@@ -508,8 +514,9 @@
- OPTION(osd_scrub_min_interval, OPT_FLOAT, 60*60*24)    // if load is low
- OPTION(osd_scrub_max_interval, OPT_FLOAT, 7*60*60*24)  // regardless of load
- OPTION(osd_scrub_chunk_min, OPT_INT, 5)
- OPTION(osd_scrub_chunk_max, OPT_INT, 25)
-+OPTION(osd_scrub_sleep, OPT_FLOAT, 0)   // sleep between [deep]scrub ops
- OPTION(osd_deep_scrub_interval, OPT_FLOAT, 60*60*24*7) // once a week
- OPTION(osd_deep_scrub_stride, OPT_INT, 524288)
- OPTION(osd_scan_list_ping_tp_interval, OPT_U64, 100)
- OPTION(osd_auto_weight, OPT_BOOL, false)
-@@ -689,8 +696,11 @@
- OPTION(keyvaluestore_debug_check_backend, OPT_BOOL, 0) // Expensive debugging check on sync
- OPTION(keyvaluestore_op_threads, OPT_INT, 2)
- OPTION(keyvaluestore_op_thread_timeout, OPT_INT, 60)
- OPTION(keyvaluestore_op_thread_suicide_timeout, OPT_INT, 180)
-+OPTION(keyvaluestore_default_strip_size, OPT_INT, 4096) // Only affect new object
-+OPTION(keyvaluestore_max_expected_write_size, OPT_U64, 1ULL << 24) // bytes
-+OPTION(keyvaluestore_header_cache_size, OPT_INT, 4096)    // Header cache size
- 
- // max bytes to search ahead in journal searching for corruption
- OPTION(journal_max_corrupt_search, OPT_U64, 10<<20)
- OPTION(journal_block_align, OPT_BOOL, true)
-@@ -712,8 +722,9 @@
- OPTION(rbd_cache_size, OPT_LONGLONG, 32<<20)         // cache size in bytes
- OPTION(rbd_cache_max_dirty, OPT_LONGLONG, 24<<20)    // dirty limit in bytes - set to 0 for write-through caching
- OPTION(rbd_cache_target_dirty, OPT_LONGLONG, 16<<20) // target dirty limit in bytes
- OPTION(rbd_cache_max_dirty_age, OPT_FLOAT, 1.0)      // seconds in cache before writeback starts
-+OPTION(rbd_cache_max_dirty_object, OPT_INT, 0)       // dirty limit for objects - set to 0 for auto calculate from rbd_cache_size
- OPTION(rbd_cache_block_writes_upfront, OPT_BOOL, false) // whether to block writes to the cache before the aio_write call completes (true), or block before the aio completion is called (false)
- OPTION(rbd_concurrent_management_ops, OPT_INT, 10) // how many operations can be in flight for a management operation like deleting or resizing an image
- OPTION(rbd_balance_snap_reads, OPT_BOOL, false)
- OPTION(rbd_localize_snap_reads, OPT_BOOL, false)
---- /dev/null
-+++ b/src/common/io_priority.cc
-@@ -0,0 +1,54 @@
-+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-+// vim: ts=8 sw=2 smarttab
-+/*
-+ * Ceph - scalable distributed file system
-+ *
-+ * Copyright (C) 2012 Red Hat
-+ *
-+ * This is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License version 2.1, as published by the Free Software
-+ * Foundation.  See file COPYING.
-+ *
-+ */
-+
-+#include <sys/types.h>
-+#include <unistd.h>
-+#include <sys/syscall.h>   /* For SYS_xxx definitions */
-+#include <algorithm>
-+#include <errno.h>
-+
-+#include "common/errno.h"
-+#include "io_priority.h"
-+
-+pid_t ceph_gettid(void)
-+{
-+#ifdef __linux__
-+  return syscall(SYS_gettid);
-+#else
-+  return -ENOSYS;
-+#endif
-+}
-+
-+int ceph_ioprio_set(int whence, int who, int ioprio)
-+{
-+#ifdef __linux__
-+  return syscall(SYS_ioprio_set, whence, who, ioprio);
-+#else
-+  return -ENOSYS;
-+#endif
-+}
-+
-+int ceph_ioprio_string_to_class(const std::string& s)
-+{
-+  std::string l;
-+  std::transform(s.begin(), s.end(), l.begin(), ::tolower);
-+
-+  if (l == "idle")
-+    return IOPRIO_CLASS_IDLE;
-+  if (l == "be" || l == "besteffort" || l == "best effort")
-+    return IOPRIO_CLASS_BE;
-+  if (l == "rt" || l == "realtime" || l == "real time")
-+    return IOPRIO_CLASS_RT;
-+  return -EINVAL;
-+}
---- /dev/null
-+++ b/src/common/io_priority.h
-@@ -0,0 +1,44 @@
-+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-+// vim: ts=8 sw=2 smarttab
-+/*
-+ * Ceph - scalable distributed file system
-+ *
-+ * Copyright (C) 2012 Red Hat
-+ *
-+ * This is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License version 2.1, as published by the Free Software
-+ * Foundation.  See file COPYING.
-+ *
-+ */
-+
-+#ifndef CEPH_COMMON_IO_PRIORITY_H
-+#define CEPH_COMMON_IO_PRIORITY_H
-+
-+#include <string>
-+
-+extern pid_t ceph_gettid();
-+
-+#ifndef IOPRIO_WHO_PROCESS
-+# define IOPRIO_WHO_PROCESS 1
-+#endif
-+#ifndef IOPRIO_PRIO_VALUE
-+# define IOPRIO_CLASS_SHIFT 13
-+# define IOPRIO_PRIO_VALUE(class, data) \
-+		(((class) << IOPRIO_CLASS_SHIFT) | (data))
-+#endif
-+#ifndef IOPRIO_CLASS_RT
-+# define IOPRIO_CLASS_RT 1
-+#endif
-+#ifndef IOPRIO_CLASS_BE
-+# define IOPRIO_CLASS_BE 2
-+#endif
-+#ifndef IOPRIO_CLASS_IDLE
-+# define IOPRIO_CLASS_IDLE 3
-+#endif
-+
-+extern int ceph_ioprio_set(int whence, int who, int ioprio);
-+
-+extern int ceph_ioprio_string_to_class(const std::string& s);
-+
-+#endif
---- /dev/null
-+++ b/src/common/random_cache.hpp
-@@ -0,0 +1,111 @@
-+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-+// vim: ts=8 sw=2 smarttab
-+/*
-+ * Ceph - scalable distributed file system
-+ *
-+ * Copyright (C) 2014 UnitedStack <haomai at unitedstack.com>
-+ *
-+ * Author: Haomai Wang <haomaiwang at gmail.com>
-+ *
-+ * This is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License version 2.1, as published by the Free Software
-+ * Foundation.  See file COPYING.
-+ *
-+ */
-+
-+#ifndef CEPH_RANDOMCACHE_H
-+#define CEPH_RANDOMCACHE_H
-+
-+#include "common/Mutex.h"
-+#include "include/compat.h"
-+#include "include/unordered_map.h"
-+
-+
-+// Although This is a ramdom cache implementation, here still consider to make
-+// the trim progress more reasonable. Each item owns its lookup frequency,
-+// when the cache is full it will randomly pick up several items and compare the
-+// frequency associated with. The least frequency of items will be evicted.
-+template <class K, class V>
-+class RandomCache {
-+  // The first element of pair is the frequency of item, it's used to evict item
-+  ceph::unordered_map<K, pair<uint64_t, V> > contents;
-+  Mutex lock;
-+  uint64_t max_size;
-+  K last_trim_key;
-+
-+  // When cache reach full, consider to evict a certain number of items
-+  static const uint64_t EVICT_COUNT = 5;
-+  // Avoid too much overhead on comparing items's frequency, the number of
-+  // compare items is expected to small.
-+  static const uint64_t COMPARE_COUNT = 3;
-+
-+  // In order to make evict cache progress more lightweight and effective,
-+  // several items are expected to evicted in one call
-+  void trim_cache(uint64_t evict_count) {
-+    typename ceph::unordered_map<K, pair<uint64_t, V> >::iterator it = contents.find(last_trim_key);
-+    uint64_t total_compare = evict_count * COMPARE_COUNT;
-+    map<uint64_t, K> candidates;
-+
-+    while (total_compare--) {
-+      if (it == contents.end()) {
-+        it = contents.begin();
-+      }
-+
-+      candidates[it->second.first] = it->first;
-+      it++;
-+    }
-+    if (it != contents.end())
-+      last_trim_key = it->first;
-+    else
-+      last_trim_key = contents.begin()->first;
-+
-+    for (typename map<uint64_t, K>::iterator j = candidates.begin(); j != candidates.end(); j++) {
-+      contents.erase(j->second);
-+      evict_count--;
-+      if (!evict_count)
-+        break;
-+    }
-+  }
-+
-+ public:
-+  RandomCache(size_t max_size=20) : lock("RandomCache::lock"),
-+                                    max_size(max_size) {}
-+  ~RandomCache() {
-+    contents.clear();
-+  }
-+
-+  void clear(K key) {
-+    Mutex::Locker l(lock);
-+    contents.erase(key);
-+  }
-+
-+  void set_size(size_t new_size) {
-+    Mutex::Locker l(lock);
-+    max_size = new_size;
-+    if (max_size <= contents.size()) {
-+      trim_cache(contents.size() - max_size);
-+    }
-+  }
-+
-+  bool lookup(K key, V *out) {
-+    Mutex::Locker l(lock);
-+    typename ceph::unordered_map<K, pair<uint64_t, V> >::iterator it = contents.find(key);
-+    if (it != contents.end()) {
-+      it->second.first++;
-+      *out = it->second.second;
-+      return true;
-+    }
-+    return false;
-+  }
-+
-+  void add(K key, V value) {
-+    Mutex::Locker l(lock);
-+    if (max_size <= contents.size()) {
-+      trim_cache(EVICT_COUNT);
-+    }
-+    contents[key] = make_pair(1, value);
-+  }
-+};
-+
-+#endif
---- a/src/common/str_map.cc
-+++ b/src/common/str_map.cc
-@@ -23,9 +23,9 @@
- 
- using namespace std;
- 
- int get_str_map(const string &str,
--                stringstream &ss,
-+                ostream &ss,
-                 map<string,string> *str_map)
- {
-   json_spirit::mValue json;
-   try {
---- a/src/common/strtol.cc
-+++ b/src/common/strtol.cc
-@@ -16,8 +16,11 @@
- #include <limits.h>
- #include <sstream>
- #include <stdlib.h>
- #include <string>
-+extern "C" {
-+#include <stdint.h>
-+}
- 
- using std::ostringstream;
- 
- long long strict_strtoll(const char *str, int base, std::string *err)
-@@ -123,4 +126,44 @@
-   }
-   *err = "";
-   return ret;
- }
-+
-+uint64_t strict_sistrtoll(const char *str, std::string *err)
-+{
-+  std::string s(str);
-+  if (s.size() == 0) {
-+    ostringstream oss;
-+    oss << "strict_sistrtoll: value not specified";
-+    *err = oss.str();
-+    return 0;
-+  }
-+  const char &u = s.at(s.size()-1); //str[std::strlen(str)-1];
-+  int m = 0;
-+  if (u == 'B')
-+    m = 0;
-+  else if (u == 'K')
-+    m = 10;
-+  else if (u == 'M')
-+    m = 20;
-+  else if (u == 'G')
-+    m = 30;
-+  else if (u == 'T')
-+    m = 40;
-+  else if (u == 'P')
-+    m = 50;
-+  else if (u == 'E')
-+    m = 60;
-+  else
-+    m = -1;
-+
-+  const char *v = NULL;
-+  if (m >= 0)
-+    s = std::string(str, s.size()-1);
-+  v = s.c_str();
-+
-+  uint64_t r = strict_strtoll(v, 10, err);
-+  if (err->empty() && m > 0) {
-+    r = (r << m);
-+  }
-+  return r;
-+}
---- a/src/common/strtol.h
-+++ b/src/common/strtol.h
-@@ -15,8 +15,11 @@
- #ifndef CEPH_COMMON_STRTOL_H
- #define CEPH_COMMON_STRTOL_H
- 
- #include <string>
-+extern "C" {
-+#include <stdint.h>
-+}
- 
- long long strict_strtoll(const char *str, int base, std::string *err);
- 
- int strict_strtol(const char *str, int base, std::string *err);
-@@ -24,5 +27,7 @@
- double strict_strtod(const char *str, std::string *err);
- 
- float strict_strtof(const char *str, std::string *err);
- 
-+uint64_t strict_sistrtoll(const char *str, std::string *err);
-+
- #endif
---- a/src/crush/CrushWrapper.cc
-+++ b/src/crush/CrushWrapper.cc
-@@ -9,34 +9,56 @@
- #define dout_subsys ceph_subsys_crush
- 
- bool CrushWrapper::has_v2_rules() const
- {
--  // check rules for use of indep or new SET_* rule steps
-   for (unsigned i=0; i<crush->max_rules; i++) {
--    crush_rule *r = crush->rules[i];
--    if (!r)
--      continue;
--    for (unsigned j=0; j<r->len; j++) {
--      if (r->steps[j].op == CRUSH_RULE_CHOOSE_INDEP ||
--	  r->steps[j].op == CRUSH_RULE_CHOOSELEAF_INDEP ||
--	  r->steps[j].op == CRUSH_RULE_SET_CHOOSE_TRIES ||
--	  r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_TRIES)
--	return true;
-+    if (is_v2_rule(i)) {
-+      return true;
-+    }
-+  }
-+  return false;
-+}
-+
-+bool CrushWrapper::is_v2_rule(unsigned ruleid) const
-+{
-+  // check rule for use of indep or new SET_* rule steps
-+  if (ruleid >= crush->max_rules)
-+    return false;
-+  crush_rule *r = crush->rules[ruleid];
-+  if (!r)
-+    return false;
-+  for (unsigned j=0; j<r->len; j++) {
-+    if (r->steps[j].op == CRUSH_RULE_CHOOSE_INDEP ||
-+	r->steps[j].op == CRUSH_RULE_CHOOSELEAF_INDEP ||
-+	r->steps[j].op == CRUSH_RULE_SET_CHOOSE_TRIES ||
-+	r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_TRIES) {
-+      return true;
-     }
-   }
-   return false;
- }
- 
- bool CrushWrapper::has_v3_rules() const
- {
--  // check rules for use of SET_CHOOSELEAF_VARY_R step
-   for (unsigned i=0; i<crush->max_rules; i++) {
--    crush_rule *r = crush->rules[i];
--    if (!r)
--      continue;
--    for (unsigned j=0; j<r->len; j++) {
--      if (r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_VARY_R)
--	return true;
-+    if (is_v3_rule(i)) {
-+      return true;
-+    }
-+  }
-+  return false;
-+}
-+
-+bool CrushWrapper::is_v3_rule(unsigned ruleid) const
-+{
-+  // check rule for use of SET_CHOOSELEAF_VARY_R step
-+  if (ruleid >= crush->max_rules)
-+    return false;
-+  crush_rule *r = crush->rules[ruleid];
-+  if (!r)
-+    return false;
-+  for (unsigned j=0; j<r->len; j++) {
-+    if (r->steps[j].op == CRUSH_RULE_SET_CHOOSELEAF_VARY_R) {
-+      return true;
-     }
-   }
-   return false;
- }
-@@ -793,8 +815,61 @@
-   have_rmaps = false;
-   return rno;
- }
- 
-+int CrushWrapper::get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap)
-+{
-+  if (ruleno >= crush->max_rules)
-+    return -ENOENT;
-+  if (crush->rules[ruleno] == NULL)
-+    return -ENOENT;
-+  crush_rule *rule = crush->rules[ruleno];
-+
-+  // build a weight map for each TAKE in the rule, and then merge them
-+  for (unsigned i=0; i<rule->len; ++i) {
-+    map<int,float> m;
-+    float sum = 0;
-+    if (rule->steps[i].op == CRUSH_RULE_TAKE) {
-+      int n = rule->steps[i].arg1;
-+      if (n >= 0) {
-+	m[n] = 1.0;
-+	sum = 1.0;
-+      } else {
-+	list<int> q;
-+	q.push_back(n);
-+	//breadth first iterate the OSD tree
-+	while (!q.empty()) {
-+	  int bno = q.front();
-+	  q.pop_front();
-+	  crush_bucket *b = crush->buckets[-1-bno];
-+	  assert(b);
-+	  for (unsigned j=0; j<b->size; ++j) {
-+	    int item_id = b->items[j];
-+	    if (item_id >= 0) //it's an OSD
-+	    {
-+	      float w = crush_get_bucket_item_weight(b, j);
-+	      m[item_id] = w;
-+	      sum += w;
-+	    }
-+	    else //not an OSD, expand the child later
-+	      q.push_back(item_id);
-+	  }
-+	}
-+      }
-+    }
-+    for (map<int,float>::iterator p = m.begin(); p != m.end(); ++p) {
-+      map<int,float>::iterator q = pmap->find(p->first);
-+      if (q == pmap->end()) {
-+	(*pmap)[p->first] = p->second / sum;
-+      } else {
-+	q->second += p->second / sum;
-+      }
-+    }
-+  }
-+
-+  return 0;
-+}
-+
- int CrushWrapper::remove_rule(int ruleno)
- {
-   if (ruleno >= (int)crush->max_rules)
-     return -ENOENT;
---- a/src/crush/CrushWrapper.h
-+++ b/src/crush/CrushWrapper.h
-@@ -215,8 +215,10 @@
-   }
-   bool has_v2_rules() const;
-   bool has_v3_rules() const;
- 
-+  bool is_v2_rule(unsigned ruleid) const;
-+  bool is_v3_rule(unsigned ruleid) const;
- 
-   // bucket types
-   int get_num_type_names() const {
-     return type_map.size();
-@@ -630,8 +632,20 @@
-     if (IS_ERR(s)) return PTR_ERR(s);
-     return s->arg2;
-   }
- 
-+  /**
-+   * calculate a map of osds to weights for a given rule
-+   *
-+   * Generate a map of which OSDs get how much relative weight for a
-+   * given rule.
-+   *
-+   * @param ruleno [in] rule id
-+   * @param pmap [out] map of osd to weight
-+   * @return 0 for success, or negative error code
-+   */
-+  int get_rule_weight_osd_map(unsigned ruleno, map<int,float> *pmap);
-+
-   /* modifiers */
-   int add_rule(int len, int ruleset, int type, int minsize, int maxsize, int ruleno) {
-     if (!crush) return -ENOENT;
-     crush_rule *n = crush_make_rule(len, ruleset, type, minsize, maxsize);
---- a/src/erasure-code/ErasureCodeInterface.h
-+++ b/src/erasure-code/ErasureCodeInterface.h
-@@ -166,9 +166,9 @@
-      *
-      * @param [in] name of the ruleset to create
-      * @param [in] crush crushmap in which the ruleset is created
-      * @param [out] ss contains informative messages when an error occurs
--     * @return **0** on success or a negative errno on error.
-+     * @return a ruleset on success or a negative errno on error.
-      */
-     virtual int create_ruleset(const string &name,
- 			       CrushWrapper &crush,
- 			       ostream *ss) const = 0;
---- a/src/erasure-code/ErasureCodePlugin.cc
-+++ b/src/erasure-code/ErasureCodePlugin.cc
-@@ -3,8 +3,9 @@
- /*
-  * Ceph - scalable distributed file system
-  *
-  * Copyright (C) 2013,2014 Cloudwatt <libre.licensing at cloudwatt.com>
-+ * Copyright (C) 2014 Red Hat <contact at redhat.com>
-  *
-  * Author: Loic Dachary <loic at dachary.org>
-  *
-  *  This library is free software; you can redistribute it and/or
-@@ -18,8 +19,9 @@
- #include <dlfcn.h>
- 
- #include "ErasureCodePlugin.h"
- #include "common/errno.h"
-+#include "include/str_list.h"
- 
- #define PLUGIN_PREFIX "libec_"
- #define PLUGIN_SUFFIX ".so"
- #define PLUGIN_INIT_FUNCTION "__erasure_code_init"
-@@ -129,7 +131,33 @@
-   }
- 
-   (*plugin)->library = library;
- 
-+  ss << __func__ << ": " << plugin_name << " ";
-+
-   return 0;
- }
- 
-+int ErasureCodePluginRegistry::preload(const std::string &plugins,
-+				       const std::string &directory,
-+				       ostream &ss)
-+{
-+  map<string,string> profile;
-+  profile["directory"] = directory;
-+  list<string> plugins_list;
-+  get_str_list(plugins, plugins_list);
-+  for (list<string>::iterator i = plugins_list.begin();
-+       i != plugins_list.end();
-+       i++) {
-+    ErasureCodePlugin *plugin;
-+    int r = load(*i, profile, &plugin, ss);
-+    if (r)
-+      return r;
-+
-+    ErasureCodeInterfaceRef erasure_code;
-+    profile["technique"] = "reed_sol_van";
-+    r = plugin->factory(profile, &erasure_code);
-+    if (r)
-+      return r;
-+  }
-+  return 0;
-+}
---- a/src/erasure-code/ErasureCodePlugin.h
-+++ b/src/erasure-code/ErasureCodePlugin.h
-@@ -66,8 +66,11 @@
- 	     const map<std::string,std::string> &parameters,
- 	     ErasureCodePlugin **plugin,
- 	     ostream &ss);
- 
-+    int preload(const std::string &plugins,
-+		const std::string &directory,
-+		ostream &ss);
-   };
- }
- 
- #endif
---- a/src/erasure-code/jerasure/ErasureCodeJerasure.cc
-+++ b/src/erasure-code/jerasure/ErasureCodeJerasure.cc
-@@ -43,10 +43,14 @@
- int ErasureCodeJerasure::create_ruleset(const string &name,
- 					CrushWrapper &crush,
- 					ostream *ss) const
- {
--  return crush.add_simple_ruleset(name, ruleset_root, ruleset_failure_domain,
--				  "indep", pg_pool_t::TYPE_ERASURE, ss);
-+  int ruleid = crush.add_simple_ruleset(name, ruleset_root, ruleset_failure_domain,
-+					"indep", pg_pool_t::TYPE_ERASURE, ss);
-+  if (ruleid < 0)
-+    return ruleid;
-+  else
-+    return crush.get_rule_mask_ruleset(ruleid);
- }
- 
- void ErasureCodeJerasure::init(const map<string,string> &parameters)
- {
---- a/src/include/atomic.h
-+++ b/src/include/atomic.h
-@@ -20,12 +20,68 @@
- # include "acconfig.h"
- #endif
- 
- #include <stdlib.h>
-+#include "include/Spinlock.h"
-+
-+namespace ceph {
-+  template <class T>
-+  class atomic_spinlock_t {
-+    mutable ceph_spinlock_t lock;
-+    T val;
-+  public:
-+    atomic_spinlock_t(T i=0)
-+      : val(i) {
-+      ceph_spin_init(&lock);
-+    }
-+    ~atomic_spinlock_t() {
-+      ceph_spin_destroy(&lock);
-+    }
-+    void set(T v) {
-+      ceph_spin_lock(&lock);
-+      val = v;
-+      ceph_spin_unlock(&lock);
-+    }
-+    T inc() {
-+      ceph_spin_lock(&lock);
-+      T r = ++val;
-+      ceph_spin_unlock(&lock);
-+      return r;
-+    }
-+    T dec() {
-+      ceph_spin_lock(&lock);
-+      T r = --val;
-+      ceph_spin_unlock(&lock);
-+      return r;
-+    }
-+    void add(T d) {
-+      ceph_spin_lock(&lock);
-+      val += d;
-+      ceph_spin_unlock(&lock);
-+    }
-+    void sub(T d) {
-+      ceph_spin_lock(&lock);
-+      val -= d;
-+      ceph_spin_unlock(&lock);
-+    }
-+    T read() const {
-+      T ret;
-+      ceph_spin_lock(&lock);
-+      ret = val;
-+      ceph_spin_unlock(&lock);
-+      return ret;
-+    }
-+  private:
-+    // forbid copying
-+    atomic_spinlock_t(const atomic_spinlock_t<T> &other);
-+    atomic_spinlock_t &operator=(const atomic_spinlock_t<T> &rhs);
-+  };
-+}
- 
- #ifndef NO_ATOMIC_OPS
- 
- // libatomic_ops implementation
-+#define AO_REQUIRE_CAS
- #include <atomic_ops.h>
- 
- // reinclude our assert to clobber the system one
- #include "include/assert.h"
-@@ -34,9 +90,9 @@
-   class atomic_t {
-     AO_t val;
-   public:
-     atomic_t(AO_t i=0) : val(i) {}
--    void set(size_t v) {
-+    void set(AO_t v) {
-       AO_store(&val, v);
-     }
-     AO_t inc() {
-       return AO_fetch_and_add1(&val) + 1;
-@@ -46,10 +102,10 @@
-     }
-     void add(AO_t add_me) {
-       AO_fetch_and_add(&val, add_me);
-     }
--    void sub(int sub_me) {
--      int negsub = 0 - sub_me;
-+    void sub(AO_t sub_me) {
-+      AO_t negsub = 0 - sub_me;
-       AO_fetch_and_add_write(&val, (AO_t)negsub);
-     }
-     AO_t read() const {
-       // cast away const on the pointer.  this is only needed to build
-@@ -61,65 +117,26 @@
-     // forbid copying
-     atomic_t(const atomic_t &other);
-     atomic_t &operator=(const atomic_t &rhs);
-   };
-+
-+#if SIZEOF_AO_T == 8
-+  typedef atomic_t atomic64_t;
-+#else
-+  typedef atomic_spinlock_t<unsigned long long> atomic64_t;
-+#endif
-+
- }
-+
- #else
- /*
-  * crappy slow implementation that uses a pthreads spinlock.
-  */
- #include "include/Spinlock.h"
- 
- namespace ceph {
--  class atomic_t {
--    mutable ceph_spinlock_t lock;
--    signed long val;
--  public:
--    atomic_t(int i=0)
--      : val(i) {
--      ceph_spin_init(&lock);
--    }
--    ~atomic_t() {
--      ceph_spin_destroy(&lock);
--    }
--    void set(size_t v) {
--      ceph_spin_lock(&lock);
--      val = v;
--      ceph_spin_unlock(&lock);
--    }
--    int inc() {
--      ceph_spin_lock(&lock);
--      int r = ++val;
--      ceph_spin_unlock(&lock);
--      return r;
--    }
--    int dec() {
--      ceph_spin_lock(&lock);
--      int r = --val;
--      ceph_spin_unlock(&lock);
--      return r;
--    }
--    void add(int d) {
--      ceph_spin_lock(&lock);
--      val += d;
--      ceph_spin_unlock(&lock);
--    }
--    void sub(int d) {
--      ceph_spin_lock(&lock);
--      val -= d;
--      ceph_spin_unlock(&lock);
--    }
--    int read() const {
--      signed long ret;
--      ceph_spin_lock(&lock);
--      ret = val;
--      ceph_spin_unlock(&lock);
--      return ret;
--    }
--  private:
--    // forbid copying
--    atomic_t(const atomic_t &other);
--    atomic_t &operator=(const atomic_t &rhs);
--  };
-+  typedef atomic_spinlock_t<unsigned> atomic_t;
-+  typedef atomic_spinlock_t<unsigned long long> atomic64_t;
- }
-+
- #endif
- #endif
---- a/src/include/intarith.h
-+++ b/src/include/intarith.h
-@@ -27,9 +27,9 @@
- # define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))
- #endif
- 
- #ifndef ROUND_UP_TO
--# define ROUND_UP_TO(n, d) (((n)+(d)-1) & ~((d)-1))
-+# define ROUND_UP_TO(n, d) ((n)%(d) ? ((n)+(d)-(n)%(d)) : (n))
- #endif
- 
- #ifndef SHIFT_ROUND_UP
- # define SHIFT_ROUND_UP(x,y) (((x)+(1<<(y))-1) >> (y))
---- a/src/include/rbd/librbd.h
-+++ b/src/include/rbd/librbd.h
-@@ -38,8 +38,9 @@
- #define LIBRBD_VERSION_CODE LIBRBD_VERSION(LIBRBD_VER_MAJOR, LIBRBD_VER_MINOR, LIBRBD_VER_EXTRA)
- 
- #define LIBRBD_SUPPORTS_WATCH 0
- #define LIBRBD_SUPPORTS_AIO_FLUSH 1
-+#define LIBRBD_SUPPORTS_INVALIDATE 1
- 
- typedef void *rbd_snap_t;
- typedef void *rbd_image_t;
- 
-@@ -375,8 +376,16 @@
-  * @returns 0 on success, negative error code on failure
-  */
- int rbd_aio_flush(rbd_image_t image, rbd_completion_t c);
- 
-+/**
-+ * Drop any cached data for an image
-+ *
-+ * @param image the image to invalidate cached data for
-+ * @returns 0 on success, negative error code on failure
-+ */
-+int rbd_invalidate_cache(rbd_image_t image);
-+
- #ifdef __cplusplus
- }
- #endif
- 
---- a/src/include/rbd/librbd.hpp
-+++ b/src/include/rbd/librbd.hpp
-@@ -215,8 +215,16 @@
-    * @returns 0 on success, negative error code on failure
-    */
-   int aio_flush(RBD::AioCompletion *c);
- 
-+  /**
-+   * Drop any cached data for an image
-+   *
-+   * @param image the image to invalidate cached data for
-+   * @returns 0 on success, negative error code on failure
-+   */
-+  int invalidate_cache();
-+
- private:
-   friend class RBD;
- 
-   Image(const Image& rhs);
---- a/src/include/str_map.h
-+++ b/src/include/str_map.h
-@@ -52,8 +52,8 @@
-  * @param [out] str_map key/value pairs read from str
-  * @return **0** on success or a -EINVAL on error.
-  */
- extern int get_str_map(const std::string &str,
--		       std::stringstream &ss,
-+		       std::ostream &ss,
- 		       std::map<std::string,std::string> *str_map);
- 
- #endif
---- a/src/init-ceph.in
-+++ b/src/init-ceph.in
-@@ -30,8 +30,9 @@
- 
- usage_exit() {
-     echo "usage: $0 [options] {start|stop|restart|condrestart} [mon|osd|mds]..."
-     printf "\t-c ceph.conf\n"
-+    printf "\t--cluster [cluster name]\tdefine the cluster name\n"
-     printf "\t--valgrind\trun via valgrind\n"
-     printf "\t--hostname [hostname]\toverride hostname lookup\n"
-     exit
- }
-@@ -112,8 +113,10 @@
- monaddr=
- dofsmount=1
- dofsumount=0
- verbose=0
-+use_default_conf=1
-+
- 
- while echo $1 | grep -q '^-'; do     # FIXME: why not '^-'?
- case $1 in
-     -v | --verbose)
-@@ -152,10 +155,17 @@
-     --conf | -c)
- 	    [ -z "$2" ] && usage_exit
- 	    options="$options $1"
- 	    shift
-+        use_default_conf=0
- 	    conf=$1
- 	    ;;
-+    --cluster )
-+	    [ -z "$2" ] && usage_exit
-+	    options="$options $1"
-+	    shift
-+	    cluster=$1
-+	    ;;
-     --hostname )
- 	    [ -z "$2" ] && usage_exit
- 	    options="$options $1"
- 	    shift
-@@ -169,8 +179,22 @@
- options="$options $1"
- shift
- done
- 
-+
-+# if `--cluster` was not passed in, fallback to looking at the config name
-+if [ -z "$cluster" ]; then
-+    cluster=`echo $conf | awk -F'/' '{print $(NF)}' | cut -d'.' -f 1`
-+else
-+    # if we were told to use a given cluster name then $conf needs to be updated
-+    # but just define it if `--conf` was not specified, otherwise we would be silently
-+    # overriding $conf even if it was defined with `--conf`
-+    if [ $use_default_conf -eq 1 ]; then
-+        conf="/etc/ceph/$cluster.conf"
-+    fi
-+fi
-+
-+
- verify_conf
- 
- command=$1
- [ -n "$*" ] && shift
-@@ -188,13 +212,12 @@
- 
- for name in $what; do
-     type=`echo $name | cut -c 1-3`   # e.g. 'mon', if $item is 'mon1'
-     id=`echo $name | cut -c 4- | sed 's/^\\.//'`
--    cluster=`echo $conf | awk -F'/' '{print $(NF)}' | cut -d'.' -f 1`
-     num=$id
-     name="$type.$id"
- 
--    check_host || continue
-+    check_host $cluster || continue
- 
-     binary="$BINDIR/ceph-$type"
-     cmd="$binary -i $id"
- 
-@@ -234,9 +257,9 @@
-     # conf file
-     cmd="$cmd -c $conf"
- 
-     if echo $name | grep -q ^osd; then
--	get_conf osd_data "/var/lib/ceph/osd/ceph-$id" "osd data"
-+	get_conf osd_data "/var/lib/ceph/osd/$cluster-$id" "osd data"
- 	get_conf fs_path "$osd_data" "fs path"  # mount point defaults so osd data
-         get_conf fs_devs "" "devs"
- 	if [ -z "$fs_devs" ]; then
- 	    # try to fallback to old keys
-@@ -334,9 +357,9 @@
- 		get_conf update_crush "" "osd crush update on start"
- 		if [ "${update_crush:-1}" = "1" -o "${update_crush:-1}" = "true" ]; then
- 		    # update location in crush
- 		    get_conf osd_location_hook "$BINDIR/ceph-crush-location" "osd crush location hook"
--		    osd_location=`$osd_location_hook --cluster ceph --id $id --type osd`
-+		    osd_location=`$osd_location_hook --cluster $cluster --id $id --type osd`
- 		    get_conf osd_weight "" "osd crush initial weight"
- 		    defaultweight="$(df -P -k $osd_data/. | tail -1 | awk '{ print sprintf("%.2f",$2/1073741824) }')"
- 		    get_conf osd_keyring "$osd_data/keyring" "keyring"
- 		    do_cmd "timeout 30 $BINDIR/ceph -c $conf --name=osd.$id --keyring=$osd_keyring osd crush create-or-move -- $id ${osd_weight:-${defaultweight:-1}} $osd_location"
-@@ -365,9 +388,9 @@
- 		# in creating these keys.
- 		get_conf mon_data "/var/lib/ceph/mon/ceph-$id" "mon data"
- 		if [ "$mon_data" = "/var/lib/ceph/mon/ceph-$id" -a "$asok" = "/var/run/ceph/ceph-mon.$id.asok" ]; then
- 		    echo Starting ceph-create-keys on $host...
--		    cmd2="$SBINDIR/ceph-create-keys -i $id 2> /dev/null &"
-+		    cmd2="$SBINDIR/ceph-create-keys --cluster $cluster -i $id 2> /dev/null &"
- 		    do_cmd "$cmd2"
- 		fi
- 	    fi
- 
---- a/src/init-radosgw.sysv
-+++ b/src/init-radosgw.sysv
-@@ -14,8 +14,9 @@
- . /etc/rc.d/init.d/functions
- 
- daemon_is_running() {
-     daemon=$1
-+    sleep 1
-     if pidof $daemon >/dev/null; then
-         echo "$daemon is running."
-         exit 0
-     else
-@@ -43,8 +44,12 @@
-     [ $VERBOSE -eq 1 ] && echo "$RADOSGW could not start, it is not executable."
-     exit 1
- fi
- 
-+# detect systemd
-+SYSTEMD=0
-+grep -qs systemd /proc/1/comm && SYSTEMD=1
-+
- case "$1" in
-     start)
-         echo "Starting radosgw instance(s)..."
-         for name in `ceph-conf --list-sections $PREFIX`;
-@@ -78,10 +83,14 @@
-                 touch "$log_file"
-                 chown $user $log_file
-             fi
- 
--            #start-stop-daemon --start -u $user -x $RADOSGW -- -n $name
--            daemon --user="$user" "ulimit -n 32768; $RADOSGW -n $name"
-+            if [ $SYSTEMD -eq 1 ]; then
-+                systemd-run -r bash -c "ulimit -n 32768; $RADOSGW -n $name"
-+            else
-+                #start-stop-daemon --start -u $user -x $RADOSGW -- -n $name
-+                daemon --user="$user" "ulimit -n 32768; $RADOSGW -n $name"
-+            fi
-             echo "Starting $name..."
-         done
-         daemon_is_running $RADOSGW
-         ;;
---- a/src/librados/RadosClient.cc
-+++ b/src/librados/RadosClient.cc
-@@ -102,10 +102,12 @@
- 
-   lock.Lock();
- 
-   int r = wait_for_osdmap();
--  if (r < 0)
-+  if (r < 0) {
-+    lock.Unlock();
-     return r;
-+  }
-   int64_t ret = osdmap.lookup_pg_pool_name(name);
-   pool_cache_rwl.get_write();
-   lock.Unlock();
-   if (ret < 0) {
-@@ -581,10 +583,12 @@
- int librados::RadosClient::pool_delete(const char *name)
- {
-   lock.Lock();
-   int r = wait_for_osdmap();
--  if (r < 0)
-+  if (r < 0) {
-+    lock.Unlock();
-     return r;
-+  }
-   int tmp_pool_id = osdmap.lookup_pg_pool_name(name);
-   if (tmp_pool_id < 0) {
-     lock.Unlock();
-     return -ENOENT;
---- a/src/librbd/ImageCtx.cc
-+++ b/src/librbd/ImageCtx.cc
-@@ -184,12 +184,16 @@
-     }
- 
-     // size object cache appropriately
-     if (object_cacher) {
--      uint64_t obj = cct->_conf->rbd_cache_size / (1ull << order);
-+      uint64_t obj = cct->_conf->rbd_cache_max_dirty_object;
-+      if (!obj) {
-+        obj = cct->_conf->rbd_cache_size / (1ull << order);
-+        obj = obj * 4 + 10;
-+      }
-       ldout(cct, 10) << " cache bytes " << cct->_conf->rbd_cache_size << " order " << (int)order
- 		     << " -> about " << obj << " objects" << dendl;
--      object_cacher->set_max_objects(obj * 4 + 10);
-+      object_cacher->set_max_objects(obj);
-     }
- 
-     ldout(cct, 10) << "init_layout stripe_unit " << stripe_unit
- 		   << " stripe_count " << stripe_count
-@@ -572,11 +576,11 @@
-     md_lock.put_write();
-     object_cacher->stop();
-   }
- 
--  void ImageCtx::invalidate_cache() {
-+  int ImageCtx::invalidate_cache() {
-     if (!object_cacher)
--      return;
-+      return 0;
-     cache_lock.Lock();
-     object_cacher->release_set(object_set);
-     cache_lock.Unlock();
-     int r = flush_cache();
-@@ -584,10 +588,14 @@
-       lderr(cct) << "flush_cache returned " << r << dendl;
-     cache_lock.Lock();
-     bool unclean = object_cacher->release_set(object_set);
-     cache_lock.Unlock();
--    if (unclean)
--      lderr(cct) << "could not release all objects from cache" << dendl;
-+    if (unclean) {
-+      lderr(cct) << "could not release all objects from cache: "
-+                 << unclean << " bytes remain" << dendl;
-+      return -EBUSY;
-+    }
-+    return r;
-   }
- 
-   void ImageCtx::clear_nonexistence_cache() {
-     if (!object_cacher)
---- a/src/librbd/ImageCtx.h
-+++ b/src/librbd/ImageCtx.h
-@@ -138,9 +138,9 @@
-     void user_flushed();
-     void flush_cache_aio(Context *onfinish);
-     int flush_cache();
-     void shutdown_cache();
--    void invalidate_cache();
-+    int invalidate_cache();
-     void clear_nonexistence_cache();
-     int register_watch();
-     void unregister_watch();
-     size_t parent_io_len(uint64_t offset, size_t length,
---- a/src/librbd/internal.cc
-+++ b/src/librbd/internal.cc
-@@ -831,8 +831,11 @@
-   int create(IoCtx& io_ctx, const char *imgname, uint64_t size,
- 	     bool old_format, uint64_t features, int *order,
- 	     uint64_t stripe_unit, uint64_t stripe_count)
-   {
-+    if (!order)
-+      return -EINVAL;
-+
-     CephContext *cct = (CephContext *)io_ctx.cct();
-     ldout(cct, 20) << "create " << &io_ctx << " name = " << imgname
- 		   << " size = " << size << " old_format = " << old_format
- 		   << " features = " << features << " order = " << *order
-@@ -856,11 +859,8 @@
-       lderr(cct) << "rbd image " << imgname << " already exists" << dendl;
-       return -EEXIST;
-     }
- 
--    if (!order)
--      return -EINVAL;
--
-     if (!*order)
-       *order = cct->_conf->rbd_default_order;
-     if (!*order)
-       *order = RBD_DEFAULT_OBJ_ORDER;
-@@ -1503,9 +1503,11 @@
-     RWLock::WLocker l(ictx->md_lock);
-     if (size < ictx->size && ictx->object_cacher) {
-       // need to invalidate since we're deleting objects, and
-       // ObjectCacher doesn't track non-existent objects
--      ictx->invalidate_cache();
-+      r = ictx->invalidate_cache();
-+      if (r < 0)
-+	return r;
-     }
-     resize_helper(ictx, size, prog_ctx);
- 
-     ldout(cct, 2) << "done." << dendl;
-@@ -1846,9 +1848,11 @@
- 
-     // need to flush any pending writes before resizing and rolling back -
-     // writes might create new snapshots. Rolling back will replace
-     // the current version, so we have to invalidate that too.
--    ictx->invalidate_cache();
-+    r = ictx->invalidate_cache();
-+    if (r < 0)
-+      return r;
- 
-     ldout(cct, 2) << "resizing to snapshot size..." << dendl;
-     NoOpProgressContext no_op;
-     r = resize_helper(ictx, new_size, no_op);
-@@ -2070,9 +2074,9 @@
- 			 << "' snap_name = '"
- 			 << ictx->snap_name << "'" << dendl;
-     int r = ictx->init();
-     if (r < 0)
--      return r;
-+      goto err_close;
- 
-     if (!ictx->read_only) {
-       r = ictx->register_watch();
-       if (r < 0) {
-@@ -2876,8 +2880,21 @@
- 
-     return r;
-   }
- 
-+  int invalidate_cache(ImageCtx *ictx)
-+  {
-+    CephContext *cct = ictx->cct;
-+    ldout(cct, 20) << "invalidate_cache " << ictx << dendl;
-+
-+    int r = ictx_check(ictx);
-+    if (r < 0)
-+      return r;
-+
-+    RWLock::WLocker l(ictx->md_lock);
-+    return ictx->invalidate_cache();
-+  }
-+
-   int aio_write(ImageCtx *ictx, uint64_t off, size_t len, const char *buf,
- 		AioCompletion *c)
-   {
-     CephContext *cct = ictx->cct;
---- a/src/librbd/internal.h
-+++ b/src/librbd/internal.h
-@@ -187,8 +187,9 @@
- 	       char *buf, bufferlist *pbl, AioCompletion *c);
-   int aio_flush(ImageCtx *ictx, AioCompletion *c);
-   int flush(ImageCtx *ictx);
-   int _flush(ImageCtx *ictx);
-+  int invalidate_cache(ImageCtx *ictx);
- 
-   ssize_t handle_sparse_read(CephContext *cct,
- 			     ceph::bufferlist data_bl,
- 			     uint64_t block_ofs,
---- a/src/librbd/librbd.cc
-+++ b/src/librbd/librbd.cc
-@@ -513,8 +513,14 @@
-     ImageCtx *ictx = (ImageCtx *)ctx;
-     return librbd::aio_flush(ictx, (librbd::AioCompletion *)c->pc);
-   }
- 
-+  int Image::invalidate_cache()
-+  {
-+    ImageCtx *ictx = (ImageCtx *)ctx;
-+    return librbd::invalidate_cache(ictx);
-+  }
-+
- } // namespace librbd
- 
- extern "C" void rbd_version(int *major, int *minor, int *extra)
- {
-@@ -1129,8 +1135,14 @@
-   librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
-   return librbd::aio_flush(ictx, (librbd::AioCompletion *)comp->pc);
- }
- 
-+extern "C" int rbd_invalidate_cache(rbd_image_t image)
-+{
-+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
-+  return librbd::invalidate_cache(ictx);
-+}
-+
- extern "C" int rbd_aio_is_complete(rbd_completion_t c)
- {
-   librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
-   return comp->is_complete();
---- a/src/mds/Locker.cc
-+++ b/src/mds/Locker.cc
-@@ -2061,9 +2061,15 @@
- 
- void Locker::calc_new_client_ranges(CInode *in, uint64_t size, map<client_t,client_writeable_range_t>& new_ranges)
- {
-   inode_t *latest = in->get_projected_inode();
--  uint64_t ms = ROUND_UP_TO((size+1)<<1, latest->get_layout_size_increment());
-+  uint64_t ms;
-+  if(latest->has_layout()) {
-+    ms = ROUND_UP_TO((size+1)<<1, latest->get_layout_size_increment());
-+  } else {
-+    // Layout-less directories like ~mds0/, have zero size
-+    ms = 0;
-+  }
- 
-   // increase ranges as appropriate.
-   // shrink to 0 if no WR|BUFFER caps issued.
-   for (map<client_t,Capability*>::iterator p = in->client_caps.begin();
---- a/src/mds/MDCache.cc
-+++ b/src/mds/MDCache.cc
-@@ -348,8 +348,9 @@
-   rootdir->fnode.accounted_rstat = rootdir->fnode.rstat;
- 
-   root->inode.dirstat = rootdir->fnode.fragstat;
-   root->inode.rstat = rootdir->fnode.rstat;
-+  ++root->inode.rstat.rsubdirs;
-   root->inode.accounted_rstat = root->inode.rstat;
- 
-   rootdir->mark_complete();
-   rootdir->mark_dirty(rootdir->pre_dirty(), mds->mdlog->get_current_segment());
-@@ -398,8 +399,9 @@
-   mydir->fnode.accounted_rstat = mydir->fnode.rstat;
- 
-   myin->inode.dirstat = mydir->fnode.fragstat;
-   myin->inode.rstat = mydir->fnode.rstat;
-+  ++myin->inode.rstat.rsubdirs;
-   myin->inode.accounted_rstat = myin->inode.rstat;
- 
- 
-   mydir->mark_complete();
---- a/src/messages/MOSDSubOp.h
-+++ b/src/messages/MOSDSubOp.h
-@@ -24,9 +24,9 @@
-  */
- 
- class MOSDSubOp : public Message {
- 
--  static const int HEAD_VERSION = 10;
-+  static const int HEAD_VERSION = 11;
-   static const int COMPAT_VERSION = 1;
- 
- public:
-   epoch_t map_epoch;
-@@ -62,8 +62,10 @@
-   eversion_t version;
- 
-   // piggybacked osd/og state
-   eversion_t pg_trim_to;   // primary->replica: trim to here
-+  eversion_t pg_trim_rollback_to;   // primary->replica: trim rollback
-+                                    // info to here
-   osd_peer_stat_t peer_stat;
- 
-   map<string,bufferlist> attrset;
- 
-@@ -174,8 +176,13 @@
-     }
-     if (header.version >= 10) {
-       ::decode(updated_hit_set_history, p);
-     }
-+    if (header.version >= 11) {
-+      ::decode(pg_trim_rollback_to, p);
-+    } else {
-+      pg_trim_rollback_to = pg_trim_to;
-+    }
-   }
- 
-   virtual void encode_payload(uint64_t features) {
-     ::encode(map_epoch, payload);
-@@ -223,8 +230,9 @@
-     ::encode(discard_temp_oid, payload);
-     ::encode(from, payload);
-     ::encode(pgid.shard, payload);
-     ::encode(updated_hit_set_history, payload);
-+    ::encode(pg_trim_rollback_to, payload);
-   }
- 
-   MOSDSubOp()
-     : Message(MSG_OSD_SUBOP, HEAD_VERSION, COMPAT_VERSION) { }
---- a/src/mon/DataHealthService.cc
-+++ b/src/mon/DataHealthService.cc
-@@ -227,9 +227,9 @@
-   if (ours.latest_avail_percent <= g_conf->mon_data_avail_warn) {
-     if (ours.latest_avail_percent != last_warned_percent)
-       mon->clog.warn()
- 	<< "reached concerning levels of available space on local monitor storage"
--	<< " (" << ours.latest_avail_percent << "\% free)\n";
-+	<< " (" << ours.latest_avail_percent << "% free)\n";
-     last_warned_percent = ours.latest_avail_percent;
-   } else {
-     last_warned_percent = 0;
-   }
---- a/src/mon/MonCommands.h
-+++ b/src/mon/MonCommands.h
-@@ -551,9 +551,9 @@
- 	"name=destpool,type=CephPoolname", \
- 	"rename <srcpool> to <destpool>", "osd", "rw", "cli,rest")
- COMMAND("osd pool get " \
- 	"name=pool,type=CephPoolname " \
--	"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid", \
-+	"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile", \
- 	"get pool parameter <var>", "osd", "r", "cli,rest")
- COMMAND("osd pool set " \
- 	"name=pool,type=CephPoolname " \
- 	"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid " \
-@@ -567,8 +567,12 @@
- 	"name=pool,type=CephPoolname " \
- 	"name=field,type=CephChoices,strings=max_objects|max_bytes " \
- 	"name=val,type=CephString",
- 	"set object or byte limit on pool", "osd", "rw", "cli,rest")
-+COMMAND("osd pool get-quota " \
-+        "name=pool,type=CephPoolname ",
-+        "obtain object or byte limits for pool",
-+        "osd", "r", "cli,rest")
- COMMAND("osd pool stats " \
-         "name=name,type=CephString,req=false",
-         "obtain stats from all pools, or from specified pool",
-         "osd", "r", "cli,rest")
---- a/src/mon/Monitor.cc
-+++ b/src/mon/Monitor.cc
-@@ -620,8 +620,23 @@
- 
- void Monitor::refresh_from_paxos(bool *need_bootstrap)
- {
-   dout(10) << __func__ << dendl;
-+
-+  bufferlist bl;
-+  int r = store->get(MONITOR_NAME, "cluster_fingerprint", bl);
-+  if (r >= 0) {
-+    try {
-+      bufferlist::iterator p = bl.begin();
-+      ::decode(fingerprint, p);
-+    }
-+    catch (buffer::error& e) {
-+      dout(10) << __func__ << " failed to decode cluster_fingerprint" << dendl;
-+    }
-+  } else {
-+    dout(10) << __func__ << " no cluster_fingerprint" << dendl;
-+  }
-+
-   for (int i = 0; i < PAXOS_NUM; ++i) {
-     paxos_service[i]->refresh(need_bootstrap);
-   }
-   for (int i = 0; i < PAXOS_NUM; ++i) {
-@@ -2392,8 +2407,9 @@
-     // this must be formatted, in its current form
-     if (!f)
-       f.reset(new_formatter("json-pretty"));
-     f->open_object_section("report");
-+    f->dump_stream("cluster_fingerprint") << fingerprint;
-     f->dump_string("version", ceph_version_to_str());
-     f->dump_string("commit", git_version_to_str());
-     f->dump_stream("timestamp") << ceph_clock_now(NULL);
- 
-@@ -2865,10 +2881,11 @@
-         // let it go through and be dispatched immediately!
-         return dispatch(s, m, false);
-       }
-       dout(1) << __func__ << " dropping stray message " << *m
--        << " from " << m->get_source_inst() << dendl;
--      return false;
-+	      << " from " << m->get_source_inst() << dendl;
-+      m->put();
-+      return true;
-     }
- 
-     if (!exited_quorum.is_zero() && !src_is_mon) {
-       waitlist_or_zap_client(m);
-@@ -3846,11 +3863,31 @@
-   if (!maybe_wait_for_quorum.empty()) {
-     finish_contexts(g_ceph_context, maybe_wait_for_quorum);
-   }
- 
-+  if (is_leader() && paxos->is_active() && fingerprint.is_zero()) {
-+    // this is only necessary on upgraded clusters.
-+    MonitorDBStore::Transaction t;
-+    prepare_new_fingerprint(&t);
-+    bufferlist tbl;
-+    t.encode(tbl);
-+    paxos->propose_new_value(tbl, new C_NoopContext);
-+  }
-+
-   new_tick();
- }
- 
-+void Monitor::prepare_new_fingerprint(MonitorDBStore::Transaction *t)
-+{
-+  uuid_d nf;
-+  nf.generate_random();
-+  dout(10) << __func__ << " proposing cluster_fingerprint " << nf << dendl;
-+
-+  bufferlist bl;
-+  ::encode(nf, bl);
-+  t->put(MONITOR_NAME, "cluster_fingerprint", bl);
-+}
-+
- int Monitor::check_fsid()
- {
-   if (!store->exists(MONITOR_NAME, "cluster_uuid"))
-     return -ENOENT;
---- a/src/mon/Monitor.h
-+++ b/src/mon/Monitor.h
-@@ -127,8 +127,9 @@
-   void register_cluster_logger();
-   void unregister_cluster_logger();
- 
-   MonMap *monmap;
-+  uuid_d fingerprint;
- 
-   set<entity_addr_t> extra_probe_peers;
- 
-   LogClient clog;
-@@ -189,8 +190,10 @@
-   bool is_peon() const { return state == STATE_PEON; }
- 
-   const utime_t &get_leader_since() const;
- 
-+  void prepare_new_fingerprint(MonitorDBStore::Transaction *t);
-+
-   // -- elector --
- private:
-   Paxos *paxos;
-   Elector elector;
---- a/src/mon/MonmapMonitor.cc
-+++ b/src/mon/MonmapMonitor.cc
-@@ -96,8 +96,13 @@
-   pending_map.encode(bl, mon->get_quorum_features());
- 
-   put_version(t, pending_map.epoch, bl);
-   put_last_committed(t, pending_map.epoch);
-+
-+  // generate a cluster fingerprint, too?
-+  if (pending_map.epoch == 1) {
-+    mon->prepare_new_fingerprint(t);
-+  }
- }
- 
- void MonmapMonitor::on_active()
- {
---- a/src/mon/OSDMonitor.cc
-+++ b/src/mon/OSDMonitor.cc
-@@ -2066,8 +2066,34 @@
- 	}
-       }
-     }
- 
-+    // hit_set-less cache_mode?
-+    if (g_conf->mon_warn_on_cache_pools_without_hit_sets) {
-+      int problem_cache_pools = 0;
-+      for (map<int64_t, pg_pool_t>::const_iterator p = osdmap.pools.begin();
-+	   p != osdmap.pools.end();
-+	   ++p) {
-+	const pg_pool_t& info = p->second;
-+	if (info.cache_mode_requires_hit_set() &&
-+	    info.hit_set_params.get_type() == HitSet::TYPE_NONE) {
-+	  ++problem_cache_pools;
-+	  if (detail) {
-+	    ostringstream ss;
-+	    ss << "pool '" << osdmap.get_pool_name(p->first)
-+	       << "' with cache_mode " << info.get_cache_mode_name()
-+	       << " needs hit_set_type to be set but it is not";
-+	    detail->push_back(make_pair(HEALTH_WARN, ss.str()));
-+	  }
-+	}
-+      }
-+      if (problem_cache_pools) {
-+	ostringstream ss;
-+	ss << problem_cache_pools << " cache pools are missing hit_sets";
-+	summary.push_back(make_pair(HEALTH_WARN, ss.str()));
-+      }
-+    }
-+
-     // Warn if 'mon_osd_down_out_interval' is set to zero.
-     // Having this option set to zero on the leader acts much like the
-     // 'noout' flag.  It's hard to figure out what's going wrong with clusters
-     // without the 'noout' flag set but acting like that just the same, so
-@@ -2452,8 +2478,28 @@
-     const pg_pool_t *p = osdmap.get_pg_pool(pool);
-     string var;
-     cmd_getval(g_ceph_context, cmdmap, "var", var);
- 
-+    if (!p->is_tier() &&
-+        (var == "hit_set_type" || var == "hit_set_period" ||
-+         var == "hit_set_count" || var == "hit_set_fpp" ||
-+         var == "target_max_objects" || var == "target_max_bytes" ||
-+         var == "cache_target_full_ratio" ||
-+         var == "cache_target_dirty_ratio" ||
-+         var == "cache_min_flush_age" || var == "cache_min_evict_age")) {
-+      ss << "pool '" << poolstr
-+         << "' is not a tier pool: variable not applicable";
-+      r = -EACCES;
-+      goto reply;
-+    }
-+
-+    if (!p->is_erasure() && var == "erasure_code_profile") {
-+      ss << "pool '" << poolstr
-+         << "' is not a erasure pool: variable not applicable";
-+      r = -EACCES;
-+      goto reply;
-+    }
-+
-     if (f) {
-       f->open_object_section("pool");
-       f->dump_string("pool", poolstr);
-       f->dump_int("pool_id", pool);
-@@ -2487,8 +2533,28 @@
- 	} else {
- 	  BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
- 	  f->dump_float("hit_set_fpp", bloomp->get_fpp());
- 	}
-+      } else if (var == "target_max_objects") {
-+        f->dump_unsigned("target_max_objects", p->target_max_objects);
-+      } else if (var == "target_max_bytes") {
-+        f->dump_unsigned("target_max_bytes", p->target_max_bytes);
-+      } else if (var == "cache_target_dirty_ratio") {
-+        f->dump_unsigned("cache_target_dirty_ratio_micro",
-+                         p->cache_target_dirty_ratio_micro);
-+        f->dump_float("cache_target_dirty_ratio",
-+                      ((float)p->cache_target_dirty_ratio_micro/1000000));
-+      } else if (var == "cache_target_full_ratio") {
-+        f->dump_unsigned("cache_target_full_ratio_micro",
-+                         p->cache_target_full_ratio_micro);
-+        f->dump_float("cache_target_full_ratio",
-+                      ((float)p->cache_target_full_ratio_micro/1000000));
-+      } else if (var == "cache_min_flush_age") {
-+        f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
-+      } else if (var == "cache_min_evict_age") {
-+        f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
-+      } else if (var == "erasure_code_profile") {
-+       f->dump_string("erasure_code_profile", p->erasure_code_profile);
-       }
- 
-       f->close_section();
-       f->flush(rdata);
-@@ -2520,9 +2586,26 @@
- 	  goto reply;
- 	}
- 	BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
- 	ss << "hit_set_fpp: " << bloomp->get_fpp();
-+      } else if (var == "target_max_objects") {
-+        ss << "target_max_objects: " << p->target_max_objects;
-+      } else if (var == "target_max_bytes") {
-+        ss << "target_max_bytes: " << p->target_max_bytes;
-+      } else if (var == "cache_target_dirty_ratio") {
-+        ss << "cache_target_dirty_ratio: "
-+          << ((float)p->cache_target_dirty_ratio_micro/1000000);
-+      } else if (var == "cache_target_full_ratio") {
-+        ss << "cache_target_full_ratio: "
-+          << ((float)p->cache_target_full_ratio_micro/1000000);
-+      } else if (var == "cache_min_flush_age") {
-+        ss << "cache_min_flush_age: " << p->cache_min_flush_age;
-+      } else if (var == "cache_min_evict_age") {
-+        ss << "cache_min_evict_age: " << p->cache_min_evict_age;
-+      } else if (var == "erasure_code_profile") {
-+       ss << "erasure_code_profile: " << p->erasure_code_profile;
-       }
-+
-       rdata.append(ss);
-       ss.str("");
-     }
-     r = 0;
-@@ -2625,8 +2708,47 @@
-     }
-     rdata.append("\n");
-     r = 0;
- 
-+  } else if (prefix == "osd pool get-quota") {
-+    string pool_name;
-+    cmd_getval(g_ceph_context, cmdmap, "pool", pool_name);
-+
-+    int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
-+    if (poolid < 0) {
-+      assert(poolid == -ENOENT);
-+      ss << "unrecognized pool '" << pool_name << "'";
-+      r = -ENOENT;
-+      goto reply;
-+    }
-+    const pg_pool_t *p = osdmap.get_pg_pool(poolid);
-+
-+    if (f) {
-+      f->open_object_section("pool_quotas");
-+      f->dump_string("pool_name", pool_name);
-+      f->dump_unsigned("pool_id", poolid);
-+      f->dump_unsigned("quota_max_objects", p->quota_max_objects);
-+      f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
-+      f->close_section();
-+      f->flush(rdata);
-+    } else {
-+      stringstream rs;
-+      rs << "quotas for pool '" << pool_name << "':\n"
-+         << "  max objects: ";
-+      if (p->quota_max_objects == 0)
-+        rs << "N/A";
-+      else
-+        rs << si_t(p->quota_max_objects) << " objects";
-+      rs << "\n"
-+         << "  max bytes  : ";
-+      if (p->quota_max_bytes == 0)
-+        rs << "N/A";
-+      else
-+        rs << si_t(p->quota_max_bytes) << "B";
-+      rdata.append(rs.str());
-+    }
-+    rdata.append("\n");
-+    r = 0;
-   } else if (prefix == "osd crush rule list" ||
- 	     prefix == "osd crush rule ls") {
-     string format;
-     cmd_getval(g_ceph_context, cmdmap, "format", format, string("json-pretty"));
-@@ -2924,17 +3046,20 @@
- 					     const string &profile,
- 					     int *ruleset,
- 					     stringstream &ss)
- {
--  *ruleset = osdmap.crush->get_rule_id(name);
--  if (*ruleset != -ENOENT)
-+  int ruleid = osdmap.crush->get_rule_id(name);
-+  if (ruleid != -ENOENT) {
-+    *ruleset = osdmap.crush->get_rule_mask_ruleset(ruleid);
-     return -EEXIST;
-+  }
- 
-   CrushWrapper newcrush;
-   _get_pending_crush(newcrush);
- 
--  *ruleset = newcrush.get_rule_id(name);
--  if (*ruleset != -ENOENT) {
-+  ruleid = newcrush.get_rule_id(name);
-+  if (ruleid != -ENOENT) {
-+    *ruleset = newcrush.get_rule_mask_ruleset(ruleid);
-     return -EALREADY;
-   } else {
-     ErasureCodeInterfaceRef erasure_code;
-     int err = get_erasure_code(profile, &erasure_code, ss);
-@@ -3088,22 +3213,25 @@
- }
- 
- int OSDMonitor::prepare_pool_size(const unsigned pool_type,
- 				  const string &erasure_code_profile,
--				  unsigned *size,
-+				  unsigned *size, unsigned *min_size,
- 				  stringstream &ss)
- {
-   int err = 0;
-   switch (pool_type) {
-   case pg_pool_t::TYPE_REPLICATED:
-     *size = g_conf->osd_pool_default_size;
-+    *min_size = g_conf->get_osd_pool_default_min_size();
-     break;
-   case pg_pool_t::TYPE_ERASURE:
-     {
-       ErasureCodeInterfaceRef erasure_code;
-       err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
--      if (err == 0)
-+      if (err == 0) {
- 	*size = erasure_code->get_chunk_count();
-+	*min_size = erasure_code->get_data_chunk_count();
-+      }
-     }
-     break;
-   default:
-     ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
-@@ -3218,10 +3346,10 @@
-   r = prepare_pool_crush_ruleset(pool_type, erasure_code_profile,
- 				 crush_ruleset_name, &crush_ruleset, ss);
-   if (r)
-     return r;
--  unsigned size;
--  r = prepare_pool_size(pool_type, erasure_code_profile, &size, ss);
-+  unsigned size, min_size;
-+  r = prepare_pool_size(pool_type, erasure_code_profile, &size, &min_size, ss);
-   if (r)
-     return r;
-   uint32_t stripe_width = 0;
-   r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
-@@ -3245,9 +3373,9 @@
-   if (g_conf->osd_pool_default_flag_hashpspool)
-     pi->flags |= pg_pool_t::FLAG_HASHPSPOOL;
- 
-   pi->size = size;
--  pi->min_size = g_conf->get_osd_pool_default_min_size();
-+  pi->min_size = min_size;
-   pi->crush_ruleset = crush_ruleset;
-   pi->object_hash = CEPH_STR_HASH_RJENKINS;
-   pi->set_pg_num(pg_num ? pg_num : g_conf->osd_pool_default_pg_num);
-   pi->set_pgp_num(pgp_num ? pgp_num : g_conf->osd_pool_default_pgp_num);
-@@ -3335,8 +3463,9 @@
-   string val;
-   string interr, floaterr;
-   int64_t n = 0;
-   double f = 0;
-+  int64_t uf = 0;  // micro-f
-   if (!cmd_getval(g_ceph_context, cmdmap, "val", val)) {
-     // wasn't a string; maybe an older mon forwarded json with an int?
-     if (!cmd_getval(g_ceph_context, cmdmap, "val", n))
-       return -EINVAL;  // no value!
-@@ -3344,8 +3473,19 @@
-     // we got a string.  see if it contains an int.
-     n = strict_strtoll(val.c_str(), 10, &interr);
-     // or a float
-     f = strict_strtod(val.c_str(), &floaterr);
-+    uf = llrintl(f * (double)1000000.0);
-+  }
-+
-+  if (!p.is_tier() &&
-+      (var == "hit_set_type" || var == "hit_set_period" ||
-+       var == "hit_set_count" || var == "hit_set_fpp" ||
-+       var == "target_max_objects" || var == "target_max_bytes" ||
-+       var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
-+       var == "cache_min_flush_age" || var == "cache_min_evict_age")) {
-+    ss << "pool '" << poolstr << "' is not a tier pool: variable not applicable";
-+    return -EACCES;
-   }
- 
-   if (var == "size") {
-     if (p.type == pg_pool_t::TYPE_ERASURE) {
-@@ -3398,9 +3538,9 @@
- 	force != "--yes-i-really-mean-it") {
-       ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling.  use --yes-i-really-mean-it to force.";
-       return -EPERM;
-     }
--    int expected_osds = MIN(p.get_pg_num(), osdmap.get_num_osds());
-+    int expected_osds = MAX(1, MIN(p.get_pg_num(), osdmap.get_num_osds()));
-     int64_t new_pgs = n - p.get_pg_num();
-     int64_t pgs_per_osd = new_pgs / expected_osds;
-     if (pgs_per_osd > g_conf->mon_osd_max_split_count) {
-       ss << "specified pg_num " << n << " is too large (creating "
-@@ -3486,8 +3626,9 @@
-       return -EINVAL;
-     }
-     p.hit_set_period = n;
-   } else if (var == "hit_set_count") {
-+
-     if (interr.length()) {
-       ss << "error parsing integer value '" << val << "': " << interr;
-       return -EINVAL;
-     }
-@@ -3527,9 +3668,9 @@
-     if (f < 0 || f > 1.0) {
-       ss << "value must be in the range 0..1";
-       return -ERANGE;
-     }
--    p.cache_target_dirty_ratio_micro = f * 1000000;
-+    p.cache_target_dirty_ratio_micro = uf;
-   } else if (var == "cache_target_full_ratio") {
-     if (floaterr.length()) {
-       ss << "error parsing float '" << val << "': " << floaterr;
-       return -EINVAL;
-@@ -3537,9 +3678,9 @@
-     if (f < 0 || f > 1.0) {
-       ss << "value must be in the range 0..1";
-       return -ERANGE;
-     }
--    p.cache_target_full_ratio_micro = f * 1000000;
-+    p.cache_target_full_ratio_micro = uf;
-   } else if (var == "cache_min_flush_age") {
-     if (interr.length()) {
-       ss << "error parsing int '" << val << "': " << interr;
-       return -EINVAL;
-@@ -4171,8 +4312,26 @@
-     string profile;
-     cmd_getval(g_ceph_context, cmdmap, "profile", profile);
-     if (profile == "")
-       profile = "default";
-+    if (profile == "default") {
-+      if (!osdmap.has_erasure_code_profile(profile)) {
-+	if (pending_inc.has_erasure_code_profile(profile)) {
-+	  dout(20) << "erasure code profile " << profile << " already pending" << dendl;
-+	  goto wait;
-+	}
-+
-+	map<string,string> profile_map;
-+	err = osdmap.get_erasure_code_profile_default(g_ceph_context,
-+						      profile_map,
-+						      &ss);
-+	if (err)
-+	  goto reply;
-+	dout(20) << "erasure code profile " << profile << " set" << dendl;
-+	pending_inc.set_erasure_code_profile(profile, profile_map);
-+	goto wait;
-+      }
-+    }
- 
-     int ruleset;
-     err = crush_ruleset_create_erasure(name, profile, &ruleset, ss);
-     if (err < 0) {
-@@ -4846,8 +5005,27 @@
-     string erasure_code_profile;
-     cmd_getval(g_ceph_context, cmdmap, "erasure_code_profile", erasure_code_profile);
-     if (erasure_code_profile == "")
-       erasure_code_profile = "default";
-+    if (erasure_code_profile == "default") {
-+      if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
-+	if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
-+	  dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
-+	  goto wait;
-+	}
-+
-+	map<string,string> profile_map;
-+	err = osdmap.get_erasure_code_profile_default(g_ceph_context,
-+						      profile_map,
-+						      &ss);
-+	if (err)
-+	  goto reply;
-+	dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
-+	pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
-+	goto wait;
-+      }
-+    }
-+
-     if (ruleset_name == "") {
-       if (erasure_code_profile == "default") {
- 	ruleset_name = "erasure-code";
-       } else {
-@@ -5053,9 +5231,12 @@
-       err = 0;
-       goto reply;
-     }
-     if (tp->tier_of != pool_id) {
--      ss << "tier pool '" << tierpoolstr << "' is a tier of '" << tp->tier_of << "'";
-+      ss << "tier pool '" << tierpoolstr << "' is a tier of '"
-+         << osdmap.get_pool_name(tp->tier_of) << "': "
-+         // be scary about it; this is an inconsistency and bells must go off
-+         << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
-       err = -EINVAL;
-       goto reply;
-     }
-     if (p->read_tier == tierpool_id) {
-@@ -5181,10 +5362,69 @@
-       ss << "'" << modestr << "' is not a valid cache mode";
-       err = -EINVAL;
-       goto reply;
-     }
-+
-+    // pool already has this cache-mode set and there are no pending changes
-+    if (p->cache_mode == mode &&
-+	(pending_inc.new_pools.count(pool_id) == 0 ||
-+	 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
-+      ss << "set cache-mode for pool '" << poolstr << "'"
-+         << " to " << pg_pool_t::get_cache_mode_name(mode);
-+      err = 0;
-+      goto reply;
-+    }
-+
-+    /* Mode description:
-+     *
-+     *  none:       No cache-mode defined
-+     *  forward:    Forward all reads and writes to base pool
-+     *  writeback:  Cache writes, promote reads from base pool
-+     *  readonly:   Forward writes to base pool
-+     *
-+     * Hence, these are the allowed transitions:
-+     *
-+     *  none -> any
-+     *  forward -> writeback || any IF num_objects_dirty == 0
-+     *  writeback -> forward
-+     *  readonly -> any
-+     */
-+
-+    // We check if the transition is valid against the current pool mode, as
-+    // it is the only committed state thus far.  We will blantly squash
-+    // whatever mode is on the pending state.
-+
-+    if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
-+        mode != pg_pool_t::CACHEMODE_FORWARD) {
-+      ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
-+         << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
-+         << "' pool; only '"
-+         << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_FORWARD)
-+        << "' allowed.";
-+      err = -EINVAL;
-+      goto reply;
-+    }
-+    if (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
-+               mode != pg_pool_t::CACHEMODE_WRITEBACK) {
-+
-+      const pool_stat_t& tier_stats =
-+        mon->pgmon()->pg_map.get_pg_pool_sum_stat(pool_id);
-+
-+      if (tier_stats.stats.sum.num_objects_dirty > 0) {
-+        ss << "unable to set cache-mode '"
-+           << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
-+           << "': dirty objects found";
-+        err = -EBUSY;
-+        goto reply;
-+      }
-+    }
-+
-     // go
--    pending_inc.get_new_pool(pool_id, p)->cache_mode = mode;
-+    pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
-+    np->cache_mode = mode;
-+    // set this both when moving to and from cache_mode NONE.  this is to
-+    // capture legacy pools that were set up before this flag existed.
-+    np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
-     ss << "set cache-mode for pool '" << poolstr
- 	<< "' to " << pg_pool_t::get_cache_mode_name(mode);
-     wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, ss.str(),
- 					      get_last_committed() + 1));
-@@ -5622,10 +5862,14 @@
- 	<< osdmap.get_pool_name(p->tier_of) << "'";
-     return -EBUSY;
-   }
-   if (!p->tiers.empty()) {
--    *ss << "pool '" << poolstr << "' includes tiers "
--	<< p->tiers;
-+    *ss << "pool '" << poolstr << "' has tiers";
-+    for(std::set<uint64_t>::iterator i = p->tiers.begin(); i != p->tiers.end(); ++i) {
-+      const char *name = osdmap.get_pool_name(*i);
-+      assert(name != NULL);
-+      *ss << " " << name;
-+    }
-     return -EBUSY;
-   }
-   *ss << "pool '" << poolstr << "' removed";
-   return 0;
---- a/src/mon/OSDMonitor.h
-+++ b/src/mon/OSDMonitor.h
-@@ -271,9 +271,9 @@
- 				 map<string,string> *erasure_code_profile_map,
- 				 stringstream &ss);
-   int prepare_pool_size(const unsigned pool_type,
- 			const string &erasure_code_profile,
--			unsigned *size,
-+			unsigned *size, unsigned *min_size,
- 			stringstream &ss);
-   int prepare_pool_stripe_width(const unsigned pool_type,
- 				const string &erasure_code_profile,
- 				unsigned *stripe_width,
---- a/src/mon/PGMonitor.cc
-+++ b/src/mon/PGMonitor.cc
-@@ -1214,13 +1214,15 @@
- }
- 
- //void PGMonitor::dump_object_stat_sum(stringstream& ss, Formatter *f,
- void PGMonitor::dump_object_stat_sum(TextTable &tbl, Formatter *f,
--    object_stat_sum_t &sum, bool verbose)
-+				     object_stat_sum_t &sum, uint64_t avail,
-+				     bool verbose)
- {
-   if (f) {
-     f->dump_int("kb_used", SHIFT_ROUND_UP(sum.num_bytes, 10));
-     f->dump_int("bytes_used", sum.num_bytes);
-+    f->dump_unsigned("max_avail", avail);
-     f->dump_int("objects", sum.num_objects);
-     if (verbose) {
-       f->dump_int("dirty", sum.num_objects_dirty);
-       f->dump_int("rd", sum.num_rd);
-@@ -1231,8 +1233,9 @@
-   } else {
-     tbl << stringify(si_t(sum.num_bytes));
-     int64_t kb_used = SHIFT_ROUND_UP(sum.num_bytes, 10);
-     tbl << percentify(((float)kb_used / pg_map.osd_sum.kb)*100);
-+    tbl << si_t(avail);
-     tbl << sum.num_objects;
-     if (verbose) {
-       tbl << stringify(si_t(sum.num_objects_dirty))
- 	  << stringify(si_t(sum.num_rd))
-@@ -1240,8 +1243,26 @@
-     }
-   }
- }
- 
-+int64_t PGMonitor::get_rule_avail(OSDMap& osdmap, int ruleno)
-+{
-+  map<int,float> wm;
-+  int r = osdmap.crush->get_rule_weight_osd_map(ruleno, &wm);
-+  if (r < 0)
-+    return r;
-+  if(wm.size() == 0)
-+    return 0;
-+  int64_t min = -1;
-+  for (map<int,float>::iterator p = wm.begin(); p != wm.end(); ++p) {
-+    int64_t proj = (float)(pg_map.osd_stat[p->first].kb_avail * 1024ull) /
-+      (double)p->second;
-+    if (min < 0 || proj < min)
-+      min = proj;
-+  }
-+  return min;
-+}
-+
- void PGMonitor::dump_pool_stats(stringstream &ss, Formatter *f, bool verbose)
- {
-   TextTable tbl;
- 
-@@ -1251,18 +1272,20 @@
-     tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
-     tbl.define_column("ID", TextTable::LEFT, TextTable::LEFT);
-     if (verbose)
-       tbl.define_column("CATEGORY", TextTable::LEFT, TextTable::LEFT);
--    tbl.define_column("USED", TextTable::LEFT, TextTable::LEFT);
--    tbl.define_column("\%USED", TextTable::LEFT, TextTable::LEFT);
--    tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::LEFT);
--    if (verbose) {
--      tbl.define_column("DIRTY", TextTable::LEFT, TextTable::LEFT);
--      tbl.define_column("READ", TextTable::LEFT, TextTable::LEFT);
--      tbl.define_column("WRITE", TextTable::LEFT, TextTable::LEFT);
-+    tbl.define_column("USED", TextTable::LEFT, TextTable::RIGHT);
-+    tbl.define_column("%USED", TextTable::LEFT, TextTable::RIGHT);
-+    tbl.define_column("MAX AVAIL", TextTable::LEFT, TextTable::RIGHT);
-+    tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
-+    if (verbose) {
-+      tbl.define_column("DIRTY", TextTable::LEFT, TextTable::RIGHT);
-+      tbl.define_column("READ", TextTable::LEFT, TextTable::RIGHT);
-+      tbl.define_column("WRITE", TextTable::LEFT, TextTable::RIGHT);
-     }
-   }
- 
-+  map<int,uint64_t> avail_by_rule;
-   OSDMap &osdmap = mon->osdmon()->osdmap;
-   for (map<int64_t,pg_pool_t>::const_iterator p = osdmap.get_pools().begin();
-        p != osdmap.get_pools().end(); ++p) {
-     int64_t pool_id = p->first;
-@@ -1270,8 +1293,40 @@
-       continue;
-     string pool_name = osdmap.get_pool_name(pool_id);
-     pool_stat_t &stat = pg_map.pg_pool_sum[pool_id];
- 
-+    const pg_pool_t *pool = osdmap.get_pg_pool(pool_id);
-+    int ruleno = osdmap.crush->find_rule(pool->get_crush_ruleset(),
-+					 pool->get_type(),
-+					 pool->get_size());
-+    uint64_t avail;
-+    if (avail_by_rule.count(ruleno) == 0) {
-+      avail = get_rule_avail(osdmap, ruleno);
-+      avail_by_rule[ruleno] = avail;
-+    } else {
-+      avail = avail_by_rule[ruleno];
-+    }
-+    switch (pool->get_type()) {
-+    case pg_pool_t::TYPE_REPLICATED:
-+      avail /= pool->get_size();
-+      break;
-+    case pg_pool_t::TYPE_ERASURE:
-+      {
-+	const map<string,string>& ecp =
-+	  osdmap.get_erasure_code_profile(pool->erasure_code_profile);
-+	map<string,string>::const_iterator pm = ecp.find("m");
-+	map<string,string>::const_iterator pk = ecp.find("k");
-+	if (pm != ecp.end() && pk != ecp.end()) {
-+	  int k = atoi(pk->second.c_str());
-+	  int m = atoi(pm->second.c_str());
-+	  avail = avail * k / (m + k);
-+	}
-+      }
-+      break;
-+    default:
-+      assert(0 == "unrecognized pool type");
-+    }
-+
-     if (f) {
-       f->open_object_section("pool");
-       f->dump_string("name", pool_name);
-       f->dump_int("id", pool_id);
-@@ -1281,9 +1336,9 @@
-           << pool_id;
-       if (verbose)
-         tbl << "-";
-     }
--    dump_object_stat_sum(tbl, f, stat.stats.sum, verbose);
-+    dump_object_stat_sum(tbl, f, stat.stats.sum, avail, verbose);
-     if (f)
-       f->close_section(); // stats
-     else
-       tbl << TextTable::endrow;
-@@ -1300,9 +1355,9 @@
-           tbl << ""
-               << ""
-               << it->first;
-         }
--        dump_object_stat_sum(tbl, f, it->second, verbose);
-+        dump_object_stat_sum(tbl, f, it->second, avail, verbose);
-         if (f)
-           f->close_section(); // category name
-         else
-           tbl << TextTable::endrow;
-@@ -1334,14 +1389,14 @@
-     }
-     f->close_section();
-   } else {
-     TextTable tbl;
--    tbl.define_column("SIZE", TextTable::LEFT, TextTable::LEFT);
--    tbl.define_column("AVAIL", TextTable::LEFT, TextTable::LEFT);
--    tbl.define_column("RAW USED", TextTable::LEFT, TextTable::LEFT);
--    tbl.define_column("\%RAW USED", TextTable::LEFT, TextTable::LEFT);
-+    tbl.define_column("SIZE", TextTable::LEFT, TextTable::RIGHT);
-+    tbl.define_column("AVAIL", TextTable::LEFT, TextTable::RIGHT);
-+    tbl.define_column("RAW USED", TextTable::LEFT, TextTable::RIGHT);
-+    tbl.define_column("%RAW USED", TextTable::LEFT, TextTable::RIGHT);
-     if (verbose) {
--      tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::LEFT);
-+      tbl.define_column("OBJECTS", TextTable::LEFT, TextTable::RIGHT);
-     }
-     tbl << stringify(si_t(pg_map.osd_sum.kb*1024))
-         << stringify(si_t(pg_map.osd_sum.kb_avail*1024))
-         << stringify(si_t(pg_map.osd_sum.kb_used*1024));
---- a/src/mon/PGMonitor.h
-+++ b/src/mon/PGMonitor.h
-@@ -145,9 +145,13 @@
- 			  int threshold,
- 			  vector<string>& args) const;
- 
-   void dump_object_stat_sum(TextTable &tbl, Formatter *f,
--                            object_stat_sum_t &sum, bool verbose);
-+                            object_stat_sum_t &sum,
-+			    uint64_t avail,
-+			    bool verbose);
-+
-+  int64_t get_rule_avail(OSDMap& osdmap, int ruleno);
- 
- public:
-   PGMonitor(Monitor *mn, Paxos *p, const string& service_name)
-     : PaxosService(mn, p, service_name),
---- a/src/mon/Paxos.cc
-+++ b/src/mon/Paxos.cc
-@@ -1263,9 +1263,10 @@
- // -- READ --
- 
- bool Paxos::is_readable(version_t v)
- {
--  dout(1) << "is_readable now=" << ceph_clock_now(g_ceph_context) << " lease_expire=" << lease_expire
-+  dout(5) << "is_readable now=" << ceph_clock_now(g_ceph_context)
-+	  << " lease_expire=" << lease_expire
- 	  << " has v" << v << " lc " << last_committed << dendl;
-   if (v > last_committed)
-     return false;
-   return 
---- a/src/msg/SimpleMessenger.cc
-+++ b/src/msg/SimpleMessenger.cc
-@@ -85,8 +85,11 @@
- {
-   ldout(cct,10) << "shutdown " << get_myaddr() << dendl;
-   mark_down_all();
-   dispatch_queue.shutdown();
-+
-+  // break ref cycles on the loopback connection
-+  local_connection->set_priv(NULL);
-   return 0;
- }
- 
- int SimpleMessenger::_send_message(Message *m, const entity_inst_t& dest,
---- a/src/os/FileJournal.cc
-+++ b/src/os/FileJournal.cc
-@@ -1757,9 +1757,14 @@
- 
-   // ok!
-   if (seq)
-     *seq = h->seq;
--  journalq.push_back(pair<uint64_t,off64_t>(h->seq, pos));
-+
-+  // works around an apparent GCC 4.8(?) compiler bug about unaligned
-+  // bind by reference to (packed) h->seq
-+  journalq.push_back(
-+    pair<uint64_t,off64_t>(static_cast<uint64_t>(h->seq),
-+			   static_cast<off64_t>(pos)));
- 
-   if (next_pos)
-     *next_pos = pos;
- 
---- a/src/os/FileStore.cc
-+++ b/src/os/FileStore.cc
-@@ -125,9 +125,9 @@
-   PerfCounters &logger)
- {
-   os_commit_latency.consume_next(
-     logger.get_tavg_ms(
--      l_os_commit_lat));
-+      l_os_j_lat));
-   os_apply_latency.consume_next(
-     logger.get_tavg_ms(
-       l_os_apply_lat));
- }
-@@ -1557,8 +1557,10 @@
-     delete backend;
-     backend = generic_backend;
-   }
- 
-+  force_sync = false;
-+
-   object_map.reset();
- 
-   {
-     Mutex::Locker l(sync_entry_timeo_lock);
-@@ -1710,9 +1712,10 @@
- }
- 
- void FileStore::_finish_op(OpSequencer *osr)
- {
--  Op *o = osr->dequeue();
-+  list<Context*> to_queue;
-+  Op *o = osr->dequeue(&to_queue);
-   
-   dout(10) << "_finish_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << dendl;
-   osr->apply_lock.Unlock();  // locked in _do_op
- 
-@@ -1728,8 +1731,9 @@
-   }
-   if (o->onreadable) {
-     op_finisher.queue(o->onreadable);
-   }
-+  op_finisher.queue(to_queue);
-   delete o;
- }
- 
- 
-@@ -1843,16 +1847,18 @@
- 
-   // this should queue in order because the journal does it's completions in order.
-   queue_op(osr, o);
- 
--  osr->dequeue_journal();
-+  list<Context*> to_queue;
-+  osr->dequeue_journal(&to_queue);
- 
-   // do ondisk completions async, to prevent any onreadable_sync completions
-   // getting blocked behind an ondisk completion.
-   if (ondisk) {
-     dout(10) << " queueing ondisk " << ondisk << dendl;
-     ondisk_finisher.queue(ondisk);
-   }
-+  ondisk_finisher.queue(to_queue);
- }
- 
- int FileStore::_do_transactions(
-   list<Transaction*> &tls,
-@@ -2544,13 +2550,14 @@
- 	t.dump(&f);
- 	f.close_section();
- 	f.flush(*_dout);
- 	*_dout << dendl;
--	assert(0 == "unexpected error");
- 
- 	if (r == -EMFILE) {
- 	  dump_open_fds(g_ceph_context);
- 	}
-+
-+	assert(0 == "unexpected error");
-       }
-     }
- 
-     spos.op++;
---- a/src/os/FileStore.h
-+++ b/src/os/FileStore.h
-@@ -192,21 +192,72 @@
-   class OpSequencer : public Sequencer_impl {
-     Mutex qlock; // to protect q, for benefit of flush (peek/dequeue also protected by lock)
-     list<Op*> q;
-     list<uint64_t> jq;
-+    list<pair<uint64_t, Context*> > flush_commit_waiters;
-     Cond cond;
-   public:
-     Sequencer *parent;
-     Mutex apply_lock;  // for apply mutual exclusion
-     
-+    /// get_max_uncompleted
-+    bool _get_max_uncompleted(
-+      uint64_t *seq ///< [out] max uncompleted seq
-+      ) {
-+      assert(qlock.is_locked());
-+      assert(seq);
-+      *seq = 0;
-+      if (q.empty() && jq.empty())
-+	return true;
-+
-+      if (!q.empty())
-+	*seq = q.back()->op;
-+      if (!jq.empty() && jq.back() > *seq)
-+	*seq = jq.back();
-+
-+      return false;
-+    } /// @returns true if both queues are empty
-+
-+    /// get_min_uncompleted
-+    bool _get_min_uncompleted(
-+      uint64_t *seq ///< [out] min uncompleted seq
-+      ) {
-+      assert(qlock.is_locked());
-+      assert(seq);
-+      *seq = 0;
-+      if (q.empty() && jq.empty())
-+	return true;
-+
-+      if (!q.empty())
-+	*seq = q.front()->op;
-+      if (!jq.empty() && jq.front() < *seq)
-+	*seq = jq.front();
-+
-+      return false;
-+    } /// @returns true if both queues are empty
-+
-+    void _wake_flush_waiters(list<Context*> *to_queue) {
-+      uint64_t seq;
-+      if (_get_min_uncompleted(&seq))
-+	seq = -1;
-+
-+      for (list<pair<uint64_t, Context*> >::iterator i =
-+	     flush_commit_waiters.begin();
-+	   i != flush_commit_waiters.end() && i->first < seq;
-+	   flush_commit_waiters.erase(i++)) {
-+	to_queue->push_back(i->second);
-+      }
-+    }
-+
-     void queue_journal(uint64_t s) {
-       Mutex::Locker l(qlock);
-       jq.push_back(s);
-     }
--    void dequeue_journal() {
-+    void dequeue_journal(list<Context*> *to_queue) {
-       Mutex::Locker l(qlock);
-       jq.pop_front();
-       cond.Signal();
-+      _wake_flush_waiters(to_queue);
-     }
-     void queue(Op *o) {
-       Mutex::Locker l(qlock);
-       q.push_back(o);
-@@ -214,22 +265,28 @@
-     Op *peek_queue() {
-       assert(apply_lock.is_locked());
-       return q.front();
-     }
--    Op *dequeue() {
-+
-+    Op *dequeue(list<Context*> *to_queue) {
-+      assert(to_queue);
-       assert(apply_lock.is_locked());
-       Mutex::Locker l(qlock);
-       Op *o = q.front();
-       q.pop_front();
-       cond.Signal();
-+
-+      _wake_flush_waiters(to_queue);
-       return o;
-     }
-+
-     void flush() {
-       Mutex::Locker l(qlock);
- 
-       while (g_conf->filestore_blackhole)
- 	cond.Wait(qlock);  // wait forever
- 
-+
-       // get max for journal _or_ op queues
-       uint64_t seq = 0;
-       if (!q.empty())
- 	seq = q.back()->op;
-@@ -242,8 +299,19 @@
- 	       (!jq.empty() && jq.front() <= seq))
- 	  cond.Wait(qlock);
-       }
-     }
-+    bool flush_commit(Context *c) {
-+      Mutex::Locker l(qlock);
-+      uint64_t seq = 0;
-+      if (_get_max_uncompleted(&seq)) {
-+	delete c;
-+	return true;
-+      } else {
-+	flush_commit_waiters.push_back(make_pair(seq, c));
-+	return false;
-+      }
-+    }
- 
-     OpSequencer()
-       : qlock("FileStore::OpSequencer::qlock", false, false),
- 	parent(0),
---- a/src/os/GenericObjectMap.cc
-+++ b/src/os/GenericObjectMap.cc
-@@ -688,10 +688,8 @@
-   remove_header(old_header->cid, old_header->oid, old_header, t);
-   old_header->cid = cid;
-   old_header->oid = target;
-   set_header(cid, target, *old_header, t);
--
--  // "in_use" still hold the "seq"
- }
- 
- int GenericObjectMap::init(bool do_upgrade)
- {
-@@ -925,64 +923,43 @@
-   set<string> to_get;
-   to_get.insert(header_key(cid, oid));
-   _Header header;
- 
--  while (1) {
--    map<string, bufferlist> out;
--    bool try_again = false;
-+  map<string, bufferlist> out;
- 
--    int r = db->get(GHOBJECT_TO_SEQ_PREFIX, to_get, &out);
--    if (r < 0)
--      return Header();
--    if (out.empty())
--      return Header();
--
--    bufferlist::iterator iter = out.begin()->second.begin();
--    header.decode(iter);
--
--    while (in_use.count(header.seq)) {
--      header_cond.Wait(header_lock);
--
--      // Another thread is hold this header, wait for it.
--      // Because the seq of this object may change, such as clone
--      // and rename operation, here need to look up "seq" again
--      try_again = true;
--    }
-+  int r = db->get(GHOBJECT_TO_SEQ_PREFIX, to_get, &out);
-+  if (r < 0)
-+    return Header();
-+  if (out.empty())
-+    return Header();
- 
--    if (!try_again) {
--      break;
--    }
--  }
-+  bufferlist::iterator iter = out.begin()->second.begin();
-+  header.decode(iter);
- 
--  Header ret = Header(new _Header(header), RemoveOnDelete(this));
--  in_use.insert(ret->seq);
-+  Header ret = Header(new _Header(header));
-   return ret;
- }
- 
- GenericObjectMap::Header GenericObjectMap::_generate_new_header(
-     const coll_t &cid, const ghobject_t &oid, Header parent,
-     KeyValueDB::Transaction t)
- {
--  Header header = Header(new _Header(), RemoveOnDelete(this));
-+  Header header = Header(new _Header());
-   header->seq = state.seq++;
-   if (parent) {
-     header->parent = parent->seq;
-   }
-   header->num_children = 1;
-   header->oid = oid;
-   header->cid = cid;
--  assert(!in_use.count(header->seq));
--  in_use.insert(header->seq);
- 
-   write_state(t);
-   return header;
- }
- 
- GenericObjectMap::Header GenericObjectMap::lookup_parent(Header input)
- {
-   Mutex::Locker l(header_lock);
--  while (in_use.count(input->parent))
--    header_cond.Wait(header_lock);
-   map<string, bufferlist> out;
-   set<string> keys;
-   keys.insert(PARENT_KEY);
- 
-@@ -998,15 +975,14 @@
-     assert(0);
-     return Header();
-   }
- 
--  Header header = Header(new _Header(), RemoveOnDelete(this));
-+  Header header = Header(new _Header());
-   header->seq = input->parent;
-   bufferlist::iterator iter = out.begin()->second.begin();
-   header->decode(iter);
-   dout(20) << "lookup_parent: parent seq is " << header->seq << " with parent "
-            << header->parent << dendl;
--  in_use.insert(header->seq);
-   return header;
- }
- 
- GenericObjectMap::Header GenericObjectMap::lookup_create_header(
---- a/src/os/GenericObjectMap.h
-+++ b/src/os/GenericObjectMap.h
-@@ -73,14 +73,8 @@
-   /**
-    * Serializes access to next_seq as well as the in_use set
-    */
-   Mutex header_lock;
--  Cond header_cond;
--
--  /**
--   * Set of headers currently in use
--   */
--  set<uint64_t> in_use;
- 
-   GenericObjectMap(KeyValueDB *db) : db(db), header_lock("GenericObjectMap") {}
- 
-   int get(
-@@ -370,8 +364,14 @@
-   GenericObjectMapIterator _get_iterator(Header header, string prefix) {
-     return GenericObjectMapIterator(new GenericObjectMapIteratorImpl(this, header, prefix));
-   }
- 
-+  Header generate_new_header(const coll_t &cid, const ghobject_t &oid,
-+                             Header parent, KeyValueDB::Transaction t) {
-+    Mutex::Locker l(header_lock);
-+    return _generate_new_header(cid, oid, parent, t);
-+  }
-+
-   // Scan keys in header into out_keys and out_values (if nonnull)
-   int scan(Header header, const string &prefix, const set<string> &in_keys,
-            set<string> *out_keys, map<string, bufferlist> *out_values);
- 
-@@ -393,13 +393,8 @@
-    * Has the side effect of syncronously saving the new GenericObjectMap state
-    */
-   Header _generate_new_header(const coll_t &cid, const ghobject_t &oid,
-                               Header parent, KeyValueDB::Transaction t);
--  Header generate_new_header(const coll_t &cid, const ghobject_t &oid,
--                             Header parent, KeyValueDB::Transaction t) {
--    Mutex::Locker l(header_lock);
--    return _generate_new_header(cid, oid, parent, t);
--  }
- 
-   // Lookup leaf header for c oid
-   Header _lookup_header(const coll_t &cid, const ghobject_t &oid);
- 
-@@ -424,28 +419,8 @@
- 
-   // Sets header @see set_header
-   void _set_header(Header header, const bufferlist &bl,
-                    KeyValueDB::Transaction t);
--
--  /** 
--   * Removes header seq lock once Header is out of scope
--   * @see _lookup_header
--   * @see lookup_parent
--   * @see generate_new_header
--   */
--  class RemoveOnDelete {
--  public:
--    GenericObjectMap *db;
--    RemoveOnDelete(GenericObjectMap *db) :
--      db(db) {}
--    void operator() (_Header *header) {
--      Mutex::Locker l(db->header_lock);
--      db->in_use.erase(header->seq);
--      db->header_cond.Signal();
--      delete header;
--    }
--  };
--  friend class RemoveOnDelete;
- };
- WRITE_CLASS_ENCODER(GenericObjectMap::_Header)
- WRITE_CLASS_ENCODER(GenericObjectMap::State)
- 
---- a/src/os/KeyValueStore.cc
-+++ b/src/os/KeyValueStore.cc
-@@ -68,90 +68,78 @@
- const string KeyValueStore::COLLECTION_ATTR = "__COLL_ATTR__";
- 
- // ============== StripObjectMap Implementation =================
- 
--void StripObjectMap::sync_wrap(StripObjectHeader &strip_header,
--                               KeyValueDB::Transaction t,
--                               const SequencerPosition &spos)
--{
--  dout(10) << __func__ << " cid: " << strip_header.cid << "oid: "
--           << strip_header.oid << " setting spos to " << strip_header.spos
--           << dendl;
--  strip_header.spos = spos;
--  strip_header.header->data.clear();
--  ::encode(strip_header, strip_header.header->data);
--
--  sync(strip_header.header, t);
--}
--
--bool StripObjectMap::check_spos(const StripObjectHeader &header,
--                                const SequencerPosition &spos)
--{
--  if (spos > header.spos) {
--    stringstream out;
--    dout(10) << "cid: " << "oid: " << header.oid
--             << " not skipping op, *spos " << spos << dendl;
--    dout(10) << " > header.spos " << header.spos << dendl;
--    return false;
--  } else {
--    dout(10) << "cid: " << "oid: " << header.oid << " skipping op, spos "
--             << spos << " <= header.spos " << header.spos << dendl;
--    return true;
--  }
--}
--
--int StripObjectMap::save_strip_header(StripObjectHeader &strip_header,
--                                      const SequencerPosition &spos,
-+int StripObjectMap::save_strip_header(StripObjectHeaderRef strip_header,
-                                       KeyValueDB::Transaction t)
- {
--  strip_header.spos = spos;
--  strip_header.header->data.clear();
--  ::encode(strip_header, strip_header.header->data);
-+  strip_header->header->data.clear();
-+  ::encode(*strip_header, strip_header->header->data);
- 
--  set_header(strip_header.cid, strip_header.oid, *(strip_header.header), t);
-+  set_header(strip_header->cid, strip_header->oid, *(strip_header->header), t);
-   return 0;
- }
- 
- int StripObjectMap::create_strip_header(const coll_t &cid,
-                                         const ghobject_t &oid,
--                                        StripObjectHeader &strip_header,
-+                                        StripObjectHeaderRef *strip_header,
-                                         KeyValueDB::Transaction t)
- {
--  Header header = lookup_create_header(cid, oid, t);
-+  Header header = generate_new_header(cid, oid, Header(), t);
-   if (!header)
-     return -EINVAL;
- 
--  strip_header.oid = oid;
--  strip_header.cid = cid;
--  strip_header.header = header;
-+  StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader());
-+  tmp->oid = oid;
-+  tmp->cid = cid;
-+  tmp->header = header;
-+  if (strip_header)
-+    *strip_header = tmp;
- 
-   return 0;
- }
- 
- int StripObjectMap::lookup_strip_header(const coll_t &cid,
-                                         const ghobject_t &oid,
--                                        StripObjectHeader &strip_header)
-+                                        StripObjectHeaderRef *strip_header)
- {
-+  if (cid != coll_t()) {
-+    Mutex::Locker l(lock);
-+    pair<coll_t, StripObjectHeaderRef> p;
-+    if (caches.lookup(oid, &p)) {
-+      if (p.first == cid) {
-+        *strip_header = p.second;
-+        return 0;
-+      }
-+    }
-+  }
-   Header header = lookup_header(cid, oid);
- 
-   if (!header) {
-     dout(20) << "lookup_strip_header failed to get strip_header "
-              << " cid " << cid <<" oid " << oid << dendl;
-     return -ENOENT;
-   }
- 
-+
-+  StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader());
-   if (header->data.length()) {
-     bufferlist::iterator bliter = header->data.begin();
--    ::decode(strip_header, bliter);
-+    ::decode(*tmp, bliter);
-   }
- 
--  if (strip_header.strip_size == 0)
--    strip_header.strip_size = default_strip_size;
-+  if (tmp->strip_size == 0)
-+    tmp->strip_size = default_strip_size;
- 
--  strip_header.oid = oid;
--  strip_header.cid = cid;
--  strip_header.header = header;
-+  tmp->oid = oid;
-+  tmp->cid = cid;
-+  tmp->header = header;
- 
-+  {
-+    Mutex::Locker l(lock);
-+    caches.add(oid, make_pair(cid, tmp));
-+  }
-+  *strip_header = tmp;
-   dout(10) << "lookup_strip_header done " << " cid " << cid << " oid "
-            << oid << dendl;
-   return 0;
- }
-@@ -193,125 +181,114 @@
-   dout(10) << "file_to_extents done " << dendl;
-   return 0;
- }
- 
--void StripObjectMap::clone_wrap(StripObjectHeader &old_header,
-+void StripObjectMap::clone_wrap(StripObjectHeaderRef old_header,
-                                 const coll_t &cid, const ghobject_t &oid,
-                                 KeyValueDB::Transaction t,
--                                StripObjectHeader *origin_header,
--                                StripObjectHeader *target_header)
-+                                StripObjectHeaderRef *target_header)
- {
-   Header new_origin_header;
-+  StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader());
- 
--  if (target_header)
--    *target_header = old_header;
--  if (origin_header)
--    *origin_header = old_header;
--
--  clone(old_header.header, cid, oid, t, &new_origin_header,
--        &target_header->header);
-+  clone(old_header->header, cid, oid, t, &new_origin_header,
-+        &tmp->header);
- 
--  if(origin_header)
--    origin_header->header = new_origin_header;
-+  tmp->oid = oid;
-+  tmp->cid = cid;
-+  tmp->strip_size = old_header->strip_size;
-+  tmp->max_size = old_header->max_size;
-+  tmp->bits = old_header->bits;
-+  old_header->header = new_origin_header;
- 
--  if (target_header) {
--    target_header->oid = oid;
--    target_header->cid = cid;
--  }
-+  if (target_header)
-+    *target_header = tmp;
- }
- 
--void StripObjectMap::rename_wrap(const coll_t &cid, const ghobject_t &oid,
-+void StripObjectMap::rename_wrap(StripObjectHeaderRef old_header, const coll_t &cid, const ghobject_t &oid,
-                                  KeyValueDB::Transaction t,
--                                 StripObjectHeader *header)
-+                                 StripObjectHeaderRef *new_header)
- {
--  assert(header);
--  rename(header->header, cid, oid, t);
-+  rename(old_header->header, cid, oid, t);
- 
--  if (header) {
--    header->oid = oid;
--    header->cid = cid;
--  }
-+  StripObjectHeaderRef tmp = StripObjectHeaderRef(new StripObjectHeader());
-+  tmp->strip_size = old_header->strip_size;
-+  tmp->max_size = old_header->max_size;
-+  tmp->bits = old_header->bits;
-+  tmp->header = old_header->header;
-+  tmp->oid = oid;
-+  tmp->cid = cid;
-+
-+  if (new_header)
-+    *new_header = tmp;
-+
-+  old_header->header = Header();
-+  old_header->deleted = true;
- }
- 
--int StripObjectMap::get_values_with_header(const StripObjectHeader &header,
-+int StripObjectMap::get_values_with_header(const StripObjectHeaderRef header,
-                                            const string &prefix,
-                                            const set<string> &keys,
-                                            map<string, bufferlist> *out)
- {
--  return scan(header.header, prefix, keys, 0, out);
-+  return scan(header->header, prefix, keys, 0, out);
- }
- 
--int StripObjectMap::get_keys_with_header(const StripObjectHeader &header,
-+int StripObjectMap::get_keys_with_header(const StripObjectHeaderRef header,
-                                          const string &prefix,
-                                          set<string> *keys)
- {
--  ObjectMap::ObjectMapIterator iter = _get_iterator(header.header, prefix);
-+  ObjectMap::ObjectMapIterator iter = _get_iterator(header->header, prefix);
-   for (; iter->valid(); iter->next()) {
-     if (iter->status())
-       return iter->status();
-     keys->insert(iter->key());
-   }
-   return 0;
- }
- 
--int StripObjectMap::get_with_header(const StripObjectHeader &header,
-+int StripObjectMap::get_with_header(const StripObjectHeaderRef header,
-                         const string &prefix, map<string, bufferlist> *out)
- {
--  ObjectMap::ObjectMapIterator iter = _get_iterator(header.header, prefix);
-+  ObjectMap::ObjectMapIterator iter = _get_iterator(header->header, prefix);
-   for (iter->seek_to_first(); iter->valid(); iter->next()) {
-     if (iter->status())
-       return iter->status();
-     out->insert(make_pair(iter->key(), iter->value()));
-   }
- 
-   return 0;
- }
--// =========== KeyValueStore::SubmitManager Implementation ==============
--
--uint64_t KeyValueStore::SubmitManager::op_submit_start()
--{
--  lock.Lock();
--  uint64_t op = ++op_seq;
--  dout(10) << "op_submit_start " << op << dendl;
--  return op;
--}
--
--void KeyValueStore::SubmitManager::op_submit_finish(uint64_t op)
--{
--  dout(10) << "op_submit_finish " << op << dendl;
--  if (op != op_submitted + 1) {
--      dout(0) << "op_submit_finish " << op << " expected " << (op_submitted + 1)
--          << ", OUT OF ORDER" << dendl;
--      assert(0 == "out of order op_submit_finish");
--  }
--  op_submitted = op;
--  lock.Unlock();
--}
--
- 
- // ========= KeyValueStore::BufferTransaction Implementation ============
- 
- int KeyValueStore::BufferTransaction::lookup_cached_header(
-     const coll_t &cid, const ghobject_t &oid,
--    StripObjectMap::StripObjectHeader **strip_header,
-+    StripObjectMap::StripObjectHeaderRef *strip_header,
-     bool create_if_missing)
- {
--  StripObjectMap::StripObjectHeader header;
-+  StripObjectMap::StripObjectHeaderRef header;
-   int r = 0;
- 
-   StripHeaderMap::iterator it = strip_headers.find(make_pair(cid, oid));
-   if (it != strip_headers.end()) {
--    if (it->second.deleted)
-+
-+    if (!it->second->deleted) {
-+      if (strip_header)
-+        *strip_header = it->second;
-+      return 0;
-+    } else if (!create_if_missing) {
-       return -ENOENT;
-+    }
- 
--    if (strip_header)
--      *strip_header = &it->second;
--    return 0;
-+    // If (it->second.deleted && create_if_missing) go down
-+    r = -ENOENT;
-+  } else {
-+    r = store->backend->lookup_strip_header(cid, oid, &header);
-   }
- 
--  r = store->backend->lookup_strip_header(cid, oid, header);
--  if (r < 0 && create_if_missing) {
--    r = store->backend->create_strip_header(cid, oid, header, t);
-+  if (r == -ENOENT && create_if_missing) {
-+    r = store->backend->create_strip_header(cid, oid, &header, t);
-   }
- 
-   if (r < 0) {
-     dout(10) << __func__  << " " << cid << "/" << oid << " "
-@@ -320,23 +297,23 @@
-   }
- 
-   strip_headers[make_pair(cid, oid)] = header;
-   if (strip_header)
--    *strip_header = &strip_headers[make_pair(cid, oid)];
-+    *strip_header = strip_headers[make_pair(cid, oid)];
-   return r;
- }
- 
- int KeyValueStore::BufferTransaction::get_buffer_keys(
--    StripObjectMap::StripObjectHeader &strip_header, const string &prefix,
-+    StripObjectMap::StripObjectHeaderRef strip_header, const string &prefix,
-     const set<string> &keys, map<string, bufferlist> *out)
- {
-   set<string> need_lookup;
- 
-   for (set<string>::iterator it = keys.begin(); it != keys.end(); ++it) {
-     map<pair<string, string>, bufferlist>::iterator i =
--        strip_header.buffers.find(make_pair(prefix, *it));
-+        strip_header->buffers.find(make_pair(prefix, *it));
- 
--    if (i != strip_header.buffers.end()) {
-+    if (i != strip_header->buffers.end()) {
-       (*out)[*it].swap(i->second);
-     } else {
-       need_lookup.insert(*it);
-     }
-@@ -345,117 +322,118 @@
-   if (!need_lookup.empty()) {
-     int r = store->backend->get_values_with_header(strip_header, prefix,
-                                                    need_lookup, out);
-     if (r < 0) {
--      dout(10) << __func__  << " " << strip_header.cid << "/"
--               << strip_header.oid << " " << " r = " << r << dendl;
-+      dout(10) << __func__  << " " << strip_header->cid << "/"
-+               << strip_header->oid << " " << " r = " << r << dendl;
-       return r;
-     }
-   }
- 
-   return 0;
- }
- 
- void KeyValueStore::BufferTransaction::set_buffer_keys(
--     StripObjectMap::StripObjectHeader &strip_header,
-+     StripObjectMap::StripObjectHeaderRef strip_header,
-      const string &prefix, map<string, bufferlist> &values)
- {
--  store->backend->set_keys(strip_header.header, prefix, values, t);
-+  store->backend->set_keys(strip_header->header, prefix, values, t);
- 
-   for (map<string, bufferlist>::iterator iter = values.begin();
-        iter != values.end(); ++iter) {
--    strip_header.buffers[make_pair(prefix, iter->first)].swap(iter->second);
-+    strip_header->buffers[make_pair(prefix, iter->first)].swap(iter->second);
-   }
- }
- 
- int KeyValueStore::BufferTransaction::remove_buffer_keys(
--     StripObjectMap::StripObjectHeader &strip_header, const string &prefix,
-+     StripObjectMap::StripObjectHeaderRef strip_header, const string &prefix,
-      const set<string> &keys)
- {
-   for (set<string>::iterator iter = keys.begin(); iter != keys.end(); ++iter) {
--    strip_header.buffers[make_pair(prefix, *iter)] = bufferlist();
-+    strip_header->buffers[make_pair(prefix, *iter)] = bufferlist();
-   }
- 
--  return store->backend->rm_keys(strip_header.header, prefix, keys, t);
-+  return store->backend->rm_keys(strip_header->header, prefix, keys, t);
- }
- 
- void KeyValueStore::BufferTransaction::clear_buffer_keys(
--     StripObjectMap::StripObjectHeader &strip_header, const string &prefix)
-+     StripObjectMap::StripObjectHeaderRef strip_header, const string &prefix)
- {
--  for (map<pair<string, string>, bufferlist>::iterator iter = strip_header.buffers.begin();
--       iter != strip_header.buffers.end(); ++iter) {
-+  for (map<pair<string, string>, bufferlist>::iterator iter = strip_header->buffers.begin();
-+       iter != strip_header->buffers.end(); ++iter) {
-     if (iter->first.first == prefix)
-       iter->second = bufferlist();
-   }
- }
- 
- int KeyValueStore::BufferTransaction::clear_buffer(
--     StripObjectMap::StripObjectHeader &strip_header)
-+     StripObjectMap::StripObjectHeaderRef strip_header)
- {
--  strip_header.deleted = true;
-+  strip_header->deleted = true;
- 
--  return store->backend->clear(strip_header.header, t);
-+  InvalidateCacheContext *c = new InvalidateCacheContext(store, strip_header->cid, strip_header->oid);
-+  finishes.push_back(c);
-+  return store->backend->clear(strip_header->header, t);
- }
- 
- void KeyValueStore::BufferTransaction::clone_buffer(
--    StripObjectMap::StripObjectHeader &old_header,
-+    StripObjectMap::StripObjectHeaderRef old_header,
-     const coll_t &cid, const ghobject_t &oid)
- {
-   // Remove target ahead to avoid dead lock
-   strip_headers.erase(make_pair(cid, oid));
- 
--  StripObjectMap::StripObjectHeader new_origin_header, new_target_header;
-+  StripObjectMap::StripObjectHeaderRef new_target_header;
- 
--  store->backend->clone_wrap(old_header, cid, oid, t,
--                             &new_origin_header, &new_target_header);
-+  store->backend->clone_wrap(old_header, cid, oid, t, &new_target_header);
- 
-   // FIXME: Lacking of lock for origin header(now become parent), it will
-   // cause other operation can get the origin header while submitting
-   // transactions
--  strip_headers[make_pair(cid, old_header.oid)] = new_origin_header;
-   strip_headers[make_pair(cid, oid)] = new_target_header;
- }
- 
- void KeyValueStore::BufferTransaction::rename_buffer(
--    StripObjectMap::StripObjectHeader &old_header,
-+    StripObjectMap::StripObjectHeaderRef old_header,
-     const coll_t &cid, const ghobject_t &oid)
- {
--  if (store->backend->check_spos(old_header, spos))
--    return ;
--
-   // FIXME: Lacking of lock for origin header, it will cause other operation
-   // can get the origin header while submitting transactions
--  store->backend->rename_wrap(cid, oid, t, &old_header);
-+  StripObjectMap::StripObjectHeaderRef new_header;
-+  store->backend->rename_wrap(old_header, cid, oid, t, &new_header);
- 
--  strip_headers.erase(make_pair(old_header.cid, old_header.oid));
--  strip_headers[make_pair(cid, oid)] = old_header;
-+  InvalidateCacheContext *c = new InvalidateCacheContext(store, old_header->cid, old_header->oid);
-+  finishes.push_back(c);
-+  strip_headers[make_pair(cid, oid)] = new_header;
- }
- 
- int KeyValueStore::BufferTransaction::submit_transaction()
- {
-   int r = 0;
- 
-   for (StripHeaderMap::iterator header_iter = strip_headers.begin();
-        header_iter != strip_headers.end(); ++header_iter) {
--    StripObjectMap::StripObjectHeader header = header_iter->second;
-+    StripObjectMap::StripObjectHeaderRef header = header_iter->second;
- 
--    if (store->backend->check_spos(header, spos))
-+    if (header->deleted)
-       continue;
- 
--    if (header.deleted)
--      continue;
-+    r = store->backend->save_strip_header(header, t);
- 
--    r = store->backend->save_strip_header(header, spos, t);
-     if (r < 0) {
-       dout(10) << __func__ << " save strip header failed " << dendl;
-       goto out;
-     }
-   }
- 
--out:
-+  r = store->backend->submit_transaction(t);
-+  for (list<Context*>::iterator it = finishes.begin(); it != finishes.end(); ++it) {
-+    (*it)->complete(r);
-+  }
- 
-+out:
-   dout(5) << __func__ << " r = " << r << dendl;
--  return store->backend->submit_transaction(t);
-+  return r;
- }
- 
- // =========== KeyValueStore Intern Helper Implementation ==============
- 
-@@ -494,9 +472,9 @@
-                              const char *name, bool do_update) :
-   ObjectStore(base),
-   internal_name(name),
-   basedir(base),
--  fsid_fd(-1), op_fd(-1), current_fd(-1),
-+  fsid_fd(-1), current_fd(-1),
-   kv_type(KV_TYPE_NONE),
-   backend(NULL),
-   ondisk_finisher(g_ceph_context),
-   lock("KeyValueStore::lock"),
-@@ -905,12 +883,8 @@
-   if (fsid_fd >= 0) {
-     VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
-     fsid_fd = -1;
-   }
--  if (op_fd >= 0) {
--    VOID_TEMP_FAILURE_RETRY(::close(op_fd));
--    op_fd = -1;
--  }
-   if (current_fd >= 0) {
-     VOID_TEMP_FAILURE_RETRY(::close(current_fd));
-     current_fd = -1;
-   }
-@@ -962,16 +936,11 @@
-   }
- 
-   Op *o = build_op(tls, ondisk, onreadable, onreadable_sync, osd_op);
-   op_queue_reserve_throttle(o, handle);
--  uint64_t op = submit_manager.op_submit_start();
--  o->op = op;
--  dout(5) << "queue_transactions (trailing journal) " << op << " "
--          << tls <<dendl;
-+  dout(5) << "queue_transactions (trailing journal) " << " " << tls <<dendl;
-   queue_op(osr, o);
- 
--  submit_manager.op_submit_finish(op);
--
-   return 0;
- }
- 
- 
-@@ -1087,9 +1056,10 @@
- }
- 
- void KeyValueStore::_finish_op(OpSequencer *osr)
- {
--  Op *o = osr->dequeue();
-+  list<Context*> to_queue;
-+  Op *o = osr->dequeue(&to_queue);
- 
-   dout(10) << "_finish_op " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << dendl;
-   osr->apply_lock.Unlock();  // locked in _do_op
-   op_queue_release_throttle(o);
-@@ -1101,8 +1071,9 @@
-   if (o->onreadable_sync) {
-     o->onreadable_sync->complete(0);
-   }
-   op_finisher.queue(o->onreadable);
-+  op_finisher.queue(to_queue);
-   delete o;
- }
- 
- // Combine all the ops in the same transaction using "BufferTransaction" and
-@@ -1125,15 +1096,14 @@
-     ops += (*p)->get_num_ops();
-   }
- 
-   int trans_num = 0;
--  SequencerPosition spos(op_seq, trans_num, 0);
--  BufferTransaction bt(this, spos);
-+  BufferTransaction bt(this);
- 
-   for (list<Transaction*>::iterator p = tls.begin();
-        p != tls.end();
-        ++p, trans_num++) {
--    r = _do_transaction(**p, bt, spos, handle);
-+    r = _do_transaction(**p, bt, handle);
-     if (r < 0)
-       break;
-     if (handle)
-       handle->reset_tp_timeout();
-@@ -1148,14 +1118,14 @@
- }
- 
- unsigned KeyValueStore::_do_transaction(Transaction& transaction,
-                                         BufferTransaction &t,
--                                        SequencerPosition& spos,
-                                         ThreadPool::TPHandle *handle)
- {
-   dout(10) << "_do_transaction on " << &transaction << dendl;
- 
-   Transaction::iterator i = transaction.begin();
-+  uint64_t op_num = 0;
- 
-   while (i.have_op()) {
-     if (handle)
-       handle->reset_tp_timeout();
-@@ -1448,9 +1418,15 @@
-       }
-       break;
- 
-     case Transaction::OP_SETALLOCHINT:
--      // TODO: can kvstore make use of the hint?
-+      {
-+        // TODO: can kvstore make use of the hint?
-+        coll_t cid(i.get_cid());
-+        ghobject_t oid = i.get_oid();
-+        (void)i.get_length();  // discard result
-+        (void)i.get_length();  // discard result
-+      }
-       break;
- 
-     default:
-       derr << "bad op " << op << dendl;
-@@ -1486,10 +1462,9 @@
-           msg = "ENOTEMPTY suggests garbage data in osd data dir";
-         }
- 
-         dout(0) << " error " << cpp_strerror(r) << " not handled on operation "
--                << op << " (" << spos << ", or op " << spos.op
--                << ", counting from 0)" << dendl;
-+                << op << " op " << op_num << ", counting from 0)" << dendl;
-         dout(0) << msg << dendl;
-         dout(0) << " transaction dump:\n";
-         JSONFormatter f(true);
-         f.open_object_section("transaction");
-@@ -1504,9 +1479,9 @@
-         }
-       }
-     }
- 
--    spos.op++;
-+    op_num++;
-   }
- 
-   return 0;  // FIXME count errors
- }
-@@ -1519,11 +1494,11 @@
- {
-   dout(10) << __func__ << "collection: " << cid << " object: " << oid
-            << dendl;
-   int r;
--  StripObjectMap::StripObjectHeader header;
-+  StripObjectMap::StripObjectHeaderRef header;
- 
--  r = backend->lookup_strip_header(cid, oid, header);
-+  r = backend->lookup_strip_header(cid, oid, &header);
-   if (r < 0) {
-     return false;
-   }
- 
-@@ -1534,44 +1509,44 @@
-                         struct stat *st, bool allow_eio)
- {
-   dout(10) << "stat " << cid << "/" << oid << dendl;
- 
--  StripObjectMap::StripObjectHeader header;
-+  StripObjectMap::StripObjectHeaderRef header;
- 
--  int r = backend->lookup_strip_header(cid, oid, header);
-+  int r = backend->lookup_strip_header(cid, oid, &header);
-   if (r < 0) {
-     dout(10) << "stat " << cid << "/" << oid << "=" << r << dendl;
-     return -ENOENT;
-   }
- 
--  st->st_blocks = header.max_size / header.strip_size;
--  if (header.max_size % header.strip_size)
-+  st->st_blocks = header->max_size / header->strip_size;
-+  if (header->max_size % header->strip_size)
-     st->st_blocks++;
-   st->st_nlink = 1;
--  st->st_size = header.max_size;
--  st->st_blksize = header.strip_size;
-+  st->st_size = header->max_size;
-+  st->st_blksize = header->strip_size;
- 
-   return r;
- }
- 
--int KeyValueStore::_generic_read(StripObjectMap::StripObjectHeader &header,
-+int KeyValueStore::_generic_read(StripObjectMap::StripObjectHeaderRef header,
-                                  uint64_t offset, size_t len, bufferlist& bl,
-                                  bool allow_eio, BufferTransaction *bt)
- {
--  if (header.max_size < offset) {
--    dout(10) << __func__ << " " << header.cid << "/" << header.oid << ")"
-+  if (header->max_size < offset) {
-+    dout(10) << __func__ << " " << header->cid << "/" << header->oid << ")"
-              << " offset exceed the length of bl"<< dendl;
-     return 0;
-   }
- 
-   if (len == 0)
--    len = header.max_size - offset;
-+    len = header->max_size - offset;
- 
--  if (offset + len > header.max_size)
--    len = header.max_size - offset;
-+  if (offset + len > header->max_size)
-+    len = header->max_size - offset;
- 
-   vector<StripObjectMap::StripExtent> extents;
--  StripObjectMap::file_to_extents(offset, len, header.strip_size,
-+  StripObjectMap::file_to_extents(offset, len, header->strip_size,
-                                   extents);
-   map<string, bufferlist> out;
-   set<string> keys;
- 
-@@ -1579,35 +1554,35 @@
-        iter != extents.end(); ++iter) {
-     bufferlist old;
-     string key = strip_object_key(iter->no);
- 
--    if (bt && header.buffers.count(make_pair(OBJECT_STRIP_PREFIX, key))) {
-+    if (bt && header->buffers.count(make_pair(OBJECT_STRIP_PREFIX, key))) {
-       // use strip_header buffer
--      assert(header.bits[iter->no]);
--      out[key] = header.buffers[make_pair(OBJECT_STRIP_PREFIX, key)];
--    } else if (header.bits[iter->no]) {
-+      assert(header->bits[iter->no]);
-+      out[key] = header->buffers[make_pair(OBJECT_STRIP_PREFIX, key)];
-+    } else if (header->bits[iter->no]) {
-       keys.insert(key);
-     }
-   }
- 
-   int r = backend->get_values_with_header(header, OBJECT_STRIP_PREFIX, keys, &out);
-   if (r < 0) {
--    dout(10) << __func__ << " " << header.cid << "/" << header.oid << " "
-+    dout(10) << __func__ << " " << header->cid << "/" << header->oid << " "
-              << offset << "~" << len << " = " << r << dendl;
-     return r;
-   } else if (out.size() != keys.size()) {
-     dout(0) << __func__ << " broken header or missing data in backend "
--            << header.cid << "/" << header.oid << " " << offset << "~"
-+            << header->cid << "/" << header->oid << " " << offset << "~"
-             << len << " = " << r << dendl;
-     return -EBADF;
-   }
- 
-   for (vector<StripObjectMap::StripExtent>::iterator iter = extents.begin();
-        iter != extents.end(); ++iter) {
-     string key = strip_object_key(iter->no);
- 
--    if (header.bits[iter->no]) {
--      if (iter->len == header.strip_size) {
-+    if (header->bits[iter->no]) {
-+      if (iter->len == header->strip_size) {
-         bl.claim_append(out[key]);
-       } else {
-         out[key].copy(iter->offset, iter->len, bl);
-       }
-@@ -1615,9 +1590,9 @@
-       bl.append_zero(iter->len);
-     }
-   }
- 
--  dout(10) << __func__ << " " << header.cid << "/" << header.oid << " "
-+  dout(10) << __func__ << " " << header->cid << "/" << header->oid << " "
-            << offset << "~" << bl.length() << "/" << len << " r = " << r
-            << dendl;
- 
-   return bl.length();
-@@ -1629,11 +1604,11 @@
- {
-   dout(15) << __func__ << " " << cid << "/" << oid << " " << offset << "~"
-            << len << dendl;
- 
--  StripObjectMap::StripObjectHeader header;
-+  StripObjectMap::StripObjectHeaderRef header;
- 
--  int r = backend->lookup_strip_header(cid, oid, header);
-+  int r = backend->lookup_strip_header(cid, oid, &header);
- 
-   if (r < 0) {
-     dout(10) << __func__ << " " << cid << "/" << oid << " " << offset << "~"
-               << len << " header isn't exist: r = " << r << dendl;
-@@ -1648,25 +1623,26 @@
- {
-   dout(10) << __func__ << " " << cid << " " << oid << " " << offset << "~"
-            << len << dendl;
-   int r;
--  StripObjectMap::StripObjectHeader header;
-+  StripObjectMap::StripObjectHeaderRef header;
- 
--  r = backend->lookup_strip_header(cid, oid, header);
-+  r = backend->lookup_strip_header(cid, oid, &header);
-   if (r < 0) {
-     dout(10) << "fiemap " << cid << "/" << oid << " " << offset << "~" << len
-              << " failed to get header: r = " << r << dendl;
-     return r;
-   }
- 
-   vector<StripObjectMap::StripExtent> extents;
--  StripObjectMap::file_to_extents(offset, len, header.strip_size,
-+  StripObjectMap::file_to_extents(offset, len, header->strip_size,
-                                   extents);
- 
-   map<uint64_t, uint64_t> m;
-   for (vector<StripObjectMap::StripExtent>::iterator iter = extents.begin();
-        iter != extents.end(); ++iter) {
--    m[iter->offset] = iter->len;
-+    uint64_t off = iter->no * header->strip_size + iter->offset;
-+    m[off] = iter->len;
-   }
-   ::encode(m, bl);
-   return 0;
- }
-@@ -1676,18 +1652,20 @@
- {
-   dout(15) << __func__ << " " << cid << "/" << oid << dendl;
- 
-   int r;
--  StripObjectMap::StripObjectHeader *header;
-+  StripObjectMap::StripObjectHeaderRef header;
- 
-   r = t.lookup_cached_header(cid, oid, &header, false);
-   if (r < 0) {
-     dout(10) << __func__ << " " << cid << "/" << oid << " "
-              << " failed to get header: r = " << r << dendl;
-     return r;
-   }
- 
--  r = t.clear_buffer(*header);
-+  header->max_size = 0;
-+  header->bits.clear();
-+  r = t.clear_buffer(header);
- 
-   dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl;
-   return r;
- }
-@@ -1698,9 +1676,9 @@
-   dout(15) << __func__ << " " << cid << "/" << oid << " size " << size
-            << dendl;
- 
-   int r;
--  StripObjectMap::StripObjectHeader *header;
-+  StripObjectMap::StripObjectHeaderRef header;
- 
-   r = t.lookup_cached_header(cid, oid, &header, false);
-   if (r < 0) {
-     dout(10) << __func__ << " " << cid << "/" << oid << " " << size
-@@ -1724,9 +1702,9 @@
-       set<string> lookup_keys;
-       string key = strip_object_key(iter->no);
- 
-       lookup_keys.insert(key);
--      r = t.get_buffer_keys(*header, OBJECT_STRIP_PREFIX,
-+      r = t.get_buffer_keys(header, OBJECT_STRIP_PREFIX,
-                             lookup_keys, &values);
-       if (r < 0) {
-         dout(10) << __func__ << " " << cid << "/" << oid << " "
-                  << size << " = " << r << dendl;
-@@ -1742,9 +1720,9 @@
-       value.append_zero(header->strip_size-iter->offset);
-       assert(value.length() == header->strip_size);
-       value.swap(values[key]);
- 
--      t.set_buffer_keys(*header, OBJECT_STRIP_PREFIX, values);
-+      t.set_buffer_keys(header, OBJECT_STRIP_PREFIX, values);
-       ++iter;
-     }
- 
-     set<string> keys;
-@@ -1753,9 +1731,9 @@
-         keys.insert(strip_object_key(iter->no));
-         header->bits[iter->no] = 0;
-       }
-     }
--    r = t.remove_buffer_keys(*header, OBJECT_STRIP_PREFIX, keys);
-+    r = t.remove_buffer_keys(header, OBJECT_STRIP_PREFIX, keys);
-     if (r < 0) {
-       dout(10) << __func__ << " " << cid << "/" << oid << " "
-                << size << " = " << r << dendl;
-       return r;
-@@ -1775,9 +1753,9 @@
- {
-   dout(15) << __func__ << " " << cid << "/" << oid << dendl;
- 
-   int r;
--  StripObjectMap::StripObjectHeader *header;
-+  StripObjectMap::StripObjectHeaderRef header;
- 
-   r = t.lookup_cached_header(cid, oid, &header, true);
-   if (r < 0) {
-     dout(10) << __func__ << " " << cid << "/" << oid << " "
-@@ -1789,44 +1767,44 @@
-   dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl;
-   return r;
- }
- 
--int KeyValueStore::_generic_write(StripObjectMap::StripObjectHeader &header,
-+int KeyValueStore::_generic_write(StripObjectMap::StripObjectHeaderRef header,
-                                   uint64_t offset, size_t len,
-                                   const bufferlist& bl, BufferTransaction &t,
-                                   bool replica)
- {
-   if (len > bl.length())
-     len = bl.length();
- 
--  if (len + offset > header.max_size) {
--    header.max_size = len + offset;
--    header.bits.resize(header.max_size/header.strip_size+1);
-+  if (len + offset > header->max_size) {
-+    header->max_size = len + offset;
-+    header->bits.resize(header->max_size/header->strip_size+1);
-   }
- 
-   vector<StripObjectMap::StripExtent> extents;
--  StripObjectMap::file_to_extents(offset, len, header.strip_size,
-+  StripObjectMap::file_to_extents(offset, len, header->strip_size,
-                                   extents);
- 
-   map<string, bufferlist> out;
-   set<string> keys;
-   for (vector<StripObjectMap::StripExtent>::iterator iter = extents.begin();
-        iter != extents.end(); ++iter) {
--    if (header.bits[iter->no] && !(iter->offset == 0 &&
--                                   iter->len == header.strip_size))
-+    if (header->bits[iter->no] && !(iter->offset == 0 &&
-+                                   iter->len == header->strip_size))
-       keys.insert(strip_object_key(iter->no));
-   }
- 
-   int r = t.get_buffer_keys(header, OBJECT_STRIP_PREFIX, keys, &out);
-   if (r < 0) {
--    dout(10) << __func__ << " failed to get value " << header.cid << "/"
--              << header.oid << " " << offset << "~" << len << " = " << r
-+    dout(10) << __func__ << " failed to get value " << header->cid << "/"
-+              << header->oid << " " << offset << "~" << len << " = " << r
-               << dendl;
-     return r;
-   } else if (keys.size() != out.size()) {
-     // Error on header.bits or the corresponding key/value pair is missing
-     dout(0) << __func__ << " broken header or missing data in backend "
--            << header.cid << "/" << header.oid << " " << offset << "~"
-+            << header->cid << "/" << header->oid << " " << offset << "~"
-             << len << " = " << r << dendl;
-     return -EBADF;
-   }
- 
-@@ -1835,41 +1813,41 @@
-   for (vector<StripObjectMap::StripExtent>::iterator iter = extents.begin();
-        iter != extents.end(); ++iter) {
-     bufferlist value;
-     string key = strip_object_key(iter->no);
--    if (header.bits[iter->no]) {
--      if (iter->offset == 0 && iter->len == header.strip_size) {
-+    if (header->bits[iter->no]) {
-+      if (iter->offset == 0 && iter->len == header->strip_size) {
-         bl.copy(bl_offset, iter->len, value);
-         bl_offset += iter->len;
-       } else {
--        assert(out[key].length() == header.strip_size);
-+        assert(out[key].length() == header->strip_size);
- 
-         out[key].copy(0, iter->offset, value);
-         bl.copy(bl_offset, iter->len, value);
-         bl_offset += iter->len;
- 
--        if (value.length() != header.strip_size)
--          out[key].copy(value.length(), header.strip_size-value.length(),
-+        if (value.length() != header->strip_size)
-+          out[key].copy(value.length(), header->strip_size-value.length(),
-                         value);
-       }
-     } else {
-       if (iter->offset)
-         value.append_zero(iter->offset);
-       bl.copy(bl_offset, iter->len, value);
-       bl_offset += iter->len;
- 
--      if (value.length() < header.strip_size)
--        value.append_zero(header.strip_size-value.length());
-+      if (value.length() < header->strip_size)
-+        value.append_zero(header->strip_size-value.length());
- 
--      header.bits[iter->no] = 1;
-+      header->bits[iter->no] = 1;
-     }
--    assert(value.length() == header.strip_size);
-+    assert(value.length() == header->strip_size);
-     values[key].swap(value);
-   }
-   assert(bl_offset == len);
- 
-   t.set_buffer_keys(header, OBJECT_STRIP_PREFIX, values);
--  dout(10) << __func__ << " " << header.cid << "/" << header.oid << " "
-+  dout(10) << __func__ << " " << header->cid << "/" << header->oid << " "
-            << offset << "~" << len << " = " << r << dendl;
- 
-   return r;
- }
-@@ -1881,18 +1859,18 @@
-   dout(15) << __func__ << " " << cid << "/" << oid << " " << offset << "~"
-            << len << dendl;
- 
-   int r;
--  StripObjectMap::StripObjectHeader *header;
-+  StripObjectMap::StripObjectHeaderRef header;
- 
-   r = t.lookup_cached_header(cid, oid, &header, true);
-   if (r < 0) {
-     dout(10) << __func__ << " " << cid << "/" << oid << " " << offset
-              << "~" << len << " failed to get header: r = " << r << dendl;
-     return r;
-   }
- 
--  return _generic_write(*header, offset, len, bl, t, replica);
-+  return _generic_write(header, offset, len, bl, t, replica);
- }
- 
- int KeyValueStore::_zero(coll_t cid, const ghobject_t& oid, uint64_t offset,
-                          size_t len, BufferTransaction &t)
-@@ -1919,18 +1897,18 @@
-   if (oldoid == newoid)
-     return 0;
- 
-   int r;
--  StripObjectMap::StripObjectHeader *old_header;
-+  StripObjectMap::StripObjectHeaderRef old_header;
- 
-   r = t.lookup_cached_header(cid, oldoid, &old_header, false);
-   if (r < 0) {
-     dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/"
-              << newoid << " = " << r << dendl;
-     return r;
-   }
- 
--  t.clone_buffer(*old_header, cid, newoid);
-+  t.clone_buffer(old_header, cid, newoid);
- 
-   dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/"
-            << newoid << " = " << r << dendl;
-   return r;
-@@ -1947,9 +1925,9 @@
- 
-   int r;
-   bufferlist bl;
- 
--  StripObjectMap::StripObjectHeader *old_header, *new_header;
-+  StripObjectMap::StripObjectHeaderRef old_header, new_header;
- 
-   r = t.lookup_cached_header(cid, oldoid, &old_header, false);
-   if (r < 0) {
-     dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/"
-@@ -1965,13 +1943,13 @@
-            << " can't create header: r = " << r << dendl;
-     return r;
-   }
- 
--  r = _generic_read(*old_header, srcoff, len, bl, &t);
-+  r = _generic_read(old_header, srcoff, len, bl, &t);
-   if (r < 0)
-     goto out;
- 
--  r = _generic_write(*new_header, dstoff, len, bl, t);
-+  r = _generic_write(new_header, dstoff, len, bl, t);
- 
-  out:
-   dout(10) << __func__ << " " << cid << "/" << oldoid << " -> " << cid << "/"
-            << newoid << " " << srcoff << "~" << len << " to " << dstoff
-@@ -1989,11 +1967,19 @@
- 
-   int r;
-   map<string, bufferlist> got;
-   set<string> to_get;
-+  StripObjectMap::StripObjectHeaderRef header;
- 
-   to_get.insert(string(name));
--  r = backend->get_values(cid, oid, OBJECT_XATTR, to_get, &got);
-+
-+  r = backend->lookup_strip_header(cid, oid, &header);
-+  if (r < 0) {
-+    dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
-+    return r;
-+  }
-+
-+  r = backend->get_values_with_header(header, OBJECT_XATTR, to_get, &got);
-   if (r < 0 && r != -ENOENT) {
-     dout(10) << __func__ << " get_xattrs err r =" << r << dendl;
-     goto out;
-   }
-@@ -2055,9 +2041,9 @@
-   dout(15) << __func__ << " " << cid << "/" << oid << dendl;
- 
-   int r;
- 
--  StripObjectMap::StripObjectHeader *header;
-+  StripObjectMap::StripObjectHeaderRef header;
-   map<string, bufferlist> attrs;
- 
-   r = t.lookup_cached_header(cid, oid, &header, false);
-   if (r < 0)
-@@ -2067,9 +2053,9 @@
-        it != aset.end(); ++it) {
-     attrs[it->first].push_back(it->second);
-   }
- 
--  t.set_buffer_keys(*header, OBJECT_XATTR, attrs);
-+  t.set_buffer_keys(header, OBJECT_XATTR, attrs);
- 
- out:
-   dout(10) << __func__ << " " << cid << "/" << oid << " = " << r << dendl;
-   return r;
-@@ -2083,9 +2069,9 @@
-            << dendl;
- 
-   int r;
-   set<string> to_remove;
--  StripObjectMap::StripObjectHeader *header;
-+  StripObjectMap::StripObjectHeaderRef header;
- 
-   r = t.lookup_cached_header(cid, oid, &header, false);
-   if (r < 0) {
-     dout(10) << __func__ << " could not find header r = " << r
-@@ -2093,9 +2079,9 @@
-     return r;
-   }
- 
-   to_remove.insert(string(name));
--  r = t.remove_buffer_keys(*header, OBJECT_XATTR, to_remove);
-+  r = t.remove_buffer_keys(header, OBJECT_XATTR, to_remove);
- 
-   dout(10) << __func__ << " " << cid << "/" << oid << " '" << name << "' = "
-            << r << dendl;
-   return r;
-@@ -2108,25 +2094,25 @@
- 
-   int r;
-   set<string> attrs;
- 
--  StripObjectMap::StripObjectHeader *header;
-+  StripObjectMap::StripObjectHeaderRef header;
- 
-   r = t.lookup_cached_header(cid, oid, &header, false);
-   if (r < 0) {
-     dout(10) << __func__ << " could not find header r = " << r
-              << dendl;
-     return r;
-   }
- 
--  r = backend->get_keys_with_header(*header, OBJECT_XATTR, &attrs);
-+  r = backend->get_keys_with_header(header, OBJECT_XATTR, &attrs);
-   if (r < 0 && r != -ENOENT) {
-     dout(10) << __func__ << " could not get attrs r = " << r << dendl;
-     return r;
-   }
- 
--  r = t.remove_buffer_keys(*header, OBJECT_XATTR, attrs);
--  t.clear_buffer_keys(*header, OBJECT_XATTR);
-+  r = t.remove_buffer_keys(header, OBJECT_XATTR, attrs);
-+  t.clear_buffer_keys(header, OBJECT_XATTR);
- 
-   dout(10) << __func__ <<  " " << cid << "/" << oid << " = " << r << dendl;
-   return r;
- }
-@@ -2167,12 +2153,20 @@
-            << "'" << dendl;
- 
-   set<string> keys;
-   map<string, bufferlist> out;
-+  StripObjectMap::StripObjectHeaderRef header;
-+
-   keys.insert(string(name));
- 
--  int r = backend->get_values(get_coll_for_coll(), make_ghobject_for_coll(c),
--                              COLLECTION_ATTR, keys, &out);
-+  int r = backend->lookup_strip_header(get_coll_for_coll(),
-+                                       make_ghobject_for_coll(c), &header);
-+  if (r < 0) {
-+    dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
-+    return r;
-+  }
-+
-+  r = backend->get_values_with_header(header, COLLECTION_ATTR, keys, &out);
-   if (r < 0) {
-     dout(10) << __func__ << " could not get key" << string(name) << dendl;
-     r = -EINVAL;
-   }
-@@ -2191,16 +2185,23 @@
-   dout(10) << __func__ << " " << cid.to_str() << dendl;
- 
-   map<string, bufferlist> out;
-   set<string> keys;
-+  StripObjectMap::StripObjectHeaderRef header;
- 
-   for (map<string, bufferptr>::iterator it = aset.begin();
-        it != aset.end(); ++it) {
-       keys.insert(it->first);
-   }
- 
--  int r = backend->get_values(get_coll_for_coll(), make_ghobject_for_coll(cid),
--                              COLLECTION_ATTR, keys, &out);
-+  int r = backend->lookup_strip_header(get_coll_for_coll(),
-+                                       make_ghobject_for_coll(cid), &header);
-+  if (r < 0) {
-+    dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
-+    return r;
-+  }
-+
-+  r = backend->get_values_with_header(header, COLLECTION_ATTR, keys, &out);
-   if (r < 0) {
-     dout(10) << __func__ << " could not get keys" << dendl;
-     r = -EINVAL;
-     goto out;
-@@ -2226,9 +2227,9 @@
- 
-   int r;
-   bufferlist bl;
-   map<string, bufferlist> out;
--  StripObjectMap::StripObjectHeader *header;
-+  StripObjectMap::StripObjectHeaderRef header;
- 
-   r = t.lookup_cached_header(get_coll_for_coll(),
-                              make_ghobject_for_coll(c),
-                              &header, false);
-@@ -2239,9 +2240,9 @@
- 
-   bl.append(reinterpret_cast<const char*>(value), size);
-   out.insert(make_pair(string(name), bl));
- 
--  t.set_buffer_keys(*header, COLLECTION_ATTR, out);
-+  t.set_buffer_keys(header, COLLECTION_ATTR, out);
- 
-   dout(10) << __func__ << " " << c << " '"
-            << name << "' len " << size << " = " << r << dendl;
-   return r;
-@@ -2253,9 +2254,9 @@
-   dout(15) << __func__ << " " << c << dendl;
- 
-   bufferlist bl;
-   set<string> out;
--  StripObjectMap::StripObjectHeader *header;
-+  StripObjectMap::StripObjectHeaderRef header;
- 
-   int r = t.lookup_cached_header(get_coll_for_coll(),
-                                  make_ghobject_for_coll(c), &header, false);
-   if (r < 0) {
-@@ -2263,9 +2264,9 @@
-     return r;
-   }
- 
-   out.insert(string(name));
--  r = t.remove_buffer_keys(*header, COLLECTION_ATTR, out);
-+  r = t.remove_buffer_keys(header, COLLECTION_ATTR, out);
- 
-   dout(10) << __func__ << " " << c << " = " << r << dendl;
-   return r;
- }
-@@ -2276,9 +2277,9 @@
- {
-   dout(15) << __func__ << " " << cid << dendl;
- 
-   map<string, bufferlist> attrs;
--  StripObjectMap::StripObjectHeader *header;
-+  StripObjectMap::StripObjectHeaderRef header;
-   int r = t.lookup_cached_header(get_coll_for_coll(),
-                                  make_ghobject_for_coll(cid),
-                                  &header, false);
-   if (r < 0) {
-@@ -2290,9 +2291,9 @@
-        ++it) {
-     attrs[it->first].push_back(it->second);
-   }
- 
--  t.set_buffer_keys(*header, COLLECTION_ATTR, attrs);
-+  t.set_buffer_keys(header, COLLECTION_ATTR, attrs);
- 
-   dout(10) << __func__ << " " << cid << " = " << r << dendl;
-   return r;
- }
-@@ -2304,9 +2305,9 @@
- {
-   dout(15) << __func__ << " " << c << dendl;
- 
-   int r;
--  StripObjectMap::StripObjectHeader *header;
-+  StripObjectMap::StripObjectHeaderRef header;
-   bufferlist bl;
- 
-   r = t.lookup_cached_header(get_coll_for_coll(),
-                              make_ghobject_for_coll(c), &header,
-@@ -2329,9 +2330,9 @@
-   dout(15) << __func__ << " " << c << dendl;
- 
-   int r;
-   uint64_t modified_object = 0;
--  StripObjectMap::StripObjectHeader *header;
-+  StripObjectMap::StripObjectHeaderRef header;
-   vector<ghobject_t> oids;
- 
-   r = t.lookup_cached_header(get_coll_for_coll(), make_ghobject_for_coll(c),
-                              &header, false);
-@@ -2346,9 +2347,9 @@
-     if (iter->first.first != c)
-       continue;
- 
-     modified_object++;
--    if (!iter->second.deleted) {
-+    if (!iter->second->deleted) {
-       r = -ENOTEMPTY;
-       goto out;
-     }
-   }
-@@ -2368,9 +2369,9 @@
-       goto out;
-     }
-   }
- 
--  r = t.clear_buffer(*header);
-+  r = t.clear_buffer(header);
- 
- out:
-   dout(10) << __func__ << " " << c << " = " << r << dendl;
-   return r;
-@@ -2384,9 +2385,9 @@
-   dout(15) << __func__ <<  " " << c << "/" << o << " from " << oldcid << "/"
-            << o << dendl;
- 
-   bufferlist bl;
--  StripObjectMap::StripObjectHeader *header, *old_header;
-+  StripObjectMap::StripObjectHeaderRef header, old_header;
- 
-   int r = t.lookup_cached_header(oldcid, o, &old_header, false);
-   if (r < 0) {
-     goto out;
-@@ -2399,15 +2400,15 @@
-              << o << " already exist " << dendl;
-     goto out;
-   }
- 
--  r = _generic_read(*old_header, 0, old_header->max_size, bl, &t);
-+  r = _generic_read(old_header, 0, old_header->max_size, bl, &t);
-   if (r < 0) {
-     r = -EINVAL;
-     goto out;
-   }
- 
--  r = _generic_write(*header, 0, bl.length(), bl, t);
-+  r = _generic_write(header, 0, bl.length(), bl, t);
-   if (r < 0) {
-     r = -EINVAL;
-   }
- 
-@@ -2424,9 +2425,9 @@
- {
-   dout(15) << __func__ << " " << c << "/" << o << " from " << oldcid << "/"
-            << oldoid << dendl;
-   int r;
--  StripObjectMap::StripObjectHeader *header;
-+  StripObjectMap::StripObjectHeaderRef header;
- 
-   r = t.lookup_cached_header(c, o, &header, false);
-   if (r == 0) {
-     dout(10) << __func__ << " " << oldcid << "/" << oldoid << " -> " << c
-@@ -2440,9 +2441,9 @@
-              << "/" << o << " = " << r << dendl;
-     return r;
-   }
- 
--  t.rename_buffer(*header, c, o);
-+  t.rename_buffer(header, c, o);
- 
-   dout(10) << __func__ << " " << c << "/" << o << " from " << oldcid << "/"
-            << oldoid << " = " << r << dendl;
-   return r;
-@@ -2452,9 +2453,9 @@
-                                                 BufferTransaction &t)
- {
-   dout(15) << __func__ << " " << cid << dendl;
- 
--  StripObjectMap::StripObjectHeader *header;
-+  StripObjectMap::StripObjectHeaderRef header;
- 
-   int r = t.lookup_cached_header(get_coll_for_coll(),
-                                  make_ghobject_for_coll(cid),
-                                  &header, false);
-@@ -2477,9 +2478,9 @@
-         return r;
-     }
-   }
- 
--  r = t.clear_buffer(*header);
-+  r = t.clear_buffer(header);
- 
-   dout(10) << __func__ << " " << cid  << " r = " << r << dendl;
-   return 0;
- }
-@@ -2489,9 +2490,9 @@
- {
-   dout(10) << __func__ << " origin cid " << cid << " new cid " << ncid
-            << dendl;
- 
--  StripObjectMap::StripObjectHeader *header;
-+  StripObjectMap::StripObjectHeaderRef header;
- 
-   int r = t.lookup_cached_header(get_coll_for_coll(),
-                                  make_ghobject_for_coll(ncid),
-                                  &header, false);
-@@ -2531,9 +2532,9 @@
-     objects.clear();
-     current = next;
-   }
- 
--  t.rename_buffer(*header, get_coll_for_coll(), make_ghobject_for_coll(ncid));
-+  t.rename_buffer(header, get_coll_for_coll(), make_ghobject_for_coll(ncid));
- 
-   dout(10) << __func__ << " origin cid " << cid << " new cid " << ncid
-            << dendl;
-   return 0;
-@@ -2559,11 +2560,11 @@
- bool KeyValueStore::collection_exists(coll_t c)
- {
-   dout(10) << __func__ << " " << dendl;
- 
--  StripObjectMap::StripObjectHeader header;
-+  StripObjectMap::StripObjectHeaderRef header;
-   int r = backend->lookup_strip_header(get_coll_for_coll(),
--                                       make_ghobject_for_coll(c), header);
-+                                       make_ghobject_for_coll(c), &header);
-   if (r < 0) {
-     return false;
-   }
-   return true;
-@@ -2651,17 +2652,16 @@
-                             bufferlist *bl, map<string, bufferlist> *out)
- {
-   dout(15) << __func__ << " " << c << "/" << hoid << dendl;
- 
--  StripObjectMap::StripObjectHeader header;
-+  StripObjectMap::StripObjectHeaderRef header;
- 
--  int r = backend->lookup_strip_header(c, hoid, header);
-+  int r = backend->lookup_strip_header(c, hoid, &header);
-   if (r < 0) {
-     dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
-     return r;
-   }
- 
--
-   r = backend->get_with_header(header, OBJECT_OMAP, out);
-   if (r < 0 && r != -ENOENT) {
-     dout(10) << __func__ << " err r =" << r << dendl;
-     return r;
-@@ -2691,11 +2691,18 @@
-   dout(15) << __func__ << " " << c << "/" << hoid << dendl;
- 
-   set<string> keys;
-   map<string, bufferlist> got;
-+  StripObjectMap::StripObjectHeaderRef header;
-+
-+  int r = backend->lookup_strip_header(c, hoid, &header);
-+  if (r < 0) {
-+    dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
-+    return r;
-+  }
- 
-   keys.insert(OBJECT_OMAP_HEADER_KEY);
--  int r = backend->get_values(c, hoid, OBJECT_OMAP_HEADER, keys, &got);
-+  r = backend->get_values_with_header(header, OBJECT_OMAP_HEADER, keys, &got);
-   if (r < 0 && r != -ENOENT) {
-     dout(10) << __func__ << " err r =" << r << dendl;
-     return r;
-   }
-@@ -2711,9 +2718,16 @@
- int KeyValueStore::omap_get_keys(coll_t c, const ghobject_t &hoid, set<string> *keys)
- {
-   dout(15) << __func__ << " " << c << "/" << hoid << dendl;
- 
--  int r = backend->get_keys(c, hoid, OBJECT_OMAP, keys);
-+  StripObjectMap::StripObjectHeaderRef header;
-+  int r = backend->lookup_strip_header(c, hoid, &header);
-+  if (r < 0) {
-+    dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
-+    return r;
-+  }
-+
-+  r = backend->get_keys_with_header(header, OBJECT_OMAP, keys);
-   if (r < 0 && r != -ENOENT) {
-     return r;
-   }
-   return 0;
-@@ -2724,9 +2738,16 @@
-                                    map<string, bufferlist> *out)
- {
-   dout(15) << __func__ << " " << c << "/" << hoid << dendl;
- 
--  int r = backend->get_values(c, hoid, OBJECT_OMAP, keys, out);
-+  StripObjectMap::StripObjectHeaderRef header;
-+  int r = backend->lookup_strip_header(c, hoid, &header);
-+  if (r < 0) {
-+    dout(10) << __func__ << " lookup_strip_header failed: r =" << r << dendl;
-+    return r;
-+  }
-+
-+  r = backend->get_values_with_header(header, OBJECT_OMAP, keys, out);
-   if (r < 0 && r != -ENOENT) {
-     return r;
-   }
-   return 0;
-@@ -2755,9 +2776,9 @@
-                                BufferTransaction &t)
- {
-   dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
- 
--  StripObjectMap::StripObjectHeader *header;
-+  StripObjectMap::StripObjectHeaderRef header;
- 
-   int r = t.lookup_cached_header(cid, hoid, &header, false);
-   if (r < 0) {
-     dout(10) << __func__ << " " << cid << "/" << hoid << " "
-@@ -2765,29 +2786,29 @@
-     return r;
-   }
- 
-   set<string> keys;
--  r = backend->get_keys_with_header(*header, OBJECT_OMAP, &keys);
-+  r = backend->get_keys_with_header(header, OBJECT_OMAP, &keys);
-   if (r < 0 && r != -ENOENT) {
-     dout(10) << __func__ << " could not get omap_keys r = " << r << dendl;
-     return r;
-   }
- 
--  r = t.remove_buffer_keys(*header, OBJECT_OMAP, keys);
-+  r = t.remove_buffer_keys(header, OBJECT_OMAP, keys);
-   if (r < 0) {
-     dout(10) << __func__ << " could not remove keys r = " << r << dendl;
-     return r;
-   }
- 
-   keys.clear();
-   keys.insert(OBJECT_OMAP_HEADER_KEY);
--  r = t.remove_buffer_keys(*header, OBJECT_OMAP_HEADER, keys);
-+  r = t.remove_buffer_keys(header, OBJECT_OMAP_HEADER, keys);
-   if (r < 0) {
-     dout(10) << __func__ << " could not remove keys r = " << r << dendl;
-     return r;
-   }
- 
--  t.clear_buffer_keys(*header, OBJECT_OMAP_HEADER);
-+  t.clear_buffer_keys(header, OBJECT_OMAP_HEADER);
- 
-   dout(10) << __func__ << " " << cid << "/" << hoid << " r = " << r << dendl;
-   return 0;
- }
-@@ -2797,18 +2818,18 @@
-                                  BufferTransaction &t)
- {
-   dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
- 
--  StripObjectMap::StripObjectHeader *header;
-+  StripObjectMap::StripObjectHeaderRef header;
- 
-   int r = t.lookup_cached_header(cid, hoid, &header, false);
-   if (r < 0) {
-     dout(10) << __func__ << " " << cid << "/" << hoid << " "
-              << " failed to get header: r = " << r << dendl;
-     return r;
-   }
- 
--  t.set_buffer_keys(*header, OBJECT_OMAP, aset);
-+  t.set_buffer_keys(header, OBJECT_OMAP, aset);
- 
-   return 0;
- }
- 
-@@ -2817,18 +2838,18 @@
-                                 BufferTransaction &t)
- {
-   dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
- 
--  StripObjectMap::StripObjectHeader *header;
-+  StripObjectMap::StripObjectHeaderRef header;
- 
-   int r = t.lookup_cached_header(cid, hoid, &header, false);
-   if (r < 0) {
-     dout(10) << __func__ << " " << cid << "/" << hoid << " "
-              << " failed to get header: r = " << r << dendl;
-     return r;
-   }
- 
--  r = t.remove_buffer_keys(*header, OBJECT_OMAP, keys);
-+  r = t.remove_buffer_keys(header, OBJECT_OMAP, keys);
- 
-   dout(10) << __func__ << " " << cid << "/" << hoid << " r = " << r << dendl;
-   return r;
- }
-@@ -2860,9 +2881,9 @@
- {
-   dout(15) << __func__ << " " << cid << "/" << hoid << dendl;
- 
-   map<string, bufferlist> sets;
--  StripObjectMap::StripObjectHeader *header;
-+  StripObjectMap::StripObjectHeaderRef header;
- 
-   int r = t.lookup_cached_header(cid, hoid, &header, false);
-   if (r < 0) {
-     dout(10) << __func__ << " " << cid << "/" << hoid << " "
-@@ -2870,9 +2891,9 @@
-     return r;
-   }
- 
-   sets[OBJECT_OMAP_HEADER_KEY] = bl;
--  t.set_buffer_keys(*header, OBJECT_OMAP_HEADER, sets);
-+  t.set_buffer_keys(header, OBJECT_OMAP_HEADER, sets);
-   return 0;
- }
- 
- int KeyValueStore::_split_collection(coll_t cid, uint32_t bits, uint32_t rem,
-@@ -2880,9 +2901,9 @@
- {
-   {
-     dout(15) << __func__ << " " << cid << " bits: " << bits << dendl;
- 
--    StripObjectMap::StripObjectHeader *header;
-+    StripObjectMap::StripObjectHeaderRef header;
- 
-     int r = t.lookup_cached_header(get_coll_for_coll(),
-                                    make_ghobject_for_coll(cid),
-                                    &header, false);
---- a/src/os/KeyValueStore.h
-+++ b/src/os/KeyValueStore.h
-@@ -35,10 +35,10 @@
- #include "common/fd.h"
- 
- #include "common/Mutex.h"
- #include "GenericObjectMap.h"
--#include "SequencerPosition.h"
- #include "KeyValueDB.h"
-+#include "common/random_cache.hpp"
- 
- #include "include/uuid.h"
- 
- enum kvstore_types {
-@@ -47,8 +47,10 @@
-     KV_TYPE_OTHER
- };
- 
- 
-+static uint64_t default_strip_size = 1024;
-+
- class StripObjectMap: public GenericObjectMap {
-  public:
- 
-   struct StripExtent {
-@@ -64,9 +66,8 @@
-     // Persistent state
-     uint64_t strip_size;
-     uint64_t max_size;
-     vector<char> bits;
--    SequencerPosition spos;
- 
-     // soft state
-     Header header; // FIXME: Hold lock to avoid concurrent operations, it will
-                    // also block read operation which not should be permitted.
-@@ -81,67 +82,66 @@
-       ENCODE_START(1, 1, bl);
-       ::encode(strip_size, bl);
-       ::encode(max_size, bl);
-       ::encode(bits, bl);
--      ::encode(spos, bl);
-       ENCODE_FINISH(bl);
-     }
- 
-     void decode(bufferlist::iterator &bl) {
-       DECODE_START(1, bl);
-       ::decode(strip_size, bl);
-       ::decode(max_size, bl);
-       ::decode(bits, bl);
--      ::decode(spos, bl);
-       DECODE_FINISH(bl);
-     }
-   };
--
--  bool check_spos(const StripObjectHeader &header,
--                  const SequencerPosition &spos);
--  void sync_wrap(StripObjectHeader &strip_header, KeyValueDB::Transaction t,
--                 const SequencerPosition &spos);
-+  typedef ceph::shared_ptr<StripObjectHeader> StripObjectHeaderRef;
- 
-   static int file_to_extents(uint64_t offset, size_t len, uint64_t strip_size,
-                              vector<StripExtent> &extents);
-   int lookup_strip_header(const coll_t & cid, const ghobject_t &oid,
--                          StripObjectHeader &header);
--  int save_strip_header(StripObjectHeader &header,
--                        const SequencerPosition &spos,
--                        KeyValueDB::Transaction t);
-+                          StripObjectHeaderRef *header);
-+  int save_strip_header(StripObjectHeaderRef header, KeyValueDB::Transaction t);
-   int create_strip_header(const coll_t &cid, const ghobject_t &oid,
--                          StripObjectHeader &strip_header,
-+                          StripObjectHeaderRef *strip_header,
-                           KeyValueDB::Transaction t);
--  void clone_wrap(StripObjectHeader &old_header,
-+  void clone_wrap(StripObjectHeaderRef old_header,
-                   const coll_t &cid, const ghobject_t &oid,
-                   KeyValueDB::Transaction t,
--                  StripObjectHeader *origin_header,
--                  StripObjectHeader *target_header);
--  void rename_wrap(const coll_t &cid, const ghobject_t &oid,
-+                  StripObjectHeaderRef *target_header);
-+  void rename_wrap(StripObjectHeaderRef old_header, const coll_t &cid, const ghobject_t &oid,
-                    KeyValueDB::Transaction t,
--                   StripObjectHeader *header);
-+                   StripObjectHeaderRef *new_header);
-   // Already hold header to avoid lock header seq again
-   int get_with_header(
--    const StripObjectHeader &header,
-+    const StripObjectHeaderRef header,
-     const string &prefix,
-     map<string, bufferlist> *out
-     );
- 
-   int get_values_with_header(
--    const StripObjectHeader &header,
-+    const StripObjectHeaderRef header,
-     const string &prefix,
-     const set<string> &keys,
-     map<string, bufferlist> *out
-     );
-   int get_keys_with_header(
--    const StripObjectHeader &header,
-+    const StripObjectHeaderRef header,
-     const string &prefix,
-     set<string> *keys
-     );
- 
--  StripObjectMap(KeyValueDB *db): GenericObjectMap(db) {}
-+  Mutex lock;
-+  void invalidate_cache(const coll_t &c, const ghobject_t &oid) {
-+    Mutex::Locker l(lock);
-+    caches.clear(oid);
-+  }
- 
--  static const uint64_t default_strip_size = 1024;
-+  RandomCache<ghobject_t, pair<coll_t, StripObjectHeaderRef> > caches;
-+  StripObjectMap(KeyValueDB *db): GenericObjectMap(db),
-+                                  lock("StripObjectMap::lock"),
-+                                  caches(g_conf->keyvaluestore_header_cache_size)
-+  {}
- };
- 
- 
- class KeyValueStore : public ObjectStore,
-@@ -160,9 +160,9 @@
-   std::string current_fn;
-   std::string current_op_seq_fn;
-   uuid_d fsid;
- 
--  int fsid_fd, op_fd, current_fd;
-+  int fsid_fd, current_fd;
- 
-   enum kvstore_types kv_type;
- 
-   deque<uint64_t> snaps;
-@@ -209,41 +209,51 @@
-   // 3. Object modify(including omap, xattr)
-   // 4. Clone or rename
-   struct BufferTransaction {
-     typedef pair<coll_t, ghobject_t> uniq_id;
--    typedef map<uniq_id, StripObjectMap::StripObjectHeader> StripHeaderMap;
-+    typedef map<uniq_id, StripObjectMap::StripObjectHeaderRef> StripHeaderMap;
- 
-     //Dirty records
-     StripHeaderMap strip_headers;
-+    list<Context*> finishes;
- 
-     KeyValueStore *store;
- 
--    SequencerPosition spos;
-     KeyValueDB::Transaction t;
- 
-     int lookup_cached_header(const coll_t &cid, const ghobject_t &oid,
--                             StripObjectMap::StripObjectHeader **strip_header,
-+                             StripObjectMap::StripObjectHeaderRef *strip_header,
-                              bool create_if_missing);
--    int get_buffer_keys(StripObjectMap::StripObjectHeader &strip_header,
-+    int get_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header,
-                         const string &prefix, const set<string> &keys,
-                         map<string, bufferlist> *out);
--    void set_buffer_keys(StripObjectMap::StripObjectHeader &strip_header,
-+    void set_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header,
-                          const string &prefix, map<string, bufferlist> &bl);
--    int remove_buffer_keys(StripObjectMap::StripObjectHeader &strip_header,
-+    int remove_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header,
-                            const string &prefix, const set<string> &keys);
--    void clear_buffer_keys(StripObjectMap::StripObjectHeader &strip_header,
-+    void clear_buffer_keys(StripObjectMap::StripObjectHeaderRef strip_header,
-                            const string &prefix);
--    int clear_buffer(StripObjectMap::StripObjectHeader &strip_header);
--    void clone_buffer(StripObjectMap::StripObjectHeader &old_header,
-+    int clear_buffer(StripObjectMap::StripObjectHeaderRef strip_header);
-+    void clone_buffer(StripObjectMap::StripObjectHeaderRef old_header,
-                       const coll_t &cid, const ghobject_t &oid);
--    void rename_buffer(StripObjectMap::StripObjectHeader &old_header,
-+    void rename_buffer(StripObjectMap::StripObjectHeaderRef old_header,
-                        const coll_t &cid, const ghobject_t &oid);
-     int submit_transaction();
- 
--    BufferTransaction(KeyValueStore *store,
--                      SequencerPosition &spos): store(store), spos(spos) {
-+    BufferTransaction(KeyValueStore *store): store(store) {
-       t = store->backend->get_transaction();
-     }
-+
-+    struct InvalidateCacheContext : public Context {
-+      KeyValueStore *store;
-+      const coll_t cid;
-+      const ghobject_t oid;
-+      InvalidateCacheContext(KeyValueStore *s, const coll_t &c, const ghobject_t &oid): store(s), cid(c), oid(oid) {}
-+      void finish(int r) {
-+      if (r == 0)
-+        store->backend->invalidate_cache(cid, oid);
-+      }
-+    };
-   };
- 
-   // -- op workqueue --
-   struct Op {
-@@ -256,52 +266,111 @@
-   };
-   class OpSequencer : public Sequencer_impl {
-     Mutex qlock; // to protect q, for benefit of flush (peek/dequeue also protected by lock)
-     list<Op*> q;
--    list<uint64_t> jq;
-     Cond cond;
-+    list<pair<uint64_t, Context*> > flush_commit_waiters;
-+    uint64_t op; // used by flush() to know the sequence of op
-    public:
-     Sequencer *parent;
-     Mutex apply_lock;  // for apply mutual exclusion
-+    
-+    /// get_max_uncompleted
-+    bool _get_max_uncompleted(
-+      uint64_t *seq ///< [out] max uncompleted seq
-+      ) {
-+      assert(qlock.is_locked());
-+      assert(seq);
-+      *seq = 0;
-+      if (q.empty()) {
-+	return true;
-+      } else {
-+	*seq = q.back()->op;
-+	return false;
-+      }
-+    } /// @returns true if the queue is empty
-+
-+    /// get_min_uncompleted
-+    bool _get_min_uncompleted(
-+      uint64_t *seq ///< [out] min uncompleted seq
-+      ) {
-+      assert(qlock.is_locked());
-+      assert(seq);
-+      *seq = 0;
-+      if (q.empty()) {
-+	return true;
-+      } else {
-+	*seq = q.front()->op;
-+	return false;
-+      }
-+    } /// @returns true if both queues are empty
-+
-+    void _wake_flush_waiters(list<Context*> *to_queue) {
-+      uint64_t seq;
-+      if (_get_min_uncompleted(&seq))
-+	seq = -1;
-+
-+      for (list<pair<uint64_t, Context*> >::iterator i =
-+	     flush_commit_waiters.begin();
-+	   i != flush_commit_waiters.end() && i->first < seq;
-+	   flush_commit_waiters.erase(i++)) {
-+	to_queue->push_back(i->second);
-+      }
-+    }
- 
-     void queue(Op *o) {
-       Mutex::Locker l(qlock);
-       q.push_back(o);
-+      op++;
-+      o->op = op;
-     }
-     Op *peek_queue() {
-       assert(apply_lock.is_locked());
-       return q.front();
-     }
--    Op *dequeue() {
-+
-+    Op *dequeue(list<Context*> *to_queue) {
-+      assert(to_queue);
-       assert(apply_lock.is_locked());
-       Mutex::Locker l(qlock);
-       Op *o = q.front();
-       q.pop_front();
-       cond.Signal();
-+
-+      _wake_flush_waiters(to_queue);
-       return o;
-     }
-+
-     void flush() {
-       Mutex::Locker l(qlock);
- 
-       // get max for journal _or_ op queues
-       uint64_t seq = 0;
-       if (!q.empty())
-         seq = q.back()->op;
--      if (!jq.empty() && jq.back() > seq)
--        seq = jq.back();
- 
-       if (seq) {
-         // everything prior to our watermark to drain through either/both
-         // queues
--        while ((!q.empty() && q.front()->op <= seq) ||
--                (!jq.empty() && jq.front() <= seq))
-+        while (!q.empty() && q.front()->op <= seq)
-           cond.Wait(qlock);
-       }
-     }
-+    bool flush_commit(Context *c) {
-+      Mutex::Locker l(qlock);
-+      uint64_t seq = 0;
-+      if (_get_max_uncompleted(&seq)) {
-+	delete c;
-+	return true;
-+      } else {
-+	flush_commit_waiters.push_back(make_pair(seq, c));
-+	return false;
-+      }
-+    }
- 
-     OpSequencer()
-       : qlock("KeyValueStore::OpSequencer::qlock", false, false),
--	parent(0),
-+        op(0), parent(0),
- 	apply_lock("KeyValueStore::OpSequencer::apply_lock", false, false) {}
-     ~OpSequencer() {
-       assert(q.empty());
-     }
-@@ -416,9 +485,8 @@
-     return _do_transactions(tls, op_seq, 0);
-   }
-   unsigned _do_transaction(Transaction& transaction,
-                            BufferTransaction &bt,
--                           SequencerPosition& spos,
-                            ThreadPool::TPHandle *handle);
- 
-   int queue_transactions(Sequencer *osr, list<Transaction*>& tls,
-                          TrackedOpRef op = TrackedOpRef(),
-@@ -427,12 +495,12 @@
- 
-   // ------------------
-   // objects
- 
--  int _generic_read(StripObjectMap::StripObjectHeader &header,
-+  int _generic_read(StripObjectMap::StripObjectHeaderRef header,
-                     uint64_t offset, size_t len, bufferlist& bl,
-                     bool allow_eio = false, BufferTransaction *bt = 0);
--  int _generic_write(StripObjectMap::StripObjectHeader &header,
-+  int _generic_write(StripObjectMap::StripObjectHeaderRef header,
-                      uint64_t offset, size_t len, const bufferlist& bl,
-                      BufferTransaction &t, bool replica = false);
- 
-   bool exists(coll_t cid, const ghobject_t& oid);
-@@ -571,28 +639,8 @@
-   static const string OBJECT_OMAP_HEADER_KEY;
-   static const string COLLECTION;
-   static const string COLLECTION_ATTR;
-   static const uint32_t COLLECTION_VERSION = 1;
--
--  class SubmitManager {
--    Mutex lock;
--    uint64_t op_seq;
--    uint64_t op_submitted;
--   public:
--    SubmitManager() :
--        lock("JOS::SubmitManager::lock", false, true, false, g_ceph_context),
--        op_seq(0), op_submitted(0)
--    {}
--    uint64_t op_submit_start();
--    void op_submit_finish(uint64_t op);
--    void set_op_seq(uint64_t seq) {
--        Mutex::Locker l(lock);
--        op_submitted = op_seq = seq;
--    }
--    uint64_t get_op_seq() {
--        return op_seq;
--    }
--  } submit_manager;
- };
- 
- WRITE_CLASS_ENCODER(StripObjectMap::StripObjectHeader)
- 
---- a/src/os/LFNIndex.cc
-+++ b/src/os/LFNIndex.cc
-@@ -60,8 +60,19 @@
-     ++current_failure;
-   }
- }
- 
-+// Helper to close fd's when we leave scope.  This is useful when used
-+// in combination with RetryException, thrown by the above.
-+struct FDCloser {
-+  int fd;
-+  FDCloser(int f) : fd(f) {}
-+  ~FDCloser() {
-+    VOID_TEMP_FAILURE_RETRY(::close(fd));
-+  }
-+};
-+
-+
- /* Public methods */
- 
- void LFNIndex::set_ref(ceph::shared_ptr<CollectionIndex> ref)
- {
-@@ -159,11 +170,11 @@
-   maybe_inject_failure();
-   int fd = ::open(get_full_path_subdir(path).c_str(), O_RDONLY);
-   if (fd < 0)
-     return -errno;
-+  FDCloser f(fd);
-   maybe_inject_failure();
-   int r = ::fsync(fd);
--  VOID_TEMP_FAILURE_RETRY(::close(fd));
-   maybe_inject_failure();
-   if (r < 0)
-     return -errno;
-   else
-@@ -752,9 +763,10 @@
-   char buf[FILENAME_MAX_LEN + 1];
-   for ( ; ; ++i) {
-     candidate = lfn_get_short_name(oid, i);
-     candidate_path = get_full_path(path, candidate);
--    r = chain_getxattr(candidate_path.c_str(), get_lfn_attr().c_str(), buf, sizeof(buf));
-+    r = chain_getxattr(candidate_path.c_str(), get_lfn_attr().c_str(),
-+		       buf, sizeof(buf));
-     if (r < 0) {
-       if (errno != ENODATA && errno != ENOENT)
- 	return -errno;
-       if (errno == ENODATA) {
-@@ -783,8 +795,40 @@
-       if (exists)
- 	*exists = 1;
-       return 0;
-     }
-+    r = chain_getxattr(candidate_path.c_str(), get_alt_lfn_attr().c_str(),
-+		       buf, sizeof(buf));
-+    if (r > 0) {
-+      // only consider alt name if nlink > 1
-+      struct stat st;
-+      int rc = ::stat(candidate_path.c_str(), &st);
-+      if (rc < 0)
-+	return -errno;
-+      if (st.st_nlink <= 1) {
-+	// left over from incomplete unlink, remove
-+	maybe_inject_failure();
-+	dout(20) << __func__ << " found extra alt attr for " << candidate_path
-+		 << ", long name " << string(buf, r) << dendl;
-+	rc = chain_removexattr(candidate_path.c_str(),
-+			       get_alt_lfn_attr().c_str());
-+	maybe_inject_failure();
-+	if (rc < 0)
-+	  return rc;
-+	continue;
-+      }
-+      buf[MIN((int)sizeof(buf) - 1, r)] = '\0';
-+      if (!strcmp(buf, full_name.c_str())) {
-+	dout(20) << __func__ << " used alt attr for " << full_name << dendl;
-+	if (mangled_name)
-+	  *mangled_name = candidate;
-+	if (out_path)
-+	  *out_path = candidate_path;
-+	if (exists)
-+	  *exists = 1;
-+	return 0;
-+      }
-+    }
-   }
-   assert(0); // Unreachable
-   return 0;
- }
-@@ -797,9 +841,26 @@
-     return 0;
-   string full_path = get_full_path(path, mangled_name);
-   string full_name = lfn_generate_object_name(oid);
-   maybe_inject_failure();
--  return chain_setxattr(full_path.c_str(), get_lfn_attr().c_str(), 
-+
-+  // if the main attr exists and is different, move it to the alt attr.
-+  char buf[FILENAME_MAX_LEN + 1];
-+  int r = chain_getxattr(full_path.c_str(), get_lfn_attr().c_str(),
-+			 buf, sizeof(buf));
-+  if (r >= 0 && (r != (int)full_name.length() ||
-+		 memcmp(buf, full_name.c_str(), full_name.length()))) {
-+    dout(20) << __func__ << " " << mangled_name
-+	     << " moving old name to alt attr "
-+	     << string(buf, r)
-+	     << ", new name is " << full_name << dendl;
-+    r = chain_setxattr(full_path.c_str(), get_alt_lfn_attr().c_str(),
-+		       buf, r);
-+    if (r < 0)
-+      return r;
-+  }
-+
-+  return chain_setxattr(full_path.c_str(), get_lfn_attr().c_str(),
- 		     full_name.c_str(), full_name.size());
- }
- 
- int LFNIndex::lfn_unlink(const vector<string> &path,
-@@ -838,28 +899,37 @@
- 	return -errno;
-       }
-     }
-   }
-+  string full_path = get_full_path(path, mangled_name);
-+  int fd = ::open(full_path.c_str(), O_RDONLY);
-+  if (fd < 0)
-+    return -errno;
-+  FDCloser f(fd);
-   if (i == removed_index + 1) {
--    string full_path = get_full_path(path, mangled_name);
-     maybe_inject_failure();
-     int r = ::unlink(full_path.c_str());
-     maybe_inject_failure();
-     if (r < 0)
-       return -errno;
--    else
--      return 0;
-   } else {
--    string rename_to = get_full_path(path, mangled_name);
-+    string& rename_to = full_path;
-     string rename_from = get_full_path(path, lfn_get_short_name(oid, i - 1));
-     maybe_inject_failure();
-     int r = ::rename(rename_from.c_str(), rename_to.c_str());
-     maybe_inject_failure();
-     if (r < 0)
-       return -errno;
--    else
--      return 0;
-   }
-+  struct stat st;
-+  int r = ::fstat(fd, &st);
-+  if (r == 0 && st.st_nlink > 0) {
-+    // remove alt attr
-+    dout(20) << __func__ << " removing alt attr from " << full_path << dendl;
-+    fsync_dir(path);
-+    chain_fremovexattr(fd, get_alt_lfn_attr().c_str());
-+  }
-+  return r;
- }
- 
- int LFNIndex::lfn_translate(const vector<string> &path,
- 			    const string &short_name,
---- a/src/os/LFNIndex.h
-+++ b/src/os/LFNIndex.h
-@@ -122,9 +122,9 @@
-     error_injection_enabled = false;
-   }
- 
- private:
--  string lfn_attribute;
-+  string lfn_attribute, lfn_alt_attribute;
-   coll_t collection;
- 
- public:
-   /// Constructor
-@@ -145,9 +145,10 @@
-     } else {
-       char buf[100];
-       snprintf(buf, sizeof(buf), "%d", index_version);
-       lfn_attribute = LFN_ATTR + string(buf);
--    }
-+      lfn_alt_attribute = LFN_ATTR + string(buf) + "-alt";
-+   }
-   }
- 
-   coll_t coll() const { return collection; }
- 
-@@ -422,8 +423,11 @@
-    */
-   const string &get_lfn_attr() const {
-     return lfn_attribute;
-   }
-+  const string &get_alt_lfn_attr() const {
-+    return lfn_alt_attribute;
-+  }
- 
-   /**
-    * Gets the filename corresponsing to oid in path.
-    *
---- a/src/os/MemStore.cc
-+++ b/src/os/MemStore.cc
-@@ -949,9 +949,14 @@
-       }
-       break;
- 
-     case Transaction::OP_SETALLOCHINT:
--      // nop
-+      {
-+        coll_t cid(i.get_cid());
-+        ghobject_t oid = i.get_oid();
-+        (void)i.get_length();  // discard result
-+        (void)i.get_length();  // discard result
-+      }
-       break;
- 
-     default:
-       derr << "bad op " << op << dendl;
---- a/src/os/ObjectStore.cc
-+++ b/src/os/ObjectStore.cc
-@@ -143,9 +143,13 @@
- int ObjectStore::collection_list_range(coll_t c, hobject_t start, hobject_t end,
- 			    snapid_t seq, vector<hobject_t> *ls)
- {
-   vector<ghobject_t> go;
--  ghobject_t gstart(start), gend(end);
-+  // Starts with the smallest shard id and generation to
-+  // make sure the result list has the marker object
-+  ghobject_t gstart(start, 0, shard_id_t(0));
-+  // Exclusive end, choose the smallest end ghobject
-+  ghobject_t gend(end, 0, shard_id_t(0));
-   int ret = collection_list_range(c, gstart, gend, seq, &go);
-   if (ret == 0) {
-     ls->reserve(go.size());
-     for (vector<ghobject_t>::iterator i = go.begin(); i != go.end() ; ++i)
---- a/src/os/ObjectStore.h
-+++ b/src/os/ObjectStore.h
-@@ -127,8 +127,24 @@
-    * created in ...::queue_transaction(s)
-    */
-   struct Sequencer_impl {
-     virtual void flush() = 0;
-+
-+    /**
-+     * Async flush_commit
-+     *
-+     * There are two cases:
-+     * 1) sequencer is currently idle: the method returns true and
-+     *    c is deleted
-+     * 2) sequencer is not idle: the method returns false and c is
-+     *    called asyncronously with a value of 0 once all transactions
-+     *    queued on this sequencer prior to the call have been applied
-+     *    and committed.
-+     */
-+    virtual bool flush_commit(
-+      Context *c ///< [in] context to call upon flush/commit
-+      ) = 0; ///< @return true if idle, false otherwise
-+
-     virtual ~Sequencer_impl() {}
-   };
- 
-   /**
-@@ -152,8 +168,18 @@
-     void flush() {
-       if (p)
- 	p->flush();
-     }
-+
-+    /// @see Sequencer_impl::flush_commit()
-+    bool flush_commit(Context *c) {
-+      if (!p) {
-+	delete c;
-+	return true;
-+      } else {
-+	return p->flush_commit(c);
-+      }
-+    }
-   };
- 
-   /*********************************
-    *
---- a/src/osd/ECBackend.cc
-+++ b/src/osd/ECBackend.cc
-@@ -104,15 +104,15 @@
- }
- 
- void ECBackend::ReadOp::dump(Formatter *f) const
- {
--  f->dump_stream("tid") << tid;
-+  f->dump_unsigned("tid", tid);
-   if (op && op->get_req()) {
-     f->dump_stream("op") << *(op->get_req());
-   }
-   f->dump_stream("to_read") << to_read;
-   f->dump_stream("complete") << complete;
--  f->dump_stream("priority") << priority;
-+  f->dump_int("priority", priority);
-   f->dump_stream("obj_to_source") << obj_to_source;
-   f->dump_stream("source_to_obj") << source_to_obj;
-   f->dump_stream("in_progress") << in_progress;
- }
-@@ -157,9 +157,9 @@
-   f->dump_stream("missing_on") << missing_on;
-   f->dump_stream("missing_on_shards") << missing_on_shards;
-   f->dump_stream("recovery_info") << recovery_info;
-   f->dump_stream("recovery_progress") << recovery_progress;
--  f->dump_stream("pending_read") << pending_read;
-+  f->dump_bool("pending_read", pending_read);
-   f->dump_stream("state") << tostr(state);
-   f->dump_stream("waiting_on_pushes") << waiting_on_pushes;
-   f->dump_stream("extent_requested") << extent_requested;
- }
-@@ -828,8 +828,9 @@
-   get_parent()->log_operation(
-     op.log_entries,
-     op.updated_hit_set_history,
-     op.trim_to,
-+    op.trim_rollback_to,
-     !(op.t.empty()),
-     localt);
-   localt->append(op.t);
-   if (on_local_applied_sync) {
-@@ -1210,8 +1211,9 @@
-   const hobject_t &hoid,
-   const eversion_t &at_version,
-   PGTransaction *_t,
-   const eversion_t &trim_to,
-+  const eversion_t &trim_rollback_to,
-   vector<pg_log_entry_t> &log_entries,
-   boost::optional<pg_hit_set_history_t> &hset_history,
-   Context *on_local_applied_sync,
-   Context *on_all_applied,
-@@ -1225,8 +1227,9 @@
-   Op *op = &(tid_to_op_map[tid]);
-   op->hoid = hoid;
-   op->version = at_version;
-   op->trim_to = trim_to;
-+  op->trim_rollback_to = trim_rollback_to;
-   op->log_entries.swap(log_entries);
-   std::swap(op->updated_hit_set_history, hset_history);
-   op->on_local_applied_sync = on_local_applied_sync;
-   op->on_all_applied = on_all_applied;
-@@ -1531,8 +1534,9 @@
-       stats,
-       should_send ? iter->second : ObjectStore::Transaction(),
-       op->version,
-       op->trim_to,
-+      op->trim_rollback_to,
-       op->log_entries,
-       op->updated_hit_set_history,
-       op->temp_added,
-       op->temp_cleared);
---- a/src/osd/ECBackend.h
-+++ b/src/osd/ECBackend.h
-@@ -96,8 +96,9 @@
-     const hobject_t &hoid,
-     const eversion_t &at_version,
-     PGTransaction *t,
-     const eversion_t &trim_to,
-+    const eversion_t &trim_rollback_to,
-     vector<pg_log_entry_t> &log_entries,
-     boost::optional<pg_hit_set_history_t> &hset_history,
-     Context *on_local_applied_sync,
-     Context *on_all_applied,
-@@ -325,8 +326,9 @@
-   struct Op {
-     hobject_t hoid;
-     eversion_t version;
-     eversion_t trim_to;
-+    eversion_t trim_rollback_to;
-     vector<pg_log_entry_t> log_entries;
-     boost::optional<pg_hit_set_history_t> updated_hit_set_history;
-     Context *on_local_applied_sync;
-     Context *on_all_applied;
---- a/src/osd/ECMsgTypes.cc
-+++ b/src/osd/ECMsgTypes.cc
-@@ -15,9 +15,9 @@
- #include "ECMsgTypes.h"
- 
- void ECSubWrite::encode(bufferlist &bl) const
- {
--  ENCODE_START(2, 1, bl);
-+  ENCODE_START(3, 1, bl);
-   ::encode(from, bl);
-   ::encode(tid, bl);
-   ::encode(reqid, bl);
-   ::encode(soid, bl);
-@@ -28,14 +28,15 @@
-   ::encode(log_entries, bl);
-   ::encode(temp_added, bl);
-   ::encode(temp_removed, bl);
-   ::encode(updated_hit_set_history, bl);
-+  ::encode(trim_rollback_to, bl);
-   ENCODE_FINISH(bl);
- }
- 
- void ECSubWrite::decode(bufferlist::iterator &bl)
- {
--  DECODE_START(2, bl);
-+  DECODE_START(3, bl);
-   ::decode(from, bl);
-   ::decode(tid, bl);
-   ::decode(reqid, bl);
-   ::decode(soid, bl);
-@@ -48,8 +49,13 @@
-   ::decode(temp_removed, bl);
-   if (struct_v >= 2) {
-     ::decode(updated_hit_set_history, bl);
-   }
-+  if (struct_v >= 3) {
-+    ::decode(trim_rollback_to, bl);
-+  } else {
-+    trim_rollback_to = trim_to;
-+  }
-   DECODE_FINISH(bl);
- }
- 
- std::ostream &operator<<(
-@@ -57,20 +63,22 @@
- {
-   lhs << "ECSubWrite(tid=" << rhs.tid
-       << ", reqid=" << rhs.reqid
-       << ", at_version=" << rhs.at_version
--      << ", trim_to=" << rhs.trim_to;
-+      << ", trim_to=" << rhs.trim_to
-+      << ", trim_rollback_to=" << rhs.trim_rollback_to;
-   if (rhs.updated_hit_set_history)
-     lhs << ", has_updated_hit_set_history";
-   return lhs <<  ")";
- }
- 
- void ECSubWrite::dump(Formatter *f) const
- {
--  f->dump_stream("tid") << tid;
-+  f->dump_unsigned("tid", tid);
-   f->dump_stream("reqid") << reqid;
-   f->dump_stream("at_version") << at_version;
-   f->dump_stream("trim_to") << trim_to;
-+  f->dump_stream("trim_rollback_to") << trim_rollback_to;
-   f->dump_stream("has_updated_hit_set_history")
-     << static_cast<bool>(updated_hit_set_history);
- }
- 
-@@ -84,8 +92,14 @@
-   o.back()->tid = 4;
-   o.back()->reqid = osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678);
-   o.back()->at_version = eversion_t(10, 300);
-   o.back()->trim_to = eversion_t(5, 42);
-+  o.push_back(new ECSubWrite());
-+  o.back()->tid = 9;
-+  o.back()->reqid = osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678);
-+  o.back()->at_version = eversion_t(10, 300);
-+  o.back()->trim_to = eversion_t(5, 42);
-+  o.back()->trim_rollback_to = eversion_t(8, 250);
- }
- 
- void ECSubWriteReply::encode(bufferlist &bl) const
- {
-@@ -120,9 +134,9 @@
- }
- 
- void ECSubWriteReply::dump(Formatter *f) const
- {
--  f->dump_stream("tid") << tid;
-+  f->dump_unsigned("tid", tid);
-   f->dump_stream("last_complete") << last_complete;
-   f->dump_stream("committed") << committed;
-   f->dump_stream("applied") << applied;
- }
-@@ -170,9 +184,9 @@
- 
- void ECSubRead::dump(Formatter *f) const
- {
-   f->dump_stream("from") << from;
--  f->dump_stream("tid") << tid;
-+  f->dump_unsigned("tid", tid);
-   f->open_array_section("objects");
-   for (map<hobject_t, list<pair<uint64_t, uint64_t> > >::const_iterator i =
- 	 to_read.begin();
-        i != to_read.end();
-@@ -258,9 +272,9 @@
- 
- void ECSubReadReply::dump(Formatter *f) const
- {
-   f->dump_stream("from") << from;
--  f->dump_stream("tid") << tid;
-+  f->dump_unsigned("tid", tid);
-   f->open_array_section("buffers_read");
-   for (map<hobject_t, list<pair<uint64_t, bufferlist> > >::const_iterator i =
- 	 buffers_read.begin();
-        i != buffers_read.end();
---- a/src/osd/ECMsgTypes.h
-+++ b/src/osd/ECMsgTypes.h
-@@ -27,8 +27,9 @@
-   pg_stat_t stats;
-   ObjectStore::Transaction t;
-   eversion_t at_version;
-   eversion_t trim_to;
-+  eversion_t trim_rollback_to;
-   vector<pg_log_entry_t> log_entries;
-   set<hobject_t> temp_added;
-   set<hobject_t> temp_removed;
-   boost::optional<pg_hit_set_history_t> updated_hit_set_history;
-@@ -41,16 +42,18 @@
-     const pg_stat_t &stats,
-     const ObjectStore::Transaction &t,
-     eversion_t at_version,
-     eversion_t trim_to,
-+    eversion_t trim_rollback_to,
-     vector<pg_log_entry_t> log_entries,
-     boost::optional<pg_hit_set_history_t> updated_hit_set_history,
-     const set<hobject_t> &temp_added,
-     const set<hobject_t> &temp_removed)
-     : from(from), tid(tid), reqid(reqid),
-       soid(soid), stats(stats), t(t),
-       at_version(at_version),
--      trim_to(trim_to), log_entries(log_entries),
-+      trim_to(trim_to), trim_rollback_to(trim_rollback_to),
-+      log_entries(log_entries),
-       temp_added(temp_added),
-       temp_removed(temp_removed),
-       updated_hit_set_history(updated_hit_set_history) {}
-   void encode(bufferlist &bl) const;
---- a/src/osd/HitSet.h
-+++ b/src/osd/HitSet.h
-@@ -368,9 +368,9 @@
-     double get_fpp() const {
-       return (double)fpp_micro / 1000000.0;
-     }
-     void set_fpp(double f) {
--      fpp_micro = (unsigned)(f * 1000000.0);
-+      fpp_micro = (unsigned)(llrintl(f * (double)1000000.0));
-     }
- 
-     void encode(bufferlist& bl) const {
-       ENCODE_START(1, 1, bl);
---- a/src/osd/OSD.cc
-+++ b/src/osd/OSD.cc
-@@ -41,8 +41,9 @@
- #include "osdc/Objecter.h"
- 
- #include "common/ceph_argparse.h"
- #include "common/version.h"
-+#include "common/io_priority.h"
- 
- #include "os/ObjectStore.h"
- 
- #include "ReplicatedPG.h"
-@@ -190,8 +191,9 @@
-   rep_scrub_wq(osd->rep_scrub_wq),
-   push_wq("push_wq", cct->_conf->osd_recovery_thread_timeout, &osd->recovery_tp),
-   gen_wq("gen_wq", cct->_conf->osd_recovery_thread_timeout, &osd->recovery_tp),
-   class_handler(osd->class_handler),
-+  pg_epoch_lock("OSDService::pg_epoch_lock"),
-   publish_lock("OSDService::publish_lock"),
-   pre_publish_lock("OSDService::pre_publish_lock"),
-   sched_scrub_lock("OSDService::sched_scrub_lock"), scrubs_pending(0),
-   scrubs_active(0),
-@@ -1276,8 +1278,10 @@
-   recovery_tp.start();
-   disk_tp.start();
-   command_tp.start();
- 
-+  set_disk_tp_priority();
-+
-   // start the heartbeat
-   heartbeat_thread.create();
- 
-   // tick
-@@ -1304,8 +1308,10 @@
-   osd_lock.Lock();
-   if (is_stopping())
-     return 0;
- 
-+  check_config();
-+
-   dout(10) << "ensuring pgs have consumed prior maps" << dendl;
-   consume_map();
-   peering_wq.drain();
- 
-@@ -1662,10 +1668,12 @@
-   recovery_tp.stop();
-   dout(10) << "recovery tp stopped" << dendl;
- 
-   op_tp.drain();
-+  peering_wq.clear();
-+  scrub_finalize_wq.clear();
-   op_tp.stop();
--  dout(10) << "op tp stopped" << dendl;
-+  dout(10) << "osd tp stopped" << dendl;
- 
-   command_tp.drain();
-   command_tp.stop();
-   dout(10) << "command tp stopped" << dendl;
-@@ -1707,9 +1715,8 @@
-     Mutex::Locker l(pg_stat_queue_lock);
-     assert(pg_stat_queue.empty());
-   }
- 
--  peering_wq.clear();
-   // Remove PGs
- #ifdef PG_DEBUG_REFS
-   service.dump_live_pgids();
- #endif
-@@ -1853,8 +1860,10 @@
-   PG* pg = _make_pg(createmap, pgid);
- 
-   pg_map[pgid] = pg;
- 
-+  service.pg_add_epoch(pg->info.pgid, createmap->get_epoch());
-+
-   pg->lock(no_lockdep_check);
-   pg->get("PGMap");  // because it's in pg_map
-   return pg;
- }
-@@ -1884,8 +1893,9 @@
- {
-   epoch_t e(service.get_osdmap()->get_epoch());
-   pg->get("PGMap");  // For pg_map
-   pg_map[pg->info.pgid] = pg;
-+  service.pg_add_epoch(pg->info.pgid, pg->get_osdmap()->get_epoch());
-   dout(10) << "Adding newly split pg " << *pg << dendl;
-   vector<int> up, acting;
-   pg->get_osdmap()->pg_to_up_acting_osds(pg->info.pgid.pgid, up, acting);
-   int role = OSDMap::calc_pg_role(service.whoami, acting);
-@@ -4391,11 +4401,10 @@
-       // 1MB block sizes are big enough so that we get more stuff done.
-       // However, to avoid the osd from getting hung on this and having
-       // timers being triggered, we are going to limit the count assuming
-       // a configurable throughput and duration.
--      int64_t total_throughput =
-+      int64_t max_count =
-         g_conf->osd_bench_large_size_max_throughput * duration;
--      int64_t max_count = (int64_t) (total_throughput / bsize);
-       if (count > max_count) {
-         ss << "'count' values greater than " << max_count
-            << " for a block size of " << prettybyte_t(bsize) << ", assuming "
-            << prettybyte_t(g_conf->osd_bench_large_size_max_throughput) << "/s,"
-@@ -5712,13 +5721,14 @@
-       client_messenger->set_default_policy(p);
-     }
-   }
-   {
--    Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_MON);
-+    Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
-     uint64_t mask;
-     uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
-     if ((p.features_required & mask) != features) {
-       dout(0) << "crush map has features " << features
-+	      << " was " << p.features_required
- 	      << ", adjusting msgr requires for mons" << dendl;
-       p.features_required = (p.features_required & ~mask) | features;
-       client_messenger->set_policy(entity_name_t::TYPE_MON, p);
-     }
-@@ -5747,9 +5757,9 @@
-     }
-   }
- }
- 
--void OSD::advance_pg(
-+bool OSD::advance_pg(
-   epoch_t osd_epoch, PG *pg,
-   ThreadPool::TPHandle &handle,
-   PG::RecoveryCtx *rctx,
-   set<boost::intrusive_ptr<PG> > *new_pgs)
-@@ -5758,13 +5768,21 @@
-   epoch_t next_epoch = pg->get_osdmap()->get_epoch() + 1;
-   OSDMapRef lastmap = pg->get_osdmap();
- 
-   if (lastmap->get_epoch() == osd_epoch)
--    return;
-+    return true;
-   assert(lastmap->get_epoch() < osd_epoch);
- 
-+  epoch_t min_epoch = service.get_min_pg_epoch();
-+  epoch_t max;
-+  if (min_epoch) {
-+    max = min_epoch + g_conf->osd_map_max_advance;
-+  } else {
-+    max = next_epoch + g_conf->osd_map_max_advance;
-+  }
-+
-   for (;
--       next_epoch <= osd_epoch;
-+       next_epoch <= osd_epoch && next_epoch <= max;
-        ++next_epoch) {
-     OSDMapRef nextmap = service.try_get_map(next_epoch);
-     if (!nextmap)
-       continue;
-@@ -5794,9 +5812,17 @@
- 
-     lastmap = nextmap;
-     handle.reset_tp_timeout();
-   }
-+  service.pg_update_epoch(pg->info.pgid, lastmap->get_epoch());
-   pg->handle_activate_map(rctx);
-+  if (next_epoch <= osd_epoch) {
-+    dout(10) << __func__ << " advanced by max " << g_conf->osd_map_max_advance
-+	     << " past min epoch " << min_epoch
-+	     << " ... will requeue " << *pg << dendl;
-+    return false;
-+  }
-+  return true;
- }
- 
- /** 
-  * scan placement groups, initiate any replication
-@@ -6126,9 +6152,9 @@
-   }
-   return true;
- }
- 
--bool OSD::require_osd_peer(OpRequestRef op)
-+bool OSD::require_osd_peer(OpRequestRef& op)
- {
-   if (!op->get_req()->get_connection()->peer_is_osd()) {
-     dout(0) << "require_osd_peer received from non-osd " << op->get_req()->get_connection()->get_peer_addr()
- 	    << " " << *op->get_req() << dendl;
-@@ -6136,13 +6162,66 @@
-   }
-   return true;
- }
- 
-+bool OSD::require_self_aliveness(OpRequestRef& op, epoch_t epoch)
-+{
-+  if (epoch < up_epoch) {
-+    dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
-+    return false;
-+  }
-+
-+  if (!is_active()) {
-+    dout(7) << "still in boot state, dropping message " << *op->get_req() << dendl;
-+    return false;
-+  }
-+
-+  return true;
-+}
-+
-+bool OSD::require_same_peer_instance(OpRequestRef& op, OSDMapRef& map)
-+{
-+  Message *m = op->get_req();
-+  int from = m->get_source().num();
-+
-+  if (!map->have_inst(from) ||
-+      (map->get_cluster_addr(from) != m->get_source_inst().addr)) {
-+    dout(5) << "from dead osd." << from << ", marking down, "
-+	    << " msg was " << m->get_source_inst().addr
-+	    << " expected " << (map->have_inst(from) ?
-+				map->get_cluster_addr(from) : entity_addr_t())
-+	    << dendl;
-+    ConnectionRef con = m->get_connection();
-+    cluster_messenger->mark_down(con.get());
-+    Session *s = static_cast<Session*>(con->get_priv());
-+    if (s) {
-+      con->set_priv(NULL);   // break ref <-> session cycle, if any
-+      s->put();
-+    }
-+    return false;
-+  }
-+  return true;
-+}
-+
-+bool OSD::require_up_osd_peer(OpRequestRef& op, OSDMapRef& map,
-+                              epoch_t their_epoch)
-+{
-+  if (!require_self_aliveness(op, their_epoch)) {
-+    return false;
-+  } else if (!require_osd_peer(op)) {
-+    return false;
-+  } else if (map->get_epoch() >= their_epoch &&
-+	     !require_same_peer_instance(op, map)) {
-+    return false;
-+  }
-+  return true;
-+}
-+
- /*
-  * require that we have same (or newer) map, and that
-  * the source is the pg primary.
-  */
--bool OSD::require_same_or_newer_map(OpRequestRef op, epoch_t epoch)
-+bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch)
- {
-   Message *m = op->get_req();
-   dout(15) << "require_same_or_newer_map " << epoch << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
- 
-@@ -6154,32 +6233,15 @@
-     wait_for_new_map(op);
-     return false;
-   }
- 
--  if (epoch < up_epoch) {
--    dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
-+  if (!require_self_aliveness(op, epoch)) {
-     return false;
-   }
- 
-   // ok, our map is same or newer.. do they still exist?
--  if (m->get_connection()->get_messenger() == cluster_messenger) {
--    int from = m->get_source().num();
--    if (!osdmap->have_inst(from) ||
--	osdmap->get_cluster_addr(from) != m->get_source_inst().addr) {
--      dout(5) << "from dead osd." << from << ", marking down, "
--	      << " msg was " << m->get_source_inst().addr
--	      << " expected " << (osdmap->have_inst(from) ? osdmap->get_cluster_addr(from) : entity_addr_t())
--	      << dendl;
--      ConnectionRef con = m->get_connection();
--      con->set_priv(NULL);   // break ref <-> session cycle, if any
--      cluster_messenger->mark_down(con.get());
--      return false;
--    }
--  }
--
--  // ok, we have at least as new a map as they do.  are we (re)booting?
--  if (!is_active()) {
--    dout(7) << "still in boot state, dropping message " << *m << dendl;
-+  if (m->get_connection()->get_messenger() == cluster_messenger &&
-+      !require_same_peer_instance(op, osdmap)) {
-     return false;
-   }
- 
-   return true;
-@@ -7141,8 +7203,10 @@
-       PGRef(pg))
-     );
-   remove_wq.queue(make_pair(PGRef(pg), deleting));
- 
-+  service.pg_remove_epoch(pg->info.pgid);
-+
-   // remove from map
-   pg_map.erase(pg->info.pgid);
-   pg->put("PGMap"); // since we've taken it out of map
- }
-@@ -7554,9 +7618,9 @@
-     dout(3) << "replica op from before up" << dendl;
-     return;
-   }
- 
--  if (!require_osd_peer(op))
-+  if (!require_up_osd_peer(op, osdmap, m->map_epoch))
-     return;
- 
-   // must be a rep op.
-   assert(m->get_source().is_osd());
-@@ -7769,10 +7833,11 @@
-     if (pg->deleting) {
-       pg->unlock();
-       continue;
-     }
--    advance_pg(curmap->get_epoch(), pg, handle, &rctx, &split_pgs);
--    if (!pg->peering_queue.empty()) {
-+    if (!advance_pg(curmap->get_epoch(), pg, handle, &rctx, &split_pgs)) {
-+      pg->queue_null(curmap->get_epoch(), curmap->get_epoch());
-+    } else if (!pg->peering_queue.empty()) {
-       PG::CephPeeringEvtRef evt = pg->peering_queue.front();
-       pg->peering_queue.pop_front();
-       pg->handle_peering_event(evt, &rctx);
-     }
-@@ -7807,8 +7872,13 @@
-   static const char* KEYS[] = {
-     "osd_max_backfills",
-     "osd_op_complaint_time", "osd_op_log_threshold",
-     "osd_op_history_size", "osd_op_history_duration",
-+    "osd_map_cache_size",
-+    "osd_map_max_advance",
-+    "osd_pg_epoch_persisted_max_stale",
-+    "osd_disk_thread_ioprio_class",
-+    "osd_disk_thread_ioprio_priority",
-     NULL
-   };
-   return KEYS;
- }
-@@ -7829,8 +7899,40 @@
-       changed.count("osd_op_history_duration")) {
-     op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
-                                              cct->_conf->osd_op_history_duration);
-   }
-+  if (changed.count("osd_disk_thread_ioprio_class") ||
-+      changed.count("osd_disk_thread_ioprio_priority")) {
-+    set_disk_tp_priority();
-+  }
-+
-+  check_config();
-+}
-+
-+void OSD::check_config()
-+{
-+  // some sanity checks
-+  if (g_conf->osd_map_cache_size <= g_conf->osd_map_max_advance + 2) {
-+    clog.warn() << "osd_map_cache_size (" << g_conf->osd_map_cache_size << ")"
-+		<< " is not > osd_map_max_advance ("
-+		<< g_conf->osd_map_max_advance << ")";
-+  }
-+  if (g_conf->osd_map_cache_size <= (int)g_conf->osd_pg_epoch_persisted_max_stale + 2) {
-+    clog.warn() << "osd_map_cache_size (" << g_conf->osd_map_cache_size << ")"
-+		<< " is not > osd_pg_epoch_persisted_max_stale ("
-+		<< g_conf->osd_pg_epoch_persisted_max_stale << ")";
-+  }
-+}
-+
-+void OSD::set_disk_tp_priority()
-+{
-+  dout(10) << __func__
-+	   << " class " << cct->_conf->osd_disk_thread_ioprio_class
-+	   << " priority " << cct->_conf->osd_disk_thread_ioprio_priority
-+	   << dendl;
-+  int cls =
-+    ceph_ioprio_string_to_class(cct->_conf->osd_disk_thread_ioprio_class);
-+  disk_tp.set_ioprio(cls, cct->_conf->osd_disk_thread_ioprio_priority);
- }
- 
- // --------------------------------
- 
---- a/src/osd/OSD.h
-+++ b/src/osd/OSD.h
-@@ -333,8 +333,44 @@
-   ClassHandler  *&class_handler;
- 
-   void dequeue_pg(PG *pg, list<OpRequestRef> *dequeued);
- 
-+  // -- map epoch lower bound --
-+  Mutex pg_epoch_lock;
-+  multiset<epoch_t> pg_epochs;
-+  map<spg_t,epoch_t> pg_epoch;
-+
-+  void pg_add_epoch(spg_t pgid, epoch_t epoch) {
-+    Mutex::Locker l(pg_epoch_lock);
-+    map<spg_t,epoch_t>::iterator t = pg_epoch.find(pgid);
-+    assert(t == pg_epoch.end());
-+    pg_epoch[pgid] = epoch;
-+    pg_epochs.insert(epoch);
-+  }
-+  void pg_update_epoch(spg_t pgid, epoch_t epoch) {
-+    Mutex::Locker l(pg_epoch_lock);
-+    map<spg_t,epoch_t>::iterator t = pg_epoch.find(pgid);
-+    assert(t != pg_epoch.end());
-+    pg_epochs.erase(pg_epochs.find(t->second));
-+    t->second = epoch;
-+    pg_epochs.insert(epoch);
-+  }
-+  void pg_remove_epoch(spg_t pgid) {
-+    Mutex::Locker l(pg_epoch_lock);
-+    map<spg_t,epoch_t>::iterator t = pg_epoch.find(pgid);
-+    if (t != pg_epoch.end()) {
-+      pg_epochs.erase(pg_epochs.find(t->second));
-+      pg_epoch.erase(t);
-+    }
-+  }
-+  epoch_t get_min_pg_epoch() {
-+    Mutex::Locker l(pg_epoch_lock);
-+    if (pg_epochs.empty())
-+      return 0;
-+    else
-+      return *pg_epochs.begin();
-+  }
-+
-   // -- superblock --
-   Mutex publish_lock, pre_publish_lock; // pre-publish orders before publish
-   OSDSuperblock superblock;
-   OSDSuperblock get_superblock() {
-@@ -783,8 +819,9 @@
-   // config observer bits
-   virtual const char** get_tracked_conf_keys() const;
-   virtual void handle_conf_change(const struct md_config_t *conf,
- 				  const std::set <std::string> &changed);
-+  void check_config();
- 
- protected:
-   Mutex osd_lock;			// global lock
-   SafeTimer tick_timer;    // safe timer (osd_lock)
-@@ -943,8 +980,10 @@
-   ThreadPool command_tp;
- 
-   bool paused_recovery;
- 
-+  void set_disk_tp_priority();
-+
-   // -- sessions --
- public:
-   struct Session : public RefCountedObject {
-     EntityName entity_name;
-@@ -1254,9 +1293,9 @@
-   void handle_osd_map(class MOSDMap *m);
-   void note_down_osd(int osd);
-   void note_up_osd(int osd);
-   
--  void advance_pg(
-+  bool advance_pg(
-     epoch_t advance_to, PG *pg,
-     ThreadPool::TPHandle &handle,
-     PG::RecoveryCtx *rctx,
-     set<boost::intrusive_ptr<PG> > *split_pgs
-@@ -1512,11 +1551,24 @@
- 		OSDMapRef map);
-   void repeer(PG *pg, map< int, map<spg_t,pg_query_t> >& query_map);
- 
-   bool require_mon_peer(Message *m);
--  bool require_osd_peer(OpRequestRef op);
-+  bool require_osd_peer(OpRequestRef& op);
-+  /***
-+   * Verifies that we were alive in the given epoch, and that
-+   * still are.
-+   */
-+  bool require_self_aliveness(OpRequestRef& op, epoch_t alive_since);
-+  /**
-+   * Verifies that the OSD who sent the given op has the same
-+   * address as in the given map.
-+   * @pre op was sent by an OSD using the cluster messenger
-+   */
-+  bool require_same_peer_instance(OpRequestRef& op, OSDMapRef& map);
-+  bool require_up_osd_peer(OpRequestRef& Op, OSDMapRef& map,
-+                           epoch_t their_epoch);
- 
--  bool require_same_or_newer_map(OpRequestRef op, epoch_t e);
-+  bool require_same_or_newer_map(OpRequestRef& op, epoch_t e);
- 
-   void handle_pg_query(OpRequestRef op);
-   void handle_pg_notify(OpRequestRef op);
-   void handle_pg_log(OpRequestRef op);
---- a/src/osd/OSDMap.cc
-+++ b/src/osd/OSDMap.cc
-@@ -958,12 +958,9 @@
-   if (crush->has_nondefault_tunables())
-     features |= CEPH_FEATURE_CRUSH_TUNABLES;
-   if (crush->has_nondefault_tunables2())
-     features |= CEPH_FEATURE_CRUSH_TUNABLES2;
--  if (crush->has_v2_rules())
--    features |= CEPH_FEATURE_CRUSH_V2;
--  if (crush->has_nondefault_tunables3() ||
--      crush->has_v3_rules())
-+  if (crush->has_nondefault_tunables3())
-     features |= CEPH_FEATURE_CRUSH_TUNABLES3;
-   mask |= CEPH_FEATURES_CRUSH;
- 
-   for (map<int64_t,pg_pool_t>::const_iterator p = pools.begin(); p != pools.end(); ++p) {
-@@ -977,8 +974,17 @@
-     if (!p->second.tiers.empty() ||
- 	p->second.is_tier()) {
-       features |= CEPH_FEATURE_OSD_CACHEPOOL;
-     }
-+    int ruleid = crush->find_rule(p->second.get_crush_ruleset(),
-+				  p->second.get_type(),
-+				  p->second.get_size());
-+    if (ruleid >= 0) {
-+      if (crush->is_v2_rule(ruleid))
-+	features |= CEPH_FEATURE_CRUSH_V2;
-+      if (crush->is_v3_rule(ruleid))
-+	features |= CEPH_FEATURE_CRUSH_TUNABLES3;
-+    }
-   }
-   mask |= CEPH_FEATURE_OSDHASHPSPOOL | CEPH_FEATURE_OSD_CACHEPOOL;
-   if (entity_type != CEPH_ENTITY_TYPE_CLIENT)
-     mask |= CEPH_FEATURE_OSD_ERASURE_CODES;
-@@ -1800,9 +1806,17 @@
-   {
-     ENCODE_START(1, 1, bl); // extended, osd-only data
-     ::encode(osd_addrs->hb_back_addr, bl);
-     ::encode(osd_info, bl);
--    ::encode(blacklist, bl);
-+    {
-+      // put this in a sorted, ordered map<> so that we encode in a
-+      // deterministic order.
-+      map<entity_addr_t,utime_t> blacklist_map;
-+      for (ceph::unordered_map<entity_addr_t,utime_t>::const_iterator p =
-+	     blacklist.begin(); p != blacklist.end(); ++p)
-+	blacklist_map.insert(make_pair(p->first, p->second));
-+      ::encode(blacklist_map, bl);
-+    }
-     ::encode(osd_addrs->cluster_addr, bl);
-     ::encode(cluster_snapshot_epoch, bl);
-     ::encode(cluster_snapshot, bl);
-     ::encode(*osd_uuid, bl);
-@@ -2158,8 +2172,9 @@
-   o.push_back(new OSDMap);
-   uuid_d fsid;
-   o.back()->build_simple(cct, 1, fsid, 16, 7, 8);
-   o.back()->created = o.back()->modified = utime_t(1, 2);  // fix timestamp
-+  o.back()->blacklist[entity_addr_t()] = utime_t(5, 6);
-   cct->put();
- }
- 
- string OSDMap::get_flag_string(unsigned f)
-@@ -2550,15 +2565,27 @@
-     set_state(i, 0);
-     set_weight(i, CEPH_OSD_OUT);
-   }
- 
--  map<string,string> erasure_code_profile_map;
--  r = get_str_map(cct->_conf->osd_pool_default_erasure_code_profile,
--		  ss,
--		  &erasure_code_profile_map);
--  erasure_code_profile_map["directory"] =
-+  map<string,string> profile_map;
-+  r = get_erasure_code_profile_default(cct, profile_map, &ss);
-+  if (r < 0) {
-+    lderr(cct) << ss.str() << dendl;
-+    return r;
-+  }
-+  set_erasure_code_profile("default", profile_map);
-+  return 0;
-+}
-+
-+int OSDMap::get_erasure_code_profile_default(CephContext *cct,
-+					     map<string,string> &profile_map,
-+					     ostream *ss)
-+{
-+  int r = get_str_map(cct->_conf->osd_pool_default_erasure_code_profile,
-+		      *ss,
-+		      &profile_map);
-+  profile_map["directory"] =
-     cct->_conf->osd_pool_default_erasure_code_directory;
--  set_erasure_code_profile("default", erasure_code_profile_map);
-   return r;
- }
- 
- int OSDMap::_build_crush_types(CrushWrapper& crush)
---- a/src/osd/OSDMap.h
-+++ b/src/osd/OSDMap.h
-@@ -379,8 +379,11 @@
-     map<string,map<string,string> >::const_iterator i =
-       erasure_code_profiles.find(name);
-     return i != erasure_code_profiles.end();
-   }
-+  int get_erasure_code_profile_default(CephContext *cct,
-+				       map<string,string> &profile_map,
-+				       ostream *ss);
-   void set_erasure_code_profile(const string &name,
- 				const map<string,string> &profile) {
-     erasure_code_profiles[name] = profile;
-   }
---- a/src/osd/OpRequest.cc
-+++ b/src/osd/OpRequest.cc
-@@ -32,9 +32,9 @@
-     f->open_object_section("client_info");
-     stringstream client_name;
-     client_name << m->get_orig_source();
-     f->dump_string("client", client_name.str());
--    f->dump_int("tid", m->get_tid());
-+    f->dump_unsigned("tid", m->get_tid());
-     f->close_section(); // client_info
-   }
-   {
-     f->open_array_section("events");
---- a/src/osd/OpRequest.h
-+++ b/src/osd/OpRequest.h
-@@ -73,8 +73,12 @@
-   void set_pg_op();
- 
-   void _dump(utime_t now, Formatter *f) const;
- 
-+  bool has_feature(uint64_t f) const {
-+    return request->get_connection()->has_feature(f);
-+  }
-+
- private:
-   osd_reqid_t reqid;
-   uint8_t hit_flag_points;
-   uint8_t latest_flag_point;
---- a/src/osd/PG.cc
-+++ b/src/osd/PG.cc
-@@ -1442,9 +1442,9 @@
-     last_update_ondisk = info.last_update;
-     min_last_complete_ondisk = eversion_t(0,0);  // we don't know (yet)!
-   }
-   last_update_applied = info.last_update;
--
-+  last_rollback_info_trimmed_to_applied = pg_log.get_rollback_trimmed_to();
- 
-   need_up_thru = false;
- 
-   // write pg info, log
-@@ -2640,9 +2640,12 @@
- }
- 
- 
- void PG::append_log(
--  vector<pg_log_entry_t>& logv, eversion_t trim_to, ObjectStore::Transaction &t,
-+  vector<pg_log_entry_t>& logv,
-+  eversion_t trim_to,
-+  eversion_t trim_rollback_to,
-+  ObjectStore::Transaction &t,
-   bool transaction_applied)
- {
-   if (transaction_applied)
-     update_snap_map(logv, t);
-@@ -2654,15 +2657,35 @@
-        ++p) {
-     p->offset = 0;
-     add_log_entry(*p, keys[p->get_key_name()]);
-   }
--  if (!transaction_applied)
--    pg_log.clear_can_rollback_to();
-+
-+  PGLogEntryHandler handler;
-+  if (!transaction_applied) {
-+    pg_log.clear_can_rollback_to(&handler);
-+    t.register_on_applied(
-+      new C_UpdateLastRollbackInfoTrimmedToApplied(
-+	this,
-+	get_osdmap()->get_epoch(),
-+	info.last_update));
-+  } else if (trim_rollback_to > pg_log.get_rollback_trimmed_to()) {
-+    pg_log.trim_rollback_info(
-+      trim_rollback_to,
-+      &handler);
-+    t.register_on_applied(
-+      new C_UpdateLastRollbackInfoTrimmedToApplied(
-+	this,
-+	get_osdmap()->get_epoch(),
-+	trim_rollback_to));
-+  }
- 
-   dout(10) << "append_log  adding " << keys.size() << " keys" << dendl;
-   t.omap_setkeys(coll_t::META_COLL, log_oid, keys);
--  PGLogEntryHandler handler;
-+
-   pg_log.trim(&handler, trim_to, info);
-+
-+  dout(10) << __func__ << ": trimming to " << trim_rollback_to
-+	   << " entries " << handler.to_trim << dendl;
-   handler.apply(this, &t);
- 
-   // update the local pg, pg log
-   dirty_info = true;
-@@ -3003,9 +3026,10 @@
- }
- 
- void PG::reg_next_scrub()
- {
--  if (scrubber.must_scrub) {
-+  if (scrubber.must_scrub ||
-+      (info.stats.stats_invalid && g_conf->osd_scrub_invalid_stats)) {
-     scrubber.scrub_reg_stamp = utime_t();
-   } else {
-     scrubber.scrub_reg_stamp = info.history.last_scrub_stamp;
-   }
-@@ -3261,8 +3285,36 @@
-     osd->send_message_osd_cluster(i->osd, subop, get_osdmap()->get_epoch());
-   }
- }
- 
-+void PG::_scan_rollback_obs(
-+  const vector<ghobject_t> &rollback_obs,
-+  ThreadPool::TPHandle &handle)
-+{
-+  ObjectStore::Transaction *t = NULL;
-+  eversion_t trimmed_to = last_rollback_info_trimmed_to_applied;
-+  for (vector<ghobject_t>::const_iterator i = rollback_obs.begin();
-+       i != rollback_obs.end();
-+       ++i) {
-+    if (i->generation < trimmed_to.version) {
-+      osd->clog.error() << "osd." << osd->whoami
-+			<< " pg " << info.pgid
-+			<< " found obsolete rollback obj "
-+			<< *i << " generation < trimmed_to "
-+			<< trimmed_to
-+			<< "...repaired";
-+      if (!t)
-+	t = new ObjectStore::Transaction;
-+      t->remove(coll, *i);
-+    }
-+  }
-+  if (t) {
-+    derr << __func__ << ": queueing trans to clean up obsolete rollback objs"
-+	 << dendl;
-+    osd->store->queue_transaction_and_cleanup(osr.get(), t);
-+  }
-+}
-+
- void PG::_scan_snaps(ScrubMap &smap) 
- {
-   for (map<hobject_t, ScrubMap::object>::iterator i = smap.objects.begin();
-        i != smap.objects.end();
-@@ -3348,15 +3400,23 @@
-   map.valid_through = info.last_update;
- 
-   // objects
-   vector<hobject_t> ls;
--  int ret = get_pgbackend()->objects_list_range(start, end, 0, &ls);
-+  vector<ghobject_t> rollback_obs;
-+  int ret = get_pgbackend()->objects_list_range(
-+    start,
-+    end,
-+    0,
-+    &ls,
-+    &rollback_obs);
-   if (ret < 0) {
-     dout(5) << "objects_list_range error: " << ret << dendl;
-     return ret;
-   }
- 
-+
-   get_pgbackend()->be_scan_list(map, ls, deep, handle);
-+  _scan_rollback_obs(rollback_obs, handle);
-   _scan_snaps(map);
- 
-   // pg attrs
-   osd->store->collection_getattrs(coll, map.attrs);
-@@ -3577,8 +3637,19 @@
-  */
- void PG::scrub(ThreadPool::TPHandle &handle)
- {
-   lock();
-+  if (g_conf->osd_scrub_sleep > 0 &&
-+      (scrubber.state == PG::Scrubber::NEW_CHUNK ||
-+       scrubber.state == PG::Scrubber::INACTIVE)) {
-+    dout(20) << __func__ << " state is INACTIVE|NEW_CHUNK, sleeping" << dendl;
-+    unlock();
-+    utime_t t;
-+    t.set_from_double(g_conf->osd_scrub_sleep);
-+    t.sleep();
-+    lock();
-+    dout(20) << __func__ << " slept for " << t << dendl;
-+  }
-   if (deleting) {
-     unlock();
-     return;
-   }
-@@ -4630,8 +4701,23 @@
-   on_applied->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
-   on_safe->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
- }
- 
-+void PG::reset_interval_flush()
-+{
-+  dout(10) << "Clearing blocked outgoing recovery messages" << dendl;
-+  recovery_state.clear_blocked_outgoing();
-+  
-+  if (!osr->flush_commit(
-+      new QueuePeeringEvt<IntervalFlush>(
-+	this, get_osdmap()->get_epoch(), IntervalFlush()))) {
-+    dout(10) << "Beginning to block outgoing recovery messages" << dendl;
-+    recovery_state.begin_block_outgoing();
-+  } else {
-+    dout(10) << "Not blocking outgoing recovery messages" << dendl;
-+  }
-+}
-+
- /* Called before initializing peering during advance_map */
- void PG::start_peering_interval(
-   const OSDMapRef lastmap,
-   const vector<int>& newup, int new_up_primary,
-@@ -4640,8 +4726,9 @@
- {
-   const OSDMapRef osdmap = get_osdmap();
- 
-   set_last_peering_reset();
-+  reset_interval_flush();
- 
-   vector<int> oldacting, oldup;
-   int oldrole = get_role();
- 
-@@ -5049,9 +5136,9 @@
-     return can_discard_replica_op<MOSDPGPull, MSG_OSD_PG_PULL>(op);
-   case MSG_OSD_PG_PUSH_REPLY:
-     return can_discard_replica_op<MOSDPGPushReply, MSG_OSD_PG_PUSH_REPLY>(op);
-   case MSG_OSD_SUBOPREPLY:
--    return false;
-+    return can_discard_replica_op<MOSDSubOpReply, MSG_OSD_SUBOPREPLY>(op);
- 
-   case MSG_OSD_EC_WRITE:
-     return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op);
-   case MSG_OSD_EC_WRITE_REPLY:
-@@ -5385,8 +5472,17 @@
-   context< RecoveryMachine >().log_enter(state_name);
- }
- 
- boost::statechart::result
-+PG::RecoveryState::Started::react(const IntervalFlush&)
-+{
-+  dout(10) << "Ending blocked outgoing recovery messages" << dendl;
-+  context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
-+  return discard_event();
-+}
-+
-+
-+boost::statechart::result
- PG::RecoveryState::Started::react(const FlushedEvt&)
- {
-   PG *pg = context< RecoveryMachine >().pg;
-   pg->on_flushed();
-@@ -5435,8 +5531,9 @@
-     NamedState(context< RecoveryMachine >().pg->cct, "Reset")
- {
-   context< RecoveryMachine >().log_enter(state_name);
-   PG *pg = context< RecoveryMachine >().pg;
-+
-   pg->flushes_in_progress = 0;
-   pg->set_last_peering_reset();
- }
- 
-@@ -5447,8 +5544,16 @@
-   pg->on_flushed();
-   return discard_event();
- }
- 
-+boost::statechart::result
-+PG::RecoveryState::Reset::react(const IntervalFlush&)
-+{
-+  dout(10) << "Ending blocked outgoing recovery messages" << dendl;
-+  context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
-+  return discard_event();
-+}
-+
- boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap)
- {
-   PG *pg = context< RecoveryMachine >().pg;
-   dout(10) << "Reset advmap" << dendl;
-@@ -5829,8 +5934,20 @@
- {
-   context< RecoveryMachine >().log_enter(state_name);
- }
- 
-+boost::statechart::result
-+PG::RecoveryState::NotBackfilling::react(const RemoteBackfillReserved &evt)
-+{
-+  return discard_event();
-+}
-+
-+boost::statechart::result
-+PG::RecoveryState::NotBackfilling::react(const RemoteReservationRejected &evt)
-+{
-+  return discard_event();
-+}
-+
- void PG::RecoveryState::NotBackfilling::exit()
- {
-   context< RecoveryMachine >().log_exit(state_name, enter_time);
-   PG *pg = context< RecoveryMachine >().pg;
-@@ -6587,19 +6704,23 @@
-   PG *pg = context< RecoveryMachine >().pg;
-   MOSDPGLog *msg = logevt.msg.get();
-   dout(10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
- 
-+  ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
-   if (msg->info.last_backfill == hobject_t()) {
-     // restart backfill
-     pg->unreg_next_scrub();
-     pg->info = msg->info;
-     pg->reg_next_scrub();
-     pg->dirty_info = true;
-     pg->dirty_big_info = true;  // maybe.
--    pg->pg_log.claim_log(msg->log);
-+
-+    PGLogEntryHandler rollbacker;
-+    pg->pg_log.claim_log_and_clear_rollback_info(msg->log, &rollbacker);
-+    rollbacker.apply(pg, t);
-+
-     pg->pg_log.reset_backfill();
-   } else {
--    ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
-     pg->merge_log(*t, msg->info, msg->log, logevt.from);
-   }
- 
-   assert(pg->pg_log.get_head() == pg->info.last_update);
-@@ -7491,20 +7612,53 @@
- }
- 
- void PG::RecoveryState::start_handle(RecoveryCtx *new_ctx) {
-   assert(!rctx);
--  rctx = new_ctx;
--  if (rctx)
-+  assert(!orig_ctx);
-+  orig_ctx = new_ctx;
-+  if (new_ctx) {
-+    if (messages_pending_flush) {
-+      rctx = RecoveryCtx(*messages_pending_flush, *new_ctx);
-+    } else {
-+      rctx = *new_ctx;
-+    }
-     rctx->start_time = ceph_clock_now(pg->cct);
-+  }
-+}
-+
-+void PG::RecoveryState::begin_block_outgoing() {
-+  assert(!messages_pending_flush);
-+  assert(orig_ctx);
-+  assert(rctx);
-+  messages_pending_flush = BufferedRecoveryMessages();
-+  rctx = RecoveryCtx(*messages_pending_flush, *orig_ctx);
-+}
-+
-+void PG::RecoveryState::clear_blocked_outgoing() {
-+  assert(orig_ctx);
-+  assert(rctx);
-+  messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
-+}
-+
-+void PG::RecoveryState::end_block_outgoing() {
-+  assert(messages_pending_flush);
-+  assert(orig_ctx);
-+  assert(rctx);
-+
-+  rctx = RecoveryCtx(*orig_ctx);
-+  rctx->accept_buffered_messages(*messages_pending_flush);
-+  messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
- }
- 
- void PG::RecoveryState::end_handle() {
-   if (rctx) {
-     utime_t dur = ceph_clock_now(pg->cct) - rctx->start_time;
-     machine.event_time += dur;
-   }
-+
-   machine.event_count++;
--  rctx = 0;
-+  rctx = boost::optional<RecoveryCtx>();
-+  orig_ctx = NULL;
- }
- 
- void intrusive_ptr_add_ref(PG *pg) { pg->get("intptr"); }
- void intrusive_ptr_release(PG *pg) { pg->put("intptr"); }
---- a/src/osd/PG.h
-+++ b/src/osd/PG.h
-@@ -446,8 +446,27 @@
-   eversion_t  last_update_ondisk;    // last_update that has committed; ONLY DEFINED WHEN is_active()
-   eversion_t  last_complete_ondisk;  // last_complete that has committed.
-   eversion_t  last_update_applied;
- 
-+
-+  struct C_UpdateLastRollbackInfoTrimmedToApplied : Context {
-+    PGRef pg;
-+    epoch_t e;
-+    eversion_t v;
-+    C_UpdateLastRollbackInfoTrimmedToApplied(PG *pg, epoch_t e, eversion_t v)
-+      : pg(pg), e(e), v(v) {}
-+    void finish(int) {
-+      pg->lock();
-+      if (!pg->pg_has_reset_since(e)) {
-+	pg->last_rollback_info_trimmed_to_applied = v;
-+      }
-+      pg->unlock();
-+    }
-+  };
-+  // entries <= last_rollback_info_trimmed_to_applied have been trimmed,
-+  // and the transaction has applied
-+  eversion_t  last_rollback_info_trimmed_to_applied;
-+
-   // primary state
-  public:
-   pg_shard_t primary;
-   pg_shard_t pg_whoami;
-@@ -486,8 +505,14 @@
-   bool may_need_replay(const OSDMapRef osdmap) const;
- 
- 
- public:    
-+  struct BufferedRecoveryMessages {
-+    map<int, map<spg_t, pg_query_t> > query_map;
-+    map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > info_map;
-+    map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > notify_list;
-+  };
-+
-   struct RecoveryCtx {
-     utime_t start_time;
-     map<int, map<spg_t, pg_query_t> > *query_map;
-     map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > *info_map;
-@@ -507,8 +532,50 @@
- 	notify_list(notify_list),
- 	on_applied(on_applied),
- 	on_safe(on_safe),
- 	transaction(transaction) {}
-+
-+    RecoveryCtx(BufferedRecoveryMessages &buf, RecoveryCtx &rctx)
-+      : query_map(&(buf.query_map)),
-+	info_map(&(buf.info_map)),
-+	notify_list(&(buf.notify_list)),
-+	on_applied(rctx.on_applied),
-+	on_safe(rctx.on_safe),
-+	transaction(rctx.transaction) {}
-+
-+    void accept_buffered_messages(BufferedRecoveryMessages &m) {
-+      assert(query_map);
-+      assert(info_map);
-+      assert(notify_list);
-+      for (map<int, map<spg_t, pg_query_t> >::iterator i = m.query_map.begin();
-+	   i != m.query_map.end();
-+	   ++i) {
-+	map<spg_t, pg_query_t> &omap = (*query_map)[i->first];
-+	for (map<spg_t, pg_query_t>::iterator j = i->second.begin();
-+	     j != i->second.end();
-+	     ++j) {
-+	  omap[j->first] = j->second;
-+	}
-+      }
-+      for (map<int, vector<pair<pg_notify_t, pg_interval_map_t> > >::iterator i
-+	     = m.info_map.begin();
-+	   i != m.info_map.end();
-+	   ++i) {
-+	vector<pair<pg_notify_t, pg_interval_map_t> > &ovec =
-+	  (*info_map)[i->first];
-+	ovec.reserve(ovec.size() + i->second.size());
-+	ovec.insert(ovec.end(), i->second.begin(), i->second.end());
-+      }
-+      for (map<int, vector<pair<pg_notify_t, pg_interval_map_t> > >::iterator i
-+	     = m.notify_list.begin();
-+	   i != m.notify_list.end();
-+	   ++i) {
-+	vector<pair<pg_notify_t, pg_interval_map_t> > &ovec =
-+	  (*notify_list)[i->first];
-+	ovec.reserve(ovec.size() + i->second.size());
-+	ovec.insert(ovec.end(), i->second.begin(), i->second.end());
-+      }
-+    }
-   };
- 
-   struct NamedState {
-     const char *state_name;
-@@ -1107,8 +1174,11 @@
-   void scrub_finish();
-   void scrub_clear_state();
-   bool scrub_gather_replica_maps();
-   void _scan_snaps(ScrubMap &map);
-+  void _scan_rollback_obs(
-+    const vector<ghobject_t> &rollback_obs,
-+    ThreadPool::TPHandle &handle);
-   void _request_scrub_map_classic(pg_shard_t replica, eversion_t version);
-   void _request_scrub_map(pg_shard_t replica, eversion_t version,
-                           hobject_t start, hobject_t end, bool deep);
-   int build_scrub_map_chunk(
-@@ -1332,12 +1402,19 @@
-   TrivialEvent(GoClean)
- 
-   TrivialEvent(AllReplicasActivated)
- 
-+  TrivialEvent(IntervalFlush)
-+
-   /* Encapsulates PG recovery process */
-   class RecoveryState {
-     void start_handle(RecoveryCtx *new_ctx);
-     void end_handle();
-+  public:
-+    void begin_block_outgoing();
-+    void end_block_outgoing();
-+    void clear_blocked_outgoing();
-+  private:
- 
-     /* States */
-     struct Initial;
-     class RecoveryMachine : public boost::statechart::state_machine< RecoveryMachine, Initial > {
-@@ -1359,42 +1436,49 @@
-       RecoveryMachine(RecoveryState *state, PG *pg) : state(state), pg(pg), event_count(0) {}
- 
-       /* Accessor functions for state methods */
-       ObjectStore::Transaction* get_cur_transaction() {
-+	assert(state->rctx);
- 	assert(state->rctx->transaction);
- 	return state->rctx->transaction;
-       }
- 
-       void send_query(pg_shard_t to, const pg_query_t &query) {
-+	assert(state->rctx);
- 	assert(state->rctx->query_map);
- 	(*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] =
- 	  query;
-       }
- 
-       map<int, map<spg_t, pg_query_t> > *get_query_map() {
-+	assert(state->rctx);
- 	assert(state->rctx->query_map);
- 	return state->rctx->query_map;
-       }
- 
-       map<int, vector<pair<pg_notify_t, pg_interval_map_t> > > *get_info_map() {
-+	assert(state->rctx);
- 	assert(state->rctx->info_map);
- 	return state->rctx->info_map;
-       }
- 
-       list< Context* > *get_on_safe_context_list() {
-+	assert(state->rctx);
- 	assert(state->rctx->on_safe);
- 	return &(state->rctx->on_safe->contexts);
-       }
- 
-       list< Context * > *get_on_applied_context_list() {
-+	assert(state->rctx);
- 	assert(state->rctx->on_applied);
- 	return &(state->rctx->on_applied->contexts);
-       }
- 
--      RecoveryCtx *get_recovery_ctx() { return state->rctx; }
-+      RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); }
- 
-       void send_notify(pg_shard_t to,
- 		       const pg_notify_t &info, const pg_interval_map_t &pi) {
-+	assert(state->rctx);
- 	assert(state->rctx->notify_list);
- 	(*state->rctx->notify_list)[to.osd].push_back(make_pair(info, pi));
-       }
-     };
-@@ -1438,14 +1522,16 @@
- 	boost::statechart::custom_reaction< AdvMap >,
- 	boost::statechart::custom_reaction< ActMap >,
- 	boost::statechart::custom_reaction< NullEvt >,
- 	boost::statechart::custom_reaction< FlushedEvt >,
-+	boost::statechart::custom_reaction< IntervalFlush >,
- 	boost::statechart::transition< boost::statechart::event_base, Crashed >
- 	> reactions;
-       boost::statechart::result react(const QueryState& q);
-       boost::statechart::result react(const AdvMap&);
-       boost::statechart::result react(const ActMap&);
-       boost::statechart::result react(const FlushedEvt&);
-+      boost::statechart::result react(const IntervalFlush&);
-       boost::statechart::result react(const boost::statechart::event_base&) {
- 	return discard_event();
-       }
-     };
-@@ -1460,13 +1546,15 @@
- 	boost::statechart::custom_reaction< QueryState >,
- 	boost::statechart::custom_reaction< AdvMap >,
- 	boost::statechart::custom_reaction< NullEvt >,
- 	boost::statechart::custom_reaction< FlushedEvt >,
-+	boost::statechart::custom_reaction< IntervalFlush >,
- 	boost::statechart::transition< boost::statechart::event_base, Crashed >
- 	> reactions;
-       boost::statechart::result react(const QueryState& q);
-       boost::statechart::result react(const AdvMap&);
-       boost::statechart::result react(const FlushedEvt&);
-+      boost::statechart::result react(const IntervalFlush&);
-       boost::statechart::result react(const boost::statechart::event_base&) {
- 	return discard_event();
-       }
-     };
-@@ -1634,12 +1722,16 @@
-     };
- 
-     struct NotBackfilling : boost::statechart::state< NotBackfilling, Active>, NamedState {
-       typedef boost::mpl::list<
--	boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>
-+	boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>,
-+	boost::statechart::custom_reaction< RemoteBackfillReserved >,
-+	boost::statechart::custom_reaction< RemoteReservationRejected >
- 	> reactions;
-       NotBackfilling(my_context ctx);
-       void exit();
-+      boost::statechart::result react(const RemoteBackfillReserved& evt);
-+      boost::statechart::result react(const RemoteReservationRejected& evt);
-     };
- 
-     struct RepNotRecovering;
-     struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
-@@ -1854,12 +1946,25 @@
- 
- 
-     RecoveryMachine machine;
-     PG *pg;
--    RecoveryCtx *rctx;
-+
-+    /// context passed in by state machine caller
-+    RecoveryCtx *orig_ctx;
-+
-+    /// populated if we are buffering messages pending a flush
-+    boost::optional<BufferedRecoveryMessages> messages_pending_flush;
-+
-+    /**
-+     * populated between start_handle() and end_handle(), points into
-+     * the message lists for messages_pending_flush while blocking messages
-+     * or into orig_ctx otherwise
-+     */
-+    boost::optional<RecoveryCtx> rctx;
- 
-   public:
--    RecoveryState(PG *pg) : machine(this, pg), pg(pg), rctx(0) {
-+    RecoveryState(PG *pg)
-+      : machine(this, pg), pg(pg), orig_ctx(0) {
-       machine.initiate();
-     }
- 
-     void handle_event(const boost::statechart::event_base &evt,
-@@ -1995,9 +2100,12 @@
-   }
- 
-   void add_log_entry(pg_log_entry_t& e, bufferlist& log_bl);
-   void append_log(
--    vector<pg_log_entry_t>& logv, eversion_t trim_to, ObjectStore::Transaction &t,
-+    vector<pg_log_entry_t>& logv,
-+    eversion_t trim_to,
-+    eversion_t trim_rollback_to,
-+    ObjectStore::Transaction &t,
-     bool transaction_applied = true);
-   bool check_log_for_corruption(ObjectStore *store);
-   void trim_peers();
- 
-@@ -2025,8 +2133,9 @@
-   void share_pg_info();
-   /// share new pg log entries after a pg is active
-   void share_pg_log();
- 
-+  void reset_interval_flush();
-   void start_peering_interval(
-     const OSDMapRef lastmap,
-     const vector<int>& newup, int up_primary,
-     const vector<int>& newacting, int acting_primary,
---- a/src/osd/PGBackend.cc
-+++ b/src/osd/PGBackend.cc
-@@ -114,9 +114,13 @@
-   vector<hobject_t> *ls,
-   hobject_t *next)
- {
-   assert(ls);
--  ghobject_t _next(begin);
-+  // Starts with the smallest shard id and generation to
-+  // make sure the result list has the marker object (
-+  // it might have multiple generations though, which would
-+  // be filtered).
-+  ghobject_t _next(begin, 0, shard_id_t(0));
-   ls->reserve(max);
-   int r = 0;
-   while (!_next.is_max() && ls->size() < (unsigned)min) {
-     vector<ghobject_t> objects;
-@@ -146,9 +150,10 @@
- int PGBackend::objects_list_range(
-   const hobject_t &start,
-   const hobject_t &end,
-   snapid_t seq,
--  vector<hobject_t> *ls)
-+  vector<hobject_t> *ls,
-+  vector<ghobject_t> *gen_obs)
- {
-   assert(ls);
-   vector<ghobject_t> objects;
-   int r = store->collection_list_range(
-@@ -162,8 +167,10 @@
-        i != objects.end();
-        ++i) {
-     if (i->is_no_gen()) {
-       ls->push_back(i->hobj);
-+    } else if (gen_obs) {
-+      gen_obs->push_back(*i);
-     }
-   }
-   return r;
- }
---- a/src/osd/PGBackend.h
-+++ b/src/osd/PGBackend.h
-@@ -176,8 +176,9 @@
-      virtual void log_operation(
-        vector<pg_log_entry_t> &logv,
-        boost::optional<pg_hit_set_history_t> &hset_history,
-        const eversion_t &trim_to,
-+       const eversion_t &trim_rollback_to,
-        bool transaction_applied,
-        ObjectStore::Transaction *t) = 0;
- 
-      virtual void update_peer_last_complete_ondisk(
-@@ -495,8 +496,9 @@
-      const hobject_t &hoid,               ///< [in] object
-      const eversion_t &at_version,        ///< [in] version
-      PGTransaction *t,                    ///< [in] trans to execute
-      const eversion_t &trim_to,           ///< [in] trim log to here
-+     const eversion_t &trim_rollback_to,  ///< [in] trim rollback info to here
-      vector<pg_log_entry_t> &log_entries, ///< [in] log entries for t
-      /// [in] hitset history (if updated with this transaction)
-      boost::optional<pg_hit_set_history_t> &hset_history,
-      Context *on_local_applied_sync,      ///< [in] called when applied locally
-@@ -554,9 +556,10 @@
-    int objects_list_range(
-      const hobject_t &start,
-      const hobject_t &end,
-      snapid_t seq,
--     vector<hobject_t> *ls);
-+     vector<hobject_t> *ls,
-+     vector<ghobject_t> *gen_obs=0);
- 
-    int objects_get_attr(
-      const hobject_t &hoid,
-      const string &attr,
---- a/src/osd/PGLog.cc
-+++ b/src/osd/PGLog.cc
-@@ -23,8 +23,27 @@
- #define dout_subsys ceph_subsys_osd
- 
- //////////////////// PGLog::IndexedLog ////////////////////
- 
-+void PGLog::IndexedLog::advance_rollback_info_trimmed_to(
-+  eversion_t to,
-+  LogEntryHandler *h)
-+{
-+  assert(to <= can_rollback_to);
-+
-+  if (to > rollback_info_trimmed_to)
-+    rollback_info_trimmed_to = to;
-+
-+  while (rollback_info_trimmed_to_riter != log.rbegin()) {
-+    --rollback_info_trimmed_to_riter;
-+    if (rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to) {
-+      ++rollback_info_trimmed_to_riter;
-+      break;
-+    }
-+    h->trim(*rollback_info_trimmed_to_riter);
-+  }
-+}
-+
- void PGLog::IndexedLog::split_into(
-   pg_t child_pgid,
-   unsigned split_bits,
-   PGLog::IndexedLog *olog)
-@@ -46,11 +65,13 @@
-     }
-     oldlog.erase(i++);
-   }
- 
-+
-+  olog->can_rollback_to = can_rollback_to;
-+
-   olog->index();
-   index();
--  olog->can_rollback_to = can_rollback_to;
- }
- 
- void PGLog::IndexedLog::trim(
-   LogEntryHandler *handler,
-@@ -58,22 +79,33 @@
-   set<eversion_t> *trimmed)
- {
-   if (complete_to != log.end() &&
-       complete_to->version <= s) {
--    generic_dout(0) << " bad trim to " << s << " when complete_to is " << complete_to->version
-+    generic_dout(0) << " bad trim to " << s << " when complete_to is "
-+		    << complete_to->version
- 		    << " on " << *this << dendl;
-   }
- 
-+  if (s > can_rollback_to)
-+    can_rollback_to = s;
-+  advance_rollback_info_trimmed_to(s, handler);
-+
-   while (!log.empty()) {
-     pg_log_entry_t &e = *log.begin();
-     if (e.version > s)
-       break;
-     generic_dout(20) << "trim " << e << dendl;
-     if (trimmed)
-       trimmed->insert(e.version);
--    handler->trim(e);
-+
-     unindex(e);         // remove from index,
--    log.pop_front();    // from log
-+
-+    if (e.version == rollback_info_trimmed_to_riter->version) {
-+      log.pop_front();
-+      rollback_info_trimmed_to_riter = log.rend();
-+    } else {
-+      log.pop_front();
-+    }
-   }
- 
-   // raise tail?
-   if (tail < s)
-@@ -103,9 +135,9 @@
- 
- void PGLog::clear() {
-   divergent_priors.clear();
-   missing.clear();
--  log.zero();
-+  log.clear();
-   log_keys_debug.clear();
-   undirty();
- }
- 
---- a/src/osd/PGLog.h
-+++ b/src/osd/PGLog.h
-@@ -61,13 +61,35 @@
-     // recovery pointers
-     list<pg_log_entry_t>::iterator complete_to;  // not inclusive of referenced item
-     version_t last_requested;           // last object requested by primary
- 
-+    //
-+  private:
-+    /**
-+     * rollback_info_trimmed_to_riter points to the first log entry <=
-+     * rollback_info_trimmed_to
-+     *
-+     * It's a reverse_iterator because rend() is a natural representation for
-+     * tail, and rbegin() works nicely for head.
-+     */
-+    list<pg_log_entry_t>::reverse_iterator rollback_info_trimmed_to_riter;
-+  public:
-+    void advance_rollback_info_trimmed_to(eversion_t to, LogEntryHandler *h);
-+
-     /****/
--    IndexedLog() : last_requested(0) {}
-+    IndexedLog() :
-+      complete_to(log.end()),
-+      last_requested(0),
-+      rollback_info_trimmed_to_riter(log.rbegin())
-+      {}
-+
-+    void claim_log_and_clear_rollback_info(const pg_log_t& o) {
-+      // we must have already trimmed the old entries
-+      assert(rollback_info_trimmed_to == head);
-+      assert(rollback_info_trimmed_to_riter == log.rbegin());
- 
--    void claim_log(const pg_log_t& o) {
-       log = o.log;
-+      rollback_info_trimmed_to = head;
-       head = o.head;
-       tail = o.tail;
-       index();
-     }
-@@ -77,12 +99,22 @@
-       unsigned split_bits,
-       IndexedLog *olog);
- 
-     void zero() {
-+      // we must have already trimmed the old entries
-+      assert(rollback_info_trimmed_to == head);
-+      assert(rollback_info_trimmed_to_riter == log.rbegin());
-+
-       unindex();
-       pg_log_t::clear();
-+      rollback_info_trimmed_to_riter = log.rbegin();
-       reset_recovery_pointers();
-     }
-+    void clear() {
-+      rollback_info_trimmed_to = head;
-+      rollback_info_trimmed_to_riter = log.rbegin();
-+      zero();
-+    }
-     void reset_recovery_pointers() {
-       complete_to = log.end();
-       last_requested = 0;
-     }
-@@ -111,8 +143,13 @@
- 	  //assert(caller_ops.count(i->reqid) == 0);  // divergent merge_log indexes new before unindexing old
- 	  caller_ops[i->reqid] = &(*i);
- 	}
-       }
-+
-+      rollback_info_trimmed_to_riter = log.rbegin();
-+      while (rollback_info_trimmed_to_riter != log.rend() &&
-+	     rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to)
-+	rollback_info_trimmed_to_riter++;
-     }
- 
-     void index(pg_log_entry_t& e) {
-       if (objects.count(e.soid) == 0 || 
-@@ -140,8 +177,13 @@
-     // actors
-     void add(pg_log_entry_t& e) {
-       // add to log
-       log.push_back(e);
-+
-+      // riter previously pointed to the previous entry
-+      if (rollback_info_trimmed_to_riter == log.rbegin())
-+	++rollback_info_trimmed_to_riter;
-+
-       assert(e.version > head);
-       assert(head.version == 0 || e.version.version > head.version);
-       head = e.version;
- 
-@@ -324,16 +366,35 @@
-     LogEntryHandler *handler,
-     eversion_t trim_to,
-     pg_info_t &info);
- 
--  void clear_can_rollback_to() {
-+  void trim_rollback_info(
-+    eversion_t trim_rollback_to,
-+    LogEntryHandler *h) {
-+    if (trim_rollback_to > log.can_rollback_to)
-+      log.can_rollback_to = trim_rollback_to;
-+    log.advance_rollback_info_trimmed_to(
-+      trim_rollback_to,
-+      h);
-+  }
-+
-+  eversion_t get_rollback_trimmed_to() const {
-+    return log.rollback_info_trimmed_to;
-+  }
-+
-+  void clear_can_rollback_to(LogEntryHandler *h) {
-     log.can_rollback_to = log.head;
-+    log.advance_rollback_info_trimmed_to(
-+      log.head,
-+      h);
-   }
- 
-   //////////////////// get or set log & missing ////////////////////
- 
--  void claim_log(const pg_log_t &o) {
--    log.claim_log(o);
-+  void claim_log_and_clear_rollback_info(const pg_log_t &o, LogEntryHandler *h) {
-+    log.can_rollback_to = log.head;
-+    log.advance_rollback_info_trimmed_to(log.head, h);
-+    log.claim_log_and_clear_rollback_info(o);
-     missing.clear();
-     mark_dirty_to(eversion_t::max());
-   }
- 
---- a/src/osd/ReplicatedBackend.cc
-+++ b/src/osd/ReplicatedBackend.cc
-@@ -493,8 +493,9 @@
-   const hobject_t &soid,
-   const eversion_t &at_version,
-   PGTransaction *_t,
-   const eversion_t &trim_to,
-+  const eversion_t &trim_rollback_to,
-   vector<pg_log_entry_t> &log_entries,
-   boost::optional<pg_hit_set_history_t> &hset_history,
-   Context *on_local_applied_sync,
-   Context *on_all_acked,
-@@ -533,8 +534,9 @@
-     at_version,
-     tid,
-     reqid,
-     trim_to,
-+    trim_rollback_to,
-     t->get_temp_added().size() ? *(t->get_temp_added().begin()) : hobject_t(),
-     t->get_temp_cleared().size() ?
-       *(t->get_temp_cleared().begin()) :hobject_t(),
-     log_entries,
-@@ -548,9 +550,15 @@
-     add_temp_objs(t->get_temp_added());
-   }
-   clear_temp_objs(t->get_temp_cleared());
- 
--  parent->log_operation(log_entries, hset_history, trim_to, true, &local_t);
-+  parent->log_operation(
-+    log_entries,
-+    hset_history,
-+    trim_to,
-+    trim_rollback_to,
-+    true,
-+    &local_t);
-   local_t.append(*op_t);
-   local_t.swap(*op_t);
-   
-   op_t->register_on_applied_sync(on_local_applied_sync);
---- a/src/osd/ReplicatedBackend.h
-+++ b/src/osd/ReplicatedBackend.h
-@@ -341,8 +341,9 @@
-     const hobject_t &hoid,
-     const eversion_t &at_version,
-     PGTransaction *t,
-     const eversion_t &trim_to,
-+    const eversion_t &trim_rollback_to,
-     vector<pg_log_entry_t> &log_entries,
-     boost::optional<pg_hit_set_history_t> &hset_history,
-     Context *on_local_applied_sync,
-     Context *on_all_applied,
-@@ -358,8 +359,9 @@
-     const eversion_t &at_version,
-     ceph_tid_t tid,
-     osd_reqid_t reqid,
-     eversion_t pg_trim_to,
-+    eversion_t pg_trim_rollback_to,
-     hobject_t new_temp_oid,
-     hobject_t discard_temp_oid,
-     vector<pg_log_entry_t> &log_entries,
-     boost::optional<pg_hit_set_history_t> &hset_history,
---- a/src/osd/ReplicatedPG.cc
-+++ b/src/osd/ReplicatedPG.cc
-@@ -1119,8 +1119,14 @@
-       dout(20) << " replay, waiting for active on " << op << dendl;
-       waiting_for_active.push_back(op);
-       return;
-     }
-+    // verify client features
-+    if ((pool.info.has_tiers() || pool.info.is_tier()) &&
-+	!op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
-+      osd->reply_op_error(op, -EOPNOTSUPP);
-+      return;
-+    }
-     do_op(op); // do it now
-     break;
- 
-   case MSG_OSD_SUBOP:
-@@ -1351,11 +1357,12 @@
-     if (hit_set->is_full() ||
- 	hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
-       hit_set_persist();
-     }
-+  }
- 
--    if (agent_state)
--      agent_choose_mode();
-+  if (agent_state) {
-+    agent_choose_mode();
-   }
- 
-   if ((m->get_flags() & CEPH_OSD_FLAG_IGNORE_CACHE) == 0 &&
-       maybe_handle_cache(op, write_ordered, obc, r, missing_oid, false))
-@@ -4853,10 +4860,11 @@
-       ctx->clone_obc->ssc->ref++;
-       if (pool.info.require_rollback())
- 	ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
-       snap_oi = &ctx->clone_obc->obs.oi;
--      bool got = ctx->clone_obc->get_write(ctx->op);
-+      bool got = ctx->clone_obc->get_write_greedy(ctx->op);
-       assert(got);
-+      dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
-     } else {
-       snap_oi = &static_snap_oi;
-     }
-     snap_oi->version = ctx->at_version;
-@@ -5159,10 +5167,11 @@
- 	                                eversion_t(),
- 					0, osd_reqid_t(), ctx->mtime));
- 
-       ctx->snapset_obc = get_object_context(snapoid, true);
--      bool got = ctx->snapset_obc->get_write(ctx->op);
-+      bool got = ctx->snapset_obc->get_write_greedy(ctx->op);
-       assert(got);
-+      dout(20) << " got greedy write on snapset_obc " << *ctx->snapset_obc << dendl;
-       ctx->release_snapset_obc = true;
-       if (pool.info.require_rollback() && !ctx->snapset_obc->obs.exists) {
- 	ctx->log.back().mod_desc.create();
-       } else if (!pool.info.require_rollback()) {
-@@ -6025,8 +6034,13 @@
-   kick_object_context_blocked(cop->obc);
-   cop->results.should_requeue = requeue;
-   CopyCallbackResults result(-ECANCELED, &cop->results);
-   cop->cb->complete(result);
-+
-+  // There may still be an objecter callback referencing this copy op.
-+  // That callback will not need the obc since it's been canceled, and
-+  // we need the obc reference to go away prior to flush.
-+  cop->obc = ObjectContextRef();
- }
- 
- void ReplicatedPG::cancel_copy_ops(bool requeue)
- {
-@@ -6441,9 +6455,9 @@
- }
- 
- bool ReplicatedPG::is_present_clone(hobject_t coid)
- {
--  if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
-+  if (!pool.info.allow_incomplete_clones())
-     return true;
-   if (is_missing_object(coid))
-     return true;
-   ObjectContextRef obc = get_object_context(coid, false);
-@@ -6734,8 +6748,9 @@
-     soid,
-     repop->ctx->at_version,
-     repop->ctx->op_t,
-     pg_trim_to,
-+    min_last_complete_ondisk,
-     repop->ctx->log,
-     repop->ctx->updated_hset_history,
-     onapplied_sync,
-     on_all_applied,
-@@ -6751,8 +6766,9 @@
-   const eversion_t &at_version,
-   ceph_tid_t tid,
-   osd_reqid_t reqid,
-   eversion_t pg_trim_to,
-+  eversion_t pg_trim_rollback_to,
-   hobject_t new_temp_oid,
-   hobject_t discard_temp_oid,
-   vector<pg_log_entry_t> &log_entries,
-   boost::optional<pg_hit_set_history_t> &hset_hist,
-@@ -6806,8 +6822,9 @@
-     else
-       wr->pg_stats = get_info().stats;
-     
-     wr->pg_trim_to = pg_trim_to;
-+    wr->pg_trim_rollback_to = pg_trim_rollback_to;
- 
-     wr->new_temp_oid = new_temp_oid;
-     wr->discard_temp_oid = discard_temp_oid;
-     wr->updated_hit_set_history = hset_hist;
-@@ -6840,8 +6857,14 @@
-  
- void ReplicatedPG::remove_repop(RepGather *repop)
- {
-   dout(20) << __func__ << " " << *repop << dendl;
-+  if (repop->ctx->obc)
-+    dout(20) << " obc " << *repop->ctx->obc << dendl;
-+  if (repop->ctx->clone_obc)
-+    dout(20) << " clone_obc " << *repop->ctx->clone_obc << dendl;
-+  if (repop->ctx->snapset_obc)
-+    dout(20) << " snapset_obc " << *repop->ctx->snapset_obc << dendl;
-   release_op_ctx_locks(repop->ctx);
-   repop->ctx->finish(0);  // FIXME: return value here is sloppy
-   repop_map.erase(repop->rep_tid);
-   repop->put();
-@@ -7606,8 +7629,9 @@
-     parent->log_operation(
-       log,
-       m->updated_hit_set_history,
-       m->pg_trim_to,
-+      m->pg_trim_rollback_to,
-       update_snaps,
-       &(rm->localt));
-       
-     rm->bytes_written = rm->opt.get_encoded_bytes();
-@@ -7701,10 +7725,10 @@
-   uint64_t size = obc->obs.oi.size;
-   if (size)
-     data_subset.insert(0, size);
- 
--  if (get_parent()->get_pool().cache_mode != pg_pool_t::CACHEMODE_NONE) {
--    dout(10) << __func__ << ": caching enabled, skipping clone subsets" << dendl;
-+  if (get_parent()->get_pool().allow_incomplete_clones()) {
-+    dout(10) << __func__ << ": caching (was) enabled, skipping clone subsets" << dendl;
-     return;
-   }
- 
-   if (!cct->_conf->osd_recover_clone_overlap) {
-@@ -7761,10 +7785,10 @@
-   uint64_t size = snapset.clone_size[soid.snap];
-   if (size)
-     data_subset.insert(0, size);
- 
--  if (get_parent()->get_pool().cache_mode != pg_pool_t::CACHEMODE_NONE) {
--    dout(10) << __func__ << ": caching enabled, skipping clone subsets" << dendl;
-+  if (get_parent()->get_pool().allow_incomplete_clones()) {
-+    dout(10) << __func__ << ": caching (was) enabled, skipping clone subsets" << dendl;
-     return;
-   }
- 
-   if (!cct->_conf->osd_recover_clone_overlap) {
-@@ -9464,8 +9488,19 @@
- 
- void ReplicatedPG::on_pool_change()
- {
-   dout(10) << __func__ << dendl;
-+  // requeue cache full waiters just in case the cache_mode is
-+  // changing away from writeback mode.  note that if we are not
-+  // active the normal requeuing machinery is sufficient (and properly
-+  // ordered).
-+  if (is_active() &&
-+      pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
-+      !waiting_for_cache_not_full.empty()) {
-+    dout(10) << __func__ << " requeuing full waiters (not in writeback) "
-+	     << dendl;
-+    requeue_ops(waiting_for_cache_not_full);
-+  }
-   hit_set_setup();
-   agent_setup();
- }
- 
-@@ -11288,9 +11323,10 @@
-       return false;
-     }
-   }
- 
--  if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
-+  if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL &&
-+      hit_set) {
-     // is this object old and/or cold enough?
-     int atime = -1, temp = 0;
-     agent_estimate_atime_temp(soid, &atime, NULL /*FIXME &temp*/);
- 
-@@ -11420,9 +11456,13 @@
-     else
-       num_dirty = 0;
-   }
- 
--  dout(10) << __func__ << ": "
-+  dout(10) << __func__
-+	   << " flush_mode: "
-+	   << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
-+	   << " evict_mode: "
-+	   << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
- 	   << " num_objects: " << info.stats.stats.sum.num_objects
- 	   << " num_bytes: " << info.stats.stats.sum.num_bytes
- 	   << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
- 	   << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
-@@ -11434,9 +11474,9 @@
- 
-   // get dirty, full ratios
-   uint64_t dirty_micro = 0;
-   uint64_t full_micro = 0;
--  if (pool.info.target_max_bytes && info.stats.stats.sum.num_objects) {
-+  if (pool.info.target_max_bytes && info.stats.stats.sum.num_objects > 0) {
-     uint64_t avg_size = info.stats.stats.sum.num_bytes /
-       info.stats.stats.sum.num_objects;
-     dirty_micro =
-       num_dirty * avg_size * 1000000 /
-@@ -11444,9 +11484,9 @@
-     full_micro =
-       num_user_objects * avg_size * 1000000 /
-       MAX(pool.info.target_max_bytes / divisor, 1);
-   }
--  if (pool.info.target_max_objects) {
-+  if (pool.info.target_max_objects > 0) {
-     uint64_t dirty_objects_micro =
-       num_dirty * 1000000 /
-       MAX(pool.info.target_max_objects / divisor, 1);
-     if (dirty_objects_micro > dirty_micro)
-@@ -11530,10 +11570,12 @@
- 	    << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
- 	    << " -> "
- 	    << TierAgentState::get_evict_mode_name(evict_mode)
- 	    << dendl;
--    if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
-+    if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
-+	is_active()) {
-       requeue_ops(waiting_for_cache_not_full);
-+      requeue_ops(waiting_for_active);
-     }
-     agent_state->evict_mode = evict_mode;
-   }
-   uint64_t old_effort = agent_state->evict_effort;
-@@ -11659,9 +11701,9 @@
-       ::decode(snapset, blp);
- 
-       // did we finish the last oid?
-       if (head != hobject_t() &&
--	  pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
-+	  !pool.info.allow_incomplete_clones()) {
- 	osd->clog.error() << mode << " " << info.pgid << " " << head
- 			  << " missing clones";
-         ++scrubber.shallow_errors;
-       }
-@@ -11720,9 +11762,9 @@
-     //assert(data.length() == p->size);
-     //
- 
-     if (!next_clone.is_min() && next_clone != soid &&
--	pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE) {
-+	pool.info.allow_incomplete_clones()) {
-       // it is okay to be missing one or more clones in a cache tier.
-       // skip higher-numbered clones in the list.
-       while (curclone != snapset.clones.rend() &&
- 	     soid.snap < *curclone)
-@@ -11808,9 +11850,9 @@
-     scrub_cstat.add(stat, cat);
-   }
- 
-   if (!next_clone.is_min() &&
--      pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
-+      !pool.info.allow_incomplete_clones()) {
-     osd->clog.error() << mode << " " << info.pgid
- 		      << " expected clone " << next_clone;
-     ++scrubber.shallow_errors;
-   }
---- a/src/osd/ReplicatedPG.h
-+++ b/src/osd/ReplicatedPG.h
-@@ -346,15 +346,16 @@
-   void log_operation(
-     vector<pg_log_entry_t> &logv,
-     boost::optional<pg_hit_set_history_t> &hset_history,
-     const eversion_t &trim_to,
-+    const eversion_t &trim_rollback_to,
-     bool transaction_applied,
-     ObjectStore::Transaction *t) {
-     if (hset_history) {
-       info.hit_set = *hset_history;
-       dirty_info = true;
-     }
--    append_log(logv, trim_to, *t, transaction_applied);
-+    append_log(logv, trim_to, trim_rollback_to, *t, transaction_applied);
-   }
- 
-   void op_applied(
-     const eversion_t &applied_version);
---- a/src/osd/osd_types.cc
-+++ b/src/osd/osd_types.cc
-@@ -2101,10 +2101,10 @@
- void pg_notify_t::dump(Formatter *f) const
- {
-   f->dump_int("from", from);
-   f->dump_int("to", to);
--  f->dump_stream("query_epoch") << query_epoch;
--  f->dump_stream("epoch_sent") << epoch_sent;
-+  f->dump_unsigned("query_epoch", query_epoch);
-+  f->dump_unsigned("epoch_sent", epoch_sent);
-   {
-     f->open_object_section("info");
-     info.dump(f);
-     f->close_section();
-@@ -2460,10 +2460,10 @@
- 
- void ObjectModDesc::dump(Formatter *f) const
- {
-   f->open_object_section("object_mod_desc");
--  f->dump_stream("can_local_rollback") << can_local_rollback;
--  f->dump_stream("stashed") << stashed;
-+  f->dump_bool("can_local_rollback", can_local_rollback);
-+  f->dump_bool("rollback_info_completed", rollback_info_completed);
-   {
-     f->open_array_section("ops");
-     DumpVisitor vis(f);
-     visit(&vis);
-@@ -2496,17 +2496,17 @@
- void ObjectModDesc::encode(bufferlist &_bl) const
- {
-   ENCODE_START(1, 1, _bl);
-   ::encode(can_local_rollback, _bl);
--  ::encode(stashed, _bl);
-+  ::encode(rollback_info_completed, _bl);
-   ::encode(bl, _bl);
-   ENCODE_FINISH(_bl);
- }
- void ObjectModDesc::decode(bufferlist::iterator &_bl)
- {
-   DECODE_START(1, _bl);
-   ::decode(can_local_rollback, _bl);
--  ::decode(stashed, _bl);
-+  ::decode(rollback_info_completed, _bl);
-   ::decode(bl, _bl);
-   DECODE_FINISH(_bl);
- }
- 
-@@ -2679,19 +2679,20 @@
- // -- pg_log_t --
- 
- void pg_log_t::encode(bufferlist& bl) const
- {
--  ENCODE_START(5, 3, bl);
-+  ENCODE_START(6, 3, bl);
-   ::encode(head, bl);
-   ::encode(tail, bl);
-   ::encode(log, bl);
-   ::encode(can_rollback_to, bl);
-+  ::encode(rollback_info_trimmed_to, bl);
-   ENCODE_FINISH(bl);
- }
-  
- void pg_log_t::decode(bufferlist::iterator &bl, int64_t pool)
- {
--  DECODE_START_LEGACY_COMPAT_LEN(5, 3, 3, bl);
-+  DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, bl);
-   ::decode(head, bl);
-   ::decode(tail, bl);
-   if (struct_v < 2) {
-     bool backlog;
-@@ -2699,8 +2700,13 @@
-   }
-   ::decode(log, bl);
-   if (struct_v >= 5)
-     ::decode(can_rollback_to, bl);
-+
-+  if (struct_v >= 6)
-+    ::decode(rollback_info_trimmed_to, bl);
-+  else
-+    rollback_info_trimmed_to = tail;
-   DECODE_FINISH(bl);
- 
-   // handle hobject_t format change
-   if (struct_v < 4) {
---- a/src/osd/osd_types.h
-+++ b/src/osd/osd_types.h
-@@ -810,18 +810,20 @@
-     return "replicated";
-   }
- 
-   enum {
--    FLAG_HASHPSPOOL = 1, // hash pg seed and pool together (instead of adding)
--    FLAG_FULL       = 2, // pool is full
-+    FLAG_HASHPSPOOL = 1<<0, // hash pg seed and pool together (instead of adding)
-+    FLAG_FULL       = 1<<1, // pool is full
-     FLAG_DEBUG_FAKE_EC_POOL = 1<<2, // require ReplicatedPG to act like an EC pg
-+    FLAG_INCOMPLETE_CLONES = 1<<3, // may have incomplete clones (bc we are/were an overlay)
-   };
- 
-   static const char *get_flag_name(int f) {
-     switch (f) {
-     case FLAG_HASHPSPOOL: return "hashpspool";
-     case FLAG_FULL: return "full";
-     case FLAG_DEBUG_FAKE_EC_POOL: return "require_local_rollback";
-+    case FLAG_INCOMPLETE_CLONES: return "incomplete_clones";
-     default: return "???";
-     }
-   }
-   static string get_flags_string(uint64_t f) {
-@@ -867,8 +869,20 @@
-   }
-   const char *get_cache_mode_name() const {
-     return get_cache_mode_name(cache_mode);
-   }
-+  bool cache_mode_requires_hit_set() const {
-+    switch (cache_mode) {
-+    case CACHEMODE_NONE:
-+    case CACHEMODE_FORWARD:
-+    case CACHEMODE_READONLY:
-+      return false;
-+    case CACHEMODE_WRITEBACK:
-+      return true;
-+    default:
-+      assert(0 == "implement me");
-+    }
-+  }
- 
-   uint64_t flags;           ///< FLAG_*
-   __u8 type;                ///< TYPE_*
-   __u8 size, min_size;      ///< number of osds in each pg
-@@ -915,13 +929,31 @@
-   cache_mode_t cache_mode;  ///< cache pool mode
- 
-   bool is_tier() const { return tier_of >= 0; }
-   bool has_tiers() const { return !tiers.empty(); }
--  void clear_tier() { tier_of = -1; }
-+  void clear_tier() {
-+    tier_of = -1;
-+    clear_read_tier();
-+    clear_write_tier();
-+    clear_tier_tunables();
-+  }
-   bool has_read_tier() const { return read_tier >= 0; }
-   void clear_read_tier() { read_tier = -1; }
-   bool has_write_tier() const { return write_tier >= 0; }
-   void clear_write_tier() { write_tier = -1; }
-+  void clear_tier_tunables() {
-+    if (cache_mode != CACHEMODE_NONE)
-+      flags |= FLAG_INCOMPLETE_CLONES;
-+    cache_mode = CACHEMODE_NONE;
-+
-+    target_max_bytes = 0;
-+    target_max_objects = 0;
-+    cache_target_dirty_ratio_micro = 0;
-+    cache_target_full_ratio_micro = 0;
-+    hit_set_params = HitSet::Params();
-+    hit_set_period = 0;
-+    hit_set_count = 0;
-+  }
- 
-   uint64_t target_max_bytes;   ///< tiering: target max pool size
-   uint64_t target_max_objects; ///< tiering: target max pool size
- 
-@@ -963,8 +995,9 @@
- 
-   void dump(Formatter *f) const;
- 
-   uint64_t get_flags() const { return flags; }
-+  bool has_flag(uint64_t f) const { return flags & f; }
- 
-   /// This method will later return true for ec pools as well
-   bool ec_pool() const {
-     return type == TYPE_ERASURE;
-@@ -972,8 +1005,13 @@
-   bool require_rollback() const {
-     return ec_pool() || flags & FLAG_DEBUG_FAKE_EC_POOL;
-   }
- 
-+  /// true if incomplete clones may be present
-+  bool allow_incomplete_clones() const {
-+    return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES);
-+  }
-+
-   unsigned get_type() const { return type; }
-   unsigned get_size() const { return size; }
-   unsigned get_min_size() const { return min_size; }
-   int get_crush_ruleset() const { return crush_ruleset; }
-@@ -1810,9 +1848,9 @@
- 
- class PGBackend;
- class ObjectModDesc {
-   bool can_local_rollback;
--  bool stashed;
-+  bool rollback_info_completed;
- public:
-   class Visitor {
-   public:
-     virtual void append(uint64_t old_offset) {}
-@@ -1830,75 +1868,76 @@
-     DELETE = 3,
-     CREATE = 4,
-     UPDATE_SNAPS = 5
-   };
--  ObjectModDesc() : can_local_rollback(true), stashed(false) {}
-+  ObjectModDesc() : can_local_rollback(true), rollback_info_completed(false) {}
-   void claim(ObjectModDesc &other) {
-     bl.clear();
-     bl.claim(other.bl);
-     can_local_rollback = other.can_local_rollback;
--    stashed = other.stashed;
-+    rollback_info_completed = other.rollback_info_completed;
-   }
-   void claim_append(ObjectModDesc &other) {
--    if (!can_local_rollback || stashed)
-+    if (!can_local_rollback || rollback_info_completed)
-       return;
-     if (!other.can_local_rollback) {
-       mark_unrollbackable();
-       return;
-     }
-     bl.claim_append(other.bl);
--    stashed = other.stashed;
-+    rollback_info_completed = other.rollback_info_completed;
-   }
-   void swap(ObjectModDesc &other) {
-     bl.swap(other.bl);
- 
-     bool temp = other.can_local_rollback;
-     other.can_local_rollback = can_local_rollback;
-     can_local_rollback = temp;
- 
--    temp = other.stashed;
--    other.stashed = stashed;
--    stashed = temp;
-+    temp = other.rollback_info_completed;
-+    other.rollback_info_completed = rollback_info_completed;
-+    rollback_info_completed = temp;
-   }
-   void append_id(ModID id) {
-     uint8_t _id(id);
-     ::encode(_id, bl);
-   }
-   void append(uint64_t old_size) {
--    if (!can_local_rollback || stashed)
-+    if (!can_local_rollback || rollback_info_completed)
-       return;
-     ENCODE_START(1, 1, bl);
-     append_id(APPEND);
-     ::encode(old_size, bl);
-     ENCODE_FINISH(bl);
-   }
-   void setattrs(map<string, boost::optional<bufferlist> > &old_attrs) {
--    if (!can_local_rollback || stashed)
-+    if (!can_local_rollback || rollback_info_completed)
-       return;
-     ENCODE_START(1, 1, bl);
-     append_id(SETATTRS);
-     ::encode(old_attrs, bl);
-     ENCODE_FINISH(bl);
-   }
-   bool rmobject(version_t deletion_version) {
--    if (!can_local_rollback || stashed)
-+    if (!can_local_rollback || rollback_info_completed)
-       return false;
-     ENCODE_START(1, 1, bl);
-     append_id(DELETE);
-     ::encode(deletion_version, bl);
-     ENCODE_FINISH(bl);
--    stashed = true;
-+    rollback_info_completed = true;
-     return true;
-   }
-   void create() {
--    if (!can_local_rollback || stashed)
-+    if (!can_local_rollback || rollback_info_completed)
-       return;
-+    rollback_info_completed = true;
-     ENCODE_START(1, 1, bl);
-     append_id(CREATE);
-     ENCODE_FINISH(bl);
-   }
-   void update_snaps(set<snapid_t> &old_snaps) {
--    if (!can_local_rollback || stashed)
-+    if (!can_local_rollback || rollback_info_completed)
-       return;
-     ENCODE_START(1, 1, bl);
-     append_id(UPDATE_SNAPS);
-     ::encode(old_snaps, bl);
-@@ -2060,8 +2099,12 @@
- 
-   // We can rollback rollback-able entries > can_rollback_to
-   eversion_t can_rollback_to;
- 
-+  // always <= can_rollback_to, indicates how far stashed rollback
-+  // data can be found
-+  eversion_t rollback_info_trimmed_to;
-+
-   list<pg_log_entry_t> log;  // the actual log.
-   
-   pg_log_t() {}
- 
-@@ -2761,21 +2804,23 @@
- 	return false;
-       }
-     }
- 
--    bool get_write(OpRequestRef op) {
--      if (get_write_lock()) {
-+    bool get_write(OpRequestRef op, bool greedy=false) {
-+      if (get_write_lock(greedy)) {
- 	return true;
-       } // else
-       if (op)
- 	waiters.push_back(op);
-       return false;
-     }
--    bool get_write_lock() {
--      // don't starve anybody!
--      if (!waiters.empty() ||
--	  backfill_read_marker) {
--	return false;
-+    bool get_write_lock(bool greedy=false) {
-+      if (!greedy) {
-+	// don't starve anybody!
-+	if (!waiters.empty() ||
-+	    backfill_read_marker) {
-+	  return false;
-+	}
-       }
-       switch (state) {
-       case RWNONE:
- 	assert(count == 0);
-@@ -2822,9 +2867,12 @@
-   bool get_read(OpRequestRef op) {
-     return rwstate.get_read(op);
-   }
-   bool get_write(OpRequestRef op) {
--    return rwstate.get_write(op);
-+    return rwstate.get_write(op, false);
-+  }
-+  bool get_write_greedy(OpRequestRef op) {
-+    return rwstate.get_write(op, true);
-   }
-   bool get_snaptrimmer_write() {
-     if (rwstate.get_write_lock()) {
-       return true;
---- a/src/osdc/Objecter.cc
-+++ b/src/osdc/Objecter.cc
-@@ -1363,8 +1363,13 @@
-   }
- 
-   ldout(cct, 10) << __func__ << " tid " << tid << dendl;
-   Op *op = p->second;
-+  if (op->con) {
-+    ldout(cct, 20) << " revoking rx buffer for " << tid
-+		   << " on " << op->con << dendl;
-+    op->con->revoke_rx_buffer(tid);
-+  }
-   if (op->onack) {
-     op->onack->complete(r);
-     op->onack = NULL;
-   }
-@@ -1433,9 +1438,9 @@
-     return -ENOENT;
-   return p->raw_hash_to_pg(p->hash_key(key, ns));
- }
- 
--int Objecter::calc_target(op_target_t *t)
-+int Objecter::calc_target(op_target_t *t, bool any_change)
- {
-   bool is_read = t->flags & CEPH_OSD_FLAG_READ;
-   bool is_write = t->flags & CEPH_OSD_FLAG_WRITE;
- 
-@@ -1490,9 +1495,10 @@
-     need_resend = true;
-   }
- 
-   if (t->pgid != pgid ||
--      is_pg_changed(t->primary, t->acting, primary, acting, t->used_replica) ||
-+      is_pg_changed(
-+	t->primary, t->acting, primary, acting, t->used_replica || any_change) ||
-       force_resend) {
-     t->pgid = pgid;
-     t->acting = acting;
-     t->primary = primary;
-@@ -1569,9 +1575,9 @@
- }
- 
- bool Objecter::recalc_linger_op_target(LingerOp *linger_op)
- {
--  int r = calc_target(&linger_op->target);
-+  int r = calc_target(&linger_op->target, true);
-   if (r == RECALC_OP_TARGET_NEED_RESEND) {
-     ldout(cct, 10) << "recalc_linger_op_target tid " << linger_op->linger_id
- 		   << " pgid " << linger_op->target.pgid
- 		   << " acting " << linger_op->target.acting << dendl;
---- a/src/osdc/Objecter.h
-+++ b/src/osdc/Objecter.h
-@@ -1479,9 +1479,9 @@
-   };
-   bool osdmap_full_flag() const;
-   bool target_should_be_paused(op_target_t *op);
- 
--  int calc_target(op_target_t *t);
-+  int calc_target(op_target_t *t, bool any_change=false);
-   int recalc_op_target(Op *op);
-   bool recalc_linger_op_target(LingerOp *op);
- 
-   void send_linger(LingerOp *info);
---- a/src/pybind/rbd.py
-+++ b/src/pybind/rbd.py
-@@ -749,8 +749,16 @@
-         ret = self.librbd.rbd_flush(self.image)
-         if ret < 0:
-             raise make_ex(ret, 'error flushing image')
- 
-+    def invalidate_cache(self):
-+        """
-+        Drop any cached data for the image.
-+        """
-+        ret = self.librbd.rbd_invalidate_cache(self.image)
-+        if ret < 0:
-+            raise make_ex(ret, 'error invalidating cache')
-+
-     def stripe_unit(self):
-         """
-         Returns the stripe unit used for the image.
-         """
---- a/src/rgw/rgw_common.cc
-+++ b/src/rgw/rgw_common.cc
-@@ -696,15 +696,17 @@
-   char dest[src_str.size() + 1];
-   int pos = 0;
-   char c;
- 
-+  bool in_query = false;
-   while (*src) {
-     if (*src != '%') {
--      if (*src != '+') {
--	dest[pos++] = *src++;
-+      if (!in_query || *src != '+') {
-+        if (*src == '?') in_query = true;
-+        dest[pos++] = *src++;
-       } else {
--	dest[pos++] = ' ';
--	++src;
-+        dest[pos++] = ' ';
-+        ++src;
-       }
-     } else {
-       src++;
-       if (!*src)
---- a/src/rgw/rgw_op.cc
-+++ b/src/rgw/rgw_op.cc
-@@ -1379,9 +1379,12 @@
- };
- 
- int RGWPutObjProcessor_Multipart::prepare(RGWRados *store, void *obj_ctx, string *oid_rand)
- {
--  RGWPutObjProcessor::prepare(store, obj_ctx, NULL);
-+  int r = prepare_init(store, obj_ctx, NULL);
-+  if (r < 0) {
-+    return r;
-+  }
- 
-   string oid = obj_str;
-   upload_id = s->info.args.get("uploadId");
-   if (!oid_rand) {
-@@ -1418,9 +1421,9 @@
-   manifest.set_prefix(upload_prefix);
- 
-   manifest.set_multipart_part_rule(store->ctx()->_conf->rgw_obj_stripe_size, num);
- 
--  int r = manifest_gen.create_begin(store->ctx(), &manifest, bucket, target_obj);
-+  r = manifest_gen.create_begin(store->ctx(), &manifest, bucket, target_obj);
-   if (r < 0) {
-     return r;
-   }
- 
-@@ -1559,8 +1562,38 @@
- 
-   return 0;
- }
- 
-+static int put_data_and_throttle(RGWPutObjProcessor *processor, bufferlist& data, off_t ofs,
-+                                 MD5 *hash, bool need_to_wait)
-+{
-+  const unsigned char *data_ptr = (hash ? (const unsigned char *)data.c_str() : NULL);
-+  bool again;
-+  uint64_t len = data.length();
-+
-+  do {
-+    void *handle;
-+
-+    int ret = processor->handle_data(data, ofs, &handle, &again);
-+    if (ret < 0)
-+      return ret;
-+
-+    if (hash) {
-+      hash->Update(data_ptr, len);
-+      hash = NULL; /* only calculate hash once */
-+    }
-+
-+    ret = processor->throttle_data(handle, need_to_wait);
-+    if (ret < 0)
-+      return ret;
-+
-+    need_to_wait = false; /* the need to wait only applies to the first iteration */
-+  } while (again);
-+
-+  return 0;
-+}
-+
-+
- void RGWPutObj::execute()
- {
-   RGWPutObjProcessor *processor = NULL;
-   char supplied_md5_bin[CEPH_CRYPTO_MD5_DIGESTSIZE + 1];
-@@ -1632,25 +1665,14 @@
-     }
-     if (!len)
-       break;
- 
--    void *handle;
--    const unsigned char *data_ptr = (const unsigned char *)data.c_str();
--
--    ret = processor->handle_data(data, ofs, &handle);
--    if (ret < 0)
--      goto done;
--
--    if (need_calc_md5) {
--      hash.Update(data_ptr, len);
--    }
--
-     /* do we need this operation to be synchronous? if we're dealing with an object with immutable
-      * head, e.g., multipart object we need to make sure we're the first one writing to this object
-      */
-     bool need_to_wait = (ofs == 0) && multipart;
- 
--    ret = processor->throttle_data(handle, need_to_wait);
-+    ret = put_data_and_throttle(processor, data, ofs, (need_calc_md5 ? &hash : NULL), need_to_wait);
-     if (ret < 0) {
-       if (!need_to_wait || ret != -EEXIST) {
-         ldout(s->cct, 20) << "processor->thottle_data() returned ret=" << ret << dendl;
-         goto done;
-@@ -1673,17 +1695,10 @@
-         ldout(s->cct, 0) << "ERROR: processor->prepare() returned " << ret << dendl;
-         goto done;
-       }
- 
--      ret = processor->handle_data(data, ofs, &handle);
-+      ret = put_data_and_throttle(processor, data, ofs, NULL, false);
-       if (ret < 0) {
--        ldout(s->cct, 0) << "ERROR: processor->handle_data() returned " << ret << dendl;
--        goto done;
--      }
--
--      ret = processor->throttle_data(handle, false);
--      if (ret < 0) {
--        ldout(s->cct, 0) << "ERROR: processor->throttle_data() returned " << ret << dendl;
-         goto done;
-       }
-     }
- 
-@@ -1845,20 +1860,9 @@
- 
-      if (!len)
-        break;
- 
--     void *handle;
--     const unsigned char *data_ptr = (const unsigned char *)data.c_str();
--
--     ret = processor->handle_data(data, ofs, &handle);
--     if (ret < 0)
--       goto done;
--
--     hash.Update(data_ptr, len);
--
--     ret = processor->throttle_data(handle, false);
--     if (ret < 0)
--       goto done;
-+     ret = put_data_and_throttle(processor, data, ofs, &hash, false);
- 
-      ofs += len;
- 
-      if (ofs > max_len) {
---- a/src/rgw/rgw_rados.cc
-+++ b/src/rgw/rgw_rados.cc
-@@ -899,10 +899,12 @@
- 
-   return 0;
- };
- 
--int RGWPutObjProcessor_Plain::handle_data(bufferlist& bl, off_t _ofs, void **phandle)
-+int RGWPutObjProcessor_Plain::handle_data(bufferlist& bl, off_t _ofs, void **phandle, bool *again)
- {
-+  *again = false;
-+
-   if (ofs != _ofs)
-     return -EINVAL;
- 
-   data.append(bl);
-@@ -1025,10 +1027,12 @@
- 
-   return RGWPutObjProcessor_Aio::handle_obj_data(cur_obj, bl, ofs - cur_part_ofs, ofs, phandle, exclusive);
- }
- 
--int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **phandle)
-+int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **phandle, bool *again)
- {
-+  *again = false;
-+
-   *phandle = NULL;
-   if (extra_data_len) {
-     size_t extra_len = bl.length();
-     if (extra_len > extra_data_len)
-@@ -1043,15 +1047,18 @@
-       return 0;
-     }
-   }
- 
--  uint64_t max_chunk_size = store->get_max_chunk_size();
-+  uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
- 
-   pending_data_bl.claim_append(bl);
--  if (pending_data_bl.length() < max_chunk_size)
-+  if (pending_data_bl.length() < max_write_size)
-     return 0;
- 
--  pending_data_bl.splice(0, max_chunk_size, &bl);
-+  pending_data_bl.splice(0, max_write_size, &bl);
-+
-+  /* has enough pending data accumulated that another write is needed? */
-+  *again = (pending_data_bl.length() >= max_chunk_size);
- 
-   if (!data_ofs && !immutable_head()) {
-     first_chunk.claim(bl);
-     obj_len = (uint64_t)first_chunk.length();
-@@ -1069,19 +1076,32 @@
-                                                         object and cleanup can be messy */
-   return write_data(bl, write_ofs, phandle, exclusive);
- }
- 
--int RGWPutObjProcessor_Atomic::prepare(RGWRados *store, void *obj_ctx, string *oid_rand)
-+
-+int RGWPutObjProcessor_Atomic::prepare_init(RGWRados *store, void *obj_ctx, string *oid_rand)
- {
-   RGWPutObjProcessor::prepare(store, obj_ctx, oid_rand);
- 
--  head_obj.init(bucket, obj_str);
-+  int r = store->get_max_chunk_size(bucket, &max_chunk_size);
-+  if (r < 0) {
-+    return r;
-+  }
-+
-+  return 0;
-+}
- 
--  uint64_t max_chunk_size = store->get_max_chunk_size();
-+int RGWPutObjProcessor_Atomic::prepare(RGWRados *store, void *obj_ctx, string *oid_rand)
-+{
-+  int r = prepare_init(store, obj_ctx, oid_rand);
-+  if (r < 0) {
-+    return r;
-+  }
-+  head_obj.init(bucket, obj_str);
- 
-   manifest.set_trivial_rule(max_chunk_size, store->ctx()->_conf->rgw_obj_stripe_size);
- 
--  int r = manifest_gen.create_begin(store->ctx(), &manifest, bucket, head_obj);
-+  r = manifest_gen.create_begin(store->ctx(), &manifest, bucket, head_obj);
-   if (r < 0) {
-     return r;
-   }
- 
-@@ -1200,8 +1220,46 @@
-     objs_state[new_obj].prefetch_data = true;
-   }
- }
- 
-+int RGWRados::get_required_alignment(rgw_bucket& bucket, uint64_t *alignment)
-+{
-+  IoCtx ioctx;
-+  int r = open_bucket_data_ctx(bucket, ioctx);
-+  if (r < 0) {
-+    ldout(cct, 0) << "ERROR: open_bucket_data_ctx() returned " << r << dendl;
-+    return r;
-+  }
-+
-+  *alignment = ioctx.pool_required_alignment();
-+  return 0;
-+}
-+
-+int RGWRados::get_max_chunk_size(rgw_bucket& bucket, uint64_t *max_chunk_size)
-+{
-+  uint64_t alignment;
-+  int r = get_required_alignment(bucket, &alignment);
-+  if (r < 0) {
-+    return r;
-+  }
-+
-+  uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
-+
-+  if (alignment == 0) {
-+    *max_chunk_size = config_chunk_size;
-+    return 0;
-+  }
-+
-+  if (config_chunk_size <= alignment) {
-+    *max_chunk_size = alignment;
-+    return 0;
-+  }
-+
-+  *max_chunk_size = config_chunk_size - (config_chunk_size % alignment);
-+
-+  return 0;
-+}
-+
- void RGWRados::finalize()
- {
-   if (need_watch_notify()) {
-     finalize_watch();
-@@ -1235,10 +1293,8 @@
- int RGWRados::init_rados()
- {
-   int ret;
- 
--  max_chunk_size = cct->_conf->rgw_max_chunk_size;
--
-   rados = new Rados();
-   if (!rados)
-     return -ENOMEM;
- 
-@@ -2956,27 +3012,35 @@
-                                                                        progress_data(_progress_data) {}
-   int handle_data(bufferlist& bl, off_t ofs, off_t len) {
-     progress_cb(ofs, progress_data);
- 
--    void *handle;
--    int ret = processor->handle_data(bl, ofs, &handle);
--    if (ret < 0)
--      return ret;
-+    bool again;
- 
--    if (opstate) {
--      /* need to update opstate repository with new state. This is ratelimited, so we're not
--       * really doing it every time
--       */
--      ret = opstate->renew_state();
--      if (ret < 0) {
--        /* could not renew state! might have been marked as cancelled */
-+    bool need_opstate = true;
-+
-+    do {
-+      void *handle;
-+      int ret = processor->handle_data(bl, ofs, &handle, &again);
-+      if (ret < 0)
-         return ret;
-+
-+      if (need_opstate && opstate) {
-+        /* need to update opstate repository with new state. This is ratelimited, so we're not
-+         * really doing it every time
-+         */
-+        ret = opstate->renew_state();
-+        if (ret < 0) {
-+          /* could not renew state! might have been marked as cancelled */
-+          return ret;
-+        }
-+
-+        need_opstate = false;
-       }
--    }
- 
--    ret = processor->throttle_data(handle, false);
--    if (ret < 0)
--      return ret;
-+      ret = processor->throttle_data(handle, false);
-+      if (ret < 0)
-+        return ret;
-+    } while (again);
- 
-     return 0;
-   }
- 
-@@ -3191,26 +3255,8 @@
-     return ret;
- 
-   vector<rgw_obj> ref_objs;
- 
--  bool copy_data = !astate->has_manifest;
--  bool copy_first = false;
--  if (astate->has_manifest) {
--    if (!astate->manifest.has_tail()) {
--      copy_data = true;
--    } else {
--      uint64_t head_size = astate->manifest.get_head_size();
--
--      if (head_size > 0) {
--	if (head_size > max_chunk_size)  // should never happen
--	  copy_data = true;
--	else
--          copy_first = true;
--      }
--    }
--  }
--
--
-   if (remote_dest) {
-     /* dest is in a different region, copy it there */
- 
-     string etag;
-@@ -3229,10 +3275,37 @@
-     if (ret < 0)
-       return ret;
- 
-     return 0;
--  } else if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
--    return copy_obj_data(ctx, dest_bucket_info.owner, &handle, end, dest_obj, src_obj, mtime, src_attrs, category, ptag, err);
-+  }
-+  
-+  uint64_t max_chunk_size;
-+
-+  ret = get_max_chunk_size(dest_obj.bucket, &max_chunk_size);
-+  if (ret < 0) {
-+    ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj.bucket << dendl;
-+    return ret;
-+  }
-+
-+  bool copy_data = !astate->has_manifest;
-+  bool copy_first = false;
-+  if (astate->has_manifest) {
-+    if (!astate->manifest.has_tail()) {
-+      copy_data = true;
-+    } else {
-+      uint64_t head_size = astate->manifest.get_head_size();
-+
-+      if (head_size > 0) {
-+	if (head_size > max_chunk_size)
-+	  copy_data = true;
-+	else
-+          copy_first = true;
-+      }
-+    }
-+  }
-+
-+  if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
-+    return copy_obj_data(ctx, dest_bucket_info.owner, &handle, end, dest_obj, src_obj, max_chunk_size, mtime, src_attrs, category, ptag, err);
-   }
- 
-   RGWObjManifest::obj_iterator miter = astate->manifest.obj_begin();
- 
-@@ -3340,8 +3413,9 @@
-                const string& owner,
- 	       void **handle, off_t end,
-                rgw_obj& dest_obj,
-                rgw_obj& src_obj,
-+               uint64_t max_chunk_size,
- 	       time_t *mtime,
-                map<string, bufferlist>& attrs,
-                RGWObjCategory category,
-                string *ptag,
-@@ -4472,8 +4546,10 @@
- 
-   bool merge_bl = false;
-   bufferlist *pbl = &bl;
-   bufferlist read_bl;
-+  uint64_t max_chunk_size;
-+
- 
-   get_obj_bucket_and_oid_key(obj, bucket, oid, key);
- 
-   if (!rctx) {
-@@ -4504,8 +4580,14 @@
-       get_obj_bucket_and_oid_key(read_obj, bucket, oid, key);
-     }
-   }
- 
-+  r = get_max_chunk_size(bucket, &max_chunk_size);
-+  if (r < 0) {
-+    ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << bucket << dendl;
-+    goto done_ret;
-+  }
-+
-   if (len > max_chunk_size)
-     len = max_chunk_size;
- 
- 
---- a/src/rgw/rgw_rados.h
-+++ b/src/rgw/rgw_rados.h
-@@ -547,9 +547,9 @@
-     store = _store;
-     obj_ctx = _o;
-     return 0;
-   };
--  virtual int handle_data(bufferlist& bl, off_t ofs, void **phandle) = 0;
-+  virtual int handle_data(bufferlist& bl, off_t ofs, void **phandle, bool *again) = 0;
-   virtual int throttle_data(void *handle, bool need_to_wait) = 0;
-   virtual int complete(string& etag, time_t *mtime, time_t set_mtime, map<string, bufferlist>& attrs);
- };
- 
-@@ -563,9 +563,9 @@
-   off_t ofs;
- 
- protected:
-   int prepare(RGWRados *store, void *obj_ctx, string *oid_rand);
--  int handle_data(bufferlist& bl, off_t ofs, void **phandle);
-+  int handle_data(bufferlist& bl, off_t ofs, void **phandle, bool *again);
-   int do_complete(string& etag, time_t *mtime, time_t set_mtime, map<string, bufferlist>& attrs);
- 
- public:
-   int throttle_data(void *handle, bool need_to_wait) { return 0; }
-@@ -612,8 +612,10 @@
- 
-   uint64_t extra_data_len;
-   bufferlist extra_data_bl;
-   bufferlist pending_data_bl;
-+  uint64_t max_chunk_size;
-+
- protected:
-   rgw_bucket bucket;
-   string obj_str;
- 
-@@ -630,8 +632,10 @@
-   int prepare_next_part(off_t ofs);
-   int complete_parts();
-   int complete_writing_data();
- 
-+  int prepare_init(RGWRados *store, void *obj_ctx, string *oid_rand);
-+
- public:
-   ~RGWPutObjProcessor_Atomic() {}
-   RGWPutObjProcessor_Atomic(const string& bucket_owner, rgw_bucket& _b, const string& _o, uint64_t _p, const string& _t) :
-                                 RGWPutObjProcessor_Aio(bucket_owner),
-@@ -640,17 +644,18 @@
-                                 next_part_ofs(_p),
-                                 cur_part_id(0),
-                                 data_ofs(0),
-                                 extra_data_len(0),
-+                                max_chunk_size(0),
-                                 bucket(_b),
-                                 obj_str(_o),
-                                 unique_tag(_t) {}
-   int prepare(RGWRados *store, void *obj_ctx, string *oid_rand);
-   virtual bool immutable_head() { return false; }
-   void set_extra_data_len(uint64_t len) {
-     extra_data_len = len;
-   }
--  virtual int handle_data(bufferlist& bl, off_t ofs, void **phandle);
-+  virtual int handle_data(bufferlist& bl, off_t ofs, void **phandle, bool *again);
-   bufferlist& get_extra_data() { return extra_data_bl; }
- };
- 
- 
-@@ -1220,10 +1225,8 @@
-   int get_obj_ioctx(const rgw_obj& obj, librados::IoCtx *ioctx);
-   int get_obj_ref(const rgw_obj& obj, rgw_rados_ref *ref, rgw_bucket *bucket, bool ref_system_obj = false);
-   uint64_t max_bucket_id;
- 
--  uint64_t max_chunk_size;
--
-   int get_obj_state(RGWRadosCtx *rctx, rgw_obj& obj, RGWObjState **state, RGWObjVersionTracker *objv_tracker);
-   int append_atomic_test(RGWRadosCtx *rctx, rgw_obj& obj,
-                          librados::ObjectOperation& op, RGWObjState **state);
-   int prepare_atomic_for_write_impl(RGWRadosCtx *rctx, rgw_obj& obj,
-@@ -1286,9 +1289,8 @@
-                gc(NULL), use_gc_thread(false), quota_threads(false),
-                num_watchers(0), watchers(NULL), watch_handles(NULL),
-                watch_initialized(false),
-                bucket_id_lock("rados_bucket_id"), max_bucket_id(0),
--               max_chunk_size(0),
-                cct(NULL), rados(NULL),
-                pools_initialized(false),
-                quota_handler(NULL),
-                rest_master_conn(NULL),
-@@ -1324,11 +1326,10 @@
-       delete rados;
-     }
-   }
- 
--  uint64_t get_max_chunk_size() {
--    return max_chunk_size;
--  }
-+  int get_required_alignment(rgw_bucket& bucket, uint64_t *alignment);
-+  int get_max_chunk_size(rgw_bucket& bucket, uint64_t *max_chunk_size);
- 
-   int list_raw_objects(rgw_bucket& pool, const string& prefix_filter, int max,
-                        RGWListRawObjsCtx& ctx, list<string>& oids,
-                        bool *is_truncated);
-@@ -1562,8 +1563,9 @@
-                const string& owner,
- 	       void **handle, off_t end,
-                rgw_obj& dest_obj,
-                rgw_obj& src_obj,
-+               uint64_t max_chunk_size,
- 	       time_t *mtime,
-                map<string, bufferlist>& attrs,
-                RGWObjCategory category,
-                string *ptag,
---- a/src/rgw/rgw_rest.cc
-+++ b/src/rgw/rgw_rest.cc
-@@ -179,9 +179,9 @@
- {
-   std::ostringstream oss;
-   formatter->flush(oss);
-   std::string outs(oss.str());
--  if (!outs.empty()) {
-+  if (!outs.empty() && s->op != OP_HEAD) {
-     s->cio->write(outs.c_str(), outs.size());
-   }
- 
-   s->formatter->reset();
-@@ -191,9 +191,9 @@
- {
-   std::ostringstream oss;
-   formatter->flush(oss);
-   std::string outs(oss.str());
--  if (!outs.empty()) {
-+  if (!outs.empty() && s->op != OP_HEAD) {
-     s->cio->write(outs.c_str(), outs.size());
-   }
- }
- 
---- a/src/rgw/rgw_rest_swift.cc
-+++ b/src/rgw/rgw_rest_swift.cc
-@@ -626,20 +626,18 @@
-   string hdrs, exp_hdrs;
-   uint32_t max_age = CORS_MAX_AGE_INVALID;
-   /*EACCES means, there is no CORS registered yet for the bucket
-    *ENOENT means, there is no match of the Origin in the list of CORSRule
--   *ENOTSUPP means, the HTTP_METHOD is not supported
-    */
-   if (ret == -ENOENT)
-     ret = -EACCES;
--  if (ret != -EACCES) {
--    get_response_params(hdrs, exp_hdrs, &max_age);
--  } else {
-+  if (ret < 0) {
-     set_req_state_err(s, ret);
-     dump_errno(s);
-     end_header(s, NULL);
-     return;
-   }
-+  get_response_params(hdrs, exp_hdrs, &max_age);
-   dump_errno(s);
-   dump_access_control(s, origin, req_meth, hdrs.c_str(), exp_hdrs.c_str(), max_age); 
-   end_header(s, NULL);
- }
---- a/src/test/crush/TestCrushWrapper.cc
-+++ b/src/test/crush/TestCrushWrapper.cc
-@@ -537,8 +537,13 @@
-     EXPECT_NE(string::npos,
- 	      ss.str().find("<item_name>default</item_name></step>"));
-   }
- 
-+  map<int,float> wm;
-+  c->get_rule_weight_osd_map(0, &wm);
-+  ASSERT_TRUE(wm.size() == 1);
-+  ASSERT_TRUE(wm[0] == 1.0);
-+
-   delete c;
- }
- 
- TEST(CrushWrapper, distance) {
---- a/src/test/erasure-code/TestErasureCodeJerasure.cc
-+++ b/src/test/erasure-code/TestErasureCodeJerasure.cc
-@@ -287,8 +287,38 @@
-       c->insert_item(g_ceph_context, osd, 1.0, string("osd.") + stringify(osd), loc);
-     }
-   }
- 
-+  //
-+  // The ruleid may be different from the ruleset when a crush rule is
-+  // removed because the removed ruleid will be reused but the removed
-+  // ruleset will not be reused. 
-+  //
-+  // This also asserts that the create_ruleset() method returns a
-+  // ruleset and not a ruleid http://tracker.ceph.com/issues/9044
-+  //
-+  {
-+    stringstream ss;
-+    ErasureCodeJerasureReedSolomonVandermonde jerasure;
-+    map<std::string,std::string> parameters;
-+    parameters["k"] = "2";
-+    parameters["m"] = "2";
-+    parameters["w"] = "8";
-+    jerasure.init(parameters);
-+    int FIRST = jerasure.create_ruleset("FIRST", *c, &ss);
-+    int SECOND = jerasure.create_ruleset("SECOND", *c, &ss);
-+    int FIRST_ruleid = c->get_rule_id("FIRST");
-+    EXPECT_EQ(0, c->remove_rule(FIRST_ruleid));
-+    int ruleset = jerasure.create_ruleset("myrule", *c, &ss);
-+    EXPECT_NE(FIRST, ruleset);
-+    EXPECT_NE(SECOND, ruleset);
-+    EXPECT_NE(ruleset, c->get_rule_id("myrule"));
-+    int SECOND_ruleid = c->get_rule_id("SECOND");
-+    EXPECT_EQ(0, c->remove_rule(SECOND_ruleid));
-+    int myrule_ruleid = c->get_rule_id("myrule");
-+    EXPECT_EQ(0, c->remove_rule(myrule_ruleid));
-+  }
-+
-   {
-     stringstream ss;
-     ErasureCodeJerasureReedSolomonVandermonde jerasure;
-     map<std::string,std::string> parameters;
---- a/src/test/librados/TestCase.cc
-+++ b/src/test/librados/TestCase.cc
-@@ -7,8 +7,9 @@
- 
- using namespace librados;
- 
- std::string RadosTest::pool_name;
-+std::string RadosTest::nspace;
- rados_t RadosTest::s_cluster = NULL;
- 
- void RadosTest::SetUpTestCase()
- {
-@@ -24,9 +25,9 @@
- void RadosTest::SetUp()
- {
-   cluster = RadosTest::s_cluster;
-   ASSERT_EQ(0, rados_ioctx_create(cluster, pool_name.c_str(), &ioctx));
--  std::string nspace = get_temp_pool_name();
-+  nspace = get_temp_pool_name();
-   rados_ioctx_set_namespace(ioctx, nspace.c_str());
-   ASSERT_FALSE(rados_ioctx_pool_requires_alignment(ioctx));
- }
- 
-@@ -205,26 +206,8 @@
-   cleanup_default_namespace(ioctx);
-   rados_ioctx_destroy(ioctx);
- }
- 
--void RadosTestEC::cleanup_default_namespace(rados_ioctx_t ioctx)
--{
--  // remove all objects from the default namespace to avoid polluting
--  // other tests
--  rados_ioctx_set_namespace(ioctx, "");
--  rados_list_ctx_t list_ctx;
--  ASSERT_EQ(0, rados_objects_list_open(ioctx, &list_ctx));
--  int r;
--  const char *entry = NULL;
--  const char *key = NULL;
--  while ((r = rados_objects_list_next(list_ctx, &entry, &key)) != -ENOENT) {
--    ASSERT_EQ(0, r);
--    rados_ioctx_locator_set_key(ioctx, key);
--    ASSERT_EQ(0, rados_remove(ioctx, entry));
--  }
--  rados_objects_list_close(list_ctx);
--}
--
- std::string RadosTestECPP::pool_name;
- Rados RadosTestECPP::s_cluster;
- 
- void RadosTestECPP::SetUpTestCase()
-@@ -253,15 +236,4 @@
-   cleanup_default_namespace(ioctx);
-   ioctx.close();
- }
- 
--void RadosTestECPP::cleanup_default_namespace(librados::IoCtx ioctx)
--{
--  // remove all objects from the default namespace to avoid polluting
--  // other tests
--  ioctx.set_namespace("");
--  for (ObjectIterator it = ioctx.objects_begin();
--       it != ioctx.objects_end(); ++it) {
--    ioctx.locator_set_key(it->second);
--    ASSERT_EQ(0, ioctx.remove(it->first));
--  }
--}
---- a/src/test/librados/TestCase.h
-+++ b/src/test/librados/TestCase.h
-@@ -27,8 +27,9 @@
-   static void TearDownTestCase();
-   static void cleanup_default_namespace(rados_ioctx_t ioctx);
-   static rados_t s_cluster;
-   static std::string pool_name;
-+  static std::string nspace;
- 
-   virtual void SetUp();
-   virtual void TearDown();
-   rados_t cluster;
-@@ -71,16 +72,15 @@
-   librados::IoCtx ioctx;
-   std::string ns;
- };
- 
--class RadosTestEC : public ::testing::Test {
-+class RadosTestEC : public RadosTest {
- public:
-   RadosTestEC() {}
-   virtual ~RadosTestEC() {}
- protected:
-   static void SetUpTestCase();
-   static void TearDownTestCase();
--  static void cleanup_default_namespace(rados_ioctx_t ioctx);
-   static rados_t s_cluster;
-   static std::string pool_name;
- 
-   virtual void SetUp();
-@@ -89,16 +89,15 @@
-   rados_ioctx_t ioctx;
-   uint64_t alignment;
- };
- 
--class RadosTestECPP : public ::testing::Test {
-+class RadosTestECPP : public RadosTestPP {
- public:
-   RadosTestECPP() : cluster(s_cluster) {};
-   virtual ~RadosTestECPP() {};
- protected:
-   static void SetUpTestCase();
-   static void TearDownTestCase();
--  static void cleanup_default_namespace(librados::IoCtx ioctx);
-   static librados::Rados s_cluster;
-   static std::string pool_name;
- 
-   virtual void SetUp();
---- a/src/test/librados/io.cc
-+++ b/src/test/librados/io.cc
-@@ -24,8 +24,60 @@
-   rados_ioctx_set_namespace(ioctx, "nspace");
-   ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
- }
- 
-+TEST_F(LibRadosIo, ReadTimeout) {
-+  char buf[128];
-+  memset(buf, 'a', sizeof(buf));
-+  ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
-+
-+  {
-+    // set up a second client
-+    rados_t cluster;
-+    rados_ioctx_t ioctx;
-+    rados_create(&cluster, "admin");
-+    rados_conf_read_file(cluster, NULL);
-+    rados_conf_parse_env(cluster, NULL);
-+    rados_conf_set(cluster, "rados_osd_op_timeout", "0.00001"); // use any small value that will result in a timeout
-+    rados_connect(cluster);
-+    rados_ioctx_create(cluster, pool_name.c_str(), &ioctx);
-+    rados_ioctx_set_namespace(ioctx, nspace.c_str());
-+
-+    // then we show that the buffer is changed after rados_read returned
-+    // with a timeout
-+    for (int i=0; i<5; i++) {
-+      char buf2[sizeof(buf)];
-+      memset(buf2, 0, sizeof(buf2));
-+      int err = rados_read(ioctx, "foo", buf2, sizeof(buf2), 0);
-+      if (err == -110) {
-+	int startIndex = 0;
-+	// find the index until which librados already read the object before the timeout occurred
-+	for (unsigned b=0; b<sizeof(buf); b++) {
-+	  if (buf2[b] != buf[b]) {
-+	    startIndex = b;
-+	    break;
-+	  }
-+	}
-+
-+	// wait some time to give librados a chance to do something
-+	sleep(1);
-+
-+	// then check if the buffer was changed after the call
-+	if (buf2[startIndex] == 'a') {
-+	  printf("byte at index %d was changed after the timeout to %d\n",
-+		 startIndex, (int)buf[startIndex]);
-+	  ASSERT_TRUE(0);
-+	  break;
-+	}
-+      } else {
-+	printf("no timeout :/\n");
-+      }
-+    }
-+    rados_ioctx_destroy(ioctx);
-+    rados_shutdown(cluster);
-+  }
-+}
-+
- TEST_F(LibRadosIoPP, SimpleWritePP) {
-   char buf[128];
-   memset(buf, 0xcc, sizeof(buf));
-   bufferlist bl;
---- a/src/test/librados/tier.cc
-+++ b/src/test/librados/tier.cc
-@@ -33,8 +33,40 @@
- 
- typedef RadosTestPP LibRadosTierPP;
- typedef RadosTestECPP LibRadosTierECPP;
- 
-+void flush_evict_all(librados::Rados& cluster, librados::IoCtx& cache_ioctx)
-+{
-+  bufferlist inbl;
-+  cache_ioctx.set_namespace("");
-+  for (ObjectIterator it = cache_ioctx.objects_begin();
-+       it != cache_ioctx.objects_end(); ++it) {
-+    cache_ioctx.locator_set_key(it->second);
-+    {
-+      ObjectReadOperation op;
-+      op.cache_flush();
-+      librados::AioCompletion *completion = cluster.aio_create_completion();
-+      cache_ioctx.aio_operate(
-+        it->first, completion, &op,
-+	librados::OPERATION_IGNORE_OVERLAY, NULL);
-+      completion->wait_for_safe();
-+      completion->get_return_value();
-+      completion->release();
-+    }
-+    {
-+      ObjectReadOperation op;
-+      op.cache_evict();
-+      librados::AioCompletion *completion = cluster.aio_create_completion();
-+      cache_ioctx.aio_operate(
-+        it->first, completion, &op,
-+	librados::OPERATION_IGNORE_OVERLAY, NULL);
-+      completion->wait_for_safe();
-+      completion->get_return_value();
-+      completion->release();
-+    }
-+  }
-+}
-+
- class LibRadosTwoPoolsPP : public RadosTestPP
- {
- public:
-   LibRadosTwoPoolsPP() {};
-@@ -58,9 +90,28 @@
-     cache_ioctx.set_namespace(ns);
-   }
-   virtual void TearDown() {
-     RadosTestPP::TearDown();
-+
-+    // flush + evict cache
-+    flush_evict_all(cluster, cache_ioctx);
-+
-+    bufferlist inbl;
-+    // tear down tiers
-+    ASSERT_EQ(0, cluster.mon_command(
-+      "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-+      "\"}",
-+      inbl, NULL, NULL));
-+    ASSERT_EQ(0, cluster.mon_command(
-+      "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-+      "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-+    inbl, NULL, NULL));
-+
-+    // wait for maps to settle before next test
-+    cluster.wait_for_latest_osdmap();
-+
-     cleanup_default_namespace(cache_ioctx);
-+
-     cache_ioctx.close();
-   }
-   librados::IoCtx cache_ioctx;
- };
-@@ -179,21 +230,8 @@
-     ASSERT_EQ(0, completion->get_return_value());
-     completion->release();
-     ASSERT_EQ('b', bl[0]);
-   }
--
--  // tear down tiers
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
--    "\"}",
--    inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
--    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
--    inbl, NULL, NULL));
--
--  // wait for maps to settle before next test
--  cluster.wait_for_latest_osdmap();
- }
- 
- TEST_F(LibRadosTwoPoolsPP, Promote) {
-   // create object
-@@ -246,21 +284,8 @@
-     ASSERT_TRUE(it->first == string("foo") || it->first == string("bar"));
-     ++it;
-     ASSERT_TRUE(it == cache_ioctx.objects_end());
-   }
--
--  // tear down tiers
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
--    "\"}",
--    inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
--    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
--    inbl, NULL, NULL));
--
--  // wait for maps to settle before next test
--  cluster.wait_for_latest_osdmap();
- }
- 
- TEST_F(LibRadosTwoPoolsPP, PromoteSnap) {
-   // create object
-@@ -399,21 +424,8 @@
-   {
-     bufferlist bl;
-     ASSERT_EQ(-ENOENT, ioctx.read("baz", bl, 1, 0));
-   }
--
--  // tear down tiers
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
--    "\"}",
--    inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
--    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
--    inbl, NULL, NULL));
--
--  // wait for maps to settle before next test
--  cluster.wait_for_latest_osdmap();
- }
- 
- TEST_F(LibRadosTwoPoolsPP, PromoteSnapScrub) {
-   int num = 100;
-@@ -508,21 +520,8 @@
-     cout << "done waiting" << std::endl;
-   }
- 
-   ioctx.snap_set_read(librados::SNAP_HEAD);
--
--  // tear down tiers
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
--    "\"}",
--    inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
--    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
--    inbl, NULL, NULL));
--
--  // wait for maps to settle before next test
--  cluster.wait_for_latest_osdmap();
- }
- 
- 
- TEST_F(LibRadosTwoPoolsPP, PromoteSnapTrimRace) {
-@@ -576,21 +575,8 @@
-   {
-     bufferlist bl;
-     ASSERT_EQ(-ENOENT, ioctx.read("foo", bl, 1, 0));
-   }
--
--  // tear down tiers
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
--    "\"}",
--    inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
--    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
--    inbl, NULL, NULL));
--
--  // wait for maps to settle before next test
--  cluster.wait_for_latest_osdmap();
- }
- 
- TEST_F(LibRadosTwoPoolsPP, Whiteout) {
-   // create object
-@@ -652,21 +638,8 @@
-     bufferlist bl;
-     ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
-     ASSERT_EQ('h', bl[0]);
-   }
--
--  // tear down tiers
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
--    "\"}",
--    inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
--    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
--    inbl, NULL, NULL));
--
--  // wait for maps to settle before next test
--  cluster.wait_for_latest_osdmap();
- }
- 
- TEST_F(LibRadosTwoPoolsPP, Evict) {
-   // create object
-@@ -755,21 +728,8 @@
-     completion->wait_for_safe();
-     ASSERT_EQ(-EBUSY, completion->get_return_value());
-     completion->release();
-   }
--
--  // tear down tiers
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
--    "\"}",
--    inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
--    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
--    inbl, NULL, NULL));
--
--  // wait for maps to settle before next test
--  cluster.wait_for_latest_osdmap();
- }
- 
- TEST_F(LibRadosTwoPoolsPP, EvictSnap) {
-   // create object
-@@ -1003,21 +963,8 @@
-     completion->wait_for_safe();
-     ASSERT_EQ(0, completion->get_return_value());
-     completion->release();
-   }
--
--  // tear down tiers
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
--    "\"}",
--    inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
--    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
--    inbl, NULL, NULL));
--
--  // wait for maps to settle before next test
--  cluster.wait_for_latest_osdmap();
- }
- 
- TEST_F(LibRadosTwoPoolsPP, TryFlush) {
-   // configure cache
-@@ -1124,21 +1071,8 @@
-   {
-     ObjectIterator it = cache_ioctx.objects_begin();
-     ASSERT_TRUE(it == cache_ioctx.objects_end());
-   }
--
--  // tear down tiers
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
--    "\"}",
--    inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
--    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
--    inbl, NULL, NULL));
--
--  // wait for maps to settle before next test
--  cluster.wait_for_latest_osdmap();
- }
- 
- TEST_F(LibRadosTwoPoolsPP, Flush) {
-   // configure cache
-@@ -1297,21 +1231,8 @@
-   {
-     ObjectIterator it = ioctx.objects_begin();
-     ASSERT_TRUE(it == ioctx.objects_end());
-   }
--
--  // tear down tiers
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
--    "\"}",
--    inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
--    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
--    inbl, NULL, NULL));
--
--  // wait for maps to settle before next test
--  cluster.wait_for_latest_osdmap();
- }
- 
- TEST_F(LibRadosTwoPoolsPP, FlushSnap) {
-   // configure cache
-@@ -1469,20 +1390,13 @@
-     ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
-     ASSERT_EQ('a', bl[0]);
-   }
- 
--  // tear down tiers
-+  // remove overlay
-   ASSERT_EQ(0, cluster.mon_command(
-     "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-     "\"}",
-     inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
--    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
--    inbl, NULL, NULL));
--
--  // wait for maps to settle
--  cluster.wait_for_latest_osdmap();
- 
-   // verify i can read the snaps from the base pool
-   ioctx.snap_set_read(librados::SNAP_HEAD);
-   {
-@@ -1501,8 +1415,13 @@
-     bufferlist bl;
-     ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
-     ASSERT_EQ('a', bl[0]);
-   }
-+
-+  ASSERT_EQ(0, cluster.mon_command(
-+    "{\"prefix\": \"osd tier set-overlay\", \"pool\": \"" + pool_name +
-+    "\", \"overlaypool\": \"" + cache_pool_name + "\"}",
-+    inbl, NULL, NULL));
- }
- 
- TEST_F(LibRadosTierPP, FlushWriteRaces) {
-   Rados cluster;
-@@ -1785,21 +1704,8 @@
-     ASSERT_EQ(0, completion2->get_return_value());
-     completion->release();
-     completion2->release();
-   }
--
--  // tear down tiers
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
--    "\"}",
--    inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
--    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
--    inbl, NULL, NULL));
--
--  // wait for maps to settle before next test
--  cluster.wait_for_latest_osdmap();
- }
- 
- 
- IoCtx *read_ioctx = 0;
-@@ -1894,21 +1800,8 @@
-   max_reads = 0;
-   while (num_reads > 0)
-     cond.Wait(test_lock);
-   test_lock.Unlock();
--
--  // tear down tiers
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
--    "\"}",
--    inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
--    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
--    inbl, NULL, NULL));
--
--  // wait for maps to settle before next test
--  cluster.wait_for_latest_osdmap();
- }
- 
- TEST_F(LibRadosTierPP, HitSetNone) {
-   {
-@@ -1943,23 +1836,30 @@
-     + string("\",\"var\": \"") + var + string("\",\"val\": \"")
-     + stringify(val) + string("\"}");
- }
- 
--TEST_F(LibRadosTierPP, HitSetRead) {
--  // enable hitset tracking for this pool
-+TEST_F(LibRadosTwoPoolsPP, HitSetRead) {
-+  // make it a tier
-   bufferlist inbl;
--  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", 2),
-+  ASSERT_EQ(0, cluster.mon_command(
-+    "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
-+    "\", \"tierpool\": \"" + cache_pool_name +
-+    "\", \"force_nonempty\": \"--force-nonempty\" }",
-+    inbl, NULL, NULL));
-+
-+  // enable hitset tracking for this pool
-+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", 2),
- 						inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", 600),
-+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", 600),
- 						inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type",
-+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type",
- 						"explicit_object"),
- 				   inbl, NULL, NULL));
- 
-   // wait for maps to settle
-   cluster.wait_for_latest_osdmap();
- 
--  ioctx.set_namespace("");
-+  cache_ioctx.set_namespace("");
- 
-   // keep reading until we see our object appear in the HitSet
-   utime_t start = ceph_clock_now(NULL);
-   utime_t hard_stop = start + utime_t(600, 0);
-@@ -1968,18 +1868,18 @@
-     utime_t now = ceph_clock_now(NULL);
-     ASSERT_TRUE(now < hard_stop);
- 
-     string name = "foo";
--    uint32_t hash = ioctx.get_object_hash_position(name);
-+    uint32_t hash = cache_ioctx.get_object_hash_position(name);
-     hobject_t oid(sobject_t(name, CEPH_NOSNAP), "", hash,
--		  cluster.pool_lookup(pool_name.c_str()), "");
-+		  cluster.pool_lookup(cache_pool_name.c_str()), "");
- 
-     bufferlist bl;
--    ASSERT_EQ(-ENOENT, ioctx.read("foo", bl, 1, 0));
-+    ASSERT_EQ(-ENOENT, cache_ioctx.read("foo", bl, 1, 0));
- 
-     bufferlist hbl;
-     AioCompletion *c = librados::Rados::aio_create_completion();
--    ASSERT_EQ(0, ioctx.hit_set_get(hash, c, now.sec(), &hbl));
-+    ASSERT_EQ(0, cache_ioctx.hit_set_get(hash, c, now.sec(), &hbl));
-     c->wait_for_complete();
-     c->release();
- 
-     if (hbl.length()) {
-@@ -2027,49 +1927,58 @@
-   return -1;
- }
- 
- 
--TEST_F(LibRadosTierPP, HitSetWrite) {
-+TEST_F(LibRadosTwoPoolsPP, HitSetWrite) {
-   int num_pg = _get_pg_num(cluster, pool_name);
-   assert(num_pg > 0);
- 
--  // enable hitset tracking for this pool
-+  // make it a tier
-   bufferlist inbl;
--  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", 8),
-+  ASSERT_EQ(0, cluster.mon_command(
-+    "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
-+    "\", \"tierpool\": \"" + cache_pool_name +
-+    "\", \"force_nonempty\": \"--force-nonempty\" }",
-+    inbl, NULL, NULL));
-+
-+  // enable hitset tracking for this pool
-+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", 8),
- 						inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", 600),
-+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", 600),
- 						inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type",
-+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type",
- 						"explicit_hash"),
- 				   inbl, NULL, NULL));
- 
-   // wait for maps to settle
-   cluster.wait_for_latest_osdmap();
- 
--  ioctx.set_namespace("");
-+  cache_ioctx.set_namespace("");
-+
-+  int num = 200;
- 
-   // do a bunch of writes
--  for (int i=0; i<1000; ++i) {
-+  for (int i=0; i<num; ++i) {
-     bufferlist bl;
-     bl.append("a");
--    ASSERT_EQ(0, ioctx.write(stringify(i), bl, 1, 0));
-+    ASSERT_EQ(0, cache_ioctx.write(stringify(i), bl, 1, 0));
-   }
- 
-   // get HitSets
-   std::map<int,HitSet> hitsets;
-   for (int i=0; i<num_pg; ++i) {
-     list< pair<time_t,time_t> > ls;
-     AioCompletion *c = librados::Rados::aio_create_completion();
--    ASSERT_EQ(0, ioctx.hit_set_list(i, c, &ls));
-+    ASSERT_EQ(0, cache_ioctx.hit_set_list(i, c, &ls));
-     c->wait_for_complete();
-     c->release();
-     std::cout << "pg " << i << " ls " << ls << std::endl;
-     ASSERT_FALSE(ls.empty());
- 
-     // get the latest
-     c = librados::Rados::aio_create_completion();
-     bufferlist bl;
--    ASSERT_EQ(0, ioctx.hit_set_get(i, c, ls.back().first, &bl));
-+    ASSERT_EQ(0, cache_ioctx.hit_set_get(i, c, ls.back().first, &bl));
-     c->wait_for_complete();
-     c->release();
- 
-     //std::cout << "bl len is " << bl.length() << "\n";
-@@ -2080,16 +1989,16 @@
-     ::decode(hitsets[i], p);
- 
-     // cope with racing splits by refreshing pg_num
-     if (i == num_pg - 1)
--      num_pg = _get_pg_num(cluster, pool_name);
-+      num_pg = _get_pg_num(cluster, cache_pool_name);
-   }
- 
--  for (int i=0; i<1000; ++i) {
-+  for (int i=0; i<num; ++i) {
-     string n = stringify(i);
--    uint32_t hash = ioctx.get_object_hash_position(n);
-+    uint32_t hash = cache_ioctx.get_object_hash_position(n);
-     hobject_t oid(sobject_t(n, CEPH_NOSNAP), "", hash,
--		  cluster.pool_lookup(pool_name.c_str()), "");
-+		  cluster.pool_lookup(cache_pool_name.c_str()), "");
-     std::cout << "checking for " << oid << std::endl;
-     bool found = false;
-     for (int p=0; p<num_pg; ++p) {
-       if (hitsets[p].contains(oid)) {
-@@ -2100,45 +2009,52 @@
-     ASSERT_TRUE(found);
-   }
- }
- 
--TEST_F(LibRadosTierPP, HitSetTrim) {
-+TEST_F(LibRadosTwoPoolsPP, HitSetTrim) {
-   unsigned count = 3;
-   unsigned period = 3;
- 
--  // enable hitset tracking for this pool
-+  // make it a tier
-   bufferlist inbl;
--  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", count),
-+  ASSERT_EQ(0, cluster.mon_command(
-+    "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
-+    "\", \"tierpool\": \"" + cache_pool_name +
-+    "\", \"force_nonempty\": \"--force-nonempty\" }",
-+    inbl, NULL, NULL));
-+
-+  // enable hitset tracking for this pool
-+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", count),
- 						inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", period),
-+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", period),
- 						inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type", "bloom"),
-+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type", "bloom"),
- 				   inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_fpp", ".01"),
-+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_fpp", ".01"),
- 				   inbl, NULL, NULL));
- 
-   // wait for maps to settle
-   cluster.wait_for_latest_osdmap();
- 
--  ioctx.set_namespace("");
-+  cache_ioctx.set_namespace("");
- 
-   // do a bunch of writes and make sure the hitsets rotate
-   utime_t start = ceph_clock_now(NULL);
-   utime_t hard_stop = start + utime_t(count * period * 50, 0);
- 
-   time_t first = 0;
-   while (true) {
-     string name = "foo";
--    uint32_t hash = ioctx.get_object_hash_position(name);
-+    uint32_t hash = cache_ioctx.get_object_hash_position(name);
-     hobject_t oid(sobject_t(name, CEPH_NOSNAP), "", hash, -1, "");
- 
-     bufferlist bl;
-     bl.append("f");
--    ASSERT_EQ(0, ioctx.write("foo", bl, 1, 0));
-+    ASSERT_EQ(0, cache_ioctx.write("foo", bl, 1, 0));
- 
-     list<pair<time_t, time_t> > ls;
-     AioCompletion *c = librados::Rados::aio_create_completion();
--    ASSERT_EQ(0, ioctx.hit_set_list(hash, c, &ls));
-+    ASSERT_EQ(0, cache_ioctx.hit_set_list(hash, c, &ls));
-     c->wait_for_complete();
-     c->release();
- 
-     ASSERT_TRUE(ls.size() <= count + 1);
-@@ -2186,11 +2102,31 @@
-     cache_ioctx.set_namespace(ns);
-   }
-   virtual void TearDown() {
-     RadosTestECPP::TearDown();
-+
-+    // flush + evict cache
-+    flush_evict_all(cluster, cache_ioctx);
-+
-+    bufferlist inbl;
-+    // tear down tiers
-+    ASSERT_EQ(0, cluster.mon_command(
-+      "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-+      "\"}",
-+      inbl, NULL, NULL));
-+    ASSERT_EQ(0, cluster.mon_command(
-+      "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
-+      "\", \"tierpool\": \"" + cache_pool_name + "\"}",
-+    inbl, NULL, NULL));
-+
-+    // wait for maps to settle before next test
-+    cluster.wait_for_latest_osdmap();
-+
-     cleanup_default_namespace(cache_ioctx);
-+
-     cache_ioctx.close();
-   }
-+
-   librados::IoCtx cache_ioctx;
- };
- 
- std::string LibRadosTwoPoolsECPP::cache_pool_name;
-@@ -2307,21 +2243,8 @@
-     ASSERT_EQ(0, completion->get_return_value());
-     completion->release();
-     ASSERT_EQ('b', bl[0]);
-   }
--
--  // tear down tiers
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
--    "\"}",
--    inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
--    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
--    inbl, NULL, NULL));
--
--  // wait for maps to settle before next test
--  cluster.wait_for_latest_osdmap();
- }
- 
- TEST_F(LibRadosTwoPoolsECPP, Promote) {
-   // create object
-@@ -2374,21 +2297,8 @@
-     ASSERT_TRUE(it->first == string("foo") || it->first == string("bar"));
-     ++it;
-     ASSERT_TRUE(it == cache_ioctx.objects_end());
-   }
--
--  // tear down tiers
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
--    "\"}",
--    inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
--    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
--    inbl, NULL, NULL));
--
--  // wait for maps to settle before next test
--  cluster.wait_for_latest_osdmap();
- }
- 
- TEST_F(LibRadosTwoPoolsECPP, PromoteSnap) {
-   // create object
-@@ -2551,21 +2461,8 @@
-   {
-     bufferlist bl;
-     ASSERT_EQ(-ENOENT, ioctx.read("baz", bl, 1, 0));
-   }
--
--  // tear down tiers
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
--    "\"}",
--    inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
--    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
--    inbl, NULL, NULL));
--
--  // wait for maps to settle before next test
--  cluster.wait_for_latest_osdmap();
- }
- 
- TEST_F(LibRadosTwoPoolsECPP, PromoteSnapTrimRace) {
-   // create object
-@@ -2618,21 +2515,8 @@
-   {
-     bufferlist bl;
-     ASSERT_EQ(-ENOENT, ioctx.read("foo", bl, 1, 0));
-   }
--
--  // tear down tiers
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
--    "\"}",
--    inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
--    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
--    inbl, NULL, NULL));
--
--  // wait for maps to settle before next test
--  cluster.wait_for_latest_osdmap();
- }
- 
- TEST_F(LibRadosTwoPoolsECPP, Whiteout) {
-   // create object
-@@ -2694,21 +2578,8 @@
-     bufferlist bl;
-     ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
-     ASSERT_EQ('h', bl[0]);
-   }
--
--  // tear down tiers
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
--    "\"}",
--    inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
--    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
--    inbl, NULL, NULL));
--
--  // wait for maps to settle before next test
--  cluster.wait_for_latest_osdmap();
- }
- 
- TEST_F(LibRadosTwoPoolsECPP, Evict) {
-   // create object
-@@ -2797,21 +2668,8 @@
-     completion->wait_for_safe();
-     ASSERT_EQ(-EBUSY, completion->get_return_value());
-     completion->release();
-   }
--
--  // tear down tiers
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
--    "\"}",
--    inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
--    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
--    inbl, NULL, NULL));
--
--  // wait for maps to settle before next test
--  cluster.wait_for_latest_osdmap();
- }
- 
- TEST_F(LibRadosTwoPoolsECPP, EvictSnap) {
-   // create object
-@@ -3045,21 +2903,8 @@
-     completion->wait_for_safe();
-     ASSERT_EQ(0, completion->get_return_value());
-     completion->release();
-   }
--
--  // tear down tiers
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
--    "\"}",
--    inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
--    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
--    inbl, NULL, NULL));
--
--  // wait for maps to settle before next test
--  cluster.wait_for_latest_osdmap();
- }
- 
- TEST_F(LibRadosTwoPoolsECPP, TryFlush) {
-   // configure cache
-@@ -3166,21 +3011,8 @@
-   {
-     ObjectIterator it = cache_ioctx.objects_begin();
-     ASSERT_TRUE(it == cache_ioctx.objects_end());
-   }
--
--  // tear down tiers
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
--    "\"}",
--    inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
--    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
--    inbl, NULL, NULL));
--
--  // wait for maps to settle before next test
--  cluster.wait_for_latest_osdmap();
- }
- 
- TEST_F(LibRadosTwoPoolsECPP, Flush) {
-   // configure cache
-@@ -3339,21 +3171,8 @@
-   {
-     ObjectIterator it = ioctx.objects_begin();
-     ASSERT_TRUE(it == ioctx.objects_end());
-   }
--
--  // tear down tiers
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
--    "\"}",
--    inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
--    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
--    inbl, NULL, NULL));
--
--  // wait for maps to settle before next test
--  cluster.wait_for_latest_osdmap();
- }
- 
- TEST_F(LibRadosTwoPoolsECPP, FlushSnap) {
-   // configure cache
-@@ -3516,12 +3335,8 @@
-   ASSERT_EQ(0, cluster.mon_command(
-     "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
-     "\"}",
-     inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
--    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
--    inbl, NULL, NULL));
- 
-   // wait for maps to settle
-   cluster.wait_for_latest_osdmap();
- 
-@@ -3543,8 +3358,13 @@
-     bufferlist bl;
-     ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
-     ASSERT_EQ('a', bl[0]);
-   }
-+
-+  ASSERT_EQ(0, cluster.mon_command(
-+    "{\"prefix\": \"osd tier set-overlay\", \"pool\": \"" + pool_name +
-+    "\", \"overlaypool\": \"" + cache_pool_name + "\"}",
-+    inbl, NULL, NULL));
- }
- 
- TEST_F(LibRadosTierECPP, FlushWriteRaces) {
-   Rados cluster;
-@@ -3827,21 +3647,8 @@
-     ASSERT_EQ(0, completion2->get_return_value());
-     completion->release();
-     completion2->release();
-   }
--
--  // tear down tiers
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
--    "\"}",
--    inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
--    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
--    inbl, NULL, NULL));
--
--  // wait for maps to settle before next test
--  cluster.wait_for_latest_osdmap();
- }
- 
- TEST_F(LibRadosTwoPoolsECPP, TryFlushReadRace) {
-   // configure cache
-@@ -3902,21 +3709,8 @@
-   max_reads = 0;
-   while (num_reads > 0)
-     cond.Wait(test_lock);
-   test_lock.Unlock();
--
--  // tear down tiers
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
--    "\"}",
--    inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(
--    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
--    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
--    inbl, NULL, NULL));
--
--  // wait for maps to settle before next test
--  cluster.wait_for_latest_osdmap();
- }
- 
- TEST_F(LibRadosTierECPP, HitSetNone) {
-   {
-@@ -3937,23 +3731,30 @@
-     c->release();
-   }
- }
- 
--TEST_F(LibRadosTierECPP, HitSetRead) {
--  // enable hitset tracking for this pool
-+TEST_F(LibRadosTwoPoolsECPP, HitSetRead) {
-+  // make it a tier
-   bufferlist inbl;
--  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", 2),
-+  ASSERT_EQ(0, cluster.mon_command(
-+    "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
-+    "\", \"tierpool\": \"" + cache_pool_name +
-+    "\", \"force_nonempty\": \"--force-nonempty\" }",
-+    inbl, NULL, NULL));
-+
-+  // enable hitset tracking for this pool
-+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", 2),
- 						inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", 600),
-+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", 600),
- 						inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type",
-+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type",
- 						"explicit_object"),
- 				   inbl, NULL, NULL));
- 
-   // wait for maps to settle
-   cluster.wait_for_latest_osdmap();
- 
--  ioctx.set_namespace("");
-+  cache_ioctx.set_namespace("");
- 
-   // keep reading until we see our object appear in the HitSet
-   utime_t start = ceph_clock_now(NULL);
-   utime_t hard_stop = start + utime_t(600, 0);
-@@ -3962,18 +3763,18 @@
-     utime_t now = ceph_clock_now(NULL);
-     ASSERT_TRUE(now < hard_stop);
- 
-     string name = "foo";
--    uint32_t hash = ioctx.get_object_hash_position(name);
-+    uint32_t hash = cache_ioctx.get_object_hash_position(name);
-     hobject_t oid(sobject_t(name, CEPH_NOSNAP), "", hash,
--		  cluster.pool_lookup(pool_name.c_str()), "");
-+		  cluster.pool_lookup(cache_pool_name.c_str()), "");
- 
-     bufferlist bl;
--    ASSERT_EQ(-ENOENT, ioctx.read("foo", bl, 1, 0));
-+    ASSERT_EQ(-ENOENT, cache_ioctx.read("foo", bl, 1, 0));
- 
-     bufferlist hbl;
-     AioCompletion *c = librados::Rados::aio_create_completion();
--    ASSERT_EQ(0, ioctx.hit_set_get(hash, c, now.sec(), &hbl));
-+    ASSERT_EQ(0, cache_ioctx.hit_set_get(hash, c, now.sec(), &hbl));
-     c->wait_for_complete();
-     c->release();
- 
-     if (hbl.length()) {
-@@ -4068,27 +3869,34 @@
-   }
- }
- #endif
- 
--TEST_F(LibRadosTierECPP, HitSetTrim) {
-+TEST_F(LibRadosTwoPoolsECPP, HitSetTrim) {
-   unsigned count = 3;
-   unsigned period = 3;
- 
--  // enable hitset tracking for this pool
-+  // make it a tier
-   bufferlist inbl;
--  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_count", count),
-+  ASSERT_EQ(0, cluster.mon_command(
-+    "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
-+    "\", \"tierpool\": \"" + cache_pool_name +
-+    "\", \"force_nonempty\": \"--force-nonempty\" }",
-+    inbl, NULL, NULL));
-+
-+  // enable hitset tracking for this pool
-+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_count", count),
- 						inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_period", period),
-+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_period", period),
- 						inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_type", "bloom"),
-+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_type", "bloom"),
- 				   inbl, NULL, NULL));
--  ASSERT_EQ(0, cluster.mon_command(set_pool_str(pool_name, "hit_set_fpp", ".01"),
-+  ASSERT_EQ(0, cluster.mon_command(set_pool_str(cache_pool_name, "hit_set_fpp", ".01"),
- 				   inbl, NULL, NULL));
- 
-   // wait for maps to settle
-   cluster.wait_for_latest_osdmap();
- 
--  ioctx.set_namespace("");
-+  cache_ioctx.set_namespace("");
- 
-   // do a bunch of writes and make sure the hitsets rotate
-   utime_t start = ceph_clock_now(NULL);
-   utime_t hard_stop = start + utime_t(count * period * 50, 0);
-@@ -4099,18 +3907,18 @@
-   memset(buf, 'f', bsize);
- 
-   while (true) {
-     string name = "foo";
--    uint32_t hash = ioctx.get_object_hash_position(name);
-+    uint32_t hash = cache_ioctx.get_object_hash_position(name);
-     hobject_t oid(sobject_t(name, CEPH_NOSNAP), "", hash, -1, "");
- 
-     bufferlist bl;
-     bl.append(buf, bsize);
--    ASSERT_EQ(0, ioctx.append("foo", bl, bsize));
-+    ASSERT_EQ(0, cache_ioctx.append("foo", bl, bsize));
- 
-     list<pair<time_t, time_t> > ls;
-     AioCompletion *c = librados::Rados::aio_create_completion();
--    ASSERT_EQ(0, ioctx.hit_set_list(hash, c, &ls));
-+    ASSERT_EQ(0, cache_ioctx.hit_set_list(hash, c, &ls));
-     c->wait_for_complete();
-     c->release();
- 
-     ASSERT_TRUE(ls.size() <= count + 1);
---- a/src/test/objectstore/store_test.cc
-+++ b/src/test/objectstore/store_test.cc
-@@ -1114,8 +1114,113 @@
-     ASSERT_EQ(1u, newomap.size());
-     ASSERT_TRUE(newomap.count("omap_key"));
-     ASSERT_TRUE(newomap["omap_key"].contents_equal(omap["omap_key"]));
-   }
-+  {
-+    ObjectStore::Transaction t;
-+    t.remove(cid, oid);
-+    t.remove_collection(cid);
-+    t.remove_collection(temp_cid);
-+    r = store->apply_transaction(t);
-+    ASSERT_EQ(r, 0);
-+  }
-+}
-+
-+TEST_P(StoreTest, BigRGWObjectName) {
-+  store->set_allow_sharded_objects();
-+  store->sync_and_flush();
-+  coll_t temp_cid("mytemp");
-+  hobject_t temp_oid("tmp_oid", "", CEPH_NOSNAP, 0, 0, "");
-+  coll_t cid("dest");
-+  ghobject_t oid(
-+    hobject_t(
-+      "default.4106.50_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa [...]
-+      "",
-+      CEPH_NOSNAP,
-+      0x81920472,
-+      3,
-+      ""),
-+    15,
-+    shard_id_t(1));
-+  ghobject_t oid2(oid);
-+  oid2.generation = 17;
-+  ghobject_t oidhead(oid);
-+  oidhead.generation = ghobject_t::NO_GEN;
-+
-+  int r;
-+  {
-+    ObjectStore::Transaction t;
-+    t.create_collection(cid);
-+    t.touch(cid, oidhead);
-+    t.collection_move_rename(cid, oidhead, cid, oid);
-+    t.touch(cid, oidhead);
-+    t.collection_move_rename(cid, oidhead, cid, oid2);
-+    r = store->apply_transaction(t);
-+    ASSERT_EQ(r, 0);
-+  }
-+
-+  {
-+    ObjectStore::Transaction t;
-+    t.remove(cid, oid);
-+    r = store->apply_transaction(t);
-+    ASSERT_EQ(r, 0);
-+  }
-+
-+  {
-+    vector<ghobject_t> objects;
-+    r = store->collection_list(cid, objects);
-+    ASSERT_EQ(r, 0);
-+    ASSERT_EQ(objects.size(), 1u);
-+    ASSERT_EQ(objects[0], oid2);
-+  }
-+
-+  ASSERT_FALSE(store->exists(cid, oid));
-+
-+  {
-+    ObjectStore::Transaction t;
-+    t.remove(cid, oid2);
-+    t.remove_collection(cid);
-+    r = store->apply_transaction(t);
-+    ASSERT_EQ(r, 0);
-+
-+  }
-+}
-+
-+TEST_P(StoreTest, SetAllocHint) {
-+  coll_t cid("alloc_hint");
-+  ghobject_t hoid(hobject_t("test_hint", "", CEPH_NOSNAP, 0, 0, ""));
-+  int r;
-+  {
-+    ObjectStore::Transaction t;
-+    t.create_collection(cid);
-+    t.touch(cid, hoid);
-+    r = store->apply_transaction(t);
-+    ASSERT_EQ(r, 0);
-+  }
-+  {
-+    ObjectStore::Transaction t;
-+    t.set_alloc_hint(cid, hoid, 4*1024*1024, 1024*4);
-+    r = store->apply_transaction(t);
-+    ASSERT_EQ(r, 0);
-+  }
-+  {
-+    ObjectStore::Transaction t;
-+    t.remove(cid, hoid);
-+    r = store->apply_transaction(t);
-+    ASSERT_EQ(r, 0);
-+  }
-+  {
-+    ObjectStore::Transaction t;
-+    t.set_alloc_hint(cid, hoid, 4*1024*1024, 1024*4);
-+    r = store->apply_transaction(t);
-+    ASSERT_EQ(r, 0);
-+  }
-+  {
-+    ObjectStore::Transaction t;
-+    t.remove_collection(cid);
-+    r = store->apply_transaction(t);
-+    ASSERT_EQ(r, 0);
-+  }
- }
- 
- INSTANTIATE_TEST_CASE_P(
-   ObjectStore,
---- a/src/test/osd/TestOSDMap.cc
-+++ b/src/test/osd/TestOSDMap.cc
-@@ -49,15 +49,26 @@
-       pending_inc.new_uuid[i] = sample_uuid;
-     }
-     osdmap.apply_incremental(pending_inc);
- 
--    // kludge to get an erasure coding rule and pool
-+    // Create an EC ruleset and a pool using it
-     int r = osdmap.crush->add_simple_ruleset("erasure", "default", "osd",
- 					     "indep", pg_pool_t::TYPE_ERASURE,
- 					     &cerr);
--    pg_pool_t *p = (pg_pool_t *)osdmap.get_pg_pool(2);
-+
-+    OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
-+    new_pool_inc.new_pool_max = osdmap.get_pool_max();
-+    new_pool_inc.fsid = osdmap.get_fsid();
-+    pg_pool_t empty;
-+    uint64_t pool_id = ++new_pool_inc.new_pool_max;
-+    pg_pool_t *p = new_pool_inc.get_new_pool(pool_id, &empty);
-+    p->size = 3;
-+    p->set_pg_num(64);
-+    p->set_pgp_num(64);
-     p->type = pg_pool_t::TYPE_ERASURE;
-     p->crush_ruleset = r;
-+    new_pool_inc.new_pool_names[pool_id] = "ec";
-+    osdmap.apply_incremental(new_pool_inc);
-   }
-   unsigned int get_num_osds() { return num_osds; }
- 
-   void test_mappings(int pool,
-@@ -85,8 +96,50 @@
-   ASSERT_EQ(get_num_osds(), (unsigned)osdmap.get_max_osd());
-   ASSERT_EQ(get_num_osds(), osdmap.get_num_in_osds());
- }
- 
-+TEST_F(OSDMapTest, Features) {
-+  // with EC pool
-+  set_up_map();
-+  uint64_t features = osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
-+  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
-+  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
-+  ASSERT_FALSE(features & CEPH_FEATURE_CRUSH_TUNABLES3);
-+  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_V2);
-+  ASSERT_TRUE(features & CEPH_FEATURE_OSD_ERASURE_CODES);
-+  ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
-+  ASSERT_FALSE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);
-+
-+  // clients have a slightly different view
-+  features = osdmap.get_features(CEPH_ENTITY_TYPE_CLIENT, NULL);
-+  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
-+  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
-+  ASSERT_FALSE(features & CEPH_FEATURE_CRUSH_TUNABLES3);
-+  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_V2);
-+  ASSERT_FALSE(features & CEPH_FEATURE_OSD_ERASURE_CODES);  // don't need this
-+  ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
-+  ASSERT_FALSE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);
-+
-+  // remove the EC pool, but leave the rule.  add primary affinity.
-+  {
-+    OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
-+    new_pool_inc.old_pools.insert(osdmap.lookup_pg_pool_name("ec"));
-+    new_pool_inc.new_primary_affinity[0] = 0x8000;
-+    osdmap.apply_incremental(new_pool_inc);
-+  }
-+
-+  features = osdmap.get_features(CEPH_ENTITY_TYPE_MON, NULL);
-+  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
-+  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
-+  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES3); // shared bit with primary affinity
-+  ASSERT_FALSE(features & CEPH_FEATURE_CRUSH_V2);
-+  ASSERT_FALSE(features & CEPH_FEATURE_OSD_ERASURE_CODES);
-+  ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
-+  ASSERT_TRUE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);
-+
-+  // FIXME: test tiering feature bits
-+}
-+
- TEST_F(OSDMapTest, MapPG) {
-   set_up_map();
- 
-   pg_t rawpg(0, 0, -1);
---- a/src/test/osd/osd-test-helpers.sh
-+++ b/src/test/osd/osd-test-helpers.sh
-@@ -36,8 +36,9 @@
-     local ceph_args="$CEPH_ARGS"
-     ceph_args+=" --osd-journal-size=100"
-     ceph_args+=" --osd-data=$osd_data"
-     ceph_args+=" --chdir="
-+    ceph_args+=" --osd-pool-default-erasure-code-directory=.libs"
-     ceph_args+=" --run-dir=$dir"
-     ceph_args+=" --debug-osd=20"
-     ceph_args+=" --log-file=$dir/osd-\$id.log"
-     ceph_args+=" --pid-file=$dir/osd-\$id.pidfile"
---- a/src/test/strtol.cc
-+++ b/src/test/strtol.cc
-@@ -13,8 +13,9 @@
-  */
- 
- #include "common/strtol.h"
- #include <string>
-+#include <map>
- 
- #include "gtest/gtest.h"
- 
- static void test_strict_strtoll(const char *str, long long expected)
-@@ -133,4 +134,78 @@
-   test_strict_strtod_err("34.0 garbo");
- 
-   test_strict_strtof_err("0.05.0");
- }
-+
-+
-+static void test_strict_sistrtoll(const char *str)
-+{
-+  std::string err;
-+  strict_sistrtoll(str, &err);
-+  ASSERT_EQ(err, "");
-+}
-+
-+static void test_strict_sistrtoll_units(const std::string& foo,
-+                                      char u, const int m)
-+{
-+  std::string s(foo);
-+  s.push_back(u);
-+  const char *str = s.c_str();
-+  std::string err;
-+  uint64_t r = strict_sistrtoll(str, &err);
-+  ASSERT_EQ(err, "");
-+
-+  str = foo.c_str();
-+  std::string err2;
-+  long long tmp = strict_strtoll(str, 10, &err2);
-+  ASSERT_EQ(err2, "");
-+  tmp = (tmp << m);
-+  ASSERT_EQ(tmp, (long long)r);
-+}
-+
-+TEST(SIStrToLL, WithUnits) {
-+  std::map<char,int> units;
-+  units['B'] = 0;
-+  units['K'] = 10;
-+  units['M'] = 20;
-+  units['G'] = 30;
-+  units['T'] = 40;
-+  units['P'] = 50;
-+  units['E'] = 60;
-+
-+  for (std::map<char,int>::iterator p = units.begin();
-+       p != units.end(); ++p) {
-+    test_strict_sistrtoll_units("1024", p->first, p->second);
-+    test_strict_sistrtoll_units("1", p->first, p->second);
-+    test_strict_sistrtoll_units("0", p->first, p->second);
-+  }
-+}
-+
-+TEST(SIStrToLL, WithoutUnits) {
-+  test_strict_sistrtoll("1024");
-+  test_strict_sistrtoll("1152921504606846976");
-+  test_strict_sistrtoll("0");
-+}
-+
-+static void test_strict_sistrtoll_err(const char *str)
-+{
-+  std::string err;
-+  strict_sistrtoll(str, &err);
-+  ASSERT_NE(err, "");
-+}
-+
-+TEST(SIStrToLL, Error) {
-+  test_strict_sistrtoll_err("1024F");
-+  test_strict_sistrtoll_err("QDDSA");
-+  test_strict_sistrtoll_err("1b");
-+  test_strict_sistrtoll_err("100k");
-+  test_strict_sistrtoll_err("1000m");
-+  test_strict_sistrtoll_err("1g");
-+  test_strict_sistrtoll_err("20t");
-+  test_strict_sistrtoll_err("100p");
-+  test_strict_sistrtoll_err("1000e");
-+  test_strict_sistrtoll_err("B");
-+  test_strict_sistrtoll_err("M");
-+  test_strict_sistrtoll_err("BM");
-+  test_strict_sistrtoll_err("B0wef");
-+  test_strict_sistrtoll_err("0m");
-+}

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-ceph/ceph.git



More information about the Pkg-ceph-commits mailing list